mirror of
https://github.com/CatalaLang/catala.git
synced 2024-11-08 07:51:43 +03:00
Resync the three lexers
We'll need to factorise this better in the near future: there was already a discrepancy between the en and fr lexers, and this approach won't scale if we keep adding languages.
This commit is contained in:
parent
b31bee71ad
commit
3b5c4c17cd
@ -310,7 +310,7 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
| "day" ->
|
||||
L.update_acc lexbuf;
|
||||
DAY
|
||||
| 0x24, Star white_space, digit, Star (digit | ','), Opt ('.', Rep (digit, 0 .. 2)) ->
|
||||
| 0x24, Star hspace, digit, Star (digit | ','), Opt ('.', Rep (digit, 0 .. 2)) ->
|
||||
let extract_parts = R.regexp "([0-9]([0-9,]*[0-9]|))(.([0-9]{0,2})|)" in
|
||||
let full_str = Utf8.lexeme lexbuf in
|
||||
let only_numbers_str = String.trim (String.sub full_str 1 (String.length full_str - 1)) in
|
||||
@ -552,7 +552,8 @@ let lex_law (lexbuf : lexbuf) : token =
|
||||
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
|
||||
L.get_law_heading lexbuf
|
||||
| _ -> (
|
||||
(* Nested match for lower priority; `_` matches length 0 *)
|
||||
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
|
||||
sub-match at the same point *)
|
||||
let lexbuf = lexbuf in
|
||||
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
|
||||
match%sedlex lexbuf with
|
||||
|
@ -43,6 +43,8 @@ let token_list : (string * token) list =
|
||||
("somme", SUM);
|
||||
("rempli", FILLED);
|
||||
("définition", DEFINITION);
|
||||
("étiquette", LABEL);
|
||||
("exception", EXCEPTION);
|
||||
("égal à", DEFINED_AS);
|
||||
("selon", MATCH);
|
||||
("sous forme", WITH);
|
||||
@ -69,12 +71,12 @@ let token_list : (string * token) list =
|
||||
("ou", OR);
|
||||
("ou bien", XOR);
|
||||
("non", NOT);
|
||||
("nombre", CARDINAL);
|
||||
("maximum", MAXIMUM);
|
||||
("minimum", MINIMUM);
|
||||
("filtre", FILTER);
|
||||
("application", MAP);
|
||||
("initial", INIT);
|
||||
("nombre", CARDINAL);
|
||||
("an", YEAR);
|
||||
("mois", MONTH);
|
||||
("jour", DAY);
|
||||
@ -83,6 +85,7 @@ let token_list : (string * token) list =
|
||||
]
|
||||
@ L.token_list_language_agnostic
|
||||
|
||||
(** Localised builtin functions *)
|
||||
let builtins : (string * Ast.builtin_expression) list =
|
||||
[
|
||||
("entier_vers_décimal", Ast.IntToDec);
|
||||
@ -108,7 +111,7 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
let prev_pos = lexing_positions lexbuf in
|
||||
match%sedlex lexbuf with
|
||||
| white_space | '\n' ->
|
||||
| white_space ->
|
||||
(* Whitespaces *)
|
||||
L.update_acc lexbuf;
|
||||
lex_code lexbuf
|
||||
@ -280,9 +283,6 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
| "non" ->
|
||||
L.update_acc lexbuf;
|
||||
NOT
|
||||
| "nombre" ->
|
||||
L.update_acc lexbuf;
|
||||
CARDINAL
|
||||
| "maximum" ->
|
||||
L.update_acc lexbuf;
|
||||
MAXIMUM
|
||||
@ -298,6 +298,9 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
| "initial" ->
|
||||
L.update_acc lexbuf;
|
||||
INIT
|
||||
| "nombre" ->
|
||||
L.update_acc lexbuf;
|
||||
CARDINAL
|
||||
| "vrai" ->
|
||||
L.update_acc lexbuf;
|
||||
TRUE
|
||||
@ -313,7 +316,7 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
| "jour" ->
|
||||
L.update_acc lexbuf;
|
||||
DAY
|
||||
| digit, Star (digit | white_space), Opt (',', Rep (digit, 0 .. 2)), Star white_space, 0x20AC ->
|
||||
| digit, Star (digit | hspace), Opt (',', Rep (digit, 0 .. 2)), Star hspace, 0x20AC ->
|
||||
let extract_parts = R.regexp "([0-9]([0-9 ]*[0-9]|))(,([0-9]{0,2})|)" in
|
||||
let full_str = Utf8.lexeme lexbuf in
|
||||
let only_numbers_str = String.trim (String.sub full_str 0 (String.length full_str - 1)) in
|
||||
@ -514,7 +517,7 @@ let rec lex_directive_args (lexbuf : lexbuf) : token =
|
||||
let s = Utf8.lexeme lexbuf in
|
||||
let i = String.index s '.' in
|
||||
AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
|
||||
| Compl (white_space | '@'), Star (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
|
||||
| Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
|
||||
| Plus hspace -> lex_directive_args lexbuf
|
||||
| '\n' | eof ->
|
||||
L.context := Law;
|
||||
@ -529,7 +532,7 @@ let rec lex_directive (lexbuf : lexbuf) : token =
|
||||
| 'D', 0xE9, "but", Plus hspace, "m", 0xE9, "tadonn", 0xE9, "es" -> BEGIN_METADATA
|
||||
| "Fin", Plus hspace, "m", 0xE9, "tadonn", 0xE9, "es" -> END_METADATA
|
||||
| "Inclusion" -> LAW_INCLUDE
|
||||
| ':', Star hspace ->
|
||||
| ':' ->
|
||||
L.context := Directive_args;
|
||||
COLON
|
||||
| '\n' | eof ->
|
||||
@ -549,13 +552,14 @@ let lex_law (lexbuf : lexbuf) : token =
|
||||
L.context := Code;
|
||||
Buffer.clear L.code_buffer;
|
||||
BEGIN_CODE
|
||||
| '>', Star hspace ->
|
||||
| '>' ->
|
||||
L.context := Directive;
|
||||
BEGIN_DIRECTIVE
|
||||
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
|
||||
L.get_law_heading lexbuf
|
||||
| _ -> (
|
||||
(* Nested match for lower priority; `_` matches length 0 *)
|
||||
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
|
||||
sub-match at the same point *)
|
||||
let lexbuf = lexbuf in
|
||||
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
|
||||
match%sedlex lexbuf with
|
||||
|
@ -312,12 +312,7 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
| "dzien" ->
|
||||
L.update_acc lexbuf;
|
||||
DAY
|
||||
| ( Star white_space,
|
||||
digit,
|
||||
Star (digit | ','),
|
||||
Opt ('.', Rep (digit, 0 .. 2)),
|
||||
Star white_space,
|
||||
"PLN" ) ->
|
||||
| digit, Star (digit | ','), Opt ('.', Rep (digit, 0 .. 2)), Star hspace, "PLN" ->
|
||||
let extract_parts = R.regexp "([0-9]([0-9,]*[0-9]|))(.([0-9]{0,2})|)" in
|
||||
let full_str = Utf8.lexeme lexbuf in
|
||||
let only_numbers_str = String.trim (String.sub full_str 1 (String.length full_str - 1)) in
|
||||
@ -518,7 +513,7 @@ let rec lex_directive_args (lexbuf : lexbuf) : token =
|
||||
let s = Utf8.lexeme lexbuf in
|
||||
let i = String.index s '.' in
|
||||
AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
|
||||
| Compl (white_space | '@'), Star (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
|
||||
| Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
|
||||
| Plus hspace -> lex_directive_args lexbuf
|
||||
| '\n' | eof ->
|
||||
L.context := Law;
|
||||
@ -532,7 +527,7 @@ let rec lex_directive (lexbuf : lexbuf) : token =
|
||||
| Plus hspace -> lex_directive lexbuf
|
||||
| "Poczatek", Plus hspace, "metadanych" -> BEGIN_METADATA
|
||||
| "Koniec", Plus hspace, "metadanych" -> END_METADATA
|
||||
| "Include", Star hspace -> LAW_INCLUDE
|
||||
| "Include" -> LAW_INCLUDE
|
||||
| ":" ->
|
||||
L.context := Directive_args;
|
||||
COLON
|
||||
@ -542,7 +537,7 @@ let rec lex_directive (lexbuf : lexbuf) : token =
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||
|
||||
(** Main lexing function used outside code blocks *)
|
||||
and lex_law (lexbuf : lexbuf) : token =
|
||||
let lex_law (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
|
||||
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
|
||||
@ -553,13 +548,14 @@ and lex_law (lexbuf : lexbuf) : token =
|
||||
L.context := Code;
|
||||
Buffer.clear L.code_buffer;
|
||||
BEGIN_CODE
|
||||
| '>', Star hspace ->
|
||||
| '>' ->
|
||||
L.context := Directive;
|
||||
BEGIN_DIRECTIVE
|
||||
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
|
||||
L.get_law_heading lexbuf
|
||||
| _ -> (
|
||||
(* Nested match for lower priority; `_` matches length 0 *)
|
||||
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
|
||||
sub-match at the same point *)
|
||||
let lexbuf = lexbuf in
|
||||
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
|
||||
match%sedlex lexbuf with
|
||||
|
Loading…
Reference in New Issue
Block a user