Resync the three lexers

We'll need to factorise that better in the not-too-far future: there was already a discrepancy between en and fr; and this won't scale if we keep adding languages.
2024-11-08 07:51:43 +03:00 · 2021-08-18 18:20:56 +02:00 · 2021-08-18 18:20:56 +02:00 · 3b5c4c17cd
commit 3b5c4c17cd
parent b31bee71ad
3 changed files with 24 additions and 23 deletions
--- a/compiler/surface/lexer_en.ml
+++ b/compiler/surface/lexer_en.ml
@ -310,7 +310,7 @@ let rec lex_code (lexbuf : lexbuf) : token =
  | "day" ->
      L.update_acc lexbuf;
      DAY
-  | 0x24, Star white_space, digit, Star (digit | ','), Opt ('.', Rep (digit, 0 .. 2)) ->
+  | 0x24, Star hspace, digit, Star (digit | ','), Opt ('.', Rep (digit, 0 .. 2)) ->
      let extract_parts = R.regexp "([0-9]([0-9,]*[0-9]|))(.([0-9]{0,2})|)" in
      let full_str = Utf8.lexeme lexbuf in
      let only_numbers_str = String.trim (String.sub full_str 1 (String.length full_str - 1)) in
@ -552,7 +552,8 @@ let lex_law (lexbuf : lexbuf) : token =
    | Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
        L.get_law_heading lexbuf
    | _ -> (
-        (* Nested match for lower priority; `_` matches length 0 *)
+        (* Nested match for lower priority; `_` matches length 0 so we effectively retry the
+           sub-match at the same point *)
        let lexbuf = lexbuf in
        (* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
        match%sedlex lexbuf with
--- a/compiler/surface/lexer_fr.ml
+++ b/compiler/surface/lexer_fr.ml
@ -43,6 +43,8 @@ let token_list : (string * token) list =
    ("somme", SUM);
    ("rempli", FILLED);
    ("définition", DEFINITION);
+    ("étiquette", LABEL);
+    ("exception", EXCEPTION);
    ("égal à", DEFINED_AS);
    ("selon", MATCH);
    ("sous forme", WITH);
@ -69,12 +71,12 @@ let token_list : (string * token) list =
    ("ou", OR);
    ("ou bien", XOR);
    ("non", NOT);
-    ("nombre", CARDINAL);
    ("maximum", MAXIMUM);
    ("minimum", MINIMUM);
    ("filtre", FILTER);
    ("application", MAP);
    ("initial", INIT);
+    ("nombre", CARDINAL);
    ("an", YEAR);
    ("mois", MONTH);
    ("jour", DAY);
@ -83,6 +85,7 @@ let token_list : (string * token) list =
  ]
  @ L.token_list_language_agnostic

+(** Localised builtin functions *)
 let builtins : (string * Ast.builtin_expression) list =
  [
    ("entier_vers_décimal", Ast.IntToDec);
@ -108,7 +111,7 @@ let rec lex_code (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let prev_pos = lexing_positions lexbuf in
  match%sedlex lexbuf with
-  | white_space | '\n' ->
+  | white_space ->
      (* Whitespaces *)
      L.update_acc lexbuf;
      lex_code lexbuf
@ -280,9 +283,6 @@ let rec lex_code (lexbuf : lexbuf) : token =
  | "non" ->
      L.update_acc lexbuf;
      NOT
-  | "nombre" ->
-      L.update_acc lexbuf;
-      CARDINAL
  | "maximum" ->
      L.update_acc lexbuf;
      MAXIMUM
@ -298,6 +298,9 @@ let rec lex_code (lexbuf : lexbuf) : token =
  | "initial" ->
      L.update_acc lexbuf;
      INIT
+  | "nombre" ->
+      L.update_acc lexbuf;
+      CARDINAL
  | "vrai" ->
      L.update_acc lexbuf;
      TRUE
@ -313,7 +316,7 @@ let rec lex_code (lexbuf : lexbuf) : token =
  | "jour" ->
      L.update_acc lexbuf;
      DAY
-  | digit, Star (digit | white_space), Opt (',', Rep (digit, 0 .. 2)), Star white_space, 0x20AC ->
+  | digit, Star (digit | hspace), Opt (',', Rep (digit, 0 .. 2)), Star hspace, 0x20AC ->
      let extract_parts = R.regexp "([0-9]([0-9 ]*[0-9]|))(,([0-9]{0,2})|)" in
      let full_str = Utf8.lexeme lexbuf in
      let only_numbers_str = String.trim (String.sub full_str 0 (String.length full_str - 1)) in
@ -514,7 +517,7 @@ let rec lex_directive_args (lexbuf : lexbuf) : token =
      let s = Utf8.lexeme lexbuf in
      let i = String.index s '.' in
      AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
-  | Compl (white_space | '@'), Star (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
+  | Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
  | Plus hspace -> lex_directive_args lexbuf
  | '\n' | eof ->
      L.context := Law;
@ -529,7 +532,7 @@ let rec lex_directive (lexbuf : lexbuf) : token =
  | 'D', 0xE9, "but", Plus hspace, "m", 0xE9, "tadonn", 0xE9, "es" -> BEGIN_METADATA
  | "Fin", Plus hspace, "m", 0xE9, "tadonn", 0xE9, "es" -> END_METADATA
  | "Inclusion" -> LAW_INCLUDE
-  | ':', Star hspace ->
+  | ':' ->
      L.context := Directive_args;
      COLON
  | '\n' | eof ->
@ -549,13 +552,14 @@ let lex_law (lexbuf : lexbuf) : token =
        L.context := Code;
        Buffer.clear L.code_buffer;
        BEGIN_CODE
-    | '>', Star hspace ->
+    | '>' ->
        L.context := Directive;
        BEGIN_DIRECTIVE
    | Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
        L.get_law_heading lexbuf
    | _ -> (
-        (* Nested match for lower priority; `_` matches length 0 *)
+        (* Nested match for lower priority; `_` matches length 0 so we effectively retry the
+           sub-match at the same point *)
        let lexbuf = lexbuf in
        (* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
        match%sedlex lexbuf with
--- a/compiler/surface/lexer_pl.ml
+++ b/compiler/surface/lexer_pl.ml
@ -312,12 +312,7 @@ let rec lex_code (lexbuf : lexbuf) : token =
  | "dzien" ->
      L.update_acc lexbuf;
      DAY
-  | ( Star white_space,
-      digit,
-      Star (digit | ','),
-      Opt ('.', Rep (digit, 0 .. 2)),
-      Star white_space,
-      "PLN" ) ->
+  | digit, Star (digit | ','), Opt ('.', Rep (digit, 0 .. 2)), Star hspace, "PLN" ->
      let extract_parts = R.regexp "([0-9]([0-9,]*[0-9]|))(.([0-9]{0,2})|)" in
      let full_str = Utf8.lexeme lexbuf in
      let only_numbers_str = String.trim (String.sub full_str 1 (String.length full_str - 1)) in
@ -518,7 +513,7 @@ let rec lex_directive_args (lexbuf : lexbuf) : token =
      let s = Utf8.lexeme lexbuf in
      let i = String.index s '.' in
      AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
-  | Compl (white_space | '@'), Star (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
+  | Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
  | Plus hspace -> lex_directive_args lexbuf
  | '\n' | eof ->
      L.context := Law;
@ -532,7 +527,7 @@ let rec lex_directive (lexbuf : lexbuf) : token =
  | Plus hspace -> lex_directive lexbuf
  | "Poczatek", Plus hspace, "metadanych" -> BEGIN_METADATA
  | "Koniec", Plus hspace, "metadanych" -> END_METADATA
-  | "Include", Star hspace -> LAW_INCLUDE
+  | "Include" -> LAW_INCLUDE
  | ":" ->
      L.context := Directive_args;
      COLON
@ -542,7 +537,7 @@ let rec lex_directive (lexbuf : lexbuf) : token =
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme

 (** Main lexing function used outside code blocks *)
-and lex_law (lexbuf : lexbuf) : token =
+let lex_law (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
  let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
@ -553,13 +548,14 @@ and lex_law (lexbuf : lexbuf) : token =
        L.context := Code;
        Buffer.clear L.code_buffer;
        BEGIN_CODE
-    | '>', Star hspace ->
+    | '>' ->
        L.context := Directive;
        BEGIN_DIRECTIVE
    | Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
        L.get_law_heading lexbuf
    | _ -> (
-        (* Nested match for lower priority; `_` matches length 0 *)
+        (* Nested match for lower priority; `_` matches length 0 so we effectively retry the
+           sub-match at the same point *)
        let lexbuf = lexbuf in
        (* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
        match%sedlex lexbuf with