feat(lexer): improve grave accent lexing outside code blocks

2024-09-20 00:41:05 +03:00 · 2021-05-24 23:04:43 +02:00 · 2021-05-24 23:04:43 +02:00 · 866a29a643
commit 866a29a643
parent bcb2303b13
8 changed files with 72 additions and 32 deletions
--- a/src/catala/surface/lexer.ml
+++ b/src/catala/surface/lexer.ml
@ -532,6 +532,16 @@ let rec lex_code (lexbuf : lexbuf) : token =
 let lex_law (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let prev_pos = lexing_positions lexbuf in
+  let compl_catala =
+    [%sedlex.regexp?
+      ( Compl 'c'
+      | 'c', Compl 'a'
+      | "ca", Compl 't'
+      | "cat", Compl 'a'
+      | "cata", Compl 'l'
+      | "catal", Compl 'a'
+      | "catala", Compl (white_space | '\n') )]
+  in
  match%sedlex lexbuf with
  | "```catala" ->
      is_code := true;
@ -565,11 +575,9 @@ let lex_law (lexbuf : lexbuf) : token =
      ( Compl ('#' | '`' | '>')
      (* Following literals allow to match grave accents as long as they don't conflict with the
         [BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
-         be followed by a white space or a newline character. *)
+         not be followed by 'catala' (unless a non-blank character comes right after it). *)
      | Rep ('`', 1 .. 2), Compl '`'
-      | "```", (white_space | '\n')
-      (* @note (EmileRolley): for a more permisive constraint, [white_space] could be replaced by
-         [Compl 'c'] but it lacks consistency in my opinion. *) ) ->
+      | "```", compl_catala ) ->
      LAW_TEXT (Utf8.lexeme lexbuf)
  | _ -> raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme

--- a/src/catala/surface/lexer_en.ml
+++ b/src/catala/surface/lexer_en.ml
@ -492,6 +492,16 @@ let rec lex_code (lexbuf : lexbuf) : token =
 let lex_law (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let prev_pos = lexing_positions lexbuf in
+  let compl_catala =
+    [%sedlex.regexp?
+      ( Compl 'c'
+      | 'c', Compl 'a'
+      | "ca", Compl 't'
+      | "cat", Compl 'a'
+      | "cata", Compl 'l'
+      | "catal", Compl 'a'
+      | "catala", Compl (white_space | '\n') )]
+  in
  match%sedlex lexbuf with
  | "```catala" ->
      L.is_code := true;
@ -525,11 +535,9 @@ let lex_law (lexbuf : lexbuf) : token =
      ( Compl ('#' | '`' | '>')
      (* Following literals allow to match grave accents as long as they don't conflict with the
         [BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
-         be followed by a white space or a newline character. *)
+         not be followed by 'catala' (unless a non-blank character comes right after it). *)
      | Rep ('`', 1 .. 2), Compl '`'
-      | "```", (white_space | '\n')
-      (* @note (EmileRolley): for a more permisive constraint, [white_space] could be replaced by
-         [Compl 'c'] but it lacks consistency in my opinion. *) ) ->
+      | "```", compl_catala ) ->
      LAW_TEXT (Utf8.lexeme lexbuf)
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme

--- a/src/catala/surface/lexer_fr.ml
+++ b/src/catala/surface/lexer_fr.ml
@ -495,6 +495,16 @@ let rec lex_code (lexbuf : lexbuf) : token =
 let lex_law (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let prev_pos = lexing_positions lexbuf in
+  let compl_catala =
+    [%sedlex.regexp?
+      ( Compl 'c'
+      | 'c', Compl 'a'
+      | "ca", Compl 't'
+      | "cat", Compl 'a'
+      | "cata", Compl 'l'
+      | "catal", Compl 'a'
+      | "catala", Compl (white_space | '\n') )]
+  in
  match%sedlex lexbuf with
  | "```catala" ->
      L.is_code := true;
@ -532,11 +542,9 @@ let lex_law (lexbuf : lexbuf) : token =
      ( Compl ('#' | '`' | '>')
      (* Following literals allow to match grave accents as long as they don't conflict with the
         [BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
-         be followed by a white space or a newline character. *)
+         not be followed by 'catala' (unless a non-blank character comes right after it). *)
      | Rep ('`', 1 .. 2), Compl '`'
-      | "```", (white_space | '\n')
-      (* @note (EmileRolley): for a more permisive constraint, [white_space] could be replaced by
-         [Compl 'c'] but it lacks consistency in my opinion. *) ) ->
+      | "```", compl_catala ) ->
      LAW_TEXT (Utf8.lexeme lexbuf)
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme

--- a/src/catala/surface/lexer_pl.ml
+++ b/src/catala/surface/lexer_pl.ml
@ -499,6 +499,16 @@ let rec lex_code (lexbuf : lexbuf) : token =
 let lex_law (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let prev_pos = lexing_positions lexbuf in
+  let compl_catala =
+    [%sedlex.regexp?
+      ( Compl 'c'
+      | 'c', Compl 'a'
+      | "ca", Compl 't'
+      | "cat", Compl 'a'
+      | "cata", Compl 'l'
+      | "catal", Compl 'a'
+      | "catala", Compl (white_space | '\n') )]
+  in
  match%sedlex lexbuf with
  | "```catala" ->
      L.is_code := true;
@ -533,11 +543,9 @@ let lex_law (lexbuf : lexbuf) : token =
      ( Compl ('#' | '`' | '>')
      (* Following literals allow to match grave accents as long as they don't conflict with the
         [BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
-         be followed by a white space or a newline character. *)
+         not be followed by 'catala' (unless a non-blank character comes right after it). *)
      | Rep ('`', 1 .. 2), Compl '`'
-      | "```", (white_space | '\n')
-      (* @note (EmileRolley): for a more permisive constraint, [white_space] could be replaced by
-         [Compl 'c'] but it lacks consistency in my opinion. *) ) ->
+      | "```", compl_catala ) ->
      LAW_TEXT (Utf8.lexeme lexbuf)
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme

--- a/tests/test_literate/good/test_grave_char.catala
+++ b/tests/test_literate/good/test_grave_char.catala
@ -9,12 +9,10 @@ This allows to:

 * use Fenced Code Blocks:

-```
+```ocaml
 let () = print_endline "Hello world!"
 ```

-**Remark**: for Fenced Code Blocks, ``` must be followed by a space or a new line.
-
 ```catala
 new scope A:
  param literate_parsing_is_ok content bool
@ -23,4 +21,8 @@ scope A:
  def literate_parsing_is_ok := true
 ```

-Even after `Catala` code block.
+Even after a `Catala` code block:
+
+```c
+int main(void) { return 0; }
+```
--- a/tests/test_literate/good/test_grave_char.catala_en
+++ b/tests/test_literate/good/test_grave_char.catala_en
@ -9,12 +9,10 @@ This allows to:

 * use Fenced Code Blocks:

-```
+```ocaml
 let () = print_endline "Hello world!"
 ```

-**Remark**: for Fenced Code Blocks, ``` must be followed by a space or a new line.
-
 ```catala
 declaration scope A:
  context literate_parsing_is_ok content boolean
@ -23,4 +21,8 @@ scope A:
  definition literate_parsing_is_ok equals true
 ```

-Even after `Catala` code block.
+Even after a `Catala` code block:
+
+```c
+int main(void) { return 0; }
+```
--- a/tests/test_literate/good/test_grave_char.catala_fr
+++ b/tests/test_literate/good/test_grave_char.catala_fr
@ -9,12 +9,10 @@ This allows to:

 * use Fenced Code Blocks:

-```
+```ocaml
 let () = print_endline "Hello world!"
 ```

-**Remark**: for Fenced Code Blocks, ``` must be followed by a space or a new line.
-
 ```catala
 déclaration champ d'application A:
  contexte literate_parsing_is_ok contenu booléen
@ -23,4 +21,8 @@ champ d'application A:
  définition literate_parsing_is_ok égal à vrai
 ```

-Even after `Catala` code block.
+Even after a `Catala` code block:
+
+```c
+int main(void) { return 0; }
+```
--- a/tests/test_literate/good/test_grave_char.catala_pl
+++ b/tests/test_literate/good/test_grave_char.catala_pl
@ -9,12 +9,10 @@ This allows to:

 * use Fenced Code Blocks:

-```
+```ocaml
 let () = print_endline "Hello world!"
 ```

-**Remark**: for Fenced Code Blocks, ``` must be followed by a space or a new line.
-
 ```catala
 deklaracja zakres A:
  kontekst literate_parsing_is_ok typu zerojedynkowy
@ -23,4 +21,8 @@ zakres A:
  definicja literate_parsing_is_ok wynosi prawda
 ```

-Even after `Catala` code block.
+Even after a `Catala` code block:
+
+```c
+int main(void) { return 0; }
+```