Fix lexing of verbatim blocks

Catala doesn't interpret them at all, but it needs to refrain from interpreting its contents as markdown (titles, etc.)
2024-11-08 07:51:43 +03:00 · 2024-05-16 15:45:16 +02:00 · 2024-05-16 15:45:16 +02:00 · 33ce233a29
commit 33ce233a29
parent caf9135b4e
3 changed files with 33 additions and 3 deletions
--- a/compiler/surface/lexer.cppo.ml
+++ b/compiler/surface/lexer.cppo.ml
@ -765,6 +765,31 @@ let rec lex_directive (lexbuf : lexbuf) : token =
      END_DIRECTIVE
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme

+let lex_raw (lexbuf : lexbuf) : token =
+  let prev_lexeme = Utf8.lexeme lexbuf in
+  let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
+  let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
+  if at_bol then
+    match%sedlex lexbuf with
+    | eof -> EOF
+    | "```", Star hspace, ('\n' | eof) ->
+        L.context := Law;
+        LAW_TEXT (Utf8.lexeme lexbuf)
+    | _ -> (
+        (* Nested match for lower priority; `_` matches length 0 so we effectively retry the
+           sub-match at the same point *)
+        let lexbuf = lexbuf in
+        (* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12
+           (fixed in 3.1) *)
+        match%sedlex lexbuf with
+        | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
+        | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
+  else
+    match%sedlex lexbuf with
+    | eof -> EOF
+    | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
+    | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
+
 (** Main lexing function used outside code blocks *)
 let lex_law (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
@ -781,6 +806,9 @@ let lex_law (lexbuf : lexbuf) : token =
        L.context := Code;
        Buffer.clear L.code_buffer;
        BEGIN_METADATA
+    | "```", Star (idchar | '-') ->
+        L.context := Raw;
+        LAW_TEXT (Utf8.lexeme lexbuf)
    | '>' ->
        L.context := Directive;
        BEGIN_DIRECTIVE
@ -790,7 +818,8 @@ let lex_law (lexbuf : lexbuf) : token =
        (* Nested match for lower priority; `_` matches length 0 so we effectively retry the
           sub-match at the same point *)
        let lexbuf = lexbuf in
-        (* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
+        (* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12
+           (fixed in 3.1) *)
        match%sedlex lexbuf with
        | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
        | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
@ -805,6 +834,7 @@ let lex_law (lexbuf : lexbuf) : token =
 let lexer (lexbuf : lexbuf) : token =
  match !L.context with
  | Law -> lex_law lexbuf
+  | Raw -> lex_raw lexbuf
  | Code -> lex_code lexbuf
  | Directive -> lex_directive lexbuf
  | Directive_args -> lex_directive_args lexbuf
--- a/compiler/surface/lexer_common.ml
+++ b/compiler/surface/lexer_common.ml
@ -42,7 +42,7 @@ let get_law_heading (lexbuf : lexbuf) : token =
  let precedence = calc_precedence (String.trim (R.get_substring rex 1)) in
  LAW_HEADING (title, article_id, is_archive, precedence)

-type lexing_context = Law | Code | Directive | Directive_args
+type lexing_context = Law | Raw | Code | Directive | Directive_args

 (** Boolean reference, used by the lexer as the mutable state to distinguish
    whether it is lexing code or law. *)
--- a/compiler/surface/lexer_common.mli
+++ b/compiler/surface/lexer_common.mli
@ -17,7 +17,7 @@

 (** Auxiliary functions used by all lexers. *)

-type lexing_context = Law | Code | Directive | Directive_args
+type lexing_context = Law | Raw | Code | Directive | Directive_args

 val context : lexing_context ref
 (** Reference, used by the lexer as the mutable state to distinguish whether it