Fix lexing of verbatim blocks

Catala doesn't interpret them at all, but it needs to refrain from interpreting
its contents as markdown (titles, etc.)
This commit is contained in:
Louis Gesbert 2024-05-16 15:45:16 +02:00
parent caf9135b4e
commit 33ce233a29
3 changed files with 33 additions and 3 deletions

View File

@ -765,6 +765,31 @@ let rec lex_directive (lexbuf : lexbuf) : token =
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let lex_raw (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
if at_bol then
match%sedlex lexbuf with
| eof -> EOF
| "```", Star hspace, ('\n' | eof) ->
L.context := Law;
LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> (
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
sub-match at the same point *)
let lexbuf = lexbuf in
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12
(fixed in 3.1) *)
match%sedlex lexbuf with
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
else
match%sedlex lexbuf with
| eof -> EOF
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Main lexing function used outside code blocks *)
let lex_law (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
@ -781,6 +806,9 @@ let lex_law (lexbuf : lexbuf) : token =
L.context := Code;
Buffer.clear L.code_buffer;
BEGIN_METADATA
| "```", Star (idchar | '-') ->
L.context := Raw;
LAW_TEXT (Utf8.lexeme lexbuf)
| '>' ->
L.context := Directive;
BEGIN_DIRECTIVE
@ -790,7 +818,8 @@ let lex_law (lexbuf : lexbuf) : token =
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
sub-match at the same point *)
let lexbuf = lexbuf in
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12
(fixed in 3.1) *)
match%sedlex lexbuf with
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
@ -805,6 +834,7 @@ let lex_law (lexbuf : lexbuf) : token =
let lexer (lexbuf : lexbuf) : token =
match !L.context with
| Law -> lex_law lexbuf
| Raw -> lex_raw lexbuf
| Code -> lex_code lexbuf
| Directive -> lex_directive lexbuf
| Directive_args -> lex_directive_args lexbuf

View File

@ -42,7 +42,7 @@ let get_law_heading (lexbuf : lexbuf) : token =
let precedence = calc_precedence (String.trim (R.get_substring rex 1)) in
LAW_HEADING (title, article_id, is_archive, precedence)
type lexing_context = Law | Code | Directive | Directive_args
type lexing_context = Law | Raw | Code | Directive | Directive_args
(** Boolean reference, used by the lexer as the mutable state to distinguish
whether it is lexing code or law. *)

View File

@ -17,7 +17,7 @@
(** Auxiliary functions used by all lexers. *)
type lexing_context = Law | Code | Directive | Directive_args
type lexing_context = Law | Raw | Code | Directive | Directive_args
val context : lexing_context ref
(** Reference, used by the lexer as the mutable state to distinguish whether it