From 33ce233a29826fe6073bd2c8e0ced1d643576c59 Mon Sep 17 00:00:00 2001 From: Louis Gesbert Date: Thu, 16 May 2024 15:45:16 +0200 Subject: [PATCH] Fix lexing of verbatim blocks Catala doesn't interpret them at all, but it needs to refrain from interpreting its contents as markdown (titles, etc.) --- compiler/surface/lexer.cppo.ml | 32 ++++++++++++++++++++++++++++++- compiler/surface/lexer_common.ml | 2 +- compiler/surface/lexer_common.mli | 2 +- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/compiler/surface/lexer.cppo.ml b/compiler/surface/lexer.cppo.ml index 244de0a9..9e0ac90a 100644 --- a/compiler/surface/lexer.cppo.ml +++ b/compiler/surface/lexer.cppo.ml @@ -765,6 +765,31 @@ let rec lex_directive (lexbuf : lexbuf) : token = END_DIRECTIVE | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme +let lex_raw (lexbuf : lexbuf) : token = + let prev_lexeme = Utf8.lexeme lexbuf in + let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in + let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in + if at_bol then + match%sedlex lexbuf with + | eof -> EOF + | "```", Star hspace, ('\n' | eof) -> + L.context := Law; + LAW_TEXT (Utf8.lexeme lexbuf) + | _ -> ( + (* Nested match for lower priority; `_` matches length 0 so we effectively retry the + sub-match at the same point *) + let lexbuf = lexbuf in + (* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 + (fixed in 3.1) *) + match%sedlex lexbuf with + | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf) + | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme) + else + match%sedlex lexbuf with + | eof -> EOF + | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf) + | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme + (** Main lexing function used outside code blocks *) let lex_law (lexbuf : lexbuf) : token = let prev_lexeme = Utf8.lexeme lexbuf in @@ -781,6 +806,9 @@ let lex_law (lexbuf : lexbuf) : token = L.context := Code; Buffer.clear L.code_buffer; BEGIN_METADATA + | "```", Star (idchar | '-') -> + L.context := Raw; + LAW_TEXT (Utf8.lexeme lexbuf) | '>' -> L.context := Directive; BEGIN_DIRECTIVE @@ -790,7 +818,8 @@ let lex_law (lexbuf : lexbuf) : token = (* Nested match for lower priority; `_` matches length 0 so we effectively retry the sub-match at the same point *) let lexbuf = lexbuf in - (* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *) + (* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 + (fixed in 3.1) *) match%sedlex lexbuf with | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf) | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme) @@ -805,6 +834,7 @@ let lex_law (lexbuf : lexbuf) : token = let lexer (lexbuf : lexbuf) : token = match !L.context with | Law -> lex_law lexbuf + | Raw -> lex_raw lexbuf | Code -> lex_code lexbuf | Directive -> lex_directive lexbuf | Directive_args -> lex_directive_args lexbuf diff --git a/compiler/surface/lexer_common.ml b/compiler/surface/lexer_common.ml index 57281e93..166eea20 100644 --- a/compiler/surface/lexer_common.ml +++ b/compiler/surface/lexer_common.ml @@ -42,7 +42,7 @@ let get_law_heading (lexbuf : lexbuf) : token = let precedence = calc_precedence (String.trim (R.get_substring rex 1)) in LAW_HEADING (title, article_id, is_archive, precedence) -type lexing_context = Law | Code | Directive | Directive_args +type lexing_context = Law | Raw | Code | Directive | Directive_args (** Boolean reference, used by the lexer as the mutable state to distinguish whether it is lexing code or law. *) diff --git a/compiler/surface/lexer_common.mli b/compiler/surface/lexer_common.mli index d75c76af..d9273b80 100644 --- a/compiler/surface/lexer_common.mli +++ b/compiler/surface/lexer_common.mli @@ -17,7 +17,7 @@ (** Auxiliary functions used by all lexers. *) -type lexing_context = Law | Code | Directive | Directive_args +type lexing_context = Law | Raw | Code | Directive | Directive_args val context : lexing_context ref (** Reference, used by the lexer as the mutable state to distinguish whether it