Merge pull request #138 from AltGr/lexer-refactor

Clean up the lexer and refactor for more generic directives
Denis Merigoux 2021-08-19 12:12:53 +02:00 committed by GitHub
commit 7cbb4a9149
11 changed files with 512 additions and 506 deletions

View File

@ -55,4 +55,5 @@ jobs:
- name: Make all
run: |
eval $(opam env)
export OCAMLRUNPARAM=b
make all
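
For context (not part of this diff): the "b" flag in OCAMLRUNPARAM turns on exception backtrace recording in the OCaml runtime, so if make all dies with an uncaught exception the CI log shows where it was raised. A minimal illustration of the same switch done programmatically:

(* Equivalent, in code, to running with OCAMLRUNPARAM=b: once backtraces are
   recorded, any uncaught exception is reported with its raise site. *)
let () =
  Printexc.record_backtrace true;
  failwith "demo: this failure now prints a backtrace"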

View File

@ -21,7 +21,7 @@ dependencies-ocaml:
ocamlformat ANSITerminal sedlex menhir menhirLib dune cmdliner obelisk \
re obelisk unionfind bindlib zarith.1.11 zarith_stubs_js.v0.14.0 ocamlgraph \
js_of_ocaml-compiler js_of_ocaml js_of_ocaml-ppx calendar camomile \
visitors benchmark
visitors benchmark odoc
dependencies-js:
$(MAKE) -C $(FRENCH_LAW_JS_LIB_DIR) dependencies

View File

@ -21,7 +21,7 @@
(menhir
(modules tokens parser)
(merge_into parser)
(flags --external-tokens Tokens --table))
(flags --external-tokens Tokens --table --explain))
(documentation
(package catala)

View File

@ -35,17 +35,19 @@ let get_law_heading (lexbuf : lexbuf) : token =
let precedence = calc_precedence (String.trim (get_substring 1)) in
LAW_HEADING (title, article_id, article_expiration_date, precedence)
type lexing_context = Law | Code | Directive | Directive_args
(** Boolean reference, used by the lexer as the mutable state to distinguish whether it is lexing
code or law. *)
let is_code : bool ref = ref false
let context : lexing_context ref = ref Law
(** Mutable string reference that accumulates the string representation of the body of code being
lexed. This string representation is used in the literate programming backends to faithfully
capture the spacing pattern of the original program *)
let code_string_acc : string ref = ref ""
let code_buffer : Buffer.t = Buffer.create 4000
(** Updates {!val:code_string_acc} with the current lexeme *)
let update_acc (lexbuf : lexbuf) : unit = code_string_acc := !code_string_acc ^ Utf8.lexeme lexbuf
(** Updates {!val:code_buffer} with the current lexeme *)
let update_acc (lexbuf : lexbuf) : unit = Buffer.add_string code_buffer (Utf8.lexeme lexbuf)
(** Error-generating helper *)
let raise_lexer_error (loc : Pos.t) (token : string) =
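
Not part of the diff, just the rationale made concrete: replacing the string ref that was re-concatenated on every lexeme with a Buffer.t avoids quadratic copying, since Buffer.add_string appends in amortized constant time while acc := !acc ^ lexeme copies the whole accumulated code each time. A minimal sketch contrasting the two patterns:

(* Old pattern: every append copies the whole accumulator, O(n^2) overall. *)
let accumulate_with_string (lexemes : string list) : string =
  let acc = ref "" in
  List.iter (fun l -> acc := !acc ^ l) lexemes;
  !acc

(* New pattern: the buffer grows in place, amortized O(1) per append. *)
let accumulate_with_buffer (lexemes : string list) : string =
  let buf = Buffer.create 4000 in
  List.iter (Buffer.add_string buf) lexemes;
  Buffer.contents buf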

View File

@ -14,17 +14,19 @@
(** Auxiliary functions used by all lexers. *)
val is_code : bool ref
(** Boolean reference, used by the lexer as the mutable state to distinguish whether it is lexing
code or law. *)
type lexing_context = Law | Code | Directive | Directive_args
val code_string_acc : string ref
(** Mutable string reference that accumulates the string representation of the body of code being
lexed. This string representation is used in the literate programming backends to faithfully
capture the spacing pattern of the original program *)
val context : lexing_context ref
(** Reference, used by the lexer as the mutable state to distinguish whether it is lexing code or
law. *)
val code_buffer : Buffer.t
(** Buffer that accumulates the string representation of the body of code being lexed. This string
representation is used in the literate programming backends to faithfully capture the spacing
pattern of the original program *)
val update_acc : Sedlexing.lexbuf -> unit
(** Updates {!val:code_string_acc} with the current lexeme *)
(** Updates {!val:code_buffer} with the current lexeme *)
val raise_lexer_error : Utils.Pos.t -> string -> 'a
(** Error-generating helper *)
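
A hedged sketch (not from the diff) of how a driver can use this interface: reset the shared state, then pull tokens until EOF. The module paths Lexer_common and Tokens, and the idea of lexing from a string, are assumptions for illustration only.

open Tokens (* assumed: the parser's token type, with an EOF constructor *)

let tokenize_string (lexer : Sedlexing.lexbuf -> token) (src : string) : token list =
  Lexer_common.context := Lexer_common.Law;   (* every file starts in law mode *)
  Buffer.clear Lexer_common.code_buffer;      (* drop any stale code text *)
  let lexbuf = Sedlexing.Utf8.from_string src in
  let rec loop acc =
    match lexer lexbuf with
    | EOF -> List.rev (EOF :: acc)
    | tok -> loop (tok :: acc)
  in
  loop []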

View File

@ -103,6 +103,9 @@ let digit = [%sedlex.regexp? '0' .. '9']
(** Regexp matching at least one space. *)
let space_plus = [%sedlex.regexp? Plus white_space]
(** Regexp matching white space but not newlines *)
let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
(** Main lexing function used in code blocks *)
let rec lex_code (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
@ -118,8 +121,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
lex_code lexbuf
| "```" ->
(* End of code section *)
L.is_code := false;
END_CODE !L.code_string_acc
L.context := Law;
END_CODE (Buffer.contents L.code_buffer)
| "scope" ->
L.update_acc lexbuf;
SCOPE
@ -307,7 +310,7 @@ let rec lex_code (lexbuf : lexbuf) : token =
| "day" ->
L.update_acc lexbuf;
DAY
| 0x24, Star white_space, digit, Star (digit | ','), Opt ('.', Rep (digit, 0 .. 2)) ->
| 0x24, Star hspace, digit, Star (digit | ','), Opt ('.', Rep (digit, 0 .. 2)) ->
let extract_parts = R.regexp "([0-9]([0-9,]*[0-9]|))(.([0-9]{0,2})|)" in
let full_str = Utf8.lexeme lexbuf in
let only_numbers_str = String.trim (String.sub full_str 1 (String.length full_str - 1)) in
@ -500,60 +503,73 @@ let rec lex_code (lexbuf : lexbuf) : token =
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let rec lex_directive_args (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| '@', Star hspace, "p.", Star hspace, Plus digit ->
let s = Utf8.lexeme lexbuf in
let i = String.index s '.' in
AT_PAGE (int_of_string (String.trim (String.sub s (i + 1) (String.length s - i - 1))))
| Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
| Plus hspace -> lex_directive_args lexbuf
| '\n' | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let rec lex_directive (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| Plus hspace -> lex_directive lexbuf
| "Begin", Plus hspace, "metadata" -> BEGIN_METADATA
| "End", Plus hspace, "metadata" -> END_METADATA
| "Include" -> LAW_INCLUDE
| ":" ->
L.context := Directive_args;
COLON
| '\n' | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Main lexing function used outside code blocks *)
let lex_law (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
let compl_catala =
[%sedlex.regexp?
( Compl 'c'
| 'c', Compl 'a'
| "ca", Compl 't'
| "cat", Compl 'a'
| "cata", Compl 'l'
| "catal", Compl 'a'
| "catala", Compl (white_space | '\n') )]
in
match%sedlex lexbuf with
| "```catala" ->
L.is_code := true;
L.code_string_acc := "";
BEGIN_CODE
| eof -> EOF
| '>', Star white_space, "Begin metadata" -> BEGIN_METADATA
| '>', Star white_space, "End metadata" -> END_METADATA
| ( '>',
Star white_space,
"Include:",
Star white_space,
Plus (Compl ('@' | '\n')),
Star white_space,
Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
'\n' ) ->
let extract_components =
R.regexp ">\\s*Include\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
in
let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
let name = get_component 1 in
let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
let pos = lexing_positions lexbuf in
if Filename.extension name = ".pdf" then
LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
| Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
L.get_law_heading lexbuf
| Plus
(* Match non-special characters, i.e. characters that doesn't appear at the start of a
previous regexp. *)
( Compl ('#' | '`' | '>')
(* Following literals allow to match grave accents as long as they don't conflict with the
[BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
not be followed by 'catala'. *)
| Rep ('`', 1 .. 2), Compl '`'
| "```", compl_catala ) ->
LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
if at_bol then
match%sedlex lexbuf with
| eof -> EOF
| "```catala", Plus white_space ->
L.context := Code;
Buffer.clear L.code_buffer;
BEGIN_CODE
| '>' ->
L.context := Directive;
BEGIN_DIRECTIVE
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
L.get_law_heading lexbuf
| _ -> (
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
sub-match at the same point *)
let lexbuf = lexbuf in
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
match%sedlex lexbuf with
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
else
match%sedlex lexbuf with
| eof -> EOF
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
Surface.Lexer_common.is_code}. *)
let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
let lexer (lexbuf : lexbuf) : token =
match !L.context with
| Law -> lex_law lexbuf
| Code -> lex_code lexbuf
| Directive -> lex_directive lexbuf
| Directive_args -> lex_directive_args lexbuf
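
The rewritten lex_law only treats code fences, '>' directives and '#' headings specially when they start a line; anything else up to the newline becomes LAW_TEXT. The column-0 test above compares two fields of Lexing.position, as in this minimal illustration (not code from the diff):

(* pos_cnum is the absolute offset of the position and pos_bol the offset of
   the first character of its line, so they coincide exactly in column 0. *)
let at_beginning_of_line (pos : Lexing.position) : bool =
  pos.Lexing.pos_cnum = pos.Lexing.pos_bol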

View File

@ -43,6 +43,8 @@ let token_list : (string * token) list =
("somme", SUM);
("rempli", FILLED);
("définition", DEFINITION);
("étiquette", LABEL);
("exception", EXCEPTION);
("égal à", DEFINED_AS);
("selon", MATCH);
("sous forme", WITH);
@ -69,12 +71,12 @@ let token_list : (string * token) list =
("ou", OR);
("ou bien", XOR);
("non", NOT);
("nombre", CARDINAL);
("maximum", MAXIMUM);
("minimum", MINIMUM);
("filtre", FILTER);
("application", MAP);
("initial", INIT);
("nombre", CARDINAL);
("an", YEAR);
("mois", MONTH);
("jour", DAY);
@ -83,6 +85,7 @@ let token_list : (string * token) list =
]
@ L.token_list_language_agnostic
(** Localised builtin functions *)
let builtins : (string * Ast.builtin_expression) list =
[
("entier_vers_décimal", Ast.IntToDec);
@ -100,12 +103,15 @@ let digit = [%sedlex.regexp? '0' .. '9']
(** Regexp matching at least one space. *)
let space_plus = [%sedlex.regexp? Plus white_space]
(** Regexp matching white space but not newlines *)
let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
(** Main lexing function used in code blocks *)
let rec lex_code (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| white_space | '\n' ->
| white_space ->
(* Whitespaces *)
L.update_acc lexbuf;
lex_code lexbuf
@ -115,8 +121,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
lex_code lexbuf
| "```" ->
(* End of code section *)
L.is_code := false;
END_CODE !L.code_string_acc
L.context := Law;
END_CODE (Buffer.contents L.code_buffer)
| "champ", space_plus, "d\'application" ->
L.update_acc lexbuf;
SCOPE
@ -277,9 +283,6 @@ let rec lex_code (lexbuf : lexbuf) : token =
| "non" ->
L.update_acc lexbuf;
NOT
| "nombre" ->
L.update_acc lexbuf;
CARDINAL
| "maximum" ->
L.update_acc lexbuf;
MAXIMUM
@ -295,6 +298,9 @@ let rec lex_code (lexbuf : lexbuf) : token =
| "initial" ->
L.update_acc lexbuf;
INIT
| "nombre" ->
L.update_acc lexbuf;
CARDINAL
| "vrai" ->
L.update_acc lexbuf;
TRUE
@ -310,11 +316,10 @@ let rec lex_code (lexbuf : lexbuf) : token =
| "jour" ->
L.update_acc lexbuf;
DAY
| digit, Star (digit | white_space), Opt (',', Rep (digit, 0 .. 2)), Star white_space, 0x20AC ->
| digit, Star (digit | hspace), Opt (',', Rep (digit, 0 .. 2)), Star hspace, 0x20AC ->
let extract_parts = R.regexp "([0-9]([0-9 ]*[0-9]|))(,([0-9]{0,2})|)" in
let full_str = Utf8.lexeme lexbuf in
let only_numbers_str = String.trim (String.sub full_str 0 (String.length full_str - 1)) in
let parts = R.get_substring (R.exec ~rex:extract_parts only_numbers_str) in
let str = Utf8.lexeme lexbuf in
let parts = R.get_substring (R.exec ~rex:extract_parts str) in
(* Integer literal*)
let units = parts 1 in
let remove_spaces = R.regexp " " in
@ -503,64 +508,73 @@ let rec lex_code (lexbuf : lexbuf) : token =
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let rec lex_directive_args (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| '@', Star hspace, "p.", Star hspace, Plus digit ->
let s = Utf8.lexeme lexbuf in
let i = String.index s '.' in
AT_PAGE (int_of_string (String.trim (String.sub s (i + 1) (String.length s - i - 1))))
| Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
| Plus hspace -> lex_directive_args lexbuf
| '\n' | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let rec lex_directive (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| Plus hspace -> lex_directive lexbuf
| 'D', 0xE9, "but", Plus hspace, "m", 0xE9, "tadonn", 0xE9, "es" -> BEGIN_METADATA
| "Fin", Plus hspace, "m", 0xE9, "tadonn", 0xE9, "es" -> END_METADATA
| "Inclusion" -> LAW_INCLUDE
| ':' ->
L.context := Directive_args;
COLON
| '\n' | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Main lexing function used outside code blocks *)
let lex_law (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
let compl_catala =
[%sedlex.regexp?
( Compl 'c'
| 'c', Compl 'a'
| "ca", Compl 't'
| "cat", Compl 'a'
| "cata", Compl 'l'
| "catal", Compl 'a'
| "catala", Compl (white_space | '\n') )]
in
match%sedlex lexbuf with
| "```catala" ->
L.is_code := true;
L.code_string_acc := "";
BEGIN_CODE
| eof -> EOF
| '>', Star white_space, 'D', 0xE9, "but m", 0xE9, "tadonn", 0xE9, "es" -> BEGIN_METADATA
| '>', Star white_space, "Fin m", 0xE9, "tadonn", 0xE9, "es" -> END_METADATA
| ( '>',
Star white_space,
"Inclusion:",
Star white_space,
Plus (Compl ('@' | '\n')),
Star white_space,
Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
'\n' ) ->
let extract_components =
R.regexp ">\\s*Inclusion\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
in
let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
let jorftext = R.regexp "JORFTEXT\\d{12}" in
let name = get_component 1 in
let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
let pos = lexing_positions lexbuf in
if R.pmatch ~rex:jorftext name then
LAW_INCLUDE (Ast.LegislativeText (name, Pos.from_lpos pos))
else if Filename.extension name = ".pdf" then
LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
| Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
L.get_law_heading lexbuf
| Plus
(* Match non-special characters, i.e. characters that doesn't appear at the start of a
previous regexp. *)
( Compl ('#' | '`' | '>')
(* Following literals allow to match grave accents as long as they don't conflict with the
[BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
not be followed by 'catala'. *)
| Rep ('`', 1 .. 2), Compl '`'
| "```", compl_catala ) ->
LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
if at_bol then
match%sedlex lexbuf with
| eof -> EOF
| "```catala", Plus white_space ->
L.context := Code;
Buffer.clear L.code_buffer;
BEGIN_CODE
| '>' ->
L.context := Directive;
BEGIN_DIRECTIVE
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
L.get_law_heading lexbuf
| _ -> (
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
sub-match at the same point *)
let lexbuf = lexbuf in
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
match%sedlex lexbuf with
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
else
match%sedlex lexbuf with
| eof -> EOF
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
Surface.Lexer_common.is_code}. *)
let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
let lexer (lexbuf : lexbuf) : token =
match !L.context with
| Law -> lex_law lexbuf
| Code -> lex_code lexbuf
| Directive -> lex_directive lexbuf
| Directive_args -> lex_directive_args lexbuf
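
The euro-amount rule now hands the whole lexeme to the regexp instead of trimming the currency sign by hand. A standalone sketch of that extraction, using the same Re.Pcre calls as the lexer (R is Re.Pcre in these files); the function name is hypothetical:

(* Split a French amount such as "12 345,67" into units and cents: group 1
   captures the digits (possibly with spaces), group 4 the optional cents. *)
let parse_french_amount (str : string) : string * string =
  let extract_parts = Re.Pcre.regexp "([0-9]([0-9 ]*[0-9]|))(,([0-9]{0,2})|)" in
  let parts = Re.Pcre.get_substring (Re.Pcre.exec ~rex:extract_parts str) in
  let units =
    Re.Pcre.substitute ~rex:(Re.Pcre.regexp " ") ~subst:(fun _ -> "") (parts 1)
  in
  let cents = try parts 4 with Not_found -> "0" in
  (units, cents)

(* parse_french_amount "12 345,67 €" = ("12345", "67") *)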

View File

@ -105,6 +105,9 @@ let digit = [%sedlex.regexp? '0' .. '9']
(** Regexp matching at least one space. *)
let space_plus = [%sedlex.regexp? Plus white_space]
(** Regexp matching white space but not newlines *)
let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
(** Main lexing function used in code blocks *)
let rec lex_code (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
@ -120,8 +123,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
lex_code lexbuf
| "```" ->
(* End of code section *)
L.is_code := false;
END_CODE !L.code_string_acc
L.context := Law;
END_CODE (Buffer.contents L.code_buffer)
| "zakres" ->
L.update_acc lexbuf;
SCOPE
@ -309,16 +312,10 @@ let rec lex_code (lexbuf : lexbuf) : token =
| "dzien" ->
L.update_acc lexbuf;
DAY
| ( Star white_space,
digit,
Star (digit | ','),
Opt ('.', Rep (digit, 0 .. 2)),
Star white_space,
"PLN" ) ->
| digit, Star (digit | ','), Opt ('.', Rep (digit, 0 .. 2)), Star hspace, "PLN" ->
let extract_parts = R.regexp "([0-9]([0-9,]*[0-9]|))(.([0-9]{0,2})|)" in
let full_str = Utf8.lexeme lexbuf in
let only_numbers_str = String.trim (String.sub full_str 1 (String.length full_str - 1)) in
let parts = R.get_substring (R.exec ~rex:extract_parts only_numbers_str) in
let str = Utf8.lexeme lexbuf in
let parts = R.get_substring (R.exec ~rex:extract_parts str) in
(* Integer literal*)
let units = parts 1 in
let remove_commas = R.regexp "," in
@ -507,61 +504,73 @@ let rec lex_code (lexbuf : lexbuf) : token =
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let rec lex_directive_args (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| '@', Star hspace, "p.", Star hspace, Plus digit ->
let s = Utf8.lexeme lexbuf in
let i = String.index s '.' in
AT_PAGE (int_of_string (String.trim (String.sub s (i + 1) (String.length s - i - 1))))
| Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
| Plus hspace -> lex_directive_args lexbuf
| '\n' | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let rec lex_directive (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| Plus hspace -> lex_directive lexbuf
| "Poczatek", Plus hspace, "metadanych" -> BEGIN_METADATA
| "Koniec", Plus hspace, "metadanych" -> END_METADATA
| "Include" -> LAW_INCLUDE
| ":" ->
L.context := Directive_args;
COLON
| '\n' | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Main lexing function used outside code blocks *)
let lex_law (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
let compl_catala =
[%sedlex.regexp?
( Compl 'c'
| 'c', Compl 'a'
| "ca", Compl 't'
| "cat", Compl 'a'
| "cata", Compl 'l'
| "catal", Compl 'a'
| "catala", Compl (white_space | '\n') )]
in
match%sedlex lexbuf with
| "```catala" ->
L.is_code := true;
L.code_string_acc := "";
BEGIN_CODE
| eof -> EOF
| '>', Star white_space, "Poczatek metadanych" -> BEGIN_METADATA
| '>', Star white_space, "Koniec metadanych" -> END_METADATA
| ( '>',
Star white_space,
"Include:",
Star white_space,
Plus (Compl ('@' | '\n')),
Star white_space,
Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
'\n' ) ->
let extract_components =
R.regexp ">\\s*Include\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
in
let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
let name = get_component 1 in
let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
let pos = lexing_positions lexbuf in
if Filename.extension name = ".pdf" then
LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
| Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
L.get_law_heading lexbuf
| Plus
(* Match non-special characters, i.e. characters that doesn't appear at the start of a
previous regexp. *)
( Compl ('#' | '`' | '>')
(* Following literals allow to match grave accents as long as they don't conflict with the
[BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
not be followed by 'catala'. *)
| Rep ('`', 1 .. 2), Compl '`'
| "```", compl_catala ) ->
LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
if at_bol then
match%sedlex lexbuf with
| eof -> EOF
| "```catala", Plus white_space ->
L.context := Code;
Buffer.clear L.code_buffer;
BEGIN_CODE
| '>' ->
L.context := Directive;
BEGIN_DIRECTIVE
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
L.get_law_heading lexbuf
| _ -> (
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
sub-match at the same point *)
let lexbuf = lexbuf in
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
match%sedlex lexbuf with
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
else
match%sedlex lexbuf with
| eof -> EOF
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
Surface.Lexer_common.is_code}. *)
let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
let lexer (lexbuf : lexbuf) : token =
match !L.context with
| Law -> lex_law lexbuf
| Code -> lex_code lexbuf
| Directive -> lex_directive lexbuf
| Directive_args -> lex_directive_args lexbuf
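
In all three lexers the page reference of a directive is lexed as one lexeme such as "@ p. 12", and only the digits after the dot are kept (hence the i + 1 offset that skips the dot itself). A standalone sketch of that extraction (not from the diff; the function name is hypothetical):

let parse_at_page (s : string) : int =
  (* s is e.g. "@ p. 12"; take everything after the '.' and parse the digits *)
  let i = String.index s '.' in
  int_of_string (String.trim (String.sub s (i + 1) (String.length s - i - 1)))

(* parse_at_page "@ p. 12" = 12 *)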

File diff suppressed because it is too large

View File

@ -29,6 +29,10 @@ end>
%start source_file
(* The token is returned for every line of law text, make them right-associative
so that we concat them efficiently as much as possible. *)
%right LAW_TEXT
%%
typ_base:
@ -550,7 +554,7 @@ code:
| code = list(code_item) { (code, Pos.from_lpos $sloc) }
metadata_block:
| BEGIN_METADATA option(law_text) BEGIN_CODE code_and_pos = code text = END_CODE option(law_text) END_METADATA {
| BEGIN_DIRECTIVE BEGIN_METADATA END_DIRECTIVE option(law_text) BEGIN_CODE code_and_pos = code text = END_CODE option(law_text) BEGIN_DIRECTIVE END_METADATA END_DIRECTIVE {
let (code, pos) = code_and_pos in
(code, (text, pos))
}
@ -567,7 +571,7 @@ law_heading:
}
law_text:
| text = LAW_TEXT { String.trim text }
| lines = nonempty_list(LAW_TEXT) { String.trim (String.concat "" lines) }
source_file_item:
| text = law_text { LawText text }
@ -582,8 +586,16 @@ source_file_item:
let (code, source_repr) = code in
CodeBlock (code, source_repr, true)
}
| includ = LAW_INCLUDE {
LawInclude includ
| BEGIN_DIRECTIVE LAW_INCLUDE COLON args = nonempty_list(DIRECTIVE_ARG) page = option(AT_PAGE) END_DIRECTIVE {
let filename = String.trim (String.concat "" args) in
let pos = Pos.from_lpos $sloc in
let jorftext = Re.Pcre.regexp "JORFTEXT\\d{12}" in
if Re.Pcre.pmatch ~rex:jorftext filename && page = None then
LawInclude (Ast.LegislativeText (filename, pos))
else if Filename.extension filename = ".pdf" || page <> None then
LawInclude (Ast.PdfFile ((filename, pos), page))
else
LawInclude (Ast.CatalaFile (filename, pos))
}
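
The include directive is now assembled in the parser from generic directive tokens instead of being special-cased in every lexer. A minimal standalone restatement of the classification above, with hypothetical names, convenient for checking the intended behaviour:

type include_kind = Legislative | Pdf of int option | Catala

let classify_include (filename : string) (page : int option) : include_kind =
  let jorftext = Re.Pcre.regexp "JORFTEXT\\d{12}" in
  if Re.Pcre.pmatch ~rex:jorftext filename && page = None then Legislative
  else if Filename.extension filename = ".pdf" || page <> None then Pdf page
  else Catala

(* classify_include "JORFTEXT000033971124" None  = Legislative
   classify_include "CGI.pdf" (Some 42)          = Pdf (Some 42)
   classify_include "tutorial_en.catala_en" None = Catala *)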

View File

@ -23,7 +23,11 @@
%token EOF
%token<string * string option * string option * int> LAW_HEADING
%token<Ast.law_include> LAW_INCLUDE
%token BEGIN_DIRECTIVE END_DIRECTIVE LAW_INCLUDE
%token<int> AT_PAGE
%token<string> DIRECTIVE_ARG
%token<string> LAW_TEXT
%token<string> CONSTRUCTOR IDENT
%token<string> END_CODE