Cleanup the lexer, and refactor for more generic directives

2024-11-09 22:16:10 +03:00 · 2021-08-17 15:49:48 +02:00 · 2021-08-17 15:49:48 +02:00 · b31bee71ad
commit b31bee71ad
parent 0659816782
8 changed files with 486 additions and 484 deletions
--- a/compiler/surface/lexer_common.ml
+++ b/compiler/surface/lexer_common.ml
@ -35,17 +35,19 @@ let get_law_heading (lexbuf : lexbuf) : token =
  let precedence = calc_precedence (String.trim (get_substring 1)) in
  LAW_HEADING (title, article_id, article_expiration_date, precedence)
 type lexing_context = Law | Code | Directive | Directive_args
 (** Boolean reference, used by the lexer as the mutable state to distinguish whether it is lexing
    code or law. *)
-let is_code : bool ref = ref false
+let context : lexing_context ref = ref Law
 (** Mutable string reference that accumulates the string representation of the body of code being
    lexed. This string representation is used in the literate programming backends to faithfully
    capture the spacing pattern of the original program *)
-let code_string_acc : string ref = ref ""
+let code_buffer : Buffer.t = Buffer.create 4000
-(** Updates {!val:code_string_acc} with the current lexeme *)
+(** Updates {!val:code_buffer} with the current lexeme *)
-let update_acc (lexbuf : lexbuf) : unit = code_string_acc := !code_string_acc ^ Utf8.lexeme lexbuf
+let update_acc (lexbuf : lexbuf) : unit = Buffer.add_string code_buffer (Utf8.lexeme lexbuf)
 (** Error-generating helper *)
 let raise_lexer_error (loc : Pos.t) (token : string) =
--- a/compiler/surface/lexer_common.mli
+++ b/compiler/surface/lexer_common.mli
@ -14,17 +14,19 @@
 (** Auxiliary functions used by all lexers. *)
-val is_code : bool ref
+type lexing_context = Law | Code | Directive | Directive_args
 (** Boolean reference, used by the lexer as the mutable state to distinguish whether it is lexing
    code or law. *)
-val code_string_acc : string ref
+val context : lexing_context ref
-(** Mutable string reference that accumulates the string representation of the body of code being
+(** Reference, used by the lexer as the mutable state to distinguish whether it is lexing code or
-    lexed. This string representation is used in the literate programming backends to faithfully
+    law. *)
-    capture the spacing pattern of the original program *)
+
 val code_buffer : Buffer.t
 (** Buffer that accumulates the string representation of the body of code being lexed. This string
    representation is used in the literate programming backends to faithfully capture the spacing
    pattern of the original program *)
 val update_acc : Sedlexing.lexbuf -> unit
-(** Updates {!val:code_string_acc} with the current lexeme *)
+(** Updates {!val:code_buffer} with the current lexeme *)
 val raise_lexer_error : Utils.Pos.t -> string -> 'a
 (** Error-generating helper *)
--- a/compiler/surface/lexer_en.ml
+++ b/compiler/surface/lexer_en.ml
@ -103,6 +103,9 @@ let digit = [%sedlex.regexp? '0' .. '9']
 (** Regexp matching at least one space. *)
 let space_plus = [%sedlex.regexp? Plus white_space]
 (** Regexp matching white space but not newlines *)
 let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
 (** Main lexing function used in code blocks *)
 let rec lex_code (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
@ -118,8 +121,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
      lex_code lexbuf
  | "```" ->
      (* End of code section *)
-      L.is_code := false;
+      L.context := Law;
-      END_CODE !L.code_string_acc
+      END_CODE (Buffer.contents L.code_buffer)
  | "scope" ->
      L.update_acc lexbuf;
      SCOPE
@ -500,60 +503,72 @@ let rec lex_code (lexbuf : lexbuf) : token =
      INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
 let rec lex_directive_args (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let prev_pos = lexing_positions lexbuf in
  match%sedlex lexbuf with
  | '@', Star hspace, "p.", Star hspace, Plus digit ->
      let s = Utf8.lexeme lexbuf in
      let i = String.index s '.' in
      AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
  | Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
  | Plus hspace -> lex_directive_args lexbuf
  | '\n' | eof ->
      L.context := Law;
      END_DIRECTIVE
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
 let rec lex_directive (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let prev_pos = lexing_positions lexbuf in
  match%sedlex lexbuf with
  | Plus hspace -> lex_directive lexbuf
  | "Begin", Plus hspace, "metadata" -> BEGIN_METADATA
  | "End", Plus hspace, "metadata" -> END_METADATA
  | "Include" -> LAW_INCLUDE
  | ":" ->
      L.context := Directive_args;
      COLON
  | '\n' | eof ->
      L.context := Law;
      END_DIRECTIVE
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
 (** Main lexing function used outside code blocks *)
 let lex_law (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
-  let prev_pos = lexing_positions lexbuf in
+  let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
-  let compl_catala =
+  let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
-    [%sedlex.regexp?
+  if at_bol then
      ( Compl 'c'
      | 'c', Compl 'a'
      | "ca", Compl 't'
      | "cat", Compl 'a'
      | "cata", Compl 'l'
      | "catal", Compl 'a'
      | "catala", Compl (white_space | '\n') )]
  in
    match%sedlex lexbuf with
  | "```catala" ->
      L.is_code := true;
      L.code_string_acc := "";
      BEGIN_CODE
    | eof -> EOF
-  | '>', Star white_space, "Begin metadata" -> BEGIN_METADATA
+    | "```catala", Plus white_space ->
-  | '>', Star white_space, "End metadata" -> END_METADATA
+        L.context := Code;
-  | ( '>',
+        Buffer.clear L.code_buffer;
-      Star white_space,
+        BEGIN_CODE
-      "Include:",
+    | '>' ->
-      Star white_space,
+        L.context := Directive;
-      Plus (Compl ('@' | '\n')),
+        BEGIN_DIRECTIVE
-      Star white_space,
+    | Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
      Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
      '\n' ) ->
      let extract_components =
        R.regexp ">\\s*Include\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
      in
      let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
      let name = get_component 1 in
      let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
      let pos = lexing_positions lexbuf in
      if Filename.extension name = ".pdf" then
        LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
      else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
  | Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
        L.get_law_heading lexbuf
-  | Plus
+    | _ -> (
-      (* Match non-special characters, i.e. characters that doesn't appear at the start of a
+        (* Nested match for lower priority; `_` matches length 0 *)
-         previous regexp. *)
+        let lexbuf = lexbuf in
-      ( Compl ('#' | '`' | '>')
+        (* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
-      (* Following literals allow to match grave accents as long as they don't conflict with the
+        match%sedlex lexbuf with
-         [BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
+        | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
-         not be followed by 'catala'. *)
+        | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
-      | Rep ('`', 1 .. 2), Compl '`'
+  else
-      | "```", compl_catala ) ->
+    match%sedlex lexbuf with
-      LAW_TEXT (Utf8.lexeme lexbuf)
+    | eof -> EOF
    | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
    | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
 (** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
    Surface.Lexer_common.is_code}. *)
-let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
+let lexer (lexbuf : lexbuf) : token =
  match !L.context with
  | Law -> lex_law lexbuf
  | Code -> lex_code lexbuf
  | Directive -> lex_directive lexbuf
  | Directive_args -> lex_directive_args lexbuf
--- a/compiler/surface/lexer_fr.ml
+++ b/compiler/surface/lexer_fr.ml
@ -100,6 +100,9 @@ let digit = [%sedlex.regexp? '0' .. '9']
 (** Regexp matching at least one space. *)
 let space_plus = [%sedlex.regexp? Plus white_space]
 (** Regexp matching white space but not newlines *)
 let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
 (** Main lexing function used in code blocks *)
 let rec lex_code (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
@ -115,8 +118,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
      lex_code lexbuf
  | "```" ->
      (* End of code section *)
-      L.is_code := false;
+      L.context := Law;
-      END_CODE !L.code_string_acc
+      END_CODE (Buffer.contents L.code_buffer)
  | "champ", space_plus, "d\'application" ->
      L.update_acc lexbuf;
      SCOPE
@ -503,64 +506,72 @@ let rec lex_code (lexbuf : lexbuf) : token =
      INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
 let rec lex_directive_args (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let prev_pos = lexing_positions lexbuf in
  match%sedlex lexbuf with
  | '@', Star hspace, "p.", Star hspace, Plus digit ->
      let s = Utf8.lexeme lexbuf in
      let i = String.index s '.' in
      AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
  | Compl (white_space | '@'), Star (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
  | Plus hspace -> lex_directive_args lexbuf
  | '\n' | eof ->
      L.context := Law;
      END_DIRECTIVE
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
 let rec lex_directive (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let prev_pos = lexing_positions lexbuf in
  match%sedlex lexbuf with
  | Plus hspace -> lex_directive lexbuf
  | 'D', 0xE9, "but", Plus hspace, "m", 0xE9, "tadonn", 0xE9, "es" -> BEGIN_METADATA
  | "Fin", Plus hspace, "m", 0xE9, "tadonn", 0xE9, "es" -> END_METADATA
  | "Inclusion" -> LAW_INCLUDE
  | ':', Star hspace ->
      L.context := Directive_args;
      COLON
  | '\n' | eof ->
      L.context := Law;
      END_DIRECTIVE
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
 (** Main lexing function used outside code blocks *)
 let lex_law (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
-  let prev_pos = lexing_positions lexbuf in
+  let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
-  let compl_catala =
+  let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
-    [%sedlex.regexp?
+  if at_bol then
      ( Compl 'c'
      | 'c', Compl 'a'
      | "ca", Compl 't'
      | "cat", Compl 'a'
      | "cata", Compl 'l'
      | "catal", Compl 'a'
      | "catala", Compl (white_space | '\n') )]
  in
    match%sedlex lexbuf with
  | "```catala" ->
      L.is_code := true;
      L.code_string_acc := "";
      BEGIN_CODE
    | eof -> EOF
-  | '>', Star white_space, 'D', 0xE9, "but m", 0xE9, "tadonn", 0xE9, "es" -> BEGIN_METADATA
+    | "```catala", Plus white_space ->
-  | '>', Star white_space, "Fin m", 0xE9, "tadonn", 0xE9, "es" -> END_METADATA
+        L.context := Code;
-  | ( '>',
+        Buffer.clear L.code_buffer;
-      Star white_space,
+        BEGIN_CODE
-      "Inclusion:",
+    | '>', Star hspace ->
-      Star white_space,
+        L.context := Directive;
-      Plus (Compl ('@' | '\n')),
+        BEGIN_DIRECTIVE
-      Star white_space,
+    | Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
      Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
      '\n' ) ->
      let extract_components =
        R.regexp ">\\s*Inclusion\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
      in
      let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
      let jorftext = R.regexp "JORFTEXT\\d{12}" in
      let name = get_component 1 in
      let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
      let pos = lexing_positions lexbuf in
      if R.pmatch ~rex:jorftext name then
        LAW_INCLUDE (Ast.LegislativeText (name, Pos.from_lpos pos))
      else if Filename.extension name = ".pdf" then
        LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
      else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
  | Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
        L.get_law_heading lexbuf
-  | Plus
+    | _ -> (
-      (* Match non-special characters, i.e. characters that doesn't appear at the start of a
+        (* Nested match for lower priority; `_` matches length 0 *)
-         previous regexp. *)
+        let lexbuf = lexbuf in
-      ( Compl ('#' | '`' | '>')
+        (* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
-      (* Following literals allow to match grave accents as long as they don't conflict with the
+        match%sedlex lexbuf with
-         [BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
+        | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
-         not be followed by 'catala'. *)
+        | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
-      | Rep ('`', 1 .. 2), Compl '`'
+  else
-      | "```", compl_catala ) ->
+    match%sedlex lexbuf with
-      LAW_TEXT (Utf8.lexeme lexbuf)
+    | eof -> EOF
    | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
    | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
 (** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
    Surface.Lexer_common.is_code}. *)
-let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
+let lexer (lexbuf : lexbuf) : token =
  match !L.context with
  | Law -> lex_law lexbuf
  | Code -> lex_code lexbuf
  | Directive -> lex_directive lexbuf
  | Directive_args -> lex_directive_args lexbuf
--- a/compiler/surface/lexer_pl.ml
+++ b/compiler/surface/lexer_pl.ml
@ -105,6 +105,9 @@ let digit = [%sedlex.regexp? '0' .. '9']
 (** Regexp matching at least one space. *)
 let space_plus = [%sedlex.regexp? Plus white_space]
 (** Regexp matching white space but not newlines *)
 let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
 (** Main lexing function used in code blocks *)
 let rec lex_code (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
@ -120,8 +123,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
      lex_code lexbuf
  | "```" ->
      (* End of code section *)
-      L.is_code := false;
+      L.context := Law;
-      END_CODE !L.code_string_acc
+      END_CODE (Buffer.contents L.code_buffer)
  | "zakres" ->
      L.update_acc lexbuf;
      SCOPE
@ -507,61 +510,72 @@ let rec lex_code (lexbuf : lexbuf) : token =
      INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
-(** Main lexing function used outside code blocks *)
+let rec lex_directive_args (lexbuf : lexbuf) : token =
 let lex_law (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let prev_pos = lexing_positions lexbuf in
  let compl_catala =
    [%sedlex.regexp?
      ( Compl 'c'
      | 'c', Compl 'a'
      | "ca", Compl 't'
      | "cat", Compl 'a'
      | "cata", Compl 'l'
      | "catal", Compl 'a'
      | "catala", Compl (white_space | '\n') )]
  in
  match%sedlex lexbuf with
-  | "```catala" ->
+  | '@', Star hspace, "p.", Star hspace, Plus digit ->
-      L.is_code := true;
+      let s = Utf8.lexeme lexbuf in
-      L.code_string_acc := "";
+      let i = String.index s '.' in
      AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
  | Compl (white_space | '@'), Star (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
  | Plus hspace -> lex_directive_args lexbuf
  | '\n' | eof ->
      L.context := Law;
      END_DIRECTIVE
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
-      BEGIN_CODE
+let rec lex_directive (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let prev_pos = lexing_positions lexbuf in
  match%sedlex lexbuf with
  | Plus hspace -> lex_directive lexbuf
  | "Poczatek", Plus hspace, "metadanych" -> BEGIN_METADATA
  | "Koniec", Plus hspace, "metadanych" -> END_METADATA
  | "Include", Star hspace -> LAW_INCLUDE
  | ":" ->
      L.context := Directive_args;
      COLON
  | '\n' | eof ->
      L.context := Law;
      END_DIRECTIVE
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
 (** Main lexing function used outside code blocks *)
 and lex_law (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
  let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
  if at_bol then
    match%sedlex lexbuf with
    | eof -> EOF
-  | '>', Star white_space, "Poczatek metadanych" -> BEGIN_METADATA
+    | "```catala", Plus white_space ->
-  | '>', Star white_space, "Koniec metadanych" -> END_METADATA
+        L.context := Code;
-  | ( '>',
+        Buffer.clear L.code_buffer;
-      Star white_space,
+        BEGIN_CODE
-      "Include:",
+    | '>', Star hspace ->
-      Star white_space,
+        L.context := Directive;
-      Plus (Compl ('@' | '\n')),
+        BEGIN_DIRECTIVE
-      Star white_space,
+    | Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
      Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
      '\n' ) ->
      let extract_components =
        R.regexp ">\\s*Include\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
      in
      let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
      let name = get_component 1 in
      let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
      let pos = lexing_positions lexbuf in
      if Filename.extension name = ".pdf" then
        LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
      else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
  | Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
        L.get_law_heading lexbuf
-  | Plus
+    | _ -> (
-      (* Match non-special characters, i.e. characters that doesn't appear at the start of a
+        (* Nested match for lower priority; `_` matches length 0 *)
-         previous regexp. *)
+        let lexbuf = lexbuf in
-      ( Compl ('#' | '`' | '>')
+        (* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
-      (* Following literals allow to match grave accents as long as they don't conflict with the
+        match%sedlex lexbuf with
-         [BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
+        | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
-         not be followed by 'catala'. *)
+        | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
-      | Rep ('`', 1 .. 2), Compl '`'
+  else
-      | "```", compl_catala ) ->
+    match%sedlex lexbuf with
-      LAW_TEXT (Utf8.lexeme lexbuf)
+    | eof -> EOF
    | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
    | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
 (** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
    Surface.Lexer_common.is_code}. *)
-let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
+let lexer (lexbuf : lexbuf) : token =
  match !L.context with
  | Law -> lex_law lexbuf
  | Code -> lex_code lexbuf
  | Directive -> lex_directive lexbuf
  | Directive_args -> lex_directive_args lexbuf
--- a/compiler/surface/parser.messages
+++ b/compiler/surface/parser.messages
--- a/compiler/surface/parser.mly
+++ b/compiler/surface/parser.mly
@ -550,7 +550,7 @@ code:
 | code = list(code_item) { (code, Pos.from_lpos $sloc) }
 metadata_block:
-| BEGIN_METADATA option(law_text) BEGIN_CODE code_and_pos = code text = END_CODE option(law_text)  END_METADATA {
+| BEGIN_DIRECTIVE BEGIN_METADATA END_DIRECTIVE option(law_text) BEGIN_CODE code_and_pos = code text = END_CODE option(law_text) BEGIN_DIRECTIVE END_METADATA END_DIRECTIVE {
  let (code, pos) = code_and_pos in
   (code, (text, pos))
 }
@ -567,7 +567,7 @@ law_heading:
 }
 law_text:
-| text = LAW_TEXT { String.trim text }
+| lines = nonempty_list(LAW_TEXT) { String.trim (String.concat "" lines) }
 source_file_item:
 | text = law_text { LawText text }
@ -582,8 +582,16 @@ source_file_item:
  let (code, source_repr) = code in
  CodeBlock (code, source_repr, true)
 }
-| includ = LAW_INCLUDE {
+| BEGIN_DIRECTIVE LAW_INCLUDE COLON args = nonempty_list(DIRECTIVE_ARG) page = option(AT_PAGE) END_DIRECTIVE {
-  LawInclude includ
+  let filename = String.trim (String.concat "" args) in
  let pos = Pos.from_lpos $sloc in
  let jorftext = Re.Pcre.regexp "JORFTEXT\\d{12}" in
  if Re.Pcre.pmatch ~rex:jorftext filename && page = None then
    LawInclude (Ast.LegislativeText (filename, pos))
  else if Filename.extension filename = ".pdf" || page <> None then
    LawInclude (Ast.PdfFile ((filename, pos), page))
  else
    LawInclude (Ast.CatalaFile (filename, pos))
 }
--- a/compiler/surface/tokens.mly
+++ b/compiler/surface/tokens.mly
@ -23,7 +23,11 @@
 %token EOF
 %token<string * string option * string option * int> LAW_HEADING
-%token<Ast.law_include> LAW_INCLUDE
+
 %token BEGIN_DIRECTIVE END_DIRECTIVE LAW_INCLUDE
 %token<int> AT_PAGE
 %token<string> DIRECTIVE_ARG
 %token<string> LAW_TEXT
 %token<string> CONSTRUCTOR IDENT
 %token<string> END_CODE