Cleanup the lexer, and refactor for more generic directives

Louis Gesbert 2021-08-17 15:49:48 +02:00
parent 0659816782
commit b31bee71ad
8 changed files with 486 additions and 484 deletions

View File

@@ -35,17 +35,19 @@ let get_law_heading (lexbuf : lexbuf) : token =
   let precedence = calc_precedence (String.trim (get_substring 1)) in
   LAW_HEADING (title, article_id, article_expiration_date, precedence)
 
+type lexing_context = Law | Code | Directive | Directive_args
+
 (** Boolean reference, used by the lexer as the mutable state to distinguish whether it is lexing
     code or law. *)
-let is_code : bool ref = ref false
+let context : lexing_context ref = ref Law
 
 (** Mutable string reference that accumulates the string representation of the body of code being
     lexed. This string representation is used in the literate programming backends to faithfully
     capture the spacing pattern of the original program *)
-let code_string_acc : string ref = ref ""
+let code_buffer : Buffer.t = Buffer.create 4000
 
-(** Updates {!val:code_string_acc} with the current lexeme *)
-let update_acc (lexbuf : lexbuf) : unit = code_string_acc := !code_string_acc ^ Utf8.lexeme lexbuf
+(** Updates {!val:code_buffer} with the current lexeme *)
+let update_acc (lexbuf : lexbuf) : unit = Buffer.add_string code_buffer (Utf8.lexeme lexbuf)
 
 (** Error-generating helper *)
 let raise_lexer_error (loc : Pos.t) (token : string) =

View File

@@ -14,17 +14,19 @@
 (** Auxiliary functions used by all lexers. *)
 
-val is_code : bool ref
-(** Boolean reference, used by the lexer as the mutable state to distinguish whether it is lexing
-    code or law. *)
-val code_string_acc : string ref
-(** Mutable string reference that accumulates the string representation of the body of code being
-    lexed. This string representation is used in the literate programming backends to faithfully
-    capture the spacing pattern of the original program *)
+type lexing_context = Law | Code | Directive | Directive_args
+
+val context : lexing_context ref
+(** Reference, used by the lexer as the mutable state to distinguish whether it is lexing code or
+    law. *)
+
+val code_buffer : Buffer.t
+(** Buffer that accumulates the string representation of the body of code being lexed. This string
+    representation is used in the literate programming backends to faithfully capture the spacing
+    pattern of the original program *)
 
 val update_acc : Sedlexing.lexbuf -> unit
-(** Updates {!val:code_string_acc} with the current lexeme *)
+(** Updates {!val:code_buffer} with the current lexeme *)
 
 val raise_lexer_error : Utils.Pos.t -> string -> 'a
 (** Error-generating helper *)
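The interface above replaces the accumulating string reference with a Buffer.t. As a side note, here is a small self-contained sketch, not part of the patch, contrasting the two accumulation strategies; pieces stands in for the successive lexemes that update_acc receives.

(* Sketch only: repeated string concatenation copies the whole accumulated
   string on every append, so the total work is quadratic in the size of the
   code block, while Buffer.add_string is amortized O(1) per append. *)
let accumulate_with_string_ref (pieces : string list) : string =
  let acc = ref "" in
  List.iter (fun piece -> acc := !acc ^ piece) pieces;
  !acc

let accumulate_with_buffer (pieces : string list) : string =
  let buf = Buffer.create 4000 in
  List.iter (Buffer.add_string buf) pieces;
  Buffer.contents buf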

View File

@@ -103,6 +103,9 @@ let digit = [%sedlex.regexp? '0' .. '9']
 (** Regexp matching at least one space. *)
 let space_plus = [%sedlex.regexp? Plus white_space]
 
+(** Regexp matching white space but not newlines *)
+let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
+
 (** Main lexing function used in code blocks *)
 let rec lex_code (lexbuf : lexbuf) : token =
   let prev_lexeme = Utf8.lexeme lexbuf in
@@ -118,8 +121,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
     lex_code lexbuf
   | "```" ->
     (* End of code section *)
-    L.is_code := false;
-    END_CODE !L.code_string_acc
+    L.context := Law;
+    END_CODE (Buffer.contents L.code_buffer)
   | "scope" ->
     L.update_acc lexbuf;
     SCOPE
@@ -500,60 +503,72 @@ let rec lex_code (lexbuf : lexbuf) : token =
     INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
   | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
 
+let rec lex_directive_args (lexbuf : lexbuf) : token =
+  let prev_lexeme = Utf8.lexeme lexbuf in
+  let prev_pos = lexing_positions lexbuf in
+  match%sedlex lexbuf with
+  | '@', Star hspace, "p.", Star hspace, Plus digit ->
+    let s = Utf8.lexeme lexbuf in
+    let i = String.index s '.' in
+    AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
+  | Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
+  | Plus hspace -> lex_directive_args lexbuf
+  | '\n' | eof ->
+    L.context := Law;
+    END_DIRECTIVE
+  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
+
+let rec lex_directive (lexbuf : lexbuf) : token =
+  let prev_lexeme = Utf8.lexeme lexbuf in
+  let prev_pos = lexing_positions lexbuf in
+  match%sedlex lexbuf with
+  | Plus hspace -> lex_directive lexbuf
+  | "Begin", Plus hspace, "metadata" -> BEGIN_METADATA
+  | "End", Plus hspace, "metadata" -> END_METADATA
+  | "Include" -> LAW_INCLUDE
+  | ":" ->
+    L.context := Directive_args;
+    COLON
+  | '\n' | eof ->
+    L.context := Law;
+    END_DIRECTIVE
+  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
+
 (** Main lexing function used outside code blocks *)
 let lex_law (lexbuf : lexbuf) : token =
   let prev_lexeme = Utf8.lexeme lexbuf in
-  let prev_pos = lexing_positions lexbuf in
-  let compl_catala =
-    [%sedlex.regexp?
-      ( Compl 'c'
-      | 'c', Compl 'a'
-      | "ca", Compl 't'
-      | "cat", Compl 'a'
-      | "cata", Compl 'l'
-      | "catal", Compl 'a'
-      | "catala", Compl (white_space | '\n') )]
-  in
+  let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
+  let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
+  if at_bol then
   match%sedlex lexbuf with
-  | "```catala" ->
-    L.is_code := true;
-    L.code_string_acc := "";
-    BEGIN_CODE
   | eof -> EOF
-  | '>', Star white_space, "Begin metadata" -> BEGIN_METADATA
-  | '>', Star white_space, "End metadata" -> END_METADATA
-  | ( '>',
-      Star white_space,
-      "Include:",
-      Star white_space,
-      Plus (Compl ('@' | '\n')),
-      Star white_space,
-      Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
-      '\n' ) ->
-    let extract_components =
-      R.regexp ">\\s*Include\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
-    in
-    let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
-    let name = get_component 1 in
-    let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
-    let pos = lexing_positions lexbuf in
-    if Filename.extension name = ".pdf" then
-      LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
-    else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
-  | Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
+  | "```catala", Plus white_space ->
+    L.context := Code;
+    Buffer.clear L.code_buffer;
+    BEGIN_CODE
+  | '>' ->
+    L.context := Directive;
+    BEGIN_DIRECTIVE
+  | Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
     L.get_law_heading lexbuf
-  | Plus
-      (* Match non-special characters, i.e. characters that doesn't appear at the start of a
-         previous regexp. *)
-      ( Compl ('#' | '`' | '>')
-      (* Following literals allow to match grave accents as long as they don't conflict with the
-         [BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
-         not be followed by 'catala'. *)
-      | Rep ('`', 1 .. 2), Compl '`'
-      | "```", compl_catala ) ->
-    LAW_TEXT (Utf8.lexeme lexbuf)
+  | _ -> (
+    (* Nested match for lower priority; `_` matches length 0 *)
+    let lexbuf = lexbuf in
+    (* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
+    match%sedlex lexbuf with
+    | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
+    | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
+  else
+    match%sedlex lexbuf with
+    | eof -> EOF
+    | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
   | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
 
 (** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
     Surface.Lexer_common.is_code}. *)
-let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
+let lexer (lexbuf : lexbuf) : token =
+  match !L.context with
+  | Law -> lex_law lexbuf
+  | Code -> lex_code lexbuf
+  | Directive -> lex_directive lexbuf
+  | Directive_args -> lex_directive_args lexbuf
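To make the new control flow concrete, here is a self-contained sketch of the token stream the refactored English lexer should emit for an include directive. The file name and page number are made up for the example, and the constructors are local stand-ins redeclared so the snippet compiles on its own.

type directive_token =
  | BEGIN_DIRECTIVE
  | END_DIRECTIVE
  | LAW_INCLUDE
  | COLON
  | DIRECTIVE_ARG of string
  | AT_PAGE of int

(* Expected tokens for the line: "> Include: foo.catala_en @p. 7" *)
let include_example : directive_token list =
  [ BEGIN_DIRECTIVE;                (* '>' switches the context to Directive *)
    LAW_INCLUDE;                    (* "Include" *)
    COLON;                          (* ':' switches the context to Directive_args *)
    DIRECTIVE_ARG "foo.catala_en";  (* one whitespace-free argument *)
    AT_PAGE 7;                      (* "@p. 7" *)
    END_DIRECTIVE ]                 (* '\n' switches the context back to Law *)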

View File

@@ -100,6 +100,9 @@ let digit = [%sedlex.regexp? '0' .. '9']
 (** Regexp matching at least one space. *)
 let space_plus = [%sedlex.regexp? Plus white_space]
 
+(** Regexp matching white space but not newlines *)
+let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
+
 (** Main lexing function used in code blocks *)
 let rec lex_code (lexbuf : lexbuf) : token =
   let prev_lexeme = Utf8.lexeme lexbuf in
@@ -115,8 +118,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
     lex_code lexbuf
   | "```" ->
     (* End of code section *)
-    L.is_code := false;
-    END_CODE !L.code_string_acc
+    L.context := Law;
+    END_CODE (Buffer.contents L.code_buffer)
   | "champ", space_plus, "d\'application" ->
     L.update_acc lexbuf;
     SCOPE
@@ -503,64 +506,72 @@ let rec lex_code (lexbuf : lexbuf) : token =
     INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
   | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
 
+let rec lex_directive_args (lexbuf : lexbuf) : token =
+  let prev_lexeme = Utf8.lexeme lexbuf in
+  let prev_pos = lexing_positions lexbuf in
+  match%sedlex lexbuf with
+  | '@', Star hspace, "p.", Star hspace, Plus digit ->
+    let s = Utf8.lexeme lexbuf in
+    let i = String.index s '.' in
+    AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
+  | Compl (white_space | '@'), Star (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
+  | Plus hspace -> lex_directive_args lexbuf
+  | '\n' | eof ->
+    L.context := Law;
+    END_DIRECTIVE
+  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
+
+let rec lex_directive (lexbuf : lexbuf) : token =
+  let prev_lexeme = Utf8.lexeme lexbuf in
+  let prev_pos = lexing_positions lexbuf in
+  match%sedlex lexbuf with
+  | Plus hspace -> lex_directive lexbuf
+  | 'D', 0xE9, "but", Plus hspace, "m", 0xE9, "tadonn", 0xE9, "es" -> BEGIN_METADATA
+  | "Fin", Plus hspace, "m", 0xE9, "tadonn", 0xE9, "es" -> END_METADATA
+  | "Inclusion" -> LAW_INCLUDE
+  | ':', Star hspace ->
+    L.context := Directive_args;
+    COLON
+  | '\n' | eof ->
+    L.context := Law;
+    END_DIRECTIVE
+  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
+
 (** Main lexing function used outside code blocks *)
 let lex_law (lexbuf : lexbuf) : token =
   let prev_lexeme = Utf8.lexeme lexbuf in
-  let prev_pos = lexing_positions lexbuf in
-  let compl_catala =
-    [%sedlex.regexp?
-      ( Compl 'c'
-      | 'c', Compl 'a'
-      | "ca", Compl 't'
-      | "cat", Compl 'a'
-      | "cata", Compl 'l'
-      | "catal", Compl 'a'
-      | "catala", Compl (white_space | '\n') )]
-  in
+  let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
+  let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
+  if at_bol then
   match%sedlex lexbuf with
-  | "```catala" ->
-    L.is_code := true;
-    L.code_string_acc := "";
-    BEGIN_CODE
   | eof -> EOF
-  | '>', Star white_space, 'D', 0xE9, "but m", 0xE9, "tadonn", 0xE9, "es" -> BEGIN_METADATA
-  | '>', Star white_space, "Fin m", 0xE9, "tadonn", 0xE9, "es" -> END_METADATA
-  | ( '>',
-      Star white_space,
-      "Inclusion:",
-      Star white_space,
-      Plus (Compl ('@' | '\n')),
-      Star white_space,
-      Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
-      '\n' ) ->
-    let extract_components =
-      R.regexp ">\\s*Inclusion\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
-    in
-    let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
-    let jorftext = R.regexp "JORFTEXT\\d{12}" in
-    let name = get_component 1 in
-    let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
-    let pos = lexing_positions lexbuf in
-    if R.pmatch ~rex:jorftext name then
-      LAW_INCLUDE (Ast.LegislativeText (name, Pos.from_lpos pos))
-    else if Filename.extension name = ".pdf" then
-      LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
-    else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
-  | Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
+  | "```catala", Plus white_space ->
+    L.context := Code;
+    Buffer.clear L.code_buffer;
+    BEGIN_CODE
+  | '>', Star hspace ->
+    L.context := Directive;
+    BEGIN_DIRECTIVE
+  | Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
     L.get_law_heading lexbuf
-  | Plus
-      (* Match non-special characters, i.e. characters that doesn't appear at the start of a
-         previous regexp. *)
-      ( Compl ('#' | '`' | '>')
-      (* Following literals allow to match grave accents as long as they don't conflict with the
-         [BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
-         not be followed by 'catala'. *)
-      | Rep ('`', 1 .. 2), Compl '`'
-      | "```", compl_catala ) ->
-    LAW_TEXT (Utf8.lexeme lexbuf)
+  | _ -> (
+    (* Nested match for lower priority; `_` matches length 0 *)
+    let lexbuf = lexbuf in
+    (* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
+    match%sedlex lexbuf with
+    | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
+    | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
+  else
+    match%sedlex lexbuf with
+    | eof -> EOF
+    | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
 
 (** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
     Surface.Lexer_common.is_code}. *)
-let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
+let lexer (lexbuf : lexbuf) : token =
+  match !L.context with
+  | Law -> lex_law lexbuf
+  | Code -> lex_code lexbuf
+  | Directive -> lex_directive lexbuf
+  | Directive_args -> lex_directive_args lexbuf
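A note on the at_bol test that the rewritten lex_law functions rely on: in a Lexing.position, pos_cnum is the absolute character offset and pos_bol is the offset of the first character of the current line, so the two are equal exactly when the lexer sits at the beginning of a line. A minimal sketch, with made-up position values:

let at_beginning_of_line (p : Lexing.position) : bool =
  p.Lexing.pos_bol = p.Lexing.pos_cnum

let () =
  (* 41 characters consumed so far, and the current line also starts at offset 41 *)
  let p = { Lexing.pos_fname = "example.catala_en"; pos_lnum = 3; pos_bol = 41; pos_cnum = 41 } in
  assert (at_beginning_of_line p)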

View File

@@ -105,6 +105,9 @@ let digit = [%sedlex.regexp? '0' .. '9']
 (** Regexp matching at least one space. *)
 let space_plus = [%sedlex.regexp? Plus white_space]
 
+(** Regexp matching white space but not newlines *)
+let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
+
 (** Main lexing function used in code blocks *)
 let rec lex_code (lexbuf : lexbuf) : token =
   let prev_lexeme = Utf8.lexeme lexbuf in
@@ -120,8 +123,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
     lex_code lexbuf
   | "```" ->
     (* End of code section *)
-    L.is_code := false;
-    END_CODE !L.code_string_acc
+    L.context := Law;
+    END_CODE (Buffer.contents L.code_buffer)
   | "zakres" ->
     L.update_acc lexbuf;
     SCOPE
@@ -507,61 +510,72 @@ let rec lex_code (lexbuf : lexbuf) : token =
     INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
   | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
 
-(** Main lexing function used outside code blocks *)
-let lex_law (lexbuf : lexbuf) : token =
+let rec lex_directive_args (lexbuf : lexbuf) : token =
   let prev_lexeme = Utf8.lexeme lexbuf in
   let prev_pos = lexing_positions lexbuf in
-  let compl_catala =
-    [%sedlex.regexp?
-      ( Compl 'c'
-      | 'c', Compl 'a'
-      | "ca", Compl 't'
-      | "cat", Compl 'a'
-      | "cata", Compl 'l'
-      | "catal", Compl 'a'
-      | "catala", Compl (white_space | '\n') )]
-  in
   match%sedlex lexbuf with
-  | "```catala" ->
-    L.is_code := true;
-    L.code_string_acc := "";
-    BEGIN_CODE
+  | '@', Star hspace, "p.", Star hspace, Plus digit ->
+    let s = Utf8.lexeme lexbuf in
+    let i = String.index s '.' in
+    AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
+  | Compl (white_space | '@'), Star (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
+  | Plus hspace -> lex_directive_args lexbuf
+  | '\n' | eof ->
+    L.context := Law;
+    END_DIRECTIVE
+  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
+
+let rec lex_directive (lexbuf : lexbuf) : token =
+  let prev_lexeme = Utf8.lexeme lexbuf in
+  let prev_pos = lexing_positions lexbuf in
+  match%sedlex lexbuf with
+  | Plus hspace -> lex_directive lexbuf
+  | "Poczatek", Plus hspace, "metadanych" -> BEGIN_METADATA
+  | "Koniec", Plus hspace, "metadanych" -> END_METADATA
+  | "Include", Star hspace -> LAW_INCLUDE
+  | ":" ->
+    L.context := Directive_args;
+    COLON
+  | '\n' | eof ->
+    L.context := Law;
+    END_DIRECTIVE
+  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
+
+(** Main lexing function used outside code blocks *)
+and lex_law (lexbuf : lexbuf) : token =
+  let prev_lexeme = Utf8.lexeme lexbuf in
+  let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
+  let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
+  if at_bol then
+  match%sedlex lexbuf with
   | eof -> EOF
-  | '>', Star white_space, "Poczatek metadanych" -> BEGIN_METADATA
-  | '>', Star white_space, "Koniec metadanych" -> END_METADATA
-  | ( '>',
-      Star white_space,
-      "Include:",
-      Star white_space,
-      Plus (Compl ('@' | '\n')),
-      Star white_space,
-      Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
-      '\n' ) ->
-    let extract_components =
-      R.regexp ">\\s*Include\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
-    in
-    let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
-    let name = get_component 1 in
-    let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
-    let pos = lexing_positions lexbuf in
-    if Filename.extension name = ".pdf" then
-      LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
-    else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
-  | Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
+  | "```catala", Plus white_space ->
+    L.context := Code;
+    Buffer.clear L.code_buffer;
+    BEGIN_CODE
+  | '>', Star hspace ->
+    L.context := Directive;
+    BEGIN_DIRECTIVE
+  | Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
     L.get_law_heading lexbuf
-  | Plus
-      (* Match non-special characters, i.e. characters that doesn't appear at the start of a
-         previous regexp. *)
-      ( Compl ('#' | '`' | '>')
-      (* Following literals allow to match grave accents as long as they don't conflict with the
-         [BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
-         not be followed by 'catala'. *)
-      | Rep ('`', 1 .. 2), Compl '`'
-      | "```", compl_catala ) ->
-    LAW_TEXT (Utf8.lexeme lexbuf)
+  | _ -> (
+    (* Nested match for lower priority; `_` matches length 0 *)
+    let lexbuf = lexbuf in
+    (* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
+    match%sedlex lexbuf with
+    | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
+    | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
+  else
+    match%sedlex lexbuf with
+    | eof -> EOF
+    | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
   | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
 
 (** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
     Surface.Lexer_common.is_code}. *)
-let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
+let lexer (lexbuf : lexbuf) : token =
+  match !L.context with
+  | Law -> lex_law lexbuf
+  | Code -> lex_code lexbuf
  | Directive -> lex_directive lexbuf
+  | Directive_args -> lex_directive_args lexbuf

File diff suppressed because it is too large

View File

@@ -550,7 +550,7 @@ code:
 | code = list(code_item) { (code, Pos.from_lpos $sloc) }
 
 metadata_block:
-| BEGIN_METADATA option(law_text) BEGIN_CODE code_and_pos = code text = END_CODE option(law_text) END_METADATA {
+| BEGIN_DIRECTIVE BEGIN_METADATA END_DIRECTIVE option(law_text) BEGIN_CODE code_and_pos = code text = END_CODE option(law_text) BEGIN_DIRECTIVE END_METADATA END_DIRECTIVE {
   let (code, pos) = code_and_pos in
   (code, (text, pos))
 }
@@ -567,7 +567,7 @@ law_heading:
 }
 
 law_text:
-| text = LAW_TEXT { String.trim text }
+| lines = nonempty_list(LAW_TEXT) { String.trim (String.concat "" lines) }
 
 source_file_item:
 | text = law_text { LawText text }
@@ -582,8 +582,16 @@ source_file_item:
   let (code, source_repr) = code in
   CodeBlock (code, source_repr, true)
 }
-| includ = LAW_INCLUDE {
-  LawInclude includ
+| BEGIN_DIRECTIVE LAW_INCLUDE COLON args = nonempty_list(DIRECTIVE_ARG) page = option(AT_PAGE) END_DIRECTIVE {
+  let filename = String.trim (String.concat "" args) in
+  let pos = Pos.from_lpos $sloc in
+  let jorftext = Re.Pcre.regexp "JORFTEXT\\d{12}" in
+  if Re.Pcre.pmatch ~rex:jorftext filename && page = None then
+    LawInclude (Ast.LegislativeText (filename, pos))
+  else if Filename.extension filename = ".pdf" || page <> None then
+    LawInclude (Ast.PdfFile ((filename, pos), page))
+  else
+    LawInclude (Ast.CatalaFile (filename, pos))
 }
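The grammar action above now decides which kind of include a directive denotes. For readers skimming the Menhir code, here is the same decision written as a plain function; this is an illustration only, with local stand-ins for the Ast.law_include constructors, and it relies on the re library exactly like the action above.

type include_kind =
  | LegislativeText of string
  | PdfFile of string * int option
  | CatalaFile of string

let classify_include (args : string list) (page : int option) : include_kind =
  (* Directive arguments are concatenated, then classified by shape. *)
  let filename = String.trim (String.concat "" args) in
  let jorftext = Re.Pcre.regexp "JORFTEXT\\d{12}" in
  if Re.Pcre.pmatch ~rex:jorftext filename && page = None then
    LegislativeText filename
  else if Filename.extension filename = ".pdf" || page <> None then
    PdfFile (filename, page)
  else CatalaFile filename

For instance, classify_include ["JORFTEXT000012345678"] None yields a LegislativeText, while classify_include ["foo.catala_en"] None falls through to CatalaFile (both names are made up).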

View File

@@ -23,7 +23,11 @@
 %token EOF
 %token<string * string option * string option * int> LAW_HEADING
-%token<Ast.law_include> LAW_INCLUDE
+%token BEGIN_DIRECTIVE END_DIRECTIVE LAW_INCLUDE
+%token<int> AT_PAGE
+%token<string> DIRECTIVE_ARG
 %token<string> LAW_TEXT
 %token<string> CONSTRUCTOR IDENT
 %token<string> END_CODE