Merge pull request #138 from AltGr/lexer-refactor

Clean up the lexer and refactor for more generic directives
Denis Merigoux 2021-08-19 12:12:53 +02:00 committed by GitHub
commit 7cbb4a9149
11 changed files with 512 additions and 506 deletions

View File

@ -55,4 +55,5 @@ jobs:
- name: Make all
run: |
eval $(opam env)
export OCAMLRUNPARAM=b
make all
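
For context (not part of this diff): the "b" flag in OCAMLRUNPARAM turns on exception backtrace recording in the OCaml runtime, so if make all dies with an uncaught exception the CI log shows where it was raised. A minimal illustration of the same switch done programmatically:

(* Equivalent, in code, to running with OCAMLRUNPARAM=b: once backtraces are
   recorded, any uncaught exception is reported with its raise site. *)
let () =
  Printexc.record_backtrace true;
  failwith "demo: this failure now prints a backtrace"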

View File

@ -21,7 +21,7 @@ dependencies-ocaml:
ocamlformat ANSITerminal sedlex menhir menhirLib dune cmdliner obelisk \
re obelisk unionfind bindlib zarith.1.11 zarith_stubs_js.v0.14.0 ocamlgraph \
js_of_ocaml-compiler js_of_ocaml js_of_ocaml-ppx calendar camomile \
visitors benchmark
visitors benchmark odoc
dependencies-js:
$(MAKE) -C $(FRENCH_LAW_JS_LIB_DIR) dependencies

View File

@ -21,7 +21,7 @@
(menhir
(modules tokens parser)
(merge_into parser)
(flags --external-tokens Tokens --table))
(flags --external-tokens Tokens --table --explain))
(documentation
(package catala)

View File

@ -35,17 +35,19 @@ let get_law_heading (lexbuf : lexbuf) : token =
let precedence = calc_precedence (String.trim (get_substring 1)) in
LAW_HEADING (title, article_id, article_expiration_date, precedence)
type lexing_context = Law | Code | Directive | Directive_args
(** Boolean reference, used by the lexer as the mutable state to distinguish whether it is lexing
code or law. *)
let is_code : bool ref = ref false
let context : lexing_context ref = ref Law
(** Mutable string reference that accumulates the string representation of the body of code being
lexed. This string representation is used in the literate programming backends to faithfully
capture the spacing pattern of the original program *)
let code_string_acc : string ref = ref ""
let code_buffer : Buffer.t = Buffer.create 4000
(** Updates {!val:code_string_acc} with the current lexeme *)
let update_acc (lexbuf : lexbuf) : unit = code_string_acc := !code_string_acc ^ Utf8.lexeme lexbuf
(** Updates {!val:code_buffer} with the current lexeme *)
let update_acc (lexbuf : lexbuf) : unit = Buffer.add_string code_buffer (Utf8.lexeme lexbuf)
(** Error-generating helper *)
let raise_lexer_error (loc : Pos.t) (token : string) =
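
Not part of the diff, just the rationale made concrete: replacing the string ref that was re-concatenated on every lexeme with a Buffer.t avoids quadratic copying, since Buffer.add_string appends in amortized constant time while acc := !acc ^ lexeme copies the whole accumulated code each time. A minimal sketch contrasting the two patterns:

(* Old pattern: every append copies the whole accumulator, O(n^2) overall. *)
let accumulate_with_string (lexemes : string list) : string =
  let acc = ref "" in
  List.iter (fun l -> acc := !acc ^ l) lexemes;
  !acc

(* New pattern: the buffer grows in place, amortized O(1) per append. *)
let accumulate_with_buffer (lexemes : string list) : string =
  let buf = Buffer.create 4000 in
  List.iter (Buffer.add_string buf) lexemes;
  Buffer.contents buf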

View File

@ -14,17 +14,19 @@
(** Auxiliary functions used by all lexers. *)
val is_code : bool ref
(** Boolean reference, used by the lexer as the mutable state to distinguish whether it is lexing
code or law. *)
type lexing_context = Law | Code | Directive | Directive_args
val code_string_acc : string ref
(** Mutable string reference that accumulates the string representation of the body of code being
lexed. This string representation is used in the literate programming backends to faithfully
capture the spacing pattern of the original program *)
val context : lexing_context ref
(** Reference, used by the lexer as the mutable state to distinguish whether it is lexing code or
law. *)
val code_buffer : Buffer.t
(** Buffer that accumulates the string representation of the body of code being lexed. This string
representation is used in the literate programming backends to faithfully capture the spacing
pattern of the original program *)
val update_acc : Sedlexing.lexbuf -> unit
(** Updates {!val:code_string_acc} with the current lexeme *)
(** Updates {!val:code_buffer} with the current lexeme *)
val raise_lexer_error : Utils.Pos.t -> string -> 'a
(** Error-generating helper *)
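
A hedged sketch (not from the diff) of how a driver can use this interface: reset the shared state, then pull tokens until EOF. The module paths Lexer_common and Tokens, and the idea of lexing from a string, are assumptions for illustration only.

open Tokens (* assumed: the parser's token type, with an EOF constructor *)

let tokenize_string (lexer : Sedlexing.lexbuf -> token) (src : string) : token list =
  Lexer_common.context := Lexer_common.Law;   (* every file starts in law mode *)
  Buffer.clear Lexer_common.code_buffer;      (* drop any stale code text *)
  let lexbuf = Sedlexing.Utf8.from_string src in
  let rec loop acc =
    match lexer lexbuf with
    | EOF -> List.rev (EOF :: acc)
    | tok -> loop (tok :: acc)
  in
  loop []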

View File

@ -103,6 +103,9 @@ let digit = [%sedlex.regexp? '0' .. '9']
(** Regexp matching at least one space. *)
let space_plus = [%sedlex.regexp? Plus white_space]
(** Regexp matching white space but not newlines *)
let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
(** Main lexing function used in code blocks *)
let rec lex_code (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
@ -118,8 +121,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
lex_code lexbuf
| "```" ->
(* End of code section *)
L.is_code := false;
END_CODE !L.code_string_acc
L.context := Law;
END_CODE (Buffer.contents L.code_buffer)
| "scope" ->
L.update_acc lexbuf;
SCOPE
@ -307,7 +310,7 @@ let rec lex_code (lexbuf : lexbuf) : token =
| "day" ->
L.update_acc lexbuf;
DAY
| 0x24, Star white_space, digit, Star (digit | ','), Opt ('.', Rep (digit, 0 .. 2)) ->
| 0x24, Star hspace, digit, Star (digit | ','), Opt ('.', Rep (digit, 0 .. 2)) ->
let extract_parts = R.regexp "([0-9]([0-9,]*[0-9]|))(.([0-9]{0,2})|)" in
let full_str = Utf8.lexeme lexbuf in
let only_numbers_str = String.trim (String.sub full_str 1 (String.length full_str - 1)) in
@ -500,60 +503,73 @@ let rec lex_code (lexbuf : lexbuf) : token =
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let rec lex_directive_args (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| '@', Star hspace, "p.", Star hspace, Plus digit ->
let s = Utf8.lexeme lexbuf in
let i = String.index s '.' in
AT_PAGE (int_of_string (String.trim (String.sub s (i + 1) (String.length s - i - 1))))
| Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
| Plus hspace -> lex_directive_args lexbuf
| '\n' | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let rec lex_directive (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| Plus hspace -> lex_directive lexbuf
| "Begin", Plus hspace, "metadata" -> BEGIN_METADATA
| "End", Plus hspace, "metadata" -> END_METADATA
| "Include" -> LAW_INCLUDE
| ":" ->
L.context := Directive_args;
COLON
| '\n' | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Main lexing function used outside code blocks *)
let lex_law (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
let compl_catala =
[%sedlex.regexp?
( Compl 'c'
| 'c', Compl 'a'
| "ca", Compl 't'
| "cat", Compl 'a'
| "cata", Compl 'l'
| "catal", Compl 'a'
| "catala", Compl (white_space | '\n') )]
in
match%sedlex lexbuf with
| "```catala" ->
L.is_code := true;
L.code_string_acc := "";
BEGIN_CODE
| eof -> EOF
| '>', Star white_space, "Begin metadata" -> BEGIN_METADATA
| '>', Star white_space, "End metadata" -> END_METADATA
| ( '>',
Star white_space,
"Include:",
Star white_space,
Plus (Compl ('@' | '\n')),
Star white_space,
Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
'\n' ) ->
let extract_components =
R.regexp ">\\s*Include\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
in
let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
let name = get_component 1 in
let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
let pos = lexing_positions lexbuf in
if Filename.extension name = ".pdf" then
LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
| Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
L.get_law_heading lexbuf
| Plus
(* Match non-special characters, i.e. characters that doesn't appear at the start of a
previous regexp. *)
( Compl ('#' | '`' | '>')
(* Following literals allow to match grave accents as long as they don't conflict with the
[BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
not be followed by 'catala'. *)
| Rep ('`', 1 .. 2), Compl '`'
| "```", compl_catala ) ->
LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
if at_bol then
match%sedlex lexbuf with
| eof -> EOF
| "```catala", Plus white_space ->
L.context := Code;
Buffer.clear L.code_buffer;
BEGIN_CODE
| '>' ->
L.context := Directive;
BEGIN_DIRECTIVE
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
L.get_law_heading lexbuf
| _ -> (
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
sub-match at the same point *)
let lexbuf = lexbuf in
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
match%sedlex lexbuf with
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
else
match%sedlex lexbuf with
| eof -> EOF
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
Surface.Lexer_common.is_code}. *)
let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
let lexer (lexbuf : lexbuf) : token =
match !L.context with
| Law -> lex_law lexbuf
| Code -> lex_code lexbuf
| Directive -> lex_directive lexbuf
| Directive_args -> lex_directive_args lexbuf
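
The rewritten lex_law only treats code fences, '>' directives and '#' headings specially when they start a line; anything else up to the newline becomes LAW_TEXT. The column-0 test above compares two fields of Lexing.position, as in this minimal illustration (not code from the diff):

(* pos_cnum is the absolute offset of the position and pos_bol the offset of
   the first character of its line, so they coincide exactly in column 0. *)
let at_beginning_of_line (pos : Lexing.position) : bool =
  pos.Lexing.pos_cnum = pos.Lexing.pos_bol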

View File

@ -43,6 +43,8 @@ let token_list : (string * token) list =
("somme", SUM);
("rempli", FILLED);
("définition", DEFINITION);
("étiquette", LABEL);
("exception", EXCEPTION);
("égal à", DEFINED_AS);
("selon", MATCH);
("sous forme", WITH);
@ -69,12 +71,12 @@ let token_list : (string * token) list =
("ou", OR);
("ou bien", XOR);
("non", NOT);
("nombre", CARDINAL);
("maximum", MAXIMUM);
("minimum", MINIMUM);
("filtre", FILTER);
("application", MAP);
("initial", INIT);
("nombre", CARDINAL);
("an", YEAR);
("mois", MONTH);
("jour", DAY);
@ -83,6 +85,7 @@ let token_list : (string * token) list =
]
@ L.token_list_language_agnostic
(** Localised builtin functions *)
let builtins : (string * Ast.builtin_expression) list =
[
("entier_vers_décimal", Ast.IntToDec);
@ -100,12 +103,15 @@ let digit = [%sedlex.regexp? '0' .. '9']
(** Regexp matching at least one space. *)
let space_plus = [%sedlex.regexp? Plus white_space]
(** Regexp matching white space but not newlines *)
let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
(** Main lexing function used in code blocks *)
let rec lex_code (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| white_space | '\n' ->
| white_space ->
(* Whitespaces *)
L.update_acc lexbuf;
lex_code lexbuf
@ -115,8 +121,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
lex_code lexbuf
| "```" ->
(* End of code section *)
L.is_code := false;
END_CODE !L.code_string_acc
L.context := Law;
END_CODE (Buffer.contents L.code_buffer)
| "champ", space_plus, "d\'application" ->
L.update_acc lexbuf;
SCOPE
@ -277,9 +283,6 @@ let rec lex_code (lexbuf : lexbuf) : token =
| "non" ->
L.update_acc lexbuf;
NOT
| "nombre" ->
L.update_acc lexbuf;
CARDINAL
| "maximum" ->
L.update_acc lexbuf;
MAXIMUM
@ -295,6 +298,9 @@ let rec lex_code (lexbuf : lexbuf) : token =
| "initial" ->
L.update_acc lexbuf;
INIT
| "nombre" ->
L.update_acc lexbuf;
CARDINAL
| "vrai" ->
L.update_acc lexbuf;
TRUE
@ -310,11 +316,10 @@ let rec lex_code (lexbuf : lexbuf) : token =
| "jour" ->
L.update_acc lexbuf;
DAY
| digit, Star (digit | white_space), Opt (',', Rep (digit, 0 .. 2)), Star white_space, 0x20AC ->
| digit, Star (digit | hspace), Opt (',', Rep (digit, 0 .. 2)), Star hspace, 0x20AC ->
let extract_parts = R.regexp "([0-9]([0-9 ]*[0-9]|))(,([0-9]{0,2})|)" in
let full_str = Utf8.lexeme lexbuf in
let only_numbers_str = String.trim (String.sub full_str 0 (String.length full_str - 1)) in
let parts = R.get_substring (R.exec ~rex:extract_parts only_numbers_str) in
let str = Utf8.lexeme lexbuf in
let parts = R.get_substring (R.exec ~rex:extract_parts str) in
(* Integer literal*)
let units = parts 1 in
let remove_spaces = R.regexp " " in
@ -503,64 +508,73 @@ let rec lex_code (lexbuf : lexbuf) : token =
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let rec lex_directive_args (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| '@', Star hspace, "p.", Star hspace, Plus digit ->
let s = Utf8.lexeme lexbuf in
let i = String.index s '.' in
AT_PAGE (int_of_string (String.trim (String.sub s (i + 1) (String.length s - i - 1))))
| Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
| Plus hspace -> lex_directive_args lexbuf
| '\n' | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let rec lex_directive (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| Plus hspace -> lex_directive lexbuf
| 'D', 0xE9, "but", Plus hspace, "m", 0xE9, "tadonn", 0xE9, "es" -> BEGIN_METADATA
| "Fin", Plus hspace, "m", 0xE9, "tadonn", 0xE9, "es" -> END_METADATA
| "Inclusion" -> LAW_INCLUDE
| ':' ->
L.context := Directive_args;
COLON
| '\n' | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Main lexing function used outside code blocks *)
let lex_law (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
let compl_catala =
[%sedlex.regexp?
( Compl 'c'
| 'c', Compl 'a'
| "ca", Compl 't'
| "cat", Compl 'a'
| "cata", Compl 'l'
| "catal", Compl 'a'
| "catala", Compl (white_space | '\n') )]
in
match%sedlex lexbuf with
| "```catala" ->
L.is_code := true;
L.code_string_acc := "";
BEGIN_CODE
| eof -> EOF
| '>', Star white_space, 'D', 0xE9, "but m", 0xE9, "tadonn", 0xE9, "es" -> BEGIN_METADATA
| '>', Star white_space, "Fin m", 0xE9, "tadonn", 0xE9, "es" -> END_METADATA
| ( '>',
Star white_space,
"Inclusion:",
Star white_space,
Plus (Compl ('@' | '\n')),
Star white_space,
Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
'\n' ) ->
let extract_components =
R.regexp ">\\s*Inclusion\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
in
let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
let jorftext = R.regexp "JORFTEXT\\d{12}" in
let name = get_component 1 in
let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
let pos = lexing_positions lexbuf in
if R.pmatch ~rex:jorftext name then
LAW_INCLUDE (Ast.LegislativeText (name, Pos.from_lpos pos))
else if Filename.extension name = ".pdf" then
LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
| Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
L.get_law_heading lexbuf
| Plus
(* Match non-special characters, i.e. characters that doesn't appear at the start of a
previous regexp. *)
( Compl ('#' | '`' | '>')
(* Following literals allow to match grave accents as long as they don't conflict with the
[BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
not be followed by 'catala'. *)
| Rep ('`', 1 .. 2), Compl '`'
| "```", compl_catala ) ->
LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
if at_bol then
match%sedlex lexbuf with
| eof -> EOF
| "```catala", Plus white_space ->
L.context := Code;
Buffer.clear L.code_buffer;
BEGIN_CODE
| '>' ->
L.context := Directive;
BEGIN_DIRECTIVE
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
L.get_law_heading lexbuf
| _ -> (
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
sub-match at the same point *)
let lexbuf = lexbuf in
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
match%sedlex lexbuf with
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
else
match%sedlex lexbuf with
| eof -> EOF
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
Surface.Lexer_common.is_code}. *)
let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
let lexer (lexbuf : lexbuf) : token =
match !L.context with
| Law -> lex_law lexbuf
| Code -> lex_code lexbuf
| Directive -> lex_directive lexbuf
| Directive_args -> lex_directive_args lexbuf
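
The euro-amount rule now hands the whole lexeme to the regexp instead of trimming the currency sign by hand. A standalone sketch of that extraction, using the same Re.Pcre calls as the lexer (R is Re.Pcre in these files); the function name is hypothetical:

(* Split a French amount such as "12 345,67" into units and cents: group 1
   captures the digits (possibly with spaces), group 4 the optional cents. *)
let parse_french_amount (str : string) : string * string =
  let extract_parts = Re.Pcre.regexp "([0-9]([0-9 ]*[0-9]|))(,([0-9]{0,2})|)" in
  let parts = Re.Pcre.get_substring (Re.Pcre.exec ~rex:extract_parts str) in
  let units =
    Re.Pcre.substitute ~rex:(Re.Pcre.regexp " ") ~subst:(fun _ -> "") (parts 1)
  in
  let cents = try parts 4 with Not_found -> "0" in
  (units, cents)

(* parse_french_amount "12 345,67 €" = ("12345", "67") *)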

View File

@ -105,6 +105,9 @@ let digit = [%sedlex.regexp? '0' .. '9']
(** Regexp matching at least one space. *)
let space_plus = [%sedlex.regexp? Plus white_space]
(** Regexp matching white space but not newlines *)
let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
(** Main lexing function used in code blocks *)
let rec lex_code (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
@ -120,8 +123,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
lex_code lexbuf
| "```" ->
(* End of code section *)
L.is_code := false;
END_CODE !L.code_string_acc
L.context := Law;
END_CODE (Buffer.contents L.code_buffer)
| "zakres" ->
L.update_acc lexbuf;
SCOPE
@ -309,16 +312,10 @@ let rec lex_code (lexbuf : lexbuf) : token =
| "dzien" ->
L.update_acc lexbuf;
DAY
| ( Star white_space,
digit,
Star (digit | ','),
Opt ('.', Rep (digit, 0 .. 2)),
Star white_space,
"PLN" ) ->
| digit, Star (digit | ','), Opt ('.', Rep (digit, 0 .. 2)), Star hspace, "PLN" ->
let extract_parts = R.regexp "([0-9]([0-9,]*[0-9]|))(.([0-9]{0,2})|)" in
let full_str = Utf8.lexeme lexbuf in
let only_numbers_str = String.trim (String.sub full_str 1 (String.length full_str - 1)) in
let parts = R.get_substring (R.exec ~rex:extract_parts only_numbers_str) in
let str = Utf8.lexeme lexbuf in
let parts = R.get_substring (R.exec ~rex:extract_parts str) in
(* Integer literal*)
let units = parts 1 in
let remove_commas = R.regexp "," in
@ -507,61 +504,73 @@ let rec lex_code (lexbuf : lexbuf) : token =
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let rec lex_directive_args (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| '@', Star hspace, "p.", Star hspace, Plus digit ->
let s = Utf8.lexeme lexbuf in
let i = String.index s '.' in
AT_PAGE (int_of_string (String.trim (String.sub s (i + 1) (String.length s - i - 1))))
| Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
| Plus hspace -> lex_directive_args lexbuf
| '\n' | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let rec lex_directive (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| Plus hspace -> lex_directive lexbuf
| "Poczatek", Plus hspace, "metadanych" -> BEGIN_METADATA
| "Koniec", Plus hspace, "metadanych" -> END_METADATA
| "Include" -> LAW_INCLUDE
| ":" ->
L.context := Directive_args;
COLON
| '\n' | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Main lexing function used outside code blocks *)
let lex_law (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
let compl_catala =
[%sedlex.regexp?
( Compl 'c'
| 'c', Compl 'a'
| "ca", Compl 't'
| "cat", Compl 'a'
| "cata", Compl 'l'
| "catal", Compl 'a'
| "catala", Compl (white_space | '\n') )]
in
match%sedlex lexbuf with
| "```catala" ->
L.is_code := true;
L.code_string_acc := "";
BEGIN_CODE
| eof -> EOF
| '>', Star white_space, "Poczatek metadanych" -> BEGIN_METADATA
| '>', Star white_space, "Koniec metadanych" -> END_METADATA
| ( '>',
Star white_space,
"Include:",
Star white_space,
Plus (Compl ('@' | '\n')),
Star white_space,
Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
'\n' ) ->
let extract_components =
R.regexp ">\\s*Include\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
in
let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
let name = get_component 1 in
let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
let pos = lexing_positions lexbuf in
if Filename.extension name = ".pdf" then
LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
| Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
L.get_law_heading lexbuf
| Plus
(* Match non-special characters, i.e. characters that doesn't appear at the start of a
previous regexp. *)
( Compl ('#' | '`' | '>')
(* Following literals allow to match grave accents as long as they don't conflict with the
[BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
not be followed by 'catala'. *)
| Rep ('`', 1 .. 2), Compl '`'
| "```", compl_catala ) ->
LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
if at_bol then
match%sedlex lexbuf with
| eof -> EOF
| "```catala", Plus white_space ->
L.context := Code;
Buffer.clear L.code_buffer;
BEGIN_CODE
| '>' ->
L.context := Directive;
BEGIN_DIRECTIVE
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
L.get_law_heading lexbuf
| _ -> (
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
sub-match at the same point *)
let lexbuf = lexbuf in
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
match%sedlex lexbuf with
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
else
match%sedlex lexbuf with
| eof -> EOF
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
Surface.Lexer_common.is_code}. *)
let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
let lexer (lexbuf : lexbuf) : token =
match !L.context with
| Law -> lex_law lexbuf
| Code -> lex_code lexbuf
| Directive -> lex_directive lexbuf
| Directive_args -> lex_directive_args lexbuf
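
In all three lexers the page reference of a directive is lexed as one lexeme such as "@ p. 12", and only the digits after the dot are kept (hence the i + 1 offset that skips the dot itself). A standalone sketch of that extraction (not from the diff; the function name is hypothetical):

let parse_at_page (s : string) : int =
  (* s is e.g. "@ p. 12"; take everything after the '.' and parse the digits *)
  let i = String.index s '.' in
  int_of_string (String.trim (String.sub s (i + 1) (String.length s - i - 1)))

(* parse_at_page "@ p. 12" = 12 *)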

File diff suppressed because it is too large

View File

@ -29,6 +29,10 @@ end>
%start source_file
(* The token is returned for every line of law text, make them right-associative
so that we concat them efficiently as much as possible. *)
%right LAW_TEXT
%%
typ_base:
@ -550,7 +554,7 @@ code:
| code = list(code_item) { (code, Pos.from_lpos $sloc) }
metadata_block:
| BEGIN_METADATA option(law_text) BEGIN_CODE code_and_pos = code text = END_CODE option(law_text) END_METADATA {
| BEGIN_DIRECTIVE BEGIN_METADATA END_DIRECTIVE option(law_text) BEGIN_CODE code_and_pos = code text = END_CODE option(law_text) BEGIN_DIRECTIVE END_METADATA END_DIRECTIVE {
let (code, pos) = code_and_pos in
(code, (text, pos))
}
@ -567,7 +571,7 @@ law_heading:
}
law_text:
| text = LAW_TEXT { String.trim text }
| lines = nonempty_list(LAW_TEXT) { String.trim (String.concat "" lines) }
source_file_item:
| text = law_text { LawText text }
@ -582,8 +586,16 @@ source_file_item:
let (code, source_repr) = code in
CodeBlock (code, source_repr, true)
}
| includ = LAW_INCLUDE {
LawInclude includ
| BEGIN_DIRECTIVE LAW_INCLUDE COLON args = nonempty_list(DIRECTIVE_ARG) page = option(AT_PAGE) END_DIRECTIVE {
let filename = String.trim (String.concat "" args) in
let pos = Pos.from_lpos $sloc in
let jorftext = Re.Pcre.regexp "JORFTEXT\\d{12}" in
if Re.Pcre.pmatch ~rex:jorftext filename && page = None then
LawInclude (Ast.LegislativeText (filename, pos))
else if Filename.extension filename = ".pdf" || page <> None then
LawInclude (Ast.PdfFile ((filename, pos), page))
else
LawInclude (Ast.CatalaFile (filename, pos))
}
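
The include directive is now assembled in the parser from generic directive tokens instead of being special-cased in every lexer. A minimal standalone restatement of the classification above, with hypothetical names, convenient for checking the intended behaviour:

type include_kind = Legislative | Pdf of int option | Catala

let classify_include (filename : string) (page : int option) : include_kind =
  let jorftext = Re.Pcre.regexp "JORFTEXT\\d{12}" in
  if Re.Pcre.pmatch ~rex:jorftext filename && page = None then Legislative
  else if Filename.extension filename = ".pdf" || page <> None then Pdf page
  else Catala

(* classify_include "JORFTEXT000033971124" None  = Legislative
   classify_include "CGI.pdf" (Some 42)          = Pdf (Some 42)
   classify_include "tutorial_en.catala_en" None = Catala *)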

View File

@ -23,7 +23,11 @@
%token EOF
%token<string * string option * string option * int> LAW_HEADING
%token<Ast.law_include> LAW_INCLUDE
%token BEGIN_DIRECTIVE END_DIRECTIVE LAW_INCLUDE
%token<int> AT_PAGE
%token<string> DIRECTIVE_ARG
%token<string> LAW_TEXT
%token<string> CONSTRUCTOR IDENT
%token<string> END_CODE