mirror of
https://github.com/CatalaLang/catala.git
synced 2024-09-19 16:28:12 +03:00
Merge pull request #138 from AltGr/lexer-refactor
Cleanup the lexer, and refactor for more generic directives
This commit is contained in:
commit
7cbb4a9149
1
.github/workflows/build.yml
vendored
1
.github/workflows/build.yml
vendored
@ -55,4 +55,5 @@ jobs:
|
||||
- name: Make all
|
||||
run: |
|
||||
eval $(opam env)
|
||||
export OCAMLRUNPARAM=b
|
||||
make all
|
||||
|
2
Makefile
2
Makefile
@ -21,7 +21,7 @@ dependencies-ocaml:
|
||||
ocamlformat ANSITerminal sedlex menhir menhirLib dune cmdliner obelisk \
|
||||
re obelisk unionfind bindlib zarith.1.11 zarith_stubs_js.v0.14.0 ocamlgraph \
|
||||
js_of_ocaml-compiler js_of_ocaml js_of_ocaml-ppx calendar camomile \
|
||||
visitors benchmark
|
||||
visitors benchmark odoc
|
||||
|
||||
dependencies-js:
|
||||
$(MAKE) -C $(FRENCH_LAW_JS_LIB_DIR) dependencies
|
||||
|
@ -21,7 +21,7 @@
|
||||
(menhir
|
||||
(modules tokens parser)
|
||||
(merge_into parser)
|
||||
(flags --external-tokens Tokens --table))
|
||||
(flags --external-tokens Tokens --table --explain))
|
||||
|
||||
(documentation
|
||||
(package catala)
|
||||
|
@ -35,17 +35,19 @@ let get_law_heading (lexbuf : lexbuf) : token =
|
||||
let precedence = calc_precedence (String.trim (get_substring 1)) in
|
||||
LAW_HEADING (title, article_id, article_expiration_date, precedence)
|
||||
|
||||
type lexing_context = Law | Code | Directive | Directive_args
|
||||
|
||||
(** Boolean reference, used by the lexer as the mutable state to distinguish whether it is lexing
|
||||
code or law. *)
|
||||
let is_code : bool ref = ref false
|
||||
let context : lexing_context ref = ref Law
|
||||
|
||||
(** Mutable string reference that accumulates the string representation of the body of code being
|
||||
lexed. This string representation is used in the literate programming backends to faithfully
|
||||
capture the spacing pattern of the original program *)
|
||||
let code_string_acc : string ref = ref ""
|
||||
let code_buffer : Buffer.t = Buffer.create 4000
|
||||
|
||||
(** Updates {!val:code_string_acc} with the current lexeme *)
|
||||
let update_acc (lexbuf : lexbuf) : unit = code_string_acc := !code_string_acc ^ Utf8.lexeme lexbuf
|
||||
(** Updates {!val:code_buffer} with the current lexeme *)
|
||||
let update_acc (lexbuf : lexbuf) : unit = Buffer.add_string code_buffer (Utf8.lexeme lexbuf)
|
||||
|
||||
(** Error-generating helper *)
|
||||
let raise_lexer_error (loc : Pos.t) (token : string) =
|
||||
|
@ -14,17 +14,19 @@
|
||||
|
||||
(** Auxiliary functions used by all lexers. *)
|
||||
|
||||
val is_code : bool ref
|
||||
(** Boolean reference, used by the lexer as the mutable state to distinguish whether it is lexing
|
||||
code or law. *)
|
||||
type lexing_context = Law | Code | Directive | Directive_args
|
||||
|
||||
val code_string_acc : string ref
|
||||
(** Mutable string reference that accumulates the string representation of the body of code being
|
||||
lexed. This string representation is used in the literate programming backends to faithfully
|
||||
capture the spacing pattern of the original program *)
|
||||
val context : lexing_context ref
|
||||
(** Reference, used by the lexer as the mutable state to distinguish whether it is lexing code or
|
||||
law. *)
|
||||
|
||||
val code_buffer : Buffer.t
|
||||
(** Buffer that accumulates the string representation of the body of code being lexed. This string
|
||||
representation is used in the literate programming backends to faithfully capture the spacing
|
||||
pattern of the original program *)
|
||||
|
||||
val update_acc : Sedlexing.lexbuf -> unit
|
||||
(** Updates {!val:code_string_acc} with the current lexeme *)
|
||||
(** Updates {!val:code_buffer} with the current lexeme *)
|
||||
|
||||
val raise_lexer_error : Utils.Pos.t -> string -> 'a
|
||||
(** Error-generating helper *)
|
||||
|
@ -103,6 +103,9 @@ let digit = [%sedlex.regexp? '0' .. '9']
|
||||
(** Regexp matching at least one space. *)
|
||||
let space_plus = [%sedlex.regexp? Plus white_space]
|
||||
|
||||
(** Regexp matching white space but not newlines *)
|
||||
let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
|
||||
|
||||
(** Main lexing function used in code blocks *)
|
||||
let rec lex_code (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
@ -118,8 +121,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
lex_code lexbuf
|
||||
| "```" ->
|
||||
(* End of code section *)
|
||||
L.is_code := false;
|
||||
END_CODE !L.code_string_acc
|
||||
L.context := Law;
|
||||
END_CODE (Buffer.contents L.code_buffer)
|
||||
| "scope" ->
|
||||
L.update_acc lexbuf;
|
||||
SCOPE
|
||||
@ -307,7 +310,7 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
| "day" ->
|
||||
L.update_acc lexbuf;
|
||||
DAY
|
||||
| 0x24, Star white_space, digit, Star (digit | ','), Opt ('.', Rep (digit, 0 .. 2)) ->
|
||||
| 0x24, Star hspace, digit, Star (digit | ','), Opt ('.', Rep (digit, 0 .. 2)) ->
|
||||
let extract_parts = R.regexp "([0-9]([0-9,]*[0-9]|))(.([0-9]{0,2})|)" in
|
||||
let full_str = Utf8.lexeme lexbuf in
|
||||
let only_numbers_str = String.trim (String.sub full_str 1 (String.length full_str - 1)) in
|
||||
@ -500,60 +503,73 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||
|
||||
let rec lex_directive_args (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
let prev_pos = lexing_positions lexbuf in
|
||||
match%sedlex lexbuf with
|
||||
| '@', Star hspace, "p.", Star hspace, Plus digit ->
|
||||
let s = Utf8.lexeme lexbuf in
|
||||
let i = String.index s '.' in
|
||||
AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
|
||||
| Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
|
||||
| Plus hspace -> lex_directive_args lexbuf
|
||||
| '\n' | eof ->
|
||||
L.context := Law;
|
||||
END_DIRECTIVE
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||
|
||||
let rec lex_directive (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
let prev_pos = lexing_positions lexbuf in
|
||||
match%sedlex lexbuf with
|
||||
| Plus hspace -> lex_directive lexbuf
|
||||
| "Begin", Plus hspace, "metadata" -> BEGIN_METADATA
|
||||
| "End", Plus hspace, "metadata" -> END_METADATA
|
||||
| "Include" -> LAW_INCLUDE
|
||||
| ":" ->
|
||||
L.context := Directive_args;
|
||||
COLON
|
||||
| '\n' | eof ->
|
||||
L.context := Law;
|
||||
END_DIRECTIVE
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||
|
||||
(** Main lexing function used outside code blocks *)
|
||||
let lex_law (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
let prev_pos = lexing_positions lexbuf in
|
||||
let compl_catala =
|
||||
[%sedlex.regexp?
|
||||
( Compl 'c'
|
||||
| 'c', Compl 'a'
|
||||
| "ca", Compl 't'
|
||||
| "cat", Compl 'a'
|
||||
| "cata", Compl 'l'
|
||||
| "catal", Compl 'a'
|
||||
| "catala", Compl (white_space | '\n') )]
|
||||
in
|
||||
match%sedlex lexbuf with
|
||||
| "```catala" ->
|
||||
L.is_code := true;
|
||||
L.code_string_acc := "";
|
||||
BEGIN_CODE
|
||||
| eof -> EOF
|
||||
| '>', Star white_space, "Begin metadata" -> BEGIN_METADATA
|
||||
| '>', Star white_space, "End metadata" -> END_METADATA
|
||||
| ( '>',
|
||||
Star white_space,
|
||||
"Include:",
|
||||
Star white_space,
|
||||
Plus (Compl ('@' | '\n')),
|
||||
Star white_space,
|
||||
Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
|
||||
'\n' ) ->
|
||||
let extract_components =
|
||||
R.regexp ">\\s*Include\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
|
||||
in
|
||||
let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
|
||||
let name = get_component 1 in
|
||||
let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
|
||||
let pos = lexing_positions lexbuf in
|
||||
if Filename.extension name = ".pdf" then
|
||||
LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
|
||||
else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
|
||||
| Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
|
||||
L.get_law_heading lexbuf
|
||||
| Plus
|
||||
(* Match non-special characters, i.e. characters that doesn't appear at the start of a
|
||||
previous regexp. *)
|
||||
( Compl ('#' | '`' | '>')
|
||||
(* Following literals allow to match grave accents as long as they don't conflict with the
|
||||
[BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
|
||||
not be followed by 'catala'. *)
|
||||
| Rep ('`', 1 .. 2), Compl '`'
|
||||
| "```", compl_catala ) ->
|
||||
LAW_TEXT (Utf8.lexeme lexbuf)
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
|
||||
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
|
||||
if at_bol then
|
||||
match%sedlex lexbuf with
|
||||
| eof -> EOF
|
||||
| "```catala", Plus white_space ->
|
||||
L.context := Code;
|
||||
Buffer.clear L.code_buffer;
|
||||
BEGIN_CODE
|
||||
| '>' ->
|
||||
L.context := Directive;
|
||||
BEGIN_DIRECTIVE
|
||||
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
|
||||
L.get_law_heading lexbuf
|
||||
| _ -> (
|
||||
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
|
||||
sub-match at the same point *)
|
||||
let lexbuf = lexbuf in
|
||||
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
|
||||
match%sedlex lexbuf with
|
||||
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
|
||||
else
|
||||
match%sedlex lexbuf with
|
||||
| eof -> EOF
|
||||
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||
|
||||
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
|
||||
Surface.Lexer_common.is_code}. *)
|
||||
let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
|
||||
let lexer (lexbuf : lexbuf) : token =
|
||||
match !L.context with
|
||||
| Law -> lex_law lexbuf
|
||||
| Code -> lex_code lexbuf
|
||||
| Directive -> lex_directive lexbuf
|
||||
| Directive_args -> lex_directive_args lexbuf
|
||||
|
@ -43,6 +43,8 @@ let token_list : (string * token) list =
|
||||
("somme", SUM);
|
||||
("rempli", FILLED);
|
||||
("définition", DEFINITION);
|
||||
("étiquette", LABEL);
|
||||
("exception", EXCEPTION);
|
||||
("égal à", DEFINED_AS);
|
||||
("selon", MATCH);
|
||||
("sous forme", WITH);
|
||||
@ -69,12 +71,12 @@ let token_list : (string * token) list =
|
||||
("ou", OR);
|
||||
("ou bien", XOR);
|
||||
("non", NOT);
|
||||
("nombre", CARDINAL);
|
||||
("maximum", MAXIMUM);
|
||||
("minimum", MINIMUM);
|
||||
("filtre", FILTER);
|
||||
("application", MAP);
|
||||
("initial", INIT);
|
||||
("nombre", CARDINAL);
|
||||
("an", YEAR);
|
||||
("mois", MONTH);
|
||||
("jour", DAY);
|
||||
@ -83,6 +85,7 @@ let token_list : (string * token) list =
|
||||
]
|
||||
@ L.token_list_language_agnostic
|
||||
|
||||
(** Localised builtin functions *)
|
||||
let builtins : (string * Ast.builtin_expression) list =
|
||||
[
|
||||
("entier_vers_décimal", Ast.IntToDec);
|
||||
@ -100,12 +103,15 @@ let digit = [%sedlex.regexp? '0' .. '9']
|
||||
(** Regexp matching at least one space. *)
|
||||
let space_plus = [%sedlex.regexp? Plus white_space]
|
||||
|
||||
(** Regexp matching white space but not newlines *)
|
||||
let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
|
||||
|
||||
(** Main lexing function used in code blocks *)
|
||||
let rec lex_code (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
let prev_pos = lexing_positions lexbuf in
|
||||
match%sedlex lexbuf with
|
||||
| white_space | '\n' ->
|
||||
| white_space ->
|
||||
(* Whitespaces *)
|
||||
L.update_acc lexbuf;
|
||||
lex_code lexbuf
|
||||
@ -115,8 +121,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
lex_code lexbuf
|
||||
| "```" ->
|
||||
(* End of code section *)
|
||||
L.is_code := false;
|
||||
END_CODE !L.code_string_acc
|
||||
L.context := Law;
|
||||
END_CODE (Buffer.contents L.code_buffer)
|
||||
| "champ", space_plus, "d\'application" ->
|
||||
L.update_acc lexbuf;
|
||||
SCOPE
|
||||
@ -277,9 +283,6 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
| "non" ->
|
||||
L.update_acc lexbuf;
|
||||
NOT
|
||||
| "nombre" ->
|
||||
L.update_acc lexbuf;
|
||||
CARDINAL
|
||||
| "maximum" ->
|
||||
L.update_acc lexbuf;
|
||||
MAXIMUM
|
||||
@ -295,6 +298,9 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
| "initial" ->
|
||||
L.update_acc lexbuf;
|
||||
INIT
|
||||
| "nombre" ->
|
||||
L.update_acc lexbuf;
|
||||
CARDINAL
|
||||
| "vrai" ->
|
||||
L.update_acc lexbuf;
|
||||
TRUE
|
||||
@ -310,11 +316,10 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
| "jour" ->
|
||||
L.update_acc lexbuf;
|
||||
DAY
|
||||
| digit, Star (digit | white_space), Opt (',', Rep (digit, 0 .. 2)), Star white_space, 0x20AC ->
|
||||
| digit, Star (digit | hspace), Opt (',', Rep (digit, 0 .. 2)), Star hspace, 0x20AC ->
|
||||
let extract_parts = R.regexp "([0-9]([0-9 ]*[0-9]|))(,([0-9]{0,2})|)" in
|
||||
let full_str = Utf8.lexeme lexbuf in
|
||||
let only_numbers_str = String.trim (String.sub full_str 0 (String.length full_str - 1)) in
|
||||
let parts = R.get_substring (R.exec ~rex:extract_parts only_numbers_str) in
|
||||
let str = Utf8.lexeme lexbuf in
|
||||
let parts = R.get_substring (R.exec ~rex:extract_parts str) in
|
||||
(* Integer literal*)
|
||||
let units = parts 1 in
|
||||
let remove_spaces = R.regexp " " in
|
||||
@ -503,64 +508,73 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||
|
||||
let rec lex_directive_args (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
let prev_pos = lexing_positions lexbuf in
|
||||
match%sedlex lexbuf with
|
||||
| '@', Star hspace, "p.", Star hspace, Plus digit ->
|
||||
let s = Utf8.lexeme lexbuf in
|
||||
let i = String.index s '.' in
|
||||
AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
|
||||
| Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
|
||||
| Plus hspace -> lex_directive_args lexbuf
|
||||
| '\n' | eof ->
|
||||
L.context := Law;
|
||||
END_DIRECTIVE
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||
|
||||
let rec lex_directive (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
let prev_pos = lexing_positions lexbuf in
|
||||
match%sedlex lexbuf with
|
||||
| Plus hspace -> lex_directive lexbuf
|
||||
| 'D', 0xE9, "but", Plus hspace, "m", 0xE9, "tadonn", 0xE9, "es" -> BEGIN_METADATA
|
||||
| "Fin", Plus hspace, "m", 0xE9, "tadonn", 0xE9, "es" -> END_METADATA
|
||||
| "Inclusion" -> LAW_INCLUDE
|
||||
| ':' ->
|
||||
L.context := Directive_args;
|
||||
COLON
|
||||
| '\n' | eof ->
|
||||
L.context := Law;
|
||||
END_DIRECTIVE
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||
|
||||
(** Main lexing function used outside code blocks *)
|
||||
let lex_law (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
let prev_pos = lexing_positions lexbuf in
|
||||
let compl_catala =
|
||||
[%sedlex.regexp?
|
||||
( Compl 'c'
|
||||
| 'c', Compl 'a'
|
||||
| "ca", Compl 't'
|
||||
| "cat", Compl 'a'
|
||||
| "cata", Compl 'l'
|
||||
| "catal", Compl 'a'
|
||||
| "catala", Compl (white_space | '\n') )]
|
||||
in
|
||||
match%sedlex lexbuf with
|
||||
| "```catala" ->
|
||||
L.is_code := true;
|
||||
L.code_string_acc := "";
|
||||
|
||||
BEGIN_CODE
|
||||
| eof -> EOF
|
||||
| '>', Star white_space, 'D', 0xE9, "but m", 0xE9, "tadonn", 0xE9, "es" -> BEGIN_METADATA
|
||||
| '>', Star white_space, "Fin m", 0xE9, "tadonn", 0xE9, "es" -> END_METADATA
|
||||
| ( '>',
|
||||
Star white_space,
|
||||
"Inclusion:",
|
||||
Star white_space,
|
||||
Plus (Compl ('@' | '\n')),
|
||||
Star white_space,
|
||||
Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
|
||||
'\n' ) ->
|
||||
let extract_components =
|
||||
R.regexp ">\\s*Inclusion\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
|
||||
in
|
||||
let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
|
||||
let jorftext = R.regexp "JORFTEXT\\d{12}" in
|
||||
let name = get_component 1 in
|
||||
let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
|
||||
let pos = lexing_positions lexbuf in
|
||||
if R.pmatch ~rex:jorftext name then
|
||||
LAW_INCLUDE (Ast.LegislativeText (name, Pos.from_lpos pos))
|
||||
else if Filename.extension name = ".pdf" then
|
||||
LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
|
||||
else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
|
||||
| Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
|
||||
L.get_law_heading lexbuf
|
||||
| Plus
|
||||
(* Match non-special characters, i.e. characters that doesn't appear at the start of a
|
||||
previous regexp. *)
|
||||
( Compl ('#' | '`' | '>')
|
||||
(* Following literals allow to match grave accents as long as they don't conflict with the
|
||||
[BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
|
||||
not be followed by 'catala'. *)
|
||||
| Rep ('`', 1 .. 2), Compl '`'
|
||||
| "```", compl_catala ) ->
|
||||
LAW_TEXT (Utf8.lexeme lexbuf)
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
|
||||
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
|
||||
if at_bol then
|
||||
match%sedlex lexbuf with
|
||||
| eof -> EOF
|
||||
| "```catala", Plus white_space ->
|
||||
L.context := Code;
|
||||
Buffer.clear L.code_buffer;
|
||||
BEGIN_CODE
|
||||
| '>' ->
|
||||
L.context := Directive;
|
||||
BEGIN_DIRECTIVE
|
||||
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
|
||||
L.get_law_heading lexbuf
|
||||
| _ -> (
|
||||
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
|
||||
sub-match at the same point *)
|
||||
let lexbuf = lexbuf in
|
||||
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
|
||||
match%sedlex lexbuf with
|
||||
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
|
||||
else
|
||||
match%sedlex lexbuf with
|
||||
| eof -> EOF
|
||||
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||
|
||||
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
|
||||
Surface.Lexer_common.is_code}. *)
|
||||
let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
|
||||
let lexer (lexbuf : lexbuf) : token =
|
||||
match !L.context with
|
||||
| Law -> lex_law lexbuf
|
||||
| Code -> lex_code lexbuf
|
||||
| Directive -> lex_directive lexbuf
|
||||
| Directive_args -> lex_directive_args lexbuf
|
||||
|
@ -105,6 +105,9 @@ let digit = [%sedlex.regexp? '0' .. '9']
|
||||
(** Regexp matching at least one space. *)
|
||||
let space_plus = [%sedlex.regexp? Plus white_space]
|
||||
|
||||
(** Regexp matching white space but not newlines *)
|
||||
let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
|
||||
|
||||
(** Main lexing function used in code blocks *)
|
||||
let rec lex_code (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
@ -120,8 +123,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
lex_code lexbuf
|
||||
| "```" ->
|
||||
(* End of code section *)
|
||||
L.is_code := false;
|
||||
END_CODE !L.code_string_acc
|
||||
L.context := Law;
|
||||
END_CODE (Buffer.contents L.code_buffer)
|
||||
| "zakres" ->
|
||||
L.update_acc lexbuf;
|
||||
SCOPE
|
||||
@ -309,16 +312,10 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
| "dzien" ->
|
||||
L.update_acc lexbuf;
|
||||
DAY
|
||||
| ( Star white_space,
|
||||
digit,
|
||||
Star (digit | ','),
|
||||
Opt ('.', Rep (digit, 0 .. 2)),
|
||||
Star white_space,
|
||||
"PLN" ) ->
|
||||
| digit, Star (digit | ','), Opt ('.', Rep (digit, 0 .. 2)), Star hspace, "PLN" ->
|
||||
let extract_parts = R.regexp "([0-9]([0-9,]*[0-9]|))(.([0-9]{0,2})|)" in
|
||||
let full_str = Utf8.lexeme lexbuf in
|
||||
let only_numbers_str = String.trim (String.sub full_str 1 (String.length full_str - 1)) in
|
||||
let parts = R.get_substring (R.exec ~rex:extract_parts only_numbers_str) in
|
||||
let str = Utf8.lexeme lexbuf in
|
||||
let parts = R.get_substring (R.exec ~rex:extract_parts str) in
|
||||
(* Integer literal*)
|
||||
let units = parts 1 in
|
||||
let remove_commas = R.regexp "," in
|
||||
@ -507,61 +504,73 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||
|
||||
let rec lex_directive_args (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
let prev_pos = lexing_positions lexbuf in
|
||||
match%sedlex lexbuf with
|
||||
| '@', Star hspace, "p.", Star hspace, Plus digit ->
|
||||
let s = Utf8.lexeme lexbuf in
|
||||
let i = String.index s '.' in
|
||||
AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
|
||||
| Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
|
||||
| Plus hspace -> lex_directive_args lexbuf
|
||||
| '\n' | eof ->
|
||||
L.context := Law;
|
||||
END_DIRECTIVE
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||
|
||||
let rec lex_directive (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
let prev_pos = lexing_positions lexbuf in
|
||||
match%sedlex lexbuf with
|
||||
| Plus hspace -> lex_directive lexbuf
|
||||
| "Poczatek", Plus hspace, "metadanych" -> BEGIN_METADATA
|
||||
| "Koniec", Plus hspace, "metadanych" -> END_METADATA
|
||||
| "Include" -> LAW_INCLUDE
|
||||
| ":" ->
|
||||
L.context := Directive_args;
|
||||
COLON
|
||||
| '\n' | eof ->
|
||||
L.context := Law;
|
||||
END_DIRECTIVE
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||
|
||||
(** Main lexing function used outside code blocks *)
|
||||
let lex_law (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
let prev_pos = lexing_positions lexbuf in
|
||||
let compl_catala =
|
||||
[%sedlex.regexp?
|
||||
( Compl 'c'
|
||||
| 'c', Compl 'a'
|
||||
| "ca", Compl 't'
|
||||
| "cat", Compl 'a'
|
||||
| "cata", Compl 'l'
|
||||
| "catal", Compl 'a'
|
||||
| "catala", Compl (white_space | '\n') )]
|
||||
in
|
||||
match%sedlex lexbuf with
|
||||
| "```catala" ->
|
||||
L.is_code := true;
|
||||
L.code_string_acc := "";
|
||||
|
||||
BEGIN_CODE
|
||||
| eof -> EOF
|
||||
| '>', Star white_space, "Poczatek metadanych" -> BEGIN_METADATA
|
||||
| '>', Star white_space, "Koniec metadanych" -> END_METADATA
|
||||
| ( '>',
|
||||
Star white_space,
|
||||
"Include:",
|
||||
Star white_space,
|
||||
Plus (Compl ('@' | '\n')),
|
||||
Star white_space,
|
||||
Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
|
||||
'\n' ) ->
|
||||
let extract_components =
|
||||
R.regexp ">\\s*Include\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
|
||||
in
|
||||
let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
|
||||
let name = get_component 1 in
|
||||
let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
|
||||
let pos = lexing_positions lexbuf in
|
||||
if Filename.extension name = ".pdf" then
|
||||
LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
|
||||
else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
|
||||
| Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
|
||||
L.get_law_heading lexbuf
|
||||
| Plus
|
||||
(* Match non-special characters, i.e. characters that doesn't appear at the start of a
|
||||
previous regexp. *)
|
||||
( Compl ('#' | '`' | '>')
|
||||
(* Following literals allow to match grave accents as long as they don't conflict with the
|
||||
[BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
|
||||
not be followed by 'catala'. *)
|
||||
| Rep ('`', 1 .. 2), Compl '`'
|
||||
| "```", compl_catala ) ->
|
||||
LAW_TEXT (Utf8.lexeme lexbuf)
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
|
||||
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
|
||||
if at_bol then
|
||||
match%sedlex lexbuf with
|
||||
| eof -> EOF
|
||||
| "```catala", Plus white_space ->
|
||||
L.context := Code;
|
||||
Buffer.clear L.code_buffer;
|
||||
BEGIN_CODE
|
||||
| '>' ->
|
||||
L.context := Directive;
|
||||
BEGIN_DIRECTIVE
|
||||
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
|
||||
L.get_law_heading lexbuf
|
||||
| _ -> (
|
||||
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
|
||||
sub-match at the same point *)
|
||||
let lexbuf = lexbuf in
|
||||
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
|
||||
match%sedlex lexbuf with
|
||||
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
|
||||
else
|
||||
match%sedlex lexbuf with
|
||||
| eof -> EOF
|
||||
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
|
||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||
|
||||
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
|
||||
Surface.Lexer_common.is_code}. *)
|
||||
let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
|
||||
let lexer (lexbuf : lexbuf) : token =
|
||||
match !L.context with
|
||||
| Law -> lex_law lexbuf
|
||||
| Code -> lex_code lexbuf
|
||||
| Directive -> lex_directive lexbuf
|
||||
| Directive_args -> lex_directive_args lexbuf
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -29,6 +29,10 @@ end>
|
||||
|
||||
%start source_file
|
||||
|
||||
(* The token is returned for every line of law text, make them right-associative
|
||||
so that we concat them efficiently as much as possible. *)
|
||||
%right LAW_TEXT
|
||||
|
||||
%%
|
||||
|
||||
typ_base:
|
||||
@ -550,7 +554,7 @@ code:
|
||||
| code = list(code_item) { (code, Pos.from_lpos $sloc) }
|
||||
|
||||
metadata_block:
|
||||
| BEGIN_METADATA option(law_text) BEGIN_CODE code_and_pos = code text = END_CODE option(law_text) END_METADATA {
|
||||
| BEGIN_DIRECTIVE BEGIN_METADATA END_DIRECTIVE option(law_text) BEGIN_CODE code_and_pos = code text = END_CODE option(law_text) BEGIN_DIRECTIVE END_METADATA END_DIRECTIVE {
|
||||
let (code, pos) = code_and_pos in
|
||||
(code, (text, pos))
|
||||
}
|
||||
@ -567,7 +571,7 @@ law_heading:
|
||||
}
|
||||
|
||||
law_text:
|
||||
| text = LAW_TEXT { String.trim text }
|
||||
| lines = nonempty_list(LAW_TEXT) { String.trim (String.concat "" lines) }
|
||||
|
||||
source_file_item:
|
||||
| text = law_text { LawText text }
|
||||
@ -582,8 +586,16 @@ source_file_item:
|
||||
let (code, source_repr) = code in
|
||||
CodeBlock (code, source_repr, true)
|
||||
}
|
||||
| includ = LAW_INCLUDE {
|
||||
LawInclude includ
|
||||
| BEGIN_DIRECTIVE LAW_INCLUDE COLON args = nonempty_list(DIRECTIVE_ARG) page = option(AT_PAGE) END_DIRECTIVE {
|
||||
let filename = String.trim (String.concat "" args) in
|
||||
let pos = Pos.from_lpos $sloc in
|
||||
let jorftext = Re.Pcre.regexp "JORFTEXT\\d{12}" in
|
||||
if Re.Pcre.pmatch ~rex:jorftext filename && page = None then
|
||||
LawInclude (Ast.LegislativeText (filename, pos))
|
||||
else if Filename.extension filename = ".pdf" || page <> None then
|
||||
LawInclude (Ast.PdfFile ((filename, pos), page))
|
||||
else
|
||||
LawInclude (Ast.CatalaFile (filename, pos))
|
||||
}
|
||||
|
||||
|
||||
|
@ -23,7 +23,11 @@
|
||||
|
||||
%token EOF
|
||||
%token<string * string option * string option * int> LAW_HEADING
|
||||
%token<Ast.law_include> LAW_INCLUDE
|
||||
|
||||
%token BEGIN_DIRECTIVE END_DIRECTIVE LAW_INCLUDE
|
||||
%token<int> AT_PAGE
|
||||
%token<string> DIRECTIVE_ARG
|
||||
|
||||
%token<string> LAW_TEXT
|
||||
%token<string> CONSTRUCTOR IDENT
|
||||
%token<string> END_CODE
|
||||
|
Loading…
Reference in New Issue
Block a user