mirror of
https://github.com/CatalaLang/catala.git
synced 2024-11-09 22:16:10 +03:00
Cleanup the lexer, and refactor for more generic directives
This commit is contained in:
parent
0659816782
commit
b31bee71ad
@ -35,17 +35,19 @@ let get_law_heading (lexbuf : lexbuf) : token =
|
|||||||
let precedence = calc_precedence (String.trim (get_substring 1)) in
|
let precedence = calc_precedence (String.trim (get_substring 1)) in
|
||||||
LAW_HEADING (title, article_id, article_expiration_date, precedence)
|
LAW_HEADING (title, article_id, article_expiration_date, precedence)
|
||||||
|
|
||||||
|
type lexing_context = Law | Code | Directive | Directive_args
|
||||||
|
|
||||||
(** Boolean reference, used by the lexer as the mutable state to distinguish whether it is lexing
|
(** Boolean reference, used by the lexer as the mutable state to distinguish whether it is lexing
|
||||||
code or law. *)
|
code or law. *)
|
||||||
let is_code : bool ref = ref false
|
let context : lexing_context ref = ref Law
|
||||||
|
|
||||||
(** Mutable string reference that accumulates the string representation of the body of code being
|
(** Mutable string reference that accumulates the string representation of the body of code being
|
||||||
lexed. This string representation is used in the literate programming backends to faithfully
|
lexed. This string representation is used in the literate programming backends to faithfully
|
||||||
capture the spacing pattern of the original program *)
|
capture the spacing pattern of the original program *)
|
||||||
let code_string_acc : string ref = ref ""
|
let code_buffer : Buffer.t = Buffer.create 4000
|
||||||
|
|
||||||
(** Updates {!val:code_string_acc} with the current lexeme *)
|
(** Updates {!val:code_buffer} with the current lexeme *)
|
||||||
let update_acc (lexbuf : lexbuf) : unit = code_string_acc := !code_string_acc ^ Utf8.lexeme lexbuf
|
let update_acc (lexbuf : lexbuf) : unit = Buffer.add_string code_buffer (Utf8.lexeme lexbuf)
|
||||||
|
|
||||||
(** Error-generating helper *)
|
(** Error-generating helper *)
|
||||||
let raise_lexer_error (loc : Pos.t) (token : string) =
|
let raise_lexer_error (loc : Pos.t) (token : string) =
|
||||||
|
@ -14,17 +14,19 @@
|
|||||||
|
|
||||||
(** Auxiliary functions used by all lexers. *)
|
(** Auxiliary functions used by all lexers. *)
|
||||||
|
|
||||||
val is_code : bool ref
|
type lexing_context = Law | Code | Directive | Directive_args
|
||||||
(** Boolean reference, used by the lexer as the mutable state to distinguish whether it is lexing
|
|
||||||
code or law. *)
|
|
||||||
|
|
||||||
val code_string_acc : string ref
|
val context : lexing_context ref
|
||||||
(** Mutable string reference that accumulates the string representation of the body of code being
|
(** Reference, used by the lexer as the mutable state to distinguish whether it is lexing code or
|
||||||
lexed. This string representation is used in the literate programming backends to faithfully
|
law. *)
|
||||||
capture the spacing pattern of the original program *)
|
|
||||||
|
val code_buffer : Buffer.t
|
||||||
|
(** Buffer that accumulates the string representation of the body of code being lexed. This string
|
||||||
|
representation is used in the literate programming backends to faithfully capture the spacing
|
||||||
|
pattern of the original program *)
|
||||||
|
|
||||||
val update_acc : Sedlexing.lexbuf -> unit
|
val update_acc : Sedlexing.lexbuf -> unit
|
||||||
(** Updates {!val:code_string_acc} with the current lexeme *)
|
(** Updates {!val:code_buffer} with the current lexeme *)
|
||||||
|
|
||||||
val raise_lexer_error : Utils.Pos.t -> string -> 'a
|
val raise_lexer_error : Utils.Pos.t -> string -> 'a
|
||||||
(** Error-generating helper *)
|
(** Error-generating helper *)
|
||||||
|
@ -103,6 +103,9 @@ let digit = [%sedlex.regexp? '0' .. '9']
|
|||||||
(** Regexp matching at least one space. *)
|
(** Regexp matching at least one space. *)
|
||||||
let space_plus = [%sedlex.regexp? Plus white_space]
|
let space_plus = [%sedlex.regexp? Plus white_space]
|
||||||
|
|
||||||
|
(** Regexp matching white space but not newlines *)
|
||||||
|
let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
|
||||||
|
|
||||||
(** Main lexing function used in code blocks *)
|
(** Main lexing function used in code blocks *)
|
||||||
let rec lex_code (lexbuf : lexbuf) : token =
|
let rec lex_code (lexbuf : lexbuf) : token =
|
||||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||||
@ -118,8 +121,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
|||||||
lex_code lexbuf
|
lex_code lexbuf
|
||||||
| "```" ->
|
| "```" ->
|
||||||
(* End of code section *)
|
(* End of code section *)
|
||||||
L.is_code := false;
|
L.context := Law;
|
||||||
END_CODE !L.code_string_acc
|
END_CODE (Buffer.contents L.code_buffer)
|
||||||
| "scope" ->
|
| "scope" ->
|
||||||
L.update_acc lexbuf;
|
L.update_acc lexbuf;
|
||||||
SCOPE
|
SCOPE
|
||||||
@ -500,60 +503,72 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
|||||||
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
|
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
|
||||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||||
|
|
||||||
|
let rec lex_directive_args (lexbuf : lexbuf) : token =
|
||||||
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||||
|
let prev_pos = lexing_positions lexbuf in
|
||||||
|
match%sedlex lexbuf with
|
||||||
|
| '@', Star hspace, "p.", Star hspace, Plus digit ->
|
||||||
|
let s = Utf8.lexeme lexbuf in
|
||||||
|
let i = String.index s '.' in
|
||||||
|
AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
|
||||||
|
| Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
|
||||||
|
| Plus hspace -> lex_directive_args lexbuf
|
||||||
|
| '\n' | eof ->
|
||||||
|
L.context := Law;
|
||||||
|
END_DIRECTIVE
|
||||||
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||||
|
|
||||||
|
let rec lex_directive (lexbuf : lexbuf) : token =
|
||||||
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||||
|
let prev_pos = lexing_positions lexbuf in
|
||||||
|
match%sedlex lexbuf with
|
||||||
|
| Plus hspace -> lex_directive lexbuf
|
||||||
|
| "Begin", Plus hspace, "metadata" -> BEGIN_METADATA
|
||||||
|
| "End", Plus hspace, "metadata" -> END_METADATA
|
||||||
|
| "Include" -> LAW_INCLUDE
|
||||||
|
| ":" ->
|
||||||
|
L.context := Directive_args;
|
||||||
|
COLON
|
||||||
|
| '\n' | eof ->
|
||||||
|
L.context := Law;
|
||||||
|
END_DIRECTIVE
|
||||||
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||||
|
|
||||||
(** Main lexing function used outside code blocks *)
|
(** Main lexing function used outside code blocks *)
|
||||||
let lex_law (lexbuf : lexbuf) : token =
|
let lex_law (lexbuf : lexbuf) : token =
|
||||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||||
let prev_pos = lexing_positions lexbuf in
|
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
|
||||||
let compl_catala =
|
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
|
||||||
[%sedlex.regexp?
|
if at_bol then
|
||||||
( Compl 'c'
|
|
||||||
| 'c', Compl 'a'
|
|
||||||
| "ca", Compl 't'
|
|
||||||
| "cat", Compl 'a'
|
|
||||||
| "cata", Compl 'l'
|
|
||||||
| "catal", Compl 'a'
|
|
||||||
| "catala", Compl (white_space | '\n') )]
|
|
||||||
in
|
|
||||||
match%sedlex lexbuf with
|
match%sedlex lexbuf with
|
||||||
| "```catala" ->
|
|
||||||
L.is_code := true;
|
|
||||||
L.code_string_acc := "";
|
|
||||||
BEGIN_CODE
|
|
||||||
| eof -> EOF
|
| eof -> EOF
|
||||||
| '>', Star white_space, "Begin metadata" -> BEGIN_METADATA
|
| "```catala", Plus white_space ->
|
||||||
| '>', Star white_space, "End metadata" -> END_METADATA
|
L.context := Code;
|
||||||
| ( '>',
|
Buffer.clear L.code_buffer;
|
||||||
Star white_space,
|
BEGIN_CODE
|
||||||
"Include:",
|
| '>' ->
|
||||||
Star white_space,
|
L.context := Directive;
|
||||||
Plus (Compl ('@' | '\n')),
|
BEGIN_DIRECTIVE
|
||||||
Star white_space,
|
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
|
||||||
Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
|
|
||||||
'\n' ) ->
|
|
||||||
let extract_components =
|
|
||||||
R.regexp ">\\s*Include\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
|
|
||||||
in
|
|
||||||
let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
|
|
||||||
let name = get_component 1 in
|
|
||||||
let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
|
|
||||||
let pos = lexing_positions lexbuf in
|
|
||||||
if Filename.extension name = ".pdf" then
|
|
||||||
LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
|
|
||||||
else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
|
|
||||||
| Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
|
|
||||||
L.get_law_heading lexbuf
|
L.get_law_heading lexbuf
|
||||||
| Plus
|
| _ -> (
|
||||||
(* Match non-special characters, i.e. characters that doesn't appear at the start of a
|
(* Nested match for lower priority; `_` matches length 0 *)
|
||||||
previous regexp. *)
|
let lexbuf = lexbuf in
|
||||||
( Compl ('#' | '`' | '>')
|
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
|
||||||
(* Following literals allow to match grave accents as long as they don't conflict with the
|
match%sedlex lexbuf with
|
||||||
[BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
|
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
|
||||||
not be followed by 'catala'. *)
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
|
||||||
| Rep ('`', 1 .. 2), Compl '`'
|
else
|
||||||
| "```", compl_catala ) ->
|
match%sedlex lexbuf with
|
||||||
LAW_TEXT (Utf8.lexeme lexbuf)
|
| eof -> EOF
|
||||||
|
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
|
||||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||||
|
|
||||||
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
|
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
|
||||||
Surface.Lexer_common.is_code}. *)
|
Surface.Lexer_common.is_code}. *)
|
||||||
let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
|
let lexer (lexbuf : lexbuf) : token =
|
||||||
|
match !L.context with
|
||||||
|
| Law -> lex_law lexbuf
|
||||||
|
| Code -> lex_code lexbuf
|
||||||
|
| Directive -> lex_directive lexbuf
|
||||||
|
| Directive_args -> lex_directive_args lexbuf
|
||||||
|
@ -100,6 +100,9 @@ let digit = [%sedlex.regexp? '0' .. '9']
|
|||||||
(** Regexp matching at least one space. *)
|
(** Regexp matching at least one space. *)
|
||||||
let space_plus = [%sedlex.regexp? Plus white_space]
|
let space_plus = [%sedlex.regexp? Plus white_space]
|
||||||
|
|
||||||
|
(** Regexp matching white space but not newlines *)
|
||||||
|
let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
|
||||||
|
|
||||||
(** Main lexing function used in code blocks *)
|
(** Main lexing function used in code blocks *)
|
||||||
let rec lex_code (lexbuf : lexbuf) : token =
|
let rec lex_code (lexbuf : lexbuf) : token =
|
||||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||||
@ -115,8 +118,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
|||||||
lex_code lexbuf
|
lex_code lexbuf
|
||||||
| "```" ->
|
| "```" ->
|
||||||
(* End of code section *)
|
(* End of code section *)
|
||||||
L.is_code := false;
|
L.context := Law;
|
||||||
END_CODE !L.code_string_acc
|
END_CODE (Buffer.contents L.code_buffer)
|
||||||
| "champ", space_plus, "d\'application" ->
|
| "champ", space_plus, "d\'application" ->
|
||||||
L.update_acc lexbuf;
|
L.update_acc lexbuf;
|
||||||
SCOPE
|
SCOPE
|
||||||
@ -503,64 +506,72 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
|||||||
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
|
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
|
||||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||||
|
|
||||||
|
let rec lex_directive_args (lexbuf : lexbuf) : token =
|
||||||
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||||
|
let prev_pos = lexing_positions lexbuf in
|
||||||
|
match%sedlex lexbuf with
|
||||||
|
| '@', Star hspace, "p.", Star hspace, Plus digit ->
|
||||||
|
let s = Utf8.lexeme lexbuf in
|
||||||
|
let i = String.index s '.' in
|
||||||
|
AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
|
||||||
|
| Compl (white_space | '@'), Star (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
|
||||||
|
| Plus hspace -> lex_directive_args lexbuf
|
||||||
|
| '\n' | eof ->
|
||||||
|
L.context := Law;
|
||||||
|
END_DIRECTIVE
|
||||||
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||||
|
|
||||||
|
let rec lex_directive (lexbuf : lexbuf) : token =
|
||||||
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||||
|
let prev_pos = lexing_positions lexbuf in
|
||||||
|
match%sedlex lexbuf with
|
||||||
|
| Plus hspace -> lex_directive lexbuf
|
||||||
|
| 'D', 0xE9, "but", Plus hspace, "m", 0xE9, "tadonn", 0xE9, "es" -> BEGIN_METADATA
|
||||||
|
| "Fin", Plus hspace, "m", 0xE9, "tadonn", 0xE9, "es" -> END_METADATA
|
||||||
|
| "Inclusion" -> LAW_INCLUDE
|
||||||
|
| ':', Star hspace ->
|
||||||
|
L.context := Directive_args;
|
||||||
|
COLON
|
||||||
|
| '\n' | eof ->
|
||||||
|
L.context := Law;
|
||||||
|
END_DIRECTIVE
|
||||||
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||||
|
|
||||||
(** Main lexing function used outside code blocks *)
|
(** Main lexing function used outside code blocks *)
|
||||||
let lex_law (lexbuf : lexbuf) : token =
|
let lex_law (lexbuf : lexbuf) : token =
|
||||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||||
let prev_pos = lexing_positions lexbuf in
|
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
|
||||||
let compl_catala =
|
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
|
||||||
[%sedlex.regexp?
|
if at_bol then
|
||||||
( Compl 'c'
|
|
||||||
| 'c', Compl 'a'
|
|
||||||
| "ca", Compl 't'
|
|
||||||
| "cat", Compl 'a'
|
|
||||||
| "cata", Compl 'l'
|
|
||||||
| "catal", Compl 'a'
|
|
||||||
| "catala", Compl (white_space | '\n') )]
|
|
||||||
in
|
|
||||||
match%sedlex lexbuf with
|
match%sedlex lexbuf with
|
||||||
| "```catala" ->
|
|
||||||
L.is_code := true;
|
|
||||||
L.code_string_acc := "";
|
|
||||||
|
|
||||||
BEGIN_CODE
|
|
||||||
| eof -> EOF
|
| eof -> EOF
|
||||||
| '>', Star white_space, 'D', 0xE9, "but m", 0xE9, "tadonn", 0xE9, "es" -> BEGIN_METADATA
|
| "```catala", Plus white_space ->
|
||||||
| '>', Star white_space, "Fin m", 0xE9, "tadonn", 0xE9, "es" -> END_METADATA
|
L.context := Code;
|
||||||
| ( '>',
|
Buffer.clear L.code_buffer;
|
||||||
Star white_space,
|
BEGIN_CODE
|
||||||
"Inclusion:",
|
| '>', Star hspace ->
|
||||||
Star white_space,
|
L.context := Directive;
|
||||||
Plus (Compl ('@' | '\n')),
|
BEGIN_DIRECTIVE
|
||||||
Star white_space,
|
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
|
||||||
Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
|
|
||||||
'\n' ) ->
|
|
||||||
let extract_components =
|
|
||||||
R.regexp ">\\s*Inclusion\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
|
|
||||||
in
|
|
||||||
let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
|
|
||||||
let jorftext = R.regexp "JORFTEXT\\d{12}" in
|
|
||||||
let name = get_component 1 in
|
|
||||||
let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
|
|
||||||
let pos = lexing_positions lexbuf in
|
|
||||||
if R.pmatch ~rex:jorftext name then
|
|
||||||
LAW_INCLUDE (Ast.LegislativeText (name, Pos.from_lpos pos))
|
|
||||||
else if Filename.extension name = ".pdf" then
|
|
||||||
LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
|
|
||||||
else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
|
|
||||||
| Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
|
|
||||||
L.get_law_heading lexbuf
|
L.get_law_heading lexbuf
|
||||||
| Plus
|
| _ -> (
|
||||||
(* Match non-special characters, i.e. characters that doesn't appear at the start of a
|
(* Nested match for lower priority; `_` matches length 0 *)
|
||||||
previous regexp. *)
|
let lexbuf = lexbuf in
|
||||||
( Compl ('#' | '`' | '>')
|
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
|
||||||
(* Following literals allow to match grave accents as long as they don't conflict with the
|
match%sedlex lexbuf with
|
||||||
[BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
|
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
|
||||||
not be followed by 'catala'. *)
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
|
||||||
| Rep ('`', 1 .. 2), Compl '`'
|
else
|
||||||
| "```", compl_catala ) ->
|
match%sedlex lexbuf with
|
||||||
LAW_TEXT (Utf8.lexeme lexbuf)
|
| eof -> EOF
|
||||||
|
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
|
||||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||||
|
|
||||||
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
|
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
|
||||||
Surface.Lexer_common.is_code}. *)
|
Surface.Lexer_common.is_code}. *)
|
||||||
let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
|
let lexer (lexbuf : lexbuf) : token =
|
||||||
|
match !L.context with
|
||||||
|
| Law -> lex_law lexbuf
|
||||||
|
| Code -> lex_code lexbuf
|
||||||
|
| Directive -> lex_directive lexbuf
|
||||||
|
| Directive_args -> lex_directive_args lexbuf
|
||||||
|
@ -105,6 +105,9 @@ let digit = [%sedlex.regexp? '0' .. '9']
|
|||||||
(** Regexp matching at least one space. *)
|
(** Regexp matching at least one space. *)
|
||||||
let space_plus = [%sedlex.regexp? Plus white_space]
|
let space_plus = [%sedlex.regexp? Plus white_space]
|
||||||
|
|
||||||
|
(** Regexp matching white space but not newlines *)
|
||||||
|
let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
|
||||||
|
|
||||||
(** Main lexing function used in code blocks *)
|
(** Main lexing function used in code blocks *)
|
||||||
let rec lex_code (lexbuf : lexbuf) : token =
|
let rec lex_code (lexbuf : lexbuf) : token =
|
||||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||||
@ -120,8 +123,8 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
|||||||
lex_code lexbuf
|
lex_code lexbuf
|
||||||
| "```" ->
|
| "```" ->
|
||||||
(* End of code section *)
|
(* End of code section *)
|
||||||
L.is_code := false;
|
L.context := Law;
|
||||||
END_CODE !L.code_string_acc
|
END_CODE (Buffer.contents L.code_buffer)
|
||||||
| "zakres" ->
|
| "zakres" ->
|
||||||
L.update_acc lexbuf;
|
L.update_acc lexbuf;
|
||||||
SCOPE
|
SCOPE
|
||||||
@ -507,61 +510,72 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
|||||||
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
|
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
|
||||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||||
|
|
||||||
(** Main lexing function used outside code blocks *)
|
let rec lex_directive_args (lexbuf : lexbuf) : token =
|
||||||
let lex_law (lexbuf : lexbuf) : token =
|
|
||||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||||
let prev_pos = lexing_positions lexbuf in
|
let prev_pos = lexing_positions lexbuf in
|
||||||
let compl_catala =
|
|
||||||
[%sedlex.regexp?
|
|
||||||
( Compl 'c'
|
|
||||||
| 'c', Compl 'a'
|
|
||||||
| "ca", Compl 't'
|
|
||||||
| "cat", Compl 'a'
|
|
||||||
| "cata", Compl 'l'
|
|
||||||
| "catal", Compl 'a'
|
|
||||||
| "catala", Compl (white_space | '\n') )]
|
|
||||||
in
|
|
||||||
match%sedlex lexbuf with
|
match%sedlex lexbuf with
|
||||||
| "```catala" ->
|
| '@', Star hspace, "p.", Star hspace, Plus digit ->
|
||||||
L.is_code := true;
|
let s = Utf8.lexeme lexbuf in
|
||||||
L.code_string_acc := "";
|
let i = String.index s '.' in
|
||||||
|
AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
|
||||||
|
| Compl (white_space | '@'), Star (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
|
||||||
|
| Plus hspace -> lex_directive_args lexbuf
|
||||||
|
| '\n' | eof ->
|
||||||
|
L.context := Law;
|
||||||
|
END_DIRECTIVE
|
||||||
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||||
|
|
||||||
BEGIN_CODE
|
let rec lex_directive (lexbuf : lexbuf) : token =
|
||||||
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||||
|
let prev_pos = lexing_positions lexbuf in
|
||||||
|
match%sedlex lexbuf with
|
||||||
|
| Plus hspace -> lex_directive lexbuf
|
||||||
|
| "Poczatek", Plus hspace, "metadanych" -> BEGIN_METADATA
|
||||||
|
| "Koniec", Plus hspace, "metadanych" -> END_METADATA
|
||||||
|
| "Include", Star hspace -> LAW_INCLUDE
|
||||||
|
| ":" ->
|
||||||
|
L.context := Directive_args;
|
||||||
|
COLON
|
||||||
|
| '\n' | eof ->
|
||||||
|
L.context := Law;
|
||||||
|
END_DIRECTIVE
|
||||||
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||||
|
|
||||||
|
(** Main lexing function used outside code blocks *)
|
||||||
|
and lex_law (lexbuf : lexbuf) : token =
|
||||||
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||||
|
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
|
||||||
|
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
|
||||||
|
if at_bol then
|
||||||
|
match%sedlex lexbuf with
|
||||||
| eof -> EOF
|
| eof -> EOF
|
||||||
| '>', Star white_space, "Poczatek metadanych" -> BEGIN_METADATA
|
| "```catala", Plus white_space ->
|
||||||
| '>', Star white_space, "Koniec metadanych" -> END_METADATA
|
L.context := Code;
|
||||||
| ( '>',
|
Buffer.clear L.code_buffer;
|
||||||
Star white_space,
|
BEGIN_CODE
|
||||||
"Include:",
|
| '>', Star hspace ->
|
||||||
Star white_space,
|
L.context := Directive;
|
||||||
Plus (Compl ('@' | '\n')),
|
BEGIN_DIRECTIVE
|
||||||
Star white_space,
|
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
|
||||||
Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
|
|
||||||
'\n' ) ->
|
|
||||||
let extract_components =
|
|
||||||
R.regexp ">\\s*Include\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
|
|
||||||
in
|
|
||||||
let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
|
|
||||||
let name = get_component 1 in
|
|
||||||
let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
|
|
||||||
let pos = lexing_positions lexbuf in
|
|
||||||
if Filename.extension name = ".pdf" then
|
|
||||||
LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
|
|
||||||
else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
|
|
||||||
| Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
|
|
||||||
L.get_law_heading lexbuf
|
L.get_law_heading lexbuf
|
||||||
| Plus
|
| _ -> (
|
||||||
(* Match non-special characters, i.e. characters that doesn't appear at the start of a
|
(* Nested match for lower priority; `_` matches length 0 *)
|
||||||
previous regexp. *)
|
let lexbuf = lexbuf in
|
||||||
( Compl ('#' | '`' | '>')
|
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
|
||||||
(* Following literals allow to match grave accents as long as they don't conflict with the
|
match%sedlex lexbuf with
|
||||||
[BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
|
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
|
||||||
not be followed by 'catala'. *)
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
|
||||||
| Rep ('`', 1 .. 2), Compl '`'
|
else
|
||||||
| "```", compl_catala ) ->
|
match%sedlex lexbuf with
|
||||||
LAW_TEXT (Utf8.lexeme lexbuf)
|
| eof -> EOF
|
||||||
|
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
|
||||||
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
||||||
|
|
||||||
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
|
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
|
||||||
Surface.Lexer_common.is_code}. *)
|
Surface.Lexer_common.is_code}. *)
|
||||||
let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
|
let lexer (lexbuf : lexbuf) : token =
|
||||||
|
match !L.context with
|
||||||
|
| Law -> lex_law lexbuf
|
||||||
|
| Code -> lex_code lexbuf
|
||||||
|
| Directive -> lex_directive lexbuf
|
||||||
|
| Directive_args -> lex_directive_args lexbuf
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -550,7 +550,7 @@ code:
|
|||||||
| code = list(code_item) { (code, Pos.from_lpos $sloc) }
|
| code = list(code_item) { (code, Pos.from_lpos $sloc) }
|
||||||
|
|
||||||
metadata_block:
|
metadata_block:
|
||||||
| BEGIN_METADATA option(law_text) BEGIN_CODE code_and_pos = code text = END_CODE option(law_text) END_METADATA {
|
| BEGIN_DIRECTIVE BEGIN_METADATA END_DIRECTIVE option(law_text) BEGIN_CODE code_and_pos = code text = END_CODE option(law_text) BEGIN_DIRECTIVE END_METADATA END_DIRECTIVE {
|
||||||
let (code, pos) = code_and_pos in
|
let (code, pos) = code_and_pos in
|
||||||
(code, (text, pos))
|
(code, (text, pos))
|
||||||
}
|
}
|
||||||
@ -567,7 +567,7 @@ law_heading:
|
|||||||
}
|
}
|
||||||
|
|
||||||
law_text:
|
law_text:
|
||||||
| text = LAW_TEXT { String.trim text }
|
| lines = nonempty_list(LAW_TEXT) { String.trim (String.concat "" lines) }
|
||||||
|
|
||||||
source_file_item:
|
source_file_item:
|
||||||
| text = law_text { LawText text }
|
| text = law_text { LawText text }
|
||||||
@ -582,8 +582,16 @@ source_file_item:
|
|||||||
let (code, source_repr) = code in
|
let (code, source_repr) = code in
|
||||||
CodeBlock (code, source_repr, true)
|
CodeBlock (code, source_repr, true)
|
||||||
}
|
}
|
||||||
| includ = LAW_INCLUDE {
|
| BEGIN_DIRECTIVE LAW_INCLUDE COLON args = nonempty_list(DIRECTIVE_ARG) page = option(AT_PAGE) END_DIRECTIVE {
|
||||||
LawInclude includ
|
let filename = String.trim (String.concat "" args) in
|
||||||
|
let pos = Pos.from_lpos $sloc in
|
||||||
|
let jorftext = Re.Pcre.regexp "JORFTEXT\\d{12}" in
|
||||||
|
if Re.Pcre.pmatch ~rex:jorftext filename && page = None then
|
||||||
|
LawInclude (Ast.LegislativeText (filename, pos))
|
||||||
|
else if Filename.extension filename = ".pdf" || page <> None then
|
||||||
|
LawInclude (Ast.PdfFile ((filename, pos), page))
|
||||||
|
else
|
||||||
|
LawInclude (Ast.CatalaFile (filename, pos))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -23,7 +23,11 @@
|
|||||||
|
|
||||||
%token EOF
|
%token EOF
|
||||||
%token<string * string option * string option * int> LAW_HEADING
|
%token<string * string option * string option * int> LAW_HEADING
|
||||||
%token<Ast.law_include> LAW_INCLUDE
|
|
||||||
|
%token BEGIN_DIRECTIVE END_DIRECTIVE LAW_INCLUDE
|
||||||
|
%token<int> AT_PAGE
|
||||||
|
%token<string> DIRECTIVE_ARG
|
||||||
|
|
||||||
%token<string> LAW_TEXT
|
%token<string> LAW_TEXT
|
||||||
%token<string> CONSTRUCTOR IDENT
|
%token<string> CONSTRUCTOR IDENT
|
||||||
%token<string> END_CODE
|
%token<string> END_CODE
|
||||||
|
Loading…
Reference in New Issue
Block a user