mirror of
https://github.com/CatalaLang/catala.git
synced 2024-09-19 16:28:12 +03:00
Pass the localised builtins as parameters to the parser
This commit is contained in:
parent
6ca3b2f18a
commit
7e9ec54947
@ -130,6 +130,10 @@ let token_list : (string * token) list =
|
||||
]
|
||||
@ token_list_language_agnostic
|
||||
|
||||
(** Localised builtin functions *)
|
||||
let builtins : (string * Ast.builtin_expression) list =
|
||||
[ ("int_to_dec", IntToDec); ("get_day", GetDay); ("get_month", GetMonth); ("get_year", GetYear) ]
|
||||
|
||||
(** Main lexing function used in a code block *)
|
||||
let rec lex_code (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
@ -298,18 +302,6 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
| "|]" ->
|
||||
update_acc lexbuf;
|
||||
CONSEQUENCE
|
||||
| "int_to_dec" ->
|
||||
update_acc lexbuf;
|
||||
INT_TO_DEC
|
||||
| "get_day" ->
|
||||
update_acc lexbuf;
|
||||
GET_DAY
|
||||
| "get_month" ->
|
||||
update_acc lexbuf;
|
||||
GET_MONTH
|
||||
| "get_year" ->
|
||||
update_acc lexbuf;
|
||||
GET_YEAR
|
||||
| "maximum" ->
|
||||
update_acc lexbuf;
|
||||
MAXIMUM
|
||||
@ -597,16 +589,19 @@ let lex_law (lexbuf : lexbuf) : token =
|
||||
let lexer (lexbuf : lexbuf) : token = if !is_code then lex_code lexbuf else lex_law lexbuf
|
||||
|
||||
module type LocalisedLexer = sig
|
||||
val token_list : (string * Parser.token) list
|
||||
val token_list : (string * Tokens.token) list
|
||||
(** Same as {!val: token_list_language_agnostic}, but with tokens specialized to a given language. *)
|
||||
|
||||
val lex_code : Sedlexing.lexbuf -> Parser.token
|
||||
val builtins : (string * Ast.builtin_expression) list
|
||||
(** Associative list of string to their corresponding builtins *)
|
||||
|
||||
val lex_code : Sedlexing.lexbuf -> Tokens.token
|
||||
(** Main lexing function used in code blocks *)
|
||||
|
||||
val lex_law : Sedlexing.lexbuf -> Parser.token
|
||||
val lex_law : Sedlexing.lexbuf -> Tokens.token
|
||||
(** Main lexing function used outside code blocks *)
|
||||
|
||||
val lexer : Sedlexing.lexbuf -> Parser.token
|
||||
val lexer : Sedlexing.lexbuf -> Tokens.token
|
||||
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of
|
||||
{!val: Surface.Lexer.is_code}. *)
|
||||
end
|
||||
|
@ -31,23 +31,25 @@ val update_acc : Sedlexing.lexbuf -> unit
|
||||
val raise_lexer_error : Utils.Pos.t -> string -> 'a
|
||||
(** Error-generating helper *)
|
||||
|
||||
val token_list_language_agnostic : (string * Parser.token) list
|
||||
val token_list_language_agnostic : (string * Tokens.token) list
|
||||
(** Associative list matching each punctuation string part of the Catala syntax with its {!module:
|
||||
Surface.Parser} token. Same for all the input languages (English, French, etc.) *)
|
||||
|
||||
module type LocalisedLexer = sig
|
||||
val token_list : (string * Parser.token) list
|
||||
val token_list : (string * Tokens.token) list
|
||||
(** Same as {!val: token_list_language_agnostic}, but with tokens whose string varies with the
|
||||
input language. *)
|
||||
|
||||
val builtins : (string * Ast.builtin_expression) list
|
||||
(** Associative list of string to their corresponding builtins *)
|
||||
|
||||
val lex_code : Sedlexing.lexbuf -> Parser.token
|
||||
val lex_code : Sedlexing.lexbuf -> Tokens.token
|
||||
(** Main lexing function used in a code block *)
|
||||
|
||||
val lex_law : Sedlexing.lexbuf -> Parser.token
|
||||
val lex_law : Sedlexing.lexbuf -> Tokens.token
|
||||
(** Main lexing function used outside code blocks *)
|
||||
|
||||
val lexer : Sedlexing.lexbuf -> Parser.token
|
||||
val lexer : Sedlexing.lexbuf -> Tokens.token
|
||||
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of
|
||||
{!val: is_code}. *)
|
||||
end
|
||||
|
@ -12,7 +12,7 @@
|
||||
or implied. See the License for the specific language governing permissions and limitations under
|
||||
the License. *)
|
||||
|
||||
open Parser
|
||||
open Tokens
|
||||
open Sedlexing
|
||||
module R = Re.Pcre
|
||||
|
||||
|
@ -15,5 +15,5 @@
|
||||
val calc_precedence : string -> int
|
||||
(** Calculates the precedence according a matched regex of the form : '[#]+' *)
|
||||
|
||||
val get_law_heading : Sedlexing.lexbuf -> Parser.token
|
||||
val get_law_heading : Sedlexing.lexbuf -> Tokens.token
|
||||
(** Gets the [LAW_HEADING] token from the current [lexbuf] *)
|
||||
|
@ -85,6 +85,15 @@ let token_list : (string * token) list =
|
||||
]
|
||||
@ L.token_list_language_agnostic
|
||||
|
||||
(** Localised builtin functions *)
|
||||
let builtins : (string * Ast.builtin_expression) list =
|
||||
[
|
||||
("integer_to_decimal", IntToDec);
|
||||
("get_day", GetDay);
|
||||
("get_month", GetMonth);
|
||||
("get_year", GetYear);
|
||||
]
|
||||
|
||||
(** Main lexing function used in code blocks *)
|
||||
let rec lex_code (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
@ -253,18 +262,6 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
| "not" ->
|
||||
L.update_acc lexbuf;
|
||||
NOT
|
||||
| "integer_to_decimal" ->
|
||||
L.update_acc lexbuf;
|
||||
INT_TO_DEC
|
||||
| "get_day" ->
|
||||
L.update_acc lexbuf;
|
||||
GET_DAY
|
||||
| "get_month" ->
|
||||
L.update_acc lexbuf;
|
||||
GET_MONTH
|
||||
| "get_year" ->
|
||||
L.update_acc lexbuf;
|
||||
GET_YEAR
|
||||
| "maximum" ->
|
||||
L.update_acc lexbuf;
|
||||
MAXIMUM
|
||||
|
@ -83,6 +83,14 @@ let token_list : (string * token) list =
|
||||
]
|
||||
@ L.token_list_language_agnostic
|
||||
|
||||
let builtins : (string * Ast.builtin_expression) list =
|
||||
[
|
||||
("entier_vers_décimal", Ast.IntToDec);
|
||||
("accès_jour", Ast.GetDay);
|
||||
("accès_mois", Ast.GetMonth);
|
||||
("accès_année", Ast.GetYear);
|
||||
]
|
||||
|
||||
(** Main lexing function used in code blocks *)
|
||||
let rec lex_code (lexbuf : lexbuf) : token =
|
||||
let prev_lexeme = Utf8.lexeme lexbuf in
|
||||
@ -275,18 +283,6 @@ let rec lex_code (lexbuf : lexbuf) : token =
|
||||
| "initial" ->
|
||||
L.update_acc lexbuf;
|
||||
INIT
|
||||
| "entier_vers_d", 0xE9, "cimal" ->
|
||||
L.update_acc lexbuf;
|
||||
INT_TO_DEC
|
||||
| "acc", 0xE8, "s_jour" ->
|
||||
L.update_acc lexbuf;
|
||||
GET_DAY
|
||||
| "acc", 0xE8, "s_mois" ->
|
||||
L.update_acc lexbuf;
|
||||
GET_MONTH
|
||||
| "acc", 0xE8, "s_ann", 0xE9, "e" ->
|
||||
L.update_acc lexbuf;
|
||||
GET_YEAR
|
||||
| "vrai" ->
|
||||
L.update_acc lexbuf;
|
||||
TRUE
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -20,6 +20,9 @@
|
||||
open Utils
|
||||
%}
|
||||
|
||||
%parameter<Localisation: sig
|
||||
val builtins: (string * Ast.builtin_expression) list
|
||||
end>
|
||||
|
||||
%type <Ast.source_file_or_master> source_file_or_master
|
||||
|
||||
@ -58,7 +61,10 @@ qident:
|
||||
}
|
||||
|
||||
atomic_expression:
|
||||
| q = ident { let (q, q_pos) = q in (Ident q, q_pos) }
|
||||
| q = ident {
|
||||
let (q, q_pos) = q in
|
||||
(try Builtin (List.assoc q Localisation.builtins) with Not_found -> Ident q),
|
||||
q_pos }
|
||||
| l = literal { let (l, l_pos) = l in (Literal l, l_pos) }
|
||||
| LPAREN e = expression RPAREN { e }
|
||||
|
||||
@ -95,18 +101,6 @@ struct_or_enum_inject:
|
||||
(Builtin Cardinal, Pos.from_lpos $sloc)
|
||||
}
|
||||
|
||||
| INT_TO_DEC {
|
||||
(Builtin IntToDec, Pos.from_lpos $sloc)
|
||||
}
|
||||
| GET_DAY {
|
||||
(Builtin GetDay, Pos.from_lpos $sloc)
|
||||
}
|
||||
| GET_MONTH {
|
||||
(Builtin GetMonth, Pos.from_lpos $sloc)
|
||||
}
|
||||
| GET_YEAR {
|
||||
(Builtin GetYear, Pos.from_lpos $sloc)
|
||||
}
|
||||
| e = struct_or_enum_inject {
|
||||
e
|
||||
}
|
||||
|
@ -16,16 +16,9 @@
|
||||
|
||||
open Sedlexing
|
||||
open Utils
|
||||
module I = Parser.MenhirInterpreter
|
||||
|
||||
(** {1 Internal functions} *)
|
||||
|
||||
(** Returns the state number from the Menhir environment *)
|
||||
let state (env : 'semantic_value I.env) : int =
|
||||
match Lazy.force (I.stack env) with
|
||||
| MenhirLib.General.Nil -> 0
|
||||
| MenhirLib.General.Cons (Element (s, _, _, _), _) -> I.number s
|
||||
|
||||
(** Three-way minimum *)
|
||||
let minimum a b c = min a (min b c)
|
||||
|
||||
@ -113,104 +106,122 @@ let raise_parser_error (error_loc : Pos.t) (last_good_loc : Pos.t option) (token
|
||||
| None -> []
|
||||
| Some last_good_loc -> [ (Some "Last good token:", last_good_loc) ]))
|
||||
|
||||
(** Usage: [fail lexbuf env token_list last_input_needed]
|
||||
module ParserAux (LocalisedLexer : Lexer.LocalisedLexer) = struct
|
||||
include Parser.Make (LocalisedLexer)
|
||||
module I = MenhirInterpreter
|
||||
|
||||
Raises an error with meaningful hints about what the parsing error was. [lexbuf] is the lexing
|
||||
buffer state at the failure point, [env] is the Menhir environment and [last_input_needed] is
|
||||
the last checkpoint of a valid Menhir state before the parsing error. [token_list] is provided
|
||||
by things like {!val: Surface.Lexer.token_list_language_agnostic} and is used to provide
|
||||
suggestions of the tokens acceptable at the failure point *)
|
||||
let fail (lexbuf : lexbuf) (env : 'semantic_value I.env) (token_list : (string * Parser.token) list)
|
||||
(last_input_needed : 'semantic_value I.env option) : 'a =
|
||||
let wrong_token = Utf8.lexeme lexbuf in
|
||||
let acceptable_tokens, last_positions =
|
||||
match last_input_needed with
|
||||
| Some last_input_needed ->
|
||||
( List.filter
|
||||
(fun (_, t) ->
|
||||
I.acceptable (I.input_needed last_input_needed) t (fst (lexing_positions lexbuf)))
|
||||
token_list,
|
||||
Some (I.positions last_input_needed) )
|
||||
| None -> (token_list, None)
|
||||
in
|
||||
let similar_acceptable_tokens =
|
||||
List.sort
|
||||
(fun (x, _) (y, _) ->
|
||||
let truncated_x =
|
||||
if String.length wrong_token <= String.length x then
|
||||
String.sub x 0 (String.length wrong_token)
|
||||
else x
|
||||
in
|
||||
let truncated_y =
|
||||
if String.length wrong_token <= String.length y then
|
||||
String.sub y 0 (String.length wrong_token)
|
||||
else y
|
||||
in
|
||||
let levx = levenshtein_distance truncated_x wrong_token in
|
||||
let levy = levenshtein_distance truncated_y wrong_token in
|
||||
if levx = levy then String.length x - String.length y else levx - levy)
|
||||
acceptable_tokens
|
||||
in
|
||||
let similar_token_msg =
|
||||
if List.length similar_acceptable_tokens = 0 then None
|
||||
else
|
||||
Some
|
||||
(Printf.sprintf "did you mean %s?"
|
||||
(String.concat ", or maybe "
|
||||
(List.map
|
||||
(fun (ts, _) -> Cli.print_with_style syntax_hints_style "\"%s\"" ts)
|
||||
similar_acceptable_tokens)))
|
||||
in
|
||||
(* The parser has suspended itself because of a syntax error. Stop. *)
|
||||
let custom_menhir_message =
|
||||
match Parser_errors.message (state env) with
|
||||
| exception Not_found ->
|
||||
"Message: " ^ Cli.print_with_style syntax_hints_style "%s" "unexpected token"
|
||||
| msg ->
|
||||
"Message: "
|
||||
^ Cli.print_with_style syntax_hints_style "%s" (String.trim (String.uncapitalize_ascii msg))
|
||||
in
|
||||
let msg =
|
||||
match similar_token_msg with
|
||||
| None -> custom_menhir_message
|
||||
| Some similar_token_msg ->
|
||||
Printf.sprintf "%s\nAutosuggestion: %s" custom_menhir_message similar_token_msg
|
||||
in
|
||||
raise_parser_error
|
||||
(Pos.from_lpos (lexing_positions lexbuf))
|
||||
(Option.map Pos.from_lpos last_positions)
|
||||
(Utf8.lexeme lexbuf) msg
|
||||
(** Returns the state number from the Menhir environment *)
|
||||
let state (env : 'semantic_value I.env) : int =
|
||||
match Lazy.force (I.stack env) with
|
||||
| MenhirLib.General.Nil -> 0
|
||||
| MenhirLib.General.Cons (Element (s, _, _, _), _) -> I.number s
|
||||
|
||||
(** Main parsing loop *)
|
||||
let rec loop (next_token : unit -> Parser.token * Lexing.position * Lexing.position)
|
||||
(token_list : (string * Parser.token) list) (lexbuf : lexbuf)
|
||||
(last_input_needed : 'semantic_value I.env option) (checkpoint : 'semantic_value I.checkpoint) :
|
||||
Ast.source_file_or_master =
|
||||
match checkpoint with
|
||||
| I.InputNeeded env ->
|
||||
let token = next_token () in
|
||||
let checkpoint = I.offer checkpoint token in
|
||||
loop next_token token_list lexbuf (Some env) checkpoint
|
||||
| I.Shifting _ | I.AboutToReduce _ ->
|
||||
let checkpoint = I.resume checkpoint in
|
||||
loop next_token token_list lexbuf last_input_needed checkpoint
|
||||
| I.HandlingError env -> fail lexbuf env token_list last_input_needed
|
||||
| I.Accepted v -> v
|
||||
| I.Rejected ->
|
||||
(* Cannot happen as we stop at syntax error immediatly *)
|
||||
assert false
|
||||
(** Usage: [fail lexbuf env token_list last_input_needed]
|
||||
|
||||
(** Stub that wraps the parsing main loop and handles the Menhir/Sedlex type difference for
|
||||
[lexbuf]. *)
|
||||
let sedlex_with_menhir (lexer' : lexbuf -> Parser.token) (token_list : (string * Parser.token) list)
|
||||
(target_rule : Lexing.position -> 'semantic_value I.checkpoint) (lexbuf : lexbuf) :
|
||||
Ast.source_file_or_master =
|
||||
let lexer : unit -> Parser.token * Lexing.position * Lexing.position =
|
||||
with_tokenizer lexer' lexbuf
|
||||
in
|
||||
try loop lexer token_list lexbuf None (target_rule (fst @@ Sedlexing.lexing_positions lexbuf))
|
||||
with Sedlexing.MalFormed | Sedlexing.InvalidCodepoint _ ->
|
||||
Lexer.raise_lexer_error (Pos.from_lpos (lexing_positions lexbuf)) (Utf8.lexeme lexbuf)
|
||||
Raises an error with meaningful hints about what the parsing error was. [lexbuf] is the lexing
|
||||
buffer state at the failure point, [env] is the Menhir environment and [last_input_needed] is
|
||||
the last checkpoint of a valid Menhir state before the parsing error. [token_list] is provided
|
||||
by things like {!val: Surface.Lexer.token_list_language_agnostic} and is used to provide
|
||||
suggestions of the tokens acceptable at the failure point *)
|
||||
let fail (lexbuf : lexbuf) (env : 'semantic_value I.env)
|
||||
(token_list : (string * Tokens.token) list) (last_input_needed : 'semantic_value I.env option)
|
||||
: 'a =
|
||||
let wrong_token = Utf8.lexeme lexbuf in
|
||||
let acceptable_tokens, last_positions =
|
||||
match last_input_needed with
|
||||
| Some last_input_needed ->
|
||||
( List.filter
|
||||
(fun (_, t) ->
|
||||
I.acceptable (I.input_needed last_input_needed) t (fst (lexing_positions lexbuf)))
|
||||
token_list,
|
||||
Some (I.positions last_input_needed) )
|
||||
| None -> (token_list, None)
|
||||
in
|
||||
let similar_acceptable_tokens =
|
||||
List.sort
|
||||
(fun (x, _) (y, _) ->
|
||||
let truncated_x =
|
||||
if String.length wrong_token <= String.length x then
|
||||
String.sub x 0 (String.length wrong_token)
|
||||
else x
|
||||
in
|
||||
let truncated_y =
|
||||
if String.length wrong_token <= String.length y then
|
||||
String.sub y 0 (String.length wrong_token)
|
||||
else y
|
||||
in
|
||||
let levx = levenshtein_distance truncated_x wrong_token in
|
||||
let levy = levenshtein_distance truncated_y wrong_token in
|
||||
if levx = levy then String.length x - String.length y else levx - levy)
|
||||
acceptable_tokens
|
||||
in
|
||||
let similar_token_msg =
|
||||
if List.length similar_acceptable_tokens = 0 then None
|
||||
else
|
||||
Some
|
||||
(Printf.sprintf "did you mean %s?"
|
||||
(String.concat ", or maybe "
|
||||
(List.map
|
||||
(fun (ts, _) -> Cli.print_with_style syntax_hints_style "\"%s\"" ts)
|
||||
similar_acceptable_tokens)))
|
||||
in
|
||||
(* The parser has suspended itself because of a syntax error. Stop. *)
|
||||
let custom_menhir_message =
|
||||
match Parser_errors.message (state env) with
|
||||
| exception Not_found ->
|
||||
"Message: " ^ Cli.print_with_style syntax_hints_style "%s" "unexpected token"
|
||||
| msg ->
|
||||
"Message: "
|
||||
^ Cli.print_with_style syntax_hints_style "%s"
|
||||
(String.trim (String.uncapitalize_ascii msg))
|
||||
in
|
||||
let msg =
|
||||
match similar_token_msg with
|
||||
| None -> custom_menhir_message
|
||||
| Some similar_token_msg ->
|
||||
Printf.sprintf "%s\nAutosuggestion: %s" custom_menhir_message similar_token_msg
|
||||
in
|
||||
raise_parser_error
|
||||
(Pos.from_lpos (lexing_positions lexbuf))
|
||||
(Option.map Pos.from_lpos last_positions)
|
||||
(Utf8.lexeme lexbuf) msg
|
||||
|
||||
(** Main parsing loop *)
|
||||
let rec loop (next_token : unit -> Tokens.token * Lexing.position * Lexing.position)
|
||||
(token_list : (string * Tokens.token) list) (lexbuf : lexbuf)
|
||||
(last_input_needed : 'semantic_value I.env option) (checkpoint : 'semantic_value I.checkpoint)
|
||||
: Ast.source_file_or_master =
|
||||
match checkpoint with
|
||||
| I.InputNeeded env ->
|
||||
let token = next_token () in
|
||||
let checkpoint = I.offer checkpoint token in
|
||||
loop next_token token_list lexbuf (Some env) checkpoint
|
||||
| I.Shifting _ | I.AboutToReduce _ ->
|
||||
let checkpoint = I.resume checkpoint in
|
||||
loop next_token token_list lexbuf last_input_needed checkpoint
|
||||
| I.HandlingError env -> fail lexbuf env token_list last_input_needed
|
||||
| I.Accepted v -> v
|
||||
| I.Rejected ->
|
||||
(* Cannot happen as we stop at syntax error immediatly *)
|
||||
assert false
|
||||
|
||||
(** Stub that wraps the parsing main loop and handles the Menhir/Sedlex type difference for
|
||||
[lexbuf]. *)
|
||||
let sedlex_with_menhir (lexer' : lexbuf -> Tokens.token)
|
||||
(token_list : (string * Tokens.token) list)
|
||||
(target_rule : Lexing.position -> 'semantic_value I.checkpoint) (lexbuf : lexbuf) :
|
||||
Ast.source_file_or_master =
|
||||
let lexer : unit -> Tokens.token * Lexing.position * Lexing.position =
|
||||
with_tokenizer lexer' lexbuf
|
||||
in
|
||||
try loop lexer token_list lexbuf None (target_rule (fst @@ Sedlexing.lexing_positions lexbuf))
|
||||
with Sedlexing.MalFormed | Sedlexing.InvalidCodepoint _ ->
|
||||
Lexer.raise_lexer_error (Pos.from_lpos (lexing_positions lexbuf)) (Utf8.lexeme lexbuf)
|
||||
|
||||
let commands_or_includes (lexbuf : lexbuf) : Ast.source_file_or_master =
|
||||
sedlex_with_menhir LocalisedLexer.lexer LocalisedLexer.token_list
|
||||
Incremental.source_file_or_master lexbuf
|
||||
end
|
||||
|
||||
(** {1 Parsing multiple files}*)
|
||||
|
||||
@ -235,10 +246,8 @@ let rec parse_source_file (source_file : Pos.input_file) (language : Cli.fronten
|
||||
Sedlexing.set_filename lexbuf source_file_name;
|
||||
Parse_utils.current_file := source_file_name;
|
||||
let module LocalisedLexer = (val List.assoc language localised_lexers) in
|
||||
let commands_or_includes =
|
||||
sedlex_with_menhir LocalisedLexer.lexer LocalisedLexer.token_list
|
||||
Parser.Incremental.source_file_or_master lexbuf
|
||||
in
|
||||
let module ConcreteParser = ParserAux (LocalisedLexer) in
|
||||
let commands_or_includes = ConcreteParser.commands_or_includes lexbuf in
|
||||
(match input with Some input -> close_in input | None -> ());
|
||||
match commands_or_includes with
|
||||
| Ast.SourceFile commands ->
|
||||
|
@ -57,8 +57,6 @@
|
||||
%token UNDER_CONDITION CONSEQUENCE LBRACKET RBRACKET
|
||||
%token LABEL EXCEPTION LSQUARE RSQUARE SEMICOLON
|
||||
%token MAXIMUM MINIMUM INIT
|
||||
%token INT_TO_DEC
|
||||
%token GET_DAY GET_MONTH GET_YEAR
|
||||
%token FILTER MAP
|
||||
|
||||
%%
|
||||
|
Loading…
Reference in New Issue
Block a user