Pass the localised builtins as parameters to the parser

Louis Gesbert 2021-04-30 09:59:09 +02:00
parent 6ca3b2f18a
commit 7e9ec54947
10 changed files with 376 additions and 385 deletions
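The pattern at work throughout this diff: builtin names stop being dedicated tokens and become entries in a per-language associative list, which Menhir's %parameter directive threads into the grammar. Because %parameter turns the generated parser into a functor, each language supplies its own table at instantiation time. A minimal sketch of the idea, with hypothetical module names (only Parser.Make and Ast.builtin_expression come from the diff):

    (* Hypothetical localisation module; the real tables live in the lexer_*.ml files below *)
    module EnglishLocalisation = struct
      let builtins : (string * Ast.builtin_expression) list =
        [ ("integer_to_decimal", Ast.IntToDec); ("get_day", Ast.GetDay) ]
    end

    (* %parameter<Localisation : sig ... end> makes Menhir emit a functor,
       so the parser is instantiated once per language *)
    module EnglishParser = Parser.Make (EnglishLocalisation)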

View File

@@ -130,6 +130,10 @@ let token_list : (string * token) list =
]
@ token_list_language_agnostic
(** Localised builtin functions *)
let builtins : (string * Ast.builtin_expression) list =
[ ("int_to_dec", IntToDec); ("get_day", GetDay); ("get_month", GetMonth); ("get_year", GetYear) ]
(** Main lexing function used in a code block *)
let rec lex_code (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
@@ -298,18 +302,6 @@ let rec lex_code (lexbuf : lexbuf) : token =
| "|]" ->
update_acc lexbuf;
CONSEQUENCE
| "int_to_dec" ->
update_acc lexbuf;
INT_TO_DEC
| "get_day" ->
update_acc lexbuf;
GET_DAY
| "get_month" ->
update_acc lexbuf;
GET_MONTH
| "get_year" ->
update_acc lexbuf;
GET_YEAR
| "maximum" ->
update_acc lexbuf;
MAXIMUM
@@ -597,16 +589,19 @@ let lex_law (lexbuf : lexbuf) : token =
let lexer (lexbuf : lexbuf) : token = if !is_code then lex_code lexbuf else lex_law lexbuf
module type LocalisedLexer = sig
val token_list : (string * Parser.token) list
val token_list : (string * Tokens.token) list
(** Same as {!val: token_list_language_agnostic}, but with tokens specialized to a given language. *)
val lex_code : Sedlexing.lexbuf -> Parser.token
val builtins : (string * Ast.builtin_expression) list
(** Associative list mapping strings to their corresponding builtins *)
val lex_code : Sedlexing.lexbuf -> Tokens.token
(** Main lexing function used in code blocks *)
val lex_law : Sedlexing.lexbuf -> Parser.token
val lex_law : Sedlexing.lexbuf -> Tokens.token
(** Main lexing function used outside code blocks *)
val lexer : Sedlexing.lexbuf -> Parser.token
val lexer : Sedlexing.lexbuf -> Tokens.token
(** Entry point of the lexer, dispatches to {!val: lex_code} or {!val: lex_law} depending on
{!val: Surface.Lexer.is_code}. *)
end
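The new builtins value in this signature is what replaces the dedicated INT_TO_DEC / GET_* tokens: builtin names now lex as ordinary identifiers and are resolved by table lookup. A self-contained sketch of that lookup, with stand-in types mirroring Ast.builtin_expression and the grammar's Builtin/Ident constructors:

    (* Stand-ins for Ast.builtin_expression and the expression constructors *)
    type builtin = IntToDec | GetDay | GetMonth | GetYear
    type expr = Builtin of builtin | Ident of string

    let builtins : (string * builtin) list =
      [
        ("integer_to_decimal", IntToDec);
        ("get_day", GetDay);
        ("get_month", GetMonth);
        ("get_year", GetYear);
      ]

    (* An identifier denotes a builtin exactly when it appears in the table *)
    let resolve (name : string) : expr =
      try Builtin (List.assoc name builtins) with Not_found -> Ident name

This is the same lookup the grammar performs below against Localisation.builtins.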

View File

@@ -31,23 +31,25 @@ val update_acc : Sedlexing.lexbuf -> unit
val raise_lexer_error : Utils.Pos.t -> string -> 'a
(** Error-generating helper *)
val token_list_language_agnostic : (string * Parser.token) list
val token_list_language_agnostic : (string * Tokens.token) list
(** Associative list matching each punctuation string that is part of the Catala syntax with its
{!module: Surface.Parser} token. The same across all input languages (English, French, etc.) *)
module type LocalisedLexer = sig
val token_list : (string * Parser.token) list
val token_list : (string * Tokens.token) list
(** Same as {!val: token_list_language_agnostic}, but with tokens whose string varies with the
input language. *)
val builtins : (string * Ast.builtin_expression) list
(** Associative list mapping strings to their corresponding builtins *)
val lex_code : Sedlexing.lexbuf -> Parser.token
val lex_code : Sedlexing.lexbuf -> Tokens.token
(** Main lexing function used in a code block *)
val lex_law : Sedlexing.lexbuf -> Parser.token
val lex_law : Sedlexing.lexbuf -> Tokens.token
(** Main lexing function used outside code blocks *)
val lexer : Sedlexing.lexbuf -> Parser.token
val lexer : Sedlexing.lexbuf -> Tokens.token
(** Entry point of the lexer, dispatches to {!val: lex_code} or {!val: lex_law} depending on
{!val: is_code}. *)
end

View File

@@ -12,7 +12,7 @@
or implied. See the License for the specific language governing permissions and limitations under
the License. *)
open Parser
open Tokens
open Sedlexing
module R = Re.Pcre

View File

@@ -15,5 +15,5 @@
val calc_precedence : string -> int
(** Calculates the precedence according to a matched regex of the form '[#]+' *)
val get_law_heading : Sedlexing.lexbuf -> Parser.token
val get_law_heading : Sedlexing.lexbuf -> Tokens.token
(** Gets the [LAW_HEADING] token from the current [lexbuf] *)

View File

@@ -85,6 +85,15 @@ let token_list : (string * token) list =
]
@ L.token_list_language_agnostic
(** Localised builtin functions *)
let builtins : (string * Ast.builtin_expression) list =
[
("integer_to_decimal", IntToDec);
("get_day", GetDay);
("get_month", GetMonth);
("get_year", GetYear);
]
(** Main lexing function used in code blocks *)
let rec lex_code (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
@@ -253,18 +262,6 @@ let rec lex_code (lexbuf : lexbuf) : token =
| "not" ->
L.update_acc lexbuf;
NOT
| "integer_to_decimal" ->
L.update_acc lexbuf;
INT_TO_DEC
| "get_day" ->
L.update_acc lexbuf;
GET_DAY
| "get_month" ->
L.update_acc lexbuf;
GET_MONTH
| "get_year" ->
L.update_acc lexbuf;
GET_YEAR
| "maximum" ->
L.update_acc lexbuf;
MAXIMUM

View File

@@ -83,6 +83,14 @@ let token_list : (string * token) list =
]
@ L.token_list_language_agnostic
let builtins : (string * Ast.builtin_expression) list =
[
("entier_vers_décimal", Ast.IntToDec);
("accès_jour", Ast.GetDay);
("accès_mois", Ast.GetMonth);
("accès_année", Ast.GetYear);
]
(** Main lexing function used in code blocks *)
let rec lex_code (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
@@ -275,18 +283,6 @@ let rec lex_code (lexbuf : lexbuf) : token =
| "initial" ->
L.update_acc lexbuf;
INIT
| "entier_vers_d", 0xE9, "cimal" ->
L.update_acc lexbuf;
INT_TO_DEC
| "acc", 0xE8, "s_jour" ->
L.update_acc lexbuf;
GET_DAY
| "acc", 0xE8, "s_mois" ->
L.update_acc lexbuf;
GET_MONTH
| "acc", 0xE8, "s_ann", 0xE9, "e" ->
L.update_acc lexbuf;
GET_YEAR
| "vrai" ->
L.update_acc lexbuf;
TRUE
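A side benefit visible in the French lexer: the removed sedlex rules spell accented characters as Unicode code points (0xE8 is 'è', 0xE9 is 'é'), because sedlex patterns operate on code points rather than multi-byte UTF-8 strings. The associative list, by contrast, is compared against the already-decoded lexeme, so the names are written as plain UTF-8 strings. A small illustration (assuming the Ast constructors from the diff):

    (* Before: code points spelled out in the sedlex pattern
         | "acc", 0xE8, "s_jour" -> GET_DAY
       After: a plain UTF-8 string key, no code-point splicing needed *)
    let day_builtin : Ast.builtin_expression =
      List.assoc "accès_jour" [ ("accès_jour", Ast.GetDay) ]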

File diff suppressed because it is too large

View File

@@ -20,6 +20,9 @@
open Utils
%}
%parameter<Localisation: sig
val builtins: (string * Ast.builtin_expression) list
end>
%type <Ast.source_file_or_master> source_file_or_master
@@ -58,7 +61,10 @@ qident:
}
atomic_expression:
| q = ident { let (q, q_pos) = q in (Ident q, q_pos) }
| q = ident {
let (q, q_pos) = q in
(try Builtin (List.assoc q Localisation.builtins) with Not_found -> Ident q),
q_pos }
| l = literal { let (l, l_pos) = l in (Literal l, l_pos) }
| LPAREN e = expression RPAREN { e }
@@ -95,18 +101,6 @@ struct_or_enum_inject:
(Builtin Cardinal, Pos.from_lpos $sloc)
}
| INT_TO_DEC {
(Builtin IntToDec, Pos.from_lpos $sloc)
}
| GET_DAY {
(Builtin GetDay, Pos.from_lpos $sloc)
}
| GET_MONTH {
(Builtin GetMonth, Pos.from_lpos $sloc)
}
| GET_YEAR {
(Builtin GetYear, Pos.from_lpos $sloc)
}
| e = struct_or_enum_inject {
e
}
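This is the grammar-side half of the change: the dedicated builtin rules disappear, and the generic ident rule consults Localisation.builtins, falling back to a plain Ident when the lookup fails. One consequence of %parameter, visible in the driver below, is that the generated Incremental and MenhirInterpreter submodules now live inside the functor's result, so they are only available once Parser.Make has been applied. A sketch (the localisation module is the hypothetical one from above; the entry point comes from the diff):

    module P = Parser.Make (EnglishLocalisation)

    (* Incremental entry points exist only after the functor is applied *)
    let initial_checkpoint (lexbuf : Sedlexing.lexbuf) =
      P.Incremental.source_file_or_master (fst (Sedlexing.lexing_positions lexbuf))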

View File

@@ -16,16 +16,9 @@
open Sedlexing
open Utils
module I = Parser.MenhirInterpreter
(** {1 Internal functions} *)
(** Returns the state number from the Menhir environment *)
let state (env : 'semantic_value I.env) : int =
match Lazy.force (I.stack env) with
| MenhirLib.General.Nil -> 0
| MenhirLib.General.Cons (Element (s, _, _, _), _) -> I.number s
(** Three-way minimum *)
let minimum a b c = min a (min b c)
@@ -113,104 +106,122 @@ let raise_parser_error (error_loc : Pos.t) (last_good_loc : Pos.t option) (token
| None -> []
| Some last_good_loc -> [ (Some "Last good token:", last_good_loc) ]))
module ParserAux (LocalisedLexer : Lexer.LocalisedLexer) = struct
include Parser.Make (LocalisedLexer)
module I = MenhirInterpreter
(** Usage: [fail lexbuf env token_list last_input_needed]
Raises an error with meaningful hints about what the parsing error was. [lexbuf] is the lexing
buffer state at the failure point, [env] is the Menhir environment and [last_input_needed] is
the last checkpoint of a valid Menhir state before the parsing error. [token_list] is provided
by things like {!val: Surface.Lexer.token_list_language_agnostic} and is used to provide
suggestions of the tokens acceptable at the failure point *)
let fail (lexbuf : lexbuf) (env : 'semantic_value I.env) (token_list : (string * Parser.token) list)
(last_input_needed : 'semantic_value I.env option) : 'a =
let wrong_token = Utf8.lexeme lexbuf in
let acceptable_tokens, last_positions =
match last_input_needed with
| Some last_input_needed ->
( List.filter
(fun (_, t) ->
I.acceptable (I.input_needed last_input_needed) t (fst (lexing_positions lexbuf)))
token_list,
Some (I.positions last_input_needed) )
| None -> (token_list, None)
in
let similar_acceptable_tokens =
List.sort
(fun (x, _) (y, _) ->
let truncated_x =
if String.length wrong_token <= String.length x then
String.sub x 0 (String.length wrong_token)
else x
in
let truncated_y =
if String.length wrong_token <= String.length y then
String.sub y 0 (String.length wrong_token)
else y
in
let levx = levenshtein_distance truncated_x wrong_token in
let levy = levenshtein_distance truncated_y wrong_token in
if levx = levy then String.length x - String.length y else levx - levy)
acceptable_tokens
in
let similar_token_msg =
if List.length similar_acceptable_tokens = 0 then None
else
Some
(Printf.sprintf "did you mean %s?"
(String.concat ", or maybe "
(List.map
(fun (ts, _) -> Cli.print_with_style syntax_hints_style "\"%s\"" ts)
similar_acceptable_tokens)))
in
(* The parser has suspended itself because of a syntax error. Stop. *)
let custom_menhir_message =
match Parser_errors.message (state env) with
| exception Not_found ->
"Message: " ^ Cli.print_with_style syntax_hints_style "%s" "unexpected token"
| msg ->
"Message: "
^ Cli.print_with_style syntax_hints_style "%s" (String.trim (String.uncapitalize_ascii msg))
in
let msg =
match similar_token_msg with
| None -> custom_menhir_message
| Some similar_token_msg ->
Printf.sprintf "%s\nAutosuggestion: %s" custom_menhir_message similar_token_msg
in
raise_parser_error
(Pos.from_lpos (lexing_positions lexbuf))
(Option.map Pos.from_lpos last_positions)
(Utf8.lexeme lexbuf) msg
(** Returns the state number from the Menhir environment *)
let state (env : 'semantic_value I.env) : int =
match Lazy.force (I.stack env) with
| MenhirLib.General.Nil -> 0
| MenhirLib.General.Cons (Element (s, _, _, _), _) -> I.number s
(** Main parsing loop *)
let rec loop (next_token : unit -> Parser.token * Lexing.position * Lexing.position)
(token_list : (string * Parser.token) list) (lexbuf : lexbuf)
(last_input_needed : 'semantic_value I.env option) (checkpoint : 'semantic_value I.checkpoint) :
Ast.source_file_or_master =
match checkpoint with
| I.InputNeeded env ->
let token = next_token () in
let checkpoint = I.offer checkpoint token in
loop next_token token_list lexbuf (Some env) checkpoint
| I.Shifting _ | I.AboutToReduce _ ->
let checkpoint = I.resume checkpoint in
loop next_token token_list lexbuf last_input_needed checkpoint
| I.HandlingError env -> fail lexbuf env token_list last_input_needed
| I.Accepted v -> v
| I.Rejected ->
(* Cannot happen as we stop at syntax error immediately *)
assert false
(** Stub that wraps the parsing main loop and handles the Menhir/Sedlex type difference for
[lexbuf]. *)
let sedlex_with_menhir (lexer' : lexbuf -> Parser.token) (token_list : (string * Parser.token) list)
(target_rule : Lexing.position -> 'semantic_value I.checkpoint) (lexbuf : lexbuf) :
Ast.source_file_or_master =
let lexer : unit -> Parser.token * Lexing.position * Lexing.position =
with_tokenizer lexer' lexbuf
in
try loop lexer token_list lexbuf None (target_rule (fst @@ Sedlexing.lexing_positions lexbuf))
with Sedlexing.MalFormed | Sedlexing.InvalidCodepoint _ ->
Lexer.raise_lexer_error (Pos.from_lpos (lexing_positions lexbuf)) (Utf8.lexeme lexbuf)
(** Usage: [fail lexbuf env token_list last_input_needed]
Raises an error with meaningful hints about what the parsing error was. [lexbuf] is the lexing
buffer state at the failure point, [env] is the Menhir environment and [last_input_needed] is
the last checkpoint of a valid Menhir state before the parsing error. [token_list] is provided
by things like {!val: Surface.Lexer.token_list_language_agnostic} and is used to provide
suggestions of the tokens acceptable at the failure point *)
let fail (lexbuf : lexbuf) (env : 'semantic_value I.env)
(token_list : (string * Tokens.token) list) (last_input_needed : 'semantic_value I.env option)
: 'a =
let wrong_token = Utf8.lexeme lexbuf in
let acceptable_tokens, last_positions =
match last_input_needed with
| Some last_input_needed ->
( List.filter
(fun (_, t) ->
I.acceptable (I.input_needed last_input_needed) t (fst (lexing_positions lexbuf)))
token_list,
Some (I.positions last_input_needed) )
| None -> (token_list, None)
in
let similar_acceptable_tokens =
List.sort
(fun (x, _) (y, _) ->
let truncated_x =
if String.length wrong_token <= String.length x then
String.sub x 0 (String.length wrong_token)
else x
in
let truncated_y =
if String.length wrong_token <= String.length y then
String.sub y 0 (String.length wrong_token)
else y
in
let levx = levenshtein_distance truncated_x wrong_token in
let levy = levenshtein_distance truncated_y wrong_token in
if levx = levy then String.length x - String.length y else levx - levy)
acceptable_tokens
in
let similar_token_msg =
if List.length similar_acceptable_tokens = 0 then None
else
Some
(Printf.sprintf "did you mean %s?"
(String.concat ", or maybe "
(List.map
(fun (ts, _) -> Cli.print_with_style syntax_hints_style "\"%s\"" ts)
similar_acceptable_tokens)))
in
(* The parser has suspended itself because of a syntax error. Stop. *)
let custom_menhir_message =
match Parser_errors.message (state env) with
| exception Not_found ->
"Message: " ^ Cli.print_with_style syntax_hints_style "%s" "unexpected token"
| msg ->
"Message: "
^ Cli.print_with_style syntax_hints_style "%s"
(String.trim (String.uncapitalize_ascii msg))
in
let msg =
match similar_token_msg with
| None -> custom_menhir_message
| Some similar_token_msg ->
Printf.sprintf "%s\nAutosuggestion: %s" custom_menhir_message similar_token_msg
in
raise_parser_error
(Pos.from_lpos (lexing_positions lexbuf))
(Option.map Pos.from_lpos last_positions)
(Utf8.lexeme lexbuf) msg
(** Main parsing loop *)
let rec loop (next_token : unit -> Tokens.token * Lexing.position * Lexing.position)
(token_list : (string * Tokens.token) list) (lexbuf : lexbuf)
(last_input_needed : 'semantic_value I.env option) (checkpoint : 'semantic_value I.checkpoint)
: Ast.source_file_or_master =
match checkpoint with
| I.InputNeeded env ->
let token = next_token () in
let checkpoint = I.offer checkpoint token in
loop next_token token_list lexbuf (Some env) checkpoint
| I.Shifting _ | I.AboutToReduce _ ->
let checkpoint = I.resume checkpoint in
loop next_token token_list lexbuf last_input_needed checkpoint
| I.HandlingError env -> fail lexbuf env token_list last_input_needed
| I.Accepted v -> v
| I.Rejected ->
(* Cannot happen as we stop at syntax error immediately *)
assert false
(** Stub that wraps the parsing main loop and handles the Menhir/Sedlex type difference for
[lexbuf]. *)
let sedlex_with_menhir (lexer' : lexbuf -> Tokens.token)
(token_list : (string * Tokens.token) list)
(target_rule : Lexing.position -> 'semantic_value I.checkpoint) (lexbuf : lexbuf) :
Ast.source_file_or_master =
let lexer : unit -> Tokens.token * Lexing.position * Lexing.position =
with_tokenizer lexer' lexbuf
in
try loop lexer token_list lexbuf None (target_rule (fst @@ Sedlexing.lexing_positions lexbuf))
with Sedlexing.MalFormed | Sedlexing.InvalidCodepoint _ ->
Lexer.raise_lexer_error (Pos.from_lpos (lexing_positions lexbuf)) (Utf8.lexeme lexbuf)
let commands_or_includes (lexbuf : lexbuf) : Ast.source_file_or_master =
sedlex_with_menhir LocalisedLexer.lexer LocalisedLexer.token_list
Incremental.source_file_or_master lexbuf
end
(** {1 Parsing multiple files} *)
@@ -235,10 +246,8 @@ let rec parse_source_file (source_file : Pos.input_file) (language : Cli.fronten
Sedlexing.set_filename lexbuf source_file_name;
Parse_utils.current_file := source_file_name;
let module LocalisedLexer = (val List.assoc language localised_lexers) in
let commands_or_includes =
sedlex_with_menhir LocalisedLexer.lexer LocalisedLexer.token_list
Parser.Incremental.source_file_or_master lexbuf
in
let module ConcreteParser = ParserAux (LocalisedLexer) in
let commands_or_includes = ConcreteParser.commands_or_includes lexbuf in
(match input with Some input -> close_in input | None -> ());
match commands_or_includes with
| Ast.SourceFile commands ->

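Net effect on the driver: parse_source_file now just picks the LocalisedLexer for the requested language and applies the ParserAux functor, so adding a new input language needs no parser-side edits. A condensed sketch of the call site, mirroring the code above (Lexer_en stands for any localised lexer module):

    let parse (lexbuf : Sedlexing.lexbuf) : Ast.source_file_or_master =
      let module ConcreteParser = ParserAux (Lexer_en) in
      ConcreteParser.commands_or_includes lexbuf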
View File

@@ -57,8 +57,6 @@
%token UNDER_CONDITION CONSEQUENCE LBRACKET RBRACKET
%token LABEL EXCEPTION LSQUARE RSQUARE SEMICOLON
%token MAXIMUM MINIMUM INIT
%token INT_TO_DEC
%token GET_DAY GET_MONTH GET_YEAR
%token FILTER MAP
%%