(* This file is part of the Catala compiler, a specification language for tax
   and social benefits computation rules. Copyright (C) 2020 Inria,
   contributors: Denis Merigoux <denis.merigoux@inria.fr>, Emile Rolley
   <emile.rolley@tuta.io>

   Licensed under the Apache License, Version 2.0 (the "License"); you may not
   use this file except in compliance with the License. You may obtain a copy of
   the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
   WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
   License for the specific language governing permissions and limitations under
   the License. *)

(** Wrapping module around parser and lexer that offers the {!:
    Parser_driver.parse_source_file} API. *)

open Sedlexing
open Utils

(** {1 Internal functions} *)

(** Three-way minimum *)
let minimum a b c = min a (min b c)

(** Computes the Levenshtein distance between two strings, used to provide
    suggestions in error messages *)
let levenshtein_distance (s : string) (t : string) : int =
  let m = String.length s and n = String.length t in
  (* for all i and j, d.(i).(j) will hold the Levenshtein distance between the
     first i characters of s and the first j characters of t *)
  let d = Array.make_matrix (m + 1) (n + 1) 0 in

  for i = 0 to m do
    d.(i).(0) <- i (* the distance of any first string to an empty second string *)
  done;
  for j = 0 to n do
    d.(0).(j) <- j (* the distance of any second string to an empty first string *)
  done;

  for j = 1 to n do
    for i = 1 to m do
      if s.[i - 1] = t.[j - 1] then d.(i).(j) <- d.(i - 1).(j - 1)
        (* no operation required *)
      else
        d.(i).(j) <-
          minimum
            (d.(i - 1).(j) + 1) (* a deletion *)
            (d.(i).(j - 1) + 1) (* an insertion *)
            (d.(i - 1).(j - 1) + 1) (* a substitution *)
    done
  done;

  d.(m).(n)

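(* Worked example (illustration only, not used by the compiler):
   [levenshtein_distance "kitten" "sitting"] evaluates to [3], via one
   substitution ('k' to 's'), one substitution ('e' to 'i') and one insertion
   (the final 'g'). *)
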
(** After parsing, the heading structure is completely flat because of the
    [source_file_item] rule. We need to tree-i-fy the flat structure by looking
    at the precedence of the law headings. *)
let rec law_struct_list_to_tree (f : Ast.law_structure list) :
    Ast.law_structure list =
  match f with
  | [] -> []
  | [item] -> [item]
  | first_item :: rest -> (
    let rest_tree = law_struct_list_to_tree rest in
    match rest_tree with
    | [] -> assert false (* there should be at least one rest element *)
    | rest_head :: rest_tail -> (
      match first_item with
      | CodeBlock _ | LawText _ | LawInclude _ ->
        (* if an article or an include is just before a new heading, then we
           don't merge it with what comes next *)
        first_item :: rest_head :: rest_tail
      | LawHeading (heading, _) ->
        (* here we have encountered a heading, which is going to "gobble"
           everything in the [rest_tree] until it finds a heading of at least
           the same precedence *)
        let rec split_rest_tree (rest_tree : Ast.law_structure list) :
            Ast.law_structure list * Ast.law_structure list =
          match rest_tree with
          | [] -> [], []
          | LawHeading (new_heading, _) :: _
            when new_heading.law_heading_precedence
                 <= heading.law_heading_precedence ->
            (* we stop gobbling *)
            [], rest_tree
          | first :: after ->
            (* we continue gobbling *)
            let after_gobbled, after_out = split_rest_tree after in
            first :: after_gobbled, after_out
        in
        let gobbled, rest_out = split_rest_tree rest_tree in
        LawHeading (heading, gobbled) :: rest_out))

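(* Illustration (hypothetical input, headings shown with their precedence
   only): the flat list

     [H1 (p=1); LawText "a"; H2 (p=2); LawText "b"; H1' (p=1)]

   is rebuilt as

     [H1 [LawText "a"; H2 [LawText "b"]]; H1' []]

   because H2, having a strictly greater precedence (deeper nesting), is
   gobbled by H1, while H1' has the same precedence as H1 and stops the
   gobbling. *)
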
(** Style with which to display syntax hints in the terminal output *)
let syntax_hints_style = [ANSITerminal.yellow]

(** Usage: [raise_parser_error error_loc last_good_loc token msg]

    Raises an error message featuring the [error_loc] position where the parser
    has failed, the [token] on which the parser has failed, and the error
    message [msg]. If available, displays [last_good_loc], the location of the
    last correctly parsed token. *)
let raise_parser_error
    (error_loc : Pos.t)
    (last_good_loc : Pos.t option)
    (token : string)
    (msg : string) : 'a =
  Errors.raise_multispanned_error
    ((Some "Error token:", error_loc)
    ::
    (match last_good_loc with
    | None -> []
    | Some last_good_loc -> [Some "Last good token:", last_good_loc]))
    "Syntax error at token %a\n%s"
    (Cli.format_with_style syntax_hints_style)
    (Printf.sprintf "\"%s\"" token)
    msg

module ParserAux (LocalisedLexer : Lexer_common.LocalisedLexer) = struct
  include Parser.Make (LocalisedLexer)
  module I = MenhirInterpreter

  (** Returns the state number from the Menhir environment *)
  let state (env : 'semantic_value I.env) : int =
    match Lazy.force (I.stack env) with
    | MenhirLib.General.Nil -> 0
    | MenhirLib.General.Cons (Element (s, _, _, _), _) -> I.number s

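  (* The returned state number is what indexes the handwritten syntax error
     messages looked up via [Parser_errors.message] in [fail] below. *)
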
  (** Usage: [fail lexbuf env token_list last_input_needed]

      Raises an error with meaningful hints about what the parsing error was.
      [lexbuf] is the lexing buffer state at the failure point, [env] is the
      Menhir environment and [last_input_needed] is the last checkpoint of a
      valid Menhir state before the parsing error. [token_list] is provided by
      things like {!val: Surface.Lexer_common.token_list_language_agnostic} and
      is used to suggest which tokens would have been acceptable at the failure
      point. *)
  let fail
      (lexbuf : lexbuf)
      (env : 'semantic_value I.env)
      (token_list : (string * Tokens.token) list)
      (last_input_needed : 'semantic_value I.env option) : 'a =
    let wrong_token = Utf8.lexeme lexbuf in
    let acceptable_tokens, last_positions =
      match last_input_needed with
      | Some last_input_needed ->
        ( List.filter
            (fun (_, t) ->
              I.acceptable
                (I.input_needed last_input_needed)
                t
                (fst (lexing_positions lexbuf)))
            token_list,
          Some (I.positions last_input_needed) )
      | None -> token_list, None
    in
    (* Rank the acceptable tokens by how close they are to the wrong token:
       first by Levenshtein distance of their prefixes (truncated to the wrong
       token's length), then by token length. *)
    let similar_acceptable_tokens =
      List.sort
        (fun (x, _) (y, _) ->
          let truncated_x =
            if String.length wrong_token <= String.length x then
              String.sub x 0 (String.length wrong_token)
            else x
          in
          let truncated_y =
            if String.length wrong_token <= String.length y then
              String.sub y 0 (String.length wrong_token)
            else y
          in
          let levx = levenshtein_distance truncated_x wrong_token in
          let levy = levenshtein_distance truncated_y wrong_token in
          if levx = levy then String.length x - String.length y else levx - levy)
        acceptable_tokens
    in
    let similar_token_msg =
      if List.length similar_acceptable_tokens = 0 then None
      else
        Some
          (Printf.sprintf "did you mean %s?"
             (String.concat ", or maybe "
                (List.map
                   (fun (ts, _) ->
                     Cli.with_style syntax_hints_style "\"%s\"" ts)
                   similar_acceptable_tokens)))
    in
    (* The parser has suspended itself because of a syntax error. Stop. *)
    let custom_menhir_message =
      match Parser_errors.message (state env) with
      | exception Not_found ->
        "Message: " ^ Cli.with_style syntax_hints_style "%s" "unexpected token"
      | msg ->
        "Message: "
        ^ Cli.with_style syntax_hints_style "%s"
            (String.trim (String.uncapitalize_ascii msg))
    in
    let msg =
      match similar_token_msg with
      | None -> custom_menhir_message
      | Some similar_token_msg ->
        Printf.sprintf "%s\nAutosuggestion: %s" custom_menhir_message
          similar_token_msg
    in
    raise_parser_error
      (Pos.from_lpos (lexing_positions lexbuf))
      (Option.map Pos.from_lpos last_positions)
      (Utf8.lexeme lexbuf) msg

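  (* Illustration of the ranking above (hypothetical tokens): if the wrong
     token is "scop" and the acceptable tokens include "scope" and "struct",
     both are truncated to the 4-character prefixes "scop" and "stru", whose
     Levenshtein distances to "scop" are 0 and 3 respectively, so "scope" is
     suggested first. Ties are broken in favor of shorter tokens. *)
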
  (** Main parsing loop *)
  let rec loop
      (next_token : unit -> Tokens.token * Lexing.position * Lexing.position)
      (token_list : (string * Tokens.token) list)
      (lexbuf : lexbuf)
      (last_input_needed : 'semantic_value I.env option)
      (checkpoint : 'semantic_value I.checkpoint) : Ast.source_file =
    match checkpoint with
    | I.InputNeeded env ->
      let token = next_token () in
      let checkpoint = I.offer checkpoint token in
      loop next_token token_list lexbuf (Some env) checkpoint
    | I.Shifting _ | I.AboutToReduce _ ->
      let checkpoint = I.resume checkpoint in
      loop next_token token_list lexbuf last_input_needed checkpoint
    | I.HandlingError env -> fail lexbuf env token_list last_input_needed
    | I.Accepted v -> v
    | I.Rejected ->
      (* Cannot happen as we stop immediately at the first syntax error *)
      assert false

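  (* [loop] follows the standard driver shape of Menhir's incremental API:
     [I.offer] feeds the next token whenever the parser requests input,
     [I.resume] lets the parser perform its shifts and reductions, and
     [I.HandlingError] hands control over to [fail] above. *)
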
  (** Stub that wraps the parsing main loop and handles the Menhir/Sedlex type
      difference for [lexbuf]. *)
  let sedlex_with_menhir
      (lexer' : lexbuf -> Tokens.token)
      (token_list : (string * Tokens.token) list)
      (target_rule : Lexing.position -> 'semantic_value I.checkpoint)
      (lexbuf : lexbuf) : Ast.source_file =
    let lexer : unit -> Tokens.token * Lexing.position * Lexing.position =
      with_tokenizer lexer' lexbuf
    in
    try
      loop lexer token_list lexbuf None
        (target_rule (fst @@ Sedlexing.lexing_positions lexbuf))
    with Sedlexing.MalFormed | Sedlexing.InvalidCodepoint _ ->
      Lexer_common.raise_lexer_error
        (Pos.from_lpos (lexing_positions lexbuf))
        (Utf8.lexeme lexbuf)

  let commands_or_includes (lexbuf : lexbuf) : Ast.source_file =
    sedlex_with_menhir LocalisedLexer.lexer LocalisedLexer.token_list
      Incremental.source_file lexbuf
end

module Parser_En = ParserAux (Lexer_en)
module Parser_Fr = ParserAux (Lexer_fr)
module Parser_Pl = ParserAux (Lexer_pl)

let localised_parser : Cli.backend_lang -> lexbuf -> Ast.source_file = function
  | En -> Parser_En.commands_or_includes
  | Fr -> Parser_Fr.commands_or_includes
  | Pl -> Parser_Pl.commands_or_includes

(** {1 Parsing multiple files} *)

(** Parses a single source file *)
let rec parse_source_file
    (source_file : Pos.input_file)
    (language : Cli.backend_lang) : Ast.program =
  Cli.debug_print "Parsing %s"
    (match source_file with FileName s | Contents s -> s);
  let lexbuf, input =
    match source_file with
    | FileName source_file -> (
      try
        let input = open_in source_file in
        Sedlexing.Utf8.from_channel input, Some input
      with Sys_error msg -> Errors.raise_error "System error: %s" msg)
    | Contents contents -> Sedlexing.Utf8.from_string contents, None
  in
  let source_file_name =
    match source_file with FileName s -> s | Contents _ -> "stdin"
  in
  Sedlexing.set_filename lexbuf source_file_name;
  Parse_utils.current_file := source_file_name;
  let commands = localised_parser language lexbuf in
  (match input with Some input -> close_in input | None -> ());
  let program = expand_includes source_file_name commands language in
  {
    program_items = program.Ast.program_items;
    program_source_files = source_file_name :: program.Ast.program_source_files;
  }

(** Expands the include directives in a parsing result, thus parsing new source
    files *)
and expand_includes
    (source_file : string)
    (commands : Ast.law_structure list)
    (language : Cli.backend_lang) : Ast.program =
  List.fold_left
    (fun acc command ->
      match command with
      | Ast.LawInclude (Ast.CatalaFile sub_source) ->
        let source_dir = Filename.dirname source_file in
        let sub_source =
          Filename.concat source_dir (Marked.unmark sub_source)
        in
        let includ_program = parse_source_file (FileName sub_source) language in
        {
          Ast.program_source_files =
            acc.Ast.program_source_files @ includ_program.program_source_files;
          Ast.program_items =
            acc.Ast.program_items @ includ_program.program_items;
        }
      | Ast.LawHeading (heading, commands') ->
        let {
          Ast.program_items = commands';
          Ast.program_source_files = new_sources;
        } =
          expand_includes source_file commands' language
        in
        {
          Ast.program_source_files = acc.Ast.program_source_files @ new_sources;
          Ast.program_items =
            acc.Ast.program_items @ [Ast.LawHeading (heading, commands')];
        }
      | i -> { acc with Ast.program_items = acc.Ast.program_items @ [i] })
    { Ast.program_source_files = []; Ast.program_items = [] }
    commands

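(* Path resolution example (illustrative file names): if "foo/master.catala_en"
   includes "section.catala_en", the sub-file is resolved as
   "foo/section.catala_en", i.e. relative to the directory of the including
   file. *)
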
(** {1 API} *)

let parse_top_level_file
    (source_file : Pos.input_file)
    (language : Cli.backend_lang) : Ast.program =
  let program = parse_source_file source_file language in
  {
    program with
    Ast.program_items = law_struct_list_to_tree program.Ast.program_items;
  }
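
(* Typical use of this API (illustrative file name; [FileName] is a
   [Pos.input_file] constructor and [En] a [Cli.backend_lang] constructor):

   let program = parse_top_level_file (FileName "tutorial_en.catala_en") En

   Includes are expanded recursively by [expand_includes], and the flat list of
   items is rebuilt into a heading tree by [law_struct_list_to_tree]. *)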