(* This file is part of the Catala compiler, a specification language for tax
   and social benefits computation rules. Copyright (C) 2020 Inria,
   contributors: Denis Merigoux <denis.merigoux@inria.fr>, Emile Rolley
   <emile.rolley@tuta.io>

   Licensed under the Apache License, Version 2.0 (the "License"); you may not
   use this file except in compliance with the License. You may obtain a copy of
   the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
   WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
   License for the specific language governing permissions and limitations under
   the License. *)

(** Wrapping module around parser and lexer that offers the
    {!: Parser_driver.parse_source_file} API. *)

open Sedlexing
open Catala_utils

(** After parsing, the heading structure is completely flat because of the
    [source_file_item] rule. We need to tree-ify the flat structure by looking
    at the precedence of the law headings. *)
let rec law_struct_list_to_tree (f : Ast.law_structure list) :
    Ast.law_structure list =
  match f with
  | [] -> []
  | [item] -> [item]
  | first_item :: rest -> (
    let rest_tree = law_struct_list_to_tree rest in
    match rest_tree with
    | [] -> assert false (* there should be at least one rest element *)
    | rest_head :: rest_tail -> (
      match first_item with
      | CodeBlock _ | LawText _ | LawInclude _ | ModuleDef _ | ModuleUse _ ->
        (* if an article or an include appears just before a new heading, we
           don't merge it with what comes next *)
        first_item :: rest_head :: rest_tail
      | LawHeading (heading, _) ->
        (* here we have encountered a heading, which is going to "gobble"
           everything in the [rest_tree] until it finds a heading of at least
           the same precedence *)
        let rec split_rest_tree (rest_tree : Ast.law_structure list) :
            Ast.law_structure list * Ast.law_structure list =
          match rest_tree with
          | [] -> [], []
          | LawHeading (new_heading, _) :: _
            when new_heading.law_heading_precedence
                 <= heading.law_heading_precedence ->
            (* we stop gobbling *)
            [], rest_tree
          | first :: after ->
            (* we continue gobbling *)
            let after_gobbled, after_out = split_rest_tree after in
            first :: after_gobbled, after_out
        in
        let gobbled, rest_out = split_rest_tree rest_tree in
        LawHeading (heading, gobbled) :: rest_out))
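
(* Illustrative sketch (the values [h1], [h2], [b1] and [b2] are hypothetical
   stand-ins for heading and code-block payloads): given the flat structure

     [LawHeading (h1, []); CodeBlock b1; LawHeading (h2, []); CodeBlock b2]

   where [h2] has a strictly greater [law_heading_precedence] than [h1] (i.e.
   it is a sub-heading), [law_struct_list_to_tree] nests everything that
   follows [h1] under it:

     [LawHeading (h1, [CodeBlock b1; LawHeading (h2, [CodeBlock b2])])] *)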

(** Usage: [raise_parser_error error_loc last_good_loc token msg]

    Raises an error message featuring the [error_loc] position where the parser
    has failed, the [token] on which the parser has failed, and the error
    message [msg]. If available, displays [last_good_loc], the location of the
    last correctly parsed token. *)
let raise_parser_error
    ?(suggestion : string list option)
    (error_loc : Pos.t)
    (last_good_loc : Pos.t option)
    (token : string)
    (msg : Format.formatter -> unit) : 'a =
  Message.raise_multispanned_error_full ?suggestion
    ((Some (fun ppf -> Format.pp_print_string ppf "Error token:"), error_loc)
    ::
    (match last_good_loc with
    | None -> []
    | Some last_good_loc ->
      [
        ( Some (fun ppf -> Format.pp_print_string ppf "Last good token:"),
          last_good_loc );
      ]))
    "@[<v>Syntax error at token %a@,%t@]"
    (fun ppf string -> Format.fprintf ppf "@{<yellow>\"%s\"@}" string)
    token msg
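
(* Illustrative sketch (hypothetical values): the [msg] argument is a closure
   consuming a formatter, e.g.

     raise_parser_error ~suggestion:["scope"] error_pos (Some last_pos) "scoep"
       (fun ppf -> Format.fprintf ppf "expected a declaration here")

   where [error_pos] and [last_pos] are the [Pos.t] positions of the offending
   token and of the last correctly parsed token. *)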

module ParserAux (LocalisedLexer : Lexer_common.LocalisedLexer) = struct
  include Parser.Make (LocalisedLexer)
  module I = MenhirInterpreter

  (** Returns the state number from the Menhir environment *)
  let state (env : 'semantic_value I.env) : int =
    match Lazy.force (I.stack env) with
    | MenhirLib.General.Nil -> 0
    | MenhirLib.General.Cons (Element (s, _, _, _), _) -> I.number s

  (** Usage: [fail lexbuf env token_list last_input_needed]

      Raises an error with meaningful hints about what the parsing error was.
      [lexbuf] is the lexing buffer state at the failure point, [env] is the
      Menhir environment, and [last_input_needed] is the last checkpoint of a
      valid Menhir state before the parsing error. [token_list] is provided by
      things like {!val: Surface.Lexer_common.token_list_language_agnostic} and
      is used to suggest the tokens acceptable at the failure point. *)
  let fail
      (lexbuf : lexbuf)
      (env : 'semantic_value I.env)
      (token_list : (string * Tokens.token) list)
      (last_input_needed : 'semantic_value I.env option) : 'a =
    let wrong_token = Utf8.lexeme lexbuf in
    let acceptable_tokens, last_positions =
      match last_input_needed with
      | Some last_input_needed ->
        ( List.filter
            (fun (_, t) ->
              I.acceptable
                (I.input_needed last_input_needed)
                t
                (fst (lexing_positions lexbuf)))
            token_list,
          Some (I.positions last_input_needed) )
      | None -> token_list, None
    in
    let similar_acceptable_tokens =
      Suggestions.suggestion_minimum_levenshtein_distance_association
        (List.map (fun (s, _) -> s) acceptable_tokens)
        wrong_token
    in
    (* The parser has suspended itself because of a syntax error. Stop. *)
    let custom_menhir_message ppf =
      (match Parser_errors.message (state env) with
      | exception Not_found ->
        Format.fprintf ppf "Message: @{<yellow>unexpected token@}@,%t"
      | msg ->
        Format.fprintf ppf "Message: @{<yellow>%s@}@,%t"
          (String.trim (String.uncapitalize_ascii msg)))
        (fun (ppf : Format.formatter) ->
          Format.fprintf ppf "You could have written : ";
          Format.pp_print_list
            ~pp_sep:(fun ppf () -> Format.fprintf ppf ",@ or ")
            (fun ppf string -> Format.fprintf ppf "@{<yellow>\"%s\"@}" string)
            ppf
            (List.map (fun (s, _) -> s) acceptable_tokens))
    in
    raise_parser_error ~suggestion:similar_acceptable_tokens
      (Pos.from_lpos (lexing_positions lexbuf))
      (Option.map Pos.from_lpos last_positions)
      (Utf8.lexeme lexbuf) custom_menhir_message

  (** Main parsing loop *)
  let rec loop
      (next_token : unit -> Tokens.token * Lexing.position * Lexing.position)
      (token_list : (string * Tokens.token) list)
      (lexbuf : lexbuf)
      (last_input_needed : 'semantic_value I.env option)
      (checkpoint : 'semantic_value I.checkpoint) : Ast.source_file =
    match checkpoint with
    | I.InputNeeded env ->
      let token = next_token () in
      let checkpoint = I.offer checkpoint token in
      loop next_token token_list lexbuf (Some env) checkpoint
    | I.Shifting _ | I.AboutToReduce _ ->
      let checkpoint = I.resume checkpoint in
      loop next_token token_list lexbuf last_input_needed checkpoint
    | I.HandlingError env -> fail lexbuf env token_list last_input_needed
    | I.Accepted v -> v
    | I.Rejected ->
      (* Cannot happen as we stop at the first syntax error immediately *)
      assert false

  (** Stub that wraps the parsing main loop and handles the Menhir/Sedlex type
      difference for [lexbuf]. *)
  let sedlex_with_menhir
      (lexer' : lexbuf -> Tokens.token)
      (token_list : (string * Tokens.token) list)
      (target_rule : Lexing.position -> 'semantic_value I.checkpoint)
      (lexbuf : lexbuf) : Ast.source_file =
    let lexer : unit -> Tokens.token * Lexing.position * Lexing.position =
      with_tokenizer lexer' lexbuf
    in
    try
      loop lexer token_list lexbuf None
        (target_rule (fst @@ Sedlexing.lexing_positions lexbuf))
    with Sedlexing.MalFormed | Sedlexing.InvalidCodepoint _ ->
      Lexer_common.raise_lexer_error
        (Pos.from_lpos (lexing_positions lexbuf))
        (Utf8.lexeme lexbuf)

  let commands_or_includes (lexbuf : lexbuf) : Ast.source_file =
    sedlex_with_menhir LocalisedLexer.lexer LocalisedLexer.token_list
      Incremental.source_file lexbuf
end

module Parser_En = ParserAux (Lexer_en)
module Parser_Fr = ParserAux (Lexer_fr)
module Parser_Pl = ParserAux (Lexer_pl)

let localised_parser : Cli.backend_lang -> lexbuf -> Ast.source_file = function
  | En -> Parser_En.commands_or_includes
  | Fr -> Parser_Fr.commands_or_includes
  | Pl -> Parser_Pl.commands_or_includes
2021-05-03 18:06:08 +03:00
2023-09-11 17:44:35 +03:00
(* * Lightweight lexer for dependency *)
let lines ( file : File . t ) ( language : Cli . backend_lang ) =
let lex_line = match language with
| En -> Lexer_en . lex_line
| Fr -> Lexer_fr . lex_line
| Pl -> Lexer_pl . lex_line
in
let input = open_in file in
try
let lexbuf = Sedlexing . Utf8 . from_channel input in
Sedlexing . set_filename lexbuf file ;
let rec aux () =
match lex_line lexbuf with
| Some line -> Seq . Cons ( line , aux )
| None -> close_in input ; Seq . Nil
in
aux
with exc ->
let bt = Printexc . get_raw_backtrace () in
close_in input ;
Printexc . raise_with_backtrace exc bt
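
(* Illustrative sketch: [lines] returns a lazy [Seq.t], so callers can scan a
   file's lexed lines without reading the whole file upfront (the element type
   is whatever [lex_line] yields). Hypothetical example:

     let count_lexed_lines file language =
       Seq.fold_left (fun n _line -> n + 1) 0 (lines file language) *)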

(** {1 Parsing multiple files} *)

(** Parses a single source file *)
let rec parse_source_file
    (source_file : Cli.input_file)
    (language : Cli.backend_lang) : Ast.program =
  Message.emit_debug "Parsing %s"
    (match source_file with FileName s | Contents s -> s);
  let lexbuf, input =
    match source_file with
    | FileName source_file -> (
      try
        let input = open_in source_file in
        Sedlexing.Utf8.from_channel input, Some input
      with Sys_error msg -> Message.raise_error "System error: %s" msg)
    | Contents contents -> Sedlexing.Utf8.from_string contents, None
  in
  let source_file_name =
    match source_file with FileName s -> s | Contents _ -> "stdin"
  in
  Sedlexing.set_filename lexbuf source_file_name;
  let commands = localised_parser language lexbuf in
  (match input with Some input -> close_in input | None -> ());
  let program = expand_includes source_file_name commands language in
  {
    program_module_name = program.Ast.program_module_name;
    program_items = program.Ast.program_items;
    program_source_files = source_file_name :: program.Ast.program_source_files;
    program_modules = program.program_modules;
    program_lang = language;
  }
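
(* Illustrative sketch (hypothetical file name): parsing a single file yields a
   flat [Ast.program]; the law heading tree is only rebuilt by
   [parse_top_level_file] below.

     let prg = parse_source_file (FileName "tutorial_en.catala_en") En in
     List.length prg.Ast.program_source_files *)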

(** Expands the include directives in a parsing result, thus parsing new source
    files *)
and expand_includes
    (source_file : string)
    (commands : Ast.law_structure list)
    (language : Cli.backend_lang) : Ast.program =
  let rprg =
    List.fold_left
      (fun acc command ->
        match command with
        | Ast.ModuleDef id -> (
          match acc.Ast.program_module_name with
          | None -> { acc with Ast.program_module_name = Some id }
          | Some id2 ->
            Message.raise_multispanned_error
              [None, Mark.get id; None, Mark.get id2]
              "Multiple definitions of the module name")
        | Ast.ModuleUse (id, _alias) ->
          {
            acc with
            Ast.program_modules = (id, []) :: acc.Ast.program_modules;
            Ast.program_items = command :: acc.Ast.program_items;
          }
        | Ast.LawInclude (Ast.CatalaFile inc_file) ->
          let source_dir = Filename.dirname source_file in
          let sub_source = File.(source_dir / Mark.remove inc_file) in
          let includ_program =
            parse_source_file (FileName sub_source) language
          in
          let () =
            includ_program.Ast.program_module_name
            |> Option.iter
               @@ fun id ->
               Message.raise_multispanned_error
                 [
                   Some "File include", Mark.get inc_file;
                   Some "Module declaration", Mark.get id;
                 ]
                 "A file that declares a module cannot be used through the raw \
                  '@{<yellow>> Include@}' directive. You should use it as a \
                  module with '@{<yellow>> Use %a@}' instead."
                 Uid.Module.format
                 (Uid.Module.of_string id)
          in
          {
            Ast.program_module_name = None;
            Ast.program_source_files =
              List.rev_append includ_program.program_source_files
                acc.Ast.program_source_files;
            Ast.program_items =
              List.rev_append includ_program.program_items
                acc.Ast.program_items;
            Ast.program_modules =
              List.rev_append includ_program.program_modules
                acc.Ast.program_modules;
            Ast.program_lang = language;
          }
        | Ast.LawHeading (heading, commands') ->
          let {
            Ast.program_module_name;
            Ast.program_items = commands';
            Ast.program_source_files = new_sources;
            Ast.program_modules = new_modules;
            Ast.program_lang = _;
          } =
            expand_includes source_file commands' language
          in
          {
            Ast.program_module_name;
            Ast.program_source_files =
              List.rev_append new_sources acc.Ast.program_source_files;
            Ast.program_items =
              Ast.LawHeading (heading, commands') :: acc.Ast.program_items;
            Ast.program_modules =
              List.rev_append new_modules acc.Ast.program_modules;
            Ast.program_lang = language;
          }
        | i -> { acc with Ast.program_items = i :: acc.Ast.program_items })
      {
        Ast.program_module_name = None;
        Ast.program_source_files = [];
        Ast.program_items = [];
        Ast.program_modules = [];
        Ast.program_lang = language;
      }
      commands
  in
  {
    Ast.program_lang = language;
    Ast.program_module_name = rprg.Ast.program_module_name;
    Ast.program_source_files = List.rev rprg.Ast.program_source_files;
    Ast.program_items = List.rev rprg.Ast.program_items;
    Ast.program_modules = List.rev rprg.Ast.program_modules;
  }
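
(* Illustrative sketch (hypothetical file names): if [main.catala_en] contains
   a '> Include: common.catala_en' directive, [expand_includes] drops the
   [Ast.LawInclude] item itself and splices the parsed items of
   [common.catala_en] into the program, while recording the included file in
   [program_source_files]. *)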

(** {2 Handling interfaces} *)

(** Returns the interface of a program: the list of modules it uses, and the
    declarations found in its metadata code blocks (with the bodies of
    top-level definitions stripped). *)
let get_interface program =
  let rec filter (req, acc) = function
    | Ast.LawInclude _ | Ast.LawText _ | Ast.ModuleDef _ -> req, acc
    | Ast.LawHeading (_, str) -> List.fold_left filter (req, acc) str
    | Ast.ModuleUse (m, _) -> m :: req, acc
    | Ast.CodeBlock (code, _, true) ->
      ( req,
        List.fold_left
          (fun acc -> function
            | Ast.ScopeUse _, _ -> acc
            | ((Ast.ScopeDecl _ | StructDecl _ | EnumDecl _), _) as e ->
              e :: acc
            | Ast.Topdef def, m ->
              (Ast.Topdef { def with topdef_expr = None }, m) :: acc)
          acc code )
    | Ast.CodeBlock (_, _, false) ->
      (* Non-metadata blocks are ignored *)
      req, acc
  in
  List.fold_left filter ([], []) program.Ast.program_items

(** {1 API} *)

(** Parses a module and returns its name and interface (declarations only),
    along with the list of modules it uses. Fails if the source does not
    declare a module name. *)
let load_interface source_file language =
  let program = parse_source_file source_file language in
  let modname =
    match program.Ast.program_module_name with
    | Some mname -> mname
    | None ->
      Message.raise_error
        "%s doesn't define a module name. It should contain a '@{<cyan>> Module \
         %s@}' directive."
        (match source_file with
        | FileName s -> "File " ^ s
        | Contents _ -> "Source input")
        (match source_file with
        | FileName s ->
          String.capitalize_ascii Filename.(basename (remove_extension s))
        | Contents _ -> "Module_name")
  in
  let used_modules, intf = get_interface program in
  (modname, intf), used_modules
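
(* Illustrative sketch (hypothetical path): meant for callers that only need a
   module's declarations and dependencies, not its implementation:

     let (module_name, interface), used_modules =
       load_interface (FileName "mymodule.catala_en") En
     in
     ... *)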

(** Parses a top-level file, expanding its include directives and rebuilding
    the tree of law headings from the flat parsed structure. *)
let parse_top_level_file
    (source_file : Cli.input_file)
    (language : Cli.backend_lang) : Ast.program =
  let program = parse_source_file source_file language in
  {
    program with
    Ast.program_items = law_struct_list_to_tree program.Ast.program_items;
  }
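
(* Illustrative sketch (hypothetical file name): the usual entry point for
   obtaining a structured [Ast.program] from a Catala source file:

     let program = parse_top_level_file (FileName "tutorial_en.catala_en") En in
     List.length program.Ast.program_items *)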