Add non-verbose lexing

Nicolas Chataing 2020-08-03 18:07:09 +02:00
parent 15c1df9fa7
commit 84835b0a79
10 changed files with 452 additions and 37 deletions

View File

@@ -21,6 +21,7 @@ depends: [
"cmdliner" {>= "1.0.4"}
"re" {>= "1.9.0"}
"dune" {build}
"ocamlgraph" {>= "1.8.8"}
]
build: [
["dune" "subst"] {pinned}

View File

@@ -27,6 +27,7 @@
(cmdliner (>= 1.0.4))
(re (>= 1.9.0))
(dune (and :build ))
(ocamlgraph (>= 1.8.8))
)
)

View File

@@ -45,9 +45,14 @@ let language =
Arg.(
value
& opt (some string) None
& info [ "l"; "language" ] ~docv:"LANG" ~doc:"Input language among: en, fr (default fr)")
& info [ "l"; "language" ] ~docv:"LANG" ~doc:"Input language among: en, fr, nv (default nv)")
type language_option = Fr | En
type language_option = [ `Fr | `En | `NonVerbose ]
type reduced_lang_option = [ `Fr | `En ]
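(* Weaving backends (LaTeX/HTML) have no dedicated non-verbose strings: [reduce_lang] folds `NonVerbose back onto `En for them. *)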
let reduce_lang (lang : language_option) : reduced_lang_option =
match lang with `En | `NonVerbose -> `En | `Fr -> `Fr
let output =
Arg.(

View File

@@ -25,13 +25,13 @@ let driver (source_file : string) (debug : bool) (wrap_weaved_output : bool)
let language =
match language with
| Some l ->
if l = "fr" then Cli.Fr
else if l = "en" then Cli.En
if l = "fr" then `Fr
else if l = "en" then `En
else begin
Cli.error_print (Printf.sprintf "The selected language (%s) is not supported by Catala" l);
exit 1
end
| None -> Cli.Fr
| None -> `NonVerbose
in
let backend =
if backend = "Makefile" then Cli.Makefile
@@ -68,8 +68,8 @@ let driver (source_file : string) (debug : bool) (wrap_weaved_output : bool)
try
let weaved_output =
match backend with
| Cli.Latex -> Latex.ast_to_latex program language
| Cli.Html -> Html.ast_to_html program pygmentize_loc language
| Cli.Latex -> Latex.ast_to_latex program (Cli.reduce_lang language)
| Cli.Html -> Html.ast_to_html program pygmentize_loc (Cli.reduce_lang language)
| _ -> assert false
in
let output_file =

View File

@@ -24,6 +24,7 @@ let pre_html (s : string) = s
let wrap_html (code : string) (source_files : string list) (custom_pygments : string option)
(language : Cli.language_option) : string =
let language = C.reduce_lang language in
let pygments = match custom_pygments with Some p -> p | None -> "pygmentize" in
let css_file = Filename.temp_file "catala_css_pygments" "" in
let pygments_args = [| "-f"; "html"; "-S"; "colorful"; "-a"; ".catala-code" |] in
@@ -56,15 +57,15 @@ let wrap_html (code : string) (source_files : string list) (custom_pygments : st
%s"
css_as_string
( match language with
| C.Fr -> "Implémentation de texte législatif"
| C.En -> "Legislative text implementation" )
(match language with C.Fr -> "Document généré par" | C.En -> "Document generated by")
| `Fr -> "Implémentation de texte législatif"
| `En -> "Legislative text implementation" )
(match language with `Fr -> "Document généré par" | `En -> "Document generated by")
( match Build_info.V1.version () with
| None -> "n/a"
| Some v -> Build_info.V1.Version.to_string v )
( match language with
| C.Fr -> "Fichiers sources tissés dans ce document"
| C.En -> "Source files weaved in this document" )
| `Fr -> "Fichiers sources tissés dans ce document"
| `En -> "Source files weaved in this document" )
(String.concat "\n"
(List.map
(fun filename ->
@@ -76,14 +77,12 @@ let wrap_html (code : string) (source_files : string list) (custom_pygments : st
in
Printf.sprintf "<li><tt>%s</tt>, %s %s</li>"
(pre_html (Filename.basename filename))
( match language with
| C.Fr -> "dernière modification le"
| C.En -> "last modification" )
(match language with `Fr -> "dernière modification le" | `En -> "last modification")
ftime)
source_files))
code
let pygmentize_code (c : string Pos.marked) (language : C.language_option)
let pygmentize_code (c : string Pos.marked) (language : C.reduced_lang_option)
(custom_pygments : string option) : string =
C.debug_print (Printf.sprintf "Pygmenting the code chunk %s" (Pos.to_string (Pos.get_position c)));
let temp_file_in = Filename.temp_file "catala_html_pygments" "in" in
@@ -92,7 +91,7 @@ let pygmentize_code (c : string Pos.marked) (language : C.language_option)
Printf.fprintf oc "%s" (Pos.unmark c);
close_out oc;
let pygments = match custom_pygments with Some p -> p | None -> "pygmentize" in
let pygments_lexer = match language with C.Fr -> "catala_fr" | C.En -> "catala_en" in
let pygments_lexer = match language with `Fr -> "catala_fr" | `En -> "catala_en" in
let pygments_args =
[|
"-l";
@@ -122,7 +121,7 @@ let pygmentize_code (c : string Pos.marked) (language : C.language_option)
type program_state = InsideArticle | OutsideArticle
let program_item_to_html (i : A.program_item) (custom_pygments : string option)
(language : C.language_option) (state : program_state) : string * program_state =
(language : C.reduced_lang_option) (state : program_state) : string * program_state =
let closing_div =
(* First we terminate the div of the previous article if need be *)
match (i, state) with
@@ -146,7 +145,7 @@ let program_item_to_html (i : A.program_item) (custom_pygments : string option)
P.sprintf
"<div class='article-container'>\n\n<div class='article-title'><a href='%s'>%s</a></div>"
( match (a.law_article_id, language) with
| Some id, C.Fr ->
| Some id, `Fr ->
let ltime = Unix.localtime (Unix.time ()) in
P.sprintf "https://beta.legifrance.gouv.fr/codes/id/%s/%d-%02d-%02d" id
(1900 + ltime.Unix.tm_year) (ltime.Unix.tm_mon + 1) ltime.Unix.tm_mday
@@ -179,7 +178,7 @@ let program_item_to_html (i : A.program_item) (custom_pygments : string option)
(closing_div ^ item_string, new_state)
let ast_to_html (program : A.program) (custom_pygments : string option)
(language : C.language_option) : string =
(language : C.reduced_lang_option) : string =
let i_s, _ =
List.fold_left
(fun (acc, state) i ->

View File

@@ -33,6 +33,7 @@ let pre_latexify (s : string) =
let wrap_latex (code : string) (source_files : string list) (custom_pygments : string option)
(language : Cli.language_option) =
let language = C.reduce_lang language in
Printf.sprintf
"\\documentclass[11pt, a4paper]{article}\n\n\
\\usepackage[T1]{fontenc}\n\
@@ -76,20 +77,20 @@ let wrap_latex (code : string) (source_files : string list) (custom_pygments : s
\\[\\star\\star\\star\\]\\\\\n\
%s\n\n\
\\end{document}"
(match language with C.Fr -> "french" | C.En -> "english")
(match language with `Fr -> "french" | `En -> "english")
( match custom_pygments with
| None -> ""
| Some p -> Printf.sprintf "\\renewcommand{\\MintedPygmentize}{%s}" p )
( match language with
| C.Fr -> "Implémentation de texte législatif"
| C.En -> "Legislative text implementation" )
(match language with C.Fr -> "Document généré par" | C.En -> "Document generated by")
| `Fr -> "Implémentation de texte législatif"
| `En -> "Legislative text implementation" )
(match language with `Fr -> "Document généré par" | `En -> "Document generated by")
( match Build_info.V1.version () with
| None -> "n/a"
| Some v -> Build_info.V1.Version.to_string v )
( match language with
| C.Fr -> "Fichiers sources tissés dans ce document"
| C.En -> "Source files weaved in this document" )
| `Fr -> "Fichiers sources tissés dans ce document"
| `En -> "Source files weaved in this document" )
(String.concat ","
(List.map
(fun filename ->
@@ -101,9 +102,7 @@ let wrap_latex (code : string) (source_files : string list) (custom_pygments : s
in
Printf.sprintf "\\item\\texttt{%s}, %s %s"
(pre_latexify (Filename.basename filename))
( match language with
| C.Fr -> "dernière modification le"
| C.En -> "last modification" )
(match language with `Fr -> "dernière modification le" | `En -> "last modification")
ftime)
source_files))
code
@@ -123,7 +122,7 @@ let math_syms_replace (c : string) : string =
in
R.substitute ~rex:syms ~subst:syms2cmd c
let program_item_to_latex (i : A.program_item) (language : C.language_option) : string =
let program_item_to_latex (i : A.program_item) (language : C.reduced_lang_option) : string =
match i with
| A.LawHeading (title, precedence) ->
P.sprintf "\\%ssection*{%s}"
@@ -138,10 +137,10 @@ let program_item_to_latex (i : A.program_item) (language : C.language_option) :
\\end{minted}"
(pre_latexify (Filename.basename (Pos.get_file (Pos.get_position c))))
(Pos.get_start_line (Pos.get_position c))
(match language with C.Fr -> "catala_fr" | C.En -> "catala_en")
(match language with `Fr -> "catala_fr" | `En -> "catala_en")
(math_syms_replace (Pos.unmark c))
| A.MetadataBlock (_, c) ->
let metadata_title = match language with C.Fr -> "Métadonnées" | C.En -> "Metadata" in
let metadata_title = match language with `Fr -> "Métadonnées" | `En -> "Metadata" in
P.sprintf
"\\begin{tcolorbox}[colframe=OliveGreen, breakable, \
title=\\textcolor{black}{\\texttt{%s}},title after \
@@ -153,7 +152,7 @@ let program_item_to_latex (i : A.program_item) (language : C.language_option) :
metadata_title metadata_title
(Pos.get_start_line (Pos.get_position c))
(pre_latexify (Filename.basename (Pos.get_file (Pos.get_position c))))
(match language with C.Fr -> "catala_fr" | C.En -> "catala_en")
(match language with `Fr -> "catala_fr" | `En -> "catala_en")
(math_syms_replace (Pos.unmark c))
| A.LawInclude (A.PdfFile ((file, _), page)) ->
let label = file ^ match page with None -> "" | Some p -> P.sprintf "_page_%d," p in
@@ -165,5 +164,5 @@ let program_item_to_latex (i : A.program_item) (language : C.language_option) :
file label
| A.LawInclude (A.CatalaFile _ | A.LegislativeText _) -> ""
let ast_to_latex (program : A.program) (language : C.language_option) : string =
let ast_to_latex (program : A.program) (language : C.reduced_lang_option) : string =
String.concat "\n\n" (List.map (fun i -> program_item_to_latex i language) program.program_items)

src/catala/parsing/lexer.ml (new file, 397 lines)
View File

@@ -0,0 +1,397 @@
(* This file is part of the Catala compiler, a specification language for tax and social benefits
computation rules. Copyright (C) 2020 Inria, contributor: Denis Merigoux
<denis.merigoux@inria.fr>
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License
is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
or implied. See the License for the specific language governing permissions and limitations under
the License. *)
open Parser
open Sedlexing
module R = Re.Pcre
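(* The lexer works in two modes: [lex_law] scans the legislative text and [lex_code] scans the contents of "/*" ... "*/" blocks. [code_string_acc] accumulates the raw text of the current code block so that END_CODE can carry it for weaving. *)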
let is_code : bool ref = ref false
let code_string_acc : string ref = ref ""
let update_acc (lexbuf : lexbuf) : unit = code_string_acc := !code_string_acc ^ Utf8.lexeme lexbuf
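(* Keyword table of the non-verbose dialect, handed to the parser driver together with the lexer (see the parser_driver change below). *)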
let token_list : (string * token) list =
[
("scope", SCOPE);
("~", CONSEQUENCE);
("data", DATA);
("depends on", DEPENDS);
("declaration", DECLARATION);
("context", CONTEXT);
("decreasing", DECREASING);
("increasing", INCREASING);
("of", OF);
("collection", COLLECTION);
("enumeration", ENUM);
("integer", INTEGER);
("amount", MONEY);
("text", TEXT);
("decimal", DECIMAL);
("date", DATE);
("boolean", BOOLEAN);
("sum", SUM);
("fulfilled", FILLED);
("def", DEFINITION);
("equals", DEFINED_AS);
("match", MATCH);
("with pattern", WITH);
("?", UNDER_CONDITION);
("if", IF);
("then", THEN);
("else", ELSE);
("content", CONTENT);
("structure", STRUCT);
("optional", OPTIONAL);
("assertion", ASSERTION);
("varies", VARIES);
("with", WITH_V);
("for", FOR);
("all", ALL);
("we have", WE_HAVE);
("fixed", FIXED);
("by", BY);
("rule", RULE);
("exists", EXISTS);
("such", SUCH);
("that", THAT);
("now", NOW);
("and", AND);
("or", OR);
("not", NOT);
("number", CARDINAL);
("year", YEAR);
("true", TRUE);
("false", FALSE);
]
@ Lexer_fr.token_list_language_agnostic
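(* Lexer for code blocks. Every matched lexeme is also appended to [code_string_acc] through [update_acc] so the whole block is echoed back in END_CODE. *)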
let rec lex_code (lexbuf : lexbuf) : token =
match%sedlex lexbuf with
| white_space ->
(* Whitespaces *)
update_acc lexbuf;
lex_code lexbuf
| '#', Star (Compl '\n'), '\n' ->
(* Comments *)
update_acc lexbuf;
lex_code lexbuf
| "*/" ->
(* End of code section *)
is_code := false;
END_CODE !code_string_acc
| "scope" ->
update_acc lexbuf;
SCOPE
| "data" ->
update_acc lexbuf;
DATA
| "depends on" ->
update_acc lexbuf;
DEPENDS
| "declaration" ->
update_acc lexbuf;
DECLARATION
| "context" ->
update_acc lexbuf;
CONTEXT
| "decreasing" ->
update_acc lexbuf;
DECREASING
| "increasing" ->
update_acc lexbuf;
INCREASING
| "of" ->
update_acc lexbuf;
OF
| "collection" ->
update_acc lexbuf;
COLLECTION
| "enumeration" ->
update_acc lexbuf;
ENUM
| "int" ->
update_acc lexbuf;
INTEGER
| "amount" ->
update_acc lexbuf;
MONEY
| "text" ->
update_acc lexbuf;
TEXT
| "dec" ->
update_acc lexbuf;
DECIMAL
| "date" ->
update_acc lexbuf;
DATE
| "bool" ->
update_acc lexbuf;
BOOLEAN
| "sum" ->
update_acc lexbuf;
SUM
| "fulfilled" ->
update_acc lexbuf;
FILLED
| "def" ->
update_acc lexbuf;
DEFINITION
| "=" ->
update_acc lexbuf;
DEFINED_AS
| "match" ->
update_acc lexbuf;
MATCH
| "with pattern" ->
update_acc lexbuf;
WITH
| "?" ->
update_acc lexbuf;
UNDER_CONDITION
| "if" ->
update_acc lexbuf;
IF
| "~" ->
update_acc lexbuf;
CONSEQUENCE
| "then" ->
update_acc lexbuf;
THEN
| "else" ->
update_acc lexbuf;
ELSE
| "condition" ->
update_acc lexbuf;
CONDITION
| "content" ->
update_acc lexbuf;
CONTENT
| "structure" ->
update_acc lexbuf;
STRUCT
| "optional" ->
update_acc lexbuf;
OPTIONAL
| "assertion" ->
update_acc lexbuf;
ASSERTION
| "varies" ->
update_acc lexbuf;
VARIES
| "with" ->
update_acc lexbuf;
WITH_V
| "for" ->
update_acc lexbuf;
FOR
| "all" ->
update_acc lexbuf;
ALL
| "we have" ->
update_acc lexbuf;
WE_HAVE
| "fixed" ->
update_acc lexbuf;
FIXED
| "by" ->
update_acc lexbuf;
BY
| "rule" ->
update_acc lexbuf;
RULE
| "exists" ->
update_acc lexbuf;
EXISTS
| "in" ->
update_acc lexbuf;
IN
| "such" ->
update_acc lexbuf;
SUCH
| "that" ->
update_acc lexbuf;
THAT
| "now" ->
update_acc lexbuf;
NOW
| "and" ->
update_acc lexbuf;
AND
| "or" ->
update_acc lexbuf;
OR
| "not" ->
update_acc lexbuf;
NOT
| "number" ->
update_acc lexbuf;
CARDINAL
| "true" ->
update_acc lexbuf;
TRUE
| "false" ->
update_acc lexbuf;
FALSE
| "year" ->
update_acc lexbuf;
YEAR
| 0x24, Star white_space, '0' .. '9', Star ('0' .. '9' | ','), Opt ('.', Rep ('0' .. '9', 0 .. 2))
->
let extract_parts = R.regexp "([0-9]([0-9,]*[0-9]|))(.([0-9]{0,2})|)" in
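(* e.g. "$ 1,234.56": group 1 captures the units "1,234" (commas stripped below), group 4 the cents "56", giving MONEY_AMOUNT (1234, 56) *)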
let full_str = Utf8.lexeme lexbuf in
let only_numbers_str = String.trim (String.sub full_str 1 (String.length full_str - 1)) in
let parts = R.get_substring (R.exec ~rex:extract_parts only_numbers_str) in
(* Money amount literal: units before the decimal point, cents after *)
let units = parts 1 in
let remove_commas = R.regexp "," in
let units = int_of_string (R.substitute ~rex:remove_commas ~subst:(fun _ -> "") units) in
let cents = try int_of_string (parts 4) with Not_found -> 0 in
update_acc lexbuf;
MONEY_AMOUNT (units, cents)
| Plus '0' .. '9', '.', Star '0' .. '9' ->
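(* e.g. "3.14" yields DECIMAL_LITERAL (3, 14) *)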
let extract_dec_parts = R.regexp "([0-9]+)\\.([0-9]*)" in
let dec_parts = R.get_substring (R.exec ~rex:extract_dec_parts (Utf8.lexeme lexbuf)) in
(* Decimal literal *)
update_acc lexbuf;
DECIMAL_LITERAL (int_of_string (dec_parts 1), int_of_string (dec_parts 2))
| "->" ->
update_acc lexbuf;
ARROW
| '.' ->
update_acc lexbuf;
DOT
| "<=" ->
update_acc lexbuf;
LESSER_EQUAL
| '<' ->
update_acc lexbuf;
LESSER
| ">=" ->
update_acc lexbuf;
GREATER_EQUAL
| '>' ->
update_acc lexbuf;
GREATER
| "!=" ->
update_acc lexbuf;
NOT_EQUAL
| '=' ->
update_acc lexbuf;
EQUAL
| '(' ->
update_acc lexbuf;
LPAREN
| ')' ->
update_acc lexbuf;
RPAREN
| '+' ->
update_acc lexbuf;
PLUS
| '-' ->
update_acc lexbuf;
MINUS
| '*' ->
update_acc lexbuf;
MULT
| '%' ->
update_acc lexbuf;
PERCENT
| '/' ->
update_acc lexbuf;
DIV
| '|' ->
update_acc lexbuf;
VERTICAL
| ':' ->
update_acc lexbuf;
COLON
| "--" ->
update_acc lexbuf;
ALT
| uppercase, Star (uppercase | lowercase | '0' .. '9' | '_' | '\'') ->
(* Name of constructor *)
update_acc lexbuf;
CONSTRUCTOR (Utf8.lexeme lexbuf)
| lowercase, Star (lowercase | uppercase | '0' .. '9' | '_' | '\'') ->
(* Name of variable *)
update_acc lexbuf;
IDENT (Utf8.lexeme lexbuf)
| Plus '0' .. '9' ->
(* Integer literal *)
update_acc lexbuf;
INT_LITERAL (int_of_string (Utf8.lexeme lexbuf))
| _ -> Errors.lexer_error (lexing_positions lexbuf) (Utf8.lexeme lexbuf)
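(* Lexer for the surrounding law text: "@"-delimited markers introduce headings, articles and inclusions; anything else is plain law text. *)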
let rec lex_law (lexbuf : lexbuf) : token =
match%sedlex lexbuf with
| '\n' -> lex_law lexbuf
| "/*" ->
is_code := true;
code_string_acc := "";
BEGIN_CODE
| eof -> EOF
| "@@", Star white_space, "Master file", Star white_space, "@@" -> MASTER_FILE
| "@@", Star white_space, "Begin metadata", Star white_space, "@@" -> BEGIN_METADATA
| "@@", Star white_space, "End metadata", Star white_space, "@@" -> END_METADATA
| ( "@@",
Star white_space,
"Include:",
Star white_space,
Plus (Compl '@'),
Star white_space,
Opt ('@', Star white_space, "p.", Star white_space, Plus '0' .. '9', Star white_space),
"@@" ) ->
let extract_components =
R.regexp "@@\\s*Include\\:\\s*([^@]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)@@"
in
let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
let name = get_component 1 in
let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
let pos = lexing_positions lexbuf in
if Filename.extension name = ".pdf" then LAW_INCLUDE (Ast.PdfFile ((name, pos), pages))
else if Filename.extension name = ".catala" then LAW_INCLUDE (Ast.CatalaFile (name, pos))
else Errors.lexer_error (lexing_positions lexbuf) "this type of file cannot be included"
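(* Law headings: "@@Some title@@" followed by an optional run of '+' whose length gives the heading precedence. *)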
| "@@", Plus (Compl '@'), "@@", Star '+' ->
let extract_code_title = R.regexp "@@([^@]+)@@([\\+]*)" in
let get_match = R.get_substring (R.exec ~rex:extract_code_title (Utf8.lexeme lexbuf)) in
let get_new_lines = R.regexp "\n" in
let new_lines_count =
try Array.length (R.extract ~rex:get_new_lines (Utf8.lexeme lexbuf)) with Not_found -> 0
in
for _i = 1 to new_lines_count do
new_line lexbuf
done;
let law_title = get_match 1 in
let precedence = String.length (get_match 2) in
LAW_HEADING (law_title, precedence)
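(* Law articles: "@Article title@". *)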
| "@", Plus (Compl '@'), "@" ->
let extract_article_title = R.regexp "@([^@]+)@" in
let title = R.get_substring (R.exec ~rex:extract_article_title (Utf8.lexeme lexbuf)) 1 in
let get_new_lines = R.regexp "\n" in
let new_lines_count =
try Array.length (R.extract ~rex:get_new_lines (Utf8.lexeme lexbuf)) with Not_found -> 0
in
(* the -1 compensates for Sedlex's automatic newline detection around the token *)
for _i = 1 to new_lines_count - 1 do
new_line lexbuf
done;
LAW_ARTICLE (title, None, None)
| Plus (Compl ('@' | '/' | '\n')) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> Errors.lexer_error (lexing_positions lexbuf) (Utf8.lexeme lexbuf)
let lexer lexbuf = if !is_code then lex_code lexbuf else lex_law lexbuf

View File

@@ -142,10 +142,16 @@ let rec parse_source_files (source_files : string list) (language : Cli.language
try
Parse_utils.current_file := source_file;
let lexer_lang =
match language with Cli.Fr -> Lexer_fr.lexer_fr | Cli.En -> Lexer_en.lexer_en
match language with
| `Fr -> Lexer_fr.lexer_fr
| `En -> Lexer_en.lexer_en
| `NonVerbose -> Lexer.lexer
in
let token_list_lang =
match language with Cli.Fr -> Lexer_fr.token_list_fr | Cli.En -> Lexer_en.token_list_en
match language with
| `Fr -> Lexer_fr.token_list_fr
| `En -> Lexer_en.token_list_en
| `NonVerbose -> Lexer.token_list
in
let commands_or_includes =
sedlex_with_menhir lexer_lang token_list_lang Parser.Incremental.source_file_or_master

View File

@@ -175,7 +175,7 @@ let driver (file : string) (debug : bool) (client_id : string) (client_secret :
if debug then Catala.Cli.debug_flag := true;
let access_token = Api.get_token client_id client_secret in
(* LegiFrance is only supported for French texts *)
let program = Catala.Parser_driver.parse_source_files [ file ] Catala.Cli.Fr in
let program = Catala.Parser_driver.parse_source_files [ file ] `Fr in
let article_text_acc =
List.fold_left
(fun article_text_acc item ->

tests/non_verbose.catala (new file, 7 lines)
View File

@@ -0,0 +1,7 @@
/*
declaration scope A:
context x content int
scope A:
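# non-verbose keywords: "def" = definition, "?" = under condition, "~" = consequence, "=" = defined as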
def x ? true ~= 42
*/