From db46b1c44461a4f027ef7b81540bae69355b7cf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartosz=20Wo=C5=BAniak?= Date: Sun, 9 May 2021 22:55:50 +0200 Subject: [PATCH] lexer --- CONTRIBUTING.md | 2 +- INSTALL.md | 3 +- Makefile | 15 +- examples/README.md | 2 +- examples/polish_taxes_pl/.gitignore | 14 + examples/polish_taxes_pl/Makefile | 4 + .../rozdzial_3.catala_pl | 24 + .../polish_taxes_pl/polish_taxes_pl.catala_pl | 3 + .../tests/test_a7_u1_p1.catala_pl | 23 + generate_website_assets.sh | 2 + src/catala/dcalc/print.ml | 3 +- src/catala/driver.ml | 1 + src/catala/lcalc/print.ml | 3 +- src/catala/literate/html.ml | 13 +- src/catala/literate/latex.ml | 19 +- src/catala/surface/dune | 28 +- src/catala/surface/lexer_pl.ml | 550 ++++++++++++++++++ src/catala/surface/lexer_pl.mli | 15 + src/catala/surface/parser_driver.ml | 2 + src/catala/utils/cli.ml | 9 +- src/catala/utils/cli.mli | 4 +- 21 files changed, 709 insertions(+), 30 deletions(-) create mode 100644 examples/polish_taxes_pl/.gitignore create mode 100644 examples/polish_taxes_pl/Makefile create mode 100644 examples/polish_taxes_pl/podatek_od_czynnosci_cywilnoprawnych/rozdzial_3.catala_pl create mode 100644 examples/polish_taxes_pl/polish_taxes_pl.catala_pl create mode 100644 examples/polish_taxes_pl/tests/test_a7_u1_p1.catala_pl create mode 100644 src/catala/surface/lexer_pl.ml create mode 100644 src/catala/surface/lexer_pl.mli diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index be823554..bfe74e16 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -124,7 +124,7 @@ general-to-specifics statutes order. Therefore, there exists multiple versions of the Catala surface syntax, adapted to the language of the legislative text. Currently, Catala supports English and French legislative text via the -`--language=en` or `--language=fr` option. +`--language=en`, `--language=fr` or `--language=pl` option. Technically, support for new languages can be added via a new lexer. If you want to add a new language, you can start from diff --git a/INSTALL.md b/INSTALL.md index f65cb3d7..01bcbede 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -119,7 +119,8 @@ augmented with the Catala plugin, simply enter sudo make pygments This will execute the -script `syntax_highlighting/fr/pygments/set_up_pygments.sh` and +script `syntax_highlighting/fr/pygments/set_up_pygments.sh`, +`syntax_highlighting/pl/pygments/set_up_pygments.sh` and `syntax_highlighting/en/pygments/set_up_pygments.sh`. The scripts patch your `pigmentize` executable, used for instance by the `minted` LaTeX package. diff --git a/Makefile b/Makefile index c430051f..539678cf 100644 --- a/Makefile +++ b/Makefile @@ -58,6 +58,7 @@ install: SYNTAX_HIGHLIGHTING_FR=${CURDIR}/syntax_highlighting/fr SYNTAX_HIGHLIGHTING_EN=${CURDIR}/syntax_highlighting/en +SYNTAX_HIGHLIGHTING_PL=${CURDIR}/syntax_highlighting/pl pygmentize_fr: $(SYNTAX_HIGHLIGHTING_FR)/set_up_pygments.sh chmod +x $< @@ -67,6 +68,10 @@ pygmentize_en: $(SYNTAX_HIGHLIGHTING_EN)/set_up_pygments.sh chmod +x $< sudo $< +pygmentize_pl: $(SYNTAX_HIGHLIGHTING_PL)/set_up_pygments.sh + chmod +x $< + sudo $< + #> pygments : Extends your pygmentize executable with Catala lexers pygments: pygmentize_fr pygmentize_en @@ -78,6 +83,10 @@ atom_en: ${CURDIR}/syntax_highlighting/en/setup_atom.sh chmod +x $< $< +atom_pl: ${CURDIR}/syntax_highlighting/pl/setup_atom.sh + chmod +x $< + $< + atom_nv: ${CURDIR}/syntax_highlighting/nv/setup_atom.sh chmod +x $< $< @@ -93,6 +102,10 @@ vscode_en: ${CURDIR}/syntax_highlighting/en/setup_vscode.sh chmod +x $< $< +vscode_pl: ${CURDIR}/syntax_highlighting/pl/setup_vscode.sh + chmod +x $< + $< + vscode_nv: ${CURDIR}/syntax_highlighting/nv/setup_vscode.sh chmod +x $< $< @@ -222,7 +235,7 @@ clean: $(MAKE) -C $(CODE_GENERAL_IMPOTS_DIR) clean inspect: - gitinspector -f ml,mli,mly,iro,tex,catala,catala_en,catala_fr,md,fst,mld --grading + gitinspector -f ml,mli,mly,iro,tex,catala,catala_en,catala_pl,catala_fr,md,fst,mld --grading ########################################## # Special targets diff --git a/examples/README.md b/examples/README.md index d798a9b7..deb076e5 100644 --- a/examples/README.md +++ b/examples/README.md @@ -99,7 +99,7 @@ To build and run the example, create a `Makefile` in `foo/` with the following contents: ```Makefile -CATALA_LANG=en # or fr if your source code is in French +CATALA_LANG=en # or fr/pl if your source code is in French/Polish SRC=foo.catala include ../Makefile.common.mk diff --git a/examples/polish_taxes_pl/.gitignore b/examples/polish_taxes_pl/.gitignore new file mode 100644 index 00000000..5304e283 --- /dev/null +++ b/examples/polish_taxes_pl/.gitignore @@ -0,0 +1,14 @@ +*.aux +*.dvi +*.fdb_latexmk +*.fls +*.log +*.out +*.fls +*.tex +*.pdf +_minted* +*.toc +*.pyg +*.d +*.ml \ No newline at end of file diff --git a/examples/polish_taxes_pl/Makefile b/examples/polish_taxes_pl/Makefile new file mode 100644 index 00000000..15d94880 --- /dev/null +++ b/examples/polish_taxes_pl/Makefile @@ -0,0 +1,4 @@ +CATALA_LANG=pl +SRC=polish_tax_code.catala_pl + +include ../Makefile.common.mk diff --git a/examples/polish_taxes_pl/podatek_od_czynnosci_cywilnoprawnych/rozdzial_3.catala_pl b/examples/polish_taxes_pl/podatek_od_czynnosci_cywilnoprawnych/rozdzial_3.catala_pl new file mode 100644 index 00000000..5557b883 --- /dev/null +++ b/examples/polish_taxes_pl/podatek_od_czynnosci_cywilnoprawnych/rozdzial_3.catala_pl @@ -0,0 +1,24 @@ +## Rozdzial 3 + +## Art. 7. 1. Stawki podatku wynoszą: + +### 1) od umowy sprzedaży: +#### a) nieruchomości, rzeczy ruchomych, prawa użytkowania wieczystego, własnościowego spółdzielczego prawa do lokalu mieszkalnego, spółdzielczego prawa do lokalu użytkowego oraz wynikających z przepisów prawa spółdzielczego: prawa do domu jednorodzinnego oraz prawa do lokalu w małym domu mieszkalnym – 2%, + +> Begin metadata +```catala +deklaracja zakres UmowaSprzedazy: + kontekst kwota zawartosc pieniadze + kontekst stawka zawartosc dziesietna + kontekst podatek zawartosc pieniadze + +zakres UmowaSprzedazy: + definicja podatek wynosi + kwota *$ stawka + +zakres UmowaSprzedazy: + etykieta a7_u1_p1_ppa + definicja stawka wynosi 2% +``` +> End metadata + diff --git a/examples/polish_taxes_pl/polish_taxes_pl.catala_pl b/examples/polish_taxes_pl/polish_taxes_pl.catala_pl new file mode 100644 index 00000000..5080d52d --- /dev/null +++ b/examples/polish_taxes_pl/polish_taxes_pl.catala_pl @@ -0,0 +1,3 @@ +# Master file + +> Include: podatek_od_czynnosci_cywilnoprawnych/rozdzial_3.catala_pl diff --git a/examples/polish_taxes_pl/tests/test_a7_u1_p1.catala_pl b/examples/polish_taxes_pl/tests/test_a7_u1_p1.catala_pl new file mode 100644 index 00000000..9ceedc24 --- /dev/null +++ b/examples/polish_taxes_pl/tests/test_a7_u1_p1.catala_pl @@ -0,0 +1,23 @@ +> Include: ../podatek_od_czynnosci_cywilnoprawnych/rozdzial_3.catala_en + +## [Test - Art. 7 ustęp 1 punkt 1] + +```catala +deklaracja zakres Test_A7_U1_P1_PPa: + kontekst sprzedaz zakres UmowaSprzedazy + +zakres Test_A7_U1_P1_PPa: + definicja sprzedaz.kwota wynosi $100 + + asercja sprzedaz.podatek = $2 + + +deklaracja zakres Test_A7_U1_P1_PPb: + kontekst sprzedaz zakres UmowaSprzedazy + +zakres Test_A7_U1_P1_PPb: + definicja sprzedaz.kwota wynosi $100 + definicja sprzedaz.inne_prawa_majatkowe wynosi prawda + + asercja sprzedaz.podatek = $1 +``` \ No newline at end of file diff --git a/generate_website_assets.sh b/generate_website_assets.sh index 4f94b3f1..74bb62d5 100755 --- a/generate_website_assets.sh +++ b/generate_website_assets.sh @@ -16,6 +16,7 @@ scp grammar.html $1/ scp catala.html $1/ scp _build/default/src/catala/catala_web.bc.js $1/playground/ scp examples/tutorial_en/tutorial_en.catala_en $1/playground/ +# TODO BW: PL scp examples/tutoriel_fr/tutoriel_fr.catala_fr $1/playground/ cat examples/allocations_familiales/prologue.catala_fr > allocations_familiales.catala_fr cat examples/allocations_familiales/decrets_divers.catala_fr >> allocations_familiales.catala_fr @@ -29,5 +30,6 @@ mv -f allocations_familiales_fixed.catala_fr allocations_familiales.catala_fr scp allocations_familiales.catala_fr $1/playground/ rm allocations_familiales.catala_fr scp syntax_highlighting/en/ace/mode-catala_en.js $1/playground/ +# TODO BW: PL scp syntax_highlighting/fr/ace/mode-catala_fr.js $1/playground/ scp french_law_js/french_law.js $1/french_law.js diff --git a/src/catala/dcalc/print.ml b/src/catala/dcalc/print.ml index 0ad11025..251c123a 100644 --- a/src/catala/dcalc/print.ml +++ b/src/catala/dcalc/print.ml @@ -98,7 +98,8 @@ let format_lit (fmt : Format.formatter) (l : lit Pos.marked) : unit = | LMoney e -> ( match !Utils.Cli.locale_lang with | `En -> Format.fprintf fmt "$%s" (Runtime.money_to_string e) - | `Fr -> Format.fprintf fmt "%s €" (Runtime.money_to_string e)) + | `Fr -> Format.fprintf fmt "%s €" (Runtime.money_to_string e) + | `Pl -> Format.fprintf fmt "%s PLN" (Runtime.money_to_string e)) | LDate d -> Format.fprintf fmt "%s" (Runtime.date_to_string d) | LDuration d -> Format.fprintf fmt "%s" (Runtime.duration_to_string d) diff --git a/src/catala/driver.ml b/src/catala/driver.ml index 4a6ffe00..d651f7e5 100644 --- a/src/catala/driver.ml +++ b/src/catala/driver.ml @@ -35,6 +35,7 @@ let driver (source_file : Pos.input_file) (debug : bool) (dcalc : bool) (unstyle | Some l -> if l = "fr" then `Fr else if l = "en" then `En + else if l = "pl" then `Pl else if l = "non-verbose" then `NonVerbose else Errors.raise_error diff --git a/src/catala/lcalc/print.ml b/src/catala/lcalc/print.ml index 20aad56c..fae59f48 100644 --- a/src/catala/lcalc/print.ml +++ b/src/catala/lcalc/print.ml @@ -38,7 +38,8 @@ let format_lit (fmt : Format.formatter) (l : lit Pos.marked) : unit = | LMoney e -> ( match !Utils.Cli.locale_lang with | `En -> Format.fprintf fmt "$%s" (Runtime.money_to_string e) - | `Fr -> Format.fprintf fmt "%s €" (Runtime.money_to_string e)) + | `Fr -> Format.fprintf fmt "%s €" (Runtime.money_to_string e) + | `Pl -> Format.fprintf fmt "%s PLN" (Runtime.money_to_string e)) | LDate d -> Format.fprintf fmt "%s" (Runtime.date_to_string d) | LDuration d -> Format.fprintf fmt "%s" (Runtime.duration_to_string d) diff --git a/src/catala/literate/html.ml b/src/catala/literate/html.ml index 9d3ff845..08b997cd 100644 --- a/src/catala/literate/html.ml +++ b/src/catala/literate/html.ml @@ -71,12 +71,14 @@ let wrap_html (source_files : string list) (language : Cli.backend_lang) (fmt : css_as_string (match language with | `Fr -> "Implémentation de texte législatif" - | `En -> "Legislative text implementation") - (match language with `Fr -> "Document généré par" | `En -> "Document generated by") + | `En -> "Legislative text implementation" + | `Pl -> "Implementowanie tekstów legislacyjnych") + (match language with `Fr -> "Document généré par" | `En -> "Document generated by" | `Pl -> "Dokument wygenerowany przez") Utils.Cli.version (match language with | `Fr -> "Fichiers sources tissés dans ce document" - | `En -> "Source files weaved in this document") + | `En -> "Source files weaved in this document" + | `Pl -> "Pliki źródłowe w tym dokumencie") (String.concat "\n" (List.map (fun filename -> @@ -90,7 +92,8 @@ let wrap_html (source_files : string list) (language : Cli.backend_lang) (fmt : (pre_html (Filename.basename filename)) (match language with | `Fr -> "dernière modification le" - | `En -> "last modification") + | `En -> "last modification" + | `Pl -> "ostatnia modyfikacja") ftime) source_files)); wrapped fmt @@ -104,7 +107,7 @@ let pygmentize_code (c : string Pos.marked) (language : C.backend_lang) : string Printf.fprintf oc "%s" (Pos.unmark c); close_out oc; let pygments = "pygmentize" in - let pygments_lexer = match language with `Fr -> "catala_fr" | `En -> "catala_en" in + let pygments_lexer = match language with `Fr -> "catala_fr" | `En -> "catala_en" | `Pl -> "catala_pl" in let pygments_args = [| "-l"; diff --git a/src/catala/literate/latex.ml b/src/catala/literate/latex.ml index cf4fdff3..56a1baa8 100644 --- a/src/catala/literate/latex.ml +++ b/src/catala/literate/latex.ml @@ -80,15 +80,17 @@ let wrap_latex (source_files : string list) (language : C.backend_lang) (fmt : F %s : \n\ \\begin{itemize}%s\\end{itemize}\n\n\ \\[\\star\\star\\star\\]\\\\\n" - (match language with `Fr -> "french" | `En -> "english") + (match language with `Fr -> "french" | `En -> "english" | `Pl -> "polish") (match language with | `Fr -> "Implémentation de texte législatif" - | `En -> "Legislative text implementation") - (match language with `Fr -> "Document généré par" | `En -> "Document generated by") + | `En -> "Legislative text implementation" + | `Pl -> "Implementacja tekstów legislacyjnych") + (match language with `Fr -> "Document généré par" | `En -> "Document generated by" | `Pl -> "Dokument wygenerowany przez") Utils.Cli.version (match language with | `Fr -> "Fichiers sources tissés dans ce document" - | `En -> "Source files weaved in this document") + | `En -> "Source files weaved in this document" + | `Pl -> "Pliki źródłowe w tym dokumencie") (String.concat "," (List.map (fun filename -> @@ -102,7 +104,8 @@ let wrap_latex (source_files : string list) (language : C.backend_lang) (fmt : F (pre_latexify (Filename.basename filename)) (match language with | `Fr -> "dernière modification le" - | `En -> "last modification") + | `En -> "last modification" + | `Pl -> "ostatnia modyfikacja") ftime) source_files)); wrapped fmt; @@ -137,7 +140,7 @@ let law_article_item_to_latex (language : C.backend_lang) (fmt : Format.formatte \\end{minted}" (pre_latexify (Filename.basename (Pos.get_file (Pos.get_position c)))) (Pos.get_start_line (Pos.get_position c) - 1) - (match language with `Fr -> "catala_fr" | `En -> "catala_en") + (match language with `Fr -> "catala_fr" | `En -> "catala_en" | `Pl -> "catala_pl") (math_syms_replace (Pos.unmark c)) let rec law_structure_to_latex (language : C.backend_lang) (fmt : Format.formatter) @@ -171,7 +174,7 @@ let rec law_structure_to_latex (language : C.backend_lang) (fmt : Format.formatt (law_article_item_to_latex language) fmt children | A.MetadataBlock (_, c) -> - let metadata_title = match language with `Fr -> "Métadonnées" | `En -> "Metadata" in + let metadata_title = match language with `Fr -> "Métadonnées" | `En -> "Metadata" | `Pl -> "Metadane" in Format.fprintf fmt "\\begin{tcolorbox}[colframe=OliveGreen, breakable, \ title=\\textcolor{black}{\\texttt{%s}},title after \ @@ -183,7 +186,7 @@ let rec law_structure_to_latex (language : C.backend_lang) (fmt : Format.formatt metadata_title metadata_title (Pos.get_start_line (Pos.get_position c) - 1) (pre_latexify (Filename.basename (Pos.get_file (Pos.get_position c)))) - (match language with `Fr -> "catala_fr" | `En -> "catala_en") + (match language with `Fr -> "catala_fr" | `En -> "catala_en" | `Pl -> "catala_pl") (math_syms_replace (Pos.unmark c)) | A.IntermediateText t -> Format.fprintf fmt "%s" (pre_latexify t) diff --git a/src/catala/surface/dune b/src/catala/surface/dune index 520ce6b9..87b438a1 100644 --- a/src/catala/surface/dune +++ b/src/catala/surface/dune @@ -32,19 +32,37 @@ (rule (with-stdout-to parser.messages.new - (run menhir %{dep:tokens.mly} %{dep:parser.mly} --base parser --list-errors))) + (run + menhir + %{dep:tokens.mly} + %{dep:parser.mly} + --base + parser + --list-errors))) (rule (with-stdout-to parser_errors.ml - (run menhir %{dep:tokens.mly} %{dep:parser.mly} --base parser - --compile-errors %{dep:parser.messages}))) + (run + menhir + %{dep:tokens.mly} + %{dep:parser.mly} + --base + parser + --compile-errors + %{dep:parser.messages}))) (rule (with-stdout-to parser.messages.updated - (run menhir %{dep:tokens.mly} %{dep:parser.mly} --base parser - --update-errors %{dep:parser.messages}))) + (run + menhir + %{dep:tokens.mly} + %{dep:parser.mly} + --base + parser + --update-errors + %{dep:parser.messages}))) (rule (alias update-parser-messages) diff --git a/src/catala/surface/lexer_pl.ml b/src/catala/surface/lexer_pl.ml new file mode 100644 index 00000000..9448c7f5 --- /dev/null +++ b/src/catala/surface/lexer_pl.ml @@ -0,0 +1,550 @@ +(* This file is part of the Catala compiler, a specification language for tax and social benefits + computation rules. Copyright (C) 2020 Inria, contributor: Denis Merigoux + + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except + in compliance with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License + is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + or implied. See the License for the specific language governing permissions and limitations under + the License. *) + +open Tokens +open Sedlexing +open Utils +open Lexer_common +module L = Lexer +module R = Re.Pcre + +(** Same as {!val: Surface.Lexer.token_list_language_agnostic}, but with tokens specialized to + Polish. *) +let token_list : (string * token) list = + [ + ("zakres", SCOPE); + ("konsekwencja", CONSEQUENCE); + ("data", DATA); + ("zalezy od", DEPENDS); + ("deklaracja", DECLARATION); + ("kontekst", CONTEXT); + ("malejacy", DECREASING); + ("rosnacy", INCREASING); + ("z", OF); + ("kolekcja", COLLECTION); + ("enumeracja", ENUM); + ("calkowita", INTEGER); + ("pieniadze", MONEY); + ("tekst", TEXT); + ("dziesietna", DECIMAL); + ("czas", DATE); + ("czas trwania", DURATION); + ("zerojedynkowy", BOOLEAN); + ("suma", SUM); + ("spelnione", FILLED); + ("definicja", DEFINITION); + ("etykieta", LABEL); + ("wyjatek", EXCEPTION); + ("wynosi", DEFINED_AS); + ("pasuje", MATCH); + ("ze wzorem", WITH); + ("pod warunkiem", UNDER_CONDITION); + ("jezeli", IF); + ("wtedy", THEN); + ("inaczej", ELSE); + ("zawartosc", CONTENT); + ("struktura", STRUCT); + ("asercja", ASSERTION); + ("rozna", VARIES); + ("with", WITH_V); + ("dla", FOR); + ("wszystkie", ALL); + ("mamy", WE_HAVE); + ("staloprzecinkowa", FIXED); + ("przez", BY); + ("zasada", RULE); + ("istnieje", EXISTS); + ("takie ze", SUCH); + ("to", THAT); + ("i", AND); + ("lub", OR); + ("xor", XOR); + ("nie", NOT); + ("maximum", MAXIMUM); + ("minimum", MINIMUM); + ("filtr", FILTER); + ("mapuj", MAP); + ("poczatkowy", INIT); + ("liczba", CARDINAL); + ("rok", YEAR); + ("miesiac", MONTH); + ("dzien", DAY); + ("prawda", TRUE); + ("falsz", FALSE); + ] + @ L.token_list_language_agnostic + +(** Localised builtin functions *) +let builtins : (string * Ast.builtin_expression) list = + [ + ("integer_to_decimal", IntToDec); + ("get_day", GetDay); + ("get_month", GetMonth); + ("get_year", GetYear); + ] + +(** Main lexing function used in code blocks *) +let rec lex_code (lexbuf : lexbuf) : token = + let prev_lexeme = Utf8.lexeme lexbuf in + let prev_pos = lexing_positions lexbuf in + match%sedlex lexbuf with + | white_space -> + (* Whitespaces *) + L.update_acc lexbuf; + lex_code lexbuf + | '#', Star (Compl '\n'), '\n' -> + (* Comments *) + L.update_acc lexbuf; + lex_code lexbuf + | "```" -> + (* End of code section *) + L.is_code := false; + END_CODE !L.code_string_acc + | "zakres" -> + L.update_acc lexbuf; + SCOPE + | "data" -> + L.update_acc lexbuf; + DATA + | "zalezy od" -> + L.update_acc lexbuf; + DEPENDS + | "deklaracja" -> + L.update_acc lexbuf; + DECLARATION + | "kontekst" -> + L.update_acc lexbuf; + CONTEXT + | "malejacy" -> + L.update_acc lexbuf; + DECREASING + | "rosnacy" -> + L.update_acc lexbuf; + INCREASING + | "z" -> + L.update_acc lexbuf; + OF + | "kolekcja" -> + L.update_acc lexbuf; + COLLECTION + | "enumeracja" -> + L.update_acc lexbuf; + ENUM + | "calkowita" -> + L.update_acc lexbuf; + INTEGER + | "pieniadze" -> + L.update_acc lexbuf; + MONEY + | "tekst" -> + L.update_acc lexbuf; + TEXT + | "dziesietna" -> + L.update_acc lexbuf; + DECIMAL + | "czas" -> + L.update_acc lexbuf; + DATE + | "czas trwania" -> + L.update_acc lexbuf; + DURATION + | "zerojedynkowy" -> + L.update_acc lexbuf; + BOOLEAN + | "suma" -> + L.update_acc lexbuf; + SUM + | "spelnione" -> + L.update_acc lexbuf; + FILLED + | "definicja" -> + L.update_acc lexbuf; + DEFINITION + | "etykieta" -> + L.update_acc lexbuf; + LABEL + | "wyjatek" -> + L.update_acc lexbuf; + EXCEPTION + | "wynosi" -> + L.update_acc lexbuf; + DEFINED_AS + | "pasuje" -> + L.update_acc lexbuf; + MATCH + | "ze wzorem" -> + L.update_acc lexbuf; + WITH + | "pod warunkiem" -> + L.update_acc lexbuf; + UNDER_CONDITION + | "jezeli" -> + L.update_acc lexbuf; + IF + | "konsekwencja" -> + L.update_acc lexbuf; + CONSEQUENCE + | "wtedy" -> + L.update_acc lexbuf; + THEN + | "inaczej" -> + L.update_acc lexbuf; + ELSE + | "condition" -> + L.update_acc lexbuf; + CONDITION + | "zawartosc" -> + L.update_acc lexbuf; + CONTENT + | "struktura" -> + L.update_acc lexbuf; + STRUCT + | "asercja" -> + L.update_acc lexbuf; + ASSERTION + | "rozna" -> + L.update_acc lexbuf; + VARIES + | "wraz z" -> + L.update_acc lexbuf; + WITH_V + | "dla" -> + L.update_acc lexbuf; + FOR + | "wszystkie" -> + L.update_acc lexbuf; + ALL + | "mamy" -> + L.update_acc lexbuf; + WE_HAVE + | "staloprzecinkowa" -> + L.update_acc lexbuf; + FIXED + | "przez" -> + L.update_acc lexbuf; + BY + | "zasada" -> + (* 0xE8 is è *) + L.update_acc lexbuf; + RULE + | "istnieje" -> + L.update_acc lexbuf; + EXISTS + | "in" -> + L.update_acc lexbuf; + IN + | "takie ze" -> + L.update_acc lexbuf; + SUCH + | "to" -> + L.update_acc lexbuf; + THAT + | "i" -> + L.update_acc lexbuf; + AND + | "lub" -> + L.update_acc lexbuf; + OR + | "xor" -> + L.update_acc lexbuf; + XOR + | "nie" -> + L.update_acc lexbuf; + NOT + | "maximum" -> + L.update_acc lexbuf; + MAXIMUM + | "minimum" -> + L.update_acc lexbuf; + MINIMUM + | "filtr" -> + L.update_acc lexbuf; + FILTER + | "mapuj" -> + L.update_acc lexbuf; + MAP + | "poczatkowy" -> + L.update_acc lexbuf; + INIT + | "liczba" -> + L.update_acc lexbuf; + CARDINAL + | "prawda" -> + L.update_acc lexbuf; + TRUE + | "falsz" -> + L.update_acc lexbuf; + FALSE + | "rok" -> + L.update_acc lexbuf; + YEAR + | "miesiac" -> + L.update_acc lexbuf; + MONTH + | "dzien" -> + L.update_acc lexbuf; + DAY + | 0x24, Star white_space, '0' .. '9', Star ('0' .. '9' | ','), Opt ('.', Rep ('0' .. '9', 0 .. 2)) + -> + let extract_parts = R.regexp "([0-9]([0-9,]*[0-9]|))(.([0-9]{0,2})|)" in + let full_str = Utf8.lexeme lexbuf in + let only_numbers_str = String.trim (String.sub full_str 1 (String.length full_str - 1)) in + let parts = R.get_substring (R.exec ~rex:extract_parts only_numbers_str) in + (* Integer literal*) + let units = parts 1 in + let remove_commas = R.regexp "," in + let units = + Runtime.integer_of_string (R.substitute ~rex:remove_commas ~subst:(fun _ -> "") units) + in + let cents = + try Runtime.integer_of_string (parts 4) with Not_found -> Runtime.integer_of_int 0 + in + L.update_acc lexbuf; + MONEY_AMOUNT (units, cents) + | Plus '0' .. '9', '.', Star '0' .. '9' -> + let extract_code_title = R.regexp "([0-9]+)\\.([0-9]*)" in + let dec_parts = R.get_substring (R.exec ~rex:extract_code_title (Utf8.lexeme lexbuf)) in + (* Integer literal*) + L.update_acc lexbuf; + DECIMAL_LITERAL + (Runtime.integer_of_string (dec_parts 1), Runtime.integer_of_string (dec_parts 2)) + | "<=@" -> + L.update_acc lexbuf; + LESSER_EQUAL_DATE + | "<@" -> + L.update_acc lexbuf; + LESSER_DATE + | ">=@" -> + L.update_acc lexbuf; + GREATER_EQUAL_DATE + | ">@" -> + L.update_acc lexbuf; + GREATER_DATE + | "-@" -> + L.update_acc lexbuf; + MINUSDATE + | "+@" -> + L.update_acc lexbuf; + PLUSDATE + | "<=^" -> + L.update_acc lexbuf; + LESSER_EQUAL_DURATION + | "<^" -> + L.update_acc lexbuf; + LESSER_DURATION + | ">=^" -> + L.update_acc lexbuf; + GREATER_EQUAL_DURATION + | ">^" -> + L.update_acc lexbuf; + GREATER_DURATION + | "+^" -> + L.update_acc lexbuf; + PLUSDURATION + | "-^" -> + L.update_acc lexbuf; + MINUSDURATION + | "<=", 0x24 -> + L.update_acc lexbuf; + LESSER_EQUAL_MONEY + | '<', 0x24 -> + L.update_acc lexbuf; + LESSER_MONEY + | ">=", 0x24 -> + L.update_acc lexbuf; + GREATER_EQUAL_MONEY + | '>', 0x24 -> + L.update_acc lexbuf; + GREATER_MONEY + | '+', 0x24 -> + L.update_acc lexbuf; + PLUSMONEY + | '-', 0x24 -> + L.update_acc lexbuf; + MINUSMONEY + | '*', 0x24 -> + L.update_acc lexbuf; + MULTMONEY + | '/', 0x24 -> + L.update_acc lexbuf; + DIVMONEY + | "<=." -> + L.update_acc lexbuf; + LESSER_EQUAL_DEC + | "<." -> + L.update_acc lexbuf; + LESSER_DEC + | ">=." -> + L.update_acc lexbuf; + GREATER_EQUAL_DEC + | ">." -> + L.update_acc lexbuf; + GREATER_DEC + | "+." -> + L.update_acc lexbuf; + PLUSDEC + | "-." -> + L.update_acc lexbuf; + MINUSDEC + | "*." -> + L.update_acc lexbuf; + MULTDEC + | "/." -> + L.update_acc lexbuf; + DIVDEC + | "<=" -> + L.update_acc lexbuf; + LESSER_EQUAL + | '<' -> + L.update_acc lexbuf; + LESSER + | ">=" -> + L.update_acc lexbuf; + GREATER_EQUAL + | '>' -> + L.update_acc lexbuf; + GREATER + | '+' -> + L.update_acc lexbuf; + PLUS + | '-' -> + L.update_acc lexbuf; + MINUS + | '*' -> + L.update_acc lexbuf; + MULT + | '/' -> + L.update_acc lexbuf; + DIV + | "!=" -> + L.update_acc lexbuf; + NOT_EQUAL + | '=' -> + L.update_acc lexbuf; + EQUAL + | '%' -> + L.update_acc lexbuf; + PERCENT + | '(' -> + L.update_acc lexbuf; + LPAREN + | ')' -> + L.update_acc lexbuf; + RPAREN + | '{' -> + L.update_acc lexbuf; + LBRACKET + | '}' -> + L.update_acc lexbuf; + RBRACKET + | '[' -> + L.update_acc lexbuf; + LSQUARE + | ']' -> + L.update_acc lexbuf; + RSQUARE + | '|' -> + L.update_acc lexbuf; + VERTICAL + | ':' -> + L.update_acc lexbuf; + COLON + | ';' -> + L.update_acc lexbuf; + SEMICOLON + | "--" -> + L.update_acc lexbuf; + ALT + | '.' -> + L.update_acc lexbuf; + DOT + | uppercase, Star (uppercase | lowercase | '0' .. '9' | '_' | '\'') -> + (* Name of constructor *) + L.update_acc lexbuf; + CONSTRUCTOR (Utf8.lexeme lexbuf) + | lowercase, Star (lowercase | uppercase | '0' .. '9' | '_' | '\'') -> + (* Name of variable *) + L.update_acc lexbuf; + IDENT (Utf8.lexeme lexbuf) + | Plus '0' .. '9' -> + (* Integer literal*) + L.update_acc lexbuf; + INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf)) + | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme + +(** Main lexing function used outside code blocks *) +let lex_law (lexbuf : lexbuf) : token = + let prev_lexeme = Utf8.lexeme lexbuf in + let prev_pos = lexing_positions lexbuf in + match%sedlex lexbuf with + | "```catala" -> + L.is_code := true; + L.code_string_acc := ""; + + BEGIN_CODE + | eof -> EOF + | '#', Star white_space, "Master file" -> MASTER_FILE + | '>', Star white_space, "Begin metadata" -> BEGIN_METADATA + | '>', Star white_space, "End metadata" -> END_METADATA + | ( '>', + Star white_space, + "Include:", + Star white_space, + Plus (Compl ('@' | '\n')), + Star white_space, + Opt ('@', Star white_space, "p.", Star white_space, Plus '0' .. '9', Star white_space), + '\n' ) -> + let extract_components = + R.regexp ">\\s*Include\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)" + in + let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in + let name = get_component 1 in + let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in + let pos = lexing_positions lexbuf in + if Filename.extension name = ".pdf" then + LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages)) + else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos)) + | '#', Plus '#', Star white_space, Plus (Compl ('[' | ']' | '\n')), Star white_space, '\n' -> + get_law_heading lexbuf + | ( '#', + Plus '#', + Star white_space, + '[', + Star white_space, + Plus (Compl ']'), + Star white_space, + ']', + '\n' ) -> + let extract_article_title = R.regexp "([#]+)\\s*\\[([^\\]]+)\\]" in + let get_substring = + R.get_substring (R.exec ~rex:extract_article_title (Utf8.lexeme lexbuf)) + in + let title = get_substring 2 in + let get_new_lines = R.regexp "\n" in + let new_lines_count = + try Array.length (R.extract ~rex:get_new_lines (Utf8.lexeme lexbuf)) with Not_found -> 0 + in + (* the -1 is here to compensate for Sedlex's automatic newline detection around token *) + for _i = 1 to new_lines_count - 1 do + new_line lexbuf + done; + let precedence = calc_precedence (get_substring 1) in + + LAW_ARTICLE (title, None, None, precedence) + | Plus (Compl ('/' | '#' | '`' | '>')) -> LAW_TEXT (Utf8.lexeme lexbuf) + | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme + +(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val: + Surface.Lexer.is_code}. *) +let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf diff --git a/src/catala/surface/lexer_pl.mli b/src/catala/surface/lexer_pl.mli new file mode 100644 index 00000000..97e67565 --- /dev/null +++ b/src/catala/surface/lexer_pl.mli @@ -0,0 +1,15 @@ +(* This file is part of the Catala compiler, a specification language for tax and social benefits + computation rules. Copyright (C) 2020 Inria, contributor: Denis Merigoux + + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except + in compliance with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License + is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + or implied. See the License for the specific language governing permissions and limitations under + the License. *) + +include Lexer.LocalisedLexer diff --git a/src/catala/surface/parser_driver.ml b/src/catala/surface/parser_driver.ml index 8d14a65f..12ed332b 100644 --- a/src/catala/surface/parser_driver.ml +++ b/src/catala/surface/parser_driver.ml @@ -226,11 +226,13 @@ end module Parser_NonVerbose = ParserAux (Lexer) module Parser_En = ParserAux (Lexer_en) module Parser_Fr = ParserAux (Lexer_fr) +module Parser_Pl = ParserAux (Lexer_pl) let localised_parser : Cli.frontend_lang -> lexbuf -> Ast.source_file_or_master = function | `NonVerbose -> Parser_NonVerbose.commands_or_includes | `En -> Parser_En.commands_or_includes | `Fr -> Parser_Fr.commands_or_includes + | `Pl -> Parser_Pl.commands_or_includes (** {1 Parsing multiple files} *) diff --git a/src/catala/utils/cli.ml b/src/catala/utils/cli.ml index 5064d78f..8720d227 100644 --- a/src/catala/utils/cli.ml +++ b/src/catala/utils/cli.ml @@ -12,12 +12,12 @@ or implied. See the License for the specific language governing permissions and limitations under the License. *) -type frontend_lang = [ `Fr | `En | `NonVerbose ] +type frontend_lang = [ `Fr | `En | `NonVerbose | `Pl] -type backend_lang = [ `Fr | `En ] +type backend_lang = [ `Fr | `En | `Pl ] let to_backend_lang (lang : frontend_lang) : backend_lang = - match lang with `En | `NonVerbose -> `En | `Fr -> `Fr + match lang with `En | `NonVerbose -> `En | `Fr -> `Fr | `Pl -> `Pl (** Source files to be compiled *) let source_files : string list ref = ref [] @@ -57,7 +57,8 @@ let unstyled = Arg.(value & flag & info [ "unstyled" ] ~doc:"Removes styling fro let optimize = Arg.(value & flag & info [ "optimize"; "O" ] ~doc:"Run compiler optimizations") let trace_opt = - Arg.(value & flag & info [ "trace"; "t" ] ~doc:"Displays a trace of the interpreter's computation") + Arg.( + value & flag & info [ "trace"; "t" ] ~doc:"Displays a trace of the interpreter's computation") let wrap_weaved_output = Arg.( diff --git a/src/catala/utils/cli.mli b/src/catala/utils/cli.mli index 0b874ef9..2ffa0b96 100644 --- a/src/catala/utils/cli.mli +++ b/src/catala/utils/cli.mli @@ -12,9 +12,9 @@ or implied. See the License for the specific language governing permissions and limitations under the License. *) -type frontend_lang = [ `En | `Fr | `NonVerbose ] +type frontend_lang = [ `En | `Fr | `NonVerbose | `Pl ] -type backend_lang = [ `En | `Fr ] +type backend_lang = [ `En | `Fr | `Pl ] val to_backend_lang : frontend_lang -> backend_lang