catala/compiler/surface/lexer_fr.ml
2021-06-21 11:39:06 +02:00

564 lines
14 KiB
OCaml

(* This file is part of the Catala compiler, a specification language for tax and social benefits
computation rules. Copyright (C) 2020 Inria, contributors: Denis Merigoux
<denis.merigoux@inria.fr>, Emile Rolley <emile.rolley@tuta.io>
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License
is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
or implied. See the License for the specific language governing permissions and limitations under
the License. *)
open Tokens
open Sedlexing
open Utils
module L = Lexer_common
module R = Re.Pcre
(** Same as {!val: Surface.Lexer_common.token_list_language_agnostic}, but with tokens specialized
to French. *)
let token_list : (string * token) list =
[
("champ d'application", SCOPE);
("conséquence", CONSEQUENCE);
("donnée", DATA);
("dépend de", DEPENDS);
("déclaration", DECLARATION);
("contexte", CONTEXT);
("décroissant", DECREASING);
("croissant", INCREASING);
("de", OF);
("collection", COLLECTION);
("énumération", ENUM);
("entier", INTEGER);
("argent", MONEY);
("texte", TEXT);
("decimal", DECIMAL);
("date", DATE);
("durée", DURATION);
("booléen", BOOLEAN);
("somme", SUM);
("rempli", FILLED);
("définition", DEFINITION);
("égal à", DEFINED_AS);
("selon", MATCH);
("sous forme", WITH);
("n'importe quel", WILDCARD);
("sous condition", UNDER_CONDITION);
("si", IF);
("alors", THEN);
("sinon", ELSE);
("contenu", CONTENT);
("structure", STRUCT);
("assertion", ASSERTION);
("varie", VARIES);
("avec", WITH_V);
("pour", FOR);
("tout", ALL);
("on a", WE_HAVE);
("fixé", FIXED);
("par", BY);
("règle", RULE);
("existe", EXISTS);
("tel", SUCH);
("que", THAT);
("et", AND);
("ou", OR);
("ou bien", XOR);
("non", NOT);
("nombre", CARDINAL);
("maximum", MAXIMUM);
("minimum", MINIMUM);
("filtre", FILTER);
("application", MAP);
("initial", INIT);
("an", YEAR);
("mois", MONTH);
("jour", DAY);
("vrai", TRUE);
("faux", FALSE);
]
@ L.token_list_language_agnostic
let builtins : (string * Ast.builtin_expression) list =
[
("entier_vers_décimal", Ast.IntToDec);
("accès_jour", Ast.GetDay);
("accès_mois", Ast.GetMonth);
("accès_année", Ast.GetYear);
]
(** Regexp matching any digit character.
@note can not be used outside the current module (@see <
https://github.com/ocaml-community/sedlex#lexer-specifications >). *)
let digit = [%sedlex.regexp? '0' .. '9']
(** Regexp matching at least one space. *)
let space_plus = [%sedlex.regexp? Plus white_space]
(** Main lexing function used in code blocks *)
let rec lex_code (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| white_space | '\n' ->
(* Whitespaces *)
L.update_acc lexbuf;
lex_code lexbuf
| '#', Star (Compl '\n'), '\n' ->
(* Comments *)
L.update_acc lexbuf;
lex_code lexbuf
| "```" ->
(* End of code section *)
L.is_code := false;
END_CODE !L.code_string_acc
| "champ", space_plus, "d\'application" ->
L.update_acc lexbuf;
SCOPE
| "donn", 0xE9, "e" ->
(* 0xE9 is é *)
L.update_acc lexbuf;
DATA
| "d", 0xE9, "pend", space_plus, "de" ->
L.update_acc lexbuf;
DEPENDS
| "d", 0xE9, "claration" ->
L.update_acc lexbuf;
DECLARATION
| "contexte" ->
L.update_acc lexbuf;
CONTEXT
| "d", 0xE9, "croissant" ->
L.update_acc lexbuf;
DECREASING
| "croissant" ->
L.update_acc lexbuf;
INCREASING
| "de" ->
L.update_acc lexbuf;
OF
| "collection" ->
L.update_acc lexbuf;
COLLECTION
| 0xE9, "num", 0xE9, "ration" ->
L.update_acc lexbuf;
ENUM
| "entier" ->
L.update_acc lexbuf;
INTEGER
| "argent" ->
L.update_acc lexbuf;
MONEY
| "texte" ->
L.update_acc lexbuf;
TEXT
| "d", 0xE9, "cimal" ->
L.update_acc lexbuf;
DECIMAL
| "date" ->
L.update_acc lexbuf;
DATE
| "dur", 0xE9, "e" ->
L.update_acc lexbuf;
DURATION
| "bool", 0xE9, "en" ->
L.update_acc lexbuf;
BOOLEAN
| "somme" ->
L.update_acc lexbuf;
SUM
| "rempli" ->
L.update_acc lexbuf;
FILLED
| "d", 0xE9, "finition" ->
(* 0xE9 is é *)
L.update_acc lexbuf;
DEFINITION
| 0xE9, "tiquette" ->
L.update_acc lexbuf;
LABEL
| "exception" ->
L.update_acc lexbuf;
EXCEPTION
| 0xE9, "gal ", 0x00E0 ->
(* 0xE9 is é *)
L.update_acc lexbuf;
DEFINED_AS
| "selon" ->
L.update_acc lexbuf;
MATCH
| "sous", space_plus, "forme" ->
L.update_acc lexbuf;
WITH
| "n'importe", space_plus, "quel" ->
L.update_acc lexbuf;
WILDCARD
| "sous", space_plus, "condition" ->
L.update_acc lexbuf;
UNDER_CONDITION
| "si" ->
L.update_acc lexbuf;
IF
| "cons", 0xE9, "quence" ->
(* 0xE9 is é *)
L.update_acc lexbuf;
CONSEQUENCE
| "alors" ->
(* 0xE9 is é *)
L.update_acc lexbuf;
THEN
| "sinon" ->
L.update_acc lexbuf;
ELSE
| "condition" ->
L.update_acc lexbuf;
CONDITION
| "contenu" ->
L.update_acc lexbuf;
CONTENT
| "structure" ->
L.update_acc lexbuf;
STRUCT
| "assertion" ->
L.update_acc lexbuf;
ASSERTION
| "varie" ->
L.update_acc lexbuf;
VARIES
| "avec" ->
L.update_acc lexbuf;
WITH_V
| "pour" ->
L.update_acc lexbuf;
FOR
| "tout" ->
L.update_acc lexbuf;
ALL
| "on", space_plus, "a" ->
L.update_acc lexbuf;
WE_HAVE
| "fix", 0xE9 ->
(* 0xE9 is é *)
L.update_acc lexbuf;
FIXED
| "par" ->
L.update_acc lexbuf;
BY
| "r", 0xE8, "gle" ->
(* 0xE8 is è *)
L.update_acc lexbuf;
RULE
| "existe" ->
L.update_acc lexbuf;
EXISTS
| "dans" ->
L.update_acc lexbuf;
IN
| "tel" ->
L.update_acc lexbuf;
SUCH
| "que" ->
L.update_acc lexbuf;
THAT
| "et" ->
L.update_acc lexbuf;
AND
| "ou" ->
L.update_acc lexbuf;
OR
| "ou", space_plus, "bien" ->
L.update_acc lexbuf;
XOR
| "non" ->
L.update_acc lexbuf;
NOT
| "nombre" ->
L.update_acc lexbuf;
CARDINAL
| "maximum" ->
L.update_acc lexbuf;
MAXIMUM
| "minimum" ->
L.update_acc lexbuf;
MINIMUM
| "filtre" ->
L.update_acc lexbuf;
FILTER
| "application" ->
L.update_acc lexbuf;
MAP
| "initial" ->
L.update_acc lexbuf;
INIT
| "vrai" ->
L.update_acc lexbuf;
TRUE
| "faux" ->
L.update_acc lexbuf;
FALSE
| "an" ->
L.update_acc lexbuf;
YEAR
| "mois" ->
L.update_acc lexbuf;
MONTH
| "jour" ->
L.update_acc lexbuf;
DAY
| digit, Star (digit | white_space), Opt (',', Rep (digit, 0 .. 2)), Star white_space, 0x20AC ->
let extract_parts = R.regexp "([0-9]([0-9 ]*[0-9]|))(,([0-9]{0,2})|)" in
let full_str = Utf8.lexeme lexbuf in
let only_numbers_str = String.trim (String.sub full_str 0 (String.length full_str - 1)) in
let parts = R.get_substring (R.exec ~rex:extract_parts only_numbers_str) in
(* Integer literal*)
let units = parts 1 in
let remove_spaces = R.regexp " " in
let units =
Runtime.integer_of_string (R.substitute ~rex:remove_spaces ~subst:(fun _ -> "") units)
in
let cents =
try Runtime.integer_of_string (parts 4) with Not_found -> Runtime.integer_of_int 0
in
L.update_acc lexbuf;
MONEY_AMOUNT (units, cents)
| Plus digit, ',', Star digit ->
let extract_code_title = R.regexp "([0-9]+),([0-9]*)" in
let dec_parts = R.get_substring (R.exec ~rex:extract_code_title (Utf8.lexeme lexbuf)) in
(* Integer literal*)
L.update_acc lexbuf;
DECIMAL_LITERAL
(Runtime.integer_of_string (dec_parts 1), Runtime.integer_of_string (dec_parts 2))
| "<=@" ->
L.update_acc lexbuf;
LESSER_EQUAL_DATE
| "<@" ->
L.update_acc lexbuf;
LESSER_DATE
| ">=@" ->
L.update_acc lexbuf;
GREATER_EQUAL_DATE
| ">@" ->
L.update_acc lexbuf;
GREATER_DATE
| "-@" ->
L.update_acc lexbuf;
MINUSDATE
| "+@" ->
L.update_acc lexbuf;
PLUSDATE
| "<=^" ->
L.update_acc lexbuf;
LESSER_EQUAL_DURATION
| "<^" ->
L.update_acc lexbuf;
LESSER_DURATION
| ">=^" ->
L.update_acc lexbuf;
GREATER_EQUAL_DURATION
| ">^" ->
L.update_acc lexbuf;
GREATER_DURATION
| "+^" ->
L.update_acc lexbuf;
PLUSDURATION
| "-^" ->
L.update_acc lexbuf;
MINUSDURATION
| "/^" ->
L.update_acc lexbuf;
DIVDURATION
| "<=", 0x20AC ->
L.update_acc lexbuf;
LESSER_EQUAL_MONEY
| '<', 0x20AC ->
L.update_acc lexbuf;
LESSER_MONEY
| ">=", 0x20AC ->
L.update_acc lexbuf;
GREATER_EQUAL_MONEY
| '>', 0x20AC ->
L.update_acc lexbuf;
GREATER_MONEY
| '+', 0x20AC ->
L.update_acc lexbuf;
PLUSMONEY
| '-', 0x20AC ->
L.update_acc lexbuf;
MINUSMONEY
| '*', 0x20AC ->
L.update_acc lexbuf;
MULTMONEY
| '/', 0x20AC ->
L.update_acc lexbuf;
DIVMONEY
| "<=." ->
L.update_acc lexbuf;
LESSER_EQUAL_DEC
| "<." ->
L.update_acc lexbuf;
LESSER_DEC
| ">=." ->
L.update_acc lexbuf;
GREATER_EQUAL_DEC
| ">." ->
L.update_acc lexbuf;
GREATER_DEC
| "+." ->
L.update_acc lexbuf;
PLUSDEC
| "-." ->
L.update_acc lexbuf;
MINUSDEC
| "*." ->
L.update_acc lexbuf;
MULTDEC
| "/." ->
L.update_acc lexbuf;
DIVDEC
| "<=" ->
L.update_acc lexbuf;
LESSER_EQUAL
| '<' ->
L.update_acc lexbuf;
LESSER
| ">=" ->
L.update_acc lexbuf;
GREATER_EQUAL
| '>' ->
L.update_acc lexbuf;
GREATER
| '+' ->
L.update_acc lexbuf;
PLUS
| '-' ->
L.update_acc lexbuf;
MINUS
| '*' ->
L.update_acc lexbuf;
MULT
| '/' ->
L.update_acc lexbuf;
DIV
| "!=" ->
L.update_acc lexbuf;
NOT_EQUAL
| '=' ->
L.update_acc lexbuf;
EQUAL
| '%' ->
L.update_acc lexbuf;
PERCENT
| '(' ->
L.update_acc lexbuf;
LPAREN
| ')' ->
L.update_acc lexbuf;
RPAREN
| '{' ->
L.update_acc lexbuf;
LBRACKET
| '}' ->
L.update_acc lexbuf;
RBRACKET
| '[' ->
L.update_acc lexbuf;
LSQUARE
| ']' ->
L.update_acc lexbuf;
RSQUARE
| '|' ->
L.update_acc lexbuf;
VERTICAL
| ':' ->
L.update_acc lexbuf;
COLON
| ';' ->
L.update_acc lexbuf;
SEMICOLON
| "--" ->
L.update_acc lexbuf;
ALT
| '.' ->
L.update_acc lexbuf;
DOT
| uppercase, Star (uppercase | lowercase | digit | '_' | '\'') ->
(* Name of constructor *)
L.update_acc lexbuf;
CONSTRUCTOR (Utf8.lexeme lexbuf)
| lowercase, Star (lowercase | uppercase | digit | '_' | '\'') ->
(* Name of variable *)
L.update_acc lexbuf;
IDENT (Utf8.lexeme lexbuf)
| Plus digit ->
(* Integer literal*)
L.update_acc lexbuf;
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Main lexing function used outside code blocks *)
let lex_law (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
let compl_catala =
[%sedlex.regexp?
( Compl 'c'
| 'c', Compl 'a'
| "ca", Compl 't'
| "cat", Compl 'a'
| "cata", Compl 'l'
| "catal", Compl 'a'
| "catala", Compl (white_space | '\n') )]
in
match%sedlex lexbuf with
| "```catala" ->
L.is_code := true;
L.code_string_acc := "";
BEGIN_CODE
| eof -> EOF
| '>', Star white_space, 'D', 0xE9, "but m", 0xE9, "tadonn", 0xE9, "es" -> BEGIN_METADATA
| '>', Star white_space, "Fin m", 0xE9, "tadonn", 0xE9, "es" -> END_METADATA
| ( '>',
Star white_space,
"Inclusion:",
Star white_space,
Plus (Compl ('@' | '\n')),
Star white_space,
Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
'\n' ) ->
let extract_components =
R.regexp ">\\s*Inclusion\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
in
let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
let jorftext = R.regexp "JORFTEXT\\d{12}" in
let name = get_component 1 in
let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
let pos = lexing_positions lexbuf in
if R.pmatch ~rex:jorftext name then
LAW_INCLUDE (Ast.LegislativeText (name, Pos.from_lpos pos))
else if Filename.extension name = ".pdf" then
LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
| Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
L.get_law_heading lexbuf
| Plus
(* Match non-special characters, i.e. characters that doesn't appear at the start of a
previous regexp. *)
( Compl ('#' | '`' | '>')
(* Following literals allow to match grave accents as long as they don't conflict with the
[BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
not be followed by 'catala'. *)
| Rep ('`', 1 .. 2), Compl '`'
| "```", compl_catala ) ->
LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
Surface.Lexer_common.is_code}. *)
let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf