mirror of
https://github.com/CatalaLang/catala.git
synced 2024-11-08 07:51:43 +03:00
564 lines
14 KiB
OCaml
564 lines
14 KiB
OCaml
(* This file is part of the Catala compiler, a specification language for tax and social benefits
|
|
computation rules. Copyright (C) 2020 Inria, contributors: Denis Merigoux
|
|
<denis.merigoux@inria.fr>, Emile Rolley <emile.rolley@tuta.io>
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
|
|
in compliance with the License. You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software distributed under the License
|
|
is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
or implied. See the License for the specific language governing permissions and limitations under
|
|
the License. *)
|
|
|
|
open Tokens
|
|
open Sedlexing
|
|
open Utils
|
|
module L = Lexer_common
|
|
module R = Re.Pcre
|
|
|
|
(** Same as {!val: Surface.Lexer_common.token_list_language_agnostic}, but with tokens specialized
|
|
to French. *)
|
|
let token_list : (string * token) list =
|
|
[
|
|
("champ d'application", SCOPE);
|
|
("conséquence", CONSEQUENCE);
|
|
("donnée", DATA);
|
|
("dépend de", DEPENDS);
|
|
("déclaration", DECLARATION);
|
|
("contexte", CONTEXT);
|
|
("décroissant", DECREASING);
|
|
("croissant", INCREASING);
|
|
("de", OF);
|
|
("collection", COLLECTION);
|
|
("énumération", ENUM);
|
|
("entier", INTEGER);
|
|
("argent", MONEY);
|
|
("texte", TEXT);
|
|
("decimal", DECIMAL);
|
|
("date", DATE);
|
|
("durée", DURATION);
|
|
("booléen", BOOLEAN);
|
|
("somme", SUM);
|
|
("rempli", FILLED);
|
|
("définition", DEFINITION);
|
|
("égal à", DEFINED_AS);
|
|
("selon", MATCH);
|
|
("sous forme", WITH);
|
|
("n'importe quel", WILDCARD);
|
|
("sous condition", UNDER_CONDITION);
|
|
("si", IF);
|
|
("alors", THEN);
|
|
("sinon", ELSE);
|
|
("contenu", CONTENT);
|
|
("structure", STRUCT);
|
|
("assertion", ASSERTION);
|
|
("varie", VARIES);
|
|
("avec", WITH_V);
|
|
("pour", FOR);
|
|
("tout", ALL);
|
|
("on a", WE_HAVE);
|
|
("fixé", FIXED);
|
|
("par", BY);
|
|
("règle", RULE);
|
|
("existe", EXISTS);
|
|
("tel", SUCH);
|
|
("que", THAT);
|
|
("et", AND);
|
|
("ou", OR);
|
|
("ou bien", XOR);
|
|
("non", NOT);
|
|
("nombre", CARDINAL);
|
|
("maximum", MAXIMUM);
|
|
("minimum", MINIMUM);
|
|
("filtre", FILTER);
|
|
("application", MAP);
|
|
("initial", INIT);
|
|
("an", YEAR);
|
|
("mois", MONTH);
|
|
("jour", DAY);
|
|
("vrai", TRUE);
|
|
("faux", FALSE);
|
|
]
|
|
@ L.token_list_language_agnostic
|
|
|
|
let builtins : (string * Ast.builtin_expression) list =
|
|
[
|
|
("entier_vers_décimal", Ast.IntToDec);
|
|
("accès_jour", Ast.GetDay);
|
|
("accès_mois", Ast.GetMonth);
|
|
("accès_année", Ast.GetYear);
|
|
]
|
|
|
|
(** Regexp matching any digit character.
|
|
|
|
@note can not be used outside the current module (@see <
|
|
https://github.com/ocaml-community/sedlex#lexer-specifications >). *)
|
|
let digit = [%sedlex.regexp? '0' .. '9']
|
|
|
|
(** Regexp matching at least one space. *)
|
|
let space_plus = [%sedlex.regexp? Plus white_space]
|
|
|
|
(** Main lexing function used in code blocks *)
|
|
let rec lex_code (lexbuf : lexbuf) : token =
|
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
|
let prev_pos = lexing_positions lexbuf in
|
|
match%sedlex lexbuf with
|
|
| white_space | '\n' ->
|
|
(* Whitespaces *)
|
|
L.update_acc lexbuf;
|
|
lex_code lexbuf
|
|
| '#', Star (Compl '\n'), '\n' ->
|
|
(* Comments *)
|
|
L.update_acc lexbuf;
|
|
lex_code lexbuf
|
|
| "```" ->
|
|
(* End of code section *)
|
|
L.is_code := false;
|
|
END_CODE !L.code_string_acc
|
|
| "champ", space_plus, "d\'application" ->
|
|
L.update_acc lexbuf;
|
|
SCOPE
|
|
| "donn", 0xE9, "e" ->
|
|
(* 0xE9 is é *)
|
|
L.update_acc lexbuf;
|
|
DATA
|
|
| "d", 0xE9, "pend", space_plus, "de" ->
|
|
L.update_acc lexbuf;
|
|
DEPENDS
|
|
| "d", 0xE9, "claration" ->
|
|
L.update_acc lexbuf;
|
|
DECLARATION
|
|
| "contexte" ->
|
|
L.update_acc lexbuf;
|
|
CONTEXT
|
|
| "d", 0xE9, "croissant" ->
|
|
L.update_acc lexbuf;
|
|
DECREASING
|
|
| "croissant" ->
|
|
L.update_acc lexbuf;
|
|
INCREASING
|
|
| "de" ->
|
|
L.update_acc lexbuf;
|
|
OF
|
|
| "collection" ->
|
|
L.update_acc lexbuf;
|
|
COLLECTION
|
|
| 0xE9, "num", 0xE9, "ration" ->
|
|
L.update_acc lexbuf;
|
|
ENUM
|
|
| "entier" ->
|
|
L.update_acc lexbuf;
|
|
INTEGER
|
|
| "argent" ->
|
|
L.update_acc lexbuf;
|
|
MONEY
|
|
| "texte" ->
|
|
L.update_acc lexbuf;
|
|
TEXT
|
|
| "d", 0xE9, "cimal" ->
|
|
L.update_acc lexbuf;
|
|
DECIMAL
|
|
| "date" ->
|
|
L.update_acc lexbuf;
|
|
DATE
|
|
| "dur", 0xE9, "e" ->
|
|
L.update_acc lexbuf;
|
|
DURATION
|
|
| "bool", 0xE9, "en" ->
|
|
L.update_acc lexbuf;
|
|
BOOLEAN
|
|
| "somme" ->
|
|
L.update_acc lexbuf;
|
|
SUM
|
|
| "rempli" ->
|
|
L.update_acc lexbuf;
|
|
FILLED
|
|
| "d", 0xE9, "finition" ->
|
|
(* 0xE9 is é *)
|
|
L.update_acc lexbuf;
|
|
DEFINITION
|
|
| 0xE9, "tiquette" ->
|
|
L.update_acc lexbuf;
|
|
LABEL
|
|
| "exception" ->
|
|
L.update_acc lexbuf;
|
|
EXCEPTION
|
|
| 0xE9, "gal ", 0x00E0 ->
|
|
(* 0xE9 is é *)
|
|
L.update_acc lexbuf;
|
|
DEFINED_AS
|
|
| "selon" ->
|
|
L.update_acc lexbuf;
|
|
MATCH
|
|
| "sous", space_plus, "forme" ->
|
|
L.update_acc lexbuf;
|
|
WITH
|
|
| "n'importe", space_plus, "quel" ->
|
|
L.update_acc lexbuf;
|
|
WILDCARD
|
|
| "sous", space_plus, "condition" ->
|
|
L.update_acc lexbuf;
|
|
UNDER_CONDITION
|
|
| "si" ->
|
|
L.update_acc lexbuf;
|
|
IF
|
|
| "cons", 0xE9, "quence" ->
|
|
(* 0xE9 is é *)
|
|
L.update_acc lexbuf;
|
|
CONSEQUENCE
|
|
| "alors" ->
|
|
(* 0xE9 is é *)
|
|
L.update_acc lexbuf;
|
|
THEN
|
|
| "sinon" ->
|
|
L.update_acc lexbuf;
|
|
ELSE
|
|
| "condition" ->
|
|
L.update_acc lexbuf;
|
|
CONDITION
|
|
| "contenu" ->
|
|
L.update_acc lexbuf;
|
|
CONTENT
|
|
| "structure" ->
|
|
L.update_acc lexbuf;
|
|
STRUCT
|
|
| "assertion" ->
|
|
L.update_acc lexbuf;
|
|
ASSERTION
|
|
| "varie" ->
|
|
L.update_acc lexbuf;
|
|
VARIES
|
|
| "avec" ->
|
|
L.update_acc lexbuf;
|
|
WITH_V
|
|
| "pour" ->
|
|
L.update_acc lexbuf;
|
|
FOR
|
|
| "tout" ->
|
|
L.update_acc lexbuf;
|
|
ALL
|
|
| "on", space_plus, "a" ->
|
|
L.update_acc lexbuf;
|
|
WE_HAVE
|
|
| "fix", 0xE9 ->
|
|
(* 0xE9 is é *)
|
|
L.update_acc lexbuf;
|
|
FIXED
|
|
| "par" ->
|
|
L.update_acc lexbuf;
|
|
BY
|
|
| "r", 0xE8, "gle" ->
|
|
(* 0xE8 is è *)
|
|
L.update_acc lexbuf;
|
|
RULE
|
|
| "existe" ->
|
|
L.update_acc lexbuf;
|
|
EXISTS
|
|
| "dans" ->
|
|
L.update_acc lexbuf;
|
|
IN
|
|
| "tel" ->
|
|
L.update_acc lexbuf;
|
|
SUCH
|
|
| "que" ->
|
|
L.update_acc lexbuf;
|
|
THAT
|
|
| "et" ->
|
|
L.update_acc lexbuf;
|
|
AND
|
|
| "ou" ->
|
|
L.update_acc lexbuf;
|
|
OR
|
|
| "ou", space_plus, "bien" ->
|
|
L.update_acc lexbuf;
|
|
XOR
|
|
| "non" ->
|
|
L.update_acc lexbuf;
|
|
NOT
|
|
| "nombre" ->
|
|
L.update_acc lexbuf;
|
|
CARDINAL
|
|
| "maximum" ->
|
|
L.update_acc lexbuf;
|
|
MAXIMUM
|
|
| "minimum" ->
|
|
L.update_acc lexbuf;
|
|
MINIMUM
|
|
| "filtre" ->
|
|
L.update_acc lexbuf;
|
|
FILTER
|
|
| "application" ->
|
|
L.update_acc lexbuf;
|
|
MAP
|
|
| "initial" ->
|
|
L.update_acc lexbuf;
|
|
INIT
|
|
| "vrai" ->
|
|
L.update_acc lexbuf;
|
|
TRUE
|
|
| "faux" ->
|
|
L.update_acc lexbuf;
|
|
FALSE
|
|
| "an" ->
|
|
L.update_acc lexbuf;
|
|
YEAR
|
|
| "mois" ->
|
|
L.update_acc lexbuf;
|
|
MONTH
|
|
| "jour" ->
|
|
L.update_acc lexbuf;
|
|
DAY
|
|
| digit, Star (digit | white_space), Opt (',', Rep (digit, 0 .. 2)), Star white_space, 0x20AC ->
|
|
let extract_parts = R.regexp "([0-9]([0-9 ]*[0-9]|))(,([0-9]{0,2})|)" in
|
|
let full_str = Utf8.lexeme lexbuf in
|
|
let only_numbers_str = String.trim (String.sub full_str 0 (String.length full_str - 1)) in
|
|
let parts = R.get_substring (R.exec ~rex:extract_parts only_numbers_str) in
|
|
(* Integer literal*)
|
|
let units = parts 1 in
|
|
let remove_spaces = R.regexp " " in
|
|
let units =
|
|
Runtime.integer_of_string (R.substitute ~rex:remove_spaces ~subst:(fun _ -> "") units)
|
|
in
|
|
let cents =
|
|
try Runtime.integer_of_string (parts 4) with Not_found -> Runtime.integer_of_int 0
|
|
in
|
|
L.update_acc lexbuf;
|
|
MONEY_AMOUNT (units, cents)
|
|
| Plus digit, ',', Star digit ->
|
|
let extract_code_title = R.regexp "([0-9]+),([0-9]*)" in
|
|
let dec_parts = R.get_substring (R.exec ~rex:extract_code_title (Utf8.lexeme lexbuf)) in
|
|
(* Integer literal*)
|
|
L.update_acc lexbuf;
|
|
DECIMAL_LITERAL
|
|
(Runtime.integer_of_string (dec_parts 1), Runtime.integer_of_string (dec_parts 2))
|
|
| "<=@" ->
|
|
L.update_acc lexbuf;
|
|
LESSER_EQUAL_DATE
|
|
| "<@" ->
|
|
L.update_acc lexbuf;
|
|
LESSER_DATE
|
|
| ">=@" ->
|
|
L.update_acc lexbuf;
|
|
GREATER_EQUAL_DATE
|
|
| ">@" ->
|
|
L.update_acc lexbuf;
|
|
GREATER_DATE
|
|
| "-@" ->
|
|
L.update_acc lexbuf;
|
|
MINUSDATE
|
|
| "+@" ->
|
|
L.update_acc lexbuf;
|
|
PLUSDATE
|
|
| "<=^" ->
|
|
L.update_acc lexbuf;
|
|
LESSER_EQUAL_DURATION
|
|
| "<^" ->
|
|
L.update_acc lexbuf;
|
|
LESSER_DURATION
|
|
| ">=^" ->
|
|
L.update_acc lexbuf;
|
|
GREATER_EQUAL_DURATION
|
|
| ">^" ->
|
|
L.update_acc lexbuf;
|
|
GREATER_DURATION
|
|
| "+^" ->
|
|
L.update_acc lexbuf;
|
|
PLUSDURATION
|
|
| "-^" ->
|
|
L.update_acc lexbuf;
|
|
MINUSDURATION
|
|
| "/^" ->
|
|
L.update_acc lexbuf;
|
|
DIVDURATION
|
|
| "<=", 0x20AC ->
|
|
L.update_acc lexbuf;
|
|
LESSER_EQUAL_MONEY
|
|
| '<', 0x20AC ->
|
|
L.update_acc lexbuf;
|
|
LESSER_MONEY
|
|
| ">=", 0x20AC ->
|
|
L.update_acc lexbuf;
|
|
GREATER_EQUAL_MONEY
|
|
| '>', 0x20AC ->
|
|
L.update_acc lexbuf;
|
|
GREATER_MONEY
|
|
| '+', 0x20AC ->
|
|
L.update_acc lexbuf;
|
|
PLUSMONEY
|
|
| '-', 0x20AC ->
|
|
L.update_acc lexbuf;
|
|
MINUSMONEY
|
|
| '*', 0x20AC ->
|
|
L.update_acc lexbuf;
|
|
MULTMONEY
|
|
| '/', 0x20AC ->
|
|
L.update_acc lexbuf;
|
|
DIVMONEY
|
|
| "<=." ->
|
|
L.update_acc lexbuf;
|
|
LESSER_EQUAL_DEC
|
|
| "<." ->
|
|
L.update_acc lexbuf;
|
|
LESSER_DEC
|
|
| ">=." ->
|
|
L.update_acc lexbuf;
|
|
GREATER_EQUAL_DEC
|
|
| ">." ->
|
|
L.update_acc lexbuf;
|
|
GREATER_DEC
|
|
| "+." ->
|
|
L.update_acc lexbuf;
|
|
PLUSDEC
|
|
| "-." ->
|
|
L.update_acc lexbuf;
|
|
MINUSDEC
|
|
| "*." ->
|
|
L.update_acc lexbuf;
|
|
MULTDEC
|
|
| "/." ->
|
|
L.update_acc lexbuf;
|
|
DIVDEC
|
|
| "<=" ->
|
|
L.update_acc lexbuf;
|
|
LESSER_EQUAL
|
|
| '<' ->
|
|
L.update_acc lexbuf;
|
|
LESSER
|
|
| ">=" ->
|
|
L.update_acc lexbuf;
|
|
GREATER_EQUAL
|
|
| '>' ->
|
|
L.update_acc lexbuf;
|
|
GREATER
|
|
| '+' ->
|
|
L.update_acc lexbuf;
|
|
PLUS
|
|
| '-' ->
|
|
L.update_acc lexbuf;
|
|
MINUS
|
|
| '*' ->
|
|
L.update_acc lexbuf;
|
|
MULT
|
|
| '/' ->
|
|
L.update_acc lexbuf;
|
|
DIV
|
|
| "!=" ->
|
|
L.update_acc lexbuf;
|
|
NOT_EQUAL
|
|
| '=' ->
|
|
L.update_acc lexbuf;
|
|
EQUAL
|
|
| '%' ->
|
|
L.update_acc lexbuf;
|
|
PERCENT
|
|
| '(' ->
|
|
L.update_acc lexbuf;
|
|
LPAREN
|
|
| ')' ->
|
|
L.update_acc lexbuf;
|
|
RPAREN
|
|
| '{' ->
|
|
L.update_acc lexbuf;
|
|
LBRACKET
|
|
| '}' ->
|
|
L.update_acc lexbuf;
|
|
RBRACKET
|
|
| '[' ->
|
|
L.update_acc lexbuf;
|
|
LSQUARE
|
|
| ']' ->
|
|
L.update_acc lexbuf;
|
|
RSQUARE
|
|
| '|' ->
|
|
L.update_acc lexbuf;
|
|
VERTICAL
|
|
| ':' ->
|
|
L.update_acc lexbuf;
|
|
COLON
|
|
| ';' ->
|
|
L.update_acc lexbuf;
|
|
SEMICOLON
|
|
| "--" ->
|
|
L.update_acc lexbuf;
|
|
ALT
|
|
| '.' ->
|
|
L.update_acc lexbuf;
|
|
DOT
|
|
| uppercase, Star (uppercase | lowercase | digit | '_' | '\'') ->
|
|
(* Name of constructor *)
|
|
L.update_acc lexbuf;
|
|
CONSTRUCTOR (Utf8.lexeme lexbuf)
|
|
| lowercase, Star (lowercase | uppercase | digit | '_' | '\'') ->
|
|
(* Name of variable *)
|
|
L.update_acc lexbuf;
|
|
IDENT (Utf8.lexeme lexbuf)
|
|
| Plus digit ->
|
|
(* Integer literal*)
|
|
L.update_acc lexbuf;
|
|
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
|
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
|
|
|
(** Main lexing function used outside code blocks *)
|
|
let lex_law (lexbuf : lexbuf) : token =
|
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
|
let prev_pos = lexing_positions lexbuf in
|
|
let compl_catala =
|
|
[%sedlex.regexp?
|
|
( Compl 'c'
|
|
| 'c', Compl 'a'
|
|
| "ca", Compl 't'
|
|
| "cat", Compl 'a'
|
|
| "cata", Compl 'l'
|
|
| "catal", Compl 'a'
|
|
| "catala", Compl (white_space | '\n') )]
|
|
in
|
|
match%sedlex lexbuf with
|
|
| "```catala" ->
|
|
L.is_code := true;
|
|
L.code_string_acc := "";
|
|
|
|
BEGIN_CODE
|
|
| eof -> EOF
|
|
| '>', Star white_space, 'D', 0xE9, "but m", 0xE9, "tadonn", 0xE9, "es" -> BEGIN_METADATA
|
|
| '>', Star white_space, "Fin m", 0xE9, "tadonn", 0xE9, "es" -> END_METADATA
|
|
| ( '>',
|
|
Star white_space,
|
|
"Inclusion:",
|
|
Star white_space,
|
|
Plus (Compl ('@' | '\n')),
|
|
Star white_space,
|
|
Opt ('@', Star white_space, "p.", Star white_space, Plus digit, Star white_space),
|
|
'\n' ) ->
|
|
let extract_components =
|
|
R.regexp ">\\s*Inclusion\\:\\s*([^@\\n]+)\\s*(@\\s*p\\.\\s*([0-9]+)|)"
|
|
in
|
|
let get_component = R.get_substring (R.exec ~rex:extract_components (Utf8.lexeme lexbuf)) in
|
|
let jorftext = R.regexp "JORFTEXT\\d{12}" in
|
|
let name = get_component 1 in
|
|
let pages = try Some (int_of_string (get_component 3)) with Not_found -> None in
|
|
let pos = lexing_positions lexbuf in
|
|
if R.pmatch ~rex:jorftext name then
|
|
LAW_INCLUDE (Ast.LegislativeText (name, Pos.from_lpos pos))
|
|
else if Filename.extension name = ".pdf" then
|
|
LAW_INCLUDE (Ast.PdfFile ((name, Pos.from_lpos pos), pages))
|
|
else LAW_INCLUDE (Ast.CatalaFile (name, Pos.from_lpos pos))
|
|
| Plus '#', Star white_space, Plus (Compl '\n'), Star white_space, '\n' ->
|
|
L.get_law_heading lexbuf
|
|
| Plus
|
|
(* Match non-special characters, i.e. characters that doesn't appear at the start of a
|
|
previous regexp. *)
|
|
( Compl ('#' | '`' | '>')
|
|
(* Following literals allow to match grave accents as long as they don't conflict with the
|
|
[BEGIN_CODE] token, i.e. either there are no more than three consecutive ones or they must
|
|
not be followed by 'catala'. *)
|
|
| Rep ('`', 1 .. 2), Compl '`'
|
|
| "```", compl_catala ) ->
|
|
LAW_TEXT (Utf8.lexeme lexbuf)
|
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
|
|
|
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
|
|
Surface.Lexer_common.is_code}. *)
|
|
let lexer (lexbuf : lexbuf) : token = if !L.is_code then lex_code lexbuf else lex_law lexbuf
|