2021-05-09 23:55:50 +03:00
|
|
|
(* This file is part of the Catala compiler, a specification language for tax and social benefits
|
2021-05-27 19:56:47 +03:00
|
|
|
computation rules. Copyright (C) 2020 Inria, contributors: Denis Merigoux
|
|
|
|
<denis.merigoux@inria.fr>, Emile Rolley <emile.rolley@tuta.io>
|
2021-05-09 23:55:50 +03:00
|
|
|
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
|
|
|
|
in compliance with the License. You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software distributed under the License
|
|
|
|
is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
|
|
or implied. See the License for the specific language governing permissions and limitations under
|
|
|
|
the License. *)
|
|
|
|
|
|
|
|
open Tokens
|
|
|
|
open Sedlexing
|
|
|
|
open Utils
|
2021-05-26 18:39:39 +03:00
|
|
|
module L = Lexer_common
|
2021-05-09 23:55:50 +03:00
|
|
|
module R = Re.Pcre
|
|
|
|
|
2021-05-26 18:39:39 +03:00
|
|
|
(** Same as {!val: Surface.Lexer_common.token_list_language_agnostic}, but with tokens specialized
|
|
|
|
to Polish. *)
|
2021-05-09 23:55:50 +03:00
|
|
|
let token_list : (string * token) list =
|
|
|
|
[
|
|
|
|
("zakres", SCOPE);
|
|
|
|
("konsekwencja", CONSEQUENCE);
|
|
|
|
("data", DATA);
|
|
|
|
("zalezy od", DEPENDS);
|
|
|
|
("deklaracja", DECLARATION);
|
|
|
|
("kontekst", CONTEXT);
|
|
|
|
("malejacy", DECREASING);
|
|
|
|
("rosnacy", INCREASING);
|
|
|
|
("z", OF);
|
|
|
|
("kolekcja", COLLECTION);
|
|
|
|
("enumeracja", ENUM);
|
|
|
|
("calkowita", INTEGER);
|
2021-05-13 20:00:33 +03:00
|
|
|
("pieniądze", MONEY);
|
2021-05-09 23:55:50 +03:00
|
|
|
("tekst", TEXT);
|
2021-05-16 21:38:38 +03:00
|
|
|
("dziesiętny", DECIMAL);
|
2021-05-09 23:55:50 +03:00
|
|
|
("czas", DATE);
|
|
|
|
("czas trwania", DURATION);
|
|
|
|
("zerojedynkowy", BOOLEAN);
|
|
|
|
("suma", SUM);
|
|
|
|
("spelnione", FILLED);
|
|
|
|
("definicja", DEFINITION);
|
|
|
|
("etykieta", LABEL);
|
2021-05-16 22:07:14 +03:00
|
|
|
("wyjątek", EXCEPTION);
|
2021-05-09 23:55:50 +03:00
|
|
|
("wynosi", DEFINED_AS);
|
|
|
|
("pasuje", MATCH);
|
|
|
|
("ze wzorem", WITH);
|
2021-06-03 16:33:40 +03:00
|
|
|
("cokolwiek", WILDCARD);
|
2021-05-09 23:55:50 +03:00
|
|
|
("pod warunkiem", UNDER_CONDITION);
|
|
|
|
("jezeli", IF);
|
|
|
|
("wtedy", THEN);
|
|
|
|
("inaczej", ELSE);
|
2021-05-12 20:42:40 +03:00
|
|
|
("typu", CONTENT);
|
2021-05-09 23:55:50 +03:00
|
|
|
("struktura", STRUCT);
|
|
|
|
("asercja", ASSERTION);
|
|
|
|
("rozna", VARIES);
|
2021-05-10 20:05:18 +03:00
|
|
|
("wraz z", WITH_V);
|
2021-05-09 23:55:50 +03:00
|
|
|
("dla", FOR);
|
|
|
|
("wszystkie", ALL);
|
|
|
|
("mamy", WE_HAVE);
|
|
|
|
("staloprzecinkowa", FIXED);
|
|
|
|
("przez", BY);
|
|
|
|
("zasada", RULE);
|
|
|
|
("istnieje", EXISTS);
|
|
|
|
("takie ze", SUCH);
|
|
|
|
("to", THAT);
|
|
|
|
("i", AND);
|
|
|
|
("lub", OR);
|
|
|
|
("xor", XOR);
|
|
|
|
("nie", NOT);
|
|
|
|
("maximum", MAXIMUM);
|
|
|
|
("minimum", MINIMUM);
|
|
|
|
("filtr", FILTER);
|
|
|
|
("mapuj", MAP);
|
|
|
|
("poczatkowy", INIT);
|
|
|
|
("liczba", CARDINAL);
|
|
|
|
("rok", YEAR);
|
|
|
|
("miesiac", MONTH);
|
|
|
|
("dzien", DAY);
|
|
|
|
("prawda", TRUE);
|
|
|
|
("falsz", FALSE);
|
|
|
|
]
|
|
|
|
@ L.token_list_language_agnostic
|
|
|
|
|
2021-05-24 17:23:45 +03:00
|
|
|
(** Localised builtin functions
|
|
|
|
|
|
|
|
@note (EmileRolley): shouldn't the builtin functions also be translated ? *)
|
2021-05-09 23:55:50 +03:00
|
|
|
let builtins : (string * Ast.builtin_expression) list =
|
|
|
|
[
|
|
|
|
("integer_to_decimal", IntToDec);
|
|
|
|
("get_day", GetDay);
|
|
|
|
("get_month", GetMonth);
|
|
|
|
("get_year", GetYear);
|
|
|
|
]
|
|
|
|
|
2021-05-24 17:23:45 +03:00
|
|
|
(** Regexp matching any digit character.
|
|
|
|
|
|
|
|
@note can not be used outside the current module (@see <
|
|
|
|
https://github.com/ocaml-community/sedlex#lexer-specifications >). *)
|
|
|
|
let digit = [%sedlex.regexp? '0' .. '9']
|
|
|
|
|
2021-06-03 16:47:48 +03:00
|
|
|
(** Regexp matching at least one space. *)
|
|
|
|
let space_plus = [%sedlex.regexp? Plus white_space]
|
|
|
|
|
2021-08-17 16:49:48 +03:00
|
|
|
(** Regexp matching white space but not newlines *)
|
|
|
|
let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
|
|
|
|
|
2021-05-09 23:55:50 +03:00
|
|
|
(** Main lexing function used in code blocks *)
|
|
|
|
let rec lex_code (lexbuf : lexbuf) : token =
|
|
|
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
|
|
|
let prev_pos = lexing_positions lexbuf in
|
|
|
|
match%sedlex lexbuf with
|
|
|
|
| white_space ->
|
|
|
|
(* Whitespaces *)
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
lex_code lexbuf
|
|
|
|
| '#', Star (Compl '\n'), '\n' ->
|
|
|
|
(* Comments *)
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
lex_code lexbuf
|
|
|
|
| "```" ->
|
|
|
|
(* End of code section *)
|
2021-08-17 16:49:48 +03:00
|
|
|
L.context := Law;
|
|
|
|
END_CODE (Buffer.contents L.code_buffer)
|
2021-05-09 23:55:50 +03:00
|
|
|
| "zakres" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
SCOPE
|
|
|
|
| "data" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
DATA
|
2021-06-03 16:47:48 +03:00
|
|
|
| "zalezy", space_plus, "od" ->
|
2021-05-09 23:55:50 +03:00
|
|
|
L.update_acc lexbuf;
|
|
|
|
DEPENDS
|
|
|
|
| "deklaracja" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
DECLARATION
|
|
|
|
| "kontekst" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
CONTEXT
|
|
|
|
| "malejacy" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
DECREASING
|
|
|
|
| "rosnacy" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
INCREASING
|
|
|
|
| "z" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
OF
|
|
|
|
| "kolekcja" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
COLLECTION
|
|
|
|
| "enumeracja" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
ENUM
|
|
|
|
| "calkowita" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
INTEGER
|
2021-05-13 20:00:33 +03:00
|
|
|
| "pieni", 0x0105, "dze" ->
|
2021-05-09 23:55:50 +03:00
|
|
|
L.update_acc lexbuf;
|
|
|
|
MONEY
|
|
|
|
| "tekst" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
TEXT
|
2021-05-16 21:38:38 +03:00
|
|
|
| "dziesi", 0x0119, "tny" ->
|
2021-05-09 23:55:50 +03:00
|
|
|
L.update_acc lexbuf;
|
|
|
|
DECIMAL
|
|
|
|
| "czas" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
DATE
|
2021-06-03 16:47:48 +03:00
|
|
|
| "czas", space_plus, "trwania" ->
|
2021-05-09 23:55:50 +03:00
|
|
|
L.update_acc lexbuf;
|
|
|
|
DURATION
|
|
|
|
| "zerojedynkowy" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
BOOLEAN
|
|
|
|
| "suma" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
SUM
|
|
|
|
| "spelnione" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
FILLED
|
|
|
|
| "definicja" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
DEFINITION
|
|
|
|
| "etykieta" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
LABEL
|
2021-05-16 22:07:14 +03:00
|
|
|
| "wyj", 0x0105, "tek" ->
|
2021-05-09 23:55:50 +03:00
|
|
|
L.update_acc lexbuf;
|
|
|
|
EXCEPTION
|
|
|
|
| "wynosi" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
DEFINED_AS
|
|
|
|
| "pasuje" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
MATCH
|
2021-06-03 16:47:48 +03:00
|
|
|
| "ze", space_plus, "wzorem" ->
|
2021-05-09 23:55:50 +03:00
|
|
|
L.update_acc lexbuf;
|
|
|
|
WITH
|
2021-06-03 16:33:40 +03:00
|
|
|
| "cokolwiek" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
WILDCARD
|
2021-06-03 16:47:48 +03:00
|
|
|
| "pod", space_plus, "warunkiem" ->
|
2021-05-09 23:55:50 +03:00
|
|
|
L.update_acc lexbuf;
|
|
|
|
UNDER_CONDITION
|
|
|
|
| "jezeli" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
IF
|
|
|
|
| "konsekwencja" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
CONSEQUENCE
|
|
|
|
| "wtedy" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
THEN
|
|
|
|
| "inaczej" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
ELSE
|
2021-05-12 19:54:09 +03:00
|
|
|
| "warunek" ->
|
2021-05-09 23:55:50 +03:00
|
|
|
L.update_acc lexbuf;
|
|
|
|
CONDITION
|
2021-05-12 20:42:40 +03:00
|
|
|
| "typu" ->
|
2021-05-09 23:55:50 +03:00
|
|
|
L.update_acc lexbuf;
|
|
|
|
CONTENT
|
|
|
|
| "struktura" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
STRUCT
|
|
|
|
| "asercja" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
ASSERTION
|
|
|
|
| "rozna" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
VARIES
|
2021-06-03 16:47:48 +03:00
|
|
|
| "wraz", space_plus, "z" ->
|
2021-05-09 23:55:50 +03:00
|
|
|
L.update_acc lexbuf;
|
|
|
|
WITH_V
|
|
|
|
| "dla" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
FOR
|
|
|
|
| "wszystkie" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
ALL
|
|
|
|
| "mamy" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
WE_HAVE
|
|
|
|
| "staloprzecinkowa" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
FIXED
|
|
|
|
| "przez" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
BY
|
|
|
|
| "zasada" ->
|
|
|
|
(* 0xE8 is è *)
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
RULE
|
|
|
|
| "istnieje" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
EXISTS
|
|
|
|
| "in" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
IN
|
2021-06-03 16:47:48 +03:00
|
|
|
| "takie", space_plus, "ze" ->
|
2021-05-09 23:55:50 +03:00
|
|
|
L.update_acc lexbuf;
|
|
|
|
SUCH
|
|
|
|
| "to" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
THAT
|
|
|
|
| "i" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
AND
|
|
|
|
| "lub" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
OR
|
|
|
|
| "xor" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
XOR
|
|
|
|
| "nie" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
NOT
|
|
|
|
| "maximum" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
MAXIMUM
|
|
|
|
| "minimum" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
MINIMUM
|
|
|
|
| "filtr" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
FILTER
|
|
|
|
| "mapuj" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
MAP
|
|
|
|
| "poczatkowy" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
INIT
|
|
|
|
| "liczba" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
CARDINAL
|
|
|
|
| "prawda" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
TRUE
|
|
|
|
| "falsz" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
FALSE
|
|
|
|
| "rok" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
YEAR
|
|
|
|
| "miesiac" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
MONTH
|
|
|
|
| "dzien" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
DAY
|
2021-05-13 11:22:21 +03:00
|
|
|
| ( Star white_space,
|
2021-05-24 17:23:45 +03:00
|
|
|
digit,
|
|
|
|
Star (digit | ','),
|
|
|
|
Opt ('.', Rep (digit, 0 .. 2)),
|
2021-05-13 11:22:21 +03:00
|
|
|
Star white_space,
|
|
|
|
"PLN" ) ->
|
2021-05-09 23:55:50 +03:00
|
|
|
let extract_parts = R.regexp "([0-9]([0-9,]*[0-9]|))(.([0-9]{0,2})|)" in
|
|
|
|
let full_str = Utf8.lexeme lexbuf in
|
|
|
|
let only_numbers_str = String.trim (String.sub full_str 1 (String.length full_str - 1)) in
|
|
|
|
let parts = R.get_substring (R.exec ~rex:extract_parts only_numbers_str) in
|
|
|
|
(* Integer literal*)
|
|
|
|
let units = parts 1 in
|
|
|
|
let remove_commas = R.regexp "," in
|
|
|
|
let units =
|
|
|
|
Runtime.integer_of_string (R.substitute ~rex:remove_commas ~subst:(fun _ -> "") units)
|
|
|
|
in
|
|
|
|
let cents =
|
|
|
|
try Runtime.integer_of_string (parts 4) with Not_found -> Runtime.integer_of_int 0
|
|
|
|
in
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
MONEY_AMOUNT (units, cents)
|
2021-05-24 17:23:45 +03:00
|
|
|
| Plus digit, '.', Star digit ->
|
2021-05-09 23:55:50 +03:00
|
|
|
let extract_code_title = R.regexp "([0-9]+)\\.([0-9]*)" in
|
|
|
|
let dec_parts = R.get_substring (R.exec ~rex:extract_code_title (Utf8.lexeme lexbuf)) in
|
|
|
|
(* Integer literal*)
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
DECIMAL_LITERAL
|
|
|
|
(Runtime.integer_of_string (dec_parts 1), Runtime.integer_of_string (dec_parts 2))
|
|
|
|
| "<=@" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
LESSER_EQUAL_DATE
|
|
|
|
| "<@" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
LESSER_DATE
|
|
|
|
| ">=@" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
GREATER_EQUAL_DATE
|
|
|
|
| ">@" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
GREATER_DATE
|
|
|
|
| "-@" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
MINUSDATE
|
|
|
|
| "+@" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
PLUSDATE
|
|
|
|
| "<=^" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
LESSER_EQUAL_DURATION
|
|
|
|
| "<^" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
LESSER_DURATION
|
|
|
|
| ">=^" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
GREATER_EQUAL_DURATION
|
|
|
|
| ">^" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
GREATER_DURATION
|
|
|
|
| "+^" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
PLUSDURATION
|
|
|
|
| "-^" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
MINUSDURATION
|
2021-05-30 14:21:28 +03:00
|
|
|
| "/^" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
DIVDURATION
|
2021-05-09 23:55:50 +03:00
|
|
|
| "<=", 0x24 ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
LESSER_EQUAL_MONEY
|
|
|
|
| '<', 0x24 ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
LESSER_MONEY
|
|
|
|
| ">=", 0x24 ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
GREATER_EQUAL_MONEY
|
|
|
|
| '>', 0x24 ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
GREATER_MONEY
|
|
|
|
| '+', 0x24 ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
PLUSMONEY
|
|
|
|
| '-', 0x24 ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
MINUSMONEY
|
|
|
|
| '*', 0x24 ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
MULTMONEY
|
|
|
|
| '/', 0x24 ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
DIVMONEY
|
|
|
|
| "<=." ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
LESSER_EQUAL_DEC
|
|
|
|
| "<." ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
LESSER_DEC
|
|
|
|
| ">=." ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
GREATER_EQUAL_DEC
|
|
|
|
| ">." ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
GREATER_DEC
|
|
|
|
| "+." ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
PLUSDEC
|
|
|
|
| "-." ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
MINUSDEC
|
|
|
|
| "*." ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
MULTDEC
|
|
|
|
| "/." ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
DIVDEC
|
|
|
|
| "<=" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
LESSER_EQUAL
|
|
|
|
| '<' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
LESSER
|
|
|
|
| ">=" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
GREATER_EQUAL
|
|
|
|
| '>' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
GREATER
|
|
|
|
| '+' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
PLUS
|
|
|
|
| '-' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
MINUS
|
|
|
|
| '*' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
MULT
|
|
|
|
| '/' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
DIV
|
|
|
|
| "!=" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
NOT_EQUAL
|
|
|
|
| '=' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
EQUAL
|
|
|
|
| '%' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
PERCENT
|
|
|
|
| '(' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
LPAREN
|
|
|
|
| ')' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
RPAREN
|
|
|
|
| '{' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
LBRACKET
|
|
|
|
| '}' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
RBRACKET
|
|
|
|
| '[' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
LSQUARE
|
|
|
|
| ']' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
RSQUARE
|
|
|
|
| '|' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
VERTICAL
|
|
|
|
| ':' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
COLON
|
|
|
|
| ';' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
SEMICOLON
|
|
|
|
| "--" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
ALT
|
2021-07-08 17:27:46 +03:00
|
|
|
| "++" ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
CONCAT
|
2021-05-09 23:55:50 +03:00
|
|
|
| '.' ->
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
DOT
|
2021-05-24 17:23:45 +03:00
|
|
|
| uppercase, Star (uppercase | lowercase | digit | '_' | '\'') ->
|
2021-05-09 23:55:50 +03:00
|
|
|
(* Name of constructor *)
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
CONSTRUCTOR (Utf8.lexeme lexbuf)
|
2021-05-24 17:23:45 +03:00
|
|
|
| lowercase, Star (lowercase | uppercase | digit | '_' | '\'') ->
|
2021-05-09 23:55:50 +03:00
|
|
|
(* Name of variable *)
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
IDENT (Utf8.lexeme lexbuf)
|
2021-05-24 17:23:45 +03:00
|
|
|
| Plus digit ->
|
2021-05-09 23:55:50 +03:00
|
|
|
(* Integer literal*)
|
|
|
|
L.update_acc lexbuf;
|
|
|
|
INT_LITERAL (Runtime.integer_of_string (Utf8.lexeme lexbuf))
|
|
|
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
|
|
|
|
2021-08-17 16:49:48 +03:00
|
|
|
let rec lex_directive_args (lexbuf : lexbuf) : token =
|
2021-05-09 23:55:50 +03:00
|
|
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
|
|
|
let prev_pos = lexing_positions lexbuf in
|
|
|
|
match%sedlex lexbuf with
|
2021-08-17 16:49:48 +03:00
|
|
|
| '@', Star hspace, "p.", Star hspace, Plus digit ->
|
|
|
|
let s = Utf8.lexeme lexbuf in
|
|
|
|
let i = String.index s '.' in
|
|
|
|
AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
|
|
|
|
| Compl (white_space | '@'), Star (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
|
|
|
|
| Plus hspace -> lex_directive_args lexbuf
|
|
|
|
| '\n' | eof ->
|
|
|
|
L.context := Law;
|
|
|
|
END_DIRECTIVE
|
|
|
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
2021-05-09 23:55:50 +03:00
|
|
|
|
2021-08-17 16:49:48 +03:00
|
|
|
let rec lex_directive (lexbuf : lexbuf) : token =
|
|
|
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
|
|
|
let prev_pos = lexing_positions lexbuf in
|
|
|
|
match%sedlex lexbuf with
|
|
|
|
| Plus hspace -> lex_directive lexbuf
|
|
|
|
| "Poczatek", Plus hspace, "metadanych" -> BEGIN_METADATA
|
|
|
|
| "Koniec", Plus hspace, "metadanych" -> END_METADATA
|
|
|
|
| "Include", Star hspace -> LAW_INCLUDE
|
|
|
|
| ":" ->
|
|
|
|
L.context := Directive_args;
|
|
|
|
COLON
|
|
|
|
| '\n' | eof ->
|
|
|
|
L.context := Law;
|
|
|
|
END_DIRECTIVE
|
2021-05-09 23:55:50 +03:00
|
|
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
|
|
|
|
2021-08-17 16:49:48 +03:00
|
|
|
(** Main lexing function used outside code blocks *)
|
|
|
|
and lex_law (lexbuf : lexbuf) : token =
|
|
|
|
let prev_lexeme = Utf8.lexeme lexbuf in
|
|
|
|
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
|
|
|
|
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
|
|
|
|
if at_bol then
|
|
|
|
match%sedlex lexbuf with
|
|
|
|
| eof -> EOF
|
|
|
|
| "```catala", Plus white_space ->
|
|
|
|
L.context := Code;
|
|
|
|
Buffer.clear L.code_buffer;
|
|
|
|
BEGIN_CODE
|
|
|
|
| '>', Star hspace ->
|
|
|
|
L.context := Directive;
|
|
|
|
BEGIN_DIRECTIVE
|
|
|
|
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
|
|
|
|
L.get_law_heading lexbuf
|
|
|
|
| _ -> (
|
|
|
|
(* Nested match for lower priority; `_` matches length 0 *)
|
|
|
|
let lexbuf = lexbuf in
|
|
|
|
(* workaround sedlex bug, see https://github.com/ocaml-community/sedlex/issues/12 *)
|
|
|
|
match%sedlex lexbuf with
|
|
|
|
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
|
|
|
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
|
|
|
|
else
|
|
|
|
match%sedlex lexbuf with
|
|
|
|
| eof -> EOF
|
|
|
|
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
|
|
|
|
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
|
|
|
|
|
2021-05-09 23:55:50 +03:00
|
|
|
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending of {!val:
|
2021-05-26 18:39:39 +03:00
|
|
|
Surface.Lexer_common.is_code}. *)
|
2021-08-17 16:49:48 +03:00
|
|
|
let lexer (lexbuf : lexbuf) : token =
|
|
|
|
match !L.context with
|
|
|
|
| Law -> lex_law lexbuf
|
|
|
|
| Code -> lex_code lexbuf
|
|
|
|
| Directive -> lex_directive lexbuf
|
|
|
|
| Directive_args -> lex_directive_args lexbuf
|