catala/compiler/surface/lexer.cppo.ml

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

908 lines
24 KiB
OCaml
Raw Permalink Normal View History

2021-05-09 23:55:50 +03:00
(* This file is part of the Catala compiler, a specification language for tax and social benefits
computation rules. Copyright (C) 2020 Inria, contributors: Denis Merigoux
<denis.merigoux@inria.fr>, Emile Rolley <emile.rolley@tuta.io>
2021-05-09 23:55:50 +03:00
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License
is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
or implied. See the License for the specific language governing permissions and limitations under
the License. *)
open Tokens
open Sedlexing
2022-11-21 12:46:17 +03:00
open Catala_utils
module L = Lexer_common
2021-05-09 23:55:50 +03:00
module R = Re.Pcre
2021-08-19 19:26:06 +03:00
(* The localised strings and regexps for the tokens and specific parsing rules
are defined as CPPO macros in the `lexer_XX.cppo.ml` files.
- `MS_*` macros define token strings
- `MR_*` are sedlex regexps matching the token (inferred from the strings if absent,
but should be present for any token containing spacing, and for any non-latin1
character)
- `MX_*` are full matching rules of the form `sedlex regexp -> ocaml expression`
*)
(* Avoid the need for defining the regexps when they are simple strings *)
#ifndef MR_SCOPE
#define MR_SCOPE MS_SCOPE
#endif
#ifndef MR_CONSEQUENCE
#define MR_CONSEQUENCE MS_CONSEQUENCE
#endif
#ifndef MR_DATA
#define MR_DATA MS_DATA
#endif
#ifndef MR_DEPENDS
#define MR_DEPENDS MS_DEPENDS
#endif
#ifndef MR_DECLARATION
#define MR_DECLARATION MS_DECLARATION
#endif
#ifndef MR_CONTEXT
#define MR_CONTEXT MS_CONTEXT
#endif
#ifndef MR_DECREASING
#define MR_DECREASING MS_DECREASING
#endif
#ifndef MR_INCREASING
#define MR_INCREASING MS_INCREASING
#endif
#ifndef MR_OF
#define MR_OF MS_OF
#endif
#ifndef MR_LIST
#define MR_LIST MS_LIST
2021-08-19 19:26:06 +03:00
#endif
#ifndef MR_CONTAINS
#define MR_CONTAINS MS_CONTAINS
#endif
2021-08-19 19:26:06 +03:00
#ifndef MR_ENUM
#define MR_ENUM MS_ENUM
#endif
#ifndef MR_INTEGER
#define MR_INTEGER MS_INTEGER
#endif
#ifndef MR_MONEY
#define MR_MONEY MS_MONEY
#endif
#ifndef MR_TEXT
#define MR_TEXT MS_TEXT
#endif
#ifndef MR_DECIMAL
#define MR_DECIMAL MS_DECIMAL
#endif
#ifndef MR_DATE
#define MR_DATE MS_DATE
#endif
#ifndef MR_DURATION
#define MR_DURATION MS_DURATION
#endif
#ifndef MR_BOOLEAN
#define MR_BOOLEAN MS_BOOLEAN
#endif
#ifndef MR_SUM
#define MR_SUM MS_SUM
#endif
#ifndef MR_FILLED
#define MR_FILLED MS_FILLED
#endif
#ifndef MR_DEFINITION
#define MR_DEFINITION MS_DEFINITION
#endif
2022-02-28 16:33:07 +03:00
#ifndef MR_STATE
#define MR_STATE MS_STATE
#endif
2021-08-19 19:26:06 +03:00
#ifndef MR_LABEL
#define MR_LABEL MS_LABEL
#endif
#ifndef MR_EXCEPTION
#define MR_EXCEPTION MS_EXCEPTION
#endif
#ifndef MR_DEFINED_AS
#define MR_DEFINED_AS MS_DEFINED_AS
#endif
#ifndef MR_MATCH
#define MR_MATCH MS_MATCH
#endif
#ifndef MR_WILDCARD
#define MR_WILDCARD MS_WILDCARD
#endif
#ifndef MR_WITH
#define MR_WITH MS_WITH
#endif
#ifndef MR_UNDER_CONDITION
#define MR_UNDER_CONDITION MS_UNDER_CONDITION
#endif
#ifndef MR_IF
#define MR_IF MS_IF
#endif
#ifndef MR_THEN
#define MR_THEN MS_THEN
#endif
#ifndef MR_ELSE
#define MR_ELSE MS_ELSE
#endif
#ifndef MR_CONDITION
#define MR_CONDITION MS_CONDITION
#endif
#ifndef MR_CONTENT
#define MR_CONTENT MS_CONTENT
#endif
#ifndef MR_STRUCT
#define MR_STRUCT MS_STRUCT
#endif
#ifndef MR_ASSERTION
#define MR_ASSERTION MS_ASSERTION
#endif
#ifndef MR_WITH_V
#define MR_WITH_V MS_WITH_V
#endif
#ifndef MR_FOR
#define MR_FOR MS_FOR
#endif
#ifndef MR_ALL
#define MR_ALL MS_ALL
#endif
#ifndef MR_WE_HAVE
#define MR_WE_HAVE MS_WE_HAVE
#endif
#ifndef MR_RULE
#define MR_RULE MS_RULE
#endif
2022-07-26 14:40:43 +03:00
#ifndef MR_LET
#define MR_LET MS_LET
#endif
2021-08-19 19:26:06 +03:00
#ifndef MR_EXISTS
#define MR_EXISTS MS_EXISTS
#endif
#ifndef MR_IN
#define MR_IN MS_IN
#endif
#ifndef MR_AMONG
#define MR_AMONG MS_AMONG
#endif
2021-08-19 19:26:06 +03:00
#ifndef MR_SUCH
#define MR_SUCH MS_SUCH
#endif
#ifndef MR_THAT
#define MR_THAT MS_THAT
#endif
#ifndef MR_AND
#define MR_AND MS_AND
#endif
#ifndef MR_OR
#define MR_OR MS_OR
#endif
#ifndef MR_XOR
#define MR_XOR MS_XOR
#endif
#ifndef MR_NOT
#define MR_NOT MS_NOT
#endif
#ifndef MR_MAXIMUM
#define MR_MAXIMUM MS_MAXIMUM
#endif
#ifndef MR_MINIMUM
#define MR_MINIMUM MS_MINIMUM
#endif
2022-12-12 18:02:07 +03:00
#ifndef MR_IS
#define MR_IS MS_IS
2021-08-19 19:26:06 +03:00
#endif
#ifndef MR_LIST_EMPTY
#define MR_LIST_EMPTY MS_LIST_EMPTY
2021-08-19 19:26:06 +03:00
#endif
#ifndef MR_BUT_REPLACE
#define MR_BUT_REPLACE MS_BUT_REPLACE
#endif
2024-10-22 18:07:07 +03:00
#ifndef MR_COMBINE
#define MR_COMBINE MS_COMBINE
#endif
#ifndef MR_INITIALLY
#define MR_INITIALLY MS_INITIALLY
#endif
2021-08-19 19:26:06 +03:00
#ifndef MR_CARDINAL
#define MR_CARDINAL MS_CARDINAL
#endif
#ifndef MR_YEAR
#define MR_YEAR MS_YEAR
#endif
#ifndef MR_MONTH
#define MR_MONTH MS_MONTH
#endif
#ifndef MR_DAY
#define MR_DAY MS_DAY
#endif
#ifndef MR_TRUE
#define MR_TRUE MS_TRUE
#endif
#ifndef MR_FALSE
#define MR_FALSE MS_FALSE
#endif
#ifndef MR_Round
#define MR_Round MS_Round
2022-04-29 22:18:15 +03:00
#endif
2021-08-19 19:26:06 +03:00
#ifndef MR_GetDay
#define MR_GetDay MS_GetDay
#endif
#ifndef MR_GetMonth
#define MR_GetMonth MS_GetMonth
#endif
#ifndef MR_GetYear
#define MR_GetYear MS_GetYear
#endif
#ifndef MR_FirstDayOfMonth
#define MR_FirstDayOfMonth MS_FirstDayOfMonth
#endif
#ifndef MR_LastDayOfMonth
#define MR_LastDayOfMonth MS_LastDayOfMonth
#endif
#ifndef MR_INPUT
#define MR_INPUT MS_INPUT
#endif
#ifndef MR_OUTPUT
#define MR_OUTPUT MS_OUTPUT
#endif
#ifndef MR_INTERNAL
#define MR_INTERNAL MS_INTERNAL
#endif
#ifndef MR_MONEY_OP_SUFFIX
#define MR_MONEY_OP_SUFFIX MS_MONEY_OP_SUFFIX
#endif
2021-08-19 19:26:06 +03:00
2021-05-09 23:55:50 +03:00
let token_list : (string * token) list =
[
2021-08-19 19:26:06 +03:00
(MS_SCOPE, SCOPE);
(MS_CONSEQUENCE, CONSEQUENCE);
(MS_DATA, DATA);
(MS_DEPENDS, DEPENDS);
(MS_DECLARATION, DECLARATION);
(MS_CONTEXT, CONTEXT);
(MS_DECREASING, DECREASING);
(MS_INCREASING, INCREASING);
(MS_OF, OF);
(MS_LIST, LIST);
(MS_CONTAINS, CONTAINS);
2021-08-19 19:26:06 +03:00
(MS_ENUM, ENUM);
(MS_INTEGER, INTEGER);
(MS_MONEY, MONEY);
(MS_TEXT, TEXT);
(MS_DECIMAL, DECIMAL);
(MS_DATE, DATE);
(MS_DURATION, DURATION);
(MS_BOOLEAN, BOOLEAN);
(MS_SUM, SUM);
(MS_FILLED, FILLED);
(MS_DEFINITION, DEFINITION);
2022-02-28 16:33:07 +03:00
(MS_STATE, STATE);
2021-08-19 19:26:06 +03:00
(MS_LABEL, LABEL);
(MS_EXCEPTION, EXCEPTION);
(MS_DEFINED_AS, DEFINED_AS);
(MS_MATCH, MATCH);
(MS_WILDCARD, WILDCARD);
(MS_WITH, WITH);
(MS_UNDER_CONDITION, UNDER_CONDITION);
(MS_IF, IF);
(MS_THEN, THEN);
(MS_ELSE, ELSE);
(MS_CONDITION, CONDITION);
(MS_CONTENT, CONTENT);
(MS_STRUCT, STRUCT);
(MS_ASSERTION, ASSERTION);
(MS_WITH_V, WITH_V);
(MS_FOR, FOR);
(MS_ALL, ALL);
(MS_WE_HAVE, WE_HAVE);
(MS_RULE, RULE);
2022-07-26 14:40:43 +03:00
(MS_LET, LET);
2021-08-19 19:26:06 +03:00
(MS_EXISTS, EXISTS);
(MS_IN, IN);
(MS_AMONG, AMONG);
2021-08-19 19:26:06 +03:00
(MS_SUCH, SUCH);
(MS_THAT, THAT);
(MS_AND, AND);
(MS_OR, OR);
(MS_XOR, XOR);
(MS_NOT, NOT);
(MS_MAXIMUM, MAXIMUM);
(MS_MINIMUM, MINIMUM);
2022-12-12 18:02:07 +03:00
(MS_IS, IS);
(MS_LIST_EMPTY, LIST_EMPTY);
(MS_BUT_REPLACE, BUT_REPLACE);
2024-10-22 18:07:07 +03:00
(MS_COMBINE, COMBINE);
(MS_INITIALLY, INITIALLY);
2021-08-19 19:26:06 +03:00
(MS_CARDINAL, CARDINAL);
(MS_YEAR, YEAR);
(MS_MONTH, MONTH);
(MS_DAY, DAY);
(MS_TRUE, TRUE);
(MS_FALSE, FALSE);
(MS_INPUT, INPUT);
(MS_OUTPUT, OUTPUT);
(MS_INTERNAL, INTERNAL)
2021-05-09 23:55:50 +03:00
]
@ L.token_list_language_agnostic
2021-08-19 19:26:06 +03:00
(** Localised builtin functions *)
let lex_builtin (s : string) : Ast.builtin_expression option =
let lexbuf = Utf8.from_string s in
match%sedlex lexbuf with
| MR_Round, eof -> Some Round
2021-08-19 21:41:34 +03:00
| MR_GetDay, eof -> Some GetDay
| MR_GetMonth, eof -> Some GetMonth
| MR_GetYear, eof -> Some GetYear
| MR_FirstDayOfMonth -> Some FirstDayOfMonth
| MR_LastDayOfMonth -> Some LastDayOfMonth
2021-08-19 19:26:06 +03:00
| _ -> None
2021-05-09 23:55:50 +03:00
(** Regexp matching any digit character.
@note can not be used outside the current module (@see <
https://github.com/ocaml-community/sedlex#lexer-specifications >). *)
let digit = [%sedlex.regexp? '0' .. '9']
(** Regexp matching at least one space. *)
let space_plus = [%sedlex.regexp? Plus white_space]
(** characters that can be present in idents (excluding first char) *)
let idchar = [%sedlex.regexp? uppercase | lowercase | digit | '_' | '\'']
(** Regexp matching white space but not newlines *)
let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
(** Operator explicit typing suffix chars *)
let op_kind_re = [%sedlex.regexp? "" | MR_MONEY_OP_SUFFIX | Chars "!.@^"]
let op_kind = function
| "" -> Ast.KPoly
| "!" -> Ast.KInt
| "." -> Ast.KDec
| MS_MONEY_OP_SUFFIX -> Ast.KMoney
| "@" -> Ast.KDate
| "^" -> Ast.KDuration
| _ -> invalid_arg "op_kind"
2021-05-09 23:55:50 +03:00
(** Main lexing function used in code blocks *)
let rec lex_code (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| white_space ->
(* Whitespaces *)
L.update_acc lexbuf;
lex_code lexbuf
| '#', Star (Compl '\n'), '\n' ->
(* Comments *)
L.update_acc lexbuf;
lex_code lexbuf
| "```" ->
(* End of code section *)
L.context := Law;
END_CODE (L.flush_acc ())
2021-08-19 19:26:06 +03:00
| MR_SCOPE ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
SCOPE
2021-08-19 19:26:06 +03:00
| MR_DATA ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
DATA
2021-08-19 19:26:06 +03:00
| MR_DEPENDS ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
DEPENDS
2021-08-19 19:26:06 +03:00
| MR_DECLARATION ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
DECLARATION
2021-08-19 19:26:06 +03:00
| MR_CONTEXT ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
CONTEXT
| MR_INPUT ->
L.update_acc lexbuf;
INPUT
| MR_OUTPUT ->
L.update_acc lexbuf;
OUTPUT
| MR_INTERNAL ->
L.update_acc lexbuf;
INTERNAL
2021-08-19 19:26:06 +03:00
| MR_DECREASING ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
DECREASING
2021-08-19 19:26:06 +03:00
| MR_INCREASING ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
INCREASING
2021-08-19 19:26:06 +03:00
| MR_OF ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
OF
| MR_LIST ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
LIST
| MR_CONTAINS ->
L.update_acc lexbuf;
CONTAINS
2021-08-19 19:26:06 +03:00
| MR_ENUM ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
ENUM
2021-08-19 19:26:06 +03:00
| MR_INTEGER ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
INTEGER
2021-08-19 19:26:06 +03:00
| MR_MONEY ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
MONEY
2021-08-19 19:26:06 +03:00
| MR_TEXT ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
TEXT
2021-08-19 19:26:06 +03:00
| MR_DECIMAL ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
DECIMAL
2021-08-19 19:26:06 +03:00
| MR_DATE ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
DATE
2021-08-19 19:26:06 +03:00
| MR_DURATION ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
DURATION
2021-08-19 19:26:06 +03:00
| MR_BOOLEAN ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
BOOLEAN
2021-08-19 19:26:06 +03:00
| MR_SUM ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
SUM
2021-08-19 19:26:06 +03:00
| MR_FILLED ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
FILLED
2021-08-19 19:26:06 +03:00
| MR_DEFINITION ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
DEFINITION
2022-02-28 16:33:07 +03:00
| MR_STATE ->
L.update_acc lexbuf;
STATE
2021-08-19 19:26:06 +03:00
| MR_LABEL ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
LABEL
2021-08-19 19:26:06 +03:00
| MR_EXCEPTION ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
EXCEPTION
2021-08-19 19:26:06 +03:00
| MR_DEFINED_AS ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
DEFINED_AS
2021-08-19 19:26:06 +03:00
| MR_MATCH ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
MATCH
2021-08-19 19:26:06 +03:00
| MR_WITH ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
WITH
2021-08-19 19:26:06 +03:00
| MR_WILDCARD ->
L.update_acc lexbuf;
WILDCARD
2021-08-19 19:26:06 +03:00
| MR_UNDER_CONDITION ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
UNDER_CONDITION
2021-08-19 19:26:06 +03:00
| MR_IF ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
IF
2021-08-19 19:26:06 +03:00
| MR_CONSEQUENCE ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
CONSEQUENCE
2021-08-19 19:26:06 +03:00
| MR_THEN ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
THEN
2021-08-19 19:26:06 +03:00
| MR_ELSE ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
ELSE
2021-08-19 19:26:06 +03:00
| MR_CONDITION ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
CONDITION
2021-08-19 19:26:06 +03:00
| MR_CONTENT ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
CONTENT
2021-08-19 19:26:06 +03:00
| MR_STRUCT ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
STRUCT
2021-08-19 19:26:06 +03:00
| MR_ASSERTION ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
ASSERTION
2021-08-19 19:26:06 +03:00
| MR_WITH_V ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
WITH_V
2021-08-19 19:26:06 +03:00
| MR_FOR ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
FOR
2021-08-19 19:26:06 +03:00
| MR_ALL ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
ALL
2021-08-19 19:26:06 +03:00
| MR_WE_HAVE ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
WE_HAVE
2021-08-19 19:26:06 +03:00
| MR_RULE ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
RULE
2022-07-26 14:40:43 +03:00
| MR_LET ->
L.update_acc lexbuf;
LET
2021-08-19 19:26:06 +03:00
| MR_EXISTS ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
EXISTS
2021-08-19 19:26:06 +03:00
| MR_IN ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
IN
| MR_AMONG ->
L.update_acc lexbuf;
AMONG
2021-08-19 19:26:06 +03:00
| MR_SUCH ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
SUCH
2021-08-19 19:26:06 +03:00
| MR_THAT ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
THAT
2021-08-19 19:26:06 +03:00
| MR_AND ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
AND
2021-08-19 19:26:06 +03:00
| MR_OR ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
OR
2021-08-19 19:26:06 +03:00
| MR_XOR ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
XOR
2021-08-19 19:26:06 +03:00
| MR_NOT ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
NOT
2021-08-19 19:26:06 +03:00
| MR_MAXIMUM ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
MAXIMUM
2021-08-19 19:26:06 +03:00
| MR_MINIMUM ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
MINIMUM
2022-12-12 18:02:07 +03:00
| MR_IS ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
2022-12-12 18:02:07 +03:00
IS
| MR_LIST_EMPTY ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
LIST_EMPTY
| MR_BUT_REPLACE ->
L.update_acc lexbuf;
BUT_REPLACE
2024-10-22 18:07:07 +03:00
| MR_COMBINE ->
L.update_acc lexbuf;
COMBINE
| MR_INITIALLY ->
L.update_acc lexbuf;
INITIALLY
2021-08-19 19:26:06 +03:00
| MR_CARDINAL ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
CARDINAL
2021-08-19 19:26:06 +03:00
| MR_TRUE ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
TRUE
2021-08-19 19:26:06 +03:00
| MR_FALSE ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
FALSE
2021-08-19 19:26:06 +03:00
| MR_YEAR ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
YEAR
2021-08-19 19:26:06 +03:00
| MR_MONTH ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
MONTH
2021-08-19 19:26:06 +03:00
| MR_DAY ->
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
DAY
| MR_MONEY_PREFIX, digit, Opt (Star (digit | MR_MONEY_DELIM), digit), Opt (MC_DECIMAL_SEPARATOR, Rep (digit, 0 .. 2)), MR_MONEY_SUFFIX ->
let s = Utf8.lexeme lexbuf in
let units = Buffer.create (String.length s) in
let cents = Buffer.create 2 in
let buf = ref units in
for i = 0 to String.length s - 1 do
match s.[i] with
| '0'..'9' as c -> Buffer.add_char !buf c
| MC_DECIMAL_SEPARATOR -> buf := cents
| _ -> ()
done;
(* If the user has written $0.3 it means 30 cents so we have to pad
with a 0 *)
Buffer.add_string cents (String.make (2 - Buffer.length cents) '0');
L.update_acc lexbuf;
2022-04-22 15:26:28 +03:00
MONEY_AMOUNT (Buffer.contents units, Buffer.contents cents)
| '|', Rep (digit, 4), '-', Rep (digit, 2), '-', Rep (digit, 2), '|' ->
2023-03-21 18:59:34 +03:00
L.update_acc lexbuf;
2021-08-19 21:32:23 +03:00
let rex =
Re.(compile @@ whole_string @@ seq [
char '|';
2022-07-21 15:14:22 +03:00
group (repn digit 4 None);
char '-';
group (repn digit 2 None);
char '-';
group (repn digit 2 None);
char '|';
2022-07-21 15:14:22 +03:00
])
in
let date_parts = R.get_substring (R.exec ~rex (Utf8.lexeme lexbuf)) in
DATE_LITERAL (
int_of_string (date_parts 1),
int_of_string (date_parts 2),
int_of_string (date_parts 3)
)
| Opt '-', Plus digit, MC_DECIMAL_SEPARATOR, Star digit ->
let rex =
Re.(compile @@ whole_string @@ seq [
group (seq [opt (char '-') ; rep1 digit]);
char MC_DECIMAL_SEPARATOR;
2021-08-19 21:32:23 +03:00
group (rep digit)
]) in
let dec_parts = R.get_substring (R.exec ~rex (Utf8.lexeme lexbuf)) in
L.update_acc lexbuf;
DECIMAL_LITERAL
2022-04-22 15:26:28 +03:00
(dec_parts 1, dec_parts 2)
| "<=", op_kind_re ->
let k = op_kind (String.remove_prefix ~prefix:"<=" (Utf8.lexeme lexbuf)) in
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
LESSER_EQUAL k
| "<", op_kind_re ->
let k = op_kind (String.remove_prefix ~prefix:"<" (Utf8.lexeme lexbuf)) in
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
LESSER k
| ">=", op_kind_re ->
let k = op_kind (String.remove_prefix ~prefix:">=" (Utf8.lexeme lexbuf)) in
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
GREATER_EQUAL k
| ">", op_kind_re ->
let k = op_kind (String.remove_prefix ~prefix:">" (Utf8.lexeme lexbuf)) in
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
GREATER k
| "-", op_kind_re ->
let k = op_kind (String.remove_prefix ~prefix:"-" (Utf8.lexeme lexbuf)) in
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
MINUS k
| "+", op_kind_re ->
let k = op_kind (String.remove_prefix ~prefix:"+" (Utf8.lexeme lexbuf)) in
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
PLUS k
| "*", op_kind_re ->
let k = op_kind (String.remove_prefix ~prefix:"*" (Utf8.lexeme lexbuf)) in
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
MULT k
| '/', op_kind_re ->
let k = op_kind (String.remove_prefix ~prefix:"/" (Utf8.lexeme lexbuf)) in
2021-05-09 23:55:50 +03:00
L.update_acc lexbuf;
DIV k
2021-05-09 23:55:50 +03:00
| "!=" ->
L.update_acc lexbuf;
NOT_EQUAL
| '=' ->
L.update_acc lexbuf;
EQUAL
| '%' ->
L.update_acc lexbuf;
PERCENT
| '(' ->
L.update_acc lexbuf;
LPAREN
| ')' ->
L.update_acc lexbuf;
RPAREN
| '{' ->
L.update_acc lexbuf;
LBRACE
2021-05-09 23:55:50 +03:00
| '}' ->
L.update_acc lexbuf;
RBRACE
2021-05-09 23:55:50 +03:00
| '[' ->
L.update_acc lexbuf;
LBRACKET
2021-05-09 23:55:50 +03:00
| ']' ->
L.update_acc lexbuf;
RBRACKET
2021-05-09 23:55:50 +03:00
| ':' ->
L.update_acc lexbuf;
COLON
| ';' ->
L.update_acc lexbuf;
SEMICOLON
| "--" ->
L.update_acc lexbuf;
ALT
| "++" ->
L.update_acc lexbuf;
2021-08-19 19:26:06 +03:00
PLUSPLUS
2021-05-09 23:55:50 +03:00
| '.' ->
L.update_acc lexbuf;
DOT
| ',' ->
L.update_acc lexbuf;
COMMA
| uppercase, Star idchar ->
2021-05-09 23:55:50 +03:00
(* Name of constructor *)
L.update_acc lexbuf;
UIDENT (Utf8.lexeme lexbuf)
| lowercase, Star idchar ->
2021-05-09 23:55:50 +03:00
(* Name of variable *)
L.update_acc lexbuf;
LIDENT (Utf8.lexeme lexbuf)
2022-07-21 15:14:22 +03:00
| Opt '-', Plus digit ->
2021-05-09 23:55:50 +03:00
(* Integer literal*)
L.update_acc lexbuf;
2022-04-22 15:26:28 +03:00
INT_LITERAL (Utf8.lexeme lexbuf)
2021-05-09 23:55:50 +03:00
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let rec lex_directive_args (lexbuf : lexbuf) : token =
2021-05-09 23:55:50 +03:00
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| '@', Star hspace, "p.", Star hspace, Plus digit ->
let s = Utf8.lexeme lexbuf in
let i = String.index s '.' in
AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
2023-09-05 16:00:55 +03:00
| MR_MODULE_ALIAS -> MODULE_ALIAS
| MR_EXTERNAL -> MODULE_EXTERNAL
| Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
| Plus hspace -> lex_directive_args lexbuf
| '\n' | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
2021-05-09 23:55:50 +03:00
let rec lex_directive (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let prev_pos = lexing_positions lexbuf in
match%sedlex lexbuf with
| Plus hspace -> lex_directive lexbuf
2021-08-19 19:26:06 +03:00
| MR_LAW_INCLUDE -> LAW_INCLUDE
| MR_MODULE_DEF -> L.context := Directive_args; MODULE_DEF
| MR_MODULE_USE -> L.context := Directive_args; MODULE_USE
| ":" ->
L.context := Directive_args;
COLON
| '\n' | eof ->
L.context := Law;
END_DIRECTIVE
2021-05-09 23:55:50 +03:00
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
let lex_raw (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
if at_bol then
match%sedlex lexbuf with
| eof -> EOF
| "```", Star hspace, ('\n' | eof) ->
L.context := Law;
LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> (
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
sub-match at the same point *)
match%sedlex lexbuf with
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
else
match%sedlex lexbuf with
| eof -> EOF
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Main lexing function used outside code blocks *)
let lex_law (lexbuf : lexbuf) : token =
let prev_lexeme = Utf8.lexeme lexbuf in
let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
if at_bol then
match%sedlex lexbuf with
| eof -> EOF
| "```catala", Star white_space, ('\n' | eof) ->
L.context := Code;
BEGIN_CODE
| "```catala-metadata", Star white_space, ('\n' | eof) ->
L.context := Code;
BEGIN_METADATA
| "```", Star (idchar | '-') ->
L.context := Raw;
LAW_TEXT (Utf8.lexeme lexbuf)
| '>' ->
L.context := Directive;
BEGIN_DIRECTIVE
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
L.get_law_heading lexbuf
| _ -> (
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
sub-match at the same point *)
match%sedlex lexbuf with
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
else
match%sedlex lexbuf with
| eof -> EOF
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
(** Entry point of the lexer, distributes to {!val: lex_code} or {!val:lex_law}
depending of the current {!val: Surface.Lexer_common.context}. *)
let lexer (lexbuf : lexbuf) : token =
match !L.context with
| Law -> lex_law lexbuf
| Raw -> lex_raw lexbuf
| Code -> lex_code lexbuf
| Directive -> lex_directive lexbuf
| Directive_args -> lex_directive_args lexbuf
| Inactive ->
Message.error ~internal:true "Lexer started outside of an initialised context."
(* -- Shallow lexing for dependency extraction -- *)
let line_test_id_re =
Re.(compile @@ seq [
char '{'; rep space; str "id"; rep space; char '='; rep space;
char '"'; group (seq [rep1 (diff any (char '"'))]); char '"';
])
let line_dir_arg_re =
Re.(compile @@ seq [
bos; char '>'; rep space; rep1 alpha;
rep1 any;
alt [space; char ':'];
2023-10-02 15:04:12 +03:00
group (rep1 (diff any space));
eol
])
(* This is a bit cheap, but we don't want a full-fledged parser to handle these
trivial line directives. Here we extract the first uppercase argument of a
directive line, which is guaranteed to match the module name we are
interested in and nothing else (e.g. in French, the module usage "keywords"
are multiple words) *)
2023-12-01 01:49:19 +03:00
let line_dir_arg_upcase_re =
Re.(compile @@ seq [
bos; char '>'; rep space; rep1 alpha;
rep (alt [space; lower]); space;
group (seq [rep1 upper; rep (diff any space)]);
rep any;
eol
])
let lex_line (lexbuf : lexbuf) : (string * L.line_token) option =
match%sedlex lexbuf with
| eof -> None
| "```catala-test-inline", Star hspace, ('\n' | eof) ->
Some (Utf8.lexeme lexbuf, LINE_INLINE_TEST)
| "```catala-test", Star (Compl '\n'), ('\n' | eof) ->
let str = Utf8.lexeme lexbuf in
(try
let id = Re.Group.get (Re.exec line_test_id_re str) 1 in
Some (str, LINE_TEST id)
with Not_found ->
Message.warning ~pos:(Pos.from_lpos (lexing_positions lexbuf))
"Ignored invalid test section, must have an explicit \
`{ id = \"name\" }` specification";
Some (str, LINE_ANY))
| "```", Star hspace, ('\n' | eof) ->
Some (Utf8.lexeme lexbuf, LINE_BLOCK_END)
| '>', Star hspace, MR_LAW_INCLUDE, Star hspace, ':', Plus (Compl '\n'),
('\n' | eof) ->
let str = Utf8.lexeme lexbuf in
(try
let file = Re.Group.get (Re.exec line_dir_arg_re str) 1 in
Some (str, LINE_INCLUDE file)
with Not_found -> Some (str, LINE_ANY))
| '>', Star hspace, MR_MODULE_DEF, Plus hspace,
uppercase, Star (Compl white_space), Plus hspace,
MR_EXTERNAL, Star hspace, ('\n' | eof) ->
let str = Utf8.lexeme lexbuf in
(try
let mdl = Re.Group.get (Re.exec line_dir_arg_upcase_re str) 1 in
Some (str, LINE_MODULE_DEF (mdl, true))
with Not_found -> Some (str, LINE_ANY))
| '>', Star hspace, MR_MODULE_DEF, Plus hspace, uppercase, Star (Compl '\n'),
('\n' | eof) ->
let str = Utf8.lexeme lexbuf in
(try
2023-12-01 01:49:19 +03:00
let mdl = Re.Group.get (Re.exec line_dir_arg_upcase_re str) 1 in
Some (str, LINE_MODULE_DEF (mdl, false))
with Not_found -> Some (str, LINE_ANY))
| '>', Star hspace, MR_MODULE_USE, Plus hspace, uppercase, Star (Compl '\n'),
('\n' | eof) ->
let str = Utf8.lexeme lexbuf in
(try
2023-12-01 01:49:19 +03:00
let mdl = Re.Group.get (Re.exec line_dir_arg_upcase_re str) 1 in
Some (str, LINE_MODULE_USE mdl)
with Not_found -> Some (str, LINE_ANY))
| Star (Compl '\n'), ('\n' | eof) -> Some (Utf8.lexeme lexbuf, LINE_ANY)
| _ -> assert false