2021-03-09 22:57:41 +03:00
|
|
|
(* This file is part of the Catala compiler, a specification language for tax and social benefits
|
2021-05-27 19:56:47 +03:00
|
|
|
computation rules. Copyright (C) 2020 Inria, contributors: Denis Merigoux
|
|
|
|
<denis.merigoux@inria.fr>, Emile Rolley <emile.rolley@tuta.io>
|
2021-03-09 22:57:41 +03:00
|
|
|
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
|
|
|
|
in compliance with the License. You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software distributed under the License
|
|
|
|
is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
|
|
or implied. See the License for the specific language governing permissions and limitations under
|
|
|
|
the License. *)
|
|
|
|
|
2021-04-30 10:59:09 +03:00
|
|
|
open Tokens
|
2021-03-09 22:57:41 +03:00
|
|
|
open Sedlexing
|
2021-05-26 18:39:39 +03:00
|
|
|
open Utils
|
2021-03-09 22:57:41 +03:00
|
|
|
module R = Re.Pcre
|
|
|
|
|
2021-03-10 00:04:36 +03:00
|
|
|
(** Calculates the precedence according to a {!val: matched_regex} of the form: '[#]+'.

    @note [- 1] (not -2 as previously stated) because a [LAW_HEADING] starts with at least one "#"
    and the number of ['#'] remaining corresponds to the precedence: "#" yields 0, "##" yields 1,
    and so on. *)
let calc_precedence (matched_regex : string) : int = String.length matched_regex - 1
|
2021-03-09 22:57:41 +03:00
|
|
|
|
2021-03-09 23:01:24 +03:00
|
|
|
(* Builds the [LAW_HEADING] token from the current {!val: lexbuf} contents. *)
let get_law_heading (lexbuf : lexbuf) : token =
  (* Captures: 1 = the '#' run, 2 = the title, 4 = optional article id,
     6 = optional expiration date (YYYY-MM-DD). *)
  let heading_rex =
    R.regexp "([#]+)\\s*([^\\|]+)(\\|([^\\|]+)|)(\\|\\s*([0-9]{4}\\-[0-9]{2}\\-[0-9]{2})|)"
  in
  let group = R.get_substring (R.exec ~rex:heading_rex (Utf8.lexeme lexbuf)) in
  (* An optional capture group raises [Not_found] when it did not participate
     in the match; map that to [None]. *)
  let trimmed_opt n = try Some (String.trim (group n)) with Not_found -> None in
  let title = String.trim (group 2) in
  let article_id = trimmed_opt 4 in
  let article_expiration_date = trimmed_opt 6 in
  let precedence = calc_precedence (String.trim (group 1)) in
  LAW_HEADING (title, article_id, article_expiration_date, precedence)
|
2021-05-26 18:39:39 +03:00
|
|
|
|
|
|
|
(** Mutable flag holding the lexer's state: [true] while lexing a code block,
    [false] while lexing law text. Starts out in law mode. *)
let is_code : bool ref = ref false
|
|
|
|
|
|
|
|
(** Accumulator for the raw text of the code body currently being lexed. The
    literate programming backends read this back to reproduce the exact spacing
    of the original program. Starts empty. *)
let code_string_acc : string ref = ref ""
|
|
|
|
|
|
|
|
(** Appends the current lexeme of [lexbuf] to {!val: code_string_acc}. *)
let update_acc (lexbuf : lexbuf) : unit =
  let lexeme = Utf8.lexeme lexbuf in
  code_string_acc := !code_string_acc ^ lexeme
|
|
|
|
|
|
|
|
(** Raises a spanned error reporting that the input after [token] at [loc]
    could not be lexed. Never returns. *)
let raise_lexer_error (loc : Pos.t) (token : string) =
  let message =
    Printf.sprintf "Parsing error after token \"%s\": what comes after is unknown" token
  in
  Errors.raise_spanned_error message loc
|
|
|
|
|
|
|
|
(** Associative list matching each punctuation string part of the Catala syntax with its {!module:
    Surface.Parser} token. Same for all the input languages (English, French, etc.) *)
let token_list_language_agnostic : (string * token) list =
  [
    (".", DOT);
    ("<=", LESSER_EQUAL);
    (">=", GREATER_EQUAL);
    (">", GREATER);
    ("!=", NOT_EQUAL);
    ("=", EQUAL);
    ("(", LPAREN);
    (")", RPAREN);
    ("{", LBRACKET);
    ("}", RBRACKET);
    (* Bug fix: these two entries previously reused "{" and "}", duplicating the
       LBRACKET/RBRACKET keys above — an assoc lookup could never reach them.
       Square-bracket tokens belong to "[" and "]". *)
    ("[", LSQUARE);
    ("]", RSQUARE);
    ("+", PLUS);
    ("-", MINUS);
    ("*", MULT);
    ("/", DIV);
    ("|", VERTICAL);
    (":", COLON);
    (";", SEMICOLON);
    ("--", ALT);
    ("++", CONCAT);
  ]
|
|
|
|
|
|
|
|
(** Signature of a lexer specialized to one input language (English, French, etc.).
    Implementations share the language-agnostic punctuation tokens and add
    language-specific keywords and builtins. *)
module type LocalisedLexer = sig
  val token_list : (string * Tokens.token) list
  (** Same as {!val: token_list_language_agnostic}, but with tokens specialized to a given language. *)

  val builtins : (string * Ast.builtin_expression) list
  (** Associative list mapping each builtin's name to its corresponding
      {!type: Ast.builtin_expression}. *)

  val lex_code : Sedlexing.lexbuf -> Tokens.token
  (** Main lexing function used in code blocks *)

  val lex_law : Sedlexing.lexbuf -> Tokens.token
  (** Main lexing function used outside code blocks *)

  val lexer : Sedlexing.lexbuf -> Tokens.token
  (** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending on
      {!val: Surface.Lexer_common.is_code}. *)
end
|