2021-03-09 22:57:41 +03:00
|
|
|
(* This file is part of the Catala compiler, a specification language for tax and social benefits
|
2021-05-27 19:56:47 +03:00
|
|
|
computation rules. Copyright (C) 2020 Inria, contributors: Denis Merigoux
|
|
|
|
<denis.merigoux@inria.fr>, Emile Rolley <emile.rolley@tuta.io>
|
2021-03-09 22:57:41 +03:00
|
|
|
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
|
|
|
|
in compliance with the License. You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software distributed under the License
|
|
|
|
is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
|
|
or implied. See the License for the specific language governing permissions and limitations under
|
|
|
|
the License. *)
|
|
|
|
|
2021-04-30 10:59:09 +03:00
|
|
|
open Tokens
|
2021-03-09 22:57:41 +03:00
|
|
|
open Sedlexing
|
2021-05-26 18:39:39 +03:00
|
|
|
open Utils
|
2021-03-09 22:57:41 +03:00
|
|
|
module R = Re.Pcre
|
|
|
|
|
2021-03-10 00:04:36 +03:00
|
|
|
(** Calculates the precedence according to a {!val: matched_regex} of the form: '[#]+'.

    @note [- 1] (not -2 as previously stated) because a [LAW_HEADING] starts with at least one "#"
    and the number of ['#'] remaining corresponds to the precedence: "#" yields 0, "##" yields 1,
    and so on. *)
let calc_precedence (matched_regex : string) : int = String.length matched_regex - 1
|
2021-03-09 22:57:41 +03:00
|
|
|
|
2021-03-09 23:01:24 +03:00
|
|
|
(* Builds the [LAW_HEADING] token from the current {!val: lexbuf} contents. *)
let get_law_heading (lexbuf : lexbuf) : token =
  (* Captures: 1 = the '#' run, 2 = the title, 4 = optional article id,
     6 = optional expiration date (YYYY-MM-DD). *)
  let heading_rex =
    R.regexp "([#]+)\\s*([^\\|]+)(\\|([^\\|]+)|)(\\|\\s*([0-9]{4}\\-[0-9]{2}\\-[0-9]{2})|)"
  in
  let group = R.get_substring (R.exec ~rex:heading_rex (Utf8.lexeme lexbuf)) in
  (* An optional capture group raises [Not_found] when it did not participate
     in the match; map that to [None]. *)
  let trimmed_opt n = try Some (String.trim (group n)) with Not_found -> None in
  let title = String.trim (group 2) in
  let article_id = trimmed_opt 4 in
  let article_expiration_date = trimmed_opt 6 in
  let precedence = calc_precedence (String.trim (group 1)) in
  LAW_HEADING (title, article_id, article_expiration_date, precedence)
|
2021-05-26 18:39:39 +03:00
|
|
|
|
|
|
|
(** Mutable flag holding the lexer's state: [true] while lexing a code block,
    [false] while lexing law text. Starts out in law mode. *)
let is_code : bool ref = ref false
|
|
|
|
|
|
|
|
(** Accumulator for the raw text of the code body currently being lexed. The
    literate programming backends read this back to reproduce the exact spacing
    of the original program. Starts empty. *)
let code_string_acc : string ref = ref ""
|
|
|
|
|
|
|
|
(** Appends the current lexeme of [lexbuf] to {!val: code_string_acc}. *)
let update_acc (lexbuf : lexbuf) : unit =
  let lexeme = Utf8.lexeme lexbuf in
  code_string_acc := !code_string_acc ^ lexeme
|
|
|
|
|
|
|
|
(** Raises a spanned error reporting that the input after [token] at [loc]
    could not be lexed. Never returns. *)
let raise_lexer_error (loc : Pos.t) (token : string) =
  let message =
    Printf.sprintf "Parsing error after token \"%s\": what comes after is unknown" token
  in
  Errors.raise_spanned_error message loc
|
|
|
|
|
|
|
|
(** Associative list matching each punctuation string part of the Catala syntax with its {!module:
    Surface.Parser} token. Same for all the input languages (English, French, etc.) *)
let token_list_language_agnostic : (string * token) list =
  [
    (".", DOT);
    ("<=", LESSER_EQUAL);
    (">=", GREATER_EQUAL);
    (">", GREATER);
    ("!=", NOT_EQUAL);
    ("=", EQUAL);
    ("(", LPAREN);
    (")", RPAREN);
    ("{", LBRACKET);
    ("}", RBRACKET);
    (* Bug fix: these two entries previously reused "{" and "}", duplicating the
       LBRACKET/RBRACKET keys above — an assoc lookup could never reach them.
       Square-bracket tokens belong to "[" and "]". *)
    ("[", LSQUARE);
    ("]", RSQUARE);
    ("+", PLUS);
    ("-", MINUS);
    ("*", MULT);
    ("/", DIV);
    ("|", VERTICAL);
    (":", COLON);
    (";", SEMICOLON);
    ("--", ALT);
    ("++", CONCAT);
  ]
|
|
|
|
|
|
|
|
(** Signature of a lexer specialized to one input language (English, French, etc.).
    Implementations share the language-agnostic punctuation tokens and add
    language-specific keywords and builtins. *)
module type LocalisedLexer = sig
  val token_list : (string * Tokens.token) list
  (** Same as {!val: token_list_language_agnostic}, but with tokens specialized to a given language. *)

  val builtins : (string * Ast.builtin_expression) list
  (** Associative list mapping each builtin's name to its corresponding
      {!type: Ast.builtin_expression}. *)

  val lex_code : Sedlexing.lexbuf -> Tokens.token
  (** Main lexing function used in code blocks *)

  val lex_law : Sedlexing.lexbuf -> Tokens.token
  (** Main lexing function used outside code blocks *)

  val lexer : Sedlexing.lexbuf -> Tokens.token
  (** Entry point of the lexer, distributes to {!val: lex_code} or {!val: lex_law} depending on
      {!val: Surface.Lexer_common.is_code}. *)
end
|