(* This file is part of the Catala compiler, a specification language for tax
   and social benefits computation rules. Copyright (C) 2020 Inria,
   contributors: Denis Merigoux <denis.merigoux@inria.fr>, Emile Rolley
   <emile.rolley@tuta.io>

   Licensed under the Apache License, Version 2.0 (the "License"); you may not
   use this file except in compliance with the License. You may obtain a copy of
   the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
   WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
   License for the specific language governing permissions and limitations under
   the License. *)

open Tokens
open Sedlexing
open Catala_utils
module R = Re.Pcre

(* Calculates the precedence according to a {!val: matched_regex} of the form
   '[#]+'.

   @note The -1 is here because a [LAW_HEADING] starts with at least one "#",
   and the number of '#' remaining corresponds to the precedence. *)
let calc_precedence (matched_regex : string) : int =
  String.length matched_regex - 1
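
(* For instance (illustrative values only): [calc_precedence "#"] is [0] and
   [calc_precedence "###"] is [2], i.e. the precedence counts the '#' markers
   beyond the first one. *)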

(* Gets the [LAW_HEADING] token from the current {!val: lexbuf} *)
let get_law_heading (lexbuf : lexbuf) : token =
  let extract_article_title =
    R.regexp "([#]+)\\s*([^\\|]+)(\\|\\s*([^\\s]+)|)(\\s*(\\[archive\\])|)"
  in
  let rex = R.exec ~rex:extract_article_title (Utf8.lexeme lexbuf) in
  let title = String.trim (R.get_substring rex 2) in
  let article_id =
    try Some (String.trim (R.get_substring rex 4)) with Not_found -> None
  in
  let is_archive = Option.is_some (Re.Group.get_opt rex 6) in
  let precedence = calc_precedence (String.trim (R.get_substring rex 1)) in
  LAW_HEADING (title, article_id, is_archive, precedence)
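
(* A hedged illustration of the expected behaviour, derived from the regex
   above: lexing the heading line "## Article 1 | LEGI000042 [archive]" should
   yield [LAW_HEADING ("Article 1", Some "LEGI000042", true, 1)], while a bare
   "# Title" yields [LAW_HEADING ("Title", None, false, 0)]. *)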

type lexing_context = Law | Raw | Code | Directive | Directive_args | Inactive

(** Reference to the current {!type: lexing_context}, used by the lexer as its
    mutable state to know whether it is currently lexing law text, code or a
    directive. *)
let context : lexing_context ref = ref Inactive

(** Optional mutable buffer that accumulates the string representation of the
    body of code being lexed. This string representation is used in the
    literate programming backends to faithfully capture the spacing pattern of
    the original program. *)
let code_buffer : Buffer.t option ref = ref None

(** [with_lexing_context filename f] runs [f] in a fresh lexing context for
    [filename], restoring the previous {!val:context} and {!val:code_buffer}
    afterwards, and warns if the file ends inside an unclosed code block. *)
let with_lexing_context filename f =
  let saved_context = !context in
  let saved_buffer = !code_buffer in
  context := Law;
  code_buffer := Some (Buffer.create 4000);
  Fun.protect f ~finally:(fun () ->
      if
        !context <> Law
        || match !code_buffer with Some b -> Buffer.length b > 0 | _ -> false
      then
        Message.warning
          "Unclosed block or missing newline at the end of file %a.@ Did you \
           forget a @{<yellow>```@} ?"
          File.format filename;
      context := saved_context;
      code_buffer := saved_buffer)
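
(* A minimal usage sketch (hypothetical driver code, not part of this module):
   wrapping a whole parsing pass so that the lexer state is reset and restored
   even if the wrapped function raises:

     let parse_with_fresh_context filename run_parser =
       with_lexing_context filename (fun () -> run_parser ())

   where [run_parser] stands for an arbitrary parsing entry point. *)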

(** Updates {!val:code_buffer} with the current lexeme *)
let update_acc (lexbuf : lexbuf) : unit =
  match !code_buffer with
  | None ->
    Message.error ~internal:true "Lexer update outside of a lexing context"
  | Some buf -> Buffer.add_string buf (Utf8.lexeme lexbuf)

(** Returns the contents of {!val:code_buffer} and clears it *)
let flush_acc () =
  match !code_buffer with
  | None ->
    Message.error ~internal:true "Lexer update outside of a lexing context"
  | Some buf ->
    let s = Buffer.contents buf in
    Buffer.clear buf;
    s
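
(* Hedged sketch of the accumulate-then-flush protocol (the real call sites
   are in the per-language lexers and the parser driver, not in this module):
   each rule of the code lexer records its lexeme before returning its token,
   and the verbatim text of the block is retrieved when the block closes,
   roughly:

     let on_code_lexeme lexbuf token = update_acc lexbuf; token
     let on_block_end () = flush_acc ()

   where [on_code_lexeme] and [on_block_end] are hypothetical helpers shown
   only for illustration. *)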

exception Lexing_error of (Pos.t * string)

(** Error-generating helper *)
let raise_lexer_error (loc : Pos.t) (token : string) =
  raise (Lexing_error (loc, token))

(** Associative list matching each punctuation string part of the Catala syntax
    with its {!module: Surface.Parser} token. Same for all the input languages
    (English, French, etc.) *)
let token_list_language_agnostic : (string * token) list =
  [
    ".", DOT;
    "<=", LESSER_EQUAL KPoly;
    ">=", GREATER_EQUAL KPoly;
    ">", GREATER KPoly;
    "!=", NOT_EQUAL;
    "=", EQUAL;
    "(", LPAREN;
    ")", RPAREN;
    "{", LBRACE;
    "}", RBRACE;
    "[", LBRACKET;
    "]", RBRACKET;
    "+", PLUS KPoly;
    "-", MINUS KPoly;
    "*", MULT KPoly;
    "/", DIV KPoly;
    ":", COLON;
    ";", SEMICOLON;
    "--", ALT;
    "++", PLUSPLUS;
  ]

(** Tokens produced by the low-level, line-oriented lexer used for dependency
    extraction (see [lex_line] in {!module-type:LocalisedLexer} below). *)
type line_token =
  | LINE_TEST of string (* ```catala-test { id = xx } *)
  | LINE_INLINE_TEST (* ```catala-test-inline *)
  | LINE_BLOCK_END (* ``` *)
  | LINE_INCLUDE of string (* > Include foo.catala_en *)
  | LINE_MODULE_DEF of string * bool (* > Module Xxx [external] *)
  | LINE_MODULE_USE of string (* > Using Xxx [as Yyy] *)
  | LINE_ANY (* anything else *)
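
(* Hedged usage sketch (driver-side code, not part of this module): a
   dependency scanner can fold over the results of [lex_line] and keep only
   the include and module-use lines, e.g.

     let rec scan_deps acc lexbuf =
       match lex_line lexbuf with
       | None -> List.rev acc
       | Some (_, LINE_INCLUDE f) -> scan_deps (f :: acc) lexbuf
       | Some (_, LINE_MODULE_USE m) -> scan_deps (m :: acc) lexbuf
       | Some _ -> scan_deps acc lexbuf

   assuming [lex_line] returns [None] at end of input, as suggested by its
   signature in {!module-type:LocalisedLexer} below. *)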

module type LocalisedLexer = sig
  val token_list : (string * Tokens.token) list
  (** Same as {!val: token_list_language_agnostic}, but with tokens specialized
      to a given language. *)

  val lex_builtin : string -> Ast.builtin_expression option
  (** Simple lexer for builtins *)

  val lex_code : Sedlexing.lexbuf -> Tokens.token
  (** Main lexing function used in code blocks *)

  val lex_law : Sedlexing.lexbuf -> Tokens.token
  (** Main lexing function used outside code blocks *)

  val lexer : Sedlexing.lexbuf -> Tokens.token
  (** Entry point of the lexer, distributing to {!val: lex_code} or
      {!val: lex_law} depending on the current
      {!val: Surface.Lexer_common.context}. *)

  val lex_line : Sedlexing.lexbuf -> (string * line_token) option
  (** Low-level lexer intended for dependency extraction *)
end
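
(* A minimal sketch (hypothetical; the real per-language lexers live elsewhere
   in the Surface library and are richer) of how an implementation of
   [LocalisedLexer] typically dispatches on the current {!val: context}:

     let lexer (lexbuf : Sedlexing.lexbuf) : Tokens.token =
       match !context with
       | Code -> lex_code lexbuf
       | _ -> lex_law lexbuf

   The language-specific [token_list] would similarly extend
   [token_list_language_agnostic] with localized keywords. *)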