catala/compiler/surface/lexer.cppo.ml

(* This file is part of the Catala compiler, a specification language for tax and social benefits
   computation rules. Copyright (C) 2020 Inria, contributors: Denis Merigoux
   <denis.merigoux@inria.fr>, Emile Rolley <emile.rolley@tuta.io>

   Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
   in compliance with the License. You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software distributed under the License
   is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
   or implied. See the License for the specific language governing permissions and limitations under
   the License. *)

open Tokens
open Sedlexing
open Catala_utils
module L = Lexer_common
module R = Re.Pcre

(* The localised strings and regexps for the tokens and specific parsing rules
   are defined as CPPO macros in the `lexer_XX.cppo.ml` files.

   - `MS_*` macros define token strings
   - `MR_*` are sedlex regexps matching the token (inferred from the strings if absent,
     but should be present for any token containing spacing, and for any non-latin1
     character)
   - `MX_*` are full matching rules of the form `sedlex regexp -> ocaml expression`
*)

(* Avoid the need for defining the regexps when they are simple strings *)
#ifndef MR_SCOPE
  #define MR_SCOPE MS_SCOPE
#endif
#ifndef MR_CONSEQUENCE
  #define MR_CONSEQUENCE MS_CONSEQUENCE
#endif
#ifndef MR_DATA
  #define MR_DATA MS_DATA
#endif
#ifndef MR_DEPENDS
  #define MR_DEPENDS MS_DEPENDS
#endif
#ifndef MR_DECLARATION
  #define MR_DECLARATION MS_DECLARATION
#endif
#ifndef MR_CONTEXT
  #define MR_CONTEXT MS_CONTEXT
#endif
#ifndef MR_DECREASING
  #define MR_DECREASING MS_DECREASING
#endif
#ifndef MR_INCREASING
  #define MR_INCREASING MS_INCREASING
#endif
#ifndef MR_OF
  #define MR_OF MS_OF
#endif
#ifndef MR_LIST
  #define MR_LIST MS_LIST
#endif
#ifndef MR_CONTAINS
  #define MR_CONTAINS MS_CONTAINS
#endif
#ifndef MR_ENUM
  #define MR_ENUM MS_ENUM
#endif
#ifndef MR_INTEGER
  #define MR_INTEGER MS_INTEGER
#endif
#ifndef MR_MONEY
  #define MR_MONEY MS_MONEY
#endif
#ifndef MR_TEXT
  #define MR_TEXT MS_TEXT
#endif
#ifndef MR_DECIMAL
  #define MR_DECIMAL MS_DECIMAL
#endif
#ifndef MR_DATE
  #define MR_DATE MS_DATE
#endif
#ifndef MR_DURATION
  #define MR_DURATION MS_DURATION
#endif
#ifndef MR_BOOLEAN
  #define MR_BOOLEAN MS_BOOLEAN
#endif
#ifndef MR_SUM
  #define MR_SUM MS_SUM
#endif
#ifndef MR_FILLED
  #define MR_FILLED MS_FILLED
#endif
#ifndef MR_DEFINITION
  #define MR_DEFINITION MS_DEFINITION
#endif
#ifndef MR_STATE
  #define MR_STATE MS_STATE
#endif
#ifndef MR_LABEL
  #define MR_LABEL MS_LABEL
#endif
#ifndef MR_EXCEPTION
  #define MR_EXCEPTION MS_EXCEPTION
#endif
#ifndef MR_DEFINED_AS
  #define MR_DEFINED_AS MS_DEFINED_AS
#endif
#ifndef MR_MATCH
  #define MR_MATCH MS_MATCH
#endif
#ifndef MR_WILDCARD
  #define MR_WILDCARD MS_WILDCARD
#endif
#ifndef MR_WITH
  #define MR_WITH MS_WITH
#endif
#ifndef MR_UNDER_CONDITION
  #define MR_UNDER_CONDITION MS_UNDER_CONDITION
#endif
#ifndef MR_IF
  #define MR_IF MS_IF
#endif
#ifndef MR_THEN
  #define MR_THEN MS_THEN
#endif
#ifndef MR_ELSE
  #define MR_ELSE MS_ELSE
#endif
#ifndef MR_CONDITION
  #define MR_CONDITION MS_CONDITION
#endif
#ifndef MR_CONTENT
  #define MR_CONTENT MS_CONTENT
#endif
#ifndef MR_STRUCT
  #define MR_STRUCT MS_STRUCT
#endif
#ifndef MR_ASSERTION
  #define MR_ASSERTION MS_ASSERTION
#endif
#ifndef MR_VARIES
  #define MR_VARIES MS_VARIES
#endif
#ifndef MR_WITH_V
  #define MR_WITH_V MS_WITH_V
#endif
#ifndef MR_FOR
  #define MR_FOR MS_FOR
#endif
#ifndef MR_ALL
  #define MR_ALL MS_ALL
#endif
#ifndef MR_WE_HAVE
  #define MR_WE_HAVE MS_WE_HAVE
#endif
#ifndef MR_FIXED
  #define MR_FIXED MS_FIXED
#endif
#ifndef MR_BY
  #define MR_BY MS_BY
#endif
#ifndef MR_RULE
  #define MR_RULE MS_RULE
#endif
#ifndef MR_LET
  #define MR_LET MS_LET
#endif
#ifndef MR_EXISTS
  #define MR_EXISTS MS_EXISTS
#endif
#ifndef MR_IN
  #define MR_IN MS_IN
#endif
#ifndef MR_AMONG
  #define MR_AMONG MS_AMONG
#endif
#ifndef MR_SUCH
  #define MR_SUCH MS_SUCH
#endif
#ifndef MR_THAT
  #define MR_THAT MS_THAT
#endif
#ifndef MR_AND
  #define MR_AND MS_AND
#endif
#ifndef MR_OR
  #define MR_OR MS_OR
#endif
#ifndef MR_XOR
  #define MR_XOR MS_XOR
#endif
#ifndef MR_NOT
  #define MR_NOT MS_NOT
#endif
#ifndef MR_MAXIMUM
  #define MR_MAXIMUM MS_MAXIMUM
#endif
#ifndef MR_MINIMUM
  #define MR_MINIMUM MS_MINIMUM
#endif
#ifndef MR_IS
  #define MR_IS MS_IS
#endif
#ifndef MR_LIST_EMPTY
  #define MR_LIST_EMPTY MS_LIST_EMPTY
#endif
#ifndef MR_BUT_REPLACE
  #define MR_BUT_REPLACE MS_BUT_REPLACE
#endif
#ifndef MR_CARDINAL
  #define MR_CARDINAL MS_CARDINAL
#endif
#ifndef MR_YEAR
  #define MR_YEAR MS_YEAR
#endif
#ifndef MR_MONTH
  #define MR_MONTH MS_MONTH
#endif
#ifndef MR_DAY
  #define MR_DAY MS_DAY
#endif
#ifndef MR_TRUE
  #define MR_TRUE MS_TRUE
#endif
#ifndef MR_FALSE
  #define MR_FALSE MS_FALSE
#endif
#ifndef MR_Round
  #define MR_Round MS_Round
#endif
#ifndef MR_GetDay
  #define MR_GetDay MS_GetDay
#endif
#ifndef MR_GetMonth
  #define MR_GetMonth MS_GetMonth
#endif
#ifndef MR_GetYear
  #define MR_GetYear MS_GetYear
#endif
#ifndef MR_FirstDayOfMonth
  #define MR_FirstDayOfMonth MS_FirstDayOfMonth
#endif
#ifndef MR_LastDayOfMonth
  #define MR_LastDayOfMonth MS_LastDayOfMonth
#endif
#ifndef MR_INPUT
  #define MR_INPUT MS_INPUT
#endif
#ifndef MR_OUTPUT
  #define MR_OUTPUT MS_OUTPUT
#endif
#ifndef MR_INTERNAL
  #define MR_INTERNAL MS_INTERNAL
#endif
#ifndef MR_MONEY_OP_SUFFIX
  #define MR_MONEY_OP_SUFFIX MS_MONEY_OP_SUFFIX
#endif

let token_list : (string * token) list =
  [
    (MS_SCOPE, SCOPE);
    (MS_CONSEQUENCE, CONSEQUENCE);
    (MS_DATA, DATA);
    (MS_DEPENDS, DEPENDS);
    (MS_DECLARATION, DECLARATION);
    (MS_CONTEXT, CONTEXT);
    (MS_DECREASING, DECREASING);
    (MS_INCREASING, INCREASING);
    (MS_OF, OF);
    (MS_LIST, LIST);
    (MS_CONTAINS, CONTAINS);
    (MS_ENUM, ENUM);
    (MS_INTEGER, INTEGER);
    (MS_MONEY, MONEY);
    (MS_TEXT, TEXT);
    (MS_DECIMAL, DECIMAL);
    (MS_DATE, DATE);
    (MS_DURATION, DURATION);
    (MS_BOOLEAN, BOOLEAN);
    (MS_SUM, SUM);
    (MS_FILLED, FILLED);
    (MS_DEFINITION, DEFINITION);
    (MS_STATE, STATE);
    (MS_LABEL, LABEL);
    (MS_EXCEPTION, EXCEPTION);
    (MS_DEFINED_AS, DEFINED_AS);
    (MS_MATCH, MATCH);
    (MS_WILDCARD, WILDCARD);
    (MS_WITH, WITH);
    (MS_UNDER_CONDITION, UNDER_CONDITION);
    (MS_IF, IF);
    (MS_THEN, THEN);
    (MS_ELSE, ELSE);
    (MS_CONDITION, CONDITION);
    (MS_CONTENT, CONTENT);
    (MS_STRUCT, STRUCT);
    (MS_ASSERTION, ASSERTION);
    (MS_VARIES, VARIES);
    (MS_WITH_V, WITH_V);
    (MS_FOR, FOR);
    (MS_ALL, ALL);
    (MS_WE_HAVE, WE_HAVE);
    (MS_FIXED, FIXED);
    (MS_BY, BY);
    (MS_RULE, RULE);
    (MS_LET, LET);
    (MS_EXISTS, EXISTS);
    (MS_IN, IN);
    (MS_AMONG, AMONG);
    (MS_SUCH, SUCH);
    (MS_THAT, THAT);
    (MS_AND, AND);
    (MS_OR, OR);
    (MS_XOR, XOR);
    (MS_NOT, NOT);
    (MS_MAXIMUM, MAXIMUM);
    (MS_MINIMUM, MINIMUM);
    (MS_IS, IS);
    (MS_LIST_EMPTY, LIST_EMPTY);
    (MS_BUT_REPLACE, BUT_REPLACE);
    (MS_CARDINAL, CARDINAL);
    (MS_YEAR, YEAR);
    (MS_MONTH, MONTH);
    (MS_DAY, DAY);
    (MS_TRUE, TRUE);
    (MS_FALSE, FALSE);
    (MS_INPUT, INPUT);
    (MS_OUTPUT, OUTPUT);
    (MS_INTERNAL, INTERNAL)
  ]
  @ L.token_list_language_agnostic

(** Localised builtin functions *)
let lex_builtin (s : string) : Ast.builtin_expression option =
  let lexbuf = Utf8.from_string s in
  match%sedlex lexbuf with
  | MR_Round, eof -> Some Round
  | MR_GetDay, eof -> Some GetDay
  | MR_GetMonth, eof -> Some GetMonth
  | MR_GetYear, eof -> Some GetYear
  | MR_FirstDayOfMonth -> Some FirstDayOfMonth
  | MR_LastDayOfMonth -> Some LastDayOfMonth
  | _ -> None

(** Regexp matching any digit character.

    @note can not be used outside the current module (@see <
    https://github.com/ocaml-community/sedlex#lexer-specifications >). *)
let digit = [%sedlex.regexp? '0' .. '9']

(** Regexp matching at least one space. *)
let space_plus = [%sedlex.regexp? Plus white_space]

(** characters that can be present in idents (excluding first char) *)
let idchar = [%sedlex.regexp? uppercase | lowercase | digit | '_' | '\'']

(** Regexp matching white space but not newlines *)
let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]

(** Operator explicit typing suffix chars *)
let op_kind_re = [%sedlex.regexp? "" | MR_MONEY_OP_SUFFIX | Chars "!.@^"]

let op_kind = function
  | "" -> Ast.KPoly
  | "!" -> Ast.KInt
  | "." -> Ast.KDec
  | MS_MONEY_OP_SUFFIX -> Ast.KMoney
  | "@" -> Ast.KDate
  | "^" -> Ast.KDuration
  | _ -> invalid_arg "op_kind"

(** Main lexing function used in code blocks *)
let rec lex_code (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let prev_pos = lexing_positions lexbuf in
  match%sedlex lexbuf with
  | white_space ->
      (* Whitespaces *)
      L.update_acc lexbuf;
      lex_code lexbuf
  | '#', Star (Compl '\n'), '\n' ->
      (* Comments *)
      L.update_acc lexbuf;
      lex_code lexbuf
  | "```" ->
      (* End of code section *)
      L.context := Law;
      END_CODE (Buffer.contents L.code_buffer)
  | MR_SCOPE ->
      L.update_acc lexbuf;
      SCOPE
  | MR_DATA ->
      L.update_acc lexbuf;
      DATA
  | MR_DEPENDS ->
      L.update_acc lexbuf;
      DEPENDS
  | MR_DECLARATION ->
      L.update_acc lexbuf;
      DECLARATION
  | MR_CONTEXT ->
      L.update_acc lexbuf;
      CONTEXT
  | MR_INPUT ->
      L.update_acc lexbuf;
      INPUT
  | MR_OUTPUT ->
      L.update_acc lexbuf;
      OUTPUT
  | MR_INTERNAL ->
      L.update_acc lexbuf;
      INTERNAL
  | MR_DECREASING ->
      L.update_acc lexbuf;
      DECREASING
  | MR_INCREASING ->
      L.update_acc lexbuf;
      INCREASING
  | MR_OF ->
      L.update_acc lexbuf;
      OF
  | MR_LIST ->
      L.update_acc lexbuf;
      LIST
  | MR_CONTAINS ->
      L.update_acc lexbuf;
      CONTAINS
  | MR_ENUM ->
      L.update_acc lexbuf;
      ENUM
  | MR_INTEGER ->
      L.update_acc lexbuf;
      INTEGER
  | MR_MONEY ->
      L.update_acc lexbuf;
      MONEY
  | MR_TEXT ->
      L.update_acc lexbuf;
      TEXT
  | MR_DECIMAL ->
      L.update_acc lexbuf;
      DECIMAL
  | MR_DATE ->
      L.update_acc lexbuf;
      DATE
  | MR_DURATION ->
      L.update_acc lexbuf;
      DURATION
  | MR_BOOLEAN ->
      L.update_acc lexbuf;
      BOOLEAN
  | MR_SUM ->
      L.update_acc lexbuf;
      SUM
  | MR_FILLED ->
      L.update_acc lexbuf;
      FILLED
  | MR_DEFINITION ->
      L.update_acc lexbuf;
      DEFINITION
  | MR_STATE ->
      L.update_acc lexbuf;
      STATE
  | MR_LABEL ->
      L.update_acc lexbuf;
      LABEL
  | MR_EXCEPTION ->
      L.update_acc lexbuf;
      EXCEPTION
  | MR_DEFINED_AS ->
      L.update_acc lexbuf;
      DEFINED_AS
  | MR_MATCH ->
      L.update_acc lexbuf;
      MATCH
  | MR_WITH ->
      L.update_acc lexbuf;
      WITH
  | MR_WILDCARD ->
      L.update_acc lexbuf;
      WILDCARD
  | MR_UNDER_CONDITION ->
      L.update_acc lexbuf;
      UNDER_CONDITION
  | MR_IF ->
      L.update_acc lexbuf;
      IF
  | MR_CONSEQUENCE ->
      L.update_acc lexbuf;
      CONSEQUENCE
  | MR_THEN ->
      L.update_acc lexbuf;
      THEN
  | MR_ELSE ->
      L.update_acc lexbuf;
      ELSE
  | MR_CONDITION ->
      L.update_acc lexbuf;
      CONDITION
  | MR_CONTENT ->
      L.update_acc lexbuf;
      CONTENT
  | MR_STRUCT ->
      L.update_acc lexbuf;
      STRUCT
  | MR_ASSERTION ->
      L.update_acc lexbuf;
      ASSERTION
  | MR_VARIES ->
      L.update_acc lexbuf;
      VARIES
  | MR_WITH_V ->
      L.update_acc lexbuf;
      WITH_V
  | MR_FOR ->
      L.update_acc lexbuf;
      FOR
  | MR_ALL ->
      L.update_acc lexbuf;
      ALL
  | MR_WE_HAVE ->
      L.update_acc lexbuf;
      WE_HAVE
  | MR_FIXED ->
      L.update_acc lexbuf;
      FIXED
  | MR_BY ->
      L.update_acc lexbuf;
      BY
  | MR_RULE ->
      L.update_acc lexbuf;
      RULE
  | MR_LET ->
      L.update_acc lexbuf;
      LET
  | MR_EXISTS ->
      L.update_acc lexbuf;
      EXISTS
  | MR_IN ->
      L.update_acc lexbuf;
      IN
  | MR_AMONG ->
      L.update_acc lexbuf;
      AMONG
  | MR_SUCH ->
      L.update_acc lexbuf;
      SUCH
  | MR_THAT ->
      L.update_acc lexbuf;
      THAT
  | MR_AND ->
      L.update_acc lexbuf;
      AND
  | MR_OR ->
      L.update_acc lexbuf;
      OR
  | MR_XOR ->
      L.update_acc lexbuf;
      XOR
  | MR_NOT ->
      L.update_acc lexbuf;
      NOT
  | MR_MAXIMUM ->
      L.update_acc lexbuf;
      MAXIMUM
  | MR_MINIMUM ->
      L.update_acc lexbuf;
      MINIMUM
  | MR_IS ->
      L.update_acc lexbuf;
      IS
  | MR_LIST_EMPTY ->
      L.update_acc lexbuf;
      LIST_EMPTY
  | MR_BUT_REPLACE ->
      L.update_acc lexbuf;
      BUT_REPLACE
  | MR_CARDINAL ->
      L.update_acc lexbuf;
      CARDINAL
  | MR_TRUE ->
      L.update_acc lexbuf;
      TRUE
  | MR_FALSE ->
      L.update_acc lexbuf;
      FALSE
  | MR_YEAR ->
      L.update_acc lexbuf;
      YEAR
  | MR_MONTH ->
      L.update_acc lexbuf;
      MONTH
  | MR_DAY ->
      L.update_acc lexbuf;
      DAY
  | MR_MONEY_PREFIX, digit, Opt (Star (digit | MR_MONEY_DELIM), digit), Opt (MC_DECIMAL_SEPARATOR, Rep (digit, 0 .. 2)), MR_MONEY_SUFFIX ->
      let s = Utf8.lexeme lexbuf in
      let units = Buffer.create (String.length s) in
      let cents = Buffer.create 2 in
      let buf = ref units in
      for i = 0 to String.length s - 1 do
        match s.[i] with
        | '0'..'9' as c -> Buffer.add_char !buf c
        | MC_DECIMAL_SEPARATOR -> buf := cents
        | _ -> ()
      done;
      (* If the user has written $0.3 it means 30 cents so we have to pad
           with a 0 *)
      Buffer.add_string cents (String.make (2 - Buffer.length cents) '0');
      L.update_acc lexbuf;
      MONEY_AMOUNT (Buffer.contents units, Buffer.contents cents)
  | '|', Rep (digit, 4), '-', Rep (digit, 2), '-', Rep (digit, 2), '|' ->
    L.update_acc lexbuf;
    let rex =
      Re.(compile @@ whole_string @@ seq [
          char '|';
          group (repn digit 4 None);
          char '-';
          group (repn digit 2 None);
          char '-';
          group (repn digit 2 None);
          char '|';
        ])
    in
    let date_parts = R.get_substring (R.exec ~rex (Utf8.lexeme lexbuf)) in
    DATE_LITERAL (
      int_of_string (date_parts 1),
      int_of_string (date_parts 2),
      int_of_string (date_parts 3)
    )
  | Opt '-', Plus digit, MC_DECIMAL_SEPARATOR, Star digit ->
    let rex =
      Re.(compile @@ whole_string @@ seq [
          group (seq [opt (char '-') ; rep1 digit]);
          char MC_DECIMAL_SEPARATOR;
          group (rep digit)
        ]) in
    let dec_parts = R.get_substring (R.exec ~rex (Utf8.lexeme lexbuf)) in
    L.update_acc lexbuf;
    DECIMAL_LITERAL
      (dec_parts 1, dec_parts 2)
  | "<=", op_kind_re ->
      let k = op_kind (String.remove_prefix ~prefix:"<=" (Utf8.lexeme lexbuf)) in
      L.update_acc lexbuf;
      LESSER_EQUAL k
  | "<", op_kind_re ->
      let k = op_kind (String.remove_prefix ~prefix:"<" (Utf8.lexeme lexbuf)) in
      L.update_acc lexbuf;
      LESSER k
  | ">=", op_kind_re ->
      let k = op_kind (String.remove_prefix ~prefix:">=" (Utf8.lexeme lexbuf)) in
      L.update_acc lexbuf;
      GREATER_EQUAL k
  | ">", op_kind_re ->
      let k = op_kind (String.remove_prefix ~prefix:">" (Utf8.lexeme lexbuf)) in
      L.update_acc lexbuf;
      GREATER k
  | "-", op_kind_re ->
      let k = op_kind (String.remove_prefix ~prefix:"-" (Utf8.lexeme lexbuf)) in
      L.update_acc lexbuf;
      MINUS k
  | "+", op_kind_re ->
      let k = op_kind (String.remove_prefix ~prefix:"+" (Utf8.lexeme lexbuf)) in
      L.update_acc lexbuf;
      PLUS k
  | "*", op_kind_re ->
      let k = op_kind (String.remove_prefix ~prefix:"*" (Utf8.lexeme lexbuf)) in
      L.update_acc lexbuf;
      MULT k
  | '/', op_kind_re ->
      let k = op_kind (String.remove_prefix ~prefix:"/" (Utf8.lexeme lexbuf)) in
      L.update_acc lexbuf;
      DIV k
  | "!=" ->
      L.update_acc lexbuf;
      NOT_EQUAL
  | '=' ->
      L.update_acc lexbuf;
      EQUAL
  | '%' ->
      L.update_acc lexbuf;
      PERCENT
  | '(' ->
      L.update_acc lexbuf;
      LPAREN
  | ')' ->
      L.update_acc lexbuf;
      RPAREN
  | '{' ->
      L.update_acc lexbuf;
      LBRACE
  | '}' ->
      L.update_acc lexbuf;
      RBRACE
  | '[' ->
      L.update_acc lexbuf;
      LBRACKET
  | ']' ->
      L.update_acc lexbuf;
      RBRACKET
  | ':' ->
      L.update_acc lexbuf;
      COLON
  | ';' ->
      L.update_acc lexbuf;
      SEMICOLON
  | "--" ->
      L.update_acc lexbuf;
      ALT
  | "++" ->
      L.update_acc lexbuf;
      PLUSPLUS
  | '.' ->
      L.update_acc lexbuf;
      DOT
  | ',' ->
      L.update_acc lexbuf;
      COMMA
  | uppercase, Star idchar ->
      (* Name of constructor *)
      L.update_acc lexbuf;
      UIDENT (Utf8.lexeme lexbuf)
  | lowercase, Star idchar ->
      (* Name of variable *)
      L.update_acc lexbuf;
      LIDENT (Utf8.lexeme lexbuf)
  | Opt '-', Plus digit ->
      (* Integer literal*)
      L.update_acc lexbuf;
      INT_LITERAL (Utf8.lexeme lexbuf)
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme

let rec lex_directive_args (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let prev_pos = lexing_positions lexbuf in
  match%sedlex lexbuf with
  | '@', Star hspace, "p.", Star hspace, Plus digit ->
      let s = Utf8.lexeme lexbuf in
      let i = String.index s '.' in
      AT_PAGE (int_of_string (String.trim (String.sub s i (String.length s - i))))
  | MR_MODULE_ALIAS -> MODULE_ALIAS
  | MR_EXTERNAL -> MODULE_EXTERNAL
  | Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
  | Plus hspace -> lex_directive_args lexbuf
  | '\n' | eof ->
      L.context := Law;
      END_DIRECTIVE
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme

let rec lex_directive (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let prev_pos = lexing_positions lexbuf in
  match%sedlex lexbuf with
  | Plus hspace -> lex_directive lexbuf
  | MR_LAW_INCLUDE -> LAW_INCLUDE
  | MR_MODULE_DEF -> L.context := Directive_args; MODULE_DEF
  | MR_MODULE_USE -> L.context := Directive_args; MODULE_USE
  | ":" ->
      L.context := Directive_args;
      COLON
  | '\n' | eof ->
      L.context := Law;
      END_DIRECTIVE
  | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme

let lex_raw (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
  let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
  if at_bol then
    match%sedlex lexbuf with
    | eof -> EOF
    | "```", Star hspace, ('\n' | eof) ->
        L.context := Law;
        LAW_TEXT (Utf8.lexeme lexbuf)
    | _ -> (
        (* Nested match for lower priority; `_` matches length 0 so we effectively retry the
           sub-match at the same point *)
        match%sedlex lexbuf with
        | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
        | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
  else
    match%sedlex lexbuf with
    | eof -> EOF
    | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
    | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme

(** Main lexing function used outside code blocks *)
let lex_law (lexbuf : lexbuf) : token =
  let prev_lexeme = Utf8.lexeme lexbuf in
  let ((_, start_pos) as prev_pos) = lexing_positions lexbuf in
  let at_bol = Lexing.(start_pos.pos_bol = start_pos.pos_cnum) in
  if at_bol then
    match%sedlex lexbuf with
    | eof -> EOF
    | "```catala", Star white_space, ('\n' | eof) ->
        L.context := Code;
        Buffer.clear L.code_buffer;
        BEGIN_CODE
    | "```catala-metadata", Star white_space, ('\n' | eof) ->
        L.context := Code;
        Buffer.clear L.code_buffer;
        BEGIN_METADATA
    | "```", Star (idchar | '-') ->
        L.context := Raw;
        LAW_TEXT (Utf8.lexeme lexbuf)
    | '>' ->
        L.context := Directive;
        BEGIN_DIRECTIVE
    | Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
        L.get_law_heading lexbuf
    | _ -> (
        (* Nested match for lower priority; `_` matches length 0 so we effectively retry the
           sub-match at the same point *)
        match%sedlex lexbuf with
        | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
        | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
  else
    match%sedlex lexbuf with
    | eof -> EOF
    | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
    | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme

(** Entry point of the lexer, distributes to {!val: lex_code} or {!val:lex_law}
    depending of the current {!val: Surface.Lexer_common.context}. *)
let lexer (lexbuf : lexbuf) : token =
  match !L.context with
  | Law -> lex_law lexbuf
  | Raw -> lex_raw lexbuf
  | Code -> lex_code lexbuf
  | Directive -> lex_directive lexbuf
  | Directive_args -> lex_directive_args lexbuf


(* -- Shallow lexing for dependency extraction -- *)

let line_test_id_re =
  Re.(compile @@ seq [
      char '{'; rep space; str "id"; rep space; char '='; rep space;
      char '"'; group (seq [rep1 (diff any (char '"'))]); char '"';
    ])

let line_dir_arg_re =
  Re.(compile @@ seq [
      bos; char '>'; rep space; rep1 alpha;
      rep1 any;
      alt [space; char ':'];
      group (rep1 (diff any space));
      eol
    ])

(* This is a bit cheap, but we don't want a full-fledged parser to handle these
   trivial line directives. Here we extract the first uppercase argument of a
   directive line, which is guaranteed to match the module name we are
   interested in and nothing else (e.g. in French, the module usage "keywords"
   are multiple words) *)
let line_dir_arg_upcase_re =
  Re.(compile @@ seq [
      bos; char '>'; rep space; rep1 alpha;
      rep (alt [space; lower]); space;
      group (seq [rep1 upper; rep (diff any space)]);
      rep any;
      eol
    ])

let lex_line (lexbuf : lexbuf) : (string * L.line_token) option =
  match%sedlex lexbuf with
  | eof -> None
  | "```catala-test-inline", Star hspace, ('\n' | eof) ->
    Some (Utf8.lexeme lexbuf, LINE_INLINE_TEST)
  | "```catala-test", Star (Compl '\n'), ('\n' | eof) ->
    let str = Utf8.lexeme lexbuf in
    (try
       let id = Re.Group.get (Re.exec line_test_id_re str) 1 in
       Some (str, LINE_TEST id)
     with Not_found ->
       Message.warning ~pos:(Pos.from_lpos (lexing_positions lexbuf))
         "Ignored invalid test section, must have an explicit \
          `{ id = \"name\" }` specification";
       Some (str, LINE_ANY))
  | "```", Star hspace, ('\n' | eof) ->
    Some (Utf8.lexeme lexbuf, LINE_BLOCK_END)
  | '>', Star hspace, MR_LAW_INCLUDE, Star hspace, ':', Plus (Compl '\n'),
    ('\n' | eof)  ->
    let str = Utf8.lexeme lexbuf in
    (try
       let file = Re.Group.get (Re.exec line_dir_arg_re str) 1 in
       Some (str, LINE_INCLUDE file)
     with Not_found -> Some (str, LINE_ANY))
  | '>', Star hspace, MR_MODULE_DEF, Plus hspace,
    uppercase, Star (Compl white_space), Plus hspace,
    MR_EXTERNAL, Star hspace, ('\n' | eof)  ->
    let str = Utf8.lexeme lexbuf in
    (try
       let mdl = Re.Group.get (Re.exec line_dir_arg_upcase_re str) 1 in
       Some (str, LINE_MODULE_DEF (mdl, true))
     with Not_found -> Some (str, LINE_ANY))
  | '>', Star hspace, MR_MODULE_DEF, Plus hspace, uppercase, Star (Compl '\n'),
    ('\n' | eof)  ->
    let str = Utf8.lexeme lexbuf in
    (try
       let mdl = Re.Group.get (Re.exec line_dir_arg_upcase_re str) 1 in
       Some (str, LINE_MODULE_DEF (mdl, false))
     with Not_found -> Some (str, LINE_ANY))
  | '>', Star hspace, MR_MODULE_USE, Plus hspace, uppercase, Star (Compl '\n'),
    ('\n' | eof)  ->
    let str = Utf8.lexeme lexbuf in
    (try
       let mdl = Re.Group.get (Re.exec line_dir_arg_upcase_re str) 1 in
       Some (str, LINE_MODULE_USE mdl)
     with Not_found -> Some (str, LINE_ANY))
  | Star (Compl '\n'), ('\n' | eof) -> Some (Utf8.lexeme lexbuf, LINE_ANY)
  | _ -> assert false