catala/compiler/catala_utils/string.ml

(* This file is part of the Catala compiler, a specification language for tax
   and social benefits computation rules. Copyright (C) 2020 Inria, contributor:
   Denis Merigoux <denis.merigoux@inria.fr>, Emile Rolley <emile.rolley@tuta.io>

   Licensed under the Apache License, Version 2.0 (the "License"); you may not
   use this file except in compliance with the License. You may obtain a copy of
   the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
   WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
   License for the specific language governing permissions and limitations under
   the License. *)

include Stdlib.String

(* [to_ascii s] is [Ubase.from_utf8 s]: a thin alias whose type is constrained
   to [string -> string]. NOTE(review): how characters without an ASCII
   counterpart are handled is decided by Ubase; check its documentation before
   relying on a specific behaviour. *)
let to_ascii : string -> string = Ubase.from_utf8
(* [is_uppercase_ascii c] holds exactly when [c] is one of the 26 ASCII
   capital letters, i.e. lies in the range A-Z. *)
let is_uppercase_ascii c = 'A' <= c && c <= 'Z'

(* [begins_with_uppercase s] tells whether the first character of the ASCII
   conversion of [s] (as computed by [to_ascii]) is an uppercase ASCII letter.
   Returns [false] when [s] is empty, and also when its conversion is empty. *)
let begins_with_uppercase (s : string) : bool =
  s <> ""
  && (let ascii = to_ascii s in
      (* It is the converted string that gets indexed, so it is the one that
         must be checked for emptiness: [get] raises [Invalid_argument] on an
         empty string, and nothing here guarantees that the conversion keeps
         at least one character. *)
      ascii <> "" && is_uppercase_ascii (get ascii 0))

(* [to_snake_case s] converts [s] to ASCII with [to_ascii], then lowercases it,
   inserting an underscore before every uppercase ASCII letter that is neither
   the first character nor already preceded by an underscore. *)
let to_snake_case (s : string) : string =
  let ascii = to_ascii s in
  (* Worst case: every character gets an underscore in front of it. *)
  let out = Buffer.create (2 * length ascii) in
  iteri
    (fun i c ->
      (* The previous character must be read from [ascii], the string being
         traversed: byte offsets in [s] no longer line up with [i] as soon as
         [s] contains multi-byte UTF-8 sequences. *)
      if is_uppercase_ascii c && i > 0 && get ascii (i - 1) <> '_' then
        Buffer.add_char out '_';
      Buffer.add_char out (Char.lowercase_ascii c))
    ascii;
  Buffer.contents out

(* [to_camel_case s] converts [s] to ASCII with [to_ascii], drops every
   underscore, and uppercases the first remaining character as well as each
   character that directly followed a run of underscores. All other characters
   are copied unchanged. *)
let to_camel_case (s : string) : string =
  (* Starts as [true] so that the very first letter is capitalised. *)
  let capitalise_next = ref true in
  let buf = Buffer.create (length s) in
  let emit c =
    if c = '_' then capitalise_next := true
    else begin
      let c = if !capitalise_next then Char.uppercase_ascii c else c in
      Buffer.add_char buf c;
      capitalise_next := false
    end
  in
  iter emit (to_ascii s);
  Buffer.contents buf

(* [remove_prefix ~prefix s] is [s] without its leading [prefix] when [s]
   starts with it, and [s] itself otherwise. *)
let remove_prefix ~prefix s =
  match starts_with ~prefix s with
  | false -> s
  | true ->
    let skipped = length prefix in
    sub s skipped (length s - skipped)

(* [trim_end s] removes the trailing blanks of [s], where a blank is a space,
   form feed, newline, carriage return or tab. When there is nothing to remove
   the argument itself is returned, without copying. *)
let trim_end s =
  let is_blank = function
    | ' ' | '\x0c' | '\n' | '\r' | '\t' -> true
    | _ -> false
  in
  (* [kept n] shrinks the candidate length [n] while the character just before
     position [n] is a blank. *)
  let rec kept n = if n > 0 && is_blank (get s (n - 1)) then kept (n - 1) else n in
  let total = length s in
  let keep = kept total in
  if keep = total then s else sub s 0 keep

(* [width s] estimates the number of terminal columns taken by the UTF-8
   string [s]: each tab counts for 8 columns and every other decoded character
   for one column.

   Note: this should do, but remains incorrect for combined unicode characters
   that display as one (e.g. a letter followed by a combining accent). We
   should switch to Uuseg at some point. *)
let width s =
  let len = length s in
  let columns = ref 0 in
  let pos = ref 0 in
  while !pos < len do
    if get s !pos = '\t' then begin
      columns := !columns + 8;
      incr pos
    end
    else begin
      incr columns;
      (* Jump over all the bytes consumed by the decoding of this character. *)
      pos := !pos + Uchar.utf_decode_length (get_utf_8_uchar s !pos)
    end
  done;
  !columns

(* [format ppf s] prints [s] on [ppf] through [Format.pp_print_as], declaring
   [width s] as its size rather than its length in bytes. *)
let format ppf s =
  let columns = width s in
  Format.pp_print_as ppf columns s

(* Strings packaged as the argument of the [Set.Make] and [Map.Make]
   applications found further down in this file: [Stdlib.String] plus the
   [format] printer, with [compare] replaced by a natural ordering in which
   runs of digits are compared as numbers (so that 2 sorts before 10). *)
module Arg = struct
  include Stdlib.String

  let format = format

  (* [compare s1 s2] orders strings character by character, except that when
     both strings have a digit at the current position, the two maximal runs of
     digits starting there are compared by numerical value. Runs of equal value
     (differing only by leading zeros) are then ordered by length, shortest
     first. The result is negative, zero or positive; zero is returned only for
     equal strings.

     Digit runs are compared textually instead of being converted to [int], so
     arbitrarily long runs are handled without overflow. *)
  let compare s1 s2 =
    let len1 = length s1 in
    let len2 = length s2 in
    let is_digit = function '0' .. '9' -> true | _ -> false in
    (* [skip p s i] is the first index from [i] on at which [p] does not hold,
       or [length s] if there is none. *)
    let rec skip p s i =
      if i < length s && p (get s i) then skip p s (i + 1) else i
    in
    (* Compares the [n] characters of [s1] starting at [i1] with the [n]
       characters of [s2] starting at [i2]. *)
    let rec compare_chars i1 i2 n =
      if n <= 0 then 0
      else
        match Char.compare (get s1 i1) (get s2 i2) with
        | 0 -> compare_chars (i1 + 1) (i2 + 1) (n - 1)
        | c -> c
    in
    let rec aux i1 i2 =
      if i1 >= len1 then if i2 >= len2 then 0 else -1
      else if i2 >= len2 then 1
      else
        match get s1 i1, get s2 i2 with
        | '0' .. '9', '0' .. '9' -> (
          (* End of each digit run, and start of its significant digits (the
             leading zeros are skipped; this cannot run past the end of the
             run since only zeros are skipped). *)
          let end1 = skip is_digit s1 i1 in
          let end2 = skip is_digit s2 i2 in
          let sig1 = skip (Char.equal '0') s1 i1 in
          let sig2 = skip (Char.equal '0') s2 i2 in
          (* With leading zeros out of the way, the number with more digits is
             the larger one, and numbers with the same count of digits are
             ordered like their digit strings. *)
          let by_value =
            match Int.compare (end1 - sig1) (end2 - sig2) with
            | 0 -> compare_chars sig1 sig2 (end1 - sig1)
            | n -> n
          in
          match by_value with
          | 0 -> (
            (* Same value: tell the runs apart by their total length. *)
            match Int.compare (end1 - i1) (end2 - i2) with
            | 0 -> aux end1 end2
            | n -> n)
          | n -> n)
        | c1, c2 -> (
          match Char.compare c1 c2 with 0 -> aux (i1 + 1) (i2 + 1) | n -> n)
    in
    aux 0 0
end

(* Replace the [compare] inherited from the [include Stdlib.String] at the top
   of the file by the digit-aware ordering defined in [Arg]. *)
let compare = Arg.compare
(* [hash t] is [Hash.raw t]; the hashing itself is entirely delegated to the
   [Hash] module. *)
let hash t = Hash.raw t

(* Sets and maps with string keys, obtained by applying the [Set.Make] and
   [Map.Make] functors in scope to [Arg]. *)
module Set = Set.Make (Arg)
module Map = Map.Make (Arg)
refactor(compiler): remove the camomile dependency due to the new Utils.String_common module based on Ubase 2022-08-03 18:02:13 +03:00			`(* This file is part of the Catala compiler, a specification language for tax`
			`and social benefits computation rules. Copyright (C) 2020 Inria, contributor:`
			`Denis Merigoux <denis.merigoux@inria.fr>, Emile Rolley <emile.rolley@tuta.io>`

			`Licensed under the Apache License, Version 2.0 (the "License"); you may not`
			`use this file except in compliance with the License. You may obtain a copy of`
			`the License at`

			`http://www.apache.org/licenses/LICENSE-2.0`

			`Unless required by applicable law or agreed to in writing, software`
			`distributed under the License is distributed on an "AS IS" BASIS, WITHOUT`
			`WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the`
			`License for the specific language governing permissions and limitations under`
			`the License. *)`

Define Catala_utils.String as an overlay to stdlib string 2022-11-21 13:17:42 +03:00			`include Stdlib.String`

refactor(compiler): remove the camomile dependency due to the new Utils.String_common module based on Ubase 2022-08-03 18:02:13 +03:00			`let to_ascii : string -> string = Ubase.from_utf8`
Tweak error message location printing 2022-11-24 20:00:45 +03:00			`let is_uppercase_ascii = function 'A' .. 'Z' -> true \| _ -> false`
refactor(compiler): remove the camomile dependency due to the new Utils.String_common module based on Ubase 2022-08-03 18:02:13 +03:00
			`let begins_with_uppercase (s : string) : bool =`
Define Catala_utils.String as an overlay to stdlib string 2022-11-21 13:17:42 +03:00			`"" <> s && is_uppercase_ascii (get (to_ascii s) 0)`
refactor(compiler): remove the camomile dependency due to the new Utils.String_common module based on Ubase 2022-08-03 18:02:13 +03:00
			`let to_snake_case (s : string) : string =`
Fix complexity of some string functions 2024-03-05 19:59:38 +03:00			`let out = Buffer.create (2 * length s) in`
			`s`
			`\|> to_ascii`
Define Catala_utils.String as an overlay to stdlib string 2022-11-21 13:17:42 +03:00			`\|> iteri (fun i c ->`
Fixes to renamings, scalc and C backend 2024-08-30 16:00:13 +03:00			`if is_uppercase_ascii c && 0 <> i && get s (i-1) <> '_' then Buffer.add_char out '_';`
Fix complexity of some string functions 2024-03-05 19:59:38 +03:00			`Buffer.add_char out (Char.lowercase_ascii c));`
			`Buffer.contents out`
refactor(compiler): remove the camomile dependency due to the new Utils.String_common module based on Ubase 2022-08-03 18:02:13 +03:00
			`let to_camel_case (s : string) : string =`
Fix complexity of some string functions 2024-03-05 19:59:38 +03:00			`let last_was_underscore = ref true in`
			`let out = Buffer.create (length s) in`
			`s`
			`\|> to_ascii`
			`\|> iter (function`
			`\| '_' -> last_was_underscore := true`
			`\| c ->`
			`Buffer.add_char out`
			`(if !last_was_underscore then Char.uppercase_ascii c else c);`
			`last_was_underscore := false);`
			`Buffer.contents out`
Add ambiguous StructAccess for desugared to be resolved in scopelang 2022-11-22 22:57:59 +03:00
Add overloaded operators for the common operations This uses the same disambiguation mechanism put in place for structures, calling the typer on individual rules on the desugared AST to propagate types, in order to resolve ambiguous operators like `+` to their strongly typed counterparts (`+!`, `+.`, `+$`, `+@`, `+$`) in the translation to scopelang. The patch includes some normalisation of the definition of all the operators, and classifies them based on their typing policy instead of their arity. It also adds a little more flexibility: - a couple new operators, like `-` on date and duration - optional type annotation on some aggregation constructions The `Shared_ast` lib is also lightly restructured, with the `Expr` module split into `Type`, `Operator` and `Expr`. 2022-11-29 11:47:53 +03:00			`let remove_prefix ~prefix s =`
			`if starts_with ~prefix s then`
			`let plen = length prefix in`
			`sub s plen (length s - plen)`
			`else s`

Clerk reports: postprocess diff output This relies less on specific color flags of GNU diff, and reformats and colorises the output. (it may still depend on the specific layout of GNU diff with the `-y` flag though) 2024-06-21 16:41:44 +03:00			`let trim_end s =`
			`let rec stop n =`
			`if n < 0 then n`
			`else`
			`match get s n with`
			`\| ' ' \| '\x0c' \| '\n' \| '\r' \| '\t' -> stop (n - 1)`
			`\| _ -> n`
			`in`
			`let last = length s - 1 in`
			`let i = stop last in`
			`if i = last then s else sub s 0 (i + 1)`

Fix formatting of exception trees 2023-07-07 15:48:53 +03:00			`(* Note: this should do, but remains incorrect for combined unicode characters`
			that display as one (e.g. `e` + postfix `'`). We should switch to Uuseg at
			`some poing *)`
			`let width s =`
			`let len = length s in`
			`let rec aux ncols i =`
			`if i >= len then ncols`
			`else if get s i = '\t' then aux (ncols + 8) (i + 1)`
			`else aux (ncols + 1) (i + Uchar.utf_decode_length (get_utf_8_uchar s i))`
			`in`
			`aux 0 0`

Add some helper functions in a wrapper `Map` module and use them throughout. No more `List.map fst (Map.bindings m)` ! Also adds some facilities for direct formatting without going through a list. 2023-07-12 12:48:46 +03:00			`let format ppf s = Format.pp_print_as ppf (width s) s`
Generalise the expression printer This patch functorises the generic expression printer, in order to be able to re-use it for end-user printing. It makes it possible to have an end-user, localised printer that shares the code for e.g. priority and automatic parens handling. A generic AST rewriting that disambiguates variables (very simple to write with bindlib) is also added and used in the OCaml backend for something safer than just appending `_user` (-- this also handles clashing variables that could be introduced during compilation which would have generated wrong code before this) Finally, the `explain` plugin is adapted to use the new printer. Ah, and `String.format_t` was tweaked to correctly print strings that might contain unicode without breaking alignment, and should be used instead of `format_string` or `%s` whenever unicode can be expected. 2023-07-11 18:10:00 +03:00
Add some helper functions in a wrapper `Map` module and use them throughout. No more `List.map fst (Map.bindings m)` ! Also adds some facilities for direct formatting without going through a list. 2023-07-12 12:48:46 +03:00			`module Arg = struct`
			`include Stdlib.String`

			`let format = format`
Sort strings in natural order when they contain numbers Seeing results sorted as 1 10 11 2 doesn't look nice. 2024-03-18 19:38:10 +03:00
			`let compare s1 s2 =`
			`let len1 = length s1 in`
			`let len2 = length s2 in`
			`let int c = int_of_char c - int_of_char '0' in`
			`let rec readnum acc s i =`
			`if i >= length s then acc, i`
			`else`
			`match get s i with`
			`\| '0' .. '9' as c -> readnum ((acc * 10) + int c) s (i + 1)`
			`\| _ -> acc, i`
			`in`
			`let rec aux i1 i2 =`
			`if i1 >= len1 then if i2 >= len2 then 0 else -1`
			`else if i2 >= len2 then 1`
			`else`
			`match get s1 i1, get s2 i2 with`
			`\| ('0' .. '9' as c1), ('0' .. '9' as c2) -> (`
			`let x1, i1' = readnum (int c1) s1 (i1 + 1) in`
			`let x2, i2' = readnum (int c2) s2 (i2 + 1) in`
			`match Int.compare x1 x2 with`
			`\| 0 -> (`
			`match Int.compare (i1' - i1) (i2' - i2) with`
			`\| 0 -> aux i1' i2'`
			`\| n -> n)`
			`\| n -> n)`
			`\| c1, c2 -> (`
			`match Char.compare c1 c2 with 0 -> aux (i1 + 1) (i2 + 1) \| n -> n)`
			`in`
			`aux 0 0`
Add some helper functions in a wrapper `Map` module and use them throughout. No more `List.map fst (Map.bindings m)` ! Also adds some facilities for direct formatting without going through a list. 2023-07-12 12:48:46 +03:00			`end`

Sort strings in natural order when they contain numbers Seeing results sorted as 1 10 11 2 doesn't look nice. 2024-03-18 19:38:10 +03:00			`let compare = Arg.compare`
Computation and checking of module hashes This includes a few separate changes: - pass visibility information of declarations (depending on wether the declaration was in a ```catala-metadata block or not) - add reasonable hash computation functions to discriminate the interfaces. In particular: * Uids have a `hash` function that depends on their string, but not on their actual uid (which is not stable between runs of the compiler) ; the existing `hash` function and its uses have been renamed to `id`. * The `Hash` module provides the tools to properly combine hashes, etc. While we rely on `Hashtbl.hash` for the atoms, we take care not to use it on any recursive structure (it relies on a bounded traversal). - insert the hashes in the artefacts, and properly check and report those (for OCaml) Remains to do: - Record and check the hashes in the other backends - Provide a way to get stable inline-test outputs in the presence of module hashes - Provide a way to write external modules that don't break at every Catala update. 2024-05-24 15:26:44 +03:00			`let hash t = Hash.raw t`
Sort strings in natural order when they contain numbers Seeing results sorted as 1 10 11 2 doesn't look nice. 2024-03-18 19:38:10 +03:00
Add some helper functions in a wrapper `Map` module and use them throughout. No more `List.map fst (Map.bindings m)` ! Also adds some facilities for direct formatting without going through a list. 2023-07-12 12:48:46 +03:00			`module Set = Set.Make (Arg)`
			`module Map = Map.Make (Arg)`