From cc60c926a8d37e554ff7e2a974fffa03aeb8c885 Mon Sep 17 00:00:00 2001
From: imaqtkatt
Date: Tue, 18 Jun 2024 18:41:33 -0300
Subject: [PATCH] Add utf8 and ascii encoding functions

---
 src/fun/builtins.bend                         | 131 ++++++++++++++++++
 .../run_file/encode_decode_utf8.bend          |   4 +
 .../run_file__encode_decode_utf8.bend.snap    |   9 ++
 3 files changed, 144 insertions(+)
 create mode 100644 tests/golden_tests/run_file/encode_decode_utf8.bend
 create mode 100644 tests/snapshots/run_file__encode_decode_utf8.bend.snap

diff --git a/src/fun/builtins.bend b/src/fun/builtins.bend
index ab072085..a9b97ee8 100644
--- a/src/fun/builtins.bend
+++ b/src/fun/builtins.bend
@@ -175,3 +175,134 @@ hvm to_u24:
 # Casts any native number to an i24.
 hvm to_i24:
   ($([i24] ret) ret)
+
+# Encoding
+
+# Largest valid Unicode code point.
+Utf8/MAX_RUNE = '\u{0010FFFF}'
+# Replacement character returned by Utf8/decode_rune for malformed input.
+Utf8/RUNE_ERROR = '\u{FFFD}'
+#Utf8/RUNE_SELF = 0x80
+#Utf8/MAX = 4
+
+# Marker bits of a UTF-8 byte: t1..t4 tag the lead byte of a 1..4 byte sequence, tx tags a continuation byte.
+Utf8/t1 = 0b00000000
+Utf8/tx = 0b10000000
+Utf8/t2 = 0b11000000
+Utf8/t3 = 0b11100000
+Utf8/t4 = 0b11110000
+
+# Payload masks: maskx for continuation bytes, mask2..mask4 for the lead byte of a 2..4 byte sequence.
+Utf8/maskx = 0b00111111
+Utf8/mask2 = 0b00011111
+Utf8/mask3 = 0b00001111
+Utf8/mask4 = 0b00000111
+
+# Largest code point that fits in a 1, 2 and 3 byte sequence.
+Utf8/rune1max = (- (<< 1 7) 1)
+Utf8/rune2max = (- (<< 1 11) 1)
+Utf8/rune3max = (- (<< 1 16) 1)
+
+# Appends `x` at the end of a list.
+Encoding/List/append x [] = [x]
+Encoding/List/append x (List/Cons y ys) = (List/Cons y (Encoding/List/append x ys))
+
+# Appends the character `x` at the end of a string.
+String/append x "" = (String/Cons x String/Nil)
+String/append x (String/Cons y ys) = (String/Cons y (String/append x ys))
+
+# Decodes a list of UTF-8 bytes, appending the decoded characters to the string `acc`.
+# An empty byte list decodes to nothing, so `acc` is returned untouched.
+Bytes/decode_utf8 [] acc = acc
+Bytes/decode_utf8 bytes acc =
+  let (got, _len, rest) = (Utf8/decode_rune bytes)
+  (Bytes/decode_utf8 rest (String/append got acc))
+
+# Decodes the first character of a byte list.
+# Returns (character, number of bytes consumed, remaining bytes).
+# Malformed input yields (Utf8/RUNE_ERROR, 0, ...).
+Utf8/decode_rune [] = (0, 0, [])
+Utf8/decode_rune [a] = if (<= a 0x7F) { (a, 1, []) } else { (Utf8/RUNE_ERROR, 0, []) }
+Utf8/decode_rune [a, b] =
+  if (<= a 0x7F) {
+    (a, 1, [b])
+  } else {
+    if (== (& a 0xE0) 0xC0) {
+      let r = (| (<< (& a Utf8/mask2) 6) (& b Utf8/maskx))
+      (r, 2, [])
+    } else {
+      (Utf8/RUNE_ERROR, 0, [])
+    }
+  }
+Utf8/decode_rune [a, b, c] =
+  if (<= a 0x7F) {
+    (a, 1, [b, c])
+  } else {
+    if (== (& a 0xE0) 0xC0) {
+      let r = (| (<< (& a Utf8/mask2) 6) (& b Utf8/maskx))
+      (r, 2, [c])
+    } else {
+      if (== (& a 0xF0) 0xE0) {
+        let r = (| (<< (& a Utf8/mask3) 12) (| (<< (& b Utf8/maskx) 6) (& c Utf8/maskx)))
+        (r, 3, [])
+      } else {
+        (Utf8/RUNE_ERROR, 0, [])
+      }
+    }
+  }
+Utf8/decode_rune (List/Cons a (List/Cons b (List/Cons c (List/Cons d rest)))) =
+  if (<= a 0x7F) {
+    (a, 1, (List/Cons b (List/Cons c (List/Cons d rest))))
+  } else {
+    if (== (& a 0xE0) 0xC0) {
+      let r = (| (<< (& a Utf8/mask2) 6) (& b Utf8/maskx))
+      (r, 2, (List/Cons c (List/Cons d rest)))
+    } else {
+      if (== (& a 0xF0) 0xE0) {
+        let r = (| (<< (& a Utf8/mask3) 12) (| (<< (& b Utf8/maskx) 6) (& c Utf8/maskx)))
+        (r, 3, (List/Cons d rest))
+      } else {
+        if (== (& a 0xF8) 0xF0) {
+          let r = (| (<< (& a Utf8/mask4) 18) (| (<< (& b Utf8/maskx) 12) (| (<< (& c Utf8/maskx) 6) (& d Utf8/maskx))))
+          # Keep the unconsumed tail so decoding continues after a 4-byte character.
+          (r, 4, rest)
+        } else {
+          (Utf8/RUNE_ERROR, 0, rest)
+        }
+      }
+    }
+  }
+
+# Encodes a string as UTF-8, appending the bytes to the list `acc`.
+String/encode_utf8 (String/Nil) acc = acc
+String/encode_utf8 (String/Cons x xs) acc =
+  if (<= x Utf8/rune1max) {
+    (String/encode_utf8 xs (Encoding/List/append x acc))
+  } else {
+    if (<= x Utf8/rune2max) {
+      let b1 = (| Utf8/t2 (>> x 6))
+      let b2 = (| Utf8/tx (& x Utf8/maskx))
+      (String/encode_utf8 xs (Encoding/List/append b2 (Encoding/List/append b1 acc)))
+    } else {
+      if (<= x Utf8/rune3max) {
+        let b1 = (| Utf8/t3 (>> x 12))
+        let b2 = (| Utf8/tx (& (>> x 6) Utf8/maskx))
+        let b3 = (| Utf8/tx (& x Utf8/maskx))
+        (String/encode_utf8 xs (Encoding/List/append b3 (Encoding/List/append b2 (Encoding/List/append b1 acc))))
+      } else {
+        let b1 = (| Utf8/t4 (>> x 18))
+        let b2 = (| Utf8/tx (& (>> x 12) Utf8/maskx))
+        let b3 = (| Utf8/tx (& (>> x 6) Utf8/maskx))
+        let b4 = (| Utf8/tx (& x Utf8/maskx))
+        (String/encode_utf8 xs (Encoding/List/append b4 (Encoding/List/append b3 (Encoding/List/append b2 (Encoding/List/append b1 acc)))))
+      }
+    }
+  }
+
+# Converts a list of bytes to a string, one character per byte.
+Bytes/decode_ascii (List/Cons x xs) = (String/Cons x (Bytes/decode_ascii xs))
+Bytes/decode_ascii (List/Nil) = (String/Nil)
+
+# Converts a string to a list with one element per character.
+String/encode_ascii (String/Cons x xs) = (List/Cons x (String/encode_ascii xs))
+String/encode_ascii (String/Nil) = (List/Nil)
diff --git a/tests/golden_tests/run_file/encode_decode_utf8.bend b/tests/golden_tests/run_file/encode_decode_utf8.bend
new file mode 100644
index 00000000..c8796dda
--- /dev/null
+++ b/tests/golden_tests/run_file/encode_decode_utf8.bend
@@ -0,0 +1,4 @@
+def main:
+  use bytes = [72, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140, 33]
+  use s = "Hello, 世界!"
+  return (String/encode_utf8(s, []), Bytes/decode_utf8(bytes, []))
diff --git a/tests/snapshots/run_file__encode_decode_utf8.bend.snap b/tests/snapshots/run_file__encode_decode_utf8.bend.snap
new file mode 100644
index 00000000..17562d40
--- /dev/null
+++ b/tests/snapshots/run_file__encode_decode_utf8.bend.snap
@@ -0,0 +1,9 @@
+---
+source: tests/golden_tests.rs
+input_file: tests/golden_tests/run_file/encode_decode_utf8.bend
+---
+NumScott:
+([72, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140, 33], "Hello, 世界!")
+
+Scott:
+([72, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140, 33], "Hello, 世界!")