Add utf8 and ascii encoding functions

This commit is contained in:
imaqtkatt 2024-06-18 18:41:33 -03:00
parent 78f0968281
commit cc60c926a8
3 changed files with 130 additions and 0 deletions

View File

@ -175,3 +175,120 @@ hvm to_u24:
# Casts any native number to an i24.
# Declared with the `hvm` keyword rather than as an ordinary equation.
# NOTE(review): presumably the `[i24]` operation converts the argument and `ret` carries the result back - confirm against the HVM documentation.
hvm to_i24:
($([i24] ret) ret)
# Encoding
# Constants shared by the UTF-8 encoding and decoding functions below.
# Code point U+10FFFF. Not referenced by the functions visible in this section.
Utf8/MAX_RUNE = '\u{0010FFFF}'
# Code point U+FFFD, returned by Utf8/decode_rune when it rejects a lead byte.
Utf8/RUNE_ERROR = '\u{FFFD}'
# The two definitions below are disabled (commented out).
#Utf8/RUNE_SELF = 0x80
#Utf8/MAX = 4
# Marker bits: String/encode_utf8 ORs t2/t3/t4 into the lead byte of a 2/3/4-byte
# sequence and tx into every following byte. t1 is not referenced in this section.
Utf8/t1 = 0b00000000
Utf8/tx = 0b10000000
Utf8/t2 = 0b11000000
Utf8/t3 = 0b11100000
Utf8/t4 = 0b11110000
# Payload masks: maskx keeps the low 6 bits of a following byte; mask2/mask3/mask4
# keep the payload bits of a 2/3/4-byte lead byte in Utf8/decode_rune.
Utf8/maskx = 0b00111111
Utf8/mask2 = 0b00011111
Utf8/mask3 = 0b00001111
Utf8/mask4 = 0b00000111
# Thresholds used by String/encode_utf8 to choose the sequence length:
# values up to 0x7F take 1 byte, up to 0x7FF take 2, up to 0xFFFF take 3.
Utf8/rune1max = (- (<< 1 7) 1)
Utf8/rune2max = (- (<< 1 11) 1)
Utf8/rune3max = (- (<< 1 16) 1)
# Returns the given list with `elem` placed after its last element.
# Walks the whole list, so the cost grows with the list length.
Encoding/List/append elem (List/Cons first others) = (List/Cons first (Encoding/List/append elem others))
Encoding/List/append elem (List/Nil) = (List/Cons elem List/Nil)
# Returns the given string with the character `ch` placed after its last character.
# Walks the whole string, so the cost grows with the string length.
String/append ch (String/Cons first others) = (String/Cons first (String/append ch others))
String/append ch (String/Nil) = (String/Cons ch String/Nil)
# Decodes a list of UTF-8 bytes into a string.
#
# bytes: the bytes still to be decoded.
# acc:   the string decoded so far; callers normally pass "".
# Returns `acc` followed by one character per rune produced by Utf8/decode_rune.
# The length reported by Utf8/decode_rune is ignored.
#
# An empty byte list returns `acc` unchanged. Previously the empty case was
# forwarded to Utf8/decode_rune, whose (0, 0, []) result added a NUL character.
# Each rune is added with String/append, which walks the accumulated string.
Bytes/decode_utf8 (List/Nil) acc = acc
Bytes/decode_utf8 (List/Cons b bs) acc =
  let (got, _len, rest) = (Utf8/decode_rune (List/Cons b bs))
  (Bytes/decode_utf8 rest (String/append got acc))
# Decodes the first UTF-8 encoded code point of a byte list.
#
# Returns a tuple (rune, len, rest):
#   rune: the decoded code point, or Utf8/RUNE_ERROR when the lead byte is rejected.
#   len:  number of bytes of the decoded sequence (1 to 4); 0 for empty input
#         and for rejected input.
#   rest: the bytes left to decode.
#
# A lead byte is rejected when it is not a valid 1/2/3/4-byte lead or when the
# list is too short for the sequence it announces. In that case only the lead
# byte is dropped, so the bytes after it are still decoded by the caller.
# Following bytes are masked with Utf8/maskx but not validated, and overlong
# encodings are accepted.
Utf8/decode_rune [] = (0, 0, [])
Utf8/decode_rune [a] =
  if (<= a 0x7F) {
    (a, 1, [])
  } else {
    (Utf8/RUNE_ERROR, 0, [])
  }
Utf8/decode_rune [a, b] =
  if (<= a 0x7F) {
    (a, 1, [b])
  } else {
    if (== (& a 0xE0) 0xC0) {
      let r = (| (<< (& a Utf8/mask2) 6) (& b Utf8/maskx))
      (r, 2, [])
    } else {
      (Utf8/RUNE_ERROR, 0, [b])
    }
  }
Utf8/decode_rune [a, b, c] =
  if (<= a 0x7F) {
    (a, 1, [b, c])
  } else {
    if (== (& a 0xE0) 0xC0) {
      let r = (| (<< (& a Utf8/mask2) 6) (& b Utf8/maskx))
      (r, 2, [c])
    } else {
      if (== (& a 0xF0) 0xE0) {
        let r = (| (<< (& a Utf8/mask3) 12) (| (<< (& b Utf8/maskx) 6) (& c Utf8/maskx)))
        (r, 3, [])
      } else {
        (Utf8/RUNE_ERROR, 0, [b, c])
      }
    }
  }
Utf8/decode_rune (List/Cons a (List/Cons b (List/Cons c (List/Cons d rest)))) =
  if (<= a 0x7F) {
    (a, 1, (List/Cons b (List/Cons c (List/Cons d rest))))
  } else {
    if (== (& a 0xE0) 0xC0) {
      let r = (| (<< (& a Utf8/mask2) 6) (& b Utf8/maskx))
      (r, 2, (List/Cons c (List/Cons d rest)))
    } else {
      if (== (& a 0xF0) 0xE0) {
        let r = (| (<< (& a Utf8/mask3) 12) (| (<< (& b Utf8/maskx) 6) (& c Utf8/maskx)))
        (r, 3, (List/Cons d rest))
      } else {
        if (== (& a 0xF8) 0xF0) {
          # The remainder after a 4-byte sequence is `rest`, not the empty list.
          let r = (| (<< (& a Utf8/mask4) 18) (| (<< (& b Utf8/maskx) 12) (| (<< (& c Utf8/maskx) 6) (& d Utf8/maskx))))
          (r, 4, rest)
        } else {
          (Utf8/RUNE_ERROR, 0, (List/Cons b (List/Cons c (List/Cons d rest))))
        }
      }
    }
  }
# Encodes a string as a list of UTF-8 bytes.
#
# The first argument is the string still to be encoded.
# acc: the bytes produced so far; callers normally pass [].
# Returns `acc` followed by the UTF-8 bytes of every character, in order.
#
# The sequence length is chosen from Utf8/rune1max, Utf8/rune2max and
# Utf8/rune3max. A character above Utf8/MAX_RUNE cannot be represented in
# four bytes (its lead byte would leave the 11110xxx pattern), so it is
# encoded as Utf8/RUNE_ERROR instead. Surrogate code points are not rejected.
# Bytes are added with Encoding/List/append, which walks the accumulated list.
String/encode_utf8 (String/Nil) acc = acc
String/encode_utf8 (String/Cons x xs) acc =
  if (<= x Utf8/rune1max) {
    (String/encode_utf8 xs (Encoding/List/append x acc))
  } else {
    if (<= x Utf8/rune2max) {
      let b1 = (| Utf8/t2 (>> x 6))
      let b2 = (| Utf8/tx (& x Utf8/maskx))
      (String/encode_utf8 xs (Encoding/List/append b2 (Encoding/List/append b1 acc)))
    } else {
      if (<= x Utf8/rune3max) {
        let b1 = (| Utf8/t3 (>> x 12))
        let b2 = (| Utf8/tx (& (>> x 6) Utf8/maskx))
        let b3 = (| Utf8/tx (& x Utf8/maskx))
        (String/encode_utf8 xs (Encoding/List/append b3 (Encoding/List/append b2 (Encoding/List/append b1 acc))))
      } else {
        if (<= x Utf8/MAX_RUNE) {
          let b1 = (| Utf8/t4 (>> x 18))
          let b2 = (| Utf8/tx (& (>> x 12) Utf8/maskx))
          let b3 = (| Utf8/tx (& (>> x 6) Utf8/maskx))
          let b4 = (| Utf8/tx (& x Utf8/maskx))
          (String/encode_utf8 xs (Encoding/List/append b4 (Encoding/List/append b3 (Encoding/List/append b2 (Encoding/List/append b1 acc)))))
        } else {
          # Out of range: re-run this position with the replacement character,
          # which fits the 3-byte branch above.
          (String/encode_utf8 (String/Cons Utf8/RUNE_ERROR xs) acc)
        }
      }
    }
  }
# Turns a list of bytes into a string, using each byte value directly as a character.
# No range check is performed on the byte values.
Bytes/decode_ascii (List/Nil) = String/Nil
Bytes/decode_ascii (List/Cons byte remaining) = (String/Cons byte (Bytes/decode_ascii remaining))
# Turns a string into a list of numbers, using each character value directly as a byte.
# No range check is performed on the character values.
String/encode_ascii (String/Nil) = List/Nil
String/encode_ascii (String/Cons ch remaining) = (List/Cons ch (String/encode_ascii remaining))

View File

@ -0,0 +1,4 @@
# Golden test: encodes a mixed ASCII/CJK string to UTF-8 and decodes the same
# byte sequence back, returning both results as a pair.
def main:
  use text = "Hello, 世界!"
  # 228 184 150 and 231 149 140 are the 3-byte encodings of the two CJK characters.
  use utf8_bytes = [72, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140, 33]
  return (String/encode_utf8(text, []), Bytes/decode_utf8(utf8_bytes, []))

View File

@ -0,0 +1,9 @@
---
source: tests/golden_tests.rs
input_file: tests/golden_tests/run_file/encode_decode_utf8.bend
---
NumScott:
([72, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140, 33], "Hello, 世界!")
Scott:
([72, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140, 33], "Hello, 世界!")