From cfd0a60863b4361976de2e1eabb8ab140be02d34 Mon Sep 17 00:00:00 2001
From: Simon Wanner
Date: Wed, 14 Jun 2023 15:24:30 +0200
Subject: [PATCH] LibUnicode: Add Punycode::encode

---
NOTE(review): this copy of the patch was restored from a whitespace-collapsed
extract in which every angle-bracketed span had been dropped (the author's
e-mail address, the #include targets, and all template arguments). Line breaks
and indentation were rebuilt so that each hunk matches the line counts in its
header and in the diffstat. The #include paths and the template arguments
(Optional<u32>, Vector<u32>, Checked<size_t>, ErrorOr<String>,
static_cast<size_t>) are reconstructed from how the values are used; confirm
them against upstream commit cfd0a60863b4361976de2e1eabb8ab140be02d34 before
applying. The author's e-mail address was not recoverable and is left out.

 Tests/LibUnicode/TestPunycode.cpp          |   7 ++
 Userland/Libraries/LibUnicode/Punycode.cpp | 123 +++++++++++++++++++++
 Userland/Libraries/LibUnicode/Punycode.h   |   2 +
 3 files changed, 132 insertions(+)

diff --git a/Tests/LibUnicode/TestPunycode.cpp b/Tests/LibUnicode/TestPunycode.cpp
index e66b2137e04..0da2cfffb9f 100644
--- a/Tests/LibUnicode/TestPunycode.cpp
+++ b/Tests/LibUnicode/TestPunycode.cpp
@@ -47,4 +47,11 @@ TEST_CASE(decode)
     EXPECT(decode("Nåväl hej vänner"sv).is_error());
 }
 
+TEST_CASE(encode)
+{
+#define CASE(a, b) EXPECT_EQ(TRY_OR_FAIL(encode(a)), b);
+    ENUMERATE_TEST_CASES
+#undef CASE
+}
+
 }
diff --git a/Userland/Libraries/LibUnicode/Punycode.cpp b/Userland/Libraries/LibUnicode/Punycode.cpp
index bead3638991..cf82ea1930c 100644
--- a/Userland/Libraries/LibUnicode/Punycode.cpp
+++ b/Userland/Libraries/LibUnicode/Punycode.cpp
@@ -5,6 +5,7 @@
  */
 
 #include <AK/Utf32View.h>
+#include <AK/Utf8View.h>
 #include <LibUnicode/Punycode.h>
 
 namespace Unicode::Punycode {
@@ -30,6 +31,14 @@ static Optional<u32> digit_value_of_code_point(u32 code_point)
     return {};
 }
 
+static u32 code_point_value_of_digit(u32 digit)
+{
+    VERIFY(digit < 36);
+    if (digit <= 25)
+        return 'a' + digit;
+    return '0' + digit - 26;
+}
+
 // https://www.rfc-editor.org/rfc/rfc3492.html#section-6.1
 static u32 adapt(u32 delta, u32 num_points, bool first_time)
 {
@@ -160,4 +169,118 @@ ErrorOr<String> decode(StringView input)
     return builder.to_string();
 }
 
+static Optional<u32> find_smallest_code_point_greater_than_or_equal(Utf32View code_points, u32 threshold)
+{
+    Optional<u32> result;
+    for (auto code_point : code_points) {
+        if (code_point >= threshold && (!result.has_value() || code_point < result.value()))
+            result = code_point;
+    }
+    return result;
+}
+
+ErrorOr<String> encode(StringView input)
+{
+    Vector<u32> code_points;
+    for (auto code_point : Utf8View(input))
+        TRY(code_points.try_append(code_point));
+    return encode(Utf32View(code_points.data(), code_points.size()));
+}
+
+// https://www.rfc-editor.org/rfc/rfc3492.html#section-6.3
+ErrorOr<String> encode(Utf32View input)
+{
+    Vector<u32> output;
+
+    // let n = initial_n
+    Checked<size_t> n = INITIAL_N;
+
+    // let delta = 0
+    Checked<size_t> delta = 0;
+
+    // let bias = initial_bias
+    u32 bias = INITIAL_BIAS;
+
+    // let h = b = the number of basic code points in the input
+    // copy them to the output in order, followed by a delimiter if b > 0
+    size_t b = 0;
+    for (auto code_point : input) {
+        if (is_ascii(code_point)) {
+            TRY(output.try_append(code_point));
+            b++;
+        }
+    }
+    auto h = b;
+    if (b > 0)
+        TRY(output.try_append(DELIMITER));
+
+    // while h < length(input) do begin
+    while (h < input.length()) {
+        // let m = the minimum {non-basic} code point >= n in the input
+        auto m = find_smallest_code_point_greater_than_or_equal(input, n.value());
+        VERIFY(m.has_value());
+
+        // let delta = delta + (m - n) * (h + 1), fail on overflow
+        delta = delta + (Checked(static_cast<size_t>(m.value())) - n) * Checked(h + 1);
+        if (delta.has_overflow())
+            return Error::from_string_literal("Numeric overflow");
+
+        // let n = m
+        n = m.value();
+
+        // for each code point c in the input (in order) do begin
+        for (auto c : input) {
+            // if c < n {or c is basic} then increment delta, fail on overflow
+            if (c < n.value()) {
+                delta++;
+                if (delta.has_overflow())
+                    return Error::from_string_literal("Numeric overflow");
+            }
+
+            // if c == n then begin
+            if (c == n.value()) {
+                // let q = delta
+                auto q = delta.value();
+
+                // for k = base to infinity in steps of base do begin
+                for (size_t k = BASE;; k += BASE) {
+                    // let t = tmin if k <= bias {+ tmin}, or
+                    // tmax if k >= bias + tmax, or k - bias otherwise
+                    u32 t = k <= bias ? TMIN : (k >= bias + TMAX ? TMAX : k - bias);
+
+                    // if q < t then break
+                    if (q < t)
+                        break;
+
+                    // output the code point for digit t + ((q - t) mod (base - t))
+                    auto digit = t + ((q - t) % (BASE - t));
+                    TRY(output.try_append(code_point_value_of_digit(digit)));
+
+                    // let q = (q - t) div (base - t)
+                    q = (q - t) / (BASE - t);
+                }
+                // output the code point for digit q
+                TRY(output.try_append(code_point_value_of_digit(q)));
+
+                // let bias = adapt(delta, h + 1, test h equals b?)
+                bias = adapt(delta.value(), h + 1, h == b);
+
+                // let delta = 0
+                delta = 0;
+
+                // increment h
+                h++;
+            }
+        }
+
+        // increment delta and n
+        delta++;
+        n++;
+    }
+
+    StringBuilder builder;
+    TRY(builder.try_append(Utf32View(output.data(), output.size())));
+    return builder.to_string();
+}
+
 }
diff --git a/Userland/Libraries/LibUnicode/Punycode.h b/Userland/Libraries/LibUnicode/Punycode.h
index e866ca03b22..bfc0981a3e3 100644
--- a/Userland/Libraries/LibUnicode/Punycode.h
+++ b/Userland/Libraries/LibUnicode/Punycode.h
@@ -11,5 +11,7 @@
 namespace Unicode::Punycode {
 
 ErrorOr<String> decode(StringView);
+ErrorOr<String> encode(StringView);
+ErrorOr<String> encode(Utf32View);
 
 }