diff --git a/Tests/LibUnicode/CMakeLists.txt b/Tests/LibUnicode/CMakeLists.txt
index 06e904c21a4..37d2a69020c 100644
--- a/Tests/LibUnicode/CMakeLists.txt
+++ b/Tests/LibUnicode/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TEST_SOURCES
     TestEmoji.cpp
+    TestPunycode.cpp
     TestSegmentation.cpp
     TestUnicodeCharacterTypes.cpp
     TestUnicodeNormalization.cpp
diff --git a/Tests/LibUnicode/TestPunycode.cpp b/Tests/LibUnicode/TestPunycode.cpp
new file mode 100644
index 00000000000..e66b2137e04
--- /dev/null
+++ b/Tests/LibUnicode/TestPunycode.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2023, Simon Wanner
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <LibTest/TestCase.h>
+
+#include <LibUnicode/Punycode.h>
+
+namespace Unicode::Punycode {
+
+#define ENUMERATE_TEST_CASES \
+    CASE(""sv, ""sv) \
+    CASE("Well hello friends!"sv, "Well hello friends!-"sv) \
+    CASE("Well-hello-friends"sv, "Well-hello-friends-"sv) \
+    CASE("Wгellд-бhellбвo"sv, "Well-hello-friends"sv) \
+    CASE("Hallöchen Freunde!"sv, "Hallchen Freunde!-2zb"sv) \
+    CASE("Nåväl hej vänner"sv, "Nvl hej vnner-cfbhg"sv) \
+    CASE("Ну привіт друзі"sv, "  -kjc9flsd9cjetgj5xg"sv) \
+    CASE("ليهمابتكلموشعربي؟"sv, "egbpdaj6bu4bxfgehfvwxn"sv) \
+    CASE("他们为什么不说中文"sv, "ihqwcrb4cv8a8dqg056pqjye"sv) \
+    CASE("他們爲什麽不說中文"sv, "ihqwctvzc91f659drss3x8bo0yb"sv) \
+    CASE("Pročprostěnemluvíčesky"sv, "Proprostnemluvesky-uyb24dma41a"sv) \
+    CASE("למההםפשוטלאמדבריםעברית"sv, "4dbcagdahymbxekheh6e0a7fei0b"sv) \
+    CASE("यहलोगहिन्दीक्योंनहींबोलसकतेहैं"sv, "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"sv) \
+    CASE("なぜみんな日本語を話してくれないのか"sv, "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"sv) \
+    CASE("세계의모든사람들이한국어를이해한다면얼마나좋을까"sv, "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c"sv) \
+    CASE("почемужеонинеговорятпорусски"sv, "b1abfaaepdrnnbgefbadotcwatmq2g4l"sv) \
+    CASE("PorquénopuedensimplementehablarenEspañol"sv, "PorqunopuedensimplementehablarenEspaol-fmd56a"sv) \
+    CASE("TạisaohọkhôngthểchỉnóitiếngViệt"sv, "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"sv) \
+    CASE("3年B組金八先生"sv, "3B-ww4c5e180e575a65lsy2b"sv) \
+    CASE("安室奈美恵-with-SUPER-MONKEYS"sv, "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"sv) \
+    CASE("Hello-Another-Way-それぞれの場所"sv, "Hello-Another-Way--fc4qua05auwb3674vfr0b"sv) \
+    CASE("ひとつ屋根の下2"sv, "2-u9tlzr9756bt3uc0v"sv) \
+    CASE("MajiでKoiする5秒前"sv, "MajiKoi5-783gue6qz075azm5e"sv) \
+    CASE("パフィーdeルンバ"sv, "de-jg4avhby1noc0d"sv) \
+    CASE("そのスピードで"sv, "d9juau41awczczp"sv) \
+    CASE("-> $1.00 <-"sv, "-> $1.00 <--"sv)
+
+TEST_CASE(decode)
+{
+#define CASE(a, b) EXPECT_EQ(TRY_OR_FAIL(decode(b)), a);
+    ENUMERATE_TEST_CASES
+#undef CASE
+    EXPECT(decode("Well hello friends!"sv).is_error());
+    EXPECT(decode("Nåväl hej vänner"sv).is_error());
+}
+
+}
diff --git a/Userland/Libraries/LibUnicode/CMakeLists.txt b/Userland/Libraries/LibUnicode/CMakeLists.txt
index 54c99ceebb4..57269720a30 100644
--- a/Userland/Libraries/LibUnicode/CMakeLists.txt
+++ b/Userland/Libraries/LibUnicode/CMakeLists.txt
@@ -5,6 +5,7 @@ set(SOURCES
     CurrencyCode.cpp
     Emoji.cpp
     Normalize.cpp
+    Punycode.cpp
     Segmentation.cpp
     String.cpp
     UnicodeUtils.cpp
diff --git a/Userland/Libraries/LibUnicode/Punycode.cpp b/Userland/Libraries/LibUnicode/Punycode.cpp
new file mode 100644
index 00000000000..bead3638991
--- /dev/null
+++ b/Userland/Libraries/LibUnicode/Punycode.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2023, Simon Wanner
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Utf32View.h>
+#include <LibUnicode/Punycode.h>
+
+namespace Unicode::Punycode {
+
+// https://www.rfc-editor.org/rfc/rfc3492.html#section-5
+static constexpr u32 BASE = 36;
+static constexpr u32 TMIN = 1;
+static constexpr u32 TMAX = 26;
+static constexpr u32 SKEW = 38;
+static constexpr u32 DAMP = 700;
+static constexpr u32 INITIAL_BIAS = 72;
+static constexpr u32 INITIAL_N = 0x80;
+static constexpr u32 DELIMITER = '-';
+
+static Optional<u32> digit_value_of_code_point(u32 code_point)
+{
+    if (code_point >= 'A' && code_point <= 'Z')
+        return code_point - 'A';
+    if (code_point >= 'a' && code_point <= 'z')
+        return code_point - 'a';
+    if (code_point >= '0' && code_point <= '9')
+        return code_point - '0' + 26;
+    return {};
+}
+
+// https://www.rfc-editor.org/rfc/rfc3492.html#section-6.1
+static u32 adapt(u32 delta, u32 num_points, bool first_time)
+{
+    // if firsttime then let delta = delta div damp
+    if (first_time)
+        delta = delta / DAMP;
+    // else let delta = delta div 2
+    else
+        delta = delta / 2;
+
+    // let delta = delta + (delta div numpoints)
+    delta = delta + (delta / num_points);
+
+    // let k = 0
+    u32 k = 0;
+
+    // while delta > ((base - tmin) * tmax) div 2 do begin
+    while (delta > ((BASE - TMIN) * TMAX) / 2) {
+        // let delta = delta div (base - tmin)
+        delta = delta / (BASE - TMIN);
+
+        // let k = k + base
+        k = k + BASE;
+    }
+
+    // return k + (((base - tmin + 1) * delta) div (delta + skew))
+    return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
+}
+
+// https://www.rfc-editor.org/rfc/rfc3492.html#section-6.2
+ErrorOr<String> decode(StringView input)
+{
+    size_t consumed = 0;
+
+    // let n = initial_n
+    Checked<size_t> n = INITIAL_N;
+
+    // let i = 0
+    Checked<u32> i = 0;
+
+    // let bias = initial_bias
+    u32 bias = INITIAL_BIAS;
+
+    // let output = an empty string indexed from 0
+    Vector<u32> output;
+
+    // consume all code points before the last delimiter (if there is one)
+    // and copy them to output, fail on any non-basic code point
+    Optional<size_t> last_delimiter_index = input.find_last(DELIMITER);
+    if (last_delimiter_index.has_value()) {
+        for (; consumed < last_delimiter_index.value(); consumed++) {
+            if (!is_ascii(input[consumed]))
+                return Error::from_string_literal("Unexpected non-basic code point");
+            TRY(output.try_append(input[consumed]));
+        }
+
+        // if more than zero code points were consumed then consume one more
+        // (which will be the last delimiter)
+        if (last_delimiter_index.value() > 0) {
+            auto next = input[consumed++];
+            VERIFY(next == DELIMITER);
+        }
+    }
+
+    // while the input is not exhausted do begin
+    while (consumed < input.length()) {
+        // let oldi = i
+        Checked<u32> old_i = i;
+
+        // let w = 1
+        Checked<u32> w = 1;
+
+        // for k = base to infinity in steps of base do begin
+        for (size_t k = BASE;; k += BASE) {
+            // consume a code point, or fail if there was none to consume
+            if (consumed >= input.length())
+                return Error::from_string_literal("No more code points to consume");
+            auto code_point = input[consumed++];
+
+            // let digit = the code point's digit-value, fail if it has none
+            auto digit = digit_value_of_code_point(code_point);
+            if (!digit.has_value())
+                return Error::from_string_literal("Invalid base-36 digit");
+
+            // let i = i + digit * w, fail on overflow
+            i = i + Checked<u32>(digit.value()) * w;
+            if (i.has_overflow())
+                return Error::from_string_literal("Numeric overflow");
+
+            // let t = tmin if k <= bias {+ tmin}, or
+            // tmax if k >= bias + tmax, or k - bias otherwise
+            u32 t = k <= bias ? TMIN : (k >= bias + TMAX ? TMAX : k - bias);
+
+            // if digit < t then break
+            if (digit.value() < t)
+                break;
+
+            // let w = w * (base - t), fail on overflow
+            w = w * Checked<u32>(BASE - t);
+            if (w.has_overflow())
+                return Error::from_string_literal("Numeric overflow");
+        }
+        // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
+        bias = adapt((i - old_i).value(), output.size() + 1, !old_i);
+
+        // let n = n + i div (length(output) + 1), fail on overflow
+        n = n + Checked<size_t>(static_cast<size_t>(i.value() / static_cast<u32>(output.size() + 1)));
+        if (n.has_overflow())
+            return Error::from_string_literal("Numeric overflow");
+
+        // let i = i mod (length(output) + 1)
+        i = i % Checked<u32>(static_cast<u32>(output.size() + 1));
+
+        // {if n is a basic code point then fail}
+        // NOTE: The full statement enclosed in braces (checking whether n is a basic code point) can be omitted if initial_n exceeds all basic code points
+        // (which is true for Punycode), because n is never less than initial_n.
+        VERIFY(!is_ascii(n.value()));
+
+        // insert n into output at position i
+        TRY(output.try_insert(i.value(), n.value()));
+
+        // increment i
+        i++;
+    }
+
+    StringBuilder builder;
+    TRY(builder.try_append(Utf32View(output.data(), output.size())));
+    return builder.to_string();
+}
+
+}
diff --git a/Userland/Libraries/LibUnicode/Punycode.h b/Userland/Libraries/LibUnicode/Punycode.h
new file mode 100644
index 00000000000..e866ca03b22
--- /dev/null
+++ b/Userland/Libraries/LibUnicode/Punycode.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Simon Wanner
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/String.h>
+
+namespace Unicode::Punycode {
+
+ErrorOr<String> decode(StringView);
+
+}