LibUnicode: Add Punycode::decode

Author: https://github.com/skyrising Commit: https://github.com/SerenityOS/serenity/commit/299d35aadc Pull-request: https://github.com/SerenityOS/serenity/pull/19414 Reviewed-by: https://github.com/AtkinsSJ Reviewed-by: https://github.com/nico Reviewed-by: https://github.com/shannonbooth Reviewed-by: https://github.com/trflynn89
2024-12-28 05:35:52 +03:00 · 2023-06-14 13:56:11 +02:00 · 2023-06-14 13:56:11 +02:00 · 299d35aadc · 2024-07-17 09:37:30 +09:00
commit 299d35aadc
parent 37b5c05ec5
5 changed files with 230 additions and 0 deletions
--- a/Tests/LibUnicode/CMakeLists.txt
+++ b/Tests/LibUnicode/CMakeLists.txt
@ -1,5 +1,6 @@
 set(TEST_SOURCES
    TestEmoji.cpp
+    TestPunycode.cpp
    TestSegmentation.cpp
    TestUnicodeCharacterTypes.cpp
    TestUnicodeNormalization.cpp
--- a/Tests/LibUnicode/TestPunycode.cpp
+++ b/Tests/LibUnicode/TestPunycode.cpp
@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2023, Simon Wanner <simon@skyrising.xyz>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <LibTest/TestCase.h>
+
+#include <LibUnicode/Punycode.h>
+
+namespace Unicode::Punycode {
+
+// Table of CASE(decoded, encoded) pairs: the first argument is the Unicode text, the second its Punycode form.
+// The decode test below defines CASE so that decode(encoded) is expected to yield decoded.
+#define ENUMERATE_TEST_CASES                                                                                                              \
+    CASE(""sv, ""sv)                                                                                                                      \
+    CASE("Well hello friends!"sv, "Well hello friends!-"sv)                                                                               \
+    CASE("Well-hello-friends"sv, "Well-hello-friends-"sv)                                                                                 \
+    CASE("Wгellд-бhellбвo"sv, "Well-hello-friends"sv)                                                                                     \
+    CASE("Hallöchen Freunde!"sv, "Hallchen Freunde!-2zb"sv)                                                                               \
+    CASE("Nåväl hej vänner"sv, "Nvl hej vnner-cfbhg"sv)                                                                                   \
+    CASE("Ну привіт друзі"sv, "  -kjc9flsd9cjetgj5xg"sv)                                                                                  \
+    CASE("ليهمابتكلموشعربي؟"sv, "egbpdaj6bu4bxfgehfvwxn"sv)                                                                               \
+    CASE("他们为什么不说中文"sv, "ihqwcrb4cv8a8dqg056pqjye"sv)                                                                            \
+    CASE("他們爲什麽不說中文"sv, "ihqwctvzc91f659drss3x8bo0yb"sv)                                                                         \
+    CASE("Pročprostěnemluvíčesky"sv, "Proprostnemluvesky-uyb24dma41a"sv)                                                                  \
+    CASE("למההםפשוטלאמדבריםעברית"sv, "4dbcagdahymbxekheh6e0a7fei0b"sv)                                                                    \
+    CASE("यहलोगहिन्दीक्योंनहींबोलसकतेहैं"sv, "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"sv)                                                   \
+    CASE("なぜみんな日本語を話してくれないのか"sv, "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"sv)                                            \
+    CASE("세계의모든사람들이한국어를이해한다면얼마나좋을까"sv, "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c"sv) \
+    CASE("почемужеонинеговорятпорусски"sv, "b1abfaaepdrnnbgefbadotcwatmq2g4l"sv)                                                          \
+    CASE("PorquénopuedensimplementehablarenEspañol"sv, "PorqunopuedensimplementehablarenEspaol-fmd56a"sv)                                 \
+    CASE("TạisaohọkhôngthểchỉnóitiếngViệt"sv, "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"sv)                                           \
+    CASE("3年B組金八先生"sv, "3B-ww4c5e180e575a65lsy2b"sv)                                                                                \
+    CASE("安室奈美恵-with-SUPER-MONKEYS"sv, "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"sv)                                                 \
+    CASE("Hello-Another-Way-それぞれの場所"sv, "Hello-Another-Way--fc4qua05auwb3674vfr0b"sv)                                              \
+    CASE("ひとつ屋根の下2"sv, "2-u9tlzr9756bt3uc0v"sv)                                                                                    \
+    CASE("MajiでKoiする5秒前"sv, "MajiKoi5-783gue6qz075azm5e"sv)                                                                          \
+    CASE("パフィーdeルンバ"sv, "de-jg4avhby1noc0d"sv)                                                                                     \
+    CASE("そのスピードで"sv, "d9juau41awczczp"sv)                                                                                         \
+    CASE("-> $1.00 <-"sv, "-> $1.00 <--"sv)
+
+TEST_CASE(decode)
+{
+    // Every table entry must decode from its Punycode form to the expected Unicode text.
+#define CASE(expected_text, punycode) EXPECT_EQ(TRY_OR_FAIL(decode(punycode)), expected_text);
+    ENUMERATE_TEST_CASES
+#undef CASE
+
+    // Without a '-' delimiter the whole input is read as base-36 digits,
+    // so spaces, punctuation and non-ASCII bytes must make decoding fail.
+    auto const undelimited_ascii_result = decode("Well hello friends!"sv);
+    EXPECT(undelimited_ascii_result.is_error());
+
+    auto const non_ascii_result = decode("Nåväl hej vänner"sv);
+    EXPECT(non_ascii_result.is_error());
+}
+
+}
--- a/Userland/Libraries/LibUnicode/CMakeLists.txt
+++ b/Userland/Libraries/LibUnicode/CMakeLists.txt
@ -5,6 +5,7 @@ set(SOURCES
    CurrencyCode.cpp
    Emoji.cpp
    Normalize.cpp
+    Punycode.cpp
    Segmentation.cpp
    String.cpp
    UnicodeUtils.cpp
--- a/Userland/Libraries/LibUnicode/Punycode.cpp
+++ b/Userland/Libraries/LibUnicode/Punycode.cpp
@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2023, Simon Wanner <simon@skyrising.xyz>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Utf32View.h>
+#include <LibUnicode/Punycode.h>
+
+namespace Unicode::Punycode {
+
+// https://www.rfc-editor.org/rfc/rfc3492.html#section-5
+// Parameter values used by the Punycode routines in this file; the names mirror the RFC pseudocode.
+
+// Number of distinct digit values (a-z/A-Z map to 0..25, 0-9 map to 26..35).
+static constexpr u32 BASE = 36;
+// Lower and upper clamp for the per-digit threshold t computed while decoding.
+static constexpr u32 TMIN = 1;
+static constexpr u32 TMAX = 26;
+// Added to delta in the divisor of the final bias computation in adapt().
+static constexpr u32 SKEW = 38;
+// Divisor applied to the very first delta in adapt() (later deltas are halved instead).
+static constexpr u32 DAMP = 700;
+// Bias in effect before the first delta has been decoded.
+static constexpr u32 INITIAL_BIAS = 72;
+// Starting value of n, i.e. the first code point outside the ASCII range.
+static constexpr u32 INITIAL_N = 0x80;
+// Separates the literal basic code points from the encoded digits.
+static constexpr u32 DELIMITER = '-';
+
+// Maps a base-36 digit to its numeric value: letters (either case) are 0..25, decimal digits are 26..35.
+// Returns an empty Optional for any code point that is not a digit.
+static Optional<u32> digit_value_of_code_point(u32 code_point)
+{
+    constexpr u32 first_decimal_digit_value = 26;
+
+    auto const is_between = [code_point](u32 low, u32 high) {
+        return low <= code_point && code_point <= high;
+    };
+
+    if (is_between('0', '9'))
+        return code_point - '0' + first_decimal_digit_value;
+    if (is_between('a', 'z'))
+        return code_point - 'a';
+    if (is_between('A', 'Z'))
+        return code_point - 'A';
+
+    return {};
+}
+
+// https://www.rfc-editor.org/rfc/rfc3492.html#section-6.1
+// Bias adaptation: computes the bias to use for the next variable-length integer from the delta
+// that was just decoded, the number of code points handled so far, and whether it was the first delta.
+static u32 adapt(u32 delta, u32 num_points, bool first_time)
+{
+    constexpr u32 digit_span = BASE - TMIN;
+    constexpr u32 delta_limit = (digit_span * TMAX) / 2;
+
+    // The first delta is scaled down by DAMP, every later one is halved.
+    delta /= first_time ? DAMP : 2;
+
+    // Increase delta by its per-code-point share.
+    delta += delta / num_points;
+
+    // Shrink delta until it is within the limit, accumulating BASE for every division performed.
+    u32 k = 0;
+    for (; delta > delta_limit; k += BASE)
+        delta /= digit_span;
+
+    return k + (((digit_span + 1) * delta) / (delta + SKEW));
+}
+
+// https://www.rfc-editor.org/rfc/rfc3492.html#section-6.2
+// Decodes a Punycode string into the Unicode string it represents.
+//
+// `input` consists of the basic (ASCII) code points, followed - if any were encoded - by the last
+// '-' delimiter and the base-36 digits describing where the non-basic code points are inserted.
+//
+// Returns the decoded string, or an error if
+//  - a non-ASCII code point appears before the last delimiter,
+//  - a code point after the delimiter is not a base-36 digit,
+//  - the input ends in the middle of a variable-length integer,
+//  - one of the 32-bit computations overflows, or
+//  - growing the output or building the result string fails.
+ErrorOr<String> decode(StringView input)
+{
+    size_t consumed = 0;
+
+    // let n = initial_n
+    // NOTE: n is tracked with the same 32-bit width as the code points stored in `output`, so that
+    //       "fail on overflow" is actually detected instead of n being truncated when it is inserted.
+    Checked<u32> n = INITIAL_N;
+
+    // let i = 0
+    Checked<u32> i = 0;
+
+    // let bias = initial_bias
+    u32 bias = INITIAL_BIAS;
+
+    // let output = an empty string indexed from 0
+    Vector<u32> output;
+
+    // consume all code points before the last delimiter (if there is one)
+    //   and copy them to output, fail on any non-basic code point
+    Optional<size_t> last_delimiter_index = input.find_last(DELIMITER);
+    if (last_delimiter_index.has_value()) {
+        for (; consumed < last_delimiter_index.value(); consumed++) {
+            if (!is_ascii(input[consumed]))
+                return Error::from_string_literal("Unexpected non-basic code point");
+            TRY(output.try_append(input[consumed]));
+        }
+
+        // if more than zero code points were consumed then consume one more
+        //   (which will be the last delimiter)
+        if (last_delimiter_index.value() > 0) {
+            auto next = input[consumed++];
+            VERIFY(next == DELIMITER);
+        }
+    }
+
+    // while the input is not exhausted do begin
+    while (consumed < input.length()) {
+        // let oldi = i
+        Checked<u32> old_i = i;
+
+        // let w = 1
+        Checked<u32> w = 1;
+
+        // for k = base to infinity in steps of base do begin
+        for (size_t k = BASE;; k += BASE) {
+            // consume a code point, or fail if there was none to consume
+            if (consumed >= input.length())
+                return Error::from_string_literal("No more code points to consume");
+            auto code_point = input[consumed++];
+
+            // let digit = the code point's digit-value, fail if it has none
+            auto digit = digit_value_of_code_point(code_point);
+            if (!digit.has_value())
+                return Error::from_string_literal("Invalid base-36 digit");
+
+            // let i = i + digit * w, fail on overflow
+            i = i + Checked(digit.value()) * w;
+            if (i.has_overflow())
+                return Error::from_string_literal("Numeric overflow");
+
+            // let t = tmin if k <= bias {+ tmin}, or
+            //         tmax if k >= bias + tmax, or k - bias otherwise
+            u32 t = k <= bias ? TMIN : (k >= bias + TMAX ? TMAX : k - bias);
+
+            // if digit < t then break
+            if (digit.value() < t)
+                break;
+
+            // let w = w * (base - t), fail on overflow
+            w = w * Checked(BASE - t);
+            if (w.has_overflow())
+                return Error::from_string_literal("Numeric overflow");
+        }
+
+        // length(output) + 1, shared by the three steps below
+        auto const num_points = static_cast<u32>(output.size() + 1);
+
+        // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
+        bias = adapt((i - old_i).value(), num_points, !old_i);
+
+        // let n = n + i div (length(output) + 1), fail on overflow
+        n = n + Checked(i.value() / num_points);
+        if (n.has_overflow())
+            return Error::from_string_literal("Numeric overflow");
+
+        // let i = i mod (length(output) + 1)
+        i = i % Checked(num_points);
+
+        // {if n is a basic code point then fail}
+        // NOTE: The full statement enclosed in braces (checking whether n is a basic code point) can be omitted if initial_n exceeds all basic code points
+        //       (which is true for Punycode), because n is never less than initial_n.
+        VERIFY(!is_ascii(n.value()));
+
+        // insert n into output at position i
+        TRY(output.try_insert(i.value(), n.value()));
+
+        // increment i
+        i++;
+    }
+
+    StringBuilder builder;
+    TRY(builder.try_append(Utf32View(output.data(), output.size())));
+    return builder.to_string();
+}
+
+}
--- a/Userland/Libraries/LibUnicode/Punycode.h
+++ b/Userland/Libraries/LibUnicode/Punycode.h
@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Simon Wanner <simon@skyrising.xyz>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/String.h>
+
+namespace Unicode::Punycode {
+
+// Decodes a Punycode string (RFC 3492, section 6.2) into the Unicode string it represents.
+// Returns an error for malformed input: a non-ASCII code point before the last '-' delimiter,
+// an invalid base-36 digit, input that ends mid-number, or a numeric overflow while decoding.
+ErrorOr<String> decode(StringView);
+
+}