From 3d22efccca772bd4aff23f227cf5da6e93996eaa Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Sun, 8 Jan 2023 11:05:51 -0500 Subject: [PATCH] LibUnicode+LibJS: Propagate OOM from Unicode normalization --- Tests/LibUnicode/TestUnicodeNormalization.cpp | 86 +++++++++---------- .../LibJS/Runtime/StringPrototype.cpp | 2 +- Userland/Libraries/LibUnicode/Normalize.cpp | 65 ++++++-------- Userland/Libraries/LibUnicode/Normalize.h | 3 +- 4 files changed, 74 insertions(+), 82 deletions(-) diff --git a/Tests/LibUnicode/TestUnicodeNormalization.cpp b/Tests/LibUnicode/TestUnicodeNormalization.cpp index daef39d7652..115d16440e3 100644 --- a/Tests/LibUnicode/TestUnicodeNormalization.cpp +++ b/Tests/LibUnicode/TestUnicodeNormalization.cpp @@ -12,84 +12,84 @@ using namespace Unicode; TEST_CASE(normalize_nfd) { - EXPECT_EQ(normalize(""sv, NormalizationForm::NFD), ""sv); + EXPECT_EQ(MUST(normalize(""sv, NormalizationForm::NFD)), ""sv); - EXPECT_EQ(normalize("Hello"sv, NormalizationForm::NFD), "Hello"sv); + EXPECT_EQ(MUST(normalize("Hello"sv, NormalizationForm::NFD)), "Hello"sv); - EXPECT_EQ(normalize("Amélie"sv, NormalizationForm::NFD), "Ame\u0301lie"sv); + EXPECT_EQ(MUST(normalize("Amélie"sv, NormalizationForm::NFD)), "Ame\u0301lie"sv); - EXPECT_EQ(normalize("Office"sv, NormalizationForm::NFD), "Office"sv); + EXPECT_EQ(MUST(normalize("Office"sv, NormalizationForm::NFD)), "Office"sv); - EXPECT_EQ(normalize("\u1E9B\u0323"sv, NormalizationForm::NFD), "\u017F\u0323\u0307"sv); + EXPECT_EQ(MUST(normalize("\u1E9B\u0323"sv, NormalizationForm::NFD)), "\u017F\u0323\u0307"sv); - EXPECT_EQ(normalize("\u0112\u0300"sv, NormalizationForm::NFD), "\u0045\u0304\u0300"sv); + EXPECT_EQ(MUST(normalize("\u0112\u0300"sv, NormalizationForm::NFD)), "\u0045\u0304\u0300"sv); - EXPECT_EQ(normalize("\u03D3"sv, NormalizationForm::NFD), "\u03D2\u0301"sv); - EXPECT_EQ(normalize("\u03D4"sv, NormalizationForm::NFD), "\u03D2\u0308"sv); + EXPECT_EQ(MUST(normalize("\u03D3"sv, NormalizationForm::NFD)), 
"\u03D2\u0301"sv); + EXPECT_EQ(MUST(normalize("\u03D4"sv, NormalizationForm::NFD)), "\u03D2\u0308"sv); - EXPECT_EQ(normalize("닭"sv, NormalizationForm::NFD), "\u1103\u1161\u11B0"sv); - EXPECT_EQ(normalize("\u1100\uAC00\u11A8"sv, NormalizationForm::NFD), "\u1100\u1100\u1161\u11A8"sv); + EXPECT_EQ(MUST(normalize("닭"sv, NormalizationForm::NFD)), "\u1103\u1161\u11B0"sv); + EXPECT_EQ(MUST(normalize("\u1100\uAC00\u11A8"sv, NormalizationForm::NFD)), "\u1100\u1100\u1161\u11A8"sv); // Composition exclusions. - EXPECT_EQ(normalize("\u0958"sv, NormalizationForm::NFD), "\u0915\u093C"sv); - EXPECT_EQ(normalize("\u2126"sv, NormalizationForm::NFD), "\u03A9"sv); + EXPECT_EQ(MUST(normalize("\u0958"sv, NormalizationForm::NFD)), "\u0915\u093C"sv); + EXPECT_EQ(MUST(normalize("\u2126"sv, NormalizationForm::NFD)), "\u03A9"sv); } TEST_CASE(normalize_nfc) { - EXPECT_EQ(normalize(""sv, NormalizationForm::NFC), ""sv); + EXPECT_EQ(MUST(normalize(""sv, NormalizationForm::NFC)), ""sv); - EXPECT_EQ(normalize("Hello"sv, NormalizationForm::NFC), "Hello"sv); + EXPECT_EQ(MUST(normalize("Hello"sv, NormalizationForm::NFC)), "Hello"sv); - EXPECT_EQ(normalize("Office"sv, NormalizationForm::NFC), "Office"sv); + EXPECT_EQ(MUST(normalize("Office"sv, NormalizationForm::NFC)), "Office"sv); - EXPECT_EQ(normalize("\u1E9B\u0323"sv, NormalizationForm::NFC), "\u1E9B\u0323"sv); - EXPECT_EQ(normalize("\u0044\u0307"sv, NormalizationForm::NFC), "\u1E0A"sv); + EXPECT_EQ(MUST(normalize("\u1E9B\u0323"sv, NormalizationForm::NFC)), "\u1E9B\u0323"sv); + EXPECT_EQ(MUST(normalize("\u0044\u0307"sv, NormalizationForm::NFC)), "\u1E0A"sv); - EXPECT_EQ(normalize("\u0044\u0307\u0323"sv, NormalizationForm::NFC), "\u1E0C\u0307"sv); - EXPECT_EQ(normalize("\u0044\u0323\u0307"sv, NormalizationForm::NFC), "\u1E0C\u0307"sv); + EXPECT_EQ(MUST(normalize("\u0044\u0307\u0323"sv, NormalizationForm::NFC)), "\u1E0C\u0307"sv); + EXPECT_EQ(MUST(normalize("\u0044\u0323\u0307"sv, NormalizationForm::NFC)), "\u1E0C\u0307"sv); - 
EXPECT_EQ(normalize("\u0112\u0300"sv, NormalizationForm::NFC), "\u1E14"sv); - EXPECT_EQ(normalize("\u1E14\u0304"sv, NormalizationForm::NFC), "\u1E14\u0304"sv); + EXPECT_EQ(MUST(normalize("\u0112\u0300"sv, NormalizationForm::NFC)), "\u1E14"sv); + EXPECT_EQ(MUST(normalize("\u1E14\u0304"sv, NormalizationForm::NFC)), "\u1E14\u0304"sv); - EXPECT_EQ(normalize("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F"sv, NormalizationForm::NFC), "\u05B1\u05B8\u05B9\u0591\u05C3\u05B0\u05AC\u059F"sv); - EXPECT_EQ(normalize("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD"sv, NormalizationForm::NFC), "\u05B0\u05B7\u05BC\u05A5\u0592\u05C0\u05AD\u05C4"sv); + EXPECT_EQ(MUST(normalize("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F"sv, NormalizationForm::NFC)), "\u05B1\u05B8\u05B9\u0591\u05C3\u05B0\u05AC\u059F"sv); + EXPECT_EQ(MUST(normalize("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD"sv, NormalizationForm::NFC)), "\u05B0\u05B7\u05BC\u05A5\u0592\u05C0\u05AD\u05C4"sv); - EXPECT_EQ(normalize("\u03D3"sv, NormalizationForm::NFC), "\u03D3"sv); - EXPECT_EQ(normalize("\u03D4"sv, NormalizationForm::NFC), "\u03D4"sv); + EXPECT_EQ(MUST(normalize("\u03D3"sv, NormalizationForm::NFC)), "\u03D3"sv); + EXPECT_EQ(MUST(normalize("\u03D4"sv, NormalizationForm::NFC)), "\u03D4"sv); - EXPECT_EQ(normalize("\u0958"sv, NormalizationForm::NFC), "\u0915\u093C"sv); - EXPECT_EQ(normalize("\u2126"sv, NormalizationForm::NFC), "\u03A9"sv); + EXPECT_EQ(MUST(normalize("\u0958"sv, NormalizationForm::NFC)), "\u0915\u093C"sv); + EXPECT_EQ(MUST(normalize("\u2126"sv, NormalizationForm::NFC)), "\u03A9"sv); - EXPECT_EQ(normalize("\u1103\u1161\u11B0"sv, NormalizationForm::NFC), "닭"sv); - EXPECT_EQ(normalize("\u1100\uAC00\u11A8"sv, NormalizationForm::NFC), "\u1100\uAC01"sv); - EXPECT_EQ(normalize("\u1103\u1161\u11B0\u11B0"sv, NormalizationForm::NFC), "닭\u11B0"); + EXPECT_EQ(MUST(normalize("\u1103\u1161\u11B0"sv, NormalizationForm::NFC)), "닭"sv); + EXPECT_EQ(MUST(normalize("\u1100\uAC00\u11A8"sv, 
NormalizationForm::NFC)), "\u1100\uAC01"sv); + EXPECT_EQ(MUST(normalize("\u1103\u1161\u11B0\u11B0"sv, NormalizationForm::NFC)), "닭\u11B0"); } TEST_CASE(normalize_nfkd) { - EXPECT_EQ(normalize(""sv, NormalizationForm::NFKD), ""sv); + EXPECT_EQ(MUST(normalize(""sv, NormalizationForm::NFKD)), ""sv); - EXPECT_EQ(normalize("Office"sv, NormalizationForm::NFKD), "Office"sv); + EXPECT_EQ(MUST(normalize("Office"sv, NormalizationForm::NFKD)), "Office"sv); - EXPECT_EQ(normalize("¼"sv, NormalizationForm::NFKD), "1\u20444"sv); + EXPECT_EQ(MUST(normalize("¼"sv, NormalizationForm::NFKD)), "1\u20444"sv); - EXPECT_EQ(normalize("\u03D3"sv, NormalizationForm::NFKD), "\u03A5\u0301"sv); - EXPECT_EQ(normalize("\u03D4"sv, NormalizationForm::NFKD), "\u03A5\u0308"sv); + EXPECT_EQ(MUST(normalize("\u03D3"sv, NormalizationForm::NFKD)), "\u03A5\u0301"sv); + EXPECT_EQ(MUST(normalize("\u03D4"sv, NormalizationForm::NFKD)), "\u03A5\u0308"sv); - EXPECT_EQ(normalize("\u0958"sv, NormalizationForm::NFKD), "\u0915\u093C"sv); - EXPECT_EQ(normalize("\u2126"sv, NormalizationForm::NFKD), "\u03A9"sv); + EXPECT_EQ(MUST(normalize("\u0958"sv, NormalizationForm::NFKD)), "\u0915\u093C"sv); + EXPECT_EQ(MUST(normalize("\u2126"sv, NormalizationForm::NFKD)), "\u03A9"sv); - EXPECT_EQ(normalize("\uFDFA"sv, NormalizationForm::NFKD), "\u0635\u0644\u0649\u0020\u0627\u0644\u0644\u0647\u0020\u0639\u0644\u064A\u0647\u0020\u0648\u0633\u0644\u0645"sv); + EXPECT_EQ(MUST(normalize("\uFDFA"sv, NormalizationForm::NFKD)), "\u0635\u0644\u0649\u0020\u0627\u0644\u0644\u0647\u0020\u0639\u0644\u064A\u0647\u0020\u0648\u0633\u0644\u0645"sv); } TEST_CASE(normalize_nfkc) { - EXPECT_EQ(normalize(""sv, NormalizationForm::NFKC), ""sv); + EXPECT_EQ(MUST(normalize(""sv, NormalizationForm::NFKC)), ""sv); - EXPECT_EQ(normalize("\u03D3"sv, NormalizationForm::NFKC), "\u038E"sv); - EXPECT_EQ(normalize("\u03D4"sv, NormalizationForm::NFKC), "\u03AB"sv); + EXPECT_EQ(MUST(normalize("\u03D3"sv, NormalizationForm::NFKC)), "\u038E"sv); + 
EXPECT_EQ(MUST(normalize("\u03D4"sv, NormalizationForm::NFKC)), "\u03AB"sv); - EXPECT_EQ(normalize("\u0958"sv, NormalizationForm::NFKC), "\u0915\u093C"sv); - EXPECT_EQ(normalize("\u2126"sv, NormalizationForm::NFKC), "\u03A9"sv); + EXPECT_EQ(MUST(normalize("\u0958"sv, NormalizationForm::NFKC)), "\u0915\u093C"sv); + EXPECT_EQ(MUST(normalize("\u2126"sv, NormalizationForm::NFKC)), "\u03A9"sv); } diff --git a/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp b/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp index e0abc3aa5de..cfc260e3314 100644 --- a/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp +++ b/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp @@ -499,7 +499,7 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::normalize) // 6. Let ns be the String value that is the result of normalizing S into the normalization form named by f as specified in https://unicode.org/reports/tr15/. auto unicode_form = Unicode::normalization_form_from_string(form); - auto ns = Unicode::normalize(string, unicode_form); + auto ns = TRY_OR_THROW_OOM(vm, Unicode::normalize(string, unicode_form)); // 7. return ns. 
return PrimitiveString::create(vm, move(ns)); diff --git a/Userland/Libraries/LibUnicode/Normalize.cpp b/Userland/Libraries/LibUnicode/Normalize.cpp index a47aabb17f4..3d9fb6c1aa2 100644 --- a/Userland/Libraries/LibUnicode/Normalize.cpp +++ b/Userland/Libraries/LibUnicode/Normalize.cpp @@ -88,7 +88,7 @@ ALWAYS_INLINE static bool is_hangul_trailing(u32 code_point) } // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669 -static void decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points_output) +static ErrorOr<void> decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points_output) { auto const index = code_point - HANGUL_SYLLABLE_BASE; @@ -100,10 +100,12 @@ static void decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points auto const vowel_part = HANGUL_VOWEL_BASE + vowel_index; auto const trailing_part = HANGUL_TRAILING_BASE + trailing_index; - code_points_output.append(leading_part); - code_points_output.append(vowel_part); + TRY(code_points_output.try_append(leading_part)); + TRY(code_points_output.try_append(vowel_part)); if (trailing_index != 0) - code_points_output.append(trailing_part); + TRY(code_points_output.try_append(trailing_part)); + + return {}; } // L, V and LV, T Hangul Syllable Composition @@ -150,23 +152,23 @@ enum class UseCompatibility { No }; -static void decompose_code_point(u32 code_point, Vector<u32>& code_points_output, [[maybe_unused]] UseCompatibility use_compatibility) +static ErrorOr<void> decompose_code_point(u32 code_point, Vector<u32>& code_points_output, [[maybe_unused]] UseCompatibility use_compatibility) { - if (is_hangul_code_point(code_point)) { - decompose_hangul_code_point(code_point, code_points_output); - return; - } + if (is_hangul_code_point(code_point)) + return decompose_hangul_code_point(code_point, code_points_output); #if ENABLE_UNICODE_DATA auto const mapping = Unicode::code_point_decomposition(code_point); if (mapping.has_value() && (mapping->tag == CompatibilityFormattingTag::Canonical ||
use_compatibility == UseCompatibility::Yes)) { for (auto code_point : mapping->decomposition) { - decompose_code_point(code_point, code_points_output, use_compatibility); + TRY(decompose_code_point(code_point, code_points_output, use_compatibility)); } } else { - code_points_output.append(code_point); + TRY(code_points_output.try_append(code_point)); } #endif + + return {}; } // This can be any sorting algorithm that maintains order (like std::stable_sort), @@ -249,51 +251,43 @@ static void canonical_composition_algorithm(Vector<u32>& code_points) } } -static Vector<u32> normalize_nfd(Utf8View string) +static ErrorOr<Vector<u32>> normalize_nfd(Utf8View string) { Vector<u32> result; - - for (auto const code_point : string) { - decompose_code_point(code_point, result, UseCompatibility::No); - } + for (auto const code_point : string) + TRY(decompose_code_point(code_point, result, UseCompatibility::No)); canonical_ordering_algorithm(result); - return result; } -static Vector<u32> normalize_nfc(Utf8View string) +static ErrorOr<Vector<u32>> normalize_nfc(Utf8View string) { - auto result = normalize_nfd(string); - + auto result = TRY(normalize_nfd(string)); canonical_composition_algorithm(result); return result; } -static Vector<u32> normalize_nfkd(Utf8View string) +static ErrorOr<Vector<u32>> normalize_nfkd(Utf8View string) { Vector<u32> result; - - for (auto const code_point : string) { - decompose_code_point(code_point, result, UseCompatibility::Yes); - } + for (auto const code_point : string) + TRY(decompose_code_point(code_point, result, UseCompatibility::Yes)); canonical_ordering_algorithm(result); - return result; } -static Vector<u32> normalize_nfkc(Utf8View string) +static ErrorOr<Vector<u32>> normalize_nfkc(Utf8View string) { - auto result = normalize_nfkd(string); - + auto result = TRY(normalize_nfkd(string)); canonical_composition_algorithm(result); return result; } -static Vector<u32> normalize_implementation(Utf8View string, NormalizationForm form) +static ErrorOr<Vector<u32>> normalize_implementation(Utf8View string, NormalizationForm form) { switch
(form) { case NormalizationForm::NFD: @@ -308,16 +302,13 @@ static Vector<u32> normalize_implementation(Utf8View string, NormalizationForm f VERIFY_NOT_REACHED(); } -DeprecatedString normalize(StringView string, NormalizationForm form) +ErrorOr<DeprecatedString> normalize(StringView string, NormalizationForm form) { - Utf8View const view { string }; - - auto const code_points = normalize_implementation(view, form); + auto const code_points = TRY(normalize_implementation(Utf8View { string }, form)); StringBuilder builder; - for (auto code_point : code_points) { - builder.append_code_point(code_point); - } + for (auto code_point : code_points) + TRY(builder.try_append_code_point(code_point)); return builder.to_deprecated_string(); } diff --git a/Userland/Libraries/LibUnicode/Normalize.h b/Userland/Libraries/LibUnicode/Normalize.h index 981f6ed9667..c476490fdb1 100644 --- a/Userland/Libraries/LibUnicode/Normalize.h +++ b/Userland/Libraries/LibUnicode/Normalize.h @@ -7,6 +7,7 @@ #pragma once #include +#include #include #include #include @@ -28,6 +29,6 @@ enum class NormalizationForm { NormalizationForm normalization_form_from_string(StringView form); StringView normalization_form_to_string(NormalizationForm form); -[[nodiscard]] DeprecatedString normalize(StringView string, NormalizationForm form); +ErrorOr<DeprecatedString> normalize(StringView string, NormalizationForm form); }