From 0053d48c41c6d1699c7580a964f4b6d51f3f7bdb Mon Sep 17 00:00:00 2001
From: Timothy Flynn
Date: Sun, 5 Sep 2021 14:39:20 -0400
Subject: [PATCH] LibUnicode: Implement locale-aware AFTER_I special casing

Add an optional locale parameter to to_unicode_lowercase_full() and
to_unicode_uppercase_full(). Locale-specific SpecialCasing entries are
now matched against the requested locale instead of being skipped, and
the AfterI condition is evaluated by scanning the code points that
precede the current one.

While scanning, a code point without Unicode data is treated as having
canonical combining class 0 instead of aborting the scan, so that an
uppercase I found later in the string is still recognized.
---
 .../LibUnicode/TestUnicodeCharacterTypes.cpp  | 43 ++++++++++++++
 .../Libraries/LibUnicode/CharacterTypes.cpp   | 58 +++++++++++++++----
 .../Libraries/LibUnicode/CharacterTypes.h     |  5 +-
 3 files changed, 94 insertions(+), 12 deletions(-)

diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
index 65aa4ca5d2e..eff0b892869 100644
--- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
+++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
@@ -155,6 +155,49 @@ TEST_CASE(to_unicode_lowercase_special_casing_sigma)
     EXPECT_EQ(result, "a\u180E\u03C3\u180Eb");
 }
 
+TEST_CASE(to_unicode_lowercase_special_casing_i)
+{
+    // LATIN CAPITAL LETTER I
+    auto result = Unicode::to_unicode_lowercase_full("I"sv, "en"sv);
+    EXPECT_EQ(result, "i"sv);
+
+    result = Unicode::to_unicode_lowercase_full("I"sv, "az"sv);
+    EXPECT_EQ(result, "\u0131"sv);
+
+    result = Unicode::to_unicode_lowercase_full("I"sv, "tr"sv);
+    EXPECT_EQ(result, "\u0131"sv);
+
+    // LATIN CAPITAL LETTER I WITH DOT ABOVE
+    result = Unicode::to_unicode_lowercase_full("\u0130"sv, "en"sv);
+    EXPECT_EQ(result, "\u0069\u0307"sv);
+
+    result = Unicode::to_unicode_lowercase_full("\u0130"sv, "az"sv);
+    EXPECT_EQ(result, "i"sv);
+
+    result = Unicode::to_unicode_lowercase_full("\u0130"sv, "tr"sv);
+    EXPECT_EQ(result, "i"sv);
+
+    // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE
+    result = Unicode::to_unicode_lowercase_full("I\u0307"sv, "en"sv);
+    EXPECT_EQ(result, "i\u0307"sv);
+
+    result = Unicode::to_unicode_lowercase_full("I\u0307"sv, "az"sv);
+    EXPECT_EQ(result, "i"sv);
+
+    result = Unicode::to_unicode_lowercase_full("I\u0307"sv, "tr"sv);
+    EXPECT_EQ(result, "i"sv);
+
+    // LATIN CAPITAL LETTER I followed by combining class 0 and COMBINING DOT ABOVE
+    result = Unicode::to_unicode_lowercase_full("IA\u0307"sv, "en"sv);
+    EXPECT_EQ(result, "ia\u0307"sv);
+
+    result = Unicode::to_unicode_lowercase_full("IA\u0307"sv, "az"sv);
+    EXPECT_EQ(result, "\u0131a\u0307"sv);
+
+    result = Unicode::to_unicode_lowercase_full("IA\u0307"sv, "tr"sv);
+    EXPECT_EQ(result, "\u0131a\u0307"sv);
+}
+
 TEST_CASE(to_unicode_uppercase_unconditional_special_casing)
 {
     // LATIN SMALL LETTER SHARP S
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
index dba0397601e..b566416c086 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 
 #if ENABLE_UNICODE_DATA
 #    include
@@ -22,6 +23,32 @@ namespace Unicode {
 
 #if ENABLE_UNICODE_DATA
 
+static bool is_after_uppercase_i(Utf8View const& string, size_t index)
+{
+    // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0.
+    auto preceding_view = string.substring_view(0, index);
+    bool found_uppercase_i = false;
+
+    // FIXME: Would be better if Utf8View supported reverse iteration.
+    for (auto code_point : preceding_view) {
+        if (code_point == 'I') {
+            found_uppercase_i = true;
+            continue;
+        }
+
+        // Code points lacking Unicode data default to combining class 0 rather than ending the scan.
+        u32 combining_class = 0;
+        if (auto unicode_data = Detail::unicode_data_for_code_point(code_point); unicode_data.has_value())
+            combining_class = unicode_data->canonical_combining_class;
+
+        // An intervening class 0 or 230 (Above) code point cancels the preceding I.
+        if (combining_class == 0 || combining_class == 230)
+            found_uppercase_i = false;
+    }
+
+    return found_uppercase_i;
+}
+
 static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
 {
     // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
@@ -62,19 +89,30 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
     return true;
 }
 
-static SpecialCasing const* find_matching_special_case(Utf8View const& string, size_t index, size_t byte_length, UnicodeData const& unicode_data)
+static SpecialCasing const* find_matching_special_case(Utf8View const& string, Optional locale, size_t index, size_t byte_length, UnicodeData const& unicode_data)
 {
+    auto requested_locale = Locale::None;
+
+    if (locale.has_value()) {
+        if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value())
+            requested_locale = *maybe_locale;
+    }
+
     for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
         auto const* special_casing = unicode_data.special_casing[i];
 
-        if ((special_casing->locale == Locale::None) && (special_casing->condition == Condition::None))
-            return special_casing;
-
-        // FIXME: Handle locale.
-        if (special_casing->locale != Locale::None)
+        if (special_casing->locale != Locale::None && special_casing->locale != requested_locale)
             continue;
 
         switch (special_casing->condition) {
+        case Condition::None:
+            return special_casing;
+
+        case Condition::AfterI:
+            if (is_after_uppercase_i(string, index))
+                return special_casing;
+            break;
+
         case Condition::FinalSigma:
             if (is_final_code_point(string, index, byte_length))
                 return special_casing;
@@ -114,7 +152,7 @@ u32 to_unicode_uppercase(u32 code_point)
 #endif
 }
 
-String to_unicode_lowercase_full(StringView const& string)
+String to_unicode_lowercase_full(StringView const& string, [[maybe_unused]] Optional locale)
 {
 #if ENABLE_UNICODE_DATA
     Utf8View view { string };
@@ -133,7 +171,7 @@ String to_unicode_lowercase_full(StringView const& string)
             continue;
         }
 
-        auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
+        auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
         if (!special_casing) {
             builder.append_code_point(unicode_data->simple_lowercase_mapping);
             continue;
@@ -149,7 +187,7 @@ String to_unicode_lowercase_full(StringView const& string)
 #endif
 }
 
-String to_unicode_uppercase_full(StringView const& string)
+String to_unicode_uppercase_full(StringView const& string, [[maybe_unused]] Optional locale)
 {
 #if ENABLE_UNICODE_DATA
     Utf8View view { string };
@@ -168,7 +206,7 @@ String to_unicode_uppercase_full(StringView const& string)
             continue;
         }
 
-        auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
+        auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
         if (!special_casing) {
             builder.append_code_point(unicode_data->simple_uppercase_mapping);
             continue;
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h
index 5651632f47b..62b34bf4134 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.h
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.h
@@ -7,6 +7,7 @@
 #pragma once
 
 #include
+#include
 #include
 #include
 #include
@@ -18,8 +19,8 @@ namespace Unicode {
 u32 to_unicode_lowercase(u32 code_point);
 u32 to_unicode_uppercase(u32 code_point);
 
-String to_unicode_lowercase_full(StringView const&);
-String to_unicode_uppercase_full(StringView const&);
+String to_unicode_lowercase_full(StringView const&, Optional locale = {});
+String to_unicode_uppercase_full(StringView const&, Optional locale = {});
 
 Optional general_category_from_string(StringView const&);
 bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);