From bc51017a03087057dc8e8f437b4049f2ab7ebba1 Mon Sep 17 00:00:00 2001
From: Timothy Flynn
Date: Mon, 16 Jan 2023 11:22:01 -0500
Subject: [PATCH] LibUnicode: Support full case folding for titlecasing a string

Unicode declares that to titlecase a string, the first cased code point
after each word boundary should be transformed to its titlecase mapping.
All other codepoints are transformed to their lowercase mapping.
---
Reviewer note (not part of the commit message): in this copy of the patch the
seven "#include" lines of the CharacterTypes.h hunk and the address in the
"From:" header have no target/value. They are kept as found and must be taken
from the upstream commit before applying. Template arguments such as
ErrorOr<String> were re-derived from the surrounding code - TODO confirm.

 .../LibUnicode/TestUnicodeCharacterTypes.cpp  | 93 +++++++++++++++++++
 .../Libraries/LibUnicode/CharacterTypes.cpp   |  7 ++
 .../Libraries/LibUnicode/CharacterTypes.h     |  2 +
 .../Libraries/LibUnicode/UnicodeUtils.cpp     | 62 +++++++++++++
 Userland/Libraries/LibUnicode/UnicodeUtils.h  |  1 +
 5 files changed, 165 insertions(+)

diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
index dd6f5c4654e..971461ba8ef 100644
--- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
+++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
@@ -74,6 +74,27 @@ TEST_CASE(to_unicode_titlecase)
     EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c9u), 0x01c8u); // "lj" to "Lj"
     EXPECT_EQ(Unicode::to_unicode_titlecase(0x01ccu), 0x01cbu); // "nj" to "Nj"
     EXPECT_EQ(Unicode::to_unicode_titlecase(0x01f3u), 0x01f2u); // "dz" to "Dz"
+
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(""sv)), ""sv);
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" "sv)), " "sv);
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" - "sv)), " - "sv);
+
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("a"sv)), "A"sv);
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("A"sv)), "A"sv);
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" a"sv)), " A"sv);
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("a "sv)), "A "sv);
+
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("ab"sv)), "Ab"sv);
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("Ab"sv)), "Ab"sv);
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("aB"sv)), "Ab"sv);
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("AB"sv)), "Ab"sv);
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" ab"sv)), " Ab"sv);
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("ab "sv)), "Ab "sv);
+
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo bar baz"sv)), "Foo Bar Baz"sv);
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo \n \r bar \t baz"sv)), "Foo \n \r Bar \t Baz"sv);
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("f\"oo\" b'ar'"sv)), "F\"Oo\" B'Ar'"sv);
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("123dollars"sv)), "123Dollars"sv);
 }
 
 TEST_CASE(to_unicode_lowercase_unconditional_special_casing)
@@ -382,6 +403,78 @@ TEST_CASE(to_unicode_uppercase_special_casing_soft_dotted)
     EXPECT_EQ(result, "J"sv);
 }
 
+TEST_CASE(to_unicode_titlecase_unconditional_special_casing)
+{
+    // LATIN SMALL LETTER SHARP S
+    auto result = MUST(Unicode::to_unicode_titlecase_full("\u00DF"sv));
+    EXPECT_EQ(result, "\u0053\u0073"sv);
+
+    // LATIN CAPITAL LETTER I WITH DOT ABOVE
+    result = MUST(Unicode::to_unicode_titlecase_full("\u0130"sv));
+    EXPECT_EQ(result, "\u0130"sv);
+
+    // LATIN SMALL LIGATURE FF
+    result = MUST(Unicode::to_unicode_titlecase_full("\uFB00"sv));
+    EXPECT_EQ(result, "\u0046\u0066"sv);
+
+    // LATIN SMALL LIGATURE FI
+    result = MUST(Unicode::to_unicode_titlecase_full("\uFB01"sv));
+    EXPECT_EQ(result, "\u0046\u0069"sv);
+
+    // LATIN SMALL LIGATURE FL
+    result = MUST(Unicode::to_unicode_titlecase_full("\uFB02"sv));
+    EXPECT_EQ(result, "\u0046\u006C"sv);
+
+    // LATIN SMALL LIGATURE FFI
+    result = MUST(Unicode::to_unicode_titlecase_full("\uFB03"sv));
+    EXPECT_EQ(result, "\u0046\u0066\u0069"sv);
+
+    // LATIN SMALL LIGATURE FFL
+    result = MUST(Unicode::to_unicode_titlecase_full("\uFB04"sv));
+    EXPECT_EQ(result, "\u0046\u0066\u006C"sv);
+
+    // LATIN SMALL LIGATURE LONG S T
+    result = MUST(Unicode::to_unicode_titlecase_full("\uFB05"sv));
+    EXPECT_EQ(result, "\u0053\u0074"sv);
+
+    // LATIN SMALL LIGATURE ST
+    result = MUST(Unicode::to_unicode_titlecase_full("\uFB06"sv));
+    EXPECT_EQ(result, "\u0053\u0074"sv);
+
+    // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+    result = MUST(Unicode::to_unicode_titlecase_full("\u0390"sv));
+    EXPECT_EQ(result, "\u0399\u0308\u0301"sv);
+
+    // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+    result = MUST(Unicode::to_unicode_titlecase_full("\u03B0"sv));
+    EXPECT_EQ(result, "\u03A5\u0308\u0301"sv);
+
+    // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
+    result = MUST(Unicode::to_unicode_titlecase_full("\u1FB7"sv));
+    EXPECT_EQ(result, "\u0391\u0342\u0345"sv);
+
+    // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
+    result = MUST(Unicode::to_unicode_titlecase_full("\u1FC7"sv));
+    EXPECT_EQ(result, "\u0397\u0342\u0345"sv);
+
+    // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
+    result = MUST(Unicode::to_unicode_titlecase_full("\u1FF7"sv));
+    EXPECT_EQ(result, "\u03A9\u0342\u0345"sv);
+}
+
+TEST_CASE(to_unicode_titlecase_special_casing_i)
+{
+    // LATIN SMALL LETTER I
+    auto result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "en"sv));
+    EXPECT_EQ(result, "I"sv);
+
+    result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "az"sv));
+    EXPECT_EQ(result, "\u0130"sv);
+
+    result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "tr"sv));
+    EXPECT_EQ(result, "\u0130"sv);
+}
+
 TEST_CASE(general_category)
 {
     auto general_category = [](StringView name) {
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
index 4ab8b9a6915..3f1b62b95c4 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -57,6 +57,13 @@ ErrorOr<DeprecatedString> to_unicode_uppercase_full(StringView string, Optional<
     return builder.to_deprecated_string();
 }
 
+ErrorOr<String> to_unicode_titlecase_full(StringView string, Optional<StringView> const& locale)
+{
+    StringBuilder builder;
+    TRY(Detail::build_titlecase_string(Utf8View { string }, builder, locale));
+    return builder.to_string();
+}
+
 Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
 bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
 Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h
index 04ce644d301..43f3c8f6e9a 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.h
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.h
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -42,6 +43,7 @@ u32 to_unicode_titlecase(u32 code_point);
 
 ErrorOr<DeprecatedString> to_unicode_lowercase_full(StringView, Optional<StringView> const& locale = {});
 ErrorOr<DeprecatedString> to_unicode_uppercase_full(StringView, Optional<StringView> const& locale = {});
+ErrorOr<String> to_unicode_titlecase_full(StringView, Optional<StringView> const& locale = {});
 
 Optional<GeneralCategory> general_category_from_string(StringView);
 bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);
diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp
index 992122690ac..e8c03a0fb67 100644
--- a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp
+++ b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp
@@ -249,4 +249,66 @@ ErrorOr<void> build_uppercase_string([[maybe_unused]] Utf8View code_points, [[ma
 #endif
 }
 
+ErrorOr<void> build_titlecase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
+{
+#if ENABLE_UNICODE_DATA
+    // toTitlecase(X): Find the word boundaries in X according to Unicode Standard Annex #29,
+    // “Unicode Text Segmentation.” For each word boundary, find the first cased character F following
+    // the word boundary. If F exists, map F to Titlecase_Mapping(F); then map all characters C between
+    // F and the following word boundary to Lowercase_Mapping(C).
+
+    auto boundaries = find_word_segmentation_boundaries(code_points);
+    if (boundaries.is_empty())
+        return {};
+
+    auto first_cased_code_point_after_boundary = [&](auto boundary, auto next_boundary) -> Optional<Utf8CodePointIterator> {
+        auto it = code_points.iterator_at_byte_offset_without_validation(boundary);
+        auto end = code_points.iterator_at_byte_offset_without_validation(next_boundary);
+
+        for (; it != end; ++it) {
+            if (code_point_has_property(*it, Property::Cased))
+                return it;
+        }
+
+        return {};
+    };
+
+    auto append_code_point_as_titlecase = [&](auto code_point, auto code_point_offset, auto code_point_length) -> ErrorOr<void> {
+        auto const* special_casing = find_matching_special_case(code_point, code_points, locale, code_point_offset, code_point_length);
+        if (!special_casing) {
+            TRY(builder.try_append_code_point(to_unicode_titlecase(code_point)));
+            return {};
+        }
+
+        for (size_t i = 0; i < special_casing->titlecase_mapping_size; ++i)
+            TRY(builder.try_append_code_point(special_casing->titlecase_mapping[i]));
+        return {};
+    };
+
+    for (size_t i = 0; i < boundaries.size() - 1; ++i) {
+        auto boundary = boundaries[i];
+        auto next_boundary = boundaries[i + 1];
+
+        if (auto it = first_cased_code_point_after_boundary(boundary, next_boundary); it.has_value()) {
+            auto code_point = *it.value();
+            auto code_point_offset = code_points.byte_offset_of(*it);
+            auto code_point_length = it->underlying_code_point_length_in_bytes();
+
+            auto caseless_code_points = code_points.substring_view(boundary, code_point_offset - boundary);
+            TRY(builder.try_append(caseless_code_points.as_string()));
+
+            TRY(append_code_point_as_titlecase(code_point, code_point_offset, code_point_length));
+            boundary = code_point_offset + code_point_length;
+        }
+
+        auto substring_to_lowercase = code_points.substring_view(boundary, next_boundary - boundary);
+        TRY(build_lowercase_string(substring_to_lowercase, builder, locale));
+    }
+
+    return {};
+#else
+    return Error::from_string_literal("Unicode data has been disabled");
+#endif
+}
+
 }
diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.h b/Userland/Libraries/LibUnicode/UnicodeUtils.h
index 1770c385a72..5e9bcbf2a70 100644
--- a/Userland/Libraries/LibUnicode/UnicodeUtils.h
+++ b/Userland/Libraries/LibUnicode/UnicodeUtils.h
@@ -16,5 +16,6 @@ namespace Unicode::Detail {
 
 ErrorOr<void> build_lowercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
 ErrorOr<void> build_uppercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
+ErrorOr<void> build_titlecase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
 
 }