diff --git a/Tests/AK/TestString.cpp b/Tests/AK/TestString.cpp index e48bb9c6bef..d5253bb40eb 100644 --- a/Tests/AK/TestString.cpp +++ b/Tests/AK/TestString.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2022, Andreas Kling + * Copyright (c) 2021, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -15,6 +16,7 @@ #include #include #include +#include TEST_CASE(construct_empty) { @@ -263,65 +265,439 @@ TEST_CASE(reverse) test_reverse("ab😀cd"sv, "dc😀ba"sv); } -TEST_CASE(to_lowercase) +TEST_CASE(to_lowercase_unconditional_special_casing) { - { - auto string = "Aa"_string; - auto result = MUST(string.to_lowercase()); - EXPECT_EQ(result, "aa"sv); - } - { - auto string = "Ωω"_string; - auto result = MUST(string.to_lowercase()); - EXPECT_EQ(result, "ωω"sv); - } - { - auto string = "İi̇"_string; - auto result = MUST(string.to_lowercase()); - EXPECT_EQ(result, "i̇i̇"sv); - } + // LATIN SMALL LETTER SHARP S + auto result = MUST("\u00DF"_string.to_lowercase()); + EXPECT_EQ(result, "\u00DF"); + + // LATIN CAPITAL LETTER I WITH DOT ABOVE + result = MUST("\u0130"_string.to_lowercase()); + EXPECT_EQ(result, "\u0069\u0307"); + + // LATIN SMALL LIGATURE FF + result = MUST("\uFB00"_string.to_lowercase()); + EXPECT_EQ(result, "\uFB00"); + + // LATIN SMALL LIGATURE FI + result = MUST("\uFB01"_string.to_lowercase()); + EXPECT_EQ(result, "\uFB01"); + + // LATIN SMALL LIGATURE FL + result = MUST("\uFB02"_string.to_lowercase()); + EXPECT_EQ(result, "\uFB02"); + + // LATIN SMALL LIGATURE FFI + result = MUST("\uFB03"_string.to_lowercase()); + EXPECT_EQ(result, "\uFB03"); + + // LATIN SMALL LIGATURE FFL + result = MUST("\uFB04"_string.to_lowercase()); + EXPECT_EQ(result, "\uFB04"); + + // LATIN SMALL LIGATURE LONG S T + result = MUST("\uFB05"_string.to_lowercase()); + EXPECT_EQ(result, "\uFB05"); + + // LATIN SMALL LIGATURE ST + result = MUST("\uFB06"_string.to_lowercase()); + EXPECT_EQ(result, "\uFB06"); + + // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI + result = MUST("\u1FB7"_string.to_lowercase()); + EXPECT_EQ(result, "\u1FB7"); + + // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI + result = MUST("\u1FC7"_string.to_lowercase()); + EXPECT_EQ(result, "\u1FC7"); + + // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI + result = MUST("\u1FF7"_string.to_lowercase()); + EXPECT_EQ(result, "\u1FF7"); } -TEST_CASE(to_uppercase) +TEST_CASE(to_lowercase_special_casing_sigma) { - { - auto string = "Aa"_string; - auto result = MUST(string.to_uppercase()); - EXPECT_EQ(result, "AA"sv); - } - { - auto string = "Ωω"_string; - auto result = MUST(string.to_uppercase()); - EXPECT_EQ(result, "ΩΩ"sv); - } - { - auto string = "ʼn"_string; - auto result = MUST(string.to_uppercase()); - EXPECT_EQ(result, "ʼN"sv); - } + auto result = MUST("ABCI"_string.to_lowercase()); + EXPECT_EQ(result, "abci"); + + // Sigma preceded by A + result = MUST("A\u03A3"_string.to_lowercase()); + EXPECT_EQ(result, "a\u03C2"); + + // Sigma preceded by FEMININE ORDINAL INDICATOR + result = MUST("\u00AA\u03A3"_string.to_lowercase()); + EXPECT_EQ(result, "\u00AA\u03C2"); + + // Sigma preceded by ROMAN NUMERAL ONE + result = MUST("\u2160\u03A3"_string.to_lowercase()); + EXPECT_EQ(result, "\u2170\u03C2"); + + // Sigma preceded by COMBINING GREEK YPOGEGRAMMENI + result = MUST("\u0345\u03A3"_string.to_lowercase()); + EXPECT_EQ(result, "\u0345\u03C3"); + + // Sigma preceded by A and FULL STOP + result = MUST("A.\u03A3"_string.to_lowercase()); + EXPECT_EQ(result, "a.\u03C2"); + + // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR + result = MUST("A\u180E\u03A3"_string.to_lowercase()); + EXPECT_EQ(result, "a\u180E\u03C2"); + + // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by B + result = MUST("A\u180E\u03A3B"_string.to_lowercase()); + EXPECT_EQ(result, "a\u180E\u03C3b"); + + // Sigma followed by A + result = MUST("\u03A3A"_string.to_lowercase()); + EXPECT_EQ(result, "\u03C3a"); + + // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR + result = MUST("A\u03A3\u180E"_string.to_lowercase()); + EXPECT_EQ(result, "a\u03C2\u180E"); + + // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR and B + result = MUST("A\u03A3\u180EB"_string.to_lowercase()); + EXPECT_EQ(result, "a\u03C3\u180Eb"); + + // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR + result = MUST("A\u180E\u03A3\u180E"_string.to_lowercase()); + EXPECT_EQ(result, "a\u180E\u03C2\u180E"); + + // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR and B + result = MUST("A\u180E\u03A3\u180EB"_string.to_lowercase()); + EXPECT_EQ(result, "a\u180E\u03C3\u180Eb"); +} + +TEST_CASE(to_lowercase_special_casing_i) +{ + // LATIN CAPITAL LETTER I + auto result = MUST("I"_string.to_lowercase("en"sv)); + EXPECT_EQ(result, "i"sv); + + result = MUST("I"_string.to_lowercase("az"sv)); + EXPECT_EQ(result, "\u0131"sv); + + result = MUST("I"_string.to_lowercase("tr"sv)); + EXPECT_EQ(result, "\u0131"sv); + + // LATIN CAPITAL LETTER I WITH DOT ABOVE + result = MUST("\u0130"_string.to_lowercase("en"sv)); + EXPECT_EQ(result, "\u0069\u0307"sv); + + result = MUST("\u0130"_string.to_lowercase("az"sv)); + EXPECT_EQ(result, "i"sv); + + result = MUST("\u0130"_string.to_lowercase("tr"sv)); + EXPECT_EQ(result, "i"sv); + + // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE + result = MUST("I\u0307"_string.to_lowercase("en"sv)); + EXPECT_EQ(result, "i\u0307"sv); + + result = MUST("I\u0307"_string.to_lowercase("az"sv)); + EXPECT_EQ(result, "i"sv); + + result = MUST("I\u0307"_string.to_lowercase("tr"sv)); + EXPECT_EQ(result, "i"sv); + + // LATIN CAPITAL LETTER I followed by combining class 0 and COMBINING DOT ABOVE + result = MUST("IA\u0307"_string.to_lowercase("en"sv)); + EXPECT_EQ(result, "ia\u0307"sv); + + result = MUST("IA\u0307"_string.to_lowercase("az"sv)); + EXPECT_EQ(result, "\u0131a\u0307"sv); + + result = MUST("IA\u0307"_string.to_lowercase("tr"sv)); + EXPECT_EQ(result, "\u0131a\u0307"sv); +} + +TEST_CASE(to_lowercase_special_casing_more_above) +{ + // LATIN CAPITAL LETTER I + auto result = MUST("I"_string.to_lowercase("en"sv)); + EXPECT_EQ(result, "i"sv); + + result = MUST("I"_string.to_lowercase("lt"sv)); + EXPECT_EQ(result, "i"sv); + + // LATIN CAPITAL LETTER J + result = MUST("J"_string.to_lowercase("en"sv)); + EXPECT_EQ(result, "j"sv); + + result = MUST("J"_string.to_lowercase("lt"sv)); + EXPECT_EQ(result, "j"sv); + + // LATIN CAPITAL LETTER I WITH OGONEK + result = MUST("\u012e"_string.to_lowercase("en"sv)); + EXPECT_EQ(result, "\u012f"sv); + + result = MUST("\u012e"_string.to_lowercase("lt"sv)); + EXPECT_EQ(result, "\u012f"sv); + + // LATIN CAPITAL LETTER I followed by COMBINING GRAVE ACCENT + result = MUST("I\u0300"_string.to_lowercase("en"sv)); + EXPECT_EQ(result, "i\u0300"sv); + + result = MUST("I\u0300"_string.to_lowercase("lt"sv)); + EXPECT_EQ(result, "i\u0307\u0300"sv); + + // LATIN CAPITAL LETTER J followed by COMBINING GRAVE ACCENT + result = MUST("J\u0300"_string.to_lowercase("en"sv)); + EXPECT_EQ(result, "j\u0300"sv); + + result = MUST("J\u0300"_string.to_lowercase("lt"sv)); + EXPECT_EQ(result, "j\u0307\u0300"sv); + + // LATIN CAPITAL LETTER I WITH OGONEK followed by COMBINING GRAVE ACCENT + result = MUST("\u012e\u0300"_string.to_lowercase("en"sv)); + EXPECT_EQ(result, "\u012f\u0300"sv); + + result = MUST("\u012e\u0300"_string.to_lowercase("lt"sv)); + EXPECT_EQ(result, "\u012f\u0307\u0300"sv); +} + +TEST_CASE(to_lowercase_special_casing_not_before_dot) +{ + // LATIN CAPITAL LETTER I + auto result = MUST("I"_string.to_lowercase("en"sv)); + EXPECT_EQ(result, "i"sv); + + result = MUST("I"_string.to_lowercase("az"sv)); + EXPECT_EQ(result, "\u0131"sv); + + result = MUST("I"_string.to_lowercase("tr"sv)); + EXPECT_EQ(result, "\u0131"sv); + + // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE + result = MUST("I\u0307"_string.to_lowercase("en"sv)); + EXPECT_EQ(result, "i\u0307"sv); + + result = MUST("I\u0307"_string.to_lowercase("az"sv)); + EXPECT_EQ(result, "i"sv); + + result = MUST("I\u0307"_string.to_lowercase("tr"sv)); + EXPECT_EQ(result, "i"sv); +} + +TEST_CASE(to_uppercase_unconditional_special_casing) +{ + // LATIN SMALL LETTER SHARP S + auto result = MUST("\u00DF"_string.to_uppercase()); + EXPECT_EQ(result, "\u0053\u0053"); + + // LATIN CAPITAL LETTER I WITH DOT ABOVE + result = MUST("\u0130"_string.to_uppercase()); + EXPECT_EQ(result, "\u0130"); + + // LATIN SMALL LIGATURE FF + result = MUST("\uFB00"_string.to_uppercase()); + EXPECT_EQ(result, "\u0046\u0046"); + + // LATIN SMALL LIGATURE FI + result = MUST("\uFB01"_string.to_uppercase()); + EXPECT_EQ(result, "\u0046\u0049"); + + // LATIN SMALL LIGATURE FL + result = MUST("\uFB02"_string.to_uppercase()); + EXPECT_EQ(result, "\u0046\u004C"); + + // LATIN SMALL LIGATURE FFI + result = MUST("\uFB03"_string.to_uppercase()); + EXPECT_EQ(result, "\u0046\u0046\u0049"); + + // LATIN SMALL LIGATURE FFL + result = MUST("\uFB04"_string.to_uppercase()); + EXPECT_EQ(result, "\u0046\u0046\u004C"); + + // LATIN SMALL LIGATURE LONG S T + result = MUST("\uFB05"_string.to_uppercase()); + EXPECT_EQ(result, "\u0053\u0054"); + + // LATIN SMALL LIGATURE ST + result = MUST("\uFB06"_string.to_uppercase()); + EXPECT_EQ(result, "\u0053\u0054"); + + // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + result = MUST("\u0390"_string.to_uppercase()); + EXPECT_EQ(result, "\u0399\u0308\u0301"); + + // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + result = MUST("\u03B0"_string.to_uppercase()); + EXPECT_EQ(result, "\u03A5\u0308\u0301"); + + // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI + result = MUST("\u1FB7"_string.to_uppercase()); + EXPECT_EQ(result, "\u0391\u0342\u0399"); + + // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI + result = MUST("\u1FC7"_string.to_uppercase()); + EXPECT_EQ(result, "\u0397\u0342\u0399"); + + // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI + result = MUST("\u1FF7"_string.to_uppercase()); + EXPECT_EQ(result, "\u03A9\u0342\u0399"); +} + +TEST_CASE(to_uppercase_special_casing_soft_dotted) +{ + // LATIN SMALL LETTER I + auto result = MUST("i"_string.to_uppercase("en"sv)); + EXPECT_EQ(result, "I"sv); + + result = MUST("i"_string.to_uppercase("lt"sv)); + EXPECT_EQ(result, "I"sv); + + // LATIN SMALL LETTER J + result = MUST("j"_string.to_uppercase("en"sv)); + EXPECT_EQ(result, "J"sv); + + result = MUST("j"_string.to_uppercase("lt"sv)); + EXPECT_EQ(result, "J"sv); + + // LATIN SMALL LETTER I followed by COMBINING DOT ABOVE + result = MUST("i\u0307"_string.to_uppercase("en"sv)); + EXPECT_EQ(result, "I\u0307"sv); + + result = MUST("i\u0307"_string.to_uppercase("lt"sv)); + EXPECT_EQ(result, "I"sv); + + // LATIN SMALL LETTER J followed by COMBINING DOT ABOVE + result = MUST("j\u0307"_string.to_uppercase("en"sv)); + EXPECT_EQ(result, "J\u0307"sv); + + result = MUST("j\u0307"_string.to_uppercase("lt"sv)); + EXPECT_EQ(result, "J"sv); } TEST_CASE(to_titlecase) { - { - auto string = "foo bar baz"_string; - auto result = MUST(string.to_titlecase()); - EXPECT_EQ(result, "Foo Bar Baz"sv); + EXPECT_EQ(MUST(""_string.to_titlecase()), ""sv); + EXPECT_EQ(MUST(" "_string.to_titlecase()), " "sv); + EXPECT_EQ(MUST(" - "_string.to_titlecase()), " - "sv); + + EXPECT_EQ(MUST("a"_string.to_titlecase()), "A"sv); + EXPECT_EQ(MUST("A"_string.to_titlecase()), "A"sv); + EXPECT_EQ(MUST(" a"_string.to_titlecase()), " A"sv); + EXPECT_EQ(MUST("a "_string.to_titlecase()), "A "sv); + + EXPECT_EQ(MUST("ab"_string.to_titlecase()), "Ab"sv); + EXPECT_EQ(MUST("Ab"_string.to_titlecase()), "Ab"sv); + EXPECT_EQ(MUST("aB"_string.to_titlecase()), "Ab"sv); + EXPECT_EQ(MUST("AB"_string.to_titlecase()), "Ab"sv); + EXPECT_EQ(MUST(" ab"_string.to_titlecase()), " Ab"sv); + EXPECT_EQ(MUST("ab "_string.to_titlecase()), "Ab "sv); + + EXPECT_EQ(MUST("foo bar baz"_string.to_titlecase()), "Foo Bar Baz"sv); + EXPECT_EQ(MUST("foo \n \r bar \t baz"_string.to_titlecase()), "Foo \n \r Bar \t Baz"sv); + EXPECT_EQ(MUST("f\"oo\" b'ar'"_string.to_titlecase()), "F\"Oo\" B'ar'"sv); + EXPECT_EQ(MUST("123dollars"_string.to_titlecase()), "123Dollars"sv); +} + +TEST_CASE(to_casefold) +{ + for (u8 code_point = 0; code_point < 0x80; ++code_point) { + auto ascii = tolower(code_point); + auto unicode = MUST(MUST(String::from_utf8({ reinterpret_cast(&code_point), 1 })).to_casefold()); + + EXPECT_EQ(unicode.bytes_as_string_view().length(), 1u); + EXPECT_EQ(unicode.bytes_as_string_view()[0], ascii); } - { - auto string = "foo \n \r bar \t baz"_string; - auto result = MUST(string.to_titlecase()); - EXPECT_EQ(result, "Foo \n \r Bar \t Baz"sv); - } - { - auto string = "f\"oo\" b'ar'"_string; - auto result = MUST(string.to_titlecase()); - EXPECT_EQ(result, "F\"Oo\" B'ar'"sv); - } - { - auto string = "123dollars"_string; - auto result = MUST(string.to_titlecase()); - EXPECT_EQ(result, "123Dollars"sv); + + // LATIN SMALL LETTER SHARP S + auto result = MUST("\u00DF"_string.to_casefold()); + EXPECT_EQ(result, "\u0073\u0073"sv); + + // GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI + result = MUST("\u1FB3"_string.to_casefold()); + EXPECT_EQ(result, "\u03B1\u03B9"sv); + + // GREEK SMALL LETTER ALPHA WITH PERISPOMENI + result = MUST("\u1FB6"_string.to_casefold()); + EXPECT_EQ(result, "\u03B1\u0342"sv); + + // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI + result = MUST("\u1FB7"_string.to_casefold()); + EXPECT_EQ(result, "\u03B1\u0342\u03B9"sv); +} + +TEST_CASE(to_titlecase_unconditional_special_casing) +{ + // LATIN SMALL LETTER SHARP S + auto result = MUST("\u00DF"_string.to_titlecase()); + EXPECT_EQ(result, "\u0053\u0073"sv); + + // LATIN CAPITAL LETTER I WITH DOT ABOVE + result = MUST("\u0130"_string.to_titlecase()); + EXPECT_EQ(result, "\u0130"sv); + + // LATIN SMALL LIGATURE FF + result = MUST("\uFB00"_string.to_titlecase()); + EXPECT_EQ(result, "\u0046\u0066"sv); + + // LATIN SMALL LIGATURE FI + result = MUST("\uFB01"_string.to_titlecase()); + EXPECT_EQ(result, "\u0046\u0069"sv); + + // LATIN SMALL LIGATURE FL + result = MUST("\uFB02"_string.to_titlecase()); + EXPECT_EQ(result, "\u0046\u006C"sv); + + // LATIN SMALL LIGATURE FFI + result = MUST("\uFB03"_string.to_titlecase()); + EXPECT_EQ(result, "\u0046\u0066\u0069"sv); + + // LATIN SMALL LIGATURE FFL + result = MUST("\uFB04"_string.to_titlecase()); + EXPECT_EQ(result, "\u0046\u0066\u006C"sv); + + // LATIN SMALL LIGATURE LONG S T + result = MUST("\uFB05"_string.to_titlecase()); + EXPECT_EQ(result, "\u0053\u0074"sv); + + // LATIN SMALL LIGATURE ST + result = MUST("\uFB06"_string.to_titlecase()); + EXPECT_EQ(result, "\u0053\u0074"sv); + + // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + result = MUST("\u0390"_string.to_titlecase()); + EXPECT_EQ(result, "\u0399\u0308\u0301"sv); + + // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + result = MUST("\u03B0"_string.to_titlecase()); + EXPECT_EQ(result, "\u03A5\u0308\u0301"sv); + + // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI + result = MUST("\u1FB7"_string.to_titlecase()); + EXPECT_EQ(result, "\u0391\u0342\u0345"sv); + + // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI + result = MUST("\u1FC7"_string.to_titlecase()); + EXPECT_EQ(result, "\u0397\u0342\u0345"sv); + + // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI + result = MUST("\u1FF7"_string.to_titlecase()); + EXPECT_EQ(result, "\u03A9\u0342\u0345"sv); +} + +TEST_CASE(to_titlecase_special_casing_i) +{ + // LATIN SMALL LETTER I + auto result = MUST("i"_string.to_titlecase("en"sv)); + EXPECT_EQ(result, "I"sv); + + result = MUST("i"_string.to_titlecase("az"sv)); + EXPECT_EQ(result, "\u0130"sv); + + result = MUST("i"_string.to_titlecase("tr"sv)); + EXPECT_EQ(result, "\u0130"sv); +} + +BENCHMARK_CASE(casefold) +{ + for (size_t i = 0; i < 50'000; ++i) { + __test_to_casefold(); } } diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp index 9edd7a93480..eddfb260ebb 100644 --- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp +++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp @@ -74,54 +74,6 @@ TEST_CASE(to_unicode_titlecase) EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c9u), 0x01c8u); // "lj" to "Lj" EXPECT_EQ(Unicode::to_unicode_titlecase(0x01ccu), 0x01cbu); // "nj" to "Nj" EXPECT_EQ(Unicode::to_unicode_titlecase(0x01f3u), 0x01f2u); // "dz" to "Dz" - - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(""sv)), ""sv); - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" "sv)), " "sv); - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" - "sv)), " - "sv); - - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("a"sv)), "A"sv); - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("A"sv)), "A"sv); - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" a"sv)), " A"sv); - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("a "sv)), "A "sv); - - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("ab"sv)), "Ab"sv); - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("Ab"sv)), "Ab"sv); - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("aB"sv)), "Ab"sv); - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("AB"sv)), "Ab"sv); - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" ab"sv)), " Ab"sv); - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("ab "sv)), "Ab "sv); - - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo bar baz"sv)), "Foo Bar Baz"sv); - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo \n \r bar \t baz"sv)), "Foo \n \r Bar \t Baz"sv); - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("f\"oo\" b'ar'"sv)), "F\"Oo\" B'ar'"sv); - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("123dollars"sv)), "123Dollars"sv); -} - -TEST_CASE(to_unicode_casefold) -{ - for (u8 code_point = 0; code_point < 0x80; ++code_point) { - auto ascii = tolower(code_point); - auto unicode = MUST(Unicode::to_unicode_casefold_full({ reinterpret_cast(&code_point), 1 })); - - EXPECT_EQ(unicode.bytes_as_string_view().length(), 1u); - EXPECT_EQ(unicode.bytes_as_string_view()[0], ascii); - } - - // LATIN SMALL LETTER SHARP S - auto result = MUST(Unicode::to_unicode_casefold_full("\u00DF"sv)); - EXPECT_EQ(result, "\u0073\u0073"sv); - - // GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI - result = MUST(Unicode::to_unicode_casefold_full("\u1FB3"sv)); - EXPECT_EQ(result, "\u03B1\u03B9"sv); - - // GREEK SMALL LETTER ALPHA WITH PERISPOMENI - result = MUST(Unicode::to_unicode_casefold_full("\u1FB6"sv)); - EXPECT_EQ(result, "\u03B1\u0342"sv); - - // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI - result = MUST(Unicode::to_unicode_casefold_full("\u1FB7"sv)); - EXPECT_EQ(result, "\u03B1\u0342\u03B9"sv); } BENCHMARK_CASE(casing) @@ -130,388 +82,9 @@ BENCHMARK_CASE(casing) __test_to_unicode_lowercase(); __test_to_unicode_uppercase(); __test_to_unicode_titlecase(); - __test_to_unicode_casefold(); } } -TEST_CASE(to_unicode_lowercase_unconditional_special_casing) -{ - // LATIN SMALL LETTER SHARP S - auto result = MUST(Unicode::to_unicode_lowercase_full("\u00DF"sv)); - EXPECT_EQ(result, "\u00DF"); - - // LATIN CAPITAL LETTER I WITH DOT ABOVE - result = MUST(Unicode::to_unicode_lowercase_full("\u0130"sv)); - EXPECT_EQ(result, "\u0069\u0307"); - - // LATIN SMALL LIGATURE FF - result = MUST(Unicode::to_unicode_lowercase_full("\uFB00"sv)); - EXPECT_EQ(result, "\uFB00"); - - // LATIN SMALL LIGATURE FI - result = MUST(Unicode::to_unicode_lowercase_full("\uFB01"sv)); - EXPECT_EQ(result, "\uFB01"); - - // LATIN SMALL LIGATURE FL - result = MUST(Unicode::to_unicode_lowercase_full("\uFB02"sv)); - EXPECT_EQ(result, "\uFB02"); - - // LATIN SMALL LIGATURE FFI - result = MUST(Unicode::to_unicode_lowercase_full("\uFB03"sv)); - EXPECT_EQ(result, "\uFB03"); - - // LATIN SMALL LIGATURE FFL - result = MUST(Unicode::to_unicode_lowercase_full("\uFB04"sv)); - EXPECT_EQ(result, "\uFB04"); - - // LATIN SMALL LIGATURE LONG S T - result = MUST(Unicode::to_unicode_lowercase_full("\uFB05"sv)); - EXPECT_EQ(result, "\uFB05"); - - // LATIN SMALL LIGATURE ST - result = MUST(Unicode::to_unicode_lowercase_full("\uFB06"sv)); - EXPECT_EQ(result, "\uFB06"); - - // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI - result = MUST(Unicode::to_unicode_lowercase_full("\u1FB7"sv)); - EXPECT_EQ(result, "\u1FB7"); - - // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI - result = MUST(Unicode::to_unicode_lowercase_full("\u1FC7"sv)); - EXPECT_EQ(result, "\u1FC7"); - - // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI - result = MUST(Unicode::to_unicode_lowercase_full("\u1FF7"sv)); - EXPECT_EQ(result, "\u1FF7"); -} - -TEST_CASE(to_unicode_lowercase_special_casing_sigma) -{ - auto result = MUST(Unicode::to_unicode_lowercase_full("ABCI"sv)); - EXPECT_EQ(result, "abci"); - - // Sigma preceded by A - result = MUST(Unicode::to_unicode_lowercase_full("A\u03A3"sv)); - EXPECT_EQ(result, "a\u03C2"); - - // Sigma preceded by FEMININE ORDINAL INDICATOR - result = MUST(Unicode::to_unicode_lowercase_full("\u00AA\u03A3"sv)); - EXPECT_EQ(result, "\u00AA\u03C2"); - - // Sigma preceded by ROMAN NUMERAL ONE - result = MUST(Unicode::to_unicode_lowercase_full("\u2160\u03A3"sv)); - EXPECT_EQ(result, "\u2170\u03C2"); - - // Sigma preceded by COMBINING GREEK YPOGEGRAMMENI - result = MUST(Unicode::to_unicode_lowercase_full("\u0345\u03A3"sv)); - EXPECT_EQ(result, "\u0345\u03C3"); - - // Sigma preceded by A and FULL STOP - result = MUST(Unicode::to_unicode_lowercase_full("A.\u03A3"sv)); - EXPECT_EQ(result, "a.\u03C2"); - - // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR - result = MUST(Unicode::to_unicode_lowercase_full("A\u180E\u03A3"sv)); - EXPECT_EQ(result, "a\u180E\u03C2"); - - // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by B - result = MUST(Unicode::to_unicode_lowercase_full("A\u180E\u03A3B"sv)); - EXPECT_EQ(result, "a\u180E\u03C3b"); - - // Sigma followed by A - result = MUST(Unicode::to_unicode_lowercase_full("\u03A3A"sv)); - EXPECT_EQ(result, "\u03C3a"); - - // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR - result = MUST(Unicode::to_unicode_lowercase_full("A\u03A3\u180E"sv)); - EXPECT_EQ(result, "a\u03C2\u180E"); - - // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR and B - result = MUST(Unicode::to_unicode_lowercase_full("A\u03A3\u180EB"sv)); - EXPECT_EQ(result, "a\u03C3\u180Eb"); - - // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR - result = MUST(Unicode::to_unicode_lowercase_full("A\u180E\u03A3\u180E"sv)); - EXPECT_EQ(result, "a\u180E\u03C2\u180E"); - - // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR and B - result = MUST(Unicode::to_unicode_lowercase_full("A\u180E\u03A3\u180EB"sv)); - EXPECT_EQ(result, "a\u180E\u03C3\u180Eb"); -} - -TEST_CASE(to_unicode_lowercase_special_casing_i) -{ - // LATIN CAPITAL LETTER I - auto result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "en"sv)); - EXPECT_EQ(result, "i"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "az"sv)); - EXPECT_EQ(result, "\u0131"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "tr"sv)); - EXPECT_EQ(result, "\u0131"sv); - - // LATIN CAPITAL LETTER I WITH DOT ABOVE - result = MUST(Unicode::to_unicode_lowercase_full("\u0130"sv, "en"sv)); - EXPECT_EQ(result, "\u0069\u0307"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("\u0130"sv, "az"sv)); - EXPECT_EQ(result, "i"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("\u0130"sv, "tr"sv)); - EXPECT_EQ(result, "i"sv); - - // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE - result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "en"sv)); - EXPECT_EQ(result, "i\u0307"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "az"sv)); - EXPECT_EQ(result, "i"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "tr"sv)); - EXPECT_EQ(result, "i"sv); - - // LATIN CAPITAL LETTER I followed by combining class 0 and COMBINING DOT ABOVE - result = MUST(Unicode::to_unicode_lowercase_full("IA\u0307"sv, "en"sv)); - EXPECT_EQ(result, "ia\u0307"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("IA\u0307"sv, "az"sv)); - EXPECT_EQ(result, "\u0131a\u0307"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("IA\u0307"sv, "tr"sv)); - EXPECT_EQ(result, "\u0131a\u0307"sv); -} - -TEST_CASE(to_unicode_lowercase_special_casing_more_above) -{ - // LATIN CAPITAL LETTER I - auto result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "en"sv)); - EXPECT_EQ(result, "i"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "lt"sv)); - EXPECT_EQ(result, "i"sv); - - // LATIN CAPITAL LETTER J - result = MUST(Unicode::to_unicode_lowercase_full("J"sv, "en"sv)); - EXPECT_EQ(result, "j"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("J"sv, "lt"sv)); - EXPECT_EQ(result, "j"sv); - - // LATIN CAPITAL LETTER I WITH OGONEK - result = MUST(Unicode::to_unicode_lowercase_full("\u012e"sv, "en"sv)); - EXPECT_EQ(result, "\u012f"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("\u012e"sv, "lt"sv)); - EXPECT_EQ(result, "\u012f"sv); - - // LATIN CAPITAL LETTER I followed by COMBINING GRAVE ACCENT - result = MUST(Unicode::to_unicode_lowercase_full("I\u0300"sv, "en"sv)); - EXPECT_EQ(result, "i\u0300"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("I\u0300"sv, "lt"sv)); - EXPECT_EQ(result, "i\u0307\u0300"sv); - - // LATIN CAPITAL LETTER J followed by COMBINING GRAVE ACCENT - result = MUST(Unicode::to_unicode_lowercase_full("J\u0300"sv, "en"sv)); - EXPECT_EQ(result, "j\u0300"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("J\u0300"sv, "lt"sv)); - EXPECT_EQ(result, "j\u0307\u0300"sv); - - // LATIN CAPITAL LETTER I WITH OGONEK followed by COMBINING GRAVE ACCENT - result = MUST(Unicode::to_unicode_lowercase_full("\u012e\u0300"sv, "en"sv)); - EXPECT_EQ(result, "\u012f\u0300"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("\u012e\u0300"sv, "lt"sv)); - EXPECT_EQ(result, "\u012f\u0307\u0300"sv); -} - -TEST_CASE(to_unicode_lowercase_special_casing_not_before_dot) -{ - // LATIN CAPITAL LETTER I - auto result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "en"sv)); - EXPECT_EQ(result, "i"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "az"sv)); - EXPECT_EQ(result, "\u0131"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "tr"sv)); - EXPECT_EQ(result, "\u0131"sv); - - // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE - result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "en"sv)); - EXPECT_EQ(result, "i\u0307"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "az"sv)); - EXPECT_EQ(result, "i"sv); - - result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "tr"sv)); - EXPECT_EQ(result, "i"sv); -} - -TEST_CASE(to_unicode_uppercase_unconditional_special_casing) -{ - // LATIN SMALL LETTER SHARP S - auto result = MUST(Unicode::to_unicode_uppercase_full("\u00DF"sv)); - EXPECT_EQ(result, "\u0053\u0053"); - - // LATIN CAPITAL LETTER I WITH DOT ABOVE - result = MUST(Unicode::to_unicode_uppercase_full("\u0130"sv)); - EXPECT_EQ(result, "\u0130"); - - // LATIN SMALL LIGATURE FF - result = MUST(Unicode::to_unicode_uppercase_full("\uFB00"sv)); - EXPECT_EQ(result, "\u0046\u0046"); - - // LATIN SMALL LIGATURE FI - result = MUST(Unicode::to_unicode_uppercase_full("\uFB01"sv)); - EXPECT_EQ(result, "\u0046\u0049"); - - // LATIN SMALL LIGATURE FL - result = MUST(Unicode::to_unicode_uppercase_full("\uFB02"sv)); - EXPECT_EQ(result, "\u0046\u004C"); - - // LATIN SMALL LIGATURE FFI - result = MUST(Unicode::to_unicode_uppercase_full("\uFB03"sv)); - EXPECT_EQ(result, "\u0046\u0046\u0049"); - - // LATIN SMALL LIGATURE FFL - result = MUST(Unicode::to_unicode_uppercase_full("\uFB04"sv)); - EXPECT_EQ(result, "\u0046\u0046\u004C"); - - // LATIN SMALL LIGATURE LONG S T - result = MUST(Unicode::to_unicode_uppercase_full("\uFB05"sv)); - EXPECT_EQ(result, "\u0053\u0054"); - - // LATIN SMALL LIGATURE ST - result = MUST(Unicode::to_unicode_uppercase_full("\uFB06"sv)); - EXPECT_EQ(result, "\u0053\u0054"); - - // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS - result = MUST(Unicode::to_unicode_uppercase_full("\u0390"sv)); - EXPECT_EQ(result, "\u0399\u0308\u0301"); - - // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS - result = MUST(Unicode::to_unicode_uppercase_full("\u03B0"sv)); - EXPECT_EQ(result, "\u03A5\u0308\u0301"); - - // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI - result = MUST(Unicode::to_unicode_uppercase_full("\u1FB7"sv)); - EXPECT_EQ(result, "\u0391\u0342\u0399"); - - // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI - result = MUST(Unicode::to_unicode_uppercase_full("\u1FC7"sv)); - EXPECT_EQ(result, "\u0397\u0342\u0399"); - - // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI - result = MUST(Unicode::to_unicode_uppercase_full("\u1FF7"sv)); - EXPECT_EQ(result, "\u03A9\u0342\u0399"); -} - -TEST_CASE(to_unicode_uppercase_special_casing_soft_dotted) -{ - // LATIN SMALL LETTER I - auto result = MUST(Unicode::to_unicode_uppercase_full("i"sv, "en"sv)); - EXPECT_EQ(result, "I"sv); - - result = MUST(Unicode::to_unicode_uppercase_full("i"sv, "lt"sv)); - EXPECT_EQ(result, "I"sv); - - // LATIN SMALL LETTER J - result = MUST(Unicode::to_unicode_uppercase_full("j"sv, "en"sv)); - EXPECT_EQ(result, "J"sv); - - result = MUST(Unicode::to_unicode_uppercase_full("j"sv, "lt"sv)); - EXPECT_EQ(result, "J"sv); - - // LATIN SMALL LETTER I followed by COMBINING DOT ABOVE - result = MUST(Unicode::to_unicode_uppercase_full("i\u0307"sv, "en"sv)); - EXPECT_EQ(result, "I\u0307"sv); - - result = MUST(Unicode::to_unicode_uppercase_full("i\u0307"sv, "lt"sv)); - EXPECT_EQ(result, "I"sv); - - // LATIN SMALL LETTER J followed by COMBINING DOT ABOVE - result = MUST(Unicode::to_unicode_uppercase_full("j\u0307"sv, "en"sv)); - EXPECT_EQ(result, "J\u0307"sv); - - result = MUST(Unicode::to_unicode_uppercase_full("j\u0307"sv, "lt"sv)); - EXPECT_EQ(result, "J"sv); -} - -TEST_CASE(to_unicode_titlecase_unconditional_special_casing) -{ - // LATIN SMALL LETTER SHARP S - auto result = MUST(Unicode::to_unicode_titlecase_full("\u00DF"sv)); - EXPECT_EQ(result, "\u0053\u0073"sv); - - // LATIN CAPITAL LETTER I WITH DOT ABOVE - result = MUST(Unicode::to_unicode_titlecase_full("\u0130"sv)); - EXPECT_EQ(result, "\u0130"sv); - - // LATIN SMALL LIGATURE FF - result = MUST(Unicode::to_unicode_titlecase_full("\uFB00"sv)); - EXPECT_EQ(result, "\u0046\u0066"sv); - - // LATIN SMALL LIGATURE FI - result = MUST(Unicode::to_unicode_titlecase_full("\uFB01"sv)); - EXPECT_EQ(result, "\u0046\u0069"sv); - - // LATIN SMALL LIGATURE FL - result = MUST(Unicode::to_unicode_titlecase_full("\uFB02"sv)); - EXPECT_EQ(result, "\u0046\u006C"sv); - - // LATIN SMALL LIGATURE FFI - result = MUST(Unicode::to_unicode_titlecase_full("\uFB03"sv)); - EXPECT_EQ(result, "\u0046\u0066\u0069"sv); - - // LATIN SMALL LIGATURE FFL - result = MUST(Unicode::to_unicode_titlecase_full("\uFB04"sv)); - EXPECT_EQ(result, "\u0046\u0066\u006C"sv); - - // LATIN SMALL LIGATURE LONG S T - result = MUST(Unicode::to_unicode_titlecase_full("\uFB05"sv)); - EXPECT_EQ(result, "\u0053\u0074"sv); - - // LATIN SMALL LIGATURE ST - result = MUST(Unicode::to_unicode_titlecase_full("\uFB06"sv)); - EXPECT_EQ(result, "\u0053\u0074"sv); - - // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS - result = MUST(Unicode::to_unicode_titlecase_full("\u0390"sv)); - EXPECT_EQ(result, "\u0399\u0308\u0301"sv); - - // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS - result = MUST(Unicode::to_unicode_titlecase_full("\u03B0"sv)); - EXPECT_EQ(result, "\u03A5\u0308\u0301"sv); - - // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI - result = MUST(Unicode::to_unicode_titlecase_full("\u1FB7"sv)); - EXPECT_EQ(result, "\u0391\u0342\u0345"sv); - - // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI - result = MUST(Unicode::to_unicode_titlecase_full("\u1FC7"sv)); - EXPECT_EQ(result, "\u0397\u0342\u0345"sv); - - // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI - result = MUST(Unicode::to_unicode_titlecase_full("\u1FF7"sv)); - EXPECT_EQ(result, "\u03A9\u0342\u0345"sv); -} - -TEST_CASE(to_unicode_titlecase_special_casing_i) -{ - // LATIN SMALL LETTER I - auto result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "en"sv)); - EXPECT_EQ(result, "I"sv); - - result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "az"sv)); - EXPECT_EQ(result, "\u0130"sv); - - result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "tr"sv)); - EXPECT_EQ(result, "\u0130"sv); -} - TEST_CASE(general_category) { auto general_category = [](StringView name) { diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index faecd51be85..416243d21a1 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -41,34 +41,6 @@ u32 __attribute__((weak)) to_unicode_titlecase(u32 code_point) return to_ascii_uppercase(code_point); } -ErrorOr to_unicode_lowercase_full(StringView string, Optional const& locale) -{ - StringBuilder builder; - TRY(Detail::build_lowercase_string(Utf8View { string }, builder, locale)); - return builder.to_deprecated_string(); -} - -ErrorOr to_unicode_uppercase_full(StringView string, Optional const& locale) -{ - StringBuilder builder; - TRY(Detail::build_uppercase_string(Utf8View { string }, builder, locale)); - return builder.to_deprecated_string(); -} - -ErrorOr to_unicode_titlecase_full(StringView string, Optional const& locale, TrailingCodePointTransformation trailing_code_point_transformation) -{ - StringBuilder builder; - TRY(Detail::build_titlecase_string(Utf8View { string }, builder, locale, trailing_code_point_transformation)); - return builder.to_string(); -} - -ErrorOr to_unicode_casefold_full(StringView string) -{ - StringBuilder builder; - TRY(Detail::build_casefold_string(Utf8View { string }, builder)); - return builder.to_string(); -} - template class CasefoldStringComparator { public: diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h index 70af064ef68..23c48cc9aef 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.h +++ b/Userland/Libraries/LibUnicode/CharacterTypes.h @@ -48,11 +48,6 @@ u32 to_unicode_lowercase(u32 code_point); u32 to_unicode_uppercase(u32 code_point); u32 to_unicode_titlecase(u32 code_point); -ErrorOr to_unicode_lowercase_full(StringView, Optional const& locale = {}); -ErrorOr to_unicode_uppercase_full(StringView, Optional const& locale = {}); -ErrorOr to_unicode_titlecase_full(StringView, Optional const& locale = {}, TrailingCodePointTransformation trailing_code_point_transformation = TrailingCodePointTransformation::Lowercase); -ErrorOr to_unicode_casefold_full(StringView); - template bool equals_ignoring_case(ViewType, ViewType);