LibUnicode: Generate simple case folding mappings for titlecase

Note that we already generate the special case foldings for titlecase.
This commit is contained in:
Timothy Flynn 2023-01-16 10:33:15 -05:00 committed by Tim Flynn
parent 6d710eeb43
commit b562348d31
Notes: sideshowbarker 2024-07-17 06:39:26 +09:00
4 changed files with 36 additions and 0 deletions

View File

@ -113,6 +113,7 @@ struct UnicodeData {
u32 simple_uppercase_mapping_size { 0 };
u32 simple_lowercase_mapping_size { 0 };
u32 simple_titlecase_mapping_size { 0 };
Vector<SpecialCasing> special_casing;
u32 code_points_with_special_casing { 0 };
@ -674,6 +675,7 @@ static ErrorOr<void> parse_unicode_data(Core::Stream::BufferedFile& file, Unicod
unicode_data.code_points_with_non_zero_combining_class += data.canonical_combining_class != 0;
unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value();
unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value();
unicode_data.simple_titlecase_mapping_size += data.simple_titlecase_mapping.has_value();
unicode_data.code_points_with_decomposition_mapping += data.decomposition_mapping.has_value();
unicode_data.code_points_with_special_casing += has_special_casing;
@ -978,6 +980,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
});
append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; });
append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; });
append_code_point_mappings("titlecase"sv, "CodePointMapping"sv, unicode_data.simple_titlecase_mapping_size, [](auto const& data) { return data.simple_titlecase_mapping; });
append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; });
append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; });
@ -1138,6 +1141,7 @@ u32 @method@(u32 code_point)
append_code_point_mapping_search("canonical_combining_class"sv, "s_combining_class_mappings"sv, "0"sv);
append_code_point_mapping_search("to_unicode_uppercase"sv, "s_uppercase_mappings"sv, "code_point"sv);
append_code_point_mapping_search("to_unicode_lowercase"sv, "s_lowercase_mappings"sv, "code_point"sv);
append_code_point_mapping_search("to_unicode_titlecase"sv, "s_titlecase_mappings"sv, "code_point"sv);
generator.append(R"~~~(
Span<SpecialCasing const* const> special_case_mapping(u32 code_point)

View File

@ -48,6 +48,32 @@ TEST_CASE(to_unicode_uppercase)
EXPECT_EQ(Unicode::to_unicode_uppercase(0x3401u), 0x3401u);
EXPECT_EQ(Unicode::to_unicode_uppercase(0x3402u), 0x3402u);
EXPECT_EQ(Unicode::to_unicode_uppercase(0x4dbfu), 0x4dbfu);
// Code points whose uppercase and titlecase mappings actually differ.
EXPECT_EQ(Unicode::to_unicode_uppercase(0x01c6u), 0x01c4u); // "dž" to "DŽ"
EXPECT_EQ(Unicode::to_unicode_uppercase(0x01c9u), 0x01c7u); // "lj" to "LJ"
EXPECT_EQ(Unicode::to_unicode_uppercase(0x01ccu), 0x01cau); // "nj" to "NJ"
EXPECT_EQ(Unicode::to_unicode_uppercase(0x01f3u), 0x01f1u); // "dz" to "DZ"
}
TEST_CASE(to_unicode_titlecase)
{
    // Run the shared ASCII comparison helper, pairing toupper with the titlecase conversion.
    compare_to_ascii(toupper, Unicode::to_unicode_titlecase);

    // One row per expectation: the input code point and the titlecase code point it must map to.
    struct TitlecaseExpectation {
        u32 code_point;
        u32 expected;
    };

    static constexpr TitlecaseExpectation expectations[] = {
        { 0x03c9u, 0x03a9u }, // "ω" to "Ω"
        { 0x03a9u, 0x03a9u }, // "Ω" to "Ω"

        // Code points encoded by ranges in UnicodeData.txt
        { 0x3400u, 0x3400u },
        { 0x3401u, 0x3401u },
        { 0x3402u, 0x3402u },
        { 0x4dbfu, 0x4dbfu },

        // Code points whose uppercase and titlecase mappings actually differ.
        { 0x01c6u, 0x01c5u }, // "dž" to "Dž"
        { 0x01c9u, 0x01c8u }, // "lj" to "Lj"
        { 0x01ccu, 0x01cbu }, // "nj" to "Nj"
        { 0x01f3u, 0x01f2u }, // "dz" to "Dz"
    };

    // Rows are checked in the order listed above.
    for (auto const& expectation : expectations)
        EXPECT_EQ(Unicode::to_unicode_titlecase(expectation.code_point), expectation.expected);
}
TEST_CASE(to_unicode_lowercase_unconditional_special_casing)

View File

@ -38,6 +38,11 @@ u32 __attribute__((weak)) to_unicode_uppercase(u32 code_point)
return to_ascii_uppercase(code_point);
}
// Weak fallback for simple titlecase conversion of a single code point: it simply delegates to
// to_ascii_uppercase(). Because the symbol is weak, a non-weak definition linked into the same
// binary takes precedence over this one.
// NOTE(review): presumably the generated Unicode data source provides that stronger definition - confirm.
u32 __attribute__((weak)) to_unicode_titlecase(u32 code_point)
{
    auto const ascii_titlecase = to_ascii_uppercase(code_point);
    return ascii_titlecase;
}
ErrorOr<DeprecatedString> to_unicode_lowercase_full(StringView string, Optional<StringView> const& locale)
{
StringBuilder builder;

View File

@ -38,6 +38,7 @@ u32 canonical_combining_class(u32 code_point);
// Use the full-string transformations for full case folding.
u32 to_unicode_lowercase(u32 code_point);
u32 to_unicode_uppercase(u32 code_point);
u32 to_unicode_titlecase(u32 code_point);
ErrorOr<DeprecatedString> to_unicode_lowercase_full(StringView, Optional<StringView> const& locale = {});
ErrorOr<DeprecatedString> to_unicode_uppercase_full(StringView, Optional<StringView> const& locale = {});