diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index 903e28ccee9..62494de8f0b 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -885,8 +885,8 @@ struct CodePointDecomposition { Optional locale_from_string(StringView locale); -ReadonlySpan special_case_mapping(u32 code_point); -ReadonlySpan case_folding_mapping(u32 code_point); +ReadonlySpan special_case_mapping(u32 code_point); +ReadonlySpan case_folding_mapping(u32 code_point); } )~~~"); @@ -982,20 +982,16 @@ static constexpr Array s_case_folding { {)~~~" generator.append(R"~~~( } }; -struct CodePointMapping { - u32 code_point { 0 }; - u32 mapping { 0 }; -}; +struct CasingTable { + u8 canonical_combining_class { 0 }; + i32 simple_uppercase_mapping { -1 }; + i32 simple_lowercase_mapping { -1 }; + i32 simple_titlecase_mapping { -1 }; -struct SpecialCaseMapping { - u32 code_point { 0 }; - Array special_casing {}; + u32 special_casing_start_index { 0 }; u32 special_casing_size { 0 }; -}; -struct CaseFoldingMapping { - u32 code_point { 0 }; - Array case_folding {}; + u32 case_folding_start_index { 0 }; u32 case_folding_size { 0 }; }; @@ -1094,20 +1090,48 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { { )~~~"); }; - append_code_point_mappings("combining_class"sv, "CodePointMapping"sv, unicode_data.code_points_with_non_zero_combining_class, - [](auto const& data) -> Optional { - if (data.casing.canonical_combining_class == 0) - return {}; - return data.casing.canonical_combining_class; - }); - append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.casing.simple_uppercase_mapping; }); - append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.casing.simple_lowercase_mapping; }); - append_code_point_mappings("titlecase"sv, "CodePointMapping"sv, unicode_data.simple_titlecase_mapping_size, [](auto const& data) { return data.casing.simple_titlecase_mapping; }); - append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.casing.special_casing_indices; }); - append_code_point_mappings("case_folding"sv, "CaseFoldingMapping"sv, unicode_data.code_points_with_case_folding, [](auto const& data) { return data.casing.case_folding_indices; }); append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; }); append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping, [](auto const& data) { return data.decomposition_mapping; }); + auto append_casing_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr { + TRY(generator.set("name", TRY(String::formatted("{}_unique_properties", collection_snake)))); + TRY(generator.set("size", TRY(String::number(unique_properties.size())))); + + auto optional_code_point_to_string = [](auto const& code_point) -> ErrorOr { + if (!code_point.has_value()) + return "-1"_short_string; + return String::number(*code_point); + }; + auto first_index_to_string = [](auto const& list) -> ErrorOr { + if (list.is_empty()) + return "0"_short_string; + return String::number(list.first()); + }; + + generator.append(R"~~~( +static constexpr Array @name@ { {)~~~"); + + for (auto const& casing : unique_properties) { + TRY(generator.set("canonical_combining_class", TRY(String::number(casing.canonical_combining_class)))); + TRY(generator.set("simple_uppercase_mapping", TRY(optional_code_point_to_string(casing.simple_uppercase_mapping)))); + TRY(generator.set("simple_lowercase_mapping", TRY(optional_code_point_to_string(casing.simple_lowercase_mapping)))); + TRY(generator.set("simple_titlecase_mapping", TRY(optional_code_point_to_string(casing.simple_titlecase_mapping)))); + TRY(generator.set("special_casing_start_index", TRY(first_index_to_string(casing.special_casing_indices)))); + TRY(generator.set("special_casing_size", TRY(String::number(casing.special_casing_indices.size())))); + TRY(generator.set("case_folding_start_index", TRY(first_index_to_string(casing.case_folding_indices)))); + TRY(generator.set("case_folding_size", TRY(String::number(casing.case_folding_indices.size())))); + + generator.append(R"~~~( + { @canonical_combining_class@, @simple_uppercase_mapping@, @simple_lowercase_mapping@, @simple_titlecase_mapping@, @special_casing_start_index@, @special_casing_size@, @case_folding_start_index@, @case_folding_size@ },)~~~"); + } + + generator.append(R"~~~( +} }; +)~~~"); + + return {}; + }; + auto append_property_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr { TRY(generator.set("name", TRY(String::formatted("{}_unique_properties", collection_snake)))); TRY(generator.set("outer_size", TRY(String::number(unique_properties.size())))); @@ -1174,6 +1198,7 @@ static constexpr Array<@type@, @size@> @name@ { { return {}; }; + TRY(append_code_point_tables("s_casings"sv, unicode_data.casing_tables, append_casing_table)); TRY(append_code_point_tables("s_general_categories"sv, unicode_data.general_category_tables, append_property_table)); TRY(append_code_point_tables("s_properties"sv, unicode_data.property_tables, append_property_table)); TRY(append_code_point_tables("s_scripts"sv, unicode_data.script_tables, append_property_table)); @@ -1253,43 +1278,63 @@ Optional code_point_display_name(u32 code_point) return {}; } + +static CasingTable const& casing_table_for_code_point(u32 code_point) +{ + auto stage1_index = code_point >> @CODE_POINT_TABLES_LSB_COUNT@; + auto stage2_index = s_casings_stage1[stage1_index] + (code_point & @CODE_POINT_TABLES_LSB_MASK@); + auto unique_properties_index = s_casings_stage2[stage2_index]; + + return s_casings_unique_properties[unique_properties_index]; +} )~~~"); - auto append_code_point_mapping_search = [&](StringView method, StringView mappings, StringView fallback) { + auto append_code_point_mapping_search = [&](StringView method, StringView mapping, Optional const& fallback = {}) { generator.set("method", method); - generator.set("mappings", mappings); - generator.set("fallback", fallback); + generator.set("mapping", mapping); generator.append(R"~~~( u32 @method@(u32 code_point) { - auto const* mapping = binary_search(@mappings@, code_point, nullptr, CodePointComparator {}); - return mapping ? mapping->mapping : @fallback@; + auto const& casing_table = casing_table_for_code_point(code_point); + auto mapping = casing_table.@mapping@; +)~~~"); + + if (fallback.has_value()) { + generator.set("fallback", *fallback); + generator.append(R"~~~( + return mapping == -1 ? @fallback@ : static_cast(mapping);)~~~"); + } else { + generator.append(R"~~~( + return mapping;)~~~"); + } + + generator.append(R"~~~( } )~~~"); }; - append_code_point_mapping_search("canonical_combining_class"sv, "s_combining_class_mappings"sv, "0"sv); - append_code_point_mapping_search("to_unicode_uppercase"sv, "s_uppercase_mappings"sv, "code_point"sv); - append_code_point_mapping_search("to_unicode_lowercase"sv, "s_lowercase_mappings"sv, "code_point"sv); - append_code_point_mapping_search("to_unicode_titlecase"sv, "s_titlecase_mappings"sv, "code_point"sv); + append_code_point_mapping_search("canonical_combining_class"sv, "canonical_combining_class"sv); + append_code_point_mapping_search("to_unicode_uppercase"sv, "simple_uppercase_mapping"sv, "code_point"sv); + append_code_point_mapping_search("to_unicode_lowercase"sv, "simple_lowercase_mapping"sv, "code_point"sv); + append_code_point_mapping_search("to_unicode_titlecase"sv, "simple_titlecase_mapping"sv, "code_point"sv); generator.append(R"~~~( -ReadonlySpan special_case_mapping(u32 code_point) +ReadonlySpan special_case_mapping(u32 code_point) { - auto const* mapping = binary_search(s_special_case_mappings, code_point, nullptr, CodePointComparator {}); - if (mapping == nullptr) + auto const& casing_table = casing_table_for_code_point(code_point); + if (casing_table.special_casing_size == 0) return {}; - return mapping->special_casing.span().slice(0, mapping->special_casing_size); + return s_special_case.span().slice(casing_table.special_casing_start_index, casing_table.special_casing_size); } -ReadonlySpan case_folding_mapping(u32 code_point) +ReadonlySpan case_folding_mapping(u32 code_point) { - auto const* mapping = binary_search(s_case_folding_mappings, code_point, nullptr, CodePointComparator {}); - if (mapping == nullptr) + auto const& casing_table = casing_table_for_code_point(code_point); + if (casing_table.case_folding_size == 0) return {}; - return mapping->case_folding.span().slice(0, mapping->case_folding_size); + return s_case_folding.span().slice(casing_table.case_folding_start_index, casing_table.case_folding_size); } Optional code_point_abbreviation(u32 code_point) @@ -1513,6 +1558,22 @@ static ErrorOr normalize_script_extensions(PropList& script_extensions, Pr return {}; } +struct CasingMetadata { + using ConstIterator = typename Vector::ConstIterator; + + CasingMetadata(Vector const& code_point_data) + : iterator(code_point_data.begin()) + , end(code_point_data.end()) + { + } + + ConstIterator iterator; + ConstIterator const end; + + Vector current_block; + HashMap unique_blocks; +}; + struct PropertyMetadata { static ErrorOr create(PropList& property_list) { @@ -1608,6 +1669,25 @@ static ErrorOr create_code_point_tables(UnicodeData& unicode_data) return {}; }; + auto update_casing_tables = [&](auto code_point, auto& tables, auto& metadata) -> ErrorOr { + CasingTable casing {}; + + while (metadata.iterator != metadata.end) { + if (code_point < metadata.iterator->code_point) + break; + + if (code_point == metadata.iterator->code_point) { + casing = move(metadata.iterator->casing); + break; + } + + ++metadata.iterator; + } + + TRY(update_tables(code_point, tables, metadata, casing)); + return {}; + }; + auto update_property_tables = [&](auto code_point, auto& tables, auto& metadata) -> ErrorOr { static Unicode::CodePointRangeComparator comparator {}; @@ -1634,6 +1714,7 @@ static ErrorOr create_code_point_tables(UnicodeData& unicode_data) return {}; }; + CasingMetadata casing_metadata { unicode_data.code_point_data }; auto general_category_metadata = TRY(PropertyMetadata::create(unicode_data.general_categories)); auto property_metadata = TRY(PropertyMetadata::create(unicode_data.prop_list)); auto script_metadata = TRY(PropertyMetadata::create(unicode_data.script_list)); @@ -1643,6 +1724,7 @@ static ErrorOr create_code_point_tables(UnicodeData& unicode_data) auto sentence_break_metadata = TRY(PropertyMetadata::create(unicode_data.sentence_break_props)); for (u32 code_point = 0; code_point <= MAX_CODE_POINT; ++code_point) { + TRY(update_casing_tables(code_point, unicode_data.casing_tables, casing_metadata)); TRY(update_property_tables(code_point, unicode_data.general_category_tables, general_category_metadata)); TRY(update_property_tables(code_point, unicode_data.property_tables, property_metadata)); TRY(update_property_tables(code_point, unicode_data.script_tables, script_metadata)); diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp index 64a9ea20252..822e9c83899 100644 --- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp +++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp @@ -124,6 +124,16 @@ TEST_CASE(to_unicode_casefold) EXPECT_EQ(result, "\u03B1\u0342\u03B9"sv); } +BENCHMARK_CASE(casing) +{ + for (size_t i = 0; i < 50'000; ++i) { + __test_to_unicode_lowercase(); + __test_to_unicode_uppercase(); + __test_to_unicode_titlecase(); + __test_to_unicode_casefold(); + } +} + TEST_CASE(to_unicode_lowercase_unconditional_special_casing) { // LATIN SMALL LETTER SHARP S diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp index 7c5ed2393ae..7fe764252d6 100644 --- a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp +++ b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp @@ -147,7 +147,7 @@ static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t in return false; } -static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View const& string, Optional locale, size_t index, size_t byte_length) +static Optional find_matching_special_case(u32 code_point, Utf8View const& string, Optional locale, size_t index, size_t byte_length) { auto requested_locale = Locale::None; @@ -158,11 +158,11 @@ static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View auto special_casings = special_case_mapping(code_point); - for (auto const* special_casing : special_casings) { - if (special_casing->locale != Locale::None && special_casing->locale != requested_locale) + for (auto const& special_casing : special_casings) { + if (special_casing.locale != Locale::None && special_casing.locale != requested_locale) continue; - switch (special_casing->condition) { + switch (special_casing.condition) { case Condition::None: return special_casing; @@ -193,20 +193,20 @@ static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View } } - return nullptr; + return {}; } template -static CaseFolding const* find_matching_case_folding(u32 code_point) +static Optional find_matching_case_folding(u32 code_point) { auto case_foldings = case_folding_mapping(code_point); - for (auto const* case_folding : case_foldings) { - if (((case_folding->status == StatusFilter) || ...)) + for (auto const& case_folding : case_foldings) { + if (((case_folding.status == StatusFilter) || ...)) return case_folding; } - return nullptr; + return {}; } #endif @@ -222,8 +222,8 @@ ErrorOr build_lowercase_string([[maybe_unused]] Utf8View code_points, [[ma u32 code_point = *it; byte_length = it.underlying_code_point_length_in_bytes(); - auto const* special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length); - if (!special_casing) { + auto special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length); + if (!special_casing.has_value()) { TRY(builder.try_append_code_point(to_unicode_lowercase(code_point))); continue; } @@ -249,8 +249,8 @@ ErrorOr build_uppercase_string([[maybe_unused]] Utf8View code_points, [[ma u32 code_point = *it; byte_length = it.underlying_code_point_length_in_bytes(); - auto const* special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length); - if (!special_casing) { + auto special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length); + if (!special_casing.has_value()) { TRY(builder.try_append_code_point(to_unicode_uppercase(code_point))); continue; } @@ -287,8 +287,8 @@ ErrorOr build_titlecase_string([[maybe_unused]] Utf8View code_points, [[ma }; auto append_code_point_as_titlecase = [&](auto code_point, auto code_point_offset, auto code_point_length) -> ErrorOr { - auto const* special_casing = find_matching_special_case(code_point, code_points, locale, code_point_offset, code_point_length); - if (!special_casing) { + auto special_casing = find_matching_special_case(code_point, code_points, locale, code_point_offset, code_point_length); + if (!special_casing.has_value()) { TRY(builder.try_append_code_point(to_unicode_titlecase(code_point))); return {}; } @@ -350,7 +350,7 @@ Utf32View casefold_code_point(u32 const& code_point) // CaseFolding.txt in the Unicode Character Database. using enum CaseFoldingStatus; - if (auto const* case_folding = find_matching_case_folding(code_point)) + if (auto case_folding = find_matching_case_folding(code_point); case_folding.has_value()) return Utf32View { case_folding->mapping, case_folding->mapping_size }; #endif