From 456211932fa27b25c62339e4fe79f4d0fa3bfd16 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 26 Jul 2023 12:54:05 -0400 Subject: [PATCH] LibUnicode: Perform code point case conversion lookups in constant time Similar to commit 0652cc4, we now generate 2-stage lookup tables for case conversion information. Only about 1500 code points are actually cased. This means that case information is rather highly compressible, as the blocks we break the code points into will generally all have no casing information at all. In total, this change: * Does not change the size of libunicode.so (which is nice because, generally, the 2-stage lookup tables are expected to trade a bit of size for performance). * Reduces the runtime of the new benchmark test case added here from 1.383s to 1.127s (about an 18.5% improvement). --- .../LibUnicode/GenerateUnicodeData.cpp | 164 +++++++++++++----- .../LibUnicode/TestUnicodeCharacterTypes.cpp | 10 ++ .../Libraries/LibUnicode/UnicodeUtils.cpp | 32 ++-- 3 files changed, 149 insertions(+), 57 deletions(-) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index 903e28ccee9..62494de8f0b 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -885,8 +885,8 @@ struct CodePointDecomposition { Optional locale_from_string(StringView locale); -ReadonlySpan special_case_mapping(u32 code_point); -ReadonlySpan case_folding_mapping(u32 code_point); +ReadonlySpan special_case_mapping(u32 code_point); +ReadonlySpan case_folding_mapping(u32 code_point); } )~~~"); @@ -982,20 +982,16 @@ static constexpr Array s_case_folding { {)~~~" generator.append(R"~~~( } }; -struct CodePointMapping { - u32 code_point { 0 }; - u32 mapping { 0 }; -}; +struct CasingTable { + u8 canonical_combining_class { 0 }; + i32 simple_uppercase_mapping { -1 }; + i32 simple_lowercase_mapping { -1 }; + i32 simple_titlecase_mapping { -1 }; -struct SpecialCaseMapping { - u32 code_point { 0 }; - Array special_casing {}; + u32 special_casing_start_index { 0 }; u32 special_casing_size { 0 }; -}; -struct CaseFoldingMapping { - u32 code_point { 0 }; - Array case_folding {}; + u32 case_folding_start_index { 0 }; u32 case_folding_size { 0 }; }; @@ -1094,20 +1090,48 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { { )~~~"); }; - append_code_point_mappings("combining_class"sv, "CodePointMapping"sv, unicode_data.code_points_with_non_zero_combining_class, - [](auto const& data) -> Optional { - if (data.casing.canonical_combining_class == 0) - return {}; - return data.casing.canonical_combining_class; - }); - append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.casing.simple_uppercase_mapping; }); - append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.casing.simple_lowercase_mapping; }); - append_code_point_mappings("titlecase"sv, "CodePointMapping"sv, unicode_data.simple_titlecase_mapping_size, [](auto const& data) { return data.casing.simple_titlecase_mapping; }); - append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.casing.special_casing_indices; }); - append_code_point_mappings("case_folding"sv, "CaseFoldingMapping"sv, unicode_data.code_points_with_case_folding, [](auto const& data) { return data.casing.case_folding_indices; }); append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; }); append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping, [](auto const& data) { return data.decomposition_mapping; }); + auto append_casing_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr { + TRY(generator.set("name", TRY(String::formatted("{}_unique_properties", collection_snake)))); + TRY(generator.set("size", TRY(String::number(unique_properties.size())))); + + auto optional_code_point_to_string = [](auto const& code_point) -> ErrorOr { + if (!code_point.has_value()) + return "-1"_short_string; + return String::number(*code_point); + }; + auto first_index_to_string = [](auto const& list) -> ErrorOr { + if (list.is_empty()) + return "0"_short_string; + return String::number(list.first()); + }; + + generator.append(R"~~~( +static constexpr Array @name@ { {)~~~"); + + for (auto const& casing : unique_properties) { + TRY(generator.set("canonical_combining_class", TRY(String::number(casing.canonical_combining_class)))); + TRY(generator.set("simple_uppercase_mapping", TRY(optional_code_point_to_string(casing.simple_uppercase_mapping)))); + TRY(generator.set("simple_lowercase_mapping", TRY(optional_code_point_to_string(casing.simple_lowercase_mapping)))); + TRY(generator.set("simple_titlecase_mapping", TRY(optional_code_point_to_string(casing.simple_titlecase_mapping)))); + TRY(generator.set("special_casing_start_index", TRY(first_index_to_string(casing.special_casing_indices)))); + TRY(generator.set("special_casing_size", TRY(String::number(casing.special_casing_indices.size())))); + TRY(generator.set("case_folding_start_index", TRY(first_index_to_string(casing.case_folding_indices)))); + TRY(generator.set("case_folding_size", TRY(String::number(casing.case_folding_indices.size())))); + + generator.append(R"~~~( + { @canonical_combining_class@, @simple_uppercase_mapping@, @simple_lowercase_mapping@, @simple_titlecase_mapping@, @special_casing_start_index@, @special_casing_size@, @case_folding_start_index@, @case_folding_size@ },)~~~"); + } + + generator.append(R"~~~( +} }; +)~~~"); + + return {}; + }; + auto append_property_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr { TRY(generator.set("name", TRY(String::formatted("{}_unique_properties", collection_snake)))); TRY(generator.set("outer_size", TRY(String::number(unique_properties.size())))); @@ -1174,6 +1198,7 @@ static constexpr Array<@type@, @size@> @name@ { { return {}; }; + TRY(append_code_point_tables("s_casings"sv, unicode_data.casing_tables, append_casing_table)); TRY(append_code_point_tables("s_general_categories"sv, unicode_data.general_category_tables, append_property_table)); TRY(append_code_point_tables("s_properties"sv, unicode_data.property_tables, append_property_table)); TRY(append_code_point_tables("s_scripts"sv, unicode_data.script_tables, append_property_table)); @@ -1253,43 +1278,63 @@ Optional code_point_display_name(u32 code_point) return {}; } + +static CasingTable const& casing_table_for_code_point(u32 code_point) +{ + auto stage1_index = code_point >> @CODE_POINT_TABLES_LSB_COUNT@; + auto stage2_index = s_casings_stage1[stage1_index] + (code_point & @CODE_POINT_TABLES_LSB_MASK@); + auto unique_properties_index = s_casings_stage2[stage2_index]; + + return s_casings_unique_properties[unique_properties_index]; +} )~~~"); - auto append_code_point_mapping_search = [&](StringView method, StringView mappings, StringView fallback) { + auto append_code_point_mapping_search = [&](StringView method, StringView mapping, Optional const& fallback = {}) { generator.set("method", method); - generator.set("mappings", mappings); - generator.set("fallback", fallback); + generator.set("mapping", mapping); generator.append(R"~~~( u32 @method@(u32 code_point) { - auto const* mapping = binary_search(@mappings@, code_point, nullptr, CodePointComparator {}); - return mapping ? mapping->mapping : @fallback@; + auto const& casing_table = casing_table_for_code_point(code_point); + auto mapping = casing_table.@mapping@; +)~~~"); + + if (fallback.has_value()) { + generator.set("fallback", *fallback); + generator.append(R"~~~( + return mapping == -1 ? @fallback@ : static_cast(mapping);)~~~"); + } else { + generator.append(R"~~~( + return mapping;)~~~"); + } + + generator.append(R"~~~( } )~~~"); }; - append_code_point_mapping_search("canonical_combining_class"sv, "s_combining_class_mappings"sv, "0"sv); - append_code_point_mapping_search("to_unicode_uppercase"sv, "s_uppercase_mappings"sv, "code_point"sv); - append_code_point_mapping_search("to_unicode_lowercase"sv, "s_lowercase_mappings"sv, "code_point"sv); - append_code_point_mapping_search("to_unicode_titlecase"sv, "s_titlecase_mappings"sv, "code_point"sv); + append_code_point_mapping_search("canonical_combining_class"sv, "canonical_combining_class"sv); + append_code_point_mapping_search("to_unicode_uppercase"sv, "simple_uppercase_mapping"sv, "code_point"sv); + append_code_point_mapping_search("to_unicode_lowercase"sv, "simple_lowercase_mapping"sv, "code_point"sv); + append_code_point_mapping_search("to_unicode_titlecase"sv, "simple_titlecase_mapping"sv, "code_point"sv); generator.append(R"~~~( -ReadonlySpan special_case_mapping(u32 code_point) +ReadonlySpan special_case_mapping(u32 code_point) { - auto const* mapping = binary_search(s_special_case_mappings, code_point, nullptr, CodePointComparator {}); - if (mapping == nullptr) + auto const& casing_table = casing_table_for_code_point(code_point); + if (casing_table.special_casing_size == 0) return {}; - return mapping->special_casing.span().slice(0, mapping->special_casing_size); + return s_special_case.span().slice(casing_table.special_casing_start_index, casing_table.special_casing_size); } -ReadonlySpan case_folding_mapping(u32 code_point) +ReadonlySpan case_folding_mapping(u32 code_point) { - auto const* mapping = binary_search(s_case_folding_mappings, code_point, nullptr, CodePointComparator {}); - if (mapping == nullptr) + auto const& casing_table = casing_table_for_code_point(code_point); + if (casing_table.case_folding_size == 0) return {}; - return mapping->case_folding.span().slice(0, mapping->case_folding_size); + return s_case_folding.span().slice(casing_table.case_folding_start_index, casing_table.case_folding_size); } Optional code_point_abbreviation(u32 code_point) @@ -1513,6 +1558,22 @@ static ErrorOr normalize_script_extensions(PropList& script_extensions, Pr return {}; } +struct CasingMetadata { + using ConstIterator = typename Vector::ConstIterator; + + CasingMetadata(Vector const& code_point_data) + : iterator(code_point_data.begin()) + , end(code_point_data.end()) + { + } + + ConstIterator iterator; + ConstIterator const end; + + Vector current_block; + HashMap unique_blocks; +}; + struct PropertyMetadata { static ErrorOr create(PropList& property_list) { @@ -1608,6 +1669,25 @@ static ErrorOr create_code_point_tables(UnicodeData& unicode_data) return {}; }; + auto update_casing_tables = [&](auto code_point, auto& tables, auto& metadata) -> ErrorOr { + CasingTable casing {}; + + while (metadata.iterator != metadata.end) { + if (code_point < metadata.iterator->code_point) + break; + + if (code_point == metadata.iterator->code_point) { + casing = move(metadata.iterator->casing); + break; + } + + ++metadata.iterator; + } + + TRY(update_tables(code_point, tables, metadata, casing)); + return {}; + }; + auto update_property_tables = [&](auto code_point, auto& tables, auto& metadata) -> ErrorOr { static Unicode::CodePointRangeComparator comparator {}; @@ -1634,6 +1714,7 @@ static ErrorOr create_code_point_tables(UnicodeData& unicode_data) return {}; }; + CasingMetadata casing_metadata { unicode_data.code_point_data }; auto general_category_metadata = TRY(PropertyMetadata::create(unicode_data.general_categories)); auto property_metadata = TRY(PropertyMetadata::create(unicode_data.prop_list)); auto script_metadata = TRY(PropertyMetadata::create(unicode_data.script_list)); @@ -1643,6 +1724,7 @@ static ErrorOr create_code_point_tables(UnicodeData& unicode_data) auto sentence_break_metadata = TRY(PropertyMetadata::create(unicode_data.sentence_break_props)); for (u32 code_point = 0; code_point <= MAX_CODE_POINT; ++code_point) { + TRY(update_casing_tables(code_point, unicode_data.casing_tables, casing_metadata)); TRY(update_property_tables(code_point, unicode_data.general_category_tables, general_category_metadata)); TRY(update_property_tables(code_point, unicode_data.property_tables, property_metadata)); TRY(update_property_tables(code_point, unicode_data.script_tables, script_metadata)); diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp index 64a9ea20252..822e9c83899 100644 --- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp +++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp @@ -124,6 +124,16 @@ TEST_CASE(to_unicode_casefold) EXPECT_EQ(result, "\u03B1\u0342\u03B9"sv); } +BENCHMARK_CASE(casing) +{ + for (size_t i = 0; i < 50'000; ++i) { + __test_to_unicode_lowercase(); + __test_to_unicode_uppercase(); + __test_to_unicode_titlecase(); + __test_to_unicode_casefold(); + } +} + TEST_CASE(to_unicode_lowercase_unconditional_special_casing) { // LATIN SMALL LETTER SHARP S diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp index 7c5ed2393ae..7fe764252d6 100644 --- a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp +++ b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp @@ -147,7 +147,7 @@ static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t in return false; } -static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View const& string, Optional locale, size_t index, size_t byte_length) +static Optional find_matching_special_case(u32 code_point, Utf8View const& string, Optional locale, size_t index, size_t byte_length) { auto requested_locale = Locale::None; @@ -158,11 +158,11 @@ static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View auto special_casings = special_case_mapping(code_point); - for (auto const* special_casing : special_casings) { - if (special_casing->locale != Locale::None && special_casing->locale != requested_locale) + for (auto const& special_casing : special_casings) { + if (special_casing.locale != Locale::None && special_casing.locale != requested_locale) continue; - switch (special_casing->condition) { + switch (special_casing.condition) { case Condition::None: return special_casing; @@ -193,20 +193,20 @@ static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View } } - return nullptr; + return {}; } template -static CaseFolding const* find_matching_case_folding(u32 code_point) +static Optional find_matching_case_folding(u32 code_point) { auto case_foldings = case_folding_mapping(code_point); - for (auto const* case_folding : case_foldings) { - if (((case_folding->status == StatusFilter) || ...)) + for (auto const& case_folding : case_foldings) { + if (((case_folding.status == StatusFilter) || ...)) return case_folding; } - return nullptr; + return {}; } #endif @@ -222,8 +222,8 @@ ErrorOr build_lowercase_string([[maybe_unused]] Utf8View code_points, [[ma u32 code_point = *it; byte_length = it.underlying_code_point_length_in_bytes(); - auto const* special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length); - if (!special_casing) { + auto special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length); + if (!special_casing.has_value()) { TRY(builder.try_append_code_point(to_unicode_lowercase(code_point))); continue; } @@ -249,8 +249,8 @@ ErrorOr build_uppercase_string([[maybe_unused]] Utf8View code_points, [[ma u32 code_point = *it; byte_length = it.underlying_code_point_length_in_bytes(); - auto const* special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length); - if (!special_casing) { + auto special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length); + if (!special_casing.has_value()) { TRY(builder.try_append_code_point(to_unicode_uppercase(code_point))); continue; } @@ -287,8 +287,8 @@ ErrorOr build_titlecase_string([[maybe_unused]] Utf8View code_points, [[ma }; auto append_code_point_as_titlecase = [&](auto code_point, auto code_point_offset, auto code_point_length) -> ErrorOr { - auto const* special_casing = find_matching_special_case(code_point, code_points, locale, code_point_offset, code_point_length); - if (!special_casing) { + auto special_casing = find_matching_special_case(code_point, code_points, locale, code_point_offset, code_point_length); + if (!special_casing.has_value()) { TRY(builder.try_append_code_point(to_unicode_titlecase(code_point))); return {}; } @@ -350,7 +350,7 @@ Utf32View casefold_code_point(u32 const& code_point) // CaseFolding.txt in the Unicode Character Database. using enum CaseFoldingStatus; - if (auto const* case_folding = find_matching_case_folding(code_point)) + if (auto case_folding = find_matching_case_folding(code_point); case_folding.has_value()) return Utf32View { case_folding->mapping, case_folding->mapping_size }; #endif