From 5cf818e305eca0dfc4c295b2c83a7d8a9a5786bb Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 19 Jun 2024 16:39:30 -0400 Subject: [PATCH] LibUnicode: Replace case transformations and comparison with ICUs There are a couple of differences here due to using ICU: 1. Titlecasing behaves slightly differently. We previously transformed "123dollars" to "123Dollars", as we would use word segmentation to split a string into words, then transform the first cased character to titlecase. ICU doesn't go quite that far, and leaves the string as "123dollars". While this is a behavior change, the only user of this API is the `text-transform: capitalize;` CSS rule, and we now match the behavior of other browsers. 2. There isn't an API to compare strings with case insensitivity without allocating case-folded strings for both the left- and right-hand-side strings. Our implementation was previously allocation-free; however, in a benchmark, ICU is still ~1.4x faster. --- Meta/CMake/unicode_data.cmake | 10 +- .../LibUnicode/GenerateUnicodeData.cpp | 439 ------------------ Tests/AK/TestString.cpp | 1 - .../LibUnicode/TestUnicodeCharacterTypes.cpp | 76 --- Userland/Libraries/LibRegex/RegexByteCode.cpp | 10 +- Userland/Libraries/LibUnicode/CMakeLists.txt | 1 - .../Libraries/LibUnicode/CharacterTypes.cpp | 134 +----- .../Libraries/LibUnicode/CharacterTypes.h | 21 +- Userland/Libraries/LibUnicode/Forward.h | 5 +- Userland/Libraries/LibUnicode/String.cpp | 114 ++++- .../Libraries/LibUnicode/UnicodeUtils.cpp | 368 --------------- Userland/Libraries/LibUnicode/UnicodeUtils.h | 24 - 12 files changed, 111 insertions(+), 1092 deletions(-) delete mode 100644 Userland/Libraries/LibUnicode/UnicodeUtils.cpp delete mode 100644 Userland/Libraries/LibUnicode/UnicodeUtils.h diff --git a/Meta/CMake/unicode_data.cmake b/Meta/CMake/unicode_data.cmake index 08b07d31a51..b0c96be9b8d 100644 --- a/Meta/CMake/unicode_data.cmake +++ b/Meta/CMake/unicode_data.cmake @@ -13,12 +13,6 @@ set(UCD_ZIP_PATH 
"${UCD_PATH}/UCD.zip") set(UNICODE_DATA_SOURCE "UnicodeData.txt") set(UNICODE_DATA_PATH "${UCD_PATH}/${UNICODE_DATA_SOURCE}") -set(SPECIAL_CASING_SOURCE "SpecialCasing.txt") -set(SPECIAL_CASING_PATH "${UCD_PATH}/${SPECIAL_CASING_SOURCE}") - -set(CASE_FOLDING_SOURCE "CaseFolding.txt") -set(CASE_FOLDING_PATH "${UCD_PATH}/${CASE_FOLDING_SOURCE}") - set(DERIVED_GENERAL_CATEGORY_SOURCE "extracted/DerivedGeneralCategory.txt") set(DERIVED_GENERAL_CATEGORY_PATH "${UCD_PATH}/${DERIVED_GENERAL_CATEGORY_SOURCE}") @@ -72,8 +66,6 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) if (ENABLE_NETWORK_DOWNLOADS) download_file("${UCD_ZIP_URL}" "${UCD_ZIP_PATH}" SHA256 "${UCD_SHA256}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${UNICODE_DATA_SOURCE}" "${UNICODE_DATA_PATH}") - extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SPECIAL_CASING_SOURCE}" "${SPECIAL_CASING_PATH}") - extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${CASE_FOLDING_SOURCE}" "${CASE_FOLDING_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_GENERAL_CATEGORY_SOURCE}" "${DERIVED_GENERAL_CATEGORY_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${PROP_LIST_SOURCE}" "${PROP_LIST_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_CORE_PROP_SOURCE}" "${DERIVED_CORE_PROP_PATH}") @@ -111,7 +103,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) "${UCD_VERSION_FILE}" "${UNICODE_DATA_HEADER}" "${UNICODE_DATA_IMPLEMENTATION}" - arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -o "${CASE_FOLDING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}" + arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d 
"${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}" ) invoke_generator( "EmojiData" diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index 75aa9825d4e..b1d5d27e4d1 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -21,24 +21,6 @@ #include #include -// https://www.unicode.org/reports/tr44/#SpecialCasing.txt -struct SpecialCasing { - u32 index { 0 }; - u32 code_point { 0 }; - Vector lowercase_mapping; - Vector uppercase_mapping; - Vector titlecase_mapping; - ByteString locale; - ByteString condition; -}; - -// https://www.unicode.org/reports/tr44/#CaseFolding.txt -struct CaseFolding { - u32 code_point { 0 }; - StringView status { "Common"sv }; - Vector mapping { 0 }; -}; - // https://www.unicode.org/reports/tr44/#PropList.txt using PropList = HashMap>; @@ -57,25 +39,6 @@ struct Normalization { using NormalizationProps = HashMap>; -struct CasingTable { - bool operator==(CasingTable const& other) const - { - return canonical_combining_class == other.canonical_combining_class - && simple_lowercase_mapping == other.simple_lowercase_mapping - && simple_uppercase_mapping == other.simple_uppercase_mapping - && simple_titlecase_mapping == other.simple_titlecase_mapping - && special_casing_indices == other.special_casing_indices - && case_folding_indices == other.case_folding_indices; - } - - u8 canonical_combining_class { 0 }; - Optional simple_uppercase_mapping; - Optional simple_lowercase_mapping; - Optional simple_titlecase_mapping; - Vector special_casing_indices; - Vector case_folding_indices; -}; - // 
https://www.unicode.org/reports/tr44/#UnicodeData.txt struct CodePointData { u32 code_point { 0 }; @@ -87,7 +50,6 @@ struct CodePointData { bool bidi_mirrored { false }; ByteString unicode_1_name; ByteString iso_comment; - CasingTable casing; }; using PropertyTable = Vector; @@ -111,15 +73,6 @@ struct CodePointBidiClass { }; struct UnicodeData { - Vector special_casing; - u32 largest_special_casing_mapping_size { 0 }; - Vector conditions; - Vector locales; - - Vector case_folding; - u32 largest_case_folding_mapping_size { 0 }; - Vector statuses; - Vector code_point_data; // https://www.unicode.org/reports/tr44/#General_Category_Values @@ -149,7 +102,6 @@ struct UnicodeData { PropList word_break_props; PropList sentence_break_props; - CodePointTables casing_tables; CodePointTables general_category_tables; CodePointTables property_tables; CodePointTables script_tables; @@ -180,125 +132,6 @@ static ByteString sanitize_entry(ByteString const& entry) return builder.to_byte_string(); } -static ErrorOr parse_special_casing(Core::InputBufferedFile& file, UnicodeData& unicode_data) -{ - Array buffer; - - while (TRY(file.can_read_line())) { - auto line = TRY(file.read_line(buffer)); - - if (line.is_empty() || line.starts_with('#')) - continue; - - if (auto index = line.find('#'); index.has_value()) - line = line.substring_view(0, *index); - - auto segments = line.split_view(';', SplitBehavior::KeepEmpty); - VERIFY(segments.size() == 5 || segments.size() == 6); - - SpecialCasing casing {}; - casing.code_point = AK::StringUtils::convert_to_uint_from_hex(segments[0]).value(); - casing.lowercase_mapping = parse_code_point_list(segments[1]); - casing.titlecase_mapping = parse_code_point_list(segments[2]); - casing.uppercase_mapping = parse_code_point_list(segments[3]); - - if (auto condition = segments[4].trim_whitespace(); !condition.is_empty()) { - auto conditions = condition.split_view(' ', SplitBehavior::KeepEmpty); - VERIFY(conditions.size() == 1 || conditions.size() == 2); 
- - if (conditions.size() == 2) { - casing.locale = conditions[0]; - casing.condition = conditions[1]; - } else if (all_of(conditions[0], is_ascii_lower_alpha)) { - casing.locale = conditions[0]; - } else { - casing.condition = conditions[0]; - } - - if (!casing.locale.is_empty()) { - casing.locale = ByteString::formatted("{:c}{}", to_ascii_uppercase(casing.locale[0]), casing.locale.substring_view(1)); - - if (!unicode_data.locales.contains_slow(casing.locale)) - unicode_data.locales.append(casing.locale); - } - - casing.condition = casing.condition.replace("_"sv, ""sv, ReplaceMode::All); - - if (!casing.condition.is_empty() && !unicode_data.conditions.contains_slow(casing.condition)) - unicode_data.conditions.append(casing.condition); - } - - unicode_data.largest_special_casing_mapping_size = max(unicode_data.largest_special_casing_mapping_size, casing.lowercase_mapping.size()); - unicode_data.largest_special_casing_mapping_size = max(unicode_data.largest_special_casing_mapping_size, casing.titlecase_mapping.size()); - unicode_data.largest_special_casing_mapping_size = max(unicode_data.largest_special_casing_mapping_size, casing.uppercase_mapping.size()); - - unicode_data.special_casing.append(move(casing)); - } - - quick_sort(unicode_data.special_casing, [](auto const& lhs, auto const& rhs) { - if (lhs.code_point != rhs.code_point) - return lhs.code_point < rhs.code_point; - if (lhs.locale.is_empty() && !rhs.locale.is_empty()) - return false; - if (!lhs.locale.is_empty() && rhs.locale.is_empty()) - return true; - return lhs.locale < rhs.locale; - }); - - for (u32 i = 0; i < unicode_data.special_casing.size(); ++i) - unicode_data.special_casing[i].index = i; - - return {}; -} - -static ErrorOr parse_case_folding(Core::InputBufferedFile& file, UnicodeData& unicode_data) -{ - Array buffer; - - while (TRY(file.can_read_line())) { - auto line = TRY(file.read_line(buffer)); - if (line.is_empty() || line.starts_with('#')) - continue; - - auto segments = 
line.split_view(';', SplitBehavior::KeepEmpty); - VERIFY(segments.size() == 4); - - CaseFolding folding {}; - folding.code_point = AK::StringUtils::convert_to_uint_from_hex(segments[0]).value(); - folding.mapping = parse_code_point_list(segments[2]); - - switch (segments[1].trim_whitespace()[0]) { - case 'C': - folding.status = "Common"sv; - break; - case 'F': - folding.status = "Full"sv; - break; - case 'S': - folding.status = "Simple"sv; - break; - case 'T': - folding.status = "Special"sv; - break; - } - - unicode_data.largest_case_folding_mapping_size = max(unicode_data.largest_case_folding_mapping_size, folding.mapping.size()); - - if (!unicode_data.statuses.contains_slow(folding.status)) - unicode_data.statuses.append(folding.status); - - unicode_data.case_folding.append(move(folding)); - } - - quick_sort(unicode_data.case_folding, [](auto const& lhs, auto const& rhs) { - if (lhs.code_point != rhs.code_point) - return lhs.code_point < rhs.code_point; - return lhs.status < rhs.status; - }); - - return {}; -} - static ErrorOr parse_prop_list(Core::InputBufferedFile& file, PropList& prop_list, bool multi_value_property = false, bool sanitize_property = false) { Array buffer; @@ -503,7 +336,6 @@ static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa CodePointData data {}; data.code_point = AK::StringUtils::convert_to_uint_from_hex(segments[0]).value(); data.name = segments[1]; - data.casing.canonical_combining_class = AK::StringUtils::convert_to_uint(segments[3]).value(); data.bidi_class = segments[4]; data.numeric_value_decimal = AK::StringUtils::convert_to_int(segments[6]); data.numeric_value_digit = AK::StringUtils::convert_to_int(segments[7]); @@ -511,9 +343,6 @@ static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa data.bidi_mirrored = segments[9] == "Y"sv; data.unicode_1_name = segments[10]; data.iso_comment = segments[11]; - data.casing.simple_uppercase_mapping = 
AK::StringUtils::convert_to_uint_from_hex(segments[12]); - data.casing.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex(segments[13]); - data.casing.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex(segments[14]); if (!assigned_code_point_range_start.has_value()) assigned_code_point_range_start = data.code_point; @@ -547,16 +376,6 @@ static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa } } - for (auto const& casing : unicode_data.special_casing) { - if (casing.code_point == data.code_point) - data.casing.special_casing_indices.append(casing.index); - } - - for (size_t i = 0; i < unicode_data.case_folding.size(); ++i) { - if (auto const& folding = unicode_data.case_folding[i]; folding.code_point == data.code_point) - data.casing.case_folding_indices.append(i); - } - unicode_data.bidirectional_classes.set(data.bidi_class, AK::HashSetExistingEntryBehavior::Keep); previous_code_point = data.code_point; @@ -570,8 +389,6 @@ static ErrorOr generate_unicode_data_header(Core::InputBufferedFile& file, { StringBuilder builder; SourceGenerator generator { builder }; - generator.set("special_casing_mapping_size", ByteString::number(unicode_data.largest_special_casing_mapping_size)); - generator.set("case_folding_mapping_size", ByteString::number(unicode_data.largest_case_folding_mapping_size)); auto generate_enum = [&](StringView name, StringView default_, auto values, Vector aliases = {}) { quick_sort(values); @@ -619,9 +436,6 @@ enum class @name@ : @underlying@ {)~~~"); namespace Unicode { )~~~"); - generate_enum("Locale"sv, "None"sv, unicode_data.locales); - generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions)); - generate_enum("CaseFoldingStatus"sv, {}, move(unicode_data.statuses)); generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases); generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases); 
generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases); @@ -631,35 +445,6 @@ namespace Unicode { generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values()); generator.append(R"~~~( -struct SpecialCasing { - u32 code_point { 0 }; - - u32 lowercase_mapping[@special_casing_mapping_size@]; - u32 lowercase_mapping_size { 0 }; - - u32 uppercase_mapping[@special_casing_mapping_size@]; - u32 uppercase_mapping_size { 0 }; - - u32 titlecase_mapping[@special_casing_mapping_size@]; - u32 titlecase_mapping_size { 0 }; - - Locale locale { Locale::None }; - Condition condition { Condition::None }; -}; - -struct CaseFolding { - u32 code_point { 0 }; - CaseFoldingStatus status { CaseFoldingStatus::Common }; - - u32 mapping[@case_folding_mapping_size@]; - u32 mapping_size { 0 }; -}; - -Optional locale_from_string(StringView locale); - -ReadonlySpan special_case_mapping(u32 code_point); -ReadonlySpan case_folding_mapping(u32 code_point); - } )~~~"); @@ -672,9 +457,6 @@ static ErrorOr generate_unicode_data_implementation(Core::InputBufferedFil StringBuilder builder; SourceGenerator generator { builder }; - generator.set("special_casing_size", ByteString::number(unicode_data.special_casing.size())); - generator.set("case_folding_size", ByteString::number(unicode_data.case_folding.size())); - generator.set("CODE_POINT_TABLES_LSB_COUNT", TRY(String::number(CODE_POINT_TABLES_LSB_COUNT))); generator.set("CODE_POINT_TABLES_LSB_MASK", TRY(String::formatted("{:#x}", CODE_POINT_TABLES_LSB_MASK))); @@ -693,83 +475,7 @@ static ErrorOr generate_unicode_data_implementation(Core::InputBufferedFil namespace Unicode { )~~~"); - auto append_list_and_size = [&](auto const& list, StringView format) { - if (list.is_empty()) { - generator.append(", {}, 0"); - return; - } - - bool first = true; - generator.append(", {"); - for (auto const& item : list) { - generator.append(first ? 
" "sv : ", "sv); - generator.append(ByteString::formatted(format, item)); - first = false; - } - generator.append(ByteString::formatted(" }}, {}", list.size())); - }; - generator.append(R"~~~( -static constexpr Array s_special_case { {)~~~"); - - for (auto const& casing : unicode_data.special_casing) { - generator.set("code_point", ByteString::formatted("{:#x}", casing.code_point)); - generator.append(R"~~~( - { @code_point@)~~~"); - - constexpr auto format = "{:#x}"sv; - append_list_and_size(casing.lowercase_mapping, format); - append_list_and_size(casing.uppercase_mapping, format); - append_list_and_size(casing.titlecase_mapping, format); - - generator.set("locale", casing.locale.is_empty() ? "None" : casing.locale); - generator.append(", Locale::@locale@"); - - generator.set("condition", casing.condition.is_empty() ? "None" : casing.condition); - generator.append(", Condition::@condition@"); - - generator.append(" },"); - } - - generator.append(R"~~~( -} }; - -static constexpr Array s_case_folding { {)~~~"); - - for (auto const& folding : unicode_data.case_folding) { - generator.set("code_point", ByteString::formatted("{:#x}", folding.code_point)); - generator.set("status", folding.status); - generator.append(R"~~~( - { @code_point@, CaseFoldingStatus::@status@)~~~"); - - append_list_and_size(folding.mapping, "{:#x}"sv); - generator.append(" },"); - } - - generator.append(R"~~~( -} }; - -struct CasingTable { - u8 canonical_combining_class { 0 }; - i32 simple_uppercase_mapping { -1 }; - i32 simple_lowercase_mapping { -1 }; - i32 simple_titlecase_mapping { -1 }; - - u32 special_casing_start_index { 0 }; - u32 special_casing_size { 0 }; - - u32 case_folding_start_index { 0 }; - u32 case_folding_size { 0 }; -}; - -template -struct CodePointComparator { - constexpr int operator()(u32 code_point, MappingType const& mapping) - { - return code_point - mapping.code_point; - } -}; - struct BidiClassData { CodePointRange code_point_range {}; BidirectionalClass bidi_class 
{}; @@ -784,45 +490,6 @@ struct CodePointBidiClassComparator : public CodePointRangeComparator { )~~~"); - auto append_casing_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr { - generator.set("name", TRY(String::formatted("{}_unique_properties", collection_snake))); - generator.set("size", TRY(String::number(unique_properties.size()))); - - auto optional_code_point_to_string = [](auto const& code_point) -> ErrorOr { - if (!code_point.has_value()) - return "-1"_string; - return String::number(*code_point); - }; - auto first_index_to_string = [](auto const& list) -> ErrorOr { - if (list.is_empty()) - return "0"_string; - return String::number(list.first()); - }; - - generator.append(R"~~~( -static constexpr Array @name@ { {)~~~"); - - for (auto const& casing : unique_properties) { - generator.set("canonical_combining_class", TRY(String::number(casing.canonical_combining_class))); - generator.set("simple_uppercase_mapping", TRY(optional_code_point_to_string(casing.simple_uppercase_mapping))); - generator.set("simple_lowercase_mapping", TRY(optional_code_point_to_string(casing.simple_lowercase_mapping))); - generator.set("simple_titlecase_mapping", TRY(optional_code_point_to_string(casing.simple_titlecase_mapping))); - generator.set("special_casing_start_index", TRY(first_index_to_string(casing.special_casing_indices))); - generator.set("special_casing_size", TRY(String::number(casing.special_casing_indices.size()))); - generator.set("case_folding_start_index", TRY(first_index_to_string(casing.case_folding_indices))); - generator.set("case_folding_size", TRY(String::number(casing.case_folding_indices.size()))); - - generator.append(R"~~~( - { @canonical_combining_class@, @simple_uppercase_mapping@, @simple_lowercase_mapping@, @simple_titlecase_mapping@, @special_casing_start_index@, @special_casing_size@, @case_folding_start_index@, @case_folding_size@ },)~~~"); - } - - generator.append(R"~~~( -} }; -)~~~"); - - return {}; - }; - auto 
append_property_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr { generator.set("name", TRY(String::formatted("{}_unique_properties", collection_snake))); generator.set("outer_size", TRY(String::number(unique_properties.size()))); @@ -889,7 +556,6 @@ static constexpr Array<@type@, @size@> @name@ { { return {}; }; - TRY(append_code_point_tables("s_casings"sv, unicode_data.casing_tables, append_casing_table)); TRY(append_code_point_tables("s_general_categories"sv, unicode_data.general_category_tables, append_property_table)); TRY(append_code_point_tables("s_properties"sv, unicode_data.property_tables, append_property_table)); TRY(append_code_point_tables("s_scripts"sv, unicode_data.script_tables, append_property_table)); @@ -926,64 +592,6 @@ static constexpr Array s_bidirectional_classes { { } generator.append(R"~~~( -static CasingTable const& casing_table_for_code_point(u32 code_point) -{ - auto stage1_index = code_point >> @CODE_POINT_TABLES_LSB_COUNT@; - auto stage2_index = s_casings_stage1[stage1_index] + (code_point & @CODE_POINT_TABLES_LSB_MASK@); - auto unique_properties_index = s_casings_stage2[stage2_index]; - - return s_casings_unique_properties[unique_properties_index]; -} -)~~~"); - - auto append_code_point_mapping_search = [&](StringView method, StringView mapping, Optional const& fallback = {}) { - generator.set("method", method); - generator.set("mapping", mapping); - generator.append(R"~~~( -u32 @method@(u32 code_point) -{ - auto const& casing_table = casing_table_for_code_point(code_point); - auto mapping = casing_table.@mapping@; -)~~~"); - - if (fallback.has_value()) { - generator.set("fallback", *fallback); - generator.append(R"~~~( - return mapping == -1 ? 
@fallback@ : static_cast(mapping);)~~~"); - } else { - generator.append(R"~~~( - return mapping;)~~~"); - } - - generator.append(R"~~~( -} -)~~~"); - }; - - append_code_point_mapping_search("canonical_combining_class"sv, "canonical_combining_class"sv); - append_code_point_mapping_search("to_unicode_uppercase"sv, "simple_uppercase_mapping"sv, "code_point"sv); - append_code_point_mapping_search("to_unicode_lowercase"sv, "simple_lowercase_mapping"sv, "code_point"sv); - append_code_point_mapping_search("to_unicode_titlecase"sv, "simple_titlecase_mapping"sv, "code_point"sv); - - generator.append(R"~~~( -ReadonlySpan special_case_mapping(u32 code_point) -{ - auto const& casing_table = casing_table_for_code_point(code_point); - if (casing_table.special_casing_size == 0) - return {}; - - return s_special_case.span().slice(casing_table.special_casing_start_index, casing_table.special_casing_size); -} - -ReadonlySpan case_folding_mapping(u32 code_point) -{ - auto const& casing_table = casing_table_for_code_point(code_point); - if (casing_table.case_folding_size == 0) - return {}; - - return s_case_folding.span().slice(casing_table.case_folding_start_index, casing_table.case_folding_size); -} - Optional bidirectional_class(u32 code_point) { if (auto const* entry = binary_search(s_bidirectional_classes, code_point, nullptr, CodePointBidiClassComparator {})) @@ -1036,8 +644,6 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@) return {}; }; - TRY(append_from_string("Locale"sv, "locale"sv, unicode_data.locales, {})); - TRY(append_prop_search("GeneralCategory"sv, "general_category"sv, "s_general_categories"sv)); TRY(append_from_string("GeneralCategory"sv, "general_category"sv, unicode_data.general_categories, unicode_data.general_category_aliases)); @@ -1188,22 +794,6 @@ static ErrorOr normalize_script_extensions(PropList& script_extensions, Pr return {}; } -struct CasingMetadata { - using ConstIterator = typename Vector::ConstIterator; - - 
CasingMetadata(Vector const& code_point_data) - : iterator(code_point_data.begin()) - , end(code_point_data.end()) - { - } - - ConstIterator iterator; - ConstIterator const end; - - Vector current_block; - HashMap unique_blocks; -}; - struct PropertyMetadata { static ErrorOr create(PropList& property_list) { @@ -1301,25 +891,6 @@ static ErrorOr update_tables(u32 code_point, CodePointTables& tables, a static ErrorOr create_code_point_tables(UnicodeData& unicode_data) { - auto update_casing_tables = [&](u32 code_point, CodePointTables& tables, CasingMetadata& metadata) -> ErrorOr { - CasingTable casing {}; - - while (metadata.iterator != metadata.end) { - if (code_point < metadata.iterator->code_point) - break; - - if (code_point == metadata.iterator->code_point) { - casing = move(metadata.iterator->casing); - break; - } - - ++metadata.iterator; - } - - TRY(update_tables(code_point, tables, metadata, casing)); - return {}; - }; - auto update_property_tables = [&](u32 code_point, CodePointTables& tables, PropertyMetadata& metadata) -> ErrorOr { static Unicode::CodePointRangeComparator comparator {}; @@ -1346,7 +917,6 @@ static ErrorOr create_code_point_tables(UnicodeData& unicode_data) return {}; }; - CasingMetadata casing_metadata { unicode_data.code_point_data }; auto general_category_metadata = TRY(PropertyMetadata::create(unicode_data.general_categories)); auto property_metadata = TRY(PropertyMetadata::create(unicode_data.prop_list)); auto script_metadata = TRY(PropertyMetadata::create(unicode_data.script_list)); @@ -1356,7 +926,6 @@ static ErrorOr create_code_point_tables(UnicodeData& unicode_data) auto sentence_break_metadata = TRY(PropertyMetadata::create(unicode_data.sentence_break_props)); for (u32 code_point = 0; code_point <= MAX_CODE_POINT; ++code_point) { - TRY(update_casing_tables(code_point, unicode_data.casing_tables, casing_metadata)); TRY(update_property_tables(code_point, unicode_data.general_category_tables, general_category_metadata)); 
TRY(update_property_tables(code_point, unicode_data.property_tables, property_metadata)); TRY(update_property_tables(code_point, unicode_data.script_tables, script_metadata)); @@ -1374,8 +943,6 @@ ErrorOr serenity_main(Main::Arguments arguments) StringView generated_header_path; StringView generated_implementation_path; StringView unicode_data_path; - StringView special_casing_path; - StringView case_folding_path; StringView derived_general_category_path; StringView prop_list_path; StringView derived_core_prop_path; @@ -1394,8 +961,6 @@ ErrorOr serenity_main(Main::Arguments arguments) args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path"); args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path"); args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path"); - args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path"); - args_parser.add_option(case_folding_path, "Path to CaseFolding.txt file", "case-folding-path", 'o', "case-folding-path"); args_parser.add_option(derived_general_category_path, "Path to DerivedGeneralCategory.txt file", "derived-general-category-path", 'g', "derived-general-category-path"); args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path"); args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path"); @@ -1415,8 +980,6 @@ ErrorOr serenity_main(Main::Arguments arguments) auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write)); auto unicode_data_file = TRY(open_file(unicode_data_path, 
Core::File::OpenMode::Read)); auto derived_general_category_file = TRY(open_file(derived_general_category_path, Core::File::OpenMode::Read)); - auto special_casing_file = TRY(open_file(special_casing_path, Core::File::OpenMode::Read)); - auto case_folding_file = TRY(open_file(case_folding_path, Core::File::OpenMode::Read)); auto prop_list_file = TRY(open_file(prop_list_path, Core::File::OpenMode::Read)); auto derived_core_prop_file = TRY(open_file(derived_core_prop_path, Core::File::OpenMode::Read)); auto derived_binary_prop_file = TRY(open_file(derived_binary_prop_path, Core::File::OpenMode::Read)); @@ -1431,8 +994,6 @@ ErrorOr serenity_main(Main::Arguments arguments) auto sentence_break_file = TRY(open_file(sentence_break_path, Core::File::OpenMode::Read)); UnicodeData unicode_data {}; - TRY(parse_special_casing(*special_casing_file, unicode_data)); - TRY(parse_case_folding(*case_folding_file, unicode_data)); TRY(parse_prop_list(*derived_general_category_file, unicode_data.general_categories)); TRY(parse_prop_list(*prop_list_file, unicode_data.prop_list)); TRY(parse_prop_list(*derived_core_prop_file, unicode_data.prop_list)); diff --git a/Tests/AK/TestString.cpp b/Tests/AK/TestString.cpp index bbc09428931..7595aa75d21 100644 --- a/Tests/AK/TestString.cpp +++ b/Tests/AK/TestString.cpp @@ -592,7 +592,6 @@ TEST_CASE(to_titlecase) EXPECT_EQ(MUST("foo bar baz"_string.to_titlecase()), "Foo Bar Baz"sv); EXPECT_EQ(MUST("foo \n \r bar \t baz"_string.to_titlecase()), "Foo \n \r Bar \t Baz"sv); EXPECT_EQ(MUST("f\"oo\" b'ar'"_string.to_titlecase()), "F\"Oo\" B'ar'"sv); - EXPECT_EQ(MUST("123dollars"_string.to_titlecase()), "123Dollars"sv); } TEST_CASE(to_casefold) diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp index a428cbcc26f..bfc175165e1 100644 --- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp +++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp @@ -8,82 +8,6 @@ #include #include -#include - -static void 
compare_to_ascii(auto& old_function, auto& new_function) -{ - i64 result1 = 0; - i64 result2 = 0; - - for (u32 i = 0; i < 0x80; ++i) { - EXPECT_EQ(result1 = old_function(i), result2 = new_function(i)); - if (result1 != result2) - dbgln("Function input value was {}.", i); - } -} - -TEST_CASE(to_unicode_lowercase) -{ - compare_to_ascii(tolower, Unicode::to_unicode_lowercase); - - EXPECT_EQ(Unicode::to_unicode_lowercase(0x03c9u), 0x03c9u); // "ω" to "ω" - EXPECT_EQ(Unicode::to_unicode_lowercase(0x03a9u), 0x03c9u); // "Ω" to "ω" - - // Code points encoded by ranges in UnicodeData.txt - EXPECT_EQ(Unicode::to_unicode_lowercase(0x3400u), 0x3400u); - EXPECT_EQ(Unicode::to_unicode_lowercase(0x3401u), 0x3401u); - EXPECT_EQ(Unicode::to_unicode_lowercase(0x3402u), 0x3402u); - EXPECT_EQ(Unicode::to_unicode_lowercase(0x4dbfu), 0x4dbfu); -} - -TEST_CASE(to_unicode_uppercase) -{ - compare_to_ascii(toupper, Unicode::to_unicode_uppercase); - - EXPECT_EQ(Unicode::to_unicode_uppercase(0x03c9u), 0x03a9u); // "ω" to "Ω" - EXPECT_EQ(Unicode::to_unicode_uppercase(0x03a9u), 0x03a9u); // "Ω" to "Ω" - - // Code points encoded by ranges in UnicodeData.txt - EXPECT_EQ(Unicode::to_unicode_uppercase(0x3400u), 0x3400u); - EXPECT_EQ(Unicode::to_unicode_uppercase(0x3401u), 0x3401u); - EXPECT_EQ(Unicode::to_unicode_uppercase(0x3402u), 0x3402u); - EXPECT_EQ(Unicode::to_unicode_uppercase(0x4dbfu), 0x4dbfu); - - // Code points whose uppercase and titlecase mappings actually differ. 
- EXPECT_EQ(Unicode::to_unicode_uppercase(0x01c6u), 0x01c4u); // "dž" to "DŽ" - EXPECT_EQ(Unicode::to_unicode_uppercase(0x01c9u), 0x01c7u); // "lj" to "LJ" - EXPECT_EQ(Unicode::to_unicode_uppercase(0x01ccu), 0x01cau); // "nj" to "NJ" - EXPECT_EQ(Unicode::to_unicode_uppercase(0x01f3u), 0x01f1u); // "dz" to "DZ" -} - -TEST_CASE(to_unicode_titlecase) -{ - compare_to_ascii(toupper, Unicode::to_unicode_titlecase); - - EXPECT_EQ(Unicode::to_unicode_titlecase(0x03c9u), 0x03a9u); // "ω" to "Ω" - EXPECT_EQ(Unicode::to_unicode_titlecase(0x03a9u), 0x03a9u); // "Ω" to "Ω" - - // Code points encoded by ranges in UnicodeData.txt - EXPECT_EQ(Unicode::to_unicode_titlecase(0x3400u), 0x3400u); - EXPECT_EQ(Unicode::to_unicode_titlecase(0x3401u), 0x3401u); - EXPECT_EQ(Unicode::to_unicode_titlecase(0x3402u), 0x3402u); - EXPECT_EQ(Unicode::to_unicode_titlecase(0x4dbfu), 0x4dbfu); - - // Code points whose uppercase and titlecase mappings actually differ. - EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c6u), 0x01c5u); // "dž" to "Dž" - EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c9u), 0x01c8u); // "lj" to "Lj" - EXPECT_EQ(Unicode::to_unicode_titlecase(0x01ccu), 0x01cbu); // "nj" to "Nj" - EXPECT_EQ(Unicode::to_unicode_titlecase(0x01f3u), 0x01f2u); // "dz" to "Dz" -} - -BENCHMARK_CASE(casing) -{ - for (size_t i = 0; i < 50'000; ++i) { - __test_to_unicode_lowercase(); - __test_to_unicode_uppercase(); - __test_to_unicode_titlecase(); - } -} TEST_CASE(general_category) { diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index f8bdf1a3c1f..dcceb3ee9fd 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -713,10 +713,14 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt bool equal; if (input.regex_options & AllFlags::Insensitive) { - if (input.view.unicode()) - equal = Unicode::equals_ignoring_case(Utf32View { &input_view, 1 }, Utf32View { &ch1, 1 
}); - else + if (input.view.unicode()) { + auto lhs = String::from_code_point(input_view); + auto rhs = String::from_code_point(ch1); + + equal = lhs.equals_ignoring_case(rhs); + } else { equal = to_ascii_lowercase(input_view) == to_ascii_lowercase(ch1); + } } else { equal = input_view == ch1; } diff --git a/Userland/Libraries/LibUnicode/CMakeLists.txt b/Userland/Libraries/LibUnicode/CMakeLists.txt index d6beddc25ed..3daa825f406 100644 --- a/Userland/Libraries/LibUnicode/CMakeLists.txt +++ b/Userland/Libraries/LibUnicode/CMakeLists.txt @@ -8,7 +8,6 @@ set(SOURCES Normalize.cpp Segmentation.cpp String.cpp - UnicodeUtils.cpp ${UNICODE_DATA_SOURCES} ) set(GENERATED_SOURCES ${CURRENT_LIB_GENERATED}) diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 72206192a04..55545ca329b 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -1,19 +1,10 @@ /* - * Copyright (c) 2021-2023, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ -#include -#include -#include -#include -#include -#include -#include -#include #include -#include #if ENABLE_UNICODE_DATA # include @@ -21,129 +12,6 @@ namespace Unicode { -u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; } - -u32 __attribute__((weak)) to_unicode_lowercase(u32 code_point) -{ - return to_ascii_lowercase(code_point); -} - -u32 __attribute__((weak)) to_unicode_uppercase(u32 code_point) -{ - return to_ascii_uppercase(code_point); -} - -u32 __attribute__((weak)) to_unicode_titlecase(u32 code_point) -{ - return to_ascii_uppercase(code_point); -} - -template -class CasefoldStringComparator { -public: - explicit CasefoldStringComparator(ViewType string) - : m_string(string) - , m_it(m_string.begin()) - { - } - - bool has_more_data() const - { - return !m_casefolded_code_points.is_empty() || (m_it != m_string.end()); - } - - size_t index() const - 
{ - if constexpr (IsSame) - return m_string.byte_offset_of(m_it); - else if constexpr (IsSame) - return m_string.code_unit_offset_of(m_it); - else if constexpr (IsSame) - return m_string.iterator_offset(m_it); - else - static_assert(DependentFalse); - } - - u32 next_code_point() - { - VERIFY(has_more_data()); - - if (m_casefolded_code_points.is_empty()) { - m_current_code_point = *m_it; - ++m_it; - - m_casefolded_code_points = Unicode::Detail::casefold_code_point(m_current_code_point); - VERIFY(!m_casefolded_code_points.is_empty()); // Must at least contain the provided code point. - } - - auto code_point = m_casefolded_code_points[0]; - m_casefolded_code_points = m_casefolded_code_points.substring_view(1); - - return code_point; - } - -private: - ViewType m_string; - typename ViewType::Iterator m_it; - - u32 m_current_code_point { 0 }; - Utf32View m_casefolded_code_points; -}; - -// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34145 -template -bool equals_ignoring_case(ViewType lhs, ViewType rhs) -{ - // A string X is a caseless match for a string Y if and only if: - // toCasefold(X) = toCasefold(Y) - - CasefoldStringComparator lhs_comparator { lhs }; - CasefoldStringComparator rhs_comparator { rhs }; - - while (lhs_comparator.has_more_data() && rhs_comparator.has_more_data()) { - if (lhs_comparator.next_code_point() != rhs_comparator.next_code_point()) - return false; - } - - return !lhs_comparator.has_more_data() && !rhs_comparator.has_more_data(); -} - -template bool equals_ignoring_case(Utf8View, Utf8View); -template bool equals_ignoring_case(Utf16View, Utf16View); -template bool equals_ignoring_case(Utf32View, Utf32View); - -template -Optional find_ignoring_case(ViewType lhs, ViewType rhs) -{ - CasefoldStringComparator lhs_comparator { lhs }; - - while (lhs_comparator.has_more_data()) { - CasefoldStringComparator rhs_comparator { rhs }; - - auto saved_state = lhs_comparator; - auto matches = true; - - while (lhs_comparator.has_more_data() && 
rhs_comparator.has_more_data()) { - if (lhs_comparator.next_code_point() != rhs_comparator.next_code_point()) { - matches = false; - break; - } - } - - if (matches && !rhs_comparator.has_more_data()) - return saved_state.index(); - - lhs_comparator = move(saved_state); - lhs_comparator.next_code_point(); - } - - return {}; -} - -template Optional find_ignoring_case(Utf8View, Utf8View); -template Optional find_ignoring_case(Utf16View, Utf16View); -template Optional find_ignoring_case(Utf32View, Utf32View); - Optional __attribute__((weak)) general_category_from_string(StringView) { return {}; } bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; } Optional __attribute__((weak)) property_from_string(StringView) { return {}; } diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h index 7452fe5aa21..449daef653e 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.h +++ b/Userland/Libraries/LibUnicode/CharacterTypes.h @@ -1,18 +1,15 @@ /* - * Copyright (c) 2021-2023, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ #pragma once -#include #include #include -#include -#include +#include #include -#include #include namespace Unicode { @@ -29,20 +26,6 @@ struct CodePointRangeComparator { } }; -u32 canonical_combining_class(u32 code_point); - -// Note: The single code point case conversions only perform simple case folding. -// Use the full-string transformations for full case folding. 
-u32 to_unicode_lowercase(u32 code_point); -u32 to_unicode_uppercase(u32 code_point); -u32 to_unicode_titlecase(u32 code_point); - -template -bool equals_ignoring_case(ViewType, ViewType); - -template -Optional find_ignoring_case(ViewType, ViewType); - Optional general_category_from_string(StringView); bool code_point_has_general_category(u32 code_point, GeneralCategory general_category); diff --git a/Userland/Libraries/LibUnicode/Forward.h b/Userland/Libraries/LibUnicode/Forward.h index c6c6dfd9dab..9bc71644452 100644 --- a/Userland/Libraries/LibUnicode/Forward.h +++ b/Userland/Libraries/LibUnicode/Forward.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -11,7 +11,6 @@ namespace Unicode { enum class BidirectionalClass : u8; -enum class Block : u16; enum class EmojiGroup : u8; enum class GeneralCategory : u8; enum class GraphemeBreakProperty : u8; @@ -20,9 +19,7 @@ enum class Script : u8; enum class SentenceBreakProperty : u8; enum class WordBreakProperty : u8; -struct CodePointDecomposition; struct CurrencyCode; struct Emoji; -struct SpecialCasing; } diff --git a/Userland/Libraries/LibUnicode/String.cpp b/Userland/Libraries/LibUnicode/String.cpp index b7aa405e7bd..27241e4d9e6 100644 --- a/Userland/Libraries/LibUnicode/String.cpp +++ b/Userland/Libraries/LibUnicode/String.cpp @@ -1,57 +1,141 @@ /* - * Copyright (c) 2023, Tim Flynn + * Copyright (c) 2023-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ +#define AK_DONT_REPLACE_STD + #include #include -#include -#include -#include +#include + +#include +#include +#include // This file contains definitions of AK::String methods which require UCD data. 
namespace AK { +struct ResolvedLocale { + ByteString buffer; + char const* locale { nullptr }; +}; + +static ResolvedLocale resolve_locale(Optional const& locale) +{ + if (!locale.has_value()) + return {}; + + ResolvedLocale resolved_locale; + resolved_locale.buffer = *locale; + resolved_locale.locale = resolved_locale.buffer.characters(); + + return resolved_locale; +} + ErrorOr String::to_lowercase(Optional const& locale) const { - StringBuilder builder; - TRY(Unicode::Detail::build_lowercase_string(code_points(), builder, locale)); + UErrorCode status = U_ZERO_ERROR; + + StringBuilder builder { bytes_as_string_view().length() }; + icu::StringByteSink sink { &builder }; + + auto resolved_locale = resolve_locale(locale); + + icu::CaseMap::utf8ToLower(resolved_locale.locale, 0, Locale::icu_string_piece(*this), sink, nullptr, status); + if (Locale::icu_failure(status)) + return Error::from_string_literal("Unable to convert string to lowercase"); + return builder.to_string_without_validation(); } ErrorOr String::to_uppercase(Optional const& locale) const { - StringBuilder builder; - TRY(Unicode::Detail::build_uppercase_string(code_points(), builder, locale)); + UErrorCode status = U_ZERO_ERROR; + + StringBuilder builder { bytes_as_string_view().length() }; + icu::StringByteSink sink { &builder }; + + auto resolved_locale = resolve_locale(locale); + + icu::CaseMap::utf8ToUpper(resolved_locale.locale, 0, Locale::icu_string_piece(*this), sink, nullptr, status); + if (Locale::icu_failure(status)) + return Error::from_string_literal("Unable to convert string to uppercase"); + return builder.to_string_without_validation(); } ErrorOr String::to_titlecase(Optional const& locale, TrailingCodePointTransformation trailing_code_point_transformation) const { - StringBuilder builder; - TRY(Unicode::Detail::build_titlecase_string(code_points(), builder, locale, trailing_code_point_transformation)); + UErrorCode status = U_ZERO_ERROR; + + StringBuilder builder { 
bytes_as_string_view().length() }; + icu::StringByteSink sink { &builder }; + + auto resolved_locale = resolve_locale(locale); + + u32 options = 0; + if (trailing_code_point_transformation == TrailingCodePointTransformation::PreserveExisting) + options |= U_TITLECASE_NO_LOWERCASE; + + icu::CaseMap::utf8ToTitle(resolved_locale.locale, options, nullptr, Locale::icu_string_piece(*this), sink, nullptr, status); + if (Locale::icu_failure(status)) + return Error::from_string_literal("Unable to convert string to titlecase"); + return builder.to_string_without_validation(); } +static ErrorOr build_casefold_string(StringView string, StringBuilder& builder) +{ + UErrorCode status = U_ZERO_ERROR; + + icu::StringByteSink sink { &builder }; + + icu::CaseMap::utf8Fold(0, Locale::icu_string_piece(string), sink, nullptr, status); + if (Locale::icu_failure(status)) + return Error::from_string_literal("Unable to casefold string"); + + return {}; +} + ErrorOr String::to_casefold() const { - StringBuilder builder; - TRY(Unicode::Detail::build_casefold_string(code_points(), builder)); + StringBuilder builder { bytes_as_string_view().length() }; + TRY(build_casefold_string(*this, builder)); + return builder.to_string_without_validation(); } bool String::equals_ignoring_case(String const& other) const { - return Unicode::equals_ignoring_case(code_points(), other.code_points()); + StringBuilder lhs_builder { bytes_as_string_view().length() }; + if (build_casefold_string(*this, lhs_builder).is_error()) + return false; + + StringBuilder rhs_builder { other.bytes_as_string_view().length() }; + if (build_casefold_string(other, rhs_builder).is_error()) + return false; + + return lhs_builder.string_view() == rhs_builder.string_view(); } Optional String::find_byte_offset_ignoring_case(StringView needle, size_t from_byte_offset) const { - auto haystack = code_points().substring_view(from_byte_offset); + auto haystack = bytes_as_string_view().substring_view(from_byte_offset); + if 
(haystack.is_empty()) + return {}; - if (auto index = Unicode::find_ignoring_case(haystack, Utf8View { needle }); index.has_value()) + StringBuilder lhs_builder { haystack.length() }; // NOTE(review): offsets found in the casefolded haystack are returned as offsets into the original string; these may differ when folding changes UTF-8 length - confirm. + if (build_casefold_string(haystack, lhs_builder).is_error()) + return {}; + + StringBuilder rhs_builder { needle.length() }; + if (build_casefold_string(needle, rhs_builder).is_error()) + return {}; // Report "not found"; returning 'false' would implicitly convert to a match at offset 0. + + if (auto index = lhs_builder.string_view().find(rhs_builder.string_view()); index.has_value()) return *index + from_byte_offset; return {}; diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp deleted file mode 100644 index 2d3572b5a9a..00000000000 --- a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp +++ /dev/null @@ -1,368 +0,0 @@ -/* - * Copyright (c) 2023, Tim Flynn - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#include -#include -#include -#include -#include -#include -#include - -#if ENABLE_UNICODE_DATA -# include -#endif - -// For details on the algorithms used here, see Section 3.13 Default Case Algorithms -// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf - -namespace Unicode::Detail { - -#if ENABLE_UNICODE_DATA - -static bool is_after_uppercase_i(Utf8View const& string, size_t index) -{ - // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0. - auto preceding_view = string.substring_view(0, index); - bool found_uppercase_i = false; - - // FIXME: Would be better if Utf8View supported reverse iteration.
- for (auto code_point : preceding_view) { - if (code_point == 'I') { - found_uppercase_i = true; - continue; - } - - auto combining_class = canonical_combining_class(code_point); - if (combining_class == 0 || combining_class == 230) - found_uppercase_i = false; - } - - return found_uppercase_i; -} - -static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index) -{ - // There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above). - auto preceding_view = string.substring_view(0, index); - bool found_soft_dotted_code_point = false; - - // FIXME: Would be better if Utf8View supported reverse iteration. - for (auto code_point : preceding_view) { - if (code_point_has_property(code_point, Property::Soft_Dotted)) { - found_soft_dotted_code_point = true; - continue; - } - - auto combining_class = canonical_combining_class(code_point); - if (combining_class == 0 || combining_class == 230) - found_soft_dotted_code_point = false; - } - - return found_soft_dotted_code_point; -} - -static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length) -{ - // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable - // characters, and C is not followed by a sequence consisting of zero or more case-ignorable - // characters and then a cased letter. - auto preceding_view = string.substring_view(0, index); - auto following_view = ((index + byte_length) < string.byte_length()) - ? 
string.substring_view(index + byte_length) - : Utf8View {}; - - size_t cased_letter_count = 0; - - for (auto code_point : preceding_view) { - bool is_cased = code_point_has_property(code_point, Property::Cased); - bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable); - - if (is_cased && !is_case_ignorable) - ++cased_letter_count; - else if (!is_case_ignorable) - cased_letter_count = 0; - } - - if (cased_letter_count == 0) - return false; - - for (auto code_point : following_view) { - bool is_cased = code_point_has_property(code_point, Property::Cased); - bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable); - - if (is_case_ignorable) - continue; - if (is_cased) - return false; - - break; - } - - return true; -} - -static bool is_followed_by_combining_class_above(Utf8View const& string, size_t index, size_t byte_length) -{ - // C is followed by a character of combining class 230 (Above) with no intervening character of combining class 0 or 230 (Above). - auto following_view = ((index + byte_length) < string.byte_length()) - ? string.substring_view(index + byte_length) - : Utf8View {}; - - for (auto code_point : following_view) { - u32 combining_class = canonical_combining_class(code_point); - - if (combining_class == 0) - return false; - if (combining_class == 230) - return true; - } - - return false; -} - -static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length) -{ - // C is followed by combining dot above (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may - // intervene between the current character and the combining dot above. - auto following_view = ((index + byte_length) < string.byte_length()) - ? 
string.substring_view(index + byte_length) - : Utf8View {}; - - for (auto code_point : following_view) { - if (code_point == 0x307) - return true; - - u32 combining_class = canonical_combining_class(code_point); - - if (combining_class == 0) - return false; - if (combining_class == 230) - return false; - } - - return false; -} - -static Optional find_matching_special_case(u32 code_point, Utf8View const& string, Optional locale, size_t index, size_t byte_length) -{ - auto requested_locale = Locale::None; - - if (locale.has_value()) { - if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value()) - requested_locale = *maybe_locale; - } - - auto special_casings = special_case_mapping(code_point); - - for (auto const& special_casing : special_casings) { - if (special_casing.locale != Locale::None && special_casing.locale != requested_locale) - continue; - - switch (special_casing.condition) { - case Condition::None: - return special_casing; - - case Condition::AfterI: - if (is_after_uppercase_i(string, index)) - return special_casing; - break; - - case Condition::AfterSoftDotted: - if (is_after_soft_dotted_code_point(string, index)) - return special_casing; - break; - - case Condition::FinalSigma: - if (is_final_code_point(string, index, byte_length)) - return special_casing; - break; - - case Condition::MoreAbove: - if (is_followed_by_combining_class_above(string, index, byte_length)) - return special_casing; - break; - - case Condition::NotBeforeDot: - if (!is_followed_by_combining_dot_above(string, index, byte_length)) - return special_casing; - break; - } - } - - return {}; -} - -template -static Optional find_matching_case_folding(u32 code_point) -{ - auto case_foldings = case_folding_mapping(code_point); - - for (auto const& case_folding : case_foldings) { - if (((case_folding.status == StatusFilter) || ...)) - return case_folding; - } - - return {}; -} - -#endif - -// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078 -ErrorOr 
build_lowercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional const& locale) -{ -#if ENABLE_UNICODE_DATA - size_t index = 0; - size_t byte_length = 0; - - for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) { - u32 code_point = *it; - byte_length = it.underlying_code_point_length_in_bytes(); - - auto special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length); - if (!special_casing.has_value()) { - TRY(builder.try_append_code_point(to_unicode_lowercase(code_point))); - continue; - } - - for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i) - TRY(builder.try_append_code_point(special_casing->lowercase_mapping[i])); - } - - return {}; -#else - return Error::from_string_literal("Unicode data has been disabled"); -#endif -} - -// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078 -ErrorOr build_uppercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional const& locale) -{ -#if ENABLE_UNICODE_DATA - size_t index = 0; - size_t byte_length = 0; - - for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) { - u32 code_point = *it; - byte_length = it.underlying_code_point_length_in_bytes(); - - auto special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length); - if (!special_casing.has_value()) { - TRY(builder.try_append_code_point(to_unicode_uppercase(code_point))); - continue; - } - - for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i) - TRY(builder.try_append_code_point(special_casing->uppercase_mapping[i])); - } - - return {}; -#else - return Error::from_string_literal("Unicode data has been disabled"); -#endif -} - -// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078 -ErrorOr build_titlecase_string([[maybe_unused]] Utf8View code_points, 
[[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional const& locale, [[maybe_unused]] TrailingCodePointTransformation trailing_code_point_transformation) -{ -#if ENABLE_UNICODE_DATA - // toTitlecase(X): Find the word boundaries in X according to Unicode Standard Annex #29, - // “Unicode Text Segmentation.” For each word boundary, find the first cased character F following - // the word boundary. If F exists, map F to Titlecase_Mapping(F); then map all characters C between - // F and the following word boundary to Lowercase_Mapping(C). - - auto first_cased_code_point_after_boundary = [&](auto boundary, auto next_boundary) -> Optional { - auto it = code_points.iterator_at_byte_offset_without_validation(boundary); - auto end = code_points.iterator_at_byte_offset_without_validation(next_boundary); - - for (; it != end; ++it) { - if (code_point_has_property(*it, Property::Cased)) - return it; - } - - return {}; - }; - - auto append_code_point_as_titlecase = [&](auto code_point, auto code_point_offset, auto code_point_length) -> ErrorOr { - auto special_casing = find_matching_special_case(code_point, code_points, locale, code_point_offset, code_point_length); - if (!special_casing.has_value()) { - TRY(builder.try_append_code_point(to_unicode_titlecase(code_point))); - return {}; - } - - for (size_t i = 0; i < special_casing->titlecase_mapping_size; ++i) - TRY(builder.try_append_code_point(special_casing->titlecase_mapping[i])); - return {}; - }; - - size_t boundary = 0; - - while (true) { - auto next_boundary = next_word_segmentation_boundary(code_points, boundary); - if (!next_boundary.has_value()) - break; - - if (auto it = first_cased_code_point_after_boundary(boundary, *next_boundary); it.has_value()) { - auto code_point = *it.value(); - auto code_point_offset = code_points.byte_offset_of(*it); - auto code_point_length = it->underlying_code_point_length_in_bytes(); - - auto caseless_code_points = code_points.substring_view(boundary, code_point_offset - 
boundary); - TRY(builder.try_append(caseless_code_points.as_string())); - - TRY(append_code_point_as_titlecase(code_point, code_point_offset, code_point_length)); - boundary = code_point_offset + code_point_length; - } - - auto remaining_code_points = code_points.substring_view(boundary, *next_boundary - boundary); - switch (trailing_code_point_transformation) { - case TrailingCodePointTransformation::Lowercase: - TRY(build_lowercase_string(remaining_code_points, builder, locale)); - break; - case TrailingCodePointTransformation::PreserveExisting: - TRY(builder.try_append(remaining_code_points.as_string())); - break; - } - - boundary = *next_boundary; - } - - return {}; -#else - return Error::from_string_literal("Unicode data has been disabled"); -#endif -} - -// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253 -ErrorOr build_casefold_string(Utf8View code_points, StringBuilder& builder) -{ - // toCasefold(X): Map each character C in X to Case_Folding(C). - for (auto code_point : code_points) { - auto case_folding = casefold_code_point(code_point); - TRY(builder.try_append(case_folding)); - } - - return {}; -} - -// https://www.unicode.org/reports/tr44/#CaseFolding.txt -// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253 -Utf32View casefold_code_point(u32 const& code_point) -{ -#if ENABLE_UNICODE_DATA - // Case_Folding(C) uses the mappings with the status field value “C” or “F” in the data file - // CaseFolding.txt in the Unicode Character Database. - using enum CaseFoldingStatus; - - if (auto case_folding = find_matching_case_folding(code_point); case_folding.has_value()) - return Utf32View { case_folding->mapping, case_folding->mapping_size }; -#endif - - // The case foldings are omitted in the data file if they are the same as the code point itself. 
- return Utf32View { &code_point, 1 }; -} - -} diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.h b/Userland/Libraries/LibUnicode/UnicodeUtils.h deleted file mode 100644 index 320025112a1..00000000000 --- a/Userland/Libraries/LibUnicode/UnicodeUtils.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2023, Tim Flynn - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace Unicode::Detail { - -ErrorOr build_lowercase_string(Utf8View code_points, StringBuilder& builder, Optional const& locale); -ErrorOr build_uppercase_string(Utf8View code_points, StringBuilder& builder, Optional const& locale); -ErrorOr build_titlecase_string(Utf8View code_points, StringBuilder& builder, Optional const& locale, TrailingCodePointTransformation trailing_code_point_transformation); -ErrorOr build_casefold_string(Utf8View code_points, StringBuilder& builder); -Utf32View casefold_code_point(u32 const& code_point); - -}