diff --git a/Meta/CMake/unicode_data.cmake b/Meta/CMake/unicode_data.cmake index 6914b99a11f..dc97c6362b3 100644 --- a/Meta/CMake/unicode_data.cmake +++ b/Meta/CMake/unicode_data.cmake @@ -13,6 +13,9 @@ set(UNICODE_DATA_PATH "${UCD_PATH}/${UNICODE_DATA_SOURCE}") set(SPECIAL_CASING_SOURCE "SpecialCasing.txt") set(SPECIAL_CASING_PATH "${UCD_PATH}/${SPECIAL_CASING_SOURCE}") +set(CASE_FOLDING_SOURCE "CaseFolding.txt") +set(CASE_FOLDING_PATH "${UCD_PATH}/${CASE_FOLDING_SOURCE}") + set(DERIVED_GENERAL_CATEGORY_SOURCE "extracted/DerivedGeneralCategory.txt") set(DERIVED_GENERAL_CATEGORY_PATH "${UCD_PATH}/${DERIVED_GENERAL_CATEGORY_SOURCE}") @@ -71,6 +74,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) download_file("${UCD_ZIP_URL}" "${UCD_ZIP_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${UNICODE_DATA_SOURCE}" "${UNICODE_DATA_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SPECIAL_CASING_SOURCE}" "${SPECIAL_CASING_PATH}") + extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${CASE_FOLDING_SOURCE}" "${CASE_FOLDING_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_GENERAL_CATEGORY_SOURCE}" "${DERIVED_GENERAL_CATEGORY_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${PROP_LIST_SOURCE}" "${PROP_LIST_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_CORE_PROP_SOURCE}" "${DERIVED_CORE_PROP_PATH}") @@ -105,7 +109,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) "${UCD_VERSION_FILE}" "${UNICODE_DATA_HEADER}" "${UNICODE_DATA_IMPLEMENTATION}" - arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -k "${BLOCKS_PATH}" -e "${EMOJI_DATA_PATH}" -m "${NAME_ALIAS_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}" + arguments -u 
"${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -o "${CASE_FOLDING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -k "${BLOCKS_PATH}" -e "${EMOJI_DATA_PATH}" -m "${NAME_ALIAS_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}" ) invoke_generator( "EmojiData" diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index 62947ac3265..f79b2bb8895 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -41,6 +41,13 @@ struct SpecialCasing { DeprecatedString condition; }; +// https://www.unicode.org/reports/tr44/#CaseFolding.txt +struct CaseFolding { + u32 code_point { 0 }; + StringView status { "Common"sv }; + Vector mapping { 0 }; +}; + // https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings struct CodePointDecomposition { // `tag` is a string since it's used for codegen as an enum value. 
@@ -90,6 +97,7 @@ struct CodePointData { Optional simple_lowercase_mapping; Optional simple_titlecase_mapping; Vector special_casing_indices; + Vector case_folding_indices; }; struct BlockName { @@ -117,6 +125,12 @@ struct UnicodeData { Vector conditions; Vector locales; + Vector case_folding; + u32 code_points_with_case_folding { 0 }; + u32 largest_case_folding_mapping_size { 0 }; + u32 largest_case_folding_size { 0 }; + Vector statuses; + Vector code_point_data; HashMap code_point_abbreviations; @@ -276,6 +290,54 @@ static ErrorOr parse_special_casing(Core::Stream::BufferedFile& file, Unic return {}; } +static ErrorOr parse_case_folding(Core::Stream::BufferedFile& file, UnicodeData& unicode_data) +{ + Array buffer; + + while (TRY(file.can_read_line())) { + auto line = TRY(file.read_line(buffer)); + if (line.is_empty() || line.starts_with('#')) + continue; + + auto segments = line.split_view(';', SplitBehavior::KeepEmpty); + VERIFY(segments.size() == 4); + + CaseFolding folding {}; + folding.code_point = AK::StringUtils::convert_to_uint_from_hex(segments[0]).value(); + folding.mapping = parse_code_point_list(segments[2]); + + switch (segments[1].trim_whitespace()[0]) { + case 'C': + folding.status = "Common"sv; + break; + case 'F': + folding.status = "Full"sv; + break; + case 'S': + folding.status = "Simple"sv; + break; + case 'T': + folding.status = "Special"sv; + break; + } + + unicode_data.largest_case_folding_mapping_size = max(unicode_data.largest_case_folding_mapping_size, folding.mapping.size()); + + if (!unicode_data.statuses.contains_slow(folding.status)) + unicode_data.statuses.append(folding.status); + + unicode_data.case_folding.append(move(folding)); + } + + quick_sort(unicode_data.case_folding, [](auto const& lhs, auto const& rhs) { + if (lhs.code_point != rhs.code_point) + return lhs.code_point < rhs.code_point; + return lhs.status < rhs.status; + }); + + return {}; +} + static ErrorOr parse_prop_list(Core::Stream::BufferedFile& file, PropList& 
prop_list, bool multi_value_property = false, bool sanitize_property = false) { Array buffer; @@ -667,6 +729,14 @@ static ErrorOr parse_unicode_data(Core::Stream::BufferedFile& file, Unicod } } + bool has_case_folding { false }; + for (size_t i = 0; i < unicode_data.case_folding.size(); ++i) { + if (auto const& folding = unicode_data.case_folding[i]; folding.code_point == data.code_point) { + data.case_folding_indices.append(i); + has_case_folding = true; + } + } + unicode_data.code_points_with_non_zero_combining_class += data.canonical_combining_class != 0; unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value(); unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value(); @@ -675,8 +745,11 @@ static ErrorOr parse_unicode_data(Core::Stream::BufferedFile& file, Unicod unicode_data.code_points_with_special_casing += has_special_casing; unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size()); - previous_code_point = data.code_point; + unicode_data.code_points_with_case_folding += has_case_folding; + unicode_data.largest_case_folding_size = max(unicode_data.largest_case_folding_size, data.case_folding_indices.size()); + + previous_code_point = data.code_point; unicode_data.code_point_data.append(move(data)); } @@ -688,8 +761,9 @@ static ErrorOr generate_unicode_data_header(Core::Stream::BufferedFile& fi StringBuilder builder; SourceGenerator generator { builder }; generator.set("special_casing_mapping_size", DeprecatedString::number(unicode_data.largest_special_casing_mapping_size)); + generator.set("case_folding_mapping_size", DeprecatedString::number(unicode_data.largest_case_folding_mapping_size)); - auto generate_enum = [&](StringView name, StringView default_, Vector values, Vector aliases = {}) { + auto generate_enum = [&](StringView name, StringView default_, auto values, Vector aliases = {}) { quick_sort(values); quick_sort(aliases, 
[](auto& alias1, auto& alias2) { return alias1.alias < alias2.alias; }); @@ -737,6 +811,7 @@ namespace Unicode { generate_enum("Locale"sv, "None"sv, unicode_data.locales); generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions)); + generate_enum("CaseFoldingStatus"sv, {}, move(unicode_data.statuses)); generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases); generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases); generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases); @@ -763,6 +838,14 @@ struct SpecialCasing { Condition condition { Condition::None }; }; +struct CaseFolding { + u32 code_point { 0 }; + CaseFoldingStatus status { CaseFoldingStatus::Common }; + + u32 mapping[@case_folding_mapping_size@]; + u32 mapping_size { 0 }; +}; + struct CodePointDecompositionRaw { u32 code_point { 0 }; CompatibilityFormattingTag tag { CompatibilityFormattingTag::Canonical }; @@ -779,6 +862,7 @@ struct CodePointDecomposition { Optional locale_from_string(StringView locale); Span special_case_mapping(u32 code_point); +Span case_folding_mapping(u32 code_point); } )~~~"); @@ -795,6 +879,8 @@ static ErrorOr generate_unicode_data_implementation(Core::Stream::Buffered generator.set("string_index_type"sv, unicode_data.unique_strings.type_that_fits()); generator.set("largest_special_casing_size", DeprecatedString::number(unicode_data.largest_special_casing_size)); generator.set("special_casing_size", DeprecatedString::number(unicode_data.special_casing.size())); + generator.set("largest_case_folding_size", DeprecatedString::number(unicode_data.largest_case_folding_size)); + generator.set("case_folding_size", DeprecatedString::number(unicode_data.case_folding.size())); generator.append(R"~~~( #include @@ -830,7 +916,7 @@ namespace Unicode { }; generator.append(R"~~~( -static constexpr Array s_special_casing { {)~~~"); +static constexpr Array 
s_special_case { {)~~~"); for (auto const& casing : unicode_data.special_casing) { generator.set("code_point", DeprecatedString::formatted("{:#x}", casing.code_point)); @@ -854,6 +940,21 @@ static constexpr Array s_special_casing { generator.append(R"~~~( } }; +static constexpr Array s_case_folding { {)~~~"); + + for (auto const& folding : unicode_data.case_folding) { + generator.set("code_point", DeprecatedString::formatted("{:#x}", folding.code_point)); + generator.set("status", folding.status); + generator.append(R"~~~( + { @code_point@, CaseFoldingStatus::@status@)~~~"); + + append_list_and_size(folding.mapping, "0x{:x}"sv); + generator.append(" },"); + } + + generator.append(R"~~~( +} }; + struct CodePointMapping { u32 code_point { 0 }; u32 mapping { 0 }; @@ -865,6 +966,12 @@ struct SpecialCaseMapping { u32 special_casing_size { 0 }; }; +struct CaseFoldingMapping { + u32 code_point { 0 }; + Array case_folding {}; + u32 case_folding_size { 0 }; +}; + struct CodePointAbbreviation { u32 code_point { 0 }; @string_index_type@ abbreviation { 0 }; @@ -953,7 +1060,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { { generator.set("size", DeprecatedString::number(mapping->decomposition_size)); generator.append(", CompatibilityFormattingTag::@tag@, @start@, @size@ },"); } else { - append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv); + append_list_and_size(mapping, "&s_@name@[{}]"sv); generator.append(" },"); } @@ -977,6 +1084,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { { append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; }); append_code_point_mappings("titlecase"sv, "CodePointMapping"sv, unicode_data.simple_titlecase_mapping_size, [](auto const& data) { return data.simple_titlecase_mapping; }); append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, 
unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; }); + append_code_point_mappings("case_folding"sv, "CaseFoldingMapping"sv, unicode_data.code_points_with_case_folding, [](auto const& data) { return data.case_folding_indices; }); append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; }); append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping, @@ -1148,6 +1256,15 @@ Span special_case_mapping(u32 code_point) return mapping->special_casing.span().slice(0, mapping->special_casing_size); } +Span case_folding_mapping(u32 code_point) +{ + auto const* mapping = binary_search(s_case_folding_mappings, code_point, nullptr, CodePointComparator {}); + if (mapping == nullptr) + return {}; + + return mapping->case_folding.span().slice(0, mapping->case_folding_size); +} + Optional code_point_abbreviation(u32 code_point) { auto const* mapping = binary_search(s_abbreviation_mappings, code_point, nullptr, CodePointComparator {}); @@ -1373,6 +1490,7 @@ ErrorOr serenity_main(Main::Arguments arguments) StringView generated_implementation_path; StringView unicode_data_path; StringView special_casing_path; + StringView case_folding_path; StringView derived_general_category_path; StringView prop_list_path; StringView derived_core_prop_path; @@ -1394,6 +1512,7 @@ ErrorOr serenity_main(Main::Arguments arguments) args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path"); args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path"); args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path"); + 
args_parser.add_option(case_folding_path, "Path to CaseFolding.txt file", "case-folding-path", 'o', "case-folding-path"); args_parser.add_option(derived_general_category_path, "Path to DerivedGeneralCategory.txt file", "derived-general-category-path", 'g', "derived-general-category-path"); args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path"); args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path"); @@ -1416,6 +1535,7 @@ ErrorOr serenity_main(Main::Arguments arguments) auto unicode_data_file = TRY(open_file(unicode_data_path, Core::Stream::OpenMode::Read)); auto derived_general_category_file = TRY(open_file(derived_general_category_path, Core::Stream::OpenMode::Read)); auto special_casing_file = TRY(open_file(special_casing_path, Core::Stream::OpenMode::Read)); + auto case_folding_file = TRY(open_file(case_folding_path, Core::Stream::OpenMode::Read)); auto prop_list_file = TRY(open_file(prop_list_path, Core::Stream::OpenMode::Read)); auto derived_core_prop_file = TRY(open_file(derived_core_prop_path, Core::Stream::OpenMode::Read)); auto derived_binary_prop_file = TRY(open_file(derived_binary_prop_path, Core::Stream::OpenMode::Read)); @@ -1433,6 +1553,7 @@ ErrorOr serenity_main(Main::Arguments arguments) UnicodeData unicode_data {}; TRY(parse_special_casing(*special_casing_file, unicode_data)); + TRY(parse_case_folding(*case_folding_file, unicode_data)); TRY(parse_prop_list(*derived_general_category_file, unicode_data.general_categories)); TRY(parse_prop_list(*prop_list_file, unicode_data.prop_list)); TRY(parse_prop_list(*derived_core_prop_file, unicode_data.prop_list)); diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp index 971461ba8ef..cc13963e272 100644 --- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp +++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp @@ 
-97,6 +97,33 @@ TEST_CASE(to_unicode_titlecase) EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("123dollars"sv)), "123Dollars"sv); } +TEST_CASE(to_unicode_casefold) +{ + for (u8 code_point = 0; code_point < 0x80; ++code_point) { + auto ascii = tolower(code_point); + auto unicode = MUST(Unicode::to_unicode_casefold_full({ reinterpret_cast(&code_point), 1 })); + + EXPECT_EQ(unicode.bytes_as_string_view().length(), 1u); + EXPECT_EQ(unicode.bytes_as_string_view()[0], ascii); + } + + // LATIN SMALL LETTER SHARP S + auto result = MUST(Unicode::to_unicode_casefold_full("\u00DF"sv)); + EXPECT_EQ(result, "\u0073\u0073"sv); + + // GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI + result = MUST(Unicode::to_unicode_casefold_full("\u1FB3"sv)); + EXPECT_EQ(result, "\u03B1\u03B9"sv); + + // GREEK SMALL LETTER ALPHA WITH PERISPOMENI + result = MUST(Unicode::to_unicode_casefold_full("\u1FB6"sv)); + EXPECT_EQ(result, "\u03B1\u0342"sv); + + // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI + result = MUST(Unicode::to_unicode_casefold_full("\u1FB7"sv)); + EXPECT_EQ(result, "\u03B1\u0342\u03B9"sv); +} + TEST_CASE(to_unicode_lowercase_unconditional_special_casing) { // LATIN SMALL LETTER SHARP S diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 50ded775c16..a38609399da 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -61,6 +61,13 @@ ErrorOr to_unicode_titlecase_full(StringView string, Optional to_unicode_casefold_full(StringView string) +{ + StringBuilder builder; + TRY(Detail::build_casefold_string(Utf8View { string }, builder)); + return builder.to_string(); +} + Optional __attribute__((weak)) general_category_from_string(StringView) { return {}; } bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; } Optional __attribute__((weak)) property_from_string(StringView) { return {}; } diff --git 
a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h index 43f3c8f6e9a..1976d614e95 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.h +++ b/Userland/Libraries/LibUnicode/CharacterTypes.h @@ -44,6 +44,7 @@ u32 to_unicode_titlecase(u32 code_point); ErrorOr to_unicode_lowercase_full(StringView, Optional const& locale = {}); ErrorOr to_unicode_uppercase_full(StringView, Optional const& locale = {}); ErrorOr to_unicode_titlecase_full(StringView, Optional const& locale = {}); +ErrorOr to_unicode_casefold_full(StringView); Optional general_category_from_string(StringView); bool code_point_has_general_category(u32 code_point, GeneralCategory general_category); diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp index 8af88ea244d..fbd3a8a164b 100644 --- a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp +++ b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp @@ -195,6 +195,19 @@ static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View return nullptr; } +template +static CaseFolding const* find_matching_case_folding(u32 code_point) +{ + auto case_foldings = case_folding_mapping(code_point); + + for (auto const* case_folding : case_foldings) { + if (((case_folding->status == StatusFilter) || ...)) + return case_folding; + } + + return nullptr; +} + #endif // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078 @@ -314,4 +327,32 @@ ErrorOr build_titlecase_string([[maybe_unused]] Utf8View code_points, [[ma #endif } +// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253 +ErrorOr build_casefold_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder) +{ +#if ENABLE_UNICODE_DATA + // toCasefold(X): Map each character C in X to Case_Folding(C). 
+ // + // Case_Folding(C) uses the mappings with the status field value “C” or “F” in the data file + // CaseFolding.txt in the Unicode Character Database. + + using enum CaseFoldingStatus; + + for (auto code_point : code_points) { + auto const* case_folding = find_matching_case_folding(code_point); + if (!case_folding) { + TRY(builder.try_append_code_point(code_point)); + continue; + } + + for (size_t i = 0; i < case_folding->mapping_size; ++i) + TRY(builder.try_append_code_point(case_folding->mapping[i])); + } + + return {}; +#else + return Error::from_string_literal("Unicode data has been disabled"); +#endif +} + } diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.h b/Userland/Libraries/LibUnicode/UnicodeUtils.h index 5e9bcbf2a70..af7702abbc7 100644 --- a/Userland/Libraries/LibUnicode/UnicodeUtils.h +++ b/Userland/Libraries/LibUnicode/UnicodeUtils.h @@ -17,5 +17,6 @@ namespace Unicode::Detail { ErrorOr build_lowercase_string(Utf8View code_points, StringBuilder& builder, Optional const& locale); ErrorOr build_uppercase_string(Utf8View code_points, StringBuilder& builder, Optional const& locale); ErrorOr build_titlecase_string(Utf8View code_points, StringBuilder& builder, Optional const& locale); +ErrorOr build_casefold_string(Utf8View code_points, StringBuilder& builder); }