LibUnicode: Replace case transformations and comparison with ICU's

There are a couple of differences here due to using ICU:

1. Titlecasing behaves slightly differently. We previously transformed
   "123dollars" to "123Dollars", as we would use word segmentation to
   split a string into words, then transform the first cased character
   to titlecase. ICU doesn't go quite that far, and leaves the string
   as "123dollars". While this is a behavior change, the only user of
   this API is the `text-transform: capitalize;` CSS rule, and we now
   match the behavior of other browsers.

2. There isn't an API to compare strings with case insensitivity without
   allocating case-folded strings for both the left- and right-hand-side
   strings. Our implementation was previously allocation-free; however,
   in a benchmark, ICU is still ~1.4x faster.
This commit is contained in:
Timothy Flynn 2024-06-19 16:39:30 -04:00 committed by Andreas Kling
parent a3a7a65b1c
commit 5cf818e305
Notes: sideshowbarker 2024-07-16 23:17:55 +09:00
12 changed files with 111 additions and 1092 deletions

View File

@ -13,12 +13,6 @@ set(UCD_ZIP_PATH "${UCD_PATH}/UCD.zip")
set(UNICODE_DATA_SOURCE "UnicodeData.txt")
set(UNICODE_DATA_PATH "${UCD_PATH}/${UNICODE_DATA_SOURCE}")
set(SPECIAL_CASING_SOURCE "SpecialCasing.txt")
set(SPECIAL_CASING_PATH "${UCD_PATH}/${SPECIAL_CASING_SOURCE}")
set(CASE_FOLDING_SOURCE "CaseFolding.txt")
set(CASE_FOLDING_PATH "${UCD_PATH}/${CASE_FOLDING_SOURCE}")
set(DERIVED_GENERAL_CATEGORY_SOURCE "extracted/DerivedGeneralCategory.txt")
set(DERIVED_GENERAL_CATEGORY_PATH "${UCD_PATH}/${DERIVED_GENERAL_CATEGORY_SOURCE}")
@ -72,8 +66,6 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
if (ENABLE_NETWORK_DOWNLOADS)
download_file("${UCD_ZIP_URL}" "${UCD_ZIP_PATH}" SHA256 "${UCD_SHA256}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${UNICODE_DATA_SOURCE}" "${UNICODE_DATA_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SPECIAL_CASING_SOURCE}" "${SPECIAL_CASING_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${CASE_FOLDING_SOURCE}" "${CASE_FOLDING_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_GENERAL_CATEGORY_SOURCE}" "${DERIVED_GENERAL_CATEGORY_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${PROP_LIST_SOURCE}" "${PROP_LIST_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_CORE_PROP_SOURCE}" "${DERIVED_CORE_PROP_PATH}")
@ -111,7 +103,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
"${UCD_VERSION_FILE}"
"${UNICODE_DATA_HEADER}"
"${UNICODE_DATA_IMPLEMENTATION}"
arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -o "${CASE_FOLDING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}"
arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}"
)
invoke_generator(
"EmojiData"

View File

@ -21,24 +21,6 @@
#include <LibCore/ArgsParser.h>
#include <LibUnicode/CharacterTypes.h>
// https://www.unicode.org/reports/tr44/#SpecialCasing.txt
// One parsed SpecialCasing.txt entry. All fields are filled in by parse_special_casing().
struct SpecialCasing {
// Position of this entry in UnicodeData::special_casing, assigned after that list is sorted.
u32 index { 0 };
// Code point the entry applies to (first field of the line, parsed as hexadecimal).
u32 code_point { 0 };
// Code point sequences parsed from the second (lowercase), fourth (uppercase) and
// third (titlecase) fields of the line.
Vector<u32> lowercase_mapping;
Vector<u32> uppercase_mapping;
Vector<u32> titlecase_mapping;
// Optional locale taken from the fifth field; stored with its first character uppercased.
ByteString locale;
// Optional condition taken from the fifth field; stored with all underscores removed.
ByteString condition;
};
// https://www.unicode.org/reports/tr44/#CaseFolding.txt
// One parsed CaseFolding.txt entry. All fields are filled in by parse_case_folding().
struct CaseFolding {
// Code point being folded (first field of the line, parsed as hexadecimal).
u32 code_point { 0 };
// Spelled-out status name ("Common", "Full", "Simple" or "Special") chosen from the
// single-letter second field of the line.
StringView status { "Common"sv };
// Code point sequence parsed from the third field of the line.
// NOTE(review): the `{ 0 }` initializer presumably yields a one-element vector; confirm.
// parse_case_folding() overwrites it for every entry it creates.
Vector<u32> mapping { 0 };
};
// https://www.unicode.org/reports/tr44/#PropList.txt
using PropList = HashMap<ByteString, Vector<Unicode::CodePointRange>>;
@ -57,25 +39,6 @@ struct Normalization {
using NormalizationProps = HashMap<ByteString, Vector<Normalization>>;
// Casing-related data recorded for a single code point (see CodePointData::casing).
struct CasingTable {
    // Two tables compare equal only when every field matches.
    bool operator==(CasingTable const& other) const
    {
        if (!(canonical_combining_class == other.canonical_combining_class))
            return false;
        if (!(simple_lowercase_mapping == other.simple_lowercase_mapping))
            return false;
        if (!(simple_uppercase_mapping == other.simple_uppercase_mapping))
            return false;
        if (!(simple_titlecase_mapping == other.simple_titlecase_mapping))
            return false;
        if (!(special_casing_indices == other.special_casing_indices))
            return false;
        return case_folding_indices == other.case_folding_indices;
    }

    u8 canonical_combining_class { 0 };

    // Simple (single code point) mappings; empty when no mapping was parsed.
    Optional<u32> simple_uppercase_mapping;
    Optional<u32> simple_lowercase_mapping;
    Optional<u32> simple_titlecase_mapping;

    // Indices into UnicodeData::special_casing / UnicodeData::case_folding of the
    // entries whose code point matches this one.
    Vector<u32> special_casing_indices;
    Vector<u32> case_folding_indices;
};
// https://www.unicode.org/reports/tr44/#UnicodeData.txt
struct CodePointData {
u32 code_point { 0 };
@ -87,7 +50,6 @@ struct CodePointData {
bool bidi_mirrored { false };
ByteString unicode_1_name;
ByteString iso_comment;
CasingTable casing;
};
using PropertyTable = Vector<bool>;
@ -111,15 +73,6 @@ struct CodePointBidiClass {
};
struct UnicodeData {
Vector<SpecialCasing> special_casing;
u32 largest_special_casing_mapping_size { 0 };
Vector<ByteString> conditions;
Vector<ByteString> locales;
Vector<CaseFolding> case_folding;
u32 largest_case_folding_mapping_size { 0 };
Vector<StringView> statuses;
Vector<CodePointData> code_point_data;
// https://www.unicode.org/reports/tr44/#General_Category_Values
@ -149,7 +102,6 @@ struct UnicodeData {
PropList word_break_props;
PropList sentence_break_props;
CodePointTables<CasingTable> casing_tables;
CodePointTables<PropertyTable> general_category_tables;
CodePointTables<PropertyTable> property_tables;
CodePointTables<PropertyTable> script_tables;
@ -180,125 +132,6 @@ static ByteString sanitize_entry(ByteString const& entry)
return builder.to_byte_string();
}
// Reads SpecialCasing.txt from `file` and records every entry in `unicode_data`.
//
// Besides the entries themselves, this collects the set of distinct locales and
// conditions seen, and tracks the longest mapping so the generated arrays can be sized.
// Entries end up sorted by code point (locale-specific entries before locale-less ones)
// and numbered by their final position.
static ErrorOr<void> parse_special_casing(Core::InputBufferedFile& file, UnicodeData& unicode_data)
{
    Array<u8, 1024> line_buffer;

    auto track_mapping_size = [&](auto const& mapping) {
        unicode_data.largest_special_casing_mapping_size = max(unicode_data.largest_special_casing_mapping_size, mapping.size());
    };

    while (TRY(file.can_read_line())) {
        auto line = TRY(file.read_line(line_buffer));

        if (line.is_empty() || line.starts_with('#'))
            continue;

        // Drop a trailing comment, if the line has one.
        if (auto comment_start = line.find('#'); comment_start.has_value())
            line = line.substring_view(0, *comment_start);

        auto fields = line.split_view(';', SplitBehavior::KeepEmpty);
        VERIFY(fields.size() == 5 || fields.size() == 6);

        SpecialCasing entry {};
        entry.code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(fields[0]).value();
        entry.lowercase_mapping = parse_code_point_list(fields[1]);
        entry.titlecase_mapping = parse_code_point_list(fields[2]);
        entry.uppercase_mapping = parse_code_point_list(fields[3]);

        auto condition_field = fields[4].trim_whitespace();

        if (!condition_field.is_empty()) {
            auto parts = condition_field.split_view(' ', SplitBehavior::KeepEmpty);
            VERIFY(parts.size() == 1 || parts.size() == 2);

            // Two parts are "<locale> <condition>". A lone part made only of lowercase
            // ASCII letters is a locale; any other lone part is a condition.
            if (parts.size() == 2) {
                entry.locale = parts[0];
                entry.condition = parts[1];
            } else if (all_of(parts[0], is_ascii_lower_alpha)) {
                entry.locale = parts[0];
            } else {
                entry.condition = parts[0];
            }

            if (!entry.locale.is_empty()) {
                // Uppercase the first character of the locale before recording it.
                entry.locale = ByteString::formatted("{:c}{}", to_ascii_uppercase(entry.locale[0]), entry.locale.substring_view(1));

                if (!unicode_data.locales.contains_slow(entry.locale))
                    unicode_data.locales.append(entry.locale);
            }

            entry.condition = entry.condition.replace("_"sv, ""sv, ReplaceMode::All);

            if (!entry.condition.is_empty() && !unicode_data.conditions.contains_slow(entry.condition))
                unicode_data.conditions.append(entry.condition);
        }

        track_mapping_size(entry.lowercase_mapping);
        track_mapping_size(entry.titlecase_mapping);
        track_mapping_size(entry.uppercase_mapping);

        unicode_data.special_casing.append(move(entry));
    }

    quick_sort(unicode_data.special_casing, [](auto const& lhs, auto const& rhs) {
        if (lhs.code_point != rhs.code_point)
            return lhs.code_point < rhs.code_point;

        // For the same code point, entries that carry a locale sort first.
        bool const lhs_has_locale = !lhs.locale.is_empty();
        bool const rhs_has_locale = !rhs.locale.is_empty();
        if (lhs_has_locale != rhs_has_locale)
            return lhs_has_locale;

        return lhs.locale < rhs.locale;
    });

    u32 next_index = 0;
    for (auto& entry : unicode_data.special_casing)
        entry.index = next_index++;

    return {};
}
// Reads CaseFolding.txt from `file` and records every entry in `unicode_data`.
//
// Each data line has the form "<code>; <status>; <mapping>; # <name>", so splitting on ';'
// must yield exactly four segments. The single-letter status is expanded to the name used
// for the generated CaseFoldingStatus enum. The set of distinct statuses and the longest
// mapping are tracked, and the entries end up sorted by code point and then by status name.
//
// A line with a missing or unrecognized status letter fails a VERIFY instead of being
// silently recorded with the default "Common" status.
static ErrorOr<void> parse_case_folding(Core::InputBufferedFile& file, UnicodeData& unicode_data)
{
    Array<u8, 1024> buffer;

    while (TRY(file.can_read_line())) {
        auto line = TRY(file.read_line(buffer));
        if (line.is_empty() || line.starts_with('#'))
            continue;

        auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
        VERIFY(segments.size() == 4);

        CaseFolding folding {};
        folding.code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value();
        folding.mapping = parse_code_point_list(segments[2]);

        // Guard the `[0]` access below against an empty status field.
        auto status = segments[1].trim_whitespace();
        VERIFY(!status.is_empty());

        switch (status[0]) {
        case 'C':
            folding.status = "Common"sv;
            break;
        case 'F':
            folding.status = "Full"sv;
            break;
        case 'S':
            folding.status = "Simple"sv;
            break;
        case 'T':
            folding.status = "Special"sv;
            break;
        default:
            // An unknown status letter means the data file format changed; do not guess.
            VERIFY_NOT_REACHED();
        }

        unicode_data.largest_case_folding_mapping_size = max(unicode_data.largest_case_folding_mapping_size, folding.mapping.size());

        if (!unicode_data.statuses.contains_slow(folding.status))
            unicode_data.statuses.append(folding.status);

        unicode_data.case_folding.append(move(folding));
    }

    quick_sort(unicode_data.case_folding, [](auto const& lhs, auto const& rhs) {
        if (lhs.code_point != rhs.code_point)
            return lhs.code_point < rhs.code_point;
        return lhs.status < rhs.status;
    });

    return {};
}
static ErrorOr<void> parse_prop_list(Core::InputBufferedFile& file, PropList& prop_list, bool multi_value_property = false, bool sanitize_property = false)
{
Array<u8, 1024> buffer;
@ -503,7 +336,6 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
CodePointData data {};
data.code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value();
data.name = segments[1];
data.casing.canonical_combining_class = AK::StringUtils::convert_to_uint<u8>(segments[3]).value();
data.bidi_class = segments[4];
data.numeric_value_decimal = AK::StringUtils::convert_to_int<i8>(segments[6]);
data.numeric_value_digit = AK::StringUtils::convert_to_int<i8>(segments[7]);
@ -511,9 +343,6 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
data.bidi_mirrored = segments[9] == "Y"sv;
data.unicode_1_name = segments[10];
data.iso_comment = segments[11];
data.casing.simple_uppercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[12]);
data.casing.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[13]);
data.casing.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[14]);
if (!assigned_code_point_range_start.has_value())
assigned_code_point_range_start = data.code_point;
@ -547,16 +376,6 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
}
}
for (auto const& casing : unicode_data.special_casing) {
if (casing.code_point == data.code_point)
data.casing.special_casing_indices.append(casing.index);
}
for (size_t i = 0; i < unicode_data.case_folding.size(); ++i) {
if (auto const& folding = unicode_data.case_folding[i]; folding.code_point == data.code_point)
data.casing.case_folding_indices.append(i);
}
unicode_data.bidirectional_classes.set(data.bidi_class, AK::HashSetExistingEntryBehavior::Keep);
previous_code_point = data.code_point;
@ -570,8 +389,6 @@ static ErrorOr<void> generate_unicode_data_header(Core::InputBufferedFile& file,
{
StringBuilder builder;
SourceGenerator generator { builder };
generator.set("special_casing_mapping_size", ByteString::number(unicode_data.largest_special_casing_mapping_size));
generator.set("case_folding_mapping_size", ByteString::number(unicode_data.largest_case_folding_mapping_size));
auto generate_enum = [&](StringView name, StringView default_, auto values, Vector<Alias> aliases = {}) {
quick_sort(values);
@ -619,9 +436,6 @@ enum class @name@ : @underlying@ {)~~~");
namespace Unicode {
)~~~");
generate_enum("Locale"sv, "None"sv, unicode_data.locales);
generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
generate_enum("CaseFoldingStatus"sv, {}, move(unicode_data.statuses));
generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases);
generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases);
generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases);
@ -631,35 +445,6 @@ namespace Unicode {
generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values());
generator.append(R"~~~(
struct SpecialCasing {
u32 code_point { 0 };
u32 lowercase_mapping[@special_casing_mapping_size@];
u32 lowercase_mapping_size { 0 };
u32 uppercase_mapping[@special_casing_mapping_size@];
u32 uppercase_mapping_size { 0 };
u32 titlecase_mapping[@special_casing_mapping_size@];
u32 titlecase_mapping_size { 0 };
Locale locale { Locale::None };
Condition condition { Condition::None };
};
struct CaseFolding {
u32 code_point { 0 };
CaseFoldingStatus status { CaseFoldingStatus::Common };
u32 mapping[@case_folding_mapping_size@];
u32 mapping_size { 0 };
};
Optional<Locale> locale_from_string(StringView locale);
ReadonlySpan<SpecialCasing> special_case_mapping(u32 code_point);
ReadonlySpan<CaseFolding> case_folding_mapping(u32 code_point);
}
)~~~");
@ -672,9 +457,6 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::InputBufferedFil
StringBuilder builder;
SourceGenerator generator { builder };
generator.set("special_casing_size", ByteString::number(unicode_data.special_casing.size()));
generator.set("case_folding_size", ByteString::number(unicode_data.case_folding.size()));
generator.set("CODE_POINT_TABLES_LSB_COUNT", TRY(String::number(CODE_POINT_TABLES_LSB_COUNT)));
generator.set("CODE_POINT_TABLES_LSB_MASK", TRY(String::formatted("{:#x}", CODE_POINT_TABLES_LSB_MASK)));
@ -693,83 +475,7 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::InputBufferedFil
namespace Unicode {
)~~~");
// Emits `, { a, b, ... }, N` for a non-empty list (each item rendered with `format`),
// or `, {}, 0` for an empty one.
auto append_list_and_size = [&](auto const& list, StringView format) {
    if (!list.is_empty()) {
        generator.append(", {");

        // The first item is preceded by a space, every later one by a comma and a space.
        auto separator = " "sv;
        for (auto const& item : list) {
            generator.append(separator);
            generator.append(ByteString::formatted(format, item));
            separator = ", "sv;
        }

        generator.append(ByteString::formatted(" }}, {}", list.size()));
        return;
    }

    generator.append(", {}, 0");
};
generator.append(R"~~~(
static constexpr Array<SpecialCasing, @special_casing_size@> s_special_case { {)~~~");
for (auto const& casing : unicode_data.special_casing) {
generator.set("code_point", ByteString::formatted("{:#x}", casing.code_point));
generator.append(R"~~~(
{ @code_point@)~~~");
constexpr auto format = "{:#x}"sv;
append_list_and_size(casing.lowercase_mapping, format);
append_list_and_size(casing.uppercase_mapping, format);
append_list_and_size(casing.titlecase_mapping, format);
generator.set("locale", casing.locale.is_empty() ? "None" : casing.locale);
generator.append(", Locale::@locale@");
generator.set("condition", casing.condition.is_empty() ? "None" : casing.condition);
generator.append(", Condition::@condition@");
generator.append(" },");
}
generator.append(R"~~~(
} };
static constexpr Array<CaseFolding, @case_folding_size@> s_case_folding { {)~~~");
for (auto const& folding : unicode_data.case_folding) {
generator.set("code_point", ByteString::formatted("{:#x}", folding.code_point));
generator.set("status", folding.status);
generator.append(R"~~~(
{ @code_point@, CaseFoldingStatus::@status@)~~~");
append_list_and_size(folding.mapping, "{:#x}"sv);
generator.append(" },");
}
generator.append(R"~~~(
} };
struct CasingTable {
u8 canonical_combining_class { 0 };
i32 simple_uppercase_mapping { -1 };
i32 simple_lowercase_mapping { -1 };
i32 simple_titlecase_mapping { -1 };
u32 special_casing_start_index { 0 };
u32 special_casing_size { 0 };
u32 case_folding_start_index { 0 };
u32 case_folding_size { 0 };
};
template<typename MappingType>
struct CodePointComparator {
constexpr int operator()(u32 code_point, MappingType const& mapping)
{
return code_point - mapping.code_point;
}
};
struct BidiClassData {
CodePointRange code_point_range {};
BidirectionalClass bidi_class {};
@ -784,45 +490,6 @@ struct CodePointBidiClassComparator : public CodePointRangeComparator {
)~~~");
// Emits a `static constexpr Array<CasingTable, N>` named "<collection_snake>_unique_properties"
// containing one initializer row per entry of `unique_properties`.
auto append_casing_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr<void> {
generator.set("name", TRY(String::formatted("{}_unique_properties", collection_snake)));
generator.set("size", TRY(String::number(unique_properties.size())));
// A missing simple mapping is written as -1 in the generated table.
auto optional_code_point_to_string = [](auto const& code_point) -> ErrorOr<String> {
if (!code_point.has_value())
return "-1"_string;
return String::number(*code_point);
};
// Only the first index of each list is emitted (0 when the list is empty); the list's
// size is emitted alongside it as a separate field.
auto first_index_to_string = [](auto const& list) -> ErrorOr<String> {
if (list.is_empty())
return "0"_string;
return String::number(list.first());
};
generator.append(R"~~~(
static constexpr Array<CasingTable, @size@> @name@ { {)~~~");
for (auto const& casing : unique_properties) {
generator.set("canonical_combining_class", TRY(String::number(casing.canonical_combining_class)));
generator.set("simple_uppercase_mapping", TRY(optional_code_point_to_string(casing.simple_uppercase_mapping)));
generator.set("simple_lowercase_mapping", TRY(optional_code_point_to_string(casing.simple_lowercase_mapping)));
generator.set("simple_titlecase_mapping", TRY(optional_code_point_to_string(casing.simple_titlecase_mapping)));
generator.set("special_casing_start_index", TRY(first_index_to_string(casing.special_casing_indices)));
generator.set("special_casing_size", TRY(String::number(casing.special_casing_indices.size())));
generator.set("case_folding_start_index", TRY(first_index_to_string(casing.case_folding_indices)));
generator.set("case_folding_size", TRY(String::number(casing.case_folding_indices.size())));
// The value order in this row must match the field order of the generated CasingTable struct.
generator.append(R"~~~(
{ @canonical_combining_class@, @simple_uppercase_mapping@, @simple_lowercase_mapping@, @simple_titlecase_mapping@, @special_casing_start_index@, @special_casing_size@, @case_folding_start_index@, @case_folding_size@ },)~~~");
}
generator.append(R"~~~(
} };
)~~~");
return {};
};
auto append_property_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr<void> {
generator.set("name", TRY(String::formatted("{}_unique_properties", collection_snake)));
generator.set("outer_size", TRY(String::number(unique_properties.size())));
@ -889,7 +556,6 @@ static constexpr Array<@type@, @size@> @name@ { {
return {};
};
TRY(append_code_point_tables("s_casings"sv, unicode_data.casing_tables, append_casing_table));
TRY(append_code_point_tables("s_general_categories"sv, unicode_data.general_category_tables, append_property_table));
TRY(append_code_point_tables("s_properties"sv, unicode_data.property_tables, append_property_table));
TRY(append_code_point_tables("s_scripts"sv, unicode_data.script_tables, append_property_table));
@ -926,64 +592,6 @@ static constexpr Array<BidiClassData, @size@> s_bidirectional_classes { {
}
generator.append(R"~~~(
static CasingTable const& casing_table_for_code_point(u32 code_point)
{
auto stage1_index = code_point >> @CODE_POINT_TABLES_LSB_COUNT@;
auto stage2_index = s_casings_stage1[stage1_index] + (code_point & @CODE_POINT_TABLES_LSB_MASK@);
auto unique_properties_index = s_casings_stage2[stage2_index];
return s_casings_unique_properties[unique_properties_index];
}
)~~~");
// Emits a generated function `u32 <method>(u32 code_point)` that looks up the casing table
// for the code point and returns its `<mapping>` field. When `fallback` is provided, a
// stored value of -1 is replaced by the `<fallback>` expression in the generated code;
// otherwise the field is returned as-is.
auto append_code_point_mapping_search = [&](StringView method, StringView mapping, Optional<StringView> const& fallback = {}) {
generator.set("method", method);
generator.set("mapping", mapping);
generator.append(R"~~~(
u32 @method@(u32 code_point)
{
auto const& casing_table = casing_table_for_code_point(code_point);
auto mapping = casing_table.@mapping@;
)~~~");
if (fallback.has_value()) {
generator.set("fallback", *fallback);
generator.append(R"~~~(
return mapping == -1 ? @fallback@ : static_cast<u32>(mapping);)~~~");
} else {
generator.append(R"~~~(
return mapping;)~~~");
}
generator.append(R"~~~(
}
)~~~");
};
append_code_point_mapping_search("canonical_combining_class"sv, "canonical_combining_class"sv);
append_code_point_mapping_search("to_unicode_uppercase"sv, "simple_uppercase_mapping"sv, "code_point"sv);
append_code_point_mapping_search("to_unicode_lowercase"sv, "simple_lowercase_mapping"sv, "code_point"sv);
append_code_point_mapping_search("to_unicode_titlecase"sv, "simple_titlecase_mapping"sv, "code_point"sv);
generator.append(R"~~~(
ReadonlySpan<SpecialCasing> special_case_mapping(u32 code_point)
{
auto const& casing_table = casing_table_for_code_point(code_point);
if (casing_table.special_casing_size == 0)
return {};
return s_special_case.span().slice(casing_table.special_casing_start_index, casing_table.special_casing_size);
}
ReadonlySpan<CaseFolding> case_folding_mapping(u32 code_point)
{
auto const& casing_table = casing_table_for_code_point(code_point);
if (casing_table.case_folding_size == 0)
return {};
return s_case_folding.span().slice(casing_table.case_folding_start_index, casing_table.case_folding_size);
}
Optional<BidirectionalClass> bidirectional_class(u32 code_point)
{
if (auto const* entry = binary_search(s_bidirectional_classes, code_point, nullptr, CodePointBidiClassComparator {}))
@ -1036,8 +644,6 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@)
return {};
};
TRY(append_from_string("Locale"sv, "locale"sv, unicode_data.locales, {}));
TRY(append_prop_search("GeneralCategory"sv, "general_category"sv, "s_general_categories"sv));
TRY(append_from_string("GeneralCategory"sv, "general_category"sv, unicode_data.general_categories, unicode_data.general_category_aliases));
@ -1188,22 +794,6 @@ static ErrorOr<void> normalize_script_extensions(PropList& script_extensions, Pr
return {};
}
// Cursor and block bookkeeping used while building the casing code point tables.
struct CasingMetadata {
using ConstIterator = typename Vector<CodePointData>::ConstIterator;
// Starts the cursor at the first element of `code_point_data`.
// NOTE(review): iterators into `code_point_data` are stored, so the vector presumably
// has to outlive this object and stay unmodified - confirm against the caller.
CasingMetadata(Vector<CodePointData> const& code_point_data)
: iterator(code_point_data.begin())
, end(code_point_data.end())
{
}
// Current position in the code point data; advanced by update_casing_tables().
ConstIterator iterator;
ConstIterator const end;
// NOTE(review): presumably consumed by update_tables() to accumulate and deduplicate
// table blocks; that function is not visible here - verify.
Vector<size_t> current_block;
HashMap<decltype(current_block), size_t> unique_blocks;
};
struct PropertyMetadata {
static ErrorOr<PropertyMetadata> create(PropList& property_list)
{
@ -1301,25 +891,6 @@ static ErrorOr<void> update_tables(u32 code_point, CodePointTables<T>& tables, a
static ErrorOr<void> create_code_point_tables(UnicodeData& unicode_data)
{
// Finds the casing data for `code_point` (default-constructed if there is none) and feeds
// it into `tables`. The metadata cursor only moves forward.
auto update_casing_tables = [&]<typename T>(u32 code_point, CodePointTables<T>& tables, CasingMetadata& metadata) -> ErrorOr<void> {
    // Step over entries that belong to code points smaller than the one requested.
    while (metadata.iterator != metadata.end && metadata.iterator->code_point < code_point)
        ++metadata.iterator;

    CasingTable casing {};

    if (metadata.iterator != metadata.end && metadata.iterator->code_point == code_point)
        casing = move(metadata.iterator->casing);

    TRY(update_tables(code_point, tables, metadata, casing));
    return {};
};
auto update_property_tables = [&]<typename T>(u32 code_point, CodePointTables<T>& tables, PropertyMetadata& metadata) -> ErrorOr<void> {
static Unicode::CodePointRangeComparator comparator {};
@ -1346,7 +917,6 @@ static ErrorOr<void> create_code_point_tables(UnicodeData& unicode_data)
return {};
};
CasingMetadata casing_metadata { unicode_data.code_point_data };
auto general_category_metadata = TRY(PropertyMetadata::create(unicode_data.general_categories));
auto property_metadata = TRY(PropertyMetadata::create(unicode_data.prop_list));
auto script_metadata = TRY(PropertyMetadata::create(unicode_data.script_list));
@ -1356,7 +926,6 @@ static ErrorOr<void> create_code_point_tables(UnicodeData& unicode_data)
auto sentence_break_metadata = TRY(PropertyMetadata::create(unicode_data.sentence_break_props));
for (u32 code_point = 0; code_point <= MAX_CODE_POINT; ++code_point) {
TRY(update_casing_tables(code_point, unicode_data.casing_tables, casing_metadata));
TRY(update_property_tables(code_point, unicode_data.general_category_tables, general_category_metadata));
TRY(update_property_tables(code_point, unicode_data.property_tables, property_metadata));
TRY(update_property_tables(code_point, unicode_data.script_tables, script_metadata));
@ -1374,8 +943,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
StringView generated_header_path;
StringView generated_implementation_path;
StringView unicode_data_path;
StringView special_casing_path;
StringView case_folding_path;
StringView derived_general_category_path;
StringView prop_list_path;
StringView derived_core_prop_path;
@ -1394,8 +961,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path");
args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path");
args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path");
args_parser.add_option(case_folding_path, "Path to CaseFolding.txt file", "case-folding-path", 'o', "case-folding-path");
args_parser.add_option(derived_general_category_path, "Path to DerivedGeneralCategory.txt file", "derived-general-category-path", 'g', "derived-general-category-path");
args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path");
args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path");
@ -1415,8 +980,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write));
auto unicode_data_file = TRY(open_file(unicode_data_path, Core::File::OpenMode::Read));
auto derived_general_category_file = TRY(open_file(derived_general_category_path, Core::File::OpenMode::Read));
auto special_casing_file = TRY(open_file(special_casing_path, Core::File::OpenMode::Read));
auto case_folding_file = TRY(open_file(case_folding_path, Core::File::OpenMode::Read));
auto prop_list_file = TRY(open_file(prop_list_path, Core::File::OpenMode::Read));
auto derived_core_prop_file = TRY(open_file(derived_core_prop_path, Core::File::OpenMode::Read));
auto derived_binary_prop_file = TRY(open_file(derived_binary_prop_path, Core::File::OpenMode::Read));
@ -1431,8 +994,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
auto sentence_break_file = TRY(open_file(sentence_break_path, Core::File::OpenMode::Read));
UnicodeData unicode_data {};
TRY(parse_special_casing(*special_casing_file, unicode_data));
TRY(parse_case_folding(*case_folding_file, unicode_data));
TRY(parse_prop_list(*derived_general_category_file, unicode_data.general_categories));
TRY(parse_prop_list(*prop_list_file, unicode_data.prop_list));
TRY(parse_prop_list(*derived_core_prop_file, unicode_data.prop_list));

View File

@ -592,7 +592,6 @@ TEST_CASE(to_titlecase)
EXPECT_EQ(MUST("foo bar baz"_string.to_titlecase()), "Foo Bar Baz"sv);
EXPECT_EQ(MUST("foo \n \r bar \t baz"_string.to_titlecase()), "Foo \n \r Bar \t Baz"sv);
EXPECT_EQ(MUST("f\"oo\" b'ar'"_string.to_titlecase()), "F\"Oo\" B'ar'"sv);
EXPECT_EQ(MUST("123dollars"_string.to_titlecase()), "123Dollars"sv);
}
TEST_CASE(to_casefold)

View File

@ -8,82 +8,6 @@
#include <AK/StringView.h>
#include <LibUnicode/CharacterTypes.h>
#include <ctype.h>
static void compare_to_ascii(auto& old_function, auto& new_function)
{
i64 result1 = 0;
i64 result2 = 0;
for (u32 i = 0; i < 0x80; ++i) {
EXPECT_EQ(result1 = old_function(i), result2 = new_function(i));
if (result1 != result2)
dbgln("Function input value was {}.", i);
}
}
// Unicode::to_unicode_lowercase must match tolower() for every value below 0x80, map Greek
// capital omega to small omega, and leave already-lowercase or uncased code points unchanged.
TEST_CASE(to_unicode_lowercase)
{
compare_to_ascii(tolower, Unicode::to_unicode_lowercase);
EXPECT_EQ(Unicode::to_unicode_lowercase(0x03c9u), 0x03c9u); // "ω" to "ω"
EXPECT_EQ(Unicode::to_unicode_lowercase(0x03a9u), 0x03c9u); // "Ω" to "ω"
// Code points encoded by ranges in UnicodeData.txt
EXPECT_EQ(Unicode::to_unicode_lowercase(0x3400u), 0x3400u);
EXPECT_EQ(Unicode::to_unicode_lowercase(0x3401u), 0x3401u);
EXPECT_EQ(Unicode::to_unicode_lowercase(0x3402u), 0x3402u);
EXPECT_EQ(Unicode::to_unicode_lowercase(0x4dbfu), 0x4dbfu);
}
// Unicode::to_unicode_uppercase must match toupper() for every value below 0x80, and must
// return the uppercase (not titlecase) form for the digraph code points listed at the end.
TEST_CASE(to_unicode_uppercase)
{
compare_to_ascii(toupper, Unicode::to_unicode_uppercase);
EXPECT_EQ(Unicode::to_unicode_uppercase(0x03c9u), 0x03a9u); // "ω" to "Ω"
EXPECT_EQ(Unicode::to_unicode_uppercase(0x03a9u), 0x03a9u); // "Ω" to "Ω"
// Code points encoded by ranges in UnicodeData.txt
EXPECT_EQ(Unicode::to_unicode_uppercase(0x3400u), 0x3400u);
EXPECT_EQ(Unicode::to_unicode_uppercase(0x3401u), 0x3401u);
EXPECT_EQ(Unicode::to_unicode_uppercase(0x3402u), 0x3402u);
EXPECT_EQ(Unicode::to_unicode_uppercase(0x4dbfu), 0x4dbfu);
// Code points whose uppercase and titlecase mappings actually differ.
EXPECT_EQ(Unicode::to_unicode_uppercase(0x01c6u), 0x01c4u); // "dž" to "DŽ"
EXPECT_EQ(Unicode::to_unicode_uppercase(0x01c9u), 0x01c7u); // "lj" to "LJ"
EXPECT_EQ(Unicode::to_unicode_uppercase(0x01ccu), 0x01cau); // "nj" to "NJ"
EXPECT_EQ(Unicode::to_unicode_uppercase(0x01f3u), 0x01f1u); // "dz" to "DZ"
}
// Unicode::to_unicode_titlecase must match toupper() for every value below 0x80, and must
// return the titlecase (not uppercase) form for the digraph code points listed at the end.
TEST_CASE(to_unicode_titlecase)
{
compare_to_ascii(toupper, Unicode::to_unicode_titlecase);
EXPECT_EQ(Unicode::to_unicode_titlecase(0x03c9u), 0x03a9u); // "ω" to "Ω"
EXPECT_EQ(Unicode::to_unicode_titlecase(0x03a9u), 0x03a9u); // "Ω" to "Ω"
// Code points encoded by ranges in UnicodeData.txt
EXPECT_EQ(Unicode::to_unicode_titlecase(0x3400u), 0x3400u);
EXPECT_EQ(Unicode::to_unicode_titlecase(0x3401u), 0x3401u);
EXPECT_EQ(Unicode::to_unicode_titlecase(0x3402u), 0x3402u);
EXPECT_EQ(Unicode::to_unicode_titlecase(0x4dbfu), 0x4dbfu);
// Code points whose uppercase and titlecase mappings actually differ.
EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c6u), 0x01c5u); // "dž" to "Dž"
EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c9u), 0x01c8u); // "lj" to "Lj"
EXPECT_EQ(Unicode::to_unicode_titlecase(0x01ccu), 0x01cbu); // "nj" to "Nj"
EXPECT_EQ(Unicode::to_unicode_titlecase(0x01f3u), 0x01f2u); // "dz" to "Dz"
}
// Benchmarks the casing lookups by re-running the three casing tests 50,000 times.
// NOTE(review): the __test_* names are presumably the functions that TEST_CASE() defines
// for the test cases of the same name - confirm against the test framework macros.
BENCHMARK_CASE(casing)
{
for (size_t i = 0; i < 50'000; ++i) {
__test_to_unicode_lowercase();
__test_to_unicode_uppercase();
__test_to_unicode_titlecase();
}
}
TEST_CASE(general_category)
{

View File

@ -713,10 +713,14 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt
bool equal;
if (input.regex_options & AllFlags::Insensitive) {
if (input.view.unicode())
equal = Unicode::equals_ignoring_case(Utf32View { &input_view, 1 }, Utf32View { &ch1, 1 });
else
if (input.view.unicode()) {
auto lhs = String::from_code_point(input_view);
auto rhs = String::from_code_point(ch1);
equal = lhs.equals_ignoring_case(rhs);
} else {
equal = to_ascii_lowercase(input_view) == to_ascii_lowercase(ch1);
}
} else {
equal = input_view == ch1;
}

View File

@ -8,7 +8,6 @@ set(SOURCES
Normalize.cpp
Segmentation.cpp
String.cpp
UnicodeUtils.cpp
${UNICODE_DATA_SOURCES}
)
set(GENERATED_SOURCES ${CURRENT_LIB_GENERATED})

View File

@ -1,19 +1,10 @@
/*
* Copyright (c) 2021-2023, Tim Flynn <trflynn89@serenityos.org>
* Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/CharacterTypes.h>
#include <AK/Platform.h>
#include <AK/ScopeGuard.h>
#include <AK/StringBuilder.h>
#include <AK/Types.h>
#include <AK/Utf16View.h>
#include <AK/Utf32View.h>
#include <AK/Utf8View.h>
#include <LibUnicode/CharacterTypes.h>
#include <LibUnicode/UnicodeUtils.h>
#if ENABLE_UNICODE_DATA
# include <LibUnicode/UnicodeData.h>
@ -21,129 +12,6 @@
namespace Unicode {
u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; }
// Weak definition that only applies the ASCII lowercase mapping to the code point.
// NOTE(review): presumably replaced by a strong definition when generated Unicode data is linked in - confirm.
u32 __attribute__((weak)) to_unicode_lowercase(u32 code_point)
{
return to_ascii_lowercase(code_point);
}
// Weak definition that only applies the ASCII uppercase mapping to the code point.
// NOTE(review): presumably replaced by a strong definition when generated Unicode data is linked in - confirm.
u32 __attribute__((weak)) to_unicode_uppercase(u32 code_point)
{
return to_ascii_uppercase(code_point);
}
// Weak definition that uses the ASCII uppercase mapping as the titlecase mapping.
// NOTE(review): presumably replaced by a strong definition when generated Unicode data is linked in - confirm.
u32 __attribute__((weak)) to_unicode_titlecase(u32 code_point)
{
return to_ascii_uppercase(code_point);
}
template<typename ViewType>
class CasefoldStringComparator {
public:
explicit CasefoldStringComparator(ViewType string)
: m_string(string)
, m_it(m_string.begin())
{
}
bool has_more_data() const
{
return !m_casefolded_code_points.is_empty() || (m_it != m_string.end());
}
size_t index() const
{
if constexpr (IsSame<ViewType, Utf8View>)
return m_string.byte_offset_of(m_it);
else if constexpr (IsSame<ViewType, Utf16View>)
return m_string.code_unit_offset_of(m_it);
else if constexpr (IsSame<ViewType, Utf32View>)
return m_string.iterator_offset(m_it);
else
static_assert(DependentFalse<ViewType>);
}
u32 next_code_point()
{
VERIFY(has_more_data());
if (m_casefolded_code_points.is_empty()) {
m_current_code_point = *m_it;
++m_it;
m_casefolded_code_points = Unicode::Detail::casefold_code_point(m_current_code_point);
VERIFY(!m_casefolded_code_points.is_empty()); // Must at least contain the provided code point.
}
auto code_point = m_casefolded_code_points[0];
m_casefolded_code_points = m_casefolded_code_points.substring_view(1);
return code_point;
}
private:
ViewType m_string;
typename ViewType::Iterator m_it;
u32 m_current_code_point { 0 };
Utf32View m_casefolded_code_points;
};
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34145
template<typename ViewType>
bool equals_ignoring_case(ViewType lhs, ViewType rhs)
{
// A string X is a caseless match for a string Y if and only if:
// toCasefold(X) = toCasefold(Y)
CasefoldStringComparator lhs_comparator { lhs };
CasefoldStringComparator rhs_comparator { rhs };
while (lhs_comparator.has_more_data() && rhs_comparator.has_more_data()) {
if (lhs_comparator.next_code_point() != rhs_comparator.next_code_point())
return false;
}
return !lhs_comparator.has_more_data() && !rhs_comparator.has_more_data();
}
template bool equals_ignoring_case(Utf8View, Utf8View);
template bool equals_ignoring_case(Utf16View, Utf16View);
template bool equals_ignoring_case(Utf32View, Utf32View);
// Searches lhs for the first position at which the case-folded code points of rhs match the
// case-folded code points of lhs. Returns the comparator index() of that position, or an empty
// Optional if there is no match. An empty lhs never matches, because the outer loop does not run.
template<typename ViewType>
Optional<size_t> find_ignoring_case(ViewType lhs, ViewType rhs)
{
CasefoldStringComparator lhs_comparator { lhs };
while (lhs_comparator.has_more_data()) {
CasefoldStringComparator rhs_comparator { rhs };
// Remember where this attempt started so it can be reported on success or resumed from on failure.
auto saved_state = lhs_comparator;
auto matches = true;
while (lhs_comparator.has_more_data() && rhs_comparator.has_more_data()) {
if (lhs_comparator.next_code_point() != rhs_comparator.next_code_point()) {
matches = false;
break;
}
}
// A match requires rhs to be fully consumed; running out of lhs first is not a match.
if (matches && !rhs_comparator.has_more_data())
return saved_state.index();
// Rewind to the start of this attempt and advance by one folded code point.
lhs_comparator = move(saved_state);
lhs_comparator.next_code_point();
}
return {};
}
template Optional<size_t> find_ignoring_case(Utf8View, Utf8View);
template Optional<size_t> find_ignoring_case(Utf16View, Utf16View);
template Optional<size_t> find_ignoring_case(Utf32View, Utf32View);
Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }

View File

@ -1,18 +1,15 @@
/*
* Copyright (c) 2021-2023, Tim Flynn <trflynn89@serenityos.org>
* Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/ByteString.h>
#include <AK/Forward.h>
#include <AK/Optional.h>
#include <AK/Span.h>
#include <AK/String.h>
#include <AK/StringView.h>
#include <AK/Types.h>
#include <AK/Vector.h>
#include <LibUnicode/Forward.h>
namespace Unicode {
@ -29,20 +26,6 @@ struct CodePointRangeComparator {
}
};
u32 canonical_combining_class(u32 code_point);
// Note: The single code point case conversions only perform simple case folding.
// Use the full-string transformations for full case folding.
u32 to_unicode_lowercase(u32 code_point);
u32 to_unicode_uppercase(u32 code_point);
u32 to_unicode_titlecase(u32 code_point);
template<typename ViewType>
bool equals_ignoring_case(ViewType, ViewType);
template<typename ViewType>
Optional<size_t> find_ignoring_case(ViewType, ViewType);
Optional<GeneralCategory> general_category_from_string(StringView);
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, Tim Flynn <trflynn89@serenityos.org>
* Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@ -11,7 +11,6 @@
namespace Unicode {
enum class BidirectionalClass : u8;
enum class Block : u16;
enum class EmojiGroup : u8;
enum class GeneralCategory : u8;
enum class GraphemeBreakProperty : u8;
@ -20,9 +19,7 @@ enum class Script : u8;
enum class SentenceBreakProperty : u8;
enum class WordBreakProperty : u8;
struct CodePointDecomposition;
struct CurrencyCode;
struct Emoji;
struct SpecialCasing;
}

View File

@ -1,57 +1,141 @@
/*
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
* Copyright (c) 2023-2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#define AK_DONT_REPLACE_STD
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/Utf8View.h>
#include <LibUnicode/CharacterTypes.h>
#include <LibUnicode/UnicodeUtils.h>
#include <LibLocale/ICU.h>
#include <unicode/bytestream.h>
#include <unicode/casemap.h>
#include <unicode/stringoptions.h>
// This file contains definitions of AK::String methods which require UCD data.
namespace AK {
// A locale name prepared for use as a C string.
// `buffer` owns the locale text; `locale` is null by default and is set by resolve_locale() to
// buffer.characters() when a locale was supplied.
struct ResolvedLocale {
ByteString buffer;
char const* locale { nullptr };
};
// Copies the optional locale into an owned buffer and exposes it as a C string.
// Without a locale, the result keeps its defaults (empty buffer, null `locale` pointer).
static ResolvedLocale resolve_locale(Optional<StringView> const& locale)
{
    ResolvedLocale result;

    if (locale.has_value()) {
        result.buffer = *locale;
        result.locale = result.buffer.characters();
    }

    return result;
}
// Returns this string converted to lowercase, optionally for the given locale.
// Returns an error if the conversion reports a failure.
ErrorOr<String> String::to_lowercase(Optional<StringView> const& locale) const
{
StringBuilder builder;
TRY(Unicode::Detail::build_lowercase_string(code_points(), builder, locale));
UErrorCode status = U_ZERO_ERROR;
StringBuilder builder { bytes_as_string_view().length() };
// ICU writes its output into `builder` through this sink.
icu::StringByteSink sink { &builder };
// resolved_locale.locale is null when no locale was provided.
auto resolved_locale = resolve_locale(locale);
icu::CaseMap::utf8ToLower(resolved_locale.locale, 0, Locale::icu_string_piece(*this), sink, nullptr, status);
if (Locale::icu_failure(status))
return Error::from_string_literal("Unable to convert string to lowercase");
return builder.to_string_without_validation();
}
// Returns this string converted to uppercase, optionally for the given locale.
// Returns an error if the conversion reports a failure.
ErrorOr<String> String::to_uppercase(Optional<StringView> const& locale) const
{
StringBuilder builder;
TRY(Unicode::Detail::build_uppercase_string(code_points(), builder, locale));
UErrorCode status = U_ZERO_ERROR;
StringBuilder builder { bytes_as_string_view().length() };
// ICU writes its output into `builder` through this sink.
icu::StringByteSink sink { &builder };
// resolved_locale.locale is null when no locale was provided.
auto resolved_locale = resolve_locale(locale);
icu::CaseMap::utf8ToUpper(resolved_locale.locale, 0, Locale::icu_string_piece(*this), sink, nullptr, status);
if (Locale::icu_failure(status))
return Error::from_string_literal("Unable to convert string to uppercase");
return builder.to_string_without_validation();
}
// Returns this string converted to titlecase, optionally for the given locale.
// `trailing_code_point_transformation` selects what happens to the code points after the titlecased one.
// Returns an error if the conversion reports a failure.
ErrorOr<String> String::to_titlecase(Optional<StringView> const& locale, TrailingCodePointTransformation trailing_code_point_transformation) const
{
StringBuilder builder;
TRY(Unicode::Detail::build_titlecase_string(code_points(), builder, locale, trailing_code_point_transformation));
UErrorCode status = U_ZERO_ERROR;
StringBuilder builder { bytes_as_string_view().length() };
// ICU writes its output into `builder` through this sink.
icu::StringByteSink sink { &builder };
// resolved_locale.locale is null when no locale was provided.
auto resolved_locale = resolve_locale(locale);
u32 options = 0;
// PreserveExisting maps to ICU's U_TITLECASE_NO_LOWERCASE option; otherwise no option bits are set.
if (trailing_code_point_transformation == TrailingCodePointTransformation::PreserveExisting)
options |= U_TITLECASE_NO_LOWERCASE;
icu::CaseMap::utf8ToTitle(resolved_locale.locale, options, nullptr, Locale::icu_string_piece(*this), sink, nullptr, status);
if (Locale::icu_failure(status))
return Error::from_string_literal("Unable to convert string to titlecase");
return builder.to_string_without_validation();
}
// Case-folds `string` with ICU and appends the result to `builder`.
// Returns an error if ICU reports a failure.
static ErrorOr<void> build_casefold_string(StringView string, StringBuilder& builder)
{
    icu::StringByteSink output { &builder };
    UErrorCode error_code = U_ZERO_ERROR;

    auto input = Locale::icu_string_piece(string);
    icu::CaseMap::utf8Fold(0, input, output, nullptr, error_code);

    if (!Locale::icu_failure(error_code))
        return {};
    return Error::from_string_literal("Unable to casefold string");
}
// Returns the case-folded form of this string, or an error if case folding fails.
ErrorOr<String> String::to_casefold() const
{
StringBuilder builder;
TRY(Unicode::Detail::build_casefold_string(code_points(), builder));
StringBuilder builder { bytes_as_string_view().length() };
TRY(build_casefold_string(*this, builder));
return builder.to_string_without_validation();
}
// Compares this string with `other` by comparing their case-folded forms.
// Returns false if either string cannot be case-folded.
bool String::equals_ignoring_case(String const& other) const
{
return Unicode::equals_ignoring_case(code_points(), other.code_points());
StringBuilder lhs_builder { bytes_as_string_view().length() };
if (build_casefold_string(*this, lhs_builder).is_error())
return false;
StringBuilder rhs_builder { other.bytes_as_string_view().length() };
if (build_casefold_string(other, rhs_builder).is_error())
return false;
// Both sides are fully folded, so a plain comparison of the folded text decides the result.
return lhs_builder.string_view() == rhs_builder.string_view();
}
// Finds `needle` in this string, ignoring case, searching from `from_byte_offset` onwards.
// Returns an empty Optional when the remaining haystack is empty, when nothing is found, or when
// case folding fails.
// NOTE(review): the match index is computed within the case-folded haystack, so it presumably only
// equals a byte offset into this string when folding does not change byte lengths before the match - confirm.
Optional<size_t> String::find_byte_offset_ignoring_case(StringView needle, size_t from_byte_offset) const
{
auto haystack = code_points().substring_view(from_byte_offset);
auto haystack = bytes_as_string_view().substring_view(from_byte_offset);
if (haystack.is_empty())
return {};
if (auto index = Unicode::find_ignoring_case(haystack, Utf8View { needle }); index.has_value())
StringBuilder lhs_builder { haystack.length() };
if (build_casefold_string(haystack, lhs_builder).is_error())
return {};
StringBuilder rhs_builder { needle.length() };
// Failure must be an empty Optional: `return false` would convert to a present offset of 0.
if (build_casefold_string(needle, rhs_builder).is_error())
return {};
if (auto index = lhs_builder.string_view().find(rhs_builder.string_view()); index.has_value())
return *index + from_byte_offset;
return {};

View File

@ -1,368 +0,0 @@
/*
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Platform.h>
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/Types.h>
#include <LibUnicode/CharacterTypes.h>
#include <LibUnicode/Segmentation.h>
#include <LibUnicode/UnicodeUtils.h>
#if ENABLE_UNICODE_DATA
# include <LibUnicode/UnicodeData.h>
#endif
// For details on the algorithms used here, see Section 3.13 Default Case Algorithms
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
namespace Unicode::Detail {
#if ENABLE_UNICODE_DATA
// Casing context check: is the code point at byte offset `index` preceded by an uppercase I?
static bool is_after_uppercase_i(Utf8View const& string, size_t index)
{
    // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0.
    bool uppercase_i_pending = false;

    // FIXME: Would be better if Utf8View supported reverse iteration.
    for (auto code_point : string.substring_view(0, index)) {
        if (code_point == 'I') {
            uppercase_i_pending = true;
        } else {
            // A code point of combining class 0 or 230 cancels any earlier uppercase I.
            auto combining_class = canonical_combining_class(code_point);
            if (combining_class == 0 || combining_class == 230)
                uppercase_i_pending = false;
        }
    }

    return uppercase_i_pending;
}
// Casing context check: is the code point at byte offset `index` preceded by a Soft_Dotted code point?
static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index)
{
    // There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above).
    bool soft_dotted_pending = false;

    // FIXME: Would be better if Utf8View supported reverse iteration.
    for (auto code_point : string.substring_view(0, index)) {
        if (code_point_has_property(code_point, Property::Soft_Dotted)) {
            soft_dotted_pending = true;
            continue;
        }

        // A code point of combining class 0 or 230 cancels any earlier Soft_Dotted code point.
        switch (canonical_combining_class(code_point)) {
        case 0:
        case 230:
            soft_dotted_pending = false;
            break;
        default:
            break;
        }
    }

    return soft_dotted_pending;
}
// Casing context check used for the FinalSigma condition. `index` is the byte offset of the code
// point C and `byte_length` is its encoded length.
static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
{
    // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
    // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
    // characters and then a cased letter.
    auto is_cased = [](u32 code_point) {
        return code_point_has_property(code_point, Property::Cased);
    };
    auto is_case_ignorable = [](u32 code_point) {
        return code_point_has_property(code_point, Property::Case_Ignorable);
    };

    // Case-ignorable code points are transparent; the last other code point decides the state.
    bool preceded_by_cased_letter = false;
    for (auto code_point : string.substring_view(0, index)) {
        if (is_case_ignorable(code_point))
            continue;
        preceded_by_cased_letter = is_cased(code_point);
    }

    if (!preceded_by_cased_letter)
        return false;

    auto const following_offset = index + byte_length;
    if (following_offset >= string.byte_length())
        return true;

    // The first code point that is not case-ignorable decides: a cased letter means C is not final.
    for (auto code_point : string.substring_view(following_offset)) {
        if (is_case_ignorable(code_point))
            continue;
        return !is_cased(code_point);
    }

    return true;
}
// Casing context check used for the MoreAbove condition. `index` is the byte offset of the code
// point C and `byte_length` is its encoded length.
static bool is_followed_by_combining_class_above(Utf8View const& string, size_t index, size_t byte_length)
{
    // C is followed by a character of combining class 230 (Above) with no intervening character of combining class 0 or 230 (Above).
    auto const following_offset = index + byte_length;
    if (following_offset >= string.byte_length())
        return false;

    for (auto code_point : string.substring_view(following_offset)) {
        switch (canonical_combining_class(code_point)) {
        case 0:
            return false;
        case 230:
            return true;
        default:
            break;
        }
    }

    return false;
}
// Casing context check used (negated) for the NotBeforeDot condition. `index` is the byte offset of
// the code point C and `byte_length` is its encoded length.
static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length)
{
    // C is followed by combining dot above (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may
    // intervene between the current character and the combining dot above.
    constexpr u32 combining_dot_above = 0x307;

    auto const following_offset = index + byte_length;
    if (following_offset >= string.byte_length())
        return false;

    for (auto code_point : string.substring_view(following_offset)) {
        if (code_point == combining_dot_above)
            return true;

        u32 const combining_class = canonical_combining_class(code_point);
        if (combining_class == 0 || combining_class == 230)
            return false;
    }

    return false;
}
// Returns the first special casing entry for `code_point` whose locale and context condition both
// apply at byte offset `index` of `string`, or an empty Optional if none applies.
static Optional<SpecialCasing const&> find_matching_special_case(u32 code_point, Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length)
{
    // An absent or unrecognized locale leaves the requested locale as Locale::None.
    auto requested_locale = Locale::None;
    if (locale.has_value()) {
        auto parsed_locale = locale_from_string(*locale);
        if (parsed_locale.has_value())
            requested_locale = *parsed_locale;
    }

    auto condition_is_met = [&](Condition condition) {
        switch (condition) {
        case Condition::None:
            return true;
        case Condition::AfterI:
            return is_after_uppercase_i(string, index);
        case Condition::AfterSoftDotted:
            return is_after_soft_dotted_code_point(string, index);
        case Condition::FinalSigma:
            return is_final_code_point(string, index, byte_length);
        case Condition::MoreAbove:
            return is_followed_by_combining_class_above(string, index, byte_length);
        case Condition::NotBeforeDot:
            return !is_followed_by_combining_dot_above(string, index, byte_length);
        }
        return false;
    };

    for (auto const& candidate : special_case_mapping(code_point)) {
        // Entries without a locale apply everywhere; the others only apply to the requested locale.
        bool const locale_applies = candidate.locale == Locale::None || candidate.locale == requested_locale;
        if (locale_applies && condition_is_met(candidate.condition))
            return candidate;
    }

    return {};
}
// Returns the first case folding entry for `code_point` whose status equals one of the
// StatusFilter values, or an empty Optional if there is none.
template<CaseFoldingStatus... StatusFilter>
static Optional<CaseFolding const&> find_matching_case_folding(u32 code_point)
{
    for (auto const& candidate : case_folding_mapping(code_point)) {
        bool const status_matches = ((candidate.status == StatusFilter) || ...);
        if (status_matches)
            return candidate;
    }

    return {};
}
#endif
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
// Appends the lowercase form of `code_points` to `builder`. A matching special casing entry takes
// precedence over the simple per-code-point mapping. Fails if Unicode data has been disabled.
ErrorOr<void> build_lowercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
{
#if ENABLE_UNICODE_DATA
    size_t byte_offset = 0;

    for (auto it = code_points.begin(); it != code_points.end(); ++it) {
        u32 const code_point = *it;
        size_t const code_point_length = it.underlying_code_point_length_in_bytes();

        auto special_casing = find_matching_special_case(code_point, code_points, locale, byte_offset, code_point_length);
        byte_offset += code_point_length;

        if (special_casing.has_value()) {
            for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
                TRY(builder.try_append_code_point(special_casing->lowercase_mapping[i]));
        } else {
            TRY(builder.try_append_code_point(to_unicode_lowercase(code_point)));
        }
    }

    return {};
#else
    return Error::from_string_literal("Unicode data has been disabled");
#endif
}
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
// Appends the uppercase form of `code_points` to `builder`. A matching special casing entry takes
// precedence over the simple per-code-point mapping. Fails if Unicode data has been disabled.
ErrorOr<void> build_uppercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
{
#if ENABLE_UNICODE_DATA
    size_t byte_offset = 0;

    for (auto it = code_points.begin(); it != code_points.end(); ++it) {
        u32 const code_point = *it;
        size_t const code_point_length = it.underlying_code_point_length_in_bytes();

        auto special_casing = find_matching_special_case(code_point, code_points, locale, byte_offset, code_point_length);
        byte_offset += code_point_length;

        if (special_casing.has_value()) {
            for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
                TRY(builder.try_append_code_point(special_casing->uppercase_mapping[i]));
        } else {
            TRY(builder.try_append_code_point(to_unicode_uppercase(code_point)));
        }
    }

    return {};
#else
    return Error::from_string_literal("Unicode data has been disabled");
#endif
}
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
// Appends the titlecase form of `code_points` to `builder`, one word segment at a time.
// `trailing_code_point_transformation` selects whether the code points after the titlecased one
// are lowercased or copied unchanged. Fails if Unicode data has been disabled.
ErrorOr<void> build_titlecase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale, [[maybe_unused]] TrailingCodePointTransformation trailing_code_point_transformation)
{
#if ENABLE_UNICODE_DATA
// toTitlecase(X): Find the word boundaries in X according to Unicode Standard Annex #29,
// “Unicode Text Segmentation.” For each word boundary, find the first cased character F following
// the word boundary. If F exists, map F to Titlecase_Mapping(F); then map all characters C between
// F and the following word boundary to Lowercase_Mapping(C).
// Returns an iterator to the first Cased code point in [boundary, next_boundary), if any.
auto first_cased_code_point_after_boundary = [&](auto boundary, auto next_boundary) -> Optional<Utf8CodePointIterator> {
auto it = code_points.iterator_at_byte_offset_without_validation(boundary);
auto end = code_points.iterator_at_byte_offset_without_validation(next_boundary);
for (; it != end; ++it) {
if (code_point_has_property(*it, Property::Cased))
return it;
}
return {};
};
// Appends the titlecase mapping of one code point, preferring a matching special casing entry.
auto append_code_point_as_titlecase = [&](auto code_point, auto code_point_offset, auto code_point_length) -> ErrorOr<void> {
auto special_casing = find_matching_special_case(code_point, code_points, locale, code_point_offset, code_point_length);
if (!special_casing.has_value()) {
TRY(builder.try_append_code_point(to_unicode_titlecase(code_point)));
return {};
}
for (size_t i = 0; i < special_casing->titlecase_mapping_size; ++i)
TRY(builder.try_append_code_point(special_casing->titlecase_mapping[i]));
return {};
};
// `boundary` is the byte offset from which the current segment still has to be emitted.
size_t boundary = 0;
while (true) {
auto next_boundary = next_word_segmentation_boundary(code_points, boundary);
if (!next_boundary.has_value())
break;
if (auto it = first_cased_code_point_after_boundary(boundary, *next_boundary); it.has_value()) {
auto code_point = *it.value();
auto code_point_offset = code_points.byte_offset_of(*it);
auto code_point_length = it->underlying_code_point_length_in_bytes();
// Everything before the first cased code point is copied unchanged.
auto caseless_code_points = code_points.substring_view(boundary, code_point_offset - boundary);
TRY(builder.try_append(caseless_code_points.as_string()));
TRY(append_code_point_as_titlecase(code_point, code_point_offset, code_point_length));
// Continue right after the titlecased code point.
boundary = code_point_offset + code_point_length;
}
// If no cased code point was found, this is the whole segment.
auto remaining_code_points = code_points.substring_view(boundary, *next_boundary - boundary);
switch (trailing_code_point_transformation) {
case TrailingCodePointTransformation::Lowercase:
TRY(build_lowercase_string(remaining_code_points, builder, locale));
break;
case TrailingCodePointTransformation::PreserveExisting:
TRY(builder.try_append(remaining_code_points.as_string()));
break;
}
boundary = *next_boundary;
}
return {};
#else
return Error::from_string_literal("Unicode data has been disabled");
#endif
}
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253
// toCasefold(X): Map each character C in X to Case_Folding(C), appending the result to `builder`.
ErrorOr<void> build_casefold_string(Utf8View code_points, StringBuilder& builder)
{
    for (auto it = code_points.begin(); it != code_points.end(); ++it) {
        // Keep the code point in a named local: the folding view may refer to it.
        u32 const code_point = *it;
        TRY(builder.try_append(casefold_code_point(code_point)));
    }

    return {};
}
// https://www.unicode.org/reports/tr44/#CaseFolding.txt
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253
// Returns a view over the case folding of `code_point`.
// When no folding entry is found (or Unicode data is disabled), the returned view points at the
// `code_point` argument itself, so the referenced u32 must outlive the returned view.
Utf32View casefold_code_point(u32 const& code_point)
{
#if ENABLE_UNICODE_DATA
// Case_Folding(C) uses the mappings with the status field value “C” or “F” in the data file
// CaseFolding.txt in the Unicode Character Database.
using enum CaseFoldingStatus;
if (auto case_folding = find_matching_case_folding<Common, Full>(code_point); case_folding.has_value())
return Utf32View { case_folding->mapping, case_folding->mapping_size };
#endif
// The case foldings are omitted in the data file if they are the same as the code point itself.
return Utf32View { &code_point, 1 };
}
}

View File

@ -1,24 +0,0 @@
/*
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Error.h>
#include <AK/Forward.h>
#include <AK/Optional.h>
#include <AK/Utf32View.h>
#include <AK/Utf8View.h>
#include <LibUnicode/Forward.h>
namespace Unicode::Detail {
ErrorOr<void> build_lowercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
ErrorOr<void> build_uppercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
ErrorOr<void> build_titlecase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale, TrailingCodePointTransformation trailing_code_point_transformation);
ErrorOr<void> build_casefold_string(Utf8View code_points, StringBuilder& builder);
Utf32View casefold_code_point(u32 const& code_point);
}