LibUnicode: Remove completely unused code point name & block name data

These were used for e.g. the Character Map on Serenity, but are not used
at all for Ladybird.
This commit is contained in:
Timothy Flynn 2024-06-18 10:57:14 -04:00 committed by Andreas Kling
parent 9c3a775395
commit 1feef17bf7
Notes: sideshowbarker 2024-07-16 18:03:21 +09:00
5 changed files with 1 additions and 328 deletions

View File

@ -38,18 +38,12 @@ set(PROP_ALIAS_PATH "${UCD_PATH}/${PROP_ALIAS_SOURCE}")
set(PROP_VALUE_ALIAS_SOURCE "PropertyValueAliases.txt")
set(PROP_VALUE_ALIAS_PATH "${UCD_PATH}/${PROP_VALUE_ALIAS_SOURCE}")
set(NAME_ALIAS_SOURCE "NameAliases.txt")
set(NAME_ALIAS_PATH "${UCD_PATH}/${NAME_ALIAS_SOURCE}")
set(SCRIPTS_SOURCE "Scripts.txt")
set(SCRIPTS_PATH "${UCD_PATH}/${SCRIPTS_SOURCE}")
set(SCRIPT_EXTENSIONS_SOURCE "ScriptExtensions.txt")
set(SCRIPT_EXTENSIONS_PATH "${UCD_PATH}/${SCRIPT_EXTENSIONS_SOURCE}")
set(BLOCKS_SOURCE "Blocks.txt")
set(BLOCKS_PATH "${UCD_PATH}/${BLOCKS_SOURCE}")
set(EMOJI_DATA_SOURCE "emoji/emoji-data.txt")
set(EMOJI_DATA_PATH "${UCD_PATH}/${EMOJI_DATA_SOURCE}")
@ -90,10 +84,8 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_BINARY_PROP_SOURCE}" "${DERIVED_BINARY_PROP_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${PROP_ALIAS_SOURCE}" "${PROP_ALIAS_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${PROP_VALUE_ALIAS_SOURCE}" "${PROP_VALUE_ALIAS_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${NAME_ALIAS_SOURCE}" "${NAME_ALIAS_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SCRIPTS_SOURCE}" "${SCRIPTS_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SCRIPT_EXTENSIONS_SOURCE}" "${SCRIPT_EXTENSIONS_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${BLOCKS_SOURCE}" "${BLOCKS_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${EMOJI_DATA_SOURCE}" "${EMOJI_DATA_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${NORM_PROPS_SOURCE}" "${NORM_PROPS_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${GRAPHEME_BREAK_PROP_SOURCE}" "${GRAPHEME_BREAK_PROP_PATH}")
@ -129,7 +121,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
"${UCD_VERSION_FILE}"
"${UNICODE_DATA_HEADER}"
"${UNICODE_DATA_IMPLEMENTATION}"
arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -o "${CASE_FOLDING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -k "${BLOCKS_PATH}" -e "${EMOJI_DATA_PATH}" -m "${NAME_ALIAS_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}"
arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -o "${CASE_FOLDING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}"
)
invoke_generator(
"EmojiData"

View File

@ -65,11 +65,6 @@ struct Normalization {
using NormalizationProps = HashMap<ByteString, Vector<Normalization>>;
// Generator-side record pairing a range of code points with a display name.
// `name` is not the string itself: it holds the index returned by
// UnicodeData::unique_strings.ensure() for that name.
struct CodePointName {
Unicode::CodePointRange code_point_range;
size_t name { 0 };
};
struct CasingTable {
bool operator==(CasingTable const& other) const
{
@ -93,7 +88,6 @@ struct CasingTable {
struct CodePointData {
u32 code_point { 0 };
ByteString name;
Optional<size_t> abbreviation;
ByteString bidi_class;
Optional<CodePointDecomposition> decomposition_mapping;
Optional<i8> numeric_value_decimal;
@ -105,11 +99,6 @@ struct CodePointData {
CasingTable casing;
};
// Generator-side record pairing a block's code point range with its display name.
// `name` holds the index returned by UnicodeData::unique_strings.ensure() for the
// block's display name, not the string itself.
struct BlockName {
Unicode::CodePointRange code_point_range;
size_t name { 0 };
};
using PropertyTable = Vector<bool>;
static constexpr auto CODE_POINT_TABLES_MSB_COUNT = 16u;
@ -136,8 +125,6 @@ struct CodePointComposition {
};
struct UnicodeData {
UniqueStringStorage unique_strings;
u32 code_points_with_decomposition_mapping { 0 };
Vector<u32> decomposition_mappings;
HashMap<u32, Vector<CodePointComposition>> composition_mappings;
@ -154,10 +141,6 @@ struct UnicodeData {
Vector<CodePointData> code_point_data;
HashMap<u32, size_t> code_point_abbreviations;
HashMap<u32, size_t> code_point_display_name_aliases;
Vector<CodePointName> code_point_display_names;
// https://www.unicode.org/reports/tr44/#General_Category_Values
PropList general_categories;
Vector<Alias> general_category_aliases;
@ -178,8 +161,6 @@ struct UnicodeData {
Vector<Alias> script_aliases;
PropList script_extensions;
Vector<BlockName> block_display_names;
// FIXME: We are not yet doing anything with this data. It will be needed for String.prototype.normalize.
NormalizationProps normalization_props;
@ -430,37 +411,6 @@ static ErrorOr<void> parse_alias_list(Core::InputBufferedFile& file, PropList co
return {};
}
// Reads "code point; alias; type" lines from the NameAliases.txt file and records:
//  - "abbreviation" aliases in unicode_data.code_point_abbreviations, and
//  - the first "correction" or "control" alias seen for a code point in
//    unicode_data.code_point_display_name_aliases.
// Aliases of any other type are ignored. Empty lines and lines starting with '#' are skipped.
static ErrorOr<void> parse_name_aliases(Core::InputBufferedFile& file, UnicodeData& unicode_data)
{
    Array<u8, 1024> line_buffer;

    while (TRY(file.can_read_line())) {
        auto entry = TRY(file.read_line(line_buffer));

        if (entry.is_empty() || entry.starts_with('#'))
            continue;

        auto fields = entry.split_view(';', SplitBehavior::KeepEmpty);
        VERIFY(fields.size() == 3);

        // The parsed code point is only dereferenced for alias types that are actually stored.
        auto parsed_code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(fields[0].trim_whitespace());
        auto alias_name = fields[1].trim_whitespace();
        auto alias_type = fields[2].trim_whitespace();

        if (alias_type == "abbreviation"sv) {
            auto string_index = unicode_data.unique_strings.ensure(alias_name);
            unicode_data.code_point_abbreviations.set(*parsed_code_point, string_index);
            continue;
        }

        if (!alias_type.is_one_of("correction"sv, "control"sv))
            continue;

        // Keep only the first display name alias registered for a given code point.
        if (unicode_data.code_point_display_name_aliases.contains(*parsed_code_point))
            continue;

        auto string_index = unicode_data.unique_strings.ensure(alias_name);
        unicode_data.code_point_display_name_aliases.set(*parsed_code_point, string_index);
    }

    return {};
}
static ErrorOr<void> parse_value_alias_list(Core::InputBufferedFile& file, StringView desired_category, Vector<ByteString> const& value_list, Vector<Alias>& prop_aliases, bool primary_value_is_first = true, bool sanitize_alias = false)
{
TRY(file.seek(0, SeekMode::SetPosition));
@ -550,68 +500,6 @@ static ErrorOr<void> parse_normalization_props(Core::InputBufferedFile& file, Un
return {};
}
// Appends the display name entry for `range` to unicode_data.code_point_display_names.
//
// Resolution order:
//  1. If `range` starts exactly where one of the NR2 ranges below starts, the entire NR2 range is
//     recorded together with its "{:X}"-suffixed format string.
//  2. Otherwise, if `range` starts anywhere inside an NR2 range, nothing is recorded.
//  3. Otherwise, if a display name alias exists for the first code point of `range`, that alias is recorded.
//  4. Otherwise, `name` itself is recorded.
static void add_canonical_code_point_name(Unicode::CodePointRange range, StringView name, UnicodeData& unicode_data)
{
    // https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G142981
    // FIXME: Implement the NR1 rules for Hangul syllables.
    struct CodePointNameFormat {
        Unicode::CodePointRange code_point_range;
        StringView name;
    };

    // These code point ranges are the NR2 set of name replacements defined by Table 4-8.
    constexpr Array<CodePointNameFormat, 16> s_ideographic_replacements { {
        { { 0x3400, 0x4DBF }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
        { { 0x4E00, 0x9FFF }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
        { { 0xF900, 0xFA6D }, "CJK COMPATIBILITY IDEOGRAPH-{:X}"sv },
        { { 0xFA70, 0xFAD9 }, "CJK COMPATIBILITY IDEOGRAPH-{:X}"sv },
        { { 0x17000, 0x187F7 }, "TANGUT IDEOGRAPH-{:X}"sv },
        { { 0x18B00, 0x18CD5 }, "KHITAN SMALL SCRIPT CHARACTER-{:X}"sv },
        { { 0x18D00, 0x18D08 }, "TANGUT IDEOGRAPH-{:X}"sv },
        { { 0x1B170, 0x1B2FB }, "NUSHU CHARACTER-{:X}"sv },
        { { 0x20000, 0x2A6DF }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
        { { 0x2A700, 0x2B739 }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
        { { 0x2B740, 0x2B81D }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
        { { 0x2B820, 0x2CEA1 }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
        { { 0x2CEB0, 0x2EBE0 }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
        { { 0x2F800, 0x2FA1D }, "CJK COMPATIBILITY IDEOGRAPH-{:X}"sv },
        { { 0x30000, 0x3134A }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
        { { 0x31350, 0x323AF }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
    } };

    auto const first_code_point = range.first;

    // First pass: a range beginning exactly at the start of an ideographic range is replaced by
    // the full ideographic range and its format string.
    for (auto const& replacement : s_ideographic_replacements) {
        if (replacement.code_point_range.first != first_code_point)
            continue;

        auto format_index = unicode_data.unique_strings.ensure(replacement.name);
        unicode_data.code_point_display_names.append({ replacement.code_point_range, format_index });
        return;
    }

    // Second pass: drop code points that will have been captured by a range defined by the
    // ideographic replacements.
    for (auto const& replacement : s_ideographic_replacements) {
        auto const& covered = replacement.code_point_range;

        if ((covered.first <= first_code_point) && (first_code_point <= covered.last))
            return;
    }

    // NR4 states that control code points have a null string as their name. Our implementation
    // uses the control code's alias as its display name.
    auto display_name_alias = unicode_data.code_point_display_name_aliases.get(first_code_point);

    if (display_name_alias.has_value()) {
        unicode_data.code_point_display_names.append({ range, *display_name_alias });
        return;
    }

    auto name_index = unicode_data.unique_strings.ensure(name);
    unicode_data.code_point_display_names.append({ range, name_index });
}
static Optional<CodePointDecomposition> parse_decomposition_mapping(StringView string, UnicodeData& unicode_data)
{
if (string.is_empty())
@ -660,29 +548,6 @@ static void add_composition_mapping(u32 code_point, CodePointDecomposition& deco
unicode_data.composition_mappings.ensure(first_code_point).append(CodePointComposition { .second_code_point = second_code_point, .combined_code_point = code_point });
}
// Reads "code point range; block name" lines from the Blocks.txt file and appends one entry per
// line to unicode_data.block_display_names. Empty lines and lines starting with '#' are skipped.
// The file is rewound to its beginning before returning.
static ErrorOr<void> parse_block_display_names(Core::InputBufferedFile& file, UnicodeData& unicode_data)
{
    Array<u8, 1024> line_buffer;

    while (TRY(file.can_read_line())) {
        auto entry = TRY(file.read_line(line_buffer));

        bool const is_data_line = !entry.is_empty() && !entry.starts_with('#');
        if (!is_data_line)
            continue;

        auto fields = entry.split_view(';', SplitBehavior::KeepEmpty);
        VERIFY(fields.size() == 2);

        auto block_range = parse_code_point_range(fields[0].trim_whitespace());
        auto block_name = fields[1].trim_whitespace();

        // Store the name once in the shared string table and keep only its index.
        auto name_index = unicode_data.unique_strings.ensure(block_name);
        unicode_data.block_display_names.append({ block_range, name_index });
    }

    TRY(file.seek(0, SeekMode::SetPosition));

    return {};
}
static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeData& unicode_data)
{
Optional<u32> code_point_range_start;
@ -719,9 +584,6 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
data.casing.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[13]);
data.casing.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[14]);
if (auto abbreviation = unicode_data.code_point_abbreviations.get(data.code_point); abbreviation.has_value())
data.abbreviation = *abbreviation;
if (!assigned_code_point_range_start.has_value())
assigned_code_point_range_start = data.code_point;
@ -742,10 +604,8 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
data.name = data.name.substring(1, data.name.length() - 8);
code_point_range_start.clear();
add_canonical_code_point_name(code_point_range, data.name, unicode_data);
unicode_data.code_point_bidirectional_classes.append({ code_point_range, data.bidi_class });
} else {
add_canonical_code_point_name({ data.code_point, data.code_point }, data.name, unicode_data);
unicode_data.code_point_bidirectional_classes.append({ { data.code_point, data.code_point }, data.bidi_class });
if ((data.code_point > 0) && (data.code_point - previous_code_point) != 1) {
@ -905,7 +765,6 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::InputBufferedFil
StringBuilder builder;
SourceGenerator generator { builder };
generator.set("string_index_type"sv, unicode_data.unique_strings.type_that_fits());
generator.set("special_casing_size", ByteString::number(unicode_data.special_casing.size()));
generator.set("case_folding_size", ByteString::number(unicode_data.case_folding.size()));
@ -927,8 +786,6 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::InputBufferedFil
namespace Unicode {
)~~~");
unicode_data.unique_strings.generate(generator);
auto append_list_and_size = [&](auto const& list, StringView format) {
if (list.is_empty()) {
generator.append(", {}, 0");
@ -998,11 +855,6 @@ struct CasingTable {
u32 case_folding_size { 0 };
};
struct CodePointAbbreviation {
u32 code_point { 0 };
@string_index_type@ abbreviation { 0 };
};
template<typename MappingType>
struct CodePointComparator {
constexpr int operator()(u32 code_point, MappingType const& mapping)
@ -1011,30 +863,6 @@ struct CodePointComparator {
}
};
struct BlockNameData {
CodePointRange code_point_range {};
@string_index_type@ display_name { 0 };
};
struct BlockNameComparator : public CodePointRangeComparator {
constexpr int operator()(u32 code_point, BlockNameData const& name)
{
return CodePointRangeComparator::operator()(code_point, name.code_point_range);
}
};
struct CodePointName {
CodePointRange code_point_range {};
@string_index_type@ display_name { 0 };
};
struct CodePointNameComparator : public CodePointRangeComparator {
constexpr int operator()(u32 code_point, CodePointName const& name)
{
return CodePointRangeComparator::operator()(code_point, name.code_point_range);
}
};
struct BidiClassData {
CodePointRange code_point_range {};
BidirectionalClass bidi_class {};
@ -1106,7 +934,6 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
)~~~");
};
append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; });
append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping, [](auto const& data) { return data.decomposition_mapping; });
size_t composition_mappings_size = 0;
@ -1254,39 +1081,6 @@ static constexpr Array<@type@, @size@> @name@ { {
TRY(append_code_point_tables("s_word_break_properties"sv, unicode_data.word_break_tables, append_property_table));
TRY(append_code_point_tables("s_sentence_break_properties"sv, unicode_data.sentence_break_tables, append_property_table));
// Emits a constexpr Array called `name` with element type `type`, containing one
// "{ { first, last }, string_index }" initializer for each entry of `display_names`.
// Each entry must expose `code_point_range` (with `first` / `last`) and `name`.
auto append_code_point_display_names = [&](StringView type, StringView name, auto const& display_names) {
constexpr size_t max_values_per_row = 30;
size_t values_in_current_row = 0;
generator.set("type", type);
generator.set("name", name);
generator.set("size", ByteString::number(display_names.size()));
generator.append(R"~~~(
static constexpr Array<@type@, @size@> @name@ { {
)~~~");
for (auto const& display_name : display_names) {
// Separate consecutive initializers within the same row with a comma.
if (values_in_current_row++ > 0)
generator.append(", ");
generator.set("first", ByteString::formatted("{:#x}", display_name.code_point_range.first));
generator.set("last", ByteString::formatted("{:#x}", display_name.code_point_range.last));
// NOTE: This reuses the "name" key. The array name set above has already been written out,
// so from here on "name" holds the entry's string index instead.
generator.set("name", ByteString::number(display_name.name));
generator.append("{ { @first@, @last@ }, @name@ }");
// Begin a new row once max_values_per_row initializers have been written.
if (values_in_current_row == max_values_per_row) {
values_in_current_row = 0;
generator.append(",\n ");
}
}
generator.append(R"~~~(
} };
)~~~");
};
// Emit the block name table and the code point name table.
append_code_point_display_names("BlockNameData"sv, "s_block_display_names"sv, unicode_data.block_display_names);
append_code_point_display_names("CodePointName"sv, "s_code_point_display_names"sv, unicode_data.code_point_display_names);
{
constexpr size_t max_bidi_classes_per_row = 20;
size_t bidi_classes_in_current_row = 0;
@ -1315,44 +1109,6 @@ static constexpr Array<BidiClassData, @size@> s_bidirectional_classes { {
}
generator.append(R"~~~(
Optional<StringView> code_point_block_display_name(u32 code_point)
{
if (auto const* entry = binary_search(s_block_display_names, code_point, nullptr, BlockNameComparator {}))
return decode_string(entry->display_name);
return {};
}
ReadonlySpan<BlockName> block_display_names()
{
static auto display_names = []() {
Array<BlockName, s_block_display_names.size()> display_names;
for (size_t i = 0; i < s_block_display_names.size(); ++i) {
auto const& display_name = s_block_display_names[i];
display_names[i] = { display_name.code_point_range, decode_string(display_name.display_name) };
}
return display_names;
}();
return display_names.span();
}
Optional<ByteString> code_point_display_name(u32 code_point)
{
if (auto const* entry = binary_search(s_code_point_display_names, code_point, nullptr, CodePointNameComparator {})) {
auto display_name = decode_string(entry->display_name);
if (display_name.ends_with("{:X}"sv))
return ByteString::formatted(display_name, code_point);
return display_name;
}
return {};
}
static CasingTable const& casing_table_for_code_point(u32 code_point)
{
auto stage1_index = code_point >> @CODE_POINT_TABLES_LSB_COUNT@;
@ -1411,17 +1167,6 @@ ReadonlySpan<CaseFolding> case_folding_mapping(u32 code_point)
return s_case_folding.span().slice(casing_table.case_folding_start_index, casing_table.case_folding_size);
}
Optional<StringView> code_point_abbreviation(u32 code_point)
{
auto const* mapping = binary_search(s_abbreviation_mappings, code_point, nullptr, CodePointComparator<CodePointAbbreviation> {});
if (mapping == nullptr)
return {};
if (mapping->abbreviation == 0)
return {};
return decode_string(mapping->abbreviation);
}
Optional<CodePointDecomposition const> code_point_decomposition(u32 code_point)
{
auto const* mapping = binary_search(s_decomposition_mappings, code_point, nullptr, CodePointComparator<CodePointDecompositionRaw> {});
@ -1842,10 +1587,8 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
StringView derived_binary_prop_path;
StringView prop_alias_path;
StringView prop_value_alias_path;
StringView name_alias_path;
StringView scripts_path;
StringView script_extensions_path;
StringView blocks_path;
StringView emoji_data_path;
StringView normalization_path;
StringView grapheme_break_path;
@ -1864,10 +1607,8 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
args_parser.add_option(derived_binary_prop_path, "Path to DerivedBinaryProperties.txt file", "derived-binary-prop-path", 'b', "derived-binary-prop-path");
args_parser.add_option(prop_alias_path, "Path to PropertyAliases.txt file", "prop-alias-path", 'a', "prop-alias-path");
args_parser.add_option(prop_value_alias_path, "Path to PropertyValueAliases.txt file", "prop-value-alias-path", 'v', "prop-value-alias-path");
args_parser.add_option(name_alias_path, "Path to NameAliases.txt file", "name-alias-path", 'm', "name-alias-path");
args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path");
args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path");
args_parser.add_option(blocks_path, "Path to Blocks.txt file", "blocks-path", 'k', "blocks-path");
args_parser.add_option(emoji_data_path, "Path to emoji-data.txt file", "emoji-data-path", 'e', "emoji-data-path");
args_parser.add_option(normalization_path, "Path to DerivedNormalizationProps.txt file", "normalization-path", 'n', "normalization-path");
args_parser.add_option(grapheme_break_path, "Path to GraphemeBreakProperty.txt file", "grapheme-break-path", 'f', "grapheme-break-path");
@ -1886,10 +1627,8 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
auto derived_binary_prop_file = TRY(open_file(derived_binary_prop_path, Core::File::OpenMode::Read));
auto prop_alias_file = TRY(open_file(prop_alias_path, Core::File::OpenMode::Read));
auto prop_value_alias_file = TRY(open_file(prop_value_alias_path, Core::File::OpenMode::Read));
auto name_alias_file = TRY(open_file(name_alias_path, Core::File::OpenMode::Read));
auto scripts_file = TRY(open_file(scripts_path, Core::File::OpenMode::Read));
auto script_extensions_file = TRY(open_file(script_extensions_path, Core::File::OpenMode::Read));
auto blocks_file = TRY(open_file(blocks_path, Core::File::OpenMode::Read));
auto emoji_data_file = TRY(open_file(emoji_data_path, Core::File::OpenMode::Read));
auto normalization_file = TRY(open_file(normalization_path, Core::File::OpenMode::Read));
auto grapheme_break_file = TRY(open_file(grapheme_break_path, Core::File::OpenMode::Read));
@ -1908,8 +1647,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
TRY(parse_alias_list(*prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases));
TRY(parse_prop_list(*scripts_file, unicode_data.script_list));
TRY(parse_prop_list(*script_extensions_file, unicode_data.script_extensions, true));
TRY(parse_block_display_names(*blocks_file, unicode_data));
TRY(parse_name_aliases(*name_alias_file, unicode_data));
TRY(parse_prop_list(*grapheme_break_file, unicode_data.grapheme_break_props));
TRY(parse_prop_list(*word_break_file, unicode_data.word_break_props));
TRY(parse_prop_list(*sentence_break_file, unicode_data.sentence_break_props));

View File

@ -330,15 +330,6 @@ TEST_CASE(script)
}
}
TEST_CASE(block)
{
    // Every code point in the inclusive range [first, last] must report the same block display name.
    auto expect_block_name = [](u32 first, u32 last, StringView expected_name) {
        for (u32 code_point = first; code_point <= last; ++code_point)
            EXPECT_EQ(expected_name, Unicode::code_point_block_display_name(code_point).value());
    };

    expect_block_name(0x0000, 0x007F, "Basic Latin"sv);
    expect_block_name(0x0370, 0x03FF, "Greek and Coptic"sv);
}
TEST_CASE(script_extension)
{
auto script = [](StringView name) {
@ -390,38 +381,6 @@ TEST_CASE(script_extension)
EXPECT(Unicode::code_point_has_script_extension(0x101fd, script_inherited));
}
TEST_CASE(code_point_display_name)
{
    // Requires that the code point has a display name, then compares it against the expected one.
    auto expect_display_name = [](u32 code_point, StringView expected_name) {
        auto display_name = Unicode::code_point_display_name(code_point);
        VERIFY(display_name.has_value());
        EXPECT_EQ(display_name.release_value(), expected_name);
    };

    // Control code points.
    expect_display_name(0, "NULL"sv);
    expect_display_name(1, "START OF HEADING"sv);
    expect_display_name(0xa, "LINE FEED"sv);

    // Ideographic code points (which already appeared in a range in UnicodeData.txt).
    expect_display_name(0x3400, "CJK UNIFIED IDEOGRAPH-3400"sv);
    expect_display_name(0x3401, "CJK UNIFIED IDEOGRAPH-3401"sv);
    expect_display_name(0x3402, "CJK UNIFIED IDEOGRAPH-3402"sv);
    expect_display_name(0x4dbf, "CJK UNIFIED IDEOGRAPH-4DBF"sv);

    expect_display_name(0x20000, "CJK UNIFIED IDEOGRAPH-20000"sv);
    expect_display_name(0x20001, "CJK UNIFIED IDEOGRAPH-20001"sv);
    expect_display_name(0x20002, "CJK UNIFIED IDEOGRAPH-20002"sv);
    expect_display_name(0x2a6df, "CJK UNIFIED IDEOGRAPH-2A6DF"sv);

    // The code point immediately past the end of that range has no display name.
    EXPECT(!Unicode::code_point_display_name(0x2a6e0).has_value());

    // Ideographic code points (which appeared individually in UnicodeData.txt and were coalesced into a range).
    expect_display_name(0x2f800, "CJK COMPATIBILITY IDEOGRAPH-2F800"sv);
    expect_display_name(0x2f801, "CJK COMPATIBILITY IDEOGRAPH-2F801"sv);
    expect_display_name(0x2f802, "CJK COMPATIBILITY IDEOGRAPH-2F802"sv);
    expect_display_name(0x2fa1d, "CJK COMPATIBILITY IDEOGRAPH-2FA1D"sv);
}
TEST_CASE(code_point_bidirectional_character_type)
{
auto code_point_bidi_class = [](u32 code_point) {

View File

@ -21,11 +21,7 @@
namespace Unicode {
// Fallback definitions marked with the `weak` attribute. Each one ignores its argument and
// returns a value-initialized result (an empty Optional, 0, or an empty span).
// NOTE(review): the Unicode data generator emits definitions with the same names; presumably
// those take precedence over these at link time when the generated data is built in - confirm.
Optional<ByteString> __attribute__((weak)) code_point_display_name(u32) { return {}; }
Optional<StringView> __attribute__((weak)) code_point_block_display_name(u32) { return {}; }
Optional<StringView> __attribute__((weak)) code_point_abbreviation(u32) { return {}; }
u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; }
ReadonlySpan<BlockName> __attribute__((weak)) block_display_names() { return {}; }
u32 __attribute__((weak)) to_unicode_lowercase(u32 code_point)
{

View File

@ -29,17 +29,6 @@ struct CodePointRangeComparator {
}
};
// A block's code point range together with a view of its display name
// (e.g. "Basic Latin"). Returned in bulk by block_display_names().
struct BlockName {
CodePointRange code_point_range {};
StringView display_name;
};
// Returns the display name of the given code point, or an empty Optional if none is known.
Optional<ByteString> code_point_display_name(u32 code_point);
// Returns the display name of the block containing the given code point, or an empty Optional if none is known.
Optional<StringView> code_point_block_display_name(u32 code_point);
// Returns the abbreviation of the given code point, or an empty Optional if it has none.
Optional<StringView> code_point_abbreviation(u32 code_point);
// Returns the code point range and display name of every known block.
ReadonlySpan<BlockName> block_display_names();
u32 canonical_combining_class(u32 code_point);
// Note: The single code point case conversions only perform simple case folding.