diff --git a/Meta/CMake/unicode_data.cmake b/Meta/CMake/unicode_data.cmake index 6c87fabdeaf..f371ee52358 100644 --- a/Meta/CMake/unicode_data.cmake +++ b/Meta/CMake/unicode_data.cmake @@ -42,6 +42,9 @@ set(SCRIPTS_PATH "${UCD_PATH}/Scripts.txt") set(SCRIPT_EXTENSIONS_URL "https://www.unicode.org/Public/${UCD_VERSION}/ucd/ScriptExtensions.txt") set(SCRIPT_EXTENSIONS_PATH "${UCD_PATH}/ScriptExtensions.txt") +set(BLOCKS_URL "https://www.unicode.org/Public/${UCD_VERSION}/ucd/Blocks.txt") +set(BLOCKS_PATH "${UCD_PATH}/Blocks.txt") + set(EMOJI_DATA_URL "https://www.unicode.org/Public/${UCD_VERSION}/ucd/emoji/emoji-data.txt") set(EMOJI_DATA_PATH "${UCD_PATH}/emoji-data.txt") @@ -103,6 +106,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) download_file("${NAME_ALIAS_URL}" "${NAME_ALIAS_PATH}") download_file("${SCRIPTS_URL}" "${SCRIPTS_PATH}") download_file("${SCRIPT_EXTENSIONS_URL}" "${SCRIPT_EXTENSIONS_PATH}") + download_file("${BLOCKS_URL}" "${BLOCKS_PATH}") download_file("${EMOJI_DATA_URL}" "${EMOJI_DATA_PATH}") download_file("${NORM_PROPS_URL}" "${NORM_PROPS_PATH}") download_file("${GRAPHEME_BREAK_PROP_URL}" "${GRAPHEME_BREAK_PROP_PATH}") @@ -160,7 +164,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) "${UNICODE_META_TARGET_PREFIX}" "${UNICODE_DATA_HEADER}" "${UNICODE_DATA_IMPLEMENTATION}" - arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -m "${NAME_ALIAS_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}" + arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -k "${BLOCKS_PATH}" -e "${EMOJI_DATA_PATH}" -m "${NAME_ALIAS_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}" ) invoke_generator( "UnicodeDateTimeFormat" diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index 2188d8adbff..8beb463413f 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -88,6 +88,11 @@ struct CodePointData { Vector special_casing_indices; }; +struct BlockName { + CodePointRange code_point_range; + String name; +}; + struct UnicodeData { u32 code_points_with_non_zero_combining_class { 0 }; @@ -125,6 +130,12 @@ struct UnicodeData { Vector script_aliases; PropList script_extensions; + PropList block_list { + { "No_Block"sv, {} }, + }; + Vector block_aliases; + Vector block_display_names; + // FIXME: We are not yet doing anything with this data. It will be needed for String.prototype.normalize. NormalizationProps normalization_props; @@ -133,6 +144,24 @@ struct UnicodeData { PropList sentence_break_props; }; +static String sanitize_entry(String const& entry) +{ + auto sanitized = entry.replace("-", "_", true); + sanitized = sanitized.replace(" ", "_", true); + + StringBuilder builder; + bool next_is_upper = true; + for (auto ch : sanitized) { + if (next_is_upper) + builder.append_code_point(to_ascii_uppercase(ch)); + else + builder.append_code_point(ch); + next_is_upper = ch == '_'; + } + + return builder.to_string(); +} + static Vector parse_code_point_list(StringView list) { Vector code_points; @@ -230,7 +259,7 @@ static ErrorOr parse_special_casing(Core::Stream::BufferedFile& file, Unic return {}; } -static ErrorOr parse_prop_list(Core::Stream::BufferedFile& file, PropList& prop_list, bool multi_value_property = false) +static ErrorOr parse_prop_list(Core::Stream::BufferedFile& file, PropList& prop_list, bool multi_value_property = false, bool sanitize_property = false) { Array buffer; @@ -255,8 +284,8 @@ static ErrorOr parse_prop_list(Core::Stream::BufferedFile& file, PropList& else properties = { segments[1].trim_whitespace() }; - for (auto const& property : properties) { - auto& code_points = prop_list.ensure(property.trim_whitespace()); + for (auto& property : properties) { + auto& code_points = prop_list.ensure(sanitize_property ? sanitize_entry(property).trim_whitespace().view() : property.trim_whitespace()); code_points.append(code_point_range); } } @@ -340,7 +369,7 @@ static ErrorOr parse_name_aliases(Core::Stream::BufferedFile& file, Unicod return {}; } -static ErrorOr parse_value_alias_list(Core::Stream::BufferedFile& file, StringView desired_category, Vector const& value_list, Vector& prop_aliases, bool primary_value_is_first = true) +static ErrorOr parse_value_alias_list(Core::Stream::BufferedFile& file, StringView desired_category, Vector const& value_list, Vector& prop_aliases, bool primary_value_is_first = true, bool sanitize_alias = false) { TRY(file.seek(0, Core::Stream::SeekMode::SetPosition)); Array buffer; @@ -376,11 +405,11 @@ static ErrorOr parse_value_alias_list(Core::Stream::BufferedFile& file, St VERIFY((segments.size() == 3) || (segments.size() == 4)); auto value = primary_value_is_first ? segments[1].trim_whitespace() : segments[2].trim_whitespace(); auto alias = primary_value_is_first ? segments[2].trim_whitespace() : segments[1].trim_whitespace(); - append_alias(alias, value); + append_alias(sanitize_alias ? sanitize_entry(alias).view() : alias, value); if (segments.size() == 4) { alias = segments[3].trim_whitespace(); - append_alias(alias, value); + append_alias(sanitize_alias ? sanitize_entry(alias).view() : alias, value); } } @@ -485,6 +514,28 @@ static void add_canonical_code_point_name(CodePointRange range, StringView name, unicode_data.code_point_display_names.append({ range, name }); } +static ErrorOr parse_block_display_names(Core::Stream::BufferedFile& file, UnicodeData& unicode_data) +{ + Array buffer; + while (TRY(file.can_read_line())) { + auto nread = TRY(file.read_line(buffer)); + StringView line { buffer.data(), nread }; + if (line.is_empty() || line.starts_with('#')) + continue; + + auto segments = line.split_view(';', true); + VERIFY(segments.size() == 2); + + auto code_point_range = parse_code_point_range(segments[0].trim_whitespace()); + auto display_name = segments[1].trim_whitespace(); + unicode_data.block_display_names.append({ code_point_range, display_name }); + } + + TRY(file.seek(0, Core::Stream::SeekMode::SetPosition)); + + return {}; +} + static ErrorOr parse_unicode_data(Core::Stream::BufferedFile& file, UnicodeData& unicode_data) { Optional code_point_range_start; @@ -591,9 +642,10 @@ static ErrorOr generate_unicode_data_header(Core::Stream::BufferedFile& fi generator.set("name", name); generator.set("underlying", String::formatted("{}UnderlyingType", name)); + generator.set("type", ((values.size() + !default_.is_empty()) < 256) ? "u8"sv : "u16"sv); generator.append(R"~~~( -using @underlying@ = u8; +using @underlying@ = @type@; enum class @name@ : @underlying@ {)~~~"); @@ -635,6 +687,7 @@ namespace Unicode { generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases); generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases); generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases); + generate_enum("Block"sv, {}, unicode_data.block_list.keys(), unicode_data.block_aliases); generate_enum("GraphemeBreakProperty"sv, {}, unicode_data.grapheme_break_props.keys()); generate_enum("WordBreakProperty"sv, {}, unicode_data.word_break_props.keys()); generate_enum("SentenceBreakProperty"sv, {}, unicode_data.sentence_break_props.keys()); @@ -813,11 +866,6 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { { append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; }); generator.append(R"~~~( -struct CodePointRange { - u32 first { 0 }; - u32 last { 0 }; -}; - struct CodePointRangeComparator { constexpr int operator()(u32 code_point, CodePointRange const& range) { @@ -885,10 +933,49 @@ static constexpr Array, @size@> @name@ { {)~~~"); append_prop_list("s_properties"sv, "s_property_{}"sv, unicode_data.prop_list); append_prop_list("s_scripts"sv, "s_script_{}"sv, unicode_data.script_list); append_prop_list("s_script_extensions"sv, "s_script_extension_{}"sv, unicode_data.script_extensions); + append_prop_list("s_blocks"sv, "s_block_{}"sv, unicode_data.block_list); append_prop_list("s_grapheme_break_properties"sv, "s_grapheme_break_property_{}"sv, unicode_data.grapheme_break_props); append_prop_list("s_word_break_properties"sv, "s_word_break_property_{}"sv, unicode_data.word_break_props); append_prop_list("s_sentence_break_properties"sv, "s_sentence_break_property_{}"sv, unicode_data.sentence_break_props); + generator.append(R"~~~( +struct BlockNameComparator : public CodePointRangeComparator { + constexpr int operator()(u32 code_point, BlockName const& name) + { + return CodePointRangeComparator::operator()(code_point, name.code_point_range); + } +}; +)~~~"); + + generator.set("block_display_names_size", String::number(unicode_data.block_display_names.size())); + generator.append(R"~~~( +static constexpr Array s_block_display_names { { +)~~~"); + for (auto const& block_name : unicode_data.block_display_names) { + generator.set("first", String::formatted("{:#x}", block_name.code_point_range.first)); + generator.set("last", String::formatted("{:#x}", block_name.code_point_range.last)); + generator.set("name", block_name.name); + generator.append(R"~~~( { { @first@, @last@ }, "@name@"sv }, +)~~~"); + } + generator.append(R"~~~(} }; +)~~~"); + + generator.append(R"~~~( +Optional code_point_block_display_name(u32 code_point) +{ + if (auto const* entry = binary_search(s_block_display_names, code_point, nullptr, BlockNameComparator {})) + return entry->display_name; + + return {}; +} + +Span block_display_names() +{ + return s_block_display_names; +} +)~~~"); + generator.append(R"~~~( struct CodePointName { CodePointRange code_point_range {}; @@ -1006,6 +1093,9 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@) append_prop_search("Script"sv, "script_extension"sv, "s_script_extensions"sv); append_from_string("Script"sv, "script"sv, unicode_data.script_list, unicode_data.script_aliases); + append_prop_search("Block"sv, "block"sv, "s_blocks"sv); + append_from_string("Block"sv, "block"sv, unicode_data.block_list, unicode_data.block_aliases); + append_prop_search("GraphemeBreakProperty"sv, "grapheme_break_property"sv, "s_grapheme_break_properties"sv); append_prop_search("WordBreakProperty"sv, "word_break_property"sv, "s_word_break_properties"sv); append_prop_search("SentenceBreakProperty"sv, "sentence_break_property"sv, "s_sentence_break_properties"sv); @@ -1158,6 +1248,7 @@ ErrorOr serenity_main(Main::Arguments arguments) StringView name_alias_path; StringView scripts_path; StringView script_extensions_path; + StringView blocks_path; StringView emoji_data_path; StringView normalization_path; StringView grapheme_break_path; @@ -1178,6 +1269,7 @@ ErrorOr serenity_main(Main::Arguments arguments) args_parser.add_option(name_alias_path, "Path to NameAliases.txt file", "name-alias-path", 'm', "name-alias-path"); args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path"); args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path"); + args_parser.add_option(blocks_path, "Path to Blocks.txt file", "blocks-path", 'k', "blocks-path"); args_parser.add_option(emoji_data_path, "Path to emoji-data.txt file", "emoji-data-path", 'e', "emoji-data-path"); args_parser.add_option(normalization_path, "Path to DerivedNormalizationProps.txt file", "normalization-path", 'n', "normalization-path"); args_parser.add_option(grapheme_break_path, "Path to GraphemeBreakProperty.txt file", "grapheme-break-path", 'f', "grapheme-break-path"); @@ -1198,6 +1290,7 @@ ErrorOr serenity_main(Main::Arguments arguments) auto name_alias_file = TRY(open_file(name_alias_path, Core::Stream::OpenMode::Read)); auto scripts_file = TRY(open_file(scripts_path, Core::Stream::OpenMode::Read)); auto script_extensions_file = TRY(open_file(script_extensions_path, Core::Stream::OpenMode::Read)); + auto blocks_file = TRY(open_file(blocks_path, Core::Stream::OpenMode::Read)); auto emoji_data_file = TRY(open_file(emoji_data_path, Core::Stream::OpenMode::Read)); auto normalization_file = TRY(open_file(normalization_path, Core::Stream::OpenMode::Read)); auto grapheme_break_file = TRY(open_file(grapheme_break_path, Core::Stream::OpenMode::Read)); @@ -1215,6 +1308,8 @@ ErrorOr serenity_main(Main::Arguments arguments) TRY(parse_alias_list(*prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases)); TRY(parse_prop_list(*scripts_file, unicode_data.script_list)); TRY(parse_prop_list(*script_extensions_file, unicode_data.script_extensions, true)); + TRY(parse_block_display_names(*blocks_file, unicode_data)); + TRY(parse_prop_list(*blocks_file, unicode_data.block_list, false, true)); TRY(parse_name_aliases(*name_alias_file, unicode_data)); TRY(parse_prop_list(*grapheme_break_file, unicode_data.grapheme_break_props)); TRY(parse_prop_list(*word_break_file, unicode_data.word_break_props)); @@ -1224,6 +1319,7 @@ ErrorOr serenity_main(Main::Arguments arguments) TRY(parse_unicode_data(*unicode_data_file, unicode_data)); TRY(parse_value_alias_list(*prop_value_alias_file, "gc"sv, unicode_data.general_categories.keys(), unicode_data.general_category_aliases)); TRY(parse_value_alias_list(*prop_value_alias_file, "sc"sv, unicode_data.script_list.keys(), unicode_data.script_aliases, false)); + TRY(parse_value_alias_list(*prop_value_alias_file, "blk"sv, unicode_data.block_list.keys(), unicode_data.block_aliases, false, true)); normalize_script_extensions(unicode_data.script_extensions, unicode_data.script_list, unicode_data.script_aliases); TRY(generate_unicode_data_header(*generated_header_file, unicode_data)); diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 26b3acc65b1..9974f095275 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -23,9 +23,11 @@ namespace Unicode { Optional __attribute__((weak)) code_point_display_name(u32) { return {}; } +Optional __attribute__((weak)) code_point_block_display_name(u32) { return {}; } Optional __attribute__((weak)) code_point_abbreviation(u32) { return {}; } u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; } Span __attribute__((weak)) special_case_mapping(u32) { return {}; } +Span __attribute__((weak)) block_display_names() { return {}; } #if ENABLE_UNICODE_DATA @@ -354,6 +356,9 @@ Optional