/*
 * Patch summary: adds Unicode bidirectional-class lookup to LibUnicode.
 *
 * Files touched by this patch:
 *   - Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
 *   - Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
 *   - Userland/Libraries/LibUnicode/CharacterTypes.cpp
 *   - Userland/Libraries/LibUnicode/CharacterTypes.h
 *   - Userland/Libraries/LibUnicode/Forward.h
 *
 * Generator (GenerateUnicodeData.cpp):
 *   - Introduces struct CodePointBidiClass (a code point range plus the
 *     bidi class string) and two new UnicodeData members:
 *     bidirectional_classes and code_point_bidirectional_classes.
 *   - parse_unicode_data() appends one entry to
 *     code_point_bidirectional_classes next to each existing
 *     add_canonical_code_point_name() call: the whole range in the range
 *     branch, and { code_point, code_point } in the single-code-point
 *     branch. It also inserts data.bidi_class into bidirectional_classes
 *     with HashSetExistingEntryBehavior::Keep.
 *   - The generated header gains a BidirectionalClass enum built from
 *     bidirectional_classes.values().
 *   - The generated implementation gains struct BidiClassData and
 *     CodePointBidiClassComparator (which forwards to
 *     CodePointRangeComparator::operator() on the entry's range), the
 *     s_bidirectional_classes table (a line break is emitted after every
 *     20 entries), and bidirectional_class(u32), which runs binary_search
 *     over that table and returns an empty Optional when nothing matches.
 *   - bidirectional_class_from_string is produced through
 *     append_from_string("BidirectionalClass", "bidirectional_class", ...).
 *   - NOTE(review): the binary search presumably relies on the table
 *     entries being appended in ascending code point order; confirm
 *     against the input data ordering.
 *
 * Library (CharacterTypes.cpp / CharacterTypes.h / Forward.h):
 *   - Declares bidirectional_class_from_string(StringView) and
 *     bidirectional_class(u32 code_point), and adds weak definitions of
 *     both that return {}.
 *   - Forward-declares enum class BidirectionalClass : u8.
 *
 * Test (TestUnicodeCharacterTypes.cpp):
 *   - New test case code_point_bidirectional_character_type expects
 *     'A' and 'z' to map to "L", '7' to "EN", ' ' to "WS", and 0xFEB4
 *     to "AL".
 *
 * NOTE(review): in this copy of the patch the original line breaks are
 * collapsed and angle-bracketed template arguments appear to be missing
 * (for example "Vector unique_properties", "HashTable
 * bidirectional_classes", "Optional bidirectional_class"). Presumably
 * they were lost during extraction; confirm against the original patch
 * before applying. All documentation is kept in this preamble because
 * everything after the first header line is patch payload.
 */
diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index e8e25da5422..479c969a453 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -125,6 +125,11 @@ struct CodePointTables { Vector unique_properties; }; +struct CodePointBidiClass { + Unicode::CodePointRange code_point_range; + DeprecatedString bidi_class; +}; + struct UnicodeData { UniqueStringStorage unique_strings; @@ -184,6 +189,9 @@ struct UnicodeData { CodePointTables grapheme_break_tables; CodePointTables word_break_tables; CodePointTables sentence_break_tables; + + HashTable bidirectional_classes; + Vector code_point_bidirectional_classes; }; static DeprecatedString sanitize_entry(DeprecatedString const& entry) @@ -725,8 +733,10 @@ static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa code_point_range_start.clear(); add_canonical_code_point_name(code_point_range, data.name, unicode_data); + unicode_data.code_point_bidirectional_classes.append({ code_point_range, data.bidi_class }); } else { add_canonical_code_point_name({ data.code_point, data.code_point }, data.name, unicode_data); + unicode_data.code_point_bidirectional_classes.append({ { data.code_point, data.code_point }, data.bidi_class }); if ((data.code_point > 0) && (data.code_point - previous_code_point) != 1) { VERIFY(assigned_code_point_range_start.has_value()); @@ -748,6 +758,8 @@ static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa unicode_data.code_points_with_decomposition_mapping += data.decomposition_mapping.has_value(); + unicode_data.bidirectional_classes.set(data.bidi_class, AK::HashSetExistingEntryBehavior::Keep); + previous_code_point = data.code_point; unicode_data.code_point_data.append(move(data)); } @@ -818,6 +830,7 @@ namespace Unicode { generate_enum("WordBreakProperty"sv, {}, 
unicode_data.word_break_props.keys()); generate_enum("SentenceBreakProperty"sv, {}, unicode_data.sentence_break_props.keys()); generate_enum("CompatibilityFormattingTag"sv, "Canonical"sv, unicode_data.compatibility_tags); + generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values()); generator.append(R"~~~( struct SpecialCasing { @@ -1003,6 +1016,19 @@ struct CodePointNameComparator : public CodePointRangeComparator { return CodePointRangeComparator::operator()(code_point, name.code_point_range); } }; + +struct BidiClassData { + CodePointRange code_point_range {}; + BidirectionalClass bidi_class {}; +}; + +struct CodePointBidiClassComparator : public CodePointRangeComparator { + constexpr int operator()(u32 code_point, BidiClassData const& bidi_class) + { + return CodePointRangeComparator::operator()(code_point, bidi_class.code_point_range); + } +}; + )~~~"); generator.set("decomposition_mappings_size", DeprecatedString::number(unicode_data.decomposition_mappings.size())); @@ -1212,6 +1238,33 @@ static constexpr Array<@type@, @size@> @name@ { { append_code_point_display_names("BlockNameData"sv, "s_block_display_names"sv, unicode_data.block_display_names); append_code_point_display_names("CodePointName"sv, "s_code_point_display_names"sv, unicode_data.code_point_display_names); + { + constexpr size_t max_bidi_classes_per_row = 20; + size_t bidi_classes_in_current_row = 0; + + generator.set("size"sv, DeprecatedString::number(unicode_data.code_point_bidirectional_classes.size())); + generator.append(R"~~~( +static constexpr Array s_bidirectional_classes { { +)~~~"); + for (auto const& data : unicode_data.code_point_bidirectional_classes) { + if (bidi_classes_in_current_row++ > 0) + generator.append(", "); + + generator.set("first", DeprecatedString::formatted("{:#x}", data.code_point_range.first)); + generator.set("last", DeprecatedString::formatted("{:#x}", data.code_point_range.last)); + generator.set("bidi_class", data.bidi_class); + 
generator.append("{ { @first@, @last@ }, BidirectionalClass::@bidi_class@ }"); + + if (bidi_classes_in_current_row == max_bidi_classes_per_row) { + bidi_classes_in_current_row = 0; + generator.append(",\n "); + } + } + generator.append(R"~~~( +} }; +)~~~"); + } + generator.append(R"~~~( Optional code_point_block_display_name(u32 code_point) { @@ -1335,6 +1388,14 @@ Optional code_point_decomposition_by_index(size_t auto const& mapping = s_decomposition_mappings[index]; return CodePointDecomposition { mapping.code_point, mapping.tag, ReadonlySpan { s_decomposition_mappings_data.data() + mapping.decomposition_index, mapping.decomposition_count } }; } + +Optional bidirectional_class(u32 code_point) +{ + if (auto const* entry = binary_search(s_bidirectional_classes, code_point, nullptr, CodePointBidiClassComparator {})) + return entry->bidi_class; + + return {}; +} )~~~"); auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) -> ErrorOr { @@ -1396,6 +1457,8 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@) TRY(append_prop_search("WordBreakProperty"sv, "word_break_property"sv, "s_word_break_properties"sv)); TRY(append_prop_search("SentenceBreakProperty"sv, "sentence_break_property"sv, "s_sentence_break_properties"sv)); + TRY(append_from_string("BidirectionalClass"sv, "bidirectional_class"sv, unicode_data.bidirectional_classes, {})); + generator.append(R"~~~( } )~~~"); diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp index 822e9c83899..9edd7a93480 100644 --- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp +++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp @@ -848,3 +848,28 @@ TEST_CASE(code_point_display_name) EXPECT_EQ(code_point_display_name(0x2f802), "CJK COMPATIBILITY IDEOGRAPH-2F802"sv); EXPECT_EQ(code_point_display_name(0x2fa1d), "CJK COMPATIBILITY IDEOGRAPH-2FA1D"sv); } + +TEST_CASE(code_point_bidirectional_character_type) +{ 
+ auto code_point_bidi_class = [](u32 code_point) { + auto bidi_class = Unicode::bidirectional_class(code_point); + VERIFY(bidi_class.has_value()); + return bidi_class.release_value(); + }; + + auto bidi_class_from_string = [](StringView name) { + auto result = Unicode::bidirectional_class_from_string(name); + VERIFY(result.has_value()); + return result.release_value(); + }; + + // Left-to-right + EXPECT_EQ(code_point_bidi_class('A'), bidi_class_from_string("L"sv)); + EXPECT_EQ(code_point_bidi_class('z'), bidi_class_from_string("L"sv)); + // European number + EXPECT_EQ(code_point_bidi_class('7'), bidi_class_from_string("EN"sv)); + // Whitespace + EXPECT_EQ(code_point_bidi_class(' '), bidi_class_from_string("WS"sv)); + // Arabic right-to-left (U+FEB4 ARABIC LETTER SEEN MEDIAL FORM) + EXPECT_EQ(code_point_bidi_class(0xFEB4), bidi_class_from_string("AL"sv)); +} diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 744b7464dcd..89182efeb6f 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -147,4 +147,7 @@ bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeB bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; } bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; } +Optional __attribute__((weak)) bidirectional_class_from_string(StringView) { return {}; } +Optional __attribute__((weak)) bidirectional_class(u32) { return {}; } + } diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h index 81fcd3c0668..8c4c205b623 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.h +++ b/Userland/Libraries/LibUnicode/CharacterTypes.h @@ -68,4 +68,7 @@ bool code_point_has_grapheme_break_property(u32 code_point, GraphemeBreakPropert bool 
code_point_has_word_break_property(u32 code_point, WordBreakProperty property); bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property); +Optional bidirectional_class_from_string(StringView); +Optional bidirectional_class(u32 code_point); + } diff --git a/Userland/Libraries/LibUnicode/Forward.h b/Userland/Libraries/LibUnicode/Forward.h index 5b6126fad47..c6c6dfd9dab 100644 --- a/Userland/Libraries/LibUnicode/Forward.h +++ b/Userland/Libraries/LibUnicode/Forward.h @@ -10,6 +10,7 @@ namespace Unicode { +enum class BidirectionalClass : u8; enum class Block : u16; enum class EmojiGroup : u8; enum class GeneralCategory : u8;