From 7e6ad172a461e23718df9321570734c8ed02a792 Mon Sep 17 00:00:00 2001
From: Timothy Flynn <trflynn89@pm.me>
Date: Tue, 23 Nov 2021 08:24:13 -0500
Subject: [PATCH] LibUnicode: Support code point names that apply to ranges of
 code points

For example, consider the following adjacent entries in UnicodeData.txt:

    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DBF;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;

Our current implementation would assign the display name "CJK Ideograph
Extension A" to code points U+3400 & U+4DBF, but not to the code points
in between. Not only should those code points be assigned a name, but
the Unicode spec also has formatting rules on what the names should be
(the names for these ranged code points are not as they appear in
UnicodeData.txt).

The spec also defines names for code point ranges that actually are
listed individually in UnicodeData.txt. For example:

    2F800;CJK COMPATIBILITY IDEOGRAPH-2F800;Lo;0;L;4E3D;;;;N;;;;;
    2F801;CJK COMPATIBILITY IDEOGRAPH-2F801;Lo;0;L;4E38;;;;N;;;;;
    2F802;CJK COMPATIBILITY IDEOGRAPH-2F802;Lo;0;L;4E41;;;;N;;;;;

Code points are only coalesced into a range if all fields after the name
are equivalent. Our parser will insert the range and its name formatting
pattern when it comes across the first code point in that range, then
ignore other code points in that range. This reduces the number of names
we generate by nearly 2,000.
---
 .../LibUnicode/GenerateUnicodeData.cpp        | 148 ++++++++++++------
 .../LibUnicode/TestUnicodeCharacterTypes.cpp  |  31 ++++
 .../Libraries/LibUnicode/CharacterTypes.cpp   |   7 +-
 .../Libraries/LibUnicode/CharacterTypes.h     |   2 +-
 4 files changed, 137 insertions(+), 51 deletions(-)
diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
index c476592e970..c34ecf4b102 100644
--- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
+++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
@@ -61,6 +61,11 @@ struct Normalization {
 
 using NormalizationProps = HashMap<String, Vector<Normalization>>;
 
+struct CodePointName { // Pairs a range of code points with the display name (or name pattern) that applies to it.
+    CodePointRange code_point_range; // Inclusive range; a single code point is stored with first == last.
+    StringView name; // Either a literal name, or a format pattern ending in "{:X}" that is filled in with the code point.
+};
+
 // UnicodeData source: https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt
 // Field descriptions: https://www.unicode.org/reports/tr44/tr44-13.html#UnicodeData.txt
 //                     https://www.unicode.org/reports/tr44/#General_Category_Values
@@ -96,6 +101,9 @@ struct UnicodeData {
 
     Vector<CodePointData> code_point_data;
 
+    HashMap<u32, String> code_point_display_name_aliases;
+    Vector<CodePointName> code_point_display_names;
+
     PropList general_categories;
     Vector<Alias> general_category_aliases;
 
@@ -280,25 +288,8 @@ static void parse_alias_list(Core::File& file, PropList const& prop_list, Vector
     }
 }
 
-static void parse_name_aliases(Core::File& file, Vector<CodePointData>& code_point_data)
+static void parse_name_aliases(Core::File& file, UnicodeData& unicode_data)
 {
-    auto iterator = code_point_data.begin();
-    VERIFY(!iterator.is_end());
-    // We use a single iterator because both lists should be sorted, and repeated lookups are unnecessary.
-
-    auto code_point_data_at = [&iterator](u32 code_point) -> CodePointData& {
-        VERIFY(!iterator.is_end());
-        while (iterator->code_point < code_point) {
-            ++iterator;
-            VERIFY(!iterator.is_end());
-        }
-        VERIFY(iterator->code_point == code_point);
-        return *iterator;
-    };
-
-    // We always use the *first* "control"-type alias. The actual reason for this is to avoid the silly name "END OF LINE" for code point 0x000a.
-    u32 last_overridden = (u32)-1;
-
     while (file.can_read_line()) {
         auto line = file.read_line();
         if (line.is_empty() || line.starts_with('#'))
@@ -307,22 +298,15 @@ static void parse_name_aliases(Core::File& file, Vector<CodePointData>& code_poi
         auto segments = line.split_view(';', true);
         VERIFY(segments.size() == 3);
 
-        auto code_point_optional = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0].trim_whitespace());
+        auto code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0].trim_whitespace());
         auto alias = segments[1].trim_whitespace();
         auto reason = segments[2].trim_whitespace();
 
-        VERIFY(code_point_optional.has_value());
-        VERIFY(!alias.is_empty());
-        // Thankfully, there is no correction alias for any of the control code points (yet).
-        if (!reason.is_one_of("correction", "control"))
+        if (!reason.is_one_of("correction"sv, "control"sv))
             continue;
 
-        auto code_point = code_point_optional.value();
-        if (code_point == last_overridden)
-            continue;
-
-        code_point_data_at(code_point).name = alias;
-        last_overridden = code_point;
+        if (!unicode_data.code_point_display_name_aliases.contains(*code_point))
+            unicode_data.code_point_display_name_aliases.set(*code_point, alias);
     }
 }
 
@@ -406,6 +390,60 @@ static void parse_normalization_props(Core::File& file, UnicodeData& unicode_dat
     }
 }
 
+static void add_canonical_code_point_name(CodePointRange range, StringView name, UnicodeData& unicode_data)
+{
+    // Records the display name for |range| in unicode_data.code_point_display_names, following the name rules of:
+    // https://www.unicode.org/versions/Unicode14.0.0/ch04.pdf#G142981
+    // FIXME: Implement the NR1 rules for Hangul syllables.
+    // These code point ranges are the NR2 set of name replacements defined by Table 4-8. Each name is a format pattern whose "{:X}" placeholder the generated lookup fills in with the code point.
+    constexpr Array<CodePointName, 15> s_ideographic_replacements { {
+        { { 0x3400, 0x4DBF }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
+        { { 0x4E00, 0x9FFC }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
+        { { 0xF900, 0xFA6D }, "CJK COMPATIBILITY IDEOGRAPH-{:X}"sv },
+        { { 0xFA70, 0xFAD9 }, "CJK COMPATIBILITY IDEOGRAPH-{:X}"sv },
+        { { 0x17000, 0x187F7 }, "TANGUT IDEOGRAPH-{:X}"sv },
+        { { 0x18B00, 0x18CD5 }, "KHITAN SMALL SCRIPT CHARACTER-{:X}"sv },
+        { { 0x18D00, 0x18D08 }, "TANGUT IDEOGRAPH-{:X}"sv },
+        { { 0x1B170, 0x1B2FB }, "NUSHU CHARACTER-{:X}"sv },
+        { { 0x20000, 0x2A6DD }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
+        { { 0x2A700, 0x2B734 }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
+        { { 0x2B740, 0x2B81D }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
+        { { 0x2B820, 0x2CEA1 }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
+        { { 0x2CEB0, 0x2EBE0 }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
+        { { 0x2F800, 0x2FA1D }, "CJK COMPATIBILITY IDEOGRAPH-{:X}"sv },
+        { { 0x30000, 0x3134A }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
+    } };
+    // NOTE(review): the end points above look like the Unicode 13.0 assignments (this file cites the 13.0.0 UnicodeData.txt) although the link above is to the 14.0.0 chapter - confirm when the data version is upgraded.
+    auto it = find_if(s_ideographic_replacements.begin(), s_ideographic_replacements.end(),
+        [&](auto const& replacement) {
+            return replacement.code_point_range.first == range.first;
+        });
+    // An exact start match emits the table's whole range and pattern in place of the given range and name.
+    if (it != s_ideographic_replacements.end()) {
+        unicode_data.code_point_display_names.append(*it);
+        return;
+    }
+    // Otherwise, look for a replacement range that contains the start of |range|.
+    it = find_if(s_ideographic_replacements.begin(), s_ideographic_replacements.end(),
+        [&](auto const& replacement) {
+            return (replacement.code_point_range.first <= range.first) && (range.first <= replacement.code_point_range.last);
+        });
+
+    if (it != s_ideographic_replacements.end()) {
+        // Drop code points that will have been captured by a range defined by the ideographic replacements.
+        return;
+    }
+    // Only the first code point of |range| is looked up in the alias map, so that map must already be populated.
+    if (auto alias = unicode_data.code_point_display_name_aliases.get(range.first); alias.has_value()) {
+        // NR4 states that control code points have a null string as their name. Our implementation
+        // uses the control code's alias as its display name.
+        unicode_data.code_point_display_names.append({ range, *alias });
+        return;
+    }
+    // No special case applies; store the range with the name exactly as given by the caller.
+    unicode_data.code_point_display_names.append({ range, name });
+}
+
 static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
 {
     Optional<u32> code_point_range_start;
@@ -457,11 +495,17 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
 
             data.name = data.name.substring(1, data.name.length() - 8);
             code_point_range_start.clear();
-        } else if ((data.code_point > 0) && (data.code_point - previous_code_point) != 1) {
-            VERIFY(assigned_code_point_range_start.has_value());
 
-            assigned_code_points.append({ *assigned_code_point_range_start, previous_code_point });
-            assigned_code_point_range_start = data.code_point;
+            add_canonical_code_point_name(code_point_range, data.name, unicode_data);
+        } else {
+            add_canonical_code_point_name({ data.code_point, data.code_point }, data.name, unicode_data);
+
+            if ((data.code_point > 0) && (data.code_point - previous_code_point) != 1) {
+                VERIFY(assigned_code_point_range_start.has_value());
+
+                assigned_code_points.append({ *assigned_code_point_range_start, previous_code_point });
+                assigned_code_point_range_start = data.code_point;
+            }
         }
 
         bool has_special_casing { false };
@@ -563,7 +607,7 @@ struct SpecialCasing {
 
 namespace Detail {
 
-StringView code_point_display_name(u32 code_point);
+Optional<String> code_point_display_name(u32 code_point);
 
 u32 canonical_combining_class(u32 code_point);
 
@@ -802,19 +846,27 @@ static constexpr Array<Span<CodePointRange const>, @size@> @name@ { {)~~~");
 
     generator.append(R"~~~(
 struct CodePointName {
-    u32 code_point { 0 };
+    CodePointRange code_point_range {};
     StringView display_name;
 };
+
+struct CodePointNameComparator : public CodePointRangeComparator {
+    constexpr int operator()(u32 code_point, CodePointName const& name)
+    {
+        return CodePointRangeComparator::operator()(code_point, name.code_point_range);
+    }
+};
 )~~~");
 
-    generator.set("code_point_names_size", String::number(unicode_data.code_point_data.size()));
+    generator.set("code_point_display_names_size", String::number(unicode_data.code_point_display_names.size()));
     generator.append(R"~~~(
-static constexpr Array<CodePointName, @code_point_names_size@> s_code_point_names { {
+static constexpr Array<CodePointName, @code_point_display_names_size@> s_code_point_display_names { {
 )~~~");
-    for (auto const& code_point_data : unicode_data.code_point_data) {
-        generator.set("code_point", String::formatted("{:#x}", code_point_data.code_point));
-        generator.set("code_point_name", code_point_data.name);
-        generator.append(R"~~~(    { @code_point@, "@code_point_name@"sv },
+    for (auto const& code_point_name : unicode_data.code_point_display_names) {
+        generator.set("first", String::formatted("{:#x}", code_point_name.code_point_range.first));
+        generator.set("last", String::formatted("{:#x}", code_point_name.code_point_range.last));
+        generator.set("name", code_point_name.name);
+        generator.append(R"~~~(    { { @first@, @last@ }, "@name@"sv },
 )~~~");
     }
     generator.append(R"~~~(} };
@@ -823,10 +875,16 @@ static constexpr Array<CodePointName, @code_point_names_size@> s_code_point_name
     generator.append(R"~~~(
 namespace Detail {
 
-StringView code_point_display_name(u32 code_point)
+Optional<String> code_point_display_name(u32 code_point)
 {
-    auto const* entry = binary_search(s_code_point_names, code_point, nullptr, CodePointComparator<CodePointName> {});
-    return entry ? entry->display_name : StringView();
+    if (auto const* entry = binary_search(s_code_point_display_names, code_point, nullptr, CodePointNameComparator {})) {
+        if (entry->display_name.ends_with("{:X}"sv))
+            return String::formatted(entry->display_name, code_point);
+
+        return entry->display_name;
+    }
+
+    return {};
 }
 )~~~");
 
@@ -1102,10 +1160,10 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
     parse_alias_list(prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases);
     parse_prop_list(scripts_file, unicode_data.script_list);
     parse_prop_list(script_extensions_file, unicode_data.script_extensions, true);
+    parse_name_aliases(name_alias_file, unicode_data);
 
     populate_general_category_unions(unicode_data.general_categories);
     parse_unicode_data(unicode_data_file, unicode_data);
-    parse_name_aliases(name_alias_file, unicode_data.code_point_data);
     parse_value_alias_list(prop_value_alias_file, "gc"sv, unicode_data.general_categories.keys(), unicode_data.general_category_aliases);
     parse_value_alias_list(prop_value_alias_file, "sc"sv, unicode_data.script_list.keys(), unicode_data.script_aliases, false);
     normalize_script_extensions(unicode_data.script_extensions, unicode_data.script_list, unicode_data.script_aliases);
diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
index c20f0e4849a..59d7b5900fd 100644
--- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
+++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
@@ -629,3 +629,34 @@ TEST_CASE(script_extension)
     EXPECT(Unicode::code_point_has_script(0x101fd, script_inherited));
     EXPECT(Unicode::code_point_has_script_extension(0x101fd, script_inherited));
 }
+
+TEST_CASE(code_point_display_name)
+{
+    auto code_point_display_name = [](u32 code_point) { // Unwraps the display name, asserting that one exists.
+        auto name = Unicode::code_point_display_name(code_point);
+        VERIFY(name.has_value());
+        return name.release_value();
+    };
+
+    // Control code points.
+    EXPECT_EQ(code_point_display_name(0), "NULL"sv);
+    EXPECT_EQ(code_point_display_name(1), "START OF HEADING"sv);
+    EXPECT_EQ(code_point_display_name(0xa), "LINE FEED"sv);
+
+    // Ideographic code points (which already appeared in a range in UnicodeData.txt).
+    EXPECT_EQ(code_point_display_name(0x3400), "CJK UNIFIED IDEOGRAPH-3400"sv);
+    EXPECT_EQ(code_point_display_name(0x3401), "CJK UNIFIED IDEOGRAPH-3401"sv);
+    EXPECT_EQ(code_point_display_name(0x3402), "CJK UNIFIED IDEOGRAPH-3402"sv);
+    EXPECT_EQ(code_point_display_name(0x4dbf), "CJK UNIFIED IDEOGRAPH-4DBF"sv); // Last code point of the range.
+
+    EXPECT_EQ(code_point_display_name(0x20000), "CJK UNIFIED IDEOGRAPH-20000"sv);
+    EXPECT_EQ(code_point_display_name(0x20001), "CJK UNIFIED IDEOGRAPH-20001"sv);
+    EXPECT_EQ(code_point_display_name(0x20002), "CJK UNIFIED IDEOGRAPH-20002"sv);
+    EXPECT(!Unicode::code_point_display_name(0x2a6df).has_value()); // Not covered by the range starting at 0x20000, so no name is expected.
+
+    // Ideographic code points (which appeared individually in UnicodeData.txt and were coalesced into a range).
+    EXPECT_EQ(code_point_display_name(0x2f800), "CJK COMPATIBILITY IDEOGRAPH-2F800"sv);
+    EXPECT_EQ(code_point_display_name(0x2f801), "CJK COMPATIBILITY IDEOGRAPH-2F801"sv);
+    EXPECT_EQ(code_point_display_name(0x2f802), "CJK COMPATIBILITY IDEOGRAPH-2F802"sv);
+    EXPECT_EQ(code_point_display_name(0x2fa1d), "CJK COMPATIBILITY IDEOGRAPH-2FA1D"sv);
+}
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
index d74bc5892af..520f94fc4b5 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -222,13 +222,10 @@ u32 to_unicode_uppercase(u32 code_point)
 #endif
 }
 
-Optional<StringView> code_point_display_name([[maybe_unused]] u32 code_point)
+Optional<String> code_point_display_name([[maybe_unused]] u32 code_point)
 {
 #if ENABLE_UNICODE_DATA
-    auto name = Detail::code_point_display_name(code_point);
-    if (name.is_null())
-        return {};
-    return name;
+    return Detail::code_point_display_name(code_point);
 #else
     return {};
 #endif
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h
index 20b6601c5a4..a22b7ce23c6 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.h
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.h
@@ -19,7 +19,7 @@ namespace Unicode {
 u32 to_unicode_lowercase(u32 code_point);
 u32 to_unicode_uppercase(u32 code_point);
 
-Optional<StringView> code_point_display_name(u32 code_point);
+Optional<String> code_point_display_name(u32 code_point);
 
 String to_unicode_lowercase_full(StringView, Optional<StringView> locale = {});
 String to_unicode_uppercase_full(StringView, Optional<StringView> locale = {});