From 139c575cc92d05b1621860791522d8a724586291 Mon Sep 17 00:00:00 2001
From: Timothy Flynn <trflynn89@pm.me>
Date: Fri, 15 Sep 2023 09:23:56 -0400
Subject: [PATCH] LibUnicode: Update to Unicode version 15.1.0

https://unicode.org/versions/Unicode15.1.0/

This update includes a new code point property, Indic Conjunct Break
(InCB), which may have the value Consonant, Linker, or Extend. It is
used in text segmentation to prevent breaking within some extended
grapheme cluster sequences.
---
 Meta/CMake/unicode_data.cmake                 |  2 +-
 .../LibUnicode/GenerateUnicodeData.cpp        | 16 ++++++++-
 Tests/LibUnicode/TestSegmentation.cpp         | 21 ++++++++++++
 .../Libraries/LibUnicode/Segmentation.cpp     | 33 +++++++++++++++++++
 4 files changed, 70 insertions(+), 2 deletions(-)
diff --git a/Meta/CMake/unicode_data.cmake b/Meta/CMake/unicode_data.cmake
index ba43f3dc185..dca7f69fe41 100644
--- a/Meta/CMake/unicode_data.cmake
+++ b/Meta/CMake/unicode_data.cmake
@@ -1,6 +1,6 @@
 include(${CMAKE_CURRENT_LIST_DIR}/utils.cmake)
 
-set(UCD_VERSION 15.0.0)
+set(UCD_VERSION 15.1.0)
 set(UCD_PATH "${SERENITY_CACHE_DIR}/UCD" CACHE PATH "Download location for UCD files")
 set(UCD_VERSION_FILE "${UCD_PATH}/version.txt")
 
diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
index 196410a50b5..36afc1fda1d 100644
--- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
+++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
@@ -375,7 +375,21 @@ static ErrorOr<void> parse_prop_list(Core::InputBufferedFile& file, PropList& pr
             line = line.substring_view(0, *index);
 
         auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
-        VERIFY(segments.size() == 2);
+        VERIFY(segments.size() == 2 || segments.size() == 3);
+
+        String combined_segment_buffer;
+
+        if (segments.size() == 3) {
+            // For example, in DerivedCoreProperties.txt, there are lines such as:
+            //
+            //     094D          ; InCB; Linker # Mn       DEVANAGARI SIGN VIRAMA
+            //
+            // These are used in text segmentation to prevent breaking within some extended grapheme clusters.
+            // So here, we combine the segments into a single property, which allows us to simply do code point
+            // property lookups at runtime for specific Indic Conjunct Break sequences.
+            combined_segment_buffer = MUST(String::join('_', Array { segments[1].trim_whitespace(), segments[2].trim_whitespace() }));
+            segments[1] = combined_segment_buffer;
+        }
 
         auto code_point_range = parse_code_point_range(segments[0].trim_whitespace());
         Vector<StringView> properties;
diff --git a/Tests/LibUnicode/TestSegmentation.cpp b/Tests/LibUnicode/TestSegmentation.cpp
index 1159ae875b4..675ce060fe7 100644
--- a/Tests/LibUnicode/TestSegmentation.cpp
+++ b/Tests/LibUnicode/TestSegmentation.cpp
@@ -51,6 +51,27 @@ TEST_CASE(grapheme_segmentation)
     test_grapheme_segmentation("a👩🏼‍❤️‍👨🏻b"sv, { 0u, 1u, 29u, 30u });
 }
 
+TEST_CASE(grapheme_segmentation_indic_conjunct_break)
+{
+    test_grapheme_segmentation("\u0915"sv, { 0u, 3u }); // Expected boundaries line up with UTF-8 byte offsets (U+0915 encodes to 3 bytes).
+    test_grapheme_segmentation("\u0915a"sv, { 0u, 3u, 4u });
+    test_grapheme_segmentation("\u0915\u0916"sv, { 0u, 3u, 6u });
+    // U+094D (Devanagari virama, InCB=Linker) between the two letters: no boundary is expected inside the sequence.
+    test_grapheme_segmentation("\u0915\u094D\u0916"sv, { 0u, 9u });
+    // One U+09BC U+09CD pair (presumably InCB Extend + Linker; verify against DerivedCoreProperties.txt) before or after U+094D.
+    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u0916"sv, { 0u, 15u });
+    test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u0916"sv, { 0u, 15u });
+    // Two U+09BC U+09CD pairs, split around U+094D in every possible way.
+    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 21u });
+    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u0916"sv, { 0u, 21u });
+    test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 21u });
+    // Three pairs: two before and one after U+094D, then one before and two after.
+    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 27u });
+    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 27u });
+    // Four pairs: two on each side of U+094D.
+    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 33u });
+}
+
 template<size_t N>
 static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N])
 {
diff --git a/Userland/Libraries/LibUnicode/Segmentation.cpp b/Userland/Libraries/LibUnicode/Segmentation.cpp
index 750f52907ac..74a8dedfb40 100644
--- a/Userland/Libraries/LibUnicode/Segmentation.cpp
+++ b/Userland/Libraries/LibUnicode/Segmentation.cpp
@@ -57,6 +57,22 @@ static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewTy
         return (code_point_has_grapheme_break_property(code_point, properties) || ...);
     };
 
+    auto skip_incb_extend_linker_sequence = [&](auto& it) { // Advances `it` past leading (InCB_Extend, InCB_Linker) code point pairs.
+        while (true) {
+            if (it == view.end() || !code_point_has_property(*it, Property::InCB_Extend))
+                return;
+            // An Extend is only skipped together with a Linker that directly follows it, so peek ahead on a copy first.
+            auto next_it = it;
+            ++next_it;
+            // NOTE(review): UAX #29 spells this part of GB9c as [ \p{InCB=Extend} \p{InCB=Linker} ]*; confirm that strict pairs are intended.
+            if (next_it == view.end() || !code_point_has_property(*next_it, Property::InCB_Linker))
+                return;
+
+            it = next_it;
+            ++it;
+        }
+    };
+
     // GB1
     if (callback(0) == IterationDecision::Break)
         return;
@@ -70,6 +86,23 @@ static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewTy
         for (++it; it != view.end(); ++it, code_point = next_code_point) {
             next_code_point = *it;
 
+            // GB9c: Do not break between two InCB Consonants joined by a Linker (optionally surrounded by Extend + Linker pairs).
+            if (code_point_has_property(code_point, Property::InCB_Consonant)) {
+                auto it_copy = it;
+                skip_incb_extend_linker_sequence(it_copy);
+                // A Linker must come right after the skipped pairs; otherwise this rule does not apply and `it` is left untouched.
+                if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Linker)) {
+                    ++it_copy;
+                    skip_incb_extend_linker_sequence(it_copy);
+                    // On a match, jump to the closing Consonant so the code points in between never reach the rules below.
+                    if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Consonant)) {
+                        next_code_point = *it_copy;
+                        it = it_copy;
+                        continue;
+                    }
+                }
+            }
+
             // GB11
             if (code_point_has_property(code_point, Property::Extended_Pictographic) && has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) {
                 auto it_copy = it;