From 5cbf05465183216a3ae30c2d434375e361df5313 Mon Sep 17 00:00:00 2001
From: Timothy Flynn <trflynn89@pm.me>
Date: Tue, 14 Feb 2023 13:56:19 -0500
Subject: [PATCH] LibUnicode: Fix typos causing text segmentation on mid-word
 punctuation

For example the words "can't" and "32.3" should not have boundaries
detected on the "'" and "." code points, respectively.

The String test cases fixed here are because "b'ar" is now considered
one word.
---
 Tests/AK/TestString.cpp                       |  2 +-
 .../LibUnicode/TestUnicodeCharacterTypes.cpp  |  2 +-
 .../Segmenter/Segmenter.prototype.segment.js  | 42 ++++++++++++++++++-
 .../Libraries/LibUnicode/Segmentation.cpp     |  4 +-
 4 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/Tests/AK/TestString.cpp b/Tests/AK/TestString.cpp
index 38799ddb61a..6c40eba4887 100644
--- a/Tests/AK/TestString.cpp
+++ b/Tests/AK/TestString.cpp
@@ -211,7 +211,7 @@ TEST_CASE(to_titlecase)
     {
         auto string = MUST(String::from_utf8("f\"oo\" b'ar'"sv));
         auto result = MUST(string.to_titlecase());
-        EXPECT_EQ(result, "F\"Oo\" B'Ar'"sv);
+        EXPECT_EQ(result, "F\"Oo\" B'ar'"sv);
     }
     {
         auto string = MUST(String::from_utf8("123dollars"sv));
diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
index cc13963e272..211b4369df6 100644
--- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
+++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
@@ -93,7 +93,7 @@ TEST_CASE(to_unicode_titlecase)
 
     EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo bar baz"sv)), "Foo Bar Baz"sv);
     EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo \n \r bar \t baz"sv)), "Foo \n \r Bar \t Baz"sv);
-    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("f\"oo\" b'ar'"sv)), "F\"Oo\" B'Ar'"sv);
+    EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("f\"oo\" b'ar'"sv)), "F\"Oo\" B'ar'"sv);
     EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("123dollars"sv)), "123Dollars"sv);
 }
 
diff --git a/Userland/Libraries/LibJS/Tests/builtins/Intl/Segmenter/Segmenter.prototype.segment.js b/Userland/Libraries/LibJS/Tests/builtins/Intl/Segmenter/Segmenter.prototype.segment.js
index 3db51e4e462..a1a527642c8 100644
--- a/Userland/Libraries/LibJS/Tests/builtins/Intl/Segmenter/Segmenter.prototype.segment.js
+++ b/Userland/Libraries/LibJS/Tests/builtins/Intl/Segmenter/Segmenter.prototype.segment.js
@@ -82,7 +82,6 @@ describe("correct behavior", () => {
         ];
         index = 0;
         for (const segment of wordSegments) {
-            console.log(JSON.stringify(segment));
             expect(segment.segment).toBe(expectedSegments[index].segment);
             expect(segment.index).toBe(expectedSegments[index].index);
             expect(segment.input).toBe(string);
@@ -103,4 +102,45 @@ describe("correct behavior", () => {
         }
         expect(index).toBe(1);
     });
+
+    test("word segmentation of string with mid-word punctuation", () => {
+        const string = "The quick (“brown”) fox can’t jump 32.3 feet, right?";
+
+        const segmenter = new Intl.Segmenter([], { granularity: "word" });
+        const segments = segmenter.segment(string);
+
+        const expectedSegments = [
+            { segment: "The", index: 0, isWordLike: true },
+            { segment: " ", index: 3, isWordLike: false },
+            { segment: "quick", index: 4, isWordLike: true },
+            { segment: " ", index: 9, isWordLike: false },
+            { segment: "(", index: 10, isWordLike: false },
+            { segment: "“", index: 11, isWordLike: false },
+            { segment: "brown", index: 12, isWordLike: true },
+            { segment: "”", index: 17, isWordLike: false },
+            { segment: ")", index: 18, isWordLike: false },
+            { segment: " ", index: 19, isWordLike: false },
+            { segment: "fox", index: 20, isWordLike: true },
+            { segment: " ", index: 23, isWordLike: false },
+            { segment: "can’t", index: 24, isWordLike: true },
+            { segment: " ", index: 29, isWordLike: false },
+            { segment: "jump", index: 30, isWordLike: true },
+            { segment: " ", index: 34, isWordLike: false },
+            { segment: "32.3", index: 35, isWordLike: true },
+            { segment: " ", index: 39, isWordLike: false },
+            { segment: "feet", index: 40, isWordLike: true },
+            { segment: ",", index: 44, isWordLike: false },
+            { segment: " ", index: 45, isWordLike: false },
+            { segment: "right", index: 46, isWordLike: true },
+            { segment: "?", index: 51, isWordLike: false },
+        ];
+
+        let index = 0;
+        for (const segment of segments) {
+            expect(segment.segment).toBe(expectedSegments[index].segment);
+            expect(segment.index).toBe(expectedSegments[index].index);
+            expect(segment.input).toBe(string);
+            index++;
+        }
+    });
 });
diff --git a/Userland/Libraries/LibUnicode/Segmentation.cpp b/Userland/Libraries/LibUnicode/Segmentation.cpp
index f843f1f5a06..47715efe845 100644
--- a/Userland/Libraries/LibUnicode/Segmentation.cpp
+++ b/Userland/Libraries/LibUnicode/Segmentation.cpp
@@ -215,7 +215,7 @@ static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType c
                 auto it_copy = it;
                 ++it_copy;
                 if (it_copy != view.end())
-                    next_next_code_point = *it;
+                    next_next_code_point = *it_copy;
             }
             bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter);
             bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter));
@@ -256,7 +256,7 @@ static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType c
             if (code_point_is_numeric && next_code_point_is_ah_letter)
                 continue;
 
-            auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(code_point, WBP::Numeric);
+            auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Numeric);
 
             // WB11
             if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))