diff --git a/Tests/AK/TestString.cpp b/Tests/AK/TestString.cpp index 38799ddb61a..6c40eba4887 100644 --- a/Tests/AK/TestString.cpp +++ b/Tests/AK/TestString.cpp @@ -211,7 +211,7 @@ TEST_CASE(to_titlecase) { auto string = MUST(String::from_utf8("f\"oo\" b'ar'"sv)); auto result = MUST(string.to_titlecase()); - EXPECT_EQ(result, "F\"Oo\" B'Ar'"sv); + EXPECT_EQ(result, "F\"Oo\" B'ar'"sv); } { auto string = MUST(String::from_utf8("123dollars"sv)); diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp index cc13963e272..211b4369df6 100644 --- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp +++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp @@ -93,7 +93,7 @@ TEST_CASE(to_unicode_titlecase) EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo bar baz"sv)), "Foo Bar Baz"sv); EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo \n \r bar \t baz"sv)), "Foo \n \r Bar \t Baz"sv); - EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("f\"oo\" b'ar'"sv)), "F\"Oo\" B'Ar'"sv); + EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("f\"oo\" b'ar'"sv)), "F\"Oo\" B'ar'"sv); EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("123dollars"sv)), "123Dollars"sv); } diff --git a/Userland/Libraries/LibJS/Tests/builtins/Intl/Segmenter/Segmenter.prototype.segment.js b/Userland/Libraries/LibJS/Tests/builtins/Intl/Segmenter/Segmenter.prototype.segment.js index 3db51e4e462..a1a527642c8 100644 --- a/Userland/Libraries/LibJS/Tests/builtins/Intl/Segmenter/Segmenter.prototype.segment.js +++ b/Userland/Libraries/LibJS/Tests/builtins/Intl/Segmenter/Segmenter.prototype.segment.js @@ -82,7 +82,6 @@ describe("correct behavior", () => { ]; index = 0; for (const segment of wordSegments) { - console.log(JSON.stringify(segment)); expect(segment.segment).toBe(expectedSegments[index].segment); expect(segment.index).toBe(expectedSegments[index].index); expect(segment.input).toBe(string); @@ -103,4 +102,45 @@ describe("correct behavior", () => { } expect(index).toBe(1); }); + + test("word segmentation of string with mid-word punctuation", () => { + const string = "The quick (“brown”) fox can’t jump 32.3 feet, right?"; + + const segmenter = new Intl.Segmenter([], { granularity: "word" }); + const segments = segmenter.segment(string); + + const expectedSegments = [ + { segment: "The", index: 0, isWordLike: true }, + { segment: " ", index: 3, isWordLike: false }, + { segment: "quick", index: 4, isWordLike: true }, + { segment: " ", index: 9, isWordLike: false }, + { segment: "(", index: 10, isWordLike: false }, + { segment: "“", index: 11, isWordLike: false }, + { segment: "brown", index: 12, isWordLike: true }, + { segment: "”", index: 17, isWordLike: false }, + { segment: ")", index: 18, isWordLike: false }, + { segment: " ", index: 19, isWordLike: false }, + { segment: "fox", index: 20, isWordLike: true }, + { segment: " ", index: 23, isWordLike: false }, + { segment: "can’t", index: 24, isWordLike: true }, + { segment: " ", index: 29, isWordLike: false }, + { segment: "jump", index: 30, isWordLike: true }, + { segment: " ", index: 34, isWordLike: false }, + { segment: "32.3", index: 35, isWordLike: true }, + { segment: " ", index: 39, isWordLike: false }, + { segment: "feet", index: 40, isWordLike: true }, + { segment: ",", index: 44, isWordLike: false }, + { segment: " ", index: 45, isWordLike: false }, + { segment: "right", index: 46, isWordLike: true }, + { segment: "?", index: 51, isWordLike: false }, + ]; + + let index = 0; + for (const segment of segments) { + expect(segment.segment).toBe(expectedSegments[index].segment); + expect(segment.index).toBe(expectedSegments[index].index); + expect(segment.input).toBe(string); + index++; + } + }); }); diff --git a/Userland/Libraries/LibUnicode/Segmentation.cpp b/Userland/Libraries/LibUnicode/Segmentation.cpp index f843f1f5a06..47715efe845 100644 --- a/Userland/Libraries/LibUnicode/Segmentation.cpp +++ b/Userland/Libraries/LibUnicode/Segmentation.cpp @@ -215,7 +215,7 @@ static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType c auto it_copy = it; ++it_copy; if (it_copy != view.end()) - next_next_code_point = *it; + next_next_code_point = *it_copy; } bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter); bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter)); @@ -256,7 +256,7 @@ static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType c if (code_point_is_numeric && next_code_point_is_ah_letter) continue; - auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(code_point, WBP::Numeric); + auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Numeric); // WB11 if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))