diff --git a/Meta/CMake/unicode_data.cmake b/Meta/CMake/unicode_data.cmake index ba43f3dc185..dca7f69fe41 100644 --- a/Meta/CMake/unicode_data.cmake +++ b/Meta/CMake/unicode_data.cmake @@ -1,6 +1,6 @@ include(${CMAKE_CURRENT_LIST_DIR}/utils.cmake) -set(UCD_VERSION 15.0.0) +set(UCD_VERSION 15.1.0) set(UCD_PATH "${SERENITY_CACHE_DIR}/UCD" CACHE PATH "Download location for UCD files") set(UCD_VERSION_FILE "${UCD_PATH}/version.txt") diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index 196410a50b5..36afc1fda1d 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -375,7 +375,21 @@ static ErrorOr parse_prop_list(Core::InputBufferedFile& file, PropList& pr line = line.substring_view(0, *index); auto segments = line.split_view(';', SplitBehavior::KeepEmpty); - VERIFY(segments.size() == 2); + VERIFY(segments.size() == 2 || segments.size() == 3); + + String combined_segment_buffer; + + if (segments.size() == 3) { + // For example, in DerivedCoreProperties.txt, there are lines such as: + // + // 094D ; InCB; Linker # Mn DEVANAGARI SIGN VIRAMA + // + // These are used in text segmentation to prevent breaking within some extended grapheme clusters. + // So here, we combine the segments into a single property, which allows us to simply do code point + // property lookups at runtime for specific Indic Conjunct Break sequences. + combined_segment_buffer = MUST(String::join('_', Array { segments[1].trim_whitespace(), segments[2].trim_whitespace() })); + segments[1] = combined_segment_buffer; + } auto code_point_range = parse_code_point_range(segments[0].trim_whitespace()); Vector properties; diff --git a/Tests/LibUnicode/TestSegmentation.cpp b/Tests/LibUnicode/TestSegmentation.cpp index 1159ae875b4..675ce060fe7 100644 --- a/Tests/LibUnicode/TestSegmentation.cpp +++ b/Tests/LibUnicode/TestSegmentation.cpp @@ -51,6 +51,27 @@ TEST_CASE(grapheme_segmentation) test_grapheme_segmentation("a👩🏼‍❤️‍👨🏻b"sv, { 0u, 1u, 29u, 30u }); } +TEST_CASE(grapheme_segmentation_indic_conjunct_break) +{ + test_grapheme_segmentation("\u0915"sv, { 0u, 3u }); + test_grapheme_segmentation("\u0915a"sv, { 0u, 3u, 4u }); + test_grapheme_segmentation("\u0915\u0916"sv, { 0u, 3u, 6u }); + + test_grapheme_segmentation("\u0915\u094D\u0916"sv, { 0u, 9u }); + + test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u0916"sv, { 0u, 15u }); + test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u0916"sv, { 0u, 15u }); + + test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 21u }); + test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u0916"sv, { 0u, 21u }); + test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 21u }); + + test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 27u }); + test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 27u }); + + test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 33u }); +} + template static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N]) { diff --git a/Userland/Libraries/LibUnicode/Segmentation.cpp b/Userland/Libraries/LibUnicode/Segmentation.cpp index 750f52907ac..74a8dedfb40 100644 --- a/Userland/Libraries/LibUnicode/Segmentation.cpp +++ b/Userland/Libraries/LibUnicode/Segmentation.cpp @@ -57,6 +57,22 @@ static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewTy return (code_point_has_grapheme_break_property(code_point, properties) || ...); }; + auto skip_incb_extend_linker_sequence = [&](auto& it) { + while (true) { + if (it == view.end() || !code_point_has_property(*it, Property::InCB_Extend)) + return; + + auto next_it = it; + ++next_it; + + if (next_it == view.end() || !code_point_has_property(*next_it, Property::InCB_Linker)) + return; + + it = next_it; + ++it; + } + }; + // GB1 if (callback(0) == IterationDecision::Break) return; @@ -70,6 +86,23 @@ static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewTy for (++it; it != view.end(); ++it, code_point = next_code_point) { next_code_point = *it; + // GB9c + if (code_point_has_property(code_point, Property::InCB_Consonant)) { + auto it_copy = it; + skip_incb_extend_linker_sequence(it_copy); + + if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Linker)) { + ++it_copy; + skip_incb_extend_linker_sequence(it_copy); + + if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Consonant)) { + next_code_point = *it_copy; + it = it_copy; + continue; + } + } + } + // GB11 if (code_point_has_property(code_point, Property::Extended_Pictographic) && has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) { auto it_copy = it;