LibUnicode: Update to Unicode version 15.1.0

https://unicode.org/versions/Unicode15.1.0/

This update includes a new code point property, Indic Conjunct Break
(InCB), whose value may be Consonant, Linker, or Extend. It is used in
text segmentation to prevent breaking within some extended grapheme
cluster sequences.
This commit is contained in:
Timothy Flynn 2023-09-15 09:23:56 -04:00 committed by Andreas Kling
parent ae15b68b79
commit 139c575cc9
Notes: sideshowbarker 2024-07-17 07:16:27 +09:00
4 changed files with 70 additions and 2 deletions

View File

@ -1,6 +1,6 @@
include(${CMAKE_CURRENT_LIST_DIR}/utils.cmake)
set(UCD_VERSION 15.0.0)
set(UCD_VERSION 15.1.0)
set(UCD_PATH "${SERENITY_CACHE_DIR}/UCD" CACHE PATH "Download location for UCD files")
set(UCD_VERSION_FILE "${UCD_PATH}/version.txt")

View File

@ -375,7 +375,21 @@ static ErrorOr<void> parse_prop_list(Core::InputBufferedFile& file, PropList& pr
line = line.substring_view(0, *index);
auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
VERIFY(segments.size() == 2);
VERIFY(segments.size() == 2 || segments.size() == 3);
String combined_segment_buffer;
if (segments.size() == 3) {
// For example, in DerivedCoreProperties.txt, there are lines such as:
//
// 094D ; InCB; Linker # Mn DEVANAGARI SIGN VIRAMA
//
// These are used in text segmentation to prevent breaking within some extended grapheme clusters.
// So here, we combine the segments into a single property, which allows us to simply do code point
// property lookups at runtime for specific Indic Conjunct Break sequences.
combined_segment_buffer = MUST(String::join('_', Array { segments[1].trim_whitespace(), segments[2].trim_whitespace() }));
segments[1] = combined_segment_buffer;
}
auto code_point_range = parse_code_point_range(segments[0].trim_whitespace());
Vector<StringView> properties;

View File

@ -51,6 +51,27 @@ TEST_CASE(grapheme_segmentation)
test_grapheme_segmentation("a👩🏼👨🏻b"sv, { 0u, 1u, 29u, 30u });
}
// Exercises grapheme segmentation of Indic conjunct sequences (the consonant / linker / extend handling).
// The expected values are consistent with UTF-8 byte offsets: each \uXXXX escape used here encodes to
// 3 bytes and 'a' to 1 byte, so e.g. "\u0915a" yields boundaries { 0, 3, 4 }.
// U+094D is DEVANAGARI SIGN VIRAMA, an InCB Linker. Presumably U+0915 / U+0916 are InCB Consonants and
// U+09BC / U+09CD form an (InCB Extend, InCB Linker) pair - TODO confirm against DerivedCoreProperties.txt.
TEST_CASE(grapheme_segmentation_indic_conjunct_break)
{
// Without a linker between them, every code point is its own segment.
test_grapheme_segmentation("\u0915"sv, { 0u, 3u });
test_grapheme_segmentation("\u0915a"sv, { 0u, 3u, 4u });
test_grapheme_segmentation("\u0915\u0916"sv, { 0u, 3u, 6u });
// A lone linker between the two outer code points keeps the whole string in one segment.
test_grapheme_segmentation("\u0915\u094D\u0916"sv, { 0u, 9u });
// One U+09BC U+09CD pair placed before or after the U+094D linker.
test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u0916"sv, { 0u, 15u });
test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u0916"sv, { 0u, 15u });
// Two pairs, in every arrangement around the U+094D linker.
test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 21u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u0916"sv, { 0u, 21u });
test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 21u });
// Three pairs split around the U+094D linker.
test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD"sv, { 0u, 27u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 27u });
// Four pairs, two on each side of the U+094D linker.
test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 33u });
}
template<size_t N>
static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N])
{

View File

@ -57,6 +57,22 @@ static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewTy
return (code_point_has_grapheme_break_property(code_point, properties) || ...);
};
// Advances `it` over zero or more back-to-back (InCB_Extend, InCB_Linker) code point pairs. It stops at the
// first position that does not begin such a pair, and leaves `it` untouched when no pair is present.
// NOTE(review): UAX #29 writes this part of GB9c as `[ \p{InCB=Extend} \p{InCB=Linker} ]*`, which reads as a
// set (any run of Extend or Linker code points) rather than strict Extend-then-Linker pairs - confirm
// against the specification before relying on the pair interpretation.
auto skip_incb_extend_linker_sequence = [&](auto& it) {
    while (it != view.end() && code_point_has_property(*it, Property::InCB_Extend)) {
        // Peek at the code point following the Extend without committing to it yet.
        auto linker_it = it;
        ++linker_it;

        bool const followed_by_linker = linker_it != view.end() && code_point_has_property(*linker_it, Property::InCB_Linker);
        if (!followed_by_linker)
            break;

        // Consume the whole pair by stepping past the Linker.
        ++linker_it;
        it = linker_it;
    }
};
// GB1
if (callback(0) == IterationDecision::Break)
return;
@ -70,6 +86,23 @@ static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewTy
for (++it; it != view.end(); ++it, code_point = next_code_point) {
next_code_point = *it;
// GB9c
// Do not break between two InCB_Consonant code points joined by a sequence of the form matched below:
// Consonant (Extend Linker)* Linker (Extend Linker)* Consonant.
if (code_point_has_property(code_point, Property::InCB_Consonant)) {
// Scan ahead on a copy so that `it` only moves if the whole sequence matches.
auto it_copy = it;
skip_incb_extend_linker_sequence(it_copy);
// A standalone Linker is required after the optional leading (Extend, Linker) pairs.
if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Linker)) {
++it_copy;
skip_incb_extend_linker_sequence(it_copy);
if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Consonant)) {
// Matched: jump to the trailing consonant and skip the remaining rules for this iteration. On
// `continue`, the loop header advances `it` past that consonant and makes it the current
// `code_point`, so a further conjunct starting at it can be matched on the next iteration.
next_code_point = *it_copy;
it = it_copy;
continue;
}
}
}
// GB11
if (code_point_has_property(code_point, Property::Extended_Pictographic) && has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) {
auto it_copy = it;