LibUnicode: Add an overload of word segmentation for UTF-8 strings

This commit is contained in:
Timothy Flynn 2023-01-16 09:39:12 -05:00 committed by Tim Flynn
parent 40cb41a16c
commit 6d710eeb43
Notes: sideshowbarker 2024-07-17 02:57:43 +09:00
2 changed files with 40 additions and 6 deletions

View File

@ -227,24 +227,46 @@ Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View
#endif #endif
} }
Vector<size_t> find_word_segmentation_boundaries([[maybe_unused]] Utf16View const& view) template<typename ViewType>
static Vector<size_t> find_word_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view)
{ {
#if ENABLE_UNICODE_DATA #if ENABLE_UNICODE_DATA
using WBP = WordBreakProperty; using WBP = WordBreakProperty;
Vector<size_t> boundaries; Vector<size_t> boundaries;
// https://www.unicode.org/reports/tr29/#Word_Boundary_Rules // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
if (view.length_in_code_points() == 0) if (view.is_empty())
return boundaries; return boundaries;
auto has_any_wbp = [](u32 code_point, auto&&... properties) { auto has_any_wbp = [](u32 code_point, auto&&... properties) {
return (code_point_has_word_break_property(code_point, properties) || ...); return (code_point_has_word_break_property(code_point, properties) || ...);
}; };
size_t code_unit_length = 0;
size_t code_point_length = 0;
if constexpr (requires { view.byte_length(); }) {
code_unit_length = view.byte_length();
code_point_length = view.length();
} else if constexpr (requires { view.length_in_code_units(); }) {
code_unit_length = view.length_in_code_units();
code_point_length = view.length_in_code_points();
} else {
static_assert(DependentFalse<ViewType>);
}
auto code_unit_offset_of = [&](auto it) {
if constexpr (requires { view.byte_offset_of(it); })
return view.byte_offset_of(it);
else if constexpr (requires { view.code_unit_offset_of(it); })
return view.code_unit_offset_of(it);
VERIFY_NOT_REACHED();
};
// WB1 // WB1
boundaries.append(0); boundaries.append(0);
if (view.length_in_code_points() > 1) { if (code_point_length > 1) {
auto it = view.begin(); auto it = view.begin();
auto code_point = *it; auto code_point = *it;
u32 next_code_point; u32 next_code_point;
@ -262,7 +284,7 @@ Vector<size_t> find_word_segmentation_boundaries([[maybe_unused]] Utf16View cons
continue; continue;
// WB3a, WB3b // WB3a, WB3b
if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) { if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
boundaries.append(view.code_unit_offset_of(it)); boundaries.append(code_unit_offset_of(it));
continue; continue;
} }
// WB3c // WB3c
@ -367,18 +389,28 @@ Vector<size_t> find_word_segmentation_boundaries([[maybe_unused]] Utf16View cons
continue; continue;
// WB999 // WB999
boundaries.append(view.code_unit_offset_of(it)); boundaries.append(code_unit_offset_of(it));
} }
} }
// WB2 // WB2
boundaries.append(view.length_in_code_units()); boundaries.append(code_unit_length);
return boundaries; return boundaries;
#else #else
return {}; return {};
#endif #endif
} }
// UTF-8 entry point for word segmentation. All of the work happens in the
// shared templated implementation; this overload only pins the view type and
// hands back the boundary list that implementation produces.
// NOTE(review): the offsets are presumably byte offsets for a Utf8View (the
// implementation probes for byte_length()/byte_offset_of()) - confirm against
// Utf8View's API.
Vector<size_t> find_word_segmentation_boundaries(Utf8View const& view)
{
    auto boundaries = find_word_segmentation_boundaries_impl<Utf8View>(view);
    return boundaries;
}
// UTF-16 entry point for word segmentation. Delegates to the shared templated
// implementation, instantiated for Utf16View, and returns its boundary list
// unchanged.
// NOTE(review): the offsets are presumably UTF-16 code unit offsets (the
// implementation probes for length_in_code_units()/code_unit_offset_of()) -
// confirm against Utf16View's API.
Vector<size_t> find_word_segmentation_boundaries(Utf16View const& view)
{
    auto boundaries = find_word_segmentation_boundaries_impl<Utf16View>(view);
    return boundaries;
}
Vector<size_t> find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View const& view) Vector<size_t> find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
{ {
#if ENABLE_UNICODE_DATA #if ENABLE_UNICODE_DATA

View File

@ -11,6 +11,7 @@
#include <AK/Optional.h> #include <AK/Optional.h>
#include <AK/Span.h> #include <AK/Span.h>
#include <AK/Types.h> #include <AK/Types.h>
#include <AK/Vector.h>
#include <LibUnicode/Forward.h> #include <LibUnicode/Forward.h>
namespace Unicode { namespace Unicode {
@ -60,6 +61,7 @@ bool code_point_has_word_break_property(u32 code_point, WordBreakProperty proper
bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property); bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property);
Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const&); Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const&);
Vector<size_t> find_word_segmentation_boundaries(Utf8View const&);
Vector<size_t> find_word_segmentation_boundaries(Utf16View const&); Vector<size_t> find_word_segmentation_boundaries(Utf16View const&);
Vector<size_t> find_sentence_segmentation_boundaries(Utf16View const&); Vector<size_t> find_sentence_segmentation_boundaries(Utf16View const&);