mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-01-05 01:55:21 +03:00
LibUnicode: Add an overload of word segmentation for UTF-8 strings
This commit is contained in:
parent
40cb41a16c
commit
6d710eeb43
Notes:
sideshowbarker
2024-07-17 02:57:43 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/6d710eeb43 Pull-request: https://github.com/SerenityOS/serenity/pull/17048 Reviewed-by: https://github.com/linusg ✅
@ -227,24 +227,46 @@ Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
Vector<size_t> find_word_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
|
template<typename ViewType>
|
||||||
|
static Vector<size_t> find_word_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view)
|
||||||
{
|
{
|
||||||
#if ENABLE_UNICODE_DATA
|
#if ENABLE_UNICODE_DATA
|
||||||
using WBP = WordBreakProperty;
|
using WBP = WordBreakProperty;
|
||||||
Vector<size_t> boundaries;
|
Vector<size_t> boundaries;
|
||||||
|
|
||||||
// https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
|
// https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
|
||||||
if (view.length_in_code_points() == 0)
|
if (view.is_empty())
|
||||||
return boundaries;
|
return boundaries;
|
||||||
|
|
||||||
auto has_any_wbp = [](u32 code_point, auto&&... properties) {
|
auto has_any_wbp = [](u32 code_point, auto&&... properties) {
|
||||||
return (code_point_has_word_break_property(code_point, properties) || ...);
|
return (code_point_has_word_break_property(code_point, properties) || ...);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
size_t code_unit_length = 0;
|
||||||
|
size_t code_point_length = 0;
|
||||||
|
|
||||||
|
if constexpr (requires { view.byte_length(); }) {
|
||||||
|
code_unit_length = view.byte_length();
|
||||||
|
code_point_length = view.length();
|
||||||
|
} else if constexpr (requires { view.length_in_code_units(); }) {
|
||||||
|
code_unit_length = view.length_in_code_units();
|
||||||
|
code_point_length = view.length_in_code_points();
|
||||||
|
} else {
|
||||||
|
static_assert(DependentFalse<ViewType>);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto code_unit_offset_of = [&](auto it) {
|
||||||
|
if constexpr (requires { view.byte_offset_of(it); })
|
||||||
|
return view.byte_offset_of(it);
|
||||||
|
else if constexpr (requires { view.code_unit_offset_of(it); })
|
||||||
|
return view.code_unit_offset_of(it);
|
||||||
|
VERIFY_NOT_REACHED();
|
||||||
|
};
|
||||||
|
|
||||||
// WB1
|
// WB1
|
||||||
boundaries.append(0);
|
boundaries.append(0);
|
||||||
|
|
||||||
if (view.length_in_code_points() > 1) {
|
if (code_point_length > 1) {
|
||||||
auto it = view.begin();
|
auto it = view.begin();
|
||||||
auto code_point = *it;
|
auto code_point = *it;
|
||||||
u32 next_code_point;
|
u32 next_code_point;
|
||||||
@ -262,7 +284,7 @@ Vector<size_t> find_word_segmentation_boundaries([[maybe_unused]] Utf16View cons
|
|||||||
continue;
|
continue;
|
||||||
// WB3a, WB3b
|
// WB3a, WB3b
|
||||||
if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
|
if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
|
||||||
boundaries.append(view.code_unit_offset_of(it));
|
boundaries.append(code_unit_offset_of(it));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// WB3c
|
// WB3c
|
||||||
@ -367,18 +389,28 @@ Vector<size_t> find_word_segmentation_boundaries([[maybe_unused]] Utf16View cons
|
|||||||
continue;
|
continue;
|
||||||
|
|
||||||
// WB999
|
// WB999
|
||||||
boundaries.append(view.code_unit_offset_of(it));
|
boundaries.append(code_unit_offset_of(it));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// WB2
|
// WB2
|
||||||
boundaries.append(view.length_in_code_units());
|
boundaries.append(code_unit_length);
|
||||||
return boundaries;
|
return boundaries;
|
||||||
#else
|
#else
|
||||||
return {};
|
return {};
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Vector<size_t> find_word_segmentation_boundaries(Utf8View const& view)
|
||||||
|
{
|
||||||
|
return find_word_segmentation_boundaries_impl(view);
|
||||||
|
}
|
||||||
|
|
||||||
|
Vector<size_t> find_word_segmentation_boundaries(Utf16View const& view)
|
||||||
|
{
|
||||||
|
return find_word_segmentation_boundaries_impl(view);
|
||||||
|
}
|
||||||
|
|
||||||
Vector<size_t> find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
|
Vector<size_t> find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
|
||||||
{
|
{
|
||||||
#if ENABLE_UNICODE_DATA
|
#if ENABLE_UNICODE_DATA
|
||||||
|
@ -11,6 +11,7 @@
|
|||||||
#include <AK/Optional.h>
|
#include <AK/Optional.h>
|
||||||
#include <AK/Span.h>
|
#include <AK/Span.h>
|
||||||
#include <AK/Types.h>
|
#include <AK/Types.h>
|
||||||
|
#include <AK/Vector.h>
|
||||||
#include <LibUnicode/Forward.h>
|
#include <LibUnicode/Forward.h>
|
||||||
|
|
||||||
namespace Unicode {
|
namespace Unicode {
|
||||||
@ -60,6 +61,7 @@ bool code_point_has_word_break_property(u32 code_point, WordBreakProperty proper
|
|||||||
bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property);
|
bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property);
|
||||||
|
|
||||||
Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const&);
|
Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const&);
|
||||||
|
Vector<size_t> find_word_segmentation_boundaries(Utf8View const&);
|
||||||
Vector<size_t> find_word_segmentation_boundaries(Utf16View const&);
|
Vector<size_t> find_word_segmentation_boundaries(Utf16View const&);
|
||||||
Vector<size_t> find_sentence_segmentation_boundaries(Utf16View const&);
|
Vector<size_t> find_sentence_segmentation_boundaries(Utf16View const&);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user