mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-08-15 08:20:39 +03:00
LibUnicode: Add a unit test for Unicode grapheme and word segmentation
These include tests for previously broken boundary conditions.
This commit is contained in:
parent
fa96811a22
commit
73239fdd82
Notes:
sideshowbarker
2024-07-17 14:36:19 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/73239fdd82 Pull-request: https://github.com/SerenityOS/serenity/pull/17624 Reviewed-by: https://github.com/linusg ✅
@ -1,5 +1,6 @@
|
||||
# Source files collected into TEST_SOURCES. Presumably each entry is built as a
# LibUnicode test; the command that consumes this list is not visible in this hunk.
set(TEST_SOURCES
|
||||
TestEmoji.cpp
|
||||
TestSegmentation.cpp
|
||||
TestUnicodeCharacterTypes.cpp
|
||||
TestUnicodeNormalization.cpp
|
||||
)
|
||||
|
101
Tests/LibUnicode/TestSegmentation.cpp
Normal file
101
Tests/LibUnicode/TestSegmentation.cpp
Normal file
@ -0,0 +1,101 @@
|
||||
/*
|
||||
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <AK/Array.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <AK/Vector.h>
|
||||
#include <LibTest/TestCase.h>
|
||||
#include <LibUnicode/Segmentation.h>
|
||||
|
||||
template<size_t N>
|
||||
static void test_grapheme_segmentation(StringView string, size_t const (&expected_boundaries)[N])
|
||||
{
|
||||
Vector<size_t> boundaries;
|
||||
Utf8View view { string };
|
||||
|
||||
Unicode::for_each_grapheme_segmentation_boundary(view, [&](auto boundary) {
|
||||
boundaries.append(boundary);
|
||||
return IterationDecision::Continue;
|
||||
});
|
||||
|
||||
EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
|
||||
}
|
||||
|
||||
// Checks the boundaries reported by Unicode::for_each_grapheme_segmentation_boundary.
// The expected values in the multi-byte cases below are byte offsets into the UTF-8 string.
TEST_CASE(grapheme_segmentation)
{
    // An empty view must not report any boundary: the callback aborts if it is ever invoked.
    Unicode::for_each_grapheme_segmentation_boundary(Utf8View {}, [&](auto) {
        VERIFY_NOT_REACHED();
        // Only present so the lambda has a deducible IterationDecision return type.
        return IterationDecision::Break;
    });

    // Plain ASCII: a boundary before and after every character.
    test_grapheme_segmentation("a"sv, { 0u, 1u });
    test_grapheme_segmentation("ab"sv, { 0u, 1u, 2u });
    test_grapheme_segmentation("abc"sv, { 0u, 1u, 2u, 3u });

    // Line breaks: "\n\r" is split between the two characters, "\r\n" is kept as one segment.
    test_grapheme_segmentation("a\nb"sv, { 0u, 1u, 2u, 3u });
    test_grapheme_segmentation("a\n\rb"sv, { 0u, 1u, 2u, 3u, 4u });
    test_grapheme_segmentation("a\r\nb"sv, { 0u, 1u, 3u, 4u });

    // Hangul: a lone 3-byte character is its own segment; the two-character sequences
    // (2 x 3 bytes) are expected to stay together as a single segment.
    test_grapheme_segmentation("aᄀb"sv, { 0u, 1u, 4u, 5u });
    test_grapheme_segmentation("aᄀᄀb"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀᆢb"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀ가b"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀ각b"sv, { 0u, 1u, 7u, 8u });

    // Emoji. The invisible joiners are spelled as escapes so the byte offsets can be checked
    // by reading the source: U+200D (zero width joiner) and U+FE0F (variation selector-16)
    // are 3 bytes each in UTF-8, every emoji / skin tone modifier below is 4 bytes, and
    // U+2764 (heavy black heart) is 3 bytes.
    test_grapheme_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });
    // 1 + (4 * 4 emoji) + (3 * 3 joiners) = 26
    test_grapheme_segmentation("a👨\u200D👩\u200D👧\u200D👦b"sv, { 0u, 1u, 26u, 27u });
    // 1 + (4 + 4) + 3 + (3 + 3) + 3 + (4 + 4) = 29
    test_grapheme_segmentation("a👩🏼\u200D❤\uFE0F\u200D👨🏻b"sv, { 0u, 1u, 29u, 30u });
}
|
||||
|
||||
template<size_t N>
|
||||
static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N])
|
||||
{
|
||||
Vector<size_t> boundaries;
|
||||
Utf8View view { string };
|
||||
|
||||
Unicode::for_each_word_segmentation_boundary(view, [&](auto boundary) {
|
||||
boundaries.append(boundary);
|
||||
return IterationDecision::Continue;
|
||||
});
|
||||
|
||||
EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
|
||||
}
|
||||
|
||||
// Checks the boundaries reported by Unicode::for_each_word_segmentation_boundary.
// The expected values in the multi-byte cases below are byte offsets into the UTF-8 string.
TEST_CASE(word_segmentation)
{
    // An empty view must not report any boundary: the callback aborts if it is ever invoked.
    Unicode::for_each_word_segmentation_boundary(Utf8View {}, [&](auto) {
        VERIFY_NOT_REACHED();
        // Only present so the lambda has a deducible IterationDecision return type.
        return IterationDecision::Break;
    });

    // A run of letters is a single word.
    test_word_segmentation("a"sv, { 0u, 1u });
    test_word_segmentation("ab"sv, { 0u, 2u });
    test_word_segmentation("abc"sv, { 0u, 3u });

    // Whitespace separates words and forms its own segment.
    test_word_segmentation("ab cd"sv, { 0u, 2u, 3u, 5u });
    // Two consecutive spaces: the expected offsets (2 -> 4, end at 6) require a 6-byte
    // string in which both spaces form one segment.
    test_word_segmentation("ab  cd"sv, { 0u, 2u, 4u, 6u });
    test_word_segmentation("ab\tcd"sv, { 0u, 2u, 3u, 5u });
    test_word_segmentation("ab\ncd"sv, { 0u, 2u, 3u, 5u });
    // "\n\r" is split between the two characters, "\r\n" is kept as one segment.
    test_word_segmentation("ab\n\rcd"sv, { 0u, 2u, 3u, 4u, 6u });
    test_word_segmentation("ab\r\ncd"sv, { 0u, 2u, 4u, 6u });

    // Emoji. The invisible joiners are spelled as escapes so the byte offsets can be checked
    // by reading the source: U+200D (zero width joiner) and U+FE0F (variation selector-16)
    // are 3 bytes each in UTF-8, every emoji / skin tone modifier below is 4 bytes, and
    // U+2764 (heavy black heart) is 3 bytes.
    test_word_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });
    // 1 + (4 * 4 emoji) + (3 * 3 joiners) = 26
    test_word_segmentation("a👨\u200D👩\u200D👧\u200D👦b"sv, { 0u, 1u, 26u, 27u });
    // 1 + (4 + 4) + 3 + (3 + 3) + 3 + (4 + 4) = 29
    test_word_segmentation("a👩🏼\u200D❤\uFE0F\u200D👨🏻b"sv, { 0u, 1u, 29u, 30u });

    // Digits, and '.' between digits or between letters, stay inside one word.
    test_word_segmentation("ab 12 cd"sv, { 0u, 2u, 3u, 5u, 6u, 8u });
    test_word_segmentation("ab 1.2 cd"sv, { 0u, 2u, 3u, 6u, 7u, 9u });
    test_word_segmentation("ab 12.34 cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
    test_word_segmentation("ab example.com cd"sv, { 0u, 2u, 3u, 14u, 15u, 17u });

    // An apostrophe between letters stays inside the word; surrounding quotes are split off.
    test_word_segmentation("ab can't cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
    test_word_segmentation("ab \"can't\" cd"sv, { 0u, 2u, 3u, 4u, 9u, 10u, 11u, 13u });

    // Full sentence mixing the cases above; the curly quotes and the curly apostrophe are
    // 3 bytes each in UTF-8.
    test_word_segmentation(
        "The quick (“brown”) fox can’t jump 32.3 feet, right?"sv,
        { 0u, 3u, 4u, 9u, 10u, 11u, 14u, 19u, 22u, 23u, 24u, 27u, 28u, 35u, 36u, 40u, 41u, 45u, 46u, 50u, 51u, 52u, 57u, 58u });
}
|
Loading…
Reference in New Issue
Block a user