From 1484d3d9f51e3ccade772931131e7d998933382e Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Thu, 23 Feb 2023 08:33:22 -0500 Subject: [PATCH] LibUnicode: Add a method to check if a code point could start an emoji --- Tests/LibUnicode/CMakeLists.txt | 1 + Tests/LibUnicode/TestEmoji.cpp | 54 ++++++++++++ Userland/Libraries/LibUnicode/Emoji.cpp | 104 +++++++++++++++++++++++- Userland/Libraries/LibUnicode/Emoji.h | 6 +- 4 files changed, 163 insertions(+), 2 deletions(-) create mode 100644 Tests/LibUnicode/TestEmoji.cpp diff --git a/Tests/LibUnicode/CMakeLists.txt b/Tests/LibUnicode/CMakeLists.txt index 088ddce8bf3..0b43421ce27 100644 --- a/Tests/LibUnicode/CMakeLists.txt +++ b/Tests/LibUnicode/CMakeLists.txt @@ -1,4 +1,5 @@ set(TEST_SOURCES + TestEmoji.cpp TestUnicodeCharacterTypes.cpp TestUnicodeNormalization.cpp ) diff --git a/Tests/LibUnicode/TestEmoji.cpp b/Tests/LibUnicode/TestEmoji.cpp new file mode 100644 index 00000000000..591d02445bd --- /dev/null +++ b/Tests/LibUnicode/TestEmoji.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include +#include +#include +#include + +// These emojis are the first subgroup in each Unicode-defined group of emojis, plus some interesting +// hand-picked test cases (such as keycap emoji, which begin with ASCII symbols, and country flags). 
+static constexpr auto s_smileys_emotion = Array { "๐Ÿ˜€"sv, "๐Ÿ˜ƒ"sv, "๐Ÿ˜„"sv, "๐Ÿ˜"sv, "๐Ÿ˜†"sv, "๐Ÿ˜…"sv, "๐Ÿคฃ"sv, "๐Ÿ˜‚"sv, "๐Ÿ™‚"sv, "๐Ÿ™ƒ"sv, "๐Ÿซ "sv, "๐Ÿ˜‰"sv, "๐Ÿ˜Š"sv, "๐Ÿ˜‡"sv }; +static constexpr auto s_people_body = Array { "๐Ÿ‘‹"sv, "๐Ÿคš"sv, "๐Ÿ–๏ธ"sv, "๐Ÿ–"sv, "โœ‹"sv, "๐Ÿซฑ"sv, "๐Ÿซฒ"sv, "๐Ÿซณ"sv, "๐Ÿซด"sv, "๐Ÿซท"sv, "๐Ÿซธ"sv }; +static constexpr auto s_animals_nature = Array { "๐Ÿถ"sv, "๐Ÿ•"sv, "๐Ÿ•โ€๐Ÿฆบ"sv, "๐Ÿฉ"sv, "๐ŸฆŠ"sv, "๐Ÿฆ"sv, "๐Ÿฑ"sv, "๐Ÿˆ"sv, "๐Ÿˆโ€โฌ›"sv, "๐Ÿฆ"sv, "๐Ÿฏ"sv, "๐Ÿด"sv, "๐ŸซŽ"sv, "๐Ÿซ"sv, "๐ŸŽ"sv, "๐Ÿฆ„"sv, "๐Ÿฆ“"sv, "๐ŸฆŒ"sv, "๐Ÿฆฌ"sv, "๐Ÿฎ"sv, "๐Ÿท"sv, "๐Ÿ–"sv, "๐Ÿ—"sv, "๐Ÿฝ"sv, "๐Ÿ‘"sv, "๐Ÿฆ™"sv, "๐Ÿฆ’"sv, "๐Ÿ˜"sv, "๐Ÿญ"sv, "๐Ÿ"sv, "๐Ÿ€"sv, "๐Ÿฐ"sv, "๐Ÿ‡"sv, "๐Ÿฟ๏ธ"sv, "๐Ÿฟ"sv, "๐Ÿฆ”"sv, "๐Ÿฆ‡"sv, "๐Ÿป"sv, "๐Ÿปโ€โ„๏ธ"sv, "๐Ÿปโ€โ„"sv, "๐Ÿจ"sv, "๐Ÿผ"sv, "๐Ÿฆฅ"sv, "๐Ÿฆ˜"sv, "๐Ÿฆก"sv, "๐Ÿพ"sv }; +static constexpr auto s_food_drink = Array { "๐Ÿ‡"sv, "๐Ÿˆ"sv, "๐Ÿ‰"sv, "๐ŸŠ"sv, "๐Ÿ‹"sv, "๐ŸŒ"sv, "๐Ÿ"sv, "๐Ÿฅญ"sv, "๐ŸŽ"sv, "๐Ÿ"sv, "๐Ÿ"sv, "๐Ÿ‘"sv, "๐Ÿ’"sv, "๐Ÿ“"sv, "๐Ÿซ"sv, "๐Ÿฅ"sv, "๐Ÿ…"sv, "๐Ÿซ’"sv, "๐Ÿฅฅ"sv }; +static constexpr auto s_travel_places = Array { "๐ŸŒ"sv, "๐ŸŒŽ"sv, "๐ŸŒ"sv, "๐ŸŒ"sv, "๐Ÿ—บ๏ธ"sv, "๐Ÿ—บ"sv, "๐Ÿ—พ"sv, "๐Ÿงญ"sv }; +static constexpr auto s_activities = Array { "๐ŸŽƒ"sv, "๐ŸŽ„"sv, "๐ŸŽ†"sv, "๐ŸŽ‡"sv, "๐Ÿงจ"sv, "โœจ"sv, "๐ŸŽˆ"sv, "๐ŸŽ‰"sv, "๐ŸŽŠ"sv, "๐ŸŽ‹"sv, "๐ŸŽ"sv, "๐ŸŽ"sv, "๐ŸŽ‘"sv, "๐ŸŽ€"sv, "๐ŸŽ"sv, "๐ŸŽ—๏ธ"sv, "๐ŸŽ—"sv, "๐ŸŽŸ๏ธ"sv, "๐ŸŽŸ"sv, "๐ŸŽซ"sv }; +static constexpr auto s_objects = Array { "๐Ÿ‘“"sv, "๐Ÿ•ถ๏ธ"sv, "๐Ÿ•ถ"sv, "๐Ÿฆบ"sv, "๐Ÿ‘”"sv, "๐Ÿ‘–"sv, "๐Ÿงฆ"sv, "๐Ÿ‘—"sv, "๐Ÿฅป"sv, "๐Ÿฉฑ"sv, "๐Ÿฉฒ"sv, "๐Ÿฉณ"sv, "๐Ÿ‘™"sv, "๐Ÿชญ"sv, "๐Ÿ‘›"sv, "๐Ÿ‘œ"sv, "๐Ÿ›๏ธ"sv, "๐Ÿ›"sv, "๐Ÿฉด"sv, "๐Ÿ‘ก"sv, "๐Ÿ‘ข"sv, "๐Ÿชฎ"sv, "๐Ÿ‘‘"sv, "๐ŸŽฉ"sv, "๐ŸŽ“"sv, "๐Ÿช–"sv, "โ›‘๏ธ"sv, "โ›‘"sv, "๐Ÿ’„"sv, "๐Ÿ’"sv, "๐Ÿ’Ž"sv }; +static constexpr auto s_symbols = Array { "๐Ÿšฎ"sv, "๐Ÿšฐ"sv, "โ™ฟ"sv, "๐Ÿšน"sv, "๐Ÿšบ"sv, "๐Ÿšพ"sv, "๐Ÿ›‚"sv, 
"๐Ÿ›ƒ"sv, "๐Ÿ›„"sv, "๐Ÿ›…"sv, "#๏ธโƒฃ"sv, "#โƒฃ"sv, "*๏ธโƒฃ"sv, "*โƒฃ"sv, "0๏ธโƒฃ"sv, "0โƒฃ"sv, "1๏ธโƒฃ"sv, "1โƒฃ"sv, "2๏ธโƒฃ"sv, "2โƒฃ"sv, "3๏ธโƒฃ"sv, "3โƒฃ"sv, "4๏ธโƒฃ"sv, "4โƒฃ"sv, "5๏ธโƒฃ"sv, "5โƒฃ"sv, "6๏ธโƒฃ"sv, "6โƒฃ"sv, "7๏ธโƒฃ"sv, "7โƒฃ"sv, "8๏ธโƒฃ"sv, "8โƒฃ"sv, "9๏ธโƒฃ"sv, "9โƒฃ"sv, "๐Ÿ”Ÿ"sv }; +static constexpr auto s_flags = Array { "๐Ÿ"sv, "๐Ÿšฉ"sv, "๐ŸŽŒ"sv, "๐Ÿด"sv, "๐Ÿณ๏ธ"sv, "๐Ÿณ"sv, "๐Ÿณ๏ธโ€๐ŸŒˆ"sv, "๐Ÿณโ€๐ŸŒˆ"sv, "๐Ÿณ๏ธโ€โšง๏ธ"sv, "๐Ÿณโ€โšง๏ธ"sv, "๐Ÿณ๏ธโ€โšง"sv, "๐Ÿณโ€โšง"sv, "๐Ÿดโ€โ˜ ๏ธ"sv, "๐Ÿดโ€โ˜ "sv, "๐Ÿ‡ฆ๐Ÿ‡จ"sv, "๐Ÿ‡ฆ๐Ÿ‡ฉ"sv, "๐Ÿ‡ฆ๐Ÿ‡ช"sv, "๐Ÿ‡ฆ๐Ÿ‡ซ"sv, "๐Ÿ‡ฆ๐Ÿ‡ฌ"sv, "๐Ÿ‡ฆ๐Ÿ‡ฎ"sv, "๐Ÿ‡ฆ๐Ÿ‡ฑ"sv, "๐Ÿ‡ฆ๐Ÿ‡ฒ"sv, "๐Ÿ‡ฆ๐Ÿ‡ด"sv, "๐Ÿ‡ฆ๐Ÿ‡ถ"sv, "๐Ÿ‡ฆ๐Ÿ‡ท"sv, "๐Ÿ‡ฆ๐Ÿ‡ธ"sv, "๐Ÿ‡ฆ๐Ÿ‡น"sv, "๐Ÿ‡ฆ๐Ÿ‡บ"sv, "๐Ÿ‡ฆ๐Ÿ‡ผ"sv, "๐Ÿ‡ฆ๐Ÿ‡ฝ"sv, "๐Ÿ‡ฆ๐Ÿ‡ฟ"sv, "๐Ÿ‡ง๐Ÿ‡ฆ"sv, "๐Ÿ‡ง๐Ÿ‡ง"sv, "๐Ÿ‡ง๐Ÿ‡ฉ"sv, "๐Ÿ‡ง๐Ÿ‡ช"sv, "๐Ÿ‡ง๐Ÿ‡ซ"sv, "๐Ÿ‡ง๐Ÿ‡ฌ"sv, "๐Ÿ‡ง๐Ÿ‡ญ"sv, "๐Ÿ‡ง๐Ÿ‡ฎ"sv, "๐Ÿ‡ง๐Ÿ‡ฏ"sv, "๐Ÿ‡ง๐Ÿ‡ฑ"sv, "๐Ÿ‡ง๐Ÿ‡ฒ"sv, "๐Ÿ‡ง๐Ÿ‡ณ"sv, "๐Ÿ‡ง๐Ÿ‡ด"sv, "๐Ÿ‡ง๐Ÿ‡ถ"sv, "๐Ÿ‡ง๐Ÿ‡ท"sv, "๐Ÿ‡ง๐Ÿ‡ธ"sv }; + +TEST_CASE(emoji) +{ + auto test_emojis = [](auto const& emojis) { + for (auto emoji : emojis) { + Utf8View view { emoji }; + EXPECT(Unicode::could_be_start_of_emoji_sequence(view.begin())); + } + }; + + test_emojis(s_smileys_emotion); + test_emojis(s_people_body); + test_emojis(s_animals_nature); + test_emojis(s_food_drink); + test_emojis(s_travel_places); + test_emojis(s_activities); + test_emojis(s_objects); + test_emojis(s_symbols); + test_emojis(s_flags); +} + +TEST_CASE(ascii_is_not_emoji) +{ + for (u32 code_point = 0u; is_ascii(code_point); ++code_point) { + auto string = String::from_code_point(code_point); + Utf8View view { string }; + + EXPECT(!Unicode::could_be_start_of_emoji_sequence(view.begin())); + } +} diff --git a/Userland/Libraries/LibUnicode/Emoji.cpp b/Userland/Libraries/LibUnicode/Emoji.cpp index 4322342fdf9..32e45d93516 100644 --- a/Userland/Libraries/LibUnicode/Emoji.cpp +++ b/Userland/Libraries/LibUnicode/Emoji.cpp @@ -1,13 
+1,115 @@ /* - * Copyright (c) 2022, Tim Flynn + * Copyright (c) 2022-2023, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ +#include +#include +#include +#include #include +#if ENABLE_UNICODE_DATA +# include +#endif + namespace Unicode { Optional __attribute__((weak)) find_emoji_for_code_points(ReadonlySpan) { return {}; } +#if ENABLE_UNICODE_DATA + +// https://unicode.org/reports/tr51/#def_emoji_core_sequence +static bool could_be_start_of_emoji_core_sequence(u32 code_point, Optional const& next_code_point) +{ + // emoji_core_sequence := emoji_character | emoji_presentation_sequence | emoji_keycap_sequence | emoji_modifier_sequence | emoji_flag_sequence + + static constexpr auto emoji_presentation_selector = 0xFE0Fu; + static constexpr auto combining_enclosing_keycap = 0x20E3u; + + // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence + // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3} + if (is_ascii_digit(code_point) || code_point == '#' || code_point == '*') + return next_code_point == emoji_presentation_selector || next_code_point == combining_enclosing_keycap; + + // A little non-standard, but all other ASCII code points are not the beginning of any emoji sequence. 
+ if (is_ascii(code_point)) + return false; + + // https://unicode.org/reports/tr51/#def_emoji_character + if (code_point_has_property(code_point, Property::Emoji)) + return true; + + // https://unicode.org/reports/tr51/#def_emoji_presentation_sequence + // emoji_presentation_sequence := emoji_character emoji_presentation_selector + if (next_code_point == emoji_presentation_selector) + return true; + + // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence + // emoji_modifier_sequence := emoji_modifier_base emoji_modifier + if (code_point_has_property(code_point, Property::Emoji_Modifier_Base)) + return true; + + // https://unicode.org/reports/tr51/#def_emoji_flag_sequence + // emoji_flag_sequence := regional_indicator regional_indicator + if (code_point_has_property(code_point, Property::Regional_Indicator)) + return true; + + return false; +} + +static bool could_be_start_of_serenity_emoji(u32 code_point) +{ + // We use Supplementary Private Use Area-B for custom Serenity emoji, starting at U+10CD00. + static constexpr auto first_custom_serenity_emoji_code_point = 0x10CD00u; + + return code_point >= first_custom_serenity_emoji_code_point; +} + +#endif + +// https://unicode.org/reports/tr51/#def_emoji_sequence +template +static bool could_be_start_of_emoji_sequence_impl(CodePointIterator const& it) +{ + // emoji_sequence := emoji_core_sequence | emoji_zwj_sequence | emoji_tag_sequence + + if (it.done()) + return false; + +#if ENABLE_UNICODE_DATA + // The purpose of this method is to quickly filter out code points that cannot be the start of + // an emoji. 
+ an emoji. The emoji_core_sequence definition alone captures the start of all possible + // emoji_zwj_sequence and emoji_tag_sequence emojis, because: + // + // * emoji_zwj_sequence must begin with emoji_zwj_element, which is: + // emoji_zwj_element := emoji_core_sequence | emoji_tag_sequence + // + // * emoji_tag_sequence must begin with tag_base, which is: + // tag_base := emoji_character | emoji_modifier_sequence | emoji_presentation_sequence + // Note that this is a subset of emoji_core_sequence. + auto code_point = *it; + auto next_code_point = it.peek(1); + + if (could_be_start_of_emoji_core_sequence(code_point, next_code_point)) + return true; + if (could_be_start_of_serenity_emoji(code_point)) + return true; + return false; +#else + return true; +#endif +} + +bool could_be_start_of_emoji_sequence(Utf8CodePointIterator const& it) +{ + return could_be_start_of_emoji_sequence_impl(it); +} + +bool could_be_start_of_emoji_sequence(Utf32CodePointIterator const& it) +{ + return could_be_start_of_emoji_sequence_impl(it); +} + } diff --git a/Userland/Libraries/LibUnicode/Emoji.h b/Userland/Libraries/LibUnicode/Emoji.h index dcb137ba0a0..7e772e9c3ef 100644 --- a/Userland/Libraries/LibUnicode/Emoji.h +++ b/Userland/Libraries/LibUnicode/Emoji.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 2022, Tim Flynn + * Copyright (c) 2022-2023, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ #pragma once +#include #include #include #include @@ -46,6 +47,9 @@ Optional find_emoji_for_code_points(u32 const (&code_points)[Size]) return find_emoji_for_code_points(ReadonlySpan { code_points }); } +bool could_be_start_of_emoji_sequence(Utf8CodePointIterator const&); +bool could_be_start_of_emoji_sequence(Utf32CodePointIterator const&); + constexpr StringView emoji_group_to_string(EmojiGroup group) { switch (group) {