LibUnicode: Parse and generate emoji code point data

According to TR #51, the "best definition of the full set [of emojis] is
in the emoji-test.txt file". This defines not only the emoji themselves,
but the order in which they should be displayed, and what "group" of
emojis they belong to.
This commit is contained in:
Timothy Flynn 2022-09-07 13:39:31 -04:00 committed by Linus Groh
parent fff79379d4
commit b61eca0a1e
Notes: sideshowbarker 2024-07-17 07:20:39 +09:00
7 changed files with 351 additions and 0 deletions

View File

@ -92,12 +92,18 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
# Default paths of the generated Unicode and emoji sources, and the target prefix that is
# later handed to invoke_generator().
set(UNICODE_DATA_HEADER LibUnicode/UnicodeData.h)
set(UNICODE_DATA_IMPLEMENTATION LibUnicode/UnicodeData.cpp)
set(EMOJI_DATA_HEADER LibUnicode/EmojiData.h)
set(EMOJI_DATA_IMPLEMENTATION LibUnicode/EmojiData.cpp)
set(UNICODE_META_TARGET_PREFIX LibUnicode_)
# When the current binary directory already ends in /LibUnicode, drop the directory
# component from the paths and use an empty target prefix.
if (CMAKE_CURRENT_BINARY_DIR MATCHES ".*/LibUnicode") # Serenity build.
set(UNICODE_DATA_HEADER UnicodeData.h)
set(UNICODE_DATA_IMPLEMENTATION UnicodeData.cpp)
set(EMOJI_DATA_HEADER EmojiData.h)
set(EMOJI_DATA_IMPLEMENTATION EmojiData.cpp)
set(UNICODE_META_TARGET_PREFIX "")
endif()
@ -110,6 +116,15 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
"${UNICODE_DATA_IMPLEMENTATION}"
arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -k "${BLOCKS_PATH}" -e "${EMOJI_DATA_PATH}" -m "${NAME_ALIAS_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}"
)
# Run the GenerateEmojiData tool to produce the emoji header/implementation pair.
# The tool's -e option receives the path to emoji-test.txt (EMOJI_TEST_PATH).
invoke_generator(
"EmojiData"
Lagom::GenerateEmojiData
"${UCD_VERSION_FILE}"
"${UNICODE_META_TARGET_PREFIX}"
"${EMOJI_DATA_HEADER}"
"${EMOJI_DATA_IMPLEMENTATION}"
arguments -e "${EMOJI_TEST_PATH}"
)
if (CMAKE_CURRENT_BINARY_DIR MATCHES ".*/LibUnicode") # Serenity build.
add_custom_command(
@ -128,5 +143,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
# All generated sources; the emoji header/implementation are listed alongside the
# Unicode data ones so that consumers of UNICODE_DATA_SOURCES pick up both.
set(UNICODE_DATA_SOURCES
${UNICODE_DATA_HEADER}
${UNICODE_DATA_IMPLEMENTATION}
${EMOJI_DATA_HEADER}
${EMOJI_DATA_IMPLEMENTATION}
)
endif()

View File

@ -1 +1,2 @@
# Code generator tools; each is built from a single source file and links LibMain.
lagom_tool(GenerateUnicodeData SOURCES GenerateUnicodeData.cpp LIBS LibMain)
lagom_tool(GenerateEmojiData SOURCES GenerateEmojiData.cpp LIBS LibMain)

View File

@ -0,0 +1,220 @@
/*
* Copyright (c) 2022, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include "GeneratorUtil.h"
#include <AK/SourceGenerator.h>
#include <AK/String.h>
#include <AK/StringUtils.h>
#include <AK/Types.h>
#include <LibCore/ArgsParser.h>
#include <LibCore/Stream.h>
#include <LibUnicode/Emoji.h>
// Index type used for strings held in UniqueStringStorage, and the spelling of that type
// that is substituted into the generated source. The two must name the same type.
using StringIndexType = u16;
constexpr auto s_string_index_type = "u16"sv;
// One emoji entry collected by parse_emoji_test_data().
struct Emoji {
// Index of the title-cased emoji name within EmojiData::unique_strings.
StringIndexType name { 0 };
// Group set by the most recent "# group: " header that preceded the entry.
Unicode::EmojiGroup group;
// Zero-based position among the entries kept by the parser.
u32 display_order { 0 };
// The entry's code point fields joined with '_'; used as the suffix of the generated
// per-emoji array name (s_<code_points_name>).
String code_points_name;
// The parsed code point values of the entry.
Vector<u32> code_points;
};
// Everything the generator collects from the input file before emitting code.
struct EmojiData {
// Deduplicated storage for emoji names; Emoji::name holds indices into it.
UniqueStringStorage<StringIndexType> unique_strings;
// Parsed entries, in the order they were kept by the parser.
Vector<Emoji> emojis;
};
// Reads the emoji test file line by line and appends every "fully-qualified" entry to
// emoji_data.
//
// Recognized lines:
//   - Empty lines are skipped.
//   - Lines starting with "# group: " set the group applied to subsequent entries; any other
//     line starting with '#' is skipped.
//   - Data lines are consumed as
//         <hex code points> ; <status> # <field> <field> <name>
//     i.e. the name is whatever follows the third space after the '#'.
//
// Returns an error if reading from the file or growing a container fails. Malformed data
// lines (missing ';' or '#', non-hex code points, too few fields, or an entry that appears
// before any group header) trip a VERIFY.
static ErrorOr<void> parse_emoji_test_data(Core::Stream::BufferedFile& file, EmojiData& emoji_data)
{
    static constexpr auto group_header = "# group: "sv;

    Array<u8, 1024> buffer;

    // No group is known until the first group header has been read. Tracking this explicitly
    // avoids copying an indeterminate enum value into an entry.
    Optional<Unicode::EmojiGroup> group;
    u32 display_order { 0 };

    while (TRY(file.can_read_line())) {
        auto line = TRY(file.read_line(buffer));
        if (line.is_empty())
            continue;

        if (line.starts_with('#')) {
            if (line.starts_with(group_header)) {
                auto name = line.substring_view(group_header.length());
                group = Unicode::emoji_group_from_string(name);
            }

            continue;
        }

        auto status_index = line.find(';');
        VERIFY(status_index.has_value());

        auto emoji_and_name_index = line.find('#', *status_index);
        VERIFY(emoji_and_name_index.has_value());

        // FIXME: Should we keep non-fully-qualified emoji? TR #51 states this is implementation defined.
        auto status = line.substring_view(*status_index + 1, *emoji_and_name_index - *status_index - 1).trim_whitespace();
        if (status != "fully-qualified"sv)
            continue;

        // Every kept entry must belong to a group announced by an earlier header.
        VERIFY(group.has_value());

        Emoji emoji {};
        emoji.group = *group;
        emoji.display_order = display_order++;

        auto code_points = line.substring_view(0, *status_index).split_view(' ');
        TRY(emoji.code_points.try_ensure_capacity(code_points.size()));

        for (auto code_point : code_points) {
            auto value = AK::StringUtils::convert_to_uint_from_hex<u32>(code_point);
            VERIFY(value.has_value());

            // Capacity was reserved above, so the unchecked append cannot overflow.
            emoji.code_points.unchecked_append(*value);
        }

        // Skip the two fields that follow the '#'; the remainder of the line is the name.
        auto emoji_and_name = line.substring_view(*emoji_and_name_index + 1);
        auto emoji_and_name_spaces = emoji_and_name.find_all(" "sv);
        VERIFY(emoji_and_name_spaces.size() > 2);

        auto name = emoji_and_name.substring_view(emoji_and_name_spaces[2]).trim_whitespace();
        emoji.name = emoji_data.unique_strings.ensure(name.to_titlecase_string());
        emoji.code_points_name = String::join('_', code_points);

        TRY(emoji_data.emojis.try_append(move(emoji)));
    }

    return {};
}
// Writes the generated emoji data header to `file`. Nothing is appended to the generator
// here, so the contents written are whatever an untouched SourceGenerator yields.
// Returns any error reported by the write.
static ErrorOr<void> generate_emoji_data_header(Core::Stream::BufferedFile& file, EmojiData const&)
{
    StringBuilder output;
    SourceGenerator header { output };

    auto contents = header.as_string_view();
    TRY(file.write(contents.bytes()));

    return {};
}
// Writes the generated emoji data implementation to `file`. The emitted source consists of,
// in order: the includes and namespace opening, the unique string table, the EmojiData
// record type, one static code point array per emoji, the s_emojis table referencing those
// arrays, and a find_emoji_for_code_points() that scans s_emojis linearly.
// Returns any error reported by the write.
static ErrorOr<void> generate_emoji_data_implementation(Core::Stream::BufferedFile& file, EmojiData const& emoji_data)
{
    StringBuilder output;
    SourceGenerator source { output };

    source.set("emojis_size"sv, String::number(emoji_data.emojis.size()));
    source.set("string_index_type"sv, s_string_index_type);

    source.append(R"~~~(
#include <AK/Array.h>
#include <AK/BinarySearch.h>
#include <AK/Span.h>
#include <AK/StringView.h>
#include <AK/Types.h>
#include <LibUnicode/Emoji.h>
#include <LibUnicode/EmojiData.h>
namespace Unicode {
)~~~");

    emoji_data.unique_strings.generate(source);

    source.append(R"~~~(
struct EmojiData {
constexpr Emoji to_unicode_emoji() const
{
Emoji emoji {};
emoji.name = decode_string(name);
emoji.group = static_cast<EmojiGroup>(group);
emoji.display_order = display_order;
emoji.code_points = code_points;
return emoji;
}
@string_index_type@ name { 0 };
u8 group { 0 };
u32 display_order { 0 };
Span<u32 const> code_points;
};
)~~~");

    // One static array per emoji, named after its '_'-joined code point fields.
    for (auto const& entry : emoji_data.emojis) {
        source.set("size"sv, String::number(entry.code_points.size()));
        source.set("name"sv, entry.code_points_name);

        source.append(R"~~~(
static constexpr Array<u32, @size@> s_@name@ { {)~~~");

        for (size_t i = 0; i < entry.code_points.size(); ++i) {
            // The first value is preceded by a single space, later ones by ", ".
            auto separator = (i == 0) ? " "sv : ", "sv;
            u32 const code_point = entry.code_points[i];

            source.append(separator);
            source.append(String::formatted("{:#x}", code_point));
        }

        source.append(" } };"sv);
    }

    source.append(R"~~~(
static constexpr Array<EmojiData, @emojis_size@> s_emojis { {)~~~");

    // One table row per emoji; "name" now carries the string table index rather than the
    // array suffix used above.
    for (auto const& entry : emoji_data.emojis) {
        source.set("code_points_name"sv, entry.code_points_name);
        source.set("display_order"sv, String::number(entry.display_order));
        source.set("group"sv, String::number(to_underlying(entry.group)));
        source.set("name"sv, String::number(entry.name));

        source.append(R"~~~(
{ @name@, @group@, @display_order@, s_@code_points_name@ },)~~~");
    }

    source.append(R"~~~(
} };
Optional<Emoji> find_emoji_for_code_points(Span<u32 const> code_points)
{
for (auto& emoji : s_emojis) {
if (emoji.code_points == code_points)
return emoji.to_unicode_emoji();
}
return {};
}
}
)~~~");

    auto contents = source.as_string_view();
    TRY(file.write(contents.bytes()));

    return {};
}
// Entry point of the emoji data generator: parses the command line, opens the two output
// files and the emoji test input, parses the input, and emits the header and implementation.
// Returns 0 on success, or the first error encountered.
ErrorOr<int> serenity_main(Main::Arguments arguments)
{
    StringView header_path;
    StringView implementation_path;
    StringView test_data_path;

    Core::ArgsParser parser;
    parser.add_option(header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path");
    parser.add_option(implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
    parser.add_option(test_data_path, "Path to emoji-test.txt file", "emoji-test-path", 'e', "emoji-test-path");
    parser.parse(arguments);

    // The outputs are opened before the input, in this order.
    auto header_file = TRY(open_file(header_path, Core::Stream::OpenMode::Write));
    auto implementation_file = TRY(open_file(implementation_path, Core::Stream::OpenMode::Write));
    auto test_data_file = TRY(open_file(test_data_path, Core::Stream::OpenMode::Read));

    EmojiData parsed {};
    TRY(parse_emoji_test_data(*test_data_file, parsed));

    TRY(generate_emoji_data_header(*header_file, parsed));
    TRY(generate_emoji_data_implementation(*implementation_file, parsed));

    return 0;
}

View File

@ -3,6 +3,7 @@ include(${SerenityOS_SOURCE_DIR}/Meta/CMake/unicode_data.cmake)
# Library sources. NOTE(review): UNICODE_DATA_SOURCES is presumably populated by the
# included unicode_data.cmake — confirm.
set(SOURCES
CharacterTypes.cpp
CurrencyCode.cpp
Emoji.cpp
${UNICODE_DATA_SOURCES}
)

View File

@ -0,0 +1,13 @@
/*
* Copyright (c) 2022, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibUnicode/Emoji.h>
namespace Unicode {

// Weak fallback definition that reports no match for any input.
// NOTE(review): presumably the non-weak definition emitted into the generated EmojiData.cpp
// takes precedence at link time whenever the generated sources are part of the build — confirm.
Optional<Emoji> __attribute__((weak)) find_emoji_for_code_points(Span<u32 const>)
{
    return {};
}

}

View File

@ -0,0 +1,97 @@
/*
* Copyright (c) 2022, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Optional.h>
#include <AK/StringView.h>
#include <AK/Types.h>
namespace Unicode {
// Emoji groups known to this library; emoji_group_to_string() and emoji_group_from_string()
// map each enumerator to and from its display name.
// The emoji data generator writes the underlying numeric value of these enumerators into the
// generated table, so changing their order changes the generated data.
enum class EmojiGroup : u8 {
SmileysAndEmotion,
PeopleAndBody,
Component,
AnimalsAndNature,
FoodAndDrink,
TravelAndPlaces,
Activities,
Objects,
Symbols,
Flags,
};
// Description of a single emoji, as returned by find_emoji_for_code_points().
struct Emoji {
// Name of the emoji.
StringView name;
// Group the emoji belongs to.
EmojiGroup group;
// Position of the emoji in display order.
u32 display_order { 0 };
// View over the emoji's code point sequence; this struct does not hold the values itself.
Span<u32 const> code_points;
};
// Looks up the emoji whose code point sequence equals `code_points`.
// Returns an empty Optional when there is no such emoji.
Optional<Emoji> find_emoji_for_code_points(Span<u32 const> code_points);

// Convenience overload accepting a built-in array of code points; forwards to the Span
// overload above.
template<size_t Size>
Optional<Emoji> find_emoji_for_code_points(u32 const (&code_points)[Size])
{
    Span<u32 const> sequence { code_points };
    return find_emoji_for_code_points(sequence);
}
// Returns the display name of `group`. A value that matches none of the enumerators reaches
// VERIFY_NOT_REACHED().
constexpr StringView emoji_group_to_string(EmojiGroup group)
{
    // Kept as a switch without a default label so that every enumerator is spelled out here.
    // clang-format off
    switch (group) {
    case EmojiGroup::SmileysAndEmotion: return "Smileys & Emotion"sv;
    case EmojiGroup::PeopleAndBody:     return "People & Body"sv;
    case EmojiGroup::Component:         return "Component"sv;
    case EmojiGroup::AnimalsAndNature:  return "Animals & Nature"sv;
    case EmojiGroup::FoodAndDrink:      return "Food & Drink"sv;
    case EmojiGroup::TravelAndPlaces:   return "Travel & Places"sv;
    case EmojiGroup::Activities:        return "Activities"sv;
    case EmojiGroup::Objects:           return "Objects"sv;
    case EmojiGroup::Symbols:           return "Symbols"sv;
    case EmojiGroup::Flags:             return "Flags"sv;
    }
    // clang-format on

    VERIFY_NOT_REACHED();
}
// Returns the group whose display name equals `group` exactly. A string that matches none of
// the known names reaches VERIFY_NOT_REACHED().
constexpr EmojiGroup emoji_group_from_string(StringView group)
{
    struct NamedGroup {
        StringView name;
        EmojiGroup value;
    };

    // Candidates are compared in the order listed.
    constexpr NamedGroup known_groups[] = {
        { "Smileys & Emotion"sv, EmojiGroup::SmileysAndEmotion },
        { "People & Body"sv, EmojiGroup::PeopleAndBody },
        { "Component"sv, EmojiGroup::Component },
        { "Animals & Nature"sv, EmojiGroup::AnimalsAndNature },
        { "Food & Drink"sv, EmojiGroup::FoodAndDrink },
        { "Travel & Places"sv, EmojiGroup::TravelAndPlaces },
        { "Activities"sv, EmojiGroup::Activities },
        { "Objects"sv, EmojiGroup::Objects },
        { "Symbols"sv, EmojiGroup::Symbols },
        { "Flags"sv, EmojiGroup::Flags },
    };

    for (auto const& candidate : known_groups) {
        if (group == candidate.name)
            return candidate.value;
    }

    VERIFY_NOT_REACHED();
}
}

View File

@ -11,6 +11,7 @@
namespace Unicode {
enum class Block : u16;
enum class EmojiGroup : u8;
enum class GeneralCategory : u8;
enum class GraphemeBreakProperty : u8;
enum class Property : u8;
@ -19,6 +20,7 @@ enum class SentenceBreakProperty : u8;
enum class WordBreakProperty : u8;
struct CurrencyCode;
struct Emoji;
struct SpecialCasing;
}