From 4dda3edc9e86fb1769929e00e54bb4e41fa38877 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Sun, 25 Jul 2021 15:10:51 -0400 Subject: [PATCH] LibUnicode: Introduce a Unicode library for interacting with UCD files The Unicode standard publishes the Unicode Character Database (UCD) with information about every code point, such as each code point's upper case mapping. LibUnicode exists to download and parse UCD files at build time and to provide accessors to that data. As a start, LibUnicode includes upper- and lower-case code point converters. --- CMakeLists.txt | 1 + Tests/CMakeLists.txt | 1 + Tests/LibUnicode/CMakeLists.txt | 5 + .../LibUnicode/TestUnicodeCharacterTypes.cpp | 50 +++ Userland/Libraries/CMakeLists.txt | 1 + Userland/Libraries/LibUnicode/CMakeLists.txt | 9 + .../Libraries/LibUnicode/CharacterTypes.cpp | 43 +++ .../Libraries/LibUnicode/CharacterTypes.h | 16 + .../LibUnicode/CodeGenerators/CMakeLists.txt | 2 + .../CodeGenerators/GenerateUnicodeData.cpp | 309 ++++++++++++++++++ .../Libraries/LibUnicode/unicode_data.cmake | 36 ++ 11 files changed, 473 insertions(+) create mode 100644 Tests/LibUnicode/CMakeLists.txt create mode 100644 Tests/LibUnicode/TestUnicodeCharacterTypes.cpp create mode 100644 Userland/Libraries/LibUnicode/CMakeLists.txt create mode 100644 Userland/Libraries/LibUnicode/CharacterTypes.cpp create mode 100644 Userland/Libraries/LibUnicode/CharacterTypes.h create mode 100644 Userland/Libraries/LibUnicode/CodeGenerators/CMakeLists.txt create mode 100644 Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp create mode 100644 Userland/Libraries/LibUnicode/unicode_data.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index fa9af296aa2..90747b55ea6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -142,6 +142,7 @@ endif() add_subdirectory(Userland/DevTools/IPCCompiler) add_subdirectory(Userland/DevTools/StateMachineGenerator) +add_subdirectory(Userland/Libraries/LibUnicode/CodeGenerators) 
add_subdirectory(Userland/Libraries/LibWeb/CodeGenerators) add_subdirectory(Meta/CMake/ConfigureComponents) diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt index b9680ca8441..e22f1b8e6f2 100644 --- a/Tests/CMakeLists.txt +++ b/Tests/CMakeLists.txt @@ -12,6 +12,7 @@ add_subdirectory(LibM) add_subdirectory(LibPthread) add_subdirectory(LibRegex) add_subdirectory(LibSQL) +add_subdirectory(LibUnicode) add_subdirectory(LibWasm) add_subdirectory(LibWeb) if (${SERENITY_ARCH} STREQUAL "i686") diff --git a/Tests/LibUnicode/CMakeLists.txt b/Tests/LibUnicode/CMakeLists.txt new file mode 100644 index 00000000000..a8ef1ab9da3 --- /dev/null +++ b/Tests/LibUnicode/CMakeLists.txt @@ -0,0 +1,5 @@ +file(GLOB TEST_SOURCES CONFIGURE_DEPENDS "*.cpp") + +foreach(source ${TEST_SOURCES}) + serenity_test(${source} LibUnicode LIBS LibUnicode) +endforeach() diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp new file mode 100644 index 00000000000..bc5190c50cb --- /dev/null +++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include + +#include +#include + +/* Calls both functions with every value below 0x80 and expects equal results; on a mismatch the input value is also logged with dbgln. */ static void compare_to_ascii(auto& old_function, auto& new_function) +{ + i64 result1 = 0; + i64 result2 = 0; + + for (u32 i = 0; i < 0x80; ++i) { + EXPECT_EQ(result1 = old_function(i), result2 = new_function(i)); + if (result1 != result2) + dbgln("Function input value was {}.", i); + } +} + +/* Lowercasing: must agree with tolower below 0x80, map U+03A9 to U+03C9, leave U+03C9 alone, and return code points from 0x3400 to 0x4dbf (range-encoded rows) unchanged. */ TEST_CASE(to_unicode_lowercase) +{ + compare_to_ascii(tolower, Unicode::to_unicode_lowercase); + + EXPECT_EQ(Unicode::to_unicode_lowercase(0x03c9u), 0x03c9u); // "ω" to "ω" + EXPECT_EQ(Unicode::to_unicode_lowercase(0x03a9u), 0x03c9u); // "Ω" to "ω" + + // Code points encoded by ranges in UnicodeData.txt + EXPECT_EQ(Unicode::to_unicode_lowercase(0x3400u), 0x3400u); + EXPECT_EQ(Unicode::to_unicode_lowercase(0x3401u), 0x3401u); + EXPECT_EQ(Unicode::to_unicode_lowercase(0x3402u), 
0x3402u); + EXPECT_EQ(Unicode::to_unicode_lowercase(0x4dbfu), 0x4dbfu); +} + +/* Uppercasing: must agree with toupper below 0x80, map U+03C9 to U+03A9, leave U+03A9 alone, and return code points from 0x3400 to 0x4dbf (range-encoded rows) unchanged. */ TEST_CASE(to_unicode_uppercase) +{ + compare_to_ascii(toupper, Unicode::to_unicode_uppercase); + + EXPECT_EQ(Unicode::to_unicode_uppercase(0x03c9u), 0x03a9u); // "ω" to "Ω" + EXPECT_EQ(Unicode::to_unicode_uppercase(0x03a9u), 0x03a9u); // "Ω" to "Ω" + + // Code points encoded by ranges in UnicodeData.txt + EXPECT_EQ(Unicode::to_unicode_uppercase(0x3400u), 0x3400u); + EXPECT_EQ(Unicode::to_unicode_uppercase(0x3401u), 0x3401u); + EXPECT_EQ(Unicode::to_unicode_uppercase(0x3402u), 0x3402u); + EXPECT_EQ(Unicode::to_unicode_uppercase(0x4dbfu), 0x4dbfu); +} diff --git a/Userland/Libraries/CMakeLists.txt b/Userland/Libraries/CMakeLists.txt index 5299c46e8a9..56cdc75ed33 100644 --- a/Userland/Libraries/CMakeLists.txt +++ b/Userland/Libraries/CMakeLists.txt @@ -43,6 +43,7 @@ add_subdirectory(LibTextCodec) add_subdirectory(LibThreading) add_subdirectory(LibTLS) add_subdirectory(LibTTF) +add_subdirectory(LibUnicode) add_subdirectory(LibUSBDB) add_subdirectory(LibVideo) add_subdirectory(LibVT) diff --git a/Userland/Libraries/LibUnicode/CMakeLists.txt b/Userland/Libraries/LibUnicode/CMakeLists.txt new file mode 100644 index 00000000000..f0875b7de88 --- /dev/null +++ b/Userland/Libraries/LibUnicode/CMakeLists.txt @@ -0,0 +1,9 @@ +include(unicode_data.cmake) + +SET(SOURCES + ${UNICODE_DATA_SOURCES} + CharacterTypes.cpp +) + +serenity_lib(LibUnicode unicode) +target_link_libraries(LibUnicode LibCore) diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp new file mode 100644 index 00000000000..103284195da --- /dev/null +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2021, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include + +#if ENABLE_UNICODE_DATA +# include +#else +# include +#endif + +namespace Unicode { + +/* With ENABLE_UNICODE_DATA: returns simple_lowercase_mapping of the entry found by unicode_data_for_code_point(), or code_point unchanged when none is found. Without it: AK::to_ascii_lowercase(code_point). */ u32 to_unicode_lowercase(u32 code_point) +{ +#if 
ENABLE_UNICODE_DATA + auto unicode_data = unicode_data_for_code_point(code_point); + if (unicode_data.has_value()) + return unicode_data->simple_lowercase_mapping; + return code_point; +#else + return AK::to_ascii_lowercase(code_point); +#endif +} + +/* With ENABLE_UNICODE_DATA: returns simple_uppercase_mapping of the entry found by unicode_data_for_code_point(), or code_point unchanged when none is found. Without it: AK::to_ascii_uppercase(code_point). */ u32 to_unicode_uppercase(u32 code_point) +{ +#if ENABLE_UNICODE_DATA + auto unicode_data = unicode_data_for_code_point(code_point); + if (unicode_data.has_value()) + return unicode_data->simple_uppercase_mapping; + return code_point; +#else + return AK::to_ascii_uppercase(code_point); +#endif +} + +} diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h new file mode 100644 index 00000000000..d390d7a3d3f --- /dev/null +++ b/Userland/Libraries/LibUnicode/CharacterTypes.h @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2021, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include + +namespace Unicode { + +/* Lowercase mapping of a single code point; takes and returns a u32 code point. */ u32 to_unicode_lowercase(u32 code_point); +/* Uppercase mapping of a single code point; takes and returns a u32 code point. */ u32 to_unicode_uppercase(u32 code_point); + +} diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/CMakeLists.txt b/Userland/Libraries/LibUnicode/CodeGenerators/CMakeLists.txt new file mode 100644 index 00000000000..a5ae2371adc --- /dev/null +++ b/Userland/Libraries/LibUnicode/CodeGenerators/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(GenerateUnicodeData GenerateUnicodeData.cpp) +target_link_libraries(GenerateUnicodeData LagomCore) diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp new file mode 100644 index 00000000000..03461564919 --- /dev/null +++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2021, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// UnicodeData source: 
https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt +// Field descriptions: https://www.unicode.org/reports/tr44/tr44-13.html#UnicodeData.txt +/* One parsed row of UnicodeData.txt. index is the row's position in UnicodeData::code_point_data; the Optional members hold whatever the conversion helpers returned and may be empty. */ struct CodePointData { + u32 index { 0 }; + u32 code_point { 0 }; + String name; + String general_category; + u8 canonical_combining_class { 0 }; + String bidi_class; + String decomposition_type; + Optional numeric_value_decimal; + Optional numeric_value_digit; + Optional numeric_value_numeric; + bool bidi_mirrored { false }; + String unicode_1_name; + String iso_comment; + Optional simple_uppercase_mapping; + Optional simple_lowercase_mapping; + Optional simple_titlecase_mapping; +}; + +// Some code points are excluded from UnicodeData.txt, and instead are part of a "range" of code +// points, as indicated by the "name" field. For example: +// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; +// 4DBF;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; +/* index: position in code_point_data of the range's First row. first and last: the code points of its First and Last rows. */ struct CodePointRange { + u32 index; + u32 first; + u32 last; +}; + +/* Result of parsing: every row, the First/Last ranges found among them, and the code point that precedes the first gap between consecutive rows. */ struct UnicodeData { + Vector code_point_data; + Vector code_point_ranges; + u32 last_contiguous_code_point { 0 }; +}; + +/* Field names the generators emit; both append_field lambdas skip every name that is not listed here. */ static constexpr auto s_desired_fields = Array { + "simple_uppercase_mapping"sv, + "simple_lowercase_mapping"sv, +}; + +/* Parses UnicodeData.txt line by line. Empty lines are skipped; every other line must split into exactly 15 fields (VERIFY). Rows named as a range First or Last have the angle-bracket wrapper and suffix removed from name, and each First/Last pair is recorded in code_point_ranges. */ static UnicodeData parse_unicode_data(Core::File& file) +{ + UnicodeData unicode_data; + + Optional code_point_range_start; + Optional code_point_range_index; + + Optional last_contiguous_code_point; + u32 previous_code_point = 0; + + while (file.can_read_line()) { + auto line = file.read_line(); + if (line.is_empty()) + continue; + + /* presumably the second argument keeps empty fields, which the 15-field check below relies on; confirm against String::split */ auto segments = line.split(';', true); + VERIFY(segments.size() == 15); + + CodePointData data {}; + data.index = static_cast(unicode_data.code_point_data.size()); + data.code_point = AK::StringUtils::convert_to_uint_from_hex(segments[0]).value(); + data.name = move(segments[1]); + data.general_category = move(segments[2]); + data.canonical_combining_class = AK::StringUtils::convert_to_uint(segments[3]).value(); + data.bidi_class = move(segments[4]); + data.decomposition_type = 
move(segments[5]); + data.numeric_value_decimal = AK::StringUtils::convert_to_int(segments[6]); + data.numeric_value_digit = AK::StringUtils::convert_to_int(segments[7]); + /* NOTE(review): parsed as an integer like the two fields above; whether field 8 can carry non-integer text (and is then left empty) is not visible here, confirm against the UCD field description. */ data.numeric_value_numeric = AK::StringUtils::convert_to_int(segments[8]); + data.bidi_mirrored = segments[9] == "Y"sv; + data.unicode_1_name = move(segments[10]); + data.iso_comment = move(segments[11]); + data.simple_uppercase_mapping = AK::StringUtils::convert_to_uint_from_hex(segments[12]); + data.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex(segments[13]); + data.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex(segments[14]); + + if (data.name.starts_with("<"sv) && data.name.ends_with(", First>")) { + VERIFY(!code_point_range_start.has_value() && !code_point_range_index.has_value()); + + code_point_range_start = data.code_point; + code_point_range_index = data.index; + + /* 9 is 1 for the leading "<" plus 8 for the trailing ", First>" */ data.name = data.name.substring(1, data.name.length() - 9); + } else if (data.name.starts_with("<"sv) && data.name.ends_with(", Last>")) { + VERIFY(code_point_range_start.has_value() && code_point_range_index.has_value()); + + unicode_data.code_point_ranges.append({ *code_point_range_index, *code_point_range_start, data.code_point }); + /* 8 is 1 for the leading "<" plus 7 for the trailing ", Last>" */ data.name = data.name.substring(1, data.name.length() - 8); + + code_point_range_start.clear(); + code_point_range_index.clear(); + } else if ((data.code_point > 0) && (data.code_point - previous_code_point) != 1) { + /* only the first gap is recorded; rows handled by the two range branches above never reach this check */ if (!last_contiguous_code_point.has_value()) + last_contiguous_code_point = previous_code_point; + } + + previous_code_point = data.code_point; + unicode_data.code_point_data.append(move(data)); + } + + /* NOTE(review): dereferenced unconditionally; if the input contains no gap (or no rows at all) the Optional is still empty here. Confirm that failing at this point is the intended behavior. */ unicode_data.last_contiguous_code_point = *last_contiguous_code_point; + return unicode_data; +} + +/* Prints the generated header to stdout (outln): struct UnicodeData with code_point plus the members whose names are in s_desired_fields, and the declaration of unicode_data_for_code_point(). */ static void generate_unicode_data_header() +{ + StringBuilder builder; + SourceGenerator generator { builder }; + + generator.append(R"~~~( +#pragma once + +#include +#include + +namespace Unicode { + +struct UnicodeData { + u32 
code_point;)~~~"); + + /* Emits one member declaration "type name;", but only when name is listed in s_desired_fields. */ auto append_field = [&](StringView type, StringView name) { + if (!s_desired_fields.span().contains_slow(name)) + return; + + generator.set("type", type); + generator.set("name", name); + generator.append(R"~~~( + @type@ @name@;)~~~"); + }; + + // Note: For compile-time performance, only primitive types are used. + append_field("char const*"sv, "name"sv); + append_field("char const*"sv, "general_category"sv); + append_field("u8"sv, "canonical_combining_class"sv); + append_field("char const*"sv, "bidi_class"sv); + append_field("char const*"sv, "decomposition_type"sv); + append_field("i8"sv, "numeric_value_decimal"sv); + append_field("i8"sv, "numeric_value_digit"sv); + append_field("i8"sv, "numeric_value_numeric"sv); + append_field("bool"sv, "bidi_mirrored"sv); + append_field("char const*"sv, "unicode_1_name"sv); + append_field("char const*"sv, "iso_comment"sv); + append_field("u32"sv, "simple_uppercase_mapping"sv); + append_field("u32"sv, "simple_lowercase_mapping"sv); + append_field("u32"sv, "simple_titlecase_mapping"sv); + + /* this closing text goes through builder directly, unlike the generator.append() calls above */ builder.append(R"~~~( +}; + +Optional unicode_data_for_code_point(u32 code_point); + +} +)~~~"); + + outln("{}", generator.as_string_view()); +} + +/* Prints the generated implementation to stdout (outln): the s_unicode_data table with one initializer per parsed row, index_of_code_point_in_range() and unicode_data_for_code_point(). */ static void generate_unicode_data_implementation(UnicodeData unicode_data) +{ + StringBuilder builder; + SourceGenerator generator { builder }; + + generator.set("size", String::number(unicode_data.code_point_data.size())); + generator.set("last_contiguous_code_point", String::formatted("0x{:x}", unicode_data.last_contiguous_code_point)); + + generator.append(R"~~~( +#include +#include +#include +#include + +namespace Unicode { + +static constexpr Array s_unicode_data { {)~~~"); + + /* Appends ", value" to the row initializer being built, but only when name is listed in s_desired_fields. */ auto append_field = [&](StringView name, String value) { + if (!s_desired_fields.span().contains_slow(name)) + return; + + generator.set("value", move(value)); + generator.append(", @value@"); + }; + + /* one initializer per parsed row; the append_field calls follow the same field order as the member declarations in generate_unicode_data_header() */ for (auto const& data : unicode_data.code_point_data) { + generator.set("code_point", 
String::formatted("{:#x}", data.code_point)); + generator.append(R"~~~( + { @code_point@)~~~"); + + append_field("name", String::formatted("\"{}\"", data.name)); + append_field("general_category", String::formatted("\"{}\"", data.general_category)); + append_field("canonical_combining_class", String::number(data.canonical_combining_class)); + append_field("bidi_class", String::formatted("\"{}\"", data.bidi_class)); + append_field("decomposition_type", String::formatted("\"{}\"", data.decomposition_type)); + /* absent numeric values are written as -1 */ append_field("numeric_value_decimal", String::number(data.numeric_value_decimal.value_or(-1))); + append_field("numeric_value_digit", String::number(data.numeric_value_digit.value_or(-1))); + append_field("numeric_value_numeric", String::number(data.numeric_value_numeric.value_or(-1))); + append_field("bidi_mirrored", String::formatted("{}", data.bidi_mirrored)); + append_field("unicode_1_name", String::formatted("\"{}\"", data.unicode_1_name)); + append_field("iso_comment", String::formatted("\"{}\"", data.iso_comment)); + /* absent case mappings are written as the code point itself */ append_field("simple_uppercase_mapping", String::formatted("{:#x}", data.simple_uppercase_mapping.value_or(data.code_point))); + append_field("simple_lowercase_mapping", String::formatted("{:#x}", data.simple_lowercase_mapping.value_or(data.code_point))); + append_field("simple_titlecase_mapping", String::formatted("{:#x}", data.simple_titlecase_mapping.value_or(data.code_point))); + + generator.append(" },"); + } + + generator.append(R"~~~( +} }; + +static Optional index_of_code_point_in_range(u32 code_point) +{)~~~"); + + /* Emits one check per range. Both bounds are exclusive, so a range's own first and last code points are not matched by it; the returned index is that of the range's First row. */ for (auto const& range : unicode_data.code_point_ranges) { + generator.set("index", String::formatted("{}", range.index)); + generator.set("first", String::formatted("{:#x}", range.first)); + generator.set("last", String::formatted("{:#x}", range.last)); + + generator.append(R"~~~( + if ((code_point > @first@) && (code_point < @last@)) + return @index@;)~~~"); + } + + /* Emitted lookup order: direct table index up to last_contiguous_code_point, then the range checks (returning a copy of the range's First row with both case mappings set to code_point), then a find_if scan of the whole table. The emitted code assigns simple_uppercase_mapping and simple_lowercase_mapping unconditionally, so it relies on both names being in s_desired_fields. */ generator.append(R"~~~( + return {}; +} + +Optional 
unicode_data_for_code_point(u32 code_point) +{ + VERIFY(is_unicode(code_point)); + + if (code_point <= @last_contiguous_code_point@) + return s_unicode_data[code_point]; + + if (auto index = index_of_code_point_in_range(code_point); index.has_value()) { + auto data_for_range = s_unicode_data[*index]; + data_for_range.simple_uppercase_mapping = code_point; + data_for_range.simple_lowercase_mapping = code_point; + return data_for_range; + } + + auto it = AK::find_if(s_unicode_data.begin(), s_unicode_data.end(), [code_point](auto const& data) { return data.code_point == code_point; }); + if (it != s_unicode_data.end()) + return *it; + + return {}; +} + +} +)~~~"); + + outln("{}", generator.as_string_view()); +} + +/* Entry point of the generator. Requires the data file path (option u) and at least one of the header (h) or implementation (c) options; returns 1 with a usage message when either is missing, and 1 when the file cannot be opened. Otherwise parses the file, prints the requested output(s) to stdout and returns 0. */ int main(int argc, char** argv) +{ + bool generate_header = false; + bool generate_implementation = false; + char const* unicode_data_path = nullptr; + + Core::ArgsParser args_parser; + args_parser.add_option(generate_header, "Generate the Unicode Data header file", "generate-header", 'h'); + args_parser.add_option(generate_implementation, "Generate the Unicode Data implementation file", "generate-implementation", 'c'); + args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path"); + args_parser.parse(argc, argv); + + if (!generate_header && !generate_implementation) { + warnln("At least one of -h/--generate-header or -c/--generate-implementation is required"); + args_parser.print_usage(stderr, argv[0]); + return 1; + } + if (!unicode_data_path) { + warnln("-u/--unicode-data-path is required"); + args_parser.print_usage(stderr, argv[0]); + return 1; + } + + auto file_or_error = Core::File::open(unicode_data_path, Core::OpenMode::ReadOnly); + if (file_or_error.is_error()) { + warnln("Failed to open {}: {}", unicode_data_path, file_or_error.release_error()); + return 1; + } + + /* the file is parsed even for a header-only run, although generate_unicode_data_header() takes no parsed data */ auto unicode_data = parse_unicode_data(file_or_error.value()); + + if (generate_header) + generate_unicode_data_header(); + 
if (generate_implementation) + generate_unicode_data_implementation(move(unicode_data)); + + return 0; +} diff --git a/Userland/Libraries/LibUnicode/unicode_data.cmake b/Userland/Libraries/LibUnicode/unicode_data.cmake new file mode 100644 index 00000000000..5cab92edb3d --- /dev/null +++ b/Userland/Libraries/LibUnicode/unicode_data.cmake @@ -0,0 +1,36 @@ +option(ENABLE_UNICODE_DATABASE_DOWNLOAD "Enable download of Unicode UCD files at build time" ON) + +set(UNICODE_DATA_URL https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt) +set(UNICODE_DATA_PATH ${CMAKE_BINARY_DIR}/UCD/UnicodeData.txt) + +if (ENABLE_UNICODE_DATABASE_DOWNLOAD) + if (NOT EXISTS ${UNICODE_DATA_PATH}) + message(STATUS "Downloading UCD UnicodeData.txt from ${UNICODE_DATA_URL}...") + file(DOWNLOAD ${UNICODE_DATA_URL} ${UNICODE_DATA_PATH} INACTIVITY_TIMEOUT 10 #[[ NOTE(review): no STATUS variable is requested, so a failed download is not detected here; a leftover file would also satisfy the EXISTS guard on later runs, confirm. ]]) + endif() + + set(UNICODE_GENERATOR CodeGenerators/GenerateUnicodeData) + set(UNICODE_DATA_HEADER UnicodeData.h) + set(UNICODE_DATA_IMPLEMENTATION UnicodeData.cpp) + + add_custom_command( + OUTPUT ${UNICODE_DATA_HEADER} + COMMAND ${write_if_different} ${UNICODE_DATA_HEADER} ${UNICODE_GENERATOR} -h -u ${UNICODE_DATA_PATH} + VERBATIM + DEPENDS GenerateUnicodeData + MAIN_DEPENDENCY ${UNICODE_DATA_PATH} + ) + + add_custom_command( + OUTPUT ${UNICODE_DATA_IMPLEMENTATION} + COMMAND ${write_if_different} ${UNICODE_DATA_IMPLEMENTATION} ${UNICODE_GENERATOR} -c -u ${UNICODE_DATA_PATH} + VERBATIM + DEPENDS GenerateUnicodeData + MAIN_DEPENDENCY ${UNICODE_DATA_PATH} #[[ NOTE(review): same MAIN_DEPENDENCY file as the command above; the CMake documentation allows at most one command per main-dependency file, confirm whether DEPENDS should be used for one of them. ]] + ) + + set(UNICODE_DATA_SOURCES ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION}) + add_compile_definitions(ENABLE_UNICODE_DATA=1) +else() + add_compile_definitions(ENABLE_UNICODE_DATA=0) +endif()