diff --git a/Meta/CMake/unicode_data.cmake b/Meta/CMake/unicode_data.cmake index d1e5645dde9..08b07d31a51 100644 --- a/Meta/CMake/unicode_data.cmake +++ b/Meta/CMake/unicode_data.cmake @@ -3,7 +3,6 @@ include(${CMAKE_CURRENT_LIST_DIR}/utils.cmake) set(UCD_VERSION "15.1.0") set(UCD_SHA256 "cb1c663d053926500cd501229736045752713a066bd75802098598b7a7056177") set(EMOJI_SHA256 "d876ee249aa28eaa76cfa6dfaa702847a8d13b062aa488d465d0395ee8137ed9") -set(IDNA_SHA256 "402cbd285f1f952fcd0834b63541d54f69d3d8f1b8f8599bf71a1a14935f82c4") set(UCD_PATH "${SERENITY_CACHE_DIR}/UCD" CACHE PATH "Download location for UCD files") set(UCD_VERSION_FILE "${UCD_PATH}/version.txt") @@ -67,9 +66,6 @@ set(EMOJI_SERENITY_PATH "${SerenityOS_SOURCE_DIR}/Base/home/anon/Documents/emoji set(EMOJI_FILE_LIST_PATH "${SerenityOS_SOURCE_DIR}/Meta/emoji-file-list.txt") set(EMOJI_INSTALL_PATH "${CMAKE_BINARY_DIR}/Root/home/anon/Documents/emoji.txt") -set(IDNA_MAPPING_TABLE_URL "https://www.unicode.org/Public/idna/${UCD_VERSION}/IdnaMappingTable.txt") -set(IDNA_MAPPING_TABLE_PATH "${UCD_PATH}/IdnaMappingTable.txt") - if (ENABLE_UNICODE_DATABASE_DOWNLOAD) remove_path_if_version_changed("${UCD_VERSION}" "${UCD_VERSION_FILE}" "${UCD_PATH}") @@ -93,12 +89,9 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SENTENCE_BREAK_PROP_SOURCE}" "${SENTENCE_BREAK_PROP_PATH}") download_file("${EMOJI_TEST_URL}" "${EMOJI_TEST_PATH}" SHA256 "${EMOJI_SHA256}") - - download_file("${IDNA_MAPPING_TABLE_URL}" "${IDNA_MAPPING_TABLE_PATH}" SHA256 "${IDNA_SHA256}") else() message(STATUS "Skipping download of ${UCD_ZIP_URL}, expecting the archive to have been extracted to ${UCD_ZIP_PATH}") message(STATUS "Skipping download of ${EMOJI_TEST_URL}, expecting the file to be at ${EMOJI_TEST_PATH}") - message(STATUS "Skipping download of ${IDNA_MAPPING_TABLE_URL}, expecting the file to be at ${IDNA_MAPPING_TABLE_PATH}") endif() @@ -108,9 +101,6 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) set(EMOJI_DATA_HEADER EmojiData.h) set(EMOJI_DATA_IMPLEMENTATION EmojiData.cpp) - set(IDNA_DATA_HEADER IDNAData.h) - set(IDNA_DATA_IMPLEMENTATION IDNAData.cpp) - if (SERENITYOS) set(EMOJI_INSTALL_ARG -i "${EMOJI_INSTALL_PATH}") endif() @@ -137,21 +127,11 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) # the generated emoji.txt file. dependencies "${EMOJI_RES_PATH}" "${EMOJI_SERENITY_PATH}" "${EMOJI_FILE_LIST_PATH}" ) - invoke_generator( - "IDNAData" - Lagom::GenerateIDNAData - "${UCD_VERSION_FILE}" - "${IDNA_DATA_HEADER}" - "${IDNA_DATA_IMPLEMENTATION}" - arguments -m "${IDNA_MAPPING_TABLE_PATH}" - ) set(UNICODE_DATA_SOURCES ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION} ${EMOJI_DATA_HEADER} ${EMOJI_DATA_IMPLEMENTATION} - ${IDNA_DATA_HEADER} - ${IDNA_DATA_IMPLEMENTATION} ) endif() diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt index a03d4cda73b..b18637a1848 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt @@ -1,3 +1,2 @@ lagom_tool(GenerateUnicodeData SOURCES GenerateUnicodeData.cpp LIBS LibMain) lagom_tool(GenerateEmojiData SOURCES GenerateEmojiData.cpp LIBS LibMain) -lagom_tool(GenerateIDNAData SOURCES GenerateIDNAData.cpp LIBS LibMain) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateIDNAData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateIDNAData.cpp deleted file mode 100644 index 9fee7fec102..00000000000 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateIDNAData.cpp +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Copyright (c) 2023, Simon Wanner - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#include "GeneratorUtil.h" -#include -#include -#include -#include - -enum class MappingStatus : u8 { - Valid, - Ignored, - Mapped, - Deviation, - Disallowed, - DisallowedStd3Valid, - DisallowedStd3Mapped, -}; - -static constexpr Array mapping_status_names { "Valid"sv, "Ignored"sv, "Mapped"sv, "Deviation"sv, "Disallowed"sv, "DisallowedStd3Valid"sv, "DisallowedStd3Mapped"sv }; - -enum class IDNA2008Status : u8 { - NV8, - XV8, -}; - -static constexpr Array idna_2008_status_names { "NV8"sv, "XV8"sv }; - -struct IDNAMapping { - Unicode::CodePointRange code_points; - MappingStatus status; - IDNA2008Status idna_2008_status; - Vector mapped_to {}; -}; - -struct IDNAData { - Vector mapping_table; -}; - -static MappingStatus parse_mapping_status(StringView status) -{ - if (status == "valid"sv) - return MappingStatus::Valid; - if (status == "ignored"sv) - return MappingStatus::Ignored; - if (status == "mapped"sv) - return MappingStatus::Mapped; - if (status == "deviation"sv) - return MappingStatus::Deviation; - if (status == "disallowed"sv) - return MappingStatus::Disallowed; - if (status == "disallowed_STD3_valid"sv) - return MappingStatus::DisallowedStd3Valid; - if (status == "disallowed_STD3_mapped"sv) - return MappingStatus::DisallowedStd3Mapped; - VERIFY_NOT_REACHED(); -} - -static ErrorOr parse_idna_mapping_table(Core::InputBufferedFile& file, Vector& mapping_table) -{ - Array buffer; - - while (TRY(file.can_read_line())) { - auto line = TRY(file.read_line(buffer)); - - if (line.is_empty() || line.starts_with('#')) - continue; - - if (auto index = line.find('#'); index.has_value()) - line = line.substring_view(0, *index); - - auto segments = line.split_view(';', SplitBehavior::KeepEmpty); - VERIFY(segments.size() >= 2); - - IDNAMapping idna_mapping {}; - idna_mapping.code_points = parse_code_point_range(segments[0].trim_whitespace()); - idna_mapping.status = parse_mapping_status(segments[1].trim_whitespace()); - - if (segments.size() >= 3) - idna_mapping.mapped_to = parse_code_point_list(segments[2].trim_whitespace()); - - if (segments.size() >= 4) { - auto trimmed = segments[3].trim_whitespace(); - if (trimmed == "NV8"sv) { - idna_mapping.idna_2008_status = IDNA2008Status::NV8; - } else { - VERIFY(trimmed == "XV8"sv); - idna_mapping.idna_2008_status = IDNA2008Status::XV8; - } - } - - TRY(mapping_table.try_append(move(idna_mapping))); - } - - return {}; -} - -static ErrorOr generate_idna_data_header(Core::InputBufferedFile& file, IDNAData&) -{ - StringBuilder builder; - SourceGenerator generator { builder }; - - generator.append(R"~~~( -#pragma once - -namespace Unicode::IDNA { -} -)~~~"); - - TRY(file.write_until_depleted(generator.as_string_view().bytes())); - return {}; -} - -static ErrorOr generate_idna_data_implementation(Core::InputBufferedFile& file, IDNAData& idna_data) -{ - StringBuilder builder; - SourceGenerator generator { builder }; - - generator.set("idna_table_size", TRY(String::number(idna_data.mapping_table.size()))); - - generator.append(R"~~~( -#include -#include -#include -#include -#include -#include - -namespace Unicode::IDNA { - -struct MappingEntry { - CodePointRange code_points {}; - MappingStatus status : 3 { MappingStatus::Valid }; - IDNA2008Status idna_2008_status : 1 { IDNA2008Status::NV8 }; - size_t mapping_offset : 20 { 0 }; - size_t mapping_length : 8 { 0 }; -}; - -static constexpr Array s_idna_mapping_table { {)~~~"); - - { - size_t mapping_offset = 0; - for (auto const& mapping : idna_data.mapping_table) { - generator.set("code_points", TRY(String::formatted("{:#x}, {:#x}", mapping.code_points.first, mapping.code_points.last))); - generator.set("status", mapping_status_names[to_underlying(mapping.status)]); - generator.set("idna_2008_status", idna_2008_status_names[to_underlying(mapping.idna_2008_status)]); - - if (mapping.mapped_to.is_empty()) { - generator.set("mapping_offset", "0"sv); - generator.set("mapping_length", "0"sv); - } else { - generator.set("mapping_offset", TRY(String::number(mapping_offset))); - generator.set("mapping_length", TRY(String::number(mapping.mapped_to.size()))); - mapping_offset += mapping.mapped_to.size(); - } - - generator.append(R"~~~( - { { @code_points@ }, MappingStatus::@status@, IDNA2008Status::@idna_2008_status@, @mapping_offset@, @mapping_length@ },)~~~"); - } - - generator.set("mapping_length_total", TRY(String::number(mapping_offset))); - } - - generator.append(R"~~~( -} }; - -static constexpr Array s_mapping_code_points { )~~~"); - - { - for (auto const& mapping : idna_data.mapping_table) { - if (mapping.mapped_to.is_empty()) - continue; - - for (u32 code_point : mapping.mapped_to) - generator.append(TRY(String::formatted("{:#x}, ", code_point))); - - generator.append(R"~~~( - )~~~"); - } - } - - generator.append(R"~~~( -}; - -Optional get_idna_mapping(u32 code_point) -{ - auto* entry = binary_search(s_idna_mapping_table, code_point, nullptr, [](auto code_point, auto entry) { - if (code_point < entry.code_points.first) - return -1; - if (code_point > entry.code_points.last) - return 1; - return 0; - }); - - if (!entry) - return {}; - - auto mapped_to = Utf32View { entry->mapping_length ? s_mapping_code_points.data() + entry->mapping_offset : nullptr, entry->mapping_length }; - return Mapping { entry->status, entry->idna_2008_status, move(mapped_to) }; -} - -} -)~~~"); - - TRY(file.write_until_depleted(generator.as_string_view().bytes())); - return {}; -} - -ErrorOr serenity_main(Main::Arguments arguments) -{ - StringView generated_header_path; - StringView generated_implementation_path; - StringView idna_mapping_table_path; - - Core::ArgsParser args_parser; - args_parser.add_option(generated_header_path, "Path to the IDNA Data header file to generate", "generated-header-path", 'h', "generated-header-path"); - args_parser.add_option(generated_implementation_path, "Path to the IDNA Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path"); - args_parser.add_option(idna_mapping_table_path, "Path to IdnaMappingTable.txt file", "idna-mapping-table-path", 'm', "idna-mapping-table-path"); - args_parser.parse(arguments); - - auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write)); - auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write)); - auto idna_mapping_table_file = TRY(open_file(idna_mapping_table_path, Core::File::OpenMode::Read)); - - IDNAData idna_data {}; - TRY(parse_idna_mapping_table(*idna_mapping_table_file, idna_data.mapping_table)); - - TRY(generate_idna_data_header(*generated_header_file, idna_data)); - TRY(generate_idna_data_implementation(*generated_implementation_file, idna_data)); - - return 0; -} diff --git a/Tests/LibUnicode/CMakeLists.txt b/Tests/LibUnicode/CMakeLists.txt index 4e850cb8947..51cef8c6184 100644 --- a/Tests/LibUnicode/CMakeLists.txt +++ b/Tests/LibUnicode/CMakeLists.txt @@ -1,7 +1,6 @@ set(TEST_SOURCES TestEmoji.cpp TestIDNA.cpp - TestPunycode.cpp TestSegmentation.cpp TestUnicodeCharacterTypes.cpp TestUnicodeNormalization.cpp @@ -9,6 +8,4 @@ set(TEST_SOURCES foreach(source IN LISTS TEST_SOURCES) serenity_test("${source}" LibUnicode LIBS LibUnicode) - - get_filename_component(target "${source}" NAME_WLE) endforeach() diff --git a/Tests/LibUnicode/TestPunycode.cpp b/Tests/LibUnicode/TestPunycode.cpp deleted file mode 100644 index 0da2cfffb9f..00000000000 --- a/Tests/LibUnicode/TestPunycode.cpp +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2023, Simon Wanner - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#include - -#include - -namespace Unicode::Punycode { - -#define ENUMERATE_TEST_CASES \ - CASE(""sv, ""sv) \ - CASE("Well hello friends!"sv, "Well hello friends!-"sv) \ - CASE("Well-hello-friends"sv, "Well-hello-friends-"sv) \ - CASE("Wгellд-бhellбвo"sv, "Well-hello-friends"sv) \ - CASE("Hallöchen Freunde!"sv, "Hallchen Freunde!-2zb"sv) \ - CASE("Nåväl hej vänner"sv, "Nvl hej vnner-cfbhg"sv) \ - CASE("Ну привіт друзі"sv, " -kjc9flsd9cjetgj5xg"sv) \ - CASE("ليهمابتكلموشعربي؟"sv, "egbpdaj6bu4bxfgehfvwxn"sv) \ - CASE("他们为什么不说中文"sv, "ihqwcrb4cv8a8dqg056pqjye"sv) \ - CASE("他們爲什麽不說中文"sv, "ihqwctvzc91f659drss3x8bo0yb"sv) \ - CASE("Pročprostěnemluvíčesky"sv, "Proprostnemluvesky-uyb24dma41a"sv) \ - CASE("למההםפשוטלאמדבריםעברית"sv, "4dbcagdahymbxekheh6e0a7fei0b"sv) \ - CASE("यहलोगहिन्दीक्योंनहींबोलसकतेहैं"sv, "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"sv) \ - CASE("なぜみんな日本語を話してくれないのか"sv, "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"sv) \ - CASE("세계의모든사람들이한국어를이해한다면얼마나좋을까"sv, "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c"sv) \ - CASE("почемужеонинеговорятпорусски"sv, "b1abfaaepdrnnbgefbadotcwatmq2g4l"sv) \ - CASE("PorquénopuedensimplementehablarenEspañol"sv, "PorqunopuedensimplementehablarenEspaol-fmd56a"sv) \ - CASE("TạisaohọkhôngthểchỉnóitiếngViệt"sv, "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"sv) \ - CASE("3年B組金八先生"sv, "3B-ww4c5e180e575a65lsy2b"sv) \ - CASE("安室奈美恵-with-SUPER-MONKEYS"sv, "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"sv) \ - CASE("Hello-Another-Way-それぞれの場所"sv, "Hello-Another-Way--fc4qua05auwb3674vfr0b"sv) \ - CASE("ひとつ屋根の下2"sv, "2-u9tlzr9756bt3uc0v"sv) \ - CASE("MajiでKoiする5秒前"sv, "MajiKoi5-783gue6qz075azm5e"sv) \ - CASE("パフィーdeルンバ"sv, "de-jg4avhby1noc0d"sv) \ - CASE("そのスピードで"sv, "d9juau41awczczp"sv) \ - CASE("-> $1.00 <-"sv, "-> $1.00 <--"sv) - -TEST_CASE(decode) -{ -#define CASE(a, b) EXPECT_EQ(TRY_OR_FAIL(decode(b)), a); - ENUMERATE_TEST_CASES -#undef CASE - EXPECT(decode("Well hello friends!"sv).is_error()); - EXPECT(decode("Nåväl hej vänner"sv).is_error()); -} - -TEST_CASE(encode) -{ -#define CASE(a, b) EXPECT_EQ(TRY_OR_FAIL(encode(a)), b); - ENUMERATE_TEST_CASES -#undef CASE -} - -} diff --git a/Userland/Libraries/LibUnicode/CMakeLists.txt b/Userland/Libraries/LibUnicode/CMakeLists.txt index 652cdc64ef3..d6beddc25ed 100644 --- a/Userland/Libraries/LibUnicode/CMakeLists.txt +++ b/Userland/Libraries/LibUnicode/CMakeLists.txt @@ -6,7 +6,6 @@ set(SOURCES Emoji.cpp IDNA.cpp Normalize.cpp - Punycode.cpp Segmentation.cpp String.cpp UnicodeUtils.cpp diff --git a/Userland/Libraries/LibUnicode/IDNA.cpp b/Userland/Libraries/LibUnicode/IDNA.cpp index 7eb1e22ba09..d8ab51ade71 100644 --- a/Userland/Libraries/LibUnicode/IDNA.cpp +++ b/Userland/Libraries/LibUnicode/IDNA.cpp @@ -1,241 +1,62 @@ /* * Copyright (c) 2023, Simon Wanner + * Copyright (c) 2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ -#include -#include -#include -#include -#include -#include +#define AK_DONT_REPLACE_STD -#if ENABLE_UNICODE_DATA -# include -# include -#endif +#include +#include + +#include namespace Unicode::IDNA { -Optional __attribute__((weak)) get_idna_mapping(u32) { return {}; } - -struct ProcessingResult { - Vector result {}; - bool has_error { false }; -}; - -static MappingStatus translate_status(MappingStatus status, UseStd3AsciiRules use_std3_ascii_rules) -{ - switch (status) { - case MappingStatus::DisallowedStd3Valid: - return use_std3_ascii_rules == UseStd3AsciiRules::Yes ? MappingStatus::Disallowed : MappingStatus::Valid; - case MappingStatus::DisallowedStd3Mapped: - return use_std3_ascii_rules == UseStd3AsciiRules::Yes ? MappingStatus::Disallowed : MappingStatus::Mapped; - default: - return status; - } -} - -// https://www.unicode.org/reports/tr46/#Validity_Criteria -static bool is_valid_label(String const& label, CheckHyphens check_hyphens, CheckBidi check_bidi, CheckJoiners check_joiners, UseStd3AsciiRules use_std3_ascii_rules, TransitionalProcessing transitional_processing) -{ - // 1. The label must be in Unicode Normalization Form NFC. - auto normalized = normalize(label, NormalizationForm::NFC); - if (normalized != label) - return false; - - size_t position = 0; - for (auto code_point : label.code_points()) { - // 2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character in both the third and fourth positions. - if (check_hyphens == CheckHyphens::Yes && code_point == '-' && (position == 2 || position == 3)) - return false; - - // 4. The label must not contain a U+002E ( . ) FULL STOP. - if (code_point == '.') - return false; - - // 5. The label must not begin with a combining mark, that is: General_Category=Mark. - static auto general_category_mark = general_category_from_string("Mark"sv); - if (position == 0 && general_category_mark.has_value() && code_point_has_general_category(code_point, general_category_mark.value())) - return false; - - // 6. Each code point in the label must only have certain status values according to Section 5, IDNA Mapping Table: - Optional mapping = get_idna_mapping(code_point); - if (!mapping.has_value()) - return false; - - auto status = translate_status(mapping->status, use_std3_ascii_rules); - if (transitional_processing == TransitionalProcessing::Yes) { - // 1. For Transitional Processing, each value must be valid. - if (status != MappingStatus::Valid) - return false; - } else { - // 2. For Nontransitional Processing, each value must be either valid or deviation. - if (status != MappingStatus::Valid && status != MappingStatus::Deviation) - return false; - } - position++; - } - - // 3. If CheckHyphens, the label must neither begin nor end with a U+002D HYPHEN-MINUS character. - if (check_hyphens == CheckHyphens::Yes && (label.starts_with('-') || label.ends_with('-'))) - return false; - - // FIXME: 7. If CheckJoiners, the label must satisify the ContextJ rules from Appendix A, in The Unicode Code Points and Internationalized Domain Names for Applications (IDNA) [IDNA2008]. - (void)check_joiners; - - // FIXME: 8. If CheckBidi, and if the domain name is a Bidi domain name, then the label must satisfy all six of the numbered conditions in [IDNA2008] RFC 5893, Section 2. - (void)check_bidi; - - return true; -} - -// https://www.unicode.org/reports/tr46/#Processing -static ErrorOr apply_main_processing_steps(Utf8View domain_name, ToAsciiOptions const& options) -{ - bool has_error = false; - StringBuilder mapped; - // 1. Map. For each code point in the domain_name string, look up the status value in Section 5, IDNA Mapping Table, and take the following actions: - for (u32 code_point : domain_name) { - Optional mapping = get_idna_mapping(code_point); - if (!mapping.has_value()) { - has_error = true; - continue; - } - switch (translate_status(mapping->status, options.use_std3_ascii_rules)) { - // disallowed: Leave the code point unchanged in the string, and record that there was an error. - case MappingStatus::Disallowed: - TRY(mapped.try_append_code_point(code_point)); - has_error = true; - break; - // ignored: Remove the code point from the string. This is equivalent to mapping the code point to an empty string. - case MappingStatus::Ignored: - break; - // mapped: Replace the code point in the string by the value for the mapping in Section 5, IDNA Mapping Table. - case MappingStatus::Mapped: - TRY(mapped.try_append(mapping->mapped_to)); - break; - // deviation: - case MappingStatus::Deviation: - if (options.transitional_processing == TransitionalProcessing::Yes) { - // If Transitional_Processing, replace the code point in the string by the value for the mapping in Section 5, IDNA Mapping Table . - TRY(mapped.try_append(mapping->mapped_to)); - } else { - TRY(mapped.try_append_code_point(code_point)); - } - break; - // valid: Leave the code point unchanged in the string. - case MappingStatus::Valid: - TRY(mapped.try_append_code_point(code_point)); - break; - - default: - VERIFY_NOT_REACHED(); - } - } - - // 2. Normalize. Normalize the domain_name string to Unicode Normalization Form C. - auto normalized = normalize(mapped.string_view(), NormalizationForm::NFC); - - // 3. Break. Break the string into labels at U+002E ( . ) FULL STOP. - auto labels = TRY(normalized.split('.', SplitBehavior::KeepEmpty)); - - // 4. Convert/Validate. For each label in the domain_name string: - for (auto& label : labels) { - // If the label starts with “xn--”: - if (label.starts_with_bytes("xn--"sv)) { - // 1. Attempt to convert the rest of the label to Unicode according to Punycode [RFC3492]. If that conversion fails, record that there was an error, and continue with the next label. - // Otherwise replace the original label in the string by the results of the conversion. - auto punycode = Punycode::decode(label.bytes_as_string_view().substring_view(4)); - if (punycode.is_error()) { - has_error = true; - continue; - } - - label = punycode.release_value(); - - // 2. Verify that the label meets the validity criteria in Section 4.1, Validity Criteria for Nontransitional Processing. - // If any of the validity criteria are not satisfied, record that there was an error. - if (!is_valid_label(label, options.check_hyphens, options.check_bidi, options.check_joiners, options.use_std3_ascii_rules, TransitionalProcessing::No)) - has_error = true; - } - // If the label does not start with “xn--”: - else { - // Verify that the label meets the validity criteria in Section 4.1, Validity Criteria for the input Processing choice (Transitional or Nontransitional). - // If any of the validity criteria are not satisfied, record that there was an error. - if (!is_valid_label(label, options.check_hyphens, options.check_bidi, options.check_joiners, options.use_std3_ascii_rules, options.transitional_processing)) - has_error = true; - } - } - - return ProcessingResult { - .result = move(labels), - .has_error = has_error, - }; -} - // https://www.unicode.org/reports/tr46/#ToASCII ErrorOr to_ascii(Utf8View domain_name, ToAsciiOptions const& options) { - // 1. To the input domain_name, apply the Processing Steps in Section 4, Processing, using the input boolean flags Transitional_Processing, CheckHyphens, CheckBidi, CheckJoiners, and UseSTD3ASCIIRules. This may record an error. - auto processed = TRY(apply_main_processing_steps(domain_name, options)); - bool has_error = processed.has_error; + u32 icu_options = UIDNA_DEFAULT; - // 2. Break the result into labels at U+002E FULL STOP. - auto labels = move(processed.result); + if (options.check_bidi == CheckBidi::Yes) + icu_options |= UIDNA_CHECK_BIDI; + if (options.check_joiners == CheckJoiners::Yes) + icu_options |= UIDNA_CHECK_CONTEXTJ; + if (options.use_std3_ascii_rules == UseStd3AsciiRules::Yes) + icu_options |= UIDNA_USE_STD3_RULES; + if (options.transitional_processing == TransitionalProcessing::No) + icu_options |= UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_NONTRANSITIONAL_TO_UNICODE; - // 3. Convert each label with non-ASCII characters into Punycode [RFC3492], and prefix by “xn--”. This may record an error. - for (auto& label : labels) { - auto all_ascii = true; - for (auto code_point : label.code_points()) { - if (!is_ascii(code_point)) { - all_ascii = false; - break; - } - } + UErrorCode status = U_ZERO_ERROR; - if (!all_ascii) { - auto punycode = Punycode::encode(label); - if (punycode.is_error()) { - has_error = true; - continue; - } - auto punycode_result = punycode.release_value(); + auto idna = adopt_own_if_nonnull(icu::IDNA::createUTS46Instance(icu_options, status)); + if (Locale::icu_failure(status)) + return Error::from_string_literal("Unable to create an IDNA instance"); - StringBuilder builder; - TRY(builder.try_append("xn--"sv)); - TRY(builder.try_append(punycode_result)); - label = TRY(builder.to_string()); - } + StringBuilder builder { domain_name.as_string().length() }; + icu::StringByteSink sink { &builder }; + + icu::IDNAInfo info; + idna->nameToASCII_UTF8(Locale::icu_string_piece(domain_name.as_string()), sink, info, status); + + auto errors = info.getErrors(); + + if (options.check_hyphens == CheckHyphens::No) { + errors &= ~UIDNA_ERROR_HYPHEN_3_4; + errors &= ~UIDNA_ERROR_LEADING_HYPHEN; + errors &= ~UIDNA_ERROR_TRAILING_HYPHEN; + } + if (options.verify_dns_length == VerifyDnsLength::No) { + errors &= ~UIDNA_ERROR_EMPTY_LABEL; + errors &= ~UIDNA_ERROR_LABEL_TOO_LONG; + errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; } - // 4. If the VerifyDnsLength flag is true, then verify DNS length restrictions. This may record an error. For more information, see [STD13] and [STD3]. - if (options.verify_dns_length == VerifyDnsLength::Yes) { - // 1. The length of the domain name, excluding the root label and its dot, is from 1 to 253. - size_t total_length = 0; - auto* root_label = !labels.is_empty() && labels.last().is_empty() ? &labels.last() : nullptr; - for (auto& label : labels) { - // 2. The length of each label is from 1 to 63. - auto length = label.bytes().size(); - if (label.is_empty() && &label != root_label) - return Error::from_string_literal("Invalid empty label"); - if (length > 63) - return Error::from_string_literal("Label too long"); - total_length += length; - } + if (Locale::icu_failure(status) || errors != 0) + return Error::from_string_literal("Unable to convert domain to ASCII"); - total_length += labels.size() - (root_label ? 2 : 1); - if (total_length == 0 || total_length > 253) - return Error::from_string_literal("Domain too long"); - } - - // 5. If an error was recorded in steps 1-4, then the operation has failed and a failure value is returned. No DNS lookup should be done. - if (has_error) - return Error::from_string_literal("Invalid domain name"); - - // 6. Otherwise join the labels using U+002E FULL STOP as a separator, and return the result. - return String::join('.', labels); + return builder.to_string(); } } diff --git a/Userland/Libraries/LibUnicode/IDNA.h b/Userland/Libraries/LibUnicode/IDNA.h index cf620042880..8962c5e7e1e 100644 --- a/Userland/Libraries/LibUnicode/IDNA.h +++ b/Userland/Libraries/LibUnicode/IDNA.h @@ -1,38 +1,17 @@ /* * Copyright (c) 2023, Simon Wanner + * Copyright (c) 2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ #pragma once +#include #include -#include -#include namespace Unicode::IDNA { -enum class MappingStatus : u8 { - Valid, - Ignored, - Mapped, - Deviation, - Disallowed, - DisallowedStd3Valid, - DisallowedStd3Mapped, -}; - -enum class IDNA2008Status : u8 { - NV8, - XV8, -}; - -struct Mapping { - MappingStatus status; - IDNA2008Status idna_2008_status; - Utf32View mapped_to; -}; - enum class CheckHyphens { No, Yes, @@ -73,6 +52,5 @@ struct ToAsciiOptions { }; ErrorOr to_ascii(Utf8View domain_name, ToAsciiOptions const& = {}); -Optional get_idna_mapping(u32 code_point); } diff --git a/Userland/Libraries/LibUnicode/Punycode.cpp b/Userland/Libraries/LibUnicode/Punycode.cpp deleted file mode 100644 index cf82ea1930c..00000000000 --- a/Userland/Libraries/LibUnicode/Punycode.cpp +++ /dev/null @@ -1,286 +0,0 @@ -/* - * Copyright (c) 2023, Simon Wanner - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#include -#include -#include - -namespace Unicode::Punycode { - -// https://www.rfc-editor.org/rfc/rfc3492.html#section-5 -static constexpr u32 BASE = 36; -static constexpr u32 TMIN = 1; -static constexpr u32 TMAX = 26; -static constexpr u32 SKEW = 38; -static constexpr u32 DAMP = 700; -static constexpr u32 INITIAL_BIAS = 72; -static constexpr u32 INITIAL_N = 0x80; -static constexpr u32 DELIMITER = '-'; - -static Optional digit_value_of_code_point(u32 code_point) -{ - if (code_point >= 'A' && code_point <= 'Z') - return code_point - 'A'; - if (code_point >= 'a' && code_point <= 'z') - return code_point - 'a'; - if (code_point >= '0' && code_point <= '9') - return code_point - '0' + 26; - return {}; -} - -static u32 code_point_value_of_digit(u32 digit) -{ - VERIFY(digit < 36); - if (digit <= 25) - return 'a' + digit; - return '0' + digit - 26; -} - -// https://www.rfc-editor.org/rfc/rfc3492.html#section-6.1 -static u32 adapt(u32 delta, u32 num_points, bool first_time) -{ - // if firsttime then let delta = delta div damp - if (first_time) - delta = delta / DAMP; - // else let delta = delta div 2 - else - delta = delta / 2; - - // let delta = delta + (delta div numpoints) - delta = delta + (delta / num_points); - - // let k = 0 - u32 k = 0; - - // while delta > ((base - tmin) * tmax) div 2 do begin - while (delta > ((BASE - TMIN) * TMAX) / 2) { - // let delta = delta div (base - tmin) - delta = delta / (BASE - TMIN); - - // let k = k + base - k = k + BASE; - } - - // return k + (((base - tmin + 1) * delta) div (delta + skew)) - return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW)); -} - -// https://www.rfc-editor.org/rfc/rfc3492.html#section-6.2 -ErrorOr decode(StringView input) -{ - size_t consumed = 0; - - // let n = initial_n - Checked n = INITIAL_N; - - // let i = 0 - Checked i = 0; - - // let bias = initial_bias - u32 bias = INITIAL_BIAS; - - // let output = an empty string indexed from 0 - Vector output; - - // consume all code points before the last delimiter (if there is one) - // and copy them to output, fail on any non-basic code point - Optional last_delimiter_index = input.find_last(DELIMITER); - if (last_delimiter_index.has_value()) { - for (; consumed < last_delimiter_index.value(); consumed++) { - if (!is_ascii(input[consumed])) - return Error::from_string_literal("Unexpected non-basic code point"); - TRY(output.try_append(input[consumed])); - } - - // if more than zero code points were consumed then consume one more - // (which will be the last delimiter) - if (last_delimiter_index.value() > 0) { - auto next = input[consumed++]; - VERIFY(next == DELIMITER); - } - } - - // while the input is not exhausted do begin - while (consumed < input.length()) { - // let oldi = i - Checked old_i = i; - - // let w = 1 - Checked w = 1; - - // for k = base to infinity in steps of base do begin - for (size_t k = BASE;; k += BASE) { - // consume a code point, or fail if there was none to consume - if (consumed >= input.length()) - return Error::from_string_literal("No more code points to consume"); - auto code_point = input[consumed++]; - - // let digit = the code point's digit-value, fail if it has none - auto digit = digit_value_of_code_point(code_point); - if (!digit.has_value()) - return Error::from_string_literal("Invalid base-36 digit"); - - // let i = i + digit * w, fail on overflow - i = i + Checked(digit.value()) * w; - if (i.has_overflow()) - return Error::from_string_literal("Numeric overflow"); - - // let t = tmin if k <= bias {+ tmin}, or - // tmax if k >= bias + tmax, or k - bias otherwise - u32 t = k <= bias ? TMIN : (k >= bias + TMAX ? TMAX : k - bias); - - // if digit < t then break - if (digit.value() < t) - break; - - // let w = w * (base - t), fail on overflow - w = w * Checked(BASE - t); - if (w.has_overflow()) - return Error::from_string_literal("Numeric overflow"); - } - // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?) - bias = adapt((i - old_i).value(), output.size() + 1, !old_i); - - // let n = n + i div (length(output) + 1), fail on overflow - n = n + Checked(static_cast(i.value() / static_cast(output.size() + 1))); - if (n.has_overflow()) - return Error::from_string_literal("Numeric overflow"); - - // let i = i mod (length(output) + 1) - i = i % Checked(static_cast(output.size() + 1)); - - // {if n is a basic code point then fail} - // NOTE: The full statement enclosed in braces (checking whether n is a basic code point) can be omitted if initial_n exceeds all basic code points - // (which is true for Punycode), because n is never less than initial_n. - VERIFY(!is_ascii(n.value())); - - // insert n into output at position i - TRY(output.try_insert(i.value(), n.value())); - - // increment i - i++; - } - - StringBuilder builder; - TRY(builder.try_append(Utf32View(output.data(), output.size()))); - return builder.to_string(); -} - -static Optional find_smallest_code_point_greater_than_or_equal(Utf32View code_points, u32 threshold) -{ - Optional result; - for (auto code_point : code_points) { - if (code_point >= threshold && (!result.has_value() || code_point < result.value())) - result = code_point; - } - return result; -} - -ErrorOr encode(StringView input) -{ - Vector code_points; - for (auto code_point : Utf8View(input)) - TRY(code_points.try_append(code_point)); - return encode(Utf32View(code_points.data(), code_points.size())); -} - -// https://www.rfc-editor.org/rfc/rfc3492.html#section-6.3 -ErrorOr encode(Utf32View input) -{ - Vector output; - - // let n = initial_n - Checked n = INITIAL_N; - - // let delta = 0 - Checked delta = 0; - - // let bias = initial_bias - u32 bias = INITIAL_BIAS; - - // let h = b = the number of basic code points in the input - // copy them to the output in order, followed by a delimiter if b > 0 - size_t b = 0; - for (auto code_point : input) { - if (is_ascii(code_point)) { - TRY(output.try_append(code_point)); - b++; - } - } - auto h = b; - if (b > 0) - TRY(output.try_append(DELIMITER)); - - // while h < length(input) do begin - while (h < input.length()) { - // let m = the minimum {non-basic} code point >= n in the input - auto m = find_smallest_code_point_greater_than_or_equal(input, n.value()); - VERIFY(m.has_value()); - - // let delta = delta + (m - n) * (h + 1), fail on overflow - delta = delta + (Checked(static_cast(m.value())) - n) * Checked(h + 1); - if (delta.has_overflow()) - return Error::from_string_literal("Numeric overflow"); - - // let n = m - n = m.value(); - - // for each code point c in the input (in order) do begin - for (auto c : input) { - // if c < n {or c is basic} then increment delta, fail on overflow - if (c < n.value()) { - delta++; - if (delta.has_overflow()) - return Error::from_string_literal("Numeric overflow"); - } - - // if c == n then begin - if (c == n.value()) { - // let q = delta - auto q = delta.value(); - - // for k = base to infinity in steps of base do begin - for (size_t k = BASE;; k += BASE) { - // let t = tmin if k <= bias {+ tmin}, or - // tmax if k >= bias + tmax, or k - bias otherwise - u32 t = k <= bias ? TMIN : (k >= bias + TMAX ? TMAX : k - bias); - - // if q < t then break - if (q < t) - break; - - // output the code point for digit t + ((q - t) mod (base - t)) - auto digit = t + ((q - t) % (BASE - t)); - TRY(output.try_append(code_point_value_of_digit(digit))); - - // let q = (q - t) div (base - t) - q = (q - t) / (BASE - t); - } - // output the code point for digit q - TRY(output.try_append(code_point_value_of_digit(q))); - - // let bias = adapt(delta, h + 1, test h equals b?) - bias = adapt(delta.value(), h + 1, h == b); - - // let delta = 0 - delta = 0; - - // increment h - h++; - } - } - - // increment delta and n - delta++; - n++; - } - - StringBuilder builder; - TRY(builder.try_append(Utf32View(output.data(), output.size()))); - return builder.to_string(); -} - -} diff --git a/Userland/Libraries/LibUnicode/Punycode.h b/Userland/Libraries/LibUnicode/Punycode.h deleted file mode 100644 index bfc0981a3e3..00000000000 --- a/Userland/Libraries/LibUnicode/Punycode.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2023, Simon Wanner - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#pragma once - -#include - -namespace Unicode::Punycode { - -ErrorOr decode(StringView); -ErrorOr encode(StringView); -ErrorOr encode(Utf32View); - -}