From 93ee9220270c86f63c5629ffe974f8eccb1c414d Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 17 Nov 2021 09:56:16 -0500 Subject: [PATCH] LibUnicode: Support locales-without-script aliases for ECMA-402 As noted by ECMA-402, if a supported locale contains all of a language, script, and region subtag, then the implementation must also support the locale without the script subtag. The most complicated example of this is the zh-TW locale. The list of locales in the CLDR database does not include zh-TW or its maximized zh-Hant-TW variant. Instead, it inlcudes the zh-Hant locale. However, zh-Hant-TW is listed in the default-content locale list in the cldr-core package. This defines an alias from zh-Hant-TW to zh-Hant. We must then also support the zh-Hant-TW alias without the script subtag: zh-TW. This transitively maps zh-TW to zh-Hant, which is a case quite heavily tested by test262. --- .../LibUnicode/GenerateUnicodeLocale.cpp | 43 +++++++++++++++++++ Tests/LibUnicode/TestUnicodeLocale.cpp | 8 ++++ 2 files changed, 51 insertions(+) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp index 78e64a861c4..f8cf25ec5a0 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp @@ -427,6 +427,48 @@ static void parse_default_content_locales(String core_path, UnicodeLocaleData& l }); } +static void define_aliases_without_scripts(UnicodeLocaleData& locale_data) +{ + // From ECMA-402: https://tc39.es/ecma402/#sec-internal-slots + // + // For locales that include a script subtag in addition to language and region, the + // corresponding locale without a script subtag must also be supported. 
+ // + // So we define aliases for locales that contain all three subtags, but we must also take + // care to handle when the locale itself or the locale without a script subtag are an alias + // by way of default-content locales. + auto find_alias = [&](auto const& locale) { + return locale_data.locale_aliases.find_if([&](auto const& alias) { return locale == alias.alias; }); + }; + + auto append_alias_without_script = [&](auto const& locale) { + auto parsed_locale = CanonicalLanguageID::parse(locale_data.unique_strings, locale); + VERIFY(parsed_locale.has_value()); + + if ((parsed_locale->language == 0) || (parsed_locale->script == 0) || (parsed_locale->region == 0)) + return; + + auto locale_without_script = String::formatted("{}-{}", + locale_data.unique_strings.get(parsed_locale->language), + locale_data.unique_strings.get(parsed_locale->region)); + + if (locale_data.locales.contains(locale_without_script)) + return; + if (find_alias(locale_without_script) != locale_data.locale_aliases.end()) + return; + + if (auto it = find_alias(locale); it != locale_data.locale_aliases.end()) + locale_data.locale_aliases.append({ it->name, locale_without_script }); + else + locale_data.locale_aliases.append({ locale, locale_without_script }); + }; + + for (auto const& locale : locale_data.locales) + append_alias_without_script(locale.key); + for (auto const& locale : locale_data.locale_aliases) + append_alias_without_script(locale.alias); +} + static void parse_all_locales(String core_path, String locale_names_path, String misc_path, String numbers_path, UnicodeLocaleData& locale_data) { auto identity_iterator = path_to_dir_iterator(locale_names_path); @@ -508,6 +550,7 @@ static void parse_all_locales(String core_path, String locale_names_path, String } parse_default_content_locales(move(core_path), locale_data); + define_aliases_without_scripts(locale_data); } static String format_identifier(StringView owner, String identifier) diff --git 
// Checks that locales reachable only through aliases are reported as available, using the
// zh-TW chain: zh-TW -> zh-Hant-TW -> zh-Hant.
TEST_CASE(supports_locale_aliases)
{
    EXPECT(Unicode::is_locale_available("zh"sv));
    // zh-Hant is the locale that the two aliases below ultimately resolve to.
    EXPECT(Unicode::is_locale_available("zh-Hant"sv));
    // zh-TW is zh-Hant-TW with the script subtag removed; ECMA-402 requires it to be supported.
    EXPECT(Unicode::is_locale_available("zh-TW"sv));
    // zh-Hant-TW is a default-content locale aliased to zh-Hant.
    EXPECT(Unicode::is_locale_available("zh-Hant-TW"sv));
}