From 357c97dfa864dbd779d517bac502858aa2618b96 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Tue, 9 Nov 2021 13:32:23 -0500 Subject: [PATCH] LibUnicode: Parse the CLDR's defaultContent.json locale list This file contains the list of locales which default to their parent locale's values. In the core CLDR dataset, these locales have their own files, but they are empty (except for identity data). For example: https://github.com/unicode-org/cldr/blob/main/common/main/en_US.xml In the JSON export, these files are excluded, so iterating the locale files alone does not currently let us recognize these locales. This is a prerequisite for upgrading to CLDR version 40. One of these default-content locales is the popular "en-US" locale, which defaults to "en" values. We were previously inferring the existence of this locale from the "en-US-POSIX" locale (many implementations, including ours, strip variants such as POSIX). However, v40 removes the "en-US-POSIX" locale entirely, meaning that without this change, we wouldn't know that "en-US" exists (we would default to "en"). 
For more detail on this and other v40 changes, see: https://cldr.unicode.org/index/downloads/cldr-40#h.nssoo2lq3cba --- .../LibUnicode/GenerateUnicodeLocale.cpp | 37 ++++++++++++++++++- Tests/LibUnicode/TestUnicodeLocale.cpp | 6 +++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp index 7a2e22f1a4b..7ea7c2e3f1e 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp @@ -464,6 +464,39 @@ static void parse_numeric_keywords(String locale_numbers_path, UnicodeLocaleData locale_data.keywords.append(key); } +static void parse_default_content_locales(String core_path, UnicodeLocaleData& locale_data) +{ + LexicalPath default_content_path(move(core_path)); + default_content_path = default_content_path.append("defaultContent.json"sv); + VERIFY(Core::File::exists(default_content_path.string())); + + auto default_content_file_or_error = Core::File::open(default_content_path.string(), Core::OpenMode::ReadOnly); + VERIFY(!default_content_file_or_error.is_error()); + + auto default_content = JsonParser(default_content_file_or_error.value()->read_all()).parse(); + VERIFY(default_content.has_value()); + + auto const& default_content_array = default_content->as_object().get("defaultContent"sv); + + default_content_array.as_array().for_each([&](JsonValue const& value) { + auto locale = value.as_string(); + StringView default_locale = locale; + + while (true) { + if (locale_data.locales.contains(default_locale)) + break; + + auto pos = default_locale.find_last('-'); + if (!pos.has_value()) + return; + + default_locale = default_locale.substring_view(0, *pos); + } + + locale_data.locales.set(locale, locale_data.locales.get(default_locale).value()); + }); +} + static Core::DirIterator path_to_dir_iterator(String path) { LexicalPath 
lexical_path(move(path)); @@ -486,7 +519,7 @@ static void parse_all_locales(String core_path, String locale_names_path, String auto misc_iterator = path_to_dir_iterator(move(misc_path)); auto numbers_iterator = path_to_dir_iterator(move(numbers_path)); - LexicalPath core_supplemental_path(move(core_path)); + LexicalPath core_supplemental_path(core_path); core_supplemental_path = core_supplemental_path.append("supplemental"sv); VERIFY(Core::File::is_directory(core_supplemental_path.string())); @@ -558,6 +591,8 @@ static void parse_all_locales(String core_path, String locale_names_path, String parse_locale_currencies(numbers_path, locale_data, locale); parse_numeric_keywords(numbers_path, locale_data, locale); } + + parse_default_content_locales(move(core_path), locale_data); } static String format_identifier(StringView owner, String identifier) diff --git a/Tests/LibUnicode/TestUnicodeLocale.cpp b/Tests/LibUnicode/TestUnicodeLocale.cpp index a82b9eae2b8..34619c8cd78 100644 --- a/Tests/LibUnicode/TestUnicodeLocale.cpp +++ b/Tests/LibUnicode/TestUnicodeLocale.cpp @@ -456,4 +456,10 @@ TEST_CASE(canonicalize_unicode_locale_id) test("ZH-XIANG"sv, "hsn"sv); test("ja-latn-hepburn-heploc"sv, "ja-Latn-alalc97"sv); test("JA-LATN-HEPBURN-HEPLOC"sv, "ja-Latn-alalc97"sv); + + // Default content. + test("en-us"sv, "en-US"sv); + test("EN-US"sv, "en-US"sv); + test("zh-Hans-CN"sv, "zh-Hans-CN"sv); + test("ZH-HANS-CN"sv, "zh-Hans-CN"sv); }