LibUnicode+LibJS: Store locale keyword values as a single string

Previously, LibUnicode would store the values of a keyword as a Vector.
For example, the locale "en-u-ca-abc-def" would have its keyword "ca"
stored as {"abc, "def"}. Then, canonicalization would occur on each of
the elements in that Vector.

This is incorrect because, for example, the keyword value "true" should
only be dropped if that is the entire value. That is, the canonical form
of "en-u-kb-true" is "en-u-kb", but "en-u-kb-abc-true" does not change
for canonicalization. However, we would canonicalize that locale as
"en-u-kb-abc".
This commit is contained in:
Timothy Flynn 2021-09-08 15:25:35 -04:00 committed by Linus Groh
parent 75657b79c6
commit 4f2bcebe74
Notes: sideshowbarker 2024-07-18 04:25:11 +09:00
5 changed files with 102 additions and 118 deletions

View File

@ -138,7 +138,7 @@ TEST_CASE(parse_unicode_locale_id_with_unicode_locale_extension)
auto const& expected_keyword = expected_extension.keywords[i];
EXPECT_EQ(actual_keyword.key, expected_keyword.key);
EXPECT_EQ(actual_keyword.types, expected_keyword.types);
EXPECT_EQ(actual_keyword.value, expected_keyword.value);
}
};
@ -153,15 +153,15 @@ TEST_CASE(parse_unicode_locale_id_with_unicode_locale_extension)
fail("en-u-xxxxx-"sv);
fail("en-u-xxxxxxxxx"sv);
pass("en-u-xx"sv, { {}, { { "xx"sv, {} } } });
pass("en-u-xx"sv, { {}, { { "xx"sv, ""sv } } });
pass("en-u-xx-yyyy"sv, { {}, { { "xx"sv, { "yyyy"sv } } } });
pass("en-u-xx-yyyy-zzzz"sv, { {}, { { "xx"sv, { "yyyy"sv, "zzzz"sv } } } });
pass("en-u-xx-yyyy-zzzz-aa"sv, { {}, { { "xx"sv, { "yyyy"sv, "zzzz"sv } }, { "aa"sv, {} } } });
pass("en-u-xx-yyyy-zzzz"sv, { {}, { { "xx"sv, "yyyy-zzzz"sv } } });
pass("en-u-xx-yyyy-zzzz-aa"sv, { {}, { { "xx"sv, "yyyy-zzzz"sv }, { "aa"sv, ""sv } } });
pass("en-u-xxx"sv, { { "xxx"sv }, {} });
pass("en-u-fff-gggg"sv, { { "fff"sv, "gggg"sv }, {} });
pass("en-u-fff-xx"sv, { { "fff"sv }, { { "xx"sv, {} } } });
pass("en-u-fff-xx-yyyy"sv, { { "fff"sv }, { { "xx"sv, { "yyyy"sv } } } });
pass("en-u-fff-gggg-xx-yyyy"sv, { { "fff"sv, "gggg"sv }, { { "xx"sv, { "yyyy"sv } } } });
pass("en-u-fff-xx"sv, { { "fff"sv }, { { "xx"sv, ""sv } } });
pass("en-u-fff-xx-yyyy"sv, { { "fff"sv }, { { "xx"sv, "yyyy"sv } } });
pass("en-u-fff-gggg-xx-yyyy"sv, { { "fff"sv, "gggg"sv }, { { "xx"sv, "yyyy"sv } } });
}
TEST_CASE(parse_unicode_locale_id_with_transformed_extension)
@ -192,7 +192,7 @@ TEST_CASE(parse_unicode_locale_id_with_transformed_extension)
auto const& expected_field = expected_extension.fields[i];
EXPECT_EQ(actual_field.key, expected_field.key);
EXPECT_EQ(actual_field.values, expected_field.values);
EXPECT_EQ(actual_field.value, expected_field.value);
}
};
@ -225,9 +225,9 @@ TEST_CASE(parse_unicode_locale_id_with_transformed_extension)
pass("en-t-en-us-posix"sv, { Unicode::LanguageID { false, "en"sv, {}, "us"sv, { "posix"sv } }, {} });
pass("en-t-en-latn-us-posix"sv, { Unicode::LanguageID { false, "en"sv, "latn"sv, "us"sv, { "posix"sv } }, {} });
pass("en-t-k0-aaa"sv, { {}, { { "k0"sv, { "aaa"sv } } } });
pass("en-t-k0-aaa-bbbb"sv, { {}, { { "k0"sv, { "aaa"sv, "bbbb" } } } });
pass("en-t-k0-aaa-k1-bbbb"sv, { {}, { { "k0"sv, { "aaa"sv } }, { "k1"sv, { "bbbb"sv } } } });
pass("en-t-en-k0-aaa"sv, { Unicode::LanguageID { false, "en"sv }, { { "k0"sv, { "aaa"sv } } } });
pass("en-t-k0-aaa-bbbb"sv, { {}, { { "k0"sv, "aaa-bbbb"sv } } });
pass("en-t-k0-aaa-k1-bbbb"sv, { {}, { { "k0"sv, { "aaa"sv } }, { "k1"sv, "bbbb"sv } } });
pass("en-t-en-k0-aaa"sv, { Unicode::LanguageID { false, "en"sv }, { { "k0"sv, "aaa"sv } } });
}
TEST_CASE(parse_unicode_locale_id_with_other_extension)
@ -243,7 +243,7 @@ TEST_CASE(parse_unicode_locale_id_with_other_extension)
auto const& actual_extension = locale_id->extensions[0].get<Unicode::OtherExtension>();
EXPECT_EQ(actual_extension.key, expected_extension.key);
EXPECT_EQ(actual_extension.values, expected_extension.values);
EXPECT_EQ(actual_extension.value, expected_extension.value);
};
fail("en-z"sv);
@ -259,9 +259,9 @@ TEST_CASE(parse_unicode_locale_id_with_other_extension)
fail("en-z-aaa-a"sv);
fail("en-0-aaa-a"sv);
pass("en-z-aa", { 'z', { "aa"sv } });
pass("en-z-aa-bbb", { 'z', { "aa"sv, "bbb"sv } });
pass("en-z-aa-bbb-cccccccc", { 'z', { "aa"sv, "bbb"sv, "cccccccc"sv } });
pass("en-z-aa", { 'z', "aa"sv });
pass("en-z-aa-bbb", { 'z', "aa-bbb"sv });
pass("en-z-aa-bbb-cccccccc", { 'z', "aa-bbb-cccccccc"sv });
}
TEST_CASE(parse_unicode_locale_id_with_private_use_extension)
@ -320,8 +320,12 @@ TEST_CASE(canonicalize_unicode_locale_id)
test("EN-U-CCC-BBB-2K-AAA-1K-BBB"sv, "en-u-bbb-ccc-1k-bbb-2k-aaa"sv);
test("en-u-1k-true"sv, "en-u-1k"sv);
test("EN-U-1K-TRUE"sv, "en-u-1k"sv);
test("en-u-1k-true-abcd"sv, "en-u-1k-true-abcd"sv);
test("EN-U-1K-TRUE-ABCD"sv, "en-u-1k-true-abcd"sv);
test("en-u-kb-yes"sv, "en-u-kb"sv);
test("EN-U-KB-YES"sv, "en-u-kb"sv);
test("en-u-kb-yes-abcd"sv, "en-u-kb-yes-abcd"sv);
test("EN-U-KB-YES-ABCD"sv, "en-u-kb-yes-abcd"sv);
test("en-u-ka-yes"sv, "en-u-ka-yes"sv);
test("EN-U-KA-YES"sv, "en-u-ka-yes"sv);
test("en-u-1k-names"sv, "en-u-1k-names"sv);

View File

@ -4,7 +4,6 @@
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/StringBuilder.h>
#include <LibJS/Runtime/GlobalObject.h>
#include <LibJS/Runtime/Intl/Locale.h>
#include <LibUnicode/Locale.h>
@ -27,29 +26,23 @@ Locale::Locale(Unicode::LocaleID const& locale_id, Object& prototype)
{
set_locale(locale_id.to_string());
auto join_keyword_types = [](auto const& types) {
StringBuilder builder;
builder.join('-', types);
return builder.build();
};
for (auto const& extension : locale_id.extensions) {
if (!extension.has<Unicode::LocaleExtension>())
continue;
for (auto const& keyword : extension.get<Unicode::LocaleExtension>().keywords) {
if (keyword.key == "ca"sv) {
set_calendar(join_keyword_types(keyword.types));
set_calendar(keyword.value);
} else if (keyword.key == "co"sv) {
set_collation(join_keyword_types(keyword.types));
set_collation(keyword.value);
} else if (keyword.key == "hc"sv) {
set_hour_cycle(join_keyword_types(keyword.types));
set_hour_cycle(keyword.value);
} else if (keyword.key == "kf"sv) {
set_case_first(join_keyword_types(keyword.types));
set_case_first(keyword.value);
} else if (keyword.key == "kn"sv) {
set_numeric(keyword.types.is_empty());
set_numeric(keyword.value.is_empty());
} else if (keyword.key == "nu"sv) {
set_numbering_system(join_keyword_types(keyword.types));
set_numbering_system(keyword.value);
}
}

View File

@ -6,7 +6,6 @@
#include <AK/Optional.h>
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <LibJS/Runtime/AbstractOperations.h>
#include <LibJS/Runtime/GlobalObject.h>
#include <LibJS/Runtime/Intl/AbstractOperations.h>
@ -193,9 +192,7 @@ static LocaleAndKeys apply_unicode_extension_to_tag(StringView tag, LocaleAndKey
entry = &(*it);
// ii. Let value be entry.[[Value]].
StringBuilder builder;
builder.join('-', entry->types);
value = builder.build();
value = entry->value;
}
// c. Else,
// i. Let entry be empty.
@ -213,12 +210,12 @@ static LocaleAndKeys apply_unicode_extension_to_tag(StringView tag, LocaleAndKey
// iii. If entry is not empty, then
if (entry != nullptr) {
// 1. Set entry.[[Value]] to value.
entry->types = value->split('-');
entry->value = *value;
}
// iv. Else,
else {
// 1. Append the Record { [[Key]]: key, [[Value]]: value } to keywords.
keywords.append({ key, value->split('-') });
keywords.append({ key, *value });
}
}

View File

@ -211,6 +211,7 @@ static Optional<LocaleExtension> parse_unicode_locale_extension(GenericLexer& le
case ParseState::ParsingKeyword: {
// keyword = key (sep type)?
Keyword keyword { .key = *segment };
Vector<StringView> keyword_values;
if (!is_key(*segment)) {
lexer.retreat(segment->length() + 1);
@ -227,9 +228,13 @@ static Optional<LocaleExtension> parse_unicode_locale_extension(GenericLexer& le
break;
}
keyword.types.append(*type);
keyword_values.append(*type);
}
StringBuilder builder;
builder.join('-', keyword_values);
keyword.value = builder.build();
locale_extension.keywords.append(move(keyword));
break;
}
@ -283,6 +288,7 @@ static Optional<TransformedExtension> parse_transformed_extension(GenericLexer&
case ParseState::ParsingField: {
// tfield = tkey tvalue;
TransformedField field { .key = *segment };
Vector<StringView> field_values;
if (!is_transformed_key(*segment)) {
lexer.retreat(segment->length() + 1);
@ -299,12 +305,16 @@ static Optional<TransformedExtension> parse_transformed_extension(GenericLexer&
break;
}
field.values.append(*value);
field_values.append(*value);
}
if (field.values.is_empty())
if (field_values.is_empty())
return {};
StringBuilder builder;
builder.join('-', field_values);
field.value = builder.build();
transformed_extension.fields.append(move(field));
break;
}
@ -325,6 +335,7 @@ static Optional<OtherExtension> parse_other_extension(char key, GenericLexer& le
//
// other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
OtherExtension other_extension { .key = key };
Vector<StringView> other_values;
if (!is_ascii_alphanumeric(key) || (key == 'x') || (key == 'X'))
return {};
@ -339,11 +350,16 @@ static Optional<OtherExtension> parse_other_extension(char key, GenericLexer& le
break;
}
other_extension.values.append(*segment);
other_values.append(*segment);
}
if (other_extension.values.is_empty())
if (other_values.is_empty())
return {};
StringBuilder builder;
builder.join('-', other_values);
other_extension.value = builder.build();
return other_extension;
}
@ -469,8 +485,11 @@ static void perform_hard_coded_key_value_substitutions(String& key, String& valu
// https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/transform.xml
//
// There isn't yet a counterpart in the JSON export. See: https://unicode-org.atlassian.net/browse/CLDR-14571
if ((key == "ca"sv) && (value == "islamicc"sv)) {
value = "islamic-civil"sv;
if (key == "ca"sv) {
if (value == "islamicc"sv)
value = "islamic-civil"sv;
else if (value == "ethiopic-amete-alem"sv)
value = "ethioaa"sv;
} else if (key.is_one_of("kb"sv, "kc"sv, "kh"sv, "kk"sv, "kn"sv) && (value == "yes"sv)) {
value = "true"sv;
} else if (key == "ks"sv) {
@ -521,20 +540,6 @@ static void perform_hard_coded_key_value_substitutions(String& key, String& valu
}
}
static void perform_hard_coded_key_multi_value_substitutions(String const& key, Vector<String>& values)
{
// Similar to perform_hard_coded_key_value_substitutions, some aliases depend on multiple
// variants being present in the original locale. Those are canonicalized separately here.
// https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/calendar.xml
if ((key != "ca"sv) || (values.size() != 3))
return;
static Vector<String> ethiopic_amete_alem { "ethiopic"sv, "amete"sv, "alem"sv };
if (values == ethiopic_amete_alem)
values = { "ethioaa"sv };
}
static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id)
{
auto canonicalize_language = [](LanguageID& language_id, bool force_lowercase) {
@ -589,39 +594,32 @@ static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id)
}
};
auto canonicalize_key_value_list = [&](auto& key, auto& values, bool remove_true_values) {
auto canonicalize_key_value_list = [&](auto& key, auto& value, bool remove_true_values) {
key = key.to_lowercase();
value = value.to_lowercase();
auto raw_values = move(values);
perform_hard_coded_key_value_substitutions(key, value);
for (auto& value : raw_values) {
value = value.to_lowercase();
perform_hard_coded_key_value_substitutions(key, value);
// Note: The spec says to remove "true" type and tfield values but that is believed to be a bug in the spec
// because, for tvalues, that would result in invalid syntax:
// https://unicode-org.atlassian.net/browse/CLDR-14318
// This has also been noted by test262:
// https://github.com/tc39/test262/blob/18bb955771669541c56c28748603f6afdb2e25ff/test/intl402/Intl/getCanonicalLocales/transformed-ext-canonical.js
if (remove_true_values && (value == "true"sv))
continue;
if (key.is_one_of("sd"sv, "rg"sv)) {
if (auto alias = resolve_subdivision_alias(value); alias.has_value()) {
auto aliases = alias->split_view(' ');
// FIXME: Subdivision subtags do not appear in the CLDR likelySubtags.json file.
// Implement the spec's recommendation of using just the first alias for now,
// but we should determine if there's anything else needed here.
values.append(aliases[0].to_string());
continue;
}
}
values.append(move(value));
// Note: The spec says to remove "true" type and tfield values but that is believed to be a bug in the spec
// because, for tvalues, that would result in invalid syntax:
// https://unicode-org.atlassian.net/browse/CLDR-14318
// This has also been noted by test262:
// https://github.com/tc39/test262/blob/18bb955771669541c56c28748603f6afdb2e25ff/test/intl402/Intl/getCanonicalLocales/transformed-ext-canonical.js
if (remove_true_values && (value == "true"sv)) {
value = {};
return;
}
perform_hard_coded_key_multi_value_substitutions(key, values);
if (key.is_one_of("sd"sv, "rg"sv)) {
if (auto alias = resolve_subdivision_alias(value); alias.has_value()) {
auto aliases = alias->split_view(' ');
// FIXME: Subdivision subtags do not appear in the CLDR likelySubtags.json file.
// Implement the spec's recommendation of using just the first alias for now,
// but we should determine if there's anything else needed here.
value = aliases[0].to_string();
}
}
};
canonicalize_language(locale_id.language_id, false);
@ -643,7 +641,7 @@ static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id)
for (auto& attribute : ext.attributes)
attribute = attribute.to_lowercase();
for (auto& keyword : ext.keywords)
canonicalize_key_value_list(keyword.key, keyword.types, true);
canonicalize_key_value_list(keyword.key, keyword.value, true);
quick_sort(ext.attributes);
quick_sort(ext.keywords, [](auto const& a, auto const& b) { return a.key < b.key; });
@ -653,14 +651,13 @@ static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id)
canonicalize_language(*ext.language, true);
for (auto& field : ext.fields)
canonicalize_key_value_list(field.key, field.values, false);
canonicalize_key_value_list(field.key, field.value, false);
quick_sort(ext.fields, [](auto const& a, auto const& b) { return a.key < b.key; });
},
[&](OtherExtension& ext) {
ext.key = static_cast<char>(to_ascii_lowercase(ext.key));
for (auto& value : ext.values)
value = value.to_lowercase();
ext.value = ext.value.to_lowercase();
});
}
@ -674,7 +671,7 @@ Optional<String> canonicalize_unicode_locale_id(LocaleID& locale_id)
StringBuilder builder;
auto append_sep_and_string = [&](Optional<String> const& string) {
if (!string.has_value())
if (!string.has_value() || string->is_empty())
return;
builder.appendff("-{}", *string);
};
@ -690,13 +687,6 @@ Optional<String> canonicalize_unicode_locale_id(LocaleID& locale_id)
for (auto const& variant : locale_id.language_id.variants)
append_sep_and_string(variant);
auto append_key_value_list = [&](auto const& key, auto& values) {
append_sep_and_string(key);
for (auto& value : values)
append_sep_and_string(value);
};
for (auto const& extension : locale_id.extensions) {
extension.visit(
[&](LocaleExtension const& ext) {
@ -704,8 +694,10 @@ Optional<String> canonicalize_unicode_locale_id(LocaleID& locale_id)
for (auto const& attribute : ext.attributes)
append_sep_and_string(attribute);
for (auto const& keyword : ext.keywords)
append_key_value_list(keyword.key, keyword.types);
for (auto const& keyword : ext.keywords) {
append_sep_and_string(keyword.key);
append_sep_and_string(keyword.value);
}
},
[&](TransformedExtension const& ext) {
builder.append("-t"sv);
@ -718,13 +710,14 @@ Optional<String> canonicalize_unicode_locale_id(LocaleID& locale_id)
append_sep_and_string(variant);
}
for (auto const& field : ext.fields)
append_key_value_list(field.key, field.values);
for (auto const& field : ext.fields) {
append_sep_and_string(field.key);
append_sep_and_string(field.value);
}
},
[&](OtherExtension const& ext) {
builder.appendff("-{:c}", to_ascii_lowercase(ext.key));
for (auto const& value : ext.values)
append_sep_and_string(value);
append_sep_and_string(ext.value);
});
}
@ -947,19 +940,13 @@ String LocaleID::to_string() const
StringBuilder builder;
auto append_segment = [&](Optional<String> const& segment) {
if (!segment.has_value())
if (!segment.has_value() || segment->is_empty())
return;
if (!builder.is_empty())
builder.append('-');
builder.append(*segment);
};
auto append_key_value_list = [&](auto const& key, auto const& values) {
append_segment(key);
for (auto const& value : values)
append_segment(value);
};
append_segment(language_id.to_string());
for (auto const& extension : extensions) {
@ -968,20 +955,23 @@ String LocaleID::to_string() const
builder.append("-u"sv);
for (auto const& attribute : ext.attributes)
append_segment(attribute);
for (auto const& keyword : ext.keywords)
append_key_value_list(keyword.key, keyword.types);
for (auto const& keyword : ext.keywords) {
append_segment(keyword.key);
append_segment(keyword.value);
}
},
[&](TransformedExtension const& ext) {
builder.append("-t"sv);
if (ext.language.has_value())
append_segment(ext.language->to_string());
for (auto const& field : ext.fields)
append_key_value_list(field.key, field.values);
for (auto const& field : ext.fields) {
append_segment(field.key);
append_segment(field.value);
}
},
[&](OtherExtension const& ext) {
builder.appendff("-{}", ext.key);
for (auto const& value : ext.values)
append_segment(value);
append_segment(ext.value);
});
}

View File

@ -29,7 +29,7 @@ struct LanguageID {
struct Keyword {
String key {};
Vector<String> types {};
String value {};
};
struct LocaleExtension {
@ -38,8 +38,8 @@ struct LocaleExtension {
};
struct TransformedField {
String key;
Vector<String> values {};
String key {};
String value {};
};
struct TransformedExtension {
@ -49,7 +49,7 @@ struct TransformedExtension {
struct OtherExtension {
char key {};
Vector<String> values {};
String value {};
};
using Extension = Variant<LocaleExtension, TransformedExtension, OtherExtension>;