LibUnicode: Parse locale extensions of the transformed extension form

This commit is contained in:
Timothy Flynn 2021-08-27 17:11:48 -04:00 committed by Linus Groh
parent eda92d15e4
commit d2d304fcf8
Notes: sideshowbarker 2024-07-18 05:04:40 +09:00
3 changed files with 176 additions and 2 deletions

View File

@ -146,6 +146,72 @@ TEST_CASE(parse_unicode_locale_id_with_unicode_locale_extension)
pass("en-u-fff-gggg-xx-yyyy"sv, { { "fff"sv, "gggg"sv }, { { "xx"sv, { "yyyy"sv } } } });
}
// Exercises parsing of locale IDs that carry a transformed extension
// ("-t-..."), covering both rejected and accepted inputs.
TEST_CASE(parse_unicode_locale_id_with_transformed_extension)
{
    // Expects the parser to reject `locale` entirely.
    auto fail = [](StringView locale) {
        auto locale_id = Unicode::parse_unicode_locale_id(locale);
        EXPECT(!locale_id.has_value());
    };

    // Expects `locale` to parse into exactly one extension, which must be a
    // TransformedExtension equal (language and fields) to `expected_extension`.
    auto pass = [](StringView locale, Unicode::TransformedExtension const& expected_extension) {
        auto locale_id = Unicode::parse_unicode_locale_id(locale);
        VERIFY(locale_id.has_value());

        EXPECT_EQ(locale_id->extensions.size(), 1u);
        // EXPECT_EQ does not stop execution; bail out here so the failure
        // reported above is not followed by an out-of-range index below.
        if (locale_id->extensions.is_empty())
            return;

        auto const& actual_extension = locale_id->extensions[0].get<Unicode::TransformedExtension>();

        VERIFY(actual_extension.language.has_value() == expected_extension.language.has_value());
        if (actual_extension.language.has_value()) {
            EXPECT_EQ(actual_extension.language->language, expected_extension.language->language);
            EXPECT_EQ(actual_extension.language->script, expected_extension.language->script);
            EXPECT_EQ(actual_extension.language->region, expected_extension.language->region);
            EXPECT_EQ(actual_extension.language->variants, expected_extension.language->variants);
        }

        EXPECT_EQ(actual_extension.fields.size(), expected_extension.fields.size());
        // Only compare element-wise when both lists have the same length;
        // otherwise indexing `expected_extension.fields` could run past its end.
        if (actual_extension.fields.size() != expected_extension.fields.size())
            return;

        for (size_t i = 0; i < actual_extension.fields.size(); ++i) {
            auto const& actual_field = actual_extension.fields[i];
            auto const& expected_field = expected_extension.fields[i];

            EXPECT_EQ(actual_field.key, expected_field.key);
            EXPECT_EQ(actual_field.values, expected_field.values);
        }
    };

    // Inputs that must be rejected.
    fail("en-t"sv);
    fail("en-t-"sv);
    fail("en-t-a"sv);
    fail("en-t-en-"sv);
    fail("en-t-root"sv);
    fail("en-t-aaaaaaaaa"sv);
    fail("en-t-en-aaa"sv);
    fail("en-t-en-latn-latn"sv);
    fail("en-t-en-a"sv);
    fail("en-t-en-00"sv);
    fail("en-t-en-latn-0"sv);
    fail("en-t-en-latn-00"sv);
    fail("en-t-en-latn-xyz"sv);
    fail("en-t-en-aaaaaaaaa"sv);
    fail("en-t-en-latn-gb-aaaa"sv);
    fail("en-t-en-latn-gb-aaaaaaaaa"sv);
    fail("en-t-k0"sv);
    fail("en-t-k0-aa"sv);
    fail("en-t-k0-aaaaaaaaa"sv);

    // Language-only transformed extensions.
    pass("en-t-en"sv, { Unicode::LanguageID { false, "en"sv }, {} });
    pass("en-t-en-latn"sv, { Unicode::LanguageID { false, "en"sv, "latn"sv }, {} });
    pass("en-t-en-us"sv, { Unicode::LanguageID { false, "en"sv, {}, "us"sv }, {} });
    pass("en-t-en-latn-us"sv, { Unicode::LanguageID { false, "en"sv, "latn"sv, "us"sv }, {} });
    pass("en-t-en-posix"sv, { Unicode::LanguageID { false, "en"sv, {}, {}, { "posix"sv } }, {} });
    pass("en-t-en-latn-posix"sv, { Unicode::LanguageID { false, "en"sv, "latn"sv, {}, { "posix"sv } }, {} });
    pass("en-t-en-us-posix"sv, { Unicode::LanguageID { false, "en"sv, {}, "us"sv, { "posix"sv } }, {} });
    pass("en-t-en-latn-us-posix"sv, { Unicode::LanguageID { false, "en"sv, "latn"sv, "us"sv, { "posix"sv } }, {} });

    // Field-only transformed extensions, then language plus field.
    pass("en-t-k0-aaa"sv, { {}, { { "k0"sv, { "aaa"sv } } } });
    pass("en-t-k0-aaa-bbbb"sv, { {}, { { "k0"sv, { "aaa"sv, "bbbb"sv } } } });
    pass("en-t-k0-aaa-k1-bbbb"sv, { {}, { { "k0"sv, { "aaa"sv } }, { "k1"sv, { "bbbb"sv } } } });
    pass("en-t-en-k0-aaa"sv, { Unicode::LanguageID { false, "en"sv }, { { "k0"sv, { "aaa"sv } } } });
}
TEST_CASE(canonicalize_unicode_locale_id)
{
auto test = [](StringView locale, StringView expected_canonical_locale) {

View File

@ -78,6 +78,23 @@ static bool is_attribute(StringView type)
return all_of(type, is_ascii_alphanumeric);
}
// Returns true when `key` is exactly two characters: an ASCII letter followed
// by an ASCII digit.
static bool is_transformed_key(StringView key)
{
    // tkey = alpha digit
    bool const has_key_length = key.length() == 2;

    // Short-circuiting keeps the indexing below from running on other lengths.
    return has_key_length && is_ascii_alpha(key[0]) && is_ascii_digit(key[1]);
}
// Returns true when `value` is a single 3-to-8 character, all-alphanumeric
// (ASCII) segment.
static bool is_single_transformed_value(StringView value)
{
    // tvalue = (sep alphanum{3,8})+
    // Note: Consecutive values are not handled here, that is left to the caller.
    auto const length = value.length();
    bool const length_in_range = (length >= 3) && (length <= 8);

    return length_in_range && all_of(value, is_ascii_alphanumeric);
}
static Optional<StringView> consume_next_segment(GenericLexer& lexer, bool with_separator = true)
{
constexpr auto is_separator = is_any_of("-_"sv);
@ -248,6 +265,81 @@ static Optional<LocaleExtension> parse_unicode_locale_extension(GenericLexer& le
return locale_extension;
}
// Parses the segments following a "t"/"T" singleton into a TransformedExtension.
// Returns an empty Optional when a segment cannot be consumed, when the
// language fails to parse, when a key has no valid values, or when neither a
// language nor any field was found. On encountering a segment that is not a
// field key, the lexer is rewound to just before that segment's separator.
static Optional<TransformedExtension> parse_transformed_extension(GenericLexer& lexer)
{
    // https://unicode.org/reports/tr35/#transformed_extensions
    //
    // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) | (sep tfield)+)
    enum class ParseState {
        ParsingLanguageOrField,
        ParsingLanguage,
        ParsingField,
        Done,
    };

    TransformedExtension result {};
    auto state = ParseState::ParsingLanguageOrField;

    while (!lexer.is_eof() && (state != ParseState::Done)) {
        auto subtag = consume_next_segment(lexer);
        if (!subtag.has_value())
            return {};

        // The very first segment decides whether a tlang is present.
        if (state == ParseState::ParsingLanguageOrField) {
            if (is_unicode_language_subtag(*subtag))
                state = ParseState::ParsingLanguage;
            else
                state = ParseState::ParsingField;
        }

        if (state == ParseState::ParsingLanguage) {
            // Un-read the segment (but not its separator) so the language ID
            // parser sees it from the start.
            lexer.retreat(subtag->length());

            auto language_id = parse_unicode_language_id(lexer);
            if (!language_id.has_value())
                return {};

            result.language = language_id.release_value();
            state = ParseState::ParsingField;
            continue;
        }

        if (state != ParseState::ParsingField)
            VERIFY_NOT_REACHED();

        // tfield = tkey tvalue;
        if (!is_transformed_key(*subtag)) {
            // Not part of this extension: un-read the segment and its separator.
            lexer.retreat(subtag->length() + 1);
            state = ParseState::Done;
            continue;
        }

        TransformedField field { .key = *subtag };

        // Greedily collect every following segment that is a valid value.
        for (;;) {
            auto value = consume_next_segment(lexer);
            if (!value.has_value())
                break;

            if (!is_single_transformed_value(*value)) {
                lexer.retreat(value->length() + 1);
                break;
            }

            field.values.append(*value);
        }

        // A key must be followed by at least one value.
        if (field.values.is_empty())
            return {};

        result.fields.append(move(field));
    }

    if (result.language.has_value() || !result.fields.is_empty())
        return result;

    return {};
}
static Optional<Extension> parse_extension(GenericLexer& lexer)
{
// https://unicode.org/reports/tr35/#extensions
@ -263,8 +355,14 @@ static Optional<Extension> parse_extension(GenericLexer& lexer)
return Extension { extension.release_value() };
break;
case 't':
case 'T':
if (auto extension = parse_transformed_extension(lexer); extension.has_value())
return Extension { extension.release_value() };
break;
default:
// FIXME: Handle transformed_extensions / other_extensions
// FIXME: Handle other_extensions
break;
}
}

View File

@ -33,7 +33,17 @@ struct LocaleExtension {
Vector<Keyword> keywords {};
};
using Extension = Variant<LocaleExtension>;
// One "tfield" of a transformed extension: a key and the values that follow it
// (e.g. "k0-aaa-bbbb" yields key "k0" with values "aaa" and "bbbb").
// NOTE(review): the StringView members presumably refer to the parsed input
// string rather than owning copies; confirm the lifetime against callers.
struct TransformedField {
    // The field key (tkey).
    StringView key;
    // The value segments for this key, in the order they appeared.
    Vector<StringView> values {};
};
// A parsed transformed extension ("-t-..."): an optional language ID followed
// by zero or more key/value fields.
struct TransformedExtension {
    // The language ID (tlang), when the extension contains one.
    Optional<LanguageID> language {};
    // The fields (tfield), in the order they appeared.
    Vector<TransformedField> fields {};
};
using Extension = Variant<LocaleExtension, TransformedExtension>;
struct LocaleID {
LanguageID language_id {};