[Intl] Fix Locale Canonicalization bugs

Bug: v8:9613, v8:10447
Change-Id: Iff43b298c6edaa9b258038ae15406d5df209e8b5
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2543266
Commit-Queue: Frank Tang <ftang@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71267}
This commit is contained in:
Frank Tang 2020-11-17 16:43:06 -08:00 committed by Commit Bot
parent 300573aca8
commit b346af5424
6 changed files with 78 additions and 40 deletions

View File

@ -744,22 +744,12 @@ bool IsTwoLetterLanguage(const std::string& locale) {
IsAsciiLower(locale[1]);
}
bool IsDeprecatedLanguage(const std::string& locale) {
bool IsDeprecatedOrLegacyLanguage(const std::string& locale) {
// Check if locale is one of the deprecated language tags:
return locale == "in" || locale == "iw" || locale == "ji" || locale == "jw" ||
locale == "mo";
}
// Reference:
// https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
bool IsGrandfatheredTagWithoutPreferredVaule(const std::string& locale) {
if (V8_UNLIKELY(locale == "zh-min" || locale == "cel-gaulish")) return true;
if (locale.length() > 6 /* i-mingo is 7 chars long */ &&
V8_UNLIKELY(locale[0] == 'i' && locale[1] == '-')) {
return locale.substr(2) == "default" || locale.substr(2) == "enochian" ||
locale.substr(2) == "mingo";
}
return false;
locale == "mo" ||
// Check if locale is one of the legacy language tags:
locale == "sh" || locale == "tl" || locale == "no";
}
bool IsStructurallyValidLanguageTag(const std::string& tag) {
@ -788,7 +778,7 @@ Maybe<std::string> CanonicalizeLanguageTag(Isolate* isolate,
// (in, iw, ji, jw). Don't check for ~70 of 3-letter deprecated language
// codes. Instead, let them be handled by ICU in the slow path. However,
// fast-track 'fil' (3-letter canonical code).
if ((IsTwoLetterLanguage(locale) && !IsDeprecatedLanguage(locale)) ||
if ((IsTwoLetterLanguage(locale) && !IsDeprecatedOrLegacyLanguage(locale)) ||
locale == "fil") {
return Just(locale);
}
@ -797,13 +787,6 @@ Maybe<std::string> CanonicalizeLanguageTag(Isolate* isolate,
// the input before any more check.
std::transform(locale.begin(), locale.end(), locale.begin(), ToAsciiLower);
// ICU maps a few grandfathered tags to what looks like a regular language
// tag even though IANA language tag registry does not have a preferred
// entry map for them. Return them as they're with lowercasing.
if (IsGrandfatheredTagWithoutPreferredVaule(locale)) {
return Just(locale);
}
// ECMA 402 6.2.3
// TODO(jshin): uloc_{for,to}LanguageTag can fail even for a structurally valid
// language tag if it's too long (much longer than 100 chars). Even if we
@ -817,6 +800,32 @@ Maybe<std::string> CanonicalizeLanguageTag(Isolate* isolate,
// is structurally valid. Due to a couple of bugs, we can't use it
// without Chromium patches or ICU 62 or earlier.
icu::Locale icu_locale = icu::Locale::forLanguageTag(locale.c_str(), error);
if (U_FAILURE(error) || icu_locale.isBogus()) {
THROW_NEW_ERROR_RETURN_VALUE(
isolate,
NewRangeError(
MessageTemplate::kInvalidLanguageTag,
isolate->factory()->NewStringFromAsciiChecked(locale.c_str())),
Nothing<std::string>());
}
// Reject an attribute whose length is outside the 3 to 8 character range.
if (std::strstr(icu_locale.getName(), "attribute=") != nullptr) {
std::string attribute =
icu_locale.getKeywordValue<std::string>("attribute", error);
if (U_SUCCESS(error) &&
(attribute.length() < 3 || attribute.length() > 8)) {
THROW_NEW_ERROR_RETURN_VALUE(
isolate,
NewRangeError(
MessageTemplate::kInvalidLanguageTag,
isolate->factory()->NewStringFromAsciiChecked(locale.c_str())),
Nothing<std::string>());
}
}
icu_locale.canonicalize(error);
if (U_FAILURE(error) || icu_locale.isBogus()) {
THROW_NEW_ERROR_RETURN_VALUE(
isolate,

View File

@ -104,6 +104,9 @@ Handle<Object> UnicodeKeywordValue(Isolate* isolate, Handle<JSLocale> locale,
if (value == "yes") {
value = "true";
}
if (value == "true" && strcmp(key, "kf") == 0) {
return isolate->factory()->NewStringFromStaticChars("");
}
return isolate->factory()->NewStringFromAsciiChecked(value.c_str());
}
@ -242,10 +245,12 @@ Maybe<bool> ApplyOptionsToTag(Isolate* isolate, Handle<String> tag,
return Just(false);
}
UErrorCode status = U_ZERO_ERROR;
builder->build(status);
icu::Locale canonicalized = builder->build(status);
canonicalized.canonicalize(status);
if (U_FAILURE(status)) {
return Just(false);
}
builder->setLocale(canonicalized);
// 3. Let language be ? GetOption(options, "language", "string", undefined,
// undefined).
@ -346,6 +351,9 @@ MaybeHandle<JSLocale> JSLocale::New(Isolate* isolate, Handle<Map> map,
MAYBE_RETURN(maybe_insert, MaybeHandle<JSLocale>());
UErrorCode status = U_ZERO_ERROR;
icu::Locale icu_locale = builder.build(status);
icu_locale.canonicalize(status);
if (!maybe_insert.FromJust() || U_FAILURE(status)) {
THROW_NEW_ERROR(isolate,
NewRangeError(MessageTemplate::kLocaleBadParameters),

View File

@ -14,3 +14,15 @@ assertEquals("en-u-ca-gregory", Intl.getCanonicalLocales("en-u-ca-gregory-ca-chi
// Check duplicate subtags (after the first tag) are detected.
assertThrows(() => Intl.getCanonicalLocales("en-foobar-foobar"), RangeError);
// Check some common cases.
assertEquals("id", Intl.getCanonicalLocales("in")[0]);
assertEquals("he", Intl.getCanonicalLocales("iw")[0]);
assertEquals("yi", Intl.getCanonicalLocales("ji")[0]);
assertEquals("jv", Intl.getCanonicalLocales("jw")[0]);
assertEquals("ro", Intl.getCanonicalLocales("mo")[0]);
assertEquals("sr", Intl.getCanonicalLocales("scc")[0]);
assertEquals("sr-Latn", Intl.getCanonicalLocales("sh")[0]);
assertEquals("sr-ME", Intl.getCanonicalLocales("cnr")[0]);
assertEquals("nb", Intl.getCanonicalLocales("no")[0]);
assertEquals("fil", Intl.getCanonicalLocales("tl")[0]);

View File

@ -7,7 +7,7 @@
// tag registry. Nonetheless, ICU cooks up a value when canonicalizing.
// v8 used to work around that ICU issue; it now returns ICU's canonical form.
// See https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
["cel-gaulish", "cel-gaulish"],
["cel-gaulish", "xtg-x-cel-gaulish"],
// Matching should be case-insensitive.
].forEach(([inputLocale, expectedLocale]) => {

View File

@ -0,0 +1,19 @@
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Each assertion builds an Intl.Locale from a non-canonical tag and checks
// that its string form is the expected canonical tag.

// Language subtags that are replaced by a different language subtag, in some
// cases together with an added script (sh) or region (cnr).
assertEquals("id", (new Intl.Locale("in")).toString());
assertEquals("he", (new Intl.Locale("iw")).toString());
assertEquals("yi", (new Intl.Locale("ji")).toString());
assertEquals("jv", (new Intl.Locale("jw")).toString());
assertEquals("ro", (new Intl.Locale("mo")).toString());
assertEquals("sr", (new Intl.Locale("scc")).toString());
assertEquals("hr", (new Intl.Locale("scr")).toString());
assertEquals("sr-Latn", (new Intl.Locale("sh")).toString());
assertEquals("sr-ME", (new Intl.Locale("cnr")).toString());
assertEquals("nb", (new Intl.Locale("no")).toString());
assertEquals("fil", (new Intl.Locale("tl")).toString());
// Region subtag SU is replaced by a region that depends on the language.
assertEquals("hy-AM", (new Intl.Locale("hy-SU")).toString());
assertEquals("lv-LV", (new Intl.Locale("lv-SU")).toString());

View File

@ -546,29 +546,22 @@
# https://bugs.chromium.org/p/v8/issues/detail?id=9049
'language/comments/hashbang/use-strict': [SKIP],
# https://bugs.chromium.org/p/v8/issues/detail?id=9613
# https://github.com/tc39/test262/pull/2903
'intl402/Intl/getCanonicalLocales/canonicalized-tags': [FAIL],
'intl402/Intl/getCanonicalLocales/grandfathered': [FAIL],
'intl402/Intl/getCanonicalLocales/preferred-grandfathered': [FAIL],
'intl402/Intl/getCanonicalLocales/preferred-variant': [FAIL],
'intl402/Locale/constructor-apply-options-canonicalizes-twice': [FAIL],
# https://bugs.chromium.org/p/v8/issues/detail?id=9613
'intl402/Locale/likely-subtags-grandfathered': [FAIL],
# http://crbug/v8/11039
'intl402/Locale/reject-duplicate-variants-in-tlang': [FAIL],
# http://crbug/v8/10447
'intl402/Intl/getCanonicalLocales/complex-language-subtag-replacement': [FAIL],
'intl402/Intl/getCanonicalLocales/complex-region-subtag-replacement': [FAIL],
'intl402/Intl/getCanonicalLocales/transformed-ext-canonical': [FAIL],
'intl402/Intl/getCanonicalLocales/transformed-ext-invalid': [FAIL],
'intl402/Intl/getCanonicalLocales/unicode-ext-canonicalize-region': [FAIL],
'intl402/Intl/getCanonicalLocales/unicode-ext-canonicalize-subdivision': [FAIL],
'intl402/Intl/getCanonicalLocales/unicode-ext-canonicalize-yes-to-true': [FAIL],
'intl402/Intl/getCanonicalLocales/unicode-ext-key-with-digit': [FAIL],
# https://bugs.chromium.org/p/v8/issues/detail?id=9742
'intl402/Locale/getters': [FAIL],
# https://github.com/tc39/test262/pull/2349
'intl402/Locale/constructor-options-region-valid': [FAIL],
# http://crbug/v8/11174
'intl402/DateTimeFormat/intl-legacy-constructed-symbol': [FAIL],
@ -653,9 +646,6 @@
'built-ins/TypedArray/prototype/item/returns-undefined-for-holes-in-sparse-arrays': [FAIL],
'built-ins/TypedArray/prototype/item/returns-undefined-for-out-of-range-index': [FAIL],
# http://crbug/v8/11039
'intl402/Locale/reject-duplicate-variants-in-tlang': [FAIL],
######################## NEEDS INVESTIGATION ###########################
# https://bugs.chromium.org/p/v8/issues/detail?id=7833