[Intl] Refactor LookupSupportedLocales
Fix spec non compliance by only trimming the unicode locales and not all extensions. Remove regexp and just use straightforward string manipulation. Bug: v8:5751 Cq-Include-Trybots: luci.v8.try:v8_linux_noi18n_rel_ng Change-Id: Ie95828a8f62834daf8cde189f408e95a14e796fe Reviewed-on: https://chromium-review.googlesource.com/c/1255556 Commit-Queue: Sathya Gunasekaran <gsathya@chromium.org> Reviewed-by: Adam Klein <adamk@chromium.org> Cr-Commit-Position: refs/heads/master@{#56458}
This commit is contained in:
parent
766ab3a5a3
commit
d58f40b63b
@ -1198,96 +1198,108 @@ Maybe<bool> Intl::SetNumberFormatDigitOptions(Isolate* isolate,
|
||||
|
||||
namespace {
|
||||
|
||||
// ECMA 402 9.2.2 BestAvailableLocale(availableLocales, locale)
|
||||
// https://tc39.github.io/ecma402/#sec-bestavailablelocale
|
||||
std::string BestAvailableLocale(std::set<std::string> available_locales,
|
||||
std::string locale) {
|
||||
const char separator = '-';
|
||||
|
||||
// ecma402/#sec-bestavailablelocale
|
||||
std::string BestAvailableLocale(const std::set<std::string>& available_locales,
|
||||
const std::string& locale) {
|
||||
// 1. Let candidate be locale.
|
||||
std::string candidate = locale;
|
||||
|
||||
// 2. Repeat,
|
||||
do {
|
||||
while (true) {
|
||||
// 2.a. If availableLocales contains an element equal to candidate, return
|
||||
// candidate.
|
||||
if (available_locales.find(locale) != available_locales.end()) {
|
||||
return locale;
|
||||
if (available_locales.find(candidate) != available_locales.end()) {
|
||||
return candidate;
|
||||
}
|
||||
|
||||
// 2.b. Let pos be the character index of the last occurrence of "-"
|
||||
// (U+002D) within candidate. If that character does not occur, return
|
||||
// undefined.
|
||||
size_t pos = locale.rfind(separator);
|
||||
size_t pos = candidate.rfind('-');
|
||||
if (pos == std::string::npos) {
|
||||
return "";
|
||||
return std::string();
|
||||
}
|
||||
|
||||
// 2.c. If pos ≥ 2 and the character "-" occurs at index pos-2 of candidate,
|
||||
// decrease pos by 2.
|
||||
if (pos >= 2 && locale[pos - 2] == separator) {
|
||||
if (pos >= 2 && candidate[pos - 2] == '-') {
|
||||
pos -= 2;
|
||||
}
|
||||
|
||||
// 2.d. Let candidate be the substring of candidate from position 0,
|
||||
// inclusive, to position pos, exclusive.
|
||||
locale = locale.substr(0, pos);
|
||||
} while (true);
|
||||
candidate = candidate.substr(0, pos);
|
||||
}
|
||||
}
|
||||
|
||||
#define ANY_EXTENSION_REGEXP "-[a-z0-9]{1}-.*"
|
||||
// Removes unicode extensions from a given bcp47 language tag.
|
||||
// For example, converts 'en-US-u-co-emoji' to 'en-US'.
|
||||
std::string RemoveUnicodeExtensions(const std::string& locale) {
|
||||
size_t length = locale.length();
|
||||
|
||||
std::unique_ptr<icu::RegexMatcher> GetAnyExtensionRegexpMatcher() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
std::unique_ptr<icu::RegexMatcher> matcher(new icu::RegexMatcher(
|
||||
icu::UnicodeString(ANY_EXTENSION_REGEXP, -1, US_INV), 0, status));
|
||||
DCHECK(U_SUCCESS(status));
|
||||
return matcher;
|
||||
// Privateuse or grandfathered locales have no extension sequences.
|
||||
if ((length > 1) && (locale[1] == '-')) {
|
||||
// Check to make sure that this really is a grandfathered or
|
||||
// privateuse extension. ICU can sometimes mess up the
|
||||
// canonicalization.
|
||||
CHECK(locale[0] == 'x' || locale[0] == 'i');
|
||||
return locale;
|
||||
}
|
||||
|
||||
size_t unicode_extension_start = locale.find("-u-");
|
||||
|
||||
// No unicode extensions found.
|
||||
if (unicode_extension_start == std::string::npos) return locale;
|
||||
|
||||
size_t private_extension_start = locale.find("-x-");
|
||||
|
||||
// Unicode extensions found within privateuse subtags don't count.
|
||||
if (private_extension_start != std::string::npos &&
|
||||
private_extension_start < unicode_extension_start) {
|
||||
return locale;
|
||||
}
|
||||
|
||||
const std::string beginning = locale.substr(0, unicode_extension_start);
|
||||
size_t unicode_extension_end = length;
|
||||
DCHECK_GT(length, 2);
|
||||
|
||||
// Find the end of the extension production as per the bcp47 grammar
|
||||
// by looking for '-' followed by 2 chars and then another '-'.
|
||||
for (size_t i = unicode_extension_start + 1; i < length - 2; i++) {
|
||||
if (locale[i] != '-') continue;
|
||||
|
||||
if (locale[i + 2] == '-') {
|
||||
unicode_extension_end = i;
|
||||
break;
|
||||
}
|
||||
|
||||
i += 2;
|
||||
}
|
||||
|
||||
const std::string end = locale.substr(unicode_extension_end);
|
||||
return beginning + end;
|
||||
}
|
||||
|
||||
#undef ANY_EXTENSION_REGEXP
|
||||
|
||||
// ECMA 402 9.2.7 LookupSupportedLocales(availableLocales, requestedLocales)
|
||||
// https://tc39.github.io/ecma402/#sec-lookupsupportedlocales
|
||||
// ecma402/#sec-lookupsupportedlocales
|
||||
std::vector<std::string> LookupSupportedLocales(
|
||||
const std::set<std::string>& available_locales,
|
||||
const std::vector<std::string>& requested_locales) {
|
||||
std::unique_ptr<icu::RegexMatcher> matcher = GetAnyExtensionRegexpMatcher();
|
||||
|
||||
// 1. Let subset be a new empty List.
|
||||
std::vector<std::string> subset;
|
||||
|
||||
// 2. For each element locale of requestedLocales in List order, do
|
||||
for (const auto& locale : requested_locales) {
|
||||
// 2.a. Let noExtensionsLocale be the String value that is locale with all
|
||||
// Unicode locale extension sequences removed.
|
||||
icu::UnicodeString locale_uni(locale.c_str(), -1, US_INV);
|
||||
// TODO(bstell): look at using uloc_forLanguageTag to convert the language
|
||||
// tag to locale id
|
||||
// TODO(bstell): look at using uloc_getBaseName to just get the name without
|
||||
// all the keywords
|
||||
matcher->reset(locale_uni);
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
// TODO(bstell): need to determine if this is the correct behavior.
|
||||
// This matches the JS implementation but might not match the spec.
|
||||
// According to
|
||||
// https://tc39.github.io/ecma402/#sec-unicode-locale-extension-sequences:
|
||||
//
|
||||
// This standard uses the term "Unicode locale extension sequence" for
|
||||
// any substring of a language tag that is not part of a private use
|
||||
// subtag sequence, starts with a separator "-" and the singleton "u",
|
||||
// and includes the maximum sequence of following non-singleton subtags
|
||||
// and their preceding "-" separators.
|
||||
//
|
||||
// According to the spec a locale "en-t-aaa-u-bbb-v-ccc-x-u-ddd", should
|
||||
// remove only the "-u-bbb" part, and keep everything else, whereas this
|
||||
// regexp matcher would leave only the "en".
|
||||
icu::UnicodeString no_extensions_locale_uni =
|
||||
matcher->replaceAll("", status);
|
||||
DCHECK(U_SUCCESS(status));
|
||||
std::string no_extensions_locale;
|
||||
no_extensions_locale_uni.toUTF8String(no_extensions_locale);
|
||||
// 2.b. Let availableLocale be BestAvailableLocale(availableLocales,
|
||||
// noExtensionsLocale).
|
||||
for (const std::string& locale : requested_locales) {
|
||||
// 2. a. Let noExtensionsLocale be the String value that is locale
|
||||
// with all Unicode locale extension sequences removed.
|
||||
std::string no_extension_locale = RemoveUnicodeExtensions(locale);
|
||||
|
||||
// 2. b. Let availableLocale be
|
||||
// BestAvailableLocale(availableLocales, noExtensionsLocale).
|
||||
std::string available_locale =
|
||||
BestAvailableLocale(available_locales, no_extensions_locale);
|
||||
// 2.c. If availableLocale is not undefined, append locale to the end of
|
||||
// subset.
|
||||
BestAvailableLocale(available_locales, no_extension_locale);
|
||||
|
||||
// 2. c. If availableLocale is not undefined, append locale to the
|
||||
// end of subset.
|
||||
if (!available_locale.empty()) {
|
||||
subset.push_back(locale);
|
||||
}
|
||||
|
@ -27,55 +27,67 @@
|
||||
|
||||
// Tests supportedLocalesOf method.
|
||||
|
||||
var undef = Intl.DateTimeFormat.supportedLocalesOf();
|
||||
assertEquals([], undef);
|
||||
var services = [
|
||||
Intl.DateTimeFormat,
|
||||
Intl.Collator,
|
||||
Intl.NumberFormat,
|
||||
Intl.PluralRules
|
||||
];
|
||||
|
||||
var empty = Intl.DateTimeFormat.supportedLocalesOf([]);
|
||||
assertEquals([], empty);
|
||||
for (const service of services) {
|
||||
let undef = service.supportedLocalesOf();
|
||||
assertEquals([], undef);
|
||||
|
||||
var strLocale = Intl.DateTimeFormat.supportedLocalesOf('sr');
|
||||
assertEquals('sr', strLocale[0]);
|
||||
let empty = service.supportedLocalesOf([]);
|
||||
assertEquals([], empty);
|
||||
|
||||
var multiLocale =
|
||||
Intl.DateTimeFormat.supportedLocalesOf(['sr-Thai-RS', 'de', 'zh-CN']);
|
||||
assertEquals('sr-Thai-RS', multiLocale[0]);
|
||||
assertEquals('de', multiLocale[1]);
|
||||
assertEquals('zh-CN', multiLocale[2]);
|
||||
let strLocale = service.supportedLocalesOf("sr");
|
||||
assertEquals("sr", strLocale[0]);
|
||||
|
||||
collatorUndef = Intl.Collator.supportedLocalesOf();
|
||||
assertEquals([], collatorUndef);
|
||||
var locales = ["sr-Thai-RS", "de", "zh-CN"];
|
||||
let multiLocale = service.supportedLocalesOf(locales);
|
||||
assertEquals("sr-Thai-RS", multiLocale[0]);
|
||||
assertEquals("de", multiLocale[1]);
|
||||
assertEquals("zh-CN", multiLocale[2]);
|
||||
|
||||
collatorEmpty = Intl.Collator.supportedLocalesOf([]);
|
||||
assertEquals([], collatorEmpty);
|
||||
let numLocale = service.supportedLocalesOf(1);
|
||||
assertEquals([], numLocale);
|
||||
assertThrows(function() {
|
||||
numLocale = Intl.Collator.supportedLocalesOf([1]);
|
||||
}, TypeError);
|
||||
|
||||
collatorStrLocale = Intl.Collator.supportedLocalesOf('sr');
|
||||
assertEquals('sr', collatorStrLocale[0]);
|
||||
extensionLocale = service.supportedLocalesOf("id-u-co-pinyin");
|
||||
assertEquals("id-u-co-pinyin", extensionLocale[0]);
|
||||
|
||||
collatorMultiLocale =
|
||||
Intl.Collator.supportedLocalesOf(['sr-Thai-RS', 'de', 'zh-CN']);
|
||||
assertEquals('sr-Thai-RS', collatorMultiLocale[0]);
|
||||
assertEquals('de', collatorMultiLocale[1]);
|
||||
assertEquals('zh-CN', collatorMultiLocale[2]);
|
||||
bestFitLocale = service.supportedLocalesOf("de", {
|
||||
localeMatcher: "best fit"
|
||||
});
|
||||
assertEquals("de", bestFitLocale[0]);
|
||||
|
||||
numLocale = Intl.Collator.supportedLocalesOf(1);
|
||||
assertEquals([], numLocale);
|
||||
// Need a better test for "lookup" once it differs from "best fit".
|
||||
lookupLocale = service.supportedLocalesOf("zh-CN", {
|
||||
localeMatcher: "lookup"
|
||||
});
|
||||
assertEquals("zh-CN", lookupLocale[0]);
|
||||
|
||||
assertThrows(function() {
|
||||
numLocale = Intl.Collator.supportedLocalesOf([1]);
|
||||
}, TypeError);
|
||||
assertThrows(function() {
|
||||
service.supportedLocalesOf("id-u-co-pinyin", { localeMatcher: "xyz" });
|
||||
}, RangeError);
|
||||
|
||||
extensionLocale = Intl.Collator.supportedLocalesOf('id-u-co-pinyin');
|
||||
assertEquals('id-u-co-pinyin', extensionLocale[0]);
|
||||
privateuseLocale = service.supportedLocalesOf("en-US-x-twain");
|
||||
assertEquals("en-US-x-twain", privateuseLocale[0]);
|
||||
|
||||
bestFitLocale =
|
||||
Intl.Collator.supportedLocalesOf('de', {localeMatcher: 'best fit'});
|
||||
assertEquals('de', bestFitLocale[0]);
|
||||
privateuseLocale2 = service.supportedLocalesOf("x-twain");
|
||||
assertEquals(undefined, privateuseLocale2[0]);
|
||||
|
||||
// Need a better test for "lookup" once it differs from "best fit".
|
||||
lookupLocale =
|
||||
Intl.Collator.supportedLocalesOf('zh-CN', {localeMatcher: 'lookup'});
|
||||
assertEquals('zh-CN', lookupLocale[0]);
|
||||
grandfatheredLocale = service.supportedLocalesOf("art-lojban");
|
||||
assertEquals(undefined, grandfatheredLocale[0]);
|
||||
|
||||
assertThrows(function() {
|
||||
Intl.Collator.supportedLocalesOf('id-u-co-pinyin', {localeMatcher: 'xyz'});
|
||||
}, RangeError);
|
||||
grandfatheredLocale2 = service.supportedLocalesOf("i-pwn");
|
||||
assertEquals(undefined, grandfatheredLocale2[0]);
|
||||
|
||||
unicodeInPrivateuseLocale = service.supportedLocalesOf(
|
||||
"en-US-x-u-co-phonebk"
|
||||
);
|
||||
assertEquals("en-US-x-u-co-phonebk", unicodeInPrivateuseLocale[0]);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user