Fix canonicalization of grandfathered tags

ICU maps a few grandfathered tags to made-up values even when there is no preferred value entry in the IANA language tag registry. [1] 1. Check for grandfathered tags without a preferred value upfront and return them as they are. 2. Lowercase the input before the structural validity check to simplify both the check for grandfathered tags without a preferred value and the regexps used in the structural validity check. intl/general/grandfathered_tags_without_preferred_value is added and intl/general/language_tags_with_preferred_values is changed to check for case-insensitive matching of grandfathered tags. [1] https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry Bug: v8:7669 Test: test262/intl402/Intl/getCanonicalLocales/preferred-grandfathered Test: intl/general/grandfathered_tags_without_preferred_value Cq-Include-Trybots: luci.v8.try:v8_linux_noi18n_rel_ng Cq-Include-Trybots: luci.chromium.try:linux_chromium_rel_ng Change-Id: Ie0520de8712928300fd71fe152909789483ec256 Reviewed-on: https://chromium-review.googlesource.com/1156529 Commit-Queue: Jungshik Shin <jshin@chromium.org> Reviewed-by: Sathya Gunasekaran <gsathya@chromium.org> Cr-Commit-Position: refs/heads/master@{#54829}
2018-07-31 14:27:44 -07:00 · 2018-07-31 14:27:44 -07:00 · f24b575d6c
commit f24b575d6c
parent cd4b722835
4 changed files with 64 additions and 11 deletions
--- a/src/objects/intl-objects.cc
+++ b/src/objects/intl-objects.cc
@ -1351,25 +1351,28 @@ namespace {

 // Define general regexp macros.
 // Note "(?:" means the regexp group is a non-capturing group.
-#define REGEX_ALPHA "[a-zA-Z]"
+#define REGEX_ALPHA "[a-z]"
 #define REGEX_DIGIT "[0-9]"
 #define REGEX_ALPHANUM "(?:" REGEX_ALPHA "|" REGEX_DIGIT ")"

 void BuildLanguageTagRegexps(Isolate* isolate) {
 // Define the language tag regexp macros.
-// For info on BCP 47 see https://tools.ietf.org/html/bcp47
+// For info on BCP 47 see https://tools.ietf.org/html/bcp47 .
+// Because language tags are case-insensitive per BCP 47 2.1.1 and the regexps
+// defined below are always used after lowercasing the input, the uppercase
+// ranges in BCP 47 2.1 are dropped and grandfathered tags are all lowercased.
 // clang-format off
 #define BCP47_REGULAR                                          \
  "(?:art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|" \
  "zh-min|zh-min-nan|zh-xiang)"
 #define BCP47_IRREGULAR                                  \
-  "(?:en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|" \
+  "(?:en-gb-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|" \
  "i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|"  \
-  "i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)"
+  "i-tsu|sgn-be-fr|sgn-be-nl|sgn-ch-de)"
 #define BCP47_GRANDFATHERED "(?:" BCP47_IRREGULAR "|" BCP47_REGULAR ")"
 #define BCP47_PRIVATE_USE "(?:x(?:-" REGEX_ALPHANUM "{1,8})+)"

-#define BCP47_SINGLETON "(?:" REGEX_DIGIT "|" "[A-WY-Za-wy-z])"
+#define BCP47_SINGLETON "(?:" REGEX_DIGIT "|" "[a-wy-z])"

 #define BCP47_EXTENSION "(?:" BCP47_SINGLETON "(?:-" REGEX_ALPHANUM "{2,8})+)"
 #define BCP47_VARIANT  \
@ -1603,8 +1606,6 @@ bool IsStructurallyValidLanguageTag(Isolate* isolate,
    return false;
  }

-  std::transform(locale.begin(), locale.end(), locale.begin(), AsciiToLower);
-
  // Just return if it's a x- form. It's all private.
  if (locale.find("x-") == 0) {
    return true;
@ -1684,6 +1685,18 @@ bool IsDeprecatedLanguage(const std::string& locale) {
  return locale == "in" || locale == "iw" || locale == "ji" || locale == "jw";
 }

+// Matches (lowercase) grandfathered tags with no preferred value. Reference:
+// https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
+bool IsGrandfatheredTagWithoutPreferredVaule(const std::string& locale) {
+  if (V8_UNLIKELY(locale == "zh-min" || locale == "cel-gaulish")) return true;
+  if (locale.length() > 6 /* shortest match, i-mingo, is 7 chars long */ &&
+      V8_UNLIKELY(locale[0] == 'i' && locale[1] == '-')) {  // "i-" prefix
+    return locale.substr(2) == "default" || locale.substr(2) == "enochian" ||
+           locale.substr(2) == "mingo";
+  }
+  return false;  // every other tag goes through regular canonicalization
+}  // NOTE(review): "Vaule" misspells "Value"; fix together with the call site.
+
 }  // anonymous namespace

 MaybeHandle<String> Intl::CanonicalizeLanguageTag(Isolate* isolate,
@ -1710,6 +1723,9 @@ MaybeHandle<String> Intl::CanonicalizeLanguageTag(Isolate* isolate,
    return locale_str;
  }

+  // Because per BCP 47 2.1.1 language tags are case-insensitive, lowercase
+  // the input before any more check.
+  std::transform(locale.begin(), locale.end(), locale.begin(), AsciiToLower);
  if (!IsStructurallyValidLanguageTag(isolate, locale)) {
    THROW_NEW_ERROR(
        isolate,
@ -1717,6 +1733,12 @@ MaybeHandle<String> Intl::CanonicalizeLanguageTag(Isolate* isolate,
        String);
  }

+  // ICU maps a few grandfathered tags to what looks like a regular language
+  // tag even though the IANA language tag registry has no preferred value
+  // entry for them. Return them as they are, lowercased.
+  if (IsGrandfatheredTagWithoutPreferredVaule(locale))
+    return isolate->factory()->NewStringFromAsciiChecked(locale.data());
+
  // // ECMA 402 6.2.3
  // TODO(jshin): uloc_{for,to}LanguageTag can fail even for a structurally valid
  // language tag if it's too long (much longer than 100 chars). Even if we
--- a/test/intl/general/grandfathered_tags_without_preferred_value.js
+++ b/test/intl/general/grandfathered_tags_without_preferred_value.js
@ -0,0 +1,27 @@
+// Copyright 2018 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+[  // Each entry is [input tag, expected canonical tag].
+  // Grandfathered tags without a preferred value in the IANA language
+  // tag registry. Nonetheless, ICU cooks up a value when canonicalizing.
+  // V8 works around that ICU issue and returns them unchanged (lowercased).
+  // See https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
+  ["cel-gaulish", "cel-gaulish"],
+  ["i-default", "i-default"],
+  ["i-mingo", "i-mingo"],
+  ["i-enochian", "i-enochian"],
+  ["zh-min", "zh-min"],
+
+  // Matching should be case-insensitive; the result is always lowercase.
+  ["I-default", "i-default"],
+  ["i-DEFAULT", "i-default"],
+  ["I-DEFAULT", "i-default"],
+  ["i-DEfauLT", "i-default"],
+  ["zh-Min", "zh-min"],
+  ["Zh-min", "zh-min"],
+].forEach(([inputLocale, expectedLocale]) => {
+  const canonicalLocales = Intl.getCanonicalLocales(inputLocale);
+  assertEquals(canonicalLocales.length, 1);  // one input yields one tag
+  assertEquals(canonicalLocales[0], expectedLocale);
+})
--- a/test/intl/general/language_tags_with_preferred_values.js
+++ b/test/intl/general/language_tags_with_preferred_values.js
@ -7,6 +7,11 @@
  ["sgn-de", "gsg"],
  ["sgn-de-u-co-phonebk", "gsg-u-co-phonebk"],

+  // Matching should be case-insensitive.
+  ["sgn-De", "gsg"],
+  ["sgn-BE-FR", "sfb"],
+  ["Sgn-bE-Fr", "sfb"],
+
  // deprecated region tag
  ["und-Latn-dd", "und-Latn-DE"],
  ["und-dd-u-co-phonebk", "und-DE-u-co-phonebk"],
@ -22,8 +27,8 @@
  ["jw", "jv"],
  ["aam", "aas"],
  ["aam-u-ca-gregory", "aas-u-ca-gregory"],
-].forEach(function (entry) {
-  const canonicalLocales = Intl.getCanonicalLocales(entry[0]);
+].forEach(([inputLocale, expectedLocale]) => {
+  const canonicalLocales = Intl.getCanonicalLocales(inputLocale);
  assertEquals(canonicalLocales.length, 1);
-  assertEquals(canonicalLocales[0], entry[1]);
+  assertEquals(canonicalLocales[0], expectedLocale);
 })
--- a/test/test262/test262.status
+++ b/test/test262/test262.status
@ -435,7 +435,6 @@

  # https://bugs.chromium.org/p/v8/issues/detail?id=7669
  'intl402/Intl/getCanonicalLocales/canonicalized-tags': [FAIL],
-  'intl402/Intl/getCanonicalLocales/preferred-grandfathered': [FAIL],

  # Tests assume that the sort order of "same elements" (comparator returns 0)
  # is deterministic.