v8/test/intl/general/case-mapping.js

// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Flags: --icu_case_mapping

// Some edge cases that unibrow got wrong

assertEquals("𐐘", "𐑀".toUpperCase());
assertEquals("𐑀", "𐐘".toLowerCase());
// A capital sigma standing alone lowercases to the non-final form.
assertEquals("σ", "Σ".toLowerCase());

// Some different paths in the ICU case conversion fastpath

// Of two consecutive capital sigmas only the last becomes a final sigma.
assertEquals("σς", "\u03A3\u03A3".toLowerCase());
// Expand sharp s in latin1 fastpath
assertEquals("ASSB", "A\u00DFB".toUpperCase());
assertEquals("AB", "Ab".toUpperCase());
// Find first uppercase in fastpath
// Input length < a machine word size
assertEquals("ab", "ab".toLowerCase());
assertEquals("ab", "aB".toLowerCase());
assertEquals("AÜ", "aü".toUpperCase());
assertEquals("AÜ", "AÜ".toUpperCase());
// Lowercasing: nothing uppercase, only the non-ASCII letter uppercase, both
// letters uppercase, and only the ASCII letter uppercase.
assertEquals("aü", "aü".toLowerCase());
assertEquals("aü", "aÜ".toLowerCase());
assertEquals("aü", "AÜ".toLowerCase());
assertEquals("aü", "Aü".toLowerCase());

// Input length >= a machine word size
// Each row is [expected result, input]; the uppercase letter to be found sits
// at a different offset in every input.
for (const [lowered, mixed] of [
  ["abcdefghij", "abcdefghij"],
  ["abcdefghij", "abcdefghiJ"],
  ["abçdefghij", "abçdefghiJ"],
  ["abçdefghij", "abÇdefghiJ"],
  ["abcdefghiá", "abcdeFghiá"],
  ["abcdefghiá", "abcdeFghiÁ"],
]) {
  assertEquals(lowered, mixed.toLowerCase());
}

// The mirror image of the table above for uppercasing.
for (const [raised, mixed] of [
  ["ABCDEFGHIJ", "ABCDEFGHIJ"],
  ["ABCDEFGHIJ", "ABCDEFGHIj"],
  ["ABÇDEFGHIJ", "ABÇDEFGHIj"],
  ["ABÇDEFGHIJ", "ABçDEFGHIj"],
  ["ABCDEFGHIÁ", "ABCDEfGHIÁ"],
  ["ABCDEFGHIÁ", "ABCDEfGHIá"],
]) {
  assertEquals(raised, mixed.toUpperCase());
}


// Starts with fastpath, but switches to full Unicode path
// Rows are [expected result, input]. The confusable Greek/micro-sign letters
// are spelled with escapes so the code points are unambiguous.
for (const [raised, source] of [
  // U+00FF is uppercased to U+0178.
  ["AŸ", "aÿ"],
  // U+00B5 (µ) is uppercased to U+039C (Μ)
  ["A\u039C", "a\u00B5"],
]) {
  assertEquals(raised, source.toUpperCase());
}

// Buffer size increase
// Every input contains a character whose uppercase form is longer than the
// character itself (sharp s, or the fi/fl/ffi/ffl ligatures).
for (const [raised, source] of [
  ["CSSBẶ", "cßbặ"],
  ["FIFLFFIFFL", "\uFB01\uFB02\uFB03\uFB04"],
  ["ABCÀCSSA", "abcàcßa"],
  ["ABCDEFGHIÀCSSA", "ABCDEFGHIàcßa"],
  ["ABCDEFGHIÀCSSA", "abcdeFghiàcßa"],
]) {
  assertEquals(raised, source.toUpperCase());
}

// OneByte input with buffer size increase: non-fast path
assertEquals("ABCSS", "abCß".toLocaleUpperCase("tr"));

// More comprehensive tests for "tr", "az" and "lt" are in
// test262/intl402/Strings/*

// Buffer size decrease with a single locale or locale list.
// In Turkic (tr, az), U+0307 preceded by Capital Letter I is dropped.
for (const turkic of ["tr", "az", ["tr", "en"]]) {
  assertEquals("abci", "aBcI\u0307".toLocaleLowerCase(turkic));
}

// Cons string
{
  // Each helper re-runs its concatenation, so every assertion below receives
  // a freshly built string, as in a hand-written list of assertions.
  const twoPieces = () => "aBcI" + "\u0307jkl";
  const fourPieces = () => "aB" + "cI" + "\u0307j" + "kl";
  const threePieces = () => "a" + "b" + "cde";
  const bufferAsString = () => new String(new ArrayBuffer());

  // Turkic lowercasing drops the combining dot ...
  assertEquals("abcijkl", twoPieces().toLocaleLowerCase("tr"));
  assertEquals("abcijkl", fourPieces().toLocaleLowerCase("tr"));
  // ... while other locales and the locale-independent method keep it.
  assertEquals("abci\u0307jkl", twoPieces().toLocaleLowerCase("en"));
  assertEquals("abci\u0307jkl", fourPieces().toLocaleLowerCase("en"));
  assertEquals("abci\u0307jkl", fourPieces().toLocaleLowerCase("fil"));
  assertEquals("abci\u0307jkl", twoPieces().toLowerCase());
  assertEquals("abci\u0307jkl", fourPieces().toLowerCase());

  // A String wrapper whose value is the string form of an ArrayBuffer.
  assertEquals("[object arraybuffer]",
               bufferAsString().toLocaleLowerCase("fil"));
  assertEquals("[OBJECT ARRAYBUFFER]",
               bufferAsString().toLocaleUpperCase("fil"));

  // Plain ASCII pieces through all four methods, without a locale argument
  // and with several explicit language tags.
  assertEquals("abcde", threePieces().toLowerCase());
  assertEquals("ABCDE", threePieces().toUpperCase());
  assertEquals("abcde", threePieces().toLocaleLowerCase());
  assertEquals("ABCDE", threePieces().toLocaleUpperCase());
  for (const tag of ["en", "fil", "longlang"]) {
    assertEquals("abcde", threePieces().toLocaleLowerCase(tag));
    assertEquals("ABCDE", threePieces().toLocaleUpperCase(tag));
  }
}

// "tr" and "az" should behave identically.
assertEquals("aBcI\u0307".toLocaleLowerCase("tr"),
             "aBcI\u0307".toLocaleLowerCase("az"));
// What matters is the first locale in the locale list.
assertEquals("aBcI\u0307".toLocaleLowerCase(["tr", "en", "fr"]),
             "aBcI\u0307".toLocaleLowerCase("tr"));
assertEquals("aBcI\u0307".toLocaleLowerCase(["en", "tr", "az"]),
             "aBcI\u0307".toLocaleLowerCase("en"));
assertEquals("aBcI\u0307".toLocaleLowerCase(["en", "tr", "az"]),
             "aBcI\u0307".toLowerCase());

// An empty locale list is the same as the default locale. Try these tests
// under Turkish and Greek locale.
{
  // ECMA-402 defines no Intl.GetDefaultLocale property, so passing it would
  // hand over undefined and merely repeat the no-argument call. Resolve the
  // default locale through a formatter so an explicit tag is passed instead.
  const defaultLocale = new Intl.DateTimeFormat().resolvedOptions().locale;

  assertEquals("aBcI\u0307".toLocaleLowerCase([]),
               "aBcI\u0307".toLocaleLowerCase());
  assertEquals("aBcI\u0307".toLocaleLowerCase([]),
               "aBcI\u0307".toLocaleLowerCase(defaultLocale));
  assertEquals("άόύώ".toLocaleUpperCase([]), "άόύώ".toLocaleUpperCase());
  assertEquals("άόύώ".toLocaleUpperCase([]),
               "άόύώ".toLocaleUpperCase(defaultLocale));
}


// English/root locale keeps U+0307 (combining dot above).
// A locale list counts as English when "en" comes first, even if "tr" follows.
for (const english of ["en", "en-GB", ["en", "tr"]]) {
  assertEquals("abci\u0307", "aBcI\u0307".toLocaleLowerCase(english));
}
assertEquals("abci\u0307", "aBcI\u0307".toLowerCase());

// Anything other than 'tr' and 'az' behave like root for U+0307.
// NOTE(review): "i-klingon", "i-enochian" and "x-foobar" are grandfathered /
// private-use tags; engines following newer ECMA-402 editions may reject them
// with a RangeError -- confirm against the V8 version this test targets.
for (const tag of [
  "fil",
  "zh-Hant-TW",
  "i-klingon",
  "i-enochian",
  "x-foobar",
]) {
  assertEquals("abci\u0307", "aBcI\u0307".toLocaleLowerCase(tag));
}

// Up to 8 chars are allowed for the primary language tag in BCP 47.
// An 8-letter language is accepted, alone or at the head of a list, and is
// treated like root: the combining dot above survives in both directions.
for (const accepted of ["longlang", ["longlang", "tr"]]) {
  assertEquals("abci\u0307", "aBcI\u0307".toLocaleLowerCase(accepted));
  assertEquals("ABCI\u0307", "aBcI\u0307".toLocaleUpperCase(accepted));
}
// A 9-character subtag must be rejected, alone or at the head of a list.
for (const rejected of ["longlang2", ["longlang2", "en"]]) {
  assertThrows(() => "abc".toLocaleLowerCase(rejected), RangeError);
  assertThrows(() => "abc".toLocaleUpperCase(rejected), RangeError);
}

// Greek uppercasing: not covered by intl402/String/*, yet. Tonos (U+0301) and
// other diacritic marks are dropped.  See
// http://bugs.icu-project.org/trac/ticket/5456#comment:19 for more examples.
// See also http://bugs.icu-project.org/trac/ticket/12845 .
assertEquals("Α", "α\u0301".toLocaleUpperCase("el"));
assertEquals("Α", "α\u0301".toLocaleUpperCase("el-GR"));
assertEquals("Α", "α\u0301".toLocaleUpperCase("el-Grek"));
assertEquals("Α", "α\u0301".toLocaleUpperCase("el-Grek-GR"));
assertEquals("Α", "ά".toLocaleUpperCase("el"));
assertEquals("ΑΟΫΩ", "άόύώ".toLocaleUpperCase("el"));
assertEquals("ΑΟΫΩ", "α\u0301ο\u0301υ\u0301ω\u0301".toLocaleUpperCase("el"));
assertEquals("ΑΟΫΩ", "άόύώ".toLocaleUpperCase("el"));
assertEquals("ΟΕ", "Ό\u1f15".toLocaleUpperCase("el"));
assertEquals("ΟΕ", "Ο\u0301ε\u0314\u0301".toLocaleUpperCase("el"));
assertEquals("ΡΩΜΕΪΚΑ", "ρωμέικα".toLocaleUpperCase("el"));
assertEquals("ΜΑΪΟΥ, ΤΡΟΛΕΪ", "Μαΐου, τρόλεϊ".toLocaleUpperCase("el"));
assertEquals("ΤΟ ΕΝΑ Ή ΤΟ ΑΛΛΟ.", "Το ένα ή το άλλο.".toLocaleUpperCase("el"));

// Input and output are identical.
assertEquals("αβγδε", "αβγδε".toLocaleLowerCase("el"));
assertEquals("ΑΒΓΔΕ", "ΑΒΓΔΕ".toLocaleUpperCase("el"));
assertEquals("ΑΒΓΔΕАБ𝐀𝐁", "ΑΒΓΔΕАБ𝐀𝐁".toLocaleUpperCase("el"));
assertEquals("ABCDEÂÓḴ123", "ABCDEÂÓḴ123".toLocaleUpperCase("el"));
// ASCII-only or Latin-1 only: 1-byte
assertEquals("ABCDE123", "ABCDE123".toLocaleUpperCase("el"));
assertEquals("ABCDEÂÓ123", "ABCDEÂÓ123".toLocaleUpperCase("el"));

// To make sure that the input string is not overwritten in place.
var strings = ["abCdef", "αβγδε", "άόύώ", "аб"];
strings.forEach(function(original) {
  // Copy the string unit by unit before converting it.
  var unitsBefore = original.split("");
  // Only the effect on the receiver matters; the result is discarded.
  original.toLocaleUpperCase("el");
  // Rebuild the copy after the conversion and compare it to the input.
  assertEquals(original, unitsBefore.join(""));
});

// In other locales, U+0301 is preserved.
{
  const accentedSmall = "α\u0301ο\u0301υ\u0301ω\u0301";
  const accentedCapital = "Α\u0301Ο\u0301Υ\u0301Ω\u0301";
  assertEquals(accentedCapital, accentedSmall.toLocaleUpperCase("en"));
  assertEquals(accentedCapital, accentedSmall.toUpperCase());
}

// Plane 1; Deseret and Warang Citi Script.
{
  const capitals = "\u{10400}\u{118A0}";
  const smalls = "\u{10428}\u{118C0}";
  assertEquals(capitals, smalls.toUpperCase());
  assertEquals(smalls, capitals.toLowerCase());
}

// Mathematical Bold {Capital, Small} Letter A do not change.
{
  const mathBoldPair = "\u{1D400}\u{1D41A}";
  assertEquals(mathBoldPair, mathBoldPair.toUpperCase());
  assertEquals(mathBoldPair, mathBoldPair.toLowerCase());
}
// Plane 1; New characters in Unicode 8.0
// U+10C80 / U+10CC0 are a capital/small letter pair. Check the
// locale-independent methods, the locale-sensitive ones without an argument,
// and the locale-sensitive ones with a Turkic locale list, each in both
// directions.
assertEquals("\u{10C80}", "\u{10CC0}".toUpperCase());
assertEquals("\u{10CC0}", "\u{10C80}".toLowerCase());
assertEquals("\u{10C80}", "\u{10CC0}".toLocaleUpperCase());
assertEquals("\u{10CC0}", "\u{10C80}".toLocaleLowerCase());
assertEquals("\u{10C80}", "\u{10CC0}".toLocaleUpperCase(["tr"]));
assertEquals("\u{10CC0}", "\u{10C80}".toLocaleLowerCase(["tr"]));

// check fast path for Latin-1 supplement (U+00A0 ~ U+00FF)
// NOTE(review): U+00AF (¯) appears to be absent between ® and ° in all of
// these strings -- confirm whether that omission is intentional.
var latin1Suppl = "\u00A0¡¢£¤¥¦§¨©ª«¬\u00AD®°±²³´µ¶·¸¹º»¼½¾¿" +
    "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";
// Expected lowercase form: the capital letters are lowered, the rest is kept.
var latin1SupplLowercased = "\u00A0¡¢£¤¥¦§¨©ª«¬\u00AD®°±²³´µ¶·¸¹º»¼½¾¿" +
    "àáâãäåæçèéêëìíîïðñòóôõö×øùúûüýþßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";
// Expected uppercase form: µ becomes U+039C, ß expands to "SS" and ÿ becomes
// U+0178, so the result no longer fits in Latin-1.
var latin1SupplUppercased = "\u00A0¡¢£¤¥¦§¨©ª«¬\u00AD®°±²³´\u039C¶·¸¹º»¼½¾¿" +
    "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞSSÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ÷ØÙÚÛÜÝÞ\u0178";

// Locale-independent conversions.
assertEquals(latin1SupplLowercased, latin1Suppl.toLowerCase());
assertEquals(latin1SupplUppercased, latin1Suppl.toUpperCase());

// These locales lowercase the supplement exactly like the root locale.
for (const tag of ["de", "el", "tr", "az"]) {
  assertEquals(latin1SupplLowercased, latin1Suppl.toLocaleLowerCase(tag));
}
// For uppercasing, Lithuanian agrees with the root locale as well.
for (const tag of ["de", "el", "tr", "az", "lt"]) {
  assertEquals(latin1SupplUppercased, latin1Suppl.toLocaleUpperCase(tag));
}

// Lithuanian need to have a dot-above for U+00CC(Ì) and U+00CD(Í) when
// lowercasing.
assertEquals("\u00A0¡¢£¤¥¦§¨©ª«¬\u00AD®°±²³´µ¶·¸¹º»¼½¾¿" +
    "àáâãäåæçèéêëi\u0307\u0300i\u0307\u0301îïðñòóôõö×øùúûüýþß" +
    "àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ",
    latin1Suppl.toLocaleLowerCase("lt"));
-												Use ICU case conversion/transliterator for case conversion

When I18N is enabled, use ICU's case conversion API and transliteration
API [1] to implement String.prototype.to{Upper,Lower}Case and
String.prototype.toLocale{Upper,Lower}Case.

* ICU-based case conversion was implemented in runtime-i18n.cc/i18n.js
* The above 4 functions are overridden with those in i18n.js when
  --icu_case_mapping flag is turned on. To control the override by the flag,
  they're overridden in icu-case-mapping.js

Previously, toLocale{U,L}Case just called to{U,L}Case so that they didn't
support locale-sensitive case conversion for Turkic languages (az, tr),
Greek (el) and Lithuanian (lt).

Before ICU APIs for the most general case are called, a fast-path for Latin-1
is tried. It's taken from Blink and adapted as necessary. This fast path
is always tried for to{U,L}Case. For toLocale{U,L}Case, it's only taken
when a locale (explicitly specified or default) is not in {az, el, lt, tr}.

With these changes, a build with --icu_case_mapping=true passes a bunch
of tests in test262/intl402/Strings/* and intl/* that failed before.

Handling of pure ASCII strings (aligned at word boundary) are not as fast
as Unibrow's implementation that uses word-by-word case conversion. OTOH,
Latin-1 input handling is faster than Unibrow. General Unicode input
handling is slower but more accurate.

See https://docs.google.com/spreadsheets/d/1KJCJxKc1FxFXjwmYqABS0_2cNdPetvnd8gY8_HGSbrg/edit?usp=sharing for the benchmark.

This CL started with http://crrev.com/1544023002#ps200001 by littledan@,
but has changed significantly since.

[1] See why transliteration API is needed for uppercasing in Greek.
    http://bugs.icu-project.org/trac/ticket/10582

R=yangguo
BUG=v8:4476,v8:4477
LOG=Y
TEST=test262/{built-ins,intl402}/Strings/*, webkit/fast/js/*, mjsunit/string-case,
     intl/general/case*

Review-Url: https://codereview.chromium.org/1812673005
Cr-Commit-Position: refs/heads/master@{#36187}

											
										
										
											2016-05-11 19:01:41 +00:00
+								// Copyright 2016 the V8 project authors. All rights reserved.
 								// Use of this source code is governed by a BSD-style license that can be
 								// found in the LICENSE file.
 								// Flags: --icu_case_mapping
 								// Some edge cases that unibrow got wrong
 								assertEquals("𐐘", "𐑀".toUpperCase());
 								assertEquals("𐑀", "𐐘".toLowerCase());
 								assertEquals("σ", "Σ".toLowerCase());
 								// Some different paths in the ICU case conversion fastpath
 								assertEquals("σς", "\u03A3\u03A3".toLowerCase());
 								// Expand sharp s in latin1 fastpath
 								assertEquals("ASSB", "A\u00DFB".toUpperCase());
 								assertEquals("AB", "Ab".toUpperCase());
-												Optimize case conversion with icu_case_mapping

Use FastAsciiConvert (as used by Unibrow) for i18n-aware
case conversion with --icu_case_mapping.

Move FastAsciiConvert to src/string-case.cc so that it can be used
by both runtime-{string,i18n}.

Add more tests.

BUG=v8:4477,v8:4476
TEST=intl/general/case*

Review-Url: https://codereview.chromium.org/2533983006
Cr-Commit-Position: refs/heads/master@{#41821}

											
										
										
											2016-12-19 18:43:55 +00:00
+								// Find first uppercase in fastpath
 								// Input length < a machine word size
 								assertEquals("ab", "ab".toLowerCase());
-												Use ICU case conversion/transliterator for case conversion

When I18N is enabled, use ICU's case conversion API and transliteration
API [1] to implement String.prototype.to{Upper,Lower}Case and
String.prototype.toLocale{Upper,Lower}Case.

* ICU-based case conversion was implemented in runtime-i18n.cc/i18n.js
* The above 4 functions are overridden with those in i18n.js when
  --icu_case_mapping flag is turned on. To control the override by the flag,
  they're overridden in icu-case-mapping.js

Previously, toLocale{U,L}Case just called to{U,L}Case so that they didn't
support locale-sensitive case conversion for Turkic languages (az, tr),
Greek (el) and Lithuanian (lt).

Before ICU APIs for the most general case are called, a fast-path for Latin-1
is tried. It's taken from Blink and adapted as necessary. This fast path
is always tried for to{U,L}Case. For toLocale{U,L}Case, it's only taken
when a locale (explicitly specified or default) is not in {az, el, lt, tr}.

With these changes, a build with --icu_case_mapping=true passes a bunch
of tests in test262/intl402/Strings/* and intl/* that failed before.

Handling of pure ASCII strings (aligned at word boundary) are not as fast
as Unibrow's implementation that uses word-by-word case conversion. OTOH,
Latin-1 input handling is faster than Unibrow. General Unicode input
handling is slower but more accurate.

See https://docs.google.com/spreadsheets/d/1KJCJxKc1FxFXjwmYqABS0_2cNdPetvnd8gY8_HGSbrg/edit?usp=sharing for the benchmark.

This CL started with http://crrev.com/1544023002#ps200001 by littledan@,
but has changed significantly since.

[1] See why transliteration API is needed for uppercasing in Greek.
    http://bugs.icu-project.org/trac/ticket/10582

R=yangguo
BUG=v8:4476,v8:4477
LOG=Y
TEST=test262/{built-ins,intl402}/Strings/*, webkit/fast/js/*, mjsunit/string-case,
     intl/general/case*

Review-Url: https://codereview.chromium.org/1812673005
Cr-Commit-Position: refs/heads/master@{#36187}

											
										
										
											2016-05-11 19:01:41 +00:00
+								assertEquals("ab", "aB".toLowerCase());
 								assertEquals("AÜ", "aü".toUpperCase());
 								assertEquals("AÜ", "AÜ".toUpperCase());
 								assertEquals("aü", "aü".toLowerCase());
-												Optimize case conversion with icu_case_mapping

Use FastAsciiConvert (as used by Unibrow) for i18n-aware
case conversion with --icu_case_mapping.

Move FastAsciiConvert to src/string-case.cc so that it can be used
by both runtime-{string,i18n}.

Add more tests.

BUG=v8:4477,v8:4476
TEST=intl/general/case*

Review-Url: https://codereview.chromium.org/2533983006
Cr-Commit-Position: refs/heads/master@{#41821}

											
										
										
											2016-12-19 18:43:55 +00:00
+								assertEquals("aü", "aÜ".toLowerCase());
-												Use ICU case conversion/transliterator for case conversion

When I18N is enabled, use ICU's case conversion API and transliteration
API [1] to implement String.prototype.to{Upper,Lower}Case and
String.prototype.toLocale{Upper,Lower}Case.

* ICU-based case conversion was implemented in runtime-i18n.cc/i18n.js
* The above 4 functions are overridden with those in i18n.js when
  --icu_case_mapping flag is turned on. To control the override by the flag,
  they're overridden in icu-case-mapping.js

Previously, toLocale{U,L}Case just called to{U,L}Case so that they didn't
support locale-sensitive case conversion for Turkic languages (az, tr),
Greek (el) and Lithuanian (lt).

Before ICU APIs for the most general case are called, a fast-path for Latin-1
is tried. It's taken from Blink and adapted as necessary. This fast path
is always tried for to{U,L}Case. For toLocale{U,L}Case, it's only taken
when a locale (explicitly specified or default) is not in {az, el, lt, tr}.

With these changes, a build with --icu_case_mapping=true passes a bunch
of tests in test262/intl402/Strings/* and intl/* that failed before.

Handling of pure ASCII strings (aligned at word boundary) are not as fast
as Unibrow's implementation that uses word-by-word case conversion. OTOH,
Latin-1 input handling is faster than Unibrow. General Unicode input
handling is slower but more accurate.

See https://docs.google.com/spreadsheets/d/1KJCJxKc1FxFXjwmYqABS0_2cNdPetvnd8gY8_HGSbrg/edit?usp=sharing for the benchmark.

This CL started with http://crrev.com/1544023002#ps200001 by littledan@,
but has changed significantly since.

[1] See why transliteration API is needed for uppercasing in Greek.
    http://bugs.icu-project.org/trac/ticket/10582

R=yangguo
BUG=v8:4476,v8:4477
LOG=Y
TEST=test262/{built-ins,intl402}/Strings/*, webkit/fast/js/*, mjsunit/string-case,
     intl/general/case*

Review-Url: https://codereview.chromium.org/1812673005
Cr-Commit-Position: refs/heads/master@{#36187}

											
										
										
											2016-05-11 19:01:41 +00:00
+								assertEquals("aü", "AÜ".toLowerCase());
 								assertEquals("aü", "AÜ".toLowerCase());
-												Optimize case conversion with icu_case_mapping

Use FastAsciiConvert (as used by Unibrow) for i18n-aware
case conversion with --icu_case_mapping.

Move FastAsciiConvert to src/string-case.cc so that it can be used
by both runtime-{string,i18n}.

Add more tests.

BUG=v8:4477,v8:4476
TEST=intl/general/case*

Review-Url: https://codereview.chromium.org/2533983006
Cr-Commit-Position: refs/heads/master@{#41821}

											
										
										
											2016-12-19 18:43:55 +00:00
+								// Input length >= a machine word size
 								assertEquals("abcdefghij", "abcdefghij".toLowerCase());
 								assertEquals("abcdefghij", "abcdefghiJ".toLowerCase());
 								assertEquals("abçdefghij", "abçdefghiJ".toLowerCase());
 								assertEquals("abçdefghij", "abÇdefghiJ".toLowerCase());
 								assertEquals("abcdefghiá", "abcdeFghiá".toLowerCase());
 								assertEquals("abcdefghiá", "abcdeFghiÁ".toLowerCase());
 								assertEquals("ABCDEFGHIJ", "ABCDEFGHIJ".toUpperCase());
 								assertEquals("ABCDEFGHIJ", "ABCDEFGHIj".toUpperCase());
 								assertEquals("ABÇDEFGHIJ", "ABÇDEFGHIj".toUpperCase());
 								assertEquals("ABÇDEFGHIJ", "ABçDEFGHIj".toUpperCase());
 								assertEquals("ABCDEFGHIÁ", "ABCDEfGHIÁ".toUpperCase());
 								assertEquals("ABCDEFGHIÁ", "ABCDEfGHIá".toUpperCase());
-												Use ICU case conversion/transliterator for case conversion

When I18N is enabled, use ICU's case conversion API and transliteration
API [1] to implement String.prototype.to{Upper,Lower}Case and
String.prototype.toLocale{Upper,Lower}Case.

* ICU-based case conversion was implemented in runtime-i18n.cc/i18n.js
* The above 4 functions are overridden with those in i18n.js when
  --icu_case_mapping flag is turned on. To control the override by the flag,
  they're overridden in icu-case-mapping.js

Previously, toLocale{U,L}Case just called to{U,L}Case so that they didn't
support locale-sensitive case conversion for Turkic languages (az, tr),
Greek (el) and Lithuanian (lt).

Before ICU APIs for the most general case are called, a fast-path for Latin-1
is tried. It's taken from Blink and adapted as necessary. This fast path
is always tried for to{U,L}Case. For toLocale{U,L}Case, it's only taken
when a locale (explicitly specified or default) is not in {az, el, lt, tr}.

With these changes, a build with --icu_case_mapping=true passes a bunch
of tests in test262/intl402/Strings/* and intl/* that failed before.

Handling of pure ASCII strings (aligned at word boundary) are not as fast
as Unibrow's implementation that uses word-by-word case conversion. OTOH,
Latin-1 input handling is faster than Unibrow. General Unicode input
handling is slower but more accurate.

See https://docs.google.com/spreadsheets/d/1KJCJxKc1FxFXjwmYqABS0_2cNdPetvnd8gY8_HGSbrg/edit?usp=sharing for the benchmark.

This CL started with http://crrev.com/1544023002#ps200001 by littledan@,
but has changed significantly since.

[1] See why transliteration API is needed for uppercasing in Greek.
    http://bugs.icu-project.org/trac/ticket/10582

R=yangguo
BUG=v8:4476,v8:4477
LOG=Y
TEST=test262/{built-ins,intl402}/Strings/*, webkit/fast/js/*, mjsunit/string-case,
     intl/general/case*

Review-Url: https://codereview.chromium.org/1812673005
Cr-Commit-Position: refs/heads/master@{#36187}

											
										
										
											2016-05-11 19:01:41 +00:00
+								// Starts with fastpath, but switches to full Unicode path
 								// U+00FF is uppercased to U+0178.
 								assertEquals("AŸ", "aÿ".toUpperCase());
 								// U+00B5 (µ) is uppercased to U+039C (Μ)
 								assertEquals("AΜ", "aµ".toUpperCase());
 								// Buffer size increase
 								assertEquals("CSSBẶ", "cßbặ".toUpperCase());
 								assertEquals("FIFLFFIFFL", "\uFB01\uFB02\uFB03\uFB04".toUpperCase());
-												Optimize case conversion with icu_case_mapping

Use FastAsciiConvert (as used by Unibrow) for i18n-aware
case conversion with --icu_case_mapping.

Move FastAsciiConvert to src/string-case.cc so that it can be used
by both runtime-{string,i18n}.

Add more tests.

BUG=v8:4477,v8:4476
TEST=intl/general/case*

Review-Url: https://codereview.chromium.org/2533983006
Cr-Commit-Position: refs/heads/master@{#41821}

											
										
										
											2016-12-19 18:43:55 +00:00
+								assertEquals("ABCÀCSSA", "abcàcßa".toUpperCase());
 								assertEquals("ABCDEFGHIÀCSSA", "ABCDEFGHIàcßa".toUpperCase());
 								assertEquals("ABCDEFGHIÀCSSA", "abcdeFghiàcßa".toUpperCase());
-												Use ICU case conversion/transliterator for case conversion

When I18N is enabled, use ICU's case conversion API and transliteration
API [1] to implement String.prototype.to{Upper,Lower}Case and
String.prototype.toLocale{Upper,Lower}Case.

* ICU-based case conversion was implemented in runtime-i18n.cc/i18n.js
* The above 4 functions are overridden with those in i18n.js when
  --icu_case_mapping flag is turned on. To control the override by the flag,
  they're overridden in icu-case-mapping.js

Previously, toLocale{U,L}Case just called to{U,L}Case so that they didn't
support locale-sensitive case conversion for Turkic languages (az, tr),
Greek (el) and Lithuanian (lt).

Before ICU APIs for the most general case are called, a fast-path for Latin-1
is tried. It's taken from Blink and adapted as necessary. This fast path
is always tried for to{U,L}Case. For toLocale{U,L}Case, it's only taken
when a locale (explicitly specified or default) is not in {az, el, lt, tr}.

With these changes, a build with --icu_case_mapping=true passes a bunch
of tests in test262/intl402/Strings/* and intl/* that failed before.

Handling of pure ASCII strings (aligned at word boundary) are not as fast
as Unibrow's implementation that uses word-by-word case conversion. OTOH,
Latin-1 input handling is faster than Unibrow. General Unicode input
handling is slower but more accurate.

See https://docs.google.com/spreadsheets/d/1KJCJxKc1FxFXjwmYqABS0_2cNdPetvnd8gY8_HGSbrg/edit?usp=sharing for the benchmark.

This CL started with http://crrev.com/1544023002#ps200001 by littledan@,
but has changed significantly since.

[1] See why transliteration API is needed for uppercasing in Greek.
    http://bugs.icu-project.org/trac/ticket/10582

R=yangguo
BUG=v8:4476,v8:4477
LOG=Y
TEST=test262/{built-ins,intl402}/Strings/*, webkit/fast/js/*, mjsunit/string-case,
     intl/general/case*

Review-Url: https://codereview.chromium.org/1812673005
Cr-Commit-Position: refs/heads/master@{#36187}

											
										
										
											2016-05-11 19:01:41 +00:00
+								// OneByte input with buffer size increase: non-fast path
 								assertEquals("ABCSS", "abCß".toLocaleUpperCase("tr"));
 								// More comprehensive tests for "tr", "az" and "lt" are in
 								// test262/intl402/Strings/*
 								// Buffer size decrease with a single locale or locale list.
 								// In Turkic (tr, az), U+0307 preceeded by Capital Letter I is dropped.
 								assertEquals("abci", "aBcI\u0307".toLocaleLowerCase("tr"));
 								assertEquals("abci", "aBcI\u0307".toLocaleLowerCase("az"));
 								assertEquals("abci", "aBcI\u0307".toLocaleLowerCase(["tr", "en"]));
 								// Cons string
 								assertEquals("abcijkl", ("aBcI" + "\u0307jkl").toLocaleLowerCase("tr"));
 								assertEquals("abcijkl",
 								             ("aB" + "cI" + "\u0307j" + "kl").toLocaleLowerCase("tr"));
 								assertEquals("abci\u0307jkl", ("aBcI" + "\u0307jkl").toLocaleLowerCase("en"));
 								assertEquals("abci\u0307jkl",
 								             ("aB" + "cI" + "\u0307j" + "kl").toLocaleLowerCase("en"));
-												Fix two DCHECK failures in ICU case mapping code

1.
DCHECK in runtime-i18n.cc for case mapping was wrong to
assume that the longest primary language tag is 3 characters.
BCP 47 actually allows up to 8 characters.

2. GetFlatContent() was called on a string without flattening it first.

BUG=680314,680464
TEST=intl/general/case-mapping (see also the bugs)

Review-Url: https://codereview.chromium.org/2629763003
Cr-Commit-Position: refs/heads/master@{#42343}

											
										
										
											2017-01-13 23:12:43 +00:00
+								assertEquals("abci\u0307jkl",
 								             ("aB" + "cI" + "\u0307j" + "kl").toLocaleLowerCase("fil"));
-												Use ICU case conversion/transliterator for case conversion

When I18N is enabled, use ICU's case conversion API and transliteration
API [1] to implement String.prototype.to{Upper,Lower}Case and
String.prototype.toLocale{Upper,Lower}Case.

* ICU-based case conversion was implemented in runtime-i18n.cc/i18n.js
* The above 4 functions are overridden with those in i18n.js when
  --icu_case_mapping flag is turned on. To control the override by the flag,
  they're overridden in icu-case-mapping.js

Previously, toLocale{U,L}Case just called to{U,L}Case so that they didn't
support locale-sensitive case conversion for Turkic languages (az, tr),
Greek (el) and Lithuanian (lt).

Before ICU APIs for the most general case are called, a fast-path for Latin-1
is tried. It's taken from Blink and adapted as necessary. This fast path
is always tried for to{U,L}Case. For toLocale{U,L}Case, it's only taken
when a locale (explicitly specified or default) is not in {az, el, lt, tr}.

With these changes, a build with --icu_case_mapping=true passes a bunch
of tests in test262/intl402/Strings/* and intl/* that failed before.

Handling of pure ASCII strings (aligned at word boundary) are not as fast
as Unibrow's implementation that uses word-by-word case conversion. OTOH,
Latin-1 input handling is faster than Unibrow. General Unicode input
handling is slower but more accurate.

See https://docs.google.com/spreadsheets/d/1KJCJxKc1FxFXjwmYqABS0_2cNdPetvnd8gY8_HGSbrg/edit?usp=sharing for the benchmark.

This CL started with http://crrev.com/1544023002#ps200001 by littledan@,
but has changed significantly since.

[1] See why transliteration API is needed for uppercasing in Greek.
    http://bugs.icu-project.org/trac/ticket/10582

R=yangguo
BUG=v8:4476,v8:4477
LOG=Y
TEST=test262/{built-ins,intl402}/Strings/*, webkit/fast/js/*, mjsunit/string-case,
     intl/general/case*

Review-Url: https://codereview.chromium.org/1812673005
Cr-Commit-Position: refs/heads/master@{#36187}

											
										
										
											2016-05-11 19:01:41 +00:00
+								assertEquals("abci\u0307jkl", ("aBcI" + "\u0307jkl").toLowerCase());
 								assertEquals("abci\u0307jkl",
 								             ("aB" + "cI" + "\u0307j" + "kl").toLowerCase());
-												Fix two DCHECK failures in ICU case mapping code

1.
DCHECK in runtime-i18n.cc for case mapping was wrong to
assume that the longest primary language tag is 3 characters.
BCP 47 actually allows up to 8 characters.

2. GetFlatContent() was called on a string without flattening it first.

BUG=680314,680464
TEST=intl/general/case-mapping (see also the bugs)

Review-Url: https://codereview.chromium.org/2629763003
Cr-Commit-Position: refs/heads/master@{#42343}

											
										
										
											2017-01-13 23:12:43 +00:00
+								assertEquals("[object arraybuffer]",
 								    (new String(new ArrayBuffer())).toLocaleLowerCase("fil"));
 								assertEquals("[OBJECT ARRAYBUFFER]",
 								    (new String(new ArrayBuffer())).toLocaleUpperCase("fil"));
 								assertEquals("abcde", ("a" + "b" + "cde").toLowerCase());
 								assertEquals("ABCDE", ("a" + "b" + "cde").toUpperCase());
 								assertEquals("abcde", ("a" + "b" + "cde").toLocaleLowerCase());
 								assertEquals("ABCDE", ("a" + "b" + "cde").toLocaleUpperCase());
 								assertEquals("abcde", ("a" + "b" + "cde").toLocaleLowerCase("en"));
 								assertEquals("ABCDE", ("a" + "b" + "cde").toLocaleUpperCase("en"));
 								assertEquals("abcde", ("a" + "b" + "cde").toLocaleLowerCase("fil"));
 								assertEquals("ABCDE", ("a" + "b" + "cde").toLocaleUpperCase("fil"));
 								assertEquals("abcde", ("a" + "b" + "cde").toLocaleLowerCase("longlang"));
 								assertEquals("ABCDE", ("a" + "b" + "cde").toLocaleUpperCase("longlang"));
-												Use ICU case conversion/transliterator for case conversion

When I18N is enabled, use ICU's case conversion API and transliteration
API [1] to implement String.prototype.to{Upper,Lower}Case and
String.prototype.toLocale{Upper,Lower}Case.

* ICU-based case conversion was implemented in runtime-i18n.cc/i18n.js
* The above 4 functions are overridden with those in i18n.js when the
  --icu_case_mapping flag is turned on. To control the override by the flag,
  they're overridden in icu-case-mapping.js

Previously, toLocale{U,L}Case just called to{U,L}Case so that they didn't
support locale-sensitive case conversion for Turkic languages (az, tr),
Greek (el) and Lithuanian (lt).

Before ICU APIs for the most general case are called, a fast-path for Latin-1
is tried. It's taken from Blink and adopted as necessary. This fast path
is always tried for to{U,L}Case. For toLocale{U,L}Case, it's only taken
when a locale (explicitly specified or default) is not in {az, el, lt, tr}.

With these changes, a build with --icu_case_mapping=true passes a bunch
of tests in test262/intl402/Strings/* and intl/* that failed before.

Handling of pure ASCII strings (aligned at word boundary) is not as fast
as Unibrow's implementation that uses word-by-word case conversion. OTOH,
Latin-1 input handling is faster than Unibrow. General Unicode input
handling is slower but more accurate.

See https://docs.google.com/spreadsheets/d/1KJCJxKc1FxFXjwmYqABS0_2cNdPetvnd8gY8_HGSbrg/edit?usp=sharing for the benchmark.

This CL started with http://crrev.com/1544023002#ps200001 by littledan@,
but has changed significantly since.

[1] See why transliteration API is needed for uppercasing in Greek.
    http://bugs.icu-project.org/trac/ticket/10582

R=yangguo
BUG=v8:4476,v8:4477
LOG=Y
TEST=test262/{built-ins,intl402}/Strings/*, webkit/fast/js/*, mjsunit/string-case,
     intl/general/case*

Review-Url: https://codereview.chromium.org/1812673005
Cr-Commit-Position: refs/heads/master@{#36187}

											
										
										
											2016-05-11 19:01:41 +00:00
 								// Locale selection for toLocaleLowerCase, using "I" + U+0307 as the probe.
 								// "tr" and "az" should behave identically.
 								assertEquals("aBcI\u0307".toLocaleLowerCase("tr"),
 								             "aBcI\u0307".toLocaleLowerCase("az"));
 								// What matters is the first locale in the locale list.
 								assertEquals("aBcI\u0307".toLocaleLowerCase(["tr", "en", "fr"]),
 								             "aBcI\u0307".toLocaleLowerCase("tr"));
 								assertEquals("aBcI\u0307".toLocaleLowerCase(["en", "tr", "az"]),
 								             "aBcI\u0307".toLocaleLowerCase("en"));
 								assertEquals("aBcI\u0307".toLocaleLowerCase(["en", "tr", "az"]),
 								             "aBcI\u0307".toLowerCase());
 								// An empty locale list is the same as the default locale. Try these tests
 								// under Turkish and Greek locale.
 								// NOTE(review): Intl.GetDefaultLocale is not a standard Intl property; if
 								// it evaluates to undefined, the two assertions using it just repeat the
 								// no-argument (default locale) case - confirm.
 								assertEquals("aBcI\u0307".toLocaleLowerCase([]),
 								             "aBcI\u0307".toLocaleLowerCase());
 								assertEquals("aBcI\u0307".toLocaleLowerCase([]),
 								             "aBcI\u0307".toLocaleLowerCase(Intl.GetDefaultLocale));
 								assertEquals("άόύώ".toLocaleUpperCase([]), "άόύώ".toLocaleUpperCase());
 								assertEquals("άόύώ".toLocaleUpperCase([]),
 								             "άόύώ".toLocaleUpperCase(Intl.GetDefaultLocale));
 								// English/root locale keeps U+0307 (combining dot above).
 								assertEquals("abci\u0307", "aBcI\u0307".toLocaleLowerCase("en"));
-												Fix two DCHECK failures in ICU case mapping code

1.
DCHECK in runtime-i18n.cc for case mapping was wrong to
assume that the longest primary language tag is 3 characters.
BCP 47 actually allows up to 8 characters.

2. GetFlatContent() was called on a string without flattening it first.

BUG=680314,680464
TEST=intl/general/case-mapping (see also the bugs)

Review-Url: https://codereview.chromium.org/2629763003
Cr-Commit-Position: refs/heads/master@{#42343}

											
										
										
											2017-01-13 23:12:43 +00:00
+								assertEquals("abci\u0307", "aBcI\u0307".toLocaleLowerCase("en-GB"));
-												Use ICU case conversion/transliterator for case conversion

When I18N is enabled, use ICU's case conversion API and transliteration
API [1] to implement String.prototype.to{Upper,Lower}Case and
String.prototype.toLocale{Upper,Lower}Case.

* ICU-based case conversion was implemented in runtime-i18n.cc/i18n.js
* The above 4 functions are overridden with those in i18n.js when the
  --icu_case_mapping flag is turned on. To control the override by the flag,
  they're overridden in icu-case-mapping.js

Previously, toLocale{U,L}Case just called to{U,L}Case so that they didn't
support locale-sensitive case conversion for Turkic languages (az, tr),
Greek (el) and Lithuanian (lt).

Before ICU APIs for the most general case are called, a fast-path for Latin-1
is tried. It's taken from Blink and adopted as necessary. This fast path
is always tried for to{U,L}Case. For toLocale{U,L}Case, it's only taken
when a locale (explicitly specified or default) is not in {az, el, lt, tr}.

With these changes, a build with --icu_case_mapping=true passes a bunch
of tests in test262/intl402/Strings/* and intl/* that failed before.

Handling of pure ASCII strings (aligned at word boundary) is not as fast
as Unibrow's implementation that uses word-by-word case conversion. OTOH,
Latin-1 input handling is faster than Unibrow. General Unicode input
handling is slower but more accurate.

See https://docs.google.com/spreadsheets/d/1KJCJxKc1FxFXjwmYqABS0_2cNdPetvnd8gY8_HGSbrg/edit?usp=sharing for the benchmark.

This CL started with http://crrev.com/1544023002#ps200001 by littledan@,
but has changed significantly since.

[1] See why transliteration API is needed for uppercasing in Greek.
    http://bugs.icu-project.org/trac/ticket/10582

R=yangguo
BUG=v8:4476,v8:4477
LOG=Y
TEST=test262/{built-ins,intl402}/Strings/*, webkit/fast/js/*, mjsunit/string-case,
     intl/general/case*

Review-Url: https://codereview.chromium.org/1812673005
Cr-Commit-Position: refs/heads/master@{#36187}

											
										
										
											2016-05-11 19:01:41 +00:00
+								assertEquals("abci\u0307", "aBcI\u0307".toLocaleLowerCase(["en", "tr"]));
 								assertEquals("abci\u0307", "aBcI\u0307".toLowerCase());
-												Fix two DCHECK failures in ICU case mapping code

1.
DCHECK in runtime-i18n.cc for case mapping was wrong to
assume that the longest primary language tag is 3 characters.
BCP 47 actually allows up to 8 characters.

2. GetFlatContent() was called on a string without flattening it first.

BUG=680314,680464
TEST=intl/general/case-mapping (see also the bugs)

Review-Url: https://codereview.chromium.org/2629763003
Cr-Commit-Position: refs/heads/master@{#42343}

											
										
										
											2017-01-13 23:12:43 +00:00
+								// Anything other than 'tr' and 'az' behaves like root for U+0307.
 								// Covers a 3-letter tag, a tag with script and region subtags, and
 								// "i-klingon".
 								assertEquals("abci\u0307", "aBcI\u0307".toLocaleLowerCase("fil"));
 								assertEquals("abci\u0307", "aBcI\u0307".toLocaleLowerCase("zh-Hant-TW"));
 								assertEquals("abci\u0307", "aBcI\u0307".toLocaleLowerCase("i-klingon"));
-												Handle private / grandfathered tags gracefully for case-conversion

Bug=v8:6083
Test=intl/general/case-mapping.js

Change-Id: I254c54520262298d6843948654d1dc4583b0c245
Reviewed-on: https://chromium-review.googlesource.com/496886
Reviewed-by: Adam Klein <adamk@chromium.org>
Commit-Queue: Jungshik Shin <jshin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#45115}
											
										
										
											2017-05-04 21:56:03 +00:00
+								assertEquals("abci\u0307", "aBcI\u0307".toLocaleLowerCase("i-enochian"));
 								assertEquals("abci\u0307", "aBcI\u0307".toLocaleLowerCase("x-foobar"));
-												Fix two DCHECK failures in ICU case mapping code

1.
DCHECK in runtime-i18n.cc for case mapping was wrong to
assume that the longest primary language tag is 3 characters.
BCP 47 actually allows up to 8 characters.

2. GetFlatContent() was called on a string without flattening it first.

BUG=680314,680464
TEST=intl/general/case-mapping (see also the bugs)

Review-Url: https://codereview.chromium.org/2629763003
Cr-Commit-Position: refs/heads/master@{#42343}

											
										
										
											2017-01-13 23:12:43 +00:00
 								// Up to 8 chars are allowed for the primary language tag in BCP 47.
 								// "longlang" (8 chars) is accepted and behaves like root, alone or first
 								// in a list.
 								assertEquals("abci\u0307", "aBcI\u0307".toLocaleLowerCase("longlang"));
 								assertEquals("ABCI\u0307", "aBcI\u0307".toLocaleUpperCase("longlang"));
 								assertEquals("abci\u0307", "aBcI\u0307".toLocaleLowerCase(["longlang", "tr"]));
 								assertEquals("ABCI\u0307", "aBcI\u0307".toLocaleUpperCase(["longlang", "tr"]));
 								// "longlang2" (9 chars) must be rejected with a RangeError, whether it
 								// is passed alone or as the first entry of a list.
 								assertThrows(() => "abc".toLocaleLowerCase("longlang2"), RangeError);
 								assertThrows(() => "abc".toLocaleUpperCase("longlang2"), RangeError);
 								assertThrows(() => "abc".toLocaleLowerCase(["longlang2", "en"]), RangeError);
 								assertThrows(() => "abc".toLocaleUpperCase(["longlang2", "en"]), RangeError);
-												Use ICU case conversion/transliterator for case conversion

When I18N is enabled, use ICU's case conversion API and transliteration
API [1] to implement String.prototype.to{Upper,Lower}Case and
String.prototype.toLocale{Upper,Lower}Case.

* ICU-based case conversion was implemented in runtime-i18n.cc/i18n.js
* The above 4 functions are overridden with those in i18n.js when the
  --icu_case_mapping flag is turned on. To control the override by the flag,
  they're overridden in icu-case-mapping.js

Previously, toLocale{U,L}Case just called to{U,L}Case so that they didn't
support locale-sensitive case conversion for Turkic languages (az, tr),
Greek (el) and Lithuanian (lt).

Before ICU APIs for the most general case are called, a fast-path for Latin-1
is tried. It's taken from Blink and adopted as necessary. This fast path
is always tried for to{U,L}Case. For toLocale{U,L}Case, it's only taken
when a locale (explicitly specified or default) is not in {az, el, lt, tr}.

With these changes, a build with --icu_case_mapping=true passes a bunch
of tests in test262/intl402/Strings/* and intl/* that failed before.

Handling of pure ASCII strings (aligned at word boundary) is not as fast
as Unibrow's implementation that uses word-by-word case conversion. OTOH,
Latin-1 input handling is faster than Unibrow. General Unicode input
handling is slower but more accurate.

See https://docs.google.com/spreadsheets/d/1KJCJxKc1FxFXjwmYqABS0_2cNdPetvnd8gY8_HGSbrg/edit?usp=sharing for the benchmark.

This CL started with http://crrev.com/1544023002#ps200001 by littledan@,
but has changed significantly since.

[1] See why transliteration API is needed for uppercasing in Greek.
    http://bugs.icu-project.org/trac/ticket/10582

R=yangguo
BUG=v8:4476,v8:4477
LOG=Y
TEST=test262/{built-ins,intl402}/Strings/*, webkit/fast/js/*, mjsunit/string-case,
     intl/general/case*

Review-Url: https://codereview.chromium.org/1812673005
Cr-Commit-Position: refs/heads/master@{#36187}

											
										
										
											2016-05-11 19:01:41 +00:00
+								// Greek uppercasing: not covered by intl402/String/*, yet. Tonos (U+0301) and
-												Use a regular ICU API for el-Upper

ICU now supports uppercasing in Greek via its regular uppercasing API.
So, there's no need to use a slow transliteration API for uppercasing
in Greek.

This CL includes rolling ICU to ICU 58.1.

Besides, drop intl402/Intl/getCanonicalLocales/weird-cases from
test262.status because it passes now with ICU 58.1.

BUG=chromium:637001,v8:5012

Review-Url: https://codereview.chromium.org/2491333003
Cr-Commit-Position: refs/heads/master@{#41009}

											
										
										
											2016-11-15 18:29:23 +00:00
+								// other diacritic marks are dropped.  See
 								// http://bugs.icu-project.org/trac/ticket/5456#comment:19 for more examples.
 								// See also http://bugs.icu-project.org/trac/ticket/12845 .
-												Use ICU case conversion/transliterator for case conversion

When I18N is enabled, use ICU's case conversion API and transliteration
API [1] to implement String.prototype.to{Upper,Lower}Case and
String.prototype.toLocale{Upper,Lower}Case.

* ICU-based case conversion was implemented in runtime-i18n.cc/i18n.js
* The above 4 functions are overridden with those in i18n.js when the
  --icu_case_mapping flag is turned on. To control the override by the flag,
  they're overridden in icu-case-mapping.js

Previously, toLocale{U,L}Case just called to{U,L}Case so that they didn't
support locale-sensitive case conversion for Turkic languages (az, tr),
Greek (el) and Lithuanian (lt).

Before ICU APIs for the most general case are called, a fast-path for Latin-1
is tried. It's taken from Blink and adopted as necessary. This fast path
is always tried for to{U,L}Case. For toLocale{U,L}Case, it's only taken
when a locale (explicitly specified or default) is not in {az, el, lt, tr}.

With these changes, a build with --icu_case_mapping=true passes a bunch
of tests in test262/intl402/Strings/* and intl/* that failed before.

Handling of pure ASCII strings (aligned at word boundary) is not as fast
as Unibrow's implementation that uses word-by-word case conversion. OTOH,
Latin-1 input handling is faster than Unibrow. General Unicode input
handling is slower but more accurate.

See https://docs.google.com/spreadsheets/d/1KJCJxKc1FxFXjwmYqABS0_2cNdPetvnd8gY8_HGSbrg/edit?usp=sharing for the benchmark.

This CL started with http://crrev.com/1544023002#ps200001 by littledan@,
but has changed significantly since.

[1] See why transliteration API is needed for uppercasing in Greek.
    http://bugs.icu-project.org/trac/ticket/10582

R=yangguo
BUG=v8:4476,v8:4477
LOG=Y
TEST=test262/{built-ins,intl402}/Strings/*, webkit/fast/js/*, mjsunit/string-case,
     intl/general/case*

Review-Url: https://codereview.chromium.org/1812673005
Cr-Commit-Position: refs/heads/master@{#36187}

											
										
										
											2016-05-11 19:01:41 +00:00
+								assertEquals("Α", "α\u0301".toLocaleUpperCase("el"));
 								assertEquals("Α", "α\u0301".toLocaleUpperCase("el-GR"));
 								assertEquals("Α", "α\u0301".toLocaleUpperCase("el-Grek"));
 								assertEquals("Α", "α\u0301".toLocaleUpperCase("el-Grek-GR"));
 								assertEquals("Α", "ά".toLocaleUpperCase("el"));
-												Use a regular ICU API for el-Upper

ICU now supports uppercasing in Greek via its regular uppercasing API.
So, there's no need to use a slow transliteration API for uppercasing
in Greek.

This CL includes rolling ICU to ICU 58.1.

Besides, drop intl402/Intl/getCanonicalLocales/weird-cases from
test262.status because it passes now with ICU 58.1.

BUG=chromium:637001,v8:5012

Review-Url: https://codereview.chromium.org/2491333003
Cr-Commit-Position: refs/heads/master@{#41009}

											
										
										
											2016-11-15 18:29:23 +00:00
+								assertEquals("ΑΟΫΩ", "άόύώ".toLocaleUpperCase("el"));
 								assertEquals("ΑΟΫΩ", "α\u0301ο\u0301υ\u0301ω\u0301".toLocaleUpperCase("el"));
 								assertEquals("ΑΟΫΩ", "άόύώ".toLocaleUpperCase("el"));
-												Use ICU case conversion/transliterator for case conversion

When I18N is enabled, use ICU's case conversion API and transliteration
API [1] to implement String.prototype.to{Upper,Lower}Case and
String.prototype.toLocale{Upper,Lower}Case.

* ICU-based case conversion was implemented in runtime-i18n.cc/i18n.js
* The above 4 functions are overridden with those in i18n.js when the
  --icu_case_mapping flag is turned on. To control the override by the flag,
  they're overridden in icu-case-mapping.js

Previously, toLocale{U,L}Case just called to{U,L}Case so that they didn't
support locale-sensitive case conversion for Turkic languages (az, tr),
Greek (el) and Lithuanian (lt).

Before ICU APIs for the most general case are called, a fast-path for Latin-1
is tried. It's taken from Blink and adopted as necessary. This fast path
is always tried for to{U,L}Case. For toLocale{U,L}Case, it's only taken
when a locale (explicitly specified or default) is not in {az, el, lt, tr}.

With these changes, a build with --icu_case_mapping=true passes a bunch
of tests in test262/intl402/Strings/* and intl/* that failed before.

Handling of pure ASCII strings (aligned at word boundary) is not as fast
as Unibrow's implementation that uses word-by-word case conversion. OTOH,
Latin-1 input handling is faster than Unibrow. General Unicode input
handling is slower but more accurate.

See https://docs.google.com/spreadsheets/d/1KJCJxKc1FxFXjwmYqABS0_2cNdPetvnd8gY8_HGSbrg/edit?usp=sharing for the benchmark.

This CL started with http://crrev.com/1544023002#ps200001 by littledan@,
but has changed significantly since.

[1] See why transliteration API is needed for uppercasing in Greek.
    http://bugs.icu-project.org/trac/ticket/10582

R=yangguo
BUG=v8:4476,v8:4477
LOG=Y
TEST=test262/{built-ins,intl402}/Strings/*, webkit/fast/js/*, mjsunit/string-case,
     intl/general/case*

Review-Url: https://codereview.chromium.org/1812673005
Cr-Commit-Position: refs/heads/master@{#36187}

											
										
										
											2016-05-11 19:01:41 +00:00
+								assertEquals("ΟΕ", "Ό\u1f15".toLocaleUpperCase("el"));
 								assertEquals("ΟΕ", "Ο\u0301ε\u0314\u0301".toLocaleUpperCase("el"));
-												Use a regular ICU API for el-Upper

ICU now supports uppercasing in Greek via its regular uppercasing API.
So, there's no need to use a slow transliteration API for uppercasing
in Greek.

This CL includes rolling ICU to ICU 58.1.

Besides, drop intl402/Intl/getCanonicalLocales/weird-cases from
test262.status because it passes now with ICU 58.1.

BUG=chromium:637001,v8:5012

Review-Url: https://codereview.chromium.org/2491333003
Cr-Commit-Position: refs/heads/master@{#41009}

											
										
										
											2016-11-15 18:29:23 +00:00
+								assertEquals("ΡΩΜΕΪΚΑ", "ρωμέικα".toLocaleUpperCase("el"));
 								assertEquals("ΜΑΪΟΥ, ΤΡΟΛΕΪ", "Μαΐου, τρόλεϊ".toLocaleUpperCase("el"));
 								assertEquals("ΤΟ ΕΝΑ Ή ΤΟ ΑΛΛΟ.", "Το ένα ή το άλλο.".toLocaleUpperCase("el"));
-												Use ICU case conversion/transliterator for case conversion

When I18N is enabled, use ICU's case conversion API and transliteration
API [1] to implement String.prototype.to{Upper,Lower}Case and
String.prototype.toLocale{Upper,Lower}Case.

* ICU-based case conversion was implemented in runtime-i18n.cc/i18n.js
* The above 4 functions are overridden with those in i18n.js when the
  --icu_case_mapping flag is turned on. To control the override by the flag,
  they're overridden in icu-case-mapping.js

Previously, toLocale{U,L}Case just called to{U,L}Case so that they didn't
support locale-sensitive case conversion for Turkic languages (az, tr),
Greek (el) and Lithuanian (lt).

Before ICU APIs for the most general case are called, a fast-path for Latin-1
is tried. It's taken from Blink and adopted as necessary. This fast path
is always tried for to{U,L}Case. For toLocale{U,L}Case, it's only taken
when a locale (explicitly specified or default) is not in {az, el, lt, tr}.

With these changes, a build with --icu_case_mapping=true passes a bunch
of tests in test262/intl402/Strings/* and intl/* that failed before.

Handling of pure ASCII strings (aligned at word boundary) is not as fast
as Unibrow's implementation that uses word-by-word case conversion. OTOH,
Latin-1 input handling is faster than Unibrow. General Unicode input
handling is slower but more accurate.

See https://docs.google.com/spreadsheets/d/1KJCJxKc1FxFXjwmYqABS0_2cNdPetvnd8gY8_HGSbrg/edit?usp=sharing for the benchmark.

This CL started with http://crrev.com/1544023002#ps200001 by littledan@,
but has changed significantly since.

[1] See why transliteration API is needed for uppercasing in Greek.
    http://bugs.icu-project.org/trac/ticket/10582

R=yangguo
BUG=v8:4476,v8:4477
LOG=Y
TEST=test262/{built-ins,intl402}/Strings/*, webkit/fast/js/*, mjsunit/string-case,
     intl/general/case*

Review-Url: https://codereview.chromium.org/1812673005
Cr-Commit-Position: refs/heads/master@{#36187}

											
										
										
											2016-05-11 19:01:41 +00:00
 								// Input and output are identical.
 								assertEquals("αβγδε", "αβγδε".toLocaleLowerCase("el"));
 								assertEquals("ΑΒΓΔΕ", "ΑΒΓΔΕ".toLocaleUpperCase("el"));
 								assertEquals("ΑΒΓΔΕАБ𝐀𝐁", "ΑΒΓΔΕАБ𝐀𝐁".toLocaleUpperCase("el"));
 								assertEquals("ABCDEÂÓḴ123", "ABCDEÂÓḴ123".toLocaleUpperCase("el"));
 								// ASCII-only or Latin-1 only: 1-byte
 								assertEquals("ABCDE123", "ABCDE123".toLocaleUpperCase("el"));
 								assertEquals("ABCDEÂÓ123", "ABCDEÂÓ123".toLocaleUpperCase("el"));
 								// To make sure that the input string is not overwritten in place.
 								var strings = ["abCdef", "αβγδε", "άόύώ", "аб"];
 								for (var s  of strings) {
 								  // Snapshot the code units before the call ...
 								  var backupAsArray = s.split("");
 								  // ... the result is intentionally unused; only the call matters ...
 								  var uppered = s.toLocaleUpperCase("el");
 								  // ... then check that `s` still equals the snapshot.
 								  assertEquals(s, backupAsArray.join(""));
 								}
 								// In other locales, U+0301 is preserved.
 								assertEquals("Α\u0301Ο\u0301Υ\u0301Ω\u0301",
 								             "α\u0301ο\u0301υ\u0301ω\u0301".toLocaleUpperCase("en"));
 								assertEquals("Α\u0301Ο\u0301Υ\u0301Ω\u0301",
 								             "α\u0301ο\u0301υ\u0301ω\u0301".toUpperCase());
 								// Plane 1; Deseret and Warang Citi Script.
 								assertEquals("\u{10400}\u{118A0}", "\u{10428}\u{118C0}".toUpperCase());
 								assertEquals("\u{10428}\u{118C0}", "\u{10400}\u{118A0}".toLowerCase());
 								// Mathematical Bold {Capital, Small} Letter A do not change.
 								assertEquals("\u{1D400}\u{1D41A}", "\u{1D400}\u{1D41A}".toUpperCase());
 								assertEquals("\u{1D400}\u{1D41A}", "\u{1D400}\u{1D41A}".toLowerCase());
 								// Plane 1; New characters in Unicode 8.0: Old Hungarian capital letter A
 								// (U+10C80) and small letter A (U+10CC0).
 								// The mapping does not depend on the locale, so the default locale and
 								// "tr" must agree with plain to{Upper,Lower}Case in both directions.
 								assertEquals("\u{10C80}", "\u{10CC0}".toUpperCase());
 								assertEquals("\u{10CC0}", "\u{10C80}".toLowerCase());
 								assertEquals("\u{10C80}", "\u{10CC0}".toLocaleUpperCase());
 								assertEquals("\u{10CC0}", "\u{10C80}".toLocaleLowerCase());
 								assertEquals("\u{10C80}", "\u{10CC0}".toLocaleUpperCase(["tr"]));
 								assertEquals("\u{10CC0}", "\u{10C80}".toLocaleLowerCase(["tr"]));
-												Fix the uppercasing of U+00E7(ç) and U+00F7(÷)

Due to a typo in runtime-i18n.js, 'ç'(U+00E7) was not uppercased while
'÷'(U+00F7) was incorrectly uppercased to '×'(U+00D7).

Add a comprehensive test for Latin-1 supplemental block (U+00A0 ~ U+00FF).
(they're special-cased for speed-up and needs to have a test for the range.).

TEST=intl/general/case-mapping
BUG=v8:5681

Review-Url: https://codereview.chromium.org/2533033003
Cr-Commit-Position: refs/heads/master@{#41331}

											
										
										
											2016-11-28 22:55:23 +00:00
 								// check fast path for Latin-1 supplement (U+00A0 ~ U+00FF)
 								// All 96 code points of the block are listed in order, including the
 								// caseless U+00AF (¯), so the whole special-cased range is exercised.
 								var latin1Suppl = "\u00A0¡¢£¤¥¦§¨©ª«¬\u00AD®¯°±²³´µ¶·¸¹º»¼½¾¿" +
 								    "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";
 								// Expected lowercase: À-Þ map to à-þ, while × (U+00D7) and ß are unchanged.
 								var latin1SupplLowercased = "\u00A0¡¢£¤¥¦§¨©ª«¬\u00AD®¯°±²³´µ¶·¸¹º»¼½¾¿" +
 								    "àáâãäåæçèéêëìíîïðñòóôõö×øùúûüýþßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";
 								// Expected uppercase: µ -> U+039C, ß -> "SS", ÿ -> U+0178, while ÷ (U+00F7)
 								// is unchanged.
 								var latin1SupplUppercased = "\u00A0¡¢£¤¥¦§¨©ª«¬\u00AD®¯°±²³´\u039C¶·¸¹º»¼½¾¿" +
 								    "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞSSÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ÷ØÙÚÛÜÝÞ\u0178";
 								assertEquals(latin1SupplLowercased, latin1Suppl.toLowerCase());
 								assertEquals(latin1SupplUppercased, latin1Suppl.toUpperCase());
 								assertEquals(latin1SupplLowercased, latin1Suppl.toLocaleLowerCase("de"));
 								assertEquals(latin1SupplUppercased, latin1Suppl.toLocaleUpperCase("de"));
 								assertEquals(latin1SupplLowercased, latin1Suppl.toLocaleLowerCase("el"));
 								assertEquals(latin1SupplUppercased, latin1Suppl.toLocaleUpperCase("el"));
 								assertEquals(latin1SupplUppercased, latin1Suppl.toLocaleUpperCase("tr"));
 								assertEquals(latin1SupplLowercased, latin1Suppl.toLocaleLowerCase("tr"));
 								assertEquals(latin1SupplUppercased, latin1Suppl.toLocaleUpperCase("az"));
 								assertEquals(latin1SupplLowercased, latin1Suppl.toLocaleLowerCase("az"));
 								assertEquals(latin1SupplUppercased, latin1Suppl.toLocaleUpperCase("lt"));
 								// Lithuanian need to have a dot-above for U+00CC(Ì) and U+00CD(Í) when
 								// lowercasing.
 								assertEquals("\u00A0¡¢£¤¥¦§¨©ª«¬\u00AD®¯°±²³´µ¶·¸¹º»¼½¾¿" +
 								    "àáâãäåæçèéêëi\u0307\u0300i\u0307\u0301îïðñòóôõö×øùúûüýþß" +
 								    "àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ",
 								    latin1Suppl.toLocaleLowerCase("lt"));