v8/test/mjsunit/es6/unicode-regexp-ignore-case.js
pthier dadd5f94f5 [regexp] Fix wrong unicode case-insensitive matches
When creating a character class in Unicode case-insensitive mode, we use
icu::UnicodeSet::closeOver() to add all characters that match the
characters in the class case-insensitively.
According to the spec, only simple case folding shall be performed for
case-insensitive Unicode matching, but closeOver() adds all characters
that are equal w.r.t. full case folding.
The current approach of just removing strings from the closeOver set is
not enough, as single code point characters still remain in the set if
they are equal only under full case folding.
E.g. the characters \u0390 and \u1FD3 both fold to the same string
"\u03B9\u0308\u0301" via full case folding, but they don't have a simple
case folding in common.

To prevent these wrong matches, we calculate the set of all characters
with close overs that are wrong according to the spec at build time and
remove them from the set before adding case-insensitive equivalent
characters.

Bug: v8:13377
Change-Id: I0252c79143f266911691331dd0e1e27044ea8cba
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3952095
Commit-Queue: Patrick Thier <pthier@chromium.org>
Reviewed-by: Camillo Bruni <cbruni@chromium.org>
Cr-Commit-Position: refs/heads/main@{#83791}
2022-10-19 11:12:27 +00:00

72 lines
2.5 KiB
JavaScript

// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Non-unicode mode (no /u flag) uses toUpperCase mappings, so U+00E5 and
// U+212B are not treated as case-insensitive equivalents here.
assertFalse(/[\u00e5]/i.test("\u212b"));
// NOTE(review): the trailing \u1234 presumably forces a two-byte subject
// string - confirm.
assertFalse(/[\u212b]/i.test("\u00e5\u1234"));
assertFalse(/[\u212b]/i.test("\u00e5"));
// String case conversions relating the three code points. The values are
// compared directly (expected first) instead of wrapping a loose `==` in
// assertTrue.
assertEquals("\u00e5", "\u212b".toLowerCase());
assertEquals("\u00e5", "\u00c5".toLowerCase());
assertEquals("\u00c5", "\u00e5".toUpperCase());
// Unicode mode (/u flag) uses case folding mappings: U+00E5, U+00C5 and
// U+212B all match one another and themselves. Every pattern/subject
// combination is checked exactly once.
// Pattern U+00E5.
assertTrue(/\u00e5/ui.test("\u00e5"));
assertTrue(/\u00e5/ui.test("\u00c5"));
assertTrue(/\u00e5/ui.test("\u212b"));
// Pattern U+00C5.
assertTrue(/\u00c5/ui.test("\u00e5"));
assertTrue(/\u00c5/ui.test("\u00c5"));
assertTrue(/\u00c5/ui.test("\u212b"));
// Pattern U+212B.
assertTrue(/\u212b/ui.test("\u00e5"));
assertTrue(/\u212b/ui.test("\u00c5"));
assertTrue(/\u212b/ui.test("\u212b"));
// Non-BMP.
{
  const cp10400 = "\u{10400}";
  const cp10428 = "\u{10428}";

  // Without the /u flag, the pattern /\u{10400}/i does not match U+10428.
  assertFalse(/\u{10400}/i.test(cp10428));

  // With /u, the two code points match each other in both directions,
  // whether the pattern spells the code point as \u{...} or as a surrogate
  // pair, and whether it is a bare atom or inside a character class.
  for (const atom of [/\u{10400}/ui, /\ud801\udc00/ui]) {
    assertTrue(atom.test(cp10428));
  }
  for (const charClass of [/[\u{10428}]/ui, /[\ud801\udc28]/ui]) {
    assertTrue(charClass.test(cp10400));
  }

  // Character class ranges: two that run from the BMP up to U+10428 (plain
  // and negated) and one that lies entirely inside the BMP.
  const rangeCases = [
    [/[\uff40-\u{10428}]+/ui, "\uff21\u{10400}abc", ["\uff21\u{10400}"]],
    [/[^\uff40-\u{10428}]+/ui, "\uff21\u{10400}abc\uff23", ["abc"]],
    [/[\u24d5-\uff33]+/ui, "\uff54\uff53\u24bb\u24ba", ["\uff53\u24bb"]],
  ];
  for (const [range, subject, expected] of rangeCases) {
    assertEquals(expected, range.exec(subject));
  }
}
// Which kinds of case folding mappings take part in /ui matching.
{
  const foldingCases = [
    // Full mappings are ignored.
    { re: /\u00df/ui, subject: "SS", matches: false },
    { re: /\u1f8d/ui, subject: "\u1f05\u03b9", matches: false },
    // Simple mappings work.
    { re: /\u1f8d/ui, subject: "\u1f85", matches: true },
    // Common mappings work.
    { re: /\u1f6b/ui, subject: "\u1f63", matches: true },
  ];
  for (const { re, subject, matches } of foldingCases) {
    const check = matches ? assertTrue : assertFalse;
    check(re.test(subject));
  }
}
// Back references.
{
  // Each \1 has to match the captured character case-insensitively: the
  // whole subject is matched while group 1 holds only its first character.
  const bmpSubject = "\u00e5\u212b\u00c5";
  const bmpMatch = /(.)\1\1/ui.exec(bmpSubject);
  assertEquals([bmpSubject, "\u00e5"], bmpMatch);

  // Same check with a pair of non-BMP code points.
  const nonBmpSubject = "\u{118aa}\u{118ca}";
  const nonBmpMatch = /(.)\1/ui.exec(nonBmpSubject);
  assertEquals([nonBmpSubject, "\u{118aa}"], nonBmpMatch);
}
// Misc: multi-character patterns matched against case variants.
for (const [re, subject] of [
  [/\u00e5\u00e5\u00e5/ui, "\u212b\u00e5\u00c5"],
  [/AB\u{10400}/ui, "ab\u{10428}"],
]) {
  assertTrue(re.test(subject));
}
// Non-Latin1 maps to Latin1: U+017F in the pattern matches "s" in the
// subject, both as a bare atom and inside a character class.
{
  // NOTE(review): the subjects ending in \u1234 presumably force a two-byte
  // subject string - confirm. The literals are kept as written for that
  // reason rather than being built by concatenation.
  const latin1Cases = [
    [/^\u017F/ui, "s", ["s"]],
    [/^\u017F/ui, "s\u1234", ["s"]],
    [/^a[\u017F]/ui, "as", ["as"]],
    [/^a[\u017F]/ui, "as\u1234", ["as"]],
  ];
  for (const [re, subject, expected] of latin1Cases) {
    assertEquals(expected, re.exec(subject));
  }
}
// Non-simple mappings created by UnicodeSet::closeOver() requiring special
// treatment. Each pair is checked in both directions and must not match.
for (const [charClass, subject] of [
  [/[\u0390]/ui, "\u1fd3"],
  [/[\u1fd3]/ui, "\u0390"],
  [/[\u03b0]/ui, "\u1fe3"],
  [/[\u1fe3]/ui, "\u03b0"],
  [/[\ufb05]/ui, "\ufb06"],
  [/[\ufb06]/ui, "\ufb05"],
]) {
  assertFalse(charClass.test(subject));
}