v8/test/intl/regress-10573.js
Iain Ireland b65fcfe925 [regexp] Fix non-unicode ignore-case backreferences
https://crrev.com/c/2072858 rewrote the implementation of non-unicode
ignore-case matches to comply with the JS spec in some corner
cases. It fixed character matches and character class matches.

We missed a similar bug in the implementation of back references. This
CL fixes that bug.

The main change is in regexp-macro-assembler.cc, where
CaseInsensitiveCompareUC16 is split into CaseInsensitiveCompareUnicode
(which has the same semantics as before) and
CaseInsensitiveCompareNonUnicode (which has the semantics described
here: https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch).

Most of the rest of the patch undoes https://crrev.com/c/2081816 to
once again make the unicode flag available to the macroassembler, so
that we can decide which helper function to call.

The testcase is a version of test/intl/regress-10248.js, modified to
test backreferences.

Bug: v8:10573
Change-Id: I70ef7d134d37f99b1f75a5eba17020e82d59f1b9
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2219284
Reviewed-by: Jakob Gruber <jgruber@chromium.org>
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#68129}
2020-06-03 08:59:08 +00:00

74 lines
2.6 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Minimal assertion helper used by the checks in this file.
//
// Compares `a` and `b` with strict inequality (`!==`) and throws an Error
// when they differ, so a failed check aborts the script instead of merely
// being printed. Both values are included in the message so the failing
// comparison can be identified. Returns undefined when the values match.
function assertEquals(a, b) {
  if (a !== b) {
    throw new Error(
        'assertEquals failed: ' + String(a) + ' !== ' + String(b));
  }
}
// Test-side model of Canonicalize(ch) for non-unicode ignore-case matching.
// See https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch
//
// Returns `ch` itself (a string) when its upper-case form is longer than
// one code unit, or when `ch` has a code unit >= 128 but its upper-case
// form has a code unit < 128. In every other case it returns the code unit
// (a number) of the upper-case form. Callers in this file only compare two
// results with `===`.
function Canonicalize(ch) {
  const upper = ch.toUpperCase();
  if (upper.length > 1) {
    // Multi-code-unit upper-case mappings leave the character unchanged.
    return ch;
  }
  const upperCode = upper.charCodeAt(0);
  // A non-ASCII character must not be canonicalized to an ASCII one.
  const crossesIntoAscii = upperCode < 128 && ch.charCodeAt(0) >= 128;
  return crossesIntoAscii ? ch : upperCode;
}
// Checks case-insensitive backreference matching for one class of characters.
//
// For every ordered pair of distinct positions in `eclass`, the two
// characters are concatenated and tested against /(.)\1/ with the `i` flag
// and with the `iu` flags:
//   - without `u`, the match result must equal whether Canonicalize()
//     yields strictly equal values for the two characters;
//   - with `u`, the pair is always expected to match.
function TestEquivalenceClass(eclass) {
  const plainBackref = /(.)\1/i;
  const unicodeBackref = /(.)\1/iu;
  const size = eclass.length;
  for (let first = 0; first < size; first++) {
    for (let second = 0; second < size; second++) {
      if (first !== second) {
        const lead = eclass[first];
        const trail = eclass[second];
        const pair = lead + trail;
        const expected = Canonicalize(lead) === Canonicalize(trail);
        assertEquals(plainBackref.test(pair), expected);
        assertEquals(unicodeBackref.test(pair), true);
      }
    }
  }
}
// Runs TestEquivalenceClass() on every entry of the file-level
// `equivalence_classes` table, in table order.
function TestAll() {
  equivalence_classes.forEach(function (eclass) {
    TestEquivalenceClass(eclass);
  });
}
// Interesting case-folding equivalence classes (as determined by
// ICU's UnicodeSet::closeOver). A class is interesting if it contains
// more than two characters, or if it contains any characters in
// IgnoreSet or SpecialAddSet as defined in src/regexp/special-case.h.
//
// Each entry is a string whose characters make up one class. TestAll()
// hands every entry to TestEquivalenceClass(), which tries each ordered
// pair of distinct characters from that entry. The trailing comments show
// the characters for readability; where a character is easily confused
// with a look-alike, its code point is spelled out.
var equivalence_classes = [
'\u0041\u0061', // Aa (sanity check)
'\u004b\u006b\u212a', // Kk + U+212A KELVIN SIGN
'\u0053\u0073\u017f', // Ss + U+017F LATIN SMALL LETTER LONG S (ſ)
'\u00b5\u039c\u03bc', // U+00B5 MICRO SIGN (µ), Greek capital Mu (Μ), Greek small mu (μ)
'\u00c5\u00e5\u212b', // Åå + U+212B ANGSTROM SIGN
'\u00df\u1e9e', // ßẞ
'\u03a9\u03c9\u2126', // Ωω + U+2126 OHM SIGN
'\u0390\u1fd3', // ΐ (U+0390) and ΐ (U+1FD3)
'\u0398\u03b8\u03d1\u03f4', // Θθϑϴ
'\u03b0\u1fe3', // ΰ (U+03B0) and ΰ (U+1FE3)
'\u1f80\u1f88', // ᾀᾈ
'\u1fb3\u1fbc', // ᾳᾼ
'\u1fc3\u1fcc', // ῃῌ
'\u1ff3\u1ffc', // ῳῼ
'\ufb05\ufb06', // U+FB05 LATIN SMALL LIGATURE LONG S T, U+FB06 LATIN SMALL LIGATURE ST
// Everything below this line is a well-behaved case-folding
// equivalence class with more than two characters but only one
// canonical case-folded character.
'\u01c4\u01c5\u01c6', '\u01c7\u01c8\u01c9', '\u01ca\u01cb\u01cc',
'\u01f1\u01f2\u01f3', '\u0345\u0399\u03b9\u1fbe', '\u0392\u03b2\u03d0',
'\u0395\u03b5\u03f5', '\u039a\u03ba\u03f0', '\u03a0\u03c0\u03d6',
'\u03a1\u03c1\u03f1', '\u03a3\u03c2\u03c3', '\u03a6\u03c6\u03d5',
'\u0412\u0432\u1c80', '\u0414\u0434\u1c81', '\u041e\u043e\u1c82',
'\u0421\u0441\u1c83', '\u0422\u0442\u1c84\u1c85', '\u042a\u044a\u1c86',
'\u0462\u0463\u1c87', '\u1c88\ua64a\ua64b', '\u1e60\u1e61\u1e9b'
];
// Run all checks as soon as the script is loaded.
TestAll();