3fab9d05cf
Non-unicode, case-insensitive regexps (e.g. /foo/i, not /foo/iu) use a case-folding algorithm that doesn't quite match the Unicode definition. There are two places in irregexp that need to do case-folding. Prior to this patch, neither of them quite matched the spec (https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch). This patch implements the "Canonicalize" algorithm in src/regexp/special-case.h, and uses it in the relevant places. It replaces special-case logic around upper-casing / ASCII characters with the following approach: 1. For most characters, calling UnicodeSet::closeOver on a set containing that character will produce the correct set of case-insensitive matches. 2. For a small handful of characters (like the sharp S that prompted this change), UnicodeSet::closeOver will include some characters that should be omitted. For example, although closeOver('ß') = "ßẞ", uppercase('ß') is "SS", so step 3.e means that 'ß' canonicalizes to itself, and should not match 'ẞ'. In these cases, we can skip the closeOver entirely, because it will never add an equivalent character. These characters are in the IgnoreSet. 3. For an even smaller handful of characters, UnicodeSet::closeOver will produce some characters that should be omitted, but also some characters that should be included. For example, closeOver('k') = "kK\u212A" (lowercase k, uppercase K, U+212A KELVIN SIGN), but KELVIN SIGN should not match either of the other two (step 3.g). To handle this, we put such characters in the SpecialAddSet. In these cases, we closeOver the original character, but filter out the results that do not have the same canonical value. The computation of IgnoreSet and SpecialAddSet happens at build time, using the pre-existing gen-regexp-special-case.cc step. 
R=jgruber@chromium.org Bug: v8:10248 Change-Id: I00d48b180c83bb8e645cc59eda57b01eab134f0b Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2072858 Reviewed-by: Frank Tang <ftang@chromium.org> Reviewed-by: Jakob Gruber <jgruber@chromium.org> Commit-Queue: Jakob Gruber <jgruber@chromium.org> Cr-Commit-Position: refs/heads/master@{#66641}
71 lines
2.5 KiB
JavaScript
71 lines
2.5 KiB
JavaScript
// Copyright 2020 the V8 project authors. All rights reserved.
|
||
// Use of this source code is governed by a BSD-style license that can be
|
||
// found in the LICENSE file.
|
||
|
||
// See https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch
|
||
function Canonicalize(ch) {
|
||
var u = ch.toUpperCase();
|
||
if (u.length > 1) return ch;
|
||
var cu = u.charCodeAt(0);
|
||
if (ch.charCodeAt(0) >= 128 && cu < 128) return ch;
|
||
return cu;
|
||
}
|
||
|
||
function TestEquivalenceClass(eclass) {
|
||
for (var i = 0; i < eclass.length; i++) {
|
||
for (var j = 0; j < eclass.length; j++) {
|
||
if (i == j) continue;
|
||
var c1 = eclass[i];
|
||
var c2 = eclass[j];
|
||
var shouldMatch = Canonicalize(c1) === Canonicalize(c2);
|
||
|
||
var re1 = new RegExp(c1, 'i');
|
||
var re2 = new RegExp('[' + c1 + ']', 'i');
|
||
|
||
assertEquals(re1.test(c2), shouldMatch);
|
||
assertEquals(re2.test(c2), shouldMatch);
|
||
}
|
||
}
|
||
}
|
||
|
||
function TestAll() {
|
||
for (var eclass of equivalence_classes) {
|
||
TestEquivalenceClass(eclass);
|
||
}
|
||
}
|
||
|
||
// Interesting case-folding equivalence classes (as determined by
|
||
// ICU's UnicodeSet::closeOver). A class is interesting if it contains
|
||
// more than two characters, or if it contains any characters in
|
||
// IgnoreSet or SpecialAddSet as defined in src/regexp/special-case.h.
|
||
var equivalence_classes = [
|
||
'\u0041\u0061', // Aa (sanity check)
|
||
'\u004b\u006b\u212a', // KkK
|
||
'\u0053\u0073\u017f', // Ssſ
|
||
'\u00b5\u039c\u03bc', // µΜμ
|
||
'\u00c5\u00e5\u212b', // ÅåÅ
|
||
'\u00df\u1e9e', // ßẞ
|
||
'\u03a9\u03c9\u2126', // ΩωΩ
|
||
'\u0390\u1fd3', // ΐΐ
|
||
'\u0398\u03b8\u03d1\u03f4', // Θθϑϴ
|
||
'\u03b0\u1fe3', // ΰΰ
|
||
'\u1f80\u1f88', // ᾀᾈ
|
||
'\u1fb3\u1fbc', // ᾳᾼ
|
||
'\u1fc3\u1fcc', // ῃῌ
|
||
'\u1ff3\u1ffc', // ῳῼ
|
||
'\ufb05\ufb06', // ſtst
|
||
|
||
// Everything below this line is a well-behaved case-folding
|
||
// equivalence class with more than two characters but only one
|
||
// canonical case-folded character
|
||
'\u01c4\u01c5\u01c6', '\u01c7\u01c8\u01c9', '\u01ca\u01cb\u01cc',
|
||
'\u01f1\u01f2\u01f3', '\u0345\u0399\u03b9\u1fbe', '\u0392\u03b2\u03d0',
|
||
'\u0395\u03b5\u03f5', '\u039a\u03ba\u03f0', '\u03a0\u03c0\u03d6',
|
||
'\u03a1\u03c1\u03f1', '\u03a3\u03c2\u03c3', '\u03a6\u03c6\u03d5',
|
||
'\u0412\u0432\u1c80', '\u0414\u0434\u1c81', '\u041e\u043e\u1c82',
|
||
'\u0421\u0441\u1c83', '\u0422\u0442\u1c84\u1c85', '\u042a\u044a\u1c86',
|
||
'\u0462\u0463\u1c87', '\u1c88\ua64a\ua64b', '\u1e60\u1e61\u1e9b'
|
||
];
|
||
|
||
// Run the checks for every listed equivalence class when this file is loaded.
TestAll();
|