[regexp] Fix and unify non-unicode case-folding algorithms

Non-unicode, case-insensitive regexps (e.g. /foo/i, not /foo/iu) use a
case-folding algorithm that doesn't quite match the Unicode
definition. There are two places in irregexp that need to do
case-folding. Prior to this patch, neither of them quite matched the
spec (https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch).

This patch implements the "Canonicalize" algorithm in
src/regexp/special-case.h, and uses it in the relevant places. It
replaces special-case logic around upper-casing / ASCII characters
with the following approach:

1. For most characters, calling UnicodeSet::closeOver on a set
   containing that character will produce the correct set of
   case-insensitive matches.

2. For a small handful of characters (like the sharp S that prompted
   this change), UnicodeSet::closeOver will include some characters
   that should be omitted. For example, although closeOver('ß') =
   "ßẞ", uppercase('ß') is "SS", so step 3.e means that 'ß'
   canonicalizes to itself, and should not match 'ẞ'. In these cases,
   we can skip the closeOver entirely, because it will never add an
   equivalent character. These characters are in the IgnoreSet.

3. For an even smaller handful of characters, UnicodeSet::closeOver
   will produce some characters that should be omitted, but also some
   characters that should be included. For example, closeOver('k') =
   "kKK" (lowercase k, uppercase K, U+212A KELVIN SIGN), but KELVIN
   SIGN should not match either of the other two (step 3.g). To handle
   this, we put such characters in the SpecialAddSet. In these cases,
   we closeOver the original character, but filter out the results
   that do not have the same canonical value.

The computation of IgnoreSet and SpecialAddSet happens at build time,
using the pre-existing gen-regexp-special-case.cc step.

R=jgruber@chromium.org

Bug: v8:10248
Change-Id: I00d48b180c83bb8e645cc59eda57b01eab134f0b
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2072858
Reviewed-by: Frank Tang <ftang@chromium.org>
Reviewed-by: Jakob Gruber <jgruber@chromium.org>
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66641}
This commit is contained in:
Iain Ireland 2020-03-03 16:58:46 -05:00 committed by Commit Bot
parent 61d496a656
commit 3fab9d05cf
6 changed files with 294 additions and 225 deletions

View File

@ -97,6 +97,7 @@ Henrique Ferreiro <henrique.ferreiro@gmail.com>
Hirofumi Mako <mkhrfm@gmail.com>
Honggyu Kim <honggyu.kp@gmail.com>
Huáng Jùnliàng <jlhwung@gmail.com>
Iain Ireland <iireland@mozilla.com>
Ingvar Stepanyan <me@rreverser.com>
Ioseb Dzmanashvili <ioseb.dzmanashvili@gmail.com>
Isiah Meadows <impinball@gmail.com>

View File

@ -1,4 +1,4 @@
// Copyright 2019 the V8 project authors. All rights reserved.
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@ -7,19 +7,19 @@
#include <iostream>
#include <sstream>
#include "src/base/logging.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "src/regexp/special-case.h"
namespace v8 {
namespace internal {
// The following code generates BuildSpecialAddSet() and BuildIgnoreSet()
// functions into "src/regexp/special-case.cc".
// See more details in http://shorturl.at/adfO5
void PrintSet(std::ofstream& out, const char* func_name,
static const uc32 kSurrogateStart = 0xd800;
static const uc32 kSurrogateEnd = 0xdfff;
static const uc32 kNonBmpStart = 0x10000;
// The following code generates "src/regexp/special-case.cc".
void PrintSet(std::ofstream& out, const char* name,
const icu::UnicodeSet& set) {
out << "icu::UnicodeSet " << func_name << "() {\n"
out << "icu::UnicodeSet Build" << name << "() {\n"
<< " icu::UnicodeSet set;\n";
for (int32_t i = 0; i < set.getRangeCount(); i++) {
if (set.getRangeStart(i) == set.getRangeEnd(i)) {
@ -31,73 +31,113 @@ void PrintSet(std::ofstream& out, const char* func_name,
}
out << " set.freeze();\n"
<< " return set;\n"
<< "}\n";
<< "}\n\n";
out << "struct " << name << "Data {\n"
<< " " << name << "Data() : set(Build" << name << "()) {}\n"
<< " const icu::UnicodeSet set;\n"
<< "};\n\n";
out << "//static\n"
<< "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
<< " static base::LazyInstance<" << name << "Data>::type set =\n"
<< " LAZY_INSTANCE_INITIALIZER;\n"
<< " return set.Pointer()->set;\n"
<< "}\n\n";
}
void PrintSpecial(std::ofstream& out) {
icu::UnicodeSet current;
icu::UnicodeSet processed(0xd800, 0xdbff); // Ignore surrogate range.
icu::UnicodeSet special_add;
icu::UnicodeSet ignore;
UErrorCode status = U_ZERO_ERROR;
icu::UnicodeSet upper("[\\p{Lu}]", status);
CHECK(U_SUCCESS(status));
// Iterate through all chars in BMP except ASCII and Surrogate.
for (UChar32 i = 0x80; i < 0x010000; i++) {
// Ignore those characters which is already processed.
if (!processed.contains(i)) {
current.set(i, i);
current.closeOver(USET_CASE_INSENSITIVE);
// Remember we already processed current.
processed.addAll(current);
// Iterate through all chars in BMP except surrogates.
for (UChar32 i = 0; i < kNonBmpStart; i++) {
if (i >= kSurrogateStart && i <= kSurrogateEnd) {
continue; // Ignore surrogate range
}
current.set(i, i);
current.closeOver(USET_CASE_INSENSITIVE);
// All uppercase characters in current.
icu::UnicodeSet keep_upper(current);
keep_upper.retainAll(upper);
// Check if we have more than one uppercase character in current.
// If there are more than one uppercase character, then it is a special
// set which need to be added into either "Special Add" set or "Ignore"
// set.
int32_t number_of_upper = 0;
for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) {
number_of_upper +=
keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1;
// Check to see if all characters in the case-folding equivalence
// class as defined by UnicodeSet::closeOver all map to the same
// canonical value.
UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
bool class_has_matching_canonical_char = false;
bool class_has_non_matching_canonical_char = false;
for (int32_t j = 0; j < current.getRangeCount(); j++) {
for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
c++) {
if (c == i) {
continue;
}
UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
if (canonical == other_canonical) {
class_has_matching_canonical_char = true;
} else {
class_has_non_matching_canonical_char = true;
}
}
if (number_of_upper > 1) {
// Add all non uppercase characters (could be Ll or Mn) to special add
// set.
current.removeAll(upper);
special_add.addAll(current);
// Add the uppercase characters of non uppercase character to
// special add set.
CHECK_GT(current.getRangeCount(), 0);
UChar32 main_upper = u_toupper(current.getRangeStart(0));
special_add.add(main_upper);
// Add all uppercase except the main upper to ignore set.
keep_upper.remove(main_upper);
ignore.addAll(keep_upper);
}
// If any other character in i's equivalence class has a
// different canonical value, then i needs special handling. If
// no other character shares a canonical value with i, we can
// ignore i when adding alternatives for case-independent
// comparison. If at least one other character shares a
// canonical value, then i needs special handling.
if (class_has_non_matching_canonical_char) {
if (class_has_matching_canonical_char) {
special_add.add(i);
} else {
ignore.add(i);
}
}
}
// Remove any ASCII
special_add.remove(0x0000, 0x007f);
PrintSet(out, "BuildIgnoreSet", ignore);
PrintSet(out, "BuildSpecialAddSet", special_add);
// Verify that no Unicode equivalence class contains two non-trivial
// JS equivalence classes. Every character in SpecialAddSet has the
// same canonical value as every other non-IgnoreSet character in
// its Unicode equivalence class. Therefore, if we call closeOver on
// a set containing no IgnoreSet characters, the only characters
// that must be removed from the result are in IgnoreSet. This fact
// is used in CharacterRange::AddCaseEquivalents.
for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
for (UChar32 c = special_add.getRangeStart(i);
c <= special_add.getRangeEnd(i); c++) {
UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
current.set(c, c);
current.closeOver(USET_CASE_INSENSITIVE);
current.removeAll(ignore);
for (int32_t j = 0; j < current.getRangeCount(); j++) {
for (UChar32 c2 = current.getRangeStart(j);
c2 <= current.getRangeEnd(j); c2++) {
CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
}
}
}
}
PrintSet(out, "IgnoreSet", ignore);
PrintSet(out, "SpecialAddSet", special_add);
}
void WriteHeader(const char* header_filename) {
std::ofstream out(header_filename);
out << std::hex << std::setfill('0') << std::setw(4);
out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n"
<< "// The following functions are used to build icu::UnicodeSet\n"
<< "// for specical cases different between Unicode and ECMA262.\n"
out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
<< "// Use of this source code is governed by a BSD-style license that\n"
<< "// can be found in the LICENSE file.\n\n"
<< "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
<< "// The following functions are used to build UnicodeSets\n"
<< "// for special cases where the case-folding algorithm used by\n"
<< "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
<< "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
<< "// Semantics: Canonicalize) step 3.\n\n"
<< "#ifdef V8_INTL_SUPPORT\n"
<< "#include \"src/base/lazy-instance.h\"\n\n"
<< "#include \"src/regexp/special-case.h\"\n\n"
<< "#include \"unicode/uniset.h\"\n"
<< "namespace v8 {\n"

View File

@ -1140,39 +1140,6 @@ Vector<const int> CharacterRange::GetWordBounds() {
return Vector<const int>(kWordRanges, kWordRangeCount - 1);
}
#ifdef V8_INTL_SUPPORT
struct IgnoreSet {
IgnoreSet() : set(BuildIgnoreSet()) {}
const icu::UnicodeSet set;
};
struct SpecialAddSet {
SpecialAddSet() : set(BuildSpecialAddSet()) {}
const icu::UnicodeSet set;
};
icu::UnicodeSet BuildAsciiAToZSet() {
icu::UnicodeSet set('a', 'z');
set.add('A', 'Z');
set.freeze();
return set;
}
struct AsciiAToZSet {
AsciiAToZSet() : set(BuildAsciiAToZSet()) {}
const icu::UnicodeSet set;
};
static base::LazyInstance<IgnoreSet>::type ignore_set =
LAZY_INSTANCE_INITIALIZER;
static base::LazyInstance<SpecialAddSet>::type special_add_set =
LAZY_INSTANCE_INITIALIZER;
static base::LazyInstance<AsciiAToZSet>::type ascii_a_to_z_set =
LAZY_INSTANCE_INITIALIZER;
#endif // V8_INTL_SUPPORT
// static
void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
ZoneList<CharacterRange>* ranges,
@ -1195,75 +1162,22 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
others.add(from, to);
}
// Set of characters already added to ranges that do not need to be added
// again.
// Compute the set of additional characters that should be added,
// using UnicodeSet::closeOver. ECMA 262 defines slightly different
// case-folding rules than Unicode, so some characters that are
// added by closeOver do not match anything other than themselves in
// JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
// same case-insensitive character as 's' or 'S' according to
// Unicode, but does not match any other character in JS. To handle
// this case, we add such characters to the IgnoreSet and filter
// them out. We filter twice: once before calling closeOver (to
// prevent 'ſ' from adding 's'), and once after calling closeOver
// (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
// more information.
icu::UnicodeSet already_added(others);
// Set of characters in ranges that are in the 52 ASCII characters [a-zA-Z].
icu::UnicodeSet in_ascii_a_to_z(others);
in_ascii_a_to_z.retainAll(ascii_a_to_z_set.Pointer()->set);
// Remove all chars in [a-zA-Z] from others.
others.removeAll(in_ascii_a_to_z);
// Set of characters in ranges that are overlapping with special add set.
icu::UnicodeSet in_special_add(others);
in_special_add.retainAll(special_add_set.Pointer()->set);
others.removeAll(in_special_add);
// Ignore all chars in ignore set.
others.removeAll(ignore_set.Pointer()->set);
// For most of the chars in ranges that is still in others, find the case
// equivlant set by calling closeOver(USET_CASE_INSENSITIVE).
others.removeAll(RegExpCaseFolding::IgnoreSet());
others.closeOver(USET_CASE_INSENSITIVE);
// Because closeOver(USET_CASE_INSENSITIVE) may add ASCII [a-zA-Z] to others,
// but ECMA262 "i" mode won't consider that, remove them from others.
// Ex: U+017F add 'S' and 's' to others.
others.removeAll(ascii_a_to_z_set.Pointer()->set);
// Special handling for in_ascii_a_to_z.
for (int32_t i = 0; i < in_ascii_a_to_z.getRangeCount(); i++) {
UChar32 start = in_ascii_a_to_z.getRangeStart(i);
UChar32 end = in_ascii_a_to_z.getRangeEnd(i);
// Check if it is uppercase A-Z by checking bit 6.
if (start & 0x0020) {
// Add the lowercases
others.add(start & 0x005F, end & 0x005F);
} else {
// Add the uppercases
others.add(start | 0x0020, end | 0x0020);
}
}
// Special handling for chars in "Special Add" set.
for (int32_t i = 0; i < in_special_add.getRangeCount(); i++) {
UChar32 end = in_special_add.getRangeEnd(i);
for (UChar32 ch = in_special_add.getRangeStart(i); ch <= end; ch++) {
// Add the uppercase of this character if itself is not an uppercase
// character.
// Note: The if condiction cannot be u_islower(ch) because ch could be
// neither uppercase nor lowercase but Mn.
if (!u_isupper(ch)) {
others.add(u_toupper(ch));
}
icu::UnicodeSet candidates(ch, ch);
candidates.closeOver(USET_CASE_INSENSITIVE);
for (int32_t j = 0; j < candidates.getRangeCount(); j++) {
UChar32 end2 = candidates.getRangeEnd(j);
for (UChar32 ch2 = candidates.getRangeStart(j); ch2 <= end2; ch2++) {
// Add character that is not uppercase to others.
if (!u_isupper(ch2)) {
others.add(ch2);
}
}
}
}
}
// Remove all characters which already in the ranges.
others.removeAll(RegExpCaseFolding::IgnoreSet());
others.removeAll(already_added);
// Add others to the ranges

View File

@ -9,6 +9,9 @@
#include "src/objects/objects-inl.h"
#include "src/regexp/regexp-macro-assembler-arch.h"
#include "src/regexp/regexp-macro-assembler-tracer.h"
#ifdef V8_INTL_SUPPORT
#include "src/regexp/special-case.h"
#endif // V8_INTL_SUPPORT
#include "src/strings/unicode-inl.h"
#include "src/zone/zone-list-inl.h"
@ -725,32 +728,34 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
unibrow::uchar* letters,
int letter_length) {
#ifdef V8_INTL_SUPPORT
// Special case for U+017F which has upper case in ASCII range.
if (character == 0x017f) {
if (RegExpCaseFolding::IgnoreSet().contains(character)) {
letters[0] = character;
return 1;
}
bool in_special_add_set =
RegExpCaseFolding::SpecialAddSet().contains(character);
icu::UnicodeSet set;
set.add(character);
set = set.closeOver(USET_CASE_INSENSITIVE);
UChar32 canon = 0;
if (in_special_add_set) {
canon = RegExpCaseFolding::Canonicalize(character);
}
int32_t range_count = set.getRangeCount();
int items = 0;
for (int32_t i = 0; i < range_count; i++) {
UChar32 start = set.getRangeStart(i);
UChar32 end = set.getRangeEnd(i);
CHECK(end - start + items <= letter_length);
// Only add to the output if character is not in ASCII range
// or the case equivalent character is in ASCII range.
// #sec-runtime-semantics-canonicalize-ch
// 3.g If the numeric value of ch ≥ 128 and the numeric value of cu < 128,
// return ch.
if (!((start >= 128) && (character < 128))) {
// No range have start and end span across code point 128.
DCHECK((start >= 128) == (end >= 128));
for (UChar32 cu = start; cu <= end; cu++) {
if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
letters[items++] = (unibrow::uchar)(cu);
for (UChar32 cu = start; cu <= end; cu++) {
if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) {
continue;
}
letters[items++] = (unibrow::uchar)(cu);
}
}
return items;

View File

@ -6,70 +6,109 @@
#define V8_REGEXP_SPECIAL_CASE_H_
#ifdef V8_INTL_SUPPORT
#include "unicode/uversion.h"
namespace U_ICU_NAMESPACE {
class UnicodeSet;
} // namespace U_ICU_NAMESPACE
#include "src/base/logging.h"
#include "src/common/globals.h"
#include "unicode/locid.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
namespace v8 {
namespace internal {
// Functions to build special sets of Unicode characters that need special
// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE).
//
// For the characters in the "ignore set", the process should not treat other
// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case
// equivlant under the ECMA262 RegExp "i" mode because these characters are
// uppercase themselves that no other characters in the set uppercase to.
//
// For the characters in the "special add set", the proecess should add only
// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which is
// not uppercase characters as case equivlant under the ECMA262 RegExp "i" mode
// and also that ONE uppercase character that other non uppercase character
// uppercase into to the set. Other uppercase characters in the result of
// closeOver(USET_CASE_INSENSITIVE) should not be considered because ECMA262
// RegExp "i" mode consider two characters as "case equivlant" if both
// characters uppercase to the same character.
//
// For example, consider the following case equivalent set defined by Unicode
// standard. Notice there are more than one uppercase characters in this set:
// U+212B Å Angstrom Sign - an uppercase character.
// U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character.
// U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which
// uppercase to U+00C5.
// In this case equivlant set is a special set and need special handling while
// considering "case equivlant" under the ECMA262 RegExp "i" mode which is
// different than Unicode Standard:
// * U+212B should be included into the "ignore" set because there are no other
// characters, under the ECMA262 "i" mode, are considered as "case equivlant"
// to it because U+212B is itself an uppercase but neither U+00C5 nor U+00E5
// uppercase to U+212B.
// * U+00C5 and U+00E5 will both be included into the "special add" set. While
// calculate the "equivlant set" under ECMA262 "i" mode, the process will
// add U+00E5, because it is not an uppercase character in the set. The
// process will also add U+00C5, because it is the uppercase character which
// other non uppercase character, U+00C5, uppercase into.
//
// For characters not included in "ignore set" and "special add set", the
// process will just use closeOver(USET_CASE_INSENSITIVE) to calcualte, which is
// much faster.
//
// Under Unicode 12.0, there are only 7 characters in the "special add set" and
// 4 characters in "ignore set" so even the special add process is slower, it is
// limited to a small set of cases only.
//
// The implementation of these two function will be generated by calling ICU
// icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by
// the code in src/regexp/gen-regexp-special-case.cc.
//
// These two function will be used with LazyInstance<> template to generate
// global sharable set to reduce memory usage and speed up performance.
// Sets of Unicode characters that need special handling under "i" mode
// Function to build and return the Ignore set.
icu::UnicodeSet BuildIgnoreSet();
// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262
// defines slightly different case-folding rules than Unicode. An
// input character should match a pattern character if the result of
// the Canonicalize algorithm is the same for both characters.
//
// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as
// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character
// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See
// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for
// the precise definition.
//
// While compiling such regular expressions, we need to compute the
// set of characters that should match a given input character. (See
// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)
// For almost all characters, this can be efficiently computed using
// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent
// the remaining special cases.
//
// For a character c, the rules are as follows:
//
// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling
// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet
// containing c will produce the set of characters that should
// match /c/i (or /[c]/i), and only those characters.
//
// 2. If c is in IgnoreSet, then the only character it should match is
// itself. However, closeOver will add additional incorrect
// matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'
// (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is
// "SS". Step 3.e therefore requires that 'ß' canonicalizes to
// itself, and should not match 'ẞ'. In these cases, we can skip
// the closeOver entirely, because it will never add an equivalent
// character.
//
// 3. If c is in SpecialAddSet, then it should match at least one
// character other than itself. However, closeOver will add at
// least one additional incorrect match. For example, consider the
// letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase
// K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN
// SIGN should not match either of the other two characters. As a
// result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in
// IgnoreSet). To find the correct matches for characters in
// SpecialAddSet, we closeOver the original character, but filter
// out the results that do not have the same canonical value.
//
// The contents of these sets are calculated at build time by
// src/regexp/gen-regexp-special-case.cc, which generates
// gen/src/regexp/special-case.cc. This is done by iterating over the
// result of closeOver for each BMP character, and finding sets for
// which at least one character has a different canonical value than
// another character. Characters that match no other characters in
// their equivalence class are added to IgnoreSet. Characters that
// match at least one other character are added to SpecialAddSet.
// Function to build and return the Special Add set.
icu::UnicodeSet BuildSpecialAddSet();
// Helpers for case-folding in non-unicode ignoreCase regexps ("i" without
// "u"): the two special-case sets plus the Canonicalize algorithm that
// decides whether two code units match.
class RegExpCaseFolding final : public AllStatic {
 public:
  // Accessors for the special-case sets. They are only declared here; the
  // definitions are emitted by src/regexp/gen-regexp-special-case.cc.
  static const icu::UnicodeSet& IgnoreSet();
  static const icu::UnicodeSet& SpecialAddSet();

  // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
  // Canonicalize) step 3, which is used to determine whether
  // characters match when ignoreCase is true and unicode is false.
  //
  // |ch| must be a single UTF-16 code unit (<= 0xffff). Returns the
  // canonical code unit for |ch|, which is |ch| itself whenever the
  // upper-cased form is not a single code unit or would map a non-ASCII
  // character into ASCII.
  static UChar32 Canonicalize(UChar32 ch) {
    // a. Assert: ch is a UTF-16 code unit.
    CHECK_LE(ch, 0xffff);

    // b. Let s be the String value consisting of the single code unit ch.
    icu::UnicodeString s(ch);

    // c. Let u be the same result produced as if by performing the algorithm
    //    for String.prototype.toUpperCase using s as the this value.
    // d. Assert: Type(u) is String.
    //
    // String.prototype.toUpperCase is locale-independent, so the root
    // locale is passed explicitly. The argument-less toUpper() overload
    // follows the process default locale, which would make the result (and
    // therefore regexp matching) depend on the environment, e.g. under a
    // Turkish locale 'i' upper-cases to U+0130 rather than 'I'.
    icu::UnicodeString& u = s.toUpper(icu::Locale::getRoot());

    // e. If u does not consist of a single code unit, return ch.
    if (u.length() != 1) {
      return ch;
    }

    // f. Let cu be u's single code unit element.
    UChar32 cu = u.char32At(0);

    // g. If the value of ch >= 128 and the value of cu < 128, return ch.
    if (ch >= 128 && cu < 128) {
      return ch;
    }

    // h. Return cu.
    return cu;
  }
};
} // namespace internal
} // namespace v8

View File

@ -0,0 +1,70 @@
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Reference implementation of step 3 of the non-unicode Canonicalize
// algorithm. See
// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch
//
// |ch| is a string holding a single UTF-16 code unit. The result is always
// the canonical code unit as a number, so that two results can be compared
// with === regardless of which branch produced them.
function Canonicalize(ch) {
  var code = ch.charCodeAt(0);
  var u = ch.toUpperCase();
  // 3.e: if the upper-cased form is not a single code unit, ch is canonical.
  if (u.length !== 1) return code;
  var cu = u.charCodeAt(0);
  // 3.g: a non-ASCII character never canonicalizes into the ASCII range.
  if (code >= 128 && cu < 128) return code;
  return cu;
}
// Checks every ordered pair of distinct code units in |eclass| (a string
// holding one closeOver equivalence class). The first unit of the pair is
// used as the pattern, both as a bare atom and inside a character class, and
// the second as the subject. The two must match under the 'i' flag exactly
// when their Canonicalize values are identical.
function TestEquivalenceClass(eclass) {
  var size = eclass.length;
  for (var p = 0; p < size; p++) {
    var pattern = eclass.charAt(p);
    for (var s = 0; s < size; s++) {
      if (s == p) continue;
      var subject = eclass.charAt(s);
      var expected = Canonicalize(pattern) === Canonicalize(subject);

      var atom = new RegExp(pattern, 'i');
      var char_class = new RegExp('[' + pattern + ']', 'i');

      assertEquals(atom.test(subject), expected);
      assertEquals(char_class.test(subject), expected);
    }
  }
}
// Runs TestEquivalenceClass on each entry of equivalence_classes, in order.
function TestAll() {
  for (var index = 0; index < equivalence_classes.length; index++) {
    TestEquivalenceClass(equivalence_classes[index]);
  }
}
// Interesting case-folding equivalence classes (as determined by
// ICU's UnicodeSet::closeOver). A class is interesting if it contains
// more than two characters, or if it contains any characters in
// IgnoreSet or SpecialAddSet as defined in src/regexp/special-case.h.
// Each entry is one string whose code units are the members of one class;
// the trailing comment shows the same characters literally.
var equivalence_classes = [
'\u0041\u0061', // Aa (sanity check)
'\u004b\u006b\u212a', // KkK (the third character is U+212A KELVIN SIGN)
'\u0053\u0073\u017f', // Ssſ
'\u00b5\u039c\u03bc', // µΜμ
'\u00c5\u00e5\u212b', // ÅåÅ (the third character is U+212B ANGSTROM SIGN)
'\u00df\u1e9e', // ßẞ
'\u03a9\u03c9\u2126', // ΩωΩ
'\u0390\u1fd3', // ΐΐ
'\u0398\u03b8\u03d1\u03f4', // Θθϑϴ
'\u03b0\u1fe3', // ΰΰ
'\u1f80\u1f88', // ᾀᾈ
'\u1fb3\u1fbc', // ᾳᾼ
'\u1fc3\u1fcc', // ῃῌ
'\u1ff3\u1ffc', // ῳῼ
'\ufb05\ufb06', // ﬅﬆ (the U+FB05 and U+FB06 ligatures)
// Everything below this line is a well-behaved case-folding
// equivalence class with more than two characters but only one
// canonical case-folded character
'\u01c4\u01c5\u01c6', '\u01c7\u01c8\u01c9', '\u01ca\u01cb\u01cc',
'\u01f1\u01f2\u01f3', '\u0345\u0399\u03b9\u1fbe', '\u0392\u03b2\u03d0',
'\u0395\u03b5\u03f5', '\u039a\u03ba\u03f0', '\u03a0\u03c0\u03d6',
'\u03a1\u03c1\u03f1', '\u03a3\u03c2\u03c3', '\u03a6\u03c6\u03d5',
'\u0412\u0432\u1c80', '\u0414\u0434\u1c81', '\u041e\u043e\u1c82',
'\u0421\u0441\u1c83', '\u0422\u0442\u1c84\u1c85', '\u042a\u044a\u1c86',
'\u0462\u0463\u1c87', '\u1c88\ua64a\ua64b', '\u1e60\u1e61\u1e9b'
];
TestAll();