Reland "Speed up CharacterRange::AddCaseEquivalents"
This is a reland of f23f644fb3
Fix the issue by wrapping v8_executable("gen-regexp-special-case")
inside if (current_toolchain == v8_generator_toolchain) {
and changing the deps of action("run_gen-regexp-special-case")
to ":gen-regexp-special-case($v8_generator_toolchain)".
Original change's description:
> Speed up CharacterRange::AddCaseEquivalents
>
> Performance is measured using lexCss("color:").
> This change makes lexCss("color:")
> x21 - x40 times faster than trunk.
> x2.3 - x4.6 times faster than m74.
>
> Design Doc: http://shorturl.at/adfO5
>
> Measured by out/x64.release/d8 reg977003.js
> see reg977003.js attached to chromium:977003
>
> Also see another cl of benchmark in
> https://chromium-review.googlesource.com/c/v8/v8/+/1679651/
>
>
> Bug: chromium:977003
> Change-Id: Ie8518493d2c33df1594be1b4576bda715087b421
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1674851
> Commit-Queue: Frank Tang <ftang@chromium.org>
> Reviewed-by: Yang Guo <yangguo@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#62471}
Bug: chromium:977003
Change-Id: Ie690810f596e9551b5765f422665c9617391bcf8
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1683706
Reviewed-by: Frank Tang <ftang@chromium.org>
Reviewed-by: Yang Guo <yangguo@chromium.org>
Commit-Queue: Frank Tang <ftang@chromium.org>
Cr-Commit-Position: refs/heads/master@{#62486}
This commit is contained in:
parent
aaf94026c0
commit
433403dc9b
47
BUILD.gn
47
BUILD.gn
@ -2708,6 +2708,7 @@ v8_source_set("v8_base_without_compiler") {
|
|||||||
"src/regexp/regexp-nodes.h",
|
"src/regexp/regexp-nodes.h",
|
||||||
"src/regexp/regexp-parser.cc",
|
"src/regexp/regexp-parser.cc",
|
||||||
"src/regexp/regexp-parser.h",
|
"src/regexp/regexp-parser.h",
|
||||||
|
"src/regexp/regexp-special-case.h",
|
||||||
"src/regexp/regexp-stack.cc",
|
"src/regexp/regexp-stack.cc",
|
||||||
"src/regexp/regexp-stack.h",
|
"src/regexp/regexp-stack.h",
|
||||||
"src/regexp/regexp-utils.cc",
|
"src/regexp/regexp-utils.cc",
|
||||||
@ -3240,6 +3241,8 @@ v8_source_set("v8_base_without_compiler") {
|
|||||||
]
|
]
|
||||||
|
|
||||||
if (v8_enable_i18n_support) {
|
if (v8_enable_i18n_support) {
|
||||||
|
deps += [ ":run_gen-regexp-special-case" ]
|
||||||
|
sources += [ "$target_gen_dir/src/regexp/special-case.cc" ]
|
||||||
if (is_win) {
|
if (is_win) {
|
||||||
deps += [ "//third_party/icu:icudata" ]
|
deps += [ "//third_party/icu:icudata" ]
|
||||||
}
|
}
|
||||||
@ -3907,6 +3910,50 @@ v8_executable("torque-language-server") {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Executable that generates src/regexp/special-case.cc. It is only defined
# while GN evaluates this file for $v8_generator_toolchain; other targets refer
# to it with an explicit ":gen-regexp-special-case($v8_generator_toolchain)"
# label.
if (current_toolchain == v8_generator_toolchain) {
  v8_executable("gen-regexp-special-case") {
    # Only targets in this file can depend on this.
    visibility = [ ":*" ]

    configs = [ ":internal_config" ]

    sources = [ "src/regexp/gen-regexp-special-case.cc" ]

    deps = [
      ":v8_libbase",
      "//build/win:default_exe_manifest",
      "//third_party/icu",
    ]
  }
}
|
||||||
|
|
||||||
|
# Runs the gen-regexp-special-case executable (built in
# $v8_generator_toolchain) through tools/run.py in order to produce
# $target_gen_dir/src/regexp/special-case.cc.
action("run_gen-regexp-special-case") {
  visibility = [ ":*" ]  # Only targets in this file can depend on this.

  script = "tools/run.py"

  # The generator receives only the output path on its command line and reads
  # no source files, so no `sources` are declared for this action.
  deps = [ ":gen-regexp-special-case($v8_generator_toolchain)" ]

  output_file = "$target_gen_dir/src/regexp/special-case.cc"

  outputs = [ output_file ]

  args = [
    # Path of the generator binary, relative to the build directory.
    "./" + rebase_path(
            get_label_info(":gen-regexp-special-case($v8_generator_toolchain)",
                           "root_out_dir") + "/gen-regexp-special-case",
            root_build_dir),

    # Sole argument of the generator: the file to write.
    rebase_path(output_file, root_build_dir),
  ]
}
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
# Public targets
|
# Public targets
|
||||||
#
|
#
|
||||||
|
125
src/regexp/gen-regexp-special-case.cc
Normal file
125
src/regexp/gen-regexp-special-case.cc
Normal file
@ -0,0 +1,125 @@
|
|||||||
|
// Copyright 2019 the V8 project authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
|
// found in the LICENSE file.
|
||||||
|
|
||||||
|
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
|
||||||
|
|
||||||
|
#include "src/base/logging.h"
|
||||||
|
#include "unicode/uchar.h"
|
||||||
|
#include "unicode/uniset.h"
|
||||||
|
|
||||||
|
namespace v8 {
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
// The following code generates BuildSpecialAddSet() and BuildIgnoreSet()
|
||||||
|
// functions into "src/regexp/special-case.cc".
|
||||||
|
// See more details in http://shorturl.at/adfO5
|
||||||
|
void PrintSet(std::ofstream& out, const char* func_name,
|
||||||
|
const icu::UnicodeSet& set) {
|
||||||
|
out << "icu::UnicodeSet " << func_name << "() {\n"
|
||||||
|
<< " icu::UnicodeSet set;\n";
|
||||||
|
for (int32_t i = 0; i < set.getRangeCount(); i++) {
|
||||||
|
if (set.getRangeStart(i) == set.getRangeEnd(i)) {
|
||||||
|
out << " set.add(0x" << set.getRangeStart(i) << ");\n";
|
||||||
|
} else {
|
||||||
|
out << " set.add(0x" << set.getRangeStart(i) << ", 0x"
|
||||||
|
<< set.getRangeEnd(i) << ");\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out << " set.freeze();\n"
|
||||||
|
<< " return set;\n"
|
||||||
|
<< "}\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
void PrintSpecial(std::ofstream& out) {
|
||||||
|
icu::UnicodeSet current;
|
||||||
|
icu::UnicodeSet processed(0xd800, 0xdbff); // Ignore surrogate range.
|
||||||
|
icu::UnicodeSet special_add;
|
||||||
|
icu::UnicodeSet ignore;
|
||||||
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
|
icu::UnicodeSet upper("[\\p{Lu}]", status);
|
||||||
|
CHECK(U_SUCCESS(status));
|
||||||
|
// Iterate through all chars in BMP except ASCII and Surrogate.
|
||||||
|
for (UChar32 i = 0x80; i < 0x010000; i++) {
|
||||||
|
// Ignore those characters which is already processed.
|
||||||
|
if (!processed.contains(i)) {
|
||||||
|
current.set(i, i);
|
||||||
|
current.closeOver(USET_CASE_INSENSITIVE);
|
||||||
|
|
||||||
|
// Remember we already processed current.
|
||||||
|
processed.addAll(current);
|
||||||
|
|
||||||
|
// All uppercase characters in current.
|
||||||
|
icu::UnicodeSet keep_upper(current);
|
||||||
|
keep_upper.retainAll(upper);
|
||||||
|
|
||||||
|
// Check if we have more than one uppercase character in current.
|
||||||
|
// If there are more than one uppercase character, then it is a special
|
||||||
|
// set which need to be added into either "Special Add" set or "Ignore"
|
||||||
|
// set.
|
||||||
|
int32_t number_of_upper = 0;
|
||||||
|
for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) {
|
||||||
|
number_of_upper +=
|
||||||
|
keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1;
|
||||||
|
}
|
||||||
|
if (number_of_upper > 1) {
|
||||||
|
// Add all non uppercase characters (could be Ll or Mn) to special add
|
||||||
|
// set.
|
||||||
|
current.removeAll(upper);
|
||||||
|
special_add.addAll(current);
|
||||||
|
|
||||||
|
// Add the uppercase characters of non uppercase character to
|
||||||
|
// special add set.
|
||||||
|
CHECK_GT(current.getRangeCount(), 0);
|
||||||
|
UChar32 main_upper = u_toupper(current.getRangeStart(0));
|
||||||
|
special_add.add(main_upper);
|
||||||
|
|
||||||
|
// Add all uppercase except the main upper to ignore set.
|
||||||
|
keep_upper.remove(main_upper);
|
||||||
|
ignore.addAll(keep_upper);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove any ASCII
|
||||||
|
special_add.remove(0x0000, 0x007f);
|
||||||
|
PrintSet(out, "BuildIgnoreSet", ignore);
|
||||||
|
PrintSet(out, "BuildSpecialAddSet", special_add);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generates the file |header_filename|: a fixed preamble, the two set-building
// functions produced by PrintSpecial(), and the closing namespace / #endif
// lines.
//
// The process exits with status 1 if the file cannot be opened or if writing
// it fails, so that a broken output is never reported as success.
void WriteHeader(const char* header_filename) {
  std::ofstream out(header_filename);
  if (!out.is_open()) {
    std::cerr << "Cannot open " << header_filename << " for writing\n";
    std::exit(1);
  }

  // std::hex and the fill character stay in effect for later insertions, so
  // integers streamed afterwards are rendered in hexadecimal. std::setw only
  // applies to the next insertion.
  out << std::hex << std::setfill('0') << std::setw(4);

  out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n"
      << "// The following functions are used to build icu::UnicodeSet\n"
      << "// for specical cases different between Unicode and ECMA262.\n"
      << "#ifdef V8_INTL_SUPPORT\n"
      << "#include \"src/regexp/special-case.h\"\n\n"
      << "#include \"unicode/uniset.h\"\n"
      << "namespace v8 {\n"
      << "namespace internal {\n\n";

  PrintSpecial(out);

  out << "\n"
      << "} // namespace internal\n"
      << "} // namespace v8\n"
      << "#endif // V8_INTL_SUPPORT\n";

  // Flush explicitly so that a late write error is detected here.
  out.close();
  if (out.fail()) {
    std::cerr << "Failed to write " << header_filename << "\n";
    std::exit(1);
  }
}
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
} // namespace v8
|
||||||
|
|
||||||
|
// Entry point of the generator. Expects exactly one command line argument:
// the path of the file to generate. Prints a usage message and fails with
// status 1 otherwise.
int main(int argc, const char** argv) {
  const bool has_output_path = (argc == 2);
  if (!has_output_path) {
    std::cerr << "Usage: " << argv[0] << " <output filename>\n";
    return 1;
  }

  const char* output_path = argv[1];
  v8::internal::WriteHeader(output_path);
  return 0;
}
|
@ -6,6 +6,9 @@
|
|||||||
|
|
||||||
#include "src/execution/isolate.h"
|
#include "src/execution/isolate.h"
|
||||||
#include "src/regexp/regexp.h"
|
#include "src/regexp/regexp.h"
|
||||||
|
#ifdef V8_INTL_SUPPORT
|
||||||
|
#include "src/regexp/special-case.h"
|
||||||
|
#endif // V8_INTL_SUPPORT
|
||||||
#include "src/strings/unicode-inl.h"
|
#include "src/strings/unicode-inl.h"
|
||||||
#include "src/zone/zone-list-inl.h"
|
#include "src/zone/zone-list-inl.h"
|
||||||
|
|
||||||
@ -1137,6 +1140,39 @@ Vector<const int> CharacterRange::GetWordBounds() {
|
|||||||
return Vector<const int>(kWordRanges, kWordRangeCount - 1);
|
return Vector<const int>(kWordRanges, kWordRangeCount - 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef V8_INTL_SUPPORT
|
||||||
|
struct IgnoreSet {
|
||||||
|
IgnoreSet() : set(BuildIgnoreSet()) {}
|
||||||
|
const icu::UnicodeSet set;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct SpecialAddSet {
|
||||||
|
SpecialAddSet() : set(BuildSpecialAddSet()) {}
|
||||||
|
const icu::UnicodeSet set;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Returns a frozen set holding exactly the 52 ASCII letters [A-Za-z].
icu::UnicodeSet BuildAsciiAToZSet() {
  icu::UnicodeSet letters;
  letters.add('A', 'Z').add('a', 'z');
  letters.freeze();
  return letters;
}
|
||||||
|
|
||||||
|
struct AsciiAToZSet {
|
||||||
|
AsciiAToZSet() : set(BuildAsciiAToZSet()) {}
|
||||||
|
const icu::UnicodeSet set;
|
||||||
|
};
|
||||||
|
|
||||||
|
// File-local base::LazyInstance wrappers around the three holder structs.
// CharacterRange::AddCaseEquivalents reads the contained sets through
// Pointer()->set.
static base::LazyInstance<IgnoreSet>::type ignore_set =
|
||||||
|
LAZY_INSTANCE_INITIALIZER;
|
||||||
|
|
||||||
|
static base::LazyInstance<SpecialAddSet>::type special_add_set =
|
||||||
|
LAZY_INSTANCE_INITIALIZER;
|
||||||
|
|
||||||
|
static base::LazyInstance<AsciiAToZSet>::type ascii_a_to_z_set =
|
||||||
|
LAZY_INSTANCE_INITIALIZER;
|
||||||
|
#endif // V8_INTL_SUPPORT
|
||||||
|
|
||||||
// static
|
// static
|
||||||
void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
|
void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
|
||||||
ZoneList<CharacterRange>* ranges,
|
ZoneList<CharacterRange>* ranges,
|
||||||
@ -1144,58 +1180,100 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
|
|||||||
CharacterRange::Canonicalize(ranges);
|
CharacterRange::Canonicalize(ranges);
|
||||||
int range_count = ranges->length();
|
int range_count = ranges->length();
|
||||||
#ifdef V8_INTL_SUPPORT
|
#ifdef V8_INTL_SUPPORT
|
||||||
icu::UnicodeSet already_added;
|
|
||||||
icu::UnicodeSet others;
|
icu::UnicodeSet others;
|
||||||
for (int i = 0; i < range_count; i++) {
|
for (int i = 0; i < range_count; i++) {
|
||||||
CharacterRange range = ranges->at(i);
|
CharacterRange range = ranges->at(i);
|
||||||
uc32 bottom = range.from();
|
uc32 from = range.from();
|
||||||
if (bottom > String::kMaxUtf16CodeUnit) continue;
|
if (from > String::kMaxUtf16CodeUnit) continue;
|
||||||
uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
|
uc32 to = Min(range.to(), String::kMaxUtf16CodeUnit);
|
||||||
// Nothing to be done for surrogates.
|
// Nothing to be done for surrogates.
|
||||||
if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
|
if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
|
||||||
if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
|
if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
|
||||||
if (bottom > String::kMaxOneByteCharCode) continue;
|
if (from > String::kMaxOneByteCharCode) continue;
|
||||||
if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
|
if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode;
|
||||||
}
|
}
|
||||||
already_added.add(bottom, top);
|
others.add(from, to);
|
||||||
icu::Locale locale = icu::Locale::getRoot();
|
}
|
||||||
while (bottom <= top) {
|
|
||||||
icu::UnicodeString upper(bottom);
|
// Set of characters already added to ranges that do not need to be added
|
||||||
upper.toUpper(locale);
|
// again.
|
||||||
icu::UnicodeSet expanded(bottom, bottom);
|
icu::UnicodeSet already_added(others);
|
||||||
expanded.closeOver(USET_CASE_INSENSITIVE);
|
|
||||||
for (int32_t i = 0; i < expanded.getRangeCount(); i++) {
|
// Set of characters in ranges that are in the 52 ASCII characters [a-zA-Z].
|
||||||
UChar32 start = expanded.getRangeStart(i);
|
icu::UnicodeSet in_ascii_a_to_z(others);
|
||||||
UChar32 end = expanded.getRangeEnd(i);
|
in_ascii_a_to_z.retainAll(ascii_a_to_z_set.Pointer()->set);
|
||||||
while (start <= end) {
|
|
||||||
icu::UnicodeString upper2(start);
|
// Remove all chars in [a-zA-Z] from others.
|
||||||
upper2.toUpper(locale);
|
others.removeAll(in_ascii_a_to_z);
|
||||||
// Only add if the upper case are the same.
|
|
||||||
if (upper[0] == upper2[0]) {
|
// Set of characters in ranges that are overlapping with special add set.
|
||||||
// #sec-runtime-semantics-canonicalize-ch
|
icu::UnicodeSet in_special_add(others);
|
||||||
// 3.g. If the numeric value of ch ≥ 128 and the numeric value of
|
in_special_add.retainAll(special_add_set.Pointer()->set);
|
||||||
// cu < 128, return ch.
|
|
||||||
if (bottom >= 128 && start < 128) {
|
others.removeAll(in_special_add);
|
||||||
others.add(bottom);
|
|
||||||
} else {
|
// Ignore all chars in ignore set.
|
||||||
// 3.h. 3.h. 3.h. Return cu.
|
others.removeAll(ignore_set.Pointer()->set);
|
||||||
others.add(start);
|
|
||||||
}
|
// For most of the chars in ranges that is still in others, find the case
|
||||||
}
|
// equivalent set by calling closeOver(USET_CASE_INSENSITIVE).
|
||||||
start++;
|
others.closeOver(USET_CASE_INSENSITIVE);
|
||||||
}
|
|
||||||
}
|
// Because closeOver(USET_CASE_INSENSITIVE) may add ASCII [a-zA-Z] to others,
|
||||||
bottom++;
|
// but ECMA262 "i" mode won't consider that, remove them from others.
|
||||||
|
// Ex: U+017F add 'S' and 's' to others.
|
||||||
|
others.removeAll(ascii_a_to_z_set.Pointer()->set);
|
||||||
|
|
||||||
|
// Special handling for in_ascii_a_to_z.
|
||||||
|
for (int32_t i = 0; i < in_ascii_a_to_z.getRangeCount(); i++) {
|
||||||
|
UChar32 start = in_ascii_a_to_z.getRangeStart(i);
|
||||||
|
UChar32 end = in_ascii_a_to_z.getRangeEnd(i);
|
||||||
|
// Bit 0x20 is set for lowercase a-z and clear for uppercase A-Z.
|
||||||
|
if (start & 0x0020) {
|
||||||
|
// The range is lowercase: add the corresponding uppercase letters.
|
||||||
|
others.add(start & 0x005F, end & 0x005F);
|
||||||
|
} else {
|
||||||
|
// The range is uppercase: add the corresponding lowercase letters.
|
||||||
|
others.add(start | 0x0020, end | 0x0020);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Special handling for chars in "Special Add" set.
|
||||||
|
for (int32_t i = 0; i < in_special_add.getRangeCount(); i++) {
|
||||||
|
UChar32 end = in_special_add.getRangeEnd(i);
|
||||||
|
for (UChar32 ch = in_special_add.getRangeStart(i); ch <= end; ch++) {
|
||||||
|
// Add the uppercase of this character if itself is not an uppercase
|
||||||
|
// character.
|
||||||
|
// Note: The if condition cannot be u_islower(ch) because ch could be
|
||||||
|
// neither uppercase nor lowercase but Mn.
|
||||||
|
if (!u_isupper(ch)) {
|
||||||
|
others.add(u_toupper(ch));
|
||||||
|
}
|
||||||
|
icu::UnicodeSet candidates(ch, ch);
|
||||||
|
candidates.closeOver(USET_CASE_INSENSITIVE);
|
||||||
|
for (int32_t j = 0; j < candidates.getRangeCount(); j++) {
|
||||||
|
UChar32 end2 = candidates.getRangeEnd(j);
|
||||||
|
for (UChar32 ch2 = candidates.getRangeStart(j); ch2 <= end2; ch2++) {
|
||||||
|
// Add character that is not uppercase to others.
|
||||||
|
if (!u_isupper(ch2)) {
|
||||||
|
others.add(ch2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove all characters which already in the ranges.
|
||||||
others.removeAll(already_added);
|
others.removeAll(already_added);
|
||||||
|
|
||||||
|
// Add others to the ranges
|
||||||
for (int32_t i = 0; i < others.getRangeCount(); i++) {
|
for (int32_t i = 0; i < others.getRangeCount(); i++) {
|
||||||
UChar32 start = others.getRangeStart(i);
|
UChar32 from = others.getRangeStart(i);
|
||||||
UChar32 end = others.getRangeEnd(i);
|
UChar32 to = others.getRangeEnd(i);
|
||||||
if (start == end) {
|
if (from == to) {
|
||||||
ranges->Add(CharacterRange::Singleton(start), zone);
|
ranges->Add(CharacterRange::Singleton(from), zone);
|
||||||
} else {
|
} else {
|
||||||
ranges->Add(CharacterRange::Range(start, end), zone);
|
ranges->Add(CharacterRange::Range(from, to), zone);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
79
src/regexp/special-case.h
Normal file
79
src/regexp/special-case.h
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
// Copyright 2019 the V8 project authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
|
// found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef V8_REGEXP_SPECIAL_CASE_H_
|
||||||
|
#define V8_REGEXP_SPECIAL_CASE_H_
|
||||||
|
|
||||||
|
#ifdef V8_INTL_SUPPORT
|
||||||
|
#include "unicode/uversion.h"
|
||||||
|
namespace U_ICU_NAMESPACE {
|
||||||
|
class UnicodeSet;
|
||||||
|
} // namespace U_ICU_NAMESPACE
|
||||||
|
|
||||||
|
namespace v8 {
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
// Functions to build special sets of Unicode characters that need special
|
||||||
|
// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE).
|
||||||
|
//
|
||||||
|
// For the characters in the "ignore set", the process should not treat other
|
||||||
|
// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case
|
||||||
|
// equivalent under the ECMA262 RegExp "i" mode because these characters are
|
||||||
|
// uppercase themselves that no other characters in the set uppercase to.
|
||||||
|
//
|
||||||
|
// For the characters in the "special add set", the process should add only
|
||||||
|
// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which is
|
||||||
|
// not uppercase characters as case equivalent under the ECMA262 RegExp "i" mode
|
||||||
|
// and also that ONE uppercase character that other non uppercase character
|
||||||
|
// uppercase into to the set. Other uppercase characters in the result of
|
||||||
|
// closeOver(USET_CASE_INSENSITIVE) should not be considered because ECMA262
|
||||||
|
// RegExp "i" mode considers two characters as "case equivalent" if both
|
||||||
|
// characters uppercase to the same character.
|
||||||
|
//
|
||||||
|
// For example, consider the following case equivalent set defined by Unicode
|
||||||
|
// standard. Notice there are more than one uppercase characters in this set:
|
||||||
|
// U+212B Å Angstrom Sign - an uppercase character.
|
||||||
|
// U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character.
|
||||||
|
// U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which
|
||||||
|
// uppercase to U+00C5.
|
||||||
|
// This case equivalent set is a special set and needs special handling while
|
||||||
|
// considering "case equivalent" under the ECMA262 RegExp "i" mode which is
|
||||||
|
// different than Unicode Standard:
|
||||||
|
// * U+212B should be included into the "ignore" set because there are no other
|
||||||
|
// characters, under the ECMA262 "i" mode, are considered as "case equivalent"
|
||||||
|
// to it because U+212B is itself an uppercase but neither U+00C5 nor U+00E5
|
||||||
|
// uppercase to U+212B.
|
||||||
|
// * U+00C5 and U+00E5 will both be included into the "special add" set. While
|
||||||
|
// calculating the "equivalent set" under ECMA262 "i" mode, the process will
|
||||||
|
// add U+00E5, because it is not an uppercase character in the set. The
|
||||||
|
// process will also add U+00C5, because it is the uppercase character which
|
||||||
|
// the other non-uppercase character, U+00E5, uppercases into.
|
||||||
|
//
|
||||||
|
// For characters not included in "ignore set" and "special add set", the
|
||||||
|
// process will just use closeOver(USET_CASE_INSENSITIVE) to calculate, which is
|
||||||
|
// much faster.
|
||||||
|
//
|
||||||
|
// Under Unicode 12.0, there are only 7 characters in the "special add set" and
|
||||||
|
// 4 characters in "ignore set" so even the special add process is slower, it is
|
||||||
|
// limited to a small set of cases only.
|
||||||
|
//
|
||||||
|
// The implementation of these two functions will be generated by calling ICU
|
||||||
|
// icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by
|
||||||
|
// the code in src/regexp/gen-regexp-special-case.cc.
|
||||||
|
//
|
||||||
|
// These two functions will be used with LazyInstance<> template to generate
|
||||||
|
// global sharable set to reduce memory usage and speed up performance.
|
||||||
|
|
||||||
|
// Function to build and return the Ignore set.
|
||||||
|
icu::UnicodeSet BuildIgnoreSet();
|
||||||
|
|
||||||
|
// Function to build and return the Special Add set.
|
||||||
|
icu::UnicodeSet BuildSpecialAddSet();
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
} // namespace v8
|
||||||
|
|
||||||
|
#endif // V8_INTL_SUPPORT
|
||||||
|
|
||||||
|
#endif // V8_REGEXP_SPECIAL_CASE_H_
|
Loading…
Reference in New Issue
Block a user