Reland "Speed up CharacterRange::AddCaseEquivalents"
This is a reland of f23f644fb3
Fix the issue by wrapping v8_executable("gen-regexp-special-case")
inside if (current_toolchain == v8_generator_toolchain) { ... }
and changing the deps of action("run_gen-regexp-special-case")
to ":gen-regexp-special-case($v8_generator_toolchain)".
Original change's description:
> Speed up CharacterRange::AddCaseEquivalents
>
> Performance was measured using lexCss("color:").
> This change makes lexCss("color:")
> 21x - 40x faster than trunk and
> 2.3x - 4.6x faster than m74.
>
> Design Doc: http://shorturl.at/adfO5
>
> Measured by out/x64.release/d8 reg977003.js
> see reg977003.js attached to chromium:977003
>
> Also see another cl of benchmark in
> https://chromium-review.googlesource.com/c/v8/v8/+/1679651/
>
>
> Bug: chromium:977003
> Change-Id: Ie8518493d2c33df1594be1b4576bda715087b421
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1674851
> Commit-Queue: Frank Tang <ftang@chromium.org>
> Reviewed-by: Yang Guo <yangguo@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#62471}
Bug: chromium:977003
Change-Id: Ie690810f596e9551b5765f422665c9617391bcf8
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1683706
Reviewed-by: Frank Tang <ftang@chromium.org>
Reviewed-by: Yang Guo <yangguo@chromium.org>
Commit-Queue: Frank Tang <ftang@chromium.org>
Cr-Commit-Position: refs/heads/master@{#62486}
This commit is contained in:
parent
aaf94026c0
commit
433403dc9b
47
BUILD.gn
47
BUILD.gn
@ -2708,6 +2708,7 @@ v8_source_set("v8_base_without_compiler") {
|
||||
"src/regexp/regexp-nodes.h",
|
||||
"src/regexp/regexp-parser.cc",
|
||||
"src/regexp/regexp-parser.h",
|
||||
"src/regexp/regexp-special-case.h",
|
||||
"src/regexp/regexp-stack.cc",
|
||||
"src/regexp/regexp-stack.h",
|
||||
"src/regexp/regexp-utils.cc",
|
||||
@ -3240,6 +3241,8 @@ v8_source_set("v8_base_without_compiler") {
|
||||
]
|
||||
|
||||
if (v8_enable_i18n_support) {
|
||||
deps += [ ":run_gen-regexp-special-case" ]
|
||||
sources += [ "$target_gen_dir/src/regexp/special-case.cc" ]
|
||||
if (is_win) {
|
||||
deps += [ "//third_party/icu:icudata" ]
|
||||
}
|
||||
@ -3907,6 +3910,50 @@ v8_executable("torque-language-server") {
|
||||
}
|
||||
}
|
||||
|
||||
# The generator executable is only declared while evaluating the generator
# toolchain; other toolchains refer to it with an explicit
# "($v8_generator_toolchain)" label suffix.
if (current_toolchain == v8_generator_toolchain) {
  v8_executable("gen-regexp-special-case") {
    visibility = [ ":*" ]  # Only targets in this file can depend on this.

    configs = [ ":internal_config" ]

    sources = [ "src/regexp/gen-regexp-special-case.cc" ]

    deps = [
      ":v8_libbase",
      "//build/win:default_exe_manifest",
      "//third_party/icu",
    ]
  }
}
|
||||
|
||||
# Runs the gen-regexp-special-case executable (built in the generator
# toolchain) through tools/run.py to produce special-case.cc under
# $target_gen_dir.
action("run_gen-regexp-special-case") {
  visibility = [ ":*" ]  # Only targets in this file can depend on this.

  script = "tools/run.py"

  # Single spelling of the generator label, shared by deps and args.
  generator_label = ":gen-regexp-special-case($v8_generator_toolchain)"

  # No `sources`: the command line below passes only the generator binary and
  # the output path, so the action has no file inputs besides the generator
  # itself, which is tracked through deps.
  deps = [ generator_label ]

  output_file = "$target_gen_dir/src/regexp/special-case.cc"

  outputs = [ output_file ]

  args = [
    # Path of the generator binary relative to the build directory.
    "./" + rebase_path(
            get_label_info(generator_label, "root_out_dir") +
                "/gen-regexp-special-case",
            root_build_dir),

    # The generator's single argument: the file to write.
    rebase_path(output_file, root_build_dir),
  ]
}
|
||||
|
||||
###############################################################################
|
||||
# Public targets
|
||||
#
|
||||
|
125
src/regexp/gen-regexp-special-case.cc
Normal file
125
src/regexp/gen-regexp-special-case.cc
Normal file
@ -0,0 +1,125 @@
|
||||
// Copyright 2019 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#include "src/base/logging.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// The following code generates BuildSpecialAddSet() and BuildIgnoreSet()
|
||||
// functions into "src/regexp/special-case.cc".
|
||||
// See more details in http://shorturl.at/adfO5
|
||||
void PrintSet(std::ofstream& out, const char* func_name,
|
||||
const icu::UnicodeSet& set) {
|
||||
out << "icu::UnicodeSet " << func_name << "() {\n"
|
||||
<< " icu::UnicodeSet set;\n";
|
||||
for (int32_t i = 0; i < set.getRangeCount(); i++) {
|
||||
if (set.getRangeStart(i) == set.getRangeEnd(i)) {
|
||||
out << " set.add(0x" << set.getRangeStart(i) << ");\n";
|
||||
} else {
|
||||
out << " set.add(0x" << set.getRangeStart(i) << ", 0x"
|
||||
<< set.getRangeEnd(i) << ");\n";
|
||||
}
|
||||
}
|
||||
out << " set.freeze();\n"
|
||||
<< " return set;\n"
|
||||
<< "}\n";
|
||||
}
|
||||
|
||||
void PrintSpecial(std::ofstream& out) {
|
||||
icu::UnicodeSet current;
|
||||
icu::UnicodeSet processed(0xd800, 0xdbff); // Ignore surrogate range.
|
||||
icu::UnicodeSet special_add;
|
||||
icu::UnicodeSet ignore;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
icu::UnicodeSet upper("[\\p{Lu}]", status);
|
||||
CHECK(U_SUCCESS(status));
|
||||
// Iterate through all chars in BMP except ASCII and Surrogate.
|
||||
for (UChar32 i = 0x80; i < 0x010000; i++) {
|
||||
// Ignore those characters which is already processed.
|
||||
if (!processed.contains(i)) {
|
||||
current.set(i, i);
|
||||
current.closeOver(USET_CASE_INSENSITIVE);
|
||||
|
||||
// Remember we already processed current.
|
||||
processed.addAll(current);
|
||||
|
||||
// All uppercase characters in current.
|
||||
icu::UnicodeSet keep_upper(current);
|
||||
keep_upper.retainAll(upper);
|
||||
|
||||
// Check if we have more than one uppercase character in current.
|
||||
// If there are more than one uppercase character, then it is a special
|
||||
// set which need to be added into either "Special Add" set or "Ignore"
|
||||
// set.
|
||||
int32_t number_of_upper = 0;
|
||||
for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) {
|
||||
number_of_upper +=
|
||||
keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1;
|
||||
}
|
||||
if (number_of_upper > 1) {
|
||||
// Add all non uppercase characters (could be Ll or Mn) to special add
|
||||
// set.
|
||||
current.removeAll(upper);
|
||||
special_add.addAll(current);
|
||||
|
||||
// Add the uppercase characters of non uppercase character to
|
||||
// special add set.
|
||||
CHECK_GT(current.getRangeCount(), 0);
|
||||
UChar32 main_upper = u_toupper(current.getRangeStart(0));
|
||||
special_add.add(main_upper);
|
||||
|
||||
// Add all uppercase except the main upper to ignore set.
|
||||
keep_upper.remove(main_upper);
|
||||
ignore.addAll(keep_upper);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Remove any ASCII
|
||||
special_add.remove(0x0000, 0x007f);
|
||||
PrintSet(out, "BuildIgnoreSet", ignore);
|
||||
PrintSet(out, "BuildSpecialAddSet", special_add);
|
||||
}
|
||||
|
||||
void WriteHeader(const char* header_filename) {
|
||||
std::ofstream out(header_filename);
|
||||
out << std::hex << std::setfill('0') << std::setw(4);
|
||||
|
||||
out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n"
|
||||
<< "// The following functions are used to build icu::UnicodeSet\n"
|
||||
<< "// for specical cases different between Unicode and ECMA262.\n"
|
||||
<< "#ifdef V8_INTL_SUPPORT\n"
|
||||
<< "#include \"src/regexp/special-case.h\"\n\n"
|
||||
<< "#include \"unicode/uniset.h\"\n"
|
||||
<< "namespace v8 {\n"
|
||||
<< "namespace internal {\n\n";
|
||||
|
||||
PrintSpecial(out);
|
||||
|
||||
out << "\n"
|
||||
<< "} // namespace internal\n"
|
||||
<< "} // namespace v8\n"
|
||||
<< "#endif // V8_INTL_SUPPORT\n";
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
// Entry point of the generator: expects exactly one argument, the path of the
// file to write. Prints a usage line to stderr and exits with status 1
// otherwise.
int main(int argc, const char** argv) {
  if (argc != 2) {
    std::cerr << "Usage: " << argv[0] << " <output filename>\n";
    // Returning from main yields the same exit status as std::exit(1)
    // without depending on <cstdlib>.
    return 1;
  }

  v8::internal::WriteHeader(argv[1]);
  return 0;
}
|
@ -6,6 +6,9 @@
|
||||
|
||||
#include "src/execution/isolate.h"
|
||||
#include "src/regexp/regexp.h"
|
||||
#ifdef V8_INTL_SUPPORT
|
||||
#include "src/regexp/special-case.h"
|
||||
#endif // V8_INTL_SUPPORT
|
||||
#include "src/strings/unicode-inl.h"
|
||||
#include "src/zone/zone-list-inl.h"
|
||||
|
||||
@ -1137,6 +1140,39 @@ Vector<const int> CharacterRange::GetWordBounds() {
|
||||
return Vector<const int>(kWordRanges, kWordRangeCount - 1);
|
||||
}
|
||||
|
||||
#ifdef V8_INTL_SUPPORT
|
||||
struct IgnoreSet {
|
||||
IgnoreSet() : set(BuildIgnoreSet()) {}
|
||||
const icu::UnicodeSet set;
|
||||
};
|
||||
|
||||
struct SpecialAddSet {
|
||||
SpecialAddSet() : set(BuildSpecialAddSet()) {}
|
||||
const icu::UnicodeSet set;
|
||||
};
|
||||
|
||||
// Returns a frozen set containing exactly the 52 ASCII letters A-Z and a-z.
icu::UnicodeSet BuildAsciiAToZSet() {
  icu::UnicodeSet letters;
  letters.add('A', 'Z');
  letters.add('a', 'z');
  letters.freeze();
  return letters;
}
|
||||
|
||||
struct AsciiAToZSet {
|
||||
AsciiAToZSet() : set(BuildAsciiAToZSet()) {}
|
||||
const icu::UnicodeSet set;
|
||||
};
|
||||
|
||||
namespace {

// File-local base::LazyInstance holders for the three sets; callers read
// them through Pointer()->set.
base::LazyInstance<IgnoreSet>::type ignore_set = LAZY_INSTANCE_INITIALIZER;

base::LazyInstance<SpecialAddSet>::type special_add_set =
    LAZY_INSTANCE_INITIALIZER;

base::LazyInstance<AsciiAToZSet>::type ascii_a_to_z_set =
    LAZY_INSTANCE_INITIALIZER;

}  // namespace
|
||||
#endif // V8_INTL_SUPPORT
|
||||
|
||||
// static
|
||||
void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
|
||||
ZoneList<CharacterRange>* ranges,
|
||||
@ -1144,58 +1180,100 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
|
||||
CharacterRange::Canonicalize(ranges);
|
||||
int range_count = ranges->length();
|
||||
#ifdef V8_INTL_SUPPORT
|
||||
icu::UnicodeSet already_added;
|
||||
icu::UnicodeSet others;
|
||||
for (int i = 0; i < range_count; i++) {
|
||||
CharacterRange range = ranges->at(i);
|
||||
uc32 bottom = range.from();
|
||||
if (bottom > String::kMaxUtf16CodeUnit) continue;
|
||||
uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
|
||||
uc32 from = range.from();
|
||||
if (from > String::kMaxUtf16CodeUnit) continue;
|
||||
uc32 to = Min(range.to(), String::kMaxUtf16CodeUnit);
|
||||
// Nothing to be done for surrogates.
|
||||
if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
|
||||
if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
|
||||
if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
|
||||
if (bottom > String::kMaxOneByteCharCode) continue;
|
||||
if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
|
||||
if (from > String::kMaxOneByteCharCode) continue;
|
||||
if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode;
|
||||
}
|
||||
already_added.add(bottom, top);
|
||||
icu::Locale locale = icu::Locale::getRoot();
|
||||
while (bottom <= top) {
|
||||
icu::UnicodeString upper(bottom);
|
||||
upper.toUpper(locale);
|
||||
icu::UnicodeSet expanded(bottom, bottom);
|
||||
expanded.closeOver(USET_CASE_INSENSITIVE);
|
||||
for (int32_t i = 0; i < expanded.getRangeCount(); i++) {
|
||||
UChar32 start = expanded.getRangeStart(i);
|
||||
UChar32 end = expanded.getRangeEnd(i);
|
||||
while (start <= end) {
|
||||
icu::UnicodeString upper2(start);
|
||||
upper2.toUpper(locale);
|
||||
// Only add if the upper case are the same.
|
||||
if (upper[0] == upper2[0]) {
|
||||
// #sec-runtime-semantics-canonicalize-ch
|
||||
// 3.g. If the numeric value of ch ≥ 128 and the numeric value of
|
||||
// cu < 128, return ch.
|
||||
if (bottom >= 128 && start < 128) {
|
||||
others.add(bottom);
|
||||
} else {
|
||||
// 3.h. 3.h. 3.h. Return cu.
|
||||
others.add(start);
|
||||
}
|
||||
}
|
||||
start++;
|
||||
}
|
||||
}
|
||||
bottom++;
|
||||
others.add(from, to);
|
||||
}
|
||||
|
||||
// Set of characters already added to ranges that do not need to be added
|
||||
// again.
|
||||
icu::UnicodeSet already_added(others);
|
||||
|
||||
// Set of characters in ranges that are in the 52 ASCII characters [a-zA-Z].
|
||||
icu::UnicodeSet in_ascii_a_to_z(others);
|
||||
in_ascii_a_to_z.retainAll(ascii_a_to_z_set.Pointer()->set);
|
||||
|
||||
// Remove all chars in [a-zA-Z] from others.
|
||||
others.removeAll(in_ascii_a_to_z);
|
||||
|
||||
// Set of characters in ranges that are overlapping with special add set.
|
||||
icu::UnicodeSet in_special_add(others);
|
||||
in_special_add.retainAll(special_add_set.Pointer()->set);
|
||||
|
||||
others.removeAll(in_special_add);
|
||||
|
||||
// Ignore all chars in ignore set.
|
||||
others.removeAll(ignore_set.Pointer()->set);
|
||||
|
||||
// For most of the chars in ranges that is still in others, find the case
|
||||
// equivalent set by calling closeOver(USET_CASE_INSENSITIVE).
|
||||
others.closeOver(USET_CASE_INSENSITIVE);
|
||||
|
||||
// Because closeOver(USET_CASE_INSENSITIVE) may add ASCII [a-zA-Z] to others,
|
||||
// but ECMA262 "i" mode won't consider that, remove them from others.
|
||||
// Ex: U+017F add 'S' and 's' to others.
|
||||
others.removeAll(ascii_a_to_z_set.Pointer()->set);
|
||||
|
||||
// Special handling for in_ascii_a_to_z.
|
||||
for (int32_t i = 0; i < in_ascii_a_to_z.getRangeCount(); i++) {
|
||||
UChar32 start = in_ascii_a_to_z.getRangeStart(i);
|
||||
UChar32 end = in_ascii_a_to_z.getRangeEnd(i);
|
||||
// Check if it is lowercase a-z by testing the 0x20 bit (set for a-z, clear for A-Z).
|
||||
if (start & 0x0020) {
|
||||
// Lowercase range: add the uppercase counterparts.
|
||||
others.add(start & 0x005F, end & 0x005F);
|
||||
} else {
|
||||
// Uppercase range: add the lowercase counterparts.
|
||||
others.add(start | 0x0020, end | 0x0020);
|
||||
}
|
||||
}
|
||||
|
||||
// Special handling for chars in "Special Add" set.
|
||||
for (int32_t i = 0; i < in_special_add.getRangeCount(); i++) {
|
||||
UChar32 end = in_special_add.getRangeEnd(i);
|
||||
for (UChar32 ch = in_special_add.getRangeStart(i); ch <= end; ch++) {
|
||||
// Add the uppercase of this character if itself is not an uppercase
|
||||
// character.
|
||||
// Note: The if condition cannot be u_islower(ch) because ch could be
|
||||
// neither uppercase nor lowercase but Mn.
|
||||
if (!u_isupper(ch)) {
|
||||
others.add(u_toupper(ch));
|
||||
}
|
||||
icu::UnicodeSet candidates(ch, ch);
|
||||
candidates.closeOver(USET_CASE_INSENSITIVE);
|
||||
for (int32_t j = 0; j < candidates.getRangeCount(); j++) {
|
||||
UChar32 end2 = candidates.getRangeEnd(j);
|
||||
for (UChar32 ch2 = candidates.getRangeStart(j); ch2 <= end2; ch2++) {
|
||||
// Add character that is not uppercase to others.
|
||||
if (!u_isupper(ch2)) {
|
||||
others.add(ch2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Remove all characters which already in the ranges.
|
||||
others.removeAll(already_added);
|
||||
|
||||
// Add others to the ranges
|
||||
for (int32_t i = 0; i < others.getRangeCount(); i++) {
|
||||
UChar32 start = others.getRangeStart(i);
|
||||
UChar32 end = others.getRangeEnd(i);
|
||||
if (start == end) {
|
||||
ranges->Add(CharacterRange::Singleton(start), zone);
|
||||
UChar32 from = others.getRangeStart(i);
|
||||
UChar32 to = others.getRangeEnd(i);
|
||||
if (from == to) {
|
||||
ranges->Add(CharacterRange::Singleton(from), zone);
|
||||
} else {
|
||||
ranges->Add(CharacterRange::Range(start, end), zone);
|
||||
ranges->Add(CharacterRange::Range(from, to), zone);
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
79
src/regexp/special-case.h
Normal file
79
src/regexp/special-case.h
Normal file
@ -0,0 +1,79 @@
|
||||
// Copyright 2019 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_SPECIAL_CASE_H_
|
||||
#define V8_REGEXP_SPECIAL_CASE_H_
|
||||
|
||||
#ifdef V8_INTL_SUPPORT
|
||||
#include "unicode/uversion.h"
|
||||
namespace U_ICU_NAMESPACE {
|
||||
class UnicodeSet;
|
||||
} // namespace U_ICU_NAMESPACE
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Functions to build special sets of Unicode characters that need special
|
||||
// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE).
|
||||
//
|
||||
// For the characters in the "ignore set", the process should not treat other
|
||||
// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case
|
||||
// equivalent under the ECMA262 RegExp "i" mode because these characters are
|
||||
// uppercase themselves that no other characters in the set uppercase to.
|
||||
//
|
||||
// For the characters in the "special add set", the process should add only
|
||||
// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which is
|
||||
// not uppercase characters as case equivalent under the ECMA262 RegExp "i" mode
|
||||
// and also that ONE uppercase character that other non uppercase character
|
||||
// uppercase into to the set. Other uppercase characters in the result of
|
||||
// closeOver(USET_CASE_INSENSITIVE) should not be considered because ECMA262
|
||||
// RegExp "i" mode considers two characters as "case equivalent" if both
|
||||
// characters uppercase to the same character.
|
||||
//
|
||||
// For example, consider the following case equivalent set defined by Unicode
|
||||
// standard. Notice there are more than one uppercase characters in this set:
|
||||
// U+212B Å Angstrom Sign - an uppercase character.
|
||||
// U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character.
|
||||
// U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which
|
||||
// uppercase to U+00C5.
|
||||
// This case equivalent set is a special set and needs special handling while
|
||||
// considering "case equivalent" under the ECMA262 RegExp "i" mode which is
|
||||
// different than Unicode Standard:
|
||||
// * U+212B should be included into the "ignore" set because there are no other
|
||||
// characters, under the ECMA262 "i" mode, are considered as "case equivalent"
|
||||
// to it because U+212B is itself an uppercase but neither U+00C5 nor U+00E5
|
||||
// uppercase to U+212B.
|
||||
// * U+00C5 and U+00E5 will both be included into the "special add" set. While
|
||||
// calculating the "equivalent set" under ECMA262 "i" mode, the process will
|
||||
// add U+00E5, because it is not an uppercase character in the set. The
|
||||
// process will also add U+00C5, because it is the uppercase character which
|
||||
// other non uppercase character, U+00E5, uppercase into.
|
||||
//
|
||||
// For characters not included in "ignore set" and "special add set", the
|
||||
// process will just use closeOver(USET_CASE_INSENSITIVE) to calculate, which is
|
||||
// much faster.
|
||||
//
|
||||
// Under Unicode 12.0, there are only 7 characters in the "special add set" and
|
||||
// 4 characters in "ignore set" so even the special add process is slower, it is
|
||||
// limited to a small set of cases only.
|
||||
//
|
||||
// The implementation of these two function will be generated by calling ICU
|
||||
// icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by
|
||||
// the code in src/regexp/gen-regexp-special-case.cc.
|
||||
//
|
||||
// These two function will be used with LazyInstance<> template to generate
|
||||
// global sharable set to reduce memory usage and speed up performance.
|
||||
|
||||
// Function to build and return the Ignore set.
|
||||
icu::UnicodeSet BuildIgnoreSet();
|
||||
|
||||
// Function to build and return the Special Add set.
|
||||
icu::UnicodeSet BuildSpecialAddSet();
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_INTL_SUPPORT
|
||||
|
||||
#endif // V8_REGEXP_SPECIAL_CASE_H_
|
Loading…
Reference in New Issue
Block a user