Reland "Speed up CharacterRange::AddCaseEquivalents"

This is a reland of f23f644fb3 Fix the issue by wrap v8_executable("gen-regexp-special-case") inside if (current_toolchain == v8_generator_toolchain) { and change deps of action("run_gen-regexp-special-case") to ":gen-regexp-special-case($v8_generator_toolchain)", Original change's description: > Speed up CharacterRange::AddCaseEquivalents > > By using the lexCss("color:") to measure the performance > The change make the lexCss("color:") > x21 - x40 times faster than trunk. > x2.3 - x4.6 times faster than m74. > > Design Doc: http://shorturl.at/adfO5 > > Measured by out/x64.release/d8 reg977003.js > see reg977003.js attached to chromium:977003 > > Also see another cl of benchmark in > https://chromium-review.googlesource.com/c/v8/v8/+/1679651/ > > > Bug: chromium:977003 > Change-Id: Ie8518493d2c33df1594be1b4576bda715087b421 > Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1674851 > Commit-Queue: Frank Tang <ftang@chromium.org> > Reviewed-by: Yang Guo <yangguo@chromium.org> > Cr-Commit-Position: refs/heads/master@{#62471} Bug: chromium:977003 Change-Id: Ie690810f596e9551b5765f422665c9617391bcf8 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1683706 Reviewed-by: Frank Tang <ftang@chromium.org> Reviewed-by: Yang Guo <yangguo@chromium.org> Commit-Queue: Frank Tang <ftang@chromium.org> Cr-Commit-Position: refs/heads/master@{#62486}
2019-07-01 13:31:08 -07:00 · 2019-07-01 13:31:08 -07:00 · 433403dc9b
commit 433403dc9b
parent aaf94026c0
4 changed files with 370 additions and 41 deletions
--- a/BUILD.gn
+++ b/BUILD.gn
@ -2708,6 +2708,7 @@ v8_source_set("v8_base_without_compiler") {
    "src/regexp/regexp-nodes.h",
    "src/regexp/regexp-parser.cc",
    "src/regexp/regexp-parser.h",
    "src/regexp/regexp-special-case.h",
    "src/regexp/regexp-stack.cc",
    "src/regexp/regexp-stack.h",
    "src/regexp/regexp-utils.cc",
@ -3240,6 +3241,8 @@ v8_source_set("v8_base_without_compiler") {
  ]
  if (v8_enable_i18n_support) {
    deps += [ ":run_gen-regexp-special-case" ]
    sources += [ "$target_gen_dir/src/regexp/special-case.cc" ]
    if (is_win) {
      deps += [ "//third_party/icu:icudata" ]
    }
@ -3907,6 +3910,50 @@ v8_executable("torque-language-server") {
  }
 }
 # The generator executable is only defined when evaluating this file in the
 # generator toolchain (v8_generator_toolchain). The action
 # "run_gen-regexp-special-case" refers to it through a toolchain-qualified
 # label.
 if (current_toolchain == v8_generator_toolchain) {
  v8_executable("gen-regexp-special-case") {
    visibility = [ ":*" ]  # Only targets in this file can depend on this.
    # Single translation unit containing the generator's main().
    sources = [
      "src/regexp/gen-regexp-special-case.cc",
    ]
    # The generator uses ICU (icu::UnicodeSet) and the V8 base library.
    deps = [
      ":v8_libbase",
      "//build/win:default_exe_manifest",
      "//third_party/icu",
    ]
    configs = [ ":internal_config" ]
  }
 }
 # Produces $target_gen_dir/src/regexp/special-case.cc by handing the
 # gen-regexp-special-case executable (taken from the generator toolchain's
 # root_out_dir) and the output path to tools/run.py.
 # NOTE(review): tools/run.py presumably executes its first argument with the
 # remaining arguments -- confirm against the script.
 action("run_gen-regexp-special-case") {
  visibility = [ ":*" ]  # Only targets in this file can depend on this.
  script = "tools/run.py"
  # NOTE(review): v8_extra_library_files is not referenced by the args below;
  # confirm whether these files are really inputs of this step.
  sources = v8_extra_library_files
  # Depend on the executable as built by the generator toolchain.
  deps = [
    ":gen-regexp-special-case($v8_generator_toolchain)",
  ]
  output_file = "$target_gen_dir/src/regexp/special-case.cc"
  outputs = [
    output_file,
  ]
  # args[0]: path of the generator binary, relative to root_build_dir.
  # args[1]: path of the file to generate, relative to root_build_dir.
  args = [
    "./" + rebase_path(
            get_label_info(":gen-regexp-special-case($v8_generator_toolchain)",
                           "root_out_dir") + "/gen-regexp-special-case",
            root_build_dir),
    rebase_path(output_file, root_build_dir),
  ]
 }
 ###############################################################################
 # Public targets
 #
--- a/src/regexp/gen-regexp-special-case.cc
+++ b/src/regexp/gen-regexp-special-case.cc
@ -0,0 +1,125 @@
 // Copyright 2019 the V8 project authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 #include <cstdlib>
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <sstream>
 #include "src/base/logging.h"
 #include "unicode/uchar.h"
 #include "unicode/uniset.h"
 namespace v8 {
 namespace internal {
 // The following code generates BuildSpecialAddSet() and BuildIgnoreSet()
 // functions into "src/regexp/special-case.cc".
 // See more details in http://shorturl.at/adfO5
 // Emits to |out| the C++ source of a function named |func_name| that rebuilds
 // |set|: one set.add(...) call per range of |set| (a single-argument call
 // when the range holds one code point), followed by set.freeze() and a
 // return of the set. Code points are written as hexadecimal literals.
 //
 // Note: |out| is left in hexadecimal integer mode on return.
 void PrintSet(std::ofstream& out, const char* func_name,
               const icu::UnicodeSet& set) {
  // Every number written below follows a literal "0x" prefix, so select hex
  // formatting here instead of relying on the caller's stream state.
  out << std::hex;
  out << "icu::UnicodeSet " << func_name << "() {\n"
      << "  icu::UnicodeSet set;\n";
  const int32_t range_count = set.getRangeCount();
  for (int32_t i = 0; i < range_count; i++) {
    const auto range_start = set.getRangeStart(i);
    const auto range_end = set.getRangeEnd(i);
    out << "  set.add(0x" << range_start;
    if (range_start != range_end) {
      out << ", 0x" << range_end;
    }
    out << ");\n";
  }
  out << "  set.freeze();\n"
      << "  return set;\n"
      << "}\n";
 }
 void PrintSpecial(std::ofstream& out) {
  icu::UnicodeSet current;
  icu::UnicodeSet processed(0xd800, 0xdbff);  // Ignore surrogate range.
  icu::UnicodeSet special_add;
  icu::UnicodeSet ignore;
  UErrorCode status = U_ZERO_ERROR;
  icu::UnicodeSet upper("[\\p{Lu}]", status);
  CHECK(U_SUCCESS(status));
  // Iterate through all chars in BMP except ASCII and Surrogate.
  for (UChar32 i = 0x80; i < 0x010000; i++) {
    // Ignore those characters which is already processed.
    if (!processed.contains(i)) {
      current.set(i, i);
      current.closeOver(USET_CASE_INSENSITIVE);
      // Remember we already processed current.
      processed.addAll(current);
      // All uppercase characters in current.
      icu::UnicodeSet keep_upper(current);
      keep_upper.retainAll(upper);
      // Check if we have more than one uppercase character in current.
      // If there are more than one uppercase character, then it is a special
      // set which need to be added into either "Special Add" set or "Ignore"
      // set.
      int32_t number_of_upper = 0;
      for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) {
        number_of_upper +=
            keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1;
      }
      if (number_of_upper > 1) {
        // Add all non uppercase characters (could be Ll or Mn) to special add
        // set.
        current.removeAll(upper);
        special_add.addAll(current);
        // Add the uppercase characters of non uppercase character to
        // special add set.
        CHECK_GT(current.getRangeCount(), 0);
        UChar32 main_upper = u_toupper(current.getRangeStart(0));
        special_add.add(main_upper);
        // Add all uppercase except the main upper to ignore set.
        keep_upper.remove(main_upper);
        ignore.addAll(keep_upper);
      }
    }
  }
  // Remove any ASCII
  special_add.remove(0x0000, 0x007f);
  PrintSet(out, "BuildIgnoreSet", ignore);
  PrintSet(out, "BuildSpecialAddSet", special_add);
 }
 // Writes the generated source to |header_filename|: a banner comment, the
 // includes and namespace openers, the two set-builder functions emitted by
 // PrintSpecial(), and the closing lines, all inside #ifdef V8_INTL_SUPPORT.
 //
 // Prints a message to std::cerr and terminates the process with exit code 1
 // if the file cannot be opened or if writing to it fails, so that a broken
 // generation step is reported here rather than silently producing no file.
 void WriteHeader(const char* header_filename) {
  std::ofstream out(header_filename);
  if (!out.is_open()) {
    std::cerr << "Cannot open " << header_filename << " for writing\n";
    std::exit(1);
  }
  // Code points are emitted as hexadecimal literals; std::hex stays in effect
  // for all subsequent integer output on this stream.
  out << std::hex;
  out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n"
      << "// The following functions are used to build icu::UnicodeSet\n"
      << "// for specical cases different between Unicode and ECMA262.\n"
      << "#ifdef V8_INTL_SUPPORT\n"
      << "#include \"src/regexp/special-case.h\"\n\n"
      << "#include \"unicode/uniset.h\"\n"
      << "namespace v8 {\n"
      << "namespace internal {\n\n";
  PrintSpecial(out);
  out << "\n"
      << "}  // namespace internal\n"
      << "}  // namespace v8\n"
      << "#endif  // V8_INTL_SUPPORT\n";
  // Push everything to the file so that write errors become visible here.
  out.flush();
  if (!out) {
    std::cerr << "Failed to write " << header_filename << "\n";
    std::exit(1);
  }
 }
 }  // namespace internal
 }  // namespace v8
 // Entry point of the generator. Expects exactly one command-line argument,
 // the path of the file to generate; otherwise prints a usage line to
 // std::cerr and exits with status 1.
 int main(int argc, const char** argv) {
  const bool has_output_path = (argc == 2);
  if (has_output_path) {
    v8::internal::WriteHeader(argv[1]);
    return 0;
  }
  std::cerr << "Usage: " << argv[0] << " <output filename>\n";
  std::exit(1);
 }
--- a/src/regexp/regexp-compiler-tonode.cc
+++ b/src/regexp/regexp-compiler-tonode.cc
@ -6,6 +6,9 @@
 #include "src/execution/isolate.h"
 #include "src/regexp/regexp.h"
 #ifdef V8_INTL_SUPPORT
 #include "src/regexp/special-case.h"
 #endif  // V8_INTL_SUPPORT
 #include "src/strings/unicode-inl.h"
 #include "src/zone/zone-list-inl.h"
@ -1137,6 +1140,39 @@ Vector<const int> CharacterRange::GetWordBounds() {
  return Vector<const int>(kWordRanges, kWordRangeCount - 1);
 }
 #ifdef V8_INTL_SUPPORT
 // Default-constructible holder for the set returned by BuildIgnoreSet(), so
 // that it can be stored in a base::LazyInstance<>.
 struct IgnoreSet {
  const icu::UnicodeSet set;

  IgnoreSet() : set(BuildIgnoreSet()) {}
 };
 // Default-constructible holder for the set returned by BuildSpecialAddSet(),
 // so that it can be stored in a base::LazyInstance<>.
 struct SpecialAddSet {
  const icu::UnicodeSet set;

  SpecialAddSet() : set(BuildSpecialAddSet()) {}
 };
 // Returns a frozen set holding exactly the 52 ASCII letters [a-zA-Z].
 icu::UnicodeSet BuildAsciiAToZSet() {
  icu::UnicodeSet letters;
  letters.add('a', 'z');
  letters.add('A', 'Z');
  letters.freeze();
  return letters;
 }
 // Default-constructible holder for the set returned by BuildAsciiAToZSet(),
 // so that it can be stored in a base::LazyInstance<>.
 struct AsciiAToZSet {
  const icu::UnicodeSet set;

  AsciiAToZSet() : set(BuildAsciiAToZSet()) {}
 };
 // File-local holders of the three sets used by
 // CharacterRange::AddCaseEquivalents(), which reads them through
 // Pointer()->set.
 // NOTE(review): base::LazyInstance presumably constructs each holder on first
 // access -- confirm in src/base/lazy-instance.h.
 static base::LazyInstance<IgnoreSet>::type ignore_set =
    LAZY_INSTANCE_INITIALIZER;
 static base::LazyInstance<SpecialAddSet>::type special_add_set =
    LAZY_INSTANCE_INITIALIZER;
 static base::LazyInstance<AsciiAToZSet>::type ascii_a_to_z_set =
    LAZY_INSTANCE_INITIALIZER;
 #endif  // V8_INTL_SUPPORT
 // static
 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
                                        ZoneList<CharacterRange>* ranges,
@ -1144,58 +1180,100 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
  CharacterRange::Canonicalize(ranges);
  int range_count = ranges->length();
 #ifdef V8_INTL_SUPPORT
  icu::UnicodeSet already_added;
  icu::UnicodeSet others;
  for (int i = 0; i < range_count; i++) {
    CharacterRange range = ranges->at(i);
-    uc32 bottom = range.from();
+    uc32 from = range.from();
-    if (bottom > String::kMaxUtf16CodeUnit) continue;
+    if (from > String::kMaxUtf16CodeUnit) continue;
-    uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
+    uc32 to = Min(range.to(), String::kMaxUtf16CodeUnit);
    // Nothing to be done for surrogates.
-    if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
+    if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
    if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
-      if (bottom > String::kMaxOneByteCharCode) continue;
+      if (from > String::kMaxOneByteCharCode) continue;
-      if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
+      if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode;
    }
-    already_added.add(bottom, top);
+    others.add(from, to);
-    icu::Locale locale = icu::Locale::getRoot();
+  }
-    while (bottom <= top) {
+
-      icu::UnicodeString upper(bottom);
+  // Set of characters already added to ranges that do not need to be added
-      upper.toUpper(locale);
+  // again.
-      icu::UnicodeSet expanded(bottom, bottom);
+  icu::UnicodeSet already_added(others);
-      expanded.closeOver(USET_CASE_INSENSITIVE);
+
-      for (int32_t i = 0; i < expanded.getRangeCount(); i++) {
+  // Set of characters in ranges that are in the 52 ASCII characters [a-zA-Z].
-        UChar32 start = expanded.getRangeStart(i);
+  icu::UnicodeSet in_ascii_a_to_z(others);
-        UChar32 end = expanded.getRangeEnd(i);
+  in_ascii_a_to_z.retainAll(ascii_a_to_z_set.Pointer()->set);
-        while (start <= end) {
+
-          icu::UnicodeString upper2(start);
+  // Remove all chars in [a-zA-Z] from others.
-          upper2.toUpper(locale);
+  others.removeAll(in_ascii_a_to_z);
-          // Only add if the upper case are the same.
+
-          if (upper[0] == upper2[0]) {
+  // Set of characters in ranges that are overlapping with special add set.
-            // #sec-runtime-semantics-canonicalize-ch
+  icu::UnicodeSet in_special_add(others);
-            // 3.g. If the numeric value of ch ≥ 128 and the numeric value of
+  in_special_add.retainAll(special_add_set.Pointer()->set);
-            // cu < 128, return ch.
+
-            if (bottom >= 128 && start < 128) {
+  others.removeAll(in_special_add);
-              others.add(bottom);
+
-            } else {
+  // Ignore all chars in ignore set.
-              // 3.h. 3.h. 3.h. Return cu.
+  others.removeAll(ignore_set.Pointer()->set);
-              others.add(start);
+
-            }
+  // For most of the chars in ranges that is still in others, find the case
-          }
+  // equivalent set by calling closeOver(USET_CASE_INSENSITIVE).
-          start++;
+  others.closeOver(USET_CASE_INSENSITIVE);
-        }
+
-      }
+  // Because closeOver(USET_CASE_INSENSITIVE) may add ASCII [a-zA-Z] to others,
-      bottom++;
+  // but ECMA262 "i" mode won't consider that, remove them from others.
  // Ex: U+017F add 'S' and 's' to others.
  others.removeAll(ascii_a_to_z_set.Pointer()->set);
  // Special handling for in_ascii_a_to_z.
  for (int32_t i = 0; i < in_ascii_a_to_z.getRangeCount(); i++) {
    UChar32 start = in_ascii_a_to_z.getRangeStart(i);
    UChar32 end = in_ascii_a_to_z.getRangeEnd(i);
    // Bit 0x20 is set for lowercase a-z and clear for uppercase A-Z.
    if (start & 0x0020) {
      // Lowercase input: add the uppercase counterparts (bit 0x20 cleared).
      others.add(start & 0x005F, end & 0x005F);
    } else {
      // Uppercase input: add the lowercase counterparts (bit 0x20 set).
      others.add(start | 0x0020, end | 0x0020);
    }
  }
  // Special handling for chars in "Special Add" set.
  for (int32_t i = 0; i < in_special_add.getRangeCount(); i++) {
    UChar32 end = in_special_add.getRangeEnd(i);
    for (UChar32 ch = in_special_add.getRangeStart(i); ch <= end; ch++) {
      // Add the uppercase of this character if itself is not an uppercase
      // character.
      // Note: The if condition cannot be u_islower(ch) because ch could be
      // neither uppercase nor lowercase but Mn.
      if (!u_isupper(ch)) {
        others.add(u_toupper(ch));
      }
      icu::UnicodeSet candidates(ch, ch);
      candidates.closeOver(USET_CASE_INSENSITIVE);
      for (int32_t j = 0; j < candidates.getRangeCount(); j++) {
        UChar32 end2 = candidates.getRangeEnd(j);
        for (UChar32 ch2 = candidates.getRangeStart(j); ch2 <= end2; ch2++) {
          // Add character that is not uppercase to others.
          if (!u_isupper(ch2)) {
            others.add(ch2);
          }
        }
      }
    }
  }
  // Remove all characters which already in the ranges.
  others.removeAll(already_added);
  // Add others to the ranges
  for (int32_t i = 0; i < others.getRangeCount(); i++) {
-    UChar32 start = others.getRangeStart(i);
+    UChar32 from = others.getRangeStart(i);
-    UChar32 end = others.getRangeEnd(i);
+    UChar32 to = others.getRangeEnd(i);
-    if (start == end) {
+    if (from == to) {
-      ranges->Add(CharacterRange::Singleton(start), zone);
+      ranges->Add(CharacterRange::Singleton(from), zone);
    } else {
-      ranges->Add(CharacterRange::Range(start, end), zone);
+      ranges->Add(CharacterRange::Range(from, to), zone);
    }
  }
 #else
--- a/src/regexp/special-case.h
+++ b/src/regexp/special-case.h
@ -0,0 +1,79 @@
 // Copyright 2019 the V8 project authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 #ifndef V8_REGEXP_SPECIAL_CASE_H_
 #define V8_REGEXP_SPECIAL_CASE_H_
 #ifdef V8_INTL_SUPPORT
 #include "unicode/uversion.h"
 namespace U_ICU_NAMESPACE {
 class UnicodeSet;
 }  //  namespace U_ICU_NAMESPACE
 namespace v8 {
 namespace internal {
 // Functions to build special sets of Unicode characters that need special
 // handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE).
 //
 // For the characters in the "ignore set", the process should not treat other
 // characters in the result of closeOver(USET_CASE_INSENSITIVE) as case
 // equivalent under the ECMA262 RegExp "i" mode, because these characters are
 // themselves uppercase and no other character in the set uppercases to them.
 //
 // For the characters in the "special add set", the process should add only
 // those characters in the result of closeOver(USET_CASE_INSENSITIVE) which
 // are not uppercase characters as case equivalent under the ECMA262 RegExp
 // "i" mode, and also the ONE uppercase character that the other non-uppercase
 // characters uppercase to. Other uppercase characters in the result of
 // closeOver(USET_CASE_INSENSITIVE) should not be considered, because ECMA262
 // RegExp "i" mode considers two characters "case equivalent" if both
 // characters uppercase to the same character.
 //
 // For example, consider the following case equivalent set defined by Unicode
 // standard. Notice there are more than one uppercase characters in this set:
 //  U+212B Å Angstrom Sign - an uppercase character.
 //  U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character.
 //  U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which
 //    uppercase to U+00C5.
 // This case equivalent set is a special set and needs special handling when
 // determining "case equivalent" characters under the ECMA262 RegExp "i" mode,
 // which differs from the Unicode Standard:
 //  * U+212B should be included in the "ignore" set because, under the ECMA262
 //    "i" mode, no other character is considered "case equivalent" to it:
 //    U+212B is itself uppercase, but neither U+00C5 nor U+00E5 uppercases to
 //    U+212B.
 //  * U+00C5 and U+00E5 will both be included in the "special add" set. While
 //    calculating the "equivalent set" under ECMA262 "i" mode, the process
 //    will add U+00E5, because it is not an uppercase character in the set.
 //    The process will also add U+00C5, because it is the uppercase character
 //    which the other non-uppercase character, U+00E5, uppercases to.
 //
 // For characters not included in "ignore set" and "special add set", the
 // process will just use closeOver(USET_CASE_INSENSITIVE) to calculate, which
 // is much faster.
 //
 // Under Unicode 12.0, there are only 7 characters in the "special add set" and
 // 4 characters in "ignore set", so even though the special add process is
 // slower, it is limited to a small set of cases only.
 //
 // The implementation of these two functions will be generated at build time,
 // by calling ICU icu::UnicodeSet, into gen/src/regexp/special-case.cc by the
 // code in src/regexp/gen-regexp-special-case.cc.
 //
 // These two functions will be used with the LazyInstance<> template to
 // create globally shared sets that reduce memory usage and improve
 // performance.
 // Function to build and return the Ignore set.
 icu::UnicodeSet BuildIgnoreSet();
 // Function to build and return the Special Add set.
 icu::UnicodeSet BuildSpecialAddSet();
 }  // namespace internal
 }  // namespace v8
 #endif  // V8_INTL_SUPPORT
 #endif  // V8_REGEXP_SPECIAL_CASE_H_