[regexp] Limit ATOM regexps to single-character patterns

There's an inherent trade-off when deciding between ATOM and IRREGEXP regexps: IRREGEXP is faster at runtime for all but trivial single-character patterns, while ATOM regexps have a lower memory overhead.

This CL is intended to help investigate the impact on benchmarks and real-world code - if something tanks, it's easy to revert; otherwise it can be a first step towards a possible removal of ATOM regexps.

Bug: v8:6633
Change-Id: Ia41d8eb28d33952735562d3d4127202746a6ac4e
Reviewed-on: https://chromium-review.googlesource.com/589435
Reviewed-by: Yang Guo <yangguo@chromium.org>
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#47081}
2017-07-27 17:20:16 +02:00 · 2017-07-27 17:20:16 +02:00 · 062bb7d487
commit 062bb7d487
parent 069c2ac2a1
1 changed files with 8 additions and 38 deletions
--- a/src/regexp/jsregexp.cc
+++ b/src/regexp/jsregexp.cc
@ -96,37 +96,6 @@ ContainedInLattice AddRange(ContainedInLattice containment,
  return containment;
 }

-
-// More makes code generation slower, less makes V8 benchmark score lower.
-const int kMaxLookaheadForBoyerMoore = 8;
-// In a 3-character pattern you can maximally step forwards 3 characters
-// at a time, which is not always enough to pay for the extra logic.
-const int kPatternTooShortForBoyerMoore = 2;
-
-
-// Identifies the sort of regexps where the regexp engine is faster
-// than the code used for atom matches.
-static bool HasFewDifferentCharacters(Handle<String> pattern) {
-  int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
-  if (length <= kPatternTooShortForBoyerMoore) return false;
-  const int kMod = 128;
-  bool character_found[kMod];
-  int different = 0;
-  memset(&character_found[0], 0, sizeof(character_found));
-  for (int i = 0; i < length; i++) {
-    int ch = (pattern->Get(i) & (kMod - 1));
-    if (!character_found[ch]) {
-      character_found[ch] = true;
-      different++;
-      // We declare a regexp low-alphabet if it has at least 3 times as many
-      // characters as it has different characters.
-      if (different * 3 > length) return false;
-    }
-  }
-  return true;
-}
-
-
 // Generic RegExp methods. Dispatches to implementation specific methods.


@ -158,7 +127,7 @@ MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
  bool has_been_compiled = false;

  if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
-      !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
+      !(flags & JSRegExp::kSticky) && pattern->length() == 1) {
    // Parse-tree is a single atom that is equal to the pattern.
    AtomCompile(re, pattern, flags, pattern);
    has_been_compiled = true;
@ -166,12 +135,11 @@ MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
             !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
    RegExpAtom* atom = parse_result.tree->AsAtom();
    Vector<const uc16> atom_pattern = atom->data();
-    Handle<String> atom_string;
-    ASSIGN_RETURN_ON_EXCEPTION(
-        isolate, atom_string,
-        isolate->factory()->NewStringFromTwoByte(atom_pattern),
-        Object);
-    if (!HasFewDifferentCharacters(atom_string)) {
+    if (atom_pattern.length() == 1) {
+      Handle<String> atom_string;
+      ASSIGN_RETURN_ON_EXCEPTION(
+          isolate, atom_string,
+          isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
      AtomCompile(re, pattern, flags, atom_string);
      has_been_compiled = true;
    }
@ -3029,6 +2997,8 @@ static void EmitHat(RegExpCompiler* compiler,
  on_success->Emit(compiler, &new_trace);
 }

+// More makes code generation slower, less makes V8 benchmark score lower.
+const int kMaxLookaheadForBoyerMoore = 8;

 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {