[regexp] Limit ATOM regexps to single-character patterns

There's an inherent trade-off when deciding between ATOM and IRREGEXP
regexps: IRREGEXP is faster at runtime for all but trivial single-character
patterns, while ATOM regexps have a lower memory overhead.

This CL is intended to help investigate the impact on benchmarks and
real-world code: if something tanks, it's easy to revert; otherwise it can be
a first step towards a possible removal of ATOM regexps.

Bug: v8:6633
Change-Id: Ia41d8eb28d33952735562d3d4127202746a6ac4e
Reviewed-on: https://chromium-review.googlesource.com/589435
Reviewed-by: Yang Guo <yangguo@chromium.org>
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#47081}
This commit is contained in:
jgruber 2017-07-27 17:20:16 +02:00 committed by Commit Bot
parent 069c2ac2a1
commit 062bb7d487

View File

@ -96,37 +96,6 @@ ContainedInLattice AddRange(ContainedInLattice containment,
return containment;
}
// Upper bound on how many leading pattern characters HasFewDifferentCharacters
// inspects. A larger value makes code generation slower; a smaller value
// lowers the V8 benchmark score.
const int kMaxLookaheadForBoyerMoore = 8;
// Patterns of this length or shorter are never treated as low-alphabet by
// HasFewDifferentCharacters. In a 3-character pattern you can maximally step
// forwards 3 characters at a time, which is not always enough to pay for the
// extra logic.
const int kPatternTooShortForBoyerMoore = 2;
// Identifies the sort of regexps where the regexp engine is faster
// than the code used for atom matches.
//
// Only the first kMaxLookaheadForBoyerMoore characters of |pattern| are
// examined, and characters are bucketed by their low 7 bits. Returns false
// outright when that prefix is no longer than kPatternTooShortForBoyerMoore.
static bool HasFewDifferentCharacters(Handle<String> pattern) {
  const int prefix_length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
  if (prefix_length <= kPatternTooShortForBoyerMoore) return false;

  const int kBuckets = 128;
  // One flag per bucket; all start out cleared.
  bool seen[kBuckets] = {};
  int distinct = 0;
  for (int pos = 0; pos < prefix_length; ++pos) {
    const int bucket = (pattern->Get(pos) & (kBuckets - 1));
    if (seen[bucket]) continue;
    seen[bucket] = true;
    ++distinct;
    // We declare a regexp low-alphabet if it has at least 3 times as many
    // characters as it has different characters, so bail out as soon as that
    // ratio can no longer hold.
    if (distinct * 3 > prefix_length) return false;
  }
  return true;
}
// Generic RegExp methods. Dispatches to implementation specific methods.
@ -158,7 +127,7 @@ MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
bool has_been_compiled = false;
if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
!(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
!(flags & JSRegExp::kSticky) && pattern->length() == 1) {
// Parse-tree is a single atom that is equal to the pattern.
AtomCompile(re, pattern, flags, pattern);
has_been_compiled = true;
@ -166,12 +135,11 @@ MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
!(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
RegExpAtom* atom = parse_result.tree->AsAtom();
Vector<const uc16> atom_pattern = atom->data();
Handle<String> atom_string;
ASSIGN_RETURN_ON_EXCEPTION(
isolate, atom_string,
isolate->factory()->NewStringFromTwoByte(atom_pattern),
Object);
if (!HasFewDifferentCharacters(atom_string)) {
if (atom_pattern.length() == 1) {
Handle<String> atom_string;
ASSIGN_RETURN_ON_EXCEPTION(
isolate, atom_string,
isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
AtomCompile(re, pattern, flags, atom_string);
has_been_compiled = true;
}
@ -3029,6 +2997,8 @@ static void EmitHat(RegExpCompiler* compiler,
on_success->Emit(compiler, &new_trace);
}
// A larger value makes code generation slower; a smaller value lowers the V8
// benchmark score.
const int kMaxLookaheadForBoyerMoore = 8;
// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {