[regexp] Limit ATOM regexps to single-character patterns
There's an inherent trade-off when deciding between ATOM and IRREGEXP regexps: IRREGEXP is faster at runtime for all but trivial single-character patterns, while ATOM regexps have a lower memory overhead. This CL is intended to help investigate impact on benchmarks and real-world code - if something tanks, it's easy to revert, otherwise it can be a first step towards a possible removal of ATOM regexps. Bug: v8:6633 Change-Id: Ia41d8eb28d33952735562d3d4127202746a6ac4e Reviewed-on: https://chromium-review.googlesource.com/589435 Reviewed-by: Yang Guo <yangguo@chromium.org> Commit-Queue: Jakob Gruber <jgruber@chromium.org> Cr-Commit-Position: refs/heads/master@{#47081}
This commit is contained in:
parent
069c2ac2a1
commit
062bb7d487
@ -96,37 +96,6 @@ ContainedInLattice AddRange(ContainedInLattice containment,
|
||||
return containment;
|
||||
}
|
||||
|
||||
|
||||
// More makes code generation slower, less makes V8 benchmark score lower.
|
||||
const int kMaxLookaheadForBoyerMoore = 8;
|
||||
// In a 3-character pattern you can maximally step forwards 3 characters
|
||||
// at a time, which is not always enough to pay for the extra logic.
|
||||
const int kPatternTooShortForBoyerMoore = 2;
|
||||
|
||||
|
||||
// Identifies the sort of regexps where the regexp engine is faster
|
||||
// than the code used for atom matches.
|
||||
//
// Only the first Min(kMaxLookaheadForBoyerMoore, pattern->length())
// characters of |pattern| are inspected. Returns false when that prefix is no
// longer than kPatternTooShortForBoyerMoore, or as soon as three times the
// number of distinct characters seen exceeds the prefix length; returns true
// otherwise.
static bool HasFewDifferentCharacters(Handle<String> pattern) {
|
||||
int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
|
||||
// Prefixes this short are never classified as low-alphabet.
if (length <= kPatternTooShortForBoyerMoore) return false;
|
||||
// Characters are bucketed by their low bits (value & (kMod - 1)), so code
// units that differ only in the higher bits share a bucket and count once.
const int kMod = 128;
|
||||
bool character_found[kMod];
|
||||
int different = 0;
|
||||
memset(&character_found[0], 0, sizeof(character_found));
|
||||
for (int i = 0; i < length; i++) {
|
||||
int ch = (pattern->Get(i) & (kMod - 1));
|
||||
if (!character_found[ch]) {
|
||||
character_found[ch] = true;
|
||||
different++;
|
||||
// We declare a regexp low-alphabet if it has at least 3 times as many
|
||||
// characters as it has different characters.
|
||||
// Checked on every newly seen bucket so the scan can stop early.
if (different * 3 > length) return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Generic RegExp methods. Dispatches to implementation specific methods.
|
||||
|
||||
|
||||
@ -158,7 +127,7 @@ MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
|
||||
bool has_been_compiled = false;
|
||||
|
||||
if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
|
||||
!(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
|
||||
!(flags & JSRegExp::kSticky) && pattern->length() == 1) {
|
||||
// Parse-tree is a single atom that is equal to the pattern.
|
||||
AtomCompile(re, pattern, flags, pattern);
|
||||
has_been_compiled = true;
|
||||
@ -166,12 +135,11 @@ MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
|
||||
!(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
|
||||
RegExpAtom* atom = parse_result.tree->AsAtom();
|
||||
Vector<const uc16> atom_pattern = atom->data();
|
||||
Handle<String> atom_string;
|
||||
ASSIGN_RETURN_ON_EXCEPTION(
|
||||
isolate, atom_string,
|
||||
isolate->factory()->NewStringFromTwoByte(atom_pattern),
|
||||
Object);
|
||||
if (!HasFewDifferentCharacters(atom_string)) {
|
||||
if (atom_pattern.length() == 1) {
|
||||
Handle<String> atom_string;
|
||||
ASSIGN_RETURN_ON_EXCEPTION(
|
||||
isolate, atom_string,
|
||||
isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
|
||||
AtomCompile(re, pattern, flags, atom_string);
|
||||
has_been_compiled = true;
|
||||
}
|
||||
@ -3029,6 +2997,8 @@ static void EmitHat(RegExpCompiler* compiler,
|
||||
on_success->Emit(compiler, &new_trace);
|
||||
}
|
||||
|
||||
// More makes code generation slower, less makes V8 benchmark score lower.
|
||||
const int kMaxLookaheadForBoyerMoore = 8;
|
||||
|
||||
// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
|
||||
void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
|
||||
|
Loading…
Reference in New Issue
Block a user