[regexp] Support the msy flags in experimental engine
The m (multiline) and s (dotall) flags just needed to be marked as allowed; the required logic was already in the regexp parser. A regexp /<x>/ without the y (sticky) flag is equivalent to the sticky regexp /.*?<x>/y. The interpreter now assumes that every regexp is sticky, and the compiler appends a preamble corresponding to /.*?/ before non-sticky regexps. To reuse existing code for compiling this preamble, the logic for each kind of quantifier is now in a separate function and called from VisitQuantifier and for the preamble. The commit also includes some improvements/fixes for character ranges: - Empty character ranges/disjunctions should never match, but before this commit they would *always* match. - The check of the range bounds in CanBeHandledVisitor was unnecessary; without the unicode flag this can't be a range that can't be specified in 2-byte codepoints, and once we support unicode we simply support all codepoints. - The capacity of the list containing the complementary intervals of a character range is now calculated more accurately. Cq-Include-Trybots: luci.v8.try:v8_linux64_fyi_rel_ng Bug: v8:10765 Change-Id: I71a0e07279b4e1140c0ed1651b3714200c801de9 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2404766 Commit-Queue: Martin Bidlingmaier <mbid@google.com> Reviewed-by: Jakob Gruber <jgruber@chromium.org> Cr-Commit-Position: refs/heads/master@{#70082}
This commit is contained in:
parent
339c555ba7
commit
e6e9cbac37
@ -113,6 +113,16 @@ struct RegExpInstruction {
|
||||
return result;
|
||||
}
|
||||
|
||||
static RegExpInstruction ConsumeAnyChar() {
|
||||
return ConsumeRange(Uc16Range{0x0000, 0xFFFF});
|
||||
}
|
||||
|
||||
static RegExpInstruction Fail() {
|
||||
// This is encoded as the empty CONSUME_RANGE of characters 0xFFFF <= c <=
|
||||
// 0x0000.
|
||||
return ConsumeRange(Uc16Range{0xFFFF, 0x0000});
|
||||
}
|
||||
|
||||
static RegExpInstruction Fork(int32_t alt_index) {
|
||||
RegExpInstruction result;
|
||||
result.opcode = FORK;
|
||||
|
@ -19,21 +19,23 @@ constexpr uc32 kMaxSupportedCodepoint = 0xFFFFu;
|
||||
class CanBeHandledVisitor final : private RegExpVisitor {
|
||||
// Visitor to implement `ExperimentalRegExp::CanBeHandled`.
|
||||
public:
|
||||
static bool Check(RegExpTree* node, JSRegExp::Flags flags, int capture_count,
|
||||
Zone* zone) {
|
||||
static bool Check(RegExpTree* tree, JSRegExp::Flags flags,
|
||||
int capture_count) {
|
||||
if (!AreSuitableFlags(flags)) return false;
|
||||
CanBeHandledVisitor visitor(zone);
|
||||
node->Accept(&visitor, nullptr);
|
||||
CanBeHandledVisitor visitor;
|
||||
tree->Accept(&visitor, nullptr);
|
||||
return visitor.result_;
|
||||
}
|
||||
|
||||
private:
|
||||
explicit CanBeHandledVisitor(Zone* zone) : zone_(zone) {}
|
||||
CanBeHandledVisitor() = default;
|
||||
|
||||
static bool AreSuitableFlags(JSRegExp::Flags flags) {
|
||||
// TODO(mbid, v8:10765): We should be able to support all flags in the
|
||||
// future.
|
||||
static constexpr JSRegExp::Flags kAllowedFlags = JSRegExp::kGlobal;
|
||||
static constexpr JSRegExp::Flags kAllowedFlags =
|
||||
JSRegExp::kGlobal | JSRegExp::kSticky | JSRegExp::kMultiline |
|
||||
JSRegExp::kDotAll;
|
||||
// We support Unicode iff kUnicode is among the supported flags.
|
||||
STATIC_ASSERT(ExperimentalRegExp::kSupportsUnicode ==
|
||||
((kAllowedFlags & JSRegExp::kUnicode) != 0));
|
||||
@ -62,24 +64,11 @@ class CanBeHandledVisitor final : private RegExpVisitor {
|
||||
|
||||
void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
|
||||
result_ = result_ && AreSuitableFlags(node->flags());
|
||||
for (CharacterRange r : *node->ranges(zone_)) {
|
||||
// TODO(mbid, v8:10765): We don't support full unicode yet, so we only
|
||||
// allow character ranges that can be specified with two-byte characters.
|
||||
if (r.to() > kMaxSupportedCodepoint) {
|
||||
result_ = false;
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitAssertion(RegExpAssertion* node, void*) override {
|
||||
// TODO(mbid,v8:10765): Once regexps that we shouldn't try to match at
|
||||
// every input position (e.g. sticky) are supported, we should also support
|
||||
// START_OF_INPUT.
|
||||
result_ = result_ &&
|
||||
node->assertion_type() != RegExpAssertion::START_OF_INPUT &&
|
||||
AreSuitableFlags(node->flags());
|
||||
result_ = result_ && AreSuitableFlags(node->flags());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@ -181,16 +170,15 @@ class CanBeHandledVisitor final : private RegExpVisitor {
|
||||
int replication_factor_ = 1;
|
||||
|
||||
bool result_ = true;
|
||||
Zone* zone_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
bool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree,
|
||||
JSRegExp::Flags flags,
|
||||
int capture_count, Zone* zone) {
|
||||
int capture_count) {
|
||||
DCHECK(FLAG_enable_experimental_regexp_engine);
|
||||
return CanBeHandledVisitor::Check(tree, flags, capture_count, zone);
|
||||
return CanBeHandledVisitor::Check(tree, flags, capture_count);
|
||||
}
|
||||
|
||||
namespace {
|
||||
@ -286,6 +274,15 @@ class CompileVisitor : private RegExpVisitor {
|
||||
Zone* zone) {
|
||||
CompileVisitor compiler(zone);
|
||||
|
||||
if ((flags & JSRegExp::kSticky) == 0 && !tree->IsAnchoredAtStart()) {
|
||||
// The match is not anchored, i.e. may start at any input position, so we
|
||||
// emit a preamble corresponding to /.*?/. This skips an arbitrary
|
||||
// prefix in the input non-greedily.
|
||||
compiler.CompileNonGreedyStar([&]() {
|
||||
compiler.code_.Add(RegExpInstruction::ConsumeAnyChar(), zone);
|
||||
});
|
||||
}
|
||||
|
||||
compiler.code_.Add(RegExpInstruction::SetRegisterToCp(0), zone);
|
||||
tree->Accept(&compiler, nullptr);
|
||||
compiler.code_.Add(RegExpInstruction::SetRegisterToCp(1), zone);
|
||||
@ -303,7 +300,7 @@ class CompileVisitor : private RegExpVisitor {
|
||||
// `gen_alt` is called repeatedly with argument `int i = 0, 1, ..., alt_num -
|
||||
// 1` and should push code corresponding to the ith alternative onto `code_`.
|
||||
template <class F>
|
||||
void CompileDisjunction(int alt_num, F gen_alt) {
|
||||
void CompileDisjunction(int alt_num, F&& gen_alt) {
|
||||
// An alternative a1 | ... | an is compiled into
|
||||
//
|
||||
// FORK tail1
|
||||
@ -327,6 +324,8 @@ class CompileVisitor : private RegExpVisitor {
|
||||
// by the thread for a2 and so on.
|
||||
|
||||
if (alt_num == 0) {
|
||||
// The empty disjunction. This can never match.
|
||||
code_.Add(RegExpInstruction::Fail(), zone_);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -369,11 +368,12 @@ class CompileVisitor : private RegExpVisitor {
|
||||
ZoneList<CharacterRange>* ranges = node->ranges(zone_);
|
||||
CharacterRange::Canonicalize(ranges);
|
||||
if (node->is_negated()) {
|
||||
// Capacity 2 for the common case where we compute the complement of a
|
||||
// single interval range that doesn't contain 0 and kMaxCodePoint.
|
||||
// The complement of a disjoint, non-adjacent (i.e. `Canonicalize`d)
|
||||
// union of k intervals is a union of at most k + 1 intervals.
|
||||
ZoneList<CharacterRange>* negated =
|
||||
zone_->New<ZoneList<CharacterRange>>(2, zone_);
|
||||
zone_->New<ZoneList<CharacterRange>>(ranges->length() + 1, zone_);
|
||||
CharacterRange::Negate(ranges, negated, zone_);
|
||||
DCHECK_LE(negated->length(), ranges->length() + 1);
|
||||
ranges = negated;
|
||||
}
|
||||
|
||||
@ -417,6 +417,114 @@ class CompileVisitor : private RegExpVisitor {
|
||||
}
|
||||
}
|
||||
|
||||
// Emit bytecode corresponding to /<emit_body>*/.
|
||||
template <class F>
|
||||
void CompileGreedyStar(F&& emit_body) {
|
||||
// This is compiled into
|
||||
//
|
||||
// begin:
|
||||
// FORK end
|
||||
// <body>
|
||||
// JMP begin
|
||||
// end:
|
||||
// ...
|
||||
//
|
||||
// This is greedy because a forked thread has lower priority than the
|
||||
// thread that spawned it.
|
||||
Label begin(code_.length());
|
||||
DeferredLabel end;
|
||||
|
||||
AddForkTo(end, code_, zone_);
|
||||
emit_body();
|
||||
AddJmpTo(begin, code_, zone_);
|
||||
|
||||
std::move(end).Bind(code_);
|
||||
}
|
||||
|
||||
// Emit bytecode corresponding to /<emit_body>*?/.
|
||||
template <class F>
|
||||
void CompileNonGreedyStar(F&& emit_body) {
|
||||
// This is compiled into
|
||||
//
|
||||
// FORK body
|
||||
// JMP end
|
||||
// body:
|
||||
// <body>
|
||||
// FORK body
|
||||
// end:
|
||||
// ...
|
||||
|
||||
Label body(code_.length() + 2);
|
||||
DeferredLabel end;
|
||||
|
||||
AddForkTo(body, code_, zone_);
|
||||
AddJmpTo(end, code_, zone_);
|
||||
|
||||
DCHECK_EQ(body.index(), code_.length());
|
||||
|
||||
emit_body();
|
||||
AddForkTo(body, code_, zone_);
|
||||
|
||||
std::move(end).Bind(code_);
|
||||
}
|
||||
|
||||
// Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}/.
|
||||
template <class F>
|
||||
void CompileGreedyRepetition(F&& emit_body, int max_repetition_num) {
|
||||
// This is compiled into
|
||||
//
|
||||
// FORK end
|
||||
// <body>
|
||||
// FORK end
|
||||
// <body>
|
||||
// ...
|
||||
// ...
|
||||
// FORK end
|
||||
// <body>
|
||||
// end:
|
||||
// ...
|
||||
|
||||
DeferredLabel end;
|
||||
for (int i = 0; i != max_repetition_num; ++i) {
|
||||
AddForkTo(end, code_, zone_);
|
||||
emit_body();
|
||||
}
|
||||
std::move(end).Bind(code_);
|
||||
}
|
||||
|
||||
// Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}?/.
|
||||
template <class F>
|
||||
void CompileNonGreedyRepetition(F&& emit_body, int max_repetition_num) {
|
||||
// This is compiled into
|
||||
//
|
||||
// FORK body0
|
||||
// JMP end
|
||||
// body0:
|
||||
// <body>
|
||||
// FORK body1
|
||||
// JMP end
|
||||
// body1:
|
||||
// <body>
|
||||
// ...
|
||||
// ...
|
||||
// body{max_repetition_num - 1}:
|
||||
// <body>
|
||||
// end:
|
||||
// ...
|
||||
|
||||
DeferredLabel end;
|
||||
for (int i = 0; i != max_repetition_num; ++i) {
|
||||
Label body(code_.length() + 2);
|
||||
AddForkTo(body, code_, zone_);
|
||||
AddJmpTo(end, code_, zone_);
|
||||
|
||||
DCHECK_EQ(body.index(), code_.length());
|
||||
|
||||
emit_body();
|
||||
}
|
||||
std::move(end).Bind(code_);
|
||||
}
|
||||
|
||||
void* VisitQuantifier(RegExpQuantifier* node, void*) override {
|
||||
// Emit the body, but clear registers occurring in body first.
|
||||
//
|
||||
@ -440,105 +548,20 @@ class CompileVisitor : private RegExpVisitor {
|
||||
UNREACHABLE();
|
||||
case RegExpQuantifier::GREEDY: {
|
||||
if (node->max() == RegExpTree::kInfinity) {
|
||||
// This is compiled into
|
||||
//
|
||||
// begin:
|
||||
// FORK end
|
||||
// <body>
|
||||
// JMP begin
|
||||
// end:
|
||||
// ...
|
||||
//
|
||||
// This is greedy because a forked thread has lower priority than the
|
||||
// thread that spawned it.
|
||||
Label begin(code_.length());
|
||||
DeferredLabel end;
|
||||
|
||||
AddForkTo(end, code_, zone_);
|
||||
emit_body();
|
||||
AddJmpTo(begin, code_, zone_);
|
||||
|
||||
std::move(end).Bind(code_);
|
||||
CompileGreedyStar(emit_body);
|
||||
} else {
|
||||
DCHECK_NE(node->max(), RegExpTree::kInfinity);
|
||||
// This is compiled into
|
||||
//
|
||||
// FORK end
|
||||
// <body>
|
||||
// FORK end
|
||||
// <body>
|
||||
// ... ; max - min times in total
|
||||
// ...
|
||||
// FORK end
|
||||
// <body>
|
||||
// end:
|
||||
// ...
|
||||
|
||||
DeferredLabel end;
|
||||
for (int i = node->min(); i != node->max(); ++i) {
|
||||
AddForkTo(end, code_, zone_);
|
||||
emit_body();
|
||||
}
|
||||
std::move(end).Bind(code_);
|
||||
CompileGreedyRepetition(emit_body, node->max() - node->min());
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RegExpQuantifier::NON_GREEDY: {
|
||||
if (node->max() == RegExpTree::kInfinity) {
|
||||
// This is compiled into
|
||||
//
|
||||
// FORK body
|
||||
// JMP end
|
||||
// body:
|
||||
// <body>
|
||||
// FORK body
|
||||
// end:
|
||||
// ...
|
||||
|
||||
Label body(code_.length() + 2);
|
||||
DeferredLabel end;
|
||||
|
||||
AddForkTo(body, code_, zone_);
|
||||
AddJmpTo(end, code_, zone_);
|
||||
|
||||
DCHECK_EQ(body.index(), code_.length());
|
||||
|
||||
emit_body();
|
||||
AddForkTo(body, code_, zone_);
|
||||
|
||||
std::move(end).Bind(code_);
|
||||
CompileNonGreedyStar(emit_body);
|
||||
} else {
|
||||
DCHECK_NE(node->max(), RegExpTree::kInfinity);
|
||||
// This is compiled into
|
||||
//
|
||||
// FORK body0
|
||||
// JMP end
|
||||
// body0:
|
||||
// <body>
|
||||
// FORK body1
|
||||
// JMP end
|
||||
// body1:
|
||||
// <body>
|
||||
// ...
|
||||
// ...
|
||||
// body{max - min - 1}:
|
||||
// <body>
|
||||
// end:
|
||||
// ...
|
||||
|
||||
DeferredLabel end;
|
||||
for (int i = node->min(); i != node->max(); ++i) {
|
||||
Label body(code_.length() + 2);
|
||||
AddForkTo(body, code_, zone_);
|
||||
AddJmpTo(end, code_, zone_);
|
||||
|
||||
DCHECK_EQ(body.index(), code_.length());
|
||||
|
||||
emit_body();
|
||||
}
|
||||
std::move(end).Bind(code_);
|
||||
CompileNonGreedyRepetition(emit_body, node->max() - node->min());
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
|
@ -20,7 +20,7 @@ class ExperimentalRegExpCompiler final : public AllStatic {
|
||||
// TODO(mbid,v8:10765): Currently more things are not handled, e.g. some
|
||||
// quantifiers and unicode.
|
||||
static bool CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags,
|
||||
int capture_count, Zone* zone);
|
||||
int capture_count);
|
||||
// Compile regexp into a bytecode program. The regexp must be handlable by
|
||||
// the experimental engine; see `CanBeHandled`. The program is returned as a
|
||||
// ZoneList backed by the same Zone that is used in the RegExpTree argument.
|
||||
|
@ -240,14 +240,6 @@ class NfaInterpreter {
|
||||
uc16 input_char = input_[input_index_];
|
||||
++input_index_;
|
||||
|
||||
// If we haven't found a match yet, we add a thread with least priority
|
||||
// that attempts a match starting after `input_char`.
|
||||
if (!FoundMatch()) {
|
||||
active_threads_.Add(
|
||||
InterpreterThread{0, NewRegisterArray(kUndefinedRegisterValue)},
|
||||
zone_);
|
||||
}
|
||||
|
||||
// We unblock all blocked_threads_ by feeding them the input char.
|
||||
FlushBlockedThreads(input_char);
|
||||
|
||||
|
@ -14,9 +14,8 @@ namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
bool ExperimentalRegExp::CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags,
|
||||
int capture_count, Zone* zone) {
|
||||
return ExperimentalRegExpCompiler::CanBeHandled(tree, flags, capture_count,
|
||||
zone);
|
||||
int capture_count) {
|
||||
return ExperimentalRegExpCompiler::CanBeHandled(tree, flags, capture_count);
|
||||
}
|
||||
|
||||
void ExperimentalRegExp::Initialize(Isolate* isolate, Handle<JSRegExp> re,
|
||||
|
@ -20,7 +20,7 @@ class ExperimentalRegExp final : public AllStatic {
|
||||
// checked on the fly in the parser. Not done currently because walking the
|
||||
// AST again is more flexible and less error prone (but less performant).
|
||||
static bool CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags,
|
||||
int capture_count, Zone* zone);
|
||||
int capture_count);
|
||||
static void Initialize(Isolate* isolate, Handle<JSRegExp> re,
|
||||
Handle<String> pattern, JSRegExp::Flags flags,
|
||||
int capture_count);
|
||||
|
@ -184,7 +184,7 @@ MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re,
|
||||
|
||||
if (FLAG_enable_experimental_regexp_engine &&
|
||||
ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
|
||||
parse_result.capture_count, &zone)) {
|
||||
parse_result.capture_count)) {
|
||||
ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
|
||||
parse_result.capture_count);
|
||||
has_been_compiled = true;
|
||||
|
@ -887,7 +887,7 @@ class MatchInfoBackedMatch : public String::Match {
|
||||
: isolate_(isolate), match_info_(match_info) {
|
||||
subject_ = String::Flatten(isolate, subject);
|
||||
|
||||
if (regexp->TypeTag() == JSRegExp::IRREGEXP) {
|
||||
if (JSRegExp::TypeSupportsCaptures(regexp->TypeTag())) {
|
||||
Object o = regexp->CaptureNameMap();
|
||||
has_named_captures_ = o.IsFixedArray();
|
||||
if (has_named_captures_) {
|
||||
|
@ -76,15 +76,23 @@ Test(/(?:(123)|(xyz))*/, "xyz123", ["xyz123", "123", undefined], 0);
|
||||
Test(/((123)|(xyz)*)*/, "xyz123xyz", ["xyz123xyz", "xyz", undefined, "xyz"], 0);
|
||||
|
||||
// Assertions.
|
||||
// TODO(mbid,v8:10765): Once supported, we should also check ^ and $ with the
|
||||
// multiline flag.
|
||||
Test(/asdf\b/, "asdf---", ["asdf"], 0);
|
||||
Test(/asdf\b/, "asdfg", null, 0);
|
||||
Test(/asd[fg]\B/, "asdf asdgg", ["asdg"], 0);
|
||||
// TODO(mbid,v8:10765): The ^ assertion should work once we support anchored
|
||||
// regexps.
|
||||
//Test(/^asd[fg]/, "asdf asdgg", ["asdf"], 0);
|
||||
Test(/^asd[fg]/, "asdf asdgg", ["asdf"], 0);
|
||||
Test(/asd[fg]$/, "asdf asdg", ["asdg"], 0);
|
||||
|
||||
// The global flag.
|
||||
Test(/asdf/g, "fjasdfkkasdf", ["asdf"], 6);
|
||||
|
||||
// The sticky flag.
|
||||
var r = /asdf/y;
|
||||
r.lastIndex = 2;
|
||||
Test(r, "fjasdfkkasdf", ["asdf"], 6);
|
||||
|
||||
// The multiline flag.
|
||||
Test(/^a/m, "x\na", ["a"], 0);
|
||||
Test(/x$/m, "x\na", ["x"], 0);
|
||||
|
||||
// The dotall flag.
|
||||
Test(/asdf.xyz/s, "asdf\nxyz", ["asdf\nxyz"], 0);
|
||||
|
Loading…
Reference in New Issue
Block a user