diff --git a/src/regexp/experimental/experimental-bytecode.h b/src/regexp/experimental/experimental-bytecode.h index cee36c5627..3cb65828c5 100644 --- a/src/regexp/experimental/experimental-bytecode.h +++ b/src/regexp/experimental/experimental-bytecode.h @@ -113,6 +113,16 @@ struct RegExpInstruction { return result; } + static RegExpInstruction ConsumeAnyChar() { + return ConsumeRange(Uc16Range{0x0000, 0xFFFF}); + } + + static RegExpInstruction Fail() { + // This is encoded as the empty CONSUME_RANGE of characters 0xFFFF <= c <= + // 0x0000. + return ConsumeRange(Uc16Range{0xFFFF, 0x0000}); + } + static RegExpInstruction Fork(int32_t alt_index) { RegExpInstruction result; result.opcode = FORK; diff --git a/src/regexp/experimental/experimental-compiler.cc b/src/regexp/experimental/experimental-compiler.cc index 106147e3da..615f7566f4 100644 --- a/src/regexp/experimental/experimental-compiler.cc +++ b/src/regexp/experimental/experimental-compiler.cc @@ -19,21 +19,23 @@ constexpr uc32 kMaxSupportedCodepoint = 0xFFFFu; class CanBeHandledVisitor final : private RegExpVisitor { // Visitor to implement `ExperimentalRegExp::CanBeHandled`. public: - static bool Check(RegExpTree* node, JSRegExp::Flags flags, int capture_count, - Zone* zone) { + static bool Check(RegExpTree* tree, JSRegExp::Flags flags, + int capture_count) { if (!AreSuitableFlags(flags)) return false; - CanBeHandledVisitor visitor(zone); - node->Accept(&visitor, nullptr); + CanBeHandledVisitor visitor; + tree->Accept(&visitor, nullptr); return visitor.result_; } private: - explicit CanBeHandledVisitor(Zone* zone) : zone_(zone) {} + CanBeHandledVisitor() = default; static bool AreSuitableFlags(JSRegExp::Flags flags) { // TODO(mbid, v8:10765): We should be able to support all flags in the // future. - static constexpr JSRegExp::Flags kAllowedFlags = JSRegExp::kGlobal; + static constexpr JSRegExp::Flags kAllowedFlags = + JSRegExp::kGlobal | JSRegExp::kSticky | JSRegExp::kMultiline | + JSRegExp::kDotAll; // We support Unicode iff kUnicode is among the supported flags. STATIC_ASSERT(ExperimentalRegExp::kSupportsUnicode == ((kAllowedFlags & JSRegExp::kUnicode) != 0)); @@ -62,24 +64,11 @@ class CanBeHandledVisitor final : private RegExpVisitor { void* VisitCharacterClass(RegExpCharacterClass* node, void*) override { result_ = result_ && AreSuitableFlags(node->flags()); - for (CharacterRange r : *node->ranges(zone_)) { - // TODO(mbid, v8:10765): We don't support full unicode yet, so we only - // allow character ranges that can be specified with two-byte characters. - if (r.to() > kMaxSupportedCodepoint) { - result_ = false; - return nullptr; - } - } return nullptr; } void* VisitAssertion(RegExpAssertion* node, void*) override { - // TODO(mbid,v8:10765): Once regexps that we shouldn't try to match at - // every input position (e.g. sticky) are supported, we should also support - // START_OF_INPUT. - result_ = result_ && - node->assertion_type() != RegExpAssertion::START_OF_INPUT && - AreSuitableFlags(node->flags()); + result_ = result_ && AreSuitableFlags(node->flags()); return nullptr; } @@ -181,16 +170,15 @@ class CanBeHandledVisitor final : private RegExpVisitor { int replication_factor_ = 1; bool result_ = true; - Zone* zone_; }; } // namespace bool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags, - int capture_count, Zone* zone) { + int capture_count) { DCHECK(FLAG_enable_experimental_regexp_engine); - return CanBeHandledVisitor::Check(tree, flags, capture_count, zone); + return CanBeHandledVisitor::Check(tree, flags, capture_count); } namespace { @@ -286,6 +274,15 @@ class CompileVisitor : private RegExpVisitor { Zone* zone) { CompileVisitor compiler(zone); + if ((flags & JSRegExp::kSticky) == 0 && !tree->IsAnchoredAtStart()) { + // The match is not anchored, i.e. may start at any input position, so we + // emit a preamble corresponding to /.*?/. This skips an arbitrary + // prefix in the input non-greedily. + compiler.CompileNonGreedyStar([&]() { + compiler.code_.Add(RegExpInstruction::ConsumeAnyChar(), zone); + }); + } + compiler.code_.Add(RegExpInstruction::SetRegisterToCp(0), zone); tree->Accept(&compiler, nullptr); compiler.code_.Add(RegExpInstruction::SetRegisterToCp(1), zone); @@ -303,7 +300,7 @@ class CompileVisitor : private RegExpVisitor { // `alt_gen` is called repeatedly with argument `int i = 0, 1, ..., alt_num - // 1` and should push code corresponding to the ith alternative onto `code_`. template - void CompileDisjunction(int alt_num, F gen_alt) { + void CompileDisjunction(int alt_num, F&& gen_alt) { // An alternative a1 | ... | an is compiled into // // FORK tail1 @@ -327,6 +324,8 @@ class CompileVisitor : private RegExpVisitor { // by the thread for a2 and so on. if (alt_num == 0) { + // The empty disjunction. This can never match. + code_.Add(RegExpInstruction::Fail(), zone_); return; } @@ -369,11 +368,12 @@ class CompileVisitor : private RegExpVisitor { ZoneList* ranges = node->ranges(zone_); CharacterRange::Canonicalize(ranges); if (node->is_negated()) { - // Capacity 2 for the common case where we compute the complement of a - // single interval range that doesn't contain 0 and kMaxCodePoint. + // The complement of a disjoint, non-adjacent (i.e. `Canonicalize`d) + // union of k intervals is a union of at most k + 1 intervals. ZoneList* negated = - zone_->New>(2, zone_); + zone_->New>(ranges->length() + 1, zone_); CharacterRange::Negate(ranges, negated, zone_); + DCHECK_LE(negated->length(), ranges->length() + 1); ranges = negated; } @@ -417,6 +417,114 @@ class CompileVisitor : private RegExpVisitor { } } + // Emit bytecode corresponding to /*/. + template + void CompileGreedyStar(F&& emit_body) { + // This is compiled into + // + // begin: + // FORK end + // + // JMP begin + // end: + // ... + // + // This is greedy because a forked thread has lower priority than the + // thread that spawned it. + Label begin(code_.length()); + DeferredLabel end; + + AddForkTo(end, code_, zone_); + emit_body(); + AddJmpTo(begin, code_, zone_); + + std::move(end).Bind(code_); + } + + // Emit bytecode corresponding to /*?/. + template + void CompileNonGreedyStar(F&& emit_body) { + // This is compiled into + // + // FORK body + // JMP end + // body: + // + // FORK body + // end: + // ... + + Label body(code_.length() + 2); + DeferredLabel end; + + AddForkTo(body, code_, zone_); + AddJmpTo(end, code_, zone_); + + DCHECK_EQ(body.index(), code_.length()); + + emit_body(); + AddForkTo(body, code_, zone_); + + std::move(end).Bind(code_); + } + + // Emit bytecode corresponding to /{0, max_repetition_num}/. + template + void CompileGreedyRepetition(F&& emit_body, int max_repetition_num) { + // This is compiled into + // + // FORK end + // + // FORK end + // + // ... + // ... + // FORK end + // + // end: + // ... + + DeferredLabel end; + for (int i = 0; i != max_repetition_num; ++i) { + AddForkTo(end, code_, zone_); + emit_body(); + } + std::move(end).Bind(code_); + } + + // Emit bytecode corresponding to /{0, max_repetition_num}?/. + template + void CompileNonGreedyRepetition(F&& emit_body, int max_repetition_num) { + // This is compiled into + // + // FORK body0 + // JMP end + // body0: + // + // FORK body1 + // JMP end + // body1: + // + // ... + // ... + // body{max_repetition_num - 1}: + // + // end: + // ... + + DeferredLabel end; + for (int i = 0; i != max_repetition_num; ++i) { + Label body(code_.length() + 2); + AddForkTo(body, code_, zone_); + AddJmpTo(end, code_, zone_); + + DCHECK_EQ(body.index(), code_.length()); + + emit_body(); + } + std::move(end).Bind(code_); + } + void* VisitQuantifier(RegExpQuantifier* node, void*) override { // Emit the body, but clear registers occuring in body first. // @@ -440,105 +548,20 @@ class CompileVisitor : private RegExpVisitor { UNREACHABLE(); case RegExpQuantifier::GREEDY: { if (node->max() == RegExpTree::kInfinity) { - // This is compiled into - // - // begin: - // FORK end - // - // JMP begin - // end: - // ... - // - // This is greedy because a forked thread has lower priority than the - // thread that spawned it. - Label begin(code_.length()); - DeferredLabel end; - - AddForkTo(end, code_, zone_); - emit_body(); - AddJmpTo(begin, code_, zone_); - - std::move(end).Bind(code_); + CompileGreedyStar(emit_body); } else { DCHECK_NE(node->max(), RegExpTree::kInfinity); - // This is compiled into - // - // FORK end - // - // FORK end - // - // ... ; max - min times in total - // ... - // FORK end - // - // end: - // ... - - DeferredLabel end; - for (int i = node->min(); i != node->max(); ++i) { - AddForkTo(end, code_, zone_); - emit_body(); - } - std::move(end).Bind(code_); + CompileGreedyRepetition(emit_body, node->max() - node->min()); } break; } case RegExpQuantifier::NON_GREEDY: { if (node->max() == RegExpTree::kInfinity) { - // This is compiled into - // - // FORK body - // JMP end - // body: - // - // FORK body - // end: - // ... - - Label body(code_.length() + 2); - DeferredLabel end; - - AddForkTo(body, code_, zone_); - AddJmpTo(end, code_, zone_); - - DCHECK_EQ(body.index(), code_.length()); - - emit_body(); - AddForkTo(body, code_, zone_); - - std::move(end).Bind(code_); + CompileNonGreedyStar(emit_body); } else { DCHECK_NE(node->max(), RegExpTree::kInfinity); - // This is compiled into - // - // FORK body0 - // JMP end - // body0: - // - // FORK body1 - // JMP end - // body1: - // - // ... - // ... - // body{max - min - 1}: - // - // end: - // ... - - DeferredLabel end; - for (int i = node->min(); i != node->max(); ++i) { - Label body(code_.length() + 2); - AddForkTo(body, code_, zone_); - AddJmpTo(end, code_, zone_); - - DCHECK_EQ(body.index(), code_.length()); - - emit_body(); - } - std::move(end).Bind(code_); + CompileNonGreedyRepetition(emit_body, node->max() - node->min()); } - break; } } return nullptr; diff --git a/src/regexp/experimental/experimental-compiler.h b/src/regexp/experimental/experimental-compiler.h index 115a324f0c..87abcd3917 100644 --- a/src/regexp/experimental/experimental-compiler.h +++ b/src/regexp/experimental/experimental-compiler.h @@ -20,7 +20,7 @@ class ExperimentalRegExpCompiler final : public AllStatic { // TODO(mbid,v8:10765): Currently more things are not handled, e.g. some // quantifiers and unicode. static bool CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags, - int capture_count, Zone* zone); + int capture_count); // Compile regexp into a bytecode program. The regexp must be handlable by // the experimental engine; see`CanBeHandled`. The program is returned as a // ZoneList backed by the same Zone that is used in the RegExpTree argument. diff --git a/src/regexp/experimental/experimental-interpreter.cc b/src/regexp/experimental/experimental-interpreter.cc index fda502ba98..8db93ca746 100644 --- a/src/regexp/experimental/experimental-interpreter.cc +++ b/src/regexp/experimental/experimental-interpreter.cc @@ -240,14 +240,6 @@ class NfaInterpreter { uc16 input_char = input_[input_index_]; ++input_index_; - // If we haven't found a match yet, we add a thread with least priority - // that attempts a match starting after `input_char`. - if (!FoundMatch()) { - active_threads_.Add( - InterpreterThread{0, NewRegisterArray(kUndefinedRegisterValue)}, - zone_); - } - // We unblock all blocked_threads_ by feeding them the input char. FlushBlockedThreads(input_char); diff --git a/src/regexp/experimental/experimental.cc b/src/regexp/experimental/experimental.cc index 4b64a79bac..dc919f56c2 100644 --- a/src/regexp/experimental/experimental.cc +++ b/src/regexp/experimental/experimental.cc @@ -14,9 +14,8 @@ namespace v8 { namespace internal { bool ExperimentalRegExp::CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags, - int capture_count, Zone* zone) { - return ExperimentalRegExpCompiler::CanBeHandled(tree, flags, capture_count, - zone); + int capture_count) { + return ExperimentalRegExpCompiler::CanBeHandled(tree, flags, capture_count); } void ExperimentalRegExp::Initialize(Isolate* isolate, Handle re, diff --git a/src/regexp/experimental/experimental.h b/src/regexp/experimental/experimental.h index 80d1dd542c..02f535f621 100644 --- a/src/regexp/experimental/experimental.h +++ b/src/regexp/experimental/experimental.h @@ -20,7 +20,7 @@ class ExperimentalRegExp final : public AllStatic { // checked on the fly in the parser. Not done currently because walking the // AST again is more flexible and less error prone (but less performant). static bool CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags, - int capture_count, Zone* zone); + int capture_count); static void Initialize(Isolate* isolate, Handle re, Handle pattern, JSRegExp::Flags flags, int capture_count); diff --git a/src/regexp/regexp.cc b/src/regexp/regexp.cc index e62cc5aae4..569acdab48 100644 --- a/src/regexp/regexp.cc +++ b/src/regexp/regexp.cc @@ -184,7 +184,7 @@ MaybeHandle RegExp::Compile(Isolate* isolate, Handle re, if (FLAG_enable_experimental_regexp_engine && ExperimentalRegExp::CanBeHandled(parse_result.tree, flags, - parse_result.capture_count, &zone)) { + parse_result.capture_count)) { ExperimentalRegExp::Initialize(isolate, re, pattern, flags, parse_result.capture_count); has_been_compiled = true; diff --git a/src/runtime/runtime-regexp.cc b/src/runtime/runtime-regexp.cc index e87b84422a..994d6e3710 100644 --- a/src/runtime/runtime-regexp.cc +++ b/src/runtime/runtime-regexp.cc @@ -887,7 +887,7 @@ class MatchInfoBackedMatch : public String::Match { : isolate_(isolate), match_info_(match_info) { subject_ = String::Flatten(isolate, subject); - if (regexp->TypeTag() == JSRegExp::IRREGEXP) { + if (JSRegExp::TypeSupportsCaptures(regexp->TypeTag())) { Object o = regexp->CaptureNameMap(); has_named_captures_ = o.IsFixedArray(); if (has_named_captures_) { diff --git a/test/mjsunit/regexp-experimental.js b/test/mjsunit/regexp-experimental.js index 5885489bbf..469da37d7f 100644 --- a/test/mjsunit/regexp-experimental.js +++ b/test/mjsunit/regexp-experimental.js @@ -76,15 +76,23 @@ Test(/(?:(123)|(xyz))*/, "xyz123", ["xyz123", "123", undefined], 0); Test(/((123)|(xyz)*)*/, "xyz123xyz", ["xyz123xyz", "xyz", undefined, "xyz"], 0); // Assertions. -// TODO(mbid,v8:10765): Once supported, we should also check ^ and $ with the -// multiline flag. Test(/asdf\b/, "asdf---", ["asdf"], 0); Test(/asdf\b/, "asdfg", null, 0); Test(/asd[fg]\B/, "asdf asdgg", ["asdg"], 0); -// TODO(mbid,v8:10765): The ^ assertion should work once we support anchored -// regexps. -//Test(/^asd[fg]/, "asdf asdgg", ["asdf"], 0); +Test(/^asd[fg]/, "asdf asdgg", ["asdf"], 0); Test(/asd[fg]$/, "asdf asdg", ["asdg"], 0); // The global flag. Test(/asdf/g, "fjasdfkkasdf", ["asdf"], 6); + +// The sticky flag. +var r = /asdf/y; +r.lastIndex = 2; +Test(r, "fjasdfkkasdf", ["asdf"], 6); + +// The multiline flag. +Test(/^a/m, "x\na", ["a"], 0); +Test(/x$/m, "x\na", ["x"], 0); + +// The dotall flag. +Test(/asdf.xyz/s, "asdf\nxyz", ["asdf\nxyz"], 0);