From e2aa1a89ddb30afc26ba0a55627acce270353a52 Mon Sep 17 00:00:00 2001 From: Martin Bidlingmaier Date: Mon, 31 Aug 2020 21:49:18 +0200 Subject: [PATCH] [regexp] Split experimental regexp code into multiple files Bug: v8:10765 Change-Id: I49e425d861d900ab66b6f7801cddec8a7175ac03 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2385462 Commit-Queue: Martin Bidlingmaier Reviewed-by: Jakob Gruber Cr-Commit-Position: refs/heads/master@{#69637} --- BUILD.gn | 6 + .../experimental/experimental-bytecode.cc | 78 ++ .../experimental/experimental-bytecode.h | 167 ++++ .../experimental/experimental-compiler.cc | 365 +++++++ .../experimental/experimental-compiler.h | 33 + .../experimental/experimental-interpreter.cc | 349 +++++++ .../experimental/experimental-interpreter.h | 40 + src/regexp/experimental/experimental.cc | 921 +----------------- 8 files changed, 1066 insertions(+), 893 deletions(-) create mode 100644 src/regexp/experimental/experimental-bytecode.cc create mode 100644 src/regexp/experimental/experimental-bytecode.h create mode 100644 src/regexp/experimental/experimental-compiler.cc create mode 100644 src/regexp/experimental/experimental-compiler.h create mode 100644 src/regexp/experimental/experimental-interpreter.cc create mode 100644 src/regexp/experimental/experimental-interpreter.h diff --git a/BUILD.gn b/BUILD.gn index cbc5cccf4d..2e2e2b1eb1 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -3067,6 +3067,12 @@ v8_source_set("v8_base_without_compiler") { "src/profiler/tick-sample.h", "src/profiler/tracing-cpu-profiler.cc", "src/profiler/tracing-cpu-profiler.h", + "src/regexp/experimental/experimental-bytecode.cc", + "src/regexp/experimental/experimental-bytecode.h", + "src/regexp/experimental/experimental-compiler.cc", + "src/regexp/experimental/experimental-compiler.h", + "src/regexp/experimental/experimental-interpreter.cc", + "src/regexp/experimental/experimental-interpreter.h", "src/regexp/experimental/experimental.cc", 
"src/regexp/experimental/experimental.h", "src/regexp/property-sequences.cc", diff --git a/src/regexp/experimental/experimental-bytecode.cc b/src/regexp/experimental/experimental-bytecode.cc new file mode 100644 index 0000000000..1d8801bf68 --- /dev/null +++ b/src/regexp/experimental/experimental-bytecode.cc @@ -0,0 +1,78 @@ +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "src/regexp/experimental/experimental-bytecode.h" + +#include + +namespace v8 { +namespace internal { + +namespace { + +std::ostream& PrintAsciiOrHex(std::ostream& os, uc16 c) { + if (c < 128 && std::isprint(c)) { + os << static_cast(c); + } else { + os << "0x" << std::hex << static_cast(c) << std::dec; + } + return os; +} + +} // namespace + +std::ostream& operator<<(std::ostream& os, const RegExpInstruction& inst) { + switch (inst.opcode) { + case RegExpInstruction::CONSUME_RANGE: { + os << "CONSUME_RANGE ["; + PrintAsciiOrHex(os, inst.payload.consume_range.min); + os << ", "; + PrintAsciiOrHex(os, inst.payload.consume_range.max); + os << "]"; + break; + } + case RegExpInstruction::FORK: + os << "FORK " << inst.payload.pc; + break; + case RegExpInstruction::JMP: + os << "JMP " << inst.payload.pc; + break; + case RegExpInstruction::ACCEPT: + os << "ACCEPT"; + break; + } + return os; +} + +namespace { + +// The maximum number of digits required to display a non-negative number < n +// in base 10. 
+int DigitsRequiredBelow(int n) { + DCHECK_GE(n, 0); + + int result = 1; + for (int i = 10; i < n; i *= 10) { + result += 1; + } + return result; +} + +} // namespace + +std::ostream& operator<<(std::ostream& os, + Vector insts) { + int inst_num = insts.length(); + int line_digit_num = DigitsRequiredBelow(inst_num); + + for (int i = 0; i != inst_num; ++i) { + const RegExpInstruction& inst = insts[i]; + os << std::setfill('0') << std::setw(line_digit_num) << i << ": " << inst + << std::endl; + } + return os; +} + +} // namespace internal +} // namespace v8 diff --git a/src/regexp/experimental/experimental-bytecode.h b/src/regexp/experimental/experimental-bytecode.h new file mode 100644 index 0000000000..00361586d8 --- /dev/null +++ b/src/regexp/experimental/experimental-bytecode.h @@ -0,0 +1,167 @@ +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_BYTECODE_H_ +#define V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_BYTECODE_H_ + +#include + +#include "src/utils/vector.h" + +// ---------------------------------------------------------------------------- +// Definition and semantics of the EXPERIMENTAL bytecode. +// Background: +// - Russ Cox's blog post series on regular expression matching, in particular +// https://swtch.com/~rsc/regexp/regexp2.html +// - The re2 regular regexp library: https://github.com/google/re2 +// +// This comment describes the bytecode used by the experimental regexp engine +// and its abstract semantics in terms of a VM. An implementation of the +// semantics that avoids exponential runtime can be found in `NfaInterpreter`. +// +// The experimental bytecode describes a non-deterministic finite automaton. It +// runs on a multithreaded virtual machine (VM), i.e. in several threads +// concurrently. (These "threads" don't need to be actual operating system +// threads.) 
Apart from a list of threads, the VM maintains an immutable +// shared input string which threads can read from. Each thread is given by a +// program counter (PC, index of the current instruction), a fixed number of +// registers of indices into the input string, and a monotonically increasing +// index which represents the current position within the input string. +// +// For the precise encoding of the instruction set, see the definition `struct +// RegExpInstruction` below. Currently we support the following instructions: +// - CONSUME_RANGE: Check whether the codepoint of the current character is +// contained in a non-empty closed interval [min, max] specified in the +// instruction payload. Abort this thread if false, otherwise advance the +// input position by 1 and continue with the next instruction. +// - ACCEPT: Stop this thread and signify the end of a match at the current +// input position. +// - FORK: If executed by a thread t, spawn a new thread t0 whose register +// values and input position agree with those of t, but whose PC value is set +// to the value specified in the instruction payload. The register values of +// t and t0 agree directly after the FORK, but they can diverge. Thread t +// continues with the instruction directly after the current FORK +// instruction. +// - JMP: Instead of incrementing the PC value after execution of this +// instruction by 1, set PC of this thread to the value specified in the +// instruction payload and continue there. +// +// Special care must be exercised with respect to thread priority. It is +// possible that more than one thread executes an ACCEPT statement. The output +// of the program is given by the contents of the matching thread's registers, +// so this is ambiguous in case of multiple matches. To resolve the ambiguity, +// every implementation of the VM must output the match that a backtracking +// implementation would output (i.e. behave the same as Irregexp). 
+// +// A backtracking implementation of the VM maintains a stack of postponed +// threads. Upon encountering a FORK statement, this VM will create a copy of +// the current thread, set the copy's PC value according to the instruction +// payload, and push it to the stack of postponed threads. The VM will then +// continue execution of the current thread. +// +// If at some point a thread t executes an ACCEPT statement, the VM stops and +// outputs the registers of t. Postponed threads are discarded. On the other +// hand, if a thread t is aborted because some input character didn't pass a +// check, then the VM pops the topmost postponed thread and continues execution +// with this thread. If there are no postponed threads, then the VM outputs +// failure, i.e. no matches. +// +// Equivalently, we can describe the behavior of the backtracking VM in terms +// of priority: Threads are linearly ordered by priority, and matches generated +// by threads with high priority must be preferred over matches generated by +// threads with low priority, regardless of the chronological order in which +// matches were found. If a thread t executes a FORK statement and spawns a +// thread t0, then the priority of t0 is such that the following holds: +// * t0 < t, i.e. t0 has lower priority than t. +// * For all threads u such that u != t and u != t0, we have t0 < u iff t < u, +// i.e. t0 compares to other threads the same as t. +// For example, if there are currently 3 threads s, t, u such that s < t < u, +// then after t executes a FORK, the thread priorities will be s < t0 < t < u. + +namespace v8 { +namespace internal { + +// Bytecode format. +// Currently very simple fixed-size: The opcode is encoded in the first 4 +// bytes, the payload takes another 4 bytes. +struct RegExpInstruction { + enum Opcode : int32_t { + CONSUME_RANGE, + FORK, + JMP, + ACCEPT, + }; + + struct Uc16Range { + uc16 min; // Inclusive. + uc16 max; // Inclusive. 
+ }; + + static RegExpInstruction ConsumeRange(Uc16Range consume_range) { + RegExpInstruction result; + result.opcode = CONSUME_RANGE; + result.payload.consume_range = consume_range; + return result; + } + + static RegExpInstruction Fork(int32_t alt_index) { + RegExpInstruction result; + result.opcode = FORK; + result.payload.pc = alt_index; + return result; + } + + static RegExpInstruction Jmp(int32_t alt_index) { + RegExpInstruction result; + result.opcode = JMP; + result.payload.pc = alt_index; + return result; + } + + static RegExpInstruction Accept() { + RegExpInstruction result; + result.opcode = ACCEPT; + return result; + } + + Opcode opcode; + union { + // Payload of CONSUME_RANGE: + Uc16Range consume_range; + // Payload of FORK and JMP, the next/forked program counter (pc): + int32_t pc; + } payload; + STATIC_ASSERT(sizeof(payload) == 4); +}; +STATIC_ASSERT(sizeof(RegExpInstruction) == 8); +// TODO(mbid,v8:10765): This is rather wasteful. We can fit the opcode in 2-3 +// bits, so the remaining 29/30 bits can be used as payload. Problem: The +// payload of CONSUME_RANGE consists of two 16-bit values `min` and `max`, so +// this wouldn't fit. We could encode the payload of a CONSUME_RANGE +// instruction by the start of the interval and its length instead, and then +// only allows lengths that fit into 14/13 bits. A longer range can then be +// encoded as a disjunction of smaller ranges. +// +// Another thought: CONSUME_RANGEs are only valid if the payloads are such that +// min <= max. Thus there are +// +// 2^16 + 2^16 - 1 + ... + 1 +// = 2^16 * (2^16 + 1) / 2 +// = 2^31 + 2^15 +// +// valid payloads for a CONSUME_RANGE instruction. If we want to fit +// instructions into 4 bytes, we would still have almost 2^31 instructions left +// over if we encode everything as tight as possible. 
For example, we could +// use another 2^29 values for JMP, another 2^29 for FORK, 1 value for ACCEPT, +// and then still have almost 2^30 instructions left over for something like +// zero-width assertions and captures. + +std::ostream& operator<<(std::ostream& os, const RegExpInstruction& inst); +std::ostream& operator<<(std::ostream& os, + Vector insts); + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_BYTECODE_H_ diff --git a/src/regexp/experimental/experimental-compiler.cc b/src/regexp/experimental/experimental-compiler.cc new file mode 100644 index 0000000000..4068801ce7 --- /dev/null +++ b/src/regexp/experimental/experimental-compiler.cc @@ -0,0 +1,365 @@ +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "src/regexp/experimental/experimental-compiler.h" + +#include "src/zone/zone-list-inl.h" + +namespace v8 { +namespace internal { + +namespace { + +// TODO(mbid, v8:10765): Currently the experimental engine doesn't support +// UTF-16, but this shouldn't be too hard to implement. +constexpr uc32 kMaxSupportedCodepoint = 0xFFFFu; + +class CanBeHandledVisitor final : private RegExpVisitor { + // Visitor to implement `ExperimentalRegExp::CanBeHandled`. + public: + static bool Check(RegExpTree* node, JSRegExp::Flags flags, Zone* zone) { + if (!AreSuitableFlags(flags)) { + return false; + } + CanBeHandledVisitor visitor(zone); + node->Accept(&visitor, nullptr); + return visitor.result_; + } + + private: + explicit CanBeHandledVisitor(Zone* zone) : zone_(zone) {} + + static bool AreSuitableFlags(JSRegExp::Flags flags) { + // TODO(mbid, v8:10765): We should be able to support all flags in the + // future. 
+ static constexpr JSRegExp::Flags allowed_flags = JSRegExp::kGlobal; + return (flags & ~allowed_flags) == 0; + } + + void* VisitDisjunction(RegExpDisjunction* node, void*) override { + for (RegExpTree* alt : *node->alternatives()) { + alt->Accept(this, nullptr); + if (!result_) { + return nullptr; + } + } + return nullptr; + } + + void* VisitAlternative(RegExpAlternative* node, void*) override { + for (RegExpTree* child : *node->nodes()) { + child->Accept(this, nullptr); + if (!result_) { + return nullptr; + } + } + return nullptr; + } + + void* VisitCharacterClass(RegExpCharacterClass* node, void*) override { + result_ = result_ && AreSuitableFlags(node->flags()); + for (CharacterRange r : *node->ranges(zone_)) { + // TODO(mbid, v8:10765): We don't support full unicode yet, so we only + // allow character ranges that can be specified with two-byte characters. + if (r.to() > kMaxSupportedCodepoint) { + result_ = false; + return nullptr; + } + } + return nullptr; + } + + void* VisitAssertion(RegExpAssertion* node, void*) override { + // TODO(mbid, v8:10765): We should be able to support at least some + // assertions. re2 does, too. + result_ = false; + return nullptr; + } + + void* VisitAtom(RegExpAtom* node, void*) override { + result_ = result_ && AreSuitableFlags(node->flags()); + return nullptr; + } + + void* VisitText(RegExpText* node, void*) override { + for (TextElement& el : *node->elements()) { + el.tree()->Accept(this, nullptr); + if (!result_) { + return nullptr; + } + } + return nullptr; + } + + void* VisitQuantifier(RegExpQuantifier* node, void*) override { + // TODO(mbid, v8:10765): Theoretically we can support arbitrary min() and + // max(), but the size of the automaton grows linearly with finite max(). + // We probably want a cut-off value here, or maybe we can "virtualize" the + // repetitions. + // Non-greedy quantifiers are easy to implement, but not supported atm. 
+ // It's not clear to me how a possessive quantifier would be implemented, + // we should check whether re2 supports this. + result_ = result_ && node->min() == 0 && + node->max() == RegExpTree::kInfinity && node->is_greedy(); + if (!result_) { + return nullptr; + } + node->body()->Accept(this, nullptr); + return nullptr; + } + + void* VisitCapture(RegExpCapture* node, void*) override { + // TODO(mbid, v8:10765): This can be implemented with the NFA interpreter, + // but not with the lazy DFA. See also re2. + result_ = false; + return nullptr; + } + + void* VisitGroup(RegExpGroup* node, void*) override { + node->body()->Accept(this, nullptr); + return nullptr; + } + + void* VisitLookaround(RegExpLookaround* node, void*) override { + // TODO(mbid, v8:10765): This will be hard to support, but not impossible I + // think. See product automata. + result_ = false; + return nullptr; + } + + void* VisitBackReference(RegExpBackReference* node, void*) override { + // This can't be implemented without backtracking. + result_ = false; + return nullptr; + } + + void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; } + + private: + bool result_ = true; + Zone* zone_; +}; + +} // namespace + +bool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree, + JSRegExp::Flags flags, + Zone* zone) { + DCHECK(FLAG_enable_experimental_regexp_engine); + return CanBeHandledVisitor::Check(tree, flags, zone); +} + +namespace { + +class CompileVisitor : private RegExpVisitor { + public: + static ZoneList Compile(RegExpTree* tree, + JSRegExp::Flags flags, + Zone* zone) { + CompileVisitor compiler(zone); + + tree->Accept(&compiler, nullptr); + compiler.code_.Add(RegExpInstruction::Accept(), zone); + + return std::move(compiler.code_); + } + + private: + // TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed from + // the `tree` size we're going to compile? 
+ explicit CompileVisitor(Zone* zone) : zone_(zone), code_(0, zone) {} + + // Generate a disjunction of code fragments compiled by a function `gen_alt`. + // `gen_alt` is called repeatedly with argument `int i = 0, 1, ..., alt_num - + // 1` and should push code corresponding to the ith alternative onto `code_`. + template + void CompileDisjunction(int alt_num, F gen_alt) { + // An alternative a0 | a1 | a2 is compiled into + // FORK $a2 + // FORK $a1 + // $a0: code for a0 + // JMP $end + // $a1: code for a1 + // JMP $end + // $a2: code for a2 + // where $end is the index of the next instruction after the code for a2. + // + // By the semantics of the FORK instruction (see above at definition and + // semantics), the forked thread has lower priority than the current + // thread. This means that with the code we're generating here, the thread + // matching the alternative a0 is indeed the thread with the highest + // priority, followed by the thread for a1 and so on. + + if (alt_num == 0) { + return; + } + + // Record the index of the first of the alt_num - 1 fork instructions in the + // beginning. + int forks_begin = code_.length(); + // Add FORKs to alts[alt_num - 1], alts[alt_num - 2], ..., alts[1]. + for (int i = alt_num - 1; i != 0; --i) { + // The FORK's address is patched once we know the address of the ith + // alternative. + code_.Add(RegExpInstruction::Fork(-1), zone_); + } + + // List containing the index of the final JMP instruction after each + // alternative but the last one. + ZoneList jmp_indices(alt_num - 1, zone_); + + for (int i = 0; i != alt_num; ++i) { + if (i != 0) { + // If this is not the first alternative, we have to patch the + // corresponding FORK statement in the beginning. + code_[forks_begin + alt_num - 1 - i].payload.pc = code_.length(); + } + gen_alt(i); + if (i != alt_num - 1) { + // If this is not the last alternative, we have to emit a JMP past the + // remaining alternatives. We don't know this address yet, so we have + // to patch it once all alternatives are emitted. 
+ jmp_indices.Add(code_.length(), zone_); + code_.Add(RegExpInstruction::Jmp(-1), zone_); + } + } + + // All alternatives are emitted. Now we can patch the JMP instruction + // after each but the last alternative. + int end_index = code_.length(); + for (int jmp_index : jmp_indices) { + code_[jmp_index].payload.pc = end_index; + } + } + + void* VisitDisjunction(RegExpDisjunction* node, void*) override { + ZoneList& alts = *node->alternatives(); + CompileDisjunction(alts.length(), + [&](int i) { alts[i]->Accept(this, nullptr); }); + return nullptr; + } + + void* VisitAlternative(RegExpAlternative* node, void*) override { + for (RegExpTree* child : *node->nodes()) { + child->Accept(this, nullptr); + } + return nullptr; + } + + void* VisitAssertion(RegExpAssertion* node, void*) override { + // TODO(mbid,v8:10765): Support this case. + UNREACHABLE(); + } + + void* VisitCharacterClass(RegExpCharacterClass* node, void*) override { + // A character class is compiled as Disjunction over its `CharacterRange`s. + ZoneList* ranges = node->ranges(zone_); + CharacterRange::Canonicalize(ranges); + if (node->is_negated()) { + // Capacity 2 for the common case where we compute the complement of a + // single interval range that doesn't contain 0 and kMaxCodePoint. + ZoneList* negated = + zone_->New>(2, zone_); + CharacterRange::Negate(ranges, negated, zone_); + ranges = negated; + } + + CompileDisjunction(ranges->length(), [&](int i) { + // We don't support utf16 for now, so only ranges that can be specified + // by (complements of) ranges with uc16 bounds. 
+ STATIC_ASSERT(kMaxSupportedCodepoint <= std::numeric_limits::max()); + + uc32 from = (*ranges)[i].from(); + DCHECK_LE(from, kMaxSupportedCodepoint); + uc16 from_uc16 = static_cast(from); + + uc32 to = (*ranges)[i].to(); + DCHECK_IMPLIES(to > kMaxSupportedCodepoint, to == String::kMaxCodePoint); + uc16 to_uc16 = static_cast(std::min(to, kMaxSupportedCodepoint)); + + RegExpInstruction::Uc16Range range{from_uc16, to_uc16}; + code_.Add(RegExpInstruction::ConsumeRange(range), zone_); + }); + return nullptr; + } + + void* VisitAtom(RegExpAtom* node, void*) override { + for (uc16 c : node->data()) { + code_.Add( + RegExpInstruction::ConsumeRange(RegExpInstruction::Uc16Range{c, c}), + zone_); + } + return nullptr; + } + + void* VisitQuantifier(RegExpQuantifier* node, void*) override { + // TODO(mbid,v8:10765): For now we support a quantifier of the form /x*/, + // i.e. greedy match of any number of /x/. See also the comment in + // `CanBeHandledVisitor::VisitQuantifier`. + DCHECK_EQ(node->min(), 0); + DCHECK_EQ(node->max(), RegExpTree::kInfinity); + DCHECK(node->is_greedy()); + + // The repetition of /x/ is compiled into + // + // a: FORK d + // b: code for x + // c: JMP a + // d: ... + // + // Note that a FORKed thread has lower priority than the main thread, so + // this will indeed match greedily. + + int initial_fork_index = code_.length(); + // The FORK's address is patched once we're done. + code_.Add(RegExpInstruction::Fork(-1), zone_); + node->body()->Accept(this, nullptr); + code_.Add(RegExpInstruction::Jmp(initial_fork_index), zone_); + int end_index = code_.length(); + code_[initial_fork_index].payload.pc = end_index; + return nullptr; + } + + void* VisitCapture(RegExpCapture* node, void*) override { + // TODO(mbid,v8:10765): Support this case. 
+ UNREACHABLE(); + } + + void* VisitGroup(RegExpGroup* node, void*) override { + node->body()->Accept(this, nullptr); + return nullptr; + } + + void* VisitLookaround(RegExpLookaround* node, void*) override { + // TODO(mbid,v8:10765): Support this case. + UNREACHABLE(); + } + + void* VisitBackReference(RegExpBackReference* node, void*) override { + UNREACHABLE(); + } + + void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; } + + void* VisitText(RegExpText* node, void*) override { + for (TextElement& text_el : *node->elements()) { + text_el.tree()->Accept(this, nullptr); + } + return nullptr; + } + + private: + Zone* zone_; + ZoneList code_; +}; + +} // namespace + +ZoneList ExperimentalRegExpCompiler::Compile( + RegExpTree* tree, JSRegExp::Flags flags, Zone* zone) { + return CompileVisitor::Compile(tree, flags, zone); +} + +} // namespace internal +} // namespace v8 diff --git a/src/regexp/experimental/experimental-compiler.h b/src/regexp/experimental/experimental-compiler.h new file mode 100644 index 0000000000..542592796c --- /dev/null +++ b/src/regexp/experimental/experimental-compiler.h @@ -0,0 +1,33 @@ +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_COMPILER_H_ +#define V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_COMPILER_H_ + +#include "src/regexp/experimental/experimental-bytecode.h" +#include "src/regexp/regexp-ast.h" +#include "src/zone/zone-list.h" + +namespace v8 { +namespace internal { + +class ExperimentalRegExpCompiler final : public AllStatic { + public: + // Checks whether a given RegExpTree can be compiled into an experimental + // bytecode program. This mostly amounts to the absence of back references, + // but see the definition. + // TODO(mbid,v8:10765): Currently more things are not handled, e.g. some + // quantifiers and unicode. 
+ static bool CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags, Zone* zone); + // Compile regexp into a bytecode program. The regexp must be handlable by + // the experimental engine; see`CanBeHandled`. The program is returned as a + // ZoneList backed by the same Zone that is used in the RegExpTree argument. + static ZoneList Compile(RegExpTree* tree, + JSRegExp::Flags flags, Zone* zone); +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_COMPILER_H_ diff --git a/src/regexp/experimental/experimental-interpreter.cc b/src/regexp/experimental/experimental-interpreter.cc new file mode 100644 index 0000000000..1e0b7528f2 --- /dev/null +++ b/src/regexp/experimental/experimental-interpreter.cc @@ -0,0 +1,349 @@ +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "src/regexp/experimental/experimental-interpreter.h" + +#include "src/base/optional.h" +#include "src/base/small-vector.h" + +namespace v8 { +namespace internal { + +using MatchRange = ExperimentalRegExpInterpreter::MatchRange; + +namespace { + +template +class NfaInterpreter { + // Executes a bytecode program in breadth-first mode, without backtracking. + // `Character` can be instantiated with `uint8_t` or `uc16` for one byte or + // two byte input strings. + // + // In contrast to the backtracking implementation, this has linear time + // complexity in the length of the input string. Breadth-first mode means + // that threads are executed in lockstep with respect to their input + // position, i.e. the threads share a common input index. This is similar + // to breadth-first simulation of a non-deterministic finite automaton (nfa), + // hence the name of the class. + // + // To follow the semantics of a backtracking VM implementation, we have to be + // careful about whether we stop execution when a thread executes ACCEPT. 
+ // For example, consider execution of the bytecode generated by the regexp + // + // r = /abc|..|[a-c]{10,}/ + // + // on input "abcccccccccccccc". Clearly the three alternatives + // - /abc/ + // - /../ + // - /[a-c]{10,}/ + // all match this input. A backtracking implementation will report "abc" as + // match, because it explores the first alternative before the others. + // + // However, if we execute breadth first, then we execute the 3 threads + // - t1, which tries to match /abc/ + // - t2, which tries to match /../ + // - t3, which tries to match /[a-c]{10,}/ + // in lockstep i.e. by iterating over the input and feeding all threads one + // character at a time. t2 will execute an ACCEPT after two characters, + // while t1 will only execute ACCEPT after three characters. Thus we find a + // match for the second alternative before a match of the first alternative. + // + // This shows that we cannot always stop searching as soon as some thread t + // executes ACCEPT: If there is a thread u with higher priority than t, then + // it must be finished first. If u produces a match, then we can discard the + // match of t because matches produced by threads with higher priority are + // preferred over matches of threads with lower priority. On the other hand, + // we are allowed to abort all threads with lower priority than t if t + // produces a match: Such threads can only produce worse matches. In the + // example above, we can abort t3 after two characters because of t2's match. + // + // Thus the interpreter keeps track of a priority-ordered list of threads. + // If a thread ACCEPTs, all threads with lower priority are discarded, and + // the search continues with the threads with higher priority. If no threads + // with high priority are left, we return the match that was produced by the + // ACCEPTing thread with highest priority. 
+ public: + NfaInterpreter(Vector bytecode, + Vector input, int32_t input_index) + : bytecode_(bytecode), + input_(input), + input_index_(input_index), + pc_last_input_index_(bytecode.size()), + active_threads_(), + blocked_threads_(), + best_match_(base::nullopt) { + DCHECK(!bytecode_.empty()); + DCHECK_GE(input_index_, 0); + DCHECK_LE(input_index_, input_.length()); + + std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1); + } + + // Finds up to `max_match_num` matches and writes their boundaries to + // `matches_out`. The search begins at the current input index. Returns the + // number of matches found. + int FindMatches(MatchRange* matches_out, int max_match_num) { + int match_num; + for (match_num = 0; match_num != max_match_num; ++match_num) { + base::Optional match = FindNextMatch(); + if (!match.has_value()) { + break; + } + + matches_out[match_num] = *match; + SetInputIndex(match->end); + } + return match_num; + } + + private: + // The state of a "thread" executing experimental regexp bytecode. (Not to + // be confused with an OS thread.) + struct InterpreterThread { + // This thread's program counter, i.e. the index within `bytecode_` of the + // next instruction to be executed. + int32_t pc; + // The index in the input string where this thread started executing. + int32_t match_begin; + }; + + // Change the current input index for future calls to `FindNextMatch`. + void SetInputIndex(int new_input_index) { + DCHECK_GE(new_input_index, 0); + DCHECK_LE(new_input_index, input_.length()); + + input_index_ = new_input_index; + } + + // Find the next match, beginning the search at `input_index_`. + base::Optional FindNextMatch() { + DCHECK(active_threads_.empty()); + // TODO(mbid,v8:10765): Can we get around resetting `pc_last_input_index_` + // here? 
As long as + // + // pc_last_input_index_[pc] < input_index_ + // + // for all possible program counters pc that are reachable without input + // from pc = 0 and + // + // pc_last_input_index_[k] <= input_index_ + // + // for all k > 0 hold I think everything should be fine. Maybe we can do + // something about this in `SetInputIndex`. + std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1); + + DCHECK(blocked_threads_.empty()); + DCHECK(active_threads_.empty()); + DCHECK_EQ(best_match_, base::nullopt); + + // All threads start at bytecode 0. + PushActiveThreadUnchecked(InterpreterThread{0, input_index_}); + // Run the initial thread, potentially forking new threads, until every + // thread is blocked without further input. + RunActiveThreads(); + + // We stop if one of the following conditions hold: + // - We have exhausted the entire input. + // - We have found a match at some point, and there are no remaining + // threads with higher priority than the thread that produced the match. + // Threads with low priority have been aborted earlier, and the remaining + // threads are blocked here, so the latter simply means that + // `blocked_threads_` is empty. + while (input_index_ != input_.length() && + !(best_match_.has_value() && blocked_threads_.empty())) { + DCHECK(active_threads_.empty()); + uc16 input_char = input_[input_index_]; + ++input_index_; + + // If we haven't found a match yet, we add a thread with least priority + // that attempts a match starting after `input_char`. + if (!best_match_.has_value()) { + active_threads_.emplace_back(InterpreterThread{0, input_index_}); + } + + // We unblock all blocked_threads_ by feeding them the input char. + FlushBlockedThreads(input_char); + + // Run all threads until they block or accept. + RunActiveThreads(); + } + + // Clean up the data structures we used. 
+ base::Optional result = best_match_; + best_match_ = base::nullopt; + blocked_threads_.clear(); + active_threads_.clear(); + + return result; + } + + // Run an active thread `t` until it executes a CONSUME_RANGE or ACCEPT + // instruction, or its PC value was already processed. + // - If processing of `t` can't continue because of CONSUME_RANGE, it is + // pushed on `blocked_threads_`. + // - If `t` executes ACCEPT, set `best_match` according to `t.match_begin` and + // the current input index. All remaining `active_threads_` are discarded. + void RunActiveThread(InterpreterThread t) { + while (true) { + RegExpInstruction inst = bytecode_[t.pc]; + switch (inst.opcode) { + case RegExpInstruction::CONSUME_RANGE: { + blocked_threads_.emplace_back(t); + return; + } + case RegExpInstruction::FORK: { + InterpreterThread fork = t; + fork.pc = inst.payload.pc; + ++t.pc; + + // t has higher priority than fork. If t.pc hasn't been processed, we + // push fork on the active_threads_ stack and continue directly with + // t. Otherwise we continue directly with fork if possible. + if (!IsPcProcessed(t.pc)) { + MarkPcProcessed(t.pc); + PushActiveThread(fork); + break; + } else if (!IsPcProcessed(fork.pc)) { + t = fork; + MarkPcProcessed(t.pc); + break; + } + return; + } + case RegExpInstruction::JMP: + t.pc = inst.payload.pc; + if (IsPcProcessed(t.pc)) return; + MarkPcProcessed(t.pc); + break; + case RegExpInstruction::ACCEPT: + best_match_ = MatchRange{t.match_begin, input_index_}; + active_threads_.clear(); + return; + } + } + } + + // Run each active thread until it can't continue without further input. + // `active_threads_` is empty afterwards. `blocked_threads_` are sorted from + // high to low priority. + void RunActiveThreads() { + while (!active_threads_.empty()) { + InterpreterThread t = active_threads_.back(); + active_threads_.pop_back(); + RunActiveThread(t); + } + } + + // Unblock all blocked_threads_ by feeding them an `input_char`. 
Should only + // be called with `input_index_` pointing to the character *after* + // `input_char` so that `pc_last_input_index_` is updated correctly. + void FlushBlockedThreads(uc16 input_char) { + // The threads in blocked_threads_ are sorted from high to low priority, + // but active_threads_ needs to be sorted from low to high priority, so we + // need to activate blocked threads in reverse order. + // + // TODO(mbid,v8:10765): base::SmallVector doesn't support `rbegin()` and + // `rend()`, should we implement that instead of this awkward iteration? + // Maybe we could at least use an int i and check for i >= 0, but + // SmallVectors don't have length() methods. + for (size_t i = blocked_threads_.size(); i > 0; --i) { + InterpreterThread t = blocked_threads_[i - 1]; + RegExpInstruction inst = bytecode_[t.pc]; + DCHECK_EQ(inst.opcode, RegExpInstruction::CONSUME_RANGE); + RegExpInstruction::Uc16Range range = inst.payload.consume_range; + if (input_char >= range.min && input_char <= range.max) { + ++t.pc; + PushActiveThreadUnchecked(t); + } + } + blocked_threads_.clear(); + } + + // It is redundant to have two threads t, t0 execute at the same PC value, + // because one of t, t0 matches iff the other does. We can thus discard + // the one with lower priority. We check whether a thread executed at some + // PC value by recording for every possible value of PC what the value of + // input_index_ was the last time a thread executed at PC. If a thread + // tries to continue execution at a PC value that we have seen before at + // the current input index, we abort it. (We execute threads with higher + // priority first, so the second thread is guaranteed to have lower + // priority.) + // + // Check whether we've seen an active thread with a given pc value since the + // last increment of `input_index_`. 
+ bool IsPcProcessed(int pc) { + DCHECK_LE(pc_last_input_index_[pc], input_index_); + return pc_last_input_index_[pc] == input_index_; + } + + // Mark a pc as having been processed since the last increment of + // `input_index_`. + void MarkPcProcessed(int pc) { + DCHECK_LE(pc_last_input_index_[pc], input_index_); + pc_last_input_index_[pc] = input_index_; + } + + // Functions to push a thread `t` onto the list of active threads, but only + // if `t.pc` was not already the pc of some other thread at the current + // subject index. The unchecked variant requires (via DCHECK) that `t.pc` + // has not been processed yet; the checked variant silently drops `t` if it + // has. Both mark `t.pc` as processed when the thread is pushed. + void PushActiveThreadUnchecked(InterpreterThread t) { + DCHECK(!IsPcProcessed(t.pc)); + + MarkPcProcessed(t.pc); + active_threads_.emplace_back(t); + } + void PushActiveThread(InterpreterThread t) { + if (IsPcProcessed(t.pc)) { + return; + } + PushActiveThreadUnchecked(t); + } + + Vector bytecode_; + Vector input_; + int input_index_; + + // TODO(mbid,v8:10765): The following `SmallVector`s have somewhat + // arbitrarily chosen small capacity sizes; should benchmark to find a good + // value. + + // pc_last_input_index_[k] records the value of input_index_ the last + // time a thread t such that t.pc == k was activated, i.e. put on + // active_threads_. Thus pc_last_input_index_.size() == bytecode_.size(). + // See also `RunActiveThread`. + base::SmallVector pc_last_input_index_; + + // Active threads can potentially (but not necessarily) continue without + // input. Sorted from low to high priority. + base::SmallVector active_threads_; + + // The pc of a blocked thread points to an instruction that consumes a + // character. Sorted from high to low priority (so the opposite of + // `active_threads_`). + base::SmallVector blocked_threads_; + + // The best match found so far during the current search. If several threads + // ACCEPTed, then this will be the match of the accepting thread with highest + // priority.
+ base::Optional best_match_; +}; + +} // namespace + +int ExperimentalRegExpInterpreter::FindMatchesNfaOneByte( + Vector bytecode, Vector input, + int start_index, MatchRange* matches_out, int max_match_num) { + NfaInterpreter interpreter(bytecode, input, start_index); + return interpreter.FindMatches(matches_out, max_match_num); +} + +int ExperimentalRegExpInterpreter::FindMatchesNfaTwoByte( + Vector bytecode, Vector input, + int start_index, MatchRange* matches_out, int max_match_num) { + NfaInterpreter interpreter(bytecode, input, start_index); + return interpreter.FindMatches(matches_out, max_match_num); +} + +} // namespace internal +} // namespace v8 diff --git a/src/regexp/experimental/experimental-interpreter.h b/src/regexp/experimental/experimental-interpreter.h new file mode 100644 index 0000000000..5dbcd27440 --- /dev/null +++ b/src/regexp/experimental/experimental-interpreter.h @@ -0,0 +1,40 @@ +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_INTERPRETER_H_ +#define V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_INTERPRETER_H_ + +#include "src/regexp/experimental/experimental-bytecode.h" +#include "src/utils/vector.h" + +namespace v8 { +namespace internal { + +class ExperimentalRegExpInterpreter final : public AllStatic { + public: + // A half-open range in a string denoting a (sub)match. Used to access + // output registers of regexp execution grouped by [begin, end) pairs. + struct MatchRange { + int32_t begin; // inclusive + int32_t end; // exclusive + }; + + // Executes a bytecode program in breadth-first NFA mode, without + // backtracking, to find matching substrings. Tries to find up to + // `max_match_num` matches in `input`, starting at `start_index`. Returns + // the actual number of matches found. The boundaries of matching subranges + // are written to `matches_out`.
Provided in variants for one-byte and + // two-byte strings. + static int FindMatchesNfaOneByte(Vector bytecode, + Vector input, int start_index, + MatchRange* matches_out, int max_match_num); + static int FindMatchesNfaTwoByte(Vector bytecode, + Vector input, int start_index, + MatchRange* matches_out, int max_match_num); +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_INTERPRETER_H_ diff --git a/src/regexp/experimental/experimental.cc b/src/regexp/experimental/experimental.cc index 31bc6523ca..916d8b02a9 100644 --- a/src/regexp/experimental/experimental.cc +++ b/src/regexp/experimental/experimental.cc @@ -4,157 +4,18 @@ #include "src/regexp/experimental/experimental.h" -#include -#include - -#include "src/base/optional.h" -#include "src/base/small-vector.h" #include "src/objects/js-regexp-inl.h" -#include "src/regexp/regexp-ast.h" +#include "src/regexp/experimental/experimental-compiler.h" +#include "src/regexp/experimental/experimental-interpreter.h" #include "src/regexp/regexp-parser.h" #include "src/utils/ostreams.h" namespace v8 { namespace internal { -namespace { - -// TODO(mbid, v8:10765): Currently the experimental engine doesn't support -// UTF-16, but this shouldn't be too hard to implement. -constexpr uc32 kMaxSupportedCodepoint = 0xFFFFu; - -class CanBeHandledVisitor final : private RegExpVisitor { - // Visitor to implement `ExperimentalRegExp::CanBeHandled`. - public: - static bool Check(RegExpTree* node, JSRegExp::Flags flags, Zone* zone) { - if (!AreSuitableFlags(flags)) { - return false; - } - CanBeHandledVisitor visitor(zone); - node->Accept(&visitor, nullptr); - return visitor.result_; - } - - private: - explicit CanBeHandledVisitor(Zone* zone) : zone_(zone) {} - - static bool AreSuitableFlags(JSRegExp::Flags flags) { - // TODO(mbid, v8:10765): We should be able to support all flags in the - // future. 
- static constexpr JSRegExp::Flags allowed_flags = JSRegExp::kGlobal; - return (flags & ~allowed_flags) == 0; - } - - void* VisitDisjunction(RegExpDisjunction* node, void*) override { - for (RegExpTree* alt : *node->alternatives()) { - alt->Accept(this, nullptr); - if (!result_) { - return nullptr; - } - } - return nullptr; - } - - void* VisitAlternative(RegExpAlternative* node, void*) override { - for (RegExpTree* child : *node->nodes()) { - child->Accept(this, nullptr); - if (!result_) { - return nullptr; - } - } - return nullptr; - } - - void* VisitCharacterClass(RegExpCharacterClass* node, void*) override { - result_ = result_ && AreSuitableFlags(node->flags()); - for (CharacterRange r : *node->ranges(zone_)) { - // TODO(mbid, v8:10765): We don't support full unicode yet, so we only - // allow character ranges that can be specified with two-byte characters. - if (r.to() > kMaxSupportedCodepoint) { - result_ = false; - return nullptr; - } - } - return nullptr; - } - - void* VisitAssertion(RegExpAssertion* node, void*) override { - // TODO(mbid, v8:10765): We should be able to support at least some - // assertions. re2 does, too. - result_ = false; - return nullptr; - } - - void* VisitAtom(RegExpAtom* node, void*) override { - result_ = result_ && AreSuitableFlags(node->flags()); - return nullptr; - } - - void* VisitText(RegExpText* node, void*) override { - for (TextElement& el : *node->elements()) { - el.tree()->Accept(this, nullptr); - if (!result_) { - return nullptr; - } - } - return nullptr; - } - - void* VisitQuantifier(RegExpQuantifier* node, void*) override { - // TODO(mbid, v8:10765): Theoretically we can support arbitrary min() and - // max(), but the size of the automaton grows linearly with finite max(). - // We probably want a cut-off value here, or maybe we can "virtualize" the - // repetitions. - // Non-greedy quantifiers are easy to implement, but not supported atm. 
- // It's not clear to me how a possessive quantifier would be implemented, - // we should check whether re2 supports this. - result_ = result_ && node->min() == 0 && - node->max() == RegExpTree::kInfinity && node->is_greedy(); - if (!result_) { - return nullptr; - } - node->body()->Accept(this, nullptr); - return nullptr; - } - - void* VisitCapture(RegExpCapture* node, void*) override { - // TODO(mbid, v8:10765): This can be implemented with the NFA interpreter, - // but not with the lazy DFA. See also re2. - result_ = false; - return nullptr; - } - - void* VisitGroup(RegExpGroup* node, void*) override { - node->body()->Accept(this, nullptr); - return nullptr; - } - - void* VisitLookaround(RegExpLookaround* node, void*) override { - // TODO(mbid, v8:10765): This will be hard to support, but not impossible I - // think. See product automata. - result_ = false; - return nullptr; - } - - void* VisitBackReference(RegExpBackReference* node, void*) override { - // This can't be implemented without backtracking. - result_ = false; - return nullptr; - } - - void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; } - - private: - bool result_ = true; - Zone* zone_; -}; - -} // namespace - bool ExperimentalRegExp::CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags, Zone* zone) { - DCHECK(FLAG_enable_experimental_regexp_engine); - return CanBeHandledVisitor::Check(tree, flags, zone); + return ExperimentalRegExpCompiler::CanBeHandled(tree, flags, zone); } void ExperimentalRegExp::Initialize(Isolate* isolate, Handle re, @@ -182,423 +43,6 @@ bool ExperimentalRegExp::IsCompiled(Handle re, Isolate* isolate) { Smi::FromInt(JSRegExp::kUninitializedValue); } -// ---------------------------------------------------------------------------- -// Definition and semantics of the EXPERIMENTAL bytecode. 
-// Background: -// - Russ Cox's blog post series on regular expression matching, in particular -// https://swtch.com/~rsc/regexp/regexp2.html -// - The re2 regular regexp library: https://github.com/google/re2 -// -// This comment describes the bytecode used by the experimental regexp engine -// and its abstract semantics in terms of a VM. An implementation of the -// semantics that avoids exponential runtime can be found in `NfaInterpreter`. -// -// The experimental bytecode describes a non-deterministic finite automaton. It -// runs on a multithreaded virtual machine (VM), i.e. in several threads -// concurrently. (These "threads" don't need to be actual operating system -// threads.) Apart from a list of threads, the VM maintains an immutable -// shared input string which threads can read from. Each thread is given by a -// program counter (PC, index of the current instruction), a fixed number of -// registers of indices into the input string, and a monotonically increasing -// index which represents the current position within the input string. -// -// For the precise encoding of the instruction set, see the definition `struct -// RegExpInstruction` below. Currently we support the following instructions: -// - CONSUME_RANGE: Check whether the codepoint of the current character is -// contained in a non-empty closed interval [min, max] specified in the -// instruction payload. Abort this thread if false, otherwise advance the -// input position by 1 and continue with the next instruction. -// - ACCEPT: Stop this thread and signify the end of a match at the current -// input position. -// - FORK: If executed by a thread t, spawn a new thread t0 whose register -// values and input position agree with those of t, but whose PC value is set -// to the value specified in the instruction payload. The register values of -// t and t0 agree directly after the FORK, but they can diverge. 
Thread t -// continues with the instruction directly after the current FORK -// instruction. -// - JMP: Instead of incrementing the PC value after execution of this -// instruction by 1, set PC of this thread to the value specified in the -// instruction payload and continue there. -// -// Special care must be exercised with respect to thread priority. It is -// possible that more than one thread executes an ACCEPT statement. The output -// of the program is given by the contents of the matching thread's registers, -// so this is ambiguous in case of multiple matches. To resolve the ambiguity, -// every implementation of the VM must output the match that a backtracking -// implementation would output (i.e. behave the same as Irregexp). -// -// A backtracking implementation of the VM maintains a stack of postponed -// threads. Upon encountering a FORK statement, this VM will create a copy of -// the current thread, set the copy's PC value according to the instruction -// payload, and push it to the stack of postponed threads. The VM will then -// continue execution of the current thread. -// -// If at some point a thread t executes a MATCH statement, the VM stops and -// outputs the registers of t. Postponed threads are discarded. On the other -// hand, if a thread t is aborted because some input character didn't pass a -// check, then the VM pops the topmost postponed thread and continues execution -// with this thread. If there are no postponed threads, then the VM outputs -// failure, i.e. no matches. -// -// Equivalently, we can describe the behavior of the backtracking VM in terms -// of priority: Threads are linearly ordered by priority, and matches generated -// by threads with high priority must be preferred over matches generated by -// threads with low priority, regardless of the chronological order in which -// matches were found. 
If a thread t executes a FORK statement and spawns a -// thread t0, then the priority of t0 is such that the following holds: -// * t0 < t, i.e. t0 has lower priority than t. -// * For all threads u such that u != t and u != t0, we have t0 < u iff t < u, -// i.e. the t0 compares to other threads the same as t. -// For example, if there are currently 3 threads s, t, u such that s < t < u, -// then after t executes a fork, the thread priorities will be s < t0 < t < u. - -namespace { - -struct Uc16Range { - uc16 min; // Inclusive. - uc16 max; // Inclusive. -}; - -// Bytecode format. -// Currently very simple fixed-size: The opcode is encoded in the first 4 -// bytes, the payload takes another 4 bytes. -struct RegExpInstruction { - enum Opcode : int32_t { - CONSUME_RANGE, - FORK, - JMP, - ACCEPT, - }; - - static RegExpInstruction ConsumeRange(Uc16Range consume_range) { - RegExpInstruction result; - result.opcode = CONSUME_RANGE; - result.payload.consume_range = consume_range; - return result; - } - - static RegExpInstruction Fork(int32_t alt_index) { - RegExpInstruction result; - result.opcode = FORK; - result.payload.pc = alt_index; - return result; - } - - static RegExpInstruction Jmp(int32_t alt_index) { - RegExpInstruction result; - result.opcode = JMP; - result.payload.pc = alt_index; - return result; - } - - static RegExpInstruction Accept() { - RegExpInstruction result; - result.opcode = ACCEPT; - return result; - } - - Opcode opcode; - union { - // Payload of CONSUME_RANGE: - Uc16Range consume_range; - // Payload of FORK and JMP, the next/forked program counter (pc): - int32_t pc; - } payload; - STATIC_ASSERT(sizeof(payload) == 4); -}; -STATIC_ASSERT(sizeof(RegExpInstruction) == 8); -// TODO(mbid,v8:10765): This is rather wasteful. We can fit the opcode in 2-3 -// bits, so the remaining 29/30 bits can be used as payload. Problem: The -// payload of CONSUME_RANGE consists of two 16-bit values `min` and `max`, so -// this wouldn't fit. 
We could encode the payload of a CONSUME_RANGE -// instruction by the start of the interval and its length instead, and then -// only allows lengths that fit into 14/13 bits. A longer range can then be -// encoded as a disjunction of smaller ranges. -// -// Another thought: CONSUME_RANGEs are only valid if the payloads are such that -// min <= max. Thus there are -// -// 2^16 + 2^16 - 1 + ... + 1 -// = 2^16 * (2^16 + 1) / 2 -// = 2^31 + 2^15 -// -// valid payloads for a CONSUME_RANGE instruction. If we want to fit -// instructions into 4 bytes, we would still have almost 2^31 instructions left -// over if we encode everything as tight as possible. For example, we could -// use another 2^29 values for JMP, another 2^29 for FORK, 1 value for ACCEPT, -// and then still have almost 2^30 instructions left over for something like -// zero-width assertions and captures. - -std::ostream& PrintAsciiOrHex(std::ostream& os, uc16 c) { - if (c < 128 && std::isprint(c)) { - os << static_cast(c); - } else { - os << "0x" << std::hex << static_cast(c); - } - return os; -} - -std::ostream& operator<<(std::ostream& os, const RegExpInstruction& inst) { - switch (inst.opcode) { - case RegExpInstruction::CONSUME_RANGE: { - os << "CONSUME_RANGE ["; - PrintAsciiOrHex(os, inst.payload.consume_range.min); - os << ", "; - PrintAsciiOrHex(os, inst.payload.consume_range.max); - os << "]"; - break; - } - case RegExpInstruction::FORK: - os << "FORK " << inst.payload.pc; - break; - case RegExpInstruction::JMP: - os << "JMP " << inst.payload.pc; - break; - case RegExpInstruction::ACCEPT: - os << "ACCEPT"; - break; - } - return os; -} - -// The maximum number of digits required to display a non-negative number < n -// in base 10. 
-int DigitsRequiredBelow(int n) { - DCHECK_GE(n, 0); - - int result = 1; - for (int i = 10; i < n; i *= 10) { - result += 1; - } - return result; -} - -std::ostream& operator<<(std::ostream& os, - Vector insts) { - int inst_num = insts.length(); - int line_digit_num = DigitsRequiredBelow(inst_num); - - for (int i = 0; i != inst_num; ++i) { - const RegExpInstruction& inst = insts[i]; - os << std::setfill('0') << std::setw(line_digit_num) << i << ": " << inst - << std::endl; - } - return os; -} - -Vector AsInstructionSequence(ByteArray raw_bytes) { - RegExpInstruction* inst_begin = - reinterpret_cast(raw_bytes.GetDataStartAddress()); - int inst_num = raw_bytes.length() / sizeof(RegExpInstruction); - DCHECK_EQ(sizeof(RegExpInstruction) * inst_num, raw_bytes.length()); - return Vector(inst_begin, inst_num); -} - -class Compiler : private RegExpVisitor { - public: - static Handle Compile(RegExpTree* tree, Isolate* isolate, - Zone* zone) { - Compiler compiler(zone); - - tree->Accept(&compiler, nullptr); - compiler.code_.Add(RegExpInstruction::Accept(), zone); - - int byte_length = sizeof(RegExpInstruction) * compiler.code_.length(); - Handle array = isolate->factory()->NewByteArray(byte_length); - MemCopy(array->GetDataStartAddress(), compiler.code_.begin(), byte_length); - - return array; - } - - private: - // TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed from - // the `tree` size we're going to compile? - explicit Compiler(Zone* zone) : zone_(zone), code_(0, zone) {} - - // Generate a disjunction of code fragments compiled by a function `alt_gen`. - // `alt_gen` is called repeatedly with argument `int i = 0, 1, ..., alt_num - - // 1` and should push code corresponding to the ith alternative onto `code_`. - template - void CompileDisjunction(int alt_num, F gen_alt) { - // An alternative a0 | a1 | a2 is compiled into - // FORK - // FORK - // - // JMP $end - // - // JMP $end - // - // where $end is the index of the next instruction after . 
- // - // By the semantics of the FORK instruction (see above at definition and - // semantics), the forked thread has lower priority than the current - // thread. This means that with the code we're generating here, the thread - // matching the alternative a0 is indeed the thread with the highest - // priority, followed by the thread for a1 and so on. - - if (alt_num == 0) { - return; - } - - // Record the index of the first of the alt_num - 1 fork instructions in the - // beginning. - int forks_begin = code_.length(); - // Add FORKs to alts[alt_num - 1], alts[alt_num - 2], ..., alts[1]. - for (int i = alt_num - 1; i != 0; --i) { - // The FORK's address is patched once we know the address of the ith - // alternative. - code_.Add(RegExpInstruction::Fork(-1), zone_); - } - - // List containing the index of the final JMP instruction after each - // alternative but the last one. - ZoneList jmp_indices(alt_num - 1, zone_); - - for (int i = 0; i != alt_num; ++i) { - if (i != 0) { - // If this is not the first alternative, we have to patch the - // corresponding FORK statement in the beginning. - code_[forks_begin + alt_num - 1 - i].payload.pc = code_.length(); - } - gen_alt(i); - if (i != alt_num - 1) { - // If this is not the last alternative, we have to emit a JMP past the - // remaining alternatives. We don't know this address yet, so we have - // to patch patch it once all alternatives are emitted. - jmp_indices.Add(code_.length(), zone_); - code_.Add(RegExpInstruction::Jmp(-1), zone_); - } - } - - // All alternatives are emitted. Now we can patch the JMP instruction - // after each but the last alternative. 
- int end_index = code_.length(); - for (int jmp_index : jmp_indices) { - code_[jmp_index].payload.pc = end_index; - } - } - - void* VisitDisjunction(RegExpDisjunction* node, void*) override { - ZoneList& alts = *node->alternatives(); - CompileDisjunction(alts.length(), - [&](int i) { alts[i]->Accept(this, nullptr); }); - return nullptr; - } - - void* VisitAlternative(RegExpAlternative* node, void*) override { - for (RegExpTree* child : *node->nodes()) { - child->Accept(this, nullptr); - } - return nullptr; - } - - void* VisitAssertion(RegExpAssertion* node, void*) override { - // TODO(mbid,v8:10765): Support this case. - UNREACHABLE(); - } - - void* VisitCharacterClass(RegExpCharacterClass* node, void*) override { - // A character class is compiled as Disjunction over its `CharacterRange`s. - ZoneList* ranges = node->ranges(zone_); - CharacterRange::Canonicalize(ranges); - if (node->is_negated()) { - // Capacity 2 for the common case where we compute the complement of a - // single interval range that doesn't contain 0 and kMaxCodePoint. - ZoneList* negated = - zone_->New>(2, zone_); - CharacterRange::Negate(ranges, negated, zone_); - ranges = negated; - } - - CompileDisjunction(ranges->length(), [&](int i) { - // We don't support utf16 for now, so only ranges that can be specified - // by (complements of) ranges with uc16 bounds. 
- STATIC_ASSERT(kMaxSupportedCodepoint <= std::numeric_limits::max()); - - uc32 from = (*ranges)[i].from(); - DCHECK_LE(from, kMaxSupportedCodepoint); - uc16 from_uc16 = static_cast(from); - - uc32 to = (*ranges)[i].to(); - DCHECK_IMPLIES(to > kMaxSupportedCodepoint, to == String::kMaxCodePoint); - uc16 to_uc16 = static_cast(std::min(to, kMaxSupportedCodepoint)); - - Uc16Range range{from_uc16, to_uc16}; - code_.Add(RegExpInstruction::ConsumeRange(range), zone_); - }); - return nullptr; - } - - void* VisitAtom(RegExpAtom* node, void*) override { - for (uc16 c : node->data()) { - code_.Add(RegExpInstruction::ConsumeRange(Uc16Range{c, c}), zone_); - } - return nullptr; - } - - void* VisitQuantifier(RegExpQuantifier* node, void*) override { - // TODO(mbid,v8:10765): For now we support a quantifier of the form /x*/, - // i.e. greedy match of any number of /x/. See also the comment in - // `CanBeHandledVisitor::VisitQuantifier`. - DCHECK_EQ(node->min(), 0); - DCHECK_EQ(node->max(), RegExpTree::kInfinity); - DCHECK(node->is_greedy()); - - // The repetition of /x/ is compiled into - // - // a: FORK d - // b: - // c: JMP a - // d: ... - // - // Note that a FORKed thread has lower priority than the main thread, so - // this will indeed match greedily. - - int initial_fork_index = code_.length(); - // The FORK's address is patched once we're done. - code_.Add(RegExpInstruction::Fork(-1), zone_); - node->body()->Accept(this, nullptr); - code_.Add(RegExpInstruction::Jmp(initial_fork_index), zone_); - int end_index = code_.length(); - code_[initial_fork_index].payload.pc = end_index; - return nullptr; - } - - void* VisitCapture(RegExpCapture* node, void*) override { - // TODO(mbid,v8:10765): Support this case. - UNREACHABLE(); - } - - void* VisitGroup(RegExpGroup* node, void*) override { - node->body()->Accept(this, nullptr); - return nullptr; - } - - void* VisitLookaround(RegExpLookaround* node, void*) override { - // TODO(mbid,v8:10765): Support this case. 
- UNREACHABLE(); - } - - void* VisitBackReference(RegExpBackReference* node, void*) override { - UNREACHABLE(); - } - - void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; } - - void* VisitText(RegExpText* node, void*) override { - for (TextElement& text_el : *node->elements()) { - text_el.tree()->Accept(this, nullptr); - } - return nullptr; - } - - private: - Zone* zone_; - ZoneList code_; -}; - -} // namespace - void ExperimentalRegExp::Compile(Isolate* isolate, Handle re) { DCHECK_EQ(re->TypeTag(), JSRegExp::EXPERIMENTAL); #ifdef VERIFY_HEAP @@ -624,341 +68,32 @@ void ExperimentalRegExp::Compile(Isolate* isolate, Handle re) { RegExpParser::ParseRegExp(isolate, &zone, &reader, flags, &parse_result); CHECK(parse_success); - Handle bytecode = - Compiler::Compile(parse_result.tree, isolate, &zone); - re->SetDataAt(JSRegExp::kIrregexpLatin1BytecodeIndex, *bytecode); - re->SetDataAt(JSRegExp::kIrregexpUC16BytecodeIndex, *bytecode); + ZoneList bytecode = + ExperimentalRegExpCompiler::Compile(parse_result.tree, flags, &zone); + + int byte_length = sizeof(RegExpInstruction) * bytecode.length(); + Handle bytecode_byte_array = + isolate->factory()->NewByteArray(byte_length); + MemCopy(bytecode_byte_array->GetDataStartAddress(), bytecode.begin(), + byte_length); + + re->SetDataAt(JSRegExp::kIrregexpLatin1BytecodeIndex, *bytecode_byte_array); + re->SetDataAt(JSRegExp::kIrregexpUC16BytecodeIndex, *bytecode_byte_array); Handle trampoline = BUILTIN_CODE(isolate, RegExpExperimentalTrampoline); re->SetDataAt(JSRegExp::kIrregexpLatin1CodeIndex, *trampoline); re->SetDataAt(JSRegExp::kIrregexpUC16CodeIndex, *trampoline); } -namespace { +Vector AsInstructionSequence(ByteArray raw_bytes) { + RegExpInstruction* inst_begin = + reinterpret_cast(raw_bytes.GetDataStartAddress()); + int inst_num = raw_bytes.length() / sizeof(RegExpInstruction); + DCHECK_EQ(sizeof(RegExpInstruction) * inst_num, raw_bytes.length()); + return Vector(inst_begin, inst_num); +} -// A half-open 
range in the input string denoting a (sub)match. Used to access -// output registers of a regexp match grouped by [begin, end) pairs. -struct MatchRange { - int32_t begin; // inclusive - int32_t end; // exclusive -}; - -template -class NfaInterpreter { - // Executes a bytecode program in breadth-first mode, without backtracking. - // `Character` can be instantiated with `uint8_t` or `uc16` for one byte or - // two byte input strings. - // - // In contrast to the backtracking implementation, this has linear time - // complexity in the length of the input string. Breadth-first mode means - // that threads are executed in lockstep with respect to their input - // position, i.e. the threads share a common input index. This is similar - // to breadth-first simulation of a non-deterministic finite automaton (nfa), - // hence the name of the class. - // - // To follow the semantics of a backtracking VM implementation, we have to be - // careful about whether we stop execution when a thread executes ACCEPT. - // For example, consider execution of the bytecode generated by the regexp - // - // r = /abc|..|[a-c]{10,}/ - // - // on input "abcccccccccccccc". Clearly the three alternatives - // - /abc/ - // - /../ - // - /[a-c]{10,}/ - // all match this input. A backtracking implementation will report "abc" as - // match, because it explores the first alternative before the others. - // - // However, if we execute breadth first, then we execute the 3 threads - // - t1, which tries to match /abc/ - // - t2, which tries to match /../ - // - t3, which tries to match /[a-c]{10,}/ - // in lockstep i.e. by iterating over the input and feeding all threads one - // character at a time. t2 will execute an ACCEPT after two characters, - // while t1 will only execute ACCEPT after three characters. Thus we find a - // match for the second alternative before a match of the first alternative. 
- // - // This shows that we cannot always stop searching as soon as some thread t - // executes ACCEPT: If there is a thread u with higher priority than t, then - // it must be finished first. If u produces a match, then we can discard the - // match of t because matches produced by threads with higher priority are - // preferred over matches of threads with lower priority. On the other hand, - // we are allowed to abort all threads with lower priority than t if t - // produces a match: Such threads can only produce worse matches. In the - // example above, we can abort t3 after two characters because of t2's match. - // - // Thus the interpreter keeps track of a priority-ordered list of threads. - // If a thread ACCEPTs, all threads with lower priority are discarded, and - // the search continues with the threads with higher priority. If no threads - // with high priority are left, we return the match that was produced by the - // ACCEPTing thread with highest priority. - public: - NfaInterpreter(Vector bytecode, - Vector input, int32_t input_index) - : bytecode_(bytecode), - input_(input), - input_index_(input_index), - pc_last_input_index_(bytecode.size()), - active_threads_(), - blocked_threads_(), - best_match_(base::nullopt) { - DCHECK(!bytecode_.empty()); - DCHECK_GE(input_index_, 0); - DCHECK_LE(input_index_, input_.length()); - - std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1); - } - - // Finds up to `max_match_num` matches and writes their boundaries to - // `matches_out`. The search begins at the current input index. Returns the - // number of matches found. 
- int FindMatches(MatchRange* matches_out, int max_match_num) { - int match_num; - for (match_num = 0; match_num != max_match_num; ++match_num) { - base::Optional match = FindNextMatch(); - if (!match.has_value()) { - break; - } - - matches_out[match_num] = *match; - SetInputIndex(match->end); - } - return match_num; - } - - private: - // The state of a "thread" executing experimental regexp bytecode. (Not to - // be confused with an OS thread.) - struct InterpreterThread { - // This thread's program counter, i.e. the index within `bytecode_` of the - // next instruction to be executed. - int32_t pc; - // The index in the input string where this thread started executing. - int32_t match_begin; - }; - - // Change the current input index for future calls to `FindNextMatch`. - void SetInputIndex(int new_input_index) { - DCHECK_GE(input_index_, 0); - DCHECK_LE(input_index_, input_.length()); - - input_index_ = new_input_index; - } - - // Find the next match, begin search at input_index_; - base::Optional FindNextMatch() { - DCHECK(active_threads_.empty()); - // TODO(mbid,v8:10765): Can we get around resetting `pc_last_input_index_` - // here? As long as - // - // pc_last_input_index_[pc] < input_index_ - // - // for all possible program counters pc that are reachable without input - // from pc = 0 and - // - // pc_last_input_index_[k] <= input_index_ - // - // for all k > 0 hold I think everything should be fine. Maybe we can do - // something about this in `SetInputIndex`. - std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1); - - DCHECK(blocked_threads_.empty()); - DCHECK(active_threads_.empty()); - DCHECK_EQ(best_match_, base::nullopt); - - // All threads start at bytecode 0. - PushActiveThreadUnchecked(InterpreterThread{0, input_index_}); - // Run the initial thread, potentially forking new threads, until every - // thread is blocked without further input. 
- RunActiveThreads(); - - // We stop if one of the following conditions hold: - // - We have exhausted the entire input. - // - We have found a match at some point, and there are no remaining - // threads with higher priority than the thread that produced the match. - // Threads with low priority have been aborted earlier, and the remaining - // threads are blocked here, so the latter simply means that - // `blocked_threads_` is empty. - while (input_index_ != input_.length() && - !(best_match_.has_value() && blocked_threads_.empty())) { - DCHECK(active_threads_.empty()); - uc16 input_char = input_[input_index_]; - ++input_index_; - - // If we haven't found a match yet, we add a thread with least priority - // that attempts a match starting after `input_char`. - if (!best_match_.has_value()) { - active_threads_.emplace_back(InterpreterThread{0, input_index_}); - } - - // We unblock all blocked_threads_ by feeding them the input char. - FlushBlockedThreads(input_char); - - // Run all threads until they block or accept. - RunActiveThreads(); - } - - // Clean up the data structures we used. - base::Optional result = best_match_; - best_match_ = base::nullopt; - blocked_threads_.clear(); - active_threads_.clear(); - - return result; - } - - // Run an active thread `t` until it executes a CONSUME_RANGE or ACCEPT - // instruction, or its PC value was already processed. - // - If processing of `t` can't continue because of CONSUME_RANGE, it is - // pushed on `blocked_threads_`. - // - If `t` executes ACCEPT, set `best_match` according to `t.match_begin` and - // the current input index. All remaining `active_threads_` are discarded. 
- void RunActiveThread(InterpreterThread t) { - while (true) { - RegExpInstruction inst = bytecode_[t.pc]; - switch (inst.opcode) { - case RegExpInstruction::CONSUME_RANGE: { - blocked_threads_.emplace_back(t); - return; - } - case RegExpInstruction::FORK: { - InterpreterThread fork = t; - fork.pc = inst.payload.pc; - ++t.pc; - - // t has higher priority than fork. If t.pc hasn't been processed,we - // push fork on the active_thread_ stack and continue directly with - // t. Otherwise we continue directly with fork if possible. - if (!IsPcProcessed(t.pc)) { - MarkPcProcessed(t.pc); - PushActiveThread(fork); - break; - } else if (!IsPcProcessed(fork.pc)) { - t = fork; - MarkPcProcessed(t.pc); - break; - } - return; - } - case RegExpInstruction::JMP: - t.pc = inst.payload.pc; - if (IsPcProcessed(t.pc)) return; - MarkPcProcessed(t.pc); - break; - case RegExpInstruction::ACCEPT: - best_match_ = MatchRange{t.match_begin, input_index_}; - active_threads_.clear(); - return; - } - } - } - - // Run each active thread until it can't continue without further input. - // `active_threads_` is empty afterwards. `blocked_threads_` are sorted from - // low to high priority. - void RunActiveThreads() { - while (!active_threads_.empty()) { - InterpreterThread t = active_threads_.back(); - active_threads_.pop_back(); - RunActiveThread(t); - } - } - - // Unblock all blocked_threads_ by feeding them an `input_char`. Should only - // be called with `input_index_` pointing to the character *after* - // `input_char` so that `pc_last_input_index_` is updated correctly. - void FlushBlockedThreads(uc16 input_char) { - // The threads in blocked_threads_ are sorted from high to low priority, - // but active_threads_ needs to be sorted from low to high priority, so we - // need to activate blocked threads in reverse order. - // - // TODO(mbid,v8:10765): base::SmallVector doesn't support `rbegin()` and - // `rend()`, should we implement that instead of this awkward iteration? 
- // Maybe we could at least use an int i and check for i >= 0, but - // SmallVectors don't have length() methods. - for (size_t i = blocked_threads_.size(); i > 0; --i) { - InterpreterThread t = blocked_threads_[i - 1]; - RegExpInstruction inst = bytecode_[t.pc]; - DCHECK_EQ(inst.opcode, RegExpInstruction::CONSUME_RANGE); - Uc16Range range = inst.payload.consume_range; - if (input_char >= range.min && input_char <= range.max) { - ++t.pc; - PushActiveThreadUnchecked(t); - } - } - blocked_threads_.clear(); - } - - // It is redundant to have two threads t, t0 execute at the same PC value, - // because one of t, t0 matches iff the other does. We can thus discard - // the one with lower priority. We check whether a thread executed at some - // PC value by recording for every possible value of PC what the value of - // input_index_ was the last time a thread executed at PC. If a thread - // tries to continue execution at a PC value that we have seen before at - // the current input index, we abort it. (We execute threads with higher - // priority first, so the second thread is guaranteed to have lower - // priority.) - // - // Check whether we've seen an active thread with a given pc value since the - // last increment of `input_index_`. - bool IsPcProcessed(int pc) { - DCHECK_LE(pc_last_input_index_[pc], input_index_); - return pc_last_input_index_[pc] == input_index_; - } - - // Mark a pc as having been processed since the last increment of - // `input_index_`. - void MarkPcProcessed(int pc) { - DCHECK_LE(pc_last_input_index_[pc], input_index_); - pc_last_input_index_[pc] = input_index_; - } - - // Functions to push a thread `t` onto the list of active threads, but only - // if `t.pc` was not already the pc of some other thread at the current - // subject index. 
- void PushActiveThreadUnchecked(InterpreterThread t) { - DCHECK(!IsPcProcessed(t.pc)); - - MarkPcProcessed(t.pc); - active_threads_.emplace_back(t); - } - void PushActiveThread(InterpreterThread t) { - if (IsPcProcessed(t.pc)) { - return; - } - PushActiveThreadUnchecked(t); - } - - Vector bytecode_; - Vector input_; - int input_index_; - - // TODO(mbid,v8:10765): The following `SmallVector`s have somehwat - // arbitrarily chosen small capacity sizes; should benchmark to find a good - // value. - - // pc_last_input_index_[k] records the value of input_index_ the last - // time a thread t such that t.pc == k was activated, i.e. put on - // active_threads_. Thus pc_last_input_index.size() == bytecode.size(). See - // also `RunActiveThread`. - base::SmallVector pc_last_input_index_; - - // Active threads can potentially (but not necessarily) continue without - // input. Sorted from low to high priority. - base::SmallVector active_threads_; - - // The pc of a blocked thread points to an instruction that consumes a - // character. Sorted from high to low priority (so the opposite of - // `active_threads_`). - base::SmallVector blocked_threads_; - - // The best match found so far during the current search. If several threads - // ACCEPTed, then this will be the match of the accepting thread with highest - // priority. - base::Optional best_match_; -}; - -} // namespace +using MatchRange = ExperimentalRegExpInterpreter::MatchRange; // Returns the number of matches. 
int32_t ExperimentalRegExp::ExecRaw(JSRegExp regexp, String subject, @@ -990,13 +125,13 @@ int32_t ExperimentalRegExp::ExecRaw(JSRegExp regexp, String subject, const int32_t max_match_num = output_register_count / 2; if (subject_content.IsOneByte()) { - NfaInterpreter interpreter( - bytecode, subject_content.ToOneByteVector(), subject_index); - return interpreter.FindMatches(matches, max_match_num); + return ExperimentalRegExpInterpreter::FindMatchesNfaOneByte( + bytecode, subject_content.ToOneByteVector(), subject_index, matches, + max_match_num); } else { - NfaInterpreter interpreter(bytecode, subject_content.ToUC16Vector(), - subject_index); - return interpreter.FindMatches(matches, max_match_num); + return ExperimentalRegExpInterpreter::FindMatchesNfaTwoByte( + bytecode, subject_content.ToUC16Vector(), subject_index, matches, + max_match_num); } }