[regexp] Split experimental regexp code into multiple files
Bug: v8:10765 Change-Id: I49e425d861d900ab66b6f7801cddec8a7175ac03 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2385462 Commit-Queue: Martin Bidlingmaier <mbid@google.com> Reviewed-by: Jakob Gruber <jgruber@chromium.org> Cr-Commit-Position: refs/heads/master@{#69637}
This commit is contained in:
parent
7c912ffac1
commit
e2aa1a89dd
6
BUILD.gn
6
BUILD.gn
@ -3067,6 +3067,12 @@ v8_source_set("v8_base_without_compiler") {
|
||||
"src/profiler/tick-sample.h",
|
||||
"src/profiler/tracing-cpu-profiler.cc",
|
||||
"src/profiler/tracing-cpu-profiler.h",
|
||||
"src/regexp/experimental/experimental-bytecode.cc",
|
||||
"src/regexp/experimental/experimental-bytecode.h",
|
||||
"src/regexp/experimental/experimental-compiler.cc",
|
||||
"src/regexp/experimental/experimental-compiler.h",
|
||||
"src/regexp/experimental/experimental-interpreter.cc",
|
||||
"src/regexp/experimental/experimental-interpreter.h",
|
||||
"src/regexp/experimental/experimental.cc",
|
||||
"src/regexp/experimental/experimental.h",
|
||||
"src/regexp/property-sequences.cc",
|
||||
|
78
src/regexp/experimental/experimental-bytecode.cc
Normal file
78
src/regexp/experimental/experimental-bytecode.cc
Normal file
@ -0,0 +1,78 @@
|
||||
// Copyright 2020 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "src/regexp/experimental/experimental-bytecode.h"
|
||||
|
||||
#include <iomanip>
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
namespace {
|
||||
|
||||
// Writes `c` to `os`: as the literal character if it is printable ASCII,
// otherwise as a "0x"-prefixed hexadecimal number. Returns `os` so that calls
// can be chained.
std::ostream& PrintAsciiOrHex(std::ostream& os, uc16 c) {
  if (c < 128 && std::isprint(c)) {
    os << static_cast<char>(c);
  } else {
    // std::hex is sticky. Without restoring the previous format flags, every
    // integer written to `os` afterwards (e.g. FORK/JMP targets or instruction
    // indices) would silently be printed in hexadecimal as well.
    const std::ios_base::fmtflags saved_flags = os.flags();
    os << "0x" << std::hex << static_cast<int>(c);
    os.flags(saved_flags);
  }
  return os;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const RegExpInstruction& inst) {
|
||||
switch (inst.opcode) {
|
||||
case RegExpInstruction::CONSUME_RANGE: {
|
||||
os << "CONSUME_RANGE [";
|
||||
PrintAsciiOrHex(os, inst.payload.consume_range.min);
|
||||
os << ", ";
|
||||
PrintAsciiOrHex(os, inst.payload.consume_range.max);
|
||||
os << "]";
|
||||
break;
|
||||
}
|
||||
case RegExpInstruction::FORK:
|
||||
os << "FORK " << inst.payload.pc;
|
||||
break;
|
||||
case RegExpInstruction::JMP:
|
||||
os << "JMP " << inst.payload.pc;
|
||||
break;
|
||||
case RegExpInstruction::ACCEPT:
|
||||
os << "ACCEPT";
|
||||
break;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
// The maximum number of digits required to display a non-negative number < n
|
||||
// in base 10.
|
||||
int DigitsRequiredBelow(int n) {
|
||||
DCHECK_GE(n, 0);
|
||||
|
||||
int result = 1;
|
||||
for (int i = 10; i < n; i *= 10) {
|
||||
result += 1;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
std::ostream& operator<<(std::ostream& os,
|
||||
Vector<const RegExpInstruction> insts) {
|
||||
int inst_num = insts.length();
|
||||
int line_digit_num = DigitsRequiredBelow(inst_num);
|
||||
|
||||
for (int i = 0; i != inst_num; ++i) {
|
||||
const RegExpInstruction& inst = insts[i];
|
||||
os << std::setfill('0') << std::setw(line_digit_num) << i << ": " << inst
|
||||
<< std::endl;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
167
src/regexp/experimental/experimental-bytecode.h
Normal file
167
src/regexp/experimental/experimental-bytecode.h
Normal file
@ -0,0 +1,167 @@
|
||||
// Copyright 2020 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_BYTECODE_H_
|
||||
#define V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_BYTECODE_H_
|
||||
|
||||
#include <ios>
|
||||
|
||||
#include "src/utils/vector.h"
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Definition and semantics of the EXPERIMENTAL bytecode.
|
||||
// Background:
|
||||
// - Russ Cox's blog post series on regular expression matching, in particular
|
||||
// https://swtch.com/~rsc/regexp/regexp2.html
|
||||
// - The re2 regular expression library: https://github.com/google/re2
|
||||
//
|
||||
// This comment describes the bytecode used by the experimental regexp engine
|
||||
// and its abstract semantics in terms of a VM. An implementation of the
|
||||
// semantics that avoids exponential runtime can be found in `NfaInterpreter`.
|
||||
//
|
||||
// The experimental bytecode describes a non-deterministic finite automaton. It
|
||||
// runs on a multithreaded virtual machine (VM), i.e. in several threads
|
||||
// concurrently. (These "threads" don't need to be actual operating system
|
||||
// threads.) Apart from a list of threads, the VM maintains an immutable
|
||||
// shared input string which threads can read from. Each thread is given by a
|
||||
// program counter (PC, index of the current instruction), a fixed number of
|
||||
// registers of indices into the input string, and a monotonically increasing
|
||||
// index which represents the current position within the input string.
|
||||
//
|
||||
// For the precise encoding of the instruction set, see the definition `struct
|
||||
// RegExpInstruction` below. Currently we support the following instructions:
|
||||
// - CONSUME_RANGE: Check whether the codepoint of the current character is
|
||||
// contained in a non-empty closed interval [min, max] specified in the
|
||||
// instruction payload. Abort this thread if false, otherwise advance the
|
||||
// input position by 1 and continue with the next instruction.
|
||||
// - ACCEPT: Stop this thread and signify the end of a match at the current
|
||||
// input position.
|
||||
// - FORK: If executed by a thread t, spawn a new thread t0 whose register
|
||||
// values and input position agree with those of t, but whose PC value is set
|
||||
// to the value specified in the instruction payload. The register values of
|
||||
// t and t0 agree directly after the FORK, but they can diverge. Thread t
|
||||
// continues with the instruction directly after the current FORK
|
||||
// instruction.
|
||||
// - JMP: Instead of incrementing the PC value after execution of this
|
||||
// instruction by 1, set PC of this thread to the value specified in the
|
||||
// instruction payload and continue there.
|
||||
//
|
||||
// Special care must be exercised with respect to thread priority. It is
|
||||
// possible that more than one thread executes an ACCEPT statement. The output
|
||||
// of the program is given by the contents of the matching thread's registers,
|
||||
// so this is ambiguous in case of multiple matches. To resolve the ambiguity,
|
||||
// every implementation of the VM must output the match that a backtracking
|
||||
// implementation would output (i.e. behave the same as Irregexp).
|
||||
//
|
||||
// A backtracking implementation of the VM maintains a stack of postponed
|
||||
// threads. Upon encountering a FORK statement, this VM will create a copy of
|
||||
// the current thread, set the copy's PC value according to the instruction
|
||||
// payload, and push it to the stack of postponed threads. The VM will then
|
||||
// continue execution of the current thread.
|
||||
//
|
||||
// If at some point a thread t executes an ACCEPT statement, the VM stops and
|
||||
// outputs the registers of t. Postponed threads are discarded. On the other
|
||||
// hand, if a thread t is aborted because some input character didn't pass a
|
||||
// check, then the VM pops the topmost postponed thread and continues execution
|
||||
// with this thread. If there are no postponed threads, then the VM outputs
|
||||
// failure, i.e. no matches.
|
||||
//
|
||||
// Equivalently, we can describe the behavior of the backtracking VM in terms
|
||||
// of priority: Threads are linearly ordered by priority, and matches generated
|
||||
// by threads with high priority must be preferred over matches generated by
|
||||
// threads with low priority, regardless of the chronological order in which
|
||||
// matches were found. If a thread t executes a FORK statement and spawns a
|
||||
// thread t0, then the priority of t0 is such that the following holds:
|
||||
// * t0 < t, i.e. t0 has lower priority than t.
|
||||
// * For all threads u such that u != t and u != t0, we have t0 < u iff t < u,
|
||||
// i.e. t0 compares to other threads the same as t.
|
||||
// For example, if there are currently 3 threads s, t, u such that s < t < u,
|
||||
// then after t executes a fork, the thread priorities will be s < t0 < t < u.
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Bytecode format.
|
||||
// Currently very simple fixed-size: The opcode is encoded in the first 4
|
||||
// bytes, the payload takes another 4 bytes.
|
||||
struct RegExpInstruction {
|
||||
enum Opcode : int32_t {
|
||||
CONSUME_RANGE,
|
||||
FORK,
|
||||
JMP,
|
||||
ACCEPT,
|
||||
};
|
||||
|
||||
struct Uc16Range {
|
||||
uc16 min; // Inclusive.
|
||||
uc16 max; // Inclusive.
|
||||
};
|
||||
|
||||
static RegExpInstruction ConsumeRange(Uc16Range consume_range) {
|
||||
RegExpInstruction result;
|
||||
result.opcode = CONSUME_RANGE;
|
||||
result.payload.consume_range = consume_range;
|
||||
return result;
|
||||
}
|
||||
|
||||
static RegExpInstruction Fork(int32_t alt_index) {
|
||||
RegExpInstruction result;
|
||||
result.opcode = FORK;
|
||||
result.payload.pc = alt_index;
|
||||
return result;
|
||||
}
|
||||
|
||||
static RegExpInstruction Jmp(int32_t alt_index) {
|
||||
RegExpInstruction result;
|
||||
result.opcode = JMP;
|
||||
result.payload.pc = alt_index;
|
||||
return result;
|
||||
}
|
||||
|
||||
static RegExpInstruction Accept() {
|
||||
RegExpInstruction result;
|
||||
result.opcode = ACCEPT;
|
||||
return result;
|
||||
}
|
||||
|
||||
Opcode opcode;
|
||||
union {
|
||||
// Payload of CONSUME_RANGE:
|
||||
Uc16Range consume_range;
|
||||
// Payload of FORK and JMP, the next/forked program counter (pc):
|
||||
int32_t pc;
|
||||
} payload;
|
||||
STATIC_ASSERT(sizeof(payload) == 4);
|
||||
};
|
||||
STATIC_ASSERT(sizeof(RegExpInstruction) == 8);
|
||||
// TODO(mbid,v8:10765): This is rather wasteful. We can fit the opcode in 2-3
|
||||
// bits, so the remaining 29/30 bits can be used as payload. Problem: The
|
||||
// payload of CONSUME_RANGE consists of two 16-bit values `min` and `max`, so
|
||||
// this wouldn't fit. We could encode the payload of a CONSUME_RANGE
|
||||
// instruction by the start of the interval and its length instead, and then
|
||||
// only allow lengths that fit into 14/13 bits. A longer range can then be
|
||||
// encoded as a disjunction of smaller ranges.
|
||||
//
|
||||
// Another thought: CONSUME_RANGEs are only valid if the payloads are such that
|
||||
// min <= max. Thus there are
|
||||
//
|
||||
// 2^16 + 2^16 - 1 + ... + 1
|
||||
// = 2^16 * (2^16 + 1) / 2
|
||||
// = 2^31 + 2^15
|
||||
//
|
||||
// valid payloads for a CONSUME_RANGE instruction. If we want to fit
|
||||
// instructions into 4 bytes, we would still have almost 2^31 instructions left
|
||||
// over if we encode everything as tight as possible. For example, we could
|
||||
// use another 2^29 values for JMP, another 2^29 for FORK, 1 value for ACCEPT,
|
||||
// and then still have almost 2^30 instructions left over for something like
|
||||
// zero-width assertions and captures.
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const RegExpInstruction& inst);
|
||||
std::ostream& operator<<(std::ostream& os,
|
||||
Vector<const RegExpInstruction> insts);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_BYTECODE_H_
|
365
src/regexp/experimental/experimental-compiler.cc
Normal file
365
src/regexp/experimental/experimental-compiler.cc
Normal file
@ -0,0 +1,365 @@
|
||||
// Copyright 2020 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "src/regexp/experimental/experimental-compiler.h"
|
||||
|
||||
#include "src/zone/zone-list-inl.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
namespace {
|
||||
|
||||
// TODO(mbid, v8:10765): Currently the experimental engine doesn't support
|
||||
// UTF-16, but this shouldn't be too hard to implement.
|
||||
constexpr uc32 kMaxSupportedCodepoint = 0xFFFFu;
|
||||
|
||||
class CanBeHandledVisitor final : private RegExpVisitor {
|
||||
// Visitor to implement `ExperimentalRegExp::CanBeHandled`.
|
||||
public:
|
||||
static bool Check(RegExpTree* node, JSRegExp::Flags flags, Zone* zone) {
|
||||
if (!AreSuitableFlags(flags)) {
|
||||
return false;
|
||||
}
|
||||
CanBeHandledVisitor visitor(zone);
|
||||
node->Accept(&visitor, nullptr);
|
||||
return visitor.result_;
|
||||
}
|
||||
|
||||
private:
|
||||
explicit CanBeHandledVisitor(Zone* zone) : zone_(zone) {}
|
||||
|
||||
static bool AreSuitableFlags(JSRegExp::Flags flags) {
|
||||
// TODO(mbid, v8:10765): We should be able to support all flags in the
|
||||
// future.
|
||||
static constexpr JSRegExp::Flags allowed_flags = JSRegExp::kGlobal;
|
||||
return (flags & ~allowed_flags) == 0;
|
||||
}
|
||||
|
||||
void* VisitDisjunction(RegExpDisjunction* node, void*) override {
|
||||
for (RegExpTree* alt : *node->alternatives()) {
|
||||
alt->Accept(this, nullptr);
|
||||
if (!result_) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitAlternative(RegExpAlternative* node, void*) override {
|
||||
for (RegExpTree* child : *node->nodes()) {
|
||||
child->Accept(this, nullptr);
|
||||
if (!result_) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
|
||||
result_ = result_ && AreSuitableFlags(node->flags());
|
||||
for (CharacterRange r : *node->ranges(zone_)) {
|
||||
// TODO(mbid, v8:10765): We don't support full unicode yet, so we only
|
||||
// allow character ranges that can be specified with two-byte characters.
|
||||
if (r.to() > kMaxSupportedCodepoint) {
|
||||
result_ = false;
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitAssertion(RegExpAssertion* node, void*) override {
|
||||
// TODO(mbid, v8:10765): We should be able to support at least some
|
||||
// assertions. re2 does, too.
|
||||
result_ = false;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitAtom(RegExpAtom* node, void*) override {
|
||||
result_ = result_ && AreSuitableFlags(node->flags());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitText(RegExpText* node, void*) override {
|
||||
for (TextElement& el : *node->elements()) {
|
||||
el.tree()->Accept(this, nullptr);
|
||||
if (!result_) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitQuantifier(RegExpQuantifier* node, void*) override {
|
||||
// TODO(mbid, v8:10765): Theoretically we can support arbitrary min() and
|
||||
// max(), but the size of the automaton grows linearly with finite max().
|
||||
// We probably want a cut-off value here, or maybe we can "virtualize" the
|
||||
// repetitions.
|
||||
// Non-greedy quantifiers are easy to implement, but not supported atm.
|
||||
// It's not clear to me how a possessive quantifier would be implemented,
|
||||
// we should check whether re2 supports this.
|
||||
result_ = result_ && node->min() == 0 &&
|
||||
node->max() == RegExpTree::kInfinity && node->is_greedy();
|
||||
if (!result_) {
|
||||
return nullptr;
|
||||
}
|
||||
node->body()->Accept(this, nullptr);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitCapture(RegExpCapture* node, void*) override {
|
||||
// TODO(mbid, v8:10765): This can be implemented with the NFA interpreter,
|
||||
// but not with the lazy DFA. See also re2.
|
||||
result_ = false;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitGroup(RegExpGroup* node, void*) override {
|
||||
node->body()->Accept(this, nullptr);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitLookaround(RegExpLookaround* node, void*) override {
|
||||
// TODO(mbid, v8:10765): This will be hard to support, but not impossible I
|
||||
// think. See product automata.
|
||||
result_ = false;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitBackReference(RegExpBackReference* node, void*) override {
|
||||
// This can't be implemented without backtracking.
|
||||
result_ = false;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }
|
||||
|
||||
private:
|
||||
bool result_ = true;
|
||||
Zone* zone_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Public entry point of the feature check; the actual work is done by
// `CanBeHandledVisitor`. Must only be called while the experimental engine is
// enabled.
bool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree,
                                              JSRegExp::Flags flags,
                                              Zone* zone) {
  DCHECK(FLAG_enable_experimental_regexp_engine);
  const bool can_be_handled = CanBeHandledVisitor::Check(tree, flags, zone);
  return can_be_handled;
}
|
||||
|
||||
namespace {
|
||||
|
||||
class CompileVisitor : private RegExpVisitor {
|
||||
public:
|
||||
static ZoneList<RegExpInstruction> Compile(RegExpTree* tree,
|
||||
JSRegExp::Flags flags,
|
||||
Zone* zone) {
|
||||
CompileVisitor compiler(zone);
|
||||
|
||||
tree->Accept(&compiler, nullptr);
|
||||
compiler.code_.Add(RegExpInstruction::Accept(), zone);
|
||||
|
||||
return std::move(compiler.code_);
|
||||
}
|
||||
|
||||
private:
|
||||
// TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed from
|
||||
// the `tree` size we're going to compile?
|
||||
explicit CompileVisitor(Zone* zone) : zone_(zone), code_(0, zone) {}
|
||||
|
||||
// Generate a disjunction of code fragments compiled by a function `alt_gen`.
|
||||
// `alt_gen` is called repeatedly with argument `int i = 0, 1, ..., alt_num -
|
||||
// 1` and should push code corresponding to the ith alternative onto `code_`.
|
||||
template <class F>
|
||||
void CompileDisjunction(int alt_num, F gen_alt) {
|
||||
// An alternative a0 | a1 | a2 is compiled into
|
||||
// FORK <a2>
|
||||
// FORK <a1>
|
||||
// <a0>
|
||||
// JMP $end
|
||||
// <a1>
|
||||
// JMP $end
|
||||
// <a2>
|
||||
// where $end is the index of the next instruction after <a2>.
|
||||
//
|
||||
// By the semantics of the FORK instruction (see above at definition and
|
||||
// semantics), the forked thread has lower priority than the current
|
||||
// thread. This means that with the code we're generating here, the thread
|
||||
// matching the alternative a0 is indeed the thread with the highest
|
||||
// priority, followed by the thread for a1 and so on.
|
||||
|
||||
if (alt_num == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Record the index of the first of the alt_num - 1 fork instructions in the
|
||||
// beginning.
|
||||
int forks_begin = code_.length();
|
||||
// Add FORKs to alts[alt_num - 1], alts[alt_num - 2], ..., alts[1].
|
||||
for (int i = alt_num - 1; i != 0; --i) {
|
||||
// The FORK's address is patched once we know the address of the ith
|
||||
// alternative.
|
||||
code_.Add(RegExpInstruction::Fork(-1), zone_);
|
||||
}
|
||||
|
||||
// List containing the index of the final JMP instruction after each
|
||||
// alternative but the last one.
|
||||
ZoneList<int> jmp_indices(alt_num - 1, zone_);
|
||||
|
||||
for (int i = 0; i != alt_num; ++i) {
|
||||
if (i != 0) {
|
||||
// If this is not the first alternative, we have to patch the
|
||||
// corresponding FORK statement in the beginning.
|
||||
code_[forks_begin + alt_num - 1 - i].payload.pc = code_.length();
|
||||
}
|
||||
gen_alt(i);
|
||||
if (i != alt_num - 1) {
|
||||
// If this is not the last alternative, we have to emit a JMP past the
|
||||
// remaining alternatives. We don't know this address yet, so we have
|
||||
// to patch patch it once all alternatives are emitted.
|
||||
jmp_indices.Add(code_.length(), zone_);
|
||||
code_.Add(RegExpInstruction::Jmp(-1), zone_);
|
||||
}
|
||||
}
|
||||
|
||||
// All alternatives are emitted. Now we can patch the JMP instruction
|
||||
// after each but the last alternative.
|
||||
int end_index = code_.length();
|
||||
for (int jmp_index : jmp_indices) {
|
||||
code_[jmp_index].payload.pc = end_index;
|
||||
}
|
||||
}
|
||||
|
||||
void* VisitDisjunction(RegExpDisjunction* node, void*) override {
|
||||
ZoneList<RegExpTree*>& alts = *node->alternatives();
|
||||
CompileDisjunction(alts.length(),
|
||||
[&](int i) { alts[i]->Accept(this, nullptr); });
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitAlternative(RegExpAlternative* node, void*) override {
|
||||
for (RegExpTree* child : *node->nodes()) {
|
||||
child->Accept(this, nullptr);
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitAssertion(RegExpAssertion* node, void*) override {
|
||||
// TODO(mbid,v8:10765): Support this case.
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
|
||||
// A character class is compiled as Disjunction over its `CharacterRange`s.
|
||||
ZoneList<CharacterRange>* ranges = node->ranges(zone_);
|
||||
CharacterRange::Canonicalize(ranges);
|
||||
if (node->is_negated()) {
|
||||
// Capacity 2 for the common case where we compute the complement of a
|
||||
// single interval range that doesn't contain 0 and kMaxCodePoint.
|
||||
ZoneList<CharacterRange>* negated =
|
||||
zone_->New<ZoneList<CharacterRange>>(2, zone_);
|
||||
CharacterRange::Negate(ranges, negated, zone_);
|
||||
ranges = negated;
|
||||
}
|
||||
|
||||
CompileDisjunction(ranges->length(), [&](int i) {
|
||||
// We don't support utf16 for now, so only ranges that can be specified
|
||||
// by (complements of) ranges with uc16 bounds.
|
||||
STATIC_ASSERT(kMaxSupportedCodepoint <= std::numeric_limits<uc16>::max());
|
||||
|
||||
uc32 from = (*ranges)[i].from();
|
||||
DCHECK_LE(from, kMaxSupportedCodepoint);
|
||||
uc16 from_uc16 = static_cast<uc16>(from);
|
||||
|
||||
uc32 to = (*ranges)[i].to();
|
||||
DCHECK_IMPLIES(to > kMaxSupportedCodepoint, to == String::kMaxCodePoint);
|
||||
uc16 to_uc16 = static_cast<uc16>(std::min(to, kMaxSupportedCodepoint));
|
||||
|
||||
RegExpInstruction::Uc16Range range{from_uc16, to_uc16};
|
||||
code_.Add(RegExpInstruction::ConsumeRange(range), zone_);
|
||||
});
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitAtom(RegExpAtom* node, void*) override {
|
||||
for (uc16 c : node->data()) {
|
||||
code_.Add(
|
||||
RegExpInstruction::ConsumeRange(RegExpInstruction::Uc16Range{c, c}),
|
||||
zone_);
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitQuantifier(RegExpQuantifier* node, void*) override {
|
||||
// TODO(mbid,v8:10765): For now we support a quantifier of the form /x*/,
|
||||
// i.e. greedy match of any number of /x/. See also the comment in
|
||||
// `CanBeHandledVisitor::VisitQuantifier`.
|
||||
DCHECK_EQ(node->min(), 0);
|
||||
DCHECK_EQ(node->max(), RegExpTree::kInfinity);
|
||||
DCHECK(node->is_greedy());
|
||||
|
||||
// The repetition of /x/ is compiled into
|
||||
//
|
||||
// a: FORK d
|
||||
// b: <x>
|
||||
// c: JMP a
|
||||
// d: ...
|
||||
//
|
||||
// Note that a FORKed thread has lower priority than the main thread, so
|
||||
// this will indeed match greedily.
|
||||
|
||||
int initial_fork_index = code_.length();
|
||||
// The FORK's address is patched once we're done.
|
||||
code_.Add(RegExpInstruction::Fork(-1), zone_);
|
||||
node->body()->Accept(this, nullptr);
|
||||
code_.Add(RegExpInstruction::Jmp(initial_fork_index), zone_);
|
||||
int end_index = code_.length();
|
||||
code_[initial_fork_index].payload.pc = end_index;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitCapture(RegExpCapture* node, void*) override {
|
||||
// TODO(mbid,v8:10765): Support this case.
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
void* VisitGroup(RegExpGroup* node, void*) override {
|
||||
node->body()->Accept(this, nullptr);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitLookaround(RegExpLookaround* node, void*) override {
|
||||
// TODO(mbid,v8:10765): Support this case.
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
void* VisitBackReference(RegExpBackReference* node, void*) override {
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }
|
||||
|
||||
void* VisitText(RegExpText* node, void*) override {
|
||||
for (TextElement& text_el : *node->elements()) {
|
||||
text_el.tree()->Accept(this, nullptr);
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
private:
|
||||
Zone* zone_;
|
||||
ZoneList<RegExpInstruction> code_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Public entry point of the compiler; the actual work is done by
// `CompileVisitor`.
ZoneList<RegExpInstruction> ExperimentalRegExpCompiler::Compile(
    RegExpTree* tree, JSRegExp::Flags flags, Zone* zone) {
  return CompileVisitor::Compile(tree, flags, zone);
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
33
src/regexp/experimental/experimental-compiler.h
Normal file
33
src/regexp/experimental/experimental-compiler.h
Normal file
@ -0,0 +1,33 @@
|
||||
// Copyright 2020 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_COMPILER_H_
|
||||
#define V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_COMPILER_H_
|
||||
|
||||
#include "src/regexp/experimental/experimental-bytecode.h"
|
||||
#include "src/regexp/regexp-ast.h"
|
||||
#include "src/zone/zone-list.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
class ExperimentalRegExpCompiler final : public AllStatic {
|
||||
public:
|
||||
// Checks whether a given RegExpTree can be compiled into an experimental
|
||||
// bytecode program. This mostly amounts to the absence of back references,
|
||||
// but see the definition.
|
||||
// TODO(mbid,v8:10765): Currently more things are not handled, e.g. some
|
||||
// quantifiers and unicode.
|
||||
static bool CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags, Zone* zone);
|
||||
// Compile regexp into a bytecode program. The regexp must be handlable by
|
||||
// the experimental engine; see`CanBeHandled`. The program is returned as a
|
||||
// ZoneList backed by the same Zone that is used in the RegExpTree argument.
|
||||
static ZoneList<RegExpInstruction> Compile(RegExpTree* tree,
|
||||
JSRegExp::Flags flags, Zone* zone);
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_COMPILER_H_
|
349
src/regexp/experimental/experimental-interpreter.cc
Normal file
349
src/regexp/experimental/experimental-interpreter.cc
Normal file
@ -0,0 +1,349 @@
|
||||
// Copyright 2020 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "src/regexp/experimental/experimental-interpreter.h"
|
||||
|
||||
#include "src/base/optional.h"
|
||||
#include "src/base/small-vector.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
using MatchRange = ExperimentalRegExpInterpreter::MatchRange;
|
||||
|
||||
namespace {
|
||||
|
||||
template <class Character>
|
||||
class NfaInterpreter {
|
||||
// Executes a bytecode program in breadth-first mode, without backtracking.
|
||||
// `Character` can be instantiated with `uint8_t` or `uc16` for one byte or
|
||||
// two byte input strings.
|
||||
//
|
||||
// In contrast to the backtracking implementation, this has linear time
|
||||
// complexity in the length of the input string. Breadth-first mode means
|
||||
// that threads are executed in lockstep with respect to their input
|
||||
// position, i.e. the threads share a common input index. This is similar
|
||||
// to breadth-first simulation of a non-deterministic finite automaton (nfa),
|
||||
// hence the name of the class.
|
||||
//
|
||||
// To follow the semantics of a backtracking VM implementation, we have to be
|
||||
// careful about whether we stop execution when a thread executes ACCEPT.
|
||||
// For example, consider execution of the bytecode generated by the regexp
|
||||
//
|
||||
// r = /abc|..|[a-c]{10,}/
|
||||
//
|
||||
// on input "abcccccccccccccc". Clearly the three alternatives
|
||||
// - /abc/
|
||||
// - /../
|
||||
// - /[a-c]{10,}/
|
||||
// all match this input. A backtracking implementation will report "abc" as
|
||||
// match, because it explores the first alternative before the others.
|
||||
//
|
||||
// However, if we execute breadth first, then we execute the 3 threads
|
||||
// - t1, which tries to match /abc/
|
||||
// - t2, which tries to match /../
|
||||
// - t3, which tries to match /[a-c]{10,}/
|
||||
// in lockstep i.e. by iterating over the input and feeding all threads one
|
||||
// character at a time. t2 will execute an ACCEPT after two characters,
|
||||
// while t1 will only execute ACCEPT after three characters. Thus we find a
|
||||
// match for the second alternative before a match of the first alternative.
|
||||
//
|
||||
// This shows that we cannot always stop searching as soon as some thread t
|
||||
// executes ACCEPT: If there is a thread u with higher priority than t, then
|
||||
// it must be finished first. If u produces a match, then we can discard the
|
||||
// match of t because matches produced by threads with higher priority are
|
||||
// preferred over matches of threads with lower priority. On the other hand,
|
||||
// we are allowed to abort all threads with lower priority than t if t
|
||||
// produces a match: Such threads can only produce worse matches. In the
|
||||
// example above, we can abort t3 after two characters because of t2's match.
|
||||
//
|
||||
// Thus the interpreter keeps track of a priority-ordered list of threads.
|
||||
// If a thread ACCEPTs, all threads with lower priority are discarded, and
|
||||
// the search continues with the threads with higher priority. If no threads
|
||||
// with high priority are left, we return the match that was produced by the
|
||||
// ACCEPTing thread with highest priority.
|
||||
public:
|
||||
// Creates an interpreter for the non-empty program `bytecode` that searches
// `input`, starting at position `input_index`.
NfaInterpreter(Vector<const RegExpInstruction> bytecode,
               Vector<const Character> input, int32_t input_index)
    : bytecode_(bytecode),
      input_(input),
      input_index_(input_index),
      pc_last_input_index_(bytecode.size()),
      active_threads_(),
      blocked_threads_(),
      best_match_(base::nullopt) {
  DCHECK(!bytecode_.empty());
  DCHECK_GE(input_index_, 0);
  DCHECK_LE(input_index_, input_.length());

  // Every entry of the table (one per instruction) starts out as -1.
  for (auto& last_input_index : pc_last_input_index_) {
    last_input_index = -1;
  }
}
|
||||
|
||||
// Finds up to `max_match_num` matches and writes their boundaries to
|
||||
// `matches_out`. The search begins at the current input index. Returns the
|
||||
// number of matches found.
|
||||
int FindMatches(MatchRange* matches_out, int max_match_num) {
|
||||
int match_num;
|
||||
for (match_num = 0; match_num != max_match_num; ++match_num) {
|
||||
base::Optional<MatchRange> match = FindNextMatch();
|
||||
if (!match.has_value()) {
|
||||
break;
|
||||
}
|
||||
|
||||
matches_out[match_num] = *match;
|
||||
SetInputIndex(match->end);
|
||||
}
|
||||
return match_num;
|
||||
}
|
||||
|
||||
private:
|
||||
// State of one "thread" of the bytecode VM.  These are purely logical threads
// of the interpreter and unrelated to OS threads.
struct InterpreterThread {
  // Program counter: index within `bytecode_` of the instruction this thread
  // executes next.
  int32_t pc;
  // Input index at which this thread began executing.
  int32_t match_begin;
};
|
||||
|
||||
// Change the current input index for future calls to `FindNextMatch`.
|
||||
void SetInputIndex(int new_input_index) {
  // Validate the value being installed (not the one being replaced): it must
  // lie within [0, input length].
  DCHECK_GE(new_input_index, 0);
  DCHECK_LE(new_input_index, input_.length());

  input_index_ = new_input_index;
}
|
||||
|
||||
// Find the next match, beginning the search at `input_index_`.
|
||||
base::Optional<MatchRange> FindNextMatch() {
  DCHECK(active_threads_.empty());
  // TODO(mbid,v8:10765): Can we get around resetting `pc_last_input_index_`
  // here? As long as
  //
  //   pc_last_input_index_[pc] < input_index_
  //
  // for all possible program counters pc that are reachable without input
  // from pc = 0 and
  //
  //   pc_last_input_index_[k] <= input_index_
  //
  // for all k > 0 hold I think everything should be fine. Maybe we can do
  // something about this in `SetInputIndex`.
  for (int& last_index : pc_last_input_index_) {
    last_index = -1;
  }

  // A search always starts from a clean state.
  DCHECK(blocked_threads_.empty());
  DCHECK(active_threads_.empty());
  DCHECK_EQ(best_match_, base::nullopt);

  // Seed the search with a single thread at bytecode 0 and let it (and any
  // threads it forks) run until everything is blocked on input.
  PushActiveThreadUnchecked(InterpreterThread{0, input_index_});
  RunActiveThreads();

  while (true) {
    // Stop once the entire input has been consumed ...
    if (input_index_ == input_.length()) break;
    // ... or once a match exists and no thread of higher priority than the
    // matching one survives. Lower-priority threads were discarded when the
    // match was recorded, and all survivors are blocked at this point, so
    // this is the same as `blocked_threads_` being empty.
    if (best_match_.has_value() && blocked_threads_.empty()) break;

    DCHECK(active_threads_.empty());
    const uc16 input_char = input_[input_index_++];

    // While no match has been found, also try a match that begins right
    // after `input_char`. This thread goes in first, so it has the least
    // priority.
    if (!best_match_.has_value()) {
      active_threads_.emplace_back(InterpreterThread{0, input_index_});
    }

    // Feed the character to all blocked threads, then run whatever survived
    // until it blocks again or accepts.
    FlushBlockedThreads(input_char);
    RunActiveThreads();
  }

  // Hand out the result and reset all per-search state.
  base::Optional<MatchRange> result = best_match_;
  active_threads_.clear();
  blocked_threads_.clear();
  best_match_ = base::nullopt;

  return result;
}
|
||||
|
||||
// Run an active thread `t` until it executes a CONSUME_RANGE or ACCEPT
|
||||
// instruction, or its PC value was already processed.
|
||||
// - If processing of `t` can't continue because of CONSUME_RANGE, it is
|
||||
// pushed on `blocked_threads_`.
|
||||
// - If `t` executes ACCEPT, set `best_match` according to `t.match_begin` and
|
||||
// the current input index. All remaining `active_threads_` are discarded.
|
||||
void RunActiveThread(InterpreterThread t) {
  for (;;) {
    const RegExpInstruction& inst = bytecode_[t.pc];
    switch (inst.opcode) {
      case RegExpInstruction::CONSUME_RANGE:
        // Needs an input character: park the thread until the next flush.
        blocked_threads_.emplace_back(t);
        return;

      case RegExpInstruction::FORK: {
        const int32_t next_pc = t.pc + 1;
        const int32_t forked_pc = inst.payload.pc;

        // The fall-through path has higher priority than the forked path.
        // If the fall-through pc hasn't been processed, queue the fork for
        // later and continue directly with the fall-through path.
        if (!IsPcProcessed(next_pc)) {
          MarkPcProcessed(next_pc);
          PushActiveThread(InterpreterThread{forked_pc, t.match_begin});
          t.pc = next_pc;
          continue;
        }
        // Otherwise continue with the forked path, if that is still new.
        if (IsPcProcessed(forked_pc)) return;
        MarkPcProcessed(forked_pc);
        t.pc = forked_pc;
        continue;
      }

      case RegExpInstruction::JMP: {
        const int32_t target_pc = inst.payload.pc;
        // A thread of higher priority already ran from the target at this
        // input index, so this one is redundant.
        if (IsPcProcessed(target_pc)) return;
        MarkPcProcessed(target_pc);
        t.pc = target_pc;
        continue;
      }

      case RegExpInstruction::ACCEPT:
        // Record the match and drop all remaining active threads.
        best_match_ = MatchRange{t.match_begin, input_index_};
        active_threads_.clear();
        return;
    }
  }
}
|
||||
|
||||
// Run each active thread until it can't continue without further input.
|
||||
// `active_threads_` is empty afterwards. `blocked_threads_` are sorted from
|
||||
// high to low priority.
|
||||
void RunActiveThreads() {
|
||||
while (!active_threads_.empty()) {
|
||||
InterpreterThread t = active_threads_.back();
|
||||
active_threads_.pop_back();
|
||||
RunActiveThread(t);
|
||||
}
|
||||
}
|
||||
|
||||
// Unblock all blocked_threads_ by feeding them an `input_char`. Should only
|
||||
// be called with `input_index_` pointing to the character *after*
|
||||
// `input_char` so that `pc_last_input_index_` is updated correctly.
|
||||
void FlushBlockedThreads(uc16 input_char) {
  // `blocked_threads_` is ordered from high to low priority while
  // `active_threads_` is ordered from low to high priority, hence the blocked
  // threads are visited back to front.
  //
  // TODO(mbid,v8:10765): base::SmallVector doesn't support `rbegin()` and
  // `rend()`, should we implement that instead of this awkward iteration?
  // Maybe we could at least use an int i and check for i >= 0, but
  // SmallVectors don't have length() methods.
  size_t remaining = blocked_threads_.size();
  while (remaining != 0) {
    --remaining;
    InterpreterThread thread = blocked_threads_[remaining];
    const RegExpInstruction& inst = bytecode_[thread.pc];
    DCHECK_EQ(inst.opcode, RegExpInstruction::CONSUME_RANGE);
    const RegExpInstruction::Uc16Range range = inst.payload.consume_range;

    // Threads whose range doesn't contain the character simply die here.
    const bool accepted = input_char >= range.min && input_char <= range.max;
    if (!accepted) continue;

    ++thread.pc;
    PushActiveThreadUnchecked(thread);
  }
  blocked_threads_.clear();
}
|
||||
|
||||
// It is redundant to have two threads t, t0 execute at the same PC value,
|
||||
// because one of t, t0 matches iff the other does. We can thus discard
|
||||
// the one with lower priority. We check whether a thread executed at some
|
||||
// PC value by recording for every possible value of PC what the value of
|
||||
// input_index_ was the last time a thread executed at PC. If a thread
|
||||
// tries to continue execution at a PC value that we have seen before at
|
||||
// the current input index, we abort it. (We execute threads with higher
|
||||
// priority first, so the second thread is guaranteed to have lower
|
||||
// priority.)
|
||||
//
|
||||
// Check whether we've seen an active thread with a given pc value since the
|
||||
// last increment of `input_index_`.
|
||||
bool IsPcProcessed(int pc) {
  // A pc counts as processed iff it was last marked at the current input
  // index.
  const int last_seen_index = pc_last_input_index_[pc];
  DCHECK_LE(last_seen_index, input_index_);
  return last_seen_index == input_index_;
}
|
||||
|
||||
// Mark a pc as having been processed since the last increment of
|
||||
// `input_index_`.
|
||||
void MarkPcProcessed(int pc) {
  // Stamp `pc` with the current input index; see `IsPcProcessed`.
  int& last_seen_index = pc_last_input_index_[pc];
  DCHECK_LE(last_seen_index, input_index_);
  last_seen_index = input_index_;
}
|
||||
|
||||
// Functions to push a thread `t` onto the list of active threads, but only
|
||||
// if `t.pc` was not already the pc of some other thread at the current
|
||||
// subject index.
|
||||
void PushActiveThreadUnchecked(InterpreterThread t) {
  // The caller guarantees that `t.pc` hasn't been processed at the current
  // input index; this is only verified in debug builds.
  const int32_t pc = t.pc;
  DCHECK(!IsPcProcessed(pc));

  MarkPcProcessed(pc);
  active_threads_.emplace_back(t);
}
|
||||
void PushActiveThread(InterpreterThread t) {
  // Silently drop `t` if its pc was already processed at the current input
  // index.
  if (!IsPcProcessed(t.pc)) {
    PushActiveThreadUnchecked(t);
  }
}
|
||||
|
||||
Vector<const RegExpInstruction> bytecode_;
|
||||
Vector<const Character> input_;
|
||||
int input_index_;
|
||||
|
||||
// TODO(mbid,v8:10765): The following `SmallVector`s have somewhat
|
||||
// arbitrarily chosen small capacity sizes; should benchmark to find a good
|
||||
// value.
|
||||
|
||||
// pc_last_input_index_[k] records the value of input_index_ the last
|
||||
// time a thread t such that t.pc == k was activated, i.e. put on
|
||||
// active_threads_. Thus pc_last_input_index.size() == bytecode.size(). See
|
||||
// also `RunActiveThread`.
|
||||
base::SmallVector<int, 64> pc_last_input_index_;
|
||||
|
||||
// Active threads can potentially (but not necessarily) continue without
|
||||
// input. Sorted from low to high priority.
|
||||
base::SmallVector<InterpreterThread, 64> active_threads_;
|
||||
|
||||
// The pc of a blocked thread points to an instruction that consumes a
|
||||
// character. Sorted from high to low priority (so the opposite of
|
||||
// `active_threads_`).
|
||||
base::SmallVector<InterpreterThread, 64> blocked_threads_;
|
||||
|
||||
// The best match found so far during the current search. If several threads
|
||||
// ACCEPTed, then this will be the match of the accepting thread with highest
|
||||
// priority.
|
||||
base::Optional<MatchRange> best_match_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
int ExperimentalRegExpInterpreter::FindMatchesNfaOneByte(
    Vector<const RegExpInstruction> bytecode, Vector<const uint8_t> input,
    int start_index, MatchRange* matches_out, int max_match_num) {
  // One-byte variant: run a throwaway interpreter instantiated for uint8_t
  // characters.
  return NfaInterpreter<uint8_t>(bytecode, input, start_index)
      .FindMatches(matches_out, max_match_num);
}
|
||||
|
||||
int ExperimentalRegExpInterpreter::FindMatchesNfaTwoByte(
    Vector<const RegExpInstruction> bytecode, Vector<const uc16> input,
    int start_index, MatchRange* matches_out, int max_match_num) {
  // Two-byte variant: run a throwaway interpreter instantiated for uc16
  // characters.
  return NfaInterpreter<uc16>(bytecode, input, start_index)
      .FindMatches(matches_out, max_match_num);
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
40
src/regexp/experimental/experimental-interpreter.h
Normal file
40
src/regexp/experimental/experimental-interpreter.h
Normal file
@ -0,0 +1,40 @@
|
||||
// Copyright 2020 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_INTERPRETER_H_
|
||||
#define V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_INTERPRETER_H_
|
||||
|
||||
#include "src/regexp/experimental/experimental-bytecode.h"
|
||||
#include "src/utils/vector.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
class ExperimentalRegExpInterpreter final : public AllStatic {
|
||||
public:
|
||||
// A half-open range in a string denoting a (sub)match. Used to access
|
||||
// output registers of regexp execution grouped by [begin, end) pairs.
|
||||
struct MatchRange {
|
||||
int32_t begin; // inclusive
|
||||
int32_t end; // exclusive
|
||||
};
|
||||
|
||||
// Executes a bytecode program in breadth-first NFA mode, without
|
||||
// backtracking, to find matching substrings. Tries to find up to
|
||||
// `max_match_num` matches in `input`, starting at `start_index`. Returns
|
||||
// the actual number of matches found. The boundaries of matching subranges
|
||||
// are written to `matches_out`. Provided in variants for one-byte and
|
||||
// two-byte strings.
// No bounds checking is done on `matches_out`; it must have room for
// `max_match_num` entries.
|
||||
static int FindMatchesNfaOneByte(Vector<const RegExpInstruction> bytecode,
|
||||
Vector<const uint8_t> input, int start_index,
|
||||
MatchRange* matches_out, int max_match_num);
|
||||
static int FindMatchesNfaTwoByte(Vector<const RegExpInstruction> bytecode,
|
||||
Vector<const uc16> input, int start_index,
|
||||
MatchRange* matches_out, int max_match_num);
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_INTERPRETER_H_
|
@ -4,157 +4,18 @@
|
||||
|
||||
#include "src/regexp/experimental/experimental.h"
|
||||
|
||||
#include <iomanip>
|
||||
#include <ios>
|
||||
|
||||
#include "src/base/optional.h"
|
||||
#include "src/base/small-vector.h"
|
||||
#include "src/objects/js-regexp-inl.h"
|
||||
#include "src/regexp/regexp-ast.h"
|
||||
#include "src/regexp/experimental/experimental-compiler.h"
|
||||
#include "src/regexp/experimental/experimental-interpreter.h"
|
||||
#include "src/regexp/regexp-parser.h"
|
||||
#include "src/utils/ostreams.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
namespace {
|
||||
|
||||
// TODO(mbid, v8:10765): Currently the experimental engine doesn't support
|
||||
// UTF-16, but this shouldn't be too hard to implement.
|
||||
constexpr uc32 kMaxSupportedCodepoint = 0xFFFFu;
|
||||
|
||||
class CanBeHandledVisitor final : private RegExpVisitor {
|
||||
// Visitor to implement `ExperimentalRegExp::CanBeHandled`.
|
||||
public:
|
||||
static bool Check(RegExpTree* node, JSRegExp::Flags flags, Zone* zone) {
|
||||
if (!AreSuitableFlags(flags)) {
|
||||
return false;
|
||||
}
|
||||
CanBeHandledVisitor visitor(zone);
|
||||
node->Accept(&visitor, nullptr);
|
||||
return visitor.result_;
|
||||
}
|
||||
|
||||
private:
|
||||
explicit CanBeHandledVisitor(Zone* zone) : zone_(zone) {}
|
||||
|
||||
static bool AreSuitableFlags(JSRegExp::Flags flags) {
|
||||
// TODO(mbid, v8:10765): We should be able to support all flags in the
|
||||
// future.
|
||||
static constexpr JSRegExp::Flags allowed_flags = JSRegExp::kGlobal;
|
||||
return (flags & ~allowed_flags) == 0;
|
||||
}
|
||||
|
||||
void* VisitDisjunction(RegExpDisjunction* node, void*) override {
|
||||
for (RegExpTree* alt : *node->alternatives()) {
|
||||
alt->Accept(this, nullptr);
|
||||
if (!result_) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitAlternative(RegExpAlternative* node, void*) override {
|
||||
for (RegExpTree* child : *node->nodes()) {
|
||||
child->Accept(this, nullptr);
|
||||
if (!result_) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
|
||||
result_ = result_ && AreSuitableFlags(node->flags());
|
||||
for (CharacterRange r : *node->ranges(zone_)) {
|
||||
// TODO(mbid, v8:10765): We don't support full unicode yet, so we only
|
||||
// allow character ranges that can be specified with two-byte characters.
|
||||
if (r.to() > kMaxSupportedCodepoint) {
|
||||
result_ = false;
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitAssertion(RegExpAssertion* node, void*) override {
|
||||
// TODO(mbid, v8:10765): We should be able to support at least some
|
||||
// assertions. re2 does, too.
|
||||
result_ = false;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitAtom(RegExpAtom* node, void*) override {
|
||||
result_ = result_ && AreSuitableFlags(node->flags());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitText(RegExpText* node, void*) override {
|
||||
for (TextElement& el : *node->elements()) {
|
||||
el.tree()->Accept(this, nullptr);
|
||||
if (!result_) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitQuantifier(RegExpQuantifier* node, void*) override {
|
||||
// TODO(mbid, v8:10765): Theoretically we can support arbitrary min() and
|
||||
// max(), but the size of the automaton grows linearly with finite max().
|
||||
// We probably want a cut-off value here, or maybe we can "virtualize" the
|
||||
// repetitions.
|
||||
// Non-greedy quantifiers are easy to implement, but not supported atm.
|
||||
// It's not clear to me how a possessive quantifier would be implemented,
|
||||
// we should check whether re2 supports this.
|
||||
result_ = result_ && node->min() == 0 &&
|
||||
node->max() == RegExpTree::kInfinity && node->is_greedy();
|
||||
if (!result_) {
|
||||
return nullptr;
|
||||
}
|
||||
node->body()->Accept(this, nullptr);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitCapture(RegExpCapture* node, void*) override {
|
||||
// TODO(mbid, v8:10765): This can be implemented with the NFA interpreter,
|
||||
// but not with the lazy DFA. See also re2.
|
||||
result_ = false;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitGroup(RegExpGroup* node, void*) override {
|
||||
node->body()->Accept(this, nullptr);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitLookaround(RegExpLookaround* node, void*) override {
|
||||
// TODO(mbid, v8:10765): This will be hard to support, but not impossible I
|
||||
// think. See product automata.
|
||||
result_ = false;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitBackReference(RegExpBackReference* node, void*) override {
|
||||
// This can't be implemented without backtracking.
|
||||
result_ = false;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }
|
||||
|
||||
private:
|
||||
bool result_ = true;
|
||||
Zone* zone_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
bool ExperimentalRegExp::CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags,
                                      Zone* zone) {
  // Must only be called while the experimental engine is enabled.
  DCHECK(FLAG_enable_experimental_regexp_engine);
  // The supported-feature check lives with the experimental compiler; a
  // second, unreachable `return` used to follow the first one here.
  return ExperimentalRegExpCompiler::CanBeHandled(tree, flags, zone);
}
|
||||
|
||||
void ExperimentalRegExp::Initialize(Isolate* isolate, Handle<JSRegExp> re,
|
||||
@ -182,423 +43,6 @@ bool ExperimentalRegExp::IsCompiled(Handle<JSRegExp> re, Isolate* isolate) {
|
||||
Smi::FromInt(JSRegExp::kUninitializedValue);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Definition and semantics of the EXPERIMENTAL bytecode.
|
||||
// Background:
|
||||
// - Russ Cox's blog post series on regular expression matching, in particular
|
||||
// https://swtch.com/~rsc/regexp/regexp2.html
|
||||
// - The re2 regular expression library: https://github.com/google/re2
|
||||
//
|
||||
// This comment describes the bytecode used by the experimental regexp engine
|
||||
// and its abstract semantics in terms of a VM. An implementation of the
|
||||
// semantics that avoids exponential runtime can be found in `NfaInterpreter`.
|
||||
//
|
||||
// The experimental bytecode describes a non-deterministic finite automaton. It
|
||||
// runs on a multithreaded virtual machine (VM), i.e. in several threads
|
||||
// concurrently. (These "threads" don't need to be actual operating system
|
||||
// threads.) Apart from a list of threads, the VM maintains an immutable
|
||||
// shared input string which threads can read from. Each thread is given by a
|
||||
// program counter (PC, index of the current instruction), a fixed number of
|
||||
// registers of indices into the input string, and a monotonically increasing
|
||||
// index which represents the current position within the input string.
|
||||
//
|
||||
// For the precise encoding of the instruction set, see the definition `struct
|
||||
// RegExpInstruction` below. Currently we support the following instructions:
|
||||
// - CONSUME_RANGE: Check whether the codepoint of the current character is
|
||||
// contained in a non-empty closed interval [min, max] specified in the
|
||||
// instruction payload. Abort this thread if false, otherwise advance the
|
||||
// input position by 1 and continue with the next instruction.
|
||||
// - ACCEPT: Stop this thread and signify the end of a match at the current
|
||||
// input position.
|
||||
// - FORK: If executed by a thread t, spawn a new thread t0 whose register
|
||||
// values and input position agree with those of t, but whose PC value is set
|
||||
// to the value specified in the instruction payload. The register values of
|
||||
// t and t0 agree directly after the FORK, but they can diverge. Thread t
|
||||
// continues with the instruction directly after the current FORK
|
||||
// instruction.
|
||||
// - JMP: Instead of incrementing the PC value after execution of this
|
||||
// instruction by 1, set PC of this thread to the value specified in the
|
||||
// instruction payload and continue there.
|
||||
//
|
||||
// Special care must be exercised with respect to thread priority. It is
|
||||
// possible that more than one thread executes an ACCEPT statement. The output
|
||||
// of the program is given by the contents of the matching thread's registers,
|
||||
// so this is ambiguous in case of multiple matches. To resolve the ambiguity,
|
||||
// every implementation of the VM must output the match that a backtracking
|
||||
// implementation would output (i.e. behave the same as Irregexp).
|
||||
//
|
||||
// A backtracking implementation of the VM maintains a stack of postponed
|
||||
// threads. Upon encountering a FORK statement, this VM will create a copy of
|
||||
// the current thread, set the copy's PC value according to the instruction
|
||||
// payload, and push it to the stack of postponed threads. The VM will then
|
||||
// continue execution of the current thread.
|
||||
//
|
||||
// If at some point a thread t executes an ACCEPT statement, the VM stops and
|
||||
// outputs the registers of t. Postponed threads are discarded. On the other
|
||||
// hand, if a thread t is aborted because some input character didn't pass a
|
||||
// check, then the VM pops the topmost postponed thread and continues execution
|
||||
// with this thread. If there are no postponed threads, then the VM outputs
|
||||
// failure, i.e. no matches.
|
||||
//
|
||||
// Equivalently, we can describe the behavior of the backtracking VM in terms
|
||||
// of priority: Threads are linearly ordered by priority, and matches generated
|
||||
// by threads with high priority must be preferred over matches generated by
|
||||
// threads with low priority, regardless of the chronological order in which
|
||||
// matches were found. If a thread t executes a FORK statement and spawns a
|
||||
// thread t0, then the priority of t0 is such that the following holds:
|
||||
// * t0 < t, i.e. t0 has lower priority than t.
|
||||
// * For all threads u such that u != t and u != t0, we have t0 < u iff t < u,
|
||||
// i.e. the t0 compares to other threads the same as t.
|
||||
// For example, if there are currently 3 threads s, t, u such that s < t < u,
|
||||
// then after t executes a fork, the thread priorities will be s < t0 < t < u.
|
||||
|
||||
namespace {
|
||||
|
||||
// A closed interval [min, max] of uc16 values; used as the payload of a
// CONSUME_RANGE instruction.
struct Uc16Range {
|
||||
uc16 min; // Inclusive.
|
||||
uc16 max; // Inclusive.
|
||||
};
|
||||
|
||||
// Bytecode format.
|
||||
// Currently very simple fixed-size: The opcode is encoded in the first 4
|
||||
// bytes, the payload takes another 4 bytes.
|
||||
struct RegExpInstruction {
|
||||
enum Opcode : int32_t {
|
||||
CONSUME_RANGE,
|
||||
FORK,
|
||||
JMP,
|
||||
ACCEPT,
|
||||
};
|
||||
|
||||
static RegExpInstruction ConsumeRange(Uc16Range consume_range) {
|
||||
RegExpInstruction result;
|
||||
result.opcode = CONSUME_RANGE;
|
||||
result.payload.consume_range = consume_range;
|
||||
return result;
|
||||
}
|
||||
|
||||
static RegExpInstruction Fork(int32_t alt_index) {
|
||||
RegExpInstruction result;
|
||||
result.opcode = FORK;
|
||||
result.payload.pc = alt_index;
|
||||
return result;
|
||||
}
|
||||
|
||||
static RegExpInstruction Jmp(int32_t alt_index) {
|
||||
RegExpInstruction result;
|
||||
result.opcode = JMP;
|
||||
result.payload.pc = alt_index;
|
||||
return result;
|
||||
}
|
||||
|
||||
static RegExpInstruction Accept() {
|
||||
RegExpInstruction result;
|
||||
result.opcode = ACCEPT;
|
||||
return result;
|
||||
}
|
||||
|
||||
Opcode opcode;
|
||||
union {
|
||||
// Payload of CONSUME_RANGE:
|
||||
Uc16Range consume_range;
|
||||
// Payload of FORK and JMP, the next/forked program counter (pc):
|
||||
int32_t pc;
|
||||
} payload;
|
||||
STATIC_ASSERT(sizeof(payload) == 4);
|
||||
};
|
||||
STATIC_ASSERT(sizeof(RegExpInstruction) == 8);
|
||||
// TODO(mbid,v8:10765): This is rather wasteful. We can fit the opcode in 2-3
|
||||
// bits, so the remaining 29/30 bits can be used as payload. Problem: The
|
||||
// payload of CONSUME_RANGE consists of two 16-bit values `min` and `max`, so
|
||||
// this wouldn't fit. We could encode the payload of a CONSUME_RANGE
|
||||
// instruction by the start of the interval and its length instead, and then
|
||||
// only allows lengths that fit into 14/13 bits. A longer range can then be
|
||||
// encoded as a disjunction of smaller ranges.
|
||||
//
|
||||
// Another thought: CONSUME_RANGEs are only valid if the payloads are such that
|
||||
// min <= max. Thus there are
|
||||
//
|
||||
// 2^16 + 2^16 - 1 + ... + 1
|
||||
// = 2^16 * (2^16 + 1) / 2
|
||||
// = 2^31 + 2^15
|
||||
//
|
||||
// valid payloads for a CONSUME_RANGE instruction. If we want to fit
|
||||
// instructions into 4 bytes, we would still have almost 2^31 instructions left
|
||||
// over if we encode everything as tight as possible. For example, we could
|
||||
// use another 2^29 values for JMP, another 2^29 for FORK, 1 value for ACCEPT,
|
||||
// and then still have almost 2^30 instructions left over for something like
|
||||
// zero-width assertions and captures.
|
||||
|
||||
// Prints `c` verbatim if it is a printable ASCII character, and as a
// 0x-prefixed hexadecimal number otherwise. Returns `os`.
std::ostream& PrintAsciiOrHex(std::ostream& os, uc16 c) {
  if (c < 128 && std::isprint(c)) {
    os << static_cast<char>(c);
    return os;
  }
  // `std::hex` is sticky: restore the caller's format flags afterwards so
  // that later integer output on `os` doesn't switch to hexadecimal.
  const std::ios_base::fmtflags saved_flags = os.flags();
  os << "0x" << std::hex << static_cast<int>(c);
  os.flags(saved_flags);
  return os;
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const RegExpInstruction& inst) {
|
||||
switch (inst.opcode) {
|
||||
case RegExpInstruction::CONSUME_RANGE: {
|
||||
os << "CONSUME_RANGE [";
|
||||
PrintAsciiOrHex(os, inst.payload.consume_range.min);
|
||||
os << ", ";
|
||||
PrintAsciiOrHex(os, inst.payload.consume_range.max);
|
||||
os << "]";
|
||||
break;
|
||||
}
|
||||
case RegExpInstruction::FORK:
|
||||
os << "FORK " << inst.payload.pc;
|
||||
break;
|
||||
case RegExpInstruction::JMP:
|
||||
os << "JMP " << inst.payload.pc;
|
||||
break;
|
||||
case RegExpInstruction::ACCEPT:
|
||||
os << "ACCEPT";
|
||||
break;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
// The maximum number of digits required to display a non-negative number < n
|
||||
// in base 10.
|
||||
int DigitsRequiredBelow(int n) {
  DCHECK_GE(n, 0);

  // Count the decimal digits of n - 1, the largest number below n (treating
  // n == 0 like n == 1). Dividing down instead of multiplying a power of ten
  // up avoids signed overflow when n exceeds 10^9.
  int result = 1;
  for (int rest = (n - 1) / 10; rest > 0; rest /= 10) {
    result += 1;
  }
  return result;
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os,
|
||||
Vector<const RegExpInstruction> insts) {
|
||||
int inst_num = insts.length();
|
||||
int line_digit_num = DigitsRequiredBelow(inst_num);
|
||||
|
||||
for (int i = 0; i != inst_num; ++i) {
|
||||
const RegExpInstruction& inst = insts[i];
|
||||
os << std::setfill('0') << std::setw(line_digit_num) << i << ": " << inst
|
||||
<< std::endl;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
// Reinterprets the payload of `raw_bytes` as a sequence of instructions.
// The byte length must be a multiple of the instruction size (checked in
// debug builds only).
Vector<RegExpInstruction> AsInstructionSequence(ByteArray raw_bytes) {
  int count = raw_bytes.length() / sizeof(RegExpInstruction);
  DCHECK_EQ(sizeof(RegExpInstruction) * count, raw_bytes.length());

  RegExpInstruction* first =
      reinterpret_cast<RegExpInstruction*>(raw_bytes.GetDataStartAddress());
  return Vector<RegExpInstruction>(first, count);
}
|
||||
|
||||
class Compiler : private RegExpVisitor {
|
||||
public:
|
||||
static Handle<ByteArray> Compile(RegExpTree* tree, Isolate* isolate,
|
||||
Zone* zone) {
|
||||
Compiler compiler(zone);
|
||||
|
||||
tree->Accept(&compiler, nullptr);
|
||||
compiler.code_.Add(RegExpInstruction::Accept(), zone);
|
||||
|
||||
int byte_length = sizeof(RegExpInstruction) * compiler.code_.length();
|
||||
Handle<ByteArray> array = isolate->factory()->NewByteArray(byte_length);
|
||||
MemCopy(array->GetDataStartAddress(), compiler.code_.begin(), byte_length);
|
||||
|
||||
return array;
|
||||
}
|
||||
|
||||
private:
|
||||
// TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed from
|
||||
// the `tree` size we're going to compile?
|
||||
explicit Compiler(Zone* zone) : zone_(zone), code_(0, zone) {}
|
||||
|
||||
// Generate a disjunction of code fragments compiled by a function `alt_gen`.
|
||||
// `alt_gen` is called repeatedly with argument `int i = 0, 1, ..., alt_num -
|
||||
// 1` and should push code corresponding to the ith alternative onto `code_`.
|
||||
template <class F>
|
||||
void CompileDisjunction(int alt_num, F gen_alt) {
|
||||
// An alternative a0 | a1 | a2 is compiled into
|
||||
// FORK <a2>
|
||||
// FORK <a1>
|
||||
// <a0>
|
||||
// JMP $end
|
||||
// <a1>
|
||||
// JMP $end
|
||||
// <a2>
|
||||
// where $end is the index of the next instruction after <a2>.
|
||||
//
|
||||
// By the semantics of the FORK instruction (see above at definition and
|
||||
// semantics), the forked thread has lower priority than the current
|
||||
// thread. This means that with the code we're generating here, the thread
|
||||
// matching the alternative a0 is indeed the thread with the highest
|
||||
// priority, followed by the thread for a1 and so on.
|
||||
|
||||
if (alt_num == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Record the index of the first of the alt_num - 1 fork instructions in the
|
||||
// beginning.
|
||||
int forks_begin = code_.length();
|
||||
// Add FORKs to alts[alt_num - 1], alts[alt_num - 2], ..., alts[1].
|
||||
for (int i = alt_num - 1; i != 0; --i) {
|
||||
// The FORK's address is patched once we know the address of the ith
|
||||
// alternative.
|
||||
code_.Add(RegExpInstruction::Fork(-1), zone_);
|
||||
}
|
||||
|
||||
// List containing the index of the final JMP instruction after each
|
||||
// alternative but the last one.
|
||||
ZoneList<int> jmp_indices(alt_num - 1, zone_);
|
||||
|
||||
for (int i = 0; i != alt_num; ++i) {
|
||||
if (i != 0) {
|
||||
// If this is not the first alternative, we have to patch the
|
||||
// corresponding FORK statement in the beginning.
|
||||
code_[forks_begin + alt_num - 1 - i].payload.pc = code_.length();
|
||||
}
|
||||
gen_alt(i);
|
||||
if (i != alt_num - 1) {
|
||||
// If this is not the last alternative, we have to emit a JMP past the
|
||||
// remaining alternatives. We don't know this address yet, so we have
|
||||
// to patch patch it once all alternatives are emitted.
|
||||
jmp_indices.Add(code_.length(), zone_);
|
||||
code_.Add(RegExpInstruction::Jmp(-1), zone_);
|
||||
}
|
||||
}
|
||||
|
||||
// All alternatives are emitted. Now we can patch the JMP instruction
|
||||
// after each but the last alternative.
|
||||
int end_index = code_.length();
|
||||
for (int jmp_index : jmp_indices) {
|
||||
code_[jmp_index].payload.pc = end_index;
|
||||
}
|
||||
}
|
||||
|
||||
void* VisitDisjunction(RegExpDisjunction* node, void*) override {
|
||||
ZoneList<RegExpTree*>& alts = *node->alternatives();
|
||||
CompileDisjunction(alts.length(),
|
||||
[&](int i) { alts[i]->Accept(this, nullptr); });
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitAlternative(RegExpAlternative* node, void*) override {
|
||||
for (RegExpTree* child : *node->nodes()) {
|
||||
child->Accept(this, nullptr);
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitAssertion(RegExpAssertion* node, void*) override {
|
||||
// TODO(mbid,v8:10765): Support this case.
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
// Compiles a character class as a disjunction with one CONSUME_RANGE
// instruction per (canonicalized, possibly negated) `CharacterRange`.
void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
  // We don't support utf16 for now, so only ranges that can be specified
  // by (complements of) ranges with uc16 bounds.
  STATIC_ASSERT(kMaxSupportedCodepoint <= std::numeric_limits<uc16>::max());

  ZoneList<CharacterRange>* ranges = node->ranges(zone_);
  CharacterRange::Canonicalize(ranges);

  if (node->is_negated()) {
    // Capacity 2 for the common case where we compute the complement of a
    // single interval range that doesn't contain 0 and kMaxCodePoint.
    ZoneList<CharacterRange>* complement =
        zone_->New<ZoneList<CharacterRange>>(2, zone_);
    CharacterRange::Negate(ranges, complement, zone_);
    ranges = complement;
  }

  // Emits the CONSUME_RANGE instruction for the range with the given index.
  auto emit_range = [&](int index) {
    auto& current = (*ranges)[index];

    uc32 lower = current.from();
    DCHECK_LE(lower, kMaxSupportedCodepoint);

    // An upper bound beyond the supported code points is only expected for
    // ranges that extend to the very last code point; it is clamped.
    uc32 upper = current.to();
    DCHECK_IMPLIES(upper > kMaxSupportedCodepoint,
                   upper == String::kMaxCodePoint);

    Uc16Range range{
        static_cast<uc16>(lower),
        static_cast<uc16>(std::min(upper, kMaxSupportedCodepoint))};
    code_.Add(RegExpInstruction::ConsumeRange(range), zone_);
  };
  CompileDisjunction(ranges->length(), emit_range);
  return nullptr;
}
|
||||
|
||||
// Compiles a literal string: one CONSUME_RANGE instruction per code unit,
// each accepting exactly that code unit.
void* VisitAtom(RegExpAtom* node, void*) override {
  for (uc16 code_unit : node->data()) {
    Uc16Range single{code_unit, code_unit};
    code_.Add(RegExpInstruction::ConsumeRange(single), zone_);
  }
  return nullptr;
}
|
||||
|
||||
// Compiles the greedy star quantifier /x*/.
//
// TODO(mbid,v8:10765): For now we support a quantifier of the form /x*/,
// i.e. greedy match of any number of /x/.  See also the comment in
// `CanBeHandledVisitor::VisitQuantifier`.
void* VisitQuantifier(RegExpQuantifier* node, void*) override {
  DCHECK_EQ(node->min(), 0);
  DCHECK_EQ(node->max(), RegExpTree::kInfinity);
  DCHECK(node->is_greedy());

  // Generated layout:
  //
  //   head: FORK exit
  //         <x>
  //         JMP head
  //   exit: ...
  //
  // Note that a FORKed thread has lower priority than the main thread, so
  // this will indeed match greedily.
  const int loop_head = code_.length();

  // The FORK target is not known before the body is emitted, so a
  // placeholder is written here and patched below.
  code_.Add(RegExpInstruction::Fork(-1), zone_);
  node->body()->Accept(this, nullptr);
  code_.Add(RegExpInstruction::Jmp(loop_head), zone_);

  code_[loop_head].payload.pc = code_.length();
  return nullptr;
}
|
||||
|
||||
// Capture groups are not compiled yet; reaching this visitor is treated as a
// programming error.
// TODO(mbid,v8:10765): Support this case.
void* VisitCapture(RegExpCapture* node, void*) override {
  UNREACHABLE();
}
|
||||
|
||||
// A group emits no instructions of its own; only its body is compiled.
void* VisitGroup(RegExpGroup* node, void*) override {
  RegExpTree* body = node->body();
  body->Accept(this, nullptr);
  return nullptr;
}
|
||||
|
||||
// Lookarounds are not compiled yet; reaching this visitor is treated as a
// programming error.
// TODO(mbid,v8:10765): Support this case.
void* VisitLookaround(RegExpLookaround* node, void*) override {
  UNREACHABLE();
}
|
||||
|
||||
// Back references are not compiled; reaching this visitor is treated as a
// programming error.
void* VisitBackReference(RegExpBackReference* node, void*) override {
  UNREACHABLE();
}
|
||||
|
||||
// The empty regexp emits no instructions.
void* VisitEmpty(RegExpEmpty* node, void*) override {
  return nullptr;
}
|
||||
|
||||
// Compiles a text node by compiling the tree of each of its elements in
// order.
void* VisitText(RegExpText* node, void*) override {
  for (TextElement& element : *node->elements()) {
    element.tree()->Accept(this, nullptr);
  }
  return nullptr;
}
|
||||
|
||||
private:
|
||||
Zone* zone_;
|
||||
ZoneList<RegExpInstruction> code_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
void ExperimentalRegExp::Compile(Isolate* isolate, Handle<JSRegExp> re) {
|
||||
DCHECK_EQ(re->TypeTag(), JSRegExp::EXPERIMENTAL);
|
||||
#ifdef VERIFY_HEAP
|
||||
@ -624,341 +68,32 @@ void ExperimentalRegExp::Compile(Isolate* isolate, Handle<JSRegExp> re) {
|
||||
RegExpParser::ParseRegExp(isolate, &zone, &reader, flags, &parse_result);
|
||||
CHECK(parse_success);
|
||||
|
||||
Handle<ByteArray> bytecode =
|
||||
Compiler::Compile(parse_result.tree, isolate, &zone);
|
||||
re->SetDataAt(JSRegExp::kIrregexpLatin1BytecodeIndex, *bytecode);
|
||||
re->SetDataAt(JSRegExp::kIrregexpUC16BytecodeIndex, *bytecode);
|
||||
ZoneList<RegExpInstruction> bytecode =
|
||||
ExperimentalRegExpCompiler::Compile(parse_result.tree, flags, &zone);
|
||||
|
||||
int byte_length = sizeof(RegExpInstruction) * bytecode.length();
|
||||
Handle<ByteArray> bytecode_byte_array =
|
||||
isolate->factory()->NewByteArray(byte_length);
|
||||
MemCopy(bytecode_byte_array->GetDataStartAddress(), bytecode.begin(),
|
||||
byte_length);
|
||||
|
||||
re->SetDataAt(JSRegExp::kIrregexpLatin1BytecodeIndex, *bytecode_byte_array);
|
||||
re->SetDataAt(JSRegExp::kIrregexpUC16BytecodeIndex, *bytecode_byte_array);
|
||||
|
||||
Handle<Code> trampoline = BUILTIN_CODE(isolate, RegExpExperimentalTrampoline);
|
||||
re->SetDataAt(JSRegExp::kIrregexpLatin1CodeIndex, *trampoline);
|
||||
re->SetDataAt(JSRegExp::kIrregexpUC16CodeIndex, *trampoline);
|
||||
}
|
||||
|
||||
namespace {
|
||||
// Reinterprets the payload of `raw_bytes` as a sequence of
// `RegExpInstruction`s.  The returned vector points into the byte array's
// data; nothing is copied.  The byte length is expected to be a whole
// multiple of the instruction size (checked in debug builds).
Vector<RegExpInstruction> AsInstructionSequence(ByteArray raw_bytes) {
  int inst_num = raw_bytes.length() / sizeof(RegExpInstruction);
  DCHECK_EQ(sizeof(RegExpInstruction) * inst_num, raw_bytes.length());

  auto* first_inst =
      reinterpret_cast<RegExpInstruction*>(raw_bytes.GetDataStartAddress());
  return Vector<RegExpInstruction>(first_inst, inst_num);
}
|
||||
|
||||
// Boundaries of a (sub)match within the input string, as the half-open
// interval [begin, end).  Output registers of a regexp match are accessed as
// a sequence of such pairs.
struct MatchRange {
  // Index of the first position belonging to the match.
  int32_t begin;
  // Index one past the last position belonging to the match.
  int32_t end;
};
|
||||
|
||||
template <class Character>
|
||||
class NfaInterpreter {
|
||||
// Executes a bytecode program in breadth-first mode, without backtracking.
|
||||
// `Character` can be instantiated with `uint8_t` or `uc16` for one byte or
|
||||
// two byte input strings.
|
||||
//
|
||||
// In contrast to the backtracking implementation, this has linear time
|
||||
// complexity in the length of the input string. Breadth-first mode means
|
||||
// that threads are executed in lockstep with respect to their input
|
||||
// position, i.e. the threads share a common input index. This is similar
|
||||
// to breadth-first simulation of a non-deterministic finite automaton (nfa),
|
||||
// hence the name of the class.
|
||||
//
|
||||
// To follow the semantics of a backtracking VM implementation, we have to be
|
||||
// careful about whether we stop execution when a thread executes ACCEPT.
|
||||
// For example, consider execution of the bytecode generated by the regexp
|
||||
//
|
||||
// r = /abc|..|[a-c]{10,}/
|
||||
//
|
||||
// on input "abcccccccccccccc". Clearly the three alternatives
|
||||
// - /abc/
|
||||
// - /../
|
||||
// - /[a-c]{10,}/
|
||||
// all match this input. A backtracking implementation will report "abc" as
|
||||
// match, because it explores the first alternative before the others.
|
||||
//
|
||||
// However, if we execute breadth first, then we execute the 3 threads
|
||||
// - t1, which tries to match /abc/
|
||||
// - t2, which tries to match /../
|
||||
// - t3, which tries to match /[a-c]{10,}/
|
||||
// in lockstep i.e. by iterating over the input and feeding all threads one
|
||||
// character at a time. t2 will execute an ACCEPT after two characters,
|
||||
// while t1 will only execute ACCEPT after three characters. Thus we find a
|
||||
// match for the second alternative before a match of the first alternative.
|
||||
//
|
||||
// This shows that we cannot always stop searching as soon as some thread t
|
||||
// executes ACCEPT: If there is a thread u with higher priority than t, then
|
||||
// it must be finished first. If u produces a match, then we can discard the
|
||||
// match of t because matches produced by threads with higher priority are
|
||||
// preferred over matches of threads with lower priority. On the other hand,
|
||||
// we are allowed to abort all threads with lower priority than t if t
|
||||
// produces a match: Such threads can only produce worse matches. In the
|
||||
// example above, we can abort t3 after two characters because of t2's match.
|
||||
//
|
||||
// Thus the interpreter keeps track of a priority-ordered list of threads.
|
||||
// If a thread ACCEPTs, all threads with lower priority are discarded, and
|
||||
// the search continues with the threads with higher priority. If no threads
|
||||
// with high priority are left, we return the match that was produced by the
|
||||
// ACCEPTing thread with highest priority.
|
||||
public:
|
||||
// Sets up an interpreter for `bytecode` that will scan `input` starting at
// `input_index`.  `pc_last_input_index_` gets one slot per instruction, each
// initialized to -1.
NfaInterpreter(Vector<const RegExpInstruction> bytecode,
               Vector<const Character> input, int32_t input_index)
    : bytecode_(bytecode),
      input_(input),
      input_index_(input_index),
      pc_last_input_index_(bytecode.size()),
      active_threads_(),
      blocked_threads_(),
      best_match_(base::nullopt) {
  DCHECK(!bytecode_.empty());
  DCHECK_GE(input_index_, 0);
  DCHECK_LE(input_index_, input_.length());

  for (int& last_input_index : pc_last_input_index_) last_input_index = -1;
}
|
||||
|
||||
// Finds up to `max_match_num` matches and writes their boundaries to
|
||||
// `matches_out`. The search begins at the current input index. Returns the
|
||||
// number of matches found.
|
||||
int FindMatches(MatchRange* matches_out, int max_match_num) {
|
||||
int match_num;
|
||||
for (match_num = 0; match_num != max_match_num; ++match_num) {
|
||||
base::Optional<MatchRange> match = FindNextMatch();
|
||||
if (!match.has_value()) {
|
||||
break;
|
||||
}
|
||||
|
||||
matches_out[match_num] = *match;
|
||||
SetInputIndex(match->end);
|
||||
}
|
||||
return match_num;
|
||||
}
|
||||
|
||||
private:
|
||||
// State of one "thread" of execution of experimental regexp bytecode.  This
// is a lightweight record, not an OS thread.
struct InterpreterThread {
  // Program counter: index within `bytecode_` of the instruction this thread
  // executes next.
  int32_t pc;
  // Input index at which this thread started executing.
  int32_t match_begin;
};
|
||||
|
||||
// Change the current input index for future calls to `FindNextMatch`.
|
||||
void SetInputIndex(int new_input_index) {
|
||||
DCHECK_GE(input_index_, 0);
|
||||
DCHECK_LE(input_index_, input_.length());
|
||||
|
||||
input_index_ = new_input_index;
|
||||
}
|
||||
|
||||
// Find the next match, begin search at input_index_;
|
||||
base::Optional<MatchRange> FindNextMatch() {
|
||||
DCHECK(active_threads_.empty());
|
||||
// TODO(mbid,v8:10765): Can we get around resetting `pc_last_input_index_`
|
||||
// here? As long as
|
||||
//
|
||||
// pc_last_input_index_[pc] < input_index_
|
||||
//
|
||||
// for all possible program counters pc that are reachable without input
|
||||
// from pc = 0 and
|
||||
//
|
||||
// pc_last_input_index_[k] <= input_index_
|
||||
//
|
||||
// for all k > 0 hold I think everything should be fine. Maybe we can do
|
||||
// something about this in `SetInputIndex`.
|
||||
std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1);
|
||||
|
||||
DCHECK(blocked_threads_.empty());
|
||||
DCHECK(active_threads_.empty());
|
||||
DCHECK_EQ(best_match_, base::nullopt);
|
||||
|
||||
// All threads start at bytecode 0.
|
||||
PushActiveThreadUnchecked(InterpreterThread{0, input_index_});
|
||||
// Run the initial thread, potentially forking new threads, until every
|
||||
// thread is blocked without further input.
|
||||
RunActiveThreads();
|
||||
|
||||
// We stop if one of the following conditions hold:
|
||||
// - We have exhausted the entire input.
|
||||
// - We have found a match at some point, and there are no remaining
|
||||
// threads with higher priority than the thread that produced the match.
|
||||
// Threads with low priority have been aborted earlier, and the remaining
|
||||
// threads are blocked here, so the latter simply means that
|
||||
// `blocked_threads_` is empty.
|
||||
while (input_index_ != input_.length() &&
|
||||
!(best_match_.has_value() && blocked_threads_.empty())) {
|
||||
DCHECK(active_threads_.empty());
|
||||
uc16 input_char = input_[input_index_];
|
||||
++input_index_;
|
||||
|
||||
// If we haven't found a match yet, we add a thread with least priority
|
||||
// that attempts a match starting after `input_char`.
|
||||
if (!best_match_.has_value()) {
|
||||
active_threads_.emplace_back(InterpreterThread{0, input_index_});
|
||||
}
|
||||
|
||||
// We unblock all blocked_threads_ by feeding them the input char.
|
||||
FlushBlockedThreads(input_char);
|
||||
|
||||
// Run all threads until they block or accept.
|
||||
RunActiveThreads();
|
||||
}
|
||||
|
||||
// Clean up the data structures we used.
|
||||
base::Optional<MatchRange> result = best_match_;
|
||||
best_match_ = base::nullopt;
|
||||
blocked_threads_.clear();
|
||||
active_threads_.clear();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Run an active thread `t` until it executes a CONSUME_RANGE or ACCEPT
|
||||
// instruction, or its PC value was already processed.
|
||||
// - If processing of `t` can't continue because of CONSUME_RANGE, it is
|
||||
// pushed on `blocked_threads_`.
|
||||
// - If `t` executes ACCEPT, set `best_match` according to `t.match_begin` and
|
||||
// the current input index. All remaining `active_threads_` are discarded.
|
||||
void RunActiveThread(InterpreterThread t) {
|
||||
while (true) {
|
||||
RegExpInstruction inst = bytecode_[t.pc];
|
||||
switch (inst.opcode) {
|
||||
case RegExpInstruction::CONSUME_RANGE: {
|
||||
blocked_threads_.emplace_back(t);
|
||||
return;
|
||||
}
|
||||
case RegExpInstruction::FORK: {
|
||||
InterpreterThread fork = t;
|
||||
fork.pc = inst.payload.pc;
|
||||
++t.pc;
|
||||
|
||||
// t has higher priority than fork. If t.pc hasn't been processed,we
|
||||
// push fork on the active_thread_ stack and continue directly with
|
||||
// t. Otherwise we continue directly with fork if possible.
|
||||
if (!IsPcProcessed(t.pc)) {
|
||||
MarkPcProcessed(t.pc);
|
||||
PushActiveThread(fork);
|
||||
break;
|
||||
} else if (!IsPcProcessed(fork.pc)) {
|
||||
t = fork;
|
||||
MarkPcProcessed(t.pc);
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
case RegExpInstruction::JMP:
|
||||
t.pc = inst.payload.pc;
|
||||
if (IsPcProcessed(t.pc)) return;
|
||||
MarkPcProcessed(t.pc);
|
||||
break;
|
||||
case RegExpInstruction::ACCEPT:
|
||||
best_match_ = MatchRange{t.match_begin, input_index_};
|
||||
active_threads_.clear();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Run each active thread until it can't continue without further input.
|
||||
// `active_threads_` is empty afterwards. `blocked_threads_` are sorted from
|
||||
// low to high priority.
|
||||
void RunActiveThreads() {
|
||||
while (!active_threads_.empty()) {
|
||||
InterpreterThread t = active_threads_.back();
|
||||
active_threads_.pop_back();
|
||||
RunActiveThread(t);
|
||||
}
|
||||
}
|
||||
|
||||
// Unblock all blocked_threads_ by feeding them an `input_char`. Should only
|
||||
// be called with `input_index_` pointing to the character *after*
|
||||
// `input_char` so that `pc_last_input_index_` is updated correctly.
|
||||
void FlushBlockedThreads(uc16 input_char) {
|
||||
// The threads in blocked_threads_ are sorted from high to low priority,
|
||||
// but active_threads_ needs to be sorted from low to high priority, so we
|
||||
// need to activate blocked threads in reverse order.
|
||||
//
|
||||
// TODO(mbid,v8:10765): base::SmallVector doesn't support `rbegin()` and
|
||||
// `rend()`, should we implement that instead of this awkward iteration?
|
||||
// Maybe we could at least use an int i and check for i >= 0, but
|
||||
// SmallVectors don't have length() methods.
|
||||
for (size_t i = blocked_threads_.size(); i > 0; --i) {
|
||||
InterpreterThread t = blocked_threads_[i - 1];
|
||||
RegExpInstruction inst = bytecode_[t.pc];
|
||||
DCHECK_EQ(inst.opcode, RegExpInstruction::CONSUME_RANGE);
|
||||
Uc16Range range = inst.payload.consume_range;
|
||||
if (input_char >= range.min && input_char <= range.max) {
|
||||
++t.pc;
|
||||
PushActiveThreadUnchecked(t);
|
||||
}
|
||||
}
|
||||
blocked_threads_.clear();
|
||||
}
|
||||
|
||||
// It is redundant to have two threads t, t0 execute at the same PC value,
|
||||
// because one of t, t0 matches iff the other does. We can thus discard
|
||||
// the one with lower priority. We check whether a thread executed at some
|
||||
// PC value by recording for every possible value of PC what the value of
|
||||
// input_index_ was the last time a thread executed at PC. If a thread
|
||||
// tries to continue execution at a PC value that we have seen before at
|
||||
// the current input index, we abort it. (We execute threads with higher
|
||||
// priority first, so the second thread is guaranteed to have lower
|
||||
// priority.)
|
||||
//
|
||||
// Check whether we've seen an active thread with a given pc value since the
|
||||
// last increment of `input_index_`.
|
||||
bool IsPcProcessed(int pc) {
|
||||
DCHECK_LE(pc_last_input_index_[pc], input_index_);
|
||||
return pc_last_input_index_[pc] == input_index_;
|
||||
}
|
||||
|
||||
// Mark a pc as having been processed since the last increment of
|
||||
// `input_index_`.
|
||||
void MarkPcProcessed(int pc) {
|
||||
DCHECK_LE(pc_last_input_index_[pc], input_index_);
|
||||
pc_last_input_index_[pc] = input_index_;
|
||||
}
|
||||
|
||||
// Functions to push a thread `t` onto the list of active threads, but only
|
||||
// if `t.pc` was not already the pc of some other thread at the current
|
||||
// subject index.
|
||||
void PushActiveThreadUnchecked(InterpreterThread t) {
|
||||
DCHECK(!IsPcProcessed(t.pc));
|
||||
|
||||
MarkPcProcessed(t.pc);
|
||||
active_threads_.emplace_back(t);
|
||||
}
|
||||
// Puts `t` onto the list of active threads unless `t.pc` was already
// processed at the current input index, in which case `t` is dropped.
void PushActiveThread(InterpreterThread t) {
  if (!IsPcProcessed(t.pc)) {
    PushActiveThreadUnchecked(t);
  }
}
|
||||
|
||||
Vector<const RegExpInstruction> bytecode_;
|
||||
Vector<const Character> input_;
|
||||
int input_index_;
|
||||
|
||||
// TODO(mbid,v8:10765): The following `SmallVector`s have somewhat
|
||||
// arbitrarily chosen small capacity sizes; should benchmark to find a good
|
||||
// value.
|
||||
|
||||
// pc_last_input_index_[k] records the value of input_index_ the last
|
||||
// time a thread t such that t.pc == k was activated, i.e. put on
|
||||
// active_threads_. Thus pc_last_input_index.size() == bytecode.size(). See
|
||||
// also `RunActiveThread`.
|
||||
base::SmallVector<int, 64> pc_last_input_index_;
|
||||
|
||||
// Active threads can potentially (but not necessarily) continue without
|
||||
// input. Sorted from low to high priority.
|
||||
base::SmallVector<InterpreterThread, 64> active_threads_;
|
||||
|
||||
// The pc of a blocked thread points to an instruction that consumes a
|
||||
// character. Sorted from high to low priority (so the opposite of
|
||||
// `active_threads_`).
|
||||
base::SmallVector<InterpreterThread, 64> blocked_threads_;
|
||||
|
||||
// The best match found so far during the current search. If several threads
|
||||
// ACCEPTed, then this will be the match of the accepting thread with highest
|
||||
// priority.
|
||||
base::Optional<MatchRange> best_match_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
using MatchRange = ExperimentalRegExpInterpreter::MatchRange;
|
||||
|
||||
// Returns the number of matches.
|
||||
int32_t ExperimentalRegExp::ExecRaw(JSRegExp regexp, String subject,
|
||||
@ -990,13 +125,13 @@ int32_t ExperimentalRegExp::ExecRaw(JSRegExp regexp, String subject,
|
||||
const int32_t max_match_num = output_register_count / 2;
|
||||
|
||||
if (subject_content.IsOneByte()) {
|
||||
NfaInterpreter<uint8_t> interpreter(
|
||||
bytecode, subject_content.ToOneByteVector(), subject_index);
|
||||
return interpreter.FindMatches(matches, max_match_num);
|
||||
return ExperimentalRegExpInterpreter::FindMatchesNfaOneByte(
|
||||
bytecode, subject_content.ToOneByteVector(), subject_index, matches,
|
||||
max_match_num);
|
||||
} else {
|
||||
NfaInterpreter<uc16> interpreter(bytecode, subject_content.ToUC16Vector(),
|
||||
subject_index);
|
||||
return interpreter.FindMatches(matches, max_match_num);
|
||||
return ExperimentalRegExpInterpreter::FindMatchesNfaTwoByte(
|
||||
bytecode, subject_content.ToUC16Vector(), subject_index, matches,
|
||||
max_match_num);
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user