[regexp] Support capture groups in experimental engine
This commit adds support for capture groups (as in e.g. /x(123|abc)y/) in the experimental regexp engine. Now every InterpreterThread owns a register array containing (sub)match boundaries. There is a new instruction to record the current input index in some register. Submatches in quantifier bodies should be reported only if they occur during the last repetition. Thus we reset those registers before attempting to match the body of a quantifier. This is implemented with another new instruction. Because of concerns for the growing size of the NfaInterpreter object (which is allocated on the stack), this commit replaces the `SmallVector` members of the NfaInterpreter with zone-allocated arrays. Register arrays, which for a fixed regexp are all the same size, are allocated with a RecyclingZoneAllocator for cheap memory reclamation via a linked list of equally-sized free blocks. Possible optimizations for management of register array memory: 1. If there are few registers per thread, then it is likely faster to store them inline in the InterpreterThread struct. 2. re2 implements copy-on-write: InterpreterThreads can share the same register array. If a thread attempts to write to a shared register array, the register array is cloned first. 3. The register at index 1 contains the end of the match; this is only written to right before an ACCEPT statement. We could make ACCEPT equivalent to what's currently CAPTURE 1 followed by ACCEPT. We could then save the memory for register 1 for threads that haven't finished yet. This is particularly interesting if optimization 1 now kicks in. Cq-Include-Trybots: luci.v8.try:v8_linux64_fyi_rel_ng Bug: v8:10765 Change-Id: I2c0503206ce331e13ac9912945bb66736d740197 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2390770 Commit-Queue: Martin Bidlingmaier <mbid@google.com> Reviewed-by: Jakob Gruber <jgruber@chromium.org> Cr-Commit-Position: refs/heads/master@{#69929}
This commit is contained in:
parent
10ffb113e2
commit
98b8ca89a2
@ -257,11 +257,15 @@ TNode<JSRegExpResult> RegExpBuiltinsAssembler::ConstructNewResultFromMatchInfo(
|
||||
TNode<FixedArray> data =
|
||||
CAST(LoadObjectField(regexp, JSRegExp::kDataOffset));
|
||||
|
||||
// We reach this point only if captures exist, implying that this is an
|
||||
// IRREGEXP JSRegExp.
|
||||
CSA_ASSERT(this,
|
||||
SmiEqual(CAST(LoadFixedArrayElement(data, JSRegExp::kTagIndex)),
|
||||
SmiConstant(JSRegExp::IRREGEXP)));
|
||||
// We reach this point only if captures exist, implying that the assigned
|
||||
// regexp engine must be able to handle captures.
|
||||
CSA_ASSERT(
|
||||
this,
|
||||
Word32Or(
|
||||
SmiEqual(CAST(LoadFixedArrayElement(data, JSRegExp::kTagIndex)),
|
||||
SmiConstant(JSRegExp::IRREGEXP)),
|
||||
SmiEqual(CAST(LoadFixedArrayElement(data, JSRegExp::kTagIndex)),
|
||||
SmiConstant(JSRegExp::EXPERIMENTAL))));
|
||||
|
||||
// The names fixed array associates names at even indices with a capture
|
||||
// index at odd indices.
|
||||
|
@ -1242,10 +1242,8 @@ void JSRegExp::JSRegExpVerify(Isolate* isolate) {
|
||||
|
||||
CHECK_EQ(arr.get(JSRegExp::kIrregexpMaxRegisterCountIndex),
|
||||
uninitialized);
|
||||
// TODO(mbid,v8:10765): Once the EXPERIMENTAL regexps support captures,
|
||||
// the capture count should be allowed to be a Smi >= 0.
|
||||
CHECK_EQ(arr.get(JSRegExp::kIrregexpCaptureCountIndex), Smi::FromInt(0));
|
||||
CHECK_EQ(arr.get(JSRegExp::kIrregexpCaptureNameMapIndex), uninitialized);
|
||||
CHECK(arr.get(JSRegExp::kIrregexpCaptureCountIndex).IsSmi());
|
||||
CHECK_GE(Smi::ToInt(arr.get(JSRegExp::kIrregexpCaptureCountIndex)), 0);
|
||||
CHECK_EQ(arr.get(JSRegExp::kIrregexpTicksUntilTierUpIndex),
|
||||
uninitialized);
|
||||
CHECK_EQ(arr.get(JSRegExp::kIrregexpBacktrackLimit), uninitialized);
|
||||
@ -1282,6 +1280,7 @@ void JSRegExp::JSRegExpVerify(Isolate* isolate) {
|
||||
CHECK_IMPLIES(uc16_data.IsSmi(), uc16_bytecode.IsSmi());
|
||||
|
||||
CHECK(arr.get(JSRegExp::kIrregexpCaptureCountIndex).IsSmi());
|
||||
CHECK_GE(Smi::ToInt(arr.get(JSRegExp::kIrregexpCaptureCountIndex)), 0);
|
||||
CHECK(arr.get(JSRegExp::kIrregexpMaxRegisterCountIndex).IsSmi());
|
||||
CHECK(arr.get(JSRegExp::kIrregexpTicksUntilTierUpIndex).IsSmi());
|
||||
CHECK(arr.get(JSRegExp::kIrregexpBacktrackLimit).IsSmi());
|
||||
|
@ -67,7 +67,7 @@ String JSRegExp::Pattern() {
|
||||
|
||||
Object JSRegExp::CaptureNameMap() {
|
||||
DCHECK(this->data().IsFixedArray());
|
||||
DCHECK_EQ(TypeTag(), IRREGEXP);
|
||||
DCHECK(TypeSupportsCaptures(TypeTag()));
|
||||
Object value = DataAt(kIrregexpCaptureNameMapIndex);
|
||||
DCHECK_NE(value, Smi::FromInt(JSRegExp::kUninitializedValue));
|
||||
return value;
|
||||
@ -85,6 +85,14 @@ void JSRegExp::SetDataAt(int index, Object value) {
|
||||
FixedArray::cast(data()).set(index, value);
|
||||
}
|
||||
|
||||
// Stores `capture_name_map` in this regexp's data array at
// kIrregexpCaptureNameMapIndex. A null handle is recorded as Smi::zero()
// instead of a FixedArray.
void JSRegExp::SetCaptureNameMap(Handle<FixedArray> capture_name_map) {
|
||||
if (capture_name_map.is_null()) {
|
||||
SetDataAt(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::zero());
|
||||
} else {
|
||||
SetDataAt(JSRegExp::kIrregexpCaptureNameMapIndex, *capture_name_map);
|
||||
}
|
||||
}
|
||||
|
||||
bool JSRegExp::HasCompiledCode() const {
|
||||
if (TypeTag() != IRREGEXP) return false;
|
||||
Smi uninitialized = Smi::FromInt(kUninitializedValue);
|
||||
|
@ -89,6 +89,9 @@ class JSRegExp : public TorqueGeneratedJSRegExp<JSRegExp, JSObject> {
|
||||
void MarkTierUpForNextExec();
|
||||
|
||||
inline Type TypeTag() const;
|
||||
// Returns true exactly when `t` is IRREGEXP or EXPERIMENTAL, and false for
// every other type tag.
static bool TypeSupportsCaptures(Type t) {
|
||||
return t == IRREGEXP || t == EXPERIMENTAL;
|
||||
}
|
||||
|
||||
// Maximum number of captures allowed.
|
||||
static constexpr int kMaxCaptures = 1 << 16;
|
||||
@ -105,6 +108,7 @@ class JSRegExp : public TorqueGeneratedJSRegExp<JSRegExp, JSObject> {
|
||||
inline Object DataAt(int index) const;
|
||||
// Set implementation data after the object has been prepared.
|
||||
inline void SetDataAt(int index, Object value);
|
||||
inline void SetCaptureNameMap(Handle<FixedArray> capture_name_map);
|
||||
|
||||
static constexpr int code_index(bool is_latin1) {
|
||||
return is_latin1 ? kIrregexpLatin1CodeIndex : kIrregexpUC16CodeIndex;
|
||||
|
@ -41,6 +41,12 @@ std::ostream& operator<<(std::ostream& os, const RegExpInstruction& inst) {
|
||||
case RegExpInstruction::ACCEPT:
|
||||
os << "ACCEPT";
|
||||
break;
|
||||
case RegExpInstruction::SET_REGISTER_TO_CP:
|
||||
os << "SET_REGISTER_TO_CP " << inst.payload.register_index;
|
||||
break;
|
||||
case RegExpInstruction::CLEAR_REGISTER:
|
||||
os << "CLEAR_REGISTER " << inst.payload.register_index;
|
||||
break;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
@ -46,6 +46,10 @@
|
||||
// - JMP: Instead of incrementing the PC value after execution of this
|
||||
// instruction by 1, set PC of this thread to the value specified in the
|
||||
// instruction payload and continue there.
|
||||
// - SET_REGISTER_TO_CP: Set a register specified in the payload to the current
|
||||
// position (CP) within the input, then continue with the next instruction.
|
||||
// - CLEAR_REGISTER: Clear the register specified in the payload by resetting
|
||||
// it to the initial value -1.
|
||||
//
|
||||
// Special care must be exercised with respect to thread priority. It is
|
||||
// possible that more than one thread executes an ACCEPT statement. The output
|
||||
@ -91,6 +95,8 @@ struct RegExpInstruction {
|
||||
FORK,
|
||||
JMP,
|
||||
ACCEPT,
|
||||
SET_REGISTER_TO_CP,
|
||||
CLEAR_REGISTER,
|
||||
};
|
||||
|
||||
struct Uc16Range {
|
||||
@ -125,12 +131,28 @@ struct RegExpInstruction {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Builds a SET_REGISTER_TO_CP instruction whose payload names the register
// `register_index`.
static RegExpInstruction SetRegisterToCp(int32_t register_index) {
|
||||
RegExpInstruction result;
|
||||
result.opcode = SET_REGISTER_TO_CP;
|
||||
result.payload.register_index = register_index;
|
||||
return result;
|
||||
}
|
||||
|
||||
// Builds a CLEAR_REGISTER instruction whose payload names the register
// `register_index`.
static RegExpInstruction ClearRegister(int32_t register_index) {
|
||||
RegExpInstruction result;
|
||||
result.opcode = CLEAR_REGISTER;
|
||||
result.payload.register_index = register_index;
|
||||
return result;
|
||||
}
|
||||
|
||||
Opcode opcode;
|
||||
union {
|
||||
// Payload of CONSUME_RANGE:
|
||||
Uc16Range consume_range;
|
||||
// Payload of FORK and JMP, the next/forked program counter (pc):
|
||||
int32_t pc;
|
||||
// Payload of SET_REGISTER_TO_CP and CLEAR_REGISTER:
|
||||
int32_t register_index;
|
||||
} payload;
|
||||
STATIC_ASSERT(sizeof(payload) == 4);
|
||||
};
|
||||
|
@ -21,9 +21,7 @@ class CanBeHandledVisitor final : private RegExpVisitor {
|
||||
public:
|
||||
static bool Check(RegExpTree* node, JSRegExp::Flags flags, int capture_count,
|
||||
Zone* zone) {
|
||||
if (!AreSuitableFlags(flags) || capture_count > 0) {
|
||||
return false;
|
||||
}
|
||||
if (!AreSuitableFlags(flags)) return false;
|
||||
CanBeHandledVisitor visitor(zone);
|
||||
node->Accept(&visitor, nullptr);
|
||||
return visitor.result_;
|
||||
@ -151,9 +149,7 @@ class CanBeHandledVisitor final : private RegExpVisitor {
|
||||
}
|
||||
|
||||
void* VisitCapture(RegExpCapture* node, void*) override {
|
||||
// TODO(mbid, v8:10765): This can be implemented with the NFA interpreter,
|
||||
// but not with the lazy DFA. See also re2.
|
||||
result_ = false;
|
||||
node->body()->Accept(this, nullptr);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@ -287,7 +283,9 @@ class CompileVisitor : private RegExpVisitor {
|
||||
Zone* zone) {
|
||||
CompileVisitor compiler(zone);
|
||||
|
||||
compiler.code_.Add(RegExpInstruction::SetRegisterToCp(0), zone);
|
||||
tree->Accept(&compiler, nullptr);
|
||||
compiler.code_.Add(RegExpInstruction::SetRegisterToCp(1), zone);
|
||||
compiler.code_.Add(RegExpInstruction::Accept(), zone);
|
||||
|
||||
return std::move(compiler.code_);
|
||||
@ -404,11 +402,35 @@ class CompileVisitor : private RegExpVisitor {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitQuantifier(RegExpQuantifier* node, void*) override {
|
||||
// First repeat the body `min()` times.
|
||||
for (int i = 0; i != node->min(); ++i) {
|
||||
node->body()->Accept(this, nullptr);
|
||||
// Appends one CLEAR_REGISTER instruction to `code_` for every second register
// index in `indices`, starting at `indices.from()`. Emits nothing for an empty
// interval. The interval is expected to start at an even index and end at an
// odd index (checked with DCHECKs).
void ClearRegisters(Interval indices) {
|
||||
if (indices.is_empty()) return;
|
||||
DCHECK_EQ(indices.from() % 2, 0);
|
||||
DCHECK_EQ(indices.to() % 2, 1);
|
||||
for (int i = indices.from(); i <= indices.to(); i += 2) {
|
||||
// It suffices to clear the register containing the `begin` of a capture
|
||||
// because this indicates that the capture is undefined, regardless of
|
||||
// the value in the `end` register.
|
||||
code_.Add(RegExpInstruction::ClearRegister(i), zone_);
|
||||
}
|
||||
}
|
||||
|
||||
void* VisitQuantifier(RegExpQuantifier* node, void*) override {
|
||||
// Emit the body, but clear registers occurring in body first.
|
||||
//
|
||||
// TODO(mbid,v8:10765): It's not always necessary to a) capture registers
|
||||
// and b) clear them. For example, we don't have to capture anything for
|
||||
// the first 4 repetitions if node->min() >= 5, and then we don't have to
|
||||
// clear registers in the first node->min() repetitions.
|
||||
// Later, and if node->min() == 0, we don't have to clear registers before
|
||||
// the first optional repetition.
|
||||
Interval body_registers = node->body()->CaptureRegisters();
|
||||
auto emit_body = [&]() {
|
||||
ClearRegisters(body_registers);
|
||||
node->body()->Accept(this, nullptr);
|
||||
};
|
||||
|
||||
// First repeat the body `min()` times.
|
||||
for (int i = 0; i != node->min(); ++i) emit_body();
|
||||
|
||||
switch (node->quantifier_type()) {
|
||||
case RegExpQuantifier::POSSESSIVE:
|
||||
@ -430,7 +452,7 @@ class CompileVisitor : private RegExpVisitor {
|
||||
DeferredLabel end;
|
||||
|
||||
AddForkTo(end, code_, zone_);
|
||||
node->body()->Accept(this, nullptr);
|
||||
emit_body();
|
||||
AddJmpTo(begin, code_, zone_);
|
||||
|
||||
std::move(end).Bind(code_);
|
||||
@ -452,7 +474,7 @@ class CompileVisitor : private RegExpVisitor {
|
||||
DeferredLabel end;
|
||||
for (int i = node->min(); i != node->max(); ++i) {
|
||||
AddForkTo(end, code_, zone_);
|
||||
node->body()->Accept(this, nullptr);
|
||||
emit_body();
|
||||
}
|
||||
std::move(end).Bind(code_);
|
||||
}
|
||||
@ -478,7 +500,7 @@ class CompileVisitor : private RegExpVisitor {
|
||||
|
||||
DCHECK_EQ(body.index(), code_.length());
|
||||
|
||||
node->body()->Accept(this, nullptr);
|
||||
emit_body();
|
||||
AddForkTo(body, code_, zone_);
|
||||
|
||||
std::move(end).Bind(code_);
|
||||
@ -509,20 +531,24 @@ class CompileVisitor : private RegExpVisitor {
|
||||
|
||||
DCHECK_EQ(body.index(), code_.length());
|
||||
|
||||
node->body()->Accept(this, nullptr);
|
||||
emit_body();
|
||||
}
|
||||
std::move(end).Bind(code_);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitCapture(RegExpCapture* node, void*) override {
|
||||
// TODO(mbid,v8:10765): Support this case.
|
||||
UNREACHABLE();
|
||||
int index = node->index();
|
||||
int start_register = RegExpCapture::StartRegister(index);
|
||||
int end_register = RegExpCapture::EndRegister(index);
|
||||
code_.Add(RegExpInstruction::SetRegisterToCp(start_register), zone_);
|
||||
node->body()->Accept(this, nullptr);
|
||||
code_.Add(RegExpInstruction::SetRegisterToCp(end_register), zone_);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitGroup(RegExpGroup* node, void*) override {
|
||||
|
@ -5,16 +5,17 @@
|
||||
#include "src/regexp/experimental/experimental-interpreter.h"
|
||||
|
||||
#include "src/base/optional.h"
|
||||
#include "src/base/small-vector.h"
|
||||
#include "src/regexp/experimental/experimental.h"
|
||||
#include "src/zone/zone-allocator.h"
|
||||
#include "src/zone/zone-list-inl.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
using MatchRange = ExperimentalRegExpInterpreter::MatchRange;
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr int kUndefinedRegisterValue = -1;
|
||||
|
||||
template <class Character>
|
||||
class NfaInterpreter {
|
||||
// Executes a bytecode program in breadth-first mode, without backtracking.
|
||||
@ -66,14 +67,19 @@ class NfaInterpreter {
|
||||
// ACCEPTing thread with highest priority.
|
||||
public:
|
||||
NfaInterpreter(Vector<const RegExpInstruction> bytecode,
|
||||
Vector<const Character> input, int32_t input_index)
|
||||
int register_count_per_match, Vector<const Character> input,
|
||||
int32_t input_index, Zone* zone)
|
||||
: bytecode_(bytecode),
|
||||
register_count_per_match_(register_count_per_match),
|
||||
input_(input),
|
||||
input_index_(input_index),
|
||||
pc_last_input_index_(bytecode.size()),
|
||||
active_threads_(),
|
||||
blocked_threads_(),
|
||||
best_match_(base::nullopt) {
|
||||
pc_last_input_index_(zone->NewArray<int>(bytecode.length()),
|
||||
bytecode.length()),
|
||||
active_threads_(0, zone),
|
||||
blocked_threads_(0, zone),
|
||||
register_array_allocator_(zone),
|
||||
best_match_registers_(base::nullopt),
|
||||
zone_(zone) {
|
||||
DCHECK(!bytecode_.empty());
|
||||
DCHECK_GE(input_index_, 0);
|
||||
DCHECK_LE(input_index_, input_.length());
|
||||
@ -81,31 +87,38 @@ class NfaInterpreter {
|
||||
std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1);
|
||||
}
|
||||
|
||||
// Finds up to `max_match_num` matches and writes their boundaries to
|
||||
// `matches_out`. The search begins at the current input index. Returns the
|
||||
// number of matches found.
|
||||
int FindMatches(MatchRange* matches_out, int max_match_num) {
|
||||
// Finds matches and writes their concatenated capture registers to
|
||||
// `output_registers`. `output_registers[i]` has to be valid for all i <
|
||||
// output_register_count. The search continues until all remaining matches
|
||||
// have been found or there is no space left in `output_registers`. Returns
|
||||
// the number of matches found.
|
||||
int FindMatches(int32_t* output_registers, int output_register_count) {
|
||||
const int max_match_num = output_register_count / register_count_per_match_;
|
||||
|
||||
int match_num = 0;
|
||||
while (match_num != max_match_num) {
|
||||
base::Optional<MatchRange> match = FindNextMatch();
|
||||
if (!match.has_value()) {
|
||||
break;
|
||||
}
|
||||
FindNextMatch();
|
||||
if (!FoundMatch()) break;
|
||||
Vector<int> registers = *best_match_registers_;
|
||||
|
||||
matches_out[match_num] = *match;
|
||||
output_registers =
|
||||
std::copy(registers.begin(), registers.end(), output_registers);
|
||||
++match_num;
|
||||
|
||||
int match_length = match->end - match->begin;
|
||||
const int match_begin = registers[0];
|
||||
const int match_end = registers[1];
|
||||
DCHECK_LE(match_begin, match_end);
|
||||
const int match_length = match_end - match_begin;
|
||||
if (match_length != 0) {
|
||||
SetInputIndex(match->end);
|
||||
} else if (match->end == input_.length()) {
|
||||
SetInputIndex(match_end);
|
||||
} else if (match_end == input_.length()) {
|
||||
// Zero-length match, input exhausted.
|
||||
SetInputIndex(match->end);
|
||||
SetInputIndex(match_end);
|
||||
break;
|
||||
} else {
|
||||
// Zero-length match, more input. We don't want to report more matches
|
||||
// here endlessly, so we advance by 1.
|
||||
SetInputIndex(match->end + 1);
|
||||
SetInputIndex(match_end + 1);
|
||||
|
||||
// TODO(mbid,v8:10765): If we're in unicode mode, we have to advance to
|
||||
// the next codepoint, not to the next code unit. See also
|
||||
@ -113,6 +126,7 @@ class NfaInterpreter {
|
||||
STATIC_ASSERT(!ExperimentalRegExp::kSupportsUnicode);
|
||||
}
|
||||
}
|
||||
|
||||
return match_num;
|
||||
}
|
||||
|
||||
@ -122,9 +136,11 @@ class NfaInterpreter {
|
||||
struct InterpreterThread {
|
||||
// This thread's program counter, i.e. the index within `bytecode_` of the
|
||||
// next instruction to be executed.
|
||||
int32_t pc;
|
||||
// The index in the input string where this thread started executing.
|
||||
int32_t match_begin;
|
||||
int pc;
|
||||
// Pointer to the array of registers, which is always size
|
||||
// `register_count_per_match_`. Should be deallocated with
|
||||
// `register_array_allocator_`.
|
||||
int* register_array_begin;
|
||||
};
|
||||
|
||||
// Change the current input index for future calls to `FindNextMatch`.
|
||||
@ -135,9 +151,11 @@ class NfaInterpreter {
|
||||
input_index_ = new_input_index;
|
||||
}
|
||||
|
||||
// Find the next match, begin search at input_index_;
|
||||
base::Optional<MatchRange> FindNextMatch() {
|
||||
DCHECK(active_threads_.empty());
|
||||
// Find the next match and write its capture registers to
|
||||
// `best_match_registers_`. The search starts at the current
|
||||
// `input_index_`.
|
||||
void FindNextMatch() {
|
||||
DCHECK(active_threads_.is_empty());
|
||||
// TODO(mbid,v8:10765): Can we get around resetting `pc_last_input_index_`
|
||||
// here? As long as
|
||||
//
|
||||
@ -152,12 +170,25 @@ class NfaInterpreter {
|
||||
// something about this in `SetInputIndex`.
|
||||
std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1);
|
||||
|
||||
DCHECK(blocked_threads_.empty());
|
||||
DCHECK(active_threads_.empty());
|
||||
DCHECK_EQ(best_match_, base::nullopt);
|
||||
// Clean up left-over data from a previous call to FindNextMatch.
|
||||
for (InterpreterThread t : blocked_threads_) {
|
||||
DestroyThread(t);
|
||||
}
|
||||
blocked_threads_.DropAndClear();
|
||||
|
||||
for (InterpreterThread t : active_threads_) {
|
||||
DestroyThread(t);
|
||||
}
|
||||
active_threads_.DropAndClear();
|
||||
|
||||
if (best_match_registers_.has_value()) {
|
||||
FreeRegisterArray(best_match_registers_->begin());
|
||||
best_match_registers_ = base::nullopt;
|
||||
}
|
||||
|
||||
// All threads start at bytecode 0.
|
||||
active_threads_.emplace_back(InterpreterThread{0, input_index_});
|
||||
active_threads_.Add(
|
||||
InterpreterThread{0, NewRegisterArray(kUndefinedRegisterValue)}, zone_);
|
||||
// Run the initial thread, potentially forking new threads, until every
|
||||
// thread is blocked without further input.
|
||||
RunActiveThreads();
|
||||
@ -170,15 +201,17 @@ class NfaInterpreter {
|
||||
// threads are blocked here, so the latter simply means that
|
||||
// `blocked_threads_` is empty.
|
||||
while (input_index_ != input_.length() &&
|
||||
!(best_match_.has_value() && blocked_threads_.empty())) {
|
||||
DCHECK(active_threads_.empty());
|
||||
!(FoundMatch() && blocked_threads_.is_empty())) {
|
||||
DCHECK(active_threads_.is_empty());
|
||||
uc16 input_char = input_[input_index_];
|
||||
++input_index_;
|
||||
|
||||
// If we haven't found a match yet, we add a thread with least priority
|
||||
// that attempts a match starting after `input_char`.
|
||||
if (!best_match_.has_value()) {
|
||||
active_threads_.emplace_back(InterpreterThread{0, input_index_});
|
||||
if (!FoundMatch()) {
|
||||
active_threads_.Add(
|
||||
InterpreterThread{0, NewRegisterArray(kUndefinedRegisterValue)},
|
||||
zone_);
|
||||
}
|
||||
|
||||
// We unblock all blocked_threads_ by feeding them the input char.
|
||||
@ -187,14 +220,6 @@ class NfaInterpreter {
|
||||
// Run all threads until they block or accept.
|
||||
RunActiveThreads();
|
||||
}
|
||||
|
||||
// Clean up the data structures we used.
|
||||
base::Optional<MatchRange> result = best_match_;
|
||||
best_match_ = base::nullopt;
|
||||
blocked_threads_.clear();
|
||||
active_threads_.clear();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Run an active thread `t` until it executes a CONSUME_RANGE or ACCEPT
|
||||
@ -211,13 +236,19 @@ class NfaInterpreter {
|
||||
RegExpInstruction inst = bytecode_[t.pc];
|
||||
switch (inst.opcode) {
|
||||
case RegExpInstruction::CONSUME_RANGE: {
|
||||
blocked_threads_.emplace_back(t);
|
||||
blocked_threads_.Add(t, zone_);
|
||||
return;
|
||||
}
|
||||
case RegExpInstruction::FORK: {
|
||||
InterpreterThread fork = t;
|
||||
fork.pc = inst.payload.pc;
|
||||
active_threads_.emplace_back(fork);
|
||||
InterpreterThread fork{inst.payload.pc,
|
||||
NewRegisterArrayUninitialized()};
|
||||
Vector<int> fork_registers = GetRegisterArray(fork);
|
||||
Vector<int> t_registers = GetRegisterArray(t);
|
||||
DCHECK_EQ(fork_registers.length(), t_registers.length());
|
||||
std::copy(t_registers.begin(), t_registers.end(),
|
||||
fork_registers.begin());
|
||||
active_threads_.Add(fork, zone_);
|
||||
|
||||
++t.pc;
|
||||
break;
|
||||
}
|
||||
@ -225,9 +256,25 @@ class NfaInterpreter {
|
||||
t.pc = inst.payload.pc;
|
||||
break;
|
||||
case RegExpInstruction::ACCEPT:
|
||||
best_match_ = MatchRange{t.match_begin, input_index_};
|
||||
active_threads_.clear();
|
||||
if (best_match_registers_.has_value()) {
|
||||
FreeRegisterArray(best_match_registers_->begin());
|
||||
}
|
||||
best_match_registers_ = GetRegisterArray(t);
|
||||
|
||||
for (InterpreterThread s : active_threads_) {
|
||||
FreeRegisterArray(s.register_array_begin);
|
||||
}
|
||||
active_threads_.DropAndClear();
|
||||
return;
|
||||
case RegExpInstruction::SET_REGISTER_TO_CP:
|
||||
GetRegisterArray(t)[inst.payload.register_index] = input_index_;
|
||||
++t.pc;
|
||||
break;
|
||||
case RegExpInstruction::CLEAR_REGISTER:
|
||||
GetRegisterArray(t)[inst.payload.register_index] =
|
||||
kUndefinedRegisterValue;
|
||||
++t.pc;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -236,10 +283,8 @@ class NfaInterpreter {
|
||||
// `active_threads_` is empty afterwards. `blocked_threads_` are sorted from
|
||||
// high to low priority.
|
||||
void RunActiveThreads() {
|
||||
while (!active_threads_.empty()) {
|
||||
InterpreterThread t = active_threads_.back();
|
||||
active_threads_.pop_back();
|
||||
RunActiveThread(t);
|
||||
while (!active_threads_.is_empty()) {
|
||||
RunActiveThread(active_threads_.RemoveLast());
|
||||
}
|
||||
}
|
||||
|
||||
@ -250,22 +295,45 @@ class NfaInterpreter {
|
||||
// The threads in blocked_threads_ are sorted from high to low priority,
|
||||
// but active_threads_ needs to be sorted from low to high priority, so we
|
||||
// need to activate blocked threads in reverse order.
|
||||
//
|
||||
// TODO(mbid,v8:10765): base::SmallVector doesn't support `rbegin()` and
|
||||
// `rend()`, should we implement that instead of this awkward iteration?
|
||||
// Maybe we could at least use an int i and check for i >= 0, but
|
||||
// SmallVectors don't have length() methods.
|
||||
for (size_t i = blocked_threads_.size(); i > 0; --i) {
|
||||
InterpreterThread t = blocked_threads_[i - 1];
|
||||
for (int i = blocked_threads_.length() - 1; i >= 0; --i) {
|
||||
InterpreterThread t = blocked_threads_[i];
|
||||
RegExpInstruction inst = bytecode_[t.pc];
|
||||
DCHECK_EQ(inst.opcode, RegExpInstruction::CONSUME_RANGE);
|
||||
RegExpInstruction::Uc16Range range = inst.payload.consume_range;
|
||||
if (input_char >= range.min && input_char <= range.max) {
|
||||
++t.pc;
|
||||
active_threads_.emplace_back(t);
|
||||
active_threads_.Add(t, zone_);
|
||||
} else {
|
||||
DestroyThread(t);
|
||||
}
|
||||
}
|
||||
blocked_threads_.clear();
|
||||
blocked_threads_.DropAndClear();
|
||||
}
|
||||
|
||||
// Returns true iff `best_match_registers_` currently holds a register array.
bool FoundMatch() const { return best_match_registers_.has_value(); }
|
||||
|
||||
// Returns a Vector over the register array of thread `t`, with length
// `register_count_per_match_`. Built directly from `t.register_array_begin`;
// no allocation call is made here.
Vector<int> GetRegisterArray(InterpreterThread t) {
|
||||
return Vector<int>(t.register_array_begin, register_count_per_match_);
|
||||
}
|
||||
|
||||
// Allocates an array of `register_count_per_match_` ints from
// `register_array_allocator_`. This function does not write to the array, so
// callers must fill it before reading.
int* NewRegisterArrayUninitialized() {
|
||||
return register_array_allocator_.allocate(register_count_per_match_);
|
||||
}
|
||||
|
||||
// Allocates a register array via NewRegisterArrayUninitialized() and sets
// each of its `register_count_per_match_` entries to `fill_value`. Returns a
// pointer to the first entry.
int* NewRegisterArray(int fill_value) {
|
||||
int* array_begin = NewRegisterArrayUninitialized();
|
||||
int* array_end = array_begin + register_count_per_match_;
|
||||
std::fill(array_begin, array_end, fill_value);
|
||||
return array_begin;
|
||||
}
|
||||
|
||||
// Hands a register array of `register_count_per_match_` ints back to
// `register_array_allocator_`. The pointer must not be used afterwards.
void FreeRegisterArray(int* register_array_begin) {
|
||||
register_array_allocator_.deallocate(register_array_begin,
|
||||
register_count_per_match_);
|
||||
}
|
||||
|
||||
// Releases the resources of thread `t`, which here means freeing its
// register array.
void DestroyThread(InterpreterThread t) {
|
||||
FreeRegisterArray(t.register_array_begin);
|
||||
}
|
||||
|
||||
// It is redundant to have two threads t, t0 execute at the same PC value,
|
||||
@ -292,49 +360,60 @@ class NfaInterpreter {
|
||||
pc_last_input_index_[pc] = input_index_;
|
||||
}
|
||||
|
||||
Vector<const RegExpInstruction> bytecode_;
|
||||
Vector<const Character> input_;
|
||||
int input_index_;
|
||||
const Vector<const RegExpInstruction> bytecode_;
|
||||
|
||||
// TODO(mbid,v8:10765): The following `SmallVector`s have somewhat
|
||||
// arbitrarily chosen small capacity sizes; should benchmark to find a good
|
||||
// value.
|
||||
// Number of registers used per thread.
|
||||
const int register_count_per_match_;
|
||||
|
||||
const Vector<const Character> input_;
|
||||
int input_index_;
|
||||
|
||||
// pc_last_input_index_[k] records the value of input_index_ the last
|
||||
// time a thread t such that t.pc == k was activated, i.e. put on
|
||||
// active_threads_. Thus pc_last_input_index.size() == bytecode.size(). See
|
||||
// also `RunActiveThread`.
|
||||
base::SmallVector<int, 64> pc_last_input_index_;
|
||||
Vector<int> pc_last_input_index_;
|
||||
|
||||
// Active threads can potentially (but not necessarily) continue without
|
||||
// input. Sorted from low to high priority.
|
||||
base::SmallVector<InterpreterThread, 64> active_threads_;
|
||||
ZoneList<InterpreterThread> active_threads_;
|
||||
|
||||
// The pc of a blocked thread points to an instruction that consumes a
|
||||
// character. Sorted from high to low priority (so the opposite of
|
||||
// `active_threads_`).
|
||||
base::SmallVector<InterpreterThread, 64> blocked_threads_;
|
||||
ZoneList<InterpreterThread> blocked_threads_;
|
||||
|
||||
// The best match found so far during the current search. If several threads
|
||||
// ACCEPTed, then this will be the match of the accepting thread with highest
|
||||
// priority.
|
||||
base::Optional<MatchRange> best_match_;
|
||||
// RecyclingZoneAllocator maintains a linked list through freed allocations
|
||||
// for reuse if possible.
|
||||
RecyclingZoneAllocator<int> register_array_allocator_;
|
||||
|
||||
// The register array of the best match found so far during the current
|
||||
// search. If several threads ACCEPTed, then this will be the register array
|
||||
// of the accepting thread with highest priority. Should be deallocated with
|
||||
// `register_array_allocator_`.
|
||||
base::Optional<Vector<int>> best_match_registers_;
|
||||
|
||||
Zone* zone_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
int ExperimentalRegExpInterpreter::FindMatchesNfaOneByte(
|
||||
Vector<const RegExpInstruction> bytecode, Vector<const uint8_t> input,
|
||||
int start_index, MatchRange* matches_out, int max_match_num) {
|
||||
NfaInterpreter<uint8_t> interpreter(bytecode, input, start_index);
|
||||
return interpreter.FindMatches(matches_out, max_match_num);
|
||||
Vector<const RegExpInstruction> bytecode, int register_count_per_match,
|
||||
Vector<const uint8_t> input, int start_index, int32_t* output_registers,
|
||||
int output_register_count, Zone* zone) {
|
||||
NfaInterpreter<uint8_t> interpreter(bytecode, register_count_per_match, input,
|
||||
start_index, zone);
|
||||
return interpreter.FindMatches(output_registers, output_register_count);
|
||||
}
|
||||
|
||||
int ExperimentalRegExpInterpreter::FindMatchesNfaTwoByte(
|
||||
Vector<const RegExpInstruction> bytecode, Vector<const uc16> input,
|
||||
int start_index, MatchRange* matches_out, int max_match_num) {
|
||||
NfaInterpreter<uc16> interpreter(bytecode, input, start_index);
|
||||
return interpreter.FindMatches(matches_out, max_match_num);
|
||||
Vector<const RegExpInstruction> bytecode, int register_count_per_match,
|
||||
Vector<const uc16> input, int start_index, int32_t* output_registers,
|
||||
int output_register_count, Zone* zone) {
|
||||
NfaInterpreter<uc16> interpreter(bytecode, register_count_per_match, input,
|
||||
start_index, zone);
|
||||
return interpreter.FindMatches(output_registers, output_register_count);
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
|
@ -11,15 +11,10 @@
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
class Zone;
|
||||
|
||||
class ExperimentalRegExpInterpreter final : public AllStatic {
|
||||
public:
|
||||
// A half-open range in a string denoting a (sub)match. Used to access
|
||||
// output registers of regexp execution grouped by [begin, end) pairs.
|
||||
struct MatchRange {
|
||||
int32_t begin; // inclusive
|
||||
int32_t end; // exclusive
|
||||
};
|
||||
|
||||
// Executes a bytecode program in breadth-first NFA mode, without
|
||||
// backtracking, to find matching substrings. Tries to find up to
|
||||
// `max_match_num` matches in `input`, starting at `start_index`. Returns
|
||||
@ -27,11 +22,14 @@ class ExperimentalRegExpInterpreter final : public AllStatic {
|
||||
// are written to `matches_out`. Provided in variants for one-byte and
|
||||
// two-byte strings.
|
||||
static int FindMatchesNfaOneByte(Vector<const RegExpInstruction> bytecode,
|
||||
int capture_count,
|
||||
Vector<const uint8_t> input, int start_index,
|
||||
MatchRange* matches_out, int max_match_num);
|
||||
int32_t* output_registers,
|
||||
int output_register_count, Zone* zone);
|
||||
static int FindMatchesNfaTwoByte(Vector<const RegExpInstruction> bytecode,
|
||||
Vector<const uc16> input, int start_index,
|
||||
MatchRange* matches_out, int max_match_num);
|
||||
int capture_count, Vector<const uc16> input,
|
||||
int start_index, int32_t* output_registers,
|
||||
int output_register_count, Zone* zone);
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
@ -44,7 +44,7 @@ bool ExperimentalRegExp::IsCompiled(Handle<JSRegExp> re, Isolate* isolate) {
|
||||
Smi::FromInt(JSRegExp::kUninitializedValue);
|
||||
}
|
||||
|
||||
void ExperimentalRegExp::Compile(Isolate* isolate, Handle<JSRegExp> re) {
|
||||
bool ExperimentalRegExp::Compile(Isolate* isolate, Handle<JSRegExp> re) {
|
||||
DCHECK_EQ(re->TypeTag(), JSRegExp::EXPERIMENTAL);
|
||||
#ifdef VERIFY_HEAP
|
||||
re->JSRegExpVerify(isolate);
|
||||
@ -63,11 +63,15 @@ void ExperimentalRegExp::Compile(Isolate* isolate, Handle<JSRegExp> re) {
|
||||
FlatStringReader reader(isolate, source);
|
||||
DCHECK(!isolate->has_pending_exception());
|
||||
|
||||
// The pattern was already parsed during initialization, so it should never
|
||||
// fail here:
|
||||
bool parse_success =
|
||||
RegExpParser::ParseRegExp(isolate, &zone, &reader, flags, &parse_result);
|
||||
CHECK(parse_success);
|
||||
if (!parse_success) {
|
||||
// The pattern was already parsed successfully during initialization, so
|
||||
// the only way parsing can fail now is because of stack overflow.
|
||||
CHECK_EQ(parse_result.error, RegExpError::kStackOverflow);
|
||||
USE(RegExp::ThrowRegExpException(isolate, re, source, parse_result.error));
|
||||
return false;
|
||||
}
|
||||
|
||||
ZoneList<RegExpInstruction> bytecode =
|
||||
ExperimentalRegExpCompiler::Compile(parse_result.tree, flags, &zone);
|
||||
@ -84,6 +88,10 @@ void ExperimentalRegExp::Compile(Isolate* isolate, Handle<JSRegExp> re) {
|
||||
Handle<Code> trampoline = BUILTIN_CODE(isolate, RegExpExperimentalTrampoline);
|
||||
re->SetDataAt(JSRegExp::kIrregexpLatin1CodeIndex, *trampoline);
|
||||
re->SetDataAt(JSRegExp::kIrregexpUC16CodeIndex, *trampoline);
|
||||
|
||||
re->SetCaptureNameMap(parse_result.capture_name_map);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
Vector<RegExpInstruction> AsInstructionSequence(ByteArray raw_bytes) {
|
||||
@ -94,11 +102,9 @@ Vector<RegExpInstruction> AsInstructionSequence(ByteArray raw_bytes) {
|
||||
return Vector<RegExpInstruction>(inst_begin, inst_num);
|
||||
}
|
||||
|
||||
using MatchRange = ExperimentalRegExpInterpreter::MatchRange;
|
||||
|
||||
// Returns the number of matches.
|
||||
int32_t ExperimentalRegExp::ExecRaw(JSRegExp regexp, String subject,
|
||||
int32_t* output_registers,
|
||||
int32_t ExperimentalRegExp::ExecRaw(Isolate* isolate, JSRegExp regexp,
|
||||
String subject, int32_t* output_registers,
|
||||
int32_t output_register_count,
|
||||
int32_t subject_index) {
|
||||
DisallowHeapAllocation no_gc;
|
||||
@ -118,21 +124,22 @@ int32_t ExperimentalRegExp::ExecRaw(JSRegExp regexp, String subject,
|
||||
StdoutStream{} << bytecode << std::endl;
|
||||
}
|
||||
|
||||
int register_count_per_match =
|
||||
JSRegExp::RegistersForCaptureCount(regexp.CaptureCount());
|
||||
|
||||
DCHECK(subject.IsFlat());
|
||||
String::FlatContent subject_content = subject.GetFlatContent(no_gc);
|
||||
|
||||
DCHECK_EQ(output_register_count % 2, 0);
|
||||
MatchRange* matches = reinterpret_cast<MatchRange*>(output_registers);
|
||||
const int32_t max_match_num = output_register_count / 2;
|
||||
Zone zone(isolate->allocator(), ZONE_NAME);
|
||||
|
||||
if (subject_content.IsOneByte()) {
|
||||
return ExperimentalRegExpInterpreter::FindMatchesNfaOneByte(
|
||||
bytecode, subject_content.ToOneByteVector(), subject_index, matches,
|
||||
max_match_num);
|
||||
bytecode, register_count_per_match, subject_content.ToOneByteVector(),
|
||||
subject_index, output_registers, output_register_count, &zone);
|
||||
} else {
|
||||
return ExperimentalRegExpInterpreter::FindMatchesNfaTwoByte(
|
||||
bytecode, subject_content.ToUC16Vector(), subject_index, matches,
|
||||
max_match_num);
|
||||
bytecode, register_count_per_match, subject_content.ToUC16Vector(),
|
||||
subject_index, output_registers, output_register_count, &zone);
|
||||
}
|
||||
}
|
||||
|
||||
@ -156,7 +163,7 @@ int32_t ExperimentalRegExp::MatchForCallFromJs(
|
||||
|
||||
JSRegExp regexp_obj = JSRegExp::cast(Object(regexp));
|
||||
|
||||
return ExecRaw(regexp_obj, subject_string, output_registers,
|
||||
return ExecRaw(isolate, regexp_obj, subject_string, output_registers,
|
||||
output_register_count, start_position);
|
||||
}
|
||||
|
||||
@ -170,22 +177,28 @@ MaybeHandle<Object> ExperimentalRegExp::Exec(
|
||||
regexp->JSRegExpVerify(isolate);
|
||||
#endif
|
||||
|
||||
if (!IsCompiled(regexp, isolate)) {
|
||||
Compile(isolate, regexp);
|
||||
if (!IsCompiled(regexp, isolate) && !Compile(isolate, regexp)) {
|
||||
DCHECK(isolate->has_pending_exception());
|
||||
return MaybeHandle<Object>();
|
||||
}
|
||||
|
||||
DCHECK(IsCompiled(regexp, isolate));
|
||||
|
||||
subject = String::Flatten(isolate, subject);
|
||||
|
||||
MatchRange match;
|
||||
|
||||
int32_t* output_registers = &match.begin;
|
||||
int32_t output_register_count = sizeof(MatchRange) / sizeof(int32_t);
|
||||
|
||||
int capture_count = regexp->CaptureCount();
|
||||
int output_register_count = JSRegExp::RegistersForCaptureCount(capture_count);
|
||||
|
||||
int num_matches = ExecRaw(*regexp, *subject, output_registers,
|
||||
int32_t* output_registers;
|
||||
std::unique_ptr<int32_t[]> output_registers_release;
|
||||
if (output_register_count <= Isolate::kJSRegexpStaticOffsetsVectorSize) {
|
||||
output_registers = isolate->jsregexp_static_offsets_vector();
|
||||
} else {
|
||||
output_registers = NewArray<int32_t>(output_register_count);
|
||||
output_registers_release.reset(output_registers);
|
||||
}
|
||||
|
||||
int num_matches = ExecRaw(isolate, *regexp, *subject, output_registers,
|
||||
output_register_count, subject_index);
|
||||
|
||||
if (num_matches == 0) {
|
||||
|
@ -25,7 +25,8 @@ class ExperimentalRegExp final : public AllStatic {
|
||||
Handle<String> pattern, JSRegExp::Flags flags,
|
||||
int capture_count);
|
||||
static bool IsCompiled(Handle<JSRegExp> re, Isolate* isolate);
|
||||
static void Compile(Isolate* isolate, Handle<JSRegExp> re);
|
||||
V8_WARN_UNUSED_RESULT
|
||||
static bool Compile(Isolate* isolate, Handle<JSRegExp> re);
|
||||
|
||||
// Execution:
|
||||
static int32_t MatchForCallFromJs(Address subject, int32_t start_position,
|
||||
@ -38,7 +39,7 @@ class ExperimentalRegExp final : public AllStatic {
|
||||
static MaybeHandle<Object> Exec(Isolate* isolate, Handle<JSRegExp> regexp,
|
||||
Handle<String> subject, int index,
|
||||
Handle<RegExpMatchInfo> last_match_info);
|
||||
static int32_t ExecRaw(JSRegExp regexp, String subject,
|
||||
static int32_t ExecRaw(Isolate* isolate, JSRegExp regexp, String subject,
|
||||
int32_t* output_registers,
|
||||
int32_t output_register_count, int32_t subject_index);
|
||||
|
||||
|
@ -37,6 +37,16 @@ class RegExpImpl final : public AllStatic {
|
||||
Handle<String> pattern, JSRegExp::Flags flags,
|
||||
int capture_count, uint32_t backtrack_limit);
|
||||
|
||||
// Prepare a RegExp for being executed one or more times (using
|
||||
// IrregexpExecOnce) on the subject.
|
||||
// This ensures that the regexp is compiled for the subject, and that
|
||||
// the subject is flat.
|
||||
// Returns the number of integer spaces required by IrregexpExecOnce
|
||||
// as its "registers" argument. If the regexp cannot be compiled,
|
||||
// an exception is set as pending, and this function returns negative.
|
||||
static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
|
||||
Handle<String> subject);
|
||||
|
||||
static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
|
||||
Handle<String> pattern, JSRegExp::Flags flags,
|
||||
Handle<String> match_pattern);
|
||||
@ -83,18 +93,15 @@ class RegExpImpl final : public AllStatic {
|
||||
// For acting on the JSRegExp data FixedArray.
|
||||
static int IrregexpMaxRegisterCount(FixedArray re);
|
||||
static void SetIrregexpMaxRegisterCount(FixedArray re, int value);
|
||||
static void SetIrregexpCaptureNameMap(FixedArray re,
|
||||
Handle<FixedArray> value);
|
||||
static int IrregexpNumberOfCaptures(FixedArray re);
|
||||
static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte);
|
||||
static Code IrregexpNativeCode(FixedArray re, bool is_one_byte);
|
||||
};
|
||||
|
||||
V8_WARN_UNUSED_RESULT
|
||||
static inline MaybeHandle<Object> ThrowRegExpException(Isolate* isolate,
|
||||
Handle<JSRegExp> re,
|
||||
Handle<String> pattern,
|
||||
RegExpError error) {
|
||||
MaybeHandle<Object> RegExp::ThrowRegExpException(Isolate* isolate,
|
||||
Handle<JSRegExp> re,
|
||||
Handle<String> pattern,
|
||||
RegExpError error) {
|
||||
Vector<const char> error_data = CStrVector(RegExpErrorString(error));
|
||||
Handle<String> error_text =
|
||||
isolate->factory()
|
||||
@ -106,8 +113,8 @@ static inline MaybeHandle<Object> ThrowRegExpException(Isolate* isolate,
|
||||
Object);
|
||||
}
|
||||
|
||||
inline void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
|
||||
RegExpError error_text) {
|
||||
void RegExp::ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
|
||||
RegExpError error_text) {
|
||||
USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate),
|
||||
error_text));
|
||||
}
|
||||
@ -169,7 +176,8 @@ MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re,
|
||||
if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
|
||||
&parse_result)) {
|
||||
// Throw an exception if we fail to parse the pattern.
|
||||
return ThrowRegExpException(isolate, re, pattern, parse_result.error);
|
||||
return RegExp::ThrowRegExpException(isolate, re, pattern,
|
||||
parse_result.error);
|
||||
}
|
||||
|
||||
bool has_been_compiled = false;
|
||||
@ -215,6 +223,30 @@ MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re,
|
||||
return re;
|
||||
}
|
||||
|
||||
// static
|
||||
// Makes sure `re` is ready to be executed, dispatching on its type tag:
//  - NOT_COMPILED is treated as unreachable.
//  - ATOM needs no preparation and always succeeds.
//  - IRREGEXP delegates to RegExpImpl::IrregexpPrepare(isolate, re, subject)
//    and fails when that call returns -1.
//  - EXPERIMENTAL calls ExperimentalRegExp::Compile unless
//    ExperimentalRegExp::IsCompiled already reports true.
// Returns true on success. On each false path the code DCHECKs that an
// exception is pending on `isolate`.
bool RegExp::EnsureFullyCompiled(Isolate* isolate, Handle<JSRegExp> re,
|
||||
Handle<String> subject) {
|
||||
switch (re->TypeTag()) {
|
||||
case JSRegExp::NOT_COMPILED:
|
||||
UNREACHABLE();
|
||||
case JSRegExp::ATOM:
|
||||
return true;
|
||||
case JSRegExp::IRREGEXP:
|
||||
// NOTE(review): the IrregexpPrepare declaration comment says it "returns
// negative" on failure, while this check compares against -1 only; confirm
// that -1 is the sole failure value.
if (RegExpImpl::IrregexpPrepare(isolate, re, subject) == -1) {
|
||||
DCHECK(isolate->has_pending_exception());
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
case JSRegExp::EXPERIMENTAL:
|
||||
// Short-circuit: Compile is only attempted when IsCompiled is false.
// `subject` is not used on this path.
if (!ExperimentalRegExp::IsCompiled(re, isolate) &&
|
||||
!ExperimentalRegExp::Compile(isolate, re)) {
|
||||
DCHECK(isolate->has_pending_exception());
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// static
|
||||
MaybeHandle<Object> RegExp::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
|
||||
Handle<String> subject, int index,
|
||||
@ -407,7 +439,7 @@ bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
|
||||
&compile_data)) {
|
||||
// Throw an exception if we fail to parse the pattern.
|
||||
// THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
|
||||
USE(ThrowRegExpException(isolate, re, pattern, compile_data.error));
|
||||
USE(RegExp::ThrowRegExpException(isolate, re, pattern, compile_data.error));
|
||||
return false;
|
||||
}
|
||||
// The compilation target is a kBytecode if we're interpreting all regexp
|
||||
@ -423,7 +455,7 @@ bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
|
||||
is_one_byte, re->BacktrackLimit());
|
||||
if (!compilation_succeeded) {
|
||||
DCHECK(compile_data.error != RegExpError::kNone);
|
||||
ThrowRegExpException(isolate, re, compile_data.error);
|
||||
RegExp::ThrowRegExpException(isolate, re, compile_data.error);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -445,7 +477,7 @@ bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
|
||||
BUILTIN_CODE(isolate, RegExpInterpreterTrampoline);
|
||||
data->set(JSRegExp::code_index(is_one_byte), *trampoline);
|
||||
}
|
||||
SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
|
||||
re->SetCaptureNameMap(compile_data.capture_name_map);
|
||||
int register_max = IrregexpMaxRegisterCount(*data);
|
||||
if (compile_data.register_count > register_max) {
|
||||
SetIrregexpMaxRegisterCount(*data, compile_data.register_count);
|
||||
@ -471,15 +503,6 @@ void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray re, int value) {
|
||||
re.set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
|
||||
}
|
||||
|
||||
// Writes the capture-name map into slot JSRegExp::kIrregexpCaptureNameMapIndex
// of the data array `re`. A null `value` handle is never dereferenced;
// Smi::zero() is stored in the slot instead.
void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray re,
|
||||
Handle<FixedArray> value) {
|
||||
if (value.is_null()) {
|
||||
re.set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::zero());
|
||||
} else {
|
||||
re.set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
|
||||
}
|
||||
}
|
||||
|
||||
// Reads slot JSRegExp::kIrregexpCaptureCountIndex of the data array `re` and
// returns its Smi value converted to int.
int RegExpImpl::IrregexpNumberOfCaptures(FixedArray re) {
|
||||
return Smi::ToInt(re.get(JSRegExp::kIrregexpCaptureCountIndex));
|
||||
}
|
||||
@ -502,8 +525,8 @@ void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
|
||||
}
|
||||
|
||||
// static
|
||||
int RegExp::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
|
||||
Handle<String> subject) {
|
||||
int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
|
||||
Handle<String> subject) {
|
||||
DCHECK(subject->IsFlat());
|
||||
|
||||
// Check representation of the underlying storage.
|
||||
@ -617,7 +640,8 @@ MaybeHandle<Object> RegExpImpl::IrregexpExec(
|
||||
}
|
||||
|
||||
// Prepare space for the return values.
|
||||
int required_registers = RegExp::IrregexpPrepare(isolate, regexp, subject);
|
||||
int required_registers =
|
||||
RegExpImpl::IrregexpPrepare(isolate, regexp, subject);
|
||||
if (required_registers < 0) {
|
||||
// Compiling failed with an exception.
|
||||
DCHECK(isolate->has_pending_exception());
|
||||
@ -879,43 +903,53 @@ RegExpGlobalCache::RegExpGlobalCache(Handle<JSRegExp> regexp,
|
||||
regexp_(regexp),
|
||||
subject_(subject),
|
||||
isolate_(isolate) {
|
||||
bool interpreted = regexp->ShouldProduceBytecode();
|
||||
DCHECK(IsGlobal(regexp->GetFlags()));
|
||||
|
||||
switch (regexp_->TypeTag()) {
|
||||
case JSRegExp::NOT_COMPILED:
|
||||
UNREACHABLE();
|
||||
case JSRegExp::EXPERIMENTAL:
|
||||
// TODO(mbid,v8:10765): At the moment experimental regexps can't deal with
|
||||
// captures; this should change in the future.
|
||||
case JSRegExp::ATOM: {
|
||||
// ATOM regexps do not have a global loop, so we search for one match at
|
||||
// a time.
|
||||
static const int kAtomRegistersPerMatch = 2;
|
||||
registers_per_match_ = kAtomRegistersPerMatch;
|
||||
// There is no distinction between interpreted and native for atom
|
||||
// regexps.
|
||||
interpreted = false;
|
||||
register_array_size_ = registers_per_match_;
|
||||
break;
|
||||
}
|
||||
case JSRegExp::IRREGEXP:
|
||||
case JSRegExp::IRREGEXP: {
|
||||
registers_per_match_ =
|
||||
RegExp::IrregexpPrepare(isolate_, regexp_, subject_);
|
||||
RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
|
||||
if (registers_per_match_ < 0) {
|
||||
num_matches_ = -1; // Signal exception.
|
||||
return;
|
||||
}
|
||||
if (regexp->ShouldProduceBytecode()) {
|
||||
// Global loop in interpreted regexp is not implemented. We choose the
|
||||
// size of the offsets vector so that it can only store one match.
|
||||
register_array_size_ = registers_per_match_;
|
||||
max_matches_ = 1;
|
||||
} else {
|
||||
register_array_size_ = Max(registers_per_match_,
|
||||
Isolate::kJSRegexpStaticOffsetsVectorSize);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case JSRegExp::EXPERIMENTAL: {
|
||||
if (!ExperimentalRegExp::IsCompiled(regexp, isolate_) &&
|
||||
!ExperimentalRegExp::Compile(isolate_, regexp)) {
|
||||
DCHECK(isolate->has_pending_exception());
|
||||
num_matches_ = -1; // Signal exception.
|
||||
return;
|
||||
}
|
||||
registers_per_match_ =
|
||||
JSRegExp::RegistersForCaptureCount(regexp->CaptureCount());
|
||||
register_array_size_ =
|
||||
Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
DCHECK(IsGlobal(regexp->GetFlags()));
|
||||
if (!interpreted) {
|
||||
register_array_size_ =
|
||||
Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
|
||||
max_matches_ = register_array_size_ / registers_per_match_;
|
||||
} else {
|
||||
// Global loop in interpreted regexp is not implemented. We choose
|
||||
// the size of the offsets vector so that it can only store one match.
|
||||
register_array_size_ = registers_per_match_;
|
||||
max_matches_ = 1;
|
||||
}
|
||||
max_matches_ = register_array_size_ / registers_per_match_;
|
||||
|
||||
if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
|
||||
register_array_ = NewArray<int32_t>(register_array_size_);
|
||||
@ -977,13 +1011,11 @@ int32_t* RegExpGlobalCache::FetchNext() {
|
||||
register_array_, register_array_size_);
|
||||
break;
|
||||
case JSRegExp::EXPERIMENTAL: {
|
||||
if (!ExperimentalRegExp::IsCompiled(regexp_, isolate_)) {
|
||||
ExperimentalRegExp::Compile(isolate_, regexp_);
|
||||
}
|
||||
DCHECK(ExperimentalRegExp::IsCompiled(regexp_, isolate_));
|
||||
DisallowHeapAllocation no_gc;
|
||||
num_matches_ =
|
||||
ExperimentalRegExp::ExecRaw(*regexp_, *subject_, register_array_,
|
||||
register_array_size_, last_end_index);
|
||||
num_matches_ = ExperimentalRegExp::ExecRaw(
|
||||
isolate_, *regexp_, *subject_, register_array_,
|
||||
register_array_size_, last_end_index);
|
||||
break;
|
||||
}
|
||||
case JSRegExp::IRREGEXP: {
|
||||
|
@ -74,6 +74,13 @@ class RegExp final : public AllStatic {
|
||||
Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
|
||||
JSRegExp::Flags flags, uint32_t backtrack_limit);
|
||||
|
||||
// Ensures that a regexp is fully compiled and ready to be executed on a
|
||||
// subject string. Returns true on success. Returns false on failure, and
|
||||
// then an exception will be pending.
|
||||
V8_WARN_UNUSED_RESULT static bool EnsureFullyCompiled(Isolate* isolate,
|
||||
Handle<JSRegExp> re,
|
||||
Handle<String> subject);
|
||||
|
||||
enum CallOrigin : int {
|
||||
kFromRuntime = 0,
|
||||
kFromJs = 1,
|
||||
@ -97,16 +104,6 @@ class RegExp final : public AllStatic {
|
||||
RE_EXCEPTION = kInternalRegExpException,
|
||||
};
|
||||
|
||||
// Prepare a RegExp for being executed one or more times (using
|
||||
// IrregexpExecOnce) on the subject.
|
||||
// This ensures that the regexp is compiled for the subject, and that
|
||||
// the subject is flat.
|
||||
// Returns the number of integer spaces required by IrregexpExecOnce
|
||||
// as its "registers" argument. If the regexp cannot be compiled,
|
||||
// an exception is set as pending, and this function returns negative.
|
||||
static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
|
||||
Handle<String> subject);
|
||||
|
||||
// Set last match info. If match is nullptr, then setting captures is
|
||||
// omitted.
|
||||
static Handle<RegExpMatchInfo> SetLastMatchInfo(
|
||||
@ -124,6 +121,14 @@ class RegExp final : public AllStatic {
|
||||
RegExpNode* node);
|
||||
|
||||
static const int kRegExpTooLargeToOptimize = 20 * KB;
|
||||
|
||||
V8_WARN_UNUSED_RESULT
|
||||
static MaybeHandle<Object> ThrowRegExpException(Isolate* isolate,
|
||||
Handle<JSRegExp> re,
|
||||
Handle<String> pattern,
|
||||
RegExpError error);
|
||||
static void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
|
||||
RegExpError error_text);
|
||||
};
|
||||
|
||||
// Uses a special global mode of irregexp-generated code to perform a global
|
||||
|
@ -322,7 +322,7 @@ bool CompiledReplacement::Compile(Isolate* isolate, Handle<JSRegExp> regexp,
|
||||
|
||||
FixedArray capture_name_map;
|
||||
if (capture_count > 0) {
|
||||
DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
|
||||
DCHECK(JSRegExp::TypeSupportsCaptures(regexp->TypeTag()));
|
||||
Object maybe_capture_name_map = regexp->CaptureNameMap();
|
||||
if (maybe_capture_name_map.IsFixedArray()) {
|
||||
capture_name_map = FixedArray::cast(maybe_capture_name_map);
|
||||
@ -611,13 +611,9 @@ V8_WARN_UNUSED_RESULT static Object StringReplaceGlobalRegExpWithString(
|
||||
int capture_count = regexp->CaptureCount();
|
||||
int subject_length = subject->length();
|
||||
|
||||
JSRegExp::Type typeTag = regexp->TypeTag();
|
||||
if (typeTag == JSRegExp::IRREGEXP) {
|
||||
// Ensure the RegExp is compiled so we can access the capture-name map.
|
||||
if (RegExp::IrregexpPrepare(isolate, regexp, subject) == -1) {
|
||||
DCHECK(isolate->has_pending_exception());
|
||||
return ReadOnlyRoots(isolate).exception();
|
||||
}
|
||||
// Ensure the RegExp is compiled so we can access the capture-name map.
|
||||
if (!RegExp::EnsureFullyCompiled(isolate, regexp, subject)) {
|
||||
return ReadOnlyRoots(isolate).exception();
|
||||
}
|
||||
|
||||
// CompiledReplacement uses zone allocation.
|
||||
@ -627,7 +623,7 @@ V8_WARN_UNUSED_RESULT static Object StringReplaceGlobalRegExpWithString(
|
||||
isolate, regexp, replacement, capture_count, subject_length);
|
||||
|
||||
// Shortcut for simple non-regexp global replacements
|
||||
if (typeTag == JSRegExp::ATOM && simple_replace) {
|
||||
if (regexp->TypeTag() == JSRegExp::ATOM && simple_replace) {
|
||||
if (subject->IsOneByteRepresentation() &&
|
||||
replacement->IsOneByteRepresentation()) {
|
||||
return StringReplaceGlobalAtomRegExpWithString<SeqOneByteString>(
|
||||
@ -1460,8 +1456,7 @@ RUNTIME_FUNCTION(Runtime_StringReplaceNonGlobalRegExpWithFunction) {
|
||||
bool has_named_captures = false;
|
||||
Handle<FixedArray> capture_map;
|
||||
if (m > 1) {
|
||||
// The existence of capture groups implies IRREGEXP kind.
|
||||
DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
|
||||
DCHECK(JSRegExp::TypeSupportsCaptures(regexp->TypeTag()));
|
||||
|
||||
Object maybe_capture_map = regexp->CaptureNameMap();
|
||||
if (maybe_capture_map.IsFixedArray()) {
|
||||
|
@ -2,7 +2,7 @@
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// Flags: --allow-natives-syntax
|
||||
// Flags: --allow-natives-syntax --no-enable-experimental-regexp-engine
|
||||
|
||||
const kNoBacktrackLimit = 0; // To match JSRegExp::kNoBacktrackLimit.
|
||||
const re0 = %NewRegExpWithBacktrackLimit("(\\d+)+x", "", kNoBacktrackLimit);
|
||||
|
@ -60,5 +60,16 @@ Test(/(?:asdf)/, "123asdfxyz", ["asdf"], 0);
|
||||
Test(/(?:asdf)|123/, "xyz123asdf", ["123"], 0);
|
||||
Test(/asdf(?:[0-9]|(?:xy|x)*)*/, "kkkasdf5xyx8xyyky", ["asdf5xyx8xy"], 0);
|
||||
|
||||
// Capturing groups.
|
||||
Test(/()/, "asdf", ["", ""], 0);
|
||||
Test(/(123)/, "asdf123xyz", ["123", "123"], 0);
|
||||
Test(/asdf(123)xyz/, "asdf123xyz", ["asdf123xyz", "123"], 0);
|
||||
Test(/(123|xyz)/, "123", ["123", "123"], 0);
|
||||
Test(/(123|xyz)/, "xyz", ["xyz", "xyz"], 0);
|
||||
Test(/(123)|(xyz)/, "123", ["123", "123", undefined], 0);
|
||||
Test(/(123)|(xyz)/, "xyz", ["xyz", undefined, "xyz"], 0);
|
||||
Test(/(?:(123)|(xyz))*/, "xyz123", ["xyz123", "123", undefined], 0);
|
||||
Test(/((123)|(xyz)*)*/, "xyz123xyz", ["xyz123xyz", "xyz", undefined, "xyz"], 0);
|
||||
|
||||
// The global flag.
|
||||
Test(/asdf/g, "fjasdfkkasdf", ["asdf"], 6);
|
||||
|
Loading…
Reference in New Issue
Block a user