[regexp] Extract more parts of the regexp compiler

Bug: v8:9359
Change-Id: I06a4ccc53abff25237a1113774a0b17bdf861c86
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1658157
Reviewed-by: Peter Marshall <petermarshall@chromium.org>
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#62198}
This commit is contained in:
Jakob Gruber 2019-06-17 09:22:43 +02:00 committed by Commit Bot
parent dbfe54b12b
commit def9aa5d0a
10 changed files with 5338 additions and 5690 deletions

View File

@ -2686,7 +2686,10 @@ v8_source_set("v8_base_without_compiler") {
"src/regexp/regexp-ast.h",
"src/regexp/regexp-bytecodes.h",
"src/regexp/regexp-compiler-tonode.cc",
"src/regexp/regexp-compiler.cc",
"src/regexp/regexp-compiler.h",
"src/regexp/regexp-dotprinter.cc",
"src/regexp/regexp-dotprinter.h",
"src/regexp/regexp-interpreter.cc",
"src/regexp/regexp-interpreter.h",
"src/regexp/regexp-macro-assembler-arch.h",
@ -2697,6 +2700,7 @@ v8_source_set("v8_base_without_compiler") {
"src/regexp/regexp-macro-assembler-tracer.h",
"src/regexp/regexp-macro-assembler.cc",
"src/regexp/regexp-macro-assembler.h",
"src/regexp/regexp-nodes.h",
"src/regexp/regexp-parser.cc",
"src/regexp/regexp-parser.h",
"src/regexp/regexp-stack.cc",
@ -2949,6 +2953,10 @@ v8_source_set("v8_base_without_compiler") {
"src/objects/elements.cc",
"src/objects/objects.cc",
"src/parsing/parser.cc",
# Explicit template instantiation clash (these files are also very large).
"src/regexp/regexp-compiler-tonode.cc",
"src/regexp/regexp-compiler.cc",
]
if (v8_current_cpu == "x86") {

View File

@ -75,11 +75,6 @@ int32_t* RegExpImpl::GlobalCache::LastSuccessfulMatch() {
return &register_array_[index];
}
// Builds a CompilationResult that carries |error_message|. The |code| member
// is initialized to the-hole value obtained from the isolate's read-only
// roots.
RegExpEngine::CompilationResult::CompilationResult(Isolate* isolate,
const char* error_message)
: error_message(error_message),
code(ReadOnlyRoots(isolate).the_hole_value()) {}
} // namespace internal
} // namespace v8

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -5,8 +5,8 @@
#ifndef V8_REGEXP_REGEXP_COMPILER_H_
#define V8_REGEXP_REGEXP_COMPILER_H_
#include "src/regexp/jsregexp.h" // TODO(jgruber): Remove if possible.
#include "src/regexp/regexp-macro-assembler-arch.h"
#include "src/regexp/regexp-nodes.h"
#include "src/zone/zone-splay-tree.h"
namespace v8 {
namespace internal {
@ -37,8 +37,530 @@ constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D, 0x000E,
0x2028, 0x202A, kRangeEndMarker};
constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
// More makes code generation slower, less makes V8 benchmark score lower.
constexpr int kMaxLookaheadForBoyerMoore = 8;
// In a 3-character pattern you can maximally step forwards 3 characters
// at a time, which is not always enough to pay for the extra logic.
constexpr int kPatternTooShortForBoyerMoore = 2;
} // namespace regexp_compiler_constants
// A set of unsigned integers that behaves especially well when its members
// are small (below kFirstLimit). May allocate in a zone.
class OutSet : public ZoneObject {
 public:
  static const unsigned kFirstLimit = 32;

  // Creates the empty set.
  OutSet() : first_(0), remaining_(nullptr), successors_(nullptr) {}

  OutSet* Extend(unsigned value, Zone* zone);
  V8_EXPORT_PRIVATE bool Get(unsigned value) const;

 private:
  OutSet(uint32_t first, ZoneList<unsigned>* remaining)
      : first_(first), remaining_(remaining), successors_(nullptr) {}

  // Destructively adds a value to this set. Callers usually want Extend
  // instead, which ensures that only one instance exists for any given
  // combination of values.
  void Set(unsigned value, Zone* zone);

  // The successors are the sets that hold everything in this set plus one
  // additional value that this set does not contain.
  ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }

  uint32_t first_;
  ZoneList<unsigned>* remaining_;
  ZoneList<OutSet*>* successors_;

  friend class Trace;
};
// A mapping from integers, specified as ranges, to a set of integers.
// Used for mapping character ranges to choices.
class DispatchTable : public ZoneObject {
public:
explicit DispatchTable(Zone* zone) : tree_(zone) {}
class Entry {
public:
Entry() : from_(0), to_(0), out_set_(nullptr) {}
Entry(uc32 from, uc32 to, OutSet* out_set)
: from_(from), to_(to), out_set_(out_set) {
DCHECK(from <= to);
}
uc32 from() { return from_; }
uc32 to() { return to_; }
void set_to(uc32 value) { to_ = value; }
void AddValue(int value, Zone* zone) {
out_set_ = out_set_->Extend(value, zone);
}
OutSet* out_set() { return out_set_; }
private:
uc32 from_;
uc32 to_;
OutSet* out_set_;
};
class Config {
public:
using Key = uc32;
using Value = Entry;
static const uc32 kNoKey;
static const Entry NoValue() { return Value(); }
static inline int Compare(uc32 a, uc32 b) {
if (a == b)
return 0;
else if (a < b)
return -1;
else
return 1;
}
};
V8_EXPORT_PRIVATE void AddRange(CharacterRange range, int value, Zone* zone);
V8_EXPORT_PRIVATE OutSet* Get(uc32 value);
void Dump();
template <typename Callback>
void ForEach(Callback* callback) {
return tree()->ForEach(callback);
}
private:
// There can't be a static empty set since it allocates its
// successors in a zone and caches them.
OutSet* empty() { return &empty_; }
OutSet empty_;
ZoneSplayTree<Config>* tree() { return &tree_; }
ZoneSplayTree<Config> tree_;
};
// Node visitor used to add the start set of the alternatives to the
// dispatch table of a choice node.
class V8_EXPORT_PRIVATE DispatchTableConstructor : public NodeVisitor {
 public:
  // |table| receives the ranges; |ignore_case| and |zone| are stored for use
  // by the visit methods. The choice index starts out as -1.
  DispatchTableConstructor(DispatchTable* table, bool ignore_case, Zone* zone)
      : table_(table),
        choice_index_(-1),
        ignore_case_(ignore_case),
        zone_(zone) {}

  void BuildTable(ChoiceNode* node);

  // Records |range| in the table under the current choice index.
  void AddRange(CharacterRange range) {
    table()->AddRange(range, choice_index_, zone_);
  }

  void AddInverse(ZoneList<CharacterRange>* ranges);

  // Marked `override` (rather than a bare `virtual`) so the compiler verifies
  // each signature against NodeVisitor, matching the Analysis visitor.
#define DECLARE_VISIT(Type) void Visit##Type(Type##Node* that) override;
  FOR_EACH_NODE_TYPE(DECLARE_VISIT)
#undef DECLARE_VISIT

  DispatchTable* table() { return table_; }
  void set_choice_index(int value) { choice_index_ = value; }

 protected:
  DispatchTable* table_;
  int choice_index_;
  bool ignore_case_;
  Zone* zone_;
};
// Describes a quick mask-and-compare check that can look ahead in the input
// stream.
class QuickCheckDetails {
 public:
  // Per-character mask/value pair, plus whether it decides the match exactly.
  struct Position {
    Position() : mask(0), value(0), determines_perfectly(false) {}
    uc16 mask;
    uc16 value;
    bool determines_perfectly;
  };

  QuickCheckDetails() : QuickCheckDetails(0) {}
  explicit QuickCheckDetails(int characters)
      : characters_(characters), mask_(0), value_(0), cannot_match_(false) {}

  bool Rationalize(bool one_byte);
  // Merge in the information from another branch of an alternation.
  void Merge(QuickCheckDetails* other, int from_index);
  // Advance the current position by some amount.
  void Advance(int by, bool one_byte);
  void Clear();

  bool cannot_match() { return cannot_match_; }
  void set_cannot_match() { cannot_match_ = true; }

  int characters() { return characters_; }
  void set_characters(int characters) { characters_ = characters; }

  // Returns the position record at |index|; the index must lie in
  // [0, characters_).
  Position* positions(int index) {
    DCHECK_LE(0, index);
    DCHECK_GT(characters_, index);
    return &positions_[index];
  }

  uint32_t mask() { return mask_; }
  uint32_t value() { return value_; }

 private:
  // Number of characters we hold quick check information for. This is the
  // same for all branches of a choice node.
  int characters_;
  Position positions_[4];
  // Condensed form of the array above, valid after Rationalize().
  uint32_t mask_;
  uint32_t value_;
  // When true, this quick check cannot match at all, e.g. because it requires
  // being at the start of the input and is not.
  bool cannot_match_;
};
// Improve the speed that we scan for an initial point where a non-anchored
// regexp can match by using a Boyer-Moore-like table. This is done by
// identifying non-greedy non-capturing loops in the nodes that eat any
// character one at a time. For example in the middle of the regexp
// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
// inserted at the start of any non-anchored regexp.
//
// When we have found such a loop we look ahead in the nodes to find the set of
// characters that can come at given distances. For example for the regexp
// /.?foo/ we know that there are at least 3 characters ahead of us, and the
// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
// the lookahead info where the set of characters is reasonably constrained. In
// our example this is from index 1 to 2 (0 is not constrained). We can now
// look 3 characters ahead and if we don't find one of [f, o] (the union of
// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
//
// For Unicode input strings we do the same, but modulo 128.
//
// We also look at the first string fed to the regexp and use that to get a hint
// of the character frequencies in the inputs. This affects the assessment of
// whether the set of characters is 'reasonably constrained'.
//
// We also have another lookahead mechanism (called quick check in the code),
// which uses a wide load of multiple characters followed by a mask and compare
// to determine whether a match is possible at this point.
// Two-bit lattice: the low bit records "in", the high bit records "out".
enum ContainedInLattice {
  kNotYet = 0,
  kLatticeIn = 1,
  kLatticeOut = 2,
  kLatticeUnknown = 3  // Can also mean both in and out.
};

// Joins two lattice values by OR-ing their bits: kNotYet is the identity and
// kLatticeIn joined with kLatticeOut yields kLatticeUnknown.
inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
  const int joined = static_cast<int>(a) | static_cast<int>(b);
  return static_cast<ContainedInLattice>(joined);
}
ContainedInLattice AddRange(ContainedInLattice a, const int* ranges,
int ranges_size, Interval new_range);
// Per-position lookahead information: a kMapSize-entry boolean map plus
// lattice values for several character classes.
class BoyerMoorePositionInfo : public ZoneObject {
 public:
  static const int kMapSize = 128;
  static const int kMask = kMapSize - 1;

  // Allocates the map in |zone| and fills it with kMapSize false entries.
  explicit BoyerMoorePositionInfo(Zone* zone)
      : map_(new (zone) ZoneList<bool>(kMapSize, zone)),
        map_count_(0),
        w_(kNotYet),
        s_(kNotYet),
        d_(kNotYet),
        surrogate_(kNotYet) {
    for (int slot = 0; slot < kMapSize; ++slot) {
      map_->Add(false, zone);
    }
  }

  bool& at(int i) { return map_->at(i); }

  int map_count() const { return map_count_; }

  void Set(int character);
  void SetInterval(const Interval& interval);
  void SetAll();

  bool is_non_word() { return w_ == kLatticeOut; }
  bool is_word() { return w_ == kLatticeIn; }

 private:
  ZoneList<bool>* map_;
  int map_count_;                 // Number of set bits in the map.
  ContainedInLattice w_;          // The \w character class.
  ContainedInLattice s_;          // The \s character class.
  ContainedInLattice d_;          // The \d character class.
  ContainedInLattice surrogate_;  // Surrogate UTF-16 code units.
};
// Holds one BoyerMoorePositionInfo per lookahead position and emits the skip
// instructions derived from them.
class BoyerMooreLookahead : public ZoneObject {
 public:
  BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);

  int length() { return length_; }
  int max_char() { return max_char_; }
  RegExpCompiler* compiler() { return compiler_; }

  BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
  int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); }

  // Marks |character| at position |map_number|; characters above max_char_
  // are ignored.
  void Set(int map_number, int character) {
    if (character <= max_char_) {
      bitmaps_->at(map_number)->Set(character);
    }
  }

  // Marks |interval| at position |map_number|, clipping its upper end to
  // max_char_. Intervals that start above max_char_ are ignored.
  void SetInterval(int map_number, const Interval& interval) {
    if (interval.from() > max_char_) return;
    BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
    if (interval.to() <= max_char_) {
      info->SetInterval(interval);
    } else {
      info->SetInterval(Interval(interval.from(), max_char_));
    }
  }

  void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); }

  // Applies SetAll to every position from |from_map| up to length_.
  void SetRest(int from_map) {
    for (int position = from_map; position < length_; ++position) {
      SetAll(position);
    }
  }

  void EmitSkipInstructions(RegExpMacroAssembler* masm);

 private:
  // This is the value obtained by EatsAtLeast. If fewer than this many
  // characters remain in the sample string the match is bound to fail, so it
  // is OK to read a character this far ahead of the current match point.
  int length_;
  RegExpCompiler* compiler_;
  // 0xff for Latin1, 0xffff for UTF-16.
  int max_char_;
  ZoneList<BoyerMoorePositionInfo*>* bitmaps_;

  int GetSkipTable(int min_lookahead, int max_lookahead,
                   Handle<ByteArray> boolean_skip_table);
  bool FindWorthwhileInterval(int* from, int* to);
  int FindBestInterval(int max_number_of_chars, int old_biggest_points,
                       int* from, int* to);
};
// A node can have code generated for it in many ways. This class captures the
// way we should currently be generating, i.e. the current state of the code
// generator. As a consequence we generate code for the paths the matcher can
// take through the regular expression, and a given node may be code-generated
// several times because it can belong to several traces. For the regexp
// /foo(bar|ip)baz/ the code matching baz is generated twice: once for the
// foo-bar-baz trace and once for the foo-ip-baz trace. The code matching foo
// is generated only once, since the traces share that prefix. The code that
// stores the capture is deferred and generated (twice) after the places where
// baz has been matched.
class Trace {
 public:
  // A value for a property that is either known to be true, known to be
  // false, or not known.
  enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 };

  // Node of the singly linked list of actions deferred on a trace.
  class DeferredAction {
   public:
    DeferredAction(ActionNode::ActionType action_type, int reg)
        : action_type_(action_type), reg_(reg), next_(nullptr) {}

    DeferredAction* next() { return next_; }
    bool Mentions(int reg);
    int reg() { return reg_; }
    ActionNode::ActionType action_type() { return action_type_; }

   private:
    ActionNode::ActionType action_type_;
    int reg_;
    DeferredAction* next_;
    friend class Trace;
  };

  // STORE_POSITION action; remembers the trace's cp_offset at construction.
  class DeferredCapture : public DeferredAction {
   public:
    DeferredCapture(int reg, bool is_capture, Trace* trace)
        : DeferredAction(ActionNode::STORE_POSITION, reg),
          cp_offset_(trace->cp_offset()),
          is_capture_(is_capture) {}

    int cp_offset() { return cp_offset_; }
    bool is_capture() { return is_capture_; }

   private:
    int cp_offset_;
    bool is_capture_;
    void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
  };

  // SET_REGISTER action carrying the value to store.
  class DeferredSetRegister : public DeferredAction {
   public:
    DeferredSetRegister(int reg, int value)
        : DeferredAction(ActionNode::SET_REGISTER, reg), value_(value) {}

    int value() { return value_; }

   private:
    int value_;
  };

  // CLEAR_CAPTURES action over a register interval; its register is -1.
  class DeferredClearCaptures : public DeferredAction {
   public:
    explicit DeferredClearCaptures(Interval range)
        : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {}

    Interval range() { return range_; }

   private:
    Interval range_;
  };

  // INCREMENT_REGISTER action.
  class DeferredIncrementRegister : public DeferredAction {
   public:
    explicit DeferredIncrementRegister(int reg)
        : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {}
  };

  Trace()
      : cp_offset_(0),
        actions_(nullptr),
        backtrack_(nullptr),
        stop_node_(nullptr),
        loop_label_(nullptr),
        characters_preloaded_(0),
        bound_checked_up_to_(0),
        flush_budget_(100),
        at_start_(UNKNOWN) {}

  // End the trace. This involves flushing the deferred actions in the trace
  // and pushing a backtrack location onto the backtrack stack. Once this is
  // done we can start a new trace or go to one that has already been
  // generated.
  void Flush(RegExpCompiler* compiler, RegExpNode* successor);

  int cp_offset() { return cp_offset_; }
  DeferredAction* actions() { return actions_; }

  // A trivial trace has no deferred actions or other state that affects the
  // assumptions used when generating code. No backtrack location is recorded
  // in a trivial trace, so the code generated with one fetches the backtrack
  // location from the backtrack stack on a failed match instead of using a
  // direct jump. Code generation always starts with a trivial trace;
  // non-trivial traces are created as we emit code for nodes or add to the
  // list of deferred actions. The location of the code generated for a node
  // using a trivial trace is recorded in a label in the node so that gotos to
  // that code can be generated.
  bool is_trivial() {
    if (backtrack_ != nullptr || actions_ != nullptr) return false;
    if (cp_offset_ != 0 || characters_preloaded_ != 0) return false;
    if (bound_checked_up_to_ != 0) return false;
    return quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
  }

  TriBool at_start() { return at_start_; }
  void set_at_start(TriBool at_start) { at_start_ = at_start; }
  Label* backtrack() { return backtrack_; }
  Label* loop_label() { return loop_label_; }
  RegExpNode* stop_node() { return stop_node_; }
  int characters_preloaded() { return characters_preloaded_; }
  int bound_checked_up_to() { return bound_checked_up_to_; }
  int flush_budget() { return flush_budget_; }
  QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
  bool mentions_reg(int reg);

  // Returns true if a deferred position store exists to the specified
  // register and stores the offset in the out-parameter. Otherwise
  // returns false.
  bool GetStoredPosition(int reg, int* cp_offset);

  // These set methods and AdvanceCurrentPositionInTrace should be used only on
  // new traces - the intention is that traces are immutable after creation.

  // Pushes |new_action| onto the front of the deferred action list. The
  // action must not already be linked into a list.
  void add_action(DeferredAction* new_action) {
    DCHECK(new_action->next_ == nullptr);
    new_action->next_ = actions_;
    actions_ = new_action;
  }

  void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
  void set_stop_node(RegExpNode* node) { stop_node_ = node; }
  void set_loop_label(Label* label) { loop_label_ = label; }
  void set_characters_preloaded(int count) { characters_preloaded_ = count; }
  void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
  void set_flush_budget(int to) { flush_budget_ = to; }
  // Copies |*d| into this trace.
  void set_quick_check_performed(QuickCheckDetails* d) {
    quick_check_performed_ = *d;
  }

  void InvalidateCurrentCharacter();
  void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);

 private:
  int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
  void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register,
                              const OutSet& affected_registers,
                              OutSet* registers_to_pop,
                              OutSet* registers_to_clear, Zone* zone);
  void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register,
                                const OutSet& registers_to_pop,
                                const OutSet& registers_to_clear);

  int cp_offset_;
  DeferredAction* actions_;
  Label* backtrack_;
  RegExpNode* stop_node_;
  Label* loop_label_;
  int characters_preloaded_;
  int bound_checked_up_to_;
  QuickCheckDetails quick_check_performed_;
  int flush_budget_;
  TriBool at_start_;
};
// Bundles a label with the trace returned by counter_backtrack_trace().
class GreedyLoopState {
 public:
  explicit GreedyLoopState(bool not_at_start);

  Label* label() { return &label_; }
  Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }

 private:
  Label label_;
  Trace counter_backtrack_trace_;
};
// Plain record of preload bookkeeping values.
struct PreloadState {
  static const int kEatsAtLeastNotYetInitialized = -1;

  bool preload_is_current_;
  bool preload_has_checked_bounds_;
  int preload_characters_;
  int eats_at_least_;

  // Resets only eats_at_least_ to the "not yet initialized" marker; the other
  // fields are left untouched.
  void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; }
};
// Assertion propagation moves information about assertions such as \b to the
// nodes they affect. For instance, in /.\b./ the first '.' must learn that
// whatever follows needs to know whether it matched a word or a non-word
// character, and the second '.' must learn that it has to check whether it
// succeeds a word or non-word character. The result looks something like:
//
// +-------+ +------------+
// | . | | . |
// +-------+ ---> +------------+
// | word? | | check word |
// +-------+ +------------+
class Analysis : public NodeVisitor {
 public:
  Analysis(Isolate* isolate, bool is_one_byte)
      : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {}

  void EnsureAnalyzed(RegExpNode* node);

#define DECLARE_VISIT(Type) void Visit##Type(Type##Node* that) override;
  FOR_EACH_NODE_TYPE(DECLARE_VISIT)
#undef DECLARE_VISIT
  void VisitLoopChoice(LoopChoiceNode* that) override;

  // True once fail() has recorded a non-null message.
  bool has_failed() { return error_message_ != nullptr; }

  // Returns the recorded message; only valid after a failure.
  const char* error_message() {
    DCHECK(error_message_ != nullptr);
    return error_message_;
  }

  void fail(const char* error_message) { error_message_ = error_message; }

  Isolate* isolate() const { return isolate_; }

 private:
  Isolate* isolate_;
  bool is_one_byte_;
  const char* error_message_;

  DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
};
class FrequencyCollator {
public:
FrequencyCollator() : total_samples_(0) {
@ -113,10 +635,30 @@ class RegExpCompiler {
return unicode_lookaround_position_register_;
}
RegExpEngine::CompilationResult Assemble(Isolate* isolate,
RegExpMacroAssembler* assembler,
RegExpNode* start, int capture_count,
Handle<String> pattern);
// Outcome of Assemble: either an error message, or a code object together
// with its register count.
struct CompilationResult final {
  // Error result: |code| is default-constructed and |num_registers| stays 0.
  explicit CompilationResult(const char* error_message)
      : error_message(error_message) {}

  // Success result: |error_message| stays null.
  CompilationResult(Object code, int registers)
      : code(code), num_registers(registers) {}

  // Convenience factory for the "RegExp too big" error.
  static CompilationResult RegExpTooBig() {
    return CompilationResult("RegExp too big");
  }

  const char* const error_message = nullptr;
  Object code;
  int num_registers = 0;
};
CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler,
RegExpNode* start, int capture_count,
Handle<String> pattern);
// If the regexp matching starts within a surrogate pair, step back to the
// lead surrogate and start matching from there.
static RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
RegExpNode* on_success,
JSRegExp::Flags flags);
inline void AddWork(RegExpNode* node) {
if (!node->on_work_list() && !node->label()->is_bound()) {

View File

@ -0,0 +1,339 @@
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/regexp/regexp-dotprinter.h"
#include "src/regexp/regexp-compiler.h"
#include "src/utils/ostreams.h"
#include "src/utils/splay-tree-inl.h"
namespace v8 {
namespace internal {
// -------------------------------------------------------------------
// Dot/dotty output
#ifdef DEBUG
// Node visitor that writes a Graphviz "dot" description of a regexp node
// graph to the given stream.
class DotPrinterImpl : public NodeVisitor {
 public:
  // |os| receives the output; |ignore_case| is forwarded to GetTable when the
  // dispatch table of a choice node is printed.
  DotPrinterImpl(std::ostream& os, bool ignore_case)  // NOLINT
      : os_(os), ignore_case_(ignore_case) {}

  // Writes the complete digraph for the graph rooted at |node|.
  void PrintNode(const char* label, RegExpNode* node);
  // Visits |node| unless its info already has the visited flag set.
  void Visit(RegExpNode* node);
  void PrintAttributes(RegExpNode* from);
  void PrintOnFailure(RegExpNode* from, RegExpNode* to);

  // Marked `override` (rather than a bare `virtual`) so the compiler verifies
  // each signature against NodeVisitor, matching the Analysis visitor.
#define DECLARE_VISIT(Type) void Visit##Type(Type##Node* that) override;
  FOR_EACH_NODE_TYPE(DECLARE_VISIT)
#undef DECLARE_VISIT

 private:
  std::ostream& os_;
  bool ignore_case_;
};
// Writes a complete "digraph G { ... }" block: a graph label built from
// |label|, followed by everything reachable from |node|.
void DotPrinterImpl::PrintNode(const char* label, RegExpNode* node) {
  os_ << "digraph G {\n graph [label=\"";
  for (int i = 0; label[i]; i++) {
    switch (label[i]) {
      case '\\':
        os_ << "\\\\";
        break;
      case '"':
        // The label sits inside a DOT quoted string, so a quote character
        // must be written as backslash + quote; a bare quote would end the
        // string early and corrupt the output.
        os_ << "\\\"";
        break;
      default:
        os_ << label[i];
        break;
    }
  }
  os_ << "\"];\n";
  Visit(node);
  os_ << "}" << std::endl;
}
// Dispatches to the node's Accept the first time the node is seen; the
// visited flag in the node's info keeps later calls from repeating it.
void DotPrinterImpl::Visit(RegExpNode* node) {
  NodeInfo* info = node->info();
  if (!info->visited) {
    info->visited = true;
    node->Accept(this);
  }
}
// Emits a dotted edge from |from| to |on_failure|, then visits the target.
void DotPrinterImpl::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
  os_ << " n" << from;
  os_ << " -> n" << on_failure;
  os_ << " [style=dotted];\n";
  Visit(on_failure);
}
class TableEntryBodyPrinter {
public:
TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
: os_(os), choice_(choice) {}
void Call(uc16 from, DispatchTable::Entry entry) {
OutSet* out_set = entry.out_set();
for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
if (out_set->Get(i)) {
os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
<< choice()->alternatives()->at(i).node() << ";\n";
}
}
}
private:
ChoiceNode* choice() { return choice_; }
std::ostream& os_;
ChoiceNode* choice_;
};
class TableEntryHeaderPrinter {
public:
explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
: first_(true), os_(os) {}
void Call(uc16 from, DispatchTable::Entry entry) {
if (first_) {
first_ = false;
} else {
os_ << "|";
}
os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
OutSet* out_set = entry.out_set();
int priority = 0;
for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
if (out_set->Get(i)) {
if (priority > 0) os_ << "|";
os_ << "<s" << from << "o" << i << "> " << priority;
priority++;
}
}
os_ << "}}";
}
private:
bool first_;
std::ostream& os_;
};
// Writes "|"-separated attribute cells to a stream. The separator is emitted
// before every cell except the first one.
class AttributePrinter {
 public:
  explicit AttributePrinter(std::ostream& os)  // NOLINT
      : os_(os), first_(true) {}

  // Emits "|" unless this is the first call; afterwards no call is "first".
  void PrintSeparator() {
    if (!first_) os_ << "|";
    first_ = false;
  }

  // Emits "{name}" when |value| is true; otherwise does nothing.
  void PrintBit(const char* name, bool value) {
    if (value) {
      PrintSeparator();
      os_ << "{" << name << "}";
    }
  }

  // Emits "{name|value}" when |value| is non-negative (zero included);
  // negative values are skipped.
  void PrintPositive(const char* name, int value) {
    if (value >= 0) {
      PrintSeparator();
      os_ << "{" << name << "|" << value << "}";
    }
  }

 private:
  std::ostream& os_;
  bool first_;
};
// Emits a grey attribute record "a<node>" listing the node's interest bits
// (and its label position when the label is bound), plus a dashed edge tying
// that record to the node itself.
void DotPrinterImpl::PrintAttributes(RegExpNode* that) {
  os_ << " a" << that;
  os_ << " [shape=Mrecord, color=grey, fontcolor=grey, "
      << "margin=0.1, fontsize=10, label=\"{";

  AttributePrinter cells(os_);
  NodeInfo* node_info = that->info();
  cells.PrintBit("NI", node_info->follows_newline_interest);
  cells.PrintBit("WI", node_info->follows_word_interest);
  cells.PrintBit("SI", node_info->follows_start_interest);

  Label* node_label = that->label();
  if (node_label->is_bound()) {
    cells.PrintPositive("@", node_label->pos());
  }

  os_ << "}\"];\n";
  os_ << " a" << that << " -> n" << that
      << " [style=dashed, color=grey, arrowhead=none];\n";
}
static const bool kPrintDispatchTable = false;
// Prints a choice node. With kPrintDispatchTable set, the node is drawn as a
// record built from its dispatch table; otherwise it is drawn as a "?" record
// with one edge per alternative. In both cases every alternative is then
// printed by calling Accept on it directly, so the visited flag is not
// consulted at this point.
void DotPrinterImpl::VisitChoice(ChoiceNode* that) {
  if (kPrintDispatchTable) {
    os_ << " n" << that << " [shape=Mrecord, label=\"";
    TableEntryHeaderPrinter header_printer(os_);
    that->GetTable(ignore_case_)->ForEach(&header_printer);
    os_ << "\"]\n";
    PrintAttributes(that);
    TableEntryBodyPrinter body_printer(os_, that);
    that->GetTable(ignore_case_)->ForEach(&body_printer);
  } else {
    os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
    for (int i = 0; i < that->alternatives()->length(); i++) {
      GuardedAlternative alt = that->alternatives()->at(i);
      // Terminate each edge statement like every other statement this
      // printer emits, so the edges do not run together on a single line.
      os_ << " n" << that << " -> n" << alt.node() << ";\n";
    }
  }
  for (int i = 0; i < that->alternatives()->length(); i++) {
    GuardedAlternative alt = that->alternatives()->at(i);
    alt.node()->Accept(this);
  }
}
// Prints a text node as a double-bordered box whose label lists the node's
// elements separated by spaces: atoms character by character (each code unit
// narrowed to char) and character classes as "[...]" range lists. Then prints
// the attributes and the edge to the successor, and visits the successor.
void DotPrinterImpl::VisitText(TextNode* that) {
  Zone* zone = that->zone();
  os_ << " n" << that << " [label=\"";
  for (int index = 0; index < that->elements()->length(); index++) {
    if (index > 0) os_ << " ";
    TextElement element = that->elements()->at(index);
    switch (element.text_type()) {
      case TextElement::ATOM: {
        Vector<const uc16> chars = element.atom()->data();
        for (int k = 0; k < chars.length(); k++) {
          os_ << static_cast<char>(chars[k]);
        }
        break;
      }
      case TextElement::CHAR_CLASS: {
        RegExpCharacterClass* char_class = element.char_class();
        os_ << "[";
        if (char_class->is_negated()) os_ << "^";
        for (int j = 0; j < char_class->ranges(zone)->length(); j++) {
          CharacterRange range = char_class->ranges(zone)->at(j);
          os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
        }
        os_ << "]";
        break;
      }
      default:
        UNREACHABLE();
    }
  }
  os_ << "\", shape=box, peripheries=2];\n";
  PrintAttributes(that);
  RegExpNode* successor = that->on_success();
  os_ << " n" << that << " -> n" << successor << ";\n";
  Visit(that->on_success());
}
// Prints a back reference as a double octagon labelled with its start and end
// registers, followed by its attributes and the edge to its successor.
void DotPrinterImpl::VisitBackReference(BackReferenceNode* that) {
  os_ << " n" << that << " [label=\"$" << that->start_register();
  os_ << "..$" << that->end_register() << "\", shape=doubleoctagon];\n";
  PrintAttributes(that);
  os_ << " n" << that << " -> n" << that->on_success() << ";\n";
  Visit(that->on_success());
}
// Prints an end node as a bold point, followed by its attributes. End nodes
// have no outgoing edge.
void DotPrinterImpl::VisitEnd(EndNode* that) {
  os_ << " n" << that;
  os_ << " [style=bold, shape=point];\n";
  PrintAttributes(that);
}
// Prints an assertion node as a septagon labelled with the assertion's
// regexp notation, followed by its attributes and the edge to its successor.
void DotPrinterImpl::VisitAssertion(AssertionNode* that) {
  // Attribute text for the node; stays empty for a type not listed below.
  const char* attributes = "";
  switch (that->assertion_type()) {
    case AssertionNode::AT_END:
      attributes = "label=\"$\", shape=septagon";
      break;
    case AssertionNode::AT_START:
      attributes = "label=\"^\", shape=septagon";
      break;
    case AssertionNode::AT_BOUNDARY:
      attributes = "label=\"\\b\", shape=septagon";
      break;
    case AssertionNode::AT_NON_BOUNDARY:
      attributes = "label=\"\\B\", shape=septagon";
      break;
    case AssertionNode::AFTER_NEWLINE:
      attributes = "label=\"(?<=\\n)\", shape=septagon";
      break;
  }
  os_ << " n" << that << " [" << attributes << "];\n";
  PrintAttributes(that);
  RegExpNode* next = that->on_success();
  os_ << " n" << that << " -> n" << next << ";\n";
  Visit(next);
}
// Prints an action node with a label describing the action and the registers
// it touches, followed by its attributes and the edge to its successor.
void DotPrinterImpl::VisitAction(ActionNode* that) {
  os_ << " n" << that << " [";
  switch (that->action_type_) {
    case ActionNode::SET_REGISTER:
      os_ << "label=\"$" << that->data_.u_store_register.reg;
      os_ << ":=" << that->data_.u_store_register.value;
      os_ << "\", shape=octagon";
      break;
    case ActionNode::INCREMENT_REGISTER:
      os_ << "label=\"$" << that->data_.u_increment_register.reg;
      os_ << "++\", shape=octagon";
      break;
    case ActionNode::STORE_POSITION:
      os_ << "label=\"$" << that->data_.u_position_register.reg;
      os_ << ":=$pos\", shape=octagon";
      break;
    case ActionNode::BEGIN_SUBMATCH:
      os_ << "label=\"$" << that->data_.u_submatch.current_position_register;
      os_ << ":=$pos,begin\", shape=septagon";
      break;
    case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
      os_ << "label=\"escape\", shape=septagon";
      break;
    case ActionNode::EMPTY_MATCH_CHECK:
      os_ << "label=\"$" << that->data_.u_empty_match_check.start_register;
      os_ << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register;
      os_ << "<" << that->data_.u_empty_match_check.repetition_limit;
      os_ << "?\", shape=septagon";
      break;
    case ActionNode::CLEAR_CAPTURES: {
      os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from;
      os_ << " to $" << that->data_.u_clear_captures.range_to;
      os_ << "\", shape=septagon";
      break;
    }
  }
  os_ << "];\n";
  PrintAttributes(that);
  RegExpNode* next = that->on_success();
  os_ << " n" << that << " -> n" << next << ";\n";
  Visit(next);
}
// ForEach callback that writes one line per dispatch table entry in the form
// "[from-to]: {a, b, ...}".
class DispatchTableDumper {
 public:
  explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
  void Call(uc16 key, DispatchTable::Entry entry);

 private:
  std::ostream& os_;
};

// Prints the entry's range, then the members of its out set (indices below
// OutSet::kFirstLimit) separated by ", ".
void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
  os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
  OutSet* members = entry.out_set();
  bool need_separator = false;
  for (unsigned value = 0; value < OutSet::kFirstLimit; ++value) {
    if (!members->Get(value)) continue;
    if (need_separator) os_ << ", ";
    need_separator = true;
    os_ << value;
  }
  os_ << "}\n";
}
void DispatchTable::Dump() {
OFStream os(stderr);
DispatchTableDumper dumper(os);
tree()->ForEach(&dumper);
}
#endif // DEBUG
// Prints the graph rooted at |node| to stdout in dot format. The body is
// compiled only when DEBUG is defined; otherwise the call does nothing.
void DotPrinter::DotPrint(const char* label, RegExpNode* node,
                          bool ignore_case) {
#ifdef DEBUG
  StdoutStream stdout_stream;
  DotPrinterImpl impl(stdout_stream, ignore_case);
  impl.PrintNode(label, node);
#endif  // DEBUG
}
} // namespace internal
} // namespace v8

View File

@ -0,0 +1,23 @@
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_DOTPRINTER_H_
#define V8_REGEXP_REGEXP_DOTPRINTER_H_
#include "src/common/globals.h"
namespace v8 {
namespace internal {
class RegExpNode;
// Static-only entry point for dumping a regexp node graph in dot format.
class DotPrinter final : public AllStatic {
 public:
  // |label| becomes the graph label, |node| is the root of the graph to
  // print, and |ignore_case| is handed to the printer implementation.
  static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_DOTPRINTER_H_

678
src/regexp/regexp-nodes.h Normal file
View File

@ -0,0 +1,678 @@
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_NODES_H_
#define V8_REGEXP_REGEXP_NODES_H_
#include "src/regexp/regexp-macro-assembler.h"
#include "src/zone/zone.h"
namespace v8 {
namespace internal {
class AlternativeGenerationList;
class BoyerMooreLookahead;
class DispatchTable;
class GreedyLoopState;
class Label;
class NodeVisitor;
class QuickCheckDetails;
class RegExpCompiler;
class Trace;
struct PreloadState;
// X-macro listing the node type names. It expands VISIT(Name) once for each
// name, in the order written. Visitor classes pass a macro that turns each
// name into a Visit##Name(Name##Node*) declaration.
#define FOR_EACH_NODE_TYPE(VISIT) \
VISIT(End) \
VISIT(Action) \
VISIT(Choice) \
VISIT(BackReference) \
VISIT(Assertion) \
VISIT(Text)
// Per-node flags, packed into single-bit fields. All flags start out false.
struct NodeInfo final {
  NodeInfo()
      : being_analyzed(false),
        been_analyzed(false),
        follows_word_interest(false),
        follows_newline_interest(false),
        follows_start_interest(false),
        at_end(false),
        visited(false),
        replacement_calculated(false) {}

  // Returns true if the interests and assumptions of this node (at_end and
  // the three follows_* bits) equal those of |that|.
  bool Matches(NodeInfo* that) {
    if (at_end != that->at_end) return false;
    if (follows_word_interest != that->follows_word_interest) return false;
    if (follows_newline_interest != that->follows_newline_interest) {
      return false;
    }
    return follows_start_interest == that->follows_start_interest;
  }

  // Updates the interests of this node given the interests of the node
  // preceding it: at_end and the three follows_* bits are OR-ed in.
  void AddFromPreceding(NodeInfo* that) {
    if (that->at_end) at_end = true;
    if (that->follows_word_interest) follows_word_interest = true;
    if (that->follows_newline_interest) follows_newline_interest = true;
    if (that->follows_start_interest) follows_start_interest = true;
  }

  // True if any of the three follows_* bits is set.
  bool HasLookbehind() {
    if (follows_word_interest) return true;
    if (follows_newline_interest) return true;
    return follows_start_interest;
  }

  // Sets the interests of this node to include the interests of the following
  // node: only the three follows_* bits are OR-ed in; at_end is untouched.
  void AddFromFollowing(NodeInfo* that) {
    if (that->follows_word_interest) follows_word_interest = true;
    if (that->follows_newline_interest) follows_newline_interest = true;
    if (that->follows_start_interest) follows_start_interest = true;
  }

  // Clears the two analysis progress bits and leaves every other flag alone.
  void ResetCompilationState() {
    been_analyzed = false;
    being_analyzed = false;
  }

  bool being_analyzed : 1;
  bool been_analyzed : 1;

  // These bits are set if this node has to know what the preceding
  // character was.
  bool follows_word_interest : 1;
  bool follows_newline_interest : 1;
  bool follows_start_interest : 1;

  bool at_end : 1;
  bool visited : 1;
  bool replacement_calculated : 1;
};
// Abstract base class of the regexp node graph. Each node remembers the Zone
// it was constructed with and carries a Label, a NodeInfo, an optional
// replacement node and two cached BoyerMooreLookahead pointers.
class RegExpNode : public ZoneObject {
 public:
  explicit RegExpNode(Zone* zone)
      : replacement_(nullptr),
        on_work_list_(false),
        trace_count_(0),
        zone_(zone) {
    bm_info_[0] = nullptr;
    bm_info_[1] = nullptr;
  }
  virtual ~RegExpNode();

  // Visitor hook; implemented by every concrete node type.
  virtual void Accept(NodeVisitor* visitor) = 0;

  // Either emits the code for this node at the current point or emits a goto
  // to this node.
  virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;

  // Minimum number of characters this node must consume in order to succeed.
  // Once at least |still_to_find| characters are known to be consumed there
  // is no need to query following nodes any further. |not_at_start| signals
  // that we are known not to be at the start of the input; anchored branches
  // then always fail and can be left out of the calculation.
  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;

  // Emits a cheap check of the preloaded characters. Falls through when the
  // match certainly fails and jumps to |on_possible_success| when it might
  // succeed. Returns false (emitting nothing) if this node cannot perform a
  // quick check.
  bool EmitQuickCheck(RegExpCompiler* compiler, Trace* bounds_check_trace,
                      Trace* trace, bool preload_has_checked_bounds,
                      Label* on_possible_success,
                      QuickCheckDetails* details_return,
                      bool fall_through_on_failure);

  // Produces a mask and a value for a given number of characters: the next n
  // characters are and-ed with the mask and compared with the value. If the
  // comparison fails the node cannot match those characters; if it succeeds
  // the node may match.
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
                                    int characters_filled_in,
                                    bool not_at_start) = 0;

  static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
  virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }

  // Yields a successor only for a length-1 text node that matches any
  // character and has no guards on it; the base implementation yields
  // nullptr.
  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
      RegExpCompiler* compiler) {
    return nullptr;
  }

  // Gathers the code units (mod 128) that can match when looking ahead, for
  // use by a Boyer-Moore-like string search. The budget argument bounds how
  // many nodes we are willing to inspect while building this data.
  // TODO(erikcorry): This should share more code with EatsAtLeast,
  // GetQuickCheckDetails.
  static const int kRecursionBudget = 200;
  bool KeepRecursing(RegExpCompiler* compiler);
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
                            BoyerMooreLookahead* bm, bool not_at_start) {
    UNREACHABLE();
  }

  // When the input is known to be one-byte some nodes can never match.
  // Returns the node to use in place of this one, or nullptr if this node can
  // never match. The base implementation keeps the node as is.
  virtual RegExpNode* FilterOneByte(int depth) { return this; }

  // Helpers for FilterOneByte: read back / record the computed replacement.
  RegExpNode* replacement() {
    DCHECK(info()->replacement_calculated);
    return replacement_;
  }
  RegExpNode* set_replacement(RegExpNode* replacement) {
    info()->replacement_calculated = true;
    replacement_ = replacement;
    // Hand the argument back so callers can chain on it.
    return replacement;
  }

  // Lookahead info is cached on the node to avoid recomputing it. Only info
  // computed relative to this node (offset == 0) is stored.
  void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
    if (offset != 0) return;
    set_bm_info(not_at_start, bm);
  }

  Label* label() { return &label_; }

  // Non-generic code (generated when the node is not at the start of the
  // trace) cannot be reused. This caps how often that may happen before a new
  // trace is started and reusable generic code is generated instead, by
  // flushing the deferred actions of the current trace and emitting a goto.
  static const int kMaxCopiesCodeGenerated = 10;

  bool on_work_list() { return on_work_list_; }
  void set_on_work_list(bool value) { on_work_list_ = value; }

  NodeInfo* info() { return &info_; }

  // Cached lookahead for the given |not_at_start| state (slot 1 when true,
  // slot 0 when false).
  BoyerMooreLookahead* bm_info(bool not_at_start) {
    const int slot = not_at_start ? 1 : 0;
    return bm_info_[slot];
  }

  Zone* zone() const { return zone_; }

 protected:
  enum LimitResult { DONE, CONTINUE };
  RegExpNode* replacement_;

  LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);

  // Stores |bm| in the slot selected by |not_at_start|.
  void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
    const int slot = not_at_start ? 1 : 0;
    bm_info_[slot] = bm;
  }

 private:
  static const int kFirstCharBudget = 10;
  Label label_;
  bool on_work_list_;
  NodeInfo info_;
  // Counts how many times code has been generated for this node (in
  // different traces). The location of the generated code is only tracked
  // when it was generated at the start of a trace, in which case it is
  // generic and can be reused by flushing the deferred operations of the
  // current trace and emitting a goto.
  int trace_count_;
  BoyerMooreLookahead* bm_info_[2];
  Zone* zone_;
};
// A node with exactly one successor, |on_success|. The node shares the
// successor's Zone.
class SeqRegExpNode : public RegExpNode {
 public:
  explicit SeqRegExpNode(RegExpNode* on_success)
      : RegExpNode(on_success->zone()), on_success_(on_success) {}

  RegExpNode* on_success() { return on_success_; }
  void set_on_success(RegExpNode* node) { on_success_ = node; }

  RegExpNode* FilterOneByte(int depth) override;

  // Delegates to the successor with the budget reduced by one, then caches
  // |bm| on this node when the info is relative to it (offset == 0).
  void FillInBMInfo(Isolate* isolate, int offset, int budget,
                    BoyerMooreLookahead* bm, bool not_at_start) override {
    const int remaining_budget = budget - 1;
    on_success_->FillInBMInfo(isolate, offset, remaining_budget, bm,
                              not_at_start);
    if (offset == 0) {
      set_bm_info(not_at_start, bm);
    }
  }

 protected:
  RegExpNode* FilterSuccessor(int depth);

 private:
  RegExpNode* on_success_;
};
// A sequence node tagged with an ActionType and a per-type payload stored in
// the |data_| union. Instances are created through the static factory
// functions; the constructor is private.
class ActionNode : public SeqRegExpNode {
 public:
  enum ActionType {
    SET_REGISTER,
    INCREMENT_REGISTER,
    STORE_POSITION,
    BEGIN_SUBMATCH,
    POSITIVE_SUBMATCH_SUCCESS,
    EMPTY_MATCH_CHECK,
    CLEAR_CAPTURES
  };

  // Factory functions, one per kind of action.
  static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
  static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
  static ActionNode* StorePosition(int reg, bool is_capture,
                                   RegExpNode* on_success);
  static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
  static ActionNode* BeginSubmatch(int stack_pointer_reg, int position_reg,
                                   RegExpNode* on_success);
  static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
                                             int restore_reg,
                                             int clear_capture_count,
                                             int clear_capture_from,
                                             RegExpNode* on_success);
  static ActionNode* EmptyMatchCheck(int start_register,
                                     int repetition_register,
                                     int repetition_limit,
                                     RegExpNode* on_success);

  void Accept(NodeVisitor* visitor) override;
  void Emit(RegExpCompiler* compiler, Trace* trace) override;
  int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;

  // Quick-check details are simply those of the successor.
  void GetQuickCheckDetails(QuickCheckDetails* details,
                            RegExpCompiler* compiler, int filled_in,
                            bool not_at_start) override {
    on_success()->GetQuickCheckDetails(details, compiler, filled_in,
                                       not_at_start);
  }

  void FillInBMInfo(Isolate* isolate, int offset, int budget,
                    BoyerMooreLookahead* bm, bool not_at_start) override;

  ActionType action_type() { return action_type_; }

  // TODO(erikcorry): We should allow some action nodes in greedy loops.
  int GreedyLoopTextLength() override {
    return kNodeIsTooComplexForGreedyLoops;
  }

 private:
  // Payload; which member is meaningful depends on the action type.
  union {
    struct {
      int reg;
      int value;
    } u_store_register;
    struct {
      int reg;
    } u_increment_register;
    struct {
      int reg;
      bool is_capture;
    } u_position_register;
    struct {
      int stack_pointer_register;
      int current_position_register;
      int clear_register_count;
      int clear_register_from;
    } u_submatch;
    struct {
      int start_register;
      int repetition_register;
      int repetition_limit;
    } u_empty_match_check;
    struct {
      int range_from;
      int range_to;
    } u_clear_captures;
  } data_;

  ActionNode(ActionType action_type, RegExpNode* on_success)
      : SeqRegExpNode(on_success), action_type_(action_type) {}

  ActionType action_type_;

  friend class DotPrinterImpl;
};
// A sequence node holding a list of TextElements, plus a flag telling whether
// the text is read backward.
class TextNode : public SeqRegExpNode {
 public:
  // Wraps an existing element list.
  TextNode(ZoneList<TextElement>* elms, bool read_backward,
           RegExpNode* on_success)
      : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}

  // Builds a one-element list containing the character class |that|.
  TextNode(RegExpCharacterClass* that, bool read_backward,
           RegExpNode* on_success)
      : SeqRegExpNode(on_success),
        elms_(new (zone()) ZoneList<TextElement>(1, zone())),
        read_backward_(read_backward) {
    elms_->Add(TextElement::CharClass(that), zone());
  }

  // Creates a TextNode for a single character class built from |ranges|.
  static TextNode* CreateForCharacterRanges(Zone* zone,
                                            ZoneList<CharacterRange>* ranges,
                                            bool read_backward,
                                            RegExpNode* on_success,
                                            JSRegExp::Flags flags);

  // Creates a TextNode for a surrogate pair, with one range for the lead
  // surrogate and one for the trail surrogate.
  static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
                                          CharacterRange trail,
                                          bool read_backward,
                                          RegExpNode* on_success,
                                          JSRegExp::Flags flags);

  void Accept(NodeVisitor* visitor) override;
  void Emit(RegExpCompiler* compiler, Trace* trace) override;
  int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
  void GetQuickCheckDetails(QuickCheckDetails* details,
                            RegExpCompiler* compiler, int characters_filled_in,
                            bool not_at_start) override;

  ZoneList<TextElement>* elements() { return elms_; }
  bool read_backward() { return read_backward_; }

  void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
  int GreedyLoopTextLength() override;
  RegExpNode* GetSuccessorOfOmnivorousTextNode(
      RegExpCompiler* compiler) override;
  void FillInBMInfo(Isolate* isolate, int offset, int budget,
                    BoyerMooreLookahead* bm, bool not_at_start) override;
  void CalculateOffsets();
  RegExpNode* FilterOneByte(int depth) override;

 private:
  // The passes used when emitting code for the text.
  enum TextEmitPassType {
    NON_LATIN1_MATCH,            // Check for characters that can't match.
    SIMPLE_CHARACTER_MATCH,      // Case-dependent single character check.
    NON_LETTER_CHARACTER_MATCH,  // Check characters that have no case equivs.
    CASE_CHARACTER_MATCH,        // Case-independent single character check.
    CHARACTER_CLASS_MATCH        // Character class.
  };

  static bool SkipPass(TextEmitPassType pass, bool ignore_case);

  static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
  static const int kLastPass = CHARACTER_CLASS_MATCH;

  void TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
                    bool preloaded, Trace* trace, bool first_element_checked,
                    int* checked_up_to);
  int Length();

  ZoneList<TextElement>* elms_;
  bool read_backward_;
};
class AssertionNode : public SeqRegExpNode {
public:
enum AssertionType {
AT_END,
AT_START,
AT_BOUNDARY,
AT_NON_BOUNDARY,
AFTER_NEWLINE
};
static AssertionNode* AtEnd(RegExpNode* on_success) {
return new (on_success->zone()) AssertionNode(AT_END, on_success);
}
static AssertionNode* AtStart(RegExpNode* on_success) {
return new (on_success->zone()) AssertionNode(AT_START, on_success);
}
static AssertionNode* AtBoundary(RegExpNode* on_success) {
return new (on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
}
static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
return new (on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
}
static AssertionNode* AfterNewline(RegExpNode* on_success) {
return new (on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
}
void Accept(NodeVisitor* visitor) override;
void Emit(RegExpCompiler* compiler, Trace* trace) override;
int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler, int filled_in,
bool not_at_start) override;
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override;
AssertionType assertion_type() { return assertion_type_; }
private:
void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
enum IfPrevious { kIsNonWord, kIsWord };
void BacktrackIfPrevious(RegExpCompiler* compiler, Trace* trace,
IfPrevious backtrack_if_previous);
AssertionNode(AssertionType t, RegExpNode* on_success)
: SeqRegExpNode(on_success), assertion_type_(t) {}
AssertionType assertion_type_;
};
// A sequence node recording a start register, an end register, the regexp
// flags and a read-backward flag.
class BackReferenceNode : public SeqRegExpNode {
 public:
  BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags,
                    bool read_backward, RegExpNode* on_success)
      : SeqRegExpNode(on_success),
        start_reg_(start_reg),
        end_reg_(end_reg),
        flags_(flags),
        read_backward_(read_backward) {}

  void Accept(NodeVisitor* visitor) override;

  int start_register() { return start_reg_; }
  int end_register() { return end_reg_; }
  bool read_backward() { return read_backward_; }

  void Emit(RegExpCompiler* compiler, Trace* trace) override;
  int EatsAtLeast(int still_to_find, int recursion_depth,
                  bool not_at_start) override;

  // Intentionally a no-op: |details| is left untouched.
  void GetQuickCheckDetails(QuickCheckDetails* details,
                            RegExpCompiler* compiler, int characters_filled_in,
                            bool not_at_start) override {}

  void FillInBMInfo(Isolate* isolate, int offset, int budget,
                    BoyerMooreLookahead* bm, bool not_at_start) override;

 private:
  int start_reg_;
  int end_reg_;
  JSRegExp::Flags flags_;
  bool read_backward_;
};
// Terminal node of the graph, tagged with the Action it stands for.
class EndNode : public RegExpNode {
 public:
  enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };

  EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}

  void Accept(NodeVisitor* visitor) override;
  void Emit(RegExpCompiler* compiler, Trace* trace) override;

  // An end node never consumes any characters.
  int EatsAtLeast(int still_to_find, int recursion_depth,
                  bool not_at_start) override {
    return 0;
  }

  void GetQuickCheckDetails(QuickCheckDetails* details,
                            RegExpCompiler* compiler, int characters_filled_in,
                            bool not_at_start) override {
    // EatsAtLeast yields 0, which should ensure this is never reached.
    UNREACHABLE();
  }

  void FillInBMInfo(Isolate* isolate, int offset, int budget,
                    BoyerMooreLookahead* bm, bool not_at_start) override {
    // EatsAtLeast yields 0, which should ensure this is never reached.
    UNREACHABLE();
  }

 private:
  Action action_;
};
// An EndNode with action NEGATIVE_SUBMATCH_SUCCESS that additionally records
// the stack-pointer register, the position register and the capture range
// (count and start) given at construction. It supplies its own Emit.
class NegativeSubmatchSuccess : public EndNode {
 public:
  NegativeSubmatchSuccess(int stack_pointer_reg, int position_reg,
                          int clear_capture_count, int clear_capture_start,
                          Zone* zone)
      : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
        stack_pointer_register_(stack_pointer_reg),
        current_position_register_(position_reg),
        clear_capture_count_(clear_capture_count),
        clear_capture_start_(clear_capture_start) {}

  void Emit(RegExpCompiler* compiler, Trace* trace) override;

 private:
  int stack_pointer_register_;
  int current_position_register_;
  int clear_capture_count_;
  int clear_capture_start_;
};
// Immutable triple (register, relation, value) where the relation is either
// LT or GEQ. GuardedAlternative keeps a list of these.
class Guard : public ZoneObject {
 public:
  enum Relation { LT, GEQ };

  Guard(int reg, Relation op, int value) : reg_(reg), op_(op), value_(value) {}

  // Plain accessors for the three components.
  int reg() { return reg_; }
  Relation op() { return op_; }
  int value() { return value_; }

 private:
  int reg_;
  Relation op_;
  int value_;
};
// Pairs a node with an optional list of Guards. The guard list starts out as
// nullptr.
class GuardedAlternative {
 public:
  explicit GuardedAlternative(RegExpNode* node)
      : node_(node), guards_(nullptr) {}

  void AddGuard(Guard* guard, Zone* zone);

  RegExpNode* node() { return node_; }
  void set_node(RegExpNode* node) { node_ = node; }

  // May be nullptr; the constructor does not allocate a list.
  ZoneList<Guard*>* guards() { return guards_; }

 private:
  RegExpNode* node_;
  ZoneList<Guard*>* guards_;
};
class AlternativeGeneration;
class ChoiceNode : public RegExpNode {
public:
explicit ChoiceNode(int expected_size, Zone* zone)
: RegExpNode(zone),
alternatives_(new (zone)
ZoneList<GuardedAlternative>(expected_size, zone)),
table_(nullptr),
not_at_start_(false),
being_calculated_(false) {}
void Accept(NodeVisitor* visitor) override;
void AddAlternative(GuardedAlternative node) {
alternatives()->Add(node, zone());
}
ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
DispatchTable* GetTable(bool ignore_case);
void Emit(RegExpCompiler* compiler, Trace* trace) override;
int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
int EatsAtLeastHelper(int still_to_find, int budget,
RegExpNode* ignore_this_node, bool not_at_start);
void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler, int characters_filled_in,
bool not_at_start) override;
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override;
bool being_calculated() { return being_calculated_; }
bool not_at_start() { return not_at_start_; }
void set_not_at_start() { not_at_start_ = true; }
void set_being_calculated(bool b) { being_calculated_ = b; }
virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
return true;
}
RegExpNode* FilterOneByte(int depth) override;
virtual bool read_backward() { return false; }
protected:
int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
ZoneList<GuardedAlternative>* alternatives_;
private:
friend class DispatchTableConstructor;
friend class Analysis;
void GenerateGuard(RegExpMacroAssembler* macro_assembler, Guard* guard,
Trace* trace);
int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
void EmitOutOfLineContinuation(RegExpCompiler* compiler, Trace* trace,
GuardedAlternative alternative,
AlternativeGeneration* alt_gen,
int preload_characters,
bool next_expects_preload);
void SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace,
PreloadState* preloads);
void AssertGuardsMentionRegisters(Trace* trace);
int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
Trace* EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace,
AlternativeGenerationList* alt_gens,
PreloadState* preloads,
GreedyLoopState* greedy_loop_state, int text_length);
void EmitChoices(RegExpCompiler* compiler,
AlternativeGenerationList* alt_gens, int first_choice,
Trace* trace, PreloadState* preloads);
DispatchTable* table_;
// If true, this node is never checked at the start of the input.
// Allows a new trace to start with at_start() set to false.
bool not_at_start_;
bool being_calculated_;
};
// A two-way choice whose first alternative is |this_must_fail| and whose
// second alternative is |then_do_this|.
class NegativeLookaroundChoiceNode : public ChoiceNode {
 public:
  explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
                                        GuardedAlternative then_do_this,
                                        Zone* zone)
      : ChoiceNode(2, zone) {
    AddAlternative(this_must_fail);
    AddAlternative(then_do_this);
  }

  int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
  void GetQuickCheckDetails(QuickCheckDetails* details,
                            RegExpCompiler* compiler, int characters_filled_in,
                            bool not_at_start) override;

  // Only the alternative at index 1 contributes lookahead info; the budget is
  // reduced by one for the recursive call. The result is cached on this node
  // when it is relative to it (offset == 0).
  void FillInBMInfo(Isolate* isolate, int offset, int budget,
                    BoyerMooreLookahead* bm, bool not_at_start) override {
    const int remaining_budget = budget - 1;
    alternatives_->at(1).node()->FillInBMInfo(isolate, offset,
                                              remaining_budget, bm,
                                              not_at_start);
    if (offset == 0) {
      set_bm_info(not_at_start, bm);
    }
  }

  // No quick check is emitted for the alternative that is expected to fail.
  // Quick check code begins by loading enough characters for the alternative
  // that takes the fewest characters, but for a negative lookahead the
  // negative branch was not part of that calculation (EatsAtLeast), so the
  // assumptions do not hold for it.
  bool try_to_emit_quick_check_for_alternative(bool is_first) override {
    return !is_first;
  }

  RegExpNode* FilterOneByte(int depth) override;
};
// A choice node with a designated loop node and continue node, both nullptr
// until set. Alternatives are added through AddLoopAlternative and
// AddContinueAlternative only.
class LoopChoiceNode : public ChoiceNode {
 public:
  LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
      : ChoiceNode(2, zone),
        loop_node_(nullptr),
        continue_node_(nullptr),
        body_can_be_zero_length_(body_can_be_zero_length),
        read_backward_(read_backward) {}

  void AddLoopAlternative(GuardedAlternative alt);
  void AddContinueAlternative(GuardedAlternative alt);

  void Emit(RegExpCompiler* compiler, Trace* trace) override;
  int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
  void GetQuickCheckDetails(QuickCheckDetails* details,
                            RegExpCompiler* compiler, int characters_filled_in,
                            bool not_at_start) override;
  void FillInBMInfo(Isolate* isolate, int offset, int budget,
                    BoyerMooreLookahead* bm, bool not_at_start) override;

  RegExpNode* loop_node() { return loop_node_; }
  RegExpNode* continue_node() { return continue_node_; }
  bool body_can_be_zero_length() { return body_can_be_zero_length_; }
  bool read_backward() override { return read_backward_; }

  void Accept(NodeVisitor* visitor) override;
  RegExpNode* FilterOneByte(int depth) override;

 private:
  // Hidden on purpose for loop nodes: alternatives must not be added freely
  // because we have to keep track of which node leads back to the loop node
  // itself.
  void AddAlternative(GuardedAlternative node) {
    ChoiceNode::AddAlternative(node);
  }

  RegExpNode* loop_node_;
  RegExpNode* continue_node_;
  bool body_can_be_zero_length_;
  bool read_backward_;
};
// Visitor interface over the node graph: one pure virtual Visit<Name> per
// entry of FOR_EACH_NODE_TYPE, plus VisitLoopChoice, which by default is
// handled like a plain choice node.
class NodeVisitor {
 public:
  virtual ~NodeVisitor() = default;

#define DECLARE_VISIT(Name) virtual void Visit##Name(Name##Node* that) = 0;
  FOR_EACH_NODE_TYPE(DECLARE_VISIT)
#undef DECLARE_VISIT

  // Falls back to VisitChoice unless overridden.
  virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_NODES_H_

View File

@ -11,6 +11,7 @@
#include "src/objects/objects-inl.h"
#include "src/regexp/jsregexp.h"
#include "src/regexp/property-sequences.h"
#include "src/regexp/regexp-macro-assembler.h"
#include "src/strings/char-predicates-inl.h"
#include "src/utils/ostreams.h"
#include "src/utils/utils.h"