Trace contains information about whether we know that we are at the start of input.

Choice nodes may know that they are never not at the start of input.
This can remove start_of_input assertions in cases where they are statically known to fail.
The initial loop is unrolled once if the regexp might check for the start of input. Only the first iteration may be at the start, the following loop knows that it isn't.


git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@1217 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
lrn@chromium.org 2009-02-03 11:43:55 +00:00
parent 3835e915c4
commit cf1e1b1b98
7 changed files with 170 additions and 51 deletions

View File

@ -250,7 +250,13 @@ bool RegExpAssertion::IsAnchored() {
bool RegExpAlternative::IsAnchored() {
return this->nodes()->at(0)->IsAnchored();
ZoneList<RegExpTree*>* nodes = this->nodes();
for (int i = 0; i < nodes->length(); i++) {
RegExpTree* node = nodes->at(i);
if (node->IsAnchored()) { return true; }
if (node->max_match() > 0) { return false; }
}
return false;
}

View File

@ -1461,7 +1461,8 @@ class RegExpQuantifier: public RegExpTree {
bool is_greedy,
RegExpTree* body,
RegExpCompiler* compiler,
RegExpNode* on_success);
RegExpNode* on_success,
bool not_at_start = false);
virtual RegExpQuantifier* AsQuantifier();
virtual Interval CaptureRegisters();
virtual bool IsQuantifier();

View File

@ -438,6 +438,7 @@ int FlagList::SetFlagsFromCommandLine(int* argc,
if (FLAG_help) {
PrintHelp();
exit(0);
}
// parsed all flags successfully
return 0;

View File

@ -1269,7 +1269,7 @@ Handle<FixedArray> RegExpCompiler::Assemble(
List <RegExpNode*> work_list(0);
work_list_ = &work_list;
Label fail;
macro_assembler->PushBacktrack(&fail);
macro_assembler_->PushBacktrack(&fail);
Trace new_trace;
start->Emit(this, &new_trace);
macro_assembler_->Bind(&fail);
@ -2068,11 +2068,12 @@ int NegativeLookaheadChoiceNode:: EatsAtLeast(int still_to_find,
void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
QuickCheckDetails* details,
RegExpCompiler* compiler,
int filled_in) {
int filled_in,
bool not_at_start) {
// Alternative 0 is the negative lookahead, alternative 1 is what comes
// afterwards.
RegExpNode* node = alternatives_->at(1).node();
return node->GetQuickCheckDetails(details, compiler, filled_in);
return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
}
@ -2145,7 +2146,8 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
QuickCheckDetails* details,
bool fall_through_on_failure) {
if (details->characters() == 0) return false;
GetQuickCheckDetails(details, compiler, 0);
GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
if (details->cannot_match()) return false;
if (!details->Rationalize(compiler->ascii())) return false;
uint32_t mask = details->mask();
uint32_t value = details->value();
@ -2210,7 +2212,8 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
// generating a quick check.
void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in) {
int characters_filled_in,
bool not_at_start) {
ASSERT(characters_filled_in < details->characters());
int characters = details->characters();
int char_mask;
@ -2322,7 +2325,10 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
}
}
ASSERT(characters_filled_in != details->characters());
on_success()-> GetQuickCheckDetails(details, compiler, characters_filled_in);
on_success()-> GetQuickCheckDetails(details,
compiler,
characters_filled_in,
true);
}
@ -2359,6 +2365,13 @@ void QuickCheckDetails::Advance(int by, bool ascii) {
void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
ASSERT(characters_ == other->characters_);
if (other->cannot_match_) {
return;
}
if (cannot_match_) {
*this = *other;
return;
}
for (int i = from_index; i < characters_; i++) {
QuickCheckDetails::Position* pos = positions(i);
QuickCheckDetails::Position* other_pos = other->positions(i);
@ -2395,27 +2408,34 @@ class VisitMarker {
void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in) {
int characters_filled_in,
bool not_at_start) {
if (body_can_be_zero_length_ || info()->visited) return;
VisitMarker marker(info());
return ChoiceNode::GetQuickCheckDetails(details,
compiler,
characters_filled_in);
characters_filled_in,
not_at_start);
}
void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in) {
int characters_filled_in,
bool not_at_start) {
not_at_start = not_at_start || not_at_start_;
int choice_count = alternatives_->length();
ASSERT(choice_count > 0);
alternatives_->at(0).node()->GetQuickCheckDetails(details,
compiler,
characters_filled_in);
characters_filled_in,
not_at_start);
for (int i = 1; i < choice_count; i++) {
QuickCheckDetails new_details(details->characters());
RegExpNode* node = alternatives_->at(i).node();
node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in);
node->GetQuickCheckDetails(&new_details, compiler,
characters_filled_in,
not_at_start);
// Here we merge the quick match details of the two branches.
details->Merge(&new_details, characters_filled_in);
}
@ -2541,6 +2561,21 @@ static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
}
void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int filled_in,
bool not_at_start) {
if (type_ == AT_START && not_at_start) {
details->set_cannot_match();
return;
}
return on_success()->GetQuickCheckDetails(details,
compiler,
filled_in,
not_at_start);
}
void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
RegExpMacroAssembler* assembler = compiler->macro_assembler();
switch (type_) {
@ -2551,9 +2586,20 @@ void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
assembler->Bind(&ok);
break;
}
case AT_START:
assembler->CheckNotAtStart(trace->backtrack());
break;
case AT_START: {
if (trace->at_start() == Trace::FALSE) {
assembler->GoTo(trace->backtrack());
return;
}
if (trace->at_start() == Trace::UNKNOWN) {
assembler->CheckNotAtStart(trace->backtrack());
Trace at_start_trace = *trace;
at_start_trace.set_at_start(true);
on_success()->Emit(compiler, &at_start_trace);
return;
}
}
break;
case AFTER_NEWLINE:
EmitHat(compiler, on_success(), trace);
return;
@ -2772,6 +2818,7 @@ void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
&bound_checked_to);
Trace successor_trace(*trace);
successor_trace.set_at_start(false);
successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
RecursionCheck rc(compiler);
on_success()->Emit(compiler, &successor_trace);
@ -3065,6 +3112,8 @@ void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
Label greedy_loop_label;
Trace counter_backtrack_trace;
counter_backtrack_trace.set_backtrack(&greedy_loop_label);
if (not_at_start()) counter_backtrack_trace.set_at_start(false);
if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
// Here we have special handling for greedy loops containing only text nodes
// and other simple nodes. These are handled by pushing the current
@ -3079,6 +3128,7 @@ void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
current_trace = &counter_backtrack_trace;
Label greedy_match_failed;
Trace greedy_match_trace;
if (not_at_start()) greedy_match_trace.set_at_start(false);
greedy_match_trace.set_backtrack(&greedy_match_failed);
Label loop_label;
macro_assembler->Bind(&loop_label);
@ -3139,6 +3189,11 @@ void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
new_trace.set_bound_checked_up_to(preload_characters);
generate_full_check_inline = true;
}
} else if (alt_gen->quick_check_details.cannot_match()) {
if (i == choice_count - 1 && !greedy_loop) {
macro_assembler->GoTo(trace->backtrack());
}
continue;
} else {
// No quick check was generated. Put the full code here.
// If this is not the first choice then there could be slow checks from
@ -3868,7 +3923,8 @@ RegExpNode* RegExpQuantifier::ToNode(int min,
bool is_greedy,
RegExpTree* body,
RegExpCompiler* compiler,
RegExpNode* on_success) {
RegExpNode* on_success,
bool not_at_start) {
// x{f, t} becomes this:
//
// (r++)<-.
@ -3907,8 +3963,8 @@ RegExpNode* RegExpQuantifier::ToNode(int min,
if (min > 0 && min <= kMaxUnrolledMinMatches) {
int new_max = (max == kInfinity) ? max : max - min;
// Recurse once to get the loop or optional matches after the fixed ones.
RegExpNode* answer =
ToNode(0, new_max, is_greedy, body, compiler, on_success);
RegExpNode* answer = ToNode(
0, new_max, is_greedy, body, compiler, on_success, true);
// Unroll the forced matches from 0 to min. This can cause chains of
// TextNodes (which the parser does not generate). These should be
// combined if it turns out they hinder good code generation.
@ -3933,6 +3989,7 @@ RegExpNode* RegExpQuantifier::ToNode(int min,
answer)));
}
answer = alternation;
if (not_at_start) alternation->set_not_at_start();
}
return answer;
}
@ -3944,6 +4001,7 @@ RegExpNode* RegExpQuantifier::ToNode(int min,
? compiler->AllocateRegister()
: RegExpCompiler::kNoRegister;
LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
if (not_at_start) center->set_not_at_start();
RegExpNode* loop_return = needs_counter
? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
: static_cast<RegExpNode*>(center);
@ -4764,12 +4822,26 @@ Handle<FixedArray> RegExpEngine::Compile(RegExpCompileData* data,
if (!data->tree->IsAnchored()) {
// Add a .*? at the beginning, outside the body capture, unless
// this expression is anchored at the beginning.
node = RegExpQuantifier::ToNode(0,
RegExpTree::kInfinity,
false,
new RegExpCharacterClass('*'),
&compiler,
captured_body);
RegExpNode* loop_node =
RegExpQuantifier::ToNode(0,
RegExpTree::kInfinity,
false,
new RegExpCharacterClass('*'),
&compiler,
captured_body,
data->contains_anchor);
if (data->contains_anchor) {
// Unroll loop once, to take care of the case that might start
// at the start of input.
ChoiceNode* first_step_node = new ChoiceNode(2);
first_step_node->AddAlternative(GuardedAlternative(captured_body));
first_step_node->AddAlternative(GuardedAlternative(
new TextNode(new RegExpCharacterClass('*'), loop_node)));
node = first_step_node;
} else {
node = loop_node;
}
}
data->node = node;
Analysis analysis(ignore_case);

View File

@ -454,10 +454,6 @@ class Trace;
struct NodeInfo {
enum TriBool {
UNKNOWN = -1, FALSE = 0, TRUE = 1
};
NodeInfo()
: being_analyzed(false),
been_analyzed(false),
@ -544,17 +540,21 @@ class QuickCheckDetails {
QuickCheckDetails()
: characters_(0),
mask_(0),
value_(0) { }
value_(0),
cannot_match_(false) { }
explicit QuickCheckDetails(int characters)
: characters_(characters),
mask_(0),
value_(0) { }
value_(0),
cannot_match_(false) { }
bool Rationalize(bool ascii);
// Merge in the information from another branch of an alternation.
void Merge(QuickCheckDetails* other, int from_index);
// Advance the current position by some amount.
void Advance(int by, bool ascii);
void Clear();
bool cannot_match() { return cannot_match_; }
void set_cannot_match() { cannot_match_ = true; }
struct Position {
Position() : mask(0), value(0), determines_perfectly(false) { }
uc16 mask;
@ -579,6 +579,9 @@ class QuickCheckDetails {
// These values are the condensate of the above array after Rationalize().
uint32_t mask_;
uint32_t value_;
// If set to true, there is no way this quick check can match at all.
// E.g., if it requires to be at the start of the input, and isn't.
bool cannot_match_;
};
@ -609,7 +612,8 @@ class RegExpNode: public ZoneObject {
// A comparison success indicates the node may match.
virtual void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in) = 0;
int characters_filled_in,
bool not_at_start) = 0;
static const int kNodeIsTooComplexForGreedyLoops = -1;
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
Label* label() { return &label_; }
@ -740,8 +744,10 @@ class ActionNode: public SeqRegExpNode {
virtual int EatsAtLeast(int still_to_find, int recursion_depth);
virtual void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int filled_in) {
return on_success()->GetQuickCheckDetails(details, compiler, filled_in);
int filled_in,
bool not_at_start) {
return on_success()->GetQuickCheckDetails(
details, compiler, filled_in, not_at_start);
}
Type type() { return type_; }
// TODO(erikcorry): We should allow some action nodes in greedy loops.
@ -802,7 +808,8 @@ class TextNode: public SeqRegExpNode {
virtual int EatsAtLeast(int still_to_find, int recursion_depth);
virtual void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in);
int characters_filled_in,
bool not_at_start);
ZoneList<TextElement>* elements() { return elms_; }
void MakeCaseIndependent();
virtual int GreedyLoopTextLength();
@ -860,9 +867,8 @@ class AssertionNode: public SeqRegExpNode {
virtual int EatsAtLeast(int still_to_find, int recursion_depth);
virtual void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int filled_in) {
return on_success()->GetQuickCheckDetails(details, compiler, filled_in);
}
int filled_in,
bool not_at_start);
virtual AssertionNode* Clone() { return new AssertionNode(*this); }
AssertionNodeType type() { return type_; }
private:
@ -887,7 +893,8 @@ class BackReferenceNode: public SeqRegExpNode {
virtual int EatsAtLeast(int still_to_find, int recursion_depth);
virtual void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in) {
int characters_filled_in,
bool not_at_start) {
return;
}
virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }
@ -907,7 +914,8 @@ class EndNode: public RegExpNode {
virtual int EatsAtLeast(int still_to_find, int recursion_depth) { return 0; }
virtual void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in) {
int characters_filled_in,
bool not_at_start) {
// Returning 0 from EatsAtLeast should ensure we never get here.
UNREACHABLE();
}
@ -979,6 +987,7 @@ class ChoiceNode: public RegExpNode {
explicit ChoiceNode(int expected_size)
: alternatives_(new ZoneList<GuardedAlternative>(expected_size)),
table_(NULL),
not_at_start_(false),
being_calculated_(false) { }
virtual void Accept(NodeVisitor* visitor);
void AddAlternative(GuardedAlternative node) { alternatives()->Add(node); }
@ -991,10 +1000,13 @@ class ChoiceNode: public RegExpNode {
RegExpNode* ignore_this_node);
virtual void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in);
int characters_filled_in,
bool not_at_start);
virtual ChoiceNode* Clone() { return new ChoiceNode(*this); }
bool being_calculated() { return being_calculated_; }
bool not_at_start() { return not_at_start_; }
void set_not_at_start() { not_at_start_ = true; }
void set_being_calculated(bool b) { being_calculated_ = b; }
virtual bool try_to_emit_quick_check_for_alternative(int i) { return true; }
@ -1016,6 +1028,9 @@ class ChoiceNode: public RegExpNode {
int preload_characters,
bool next_expects_preload);
DispatchTable* table_;
// If true, this node is never checked at the start of the input.
// Allows a new trace to start with at_start() set to false.
bool not_at_start_;
bool being_calculated_;
};
@ -1031,7 +1046,8 @@ class NegativeLookaheadChoiceNode: public ChoiceNode {
virtual int EatsAtLeast(int still_to_find, int recursion_depth);
virtual void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in);
int characters_filled_in,
bool not_at_start);
// For a negative lookahead we don't emit the quick check for the
// alternative that is expected to fail. This is because quick check code
// starts by loading enough characters for the alternative that takes fewest
@ -1054,7 +1070,8 @@ class LoopChoiceNode: public ChoiceNode {
virtual int EatsAtLeast(int still_to_find, int recursion_depth);
virtual void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in);
int characters_filled_in,
bool not_at_start);
virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }
RegExpNode* loop_node() { return loop_node_; }
RegExpNode* continue_node() { return continue_node_; }
@ -1088,6 +1105,12 @@ class LoopChoiceNode: public ChoiceNode {
// where baz has been matched.
class Trace {
public:
// A value for a property that is either known to be true, know to be false,
// or not known.
enum TriBool {
UNKNOWN = -1, FALSE = 0, TRUE = 1
};
class DeferredAction {
public:
DeferredAction(ActionNode::Type type, int reg)
@ -1150,7 +1173,9 @@ class Trace {
stop_node_(NULL),
loop_label_(NULL),
characters_preloaded_(0),
bound_checked_up_to_(0) { }
bound_checked_up_to_(0),
at_start_(UNKNOWN) { }
// End the trace. This involves flushing the deferred actions in the trace
// and pushing a backtrack location onto the backtrack stack. Once this is
// done we can start a new trace or go to one that has already been
@ -1174,8 +1199,11 @@ class Trace {
cp_offset_ == 0 &&
characters_preloaded_ == 0 &&
bound_checked_up_to_ == 0 &&
quick_check_performed_.characters() == 0;
quick_check_performed_.characters() == 0 &&
at_start_ == UNKNOWN;
}
TriBool at_start() { return at_start_; }
void set_at_start(bool at_start) { at_start_ = at_start ? TRUE : FALSE; }
Label* backtrack() { return backtrack_; }
Label* loop_label() { return loop_label_; }
RegExpNode* stop_node() { return stop_node_; }
@ -1223,6 +1251,7 @@ class Trace {
int characters_preloaded_;
int bound_checked_up_to_;
QuickCheckDetails quick_check_performed_;
TriBool at_start_;
};
@ -1305,10 +1334,12 @@ struct RegExpCompileData {
: tree(NULL),
node(NULL),
simple(true),
contains_anchor(false),
capture_count(0) { }
RegExpTree* tree;
RegExpNode* node;
bool simple;
bool contains_anchor;
Handle<String> error;
int capture_count;
};

View File

@ -532,7 +532,8 @@ class RegExpParser {
// Reports whether the pattern might be used as a literal search string.
// Only use if the result of the parse is a single atom node.
bool simple();
bool contains_anchor() { return contains_anchor_; }
void set_contains_anchor() { contains_anchor_ = true; }
int captures_started() { return captures_ == NULL ? 0 : captures_->length(); }
int position() { return next_pos_ - 1; }
bool failed() { return failed_; }
@ -555,6 +556,7 @@ class RegExpParser {
FlatStringReader* in_;
Handle<String>* error_;
bool simple_;
bool contains_anchor_;
ZoneList<RegExpCapture*>* captures_;
bool is_scanned_for_captures_;
// The capture count is only valid after we have scanned for captures.
@ -3486,6 +3488,7 @@ RegExpParser::RegExpParser(FlatStringReader* in,
in_(in),
error_(error),
simple_(true),
contains_anchor_(false),
captures_(NULL),
is_scanned_for_captures_(false),
capture_count_(0),
@ -3603,10 +3606,14 @@ RegExpTree* RegExpParser::ParseDisjunction() {
ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
case '^': {
Advance();
RegExpAssertion::Type type =
multiline_ ? RegExpAssertion::START_OF_LINE :
RegExpAssertion::START_OF_INPUT;
builder.AddAssertion(new RegExpAssertion(type));
if (multiline_) {
builder.AddAssertion(
new RegExpAssertion(RegExpAssertion::START_OF_LINE));
} else {
builder.AddAssertion(
new RegExpAssertion(RegExpAssertion::START_OF_INPUT));
set_contains_anchor();
}
continue;
}
case '$': {
@ -4312,6 +4319,7 @@ bool ParseRegExp(FlatStringReader* input,
result->tree = tree;
int capture_count = parser.captures_started();
result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
result->contains_anchor = parser.contains_anchor();
result->capture_count = capture_count;
}
return !parser.failed();

View File

@ -636,7 +636,7 @@ Handle<Object> RegExpMacroAssemblerIA32::GetCode(Handle<String> source) {
__ push(esi);
__ push(edi);
__ push(ebx); // Callee-save on MacOS.
__ push(Immediate(0)); // Make room for input start minus one
__ push(Immediate(0)); // Make room for "input start - 1" constant.
// Check if we have space on the stack for registers.
Label retry_stack_check;