Regexp: Remove nodes from the regexp that cannot match because

they contain non-ASCII characters and the input string is ASCII.
Remove unused Clone() method.
Review URL: https://chromiumcodereview.appspot.com/10174017

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@11445 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
erik.corry@gmail.com 2012-04-26 09:11:19 +00:00
parent 4b920a20ab
commit d511b69e86
3 changed files with 214 additions and 32 deletions

View File

@ -2426,15 +2426,9 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
QuickCheckDetails::Position* pos =
details->positions(characters_filled_in);
uc16 c = quarks[i];
if (c > char_mask) {
// If we expect a non-ASCII character from an ASCII string,
// there is no way we can match. Not even case independent
// matching can turn an ASCII character into non-ASCII or
// vice versa.
details->set_cannot_match();
pos->determines_perfectly = false;
return;
}
// We should already have filtered out nodes that have non-ASCII
// characters if we are matching against an ASCII string.
ASSERT(c <= char_mask);
if (compiler->ignore_case()) {
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
@ -2496,11 +2490,9 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
int first_range = 0;
while (ranges->at(first_range).from() > char_mask) {
first_range++;
if (first_range == ranges->length()) {
details->set_cannot_match();
pos->determines_perfectly = false;
return;
}
// We should already have filtered out nodes that cannot match
// so the first range should be a valid range.
ASSERT(first_range != ranges->length());
}
CharacterRange range = ranges->at(first_range);
uc16 from = range.from();
@ -2629,6 +2621,144 @@ class VisitMarker {
};
RegExpNode* SeqRegExpNode::FilterASCII(int depth) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
ASSERT(!info()->visited);
VisitMarker marker(info());
return FilterSuccessor(depth - 1);
}
RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
RegExpNode* next = on_success_->FilterASCII(depth - 1);
if (next == NULL) return set_replacement(NULL);
on_success_ = next;
return set_replacement(this);
}
RegExpNode* TextNode::FilterASCII(int depth) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
ASSERT(!info()->visited);
VisitMarker marker(info());
int element_count = elms_->length();
for (int i = 0; i < element_count; i++) {
TextElement elm = elms_->at(i);
if (elm.type == TextElement::ATOM) {
Vector<const uc16> quarks = elm.data.u_atom->data();
for (int j = 0; j < quarks.length(); j++) {
// We don't need special handling for case independence
// because of the rule that case independence cannot make
// a non-ASCII character match an ASCII character.
if (quarks[j] > String::kMaxAsciiCharCode) {
return set_replacement(NULL);
}
}
} else {
ASSERT(elm.type == TextElement::CHAR_CLASS);
RegExpCharacterClass* cc = elm.data.u_char_class;
ZoneList<CharacterRange>* ranges = cc->ranges();
if (!CharacterRange::IsCanonical(ranges)) {
CharacterRange::Canonicalize(ranges);
}
// Now they are in order so we only need to look at the first.
int range_count = ranges->length();
if (cc->is_negated()) {
if (range_count != 0 &&
ranges->at(0).from() == 0 &&
ranges->at(0).to() >= String::kMaxAsciiCharCode) {
return set_replacement(NULL);
}
} else {
if (range_count == 0 ||
ranges->at(0).from() > String::kMaxAsciiCharCode) {
return set_replacement(NULL);
}
}
}
}
return FilterSuccessor(depth - 1);
}
RegExpNode* LoopChoiceNode::FilterASCII(int depth) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
if (info()->visited) return this;
VisitMarker marker(info());
RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1);
// If we can't continue after the loop then there is no sense in doing the
// loop.
if (continue_replacement == NULL) return set_replacement(NULL);
return ChoiceNode::FilterASCII(depth - 1);
}
RegExpNode* ChoiceNode::FilterASCII(int depth) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
if (info()->visited) return this;
VisitMarker marker(info());
int choice_count = alternatives_->length();
int surviving = 0;
RegExpNode* survivor = NULL;
for (int i = 0; i < choice_count; i++) {
GuardedAlternative alternative = alternatives_->at(i);
RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1);
ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
alternatives_->at(i).set_node(replacement);
if (replacement != NULL) {
surviving++;
survivor = replacement;
}
}
if (surviving < 2) return set_replacement(survivor);
set_replacement(this);
if (surviving == choice_count) {
return this;
}
// Only some of the nodes survived the filtering. We need to rebuild the
// alternatives list.
ZoneList<GuardedAlternative>* new_alternatives =
new ZoneList<GuardedAlternative>(surviving);
for (int i = 0; i < choice_count; i++) {
GuardedAlternative alternative = alternatives_->at(i);
if (alternative.node() != NULL) {
new_alternatives->Add(alternative);
}
}
alternatives_ = new_alternatives;
return this;
}
RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
if (info()->visited) return this;
VisitMarker marker(info());
// Alternative 0 is the negative lookahead, alternative 1 is what comes
// afterwards.
RegExpNode* node = alternatives_->at(1).node();
RegExpNode* replacement = node->FilterASCII(depth - 1);
if (replacement == NULL) return set_replacement(NULL);
alternatives_->at(1).set_node(replacement);
RegExpNode* neg_node = alternatives_->at(0).node();
RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1);
// If the negative lookahead is always going to fail then
// we don't need to check it.
if (neg_replacement == NULL) return set_replacement(replacement);
alternatives_->at(0).set_node(neg_replacement);
return set_replacement(this);
}
void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in,
@ -5690,6 +5820,9 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
node = loop_node;
}
}
if (is_ascii) node = node->FilterASCII(RegExpCompiler::kMaxRecursion);
if (node == NULL) node = new EndNode(EndNode::BACKTRACK);
data->node = node;
Analysis analysis(ignore_case, is_ascii);
analysis.EnsureAnalyzed(node);

View File

@ -225,6 +225,8 @@ enum ElementInSetsRelation {
};
// Represents code units in the range from from_ to to_, both ends are
// inclusive.
class CharacterRange {
public:
CharacterRange() : from_(0), to_(0) { }
@ -414,7 +416,8 @@ struct NodeInfo {
follows_newline_interest(false),
follows_start_interest(false),
at_end(false),
visited(false) { }
visited(false),
replacement_calculated(false) { }
// Returns true if the interests and assumptions of this node
// matches the given one.
@ -464,6 +467,7 @@ struct NodeInfo {
bool at_end: 1;
bool visited: 1;
bool replacement_calculated: 1;
};
@ -519,9 +523,12 @@ class QuickCheckDetails {
};
extern int kUninitializedRegExpNodePlaceHolder;
class RegExpNode: public ZoneObject {
public:
RegExpNode() : trace_count_(0) {
RegExpNode() : replacement_(NULL), trace_count_(0) {
bm_info_[0] = bm_info_[1] = NULL;
}
virtual ~RegExpNode();
@ -572,6 +579,22 @@ class RegExpNode: public ZoneObject {
int offset, BoyerMooreLookahead* bm, bool not_at_start) {
UNREACHABLE();
}
// If we know that the input is ASCII then there are some nodes that can
// never match. This method returns a node that can be substituted for
// itself, or NULL if the node can never match.
virtual RegExpNode* FilterASCII(int depth) { return this; }
// Helper for FilterASCII.
RegExpNode* replacement() {
ASSERT(info()->replacement_calculated);
return replacement_;
}
RegExpNode* set_replacement(RegExpNode* replacement) {
info()->replacement_calculated = true;
replacement_ = replacement;
return replacement; // For convenience.
}
// We want to avoid recalculating the lookahead info, so we store it on the
// node. Only info that is for this node is stored. We can tell that the
// info is for this node when offset == 0, so the information is calculated
@ -596,14 +619,10 @@ class RegExpNode: public ZoneObject {
protected:
enum LimitResult { DONE, CONTINUE };
RegExpNode* replacement_;
LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
// Returns a clone of this node initialized using the copy constructor
// of its concrete class. Note that the node may have to be pre-
// processed before it is on a usable state.
virtual RegExpNode* Clone() = 0;
void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
bm_info_[not_at_start ? 1 : 0] = bm;
}
@ -655,11 +674,16 @@ class SeqRegExpNode: public RegExpNode {
: on_success_(on_success) { }
RegExpNode* on_success() { return on_success_; }
void set_on_success(RegExpNode* node) { on_success_ = node; }
virtual RegExpNode* FilterASCII(int depth);
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start) {
on_success_->FillInBMInfo(offset, bm, not_at_start);
if (offset == 0) set_bm_info(not_at_start, bm);
}
protected:
RegExpNode* FilterSuccessor(int depth);
private:
RegExpNode* on_success_;
};
@ -711,7 +735,6 @@ class ActionNode: public SeqRegExpNode {
Type type() { return type_; }
// TODO(erikcorry): We should allow some action nodes in greedy loops.
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
virtual ActionNode* Clone() { return new ActionNode(*this); }
private:
union {
@ -778,12 +801,8 @@ class TextNode: public SeqRegExpNode {
RegExpCompiler* compiler);
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start);
virtual TextNode* Clone() {
TextNode* result = new TextNode(*this);
result->CalculateOffsets();
return result;
}
void CalculateOffsets();
virtual RegExpNode* FilterASCII(int depth);
private:
enum TextEmitPassType {
@ -842,7 +861,6 @@ class AssertionNode: public SeqRegExpNode {
bool not_at_start);
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start);
virtual AssertionNode* Clone() { return new AssertionNode(*this); }
AssertionNodeType type() { return type_; }
void set_type(AssertionNodeType type) { type_ = type; }
@ -881,7 +899,6 @@ class BackReferenceNode: public SeqRegExpNode {
}
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start);
virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }
private:
int start_reg_;
@ -910,7 +927,7 @@ class EndNode: public RegExpNode {
// Returning 0 from EatsAtLeast should ensure we never get here.
UNREACHABLE();
}
virtual EndNode* Clone() { return new EndNode(*this); }
private:
Action action_;
};
@ -997,13 +1014,13 @@ class ChoiceNode: public RegExpNode {
bool not_at_start);
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start);
virtual ChoiceNode* Clone() { return new ChoiceNode(*this); }
bool being_calculated() { return being_calculated_; }
bool not_at_start() { return not_at_start_; }
void set_not_at_start() { not_at_start_ = true; }
void set_being_calculated(bool b) { being_calculated_ = b; }
virtual bool try_to_emit_quick_check_for_alternative(int i) { return true; }
virtual RegExpNode* FilterASCII(int depth);
protected:
int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
@ -1056,6 +1073,7 @@ class NegativeLookaheadChoiceNode: public ChoiceNode {
// characters, but on a negative lookahead the negative branch did not take
// part in that calculation (EatsAtLeast) so the assumptions don't hold.
virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; }
virtual RegExpNode* FilterASCII(int depth);
};
@ -1078,11 +1096,11 @@ class LoopChoiceNode: public ChoiceNode {
bool not_at_start);
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start);
virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }
RegExpNode* loop_node() { return loop_node_; }
RegExpNode* continue_node() { return continue_node_; }
bool body_can_be_zero_length() { return body_can_be_zero_length_; }
virtual void Accept(NodeVisitor* visitor);
virtual RegExpNode* FilterASCII(int depth);
private:
// AddAlternative is made private for loop nodes because alternatives

View File

@ -156,3 +156,34 @@ assertEquals("foo baz", a);
a = "foo bar baz".replace(/^|bar/g, "*");
assertEquals("*foo * baz", a);
// We test FilterASCII using regexps that will backtrack forever. Since
// a regexp with a non-ASCII character in it can never match an ASCII
// string we can test that the relevant node is removed by verifying that
// there is no hang.
function NoHang(re) {
print(re);
"This is an ASCII string that could take forever".match(re);
}
NoHang(/(((.*)*)*x)å/); // Continuation after loop is filtered, so is loop.
NoHang(/(((.*)*)*å)foo/); // Body of loop filtered.
NoHang(/å(((.*)*)*x)/); // Everything after a filtered character is filtered.
NoHang(/(((.*)*)*x)å/); // Everything before a filtered character is filtered.
NoHang(/[æøå](((.*)*)*x)/); // Everything after a filtered class is filtered.
NoHang(/(((.*)*)*x)[æøå]/); // Everything before a filtered class is filtered.
NoHang(/[^\x00-\x7f](((.*)*)*x)/); // After negated class.
NoHang(/(((.*)*)*x)[^\x00-\x7f]/); // Before negated class.
NoHang(/(?!(((.*)*)*x)å)foo/); // Negative lookahead is filtered.
NoHang(/(?!(((.*)*)*x))å/); // Continuation branch of negative lookahead.
NoHang(/(?=(((.*)*)*x)å)foo/); // Positive lookahead is filtered.
NoHang(/(?=(((.*)*)*x))å/); // Continuation branch of positive lookahead.
NoHang(/(?=å)(((.*)*)*x)/); // Positive lookahead also prunes continuation.
NoHang(/(æ|ø|å)(((.*)*)*x)/); // All branches of alternation are filtered.
NoHang(/(a|b|(((.*)*)*x))å/); // 1 out of 3 branches pruned.
NoHang(/(a|(((.*)*)*x)ø|(((.*)*)*x)å)/); // 2 out of 3 branches pruned.
var s = "Don't prune based on a repetition of length 0";
assertEquals(null, s.match(/å{1,1}prune/));
assertEquals("prune", (s.match(/å{0,0}prune/)[0]));