Regexp: Remove nodes from the regexp that cannot match because
they contain non-ASCII characters and the input string is ASCII. Remove unused Clone() method. Review URL: https://chromiumcodereview.appspot.com/10174017 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@11445 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
parent
4b920a20ab
commit
d511b69e86
161
src/jsregexp.cc
161
src/jsregexp.cc
@ -2426,15 +2426,9 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
QuickCheckDetails::Position* pos =
|
||||
details->positions(characters_filled_in);
|
||||
uc16 c = quarks[i];
|
||||
if (c > char_mask) {
|
||||
// If we expect a non-ASCII character from an ASCII string,
|
||||
// there is no way we can match. Not even case independent
|
||||
// matching can turn an ASCII character into non-ASCII or
|
||||
// vice versa.
|
||||
details->set_cannot_match();
|
||||
pos->determines_perfectly = false;
|
||||
return;
|
||||
}
|
||||
// We should already have filtered out nodes that have non-ASCII
|
||||
// characters if we are matching against an ASCII string.
|
||||
ASSERT(c <= char_mask);
|
||||
if (compiler->ignore_case()) {
|
||||
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
|
||||
int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
|
||||
@ -2496,11 +2490,9 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
int first_range = 0;
|
||||
while (ranges->at(first_range).from() > char_mask) {
|
||||
first_range++;
|
||||
if (first_range == ranges->length()) {
|
||||
details->set_cannot_match();
|
||||
pos->determines_perfectly = false;
|
||||
return;
|
||||
}
|
||||
// We should already have filtered out nodes that cannot match
|
||||
// so the first range should be a valid range.
|
||||
ASSERT(first_range != ranges->length());
|
||||
}
|
||||
CharacterRange range = ranges->at(first_range);
|
||||
uc16 from = range.from();
|
||||
@ -2629,6 +2621,144 @@ class VisitMarker {
|
||||
};
|
||||
|
||||
|
||||
// Generic FilterASCII for nodes that have exactly one successor.  The node
// itself imposes no ASCII constraint, so the outcome is decided entirely by
// filtering the successor (see FilterSuccessor).
RegExpNode* SeqRegExpNode::FilterASCII(int depth) {
  // Reuse a result that has already been recorded for this node.
  if (info()->replacement_calculated) {
    return replacement();
  }
  // Recursion budget exhausted: keep the node as it is.
  if (depth < 0) {
    return this;
  }
  ASSERT(!info()->visited);
  VisitMarker visit_scope(info());
  const int successor_depth = depth - 1;
  return FilterSuccessor(successor_depth);
}
|
||||
|
||||
|
||||
// Filters the on_success_ successor and records the outcome for this node:
// NULL if the successor was filtered away, otherwise this node with
// on_success_ rewired to whatever the successor's filter returned.
RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
  RegExpNode* const filtered = on_success_->FilterASCII(depth - 1);
  if (filtered != NULL) {
    on_success_ = filtered;
    return set_replacement(this);
  }
  // Nothing can follow this node, so this node cannot match either.
  return set_replacement(NULL);
}
|
||||
|
||||
|
||||
// FilterASCII for text nodes.  Records and returns NULL as soon as one text
// element is found that cannot match any ASCII character; otherwise the
// result is decided by filtering the successor.
RegExpNode* TextNode::FilterASCII(int depth) {
  if (info()->replacement_calculated) return replacement();
  if (depth < 0) return this;
  ASSERT(!info()->visited);
  VisitMarker visit_scope(info());
  const int count = elms_->length();
  for (int index = 0; index < count; index++) {
    TextElement element = elms_->at(index);
    if (element.type != TextElement::ATOM) {
      ASSERT(element.type == TextElement::CHAR_CLASS);
      RegExpCharacterClass* char_class = element.data.u_char_class;
      ZoneList<CharacterRange>* ranges = char_class->ranges();
      if (!CharacterRange::IsCanonical(ranges)) {
        CharacterRange::Canonicalize(ranges);
      }
      // The ranges are now in order, so inspecting the first one is enough.
      const bool no_ranges = (ranges->length() == 0);
      bool can_match_ascii;
      if (char_class->is_negated()) {
        // A negated class is hopeless only when its first range starts at 0
        // and reaches at least the last ASCII code.
        can_match_ascii = no_ranges ||
                          ranges->at(0).from() != 0 ||
                          ranges->at(0).to() < String::kMaxAsciiCharCode;
      } else {
        // A plain class needs a first range that starts inside ASCII.
        can_match_ascii =
            !no_ranges &&
            ranges->at(0).from() <= String::kMaxAsciiCharCode;
      }
      if (!can_match_ascii) return set_replacement(NULL);
      continue;
    }
    // Atom: every code unit has to be ASCII.  Case independence needs no
    // special handling here because of the rule that it cannot make a
    // non-ASCII character match an ASCII character.
    Vector<const uc16> code_units = element.data.u_atom->data();
    for (int k = 0; k < code_units.length(); k++) {
      if (code_units[k] > String::kMaxAsciiCharCode) {
        return set_replacement(NULL);
      }
    }
  }
  return FilterSuccessor(depth - 1);
}
|
||||
|
||||
|
||||
// FilterASCII for loop nodes.  If the continuation after the loop is
// filtered away the whole loop is recorded as unmatchable (NULL).  Otherwise
// the alternatives are filtered by the generic ChoiceNode implementation.
RegExpNode* LoopChoiceNode::FilterASCII(int depth) {
  if (info()->replacement_calculated) return replacement();
  if (depth < 0) return this;
  if (info()->visited) return this;
  {
    // Confine the visit marker to this scope.  ChoiceNode::FilterASCII below
    // returns 'this' immediately when info()->visited is set, so a marker
    // that is still live at the delegation point would keep the loop's
    // alternatives from ever being filtered and leave no replacement
    // recorded for this node.
    // NOTE(review): relies on VisitMarker holding info()->visited set for
    // exactly its own lifetime; its definition is outside this hunk - confirm.
    VisitMarker marker(info());

    RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1);
    // If we can't continue after the loop then there is no sense in doing the
    // loop.
    if (continue_replacement == NULL) return set_replacement(NULL);
  }

  return ChoiceNode::FilterASCII(depth - 1);
}
|
||||
|
||||
|
||||
// FilterASCII for choice nodes.  Each alternative is filtered in turn and
// its node slot is overwritten with the result (possibly NULL).  With fewer
// than two survivors the recorded replacement is the sole survivor (or NULL);
// otherwise the node replaces itself, with the alternatives list rebuilt if
// some alternatives were dropped.
RegExpNode* ChoiceNode::FilterASCII(int depth) {
  if (info()->replacement_calculated) return replacement();
  if (depth < 0) return this;
  if (info()->visited) return this;
  VisitMarker visit_scope(info());
  const int total = alternatives_->length();
  int kept = 0;
  RegExpNode* last_kept = NULL;
  for (int i = 0; i < total; i++) {
    RegExpNode* filtered =
        alternatives_->at(i).node()->FilterASCII(depth - 1);
    ASSERT(filtered != this);  // No missing EMPTY_MATCH_CHECK.
    alternatives_->at(i).set_node(filtered);
    if (filtered == NULL) continue;
    last_kept = filtered;
    kept++;
  }
  if (kept < 2) return set_replacement(last_kept);

  set_replacement(this);
  if (kept != total) {
    // Only some of the nodes survived the filtering, so build a list that
    // contains just the alternatives whose node is still non-NULL.
    ZoneList<GuardedAlternative>* compacted =
        new ZoneList<GuardedAlternative>(kept);
    for (int i = 0; i < total; i++) {
      GuardedAlternative candidate = alternatives_->at(i);
      if (candidate.node() == NULL) continue;
      compacted->Add(candidate);
    }
    alternatives_ = compacted;
  }
  return this;
}
|
||||
|
||||
|
||||
// FilterASCII for negative lookaheads.  Alternative 0 is the negative
// lookahead itself, alternative 1 is what comes afterwards.
RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {
  if (info()->replacement_calculated) return replacement();
  if (depth < 0) return this;
  if (info()->visited) return this;
  VisitMarker visit_scope(info());

  // Filter the continuation first: if it is gone, the whole node is gone.
  RegExpNode* continuation =
      alternatives_->at(1).node()->FilterASCII(depth - 1);
  if (continuation == NULL) return set_replacement(NULL);
  alternatives_->at(1).set_node(continuation);

  RegExpNode* lookahead = alternatives_->at(0).node()->FilterASCII(depth - 1);
  if (lookahead != NULL) {
    alternatives_->at(0).set_node(lookahead);
    return set_replacement(this);
  }
  // A negative lookahead that is always going to fail does not need to be
  // checked, so the continuation stands in for the whole node.
  return set_replacement(continuation);
}
|
||||
|
||||
|
||||
void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
RegExpCompiler* compiler,
|
||||
int characters_filled_in,
|
||||
@ -5690,6 +5820,9 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
|
||||
node = loop_node;
|
||||
}
|
||||
}
|
||||
if (is_ascii) node = node->FilterASCII(RegExpCompiler::kMaxRecursion);
|
||||
|
||||
if (node == NULL) node = new EndNode(EndNode::BACKTRACK);
|
||||
data->node = node;
|
||||
Analysis analysis(ignore_case, is_ascii);
|
||||
analysis.EnsureAnalyzed(node);
|
||||
|
@ -225,6 +225,8 @@ enum ElementInSetsRelation {
|
||||
};
|
||||
|
||||
|
||||
// Represents code units in the range from from_ to to_, both ends are
|
||||
// inclusive.
|
||||
class CharacterRange {
|
||||
public:
|
||||
CharacterRange() : from_(0), to_(0) { }
|
||||
@ -414,7 +416,8 @@ struct NodeInfo {
|
||||
follows_newline_interest(false),
|
||||
follows_start_interest(false),
|
||||
at_end(false),
|
||||
visited(false) { }
|
||||
visited(false),
|
||||
replacement_calculated(false) { }
|
||||
|
||||
// Returns true if the interests and assumptions of this node
|
||||
// matches the given one.
|
||||
@ -464,6 +467,7 @@ struct NodeInfo {
|
||||
|
||||
bool at_end: 1;
|
||||
bool visited: 1;
|
||||
bool replacement_calculated: 1;
|
||||
};
|
||||
|
||||
|
||||
@ -519,9 +523,12 @@ class QuickCheckDetails {
|
||||
};
|
||||
|
||||
|
||||
extern int kUninitializedRegExpNodePlaceHolder;
|
||||
|
||||
|
||||
class RegExpNode: public ZoneObject {
|
||||
public:
|
||||
RegExpNode() : trace_count_(0) {
|
||||
RegExpNode() : replacement_(NULL), trace_count_(0) {
|
||||
bm_info_[0] = bm_info_[1] = NULL;
|
||||
}
|
||||
virtual ~RegExpNode();
|
||||
@ -572,6 +579,22 @@ class RegExpNode: public ZoneObject {
|
||||
int offset, BoyerMooreLookahead* bm, bool not_at_start) {
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
// If we know that the input is ASCII then there are some nodes that can
|
||||
// never match. This method returns a node that can be substituted for
|
||||
// itself, or NULL if the node can never match.
|
||||
virtual RegExpNode* FilterASCII(int depth) { return this; }
|
||||
// Helper for FilterASCII.
|
||||
// Returns the node stored in replacement_, which may be NULL.  Asserts that
// info()->replacement_calculated is set, i.e. that set_replacement() has
// been called on this node.
RegExpNode* replacement() {
|
||||
ASSERT(info()->replacement_calculated);
|
||||
return replacement_;
|
||||
}
|
||||
// Stores |replacement| (which may be NULL) in replacement_ and sets
// info()->replacement_calculated so that replacement() may be used
// afterwards.  Returns the argument unchanged, which lets callers write
// "return set_replacement(x);".
RegExpNode* set_replacement(RegExpNode* replacement) {
|
||||
info()->replacement_calculated = true;
|
||||
replacement_ = replacement;
|
||||
return replacement; // For convenience.
|
||||
}
|
||||
|
||||
// We want to avoid recalculating the lookahead info, so we store it on the
|
||||
// node. Only info that is for this node is stored. We can tell that the
|
||||
// info is for this node when offset == 0, so the information is calculated
|
||||
@ -596,14 +619,10 @@ class RegExpNode: public ZoneObject {
|
||||
|
||||
protected:
|
||||
enum LimitResult { DONE, CONTINUE };
|
||||
RegExpNode* replacement_;
|
||||
|
||||
LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
|
||||
|
||||
// Returns a clone of this node initialized using the copy constructor
|
||||
// of its concrete class. Note that the node may have to be pre-
|
||||
// processed before it is on a usable state.
|
||||
virtual RegExpNode* Clone() = 0;
|
||||
|
||||
void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
|
||||
bm_info_[not_at_start ? 1 : 0] = bm;
|
||||
}
|
||||
@ -655,11 +674,16 @@ class SeqRegExpNode: public RegExpNode {
|
||||
: on_success_(on_success) { }
|
||||
RegExpNode* on_success() { return on_success_; }
|
||||
void set_on_success(RegExpNode* node) { on_success_ = node; }
|
||||
virtual RegExpNode* FilterASCII(int depth);
|
||||
virtual void FillInBMInfo(
|
||||
int offset, BoyerMooreLookahead* bm, bool not_at_start) {
|
||||
on_success_->FillInBMInfo(offset, bm, not_at_start);
|
||||
if (offset == 0) set_bm_info(not_at_start, bm);
|
||||
}
|
||||
|
||||
protected:
|
||||
RegExpNode* FilterSuccessor(int depth);
|
||||
|
||||
private:
|
||||
RegExpNode* on_success_;
|
||||
};
|
||||
@ -711,7 +735,6 @@ class ActionNode: public SeqRegExpNode {
|
||||
Type type() { return type_; }
|
||||
// TODO(erikcorry): We should allow some action nodes in greedy loops.
|
||||
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
|
||||
virtual ActionNode* Clone() { return new ActionNode(*this); }
|
||||
|
||||
private:
|
||||
union {
|
||||
@ -778,12 +801,8 @@ class TextNode: public SeqRegExpNode {
|
||||
RegExpCompiler* compiler);
|
||||
virtual void FillInBMInfo(
|
||||
int offset, BoyerMooreLookahead* bm, bool not_at_start);
|
||||
virtual TextNode* Clone() {
|
||||
TextNode* result = new TextNode(*this);
|
||||
result->CalculateOffsets();
|
||||
return result;
|
||||
}
|
||||
void CalculateOffsets();
|
||||
virtual RegExpNode* FilterASCII(int depth);
|
||||
|
||||
private:
|
||||
enum TextEmitPassType {
|
||||
@ -842,7 +861,6 @@ class AssertionNode: public SeqRegExpNode {
|
||||
bool not_at_start);
|
||||
virtual void FillInBMInfo(
|
||||
int offset, BoyerMooreLookahead* bm, bool not_at_start);
|
||||
virtual AssertionNode* Clone() { return new AssertionNode(*this); }
|
||||
AssertionNodeType type() { return type_; }
|
||||
void set_type(AssertionNodeType type) { type_ = type; }
|
||||
|
||||
@ -881,7 +899,6 @@ class BackReferenceNode: public SeqRegExpNode {
|
||||
}
|
||||
virtual void FillInBMInfo(
|
||||
int offset, BoyerMooreLookahead* bm, bool not_at_start);
|
||||
virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }
|
||||
|
||||
private:
|
||||
int start_reg_;
|
||||
@ -910,7 +927,7 @@ class EndNode: public RegExpNode {
|
||||
// Returning 0 from EatsAtLeast should ensure we never get here.
|
||||
UNREACHABLE();
|
||||
}
|
||||
virtual EndNode* Clone() { return new EndNode(*this); }
|
||||
|
||||
private:
|
||||
Action action_;
|
||||
};
|
||||
@ -997,13 +1014,13 @@ class ChoiceNode: public RegExpNode {
|
||||
bool not_at_start);
|
||||
virtual void FillInBMInfo(
|
||||
int offset, BoyerMooreLookahead* bm, bool not_at_start);
|
||||
virtual ChoiceNode* Clone() { return new ChoiceNode(*this); }
|
||||
|
||||
bool being_calculated() { return being_calculated_; }
|
||||
bool not_at_start() { return not_at_start_; }
|
||||
void set_not_at_start() { not_at_start_ = true; }
|
||||
void set_being_calculated(bool b) { being_calculated_ = b; }
|
||||
virtual bool try_to_emit_quick_check_for_alternative(int i) { return true; }
|
||||
virtual RegExpNode* FilterASCII(int depth);
|
||||
|
||||
protected:
|
||||
int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
|
||||
@ -1056,6 +1073,7 @@ class NegativeLookaheadChoiceNode: public ChoiceNode {
|
||||
// characters, but on a negative lookahead the negative branch did not take
|
||||
// part in that calculation (EatsAtLeast) so the assumptions don't hold.
|
||||
virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; }
|
||||
virtual RegExpNode* FilterASCII(int depth);
|
||||
};
|
||||
|
||||
|
||||
@ -1078,11 +1096,11 @@ class LoopChoiceNode: public ChoiceNode {
|
||||
bool not_at_start);
|
||||
virtual void FillInBMInfo(
|
||||
int offset, BoyerMooreLookahead* bm, bool not_at_start);
|
||||
virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }
|
||||
RegExpNode* loop_node() { return loop_node_; }
|
||||
RegExpNode* continue_node() { return continue_node_; }
|
||||
bool body_can_be_zero_length() { return body_can_be_zero_length_; }
|
||||
virtual void Accept(NodeVisitor* visitor);
|
||||
virtual RegExpNode* FilterASCII(int depth);
|
||||
|
||||
private:
|
||||
// AddAlternative is made private for loop nodes because alternatives
|
||||
|
@ -156,3 +156,34 @@ assertEquals("foo baz", a);
|
||||
|
||||
a = "foo bar baz".replace(/^|bar/g, "*");
|
||||
assertEquals("*foo * baz", a);
|
||||
|
||||
// We test FilterASCII using regexps that will backtrack forever. Since
|
||||
// a regexp with a non-ASCII character in it can never match an ASCII
|
||||
// string we can test that the relevant node is removed by verifying that
|
||||
// there is no hang.
|
||||
// Prints the regexp, then runs it against a fixed ASCII-only subject string.
// The result of the match is discarded; the call only has to return.
function NoHang(re) {
|
||||
print(re);
|
||||
"This is an ASCII string that could take forever".match(re);
|
||||
}
|
||||
|
||||
|
||||
NoHang(/(((.*)*)*x)å/); // Continuation after loop is filtered, so is loop.
|
||||
NoHang(/(((.*)*)*å)foo/); // Body of loop filtered.
|
||||
NoHang(/å(((.*)*)*x)/); // Everything after a filtered character is filtered.
|
||||
NoHang(/(((.*)*)*x)å/); // Everything before a filtered character is filtered.
|
||||
NoHang(/[æøå](((.*)*)*x)/); // Everything after a filtered class is filtered.
|
||||
NoHang(/(((.*)*)*x)[æøå]/); // Everything before a filtered class is filtered.
|
||||
NoHang(/[^\x00-\x7f](((.*)*)*x)/); // After negated class.
|
||||
NoHang(/(((.*)*)*x)[^\x00-\x7f]/); // Before negated class.
|
||||
NoHang(/(?!(((.*)*)*x)å)foo/); // Negative lookahead is filtered.
|
||||
NoHang(/(?!(((.*)*)*x))å/); // Continuation branch of negative lookahead.
|
||||
NoHang(/(?=(((.*)*)*x)å)foo/); // Positive lookahead is filtered.
|
||||
NoHang(/(?=(((.*)*)*x))å/); // Continuation branch of positive lookahead.
|
||||
NoHang(/(?=å)(((.*)*)*x)/); // Positive lookahead also prunes continuation.
|
||||
NoHang(/(æ|ø|å)(((.*)*)*x)/); // All branches of alternation are filtered.
|
||||
NoHang(/(a|b|(((.*)*)*x))å/); // 1 out of 3 branches pruned.
|
||||
NoHang(/(a|(((.*)*)*x)ø|(((.*)*)*x)å)/); // 2 out of 3 branches pruned.
|
||||
|
||||
var s = "Don't prune based on a repetition of length 0";
|
||||
assertEquals(null, s.match(/å{1,1}prune/));
|
||||
assertEquals("prune", (s.match(/å{0,0}prune/)[0]));
|
||||
|
Loading…
Reference in New Issue
Block a user