Regexp: Remove nodes from the regexp that cannot match because

they contain non-ASCII characters and the input string is ASCII. Remove unused Clone() method. Review URL: https://chromiumcodereview.appspot.com/10174017 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@11445 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
2012-04-26 09:11:19 +00:00 · 2012-04-26 09:11:19 +00:00 · d511b69e86
commit d511b69e86
parent 4b920a20ab
3 changed files with 214 additions and 32 deletions
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@ -2426,15 +2426,9 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
        QuickCheckDetails::Position* pos =
            details->positions(characters_filled_in);
        uc16 c = quarks[i];
-        if (c > char_mask) {
-          // If we expect a non-ASCII character from an ASCII string,
-          // there is no way we can match. Not even case independent
-          // matching can turn an ASCII character into non-ASCII or
-          // vice versa.
-          details->set_cannot_match();
-          pos->determines_perfectly = false;
-          return;
-        }
+        // We should already have filtered out nodes that have non-ASCII
+        // characters if we are matching against an ASCII string.
+        ASSERT(c <= char_mask);
        if (compiler->ignore_case()) {
          unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
          int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
@ -2496,11 +2490,9 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
        int first_range = 0;
        while (ranges->at(first_range).from() > char_mask) {
          first_range++;
-          if (first_range == ranges->length()) {
-            details->set_cannot_match();
-            pos->determines_perfectly = false;
-            return;
-          }
+          // We should already have filtered out nodes that cannot match
+          // so the first range should be a valid range.
+          ASSERT(first_range != ranges->length());
        }
        CharacterRange range = ranges->at(first_range);
        uc16 from = range.from();
@ -2629,6 +2621,144 @@ class VisitMarker {
 };


+RegExpNode* SeqRegExpNode::FilterASCII(int depth) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  ASSERT(!info()->visited);
+  VisitMarker marker(info());
+  return FilterSuccessor(depth - 1);
+}
+
+
+RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
+  RegExpNode* next = on_success_->FilterASCII(depth - 1);
+  if (next == NULL) return set_replacement(NULL);
+  on_success_ = next;
+  return set_replacement(this);
+}
+
+
+RegExpNode* TextNode::FilterASCII(int depth) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  ASSERT(!info()->visited);
+  VisitMarker marker(info());
+  int element_count = elms_->length();
+  for (int i = 0; i < element_count; i++) {
+    TextElement elm = elms_->at(i);
+    if (elm.type == TextElement::ATOM) {
+      Vector<const uc16> quarks = elm.data.u_atom->data();
+      for (int j = 0; j < quarks.length(); j++) {
+        // We don't need special handling for case independence
+        // because of the rule that case independence cannot make
+        // a non-ASCII character match an ASCII character.
+        if (quarks[j] > String::kMaxAsciiCharCode) {
+          return set_replacement(NULL);
+        }
+      }
+    } else {
+      ASSERT(elm.type == TextElement::CHAR_CLASS);
+      RegExpCharacterClass* cc = elm.data.u_char_class;
+      ZoneList<CharacterRange>* ranges = cc->ranges();
+      if (!CharacterRange::IsCanonical(ranges)) {
+        CharacterRange::Canonicalize(ranges);
+      }
+      // Now they are in order so we only need to look at the first.
+      int range_count = ranges->length();
+      if (cc->is_negated()) {
+        if (range_count != 0 &&
+            ranges->at(0).from() == 0 &&
+            ranges->at(0).to() >= String::kMaxAsciiCharCode) {
+          return set_replacement(NULL);
+        }
+      } else {
+        if (range_count == 0 ||
+            ranges->at(0).from() > String::kMaxAsciiCharCode) {
+          return set_replacement(NULL);
+        }
+      }
+    }
+  }
+  return FilterSuccessor(depth - 1);
+}
+
+
+RegExpNode* LoopChoiceNode::FilterASCII(int depth) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  if (info()->visited) return this;
+  VisitMarker marker(info());
+
+  RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1);
+  // If we can't continue after the loop then there is no sense in doing the
+  // loop.
+  if (continue_replacement == NULL) return set_replacement(NULL);
+
+  return ChoiceNode::FilterASCII(depth - 1);
+}
+
+
+RegExpNode* ChoiceNode::FilterASCII(int depth) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  if (info()->visited) return this;
+  VisitMarker marker(info());
+  int choice_count = alternatives_->length();
+  int surviving = 0;
+  RegExpNode* survivor = NULL;
+  for (int i = 0; i < choice_count; i++) {
+    GuardedAlternative alternative = alternatives_->at(i);
+    RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1);
+    ASSERT(replacement != this);  // No missing EMPTY_MATCH_CHECK.
+    alternatives_->at(i).set_node(replacement);
+    if (replacement != NULL) {
+      surviving++;
+      survivor = replacement;
+    }
+  }
+  if (surviving < 2) return set_replacement(survivor);
+
+  set_replacement(this);
+  if (surviving == choice_count) {
+    return this;
+  }
+  // Only some of the nodes survived the filtering.  We need to rebuild the
+  // alternatives list.
+  ZoneList<GuardedAlternative>* new_alternatives =
+      new ZoneList<GuardedAlternative>(surviving);
+  for (int i = 0; i < choice_count; i++) {
+    GuardedAlternative alternative = alternatives_->at(i);
+    if (alternative.node() != NULL) {
+      new_alternatives->Add(alternative);
+    }
+  }
+  alternatives_ = new_alternatives;
+  return this;
+}
+
+
+RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  if (info()->visited) return this;
+  VisitMarker marker(info());
+  // Alternative 0 is the negative lookahead, alternative 1 is what comes
+  // afterwards.
+  RegExpNode* node = alternatives_->at(1).node();
+  RegExpNode* replacement = node->FilterASCII(depth - 1);
+  if (replacement == NULL) return set_replacement(NULL);
+  alternatives_->at(1).set_node(replacement);
+
+  RegExpNode* neg_node = alternatives_->at(0).node();
+  RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1);
+  // If the negative lookahead is always going to fail then
+  // we don't need to check it.
+  if (neg_replacement == NULL) return set_replacement(replacement);
+  alternatives_->at(0).set_node(neg_replacement);
+  return set_replacement(this);
+}
+
+
 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
                                          RegExpCompiler* compiler,
                                          int characters_filled_in,
@ -5690,6 +5820,9 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
      node = loop_node;
    }
  }
+  if (is_ascii) node = node->FilterASCII(RegExpCompiler::kMaxRecursion);
+
+  if (node == NULL) node = new EndNode(EndNode::BACKTRACK);
  data->node = node;
  Analysis analysis(ignore_case, is_ascii);
  analysis.EnsureAnalyzed(node);
--- a/src/jsregexp.h
+++ b/src/jsregexp.h
@ -225,6 +225,8 @@ enum ElementInSetsRelation {
 };


+// Represents code units in the range from from_ to to_, both ends are
+// inclusive.
 class CharacterRange {
 public:
  CharacterRange() : from_(0), to_(0) { }
@ -414,7 +416,8 @@ struct NodeInfo {
        follows_newline_interest(false),
        follows_start_interest(false),
        at_end(false),
-        visited(false) { }
+        visited(false),
+        replacement_calculated(false) { }

  // Returns true if the interests and assumptions of this node
  // matches the given one.
@ -464,6 +467,7 @@ struct NodeInfo {

  bool at_end: 1;
  bool visited: 1;
+  bool replacement_calculated: 1;
 };


@ -519,9 +523,12 @@ class QuickCheckDetails {
 };


+extern int kUninitializedRegExpNodePlaceHolder;
+
+
 class RegExpNode: public ZoneObject {
 public:
-  RegExpNode() : trace_count_(0) {
+  RegExpNode() : replacement_(NULL), trace_count_(0) {
    bm_info_[0] = bm_info_[1] = NULL;
  }
  virtual ~RegExpNode();
@ -572,6 +579,22 @@ class RegExpNode: public ZoneObject {
      int offset, BoyerMooreLookahead* bm, bool not_at_start) {
    UNREACHABLE();
  }
+
+  // If we know that the input is ASCII then there are some nodes that can
+  // never match.  This method returns a node that can be substituted for
+  // itself, or NULL if the node can never match.
+  virtual RegExpNode* FilterASCII(int depth) { return this; }
+  // Helper for FilterASCII.
+  RegExpNode* replacement() {
+    ASSERT(info()->replacement_calculated);
+    return replacement_;
+  }
+  RegExpNode* set_replacement(RegExpNode* replacement) {
+    info()->replacement_calculated = true;
+    replacement_ =  replacement;
+    return replacement;  // For convenience.
+  }
+
  // We want to avoid recalculating the lookahead info, so we store it on the
  // node.  Only info that is for this node is stored.  We can tell that the
  // info is for this node when offset == 0, so the information is calculated
@ -596,14 +619,10 @@ class RegExpNode: public ZoneObject {

 protected:
  enum LimitResult { DONE, CONTINUE };
+  RegExpNode* replacement_;

  LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);

-  // Returns a clone of this node initialized using the copy constructor
-  // of its concrete class.  Note that the node may have to be pre-
-  // processed before it is on a usable state.
-  virtual RegExpNode* Clone() = 0;
-
  void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
    bm_info_[not_at_start ? 1 : 0] = bm;
  }
@ -655,11 +674,16 @@ class SeqRegExpNode: public RegExpNode {
      : on_success_(on_success) { }
  RegExpNode* on_success() { return on_success_; }
  void set_on_success(RegExpNode* node) { on_success_ = node; }
+  virtual RegExpNode* FilterASCII(int depth);
  virtual void FillInBMInfo(
      int offset, BoyerMooreLookahead* bm, bool not_at_start) {
    on_success_->FillInBMInfo(offset, bm, not_at_start);
    if (offset == 0) set_bm_info(not_at_start, bm);
  }
+
+ protected:
+  RegExpNode* FilterSuccessor(int depth);
+
 private:
  RegExpNode* on_success_;
 };
@ -711,7 +735,6 @@ class ActionNode: public SeqRegExpNode {
  Type type() { return type_; }
  // TODO(erikcorry): We should allow some action nodes in greedy loops.
  virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
-  virtual ActionNode* Clone() { return new ActionNode(*this); }

 private:
  union {
@ -778,12 +801,8 @@ class TextNode: public SeqRegExpNode {
      RegExpCompiler* compiler);
  virtual void FillInBMInfo(
      int offset, BoyerMooreLookahead* bm, bool not_at_start);
-  virtual TextNode* Clone() {
-    TextNode* result = new TextNode(*this);
-    result->CalculateOffsets();
-    return result;
-  }
  void CalculateOffsets();
+  virtual RegExpNode* FilterASCII(int depth);

 private:
  enum TextEmitPassType {
@ -842,7 +861,6 @@ class AssertionNode: public SeqRegExpNode {
                                    bool not_at_start);
  virtual void FillInBMInfo(
      int offset, BoyerMooreLookahead* bm, bool not_at_start);
-  virtual AssertionNode* Clone() { return new AssertionNode(*this); }
  AssertionNodeType type() { return type_; }
  void set_type(AssertionNodeType type) { type_ = type; }

@ -881,7 +899,6 @@ class BackReferenceNode: public SeqRegExpNode {
  }
  virtual void FillInBMInfo(
      int offset, BoyerMooreLookahead* bm, bool not_at_start);
-  virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }

 private:
  int start_reg_;
@ -910,7 +927,7 @@ class EndNode: public RegExpNode {
    // Returning 0 from EatsAtLeast should ensure we never get here.
    UNREACHABLE();
  }
-  virtual EndNode* Clone() { return new EndNode(*this); }
+
 private:
  Action action_;
 };
@ -997,13 +1014,13 @@ class ChoiceNode: public RegExpNode {
                                    bool not_at_start);
  virtual void FillInBMInfo(
      int offset, BoyerMooreLookahead* bm, bool not_at_start);
-  virtual ChoiceNode* Clone() { return new ChoiceNode(*this); }

  bool being_calculated() { return being_calculated_; }
  bool not_at_start() { return not_at_start_; }
  void set_not_at_start() { not_at_start_ = true; }
  void set_being_calculated(bool b) { being_calculated_ = b; }
  virtual bool try_to_emit_quick_check_for_alternative(int i) { return true; }
+  virtual RegExpNode* FilterASCII(int depth);

 protected:
  int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
@ -1056,6 +1073,7 @@ class NegativeLookaheadChoiceNode: public ChoiceNode {
  // characters, but on a negative lookahead the negative branch did not take
  // part in that calculation (EatsAtLeast) so the assumptions don't hold.
  virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; }
+  virtual RegExpNode* FilterASCII(int depth);
 };


@ -1078,11 +1096,11 @@ class LoopChoiceNode: public ChoiceNode {
                                    bool not_at_start);
  virtual void FillInBMInfo(
      int offset, BoyerMooreLookahead* bm, bool not_at_start);
-  virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }
  RegExpNode* loop_node() { return loop_node_; }
  RegExpNode* continue_node() { return continue_node_; }
  bool body_can_be_zero_length() { return body_can_be_zero_length_; }
  virtual void Accept(NodeVisitor* visitor);
+  virtual RegExpNode* FilterASCII(int depth);

 private:
  // AddAlternative is made private for loop nodes because alternatives
--- a/test/mjsunit/regexp-capture-3.js
+++ b/test/mjsunit/regexp-capture-3.js
@ -156,3 +156,34 @@ assertEquals("foo  baz", a);

 a = "foo bar baz".replace(/^|bar/g, "*");
 assertEquals("*foo * baz", a);
+
+// We test FilterASCII using regexps that will backtrack forever.  Since
+// a regexp with a non-ASCII character in it can never match an ASCII
+// string we can test that the relevant node is removed by verifying that
+// there is no hang.
+function NoHang(re) {
+  print(re);
+  "This is an ASCII string that could take forever".match(re);
+}
+
+
+NoHang(/(((.*)*)*x)å/);  // Continuation after loop is filtered, so is loop.
+NoHang(/(((.*)*)*å)foo/);  // Body of loop filtered.
+NoHang(/å(((.*)*)*x)/);   // Everything after a filtered character is filtered.
+NoHang(/(((.*)*)*x)å/);   // Everything before a filtered character is filtered.
+NoHang(/[æøå](((.*)*)*x)/);   // Everything after a filtered class is filtered.
+NoHang(/(((.*)*)*x)[æøå]/);   // Everything before a filtered class is filtered.
+NoHang(/[^\x00-\x7f](((.*)*)*x)/);   // After negated class.
+NoHang(/(((.*)*)*x)[^\x00-\x7f]/);   // Before negated class.
+NoHang(/(?!(((.*)*)*x)å)foo/);  // Negative lookahead is filtered.
+NoHang(/(?!(((.*)*)*x))å/);  // Continuation branch of negative lookahead.
+NoHang(/(?=(((.*)*)*x)å)foo/);  // Positive lookahead is filtered.
+NoHang(/(?=(((.*)*)*x))å/);  // Continuation branch of positive lookahead.
+NoHang(/(?=å)(((.*)*)*x)/);  // Positive lookahead also prunes continuation.
+NoHang(/(æ|ø|å)(((.*)*)*x)/);  // All branches of alternation are filtered.
+NoHang(/(a|b|(((.*)*)*x))å/);  // 1 out of 3 branches pruned.
+NoHang(/(a|(((.*)*)*x)ø|(((.*)*)*x)å)/);  // 2 out of 3 branches pruned.
+
+var s = "Don't prune based on a repetition of length 0";
+assertEquals(null, s.match(/å{1,1}prune/));
+assertEquals("prune", (s.match(/å{0,0}prune/)[0]));