[regexp] Consider surrogate pairs when optimizing disjunctions

RationalizeConsecutiveAtoms optimizes ab|ac|az to a(?:b|c|z). Ensure that this optimization does not split surrogate pairs in unicode mode. BUG=chromium:641091 Review-Url: https://codereview.chromium.org/2813893002 Cr-Commit-Position: refs/heads/master@{#44599}
2017-04-12 02:09:12 -07:00 · 2017-04-12 02:09:12 -07:00 · 4635572471
commit 4635572471
parent 483812d46c
4 changed files with 64 additions and 24 deletions
--- a/src/regexp/jsregexp.cc
+++ b/src/regexp/jsregexp.cc
@ -3327,9 +3327,8 @@ TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
                                             RegExpNode* on_success) {
  DCHECK_NOT_NULL(ranges);
  ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
-  elms->Add(
-      TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
-      zone);
+  elms->Add(TextElement::CharClass(new (zone) RegExpCharacterClass(ranges)),
+            zone);
  return new (zone) TextNode(elms, read_backward, on_success);
 }

@ -3341,12 +3340,12 @@ TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
  ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
  ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
  ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
-  elms->Add(TextElement::CharClass(
-                new (zone) RegExpCharacterClass(lead_ranges, false)),
-            zone);
-  elms->Add(TextElement::CharClass(
-                new (zone) RegExpCharacterClass(trail_ranges, false)),
-            zone);
+  elms->Add(
+      TextElement::CharClass(new (zone) RegExpCharacterClass(lead_ranges)),
+      zone);
+  elms->Add(
+      TextElement::CharClass(new (zone) RegExpCharacterClass(trail_ranges)),
+      zone);
  return new (zone) TextNode(elms, read_backward, on_success);
 }

@ -4851,7 +4850,7 @@ static bool CompareRanges(ZoneList<CharacterRange>* ranges,
 bool RegExpCharacterClass::is_standard(Zone* zone) {
  // TODO(lrn): Remove need for this function, by not throwing away information
  // along the way.
-  if (is_negated_) {
+  if (is_negated()) {
    return false;
  }
  if (set_.is_standard()) {
@ -5144,7 +5143,8 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
  if (compiler->needs_unicode_case_equivalents()) {
    AddUnicodeCaseEquivalents(ranges, zone);
  }
-  if (compiler->unicode() && !compiler->one_byte()) {
+  if (compiler->unicode() && !compiler->one_byte() &&
+      !contains_split_surrogate()) {
    if (is_negated()) {
      ZoneList<CharacterRange>* negated =
          new (zone) ZoneList<CharacterRange>(2, zone);
@ -5154,7 +5154,7 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
    if (ranges->length() == 0) {
      ranges->Add(CharacterRange::Everything(), zone);
      RegExpCharacterClass* fail =
-          new (zone) RegExpCharacterClass(ranges, true);
+          new (zone) RegExpCharacterClass(ranges, NEGATED);
      return new (zone) TextNode(fail, compiler->read_backward(), on_success);
    }
    if (standard_type() == '*') {
@ -5368,6 +5368,9 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions(
      i++;
      continue;
    }
+    DCHECK(!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
+    bool contains_trail_surrogate =
+        unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
    int first_in_run = i;
    i++;
    while (i < length) {
@ -5375,6 +5378,9 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions(
      if (!alternative->IsAtom()) break;
      atom = alternative->AsAtom();
      if (atom->length() != 1) break;
+      DCHECK(!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
+      contains_trail_surrogate |=
+          unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
      i++;
    }
    if (i > first_in_run + 1) {
@ -5387,8 +5393,12 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions(
        DCHECK_EQ(old_atom->length(), 1);
        ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
      }
+      RegExpCharacterClass::Flags flags;
+      if (compiler->unicode() && contains_trail_surrogate) {
+        flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
+      }
      alternatives->at(write_posn++) =
-          new (zone) RegExpCharacterClass(ranges, false);
+          new (zone) RegExpCharacterClass(ranges, flags);
    } else {
      // Just copy any trivial alternatives.
      for (int j = first_in_run; j < i; j++) {
--- a/src/regexp/regexp-ast.h
+++ b/src/regexp/regexp-ast.h
@ -291,9 +291,20 @@ class RegExpAssertion final : public RegExpTree {

 class RegExpCharacterClass final : public RegExpTree {
 public:
-  RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated)
-      : set_(ranges), is_negated_(is_negated) {}
-  explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {}
+  // NEGATED: The character class is negated and should match everything but
+  //     the specified ranges.
+  // CONTAINS_SPLIT_SURROGATE: The character class contains part of a split
+  //     surrogate and should not be unicode-desugared (crbug.com/641091).
+  enum Flag {
+    NEGATED = 1 << 0,
+    CONTAINS_SPLIT_SURROGATE = 1 << 1,
+  };
+  typedef base::Flags<Flag> Flags;
+
+  explicit RegExpCharacterClass(ZoneList<CharacterRange>* ranges,
+                                Flags flags = Flags())
+      : set_(ranges), flags_(flags) {}
+  explicit RegExpCharacterClass(uc16 type) : set_(type), flags_(0) {}
  void* Accept(RegExpVisitor* visitor, void* data) override;
  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
  RegExpCharacterClass* AsCharacterClass() override;
@ -322,11 +333,14 @@ class RegExpCharacterClass final : public RegExpTree {
  // * : All characters, for advancing unanchored regexp
  uc16 standard_type() { return set_.standard_set_type(); }
  ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
-  bool is_negated() { return is_negated_; }
+  bool is_negated() const { return (flags_ & NEGATED) != 0; }
+  bool contains_split_surrogate() const {
+    return (flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
+  }

 private:
  CharacterSet set_;
-  bool is_negated_;
+  const Flags flags_;
 };


--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@ -283,8 +283,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
          CharacterRange::AddClassEscape('.', ranges, false, zone());
        }

-        RegExpCharacterClass* cc =
-            new (zone()) RegExpCharacterClass(ranges, false);
+        RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(ranges);
        builder->AddCharacterClass(cc);
        break;
      }
@ -392,7 +391,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
            CharacterRange::AddClassEscape(c, ranges,
                                           unicode() && ignore_case(), zone());
            RegExpCharacterClass* cc =
-                new (zone()) RegExpCharacterClass(ranges, false);
+                new (zone()) RegExpCharacterClass(ranges);
            builder->AddCharacterClass(cc);
            break;
          }
@ -408,7 +407,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
                  return ReportError(CStrVector("Invalid property name"));
                }
                RegExpCharacterClass* cc =
-                    new (zone()) RegExpCharacterClass(ranges, false);
+                    new (zone()) RegExpCharacterClass(ranges);
                builder->AddCharacterClass(cc);
              } else {
                // With /u, no identity escapes except for syntax characters
@ -1548,7 +1547,9 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
    ranges->Add(CharacterRange::Everything(), zone());
    is_negated = !is_negated;
  }
-  return new (zone()) RegExpCharacterClass(ranges, is_negated);
+  RegExpCharacterClass::Flags flags;
+  if (is_negated) flags = RegExpCharacterClass::NEGATED;
+  return new (zone()) RegExpCharacterClass(ranges, flags);
 }


@ -1722,7 +1723,7 @@ void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {

 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
  AddTerm(new (zone()) RegExpCharacterClass(
-      CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
+      CharacterRange::List(zone(), CharacterRange::Singleton(c))));
 }


--- a/test/mjsunit/regress/regress-641091.js
+++ b/test/mjsunit/regress/regress-641091.js
@ -0,0 +1,15 @@
+// Copyright 2017 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+assertEquals(["🍤", "🍤"],
+             '🍤🍦🍋ππ🍋🍦🍤'.match(/🍤/ug));
+
+assertEquals(["🍤", "🍦", "🍦", "🍤"],
+             '🍤🍦🍋ππ🍋🍦🍤'.match(/🍤|🍦/ug));
+
+assertEquals(["🍤", "🍦", "🍋", "🍋", "🍦", "🍤"],
+             '🍤🍦🍋ππ🍋🍦🍤'.match(/🍤|🍦|🍋/ug));
+
+assertEquals(["🍤", "🍦", "🍋", "π", "π", "🍋", "🍦", "🍤"],
+             '🍤🍦🍋ππ🍋🍦🍤'.match(/🍤|🍦|π|🍋/ug));