Revert of [regexp] restrict pattern syntax for unicode mode. (patchset #6 id:120001 of https://codereview.chromium.org/1645573002/ )

Reason for revert:
[Sheriff] Breaks layout tests:
https://build.chromium.org/p/client.v8.fyi/builders/V8-Blink%20Linux%2064/builds/4222

Original issue's description:
> [regexp] restrict pattern syntax for unicode mode.
>
> ES2015 Annex B.1.4 specifies a restricted pattern language for unicode
> mode. This change reflects that, based on some test262 test cases.
>
> R=littledan@chromium.org
> BUG=v8:2952
> LOG=N
>
> Committed: https://crrev.com/e918c4ec464456a374098049ca22eac2107f6223
> Cr-Commit-Position: refs/heads/master@{#33584}

TBR=vogelheim@chromium.org,yangguo@chromium.org
# Skipping CQ checks because original CL landed less than 1 day ago.
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true
BUG=v8:2952

Review URL: https://codereview.chromium.org/1648673002

Cr-Commit-Position: refs/heads/master@{#33588}
2016-01-28 07:26:55 -08:00 · 2016-01-28 07:26:55 -08:00 · 8b5a7eb6bf
commit 8b5a7eb6bf
parent 9c4d1396b6
3 changed files with 75 additions and 152 deletions
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@ -102,28 +102,11 @@ void RegExpParser::Advance(int dist) {

 bool RegExpParser::simple() { return simple_; }

-bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
-  switch (c) {
-    case '^':
-    case '$':
-    case '\\':
-    case '.':
-    case '*':
-    case '+':
-    case '?':
-    case '(':
-    case ')':
-    case '[':
-    case ']':
-    case '{':
-    case '}':
-    case '|':
-    case '/':
-      return true;
-    default:
-      break;
-  }
-  return false;
+
+bool RegExpParser::IsSyntaxCharacter(uc32 c) {
+  return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
+         c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
+         c == '{' || c == '}' || c == '|';
 }


@ -178,14 +161,14 @@ RegExpTree* RegExpParser::ParseDisjunction() {
      case kEndMarker:
        if (state->IsSubexpression()) {
          // Inside a parenthesized group when hitting end of input.
-          return ReportError(CStrVector("Unterminated group"));
+          ReportError(CStrVector("Unterminated group") CHECK_FAILED);
        }
        DCHECK_EQ(INITIAL, state->group_type());
        // Parsing completed successfully.
        return builder->ToRegExp();
      case ')': {
        if (!state->IsSubexpression()) {
-          return ReportError(CStrVector("Unmatched ')'"));
+          ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
        }
        DCHECK_NE(INITIAL, state->group_type());

@ -293,12 +276,13 @@ RegExpTree* RegExpParser::ParseDisjunction() {
              }
            // Fall through.
            default:
-              return ReportError(CStrVector("Invalid group"));
+              ReportError(CStrVector("Invalid group") CHECK_FAILED);
+              break;
          }
          Advance(2);
        } else {
          if (captures_started_ >= kMaxCaptures) {
-            return ReportError(CStrVector("Too many captures"));
+            ReportError(CStrVector("Too many captures") CHECK_FAILED);
          }
          captures_started_++;
        }
@ -376,25 +360,24 @@ RegExpTree* RegExpParser::ParseDisjunction() {
              }
              break;
            }
-            // With /u, no identity escapes except for syntax characters
-            // are allowed. Otherwise, all identity escapes are allowed.
-            if (unicode()) {
-              return ReportError(CStrVector("Invalid escape"));
-            }
            uc32 first_digit = Next();
            if (first_digit == '8' || first_digit == '9') {
-              builder->AddCharacter(first_digit);
-              Advance(2);
+              // If the 'u' flag is present, only syntax characters can be
+              // escaped; no other identity escapes are allowed.
+              // If the 'u' flag is not present, all identity escapes are
+              // allowed.
+              if (!unicode()) {
+                builder->AddCharacter(first_digit);
+                Advance(2);
+              } else {
+                return ReportError(CStrVector("Invalid escape"));
+              }
              break;
            }
          }
          // FALLTHROUGH
          case '0': {
            Advance();
-            if (unicode() && Next() >= '0' && Next() <= '9') {
-              // With /u, decimal escape with leading 0 are not parsed as octal.
-              return ReportError(CStrVector("Invalid decimal escape"));
-            }
            uc32 octal = ParseOctalLiteral();
            builder->AddCharacter(octal);
            break;
@ -432,10 +415,6 @@ RegExpTree* RegExpParser::ParseDisjunction() {
              // This is outside the specification. We match JSC in
              // reading the backslash as a literal character instead
              // of as starting an escape.
-              if (unicode()) {
-                // With /u, invalid escapes are not treated as identity escapes.
-                return ReportError(CStrVector("Invalid unicode escape"));
-              }
              builder->AddCharacter('\\');
            } else {
              Advance(2);
@ -451,7 +430,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
            } else if (!unicode()) {
              builder->AddCharacter('x');
            } else {
-              // With /u, invalid escapes are not treated as identity escapes.
+              // If the 'u' flag is present, invalid escapes are not treated as
+              // identity escapes.
              return ReportError(CStrVector("Invalid escape"));
            }
            break;
@ -464,16 +444,20 @@ RegExpTree* RegExpParser::ParseDisjunction() {
            } else if (!unicode()) {
              builder->AddCharacter('u');
            } else {
-              // With /u, invalid escapes are not treated as identity escapes.
+              // If the 'u' flag is present, invalid escapes are not treated as
+              // identity escapes.
              return ReportError(CStrVector("Invalid unicode escape"));
            }
            break;
          }
          default:
            Advance();
-            // With /u, no identity escapes except for syntax characters
-            // are allowed. Otherwise, all identity escapes are allowed.
-            if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
+            // If the 'u' flag is present, only syntax characters can be
+            // escaped; no other identity escapes are allowed.
+            //
+            // If the 'u' flag is not present, all identity escapes are
+            // allowed.
+            if (!unicode() || IsSyntaxCharacter(current())) {
              builder->AddCharacter(current());
              Advance();
            } else {
@ -485,16 +469,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {
      case '{': {
        int dummy;
        if (ParseIntervalQuantifier(&dummy, &dummy)) {
-          return ReportError(CStrVector("Nothing to repeat"));
+          ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
        }
        // fallthrough
      }
-      case '}':
-      case ']':
-        if (unicode()) {
-          return ReportError(CStrVector("Lone quantifier brackets"));
-        }
-      // fallthrough
      default:
        builder->AddUnicodeCharacter(current());
        Advance();
@ -527,15 +505,13 @@ RegExpTree* RegExpParser::ParseDisjunction() {
      case '{':
        if (ParseIntervalQuantifier(&min, &max)) {
          if (max < min) {
-            return ReportError(
-                CStrVector("numbers out of order in {} quantifier"));
+            ReportError(CStrVector("numbers out of order in {} quantifier.")
+                            CHECK_FAILED);
          }
          break;
-        } else if (unicode()) {
-          // With /u, incomplete quantifiers are not allowed.
-          return ReportError(CStrVector("Incomplete quantifier"));
+        } else {
+          continue;
        }
-        continue;
      default:
        continue;
    }
@ -548,9 +524,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
      quantifier_type = RegExpQuantifier::POSSESSIVE;
      Advance();
    }
-    if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
-      return ReportError(CStrVector("Invalid quantifier"));
-    }
+    builder->AddQuantifierToAtom(min, max, quantifier_type);
  }
 }

@ -848,24 +822,15 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
    case 'c': {
      uc32 controlLetter = Next();
      uc32 letter = controlLetter & ~('A' ^ 'a');
-      // For compatibility with JSC, inside a character class. We also accept
-      // digits and underscore as control characters, unless with /u.
-      if (letter >= 'A' && letter <= 'Z') {
+      // For compatibility with JSC, inside a character class
+      // we also accept digits and underscore as control characters.
+      if ((controlLetter >= '0' && controlLetter <= '9') ||
+          controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
        Advance(2);
        // Control letters mapped to ASCII control characters in the range
        // 0x00-0x1f.
        return controlLetter & 0x1f;
      }
-      if (unicode()) {
-        // With /u, invalid escapes are not treated as identity escapes.
-        ReportError(CStrVector("Invalid class escape"));
-        return 0;
-      }
-      if ((controlLetter >= '0' && controlLetter <= '9') ||
-          controlLetter == '_') {
-        Advance(2);
-        return controlLetter & 0x1f;
-      }
      // We match JSC in reading the backslash as a literal
      // character instead of as starting an escape.
      return '\\';
@ -881,43 +846,43 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
      // For compatibility, we interpret a decimal escape that isn't
      // a back reference (and therefore either \0 or not valid according
      // to the specification) as a 1..3 digit octal character code.
-      if (unicode()) {
-        // With /u, decimal escape is not interpreted as octal character code.
-        ReportError(CStrVector("Invalid class escape"));
-        return 0;
-      }
      return ParseOctalLiteral();
    case 'x': {
      Advance();
      uc32 value;
-      if (ParseHexEscape(2, &value)) return value;
-      if (unicode()) {
-        // With /u, invalid escapes are not treated as identity escapes.
-        ReportError(CStrVector("Invalid escape"));
-        return 0;
+      if (ParseHexEscape(2, &value)) {
+        return value;
      }
-      // If \x is not followed by a two-digit hexadecimal, treat it
-      // as an identity escape.
-      return 'x';
+      if (!unicode()) {
+        // If \x is not followed by a two-digit hexadecimal, treat it
+        // as an identity escape.
+        return 'x';
+      }
+      // If the 'u' flag is present, invalid escapes are not treated as
+      // identity escapes.
+      ReportError(CStrVector("Invalid escape"));
+      return 0;
    }
    case 'u': {
      Advance();
      uc32 value;
-      if (ParseUnicodeEscape(&value)) return value;
-      if (unicode()) {
-        // With /u, invalid escapes are not treated as identity escapes.
-        ReportError(CStrVector("Invalid unicode escape"));
-        return 0;
+      if (ParseUnicodeEscape(&value)) {
+        return value;
      }
-      // If \u is not followed by a two-digit hexadecimal, treat it
-      // as an identity escape.
-      return 'u';
+      if (!unicode()) {
+        return 'u';
+      }
+      // If the 'u' flag is present, invalid escapes are not treated as
+      // identity escapes.
+      ReportError(CStrVector("Invalid unicode escape"));
+      return 0;
    }
    default: {
      uc32 result = current();
-      // With /u, no identity escapes except for syntax characters are
-      // allowed. Otherwise, all identity escapes are allowed.
-      if (!unicode() || IsSyntaxCharacterOrSlash(result)) {
+      // If the 'u' flag is present, only syntax characters can be escaped, no
+      // other identity escapes are allowed. If the 'u' flag is not present, all
+      // identity escapes are allowed.
+      if (!unicode() || IsSyntaxCharacter(result)) {
        Advance();
        return result;
      }
@ -991,7 +956,6 @@ static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,

 RegExpTree* RegExpParser::ParseCharacterClass() {
  static const char* kUnterminated = "Unterminated character class";
-  static const char* kRangeInvalid = "Invalid character class";
  static const char* kRangeOutOfOrder = "Range out of order in character class";

  DCHECK_EQ(current(), '[');
@ -1021,18 +985,13 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
      CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
      if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
        // Either end is an escaped character class. Treat the '-' verbatim.
-        if (unicode()) {
-          // ES2015 21.2.2.15.1 step 1.
-          return ReportError(CStrVector(kRangeInvalid));
-        }
        AddRangeOrEscape(ranges, char_class, first, zone());
        ranges->Add(CharacterRange::Singleton('-'), zone());
        AddRangeOrEscape(ranges, char_class_2, next, zone());
        continue;
      }
-      // ES2015 21.2.2.15.1 step 6.
      if (first.from() > next.to()) {
-        return ReportError(CStrVector(kRangeOutOfOrder));
+        return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
      }
      ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
    } else {
@ -1040,7 +999,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
    }
  }
  if (!has_more()) {
-    return ReportError(CStrVector(kUnterminated));
+    return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
  }
  Advance();
  if (ranges->length() == 0) {
@ -1203,7 +1162,7 @@ void RegExpBuilder::AddEmpty() { pending_empty_ = true; }

 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
  if (NeedsDesugaringForUnicode(cc)) {
-    // With /u, character class needs to be desugared, so it
+    // In unicode mode, character class needs to be desugared, so it
    // must be a standalone term instead of being part of a RegExpText.
    AddTerm(cc);
  } else {
@ -1316,12 +1275,13 @@ RegExpTree* RegExpBuilder::ToRegExp() {
  return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
 }

-bool RegExpBuilder::AddQuantifierToAtom(
+
+void RegExpBuilder::AddQuantifierToAtom(
    int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
  FlushPendingSurrogate();
  if (pending_empty_) {
    pending_empty_ = false;
-    return true;
+    return;
  }
  RegExpTree* atom;
  if (characters_ != NULL) {
@ -1344,26 +1304,23 @@ bool RegExpBuilder::AddQuantifierToAtom(
  } else if (terms_.length() > 0) {
    DCHECK(last_added_ == ADD_ATOM);
    atom = terms_.RemoveLast();
-    // With /u, lookarounds are not quantifiable.
-    if (unicode() && atom->IsLookaround()) return false;
    if (atom->max_match() == 0) {
      // Guaranteed to only match an empty string.
      LAST(ADD_TERM);
      if (min == 0) {
-        return true;
+        return;
      }
      terms_.Add(atom, zone());
-      return true;
+      return;
    }
  } else {
    // Only call immediately after adding an atom or character!
    UNREACHABLE();
-    return false;
+    return;
  }
  terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
             zone());
  LAST(ADD_TERM);
-  return true;
 }

 }  // namespace internal
--- a/src/regexp/regexp-parser.h
+++ b/src/regexp/regexp-parser.h
@ -111,7 +111,7 @@ class RegExpBuilder : public ZoneObject {
  void AddTerm(RegExpTree* tree);
  void AddAssertion(RegExpTree* tree);
  void NewAlternative();  // '|'
-  bool AddQuantifierToAtom(int min, int max,
+  void AddQuantifierToAtom(int min, int max,
                           RegExpQuantifier::QuantifierType type);
  RegExpTree* ToRegExp();

@ -198,7 +198,7 @@ class RegExpParser BASE_EMBEDDED {
  bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
  bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }

-  static bool IsSyntaxCharacterOrSlash(uc32 c);
+  static bool IsSyntaxCharacter(uc32 c);

  static const int kMaxCaptures = 1 << 16;
  static const uc32 kEndMarker = (1 << 21);
--- a/test/mjsunit/harmony/unicode-regexp-restricted-syntax.js
+++ b/test/mjsunit/harmony/unicode-regexp-restricted-syntax.js
@ -1,34 +0,0 @@
-// Copyright 2016 the V8 project authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Flags: --harmony-unicode-regexps
-
-// test262/data/test/language/literals/regexp/u-dec-esc
-assertThrows("/\\1/u");
-// test262/language/literals/regexp/u-invalid-char-range-a
-assertThrows("/[\\w-a]/u");
-// test262/language/literals/regexp/u-invalid-char-range-b
-assertThrows("/[a-\\w]/u");
-// test262/language/literals/regexp/u-invalid-char-esc
-assertThrows("/\\c/u");
-assertThrows("/\\c0/u");
-// test262/built-ins/RegExp/unicode_restricted_quantifiable_assertion
-assertThrows("/(?=.)*/u");
-// test262/built-ins/RegExp/unicode_restricted_octal_escape
-assertThrows("/[\\1]/u");
-assertThrows("/\\00/u");
-assertThrows("/\\09/u");
-// test262/built-ins/RegExp/unicode_restricted_identity_escape_alpha
-assertThrows("/[\\c]/u");
-// test262/built-ins/RegExp/unicode_restricted_identity_escape_c
-assertThrows("/[\\c0]/u");
-// test262/built-ins/RegExp/unicode_restricted_incomple_quantifier
-assertThrows("/a{/u");
-assertThrows("/a{1,/u");
-assertThrows("/{/u");
-assertThrows("/}/u");
-// test262/data/test/built-ins/RegExp/unicode_restricted_brackets
-assertThrows("/]/u");
-// test262/built-ins/RegExp/unicode_identity_escape
-/\//u;