[regexp] restrict pattern syntax for unicode mode.

ES2015 Annex B.1.4 specifies a restricted pattern language for unicode mode. This change reflects that, based on some test262 test cases. R=littledan@chromium.org BUG=v8:2952 LOG=N Committed: https://crrev.com/e918c4ec464456a374098049ca22eac2107f6223 Cr-Commit-Position: refs/heads/master@{#33584} Review URL: https://codereview.chromium.org/1645573002 Cr-Commit-Position: refs/heads/master@{#33603}
2016-01-29 01:21:09 -08:00 · 2016-01-29 01:21:09 -08:00 · bb6a53573c
commit bb6a53573c
parent b6c9b70356
3 changed files with 152 additions and 75 deletions
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@ -102,11 +102,28 @@ void RegExpParser::Advance(int dist) {

 bool RegExpParser::simple() { return simple_; }

-
-bool RegExpParser::IsSyntaxCharacter(uc32 c) {
-  return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
-         c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
-         c == '{' || c == '}' || c == '|';
+bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
+  switch (c) {
+    case '^':
+    case '$':
+    case '\\':
+    case '.':
+    case '*':
+    case '+':
+    case '?':
+    case '(':
+    case ')':
+    case '[':
+    case ']':
+    case '{':
+    case '}':
+    case '|':
+    case '/':
+      return true;
+    default:
+      break;
+  }
+  return false;
 }


@ -161,14 +178,14 @@ RegExpTree* RegExpParser::ParseDisjunction() {
      case kEndMarker:
        if (state->IsSubexpression()) {
          // Inside a parenthesized group when hitting end of input.
-          ReportError(CStrVector("Unterminated group") CHECK_FAILED);
+          return ReportError(CStrVector("Unterminated group"));
        }
        DCHECK_EQ(INITIAL, state->group_type());
        // Parsing completed successfully.
        return builder->ToRegExp();
      case ')': {
        if (!state->IsSubexpression()) {
-          ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
+          return ReportError(CStrVector("Unmatched ')'"));
        }
        DCHECK_NE(INITIAL, state->group_type());

@ -276,13 +293,12 @@ RegExpTree* RegExpParser::ParseDisjunction() {
              }
            // Fall through.
            default:
-              ReportError(CStrVector("Invalid group") CHECK_FAILED);
-              break;
+              return ReportError(CStrVector("Invalid group"));
          }
          Advance(2);
        } else {
          if (captures_started_ >= kMaxCaptures) {
-            ReportError(CStrVector("Too many captures") CHECK_FAILED);
+            return ReportError(CStrVector("Too many captures"));
          }
          captures_started_++;
        }
@ -360,24 +376,25 @@ RegExpTree* RegExpParser::ParseDisjunction() {
              }
              break;
            }
+            // With /u, no identity escapes except for syntax characters
+            // are allowed. Otherwise, all identity escapes are allowed.
+            if (unicode()) {
+              return ReportError(CStrVector("Invalid escape"));
+            }
            uc32 first_digit = Next();
            if (first_digit == '8' || first_digit == '9') {
-              // If the 'u' flag is present, only syntax characters can be
-              // escaped,
-              // no other identity escapes are allowed. If the 'u' flag is not
-              // present, all identity escapes are allowed.
-              if (!unicode()) {
-                builder->AddCharacter(first_digit);
-                Advance(2);
-              } else {
-                return ReportError(CStrVector("Invalid escape"));
-              }
+              builder->AddCharacter(first_digit);
+              Advance(2);
              break;
            }
          }
          // FALLTHROUGH
          case '0': {
            Advance();
+            if (unicode() && Next() >= '0' && Next() <= '9') {
+              // With /u, a leading-zero decimal escape is not parsed as octal.
+              return ReportError(CStrVector("Invalid decimal escape"));
+            }
            uc32 octal = ParseOctalLiteral();
            builder->AddCharacter(octal);
            break;
@ -415,6 +432,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {
              // This is outside the specification. We match JSC in
              // reading the backslash as a literal character instead
              // of as starting an escape.
+              if (unicode()) {
+                // With /u, invalid escapes are not treated as identity escapes.
+                return ReportError(CStrVector("Invalid unicode escape"));
+              }
              builder->AddCharacter('\\');
            } else {
              Advance(2);
@ -430,8 +451,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
            } else if (!unicode()) {
              builder->AddCharacter('x');
            } else {
-              // If the 'u' flag is present, invalid escapes are not treated as
-              // identity escapes.
+              // With /u, invalid escapes are not treated as identity escapes.
              return ReportError(CStrVector("Invalid escape"));
            }
            break;
@ -444,20 +464,16 @@ RegExpTree* RegExpParser::ParseDisjunction() {
            } else if (!unicode()) {
              builder->AddCharacter('u');
            } else {
-              // If the 'u' flag is present, invalid escapes are not treated as
-              // identity escapes.
+              // With /u, invalid escapes are not treated as identity escapes.
              return ReportError(CStrVector("Invalid unicode escape"));
            }
            break;
          }
          default:
            Advance();
-            // If the 'u' flag is present, only syntax characters can be
-            // escaped, no
-            // other identity escapes are allowed. If the 'u' flag is not
-            // present,
-            // all identity escapes are allowed.
-            if (!unicode() || IsSyntaxCharacter(current())) {
+            // With /u, no identity escapes except for syntax characters
+            // are allowed. Otherwise, all identity escapes are allowed.
+            if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
              builder->AddCharacter(current());
              Advance();
            } else {
@ -469,10 +485,16 @@ RegExpTree* RegExpParser::ParseDisjunction() {
      case '{': {
        int dummy;
        if (ParseIntervalQuantifier(&dummy, &dummy)) {
-          ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
+          return ReportError(CStrVector("Nothing to repeat"));
        }
        // fallthrough
      }
+      case '}':
+      case ']':
+        if (unicode()) {
+          return ReportError(CStrVector("Lone quantifier brackets"));
+        }
+      // fallthrough
      default:
        builder->AddUnicodeCharacter(current());
        Advance();
@ -505,13 +527,15 @@ RegExpTree* RegExpParser::ParseDisjunction() {
      case '{':
        if (ParseIntervalQuantifier(&min, &max)) {
          if (max < min) {
-            ReportError(CStrVector("numbers out of order in {} quantifier.")
-                            CHECK_FAILED);
+            return ReportError(
+                CStrVector("numbers out of order in {} quantifier"));
          }
          break;
-        } else {
-          continue;
+        } else if (unicode()) {
+          // With /u, incomplete quantifiers are not allowed.
+          return ReportError(CStrVector("Incomplete quantifier"));
        }
+        continue;
      default:
        continue;
    }
@ -524,7 +548,9 @@ RegExpTree* RegExpParser::ParseDisjunction() {
      quantifier_type = RegExpQuantifier::POSSESSIVE;
      Advance();
    }
-    builder->AddQuantifierToAtom(min, max, quantifier_type);
+    if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
+      return ReportError(CStrVector("Invalid quantifier"));
+    }
  }
 }

@ -822,15 +848,24 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
    case 'c': {
      uc32 controlLetter = Next();
      uc32 letter = controlLetter & ~('A' ^ 'a');
-      // For compatibility with JSC, inside a character class
-      // we also accept digits and underscore as control characters.
-      if ((controlLetter >= '0' && controlLetter <= '9') ||
-          controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
+      // For compatibility with JSC, inside a character class we also accept
+      // digits and underscore as control characters, unless with /u.
+      if (letter >= 'A' && letter <= 'Z') {
        Advance(2);
        // Control letters mapped to ASCII control characters in the range
        // 0x00-0x1f.
        return controlLetter & 0x1f;
      }
+      if (unicode()) {
+        // With /u, invalid escapes are not treated as identity escapes.
+        ReportError(CStrVector("Invalid class escape"));
+        return 0;
+      }
+      if ((controlLetter >= '0' && controlLetter <= '9') ||
+          controlLetter == '_') {
+        Advance(2);
+        return controlLetter & 0x1f;
+      }
      // We match JSC in reading the backslash as a literal
      // character instead of as starting an escape.
      return '\\';
@ -846,43 +881,43 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
      // For compatibility, we interpret a decimal escape that isn't
      // a back reference (and therefore either \0 or not valid according
      // to the specification) as a 1..3 digit octal character code.
+      if (unicode()) {
+        // With /u, decimal escape is not interpreted as octal character code.
+        ReportError(CStrVector("Invalid class escape"));
+        return 0;
+      }
      return ParseOctalLiteral();
    case 'x': {
      Advance();
      uc32 value;
-      if (ParseHexEscape(2, &value)) {
-        return value;
+      if (ParseHexEscape(2, &value)) return value;
+      if (unicode()) {
+        // With /u, invalid escapes are not treated as identity escapes.
+        ReportError(CStrVector("Invalid escape"));
+        return 0;
      }
-      if (!unicode()) {
-        // If \x is not followed by a two-digit hexadecimal, treat it
-        // as an identity escape.
-        return 'x';
-      }
-      // If the 'u' flag is present, invalid escapes are not treated as
-      // identity escapes.
-      ReportError(CStrVector("Invalid escape"));
-      return 0;
+      // If \x is not followed by a two-digit hexadecimal, treat it
+      // as an identity escape.
+      return 'x';
    }
    case 'u': {
      Advance();
      uc32 value;
-      if (ParseUnicodeEscape(&value)) {
-        return value;
+      if (ParseUnicodeEscape(&value)) return value;
+      if (unicode()) {
+        // With /u, invalid escapes are not treated as identity escapes.
+        ReportError(CStrVector("Invalid unicode escape"));
+        return 0;
      }
-      if (!unicode()) {
-        return 'u';
-      }
-      // If the 'u' flag is present, invalid escapes are not treated as
-      // identity escapes.
-      ReportError(CStrVector("Invalid unicode escape"));
-      return 0;
+      // If \u is not followed by a valid unicode escape sequence, treat it
+      // as an identity escape.
+      return 'u';
    }
    default: {
      uc32 result = current();
-      // If the 'u' flag is present, only syntax characters can be escaped, no
-      // other identity escapes are allowed. If the 'u' flag is not present, all
-      // identity escapes are allowed.
-      if (!unicode() || IsSyntaxCharacter(result)) {
+      // With /u, no identity escapes except for syntax characters are
+      // allowed. Otherwise, all identity escapes are allowed.
+      if (!unicode() || IsSyntaxCharacterOrSlash(result)) {
        Advance();
        return result;
      }
@ -956,6 +991,7 @@ static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,

 RegExpTree* RegExpParser::ParseCharacterClass() {
  static const char* kUnterminated = "Unterminated character class";
+  static const char* kRangeInvalid = "Invalid character class";
  static const char* kRangeOutOfOrder = "Range out of order in character class";

  DCHECK_EQ(current(), '[');
@ -985,13 +1021,18 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
      CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
      if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
        // Either end is an escaped character class. Treat the '-' verbatim.
+        if (unicode()) {
+          // ES2015 21.2.2.15.1 step 1.
+          return ReportError(CStrVector(kRangeInvalid));
+        }
        AddRangeOrEscape(ranges, char_class, first, zone());
        ranges->Add(CharacterRange::Singleton('-'), zone());
        AddRangeOrEscape(ranges, char_class_2, next, zone());
        continue;
      }
+      // ES2015 21.2.2.15.1 step 6.
      if (first.from() > next.to()) {
-        return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
+        return ReportError(CStrVector(kRangeOutOfOrder));
      }
      ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
    } else {
@ -999,7 +1040,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
    }
  }
  if (!has_more()) {
-    return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
+    return ReportError(CStrVector(kUnterminated));
  }
  Advance();
  if (ranges->length() == 0) {
@ -1162,7 +1203,7 @@ void RegExpBuilder::AddEmpty() { pending_empty_ = true; }

 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
  if (NeedsDesugaringForUnicode(cc)) {
-    // In unicode mode, character class needs to be desugared, so it
+    // With /u, character class needs to be desugared, so it
    // must be a standalone term instead of being part of a RegExpText.
    AddTerm(cc);
  } else {
@ -1275,13 +1316,12 @@ RegExpTree* RegExpBuilder::ToRegExp() {
  return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
 }

-
-void RegExpBuilder::AddQuantifierToAtom(
+bool RegExpBuilder::AddQuantifierToAtom(
    int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
  FlushPendingSurrogate();
  if (pending_empty_) {
    pending_empty_ = false;
-    return;
+    return true;
  }
  RegExpTree* atom;
  if (characters_ != NULL) {
@ -1304,23 +1344,26 @@ void RegExpBuilder::AddQuantifierToAtom(
  } else if (terms_.length() > 0) {
    DCHECK(last_added_ == ADD_ATOM);
    atom = terms_.RemoveLast();
+    // With /u, lookarounds are not quantifiable.
+    if (unicode() && atom->IsLookaround()) return false;
    if (atom->max_match() == 0) {
      // Guaranteed to only match an empty string.
      LAST(ADD_TERM);
      if (min == 0) {
-        return;
+        return true;
      }
      terms_.Add(atom, zone());
-      return;
+      return true;
    }
  } else {
    // Only call immediately after adding an atom or character!
    UNREACHABLE();
-    return;
+    return false;
  }
  terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
             zone());
  LAST(ADD_TERM);
+  return true;
 }

 }  // namespace internal
--- a/src/regexp/regexp-parser.h
+++ b/src/regexp/regexp-parser.h
@ -111,7 +111,7 @@ class RegExpBuilder : public ZoneObject {
  void AddTerm(RegExpTree* tree);
  void AddAssertion(RegExpTree* tree);
  void NewAlternative();  // '|'
-  void AddQuantifierToAtom(int min, int max,
+  bool AddQuantifierToAtom(int min, int max,
                           RegExpQuantifier::QuantifierType type);
  RegExpTree* ToRegExp();

@ -198,7 +198,7 @@ class RegExpParser BASE_EMBEDDED {
  bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
  bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }

-  static bool IsSyntaxCharacter(uc32 c);
+  static bool IsSyntaxCharacterOrSlash(uc32 c);

  static const int kMaxCaptures = 1 << 16;
  static const uc32 kEndMarker = (1 << 21);
--- a/test/mjsunit/harmony/unicode-regexp-restricted-syntax.js
+++ b/test/mjsunit/harmony/unicode-regexp-restricted-syntax.js
@ -0,0 +1,34 @@
+// Copyright 2016 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Flags: --harmony-unicode-regexps
+
+// test262/data/test/language/literals/regexp/u-dec-esc
+assertThrows("/\\1/u");
+// test262/language/literals/regexp/u-invalid-char-range-a
+assertThrows("/[\\w-a]/u");
+// test262/language/literals/regexp/u-invalid-char-range-b
+assertThrows("/[a-\\w]/u");
+// test262/language/literals/regexp/u-invalid-char-esc
+assertThrows("/\\c/u");
+assertThrows("/\\c0/u");
+// test262/built-ins/RegExp/unicode_restricted_quantifiable_assertion
+assertThrows("/(?=.)*/u");
+// test262/built-ins/RegExp/unicode_restricted_octal_escape
+assertThrows("/[\\1]/u");
+assertThrows("/\\00/u");
+assertThrows("/\\09/u");
+// test262/built-ins/RegExp/unicode_restricted_identity_escape_alpha
+assertThrows("/[\\c]/u");
+// test262/built-ins/RegExp/unicode_restricted_identity_escape_c
+assertThrows("/[\\c0]/u");
+// test262/built-ins/RegExp/unicode_restricted_incomple_quantifier
+assertThrows("/a{/u");
+assertThrows("/a{1,/u");
+assertThrows("/{/u");
+assertThrows("/}/u");
+// test262/data/test/built-ins/RegExp/unicode_restricted_brackets
+assertThrows("/]/u");
+// test262/built-ins/RegExp/unicode_identity_escape
+/\//u;