[regexp] Restrict unicode property value expressions

The unicode property escape syntax restricts unicode property names and unicode property values to consist only of characters taken from the [a-zA-Z0-9_] character class. See the spec at: https://tc39.github.io/proposal-regexp-unicode-property-escapes/ In most cases, we do not actually need to validate that this is the case, since subsequent property lookup in ICU will fail (and throw a SyntaxError) if the given property does not exist. However, there is one special case. The ICU lookup takes the property name as a null-terminated string, so it will accept carefully malformed property names (e.g. '\p{Number\0[}'). This can end up confusing the regexp parser. With this CL, we explicitly restrict potential property names / values to the character set as specified. Bug: v8:4743, chromium:793793 Change-Id: Ic97deea8602571ec6793b79c4bb858e1c7597405 Reviewed-on: https://chromium-review.googlesource.com/824272 Reviewed-by: Mathias Bynens <mathias@chromium.org> Reviewed-by: Sathya Gunasekaran <gsathya@chromium.org> Reviewed-by: Yang Guo <yangguo@chromium.org> Commit-Queue: Jakob Gruber <jgruber@chromium.org> Cr-Commit-Position: refs/heads/master@{#50130}
2017-12-13 15:00:50 +01:00 · 2017-12-13 15:00:50 +01:00 · 0da56e74cf
commit 0da56e74cf
parent 649ab060c0
2 changed files with 26 additions and 0 deletions
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@ -1336,6 +1336,19 @@ bool IsSupportedBinaryProperty(UProperty property) {
  return false;
 }

+bool IsUnicodePropertyValueCharacter(char c) {
+  // Returns true iff c is in [a-zA-Z0-9_], the only characters the spec allows
+  // in unicode property names and values:
+  // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
+  //
+  // Note that using this to validate each parsed char is quite conservative.
+  // A possible alternative solution would be to only ensure the parsed
+  // property name/value candidate string does not contain '\0' characters and
+  // let ICU lookups trigger the final failure.
+  //
+  // NOTE(review): callers pass current() and separately static_cast it to
+  // char, so the argument is presumably narrowed implicitly here; confirm that
+  // code points above the char range cannot alias an accepted character.
+  if ('a' <= c && c <= 'z') return true;
+  if ('A' <= c && c <= 'Z') return true;
+  if ('0' <= c && c <= '9') return true;
+  return (c == '_');
+}
+
 }  // anonymous namespace

 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
@ -1353,11 +1366,13 @@ bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
  if (current() == '{') {
    // Parse \p{[PropertyName=]PropertyNameValue}
    for (Advance(); current() != '}' && current() != '='; Advance()) {
+      if (!IsUnicodePropertyValueCharacter(current())) return false;
      if (!has_next()) return false;
      first_part.push_back(static_cast<char>(current()));
    }
    if (current() == '=') {
      for (Advance(); current() != '}'; Advance()) {
+        if (!IsUnicodePropertyValueCharacter(current())) return false;
        if (!has_next()) return false;
        second_part.push_back(static_cast<char>(current()));
      }
@ -1369,6 +1384,10 @@ bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
  Advance();
  first_part.push_back(0);  // null-terminate string.

+  DCHECK(first_part.size() - 1 == std::strlen(first_part.data()));
+  DCHECK(second_part.empty() ||
+         second_part.size() - 1 == std::strlen(second_part.data()));
+
  if (second_part.empty()) {
    // First attempt to interpret as general category property value name.
    const char* name = first_part.data();
--- a/test/mjsunit/regress/regress-793793.js
+++ b/test/mjsunit/regress/regress-793793.js
@ -0,0 +1,7 @@
+// Copyright 2017 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Flags: --harmony-regexp-property
+
+assertThrows(() => new RegExp("\\1(\\P{P\0[}()/", "u"), SyntaxError);