[regexp] restrict pattern syntax for unicode mode.
ES2015 Annex B.1.4 specifies a restricted pattern language for unicode mode. This change reflects that, based on some test262 test cases. R=littledan@chromium.org BUG=v8:2952 LOG=N Committed: https://crrev.com/e918c4ec464456a374098049ca22eac2107f6223 Cr-Commit-Position: refs/heads/master@{#33584} Review URL: https://codereview.chromium.org/1645573002 Cr-Commit-Position: refs/heads/master@{#33603}
This commit is contained in:
parent
b6c9b70356
commit
bb6a53573c
@ -102,11 +102,28 @@ void RegExpParser::Advance(int dist) {
|
||||
|
||||
bool RegExpParser::simple() { return simple_; }
|
||||
|
||||
|
||||
bool RegExpParser::IsSyntaxCharacter(uc32 c) {
|
||||
return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
|
||||
c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
|
||||
c == '{' || c == '}' || c == '|';
|
||||
// Returns true when |c| is one of the regexp syntax characters
// ^ $ \ . * + ? ( ) [ ] { } | or the forward slash '/', and false for
// any other value.
bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
|
||||
switch (c) {
|
||||
case '^':
|
||||
case '$':
|
||||
case '\\':
|
||||
case '.':
|
||||
case '*':
|
||||
case '+':
|
||||
case '?':
|
||||
case '(':
|
||||
case ')':
|
||||
case '[':
|
||||
case ']':
|
||||
case '{':
|
||||
case '}':
|
||||
case '|':
|
||||
case '/':
|
||||
return true;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@ -161,14 +178,14 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
||||
case kEndMarker:
|
||||
if (state->IsSubexpression()) {
|
||||
// Inside a parenthesized group when hitting end of input.
|
||||
ReportError(CStrVector("Unterminated group") CHECK_FAILED);
|
||||
return ReportError(CStrVector("Unterminated group"));
|
||||
}
|
||||
DCHECK_EQ(INITIAL, state->group_type());
|
||||
// Parsing completed successfully.
|
||||
return builder->ToRegExp();
|
||||
case ')': {
|
||||
if (!state->IsSubexpression()) {
|
||||
ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
|
||||
return ReportError(CStrVector("Unmatched ')'"));
|
||||
}
|
||||
DCHECK_NE(INITIAL, state->group_type());
|
||||
|
||||
@ -276,13 +293,12 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
||||
}
|
||||
// Fall through.
|
||||
default:
|
||||
ReportError(CStrVector("Invalid group") CHECK_FAILED);
|
||||
break;
|
||||
return ReportError(CStrVector("Invalid group"));
|
||||
}
|
||||
Advance(2);
|
||||
} else {
|
||||
if (captures_started_ >= kMaxCaptures) {
|
||||
ReportError(CStrVector("Too many captures") CHECK_FAILED);
|
||||
return ReportError(CStrVector("Too many captures"));
|
||||
}
|
||||
captures_started_++;
|
||||
}
|
||||
@ -360,24 +376,25 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
||||
}
|
||||
break;
|
||||
}
|
||||
// With /u, no identity escapes except for syntax characters
|
||||
// are allowed. Otherwise, all identity escapes are allowed.
|
||||
if (unicode()) {
|
||||
return ReportError(CStrVector("Invalid escape"));
|
||||
}
|
||||
uc32 first_digit = Next();
|
||||
if (first_digit == '8' || first_digit == '9') {
|
||||
// If the 'u' flag is present, only syntax characters can be
|
||||
// escaped,
|
||||
// no other identity escapes are allowed. If the 'u' flag is not
|
||||
// present, all identity escapes are allowed.
|
||||
if (!unicode()) {
|
||||
builder->AddCharacter(first_digit);
|
||||
Advance(2);
|
||||
} else {
|
||||
return ReportError(CStrVector("Invalid escape"));
|
||||
}
|
||||
builder->AddCharacter(first_digit);
|
||||
Advance(2);
|
||||
break;
|
||||
}
|
||||
}
|
||||
// FALLTHROUGH
|
||||
case '0': {
|
||||
Advance();
|
||||
if (unicode() && Next() >= '0' && Next() <= '9') {
|
||||
// With /u, decimal escapes with a leading 0 are not parsed as octal.
|
||||
return ReportError(CStrVector("Invalid decimal escape"));
|
||||
}
|
||||
uc32 octal = ParseOctalLiteral();
|
||||
builder->AddCharacter(octal);
|
||||
break;
|
||||
@ -415,6 +432,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
||||
// This is outside the specification. We match JSC in
|
||||
// reading the backslash as a literal character instead
|
||||
// of as starting an escape.
|
||||
if (unicode()) {
|
||||
// With /u, invalid escapes are not treated as identity escapes.
|
||||
return ReportError(CStrVector("Invalid unicode escape"));
|
||||
}
|
||||
builder->AddCharacter('\\');
|
||||
} else {
|
||||
Advance(2);
|
||||
@ -430,8 +451,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
||||
} else if (!unicode()) {
|
||||
builder->AddCharacter('x');
|
||||
} else {
|
||||
// If the 'u' flag is present, invalid escapes are not treated as
|
||||
// identity escapes.
|
||||
// With /u, invalid escapes are not treated as identity escapes.
|
||||
return ReportError(CStrVector("Invalid escape"));
|
||||
}
|
||||
break;
|
||||
@ -444,20 +464,16 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
||||
} else if (!unicode()) {
|
||||
builder->AddCharacter('u');
|
||||
} else {
|
||||
// If the 'u' flag is present, invalid escapes are not treated as
|
||||
// identity escapes.
|
||||
// With /u, invalid escapes are not treated as identity escapes.
|
||||
return ReportError(CStrVector("Invalid unicode escape"));
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
Advance();
|
||||
// If the 'u' flag is present, only syntax characters can be
|
||||
// escaped, no
|
||||
// other identity escapes are allowed. If the 'u' flag is not
|
||||
// present,
|
||||
// all identity escapes are allowed.
|
||||
if (!unicode() || IsSyntaxCharacter(current())) {
|
||||
// With /u, no identity escapes except for syntax characters
|
||||
// are allowed. Otherwise, all identity escapes are allowed.
|
||||
if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
|
||||
builder->AddCharacter(current());
|
||||
Advance();
|
||||
} else {
|
||||
@ -469,10 +485,16 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
||||
case '{': {
|
||||
int dummy;
|
||||
if (ParseIntervalQuantifier(&dummy, &dummy)) {
|
||||
ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
|
||||
return ReportError(CStrVector("Nothing to repeat"));
|
||||
}
|
||||
// fallthrough
|
||||
}
|
||||
case '}':
|
||||
case ']':
|
||||
if (unicode()) {
|
||||
return ReportError(CStrVector("Lone quantifier brackets"));
|
||||
}
|
||||
// fallthrough
|
||||
default:
|
||||
builder->AddUnicodeCharacter(current());
|
||||
Advance();
|
||||
@ -505,13 +527,15 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
||||
case '{':
|
||||
if (ParseIntervalQuantifier(&min, &max)) {
|
||||
if (max < min) {
|
||||
ReportError(CStrVector("numbers out of order in {} quantifier.")
|
||||
CHECK_FAILED);
|
||||
return ReportError(
|
||||
CStrVector("numbers out of order in {} quantifier"));
|
||||
}
|
||||
break;
|
||||
} else {
|
||||
continue;
|
||||
} else if (unicode()) {
|
||||
// With /u, incomplete quantifiers are not allowed.
|
||||
return ReportError(CStrVector("Incomplete quantifier"));
|
||||
}
|
||||
continue;
|
||||
default:
|
||||
continue;
|
||||
}
|
||||
@ -524,7 +548,9 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
||||
quantifier_type = RegExpQuantifier::POSSESSIVE;
|
||||
Advance();
|
||||
}
|
||||
builder->AddQuantifierToAtom(min, max, quantifier_type);
|
||||
if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
|
||||
return ReportError(CStrVector("Invalid quantifier"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -822,15 +848,24 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
|
||||
case 'c': {
|
||||
uc32 controlLetter = Next();
|
||||
uc32 letter = controlLetter & ~('A' ^ 'a');
|
||||
// For compatibility with JSC, inside a character class
|
||||
// we also accept digits and underscore as control characters.
|
||||
if ((controlLetter >= '0' && controlLetter <= '9') ||
|
||||
controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
|
||||
// For compatibility with JSC, inside a character class we also accept
|
||||
// digits and underscore as control characters, unless with /u.
|
||||
if (letter >= 'A' && letter <= 'Z') {
|
||||
Advance(2);
|
||||
// Control letters mapped to ASCII control characters in the range
|
||||
// 0x00-0x1f.
|
||||
return controlLetter & 0x1f;
|
||||
}
|
||||
if (unicode()) {
|
||||
// With /u, invalid escapes are not treated as identity escapes.
|
||||
ReportError(CStrVector("Invalid class escape"));
|
||||
return 0;
|
||||
}
|
||||
if ((controlLetter >= '0' && controlLetter <= '9') ||
|
||||
controlLetter == '_') {
|
||||
Advance(2);
|
||||
return controlLetter & 0x1f;
|
||||
}
|
||||
// We match JSC in reading the backslash as a literal
|
||||
// character instead of as starting an escape.
|
||||
return '\\';
|
||||
@ -846,43 +881,43 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
|
||||
// For compatibility, we interpret a decimal escape that isn't
|
||||
// a back reference (and therefore either \0 or not valid according
|
||||
// to the specification) as a 1..3 digit octal character code.
|
||||
if (unicode()) {
|
||||
// With /u, decimal escape is not interpreted as octal character code.
|
||||
ReportError(CStrVector("Invalid class escape"));
|
||||
return 0;
|
||||
}
|
||||
return ParseOctalLiteral();
|
||||
case 'x': {
|
||||
Advance();
|
||||
uc32 value;
|
||||
if (ParseHexEscape(2, &value)) {
|
||||
return value;
|
||||
if (ParseHexEscape(2, &value)) return value;
|
||||
if (unicode()) {
|
||||
// With /u, invalid escapes are not treated as identity escapes.
|
||||
ReportError(CStrVector("Invalid escape"));
|
||||
return 0;
|
||||
}
|
||||
if (!unicode()) {
|
||||
// If \x is not followed by a two-digit hexadecimal, treat it
|
||||
// as an identity escape.
|
||||
return 'x';
|
||||
}
|
||||
// If the 'u' flag is present, invalid escapes are not treated as
|
||||
// identity escapes.
|
||||
ReportError(CStrVector("Invalid escape"));
|
||||
return 0;
|
||||
// If \x is not followed by a two-digit hexadecimal, treat it
|
||||
// as an identity escape.
|
||||
return 'x';
|
||||
}
|
||||
case 'u': {
|
||||
Advance();
|
||||
uc32 value;
|
||||
if (ParseUnicodeEscape(&value)) {
|
||||
return value;
|
||||
if (ParseUnicodeEscape(&value)) return value;
|
||||
if (unicode()) {
|
||||
// With /u, invalid escapes are not treated as identity escapes.
|
||||
ReportError(CStrVector("Invalid unicode escape"));
|
||||
return 0;
|
||||
}
|
||||
if (!unicode()) {
|
||||
return 'u';
|
||||
}
|
||||
// If the 'u' flag is present, invalid escapes are not treated as
|
||||
// identity escapes.
|
||||
ReportError(CStrVector("Invalid unicode escape"));
|
||||
return 0;
|
||||
// If \u is not followed by a four-digit hexadecimal, treat it
|
||||
// as an identity escape.
|
||||
return 'u';
|
||||
}
|
||||
default: {
|
||||
uc32 result = current();
|
||||
// If the 'u' flag is present, only syntax characters can be escaped, no
|
||||
// other identity escapes are allowed. If the 'u' flag is not present, all
|
||||
// identity escapes are allowed.
|
||||
if (!unicode() || IsSyntaxCharacter(result)) {
|
||||
// With /u, no identity escapes except for syntax characters are
|
||||
// allowed. Otherwise, all identity escapes are allowed.
|
||||
if (!unicode() || IsSyntaxCharacterOrSlash(result)) {
|
||||
Advance();
|
||||
return result;
|
||||
}
|
||||
@ -956,6 +991,7 @@ static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
|
||||
|
||||
RegExpTree* RegExpParser::ParseCharacterClass() {
|
||||
static const char* kUnterminated = "Unterminated character class";
|
||||
static const char* kRangeInvalid = "Invalid character class";
|
||||
static const char* kRangeOutOfOrder = "Range out of order in character class";
|
||||
|
||||
DCHECK_EQ(current(), '[');
|
||||
@ -985,13 +1021,18 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
|
||||
CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
|
||||
if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
|
||||
// Either end is an escaped character class. Treat the '-' verbatim.
|
||||
if (unicode()) {
|
||||
// ES2015 21.2.2.15.1 step 1.
|
||||
return ReportError(CStrVector(kRangeInvalid));
|
||||
}
|
||||
AddRangeOrEscape(ranges, char_class, first, zone());
|
||||
ranges->Add(CharacterRange::Singleton('-'), zone());
|
||||
AddRangeOrEscape(ranges, char_class_2, next, zone());
|
||||
continue;
|
||||
}
|
||||
// ES2015 21.2.2.15.1 step 6.
|
||||
if (first.from() > next.to()) {
|
||||
return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
|
||||
return ReportError(CStrVector(kRangeOutOfOrder));
|
||||
}
|
||||
ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
|
||||
} else {
|
||||
@ -999,7 +1040,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
|
||||
}
|
||||
}
|
||||
if (!has_more()) {
|
||||
return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
|
||||
return ReportError(CStrVector(kUnterminated));
|
||||
}
|
||||
Advance();
|
||||
if (ranges->length() == 0) {
|
||||
@ -1162,7 +1203,7 @@ void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
|
||||
|
||||
void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
|
||||
if (NeedsDesugaringForUnicode(cc)) {
|
||||
// In unicode mode, character class needs to be desugared, so it
|
||||
// With /u, character class needs to be desugared, so it
|
||||
// must be a standalone term instead of being part of a RegExpText.
|
||||
AddTerm(cc);
|
||||
} else {
|
||||
@ -1275,13 +1316,12 @@ RegExpTree* RegExpBuilder::ToRegExp() {
|
||||
return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
|
||||
}
|
||||
|
||||
|
||||
void RegExpBuilder::AddQuantifierToAtom(
|
||||
bool RegExpBuilder::AddQuantifierToAtom(
|
||||
int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
|
||||
FlushPendingSurrogate();
|
||||
if (pending_empty_) {
|
||||
pending_empty_ = false;
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
RegExpTree* atom;
|
||||
if (characters_ != NULL) {
|
||||
@ -1304,23 +1344,26 @@ void RegExpBuilder::AddQuantifierToAtom(
|
||||
} else if (terms_.length() > 0) {
|
||||
DCHECK(last_added_ == ADD_ATOM);
|
||||
atom = terms_.RemoveLast();
|
||||
// With /u, lookarounds are not quantifiable.
|
||||
if (unicode() && atom->IsLookaround()) return false;
|
||||
if (atom->max_match() == 0) {
|
||||
// Guaranteed to only match an empty string.
|
||||
LAST(ADD_TERM);
|
||||
if (min == 0) {
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
terms_.Add(atom, zone());
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
// Only call immediately after adding an atom or character!
|
||||
UNREACHABLE();
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
|
||||
zone());
|
||||
LAST(ADD_TERM);
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
|
@ -111,7 +111,7 @@ class RegExpBuilder : public ZoneObject {
|
||||
void AddTerm(RegExpTree* tree);
|
||||
void AddAssertion(RegExpTree* tree);
|
||||
void NewAlternative(); // '|'
|
||||
void AddQuantifierToAtom(int min, int max,
|
||||
bool AddQuantifierToAtom(int min, int max,
|
||||
RegExpQuantifier::QuantifierType type);
|
||||
RegExpTree* ToRegExp();
|
||||
|
||||
@ -198,7 +198,7 @@ class RegExpParser BASE_EMBEDDED {
|
||||
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
|
||||
bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
|
||||
|
||||
static bool IsSyntaxCharacter(uc32 c);
|
||||
static bool IsSyntaxCharacterOrSlash(uc32 c);
|
||||
|
||||
static const int kMaxCaptures = 1 << 16;
|
||||
static const uc32 kEndMarker = (1 << 21);
|
||||
|
34
test/mjsunit/harmony/unicode-regexp-restricted-syntax.js
Normal file
34
test/mjsunit/harmony/unicode-regexp-restricted-syntax.js
Normal file
@ -0,0 +1,34 @@
|
||||
// Copyright 2016 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// Flags: --harmony-unicode-regexps
|
||||
|
||||
// test262/data/test/language/literals/regexp/u-dec-esc
|
||||
assertThrows("/\\1/u");
|
||||
// test262/language/literals/regexp/u-invalid-char-range-a
|
||||
assertThrows("/[\\w-a]/u");
|
||||
// test262/language/literals/regexp/u-invalid-char-range-b
|
||||
assertThrows("/[a-\\w]/u");
|
||||
// test262/language/literals/regexp/u-invalid-char-esc
|
||||
assertThrows("/\\c/u");
|
||||
assertThrows("/\\c0/u");
|
||||
// test262/built-ins/RegExp/unicode_restricted_quantifiable_assertion
|
||||
assertThrows("/(?=.)*/u");
|
||||
// test262/built-ins/RegExp/unicode_restricted_octal_escape
|
||||
assertThrows("/[\\1]/u");
|
||||
assertThrows("/\\00/u");
|
||||
assertThrows("/\\09/u");
|
||||
// test262/built-ins/RegExp/unicode_restricted_identity_escape_alpha
|
||||
assertThrows("/[\\c]/u");
|
||||
// test262/built-ins/RegExp/unicode_restricted_identity_escape_c
|
||||
assertThrows("/[\\c0]/u");
|
||||
// test262/built-ins/RegExp/unicode_restricted_incomple_quantifier
|
||||
assertThrows("/a{/u");
|
||||
assertThrows("/a{1,/u");
|
||||
assertThrows("/{/u");
|
||||
assertThrows("/}/u");
|
||||
// test262/data/test/built-ins/RegExp/unicode_restricted_brackets
|
||||
assertThrows("/]/u");
|
||||
// test262/built-ins/RegExp/unicode_identity_escape
|
||||
/\//u;
|
Loading…
Reference in New Issue
Block a user