[regexp] restrict pattern syntax for unicode mode.

ES2015 Annex B.1.4 specifies a restricted pattern language for unicode
mode. This change reflects that, based on some test262 test cases.

R=littledan@chromium.org
BUG=v8:2952
LOG=N

Committed: https://crrev.com/e918c4ec464456a374098049ca22eac2107f6223
Cr-Commit-Position: refs/heads/master@{#33584}

Review URL: https://codereview.chromium.org/1645573002

Cr-Commit-Position: refs/heads/master@{#33603}
This commit is contained in:
yangguo 2016-01-29 01:21:09 -08:00 committed by Commit bot
parent b6c9b70356
commit bb6a53573c
3 changed files with 152 additions and 75 deletions

View File

@ -102,11 +102,28 @@ void RegExpParser::Advance(int dist) {
bool RegExpParser::simple() { return simple_; }
bool RegExpParser::IsSyntaxCharacter(uc32 c) {
return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
c == '{' || c == '}' || c == '|';
// Returns true if |c| is one of the regexp syntax characters
// (^ $ \ . * + ? ( ) [ ] { } |) or the forward slash '/', and false for
// every other code point. The escape parsers in this file use it to decide
// which identity escapes remain valid when the /u flag is set.
bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
switch (c) {
case '^':
case '$':
case '\\':
case '.':
case '*':
case '+':
case '?':
case '(':
case ')':
case '[':
case ']':
case '{':
case '}':
case '|':
case '/':
return true;
default:
break;
}
return false;
}
@ -161,14 +178,14 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case kEndMarker:
if (state->IsSubexpression()) {
// Inside a parenthesized group when hitting end of input.
ReportError(CStrVector("Unterminated group") CHECK_FAILED);
return ReportError(CStrVector("Unterminated group"));
}
DCHECK_EQ(INITIAL, state->group_type());
// Parsing completed successfully.
return builder->ToRegExp();
case ')': {
if (!state->IsSubexpression()) {
ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
return ReportError(CStrVector("Unmatched ')'"));
}
DCHECK_NE(INITIAL, state->group_type());
@ -276,13 +293,12 @@ RegExpTree* RegExpParser::ParseDisjunction() {
}
// Fall through.
default:
ReportError(CStrVector("Invalid group") CHECK_FAILED);
break;
return ReportError(CStrVector("Invalid group"));
}
Advance(2);
} else {
if (captures_started_ >= kMaxCaptures) {
ReportError(CStrVector("Too many captures") CHECK_FAILED);
return ReportError(CStrVector("Too many captures"));
}
captures_started_++;
}
@ -360,24 +376,25 @@ RegExpTree* RegExpParser::ParseDisjunction() {
}
break;
}
// With /u, no identity escapes except for syntax characters
// are allowed. Otherwise, all identity escapes are allowed.
if (unicode()) {
return ReportError(CStrVector("Invalid escape"));
}
uc32 first_digit = Next();
if (first_digit == '8' || first_digit == '9') {
// If the 'u' flag is present, only syntax characters can be
// escaped,
// no other identity escapes are allowed. If the 'u' flag is not
// present, all identity escapes are allowed.
if (!unicode()) {
builder->AddCharacter(first_digit);
Advance(2);
} else {
return ReportError(CStrVector("Invalid escape"));
}
builder->AddCharacter(first_digit);
Advance(2);
break;
}
}
// FALLTHROUGH
case '0': {
Advance();
if (unicode() && Next() >= '0' && Next() <= '9') {
// With /u, decimal escapes with a leading 0 are not parsed as octal.
return ReportError(CStrVector("Invalid decimal escape"));
}
uc32 octal = ParseOctalLiteral();
builder->AddCharacter(octal);
break;
@ -415,6 +432,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {
// This is outside the specification. We match JSC in
// reading the backslash as a literal character instead
// of as starting an escape.
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
return ReportError(CStrVector("Invalid unicode escape"));
}
builder->AddCharacter('\\');
} else {
Advance(2);
@ -430,8 +451,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
} else if (!unicode()) {
builder->AddCharacter('x');
} else {
// If the 'u' flag is present, invalid escapes are not treated as
// identity escapes.
// With /u, invalid escapes are not treated as identity escapes.
return ReportError(CStrVector("Invalid escape"));
}
break;
@ -444,20 +464,16 @@ RegExpTree* RegExpParser::ParseDisjunction() {
} else if (!unicode()) {
builder->AddCharacter('u');
} else {
// If the 'u' flag is present, invalid escapes are not treated as
// identity escapes.
// With /u, invalid escapes are not treated as identity escapes.
return ReportError(CStrVector("Invalid unicode escape"));
}
break;
}
default:
Advance();
// If the 'u' flag is present, only syntax characters can be
// escaped, no
// other identity escapes are allowed. If the 'u' flag is not
// present,
// all identity escapes are allowed.
if (!unicode() || IsSyntaxCharacter(current())) {
// With /u, no identity escapes except for syntax characters
// are allowed. Otherwise, all identity escapes are allowed.
if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
builder->AddCharacter(current());
Advance();
} else {
@ -469,10 +485,16 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '{': {
int dummy;
if (ParseIntervalQuantifier(&dummy, &dummy)) {
ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
return ReportError(CStrVector("Nothing to repeat"));
}
// fallthrough
}
case '}':
case ']':
if (unicode()) {
return ReportError(CStrVector("Lone quantifier brackets"));
}
// fallthrough
default:
builder->AddUnicodeCharacter(current());
Advance();
@ -505,13 +527,15 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '{':
if (ParseIntervalQuantifier(&min, &max)) {
if (max < min) {
ReportError(CStrVector("numbers out of order in {} quantifier.")
CHECK_FAILED);
return ReportError(
CStrVector("numbers out of order in {} quantifier"));
}
break;
} else {
continue;
} else if (unicode()) {
// With /u, incomplete quantifiers are not allowed.
return ReportError(CStrVector("Incomplete quantifier"));
}
continue;
default:
continue;
}
@ -524,7 +548,9 @@ RegExpTree* RegExpParser::ParseDisjunction() {
quantifier_type = RegExpQuantifier::POSSESSIVE;
Advance();
}
builder->AddQuantifierToAtom(min, max, quantifier_type);
if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
return ReportError(CStrVector("Invalid quantifier"));
}
}
}
@ -822,15 +848,24 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
case 'c': {
uc32 controlLetter = Next();
uc32 letter = controlLetter & ~('A' ^ 'a');
// For compatibility with JSC, inside a character class
// we also accept digits and underscore as control characters.
if ((controlLetter >= '0' && controlLetter <= '9') ||
controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
// For compatibility with JSC, inside a character class we also accept
// digits and underscore as control characters, unless with /u.
if (letter >= 'A' && letter <= 'Z') {
Advance(2);
// Control letters mapped to ASCII control characters in the range
// 0x00-0x1f.
return controlLetter & 0x1f;
}
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
ReportError(CStrVector("Invalid class escape"));
return 0;
}
if ((controlLetter >= '0' && controlLetter <= '9') ||
controlLetter == '_') {
Advance(2);
return controlLetter & 0x1f;
}
// We match JSC in reading the backslash as a literal
// character instead of as starting an escape.
return '\\';
@ -846,43 +881,43 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
// For compatibility, we interpret a decimal escape that isn't
// a back reference (and therefore either \0 or not valid according
// to the specification) as a 1..3 digit octal character code.
if (unicode()) {
// With /u, decimal escape is not interpreted as octal character code.
ReportError(CStrVector("Invalid class escape"));
return 0;
}
return ParseOctalLiteral();
case 'x': {
Advance();
uc32 value;
if (ParseHexEscape(2, &value)) {
return value;
if (ParseHexEscape(2, &value)) return value;
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
ReportError(CStrVector("Invalid escape"));
return 0;
}
if (!unicode()) {
// If \x is not followed by a two-digit hexadecimal, treat it
// as an identity escape.
return 'x';
}
// If the 'u' flag is present, invalid escapes are not treated as
// identity escapes.
ReportError(CStrVector("Invalid escape"));
return 0;
// If \x is not followed by a two-digit hexadecimal, treat it
// as an identity escape.
return 'x';
}
case 'u': {
Advance();
uc32 value;
if (ParseUnicodeEscape(&value)) {
return value;
if (ParseUnicodeEscape(&value)) return value;
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
ReportError(CStrVector("Invalid unicode escape"));
return 0;
}
if (!unicode()) {
return 'u';
}
// If the 'u' flag is present, invalid escapes are not treated as
// identity escapes.
ReportError(CStrVector("Invalid unicode escape"));
return 0;
// If \u is not followed by a valid unicode escape sequence, treat it
// as an identity escape.
return 'u';
}
default: {
uc32 result = current();
// If the 'u' flag is present, only syntax characters can be escaped, no
// other identity escapes are allowed. If the 'u' flag is not present, all
// identity escapes are allowed.
if (!unicode() || IsSyntaxCharacter(result)) {
// With /u, no identity escapes except for syntax characters are
// allowed. Otherwise, all identity escapes are allowed.
if (!unicode() || IsSyntaxCharacterOrSlash(result)) {
Advance();
return result;
}
@ -956,6 +991,7 @@ static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
RegExpTree* RegExpParser::ParseCharacterClass() {
static const char* kUnterminated = "Unterminated character class";
static const char* kRangeInvalid = "Invalid character class";
static const char* kRangeOutOfOrder = "Range out of order in character class";
DCHECK_EQ(current(), '[');
@ -985,13 +1021,18 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
// Either end is an escaped character class. Treat the '-' verbatim.
if (unicode()) {
// ES2015 21.2.2.15.1 step 1.
return ReportError(CStrVector(kRangeInvalid));
}
AddRangeOrEscape(ranges, char_class, first, zone());
ranges->Add(CharacterRange::Singleton('-'), zone());
AddRangeOrEscape(ranges, char_class_2, next, zone());
continue;
}
// ES2015 21.2.2.15.1 step 6.
if (first.from() > next.to()) {
return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
return ReportError(CStrVector(kRangeOutOfOrder));
}
ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
} else {
@ -999,7 +1040,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
}
}
if (!has_more()) {
return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
return ReportError(CStrVector(kUnterminated));
}
Advance();
if (ranges->length() == 0) {
@ -1162,7 +1203,7 @@ void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
if (NeedsDesugaringForUnicode(cc)) {
// In unicode mode, character class needs to be desugared, so it
// With /u, character class needs to be desugared, so it
// must be a standalone term instead of being part of a RegExpText.
AddTerm(cc);
} else {
@ -1275,13 +1316,12 @@ RegExpTree* RegExpBuilder::ToRegExp() {
return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
}
void RegExpBuilder::AddQuantifierToAtom(
bool RegExpBuilder::AddQuantifierToAtom(
int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
FlushPendingSurrogate();
if (pending_empty_) {
pending_empty_ = false;
return;
return true;
}
RegExpTree* atom;
if (characters_ != NULL) {
@ -1304,23 +1344,26 @@ void RegExpBuilder::AddQuantifierToAtom(
} else if (terms_.length() > 0) {
DCHECK(last_added_ == ADD_ATOM);
atom = terms_.RemoveLast();
// With /u, lookarounds are not quantifiable.
if (unicode() && atom->IsLookaround()) return false;
if (atom->max_match() == 0) {
// Guaranteed to only match an empty string.
LAST(ADD_TERM);
if (min == 0) {
return;
return true;
}
terms_.Add(atom, zone());
return;
return true;
}
} else {
// Only call immediately after adding an atom or character!
UNREACHABLE();
return;
return false;
}
terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
zone());
LAST(ADD_TERM);
return true;
}
} // namespace internal

View File

@ -111,7 +111,7 @@ class RegExpBuilder : public ZoneObject {
void AddTerm(RegExpTree* tree);
void AddAssertion(RegExpTree* tree);
void NewAlternative(); // '|'
void AddQuantifierToAtom(int min, int max,
bool AddQuantifierToAtom(int min, int max,
RegExpQuantifier::QuantifierType type);
RegExpTree* ToRegExp();
@ -198,7 +198,7 @@ class RegExpParser BASE_EMBEDDED {
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
static bool IsSyntaxCharacter(uc32 c);
static bool IsSyntaxCharacterOrSlash(uc32 c);
static const int kMaxCaptures = 1 << 16;
static const uc32 kEndMarker = (1 << 21);

View File

@ -0,0 +1,34 @@
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-unicode-regexps
// Tests for the restricted regexp pattern syntax that applies with the /u
// flag. Each string passed to assertThrows below holds a regexp literal with
// the u flag that is expected to be rejected; the comment above each group
// names the test262 case it was derived from.
// test262/data/test/language/literals/regexp/u-dec-esc
assertThrows("/\\1/u");
// test262/language/literals/regexp/u-invalid-char-range-a
assertThrows("/[\\w-a]/u");
// test262/language/literals/regexp/u-invalid-char-range-b
assertThrows("/[a-\\w]/u");
// test262/language/literals/regexp/u-invalid-char-esc
assertThrows("/\\c/u");
assertThrows("/\\c0/u");
// test262/built-ins/RegExp/unicode_restricted_quantifiable_assertion
assertThrows("/(?=.)*/u");
// test262/built-ins/RegExp/unicode_restricted_octal_escape
assertThrows("/[\\1]/u");
assertThrows("/\\00/u");
assertThrows("/\\09/u");
// test262/built-ins/RegExp/unicode_restricted_identity_escape_alpha
assertThrows("/[\\c]/u");
// test262/built-ins/RegExp/unicode_restricted_identity_escape_c
assertThrows("/[\\c0]/u");
// test262/built-ins/RegExp/unicode_restricted_incomple_quantifier
assertThrows("/a{/u");
assertThrows("/a{1,/u");
assertThrows("/{/u");
assertThrows("/}/u");
// test262/data/test/built-ins/RegExp/unicode_restricted_brackets
assertThrows("/]/u");
// test262/built-ins/RegExp/unicode_identity_escape
// Positive case: an escaped forward slash is still a valid identity escape
// with /u, so this literal must parse without throwing.
/\//u;