[regexp] A quantifier applies to the whole surrogate pair in unicode regexps.

R=rossberg@chromium.org

Review URL: https://codereview.chromium.org/1571563003

Cr-Commit-Position: refs/heads/master@{#33209}
This commit is contained in:
yangguo 2016-01-11 07:06:57 -08:00 committed by Commit bot
parent 67f99ee102
commit 8645a5ccd0
4 changed files with 38 additions and 8 deletions

View File

@ -56,6 +56,16 @@ void RegExpParser::Advance() {
} else {
current_ = in()->Get(next_pos_);
next_pos_++;
// Read the whole surrogate pair in case of unicode flag, if possible.
if (unicode_ && next_pos_ < in()->length() &&
unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {
uc16 trail = in()->Get(next_pos_);
if (unibrow::Utf16::IsTrailSurrogate(trail)) {
current_ = unibrow::Utf16::CombineSurrogatePair(
static_cast<uc16>(current_), trail);
next_pos_++;
}
}
}
} else {
current_ = kEndMarker;
@ -417,12 +427,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
Advance(2);
uc32 value;
if (ParseUnicodeEscape(&value)) {
if (value > unibrow::Utf16::kMaxNonSurrogateCharCode) {
builder->AddCharacter(unibrow::Utf16::LeadSurrogate(value));
builder->AddCharacter(unibrow::Utf16::TrailSurrogate(value));
} else {
builder->AddCharacter(static_cast<uc16>(value));
}
builder->AddUnicodeCharacter(value);
} else if (!unicode_) {
builder->AddCharacter('u');
} else {
@ -456,7 +461,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
// fallthrough
}
default:
builder->AddCharacter(current());
builder->AddUnicodeCharacter(current());
Advance();
break;
} // end switch(current())
@ -1057,6 +1062,19 @@ void RegExpBuilder::AddCharacter(uc16 c) {
}
// Adds the code point |c| to the expression being built.
//
// Values up to unibrow::Utf16::kMaxNonSurrogateCharCode are narrowed to uc16
// and forwarded to AddCharacter(). Larger values are split into their lead and
// trail surrogates, which are handed to AddAtom() together as one RegExpAtom.
void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
  // Common case first: the value fits in a single UTF-16 code unit.
  if (c <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
    AddCharacter(static_cast<uc16>(c));
    return;
  }

  // Build the two-unit sequence and wrap it in a single atom.
  ZoneList<uc16> code_units(2, zone());
  code_units.Add(unibrow::Utf16::LeadSurrogate(c), zone());
  code_units.Add(unibrow::Utf16::TrailSurrogate(c), zone());
  AddAtom(new (zone()) RegExpAtom(code_units.ToConstVector()));
}
void RegExpBuilder::AddEmpty() { pending_empty_ = true; }

View File

@ -101,6 +101,7 @@ class RegExpBuilder : public ZoneObject {
public:
explicit RegExpBuilder(Zone* zone);
void AddCharacter(uc16 character);
void AddUnicodeCharacter(uc32 character);
// "Adds" an empty expression. Does nothing except consume a
// following quantifier
void AddEmpty();

View File

@ -309,9 +309,12 @@ void TestRegExpParser(bool lookbehind) {
// Unicode regexps
CheckParseEq("\\u{12345}", "'\\ud808\\udf45'", true);
CheckParseEq("\\u{12345}\\u{23456}", "'\\ud808\\udf45\\ud84d\\udc56'", true);
CheckParseEq("\\u{12345}\\u{23456}", "(! '\\ud808\\udf45' '\\ud84d\\udc56')",
true);
CheckParseEq("\\u{12345}|\\u{23456}", "(| '\\ud808\\udf45' '\\ud84d\\udc56')",
true);
CheckParseEq("\\u{12345}{3}", "(# 3 3 g '\\ud808\\udf45')", true);
CheckParseEq("\\u{12345}*", "(# 0 - g '\\ud808\\udf45')", true);
CHECK_SIMPLE("", false);
CHECK_SIMPLE("a", true);

View File

@ -247,3 +247,11 @@ assertTrue(/(\u{12345}|\u{23456}).\1/u.test("\u{12345}b\u{12345}"));
assertFalse(new RegExp("(\u{12345}|\u{23456}).\\1", "u").test(
"\u{12345}b\u{23456}"));
assertFalse(/(\u{12345}|\u{23456}).\1/u.test("\u{12345}b\u{23456}"));
// Quantifier.
// With the "u" flag, \u{12345}{3} is expected to match three repetitions of
// the full code point U+12345 (the surrogate pair \ud808\udf45).
assertTrue(new RegExp("\u{12345}{3}", "u").test("\u{12345}\u{12345}\u{12345}"));
assertTrue(/\u{12345}{3}/u.test("\u{12345}\u{12345}\u{12345}"));
// Without the "u" flag, a subject in which only the trail unit \udf45 is
// repeated is expected to match.
assertTrue(new RegExp("\u{12345}{3}").test("\u{12345}\udf45\udf45"));
// NOTE(review): the pair is written here as two separate \uXXXX escapes under
// the "u" flag, and the test expects the trail-unit-repeated subject to match
// -- confirm this is the intended behavior.
assertTrue(/\ud808\udf45{3}/u.test("\u{12345}\udf45\udf45"));
// With the "u" flag and the \u{...} form, the same trail-unit-repeated
// subject must not match.
assertFalse(new RegExp("\u{12345}{3}", "u").test("\u{12345}\udf45\udf45"));
assertFalse(/\u{12345}{3}/u.test("\u{12345}\udf45\udf45"));