[regexp] Support unicode capture names in non-unicode patterns

This ensures that capture names containing surrogate pairs are parsed
correctly even in non-unicode RegExp patterns by introducing a new
scanning mode which unconditionally combines surrogate pairs.

BUG=v8:5437,v8:6192

Review-Url: https://codereview.chromium.org/2791163003
Cr-Commit-Position: refs/heads/master@{#44466}
This commit is contained in:
jgruber 2017-04-07 00:34:10 -07:00 committed by Commit bot
parent 57bef9a1e2
commit a8651c5671
3 changed files with 28 additions and 20 deletions

View File

@ -46,13 +46,13 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
Advance(); Advance();
} }
template <bool update_position> inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) {
inline uc32 RegExpParser::ReadNext() {
int position = next_pos_; int position = next_pos_;
uc32 c0 = in()->Get(position); uc32 c0 = in()->Get(position);
position++; position++;
// Read the whole surrogate pair in case of unicode flag, if possible. const bool try_combine_surrogate_pairs =
if (unicode() && position < in()->length() && (unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS);
if (try_combine_surrogate_pairs && position < in()->length() &&
unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
uc16 c1 = in()->Get(position); uc16 c1 = in()->Get(position);
if (unibrow::Utf16::IsTrailSurrogate(c1)) { if (unibrow::Utf16::IsTrailSurrogate(c1)) {
@ -67,14 +67,13 @@ inline uc32 RegExpParser::ReadNext() {
uc32 RegExpParser::Next() { uc32 RegExpParser::Next() {
if (has_next()) { if (has_next()) {
return ReadNext<false>(); return ReadNext(false, ScanMode::DEFAULT);
} else { } else {
return kEndMarker; return kEndMarker;
} }
} }
void RegExpParser::Advance(ScanMode mode) {
void RegExpParser::Advance() {
if (has_next()) { if (has_next()) {
StackLimitCheck check(isolate()); StackLimitCheck check(isolate());
if (check.HasOverflowed()) { if (check.HasOverflowed()) {
@ -84,7 +83,7 @@ void RegExpParser::Advance() {
} else if (zone()->excess_allocation()) { } else if (zone()->excess_allocation()) {
ReportError(CStrVector("Regular expression too large")); ReportError(CStrVector("Regular expression too large"));
} else { } else {
current_ = ReadNext<true>(); current_ = ReadNext(true, mode);
} }
} else { } else {
current_ = kEndMarker; current_ = kEndMarker;
@ -102,10 +101,9 @@ void RegExpParser::Reset(int pos) {
Advance(); Advance();
} }
void RegExpParser::Advance(int dist, ScanMode mode) {
void RegExpParser::Advance(int dist) {
next_pos_ += dist - 1; next_pos_ += dist - 1;
Advance(); Advance(mode);
} }
@ -329,7 +327,6 @@ RegExpTree* RegExpParser::ParseDisjunction() {
if (FLAG_harmony_regexp_named_captures) { if (FLAG_harmony_regexp_named_captures) {
has_named_captures_ = true; has_named_captures_ = true;
is_named_capture = true; is_named_capture = true;
Advance();
break; break;
} }
// Fall through. // Fall through.
@ -769,20 +766,26 @@ static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
DCHECK(FLAG_harmony_regexp_named_captures); DCHECK(FLAG_harmony_regexp_named_captures);
DCHECK_EQ(current(), '<');
ZoneVector<uc16>* name = ZoneVector<uc16>* name =
new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
// Capture names can always contain surrogate pairs, and we need to scan
// accordingly.
const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS;
Advance(scan_mode);
bool at_start = true; bool at_start = true;
while (true) { while (true) {
uc32 c = current(); uc32 c = current();
Advance(); Advance(scan_mode);
// Convert unicode escapes. // Convert unicode escapes.
if (c == '\\' && current() == 'u') { if (c == '\\' && current() == 'u') {
// TODO(jgruber): Reconsider this once the spec has settled. // TODO(jgruber): Reconsider this once the spec has settled.
// https://github.com/tc39/proposal-regexp-named-groups/issues/23 // https://github.com/tc39/proposal-regexp-named-groups/issues/23
Advance(); Advance(scan_mode);
if (!ParseUnicodeEscape(&c)) { if (!ParseUnicodeEscape(&c)) {
ReportError(CStrVector("Invalid Unicode escape sequence")); ReportError(CStrVector("Invalid Unicode escape sequence"));
return nullptr; return nullptr;
@ -853,7 +856,6 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
return false; return false;
} }
Advance();
const ZoneVector<uc16>* name = ParseCaptureGroupName(); const ZoneVector<uc16>* name = ParseCaptureGroupName();
if (name == nullptr) { if (name == nullptr) {
return false; return false;

View File

@ -184,11 +184,18 @@ class RegExpParser BASE_EMBEDDED {
// can be reparsed. // can be reparsed.
bool ParseBackReferenceIndex(int* index_out); bool ParseBackReferenceIndex(int* index_out);
// The default behavior is to combine surrogate pairs in unicode mode and
// not to combine them otherwise (a quantifier after a surrogate pair would
// then apply only to the trailing surrogate). Forcing combination is required
// when parsing capture names since they can always legally contain surrogate
// pairs.
enum class ScanMode { DEFAULT, FORCE_COMBINE_SURROGATE_PAIRS };
bool ParseClassProperty(ZoneList<CharacterRange>* result); bool ParseClassProperty(ZoneList<CharacterRange>* result);
CharacterRange ParseClassAtom(uc16* char_class); CharacterRange ParseClassAtom(uc16* char_class);
RegExpTree* ReportError(Vector<const char> message); RegExpTree* ReportError(Vector<const char> message);
void Advance(); void Advance(ScanMode mode = ScanMode::DEFAULT);
void Advance(int dist); void Advance(int dist, ScanMode mode = ScanMode::DEFAULT);
void Reset(int pos); void Reset(int pos);
// Reports whether the pattern might be used as a literal search string. // Reports whether the pattern might be used as a literal search string.
@ -304,8 +311,7 @@ class RegExpParser BASE_EMBEDDED {
bool has_more() { return has_more_; } bool has_more() { return has_more_; }
bool has_next() { return next_pos_ < in()->length(); } bool has_next() { return next_pos_ < in()->length(); }
uc32 Next(); uc32 Next();
template <bool update_position> uc32 ReadNext(bool update_position, ScanMode mode);
uc32 ReadNext();
FlatStringReader* in() { return in_; } FlatStringReader* in() { return in_; }
void ScanForCaptures(); void ScanForCaptures();

View File

@ -147,7 +147,7 @@ assertThrows('/(?<𐒤>a)/u', SyntaxError); // ID_Continue but not ID_Start.
assertEquals("a", /(?<π>a)/.exec("bab").groups.π); assertEquals("a", /(?<π>a)/.exec("bab").groups.π);
assertEquals("a", /(?<$>a)/.exec("bab").groups.$); assertEquals("a", /(?<$>a)/.exec("bab").groups.$);
assertEquals("a", /(?<_>a)/.exec("bab").groups._); assertEquals("a", /(?<_>a)/.exec("bab").groups._);
assertThrows("/(?<$𐒤>a)/", SyntaxError); assertEquals("a", /(?<$𐒤>a)/.exec("bab").groups.$𐒤);
assertEquals("a", /(?<ಠ_ಠ>a)/.exec("bab").groups.ಠ_ಠ); assertEquals("a", /(?<ಠ_ಠ>a)/.exec("bab").groups.ಠ_ಠ);
assertThrows('/(?<❤>a)/', SyntaxError); assertThrows('/(?<❤>a)/', SyntaxError);
assertThrows('/(?<𐒤>a)/', SyntaxError); // ID_Continue but not ID_Start. assertThrows('/(?<𐒤>a)/', SyntaxError); // ID_Continue but not ID_Start.