[regexp] Support unicode capture names in non-unicode patterns
This ensures that capture names containing surrogate pairs are parsed correctly even in non-unicode RegExp patterns by introducing a new scanning mode which unconditionally combines surrogate pairs.

BUG=v8:5437,v8:6192
Review-Url: https://codereview.chromium.org/2791163003
Cr-Commit-Position: refs/heads/master@{#44466}
This commit is contained in:
parent
57bef9a1e2
commit
a8651c5671
@ -46,13 +46,13 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
|
|||||||
Advance();
|
Advance();
|
||||||
}
|
}
|
||||||
|
|
||||||
template <bool update_position>
|
inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) {
|
||||||
inline uc32 RegExpParser::ReadNext() {
|
|
||||||
int position = next_pos_;
|
int position = next_pos_;
|
||||||
uc32 c0 = in()->Get(position);
|
uc32 c0 = in()->Get(position);
|
||||||
position++;
|
position++;
|
||||||
// Read the whole surrogate pair in case of unicode flag, if possible.
|
const bool try_combine_surrogate_pairs =
|
||||||
if (unicode() && position < in()->length() &&
|
(unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS);
|
||||||
|
if (try_combine_surrogate_pairs && position < in()->length() &&
|
||||||
unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
|
unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
|
||||||
uc16 c1 = in()->Get(position);
|
uc16 c1 = in()->Get(position);
|
||||||
if (unibrow::Utf16::IsTrailSurrogate(c1)) {
|
if (unibrow::Utf16::IsTrailSurrogate(c1)) {
|
||||||
@ -67,14 +67,13 @@ inline uc32 RegExpParser::ReadNext() {
|
|||||||
|
|
||||||
uc32 RegExpParser::Next() {
|
uc32 RegExpParser::Next() {
|
||||||
if (has_next()) {
|
if (has_next()) {
|
||||||
return ReadNext<false>();
|
return ReadNext(false, ScanMode::DEFAULT);
|
||||||
} else {
|
} else {
|
||||||
return kEndMarker;
|
return kEndMarker;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void RegExpParser::Advance(ScanMode mode) {
|
||||||
void RegExpParser::Advance() {
|
|
||||||
if (has_next()) {
|
if (has_next()) {
|
||||||
StackLimitCheck check(isolate());
|
StackLimitCheck check(isolate());
|
||||||
if (check.HasOverflowed()) {
|
if (check.HasOverflowed()) {
|
||||||
@ -84,7 +83,7 @@ void RegExpParser::Advance() {
|
|||||||
} else if (zone()->excess_allocation()) {
|
} else if (zone()->excess_allocation()) {
|
||||||
ReportError(CStrVector("Regular expression too large"));
|
ReportError(CStrVector("Regular expression too large"));
|
||||||
} else {
|
} else {
|
||||||
current_ = ReadNext<true>();
|
current_ = ReadNext(true, mode);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
current_ = kEndMarker;
|
current_ = kEndMarker;
|
||||||
@ -102,10 +101,9 @@ void RegExpParser::Reset(int pos) {
|
|||||||
Advance();
|
Advance();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void RegExpParser::Advance(int dist, ScanMode mode) {
|
||||||
void RegExpParser::Advance(int dist) {
|
|
||||||
next_pos_ += dist - 1;
|
next_pos_ += dist - 1;
|
||||||
Advance();
|
Advance(mode);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -329,7 +327,6 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
|||||||
if (FLAG_harmony_regexp_named_captures) {
|
if (FLAG_harmony_regexp_named_captures) {
|
||||||
has_named_captures_ = true;
|
has_named_captures_ = true;
|
||||||
is_named_capture = true;
|
is_named_capture = true;
|
||||||
Advance();
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// Fall through.
|
// Fall through.
|
||||||
@ -769,20 +766,26 @@ static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
|
|||||||
|
|
||||||
const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
|
const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
|
||||||
DCHECK(FLAG_harmony_regexp_named_captures);
|
DCHECK(FLAG_harmony_regexp_named_captures);
|
||||||
|
DCHECK_EQ(current(), '<');
|
||||||
|
|
||||||
ZoneVector<uc16>* name =
|
ZoneVector<uc16>* name =
|
||||||
new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
|
new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
|
||||||
|
|
||||||
|
// Capture names can always contain surrogate pairs, and we need to scan
|
||||||
|
// accordingly.
|
||||||
|
const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS;
|
||||||
|
Advance(scan_mode);
|
||||||
|
|
||||||
bool at_start = true;
|
bool at_start = true;
|
||||||
while (true) {
|
while (true) {
|
||||||
uc32 c = current();
|
uc32 c = current();
|
||||||
Advance();
|
Advance(scan_mode);
|
||||||
|
|
||||||
// Convert unicode escapes.
|
// Convert unicode escapes.
|
||||||
if (c == '\\' && current() == 'u') {
|
if (c == '\\' && current() == 'u') {
|
||||||
// TODO(jgruber): Reconsider this once the spec has settled.
|
// TODO(jgruber): Reconsider this once the spec has settled.
|
||||||
// https://github.com/tc39/proposal-regexp-named-groups/issues/23
|
// https://github.com/tc39/proposal-regexp-named-groups/issues/23
|
||||||
Advance();
|
Advance(scan_mode);
|
||||||
if (!ParseUnicodeEscape(&c)) {
|
if (!ParseUnicodeEscape(&c)) {
|
||||||
ReportError(CStrVector("Invalid Unicode escape sequence"));
|
ReportError(CStrVector("Invalid Unicode escape sequence"));
|
||||||
return nullptr;
|
return nullptr;
|
||||||
@ -853,7 +856,6 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
Advance();
|
|
||||||
const ZoneVector<uc16>* name = ParseCaptureGroupName();
|
const ZoneVector<uc16>* name = ParseCaptureGroupName();
|
||||||
if (name == nullptr) {
|
if (name == nullptr) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -184,11 +184,18 @@ class RegExpParser BASE_EMBEDDED {
|
|||||||
// can be reparsed.
|
// can be reparsed.
|
||||||
bool ParseBackReferenceIndex(int* index_out);
|
bool ParseBackReferenceIndex(int* index_out);
|
||||||
|
|
||||||
|
// The default behavior is to combine surrogate pairs in unicode mode and
|
||||||
|
// not to combine them otherwise (a quantifier after a surrogate pair would
|
||||||
|
// then apply only to the trailing surrogate). Forcing combination is required
|
||||||
|
// when parsing capture names since they can always legally contain surrogate
|
||||||
|
// pairs.
|
||||||
|
enum class ScanMode { DEFAULT, FORCE_COMBINE_SURROGATE_PAIRS };
|
||||||
|
|
||||||
bool ParseClassProperty(ZoneList<CharacterRange>* result);
|
bool ParseClassProperty(ZoneList<CharacterRange>* result);
|
||||||
CharacterRange ParseClassAtom(uc16* char_class);
|
CharacterRange ParseClassAtom(uc16* char_class);
|
||||||
RegExpTree* ReportError(Vector<const char> message);
|
RegExpTree* ReportError(Vector<const char> message);
|
||||||
void Advance();
|
void Advance(ScanMode mode = ScanMode::DEFAULT);
|
||||||
void Advance(int dist);
|
void Advance(int dist, ScanMode mode = ScanMode::DEFAULT);
|
||||||
void Reset(int pos);
|
void Reset(int pos);
|
||||||
|
|
||||||
// Reports whether the pattern might be used as a literal search string.
|
// Reports whether the pattern might be used as a literal search string.
|
||||||
@ -304,8 +311,7 @@ class RegExpParser BASE_EMBEDDED {
|
|||||||
bool has_more() { return has_more_; }
|
bool has_more() { return has_more_; }
|
||||||
bool has_next() { return next_pos_ < in()->length(); }
|
bool has_next() { return next_pos_ < in()->length(); }
|
||||||
uc32 Next();
|
uc32 Next();
|
||||||
template <bool update_position>
|
uc32 ReadNext(bool update_position, ScanMode mode);
|
||||||
uc32 ReadNext();
|
|
||||||
FlatStringReader* in() { return in_; }
|
FlatStringReader* in() { return in_; }
|
||||||
void ScanForCaptures();
|
void ScanForCaptures();
|
||||||
|
|
||||||
|
@ -147,7 +147,7 @@ assertThrows('/(?<𐒤>a)/u', SyntaxError); // ID_Continue but not ID_Start.
|
|||||||
assertEquals("a", /(?<π>a)/.exec("bab").groups.π);
|
assertEquals("a", /(?<π>a)/.exec("bab").groups.π);
|
||||||
assertEquals("a", /(?<$>a)/.exec("bab").groups.$);
|
assertEquals("a", /(?<$>a)/.exec("bab").groups.$);
|
||||||
assertEquals("a", /(?<_>a)/.exec("bab").groups._);
|
assertEquals("a", /(?<_>a)/.exec("bab").groups._);
|
||||||
assertThrows("/(?<$𐒤>a)/", SyntaxError);
|
assertEquals("a", /(?<$𐒤>a)/.exec("bab").groups.$𐒤);
|
||||||
assertEquals("a", /(?<ಠ_ಠ>a)/.exec("bab").groups.ಠ_ಠ);
|
assertEquals("a", /(?<ಠ_ಠ>a)/.exec("bab").groups.ಠ_ಠ);
|
||||||
assertThrows('/(?<❤>a)/', SyntaxError);
|
assertThrows('/(?<❤>a)/', SyntaxError);
|
||||||
assertThrows('/(?<𐒤>a)/', SyntaxError); // ID_Continue but not ID_Start.
|
assertThrows('/(?<𐒤>a)/', SyntaxError); // ID_Continue but not ID_Start.
|
||||||
|
Loading…
Reference in New Issue
Block a user