[regexp] Consider surrogate pairs when optimizing disjunctions
RationalizeConsecutiveAtoms optimizes ab|ac|az to a(?:b|c|z). Ensure that this optimization does not split surrogate pairs in unicode mode. BUG=chromium:641091 Review-Url: https://codereview.chromium.org/2813893002 Cr-Commit-Position: refs/heads/master@{#44599}
This commit is contained in:
parent
483812d46c
commit
4635572471
@ -3327,9 +3327,8 @@ TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
|
||||
RegExpNode* on_success) {
|
||||
DCHECK_NOT_NULL(ranges);
|
||||
ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
|
||||
elms->Add(
|
||||
TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
|
||||
zone);
|
||||
elms->Add(TextElement::CharClass(new (zone) RegExpCharacterClass(ranges)),
|
||||
zone);
|
||||
return new (zone) TextNode(elms, read_backward, on_success);
|
||||
}
|
||||
|
||||
@ -3341,12 +3340,12 @@ TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
|
||||
ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
|
||||
ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
|
||||
ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
|
||||
elms->Add(TextElement::CharClass(
|
||||
new (zone) RegExpCharacterClass(lead_ranges, false)),
|
||||
zone);
|
||||
elms->Add(TextElement::CharClass(
|
||||
new (zone) RegExpCharacterClass(trail_ranges, false)),
|
||||
zone);
|
||||
elms->Add(
|
||||
TextElement::CharClass(new (zone) RegExpCharacterClass(lead_ranges)),
|
||||
zone);
|
||||
elms->Add(
|
||||
TextElement::CharClass(new (zone) RegExpCharacterClass(trail_ranges)),
|
||||
zone);
|
||||
return new (zone) TextNode(elms, read_backward, on_success);
|
||||
}
|
||||
|
||||
@ -4851,7 +4850,7 @@ static bool CompareRanges(ZoneList<CharacterRange>* ranges,
|
||||
bool RegExpCharacterClass::is_standard(Zone* zone) {
|
||||
// TODO(lrn): Remove need for this function, by not throwing away information
|
||||
// along the way.
|
||||
if (is_negated_) {
|
||||
if (is_negated()) {
|
||||
return false;
|
||||
}
|
||||
if (set_.is_standard()) {
|
||||
@ -5144,7 +5143,8 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
|
||||
if (compiler->needs_unicode_case_equivalents()) {
|
||||
AddUnicodeCaseEquivalents(ranges, zone);
|
||||
}
|
||||
if (compiler->unicode() && !compiler->one_byte()) {
|
||||
if (compiler->unicode() && !compiler->one_byte() &&
|
||||
!contains_split_surrogate()) {
|
||||
if (is_negated()) {
|
||||
ZoneList<CharacterRange>* negated =
|
||||
new (zone) ZoneList<CharacterRange>(2, zone);
|
||||
@ -5154,7 +5154,7 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
|
||||
if (ranges->length() == 0) {
|
||||
ranges->Add(CharacterRange::Everything(), zone);
|
||||
RegExpCharacterClass* fail =
|
||||
new (zone) RegExpCharacterClass(ranges, true);
|
||||
new (zone) RegExpCharacterClass(ranges, NEGATED);
|
||||
return new (zone) TextNode(fail, compiler->read_backward(), on_success);
|
||||
}
|
||||
if (standard_type() == '*') {
|
||||
@ -5368,6 +5368,9 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions(
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
DCHECK(!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
|
||||
bool contains_trail_surrogate =
|
||||
unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
|
||||
int first_in_run = i;
|
||||
i++;
|
||||
while (i < length) {
|
||||
@ -5375,6 +5378,9 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions(
|
||||
if (!alternative->IsAtom()) break;
|
||||
atom = alternative->AsAtom();
|
||||
if (atom->length() != 1) break;
|
||||
DCHECK(!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
|
||||
contains_trail_surrogate |=
|
||||
unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
|
||||
i++;
|
||||
}
|
||||
if (i > first_in_run + 1) {
|
||||
@ -5387,8 +5393,12 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions(
|
||||
DCHECK_EQ(old_atom->length(), 1);
|
||||
ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
|
||||
}
|
||||
RegExpCharacterClass::Flags flags;
|
||||
if (compiler->unicode() && contains_trail_surrogate) {
|
||||
flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
|
||||
}
|
||||
alternatives->at(write_posn++) =
|
||||
new (zone) RegExpCharacterClass(ranges, false);
|
||||
new (zone) RegExpCharacterClass(ranges, flags);
|
||||
} else {
|
||||
// Just copy any trivial alternatives.
|
||||
for (int j = first_in_run; j < i; j++) {
|
||||
|
@ -291,9 +291,20 @@ class RegExpAssertion final : public RegExpTree {
|
||||
|
||||
class RegExpCharacterClass final : public RegExpTree {
|
||||
public:
|
||||
RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated)
|
||||
: set_(ranges), is_negated_(is_negated) {}
|
||||
explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {}
|
||||
// NEGATED: The character class is negated and should match everything but
|
||||
// the specified ranges.
|
||||
// CONTAINS_SPLIT_SURROGATE: The character class contains part of a split
|
||||
// surrogate and should not be unicode-desugared (crbug.com/641091).
|
||||
enum Flag {
|
||||
NEGATED = 1 << 0,
|
||||
CONTAINS_SPLIT_SURROGATE = 1 << 1,
|
||||
};
|
||||
typedef base::Flags<Flag> Flags;
|
||||
|
||||
explicit RegExpCharacterClass(ZoneList<CharacterRange>* ranges,
|
||||
Flags flags = Flags())
|
||||
: set_(ranges), flags_(flags) {}
|
||||
explicit RegExpCharacterClass(uc16 type) : set_(type), flags_(0) {}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpCharacterClass* AsCharacterClass() override;
|
||||
@ -322,11 +333,14 @@ class RegExpCharacterClass final : public RegExpTree {
|
||||
// * : All characters, for advancing unanchored regexp
|
||||
uc16 standard_type() { return set_.standard_set_type(); }
|
||||
ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
|
||||
bool is_negated() { return is_negated_; }
|
||||
bool is_negated() const { return (flags_ & NEGATED) != 0; }
|
||||
bool contains_split_surrogate() const {
|
||||
return (flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
|
||||
}
|
||||
|
||||
private:
|
||||
CharacterSet set_;
|
||||
bool is_negated_;
|
||||
const Flags flags_;
|
||||
};
|
||||
|
||||
|
||||
|
@ -283,8 +283,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
||||
CharacterRange::AddClassEscape('.', ranges, false, zone());
|
||||
}
|
||||
|
||||
RegExpCharacterClass* cc =
|
||||
new (zone()) RegExpCharacterClass(ranges, false);
|
||||
RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(ranges);
|
||||
builder->AddCharacterClass(cc);
|
||||
break;
|
||||
}
|
||||
@ -392,7 +391,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
||||
CharacterRange::AddClassEscape(c, ranges,
|
||||
unicode() && ignore_case(), zone());
|
||||
RegExpCharacterClass* cc =
|
||||
new (zone()) RegExpCharacterClass(ranges, false);
|
||||
new (zone()) RegExpCharacterClass(ranges);
|
||||
builder->AddCharacterClass(cc);
|
||||
break;
|
||||
}
|
||||
@ -408,7 +407,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
||||
return ReportError(CStrVector("Invalid property name"));
|
||||
}
|
||||
RegExpCharacterClass* cc =
|
||||
new (zone()) RegExpCharacterClass(ranges, false);
|
||||
new (zone()) RegExpCharacterClass(ranges);
|
||||
builder->AddCharacterClass(cc);
|
||||
} else {
|
||||
// With /u, no identity escapes except for syntax characters
|
||||
@ -1548,7 +1547,9 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
|
||||
ranges->Add(CharacterRange::Everything(), zone());
|
||||
is_negated = !is_negated;
|
||||
}
|
||||
return new (zone()) RegExpCharacterClass(ranges, is_negated);
|
||||
RegExpCharacterClass::Flags flags;
|
||||
if (is_negated) flags = RegExpCharacterClass::NEGATED;
|
||||
return new (zone()) RegExpCharacterClass(ranges, flags);
|
||||
}
|
||||
|
||||
|
||||
@ -1722,7 +1723,7 @@ void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
|
||||
|
||||
void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
|
||||
AddTerm(new (zone()) RegExpCharacterClass(
|
||||
CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
|
||||
CharacterRange::List(zone(), CharacterRange::Singleton(c))));
|
||||
}
|
||||
|
||||
|
||||
|
15
test/mjsunit/regress/regress-641091.js
Normal file
15
test/mjsunit/regress/regress-641091.js
Normal file
@ -0,0 +1,15 @@
|
||||
// Copyright 2017 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
assertEquals(["🍤", "🍤"],
|
||||
'🍤🍦🍋ππ🍋🍦🍤'.match(/🍤/ug));
|
||||
|
||||
assertEquals(["🍤", "🍦", "🍦", "🍤"],
|
||||
'🍤🍦🍋ππ🍋🍦🍤'.match(/🍤|🍦/ug));
|
||||
|
||||
assertEquals(["🍤", "🍦", "🍋", "🍋", "🍦", "🍤"],
|
||||
'🍤🍦🍋ππ🍋🍦🍤'.match(/🍤|🍦|🍋/ug));
|
||||
|
||||
assertEquals(["🍤", "🍦", "🍋", "π", "π", "🍋", "🍦", "🍤"],
|
||||
'🍤🍦🍋ππ🍋🍦🍤'.match(/🍤|🍦|π|🍋/ug));
|
Loading…
Reference in New Issue
Block a user