[regexp] Handle empty nested classes correctly

With the recent introduction of unicode sets (v-flag), nested character
classes are allowed in regular expressions.
We always expect a nested class to be of type
`RegExpClassSetExpression`, but the empty nested class was not handled
correctly.

Bug: v8:11935, chromium:1412942
Change-Id: I3b644c8627d8fc6b320a419216372810e8003983
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4224311
Reviewed-by: Jakob Linke <jgruber@chromium.org>
Commit-Queue: Patrick Thier <pthier@chromium.org>
Cr-Commit-Position: refs/heads/main@{#85680}
This commit is contained in:
pthier 2023-02-06 14:16:23 +01:00 committed by V8 LUCI CQ
parent 8c4779241a
commit ee93bc8035
5 changed files with 73 additions and 24 deletions

View File

@ -200,10 +200,12 @@ void* RegExpUnparser::VisitClassSetOperand(RegExpClassSetOperand* that,
if (i > 0) os_ << " "; if (i > 0) os_ << " ";
VisitCharacterRange(that->ranges()->at(i)); VisitCharacterRange(that->ranges()->at(i));
} }
for (auto iter : *that->strings()) { if (that->has_strings()) {
os_ << " '"; for (auto iter : *that->strings()) {
os_ << std::string(iter.first.begin(), iter.first.end()); os_ << " '";
os_ << "'"; os_ << std::string(iter.first.begin(), iter.first.end());
os_ << "'";
}
} }
os_ << "]"; os_ << "]";
return nullptr; return nullptr;
@ -382,16 +384,17 @@ RegExpClassSetOperand::RegExpClassSetOperand(ZoneList<CharacterRange>* ranges,
CharacterClassStrings* strings) CharacterClassStrings* strings)
: ranges_(ranges), strings_(strings) { : ranges_(ranges), strings_(strings) {
DCHECK_NOT_NULL(ranges); DCHECK_NOT_NULL(ranges);
DCHECK_NOT_NULL(strings);
min_match_ = 0; min_match_ = 0;
max_match_ = 0; max_match_ = 0;
if (!ranges->is_empty()) { if (!ranges->is_empty()) {
min_match_ = 1; min_match_ = 1;
max_match_ = 2; max_match_ = 2;
} }
for (auto string : *strings) { if (has_strings()) {
min_match_ = std::min(min_match_, string.second->min_match()); for (auto string : *strings) {
max_match_ = std::max(max_match_, string.second->max_match()); min_match_ = std::min(min_match_, string.second->min_match());
max_match_ = std::max(max_match_, string.second->max_match());
}
} }
} }
@ -410,5 +413,20 @@ RegExpClassSetExpression::RegExpClassSetExpression(
} }
} }
// static
// Builds a class set expression that contains no characters and no strings.
// Per the declaration comment, the result matches everything if |is_negated|
// and nothing otherwise. All nodes are allocated in |zone|.
RegExpClassSetExpression* RegExpClassSetExpression::Empty(Zone* zone,
bool is_negated) {
// Empty range list (constructed with 0 — presumably the initial capacity).
ZoneList<CharacterRange>* ranges =
zone->template New<ZoneList<CharacterRange>>(0, zone);
// The operand carries no strings: nullptr is passed instead of an empty
// CharacterClassStrings, so consumers must go through has_strings() before
// touching strings().
RegExpClassSetOperand* op =
zone->template New<RegExpClassSetOperand>(ranges, nullptr);
// The expression is a union over exactly one (empty) operand.
ZoneList<RegExpTree*>* operands =
zone->template New<ZoneList<RegExpTree*>>(1, zone);
operands->Add(op, zone);
// NOTE(review): the literal `false` is presumably the "may contain strings"
// flag of the constructor — confirm against its declaration.
return zone->template New<RegExpClassSetExpression>(
RegExpClassSetExpression::OperationType::kUnion, is_negated, false,
operands);
}
} // namespace internal } // namespace internal
} // namespace v8 } // namespace v8

View File

@ -413,9 +413,12 @@ class RegExpClassSetOperand final : public RegExpTree {
void Subtract(RegExpClassSetOperand* other, void Subtract(RegExpClassSetOperand* other,
ZoneList<CharacterRange>* temp_ranges, Zone* zone); ZoneList<CharacterRange>* temp_ranges, Zone* zone);
bool has_strings() const { return !strings_->empty(); } bool has_strings() const { return strings_ != nullptr && !strings_->empty(); }
ZoneList<CharacterRange>* ranges() { return ranges_; } ZoneList<CharacterRange>* ranges() { return ranges_; }
CharacterClassStrings* strings() { return strings_; } CharacterClassStrings* strings() {
DCHECK_NOT_NULL(strings_);
return strings_;
}
private: private:
ZoneList<CharacterRange>* ranges_; ZoneList<CharacterRange>* ranges_;
@ -434,6 +437,10 @@ class RegExpClassSetExpression final : public RegExpTree {
DECL_BOILERPLATE(ClassSetExpression); DECL_BOILERPLATE(ClassSetExpression);
// Create an empty class set expression (matches everything if |is_negated|,
// nothing otherwise).
static RegExpClassSetExpression* Empty(Zone* zone, bool is_negated);
bool IsTextElement() override { return true; } bool IsTextElement() override { return true; }
int min_match() override { return 0; } int min_match() override { return 0; }
int max_match() override { return max_match_; } int max_match() override { return max_match_; }

View File

@ -593,7 +593,12 @@ RegExpNode* RegExpClassSetExpression::ToNode(RegExpCompiler* compiler,
void RegExpClassSetOperand::Union(RegExpClassSetOperand* other, Zone* zone) { void RegExpClassSetOperand::Union(RegExpClassSetOperand* other, Zone* zone) {
ranges()->AddAll(*other->ranges(), zone); ranges()->AddAll(*other->ranges(), zone);
strings()->insert(other->strings()->begin(), other->strings()->end()); if (other->has_strings()) {
if (strings_ == nullptr) {
strings_ = zone->template New<CharacterClassStrings>(zone);
}
strings()->insert(other->strings()->begin(), other->strings()->end());
}
} }
void RegExpClassSetOperand::Intersect(RegExpClassSetOperand* other, void RegExpClassSetOperand::Intersect(RegExpClassSetOperand* other,
@ -602,11 +607,17 @@ void RegExpClassSetOperand::Intersect(RegExpClassSetOperand* other,
CharacterRange::Intersect(ranges(), other->ranges(), temp_ranges, zone); CharacterRange::Intersect(ranges(), other->ranges(), temp_ranges, zone);
std::swap(*ranges(), *temp_ranges); std::swap(*ranges(), *temp_ranges);
temp_ranges->Rewind(0); temp_ranges->Rewind(0);
for (auto iter = strings()->begin(); iter != strings()->end();) { if (has_strings()) {
if (other->strings()->find(iter->first) == other->strings()->end()) { if (!other->has_strings()) {
iter = strings()->erase(iter); strings()->clear();
} else { } else {
iter++; for (auto iter = strings()->begin(); iter != strings()->end();) {
if (other->strings()->find(iter->first) == other->strings()->end()) {
iter = strings()->erase(iter);
} else {
iter++;
}
}
} }
} }
} }
@ -617,11 +628,13 @@ void RegExpClassSetOperand::Subtract(RegExpClassSetOperand* other,
CharacterRange::Subtract(ranges(), other->ranges(), temp_ranges, zone); CharacterRange::Subtract(ranges(), other->ranges(), temp_ranges, zone);
std::swap(*ranges(), *temp_ranges); std::swap(*ranges(), *temp_ranges);
temp_ranges->Rewind(0); temp_ranges->Rewind(0);
for (auto iter = strings()->begin(); iter != strings()->end();) { if (has_strings() && other->has_strings()) {
if (other->strings()->find(iter->first) != other->strings()->end()) { for (auto iter = strings()->begin(); iter != strings()->end();) {
iter = strings()->erase(iter); if (other->strings()->find(iter->first) != other->strings()->end()) {
} else { iter = strings()->erase(iter);
iter++; } else {
iter++;
}
} }
} }
} }

View File

@ -2913,10 +2913,14 @@ RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(
zone()->template New<ZoneList<CharacterRange>>(2, zone()); zone()->template New<ZoneList<CharacterRange>>(2, zone());
if (current() == ']') { if (current() == ']') {
Advance(); Advance();
RegExpClassRanges::ClassRangesFlags class_ranges_flags; if (unicode_sets()) {
if (is_negated) class_ranges_flags = RegExpClassRanges::NEGATED; return RegExpClassSetExpression::Empty(zone(), is_negated);
return zone()->template New<RegExpClassRanges>(zone(), ranges, } else {
class_ranges_flags); RegExpClassRanges::ClassRangesFlags class_ranges_flags;
if (is_negated) class_ranges_flags = RegExpClassRanges::NEGATED;
return zone()->template New<RegExpClassRanges>(zone(), ranges,
class_ranges_flags);
}
} }
if (!unicode_sets()) { if (!unicode_sets()) {

View File

@ -184,6 +184,13 @@ check(
/[\q{ĀĂĄĆ|AaAc}--\q{āăąć}]/vi, ['AaAc', 'aAaC'], ['ĀĂĄĆ', 'āăąć'], /[\q{ĀĂĄĆ|AaAc}--\q{āăąć}]/vi, ['AaAc', 'aAaC'], ['ĀĂĄĆ', 'āăąć'],
false); false);
// Empty nested classes.
check(/[a-c\q{foo|bar}[]]/v, ['a','b','c','foo','bar'], [], false);
check(/[[a-c\q{foo|bar}]&&[]]/v, [], ['a','b','c','foo','bar'], true);
check(/[[a-c\q{foo|bar}]--[]]/v, ['a','b','c','foo','bar'], [], false);
check(/[[]&&[a-c\q{foo|bar}]]/v, [], ['a','b','c','foo','bar'], true);
check(/[[]--[a-c\q{foo|bar}]]/v, [], ['a','b','c','foo','bar'], true);
// Empty string disjunctions matches nothing, but succeeds. // Empty string disjunctions matches nothing, but succeeds.
let res = /[\q{}]/v.exec('foo'); let res = /[\q{}]/v.exec('foo');
assertNotNull(res); assertNotNull(res);