[regexp] Handle empty nested classes correctly

With the recent introduction of unicode sets (v-flag), nested character
classes are allowed in regular expressions.
We always expect a nested class to be of type
`RegExpClassSetExpression`, but the empty nested class was not handled
correctly.

Bug: v8:11935, chromium:1412942
Change-Id: I3b644c8627d8fc6b320a419216372810e8003983
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4224311
Reviewed-by: Jakob Linke <jgruber@chromium.org>
Commit-Queue: Patrick Thier <pthier@chromium.org>
Cr-Commit-Position: refs/heads/main@{#85680}
This commit is contained in:
pthier 2023-02-06 14:16:23 +01:00 committed by V8 LUCI CQ
parent 8c4779241a
commit ee93bc8035
5 changed files with 73 additions and 24 deletions

View File

@ -200,11 +200,13 @@ void* RegExpUnparser::VisitClassSetOperand(RegExpClassSetOperand* that,
if (i > 0) os_ << " ";
VisitCharacterRange(that->ranges()->at(i));
}
if (that->has_strings()) {
for (auto iter : *that->strings()) {
os_ << " '";
os_ << std::string(iter.first.begin(), iter.first.end());
os_ << "'";
}
}
os_ << "]";
return nullptr;
}
@ -382,18 +384,19 @@ RegExpClassSetOperand::RegExpClassSetOperand(ZoneList<CharacterRange>* ranges,
CharacterClassStrings* strings)
: ranges_(ranges), strings_(strings) {
DCHECK_NOT_NULL(ranges);
DCHECK_NOT_NULL(strings);
min_match_ = 0;
max_match_ = 0;
if (!ranges->is_empty()) {
min_match_ = 1;
max_match_ = 2;
}
if (has_strings()) {
for (auto string : *strings) {
min_match_ = std::min(min_match_, string.second->min_match());
max_match_ = std::max(max_match_, string.second->max_match());
}
}
}
RegExpClassSetExpression::RegExpClassSetExpression(
OperationType op, bool is_negated, bool may_contain_strings,
@ -410,5 +413,20 @@ RegExpClassSetExpression::RegExpClassSetExpression(
}
}
// static
// Builds the expression used for an empty class `[]`: a union whose single
// operand has no character ranges and no string set. |is_negated| is passed
// through unchanged to the resulting expression.
RegExpClassSetExpression* RegExpClassSetExpression::Empty(Zone* zone,
                                                          bool is_negated) {
  // The sole operand: zero ranges and a null string set.
  auto* no_ranges = zone->template New<ZoneList<CharacterRange>>(0, zone);
  auto* empty_operand =
      zone->template New<RegExpClassSetOperand>(no_ranges, nullptr);

  // Wrap the operand in a one-element operand list.
  auto* operand_list = zone->template New<ZoneList<RegExpTree*>>(1, zone);
  operand_list->Add(empty_operand, zone);

  return zone->template New<RegExpClassSetExpression>(
      OperationType::kUnion, is_negated, /*may_contain_strings=*/false,
      operand_list);
}
} // namespace internal
} // namespace v8

View File

@ -413,9 +413,12 @@ class RegExpClassSetOperand final : public RegExpTree {
void Subtract(RegExpClassSetOperand* other,
ZoneList<CharacterRange>* temp_ranges, Zone* zone);
bool has_strings() const { return !strings_->empty(); }
// True only when a string set has been allocated and is non-empty; a null
// strings_ is treated as "no strings".
bool has_strings() const { return strings_ != nullptr && !strings_->empty(); }
ZoneList<CharacterRange>* ranges() { return ranges_; }
CharacterClassStrings* strings() { return strings_; }
// Returns the string set of this operand. strings_ may legitimately be null
// (has_strings() reports false in that case), so callers are expected to check
// has_strings() first; debug builds assert that the set exists.
CharacterClassStrings* strings() {
DCHECK_NOT_NULL(strings_);
return strings_;
}
private:
ZoneList<CharacterRange>* ranges_;
@ -434,6 +437,10 @@ class RegExpClassSetExpression final : public RegExpTree {
DECL_BOILERPLATE(ClassSetExpression);
// Create an empty class set expression (matches everything if |is_negated|,
// nothing otherwise).
static RegExpClassSetExpression* Empty(Zone* zone, bool is_negated);
bool IsTextElement() override { return true; }
int min_match() override { return 0; }
int max_match() override { return max_match_; }

View File

@ -593,8 +593,13 @@ RegExpNode* RegExpClassSetExpression::ToNode(RegExpCompiler* compiler,
// Merges |other| into this operand: all of |other|'s ranges are added to this
// operand's range list and, when |other| has strings, they are inserted into
// this operand's string set.
void RegExpClassSetOperand::Union(RegExpClassSetOperand* other, Zone* zone) {
  ranges()->AddAll(*other->ranges(), zone);

  // Nothing more to do when the other side contributes no strings.
  if (!other->has_strings()) return;

  // This operand may not have a string set yet; allocate it on demand.
  if (strings_ == nullptr) {
    strings_ = zone->template New<CharacterClassStrings>(zone);
  }
  CharacterClassStrings* incoming = other->strings();
  strings()->insert(incoming->begin(), incoming->end());
}
void RegExpClassSetOperand::Intersect(RegExpClassSetOperand* other,
ZoneList<CharacterRange>* temp_ranges,
@ -602,6 +607,10 @@ void RegExpClassSetOperand::Intersect(RegExpClassSetOperand* other,
CharacterRange::Intersect(ranges(), other->ranges(), temp_ranges, zone);
std::swap(*ranges(), *temp_ranges);
temp_ranges->Rewind(0);
if (has_strings()) {
if (!other->has_strings()) {
strings()->clear();
} else {
for (auto iter = strings()->begin(); iter != strings()->end();) {
if (other->strings()->find(iter->first) == other->strings()->end()) {
iter = strings()->erase(iter);
@ -610,6 +619,8 @@ void RegExpClassSetOperand::Intersect(RegExpClassSetOperand* other,
}
}
}
}
}
void RegExpClassSetOperand::Subtract(RegExpClassSetOperand* other,
ZoneList<CharacterRange>* temp_ranges,
@ -617,6 +628,7 @@ void RegExpClassSetOperand::Subtract(RegExpClassSetOperand* other,
CharacterRange::Subtract(ranges(), other->ranges(), temp_ranges, zone);
std::swap(*ranges(), *temp_ranges);
temp_ranges->Rewind(0);
if (has_strings() && other->has_strings()) {
for (auto iter = strings()->begin(); iter != strings()->end();) {
if (other->strings()->find(iter->first) != other->strings()->end()) {
iter = strings()->erase(iter);
@ -625,6 +637,7 @@ void RegExpClassSetOperand::Subtract(RegExpClassSetOperand* other,
}
}
}
}
// static
RegExpClassSetOperand* RegExpClassSetExpression::ComputeExpression(

View File

@ -2913,11 +2913,15 @@ RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(
zone()->template New<ZoneList<CharacterRange>>(2, zone());
if (current() == ']') {
Advance();
if (unicode_sets()) {
return RegExpClassSetExpression::Empty(zone(), is_negated);
} else {
RegExpClassRanges::ClassRangesFlags class_ranges_flags;
if (is_negated) class_ranges_flags = RegExpClassRanges::NEGATED;
return zone()->template New<RegExpClassRanges>(zone(), ranges,
class_ranges_flags);
}
}
if (!unicode_sets()) {
bool add_unicode_case_equivalents = IsUnicodeMode() && ignore_case();

View File

@ -184,6 +184,13 @@ check(
/[\q{ĀĂĄĆ|AaAc}--\q{āăąć}]/vi, ['AaAc', 'aAaC'], ['ĀĂĄĆ', 'āăąć'],
false);
// Empty nested classes.
check(/[a-c\q{foo|bar}[]]/v, ['a','b','c','foo','bar'], [], false);
check(/[[a-c\q{foo|bar}]&&[]]/v, [], ['a','b','c','foo','bar'], true);
check(/[[a-c\q{foo|bar}]--[]]/v, ['a','b','c','foo','bar'], [], false);
check(/[[]&&[a-c\q{foo|bar}]]/v, [], ['a','b','c','foo','bar'], true);
check(/[[]--[a-c\q{foo|bar}]]/v, [], ['a','b','c','foo','bar'], true);
// An empty string disjunction matches only the empty string, so the match succeeds.
let res = /[\q{}]/v.exec('foo');
assertNotNull(res);