From ee93bc803514f2b056c30faf08867dffdba10c3e Mon Sep 17 00:00:00 2001 From: pthier Date: Mon, 6 Feb 2023 14:16:23 +0100 Subject: [PATCH] [regexp] Handle empty nested classes correctly With the recent introduction of unicode sets (v-flag), nested character classes are allowed in regular expressions. We always expect a nested class to be of type `RegExpClassSetExpression`, but the empty nested class was not handled correctly. Bug: v8:11935, chromium:1412942 Change-Id: I3b644c8627d8fc6b320a419216372810e8003983 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4224311 Reviewed-by: Jakob Linke Commit-Queue: Patrick Thier Cr-Commit-Position: refs/heads/main@{#85680} --- src/regexp/regexp-ast.cc | 34 ++++++++++++++++----- src/regexp/regexp-ast.h | 11 +++++-- src/regexp/regexp-compiler-tonode.cc | 33 ++++++++++++++------ src/regexp/regexp-parser.cc | 12 +++++--- test/mjsunit/harmony/regexp-unicode-sets.js | 7 +++++ 5 files changed, 73 insertions(+), 24 deletions(-) diff --git a/src/regexp/regexp-ast.cc b/src/regexp/regexp-ast.cc index ba9c8f0d1f..a814628183 100644 --- a/src/regexp/regexp-ast.cc +++ b/src/regexp/regexp-ast.cc @@ -200,10 +200,12 @@ void* RegExpUnparser::VisitClassSetOperand(RegExpClassSetOperand* that, if (i > 0) os_ << " "; VisitCharacterRange(that->ranges()->at(i)); } - for (auto iter : *that->strings()) { - os_ << " '"; - os_ << std::string(iter.first.begin(), iter.first.end()); - os_ << "'"; + if (that->has_strings()) { + for (auto iter : *that->strings()) { + os_ << " '"; + os_ << std::string(iter.first.begin(), iter.first.end()); + os_ << "'"; + } } os_ << "]"; return nullptr; @@ -382,16 +384,17 @@ RegExpClassSetOperand::RegExpClassSetOperand(ZoneList* ranges, CharacterClassStrings* strings) : ranges_(ranges), strings_(strings) { DCHECK_NOT_NULL(ranges); - DCHECK_NOT_NULL(strings); min_match_ = 0; max_match_ = 0; if (!ranges->is_empty()) { min_match_ = 1; max_match_ = 2; } - for (auto string : *strings) { - min_match_ = std::min(min_match_, string.second->min_match()); - max_match_ = std::max(max_match_, string.second->max_match()); + if (has_strings()) { + for (auto string : *strings) { + min_match_ = std::min(min_match_, string.second->min_match()); + max_match_ = std::max(max_match_, string.second->max_match()); + } } } @@ -410,5 +413,20 @@ RegExpClassSetExpression::RegExpClassSetExpression( } } +// static +RegExpClassSetExpression* RegExpClassSetExpression::Empty(Zone* zone, + bool is_negated) { + ZoneList* ranges = + zone->template New>(0, zone); + RegExpClassSetOperand* op = + zone->template New(ranges, nullptr); + ZoneList* operands = + zone->template New>(1, zone); + operands->Add(op, zone); + return zone->template New( + RegExpClassSetExpression::OperationType::kUnion, is_negated, false, + operands); +} + } // namespace internal } // namespace v8 diff --git a/src/regexp/regexp-ast.h b/src/regexp/regexp-ast.h index 6939fde07f..34f59f6c31 100644 --- a/src/regexp/regexp-ast.h +++ b/src/regexp/regexp-ast.h @@ -413,9 +413,12 @@ class RegExpClassSetOperand final : public RegExpTree { void Subtract(RegExpClassSetOperand* other, ZoneList* temp_ranges, Zone* zone); - bool has_strings() const { return !strings_->empty(); } + bool has_strings() const { return strings_ != nullptr && !strings_->empty(); } ZoneList* ranges() { return ranges_; } - CharacterClassStrings* strings() { return strings_; } + CharacterClassStrings* strings() { + DCHECK_NOT_NULL(strings_); + return strings_; + } private: ZoneList* ranges_; @@ -434,6 +437,10 @@ class RegExpClassSetExpression final : public RegExpTree { DECL_BOILERPLATE(ClassSetExpression); + // Create an empty class set expression (matches everything if |is_negated|, + // nothing otherwise). + static RegExpClassSetExpression* Empty(Zone* zone, bool is_negated); + bool IsTextElement() override { return true; } int min_match() override { return 0; } int max_match() override { return max_match_; } diff --git a/src/regexp/regexp-compiler-tonode.cc b/src/regexp/regexp-compiler-tonode.cc index 54e57298da..3258bb5149 100644 --- a/src/regexp/regexp-compiler-tonode.cc +++ b/src/regexp/regexp-compiler-tonode.cc @@ -593,7 +593,12 @@ RegExpNode* RegExpClassSetExpression::ToNode(RegExpCompiler* compiler, void RegExpClassSetOperand::Union(RegExpClassSetOperand* other, Zone* zone) { ranges()->AddAll(*other->ranges(), zone); - strings()->insert(other->strings()->begin(), other->strings()->end()); + if (other->has_strings()) { + if (strings_ == nullptr) { + strings_ = zone->template New(zone); + } + strings()->insert(other->strings()->begin(), other->strings()->end()); + } } void RegExpClassSetOperand::Intersect(RegExpClassSetOperand* other, @@ -602,11 +607,17 @@ void RegExpClassSetOperand::Intersect(RegExpClassSetOperand* other, CharacterRange::Intersect(ranges(), other->ranges(), temp_ranges, zone); std::swap(*ranges(), *temp_ranges); temp_ranges->Rewind(0); - for (auto iter = strings()->begin(); iter != strings()->end();) { - if (other->strings()->find(iter->first) == other->strings()->end()) { - iter = strings()->erase(iter); + if (has_strings()) { + if (!other->has_strings()) { + strings()->clear(); } else { - iter++; + for (auto iter = strings()->begin(); iter != strings()->end();) { + if (other->strings()->find(iter->first) == other->strings()->end()) { + iter = strings()->erase(iter); + } else { + iter++; + } + } } } } @@ -617,11 +628,13 @@ void RegExpClassSetOperand::Subtract(RegExpClassSetOperand* other, CharacterRange::Subtract(ranges(), other->ranges(), temp_ranges, zone); std::swap(*ranges(), *temp_ranges); temp_ranges->Rewind(0); - for (auto iter = strings()->begin(); iter != strings()->end();) { - if (other->strings()->find(iter->first) != other->strings()->end()) { - iter = strings()->erase(iter); - } else { - iter++; + if (has_strings() && other->has_strings()) { + for (auto iter = strings()->begin(); iter != strings()->end();) { + if (other->strings()->find(iter->first) != other->strings()->end()) { + iter = strings()->erase(iter); + } else { + iter++; + } } } } diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc index 06e612321b..f7707e80a1 100644 --- a/src/regexp/regexp-parser.cc +++ b/src/regexp/regexp-parser.cc @@ -2913,10 +2913,14 @@ RegExpTree* RegExpParserImpl::ParseCharacterClass( zone()->template New>(2, zone()); if (current() == ']') { Advance(); - RegExpClassRanges::ClassRangesFlags class_ranges_flags; - if (is_negated) class_ranges_flags = RegExpClassRanges::NEGATED; - return zone()->template New(zone(), ranges, - class_ranges_flags); + if (unicode_sets()) { + return RegExpClassSetExpression::Empty(zone(), is_negated); + } else { + RegExpClassRanges::ClassRangesFlags class_ranges_flags; + if (is_negated) class_ranges_flags = RegExpClassRanges::NEGATED; + return zone()->template New(zone(), ranges, + class_ranges_flags); + } } if (!unicode_sets()) { diff --git a/test/mjsunit/harmony/regexp-unicode-sets.js b/test/mjsunit/harmony/regexp-unicode-sets.js index 8288b34c86..b5a66192cf 100644 --- a/test/mjsunit/harmony/regexp-unicode-sets.js +++ b/test/mjsunit/harmony/regexp-unicode-sets.js @@ -184,6 +184,13 @@ check( /[\q{ĀĂĄĆ|AaAc}--\q{āăąć}]/vi, ['AaAc', 'aAaC'], ['ĀĂĄĆ', 'āăąć'], false); +// Empty nested classes. +check(/[a-c\q{foo|bar}[]]/v, ['a','b','c','foo','bar'], [], false); +check(/[[a-c\q{foo|bar}]&&[]]/v, [], ['a','b','c','foo','bar'], true); +check(/[[a-c\q{foo|bar}]--[]]/v, ['a','b','c','foo','bar'], [], false); +check(/[[]&&[a-c\q{foo|bar}]]/v, [], ['a','b','c','foo','bar'], true); +check(/[[]--[a-c\q{foo|bar}]]/v, [], ['a','b','c','foo','bar'], true); + // Empty string disjunctions matches nothing, but succeeds. let res = /[\q{}]/v.exec('foo'); assertNotNull(res);