[regexp] Support string disjunctions in unicode set mode
Add support for string disjunctions within regular expression character classes in unicode sets mode (/v). Bug: v8:11935 Change-Id: Ida607123ced11c4dc3dfc687996f6abffeb6eeff Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4051243 Commit-Queue: Patrick Thier <pthier@chromium.org> Reviewed-by: Mathias Bynens <mathias@chromium.org> Cr-Commit-Position: refs/heads/main@{#84480}
This commit is contained in:
parent
1211605a39
commit
5d7782f694
@ -69,6 +69,11 @@ class CanBeHandledVisitor final : private RegExpVisitor {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Records in result_ whether the operand is free of strings: result_ becomes
// false as soon as the operand holds at least one string.
void* VisitClassSetOperand(RegExpClassSetOperand* node, void*) override {
  const bool contains_strings = node->has_strings();
  result_ = !contains_strings;
  return nullptr;
}
|
||||
|
||||
void* VisitClassSetExpression(RegExpClassSetExpression* node,
|
||||
void*) override {
|
||||
result_ = false;
|
||||
@ -391,11 +396,10 @@ class CompileVisitor : private RegExpVisitor {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* VisitClassRanges(RegExpClassRanges* node, void*) override {
|
||||
void CompileCharacterRanges(ZoneList<CharacterRange>* ranges, bool negated) {
|
||||
// A character class is compiled as Disjunction over its `CharacterRange`s.
|
||||
ZoneList<CharacterRange>* ranges = node->ranges(zone_);
|
||||
CharacterRange::Canonicalize(ranges);
|
||||
if (node->is_negated()) {
|
||||
if (negated) {
|
||||
// The complement of a disjoint, non-adjacent (i.e. `Canonicalize`d)
|
||||
// union of k intervals is a union of at most k + 1 intervals.
|
||||
ZoneList<CharacterRange>* negated =
|
||||
@ -422,6 +426,17 @@ class CompileVisitor : private RegExpVisitor {
|
||||
|
||||
assembler_.ConsumeRange(from_uc16, to_uc16);
|
||||
});
|
||||
}
|
||||
|
||||
// Compiles a plain character class by handing its ranges and its negation
// flag to CompileCharacterRanges.
void* VisitClassRanges(RegExpClassRanges* node, void*) override {
  ZoneList<CharacterRange>* node_ranges = node->ranges(zone_);
  const bool is_negated = node->is_negated();
  CompileCharacterRanges(node_ranges, is_negated);
  return nullptr;
}
|
||||
|
||||
// Compiles the ranges of a class set operand as a non-negated character
// class. Operands that carry strings are not expected here (see the DCHECK).
void* VisitClassSetOperand(RegExpClassSetOperand* node, void*) override {
  // TODO(v8:11935): Support strings.
  DCHECK(!node->has_strings());
  constexpr bool kNotNegated = false;
  CompileCharacterRanges(node->ranges(), kNotNegated);
  return nullptr;
}
|
||||
|
||||
|
@ -193,6 +193,22 @@ void* RegExpUnparser::VisitClassRanges(RegExpClassRanges* that, void* data) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Prints a class set operand as `![` ranges strings `]`. Ranges are separated
// by single spaces; every string is written as ` '<string>'`.
// Note: each string element is converted to a `char` when building the
// std::string, exactly as the range constructor of std::string does.
void* RegExpUnparser::VisitClassSetOperand(RegExpClassSetOperand* that,
                                           void* data) {
  os_ << "![";
  const int range_count = that->ranges()->length();
  for (int index = 0; index < range_count; ++index) {
    if (index != 0) os_ << " ";
    VisitCharacterRange(that->ranges()->at(index));
  }
  for (const auto& entry : *that->strings()) {
    const std::string text(entry.first.begin(), entry.first.end());
    os_ << " '" << text << "'";
  }
  os_ << "]";
  return nullptr;
}
|
||||
|
||||
void* RegExpUnparser::VisitClassSetExpression(RegExpClassSetExpression* that,
|
||||
void* data) {
|
||||
switch (that->operation()) {
|
||||
@ -362,6 +378,37 @@ RegExpAlternative::RegExpAlternative(ZoneList<RegExpTree*>* nodes)
|
||||
}
|
||||
}
|
||||
|
||||
// Builds an operand from |ranges| and |strings| (both must be non-null) and
// caches its match-length bounds:
//  * with no ranges both bounds start at 0, otherwise at min 1 / max 2;
//  * every string then lowers the minimum to its own min_match() and raises
//    the maximum to its own max_match() where applicable.
RegExpClassSetOperand::RegExpClassSetOperand(ZoneList<CharacterRange>* ranges,
                                             CharacterClassStrings* strings)
    : ranges_(ranges), strings_(strings) {
  DCHECK_NOT_NULL(ranges);
  DCHECK_NOT_NULL(strings);
  const bool has_ranges = !ranges->is_empty();
  min_match_ = has_ranges ? 1 : 0;
  max_match_ = has_ranges ? 2 : 0;
  for (const auto& entry : *strings) {
    RegExpTree* const string_tree = entry.second;
    min_match_ = std::min(min_match_, string_tree->min_match());
    max_match_ = std::max(max_match_, string_tree->max_match());
  }
}
|
||||
|
||||
// Stores the operation, flags and operand list, and caches max_match_ as the
// largest max_match() over all operands (0 when there are none).
// A negated expression must not be flagged as possibly containing strings.
RegExpClassSetExpression::RegExpClassSetExpression(
    OperationType op, bool is_negated, bool may_contain_strings,
    ZoneList<RegExpTree*>* operands)
    : operation_(op),
      is_negated_(is_negated),
      may_contain_strings_(may_contain_strings),
      operands_(operands) {
  DCHECK_NOT_NULL(operands);
  DCHECK_IMPLIES(is_negated_, !may_contain_strings_);
  int widest_match = 0;
  for (RegExpTree* operand : *operands) {
    widest_match = std::max(widest_match, operand->max_match());
  }
  max_match_ = widest_match;
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
@ -22,6 +22,7 @@ namespace internal {
|
||||
VISIT(Alternative) \
|
||||
VISIT(Assertion) \
|
||||
VISIT(ClassRanges) \
|
||||
VISIT(ClassSetOperand) \
|
||||
VISIT(ClassSetExpression) \
|
||||
VISIT(Atom) \
|
||||
VISIT(Quantifier) \
|
||||
@ -365,45 +366,101 @@ class RegExpClassRanges final : public RegExpTree {
|
||||
ClassRangesFlags class_ranges_flags_;
|
||||
};
|
||||
|
||||
struct CharacterClassStringLess {
|
||||
bool operator()(const base::Vector<const base::uc32>& lhs,
|
||||
const base::Vector<const base::uc32>& rhs) const {
|
||||
// Longer strings first so we generate matches for the largest string
|
||||
// possible.
|
||||
if (lhs.length() != rhs.length()) {
|
||||
return lhs.length() > rhs.length();
|
||||
}
|
||||
for (int i = 0; i < lhs.length(); i++) {
|
||||
if (lhs[i] != rhs[i]) {
|
||||
return lhs[i] < rhs[i];
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
// A type used for strings as part of character classes (only possible in
|
||||
// unicode sets mode).
|
||||
// We use a ZoneMap instead of an UnorderedZoneMap because we need to match
|
||||
// the longest alternatives first. By using a ZoneMap with the custom comparator
|
||||
// we can avoid sorting before assembling the code.
|
||||
// Strings are likely short (the largest string in current unicode properties
|
||||
// consists of 10 code points).
|
||||
using CharacterClassStrings = ZoneMap<base::Vector<const base::uc32>,
|
||||
RegExpTree*, CharacterClassStringLess>;
|
||||
|
||||
// TODO(pthier): If we are sure we don't want to use icu::UnicodeSets
|
||||
// (performance evaluation pending), this class can be merged with
|
||||
// RegExpClassRanges.
|
||||
class RegExpClassSetOperand final : public RegExpTree {
|
||||
public:
|
||||
RegExpClassSetOperand(ZoneList<CharacterRange>* ranges,
|
||||
CharacterClassStrings* strings);
|
||||
|
||||
DECL_BOILERPLATE(ClassSetOperand);
|
||||
|
||||
bool IsTextElement() override { return true; }
|
||||
int min_match() override { return min_match_; }
|
||||
int max_match() override { return max_match_; }
|
||||
|
||||
void Union(RegExpClassSetOperand* other, Zone* zone);
|
||||
void Intersect(RegExpClassSetOperand* other,
|
||||
ZoneList<CharacterRange>* temp_ranges, Zone* zone);
|
||||
void Subtract(RegExpClassSetOperand* other,
|
||||
ZoneList<CharacterRange>* temp_ranges, Zone* zone);
|
||||
|
||||
bool has_strings() const { return !strings_->empty(); }
|
||||
ZoneList<CharacterRange>* ranges() { return ranges_; }
|
||||
CharacterClassStrings* strings() { return strings_; }
|
||||
|
||||
private:
|
||||
ZoneList<CharacterRange>* ranges_;
|
||||
CharacterClassStrings* strings_;
|
||||
int min_match_;
|
||||
int max_match_;
|
||||
};
|
||||
|
||||
class RegExpClassSetExpression final : public RegExpTree {
|
||||
public:
|
||||
enum class OperationType { kUnion, kIntersection, kSubtraction };
|
||||
|
||||
RegExpClassSetExpression(OperationType op, bool is_negated,
|
||||
ZoneList<RegExpTree*>* operands)
|
||||
: operation_(op), is_negated_(is_negated), operands_(operands) {}
|
||||
bool may_contain_strings,
|
||||
ZoneList<RegExpTree*>* operands);
|
||||
|
||||
DECL_BOILERPLATE(ClassSetExpression);
|
||||
|
||||
bool IsTextElement() override { return true; }
|
||||
// At least 1 character is consumed.
|
||||
int min_match() override { return 1; }
|
||||
// Up to two code points might be consumed.
|
||||
int max_match() override { return 2; }
|
||||
int min_match() override { return 0; }
|
||||
int max_match() override { return max_match_; }
|
||||
|
||||
OperationType operation() const { return operation_; }
|
||||
bool is_negated() const { return is_negated_; }
|
||||
bool may_contain_strings() const { return may_contain_strings_; }
|
||||
const ZoneList<RegExpTree*>* operands() const { return operands_; }
|
||||
ZoneList<RegExpTree*>* operands() { return operands_; }
|
||||
|
||||
private:
|
||||
RegExpClassRanges* ToCharacterClass(Zone* zone);
|
||||
|
||||
// Recursively evaluates the tree rooted at |root|, computing the valid
|
||||
// CharacterRanges after applying all set operations and storing the result in
|
||||
// |result_ranges|. |temp_ranges| is a list used for intermediate results,
|
||||
// passed as parameter to avoid allocating new lists all the time.
|
||||
static void ComputeCharacterRanges(RegExpTree* root,
|
||||
ZoneList<CharacterRange>* result_ranges,
|
||||
ZoneList<CharacterRange>* temp_ranges,
|
||||
Zone* zone);
|
||||
// CharacterRanges and strings after applying all set operations.
|
||||
// The original tree will be modified by this method, so don't store pointers
|
||||
// to inner nodes of the tree somewhere else!
|
||||
// Modifying the tree in-place saves memory and speeds up multiple calls of
|
||||
// the method (e.g. when unrolling quantifiers).
|
||||
// |temp_ranges| is used for intermediate results, passed as parameter to
|
||||
// avoid allocating new lists all the time.
|
||||
static RegExpClassSetOperand* ComputeExpression(
|
||||
RegExpTree* root, ZoneList<CharacterRange>* temp_ranges, Zone* zone);
|
||||
|
||||
const OperationType operation_;
|
||||
const bool is_negated_;
|
||||
const bool may_contain_strings_;
|
||||
ZoneList<RegExpTree*>* operands_ = nullptr;
|
||||
#ifdef ENABLE_SLOW_DCHECKS
|
||||
// Cache ranges for each node during computation for (slow) DCHECKs.
|
||||
ZoneList<CharacterRange>* ranges_ = nullptr;
|
||||
#endif
|
||||
int max_match_;
|
||||
};
|
||||
|
||||
class RegExpAtom final : public RegExpTree {
|
||||
|
@ -535,9 +535,145 @@ RegExpNode* RegExpClassRanges::ToNode(RegExpCompiler* compiler,
|
||||
return result;
|
||||
}
|
||||
|
||||
// Lowers the operand to a regexp node. The operand is turned into a list of
// alternatives in this order: non-empty strings (in the order of the string
// map, i.e. longer strings first), then the character ranges as one
// RegExpClassRanges, then the empty string if present. A single alternative
// is compiled directly; several are wrapped in a RegExpDisjunction.
RegExpNode* RegExpClassSetOperand::ToNode(RegExpCompiler* compiler,
                                          RegExpNode* on_success) {
  Zone* const zone = compiler->zone();
  const bool has_ranges = !ranges()->is_empty();
  const int string_count =
      has_strings() ? static_cast<int>(strings()->size()) : 0;
  const int size = string_count + (has_ranges ? 1 : 0);

  if (size == 0) {
    // If neither ranges nor strings are present, the operand is equal to an
    // empty range (matching nothing).
    ZoneList<CharacterRange>* no_ranges =
        zone->template New<ZoneList<CharacterRange>>(0, zone);
    RegExpClassRanges* nothing =
        zone->template New<RegExpClassRanges>(zone, no_ranges);
    return nothing->ToNode(compiler, on_success);
  }

  ZoneList<RegExpTree*>* alternatives =
      zone->template New<ZoneList<RegExpTree*>>(size, zone);
  // Strings are sorted by length first (larger strings before shorter ones).
  // See the comment on CharacterClassStrings.
  // Empty strings (if present) are added after character ranges.
  RegExpTree* empty_string = nullptr;
  if (has_strings()) {
    for (const auto& entry : *strings()) {
      RegExpTree* const string_tree = entry.second;
      if (string_tree->IsEmpty()) {
        empty_string = string_tree;
        continue;
      }
      alternatives->Add(string_tree, zone);
    }
  }
  if (has_ranges) {
    alternatives->Add(zone->template New<RegExpClassRanges>(zone, ranges()),
                      zone);
  }
  if (empty_string != nullptr) {
    alternatives->Add(empty_string, zone);
  }

  if (size == 1) {
    DCHECK_EQ(alternatives->length(), 1);
    RegExpTree* only = alternatives->first();
    return only->ToNode(compiler, on_success);
  }
  RegExpTree* disjunction =
      zone->template New<RegExpDisjunction>(alternatives);
  return disjunction->ToNode(compiler, on_success);
}
|
||||
|
||||
RegExpNode* RegExpClassSetExpression::ToNode(RegExpCompiler* compiler,
|
||||
RegExpNode* on_success) {
|
||||
return ToCharacterClass(compiler->zone())->ToNode(compiler, on_success);
|
||||
Zone* zone = compiler->zone();
|
||||
ZoneList<CharacterRange>* temp_ranges =
|
||||
zone->template New<ZoneList<CharacterRange>>(4, zone);
|
||||
RegExpClassSetOperand* root = ComputeExpression(this, temp_ranges, zone);
|
||||
return root->ToNode(compiler, on_success);
|
||||
}
|
||||
|
||||
// Adds everything from |other| to this operand: its ranges are appended to
// this operand's range list and its strings are inserted into this operand's
// string map. The range list is not canonicalized here.
void RegExpClassSetOperand::Union(RegExpClassSetOperand* other, Zone* zone) {
  ZoneList<CharacterRange>* other_ranges = other->ranges();
  ranges()->AddAll(*other_ranges, zone);

  CharacterClassStrings* other_strings = other->strings();
  strings()->insert(other_strings->begin(), other_strings->end());
}
|
||||
|
||||
// Keeps only what this operand has in common with |other|.
// The range intersection is computed into |temp_ranges|, swapped into this
// operand's range list, and |temp_ranges| is rewound to length 0 afterwards.
// Strings whose key is not present in |other| are erased.
void RegExpClassSetOperand::Intersect(RegExpClassSetOperand* other,
                                      ZoneList<CharacterRange>* temp_ranges,
                                      Zone* zone) {
  CharacterRange::Intersect(ranges(), other->ranges(), temp_ranges, zone);
  std::swap(*ranges(), *temp_ranges);
  temp_ranges->Rewind(0);

  auto it = strings()->begin();
  while (it != strings()->end()) {
    const bool in_other =
        other->strings()->find(it->first) != other->strings()->end();
    if (in_other) {
      it++;
    } else {
      // erase() returns the iterator following the removed element.
      it = strings()->erase(it);
    }
  }
}
|
||||
|
||||
// Removes everything contained in |other| from this operand.
// The range difference is computed into |temp_ranges|, swapped into this
// operand's range list, and |temp_ranges| is rewound to length 0 afterwards.
// Strings whose key is also present in |other| are erased.
void RegExpClassSetOperand::Subtract(RegExpClassSetOperand* other,
                                     ZoneList<CharacterRange>* temp_ranges,
                                     Zone* zone) {
  CharacterRange::Subtract(ranges(), other->ranges(), temp_ranges, zone);
  std::swap(*ranges(), *temp_ranges);
  temp_ranges->Rewind(0);

  auto it = strings()->begin();
  while (it != strings()->end()) {
    const bool in_other =
        other->strings()->find(it->first) != other->strings()->end();
    if (in_other) {
      // erase() returns the iterator following the removed element.
      it = strings()->erase(it);
    } else {
      it++;
    }
  }
}
|
||||
|
||||
// static
|
||||
RegExpClassSetOperand* RegExpClassSetExpression::ComputeExpression(
|
||||
RegExpTree* root, ZoneList<CharacterRange>* temp_ranges, Zone* zone) {
|
||||
DCHECK(temp_ranges->is_empty());
|
||||
if (root->IsClassSetOperand()) {
|
||||
return root->AsClassSetOperand();
|
||||
}
|
||||
DCHECK(root->IsClassSetExpression());
|
||||
RegExpClassSetExpression* node = root->AsClassSetExpression();
|
||||
RegExpClassSetOperand* result =
|
||||
ComputeExpression(node->operands()->at(0), temp_ranges, zone);
|
||||
switch (node->operation()) {
|
||||
case OperationType::kUnion: {
|
||||
for (int i = 1; i < node->operands()->length(); i++) {
|
||||
RegExpClassSetOperand* op =
|
||||
ComputeExpression(node->operands()->at(i), temp_ranges, zone);
|
||||
result->Union(op, zone);
|
||||
}
|
||||
CharacterRange::Canonicalize(result->ranges());
|
||||
break;
|
||||
}
|
||||
case OperationType::kIntersection: {
|
||||
for (int i = 1; i < node->operands()->length(); i++) {
|
||||
RegExpClassSetOperand* op =
|
||||
ComputeExpression(node->operands()->at(i), temp_ranges, zone);
|
||||
result->Intersect(op, temp_ranges, zone);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case OperationType::kSubtraction: {
|
||||
for (int i = 1; i < node->operands()->length(); i++) {
|
||||
RegExpClassSetOperand* op =
|
||||
ComputeExpression(node->operands()->at(i), temp_ranges, zone);
|
||||
result->Subtract(op, temp_ranges, zone);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (node->is_negated()) {
|
||||
DCHECK(!result->has_strings());
|
||||
CharacterRange::Negate(result->ranges(), temp_ranges, zone);
|
||||
std::swap(*result->ranges(), *temp_ranges);
|
||||
temp_ranges->Rewind(0);
|
||||
}
|
||||
// Store the result as single operand of the current node.
|
||||
node->operands()->Set(0, result);
|
||||
node->operands()->Rewind(1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
namespace {
|
||||
@ -1498,128 +1634,6 @@ void CharacterSet::Canonicalize() {
|
||||
CharacterRange::Canonicalize(ranges_);
|
||||
}
|
||||
|
||||
RegExpClassRanges* RegExpClassSetExpression::ToCharacterClass(Zone* zone) {
|
||||
ZoneList<CharacterRange>* result_ranges =
|
||||
zone->template New<ZoneList<CharacterRange>>(2, zone);
|
||||
ZoneList<CharacterRange>* temp_ranges =
|
||||
zone->template New<ZoneList<CharacterRange>>(2, zone);
|
||||
ComputeCharacterRanges(this, result_ranges, temp_ranges, zone);
|
||||
return zone->template New<RegExpClassRanges>(zone, result_ranges);
|
||||
}
|
||||
|
||||
// static
|
||||
void RegExpClassSetExpression::ComputeCharacterRanges(
|
||||
RegExpTree* root, ZoneList<CharacterRange>* result_ranges,
|
||||
ZoneList<CharacterRange>* temp_ranges, Zone* zone) {
|
||||
DCHECK_EQ(temp_ranges->length(), 0);
|
||||
DCHECK(root->IsClassRanges() || root->IsClassSetExpression());
|
||||
if (root->IsClassRanges()) {
|
||||
DCHECK(!root->AsClassRanges()->is_negated());
|
||||
ZoneList<CharacterRange>* ranges = root->AsClassRanges()->ranges(zone);
|
||||
CharacterRange::Canonicalize(ranges);
|
||||
result_ranges->AddAll(*ranges, zone);
|
||||
return;
|
||||
}
|
||||
RegExpClassSetExpression* node = root->AsClassSetExpression();
|
||||
switch (node->operation()) {
|
||||
case OperationType::kUnion: {
|
||||
ZoneList<CharacterRange>* op_ranges =
|
||||
zone->template New<ZoneList<CharacterRange>>(2, zone);
|
||||
for (int i = 0; i < node->operands()->length(); i++) {
|
||||
RegExpTree* op = node->operands()->at(i);
|
||||
ComputeCharacterRanges(op, op_ranges, temp_ranges, zone);
|
||||
result_ranges->AddAll(*op_ranges, zone);
|
||||
op_ranges->Rewind(0);
|
||||
}
|
||||
CharacterRange::Canonicalize(result_ranges);
|
||||
break;
|
||||
}
|
||||
case OperationType::kIntersection: {
|
||||
ZoneList<CharacterRange>* op_ranges =
|
||||
zone->template New<ZoneList<CharacterRange>>(2, zone);
|
||||
ComputeCharacterRanges(node->operands()->at(0), op_ranges, temp_ranges,
|
||||
zone);
|
||||
result_ranges->AddAll(*op_ranges, zone);
|
||||
op_ranges->Rewind(0);
|
||||
for (int i = 1; i < node->operands()->length(); i++) {
|
||||
ComputeCharacterRanges(node->operands()->at(i), op_ranges, temp_ranges,
|
||||
zone);
|
||||
CharacterRange::Intersect(result_ranges, op_ranges, temp_ranges, zone);
|
||||
std::swap(*result_ranges, *temp_ranges);
|
||||
temp_ranges->Rewind(0);
|
||||
op_ranges->Rewind(0);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case OperationType::kSubtraction: {
|
||||
ZoneList<CharacterRange>* op_ranges =
|
||||
zone->template New<ZoneList<CharacterRange>>(2, zone);
|
||||
ComputeCharacterRanges(node->operands()->at(0), op_ranges, temp_ranges,
|
||||
zone);
|
||||
result_ranges->AddAll(*op_ranges, zone);
|
||||
op_ranges->Rewind(0);
|
||||
for (int i = 1; i < node->operands()->length(); i++) {
|
||||
ComputeCharacterRanges(node->operands()->at(i), op_ranges, temp_ranges,
|
||||
zone);
|
||||
CharacterRange::Subtract(result_ranges, op_ranges, temp_ranges, zone);
|
||||
std::swap(*result_ranges, *temp_ranges);
|
||||
temp_ranges->Rewind(0);
|
||||
op_ranges->Rewind(0);
|
||||
}
|
||||
#ifdef ENABLE_SLOW_DCHECKS
|
||||
// Check that the result is equal to subtracting the union of all RHS
|
||||
// operands from the LHS operand.
|
||||
// TODO(pthier): It is unclear whether this variant is faster or slower
|
||||
// than subtracting multiple ranges in practice.
|
||||
ZoneList<CharacterRange>* lhs_range =
|
||||
node->operands()->at(0)->IsClassRanges()
|
||||
? node->operands()->at(0)->AsClassRanges()->ranges(zone)
|
||||
: node->operands()->at(0)->AsClassSetExpression()->ranges_;
|
||||
ZoneList<CharacterRange>* rhs_union =
|
||||
zone->template New<ZoneList<CharacterRange>>(2, zone);
|
||||
for (int i = 1; i < node->operands()->length(); i++) {
|
||||
ZoneList<CharacterRange>* op_range =
|
||||
node->operands()->at(i)->IsClassRanges()
|
||||
? node->operands()->at(i)->AsClassRanges()->ranges(zone)
|
||||
: node->operands()->at(i)->AsClassSetExpression()->ranges_;
|
||||
rhs_union->AddAll(*op_range, zone);
|
||||
}
|
||||
CharacterRange::Canonicalize(rhs_union);
|
||||
ZoneList<CharacterRange>* ranges_check =
|
||||
zone->template New<ZoneList<CharacterRange>>(2, zone);
|
||||
CharacterRange::Subtract(lhs_range, rhs_union, ranges_check, zone);
|
||||
DCHECK(CharacterRange::Equals(result_ranges, ranges_check));
|
||||
|
||||
// Check that the result is equal to intersecting the LHS operand with the
|
||||
// complemented union of all RHS operands
|
||||
ZoneList<CharacterRange>* rhs_union_negated =
|
||||
zone->template New<ZoneList<CharacterRange>>(rhs_union->length(),
|
||||
zone);
|
||||
CharacterRange::Negate(rhs_union, rhs_union_negated, zone);
|
||||
ranges_check->Rewind(0);
|
||||
CharacterRange::Intersect(lhs_range, rhs_union_negated, ranges_check,
|
||||
zone);
|
||||
DCHECK(CharacterRange::Equals(result_ranges, ranges_check));
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (node->is_negated()) {
|
||||
CharacterRange::Negate(result_ranges, temp_ranges, zone);
|
||||
std::swap(*result_ranges, *temp_ranges);
|
||||
temp_ranges->Rewind(0);
|
||||
}
|
||||
|
||||
DCHECK_EQ(temp_ranges->length(), 0);
|
||||
|
||||
#ifdef ENABLE_SLOW_DCHECKS
|
||||
// Cache results for DCHECKs.
|
||||
node->ranges_ =
|
||||
zone->template New<ZoneList<CharacterRange>>(*result_ranges, zone);
|
||||
#endif
|
||||
}
|
||||
|
||||
// static
|
||||
void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
|
||||
if (character_ranges->length() <= 1) return;
|
||||
@ -1740,6 +1754,9 @@ void CharacterRange::Subtract(const ZoneList<CharacterRange>* src,
|
||||
DCHECK(CharacterRange::IsCanonical(src));
|
||||
DCHECK(CharacterRange::IsCanonical(to_remove));
|
||||
DCHECK_EQ(0, result->length());
|
||||
|
||||
if (src->is_empty()) return;
|
||||
|
||||
int src_index = 0;
|
||||
int to_remove_index = 0;
|
||||
base::uc32 from = src->at(src_index).from();
|
||||
|
@ -44,7 +44,9 @@ namespace internal {
|
||||
T(UnterminatedCharacterClass, "Unterminated character class") \
|
||||
T(OutOfOrderCharacterClass, "Range out of order in character class") \
|
||||
T(InvalidClassSetOperation, "Invalid set operation in character class") \
|
||||
T(InvalidCharacterInClass, "Invalid character in character class")
|
||||
T(InvalidCharacterInClass, "Invalid character in character class") \
|
||||
T(NegatedCharacterClassWithStrings, \
|
||||
"Negated character class may contain strings")
|
||||
|
||||
enum class RegExpError : uint32_t {
|
||||
#define TEMPLATE(NAME, STRING) k##NAME,
|
||||
|
@ -18,6 +18,8 @@
|
||||
|
||||
#ifdef V8_INTL_SUPPORT
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/usetiter.h"
|
||||
#endif // V8_INTL_SUPPORT
|
||||
|
||||
namespace v8 {
|
||||
@ -62,6 +64,7 @@ class RegExpTextBuilder {
|
||||
void FlushPendingSurrogate();
|
||||
void FlushText();
|
||||
RegExpTree* PopLastAtom();
|
||||
RegExpTree* ToRegExp();
|
||||
|
||||
private:
|
||||
static const base::uc16 kNoPendingSurrogate = 0;
|
||||
@ -280,6 +283,15 @@ RegExpTree* RegExpTextBuilder::PopLastAtom() {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Flushes pending text and returns the collected terms as one tree:
// a RegExpEmpty for no terms, the term itself for exactly one term, and a
// RegExpAlternative over all terms otherwise.
RegExpTree* RegExpTextBuilder::ToRegExp() {
  FlushText();
  const size_t term_count = terms_->size();
  switch (term_count) {
    case 0:
      return zone()->New<RegExpEmpty>();
    case 1:
      return terms_->back();
    default:
      break;
  }
  ZoneList<RegExpTree*>* sequence = zone()->New<ZoneList<RegExpTree*>>(
      base::VectorOf(terms_->begin(), term_count), zone());
  return zone()->New<RegExpAlternative>(sequence);
}
|
||||
|
||||
// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
|
||||
class RegExpBuilder {
|
||||
public:
|
||||
@ -455,12 +467,15 @@ class RegExpParserImpl final {
|
||||
ZoneList<CharacterRange>* ranges,
|
||||
Zone* zone,
|
||||
bool add_unicode_case_equivalents);
|
||||
RegExpTree* ParseClassStringDisjunction();
|
||||
RegExpTree* ParseClassStringDisjunction(ZoneList<CharacterRange>* ranges,
|
||||
CharacterClassStrings* strings);
|
||||
RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder,
|
||||
ClassSetOperandType* type_out);
|
||||
RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder,
|
||||
ClassSetOperandType* type_out,
|
||||
ZoneList<CharacterRange>* ranges);
|
||||
ZoneList<CharacterRange>* ranges,
|
||||
CharacterClassStrings* strings);
|
||||
base::uc32 ParseClassSetCharacter();
|
||||
// Parses and returns a single escaped character.
|
||||
base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state,
|
||||
bool* is_escaped_unicode_character);
|
||||
@ -468,12 +483,14 @@ class RegExpParserImpl final {
|
||||
RegExpTree* ParseClassUnion(const RegExpBuilder* builder, bool is_negated,
|
||||
RegExpTree* first_operand,
|
||||
ClassSetOperandType first_operand_type,
|
||||
ZoneList<CharacterRange>* ranges);
|
||||
ZoneList<CharacterRange>* ranges,
|
||||
CharacterClassStrings* strings);
|
||||
RegExpTree* ParseClassIntersection(const RegExpBuilder* builder,
|
||||
bool is_negated,
|
||||
RegExpTree* first_operand);
|
||||
bool is_negated, RegExpTree* first_operand,
|
||||
ClassSetOperandType first_operand_type);
|
||||
RegExpTree* ParseClassSubtraction(const RegExpBuilder* builder,
|
||||
bool is_negated, RegExpTree* first_operand);
|
||||
bool is_negated, RegExpTree* first_operand,
|
||||
ClassSetOperandType first_operand_type);
|
||||
RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
|
||||
|
||||
base::uc32 ParseOctalLiteral();
|
||||
@ -498,15 +515,15 @@ class RegExpParserImpl final {
|
||||
int captures_started() const { return captures_started_; }
|
||||
int position() const { return next_pos_ - 1; }
|
||||
bool failed() const { return failed_; }
|
||||
RegExpFlags flags() const { return top_level_flags_; }
|
||||
bool IsUnicodeMode() const {
|
||||
// Either /v or /u enable UnicodeMode
|
||||
// TODO(v8:11935): Change permalink once proposal is in stage 4.
|
||||
// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-parsepattern
|
||||
return IsUnicode(top_level_flags_) || IsUnicodeSets(top_level_flags_) ||
|
||||
force_unicode_;
|
||||
return IsUnicode(flags()) || IsUnicodeSets(flags()) || force_unicode_;
|
||||
}
|
||||
bool unicode_sets() const { return IsUnicodeSets(top_level_flags_); }
|
||||
bool ignore_case() const { return IsIgnoreCase(top_level_flags_); }
|
||||
bool unicode_sets() const { return IsUnicodeSets(flags()); }
|
||||
bool ignore_case() const { return IsIgnoreCase(flags()); }
|
||||
|
||||
static bool IsSyntaxCharacterOrSlash(base::uc32 c);
|
||||
static bool IsClassSetSyntaxCharacter(base::uc32 c);
|
||||
@ -869,7 +886,7 @@ template <class CharT>
|
||||
RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
|
||||
// Used to store current state while parsing subexpressions.
|
||||
RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
|
||||
0, nullptr, top_level_flags_, zone());
|
||||
0, nullptr, flags(), zone());
|
||||
RegExpParserState* state = &initial_state;
|
||||
// Cache the builder in a local variable for quick access.
|
||||
RegExpBuilder* builder = initial_state.builder();
|
||||
@ -2377,10 +2394,27 @@ bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape(
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
// Add |string| to |ranges| if length of |string| == 1, otherwise add |string|
|
||||
// to |strings|.
|
||||
void AddClassString(ZoneList<base::uc32>* normalized_string,
|
||||
RegExpTree* regexp_string, ZoneList<CharacterRange>* ranges,
|
||||
CharacterClassStrings* strings, Zone* zone) {
|
||||
if (normalized_string->length() == 1) {
|
||||
ranges->Add(CharacterRange::Singleton(normalized_string->at(0)), zone);
|
||||
} else {
|
||||
strings->emplace(normalized_string->ToVector(), regexp_string);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// TODO(v8:11935): Change permalink once proposal is in stage 4.
|
||||
// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassStringDisjunction
|
||||
template <class CharT>
|
||||
RegExpTree* RegExpParserImpl<CharT>::ParseClassStringDisjunction() {
|
||||
RegExpTree* RegExpParserImpl<CharT>::ParseClassStringDisjunction(
|
||||
ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings) {
|
||||
DCHECK(unicode_sets());
|
||||
DCHECK_EQ(current(), '\\');
|
||||
DCHECK_EQ(Next(), 'q');
|
||||
@ -2391,73 +2425,98 @@ RegExpTree* RegExpParserImpl<CharT>::ParseClassStringDisjunction() {
|
||||
}
|
||||
Advance();
|
||||
|
||||
// TODO(pthier, v8:11935): Implement.
|
||||
return ReportError(RegExpError::kInvalidCharacterClass);
|
||||
ZoneList<base::uc32>* string =
|
||||
zone()->template New<ZoneList<base::uc32>>(4, zone());
|
||||
RegExpTextBuilder::SmallRegExpTreeVector string_storage(
|
||||
ZoneAllocator<RegExpTree*>{zone()});
|
||||
RegExpTextBuilder string_builder(zone(), &string_storage, flags());
|
||||
|
||||
while (has_more() && current() != '}') {
|
||||
if (current() == '|') {
|
||||
AddClassString(string, string_builder.ToRegExp(), ranges, strings,
|
||||
zone());
|
||||
string = zone()->template New<ZoneList<base::uc32>>(4, zone());
|
||||
string_storage.clear();
|
||||
Advance();
|
||||
} else {
|
||||
base::uc32 c = ParseClassSetCharacter(CHECK_FAILED);
|
||||
if (ignore_case()) {
|
||||
#ifdef V8_INTL_SUPPORT
|
||||
c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
|
||||
#else
|
||||
c = AsciiAlphaToLower(c);
|
||||
#endif
|
||||
}
|
||||
string->Add(c, zone());
|
||||
string_builder.AddUnicodeCharacter(c);
|
||||
}
|
||||
}
|
||||
|
||||
AddClassString(string, string_builder.ToRegExp(), ranges, strings, zone());
|
||||
|
||||
// We don't need to handle missing closing '}' here.
|
||||
// If the character class is correctly closed, ParseClassSetCharacter will
|
||||
// report an error.
|
||||
DCHECK_EQ(current(), '}');
|
||||
Advance();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// TODO(v8:11935): Change permalink once proposal is in stage 4.
|
||||
// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetOperand
|
||||
// Tree returned based on type_out:
|
||||
// * kClassStringDisjunction: RegExpAlternative | RegExpAtom
|
||||
// * kNestedClass: RegExpClassSetExpression
|
||||
// * For all other types: RegExpClassRanges
|
||||
// * For all other types: RegExpClassSetOperand
|
||||
template <class CharT>
|
||||
RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
|
||||
const RegExpBuilder* builder, ClassSetOperandType* type_out) {
|
||||
ZoneList<CharacterRange>* ranges =
|
||||
zone()->template New<ZoneList<CharacterRange>>(1, zone());
|
||||
CharacterClassStrings* strings =
|
||||
zone()->template New<CharacterClassStrings>(zone());
|
||||
RegExpTree* tree =
|
||||
ParseClassSetOperand(builder, type_out, ranges CHECK_FAILED);
|
||||
ParseClassSetOperand(builder, type_out, ranges, strings CHECK_FAILED);
|
||||
DCHECK_IMPLIES(*type_out != ClassSetOperandType::kNestedClass,
|
||||
tree == nullptr);
|
||||
DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,
|
||||
ranges->length() == 1);
|
||||
DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,
|
||||
tree == nullptr);
|
||||
DCHECK_IMPLIES(*type_out == ClassSetOperandType::kCharacterClassEscape,
|
||||
!ranges->is_empty());
|
||||
DCHECK_IMPLIES(*type_out == ClassSetOperandType::kCharacterClassEscape,
|
||||
tree == nullptr);
|
||||
DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassStringDisjunction,
|
||||
ranges->is_empty());
|
||||
DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassStringDisjunction,
|
||||
tree->IsAtom() || tree->IsAlternative());
|
||||
strings->empty());
|
||||
DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
|
||||
ranges->is_empty());
|
||||
DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
|
||||
strings->empty());
|
||||
DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
|
||||
tree->IsClassSetExpression());
|
||||
// ClassSetRange is only used within ClassSetUnion().
|
||||
DCHECK_NE(*type_out, ClassSetOperandType::kClassSetRange);
|
||||
// There are no restrictions for kCharacterClassEscape.
|
||||
// CharacterClassEscape includes \p{}, which can contain ranges, strings or
|
||||
// both and \P{}, which could contain nothing (i.e. \P{Any}).
|
||||
if (tree == nullptr) {
|
||||
tree = zone()->template New<RegExpClassRanges>(zone(), ranges);
|
||||
tree = zone()->template New<RegExpClassSetOperand>(ranges, strings);
|
||||
}
|
||||
return tree;
|
||||
}
|
||||
|
||||
// TODO(v8:11935): Change permalink once proposal is in stage 4.
|
||||
// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetOperand
|
||||
// Based on |type_out| either a tree is returned or ranges modified (never both).
|
||||
// Tree returned based on type_out:
|
||||
// * kClassStringDisjunction: RegExpAlternative | RegExpAtom
|
||||
// * kNestedClass: RegExpClassSetExpression
|
||||
// For all other types, ranges is modified and nullptr is returned.
|
||||
// Based on |type_out| either a tree is returned or ranges/strings modified.
|
||||
// If a tree is returned, ranges/strings are not modified.
|
||||
// If |type_out| is kNestedClass, a tree of type RegExpClassSetExpression is
|
||||
// returned. For all other types, ranges is modified and nullptr is returned.
|
||||
template <class CharT>
|
||||
RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
|
||||
const RegExpBuilder* builder, ClassSetOperandType* type_out,
|
||||
ZoneList<CharacterRange>* ranges) {
|
||||
ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings) {
|
||||
DCHECK(unicode_sets());
|
||||
const base::uc32 c = current();
|
||||
base::uc32 c = current();
|
||||
if (c == '\\') {
|
||||
base::uc32 next = Next();
|
||||
switch (next) {
|
||||
case 'b':
|
||||
*type_out = ClassSetOperandType::kClassSetCharacter;
|
||||
ranges->Add(CharacterRange::Singleton('\b'), zone());
|
||||
Advance(2);
|
||||
return nullptr;
|
||||
case 'q':
|
||||
*type_out = ClassSetOperandType::kClassStringDisjunction;
|
||||
return ParseClassStringDisjunction();
|
||||
case kEndMarker:
|
||||
return ReportError(RegExpError::kEscapeAtEndOfPattern);
|
||||
const base::uc32 next = Next();
|
||||
if (next == 'q') {
|
||||
*type_out = ClassSetOperandType::kClassStringDisjunction;
|
||||
ParseClassStringDisjunction(ranges, strings CHECK_FAILED);
|
||||
return nullptr;
|
||||
}
|
||||
static constexpr InClassEscapeState kInClassEscape =
|
||||
InClassEscapeState::kInClass;
|
||||
@ -2467,44 +2526,86 @@ RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
|
||||
*type_out = ClassSetOperandType::kCharacterClassEscape;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
bool dummy = false; // Unused.
|
||||
base::uc32 escaped_char = ParseCharacterEscape(kInClassEscape, &dummy);
|
||||
*type_out = ClassSetOperandType::kClassSetCharacter;
|
||||
ranges->Add(CharacterRange::Singleton(escaped_char), zone());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (c == '[') {
|
||||
*type_out = ClassSetOperandType::kNestedClass;
|
||||
return ParseCharacterClass(builder);
|
||||
}
|
||||
if (IsClassSetSyntaxCharacter(c)) {
|
||||
return ReportError(RegExpError::kInvalidCharacterInClass);
|
||||
}
|
||||
if (IsClassSetReservedDoublePunctuator(c)) {
|
||||
return ReportError(RegExpError::kInvalidClassSetOperation);
|
||||
}
|
||||
|
||||
*type_out = ClassSetOperandType::kClassSetCharacter;
|
||||
c = ParseClassSetCharacter(CHECK_FAILED);
|
||||
ranges->Add(CharacterRange::Singleton(c), zone());
|
||||
Advance();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
base::uc32 RegExpParserImpl<CharT>::ParseClassSetCharacter() {
|
||||
DCHECK(unicode_sets());
|
||||
const base::uc32 c = current();
|
||||
if (c == '\\') {
|
||||
const base::uc32 next = Next();
|
||||
switch (next) {
|
||||
case 'b':
|
||||
Advance(2);
|
||||
return '\b';
|
||||
case kEndMarker:
|
||||
ReportError(RegExpError::kEscapeAtEndOfPattern);
|
||||
return 0;
|
||||
}
|
||||
static constexpr InClassEscapeState kInClassEscape =
|
||||
InClassEscapeState::kInClass;
|
||||
|
||||
bool dummy = false; // Unused.
|
||||
return ParseCharacterEscape(kInClassEscape, &dummy);
|
||||
}
|
||||
if (IsClassSetSyntaxCharacter(c)) {
|
||||
ReportError(RegExpError::kInvalidCharacterInClass);
|
||||
return 0;
|
||||
}
|
||||
if (IsClassSetReservedDoublePunctuator(c)) {
|
||||
ReportError(RegExpError::kInvalidClassSetOperation);
|
||||
return 0;
|
||||
}
|
||||
Advance();
|
||||
return c;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
bool MayContainStrings(ClassSetOperandType type, RegExpTree* operand) {
|
||||
switch (type) {
|
||||
case ClassSetOperandType::kClassSetCharacter:
|
||||
case ClassSetOperandType::kClassSetRange:
|
||||
return false;
|
||||
case ClassSetOperandType::kCharacterClassEscape:
|
||||
case ClassSetOperandType::kClassStringDisjunction:
|
||||
return operand->AsClassSetOperand()->has_strings();
|
||||
case ClassSetOperandType::kNestedClass:
|
||||
if (operand->IsClassRanges()) return false;
|
||||
return operand->AsClassSetExpression()->may_contain_strings();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// TODO(v8:11935): Change permalink once proposal is in stage 4.
|
||||
// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassUnion
|
||||
template <class CharT>
|
||||
RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion(
|
||||
const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
|
||||
ClassSetOperandType first_operand_type, ZoneList<CharacterRange>* ranges) {
|
||||
ClassSetOperandType first_operand_type, ZoneList<CharacterRange>* ranges,
|
||||
CharacterClassStrings* strings) {
|
||||
DCHECK(unicode_sets());
|
||||
ZoneList<RegExpTree*>* operands =
|
||||
zone()->template New<ZoneList<RegExpTree*>>(2, zone());
|
||||
bool may_contain_strings = false;
|
||||
// Add the lhs to operands if necessary.
|
||||
// Either the lhs values were added to |ranges| (in which case |first_operand|
|
||||
// is null), or the lhs was evaluated to a tree and passed as |first_operand|
|
||||
// (in which case |ranges| are empty).
|
||||
DCHECK_EQ(first_operand != nullptr, ranges->is_empty());
|
||||
// Either the lhs values were added to |ranges|/|strings| (in which case
|
||||
// |first_operand| is nullptr), or the lhs was evaluated to a tree and passed
|
||||
// as |first_operand| (in which case |ranges| and |strings| are empty).
|
||||
if (first_operand != nullptr) {
|
||||
may_contain_strings = MayContainStrings(first_operand_type, first_operand);
|
||||
operands->Add(first_operand, zone());
|
||||
}
|
||||
ClassSetOperandType last_type = first_operand_type;
|
||||
@ -2531,7 +2632,7 @@ RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion(
|
||||
if (last_type != ClassSetOperandType::kClassSetCharacter) {
|
||||
return ReportError(RegExpError::kInvalidCharacterClass);
|
||||
}
|
||||
ParseClassSetOperand(builder, &last_type, ranges CHECK_FAILED);
|
||||
ParseClassSetOperand(builder, &last_type, ranges, strings CHECK_FAILED);
|
||||
if (last_type != ClassSetOperandType::kClassSetCharacter) {
|
||||
return ReportError(RegExpError::kInvalidCharacterClass);
|
||||
}
|
||||
@ -2550,18 +2651,22 @@ RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion(
|
||||
last_type = ClassSetOperandType::kClassSetRange;
|
||||
} else {
|
||||
DCHECK_NE(current(), '-');
|
||||
RegExpTree* operand =
|
||||
ParseClassSetOperand(builder, &last_type, ranges CHECK_FAILED);
|
||||
RegExpTree* operand = ParseClassSetOperand(builder, &last_type, ranges,
|
||||
strings CHECK_FAILED);
|
||||
if (operand != nullptr) {
|
||||
may_contain_strings |= MayContainStrings(last_type, operand);
|
||||
// Add the range we started building as operand and reset the current
|
||||
// range.
|
||||
if (!ranges->is_empty()) {
|
||||
if (!ranges->is_empty() || !strings->empty()) {
|
||||
if (needs_case_folding) {
|
||||
CharacterRange::AddUnicodeCaseEquivalents(ranges, zone());
|
||||
}
|
||||
operands->Add(zone()->template New<RegExpClassRanges>(zone(), ranges),
|
||||
zone());
|
||||
may_contain_strings |= !strings->empty();
|
||||
operands->Add(
|
||||
zone()->template New<RegExpClassSetOperand>(ranges, strings),
|
||||
zone());
|
||||
ranges = zone()->template New<ZoneList<CharacterRange>>(2, zone());
|
||||
strings = zone()->template New<CharacterClassStrings>(zone());
|
||||
}
|
||||
operands->Add(operand, zone());
|
||||
}
|
||||
@ -2573,26 +2678,37 @@ RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion(
|
||||
}
|
||||
|
||||
// Add the range we started building as operand.
|
||||
if (!ranges->is_empty()) {
|
||||
if (!ranges->is_empty() || !strings->empty()) {
|
||||
if (needs_case_folding) {
|
||||
CharacterRange::AddUnicodeCaseEquivalents(ranges, zone());
|
||||
}
|
||||
operands->Add(zone()->template New<RegExpClassRanges>(zone(), ranges),
|
||||
may_contain_strings |= !strings->empty();
|
||||
operands->Add(zone()->template New<RegExpClassSetOperand>(ranges, strings),
|
||||
zone());
|
||||
}
|
||||
|
||||
DCHECK_EQ(current(), ']');
|
||||
Advance();
|
||||
|
||||
if (is_negated && may_contain_strings) {
|
||||
return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
|
||||
}
|
||||
|
||||
return zone()->template New<RegExpClassSetExpression>(
|
||||
RegExpClassSetExpression::OperationType::kUnion, is_negated, operands);
|
||||
RegExpClassSetExpression::OperationType::kUnion, is_negated,
|
||||
may_contain_strings, operands);
|
||||
}
|
||||
|
||||
// TODO(v8:11935): Change permalink once proposal is in stage 4.
|
||||
// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassIntersection
|
||||
template <class CharT>
|
||||
RegExpTree* RegExpParserImpl<CharT>::ParseClassIntersection(
|
||||
const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand) {
|
||||
const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
|
||||
ClassSetOperandType first_operand_type) {
|
||||
DCHECK(unicode_sets());
|
||||
DCHECK(current() == '&' && Next() == '&');
|
||||
bool may_contain_strings =
|
||||
MayContainStrings(first_operand_type, first_operand);
|
||||
ZoneList<RegExpTree*>* operands =
|
||||
zone()->template New<ZoneList<RegExpTree*>>(2, zone());
|
||||
operands->Add(first_operand, zone());
|
||||
@ -2606,27 +2722,38 @@ RegExpTree* RegExpParserImpl<CharT>::ParseClassIntersection(
|
||||
return ReportError(RegExpError::kInvalidCharacterInClass);
|
||||
}
|
||||
|
||||
ClassSetOperandType dummy; // unused
|
||||
RegExpTree* operand = ParseClassSetOperand(builder, &dummy CHECK_FAILED);
|
||||
ClassSetOperandType operand_type;
|
||||
RegExpTree* operand =
|
||||
ParseClassSetOperand(builder, &operand_type CHECK_FAILED);
|
||||
may_contain_strings &= MayContainStrings(operand_type, operand);
|
||||
operands->Add(operand, zone());
|
||||
}
|
||||
if (!has_more()) {
|
||||
return ReportError(RegExpError::kUnterminatedCharacterClass);
|
||||
}
|
||||
if (is_negated && may_contain_strings) {
|
||||
return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
|
||||
}
|
||||
DCHECK_EQ(current(), ']');
|
||||
Advance();
|
||||
return zone()->template New<RegExpClassSetExpression>(
|
||||
RegExpClassSetExpression::OperationType::kIntersection, is_negated,
|
||||
operands);
|
||||
may_contain_strings, operands);
|
||||
}
|
||||
|
||||
// TODO(v8:11935): Change permalink once proposal is in stage 4.
|
||||
// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSubtraction
|
||||
template <class CharT>
|
||||
RegExpTree* RegExpParserImpl<CharT>::ParseClassSubtraction(
|
||||
const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand) {
|
||||
const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
|
||||
ClassSetOperandType first_operand_type) {
|
||||
DCHECK(unicode_sets());
|
||||
DCHECK(current() == '-' && Next() == '-');
|
||||
const bool may_contain_strings =
|
||||
MayContainStrings(first_operand_type, first_operand);
|
||||
if (is_negated && may_contain_strings) {
|
||||
return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
|
||||
}
|
||||
ZoneList<RegExpTree*>* operands =
|
||||
zone()->template New<ZoneList<RegExpTree*>>(2, zone());
|
||||
operands->Add(first_operand, zone());
|
||||
@ -2646,7 +2773,7 @@ RegExpTree* RegExpParserImpl<CharT>::ParseClassSubtraction(
|
||||
Advance();
|
||||
return zone()->template New<RegExpClassSetExpression>(
|
||||
RegExpClassSetExpression::OperationType::kSubtraction, is_negated,
|
||||
operands);
|
||||
may_contain_strings, operands);
|
||||
}
|
||||
|
||||
// https://tc39.es/ecma262/#prod-CharacterClass
|
||||
@ -2684,27 +2811,34 @@ RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(
|
||||
character_class_flags);
|
||||
} else {
|
||||
ClassSetOperandType operand_type;
|
||||
RegExpTree* operand =
|
||||
ParseClassSetOperand(builder, &operand_type, ranges CHECK_FAILED);
|
||||
CharacterClassStrings* strings =
|
||||
zone()->template New<CharacterClassStrings>(zone());
|
||||
RegExpTree* operand = ParseClassSetOperand(builder, &operand_type, ranges,
|
||||
strings CHECK_FAILED);
|
||||
switch (current()) {
|
||||
case '-':
|
||||
if (Next() == '-') {
|
||||
if (operand == nullptr) {
|
||||
operand = zone()->template New<RegExpClassRanges>(zone(), ranges);
|
||||
operand =
|
||||
zone()->template New<RegExpClassSetOperand>(ranges, strings);
|
||||
}
|
||||
return ParseClassSubtraction(builder, is_negated, operand);
|
||||
return ParseClassSubtraction(builder, is_negated, operand,
|
||||
operand_type);
|
||||
}
|
||||
// ClassSetRange is handled in ParseClassUnion().
|
||||
break;
|
||||
case '&':
|
||||
if (Next() == '&') {
|
||||
if (operand == nullptr) {
|
||||
operand = zone()->template New<RegExpClassRanges>(zone(), ranges);
|
||||
operand =
|
||||
zone()->template New<RegExpClassSetOperand>(ranges, strings);
|
||||
}
|
||||
return ParseClassIntersection(builder, is_negated, operand);
|
||||
return ParseClassIntersection(builder, is_negated, operand,
|
||||
operand_type);
|
||||
}
|
||||
}
|
||||
return ParseClassUnion(builder, is_negated, operand, operand_type, ranges);
|
||||
return ParseClassUnion(builder, is_negated, operand, operand_type, ranges,
|
||||
strings);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -48,10 +48,29 @@ assertEarlyError('/[~~]/v');
|
||||
assertEarlyError('/[a&&&]/v');
|
||||
assertEarlyError('/[&&&a]/v');
|
||||
|
||||
// Unterminated string disjunction.
|
||||
assertEarlyError('/[\q{foo]/v');
|
||||
assertEarlyError('/[\q{foo|]/v');
|
||||
|
||||
// Negating classes containing strings is not allowed.
|
||||
assertEarlyError('/[^\q{foo}]/v');
|
||||
assertEarlyError('/[^\q{}]/v'); // Empty string counts as string.
|
||||
assertEarlyError('/[^[\q{foo}]]/v');
|
||||
assertEarlyError('/[^[\p{Basic_Emoji}]/v');
|
||||
assertEarlyError('/[^\q{foo}&&\q{bar}]/v');
|
||||
assertEarlyError('/[^\q{foo}--\q{bar}]/v');
|
||||
// Exceptions when negating the class is allowed:
|
||||
// The "string" contains only single characters.
|
||||
/[^\q{a|b|c}]/v;
|
||||
// Not all operands of an intersection contain strings.
|
||||
/[^\q{foo}&&\q{bar}&&a]/v;
|
||||
// The first operand of a subtraction doesn't contain strings.
|
||||
/[^a--\q{foo}--\q{bar}]/v;
|
||||
|
||||
const allAscii = Array.from(
|
||||
{length: 127}, (v, i) => { return String.fromCharCode(i); });
|
||||
|
||||
function check(re, expectMatch, expectNoMatch) {
|
||||
function check(re, expectMatch, expectNoMatch = [], negationValid = true) {
|
||||
if (expectNoMatch === undefined) {
|
||||
const expectSet = new Set(expectMatch.map(val => {
|
||||
return (typeof val == 'number') ? String(val) : val; }));
|
||||
@ -63,14 +82,22 @@ function check(re, expectMatch, expectNoMatch) {
|
||||
for (const noMatch of expectNoMatch) {
|
||||
assertFalse(re.test(noMatch), `${re}.test(${noMatch})`);
|
||||
}
|
||||
// Nest the current RegExp in a negated class and check expectations are
|
||||
// inverted.
|
||||
const inverted = new RegExp(`[^${re.source}]`, re.flags);
|
||||
for (const match of expectMatch) {
|
||||
assertFalse(inverted.test(match), `${inverted}.test(${match})`);
|
||||
}
|
||||
for (const noMatch of expectNoMatch) {
|
||||
assertTrue(inverted.test(noMatch), `${inverted}.test(${noMatch})`);
|
||||
if (!negationValid) {
|
||||
// Negation of classes containing strings is an error.
|
||||
const negated = `[^${re.source}]`;
|
||||
assertThrows(() => { new RegExp(negated, `${re.flags}`); }, SyntaxError,
|
||||
`Invalid regular expression: /${negated}/: ` +
|
||||
`Negated character class may contain strings`);
|
||||
} else {
|
||||
// Nest the current RegExp in a negated class and check expectations are
|
||||
// inverted.
|
||||
const inverted = new RegExp(`[^${re.source}]`, re.flags);
|
||||
for (const match of expectMatch) {
|
||||
assertFalse(inverted.test(match), `${inverted}.test(${match})`);
|
||||
}
|
||||
for (const noMatch of expectNoMatch) {
|
||||
assertTrue(inverted.test(noMatch), `${inverted}.test(${noMatch})`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -126,6 +153,41 @@ check(/[Ā-č]/v, Array.from('ĀāĂ㥹Ćć'), Array.from('abc'));
|
||||
check(/[ĀĂĄĆ]/vi, Array.from('ĀāĂ㥹Ćć'), Array.from('abc'));
|
||||
check(/[āăąć]/vi, Array.from('ĀāĂ㥹Ćć'), Array.from('abc'));
|
||||
|
||||
// String disjunctions
|
||||
check(/[\q{foo|bar|0|5}]/v, ['foo', 'bar', 0, 5], ['fo', 'baz'], false)
|
||||
check(/[\q{foo|bar}[05]]/v, ['foo', 'bar', 0, 5], ['fo', 'baz'], false)
|
||||
check(/[\q{foo|bar|0|5}&&\q{bar}]/v, ['bar'], ['foo', 0, 5, 'fo', 'baz'], false)
|
||||
// The second operand of the intersection doesn't contain strings, so the result
|
||||
// will not contain strings and therefore negation is valid.
|
||||
check(/[\q{foo|bar|0|5}&&\d]/v, [0, 5], ['foo', 'bar', 'fo', 'baz'], true)
|
||||
check(/[\q{foo|bar|0|5}--\q{foo}]/v, ['bar', 0, 5], ['foo', 'fo', 'baz'], false)
|
||||
check(/[\q{foo|bar|0|5}--\d]/v, ['foo', 'bar'], [0, 5, 'fo', 'baz'], false)
|
||||
|
||||
check(
|
||||
/[\q{foo|bar|0|5}&&\q{bAr}]/vi, ['bar', 'bAr', 'BAR'],
|
||||
['foo', 0, 5, 'fo', 'baz'], false)
|
||||
check(
|
||||
/[\q{foo|bar|0|5}--\q{FoO}]/vi, ['bar', 'bAr', 'BAR', 0, 5],
|
||||
['foo', 'FOO', 'fo', 'baz'], false)
|
||||
|
||||
check(/[\q{ĀĂĄĆ|AaAc}&&\q{āăąć}]/vi, ['ĀĂĄĆ', 'āăąć'], ['AaAc'], false);
|
||||
check(
|
||||
/[\q{ĀĂĄĆ|AaAc}--\q{āăąć}]/vi, ['AaAc', 'aAaC'], ['ĀĂĄĆ', 'āăąć'],
|
||||
false);
|
||||
|
||||
// An empty string disjunction matches nothing, but succeeds.
|
||||
let res = /[\q{}]/v.exec('foo');
|
||||
assertNotNull(res);
|
||||
assertEquals(1, res.length);
|
||||
assertEquals('', res[0]);
|
||||
|
||||
// Ensure longest strings are matched first.
|
||||
assertEquals(['xyz'], /[a-c\q{W|xy|xyz}]/v.exec('xyzabc'))
|
||||
assertEquals(['xyz'], /[a-c\q{W|xyz|xy}]/v.exec('xyzabc'))
|
||||
assertEquals(['xyz'], /[\q{W|xyz|xy}a-c]/v.exec('xyzabc'))
|
||||
// Empty string is last.
|
||||
assertEquals(['a'], /[\q{W|}a-c]/v.exec('abc'))
|
||||
|
||||
// Some more sophisticated tests taken from
|
||||
// https://v8.dev/features/regexp-v-flag
|
||||
assertFalse(/[\p{Script_Extensions=Greek}--π]/v.test('π'));
|
||||
|
@ -325,29 +325,17 @@
|
||||
'built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_ZWJ_Sequence-negative-P': [SKIP],
|
||||
'built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_ZWJ_Sequence-negative-u': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-class-difference-property-of-strings-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-class-difference-string-literal': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-property-of-strings-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-string-literal': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-property-of-strings-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-string-literal': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-class-escape-union-property-of-strings-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-class-escape-union-string-literal': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-class-intersection-property-of-strings-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-class-intersection-string-literal': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-class-union-property-of-strings-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-class-union-string-literal': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-difference-property-of-strings-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-difference-string-literal': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-intersection-property-of-strings-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-intersection-string-literal': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-property-of-strings-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-string-literal': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-property-of-strings-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-string-literal': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-property-of-strings-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-string-literal': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-union-property-of-strings-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-union-string-literal': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-difference-character-class-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-difference-character-class': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-difference-character': [SKIP],
|
||||
@ -366,24 +354,9 @@
|
||||
'built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-union-character-property-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-union-property-of-strings-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-union-string-literal': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-difference-character-class-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-difference-character-class': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-difference-character': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-difference-character-property-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-difference-property-of-strings-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-difference-string-literal': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-intersection-character-class-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-intersection-character-class': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-intersection-character': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-intersection-character-property-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-intersection-property-of-strings-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-intersection-string-literal': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-union-character-class-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-union-character-class': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-union-character': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-union-character-property-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-union-property-of-strings-escape': [SKIP],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-union-string-literal': [SKIP],
|
||||
|
||||
# https://bugs.chromium.org/p/v8/issues/detail?id=13173
|
||||
'built-ins/RegExp/duplicate-named-capturing-groups-syntax': [FAIL],
|
||||
@ -1049,18 +1022,24 @@
|
||||
'built-ins/RegExp/unicodeSets/generated/character-difference-character-property-escape': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-intersection-character-property-escape': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-class-escape': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-class': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-property-escape': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-string-literal': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-class-escape': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-class': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-property-escape': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-class': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-string-literal': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-class-escape': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-property-escape': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-union-character-property-escape': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-class': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-property-escape': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-property-escape': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-string-literal': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/character-union-character-property-escape': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-difference-character-property-escape': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-intersection-character-property-escape': [PASS,FAIL],
|
||||
'built-ins/RegExp/unicodeSets/generated/string-literal-union-character-property-escape': [PASS,FAIL],
|
||||
|
||||
# Unicode in identifiers.
|
||||
'language/identifiers/part-unicode-*': [FAIL],
|
||||
|
Loading…
Reference in New Issue
Block a user