[regexp] Unicode Sets: Implement parser changes and set operations

- Adapt parser for /v.
- Implement set operations (union, intersect, subtract).
- String disjunctions not implemented yet.

Bug: v8:11935
Change-Id: I1c3a6785500664dacc5d6562f49d7deed73ac15f
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3921517
Commit-Queue: Patrick Thier <pthier@chromium.org>
Reviewed-by: Jakob Linke <jgruber@chromium.org>
Reviewed-by: Mathias Bynens <mathias@chromium.org>
Reviewed-by: Kim-Anh Tran <kimanh@chromium.org>
Cr-Commit-Position: refs/heads/main@{#83647}
This commit is contained in:
pthier 2022-10-06 15:09:19 +02:00 committed by V8 LUCI CQ
parent 316dd0326c
commit a169bab6f7
11 changed files with 1156 additions and 170 deletions

View File

@ -56,6 +56,9 @@ String16 _descriptionForRegExpFlags(v8::Local<v8::RegExp> value) {
if (flags & v8::RegExp::Flags::kMultiline) result_string_builder.append('m');
if (flags & v8::RegExp::Flags::kDotAll) result_string_builder.append('s');
if (flags & v8::RegExp::Flags::kUnicode) result_string_builder.append('u');
if (flags & v8::RegExp::Flags::kUnicodeSets) {
result_string_builder.append('v');
}
if (flags & v8::RegExp::Flags::kSticky) result_string_builder.append('y');
return result_string_builder.toString();
}

View File

@ -243,6 +243,7 @@ String16 descriptionForRegExp(v8::Isolate* isolate,
if (flags & v8::RegExp::Flags::kMultiline) description.append('m');
if (flags & v8::RegExp::Flags::kDotAll) description.append('s');
if (flags & v8::RegExp::Flags::kUnicode) description.append('u');
if (flags & v8::RegExp::Flags::kUnicodeSets) description.append('v');
if (flags & v8::RegExp::Flags::kSticky) description.append('y');
return description.toString();
}

View File

@ -69,6 +69,11 @@ class CanBeHandledVisitor final : private RegExpVisitor {
return nullptr;
}
// Class sets (RegExpClassSet nodes) are not supported by this visitor's
// consumer: encountering one clears |result_|, marking the pattern as one that
// cannot be handled. The node itself is not inspected.
void* VisitClassSet(RegExpClassSet* node, void*) override {
result_ = false;
return nullptr;
}
void* VisitAssertion(RegExpAssertion* node, void*) override {
return nullptr;
}
@ -419,6 +424,11 @@ class CompileVisitor : private RegExpVisitor {
return nullptr;
}
// Compilation of class sets is not implemented yet; reaching this visitor
// method is treated as a programming error.
// NOTE(review): presumably patterns containing class sets are filtered out
// before compilation (CanBeHandledVisitor rejects them) — confirm.
void* VisitClassSet(RegExpClassSet* node, void*) override {
// TODO(v8:11935): Add support.
UNREACHABLE();
}
void* VisitAtom(RegExpAtom* node, void*) override {
for (base::uc16 c : node->data()) {
assembler_.ConsumeRange(c, c);

View File

@ -195,6 +195,27 @@ void* RegExpUnparser::VisitCharacterClass(RegExpCharacterClass* that,
return nullptr;
}
// Prints a class set as: an operation marker ("++" for union, "&&" for
// intersection, "--" for subtraction), an optional "^" when the set is
// negated, and then the space-separated operands enclosed in brackets.
void* RegExpUnparser::VisitClassSet(RegExpClassSet* that, void* data) {
  // Stays empty for an operation value outside the enum, in which case nothing
  // is printed for the marker.
  const char* operation_marker = "";
  switch (that->operation()) {
    case RegExpClassSet::OperationType::kUnion:
      operation_marker = "++";
      break;
    case RegExpClassSet::OperationType::kIntersection:
      operation_marker = "&&";
      break;
    case RegExpClassSet::OperationType::kSubtraction:
      operation_marker = "--";
      break;
  }
  os_ << operation_marker;
  if (that->is_negated()) {
    os_ << "^";
  }
  os_ << "[";
  const ZoneList<RegExpTree*>* const operand_list = that->operands();
  for (int index = 0; index < operand_list->length(); ++index) {
    // Separate operands with a single space; none before the first operand.
    if (index != 0) {
      os_ << " ";
    }
    operand_list->at(index)->Accept(this, data);
  }
  os_ << "]";
  return nullptr;
}
void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) {
switch (that->assertion_type()) {

View File

@ -19,6 +19,7 @@ namespace internal {
VISIT(Alternative) \
VISIT(Assertion) \
VISIT(CharacterClass) \
VISIT(ClassSet) \
VISIT(Atom) \
VISIT(Quantifier) \
VISIT(Capture) \
@ -117,29 +118,49 @@ class CharacterRange {
StandardCharacterSet standard_character_set,
ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents,
Zone* zone);
// Add case equivalents to ranges. Only used for /i, not for /ui or /vi, as
// the semantics for unicode mode are slightly different.
// See https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch Note 4.
V8_EXPORT_PRIVATE static void AddCaseEquivalents(
Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges,
bool is_one_byte);
// Add case equivalent code points to ranges. Only used for /ui and /vi, not
// for /i, as the semantics for non-unicode mode are slightly different.
// See https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch Note 4.
static void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
Zone* zone);
bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; }
base::uc32 from() const { return from_; }
base::uc32 to() const { return to_; }
bool IsEverything(base::uc32 max) const { return from_ == 0 && to_ >= max; }
bool IsSingleton() const { return from_ == to_; }
// Whether a range list is in canonical form: Ranges ordered by from value,
// and ranges non-overlapping and non-adjacent.
V8_EXPORT_PRIVATE static bool IsCanonical(ZoneList<CharacterRange>* ranges);
V8_EXPORT_PRIVATE static bool IsCanonical(
const ZoneList<CharacterRange>* ranges);
// Convert range list to canonical form. The characters covered by the ranges
// will still be the same, but no character is in more than one range, and
// adjacent ranges are merged. The resulting list may be shorter than the
// original, but cannot be longer.
static void Canonicalize(ZoneList<CharacterRange>* ranges);
// Negate the contents of a character range in canonical form.
static void Negate(ZoneList<CharacterRange>* src,
static void Negate(const ZoneList<CharacterRange>* src,
ZoneList<CharacterRange>* dst, Zone* zone);
// Intersect the contents of two character ranges in canonical form.
static void Intersect(const ZoneList<CharacterRange>* lhs,
const ZoneList<CharacterRange>* rhs,
ZoneList<CharacterRange>* dst, Zone* zone);
// Subtract the contents of |to_remove| from the contents of |src|.
static void Subtract(const ZoneList<CharacterRange>* src,
const ZoneList<CharacterRange>* to_remove,
ZoneList<CharacterRange>* dst, Zone* zone);
// Remove all ranges outside the one-byte range.
static void ClampToOneByte(ZoneList<CharacterRange>* ranges);
// Checks if two ranges (both need to be canonical) are equal.
static bool Equals(const ZoneList<CharacterRange>* lhs,
const ZoneList<CharacterRange>* rhs);
private:
CharacterRange(base::uc32 from, base::uc32 to) : from_(from), to_(to) {}
@ -150,6 +171,13 @@ class CharacterRange {
base::uc32 to_ = 0;
};
// Two ranges are equal when both of their (inclusive) bounds agree.
inline bool operator==(const CharacterRange& lhs, const CharacterRange& rhs) {
  const bool same_start = lhs.from() == rhs.from();
  return same_start && lhs.to() == rhs.to();
}
// Inequality is defined as the negation of operator== for CharacterRange.
inline bool operator!=(const CharacterRange& lhs, const CharacterRange& rhs) {
  return !(lhs == rhs);
}
#define DECL_BOILERPLATE(Name) \
void* Accept(RegExpVisitor* visitor, void* data) override; \
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) \
@ -329,6 +357,47 @@ class RegExpCharacterClass final : public RegExpTree {
CharacterClassFlags character_class_flags_;
};
// AST node describing a set operation (union, intersection or subtraction)
// over a list of operands, optionally negated. When converted to a node it is
// first flattened into a plain RegExpCharacterClass (see ToCharacterClass).
class RegExpClassSet final : public RegExpTree {
public:
// The set operation that combines the operands of a node.
enum class OperationType { kUnion, kIntersection, kSubtraction };
// |operands| is stored as passed (no copy is made). |is_negated| requests the
// complement of the combined operands.
RegExpClassSet(OperationType op, bool is_negated,
ZoneList<RegExpTree*>* operands)
: operation_(op), is_negated_(is_negated), operands_(operands) {}
DECL_BOILERPLATE(ClassSet);
bool IsTextElement() override { return true; }
// At least 1 character is consumed.
int min_match() override { return 1; }
// Up to two code points might be consumed.
int max_match() override { return 2; }
OperationType operation() const { return operation_; }
bool is_negated() const { return is_negated_; }
const ZoneList<RegExpTree*>* operands() const { return operands_; }
private:
// Evaluates all set operations of the tree rooted at this node and wraps the
// resulting ranges in a newly allocated RegExpCharacterClass.
RegExpCharacterClass* ToCharacterClass(Zone* zone);
// Recursively evaluates the tree rooted at |root|, computing the valid
// CharacterRanges after applying all set operations and storing the result in
// |result_ranges|. |temp_ranges| is a list used for intermediate results,
// passed as a parameter to avoid allocating new lists all the time. It is
// expected to be empty on entry and is empty again on return.
static void ComputeCharacterRanges(RegExpTree* root,
ZoneList<CharacterRange>* result_ranges,
ZoneList<CharacterRange>* temp_ranges,
Zone* zone);
const OperationType operation_;
const bool is_negated_;
ZoneList<RegExpTree*>* operands_ = nullptr;
#ifdef ENABLE_SLOW_DCHECKS
// Cache ranges for each node during computation for (slow) DCHECKs.
ZoneList<CharacterRange>* ranges_ = nullptr;
#endif
};
class RegExpAtom final : public RegExpTree {
public:
explicit RegExpAtom(base::Vector<const base::uc16> data) : data_(data) {}

View File

@ -419,9 +419,23 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
}
void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
} // namespace
// TODO(pthier, v8:11935): We use this method to implement
// MaybeSimpleCaseFolding
// TODO(v8:11935): Change to permalink once proposal is in stage 4.
// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-maybesimplecasefolding
// which is slightly different. The main difference is that we retain original
// characters and add case equivalents, whereas according to the spec original
// characters should be replaced with their case equivalent.
// This shouldn't make a difference for correctness, but we could potentially
// create smaller character classes for unicode sets.
// static
void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
Zone* zone) {
#ifdef V8_INTL_SUPPORT
DCHECK(CharacterRange::IsCanonical(ranges));
DCHECK(IsCanonical(ranges));
// Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
// See also https://crbug.com/v8/6727.
@ -444,16 +458,13 @@ void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
// we end up with only simple and common case mappings.
set.removeAllStrings();
for (int i = 0; i < set.getRangeCount(); i++) {
ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
zone);
ranges->Add(Range(set.getRangeStart(i), set.getRangeEnd(i)), zone);
}
// No errors and everything we collected have been ranges.
CharacterRange::Canonicalize(ranges);
Canonicalize(ranges);
#endif // V8_INTL_SUPPORT
}
} // namespace
RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
set_.Canonicalize();
@ -461,7 +472,7 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
ZoneList<CharacterRange>* ranges = this->ranges(zone);
if (NeedsUnicodeCaseEquivalents(compiler->flags())) {
AddUnicodeCaseEquivalents(ranges, zone);
CharacterRange::AddUnicodeCaseEquivalents(ranges, zone);
}
if (!IsEitherUnicode(compiler->flags()) || compiler->one_byte() ||
@ -470,6 +481,17 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
}
if (is_negated()) {
// With /v, character classes are never negated.
// TODO(v8:11935): Change permalink once proposal is in stage 4.
// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-compileatom
// Atom :: CharacterClass
// 4. Assert: cc.[[Invert]] is false.
// Instead the complement is created when evaluating the class set.
// The only exception is the "nothing range" (negated everything), which is
// internally created for an empty set.
DCHECK_IMPLIES(
IsUnicodeSets(compiler->flags()),
ranges->length() == 1 && ranges->first().IsEverything(kMaxCodePoint));
ZoneList<CharacterRange>* negated =
zone->New<ZoneList<CharacterRange>>(2, zone);
CharacterRange::Negate(ranges, negated, zone);
@ -505,6 +527,11 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
return result;
}
// A class set is not compiled directly: it is first flattened into an
// equivalent RegExpCharacterClass, which then produces the actual node.
RegExpNode* RegExpClassSet::ToNode(RegExpCompiler* compiler,
                                   RegExpNode* on_success) {
  Zone* const zone = compiler->zone();
  RegExpCharacterClass* const flattened = ToCharacterClass(zone);
  return flattened->ToNode(compiler, on_success);
}
namespace {
int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
@ -1359,7 +1386,7 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
#endif // V8_INTL_SUPPORT
}
bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
bool CharacterRange::IsCanonical(const ZoneList<CharacterRange>* ranges) {
DCHECK_NOT_NULL(ranges);
int n = ranges->length();
if (n <= 1) return true;
@ -1463,6 +1490,129 @@ void CharacterSet::Canonicalize() {
CharacterRange::Canonicalize(ranges_);
}
// Evaluates every set operation in the tree rooted at this node and returns a
// zone-allocated character class holding the resulting ranges.
RegExpCharacterClass* RegExpClassSet::ToCharacterClass(Zone* zone) {
  // Receives the final ranges of the whole tree.
  auto* const computed_ranges = zone->New<ZoneList<CharacterRange>>(2, zone);
  // Scratch list shared by all recursion levels of the computation.
  auto* const scratch_ranges = zone->New<ZoneList<CharacterRange>>(2, zone);
  ComputeCharacterRanges(this, computed_ranges, scratch_ranges, zone);
  return zone->New<RegExpCharacterClass>(zone, computed_ranges);
}
// static
// Recursively evaluates the set expression rooted at |root| and appends the
// resulting ranges to |result_ranges|.
//   root:          a RegExpCharacterClass (leaf) or a RegExpClassSet.
//   result_ranges: receives the computed ranges.
//   temp_ranges:   scratch list; must be empty on entry and is left empty.
//   zone:          zone used for all list allocations.
void RegExpClassSet::ComputeCharacterRanges(
RegExpTree* root, ZoneList<CharacterRange>* result_ranges,
ZoneList<CharacterRange>* temp_ranges, Zone* zone) {
DCHECK_EQ(temp_ranges->length(), 0);
DCHECK(root->IsCharacterClass() || root->IsClassSet());
// Leaf: a plain (non-negated) character class contributes its own ranges,
// canonicalized in place first.
if (root->IsCharacterClass()) {
DCHECK(!root->AsCharacterClass()->is_negated());
ZoneList<CharacterRange>* ranges = root->AsCharacterClass()->ranges(zone);
CharacterRange::Canonicalize(ranges);
result_ranges->AddAll(*ranges, zone);
return;
}
RegExpClassSet* node = root->AsClassSet();
switch (node->operation()) {
case OperationType::kUnion: {
// Collect the ranges of every operand, then canonicalize once at the end.
ZoneList<CharacterRange>* op_ranges =
zone->template New<ZoneList<CharacterRange>>(2, zone);
for (int i = 0; i < node->operands()->length(); i++) {
RegExpTree* op = node->operands()->at(i);
ComputeCharacterRanges(op, op_ranges, temp_ranges, zone);
result_ranges->AddAll(*op_ranges, zone);
// Reuse |op_ranges| for the next operand.
op_ranges->Rewind(0);
}
CharacterRange::Canonicalize(result_ranges);
break;
}
case OperationType::kIntersection: {
// Seed the result with the first operand, then intersect it with each
// remaining operand in turn.
ZoneList<CharacterRange>* op_ranges =
zone->template New<ZoneList<CharacterRange>>(2, zone);
ComputeCharacterRanges(node->operands()->at(0), op_ranges, temp_ranges,
zone);
result_ranges->AddAll(*op_ranges, zone);
op_ranges->Rewind(0);
for (int i = 1; i < node->operands()->length(); i++) {
ComputeCharacterRanges(node->operands()->at(i), op_ranges, temp_ranges,
zone);
// Intersect writes into the (empty) scratch list; swapping the list
// objects makes that the new result, after which both helper lists are
// emptied for the next iteration.
CharacterRange::Intersect(result_ranges, op_ranges, temp_ranges, zone);
std::swap(*result_ranges, *temp_ranges);
temp_ranges->Rewind(0);
op_ranges->Rewind(0);
}
break;
}
case OperationType::kSubtraction: {
// Seed the result with the first operand, then remove each remaining
// operand from it in turn.
ZoneList<CharacterRange>* op_ranges =
zone->template New<ZoneList<CharacterRange>>(2, zone);
ComputeCharacterRanges(node->operands()->at(0), op_ranges, temp_ranges,
zone);
result_ranges->AddAll(*op_ranges, zone);
op_ranges->Rewind(0);
for (int i = 1; i < node->operands()->length(); i++) {
ComputeCharacterRanges(node->operands()->at(i), op_ranges, temp_ranges,
zone);
// Same scratch-list protocol as for the intersection case above.
CharacterRange::Subtract(result_ranges, op_ranges, temp_ranges, zone);
std::swap(*result_ranges, *temp_ranges);
temp_ranges->Rewind(0);
op_ranges->Rewind(0);
}
#ifdef ENABLE_SLOW_DCHECKS
// Check that the result is equal to subtracting the union of all RHS
// operands from the LHS operand.
// TODO(pthier): It is unclear whether this variant is faster or slower
// than subtracting multiple ranges in practice.
// Operand ranges are re-read here: character classes expose their ranges
// directly, class sets use the |ranges_| cache filled in at the end of their
// own evaluation.
ZoneList<CharacterRange>* lhs_range =
node->operands()->at(0)->IsCharacterClass()
? node->operands()->at(0)->AsCharacterClass()->ranges(zone)
: node->operands()->at(0)->AsClassSet()->ranges_;
ZoneList<CharacterRange>* rhs_union =
zone->template New<ZoneList<CharacterRange>>(2, zone);
for (int i = 1; i < node->operands()->length(); i++) {
ZoneList<CharacterRange>* op_range =
node->operands()->at(i)->IsCharacterClass()
? node->operands()->at(i)->AsCharacterClass()->ranges(zone)
: node->operands()->at(i)->AsClassSet()->ranges_;
rhs_union->AddAll(*op_range, zone);
}
CharacterRange::Canonicalize(rhs_union);
ZoneList<CharacterRange>* ranges_check =
zone->template New<ZoneList<CharacterRange>>(2, zone);
CharacterRange::Subtract(lhs_range, rhs_union, ranges_check, zone);
DCHECK(CharacterRange::Equals(result_ranges, ranges_check));
// Check that the result is equal to intersecting the LHS operand with the
// complemented union of all RHS operands
ZoneList<CharacterRange>* rhs_union_negated =
zone->template New<ZoneList<CharacterRange>>(rhs_union->length(),
zone);
CharacterRange::Negate(rhs_union, rhs_union_negated, zone);
ranges_check->Rewind(0);
CharacterRange::Intersect(lhs_range, rhs_union_negated, ranges_check,
zone);
DCHECK(CharacterRange::Equals(result_ranges, ranges_check));
#endif
break;
}
}
// A negated class set is the complement of the combined operands; it is
// computed into the scratch list and swapped into the result.
if (node->is_negated()) {
CharacterRange::Negate(result_ranges, temp_ranges, zone);
std::swap(*result_ranges, *temp_ranges);
temp_ranges->Rewind(0);
}
DCHECK_EQ(temp_ranges->length(), 0);
#ifdef ENABLE_SLOW_DCHECKS
// Cache results for DCHECKs.
node->ranges_ =
zone->template New<ZoneList<CharacterRange>>(*result_ranges, zone);
#endif
}
// static
void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
if (character_ranges->length() <= 1) return;
@ -1500,7 +1650,7 @@ void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
}
// static
void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
void CharacterRange::Negate(const ZoneList<CharacterRange>* ranges,
ZoneList<CharacterRange>* negated_ranges,
Zone* zone) {
DCHECK(CharacterRange::IsCanonical(ranges));
@ -1523,6 +1673,128 @@ void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
}
}
// static
void CharacterRange::Intersect(const ZoneList<CharacterRange>* lhs,
const ZoneList<CharacterRange>* rhs,
ZoneList<CharacterRange>* intersection,
Zone* zone) {
DCHECK(CharacterRange::IsCanonical(lhs));
DCHECK(CharacterRange::IsCanonical(rhs));
DCHECK_EQ(0, intersection->length());
int lhs_index = 0;
int rhs_index = 0;
while (lhs_index < lhs->length() && rhs_index < rhs->length()) {
// Skip non-overlapping ranges.
if (lhs->at(lhs_index).to() < rhs->at(rhs_index).from()) {
lhs_index++;
continue;
}
if (rhs->at(rhs_index).to() < lhs->at(lhs_index).from()) {
rhs_index++;
continue;
}
base::uc32 from =
std::max(lhs->at(lhs_index).from(), rhs->at(rhs_index).from());
base::uc32 to = std::min(lhs->at(lhs_index).to(), rhs->at(rhs_index).to());
intersection->Add(CharacterRange::Range(from, to), zone);
if (to == lhs->at(lhs_index).to()) {
lhs_index++;
} else {
rhs_index++;
}
}
DCHECK(IsCanonical(intersection));
}
namespace {
// Advance |index| and set |from| and |to| to the new range, if not out of
// bounds of |range|, otherwise |from| is set to a code point beyond the legal
// unicode character range.
void SafeAdvanceRange(const ZoneList<CharacterRange>* range, int* index,
base::uc32* from, base::uc32* to) {
++(*index);
if (*index < range->length()) {
*from = range->at(*index).from();
*to = range->at(*index).to();
} else {
*from = kMaxCodePoint + 1;
}
}
} // namespace
// static
// Computes |src| minus |to_remove| and appends the remaining ranges to
// |result|.
//   src:       canonical list of ranges to subtract from; may be empty.
//   to_remove: canonical list of ranges to remove; may be empty.
//   result:    must be empty on entry; canonical on return.
//   zone:      zone used when growing |result|.
// Both inputs are walked in lockstep. |from| and |to| hold the not yet emitted
// remainder of the current |src| range, which can be narrowed several times by
// consecutive |to_remove| ranges before it is added to |result|.
void CharacterRange::Subtract(const ZoneList<CharacterRange>* src,
                              const ZoneList<CharacterRange>* to_remove,
                              ZoneList<CharacterRange>* result, Zone* zone) {
  DCHECK(CharacterRange::IsCanonical(src));
  DCHECK(CharacterRange::IsCanonical(to_remove));
  DCHECK_EQ(0, result->length());

  // Subtracting from the empty set yields the empty set. Bail out before
  // src->at(0) is read below, which would be out of bounds for an empty list
  // (e.g. when the left-hand operand is a nested set that evaluated to
  // nothing).
  if (src->length() == 0) return;

  int src_index = 0;
  int to_remove_index = 0;
  base::uc32 from = src->at(src_index).from();
  base::uc32 to = src->at(src_index).to();
  while (src_index < src->length() && to_remove_index < to_remove->length()) {
    CharacterRange remove_range = to_remove->at(to_remove_index);
    if (remove_range.to() < from) {
      // (a) Non-overlapping case, ignore current to_remove range.
      //              |-------|
      // |-------|
      to_remove_index++;
    } else if (to < remove_range.from()) {
      // (b) Non-overlapping case, add full current range to result.
      // |-------|
      //              |-------|
      result->Add(CharacterRange::Range(from, to), zone);
      SafeAdvanceRange(src, &src_index, &from, &to);
    } else if (from >= remove_range.from() && to <= remove_range.to()) {
      // (c) Current to_remove range fully covers current range.
      //   |---|
      // |-------|
      SafeAdvanceRange(src, &src_index, &from, &to);
    } else if (from < remove_range.from() && to > remove_range.to()) {
      // (d) Split current range.
      // |-------|
      //   |---|
      result->Add(CharacterRange::Range(from, remove_range.from() - 1), zone);
      from = remove_range.to() + 1;
      to_remove_index++;
    } else if (from < remove_range.from()) {
      // (e) End current range.
      // |-------|
      //     |-------|
      to = remove_range.from() - 1;
      result->Add(CharacterRange::Range(from, to), zone);
      SafeAdvanceRange(src, &src_index, &from, &to);
    } else if (to > remove_range.to()) {
      // (f) Modify start of current range.
      //     |-------|
      // |-------|
      from = remove_range.to() + 1;
      to_remove_index++;
    } else {
      UNREACHABLE();
    }
  }
  // The last range needs special treatment after |to_remove| is exhausted, as
  // |from| might have been modified by the last |to_remove| range and |to| was
  // not yet known (i.e. cases d and f).
  // If |src| was exhausted instead, SafeAdvanceRange left |from| beyond the
  // legal code point range, so nothing is added here.
  if (from <= to) {
    result->Add(CharacterRange::Range(from, to), zone);
  }
  src_index++;

  // Add remaining ranges after |to_remove| is exhausted.
  for (; src_index < src->length(); src_index++) {
    result->Add(src->at(src_index), zone);
  }

  DCHECK(IsCanonical(result));
}
// static
void CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) {
DCHECK(IsCanonical(ranges));
@ -1544,6 +1816,20 @@ void CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) {
ranges->Rewind(n);
}
// static
// Compares two canonical range lists element by element. They are equal when
// they have the same length and every pair of corresponding ranges is equal.
bool CharacterRange::Equals(const ZoneList<CharacterRange>* lhs,
                            const ZoneList<CharacterRange>* rhs) {
  DCHECK(IsCanonical(lhs));
  DCHECK(IsCanonical(rhs));
  const int range_count = lhs->length();
  if (range_count != rhs->length()) {
    return false;
  }
  int position = 0;
  while (position < range_count) {
    if (!(lhs->at(position) == rhs->at(position))) {
      return false;
    }
    ++position;
  }
  return true;
}
namespace {
// Scoped object to keep track of how much we unroll quantifier loops in the

View File

@ -42,7 +42,9 @@ namespace internal {
T(InvalidClassPropertyName, "Invalid property name in character class") \
T(InvalidCharacterClass, "Invalid character class") \
T(UnterminatedCharacterClass, "Unterminated character class") \
T(OutOfOrderCharacterClass, "Range out of order in character class")
T(OutOfOrderCharacterClass, "Range out of order in character class") \
T(InvalidClassSetOperation, "Invalid set operation in character class") \
T(InvalidCharacterInClass, "Invalid character in character class")
enum class RegExpError : uint32_t {
#define TEMPLATE(NAME, STRING) k##NAME,

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,143 @@
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-regexp-unicode-sets
// u and v are not allowed together.
assertEarlyError('/./uv');
assertThrowsAtRuntime("new RegExp('.','uv')", SyntaxError);
assertEquals('v', /./v.flags);
assertTrue(/./v.unicodeSets);
// Characters that require escaping within a character class in /v mode
assertEarlyError('/[(]/v');
assertEarlyError('/[)]/v');
assertEarlyError('/[[]/v');
assertEarlyError('/[]]/v');
assertEarlyError('/[{]/v');
assertEarlyError('/[}]/v');
assertEarlyError('/[/]/v');
assertEarlyError('/[-]/v');
// Need to escape the backslash, as assertEarlyError uses eval().
assertEarlyError('/[\\]/v');
assertEarlyError('/[|]/v');
assertEarlyError('/[&&]/v');
assertEarlyError('/[!!]/v');
assertEarlyError('/[##]/v');
assertEarlyError('/[$$]/v');
assertEarlyError('/[%%]/v');
assertEarlyError('/[**]/v');
assertEarlyError('/[++]/v');
assertEarlyError('/[,,]/v');
assertEarlyError('/[..]/v');
assertEarlyError('/[::]/v');
assertEarlyError('/[;;]/v');
assertEarlyError('/[<<]/v');
assertEarlyError('/[==]/v');
assertEarlyError('/[>>]/v');
assertEarlyError('/[??]/v');
assertEarlyError('/[@@]/v');
// The first ^ negates the class. The following two are not valid.
assertEarlyError('/[^^^]/v');
assertEarlyError('/[``]/v');
assertEarlyError('/[~~]/v');
assertEarlyError('/[a&&&]/v');
assertEarlyError('/[&&&a]/v');
// Every ASCII character (code points 0 through 127 inclusive), each as a
// one-character string. check() uses this list as the universe from which the
// default "expected not to match" inputs are derived. The length is 128 so
// that U+007F (DEL) is included as well.
const allAscii = Array.from(
    {length: 128}, (v, i) => { return String.fromCharCode(i); });
// Verifies that |re| matches every entry of |expectMatch| and none of
// |expectNoMatch|, and that wrapping the pattern in a negated class inverts
// both expectations. When |expectNoMatch| is omitted it defaults to all
// entries of |allAscii| that are not listed in |expectMatch| (numbers in
// |expectMatch| are compared by their string form).
function check(re, expectMatch, expectNoMatch) {
  if (expectNoMatch === undefined) {
    const expected = new Set(
        expectMatch.map(val => (typeof val == 'number') ? String(val) : val));
    expectNoMatch = allAscii.filter(val => !expected.has(val));
  }

  // Applies |assertion| to the outcome of testing each input against |regexp|.
  const verifyAll = (regexp, inputs, assertion) => {
    for (const input of inputs) {
      assertion(regexp.test(input), `${regexp}.test(${input})`);
    }
  };

  verifyAll(re, expectMatch, assertTrue);
  verifyAll(re, expectNoMatch, assertFalse);

  // Nest the current RegExp in a negated class and check that the
  // expectations are inverted.
  const inverted = new RegExp(`[^${re.source}]`, re.flags);
  verifyAll(inverted, expectMatch, assertFalse);
  verifyAll(inverted, expectNoMatch, assertTrue);
}
// Union with nested class
check(
/[\da-f[xy][^[^z]]]/v, Array.from('0123456789abcdefxyz'),
Array.from('ghijklmnopqrstuv!?'));
// Intersections
check(/[\d&&[0-9]]/v, Array.from('0123456789'), []);
check(/[\d&&0]/v, [0], Array.from('123456789'));
check(/[\d&&9]/v, [9], Array.from('012345678'));
check(/[\d&&[02468]]/v, Array.from('02468'), Array.from('13579'));
check(/[\d&&[13579]]/v, Array.from('13579'), Array.from('02468'));
check(
/[\w&&[^a-zA-Z_]]/v, Array.from('0123456789'),
Array.from('abcdxyzABCDXYZ_!?'));
check(
/[^\w&&[a-zA-Z_]]/v, Array.from('0123456789!?'),
Array.from('abcdxyzABCDXYZ_'));
// Subtractions
check(/[\d--[!-%]]/v, Array.from('0123456789'));
check(/[\d--[A-Z]]/v, Array.from('0123456789'));
check(/[\d--[0-9]]/v, []);
check(/[\d--[\w]]/v, []);
check(/[\d--0]/v, Array.from('123456789'));
check(/[\d--9]/v, Array.from('012345678'));
check(/[[\d[a-c]]--9]/v, Array.from('012345678abc'));
check(/[\d--[02468]]/v, Array.from('13579'));
check(/[\d--[13579]]/v, Array.from('02468'));
check(/[[3-7]--[0-9]]/v, []);
check(/[[3-7]--[0-7]]/v, []);
check(/[[3-7]--[3-9]]/v, []);
check(/[[3-79]--[0-7]]/v, [9]);
check(/[[3-79]--[3-9]]/v, []);
check(/[[3-7]--[0-3]]/v, Array.from('4567'));
check(/[[3-7]--[0-5]]/v, Array.from('67'));
check(/[[3-7]--[7-9]]/v, Array.from('3456'));
check(/[[3-7]--[5-9]]/v, Array.from('34'));
check(/[[3-7a-c]--[0-3]]/v, Array.from('4567abc'));
check(/[[3-7a-c]--[0-5]]/v, Array.from('67abc'));
check(/[[3-7a-c]--[7-9]]/v, Array.from('3456abc'));
check(/[[3-7a-c]--[5-9]]/v, Array.from('34abc'));
check(/[[2-8]--[0-3]--5--[7-9]]/v, Array.from('46'));
check(/[[2-57-8]--[0-3]--[5-7]]/v, Array.from('48'));
check(/[[0-57-8]--[1-34]--[5-7]]/v, Array.from('08'));
check(/[\d--[^02468]]/v, Array.from('02468'));
check(/[\d--[^13579]]/v, Array.from('13579'));
// Ignore-Case
check(/[Ā-č]/v, Array.from('ĀāĂ㥹Ćć'), Array.from('abc'));
check(/[ĀĂĄĆ]/vi, Array.from('ĀāĂ㥹Ćć'), Array.from('abc'));
check(/[āăąć]/vi, Array.from('ĀāĂ㥹Ćć'), Array.from('abc'));
// Some more sophisticated tests taken from
// https://v8.dev/features/regexp-v-flag
assertFalse(/[\p{Script_Extensions=Greek}--π]/v.test('π'));
assertFalse(/[\p{Script_Extensions=Greek}--[αβγ]]/v.test('α'));
assertFalse(/[\p{Script_Extensions=Greek}--[α-γ]]/v.test('β'));
assertTrue(/[\p{Decimal_Number}--[0-9]]/v.test('𑜹'));
assertFalse(/[\p{Decimal_Number}--[0-9]]/v.test('4'));
assertTrue(/[\p{Script_Extensions=Greek}&&\p{Letter}]/v.test('π'));
assertFalse(/[\p{Script_Extensions=Greek}&&\p{Letter}]/v.test('𐆊'));
assertTrue(/[\p{White_Space}&&\p{ASCII}]/v.test('\n'));
assertFalse(/[\p{White_Space}&&\p{ASCII}]/v.test('\u2028'));
assertTrue(/[\p{Script_Extensions=Mongolian}&&\p{Number}]/v.test(''));
assertFalse(/[\p{Script_Extensions=Mongolian}&&\p{Number}]/v.test(''));
assertEquals('XXXXXX4#', 'aAbBcC4#'.replaceAll(/\p{Lowercase_Letter}/giv, 'X'));
assertEquals('XXXXXX4#', 'aAbBcC4#'.replaceAll(/[^\P{Lowercase_Letter}]/giv, 'X'));

View File

@ -434,6 +434,10 @@
'regress/regress-1262423': [PASS,FAIL],
'regress/regress-793588': [PASS,FAIL],
# RegExp unicode tests rely on ICU for property classes and
# case-insensitive unicode patterns.
'harmony/regexp-unicode-sets': [PASS,FAIL],
# The noi18n build cannot parse characters in supplementary plane.
'harmony/regexp-named-captures': [FAIL],
'regress/regress-v8-10384': [FAIL],

View File

@ -317,76 +317,28 @@
'built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_ZWJ_Sequence-negative-CharacterClass': [SKIP],
'built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_ZWJ_Sequence-negative-P': [SKIP],
'built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_ZWJ_Sequence-negative-u': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-difference-character-class-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-difference-character-class': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-difference-character': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-difference-character-property-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-difference-property-of-strings-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-difference-string-literal': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-character-class-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-character-class': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-character': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-character-property-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-property-of-strings-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-string-literal': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-character-class-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-character-class': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-character': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-character-property-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-property-of-strings-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-string-literal': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-union-character-class-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-union-character-class': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-union-character': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-union-character-property-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-union-property-of-strings-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-union-string-literal': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-intersection-character-class-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-intersection-character-class': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-intersection-character': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-intersection-character-property-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-intersection-property-of-strings-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-intersection-string-literal': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-union-character-class-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-union-character-class': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-union-character': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-union-character-property-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-union-property-of-strings-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-union-string-literal': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-difference-character-class-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-difference-character-class': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-difference-character': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-difference-character-property-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-difference-property-of-strings-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-difference-string-literal': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-intersection-character-class-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-intersection-character-class': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-intersection-character': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-intersection-character-property-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-intersection-property-of-strings-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-intersection-string-literal': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-class-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-class': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-property-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-property-of-strings-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-string-literal': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-class-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-class': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-property-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-property-of-strings-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-string-literal': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-class-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-class': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-property-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-property-of-strings-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-string-literal': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-union-character-class-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-union-character-class': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-union-character': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-union-character-property-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-union-property-of-strings-escape': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-union-string-literal': [SKIP],
'built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-difference-character-class-escape': [SKIP],
@ -805,6 +757,27 @@
'built-ins/RegExp/named-groups/unicode-property-names-valid': [SKIP],
'built-ins/RegExp/named-groups/non-unicode-property-names-valid': [FAIL],
'built-ins/RegExp/match-indices/indices-array-unicode-property-names': [SKIP],
'built-ins/RegExp/unicodeSets/generated/character-class-difference-character-property-escape': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-character-property-escape': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-union-character-property-escape': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-character-property-escape': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-class-intersection-character-property-escape': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-class-union-character-property-escape': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-difference-character-property-escape': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-intersection-character-property-escape': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-class-escape': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-class-escape': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-class': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-property-escape': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-class': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-class-escape': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-property-escape': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-union-character-property-escape': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-class': [PASS,FAIL],
'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-property-escape': [PASS,FAIL],
# Unicode in identifiers.
'language/identifiers/part-unicode-*': [FAIL],