[regexp] Unicode Sets: Implement parser changes and set operations

- Adapt parser for /v. - Implement set operations (union, intersect, subtract). - String disjunctions not implemented yet. Bug: v8:11935 Change-Id: I1c3a6785500664dacc5d6562f49d7deed73ac15f Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3921517 Commit-Queue: Patrick Thier <pthier@chromium.org> Reviewed-by: Jakob Linke <jgruber@chromium.org> Reviewed-by: Mathias Bynens <mathias@chromium.org> Reviewed-by: Kim-Anh Tran <kimanh@chromium.org> Cr-Commit-Position: refs/heads/main@{#83647}
2022-10-06 15:09:19 +02:00 · 2022-10-06 15:09:19 +02:00 · a169bab6f7
commit a169bab6f7
parent 316dd0326c
11 changed files with 1156 additions and 170 deletions
--- a/src/inspector/v8-webdriver-serializer.cc
+++ b/src/inspector/v8-webdriver-serializer.cc
@ -56,6 +56,9 @@ String16 _descriptionForRegExpFlags(v8::Local<v8::RegExp> value) {
  if (flags & v8::RegExp::Flags::kMultiline) result_string_builder.append('m');
  if (flags & v8::RegExp::Flags::kDotAll) result_string_builder.append('s');
  if (flags & v8::RegExp::Flags::kUnicode) result_string_builder.append('u');
+  if (flags & v8::RegExp::Flags::kUnicodeSets) {
+    result_string_builder.append('v');
+  }
  if (flags & v8::RegExp::Flags::kSticky) result_string_builder.append('y');
  return result_string_builder.toString();
 }
--- a/src/inspector/value-mirror.cc
+++ b/src/inspector/value-mirror.cc
@ -243,6 +243,7 @@ String16 descriptionForRegExp(v8::Isolate* isolate,
  if (flags & v8::RegExp::Flags::kMultiline) description.append('m');
  if (flags & v8::RegExp::Flags::kDotAll) description.append('s');
  if (flags & v8::RegExp::Flags::kUnicode) description.append('u');
+  if (flags & v8::RegExp::Flags::kUnicodeSets) description.append('v');
  if (flags & v8::RegExp::Flags::kSticky) description.append('y');
  return description.toString();
 }
--- a/src/regexp/experimental/experimental-compiler.cc
+++ b/src/regexp/experimental/experimental-compiler.cc
@ -69,6 +69,11 @@ class CanBeHandledVisitor final : private RegExpVisitor {
    return nullptr;
  }

+  // Class sets (character classes under the /v flag) cannot be compiled by
+  // this engine yet (CompileVisitor::VisitClassSet is still unimplemented), so
+  // flag the pattern as one that cannot be handled.
+  void* VisitClassSet(RegExpClassSet* node, void*) override {
+    result_ = false;
+    return nullptr;
+  }
+
  void* VisitAssertion(RegExpAssertion* node, void*) override {
    return nullptr;
  }
@ -419,6 +424,11 @@ class CompileVisitor : private RegExpVisitor {
    return nullptr;
  }

+  // Class sets are not supported by this compiler. CanBeHandledVisitor sets
+  // its result to false for any pattern containing one, so presumably this is
+  // never reached for patterns that passed that check.
+  void* VisitClassSet(RegExpClassSet* node, void*) override {
+    // TODO(v8:11935): Add support.
+    UNREACHABLE();
+  }
+
  void* VisitAtom(RegExpAtom* node, void*) override {
    for (base::uc16 c : node->data()) {
      assembler_.ConsumeRange(c, c);
--- a/src/regexp/regexp-ast.cc
+++ b/src/regexp/regexp-ast.cc
@ -195,6 +195,27 @@ void* RegExpUnparser::VisitCharacterClass(RegExpCharacterClass* that,
  return nullptr;
 }

+// Prints a class set as: operation marker ("++", "&&" or "--"), an optional
+// "^" for negated sets, then the space-separated operands in brackets.
+void* RegExpUnparser::VisitClassSet(RegExpClassSet* that, void* data) {
+  const char* op_marker = "";
+  switch (that->operation()) {
+    case RegExpClassSet::OperationType::kUnion:
+      op_marker = "++";
+      break;
+    case RegExpClassSet::OperationType::kIntersection:
+      op_marker = "&&";
+      break;
+    case RegExpClassSet::OperationType::kSubtraction:
+      op_marker = "--";
+      break;
+  }
+  os_ << op_marker;
+  if (that->is_negated()) os_ << "^";
+
+  os_ << "[";
+  const ZoneList<RegExpTree*>* operands = that->operands();
+  for (int idx = 0; idx < operands->length(); ++idx) {
+    if (idx != 0) os_ << " ";
+    operands->at(idx)->Accept(this, data);
+  }
+  os_ << "]";
+  return nullptr;
+}

 void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) {
  switch (that->assertion_type()) {
--- a/src/regexp/regexp-ast.h
+++ b/src/regexp/regexp-ast.h
@ -19,6 +19,7 @@ namespace internal {
  VISIT(Alternative)                      \
  VISIT(Assertion)                        \
  VISIT(CharacterClass)                   \
+  VISIT(ClassSet)                         \
  VISIT(Atom)                             \
  VISIT(Quantifier)                       \
  VISIT(Capture)                          \
@ -117,29 +118,49 @@ class CharacterRange {
      StandardCharacterSet standard_character_set,
      ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents,
      Zone* zone);
+  // Add case equivalents to ranges. Only used for /i, not for /ui or /vi, as
+  // the semantics for unicode mode are slightly different.
+  // See https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch Note 4.
  V8_EXPORT_PRIVATE static void AddCaseEquivalents(
      Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges,
      bool is_one_byte);
+  // Add case equivalent code points to ranges. Only used for /ui and /vi, not
+  // for /i, as the semantics for non-unicode mode are slightly different.
+  // See https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch Note 4.
+  static void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
+                                        Zone* zone);

  bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; }
  base::uc32 from() const { return from_; }
  base::uc32 to() const { return to_; }
  bool IsEverything(base::uc32 max) const { return from_ == 0 && to_ >= max; }
  bool IsSingleton() const { return from_ == to_; }
+
  // Whether a range list is in canonical form: Ranges ordered by from value,
  // and ranges non-overlapping and non-adjacent.
-  V8_EXPORT_PRIVATE static bool IsCanonical(ZoneList<CharacterRange>* ranges);
+  V8_EXPORT_PRIVATE static bool IsCanonical(
+      const ZoneList<CharacterRange>* ranges);
  // Convert range list to canonical form. The characters covered by the ranges
  // will still be the same, but no character is in more than one range, and
  // adjacent ranges are merged. The resulting list may be shorter than the
  // original, but cannot be longer.
  static void Canonicalize(ZoneList<CharacterRange>* ranges);
  // Negate the contents of a character range in canonical form.
-  static void Negate(ZoneList<CharacterRange>* src,
+  static void Negate(const ZoneList<CharacterRange>* src,
                     ZoneList<CharacterRange>* dst, Zone* zone);
-
+  // Intersect the contents of two character ranges in canonical form.
+  static void Intersect(const ZoneList<CharacterRange>* lhs,
+                        const ZoneList<CharacterRange>* rhs,
+                        ZoneList<CharacterRange>* dst, Zone* zone);
+  // Subtract the contents of |to_remove| from the contents of |src|.
+  static void Subtract(const ZoneList<CharacterRange>* src,
+                       const ZoneList<CharacterRange>* to_remove,
+                       ZoneList<CharacterRange>* dst, Zone* zone);
  // Remove all ranges outside the one-byte range.
  static void ClampToOneByte(ZoneList<CharacterRange>* ranges);
+  // Checks if two ranges (both need to be canonical) are equal.
+  static bool Equals(const ZoneList<CharacterRange>* lhs,
+                     const ZoneList<CharacterRange>* rhs);

 private:
  CharacterRange(base::uc32 from, base::uc32 to) : from_(from), to_(to) {}
@ -150,6 +171,13 @@ class CharacterRange {
  base::uc32 to_ = 0;
 };

+// Two ranges compare equal iff both of their end points coincide.
+inline bool operator==(const CharacterRange& lhs, const CharacterRange& rhs) {
+  if (lhs.from() != rhs.from()) return false;
+  return lhs.to() == rhs.to();
+}
+inline bool operator!=(const CharacterRange& lhs, const CharacterRange& rhs) {
+  return !(lhs == rhs);
+}
+
 #define DECL_BOILERPLATE(Name)                                         \
  void* Accept(RegExpVisitor* visitor, void* data) override;           \
  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) \
@ -329,6 +357,47 @@ class RegExpCharacterClass final : public RegExpTree {
  CharacterClassFlags character_class_flags_;
 };

+// AST node for a class set expression: a set operation applied to a list of
+// operands, optionally negated as a whole. ToNode() compiles it by first
+// flattening the expression into a RegExpCharacterClass (ToCharacterClass()).
+class RegExpClassSet final : public RegExpTree {
+ public:
+  // The set operation that combines the operands of a node.
+  enum class OperationType { kUnion, kIntersection, kSubtraction };
+
+  // |operands| is stored by pointer; the list is not copied.
+  RegExpClassSet(OperationType op, bool is_negated,
+                 ZoneList<RegExpTree*>* operands)
+      : operation_(op), is_negated_(is_negated), operands_(operands) {}
+
+  DECL_BOILERPLATE(ClassSet);
+
+  bool IsTextElement() override { return true; }
+  // At least 1 character is consumed.
+  int min_match() override { return 1; }
+  // Up to two code points might be consumed.
+  int max_match() override { return 2; }
+
+  OperationType operation() const { return operation_; }
+  bool is_negated() const { return is_negated_; }
+  const ZoneList<RegExpTree*>* operands() const { return operands_; }
+
+ private:
+  // Evaluates all set operations of the tree rooted at this node and wraps the
+  // resulting ranges in a newly zone-allocated RegExpCharacterClass.
+  RegExpCharacterClass* ToCharacterClass(Zone* zone);
+
+  // Recursively evaluates the tree rooted at |root|, computing the valid
+  // CharacterRanges after applying all set operations and storing the result in
+  // |result_ranges|. |temp_ranges| is list used for intermediate results,
+  // passed as parameter to avoid allocating new lists all the time.
+  static void ComputeCharacterRanges(RegExpTree* root,
+                                     ZoneList<CharacterRange>* result_ranges,
+                                     ZoneList<CharacterRange>* temp_ranges,
+                                     Zone* zone);
+
+  const OperationType operation_;
+  const bool is_negated_;
+  // Operands of |operation_|; each is a character class or a nested class set.
+  ZoneList<RegExpTree*>* operands_ = nullptr;
+#ifdef ENABLE_SLOW_DCHECKS
+  // Cache ranges for each node during computation for (slow) DCHECKs.
+  ZoneList<CharacterRange>* ranges_ = nullptr;
+#endif
+};
+
 class RegExpAtom final : public RegExpTree {
 public:
  explicit RegExpAtom(base::Vector<const base::uc16> data) : data_(data) {}
--- a/src/regexp/regexp-compiler-tonode.cc
+++ b/src/regexp/regexp-compiler-tonode.cc
@ -419,9 +419,23 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
  return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
 }

-void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
+}  // namespace
+
+// TODO(pthier, v8:11935): We use this method to implement
+// MaybeSimpleCaseFolding
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-maybesimplecasefolding
+// (TODO(v8:11935): Change to permalink once proposal is in stage 4),
+// which is slightly different. The main difference is that we retain original
+// characters and add case equivalents, whereas according to the spec original
+// characters should be replaced with their case equivalent.
+// This shouldn't make a difference for correctness, but we could potentially
+// create smaller character classes for unicode sets.
+
+// static
+void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
+                                               Zone* zone) {
 #ifdef V8_INTL_SUPPORT
-  DCHECK(CharacterRange::IsCanonical(ranges));
+  DCHECK(IsCanonical(ranges));

  // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
  // See also https://crbug.com/v8/6727.
@ -444,16 +458,13 @@ void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
  // we end up with only simple and common case mappings.
  set.removeAllStrings();
  for (int i = 0; i < set.getRangeCount(); i++) {
-    ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
-                zone);
+    ranges->Add(Range(set.getRangeStart(i), set.getRangeEnd(i)), zone);
  }
  // No errors and everything we collected have been ranges.
-  CharacterRange::Canonicalize(ranges);
+  Canonicalize(ranges);
 #endif  // V8_INTL_SUPPORT
 }

-}  // namespace
-
 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
                                         RegExpNode* on_success) {
  set_.Canonicalize();
@ -461,7 +472,7 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
  ZoneList<CharacterRange>* ranges = this->ranges(zone);

  if (NeedsUnicodeCaseEquivalents(compiler->flags())) {
-    AddUnicodeCaseEquivalents(ranges, zone);
+    CharacterRange::AddUnicodeCaseEquivalents(ranges, zone);
  }

  if (!IsEitherUnicode(compiler->flags()) || compiler->one_byte() ||
@ -470,6 +481,17 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
  }

  if (is_negated()) {
+    // With /v, character classes are never negated.
+    // TODO(v8:11935): Change permalink once proposal is in stage 4.
+    // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-compileatom
+    // Atom :: CharacterClass
+    //   4. Assert: cc.[[Invert]] is false.
+    // Instead the complement is created when evaluating the class set.
+    // The only exception is the "nothing range" (negated everything), which is
+    // internally created for an empty set.
+    DCHECK_IMPLIES(
+        IsUnicodeSets(compiler->flags()),
+        ranges->length() == 1 && ranges->first().IsEverything(kMaxCodePoint));
    ZoneList<CharacterRange>* negated =
        zone->New<ZoneList<CharacterRange>>(2, zone);
    CharacterRange::Negate(ranges, negated, zone);
@ -505,6 +527,11 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
  return result;
 }

+// Compiles the class set by flattening it into a plain character class and
+// delegating node creation to that class.
+RegExpNode* RegExpClassSet::ToNode(RegExpCompiler* compiler,
+                                   RegExpNode* on_success) {
+  RegExpCharacterClass* flattened = ToCharacterClass(compiler->zone());
+  return flattened->ToNode(compiler, on_success);
+}
+
 namespace {

 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
@ -1359,7 +1386,7 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
 #endif  // V8_INTL_SUPPORT
 }

-bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
+bool CharacterRange::IsCanonical(const ZoneList<CharacterRange>* ranges) {
  DCHECK_NOT_NULL(ranges);
  int n = ranges->length();
  if (n <= 1) return true;
@ -1463,6 +1490,129 @@ void CharacterSet::Canonicalize() {
  CharacterRange::Canonicalize(ranges_);
 }

+// Evaluates every set operation below this node and returns a new character
+// class holding the resulting ranges.
+RegExpCharacterClass* RegExpClassSet::ToCharacterClass(Zone* zone) {
+  using RangeList = ZoneList<CharacterRange>;
+  // Receives the final ranges of the whole expression.
+  RangeList* computed = zone->New<RangeList>(2, zone);
+  // Scratch list shared by all recursion levels for intermediate results.
+  RangeList* scratch = zone->New<RangeList>(2, zone);
+  ComputeCharacterRanges(this, computed, scratch, zone);
+  return zone->New<RegExpCharacterClass>(zone, computed);
+}
+
+// Recursively evaluates the tree rooted at |root| and appends the resulting
+// ranges to |result_ranges|. |root| must be either a (non-negated) character
+// class or a class set. |temp_ranges| is a scratch list that must be empty on
+// entry and is left empty on exit.
+// static
+void RegExpClassSet::ComputeCharacterRanges(
+    RegExpTree* root, ZoneList<CharacterRange>* result_ranges,
+    ZoneList<CharacterRange>* temp_ranges, Zone* zone) {
+  DCHECK_EQ(temp_ranges->length(), 0);
+  DCHECK(root->IsCharacterClass() || root->IsClassSet());
+  // Leaf: a plain character class contributes its canonicalized ranges.
+  if (root->IsCharacterClass()) {
+    DCHECK(!root->AsCharacterClass()->is_negated());
+    ZoneList<CharacterRange>* ranges = root->AsCharacterClass()->ranges(zone);
+    CharacterRange::Canonicalize(ranges);
+    result_ranges->AddAll(*ranges, zone);
+    return;
+  }
+  RegExpClassSet* node = root->AsClassSet();
+  switch (node->operation()) {
+    case OperationType::kUnion: {
+      // Collect the ranges of every operand and canonicalize once at the end.
+      ZoneList<CharacterRange>* op_ranges =
+          zone->template New<ZoneList<CharacterRange>>(2, zone);
+      for (int i = 0; i < node->operands()->length(); i++) {
+        RegExpTree* op = node->operands()->at(i);
+        ComputeCharacterRanges(op, op_ranges, temp_ranges, zone);
+        result_ranges->AddAll(*op_ranges, zone);
+        op_ranges->Rewind(0);
+      }
+      CharacterRange::Canonicalize(result_ranges);
+      break;
+    }
+    case OperationType::kIntersection: {
+      // Seed the result with the first operand, then intersect it with each
+      // remaining operand in turn.
+      ZoneList<CharacterRange>* op_ranges =
+          zone->template New<ZoneList<CharacterRange>>(2, zone);
+      ComputeCharacterRanges(node->operands()->at(0), op_ranges, temp_ranges,
+                             zone);
+      result_ranges->AddAll(*op_ranges, zone);
+      op_ranges->Rewind(0);
+      for (int i = 1; i < node->operands()->length(); i++) {
+        ComputeCharacterRanges(node->operands()->at(i), op_ranges, temp_ranges,
+                               zone);
+        // Intersect() writes into |temp_ranges|; swap it into the result and
+        // reset both scratch lists for the next operand.
+        CharacterRange::Intersect(result_ranges, op_ranges, temp_ranges, zone);
+        std::swap(*result_ranges, *temp_ranges);
+        temp_ranges->Rewind(0);
+        op_ranges->Rewind(0);
+      }
+      break;
+    }
+    case OperationType::kSubtraction: {
+      // Seed the result with the first operand, then subtract each remaining
+      // operand in turn.
+      ZoneList<CharacterRange>* op_ranges =
+          zone->template New<ZoneList<CharacterRange>>(2, zone);
+      ComputeCharacterRanges(node->operands()->at(0), op_ranges, temp_ranges,
+                             zone);
+      result_ranges->AddAll(*op_ranges, zone);
+      op_ranges->Rewind(0);
+      for (int i = 1; i < node->operands()->length(); i++) {
+        ComputeCharacterRanges(node->operands()->at(i), op_ranges, temp_ranges,
+                               zone);
+        // Subtract() writes into |temp_ranges|; swap it into the result and
+        // reset both scratch lists for the next operand.
+        CharacterRange::Subtract(result_ranges, op_ranges, temp_ranges, zone);
+        std::swap(*result_ranges, *temp_ranges);
+        temp_ranges->Rewind(0);
+        op_ranges->Rewind(0);
+      }
+#ifdef ENABLE_SLOW_DCHECKS
+      // Check that the result is equal to subtracting the union of all RHS
+      // operands from the LHS operand.
+      // TODO(pthier): It is unclear whether this variant is faster or slower
+      // than subtracting multiple ranges in practice.
+      ZoneList<CharacterRange>* lhs_range =
+          node->operands()->at(0)->IsCharacterClass()
+              ? node->operands()->at(0)->AsCharacterClass()->ranges(zone)
+              : node->operands()->at(0)->AsClassSet()->ranges_;
+      ZoneList<CharacterRange>* rhs_union =
+          zone->template New<ZoneList<CharacterRange>>(2, zone);
+      for (int i = 1; i < node->operands()->length(); i++) {
+        ZoneList<CharacterRange>* op_range =
+            node->operands()->at(i)->IsCharacterClass()
+                ? node->operands()->at(i)->AsCharacterClass()->ranges(zone)
+                : node->operands()->at(i)->AsClassSet()->ranges_;
+        rhs_union->AddAll(*op_range, zone);
+      }
+      CharacterRange::Canonicalize(rhs_union);
+      ZoneList<CharacterRange>* ranges_check =
+          zone->template New<ZoneList<CharacterRange>>(2, zone);
+      CharacterRange::Subtract(lhs_range, rhs_union, ranges_check, zone);
+      DCHECK(CharacterRange::Equals(result_ranges, ranges_check));
+
+      // Check that the result is equal to intersecting the LHS operand with the
+      // complemented union of all RHS operands
+      ZoneList<CharacterRange>* rhs_union_negated =
+          zone->template New<ZoneList<CharacterRange>>(rhs_union->length(),
+                                                       zone);
+      CharacterRange::Negate(rhs_union, rhs_union_negated, zone);
+      ranges_check->Rewind(0);
+      CharacterRange::Intersect(lhs_range, rhs_union_negated, ranges_check,
+                                zone);
+      DCHECK(CharacterRange::Equals(result_ranges, ranges_check));
+#endif
+      break;
+    }
+  }
+
+  // Negation applies to the outcome of the whole operation, so it is performed
+  // last.
+  if (node->is_negated()) {
+    CharacterRange::Negate(result_ranges, temp_ranges, zone);
+    std::swap(*result_ranges, *temp_ranges);
+    temp_ranges->Rewind(0);
+  }
+
+  DCHECK_EQ(temp_ranges->length(), 0);
+
+#ifdef ENABLE_SLOW_DCHECKS
+  // Cache results for DCHECKs.
+  node->ranges_ =
+      zone->template New<ZoneList<CharacterRange>>(*result_ranges, zone);
+#endif
+}
+
 // static
 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
  if (character_ranges->length() <= 1) return;
@ -1500,7 +1650,7 @@ void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
 }

 // static
-void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
+void CharacterRange::Negate(const ZoneList<CharacterRange>* ranges,
                            ZoneList<CharacterRange>* negated_ranges,
                            Zone* zone) {
  DCHECK(CharacterRange::IsCanonical(ranges));
@ -1523,6 +1673,128 @@ void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
  }
 }

+// Computes the intersection of two canonical range lists with a single merge
+// pass over both inputs. |intersection| must be empty on entry.
+// static
+void CharacterRange::Intersect(const ZoneList<CharacterRange>* lhs,
+                               const ZoneList<CharacterRange>* rhs,
+                               ZoneList<CharacterRange>* intersection,
+                               Zone* zone) {
+  DCHECK(CharacterRange::IsCanonical(lhs));
+  DCHECK(CharacterRange::IsCanonical(rhs));
+  DCHECK_EQ(0, intersection->length());
+  int left_pos = 0;
+  int right_pos = 0;
+  while (left_pos < lhs->length() && right_pos < rhs->length()) {
+    const CharacterRange left = lhs->at(left_pos);
+    const CharacterRange right = rhs->at(right_pos);
+
+    // A range that ends before the other one starts cannot contribute.
+    if (left.to() < right.from()) {
+      ++left_pos;
+      continue;
+    }
+    if (right.to() < left.from()) {
+      ++right_pos;
+      continue;
+    }
+
+    // The ranges overlap: emit the common part, then advance whichever range
+    // ends at the end of the overlap.
+    const base::uc32 overlap_from = std::max(left.from(), right.from());
+    const base::uc32 overlap_to = std::min(left.to(), right.to());
+    intersection->Add(CharacterRange::Range(overlap_from, overlap_to), zone);
+    if (overlap_to == left.to()) {
+      ++left_pos;
+    } else {
+      ++right_pos;
+    }
+  }
+
+  DCHECK(IsCanonical(intersection));
+}
+
+namespace {
+
+// Moves |index| to the next entry of |range|. If such an entry exists, |from|
+// and |to| are loaded from it. Otherwise only |from| is overwritten, with a
+// value one past the largest legal code point; |to| is left untouched.
+void SafeAdvanceRange(const ZoneList<CharacterRange>* range, int* index,
+                      base::uc32* from, base::uc32* to) {
+  const int next = ++(*index);
+  if (next >= range->length()) {
+    *from = kMaxCodePoint + 1;
+    return;
+  }
+  const CharacterRange& entry = range->at(next);
+  *from = entry.from();
+  *to = entry.to();
+}
+
+}  // namespace
+
+// Computes |src| minus |to_remove| and appends the outcome to |result|.
+// Both inputs must be canonical and |result| must be empty on entry; the
+// produced list is canonical as well. [from, to] always denotes the not yet
+// emitted remainder of the current |src| range.
+// static
+void CharacterRange::Subtract(const ZoneList<CharacterRange>* src,
+                              const ZoneList<CharacterRange>* to_remove,
+                              ZoneList<CharacterRange>* result, Zone* zone) {
+  DCHECK(CharacterRange::IsCanonical(src));
+  DCHECK(CharacterRange::IsCanonical(to_remove));
+  DCHECK_EQ(0, result->length());
+
+  // Nothing to subtract from. This must be checked before the first range of
+  // |src| is read below, which would otherwise be an out-of-bounds access.
+  if (src->length() == 0) return;
+
+  int src_index = 0;
+  int to_remove_index = 0;
+  base::uc32 from = src->at(src_index).from();
+  base::uc32 to = src->at(src_index).to();
+  while (src_index < src->length() && to_remove_index < to_remove->length()) {
+    CharacterRange remove_range = to_remove->at(to_remove_index);
+    if (remove_range.to() < from) {
+      // (a) Non-overlapping case, ignore current to_remove range.
+      //            |-------|
+      // |-------|
+      to_remove_index++;
+    } else if (to < remove_range.from()) {
+      // (b) Non-overlapping case, add full current range to result.
+      // |-------|
+      //            |-------|
+      result->Add(CharacterRange::Range(from, to), zone);
+      SafeAdvanceRange(src, &src_index, &from, &to);
+    } else if (from >= remove_range.from() && to <= remove_range.to()) {
+      // (c) Current to_remove range fully covers current range.
+      //   |---|
+      // |-------|
+      SafeAdvanceRange(src, &src_index, &from, &to);
+    } else if (from < remove_range.from() && to > remove_range.to()) {
+      // (d) Split current range.
+      // |-------|
+      //   |---|
+      result->Add(CharacterRange::Range(from, remove_range.from() - 1), zone);
+      from = remove_range.to() + 1;
+      to_remove_index++;
+    } else if (from < remove_range.from()) {
+      // (e) End current range.
+      // |-------|
+      //    |-------|
+      to = remove_range.from() - 1;
+      result->Add(CharacterRange::Range(from, to), zone);
+      SafeAdvanceRange(src, &src_index, &from, &to);
+    } else if (to > remove_range.to()) {
+      // (f) Modify start of current range.
+      //    |-------|
+      // |-------|
+      from = remove_range.to() + 1;
+      to_remove_index++;
+    } else {
+      UNREACHABLE();
+    }
+  }
+  // The last range needs special treatment after |to_remove| is exhausted, as
+  // |from| might have been modified by the last |to_remove| range and |to| was
+  // not yet known (i.e. cases d and f).
+  // If |src| was exhausted instead, SafeAdvanceRange() has set |from| beyond
+  // the legal code point range, so the check below fails and nothing is added.
+  if (from <= to) {
+    result->Add(CharacterRange::Range(from, to), zone);
+  }
+  src_index++;
+
+  // Add remaining ranges after |to_remove| is exhausted.
+  for (; src_index < src->length(); src_index++) {
+    result->Add(src->at(src_index), zone);
+  }
+
+  DCHECK(IsCanonical(result));
+}
+
 // static
 void CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) {
  DCHECK(IsCanonical(ranges));
@ -1544,6 +1816,20 @@ void CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) {
  ranges->Rewind(n);
 }

+// Element-wise comparison of two canonical range lists.
+// static
+bool CharacterRange::Equals(const ZoneList<CharacterRange>* lhs,
+                            const ZoneList<CharacterRange>* rhs) {
+  DCHECK(IsCanonical(lhs));
+  DCHECK(IsCanonical(rhs));
+  const int count = lhs->length();
+  if (count != rhs->length()) return false;
+
+  // Walk both lists in lockstep until the first mismatch.
+  int pos = 0;
+  while (pos < count && lhs->at(pos) == rhs->at(pos)) ++pos;
+  return pos == count;
+}
+
 namespace {

 // Scoped object to keep track of how much we unroll quantifier loops in the
--- a/src/regexp/regexp-error.h
+++ b/src/regexp/regexp-error.h
@ -42,7 +42,9 @@ namespace internal {
  T(InvalidClassPropertyName, "Invalid property name in character class") \
  T(InvalidCharacterClass, "Invalid character class")                     \
  T(UnterminatedCharacterClass, "Unterminated character class")           \
-  T(OutOfOrderCharacterClass, "Range out of order in character class")
+  T(OutOfOrderCharacterClass, "Range out of order in character class")    \
+  T(InvalidClassSetOperation, "Invalid set operation in character class") \
+  T(InvalidCharacterInClass, "Invalid character in character class")

 enum class RegExpError : uint32_t {
 #define TEMPLATE(NAME, STRING) k##NAME,
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
--- a/test/mjsunit/harmony/regexp-unicode-sets.js
+++ b/test/mjsunit/harmony/regexp-unicode-sets.js
@ -0,0 +1,143 @@
+// Copyright 2022 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Flags: --harmony-regexp-unicode-sets
+
+// u and v are not allowed together.
+assertEarlyError('/./uv');
+assertThrowsAtRuntime("new RegExp('.','uv')", SyntaxError);
+
+assertEquals('v', /./v.flags);
+assertTrue(/./v.unicodeSets);
+
+// Characters that require escaping within a character class in /v mode
+assertEarlyError('/[(]/v');
+assertEarlyError('/[)]/v');
+assertEarlyError('/[[]/v');
+assertEarlyError('/[]]/v');
+assertEarlyError('/[{]/v');
+assertEarlyError('/[}]/v');
+assertEarlyError('/[/]/v');
+assertEarlyError('/[-]/v');
+// Need to escape the backslash, as assertEarlyError uses eval().
+assertEarlyError('/[\\]/v');
+assertEarlyError('/[|]/v');
+
+assertEarlyError('/[&&]/v');
+assertEarlyError('/[!!]/v');
+assertEarlyError('/[##]/v');
+assertEarlyError('/[$$]/v');
+assertEarlyError('/[%%]/v');
+assertEarlyError('/[**]/v');
+assertEarlyError('/[++]/v');
+assertEarlyError('/[,,]/v');
+assertEarlyError('/[..]/v');
+assertEarlyError('/[::]/v');
+assertEarlyError('/[;;]/v');
+assertEarlyError('/[<<]/v');
+assertEarlyError('/[==]/v');
+assertEarlyError('/[>>]/v');
+assertEarlyError('/[??]/v');
+assertEarlyError('/[@@]/v');
+// The first ^ negates the class. The following two are not valid.
+assertEarlyError('/[^^^]/v');
+assertEarlyError('/[``]/v');
+assertEarlyError('/[~~]/v');
+
+assertEarlyError('/[a&&&]/v');
+assertEarlyError('/[&&&a]/v');
+
+const allAscii = Array.from(
+    {length: 127}, (v, i) => { return String.fromCharCode(i); });
+
+// Asserts that |re| matches every entry of |expectMatch| and none of
+// |expectNoMatch|, and that the opposite holds once |re| is nested in a
+// negated class. If |expectNoMatch| is omitted, it defaults to every entry of
+// |allAscii| that is not listed in |expectMatch|.
+function check(re, expectMatch, expectNoMatch) {
+  if (expectNoMatch === undefined) {
+    const expectSet = new Set(
+        expectMatch.map(val => (typeof val == 'number') ? String(val) : val));
+    expectNoMatch = allAscii.filter(val => !expectSet.has(val));
+  }
+  // Applies |assertFn| to the outcome of testing |regexp| on every input.
+  const assertEach = (regexp, inputs, assertFn) => {
+    for (const input of inputs) {
+      assertFn(regexp.test(input), `${regexp}.test(${input})`);
+    }
+  };
+  assertEach(re, expectMatch, assertTrue);
+  assertEach(re, expectNoMatch, assertFalse);
+  // Nest the current RegExp in a negated class and check expectations are
+  // inversed.
+  const inverted = new RegExp(`[^${re.source}]`, re.flags);
+  assertEach(inverted, expectMatch, assertFalse);
+  assertEach(inverted, expectNoMatch, assertTrue);
+}
+
+// Union with nested class
+check(
+    /[\da-f[xy][^[^z]]]/v, Array.from('0123456789abcdefxyz'),
+    Array.from('ghijklmnopqrstuv!?'));
+
+// Intersections
+check(/[\d&&[0-9]]/v, Array.from('0123456789'), []);
+check(/[\d&&0]/v, [0], Array.from('123456789'));
+check(/[\d&&9]/v, [9], Array.from('012345678'));
+check(/[\d&&[02468]]/v, Array.from('02468'), Array.from('13579'));
+check(/[\d&&[13579]]/v, Array.from('13579'), Array.from('02468'));
+check(
+    /[\w&&[^a-zA-Z_]]/v, Array.from('0123456789'),
+    Array.from('abcdxyzABCDXYZ_!?'));
+check(
+    /[^\w&&[a-zA-Z_]]/v, Array.from('0123456789!?'),
+    Array.from('abcdxyzABCDXYZ_'));
+
+// Subtractions
+check(/[\d--[!-%]]/v, Array.from('0123456789'));
+check(/[\d--[A-Z]]/v, Array.from('0123456789'));
+check(/[\d--[0-9]]/v, []);
+check(/[\d--[\w]]/v, []);
+check(/[\d--0]/v, Array.from('123456789'));
+check(/[\d--9]/v, Array.from('012345678'));
+check(/[[\d[a-c]]--9]/v, Array.from('012345678abc'));
+check(/[\d--[02468]]/v, Array.from('13579'));
+check(/[\d--[13579]]/v, Array.from('02468'));
+check(/[[3-7]--[0-9]]/v, []);
+check(/[[3-7]--[0-7]]/v, []);
+check(/[[3-7]--[3-9]]/v, []);
+check(/[[3-79]--[0-7]]/v, [9]);
+check(/[[3-79]--[3-9]]/v, []);
+check(/[[3-7]--[0-3]]/v, Array.from('4567'));
+check(/[[3-7]--[0-5]]/v, Array.from('67'));
+check(/[[3-7]--[7-9]]/v, Array.from('3456'));
+check(/[[3-7]--[5-9]]/v, Array.from('34'));
+check(/[[3-7a-c]--[0-3]]/v, Array.from('4567abc'));
+check(/[[3-7a-c]--[0-5]]/v, Array.from('67abc'));
+check(/[[3-7a-c]--[7-9]]/v, Array.from('3456abc'));
+check(/[[3-7a-c]--[5-9]]/v, Array.from('34abc'));
+check(/[[2-8]--[0-3]--5--[7-9]]/v, Array.from('46'));
+check(/[[2-57-8]--[0-3]--[5-7]]/v, Array.from('48'));
+check(/[[0-57-8]--[1-34]--[5-7]]/v, Array.from('08'));
+check(/[\d--[^02468]]/v, Array.from('02468'));
+check(/[\d--[^13579]]/v, Array.from('13579'));
+
+// Ignore-Case
+check(/[Ā-č]/v, Array.from('ĀāĂăĄąĆć'), Array.from('abc'));
+check(/[ĀĂĄĆ]/vi, Array.from('ĀāĂăĄąĆć'), Array.from('abc'));
+check(/[āăąć]/vi, Array.from('ĀāĂăĄąĆć'), Array.from('abc'));
+
+// Some more sophisticated tests taken from
+// https://v8.dev/features/regexp-v-flag
+assertFalse(/[\p{Script_Extensions=Greek}--π]/v.test('π'));
+assertFalse(/[\p{Script_Extensions=Greek}--[αβγ]]/v.test('α'));
+assertFalse(/[\p{Script_Extensions=Greek}--[α-γ]]/v.test('β'));
+assertTrue(/[\p{Decimal_Number}--[0-9]]/v.test('𑜹'));
+assertFalse(/[\p{Decimal_Number}--[0-9]]/v.test('4'));
+assertTrue(/[\p{Script_Extensions=Greek}&&\p{Letter}]/v.test('π'));
+assertFalse(/[\p{Script_Extensions=Greek}&&\p{Letter}]/v.test('𐆊'));
+assertTrue(/[\p{White_Space}&&\p{ASCII}]/v.test('\n'));
+assertFalse(/[\p{White_Space}&&\p{ASCII}]/v.test('\u2028'));
+assertTrue(/[\p{Script_Extensions=Mongolian}&&\p{Number}]/v.test('᠗'));
+assertFalse(/[\p{Script_Extensions=Mongolian}&&\p{Number}]/v.test('ᠴ'));
+assertEquals('XXXXXX4#', 'aAbBcC4#'.replaceAll(/\p{Lowercase_Letter}/giv, 'X'));
+assertEquals('XXXXXX4#', 'aAbBcC4#'.replaceAll(/[^\P{Lowercase_Letter}]/giv, 'X'));
--- a/test/mjsunit/mjsunit.status
+++ b/test/mjsunit/mjsunit.status
@ -434,6 +434,10 @@
  'regress/regress-1262423': [PASS,FAIL],
  'regress/regress-793588': [PASS,FAIL],

+  # RegExp unicode tests rely on ICU for property classes and
+  # case-insensitive unicode patterns.
+  'harmony/regexp-unicode-sets': [PASS,FAIL],
+
  # The noi18n build cannot parse characters in supplementary plane.
  'harmony/regexp-named-captures': [FAIL],
  'regress/regress-v8-10384': [FAIL],
--- a/test/test262/test262.status
+++ b/test/test262/test262.status
@ -317,76 +317,28 @@
  'built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_ZWJ_Sequence-negative-CharacterClass': [SKIP],
  'built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_ZWJ_Sequence-negative-P': [SKIP],
  'built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_ZWJ_Sequence-negative-u': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-difference-character-class-escape': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-difference-character-class': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-difference-character': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-difference-character-property-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-class-difference-property-of-strings-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-class-difference-string-literal': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-character-class-escape': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-character-class': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-character': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-character-property-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-property-of-strings-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-string-literal': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-character-class-escape': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-character-class': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-character': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-character-property-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-property-of-strings-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-string-literal': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-escape-union-character-class-escape': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-escape-union-character-class': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-escape-union-character': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-escape-union-character-property-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-class-escape-union-property-of-strings-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-class-escape-union-string-literal': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-intersection-character-class-escape': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-intersection-character-class': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-intersection-character': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-intersection-character-property-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-class-intersection-property-of-strings-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-class-intersection-string-literal': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-union-character-class-escape': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-union-character-class': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-union-character': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-class-union-character-property-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-class-union-property-of-strings-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-class-union-string-literal': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-difference-character-class-escape': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-difference-character-class': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-difference-character': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-difference-character-property-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-difference-property-of-strings-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-difference-string-literal': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-intersection-character-class-escape': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-intersection-character-class': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-intersection-character': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-intersection-character-property-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-intersection-property-of-strings-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-intersection-string-literal': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-class-escape': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-class': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-property-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-property-of-strings-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-string-literal': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-class-escape': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-class': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-property-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-property-of-strings-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-string-literal': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-class-escape': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-class': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-property-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-property-of-strings-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-string-literal': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-union-character-class-escape': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-union-character-class': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-union-character': [SKIP],
-  'built-ins/RegExp/unicodeSets/generated/character-union-character-property-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-union-property-of-strings-escape': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/character-union-string-literal': [SKIP],
  'built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-difference-character-class-escape': [SKIP],
@ -805,6 +757,27 @@
  'built-ins/RegExp/named-groups/unicode-property-names-valid': [SKIP],
  'built-ins/RegExp/named-groups/non-unicode-property-names-valid': [FAIL],
  'built-ins/RegExp/match-indices/indices-array-unicode-property-names': [SKIP],
+  'built-ins/RegExp/unicodeSets/generated/character-class-difference-character-property-escape': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-character-property-escape': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-class-escape-union-character-property-escape': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-character-property-escape': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-class-intersection-character-property-escape': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-class-union-character-property-escape': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-difference-character-property-escape': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-intersection-character-property-escape': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-class-escape': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-class-escape': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-class': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-property-escape': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-class': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-class-escape': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-property-escape': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-union-character-property-escape': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-class': [PASS,FAIL],
+  'built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-property-escape': [PASS,FAIL],

  # Unicode in identifiers.
  'language/identifiers/part-unicode-*': [FAIL],