[wasm][revec] Add RevectorizePhase in WASM compilation pipeline

Bug: v8:12716

Change-Id: I7ef53709e9757b58951086fc01af6b2eda296b27
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3411357
Reviewed-by: Maya Lekova <mslekova@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Jie Pan <jie.pan@intel.com>
Cr-Commit-Position: refs/heads/main@{#84888}
Author: jiepan, 2022-12-16 12:55:40 +08:00; committed by V8 LUCI CQ
Parent: cf4b096065
Commit: 256546319c
21 changed files with 1390 additions and 260 deletions
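For orientation (an editorial sketch, not part of the commit): revectorization merges pairs of adjacent simd128 operations into a single simd256 operation. In C++ intrinsic terms the transformation is analogous to the following; the function names are illustrative only.

#include <immintrin.h>

// Before: two independent 128-bit adds over adjacent memory.
void add_f32x4_pair(const float* a, const float* b, float* c) {
  _mm_storeu_ps(c, _mm_add_ps(_mm_loadu_ps(a), _mm_loadu_ps(b)));
  _mm_storeu_ps(c + 4, _mm_add_ps(_mm_loadu_ps(a + 4), _mm_loadu_ps(b + 4)));
}

// After revectorization: one 256-bit add in a YMM register (requires AVX).
void add_f32x8(const float* a, const float* b, float* c) {
  _mm256_storeu_ps(c, _mm256_add_ps(_mm256_loadu_ps(a), _mm256_loadu_ps(b)));
}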


@ -554,6 +554,10 @@ assert(!v8_enable_static_roots ||
v8_enable_webassembly && v8_enable_i18n_support),
"Trying to enable static roots in a configuration that is not supported")
if (v8_enable_webassembly && !target_is_simulator && v8_current_cpu == "x64") {
v8_enable_wasm_simd256_revec = true
}
assert(!v8_disable_write_barriers || v8_enable_single_generation,
"Disabling write barriers works only with single generation")
@ -1091,6 +1095,9 @@ config("features") {
if (v8_value_deserializer_hard_fail) {
defines += [ "V8_VALUE_DESERIALIZER_HARD_FAIL" ]
}
if (v8_enable_wasm_simd256_revec) {
defines += [ "V8_ENABLE_WASM_SIMD256_REVEC" ]
}
}
config("toolchain") {
@ -3746,7 +3753,10 @@ v8_header_set("v8_internal_headers") {
}
if (v8_enable_wasm_simd256_revec) {
sources += [ "src/compiler/linear-scheduler.h" ]
sources += [
"src/compiler/linear-scheduler.h",
"src/compiler/revectorizer.h",
]
}
if (!v8_enable_third_party_heap) {
@ -4258,7 +4268,10 @@ if (v8_enable_webassembly) {
}
if (v8_enable_wasm_simd256_revec) {
- v8_compiler_sources += [ "src/compiler/linear-scheduler.cc" ]
+ v8_compiler_sources += [
+   "src/compiler/linear-scheduler.cc",
+   "src/compiler/revectorizer.cc",
+ ]
}
# The src/compiler files with optimizations.


@ -588,7 +588,8 @@ class V8_EXPORT_PRIVATE InstructionSelector final {
#define DECLARE_GENERATOR(x) void Visit##x(Node* node);
MACHINE_OP_LIST(DECLARE_GENERATOR)
- MACHINE_SIMD_OP_LIST(DECLARE_GENERATOR)
+ MACHINE_SIMD128_OP_LIST(DECLARE_GENERATOR)
+ MACHINE_SIMD256_OP_LIST(DECLARE_GENERATOR)
#undef DECLARE_GENERATOR
// Visit the load node with a value and opcode to replace with.


@ -20,7 +20,9 @@ Graph::Graph(Zone* zone)
end_(nullptr),
mark_max_(0),
next_node_id_(0),
- decorators_(zone) {
+ decorators_(zone),
+ has_simd_(false),
+ simd_stores_(zone) {
// Nodes use compressed pointers, so zone must support pointer compression.
// If the check fails, ensure the zone is created with kCompressGraphZone
// flag.
@ -78,6 +80,10 @@ NodeId Graph::NextNodeId() {
void Graph::Print() const { StdoutStream{} << AsRPO(*this); }
void Graph::RecordSimdStore(Node* store) { simd_stores_.push_back(store); }
ZoneVector<Node*> const& Graph::GetSimdStoreNodes() { return simd_stores_; }
} // namespace compiler
} // namespace internal
} // namespace v8


@ -95,6 +95,12 @@ class V8_EXPORT_PRIVATE Graph final : public NON_EXPORTED_BASE(ZoneObject) {
// Very simple print API usable in a debugger.
void Print() const;
bool HasSimd() const { return has_simd_; }
void SetSimd(bool has_simd) { has_simd_ = has_simd; }
void RecordSimdStore(Node* store);
ZoneVector<Node*> const& GetSimdStoreNodes();
private:
friend class NodeMarkerBase;
@ -106,6 +112,8 @@ class V8_EXPORT_PRIVATE Graph final : public NON_EXPORTED_BASE(ZoneObject) {
Mark mark_max_;
NodeId next_node_id_;
ZoneVector<GraphDecorator*> decorators_;
bool has_simd_;
ZoneVector<Node*> simd_stores_;
};
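These hooks are fed by the WASM graph builder later in this commit: each simd128 store is recorded as a revectorization seed, following this pattern (excerpted from the wasm-compiler.cc hunk below):

if (mem_rep == MachineRepresentation::kSimd128) {
  graph()->RecordSimdStore(store);
}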


@ -99,8 +99,8 @@ Node* LinearScheduler::GetEarlySchedulePosition(Node* node) {
NodeState& use = stack.top();
if (use.early_schedule_position == nullptr ||
GetControlLevel(use.early_schedule_position) <
-     GetControlLevel(top.early_schedule_position)) {
-   use.early_schedule_position = top.early_schedule_position;
+     GetControlLevel(early_schedule_position)) {
+   use.early_schedule_position = early_schedule_position;
}
}
}


@ -126,6 +126,11 @@ std::ostream& operator<<(std::ostream& os, LoadTransformation rep) {
return os << "kS128Load32Zero";
case LoadTransformation::kS128Load64Zero:
return os << "kS128Load64Zero";
// Simd256
case LoadTransformation::kS256Load32Splat:
return os << "kS256Load32Splat";
case LoadTransformation::kS256Load64Splat:
return os << "kS256Load64Splat";
}
UNREACHABLE();
}
@ -637,7 +642,18 @@ std::ostream& operator<<(std::ostream& os, TruncateKind kind) {
V(I32x4RelaxedTruncF64x2UZero, Operator::kNoProperties, 1, 0, 1) \
V(I16x8RelaxedQ15MulRS, Operator::kCommutative, 2, 0, 1) \
V(I16x8DotI8x16I7x16S, Operator::kCommutative, 2, 0, 1) \
- V(I32x4DotI8x16I7x16AddS, Operator::kNoProperties, 3, 0, 1)
+ V(I32x4DotI8x16I7x16AddS, Operator::kNoProperties, 3, 0, 1) \
+ V(F32x8Add, Operator::kCommutative, 2, 0, 1) \
+ V(F32x8Sub, Operator::kNoProperties, 2, 0, 1) \
+ V(F32x8Mul, Operator::kCommutative, 2, 0, 1) \
+ V(F32x8Div, Operator::kNoProperties, 2, 0, 1) \
+ V(F32x8Pmin, Operator::kNoProperties, 2, 0, 1) \
+ V(F32x8Pmax, Operator::kNoProperties, 2, 0, 1) \
+ V(F32x8Eq, Operator::kCommutative, 2, 0, 1) \
+ V(F32x8Ne, Operator::kCommutative, 2, 0, 1) \
+ V(F32x8Lt, Operator::kNoProperties, 2, 0, 1) \
+ V(F32x8Le, Operator::kNoProperties, 2, 0, 1) \
+ V(S256Select, Operator::kNoProperties, 3, 0, 1)
// The format is:
// V(Name, properties, value_input_count, control_input_count, output_count)
@ -729,7 +745,9 @@ std::ostream& operator<<(std::ostream& os, TruncateKind kind) {
V(S128Load32x2S) \
V(S128Load32x2U) \
V(S128Load32Zero) \
- V(S128Load64Zero)
+ V(S128Load64Zero) \
+ V(S256Load32Splat) \
+ V(S256Load64Splat)
#if TAGGED_SIZE_8_BYTES
@ -2226,6 +2244,21 @@ StackCheckKind StackCheckKindOf(Operator const* op) {
return OpParameter<StackCheckKind>(op);
}
const Operator* MachineOperatorBuilder::ExtractF128(int32_t lane_index) {
DCHECK(0 <= lane_index && lane_index < 2);
class ExtractF128Operator final : public Operator1<int32_t> {
public:
explicit ExtractF128Operator(int32_t lane_index)
: Operator1<int32_t>(IrOpcode::kExtractF128, Operator::kPure,
"ExtractF128", 1, 0, 0, 1, 0, 0, lane_index) {
lane_index_ = lane_index;
}
int32_t lane_index_;
};
return zone_->New<ExtractF128Operator>(lane_index);
}
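A hypothetical use of the new operator (editorial sketch, mirroring the graph()->NewNode() style of the unit test at the end of this commit; graph, machine, and v256 are placeholders): ExtractF128 takes a 256-bit value and yields one of its two 128-bit halves.

// Lane 0 selects the low 128 bits, lane 1 the high 128 bits (see the DCHECK).
Node* low = graph->NewNode(machine->ExtractF128(0), v256);
Node* high = graph->NewNode(machine->ExtractF128(1), v256);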
#undef PURE_BINARY_OP_LIST_32
#undef PURE_BINARY_OP_LIST_64
#undef MACHINE_PURE_OP_LIST


@ -126,6 +126,8 @@ enum class LoadTransformation {
kS128Load32x2U,
kS128Load32Zero,
kS128Load64Zero,
kS256Load32Splat,
kS256Load64Splat,
};
size_t hash_value(LoadTransformation);
@ -964,6 +966,22 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* TraceInstruction(uint32_t markid);
// SIMD256
const Operator* F32x8Add();
const Operator* F32x8Sub();
const Operator* F32x8Mul();
const Operator* F32x8Div();
const Operator* F32x8Min();
const Operator* F32x8Max();
const Operator* F32x8Pmin();
const Operator* F32x8Pmax();
const Operator* F32x8Eq();
const Operator* F32x8Ne();
const Operator* F32x8Lt();
const Operator* F32x8Le();
const Operator* S256Select();
const Operator* ExtractF128(int32_t lane_index);
// load [base + index]
const Operator* Load(LoadRepresentation rep);
const Operator* LoadImmutable(LoadRepresentation rep);


@ -117,6 +117,9 @@ class V8_EXPORT_PRIVATE NodeProperties {
static bool IsPhi(Node* node) {
return IrOpcode::IsPhiOpcode(node->opcode());
}
static bool IsSimd128Operation(Node* node) {
return IrOpcode::IsSimd128Opcode(node->opcode());
}
// Determines whether exceptions thrown by the given node are handled locally
// within the graph (i.e. an IfException projection is present). Optionally


@ -825,248 +825,264 @@
V(StackPointerGreaterThan) \
V(TraceInstruction)
-#define MACHINE_SIMD_OP_LIST(V) \
+#define MACHINE_SIMD128_OP_LIST(V) \
V(F64x2Splat) \
V(F64x2ExtractLane) \
V(F64x2ReplaceLane) \
V(F64x2Abs) \
V(F64x2Neg) \
V(F64x2Sqrt) \
V(F64x2Add) \
V(F64x2Sub) \
V(F64x2Mul) \
V(F64x2Div) \
V(F64x2Min) \
V(F64x2Max) \
V(F64x2Eq) \
V(F64x2Ne) \
V(F64x2Lt) \
V(F64x2Le) \
V(F64x2Qfma) \
V(F64x2Qfms) \
V(F64x2Pmin) \
V(F64x2Pmax) \
V(F64x2Ceil) \
V(F64x2Floor) \
V(F64x2Trunc) \
V(F64x2NearestInt) \
V(F64x2ConvertLowI32x4S) \
V(F64x2ConvertLowI32x4U) \
V(F64x2PromoteLowF32x4) \
V(F32x4Splat) \
V(F32x4ExtractLane) \
V(F32x4ReplaceLane) \
V(F32x4SConvertI32x4) \
V(F32x4UConvertI32x4) \
V(F32x4Abs) \
V(F32x4Neg) \
V(F32x4Sqrt) \
V(F32x4Add) \
V(F32x4Sub) \
V(F32x4Mul) \
V(F32x4Div) \
V(F32x4Min) \
V(F32x4Max) \
V(F32x4Eq) \
V(F32x4Ne) \
V(F32x4Lt) \
V(F32x4Le) \
V(F32x4Gt) \
V(F32x4Ge) \
V(F32x4Qfma) \
V(F32x4Qfms) \
V(F32x4Pmin) \
V(F32x4Pmax) \
V(F32x4Ceil) \
V(F32x4Floor) \
V(F32x4Trunc) \
V(F32x4NearestInt) \
V(F32x4DemoteF64x2Zero) \
V(I64x2Splat) \
V(I64x2SplatI32Pair) \
V(I64x2ExtractLane) \
V(I64x2ReplaceLane) \
V(I64x2ReplaceLaneI32Pair) \
V(I64x2Abs) \
V(I64x2Neg) \
V(I64x2SConvertI32x4Low) \
V(I64x2SConvertI32x4High) \
V(I64x2UConvertI32x4Low) \
V(I64x2UConvertI32x4High) \
V(I64x2BitMask) \
V(I64x2Shl) \
V(I64x2ShrS) \
V(I64x2Add) \
V(I64x2Sub) \
V(I64x2Mul) \
V(I64x2Eq) \
V(I64x2Ne) \
V(I64x2GtS) \
V(I64x2GeS) \
V(I64x2ShrU) \
V(I64x2ExtMulLowI32x4S) \
V(I64x2ExtMulHighI32x4S) \
V(I64x2ExtMulLowI32x4U) \
V(I64x2ExtMulHighI32x4U) \
V(I32x4Splat) \
V(I32x4ExtractLane) \
V(I32x4ReplaceLane) \
V(I32x4SConvertF32x4) \
V(I32x4SConvertI16x8Low) \
V(I32x4SConvertI16x8High) \
V(I32x4Neg) \
V(I32x4Shl) \
V(I32x4ShrS) \
V(I32x4Add) \
V(I32x4Sub) \
V(I32x4Mul) \
V(I32x4MinS) \
V(I32x4MaxS) \
V(I32x4Eq) \
V(I32x4Ne) \
V(I32x4LtS) \
V(I32x4LeS) \
V(I32x4GtS) \
V(I32x4GeS) \
V(I32x4UConvertF32x4) \
V(I32x4UConvertI16x8Low) \
V(I32x4UConvertI16x8High) \
V(I32x4ShrU) \
V(I32x4MinU) \
V(I32x4MaxU) \
V(I32x4LtU) \
V(I32x4LeU) \
V(I32x4GtU) \
V(I32x4GeU) \
V(I32x4Abs) \
V(I32x4BitMask) \
V(I32x4DotI16x8S) \
V(I32x4ExtMulLowI16x8S) \
V(I32x4ExtMulHighI16x8S) \
V(I32x4ExtMulLowI16x8U) \
V(I32x4ExtMulHighI16x8U) \
V(I32x4ExtAddPairwiseI16x8S) \
V(I32x4ExtAddPairwiseI16x8U) \
V(I32x4TruncSatF64x2SZero) \
V(I32x4TruncSatF64x2UZero) \
V(I16x8Splat) \
V(I16x8ExtractLaneU) \
V(I16x8ExtractLaneS) \
V(I16x8ReplaceLane) \
V(I16x8SConvertI8x16Low) \
V(I16x8SConvertI8x16High) \
V(I16x8Neg) \
V(I16x8Shl) \
V(I16x8ShrS) \
V(I16x8SConvertI32x4) \
V(I16x8Add) \
V(I16x8AddSatS) \
V(I16x8Sub) \
V(I16x8SubSatS) \
V(I16x8Mul) \
V(I16x8MinS) \
V(I16x8MaxS) \
V(I16x8Eq) \
V(I16x8Ne) \
V(I16x8LtS) \
V(I16x8LeS) \
V(I16x8GtS) \
V(I16x8GeS) \
V(I16x8UConvertI8x16Low) \
V(I16x8UConvertI8x16High) \
V(I16x8ShrU) \
V(I16x8UConvertI32x4) \
V(I16x8AddSatU) \
V(I16x8SubSatU) \
V(I16x8MinU) \
V(I16x8MaxU) \
V(I16x8LtU) \
V(I16x8LeU) \
V(I16x8GtU) \
V(I16x8GeU) \
V(I16x8RoundingAverageU) \
V(I16x8Q15MulRSatS) \
V(I16x8Abs) \
V(I16x8BitMask) \
V(I16x8ExtMulLowI8x16S) \
V(I16x8ExtMulHighI8x16S) \
V(I16x8ExtMulLowI8x16U) \
V(I16x8ExtMulHighI8x16U) \
V(I16x8ExtAddPairwiseI8x16S) \
V(I16x8ExtAddPairwiseI8x16U) \
V(I8x16Splat) \
V(I8x16ExtractLaneU) \
V(I8x16ExtractLaneS) \
V(I8x16ReplaceLane) \
V(I8x16SConvertI16x8) \
V(I8x16Neg) \
V(I8x16Shl) \
V(I8x16ShrS) \
V(I8x16Add) \
V(I8x16AddSatS) \
V(I8x16Sub) \
V(I8x16SubSatS) \
V(I8x16MinS) \
V(I8x16MaxS) \
V(I8x16Eq) \
V(I8x16Ne) \
V(I8x16LtS) \
V(I8x16LeS) \
V(I8x16GtS) \
V(I8x16GeS) \
V(I8x16UConvertI16x8) \
V(I8x16AddSatU) \
V(I8x16SubSatU) \
V(I8x16ShrU) \
V(I8x16MinU) \
V(I8x16MaxU) \
V(I8x16LtU) \
V(I8x16LeU) \
V(I8x16GtU) \
V(I8x16GeU) \
V(I8x16RoundingAverageU) \
V(I8x16Popcnt) \
V(I8x16Abs) \
V(I8x16BitMask) \
V(S128Zero) \
V(S128Const) \
V(S128Not) \
V(S128And) \
V(S128Or) \
V(S128Xor) \
V(S128Select) \
V(S128AndNot) \
V(I8x16Swizzle) \
V(I8x16RelaxedLaneSelect) \
V(I16x8RelaxedLaneSelect) \
V(I32x4RelaxedLaneSelect) \
V(I64x2RelaxedLaneSelect) \
V(F32x4RelaxedMin) \
V(F32x4RelaxedMax) \
V(F64x2RelaxedMin) \
V(F64x2RelaxedMax) \
V(I32x4RelaxedTruncF32x4S) \
V(I32x4RelaxedTruncF32x4U) \
V(I32x4RelaxedTruncF64x2SZero) \
V(I32x4RelaxedTruncF64x2UZero) \
V(I16x8RelaxedQ15MulRS) \
V(I16x8DotI8x16I7x16S) \
V(I32x4DotI8x16I7x16AddS) \
V(I8x16Shuffle) \
V(V128AnyTrue) \
V(I64x2AllTrue) \
V(I32x4AllTrue) \
V(I16x8AllTrue) \
V(I8x16AllTrue) \
V(LoadTransform) \
V(LoadLane) \
V(StoreLane)
// SIMD256 for AVX
#define MACHINE_SIMD256_OP_LIST(V) \
V(F32x8Add) \
V(F32x8Sub) \
V(F32x8Mul) \
V(F32x8Div) \
V(F32x8Pmin) \
V(F32x8Pmax) \
V(F32x8Eq) \
V(F32x8Ne) \
V(F32x8Lt) \
V(F32x8Le) \
V(S256Select) \
V(ExtractF128)
#define VALUE_OP_LIST(V) \
COMMON_OP_LIST(V) \
SIMPLIFIED_OP_LIST(V) \
MACHINE_OP_LIST(V) \
- MACHINE_SIMD_OP_LIST(V) \
+ MACHINE_SIMD128_OP_LIST(V) \
+ MACHINE_SIMD256_OP_LIST(V) \
JS_OP_LIST(V)
// The combination of all operators at all levels and the common operators.
@ -1252,6 +1268,18 @@ class V8_EXPORT_PRIVATE IrOpcode {
}
UNREACHABLE();
}
static bool IsSimd128Opcode(Value value) {
#define CASE(Name, ...) case k##Name:
switch (value) {
MACHINE_SIMD128_OP_LIST(CASE)
return true;
default:
return false;
}
#undef CASE
UNREACHABLE();
}
};
V8_EXPORT_PRIVATE std::ostream& operator<<(std::ostream&, IrOpcode::Value);


@ -126,7 +126,8 @@ bool OperatorProperties::NeedsExactContext(const Operator* op) {
COMMON_OP_LIST(CASE)
CONTROL_OP_LIST(CASE)
MACHINE_OP_LIST(CASE)
- MACHINE_SIMD_OP_LIST(CASE)
+ MACHINE_SIMD128_OP_LIST(CASE)
+ MACHINE_SIMD256_OP_LIST(CASE)
SIMPLIFIED_OP_LIST(CASE)
break;
#undef CASE


@ -129,6 +129,10 @@
#include "src/wasm/wasm-engine.h"
#endif // V8_ENABLE_WEBASSEMBLY
#if V8_ENABLE_WASM_SIMD256_REVEC
#include "src/compiler/revectorizer.h"
#endif // V8_ENABLE_WASM_SIMD256_REVEC
namespace v8 {
namespace internal {
namespace compiler {
@ -728,6 +732,10 @@ class PipelineImpl final {
// Substep B.1. Produce a scheduled graph.
void ComputeScheduledGraph();
#if V8_ENABLE_WASM_SIMD256_REVEC
void Revectorize();
#endif // V8_ENABLE_WASM_SIMD256_REVEC
// Substep B.2. Select instructions from a scheduled graph.
bool SelectInstructions(Linkage* linkage);
@ -2355,6 +2363,17 @@ struct ComputeSchedulePhase {
}
};
#if V8_ENABLE_WASM_SIMD256_REVEC
struct RevectorizePhase {
DECL_PIPELINE_PHASE_CONSTANTS(Revectorizer)
void Run(PipelineData* data, Zone* temp_zone) {
Revectorizer revec(temp_zone, data->graph(), data->mcgraph());
revec.TryRevectorize(data->info()->GetDebugName().get());
}
};
#endif // V8_ENABLE_WASM_SIMD256_REVEC
struct InstructionRangesAsJSON {
const InstructionSequence* sequence;
const ZoneVector<std::pair<int, int>>* instr_origins;
@ -3458,6 +3477,13 @@ void Pipeline::GenerateCodeForWasmFunction(
pipeline.RunPrintAndVerify("V8.WasmMachineCode", true);
#if V8_ENABLE_WASM_SIMD256_REVEC
if (v8_flags.experimental_wasm_revectorize) {
pipeline.Revectorize();
pipeline.RunPrintAndVerify("V8.WasmRevec", true);
}
#endif // V8_ENABLE_WASM_SIMD256_REVEC
data.BeginPhaseKind("V8.WasmOptimization");
if (v8_flags.wasm_inlining) {
pipeline.Run<WasmInliningPhase>(env, function_index, wire_bytes_storage,
@ -3763,6 +3789,10 @@ void PipelineImpl::ComputeScheduledGraph() {
TraceScheduleAndVerify(data->info(), data, data->schedule(), "schedule");
}
#if V8_ENABLE_WASM_SIMD256_REVEC
void PipelineImpl::Revectorize() { Run<RevectorizePhase>(); }
#endif // V8_ENABLE_WASM_SIMD256_REVEC
bool PipelineImpl::SelectInstructions(Linkage* linkage) {
auto call_descriptor = linkage->GetIncomingDescriptor();
PipelineData* data = this->data_;


@ -0,0 +1,647 @@
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/compiler/revectorizer.h"
#include "src/base/cpu.h"
#include "src/base/logging.h"
#include "src/compiler/all-nodes.h"
#include "src/compiler/machine-operator.h"
#include "src/compiler/verifier.h"
namespace v8 {
namespace internal {
namespace compiler {
#define TRACE(...) \
do { \
if (v8_flags.trace_wasm_revectorize) { \
PrintF("Revec: "); \
PrintF(__VA_ARGS__); \
} \
} while (false)
namespace {
#ifdef DEBUG
// Currently, only Load/ProtectedLoad/LoadTransform are supported.
// TODO(jiepan): add support for UnalignedLoad, LoadLane
bool IsSupportedLoad(const Node* node) {
if (node->opcode() == IrOpcode::kProtectedLoad ||
node->opcode() == IrOpcode::kLoad ||
node->opcode() == IrOpcode::kLoadTransform) {
return true;
}
return false;
}
bool IsSupportedLoad(const ZoneVector<Node*>& node_group) {
for (auto node : node_group) {
if (!IsSupportedLoad(node)) return false;
}
return true;
}
#endif
// Returns the value of an Int64Constant node, or -1 if it is not one.
int64_t GetConstantValue(const Node* node) {
int64_t value = -1;
if (node->opcode() == IrOpcode::kInt64Constant) {
value = OpParameter<int64_t>(node->op());
}
return value;
}
// Returns the constant offset of a load/store node's address, or -1 if it
// cannot be determined.
int64_t GetMemoryOffsetValue(const Node* node) {
DCHECK(node->opcode() == IrOpcode::kProtectedLoad ||
node->opcode() == IrOpcode::kStore ||
node->opcode() == IrOpcode::kProtectedStore);
Node* offset = node->InputAt(0);
if (offset->opcode() == IrOpcode::kLoadFromObject ||
offset->opcode() == IrOpcode::kLoad) {
return 0;
}
int64_t offset_value = -1;
if (offset->opcode() == IrOpcode::kInt64Add) {
if (NodeProperties::IsConstant(offset->InputAt(0))) {
offset_value = GetConstantValue(offset->InputAt(0));
} else if (NodeProperties::IsConstant(offset->InputAt(1))) {
offset_value = GetConstantValue(offset->InputAt(1));
}
}
return offset_value;
}
// We want to combine load/store nodes that access continuous memory. For a
// load/store node, input(0) is memory_start + offset and input(1) is the
// index. We currently use the index as the address of the node; nodes with
// the same index and continuous offsets can be combined together.
Node* GetNodeAddress(const Node* node) {
Node* address = node->InputAt(1);
// The index is changed to Uint64 for memory32
if (address->opcode() == IrOpcode::kChangeUint32ToUint64) {
address = address->InputAt(0);
}
return address;
}
bool IsContinuousAccess(const ZoneVector<Node*>& node_group) {
DCHECK_GT(node_group.size(), 0);
int64_t previous_offset = GetMemoryOffsetValue(node_group[0]);
for (size_t i = 1; i < node_group.size(); ++i) {
int64_t current_offset = GetMemoryOffsetValue(node_group[i]);
int64_t diff = current_offset - previous_offset;
if (diff != kSimd128Size) {
TRACE("Non-continuous store!");
return false;
}
previous_offset = current_offset;
}
return true;
}
// Returns true if all of the nodes in node_group are constants.
bool AllConstant(const ZoneVector<Node*>& node_group) {
for (Node* node : node_group) {
if (!NodeProperties::IsConstant(node)) {
return false;
}
}
return true;
}
// Returns true if all the addresses of the nodes in node_group are identical.
bool AllSameAddress(const ZoneVector<Node*>& nodes) {
Node* address = GetNodeAddress(nodes[0]);
for (size_t i = 1; i < nodes.size(); i++) {
if (GetNodeAddress(nodes[i]) != address) {
TRACE("Diff address #%d,#%d!\n", address->id(),
GetNodeAddress(nodes[i])->id());
return false;
}
}
return true;
}
// Returns true if all of the nodes in node_group are identical.
// The splat opcode in WASM SIMD is used to create a vector with identical
// lanes.
bool IsSplat(const ZoneVector<Node*>& node_group) {
for (ZoneVector<Node*>::size_type i = 1; i < node_group.size(); ++i) {
if (node_group[i] != node_group[0]) {
return false;
}
}
return true;
}
// Returns true if all of the nodes in node_group have the same operator.
bool AllSameOperator(const ZoneVector<Node*>& node_group) {
auto op = node_group[0]->op();
for (ZoneVector<Node*>::size_type i = 1; i < node_group.size(); i++) {
if (node_group[i]->op() != op) {
return false;
}
}
return true;
}
} // anonymous namespace
// Sort load/store nodes by offset.
bool MemoryOffsetComparer::operator()(const Node* lhs, const Node* rhs) const {
return GetMemoryOffsetValue(lhs) < GetMemoryOffsetValue(rhs);
}
void PackNode::Print() const {
if (revectorized_node_ != nullptr) {
TRACE("0x%p #%d:%s(%d %d, %s)\n", this, revectorized_node_->id(),
revectorized_node_->op()->mnemonic(), nodes_[0]->id(),
nodes_[1]->id(), nodes_[0]->op()->mnemonic());
} else {
TRACE("0x%p null(%d %d, %s)\n", this, nodes_[0]->id(), nodes_[1]->id(),
nodes_[0]->op()->mnemonic());
}
}
bool SLPTree::CanBePacked(const ZoneVector<Node*>& node_group) {
DCHECK_EQ(node_group.size(), 2);
if (!SameBasicBlock(node_group[0], node_group[1])) {
TRACE("%s(#%d, #%d) not in same BB!\n", node_group[0]->op()->mnemonic(),
node_group[0]->id(), node_group[1]->id());
return false;
}
if (!AllSameOperator(node_group)) {
TRACE("%s(#%d, #%d) have different operator!\n",
node_group[0]->op()->mnemonic(), node_group[0]->id(),
node_group[1]->id());
return false;
}
// TODO(jiepan): add support for Constant
if (AllConstant(node_group)) {
TRACE("%s(#%d, #%d) are constantant, not supported yet!\n",
node_group[0]->op()->mnemonic(), node_group[0]->id(),
node_group[1]->id());
return false;
}
// Only support simd128 operators, or common operators with simd128
// MachineRepresentation. The MachineRepresentation of the root has already
// been checked, and leaf nodes will be checked later. Here we omit the
// MachineRepresentation check and only check the opcode itself.
IrOpcode::Value op = node_group[0]->opcode();
if (NodeProperties::IsSimd128Operation(node_group[0]) ||
(op == IrOpcode::kStore) || (op == IrOpcode::kProtectedStore) ||
(op == IrOpcode::kLoad) || (op == IrOpcode::kProtectedLoad) ||
(op == IrOpcode::kPhi) || (op == IrOpcode::kLoopExitValue) ||
(op == IrOpcode::kExtractF128)) {
return true;
}
return false;
}
PackNode* SLPTree::NewPackNode(const ZoneVector<Node*>& node_group) {
TRACE("PackNode %s(#%d:, #%d)\n", node_group[0]->op()->mnemonic(),
node_group[0]->id(), node_group[1]->id());
PackNode* pnode = zone_->New<PackNode>(zone_, node_group);
for (Node* node : node_group) {
node_to_packnode_[node] = pnode;
}
return pnode;
}
PackNode* SLPTree::NewPackNodeAndRecurs(const ZoneVector<Node*>& node_group,
int start_index, int count,
unsigned recursion_depth) {
PackNode* pnode = NewPackNode(node_group);
for (int i = start_index; i < start_index + count; ++i) {
ZoneVector<Node*> operands(zone_);
// Prepare the operand vector.
for (size_t j = 0; j < node_group.size(); j++) {
Node* node = node_group[j];
operands.push_back(NodeProperties::GetValueInput(node, i));
}
PackNode* child = BuildTreeRec(operands, recursion_depth + 1);
if (child) {
pnode->SetOperand(i, child);
} else {
return nullptr;
}
}
return pnode;
}
PackNode* SLPTree::GetPackNode(Node* node) {
auto I = node_to_packnode_.find(node);
if (I != node_to_packnode_.end()) {
return I->second;
}
return nullptr;
}
void SLPTree::PushStack(const ZoneVector<Node*>& node_group) {
TRACE("Stack Push (%d %s, %d %s)\n", node_group[0]->id(),
node_group[0]->op()->mnemonic(), node_group[1]->id(),
node_group[1]->op()->mnemonic());
for (auto node : node_group) {
on_stack_.insert(node);
}
stack_.push({node_group});
}
void SLPTree::PopStack() {
const ZoneVector<Node*>& node_group = stack_.top();
DCHECK_EQ(node_group.size(), 2);
TRACE("Stack Pop (%d %s, %d %s)\n", node_group[0]->id(),
node_group[0]->op()->mnemonic(), node_group[1]->id(),
node_group[1]->op()->mnemonic());
for (auto node : node_group) {
on_stack_.erase(node);
}
stack_.pop();
}
bool SLPTree::OnStack(Node* node) {
return on_stack_.find(node) != on_stack_.end();
}
// Returns true if any node in node_group is already on the stack.
bool SLPTree::AllOnStack(const ZoneVector<Node*>& node_group) {
for (auto node : node_group) {
if (OnStack(node)) return true;
}
return false;
}
bool SLPTree::StackTopIsPhi() {
const ZoneVector<Node*>& node_group = stack_.top();
DCHECK_EQ(node_group.size(), 2);
return NodeProperties::IsPhi(node_group[0]);
}
void SLPTree::ClearStack() {
stack_ = ZoneStack<ZoneVector<Node*>>(zone_);
on_stack_.clear();
}
bool SLPTree::IsSideEffectFreeLoad(const ZoneVector<Node*>& node_group) {
DCHECK(IsSupportedLoad(node_group));
DCHECK_EQ(node_group.size(), 2);
TRACE("Enter IsSideEffectFreeLoad (%d %s, %d %s)\n", node_group[0]->id(),
node_group[0]->op()->mnemonic(), node_group[1]->id(),
node_group[1]->op()->mnemonic());
std::stack<Node*> to_visit;
std::unordered_set<Node*> visited;
// Visit all the inputs (except for control inputs) of Loads.
for (size_t i = 0, e = node_group.size(); i < e; i++) {
Node* load = node_group[i];
for (int j = 0; j < NodeProperties::FirstControlIndex(load); ++j) {
Node* input = load->InputAt(j);
if (std::find(node_group.begin(), node_group.end(), input) ==
node_group.end()) {
to_visit.push(input);
}
}
}
// Check the inputs of the Loads to see whether they are connected to nodes
// already in the SLPTree. If they are, the Loads have a dependency on the
// tree and such Loads cannot be merged.
while (!to_visit.empty()) {
Node* input = to_visit.top();
to_visit.pop();
TRACE("IsSideEffectFreeLoad visit (%d %s)\n", input->id(),
input->op()->mnemonic());
if (visited.find(input) == visited.end()) {
visited.insert(input);
if (OnStack(input)) {
TRACE("Has internal dependency because (%d %s) on stack\n", input->id(),
input->op()->mnemonic());
return false;
}
// If the input is not in the same basic block as the Loads, it cannot be
// in the SLPTree. Otherwise, recursively visit all of the input's inputs
// to see whether they are connected to the SLPTree.
if (SameBasicBlock(input, node_group[0])) {
for (int i = 0; i < NodeProperties::FirstControlIndex(input); ++i) {
to_visit.push(input->InputAt(i));
}
}
}
}
return true;
}
PackNode* SLPTree::BuildTree(const ZoneVector<Node*>& roots) {
TRACE("Enter %s\n", __func__);
DeleteTree();
root_ = BuildTreeRec(roots, 0);
return root_;
}
PackNode* SLPTree::BuildTreeRec(const ZoneVector<Node*>& node_group,
unsigned recursion_depth) {
TRACE("Enter %s\n", __func__);
DCHECK_EQ(node_group.size(), 2);
Node* node0 = node_group[0];
Node* node1 = node_group[1];
if (recursion_depth == RecursionMaxDepth) {
TRACE("Failed due to max recursion depth!\n");
return nullptr;
}
if (AllOnStack(node_group)) {
if (!StackTopIsPhi()) {
TRACE("Failed due to (%d %s, %d %s) on stack!\n", node0->id(),
node0->op()->mnemonic(), node1->id(), node1->op()->mnemonic());
return nullptr;
}
}
PushStack(node_group);
if (!CanBePacked(node_group)) {
return nullptr;
}
DCHECK(AllConstant(node_group) || AllSameOperator(node_group));
// Check if this is a duplicate of another entry.
for (Node* node : node_group) {
if (PackNode* p = GetPackNode(node)) {
if (!p->IsSame(node_group)) {
// TODO(jiepan): Gathering due to partial overlap
TRACE("Failed due to partial overlap at #%d,%s!\n", node->id(),
node->op()->mnemonic());
return nullptr;
}
PopStack();
TRACE("Perfect diamond merge at #%d,%s\n", node->id(),
node->op()->mnemonic());
return p;
}
}
if (node0->opcode() == IrOpcode::kExtractF128) {
Node* source = node0->InputAt(0);
TRACE("Extract leaf node from #%d,%s!\n", source->id(),
source->op()->mnemonic());
// For 256-bit revectorization, both extracts must come from the same source,
// with consecutive lane indices (lane, lane + 1); for a LoadTransform source
// the two nodes must be identical.
if (node0->InputAt(0) == node1->InputAt(0) &&
(node0->InputAt(0)->opcode() == IrOpcode::kLoadTransform
? node0 == node1
: OpParameter<int32_t>(node0->op()) + 1 ==
OpParameter<int32_t>(node1->op()))) {
TRACE("Added a pair of Extract.\n");
PackNode* pnode = NewPackNode(node_group);
PopStack();
return pnode;
}
TRACE("Failed due to ExtractF128!\n");
return nullptr;
}
if (node0->opcode() == IrOpcode::kProtectedLoad ||
node0->opcode() == IrOpcode::kLoadTransform) {
TRACE("Load leaf node\n");
if (!AllSameAddress(node_group)) {
TRACE("Failed due to different load addr!\n");
return nullptr;
}
if (node0->opcode() == IrOpcode::kProtectedLoad) {
MachineRepresentation rep =
LoadRepresentationOf(node0->op()).representation();
if (rep != MachineRepresentation::kSimd128) {
return nullptr;
}
// Sort loads by offset.
ZoneVector<Node*> sorted_node_group(node_group.size(), zone_);
std::partial_sort_copy(std::begin(node_group), std::end(node_group),
std::begin(sorted_node_group), std::end(sorted_node_group),
MemoryOffsetComparer());
if (!IsContinuousAccess(sorted_node_group)) {
TRACE("Failed due to non-continuous load!\n");
return nullptr;
}
}
if (node0->opcode() == IrOpcode::kLoadTransform) {
if (!IsSplat(node_group)) {
TRACE("LoadTransform Failed due to IsSplat!\n");
return nullptr;
}
LoadTransformParameters params = LoadTransformParametersOf(node0->op());
// TODO(jiepan): Support more LoadTransformation types
if (params.transformation != LoadTransformation::kS128Load32Splat &&
params.transformation != LoadTransformation::kS128Load64Splat) {
TRACE("LoadTransform failed due to unsupported type #%d!\n",
node0->id());
return nullptr;
}
}
if (!IsSideEffectFreeLoad(node_group)) {
TRACE("Failed due to dependency check\n");
return nullptr;
}
PackNode* p = NewPackNode(node_group);
PopStack();
return p;
}
int value_in_count = node0->op()->ValueInputCount();
switch (node0->opcode()) {
case IrOpcode::kPhi: {
TRACE("Added a vector of PHI nodes.\n");
MachineRepresentation rep = PhiRepresentationOf(node0->op());
if (rep != MachineRepresentation::kSimd128) {
return nullptr;
}
PackNode* pnode =
NewPackNodeAndRecurs(node_group, 0, value_in_count, recursion_depth);
PopStack();
return pnode;
}
case IrOpcode::kLoopExitValue: {
MachineRepresentation rep = LoopExitValueRepresentationOf(node0->op());
if (rep != MachineRepresentation::kSimd128) {
return nullptr;
}
PackNode* pnode =
NewPackNodeAndRecurs(node_group, 0, value_in_count, recursion_depth);
PopStack();
return pnode;
}
case IrOpcode::kF32x4Add:
case IrOpcode::kF32x4Mul: {
TRACE("Added a vector of un/bin/ter op.\n");
PackNode* pnode =
NewPackNodeAndRecurs(node_group, 0, value_in_count, recursion_depth);
PopStack();
return pnode;
}
// TODO(jiepan): UnalignedStore,
case IrOpcode::kStore:
case IrOpcode::kProtectedStore: {
TRACE("Added a vector of stores.\n");
if (!AllSameAddress(node_group)) {
TRACE("Failed due to different store addr!\n");
return nullptr;
}
PackNode* pnode = NewPackNodeAndRecurs(node_group, 2, 1, recursion_depth);
PopStack();
return pnode;
}
default:
TRACE("Default branch #%d:%s\n", node0->id(), node0->op()->mnemonic());
break;
}
return nullptr;
}
void SLPTree::DeleteTree() {
ClearStack();
node_to_packnode_.clear();
}
void SLPTree::Print(const char* info) {
TRACE("%s, Packed node:\n", info);
if (!v8_flags.trace_wasm_revectorize) {
return;
}
std::unordered_set<PackNode const*> visited;
for (auto& entry : node_to_packnode_) {
PackNode const* pnode = entry.second;
if (!pnode || visited.find(pnode) != visited.end()) {
continue;
}
pnode->Print();
visited.insert(pnode);
}
}
//////////////////////////////////////////////////////
void Revectorizer::DetectCPUFeatures() {
base::CPU cpu;
if (cpu.has_avx2()) {
support_simd256_ = true;
}
}
bool Revectorizer::TryRevectorize(const char* function) {
bool success = false;
if (support_simd256_ && graph_->GetSimdStoreNodes().size()) {
TRACE("TryRevectorize %s\n", function);
CollectSeeds();
for (auto entry : group_of_stores_) {
ZoneMap<Node*, StoreNodeSet>* store_chains = entry.second;
if (store_chains != nullptr) {
PrintStores(store_chains);
if (ReduceStoreChains(store_chains)) {
TRACE("Successful revectorize %s\n", function);
success = true;
}
}
}
TRACE("Finish revectorize %s\n", function);
}
return success;
}
void Revectorizer::CollectSeeds() {
for (auto it = graph_->GetSimdStoreNodes().begin();
it != graph_->GetSimdStoreNodes().end(); ++it) {
Node* node = *it;
Node* dominator = slp_tree_->GetEarlySchedulePosition(node);
if ((GetMemoryOffsetValue(node) % kSimd128Size) != 0) {
continue;
}
Node* address = GetNodeAddress(node);
ZoneMap<Node*, StoreNodeSet>* store_nodes;
auto first_level_iter = group_of_stores_.find(dominator);
if (first_level_iter == group_of_stores_.end()) {
store_nodes = zone_->New<ZoneMap<Node*, StoreNodeSet>>(zone_);
group_of_stores_[dominator] = store_nodes;
} else {
store_nodes = first_level_iter->second;
}
auto second_level_iter = store_nodes->find(address);
if (second_level_iter == store_nodes->end()) {
second_level_iter =
store_nodes->insert({address, StoreNodeSet(zone())}).first;
}
second_level_iter->second.insert(node);
}
}
bool Revectorizer::ReduceStoreChains(
ZoneMap<Node*, StoreNodeSet>* store_chains) {
TRACE("Enter %s\n", __func__);
bool changed = false;
for (auto chain_iter = store_chains->cbegin();
chain_iter != store_chains->cend(); ++chain_iter) {
if (chain_iter->second.size() >= 2 && chain_iter->second.size() % 2 == 0) {
ZoneVector<Node*> store_chain(chain_iter->second.begin(),
chain_iter->second.end(), zone_);
for (auto it = store_chain.begin(); it < store_chain.end(); it = it + 2) {
ZoneVector<Node*> stores_unit(it, it + 2, zone_);
if (ReduceStoreChain(stores_unit)) {
changed = true;
}
}
}
}
return changed;
}
bool Revectorizer::ReduceStoreChain(const ZoneVector<Node*>& Stores) {
TRACE("Enter %s, root@ (#%d,#%d)\n", __func__, Stores[0]->id(),
Stores[1]->id());
if (!IsContinuousAccess(Stores)) {
return false;
}
PackNode* root = slp_tree_->BuildTree(Stores);
if (!root) {
TRACE("Build tree failed!\n");
return false;
}
slp_tree_->Print("After build tree");
TRACE("\n");
return true;
}
void Revectorizer::PrintStores(ZoneMap<Node*, StoreNodeSet>* store_chains) {
if (!v8_flags.trace_wasm_revectorize) {
return;
}
TRACE("Enter %s\n", __func__);
for (auto it = store_chains->cbegin(); it != store_chains->cend(); ++it) {
if (it->second.size() > 0) {
TRACE("address = #%d:%s \n", it->first->id(),
it->first->op()->mnemonic());
for (auto node : it->second) {
TRACE("#%d:%s, ", node->id(), node->op()->mnemonic());
}
TRACE("\n");
}
}
}
} // namespace compiler
} // namespace internal
} // namespace v8
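Editorial note: in this patch, ReduceStoreChain stops after the SLPTree is built and printed; replacing the PackNodes with actual simd256 machine nodes is not yet wired up here and appears to be left for follow-up work.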

src/compiler/revectorizer.h

@ -0,0 +1,197 @@
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_COMPILER_REVECTORIZER_H_
#define V8_COMPILER_REVECTORIZER_H_
// Revectorizer is an optimization that promotes pairs of simd128 nodes to
// new simd256 nodes, accelerated by the wider vectors available in hardware,
// e.g. the YMM registers of the AVX2 instruction set, when possible and
// beneficial. The main algorithm is based on the Superword Level Parallelism
// (SLP) vectorization technique.
#include <vector>
#include "src/base/small-vector.h"
#include "src/compiler/graph.h"
#include "src/compiler/linear-scheduler.h"
#include "src/compiler/machine-graph.h"
#include "src/compiler/machine-operator.h"
#include "src/compiler/node-marker.h"
#include "src/compiler/node-properties.h"
#include "src/compiler/node.h"
#include "src/compiler/schedule.h"
#include "src/zone/zone-containers.h"
namespace v8 {
namespace internal {
namespace compiler {
struct V8_EXPORT_PRIVATE MemoryOffsetComparer {
bool operator()(const Node* lhs, const Node* rhs) const;
};
using StoreNodeSet = ZoneSet<Node*, MemoryOffsetComparer>;
// A PackNode consists of a fixed number of isomorphic simd128 nodes which can
// execute in parallel and be converted to a 256-bit simd node later. The
// nodes in a PackNode must be schedulable in the same basic block and
// mutually independent.
class PackNode final : public NON_EXPORTED_BASE(ZoneObject) {
public:
explicit PackNode(Zone* zone, const ZoneVector<Node*>& node_group)
: nodes_(node_group.cbegin(), node_group.cend(), zone),
operands_(zone),
revectorized_node_(nullptr) {}
bool IsSame(const ZoneVector<Node*>& node_group) const {
return nodes_ == node_group;
}
const Node* RevectorizedNode() const { return revectorized_node_; }
// Returns the operand at {index} of this PackNode.
PackNode* GetOperand(size_t index) {
DCHECK_LT(index, operands_.size());
return operands_[index];
}
ZoneVector<PackNode*>::size_type GetOperandsSize() const {
return operands_.size();
}
void SetOperand(size_t index, PackNode* pnode) {
if (operands_.size() < index + 1) operands_.resize(index + 1);
operands_[index] = pnode;
}
void Print() const;
private:
ZoneVector<Node*> nodes_;
ZoneVector<PackNode*> operands_;
Node* revectorized_node_;
};
// An auxiliary tree structure with a set of PackNodes based on the Superword
// Level Parallelism (SLP) vectorization technique. The BuildTree method
// starts from a selected root, e.g. a group of consecutive stores, and
// extends through value inputs to create new PackNodes if the inputs are
// valid, or concludes that the current PackNode is a leaf and terminates the
// tree.
// Below is an example of SLPTree where loads and stores in each PackNode are
// all consecutive.
// [Load0, Load1] [Load2, Load3]
// \ /
// [Add0, Add1]
// |
// [Store0, Store1]
class SLPTree : public NON_EXPORTED_BASE(ZoneObject) {
public:
explicit SLPTree(Zone* zone, Graph* graph)
: zone_(zone),
graph_(graph),
root_(nullptr),
on_stack_(zone),
stack_(zone),
node_to_packnode_(zone) {
scheduler_ = zone->New<LinearScheduler>(zone, graph);
}
PackNode* BuildTree(const ZoneVector<Node*>& roots);
void DeleteTree();
PackNode* GetPackNode(Node* node);
void Print(const char* info);
Node* GetEarlySchedulePosition(Node* node) {
return scheduler_->GetEarlySchedulePosition(node);
}
private:
friend class LinearScheduler;
// This is the recursive part of BuildTree.
PackNode* BuildTreeRec(const ZoneVector<Node*>& node_group, unsigned depth);
// Baseline: create a new PackNode, and return.
PackNode* NewPackNode(const ZoneVector<Node*>& node_group);
// Recursion: create a new PackNode and call BuildTreeRec recursively
PackNode* NewPackNodeAndRecurs(const ZoneVector<Node*>& node_group,
int start_index, int count, unsigned depth);
bool CanBePacked(const ZoneVector<Node*>& node_group);
Graph* graph() const { return graph_; }
// Node stack operations.
void PopStack();
void PushStack(const ZoneVector<Node*>& node_group);
void ClearStack();
bool OnStack(Node* node);
bool AllOnStack(const ZoneVector<Node*>& node_group);
bool StackTopIsPhi();
bool IsSideEffectFreeLoad(const ZoneVector<Node*>& node_group);
bool SameBasicBlock(Node* node0, Node* node1) {
return scheduler_->SameBasicBlock(node0, node1);
}
Zone* const zone_;
Graph* const graph_;
PackNode* root_;
LinearScheduler* scheduler_;
ZoneSet<Node*> on_stack_;
ZoneStack<ZoneVector<Node*>> stack_;
// Maps a specific node to PackNode.
ZoneUnorderedMap<Node*, PackNode*> node_to_packnode_;
static constexpr size_t RecursionMaxDepth = 1000;
};
// The Revectorizer pass first collects seeds, i.e. valid groups of
// consecutive stores, as roots to build SLPTrees. If an SLPTree is built
// successfully, it estimates the cost of the 256-bit transformation for each
// PackNode and conducts the final revectorization if beneficial.
class V8_EXPORT_PRIVATE Revectorizer final
: public NON_EXPORTED_BASE(ZoneObject) {
public:
Revectorizer(Zone* zone, Graph* graph, MachineGraph* mcgraph)
: zone_(zone),
graph_(graph),
mcgraph_(mcgraph),
group_of_stores_(zone),
support_simd256_(false) {
DetectCPUFeatures();
slp_tree_ = zone_->New<SLPTree>(zone, graph);
}
void DetectCPUFeatures();
bool TryRevectorize(const char* name);
private:
void CollectSeeds();
bool ReduceStoreChains(ZoneMap<Node*, StoreNodeSet>* store_chains);
bool ReduceStoreChain(const ZoneVector<Node*>& Stores);
void PrintStores(ZoneMap<Node*, StoreNodeSet>* store_chains);
Zone* zone() const { return zone_; }
Graph* graph() const { return graph_; }
MachineGraph* mcgraph() const { return mcgraph_; }
PackNode* GetPackNode(Node* node) const {
return slp_tree_->GetPackNode(node);
}
Zone* const zone_;
Graph* const graph_;
MachineGraph* const mcgraph_;
ZoneMap<Node*, ZoneMap<Node*, StoreNodeSet>*> group_of_stores_;
SLPTree* slp_tree_;
bool support_simd256_;
};
} // namespace compiler
} // namespace internal
} // namespace v8
#endif // V8_COMPILER_REVECTORIZER_H_
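For reference, the RevectorizePhase added in pipeline.cc drives this class as follows:

Revectorizer revec(temp_zone, data->graph(), data->mcgraph());
revec.TryRevectorize(data->info()->GetDebugName().get());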


@ -720,7 +720,8 @@ void SimplifiedLoweringVerifier::VisitNode(Node* node,
// TODO(nicohartmann@): These operators might need to be supported.
break;
}
- MACHINE_SIMD_OP_LIST(CASE)
+ MACHINE_SIMD128_OP_LIST(CASE)
+ MACHINE_SIMD256_OP_LIST(CASE)
IF_WASM(SIMPLIFIED_WASM_OP_LIST, CASE) {
// SIMD operators should not be in the graph, yet.
UNREACHABLE();


@ -125,7 +125,8 @@ class Typer::Visitor : public Reducer {
SIMPLIFIED_CHANGE_OP_LIST(DECLARE_IMPOSSIBLE_CASE)
SIMPLIFIED_CHECKED_OP_LIST(DECLARE_IMPOSSIBLE_CASE)
IF_WASM(SIMPLIFIED_WASM_OP_LIST, DECLARE_IMPOSSIBLE_CASE)
- MACHINE_SIMD_OP_LIST(DECLARE_IMPOSSIBLE_CASE)
+ MACHINE_SIMD128_OP_LIST(DECLARE_IMPOSSIBLE_CASE)
+ MACHINE_SIMD256_OP_LIST(DECLARE_IMPOSSIBLE_CASE)
MACHINE_UNOP_32_LIST(DECLARE_IMPOSSIBLE_CASE)
DECLARE_IMPOSSIBLE_CASE(Word32Xor)
DECLARE_IMPOSSIBLE_CASE(Word32Sar)


@ -1947,7 +1947,8 @@ void Verifier::Visitor::Check(Node* node, const AllNodes& all) {
case IrOpcode::kTraceInstruction:
#define SIMD_MACHINE_OP_CASE(Name) case IrOpcode::k##Name:
- MACHINE_SIMD_OP_LIST(SIMD_MACHINE_OP_CASE)
+ MACHINE_SIMD128_OP_LIST(SIMD_MACHINE_OP_CASE)
+ MACHINE_SIMD256_OP_LIST(SIMD_MACHINE_OP_CASE)
#undef SIMD_MACHINE_OP_CASE
// TODO(rossberg): Check.


@ -3804,15 +3804,23 @@ void WasmGraphBuilder::StoreMem(MachineRepresentation mem_rep, Node* index,
gasm_->StoreUnaligned(UnalignedStoreRepresentation{mem_rep},
MemBuffer(capped_offset), index, val);
break;
- case MemoryAccessKind::kProtected:
-   SetSourcePosition(
-       gasm_->ProtectedStore(mem_rep, MemBuffer(capped_offset), index, val),
-       position);
-   break;
- case MemoryAccessKind::kNormal:
-   gasm_->Store(StoreRepresentation{mem_rep, kNoWriteBarrier},
-                MemBuffer(capped_offset), index, val);
-   break;
+ case MemoryAccessKind::kProtected: {
+   Node* store =
+       gasm_->ProtectedStore(mem_rep, MemBuffer(capped_offset), index, val);
+   SetSourcePosition(store, position);
+   if (mem_rep == MachineRepresentation::kSimd128) {
+     graph()->RecordSimdStore(store);
+   }
+   break;
+ }
+ case MemoryAccessKind::kNormal: {
+   Node* store = gasm_->Store(StoreRepresentation{mem_rep, kNoWriteBarrier},
+                              MemBuffer(capped_offset), index, val);
+   if (mem_rep == MachineRepresentation::kSimd128) {
+     graph()->RecordSimdStore(store);
+   }
+   break;
+ }
}
if (v8_flags.trace_wasm_memory) {
@ -8498,6 +8506,12 @@ bool BuildGraphForWasmFunction(wasm::CompilationEnv* env,
WasmGraphBuilder::kCalledFromWasm);
builder.LowerInt64(sig);
#ifdef V8_ENABLE_WASM_SIMD256_REVEC
if (v8_flags.experimental_wasm_revectorize && builder.has_simd()) {
mcgraph->graph()->SetSimd(true);
}
#endif
return true;
}


@ -1232,6 +1232,13 @@ DEFINE_BOOL(trace_wasm_gdb_remote, false, "trace Webassembly GDB-remote server")
DEFINE_DEBUG_BOOL(trace_wasm_instances, false,
"trace creation and collection of wasm instances")
// Flags for WASM SIMD256 revectorize
#ifdef V8_ENABLE_WASM_SIMD256_REVEC
DEFINE_BOOL(experimental_wasm_revectorize, false,
"enable 128 to 256 bit revectorization for Webassembly SIMD")
DEFINE_BOOL(trace_wasm_revectorize, false, "trace wasm revectorize")
#endif // V8_ENABLE_WASM_SIMD256_REVEC
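In a build where v8_enable_wasm_simd256_revec is set (and hence V8_ENABLE_WASM_SIMD256_REVEC is defined), the pass remains opt-in at runtime via --experimental-wasm-revectorize, and its decisions can be traced with --trace-wasm-revectorize.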
#endif // V8_ENABLE_WEBASSEMBLY
DEFINE_INT(stress_sampling_allocation_profiler, 0,


@ -475,6 +475,7 @@ class RuntimeCallTimer final {
V(OptimizeFinalizePipelineJob) \
V(OptimizeHeapBrokerInitialization) \
V(OptimizeNonConcurrent) \
V(OptimizeRevectorizer) \
V(OptimizeSerialization) \
V(OptimizeSerializeMetadata) \
V(ParseEval) \


@ -4,6 +4,17 @@
import("../../gni/v8.gni")
if (v8_enable_webassembly) {
# Specifies whether the target build is a simulator build. We compare target
# cpu with the v8 target cpu so that simulator builds used to create
# cross-compile snapshots are not affected.
target_is_simulator = (target_cpu != v8_target_cpu && !v8_multi_arch_build) ||
(current_cpu != v8_current_cpu && v8_multi_arch_build)
if (!target_is_simulator && v8_current_cpu == "x64") {
v8_enable_wasm_simd256_revec = true
}
}
if (is_fuchsia) {
import("//build/config/fuchsia/generate_runner_scripts.gni")
import("//third_party/fuchsia-sdk/sdk/build/component.gni")
@ -586,7 +597,10 @@ v8_source_set("unittests_sources") {
}
if (v8_enable_wasm_simd256_revec) {
sources += [ "compiler/linear-scheduler-unittest.cc" ]
sources += [
"compiler/linear-scheduler-unittest.cc",
"compiler/revec-unittest.cc",
]
}
if (v8_enable_wasm_gdb_remote_debugging) {


@ -0,0 +1,106 @@
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/codegen/machine-type.h"
#include "src/compiler/common-operator.h"
#include "src/compiler/machine-graph.h"
#include "src/compiler/machine-operator.h"
#include "src/compiler/node-properties.h"
#include "src/compiler/node.h"
#include "src/compiler/revectorizer.h"
#include "src/compiler/wasm-compiler.h"
#include "src/wasm/wasm-module.h"
#include "test/unittests/compiler/graph-unittest.h"
#include "test/unittests/compiler/node-test-utils.h"
#include "testing/gmock-support.h"
using testing::AllOf;
using testing::Capture;
using testing::CaptureEq;
namespace v8 {
namespace internal {
namespace compiler {
class RevecTest : public TestWithIsolateAndZone {
public:
RevecTest()
: TestWithIsolateAndZone(kCompressGraphZone),
graph_(zone()),
common_(zone()),
machine_(zone(), MachineRepresentation::kWord64,
MachineOperatorBuilder::Flag::kAllOptionalOps),
mcgraph_(&graph_, &common_, &machine_) {}
Graph* graph() { return &graph_; }
CommonOperatorBuilder* common() { return &common_; }
MachineOperatorBuilder* machine() { return &machine_; }
MachineGraph* mcgraph() { return &mcgraph_; }
private:
Graph graph_;
CommonOperatorBuilder common_;
MachineOperatorBuilder machine_;
MachineGraph mcgraph_;
};
// Create a graph which adds two 256-bit vectors (a, b) and stores the result
// in c:
//   simd128 *a, *b, *c;
//   *c = *a + *b;
//   *(c+1) = *(a+1) + *(b+1);
// In revectorization, the two simd128 nodes can be combined into one simd256
// node:
//   simd256 *d, *e, *f;
//   *f = *d + *e;
TEST_F(RevecTest, F32x8Add) {
Node* start = graph()->NewNode(common()->Start(5));
graph()->SetStart(start);
Node* zero = graph()->NewNode(common()->Int32Constant(0));
Node* sixteen = graph()->NewNode(common()->Int64Constant(16));
// offset of memory start field in WASM instance object.
Node* offset = graph()->NewNode(common()->Int64Constant(23));
Node* p0 = graph()->NewNode(common()->Parameter(0), start);
Node* p1 = graph()->NewNode(common()->Parameter(1), start);
Node* p2 = graph()->NewNode(common()->Parameter(2), start);
Node* p3 = graph()->NewNode(common()->Parameter(3), start);
StoreRepresentation store_rep(MachineRepresentation::kSimd128,
WriteBarrierKind::kNoWriteBarrier);
LoadRepresentation load_rep(MachineType::Simd128());
Node* load0 = graph()->NewNode(machine()->Load(MachineType::Int64()), p0,
offset, start, start);
Node* mem_buffer1 = graph()->NewNode(machine()->Int64Add(), load0, sixteen);
Node* mem_buffer2 = graph()->NewNode(machine()->Int64Add(), load0, sixteen);
Node* mem_store = graph()->NewNode(machine()->Int64Add(), load0, sixteen);
Node* load1 = graph()->NewNode(machine()->ProtectedLoad(load_rep), load0, p1,
load0, start);
Node* load2 = graph()->NewNode(machine()->ProtectedLoad(load_rep),
mem_buffer1, p1, load1, start);
Node* load3 = graph()->NewNode(machine()->ProtectedLoad(load_rep), load0, p2,
load2, start);
Node* load4 = graph()->NewNode(machine()->ProtectedLoad(load_rep),
mem_buffer2, p2, load3, start);
Node* add1 = graph()->NewNode(machine()->F32x4Add(), load1, load3);
Node* add2 = graph()->NewNode(machine()->F32x4Add(), load2, load4);
Node* store1 = graph()->NewNode(machine()->Store(store_rep), load0, p3, add1,
load4, start);
Node* store2 = graph()->NewNode(machine()->Store(store_rep), mem_store, p3,
add2, store1, start);
Node* ret = graph()->NewNode(common()->Return(0), zero, store2, start);
Node* end = graph()->NewNode(common()->End(1), ret);
graph()->SetEnd(end);
graph()->RecordSimdStore(store1);
graph()->RecordSimdStore(store2);
graph()->SetSimd(true);
// Test whether the graph can be revectorized
Revectorizer revec(zone(), graph(), mcgraph());
EXPECT_TRUE(revec.TryRevectorize(nullptr));
}
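Note that DetectCPUFeatures() gates TryRevectorize() on AVX2 support, so this expectation holds only on hosts where base::CPU reports AVX2.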
} // namespace compiler
} // namespace internal
} // namespace v8