[WASM SIMD] Replace primitive shuffles with general Shuffle.

- Removes primitive shuffle opcodes. - Adds Shuffle opcode for S32x4, S16x8, S8x16. - Adds code to ARM instruction selector to pick best opcodes for some common shuffle patterns. LOG=N BUG=v8:6020 Review-Url: https://codereview.chromium.org/2847663005 Cr-Commit-Position: refs/heads/master@{#45104}
2017-05-04 09:50:51 -07:00 · 2017-05-04 09:50:51 -07:00 · 0cd0fa3b98
commit 0cd0fa3b98
parent ec619cbd89
15 changed files with 450 additions and 491 deletions
--- a/src/api.cc
+++ b/src/api.cc
@ -2714,10 +2714,7 @@ void v8::TryCatch::SetVerbose(bool value) {
  is_verbose_ = value;
 }

-bool v8::TryCatch::IsVerbose() const {
-  return is_verbose_;
-}
-
+bool v8::TryCatch::IsVerbose() const { return is_verbose_; }

 void v8::TryCatch::SetCaptureMessage(bool value) {
  capture_message_ = value;
--- a/src/compiler/arm/instruction-selector-arm.cc
+++ b/src/compiler/arm/instruction-selector-arm.cc
@ -2414,12 +2414,6 @@ VISIT_ATOMIC_BINOP(Xor)
  V(I16x8UConvertI8x16High, kArmI16x8UConvertI8x16High) \
  V(I8x16Neg, kArmI8x16Neg)                             \
  V(S128Not, kArmS128Not)                               \
-  V(S32x2Reverse, kArmS32x2Reverse)                     \
-  V(S16x4Reverse, kArmS16x4Reverse)                     \
-  V(S16x2Reverse, kArmS16x2Reverse)                     \
-  V(S8x8Reverse, kArmS8x8Reverse)                       \
-  V(S8x4Reverse, kArmS8x4Reverse)                       \
-  V(S8x2Reverse, kArmS8x2Reverse)                       \
  V(S1x4Not, kArmS128Not)                               \
  V(S1x4AnyTrue, kArmS1x4AnyTrue)                       \
  V(S1x4AllTrue, kArmS1x4AllTrue)                       \
@ -2518,26 +2512,6 @@ VISIT_ATOMIC_BINOP(Xor)
  V(S1x16Or, kArmS128Or)                        \
  V(S1x16Xor, kArmS128Xor)

-#define SIMD_SHUFFLE_OP_LIST(V) \
-  V(S32x4ZipLeft)               \
-  V(S32x4ZipRight)              \
-  V(S32x4UnzipLeft)             \
-  V(S32x4UnzipRight)            \
-  V(S32x4TransposeLeft)         \
-  V(S32x4TransposeRight)        \
-  V(S16x8ZipLeft)               \
-  V(S16x8ZipRight)              \
-  V(S16x8UnzipLeft)             \
-  V(S16x8UnzipRight)            \
-  V(S16x8TransposeLeft)         \
-  V(S16x8TransposeRight)        \
-  V(S8x16ZipLeft)               \
-  V(S8x16ZipRight)              \
-  V(S8x16UnzipLeft)             \
-  V(S8x16UnzipRight)            \
-  V(S8x16TransposeLeft)         \
-  V(S8x16TransposeRight)
-
 #define SIMD_VISIT_SPLAT(Type)                               \
  void InstructionSelector::Visit##Type##Splat(Node* node) { \
    VisitRR(this, kArm##Type##Splat, node);                  \
@ -2595,19 +2569,178 @@ SIMD_BINOP_LIST(SIMD_VISIT_BINOP)
 SIMD_FORMAT_LIST(SIMD_VISIT_SELECT_OP)
 #undef SIMD_VISIT_SELECT_OP

-#define SIMD_VISIT_SHUFFLE_OP(Name)                   \
-  void InstructionSelector::Visit##Name(Node* node) { \
-    VisitRRRShuffle(this, kArm##Name, node);          \
-  }
-SIMD_SHUFFLE_OP_LIST(SIMD_VISIT_SHUFFLE_OP)
-#undef SIMD_VISIT_SHUFFLE_OP
+namespace {
+template <int LANES>
+struct ShuffleEntry {
+  uint8_t shuffle[LANES];
+  ArchOpcode opcode;
+};

-void InstructionSelector::VisitS8x16Concat(Node* node) {
+static const ShuffleEntry<4> arch_s32x4_shuffles[] = {
+    {{0, 4, 1, 5}, kArmS32x4ZipLeft},
+    {{2, 6, 3, 7}, kArmS32x4ZipRight},
+    {{0, 2, 4, 6}, kArmS32x4UnzipLeft},
+    {{1, 3, 5, 7}, kArmS32x4UnzipRight},
+    {{0, 4, 2, 6}, kArmS32x4TransposeLeft},
+    {{1, 5, 3, 7}, kArmS32x4TransposeRight},
+    {{1, 0, 3, 2}, kArmS32x2Reverse},
+};
+
+static const ShuffleEntry<8> arch_s16x8_shuffles[] = {
+    {{0, 8, 1, 9, 2, 10, 3, 11}, kArmS16x8ZipLeft},
+    {{4, 12, 5, 13, 6, 14, 7, 15}, kArmS16x8ZipRight},
+    {{0, 2, 4, 6, 8, 10, 12, 14}, kArmS16x8UnzipLeft},
+    {{1, 3, 5, 7, 9, 11, 13, 15}, kArmS16x8UnzipRight},
+    {{0, 8, 2, 10, 4, 12, 6, 14}, kArmS16x8TransposeLeft},
+    {{1, 9, 3, 11, 5, 13, 7, 15}, kArmS16x8TransposeRight},
+    {{3, 2, 1, 0, 7, 6, 5, 4}, kArmS16x4Reverse},
+    {{1, 0, 3, 2, 5, 4, 7, 6}, kArmS16x2Reverse},
+};
+
+static const ShuffleEntry<16> arch_s8x16_shuffles[] = {
+    {{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23},
+     kArmS8x16ZipLeft},
+    {{8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31},
+     kArmS8x16ZipRight},
+    {{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30},
+     kArmS8x16UnzipLeft},
+    {{1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31},
+     kArmS8x16UnzipRight},
+    {{0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30},
+     kArmS8x16TransposeLeft},
+    {{1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31},
+     kArmS8x16TransposeRight},
+    {{7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}, kArmS8x8Reverse},
+    {{3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}, kArmS8x4Reverse},
+    {{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}, kArmS8x2Reverse},
+};
+
+// Use a non-shuffle opcode to signal no match.
+static const ArchOpcode kNoShuffle = kArmS128Not;
+
+// Scans |table| for the first entry whose lane indices equal |shuffle| once
+// both sides are ANDed with |mask|. Returns that entry's opcode, or kNoShuffle
+// if no entry matches.
+template <int LANES>
+ArchOpcode TryMatchArchShuffle(const uint8_t* shuffle,
+                               const ShuffleEntry<LANES>* table,
+                               size_t num_entries, uint8_t mask) {
+  const ShuffleEntry<LANES>* const end = table + num_entries;
+  for (const ShuffleEntry<LANES>* candidate = table; candidate != end;
+       ++candidate) {
+    bool all_lanes_match = true;
+    for (int lane = 0; lane < LANES && all_lanes_match; ++lane) {
+      all_lanes_match =
+          (candidate->shuffle[lane] & mask) == (shuffle[lane] & mask);
+    }
+    if (all_lanes_match) return candidate->opcode;
+  }
+  return kNoShuffle;
+}
+
+// Returns the bias if shuffle is a concatenation, 0 otherwise.
+//
+// A concatenation selects LANES consecutive lanes out of the two sources laid
+// end to end; the bias is the index of the first selected lane. Indices are
+// compared after ANDing with |mask|. A bias of 0 (the identity shuffle) is
+// indistinguishable from "no match" and is reported as such.
+template <int LANES>
+uint8_t TryMatchConcat(const uint8_t* shuffle, uint8_t mask) {
+  // The first index has to be masked like all the others. For a unary shuffle
+  // |mask| drops the source-select bit, so the raw index may be >= LANES; left
+  // unmasked, that skipped the consecutive-lane check entirely and could
+  // return a bias that lies outside the vector (or reject a valid rotation).
+  uint8_t start = shuffle[0] & mask;
+  // A run that starts in the second source cannot be a concatenation.
+  if (start >= LANES) return 0;
+  int i = 1;
+  // Lanes up to the end of the first source must be consecutive.
+  for (; i < LANES - start; i++) {
+    if ((shuffle[i] & mask) != ((shuffle[i - 1] + 1) & mask)) return 0;
+  }
+  // The remaining lanes must continue from the start of the second source.
+  uint8_t wrap = LANES;
+  for (; i < LANES; i++, wrap++) {
+    if ((shuffle[i] & mask) != (wrap & mask)) return 0;
+  }
+  return start;
+}
+
+// Normalizes the shuffle |node| before pattern matching and returns the mask
+// to AND with lane indices while matching. The mask is 0xff when both inputs
+// contribute lanes. When the shuffle is effectively unary (both inputs share a
+// virtual register, or only one input is referenced) the mask is
+// num_lanes - 1, which ignores the bit that selects the source, and an
+// unreferenced input is replaced by the referenced one.
+uint8_t CanonicalizeShuffle(InstructionSelector* selector, Node* node,
+                            int num_lanes) {
+  const uint8_t* shuffle = OpParameter<uint8_t*>(node);
+  const uint8_t binary_mask = 0xff;
+  const uint8_t unary_mask = num_lanes - 1;
+  // Same register on both sides: already unary, nothing to rewrite.
+  if (selector->GetVirtualRegister(node->InputAt(0)) ==
+      selector->GetVirtualRegister(node->InputAt(1))) {
+    return unary_mask;
+  }
+  // Record which of the two sources the lane indices refer to.
+  bool reads_src0 = false;
+  bool reads_src1 = false;
+  for (int lane = 0; lane < num_lanes; ++lane) {
+    if (shuffle[lane] < num_lanes) {
+      reads_src0 = true;
+    } else {
+      reads_src1 = true;
+    }
+  }
+  // Both sources (or neither) referenced: leave the node as it is.
+  if (reads_src0 == reads_src1) return binary_mask;
+  // Exactly one source is referenced; feed it to both inputs.
+  if (reads_src0) {
+    node->ReplaceInput(1, node->InputAt(0));
+  } else {
+    node->ReplaceInput(0, node->InputAt(1));
+  }
+  return unary_mask;
+}
+
+}  // namespace
+
+void InstructionSelector::VisitS32x4Shuffle(Node* node) {
+  const uint8_t* shuffle = OpParameter<uint8_t*>(node);
+  uint8_t mask = CanonicalizeShuffle(this, node, 4);
+  ArchOpcode opcode = TryMatchArchShuffle<4>(
+      shuffle, arch_s32x4_shuffles, arraysize(arch_s32x4_shuffles), mask);
+  if (opcode != kNoShuffle) {
+    VisitRRRShuffle(this, opcode, node);
+    return;
+  }
  ArmOperandGenerator g(this);
-  int32_t imm = OpParameter<int32_t>(node);
-  Emit(kArmS8x16Concat, g.DefineAsRegister(node),
-       g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
-       g.UseImmediate(imm));
+  uint8_t lanes = TryMatchConcat<4>(shuffle, mask);
+  if (lanes != 0) {
+    Emit(kArmS8x16Concat, g.DefineAsRegister(node),
+         g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
+         g.UseImmediate(lanes * 4));
+    return;
+  }
+  // TODO(bbudge) vtbl to handle all other shuffles.
+}
+
+// Selects an instruction for an 8-lane (16-bit) shuffle. After canonicalizing
+// the node it first looks for a table pattern that has a dedicated opcode,
+// then for a concatenation, which is emitted as kArmS8x16Concat with the bias
+// converted from 16-bit lanes to bytes. A shuffle matching neither currently
+// emits no instruction at all (see the TODO below).
+void InstructionSelector::VisitS16x8Shuffle(Node* node) {
+  const uint8_t* shuffle = OpParameter<uint8_t*>(node);
+  uint8_t mask = CanonicalizeShuffle(this, node, 8);
+  ArchOpcode opcode = TryMatchArchShuffle<8>(
+      shuffle, arch_s16x8_shuffles, arraysize(arch_s16x8_shuffles), mask);
+  if (opcode != kNoShuffle) {
+    VisitRRRShuffle(this, opcode, node);
+    return;
+  }
+  ArmOperandGenerator g(this);
+  uint8_t lanes = TryMatchConcat<8>(shuffle, mask);
+  if (lanes != 0) {
+    Emit(kArmS8x16Concat, g.DefineAsRegister(node),
+         g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
+         g.UseImmediate(lanes * 2));
+    // Return explicitly, as VisitS32x4Shuffle does, so that the general
+    // fallback can later be added below without also running for this case.
+    return;
+  }
+  // TODO(bbudge) vtbl to handle all other shuffles.
+}
+
+// Selects an instruction for a 16-lane (8-bit) shuffle. After canonicalizing
+// the node it first looks for a table pattern that has a dedicated opcode,
+// then for a concatenation, which is emitted as kArmS8x16Concat with the bias
+// used directly as the byte count. A shuffle matching neither currently emits
+// no instruction at all (see the TODO below).
+void InstructionSelector::VisitS8x16Shuffle(Node* node) {
+  const uint8_t* shuffle = OpParameter<uint8_t*>(node);
+  uint8_t mask = CanonicalizeShuffle(this, node, 16);
+  ArchOpcode opcode = TryMatchArchShuffle<16>(
+      shuffle, arch_s8x16_shuffles, arraysize(arch_s8x16_shuffles), mask);
+  if (opcode != kNoShuffle) {
+    VisitRRRShuffle(this, opcode, node);
+    return;
+  }
+  ArmOperandGenerator g(this);
+  uint8_t lanes = TryMatchConcat<16>(shuffle, mask);
+  if (lanes != 0) {
+    Emit(kArmS8x16Concat, g.DefineAsRegister(node),
+         g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
+         g.UseImmediate(lanes));
+    // Return explicitly, as VisitS32x4Shuffle does, so that the general
+    // fallback can later be added below without also running for this case.
+    return;
+  }
+  // TODO(bbudge) vtbl to handle all other shuffles.
 }

 void InstructionSelector::VisitInt32AbsWithOverflow(Node* node) {
--- a/src/compiler/instruction-selector.cc
+++ b/src/compiler/instruction-selector.cc
@ -1705,62 +1705,18 @@ void InstructionSelector::VisitNode(Node* node) {
      return MarkAsSimd128(node), VisitS128Xor(node);
    case IrOpcode::kS128Not:
      return MarkAsSimd128(node), VisitS128Not(node);
-    case IrOpcode::kS32x4ZipLeft:
-      return MarkAsSimd128(node), VisitS32x4ZipLeft(node);
-    case IrOpcode::kS32x4ZipRight:
-      return MarkAsSimd128(node), VisitS32x4ZipRight(node);
-    case IrOpcode::kS32x4UnzipLeft:
-      return MarkAsSimd128(node), VisitS32x4UnzipLeft(node);
-    case IrOpcode::kS32x4UnzipRight:
-      return MarkAsSimd128(node), VisitS32x4UnzipRight(node);
-    case IrOpcode::kS32x4TransposeLeft:
-      return MarkAsSimd128(node), VisitS32x4TransposeLeft(node);
-    case IrOpcode::kS32x4TransposeRight:
-      return MarkAsSimd128(node), VisitS32x4TransposeRight(node);
+    case IrOpcode::kS32x4Shuffle:
+      return MarkAsSimd128(node), VisitS32x4Shuffle(node);
    case IrOpcode::kS32x4Select:
      return MarkAsSimd128(node), VisitS32x4Select(node);
-    case IrOpcode::kS16x8ZipLeft:
-      return MarkAsSimd128(node), VisitS16x8ZipLeft(node);
-    case IrOpcode::kS16x8ZipRight:
-      return MarkAsSimd128(node), VisitS16x8ZipRight(node);
-    case IrOpcode::kS16x8UnzipLeft:
-      return MarkAsSimd128(node), VisitS16x8UnzipLeft(node);
-    case IrOpcode::kS16x8UnzipRight:
-      return MarkAsSimd128(node), VisitS16x8UnzipRight(node);
-    case IrOpcode::kS16x8TransposeLeft:
-      return MarkAsSimd128(node), VisitS16x8TransposeLeft(node);
-    case IrOpcode::kS16x8TransposeRight:
-      return MarkAsSimd128(node), VisitS16x8TransposeRight(node);
+    case IrOpcode::kS16x8Shuffle:
+      return MarkAsSimd128(node), VisitS16x8Shuffle(node);
    case IrOpcode::kS16x8Select:
      return MarkAsSimd128(node), VisitS16x8Select(node);
-    case IrOpcode::kS8x16ZipLeft:
-      return MarkAsSimd128(node), VisitS8x16ZipLeft(node);
-    case IrOpcode::kS8x16ZipRight:
-      return MarkAsSimd128(node), VisitS8x16ZipRight(node);
-    case IrOpcode::kS8x16UnzipLeft:
-      return MarkAsSimd128(node), VisitS8x16UnzipLeft(node);
-    case IrOpcode::kS8x16UnzipRight:
-      return MarkAsSimd128(node), VisitS8x16UnzipRight(node);
-    case IrOpcode::kS8x16TransposeLeft:
-      return MarkAsSimd128(node), VisitS8x16TransposeLeft(node);
-    case IrOpcode::kS8x16TransposeRight:
-      return MarkAsSimd128(node), VisitS8x16TransposeRight(node);
+    case IrOpcode::kS8x16Shuffle:
+      return MarkAsSimd128(node), VisitS8x16Shuffle(node);
    case IrOpcode::kS8x16Select:
      return MarkAsSimd128(node), VisitS8x16Select(node);
-    case IrOpcode::kS8x16Concat:
-      return MarkAsSimd128(node), VisitS8x16Concat(node);
-    case IrOpcode::kS32x2Reverse:
-      return MarkAsSimd128(node), VisitS32x2Reverse(node);
-    case IrOpcode::kS16x4Reverse:
-      return MarkAsSimd128(node), VisitS16x4Reverse(node);
-    case IrOpcode::kS16x2Reverse:
-      return MarkAsSimd128(node), VisitS16x2Reverse(node);
-    case IrOpcode::kS8x8Reverse:
-      return MarkAsSimd128(node), VisitS8x8Reverse(node);
-    case IrOpcode::kS8x4Reverse:
-      return MarkAsSimd128(node), VisitS8x4Reverse(node);
-    case IrOpcode::kS8x2Reverse:
-      return MarkAsSimd128(node), VisitS8x2Reverse(node);
    case IrOpcode::kS1x4Zero:
      return MarkAsSimd1x4(node), VisitS1x4Zero(node);
    case IrOpcode::kS1x4And:
@ -2464,37 +2420,10 @@ void InstructionSelector::VisitS32x4Select(Node* node) { UNIMPLEMENTED(); }
        // !V8_TARGET_ARCH_MIPS64

 #if !V8_TARGET_ARCH_ARM
-void InstructionSelector::VisitS32x4ZipLeft(Node* node) { UNIMPLEMENTED(); }
+void InstructionSelector::VisitS32x4Shuffle(Node* node) { UNIMPLEMENTED(); }

-void InstructionSelector::VisitS32x4ZipRight(Node* node) { UNIMPLEMENTED(); }
+void InstructionSelector::VisitS16x8Shuffle(Node* node) { UNIMPLEMENTED(); }

-void InstructionSelector::VisitS32x4UnzipLeft(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS32x4UnzipRight(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS32x4TransposeLeft(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitS32x4TransposeRight(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitS16x8ZipLeft(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS16x8ZipRight(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS16x8UnzipLeft(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS16x8UnzipRight(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS16x8TransposeLeft(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitS16x8TransposeRight(Node* node) {
-  UNIMPLEMENTED();
-}
 #endif  // !V8_TARGET_ARCH_ARM

 #if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM
@ -2502,21 +2431,8 @@ void InstructionSelector::VisitS16x8Select(Node* node) { UNIMPLEMENTED(); }
 #endif  // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM

 #if !V8_TARGET_ARCH_ARM
-void InstructionSelector::VisitS8x16ZipLeft(Node* node) { UNIMPLEMENTED(); }
+void InstructionSelector::VisitS8x16Shuffle(Node* node) { UNIMPLEMENTED(); }

-void InstructionSelector::VisitS8x16ZipRight(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS8x16UnzipLeft(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS8x16UnzipRight(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS8x16TransposeLeft(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitS8x16TransposeRight(Node* node) {
-  UNIMPLEMENTED();
-}
 #endif  // !V8_TARGET_ARCH_ARM

 #if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM
@ -2524,20 +2440,6 @@ void InstructionSelector::VisitS8x16Select(Node* node) { UNIMPLEMENTED(); }
 #endif  // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM

 #if !V8_TARGET_ARCH_ARM
-void InstructionSelector::VisitS8x16Concat(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS32x2Reverse(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS16x4Reverse(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS16x2Reverse(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS8x8Reverse(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS8x4Reverse(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS8x2Reverse(Node* node) { UNIMPLEMENTED(); }
-
 void InstructionSelector::VisitS1x4And(Node* node) { UNIMPLEMENTED(); }

 void InstructionSelector::VisitS1x4Or(Node* node) { UNIMPLEMENTED(); }
--- a/src/compiler/machine-operator.cc
+++ b/src/compiler/machine-operator.cc
@ -317,33 +317,9 @@ MachineType AtomicOpRepresentationOf(Operator const* op) {
  V(S128Or, Operator::kAssociative | Operator::kCommutative, 2, 0, 1)     \
  V(S128Xor, Operator::kAssociative | Operator::kCommutative, 2, 0, 1)    \
  V(S128Not, Operator::kNoProperties, 1, 0, 1)                            \
-  V(S32x4ZipLeft, Operator::kNoProperties, 2, 0, 1)                       \
-  V(S32x4ZipRight, Operator::kNoProperties, 2, 0, 1)                      \
-  V(S32x4UnzipLeft, Operator::kNoProperties, 2, 0, 1)                     \
-  V(S32x4UnzipRight, Operator::kNoProperties, 2, 0, 1)                    \
-  V(S32x4TransposeLeft, Operator::kNoProperties, 2, 0, 1)                 \
-  V(S32x4TransposeRight, Operator::kNoProperties, 2, 0, 1)                \
  V(S32x4Select, Operator::kNoProperties, 3, 0, 1)                        \
-  V(S16x8ZipLeft, Operator::kNoProperties, 2, 0, 1)                       \
-  V(S16x8ZipRight, Operator::kNoProperties, 2, 0, 1)                      \
-  V(S16x8UnzipLeft, Operator::kNoProperties, 2, 0, 1)                     \
-  V(S16x8UnzipRight, Operator::kNoProperties, 2, 0, 1)                    \
-  V(S16x8TransposeLeft, Operator::kNoProperties, 2, 0, 1)                 \
-  V(S16x8TransposeRight, Operator::kNoProperties, 2, 0, 1)                \
  V(S16x8Select, Operator::kNoProperties, 3, 0, 1)                        \
-  V(S8x16ZipLeft, Operator::kNoProperties, 2, 0, 1)                       \
-  V(S8x16ZipRight, Operator::kNoProperties, 2, 0, 1)                      \
-  V(S8x16UnzipLeft, Operator::kNoProperties, 2, 0, 1)                     \
-  V(S8x16UnzipRight, Operator::kNoProperties, 2, 0, 1)                    \
-  V(S8x16TransposeLeft, Operator::kNoProperties, 2, 0, 1)                 \
-  V(S8x16TransposeRight, Operator::kNoProperties, 2, 0, 1)                \
  V(S8x16Select, Operator::kNoProperties, 3, 0, 1)                        \
-  V(S32x2Reverse, Operator::kNoProperties, 1, 0, 1)                       \
-  V(S16x4Reverse, Operator::kNoProperties, 1, 0, 1)                       \
-  V(S16x2Reverse, Operator::kNoProperties, 1, 0, 1)                       \
-  V(S8x8Reverse, Operator::kNoProperties, 1, 0, 1)                        \
-  V(S8x4Reverse, Operator::kNoProperties, 1, 0, 1)                        \
-  V(S8x2Reverse, Operator::kNoProperties, 1, 0, 1)                        \
  V(S1x4Zero, Operator::kNoProperties, 0, 0, 1)                           \
  V(S1x4And, Operator::kAssociative | Operator::kCommutative, 2, 0, 1)    \
  V(S1x4Or, Operator::kAssociative | Operator::kCommutative, 2, 0, 1)     \
@ -1028,10 +1004,28 @@ SIMD_LANE_OP_LIST(SIMD_LANE_OPS)
 SIMD_FORMAT_LIST(SIMD_SHIFT_OPS)
 #undef SIMD_SHIFT_OPS

-const Operator* MachineOperatorBuilder::S8x16Concat(int32_t bytes) {
-  DCHECK(0 <= bytes && bytes < kSimd128Size);
-  return new (zone_) Operator1<int32_t>(IrOpcode::kS8x16Concat, Operator::kPure,
-                                        "Concat", 2, 0, 0, 1, 0, 0, bytes);
+const Operator* MachineOperatorBuilder::S32x4Shuffle(uint8_t shuffle[16]) {
+  uint8_t* array = zone_->NewArray<uint8_t>(4);
+  memcpy(array, shuffle, 4);
+  return new (zone_)
+      Operator1<uint8_t*>(IrOpcode::kS32x4Shuffle, Operator::kPure, "Shuffle",
+                          2, 0, 0, 1, 0, 0, array);
+}
+
+const Operator* MachineOperatorBuilder::S16x8Shuffle(uint8_t shuffle[16]) {
+  uint8_t* array = zone_->NewArray<uint8_t>(8);
+  memcpy(array, shuffle, 8);
+  return new (zone_)
+      Operator1<uint8_t*>(IrOpcode::kS16x8Shuffle, Operator::kPure, "Shuffle",
+                          2, 0, 0, 1, 0, 0, array);
+}
+
+const Operator* MachineOperatorBuilder::S8x16Shuffle(uint8_t shuffle[16]) {
+  uint8_t* array = zone_->NewArray<uint8_t>(16);
+  memcpy(array, shuffle, 16);
+  return new (zone_)
+      Operator1<uint8_t*>(IrOpcode::kS8x16Shuffle, Operator::kPure, "Shuffle",
+                          2, 0, 0, 1, 0, 0, array);
 }

 }  // namespace compiler
--- a/src/compiler/machine-operator.h
+++ b/src/compiler/machine-operator.h
@ -554,35 +554,12 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
  const Operator* S128Xor();
  const Operator* S128Not();

-  const Operator* S32x4ZipLeft();
-  const Operator* S32x4ZipRight();
-  const Operator* S32x4UnzipLeft();
-  const Operator* S32x4UnzipRight();
-  const Operator* S32x4TransposeLeft();
-  const Operator* S32x4TransposeRight();
+  const Operator* S32x4Shuffle(uint8_t shuffle[16]);
  const Operator* S32x4Select();
-  const Operator* S16x8ZipLeft();
-  const Operator* S16x8ZipRight();
-  const Operator* S16x8UnzipLeft();
-  const Operator* S16x8UnzipRight();
-  const Operator* S16x8TransposeLeft();
-  const Operator* S16x8TransposeRight();
+  const Operator* S16x8Shuffle(uint8_t shuffle[16]);
  const Operator* S16x8Select();
-  const Operator* S8x16ZipLeft();
-  const Operator* S8x16ZipRight();
-  const Operator* S8x16UnzipLeft();
-  const Operator* S8x16UnzipRight();
-  const Operator* S8x16TransposeLeft();
-  const Operator* S8x16TransposeRight();
+  const Operator* S8x16Shuffle(uint8_t shuffle[16]);
  const Operator* S8x16Select();
-  const Operator* S8x16Concat(int32_t);
-
-  const Operator* S32x2Reverse();
-  const Operator* S16x4Reverse();
-  const Operator* S16x2Reverse();
-  const Operator* S8x8Reverse();
-  const Operator* S8x4Reverse();
-  const Operator* S8x2Reverse();

  const Operator* S1x4Zero();
  const Operator* S1x4And();
--- a/src/compiler/opcodes.h
+++ b/src/compiler/opcodes.h
@ -691,34 +691,12 @@
  V(S128And)                    \
  V(S128Or)                     \
  V(S128Xor)                    \
-  V(S32x4ZipLeft)               \
-  V(S32x4ZipRight)              \
-  V(S32x4UnzipLeft)             \
-  V(S32x4UnzipRight)            \
-  V(S32x4TransposeLeft)         \
-  V(S32x4TransposeRight)        \
+  V(S32x4Shuffle)               \
  V(S32x4Select)                \
-  V(S16x8ZipLeft)               \
-  V(S16x8ZipRight)              \
-  V(S16x8UnzipLeft)             \
-  V(S16x8UnzipRight)            \
-  V(S16x8TransposeLeft)         \
-  V(S16x8TransposeRight)        \
+  V(S16x8Shuffle)               \
  V(S16x8Select)                \
-  V(S8x16ZipLeft)               \
-  V(S8x16ZipRight)              \
-  V(S8x16UnzipLeft)             \
-  V(S8x16UnzipRight)            \
-  V(S8x16TransposeLeft)         \
-  V(S8x16TransposeRight)        \
+  V(S8x16Shuffle)               \
  V(S8x16Select)                \
-  V(S8x16Concat)                \
-  V(S32x2Reverse)               \
-  V(S16x4Reverse)               \
-  V(S16x2Reverse)               \
-  V(S8x8Reverse)                \
-  V(S8x4Reverse)                \
-  V(S8x2Reverse)                \
  V(S1x4Zero)                   \
  V(S1x4And)                    \
  V(S1x4Or)                     \
--- a/src/compiler/wasm-compiler.cc
+++ b/src/compiler/wasm-compiler.cc
@ -3478,81 +3478,15 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode,
                              inputs[1]);
    case wasm::kExprS128Not:
      return graph()->NewNode(jsgraph()->machine()->S128Not(), inputs[0]);
-    case wasm::kExprS32x4ZipLeft:
-      return graph()->NewNode(jsgraph()->machine()->S32x4ZipLeft(), inputs[0],
-                              inputs[1]);
-    case wasm::kExprS32x4ZipRight:
-      return graph()->NewNode(jsgraph()->machine()->S32x4ZipRight(), inputs[0],
-                              inputs[1]);
-    case wasm::kExprS32x4UnzipLeft:
-      return graph()->NewNode(jsgraph()->machine()->S32x4UnzipLeft(), inputs[0],
-                              inputs[1]);
-    case wasm::kExprS32x4UnzipRight:
-      return graph()->NewNode(jsgraph()->machine()->S32x4UnzipRight(),
-                              inputs[0], inputs[1]);
-    case wasm::kExprS32x4TransposeLeft:
-      return graph()->NewNode(jsgraph()->machine()->S32x4TransposeLeft(),
-                              inputs[0], inputs[1]);
-    case wasm::kExprS32x4TransposeRight:
-      return graph()->NewNode(jsgraph()->machine()->S32x4TransposeRight(),
-                              inputs[0], inputs[1]);
    case wasm::kExprS32x4Select:
      return graph()->NewNode(jsgraph()->machine()->S32x4Select(), inputs[0],
                              inputs[1], inputs[2]);
-    case wasm::kExprS16x8ZipLeft:
-      return graph()->NewNode(jsgraph()->machine()->S16x8ZipLeft(), inputs[0],
-                              inputs[1]);
-    case wasm::kExprS16x8ZipRight:
-      return graph()->NewNode(jsgraph()->machine()->S16x8ZipRight(), inputs[0],
-                              inputs[1]);
-    case wasm::kExprS16x8UnzipLeft:
-      return graph()->NewNode(jsgraph()->machine()->S16x8UnzipLeft(), inputs[0],
-                              inputs[1]);
-    case wasm::kExprS16x8UnzipRight:
-      return graph()->NewNode(jsgraph()->machine()->S16x8UnzipRight(),
-                              inputs[0], inputs[1]);
-    case wasm::kExprS16x8TransposeLeft:
-      return graph()->NewNode(jsgraph()->machine()->S16x8TransposeLeft(),
-                              inputs[0], inputs[1]);
-    case wasm::kExprS16x8TransposeRight:
-      return graph()->NewNode(jsgraph()->machine()->S16x8TransposeRight(),
-                              inputs[0], inputs[1]);
    case wasm::kExprS16x8Select:
      return graph()->NewNode(jsgraph()->machine()->S16x8Select(), inputs[0],
                              inputs[1], inputs[2]);
-    case wasm::kExprS8x16ZipLeft:
-      return graph()->NewNode(jsgraph()->machine()->S8x16ZipLeft(), inputs[0],
-                              inputs[1]);
-    case wasm::kExprS8x16ZipRight:
-      return graph()->NewNode(jsgraph()->machine()->S8x16ZipRight(), inputs[0],
-                              inputs[1]);
-    case wasm::kExprS8x16UnzipLeft:
-      return graph()->NewNode(jsgraph()->machine()->S8x16UnzipLeft(), inputs[0],
-                              inputs[1]);
-    case wasm::kExprS8x16UnzipRight:
-      return graph()->NewNode(jsgraph()->machine()->S8x16UnzipRight(),
-                              inputs[0], inputs[1]);
-    case wasm::kExprS8x16TransposeLeft:
-      return graph()->NewNode(jsgraph()->machine()->S8x16TransposeLeft(),
-                              inputs[0], inputs[1]);
-    case wasm::kExprS8x16TransposeRight:
-      return graph()->NewNode(jsgraph()->machine()->S8x16TransposeRight(),
-                              inputs[0], inputs[1]);
    case wasm::kExprS8x16Select:
      return graph()->NewNode(jsgraph()->machine()->S8x16Select(), inputs[0],
                              inputs[1], inputs[2]);
-    case wasm::kExprS32x2Reverse:
-      return graph()->NewNode(jsgraph()->machine()->S32x2Reverse(), inputs[0]);
-    case wasm::kExprS16x4Reverse:
-      return graph()->NewNode(jsgraph()->machine()->S16x4Reverse(), inputs[0]);
-    case wasm::kExprS16x2Reverse:
-      return graph()->NewNode(jsgraph()->machine()->S16x2Reverse(), inputs[0]);
-    case wasm::kExprS8x8Reverse:
-      return graph()->NewNode(jsgraph()->machine()->S8x8Reverse(), inputs[0]);
-    case wasm::kExprS8x4Reverse:
-      return graph()->NewNode(jsgraph()->machine()->S8x4Reverse(), inputs[0]);
-    case wasm::kExprS8x2Reverse:
-      return graph()->NewNode(jsgraph()->machine()->S8x2Reverse(), inputs[0]);
    case wasm::kExprS1x4And:
      return graph()->NewNode(jsgraph()->machine()->S1x4And(), inputs[0],
                              inputs[1]);
@ -3669,10 +3603,23 @@ Node* WasmGraphBuilder::SimdShiftOp(wasm::WasmOpcode opcode, uint8_t shift,
  }
 }

-Node* WasmGraphBuilder::SimdConcatOp(uint8_t bytes, const NodeVector& inputs) {
+Node* WasmGraphBuilder::SimdShuffleOp(uint8_t shuffle[16], unsigned lanes,
+                                      const NodeVector& inputs) {
  has_simd_ = true;
-  return graph()->NewNode(jsgraph()->machine()->S8x16Concat(bytes), inputs[0],
-                          inputs[1]);
+  switch (lanes) {
+    case 4:
+      return graph()->NewNode(jsgraph()->machine()->S32x4Shuffle(shuffle),
+                              inputs[0], inputs[1]);
+    case 8:
+      return graph()->NewNode(jsgraph()->machine()->S16x8Shuffle(shuffle),
+                              inputs[0], inputs[1]);
+    case 16:
+      return graph()->NewNode(jsgraph()->machine()->S8x16Shuffle(shuffle),
+                              inputs[0], inputs[1]);
+    default:
+      UNREACHABLE();
+      return nullptr;
+  }
 }

 static void RecordFunctionCompilation(CodeEventListener::LogEventsAndTags tag,
--- a/src/compiler/wasm-compiler.h
+++ b/src/compiler/wasm-compiler.h
@ -252,7 +252,8 @@ class WasmGraphBuilder {
  Node* SimdShiftOp(wasm::WasmOpcode opcode, uint8_t shift,
                    const NodeVector& inputs);

-  Node* SimdConcatOp(uint8_t bytes, const NodeVector& inputs);
+  Node* SimdShuffleOp(uint8_t shuffle[16], unsigned lanes,
+                      const NodeVector& inputs);

  bool has_simd() const { return has_simd_; }

--- a/src/wasm/function-body-decoder-impl.h
+++ b/src/wasm/function-body-decoder-impl.h
@ -322,15 +322,17 @@ struct SimdShiftOperand {
  }
 };

-// Operand for SIMD concatenation operations.
+// Operand for SIMD shuffle operations.
 template <bool checked>
-struct SimdConcatOperand {
-  uint8_t bytes;
-  unsigned length;
+struct SimdShuffleOperand {
+  uint8_t shuffle[16];
+  unsigned lanes;

-  inline SimdConcatOperand(Decoder* decoder, const byte* pc) {
-    bytes = decoder->read_u8<checked>(pc + 2, "bytes");
-    length = 1;
+  inline SimdShuffleOperand(Decoder* decoder, const byte* pc, unsigned lanes_) {
+    lanes = lanes_;
+    for (unsigned i = 0; i < lanes; i++) {
+      shuffle[i] = decoder->read_u8<checked>(pc + 2 + i, "shuffle");
+    }
  }
 };

--- a/src/wasm/function-body-decoder.cc
+++ b/src/wasm/function-body-decoder.cc
@ -146,6 +146,22 @@ struct Control {
  }
 };

+namespace {
+// Maps a SIMD shuffle opcode to the number of lane-index bytes that make up
+// its shuffle mask: 4, 8 or 16. Any other opcode hits UNREACHABLE().
+inline unsigned GetShuffleMaskSize(WasmOpcode opcode) {
+  if (opcode == kExprS32x4Shuffle) return 4;
+  if (opcode == kExprS16x8Shuffle) return 8;
+  if (opcode == kExprS8x16Shuffle) return 16;
+  UNREACHABLE();
+  return 0;
+}
+}  // namespace
+
 // Macros that build nodes only if there is a graph and the current SSA
 // environment is reachable from start. This avoids problems with malformed
 // TF graphs when decoding inputs that have unreachable code.
@ -412,10 +428,13 @@ class WasmDecoder : public Decoder {
  }

  inline bool Validate(const byte* pc, WasmOpcode opcode,
-                       SimdConcatOperand<true>& operand) {
-    DCHECK_EQ(wasm::kExprS8x16Concat, opcode);
-    if (operand.bytes <= 0 || operand.bytes >= kSimd128Size) {
-      error(pc_ + 2, "invalid byte amount");
+                       SimdShuffleOperand<true>& operand) {
+    // A shuffle mask is valid when it has the lane count implied by the opcode
+    // and every index names a lane of one of the two sources.
+    unsigned lanes = GetShuffleMaskSize(opcode);
+    uint8_t max_lane = 0;
+    for (unsigned i = 0; i < lanes; i++)
+      max_lane = std::max(max_lane, operand.shuffle[i]);
+    // Valid indices are 0 .. 2 * lanes - 1, so 2 * lanes itself must be
+    // rejected as well (hence >=, not >).
+    if (operand.lanes != lanes || max_lane >= 2 * lanes) {
+      error(pc_ + 2, "invalid shuffle mask");
      return false;
    } else {
      return true;
@ -423,7 +442,8 @@ class WasmDecoder : public Decoder {
  }

  static unsigned OpcodeLength(Decoder* decoder, const byte* pc) {
-    switch (static_cast<byte>(*pc)) {
+    WasmOpcode opcode = static_cast<WasmOpcode>(*pc);
+    switch (opcode) {
 #define DECLARE_OPCODE_CASE(name, opcode, sig) case kExpr##name:
      FOREACH_LOAD_MEM_OPCODE(DECLARE_OPCODE_CASE)
      FOREACH_STORE_MEM_OPCODE(DECLARE_OPCODE_CASE)
@ -506,6 +526,11 @@ class WasmDecoder : public Decoder {
          {
            return 3;
          }
+          // Shuffles contain a byte array to determine the shuffle.
+          case kExprS32x4Shuffle:
+          case kExprS16x8Shuffle:
+          case kExprS8x16Shuffle:
+            return 2 + GetShuffleMaskSize(opcode);
          default:
            decoder->error(pc, "invalid SIMD opcode");
            return 2;
@ -1548,17 +1573,17 @@ class WasmFullDecoder : public WasmDecoder {
    return operand.length;
  }

-  unsigned SimdConcatOp(WasmOpcode opcode) {
-    DCHECK_EQ(wasm::kExprS8x16Concat, opcode);
-    SimdConcatOperand<true> operand(this, pc_);
+  // Decodes the mask operand of a SIMD shuffle at pc_. When the mask
+  // validates, pops the two S128 inputs (second input first), builds the
+  // shuffle node from the mask and pushes the S128 result. Returns
+  // operand.lanes whether or not validation succeeded.
+  unsigned SimdShuffleOp(WasmOpcode opcode) {
+    SimdShuffleOperand<true> operand(this, pc_, GetShuffleMaskSize(opcode));
    if (Validate(pc_, opcode, operand)) {
      compiler::NodeVector inputs(2, zone_);
      inputs[1] = Pop(1, ValueType::kSimd128).node;
      inputs[0] = Pop(0, ValueType::kSimd128).node;
-      TFNode* node = BUILD(SimdConcatOp, operand.bytes, inputs);
+      TFNode* node =
+          BUILD(SimdShuffleOp, operand.shuffle, operand.lanes, inputs);
      Push(ValueType::kSimd128, node);
    }
-    return operand.length;
+    // NOTE(review): presumably one mask byte per lane, making this the operand
+    // length that follows the opcode bytes -- confirm against
+    // SimdShuffleOperand.
+    return operand.lanes;
  }

  unsigned DecodeSimdOpcode(WasmOpcode opcode) {
@ -1596,8 +1621,10 @@ class WasmFullDecoder : public WasmDecoder {
        len = SimdShiftOp(opcode);
        break;
      }
-      case kExprS8x16Concat: {
-        len = SimdConcatOp(opcode);
+      case kExprS32x4Shuffle:
+      case kExprS16x8Shuffle:
+      case kExprS8x16Shuffle: {
+        len = SimdShuffleOp(opcode);
        break;
      }
      default: {
--- a/src/wasm/wasm-opcodes.cc
+++ b/src/wasm/wasm-opcodes.cc
@ -214,34 +214,12 @@ const char* WasmOpcodes::OpcodeName(WasmOpcode opcode) {
    CASE_S128_OP(Or, "or")
    CASE_S128_OP(Xor, "xor")
    CASE_S128_OP(Not, "not")
-    CASE_S32x4_OP(ZipLeft, "zip left")
-    CASE_S32x4_OP(ZipRight, "zip right")
-    CASE_S32x4_OP(UnzipLeft, "unzip left")
-    CASE_S32x4_OP(UnzipRight, "unzip right")
-    CASE_S32x4_OP(TransposeLeft, "transpose left")
-    CASE_S32x4_OP(TransposeRight, "transpose right")
+    CASE_S32x4_OP(Shuffle, "shuffle")
    CASE_S32x4_OP(Select, "select")
-    CASE_S16x8_OP(ZipLeft, "zip left")
-    CASE_S16x8_OP(ZipRight, "zip right")
-    CASE_S16x8_OP(UnzipLeft, "unzip left")
-    CASE_S16x8_OP(UnzipRight, "unzip right")
-    CASE_S16x8_OP(TransposeLeft, "transpose left")
-    CASE_S16x8_OP(TransposeRight, "transpose right")
+    CASE_S16x8_OP(Shuffle, "shuffle")
    CASE_S16x8_OP(Select, "select")
-    CASE_S8x16_OP(ZipLeft, "zip left")
-    CASE_S8x16_OP(ZipRight, "zip right")
-    CASE_S8x16_OP(UnzipLeft, "unzip left")
-    CASE_S8x16_OP(UnzipRight, "unzip right")
-    CASE_S8x16_OP(TransposeLeft, "transpose left")
-    CASE_S8x16_OP(TransposeRight, "transpose right")
+    CASE_S8x16_OP(Shuffle, "shuffle")
    CASE_S8x16_OP(Select, "select")
-    CASE_S8x16_OP(Concat, "concat")
-    CASE_OP(S32x2Reverse, "32x2 reverse")
-    CASE_OP(S16x4Reverse, "16x4 reverse")
-    CASE_OP(S16x2Reverse, "16x2 reverse")
-    CASE_OP(S8x8Reverse, "8x8 reverse")
-    CASE_OP(S8x4Reverse, "8x4 reverse")
-    CASE_OP(S8x2Reverse, "8x2 reverse")
    CASE_S1x4_OP(And, "and")
    CASE_S1x4_OP(Or, "or")
    CASE_S1x4_OP(Xor, "xor")
--- a/src/wasm/wasm-opcodes.h
+++ b/src/wasm/wasm-opcodes.h
@ -387,33 +387,9 @@ constexpr WasmCodePosition kNoCodePosition = -1;
  V(S128Or, 0xe577, s_ss)                \
  V(S128Xor, 0xe578, s_ss)               \
  V(S128Not, 0xe579, s_s)                \
-  V(S32x4ZipLeft, 0xe5a0, s_ss)          \
-  V(S32x4ZipRight, 0xe5a1, s_ss)         \
-  V(S32x4UnzipLeft, 0xe5a2, s_ss)        \
-  V(S32x4UnzipRight, 0xe5a3, s_ss)       \
-  V(S32x4TransposeLeft, 0xe5a4, s_ss)    \
-  V(S32x4TransposeRight, 0xe5a5, s_ss)   \
  V(S32x4Select, 0xe52c, s_s1x4ss)       \
-  V(S16x8ZipLeft, 0xe5a6, s_ss)          \
-  V(S16x8ZipRight, 0xe5a7, s_ss)         \
-  V(S16x8UnzipLeft, 0xe5a8, s_ss)        \
-  V(S16x8UnzipRight, 0xe5a9, s_ss)       \
-  V(S16x8TransposeLeft, 0xe5aa, s_ss)    \
-  V(S16x8TransposeRight, 0xe5ab, s_ss)   \
  V(S16x8Select, 0xe54b, s_s1x8ss)       \
-  V(S8x16ZipLeft, 0xe5ac, s_ss)          \
-  V(S8x16ZipRight, 0xe5ad, s_ss)         \
-  V(S8x16UnzipLeft, 0xe5ae, s_ss)        \
-  V(S8x16UnzipRight, 0xe5af, s_ss)       \
-  V(S8x16TransposeLeft, 0xe5b0, s_ss)    \
-  V(S8x16TransposeRight, 0xe5b1, s_ss)   \
  V(S8x16Select, 0xe56a, s_s1x16ss)      \
-  V(S32x2Reverse, 0xe5b2, s_s)           \
-  V(S16x4Reverse, 0xe5b3, s_s)           \
-  V(S16x2Reverse, 0xe5b4, s_s)           \
-  V(S8x8Reverse, 0xe5b5, s_s)            \
-  V(S8x4Reverse, 0xe5b6, s_s)            \
-  V(S8x2Reverse, 0xe5b7, s_s)            \
  V(S1x4And, 0xe580, s1x4_s1x4s1x4)      \
  V(S1x4Or, 0xe581, s1x4_s1x4s1x4)       \
  V(S1x4Xor, 0xe582, s1x4_s1x4s1x4)      \
@ -450,8 +426,12 @@ constexpr WasmCodePosition kNoCodePosition = -1;
  V(I8x16ReplaceLane, 0xe559, _)         \
  V(I8x16Shl, 0xe562, _)                 \
  V(I8x16ShrS, 0xe563, _)                \
-  V(I8x16ShrU, 0xe571, _)                \
-  V(S8x16Concat, 0xe5b8, _)
+  V(I8x16ShrU, 0xe571, _)
+
+// SIMD shuffle opcodes. The decoder treats each of these as being followed
+// by a byte array (the shuffle mask) whose size depends on the lane count.
+#define FOREACH_SIMD_MASK_OPERAND_OPCODE(V) \
+  V(S32x4Shuffle, 0xe52d, s_ss)             \
+  V(S16x8Shuffle, 0xe54c, s_ss)             \
+  V(S8x16Shuffle, 0xe56b, s_ss)

 #define FOREACH_ATOMIC_OPCODE(V)               \
  V(I32AtomicAdd8S, 0xe601, i_ii)              \
@ -491,16 +471,17 @@ constexpr WasmCodePosition kNoCodePosition = -1;
  V(I32AtomicXor, 0xe623, i_ii)

 // All opcodes.
-#define FOREACH_OPCODE(V)          \
-  FOREACH_CONTROL_OPCODE(V)        \
-  FOREACH_MISC_OPCODE(V)           \
-  FOREACH_SIMPLE_OPCODE(V)         \
-  FOREACH_STORE_MEM_OPCODE(V)      \
-  FOREACH_LOAD_MEM_OPCODE(V)       \
-  FOREACH_MISC_MEM_OPCODE(V)       \
-  FOREACH_ASMJS_COMPAT_OPCODE(V)   \
-  FOREACH_SIMD_0_OPERAND_OPCODE(V) \
-  FOREACH_SIMD_1_OPERAND_OPCODE(V) \
+#define FOREACH_OPCODE(V)             \
+  FOREACH_CONTROL_OPCODE(V)           \
+  FOREACH_MISC_OPCODE(V)              \
+  FOREACH_SIMPLE_OPCODE(V)            \
+  FOREACH_STORE_MEM_OPCODE(V)         \
+  FOREACH_LOAD_MEM_OPCODE(V)          \
+  FOREACH_MISC_MEM_OPCODE(V)          \
+  FOREACH_ASMJS_COMPAT_OPCODE(V)      \
+  FOREACH_SIMD_0_OPERAND_OPCODE(V)    \
+  FOREACH_SIMD_1_OPERAND_OPCODE(V)    \
+  FOREACH_SIMD_MASK_OPERAND_OPCODE(V) \
  FOREACH_ATOMIC_OPCODE(V)

 // All signatures.
--- a/src/wasm/wasm-text.cc
+++ b/src/wasm/wasm-text.cc
@ -194,6 +194,7 @@ void wasm::PrintWasmText(const WasmModule *module,
        // they are publicly available.
        FOREACH_SIMD_0_OPERAND_OPCODE(CASE_OPCODE)
        FOREACH_SIMD_1_OPERAND_OPCODE(CASE_OPCODE)
+        FOREACH_SIMD_MASK_OPERAND_OPCODE(CASE_OPCODE)
        FOREACH_ATOMIC_OPCODE(CASE_OPCODE)
        os << WasmOpcodes::OpcodeName(opcode);
        break;
--- a/test/cctest/wasm/test-run-wasm-simd.cc
+++ b/test/cctest/wasm/test-run-wasm-simd.cc
@ -393,6 +393,20 @@ T RecipSqrt(T a) {
 #define WASM_SIMD_I8x16_REPLACE_LANE(lane, x, y) \
  x, y, WASM_SIMD_OP(kExprI8x16ReplaceLane), TO_BYTE(lane)

+// Shuffle test helpers. Each expands to the two operand expressions |x| and
+// |y|, the SIMD opcode, and then one byte per lane read from the indexable
+// mask |m| (4, 8 or 16 entries respectively).
+#define WASM_SIMD_S32x4_SHUFFLE_OP(opcode, m, x, y)                        \
+  x, y, WASM_SIMD_OP(opcode), TO_BYTE(m[0]), TO_BYTE(m[1]), TO_BYTE(m[2]), \
+      TO_BYTE(m[3])
+#define WASM_SIMD_S16x8_SHUFFLE_OP(opcode, m, x, y)                        \
+  x, y, WASM_SIMD_OP(opcode), TO_BYTE(m[0]), TO_BYTE(m[1]), TO_BYTE(m[2]), \
+      TO_BYTE(m[3]), TO_BYTE(m[4]), TO_BYTE(m[5]), TO_BYTE(m[6]),          \
+      TO_BYTE(m[7])
+#define WASM_SIMD_S8x16_SHUFFLE_OP(opcode, m, x, y)                        \
+  x, y, WASM_SIMD_OP(opcode), TO_BYTE(m[0]), TO_BYTE(m[1]), TO_BYTE(m[2]), \
+      TO_BYTE(m[3]), TO_BYTE(m[4]), TO_BYTE(m[5]), TO_BYTE(m[6]),          \
+      TO_BYTE(m[7]), TO_BYTE(m[8]), TO_BYTE(m[9]), TO_BYTE(m[10]),         \
+      TO_BYTE(m[11]), TO_BYTE(m[12]), TO_BYTE(m[13]), TO_BYTE(m[14]),      \
+      TO_BYTE(m[15])
+
 // Skip FP tests involving extremely large or extremely small values, which
 // may fail due to non-IEEE-754 SIMD arithmetic on some platforms.
 bool SkipFPValue(float x) {
@ -1588,57 +1602,6 @@ WASM_SIMD_SELECT_TEST(16x8)
 WASM_SIMD_SELECT_TEST(8x16)
 #endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64

-#if V8_TARGET_ARCH_ARM
-// Test unary ops with a lane test pattern, all lanes distinct.
-template <typename T>
-void RunUnaryLaneOpTest(
-    WasmOpcode simd_op,
-    const std::array<T, kSimd128Size / sizeof(T)>& expected) {
-  FLAG_wasm_simd_prototype = true;
-  WasmRunner<int32_t> r(kExecuteCompiled);
-  // Set up a test pattern as a global, e.g. [0, 1, 2, 3].
-  T* global = r.module().AddGlobal<T>(kWasmS128);
-  static const size_t kElems = kSimd128Size / sizeof(T);
-  for (size_t i = 0; i < kElems; i++) {
-    global[i] = i;
-  }
-  BUILD(r, WASM_SET_GLOBAL(0, WASM_SIMD_UNOP(simd_op, WASM_GET_GLOBAL(0))),
-        WASM_ONE);
-
-  CHECK_EQ(1, r.Call());
-  for (size_t i = 0; i < kElems; i++) {
-    CHECK_EQ(global[i], expected[i]);
-  }
-}
-
-WASM_EXEC_COMPILED_TEST(S32x2Reverse) {
-  RunUnaryLaneOpTest<int32_t>(kExprS32x2Reverse, {{1, 0, 3, 2}});
-}
-
-WASM_EXEC_COMPILED_TEST(S16x4Reverse) {
-  RunUnaryLaneOpTest<int16_t>(kExprS16x4Reverse, {{3, 2, 1, 0, 7, 6, 5, 4}});
-}
-
-WASM_EXEC_COMPILED_TEST(S16x2Reverse) {
-  RunUnaryLaneOpTest<int16_t>(kExprS16x2Reverse, {{1, 0, 3, 2, 5, 4, 7, 6}});
-}
-
-WASM_EXEC_COMPILED_TEST(S8x8Reverse) {
-  RunUnaryLaneOpTest<int8_t>(kExprS8x8Reverse, {{7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
-                                                 13, 12, 11, 10, 9, 8}});
-}
-
-WASM_EXEC_COMPILED_TEST(S8x4Reverse) {
-  RunUnaryLaneOpTest<int8_t>(kExprS8x4Reverse, {{3, 2, 1, 0, 7, 6, 5, 4, 11, 10,
-                                                 9, 8, 15, 14, 13, 12}});
-}
-
-WASM_EXEC_COMPILED_TEST(S8x2Reverse) {
-  RunUnaryLaneOpTest<int8_t>(kExprS8x2Reverse, {{1, 0, 3, 2, 5, 4, 7, 6, 9, 8,
-                                                 11, 10, 13, 12, 15, 14}});
-}
-#endif  // V8_TARGET_ARCH_ARM
-
 #if V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64
 // Test binary ops with two lane test patterns, all lanes distinct.
 template <typename T>
@ -1648,21 +1611,50 @@ void RunBinaryLaneOpTest(
  FLAG_wasm_simd_prototype = true;
  WasmRunner<int32_t> r(kExecuteCompiled);
  // Set up two test patterns as globals, e.g. [0, 1, 2, 3] and [4, 5, 6, 7].
-  T* global1 = r.module().AddGlobal<T>(kWasmS128);
-  T* global2 = r.module().AddGlobal<T>(kWasmS128);
-  static const size_t kElems = kSimd128Size / sizeof(T);
-  for (size_t i = 0; i < kElems; i++) {
-    global1[i] = static_cast<T>(i);
-    global2[i] = static_cast<T>(kElems + i);
+  T* src0 = r.module().AddGlobal<T>(kWasmS128);
+  T* src1 = r.module().AddGlobal<T>(kWasmS128);
+  static const int kElems = kSimd128Size / sizeof(T);
+  for (int i = 0; i < kElems; i++) {
+    src0[i] = i;
+    src1[i] = kElems + i;
+  }
+  switch (simd_op) {
+    case kExprS32x4Shuffle: {
+      BUILD(r,
+            WASM_SET_GLOBAL(0, WASM_SIMD_S32x4_SHUFFLE_OP(simd_op, expected,
+                                                          WASM_GET_GLOBAL(0),
+                                                          WASM_GET_GLOBAL(1))),
+            WASM_ONE);
+      break;
+    }
+    case kExprS16x8Shuffle: {
+      BUILD(r,
+            WASM_SET_GLOBAL(0, WASM_SIMD_S16x8_SHUFFLE_OP(simd_op, expected,
+                                                          WASM_GET_GLOBAL(0),
+                                                          WASM_GET_GLOBAL(1))),
+            WASM_ONE);
+      break;
+    }
+    case kExprS8x16Shuffle: {
+      BUILD(r,
+            WASM_SET_GLOBAL(0, WASM_SIMD_S8x16_SHUFFLE_OP(simd_op, expected,
+                                                          WASM_GET_GLOBAL(0),
+                                                          WASM_GET_GLOBAL(1))),
+            WASM_ONE);
+      break;
+    }
+    default: {
+      BUILD(r,
+            WASM_SET_GLOBAL(0, WASM_SIMD_BINOP(simd_op, WASM_GET_GLOBAL(0),
+                                               WASM_GET_GLOBAL(1))),
+            WASM_ONE);
+      break;
+    }
  }
-  BUILD(r,
-        WASM_SET_GLOBAL(0, WASM_SIMD_BINOP(simd_op, WASM_GET_GLOBAL(0),
-                                           WASM_GET_GLOBAL(1))),
-        WASM_ONE);

  CHECK_EQ(1, r.Call());
  for (size_t i = 0; i < expected.size(); i++) {
-    CHECK_EQ(global1[i], expected[i]);
+    CHECK_EQ(src0[i], expected[i]);
  }
 }

@ -1681,134 +1673,179 @@ WASM_EXEC_COMPILED_TEST(F32x4AddHoriz) {
  RunBinaryLaneOpTest<float>(kExprF32x4AddHoriz, {{1.0f, 5.0f, 9.0f, 13.0f}});
 }

+// Test some regular shuffles that may have special handling on some targets.
+// Test both normal and unary versions (where the second operand isn't used).
 WASM_EXEC_COMPILED_TEST(S32x4ZipLeft) {
-  RunBinaryLaneOpTest<int32_t>(kExprS32x4ZipLeft, {{0, 4, 1, 5}});
+  RunBinaryLaneOpTest<int32_t>(kExprS32x4Shuffle, {{0, 4, 1, 5}});
+  RunBinaryLaneOpTest<int32_t>(kExprS32x4Shuffle, {{0, 0, 1, 1}});
 }

 WASM_EXEC_COMPILED_TEST(S32x4ZipRight) {
-  RunBinaryLaneOpTest<int32_t>(kExprS32x4ZipRight, {{2, 6, 3, 7}});
+  RunBinaryLaneOpTest<int32_t>(kExprS32x4Shuffle, {{2, 6, 3, 7}});
+  RunBinaryLaneOpTest<int32_t>(kExprS32x4Shuffle, {{2, 2, 3, 3}});
 }

 WASM_EXEC_COMPILED_TEST(S32x4UnzipLeft) {
-  RunBinaryLaneOpTest<int32_t>(kExprS32x4UnzipLeft, {{0, 2, 4, 6}});
+  RunBinaryLaneOpTest<int32_t>(kExprS32x4Shuffle, {{0, 2, 4, 6}});
+  RunBinaryLaneOpTest<int32_t>(kExprS32x4Shuffle, {{0, 2, 0, 2}});
 }

 WASM_EXEC_COMPILED_TEST(S32x4UnzipRight) {
-  RunBinaryLaneOpTest<int32_t>(kExprS32x4UnzipRight, {{1, 3, 5, 7}});
+  RunBinaryLaneOpTest<int32_t>(kExprS32x4Shuffle, {{1, 3, 5, 7}});
+  RunBinaryLaneOpTest<int32_t>(kExprS32x4Shuffle, {{1, 3, 1, 3}});
 }

 WASM_EXEC_COMPILED_TEST(S32x4TransposeLeft) {
-  RunBinaryLaneOpTest<int32_t>(kExprS32x4TransposeLeft, {{0, 4, 2, 6}});
+  RunBinaryLaneOpTest<int32_t>(kExprS32x4Shuffle, {{0, 4, 2, 6}});
+  RunBinaryLaneOpTest<int32_t>(kExprS32x4Shuffle, {{0, 0, 2, 2}});
 }

 WASM_EXEC_COMPILED_TEST(S32x4TransposeRight) {
-  RunBinaryLaneOpTest<int32_t>(kExprS32x4TransposeRight, {{1, 5, 3, 7}});
+  RunBinaryLaneOpTest<int32_t>(kExprS32x4Shuffle, {{1, 5, 3, 7}});
+  RunBinaryLaneOpTest<int32_t>(kExprS32x4Shuffle, {{1, 1, 3, 3}});
+}
+
+// Reverses are only unary.
+WASM_EXEC_COMPILED_TEST(S32x2Reverse) {
+  RunBinaryLaneOpTest<int32_t>(kExprS32x4Shuffle, {{1, 0, 3, 2}});
 }

 WASM_EXEC_COMPILED_TEST(S16x8ZipLeft) {
-  RunBinaryLaneOpTest<int16_t>(kExprS16x8ZipLeft, {{0, 8, 1, 9, 2, 10, 3, 11}});
+  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle, {{0, 8, 1, 9, 2, 10, 3, 11}});
+  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle, {{0, 0, 1, 1, 2, 2, 3, 3}});
 }

 WASM_EXEC_COMPILED_TEST(S16x8ZipRight) {
-  RunBinaryLaneOpTest<int16_t>(kExprS16x8ZipRight,
+  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle,
                               {{4, 12, 5, 13, 6, 14, 7, 15}});
+  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle, {{4, 4, 5, 5, 6, 6, 7, 7}});
 }

 WASM_EXEC_COMPILED_TEST(S16x8UnzipLeft) {
-  RunBinaryLaneOpTest<int16_t>(kExprS16x8UnzipLeft,
+  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle,
                               {{0, 2, 4, 6, 8, 10, 12, 14}});
+  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle, {{0, 2, 4, 6, 0, 2, 4, 6}});
 }

 WASM_EXEC_COMPILED_TEST(S16x8UnzipRight) {
-  RunBinaryLaneOpTest<int16_t>(kExprS16x8UnzipRight,
+  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle,
                               {{1, 3, 5, 7, 9, 11, 13, 15}});
+  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle, {{1, 3, 5, 7, 1, 3, 5, 7}});
 }

 WASM_EXEC_COMPILED_TEST(S16x8TransposeLeft) {
-  RunBinaryLaneOpTest<int16_t>(kExprS16x8TransposeLeft,
+  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle,
                               {{0, 8, 2, 10, 4, 12, 6, 14}});
+  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle, {{0, 0, 2, 2, 4, 4, 6, 6}});
 }

 WASM_EXEC_COMPILED_TEST(S16x8TransposeRight) {
-  RunBinaryLaneOpTest<int16_t>(kExprS16x8TransposeRight,
+  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle,
                               {{1, 9, 3, 11, 5, 13, 7, 15}});
+  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle, {{1, 1, 3, 3, 5, 5, 7, 7}});
+}
+
+WASM_EXEC_COMPILED_TEST(S16x4Reverse) {
+  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle, {{3, 2, 1, 0, 7, 6, 5, 4}});
+}
+
+WASM_EXEC_COMPILED_TEST(S16x2Reverse) {
+  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle, {{1, 0, 3, 2, 5, 4, 7, 6}});
 }

 WASM_EXEC_COMPILED_TEST(S8x16ZipLeft) {
  RunBinaryLaneOpTest<int8_t>(
-      kExprS8x16ZipLeft,
+      kExprS8x16Shuffle,
      {{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23}});
+  RunBinaryLaneOpTest<int8_t>(
+      kExprS8x16Shuffle, {{0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}});
 }

 WASM_EXEC_COMPILED_TEST(S8x16ZipRight) {
  RunBinaryLaneOpTest<int8_t>(
-      kExprS8x16ZipRight,
+      kExprS8x16Shuffle,
      {{8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}});
+  RunBinaryLaneOpTest<int8_t>(
+      kExprS8x16Shuffle,
+      {{8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}});
 }

 WASM_EXEC_COMPILED_TEST(S8x16UnzipLeft) {
  RunBinaryLaneOpTest<int8_t>(
-      kExprS8x16UnzipLeft,
+      kExprS8x16Shuffle,
      {{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}});
+  RunBinaryLaneOpTest<int8_t>(kExprS8x16Shuffle, {{0, 2, 4, 6, 8, 10, 12, 14, 0,
+                                                   2, 4, 6, 8, 10, 12, 14}});
 }

 WASM_EXEC_COMPILED_TEST(S8x16UnzipRight) {
  RunBinaryLaneOpTest<int8_t>(
-      kExprS8x16UnzipRight,
+      kExprS8x16Shuffle,
      {{1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}});
+  RunBinaryLaneOpTest<int8_t>(kExprS8x16Shuffle, {{1, 3, 5, 7, 9, 11, 13, 15, 1,
+                                                   3, 5, 7, 9, 11, 13, 15}});
 }

 WASM_EXEC_COMPILED_TEST(S8x16TransposeLeft) {
  RunBinaryLaneOpTest<int8_t>(
-      kExprS8x16TransposeLeft,
+      kExprS8x16Shuffle,
      {{0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}});
+  RunBinaryLaneOpTest<int8_t>(kExprS8x16Shuffle, {{0, 0, 2, 2, 4, 4, 6, 6, 8, 8,
+                                                   10, 10, 12, 12, 14, 14}});
 }

 WASM_EXEC_COMPILED_TEST(S8x16TransposeRight) {
  RunBinaryLaneOpTest<int8_t>(
-      kExprS8x16TransposeRight,
+      kExprS8x16Shuffle,
      {{1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31}});
+  RunBinaryLaneOpTest<int8_t>(kExprS8x16Shuffle, {{1, 1, 3, 3, 5, 5, 7, 7, 9, 9,
+                                                   11, 11, 13, 13, 15, 15}});
 }

-template <typename T>
-void RunConcatOpTest(WasmOpcode simd_op, int bytes,
-                     const std::array<T, kSimd128Size / sizeof(T)>& expected) {
-  FLAG_wasm_simd_prototype = true;
-  WasmRunner<int32_t> r(kExecuteCompiled);
-  // Set up two test patterns as globals, e.g. [0, 1, 2, 3] and [4, 5, 6, 7].
-  T* global1 = r.module().AddGlobal<T>(kWasmS128);
-  T* global2 = r.module().AddGlobal<T>(kWasmS128);
-  static const size_t kElems = kSimd128Size / sizeof(T);
-  for (size_t i = 0; i < kElems; i++) {
-    global1[i] = i;
-    global2[i] = kElems + i;
-  }
-  BUILD(
-      r,
-      WASM_SET_GLOBAL(0, WASM_SIMD_CONCAT_OP(simd_op, bytes, WASM_GET_GLOBAL(0),
-                                             WASM_GET_GLOBAL(1))),
-      WASM_ONE);
+WASM_EXEC_COMPILED_TEST(S8x8Reverse) {
+  RunBinaryLaneOpTest<int8_t>(kExprS8x16Shuffle, {{7, 6, 5, 4, 3, 2, 1, 0, 15,
+                                                   14, 13, 12, 11, 10, 9, 8}});
+}

-  CHECK_EQ(1, r.Call());
-  for (size_t i = 0; i < expected.size(); i++) {
-    CHECK_EQ(global1[i], expected[i]);
+WASM_EXEC_COMPILED_TEST(S8x4Reverse) {
+  RunBinaryLaneOpTest<int8_t>(kExprS8x16Shuffle, {{3, 2, 1, 0, 7, 6, 5, 4, 11,
+                                                   10, 9, 8, 15, 14, 13, 12}});
+}
+
+WASM_EXEC_COMPILED_TEST(S8x2Reverse) {
+  RunBinaryLaneOpTest<int8_t>(kExprS8x16Shuffle, {{1, 0, 3, 2, 5, 4, 7, 6, 9, 8,
+                                                   11, 10, 13, 12, 15, 14}});
+}
+
+// Test shuffles that concatenate the two vectors.
+template <typename T>
+void RunConcatOpTest(WasmOpcode simd_op) {
+  static const int kLanes = kSimd128Size / sizeof(T);
+  std::array<T, kLanes> expected;
+  for (int bias = 1; bias < kLanes; bias++) {
+    int i = 0;
+    // last kLanes - bias lanes of first vector.
+    for (int j = bias; j < kLanes; j++) {
+      expected[i++] = j;
+    }
+    // first bias lanes of second vector
+    for (int j = 0; j < bias; j++) {
+      expected[i++] = j + kLanes;
+    }
+    RunBinaryLaneOpTest<T>(simd_op, expected);
  }
 }

+WASM_EXEC_COMPILED_TEST(S32x4Concat) {
+  RunConcatOpTest<int32_t>(kExprS32x4Shuffle);
+}
+
+WASM_EXEC_COMPILED_TEST(S16x8Concat) {
+  RunConcatOpTest<int16_t>(kExprS16x8Shuffle);
+}
+
 WASM_EXEC_COMPILED_TEST(S8x16Concat) {
-  std::array<int8_t, kSimd128Size> expected;
-  for (int k = 1; k < 16; k++) {
-    int j = 0;
-    // last 16 - k bytes of first vector.
-    for (int i = k; i < kSimd128Size; i++) {
-      expected[j++] = i;
-    }
-    // first k bytes of second vector
-    for (int i = 0; i < k; i++) {
-      expected[j++] = i + kSimd128Size;
-    }
-    RunConcatOpTest<int8_t>(kExprS8x16Concat, k, expected);
-  }
+  RunConcatOpTest<int8_t>(kExprS8x16Shuffle);
 }

 // Boolean unary operations are 'AllTrue' and 'AnyTrue', which return an integer
--- a/test/unittests/wasm/function-body-decoder-unittest.cc
+++ b/test/unittests/wasm/function-body-decoder-unittest.cc
@ -2634,6 +2634,10 @@ TEST_F(WasmOpcodeLengthTest, SimdExpressions) {
 #define TEST_SIMD(name, opcode, sig) \
  EXPECT_LENGTH_N(3, kSimdPrefix, static_cast<byte>(kExpr##name & 0xff));
  FOREACH_SIMD_1_OPERAND_OPCODE(TEST_SIMD)
+#undef TEST_SIMD
+  EXPECT_LENGTH_N(6, kSimdPrefix, static_cast<byte>(kExprS32x4Shuffle & 0xff));
+  EXPECT_LENGTH_N(10, kSimdPrefix, static_cast<byte>(kExprS16x8Shuffle & 0xff));
+  EXPECT_LENGTH_N(18, kSimdPrefix, static_cast<byte>(kExprS8x16Shuffle & 0xff));
 #undef TEST_SIMD
  // test for bad simd opcode
  EXPECT_LENGTH_N(2, kSimdPrefix, 0xff);