[wasm simd] Handle more shuffles

- Handles zip, unzip, and transpose shuffles/swizzles.
- Adds punpck* instructions to assembler.

Bug: v8:6020
Change-Id: If124b7a7462ffd0470347b54ce4a93c01667e384
Reviewed-on: https://chromium-review.googlesource.com/1084069
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#53947}
2018-06-21 06:58:02 -07:00 · 2018-06-21 06:58:02 -07:00 · 51ded9a743
commit 51ded9a743
parent edfcba0407
6 changed files with 411 additions and 45 deletions
--- a/src/compiler/ia32/code-generator-ia32.cc
+++ b/src/compiler/ia32/code-generator-ia32.cc
@ -420,6 +420,30 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen,
    }                                                       \
  } while (0)

+// Emits the given punpck* instruction for an unpack shuffle or swizzle.
+// Input 0 is always read as a register. The second operand is input 1 when
+// the instruction has exactly two inputs, and input 0 again otherwise.
+// With AVX the three-operand v-prefixed form is emitted. Without AVX the
+// two-operand form is emitted, and the output register is DCHECKed to be the
+// same as input 0.
+#define ASSEMBLE_SIMD_PUNPCK_SHUFFLE(opcode)                         \
+  do {                                                               \
+    XMMRegister src0 = i.InputSimd128Register(0);                    \
+    Operand src1 = i.InputOperand(instr->InputCount() == 2 ? 1 : 0); \
+    if (CpuFeatures::IsSupported(AVX)) {                             \
+      CpuFeatureScope avx_scope(tasm(), AVX);                        \
+      __ v##opcode(i.OutputSimd128Register(), src0, src1);           \
+    } else {                                                         \
+      DCHECK_EQ(i.OutputSimd128Register(), src0);                    \
+      __ opcode(i.OutputSimd128Register(), src1);                    \
+    }                                                                \
+  } while (false)
+
+// Emits a two-source shuffle instruction that takes an 8-bit immediate.
+// With AVX the three-operand v-prefixed form is emitted. Without AVX the
+// two-operand form is emitted under a CpuFeatureScope for |SSELevel|, and the
+// output register is DCHECKed to be the same as input 0.
+// The body is wrapped in do { } while (false) so that the macro expands to a
+// single statement: the caller's trailing ';' is consumed and the macro can be
+// used safely inside an unbraced if/else.
+#define ASSEMBLE_SIMD_IMM_SHUFFLE(opcode, SSELevel, imm)                 \
+  do {                                                                   \
+    if (CpuFeatures::IsSupported(AVX)) {                                 \
+      CpuFeatureScope avx_scope(tasm(), AVX);                            \
+      __ v##opcode(i.OutputSimd128Register(), i.InputSimd128Register(0), \
+                   i.InputOperand(1), imm);                              \
+    } else {                                                             \
+      CpuFeatureScope sse_scope(tasm(), SSELevel);                       \
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));   \
+      __ opcode(i.OutputSimd128Register(), i.InputOperand(1), imm);      \
+    }                                                                    \
+  } while (false)
+
 void CodeGenerator::AssembleDeconstructFrame() {
  __ mov(esp, ebp);
  __ pop(ebp);
@ -3195,23 +3219,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
    }
    case kIA32S32x4Shuffle: {
      DCHECK_EQ(4, instr->InputCount());  // Swizzles should be handled above.
-      __ Pshufd(kScratchDoubleReg, i.InputOperand(1), i.InputInt8(2));
-      __ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), i.InputInt8(2));
+      // Hold the pshufd mask as an unsigned byte. 0xe4 (228) is not
+      // representable in int8_t, so comparing it against a signed value could
+      // never fail and the identity-shuffle check below would be vacuous.
+      uint8_t shuffle = static_cast<uint8_t>(i.InputInt8(2));
+      DCHECK_NE(0xe4, shuffle);  // A simple blend should be handled below.
+      // Apply the same lane permutation to both inputs, then pick lanes from
+      // either result with the blend mask in immediate input 3.
+      __ Pshufd(kScratchDoubleReg, i.InputOperand(1), shuffle);
+      __ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), shuffle);
      __ Pblendw(i.OutputSimd128Register(), kScratchDoubleReg, i.InputInt8(3));
      break;
    }
-    case kSSES16x8Blend: {
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      __ pblendw(i.OutputSimd128Register(), i.InputOperand(1), i.InputInt8(2));
+    case kIA32S16x8Blend:
+      ASSEMBLE_SIMD_IMM_SHUFFLE(pblendw, SSE4_1, i.InputInt8(2));
      break;
-    }
-    case kAVXS16x8Blend: {
-      CpuFeatureScope sse_scope(tasm(), AVX);
-      __ vpblendw(i.OutputSimd128Register(), i.InputSimd128Register(0),
-                  i.InputOperand(1), i.InputInt8(2));
-      break;
-    }
    case kIA32S16x8HalfShuffle1: {
      XMMRegister dst = i.OutputSimd128Register();
      __ Pshuflw(dst, i.InputOperand(0), i.InputInt8(1));
@ -3227,18 +3244,202 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ Pblendw(dst, kScratchDoubleReg, i.InputInt8(4));
      break;
    }
-    case kSSES8x16Alignr: {
-      CpuFeatureScope sse_scope(tasm(), SSSE3);
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      __ palignr(i.OutputSimd128Register(), i.InputOperand(1), i.InputInt8(2));
+    case kIA32S8x16Alignr:
+      ASSEMBLE_SIMD_IMM_SHUFFLE(palignr, SSSE3, i.InputInt8(2));
+      break;
+    case kIA32S64x2UnpackHigh:
+      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhqdq);
+      break;
+    case kIA32S32x4UnpackHigh:
+      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhdq);
+      break;
+    case kIA32S16x8UnpackHigh:
+      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhwd);
+      break;
+    case kIA32S8x16UnpackHigh:
+      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhbw);
+      break;
+    case kIA32S64x2UnpackLow:
+      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklqdq);
+      break;
+    case kIA32S32x4UnpackLow:
+      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckldq);
+      break;
+    case kIA32S16x8UnpackLow:
+      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklwd);
+      break;
+    case kIA32S8x16UnpackLow:
+      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklbw);
+      break;
+    case kSSES16x8UnzipHigh: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src2 = dst;
+      DCHECK_EQ(dst, i.InputSimd128Register(0));
+      if (instr->InputCount() == 2) {
+        __ movups(kScratchDoubleReg, i.InputOperand(1));
+        __ psrld(kScratchDoubleReg, 16);
+        src2 = kScratchDoubleReg;
+      }
+      __ psrld(dst, 16);
+      __ packusdw(dst, src2);
      break;
    }
-    case kAVXS8x16Alignr: {
+    case kAVXS16x8UnzipHigh: {
      CpuFeatureScope avx_scope(tasm(), AVX);
-      __ vpalignr(i.OutputSimd128Register(), i.InputSimd128Register(0),
-                  i.InputOperand(1), i.InputInt8(2));
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src2 = dst;
+      if (instr->InputCount() == 2) {
+        __ vpsrld(kScratchDoubleReg, i.InputSimd128Register(1), 16);
+        src2 = kScratchDoubleReg;
+      }
+      __ vpsrld(dst, i.InputSimd128Register(0), 16);
+      __ vpackusdw(dst, dst, src2);
      break;
    }
+    case kSSES16x8UnzipLow: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src2 = dst;
+      DCHECK_EQ(dst, i.InputSimd128Register(0));
+      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
+      if (instr->InputCount() == 2) {
+        __ pblendw(kScratchDoubleReg, i.InputOperand(1), 0x55);
+        src2 = kScratchDoubleReg;
+      }
+      __ pblendw(dst, kScratchDoubleReg, 0xaa);
+      __ packusdw(dst, src2);
+      break;
+    }
+    case kAVXS16x8UnzipLow: {
+      CpuFeatureScope avx_scope(tasm(), AVX);
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src2 = dst;
+      __ vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
+      if (instr->InputCount() == 2) {
+        __ vpblendw(kScratchDoubleReg, kScratchDoubleReg, i.InputOperand(1),
+                    0x55);
+        src2 = kScratchDoubleReg;
+      }
+      __ vpblendw(dst, kScratchDoubleReg, i.InputSimd128Register(0), 0x55);
+      __ vpackusdw(dst, dst, src2);
+      break;
+    }
+    case kSSES8x16UnzipHigh: {
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src2 = dst;
+      DCHECK_EQ(dst, i.InputSimd128Register(0));
+      if (instr->InputCount() == 2) {
+        __ movups(kScratchDoubleReg, i.InputOperand(1));
+        __ psrlw(kScratchDoubleReg, 8);
+        src2 = kScratchDoubleReg;
+      }
+      __ psrlw(dst, 8);
+      __ packuswb(dst, src2);
+      break;
+    }
+    case kAVXS8x16UnzipHigh: {
+      CpuFeatureScope avx_scope(tasm(), AVX);
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src2 = dst;
+      if (instr->InputCount() == 2) {
+        __ vpsrlw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
+        src2 = kScratchDoubleReg;
+      }
+      __ vpsrlw(dst, i.InputSimd128Register(0), 8);
+      __ vpackuswb(dst, dst, src2);
+      break;
+    }
+    case kSSES8x16UnzipLow: {
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src2 = dst;
+      DCHECK_EQ(dst, i.InputSimd128Register(0));
+      if (instr->InputCount() == 2) {
+        __ movups(kScratchDoubleReg, i.InputOperand(1));
+        __ psllw(kScratchDoubleReg, 8);
+        __ psrlw(kScratchDoubleReg, 8);
+        src2 = kScratchDoubleReg;
+      }
+      __ psllw(dst, 8);
+      __ psrlw(dst, 8);
+      __ packuswb(dst, src2);
+      break;
+    }
+    case kAVXS8x16UnzipLow: {
+      CpuFeatureScope avx_scope(tasm(), AVX);
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src2 = dst;
+      if (instr->InputCount() == 2) {
+        __ vpsllw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
+        __ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 8);
+        src2 = kScratchDoubleReg;
+      }
+      __ vpsllw(dst, i.InputSimd128Register(0), 8);
+      __ vpsrlw(dst, dst, 8);
+      __ vpackuswb(dst, dst, src2);
+      break;
+    }
+    case kSSES8x16TransposeLow: {
+      XMMRegister dst = i.OutputSimd128Register();
+      DCHECK_EQ(dst, i.InputSimd128Register(0));
+      __ psllw(dst, 8);
+      if (instr->InputCount() == 1) {
+        __ movups(kScratchDoubleReg, dst);
+      } else {
+        DCHECK_EQ(2, instr->InputCount());
+        __ movups(kScratchDoubleReg, i.InputOperand(1));
+        __ psllw(kScratchDoubleReg, 8);
+      }
+      __ psrlw(dst, 8);
+      __ por(dst, kScratchDoubleReg);
+      break;
+    }
+    case kAVXS8x16TransposeLow: {
+      CpuFeatureScope avx_scope(tasm(), AVX);
+      XMMRegister dst = i.OutputSimd128Register();
+      if (instr->InputCount() == 1) {
+        __ vpsllw(kScratchDoubleReg, i.InputSimd128Register(0), 8);
+        __ vpsrlw(dst, kScratchDoubleReg, 8);
+      } else {
+        DCHECK_EQ(2, instr->InputCount());
+        __ vpsllw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
+        __ vpsllw(dst, i.InputSimd128Register(0), 8);
+        __ vpsrlw(dst, dst, 8);
+      }
+      __ vpor(dst, dst, kScratchDoubleReg);
+      break;
+    }
+    case kSSES8x16TransposeHigh: {
+      XMMRegister dst = i.OutputSimd128Register();
+      DCHECK_EQ(dst, i.InputSimd128Register(0));
+      __ psrlw(dst, 8);
+      if (instr->InputCount() == 1) {
+        __ movups(kScratchDoubleReg, dst);
+      } else {
+        DCHECK_EQ(2, instr->InputCount());
+        __ movups(kScratchDoubleReg, i.InputOperand(1));
+        __ psrlw(kScratchDoubleReg, 8);
+      }
+      __ psllw(kScratchDoubleReg, 8);
+      __ por(dst, kScratchDoubleReg);
+      break;
+    }
+    case kAVXS8x16TransposeHigh: {
+      CpuFeatureScope avx_scope(tasm(), AVX);
+      XMMRegister dst = i.OutputSimd128Register();
+      if (instr->InputCount() == 1) {
+        __ vpsrlw(dst, i.InputSimd128Register(0), 8);
+        __ vpsllw(kScratchDoubleReg, dst, 8);
+      } else {
+        DCHECK_EQ(2, instr->InputCount());
+        __ vpsrlw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
+        __ vpsrlw(dst, i.InputSimd128Register(0), 8);
+        __ vpsllw(kScratchDoubleReg, kScratchDoubleReg, 8);
+      }
+      __ vpor(dst, dst, kScratchDoubleReg);
+      break;
+    }
+
    case kIA32S1x4AnyTrue:
    case kIA32S1x8AnyTrue:
    case kIA32S1x16AnyTrue: {
@ -4136,6 +4337,8 @@ void CodeGenerator::AssembleJumpTable(Label** targets, size_t target_count) {
 #undef ASSEMBLE_BINOP
 #undef ASSEMBLE_ATOMIC_BINOP
 #undef ASSEMBLE_MOVX
+#undef ASSEMBLE_SIMD_PUNPCK_SHUFFLE
+#undef ASSEMBLE_SIMD_IMM_SHUFFLE

 }  // namespace compiler
 }  // namespace internal
--- a/src/compiler/ia32/instruction-codes-ia32.h
+++ b/src/compiler/ia32/instruction-codes-ia32.h
@ -303,12 +303,30 @@ namespace compiler {
  V(IA32S8x16Shuffle)              \
  V(IA32S32x4Swizzle)              \
  V(IA32S32x4Shuffle)              \
-  V(SSES16x8Blend)                 \
-  V(AVXS16x8Blend)                 \
+  V(IA32S16x8Blend)                \
  V(IA32S16x8HalfShuffle1)         \
  V(IA32S16x8HalfShuffle2)         \
-  V(SSES8x16Alignr)                \
-  V(AVXS8x16Alignr)                \
+  V(IA32S8x16Alignr)               \
+  V(SSES16x8UnzipHigh)             \
+  V(AVXS16x8UnzipHigh)             \
+  V(SSES16x8UnzipLow)              \
+  V(AVXS16x8UnzipLow)              \
+  V(SSES8x16UnzipHigh)             \
+  V(AVXS8x16UnzipHigh)             \
+  V(SSES8x16UnzipLow)              \
+  V(AVXS8x16UnzipLow)              \
+  V(IA32S64x2UnpackHigh)           \
+  V(IA32S32x4UnpackHigh)           \
+  V(IA32S16x8UnpackHigh)           \
+  V(IA32S8x16UnpackHigh)           \
+  V(IA32S64x2UnpackLow)            \
+  V(IA32S32x4UnpackLow)            \
+  V(IA32S16x8UnpackLow)            \
+  V(IA32S8x16UnpackLow)            \
+  V(SSES8x16TransposeLow)          \
+  V(AVXS8x16TransposeLow)          \
+  V(SSES8x16TransposeHigh)         \
+  V(AVXS8x16TransposeHigh)         \
  V(IA32S1x4AnyTrue)               \
  V(IA32S1x4AllTrue)               \
  V(IA32S1x8AnyTrue)               \
--- a/src/compiler/ia32/instruction-scheduler-ia32.cc
+++ b/src/compiler/ia32/instruction-scheduler-ia32.cc
@ -285,12 +285,30 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kIA32S8x16Shuffle:
    case kIA32S32x4Swizzle:
    case kIA32S32x4Shuffle:
-    case kSSES16x8Blend:
-    case kAVXS16x8Blend:
+    case kIA32S16x8Blend:
    case kIA32S16x8HalfShuffle1:
    case kIA32S16x8HalfShuffle2:
-    case kSSES8x16Alignr:
-    case kAVXS8x16Alignr:
+    case kIA32S8x16Alignr:
+    case kSSES16x8UnzipHigh:
+    case kAVXS16x8UnzipHigh:
+    case kSSES16x8UnzipLow:
+    case kAVXS16x8UnzipLow:
+    case kSSES8x16UnzipHigh:
+    case kAVXS8x16UnzipHigh:
+    case kSSES8x16UnzipLow:
+    case kAVXS8x16UnzipLow:
+    case kIA32S64x2UnpackHigh:
+    case kIA32S32x4UnpackHigh:
+    case kIA32S16x8UnpackHigh:
+    case kIA32S8x16UnpackHigh:
+    case kIA32S64x2UnpackLow:
+    case kIA32S32x4UnpackLow:
+    case kIA32S16x8UnpackLow:
+    case kIA32S8x16UnpackLow:
+    case kSSES8x16TransposeLow:
+    case kAVXS8x16TransposeLow:
+    case kSSES8x16TransposeHigh:
+    case kAVXS8x16TransposeHigh:
    case kIA32S1x4AnyTrue:
    case kIA32S1x4AllTrue:
    case kIA32S1x8AnyTrue:
--- a/src/compiler/ia32/instruction-selector-ia32.cc
+++ b/src/compiler/ia32/instruction-selector-ia32.cc
@ -2059,18 +2059,125 @@ uint8_t PackBlend4(const uint8_t* shuffle32x4) {
  return result;
 }

-// Returns true if shuffle can be separated into two half shuffles, i.e.lanes
-// don't move from low 4 lanes to high 4 lanes or vice versa) and a blend.
+// Returns true if shuffle can be decomposed into two 16x4 half shuffles
+// followed by a 16x8 blend.
 // E.g. [3 2 1 0 15 14 13 12].
+// A lane qualifies when bit 2 of its index equals bit 2 of its position, i.e.
+// it stays within the same group of four lanes. On success, bit i of
+// |*blend_mask| is set iff shuffle16x8[i] > 7. On failure, |*blend_mask| holds
+// only the bits of the lanes examined before the mismatch.
-bool Is16x8BlendedShuffle(uint8_t* shuffle16x8, uint8_t* blend_mask) {
+bool TryMatch16x8HalfShuffle(uint8_t* shuffle16x8, uint8_t* blend_mask) {
  *blend_mask = 0;
  for (int i = 0; i < 8; i++) {
-    *blend_mask |= (shuffle16x8[i] > 7 ? 1 : 0) << i;
    if ((shuffle16x8[i] & 0x4) != (i & 0x4)) return false;
+    *blend_mask |= (shuffle16x8[i] > 7 ? 1 : 0) << i;
  }
  return true;
 }

+// One row of a shuffle pattern table: a byte lane pattern together with the
+// opcodes and operand constraints to use when a shuffle matches it.
+struct ShuffleEntry {
+  // Byte lane pattern that a candidate shuffle is compared against.
+  uint8_t shuffle[kSimd128Size];
+  // Opcode selected when AVX is not used.
+  ArchOpcode opcode;
+  // Opcode selected when AVX is used.
+  ArchOpcode avx_opcode;
+  // Whether input 0 is requested with UseRegister rather than Use.
+  bool src0_needs_reg;
+  // Whether input 1 is requested with UseRegister rather than Use. The
+  // shuffle visitor honors this only when AVX is used.
+  bool src1_needs_reg;
+};
+
+// Shuffles that map to architecture-specific instruction sequences. These are
+// matched very early, so we shouldn't include shuffles that match better in
+// later tests, like 32x4 and 16x8 shuffles. In general, these patterns should
+// map to either a single instruction, or be finer grained, such as zip/unzip or
+// transpose patterns.
+// Each entry is {pattern, opcode, avx_opcode, src0_needs_reg, src1_needs_reg}.
+static const ShuffleEntry arch_shuffles[] = {
+    // Unpack patterns: lanes of the low or high half of both inputs,
+    // interleaved at 64-, 32-, 16- and 8-bit granularity. The same opcode is
+    // listed for both the non-AVX and the AVX slot.
+    {{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23},
+     kIA32S64x2UnpackLow,
+     kIA32S64x2UnpackLow,
+     true,
+     false},
+    {{8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
+     kIA32S64x2UnpackHigh,
+     kIA32S64x2UnpackHigh,
+     true,
+     false},
+    {{0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23},
+     kIA32S32x4UnpackLow,
+     kIA32S32x4UnpackLow,
+     true,
+     false},
+    {{8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31},
+     kIA32S32x4UnpackHigh,
+     kIA32S32x4UnpackHigh,
+     true,
+     false},
+    {{0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23},
+     kIA32S16x8UnpackLow,
+     kIA32S16x8UnpackLow,
+     true,
+     false},
+    {{8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31},
+     kIA32S16x8UnpackHigh,
+     kIA32S16x8UnpackHigh,
+     true,
+     false},
+    {{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23},
+     kIA32S8x16UnpackLow,
+     kIA32S8x16UnpackLow,
+     true,
+     false},
+    {{8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31},
+     kIA32S8x16UnpackHigh,
+     kIA32S8x16UnpackHigh,
+     true,
+     false},
+
+    // Unzip patterns: every other 16-bit or 8-bit lane taken across both
+    // inputs (even lanes for Low, odd lanes for High).
+    {{0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29},
+     kSSES16x8UnzipLow,
+     kAVXS16x8UnzipLow,
+     true,
+     false},
+    {{2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31},
+     kSSES16x8UnzipHigh,
+     kAVXS16x8UnzipHigh,
+     true,
+     true},
+    {{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30},
+     kSSES8x16UnzipLow,
+     kAVXS8x16UnzipLow,
+     true,
+     true},
+    {{1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31},
+     kSSES8x16UnzipHigh,
+     kAVXS8x16UnzipHigh,
+     true,
+     true},
+
+    // Transpose patterns: the even (Low) or odd (High) bytes of the first
+    // input interleaved with the same bytes of the second input.
+    {{0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30},
+     kSSES8x16TransposeLow,
+     kAVXS8x16TransposeLow,
+     true,
+     true},
+    {{1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31},
+     kSSES8x16TransposeHigh,
+     kAVXS8x16TransposeHigh,
+     true,
+     true}};
+
+// Searches the first |num_entries| rows of |table| for one whose lane pattern
+// equals |shuffle|. Before comparison both sides are masked with
+// kSimd128Size - 1 for swizzles and with 2 * kSimd128Size - 1 otherwise.
+// On the first hit, stores that row's address in |*arch_shuffle| and returns
+// true. Returns false, leaving |*arch_shuffle| unwritten, when no row matches.
+bool TryMatchArchShuffle(const uint8_t* shuffle, const ShuffleEntry* table,
+                         size_t num_entries, bool is_swizzle,
+                         const ShuffleEntry** arch_shuffle) {
+  const uint8_t lane_mask =
+      is_swizzle ? kSimd128Size - 1 : 2 * kSimd128Size - 1;
+  const ShuffleEntry* const table_end = table + num_entries;
+  for (const ShuffleEntry* candidate = table; candidate != table_end;
+       ++candidate) {
+    // Stop scanning this row as soon as one lane disagrees.
+    bool all_lanes_match = true;
+    for (int lane = 0; all_lanes_match && lane < kSimd128Size; ++lane) {
+      all_lanes_match = (candidate->shuffle[lane] & lane_mask) ==
+                        (shuffle[lane] & lane_mask);
+    }
+    if (!all_lanes_match) continue;
+    *arch_shuffle = candidate;
+    return true;
+  }
+  return false;
+}
+
 }  // namespace

 // TODO(bbudge) Make sure identity shuffle emits no instructions.
@ -2090,40 +2197,51 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
  bool use_avx = CpuFeatures::IsSupported(AVX);
  // AVX and swizzles don't generally need DefineSameAsFirst to avoid a move.
  bool no_same_as_first = use_avx || is_swizzle;
-  // We generally need UseRegister for the first source.
-  bool no_use_register = false;
+  // We generally need UseRegister for input0, Use for input1.
+  bool src0_needs_reg = true;
+  bool src1_needs_reg = false;
  ArchOpcode opcode = kIA32S8x16Shuffle;  // general shuffle is the default

  uint8_t offset;
  uint8_t shuffle32x4[4];
  uint8_t shuffle16x8[8];
+  const ShuffleEntry* arch_shuffle;
  if (TryMatchConcat(shuffle, &offset)) {
    // Swap inputs from the normal order for (v)palignr.
    SwapShuffleInputs(node);
    is_swizzle = false;  // It's simpler to just handle the general case.
    no_same_as_first = use_avx;  // SSE requires same-as-first.
-    opcode = use_avx ? kAVXS8x16Alignr : kSSES8x16Alignr;
+    opcode = kIA32S8x16Alignr;
    // palignr takes a single imm8 offset.
    imms[imm_count++] = offset;
+  } else if (TryMatchArchShuffle(shuffle, arch_shuffles,
+                                 arraysize(arch_shuffles), is_swizzle,
+                                 &arch_shuffle)) {
+    opcode = use_avx ? arch_shuffle->avx_opcode : arch_shuffle->opcode;
+    src0_needs_reg = arch_shuffle->src0_needs_reg;
+    // SSE can't take advantage of both operands in registers and needs
+    // same-as-first.
+    src1_needs_reg = use_avx && arch_shuffle->src1_needs_reg;
+    no_same_as_first = use_avx;
  } else if (TryMatch32x4Shuffle(shuffle, shuffle32x4)) {
    uint8_t shuffle_mask = PackShuffle4(shuffle32x4);
    if (is_swizzle) {
      // pshufd takes a single imm8 shuffle mask.
      opcode = kIA32S32x4Swizzle;
      no_same_as_first = true;
-      no_use_register = true;
+      src0_needs_reg = false;
      imms[imm_count++] = shuffle_mask;
    } else {
      // 2 operand shuffle
      // A blend is more efficient than a general 32x4 shuffle; try it first.
      if (TryMatchBlend(shuffle)) {
-        opcode = use_avx ? kAVXS16x8Blend : kSSES16x8Blend;
+        opcode = kIA32S16x8Blend;
        uint8_t blend_mask = PackBlend4(shuffle32x4);
        imms[imm_count++] = blend_mask;
      } else {
        opcode = kIA32S32x4Shuffle;
        no_same_as_first = true;
-        no_use_register = true;
+        src0_needs_reg = false;
        imms[imm_count++] = shuffle_mask;
        int8_t blend_mask = PackBlend4(shuffle32x4);
        imms[imm_count++] = blend_mask;
@ -2132,14 +2250,14 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
  } else if (TryMatch16x8Shuffle(shuffle, shuffle16x8)) {
    uint8_t blend_mask;
    if (TryMatchBlend(shuffle)) {
-      opcode = use_avx ? kAVXS16x8Blend : kSSES16x8Blend;
+      opcode = kIA32S16x8Blend;
      blend_mask = PackBlend8(shuffle16x8);
      imms[imm_count++] = blend_mask;
-    } else if (Is16x8BlendedShuffle(shuffle16x8, &blend_mask)) {
+    } else if (TryMatch16x8HalfShuffle(shuffle16x8, &blend_mask)) {
      opcode = is_swizzle ? kIA32S16x8HalfShuffle1 : kIA32S16x8HalfShuffle2;
      // Half-shuffles don't need DefineSameAsFirst or UseRegister(src0).
      no_same_as_first = true;
-      no_use_register = true;
+      src0_needs_reg = false;
      uint8_t mask_lo = PackShuffle4(shuffle16x8);
      uint8_t mask_hi = PackShuffle4(shuffle16x8 + 4);
      imms[imm_count++] = mask_lo;
@ -2150,7 +2268,7 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
  if (opcode == kIA32S8x16Shuffle) {
    // Use same-as-first for general swizzle, but not shuffle.
    no_same_as_first = !is_swizzle;
-    no_use_register = no_same_as_first;
+    src0_needs_reg = !no_same_as_first;
    imms[imm_count++] = Pack4Lanes(shuffle);
    imms[imm_count++] = Pack4Lanes(shuffle + 4);
    imms[imm_count++] = Pack4Lanes(shuffle + 8);
@ -2164,13 +2282,15 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
  InstructionOperand dst =
      no_same_as_first ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node);
  InstructionOperand src0 =
-      no_use_register ? g.Use(input0) : g.UseRegister(input0);
+      src0_needs_reg ? g.UseRegister(input0) : g.Use(input0);

  int input_count = 0;
  InstructionOperand inputs[2 + kMaxImms + kMaxTemps];
  inputs[input_count++] = src0;
  if (!is_swizzle) {
-    inputs[input_count++] = g.Use(node->InputAt(1));
+    Node* input1 = node->InputAt(1);
+    inputs[input_count++] =
+        src1_needs_reg ? g.UseRegister(input1) : g.Use(input1);
  }
  for (int i = 0; i < imm_count; ++i) {
    inputs[input_count++] = g.UseImmediate(imms[i]);
--- a/src/ia32/sse-instr.h
+++ b/src/ia32/sse-instr.h
@ -42,8 +42,14 @@
  V(psubsw, 66, 0F, E9)          \
  V(psubusb, 66, 0F, D8)         \
  V(psubusw, 66, 0F, D9)         \
-  V(punpckhdq, 66, 0F, 6A)       \
+  V(punpcklbw, 66, 0F, 60)       \
+  V(punpcklwd, 66, 0F, 61)       \
  V(punpckldq, 66, 0F, 62)       \
+  V(punpcklqdq, 66, 0F, 6C)      \
+  V(punpckhbw, 66, 0F, 68)       \
+  V(punpckhwd, 66, 0F, 69)       \
+  V(punpckhdq, 66, 0F, 6A)       \
+  V(punpckhqdq, 66, 0F, 6D)      \
  V(pxor, 66, 0F, EF)

 #define SSSE3_INSTRUCTION_LIST(V) \
--- a/test/cctest/wasm/test-run-wasm-simd.cc
+++ b/test/cctest/wasm/test-run-wasm-simd.cc
@ -1874,6 +1874,7 @@ WASM_SIMD_COMPILED_AND_LOWERED_TEST(S16x8TransposeRight) {
      {{2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31}});
 }

+// TODO(simd) 'Reverse' tests should be 2-operand shuffles, not swizzles.
 WASM_SIMD_COMPILED_AND_LOWERED_TEST(S16x4Reverse) {
  RunShuffleOpTest<int8_t>(
      execution_mode, lower_simd, kExprS8x16Shuffle,