[x64] Wasm SIMD x64 Conversion, AllTrue/AnyTrue operations

This CL mirrors the ia32 SIMD conversion, AllTrue/AnyTrue operations with minor cleanliness changes to use TempRegisters instead of ScratchRegisters. Change-Id: I84d3e148200dd611a72380b24404b75c73c5352d Reviewed-on: https://chromium-review.googlesource.com/1174096 Commit-Queue: Deepti Gandluri <gdeepti@chromium.org> Reviewed-by: Bill Budge <bbudge@chromium.org> Cr-Commit-Position: refs/heads/master@{#56297}
2018-09-28 15:58:43 -07:00 · 2018-09-28 15:58:43 -07:00 · caf93c9f6f
commit caf93c9f6f
parent 02a6727bb8
7 changed files with 320 additions and 105 deletions
--- a/src/compiler/instruction-selector.cc
+++ b/src/compiler/instruction-selector.cc
@ -2427,18 +2427,6 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
 }
 #endif  // !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM

-#if !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS && \
-    !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_IA32
-void InstructionSelector::VisitF32x4SConvertI32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitF32x4UConvertI32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-#endif  // !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS
-        // && !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_IA32
-
 #if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS64
 void InstructionSelector::VisitWord64AtomicLoad(Node* node) { UNIMPLEMENTED(); }

@ -2467,61 +2455,6 @@ void InstructionSelector::VisitWord64AtomicCompareExchange(Node* node) {

 #if !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS && \
    !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_IA32
-void InstructionSelector::VisitI32x4SConvertF32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI32x4UConvertF32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI32x4SConvertI16x8Low(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI32x4SConvertI16x8High(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI32x4UConvertI16x8Low(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI32x4UConvertI16x8High(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI16x8SConvertI8x16Low(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI16x8SConvertI8x16High(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI16x8UConvertI8x16Low(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI16x8UConvertI8x16High(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI16x8SConvertI32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-void InstructionSelector::VisitI16x8UConvertI32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI8x16SConvertI16x8(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI8x16UConvertI16x8(Node* node) {
-  UNIMPLEMENTED();
-}
-
 void InstructionSelector::VisitI8x16Shl(Node* node) { UNIMPLEMENTED(); }

 void InstructionSelector::VisitI8x16ShrS(Node* node) { UNIMPLEMENTED(); }
@ -2531,18 +2464,6 @@ void InstructionSelector::VisitI8x16ShrU(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitI8x16Mul(Node* node) { UNIMPLEMENTED(); }

 void InstructionSelector::VisitS8x16Shuffle(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x4AnyTrue(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x4AllTrue(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x8AnyTrue(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x8AllTrue(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x16AnyTrue(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x16AllTrue(Node* node) { UNIMPLEMENTED(); }
 #endif  // !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS
        // && !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_IA32

--- a/src/compiler/x64/code-generator-x64.cc
+++ b/src/compiler/x64/code-generator-x64.cc
@ -2137,6 +2137,25 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ insertps(i.OutputSimd128Register(), i.InputDoubleRegister(2), select);
      break;
    }
+    case kX64F32x4SConvertI32x4: {
+      __ cvtdq2ps(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      break;
+    }
+    case kX64F32x4UConvertI32x4: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      DCHECK_NE(i.OutputSimd128Register(), kScratchDoubleReg);
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ pxor(kScratchDoubleReg, kScratchDoubleReg);      // zeros
+      __ pblendw(kScratchDoubleReg, dst, 0x55);           // get lo 16 bits
+      __ psubd(dst, kScratchDoubleReg);                   // get hi 16 bits
+      __ cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // convert lo exactly
+      __ psrld(dst, 1);                  // divide by 2 to get in unsigned range
+      __ cvtdq2ps(dst, dst);             // convert hi exactly
+      __ addps(dst, dst);                // double hi, exactly
+      __ addps(dst, kScratchDoubleReg);  // add hi and lo, may round.
+      break;
+    }
    case kX64F32x4Abs: {
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src = i.InputSimd128Register(0);
@ -2245,6 +2264,36 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      }
      break;
    }
+    case kX64I32x4SConvertF32x4: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      XMMRegister dst = i.OutputSimd128Register();
+      // NAN->0
+      __ movaps(kScratchDoubleReg, dst);
+      __ cmpeqps(kScratchDoubleReg, kScratchDoubleReg);
+      __ pand(dst, kScratchDoubleReg);
+      // Set top bit if >= 0 (but not -0.0!)
+      __ pxor(kScratchDoubleReg, dst);
+      // Convert
+      __ cvttps2dq(dst, dst);
+      // Set top bit if >=0 is now < 0
+      __ pand(kScratchDoubleReg, dst);
+      __ psrad(kScratchDoubleReg, 31);
+      // Set positive overflow lanes to 0x7FFFFFFF
+      __ pxor(dst, kScratchDoubleReg);
+      break;
+    }
+    case kX64I32x4SConvertI16x8Low: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      __ pmovsxwd(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      break;
+    }
+    case kX64I32x4SConvertI16x8High: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ palignr(dst, i.InputSimd128Register(0), 8);
+      __ pmovsxwd(dst, dst);
+      break;
+    }
    case kX64I32x4Neg: {
      CpuFeatureScope sse_scope(tasm(), SSSE3);
      XMMRegister dst = i.OutputSimd128Register();
@ -2316,6 +2365,46 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ pcmpeqd(dst, src);
      break;
    }
+    case kX64I32x4UConvertF32x4: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister tmp = i.ToSimd128Register(instr->TempAt(0));
+      // NAN->0, negative->0
+      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
+      __ maxps(dst, kScratchDoubleReg);
+      // scratch: float representation of max_signed
+      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
+      __ psrld(kScratchDoubleReg, 1);                     // 0x7fffffff
+      __ cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // 0x4f000000
+      // tmp: convert (src-max_signed).
+      // Positive overflow lanes -> 0x7FFFFFFF
+      // Negative lanes -> 0
+      __ movaps(tmp, dst);
+      __ subps(tmp, kScratchDoubleReg);
+      __ cmpleps(kScratchDoubleReg, tmp);
+      __ cvttps2dq(tmp, tmp);
+      __ pxor(tmp, kScratchDoubleReg);
+      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
+      __ pmaxsd(tmp, kScratchDoubleReg);
+      // convert. Overflow lanes above max_signed will be 0x80000000
+      __ cvttps2dq(dst, dst);
+      // Add (src-max_signed) for overflow lanes.
+      __ paddd(dst, tmp);
+      break;
+    }
+    case kX64I32x4UConvertI16x8Low: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      __ pmovzxwd(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      break;
+    }
+    case kX64I32x4UConvertI16x8High: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ palignr(dst, i.InputSimd128Register(0), 8);
+      __ pmovzxwd(dst, dst);
+      break;
+    }
    case kX64I32x4ShrU: {
      __ psrld(i.OutputSimd128Register(), i.InputInt8(1));
      break;
@ -2377,6 +2466,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      }
      break;
    }
+    case kX64I16x8SConvertI8x16Low: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      __ pmovsxbw(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      break;
+    }
+    case kX64I16x8SConvertI8x16High: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ palignr(dst, i.InputSimd128Register(0), 8);
+      __ pmovsxbw(dst, dst);
+      break;
+    }
    case kX64I16x8Neg: {
      CpuFeatureScope sse_scope(tasm(), SSSE3);
      XMMRegister dst = i.OutputSimd128Register();
@ -2398,6 +2499,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ psraw(i.OutputSimd128Register(), i.InputInt8(1));
      break;
    }
+    case kX64I16x8SConvertI32x4: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      __ packssdw(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      break;
+    }
    case kX64I16x8Add: {
      __ paddw(i.OutputSimd128Register(), i.InputSimd128Register(1));
      break;
@ -2456,10 +2562,34 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ pcmpeqw(dst, src);
      break;
    }
+    case kX64I16x8UConvertI8x16Low: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      __ pmovzxbw(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      break;
+    }
+    case kX64I16x8UConvertI8x16High: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ palignr(dst, i.InputSimd128Register(0), 8);
+      __ pmovzxbw(dst, dst);
+      break;
+    }
    case kX64I16x8ShrU: {
      __ psrlw(i.OutputSimd128Register(), i.InputInt8(1));
      break;
    }
+    case kX64I16x8UConvertI32x4: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      // Change negative lanes to 0x7FFFFFFF
+      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
+      __ psrld(kScratchDoubleReg, 1);
+      __ pminud(dst, kScratchDoubleReg);
+      __ pminud(kScratchDoubleReg, i.InputSimd128Register(1));
+      __ packusdw(dst, kScratchDoubleReg);
+      break;
+    }
    case kX64I16x8AddSaturateU: {
      __ paddusw(i.OutputSimd128Register(), i.InputSimd128Register(1));
      break;
@ -2521,6 +2651,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      }
      break;
    }
+    case kX64I8x16SConvertI16x8: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      __ packsswb(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      break;
+    }
    case kX64I8x16Neg: {
      CpuFeatureScope sse_scope(tasm(), SSSE3);
      XMMRegister dst = i.OutputSimd128Register();
@ -2582,6 +2717,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ pcmpeqb(dst, src);
      break;
    }
+    case kX64I8x16UConvertI16x8: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      // Change negative lanes to 0x7FFF
+      __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
+      __ psrlw(kScratchDoubleReg, 1);
+      __ pminuw(dst, kScratchDoubleReg);
+      __ pminuw(kScratchDoubleReg, i.InputSimd128Register(1));
+      __ packuswb(dst, kScratchDoubleReg);
+      break;
+    }
    case kX64I8x16AddSaturateU: {
      __ paddusb(i.OutputSimd128Register(), i.InputSimd128Register(1));
      break;
@ -2653,6 +2800,42 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ xorps(dst, i.InputSimd128Register(2));
      break;
    }
+    case kX64S1x4AnyTrue:
+    case kX64S1x8AnyTrue:
+    case kX64S1x16AnyTrue: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      Register dst = i.OutputRegister();
+      XMMRegister src = i.InputSimd128Register(0);
+      Register tmp = i.TempRegister(0);
+      __ xorq(tmp, tmp);
+      __ movq(dst, Immediate(-1));
+      __ ptest(src, src);
+      __ cmovq(zero, dst, tmp);
+      break;
+    }
+    case kX64S1x4AllTrue:
+    case kX64S1x8AllTrue:
+    case kX64S1x16AllTrue: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      Register dst = i.OutputRegister();
+      XMMRegister src = i.InputSimd128Register(0);
+      Register tmp = i.TempRegister(0);
+      __ movq(tmp, Immediate(-1));
+      __ xorq(dst, dst);
+      // Compare all src lanes to false.
+      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
+      if (arch_opcode == kX64S1x4AllTrue) {
+        __ pcmpeqd(kScratchDoubleReg, src);
+      } else if (arch_opcode == kX64S1x8AllTrue) {
+        __ pcmpeqw(kScratchDoubleReg, src);
+      } else {
+        __ pcmpeqb(kScratchDoubleReg, src);
+      }
+      // If kScratchDoubleReg is all zero, none of src lanes are false.
+      __ ptest(kScratchDoubleReg, kScratchDoubleReg);
+      __ cmovq(zero, dst, tmp);
+      break;
+    }
    case kX64StackCheck:
      __ CompareRoot(rsp, RootIndex::kStackLimit);
      break;
--- a/src/compiler/x64/instruction-codes-x64.h
+++ b/src/compiler/x64/instruction-codes-x64.h
@ -151,6 +151,8 @@ namespace compiler {
  V(X64F32x4Splat)                        \
  V(X64F32x4ExtractLane)                  \
  V(X64F32x4ReplaceLane)                  \
+  V(X64F32x4SConvertI32x4)                \
+  V(X64F32x4UConvertI32x4)                \
  V(X64F32x4Abs)                          \
  V(X64F32x4Neg)                          \
  V(X64F32x4RecipApprox)                  \
@ -168,6 +170,9 @@ namespace compiler {
  V(X64I32x4Splat)                        \
  V(X64I32x4ExtractLane)                  \
  V(X64I32x4ReplaceLane)                  \
+  V(X64I32x4SConvertF32x4)                \
+  V(X64I32x4SConvertI16x8Low)             \
+  V(X64I32x4SConvertI16x8High)            \
  V(X64I32x4Neg)                          \
  V(X64I32x4Shl)                          \
  V(X64I32x4ShrS)                         \
@ -181,6 +186,9 @@ namespace compiler {
  V(X64I32x4Ne)                           \
  V(X64I32x4GtS)                          \
  V(X64I32x4GeS)                          \
+  V(X64I32x4UConvertF32x4)                \
+  V(X64I32x4UConvertI16x8Low)             \
+  V(X64I32x4UConvertI16x8High)            \
  V(X64I32x4ShrU)                         \
  V(X64I32x4MinU)                         \
  V(X64I32x4MaxU)                         \
@ -189,9 +197,12 @@ namespace compiler {
  V(X64I16x8Splat)                        \
  V(X64I16x8ExtractLane)                  \
  V(X64I16x8ReplaceLane)                  \
+  V(X64I16x8SConvertI8x16Low)             \
+  V(X64I16x8SConvertI8x16High)            \
  V(X64I16x8Neg)                          \
  V(X64I16x8Shl)                          \
  V(X64I16x8ShrS)                         \
+  V(X64I16x8SConvertI32x4)                \
  V(X64I16x8Add)                          \
  V(X64I16x8AddSaturateS)                 \
  V(X64I16x8AddHoriz)                     \
@ -204,7 +215,10 @@ namespace compiler {
  V(X64I16x8Ne)                           \
  V(X64I16x8GtS)                          \
  V(X64I16x8GeS)                          \
+  V(X64I16x8UConvertI8x16Low)             \
+  V(X64I16x8UConvertI8x16High)            \
  V(X64I16x8ShrU)                         \
+  V(X64I16x8UConvertI32x4)                \
  V(X64I16x8AddSaturateU)                 \
  V(X64I16x8SubSaturateU)                 \
  V(X64I16x8MinU)                         \
@ -214,6 +228,7 @@ namespace compiler {
  V(X64I8x16Splat)                        \
  V(X64I8x16ExtractLane)                  \
  V(X64I8x16ReplaceLane)                  \
+  V(X64I8x16SConvertI16x8)                \
  V(X64I8x16Neg)                          \
  V(X64I8x16Add)                          \
  V(X64I8x16AddSaturateS)                 \
@ -225,6 +240,7 @@ namespace compiler {
  V(X64I8x16Ne)                           \
  V(X64I8x16GtS)                          \
  V(X64I8x16GeS)                          \
+  V(X64I8x16UConvertI16x8)                \
  V(X64I8x16AddSaturateU)                 \
  V(X64I8x16SubSaturateU)                 \
  V(X64I8x16MinU)                         \
@ -237,6 +253,12 @@ namespace compiler {
  V(X64S128Not)                           \
  V(X64S128Select)                        \
  V(X64S128Zero)                          \
+  V(X64S1x4AnyTrue)                       \
+  V(X64S1x4AllTrue)                       \
+  V(X64S1x8AnyTrue)                       \
+  V(X64S1x8AllTrue)                       \
+  V(X64S1x16AnyTrue)                      \
+  V(X64S1x16AllTrue)                      \
  V(X64Word64AtomicLoadUint8)             \
  V(X64Word64AtomicLoadUint16)            \
  V(X64Word64AtomicLoadUint32)            \
--- a/src/compiler/x64/instruction-scheduler-x64.cc
+++ b/src/compiler/x64/instruction-scheduler-x64.cc
@ -128,6 +128,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kX64F32x4Splat:
    case kX64F32x4ExtractLane:
    case kX64F32x4ReplaceLane:
+    case kX64F32x4SConvertI32x4:
+    case kX64F32x4UConvertI32x4:
    case kX64F32x4RecipApprox:
    case kX64F32x4RecipSqrtApprox:
    case kX64F32x4Abs:
@ -145,6 +147,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kX64I32x4Splat:
    case kX64I32x4ExtractLane:
    case kX64I32x4ReplaceLane:
+    case kX64I32x4SConvertF32x4:
+    case kX64I32x4SConvertI16x8Low:
+    case kX64I32x4SConvertI16x8High:
    case kX64I32x4Neg:
    case kX64I32x4Shl:
    case kX64I32x4ShrS:
@ -158,6 +163,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kX64I32x4Ne:
    case kX64I32x4GtS:
    case kX64I32x4GeS:
+    case kX64I32x4UConvertF32x4:
+    case kX64I32x4UConvertI16x8Low:
+    case kX64I32x4UConvertI16x8High:
    case kX64I32x4ShrU:
    case kX64I32x4MinU:
    case kX64I32x4MaxU:
@ -166,9 +174,12 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kX64I16x8Splat:
    case kX64I16x8ExtractLane:
    case kX64I16x8ReplaceLane:
+    case kX64I16x8SConvertI8x16Low:
+    case kX64I16x8SConvertI8x16High:
    case kX64I16x8Neg:
    case kX64I16x8Shl:
    case kX64I16x8ShrS:
+    case kX64I16x8SConvertI32x4:
    case kX64I16x8Add:
    case kX64I16x8AddSaturateS:
    case kX64I16x8AddHoriz:
@ -181,6 +192,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kX64I16x8Ne:
    case kX64I16x8GtS:
    case kX64I16x8GeS:
+    case kX64I16x8UConvertI8x16Low:
+    case kX64I16x8UConvertI8x16High:
+    case kX64I16x8UConvertI32x4:
    case kX64I16x8ShrU:
    case kX64I16x8AddSaturateU:
    case kX64I16x8SubSaturateU:
@ -191,6 +205,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kX64I8x16Splat:
    case kX64I8x16ExtractLane:
    case kX64I8x16ReplaceLane:
+    case kX64I8x16SConvertI16x8:
    case kX64I8x16Neg:
    case kX64I8x16Add:
    case kX64I8x16AddSaturateS:
@ -202,6 +217,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kX64I8x16Ne:
    case kX64I8x16GtS:
    case kX64I8x16GeS:
+    case kX64I8x16UConvertI16x8:
    case kX64I8x16AddSaturateU:
    case kX64I8x16SubSaturateU:
    case kX64I8x16MinU:
@ -214,6 +230,12 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kX64S128Not:
    case kX64S128Select:
    case kX64S128Zero:
+    case kX64S1x4AnyTrue:
+    case kX64S1x4AllTrue:
+    case kX64S1x8AnyTrue:
+    case kX64S1x8AllTrue:
+    case kX64S1x16AnyTrue:
+    case kX64S1x16AllTrue:
      return (instr->addressing_mode() == kMode_None)
          ? kNoOpcodeFlags
          : kIsLoadOperation | kHasSideEffect;
--- a/src/compiler/x64/instruction-selector-x64.cc
+++ b/src/compiler/x64/instruction-selector-x64.cc
@ -2496,6 +2496,7 @@ VISIT_ATOMIC_BINOP(Xor)
  V(I32x4MaxU)             \
  V(I32x4GtU)              \
  V(I32x4GeU)              \
+  V(I16x8SConvertI32x4)    \
  V(I16x8Add)              \
  V(I16x8AddSaturateS)     \
  V(I16x8AddHoriz)         \
@ -2514,6 +2515,7 @@ VISIT_ATOMIC_BINOP(Xor)
  V(I16x8MaxU)             \
  V(I16x8GtU)              \
  V(I16x8GeU)              \
+  V(I8x16SConvertI16x8)    \
  V(I8x16Add)              \
  V(I8x16AddSaturateS)     \
  V(I8x16Sub)              \
@ -2534,14 +2536,23 @@ VISIT_ATOMIC_BINOP(Xor)
  V(S128Or)                \
  V(S128Xor)

-#define SIMD_UNOP_LIST(V) \
-  V(F32x4Abs)             \
-  V(F32x4Neg)             \
-  V(F32x4RecipApprox)     \
-  V(F32x4RecipSqrtApprox) \
-  V(I32x4Neg)             \
-  V(I16x8Neg)             \
-  V(I8x16Neg)             \
+#define SIMD_UNOP_LIST(V)   \
+  V(F32x4SConvertI32x4)     \
+  V(F32x4Abs)               \
+  V(F32x4Neg)               \
+  V(F32x4RecipApprox)       \
+  V(F32x4RecipSqrtApprox)   \
+  V(I32x4SConvertI16x8Low)  \
+  V(I32x4SConvertI16x8High) \
+  V(I32x4Neg)               \
+  V(I32x4UConvertI16x8Low)  \
+  V(I32x4UConvertI16x8High) \
+  V(I16x8SConvertI8x16Low)  \
+  V(I16x8SConvertI8x16High) \
+  V(I16x8Neg)               \
+  V(I16x8UConvertI8x16Low)  \
+  V(I16x8UConvertI8x16High) \
+  V(I8x16Neg)               \
  V(S128Not)

 #define SIMD_SHIFT_OPCODES(V) \
@ -2552,6 +2563,16 @@ VISIT_ATOMIC_BINOP(Xor)
  V(I16x8ShrS)                \
  V(I16x8ShrU)

+#define SIMD_ANYTRUE_LIST(V) \
+  V(S1x4AnyTrue)             \
+  V(S1x8AnyTrue)             \
+  V(S1x16AnyTrue)
+
+#define SIMD_ALLTRUE_LIST(V) \
+  V(S1x4AllTrue)             \
+  V(S1x8AllTrue)             \
+  V(S1x16AllTrue)
+
 void InstructionSelector::VisitS128Zero(Node* node) {
  X64OperandGenerator g(this);
  Emit(kX64S128Zero, g.DefineAsRegister(node), g.DefineAsRegister(node));
@ -2596,6 +2617,7 @@ SIMD_TYPES(VISIT_SIMD_REPLACE_LANE)
  }
 SIMD_SHIFT_OPCODES(VISIT_SIMD_SHIFT)
 #undef VISIT_SIMD_SHIFT
+#undef SIMD_SHIFT_OPCODES

 #define VISIT_SIMD_UNOP(Opcode)                         \
  void InstructionSelector::Visit##Opcode(Node* node) { \
@ -2605,6 +2627,7 @@ SIMD_SHIFT_OPCODES(VISIT_SIMD_SHIFT)
  }
 SIMD_UNOP_LIST(VISIT_SIMD_UNOP)
 #undef VISIT_SIMD_UNOP
+#undef SIMD_UNOP_LIST

 #define VISIT_SIMD_BINOP(Opcode)                                            \
  void InstructionSelector::Visit##Opcode(Node* node) {                     \
@ -2614,10 +2637,30 @@ SIMD_UNOP_LIST(VISIT_SIMD_UNOP)
  }
 SIMD_BINOP_LIST(VISIT_SIMD_BINOP)
 #undef VISIT_SIMD_BINOP
-#undef SIMD_TYPES
 #undef SIMD_BINOP_LIST
-#undef SIMD_UNOP_LIST
-#undef SIMD_SHIFT_OPCODES
+
+// Defines Visit<Opcode> for every entry of SIMD_ANYTRUE_LIST. Each visitor
+// emits kX64<Opcode> with a register output, the vector input in a unique
+// register, and one temp register; the code generator for these opcodes
+// reads that temp via i.TempRegister(0) and zeroes it to serve as the cmov
+// source when ptest reports an all-zero vector.
+#define VISIT_SIMD_ANYTRUE(Opcode)                                        \
+  void InstructionSelector::Visit##Opcode(Node* node) {                   \
+    X64OperandGenerator g(this);                                          \
+    InstructionOperand temps[] = {g.TempRegister()};                      \
+    Emit(kX64##Opcode, g.DefineAsRegister(node),                          \
+         g.UseUniqueRegister(node->InputAt(0)), arraysize(temps), temps); \
+  }
+SIMD_ANYTRUE_LIST(VISIT_SIMD_ANYTRUE)
+#undef VISIT_SIMD_ANYTRUE
+#undef SIMD_ANYTRUE_LIST
+
+// Defines Visit<Opcode> for every entry of SIMD_ALLTRUE_LIST. The operand
+// setup is identical to the AnyTrue visitors: register output, vector input
+// in a unique register, one temp register. The code generator loads -1 into
+// the temp (i.TempRegister(0)) and conditionally moves it into the zeroed
+// output when no lane of the input compares equal to zero.
+#define VISIT_SIMD_ALLTRUE(Opcode)                                        \
+  void InstructionSelector::Visit##Opcode(Node* node) {                   \
+    X64OperandGenerator g(this);                                          \
+    InstructionOperand temps[] = {g.TempRegister()};                      \
+    Emit(kX64##Opcode, g.DefineAsRegister(node),                          \
+         g.UseUniqueRegister(node->InputAt(0)), arraysize(temps), temps); \
+  }
+SIMD_ALLTRUE_LIST(VISIT_SIMD_ALLTRUE)
+#undef VISIT_SIMD_ALLTRUE
+#undef SIMD_ALLTRUE_LIST
+#undef SIMD_TYPES

 void InstructionSelector::VisitS128Select(Node* node) {
  X64OperandGenerator g(this);
@ -2626,6 +2669,36 @@ void InstructionSelector::VisitS128Select(Node* node) {
       g.UseRegister(node->InputAt(2)));
 }

+// Selects kX64F32x4UConvertI32x4. The output is constrained to the register
+// of input 0 because the code generator for this opcode DCHECKs that output
+// and input 0 are the same register and rewrites that register in place,
+// using only kScratchDoubleReg as scratch.
+void InstructionSelector::VisitF32x4UConvertI32x4(Node* node) {
+  X64OperandGenerator g(this);
+  Emit(kX64F32x4UConvertI32x4, g.DefineSameAsFirst(node),
+       g.UseRegister(node->InputAt(0)));
+}
+
+// Selects kX64I32x4SConvertF32x4. The output must alias input 0: the code
+// generator DCHECKs this and performs the NaN masking, truncating convert
+// and positive-overflow fix-up directly on that register.
+void InstructionSelector::VisitI32x4SConvertF32x4(Node* node) {
+  X64OperandGenerator g(this);
+  Emit(kX64I32x4SConvertF32x4, g.DefineSameAsFirst(node),
+       g.UseRegister(node->InputAt(0)));
+}
+
+// Selects kX64I32x4UConvertF32x4. The code generator for this opcode
+// converts in place (it DCHECKs that output and input 0 share a register)
+// and additionally reads a Simd128 scratch from instr->TempAt(0), so a SIMD
+// temp has to be reserved here; emitting the instruction without temps
+// would make that TempAt(0) access out of range.
+void InstructionSelector::VisitI32x4UConvertF32x4(Node* node) {
+  X64OperandGenerator g(this);
+  InstructionOperand temps[] = {g.TempSimd128Register()};
+  Emit(kX64I32x4UConvertF32x4, g.DefineSameAsFirst(node),
+       g.UseRegister(node->InputAt(0)), arraysize(temps), temps);
+}
+
+// Selects kX64I16x8UConvertI32x4, which takes two vector inputs. The output
+// aliases input 0 (DCHECKed by the code generator); the code generator
+// clamps both inputs against 0x7FFFFFFF with pminud and then packs them into
+// the output with packusdw, using kScratchDoubleReg for the second input.
+void InstructionSelector::VisitI16x8UConvertI32x4(Node* node) {
+  X64OperandGenerator g(this);
+  Emit(kX64I16x8UConvertI32x4, g.DefineSameAsFirst(node),
+       g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
+}
+
+// Selects kX64I8x16UConvertI16x8, which takes two vector inputs. The output
+// aliases input 0 (DCHECKed by the code generator); the code generator
+// clamps both inputs against 0x7FFF with pminuw and then packs them into the
+// output with packuswb, using kScratchDoubleReg for the second input.
+void InstructionSelector::VisitI8x16UConvertI16x8(Node* node) {
+  X64OperandGenerator g(this);
+  Emit(kX64I8x16UConvertI16x8, g.DefineSameAsFirst(node),
+       g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
+}
+
 void InstructionSelector::VisitInt32AbsWithOverflow(Node* node) {
  UNREACHABLE();
 }
--- a/src/x64/assembler-x64.cc
+++ b/src/x64/assembler-x64.cc
@ -83,7 +83,10 @@ void CpuFeatures::ProbeImpl(bool cross_compile) {
  // Only use statically determined features for cross compile (snapshot).
  if (cross_compile) return;

-  if (cpu.has_sse41() && FLAG_enable_sse4_1) supported_ |= 1u << SSE4_1;
+  if (cpu.has_sse41() && FLAG_enable_sse4_1) {
+    supported_ |= 1u << SSE4_1;
+    supported_ |= 1u << SSSE3;
+  }
  if (cpu.has_ssse3() && FLAG_enable_ssse3) supported_ |= 1u << SSSE3;
  if (cpu.has_sse3() && FLAG_enable_sse3) supported_ |= 1u << SSE3;
  // SAHF is not generally available in long mode.
@ -458,6 +461,9 @@ Assembler::Assembler(const AssemblerOptions& options, void* buffer,

  ReserveCodeTargetSpace(100);
  reloc_info_writer.Reposition(buffer_ + buffer_size_, pc_);
+  if (CpuFeatures::IsSupported(SSE4_1)) {
+    EnableCpuFeature(SSSE3);
+  }
 }

 void Assembler::GetCode(Isolate* isolate, CodeDesc* desc) {
--- a/test/cctest/wasm/test-run-wasm-simd.cc
+++ b/test/cctest/wasm/test-run-wasm-simd.cc
@ -1080,8 +1080,6 @@ WASM_SIMD_TEST(I32x4ShrU) {
                      LogicalShiftRight);
 }

-#if V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS || \
-    V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32
 // Tests both signed and unsigned conversion from I8x16 (unpacking).
 WASM_SIMD_TEST(I16x8ConvertI8x16) {
  WasmRunner<int32_t, int32_t, int32_t, int32_t, int32_t> r(execution_tier,
@ -1124,8 +1122,6 @@ WASM_SIMD_TEST(I16x8ConvertI8x16) {
    CHECK_EQ(1, r.Call(*i, unpacked_signed, unpacked_unsigned, 0));
  }
 }
-#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
-        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 void RunI16x8UnOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                      WasmOpcode simd_op, Int16UnOp expected_op) {
@ -1144,8 +1140,6 @@ WASM_SIMD_TEST(I16x8Neg) {
  RunI16x8UnOpTest(execution_tier, lower_simd, kExprI16x8Neg, Negate);
 }

-#if V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS || \
-    V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32
 // Tests both signed and unsigned conversion from I32x4 (packing).
 WASM_SIMD_TEST(I16x8ConvertI32x4) {
  WasmRunner<int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t> r(
@ -1190,8 +1184,6 @@ WASM_SIMD_TEST(I16x8ConvertI32x4) {
    }
  }
 }
-#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
-        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 void RunI16x8BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                       WasmOpcode simd_op, Int16BinOp expected_op) {
@ -1374,8 +1366,6 @@ WASM_SIMD_TEST(I8x16Neg) {
  RunI8x16UnOpTest(execution_tier, lower_simd, kExprI8x16Neg, Negate);
 }

-#if V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS || \
-    V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32
 // Tests both signed and unsigned conversion from I16x8 (packing).
 WASM_SIMD_TEST(I8x16ConvertI16x8) {
  WasmRunner<int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t> r(
@ -1422,8 +1412,6 @@ WASM_SIMD_TEST(I8x16ConvertI16x8) {
    }
  }
 }
-#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
-        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 void RunI8x16BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                       WasmOpcode simd_op, Int8BinOp expected_op) {
@ -2012,6 +2000,8 @@ WASM_SIMD_COMPILED_TEST(S8x16MultiShuffleFuzz) {
    }
  }
 }
+#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
+        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 // Boolean unary operations are 'AllTrue' and 'AnyTrue', which return an integer
 // result. Use relational ops on numeric vectors to create the boolean vector
@ -2099,8 +2089,6 @@ WASM_SIMD_TEST(SimdI32x4ExtractWithF32x4) {
               WASM_I32V(1), WASM_I32V(0)));
  CHECK_EQ(1, r.Call());
 }
-#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
-        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 WASM_SIMD_TEST(SimdF32x4ExtractWithI32x4) {
  WasmRunner<int32_t> r(execution_tier, lower_simd);