From caf93c9f6f79dabf4c0d1053aa2f5ca3e13095db Mon Sep 17 00:00:00 2001
From: Deepti Gandluri
Date: Fri, 28 Sep 2018 15:58:43 -0700
Subject: [PATCH] [x64] Wasm SIMD x64 Conversion, AllTrue/AnyTrue operations

This CL mirrors the ia32 SIMD conversion and AllTrue/AnyTrue operations,
with minor cleanliness changes to use TempRegisters instead of
ScratchRegisters.

Change-Id: I84d3e148200dd611a72380b24404b75c73c5352d
Reviewed-on: https://chromium-review.googlesource.com/1174096
Commit-Queue: Deepti Gandluri
Reviewed-by: Bill Budge
Cr-Commit-Position: refs/heads/master@{#56297}
---
 src/compiler/instruction-selector.cc          |  79 --------
 src/compiler/x64/code-generator-x64.cc        | 183 ++++++++++++++++++
 src/compiler/x64/instruction-codes-x64.h      |  22 +++
 src/compiler/x64/instruction-scheduler-x64.cc |  22 +++
 src/compiler/x64/instruction-selector-x64.cc  |  95 +++++++--
 src/x64/assembler-x64.cc                      |   8 +-
 test/cctest/wasm/test-run-wasm-simd.cc        |  16 +-
 7 files changed, 320 insertions(+), 105 deletions(-)

diff --git a/src/compiler/instruction-selector.cc b/src/compiler/instruction-selector.cc
index b13d322ab4..78c7d4f864 100644
--- a/src/compiler/instruction-selector.cc
+++ b/src/compiler/instruction-selector.cc
@@ -2427,18 +2427,6 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
 }
 #endif  // !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM

-#if !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS && \
-    !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_IA32
-void InstructionSelector::VisitF32x4SConvertI32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitF32x4UConvertI32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-#endif  // !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS
-        // && !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_IA32
-
 #if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS64
 void InstructionSelector::VisitWord64AtomicLoad(Node* node) { UNIMPLEMENTED(); }

@@ -2467,61 +2455,6 @@ void InstructionSelector::VisitWord64AtomicCompareExchange(Node* node) {

 #if !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS && \
     !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_IA32
-void InstructionSelector::VisitI32x4SConvertF32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI32x4UConvertF32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI32x4SConvertI16x8Low(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI32x4SConvertI16x8High(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI32x4UConvertI16x8Low(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI32x4UConvertI16x8High(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI16x8SConvertI8x16Low(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI16x8SConvertI8x16High(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI16x8UConvertI8x16Low(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI16x8UConvertI8x16High(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI16x8SConvertI32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-void InstructionSelector::VisitI16x8UConvertI32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI8x16SConvertI16x8(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI8x16UConvertI16x8(Node* node) {
-  UNIMPLEMENTED();
-}
-
 void InstructionSelector::VisitI8x16Shl(Node* node) { UNIMPLEMENTED(); }

 void InstructionSelector::VisitI8x16ShrS(Node* node) { UNIMPLEMENTED(); }
@@ -2531,18 +2464,6 @@ void InstructionSelector::VisitI8x16ShrU(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitI8x16Mul(Node* node) { UNIMPLEMENTED(); }

 void InstructionSelector::VisitS8x16Shuffle(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x4AnyTrue(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x4AllTrue(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x8AnyTrue(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x8AllTrue(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x16AnyTrue(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x16AllTrue(Node* node) { UNIMPLEMENTED(); }
 #endif  // !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS
         // && !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_IA32

diff --git a/src/compiler/x64/code-generator-x64.cc b/src/compiler/x64/code-generator-x64.cc
index f7616ee36e..178d2b33b9 100644
--- a/src/compiler/x64/code-generator-x64.cc
+++ b/src/compiler/x64/code-generator-x64.cc
@@ -2137,6 +2137,25 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ insertps(i.OutputSimd128Register(), i.InputDoubleRegister(2), select);
       break;
     }
+    case kX64F32x4SConvertI32x4: {
+      __ cvtdq2ps(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      break;
+    }
+    case kX64F32x4UConvertI32x4: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      DCHECK_NE(i.OutputSimd128Register(), kScratchDoubleReg);
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ pxor(kScratchDoubleReg, kScratchDoubleReg);      // zeros
+      __ pblendw(kScratchDoubleReg, dst, 0x55);           // get lo 16 bits
+      __ psubd(dst, kScratchDoubleReg);                   // get hi 16 bits
+      __ cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // convert lo exactly
+      __ psrld(dst, 1);                  // divide by 2 to get in unsigned range
+      __ cvtdq2ps(dst, dst);             // convert hi exactly
+      __ addps(dst, dst);                // double hi, exactly
+      __ addps(dst, kScratchDoubleReg);  // add hi and lo, may round.
+      break;
+    }
     case kX64F32x4Abs: {
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src = i.InputSimd128Register(0);
@@ -2245,6 +2264,36 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       }
       break;
     }
+    case kX64I32x4SConvertF32x4: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      XMMRegister dst = i.OutputSimd128Register();
+      // NAN->0
+      __ movaps(kScratchDoubleReg, dst);
+      __ cmpeqps(kScratchDoubleReg, kScratchDoubleReg);
+      __ pand(dst, kScratchDoubleReg);
+      // Set top bit if >= 0 (but not -0.0!)
+      __ pxor(kScratchDoubleReg, dst);
+      // Convert
+      __ cvttps2dq(dst, dst);
+      // Set top bit if >=0 is now < 0
+      __ pand(kScratchDoubleReg, dst);
+      __ psrad(kScratchDoubleReg, 31);
+      // Set positive overflow lanes to 0x7FFFFFFF
+      __ pxor(dst, kScratchDoubleReg);
+      break;
+    }
+    case kX64I32x4SConvertI16x8Low: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      __ pmovsxwd(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      break;
+    }
+    case kX64I32x4SConvertI16x8High: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ palignr(dst, i.InputSimd128Register(0), 8);
+      __ pmovsxwd(dst, dst);
+      break;
+    }
     case kX64I32x4Neg: {
       CpuFeatureScope sse_scope(tasm(), SSSE3);
       XMMRegister dst = i.OutputSimd128Register();
@@ -2316,6 +2365,46 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ pcmpeqd(dst, src);
       break;
     }
+    case kX64I32x4UConvertF32x4: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister tmp = i.ToSimd128Register(instr->TempAt(0));
+      // NAN->0, negative->0
+      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
+      __ maxps(dst, kScratchDoubleReg);
+      // scratch: float representation of max_signed
+      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
+      __ psrld(kScratchDoubleReg, 1);                     // 0x7fffffff
+      __ cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // 0x4f000000
+      // tmp: convert (src-max_signed).
+      // Positive overflow lanes -> 0x7FFFFFFF
+      // Negative lanes -> 0
+      __ movaps(tmp, dst);
+      __ subps(tmp, kScratchDoubleReg);
+      __ cmpleps(kScratchDoubleReg, tmp);
+      __ cvttps2dq(tmp, tmp);
+      __ pxor(tmp, kScratchDoubleReg);
+      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
+      __ pmaxsd(tmp, kScratchDoubleReg);
+      // convert. Overflow lanes above max_signed will be 0x80000000
+      __ cvttps2dq(dst, dst);
+      // Add (src-max_signed) for overflow lanes.
+      __ paddd(dst, tmp);
+      break;
+    }
+    case kX64I32x4UConvertI16x8Low: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      __ pmovzxwd(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      break;
+    }
+    case kX64I32x4UConvertI16x8High: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ palignr(dst, i.InputSimd128Register(0), 8);
+      __ pmovzxwd(dst, dst);
+      break;
+    }
     case kX64I32x4ShrU: {
       __ psrld(i.OutputSimd128Register(), i.InputInt8(1));
       break;
@@ -2377,6 +2466,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       }
       break;
     }
+    case kX64I16x8SConvertI8x16Low: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      __ pmovsxbw(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      break;
+    }
+    case kX64I16x8SConvertI8x16High: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ palignr(dst, i.InputSimd128Register(0), 8);
+      __ pmovsxbw(dst, dst);
+      break;
+    }
     case kX64I16x8Neg: {
       CpuFeatureScope sse_scope(tasm(), SSSE3);
       XMMRegister dst = i.OutputSimd128Register();
@@ -2398,6 +2499,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ psraw(i.OutputSimd128Register(), i.InputInt8(1));
       break;
     }
+    case kX64I16x8SConvertI32x4: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      __ packssdw(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      break;
+    }
     case kX64I16x8Add: {
       __ paddw(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
@@ -2456,10 +2562,34 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ pcmpeqw(dst, src);
       break;
     }
+    case kX64I16x8UConvertI8x16Low: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      __ pmovzxbw(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      break;
+    }
+    case kX64I16x8UConvertI8x16High: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ palignr(dst, i.InputSimd128Register(0), 8);
+      __ pmovzxbw(dst, dst);
+      break;
+    }
     case kX64I16x8ShrU: {
       __ psrlw(i.OutputSimd128Register(), i.InputInt8(1));
       break;
     }
+    case kX64I16x8UConvertI32x4: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      // Change negative lanes to 0x7FFFFFFF
+      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
+      __ psrld(kScratchDoubleReg, 1);
+      __ pminud(dst, kScratchDoubleReg);
+      __ pminud(kScratchDoubleReg, i.InputSimd128Register(1));
+      __ packusdw(dst, kScratchDoubleReg);
+      break;
+    }
     case kX64I16x8AddSaturateU: {
       __ paddusw(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
@@ -2521,6 +2651,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       }
       break;
     }
+    case kX64I8x16SConvertI16x8: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      __ packsswb(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      break;
+    }
     case kX64I8x16Neg: {
       CpuFeatureScope sse_scope(tasm(), SSSE3);
      XMMRegister dst = i.OutputSimd128Register();
@@ -2582,6 +2717,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ pcmpeqb(dst, src);
       break;
     }
+    case kX64I8x16UConvertI16x8: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      // Change negative lanes to 0x7FFF
+      __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
+      __ psrlw(kScratchDoubleReg, 1);
+      __ pminuw(dst, kScratchDoubleReg);
+      __ pminuw(kScratchDoubleReg, i.InputSimd128Register(1));
+      __ packuswb(dst, kScratchDoubleReg);
+      break;
+    }
     case kX64I8x16AddSaturateU: {
       __ paddusb(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
@@ -2653,6 +2800,42 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ xorps(dst, i.InputSimd128Register(2));
       break;
     }
+    case kX64S1x4AnyTrue:
+    case kX64S1x8AnyTrue:
+    case kX64S1x16AnyTrue: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      Register dst = i.OutputRegister();
+      XMMRegister src = i.InputSimd128Register(0);
+      Register tmp = i.TempRegister(0);
+      __ xorq(tmp, tmp);
+      __ movq(dst, Immediate(-1));
+      __ ptest(src, src);
+      __ cmovq(zero, dst, tmp);
+      break;
+    }
+    case kX64S1x4AllTrue:
+    case kX64S1x8AllTrue:
+    case kX64S1x16AllTrue: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      Register dst = i.OutputRegister();
+      XMMRegister src = i.InputSimd128Register(0);
+      Register tmp = i.TempRegister(0);
+      __ movq(tmp, Immediate(-1));
+      __ xorq(dst, dst);
+      // Compare all src lanes to false.
+      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
+      if (arch_opcode == kX64S1x4AllTrue) {
+        __ pcmpeqd(kScratchDoubleReg, src);
+      } else if (arch_opcode == kX64S1x8AllTrue) {
+        __ pcmpeqw(kScratchDoubleReg, src);
+      } else {
+        __ pcmpeqb(kScratchDoubleReg, src);
+      }
+      // If kScratchDoubleReg is all zero, none of src lanes are false.
+      __ ptest(kScratchDoubleReg, kScratchDoubleReg);
+      __ cmovq(zero, dst, tmp);
+      break;
+    }
     case kX64StackCheck:
       __ CompareRoot(rsp, RootIndex::kStackLimit);
       break;
diff --git a/src/compiler/x64/instruction-codes-x64.h b/src/compiler/x64/instruction-codes-x64.h
index 6a9e313f4e..c2a194e94a 100644
--- a/src/compiler/x64/instruction-codes-x64.h
+++ b/src/compiler/x64/instruction-codes-x64.h
@@ -151,6 +151,8 @@ namespace compiler {
   V(X64F32x4Splat)             \
   V(X64F32x4ExtractLane)       \
   V(X64F32x4ReplaceLane)       \
+  V(X64F32x4SConvertI32x4)     \
+  V(X64F32x4UConvertI32x4)     \
   V(X64F32x4Abs)               \
   V(X64F32x4Neg)               \
   V(X64F32x4RecipApprox)       \
@@ -168,6 +170,9 @@ namespace compiler {
   V(X64I32x4Splat)             \
   V(X64I32x4ExtractLane)       \
   V(X64I32x4ReplaceLane)       \
+  V(X64I32x4SConvertF32x4)     \
+  V(X64I32x4SConvertI16x8Low)  \
+  V(X64I32x4SConvertI16x8High) \
   V(X64I32x4Neg)               \
   V(X64I32x4Shl)               \
   V(X64I32x4ShrS)              \
@@ -181,6 +186,9 @@ namespace compiler {
   V(X64I32x4Ne)                \
   V(X64I32x4GtS)               \
   V(X64I32x4GeS)               \
+  V(X64I32x4UConvertF32x4)     \
+  V(X64I32x4UConvertI16x8Low)  \
+  V(X64I32x4UConvertI16x8High) \
   V(X64I32x4ShrU)              \
   V(X64I32x4MinU)              \
   V(X64I32x4MaxU)              \
@@ -189,9 +197,12 @@ namespace compiler {
   V(X64I16x8Splat)             \
   V(X64I16x8ExtractLane)       \
   V(X64I16x8ReplaceLane)       \
+  V(X64I16x8SConvertI8x16Low)  \
+  V(X64I16x8SConvertI8x16High) \
   V(X64I16x8Neg)               \
   V(X64I16x8Shl)               \
   V(X64I16x8ShrS)              \
+  V(X64I16x8SConvertI32x4)     \
   V(X64I16x8Add)               \
   V(X64I16x8AddSaturateS)      \
   V(X64I16x8AddHoriz)          \
@@ -204,7 +215,10 @@ namespace compiler {
   V(X64I16x8Ne)                \
   V(X64I16x8GtS)               \
   V(X64I16x8GeS)               \
+  V(X64I16x8UConvertI8x16Low)  \
+  V(X64I16x8UConvertI8x16High) \
   V(X64I16x8ShrU)              \
+  V(X64I16x8UConvertI32x4)     \
   V(X64I16x8AddSaturateU)      \
   V(X64I16x8SubSaturateU)      \
   V(X64I16x8MinU)              \
@@ -214,6 +228,7 @@ namespace compiler {
   V(X64I8x16Splat)             \
   V(X64I8x16ExtractLane)       \
   V(X64I8x16ReplaceLane)       \
+  V(X64I8x16SConvertI16x8)     \
   V(X64I8x16Neg)               \
   V(X64I8x16Add)               \
   V(X64I8x16AddSaturateS)      \
@@ -225,6 +240,7 @@ namespace compiler {
   V(X64I8x16Ne)                \
   V(X64I8x16GtS)               \
   V(X64I8x16GeS)               \
+  V(X64I8x16UConvertI16x8)     \
   V(X64I8x16AddSaturateU)      \
   V(X64I8x16SubSaturateU)      \
   V(X64I8x16MinU)              \
@@ -237,6 +253,12 @@ namespace compiler {
   V(X64S128Not)                \
   V(X64S128Select)             \
   V(X64S128Zero)               \
+  V(X64S1x4AnyTrue)            \
+  V(X64S1x4AllTrue)            \
+  V(X64S1x8AnyTrue)            \
+  V(X64S1x8AllTrue)            \
+  V(X64S1x16AnyTrue)           \
+  V(X64S1x16AllTrue)           \
   V(X64Word64AtomicLoadUint8)  \
   V(X64Word64AtomicLoadUint16) \
   V(X64Word64AtomicLoadUint32) \
diff --git a/src/compiler/x64/instruction-scheduler-x64.cc b/src/compiler/x64/instruction-scheduler-x64.cc
index b1f380badf..e5523fd49d 100644
--- a/src/compiler/x64/instruction-scheduler-x64.cc
+++ b/src/compiler/x64/instruction-scheduler-x64.cc
@@ -128,6 +128,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64F32x4Splat:
     case kX64F32x4ExtractLane:
     case kX64F32x4ReplaceLane:
+    case kX64F32x4SConvertI32x4:
+    case kX64F32x4UConvertI32x4:
     case kX64F32x4RecipApprox:
     case kX64F32x4RecipSqrtApprox:
     case kX64F32x4Abs:
@@ -145,6 +147,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I32x4Splat:
     case kX64I32x4ExtractLane:
     case kX64I32x4ReplaceLane:
+    case kX64I32x4SConvertF32x4:
+    case kX64I32x4SConvertI16x8Low:
+    case kX64I32x4SConvertI16x8High:
     case kX64I32x4Neg:
     case kX64I32x4Shl:
     case kX64I32x4ShrS:
@@ -158,6 +163,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I32x4Ne:
     case kX64I32x4GtS:
     case kX64I32x4GeS:
+    case kX64I32x4UConvertF32x4:
+    case kX64I32x4UConvertI16x8Low:
+    case kX64I32x4UConvertI16x8High:
     case kX64I32x4ShrU:
     case kX64I32x4MinU:
     case kX64I32x4MaxU:
@@ -166,9 +174,12 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I16x8Splat:
     case kX64I16x8ExtractLane:
     case kX64I16x8ReplaceLane:
+    case kX64I16x8SConvertI8x16Low:
+    case kX64I16x8SConvertI8x16High:
     case kX64I16x8Neg:
     case kX64I16x8Shl:
     case kX64I16x8ShrS:
+    case kX64I16x8SConvertI32x4:
     case kX64I16x8Add:
     case kX64I16x8AddSaturateS:
     case kX64I16x8AddHoriz:
@@ -181,6 +192,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I16x8Ne:
     case kX64I16x8GtS:
     case kX64I16x8GeS:
+    case kX64I16x8UConvertI8x16Low:
+    case kX64I16x8UConvertI8x16High:
+    case kX64I16x8UConvertI32x4:
     case kX64I16x8ShrU:
     case kX64I16x8AddSaturateU:
     case kX64I16x8SubSaturateU:
@@ -191,6 +205,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I8x16Splat:
     case kX64I8x16ExtractLane:
     case kX64I8x16ReplaceLane:
+    case kX64I8x16SConvertI16x8:
     case kX64I8x16Neg:
     case kX64I8x16Add:
     case kX64I8x16AddSaturateS:
@@ -202,6 +217,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I8x16Ne:
     case kX64I8x16GtS:
     case kX64I8x16GeS:
+    case kX64I8x16UConvertI16x8:
     case kX64I8x16AddSaturateU:
     case kX64I8x16SubSaturateU:
     case kX64I8x16MinU:
@@ -214,6 +230,12 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64S128Not:
     case kX64S128Select:
     case kX64S128Zero:
+    case kX64S1x4AnyTrue:
+    case kX64S1x4AllTrue:
+    case kX64S1x8AnyTrue:
+    case kX64S1x8AllTrue:
+    case kX64S1x16AnyTrue:
+    case kX64S1x16AllTrue:
       return (instr->addressing_mode() == kMode_None)
                  ? kNoOpcodeFlags
                  : kIsLoadOperation | kHasSideEffect;

diff --git a/src/compiler/x64/instruction-selector-x64.cc b/src/compiler/x64/instruction-selector-x64.cc
index 074f00fad7..211794ace8 100644
--- a/src/compiler/x64/instruction-selector-x64.cc
+++ b/src/compiler/x64/instruction-selector-x64.cc
@@ -2496,6 +2496,7 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I32x4MaxU)              \
   V(I32x4GtU)               \
   V(I32x4GeU)               \
+  V(I16x8SConvertI32x4)     \
   V(I16x8Add)               \
   V(I16x8AddSaturateS)      \
   V(I16x8AddHoriz)          \
@@ -2514,6 +2515,7 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I16x8MaxU)              \
   V(I16x8GtU)               \
   V(I16x8GeU)               \
+  V(I8x16SConvertI16x8)     \
   V(I8x16Add)               \
   V(I8x16AddSaturateS)      \
   V(I8x16Sub)               \
@@ -2534,14 +2536,23 @@ VISIT_ATOMIC_BINOP(Xor)
   V(S128Or)                 \
   V(S128Xor)

-#define SIMD_UNOP_LIST(V) \
-  V(F32x4Abs)             \
-  V(F32x4Neg)             \
-  V(F32x4RecipApprox)     \
-  V(F32x4RecipSqrtApprox) \
-  V(I32x4Neg)             \
-  V(I16x8Neg)             \
-  V(I8x16Neg)             \
+#define SIMD_UNOP_LIST(V)   \
+  V(F32x4SConvertI32x4)     \
+  V(F32x4Abs)               \
+  V(F32x4Neg)               \
+  V(F32x4RecipApprox)       \
+  V(F32x4RecipSqrtApprox)   \
+  V(I32x4SConvertI16x8Low)  \
+  V(I32x4SConvertI16x8High) \
+  V(I32x4Neg)               \
+  V(I32x4UConvertI16x8Low)  \
+  V(I32x4UConvertI16x8High) \
+  V(I16x8SConvertI8x16Low)  \
+  V(I16x8SConvertI8x16High) \
+  V(I16x8Neg)               \
+  V(I16x8UConvertI8x16Low)  \
+  V(I16x8UConvertI8x16High) \
+  V(I8x16Neg)               \
   V(S128Not)

 #define SIMD_SHIFT_OPCODES(V) \
@@ -2552,6 +2563,16 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I16x8ShrS)                \
   V(I16x8ShrU)

+#define SIMD_ANYTRUE_LIST(V) \
+  V(S1x4AnyTrue)             \
+  V(S1x8AnyTrue)             \
+  V(S1x16AnyTrue)
+
+#define SIMD_ALLTRUE_LIST(V) \
+  V(S1x4AllTrue)             \
+  V(S1x8AllTrue)             \
+  V(S1x16AllTrue)
+
 void InstructionSelector::VisitS128Zero(Node* node) {
   X64OperandGenerator g(this);
   Emit(kX64S128Zero, g.DefineAsRegister(node), g.DefineAsRegister(node));
@@ -2596,6 +2617,7 @@ SIMD_TYPES(VISIT_SIMD_REPLACE_LANE)
 }
 SIMD_SHIFT_OPCODES(VISIT_SIMD_SHIFT)
 #undef VISIT_SIMD_SHIFT
+#undef SIMD_SHIFT_OPCODES

 #define VISIT_SIMD_UNOP(Opcode)                         \
   void InstructionSelector::Visit##Opcode(Node* node) { \
@@ -2605,6 +2627,7 @@ SIMD_SHIFT_OPCODES(VISIT_SIMD_SHIFT)
 }
 SIMD_UNOP_LIST(VISIT_SIMD_UNOP)
 #undef VISIT_SIMD_UNOP
+#undef SIMD_UNOP_LIST

 #define VISIT_SIMD_BINOP(Opcode)                        \
   void InstructionSelector::Visit##Opcode(Node* node) { \
@@ -2614,10 +2637,30 @@ SIMD_UNOP_LIST(VISIT_SIMD_UNOP)
 }
 SIMD_BINOP_LIST(VISIT_SIMD_BINOP)
 #undef VISIT_SIMD_BINOP
-#undef SIMD_TYPES
 #undef SIMD_BINOP_LIST
-#undef SIMD_UNOP_LIST
-#undef SIMD_SHIFT_OPCODES
+
+#define VISIT_SIMD_ANYTRUE(Opcode)                                        \
+  void InstructionSelector::Visit##Opcode(Node* node) {                   \
+    X64OperandGenerator g(this);                                          \
+    InstructionOperand temps[] = {g.TempRegister()};                      \
+    Emit(kX64##Opcode, g.DefineAsRegister(node),                          \
+         g.UseUniqueRegister(node->InputAt(0)), arraysize(temps), temps); \
+  }
+SIMD_ANYTRUE_LIST(VISIT_SIMD_ANYTRUE)
+#undef VISIT_SIMD_ANYTRUE
+#undef SIMD_ANYTRUE_LIST
+
+#define VISIT_SIMD_ALLTRUE(Opcode)                                        \
+  void InstructionSelector::Visit##Opcode(Node* node) {                   \
+    X64OperandGenerator g(this);                                          \
+    InstructionOperand temps[] = {g.TempRegister()};                      \
+    Emit(kX64##Opcode, g.DefineAsRegister(node),                          \
+         g.UseUniqueRegister(node->InputAt(0)), arraysize(temps), temps); \
+  }
+SIMD_ALLTRUE_LIST(VISIT_SIMD_ALLTRUE)
+#undef VISIT_SIMD_ALLTRUE
+#undef SIMD_ALLTRUE_LIST
+#undef SIMD_TYPES

 void InstructionSelector::VisitS128Select(Node* node) {
   X64OperandGenerator g(this);
@@ -2626,6 +2669,36 @@ void InstructionSelector::VisitS128Select(Node* node) {
        g.UseRegister(node->InputAt(2)));
 }

+void InstructionSelector::VisitF32x4UConvertI32x4(Node* node) {
+  X64OperandGenerator g(this);
+  Emit(kX64F32x4UConvertI32x4, g.DefineSameAsFirst(node),
+       g.UseRegister(node->InputAt(0)));
+}
+
+void InstructionSelector::VisitI32x4SConvertF32x4(Node* node) {
+  X64OperandGenerator g(this);
+  Emit(kX64I32x4SConvertF32x4, g.DefineSameAsFirst(node),
+       g.UseRegister(node->InputAt(0)));
+}
+
+void InstructionSelector::VisitI32x4UConvertF32x4(Node* node) {
+  X64OperandGenerator g(this);
+  Emit(kX64I32x4UConvertF32x4, g.DefineSameAsFirst(node),
+       g.UseRegister(node->InputAt(0)));
+}
+
+void InstructionSelector::VisitI16x8UConvertI32x4(Node* node) {
+  X64OperandGenerator g(this);
+  Emit(kX64I16x8UConvertI32x4, g.DefineSameAsFirst(node),
+       g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
+}
+
+void InstructionSelector::VisitI8x16UConvertI16x8(Node* node) {
+  X64OperandGenerator g(this);
+  Emit(kX64I8x16UConvertI16x8, g.DefineSameAsFirst(node),
+       g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
+}
+
 void InstructionSelector::VisitInt32AbsWithOverflow(Node* node) {
   UNREACHABLE();
 }
diff --git a/src/x64/assembler-x64.cc b/src/x64/assembler-x64.cc
index 171fac0831..821b7acdc0 100644
--- a/src/x64/assembler-x64.cc
+++ b/src/x64/assembler-x64.cc
@@ -83,7 +83,10 @@ void CpuFeatures::ProbeImpl(bool cross_compile) {
   // Only use statically determined features for cross compile (snapshot).
   if (cross_compile) return;

-  if (cpu.has_sse41() && FLAG_enable_sse4_1) supported_ |= 1u << SSE4_1;
+  if (cpu.has_sse41() && FLAG_enable_sse4_1) {
+    supported_ |= 1u << SSE4_1;
+    supported_ |= 1u << SSSE3;
+  }
   if (cpu.has_ssse3() && FLAG_enable_ssse3) supported_ |= 1u << SSSE3;
   if (cpu.has_sse3() && FLAG_enable_sse3) supported_ |= 1u << SSE3;
   // SAHF is not generally available in long mode.
@@ -458,6 +461,9 @@ Assembler::Assembler(const AssemblerOptions& options, void* buffer,
   ReserveCodeTargetSpace(100);
   reloc_info_writer.Reposition(buffer_ + buffer_size_, pc_);
+  if (CpuFeatures::IsSupported(SSE4_1)) {
+    EnableCpuFeature(SSSE3);
+  }
 }

 void Assembler::GetCode(Isolate* isolate, CodeDesc* desc) {
diff --git a/test/cctest/wasm/test-run-wasm-simd.cc b/test/cctest/wasm/test-run-wasm-simd.cc
index f60c65b727..b0f3dcf8ce 100644
--- a/test/cctest/wasm/test-run-wasm-simd.cc
+++ b/test/cctest/wasm/test-run-wasm-simd.cc
@@ -1080,8 +1080,6 @@ WASM_SIMD_TEST(I32x4ShrU) {
                     LogicalShiftRight);
 }

-#if V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS || \
-    V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32
 // Tests both signed and unsigned conversion from I8x16 (unpacking).
 WASM_SIMD_TEST(I16x8ConvertI8x16) {
   WasmRunner r(execution_tier,
@@ -1124,8 +1122,6 @@ WASM_SIMD_TEST(I16x8ConvertI8x16) {
     CHECK_EQ(1, r.Call(*i, unpacked_signed, unpacked_unsigned, 0));
   }
 }
-#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
-        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 void RunI16x8UnOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                       WasmOpcode simd_op, Int16UnOp expected_op) {
@@ -1144,8 +1140,6 @@ WASM_SIMD_TEST(I16x8Neg) {
   RunI16x8UnOpTest(execution_tier, lower_simd, kExprI16x8Neg, Negate);
 }

-#if V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS || \
-    V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32
 // Tests both signed and unsigned conversion from I32x4 (packing).
 WASM_SIMD_TEST(I16x8ConvertI32x4) {
   WasmRunner r(
@@ -1190,8 +1184,6 @@ WASM_SIMD_TEST(I16x8ConvertI32x4) {
     }
   }
 }
-#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
-        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 void RunI16x8BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                        WasmOpcode simd_op, Int16BinOp expected_op) {
@@ -1374,8 +1366,6 @@ WASM_SIMD_TEST(I8x16Neg) {
   RunI8x16UnOpTest(execution_tier, lower_simd, kExprI8x16Neg, Negate);
 }

-#if V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS || \
-    V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32
 // Tests both signed and unsigned conversion from I16x8 (packing).
 WASM_SIMD_TEST(I8x16ConvertI16x8) {
   WasmRunner r(
@@ -1422,8 +1412,6 @@ WASM_SIMD_TEST(I8x16ConvertI16x8) {
     }
   }
 }
-#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
-        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 void RunI8x16BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                        WasmOpcode simd_op, Int8BinOp expected_op) {
@@ -2012,6 +2000,8 @@ WASM_SIMD_COMPILED_TEST(S8x16MultiShuffleFuzz) {
     }
   }
 }
+#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
+        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 // Boolean unary operations are 'AllTrue' and 'AnyTrue', which return an integer
 // result. Use relational ops on numeric vectors to create the boolean vector
@@ -2099,8 +2089,6 @@ WASM_SIMD_TEST(SimdI32x4ExtractWithF32x4) {
                                   WASM_I32V(1), WASM_I32V(0)));
   CHECK_EQ(1, r.Call());
 }
-#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
-        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 WASM_SIMD_TEST(SimdF32x4ExtractWithI32x4) {
   WasmRunner r(execution_tier, lower_simd);