From caf93c9f6f79dabf4c0d1053aa2f5ca3e13095db Mon Sep 17 00:00:00 2001
From: Deepti Gandluri
Date: Fri, 28 Sep 2018 15:58:43 -0700
Subject: [PATCH] [x64] Wasm SIMD x64 Conversion, AllTrue/AnyTrue operations

This CL mirrors the ia32 SIMD conversion and AllTrue/AnyTrue operations,
with minor cleanliness changes to use TempRegisters instead of
ScratchRegisters.

Change-Id: I84d3e148200dd611a72380b24404b75c73c5352d
Reviewed-on: https://chromium-review.googlesource.com/1174096
Commit-Queue: Deepti Gandluri
Reviewed-by: Bill Budge
Cr-Commit-Position: refs/heads/master@{#56297}
---
 src/compiler/instruction-selector.cc          |  79 --------
 src/compiler/x64/code-generator-x64.cc        | 183 ++++++++++++++++++
 src/compiler/x64/instruction-codes-x64.h      |  22 +++
 src/compiler/x64/instruction-scheduler-x64.cc |  22 +++
 src/compiler/x64/instruction-selector-x64.cc  |  95 +++++++--
 src/x64/assembler-x64.cc                      |   8 +-
 test/cctest/wasm/test-run-wasm-simd.cc        |  16 +-
 7 files changed, 320 insertions(+), 105 deletions(-)

diff --git a/src/compiler/instruction-selector.cc b/src/compiler/instruction-selector.cc
index b13d322ab4..78c7d4f864 100644
--- a/src/compiler/instruction-selector.cc
+++ b/src/compiler/instruction-selector.cc
@@ -2427,18 +2427,6 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
 }
 #endif  // !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM

-#if !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS && \
-    !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_IA32
-void InstructionSelector::VisitF32x4SConvertI32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitF32x4UConvertI32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-#endif  // !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS
-        // && !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_IA32
-
 #if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS64
 void InstructionSelector::VisitWord64AtomicLoad(Node* node) { UNIMPLEMENTED(); }

@@ -2467,61 +2455,6 @@ void InstructionSelector::VisitWord64AtomicCompareExchange(Node* node) {

 #if !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS && \
     !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_IA32
-void InstructionSelector::VisitI32x4SConvertF32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI32x4UConvertF32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI32x4SConvertI16x8Low(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI32x4SConvertI16x8High(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI32x4UConvertI16x8Low(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI32x4UConvertI16x8High(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI16x8SConvertI8x16Low(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI16x8SConvertI8x16High(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI16x8UConvertI8x16Low(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI16x8UConvertI8x16High(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI16x8SConvertI32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-void InstructionSelector::VisitI16x8UConvertI32x4(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI8x16SConvertI16x8(Node* node) {
-  UNIMPLEMENTED();
-}
-
-void InstructionSelector::VisitI8x16UConvertI16x8(Node* node) {
-  UNIMPLEMENTED();
-}
-
 void InstructionSelector::VisitI8x16Shl(Node* node) { UNIMPLEMENTED(); }

 void InstructionSelector::VisitI8x16ShrS(Node* node) { UNIMPLEMENTED(); }
@@ -2531,18 +2464,6 @@ void InstructionSelector::VisitI8x16ShrU(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitI8x16Mul(Node* node) { UNIMPLEMENTED(); }

 void InstructionSelector::VisitS8x16Shuffle(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x4AnyTrue(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x4AllTrue(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x8AnyTrue(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x8AllTrue(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x16AnyTrue(Node* node) { UNIMPLEMENTED(); }
-
-void InstructionSelector::VisitS1x16AllTrue(Node* node) { UNIMPLEMENTED(); }
 #endif  // !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS
         // && !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_IA32

diff --git a/src/compiler/x64/code-generator-x64.cc b/src/compiler/x64/code-generator-x64.cc
index f7616ee36e..178d2b33b9 100644
--- a/src/compiler/x64/code-generator-x64.cc
+++ b/src/compiler/x64/code-generator-x64.cc
@@ -2137,6 +2137,25 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ insertps(i.OutputSimd128Register(), i.InputDoubleRegister(2), select);
       break;
     }
+    case kX64F32x4SConvertI32x4: {
+      __ cvtdq2ps(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      break;
+    }
+    case kX64F32x4UConvertI32x4: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      DCHECK_NE(i.OutputSimd128Register(), kScratchDoubleReg);
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ pxor(kScratchDoubleReg, kScratchDoubleReg);      // zeros
+      __ pblendw(kScratchDoubleReg, dst, 0x55);           // get lo 16 bits
+      __ psubd(dst, kScratchDoubleReg);                   // get hi 16 bits
+      __ cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // convert lo exactly
+      __ psrld(dst, 1);                  // divide by 2 to get in unsigned range
+      __ cvtdq2ps(dst, dst);             // convert hi exactly
+      __ addps(dst, dst);                // double hi, exactly
+      __ addps(dst, kScratchDoubleReg);  // add hi and lo, may round.
+      break;
+    }
     case kX64F32x4Abs: {
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src = i.InputSimd128Register(0);
@@ -2245,6 +2264,36 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       }
       break;
     }
+    case kX64I32x4SConvertF32x4: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      XMMRegister dst = i.OutputSimd128Register();
+      // NAN->0
+      __ movaps(kScratchDoubleReg, dst);
+      __ cmpeqps(kScratchDoubleReg, kScratchDoubleReg);
+      __ pand(dst, kScratchDoubleReg);
+      // Set top bit if >= 0 (but not -0.0!)
+      __ pxor(kScratchDoubleReg, dst);
+      // Convert
+      __ cvttps2dq(dst, dst);
+      // Set top bit if >=0 is now < 0
+      __ pand(kScratchDoubleReg, dst);
+      __ psrad(kScratchDoubleReg, 31);
+      // Set positive overflow lanes to 0x7FFFFFFF
+      __ pxor(dst, kScratchDoubleReg);
+      break;
+    }
+    case kX64I32x4SConvertI16x8Low: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      __ pmovsxwd(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      break;
+    }
+    case kX64I32x4SConvertI16x8High: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ palignr(dst, i.InputSimd128Register(0), 8);
+      __ pmovsxwd(dst, dst);
+      break;
+    }
     case kX64I32x4Neg: {
       CpuFeatureScope sse_scope(tasm(), SSSE3);
       XMMRegister dst = i.OutputSimd128Register();
@@ -2316,6 +2365,46 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ pcmpeqd(dst, src);
       break;
     }
+    case kX64I32x4UConvertF32x4: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister tmp = i.ToSimd128Register(instr->TempAt(0));
+      // NAN->0, negative->0
+      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
+      __ maxps(dst, kScratchDoubleReg);
+      // scratch: float representation of max_signed
+      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
+      __ psrld(kScratchDoubleReg, 1);                     // 0x7fffffff
+      __ cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // 0x4f000000
+      // tmp: convert (src-max_signed).
+      // Positive overflow lanes -> 0x7FFFFFFF
+      // Negative lanes -> 0
+      __ movaps(tmp, dst);
+      __ subps(tmp, kScratchDoubleReg);
+      __ cmpleps(kScratchDoubleReg, tmp);
+      __ cvttps2dq(tmp, tmp);
+      __ pxor(tmp, kScratchDoubleReg);
+      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
+      __ pmaxsd(tmp, kScratchDoubleReg);
+      // convert. Overflow lanes above max_signed will be 0x80000000
+      __ cvttps2dq(dst, dst);
+      // Add (src-max_signed) for overflow lanes.
+      __ paddd(dst, tmp);
+      break;
+    }
+    case kX64I32x4UConvertI16x8Low: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      __ pmovzxwd(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      break;
+    }
+    case kX64I32x4UConvertI16x8High: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ palignr(dst, i.InputSimd128Register(0), 8);
+      __ pmovzxwd(dst, dst);
+      break;
+    }
     case kX64I32x4ShrU: {
       __ psrld(i.OutputSimd128Register(), i.InputInt8(1));
       break;
@@ -2377,6 +2466,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       }
       break;
     }
+    case kX64I16x8SConvertI8x16Low: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      __ pmovsxbw(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      break;
+    }
+    case kX64I16x8SConvertI8x16High: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ palignr(dst, i.InputSimd128Register(0), 8);
+      __ pmovsxbw(dst, dst);
+      break;
+    }
     case kX64I16x8Neg: {
       CpuFeatureScope sse_scope(tasm(), SSSE3);
       XMMRegister dst = i.OutputSimd128Register();
@@ -2398,6 +2499,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ psraw(i.OutputSimd128Register(), i.InputInt8(1));
       break;
     }
+    case kX64I16x8SConvertI32x4: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      __ packssdw(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      break;
+    }
     case kX64I16x8Add: {
       __ paddw(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
@@ -2456,10 +2562,34 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ pcmpeqw(dst, src);
       break;
     }
+    case kX64I16x8UConvertI8x16Low: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      __ pmovzxbw(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      break;
+    }
+    case kX64I16x8UConvertI8x16High: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ palignr(dst, i.InputSimd128Register(0), 8);
+      __ pmovzxbw(dst, dst);
+      break;
+    }
     case kX64I16x8ShrU: {
       __ psrlw(i.OutputSimd128Register(), i.InputInt8(1));
       break;
     }
+    case kX64I16x8UConvertI32x4: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      // Change negative lanes to 0x7FFFFFFF
+      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
+      __ psrld(kScratchDoubleReg, 1);
+      __ pminud(dst, kScratchDoubleReg);
+      __ pminud(kScratchDoubleReg, i.InputSimd128Register(1));
+      __ packusdw(dst, kScratchDoubleReg);
+      break;
+    }
     case kX64I16x8AddSaturateU: {
       __ paddusw(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
@@ -2521,6 +2651,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       }
       break;
     }
+    case kX64I8x16SConvertI16x8: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      __ packsswb(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      break;
+    }
     case kX64I8x16Neg: {
       CpuFeatureScope sse_scope(tasm(), SSSE3);
      XMMRegister dst = i.OutputSimd128Register();
@@ -2582,6 +2717,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ pcmpeqb(dst, src);
       break;
     }
+    case kX64I8x16UConvertI16x8: {
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      XMMRegister dst = i.OutputSimd128Register();
+      // Change negative lanes to 0x7FFF
+      __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
+      __ psrlw(kScratchDoubleReg, 1);
+      __ pminuw(dst, kScratchDoubleReg);
+      __ pminuw(kScratchDoubleReg, i.InputSimd128Register(1));
+      __ packuswb(dst, kScratchDoubleReg);
+      break;
+    }
     case kX64I8x16AddSaturateU: {
       __ paddusb(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
@@ -2653,6 +2800,42 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ xorps(dst, i.InputSimd128Register(2));
       break;
     }
+    case kX64S1x4AnyTrue:
+    case kX64S1x8AnyTrue:
+    case kX64S1x16AnyTrue: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      Register dst = i.OutputRegister();
+      XMMRegister src = i.InputSimd128Register(0);
+      Register tmp = i.TempRegister(0);
+      __ xorq(tmp, tmp);
+      __ movq(dst, Immediate(-1));
+      __ ptest(src, src);
+      __ cmovq(zero, dst, tmp);
+      break;
+    }
+    case kX64S1x4AllTrue:
+    case kX64S1x8AllTrue:
+    case kX64S1x16AllTrue: {
+      CpuFeatureScope sse_scope(tasm(), SSE4_1);
+      Register dst = i.OutputRegister();
+      XMMRegister src = i.InputSimd128Register(0);
+      Register tmp = i.TempRegister(0);
+      __ movq(tmp, Immediate(-1));
+      __ xorq(dst, dst);
+      // Compare all src lanes to false.
+      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
+      if (arch_opcode == kX64S1x4AllTrue) {
+        __ pcmpeqd(kScratchDoubleReg, src);
+      } else if (arch_opcode == kX64S1x8AllTrue) {
+        __ pcmpeqw(kScratchDoubleReg, src);
+      } else {
+        __ pcmpeqb(kScratchDoubleReg, src);
+      }
+      // If kScratchDoubleReg is all zero, none of src lanes are false.
+      __ ptest(kScratchDoubleReg, kScratchDoubleReg);
+      __ cmovq(zero, dst, tmp);
+      break;
+    }
     case kX64StackCheck:
       __ CompareRoot(rsp, RootIndex::kStackLimit);
       break;
diff --git a/src/compiler/x64/instruction-codes-x64.h b/src/compiler/x64/instruction-codes-x64.h
index 6a9e313f4e..c2a194e94a 100644
--- a/src/compiler/x64/instruction-codes-x64.h
+++ b/src/compiler/x64/instruction-codes-x64.h
@@ -151,6 +151,8 @@ namespace compiler {
   V(X64F32x4Splat)             \
   V(X64F32x4ExtractLane)       \
   V(X64F32x4ReplaceLane)       \
+  V(X64F32x4SConvertI32x4)     \
+  V(X64F32x4UConvertI32x4)     \
   V(X64F32x4Abs)               \
   V(X64F32x4Neg)               \
   V(X64F32x4RecipApprox)       \
@@ -168,6 +170,9 @@ namespace compiler {
   V(X64I32x4Splat)             \
   V(X64I32x4ExtractLane)       \
   V(X64I32x4ReplaceLane)       \
+  V(X64I32x4SConvertF32x4)     \
+  V(X64I32x4SConvertI16x8Low)  \
+  V(X64I32x4SConvertI16x8High) \
   V(X64I32x4Neg)               \
   V(X64I32x4Shl)               \
   V(X64I32x4ShrS)              \
@@ -181,6 +186,9 @@ namespace compiler {
   V(X64I32x4Ne)                \
   V(X64I32x4GtS)               \
   V(X64I32x4GeS)               \
+  V(X64I32x4UConvertF32x4)     \
+  V(X64I32x4UConvertI16x8Low)  \
+  V(X64I32x4UConvertI16x8High) \
   V(X64I32x4ShrU)              \
   V(X64I32x4MinU)              \
   V(X64I32x4MaxU)              \
@@ -189,9 +197,12 @@ namespace compiler {
   V(X64I16x8Splat)             \
   V(X64I16x8ExtractLane)       \
   V(X64I16x8ReplaceLane)       \
+  V(X64I16x8SConvertI8x16Low)  \
+  V(X64I16x8SConvertI8x16High) \
   V(X64I16x8Neg)               \
   V(X64I16x8Shl)               \
   V(X64I16x8ShrS)              \
+  V(X64I16x8SConvertI32x4)     \
   V(X64I16x8Add)               \
   V(X64I16x8AddSaturateS)      \
   V(X64I16x8AddHoriz)          \
@@ -204,7 +215,10 @@ namespace compiler {
   V(X64I16x8Ne)                \
   V(X64I16x8GtS)               \
   V(X64I16x8GeS)               \
+  V(X64I16x8UConvertI8x16Low)  \
+  V(X64I16x8UConvertI8x16High) \
   V(X64I16x8ShrU)              \
+  V(X64I16x8UConvertI32x4)     \
   V(X64I16x8AddSaturateU)      \
   V(X64I16x8SubSaturateU)      \
   V(X64I16x8MinU)              \
@@ -214,6 +228,7 @@ namespace compiler {
   V(X64I8x16Splat)             \
   V(X64I8x16ExtractLane)       \
   V(X64I8x16ReplaceLane)       \
+  V(X64I8x16SConvertI16x8)     \
   V(X64I8x16Neg)               \
   V(X64I8x16Add)               \
   V(X64I8x16AddSaturateS)      \
@@ -225,6 +240,7 @@ namespace compiler {
   V(X64I8x16Ne)                \
   V(X64I8x16GtS)               \
   V(X64I8x16GeS)               \
+  V(X64I8x16UConvertI16x8)     \
   V(X64I8x16AddSaturateU)      \
   V(X64I8x16SubSaturateU)      \
   V(X64I8x16MinU)              \
@@ -237,6 +253,12 @@ namespace compiler {
   V(X64S128Not)                \
   V(X64S128Select)             \
   V(X64S128Zero)               \
+  V(X64S1x4AnyTrue)            \
+  V(X64S1x4AllTrue)            \
+  V(X64S1x8AnyTrue)            \
+  V(X64S1x8AllTrue)            \
+  V(X64S1x16AnyTrue)           \
+  V(X64S1x16AllTrue)           \
   V(X64Word64AtomicLoadUint8)  \
   V(X64Word64AtomicLoadUint16) \
   V(X64Word64AtomicLoadUint32) \
diff --git a/src/compiler/x64/instruction-scheduler-x64.cc b/src/compiler/x64/instruction-scheduler-x64.cc
index b1f380badf..e5523fd49d 100644
--- a/src/compiler/x64/instruction-scheduler-x64.cc
+++ b/src/compiler/x64/instruction-scheduler-x64.cc
@@ -128,6 +128,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64F32x4Splat:
     case kX64F32x4ExtractLane:
     case kX64F32x4ReplaceLane:
+    case kX64F32x4SConvertI32x4:
+    case kX64F32x4UConvertI32x4:
     case kX64F32x4RecipApprox:
     case kX64F32x4RecipSqrtApprox:
     case kX64F32x4Abs:
@@ -145,6 +147,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I32x4Splat:
     case kX64I32x4ExtractLane:
     case kX64I32x4ReplaceLane:
+    case kX64I32x4SConvertF32x4:
+    case kX64I32x4SConvertI16x8Low:
+    case kX64I32x4SConvertI16x8High:
     case kX64I32x4Neg:
     case kX64I32x4Shl:
     case kX64I32x4ShrS:
@@ -158,6 +163,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I32x4Ne:
     case kX64I32x4GtS:
     case kX64I32x4GeS:
+    case kX64I32x4UConvertF32x4:
+    case kX64I32x4UConvertI16x8Low:
+    case kX64I32x4UConvertI16x8High:
     case kX64I32x4ShrU:
     case kX64I32x4MinU:
     case kX64I32x4MaxU:
@@ -166,9 +174,12 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I16x8Splat:
     case kX64I16x8ExtractLane:
     case kX64I16x8ReplaceLane:
+    case kX64I16x8SConvertI8x16Low:
+    case kX64I16x8SConvertI8x16High:
     case kX64I16x8Neg:
     case kX64I16x8Shl:
     case kX64I16x8ShrS:
+    case kX64I16x8SConvertI32x4:
     case kX64I16x8Add:
     case kX64I16x8AddSaturateS:
     case kX64I16x8AddHoriz:
@@ -181,6 +192,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I16x8Ne:
     case kX64I16x8GtS:
     case kX64I16x8GeS:
+    case kX64I16x8UConvertI8x16Low:
+    case kX64I16x8UConvertI8x16High:
+    case kX64I16x8UConvertI32x4:
     case kX64I16x8ShrU:
     case kX64I16x8AddSaturateU:
     case kX64I16x8SubSaturateU:
@@ -191,6 +205,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I8x16Splat:
     case kX64I8x16ExtractLane:
     case kX64I8x16ReplaceLane:
+    case kX64I8x16SConvertI16x8:
     case kX64I8x16Neg:
     case kX64I8x16Add:
     case kX64I8x16AddSaturateS:
@@ -202,6 +217,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I8x16Ne:
     case kX64I8x16GtS:
     case kX64I8x16GeS:
+    case kX64I8x16UConvertI16x8:
     case kX64I8x16AddSaturateU:
     case kX64I8x16SubSaturateU:
     case kX64I8x16MinU:
@@ -214,6 +230,12 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64S128Not:
     case kX64S128Select:
     case kX64S128Zero:
+    case kX64S1x4AnyTrue:
+    case kX64S1x4AllTrue:
+    case kX64S1x8AnyTrue:
+    case kX64S1x8AllTrue:
+    case kX64S1x16AnyTrue:
+    case kX64S1x16AllTrue:
       return (instr->addressing_mode() == kMode_None)
                  ? kNoOpcodeFlags
                  : kIsLoadOperation | kHasSideEffect;

diff --git a/src/compiler/x64/instruction-selector-x64.cc b/src/compiler/x64/instruction-selector-x64.cc
index 074f00fad7..211794ace8 100644
--- a/src/compiler/x64/instruction-selector-x64.cc
+++ b/src/compiler/x64/instruction-selector-x64.cc
@@ -2496,6 +2496,7 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I32x4MaxU)              \
   V(I32x4GtU)               \
   V(I32x4GeU)               \
+  V(I16x8SConvertI32x4)     \
   V(I16x8Add)               \
   V(I16x8AddSaturateS)      \
   V(I16x8AddHoriz)          \
@@ -2514,6 +2515,7 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I16x8MaxU)              \
   V(I16x8GtU)               \
   V(I16x8GeU)               \
+  V(I8x16SConvertI16x8)     \
   V(I8x16Add)               \
   V(I8x16AddSaturateS)      \
   V(I8x16Sub)               \
@@ -2534,14 +2536,23 @@ VISIT_ATOMIC_BINOP(Xor)
   V(S128Or)                 \
   V(S128Xor)

-#define SIMD_UNOP_LIST(V) \
-  V(F32x4Abs)             \
-  V(F32x4Neg)             \
-  V(F32x4RecipApprox)     \
-  V(F32x4RecipSqrtApprox) \
-  V(I32x4Neg)             \
-  V(I16x8Neg)             \
-  V(I8x16Neg)             \
+#define SIMD_UNOP_LIST(V)   \
+  V(F32x4SConvertI32x4)     \
+  V(F32x4Abs)               \
+  V(F32x4Neg)               \
+  V(F32x4RecipApprox)       \
+  V(F32x4RecipSqrtApprox)   \
+  V(I32x4SConvertI16x8Low)  \
+  V(I32x4SConvertI16x8High) \
+  V(I32x4Neg)               \
+  V(I32x4UConvertI16x8Low)  \
+  V(I32x4UConvertI16x8High) \
+  V(I16x8SConvertI8x16Low)  \
+  V(I16x8SConvertI8x16High) \
+  V(I16x8Neg)               \
+  V(I16x8UConvertI8x16Low)  \
+  V(I16x8UConvertI8x16High) \
+  V(I8x16Neg)               \
   V(S128Not)

 #define SIMD_SHIFT_OPCODES(V) \
@@ -2552,6 +2563,16 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I16x8ShrS)                \
   V(I16x8ShrU)

+#define SIMD_ANYTRUE_LIST(V) \
+  V(S1x4AnyTrue)             \
+  V(S1x8AnyTrue)             \
+  V(S1x16AnyTrue)
+
+#define SIMD_ALLTRUE_LIST(V) \
+  V(S1x4AllTrue)             \
+  V(S1x8AllTrue)             \
+  V(S1x16AllTrue)
+
 void InstructionSelector::VisitS128Zero(Node* node) {
   X64OperandGenerator g(this);
   Emit(kX64S128Zero, g.DefineAsRegister(node), g.DefineAsRegister(node));
@@ -2596,6 +2617,7 @@ SIMD_TYPES(VISIT_SIMD_REPLACE_LANE)
 }
 SIMD_SHIFT_OPCODES(VISIT_SIMD_SHIFT)
 #undef VISIT_SIMD_SHIFT
+#undef SIMD_SHIFT_OPCODES

 #define VISIT_SIMD_UNOP(Opcode)                         \
   void InstructionSelector::Visit##Opcode(Node* node) { \
@@ -2605,6 +2627,7 @@ SIMD_SHIFT_OPCODES(VISIT_SIMD_SHIFT)
 }
 SIMD_UNOP_LIST(VISIT_SIMD_UNOP)
 #undef VISIT_SIMD_UNOP
+#undef SIMD_UNOP_LIST

 #define VISIT_SIMD_BINOP(Opcode)                        \
   void InstructionSelector::Visit##Opcode(Node* node) { \
@@ -2614,10 +2637,30 @@ SIMD_UNOP_LIST(VISIT_SIMD_UNOP)
 }
 SIMD_BINOP_LIST(VISIT_SIMD_BINOP)
 #undef VISIT_SIMD_BINOP
-#undef SIMD_TYPES
 #undef SIMD_BINOP_LIST
-#undef SIMD_UNOP_LIST
-#undef SIMD_SHIFT_OPCODES
+
+#define VISIT_SIMD_ANYTRUE(Opcode)                                        \
+  void InstructionSelector::Visit##Opcode(Node* node) {                   \
+    X64OperandGenerator g(this);                                          \
+    InstructionOperand temps[] = {g.TempRegister()};                      \
+    Emit(kX64##Opcode, g.DefineAsRegister(node),                          \
+         g.UseUniqueRegister(node->InputAt(0)), arraysize(temps), temps); \
+  }
+SIMD_ANYTRUE_LIST(VISIT_SIMD_ANYTRUE)
+#undef VISIT_SIMD_ANYTRUE
+#undef SIMD_ANYTRUE_LIST
+
+#define VISIT_SIMD_ALLTRUE(Opcode)                                        \
+  void InstructionSelector::Visit##Opcode(Node* node) {                   \
+    X64OperandGenerator g(this);                                          \
+    InstructionOperand temps[] = {g.TempRegister()};                      \
+    Emit(kX64##Opcode, g.DefineAsRegister(node),                          \
+         g.UseUniqueRegister(node->InputAt(0)), arraysize(temps), temps); \
+  }
+SIMD_ALLTRUE_LIST(VISIT_SIMD_ALLTRUE)
+#undef VISIT_SIMD_ALLTRUE
+#undef SIMD_ALLTRUE_LIST
+#undef SIMD_TYPES

 void InstructionSelector::VisitS128Select(Node* node) {
   X64OperandGenerator g(this);
@@ -2626,6 +2669,36 @@ void InstructionSelector::VisitS128Select(Node* node) {
        g.UseRegister(node->InputAt(2)));
 }

+void InstructionSelector::VisitF32x4UConvertI32x4(Node* node) {
+  X64OperandGenerator g(this);
+  Emit(kX64F32x4UConvertI32x4, g.DefineSameAsFirst(node),
+       g.UseRegister(node->InputAt(0)));
+}
+
+void InstructionSelector::VisitI32x4SConvertF32x4(Node* node) {
+  X64OperandGenerator g(this);
+  Emit(kX64I32x4SConvertF32x4, g.DefineSameAsFirst(node),
+       g.UseRegister(node->InputAt(0)));
+}
+
+void InstructionSelector::VisitI32x4UConvertF32x4(Node* node) {
+  X64OperandGenerator g(this);
+  Emit(kX64I32x4UConvertF32x4, g.DefineSameAsFirst(node),
+       g.UseRegister(node->InputAt(0)));
+}
+
+void InstructionSelector::VisitI16x8UConvertI32x4(Node* node) {
+  X64OperandGenerator g(this);
+  Emit(kX64I16x8UConvertI32x4, g.DefineSameAsFirst(node),
+       g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
+}
+
+void InstructionSelector::VisitI8x16UConvertI16x8(Node* node) {
+  X64OperandGenerator g(this);
+  Emit(kX64I8x16UConvertI16x8, g.DefineSameAsFirst(node),
+       g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
+}
+
 void InstructionSelector::VisitInt32AbsWithOverflow(Node* node) {
   UNREACHABLE();
 }
diff --git a/src/x64/assembler-x64.cc b/src/x64/assembler-x64.cc
index 171fac0831..821b7acdc0 100644
--- a/src/x64/assembler-x64.cc
+++ b/src/x64/assembler-x64.cc
@@ -83,7 +83,10 @@ void CpuFeatures::ProbeImpl(bool cross_compile) {
   // Only use statically determined features for cross compile (snapshot).
   if (cross_compile) return;

-  if (cpu.has_sse41() && FLAG_enable_sse4_1) supported_ |= 1u << SSE4_1;
+  if (cpu.has_sse41() && FLAG_enable_sse4_1) {
+    supported_ |= 1u << SSE4_1;
+    supported_ |= 1u << SSSE3;
+  }
   if (cpu.has_ssse3() && FLAG_enable_ssse3) supported_ |= 1u << SSSE3;
   if (cpu.has_sse3() && FLAG_enable_sse3) supported_ |= 1u << SSE3;
   // SAHF is not generally available in long mode.
@@ -458,6 +461,9 @@ Assembler::Assembler(const AssemblerOptions& options, void* buffer,
   ReserveCodeTargetSpace(100);
   reloc_info_writer.Reposition(buffer_ + buffer_size_, pc_);
+  if (CpuFeatures::IsSupported(SSE4_1)) {
+    EnableCpuFeature(SSSE3);
+  }
 }

 void Assembler::GetCode(Isolate* isolate, CodeDesc* desc) {
diff --git a/test/cctest/wasm/test-run-wasm-simd.cc b/test/cctest/wasm/test-run-wasm-simd.cc
index f60c65b727..b0f3dcf8ce 100644
--- a/test/cctest/wasm/test-run-wasm-simd.cc
+++ b/test/cctest/wasm/test-run-wasm-simd.cc
@@ -1080,8 +1080,6 @@ WASM_SIMD_TEST(I32x4ShrU) {
                     LogicalShiftRight);
 }

-#if V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS || \
-    V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32
 // Tests both signed and unsigned conversion from I8x16 (unpacking).
 WASM_SIMD_TEST(I16x8ConvertI8x16) {
   WasmRunner r(execution_tier,
@@ -1124,8 +1122,6 @@ WASM_SIMD_TEST(I16x8ConvertI8x16) {
     CHECK_EQ(1, r.Call(*i, unpacked_signed, unpacked_unsigned, 0));
   }
 }
-#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
-        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 void RunI16x8UnOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                       WasmOpcode simd_op, Int16UnOp expected_op) {
@@ -1144,8 +1140,6 @@ WASM_SIMD_TEST(I16x8Neg) {
   RunI16x8UnOpTest(execution_tier, lower_simd, kExprI16x8Neg, Negate);
 }

-#if V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS || \
-    V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32
 // Tests both signed and unsigned conversion from I32x4 (packing).
 WASM_SIMD_TEST(I16x8ConvertI32x4) {
   WasmRunner r(
@@ -1190,8 +1184,6 @@ WASM_SIMD_TEST(I16x8ConvertI32x4) {
     }
   }
 }
-#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
-        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 void RunI16x8BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                        WasmOpcode simd_op, Int16BinOp expected_op) {
@@ -1374,8 +1366,6 @@ WASM_SIMD_TEST(I8x16Neg) {
   RunI8x16UnOpTest(execution_tier, lower_simd, kExprI8x16Neg, Negate);
 }

-#if V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS || \
-    V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32
 // Tests both signed and unsigned conversion from I16x8 (packing).
 WASM_SIMD_TEST(I8x16ConvertI16x8) {
   WasmRunner r(
@@ -1422,8 +1412,6 @@ WASM_SIMD_TEST(I8x16ConvertI16x8) {
     }
   }
 }
-#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
-        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 void RunI8x16BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                        WasmOpcode simd_op, Int8BinOp expected_op) {
@@ -2012,6 +2000,8 @@ WASM_SIMD_COMPILED_TEST(S8x16MultiShuffleFuzz) {
     }
   }
 }
+#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
+        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 // Boolean unary operations are 'AllTrue' and 'AnyTrue', which return an integer
 // result. Use relational ops on numeric vectors to create the boolean vector
@@ -2099,8 +2089,6 @@ WASM_SIMD_TEST(SimdI32x4ExtractWithF32x4) {
                                   WASM_I32V(1), WASM_I32V(0)));
   CHECK_EQ(1, r.Call());
 }
-#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
-        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 WASM_SIMD_TEST(SimdF32x4ExtractWithI32x4) {
   WasmRunner r(execution_tier, lower_simd);