diff --git a/src/codegen/ia32/macro-assembler-ia32.h b/src/codegen/ia32/macro-assembler-ia32.h index be3480ac80..a8b1bd1b01 100644 --- a/src/codegen/ia32/macro-assembler-ia32.h +++ b/src/codegen/ia32/macro-assembler-ia32.h @@ -505,15 +505,16 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { #define AVX_OP3_WITH_TYPE_SCOPE(macro_name, name, dst_type, src_type, \ sse_scope) \ - void macro_name(dst_type dst, src_type src) { \ + void macro_name(dst_type dst, dst_type src1, src_type src2) { \ if (CpuFeatures::IsSupported(AVX)) { \ CpuFeatureScope scope(this, AVX); \ - v##name(dst, dst, src); \ + v##name(dst, src1, src2); \ return; \ } \ if (CpuFeatures::IsSupported(sse_scope)) { \ CpuFeatureScope scope(this, sse_scope); \ - name(dst, src); \ + DCHECK_EQ(dst, src1); \ + name(dst, src2); \ return; \ } \ UNREACHABLE(); \ @@ -523,6 +524,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { AVX_OP3_WITH_TYPE_SCOPE(macro_name, name, XMMRegister, Operand, SSE4_1) AVX_OP3_XO_SSE4(Pmaxsd, pmaxsd) + AVX_OP3_WITH_TYPE_SCOPE(Pmaddubsw, pmaddubsw, XMMRegister, XMMRegister, SSSE3) #undef AVX_OP3_XO_SSE4 #undef AVX_OP3_WITH_TYPE_SCOPE diff --git a/src/codegen/ia32/sse-instr.h b/src/codegen/ia32/sse-instr.h index dffa31d6f7..dc71d0887f 100644 --- a/src/codegen/ia32/sse-instr.h +++ b/src/codegen/ia32/sse-instr.h @@ -63,9 +63,10 @@ V(pxor, 66, 0F, EF) #define SSSE3_INSTRUCTION_LIST(V) \ - V(phaddd, 66, 0F, 38, 02) \ - V(phaddw, 66, 0F, 38, 01) \ V(pshufb, 66, 0F, 38, 00) \ + V(phaddw, 66, 0F, 38, 01) \ + V(phaddd, 66, 0F, 38, 02) \ + V(pmaddubsw, 66, 0F, 38, 04) \ V(psignb, 66, 0F, 38, 08) \ V(psignw, 66, 0F, 38, 09) \ V(psignd, 66, 0F, 38, 0A) diff --git a/src/compiler/backend/ia32/code-generator-ia32.cc b/src/compiler/backend/ia32/code-generator-ia32.cc index b7b75963f2..9eb9502eab 100644 --- a/src/compiler/backend/ia32/code-generator-ia32.cc +++ b/src/compiler/backend/ia32/code-generator-ia32.cc @@ -2323,6 +2323,53 @@ 
CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( } break; } + case kIA32I32x4ExtAddPairwiseI16x8S: { + XMMRegister dst = i.OutputSimd128Register(); + XMMRegister src = i.InputSimd128Register(0); + // kScratchDoubleReg = i16x8.splat(1) + __ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg); + __ Psrlw(kScratchDoubleReg, byte{15}); + // pmaddwd multiplies signed words in kScratchDoubleReg and src, producing + // signed doublewords, then adds pairwise. + // src = |a|b|c|d|e|f|g|h| + // dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 | + __ Pmaddwd(dst, src, kScratchDoubleReg); + break; + } + case kIA32I32x4ExtAddPairwiseI16x8U: { + XMMRegister dst = i.OutputSimd128Register(); + XMMRegister src = i.InputSimd128Register(0); + + // src = |a|b|c|d|e|f|g|h| + // kScratchDoubleReg = i32x4.splat(0x0000FFFF) + __ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg); + __ Psrld(kScratchDoubleReg, kScratchDoubleReg, uint8_t{16}); + // kScratchDoubleReg =|0|b|0|d|0|f|0|h| + __ Pand(kScratchDoubleReg, src); + // dst = |0|a|0|c|0|e|0|g| + __ Psrld(dst, src, byte{16}); + // dst = |a+b|c+d|e+f|g+h| + __ Paddd(dst, dst, kScratchDoubleReg); + break; + } + case kIA32I16x8ExtAddPairwiseI8x16S: { + XMMRegister dst = i.OutputSimd128Register(); + XMMRegister src = i.InputSimd128Register(0); + DCHECK_NE(dst, src); + // dst = i8x16.splat(1) + __ Move(dst, uint32_t{0x01010101}); + __ Pshufd(dst, dst, byte{0}); + __ Pmaddubsw(dst, dst, src); + break; + } + case kIA32I16x8ExtAddPairwiseI8x16U: { + XMMRegister dst = i.OutputSimd128Register(); + // kScratchDoubleReg = i8x16.splat(1) + __ Move(kScratchDoubleReg, uint32_t{0x01010101}); + __ Pshufd(kScratchDoubleReg, kScratchDoubleReg, byte{0}); + __ Pmaddubsw(dst, i.InputSimd128Register(0), kScratchDoubleReg); + break; + } case kIA32I32x4SignSelect: { ASSEMBLE_SIMD_SIGN_SELECT(blendvps); break; diff --git a/src/compiler/backend/ia32/instruction-codes-ia32.h b/src/compiler/backend/ia32/instruction-codes-ia32.h index 61b9b900fd..0337942881 
100644 --- a/src/compiler/backend/ia32/instruction-codes-ia32.h +++ b/src/compiler/backend/ia32/instruction-codes-ia32.h @@ -235,6 +235,8 @@ namespace compiler { V(IA32I32x4ExtMulHighI16x8S) \ V(IA32I32x4ExtMulLowI16x8U) \ V(IA32I32x4ExtMulHighI16x8U) \ + V(IA32I32x4ExtAddPairwiseI16x8S) \ + V(IA32I32x4ExtAddPairwiseI16x8U) \ V(IA32I16x8Splat) \ V(IA32I16x8ExtractLaneS) \ V(IA32I16x8SConvertI8x16Low) \ @@ -293,6 +295,8 @@ namespace compiler { V(IA32I16x8ExtMulHighI8x16S) \ V(IA32I16x8ExtMulLowI8x16U) \ V(IA32I16x8ExtMulHighI8x16U) \ + V(IA32I16x8ExtAddPairwiseI8x16S) \ + V(IA32I16x8ExtAddPairwiseI8x16U) \ V(IA32I8x16Splat) \ V(IA32I8x16ExtractLaneS) \ V(IA32Pinsrb) \ diff --git a/src/compiler/backend/ia32/instruction-scheduler-ia32.cc b/src/compiler/backend/ia32/instruction-scheduler-ia32.cc index f8ece998d7..f13caf1192 100644 --- a/src/compiler/backend/ia32/instruction-scheduler-ia32.cc +++ b/src/compiler/backend/ia32/instruction-scheduler-ia32.cc @@ -217,6 +217,8 @@ int InstructionScheduler::GetTargetInstructionFlags( case kIA32I32x4ExtMulHighI16x8S: case kIA32I32x4ExtMulLowI16x8U: case kIA32I32x4ExtMulHighI16x8U: + case kIA32I32x4ExtAddPairwiseI16x8S: + case kIA32I32x4ExtAddPairwiseI16x8U: case kIA32I16x8Splat: case kIA32I16x8ExtractLaneS: case kIA32I16x8SConvertI8x16Low: @@ -275,6 +277,8 @@ int InstructionScheduler::GetTargetInstructionFlags( case kIA32I16x8ExtMulHighI8x16S: case kIA32I16x8ExtMulLowI8x16U: case kIA32I16x8ExtMulHighI8x16U: + case kIA32I16x8ExtAddPairwiseI8x16S: + case kIA32I16x8ExtAddPairwiseI8x16U: case kIA32I8x16Splat: case kIA32I8x16ExtractLaneS: case kIA32Pinsrb: diff --git a/src/compiler/backend/ia32/instruction-selector-ia32.cc b/src/compiler/backend/ia32/instruction-selector-ia32.cc index e5969d2f47..5a3fe4006c 100644 --- a/src/compiler/backend/ia32/instruction-selector-ia32.cc +++ b/src/compiler/backend/ia32/instruction-selector-ia32.cc @@ -2360,7 +2360,7 @@ void InstructionSelector::VisitF64x2Max(Node* node) { } void 
InstructionSelector::VisitF64x2Splat(Node* node) { - VisitRRSimd(this, node, kIA32F64x2Splat, kIA32F64x2Splat); + VisitRRSimd(this, node, kIA32F64x2Splat); } void InstructionSelector::VisitF64x2ExtractLane(Node* node) { @@ -2427,7 +2427,7 @@ void InstructionSelector::VisitI64x2Mul(Node* node) { } void InstructionSelector::VisitF32x4Splat(Node* node) { - VisitRRSimd(this, node, kIA32F32x4Splat, kIA32F32x4Splat); + VisitRRSimd(this, node, kIA32F32x4Splat); } void InstructionSelector::VisitF32x4ExtractLane(Node* node) { @@ -3083,6 +3083,24 @@ void InstructionSelector::VisitI64x2SignSelect(Node* node) { VisitSignSelect(this, node, kIA32I64x2SignSelect); } +void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8S(Node* node) { + VisitRRSimd(this, node, kIA32I32x4ExtAddPairwiseI16x8S); +} + +void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8U(Node* node) { + VisitRRSimd(this, node, kIA32I32x4ExtAddPairwiseI16x8U); +} + +void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16S(Node* node) { + IA32OperandGenerator g(this); + Emit(kIA32I16x8ExtAddPairwiseI8x16S, g.DefineAsRegister(node), + g.UseUniqueRegister(node->InputAt(0))); +} + +void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16U(Node* node) { + VisitRRSimd(this, node, kIA32I16x8ExtAddPairwiseI8x16U); +} + // static MachineOperatorBuilder::Flags InstructionSelector::SupportedMachineOperatorFlags() { diff --git a/src/compiler/backend/instruction-selector.cc b/src/compiler/backend/instruction-selector.cc index 75a9de1e9a..0f23487987 100644 --- a/src/compiler/backend/instruction-selector.cc +++ b/src/compiler/backend/instruction-selector.cc @@ -2814,7 +2814,8 @@ void InstructionSelector::VisitPrefetchNonTemporal(Node* node) { void InstructionSelector::VisitI8x16Popcnt(Node* node) { UNIMPLEMENTED(); } #endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM -#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64 +#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64 && \ + 
!V8_TARGET_ARCH_IA32 // TODO(v8:11086) Prototype extended pairwise add. void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8S(Node* node) { UNIMPLEMENTED(); @@ -2829,6 +2830,7 @@ void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16U(Node* node) { UNIMPLEMENTED(); } #endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64 + // && !V8_TARGET_ARCH_IA32 #if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM64 && \ !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_MIPS64 diff --git a/src/wasm/baseline/ia32/liftoff-assembler-ia32.h b/src/wasm/baseline/ia32/liftoff-assembler-ia32.h index aa906c6762..abf290efe6 100644 --- a/src/wasm/baseline/ia32/liftoff-assembler-ia32.h +++ b/src/wasm/baseline/ia32/liftoff-assembler-ia32.h @@ -4112,7 +4112,7 @@ void LiftoffAssembler::emit_i32x4_uconvert_f32x4(LiftoffRegister dst, Cvttps2dq(tmp, tmp); Pxor(tmp, liftoff::kScratchDoubleReg); Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg); - Pmaxsd(tmp, liftoff::kScratchDoubleReg); + Pmaxsd(tmp, tmp, liftoff::kScratchDoubleReg); // Convert to int. Overflow lanes above max_signed will be 0x80000000. Cvttps2dq(dst.fp(), dst.fp()); // Add (src-max_signed) for overflow lanes. 
diff --git a/test/cctest/wasm/test-run-wasm-simd.cc b/test/cctest/wasm/test-run-wasm-simd.cc index 194ee31fdc..6b5a6c2bfd 100644 --- a/test/cctest/wasm/test-run-wasm-simd.cc +++ b/test/cctest/wasm/test-run-wasm-simd.cc @@ -1884,7 +1884,8 @@ WASM_SIMD_TEST(S128Not) { [](int32_t x) { return ~x; }); } -#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64 +#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64 || \ + V8_TARGET_ARCH_IA32 // TODO(v8:11086) Prototype i32x4.extadd_pairwise_i16x8_{s,u} template void RunExtAddPairwiseTest(TestExecutionTier execution_tier, @@ -1933,7 +1934,8 @@ WASM_SIMD_TEST_NO_LOWERING(I16x8ExtAddPairwiseI8x16U) { kExprI16x8ExtAddPairwiseI8x16U, kExprI8x16Splat); } -#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64 +#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64 || + // V8_TARGET_ARCH_IA32 void RunI32x4BinOpTest(TestExecutionTier execution_tier, LowerSimd lower_simd, WasmOpcode opcode, Int32BinOp expected_op) {