diff --git a/src/codegen/x64/sse-instr.h b/src/codegen/x64/sse-instr.h
index 52107ed6b9..0215ac831e 100644
--- a/src/codegen/x64/sse-instr.h
+++ b/src/codegen/x64/sse-instr.h
@@ -77,6 +77,8 @@
   V(pminsw, 66, 0F, EA)     \
   V(pminub, 66, 0F, DA)     \
   V(pmullw, 66, 0F, D5)     \
+  V(pmulhuw, 66, 0F, E4)    \
+  V(pmulhw, 66, 0F, E5)     \
   V(pmuludq, 66, 0F, F4)    \
   V(psllw, 66, 0F, F1)      \
   V(pslld, 66, 0F, F2)      \
@@ -143,6 +145,7 @@
   V(pabsd, 66, 0F, 38, 1E)
 
 #define SSE4_INSTRUCTION_LIST(V) \
+  V(pmuldq, 66, 0F, 38, 28)      \
   V(pcmpeqq, 66, 0F, 38, 29)     \
   V(packusdw, 66, 0F, 38, 2B)    \
   V(pminsb, 66, 0F, 38, 38)      \
diff --git a/src/compiler/backend/instruction-selector.cc b/src/compiler/backend/instruction-selector.cc
index 40b859d63c..b4cf6ccec6 100644
--- a/src/compiler/backend/instruction-selector.cc
+++ b/src/compiler/backend/instruction-selector.cc
@@ -2737,7 +2737,7 @@ void InstructionSelector::VisitI64x2Eq(Node* node) { UNIMPLEMENTED(); }
 #endif  // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_S390X && !V8_TARGET_ARCH_ARM64
         // && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM
 
-#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
+#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64
 // TODO(v8:11008) Prototype extended multiplication.
 void InstructionSelector::VisitI64x2ExtMulLowI32x4S(Node* node) {
   UNIMPLEMENTED();
@@ -2775,7 +2775,7 @@ void InstructionSelector::VisitI16x8ExtMulLowI8x16U(Node* node) {
 void InstructionSelector::VisitI16x8ExtMulHighI8x16U(Node* node) {
   UNIMPLEMENTED();
 }
-#endif  // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
+#endif  // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64
 
 #if !V8_TARGET_ARCH_ARM64
 // TODO(v8:10971) Prototype i16x8.q15mulr_sat_s
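For reference, the x64 lowerings in the files below implement the Wasm extended-multiplication semantics: each output lane is the full-width product of two narrower input lanes, which are sign- or zero-extended before the multiply; the Low variants consume the lower half of the input lanes and the High variants the upper half. A minimal scalar sketch of one of the variants, i32x4.extmul_low_i16x8_s, follows; the helper name and array types are illustrative only and are not part of this change.

#include <array>
#include <cstdint>

// Illustrative scalar model of i32x4.extmul_low_i16x8_s: widen lanes 0..3 of
// both inputs to 32 bits, then multiply. The High variants use lanes 4..7,
// and the unsigned variants zero-extend (uint16_t -> uint32_t) instead.
std::array<int32_t, 4> ExtMulLowI16x8S(const std::array<int16_t, 8>& a,
                                       const std::array<int16_t, 8>& b) {
  std::array<int32_t, 4> result;
  for (int lane = 0; lane < 4; ++lane) {
    result[lane] =
        static_cast<int32_t>(a[lane]) * static_cast<int32_t>(b[lane]);
  }
  return result;
}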
diff --git a/src/compiler/backend/x64/code-generator-x64.cc b/src/compiler/backend/x64/code-generator-x64.cc
index 19f36988fa..776334f4cd 100644
--- a/src/compiler/backend/x64/code-generator-x64.cc
+++ b/src/compiler/backend/x64/code-generator-x64.cc
@@ -5,6 +5,7 @@
 #include <limits>
 
 #include "src/base/overflowing-math.h"
+#include "src/codegen/assembler.h"
 #include "src/codegen/macro-assembler.h"
 #include "src/codegen/optimized-compilation-info.h"
 #include "src/codegen/x64/assembler-x64.h"
@@ -679,6 +680,72 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen,
   }                                                                       \
   } while (false)
 
+#define ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(EXTEND_MACRO_INSTR)               \
+  do {                                                                    \
+    XMMRegister dst = i.OutputSimd128Register();                          \
+    __ EXTEND_MACRO_INSTR(kScratchDoubleReg, i.InputSimd128Register(0));  \
+    __ EXTEND_MACRO_INSTR(dst, i.InputSimd128Register(1));                \
+    __ Pmullw(dst, kScratchDoubleReg);                                    \
+  } while (false)
+
+#define ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(EXTEND_MACRO_INSTR)              \
+  do {                                                                    \
+    XMMRegister dst = i.OutputSimd128Register();                          \
+    __ Palignr(kScratchDoubleReg, i.InputSimd128Register(0), uint8_t{8}); \
+    __ EXTEND_MACRO_INSTR(kScratchDoubleReg, kScratchDoubleReg);          \
+    __ Palignr(dst, i.InputSimd128Register(1), uint8_t{8});               \
+    __ EXTEND_MACRO_INSTR(dst, dst);                                      \
+    __ Pmullw(dst, kScratchDoubleReg);                                    \
+  } while (false)
+
+// 1. Multiply low word into scratch.
+// 2. Multiply high word (can be signed or unsigned) into dst.
+// 3. Unpack and interleave scratch and dst into dst.
+#define ASSEMBLE_SIMD_I32X4_EXT_MUL(MUL_HIGH_INSTR, UNPACK_INSTR) \
+  do {                                                            \
+    XMMRegister dst = i.OutputSimd128Register();                  \
+    XMMRegister src0 = i.InputSimd128Register(0);                 \
+    XMMRegister src1 = i.InputSimd128Register(1);                 \
+    if (CpuFeatures::IsSupported(AVX)) {                          \
+      CpuFeatureScope avx_scope(tasm(), AVX);                     \
+      __ vpmullw(kScratchDoubleReg, src0, src1);                  \
+      __ v##MUL_HIGH_INSTR(dst, src0, src1);                      \
+      __ v##UNPACK_INSTR(dst, kScratchDoubleReg, dst);            \
+    } else {                                                      \
+      DCHECK_EQ(dst, src0);                                       \
+      __ movdqu(kScratchDoubleReg, src0);                         \
+      __ pmullw(kScratchDoubleReg, src1);                         \
+      __ MUL_HIGH_INSTR(dst, src1);                               \
+      __ UNPACK_INSTR(kScratchDoubleReg, dst);                    \
+      __ movdqu(dst, kScratchDoubleReg);                          \
+    }                                                             \
+  } while (false)
+
+// 1. Unpack src0, src0 into even-number elements of scratch.
+// 2. Unpack src1, src1 into even-number elements of dst.
+// 3. Multiply 1. with 2.
+// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
+// We only need SSE4_1 for pmuldq (signed ext mul), but to reduce macro
+// duplication we enable it in all cases.
+#define ASSEMBLE_SIMD_I64X2_EXT_MUL(UNPACK_INSTR, MUL_INSTR, SHUFFLE_CONST) \
+  do {                                                                      \
+    XMMRegister dst = i.OutputSimd128Register();                            \
+    XMMRegister src0 = i.InputSimd128Register(0);                           \
+    XMMRegister src1 = i.InputSimd128Register(1);                           \
+    if (CpuFeatures::IsSupported(AVX)) {                                    \
+      CpuFeatureScope avx_scope(tasm(), AVX);                               \
+      __ v##UNPACK_INSTR(kScratchDoubleReg, src0, src0);                    \
+      __ v##UNPACK_INSTR(dst, src1, src1);                                  \
+      __ v##MUL_INSTR(dst, kScratchDoubleReg, dst);                         \
+    } else {                                                                \
+      CpuFeatureScope avx_scope(tasm(), SSE4_1);                            \
+      DCHECK_EQ(dst, src0);                                                 \
+      __ pshufd(kScratchDoubleReg, src0, SHUFFLE_CONST);                    \
+      __ pshufd(dst, src1, SHUFFLE_CONST);                                  \
+      __ MUL_INSTR(dst, kScratchDoubleReg);                                 \
+    }                                                                       \
+  } while (false)
+
 void CodeGenerator::AssembleDeconstructFrame() {
   unwinding_info_writer_.MarkFrameDeconstructed(__ pc_offset());
   __ movq(rsp, rbp);
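ASSEMBLE_SIMD_I32X4_EXT_MUL above relies on pmullw and pmulhw/pmulhuw producing, per 16-bit lane, the low and high halves of the full 32-bit product, which punpcklwd/punpckhwd then interleave into the widened lanes. A minimal scalar sketch of that identity for one signed lane; the function name is illustrative only.

#include <cstdint>

// Illustrative scalar model of the identity used by the macro: for 16-bit
// inputs, pmullw yields the low 16 bits of the product and pmulhw (signed)
// or pmulhuw (unsigned) yields the high 16 bits; interleaving the two words
// reassembles the full 32-bit product per lane.
int32_t FullProductS16(int16_t a, int16_t b) {
  int32_t product = static_cast<int32_t>(a) * static_cast<int32_t>(b);
  uint16_t lo = static_cast<uint16_t>(product);        // pmullw lane
  uint16_t hi = static_cast<uint16_t>(product >> 16);  // pmulhw lane
  uint32_t interleaved =
      static_cast<uint32_t>(lo) | (static_cast<uint32_t>(hi) << 16);
  return static_cast<int32_t>(interleaved);  // equals `product`
}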
@@ -2810,6 +2877,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       ASSEMBLE_SIMD_SHIFT(psrlq, 6);
       break;
     }
+    case kX64I64x2ExtMulLowI32x4S: {
+      ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckldq, pmuldq, 0x50);
+      break;
+    }
+    case kX64I64x2ExtMulHighI32x4S: {
+      ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckhdq, pmuldq, 0xFA);
+      break;
+    }
+    case kX64I64x2ExtMulLowI32x4U: {
+      ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckldq, pmuludq, 0x50);
+      break;
+    }
+    case kX64I64x2ExtMulHighI32x4U: {
+      ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckhdq, pmuludq, 0xFA);
+      break;
+    }
     case kX64I32x4Splat: {
       XMMRegister dst = i.OutputSimd128Register();
       if (HasRegisterInput(instr, 0)) {
@@ -3198,6 +3281,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ shrq(dst, Immediate(8));
       break;
     }
+    case kX64I16x8ExtMulLowI8x16S: {
+      ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(Pmovsxbw);
+      break;
+    }
+    case kX64I16x8ExtMulHighI8x16S: {
+      ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(Pmovsxbw);
+      break;
+    }
+    case kX64I16x8ExtMulLowI8x16U: {
+      ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(Pmovzxbw);
+      break;
+    }
+    case kX64I16x8ExtMulHighI8x16U: {
+      ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(Pmovzxbw);
+      break;
+    }
     case kX64I8x16Splat: {
       XMMRegister dst = i.OutputSimd128Register();
       if (HasRegisterInput(instr, 0)) {
@@ -3528,6 +3627,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                   i.InputSimd128Register(1), i.InputSimd128Register(2));
       break;
     }
+    case kX64I32x4ExtMulLowI16x8S: {
+      ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhw, punpcklwd);
+      break;
+    }
+    case kX64I32x4ExtMulHighI16x8S: {
+      ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhw, punpckhwd);
+      break;
+    }
+    case kX64I32x4ExtMulLowI16x8U: {
+      ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhuw, punpcklwd);
+      break;
+    }
+    case kX64I32x4ExtMulHighI16x8U: {
+      ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhuw, punpckhwd);
+      break;
+    }
     case kX64I64x2SignSelect: {
       __ Blendvpd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputSimd128Register(1), i.InputSimd128Register(2));
@@ -4113,6 +4228,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
   return kSuccess;
 }  // NOLINT(readability/fn_size)
 
+#undef ASSEMBLE_SIMD_I64X2_EXT_MUL
+#undef ASSEMBLE_SIMD_I32X4_EXT_MUL
+#undef ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH
+#undef ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW
+#undef ASSEMBLE_PINSR
 #undef ASSEMBLE_UNOP
 #undef ASSEMBLE_BINOP
 #undef ASSEMBLE_COMPARE
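In the kX64I64x2ExtMul cases above, the pshufd immediates do the lane selection for the non-AVX path: 0x50 (binary 01 01 00 00) copies dwords {0, 0, 1, 1} of the source and 0xFA (11 11 10 10) copies {2, 2, 3, 3}, after which pmuludq/pmuldq multiply only the even-indexed dwords of each operand into 64-bit lanes. A minimal scalar sketch of the unsigned low variant; the helper name and array types are illustrative only.

#include <array>
#include <cstdint>

// Illustrative scalar model of i64x2.extmul_low_i32x4_u as lowered above:
// pshufd 0x50 places lanes 0 and 1 of each input in the even dword slots,
// and pmuludq multiplies those even dwords into two 64-bit products.
std::array<uint64_t, 2> ExtMulLowI32x4U(const std::array<uint32_t, 4>& a,
                                        const std::array<uint32_t, 4>& b) {
  std::array<uint64_t, 2> result = {static_cast<uint64_t>(a[0]) * b[0],
                                    static_cast<uint64_t>(a[1]) * b[1]};
  return result;
}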
diff --git a/src/compiler/backend/x64/instruction-codes-x64.h b/src/compiler/backend/x64/instruction-codes-x64.h
index f1958e8141..17dbcd45b5 100644
--- a/src/compiler/backend/x64/instruction-codes-x64.h
+++ b/src/compiler/backend/x64/instruction-codes-x64.h
@@ -212,6 +212,10 @@ namespace compiler {
   V(X64I64x2Eq)                    \
   V(X64I64x2ShrU)                  \
   V(X64I64x2SignSelect)            \
+  V(X64I64x2ExtMulLowI32x4S)       \
+  V(X64I64x2ExtMulHighI32x4S)      \
+  V(X64I64x2ExtMulLowI32x4U)       \
+  V(X64I64x2ExtMulHighI32x4U)      \
   V(X64I32x4Splat)                 \
   V(X64I32x4ExtractLane)           \
   V(X64I32x4SConvertF32x4)         \
@@ -242,6 +246,10 @@ namespace compiler {
   V(X64I32x4BitMask)               \
   V(X64I32x4DotI16x8S)             \
   V(X64I32x4SignSelect)            \
+  V(X64I32x4ExtMulLowI16x8S)       \
+  V(X64I32x4ExtMulHighI16x8S)      \
+  V(X64I32x4ExtMulLowI16x8U)       \
+  V(X64I32x4ExtMulHighI16x8U)      \
   V(X64I16x8Splat)                 \
   V(X64I16x8ExtractLaneS)          \
   V(X64I16x8SConvertI8x16Low)      \
@@ -276,6 +284,10 @@ namespace compiler {
   V(X64I16x8Abs)                   \
   V(X64I16x8BitMask)               \
   V(X64I16x8SignSelect)            \
+  V(X64I16x8ExtMulLowI8x16S)       \
+  V(X64I16x8ExtMulHighI8x16S)      \
+  V(X64I16x8ExtMulLowI8x16U)       \
+  V(X64I16x8ExtMulHighI8x16U)      \
   V(X64I8x16Splat)                 \
   V(X64I8x16ExtractLaneS)          \
   V(X64Pinsrb)                     \
diff --git a/src/compiler/backend/x64/instruction-scheduler-x64.cc b/src/compiler/backend/x64/instruction-scheduler-x64.cc
index 2af0877e53..590e2f2888 100644
--- a/src/compiler/backend/x64/instruction-scheduler-x64.cc
+++ b/src/compiler/backend/x64/instruction-scheduler-x64.cc
@@ -188,6 +188,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I64x2Eq:
     case kX64I64x2ShrU:
     case kX64I64x2SignSelect:
+    case kX64I64x2ExtMulLowI32x4S:
+    case kX64I64x2ExtMulHighI32x4S:
+    case kX64I64x2ExtMulLowI32x4U:
+    case kX64I64x2ExtMulHighI32x4U:
    case kX64I32x4Splat:
     case kX64I32x4ExtractLane:
     case kX64I32x4SConvertF32x4:
@@ -218,6 +222,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I32x4BitMask:
     case kX64I32x4DotI16x8S:
     case kX64I32x4SignSelect:
+    case kX64I32x4ExtMulLowI16x8S:
+    case kX64I32x4ExtMulHighI16x8S:
+    case kX64I32x4ExtMulLowI16x8U:
+    case kX64I32x4ExtMulHighI16x8U:
     case kX64I16x8Splat:
     case kX64I16x8ExtractLaneS:
     case kX64I16x8SConvertI8x16Low:
@@ -252,6 +260,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I16x8Abs:
     case kX64I16x8BitMask:
     case kX64I16x8SignSelect:
+    case kX64I16x8ExtMulLowI8x16S:
+    case kX64I16x8ExtMulHighI8x16S:
+    case kX64I16x8ExtMulLowI8x16U:
+    case kX64I16x8ExtMulHighI8x16U:
     case kX64I8x16Splat:
     case kX64I8x16ExtractLaneS:
     case kX64I8x16SConvertI16x8:
diff --git a/src/compiler/backend/x64/instruction-selector-x64.cc b/src/compiler/backend/x64/instruction-selector-x64.cc
index b147fe22ac..638e82f22a 100644
--- a/src/compiler/backend/x64/instruction-selector-x64.cc
+++ b/src/compiler/backend/x64/instruction-selector-x64.cc
@@ -2819,6 +2819,10 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I64x2Add)               \
   V(I64x2Sub)               \
   V(I64x2Eq)                \
+  V(I64x2ExtMulLowI32x4S)   \
+  V(I64x2ExtMulHighI32x4S)  \
+  V(I64x2ExtMulLowI32x4U)   \
+  V(I64x2ExtMulHighI32x4U)  \
   V(I32x4Add)               \
   V(I32x4AddHoriz)          \
   V(I32x4Sub)               \
@@ -2830,6 +2834,10 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I32x4MinU)              \
   V(I32x4MaxU)              \
   V(I32x4DotI16x8S)         \
+  V(I32x4ExtMulLowI16x8S)   \
+  V(I32x4ExtMulHighI16x8S)  \
+  V(I32x4ExtMulLowI16x8U)   \
+  V(I32x4ExtMulHighI16x8U)  \
   V(I16x8SConvertI32x4)     \
   V(I16x8UConvertI32x4)     \
   V(I16x8Add)               \
@@ -2847,6 +2855,10 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I16x8MinU)              \
   V(I16x8MaxU)              \
   V(I16x8RoundingAverageU)  \
+  V(I16x8ExtMulLowI8x16S)   \
+  V(I16x8ExtMulHighI8x16S)  \
+  V(I16x8ExtMulLowI8x16U)   \
+  V(I16x8ExtMulHighI8x16U)  \
   V(I8x16SConvertI16x8)     \
   V(I8x16UConvertI16x8)     \
   V(I8x16Add)               \
diff --git a/src/diagnostics/x64/disasm-x64.cc b/src/diagnostics/x64/disasm-x64.cc
index 7ae330c3ea..292bf9a7a9 100644
--- a/src/diagnostics/x64/disasm-x64.cc
+++ b/src/diagnostics/x64/disasm-x64.cc
@@ -2041,6 +2041,10 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) {
     mnemonic = "psrad";
   } else if (opcode == 0xE3) {
     mnemonic = "pavgw";
+  } else if (opcode == 0xE4) {
+    mnemonic = "pmulhuw";
+  } else if (opcode == 0xE5) {
+    mnemonic = "pmulhw";
   } else if (opcode == 0xE8) {
     mnemonic = "psubsb";
   } else if (opcode == 0xE9) {
diff --git a/test/cctest/wasm/test-run-wasm-simd.cc b/test/cctest/wasm/test-run-wasm-simd.cc
index 13c1a34889..3ec67d224b 100644
--- a/test/cctest/wasm/test-run-wasm-simd.cc
+++ b/test/cctest/wasm/test-run-wasm-simd.cc
@@ -2333,7 +2333,7 @@ WASM_SIMD_TEST_NO_LOWERING(I16x8Q15MulRSatS) {
 }
 #endif  // V8_TARGET_ARCH_ARM64
 
-#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
+#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64
 // TODO(v8:11008) Prototype extended multiplication.
 namespace {
 enum class MulHalf { kLow, kHigh };
@@ -2445,7 +2445,7 @@ WASM_SIMD_TEST_NO_LOWERING(I64x2ExtMulHighI32x4U) {
                 kExprI64x2ExtMulHighI32x4U, MultiplyLong,
                 kExprI32x4Splat, MulHalf::kHigh);
 }
-#endif  // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
+#endif  // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64
 
 WASM_SIMD_TEST(I32x4DotI16x8S) {
   WasmRunner<int32_t, int32_t, int32_t> r(execution_tier, lower_simd);