[wasm-simd][x64] Prototype extended multiply

Bug: v8:11008
Change-Id: Ic72e71eb10a5b47c97467bf6d25e55d20425273a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2575784
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71686}
Author: Zhi An Ng <zhin@chromium.org>, 2020-12-08 06:34:53 +00:00 (committed by Commit Bot)
Parent: b6c1ef3863
Commit: baf7e9029e
8 changed files with 167 additions and 4 deletions
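
For context, the wasm SIMD extended-multiply instructions prototyped here take the low or high half of the lanes of two narrower vectors, widen each lane to twice its width, and multiply lane-wise, so the full product is always representable. A minimal scalar sketch of i32x4.extmul_low_i16x8_s (illustrative only, not V8 code):

#include <array>
#include <cassert>
#include <cstdint>

// Scalar model of i32x4.extmul_low_i16x8_s: sign-extend the low four
// 16-bit lanes of each input to 32 bits, then multiply lane-wise.
// A 16x16 -> 32 bit product cannot overflow, so the result is exact.
std::array<int32_t, 4> ExtMulLowI16x8S(const std::array<int16_t, 8>& a,
                                       const std::array<int16_t, 8>& b) {
  std::array<int32_t, 4> result;
  for (int i = 0; i < 4; ++i) {
    result[i] = static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
  }
  return result;
}

int main() {
  std::array<int16_t, 8> a{}, b{};
  a[0] = b[0] = -32768;  // INT16_MIN
  // A plain 16-bit multiply would wrap to 0; the widened product is exact.
  assert(ExtMulLowI16x8S(a, b)[0] == 1073741824);  // 2^30
  return 0;
}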


@@ -77,6 +77,8 @@
V(pminsw, 66, 0F, EA) \
V(pminub, 66, 0F, DA) \
V(pmullw, 66, 0F, D5) \
V(pmulhuw, 66, 0F, E4) \
V(pmulhw, 66, 0F, E5) \
V(pmuludq, 66, 0F, F4) \
V(psllw, 66, 0F, F1) \
V(pslld, 66, 0F, F2) \
@@ -143,6 +145,7 @@
V(pabsd, 66, 0F, 38, 1E)
#define SSE4_INSTRUCTION_LIST(V) \
V(pmuldq, 66, 0F, 38, 28) \
V(pcmpeqq, 66, 0F, 38, 29) \
V(packusdw, 66, 0F, 38, 2B) \
V(pminsb, 66, 0F, 38, 38) \
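
The two SSE2 additions above, pmulhuw and pmulhw, return the high 16 bits of each unsigned or signed 16-bit product. Paired with pmullw (which keeps the low 16 bits) and a word unpack, they rebuild the full 32-bit products that the i32x4 extended-multiply lowering below depends on. A scalar sketch of that identity (illustrative only):

#include <cassert>
#include <cstdint>

// One lane of the pmullw/pmulhw/punpck*wd combination: the low and high
// 16-bit halves of a 16x16 product, interleaved, give the exact 32-bit
// product.
int32_t FullProductFromHalves(int16_t a, int16_t b) {
  int32_t wide = static_cast<int32_t>(a) * static_cast<int32_t>(b);
  uint16_t low = static_cast<uint16_t>(wide);         // pmullw lane
  uint16_t high = static_cast<uint16_t>(wide >> 16);  // pmulhw lane
  uint32_t bits = (static_cast<uint32_t>(high) << 16) | low;  // punpcklwd
  return static_cast<int32_t>(bits);
}

int main() {
  assert(FullProductFromHalves(-1234, 5678) == -1234 * 5678);
  return 0;
}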


@@ -2737,7 +2737,7 @@ void InstructionSelector::VisitI64x2Eq(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_S390X && !V8_TARGET_ARCH_ARM64
// && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM
-#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
+#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64
// TODO(v8:11008) Prototype extended multiplication.
void InstructionSelector::VisitI64x2ExtMulLowI32x4S(Node* node) {
UNIMPLEMENTED();
@@ -2775,7 +2775,7 @@ void InstructionSelector::VisitI16x8ExtMulLowI8x16U(Node* node) {
void InstructionSelector::VisitI16x8ExtMulHighI8x16U(Node* node) {
UNIMPLEMENTED();
}
-#endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
+#endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64
#if !V8_TARGET_ARCH_ARM64
// TODO(v8:10971) Prototype i16x8.q15mulr_sat_s


@@ -5,6 +5,7 @@
#include <limits>
#include "src/base/overflowing-math.h"
#include "src/codegen/assembler.h"
#include "src/codegen/macro-assembler.h"
#include "src/codegen/optimized-compilation-info.h"
#include "src/codegen/x64/assembler-x64.h"
@@ -679,6 +680,72 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen,
} \
} while (false)
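// Widen the low (or, via Palignr below, high) eight bytes of each input to
// words with a sign- or zero-extending move; a single 16-bit multiply is
// then exact, because an 8x8-bit product always fits in 16 bits.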
#define ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(EXTEND_MACRO_INSTR) \
do { \
XMMRegister dst = i.OutputSimd128Register(); \
__ EXTEND_MACRO_INSTR(kScratchDoubleReg, i.InputSimd128Register(0)); \
__ EXTEND_MACRO_INSTR(dst, i.InputSimd128Register(1)); \
__ Pmullw(dst, kScratchDoubleReg); \
} while (false)
#define ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(EXTEND_MACRO_INSTR) \
do { \
XMMRegister dst = i.OutputSimd128Register(); \
__ Palignr(kScratchDoubleReg, i.InputSimd128Register(0), uint8_t{8}); \
__ EXTEND_MACRO_INSTR(kScratchDoubleReg, kScratchDoubleReg); \
__ Palignr(dst, i.InputSimd128Register(1), uint8_t{8}); \
__ EXTEND_MACRO_INSTR(dst, dst); \
__ Pmullw(dst, kScratchDoubleReg); \
} while (false)
// 1. Multiply low word into scratch.
// 2. Multiply high word (can be signed or unsigned) into dst.
// 3. Unpack and interleave scratch and dst into dst.
#define ASSEMBLE_SIMD_I32X4_EXT_MUL(MUL_HIGH_INSTR, UNPACK_INSTR) \
do { \
XMMRegister dst = i.OutputSimd128Register(); \
XMMRegister src0 = i.InputSimd128Register(0); \
XMMRegister src1 = i.InputSimd128Register(1); \
if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope avx_scope(tasm(), AVX); \
__ vpmullw(kScratchDoubleReg, src0, src1); \
__ v##MUL_HIGH_INSTR(dst, src0, src1); \
__ v##UNPACK_INSTR(dst, kScratchDoubleReg, dst); \
} else { \
DCHECK_EQ(dst, src0); \
__ movdqu(kScratchDoubleReg, src0); \
__ pmullw(kScratchDoubleReg, src1); \
__ MUL_HIGH_INSTR(dst, src1); \
__ UNPACK_INSTR(kScratchDoubleReg, dst); \
__ movdqu(dst, kScratchDoubleReg); \
} \
} while (false)
// 1. Unpack src0, src0 into even-numbered elements of scratch.
// 2. Unpack src1, src1 into even-numbered elements of dst.
// 3. Multiply 1. with 2.
// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
// We only need SSE4_1 for pmuldq (signed ext mul), but to reduce macro
// duplication we enable it in all cases.
#define ASSEMBLE_SIMD_I64X2_EXT_MUL(UNPACK_INSTR, MUL_INSTR, SHUFFLE_CONST) \
do { \
XMMRegister dst = i.OutputSimd128Register(); \
XMMRegister src0 = i.InputSimd128Register(0); \
XMMRegister src1 = i.InputSimd128Register(1); \
if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope avx_scope(tasm(), AVX); \
__ v##UNPACK_INSTR(kScratchDoubleReg, src0, src0); \
__ v##UNPACK_INSTR(dst, src1, src1); \
__ v##MUL_INSTR(dst, kScratchDoubleReg, dst); \
} else { \
CpuFeatureScope sse_scope(tasm(), SSE4_1); \
DCHECK_EQ(dst, src0); \
__ pshufd(kScratchDoubleReg, src0, SHUFFLE_CONST); \
__ pshufd(dst, src1, SHUFFLE_CONST); \
__ MUL_INSTR(dst, kScratchDoubleReg); \
} \
} while (false)
void CodeGenerator::AssembleDeconstructFrame() {
unwinding_info_writer_.MarkFrameDeconstructed(__ pc_offset());
__ movq(rsp, rbp);
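
In the non-AVX path of ASSEMBLE_SIMD_I64X2_EXT_MUL above, pmuludq and pmuldq read only the even-numbered 32-bit lanes, so pshufd is used to park the desired input lanes there without clobbering a source register. A scalar sketch of what the two shuffle immediates do (illustrative only):

#include <array>
#include <cassert>
#include <cstdint>

// Scalar model of pshufd: each two-bit field of the immediate selects the
// source 32-bit lane for one destination lane.
std::array<uint32_t, 4> Pshufd(const std::array<uint32_t, 4>& src,
                               uint8_t imm) {
  std::array<uint32_t, 4> dst;
  for (int i = 0; i < 4; ++i) dst[i] = src[(imm >> (2 * i)) & 3];
  return dst;
}

int main() {
  std::array<uint32_t, 4> v = {10, 11, 12, 13};
  // 0x50 = 0b01'01'00'00 -> [v0, v0, v1, v1]; lanes 0 and 2 (the lanes
  // pmuludq/pmuldq actually read) now hold the two low input lanes.
  std::array<uint32_t, 4> lo = Pshufd(v, 0x50);
  assert(lo[0] == 10 && lo[2] == 11);
  // 0xFA = 0b11'11'10'10 -> [v2, v2, v3, v3]; lanes 0 and 2 hold the highs.
  std::array<uint32_t, 4> hi = Pshufd(v, 0xFA);
  assert(hi[0] == 12 && hi[2] == 13);
  return 0;
}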
@@ -2810,6 +2877,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
ASSEMBLE_SIMD_SHIFT(psrlq, 6);
break;
}
case kX64I64x2ExtMulLowI32x4S: {
ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckldq, pmuldq, 0x50);
break;
}
case kX64I64x2ExtMulHighI32x4S: {
ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckhdq, pmuldq, 0xFA);
break;
}
case kX64I64x2ExtMulLowI32x4U: {
ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckldq, pmuludq, 0x50);
break;
}
case kX64I64x2ExtMulHighI32x4U: {
ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckhdq, pmuludq, 0xFA);
break;
}
case kX64I32x4Splat: {
XMMRegister dst = i.OutputSimd128Register();
if (HasRegisterInput(instr, 0)) {
@@ -3198,6 +3281,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ shrq(dst, Immediate(8));
break;
}
case kX64I16x8ExtMulLowI8x16S: {
ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(Pmovsxbw);
break;
}
case kX64I16x8ExtMulHighI8x16S: {
ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(Pmovsxbw);
break;
}
case kX64I16x8ExtMulLowI8x16U: {
ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(Pmovzxbw);
break;
}
case kX64I16x8ExtMulHighI8x16U: {
ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(Pmovzxbw);
break;
}
case kX64I8x16Splat: {
XMMRegister dst = i.OutputSimd128Register();
if (HasRegisterInput(instr, 0)) {
@@ -3528,6 +3627,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1), i.InputSimd128Register(2));
break;
}
case kX64I32x4ExtMulLowI16x8S: {
ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhw, punpcklwd);
break;
}
case kX64I32x4ExtMulHighI16x8S: {
ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhw, punpckhwd);
break;
}
case kX64I32x4ExtMulLowI16x8U: {
ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhuw, punpcklwd);
break;
}
case kX64I32x4ExtMulHighI16x8U: {
ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhuw, punpckhwd);
break;
}
case kX64I64x2SignSelect: {
__ Blendvpd(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), i.InputSimd128Register(2));
@@ -4113,6 +4228,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
return kSuccess;
} // NOLINT(readability/fn_size)
#undef ASSEMBLE_SIMD_I64X2_EXT_MUL
#undef ASSEMBLE_SIMD_I32X4_EXT_MUL
#undef ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH
#undef ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW
#undef ASSEMBLE_PINSR
#undef ASSEMBLE_UNOP
#undef ASSEMBLE_BINOP
#undef ASSEMBLE_COMPARE
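
The kX64I16x8ExtMul* cases above take a different route from the wider variants: rather than combining separate low/high multiplies, they widen first (Pmovsxbw/Pmovzxbw) and multiply once, which is exact because an 8x8-bit product always fits in 16 bits. A scalar check of that bound (illustrative only):

#include <cassert>
#include <cstdint>

// One lane of the widen-then-multiply scheme: |a * b| <= 128 * 128 = 16384,
// comfortably inside int16_t, so pmovsxbw/pmovzxbw followed by pmullw
// loses nothing.
int16_t ExtMulLaneS(int8_t a, int8_t b) {
  int16_t wa = a;  // models one lane of pmovsxbw
  int16_t wb = b;
  return static_cast<int16_t>(wa * wb);  // models one lane of pmullw
}

int main() {
  assert(ExtMulLaneS(-128, -128) == 16384);  // the extreme case still fits
  assert(ExtMulLaneS(127, -128) == -16256);
  return 0;
}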


@@ -212,6 +212,10 @@ namespace compiler {
V(X64I64x2Eq) \
V(X64I64x2ShrU) \
V(X64I64x2SignSelect) \
V(X64I64x2ExtMulLowI32x4S) \
V(X64I64x2ExtMulHighI32x4S) \
V(X64I64x2ExtMulLowI32x4U) \
V(X64I64x2ExtMulHighI32x4U) \
V(X64I32x4Splat) \
V(X64I32x4ExtractLane) \
V(X64I32x4SConvertF32x4) \
@@ -242,6 +246,10 @@ namespace compiler {
V(X64I32x4BitMask) \
V(X64I32x4DotI16x8S) \
V(X64I32x4SignSelect) \
V(X64I32x4ExtMulLowI16x8S) \
V(X64I32x4ExtMulHighI16x8S) \
V(X64I32x4ExtMulLowI16x8U) \
V(X64I32x4ExtMulHighI16x8U) \
V(X64I16x8Splat) \
V(X64I16x8ExtractLaneS) \
V(X64I16x8SConvertI8x16Low) \
@@ -276,6 +284,10 @@ namespace compiler {
V(X64I16x8Abs) \
V(X64I16x8BitMask) \
V(X64I16x8SignSelect) \
V(X64I16x8ExtMulLowI8x16S) \
V(X64I16x8ExtMulHighI8x16S) \
V(X64I16x8ExtMulLowI8x16U) \
V(X64I16x8ExtMulHighI8x16U) \
V(X64I8x16Splat) \
V(X64I8x16ExtractLaneS) \
V(X64Pinsrb) \


@@ -188,6 +188,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64I64x2Eq:
case kX64I64x2ShrU:
case kX64I64x2SignSelect:
case kX64I64x2ExtMulLowI32x4S:
case kX64I64x2ExtMulHighI32x4S:
case kX64I64x2ExtMulLowI32x4U:
case kX64I64x2ExtMulHighI32x4U:
case kX64I32x4Splat:
case kX64I32x4ExtractLane:
case kX64I32x4SConvertF32x4:
@@ -218,6 +222,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64I32x4BitMask:
case kX64I32x4DotI16x8S:
case kX64I32x4SignSelect:
case kX64I32x4ExtMulLowI16x8S:
case kX64I32x4ExtMulHighI16x8S:
case kX64I32x4ExtMulLowI16x8U:
case kX64I32x4ExtMulHighI16x8U:
case kX64I16x8Splat:
case kX64I16x8ExtractLaneS:
case kX64I16x8SConvertI8x16Low:
@@ -252,6 +260,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64I16x8Abs:
case kX64I16x8BitMask:
case kX64I16x8SignSelect:
case kX64I16x8ExtMulLowI8x16S:
case kX64I16x8ExtMulHighI8x16S:
case kX64I16x8ExtMulLowI8x16U:
case kX64I16x8ExtMulHighI8x16U:
case kX64I8x16Splat:
case kX64I8x16ExtractLaneS:
case kX64I8x16SConvertI16x8:


@@ -2819,6 +2819,10 @@ VISIT_ATOMIC_BINOP(Xor)
V(I64x2Add) \
V(I64x2Sub) \
V(I64x2Eq) \
V(I64x2ExtMulLowI32x4S) \
V(I64x2ExtMulHighI32x4S) \
V(I64x2ExtMulLowI32x4U) \
V(I64x2ExtMulHighI32x4U) \
V(I32x4Add) \
V(I32x4AddHoriz) \
V(I32x4Sub) \
@@ -2830,6 +2834,10 @@ VISIT_ATOMIC_BINOP(Xor)
V(I32x4MinU) \
V(I32x4MaxU) \
V(I32x4DotI16x8S) \
V(I32x4ExtMulLowI16x8S) \
V(I32x4ExtMulHighI16x8S) \
V(I32x4ExtMulLowI16x8U) \
V(I32x4ExtMulHighI16x8U) \
V(I16x8SConvertI32x4) \
V(I16x8UConvertI32x4) \
V(I16x8Add) \
@@ -2847,6 +2855,10 @@ VISIT_ATOMIC_BINOP(Xor)
V(I16x8MinU) \
V(I16x8MaxU) \
V(I16x8RoundingAverageU) \
V(I16x8ExtMulLowI8x16S) \
V(I16x8ExtMulHighI8x16S) \
V(I16x8ExtMulLowI8x16U) \
V(I16x8ExtMulHighI8x16U) \
V(I8x16SConvertI16x8) \
V(I8x16UConvertI16x8) \
V(I8x16Add) \
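
Adding the twelve ExtMul entries to this list is all the x64 instruction selector needs, because the list is expanded by a visitor macro that stamps out one Visit method per entry. A self-contained sketch of that X-macro pattern (the entry names mirror the diff; the macro body here is a simplified stand-in, not V8's actual visitor):

#include <iostream>

// Minimal model of the X-macro pattern used by SIMD_BINOP_LIST: the list
// supplies names, and a second macro generates one handler per entry.
#define SIMD_BINOP_LIST(V) \
  V(I64x2ExtMulLowI32x4S)  \
  V(I64x2ExtMulHighI32x4S)

#define DECLARE_VISIT(Name) \
  void Visit##Name() { std::cout << "emit kX64" #Name "\n"; }
SIMD_BINOP_LIST(DECLARE_VISIT)
#undef DECLARE_VISIT

int main() {
  VisitI64x2ExtMulLowI32x4S();   // prints: emit kX64I64x2ExtMulLowI32x4S
  VisitI64x2ExtMulHighI32x4S();  // prints: emit kX64I64x2ExtMulHighI32x4S
  return 0;
}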


@@ -2041,6 +2041,10 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) {
mnemonic = "psrad";
} else if (opcode == 0xE3) {
mnemonic = "pavgw";
} else if (opcode == 0xE4) {
mnemonic = "pmulhuw";
} else if (opcode == 0xE5) {
mnemonic = "pmulhw";
} else if (opcode == 0xE8) {
mnemonic = "psubsb";
} else if (opcode == 0xE9) {


@@ -2333,7 +2333,7 @@ WASM_SIMD_TEST_NO_LOWERING(I16x8Q15MulRSatS) {
}
#endif // V8_TARGET_ARCH_ARM64
-#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
+#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64
// TODO(v8:11008) Prototype extended multiplication.
namespace {
enum class MulHalf { kLow, kHigh };
@@ -2445,7 +2445,7 @@ WASM_SIMD_TEST_NO_LOWERING(I64x2ExtMulHighI32x4U) {
kExprI64x2ExtMulHighI32x4U, MultiplyLong,
kExprI32x4Splat, MulHalf::kHigh);
}
-#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
+#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64
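
The tests in the block above drive each new instruction with splatted inputs and compare every output lane against a widened scalar product. A sketch of that reference computation (the MultiplyLong name comes from the test file; this signature is an assumption for illustration):

#include <cassert>
#include <cstdint>

// Presumed shape of the reference used by the ext-mul tests: widen both
// operands, multiply, and compare with each 64-bit output lane.
template <typename Wide, typename Narrow>
Wide MultiplyLong(Narrow a, Narrow b) {
  return static_cast<Wide>(a) * static_cast<Wide>(b);
}

int main() {
  // For i64x2.extmul_high_i32x4_u with splatted inputs, every output lane
  // equals the same widened product.
  uint32_t a = 0xFFFFFFFFu, b = 2;
  assert((MultiplyLong<uint64_t, uint32_t>(a, b)) == 0x1FFFFFFFEull);
  return 0;
}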
WASM_SIMD_TEST(I32x4DotI16x8S) {
WasmRunner<int32_t, int16_t, int16_t> r(execution_tier, lower_simd);