[wasm-simd][x64] Prototype extended multiply
Bug: v8:11008
Change-Id: Ic72e71eb10a5b47c97467bf6d25e55d20425273a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2575784
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71686}
parent b6c1ef3863
commit baf7e9029e
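The hunks below wire the wasm extended-multiply opcodes (extmul low/high, signed and unsigned, for i16x8, i32x4, and i64x2) through the x64 backend. As a reference for what these opcodes compute, here is a minimal scalar sketch of one lane shape (illustrative code, not part of the commit):

#include <cstdint>

// Scalar model of i64x2.extmul_low_i32x4_s: widen the two low i32 lanes
// of each operand to 64 bits, then multiply. The "high" variants read
// lanes 2 and 3 instead, and the "_u" variants zero-extend. The widened
// product can never overflow, which is the point of the instruction.
void I64x2ExtMulLowI32x4S(const int32_t a[4], const int32_t b[4],
                          int64_t out[2]) {
  for (int lane = 0; lane < 2; ++lane) {
    out[lane] = static_cast<int64_t>(a[lane]) * static_cast<int64_t>(b[lane]);
  }
}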
@@ -77,6 +77,8 @@
  V(pminsw, 66, 0F, EA) \
  V(pminub, 66, 0F, DA) \
  V(pmullw, 66, 0F, D5) \
  V(pmulhuw, 66, 0F, E4) \
  V(pmulhw, 66, 0F, E5) \
  V(pmuludq, 66, 0F, F4) \
  V(psllw, 66, 0F, F1) \
  V(pslld, 66, 0F, F2) \
@@ -143,6 +145,7 @@
  V(pabsd, 66, 0F, 38, 1E)

#define SSE4_INSTRUCTION_LIST(V) \
  V(pmuldq, 66, 0F, 38, 28) \
  V(pcmpeqq, 66, 0F, 38, 29) \
  V(packusdw, 66, 0F, 38, 2B) \
  V(pminsb, 66, 0F, 38, 38) \
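These instruction-list macros are consumed by passing a generator macro as V, so adding the pmuldq row above is all that is needed to grow a matching assembler entry point. A sketch of the usual pattern (hypothetical; the exact expansion and the sse4_instr helper signature in assembler-x64.h may differ):

// Hypothetical consumer: each row V(name, prefix, escape1, escape2,
// opcode) expands into one emitter method on the assembler.
#define DECLARE_SSE4_INSTRUCTION(instruction, prefix, escape1, escape2, \
                                 opcode)                                \
  void instruction(XMMRegister dst, XMMRegister src) {                  \
    sse4_instr(dst, src, 0x##prefix, 0x##escape1, 0x##escape2,          \
               0x##opcode);                                             \
  }
SSE4_INSTRUCTION_LIST(DECLARE_SSE4_INSTRUCTION)
#undef DECLARE_SSE4_INSTRUCTION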
@@ -2737,7 +2737,7 @@ void InstructionSelector::VisitI64x2Eq(Node* node) { UNIMPLEMENTED(); }
#endif  // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_S390X && !V8_TARGET_ARCH_ARM64
        // && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM

#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64
// TODO(v8:11008) Prototype extended multiplication.
void InstructionSelector::VisitI64x2ExtMulLowI32x4S(Node* node) {
  UNIMPLEMENTED();
@@ -2775,7 +2775,7 @@ void InstructionSelector::VisitI16x8ExtMulLowI8x16U(Node* node) {
void InstructionSelector::VisitI16x8ExtMulHighI8x16U(Node* node) {
  UNIMPLEMENTED();
}
#endif  // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
#endif  // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64

#if !V8_TARGET_ARCH_ARM64
// TODO(v8:10971) Prototype i16x8.q15mulr_sat_s
@@ -5,6 +5,7 @@
#include <limits>

#include "src/base/overflowing-math.h"
#include "src/codegen/assembler.h"
#include "src/codegen/macro-assembler.h"
#include "src/codegen/optimized-compilation-info.h"
#include "src/codegen/x64/assembler-x64.h"
@@ -679,6 +680,72 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen,
    } \
  } while (false)

// Widen the low 8 byte lanes of both inputs with the given extend op
// (Pmovsxbw or Pmovzxbw), then multiply word-wise.
#define ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(EXTEND_MACRO_INSTR) \
  do { \
    XMMRegister dst = i.OutputSimd128Register(); \
    __ EXTEND_MACRO_INSTR(kScratchDoubleReg, i.InputSimd128Register(0)); \
    __ EXTEND_MACRO_INSTR(dst, i.InputSimd128Register(1)); \
    __ Pmullw(dst, kScratchDoubleReg); \
  } while (false)
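What the LOW macro computes, as a scalar model (illustrative, not V8 code): after the byte lanes are widened to words, a plain 16-bit multiply is exact, since an i8 × i8 product always fits in 16 bits.

#include <cstdint>

// Scalar model of ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW with Pmovsxbw.
void I16x8ExtMulLowI8x16S(const int8_t a[16], const int8_t b[16],
                          int16_t out[8]) {
  for (int lane = 0; lane < 8; ++lane) {
    out[lane] = static_cast<int16_t>(int{a[lane]} * int{b[lane]});
  }
}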
// Shift the high 8 byte lanes of both inputs into the low half with
// Palignr, widen them with the given extend op, then multiply word-wise.
#define ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(EXTEND_MACRO_INSTR) \
  do { \
    XMMRegister dst = i.OutputSimd128Register(); \
    __ Palignr(kScratchDoubleReg, i.InputSimd128Register(0), uint8_t{8}); \
    __ EXTEND_MACRO_INSTR(kScratchDoubleReg, kScratchDoubleReg); \
    __ Palignr(dst, i.InputSimd128Register(1), uint8_t{8}); \
    __ EXTEND_MACRO_INSTR(dst, dst); \
    __ Pmullw(dst, kScratchDoubleReg); \
  } while (false)
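The HIGH variant uses Palignr with an immediate of 8 so the upper eight bytes of each input land in the low half of a register, then widens and multiplies exactly as above. A scalar model of the net effect (illustrative only):

#include <cstdint>

// Scalar model of ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH with Pmovzxbw:
// identical to the low variant, but reading byte lanes 8..15.
void I16x8ExtMulHighI8x16U(const uint8_t a[16], const uint8_t b[16],
                           uint16_t out[8]) {
  for (int lane = 0; lane < 8; ++lane) {
    out[lane] =
        static_cast<uint16_t>(unsigned{a[lane + 8]} * unsigned{b[lane + 8]});
  }
}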
// 1. Multiply low word into scratch.
// 2. Multiply high word (can be signed or unsigned) into dst.
// 3. Unpack and interleave scratch and dst into dst.
#define ASSEMBLE_SIMD_I32X4_EXT_MUL(MUL_HIGH_INSTR, UNPACK_INSTR) \
  do { \
    XMMRegister dst = i.OutputSimd128Register(); \
    XMMRegister src0 = i.InputSimd128Register(0); \
    XMMRegister src1 = i.InputSimd128Register(1); \
    if (CpuFeatures::IsSupported(AVX)) { \
      CpuFeatureScope avx_scope(tasm(), AVX); \
      __ vpmullw(kScratchDoubleReg, src0, src1); \
      __ v##MUL_HIGH_INSTR(dst, src0, src1); \
      __ v##UNPACK_INSTR(dst, kScratchDoubleReg, dst); \
    } else { \
      DCHECK_EQ(dst, src0); \
      __ movdqu(kScratchDoubleReg, src0); \
      __ pmullw(kScratchDoubleReg, src1); \
      __ MUL_HIGH_INSTR(dst, src1); \
      __ UNPACK_INSTR(kScratchDoubleReg, dst); \
      __ movdqu(dst, kScratchDoubleReg); \
    } \
  } while (false)
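The trick here: x64 has no widening 16-to-32 multiply, but pmullw yields the low 16 bits of each product and pmulhw/pmulhuw the high 16 bits, and the punpck instructions interleave those halves into full 32-bit lanes. A scalar model of one lane (illustrative):

#include <cstdint>

// Rebuild the exact 32-bit product from its pmullw and pmulhw halves.
int32_t ExtMulS16(int16_t a, int16_t b) {
  int32_t product = int32_t{a} * int32_t{b};           // exact result
  uint16_t lo = static_cast<uint16_t>(product);        // pmullw lane
  uint16_t hi = static_cast<uint16_t>(product >> 16);  // pmulhw lane
  // punpcklwd/punpckhwd interleave lo and hi into one 32-bit lane.
  return static_cast<int32_t>(uint32_t{hi} << 16 | lo);
}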
// 1. Unpack src0, src0 into even-number elements of scratch.
// 2. Unpack src1, src1 into even-number elements of dst.
// 3. Multiply 1. with 2.
// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
// We only need SSE4_1 for pmuldq (signed ext mul), but to reduce macro
// duplication we enable it in all cases.
#define ASSEMBLE_SIMD_I64X2_EXT_MUL(UNPACK_INSTR, MUL_INSTR, SHUFFLE_CONST) \
  do { \
    XMMRegister dst = i.OutputSimd128Register(); \
    XMMRegister src0 = i.InputSimd128Register(0); \
    XMMRegister src1 = i.InputSimd128Register(1); \
    if (CpuFeatures::IsSupported(AVX)) { \
      CpuFeatureScope avx_scope(tasm(), AVX); \
      __ v##UNPACK_INSTR(kScratchDoubleReg, src0, src0); \
      __ v##UNPACK_INSTR(dst, src1, src1); \
      __ v##MUL_INSTR(dst, kScratchDoubleReg, dst); \
    } else { \
      CpuFeatureScope avx_scope(tasm(), SSE4_1); \
      DCHECK_EQ(dst, src0); \
      __ pshufd(kScratchDoubleReg, src0, SHUFFLE_CONST); \
      __ pshufd(dst, src1, SHUFFLE_CONST); \
      __ MUL_INSTR(dst, kScratchDoubleReg); \
    } \
  } while (false)
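pshufd's byte immediate selects one source lane per destination lane, two bits at a time, which is why 0x50 and 0xFA appear in the cases further down: they feed the low or high input lanes into the even positions that pmuldq/pmuludq read. A model of the selection (illustrative):

#include <cstdint>

// 0x50 = 0b01'01'00'00 selects lanes {0, 0, 1, 1};
// 0xFA = 0b11'11'10'10 selects lanes {2, 2, 3, 3}.
void Pshufd(const uint32_t src[4], uint8_t imm, uint32_t dst[4]) {
  for (int lane = 0; lane < 4; ++lane) {
    dst[lane] = src[(imm >> (2 * lane)) & 3];
  }
}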
void CodeGenerator::AssembleDeconstructFrame() {
  unwinding_info_writer_.MarkFrameDeconstructed(__ pc_offset());
  __ movq(rsp, rbp);
@@ -2810,6 +2877,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      ASSEMBLE_SIMD_SHIFT(psrlq, 6);
      break;
    }
    case kX64I64x2ExtMulLowI32x4S: {
      ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckldq, pmuldq, 0x50);
      break;
    }
    case kX64I64x2ExtMulHighI32x4S: {
      ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckhdq, pmuldq, 0xFA);
      break;
    }
    case kX64I64x2ExtMulLowI32x4U: {
      ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckldq, pmuludq, 0x50);
      break;
    }
    case kX64I64x2ExtMulHighI32x4U: {
      ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckhdq, pmuludq, 0xFA);
      break;
    }
    case kX64I32x4Splat: {
      XMMRegister dst = i.OutputSimd128Register();
      if (HasRegisterInput(instr, 0)) {
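For reference, pmuludq (and its SSE4.1 signed sibling pmuldq) reads only the even 32-bit lanes of each operand and produces two full 64-bit products, which is why the shuffles duplicate the desired input lanes into positions 0 and 2. A sketch (illustrative):

#include <cstdint>

// pmuludq: two full 64-bit products from 32-bit lanes 0 and 2.
void Pmuludq(const uint32_t a[4], const uint32_t b[4], uint64_t dst[2]) {
  dst[0] = static_cast<uint64_t>(a[0]) * b[0];
  dst[1] = static_cast<uint64_t>(a[2]) * b[2];
}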
@@ -3198,6 +3281,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ shrq(dst, Immediate(8));
      break;
    }
    case kX64I16x8ExtMulLowI8x16S: {
      ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(Pmovsxbw);
      break;
    }
    case kX64I16x8ExtMulHighI8x16S: {
      ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(Pmovsxbw);
      break;
    }
    case kX64I16x8ExtMulLowI8x16U: {
      ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(Pmovzxbw);
      break;
    }
    case kX64I16x8ExtMulHighI8x16U: {
      ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(Pmovzxbw);
      break;
    }
    case kX64I8x16Splat: {
      XMMRegister dst = i.OutputSimd128Register();
      if (HasRegisterInput(instr, 0)) {
@@ -3528,6 +3627,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                  i.InputSimd128Register(1), i.InputSimd128Register(2));
      break;
    }
    case kX64I32x4ExtMulLowI16x8S: {
      ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhw, punpcklwd);
      break;
    }
    case kX64I32x4ExtMulHighI16x8S: {
      ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhw, punpckhwd);
      break;
    }
    case kX64I32x4ExtMulLowI16x8U: {
      ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhuw, punpcklwd);
      break;
    }
    case kX64I32x4ExtMulHighI16x8U: {
      ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhuw, punpckhwd);
      break;
    }
    case kX64I64x2SignSelect: {
      __ Blendvpd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputSimd128Register(1), i.InputSimd128Register(2));
@@ -4113,6 +4228,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
  return kSuccess;
}  // NOLINT(readability/fn_size)

#undef ASSEMBLE_SIMD_I64X2_EXT_MUL
#undef ASSEMBLE_SIMD_I32X4_EXT_MUL
#undef ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH
#undef ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW
#undef ASSEMBLE_PINSR
#undef ASSEMBLE_UNOP
#undef ASSEMBLE_BINOP
#undef ASSEMBLE_COMPARE
@@ -212,6 +212,10 @@ namespace compiler {
  V(X64I64x2Eq) \
  V(X64I64x2ShrU) \
  V(X64I64x2SignSelect) \
  V(X64I64x2ExtMulLowI32x4S) \
  V(X64I64x2ExtMulHighI32x4S) \
  V(X64I64x2ExtMulLowI32x4U) \
  V(X64I64x2ExtMulHighI32x4U) \
  V(X64I32x4Splat) \
  V(X64I32x4ExtractLane) \
  V(X64I32x4SConvertF32x4) \
@@ -242,6 +246,10 @@ namespace compiler {
  V(X64I32x4BitMask) \
  V(X64I32x4DotI16x8S) \
  V(X64I32x4SignSelect) \
  V(X64I32x4ExtMulLowI16x8S) \
  V(X64I32x4ExtMulHighI16x8S) \
  V(X64I32x4ExtMulLowI16x8U) \
  V(X64I32x4ExtMulHighI16x8U) \
  V(X64I16x8Splat) \
  V(X64I16x8ExtractLaneS) \
  V(X64I16x8SConvertI8x16Low) \
@@ -276,6 +284,10 @@ namespace compiler {
  V(X64I16x8Abs) \
  V(X64I16x8BitMask) \
  V(X64I16x8SignSelect) \
  V(X64I16x8ExtMulLowI8x16S) \
  V(X64I16x8ExtMulHighI8x16S) \
  V(X64I16x8ExtMulLowI8x16U) \
  V(X64I16x8ExtMulHighI8x16U) \
  V(X64I8x16Splat) \
  V(X64I8x16ExtractLaneS) \
  V(X64Pinsrb) \
@@ -188,6 +188,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kX64I64x2Eq:
    case kX64I64x2ShrU:
    case kX64I64x2SignSelect:
    case kX64I64x2ExtMulLowI32x4S:
    case kX64I64x2ExtMulHighI32x4S:
    case kX64I64x2ExtMulLowI32x4U:
    case kX64I64x2ExtMulHighI32x4U:
    case kX64I32x4Splat:
    case kX64I32x4ExtractLane:
    case kX64I32x4SConvertF32x4:
@@ -218,6 +222,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kX64I32x4BitMask:
    case kX64I32x4DotI16x8S:
    case kX64I32x4SignSelect:
    case kX64I32x4ExtMulLowI16x8S:
    case kX64I32x4ExtMulHighI16x8S:
    case kX64I32x4ExtMulLowI16x8U:
    case kX64I32x4ExtMulHighI16x8U:
    case kX64I16x8Splat:
    case kX64I16x8ExtractLaneS:
    case kX64I16x8SConvertI8x16Low:
@@ -252,6 +260,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kX64I16x8Abs:
    case kX64I16x8BitMask:
    case kX64I16x8SignSelect:
    case kX64I16x8ExtMulLowI8x16S:
    case kX64I16x8ExtMulHighI8x16S:
    case kX64I16x8ExtMulLowI8x16U:
    case kX64I16x8ExtMulHighI8x16U:
    case kX64I8x16Splat:
    case kX64I8x16ExtractLaneS:
    case kX64I8x16SConvertI16x8:
@@ -2819,6 +2819,10 @@ VISIT_ATOMIC_BINOP(Xor)
  V(I64x2Add) \
  V(I64x2Sub) \
  V(I64x2Eq) \
  V(I64x2ExtMulLowI32x4S) \
  V(I64x2ExtMulHighI32x4S) \
  V(I64x2ExtMulLowI32x4U) \
  V(I64x2ExtMulHighI32x4U) \
  V(I32x4Add) \
  V(I32x4AddHoriz) \
  V(I32x4Sub) \
@@ -2830,6 +2834,10 @@ VISIT_ATOMIC_BINOP(Xor)
  V(I32x4MinU) \
  V(I32x4MaxU) \
  V(I32x4DotI16x8S) \
  V(I32x4ExtMulLowI16x8S) \
  V(I32x4ExtMulHighI16x8S) \
  V(I32x4ExtMulLowI16x8U) \
  V(I32x4ExtMulHighI16x8U) \
  V(I16x8SConvertI32x4) \
  V(I16x8UConvertI32x4) \
  V(I16x8Add) \
@@ -2847,6 +2855,10 @@ VISIT_ATOMIC_BINOP(Xor)
  V(I16x8MinU) \
  V(I16x8MaxU) \
  V(I16x8RoundingAverageU) \
  V(I16x8ExtMulLowI8x16S) \
  V(I16x8ExtMulHighI8x16S) \
  V(I16x8ExtMulLowI8x16U) \
  V(I16x8ExtMulHighI8x16U) \
  V(I8x16SConvertI16x8) \
  V(I8x16UConvertI16x8) \
  V(I8x16Add) \
@@ -2041,6 +2041,10 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) {
    mnemonic = "psrad";
  } else if (opcode == 0xE3) {
    mnemonic = "pavgw";
  } else if (opcode == 0xE4) {
    mnemonic = "pmulhuw";
  } else if (opcode == 0xE5) {
    mnemonic = "pmulhw";
  } else if (opcode == 0xE8) {
    mnemonic = "psubsb";
  } else if (opcode == 0xE9) {
@@ -2333,7 +2333,7 @@ WASM_SIMD_TEST_NO_LOWERING(I16x8Q15MulRSatS) {
}
#endif  // V8_TARGET_ARCH_ARM64

#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64
// TODO(v8:11008) Prototype extended multiplication.
namespace {
enum class MulHalf { kLow, kHigh };
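The tests pass a reference callback (MultiplyLong here) alongside the opcode; its exact definition lives in V8's test helpers, but presumably it is the obvious widening multiply, something like:

#include <cstdint>

// Assumed shape of the reference callback used by the ext-mul tests:
// widen first, then multiply, mirroring the instruction's semantics.
int64_t MultiplyLong(int32_t a, int32_t b) {
  return int64_t{a} * int64_t{b};
}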
@@ -2445,7 +2445,7 @@ WASM_SIMD_TEST_NO_LOWERING(I64x2ExtMulHighI32x4U) {
                    kExprI64x2ExtMulHighI32x4U, MultiplyLong,
                    kExprI32x4Splat, MulHalf::kHigh);
}
#endif  // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
#endif  // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64

WASM_SIMD_TEST(I32x4DotI16x8S) {
  WasmRunner<int32_t, int16_t, int16_t> r(execution_tier, lower_simd);