[wasm-simd] Share i8x16.splat implementation
The optimal implementation lives in the TurboFan x64 codegen. Move it into the shared macro-assembler and have TurboFan ia32 and Liftoff use it. The optimal implementation accounts for AVX2 support.

We add a couple of AVX2 instructions to ia32 in sse-instr.h. Not all of them are used yet, but follow-up patches will use them, so we add support (including disassembly and tests) in this change.

Drive-by cleanup in test-disasm-x64.cc to merge two AVX2 test sections.

Bug: v8:11589
Change-Id: I1c8d7deb0f8bb70b29e7a680e5dbcfb09ca5505b
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3092555
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76352}
Parent: 9b772187a4
Commit: bb12c48ac3
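The change in a nutshell: i8x16.splat compiles to a single vpbroadcastb when AVX2 is available, and to a movd + pxor + pshufb sequence otherwise. A minimal standalone sketch of the two strategies, written with plain x86 intrinsics rather than V8's assembler (compile-time feature selection here stands in for V8's runtime CpuFeatures::IsSupported dispatch):

    // Not V8 code: an illustrative sketch of the two splat strategies.
    #include <immintrin.h>
    #include <cstdint>

    // Pre-AVX2 path: move the byte into an XMM register, then shuffle it
    // across all 16 lanes using an all-zero shuffle mask (pshufb).
    __m128i splat_i8x16_ssse3(uint8_t v) {
      __m128i x = _mm_cvtsi32_si128(v);    // movd
      __m128i zero = _mm_setzero_si128();  // xorps/pxor
      return _mm_shuffle_epi8(x, zero);    // pshufb
    }

    // AVX2 path: a single broadcast instruction does the whole job.
    __m128i splat_i8x16_avx2(uint8_t v) {
      __m128i x = _mm_cvtsi32_si128(v);    // vmovd
      return _mm_broadcastb_epi8(x);       // vpbroadcastb
    }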
@@ -1790,6 +1790,19 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   SSE4_RM_INSTRUCTION_LIST(DECLARE_SSE4_AVX_RM_INSTRUCTION)
 #undef DECLARE_SSE4_AVX_RM_INSTRUCTION
 
+  // AVX2 instructions
+#define AVX2_INSTRUCTION(instr, prefix, escape1, escape2, opcode)           \
+  void instr(XMMRegister dst, XMMRegister src) {                            \
+    vinstr(0x##opcode, dst, xmm0, src, k##prefix, k##escape1##escape2, kW0, \
+           AVX2);                                                           \
+  }                                                                         \
+  void instr(XMMRegister dst, Operand src) {                                \
+    vinstr(0x##opcode, dst, xmm0, src, k##prefix, k##escape1##escape2, kW0, \
+           AVX2);                                                           \
+  }
+  AVX2_BROADCAST_LIST(AVX2_INSTRUCTION)
+#undef AVX2_INSTRUCTION
+
   // Prefetch src position into cache level.
   // Level 1, 2 or 3 specifies CPU cache level. Level 0 specifies a
   // non-temporal
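For reference, instantiating this X-macro with one AVX2_BROADCAST_LIST entry, AVX2_INSTRUCTION(vpbroadcastb, 66, 0F, 38, 78), token-pastes into roughly the following pair of overloads (illustrative expansion, not part of the diff):

    void vpbroadcastb(XMMRegister dst, XMMRegister src) {
      vinstr(0x78, dst, xmm0, src, k66, k0F38, kW0, AVX2);
    }
    void vpbroadcastb(XMMRegister dst, Operand src) {
      vinstr(0x78, dst, xmm0, src, k66, k0F38, kW0, AVX2);
    }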
@@ -102,4 +102,10 @@
   V(pmovzxdq, 66, 0F, 38, 35) \
   V(ptest, 66, 0F, 38, 17)
 
+// These require AVX2, and we only define the VEX-128 versions.
+#define AVX2_BROADCAST_LIST(V)    \
+  V(vpbroadcastd, 66, 0F, 38, 58) \
+  V(vpbroadcastb, 66, 0F, 38, 78) \
+  V(vpbroadcastw, 66, 0F, 38, 79)
+
 #endif  // V8_CODEGEN_IA32_SSE_INSTR_H_
@@ -254,6 +254,42 @@ void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
   }
 }
 
+template <typename Op>
+void SharedTurboAssembler::I8x16SplatPreAvx2(XMMRegister dst, Op src,
+                                             XMMRegister scratch) {
+  DCHECK(!CpuFeatures::IsSupported(AVX2));
+  CpuFeatureScope ssse3_scope(this, SSSE3);
+  Movd(dst, src);
+  Xorps(scratch, scratch);
+  Pshufb(dst, scratch);
+}
+
+void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Register src,
+                                      XMMRegister scratch) {
+  if (CpuFeatures::IsSupported(AVX2)) {
+    CpuFeatureScope avx2_scope(this, AVX2);
+    Movd(scratch, src);
+    vpbroadcastb(dst, scratch);
+  } else {
+    I8x16SplatPreAvx2(dst, src, scratch);
+  }
+}
+
+void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Operand src,
+                                      XMMRegister scratch) {
+#if V8_TARGET_ARCH_IA32
+  // Operand on IA32 can be a wrapper for a single register, in which case
+  // callers should use the I8x16Splat overload taking a Register |src|.
+  DCHECK(!src.is_reg_only());
+#endif
+  if (CpuFeatures::IsSupported(AVX2)) {
+    CpuFeatureScope avx2_scope(this, AVX2);
+    vpbroadcastb(dst, src);
+  } else {
+    I8x16SplatPreAvx2(dst, src, scratch);
+  }
+}
+
 void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
                                     uint8_t src2, Register tmp1,
                                     XMMRegister tmp2) {
@@ -223,6 +223,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   AVX_OP(Pmullw, pmullw)
   AVX_OP(Pmuludq, pmuludq)
   AVX_OP(Por, por)
+  AVX_OP(Pshufb, pshufb)
   AVX_OP(Pshufd, pshufd)
   AVX_OP(Pshufhw, pshufhw)
   AVX_OP(Pshuflw, pshuflw)
@@ -300,6 +301,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   void F32x4Splat(XMMRegister dst, DoubleRegister src);
   void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane);
   void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
+  void I8x16Splat(XMMRegister dst, Register src, XMMRegister scratch);
+  void I8x16Splat(XMMRegister dst, Operand src, XMMRegister scratch);
   void I8x16Shl(XMMRegister dst, XMMRegister src1, uint8_t src2, Register tmp1,
                 XMMRegister tmp2);
   void I8x16Shl(XMMRegister dst, XMMRegister src1, Register src2, Register tmp1,
@@ -350,6 +353,10 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   // Requires dst == mask when AVX is not supported.
   void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
                   XMMRegister src2, XMMRegister scratch);
+
+ private:
+  template <typename Op>
+  void I8x16SplatPreAvx2(XMMRegister dst, Op src, XMMRegister scratch);
 };
 }  // namespace internal
 }  // namespace v8
@@ -3009,10 +3009,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kIA32I8x16Splat: {
-      XMMRegister dst = i.OutputSimd128Register();
-      __ Movd(dst, i.InputOperand(0));
-      __ Pxor(kScratchDoubleReg, kScratchDoubleReg);
-      __ Pshufb(dst, kScratchDoubleReg);
+      if (instr->InputAt(0)->IsRegister()) {
+        __ I8x16Splat(i.OutputSimd128Register(), i.InputRegister(0),
+                      kScratchDoubleReg);
+      } else {
+        __ I8x16Splat(i.OutputSimd128Register(), i.InputOperand(0),
+                      kScratchDoubleReg);
+      }
       break;
     }
     case kIA32I8x16ExtractLaneS: {
@@ -3405,25 +3405,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kX64I8x16Splat: {
       XMMRegister dst = i.OutputSimd128Register();
-      if (CpuFeatures::IsSupported(AVX2)) {
-        CpuFeatureScope avx_scope(tasm(), AVX);
-        CpuFeatureScope avx2_scope(tasm(), AVX2);
-        if (HasRegisterInput(instr, 0)) {
-          __ vmovd(kScratchDoubleReg, i.InputRegister(0));
-          __ vpbroadcastb(dst, kScratchDoubleReg);
-        } else {
-          __ vpbroadcastb(dst, i.InputOperand(0));
-        }
-      } else {
-        if (HasRegisterInput(instr, 0)) {
-          __ Movd(dst, i.InputRegister(0));
-        } else {
-          __ Movd(dst, i.InputOperand(0));
-        }
-        __ Xorps(kScratchDoubleReg, kScratchDoubleReg);
-        __ Pshufb(dst, kScratchDoubleReg);
-      }
-
+      if (HasRegisterInput(instr, 0)) {
+        __ I8x16Splat(dst, i.InputRegister(0), kScratchDoubleReg);
+      } else {
+        __ I8x16Splat(dst, i.InputOperand(0), kScratchDoubleReg);
+      }
       break;
     }
     case kX64Pextrb: {
@@ -786,6 +786,15 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
       SSSE3_UNOP_INSTRUCTION_LIST(DECLARE_SSE_AVX_RM_DIS_CASE)
       SSE4_RM_INSTRUCTION_LIST(DECLARE_SSE_AVX_RM_DIS_CASE)
 #undef DECLARE_SSE_AVX_RM_DIS_CASE
+
+#define DISASSEMBLE_AVX2_BROADCAST(instruction, _1, _2, _3, code)     \
+  case 0x##code:                                                      \
+    AppendToBuffer("" #instruction " %s,", NameOfXMMRegister(regop)); \
+    current += PrintRightXMMOperand(current);                         \
+    break;
+      AVX2_BROADCAST_LIST(DISASSEMBLE_AVX2_BROADCAST)
+#undef DISASSEMBLE_AVX2_BROADCAST
+
       default:
         UnimplementedInstruction();
     }
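The same AVX2_BROADCAST_LIST drives the disassembler, so the assembler and disassembler stay in sync from one table. For the vpbroadcastb entry (opcode 78), DISASSEMBLE_AVX2_BROADCAST expands to roughly the following case (illustrative expansion, not part of the diff), which prints output of the form "vpbroadcastb xmm0,xmm1":

    case 0x78:
      AppendToBuffer("vpbroadcastb %s,", NameOfXMMRegister(regop));
      current += PrintRightXMMOperand(current);
      break;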
@@ -2917,9 +2917,7 @@ void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
 
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                         LiftoffRegister src) {
-  Movd(dst.fp(), src.gp());
-  Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
-  Pshufb(dst.fp(), liftoff::kScratchDoubleReg);
+  I8x16Splat(dst.fp(), src.gp(), liftoff::kScratchDoubleReg);
 }
 
 void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
@@ -2502,9 +2502,7 @@ void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
 
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                         LiftoffRegister src) {
-  Movd(dst.fp(), src.gp());
-  Pxor(kScratchDoubleReg, kScratchDoubleReg);
-  Pshufb(dst.fp(), kScratchDoubleReg);
+  I8x16Splat(dst.fp(), src.gp(), kScratchDoubleReg);
 }
 
 void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
@@ -865,6 +865,18 @@ TEST(DisasmIa320) {
     }
   }
 
+  // AVX2 instructions.
+  {
+    if (CpuFeatures::IsSupported(AVX2)) {
+      CpuFeatureScope scope(&assm, AVX2);
+#define EMIT_AVX2_BROADCAST(instruction, notUsed1, notUsed2, notUsed3, \
+                            notUsed4)                                  \
+  __ instruction(xmm0, xmm1);                                          \
+  __ instruction(xmm0, Operand(ebx, ecx, times_4, 10000));
+      AVX2_BROADCAST_LIST(EMIT_AVX2_BROADCAST)
+    }
+  }
+
   // FMA3 instruction
   {
     if (CpuFeatures::IsSupported(FMA3)) {
@@ -869,13 +869,6 @@ TEST(DisasmX64) {
     if (CpuFeatures::IsSupported(AVX2)) {
       CpuFeatureScope scope(&assm, AVX2);
       __ vbroadcastss(xmm1, xmm2);
-    }
-  }
-
-  // AVX2 instructions.
-  {
-    if (CpuFeatures::IsSupported(AVX2)) {
-      CpuFeatureScope scope(&assm, AVX2);
 #define EMIT_AVX2_BROADCAST(instruction, notUsed1, notUsed2, notUsed3, \
                             notUsed4)                                  \
   __ instruction(xmm0, xmm1);                                          \