[wasm-simd] Share i8x16.splat implementation

The optimal implementation lives in the TurboFan x64 codegen; move it into
the shared macro-assembler and have TurboFan ia32 and Liftoff use it. The
optimal implementation takes advantage of AVX2 where supported.
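For reference, the two lowerings being unified look roughly like this in
macro-assembler terms (a sketch only; register choices are illustrative):

  // Pre-AVX2 (SSSE3) path: splat via a byte shuffle with an all-zero mask.
  Movd(dst, src);           // byte 0 of dst = the 8-bit value
  Xorps(scratch, scratch);  // zero the shuffle mask
  Pshufb(dst, scratch);     // every lane selects byte 0

  // AVX2 path: one dedicated broadcast instruction.
  Movd(scratch, src);
  vpbroadcastb(dst, scratch);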

We add a couple of AVX2 instructions to ia32 in sse-instr.h. Not all of
them are used yet, but follow-up patches will use them, so we add support
(including disassembly and tests) in this change.

Drive-by cleanup to test-disasm-x64.cc to merge two AVX2 test sections.

Bug: v8:11589
Change-Id: I1c8d7deb0f8bb70b29e7a680e5dbcfb09ca5505b
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3092555
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76352}
Ng Zhi An, 2021-08-13 15:30:54 -07:00, committed by V8 LUCI CQ
parent 9b772187a4
commit bb12c48ac3
11 changed files with 95 additions and 34 deletions


@@ -1790,6 +1790,19 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
SSE4_RM_INSTRUCTION_LIST(DECLARE_SSE4_AVX_RM_INSTRUCTION)
#undef DECLARE_SSE4_AVX_RM_INSTRUCTION
// AVX2 instructions
#define AVX2_INSTRUCTION(instr, prefix, escape1, escape2, opcode)           \
  void instr(XMMRegister dst, XMMRegister src) {                            \
    vinstr(0x##opcode, dst, xmm0, src, k##prefix, k##escape1##escape2, kW0, \
           AVX2);                                                           \
  }                                                                         \
  void instr(XMMRegister dst, Operand src) {                                \
    vinstr(0x##opcode, dst, xmm0, src, k##prefix, k##escape1##escape2, kW0, \
           AVX2);                                                           \
  }
  AVX2_BROADCAST_LIST(AVX2_INSTRUCTION)
#undef AVX2_INSTRUCTION
// Prefetch src position into cache level.
// Level 1, 2 or 3 specifies CPU cache level. Level 0 specifies a
// non-temporal


@@ -102,4 +102,10 @@
V(pmovzxdq, 66, 0F, 38, 35) \
V(ptest, 66, 0F, 38, 17)
// These require AVX2, and we only define the VEX-128 versions.
#define AVX2_BROADCAST_LIST(V)    \
  V(vpbroadcastd, 66, 0F, 38, 58) \
  V(vpbroadcastb, 66, 0F, 38, 78) \
  V(vpbroadcastw, 66, 0F, 38, 79)
#endif // V8_CODEGEN_IA32_SSE_INSTR_H_
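Each list entry encodes (mnemonic, mandatory prefix, opcode escape bytes,
opcode); vpbroadcastb, for example, is VEX.66.0F38 opcode 0x78. As a rough
sketch, AVX2_INSTRUCTION(vpbroadcastb, 66, 0F, 38, 78) from the assembler
header above expands to:

  void vpbroadcastb(XMMRegister dst, XMMRegister src) {
    vinstr(0x78, dst, xmm0, src, k66, k0F38, kW0, AVX2);
  }
  void vpbroadcastb(XMMRegister dst, Operand src) {
    vinstr(0x78, dst, xmm0, src, k66, k0F38, kW0, AVX2);
  }

xmm0 fills the unused VEX.vvvv operand slot, following the same convention
as the SSE4_RM list above.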


@@ -254,6 +254,42 @@ void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
}
}
template <typename Op>
void SharedTurboAssembler::I8x16SplatPreAvx2(XMMRegister dst, Op src,
                                             XMMRegister scratch) {
  DCHECK(!CpuFeatures::IsSupported(AVX2));
  CpuFeatureScope ssse3_scope(this, SSSE3);
  Movd(dst, src);
  Xorps(scratch, scratch);
  Pshufb(dst, scratch);
}

void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Register src,
                                      XMMRegister scratch) {
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    Movd(scratch, src);
    vpbroadcastb(dst, scratch);
  } else {
    I8x16SplatPreAvx2(dst, src, scratch);
  }
}

void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Operand src,
                                      XMMRegister scratch) {
#if V8_TARGET_ARCH_IA32
  // Operand on IA32 can be a wrapper for a single register, in which case
  // callers should use the I8x16Splat overload that takes a Register |src|.
  DCHECK(!src.is_reg_only());
#endif
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    vpbroadcastb(dst, src);
  } else {
    I8x16SplatPreAvx2(dst, src, scratch);
  }
}

void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
                                    uint8_t src2, Register tmp1,
                                    XMMRegister tmp2) {
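Why the three-instruction fallback is a splat: pshufb replaces each byte
lane with the byte selected by the corresponding mask byte, so an all-zero
mask replicates byte 0 into all 16 lanes. A scalar model of the idea
(illustrative only, not V8 code):

  #include <stdint.h>

  // Models the pre-AVX2 sequence: Movd puts the value in byte 0, Xorps
  // zeroes the mask, and Pshufb makes lane i take byte mask[i] == 0.
  void I8x16SplatModel(uint8_t dst[16], uint8_t value) {
    uint8_t mask[16] = {0};   // Xorps(scratch, scratch)
    dst[0] = value;           // Movd(dst, src); only byte 0 matters here
    for (int i = 0; i < 16; i++) {
      dst[i] = dst[mask[i]];  // Pshufb(dst, scratch)
    }
  }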


@@ -223,6 +223,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
  AVX_OP(Pmullw, pmullw)
  AVX_OP(Pmuludq, pmuludq)
  AVX_OP(Por, por)
  AVX_OP(Pshufb, pshufb)
  AVX_OP(Pshufd, pshufd)
  AVX_OP(Pshufhw, pshufhw)
  AVX_OP(Pshuflw, pshuflw)

@@ -300,6 +301,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
  void F32x4Splat(XMMRegister dst, DoubleRegister src);
  void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane);
  void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
  void I8x16Splat(XMMRegister dst, Register src, XMMRegister scratch);
  void I8x16Splat(XMMRegister dst, Operand src, XMMRegister scratch);
  void I8x16Shl(XMMRegister dst, XMMRegister src1, uint8_t src2, Register tmp1,
                XMMRegister tmp2);
  void I8x16Shl(XMMRegister dst, XMMRegister src1, Register src2, Register tmp1,

@@ -350,6 +353,10 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
  // Requires dst == mask when AVX is not supported.
  void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
                  XMMRegister src2, XMMRegister scratch);

 private:
  template <typename Op>
  void I8x16SplatPreAvx2(XMMRegister dst, Op src, XMMRegister scratch);
};

}  // namespace internal
}  // namespace v8


@@ -3009,10 +3009,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
     case kIA32I8x16Splat: {
-      XMMRegister dst = i.OutputSimd128Register();
-      __ Movd(dst, i.InputOperand(0));
-      __ Pxor(kScratchDoubleReg, kScratchDoubleReg);
-      __ Pshufb(dst, kScratchDoubleReg);
+      if (instr->InputAt(0)->IsRegister()) {
+        __ I8x16Splat(i.OutputSimd128Register(), i.InputRegister(0),
+                      kScratchDoubleReg);
+      } else {
+        __ I8x16Splat(i.OutputSimd128Register(), i.InputOperand(0),
+                      kScratchDoubleReg);
+      }
       break;
     }
case kIA32I8x16ExtractLaneS: {
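Note: the IsRegister() dispatch matters on ia32 because i.InputOperand(0)
can wrap a plain register, which the Operand overload of I8x16Splat rejects
via its is_reg_only() DCHECK; the Register overload also lets the AVX2 path
move the value into the scratch register before broadcasting.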


@@ -3405,25 +3405,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
     case kX64I8x16Splat: {
       XMMRegister dst = i.OutputSimd128Register();
-      if (CpuFeatures::IsSupported(AVX2)) {
-        CpuFeatureScope avx_scope(tasm(), AVX);
-        CpuFeatureScope avx2_scope(tasm(), AVX2);
-        if (HasRegisterInput(instr, 0)) {
-          __ vmovd(kScratchDoubleReg, i.InputRegister(0));
-          __ vpbroadcastb(dst, kScratchDoubleReg);
-        } else {
-          __ vpbroadcastb(dst, i.InputOperand(0));
-        }
+      if (HasRegisterInput(instr, 0)) {
+        __ I8x16Splat(dst, i.InputRegister(0), kScratchDoubleReg);
       } else {
-        if (HasRegisterInput(instr, 0)) {
-          __ Movd(dst, i.InputRegister(0));
-        } else {
-          __ Movd(dst, i.InputOperand(0));
-        }
-        __ Xorps(kScratchDoubleReg, kScratchDoubleReg);
-        __ Pshufb(dst, kScratchDoubleReg);
+        __ I8x16Splat(dst, i.InputOperand(0), kScratchDoubleReg);
       }
       break;
     }
case kX64Pextrb: {


@@ -786,6 +786,15 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
SSSE3_UNOP_INSTRUCTION_LIST(DECLARE_SSE_AVX_RM_DIS_CASE)
SSE4_RM_INSTRUCTION_LIST(DECLARE_SSE_AVX_RM_DIS_CASE)
#undef DECLARE_SSE_AVX_RM_DIS_CASE
#define DISASSEMBLE_AVX2_BROADCAST(instruction, _1, _2, _3, code)     \
  case 0x##code:                                                      \
    AppendToBuffer("" #instruction " %s,", NameOfXMMRegister(regop)); \
    current += PrintRightXMMOperand(current);                         \
    break;
      AVX2_BROADCAST_LIST(DISASSEMBLE_AVX2_BROADCAST)
#undef DISASSEMBLE_AVX2_BROADCAST
      default:
        UnimplementedInstruction();
    }
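Given the format string above, the new entries should disassemble along
these lines (exact operand formatting assumed from the surrounding ia32
disassembler conventions):

  vpbroadcastb xmm0,xmm1
  vpbroadcastb xmm0,[ebx+ecx*4+0x2710]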


@@ -2917,9 +2917,7 @@ void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                         LiftoffRegister src) {
-  Movd(dst.fp(), src.gp());
-  Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
-  Pshufb(dst.fp(), liftoff::kScratchDoubleReg);
+  I8x16Splat(dst.fp(), src.gp(), liftoff::kScratchDoubleReg);
 }
void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,


@@ -2502,9 +2502,7 @@ void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                         LiftoffRegister src) {
-  Movd(dst.fp(), src.gp());
-  Pxor(kScratchDoubleReg, kScratchDoubleReg);
-  Pshufb(dst.fp(), kScratchDoubleReg);
+  I8x16Splat(dst.fp(), src.gp(), kScratchDoubleReg);
 }
void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,


@@ -865,6 +865,18 @@ TEST(DisasmIa320) {
}
}
  // AVX2 instructions.
  {
    if (CpuFeatures::IsSupported(AVX2)) {
      CpuFeatureScope scope(&assm, AVX2);
#define EMIT_AVX2_BROADCAST(instruction, notUsed1, notUsed2, notUsed3, \
                            notUsed4)                                  \
  __ instruction(xmm0, xmm1);                                          \
  __ instruction(xmm0, Operand(ebx, ecx, times_4, 10000));
      AVX2_BROADCAST_LIST(EMIT_AVX2_BROADCAST)
    }
  }
// FMA3 instruction
{
if (CpuFeatures::IsSupported(FMA3)) {


@@ -869,13 +869,6 @@ TEST(DisasmX64) {
     if (CpuFeatures::IsSupported(AVX2)) {
       CpuFeatureScope scope(&assm, AVX2);
       __ vbroadcastss(xmm1, xmm2);
-    }
-  }
-
-  // AVX2 instructions.
-  {
-    if (CpuFeatures::IsSupported(AVX2)) {
-      CpuFeatureScope scope(&assm, AVX2);
 #define EMIT_AVX2_BROADCAST(instruction, notUsed1, notUsed2, notUsed3, \
                             notUsed4)                                  \
   __ instruction(xmm0, xmm1);                                          \