[wasm-simd] Share and optimize load 8, 16, 32 splat

Move optimized implementation (accounts for AVX2) into
shared-macro-assembler, and use it everywhere.

Drive-by fix in liftoff-assembler-ia32.h to use Movss and Movsd
macro-assembler functions so that they emit AVX when supported.

Bug: v8:11589
Change-Id: Ibc4f2709d323d5b835bcac175a32b422d47d3355
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3095008
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76372}
This commit is contained in:
Ng Zhi An 2021-08-18 17:15:23 -07:00 committed by V8 LUCI CQ
parent 268a160857
commit acf0f4698a
7 changed files with 83 additions and 58 deletions

View File

@ -916,6 +916,63 @@ void SharedTurboAssembler::S128Select(XMMRegister dst, XMMRegister mask,
}
}
// Loads a single byte from |src| and splats it to all 16 lanes of |dst|.
// |scratch| may be clobbered on non-AVX2 paths.
void SharedTurboAssembler::S128Load8Splat(XMMRegister dst, Operand src,
                                          XMMRegister scratch) {
  // The trap handler uses the current pc to create a landing pad, so that it
  // can determine if a trap occurred in Wasm code due to an OOB load. Make
  // sure the first instruction in each case below is the one that loads.
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    // AVX2 can broadcast the byte directly from memory in one instruction.
    vpbroadcastb(dst, src);
  } else if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Avoid dependency on previous value of dst.
    vpinsrb(dst, scratch, src, uint8_t{0});
    vpxor(scratch, scratch, scratch);
    vpshufb(dst, dst, scratch);
  } else {
    // pinsrb requires SSE4_1; pshufb requires SSSE3.
    CpuFeatureScope sse4_scope(this, SSE4_1);
    CpuFeatureScope ssse3_scope(this, SSSE3);
    pinsrb(dst, src, uint8_t{0});
    xorps(scratch, scratch);
    // Shuffle with an all-zero control splats byte 0 into every lane.
    pshufb(dst, scratch);
  }
}
// Loads a 16-bit value from |src| and splats it to all 8 lanes of |dst|.
// |scratch| is only clobbered on the AVX (non-AVX2) path.
void SharedTurboAssembler::S128Load16Splat(XMMRegister dst, Operand src,
XMMRegister scratch) {
// The trap handler uses the current pc to create a landing pad, so that it
// can determine if a trap occurred in Wasm code due to an OOB load. Make sure
// the first instruction in each case below is the one that loads.
if (CpuFeatures::IsSupported(AVX2)) {
CpuFeatureScope avx2_scope(this, AVX2);
// AVX2 broadcasts the word directly from memory in one instruction.
vpbroadcastw(dst, src);
} else if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
// Avoid dependency on previous value of dst.
vpinsrw(dst, scratch, src, uint8_t{0});
// Splat word 0 across the low four lanes, then duplicate the low qword.
vpshuflw(dst, dst, uint8_t{0});
vpunpcklqdq(dst, dst, dst);
} else {
// Insert word 0, splat it over the low half, then copy low qword to high.
pinsrw(dst, src, uint8_t{0});
pshuflw(dst, dst, uint8_t{0});
movlhps(dst, dst);
}
}
// Loads a 32-bit value from |src| and splats it to all 4 lanes of |dst|.
void SharedTurboAssembler::S128Load32Splat(XMMRegister dst, Operand src) {
// The trap handler uses the current pc to create a landing pad, so that it
// can determine if a trap occurred in Wasm code due to an OOB load. Make sure
// the first instruction in each case below is the one that loads.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
// AVX broadcasts the dword directly from memory in one instruction.
vbroadcastss(dst, src);
} else {
// Load into lane 0, then shuffle lane 0 into all four lanes.
movss(dst, src);
shufps(dst, dst, byte{0});
}
}
} // namespace internal
} // namespace v8

View File

@ -217,6 +217,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP(Pavgw, pavgw)
AVX_OP(Pcmpgtb, pcmpgtb)
AVX_OP(Pcmpeqd, pcmpeqd)
AVX_OP(Pinsrw, pinsrw)
AVX_OP(Pmaxub, pmaxub)
AVX_OP(Pminub, pminub)
AVX_OP(Pmovmskb, pmovmskb)
@ -278,6 +279,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP_SSE4_1(Pblendw, pblendw)
AVX_OP_SSE4_1(Pextrb, pextrb)
AVX_OP_SSE4_1(Pextrw, pextrw)
AVX_OP_SSE4_1(Pinsrb, pinsrb)
AVX_OP_SSE4_1(Pmaxsb, pmaxsb)
AVX_OP_SSE4_1(Pmaxsd, pmaxsd)
AVX_OP_SSE4_1(Pminsb, pminsb)
@ -355,6 +357,9 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
// Requires dst == mask when AVX is not supported.
void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
XMMRegister src2, XMMRegister scratch);
void S128Load8Splat(XMMRegister dst, Operand src, XMMRegister scratch);
void S128Load16Splat(XMMRegister dst, Operand src, XMMRegister scratch);
void S128Load32Splat(XMMRegister dst, Operand src);
private:
template <typename Op>

View File

@ -1256,14 +1256,15 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void pmovmskb(Register dst, XMMRegister src);
void pinsrw(XMMRegister dst, Register src, uint8_t imm8);
void pinsrw(XMMRegister dst, Operand src, uint8_t imm8);
// SSE 4.1 instruction
void insertps(XMMRegister dst, XMMRegister src, byte imm8);
void insertps(XMMRegister dst, Operand src, byte imm8);
void pextrq(Register dst, XMMRegister src, int8_t imm8);
void pinsrb(XMMRegister dst, Register src, uint8_t imm8);
void pinsrb(XMMRegister dst, Operand src, uint8_t imm8);
void pinsrw(XMMRegister dst, Register src, uint8_t imm8);
void pinsrw(XMMRegister dst, Operand src, uint8_t imm8);
void pinsrd(XMMRegister dst, Register src, uint8_t imm8);
void pinsrd(XMMRegister dst, Operand src, uint8_t imm8);
void pinsrq(XMMRegister dst, Register src, uint8_t imm8);

View File

@ -3442,20 +3442,17 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kIA32S128Load8Splat: {
__ Pinsrb(i.OutputSimd128Register(), i.MemoryOperand(), 0);
__ Pxor(kScratchDoubleReg, kScratchDoubleReg);
__ Pshufb(i.OutputSimd128Register(), kScratchDoubleReg);
__ S128Load8Splat(i.OutputSimd128Register(), i.MemoryOperand(),
kScratchDoubleReg);
break;
}
case kIA32S128Load16Splat: {
__ Pinsrw(i.OutputSimd128Register(), i.MemoryOperand(), 0);
__ Pshuflw(i.OutputSimd128Register(), i.OutputSimd128Register(),
uint8_t{0});
__ Punpcklqdq(i.OutputSimd128Register(), i.OutputSimd128Register());
__ S128Load16Splat(i.OutputSimd128Register(), i.MemoryOperand(),
kScratchDoubleReg);
break;
}
case kIA32S128Load32Splat: {
__ Vbroadcastss(i.OutputSimd128Register(), i.MemoryOperand());
__ S128Load32Splat(i.OutputSimd128Register(), i.MemoryOperand());
break;
}
case kIA32S128Load64Splat: {

View File

@ -3723,40 +3723,19 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64S128Load8Splat: {
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
XMMRegister dst = i.OutputSimd128Register();
if (CpuFeatures::IsSupported(AVX2)) {
CpuFeatureScope avx2_scope(tasm(), AVX2);
__ vpbroadcastb(dst, i.MemoryOperand());
} else {
__ Pinsrb(dst, dst, i.MemoryOperand(), 0);
__ Pxor(kScratchDoubleReg, kScratchDoubleReg);
__ Pshufb(dst, kScratchDoubleReg);
}
__ S128Load8Splat(i.OutputSimd128Register(), i.MemoryOperand(),
kScratchDoubleReg);
break;
}
case kX64S128Load16Splat: {
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
XMMRegister dst = i.OutputSimd128Register();
if (CpuFeatures::IsSupported(AVX2)) {
CpuFeatureScope avx2_scope(tasm(), AVX2);
__ vpbroadcastw(dst, i.MemoryOperand());
} else {
__ Pinsrw(dst, dst, i.MemoryOperand(), 0);
__ Pshuflw(dst, dst, uint8_t{0});
__ Punpcklqdq(dst, dst);
}
__ S128Load16Splat(i.OutputSimd128Register(), i.MemoryOperand(),
kScratchDoubleReg);
break;
}
case kX64S128Load32Splat: {
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vbroadcastss(i.OutputSimd128Register(), i.MemoryOperand());
} else {
__ movss(i.OutputSimd128Register(), i.MemoryOperand());
__ shufps(i.OutputSimd128Register(), i.OutputSimd128Register(),
byte{0});
}
__ S128Load32Splat(i.OutputSimd128Register(), i.MemoryOperand());
break;
}
case kX64S128Load64Splat: {

View File

@ -2775,23 +2775,19 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
}
} else if (transform == LoadTransformationKind::kZeroExtend) {
if (memtype == MachineType::Int32()) {
movss(dst.fp(), src_op);
Movss(dst.fp(), src_op);
} else {
DCHECK_EQ(MachineType::Int64(), memtype);
movsd(dst.fp(), src_op);
Movsd(dst.fp(), src_op);
}
} else {
DCHECK_EQ(LoadTransformationKind::kSplat, transform);
if (memtype == MachineType::Int8()) {
Pinsrb(dst.fp(), src_op, 0);
Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
Pshufb(dst.fp(), liftoff::kScratchDoubleReg);
S128Load8Splat(dst.fp(), src_op, liftoff::kScratchDoubleReg);
} else if (memtype == MachineType::Int16()) {
Pinsrw(dst.fp(), src_op, 0);
Pshuflw(dst.fp(), dst.fp(), uint8_t{0});
Punpcklqdq(dst.fp(), dst.fp());
S128Load16Splat(dst.fp(), src_op, liftoff::kScratchDoubleReg);
} else if (memtype == MachineType::Int32()) {
Vbroadcastss(dst.fp(), src_op);
S128Load32Splat(dst.fp(), src_op);
} else if (memtype == MachineType::Int64()) {
Movddup(dst.fp(), src_op);
}

View File

@ -2391,21 +2391,11 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
} else {
DCHECK_EQ(LoadTransformationKind::kSplat, transform);
if (memtype == MachineType::Int8()) {
Pinsrb(dst.fp(), dst.fp(), src_op, 0);
Pxor(kScratchDoubleReg, kScratchDoubleReg);
Pshufb(dst.fp(), kScratchDoubleReg);
S128Load8Splat(dst.fp(), src_op, kScratchDoubleReg);
} else if (memtype == MachineType::Int16()) {
Pinsrw(dst.fp(), dst.fp(), src_op, 0);
Pshuflw(dst.fp(), dst.fp(), uint8_t{0});
Punpcklqdq(dst.fp(), dst.fp());
S128Load16Splat(dst.fp(), src_op, kScratchDoubleReg);
} else if (memtype == MachineType::Int32()) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vbroadcastss(dst.fp(), src_op);
} else {
movss(dst.fp(), src_op);
shufps(dst.fp(), dst.fp(), byte{0});
}
S128Load32Splat(dst.fp(), src_op);
} else if (memtype == MachineType::Int64()) {
Movddup(dst.fp(), src_op);
}