[wasm-simd] Share and optimize load 8, 16, 32 splat
Move optimized implementation (accounts for AVX2) into shared-macro-assembler, and use it everywhere. Drive-by fix in liftoff-assembler-ia32.h to use Movss and Movsd macro-assembler functions so that they emit AVX when supported. Bug: v8:11589 Change-Id: Ibc4f2709d323d5b835bcac175a32b422d47d3355 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3095008 Commit-Queue: Zhi An Ng <zhin@chromium.org> Reviewed-by: Deepti Gandluri <gdeepti@chromium.org> Cr-Commit-Position: refs/heads/main@{#76372}
This commit is contained in:
parent
268a160857
commit
acf0f4698a
@ -916,6 +916,63 @@ void SharedTurboAssembler::S128Select(XMMRegister dst, XMMRegister mask,
|
||||
}
|
||||
}
|
||||
|
||||
// Loads one byte from `src` and splats it across all 16 lanes of `dst`.
// `scratch` is clobbered on the non-AVX2 paths.
void SharedTurboAssembler::S128Load8Splat(XMMRegister dst, Operand src,
                                          XMMRegister scratch) {
  // The trap handler uses the current pc to create a landing pad, so that it
  // can determine if a trap occurred in Wasm code due to an OOB load. Make
  // sure the first instruction in each case below is the one that loads.
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    // Single instruction: broadcast the byte straight from memory.
    vpbroadcastb(dst, src);
  } else if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Avoid dependency on previous value of dst.
    vpinsrb(dst, scratch, src, uint8_t{0});
    vpxor(scratch, scratch, scratch);
    // An all-zero shuffle mask replicates lane 0 into every byte lane.
    vpshufb(dst, dst, scratch);
  } else {
    // Renamed from ssse4_scope: this scope guards SSE4_1 (pinsrb), not SSSE3.
    CpuFeatureScope sse4_scope(this, SSE4_1);
    CpuFeatureScope ssse3_scope(this, SSSE3);
    pinsrb(dst, src, uint8_t{0});
    xorps(scratch, scratch);
    pshufb(dst, scratch);
  }
}
|
||||
|
||||
// Loads one 16-bit value from `src` and splats it across all 8 lanes of
// `dst`. `scratch` is only used (as a false-dependency breaker) on the AVX
// path.
void SharedTurboAssembler::S128Load16Splat(XMMRegister dst, Operand src,
                                           XMMRegister scratch) {
  // The trap handler uses the current pc to create a landing pad, so that it
  // can determine if a trap occurred in Wasm code due to an OOB load. Make
  // sure the first instruction in each case below is the one that loads.
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    // Single instruction: broadcast the word straight from memory.
    vpbroadcastw(dst, src);
  } else if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Avoid dependency on previous value of dst.
    vpinsrw(dst, scratch, src, uint8_t{0});
    // Splat the low word across the low 4 lanes, then duplicate the low
    // quadword into the high quadword.
    vpshuflw(dst, dst, uint8_t{0});
    vpunpcklqdq(dst, dst, dst);
  } else {
    // SSE path: same splat sequence; movlhps copies the low quadword of dst
    // into its own high quadword.
    pinsrw(dst, src, uint8_t{0});
    pshuflw(dst, dst, uint8_t{0});
    movlhps(dst, dst);
  }
}
|
||||
|
||||
// Loads one 32-bit value from `src` and splats it across all 4 lanes of
// `dst`. No scratch register is needed for either path.
void SharedTurboAssembler::S128Load32Splat(XMMRegister dst, Operand src) {
  // The trap handler uses the current pc to create a landing pad, so that it
  // can determine if a trap occurred in Wasm code due to an OOB load. Make
  // sure the first instruction in each case below is the one that loads.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Single instruction: broadcast the 32-bit value straight from memory.
    vbroadcastss(dst, src);
  } else {
    movss(dst, src);
    // Replicate lane 0 into all four 32-bit lanes (shuffle control 0b00000000).
    shufps(dst, dst, byte{0});
  }
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
|
@ -217,6 +217,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
|
||||
AVX_OP(Pavgw, pavgw)
|
||||
AVX_OP(Pcmpgtb, pcmpgtb)
|
||||
AVX_OP(Pcmpeqd, pcmpeqd)
|
||||
AVX_OP(Pinsrw, pinsrw)
|
||||
AVX_OP(Pmaxub, pmaxub)
|
||||
AVX_OP(Pminub, pminub)
|
||||
AVX_OP(Pmovmskb, pmovmskb)
|
||||
@ -278,6 +279,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
|
||||
AVX_OP_SSE4_1(Pblendw, pblendw)
|
||||
AVX_OP_SSE4_1(Pextrb, pextrb)
|
||||
AVX_OP_SSE4_1(Pextrw, pextrw)
|
||||
AVX_OP_SSE4_1(Pinsrb, pinsrb)
|
||||
AVX_OP_SSE4_1(Pmaxsb, pmaxsb)
|
||||
AVX_OP_SSE4_1(Pmaxsd, pmaxsd)
|
||||
AVX_OP_SSE4_1(Pminsb, pminsb)
|
||||
@ -355,6 +357,9 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
|
||||
// Requires dst == mask when AVX is not supported.
|
||||
void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
|
||||
XMMRegister src2, XMMRegister scratch);
|
||||
void S128Load8Splat(XMMRegister dst, Operand src, XMMRegister scratch);
|
||||
void S128Load16Splat(XMMRegister dst, Operand src, XMMRegister scratch);
|
||||
void S128Load32Splat(XMMRegister dst, Operand src);
|
||||
|
||||
private:
|
||||
template <typename Op>
|
||||
|
@ -1256,14 +1256,15 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
|
||||
|
||||
void pmovmskb(Register dst, XMMRegister src);
|
||||
|
||||
void pinsrw(XMMRegister dst, Register src, uint8_t imm8);
|
||||
void pinsrw(XMMRegister dst, Operand src, uint8_t imm8);
|
||||
|
||||
// SSE 4.1 instruction
|
||||
void insertps(XMMRegister dst, XMMRegister src, byte imm8);
|
||||
void insertps(XMMRegister dst, Operand src, byte imm8);
|
||||
void pextrq(Register dst, XMMRegister src, int8_t imm8);
|
||||
void pinsrb(XMMRegister dst, Register src, uint8_t imm8);
|
||||
void pinsrb(XMMRegister dst, Operand src, uint8_t imm8);
|
||||
void pinsrw(XMMRegister dst, Register src, uint8_t imm8);
|
||||
void pinsrw(XMMRegister dst, Operand src, uint8_t imm8);
|
||||
void pinsrd(XMMRegister dst, Register src, uint8_t imm8);
|
||||
void pinsrd(XMMRegister dst, Operand src, uint8_t imm8);
|
||||
void pinsrq(XMMRegister dst, Register src, uint8_t imm8);
|
||||
|
@ -3442,20 +3442,17 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
break;
|
||||
}
|
||||
case kIA32S128Load8Splat: {
|
||||
__ Pinsrb(i.OutputSimd128Register(), i.MemoryOperand(), 0);
|
||||
__ Pxor(kScratchDoubleReg, kScratchDoubleReg);
|
||||
__ Pshufb(i.OutputSimd128Register(), kScratchDoubleReg);
|
||||
__ S128Load8Splat(i.OutputSimd128Register(), i.MemoryOperand(),
|
||||
kScratchDoubleReg);
|
||||
break;
|
||||
}
|
||||
case kIA32S128Load16Splat: {
|
||||
__ Pinsrw(i.OutputSimd128Register(), i.MemoryOperand(), 0);
|
||||
__ Pshuflw(i.OutputSimd128Register(), i.OutputSimd128Register(),
|
||||
uint8_t{0});
|
||||
__ Punpcklqdq(i.OutputSimd128Register(), i.OutputSimd128Register());
|
||||
__ S128Load16Splat(i.OutputSimd128Register(), i.MemoryOperand(),
|
||||
kScratchDoubleReg);
|
||||
break;
|
||||
}
|
||||
case kIA32S128Load32Splat: {
|
||||
__ Vbroadcastss(i.OutputSimd128Register(), i.MemoryOperand());
|
||||
__ S128Load32Splat(i.OutputSimd128Register(), i.MemoryOperand());
|
||||
break;
|
||||
}
|
||||
case kIA32S128Load64Splat: {
|
||||
|
@ -3723,40 +3723,19 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
}
|
||||
case kX64S128Load8Splat: {
|
||||
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
if (CpuFeatures::IsSupported(AVX2)) {
|
||||
CpuFeatureScope avx2_scope(tasm(), AVX2);
|
||||
__ vpbroadcastb(dst, i.MemoryOperand());
|
||||
} else {
|
||||
__ Pinsrb(dst, dst, i.MemoryOperand(), 0);
|
||||
__ Pxor(kScratchDoubleReg, kScratchDoubleReg);
|
||||
__ Pshufb(dst, kScratchDoubleReg);
|
||||
}
|
||||
__ S128Load8Splat(i.OutputSimd128Register(), i.MemoryOperand(),
|
||||
kScratchDoubleReg);
|
||||
break;
|
||||
}
|
||||
case kX64S128Load16Splat: {
|
||||
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
if (CpuFeatures::IsSupported(AVX2)) {
|
||||
CpuFeatureScope avx2_scope(tasm(), AVX2);
|
||||
__ vpbroadcastw(dst, i.MemoryOperand());
|
||||
} else {
|
||||
__ Pinsrw(dst, dst, i.MemoryOperand(), 0);
|
||||
__ Pshuflw(dst, dst, uint8_t{0});
|
||||
__ Punpcklqdq(dst, dst);
|
||||
}
|
||||
__ S128Load16Splat(i.OutputSimd128Register(), i.MemoryOperand(),
|
||||
kScratchDoubleReg);
|
||||
break;
|
||||
}
|
||||
case kX64S128Load32Splat: {
|
||||
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
|
||||
if (CpuFeatures::IsSupported(AVX)) {
|
||||
CpuFeatureScope avx_scope(tasm(), AVX);
|
||||
__ vbroadcastss(i.OutputSimd128Register(), i.MemoryOperand());
|
||||
} else {
|
||||
__ movss(i.OutputSimd128Register(), i.MemoryOperand());
|
||||
__ shufps(i.OutputSimd128Register(), i.OutputSimd128Register(),
|
||||
byte{0});
|
||||
}
|
||||
__ S128Load32Splat(i.OutputSimd128Register(), i.MemoryOperand());
|
||||
break;
|
||||
}
|
||||
case kX64S128Load64Splat: {
|
||||
|
@ -2775,23 +2775,19 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
|
||||
}
|
||||
} else if (transform == LoadTransformationKind::kZeroExtend) {
|
||||
if (memtype == MachineType::Int32()) {
|
||||
movss(dst.fp(), src_op);
|
||||
Movss(dst.fp(), src_op);
|
||||
} else {
|
||||
DCHECK_EQ(MachineType::Int64(), memtype);
|
||||
movsd(dst.fp(), src_op);
|
||||
Movsd(dst.fp(), src_op);
|
||||
}
|
||||
} else {
|
||||
DCHECK_EQ(LoadTransformationKind::kSplat, transform);
|
||||
if (memtype == MachineType::Int8()) {
|
||||
Pinsrb(dst.fp(), src_op, 0);
|
||||
Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
|
||||
Pshufb(dst.fp(), liftoff::kScratchDoubleReg);
|
||||
S128Load8Splat(dst.fp(), src_op, liftoff::kScratchDoubleReg);
|
||||
} else if (memtype == MachineType::Int16()) {
|
||||
Pinsrw(dst.fp(), src_op, 0);
|
||||
Pshuflw(dst.fp(), dst.fp(), uint8_t{0});
|
||||
Punpcklqdq(dst.fp(), dst.fp());
|
||||
S128Load16Splat(dst.fp(), src_op, liftoff::kScratchDoubleReg);
|
||||
} else if (memtype == MachineType::Int32()) {
|
||||
Vbroadcastss(dst.fp(), src_op);
|
||||
S128Load32Splat(dst.fp(), src_op);
|
||||
} else if (memtype == MachineType::Int64()) {
|
||||
Movddup(dst.fp(), src_op);
|
||||
}
|
||||
|
@ -2391,21 +2391,11 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
|
||||
} else {
|
||||
DCHECK_EQ(LoadTransformationKind::kSplat, transform);
|
||||
if (memtype == MachineType::Int8()) {
|
||||
Pinsrb(dst.fp(), dst.fp(), src_op, 0);
|
||||
Pxor(kScratchDoubleReg, kScratchDoubleReg);
|
||||
Pshufb(dst.fp(), kScratchDoubleReg);
|
||||
S128Load8Splat(dst.fp(), src_op, kScratchDoubleReg);
|
||||
} else if (memtype == MachineType::Int16()) {
|
||||
Pinsrw(dst.fp(), dst.fp(), src_op, 0);
|
||||
Pshuflw(dst.fp(), dst.fp(), uint8_t{0});
|
||||
Punpcklqdq(dst.fp(), dst.fp());
|
||||
S128Load16Splat(dst.fp(), src_op, kScratchDoubleReg);
|
||||
} else if (memtype == MachineType::Int32()) {
|
||||
if (CpuFeatures::IsSupported(AVX)) {
|
||||
CpuFeatureScope avx_scope(this, AVX);
|
||||
vbroadcastss(dst.fp(), src_op);
|
||||
} else {
|
||||
movss(dst.fp(), src_op);
|
||||
shufps(dst.fp(), dst.fp(), byte{0});
|
||||
}
|
||||
S128Load32Splat(dst.fp(), src_op);
|
||||
} else if (memtype == MachineType::Int64()) {
|
||||
Movddup(dst.fp(), src_op);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user