[wasm-simd] Enhance Shufps to copy src to dst

Extract Shufps to handle both AVX and SSE cases; in the SSE case it will
copy src to dst if they are not the same. This allows us to use it in
Liftoff as well, without the extra copy when AVX is supported.

In other places, the Shufps macro is unnecessary because the call sites
are already inside a branch taken only when AVX is not supported, so we
can simply use the raw shufps instruction (non-macro-assembler).

Bug: v8:9561
Change-Id: Icb043d7a43397c1b0810ece2666be567f0f5986c
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2513866
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#70911}
This commit is contained in:
Zhi An Ng 2020-11-02 02:03:28 +00:00 committed by Commit Bot
parent d988237e7a
commit 14570fe0c9
4 changed files with 21 additions and 18 deletions

View File

@ -1727,6 +1727,18 @@ void TurboAssembler::RetpolineJump(Register reg) {
ret(0);
}
// Shuffles the four 32-bit lanes of |src| into |dst| as selected by |imm8|,
// using |src| as both shuffle inputs. |dst| and |src| may be the same
// register or different registers.
//
// With AVX the three-operand vshufps writes |dst| directly. Without AVX the
// two-operand shufps reads its first input from the destination register, so
// |src| is copied into |dst| first when the registers differ.
void TurboAssembler::Shufps(XMMRegister dst, XMMRegister src, byte imm8) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vshufps(dst, src, src, imm8);
  } else {
    if (dst != src) {
      // Copy all four lanes: shufps takes its two low result lanes from the
      // destination operand, and a register-to-register movss would only
      // transfer lane 0, leaving stale data selectable by a non-zero imm8.
      movaps(dst, src);
    }
    // Forward the caller's immediate so both paths compute the same shuffle.
    shufps(dst, src, imm8);
  }
}
void TurboAssembler::Pextrd(Register dst, XMMRegister src, uint8_t imm8) {
if (imm8 == 0) {
Movd(dst, src);

View File

@ -224,7 +224,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP(Divpd, divpd)
AVX_OP(Maxps, maxps)
AVX_OP(Maxpd, maxpd)
AVX_OP(Shufps, shufps)
AVX_OP(Cvtdq2ps, cvtdq2ps)
AVX_OP(Rcpps, rcpps)
AVX_OP(Rsqrtps, rsqrtps)
@ -519,6 +518,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void Trap() override;
void DebugBreak() override;
// Shufps that will mov src into dst if AVX is not supported.
void Shufps(XMMRegister dst, XMMRegister src, byte imm8);
// Non-SSE2 instructions.
void Pextrd(Register dst, XMMRegister src, uint8_t imm8);

View File

@ -2491,15 +2491,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64F32x4Splat: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputDoubleRegister(0);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vshufps(dst, src, src, byte{0x0});
} else {
DCHECK_EQ(dst, src);
__ Shufps(dst, dst, byte{0x0});
}
__ Shufps(i.OutputSimd128Register(), i.InputDoubleRegister(0), 0);
break;
}
case kX64F32x4ExtractLane: {
@ -3663,8 +3655,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
CpuFeatureScope avx_scope(tasm(), AVX);
__ vbroadcastss(i.OutputSimd128Register(), i.MemoryOperand());
} else {
__ Movss(i.OutputSimd128Register(), i.MemoryOperand());
__ Shufps(i.OutputSimd128Register(), i.OutputSimd128Register(),
__ movss(i.OutputSimd128Register(), i.MemoryOperand());
__ shufps(i.OutputSimd128Register(), i.OutputSimd128Register(),
byte{0});
}
break;

View File

@ -2309,8 +2309,8 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
CpuFeatureScope avx_scope(this, AVX);
vbroadcastss(dst.fp(), src_op);
} else {
Movss(dst.fp(), src_op);
Shufps(dst.fp(), dst.fp(), byte{0});
movss(dst.fp(), src_op);
shufps(dst.fp(), dst.fp(), byte{0});
}
} else if (memtype == MachineType::Int64()) {
Movddup(dst.fp(), src_op);
@ -2419,10 +2419,7 @@ void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
// Emits code for an f32x4 splat: |dst| is produced from |src| via a shuffle
// with immediate 0 (every result lane selects lane 0).
// NOTE(review): the body below appears to interleave the pre-change lines
// (the explicit Movss guard and the static_cast<byte>(0) Shufps call) with the
// post-change single Shufps call, because the diff markers were lost in this
// rendering. Confirm against the real file which statements are actually
// present before relying on this text.
void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
LiftoffRegister src) {
if (dst.fp() != src.fp()) {
Movss(dst.fp(), src.fp());
}
Shufps(dst.fp(), src.fp(), static_cast<byte>(0));
Shufps(dst.fp(), src.fp(), 0);
}
void LiftoffAssembler::emit_f64x2_splat(LiftoffRegister dst,