[wasm-simd][x64] Fix definition of Shufps

The definition of Shufps is wrong: we incorrectly pass 0 as the
immediate in all cases. No tests broke because we only used Shufps for
splats, which have imm8 == 0 anyway.

Also, it was using movss, which only moves a single 32-bit value. Because
we were using it only for f32x4 splat, this ended up being enough (imm8 ==
0 meant that we only shuffled the low 32 bits). This is fixed to use
movaps, which moves the entire 128-bit register.

Also tweak the definition of Shufps to take 4 arguments. `vshufps dst,
src1, src2, imm8` shuffles src1 and src2 into dst, whereas `shufps dst,
src, imm8` shuffles dst and src into dst.

So `Shufps(dst, src, imm8)` is ambiguous in the AVX case; it could be:
1. vshufps(dst, src, src, imm8), or
2. vshufps(dst, dst, src, imm8)

Option 2 is more likely to be the intended behavior, but it introduces a
false dependency on the value of dst.

With `Shufps(dst, src1, src2, imm8)`, it is clearer what the behavior
should be: in the non-AVX case, shufps(dst, src2, imm8) matches the AVX
behavior if and only if dst == src1, so src1 is first moved into dst
when the two differ.

Change-Id: I60dc4ec868023d28d00f2b09d2c53b82a729bc4d
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2591849
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71775}
This commit is contained in:
Zhi An Ng 2020-12-15 03:48:27 +00:00 committed by Commit Bot
parent 28740a36dc
commit 5f4b0e47a9
4 changed files with 12 additions and 10 deletions

View File

@ -1756,15 +1756,16 @@ void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
}
}
void TurboAssembler::Shufps(XMMRegister dst, XMMRegister src, byte imm8) {
void TurboAssembler::Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
byte imm8) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vshufps(dst, src, src, imm8);
vshufps(dst, src1, src2, imm8);
} else {
if (dst != src) {
movss(dst, src);
if (dst != src1) {
movaps(dst, src1);
}
shufps(dst, src, static_cast<byte>(0));
shufps(dst, src2, imm8);
}
}

View File

@ -526,8 +526,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void Pmaddwd(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void Pmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
// Shufps that will mov src into dst if AVX is not supported.
void Shufps(XMMRegister dst, XMMRegister src, byte imm8);
// Shufps that will mov src1 into dst if AVX is not supported.
void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2, byte imm8);
// Non-SSE2 instructions.
void Pextrd(Register dst, XMMRegister src, uint8_t imm8);

View File

@ -2572,7 +2572,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64F32x4Splat: {
__ Shufps(i.OutputSimd128Register(), i.InputDoubleRegister(0), 0);
__ Shufps(i.OutputSimd128Register(), i.InputDoubleRegister(0),
i.InputDoubleRegister(0), 0);
break;
}
case kX64F32x4ExtractLane: {
@ -3912,7 +3913,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
uint8_t mask = i.InputUint8(1);
if (dst == src) {
// 1-byte shorter encoding than pshufd.
__ Shufps(dst, src, mask);
__ Shufps(dst, src, src, mask);
} else {
__ Pshufd(dst, src, mask);
}

View File

@ -2475,7 +2475,7 @@ void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
LiftoffRegister src) {
Shufps(dst.fp(), src.fp(), 0);
Shufps(dst.fp(), src.fp(), src.fp(), 0);
}
void LiftoffAssembler::emit_f64x2_splat(LiftoffRegister dst,