[wasm-simd][x64] Fix definition of Shufps
The definition of Shufps is wrong, we are incorrectly passing 0 as the immediate in all cases. No tests broke because we only used Shufps for splats, which has imm8 == 0 anyway. Also, it was using movss, which only moves a single 32-bit. Because we were using it only for f32x4 splat, this ended up being enough (imm8 == 0 meant that we only shuffled the low 32-bit). This is fixed to use movaps, which moves the entire 128-bit register. Also tweak the definition of Shufps to take 4 arguments. `vshufps dst, src1, src2, imm8` shuffles src1 and src2 into dst. `shufps dst, src, imm8`, shuffles dst and src into dst. So `Shufps(dst, src, imm8)` is ambiguous in the AVX case, it could be: 1. vshufps(dst, src, src, imm8), or 2. vshufps(dst, dst, src, imm8) 2. is more likely to be the intended behavior, but it introduces a false dependency on the value of dst. With `Shufps(dst, src1, src2, imm8)`, it is clearer what the behavior should be: 1. shufps(dst, src2, imm8) matches the AVX behavior IFF dst == src1. Change-Id: I60dc4ec868023d28d00f2b09d2c53b82a729bc4d Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2591849 Reviewed-by: Clemens Backes <clemensb@chromium.org> Commit-Queue: Zhi An Ng <zhin@chromium.org> Cr-Commit-Position: refs/heads/master@{#71775}
This commit is contained in:
parent
28740a36dc
commit
5f4b0e47a9
@ -1756,15 +1756,16 @@ void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
|
||||
}
|
||||
}
|
||||
|
||||
void TurboAssembler::Shufps(XMMRegister dst, XMMRegister src, byte imm8) {
|
||||
void TurboAssembler::Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
|
||||
byte imm8) {
|
||||
if (CpuFeatures::IsSupported(AVX)) {
|
||||
CpuFeatureScope avx_scope(this, AVX);
|
||||
vshufps(dst, src, src, imm8);
|
||||
vshufps(dst, src1, src2, imm8);
|
||||
} else {
|
||||
if (dst != src) {
|
||||
movss(dst, src);
|
||||
if (dst != src1) {
|
||||
movaps(dst, src1);
|
||||
}
|
||||
shufps(dst, src, static_cast<byte>(0));
|
||||
shufps(dst, src2, imm8);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -526,8 +526,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
|
||||
void Pmaddwd(XMMRegister dst, XMMRegister src1, XMMRegister src2);
|
||||
void Pmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
|
||||
|
||||
// Shufps that will mov src into dst if AVX is not supported.
|
||||
void Shufps(XMMRegister dst, XMMRegister src, byte imm8);
|
||||
// Shufps that will mov src1 into dst if AVX is not supported.
|
||||
void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2, byte imm8);
|
||||
|
||||
// Non-SSE2 instructions.
|
||||
void Pextrd(Register dst, XMMRegister src, uint8_t imm8);
|
||||
|
@ -2572,7 +2572,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
break;
|
||||
}
|
||||
case kX64F32x4Splat: {
|
||||
__ Shufps(i.OutputSimd128Register(), i.InputDoubleRegister(0), 0);
|
||||
__ Shufps(i.OutputSimd128Register(), i.InputDoubleRegister(0),
|
||||
i.InputDoubleRegister(0), 0);
|
||||
break;
|
||||
}
|
||||
case kX64F32x4ExtractLane: {
|
||||
@ -3912,7 +3913,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
uint8_t mask = i.InputUint8(1);
|
||||
if (dst == src) {
|
||||
// 1-byte shorter encoding than pshufd.
|
||||
__ Shufps(dst, src, mask);
|
||||
__ Shufps(dst, src, src, mask);
|
||||
} else {
|
||||
__ Pshufd(dst, src, mask);
|
||||
}
|
||||
|
@ -2475,7 +2475,7 @@ void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
|
||||
|
||||
void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
|
||||
LiftoffRegister src) {
|
||||
Shufps(dst.fp(), src.fp(), 0);
|
||||
Shufps(dst.fp(), src.fp(), src.fp(), 0);
|
||||
}
|
||||
|
||||
void LiftoffAssembler::emit_f64x2_splat(LiftoffRegister dst,
|
||||
|
Loading…
Reference in New Issue
Block a user