[wasm-simd] Share i16x8.q15mulr_sat_s implementation

Bug: v8:11589
Change-Id: Ie51cfd6cd6315f7f14f0c584f190a478ed565b0e
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3114603
Reviewed-by: Adam Klein <adamk@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76475}
commit 5e80730fb6 (parent b415fa3824)
Author: Ng Zhi An, 2021-08-24 15:54:38 -07:00; committed by V8 LUCI CQ
8 changed files with 25 additions and 61 deletions

src/codegen/ia32/macro-assembler-ia32.cc

@@ -631,32 +631,6 @@ void TurboAssembler::Cvttsd2ui(Register dst, Operand src, XMMRegister tmp) {
   add(dst, Immediate(0x80000000));
 }
 
-void TurboAssembler::Pmulhrsw(XMMRegister dst, XMMRegister src1,
-                              XMMRegister src2) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope avx_scope(this, AVX);
-    vpmulhrsw(dst, src1, src2);
-  } else {
-    if (dst != src1) {
-      movaps(dst, src1);
-    }
-    CpuFeatureScope sse_scope(this, SSSE3);
-    pmulhrsw(dst, src2);
-  }
-}
-
-void TurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
-                                      XMMRegister src2, XMMRegister scratch) {
-  ASM_CODE_COMMENT(this);
-  // k = i16x8.splat(0x8000)
-  Pcmpeqd(scratch, scratch);
-  Psllw(scratch, scratch, byte{15});
-
-  Pmulhrsw(dst, src1, src2);
-  Pcmpeqw(scratch, dst);
-  Pxor(dst, scratch);
-}
-
 void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
                                  XMMRegister tmp1, XMMRegister tmp2,
                                  Register scratch) {
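Note: pmulhrsw performs the Q15 rounding multiply but not the saturation; the Pcmpeqw/Pxor pair in the removed helper above exists to patch that up. A minimal scalar sketch of one pmulhrsw lane (reference only, not V8 code; assumes C++20 semantics for the arithmetic right shift and the narrowing cast):

#include <cassert>
#include <cstdint>

// pmulhrsw, one 16-bit lane: round((a * b) / 2^15), i.e.
// (a * b + 0x4000) >> 15, with the result truncated to 16 bits.
int16_t pmulhrsw_lane(int16_t a, int16_t b) {
  int32_t product = int32_t{a} * int32_t{b};
  return static_cast<int16_t>((product + 0x4000) >> 15);
}

int main() {
  assert(pmulhrsw_lane(16384, 16384) == 8192);  // 0.5 * 0.5 == 0.25 in Q15
  // The one overflowing case: -1.0 * -1.0 should saturate to 0x7FFF, but
  // the 16-bit truncation wraps it to 0x8000 (-32768) instead.
  assert(pmulhrsw_lane(-32768, -32768) == -32768);
}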

src/codegen/ia32/macro-assembler-ia32.h

@@ -392,14 +392,9 @@ class V8_EXPORT_PRIVATE TurboAssembler
   }
   void Cvttsd2ui(Register dst, Operand src, XMMRegister tmp);
 
-  // Handles SSE and AVX. On SSE, moves src to dst if they are not equal.
-  void Pmulhrsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
-
   // These Wasm SIMD ops do not have direct lowerings on IA32. These
   // helpers are optimized to produce the fastest and smallest codegen.
   // Defined here to allow usage on both TurboFan and Liftoff.
-  void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
-                        XMMRegister scratch);
   void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
                    XMMRegister tmp2, Register scratch);
   void F64x2ConvertLowI32x4U(XMMRegister dst, XMMRegister src, Register tmp);

src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc

@@ -570,6 +570,24 @@ void SharedTurboAssembler::I16x8UConvertI8x16High(XMMRegister dst,
   }
 }
 
+void SharedTurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
+                                            XMMRegister src2,
+                                            XMMRegister scratch) {
+  ASM_CODE_COMMENT(this);
+  // k = i16x8.splat(0x8000)
+  Pcmpeqd(scratch, scratch);
+  Psllw(scratch, scratch, byte{15});
+
+  if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
+    movaps(dst, src1);
+    src1 = dst;
+  }
+
+  Pmulhrsw(dst, src1, src2);
+  Pcmpeqw(scratch, dst);
+  Pxor(dst, scratch);
+}
+
 // 1. Multiply low word into scratch.
 // 2. Multiply high word (can be signed or unsigned) into dst.
 // 3. Unpack and interleave scratch and dst into dst.
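Note: the explicit movaps is hoisted into this shared helper because the macro-generated Pmulhrsw is destructive without AVX: the SSE pmulhrsw operates in place on dst (hence the "Will move src1 to dst if AVX is not supported" comment added to the header below). The Pcmpeqw/Pxor tail supplies the saturation that pmulhrsw lacks: 0x8000 (the wrapped result of -32768 * -32768) is the only value pmulhrsw can get wrong, and no in-range product rounds to 0x8000, so XORing exactly those lanes with all-ones flips them to 0x7FFF without disturbing any other lane. A scalar sketch of one lane, mirroring the emitted sequence (reference only, not V8 code; assumes C++20 shift/wrap semantics):

#include <cassert>
#include <cstdint>

// One lane of i16x8.q15mulr_sat_s, computed the way the sequence above does:
// pmulhrsw first, then fix up the single wrapped value.
int16_t q15mulr_sat_s_lane(int16_t a, int16_t b) {
  const int16_t k = INT16_MIN;  // Pcmpeqd + Psllw: k = splat(0x8000)
  int16_t r = static_cast<int16_t>((int32_t{a} * int32_t{b} + 0x4000) >> 15);
  int16_t mask = (r == k) ? int16_t{-1} : int16_t{0};  // Pcmpeqw
  return static_cast<int16_t>(r ^ mask);               // Pxor
}

int main() {
  assert(q15mulr_sat_s_lane(-32768, -32768) == 32767);  // wrapped, saturated
  assert(q15mulr_sat_s_lane(-32768, 32767) == -32767);  // in range, untouched
  assert(q15mulr_sat_s_lane(16384, -16384) == -8192);   // 0.5 * -0.5 == -0.25
}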

src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h

@@ -232,6 +232,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   AVX_OP(Pcmpgtb, pcmpgtb)
   AVX_OP(Pcmpgtd, pcmpgtd)
   AVX_OP(Pcmpeqd, pcmpeqd)
+  AVX_OP(Pcmpeqw, pcmpeqw)
   AVX_OP(Pinsrw, pinsrw)
   AVX_OP(Pmaxub, pmaxub)
   AVX_OP(Pminub, pminub)
@@ -288,6 +289,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   AVX_OP_SSSE3(Pabsd, pabsd)
   AVX_OP_SSSE3(Pabsw, pabsw)
   AVX_OP_SSSE3(Palignr, palignr)
+  AVX_OP_SSSE3(Pmulhrsw, pmulhrsw)
   AVX_OP_SSSE3(Psignb, psignb)
   AVX_OP_SSSE3(Psignd, psignd)
   AVX_OP_SSSE3(Psignw, psignw)
@@ -347,6 +349,9 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   void I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src);
   void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src,
                               XMMRegister scratch);
+  // Will move src1 to dst if AVX is not supported.
+  void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
+                        XMMRegister scratch);
   // Requires that dst == src1 if AVX is not supported.
   void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                    XMMRegister scratch, bool low, bool is_signed);
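Note: the two added AVX_OP* registrations are what make Pcmpeqw and Pmulhrsw callable on SharedTurboAssembler at all. A much-simplified, self-contained illustration of the pattern (DEFINE_AVX_OP and the printf "emitters" are invented for this sketch; the real V8 macros generate overload sets with CpuFeatureScopes):

#include <cassert>
#include <cstdio>

static bool avx_supported = true;  // stand-in for CpuFeatures::IsSupported(AVX)

// One macro line per instruction stamps out a capitalized wrapper that picks
// the non-destructive three-operand AVX encoding when available and falls
// back to the destructive two-operand SSE encoding otherwise.
#define DEFINE_AVX_OP(MacroName, name)                                    \
  void MacroName(int dst, int src1, int src2) {                           \
    if (avx_supported) {                                                  \
      std::printf("v" #name " xmm%d, xmm%d, xmm%d\n", dst, src1, src2);   \
    } else {                                                              \
      /* SSE form works in place on dst; like the "Requires that */       \
      /* dst == src1" helpers above, this simplified fallback */          \
      /* insists the caller arranged that. */                             \
      assert(dst == src1);                                                \
      std::printf(#name " xmm%d, xmm%d\n", dst, src2);                    \
    }                                                                     \
  }

DEFINE_AVX_OP(Pcmpeqw, pcmpeqw)
DEFINE_AVX_OP(Pmulhrsw, pmulhrsw)

int main() {
  Pmulhrsw(0, 1, 2);  // prints: vpmulhrsw xmm0, xmm1, xmm2
}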

src/codegen/x64/macro-assembler-x64.cc

@@ -2243,31 +2243,6 @@ void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src,
   }
 }
 
-void TurboAssembler::Pmulhrsw(XMMRegister dst, XMMRegister src1,
-                              XMMRegister src2) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope avx_scope(this, AVX);
-    vpmulhrsw(dst, src1, src2);
-  } else {
-    if (dst != src1) {
-      Movdqa(dst, src1);
-    }
-    CpuFeatureScope sse_scope(this, SSSE3);
-    pmulhrsw(dst, src2);
-  }
-}
-
-void TurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
-                                      XMMRegister src2) {
-  // k = i16x8.splat(0x8000)
-  Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
-  Psllw(kScratchDoubleReg, byte{15});
-
-  Pmulhrsw(dst, src1, src2);
-  Pcmpeqw(kScratchDoubleReg, dst);
-  Pxor(dst, kScratchDoubleReg);
-}
-
 void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
                                  XMMRegister tmp) {
   DCHECK_NE(dst, tmp);

src/codegen/x64/macro-assembler-x64.h

@@ -475,13 +475,10 @@ class V8_EXPORT_PRIVATE TurboAssembler
   // Supports both SSE and AVX. Move src1 to dst if they are not equal on SSE.
   void Pshufb(XMMRegister dst, XMMRegister src1, XMMRegister src2);
-  void Pmulhrsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
-
   // These Wasm SIMD ops do not have direct lowerings on x64. These
   // helpers are optimized to produce the fastest and smallest codegen.
   // Defined here to allow usage on both TurboFan and Liftoff.
-  void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2);
   void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
   void F64x2ConvertLowI32x4U(XMMRegister dst, XMMRegister src);

src/compiler/backend/x64/code-generator-x64.cc

@@ -3405,7 +3405,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kX64I16x8Q15MulRSatS: {
       __ I16x8Q15MulRSatS(i.OutputSimd128Register(), i.InputSimd128Register(0),
-                          i.InputSimd128Register(1));
+                          i.InputSimd128Register(1), kScratchDoubleReg);
       break;
     }
     case kX64I8x16Splat: {

src/wasm/baseline/x64/liftoff-assembler-x64.cc

@@ -3170,7 +3170,7 @@ void LiftoffAssembler::emit_i16x8_extmul_high_i8x16_u(LiftoffRegister dst,
 void LiftoffAssembler::emit_i16x8_q15mulr_sat_s(LiftoffRegister dst,
                                                 LiftoffRegister src1,
                                                 LiftoffRegister src2) {
-  I16x8Q15MulRSatS(dst.fp(), src1.fp(), src2.fp());
+  I16x8Q15MulRSatS(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg);
 }
 
 void LiftoffAssembler::emit_i32x4_neg(LiftoffRegister dst,
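Note: at both the TurboFan and Liftoff call sites the scratch register is now passed in explicitly rather than being named inside the helper, since the shared implementation cannot hard-code one platform's reserved register. A trivial sketch of that design choice (register values and names below are invented):

#include <cstdio>

using XMMRegister = int;

// Each back end reserves its own scratch register (values invented here).
constexpr XMMRegister kX64Scratch = 15;
constexpr XMMRegister kIA32Scratch = 7;

// Shared helper: scratch is a parameter, so nothing platform-specific is
// baked into the shared code.
void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                      XMMRegister scratch) {
  std::printf("q15mulr: dst=xmm%d src1=xmm%d src2=xmm%d scratch=xmm%d\n", dst,
              src1, src2, scratch);
}

int main() {
  I16x8Q15MulRSatS(0, 1, 2, kX64Scratch);   // x64 TurboFan/Liftoff call sites
  I16x8Q15MulRSatS(0, 1, 2, kIA32Scratch);  // ia32 call sites pass their own
}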