[wasm-simd] Share i16x8.q15mulr_sat_s implementation
Bug: v8:11589 Change-Id: Ie51cfd6cd6315f7f14f0c584f190a478ed565b0e Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3114603 Reviewed-by: Adam Klein <adamk@chromium.org> Commit-Queue: Zhi An Ng <zhin@chromium.org> Cr-Commit-Position: refs/heads/main@{#76475}
This commit is contained in:
parent
b415fa3824
commit
5e80730fb6
@ -631,32 +631,6 @@ void TurboAssembler::Cvttsd2ui(Register dst, Operand src, XMMRegister tmp) {
|
||||
add(dst, Immediate(0x80000000));
|
||||
}
|
||||
|
||||
// Emits a packed 16-bit "multiply high with round and scale":
//   dst = pmulhrsw(src1, src2)
//
// With AVX the non-destructive three-operand vpmulhrsw is emitted directly.
// Without AVX the SSSE3 encoding is destructive (dst is both the first source
// and the destination), so the first operand has to be in dst before the
// multiply is emitted.
void TurboAssembler::Pmulhrsw(XMMRegister dst, XMMRegister src1,
                              XMMRegister src2) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpmulhrsw(dst, src1, src2);
    return;
  }

  CpuFeatureScope sse_scope(this, SSSE3);
  if (dst == src2 && dst != src1) {
    // dst already holds src2. Copying src1 into dst here would destroy src2
    // before it is read. The lane-wise multiply is commutative, so use the
    // value already in dst as the first operand and multiply by src1.
    pmulhrsw(dst, src1);
    return;
  }
  if (dst != src1) {
    movaps(dst, src1);
  }
  pmulhrsw(dst, src2);
}
|
||||
|
||||
// Lowers Wasm i16x8.q15mulr_sat_s: dst = saturating Q15 rounding multiply of
// src1 and src2. `scratch` is overwritten; it is written before the inputs
// are read and is compared against dst afterwards, so it needs to be a
// register distinct from dst, src1 and src2.
void TurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
                                      XMMRegister src2, XMMRegister scratch) {
  ASM_CODE_COMMENT(this);

  // Build mask = i16x8.splat(0x8000): all-ones, then shift every 16-bit lane
  // left by 15 so only the sign bit remains.
  const XMMRegister mask = scratch;
  Pcmpeqd(mask, mask);
  Psllw(mask, mask, byte{15});

  // Rounding Q15 multiply. The only lane value that needs fixing up is
  // 0x8000, which is what the remaining two instructions handle.
  Pmulhrsw(dst, src1, src2);

  // mask becomes all-ones in lanes where dst == 0x8000 and zero elsewhere;
  // the xor then flips exactly those lanes from 0x8000 to 0x7FFF.
  Pcmpeqw(mask, dst);
  Pxor(dst, mask);
}
|
||||
|
||||
void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
|
||||
XMMRegister tmp1, XMMRegister tmp2,
|
||||
Register scratch) {
|
||||
|
@ -392,14 +392,9 @@ class V8_EXPORT_PRIVATE TurboAssembler
|
||||
}
|
||||
void Cvttsd2ui(Register dst, Operand src, XMMRegister tmp);
|
||||
|
||||
// Handles SSE and AVX. The SSE form is destructive, so src1 is moved to dst first if needed.
|
||||
void Pmulhrsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
|
||||
|
||||
// These Wasm SIMD ops do not have direct lowerings on IA32. These
|
||||
// helpers are optimized to produce the fastest and smallest codegen.
|
||||
// Defined here to allow usage on both TurboFan and Liftoff.
|
||||
void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
|
||||
XMMRegister scratch);
|
||||
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
|
||||
XMMRegister tmp2, Register scratch);
|
||||
void F64x2ConvertLowI32x4U(XMMRegister dst, XMMRegister src, Register tmp);
|
||||
|
@ -570,6 +570,24 @@ void SharedTurboAssembler::I16x8UConvertI8x16High(XMMRegister dst,
|
||||
}
|
||||
}
|
||||
|
||||
// Lowers Wasm i16x8.q15mulr_sat_s, shared between ia32 and x64:
//   dst = saturating Q15 rounding multiply of src1 and src2.
//
// `scratch` is overwritten. It is written before src1/src2 are read and has
// to survive the multiply into dst, so it needs to be a register distinct
// from dst, src1 and src2.
//
// Without AVX the multiply is destructive (dst doubles as the first operand),
// so the first operand is arranged to live in dst before it is emitted.
void SharedTurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  // k = i16x8.splat(0x8000)
  Pcmpeqd(scratch, scratch);
  Psllw(scratch, scratch, byte{15});

  if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
    if (dst == src2) {
      // dst already holds src2; moving src1 into dst would destroy src2
      // before it is read. The multiply is commutative, so swap the operands
      // instead of moving.
      src2 = src1;
    } else {
      movaps(dst, src1);
    }
    src1 = dst;
  }

  Pmulhrsw(dst, src1, src2);
  // Lanes equal to k (0x8000) are the overflowed results. Pcmpeqw turns
  // scratch into all-ones in exactly those lanes, and the xor flips them to
  // 0x7FFF while leaving every other lane unchanged.
  Pcmpeqw(scratch, dst);
  Pxor(dst, scratch);
}
|
||||
|
||||
// 1. Multiply low word into scratch.
|
||||
// 2. Multiply high word (can be signed or unsigned) into dst.
|
||||
// 3. Unpack and interleave scratch and dst into dst.
|
||||
|
@ -232,6 +232,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
|
||||
AVX_OP(Pcmpgtb, pcmpgtb)
|
||||
AVX_OP(Pcmpgtd, pcmpgtd)
|
||||
AVX_OP(Pcmpeqd, pcmpeqd)
|
||||
AVX_OP(Pcmpeqw, pcmpeqw)
|
||||
AVX_OP(Pinsrw, pinsrw)
|
||||
AVX_OP(Pmaxub, pmaxub)
|
||||
AVX_OP(Pminub, pminub)
|
||||
@ -288,6 +289,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
|
||||
AVX_OP_SSSE3(Pabsd, pabsd)
|
||||
AVX_OP_SSSE3(Pabsw, pabsw)
|
||||
AVX_OP_SSSE3(Palignr, palignr)
|
||||
AVX_OP_SSSE3(Pmulhrsw, pmulhrsw)
|
||||
AVX_OP_SSSE3(Psignb, psignb)
|
||||
AVX_OP_SSSE3(Psignd, psignd)
|
||||
AVX_OP_SSSE3(Psignw, psignw)
|
||||
@ -347,6 +349,9 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
|
||||
void I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src);
|
||||
void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src,
|
||||
XMMRegister scratch);
|
||||
// Will move src1 to dst if AVX is not supported.
|
||||
void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
|
||||
XMMRegister scratch);
|
||||
// Requires that dst == src1 if AVX is not supported.
|
||||
void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
|
||||
XMMRegister scratch, bool low, bool is_signed);
|
||||
|
@ -2243,31 +2243,6 @@ void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src,
|
||||
}
|
||||
}
|
||||
|
||||
// Emits a packed 16-bit "multiply high with round and scale":
//   dst = pmulhrsw(src1, src2)
//
// With AVX the non-destructive three-operand vpmulhrsw is emitted directly.
// Without AVX the SSSE3 encoding is destructive (dst is both the first source
// and the destination), so the first operand has to be in dst before the
// multiply is emitted.
void TurboAssembler::Pmulhrsw(XMMRegister dst, XMMRegister src1,
                              XMMRegister src2) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpmulhrsw(dst, src1, src2);
    return;
  }

  CpuFeatureScope sse_scope(this, SSSE3);
  if (dst == src2 && dst != src1) {
    // dst already holds src2. Copying src1 into dst here would destroy src2
    // before it is read. The lane-wise multiply is commutative, so use the
    // value already in dst as the first operand and multiply by src1.
    pmulhrsw(dst, src1);
    return;
  }
  if (dst != src1) {
    Movdqa(dst, src1);
  }
  pmulhrsw(dst, src2);
}
|
||||
|
||||
// Lowers Wasm i16x8.q15mulr_sat_s: dst = saturating Q15 rounding multiply of
// src1 and src2. Uses kScratchDoubleReg as a temporary and overwrites it.
void TurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
                                      XMMRegister src2) {
  // Build mask = i16x8.splat(0x8000): all-ones, then shift every 16-bit lane
  // left by 15 so only the sign bit remains.
  const XMMRegister mask = kScratchDoubleReg;
  Pcmpeqd(mask, mask);
  Psllw(mask, byte{15});

  // Rounding Q15 multiply. The only lane value that needs fixing up is
  // 0x8000, which is what the remaining two instructions handle.
  Pmulhrsw(dst, src1, src2);

  // mask becomes all-ones in lanes where dst == 0x8000 and zero elsewhere;
  // the xor then flips exactly those lanes from 0x8000 to 0x7FFF.
  Pcmpeqw(mask, dst);
  Pxor(dst, mask);
}
|
||||
|
||||
void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
|
||||
XMMRegister tmp) {
|
||||
DCHECK_NE(dst, tmp);
|
||||
|
@ -475,13 +475,10 @@ class V8_EXPORT_PRIVATE TurboAssembler
|
||||
|
||||
// Supports both SSE and AVX. Move src1 to dst if they are not equal on SSE.
|
||||
void Pshufb(XMMRegister dst, XMMRegister src1, XMMRegister src2);
|
||||
void Pmulhrsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
|
||||
|
||||
// These Wasm SIMD ops do not have direct lowerings on x64. These
|
||||
// helpers are optimized to produce the fastest and smallest codegen.
|
||||
// Defined here to allow usage on both TurboFan and Liftoff.
|
||||
void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2);
|
||||
|
||||
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
|
||||
|
||||
void F64x2ConvertLowI32x4U(XMMRegister dst, XMMRegister src);
|
||||
|
@ -3405,7 +3405,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
}
|
||||
case kX64I16x8Q15MulRSatS: {
|
||||
__ I16x8Q15MulRSatS(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.InputSimd128Register(1));
|
||||
i.InputSimd128Register(1), kScratchDoubleReg);
|
||||
break;
|
||||
}
|
||||
case kX64I8x16Splat: {
|
||||
|
@ -3170,7 +3170,7 @@ void LiftoffAssembler::emit_i16x8_extmul_high_i8x16_u(LiftoffRegister dst,
|
||||
// Liftoff lowering of i16x8.q15mulr_sat_s. Delegates to the assembler helper
// exactly once, passing kScratchDoubleReg as the temporary it overwrites.
void LiftoffAssembler::emit_i16x8_q15mulr_sat_s(LiftoffRegister dst,
                                                LiftoffRegister src1,
                                                LiftoffRegister src2) {
  I16x8Q15MulRSatS(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg);
}
|
||||
|
||||
void LiftoffAssembler::emit_i32x4_neg(LiftoffRegister dst,
|
||||
|
Loading…
Reference in New Issue
Block a user