[wasm-simd] Share extadd pairwise implementation

Bug: v8:11589
Change-Id: I7c97920d8ab94408b5cde4e90e7ff1aa9bcaeeba
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3119995
Reviewed-by: Adam Klein <adamk@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76511}
This commit is contained in:
Ng Zhi An 2021-08-25 16:30:42 -07:00 committed by V8 LUCI CQ
parent 24af48d6e4
commit eaf3044073
8 changed files with 114 additions and 215 deletions

View File

@ -701,95 +701,6 @@ void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
}
}
void TurboAssembler::I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src,
XMMRegister tmp,
Register scratch) {
// i16x8.extadd_pairwise_i8x16_s: each i16 lane of dst becomes the sum of a
// pair of adjacent signed bytes of src. Implemented with pmaddubsw against
// an i8x16 splat of 1.
// pmaddubsw treats the first operand as unsigned, so pass the external
// reference to it as the first operand (src supplies the signed bytes).
Operand op = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x01(), scratch);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
// Non-destructive VEX form: load the splat constant into tmp, multiply.
vmovdqa(tmp, op);
vpmaddubsw(dst, tmp, src);
} else {
CpuFeatureScope sse_scope(this, SSSE3);
if (dst == src) {
// SSE pmaddubsw is destructive; compute into tmp so src (== dst) stays
// intact for the multiply, then move the result into dst.
movaps(tmp, op);
pmaddubsw(tmp, src);
movaps(dst, tmp);
} else {
movaps(dst, op);
pmaddubsw(dst, src);
}
}
}
void TurboAssembler::I16x8ExtAddPairwiseI8x16U(XMMRegister dst, XMMRegister src,
                                               Register scratch) {
  // i16x8.extadd_pairwise_i8x16_u: pmaddubsw with an i8x16 splat of 1 as the
  // second operand multiplies each unsigned byte of src by 1 and sums
  // adjacent pairs into 16-bit lanes.
  Operand ones = ExternalReferenceAsOperand(
      ExternalReference::address_of_wasm_i8x16_splat_0x01(), scratch);
  if (!CpuFeatures::IsSupported(AVX)) {
    // Destructive SSE form: materialize src in dst, then multiply in place.
    CpuFeatureScope sse_scope(this, SSSE3);
    movaps(dst, src);
    pmaddubsw(dst, ones);
    return;
  }
  CpuFeatureScope avx_scope(this, AVX);
  vpmaddubsw(dst, src, ones);
}
void TurboAssembler::I32x4ExtAddPairwiseI16x8S(XMMRegister dst, XMMRegister src,
                                               Register scratch) {
  // i32x4.extadd_pairwise_i16x8_s via pmaddwd against i16x8.splat(1):
  // pmaddwd multiplies signed words and adds the products pairwise into
  // signed doublewords.
  //   src = |a|b|c|d|e|f|g|h|
  //   dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 |
  Pmaddwd(dst, src,
          ExternalReferenceAsOperand(
              ExternalReference::address_of_wasm_i16x8_splat_0x0001(),
              scratch));
}
void TurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src,
XMMRegister tmp) {
// i32x4.extadd_pairwise_i16x8_u: each i32 lane of dst becomes the sum of a
// pair of adjacent unsigned words of src. No single instruction does this,
// so the even/odd words are isolated with shifts/blends and added.
ASM_CODE_COMMENT(this);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
// src = |a|b|c|d|e|f|g|h| (low)
// tmp = |0|a|0|c|0|e|0|g|
vpsrld(tmp, src, 16);
// dst = |0|b|0|d|0|f|0|h|
vpblendw(dst, src, tmp, 0xAA);
// dst = |a+b|c+d|e+f|g+h|
vpaddd(dst, tmp, dst);
} else if (CpuFeatures::IsSupported(SSE4_1)) {
CpuFeatureScope sse_scope(this, SSE4_1);
// There is a potentially better lowering if we get rip-relative constants,
// see https://github.com/WebAssembly/simd/pull/380.
movaps(tmp, src);
psrld(tmp, 16);
if (dst != src) {
movaps(dst, src);
}
pblendw(dst, tmp, 0xAA);
paddd(dst, tmp);
} else {
// SSE2-only fallback: build a 0x0000FFFF mask in tmp without loading a
// constant from memory.
// src = |a|b|c|d|e|f|g|h|
// tmp = i32x4.splat(0x0000FFFF)
pcmpeqd(tmp, tmp);
psrld(tmp, byte{16});
// tmp = |0|b|0|d|0|f|0|h|
andps(tmp, src);
// dst = |0|a|0|c|0|e|0|g|
if (dst != src) {
movaps(dst, src);
}
psrld(dst, byte{16});
// dst = |a+b|c+d|e+f|g+h|
paddd(dst, tmp);
}
}
void TurboAssembler::I8x16Swizzle(XMMRegister dst, XMMRegister src,
XMMRegister mask, XMMRegister scratch,
Register tmp, bool omit_add) {

View File

@ -326,7 +326,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
AVX_OP3_WITH_MOVE(Cmpeqps, cmpeqps, XMMRegister, Operand)
AVX_OP3_WITH_MOVE(Movlps, movlps, XMMRegister, Operand)
AVX_OP3_WITH_MOVE(Movhps, movhps, XMMRegister, Operand)
AVX_OP3_WITH_MOVE(Pmaddwd, pmaddwd, XMMRegister, Operand)
#undef AVX_OP3_WITH_MOVE
// TODO(zhin): Remove after moving more definitions into SharedTurboAssembler.
@ -397,14 +396,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
// Defined here to allow usage on both TurboFan and Liftoff.
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
XMMRegister tmp2, Register scratch);
void I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src,
XMMRegister tmp, Register scratch);
void I16x8ExtAddPairwiseI8x16U(XMMRegister dst, XMMRegister src,
Register scratch);
void I32x4ExtAddPairwiseI16x8S(XMMRegister dst, XMMRegister src,
Register scratch);
void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src,
XMMRegister tmp);
void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask,
XMMRegister scratch, Register tmp, bool omit_add = false);

View File

@ -588,6 +588,47 @@ void SharedTurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
Pxor(dst, scratch);
}
void SharedTurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst,
XMMRegister src,
XMMRegister tmp) {
// i32x4.extadd_pairwise_i16x8_u: each i32 lane of dst becomes the sum of a
// pair of adjacent unsigned words of src. No single instruction does this,
// so the even/odd words are isolated with shifts/blends and added.
ASM_CODE_COMMENT(this);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
// src = |a|b|c|d|e|f|g|h| (low)
// tmp = |0|a|0|c|0|e|0|g|
vpsrld(tmp, src, 16);
// dst = |0|b|0|d|0|f|0|h|
vpblendw(dst, src, tmp, 0xAA);
// dst = |a+b|c+d|e+f|g+h|
vpaddd(dst, tmp, dst);
} else if (CpuFeatures::IsSupported(SSE4_1)) {
CpuFeatureScope sse_scope(this, SSE4_1);
// There is a potentially better lowering if we get rip-relative
// constants, see https://github.com/WebAssembly/simd/pull/380.
movaps(tmp, src);
psrld(tmp, 16);
if (dst != src) {
movaps(dst, src);
}
pblendw(dst, tmp, 0xAA);
paddd(dst, tmp);
} else {
// SSE2-only fallback: build a 0x0000FFFF mask in tmp without loading a
// constant from memory.
// src = |a|b|c|d|e|f|g|h|
// tmp = i32x4.splat(0x0000FFFF)
pcmpeqd(tmp, tmp);
psrld(tmp, byte{16});
// tmp = |0|b|0|d|0|f|0|h|
andps(tmp, src);
// dst = |0|a|0|c|0|e|0|g|
if (dst != src) {
movaps(dst, src);
}
psrld(dst, byte{16});
// dst = |a+b|c+d|e+f|g+h|
paddd(dst, tmp);
}
}
// 1. Multiply low word into scratch.
// 2. Multiply high word (can be signed or unsigned) into dst.
// 3. Unpack and interleave scratch and dst into dst.

View File

@ -235,6 +235,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP(Pcmpeqd, pcmpeqd)
AVX_OP(Pcmpeqw, pcmpeqw)
AVX_OP(Pinsrw, pinsrw)
AVX_OP(Pmaddwd, pmaddwd)
AVX_OP(Pmaxsw, pmaxsw)
AVX_OP(Pmaxub, pmaxub)
AVX_OP(Pminsw, pminsw)
@ -361,6 +362,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
// Will move src1 to dst if AVX is not supported.
void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch);
void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src,
XMMRegister tmp);
// Requires that dst == src1 if AVX is not supported.
void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool low, bool is_signed);
@ -515,6 +518,63 @@ class V8_EXPORT_PRIVATE SharedTurboAssemblerBase : public SharedTurboAssembler {
}
}
void I32x4ExtAddPairwiseI16x8S(XMMRegister dst, XMMRegister src,
Register scratch) {
// i32x4.extadd_pairwise_i16x8_s via pmaddwd against i16x8.splat(1).
Operand op = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i16x8_splat_0x0001(), scratch);
// pmaddwd multiplies signed words in src and op, producing
// signed doublewords, then adds pairwise.
// src = |a|b|c|d|e|f|g|h|
// dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 |
if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
// Without AVX the Pmaddwd macro operates in place, so establish
// dst == src by copying first.
movaps(dst, src);
src = dst;
}
Pmaddwd(dst, src, op);
}
void I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src,
XMMRegister scratch, Register tmp) {
ASM_CODE_COMMENT(this);
// i16x8.extadd_pairwise_i8x16_s: sums adjacent signed bytes of src into
// 16-bit lanes using pmaddubsw against an i8x16 splat of 1.
// pmaddubsw treats the first operand as unsigned, so pass the external
// reference to it as the first operand.
Operand op = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x01(), tmp);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
// Non-destructive VEX form: load the splat constant, then multiply.
vmovdqa(scratch, op);
vpmaddubsw(dst, scratch, src);
} else {
CpuFeatureScope sse_scope(this, SSSE3);
if (dst == src) {
// SSE pmaddubsw is destructive; compute into scratch so src (== dst)
// stays intact for the multiply, then move the result into dst.
movaps(scratch, op);
pmaddubsw(scratch, src);
movaps(dst, scratch);
} else {
movaps(dst, op);
pmaddubsw(dst, src);
}
}
}
void I16x8ExtAddPairwiseI8x16U(XMMRegister dst, XMMRegister src,
                               Register scratch) {
  ASM_CODE_COMMENT(this);
  // i16x8.extadd_pairwise_i8x16_u: pmaddubsw with an i8x16 splat of 1 as the
  // second operand multiplies each unsigned byte of src by 1 and sums
  // adjacent pairs into 16-bit lanes.
  Operand ones = ExternalReferenceAsOperand(
      ExternalReference::address_of_wasm_i8x16_splat_0x01(), scratch);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpmaddubsw(dst, src, ones);
    return;
  }
  // Destructive SSE form: get src into dst, then multiply in place.
  CpuFeatureScope sse_scope(this, SSSE3);
  if (dst != src) {
    movaps(dst, src);
  }
  pmaddubsw(dst, ones);
}
private:
// All implementation-specific methods must be called through this.
Impl* impl() { return static_cast<Impl*>(this); }

View File

@ -2004,31 +2004,6 @@ void TurboAssembler::JumpCodeTObject(Register code, JumpMode jump_mode) {
}
}
void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1, Operand src2) {
  // Three-operand pmaddwd macro (memory second operand): VEX-encoded when
  // AVX is available; the SSE2 form is destructive, so src1 is moved into
  // dst first when they differ.
  if (!CpuFeatures::IsSupported(AVX)) {
    if (dst != src1) {
      movaps(dst, src1);
    }
    pmaddwd(dst, src2);
    return;
  }
  CpuFeatureScope avx_scope(this, AVX);
  vpmaddwd(dst, src1, src2);
}
void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1,
                             XMMRegister src2) {
  // Three-operand pmaddwd macro (register second operand): VEX-encoded when
  // AVX is available; the SSE2 form is destructive, so src1 is moved into
  // dst first when they differ.
  if (!CpuFeatures::IsSupported(AVX)) {
    if (dst != src1) {
      movaps(dst, src1);
    }
    pmaddwd(dst, src2);
    return;
  }
  CpuFeatureScope avx_scope(this, AVX);
  vpmaddwd(dst, src1, src2);
}
void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
Operand src2) {
if (CpuFeatures::IsSupported(AVX)) {
@ -2302,68 +2277,6 @@ void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
}
}
void TurboAssembler::I16x8ExtAddPairwiseI8x16S(XMMRegister dst,
XMMRegister src) {
// i16x8.extadd_pairwise_i8x16_s: sums adjacent signed bytes of src into
// 16-bit lanes using pmaddubsw against an i8x16 splat of 1.
// pmaddubsw treats the first operand as unsigned, so the external reference
// needs to be passed to it as the first operand.
Operand op = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x01());
if (dst == src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
// Non-destructive VEX form with the constant staged in the scratch reg.
vmovdqa(kScratchDoubleReg, op);
vpmaddubsw(dst, kScratchDoubleReg, src);
} else {
CpuFeatureScope sse_scope(this, SSSE3);
// SSE pmaddubsw is destructive; compute into the scratch register so
// src (== dst) stays intact, then move the result into dst.
movaps(kScratchDoubleReg, op);
pmaddubsw(kScratchDoubleReg, src);
movaps(dst, kScratchDoubleReg);
}
} else {
// dst is free: load the constant directly into it and multiply in place.
Movdqa(dst, op);
Pmaddubsw(dst, dst, src);
}
}
void TurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst,
XMMRegister src) {
// i32x4.extadd_pairwise_i16x8_u: each i32 lane of dst becomes the sum of a
// pair of adjacent unsigned words of src. No single instruction does this,
// so the even/odd words are isolated with shifts/blends and added.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
// src = |a|b|c|d|e|f|g|h| (low)
// scratch = |0|a|0|c|0|e|0|g|
vpsrld(kScratchDoubleReg, src, 16);
// dst = |0|b|0|d|0|f|0|h|
vpblendw(dst, src, kScratchDoubleReg, 0xAA);
// dst = |a+b|c+d|e+f|g+h|
vpaddd(dst, kScratchDoubleReg, dst);
} else if (CpuFeatures::IsSupported(SSE4_1)) {
CpuFeatureScope sse_scope(this, SSE4_1);
// There is a potentially better lowering if we get rip-relative constants,
// see https://github.com/WebAssembly/simd/pull/380.
movaps(kScratchDoubleReg, src);
psrld(kScratchDoubleReg, 16);
if (dst != src) {
movaps(dst, src);
}
pblendw(dst, kScratchDoubleReg, 0xAA);
paddd(dst, kScratchDoubleReg);
} else {
// SSE2-only fallback: build a 0x0000FFFF mask in the scratch register
// without loading a constant from memory.
// src = |a|b|c|d|e|f|g|h|
// kScratchDoubleReg = i32x4.splat(0x0000FFFF)
pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
psrld(kScratchDoubleReg, byte{16});
// kScratchDoubleReg = |0|b|0|d|0|f|0|h|
andps(kScratchDoubleReg, src);
// dst = |0|a|0|c|0|e|0|g|
if (dst != src) {
movaps(dst, src);
}
psrld(dst, byte{16});
// dst = |a+b|c+d|e+f|g+h|
paddd(dst, kScratchDoubleReg);
}
}
void TurboAssembler::I8x16Swizzle(XMMRegister dst, XMMRegister src,
XMMRegister mask, bool omit_add) {
if (omit_add) {

View File

@ -447,8 +447,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
void DebugBreak();
// Will move src1 to dst if dst != src1.
void Pmaddwd(XMMRegister dst, XMMRegister src1, Operand src2);
void Pmaddwd(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void Pmaddubsw(XMMRegister dst, XMMRegister src1, Operand src2);
void Pmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
@ -481,9 +479,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
// Defined here to allow usage on both TurboFan and Liftoff.
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
void I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src);
void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src);
void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask,
bool omit_add = false);

View File

@ -3167,21 +3167,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I32x4ExtAddPairwiseI16x8S: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src1 = i.InputSimd128Register(0);
// pmaddwd multiplies signed words in src1 and src2, producing signed
// doublewords, then adds pairwise.
// src1 = |a|b|c|d|e|f|g|h|
// src2 = |1|1|1|1|1|1|1|1|
// dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 |
Operand src2 = __ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i16x8_splat_0x0001());
__ Pmaddwd(dst, src1, src2);
__ I32x4ExtAddPairwiseI16x8S(i.OutputSimd128Register(),
i.InputSimd128Register(0), kScratchRegister);
break;
}
case kX64I32x4ExtAddPairwiseI16x8U: {
__ I32x4ExtAddPairwiseI16x8U(i.OutputSimd128Register(),
i.InputSimd128Register(0));
i.InputSimd128Register(0),
kScratchDoubleReg);
break;
}
case kX64S128Const: {
@ -3394,15 +3387,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I16x8ExtAddPairwiseI8x16S: {
__ I16x8ExtAddPairwiseI8x16S(i.OutputSimd128Register(),
i.InputSimd128Register(0));
i.InputSimd128Register(0), kScratchDoubleReg,
kScratchRegister);
break;
}
case kX64I16x8ExtAddPairwiseI8x16U: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src1 = i.InputSimd128Register(0);
Operand src2 = __ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x01());
__ Pmaddubsw(dst, src1, src2);
__ I16x8ExtAddPairwiseI8x16U(i.OutputSimd128Register(),
i.InputSimd128Register(0), kScratchRegister);
break;
}
case kX64I16x8Q15MulRSatS: {

View File

@ -3131,14 +3131,13 @@ void LiftoffAssembler::emit_i16x8_max_u(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_s(LiftoffRegister dst,
LiftoffRegister src) {
I16x8ExtAddPairwiseI8x16S(dst.fp(), src.fp());
I16x8ExtAddPairwiseI8x16S(dst.fp(), src.fp(), kScratchDoubleReg,
kScratchRegister);
}
void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_u(LiftoffRegister dst,
LiftoffRegister src) {
Operand op = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x01());
Pmaddubsw(dst.fp(), src.fp(), op);
I16x8ExtAddPairwiseI8x16U(dst.fp(), src.fp(), kScratchRegister);
}
void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_s(LiftoffRegister dst,
@ -3287,14 +3286,12 @@ void LiftoffAssembler::emit_i32x4_dot_i16x8_s(LiftoffRegister dst,
void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_s(LiftoffRegister dst,
LiftoffRegister src) {
Operand op = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i16x8_splat_0x0001());
Pmaddwd(dst.fp(), src.fp(), op);
I32x4ExtAddPairwiseI16x8S(dst.fp(), src.fp(), kScratchRegister);
}
void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_u(LiftoffRegister dst,
LiftoffRegister src) {
I32x4ExtAddPairwiseI16x8U(dst.fp(), src.fp());
I32x4ExtAddPairwiseI16x8U(dst.fp(), src.fp(), kScratchDoubleReg);
}
namespace liftoff {