[wasm-simd][x64][liftoff] Implement extended add pairwise

Extract code sequence into macro-assembler for reuse between Liftoff and
TurboFan.

Small tweaks to macro-assembler functions Pmaddwd and Pmaddubsw to move
src1 to dst on SSE when dst != src1. TurboFan codegen won't be affected
by this since it sets the right restrictions in instruction-selector.

Bug: v8:11086
Change-Id: I6c206dec332c8195a6a4d419d11a28e7058c905a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2707253
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72924}
This commit is contained in:
Ng Zhi An 2021-02-19 14:47:21 -08:00 committed by Commit Bot
parent 2cb031ecfd
commit 8136e39997
9 changed files with 161 additions and 25 deletions

View File

@ -1828,7 +1828,9 @@ void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1, Operand src2) {
CpuFeatureScope avx_scope(this, AVX);
vpmaddwd(dst, src1, src2);
} else {
DCHECK_EQ(dst, src1);
if (dst != src1) {
movaps(dst, src1);
}
pmaddwd(dst, src2);
}
}
@ -1839,7 +1841,9 @@ void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1,
CpuFeatureScope avx_scope(this, AVX);
vpmaddwd(dst, src1, src2);
} else {
DCHECK_EQ(dst, src1);
if (dst != src1) {
movaps(dst, src1);
}
pmaddwd(dst, src2);
}
}
@ -1851,7 +1855,9 @@ void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
vpmaddubsw(dst, src1, src2);
} else {
CpuFeatureScope ssse3_scope(this, SSSE3);
DCHECK_EQ(dst, src1);
if (dst != src1) {
movaps(dst, src1);
}
pmaddubsw(dst, src2);
}
}
@ -1863,7 +1869,9 @@ void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
vpmaddubsw(dst, src1, src2);
} else {
CpuFeatureScope ssse3_scope(this, SSSE3);
DCHECK_EQ(dst, src1);
if (dst != src1) {
movaps(dst, src1);
}
pmaddubsw(dst, src2);
}
}
@ -2543,6 +2551,43 @@ void TurboAssembler::I64x2GeS(XMMRegister dst, XMMRegister src0,
}
}
void TurboAssembler::I16x8ExtAddPairwiseI8x16S(XMMRegister dst,
XMMRegister src) {
// i16x8.extadd_pairwise_i8x16_s: pairwise-add adjacent signed i8 lanes of
// src into the i16 lanes of dst, via pmaddubsw against an i8x16.splat(1)
// constant.
// pmaddubsw treats the first operand as unsigned, so the external reference
// (the all-ones-byte constant) needs to be passed to it as the first operand.
Operand op = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x01());
if (dst == src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
// AVX form is non-destructive: load the constant into the scratch
// register and use it as the (unsigned) first operand.
vmovdqa(kScratchDoubleReg, op);
vpmaddubsw(dst, kScratchDoubleReg, src);
} else {
CpuFeatureScope sse_scope(this, SSSE3);
// SSE pmaddubsw overwrites its first operand, so compute into the
// scratch register to avoid clobbering src (which aliases dst) before
// it is read, then move the result into dst.
movaps(kScratchDoubleReg, op);
pmaddubsw(kScratchDoubleReg, src);
movaps(dst, kScratchDoubleReg);
}
} else {
// dst != src: safe to load the constant straight into dst and use the
// macro Pmaddubsw (which handles the AVX/SSE split itself).
Movdqa(dst, op);
Pmaddubsw(dst, dst, src);
}
}
void TurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst,
XMMRegister src) {
// i32x4.extadd_pairwise_i16x8_u: pairwise-add adjacent unsigned i16 lanes
// of src into the i32 lanes of dst, via mask (even lanes) + shift (odd
// lanes) + add. Clobbers kScratchDoubleReg.
// src = |a|b|c|d|e|f|g|h|
// kScratchDoubleReg = i32x4.splat(0x0000FFFF)
Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
Psrld(kScratchDoubleReg, byte{16});
// kScratchDoubleReg = |0|b|0|d|0|f|0|h| (low half of each pair,
// zero-extended to 32 bits)
Pand(kScratchDoubleReg, src);
// dst = |0|a|0|c|0|e|0|g| (high half of each pair, zero-extended)
Psrld(dst, src, byte{16});
// dst = |a+b|c+d|e+f|g+h|
Paddd(dst, kScratchDoubleReg);
}
void TurboAssembler::Abspd(XMMRegister dst) {
Andps(dst, ExternalReferenceAsOperand(
ExternalReference::address_of_double_abs_constant()));

View File

@ -552,7 +552,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void Trap() override;
void DebugBreak() override;
// Supports both AVX (dst != src1) and SSE (checks that dst == src1).
// Will move src1 to dst if dst != src1.
void Pmaddwd(XMMRegister dst, XMMRegister src1, Operand src2);
void Pmaddwd(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void Pmaddubsw(XMMRegister dst, XMMRegister src1, Operand src2);
@ -634,6 +634,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void I64x2GtS(XMMRegister dst, XMMRegister src0, XMMRegister src1);
void I64x2GeS(XMMRegister dst, XMMRegister src0, XMMRegister src1);
void I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src);
void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src);
void Abspd(XMMRegister dst);
void Negpd(XMMRegister dst);

View File

@ -3127,19 +3127,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I32x4ExtAddPairwiseI16x8U: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
// src = |a|b|c|d|e|f|g|h|
// kScratchDoubleReg = i32x4.splat(0x0000FFFF)
__ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
__ Psrld(kScratchDoubleReg, byte{16});
// kScratchDoubleReg =|0|b|0|d|0|f|0|h|
__ Pand(kScratchDoubleReg, src);
// dst = |0|a|0|c|0|e|0|g|
__ Psrld(dst, src, byte{16});
// dst = |a+b|c+d|e+f|g+h|
__ Paddd(dst, kScratchDoubleReg);
__ I32x4ExtAddPairwiseI16x8U(i.OutputSimd128Register(),
i.InputSimd128Register(0));
break;
}
case kX64S128Const: {
@ -3362,13 +3351,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I16x8ExtAddPairwiseI8x16S: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
DCHECK_NE(dst, src);
__ Movdqa(dst,
__ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x01()));
__ Pmaddubsw(dst, dst, src);
__ I16x8ExtAddPairwiseI8x16S(i.OutputSimd128Register(),
i.InputSimd128Register(0));
break;
}
case kX64I16x8ExtAddPairwiseI8x16U: {

View File

@ -3150,6 +3150,16 @@ void LiftoffAssembler::emit_i32x4_dot_i16x8_s(LiftoffRegister dst,
vpadd(Neon32, dest.high(), scratch.low(), scratch.high());
}
// i32x4.extadd_pairwise_i16x8_s is not implemented for this architecture
// yet: record a bailout so Liftoff gives up on the function.
void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_s(LiftoffRegister dst,
LiftoffRegister src) {
bailout(kSimd, "i32x4.extadd_pairwise_i16x8_s");
}
// Unimplemented: bail out of Liftoff compilation.
void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_u(LiftoffRegister dst,
LiftoffRegister src) {
bailout(kSimd, "i32x4.extadd_pairwise_i16x8_u");
}
void LiftoffAssembler::emit_i32x4_extmul_low_i16x8_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
@ -3358,6 +3368,16 @@ void LiftoffAssembler::emit_i16x8_replace_lane(LiftoffRegister dst,
imm_lane_idx);
}
// i16x8.extadd_pairwise_i8x16_s is not implemented for this architecture
// yet: record a bailout so Liftoff gives up on the function.
void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_s(LiftoffRegister dst,
LiftoffRegister src) {
bailout(kSimd, "i16x8.extadd_pairwise_i8x16_s");
}
// Unimplemented: bail out of Liftoff compilation.
void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_u(LiftoffRegister dst,
LiftoffRegister src) {
bailout(kSimd, "i16x8.extadd_pairwise_i8x16_u");
}
void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {

View File

@ -2271,6 +2271,16 @@ void LiftoffAssembler::emit_i32x4_dot_i16x8_s(LiftoffRegister dst,
Addp(dst.fp().V4S(), tmp1, tmp2);
}
// i32x4.extadd_pairwise_i16x8_s is not implemented for this architecture
// yet: record a bailout so Liftoff gives up on the function.
void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_s(LiftoffRegister dst,
LiftoffRegister src) {
bailout(kSimd, "i32x4.extadd_pairwise_i16x8_s");
}
// Unimplemented: bail out of Liftoff compilation.
void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_u(LiftoffRegister dst,
LiftoffRegister src) {
bailout(kSimd, "i32x4.extadd_pairwise_i16x8_u");
}
void LiftoffAssembler::emit_i32x4_extmul_low_i16x8_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
@ -3011,6 +3021,16 @@ void LiftoffAssembler::emit_i16x8_abs(LiftoffRegister dst,
Abs(dst.fp().V8H(), src.fp().V8H());
}
// i16x8.extadd_pairwise_i8x16_s is not implemented for this architecture
// yet: record a bailout so Liftoff gives up on the function.
void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_s(LiftoffRegister dst,
LiftoffRegister src) {
bailout(kSimd, "i16x8.extadd_pairwise_i8x16_s");
}
// Unimplemented: bail out of Liftoff compilation.
void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_u(LiftoffRegister dst,
LiftoffRegister src) {
bailout(kSimd, "i16x8.extadd_pairwise_i8x16_u");
}
void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {

View File

@ -3640,6 +3640,16 @@ void LiftoffAssembler::emit_i16x8_max_u(LiftoffRegister dst,
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
}
// i16x8.extadd_pairwise_i8x16_s is not implemented for this architecture
// yet: record a bailout so Liftoff gives up on the function.
void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_s(LiftoffRegister dst,
LiftoffRegister src) {
bailout(kSimd, "i16x8.extadd_pairwise_i8x16_s");
}
// Unimplemented: bail out of Liftoff compilation.
void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_u(LiftoffRegister dst,
LiftoffRegister src) {
bailout(kSimd, "i16x8.extadd_pairwise_i8x16_u");
}
void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
@ -3786,6 +3796,16 @@ void LiftoffAssembler::emit_i32x4_dot_i16x8_s(LiftoffRegister dst,
this, dst, lhs, rhs);
}
// i32x4.extadd_pairwise_i16x8_s is not implemented for this architecture
// yet: record a bailout so Liftoff gives up on the function.
void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_s(LiftoffRegister dst,
LiftoffRegister src) {
bailout(kSimd, "i32x4.extadd_pairwise_i16x8_s");
}
// Unimplemented: bail out of Liftoff compilation.
void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_u(LiftoffRegister dst,
LiftoffRegister src) {
bailout(kSimd, "i32x4.extadd_pairwise_i16x8_u");
}
namespace liftoff {
// Helper function to check for register aliasing, AVX support, and moves
// registers around before calling the actual macro-assembler function.

View File

@ -1098,6 +1098,10 @@ class LiftoffAssembler : public TurboAssembler {
LiftoffRegister rhs);
inline void emit_i16x8_max_u(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_extadd_pairwise_i8x16_s(LiftoffRegister dst,
LiftoffRegister src);
inline void emit_i16x8_extadd_pairwise_i8x16_u(LiftoffRegister dst,
LiftoffRegister src);
inline void emit_i16x8_extmul_low_i8x16_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2);
@ -1144,6 +1148,10 @@ class LiftoffAssembler : public TurboAssembler {
LiftoffRegister rhs);
inline void emit_i32x4_dot_i16x8_s(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i32x4_extadd_pairwise_i16x8_s(LiftoffRegister dst,
LiftoffRegister src);
inline void emit_i32x4_extadd_pairwise_i16x8_u(LiftoffRegister dst,
LiftoffRegister src);
inline void emit_i32x4_extmul_low_i16x8_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2);

View File

@ -3173,6 +3173,12 @@ class LiftoffCompiler {
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_max_s);
case wasm::kExprI16x8MaxU:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_max_u);
case wasm::kExprI16x8ExtAddPairwiseI8x16S:
return EmitUnOp<kS128, kS128>(
&LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_s);
case wasm::kExprI16x8ExtAddPairwiseI8x16U:
return EmitUnOp<kS128, kS128>(
&LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_u);
case wasm::kExprI16x8ExtMulLowI8x16S:
return EmitBinOp<kS128, kS128>(
&LiftoffAssembler::emit_i16x8_extmul_low_i8x16_s);
@ -3220,6 +3226,12 @@ class LiftoffCompiler {
case wasm::kExprI32x4DotI16x8S:
return EmitBinOp<kS128, kS128>(
&LiftoffAssembler::emit_i32x4_dot_i16x8_s);
case wasm::kExprI32x4ExtAddPairwiseI16x8S:
return EmitUnOp<kS128, kS128>(
&LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_s);
case wasm::kExprI32x4ExtAddPairwiseI16x8U:
return EmitUnOp<kS128, kS128>(
&LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_u);
case wasm::kExprI32x4ExtMulLowI16x8S:
return EmitBinOp<kS128, kS128>(
&LiftoffAssembler::emit_i32x4_extmul_low_i16x8_s);

View File

@ -3260,6 +3260,18 @@ void LiftoffAssembler::emit_i16x8_max_u(LiftoffRegister dst,
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
}
// i16x8.extadd_pairwise_i8x16_s: delegate to the macro-assembler sequence,
// which handles the dst/src aliasing and the signed/unsigned operand
// ordering of pmaddubsw.
void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_s(LiftoffRegister dst,
LiftoffRegister src) {
I16x8ExtAddPairwiseI8x16S(dst.fp(), src.fp());
}
// i16x8.extadd_pairwise_i8x16_u: pmaddubsw treats its first operand as
// unsigned, so src can be passed first directly and the i8x16.splat(0x01)
// constant second — no extra fixup needed for the unsigned case.
void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_u(LiftoffRegister dst,
LiftoffRegister src) {
Operand op = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x01());
Pmaddubsw(dst.fp(), src.fp(), op);
}
void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
@ -3405,6 +3417,18 @@ void LiftoffAssembler::emit_i32x4_dot_i16x8_s(LiftoffRegister dst,
this, dst, lhs, rhs);
}
// i32x4.extadd_pairwise_i16x8_s: pmaddwd multiplies signed i16 pairs and
// adds each pair of products horizontally, so multiplying by
// i16x8.splat(0x0001) yields exactly the signed pairwise sum.
void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_s(LiftoffRegister dst,
LiftoffRegister src) {
Operand op = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i16x8_splat_0x0001());
Pmaddwd(dst.fp(), src.fp(), op);
}
// i32x4.extadd_pairwise_i16x8_u: pmaddwd is signed-only, so the unsigned
// variant delegates to the macro-assembler's mask/shift/add sequence.
void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_u(LiftoffRegister dst,
LiftoffRegister src) {
I32x4ExtAddPairwiseI16x8U(dst.fp(), src.fp());
}
namespace liftoff {
// Helper function to check for register aliasing, AVX support, and moves
// registers around before calling the actual macro-assembler function.