[wasm-simd][x64][liftoff] Implement extended add pairwise
Extract code sequence into macro-assembler for reuse between Liftoff
and TurboFan. Small tweaks to macro-assembler functions Pmaddwd and
Pmaddubsw to move src1 to dst on SSE when dst != src1. TurboFan codegen
won't be affected by this since it sets the right restrictions in
instruction-selector.

Bug: v8:11086
Change-Id: I6c206dec332c8195a6a4d419d11a28e7058c905a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2707253
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72924}
parent 2cb031ecfd
commit 8136e39997
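For readers unfamiliar with the instructions being added, here is a scalar sketch (illustrative only, not part of the patch; names and types are assumptions) of what the extended pairwise additions compute: adjacent lanes are widened and summed, halving the lane count.

// Scalar sketch of the wasm extended pairwise add semantics (not patch code).
#include <array>
#include <cstdint>

// i16x8.extadd_pairwise_i8x16_s: sign-extend adjacent i8 pairs, add into i16.
std::array<int16_t, 8> ExtAddPairwiseI8x16S(const std::array<int8_t, 16>& src) {
  std::array<int16_t, 8> dst{};
  for (int i = 0; i < 8; ++i) {
    dst[i] = static_cast<int16_t>(src[2 * i] + src[2 * i + 1]);
  }
  return dst;
}

// i32x4.extadd_pairwise_i16x8_u: zero-extend adjacent u16 pairs, add into u32.
std::array<uint32_t, 4> ExtAddPairwiseI16x8U(const std::array<uint16_t, 8>& src) {
  std::array<uint32_t, 4> dst{};
  for (int i = 0; i < 4; ++i) {
    dst[i] = static_cast<uint32_t>(src[2 * i]) + static_cast<uint32_t>(src[2 * i + 1]);
  }
  return dst;
}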
@@ -1828,7 +1828,9 @@ void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1, Operand src2) {
     CpuFeatureScope avx_scope(this, AVX);
     vpmaddwd(dst, src1, src2);
   } else {
-    DCHECK_EQ(dst, src1);
+    if (dst != src1) {
+      movaps(dst, src1);
+    }
     pmaddwd(dst, src2);
   }
 }
@@ -1839,7 +1841,9 @@ void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1,
     CpuFeatureScope avx_scope(this, AVX);
     vpmaddwd(dst, src1, src2);
   } else {
-    DCHECK_EQ(dst, src1);
+    if (dst != src1) {
+      movaps(dst, src1);
+    }
     pmaddwd(dst, src2);
   }
 }
@@ -1851,7 +1855,9 @@ void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
     vpmaddubsw(dst, src1, src2);
   } else {
     CpuFeatureScope ssse3_scope(this, SSSE3);
-    DCHECK_EQ(dst, src1);
+    if (dst != src1) {
+      movaps(dst, src1);
+    }
     pmaddubsw(dst, src2);
   }
 }
@@ -1863,7 +1869,9 @@ void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
     vpmaddubsw(dst, src1, src2);
   } else {
     CpuFeatureScope ssse3_scope(this, SSSE3);
-    DCHECK_EQ(dst, src1);
+    if (dst != src1) {
+      movaps(dst, src1);
+    }
     pmaddubsw(dst, src2);
   }
 }
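With this tweak, Liftoff can call Pmaddwd/Pmaddubsw with dst != src1 even without AVX; a rough sketch of the resulting expansion (illustrative only, not literal assembler output):

// Illustrative expansion, assuming AVX is not available:
//   Pmaddwd(xmm0, xmm1, op)  ->  movaps xmm0, xmm1
//                                pmaddwd xmm0, op
//   Pmaddwd(xmm0, xmm0, op)  ->  pmaddwd xmm0, op      (no extra move)
// With AVX the three-operand vpmaddwd form is emitted directly. Per the commit
// message, TurboFan is unaffected because its instruction selector already
// sets the matching operand restrictions, so the extra move never triggers.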
@@ -2543,6 +2551,43 @@ void TurboAssembler::I64x2GeS(XMMRegister dst, XMMRegister src0,
   }
 }
 
+void TurboAssembler::I16x8ExtAddPairwiseI8x16S(XMMRegister dst,
+                                               XMMRegister src) {
+  // pmaddubsw treats the first operand as unsigned, so the external reference
+  // to be passed to it as the first operand.
+  Operand op = ExternalReferenceAsOperand(
+      ExternalReference::address_of_wasm_i8x16_splat_0x01());
+  if (dst == src) {
+    if (CpuFeatures::IsSupported(AVX)) {
+      CpuFeatureScope avx_scope(this, AVX);
+      vmovdqa(kScratchDoubleReg, op);
+      vpmaddubsw(dst, kScratchDoubleReg, src);
+    } else {
+      CpuFeatureScope sse_scope(this, SSSE3);
+      movaps(kScratchDoubleReg, op);
+      pmaddubsw(kScratchDoubleReg, src);
+      movaps(dst, kScratchDoubleReg);
+    }
+  } else {
+    Movdqa(dst, op);
+    Pmaddubsw(dst, dst, src);
+  }
+}
+
+void TurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst,
+                                               XMMRegister src) {
+  // src = |a|b|c|d|e|f|g|h|
+  // kScratchDoubleReg = i32x4.splat(0x0000FFFF)
+  Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
+  Psrld(kScratchDoubleReg, byte{16});
+  // kScratchDoubleReg = |0|b|0|d|0|f|0|h|
+  Pand(kScratchDoubleReg, src);
+  // dst = |0|a|0|c|0|e|0|g|
+  Psrld(dst, src, byte{16});
+  // dst = |a+b|c+d|e+f|g+h|
+  Paddd(dst, kScratchDoubleReg);
+}
+
 void TurboAssembler::Abspd(XMMRegister dst) {
   Andps(dst, ExternalReferenceAsOperand(
                  ExternalReference::address_of_double_abs_constant()));
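A scalar model of the two sequences above may help (illustrative only; the helper names below are not from the patch). pmaddubsw multiplies the unsigned bytes of its first operand with the corresponding signed bytes of its second operand and adds adjacent products with signed saturation, so an all-ones first operand yields the signed pairwise sum; the unsigned i32x4 variant instead splits each 32-bit lane into its low and high 16-bit halves and adds them.

#include <cstdint>

// Per-result-lane model of pmaddubsw(ones, src): with a0 == a1 == 1 the
// products are just the sign-extended bytes of src, so the (saturating) sum
// is the exact signed pairwise sum that I16x8ExtAddPairwiseI8x16S produces.
int16_t PmaddubswLane(uint8_t a0, int8_t b0, uint8_t a1, int8_t b1) {
  int32_t sum = int32_t{a0} * b0 + int32_t{a1} * b1;
  if (sum > INT16_MAX) sum = INT16_MAX;
  if (sum < INT16_MIN) sum = INT16_MIN;
  return static_cast<int16_t>(sum);
}

// Per-lane model of the I32x4ExtAddPairwiseI16x8U sequence: Pand keeps the
// even (low) u16 element, Psrld by 16 extracts the odd (high) one, and Paddd
// adds them, giving the zero-extended pairwise sum.
uint32_t ExtAddPairwiseU32Lane(uint32_t lane) {
  return (lane & 0xFFFF) + (lane >> 16);
}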
@@ -552,7 +552,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   void Trap() override;
   void DebugBreak() override;
 
-  // Supports both AVX (dst != src1) and SSE (checks that dst == src1).
+  // Will move src1 to dst if dst != src1.
   void Pmaddwd(XMMRegister dst, XMMRegister src1, Operand src2);
   void Pmaddwd(XMMRegister dst, XMMRegister src1, XMMRegister src2);
   void Pmaddubsw(XMMRegister dst, XMMRegister src1, Operand src2);
@@ -634,6 +634,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   void I64x2GtS(XMMRegister dst, XMMRegister src0, XMMRegister src1);
   void I64x2GeS(XMMRegister dst, XMMRegister src0, XMMRegister src1);
 
+  void I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src);
+  void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src);
+
   void Abspd(XMMRegister dst);
   void Negpd(XMMRegister dst);
 
@@ -3127,19 +3127,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64I32x4ExtAddPairwiseI16x8U: {
-      XMMRegister dst = i.OutputSimd128Register();
-      XMMRegister src = i.InputSimd128Register(0);
-
-      // src = |a|b|c|d|e|f|g|h|
-      // kScratchDoubleReg = i32x4.splat(0x0000FFFF)
-      __ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
-      __ Psrld(kScratchDoubleReg, byte{16});
-      // kScratchDoubleReg = |0|b|0|d|0|f|0|h|
-      __ Pand(kScratchDoubleReg, src);
-      // dst = |0|a|0|c|0|e|0|g|
-      __ Psrld(dst, src, byte{16});
-      // dst = |a+b|c+d|e+f|g+h|
-      __ Paddd(dst, kScratchDoubleReg);
+      __ I32x4ExtAddPairwiseI16x8U(i.OutputSimd128Register(),
+                                   i.InputSimd128Register(0));
       break;
     }
     case kX64S128Const: {
@@ -3362,13 +3351,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      break;
    }
    case kX64I16x8ExtAddPairwiseI8x16S: {
-      XMMRegister dst = i.OutputSimd128Register();
-      XMMRegister src = i.InputSimd128Register(0);
-      DCHECK_NE(dst, src);
-      __ Movdqa(dst,
-                __ ExternalReferenceAsOperand(
-                    ExternalReference::address_of_wasm_i8x16_splat_0x01()));
-      __ Pmaddubsw(dst, dst, src);
+      __ I16x8ExtAddPairwiseI8x16S(i.OutputSimd128Register(),
+                                   i.InputSimd128Register(0));
      break;
    }
    case kX64I16x8ExtAddPairwiseI8x16U: {
@@ -3150,6 +3150,16 @@ void LiftoffAssembler::emit_i32x4_dot_i16x8_s(LiftoffRegister dst,
   vpadd(Neon32, dest.high(), scratch.low(), scratch.high());
 }
 
+void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_s(LiftoffRegister dst,
+                                                          LiftoffRegister src) {
+  bailout(kSimd, "i32x4.extadd_pairwise_i16x8_s");
+}
+
+void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_u(LiftoffRegister dst,
+                                                          LiftoffRegister src) {
+  bailout(kSimd, "i32x4.extadd_pairwise_i16x8_u");
+}
+
 void LiftoffAssembler::emit_i32x4_extmul_low_i16x8_s(LiftoffRegister dst,
                                                      LiftoffRegister src1,
                                                      LiftoffRegister src2) {
@@ -3358,6 +3368,16 @@ void LiftoffAssembler::emit_i16x8_replace_lane(LiftoffRegister dst,
               imm_lane_idx);
 }
 
+void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_s(LiftoffRegister dst,
+                                                          LiftoffRegister src) {
+  bailout(kSimd, "i16x8.extadd_pairwise_i8x16_s");
+}
+
+void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_u(LiftoffRegister dst,
+                                                          LiftoffRegister src) {
+  bailout(kSimd, "i16x8.extadd_pairwise_i8x16_u");
+}
+
 void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_s(LiftoffRegister dst,
                                                      LiftoffRegister src1,
                                                      LiftoffRegister src2) {
@@ -2271,6 +2271,16 @@ void LiftoffAssembler::emit_i32x4_dot_i16x8_s(LiftoffRegister dst,
   Addp(dst.fp().V4S(), tmp1, tmp2);
 }
 
+void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_s(LiftoffRegister dst,
+                                                          LiftoffRegister src) {
+  bailout(kSimd, "i32x4.extadd_pairwise_i16x8_s");
+}
+
+void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_u(LiftoffRegister dst,
+                                                          LiftoffRegister src) {
+  bailout(kSimd, "i32x4.extadd_pairwise_i16x8_u");
+}
+
 void LiftoffAssembler::emit_i32x4_extmul_low_i16x8_s(LiftoffRegister dst,
                                                      LiftoffRegister src1,
                                                      LiftoffRegister src2) {
@@ -3011,6 +3021,16 @@ void LiftoffAssembler::emit_i16x8_abs(LiftoffRegister dst,
   Abs(dst.fp().V8H(), src.fp().V8H());
 }
 
+void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_s(LiftoffRegister dst,
+                                                          LiftoffRegister src) {
+  bailout(kSimd, "i16x8.extadd_pairwise_i8x16_s");
+}
+
+void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_u(LiftoffRegister dst,
+                                                          LiftoffRegister src) {
+  bailout(kSimd, "i16x8.extadd_pairwise_i8x16_u");
+}
+
 void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_s(LiftoffRegister dst,
                                                      LiftoffRegister src1,
                                                      LiftoffRegister src2) {
@@ -3640,6 +3640,16 @@ void LiftoffAssembler::emit_i16x8_max_u(LiftoffRegister dst,
       this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
 }
 
+void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_s(LiftoffRegister dst,
+                                                          LiftoffRegister src) {
+  bailout(kSimd, "i16x8.extadd_pairwise_i8x16_s");
+}
+
+void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_u(LiftoffRegister dst,
+                                                          LiftoffRegister src) {
+  bailout(kSimd, "i16x8.extadd_pairwise_i8x16_u");
+}
+
 void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_s(LiftoffRegister dst,
                                                      LiftoffRegister src1,
                                                      LiftoffRegister src2) {
@@ -3786,6 +3796,16 @@ void LiftoffAssembler::emit_i32x4_dot_i16x8_s(LiftoffRegister dst,
       this, dst, lhs, rhs);
 }
 
+void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_s(LiftoffRegister dst,
+                                                          LiftoffRegister src) {
+  bailout(kSimd, "i32x4.extadd_pairwise_i16x8_s");
+}
+
+void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_u(LiftoffRegister dst,
+                                                          LiftoffRegister src) {
+  bailout(kSimd, "i32x4.extadd_pairwise_i16x8_u");
+}
+
 namespace liftoff {
 // Helper function to check for register aliasing, AVX support, and moves
 // registers around before calling the actual macro-assembler function.
@@ -1098,6 +1098,10 @@ class LiftoffAssembler : public TurboAssembler {
                                LiftoffRegister rhs);
   inline void emit_i16x8_max_u(LiftoffRegister dst, LiftoffRegister lhs,
                                LiftoffRegister rhs);
+  inline void emit_i16x8_extadd_pairwise_i8x16_s(LiftoffRegister dst,
+                                                 LiftoffRegister src);
+  inline void emit_i16x8_extadd_pairwise_i8x16_u(LiftoffRegister dst,
+                                                 LiftoffRegister src);
   inline void emit_i16x8_extmul_low_i8x16_s(LiftoffRegister dst,
                                             LiftoffRegister src1,
                                             LiftoffRegister src2);
@@ -1144,6 +1148,10 @@ class LiftoffAssembler : public TurboAssembler {
                                LiftoffRegister rhs);
   inline void emit_i32x4_dot_i16x8_s(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs);
+  inline void emit_i32x4_extadd_pairwise_i16x8_s(LiftoffRegister dst,
+                                                 LiftoffRegister src);
+  inline void emit_i32x4_extadd_pairwise_i16x8_u(LiftoffRegister dst,
+                                                 LiftoffRegister src);
   inline void emit_i32x4_extmul_low_i16x8_s(LiftoffRegister dst,
                                             LiftoffRegister src1,
                                             LiftoffRegister src2);
@@ -3173,6 +3173,12 @@ class LiftoffCompiler {
         return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_max_s);
       case wasm::kExprI16x8MaxU:
         return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_max_u);
+      case wasm::kExprI16x8ExtAddPairwiseI8x16S:
+        return EmitUnOp<kS128, kS128>(
+            &LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_s);
+      case wasm::kExprI16x8ExtAddPairwiseI8x16U:
+        return EmitUnOp<kS128, kS128>(
+            &LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_u);
       case wasm::kExprI16x8ExtMulLowI8x16S:
         return EmitBinOp<kS128, kS128>(
             &LiftoffAssembler::emit_i16x8_extmul_low_i8x16_s);
@@ -3220,6 +3226,12 @@ class LiftoffCompiler {
       case wasm::kExprI32x4DotI16x8S:
         return EmitBinOp<kS128, kS128>(
             &LiftoffAssembler::emit_i32x4_dot_i16x8_s);
+      case wasm::kExprI32x4ExtAddPairwiseI16x8S:
+        return EmitUnOp<kS128, kS128>(
+            &LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_s);
+      case wasm::kExprI32x4ExtAddPairwiseI16x8U:
+        return EmitUnOp<kS128, kS128>(
+            &LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_u);
       case wasm::kExprI32x4ExtMulLowI16x8S:
         return EmitBinOp<kS128, kS128>(
             &LiftoffAssembler::emit_i32x4_extmul_low_i16x8_s);
@@ -3260,6 +3260,18 @@ void LiftoffAssembler::emit_i16x8_max_u(LiftoffRegister dst,
       this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
 }
 
+void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_s(LiftoffRegister dst,
+                                                          LiftoffRegister src) {
+  I16x8ExtAddPairwiseI8x16S(dst.fp(), src.fp());
+}
+
+void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_u(LiftoffRegister dst,
+                                                          LiftoffRegister src) {
+  Operand op = ExternalReferenceAsOperand(
+      ExternalReference::address_of_wasm_i8x16_splat_0x01());
+  Pmaddubsw(dst.fp(), src.fp(), op);
+}
+
 void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_s(LiftoffRegister dst,
                                                      LiftoffRegister src1,
                                                      LiftoffRegister src2) {
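Note the asymmetry between the two x64 Liftoff implementations above: pmaddubsw treats its first operand as unsigned and its second as signed, so the unsigned variant can pass src first and the splat of 0x01 second, while the signed variant goes through the I16x8ExtAddPairwiseI8x16S helper, which puts the constant into the first (unsigned) slot. A small check of that reasoning (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // One byte pair, viewed both ways: 0xFF is 255 unsigned but -1 signed.
  const uint8_t u0 = 0xFF, u1 = 0x02;
  const int8_t s0 = -1, s1 = 2;
  // pmaddubsw(src, ones): src bytes act as unsigned -> 255*1 + 2*1 = 257,
  // the i16x8.extadd_pairwise_i8x16_u result for this pair.
  assert(u0 * 1 + u1 * 1 == 257);
  // pmaddubsw(ones, src): src bytes act as signed -> 1*(-1) + 1*2 = 1,
  // the i16x8.extadd_pairwise_i8x16_s result for this pair.
  assert(1 * s0 + 1 * s1 == 1);
  return 0;
}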
@@ -3405,6 +3417,18 @@ void LiftoffAssembler::emit_i32x4_dot_i16x8_s(LiftoffRegister dst,
       this, dst, lhs, rhs);
 }
 
+void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_s(LiftoffRegister dst,
+                                                          LiftoffRegister src) {
+  Operand op = ExternalReferenceAsOperand(
+      ExternalReference::address_of_wasm_i16x8_splat_0x0001());
+  Pmaddwd(dst.fp(), src.fp(), op);
+}
+
+void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_u(LiftoffRegister dst,
+                                                          LiftoffRegister src) {
+  I32x4ExtAddPairwiseI16x8U(dst.fp(), src.fp());
+}
+
 namespace liftoff {
 // Helper function to check for register aliasing, AVX support, and moves
 // registers around before calling the actual macro-assembler function.
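For the i32x4 variants above, the signed case maps directly onto pmaddwd with a splat of 0x0001: pmaddwd multiplies signed 16-bit lanes and adds adjacent 32-bit products, so multiplying by one leaves the sign-extended lanes and the addition gives the pairwise sum. The unsigned case cannot use pmaddwd (both operands are treated as signed), which is why it routes through the mask-and-shift helper I32x4ExtAddPairwiseI16x8U. A per-lane scalar model (illustrative only, not part of the patch):

#include <cstdint>

// Model of one output lane of pmaddwd(src, splat(0x0001)): both 16-bit inputs
// are sign-extended, multiplied by 1, and added, i.e. the signed pairwise sum.
int32_t PmaddwdOnesLane(int16_t lo, int16_t hi) {
  return int32_t{lo} * 1 + int32_t{hi} * 1;
}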