[ia32][x64][liftoff] Share i64x2mul code

Optimize i64x2mul when AVX is supported to elide some moves.

Bug: v8:11589
Change-Id: Ide0bba502a35cbb632e3fc311c9697c5f54f9d82
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3163280
Reviewed-by: Adam Klein <adamk@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76889}
This commit is contained in:
Ng Zhi An 2021-09-15 11:34:07 -07:00 committed by V8 LUCI CQ
parent af7232380f
commit 693112bfc0
6 changed files with 55 additions and 70 deletions

View File

@ -900,6 +900,51 @@ void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
Psubq(dst, xmm_tmp);
}
// Emits a lane-wise i64x2 multiply: for each 64-bit lane,
//   dst = lhs * rhs (mod 2^64)
// computed from 32-bit partial products, since SSE/AVX has no packed
// 64x64->64 multiply:
//   lo(l)*lo(r) + ((hi(l)*lo(r) + lo(l)*hi(r)) << 32)
// The hi(l)*hi(r) term is dropped because it only affects bits >= 64.
//
// dst may alias lhs and/or rhs; tmp1 and tmp2 must be distinct from
// dst, lhs, and rhs (enforced by the DCHECKs below). Clobbers tmp1
// and tmp2.
void SharedTurboAssembler::I64x2Mul(XMMRegister dst, XMMRegister lhs,
XMMRegister rhs, XMMRegister tmp1,
XMMRegister tmp2) {
// Temps must not overlap any operand: the algorithm reads lhs/rhs
// after writing the temps.
DCHECK(!AreAliased(dst, tmp1, tmp2));
DCHECK(!AreAliased(lhs, tmp1, tmp2));
DCHECK(!AreAliased(rhs, tmp1, tmp2));
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
// AVX three-operand forms let us shift/multiply straight into the
// temps without first copying the inputs (this is the move elision
// this path exists for).
// 1. Multiply high dword of each qword of left with right.
vpsrlq(tmp1, lhs, byte{32});
vpmuludq(tmp1, tmp1, rhs);
// 2. Multiply high dword of each qword of right with left.
vpsrlq(tmp2, rhs, byte{32});
vpmuludq(tmp2, tmp2, lhs);
// 3. Add 1 and 2, then shift left by 32 (this is the high dword of result).
vpaddq(tmp2, tmp2, tmp1);
vpsllq(tmp2, tmp2, byte{32});
// 4. Multiply low dwords (this is the low dword of result).
vpmuludq(dst, lhs, rhs);
// 5. Add 3 and 4.
vpaddq(dst, dst, tmp2);
} else {
// Same algorithm as AVX version, but with moves to not overwrite inputs.
movaps(tmp1, lhs);
movaps(tmp2, rhs);
// High dword of each qword of lhs times (low dword of) rhs.
psrlq(tmp1, byte{32});
pmuludq(tmp1, rhs);
// High dword of each qword of rhs times (low dword of) lhs.
psrlq(tmp2, byte{32});
pmuludq(tmp2, lhs);
// Sum the cross terms and shift into the high dword position.
paddq(tmp2, tmp1);
psllq(tmp2, byte{32});
// Low-dword product must end up in dst without clobbering an input
// we still need: pmuludq is destructive (dst is also a source).
if (dst == rhs) {
// pmuludq is commutative
pmuludq(dst, lhs);
} else {
if (dst != lhs) {
movaps(dst, lhs);
}
pmuludq(dst, rhs);
}
// Combine low product with the shifted cross terms.
paddq(dst, tmp2);
}
}
// 1. Unpack src0, src1 into even-number elements of scratch.
// 2. Unpack src1, src0 into even-number elements of dst.
// 3. Multiply 1. with 2.

View File

@ -441,6 +441,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
void I64x2ShrS(XMMRegister dst, XMMRegister src, Register shift,
XMMRegister xmm_tmp, XMMRegister xmm_shift,
Register tmp_shift);
void I64x2Mul(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
XMMRegister tmp1, XMMRegister tmp2);
void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool low, bool is_signed);
void I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src);

View File

@ -1985,28 +1985,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kIA32I64x2Mul: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister left = i.InputSimd128Register(0);
XMMRegister right = i.InputSimd128Register(1);
XMMRegister tmp1 = i.TempSimd128Register(0);
XMMRegister tmp2 = i.TempSimd128Register(1);
__ Movaps(tmp1, left);
__ Movaps(tmp2, right);
// Multiply high dword of each qword of left with right.
__ Psrlq(tmp1, byte{32});
__ Pmuludq(tmp1, tmp1, right);
// Multiply high dword of each qword of right with left.
__ Psrlq(tmp2, byte{32});
__ Pmuludq(tmp2, tmp2, left);
__ Paddq(tmp2, tmp2, tmp1);
__ Psllq(tmp2, tmp2, byte{32});
__ Pmuludq(dst, left, right);
__ Paddq(dst, dst, tmp2);
__ I64x2Mul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), i.TempSimd128Register(0),
i.TempSimd128Register(1));
break;
}
case kIA32I64x2ShrU: {

View File

@ -2940,28 +2940,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I64x2Mul: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
XMMRegister left = i.InputSimd128Register(0);
XMMRegister right = i.InputSimd128Register(1);
XMMRegister tmp1 = i.TempSimd128Register(0);
XMMRegister tmp2 = kScratchDoubleReg;
__ Movdqa(tmp1, left);
__ Movdqa(tmp2, right);
// Multiply high dword of each qword of left with right.
__ Psrlq(tmp1, byte{32});
__ Pmuludq(tmp1, right);
// Multiply high dword of each qword of right with left.
__ Psrlq(tmp2, byte{32});
__ Pmuludq(tmp2, left);
__ Paddq(tmp2, tmp1);
__ Psllq(tmp2, byte{32});
__ Pmuludq(left, right);
__ Paddq(left, tmp2); // left == dst
__ I64x2Mul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), i.TempSimd128Register(0),
kScratchDoubleReg);
break;
}
case kX64I64x2Eq: {

View File

@ -3863,19 +3863,7 @@ void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
LiftoffRegister tmp2 =
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs, tmp1));
Movaps(tmp1.fp(), lhs.fp());
Movaps(tmp2.fp(), rhs.fp());
// Multiply high dword of each qword of left with right.
Psrlq(tmp1.fp(), byte{32});
Pmuludq(tmp1.fp(), tmp1.fp(), rhs.fp());
// Multiply high dword of each qword of right with left.
Psrlq(tmp2.fp(), byte{32});
Pmuludq(tmp2.fp(), tmp2.fp(), lhs.fp());
Paddq(tmp2.fp(), tmp2.fp(), tmp1.fp());
Psllq(tmp2.fp(), tmp2.fp(), byte{32});
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmuludq, &Assembler::pmuludq>(
this, dst, lhs, rhs);
Paddq(dst.fp(), dst.fp(), tmp2.fp());
I64x2Mul(dst.fp(), lhs.fp(), rhs.fp(), tmp1.fp(), tmp2.fp());
}
void LiftoffAssembler::emit_i64x2_extmul_low_i32x4_s(LiftoffRegister dst,

View File

@ -3413,19 +3413,7 @@ void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
LiftoffRegister tmp2 =
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs, tmp1));
Movaps(tmp1.fp(), lhs.fp());
Movaps(tmp2.fp(), rhs.fp());
// Multiply high dword of each qword of left with right.
Psrlq(tmp1.fp(), byte{32});
Pmuludq(tmp1.fp(), rhs.fp());
// Multiply high dword of each qword of right with left.
Psrlq(tmp2.fp(), byte{32});
Pmuludq(tmp2.fp(), lhs.fp());
Paddq(tmp2.fp(), tmp1.fp());
Psllq(tmp2.fp(), byte{32});
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmuludq, &Assembler::pmuludq>(
this, dst, lhs, rhs);
Paddq(dst.fp(), tmp2.fp());
I64x2Mul(dst.fp(), lhs.fp(), rhs.fp(), tmp1.fp(), tmp2.fp());
}
void LiftoffAssembler::emit_i64x2_extmul_low_i32x4_s(LiftoffRegister dst,