[ia32][x64][liftoff] Share i64x2mul code
Optimize i64x2mul when AVX is supported to elide some moves.

Bug: v8:11589
Change-Id: Ide0bba502a35cbb632e3fc311c9697c5f54f9d82
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3163280
Reviewed-by: Adam Klein <adamk@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76889}
Parent: af7232380f
Commit: 693112bfc0
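For context, here is a minimal scalar sketch of what one i64x2.mul lane computes; the function name is hypothetical and exists only for illustration, mirroring steps 1-5 in SharedTurboAssembler::I64x2Mul below.

#include <cstdint>

// Hypothetical scalar model of one i64x2.mul lane (illustration only).
uint64_t I64x2MulLane(uint64_t a, uint64_t b) {
  uint64_t a_lo = a & 0xFFFFFFFFu, a_hi = a >> 32;
  uint64_t b_lo = b & 0xFFFFFFFFu, b_hi = b >> 32;
  // Steps 1-3: cross products of high and low dwords (pmuludq), summed and
  // shifted into the high dword of the result.
  uint64_t high = (a_hi * b_lo + a_lo * b_hi) << 32;
  // Steps 4-5: low-dword product plus the shifted cross terms. The a_hi * b_hi
  // term is dropped because it only affects bits >= 64.
  return a_lo * b_lo + high;
}

On AVX the shared helper can skip the two movaps copies that the SSE path needs, because the VEX-encoded vpsrlq/vpmuludq forms write a separate destination register instead of overwriting their first source; that is the move elision the commit message refers to.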
@@ -900,6 +900,51 @@ void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
   Psubq(dst, xmm_tmp);
 }
 
+void SharedTurboAssembler::I64x2Mul(XMMRegister dst, XMMRegister lhs,
+                                    XMMRegister rhs, XMMRegister tmp1,
+                                    XMMRegister tmp2) {
+  DCHECK(!AreAliased(dst, tmp1, tmp2));
+  DCHECK(!AreAliased(lhs, tmp1, tmp2));
+  DCHECK(!AreAliased(rhs, tmp1, tmp2));
+
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    // 1. Multiply high dword of each qword of left with right.
+    vpsrlq(tmp1, lhs, byte{32});
+    vpmuludq(tmp1, tmp1, rhs);
+    // 2. Multiply high dword of each qword of right with left.
+    vpsrlq(tmp2, rhs, byte{32});
+    vpmuludq(tmp2, tmp2, lhs);
+    // 3. Add 1 and 2, then shift left by 32 (this is the high dword of result).
+    vpaddq(tmp2, tmp2, tmp1);
+    vpsllq(tmp2, tmp2, byte{32});
+    // 4. Multiply low dwords (this is the low dword of result).
+    vpmuludq(dst, lhs, rhs);
+    // 5. Add 3 and 4.
+    vpaddq(dst, dst, tmp2);
+  } else {
+    // Same algorithm as AVX version, but with moves to not overwrite inputs.
+    movaps(tmp1, lhs);
+    movaps(tmp2, rhs);
+    psrlq(tmp1, byte{32});
+    pmuludq(tmp1, rhs);
+    psrlq(tmp2, byte{32});
+    pmuludq(tmp2, lhs);
+    paddq(tmp2, tmp1);
+    psllq(tmp2, byte{32});
+    if (dst == rhs) {
+      // pmuludq is commutative
+      pmuludq(dst, lhs);
+    } else {
+      if (dst != lhs) {
+        movaps(dst, lhs);
+      }
+      pmuludq(dst, rhs);
+    }
+    paddq(dst, tmp2);
+  }
+}
+
 // 1. Unpack src0, src1 into even-number elements of scratch.
 // 2. Unpack src1, src0 into even-number elements of dst.
 // 3. Multiply 1. with 2.
@@ -441,6 +441,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   void I64x2ShrS(XMMRegister dst, XMMRegister src, Register shift,
                  XMMRegister xmm_tmp, XMMRegister xmm_shift,
                  Register tmp_shift);
+  void I64x2Mul(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
+                XMMRegister tmp1, XMMRegister tmp2);
   void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                    XMMRegister scratch, bool low, bool is_signed);
   void I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src);
@@ -1985,28 +1985,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kIA32I64x2Mul: {
-      XMMRegister dst = i.OutputSimd128Register();
-      XMMRegister left = i.InputSimd128Register(0);
-      XMMRegister right = i.InputSimd128Register(1);
-      XMMRegister tmp1 = i.TempSimd128Register(0);
-      XMMRegister tmp2 = i.TempSimd128Register(1);
-
-      __ Movaps(tmp1, left);
-      __ Movaps(tmp2, right);
-
-      // Multiply high dword of each qword of left with right.
-      __ Psrlq(tmp1, byte{32});
-      __ Pmuludq(tmp1, tmp1, right);
-
-      // Multiply high dword of each qword of right with left.
-      __ Psrlq(tmp2, byte{32});
-      __ Pmuludq(tmp2, tmp2, left);
-
-      __ Paddq(tmp2, tmp2, tmp1);
-      __ Psllq(tmp2, tmp2, byte{32});
-
-      __ Pmuludq(dst, left, right);
-      __ Paddq(dst, dst, tmp2);
+      __ I64x2Mul(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                  i.InputSimd128Register(1), i.TempSimd128Register(0),
+                  i.TempSimd128Register(1));
       break;
     }
     case kIA32I64x2ShrU: {
@@ -2940,28 +2940,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64I64x2Mul: {
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      XMMRegister left = i.InputSimd128Register(0);
-      XMMRegister right = i.InputSimd128Register(1);
-      XMMRegister tmp1 = i.TempSimd128Register(0);
-      XMMRegister tmp2 = kScratchDoubleReg;
-
-      __ Movdqa(tmp1, left);
-      __ Movdqa(tmp2, right);
-
-      // Multiply high dword of each qword of left with right.
-      __ Psrlq(tmp1, byte{32});
-      __ Pmuludq(tmp1, right);
-
-      // Multiply high dword of each qword of right with left.
-      __ Psrlq(tmp2, byte{32});
-      __ Pmuludq(tmp2, left);
-
-      __ Paddq(tmp2, tmp1);
-      __ Psllq(tmp2, byte{32});
-
-      __ Pmuludq(left, right);
-      __ Paddq(left, tmp2);  // left == dst
+      __ I64x2Mul(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                  i.InputSimd128Register(1), i.TempSimd128Register(0),
+                  kScratchDoubleReg);
       break;
     }
     case kX64I64x2Eq: {
@@ -3863,19 +3863,7 @@ void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
       GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
   LiftoffRegister tmp2 =
       GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs, tmp1));
-  Movaps(tmp1.fp(), lhs.fp());
-  Movaps(tmp2.fp(), rhs.fp());
-  // Multiply high dword of each qword of left with right.
-  Psrlq(tmp1.fp(), byte{32});
-  Pmuludq(tmp1.fp(), tmp1.fp(), rhs.fp());
-  // Multiply high dword of each qword of right with left.
-  Psrlq(tmp2.fp(), byte{32});
-  Pmuludq(tmp2.fp(), tmp2.fp(), lhs.fp());
-  Paddq(tmp2.fp(), tmp2.fp(), tmp1.fp());
-  Psllq(tmp2.fp(), tmp2.fp(), byte{32});
-  liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmuludq, &Assembler::pmuludq>(
-      this, dst, lhs, rhs);
-  Paddq(dst.fp(), dst.fp(), tmp2.fp());
+  I64x2Mul(dst.fp(), lhs.fp(), rhs.fp(), tmp1.fp(), tmp2.fp());
 }
 
 void LiftoffAssembler::emit_i64x2_extmul_low_i32x4_s(LiftoffRegister dst,
@@ -3413,19 +3413,7 @@ void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
       GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
   LiftoffRegister tmp2 =
       GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs, tmp1));
-  Movaps(tmp1.fp(), lhs.fp());
-  Movaps(tmp2.fp(), rhs.fp());
-  // Multiply high dword of each qword of left with right.
-  Psrlq(tmp1.fp(), byte{32});
-  Pmuludq(tmp1.fp(), rhs.fp());
-  // Multiply high dword of each qword of right with left.
-  Psrlq(tmp2.fp(), byte{32});
-  Pmuludq(tmp2.fp(), lhs.fp());
-  Paddq(tmp2.fp(), tmp1.fp());
-  Psllq(tmp2.fp(), byte{32});
-  liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmuludq, &Assembler::pmuludq>(
-      this, dst, lhs, rhs);
-  Paddq(dst.fp(), tmp2.fp());
+  I64x2Mul(dst.fp(), lhs.fp(), rhs.fp(), tmp1.fp(), tmp2.fp());
 }
 
 void LiftoffAssembler::emit_i64x2_extmul_low_i32x4_s(LiftoffRegister dst,