[wasm-simd][x64] Optimize f64x2.extract_lane
pextrq + movq crosses register files twice, which is not efficient. Optimize this by:

- checking if the lane is 0: do nothing if dst == src (macro-assembler helper)
- using vmovhlps on AVX, with src as both source operands to avoid a false dependency on dst
- using movhlps otherwise; this is shorter than shufpd and faster on older systems

Change-Id: I3486d87224c048b3229c2f92359b8b8e6d5fd025
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2589056
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71751}
parent 3bc06ed3e1
commit 6cb61e63bb
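
For orientation, here is a minimal sketch of the lowering this CL implements, pulled together into one hypothetical free function (EmitF64x2ExtractLane is not part of the CL; V8-internal types and macros such as TurboAssembler, CpuFeatureScope and DCHECK_EQ are assumed):

// Sketch only: mirrors the new kX64F64x2ExtractLane case in the
// code-generator hunk below.
void EmitF64x2ExtractLane(TurboAssembler* tasm, DoubleRegister dst,
                          XMMRegister src, uint8_t lane) {
  if (lane == 0) {
    // Lane 0 already occupies the low 64 bits; Move emits nothing
    // when dst == src (the new macro-assembler helper).
    tasm->Move(dst, src);
  } else {
    DCHECK_EQ(1, lane);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(tasm, AVX);
      // Use src for both source operands so the new value of dst does
      // not depend on its previous contents (no false dependency).
      tasm->vmovhlps(dst, src, src);
    } else {
      // movhlps: shorter encoding than shufpd, faster on older cores.
      tasm->movhlps(dst, src);
    }
  }
}

Either way, the double in the requested lane ends up in the low 64 bits of dst, with no trip through a general-purpose register.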
@@ -1284,6 +1284,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void pshuflw(XMMRegister dst, XMMRegister src, uint8_t shuffle);
   void pshuflw(XMMRegister dst, Operand src, uint8_t shuffle);
 
+  void movhlps(XMMRegister dst, XMMRegister src) {
+    sse_instr(dst, src, 0x0F, 0x12);
+  }
   void movlhps(XMMRegister dst, XMMRegister src) {
     sse_instr(dst, src, 0x0F, 0x16);
   }
@@ -1386,6 +1389,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void vmovlhps(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
     vinstr(0x16, dst, src1, src2, kNone, k0F, kWIG);
   }
+  void vmovhlps(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
+    vinstr(0x12, dst, src1, src2, kNone, k0F, kWIG);
+  }
   void vcvtss2sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
     vinstr(0x5a, dst, src1, src2, kF3, k0F, kWIG);
   }
@@ -1293,6 +1293,12 @@ void TurboAssembler::Move(Register dst, Register src) {
   }
 }
 
+void TurboAssembler::Move(XMMRegister dst, XMMRegister src) {
+  if (dst != src) {
+    movaps(dst, src);
+  }
+}
+
 void TurboAssembler::MovePair(Register dst0, Register src0, Register dst1,
                               Register src1) {
   if (dst0 != src1) {
@@ -442,6 +442,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
 
   // Move if the registers are not identical.
   void Move(Register target, Register source);
+  void Move(XMMRegister target, XMMRegister source);
 
   void Move(Register dst, Handle<HeapObject> source,
             RelocInfo::Mode rmode = RelocInfo::FULL_EMBEDDED_OBJECT);
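
A quick usage sketch of the new overload (hypothetical call sites on an assumed TurboAssembler instance masm; not part of the CL):

  masm.Move(xmm1, xmm1);  // identical registers: emits no instruction
  masm.Move(xmm1, xmm2);  // emits movaps xmm1, xmm2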
@@ -2453,8 +2453,21 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64F64x2ExtractLane: {
-      __ Pextrq(kScratchRegister, i.InputSimd128Register(0), i.InputInt8(1));
-      __ Movq(i.OutputDoubleRegister(), kScratchRegister);
+      DoubleRegister dst = i.OutputDoubleRegister();
+      XMMRegister src = i.InputSimd128Register(0);
+      uint8_t lane = i.InputUint8(1);
+      if (lane == 0) {
+        __ Move(dst, src);
+      } else {
+        DCHECK_EQ(1, lane);
+        if (CpuFeatures::IsSupported(AVX)) {
+          CpuFeatureScope avx_scope(tasm(), AVX);
+          // Pass src as operand to avoid false-dependency on dst.
+          __ vmovhlps(dst, src, src);
+        } else {
+          __ movhlps(dst, src);
+        }
+      }
       break;
     }
     case kX64F64x2Sqrt: {
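
The net effect of this hunk, with illustrative register names: for lane 1, the old pair pextrq r, xmm0, 1 / movq xmm1, r (two crossings between the SIMD and general-purpose register files) becomes a single movhlps xmm1, xmm0, or vmovhlps xmm1, xmm0, xmm0 under AVX; for lane 0, no code is emitted at all when the output register aliases the input.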
@@ -1381,9 +1381,15 @@ int DisassemblerX64::AVXInstruction(byte* data) {
       AppendToBuffer(",%s", NameOfXMMRegister(regop));
       break;
     case 0x12:
-      AppendToBuffer("vmovlps %s,%s,", NameOfXMMRegister(regop),
-                     NameOfXMMRegister(vvvv));
-      current += PrintRightXMMOperand(current);
+      if (mod == 0b11) {
+        AppendToBuffer("vmovhlps %s,%s,", NameOfXMMRegister(regop),
+                       NameOfXMMRegister(vvvv));
+        current += PrintRightXMMOperand(current);
+      } else {
+        AppendToBuffer("vmovlps %s,%s,", NameOfXMMRegister(regop),
+                       NameOfXMMRegister(vvvv));
+        current += PrintRightXMMOperand(current);
+      }
       break;
     case 0x13:
       AppendToBuffer("vmovlps ");
@@ -2065,8 +2071,13 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) {
       // movups xmm/m128, xmm
       current += PrintOperands("movups", XMMOPER_XMMREG_OP_ORDER, current);
     } else if (opcode == 0x12) {
-      // movlps xmm1, m64
-      current += PrintOperands("movlps", XMMREG_OPER_OP_ORDER, current);
+      // movhlps xmm1, xmm2
+      // movlps xmm1, m64
+      if (mod == 0b11) {
+        current += PrintOperands("movhlps", XMMREG_XMMOPER_OP_ORDER, current);
+      } else {
+        current += PrintOperands("movlps", XMMREG_OPER_OP_ORDER, current);
+      }
     } else if (opcode == 0x13) {
       // movlps m64, xmm1
       AppendToBuffer("movlps ");
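
Given the format strings above, the new mnemonics should disassemble roughly as follows (sketch; operand order per PrintOperands and PrintRightXMMOperand):

  movhlps xmm5,xmm1
  vmovhlps xmm1,xmm3,xmm5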
@@ -401,8 +401,10 @@ TEST(DisasmX64) {
   __ movdqu(xmm0, Operand(rsp, 12));
   __ movdqu(Operand(rsp, 12), xmm0);
   __ movdqu(xmm1, xmm0);
+  __ movhlps(xmm5, xmm1);
   __ movlps(xmm8, Operand(rbx, rcx, times_4, 10000));
   __ movlps(Operand(rbx, rcx, times_4, 10000), xmm9);
+  __ movlhps(xmm5, xmm1);
   __ movhps(xmm8, Operand(rbx, rcx, times_4, 10000));
   __ movhps(Operand(rbx, rcx, times_4, 10000), xmm9);
   __ shufps(xmm0, xmm9, 0x0);
@@ -577,7 +579,6 @@ TEST(DisasmX64) {
   __ movups(xmm5, xmm1);
   __ movups(xmm5, Operand(rdx, 4));
   __ movups(Operand(rdx, 4), xmm5);
-  __ movlhps(xmm5, xmm1);
   __ pmulld(xmm5, xmm1);
   __ pmulld(xmm5, Operand(rdx, 4));
   __ pmullw(xmm5, xmm1);
@@ -659,8 +660,10 @@ TEST(DisasmX64) {
   __ vmovdqu(xmm9, Operand(rbx, rcx, times_4, 10000));
   __ vmovdqu(Operand(rbx, rcx, times_4, 10000), xmm0);
 
+  __ vmovhlps(xmm1, xmm3, xmm5);
   __ vmovlps(xmm8, xmm9, Operand(rbx, rcx, times_4, 10000));
   __ vmovlps(Operand(rbx, rcx, times_4, 10000), xmm9);
+  __ vmovlhps(xmm1, xmm3, xmm5);
   __ vmovhps(xmm8, xmm9, Operand(rbx, rcx, times_4, 10000));
   __ vmovhps(Operand(rbx, rcx, times_4, 10000), xmm12);
 
@@ -693,7 +696,6 @@ TEST(DisasmX64) {
   __ vmovups(xmm5, xmm1);
   __ vmovups(xmm5, Operand(rdx, 4));
   __ vmovups(Operand(rdx, 4), xmm5);
-  __ vmovlhps(xmm1, xmm3, xmm5);
 
   __ vandps(xmm0, xmm9, xmm2);
   __ vandps(xmm9, xmm1, Operand(rbx, rcx, times_4, 10000));