[wasm-simd][x64] Optimize f64x2.extract_lane

pextrq + movq crosses register files twice (XMM to general-purpose and
back), which is inefficient.

Optimize this by:
- checking for lane 0: if dst == src, emit nothing (via a new
  macro-assembler Move helper)
- using vmovhlps on AVX, with src as both source operands to avoid a
  false dependency on dst
- using movhlps otherwise; it is shorter than shufpd and faster on
  older systems
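
For illustration, the replacement sequence can be sketched outside V8
with SSE intrinsics (a standalone sketch, not code from this CL;
_mm_movehl_ps lowers to movhlps, or vmovhlps under AVX):

#include <emmintrin.h>

// Extract lane 1 of an f64x2 value without crossing register files.
// _mm_movehl_ps(a, b) copies the upper 64 bits of b into the lower 64
// bits of the result, so passing v twice moves lane 1 down to lane 0.
double ExtractLane1(__m128d v) {
  __m128 hi = _mm_movehl_ps(_mm_castpd_ps(v), _mm_castpd_ps(v));
  return _mm_cvtsd_f64(_mm_castps_pd(hi));
}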

Change-Id: I3486d87224c048b3229c2f92359b8b8e6d5fd025
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2589056
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71751}
Author: Zhi An Ng <zhin@chromium.org>
Committed: 2020-12-14 23:12:42 +00:00 by Commit Bot
Commit: 6cb61e63bb (parent: 3bc06ed3e1)
6 changed files with 47 additions and 8 deletions

src/codegen/x64/assembler-x64.h

@@ -1284,6 +1284,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void pshuflw(XMMRegister dst, XMMRegister src, uint8_t shuffle);
   void pshuflw(XMMRegister dst, Operand src, uint8_t shuffle);
+  void movhlps(XMMRegister dst, XMMRegister src) {
+    sse_instr(dst, src, 0x0F, 0x12);
+  }
   void movlhps(XMMRegister dst, XMMRegister src) {
     sse_instr(dst, src, 0x0F, 0x16);
   }
@@ -1386,6 +1389,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void vmovlhps(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
     vinstr(0x16, dst, src1, src2, kNone, k0F, kWIG);
   }
+  void vmovhlps(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
+    vinstr(0x12, dst, src1, src2, kNone, k0F, kWIG);
+  }
   void vcvtss2sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
     vinstr(0x5a, dst, src1, src2, kF3, k0F, kWIG);
   }
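
Both helpers share opcode 0x12 with movlps/vmovlps; the register-register
ModRM form (mod == 0b11) is what selects movhlps. Worked out by hand from
the Intel SDM (an illustrative sanity check, not part of this change),
the expected encodings are:

  movhlps xmm5, xmm1        -> 0F 12 E9     (ModRM 0xE9: mod=11, reg=5, rm=1)
  vmovhlps xmm1, xmm3, xmm5 -> C5 E0 12 CD  (2-byte VEX; vvvv = ~3, L=0, pp=NP)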

src/codegen/x64/macro-assembler-x64.cc

@@ -1293,6 +1293,12 @@ void TurboAssembler::Move(Register dst, Register src) {
   }
 }
 
+void TurboAssembler::Move(XMMRegister dst, XMMRegister src) {
+  if (dst != src) {
+    movaps(dst, src);
+  }
+}
+
 void TurboAssembler::MovePair(Register dst0, Register src0, Register dst1,
                               Register src1) {
   if (dst0 != src1) {
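
This overload mirrors the existing Register variant, so the lane-0 path
in the code generator below is free whenever the register allocator
assigns the output to the input register. Hypothetical usage (registers
chosen arbitrarily):

  __ Move(xmm0, xmm0);  // dst == src: emits nothing
  __ Move(xmm1, xmm0);  // dst != src: emits movaps xmm1, xmm0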

src/codegen/x64/macro-assembler-x64.h

@@ -442,6 +442,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   // Move if the registers are not identical.
   void Move(Register target, Register source);
+  void Move(XMMRegister target, XMMRegister source);
   void Move(Register dst, Handle<HeapObject> source,
             RelocInfo::Mode rmode = RelocInfo::FULL_EMBEDDED_OBJECT);

src/compiler/backend/x64/code-generator-x64.cc

@@ -2453,8 +2453,21 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64F64x2ExtractLane: {
-      __ Pextrq(kScratchRegister, i.InputSimd128Register(0), i.InputInt8(1));
-      __ Movq(i.OutputDoubleRegister(), kScratchRegister);
+      DoubleRegister dst = i.OutputDoubleRegister();
+      XMMRegister src = i.InputSimd128Register(0);
+      uint8_t lane = i.InputUint8(1);
+      if (lane == 0) {
+        __ Move(dst, src);
+      } else {
+        DCHECK_EQ(1, lane);
+        if (CpuFeatures::IsSupported(AVX)) {
+          CpuFeatureScope avx_scope(tasm(), AVX);
+          // Pass src as operand to avoid false-dependency on dst.
+          __ vmovhlps(dst, src, src);
+        } else {
+          __ movhlps(dst, src);
+        }
+      }
       break;
     }
     case kX64F64x2Sqrt: {
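
A note on the operand choice above: movhlps merges into its destination
(the upper 64 bits of dst survive), so the non-destructive VEX form must
name a register to supply that upper half. Naming dst there would make
the instruction wait on whatever last wrote dst; passing src twice keeps
the dependency chain on src alone:

  // vmovhlps dst, dst, src  - false dependency on the last write to dst
  // vmovhlps dst, src, src  - depends only on src (what this CL emits)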

src/diagnostics/x64/disasm-x64.cc

@@ -1381,9 +1381,15 @@ int DisassemblerX64::AVXInstruction(byte* data) {
         AppendToBuffer(",%s", NameOfXMMRegister(regop));
         break;
       case 0x12:
+        if (mod == 0b11) {
+          AppendToBuffer("vmovhlps %s,%s,", NameOfXMMRegister(regop),
+                         NameOfXMMRegister(vvvv));
+          current += PrintRightXMMOperand(current);
+        } else {
           AppendToBuffer("vmovlps %s,%s,", NameOfXMMRegister(regop),
                          NameOfXMMRegister(vvvv));
           current += PrintRightXMMOperand(current);
+        }
         break;
       case 0x13:
         AppendToBuffer("vmovlps ");
@@ -2065,8 +2071,13 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) {
       // movups xmm/m128, xmm
       current += PrintOperands("movups", XMMOPER_XMMREG_OP_ORDER, current);
     } else if (opcode == 0x12) {
+      // movhlps xmm1, xmm2
       // movlps xmm1, m64
+      if (mod == 0b11) {
+        current += PrintOperands("movhlps", XMMREG_XMMOPER_OP_ORDER, current);
+      } else {
         current += PrintOperands("movlps", XMMREG_OPER_OP_ORDER, current);
+      }
     } else if (opcode == 0x13) {
       // movlps m64, xmm1
       AppendToBuffer("movlps ");
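
Both decoder changes hinge on the same rule the assembler relies on:
opcode 0F 12 is movhlps when ModRM selects a register (mod == 0b11) and
movlps when it selects memory. A standalone sketch of that
discrimination (hypothetical helper, not V8's decoder):

#include <cstdint>
#include <cstdio>

// Pick the mnemonic for opcode 0F 12 from the ModRM mod field.
const char* MnemonicFor0F12(uint8_t modrm) {
  return (modrm >> 6) == 0b11 ? "movhlps" : "movlps";
}

int main() {
  printf("%s\n", MnemonicFor0F12(0xE9));  // mod == 0b11 -> movhlps
  printf("%s\n", MnemonicFor0F12(0x29));  // memory operand -> movlps
}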

test/cctest/test-disasm-x64.cc

@@ -401,8 +401,10 @@ TEST(DisasmX64) {
   __ movdqu(xmm0, Operand(rsp, 12));
   __ movdqu(Operand(rsp, 12), xmm0);
   __ movdqu(xmm1, xmm0);
+  __ movhlps(xmm5, xmm1);
   __ movlps(xmm8, Operand(rbx, rcx, times_4, 10000));
   __ movlps(Operand(rbx, rcx, times_4, 10000), xmm9);
+  __ movlhps(xmm5, xmm1);
   __ movhps(xmm8, Operand(rbx, rcx, times_4, 10000));
   __ movhps(Operand(rbx, rcx, times_4, 10000), xmm9);
   __ shufps(xmm0, xmm9, 0x0);
@@ -577,7 +579,6 @@ TEST(DisasmX64) {
   __ movups(xmm5, xmm1);
   __ movups(xmm5, Operand(rdx, 4));
   __ movups(Operand(rdx, 4), xmm5);
-  __ movlhps(xmm5, xmm1);
   __ pmulld(xmm5, xmm1);
   __ pmulld(xmm5, Operand(rdx, 4));
   __ pmullw(xmm5, xmm1);
@@ -659,8 +660,10 @@ TEST(DisasmX64) {
   __ vmovdqu(xmm9, Operand(rbx, rcx, times_4, 10000));
   __ vmovdqu(Operand(rbx, rcx, times_4, 10000), xmm0);
+  __ vmovhlps(xmm1, xmm3, xmm5);
   __ vmovlps(xmm8, xmm9, Operand(rbx, rcx, times_4, 10000));
   __ vmovlps(Operand(rbx, rcx, times_4, 10000), xmm9);
+  __ vmovlhps(xmm1, xmm3, xmm5);
   __ vmovhps(xmm8, xmm9, Operand(rbx, rcx, times_4, 10000));
   __ vmovhps(Operand(rbx, rcx, times_4, 10000), xmm12);
@@ -693,7 +696,6 @@ TEST(DisasmX64) {
   __ vmovups(xmm5, xmm1);
   __ vmovups(xmm5, Operand(rdx, 4));
   __ vmovups(Operand(rdx, 4), xmm5);
-  __ vmovlhps(xmm1, xmm3, xmm5);
   __ vandps(xmm0, xmm9, xmm2);
   __ vandps(xmm9, xmm1, Operand(rbx, rcx, times_4, 10000));