[x86] Allow (v)divsd->(v)mulsd to execute in parallel.

This tweak was already present in CrankShaft for the non-AVX case. As it turns out, it's also relevant even with AVX. Now the same optimization is applied in case of TurboFan as well. R=dcarney@chromium.org Review URL: https://codereview.chromium.org/1081033003 Cr-Commit-Position: refs/heads/master@{#27774}
2015-04-13 01:25:27 -07:00 · 2015-04-13 01:25:27 -07:00 · e21f9ab42b
commit e21f9ab42b
parent 1dbc432729
4 changed files with 30 additions and 6 deletions
--- a/src/compiler/ia32/code-generator-ia32.cc
+++ b/src/compiler/ia32/code-generator-ia32.cc
@ -468,6 +468,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
      break;
    case kSSEFloat32Div:
      __ divss(i.InputDoubleRegister(0), i.InputOperand(1));
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulss depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
      break;
    case kSSEFloat32Max:
      __ maxss(i.InputDoubleRegister(0), i.InputOperand(1));
@ -506,6 +509,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
      break;
    case kSSEFloat64Div:
      __ divsd(i.InputDoubleRegister(0), i.InputOperand(1));
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
      break;
    case kSSEFloat64Max:
      __ maxsd(i.InputDoubleRegister(0), i.InputOperand(1));
@ -629,6 +635,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
      CpuFeatureScope avx_scope(masm(), AVX);
      __ vdivss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulss depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
      break;
    }
    case kAVXFloat32Max: {
@ -665,6 +674,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
      CpuFeatureScope avx_scope(masm(), AVX);
      __ vdivsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
      break;
    }
    case kAVXFloat64Max: {
--- a/src/compiler/x64/code-generator-x64.cc
+++ b/src/compiler/x64/code-generator-x64.cc
@ -726,6 +726,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
      break;
    case kSSEFloat32Div:
      ASSEMBLE_SSE_BINOP(divss);
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulss depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
      break;
    case kSSEFloat32Abs: {
      // TODO(bmeurer): Use RIP relative 128-bit constants.
@ -767,6 +770,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
      break;
    case kSSEFloat64Div:
      ASSEMBLE_SSE_BINOP(divsd);
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
      break;
    case kSSEFloat64Mod: {
      __ subq(rsp, Immediate(kDoubleSize));
@ -919,6 +925,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
      break;
    case kAVXFloat32Div:
      ASSEMBLE_AVX_BINOP(vdivss);
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulss depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
      break;
    case kAVXFloat32Max:
      ASSEMBLE_AVX_BINOP(vmaxss);
@ -946,6 +955,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
      break;
    case kAVXFloat64Div:
      ASSEMBLE_AVX_BINOP(vdivsd);
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
      break;
    case kAVXFloat64Max:
      ASSEMBLE_AVX_BINOP(vmaxsd);
--- a/src/ia32/lithium-codegen-ia32.cc
+++ b/src/ia32/lithium-codegen-ia32.cc
@ -2040,10 +2040,10 @@ void LCodeGen::DoArithmeticD(LArithmeticD* instr) {
      } else {
        DCHECK(result.is(left));
        __ divsd(left, right);
-        // Don't delete this mov. It may improve performance on some CPUs,
-        // when there is a mulsd depending on the result
-        __ movaps(left, left);
      }
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result
+      __ movaps(result, result);
      break;
    case Token::MOD: {
      // Pass two doubles as arguments on the stack.
--- a/src/x64/lithium-codegen-x64.cc
+++ b/src/x64/lithium-codegen-x64.cc
@ -2076,10 +2076,10 @@ void LCodeGen::DoArithmeticD(LArithmeticD* instr) {
      } else {
        DCHECK(result.is(left));
        __ divsd(left, right);
-        // Don't delete this mov. It may improve performance on some CPUs,
-        // when there is a mulsd depending on the result
-        __ movaps(left, left);
      }
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result
+      __ movaps(result, result);
      break;
    case Token::MOD: {
      XMMRegister xmm_scratch = double_scratch0();