ARM: Make DoStoreKeyedFixedDoubleArray faster; don't allow conditional Vmov

This patch makes us generate faster code for DoStoreKeyedFixedDoubleArray by using a branch rather than a conditional Vmov instruction. Conditional VFP instructions are not a great idea in general, and it was especially bad in this case because Vmov expands to a bunch of instructions. For this reason, the patch also removes the 'cond' parameter from Vmov.

Thanks to Rodolph for pointing me to this!

BUG=none
Review URL: https://chromiumcodereview.appspot.com/12316096
Patch from Hans Wennborg <hans@chromium.org>.

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13722 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
2013-02-25 16:15:37 +00:00 · 2013-02-25 16:15:37 +00:00 · 9ebcfb41e2
commit 9ebcfb41e2
parent 34c372d800
5 changed files with 20 additions and 20 deletions
--- a/src/arm/assembler-arm.cc
+++ b/src/arm/assembler-arm.cc
@@ -2067,8 +2067,7 @@ static bool FitsVMOVDoubleImmediate(double d, uint32_t *encoding) {

 void Assembler::vmov(const DwVfpRegister dst,
                     double imm,
-                     const Register scratch,
-                     const Condition cond) {
+                     const Register scratch) {
  ASSERT(CpuFeatures::IsEnabled(VFP2));

  uint32_t enc;
@@ -2081,7 +2080,7 @@ void Assembler::vmov(const DwVfpRegister dst,
    // Vd(15-12) | 101(11-9) | sz=1(8) | imm4L(3-0)
    int vd, d;
    dst.split_code(&vd, &d);
-    emit(cond | 0x1D*B23 | d*B22 | 0x3*B20 | vd*B12 | 0x5*B9 | B8 | enc);
+    emit(al | 0x1D*B23 | d*B22 | 0x3*B20 | vd*B12 | 0x5*B9 | B8 | enc);
  } else if (FLAG_enable_vldr_imm) {
    // TODO(jfb) Temporarily turned off until we have constant blinding or
    //           some equivalent mitigation: an attacker can otherwise control
@@ -2099,7 +2098,7 @@ void Assembler::vmov(const DwVfpRegister dst,
    //           that's tricky because vldr has a limited reach. Furthermore
    //           it breaks load locality.
    RecordRelocInfo(imm);
-    vldr(dst, MemOperand(pc, 0), cond);
+    vldr(dst, MemOperand(pc, 0));
  } else {
    // Synthesise the double from ARM immediates.
    uint32_t lo, hi;
@@ -2110,27 +2109,27 @@ void Assembler::vmov(const DwVfpRegister dst,
        // Move the low part of the double into the lower of the corresponsing S
        // registers of D register dst.
        mov(ip, Operand(lo));
-        vmov(dst.low(), ip, cond);
+        vmov(dst.low(), ip);

        // Move the high part of the double into the higher of the
        // corresponsing S registers of D register dst.
        mov(ip, Operand(hi));
-        vmov(dst.high(), ip, cond);
+        vmov(dst.high(), ip);
      } else {
        // D16-D31 does not have S registers, so move the low and high parts
        // directly to the D register using vmov.32.
        // Note: This may be slower, so we only do this when we have to.
        mov(ip, Operand(lo));
-        vmov(dst, VmovIndexLo, ip, cond);
+        vmov(dst, VmovIndexLo, ip);
        mov(ip, Operand(hi));
-        vmov(dst, VmovIndexHi, ip, cond);
+        vmov(dst, VmovIndexHi, ip);
      }
    } else {
      // Move the low and high parts of the double to a D register in one
      // instruction.
      mov(ip, Operand(lo));
      mov(scratch, Operand(hi));
-      vmov(dst, ip, scratch, cond);
+      vmov(dst, ip, scratch);
    }
  }
 }
--- a/src/arm/assembler-arm.h
+++ b/src/arm/assembler-arm.h
@@ -1066,8 +1066,7 @@ class Assembler : public AssemblerBase {

  void vmov(const DwVfpRegister dst,
            double imm,
-            const Register scratch = no_reg,
-            const Condition cond = al);
+            const Register scratch = no_reg);
  void vmov(const SwVfpRegister dst,
            const SwVfpRegister src,
            const Condition cond = al);
--- a/src/arm/lithium-codegen-arm.cc
+++ b/src/arm/lithium-codegen-arm.cc
@@ -4472,10 +4472,14 @@ void LCodeGen::DoStoreKeyedFixedDoubleArray(LStoreKeyed* instr) {
  if (instr->NeedsCanonicalization()) {
    // Check for NaN. All NaNs must be canonicalized.
    __ VFPCompareAndSetFlags(value, value);
+    Label after_canonicalization;
+
    // Only load canonical NaN if the comparison above set the overflow.
+    __ b(vc, &after_canonicalization);
    __ Vmov(value,
-            FixedDoubleArray::canonical_not_the_hole_nan_as_double(),
-            no_reg, vs);
+            FixedDoubleArray::canonical_not_the_hole_nan_as_double());
+
+    __ bind(&after_canonicalization);
  }

  __ vstr(value, scratch, instr->additional_index() << element_size_shift);
--- a/src/arm/macro-assembler-arm.cc
+++ b/src/arm/macro-assembler-arm.cc
@@ -812,19 +812,18 @@ void MacroAssembler::VFPCompareAndLoadFlags(const DwVfpRegister src1,

 void MacroAssembler::Vmov(const DwVfpRegister dst,
                          const double imm,
-                          const Register scratch,
-                          const Condition cond) {
+                          const Register scratch) {
  ASSERT(CpuFeatures::IsEnabled(VFP2));
  static const DoubleRepresentation minus_zero(-0.0);
  static const DoubleRepresentation zero(0.0);
  DoubleRepresentation value(imm);
  // Handle special values first.
  if (value.bits == zero.bits) {
-    vmov(dst, kDoubleRegZero, cond);
+    vmov(dst, kDoubleRegZero);
  } else if (value.bits == minus_zero.bits) {
-    vneg(dst, kDoubleRegZero, cond);
+    vneg(dst, kDoubleRegZero);
  } else {
-    vmov(dst, imm, scratch, cond);
+    vmov(dst, imm, scratch);
  }
 }

--- a/src/arm/macro-assembler-arm.h
+++ b/src/arm/macro-assembler-arm.h
@@ -480,8 +480,7 @@ class MacroAssembler: public Assembler {

  void Vmov(const DwVfpRegister dst,
            const double imm,
-            const Register scratch = no_reg,
-            const Condition cond = al);
+            const Register scratch = no_reg);

  // Enter exit frame.
  // stack_space - extra stack space, used for alignment before call to C.