[arm][turbofan] Use NEON for unaligned float64 memory accesses

When available, we use the NEON instructions vld1.8 and vst1.8 to implement
unaligned loads and stores of float64 values.

R=bmeurer@chromium.org, v8-arm-ports@googlegroups.com

Review-Url: https://codereview.chromium.org/2769723003
Cr-Commit-Position: refs/heads/master@{#44063}
2017-03-23 07:25:35 -07:00 · 2017-03-23 07:25:35 -07:00 · ae8bc6ed2a
commit ae8bc6ed2a
parent 118f09f121
5 changed files with 76 additions and 32 deletions
--- a/src/arm/assembler-arm.cc
+++ b/src/arm/assembler-arm.cc
@ -468,7 +468,6 @@ NeonMemOperand::NeonMemOperand(Register rn, Register rm, int align) {
  SetAlignment(align);
 }

-
 void NeonMemOperand::SetAlignment(int align) {
  switch (align) {
    case 0:
--- a/src/compiler/arm/code-generator-arm.cc
+++ b/src/compiler/arm/code-generator-arm.cc
@ -1443,6 +1443,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ vstr(i.InputFloatRegister(0), i.InputOffset(1));
      DCHECK_EQ(LeaveCC, i.OutputSBit());
      break;
+    case kArmVld1F64: {
+      __ vld1(NeonSize::Neon8, NeonListOperand(i.OutputDoubleRegister()),
+              NeonMemOperand(i.InputRegister(0)));
+      break;
+    }
+    case kArmVst1F64: {
+      __ vst1(Neon8, NeonListOperand(i.InputDoubleRegister(0)),
+              NeonMemOperand(i.InputRegister(1)));
+      break;
+    }
    case kArmVldrF64:
      __ vldr(i.OutputDoubleRegister(), i.InputOffset());
      DCHECK_EQ(LeaveCC, i.OutputSBit());
--- a/src/compiler/arm/instruction-codes-arm.h
+++ b/src/compiler/arm/instruction-codes-arm.h
@ -104,7 +104,9 @@ namespace compiler {
  V(ArmVldrF32)                    \
  V(ArmVstrF32)                    \
  V(ArmVldrF64)                    \
+  V(ArmVld1F64)                    \
  V(ArmVstrF64)                    \
+  V(ArmVst1F64)                    \
  V(ArmFloat32Max)                 \
  V(ArmFloat64Max)                 \
  V(ArmFloat32Min)                 \
--- a/src/compiler/arm/instruction-scheduler-arm.cc
+++ b/src/compiler/arm/instruction-scheduler-arm.cc
@ -216,6 +216,7 @@ int InstructionScheduler::GetTargetInstructionFlags(

    case kArmVldrF32:
    case kArmVldrF64:
+    case kArmVld1F64:
    case kArmLdrb:
    case kArmLdrsb:
    case kArmLdrh:
@ -225,6 +226,7 @@ int InstructionScheduler::GetTargetInstructionFlags(

    case kArmVstrF32:
    case kArmVstrF64:
+    case kArmVst1F64:
    case kArmStrb:
    case kArmStrh:
    case kArmStr:
--- a/src/compiler/arm/instruction-selector-arm.cc
+++ b/src/compiler/arm/instruction-selector-arm.cc
@ -560,7 +560,6 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
      return;
    }
    case MachineRepresentation::kFloat64: {
-      // TODO(arm): use vld1.8 for this when NEON is available.
      // Compute the address of the least-significant half of the FP value.
      // We assume that the base node is unlikely to be an encodable immediate
      // or the result of a shift operation, so only consider the addressing
@ -572,8 +571,8 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
      size_t input_count;
      if (TryMatchImmediateOrShift(this, &add_opcode, index, &input_count,
                                   &inputs[1])) {
-        // input_count has been set by TryMatchImmediateOrShift(), so increment
-        // it to account for the base register in inputs[0].
+        // input_count has been set by TryMatchImmediateOrShift(), so
+        // increment it to account for the base register in inputs[0].
        input_count++;
      } else {
        add_opcode |= AddressingModeField::encode(kMode_Operand2_R);
@ -584,6 +583,10 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
      InstructionOperand addr = g.TempRegister();
      Emit(add_opcode, 1, &addr, input_count, inputs);

+      if (CpuFeatures::IsSupported(NEON)) {
+        // With NEON we can load directly from the calculated address.
+        Emit(kArmVld1F64, g.DefineAsRegister(node), addr);
+      } else {
        // Load both halves and move to an FP register.
        InstructionOperand fp_lo = g.TempRegister();
        InstructionOperand fp_hi = g.TempRegister();
@ -591,6 +594,7 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
        Emit(opcode, fp_lo, addr, g.TempImmediate(0));
        Emit(opcode, fp_hi, addr, g.TempImmediate(4));
        Emit(kArmVmovF64U32U32, g.DefineAsRegister(node), fp_lo, fp_hi);
+      }
      return;
    }
    default:
@ -624,7 +628,33 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {
      return;
    }
    case MachineRepresentation::kFloat64: {
-      // TODO(arm): use vst1.8 for this when NEON is available.
+      if (CpuFeatures::IsSupported(NEON)) {
+        InstructionOperand address = g.TempRegister();
+        {
+          // First we have to calculate the actual address.
+          InstructionCode add_opcode = kArmAdd;
+          InstructionOperand inputs[3];
+          inputs[0] = g.UseRegister(base);
+
+          size_t input_count;
+          if (TryMatchImmediateOrShift(this, &add_opcode, index, &input_count,
+                                       &inputs[1])) {
+            // input_count has been set by TryMatchImmediateOrShift(), so
+            // increment it to account for the base register in inputs[0].
+            input_count++;
+          } else {
+            add_opcode |= AddressingModeField::encode(kMode_Operand2_R);
+            inputs[1] = g.UseRegister(index);
+            input_count = 2;  // Base register and index.
+          }
+
+          Emit(add_opcode, 1, &address, input_count, inputs);
+        }
+
+        inputs[input_count++] = g.UseRegister(value);
+        inputs[input_count++] = address;
+        Emit(kArmVst1F64, 0, nullptr, input_count, inputs);
+      } else {
        // Store a 64-bit floating point value using two 32-bit integer stores.
        // Computing the store address here would require three live temporary
        // registers (fp<63:32>, fp<31:0>, address), so compute base + 4 after
@ -633,12 +663,12 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {
        // First, move the 64-bit FP value into two temporary integer registers.
        InstructionOperand fp[] = {g.TempRegister(), g.TempRegister()};
        inputs[input_count++] = g.UseRegister(value);
-      Emit(kArmVmovU32U32F64, arraysize(fp), fp, input_count,
-           inputs);
+        Emit(kArmVmovU32U32F64, arraysize(fp), fp, input_count, inputs);

        // Store the least-significant half.
        inputs[0] = fp[0];  // Low 32-bits of FP value.
-      inputs[input_count++] = g.UseRegister(base);  // First store base address.
+        inputs[input_count++] =
+            g.UseRegister(base);  // First store base address.
        EmitStore(this, kArmStr, input_count, inputs, index);

        // Store the most-significant half.
@ -648,6 +678,7 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {
        inputs[0] = fp[1];  // High 32-bits of FP value.
        inputs[1] = base4;  // Second store base + 4 address.
        EmitStore(this, kArmStr, input_count, inputs, index);
+      }
      return;
    }
    default: