[ARM] Make Simd 128 bit load/store more like existing load/store.

- Renames kArmSimd128Load, kArmSimd128Store to kArmVld1S128, kArmVst1S128
- Handles the unaligned load/store cases.

LOG=N
BUG=v8:6020

Review-Url: https://codereview.chromium.org/2769083003
Cr-Commit-Position: refs/heads/master@{#44117}
2017-03-24 10:40:49 -07:00 · 2017-03-24 10:40:49 -07:00 · 6839e7ac08
commit 6839e7ac08
parent 14e01da1cf
4 changed files with 38 additions and 28 deletions
--- a/src/compiler/arm/code-generator-arm.cc
+++ b/src/compiler/arm/code-generator-arm.cc
@ -1444,7 +1444,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      DCHECK_EQ(LeaveCC, i.OutputSBit());
      break;
    case kArmVld1F64: {
-      __ vld1(NeonSize::Neon8, NeonListOperand(i.OutputDoubleRegister()),
+      __ vld1(Neon8, NeonListOperand(i.OutputDoubleRegister()),
              NeonMemOperand(i.InputRegister(0)));
      break;
    }
@ -1453,6 +1453,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
              NeonMemOperand(i.InputRegister(1)));
      break;
    }
+    case kArmVld1S128: {
+      __ vld1(Neon8, NeonListOperand(i.OutputSimd128Register()),
+              NeonMemOperand(i.InputRegister(0)));
+      break;
+    }
+    case kArmVst1S128: {
+      __ vst1(Neon8, NeonListOperand(i.InputSimd128Register(0)),
+              NeonMemOperand(i.InputRegister(1)));
+      break;
+    }
    case kArmVldrF64:
      __ vldr(i.OutputDoubleRegister(), i.InputOffset());
      DCHECK_EQ(LeaveCC, i.OutputSBit());
@ -1992,18 +2002,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
              i.OutputSimd128Register());
      break;
    }
-    case kArmSimd128Load: {
-      MemOperand src = i.InputOffset();
-      __ vld1(Neon8, NeonListOperand(i.OutputSimd128Register()),
-              NeonMemOperand(src.rn(), src.rm()));
-      break;
-    }
-    case kArmSimd128Store: {
-      MemOperand src = i.InputOffset(1);
-      __ vst1(Neon8, NeonListOperand(i.InputSimd128Register(0)),
-              NeonMemOperand(src.rn(), src.rm()));
-      break;
-    }
    case kArmSimd128And: {
      __ vand(i.OutputSimd128Register(), i.InputSimd128Register(0),
              i.InputSimd128Register(1));
--- a/src/compiler/arm/instruction-codes-arm.h
+++ b/src/compiler/arm/instruction-codes-arm.h
@ -107,6 +107,8 @@ namespace compiler {
  V(ArmVld1F64)                    \
  V(ArmVstrF64)                    \
  V(ArmVst1F64)                    \
+  V(ArmVld1S128)                   \
+  V(ArmVst1S128)                   \
  V(ArmFloat32Max)                 \
  V(ArmFloat64Max)                 \
  V(ArmFloat32Min)                 \
@ -213,8 +215,6 @@ namespace compiler {
  V(ArmUint8x16LessThan)           \
  V(ArmUint8x16LessThanOrEqual)    \
  V(ArmSimd128Zero)                \
-  V(ArmSimd128Load)                \
-  V(ArmSimd128Store)               \
  V(ArmSimd128And)                 \
  V(ArmSimd128Or)                  \
  V(ArmSimd128Xor)                 \
--- a/src/compiler/arm/instruction-scheduler-arm.cc
+++ b/src/compiler/arm/instruction-scheduler-arm.cc
@ -199,8 +199,6 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kArmUint8x16LessThan:
    case kArmUint8x16LessThanOrEqual:
    case kArmSimd128Zero:
-    case kArmSimd128Load:
-    case kArmSimd128Store:
    case kArmSimd128And:
    case kArmSimd128Or:
    case kArmSimd128Xor:
@ -217,6 +215,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kArmVldrF32:
    case kArmVldrF64:
    case kArmVld1F64:
+    case kArmVld1S128:
    case kArmLdrb:
    case kArmLdrsb:
    case kArmLdrh:
@ -227,6 +226,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kArmVstrF32:
    case kArmVstrF64:
    case kArmVst1F64:
+    case kArmVst1S128:
    case kArmStrb:
    case kArmStrh:
    case kArmStr:
--- a/src/compiler/arm/instruction-selector-arm.cc
+++ b/src/compiler/arm/instruction-selector-arm.cc
@ -427,7 +427,7 @@ void InstructionSelector::VisitLoad(Node* node) {
      opcode = kArmLdr;
      break;
    case MachineRepresentation::kSimd128:
-      opcode = kArmSimd128Load;
+      opcode = kArmVld1S128;
      break;
    case MachineRepresentation::kWord64:   // Fall through.
    case MachineRepresentation::kSimd1x4:  // Fall through.
@ -517,7 +517,7 @@ void InstructionSelector::VisitStore(Node* node) {
        opcode = kArmStr;
        break;
      case MachineRepresentation::kSimd128:
-        opcode = kArmSimd128Store;
+        opcode = kArmVst1S128;
        break;
      case MachineRepresentation::kWord64:   // Fall through.
      case MachineRepresentation::kSimd1x4:  // Fall through.
@ -542,8 +542,8 @@ void InstructionSelector::VisitProtectedStore(Node* node) {
 }

 void InstructionSelector::VisitUnalignedLoad(Node* node) {
-  UnalignedLoadRepresentation load_rep =
-      UnalignedLoadRepresentationOf(node->op());
+  MachineRepresentation load_rep =
+      UnalignedLoadRepresentationOf(node->op()).representation();
  ArmOperandGenerator g(this);
  Node* base = node->InputAt(0);
  Node* index = node->InputAt(1);
@ -551,16 +551,18 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
  InstructionCode opcode = kArmLdr;
  // Only floating point loads need to be specially handled; integer loads
  // support unaligned access. We support unaligned FP loads by loading to
-  // integer registers first, then moving to the destination FP register.
-  switch (load_rep.representation()) {
+  // integer registers first, then moving to the destination FP register. If
+  // NEON is supported, we use the vld1.8 instruction.
+  switch (load_rep) {
    case MachineRepresentation::kFloat32: {
      InstructionOperand temp = g.TempRegister();
      EmitLoad(this, opcode, &temp, base, index);
      Emit(kArmVmovF32U32, g.DefineAsRegister(node), temp);
      return;
    }
-    case MachineRepresentation::kFloat64: {
-      // Compute the address of the least-significant half of the FP value.
+    case MachineRepresentation::kFloat64:
+    case MachineRepresentation::kSimd128: {
+      // Compute the address of the least-significant byte of the FP value.
      // We assume that the base node is unlikely to be an encodable immediate
      // or the result of a shift operation, so only consider the addressing
      // mode that should be used for the index node.
@ -585,8 +587,12 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {

      if (CpuFeatures::IsSupported(NEON)) {
        // With NEON we can load directly from the calculated address.
-        Emit(kArmVld1F64, g.DefineAsRegister(node), addr);
+        ArchOpcode op = load_rep == MachineRepresentation::kFloat64
+                            ? kArmVld1F64
+                            : kArmVld1S128;
+        Emit(op, g.DefineAsRegister(node), addr);
      } else {
+        DCHECK_NE(MachineRepresentation::kSimd128, load_rep);
        // Load both halves and move to an FP register.
        InstructionOperand fp_lo = g.TempRegister();
        InstructionOperand fp_hi = g.TempRegister();
@ -619,6 +625,7 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {
  // Only floating point stores need to be specially handled; integer stores
  // support unaligned access. We support unaligned FP stores by moving the
  // value to integer registers first, then storing to the destination address.
+  // If NEON is supported, we use the vst1.8 instruction.
  switch (store_rep) {
    case MachineRepresentation::kFloat32: {
      inputs[input_count++] = g.TempRegister();
@ -627,7 +634,8 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {
      EmitStore(this, kArmStr, input_count, inputs, index);
      return;
    }
-    case MachineRepresentation::kFloat64: {
+    case MachineRepresentation::kFloat64:
+    case MachineRepresentation::kSimd128: {
      if (CpuFeatures::IsSupported(NEON)) {
        InstructionOperand address = g.TempRegister();
        {
@ -653,8 +661,12 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {

        inputs[input_count++] = g.UseRegister(value);
        inputs[input_count++] = address;
-        Emit(kArmVst1F64, 0, nullptr, input_count, inputs);
+        ArchOpcode op = store_rep == MachineRepresentation::kFloat64
+                            ? kArmVst1F64
+                            : kArmVst1S128;
+        Emit(op, 0, nullptr, input_count, inputs);
      } else {
+        DCHECK_NE(MachineRepresentation::kSimd128, store_rep);
        // Store a 64-bit floating point value using two 32-bit integer stores.
        // Computing the store address here would require three live temporary
        // registers (fp<63:32>, fp<31:0>, address), so compute base + 4 after