diff --git a/src/compiler/arm/code-generator-arm.cc b/src/compiler/arm/code-generator-arm.cc
index 6a41809cd0..72493df3da 100644
--- a/src/compiler/arm/code-generator-arm.cc
+++ b/src/compiler/arm/code-generator-arm.cc
@@ -1444,7 +1444,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       DCHECK_EQ(LeaveCC, i.OutputSBit());
       break;
     case kArmVld1F64: {
-      __ vld1(NeonSize::Neon8, NeonListOperand(i.OutputDoubleRegister()),
+      __ vld1(Neon8, NeonListOperand(i.OutputDoubleRegister()),
              NeonMemOperand(i.InputRegister(0)));
       break;
     }
@@ -1453,6 +1453,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
              NeonMemOperand(i.InputRegister(1)));
       break;
     }
+    case kArmVld1S128: {
+      __ vld1(Neon8, NeonListOperand(i.OutputSimd128Register()),
+              NeonMemOperand(i.InputRegister(0)));
+      break;
+    }
+    case kArmVst1S128: {
+      __ vst1(Neon8, NeonListOperand(i.InputSimd128Register(0)),
+              NeonMemOperand(i.InputRegister(1)));
+      break;
+    }
     case kArmVldrF64:
       __ vldr(i.OutputDoubleRegister(), i.InputOffset());
       DCHECK_EQ(LeaveCC, i.OutputSBit());
@@ -1992,18 +2002,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
              i.OutputSimd128Register());
       break;
     }
-    case kArmSimd128Load: {
-      MemOperand src = i.InputOffset();
-      __ vld1(Neon8, NeonListOperand(i.OutputSimd128Register()),
-              NeonMemOperand(src.rn(), src.rm()));
-      break;
-    }
-    case kArmSimd128Store: {
-      MemOperand src = i.InputOffset(1);
-      __ vst1(Neon8, NeonListOperand(i.InputSimd128Register(0)),
-              NeonMemOperand(src.rn(), src.rm()));
-      break;
-    }
     case kArmSimd128And: {
       __ vand(i.OutputSimd128Register(), i.InputSimd128Register(0),
              i.InputSimd128Register(1));
diff --git a/src/compiler/arm/instruction-codes-arm.h b/src/compiler/arm/instruction-codes-arm.h
index a44206b402..5a05c4bfa0 100644
--- a/src/compiler/arm/instruction-codes-arm.h
+++ b/src/compiler/arm/instruction-codes-arm.h
@@ -107,6 +107,8 @@ namespace compiler {
   V(ArmVld1F64) \
   V(ArmVstrF64) \
   V(ArmVst1F64) \
+  V(ArmVld1S128) \
+  V(ArmVst1S128) \
   V(ArmFloat32Max) \
   V(ArmFloat64Max) \
   V(ArmFloat32Min) \
@@ -213,8 +215,6 @@ namespace compiler {
   V(ArmUint8x16LessThan) \
   V(ArmUint8x16LessThanOrEqual) \
   V(ArmSimd128Zero) \
-  V(ArmSimd128Load) \
-  V(ArmSimd128Store) \
   V(ArmSimd128And) \
   V(ArmSimd128Or) \
   V(ArmSimd128Xor) \
diff --git a/src/compiler/arm/instruction-scheduler-arm.cc b/src/compiler/arm/instruction-scheduler-arm.cc
index 2eaca956c4..00f0f759c6 100644
--- a/src/compiler/arm/instruction-scheduler-arm.cc
+++ b/src/compiler/arm/instruction-scheduler-arm.cc
@@ -199,8 +199,6 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kArmUint8x16LessThan:
     case kArmUint8x16LessThanOrEqual:
     case kArmSimd128Zero:
-    case kArmSimd128Load:
-    case kArmSimd128Store:
     case kArmSimd128And:
     case kArmSimd128Or:
     case kArmSimd128Xor:
@@ -217,6 +215,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kArmVldrF32:
     case kArmVldrF64:
     case kArmVld1F64:
+    case kArmVld1S128:
     case kArmLdrb:
     case kArmLdrsb:
     case kArmLdrh:
@@ -227,6 +226,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kArmVstrF32:
     case kArmVstrF64:
     case kArmVst1F64:
+    case kArmVst1S128:
     case kArmStrb:
     case kArmStrh:
     case kArmStr:
diff --git a/src/compiler/arm/instruction-selector-arm.cc b/src/compiler/arm/instruction-selector-arm.cc
index 6f3957183f..c29afb87ca 100644
--- a/src/compiler/arm/instruction-selector-arm.cc
+++ b/src/compiler/arm/instruction-selector-arm.cc
@@ -427,7 +427,7 @@ void InstructionSelector::VisitLoad(Node* node) {
       opcode = kArmLdr;
       break;
     case MachineRepresentation::kSimd128:
-      opcode = kArmSimd128Load;
+      opcode = kArmVld1S128;
       break;
     case MachineRepresentation::kWord64:   // Fall through.
     case MachineRepresentation::kSimd1x4:  // Fall through.
@@ -517,7 +517,7 @@ void InstructionSelector::VisitStore(Node* node) {
       opcode = kArmStr;
       break;
     case MachineRepresentation::kSimd128:
-      opcode = kArmSimd128Store;
+      opcode = kArmVst1S128;
       break;
     case MachineRepresentation::kWord64:   // Fall through.
     case MachineRepresentation::kSimd1x4:  // Fall through.
@@ -542,8 +542,8 @@ void InstructionSelector::VisitProtectedStore(Node* node) {
 }
 
 void InstructionSelector::VisitUnalignedLoad(Node* node) {
-  UnalignedLoadRepresentation load_rep =
-      UnalignedLoadRepresentationOf(node->op());
+  MachineRepresentation load_rep =
+      UnalignedLoadRepresentationOf(node->op()).representation();
   ArmOperandGenerator g(this);
   Node* base = node->InputAt(0);
   Node* index = node->InputAt(1);
@@ -551,16 +551,18 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
   InstructionCode opcode = kArmLdr;
   // Only floating point loads need to be specially handled; integer loads
   // support unaligned access. We support unaligned FP loads by loading to
-  // integer registers first, then moving to the destination FP register.
-  switch (load_rep.representation()) {
+  // integer registers first, then moving to the destination FP register. If
+  // NEON is supported, we use the vld1.8 instruction.
+  switch (load_rep) {
     case MachineRepresentation::kFloat32: {
       InstructionOperand temp = g.TempRegister();
       EmitLoad(this, opcode, &temp, base, index);
       Emit(kArmVmovF32U32, g.DefineAsRegister(node), temp);
       return;
     }
-    case MachineRepresentation::kFloat64: {
-      // Compute the address of the least-significant half of the FP value.
+    case MachineRepresentation::kFloat64:
+    case MachineRepresentation::kSimd128: {
+      // Compute the address of the least-significant byte of the FP value.
       // We assume that the base node is unlikely to be an encodable immediate
       // or the result of a shift operation, so only consider the addressing
       // mode that should be used for the index node.
@@ -585,8 +587,12 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
 
       if (CpuFeatures::IsSupported(NEON)) {
         // With NEON we can load directly from the calculated address.
-        Emit(kArmVld1F64, g.DefineAsRegister(node), addr);
+        ArchOpcode op = load_rep == MachineRepresentation::kFloat64
+                            ? kArmVld1F64
+                            : kArmVld1S128;
+        Emit(op, g.DefineAsRegister(node), addr);
       } else {
+        DCHECK_NE(MachineRepresentation::kSimd128, load_rep);
         // Load both halves and move to an FP register.
         InstructionOperand fp_lo = g.TempRegister();
         InstructionOperand fp_hi = g.TempRegister();
@@ -619,6 +625,7 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {
   // Only floating point stores need to be specially handled; integer stores
   // support unaligned access. We support unaligned FP stores by moving the
   // value to integer registers first, then storing to the destination address.
+  // If NEON is supported, we use the vst1.8 instruction.
   switch (store_rep) {
     case MachineRepresentation::kFloat32: {
       inputs[input_count++] = g.TempRegister();
@@ -627,7 +634,8 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {
       EmitStore(this, kArmStr, input_count, inputs, index);
       return;
     }
-    case MachineRepresentation::kFloat64: {
+    case MachineRepresentation::kFloat64:
+    case MachineRepresentation::kSimd128: {
       if (CpuFeatures::IsSupported(NEON)) {
         InstructionOperand address = g.TempRegister();
         {
@@ -653,8 +661,12 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {
 
         inputs[input_count++] = g.UseRegister(value);
         inputs[input_count++] = address;
-        Emit(kArmVst1F64, 0, nullptr, input_count, inputs);
+        ArchOpcode op = store_rep == MachineRepresentation::kFloat64
+                            ? kArmVst1F64
+                            : kArmVst1S128;
+        Emit(op, 0, nullptr, input_count, inputs);
       } else {
+        DCHECK_NE(MachineRepresentation::kSimd128, store_rep);
         // Store a 64-bit floating point value using two 32-bit integer stores.
         // Computing the store address here would require three live temporary
         // registers (fp<63:32>, fp<31:0>, address), so compute base + 4 after