[arm][turbofan] Use NEON for unaligned float64 memory accesses

When NEON is available, we use the vld1.8 and vst1.8 instructions to
implement unaligned loads and stores of float64 values. Without NEON, the
existing sequence of two 32-bit integer accesses plus a vmov is kept.

R=bmeurer@chromium.org, v8-arm-ports@googlegroups.com

Review-Url: https://codereview.chromium.org/2769723003
Cr-Commit-Position: refs/heads/master@{#44063}
This commit is contained in:
ahaas 2017-03-23 07:25:35 -07:00 committed by Commit bot
parent 118f09f121
commit ae8bc6ed2a
5 changed files with 76 additions and 32 deletions

View File

@ -468,7 +468,6 @@ NeonMemOperand::NeonMemOperand(Register rn, Register rm, int align) {
SetAlignment(align);
}
void NeonMemOperand::SetAlignment(int align) {
switch (align) {
case 0:

View File

@ -1443,6 +1443,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vstr(i.InputFloatRegister(0), i.InputOffset(1));
DCHECK_EQ(LeaveCC, i.OutputSBit());
break;
case kArmVld1F64: {
__ vld1(NeonSize::Neon8, NeonListOperand(i.OutputDoubleRegister()),
NeonMemOperand(i.InputRegister(0)));
break;
}
case kArmVst1F64: {
__ vst1(Neon8, NeonListOperand(i.InputDoubleRegister(0)),
NeonMemOperand(i.InputRegister(1)));
break;
}
case kArmVldrF64:
__ vldr(i.OutputDoubleRegister(), i.InputOffset());
DCHECK_EQ(LeaveCC, i.OutputSBit());

View File

@ -104,7 +104,9 @@ namespace compiler {
V(ArmVldrF32) \
V(ArmVstrF32) \
V(ArmVldrF64) \
V(ArmVld1F64) \
V(ArmVstrF64) \
V(ArmVst1F64) \
V(ArmFloat32Max) \
V(ArmFloat64Max) \
V(ArmFloat32Min) \

View File

@ -216,6 +216,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmVldrF32:
case kArmVldrF64:
case kArmVld1F64:
case kArmLdrb:
case kArmLdrsb:
case kArmLdrh:
@ -225,6 +226,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmVstrF32:
case kArmVstrF64:
case kArmVst1F64:
case kArmStrb:
case kArmStrh:
case kArmStr:

View File

@ -560,7 +560,6 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
return;
}
case MachineRepresentation::kFloat64: {
// TODO(arm): use vld1.8 for this when NEON is available.
// Compute the address of the least-significant half of the FP value.
// We assume that the base node is unlikely to be an encodable immediate
// or the result of a shift operation, so only consider the addressing
@ -572,8 +571,8 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
size_t input_count;
if (TryMatchImmediateOrShift(this, &add_opcode, index, &input_count,
&inputs[1])) {
// input_count has been set by TryMatchImmediateOrShift(), so increment
// it to account for the base register in inputs[0].
// input_count has been set by TryMatchImmediateOrShift(), so
// increment it to account for the base register in inputs[0].
input_count++;
} else {
add_opcode |= AddressingModeField::encode(kMode_Operand2_R);
@ -584,13 +583,18 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
InstructionOperand addr = g.TempRegister();
Emit(add_opcode, 1, &addr, input_count, inputs);
// Load both halves and move to an FP register.
InstructionOperand fp_lo = g.TempRegister();
InstructionOperand fp_hi = g.TempRegister();
opcode |= AddressingModeField::encode(kMode_Offset_RI);
Emit(opcode, fp_lo, addr, g.TempImmediate(0));
Emit(opcode, fp_hi, addr, g.TempImmediate(4));
Emit(kArmVmovF64U32U32, g.DefineAsRegister(node), fp_lo, fp_hi);
if (CpuFeatures::IsSupported(NEON)) {
// With NEON we can load directly from the calculated address.
Emit(kArmVld1F64, g.DefineAsRegister(node), addr);
} else {
// Load both halves and move to an FP register.
InstructionOperand fp_lo = g.TempRegister();
InstructionOperand fp_hi = g.TempRegister();
opcode |= AddressingModeField::encode(kMode_Offset_RI);
Emit(opcode, fp_lo, addr, g.TempImmediate(0));
Emit(opcode, fp_hi, addr, g.TempImmediate(4));
Emit(kArmVmovF64U32U32, g.DefineAsRegister(node), fp_lo, fp_hi);
}
return;
}
default:
@ -624,30 +628,57 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {
return;
}
case MachineRepresentation::kFloat64: {
// TODO(arm): use vst1.8 for this when NEON is available.
// Store a 64-bit floating point value using two 32-bit integer stores.
// Computing the store address here would require three live temporary
// registers (fp<63:32>, fp<31:0>, address), so compute base + 4 after
// storing the least-significant half of the value.
if (CpuFeatures::IsSupported(NEON)) {
InstructionOperand address = g.TempRegister();
{
// First we have to calculate the actual address.
InstructionCode add_opcode = kArmAdd;
InstructionOperand inputs[3];
inputs[0] = g.UseRegister(base);
// First, move the 64-bit FP value into two temporary integer registers.
InstructionOperand fp[] = {g.TempRegister(), g.TempRegister()};
inputs[input_count++] = g.UseRegister(value);
Emit(kArmVmovU32U32F64, arraysize(fp), fp, input_count,
inputs);
size_t input_count;
if (TryMatchImmediateOrShift(this, &add_opcode, index, &input_count,
&inputs[1])) {
// input_count has been set by TryMatchImmediateOrShift(), so
// increment it to account for the base register in inputs[0].
input_count++;
} else {
add_opcode |= AddressingModeField::encode(kMode_Operand2_R);
inputs[1] = g.UseRegister(index);
input_count = 2; // Base register and index.
}
// Store the least-significant half.
inputs[0] = fp[0]; // Low 32-bits of FP value.
inputs[input_count++] = g.UseRegister(base); // First store base address.
EmitStore(this, kArmStr, input_count, inputs, index);
Emit(add_opcode, 1, &address, input_count, inputs);
}
// Store the most-significant half.
InstructionOperand base4 = g.TempRegister();
Emit(kArmAdd | AddressingModeField::encode(kMode_Operand2_I), base4,
g.UseRegister(base), g.TempImmediate(4)); // Compute base + 4.
inputs[0] = fp[1]; // High 32-bits of FP value.
inputs[1] = base4; // Second store base + 4 address.
EmitStore(this, kArmStr, input_count, inputs, index);
inputs[input_count++] = g.UseRegister(value);
inputs[input_count++] = address;
Emit(kArmVst1F64, 0, nullptr, input_count, inputs);
} else {
// Store a 64-bit floating point value using two 32-bit integer stores.
// Computing the store address here would require three live temporary
// registers (fp<63:32>, fp<31:0>, address), so compute base + 4 after
// storing the least-significant half of the value.
// First, move the 64-bit FP value into two temporary integer registers.
InstructionOperand fp[] = {g.TempRegister(), g.TempRegister()};
inputs[input_count++] = g.UseRegister(value);
Emit(kArmVmovU32U32F64, arraysize(fp), fp, input_count, inputs);
// Store the least-significant half.
inputs[0] = fp[0]; // Low 32-bits of FP value.
inputs[input_count++] =
g.UseRegister(base); // First store base address.
EmitStore(this, kArmStr, input_count, inputs, index);
// Store the most-significant half.
InstructionOperand base4 = g.TempRegister();
Emit(kArmAdd | AddressingModeField::encode(kMode_Operand2_I), base4,
g.UseRegister(base), g.TempImmediate(4)); // Compute base + 4.
inputs[0] = fp[1]; // High 32-bits of FP value.
inputs[1] = base4; // Second store base + 4 address.
EmitStore(this, kArmStr, input_count, inputs, index);
}
return;
}
default: