[ARM] Make SIMD 128-bit load/store more like the existing load/store.
- Renames kArmSimd128Load and kArmSimd128Store to kArmVld1S128 and kArmVst1S128.
- Handles the unaligned load/store cases.

LOG=N
BUG=v8:6020

Review-Url: https://codereview.chromium.org/2769083003
Cr-Commit-Position: refs/heads/master@{#44117}
This commit is contained in:
parent
14e01da1cf
commit
6839e7ac08
@ -1444,7 +1444,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
DCHECK_EQ(LeaveCC, i.OutputSBit());
|
||||
break;
|
||||
case kArmVld1F64: {
|
||||
__ vld1(NeonSize::Neon8, NeonListOperand(i.OutputDoubleRegister()),
|
||||
__ vld1(Neon8, NeonListOperand(i.OutputDoubleRegister()),
|
||||
NeonMemOperand(i.InputRegister(0)));
|
||||
break;
|
||||
}
|
||||
@ -1453,6 +1453,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
NeonMemOperand(i.InputRegister(1)));
|
||||
break;
|
||||
}
|
||||
case kArmVld1S128: {
|
||||
__ vld1(Neon8, NeonListOperand(i.OutputSimd128Register()),
|
||||
NeonMemOperand(i.InputRegister(0)));
|
||||
break;
|
||||
}
|
||||
case kArmVst1S128: {
|
||||
__ vst1(Neon8, NeonListOperand(i.InputSimd128Register(0)),
|
||||
NeonMemOperand(i.InputRegister(1)));
|
||||
break;
|
||||
}
|
||||
case kArmVldrF64:
|
||||
__ vldr(i.OutputDoubleRegister(), i.InputOffset());
|
||||
DCHECK_EQ(LeaveCC, i.OutputSBit());
|
||||
@ -1992,18 +2002,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
i.OutputSimd128Register());
|
||||
break;
|
||||
}
|
||||
case kArmSimd128Load: {
|
||||
MemOperand src = i.InputOffset();
|
||||
__ vld1(Neon8, NeonListOperand(i.OutputSimd128Register()),
|
||||
NeonMemOperand(src.rn(), src.rm()));
|
||||
break;
|
||||
}
|
||||
case kArmSimd128Store: {
|
||||
MemOperand src = i.InputOffset(1);
|
||||
__ vst1(Neon8, NeonListOperand(i.InputSimd128Register(0)),
|
||||
NeonMemOperand(src.rn(), src.rm()));
|
||||
break;
|
||||
}
|
||||
case kArmSimd128And: {
|
||||
__ vand(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.InputSimd128Register(1));
|
||||
|
@ -107,6 +107,8 @@ namespace compiler {
|
||||
V(ArmVld1F64) \
|
||||
V(ArmVstrF64) \
|
||||
V(ArmVst1F64) \
|
||||
V(ArmVld1S128) \
|
||||
V(ArmVst1S128) \
|
||||
V(ArmFloat32Max) \
|
||||
V(ArmFloat64Max) \
|
||||
V(ArmFloat32Min) \
|
||||
@ -213,8 +215,6 @@ namespace compiler {
|
||||
V(ArmUint8x16LessThan) \
|
||||
V(ArmUint8x16LessThanOrEqual) \
|
||||
V(ArmSimd128Zero) \
|
||||
V(ArmSimd128Load) \
|
||||
V(ArmSimd128Store) \
|
||||
V(ArmSimd128And) \
|
||||
V(ArmSimd128Or) \
|
||||
V(ArmSimd128Xor) \
|
||||
|
@ -199,8 +199,6 @@ int InstructionScheduler::GetTargetInstructionFlags(
|
||||
case kArmUint8x16LessThan:
|
||||
case kArmUint8x16LessThanOrEqual:
|
||||
case kArmSimd128Zero:
|
||||
case kArmSimd128Load:
|
||||
case kArmSimd128Store:
|
||||
case kArmSimd128And:
|
||||
case kArmSimd128Or:
|
||||
case kArmSimd128Xor:
|
||||
@ -217,6 +215,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
|
||||
case kArmVldrF32:
|
||||
case kArmVldrF64:
|
||||
case kArmVld1F64:
|
||||
case kArmVld1S128:
|
||||
case kArmLdrb:
|
||||
case kArmLdrsb:
|
||||
case kArmLdrh:
|
||||
@ -227,6 +226,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
|
||||
case kArmVstrF32:
|
||||
case kArmVstrF64:
|
||||
case kArmVst1F64:
|
||||
case kArmVst1S128:
|
||||
case kArmStrb:
|
||||
case kArmStrh:
|
||||
case kArmStr:
|
||||
|
@ -427,7 +427,7 @@ void InstructionSelector::VisitLoad(Node* node) {
|
||||
opcode = kArmLdr;
|
||||
break;
|
||||
case MachineRepresentation::kSimd128:
|
||||
opcode = kArmSimd128Load;
|
||||
opcode = kArmVld1S128;
|
||||
break;
|
||||
case MachineRepresentation::kWord64: // Fall through.
|
||||
case MachineRepresentation::kSimd1x4: // Fall through.
|
||||
@ -517,7 +517,7 @@ void InstructionSelector::VisitStore(Node* node) {
|
||||
opcode = kArmStr;
|
||||
break;
|
||||
case MachineRepresentation::kSimd128:
|
||||
opcode = kArmSimd128Store;
|
||||
opcode = kArmVst1S128;
|
||||
break;
|
||||
case MachineRepresentation::kWord64: // Fall through.
|
||||
case MachineRepresentation::kSimd1x4: // Fall through.
|
||||
@ -542,8 +542,8 @@ void InstructionSelector::VisitProtectedStore(Node* node) {
|
||||
}
|
||||
|
||||
void InstructionSelector::VisitUnalignedLoad(Node* node) {
|
||||
UnalignedLoadRepresentation load_rep =
|
||||
UnalignedLoadRepresentationOf(node->op());
|
||||
MachineRepresentation load_rep =
|
||||
UnalignedLoadRepresentationOf(node->op()).representation();
|
||||
ArmOperandGenerator g(this);
|
||||
Node* base = node->InputAt(0);
|
||||
Node* index = node->InputAt(1);
|
||||
@ -551,16 +551,18 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
|
||||
InstructionCode opcode = kArmLdr;
|
||||
// Only floating point loads need to be specially handled; integer loads
|
||||
// support unaligned access. We support unaligned FP loads by loading to
|
||||
// integer registers first, then moving to the destination FP register.
|
||||
switch (load_rep.representation()) {
|
||||
// integer registers first, then moving to the destination FP register. If
|
||||
// NEON is supported, we use the vld1.8 instruction.
|
||||
switch (load_rep) {
|
||||
case MachineRepresentation::kFloat32: {
|
||||
InstructionOperand temp = g.TempRegister();
|
||||
EmitLoad(this, opcode, &temp, base, index);
|
||||
Emit(kArmVmovF32U32, g.DefineAsRegister(node), temp);
|
||||
return;
|
||||
}
|
||||
case MachineRepresentation::kFloat64: {
|
||||
// Compute the address of the least-significant half of the FP value.
|
||||
case MachineRepresentation::kFloat64:
|
||||
case MachineRepresentation::kSimd128: {
|
||||
// Compute the address of the least-significant byte of the FP value.
|
||||
// We assume that the base node is unlikely to be an encodable immediate
|
||||
// or the result of a shift operation, so only consider the addressing
|
||||
// mode that should be used for the index node.
|
||||
@ -585,8 +587,12 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
|
||||
|
||||
if (CpuFeatures::IsSupported(NEON)) {
|
||||
// With NEON we can load directly from the calculated address.
|
||||
Emit(kArmVld1F64, g.DefineAsRegister(node), addr);
|
||||
ArchOpcode op = load_rep == MachineRepresentation::kFloat64
|
||||
? kArmVld1F64
|
||||
: kArmVld1S128;
|
||||
Emit(op, g.DefineAsRegister(node), addr);
|
||||
} else {
|
||||
DCHECK_NE(MachineRepresentation::kSimd128, load_rep);
|
||||
// Load both halves and move to an FP register.
|
||||
InstructionOperand fp_lo = g.TempRegister();
|
||||
InstructionOperand fp_hi = g.TempRegister();
|
||||
@ -619,6 +625,7 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {
|
||||
// Only floating point stores need to be specially handled; integer stores
|
||||
// support unaligned access. We support unaligned FP stores by moving the
|
||||
// value to integer registers first, then storing to the destination address.
|
||||
// If NEON is supported, we use the vst1.8 instruction.
|
||||
switch (store_rep) {
|
||||
case MachineRepresentation::kFloat32: {
|
||||
inputs[input_count++] = g.TempRegister();
|
||||
@ -627,7 +634,8 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {
|
||||
EmitStore(this, kArmStr, input_count, inputs, index);
|
||||
return;
|
||||
}
|
||||
case MachineRepresentation::kFloat64: {
|
||||
case MachineRepresentation::kFloat64:
|
||||
case MachineRepresentation::kSimd128: {
|
||||
if (CpuFeatures::IsSupported(NEON)) {
|
||||
InstructionOperand address = g.TempRegister();
|
||||
{
|
||||
@ -653,8 +661,12 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {
|
||||
|
||||
inputs[input_count++] = g.UseRegister(value);
|
||||
inputs[input_count++] = address;
|
||||
Emit(kArmVst1F64, 0, nullptr, input_count, inputs);
|
||||
ArchOpcode op = store_rep == MachineRepresentation::kFloat64
|
||||
? kArmVst1F64
|
||||
: kArmVst1S128;
|
||||
Emit(op, 0, nullptr, input_count, inputs);
|
||||
} else {
|
||||
DCHECK_NE(MachineRepresentation::kSimd128, store_rep);
|
||||
// Store a 64-bit floating point value using two 32-bit integer stores.
|
||||
// Computing the store address here would require three live temporary
|
||||
// registers (fp<63:32>, fp<31:0>, address), so compute base + 4 after
|
||||
|
Loading…
Reference in New Issue
Block a user