[arm64][turbofan] FP simplification

FNMUL is an efficient arm64 instruction that can save one cycle by
optimizing FNEG(FMUL(x, y)) to FNMUL(x, y) and
FMUL(FNEG(x), y) to FNMUL(x, y).
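
For illustration only (not part of the commit message), the rewrite at
the instruction level, with hypothetical register assignments:

    // Before: the multiply and the negation are selected separately.
    fmul  d0, d1, d2    // d0 = d1 * d2
    fneg  d0, d0        // d0 = -d0
    // After: one fused instruction computes the negated product.
    fnmul d0, d1, d2    // d0 = -(d1 * d2)

For non-NaN operands under round-to-nearest (the only rounding mode
JavaScript uses), -(x * y) and (-x) * y round to the same bits, so the
fusion preserves observable results; NaN sign/payload differences are
not observable in JavaScript semantics.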

Change-Id: If25d9de1253098b17033a9d8736ff6a1c06601f3
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1572681
Commit-Queue: Georg Neis <neis@chromium.org>
Reviewed-by: Martyn Capewell <martyn.capewell@arm.com>
Reviewed-by: Georg Neis <neis@chromium.org>
Cr-Commit-Position: refs/heads/master@{#61230}
Balaram Makam 2019-04-25 17:35:41 -05:00 committed by Commit Bot
parent 7d17fd465d
commit 529ed9e992
8 changed files with 232 additions and 4 deletions


@@ -25,6 +25,7 @@ Home Jinni Inc. <*@homejinni.com>
 IBM Inc. <*@*.ibm.com>
 IBM Inc. <*@ibm.com>
 Samsung <*@*.samsung.com>
+Samsung <*@samsung.com>
 Joyent, Inc <*@joyent.com>
 RT-RK Computer Based System <*@rt-rk.com>
 Amazon, Inc <*@amazon.com>


@@ -388,6 +388,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   V(fmla, Fmla)       \
   V(fmls, Fmls)       \
   V(fmulx, Fmulx)     \
+  V(fnmul, Fnmul)     \
   V(frecps, Frecps)   \
   V(frsqrts, Frsqrts) \
   V(mla, Mla)         \
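
Context, stated as an assumption rather than something shown in the
diff: each V(asm, Masm) entry in this list is expanded by a macro into
a TurboAssembler wrapper around the raw assembler instruction, so the
single new fnmul entry is all that is needed to expose the macro. A
plausible shape of the generated wrapper (names and the DCHECK are
illustrative; the exact expansion may differ):

    // Hypothetical expansion of V(fnmul, Fnmul).
    void Fnmul(const VRegister& fd, const VRegister& fn,
               const VRegister& fm) {
      DCHECK(allow_macro_instructions());  // TurboAssembler guard
      fnmul(fd, fn, fm);                   // raw assembler emit
    }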


@@ -1340,6 +1340,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     case kArm64Float32Sqrt:
       __ Fsqrt(i.OutputFloat32Register(), i.InputFloat32Register(0));
       break;
+    case kArm64Float32Fnmul: {
+      __ Fnmul(i.OutputFloat32Register(), i.InputFloat32Register(0),
+               i.InputFloat32Register(1));
+      break;
+    }
     case kArm64Float64Cmp:
       if (instr->InputAt(1)->IsFPRegister()) {
         __ Fcmp(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
@@ -1405,6 +1410,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     case kArm64Float64Sqrt:
       __ Fsqrt(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
       break;
+    case kArm64Float64Fnmul:
+      __ Fnmul(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
+               i.InputDoubleRegister(1));
+      break;
     case kArm64Float32ToFloat64:
       __ Fcvt(i.OutputDoubleRegister(), i.InputDoubleRegister(0).S());
       break;


@@ -95,6 +95,7 @@ namespace compiler {
   V(Arm64Float32Abs)       \
   V(Arm64Float32Neg)       \
   V(Arm64Float32Sqrt)      \
+  V(Arm64Float32Fnmul)     \
   V(Arm64Float32RoundDown) \
   V(Arm64Float32Max)       \
   V(Arm64Float32Min)       \
@@ -109,6 +110,7 @@ namespace compiler {
   V(Arm64Float64Abs)       \
   V(Arm64Float64Neg)       \
   V(Arm64Float64Sqrt)      \
+  V(Arm64Float64Fnmul)     \
   V(Arm64Float64RoundDown) \
   V(Arm64Float32RoundUp)   \
   V(Arm64Float64RoundUp)   \


@@ -88,6 +88,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kArm64Float32Abs:
     case kArm64Float32Neg:
     case kArm64Float32Sqrt:
+    case kArm64Float32Fnmul:
     case kArm64Float32RoundDown:
     case kArm64Float32Max:
     case kArm64Float32Min:
@@ -101,6 +102,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kArm64Float64Abs:
     case kArm64Float64Neg:
     case kArm64Float64Sqrt:
+    case kArm64Float64Fnmul:
     case kArm64Float64RoundDown:
     case kArm64Float64RoundTiesAway:
     case kArm64Float64RoundTruncate:


@@ -1238,8 +1238,6 @@ void InstructionSelector::VisitWord64Ror(Node* node) {
   V(Float64RoundTiesAway, kArm64Float64RoundTiesAway)         \
   V(Float32RoundTiesEven, kArm64Float32RoundTiesEven)         \
   V(Float64RoundTiesEven, kArm64Float64RoundTiesEven)         \
-  V(Float32Neg, kArm64Float32Neg)                             \
-  V(Float64Neg, kArm64Float64Neg)                             \
   V(Float64ExtractLowWord32, kArm64Float64ExtractLowWord32)   \
   V(Float64ExtractHighWord32, kArm64Float64ExtractHighWord32) \
   V(Float64SilenceNaN, kArm64Float64SilenceNaN)
@@ -1257,8 +1255,6 @@ void InstructionSelector::VisitWord64Ror(Node* node) {
   V(Float64Add, kArm64Float64Add) \
   V(Float32Sub, kArm64Float32Sub) \
   V(Float64Sub, kArm64Float64Sub) \
-  V(Float32Mul, kArm64Float32Mul) \
-  V(Float64Mul, kArm64Float64Mul) \
   V(Float32Div, kArm64Float32Div) \
   V(Float64Div, kArm64Float64Div) \
   V(Float32Max, kArm64Float32Max) \
@@ -2654,6 +2650,38 @@ void InstructionSelector::VisitUint64LessThanOrEqual(Node* node) {
   VisitWordCompare(this, node, kArm64Cmp, &cont, false, kArithmeticImm);
 }
 
+void InstructionSelector::VisitFloat32Neg(Node* node) {
+  Arm64OperandGenerator g(this);
+  Node* in = node->InputAt(0);
+  if (in->opcode() == IrOpcode::kFloat32Mul && CanCover(node, in)) {
+    Float32BinopMatcher m(in);
+    Emit(kArm64Float32Fnmul, g.DefineAsRegister(node),
+         g.UseRegister(m.left().node()), g.UseRegister(m.right().node()));
+    return;
+  }
+  VisitRR(this, kArm64Float32Neg, node);
+}
+
+void InstructionSelector::VisitFloat32Mul(Node* node) {
+  Arm64OperandGenerator g(this);
+  Float32BinopMatcher m(node);
+
+  if (m.left().IsFloat32Neg() && CanCover(node, m.left().node())) {
+    Emit(kArm64Float32Fnmul, g.DefineAsRegister(node),
+         g.UseRegister(m.left().node()->InputAt(0)),
+         g.UseRegister(m.right().node()));
+    return;
+  }
+
+  if (m.right().IsFloat32Neg() && CanCover(node, m.right().node())) {
+    Emit(kArm64Float32Fnmul, g.DefineAsRegister(node),
+         g.UseRegister(m.right().node()->InputAt(0)),
+         g.UseRegister(m.left().node()));
+    return;
+  }
+
+  return VisitRRR(this, kArm64Float32Mul, node);
+}
+
 void InstructionSelector::VisitFloat32Equal(Node* node) {
   FlagsContinuation cont = FlagsContinuation::ForSet(kEqual, node);
   VisitFloat32Compare(this, node, &cont);
@@ -2719,6 +2747,38 @@ void InstructionSelector::VisitFloat64InsertHighWord32(Node* node) {
        g.UseRegister(left), g.UseRegister(right));
 }
 
+void InstructionSelector::VisitFloat64Neg(Node* node) {
+  Arm64OperandGenerator g(this);
+  Node* in = node->InputAt(0);
+  if (in->opcode() == IrOpcode::kFloat64Mul && CanCover(node, in)) {
+    Float64BinopMatcher m(in);
+    Emit(kArm64Float64Fnmul, g.DefineAsRegister(node),
+         g.UseRegister(m.left().node()), g.UseRegister(m.right().node()));
+    return;
+  }
+  VisitRR(this, kArm64Float64Neg, node);
+}
+
+void InstructionSelector::VisitFloat64Mul(Node* node) {
+  Arm64OperandGenerator g(this);
+  Float64BinopMatcher m(node);
+
+  if (m.left().IsFloat64Neg() && CanCover(node, m.left().node())) {
+    Emit(kArm64Float64Fnmul, g.DefineAsRegister(node),
+         g.UseRegister(m.left().node()->InputAt(0)),
+         g.UseRegister(m.right().node()));
+    return;
+  }
+
+  if (m.right().IsFloat64Neg() && CanCover(node, m.right().node())) {
+    Emit(kArm64Float64Fnmul, g.DefineAsRegister(node),
+         g.UseRegister(m.right().node()->InputAt(0)),
+         g.UseRegister(m.left().node()));
+    return;
+  }
+
+  return VisitRRR(this, kArm64Float64Mul, node);
+}
+
 void InstructionSelector::VisitWord32AtomicLoad(Node* node) {
   LoadRepresentation load_rep = LoadRepresentationOf(node->op());
   ArchOpcode opcode = kArchNop;
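
A note on the CanCover guard, added here for exposition (not part of
the commit): the fusion is only legal when the intermediate multiply or
negation feeds nothing but the node being selected; otherwise the
standalone value would still have to be materialized for its other
users and the fusion would duplicate work. A simplified sketch of the
condition's intent, using a hypothetical helper rather than TurboFan's
real implementation:

    // Sketch only: fuse neg(mul(a, b)) into fnmul when the mul has no
    // consumer other than this neg. TurboFan's CanCover() additionally
    // requires both nodes to sit in the same basic block with no
    // intervening effectful operations.
    bool CanFuseIntoFnmul(const Node* neg, const Node* mul) {
      return mul->UseCount() == 1 && InSameBlockWithoutEffects(neg, mul);
    }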


@@ -3998,6 +3998,87 @@ TEST(RunFloat64MulP) {
   }
 }
 
+TEST(RunFloat32MulAndFloat32Neg) {
+  BufferedRawMachineAssemblerTester<float> m(MachineType::Float32(),
+                                             MachineType::Float32());
+  m.Return(m.Float32Neg(m.Float32Mul(m.Parameter(0), m.Parameter(1))));
+
+  FOR_FLOAT32_INPUTS(i) {
+    FOR_FLOAT32_INPUTS(j) { CHECK_FLOAT_EQ(-(i * j), m.Call(i, j)); }
+  }
+}
+
+TEST(RunFloat64MulAndFloat64Neg) {
+  BufferedRawMachineAssemblerTester<double> m(MachineType::Float64(),
+                                              MachineType::Float64());
+  m.Return(m.Float64Neg(m.Float64Mul(m.Parameter(0), m.Parameter(1))));
+
+  FOR_FLOAT64_INPUTS(i) {
+    FOR_FLOAT64_INPUTS(j) { CHECK_DOUBLE_EQ(-(i * j), m.Call(i, j)); }
+  }
+}
+
+TEST(RunFloat32NegAndFloat32Mul1) {
+  BufferedRawMachineAssemblerTester<float> m(MachineType::Float32(),
+                                             MachineType::Float32());
+  m.Return(m.Float32Mul(m.Float32Neg(m.Parameter(0)), m.Parameter(1)));
+
+  FOR_FLOAT32_INPUTS(i) {
+    FOR_FLOAT32_INPUTS(j) { CHECK_FLOAT_EQ((-i * j), m.Call(i, j)); }
+  }
+}
+
+TEST(RunFloat64NegAndFloat64Mul1) {
+  BufferedRawMachineAssemblerTester<double> m(MachineType::Float64(),
+                                              MachineType::Float64());
+  m.Return(m.Float64Mul(m.Float64Neg(m.Parameter(0)), m.Parameter(1)));
+
+  FOR_FLOAT64_INPUTS(i) {
+    FOR_FLOAT64_INPUTS(j) { CHECK_DOUBLE_EQ((-i * j), m.Call(i, j)); }
+  }
+}
+
+TEST(RunFloat32NegAndFloat32Mul2) {
+  BufferedRawMachineAssemblerTester<float> m(MachineType::Float32(),
+                                             MachineType::Float32());
+  m.Return(m.Float32Mul(m.Parameter(0), m.Float32Neg(m.Parameter(1))));
+
+  FOR_FLOAT32_INPUTS(i) {
+    FOR_FLOAT32_INPUTS(j) { CHECK_FLOAT_EQ((i * -j), m.Call(i, j)); }
+  }
+}
+
+TEST(RunFloat64NegAndFloat64Mul2) {
+  BufferedRawMachineAssemblerTester<double> m(MachineType::Float64(),
+                                              MachineType::Float64());
+  m.Return(m.Float64Mul(m.Parameter(0), m.Float64Neg(m.Parameter(1))));
+
+  FOR_FLOAT64_INPUTS(i) {
+    FOR_FLOAT64_INPUTS(j) { CHECK_DOUBLE_EQ((i * -j), m.Call(i, j)); }
+  }
+}
+
+TEST(RunFloat32NegAndFloat32Mul3) {
+  BufferedRawMachineAssemblerTester<float> m(MachineType::Float32(),
+                                             MachineType::Float32());
+  m.Return(
+      m.Float32Mul(m.Float32Neg(m.Parameter(0)), m.Float32Neg(m.Parameter(1))));
+
+  FOR_FLOAT32_INPUTS(i) {
+    FOR_FLOAT32_INPUTS(j) { CHECK_FLOAT_EQ((-i * -j), m.Call(i, j)); }
+  }
+}
+
+TEST(RunFloat64NegAndFloat64Mul3) {
+  BufferedRawMachineAssemblerTester<double> m(MachineType::Float64(),
+                                              MachineType::Float64());
+  m.Return(
+      m.Float64Mul(m.Float64Neg(m.Parameter(0)), m.Float64Neg(m.Parameter(1))));
+
+  FOR_FLOAT64_INPUTS(i) {
+    FOR_FLOAT64_INPUTS(j) { CHECK_DOUBLE_EQ((-i * -j), m.Call(i, j)); }
+  }
+}
+
 TEST(RunFloat64MulAndFloat64Add1) {
   BufferedRawMachineAssemblerTester<double> m(
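
As a usage note (the build directory is hypothetical; selecting a case
by file/test name is cctest's usual convention), a single one of these
cases can be run on an arm64 build with:

    out/arm64.release/cctest test-run-machops/RunFloat32MulAndFloat32Neg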


@@ -4368,6 +4368,78 @@ TEST_F(InstructionSelectorTest, Float64Neg) {
   EXPECT_EQ(s.ToVreg(n), s.ToVreg(s[0]->Output()));
 }
 
+TEST_F(InstructionSelectorTest, Float32NegWithMul) {
+  StreamBuilder m(this, MachineType::Float32(), MachineType::Float32(),
+                  MachineType::Float32());
+  Node* const p0 = m.Parameter(0);
+  Node* const p1 = m.Parameter(1);
+  Node* const n1 = m.AddNode(m.machine()->Float32Mul(), p0, p1);
+  Node* const n2 = m.AddNode(m.machine()->Float32Neg(), n1);
+  m.Return(n2);
+  Stream s = m.Build();
+  ASSERT_EQ(1U, s.size());
+  EXPECT_EQ(kArm64Float32Fnmul, s[0]->arch_opcode());
+  ASSERT_EQ(2U, s[0]->InputCount());
+  EXPECT_EQ(s.ToVreg(p0), s.ToVreg(s[0]->InputAt(0)));
+  EXPECT_EQ(s.ToVreg(p1), s.ToVreg(s[0]->InputAt(1)));
+  ASSERT_EQ(1U, s[0]->OutputCount());
+  EXPECT_EQ(s.ToVreg(n2), s.ToVreg(s[0]->Output()));
+}
+
+TEST_F(InstructionSelectorTest, Float64NegWithMul) {
+  StreamBuilder m(this, MachineType::Float64(), MachineType::Float64(),
+                  MachineType::Float64());
+  Node* const p0 = m.Parameter(0);
+  Node* const p1 = m.Parameter(1);
+  Node* const n1 = m.AddNode(m.machine()->Float64Mul(), p0, p1);
+  Node* const n2 = m.AddNode(m.machine()->Float64Neg(), n1);
+  m.Return(n2);
+  Stream s = m.Build();
+  ASSERT_EQ(1U, s.size());
+  EXPECT_EQ(kArm64Float64Fnmul, s[0]->arch_opcode());
+  ASSERT_EQ(2U, s[0]->InputCount());
+  EXPECT_EQ(s.ToVreg(p0), s.ToVreg(s[0]->InputAt(0)));
+  EXPECT_EQ(s.ToVreg(p1), s.ToVreg(s[0]->InputAt(1)));
+  ASSERT_EQ(1U, s[0]->OutputCount());
+  EXPECT_EQ(s.ToVreg(n2), s.ToVreg(s[0]->Output()));
+}
+
+TEST_F(InstructionSelectorTest, Float32MulWithNeg) {
+  StreamBuilder m(this, MachineType::Float32(), MachineType::Float32(),
+                  MachineType::Float32());
+  Node* const p0 = m.Parameter(0);
+  Node* const p1 = m.Parameter(1);
+  Node* const n1 = m.AddNode(m.machine()->Float32Neg(), p0);
+  Node* const n2 = m.AddNode(m.machine()->Float32Mul(), n1, p1);
+  m.Return(n2);
+  Stream s = m.Build();
+  ASSERT_EQ(1U, s.size());
+  EXPECT_EQ(kArm64Float32Fnmul, s[0]->arch_opcode());
+  ASSERT_EQ(2U, s[0]->InputCount());
+  EXPECT_EQ(s.ToVreg(p0), s.ToVreg(s[0]->InputAt(0)));
+  EXPECT_EQ(s.ToVreg(p1), s.ToVreg(s[0]->InputAt(1)));
+  ASSERT_EQ(1U, s[0]->OutputCount());
+  EXPECT_EQ(s.ToVreg(n2), s.ToVreg(s[0]->Output()));
+}
+
+TEST_F(InstructionSelectorTest, Float64MulWithNeg) {
+  StreamBuilder m(this, MachineType::Float64(), MachineType::Float64(),
+                  MachineType::Float64());
+  Node* const p0 = m.Parameter(0);
+  Node* const p1 = m.Parameter(1);
+  Node* const n1 = m.AddNode(m.machine()->Float64Neg(), p0);
+  Node* const n2 = m.AddNode(m.machine()->Float64Mul(), n1, p1);
+  m.Return(n2);
+  Stream s = m.Build();
+  ASSERT_EQ(1U, s.size());
+  EXPECT_EQ(kArm64Float64Fnmul, s[0]->arch_opcode());
+  ASSERT_EQ(2U, s[0]->InputCount());
+  EXPECT_EQ(s.ToVreg(p0), s.ToVreg(s[0]->InputAt(0)));
+  EXPECT_EQ(s.ToVreg(p1), s.ToVreg(s[0]->InputAt(1)));
+  ASSERT_EQ(1U, s[0]->OutputCount());
+  EXPECT_EQ(s.ToVreg(n2), s.ToVreg(s[0]->Output()));
+}
+
 TEST_F(InstructionSelectorTest, LoadAndShiftRight) {
   {
     int32_t immediates[] = {-256, -255, -3, -2, -1, 0, 1,