[ia32] Merge SSE/AVX float32/float64 add sub mul div

This removes 8 arch opcodes: each SSE/AVX pair for float32/float64 add, sub,
mul, and div is merged into a single opcode (kFloat32Add, kFloat64Div, etc.),
and the choice between the two encodings moves into the shared macro-assembler.

Bug: v8:11217
Change-Id: I2c7a73b032ba5fa21f9843ebb4325e226a22550a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3114590
Reviewed-by: Adam Klein <adamk@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76442}
Ng Zhi An, 2021-08-23 13:56:32 -07:00, committed by V8 LUCI CQ
parent 360fdbdee5
commit 09413a884f
6 changed files with 88 additions and 134 deletions

@@ -165,6 +165,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   // Keep this list sorted by required extension, then instruction name.
   AVX_OP(Addpd, addpd)
   AVX_OP(Addps, addps)
+  AVX_OP(Addsd, addsd)
+  AVX_OP(Addss, addss)
   AVX_OP(Andnpd, andnpd)
   AVX_OP(Andnps, andnps)
   AVX_OP(Andpd, andpd)
@@ -183,6 +185,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   AVX_OP(Cvttps2dq, cvttps2dq)
   AVX_OP(Divpd, divpd)
   AVX_OP(Divps, divps)
+  AVX_OP(Divsd, divsd)
+  AVX_OP(Divss, divss)
   AVX_OP(Maxpd, maxpd)
   AVX_OP(Maxps, maxps)
   AVX_OP(Minpd, minpd)
@@ -200,6 +204,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   AVX_OP(Movups, movups)
   AVX_OP(Mulpd, mulpd)
   AVX_OP(Mulps, mulps)
+  AVX_OP(Mulsd, mulsd)
+  AVX_OP(Mulss, mulss)
   AVX_OP(Orpd, orpd)
   AVX_OP(Orps, orps)
   AVX_OP(Packssdw, packssdw)
@@ -259,6 +265,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   AVX_OP(Sqrtss, sqrtss)
   AVX_OP(Subpd, subpd)
   AVX_OP(Subps, subps)
+  AVX_OP(Subsd, subsd)
+  AVX_OP(Subss, subss)
   AVX_OP(Unpcklps, unpcklps)
   AVX_OP(Xorpd, xorpd)
   AVX_OP(Xorps, xorps)
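These capitalized wrappers are what lets the ia32 code generator drop its
explicit SSE/AVX split below. A minimal sketch of the dispatch an AVX_OP entry
is assumed to generate, with a simplified signature (the real macro templates
over all register/operand combinations, and its SSE fallback may insert a move
rather than assert):

  // Sketch only, not the actual macro output: Addss picks an encoding
  // at code-emission time, based on the detected CPU features.
  void SharedTurboAssembler::Addss(XMMRegister dst, XMMRegister src1,
                                   Operand src2) {
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope scope(this, AVX);
      vaddss(dst, src1, src2);  // Non-destructive three-operand AVX form.
    } else {
      DCHECK_EQ(dst, src1);  // SSE form is destructive: dst doubles as src1.
      addss(dst, src2);
    }
  }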

@@ -1257,21 +1257,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     case kSSEFloat32Cmp:
       __ ucomiss(i.InputDoubleRegister(0), i.InputOperand(1));
       break;
-    case kSSEFloat32Add:
-      __ addss(i.InputDoubleRegister(0), i.InputOperand(1));
-      break;
-    case kSSEFloat32Sub:
-      __ subss(i.InputDoubleRegister(0), i.InputOperand(1));
-      break;
-    case kSSEFloat32Mul:
-      __ mulss(i.InputDoubleRegister(0), i.InputOperand(1));
-      break;
-    case kSSEFloat32Div:
-      __ divss(i.InputDoubleRegister(0), i.InputOperand(1));
-      // Don't delete this mov. It may improve performance on some CPUs,
-      // when there is a (v)mulss depending on the result.
-      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
-      break;
     case kSSEFloat32Sqrt:
       __ sqrtss(i.OutputDoubleRegister(), i.InputOperand(0));
       break;
@@ -1301,21 +1286,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     case kSSEFloat64Cmp:
       __ ucomisd(i.InputDoubleRegister(0), i.InputOperand(1));
       break;
-    case kSSEFloat64Add:
-      __ addsd(i.InputDoubleRegister(0), i.InputOperand(1));
-      break;
-    case kSSEFloat64Sub:
-      __ subsd(i.InputDoubleRegister(0), i.InputOperand(1));
-      break;
-    case kSSEFloat64Mul:
-      __ mulsd(i.InputDoubleRegister(0), i.InputOperand(1));
-      break;
-    case kSSEFloat64Div:
-      __ divsd(i.InputDoubleRegister(0), i.InputOperand(1));
-      // Don't delete this mov. It may improve performance on some CPUs,
-      // when there is a (v)mulsd depending on the result.
-      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
-      break;
     case kSSEFloat32Max: {
       Label compare_swap, done_compare;
       if (instr->InputAt(1)->IsFPRegister()) {
@@ -1538,55 +1508,47 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     case kSSEFloat64LoadLowWord32:
       __ movd(i.OutputDoubleRegister(), i.InputOperand(0));
       break;
-    case kAVXFloat32Add: {
-      CpuFeatureScope avx_scope(tasm(), AVX);
-      __ vaddss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
-                i.InputOperand(1));
+    case kFloat32Add: {
+      __ Addss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
+               i.InputOperand(1));
       break;
     }
-    case kAVXFloat32Sub: {
-      CpuFeatureScope avx_scope(tasm(), AVX);
-      __ vsubss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
-                i.InputOperand(1));
+    case kFloat32Sub: {
+      __ Subss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
+               i.InputOperand(1));
       break;
     }
-    case kAVXFloat32Mul: {
-      CpuFeatureScope avx_scope(tasm(), AVX);
-      __ vmulss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
-                i.InputOperand(1));
+    case kFloat32Mul: {
+      __ Mulss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
+               i.InputOperand(1));
       break;
     }
-    case kAVXFloat32Div: {
-      CpuFeatureScope avx_scope(tasm(), AVX);
-      __ vdivss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
-                i.InputOperand(1));
+    case kFloat32Div: {
+      __ Divss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
+               i.InputOperand(1));
       // Don't delete this mov. It may improve performance on some CPUs,
       // when there is a (v)mulss depending on the result.
      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     }
-    case kAVXFloat64Add: {
-      CpuFeatureScope avx_scope(tasm(), AVX);
-      __ vaddsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
-                i.InputOperand(1));
+    case kFloat64Add: {
+      __ Addsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
+               i.InputOperand(1));
       break;
     }
-    case kAVXFloat64Sub: {
-      CpuFeatureScope avx_scope(tasm(), AVX);
-      __ vsubsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
-                i.InputOperand(1));
+    case kFloat64Sub: {
+      __ Subsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
+               i.InputOperand(1));
       break;
     }
-    case kAVXFloat64Mul: {
-      CpuFeatureScope avx_scope(tasm(), AVX);
-      __ vmulsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
-                i.InputOperand(1));
+    case kFloat64Mul: {
+      __ Mulsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
+               i.InputOperand(1));
       break;
     }
-    case kAVXFloat64Div: {
-      CpuFeatureScope avx_scope(tasm(), AVX);
-      __ vdivsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
-                i.InputOperand(1));
+    case kFloat64Div: {
+      __ Divsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
+               i.InputOperand(1));
       // Don't delete this mov. It may improve performance on some CPUs,
       // when there is a (v)mulsd depending on the result.
       __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
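With the merged opcodes, the AVX/SSE decision moves out of the code generator
and into the macro-assembler, which is why the explicit CpuFeatureScope and the
v-prefixed mnemonics disappear from these cases. Note that the
dependency-breaking movaps after the divisions survives the merge: the comment
about a dependent (v)mulss/(v)mulsd applies to both encodings.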

@@ -48,19 +48,11 @@ namespace compiler {
   V(IA32MFence) \
   V(IA32LFence) \
   V(SSEFloat32Cmp) \
-  V(SSEFloat32Add) \
-  V(SSEFloat32Sub) \
-  V(SSEFloat32Mul) \
-  V(SSEFloat32Div) \
   V(SSEFloat32Abs) \
   V(SSEFloat32Neg) \
   V(SSEFloat32Sqrt) \
   V(SSEFloat32Round) \
   V(SSEFloat64Cmp) \
-  V(SSEFloat64Add) \
-  V(SSEFloat64Sub) \
-  V(SSEFloat64Mul) \
-  V(SSEFloat64Div) \
   V(SSEFloat64Mod) \
   V(SSEFloat32Max) \
   V(SSEFloat64Max) \
@@ -86,14 +78,14 @@ namespace compiler {
   V(SSEFloat64InsertHighWord32) \
   V(SSEFloat64LoadLowWord32) \
   V(SSEFloat64SilenceNaN) \
-  V(AVXFloat32Add) \
-  V(AVXFloat32Sub) \
-  V(AVXFloat32Mul) \
-  V(AVXFloat32Div) \
-  V(AVXFloat64Add) \
-  V(AVXFloat64Sub) \
-  V(AVXFloat64Mul) \
-  V(AVXFloat64Div) \
+  V(Float32Add) \
+  V(Float32Sub) \
+  V(Float64Add) \
+  V(Float64Sub) \
+  V(Float32Mul) \
+  V(Float32Div) \
+  V(Float64Mul) \
+  V(Float64Div) \
   V(AVXFloat64Abs) \
   V(AVXFloat64Neg) \
   V(AVXFloat32Abs) \

@@ -49,19 +49,11 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kIA32Bswap:
     case kIA32Lea:
     case kSSEFloat32Cmp:
-    case kSSEFloat32Add:
-    case kSSEFloat32Sub:
-    case kSSEFloat32Mul:
-    case kSSEFloat32Div:
     case kSSEFloat32Abs:
     case kSSEFloat32Neg:
     case kSSEFloat32Sqrt:
     case kSSEFloat32Round:
     case kSSEFloat64Cmp:
-    case kSSEFloat64Add:
-    case kSSEFloat64Sub:
-    case kSSEFloat64Mul:
-    case kSSEFloat64Div:
     case kSSEFloat64Mod:
     case kSSEFloat32Max:
     case kSSEFloat64Max:
@@ -87,14 +79,14 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kSSEFloat64InsertHighWord32:
     case kSSEFloat64LoadLowWord32:
    case kSSEFloat64SilenceNaN:
-    case kAVXFloat32Add:
-    case kAVXFloat32Sub:
-    case kAVXFloat32Mul:
-    case kAVXFloat32Div:
-    case kAVXFloat64Add:
-    case kAVXFloat64Sub:
-    case kAVXFloat64Mul:
-    case kAVXFloat64Div:
+    case kFloat32Add:
+    case kFloat32Sub:
+    case kFloat64Add:
+    case kFloat64Sub:
+    case kFloat32Mul:
+    case kFloat32Div:
+    case kFloat64Mul:
+    case kFloat64Div:
     case kAVXFloat64Abs:
     case kAVXFloat64Neg:
     case kAVXFloat32Abs:
@@ -448,7 +440,7 @@ int InstructionScheduler::GetInstructionLatency(const Instruction* instr) {
   // Basic latency modeling for ia32 instructions. They have been determined
   // in an empirical way.
   switch (instr->arch_opcode()) {
-    case kSSEFloat64Mul:
+    case kFloat64Mul:
      return 5;
    case kIA32Imul:
    case kIA32ImulHigh:
@@ -456,18 +448,18 @@ int InstructionScheduler::GetInstructionLatency(const Instruction* instr) {
     case kSSEFloat32Cmp:
     case kSSEFloat64Cmp:
       return 9;
-    case kSSEFloat32Add:
-    case kSSEFloat32Sub:
+    case kFloat32Add:
+    case kFloat32Sub:
+    case kFloat64Add:
+    case kFloat64Sub:
     case kSSEFloat32Abs:
     case kSSEFloat32Neg:
-    case kSSEFloat64Add:
-    case kSSEFloat64Sub:
     case kSSEFloat64Max:
     case kSSEFloat64Min:
     case kSSEFloat64Abs:
     case kSSEFloat64Neg:
       return 5;
-    case kSSEFloat32Mul:
+    case kFloat32Mul:
       return 4;
     case kSSEFloat32ToFloat64:
     case kSSEFloat64ToFloat32:
@@ -485,9 +477,9 @@ int InstructionScheduler::GetInstructionLatency(const Instruction* instr) {
       return 33;
     case kIA32Udiv:
       return 26;
-    case kSSEFloat32Div:
+    case kFloat32Div:
       return 35;
-    case kSSEFloat64Div:
+    case kFloat64Div:
       return 63;
     case kSSEFloat32Sqrt:
     case kSSEFloat64Sqrt:
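A side effect visible in this file: the latency switch previously matched only
the kSSEFloat* opcodes, so AVX-selected arithmetic fell through to the default
estimate. With the merged opcodes, the empirical numbers (5 for add/sub, 4 for
float32 mul, 35 and 63 for the divisions) now apply regardless of which
encoding the macro-assembler emits — presumably acceptable, since the model is
approximate by design.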

@@ -315,14 +315,14 @@ void VisitRR(InstructionSelector* selector, Node* node,
 }

 void VisitRROFloat(InstructionSelector* selector, Node* node,
-                   ArchOpcode avx_opcode, ArchOpcode sse_opcode) {
+                   ArchOpcode opcode) {
   IA32OperandGenerator g(selector);
   InstructionOperand operand0 = g.UseRegister(node->InputAt(0));
   InstructionOperand operand1 = g.Use(node->InputAt(1));
   if (selector->IsSupported(AVX)) {
-    selector->Emit(avx_opcode, g.DefineAsRegister(node), operand0, operand1);
+    selector->Emit(opcode, g.DefineAsRegister(node), operand0, operand1);
   } else {
-    selector->Emit(sse_opcode, g.DefineSameAsFirst(node), operand0, operand1);
+    selector->Emit(opcode, g.DefineSameAsFirst(node), operand0, operand1);
   }
 }
@@ -1175,23 +1175,23 @@ void InstructionSelector::VisitWord32Ror(Node* node) {
   V(F64x2Trunc, kIA32F64x2Round | MiscField::encode(kRoundToZero)) \
   V(F64x2NearestInt, kIA32F64x2Round | MiscField::encode(kRoundToNearest))

-#define RRO_FLOAT_OP_LIST(V) \
-  V(Float32Add, kAVXFloat32Add, kSSEFloat32Add) \
-  V(Float64Add, kAVXFloat64Add, kSSEFloat64Add) \
-  V(Float32Sub, kAVXFloat32Sub, kSSEFloat32Sub) \
-  V(Float64Sub, kAVXFloat64Sub, kSSEFloat64Sub) \
-  V(Float32Mul, kAVXFloat32Mul, kSSEFloat32Mul) \
-  V(Float64Mul, kAVXFloat64Mul, kSSEFloat64Mul) \
-  V(Float32Div, kAVXFloat32Div, kSSEFloat32Div) \
-  V(Float64Div, kAVXFloat64Div, kSSEFloat64Div) \
-  V(F64x2Add, kIA32F64x2Add, kIA32F64x2Add) \
-  V(F64x2Sub, kIA32F64x2Sub, kIA32F64x2Sub) \
-  V(F64x2Mul, kIA32F64x2Mul, kIA32F64x2Mul) \
-  V(F64x2Div, kIA32F64x2Div, kIA32F64x2Div) \
-  V(F64x2Eq, kIA32F64x2Eq, kIA32F64x2Eq) \
-  V(F64x2Ne, kIA32F64x2Ne, kIA32F64x2Ne) \
-  V(F64x2Lt, kIA32F64x2Lt, kIA32F64x2Lt) \
-  V(F64x2Le, kIA32F64x2Le, kIA32F64x2Le)
+#define RRO_FLOAT_OP_LIST(V) \
+  V(Float32Add, kFloat32Add) \
+  V(Float64Add, kFloat64Add) \
+  V(Float32Sub, kFloat32Sub) \
+  V(Float64Sub, kFloat64Sub) \
+  V(Float32Mul, kFloat32Mul) \
+  V(Float64Mul, kFloat64Mul) \
+  V(Float32Div, kFloat32Div) \
+  V(Float64Div, kFloat64Div) \
+  V(F64x2Add, kIA32F64x2Add) \
+  V(F64x2Sub, kIA32F64x2Sub) \
+  V(F64x2Mul, kIA32F64x2Mul) \
+  V(F64x2Div, kIA32F64x2Div) \
+  V(F64x2Eq, kIA32F64x2Eq) \
+  V(F64x2Ne, kIA32F64x2Ne) \
+  V(F64x2Lt, kIA32F64x2Lt) \
+  V(F64x2Le, kIA32F64x2Le)

 #define FLOAT_UNOP_LIST(V) \
   V(Float32Abs, kAVXFloat32Abs, kSSEFloat32Abs) \
@@ -1233,9 +1233,9 @@ RR_OP_LIST(RR_VISITOR)
 #undef RR_VISITOR
 #undef RR_OP_LIST

-#define RRO_FLOAT_VISITOR(Name, avx, sse) \
+#define RRO_FLOAT_VISITOR(Name, opcode) \
   void InstructionSelector::Visit##Name(Node* node) { \
-    VisitRROFloat(this, node, avx, sse); \
+    VisitRROFloat(this, node, opcode); \
   }

 RRO_FLOAT_OP_LIST(RRO_FLOAT_VISITOR)
 #undef RRO_FLOAT_VISITOR
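For one entry, the rewritten macros expand as follows (an illustrative
expansion of the code above, not a new definition):

  // V(Float32Add, kFloat32Add) through RRO_FLOAT_VISITOR:
  void InstructionSelector::VisitFloat32Add(Node* node) {
    VisitRROFloat(this, node, kFloat32Add);
  }

A single opcode suffices because VisitRROFloat still encodes the operand
constraint difference: DefineAsRegister under AVX (three-operand form),
DefineSameAsFirst otherwise (destructive SSE form).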

@@ -321,8 +321,8 @@ INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
 class AddressingModeUnitTest : public InstructionSelectorTest {
  public:
-  AddressingModeUnitTest() : m(NULL) { Reset(); }
-  ~AddressingModeUnitTest() { delete m; }
+  AddressingModeUnitTest() : m(nullptr) { Reset(); }
+  ~AddressingModeUnitTest() override { delete m; }

   void Run(Node* base, Node* load_index, Node* store_index,
            AddressingMode mode) {
@@ -812,10 +812,10 @@ TEST_F(InstructionSelectorTest, Float64BinopArithmetic) {
     m.Return(ret);
     Stream s = m.Build(AVX);
     ASSERT_EQ(4U, s.size());
-    EXPECT_EQ(kAVXFloat64Add, s[0]->arch_opcode());
-    EXPECT_EQ(kAVXFloat64Mul, s[1]->arch_opcode());
-    EXPECT_EQ(kAVXFloat64Sub, s[2]->arch_opcode());
-    EXPECT_EQ(kAVXFloat64Div, s[3]->arch_opcode());
+    EXPECT_EQ(kFloat64Add, s[0]->arch_opcode());
+    EXPECT_EQ(kFloat64Mul, s[1]->arch_opcode());
+    EXPECT_EQ(kFloat64Sub, s[2]->arch_opcode());
+    EXPECT_EQ(kFloat64Div, s[3]->arch_opcode());
   }
   {
     StreamBuilder m(this, MachineType::Float64(), MachineType::Float64(),
@@ -827,10 +827,10 @@ TEST_F(InstructionSelectorTest, Float64BinopArithmetic) {
     m.Return(ret);
     Stream s = m.Build();
     ASSERT_EQ(4U, s.size());
-    EXPECT_EQ(kSSEFloat64Add, s[0]->arch_opcode());
-    EXPECT_EQ(kSSEFloat64Mul, s[1]->arch_opcode());
-    EXPECT_EQ(kSSEFloat64Sub, s[2]->arch_opcode());
-    EXPECT_EQ(kSSEFloat64Div, s[3]->arch_opcode());
+    EXPECT_EQ(kFloat64Add, s[0]->arch_opcode());
+    EXPECT_EQ(kFloat64Mul, s[1]->arch_opcode());
+    EXPECT_EQ(kFloat64Sub, s[2]->arch_opcode());
+    EXPECT_EQ(kFloat64Div, s[3]->arch_opcode());
   }
 }