From 0d591e919d4b51519fecec15b0f061c4535b4143 Mon Sep 17 00:00:00 2001 From: George Wort Date: Tue, 30 Aug 2022 11:10:24 +0100 Subject: [PATCH] Reland "[turbofan][arm64] Emit Lsl for Int32MulWithOverflow when possible" This is a reland of commit aa541f1c9c6833ab910b7077ac31b434ca0871bf Original change's description: > [turbofan][arm64] Emit Lsl for Int32MulWithOverflow when possible > > Int32MulWithOverflow on arm64 uses a cmp to set flags rather than > the multiply instruction itself, thus we can use a left shift when > the multiplication is by a power of two. > > This provides 0.15% for Speedometer2 on a Neoverse-N1 machine, > with React being improved by 0.45%. > > Change-Id: Ic8db42ecc7cb14cf1ac7bbbeab0e9d8359104351 > Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3829472 > Commit-Queue: George Wort > Reviewed-by: Nico Hartmann > Cr-Commit-Position: refs/heads/main@{#82499} Change-Id: Ib8f387bd41d283df551299f7ee98e72d39e2a3bd Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3865484 Commit-Queue: George Wort Reviewed-by: Nico Hartmann Cr-Commit-Position: refs/heads/main@{#82909} --- src/codegen/arm64/macro-assembler-arm64-inl.h | 14 ++++++------ src/codegen/arm64/macro-assembler-arm64.h | 4 ++-- .../backend/arm64/code-generator-arm64.cc | 4 ++++ .../backend/arm64/instruction-codes-arm64.h | 1 + .../arm64/instruction-scheduler-arm64.cc | 2 ++ .../arm64/instruction-selector-arm64.cc | 13 +++++++++-- test/common/value-helper.h | 22 +++++++++---------- .../instruction-selector-arm64-unittest.cc | 18 +++++++++++++++ 8 files changed, 56 insertions(+), 22 deletions(-) diff --git a/src/codegen/arm64/macro-assembler-arm64-inl.h b/src/codegen/arm64/macro-assembler-arm64-inl.h index 28134dd999..caf9884ec8 100644 --- a/src/codegen/arm64/macro-assembler-arm64-inl.h +++ b/src/codegen/arm64/macro-assembler-arm64-inl.h @@ -903,13 +903,6 @@ void TurboAssembler::Ror(const Register& rd, const Register& rn, rorv(rd, rn, rm); } -void MacroAssembler::Sbfiz(const Register& rd, const Register& rn, unsigned lsb, - unsigned width) { - DCHECK(allow_macro_instructions()); - DCHECK(!rd.IsZero()); - sbfiz(rd, rn, lsb, width); -} - void TurboAssembler::Sbfx(const Register& rd, const Register& rn, unsigned lsb, unsigned width) { DCHECK(allow_macro_instructions()); @@ -990,6 +983,13 @@ void TurboAssembler::Ubfiz(const Register& rd, const Register& rn, unsigned lsb, ubfiz(rd, rn, lsb, width); } +void TurboAssembler::Sbfiz(const Register& rd, const Register& rn, unsigned lsb, + unsigned width) { + DCHECK(allow_macro_instructions()); + DCHECK(!rd.IsZero()); + sbfiz(rd, rn, lsb, width); +} + void TurboAssembler::Ubfx(const Register& rd, const Register& rn, unsigned lsb, unsigned width) { DCHECK(allow_macro_instructions()); diff --git a/src/codegen/arm64/macro-assembler-arm64.h b/src/codegen/arm64/macro-assembler-arm64.h index 633ac21951..0f77e03655 100644 --- a/src/codegen/arm64/macro-assembler-arm64.h +++ b/src/codegen/arm64/macro-assembler-arm64.h @@ -1063,6 +1063,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { inline void Sxtw(const Register& rd, const Register& rn); inline void Ubfiz(const Register& rd, const Register& rn, unsigned lsb, unsigned width); + inline void Sbfiz(const Register& rd, const Register& rn, unsigned lsb, + unsigned width); inline void Ubfx(const Register& rd, const Register& rn, unsigned lsb, unsigned width); inline void Lsr(const Register& rd, const Register& rn, unsigned shift); @@ -1624,8 +1626,6 @@ class V8_EXPORT_PRIVATE MacroAssembler : public TurboAssembler { mvni(vd, imm8, shift, shift_amount); } inline void Rev(const Register& rd, const Register& rn); - inline void Sbfiz(const Register& rd, const Register& rn, unsigned lsb, - unsigned width); inline void Smaddl(const Register& rd, const Register& rn, const Register& rm, const Register& ra); inline void Smsubl(const Register& rd, const Register& rn, const Register& rm, diff --git a/src/compiler/backend/arm64/code-generator-arm64.cc b/src/compiler/backend/arm64/code-generator-arm64.cc index d8a512b050..4a9654e8b9 100644 --- a/src/compiler/backend/arm64/code-generator-arm64.cc +++ b/src/compiler/backend/arm64/code-generator-arm64.cc @@ -1474,6 +1474,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( __ Ubfiz(i.OutputRegister32(), i.InputRegister32(0), i.InputInt5(1), i.InputInt5(2)); break; + case kArm64Sbfiz: + __ Sbfiz(i.OutputRegister(), i.InputRegister(0), i.InputInt6(1), + i.InputInt6(2)); + break; case kArm64Bfi: __ Bfi(i.OutputRegister(), i.InputRegister(1), i.InputInt6(2), i.InputInt6(3)); diff --git a/src/compiler/backend/arm64/instruction-codes-arm64.h b/src/compiler/backend/arm64/instruction-codes-arm64.h index 232b78e1b7..2d5671927f 100644 --- a/src/compiler/backend/arm64/instruction-codes-arm64.h +++ b/src/compiler/backend/arm64/instruction-codes-arm64.h @@ -123,6 +123,7 @@ namespace compiler { V(Arm64Ubfx) \ V(Arm64Ubfx32) \ V(Arm64Ubfiz32) \ + V(Arm64Sbfiz) \ V(Arm64Bfi) \ V(Arm64Rbit) \ V(Arm64Rbit32) \ diff --git a/src/compiler/backend/arm64/instruction-scheduler-arm64.cc b/src/compiler/backend/arm64/instruction-scheduler-arm64.cc index 8749ce9b33..909bc24c90 100644 --- a/src/compiler/backend/arm64/instruction-scheduler-arm64.cc +++ b/src/compiler/backend/arm64/instruction-scheduler-arm64.cc @@ -89,6 +89,7 @@ int InstructionScheduler::GetTargetInstructionFlags( case kArm64Ubfx: case kArm64Ubfx32: case kArm64Ubfiz32: + case kArm64Sbfiz: case kArm64Bfi: case kArm64Rbit: case kArm64Rbit32: @@ -412,6 +413,7 @@ int InstructionScheduler::GetInstructionLatency(const Instruction* instr) { case kArm64Sxth32: case kArm64Sxtw: case kArm64Ubfiz32: + case kArm64Sbfiz: case kArm64Ubfx: case kArm64Ubfx32: return 1; diff --git a/src/compiler/backend/arm64/instruction-selector-arm64.cc b/src/compiler/backend/arm64/instruction-selector-arm64.cc index 1960a237ca..b8e46f6a65 100644 --- a/src/compiler/backend/arm64/instruction-selector-arm64.cc +++ b/src/compiler/backend/arm64/instruction-selector-arm64.cc @@ -1669,8 +1669,17 @@ void EmitInt32MulWithOverflow(InstructionSelector* selector, Node* node, Int32BinopMatcher m(node); InstructionOperand result = g.DefineAsRegister(node); InstructionOperand left = g.UseRegister(m.left().node()); - InstructionOperand right = g.UseRegister(m.right().node()); - selector->Emit(kArm64Smull, result, left, right); + + if (m.right().HasResolvedValue() && + base::bits::IsPowerOfTwo(m.right().ResolvedValue())) { + // Sign extend the bottom 32 bits and shift left. + int32_t shift = base::bits::WhichPowerOfTwo(m.right().ResolvedValue()); + selector->Emit(kArm64Sbfiz, result, left, g.TempImmediate(shift), + g.TempImmediate(32)); + } else { + InstructionOperand right = g.UseRegister(m.right().node()); + selector->Emit(kArm64Smull, result, left, right); + } InstructionCode opcode = kArm64Cmp | AddressingModeField::encode(kMode_Operand2_R_SXTW); diff --git a/test/common/value-helper.h b/test/common/value-helper.h index a5c7c29ec3..ea14014374 100644 --- a/test/common/value-helper.h +++ b/test/common/value-helper.h @@ -201,19 +201,19 @@ class ValueHelper { } static constexpr uint32_t uint32_array[] = { - // 0x00000000, 0x00000001, 0xFFFFFFFF, 0x1B09788B, 0x04C5FCE8, 0xCC0DE5BF, - // // This row is useful for testing lea optimizations on intel. - // 0x00000002, 0x00000003, 0x00000004, 0x00000005, 0x00000008, 0x00000009, - // 0x273A798E, 0x187937A3, 0xECE3AF83, 0x5495A16B, 0x0B668ECC, 0x11223344, - // 0x0000009E, 0x00000043, 0x0000AF73, 0x0000116B, 0x00658ECC, 0x002B3B4C, - // 0x88776655, 0x70000000, 0x07200000, 0x7FFFFFFF, 0x56123761, 0x7FFFFF00, - // 0x761C4761, 0x80000000, 0x88888888, 0xA0000000, 0xDDDDDDDD, 0xE0000000, - // 0xEEEEEEEE, 0xFFFFFFFD, 0xF0000000, 0x007FFFFF, 0x003FFFFF, 0x001FFFFF, - // 0x000FFFFF, 0x0007FFFF, 0x0003FFFF, 0x0001FFFF, 0x0000FFFF, 0x00007FFF, - // 0x00003FFF, 0x00001FFF, 0x00000FFF, 0x000007FF, 0x000003FF, 0x000001FF, + 0x00000000, 0x00000001, 0xFFFFFFFF, 0x1B09788B, 0x04C5FCE8, 0xCC0DE5BF, + // This row is useful for testing lea optimizations on intel. + 0x00000002, 0x00000003, 0x00000004, 0x00000005, 0x00000008, 0x00000009, + 0x273A798E, 0x187937A3, 0xECE3AF83, 0x5495A16B, 0x0B668ECC, 0x11223344, + 0x0000009E, 0x00000043, 0x0000AF73, 0x0000116B, 0x00658ECC, 0x002B3B4C, + 0x88776655, 0x70000000, 0x07200000, 0x7FFFFFFF, 0x56123761, 0x7FFFFF00, + 0x761C4761, 0x80000000, 0x88888888, 0xA0000000, 0xDDDDDDDD, 0xE0000000, + 0xEEEEEEEE, 0xFFFFFFFD, 0xF0000000, 0x007FFFFF, 0x003FFFFF, 0x001FFFFF, + 0x000FFFFF, 0x0007FFFF, 0x0003FFFF, 0x0001FFFF, 0x0000FFFF, 0x00007FFF, + 0x00003FFF, 0x00001FFF, 0x00000FFF, 0x000007FF, 0x000003FF, 0x000001FF, // Bit pattern of a quiet NaN and signaling NaN, with or without // additional payload. - 0x7F876543}; + 0x7FC00000, 0x7F800000, 0x7FFFFFFF, 0x7F876543}; static constexpr base::Vector uint32_vector() { return base::ArrayVector(uint32_array); diff --git a/test/unittests/compiler/arm64/instruction-selector-arm64-unittest.cc b/test/unittests/compiler/arm64/instruction-selector-arm64-unittest.cc index 39b77ad1f9..439057b54d 100644 --- a/test/unittests/compiler/arm64/instruction-selector-arm64-unittest.cc +++ b/test/unittests/compiler/arm64/instruction-selector-arm64-unittest.cc @@ -1930,6 +1930,24 @@ TEST_F(InstructionSelectorTest, OvfBranchWithImmediateOnLeft) { } } +TEST_F(InstructionSelectorTest, OvfValMulImmediateOnRight) { + TRACED_FORRANGE(int32_t, shift, 0, 30) { + StreamBuilder m(this, MachineType::Int32(), MachineType::Int32()); + m.Return(m.Projection(0, m.Int32MulWithOverflow(m.Int32Constant(1 << shift), + m.Parameter(0)))); + Stream s = m.Build(); + + ASSERT_EQ(2U, s.size()); + EXPECT_EQ(kArm64Sbfiz, s[0]->arch_opcode()); + EXPECT_EQ(kArm64Cmp, s[1]->arch_opcode()); + ASSERT_EQ(3U, s[0]->InputCount()); + EXPECT_EQ(shift, s.ToInt32(s[0]->InputAt(1))); + EXPECT_LE(1U, s[0]->OutputCount()); + EXPECT_EQ(32, s.ToInt32(s[0]->InputAt(2))); + EXPECT_EQ(kFlags_none, s[0]->flags_mode()); + } +} + // ----------------------------------------------------------------------------- // Shift instructions.