Reland "[turbofan][arm64] Emit Lsl for Int32MulWithOverflow when possible"

This is a reland of commit aa541f1c9c

Original change's description:
> [turbofan][arm64] Emit Lsl for Int32MulWithOverflow when possible
>
> Int32MulWithOverflow on arm64 uses a cmp to set flags rather than
> the multiply instruction itself, thus we can use a left shift when
> the multiplication is by a power of two.
>
> This provides 0.15% for Speedometer2 on a Neoverse-N1 machine,
> with React being improved by 0.45%.
>
> Change-Id: Ic8db42ecc7cb14cf1ac7bbbeab0e9d8359104351
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3829472
> Commit-Queue: George Wort <george.wort@arm.com>
> Reviewed-by: Nico Hartmann <nicohartmann@chromium.org>
> Cr-Commit-Position: refs/heads/main@{#82499}

Change-Id: Ib8f387bd41d283df551299f7ee98e72d39e2a3bd
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3865484
Commit-Queue: George Wort <george.wort@arm.com>
Reviewed-by: Nico Hartmann <nicohartmann@chromium.org>
Cr-Commit-Position: refs/heads/main@{#82909}
This commit is contained in:
George Wort 2022-08-30 11:10:24 +01:00 committed by V8 LUCI CQ
parent 1c84fedbf5
commit 0d591e919d
8 changed files with 56 additions and 22 deletions

View File

@ -903,13 +903,6 @@ void TurboAssembler::Ror(const Register& rd, const Register& rn,
rorv(rd, rn, rm);
}
void MacroAssembler::Sbfiz(const Register& rd, const Register& rn, unsigned lsb,
unsigned width) {
DCHECK(allow_macro_instructions());
DCHECK(!rd.IsZero());
sbfiz(rd, rn, lsb, width);
}
void TurboAssembler::Sbfx(const Register& rd, const Register& rn, unsigned lsb,
unsigned width) {
DCHECK(allow_macro_instructions());
@ -990,6 +983,13 @@ void TurboAssembler::Ubfiz(const Register& rd, const Register& rn, unsigned lsb,
ubfiz(rd, rn, lsb, width);
}
void TurboAssembler::Sbfiz(const Register& rd, const Register& rn, unsigned lsb,
unsigned width) {
DCHECK(allow_macro_instructions());
DCHECK(!rd.IsZero());
sbfiz(rd, rn, lsb, width);
}
void TurboAssembler::Ubfx(const Register& rd, const Register& rn, unsigned lsb,
unsigned width) {
DCHECK(allow_macro_instructions());

View File

@ -1063,6 +1063,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
inline void Sxtw(const Register& rd, const Register& rn);
inline void Ubfiz(const Register& rd, const Register& rn, unsigned lsb,
unsigned width);
inline void Sbfiz(const Register& rd, const Register& rn, unsigned lsb,
unsigned width);
inline void Ubfx(const Register& rd, const Register& rn, unsigned lsb,
unsigned width);
inline void Lsr(const Register& rd, const Register& rn, unsigned shift);
@ -1624,8 +1626,6 @@ class V8_EXPORT_PRIVATE MacroAssembler : public TurboAssembler {
mvni(vd, imm8, shift, shift_amount);
}
inline void Rev(const Register& rd, const Register& rn);
inline void Sbfiz(const Register& rd, const Register& rn, unsigned lsb,
unsigned width);
inline void Smaddl(const Register& rd, const Register& rn, const Register& rm,
const Register& ra);
inline void Smsubl(const Register& rd, const Register& rn, const Register& rm,

View File

@ -1474,6 +1474,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Ubfiz(i.OutputRegister32(), i.InputRegister32(0), i.InputInt5(1),
i.InputInt5(2));
break;
case kArm64Sbfiz:
__ Sbfiz(i.OutputRegister(), i.InputRegister(0), i.InputInt6(1),
i.InputInt6(2));
break;
case kArm64Bfi:
__ Bfi(i.OutputRegister(), i.InputRegister(1), i.InputInt6(2),
i.InputInt6(3));

View File

@ -123,6 +123,7 @@ namespace compiler {
V(Arm64Ubfx) \
V(Arm64Ubfx32) \
V(Arm64Ubfiz32) \
V(Arm64Sbfiz) \
V(Arm64Bfi) \
V(Arm64Rbit) \
V(Arm64Rbit32) \

View File

@ -89,6 +89,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64Ubfx:
case kArm64Ubfx32:
case kArm64Ubfiz32:
case kArm64Sbfiz:
case kArm64Bfi:
case kArm64Rbit:
case kArm64Rbit32:
@ -412,6 +413,7 @@ int InstructionScheduler::GetInstructionLatency(const Instruction* instr) {
case kArm64Sxth32:
case kArm64Sxtw:
case kArm64Ubfiz32:
case kArm64Sbfiz:
case kArm64Ubfx:
case kArm64Ubfx32:
return 1;

View File

@ -1669,8 +1669,17 @@ void EmitInt32MulWithOverflow(InstructionSelector* selector, Node* node,
Int32BinopMatcher m(node);
InstructionOperand result = g.DefineAsRegister(node);
InstructionOperand left = g.UseRegister(m.left().node());
InstructionOperand right = g.UseRegister(m.right().node());
selector->Emit(kArm64Smull, result, left, right);
if (m.right().HasResolvedValue() &&
base::bits::IsPowerOfTwo(m.right().ResolvedValue())) {
// Sign extend the bottom 32 bits and shift left.
int32_t shift = base::bits::WhichPowerOfTwo(m.right().ResolvedValue());
selector->Emit(kArm64Sbfiz, result, left, g.TempImmediate(shift),
g.TempImmediate(32));
} else {
InstructionOperand right = g.UseRegister(m.right().node());
selector->Emit(kArm64Smull, result, left, right);
}
InstructionCode opcode =
kArm64Cmp | AddressingModeField::encode(kMode_Operand2_R_SXTW);

View File

@ -201,19 +201,19 @@ class ValueHelper {
}
static constexpr uint32_t uint32_array[] = {
// 0x00000000, 0x00000001, 0xFFFFFFFF, 0x1B09788B, 0x04C5FCE8, 0xCC0DE5BF,
// // This row is useful for testing lea optimizations on intel.
// 0x00000002, 0x00000003, 0x00000004, 0x00000005, 0x00000008, 0x00000009,
// 0x273A798E, 0x187937A3, 0xECE3AF83, 0x5495A16B, 0x0B668ECC, 0x11223344,
// 0x0000009E, 0x00000043, 0x0000AF73, 0x0000116B, 0x00658ECC, 0x002B3B4C,
// 0x88776655, 0x70000000, 0x07200000, 0x7FFFFFFF, 0x56123761, 0x7FFFFF00,
// 0x761C4761, 0x80000000, 0x88888888, 0xA0000000, 0xDDDDDDDD, 0xE0000000,
// 0xEEEEEEEE, 0xFFFFFFFD, 0xF0000000, 0x007FFFFF, 0x003FFFFF, 0x001FFFFF,
// 0x000FFFFF, 0x0007FFFF, 0x0003FFFF, 0x0001FFFF, 0x0000FFFF, 0x00007FFF,
// 0x00003FFF, 0x00001FFF, 0x00000FFF, 0x000007FF, 0x000003FF, 0x000001FF,
0x00000000, 0x00000001, 0xFFFFFFFF, 0x1B09788B, 0x04C5FCE8, 0xCC0DE5BF,
// This row is useful for testing lea optimizations on intel.
0x00000002, 0x00000003, 0x00000004, 0x00000005, 0x00000008, 0x00000009,
0x273A798E, 0x187937A3, 0xECE3AF83, 0x5495A16B, 0x0B668ECC, 0x11223344,
0x0000009E, 0x00000043, 0x0000AF73, 0x0000116B, 0x00658ECC, 0x002B3B4C,
0x88776655, 0x70000000, 0x07200000, 0x7FFFFFFF, 0x56123761, 0x7FFFFF00,
0x761C4761, 0x80000000, 0x88888888, 0xA0000000, 0xDDDDDDDD, 0xE0000000,
0xEEEEEEEE, 0xFFFFFFFD, 0xF0000000, 0x007FFFFF, 0x003FFFFF, 0x001FFFFF,
0x000FFFFF, 0x0007FFFF, 0x0003FFFF, 0x0001FFFF, 0x0000FFFF, 0x00007FFF,
0x00003FFF, 0x00001FFF, 0x00000FFF, 0x000007FF, 0x000003FF, 0x000001FF,
// Bit pattern of a quiet NaN and signaling NaN, with or without
// additional payload.
0x7F876543};
0x7FC00000, 0x7F800000, 0x7FFFFFFF, 0x7F876543};
static constexpr base::Vector<const uint32_t> uint32_vector() {
return base::ArrayVector(uint32_array);

View File

@ -1930,6 +1930,24 @@ TEST_F(InstructionSelectorTest, OvfBranchWithImmediateOnLeft) {
}
}
TEST_F(InstructionSelectorTest, OvfValMulImmediateOnRight) {
TRACED_FORRANGE(int32_t, shift, 0, 30) {
StreamBuilder m(this, MachineType::Int32(), MachineType::Int32());
m.Return(m.Projection(0, m.Int32MulWithOverflow(m.Int32Constant(1 << shift),
m.Parameter(0))));
Stream s = m.Build();
ASSERT_EQ(2U, s.size());
EXPECT_EQ(kArm64Sbfiz, s[0]->arch_opcode());
EXPECT_EQ(kArm64Cmp, s[1]->arch_opcode());
ASSERT_EQ(3U, s[0]->InputCount());
EXPECT_EQ(shift, s.ToInt32(s[0]->InputAt(1)));
EXPECT_LE(1U, s[0]->OutputCount());
EXPECT_EQ(32, s.ToInt32(s[0]->InputAt(2)));
EXPECT_EQ(kFlags_none, s[0]->flags_mode());
}
}
// -----------------------------------------------------------------------------
// Shift instructions.