[wasm-simd][arm64] Optimize f64x2 dup + mul into fmul by element
This is similar to the optimization for f32x4 dup + mul in
https://crrev.com/c/2719083. Refactor the pattern-matching code into a
helper function that returns a struct with all the necessary fields to
emit the optimized fmul-by-element instruction. Add similar unittests
and a negative test as well.

Bug: v8:11257
Change-Id: I79ab0bc783f43397191a54bf6fa736dd4dc8d807
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2728428
Reviewed-by: Andreas Haas <ahaas@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73164}
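For intuition, here is a minimal scalar sketch (not part of the CL) of why the rewrite is sound: multiplying by a shuffle that duplicates one lane is the same as multiplying every lane by that one element, which is exactly what the AArch64 by-element form "fmul v.2d, v.2d, v.d[lane]" computes in a single instruction. The names below are illustrative only.

#include <array>
#include <cassert>

using F64x2 = std::array<double, 2>;

// f64x2.mul(x, dup(y[lane])): the pattern the instruction selector matches.
F64x2 MulWithDup(const F64x2& x, const F64x2& y, int lane) {
  F64x2 dup = {y[lane], y[lane]};  // shuffle(x, y, indices) acting as a dup
  return {x[0] * dup[0], x[1] * dup[1]};
}

// Scalar model of the fused form: fmul v.2d, v.2d, v.d[lane].
F64x2 FmulByElement(const F64x2& x, const F64x2& y, int lane) {
  return {x[0] * y[lane], x[1] * y[lane]};
}

int main() {
  const F64x2 x = {1.5, -2.0}, y = {3.0, 4.0};
  for (int lane = 0; lane < 2; ++lane) {
    assert(MulWithDup(x, y, lane) == FmulByElement(x, y, lane));
  }
  return 0;
}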
@@ -2180,6 +2180,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
               i.InputSimd128Register(1).S(), i.InputInt8(2));
       break;
     }
+    case kArm64F64x2MulElement: {
+      __ Fmul(i.OutputSimd128Register().V2D(), i.InputSimd128Register(0).V2D(),
+              i.InputSimd128Register(1).D(), i.InputInt8(2));
+      break;
+    }
     case kArm64F32x4Ne: {
       VRegister dst = i.OutputSimd128Register().V4S();
       __ Fcmeq(dst, i.InputSimd128Register(0).V4S(),
@@ -187,6 +187,7 @@ namespace compiler {
   V(Arm64F64x2Add) \
   V(Arm64F64x2Sub) \
   V(Arm64F64x2Mul) \
+  V(Arm64F64x2MulElement) \
   V(Arm64F64x2Div) \
   V(Arm64F64x2Min) \
   V(Arm64F64x2Max) \
@@ -152,6 +152,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kArm64F64x2Add:
     case kArm64F64x2Sub:
     case kArm64F64x2Mul:
+    case kArm64F64x2MulElement:
     case kArm64F64x2Div:
     case kArm64F64x2Min:
     case kArm64F64x2Max:
@@ -3457,7 +3457,6 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
 #define SIMD_BINOP_LIST(V) \
   V(F64x2Add, kArm64F64x2Add) \
   V(F64x2Sub, kArm64F64x2Sub) \
-  V(F64x2Mul, kArm64F64x2Mul) \
   V(F64x2Div, kArm64F64x2Div) \
   V(F64x2Min, kArm64F64x2Min) \
   V(F64x2Max, kArm64F64x2Max) \
@@ -3618,11 +3617,23 @@ using ShuffleMatcher =
     ValueMatcher<S128ImmediateParameter, IrOpcode::kI8x16Shuffle>;
 using BinopWithShuffleMatcher = BinopMatcher<ShuffleMatcher, ShuffleMatcher>;
 
-void InstructionSelector::VisitF32x4Mul(Node* node) {
+namespace {
+// Struct holding the result of pattern-matching a mul+dup.
+struct MulWithDupResult {
+  Node* input;     // Node holding the vector elements.
+  Node* dup_node;  // Node holding the lane to multiply.
+  int index;
+  // Pattern-match is successful if dup_node is set.
+  explicit operator bool() const { return dup_node != nullptr; }
+};
+
+template <int LANES>
+MulWithDupResult TryMatchMulWithDup(Node* node) {
   // Pattern match:
   // f32x4.mul(x, shuffle(x, y, indices)) => f32x4.mul(x, y, laneidx)
+  // f64x2.mul(x, shuffle(x, y, indices)) => f64x2.mul(x, y, laneidx)
   // where shuffle(x, y, indices) = dup(x[laneidx]) or dup(y[laneidx])
-  // f32x4.mul is commutative, so use BinopMatcher.
+  // f32x4.mul and f64x2.mul are commutative, so use BinopMatcher.
   BinopWithShuffleMatcher m = BinopWithShuffleMatcher(node);
   ShuffleMatcher left = m.left();
   ShuffleMatcher right = m.right();
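A note on the C++ idiom used in the next hunk: "if (MulWithDupResult result = TryMatchMulWithDup<4>(node))" declares the result inside the if condition, which is contextually converted to bool via the struct's explicit operator bool, so the fused instruction is emitted only on a successful match. A stand-alone illustration (names are hypothetical, not from the CL):

#include <cstdio>

struct MatchResult {
  int value = 0;
  bool matched = false;
  // "explicit" still permits the contextual conversion in an if condition.
  explicit operator bool() const { return matched; }
};

MatchResult TryMatch(int x) { return {x * 2, x > 0}; }

int main() {
  // The body runs only when the pattern match succeeded.
  if (MatchResult result = TryMatch(21)) {
    std::printf("matched: %d\n", result.value);  // prints "matched: 42"
  } else {
    std::printf("no match\n");
  }
  return 0;
}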
@@ -3631,33 +3642,52 @@ void InstructionSelector::VisitF32x4Mul(Node* node) {
   Node* dup_node = nullptr;
 
   int index = 0;
-  // TODO(zhin): We can canonicalize first to avoid checking index < 4.
-  // e.g. shuffle(x, y, [16, 17, 18, 19...]) => shuffle(y, y, [0, 1, 2, 3]...).
-  // But doing so can mutate the inputs of the shuffle node without updating the
-  // shuffle immediates themselves. Fix that before we canonicalize here.
-  // We don't want CanCover here because in many use cases, the shuffle is
-  // generated early in the function, but the f32x4.mul happens in a loop, which
-  // won't cover the shuffle since they are different basic blocks.
-  if (left.HasResolvedValue() && wasm::SimdShuffle::TryMatchSplat<4>(
+  // TODO(zhin): We can canonicalize first to avoid checking index < LANES.
+  // e.g. shuffle(x, y, [16, 17, 18, 19...]) => shuffle(y, y, [0, 1, 2,
+  // 3]...). But doing so can mutate the inputs of the shuffle node without
+  // updating the shuffle immediates themselves. Fix that before we
+  // canonicalize here. We don't want CanCover here because in many use cases,
+  // the shuffle is generated early in the function, but the f32x4.mul happens
+  // in a loop, which won't cover the shuffle since they are different basic
+  // blocks.
+  if (left.HasResolvedValue() && wasm::SimdShuffle::TryMatchSplat<LANES>(
                                      left.ResolvedValue().data(), &index)) {
-    dup_node = left.node()->InputAt(index < 4 ? 0 : 1);
+    dup_node = left.node()->InputAt(index < LANES ? 0 : 1);
     input = right.node();
   } else if (right.HasResolvedValue() &&
-             wasm::SimdShuffle::TryMatchSplat<4>(right.ResolvedValue().data(),
-                                                 &index)) {
-    dup_node = right.node()->InputAt(index < 4 ? 0 : 1);
+             wasm::SimdShuffle::TryMatchSplat<LANES>(
+                 right.ResolvedValue().data(), &index)) {
+    dup_node = right.node()->InputAt(index < LANES ? 0 : 1);
     input = left.node();
   }
 
-  if (dup_node == nullptr) {
-    return VisitRRR(this, kArm64F32x4Mul, node);
-  }
-
   // Canonicalization would get rid of this too.
-  index %= 4;
+  index %= LANES;
 
-  Arm64OperandGenerator g(this);
-  Emit(kArm64F32x4MulElement, g.DefineAsRegister(node), g.UseRegister(input),
-       g.UseRegister(dup_node), g.UseImmediate(index));
+  return {input, dup_node, index};
+}
+}  // namespace
+
+void InstructionSelector::VisitF32x4Mul(Node* node) {
+  if (MulWithDupResult result = TryMatchMulWithDup<4>(node)) {
+    Arm64OperandGenerator g(this);
+    Emit(kArm64F32x4MulElement, g.DefineAsRegister(node),
+         g.UseRegister(result.input), g.UseRegister(result.dup_node),
+         g.UseImmediate(result.index));
+  } else {
+    return VisitRRR(this, kArm64F32x4Mul, node);
+  }
+}
+
+void InstructionSelector::VisitF64x2Mul(Node* node) {
+  if (MulWithDupResult result = TryMatchMulWithDup<2>(node)) {
+    Arm64OperandGenerator g(this);
+    Emit(kArm64F64x2MulElement, g.DefineAsRegister(node),
+         g.UseRegister(result.input), g.UseRegister(result.dup_node),
+         g.UseImmediate(result.index));
+  } else {
+    return VisitRRR(this, kArm64F64x2Mul, node);
+  }
+}
 
 void InstructionSelector::VisitI64x2Mul(Node* node) {
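For readers unfamiliar with TryMatchSplat: a 16-byte i8x16.shuffle mask indexes into the 32-byte concatenation of both inputs, and a dup of an N-byte lane shows up as the same contiguous N-byte run repeated across the whole mask. Below is a simplified stand-in model (my sketch, not V8's actual implementation) that also exercises the all-zeros mask the negative tests rely on:

#include <array>
#include <cassert>
#include <cstdint>

// Simplified model: with LANES lanes, each lane is 16 / LANES bytes wide.
// A splat duplicates one source lane (0..2*LANES-1, spanning both inputs),
// so every lane of the mask must be the same contiguous byte run.
template <int LANES>
bool TryMatchSplatModel(const std::array<uint8_t, 16>& mask, int* index) {
  constexpr int kBytes = 16 / LANES;
  const int first = mask[0] / kBytes;
  for (int lane = 0; lane < LANES; ++lane) {
    for (int b = 0; b < kBytes; ++b) {
      if (mask[lane * kBytes + b] != first * kBytes + b) return false;
    }
  }
  *index = first;  // index < LANES selects input 0, otherwise input 1.
  return true;
}

int main() {
  int index = 0;
  // The first f64x2 test mask: bytes 0..7 repeated = dup of lane 0, input 0.
  std::array<uint8_t, 16> dup0 = {0, 1, 2, 3, 4, 5, 6, 7,
                                  0, 1, 2, 3, 4, 5, 6, 7};
  assert(TryMatchSplatModel<2>(dup0, &index) && index == 0);

  // The all-zeros mask from the negative tests: a dup of byte 0 when viewed
  // as 16 one-byte lanes, but not a dup of any 8-byte (f64x2) lane.
  std::array<uint8_t, 16> zeros{};
  assert(TryMatchSplatModel<16>(zeros, &index) && index == 0);
  assert(!TryMatchSplatModel<2>(zeros, &index));
  return 0;
}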
@@ -2267,10 +2267,10 @@ const SIMDMulDupInst kSIMDF32x4MulDuplInstructions[] = {
     },
 };
 
-using InstructionSelectorSimdMulWithDupTest =
+using InstructionSelectorSimdF32x4MulWithDupTest =
     InstructionSelectorTestWithParam<SIMDMulDupInst>;
 
-TEST_P(InstructionSelectorSimdMulWithDupTest, MulWithDup) {
+TEST_P(InstructionSelectorSimdF32x4MulWithDupTest, MulWithDup) {
   const SIMDMulDupInst param = GetParam();
   const MachineType type = MachineType::Simd128();
   {
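For orientation in the test hunks: SIMDMulDupInst is defined earlier in the unittest file and is not shown in this diff. Judging from how the params are used below, it bundles the shuffle mask, the expected lane immediate, and which shuffle input holds the duplicated lane. A reconstructed shape (an assumption from usage, not copied from the source):

#include <cstdint>

// Assumed layout, reconstructed from the accesses param.shuffle, param.lane,
// and param.shuffle_input_index in the tests; the real definition may differ.
struct SIMDMulDupInst {
  const uint8_t shuffle[16];  // i8x16.shuffle immediates forming the dup
  int32_t lane;               // expected lane immediate (after index %= LANES)
  int shuffle_input_index;    // 0 or 1: which shuffle input carries the lane
};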
@@ -2306,10 +2306,10 @@ TEST_P(InstructionSelectorSimdMulWithDupTest, MulWithDup) {
   }
 }
 
 INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
-                         InstructionSelectorSimdMulWithDupTest,
+                         InstructionSelectorSimdF32x4MulWithDupTest,
                          ::testing::ValuesIn(kSIMDF32x4MulDuplInstructions));
 
-TEST_F(InstructionSelectorTest, SimdMulWithDupNegativeTest) {
+TEST_F(InstructionSelectorTest, SimdF32x4MulWithDupNegativeTest) {
   const MachineType type = MachineType::Simd128();
   // Check that optimization does not match when the shuffle is not a f32x4.dup.
   const uint8_t mask[kSimd128Size] = {0};
@@ -2330,6 +2330,92 @@ TEST_F(InstructionSelectorTest, SimdMulWithDupNegativeTest) {
   }
 }
 
+const SIMDMulDupInst kSIMDF64x2MulDuplInstructions[] = {
+    {
+        {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7},
+        0,
+        0,
+    },
+    {
+        {8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15},
+        1,
+        0,
+    },
+    {
+        {16, 17, 18, 19, 20, 21, 22, 23, 16, 17, 18, 19, 20, 21, 22, 23},
+        0,
+        1,
+    },
+    {
+        {24, 25, 26, 27, 28, 29, 30, 31, 24, 25, 26, 27, 28, 29, 30, 31},
+        1,
+        1,
+    },
+};
+
+using InstructionSelectorSimdF64x2MulWithDupTest =
+    InstructionSelectorTestWithParam<SIMDMulDupInst>;
+
+TEST_P(InstructionSelectorSimdF64x2MulWithDupTest, MulWithDup) {
+  const SIMDMulDupInst param = GetParam();
+  const MachineType type = MachineType::Simd128();
+  {
+    StreamBuilder m(this, type, type, type, type);
+    Node* shuffle = m.AddNode(m.machine()->I8x16Shuffle(param.shuffle),
+                              m.Parameter(0), m.Parameter(1));
+    m.Return(m.AddNode(m.machine()->F64x2Mul(), m.Parameter(2), shuffle));
+    Stream s = m.Build();
+    ASSERT_EQ(1U, s.size());
+    EXPECT_EQ(kArm64F64x2MulElement, s[0]->arch_opcode());
+    EXPECT_EQ(3U, s[0]->InputCount());
+    EXPECT_EQ(param.lane, s.ToInt32(s[0]->InputAt(2)));
+    EXPECT_EQ(1U, s[0]->OutputCount());
+    EXPECT_EQ(s.ToVreg(m.Parameter(param.shuffle_input_index)),
+              s.ToVreg(s[0]->InputAt(1)));
+  }
+
+  // Multiplication operator should be commutative, so test shuffle op as lhs.
+  {
+    StreamBuilder m(this, type, type, type, type);
+    Node* shuffle = m.AddNode(m.machine()->I8x16Shuffle(param.shuffle),
+                              m.Parameter(0), m.Parameter(1));
+    m.Return(m.AddNode(m.machine()->F64x2Mul(), shuffle, m.Parameter(2)));
+    Stream s = m.Build();
+    ASSERT_EQ(1U, s.size());
+    EXPECT_EQ(kArm64F64x2MulElement, s[0]->arch_opcode());
+    EXPECT_EQ(3U, s[0]->InputCount());
+    EXPECT_EQ(param.lane, s.ToInt32(s[0]->InputAt(2)));
+    EXPECT_EQ(1U, s[0]->OutputCount());
+    EXPECT_EQ(s.ToVreg(m.Parameter(param.shuffle_input_index)),
+              s.ToVreg(s[0]->InputAt(1)));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
+                         InstructionSelectorSimdF64x2MulWithDupTest,
+                         ::testing::ValuesIn(kSIMDF64x2MulDuplInstructions));
+
+TEST_F(InstructionSelectorTest, SimdF64x2MulWithDupNegativeTest) {
+  const MachineType type = MachineType::Simd128();
+  // Check that optimization does not match when the shuffle is not a f64x2.dup.
+  const uint8_t mask[kSimd128Size] = {0};
+  {
+    StreamBuilder m(this, type, type, type, type);
+    Node* shuffle = m.AddNode((m.machine()->I8x16Shuffle(mask)), m.Parameter(0),
+                              m.Parameter(1));
+    m.Return(m.AddNode(m.machine()->F64x2Mul(), m.Parameter(2), shuffle));
+    Stream s = m.Build();
+    ASSERT_EQ(2U, s.size());
+    // The shuffle is a i8x16.dup of lane 0.
+    EXPECT_EQ(kArm64S128Dup, s[0]->arch_opcode());
+    EXPECT_EQ(3U, s[0]->InputCount());
+    EXPECT_EQ(kArm64F64x2Mul, s[1]->arch_opcode());
+    EXPECT_EQ(1U, s[0]->OutputCount());
+    EXPECT_EQ(2U, s[1]->InputCount());
+    EXPECT_EQ(1U, s[1]->OutputCount());
+  }
+}
+
 TEST_F(InstructionSelectorTest, Int32MulWithImmediate) {
   // x * (2^k + 1) -> x + (x << k)
   TRACED_FORRANGE(int32_t, k, 1, 30) {