[wasm-simd][arm64] Optimize f64x2 dup + mul into fmul by element

This is similar to the optimization for f32x4 dup + mul in
https://crrev.com/c/2719083. Refactor the pattern-matching code into a
helper function that returns a struct with all the necessary fields to
emit the optimized fmul by element instruction.

Add similar unittests and a negative test as well.

Bug: v8:11257
Change-Id: I79ab0bc783f43397191a54bf6fa736dd4dc8d807
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2728428
Reviewed-by: Andreas Haas <ahaas@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73164}
Author: Ng Zhi An (committed by Commit Bot), 2021-03-02 14:25:19 -08:00
Commit: 48d724098e (parent: e9873bf129)
5 changed files with 148 additions and 25 deletions
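For context, a minimal sketch of the strength reduction this enables (not part of the commit; the register names and the lane index are illustrative assumptions, written in the MacroAssembler style used by the code generator below):

    // Before: the splatting shuffle is selected as a separate S128Dup, followed
    // by a full-vector multiply (assuming the dup reads lane 1 of v2).
    __ Dup(v3.V2D(), v2.V2D(), 1);           // v3 = {v2[1], v2[1]}
    __ Fmul(v0.V2D(), v1.V2D(), v3.V2D());   // f64x2.mul

    // After: the dup is folded into the multiply, using the by-element form.
    __ Fmul(v0.V2D(), v1.V2D(), v2.D(), 1);  // fmul v0.2d, v1.2d, v2.d[1]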


@@ -2180,6 +2180,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1).S(), i.InputInt8(2));
break;
}
case kArm64F64x2MulElement: {
__ Fmul(i.OutputSimd128Register().V2D(), i.InputSimd128Register(0).V2D(),
i.InputSimd128Register(1).D(), i.InputInt8(2));
break;
}
case kArm64F32x4Ne: {
VRegister dst = i.OutputSimd128Register().V4S();
__ Fcmeq(dst, i.InputSimd128Register(0).V4S(),
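As a quick reference for the case added above (an illustrative note, not part of the commit), the by-element form multiplies every lane of the first operand by a single lane of the second, with the lane taken from i.InputInt8(2):

    // fmul vd.2d, vn.2d, vm.d[i]   where 0 <= i <= 1
    //   vd[0] = vn[0] * vm[i]
    //   vd[1] = vn[1] * vm[i]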


@@ -187,6 +187,7 @@ namespace compiler {
V(Arm64F64x2Add) \
V(Arm64F64x2Sub) \
V(Arm64F64x2Mul) \
V(Arm64F64x2MulElement) \
V(Arm64F64x2Div) \
V(Arm64F64x2Min) \
V(Arm64F64x2Max) \


@@ -152,6 +152,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64F64x2Add:
case kArm64F64x2Sub:
case kArm64F64x2Mul:
case kArm64F64x2MulElement:
case kArm64F64x2Div:
case kArm64F64x2Min:
case kArm64F64x2Max:


@@ -3457,7 +3457,6 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
#define SIMD_BINOP_LIST(V) \
V(F64x2Add, kArm64F64x2Add) \
V(F64x2Sub, kArm64F64x2Sub) \
V(F64x2Mul, kArm64F64x2Mul) \
V(F64x2Div, kArm64F64x2Div) \
V(F64x2Min, kArm64F64x2Min) \
V(F64x2Max, kArm64F64x2Max) \
@@ -3618,11 +3617,23 @@ using ShuffleMatcher =
ValueMatcher<S128ImmediateParameter, IrOpcode::kI8x16Shuffle>;
using BinopWithShuffleMatcher = BinopMatcher<ShuffleMatcher, ShuffleMatcher>;
void InstructionSelector::VisitF32x4Mul(Node* node) {
namespace {
// Struct holding the result of pattern-matching a mul+dup.
struct MulWithDupResult {
Node* input; // Node holding the vector elements.
Node* dup_node; // Node holding the lane to multiply.
int index;
// Pattern-match is successful if dup_node is set.
explicit operator bool() const { return dup_node != nullptr; }
};
template <int LANES>
MulWithDupResult TryMatchMulWithDup(Node* node) {
// Pattern match:
// f32x4.mul(x, shuffle(x, y, indices)) => f32x4.mul(x, y, laneidx)
// f64x2.mul(x, shuffle(x, y, indices)) => f64x2.mul(x, y, laneidx)
// where shuffle(x, y, indices) = dup(x[laneidx]) or dup(y[laneidx])
// f32x4.mul is commutative, so use BinopMatcher.
// f32x4.mul and f64x2.mul are commutative, so use BinopMatcher.
BinopWithShuffleMatcher m = BinopWithShuffleMatcher(node);
ShuffleMatcher left = m.left();
ShuffleMatcher right = m.right();
@@ -3631,33 +3642,52 @@ void InstructionSelector::VisitF32x4Mul(Node* node) {
Node* dup_node = nullptr;
int index = 0;
// TODO(zhin): We can canonicalize first to avoid checking index < 4.
// e.g. shuffle(x, y, [16, 17, 18, 19...]) => shuffle(y, y, [0, 1, 2, 3]...).
// But doing so can mutate the inputs of the shuffle node without updating the
// shuffle immediates themselves. Fix that before we canonicalize here.
// We don't want CanCover here because in many use cases, the shuffle is
// generated early in the function, but the f32x4.mul happens in a loop, which
// won't cover the shuffle since they are different basic blocks.
if (left.HasResolvedValue() && wasm::SimdShuffle::TryMatchSplat<4>(
// TODO(zhin): We can canonicalize first to avoid checking index < LANES.
// e.g. shuffle(x, y, [16, 17, 18, 19...]) => shuffle(y, y, [0, 1, 2,
// 3]...). But doing so can mutate the inputs of the shuffle node without
// updating the shuffle immediates themselves. Fix that before we
// canonicalize here. We don't want CanCover here because in many use cases,
// the shuffle is generated early in the function, but the f32x4.mul happens
// in a loop, which won't cover the shuffle since they are different basic
// blocks.
if (left.HasResolvedValue() && wasm::SimdShuffle::TryMatchSplat<LANES>(
left.ResolvedValue().data(), &index)) {
dup_node = left.node()->InputAt(index < 4 ? 0 : 1);
dup_node = left.node()->InputAt(index < LANES ? 0 : 1);
input = right.node();
} else if (right.HasResolvedValue() &&
wasm::SimdShuffle::TryMatchSplat<4>(right.ResolvedValue().data(),
&index)) {
dup_node = right.node()->InputAt(index < 4 ? 0 : 1);
wasm::SimdShuffle::TryMatchSplat<LANES>(
right.ResolvedValue().data(), &index)) {
dup_node = right.node()->InputAt(index < LANES ? 0 : 1);
input = left.node();
}
if (dup_node == nullptr) {
// Canonicalization would get rid of this too.
index %= LANES;
return {input, dup_node, index};
}
} // namespace
void InstructionSelector::VisitF32x4Mul(Node* node) {
if (MulWithDupResult result = TryMatchMulWithDup<4>(node)) {
Arm64OperandGenerator g(this);
Emit(kArm64F32x4MulElement, g.DefineAsRegister(node),
g.UseRegister(result.input), g.UseRegister(result.dup_node),
g.UseImmediate(result.index));
} else {
return VisitRRR(this, kArm64F32x4Mul, node);
}
}
// Canonicalization would get rid of this too.
index %= 4;
Arm64OperandGenerator g(this);
Emit(kArm64F32x4MulElement, g.DefineAsRegister(node), g.UseRegister(input),
g.UseRegister(dup_node), g.UseImmediate(index));
void InstructionSelector::VisitF64x2Mul(Node* node) {
if (MulWithDupResult result = TryMatchMulWithDup<2>(node)) {
Arm64OperandGenerator g(this);
Emit(kArm64F64x2MulElement, g.DefineAsRegister(node),
g.UseRegister(result.input), g.UseRegister(result.dup_node),
g.UseImmediate(result.index));
} else {
return VisitRRR(this, kArm64F64x2Mul, node);
}
}
void InstructionSelector::VisitI64x2Mul(Node* node) {
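To make the LANES-generic matching concrete, here is a small standalone sketch (a simplification written for illustration; an assumption about, not a copy of, what wasm::SimdShuffle::TryMatchSplat does) of how a 16-byte shuffle mask is classified as a lane splat, and how the selector then uses the resulting index:

    #include <cstdint>

    // A mask is a splat if every destination lane copies the same source lane.
    // The returned splat index is in [0, 2 * LANES): values below LANES refer to
    // the shuffle's first input and the rest to its second input, which is what
    // the selector's "index < LANES ? 0 : 1" and "index %= LANES" rely on.
    template <int LANES>
    bool IsLaneSplat(const uint8_t shuffle[16], int* splat_index) {
      constexpr int kBytesPerLane = 16 / LANES;
      if (shuffle[0] % kBytesPerLane != 0) return false;  // must be lane-aligned
      for (int i = 0; i < 16; ++i) {
        if (shuffle[i] != shuffle[0] + (i % kBytesPerLane)) return false;
      }
      *splat_index = shuffle[0] / kBytesPerLane;
      return true;
    }

    // For f64x2 (LANES == 2): mask {8..15, 8..15} gives index 1, i.e. lane 1 of
    // the shuffle's first input; mask {16..23, 16..23} gives index 2, i.e. the
    // second input, which "index %= LANES" folds back to lane 0.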


@@ -2267,10 +2267,10 @@ const SIMDMulDupInst kSIMDF32x4MulDuplInstructions[] = {
},
};
using InstructionSelectorSimdMulWithDupTest =
using InstructionSelectorSimdF32x4MulWithDupTest =
InstructionSelectorTestWithParam<SIMDMulDupInst>;
TEST_P(InstructionSelectorSimdMulWithDupTest, MulWithDup) {
TEST_P(InstructionSelectorSimdF32x4MulWithDupTest, MulWithDup) {
const SIMDMulDupInst param = GetParam();
const MachineType type = MachineType::Simd128();
{
@@ -2306,10 +2306,10 @@ TEST_P(InstructionSelectorSimdMulWithDupTest, MulWithDup) {
}
INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSimdMulWithDupTest,
InstructionSelectorSimdF32x4MulWithDupTest,
::testing::ValuesIn(kSIMDF32x4MulDuplInstructions));
TEST_F(InstructionSelectorTest, SimdMulWithDupNegativeTest) {
TEST_F(InstructionSelectorTest, SimdF32x4MulWithDupNegativeTest) {
const MachineType type = MachineType::Simd128();
// Check that optimization does not match when the shuffle is not a f32x4.dup.
const uint8_t mask[kSimd128Size] = {0};
@@ -2330,6 +2330,92 @@ TEST_F(InstructionSelectorTest, SimdMulWithDupNegativeTest) {
}
}
const SIMDMulDupInst kSIMDF64x2MulDuplInstructions[] = {
{
{0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7},
0,
0,
},
{
{8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15},
1,
0,
},
{
{16, 17, 18, 19, 20, 21, 22, 23, 16, 17, 18, 19, 20, 21, 22, 23},
0,
1,
},
{
{24, 25, 26, 27, 28, 29, 30, 31, 24, 25, 26, 27, 28, 29, 30, 31},
1,
1,
},
};
using InstructionSelectorSimdF64x2MulWithDupTest =
InstructionSelectorTestWithParam<SIMDMulDupInst>;
TEST_P(InstructionSelectorSimdF64x2MulWithDupTest, MulWithDup) {
const SIMDMulDupInst param = GetParam();
const MachineType type = MachineType::Simd128();
{
StreamBuilder m(this, type, type, type, type);
Node* shuffle = m.AddNode(m.machine()->I8x16Shuffle(param.shuffle),
m.Parameter(0), m.Parameter(1));
m.Return(m.AddNode(m.machine()->F64x2Mul(), m.Parameter(2), shuffle));
Stream s = m.Build();
ASSERT_EQ(1U, s.size());
EXPECT_EQ(kArm64F64x2MulElement, s[0]->arch_opcode());
EXPECT_EQ(3U, s[0]->InputCount());
EXPECT_EQ(param.lane, s.ToInt32(s[0]->InputAt(2)));
EXPECT_EQ(1U, s[0]->OutputCount());
EXPECT_EQ(s.ToVreg(m.Parameter(param.shuffle_input_index)),
s.ToVreg(s[0]->InputAt(1)));
}
// Multiplication operator should be commutative, so test shuffle op as lhs.
{
StreamBuilder m(this, type, type, type, type);
Node* shuffle = m.AddNode(m.machine()->I8x16Shuffle(param.shuffle),
m.Parameter(0), m.Parameter(1));
m.Return(m.AddNode(m.machine()->F64x2Mul(), shuffle, m.Parameter(2)));
Stream s = m.Build();
ASSERT_EQ(1U, s.size());
EXPECT_EQ(kArm64F64x2MulElement, s[0]->arch_opcode());
EXPECT_EQ(3U, s[0]->InputCount());
EXPECT_EQ(param.lane, s.ToInt32(s[0]->InputAt(2)));
EXPECT_EQ(1U, s[0]->OutputCount());
EXPECT_EQ(s.ToVreg(m.Parameter(param.shuffle_input_index)),
s.ToVreg(s[0]->InputAt(1)));
}
}
INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSimdF64x2MulWithDupTest,
::testing::ValuesIn(kSIMDF64x2MulDuplInstructions));
TEST_F(InstructionSelectorTest, SimdF64x2MulWithDupNegativeTest) {
const MachineType type = MachineType::Simd128();
// Check that optimization does not match when the shuffle is not a f64x2.dup.
const uint8_t mask[kSimd128Size] = {0};
{
StreamBuilder m(this, type, type, type, type);
Node* shuffle = m.AddNode((m.machine()->I8x16Shuffle(mask)), m.Parameter(0),
m.Parameter(1));
m.Return(m.AddNode(m.machine()->F64x2Mul(), m.Parameter(2), shuffle));
Stream s = m.Build();
ASSERT_EQ(2U, s.size());
// The shuffle is a i8x16.dup of lane 0.
EXPECT_EQ(kArm64S128Dup, s[0]->arch_opcode());
EXPECT_EQ(3U, s[0]->InputCount());
EXPECT_EQ(kArm64F64x2Mul, s[1]->arch_opcode());
EXPECT_EQ(1U, s[0]->OutputCount());
EXPECT_EQ(2U, s[1]->InputCount());
EXPECT_EQ(1U, s[1]->OutputCount());
}
}
TEST_F(InstructionSelectorTest, Int32MulWithImmediate) {
// x * (2^k + 1) -> x + (x << k)
TRACED_FORRANGE(int32_t, k, 1, 30) {