[wasm-simd][arm64] Optimize f64x2 dup + mul into fmul by element
This is similar to the optimization for f32x4 dup + mul in
https://crrev.com/c/2719083. Refactor the pattern-matching code into a
helper function that returns a struct with all the necessary fields to
emit the optimized fmul-by-element instruction. Add similar unittests
and a negative test as well.

Bug: v8:11257
Change-Id: I79ab0bc783f43397191a54bf6fa736dd4dc8d807
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2728428
Reviewed-by: Andreas Haas <ahaas@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73164}
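For intuition, here is a minimal scalar sketch (not part of the CL) of why the rewrite is sound: multiplying by a shuffle that duplicates one lane is the same as multiplying every lane by that one element, which is exactly what the AArch64 by-element form "fmul v.2d, v.2d, v.d[lane]" computes in a single instruction. The names below are illustrative only.

#include <array>
#include <cassert>

using F64x2 = std::array<double, 2>;

// f64x2.mul(x, dup(y[lane])): the pattern the instruction selector matches.
F64x2 MulWithDup(const F64x2& x, const F64x2& y, int lane) {
  F64x2 dup = {y[lane], y[lane]};  // shuffle(x, y, indices) acting as a dup
  return {x[0] * dup[0], x[1] * dup[1]};
}

// Scalar model of the fused form: fmul v.2d, v.2d, v.d[lane].
F64x2 FmulByElement(const F64x2& x, const F64x2& y, int lane) {
  return {x[0] * y[lane], x[1] * y[lane]};
}

int main() {
  const F64x2 x = {1.5, -2.0}, y = {3.0, 4.0};
  for (int lane = 0; lane < 2; ++lane) {
    assert(MulWithDup(x, y, lane) == FmulByElement(x, y, lane));
  }
  return 0;
}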
@@ -2180,6 +2180,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
               i.InputSimd128Register(1).S(), i.InputInt8(2));
       break;
     }
+    case kArm64F64x2MulElement: {
+      __ Fmul(i.OutputSimd128Register().V2D(), i.InputSimd128Register(0).V2D(),
+              i.InputSimd128Register(1).D(), i.InputInt8(2));
+      break;
+    }
     case kArm64F32x4Ne: {
       VRegister dst = i.OutputSimd128Register().V4S();
       __ Fcmeq(dst, i.InputSimd128Register(0).V4S(),
@@ -187,6 +187,7 @@ namespace compiler {
   V(Arm64F64x2Add) \
   V(Arm64F64x2Sub) \
   V(Arm64F64x2Mul) \
+  V(Arm64F64x2MulElement) \
   V(Arm64F64x2Div) \
   V(Arm64F64x2Min) \
   V(Arm64F64x2Max) \
@@ -152,6 +152,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kArm64F64x2Add:
     case kArm64F64x2Sub:
     case kArm64F64x2Mul:
+    case kArm64F64x2MulElement:
     case kArm64F64x2Div:
     case kArm64F64x2Min:
     case kArm64F64x2Max:
@@ -3457,7 +3457,6 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
 #define SIMD_BINOP_LIST(V) \
   V(F64x2Add, kArm64F64x2Add) \
   V(F64x2Sub, kArm64F64x2Sub) \
-  V(F64x2Mul, kArm64F64x2Mul) \
   V(F64x2Div, kArm64F64x2Div) \
   V(F64x2Min, kArm64F64x2Min) \
   V(F64x2Max, kArm64F64x2Max) \
@@ -3618,11 +3617,23 @@ using ShuffleMatcher =
     ValueMatcher<S128ImmediateParameter, IrOpcode::kI8x16Shuffle>;
 using BinopWithShuffleMatcher = BinopMatcher<ShuffleMatcher, ShuffleMatcher>;
 
-void InstructionSelector::VisitF32x4Mul(Node* node) {
+namespace {
+// Struct holding the result of pattern-matching a mul+dup.
+struct MulWithDupResult {
+  Node* input;     // Node holding the vector elements.
+  Node* dup_node;  // Node holding the lane to multiply.
+  int index;
+  // Pattern-match is successful if dup_node is set.
+  explicit operator bool() const { return dup_node != nullptr; }
+};
+
+template <int LANES>
+MulWithDupResult TryMatchMulWithDup(Node* node) {
   // Pattern match:
   // f32x4.mul(x, shuffle(x, y, indices)) => f32x4.mul(x, y, laneidx)
+  // f64x2.mul(x, shuffle(x, y, indices)) => f64x2.mul(x, y, laneidx)
   // where shuffle(x, y, indices) = dup(x[laneidx]) or dup(y[laneidx])
-  // f32x4.mul is commutative, so use BinopMatcher.
+  // f32x4.mul and f64x2.mul are commutative, so use BinopMatcher.
   BinopWithShuffleMatcher m = BinopWithShuffleMatcher(node);
   ShuffleMatcher left = m.left();
   ShuffleMatcher right = m.right();
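A note on the C++ idiom used in the next hunk: "if (MulWithDupResult result = TryMatchMulWithDup<4>(node))" declares the result inside the if condition, which is contextually converted to bool via the struct's explicit operator bool, so the fused instruction is emitted only on a successful match. A stand-alone illustration (names are hypothetical, not from the CL):

#include <cstdio>

struct MatchResult {
  int value = 0;
  bool matched = false;
  // "explicit" still permits the contextual conversion in an if condition.
  explicit operator bool() const { return matched; }
};

MatchResult TryMatch(int x) { return {x * 2, x > 0}; }

int main() {
  // The body runs only when the pattern match succeeded.
  if (MatchResult result = TryMatch(21)) {
    std::printf("matched: %d\n", result.value);  // prints "matched: 42"
  } else {
    std::printf("no match\n");
  }
  return 0;
}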
@@ -3631,33 +3642,52 @@ void InstructionSelector::VisitF32x4Mul(Node* node) {
   Node* dup_node = nullptr;
 
   int index = 0;
-  // TODO(zhin): We can canonicalize first to avoid checking index < 4.
-  // e.g. shuffle(x, y, [16, 17, 18, 19...]) => shuffle(y, y, [0, 1, 2, 3]...).
-  // But doing so can mutate the inputs of the shuffle node without updating the
-  // shuffle immediates themselves. Fix that before we canonicalize here.
-  // We don't want CanCover here because in many use cases, the shuffle is
-  // generated early in the function, but the f32x4.mul happens in a loop, which
-  // won't cover the shuffle since they are different basic blocks.
-  if (left.HasResolvedValue() && wasm::SimdShuffle::TryMatchSplat<4>(
+  // TODO(zhin): We can canonicalize first to avoid checking index < LANES.
+  // e.g. shuffle(x, y, [16, 17, 18, 19...]) => shuffle(y, y, [0, 1, 2,
+  // 3]...). But doing so can mutate the inputs of the shuffle node without
+  // updating the shuffle immediates themselves. Fix that before we
+  // canonicalize here. We don't want CanCover here because in many use cases,
+  // the shuffle is generated early in the function, but the f32x4.mul happens
+  // in a loop, which won't cover the shuffle since they are different basic
+  // blocks.
+  if (left.HasResolvedValue() && wasm::SimdShuffle::TryMatchSplat<LANES>(
                                      left.ResolvedValue().data(), &index)) {
-    dup_node = left.node()->InputAt(index < 4 ? 0 : 1);
+    dup_node = left.node()->InputAt(index < LANES ? 0 : 1);
     input = right.node();
   } else if (right.HasResolvedValue() &&
-             wasm::SimdShuffle::TryMatchSplat<4>(right.ResolvedValue().data(),
-                                                 &index)) {
-    dup_node = right.node()->InputAt(index < 4 ? 0 : 1);
+             wasm::SimdShuffle::TryMatchSplat<LANES>(
+                 right.ResolvedValue().data(), &index)) {
+    dup_node = right.node()->InputAt(index < LANES ? 0 : 1);
     input = left.node();
   }
 
-  if (dup_node == nullptr) {
-    return VisitRRR(this, kArm64F32x4Mul, node);
-  }
-
   // Canonicalization would get rid of this too.
-  index %= 4;
+  index %= LANES;
 
-  Arm64OperandGenerator g(this);
-  Emit(kArm64F32x4MulElement, g.DefineAsRegister(node), g.UseRegister(input),
-       g.UseRegister(dup_node), g.UseImmediate(index));
+  return {input, dup_node, index};
+}
+}  // namespace
+
+void InstructionSelector::VisitF32x4Mul(Node* node) {
+  if (MulWithDupResult result = TryMatchMulWithDup<4>(node)) {
+    Arm64OperandGenerator g(this);
+    Emit(kArm64F32x4MulElement, g.DefineAsRegister(node),
+         g.UseRegister(result.input), g.UseRegister(result.dup_node),
+         g.UseImmediate(result.index));
+  } else {
+    return VisitRRR(this, kArm64F32x4Mul, node);
+  }
+}
+
+void InstructionSelector::VisitF64x2Mul(Node* node) {
+  if (MulWithDupResult result = TryMatchMulWithDup<2>(node)) {
+    Arm64OperandGenerator g(this);
+    Emit(kArm64F64x2MulElement, g.DefineAsRegister(node),
+         g.UseRegister(result.input), g.UseRegister(result.dup_node),
+         g.UseImmediate(result.index));
+  } else {
+    return VisitRRR(this, kArm64F64x2Mul, node);
+  }
+}
 
 void InstructionSelector::VisitI64x2Mul(Node* node) {
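For readers unfamiliar with TryMatchSplat: a 16-byte i8x16.shuffle mask indexes into the 32-byte concatenation of both inputs, and a dup of an N-byte lane shows up as the same contiguous N-byte run repeated across the whole mask. Below is a simplified stand-in model (my sketch, not V8's actual implementation) that also exercises the all-zeros mask the negative tests rely on:

#include <array>
#include <cassert>
#include <cstdint>

// Simplified model: with LANES lanes, each lane is 16 / LANES bytes wide.
// A splat duplicates one source lane (0..2*LANES-1, spanning both inputs),
// so every lane of the mask must be the same contiguous byte run.
template <int LANES>
bool TryMatchSplatModel(const std::array<uint8_t, 16>& mask, int* index) {
  constexpr int kBytes = 16 / LANES;
  const int first = mask[0] / kBytes;
  for (int lane = 0; lane < LANES; ++lane) {
    for (int b = 0; b < kBytes; ++b) {
      if (mask[lane * kBytes + b] != first * kBytes + b) return false;
    }
  }
  *index = first;  // index < LANES selects input 0, otherwise input 1.
  return true;
}

int main() {
  int index = 0;
  // The first f64x2 test mask: bytes 0..7 repeated = dup of lane 0, input 0.
  std::array<uint8_t, 16> dup0 = {0, 1, 2, 3, 4, 5, 6, 7,
                                  0, 1, 2, 3, 4, 5, 6, 7};
  assert(TryMatchSplatModel<2>(dup0, &index) && index == 0);

  // The all-zeros mask from the negative tests: a dup of byte 0 when viewed
  // as 16 one-byte lanes, but not a dup of any 8-byte (f64x2) lane.
  std::array<uint8_t, 16> zeros{};
  assert(TryMatchSplatModel<16>(zeros, &index) && index == 0);
  assert(!TryMatchSplatModel<2>(zeros, &index));
  return 0;
}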
@@ -2267,10 +2267,10 @@ const SIMDMulDupInst kSIMDF32x4MulDuplInstructions[] = {
     },
 };
 
-using InstructionSelectorSimdMulWithDupTest =
+using InstructionSelectorSimdF32x4MulWithDupTest =
     InstructionSelectorTestWithParam<SIMDMulDupInst>;
 
-TEST_P(InstructionSelectorSimdMulWithDupTest, MulWithDup) {
+TEST_P(InstructionSelectorSimdF32x4MulWithDupTest, MulWithDup) {
   const SIMDMulDupInst param = GetParam();
   const MachineType type = MachineType::Simd128();
   {
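For orientation in the test hunks: SIMDMulDupInst is defined earlier in the unittest file and is not shown in this diff. Judging from how the params are used below, it bundles the shuffle mask, the expected lane immediate, and which shuffle input holds the duplicated lane. A reconstructed shape (an assumption from usage, not copied from the source):

#include <cstdint>

// Assumed layout, reconstructed from the accesses param.shuffle, param.lane,
// and param.shuffle_input_index in the tests; the real definition may differ.
struct SIMDMulDupInst {
  const uint8_t shuffle[16];  // i8x16.shuffle immediates forming the dup
  int32_t lane;               // expected lane immediate (after index %= LANES)
  int shuffle_input_index;    // 0 or 1: which shuffle input carries the lane
};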
@@ -2306,10 +2306,10 @@ TEST_P(InstructionSelectorSimdMulWithDupTest, MulWithDup) {
   }
 }
 
 INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
-                         InstructionSelectorSimdMulWithDupTest,
+                         InstructionSelectorSimdF32x4MulWithDupTest,
                          ::testing::ValuesIn(kSIMDF32x4MulDuplInstructions));
 
-TEST_F(InstructionSelectorTest, SimdMulWithDupNegativeTest) {
+TEST_F(InstructionSelectorTest, SimdF32x4MulWithDupNegativeTest) {
   const MachineType type = MachineType::Simd128();
   // Check that optimization does not match when the shuffle is not a f32x4.dup.
   const uint8_t mask[kSimd128Size] = {0};
@@ -2330,6 +2330,92 @@ TEST_F(InstructionSelectorTest, SimdMulWithDupNegativeTest) {
   }
 }
 
+const SIMDMulDupInst kSIMDF64x2MulDuplInstructions[] = {
+    {
+        {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7},
+        0,
+        0,
+    },
+    {
+        {8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15},
+        1,
+        0,
+    },
+    {
+        {16, 17, 18, 19, 20, 21, 22, 23, 16, 17, 18, 19, 20, 21, 22, 23},
+        0,
+        1,
+    },
+    {
+        {24, 25, 26, 27, 28, 29, 30, 31, 24, 25, 26, 27, 28, 29, 30, 31},
+        1,
+        1,
+    },
+};
+
+using InstructionSelectorSimdF64x2MulWithDupTest =
+    InstructionSelectorTestWithParam<SIMDMulDupInst>;
+
+TEST_P(InstructionSelectorSimdF64x2MulWithDupTest, MulWithDup) {
+  const SIMDMulDupInst param = GetParam();
+  const MachineType type = MachineType::Simd128();
+  {
+    StreamBuilder m(this, type, type, type, type);
+    Node* shuffle = m.AddNode(m.machine()->I8x16Shuffle(param.shuffle),
+                              m.Parameter(0), m.Parameter(1));
+    m.Return(m.AddNode(m.machine()->F64x2Mul(), m.Parameter(2), shuffle));
+    Stream s = m.Build();
+    ASSERT_EQ(1U, s.size());
+    EXPECT_EQ(kArm64F64x2MulElement, s[0]->arch_opcode());
+    EXPECT_EQ(3U, s[0]->InputCount());
+    EXPECT_EQ(param.lane, s.ToInt32(s[0]->InputAt(2)));
+    EXPECT_EQ(1U, s[0]->OutputCount());
+    EXPECT_EQ(s.ToVreg(m.Parameter(param.shuffle_input_index)),
+              s.ToVreg(s[0]->InputAt(1)));
+  }
+
+  // Multiplication operator should be commutative, so test shuffle op as lhs.
+  {
+    StreamBuilder m(this, type, type, type, type);
+    Node* shuffle = m.AddNode(m.machine()->I8x16Shuffle(param.shuffle),
+                              m.Parameter(0), m.Parameter(1));
+    m.Return(m.AddNode(m.machine()->F64x2Mul(), shuffle, m.Parameter(2)));
+    Stream s = m.Build();
+    ASSERT_EQ(1U, s.size());
+    EXPECT_EQ(kArm64F64x2MulElement, s[0]->arch_opcode());
+    EXPECT_EQ(3U, s[0]->InputCount());
+    EXPECT_EQ(param.lane, s.ToInt32(s[0]->InputAt(2)));
+    EXPECT_EQ(1U, s[0]->OutputCount());
+    EXPECT_EQ(s.ToVreg(m.Parameter(param.shuffle_input_index)),
+              s.ToVreg(s[0]->InputAt(1)));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
+                         InstructionSelectorSimdF64x2MulWithDupTest,
+                         ::testing::ValuesIn(kSIMDF64x2MulDuplInstructions));
+
+TEST_F(InstructionSelectorTest, SimdF64x2MulWithDupNegativeTest) {
+  const MachineType type = MachineType::Simd128();
+  // Check that optimization does not match when the shuffle is not a f64x2.dup.
+  const uint8_t mask[kSimd128Size] = {0};
+  {
+    StreamBuilder m(this, type, type, type, type);
+    Node* shuffle = m.AddNode((m.machine()->I8x16Shuffle(mask)), m.Parameter(0),
+                              m.Parameter(1));
+    m.Return(m.AddNode(m.machine()->F64x2Mul(), m.Parameter(2), shuffle));
+    Stream s = m.Build();
+    ASSERT_EQ(2U, s.size());
+    // The shuffle is a i8x16.dup of lane 0.
+    EXPECT_EQ(kArm64S128Dup, s[0]->arch_opcode());
+    EXPECT_EQ(3U, s[0]->InputCount());
+    EXPECT_EQ(kArm64F64x2Mul, s[1]->arch_opcode());
+    EXPECT_EQ(1U, s[0]->OutputCount());
+    EXPECT_EQ(2U, s[1]->InputCount());
+    EXPECT_EQ(1U, s[1]->OutputCount());
+  }
+}
+
 TEST_F(InstructionSelectorTest, Int32MulWithImmediate) {
   // x * (2^k + 1) -> x + (x << k)
   TRACED_FORRANGE(int32_t, k, 1, 30) {