[arm64][wasm-simd] Use Fcm(0) for floating point comparison with zero.

Use an immediate zero operand for floating point comparison nodes when
possible. This results in up to 20-25% runtime improvement in some
microbenchmarks, as well as 1-1.5% runtime improvement in some real-use
benchmarks on Cortex-A55 and Neoverse N1.

Change-Id: I39d10871a08a037dbe8c0877d789d110476e1a58
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3133143
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Commit-Queue: Martyn Capewell <martyn.capewell@arm.com>
Cr-Commit-Position: refs/heads/main@{#76749}

parent 7ad60c2784
commit 66bfcdcb43
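The win comes from folding the zero constant into the compare itself. A rough sketch of the codegen difference for an f32x4 greater-than against zero, in MacroAssembler-style calls (the register names and the exact pre-change sequence are illustrative assumptions, not taken from this CL):

  // Before (sketch): the zero vector had to be materialized in a register.
  __ Movi(v2.V16B(), 0);                   // splat zeroes into a temp
  __ Fcmgt(v0.V4S(), v1.V4S(), v2.V4S());  // per-lane v1 > v2

  // After (sketch): NEON compares have a zero-immediate encoding, so no
  // register is burned on the constant.
  __ Fcmgt(v0.V4S(), v1.V4S(), +0.0);      // per-lane v1 > 0.0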
@@ -2077,6 +2077,28 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                   i.InputSimd128Register(1).Format(f));                \
     break;                                                             \
   }
+#define SIMD_FCM_L_CASE(Op, ImmOp, RegOp)                              \
+  case Op: {                                                           \
+    VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode)); \
+    if (instr->InputCount() == 1) {                                    \
+      __ Fcm##ImmOp(i.OutputSimd128Register().Format(f),               \
+                    i.InputSimd128Register(0).Format(f), +0.0);        \
+    } else {                                                           \
+      __ Fcm##RegOp(i.OutputSimd128Register().Format(f),               \
+                    i.InputSimd128Register(1).Format(f),               \
+                    i.InputSimd128Register(0).Format(f));              \
+    }                                                                  \
+    break;                                                             \
+  }
+#define SIMD_FCM_G_CASE(Op, ImmOp)                                     \
+  case Op: {                                                           \
+    VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode)); \
+    /* Currently Gt/Ge instructions are only used with zero */         \
+    DCHECK_EQ(instr->InputCount(), 1);                                 \
+    __ Fcm##ImmOp(i.OutputSimd128Register().Format(f),                 \
+                  i.InputSimd128Register(0).Format(f), +0.0);          \
+    break;                                                             \
+  }
 #define SIMD_DESTRUCTIVE_BINOP_CASE(Op, Instr, FORMAT)                 \
   case Op: {                                                           \
     VRegister dst = i.OutputSimd128Register().V##FORMAT();             \
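To see what the macro buys, SIMD_FCM_L_CASE(kArm64FLt, lt, gt), used in the next hunk, expands to roughly the following (hand-expanded here for illustration):

  case kArm64FLt: {
    VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode));
    if (instr->InputCount() == 1) {
      // The selector dropped the zero operand: compare against +0.0.
      __ Fcmlt(i.OutputSimd128Register().Format(f),
               i.InputSimd128Register(0).Format(f), +0.0);
    } else {
      // Register form: only Fcmgt exists, so the operands are swapped.
      __ Fcmgt(i.OutputSimd128Register().Format(f),
               i.InputSimd128Register(1).Format(f),
               i.InputSimd128Register(0).Format(f));
    }
    break;
  }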
@@ -2192,29 +2214,23 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ Mov(dst, i.InputInt8(1), i.InputSimd128Register(2).Format(f), 0);
       break;
     }
-      SIMD_BINOP_LANE_SIZE_CASE(kArm64FEq, Fcmeq);
+      SIMD_FCM_L_CASE(kArm64FEq, eq, eq);
     case kArm64FNe: {
       VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode));
       VRegister dst = i.OutputSimd128Register().Format(f);
-      __ Fcmeq(dst, i.InputSimd128Register(0).Format(f),
-               i.InputSimd128Register(1).Format(f));
+      if (instr->InputCount() == 1) {
+        __ Fcmeq(dst, i.InputSimd128Register(0).Format(f), +0.0);
+      } else {
+        __ Fcmeq(dst, i.InputSimd128Register(0).Format(f),
+                 i.InputSimd128Register(1).Format(f));
+      }
       __ Mvn(dst, dst);
       break;
     }
-    case kArm64FLt: {
-      VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode));
-      __ Fcmgt(i.OutputSimd128Register().Format(f),
-               i.InputSimd128Register(1).Format(f),
-               i.InputSimd128Register(0).Format(f));
-      break;
-    }
-    case kArm64FLe: {
-      VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode));
-      __ Fcmge(i.OutputSimd128Register().Format(f),
-               i.InputSimd128Register(1).Format(f),
-               i.InputSimd128Register(0).Format(f));
-      break;
-    }
+      SIMD_FCM_L_CASE(kArm64FLt, lt, gt);
+      SIMD_FCM_L_CASE(kArm64FLe, le, ge);
+      SIMD_FCM_G_CASE(kArm64FGt, gt);
+      SIMD_FCM_G_CASE(kArm64FGe, ge);
       SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F64x2Qfma, Fmla, 2D);
       SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F64x2Qfms, Fmls, 2D);
     case kArm64F64x2Pmin: {
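Two details worth noting in this hunk. NEON has no lane-wise not-equal compare, so kArm64FNe is still computed as Fcmeq followed by Mvn; the change is only that the single-input case uses the zero-immediate Fcmeq. And the less-than/less-equal compares (FCMLT, FCMLE) exist only in their compare-with-zero encodings, which is why SIMD_FCM_L_CASE takes a separate RegOp: the two-register form falls back to FCMGT/FCMGE with swapped operands.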
@@ -210,6 +210,8 @@ namespace compiler {
   V(Arm64FNe)                      \
   V(Arm64FLt)                      \
   V(Arm64FLe)                      \
+  V(Arm64FGt)                      \
+  V(Arm64FGe)                      \
   V(Arm64F64x2Qfma)                \
   V(Arm64F64x2Qfms)                \
   V(Arm64F64x2Pmin)                \
@@ -170,6 +170,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kArm64FNe:
     case kArm64FLt:
     case kArm64FLe:
+    case kArm64FGt:
+    case kArm64FGe:
     case kArm64F64x2Qfma:
    case kArm64F64x2Qfms:
     case kArm64F64x2Pmin:
@@ -3538,19 +3538,11 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
   V(F64x2Add, kArm64FAdd, 64)              \
   V(F64x2Sub, kArm64FSub, 64)              \
   V(F64x2Div, kArm64FDiv, 64)              \
-  V(F64x2Eq, kArm64FEq, 64)                \
-  V(F64x2Ne, kArm64FNe, 64)                \
-  V(F64x2Lt, kArm64FLt, 64)                \
-  V(F64x2Le, kArm64FLe, 64)                \
   V(F32x4Min, kArm64FMin, 32)              \
   V(F32x4Max, kArm64FMax, 32)              \
   V(F32x4Add, kArm64FAdd, 32)              \
   V(F32x4Sub, kArm64FSub, 32)              \
   V(F32x4Div, kArm64FDiv, 32)              \
-  V(F32x4Eq, kArm64FEq, 32)                \
-  V(F32x4Ne, kArm64FNe, 32)                \
-  V(F32x4Lt, kArm64FLt, 32)                \
-  V(F32x4Le, kArm64FLe, 32)                \
   V(I64x2Sub, kArm64ISub, 64)              \
   V(I64x2Eq, kArm64IEq, 64)                \
   V(I64x2Ne, kArm64INe, 64)                \
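The eight floating point compares leave the generic SIMD_BINOP_LANE_SIZE_LIST here because they now get dedicated visitors (the VISIT_SIMD_FCM macro in the next hunk) that can recognize a constant-zero operand before falling back to the plain register-register emit.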
@@ -3951,6 +3943,44 @@ VISIT_SIMD_SUB(I32x4, 32)
 VISIT_SIMD_SUB(I16x8, 16)
 #undef VISIT_SIMD_SUB
 
+namespace {
+bool isSimdZero(Arm64OperandGenerator& g, Node* node) {
+  auto m = V128ConstMatcher(node);
+  if (m.HasResolvedValue()) {
+    auto imms = m.ResolvedValue().immediate();
+    return (std::all_of(imms.begin(), imms.end(), std::logical_not<uint8_t>()));
+  }
+  return node->opcode() == IrOpcode::kS128Zero;
+}
+}  // namespace
+
+#define VISIT_SIMD_FCM(Type, CmOp, CmOpposite, LaneSize)                   \
+  void InstructionSelector::Visit##Type##CmOp(Node* node) {                \
+    Arm64OperandGenerator g(this);                                         \
+    Node* left = node->InputAt(0);                                         \
+    Node* right = node->InputAt(1);                                        \
+    if (isSimdZero(g, left)) {                                             \
+      Emit(kArm64F##CmOpposite | LaneSizeField::encode(LaneSize),          \
+           g.DefineAsRegister(node), g.UseRegister(right));                \
+      return;                                                              \
+    } else if (isSimdZero(g, right)) {                                     \
+      Emit(kArm64F##CmOp | LaneSizeField::encode(LaneSize),                \
+           g.DefineAsRegister(node), g.UseRegister(left));                 \
+      return;                                                              \
+    }                                                                      \
+    VisitRRR(this, kArm64F##CmOp | LaneSizeField::encode(LaneSize), node); \
+  }
+
+VISIT_SIMD_FCM(F64x2, Eq, Eq, 64)
+VISIT_SIMD_FCM(F64x2, Ne, Ne, 64)
+VISIT_SIMD_FCM(F64x2, Lt, Gt, 64)
+VISIT_SIMD_FCM(F64x2, Le, Ge, 64)
+VISIT_SIMD_FCM(F32x4, Eq, Eq, 32)
+VISIT_SIMD_FCM(F32x4, Ne, Ne, 32)
+VISIT_SIMD_FCM(F32x4, Lt, Gt, 32)
+VISIT_SIMD_FCM(F32x4, Le, Ge, 32)
+#undef VISIT_SIMD_FCM
+
 void InstructionSelector::VisitS128Select(Node* node) {
   Arm64OperandGenerator g(this);
   Emit(kArm64S128Select, g.DefineSameAsFirst(node),
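As a usage sketch, in the style of the wasm test macros used later in this CL (a hypothetical snippet, not part of the change): a compare against an all-zeros v128 constant now matches isSimdZero, so the selector emits the opcode with a single input and the code generator picks the immediate form.

  // Hypothetical: selects kArm64FLt with one input, lowered to
  // fcmlt ..., #0.0 rather than a compare against a materialized zero.
  byte zeroes[kSimd128Size] = {0};
  BUILD(r,
        WASM_GLOBAL_SET(0, WASM_SIMD_BINOP(kExprF32x4Lt, WASM_LOCAL_GET(temp),
                                           WASM_SIMD_CONSTANT(zeroes))),
        WASM_ONE);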
@@ -357,6 +357,16 @@ inline base::Vector<const int64_t> ValueHelper::GetVector() {
   return int64_vector();
 }
 
+template <>
+inline base::Vector<const float> ValueHelper::GetVector() {
+  return float32_vector();
+}
+
+template <>
+inline base::Vector<const double> ValueHelper::GetVector() {
+  return float64_vector();
+}
+
 // Helper macros that can be used in FOR_INT32_INPUTS(i) { ... i ... }
 #define FOR_INPUTS(ctype, itype, var) \
   for (ctype var : ::v8::internal::compiler::ValueHelper::itype##_vector())
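These specializations exist so the new templated test below can iterate test inputs generically, e.g.:

  // FloatType is float or double; GetVector<FloatType>() now resolves.
  for (FloatType x : compiler::ValueHelper::GetVector<FloatType>()) {
    // ... run one case per interesting input value ...
  }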
@@ -351,6 +351,48 @@ WASM_SIMD_TEST(F32x4ConvertI32x4) {
   }
 }
 
+template <typename FloatType, typename ScalarType>
+void RunF128CompareOpConstImmTest(
+    TestExecutionTier execution_tier, WasmOpcode cmp_opcode,
+    WasmOpcode splat_opcode, ScalarType (*expected_op)(FloatType, FloatType)) {
+  for (FloatType x : compiler::ValueHelper::GetVector<FloatType>()) {
+    if (!PlatformCanRepresent(x)) continue;
+    WasmRunner<int32_t, FloatType> r(execution_tier);
+    // Set up globals to hold mask output for left and right cases
+    ScalarType* g1 = r.builder().template AddGlobal<ScalarType>(kWasmS128);
+    ScalarType* g2 = r.builder().template AddGlobal<ScalarType>(kWasmS128);
+    // Build fn to splat test values, perform compare op on both sides, and
+    // write the result.
+    byte value = 0;
+    byte temp = r.AllocateLocal(kWasmS128);
+    uint8_t const_buffer[kSimd128Size];
+    for (size_t i = 0; i < kSimd128Size / sizeof(FloatType); i++) {
+      memcpy(&const_buffer[i * sizeof(FloatType)], &x, sizeof(FloatType));
+    }
+    BUILD(r,
+          WASM_LOCAL_SET(temp,
+                         WASM_SIMD_OPN(splat_opcode, WASM_LOCAL_GET(value))),
+          WASM_GLOBAL_SET(
+              0, WASM_SIMD_BINOP(cmp_opcode, WASM_SIMD_CONSTANT(const_buffer),
+                                 WASM_LOCAL_GET(temp))),
+          WASM_GLOBAL_SET(1, WASM_SIMD_BINOP(cmp_opcode, WASM_LOCAL_GET(temp),
+                                             WASM_SIMD_CONSTANT(const_buffer))),
+          WASM_ONE);
+    for (FloatType y : compiler::ValueHelper::GetVector<FloatType>()) {
+      if (!PlatformCanRepresent(y)) continue;
+      FloatType diff = x - y;  // Model comparison as subtraction.
+      if (!PlatformCanRepresent(diff)) continue;
+      r.Call(y);
+      ScalarType expected1 = expected_op(x, y);
+      ScalarType expected2 = expected_op(y, x);
+      for (size_t i = 0; i < kSimd128Size / sizeof(ScalarType); i++) {
+        CHECK_EQ(expected1, LANE(g1, i));
+        CHECK_EQ(expected2, LANE(g2, i));
+      }
+    }
+  }
+}
+
 WASM_SIMD_TEST(F32x4Abs) {
   RunF32x4UnOpTest(execution_tier, kExprF32x4Abs, std::abs);
 }
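Note the helper writes two globals so that each call exercises both operand orders, constant on the left and constant on the right. That matters because the selector swaps Lt to Gt and Le to Ge when the zero is on the left, and both paths need coverage.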
@@ -470,6 +512,36 @@ void RunShiftAddTestSequence(TestExecutionTier execution_tier,
   }
 }
 
+WASM_SIMD_TEST(F32x4EqZero) {
+  RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Eq,
+                                               kExprF32x4Splat, Equal);
+}
+
+WASM_SIMD_TEST(F32x4NeZero) {
+  RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Ne,
+                                               kExprF32x4Splat, NotEqual);
+}
+
+WASM_SIMD_TEST(F32x4GtZero) {
+  RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Gt,
+                                               kExprF32x4Splat, Greater);
+}
+
+WASM_SIMD_TEST(F32x4GeZero) {
+  RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Ge,
+                                               kExprF32x4Splat, GreaterEqual);
+}
+
+WASM_SIMD_TEST(F32x4LtZero) {
+  RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Lt,
+                                               kExprF32x4Splat, Less);
+}
+
+WASM_SIMD_TEST(F32x4LeZero) {
+  RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Le,
+                                               kExprF32x4Splat, LessEqual);
+}
+
 WASM_SIMD_TEST(I64x2Splat) {
   WasmRunner<int32_t, int64_t> r(execution_tier);
   // Set up a global to hold output vector.
@@ -858,6 +930,36 @@ WASM_SIMD_TEST(F64x2Le) {
   RunF64x2CompareOpTest(execution_tier, kExprF64x2Le, LessEqual);
 }
 
+WASM_SIMD_TEST(F64x2EqZero) {
+  RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Eq,
+                                                kExprF64x2Splat, Equal);
+}
+
+WASM_SIMD_TEST(F64x2NeZero) {
+  RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Ne,
+                                                kExprF64x2Splat, NotEqual);
+}
+
+WASM_SIMD_TEST(F64x2GtZero) {
+  RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Gt,
+                                                kExprF64x2Splat, Greater);
+}
+
+WASM_SIMD_TEST(F64x2GeZero) {
+  RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Ge,
+                                                kExprF64x2Splat, GreaterEqual);
+}
+
+WASM_SIMD_TEST(F64x2LtZero) {
+  RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Lt,
+                                                kExprF64x2Splat, Less);
+}
+
+WASM_SIMD_TEST(F64x2LeZero) {
+  RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Le,
+                                                kExprF64x2Splat, LessEqual);
+}
+
 WASM_SIMD_TEST(F64x2Min) {
   RunF64x2BinOpTest(execution_tier, kExprF64x2Min, JSMin);
 }
@@ -5454,6 +5454,91 @@ TEST_F(InstructionSelectorTest, PokePairPrepareArgumentsSimd128) {
                           expected_poke_pair, expected_poke);
 }
 
+struct SIMDConstZeroFcmTest {
+  const bool is_zero;
+  const uint8_t lane_size;
+  const Operator* (MachineOperatorBuilder::*fcm_operator)();
+  const ArchOpcode expected_op_left;
+  const ArchOpcode expected_op_right;
+  const size_t size;
+};
+
+static const SIMDConstZeroFcmTest SIMDConstZeroFcmTests[] = {
+    {true, 64, &MachineOperatorBuilder::F64x2Eq, kArm64FEq, kArm64FEq, 1},
+    {true, 64, &MachineOperatorBuilder::F64x2Ne, kArm64FNe, kArm64FNe, 1},
+    {true, 64, &MachineOperatorBuilder::F64x2Lt, kArm64FGt, kArm64FLt, 1},
+    {true, 64, &MachineOperatorBuilder::F64x2Le, kArm64FGe, kArm64FLe, 1},
+    {false, 64, &MachineOperatorBuilder::F64x2Eq, kArm64FEq, kArm64FEq, 2},
+    {false, 64, &MachineOperatorBuilder::F64x2Ne, kArm64FNe, kArm64FNe, 2},
+    {false, 64, &MachineOperatorBuilder::F64x2Lt, kArm64FLt, kArm64FLt, 2},
+    {false, 64, &MachineOperatorBuilder::F64x2Le, kArm64FLe, kArm64FLe, 2},
+    {true, 32, &MachineOperatorBuilder::F32x4Eq, kArm64FEq, kArm64FEq, 1},
+    {true, 32, &MachineOperatorBuilder::F32x4Ne, kArm64FNe, kArm64FNe, 1},
+    {true, 32, &MachineOperatorBuilder::F32x4Lt, kArm64FGt, kArm64FLt, 1},
+    {true, 32, &MachineOperatorBuilder::F32x4Le, kArm64FGe, kArm64FLe, 1},
+    {false, 32, &MachineOperatorBuilder::F32x4Eq, kArm64FEq, kArm64FEq, 2},
+    {false, 32, &MachineOperatorBuilder::F32x4Ne, kArm64FNe, kArm64FNe, 2},
+    {false, 32, &MachineOperatorBuilder::F32x4Lt, kArm64FLt, kArm64FLt, 2},
+    {false, 32, &MachineOperatorBuilder::F32x4Le, kArm64FLe, kArm64FLe, 2},
+};
+
+using InstructionSelectorSIMDConstZeroFcmTest =
+    InstructionSelectorTestWithParam<SIMDConstZeroFcmTest>;
+
+TEST_P(InstructionSelectorSIMDConstZeroFcmTest, ConstZero) {
+  const SIMDConstZeroFcmTest param = GetParam();
+  byte data[16] = {};
+  if (!param.is_zero) data[0] = 0xff;
+  // Const node on the left
+  {
+    StreamBuilder m(this, MachineType::Simd128(), MachineType::Simd128());
+    Node* cnst = m.S128Const(data);
+    Node* fcm =
+        m.AddNode((m.machine()->*param.fcm_operator)(), cnst, m.Parameter(0));
+    m.Return(fcm);
+    Stream s = m.Build();
+    ASSERT_EQ(param.size, s.size());
+    if (param.size == 1) {
+      EXPECT_EQ(param.expected_op_left, s[0]->arch_opcode());
+      EXPECT_EQ(1U, s[0]->InputCount());
+      EXPECT_EQ(1U, s[0]->OutputCount());
+      EXPECT_EQ(param.lane_size, LaneSizeField::decode(s[0]->opcode()));
+    } else {
+      EXPECT_EQ(kArm64S128Const, s[0]->arch_opcode());
+      EXPECT_EQ(param.expected_op_left, s[1]->arch_opcode());
+      EXPECT_EQ(2U, s[1]->InputCount());
+      EXPECT_EQ(1U, s[1]->OutputCount());
+      EXPECT_EQ(param.lane_size, LaneSizeField::decode(s[1]->opcode()));
+    }
+  }
+  // Const node on the right
+  {
+    StreamBuilder m(this, MachineType::Simd128(), MachineType::Simd128());
+    Node* cnst = m.S128Const(data);
+    Node* fcm =
+        m.AddNode((m.machine()->*param.fcm_operator)(), m.Parameter(0), cnst);
+    m.Return(fcm);
+    Stream s = m.Build();
+    ASSERT_EQ(param.size, s.size());
+    if (param.size == 1) {
+      EXPECT_EQ(param.expected_op_right, s[0]->arch_opcode());
+      EXPECT_EQ(1U, s[0]->InputCount());
+      EXPECT_EQ(1U, s[0]->OutputCount());
+      EXPECT_EQ(param.lane_size, LaneSizeField::decode(s[0]->opcode()));
+    } else {
+      EXPECT_EQ(kArm64S128Const, s[0]->arch_opcode());
+      EXPECT_EQ(param.expected_op_right, s[1]->arch_opcode());
+      EXPECT_EQ(2U, s[1]->InputCount());
+      EXPECT_EQ(1U, s[1]->OutputCount());
+      EXPECT_EQ(param.lane_size, LaneSizeField::decode(s[1]->opcode()));
+    }
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
+                         InstructionSelectorSIMDConstZeroFcmTest,
+                         ::testing::ValuesIn(SIMDConstZeroFcmTests));
+
 }  // namespace
 }  // namespace compiler
 }  // namespace internal
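One reading aid for the parameterized test: size is the expected instruction count, 1 when the all-zero constant is folded into the compare and 2 when a non-zero constant still needs a separate kArm64S128Const. expected_op_left differs from expected_op_right only for Lt and Le, where a zero on the left makes the selector emit the swapped Gt/Ge opcode.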