[arm64][wasm-simd] Use Fcm(0) for floating point comparison with zero.

Use an immediate zero operand for floating point comparison nodes when
possible. This results in up to 20-25% runtime improvement in some
microbenchmarks, as well as 1-1.5% runtime improvement in some
real-use benchmarks on Cortex-A55 and Neoverse N1.

Change-Id: I39d10871a08a037dbe8c0877d789d110476e1a58
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3133143
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Commit-Queue: Martyn Capewell <martyn.capewell@arm.com>
Cr-Commit-Position: refs/heads/main@{#76749}
This commit is contained in:
Ilja Iskovs 2021-09-08 13:16:46 +01:00 committed by V8 LUCI CQ
parent 7ad60c2784
commit 66bfcdcb43
7 changed files with 272 additions and 25 deletions

View File

@ -2077,6 +2077,28 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1).Format(f)); \
break; \
}
// Emits a vector floating-point compare for the arch opcode `Op`.
//  - With exactly one instruction input, input 0 is compared against an
//    immediate +0.0 using Fcm<ImmOp>.
//  - Otherwise the register form Fcm<RegOp> is used, with the two inputs
//    passed in swapped order (input 1 first, then input 0).
#define SIMD_FCM_L_CASE(Op, ImmOp, RegOp)                                   \
  case Op: {                                                                \
    VectorFormat fmt = VectorFormatFillQ(LaneSizeField::decode(opcode));    \
    VRegister dst = i.OutputSimd128Register().Format(fmt);                  \
    if (instr->InputCount() != 1) {                                         \
      /* Two register operands: note the swapped input order. */            \
      __ Fcm##RegOp(dst, i.InputSimd128Register(1).Format(fmt),             \
                    i.InputSimd128Register(0).Format(fmt));                 \
    } else {                                                                \
      /* Single operand: compare it against the immediate zero. */          \
      __ Fcm##ImmOp(dst, i.InputSimd128Register(0).Format(fmt), +0.0);      \
    }                                                                       \
    break;                                                                  \
  }
// Emits Fcm<ImmOp> for the arch opcode `Op`, comparing input 0 against an
// immediate +0.0. Only the single-input (compare-with-zero) form is
// supported; this is asserted with a DCHECK.
#define SIMD_FCM_G_CASE(Op, ImmOp)                                          \
  case Op: {                                                                \
    VectorFormat fmt = VectorFormatFillQ(LaneSizeField::decode(opcode));    \
    /* Currently Gt/Ge instructions are only used with zero */              \
    DCHECK_EQ(instr->InputCount(), 1);                                      \
    VRegister dst = i.OutputSimd128Register().Format(fmt);                  \
    VRegister src = i.InputSimd128Register(0).Format(fmt);                  \
    __ Fcm##ImmOp(dst, src, +0.0);                                          \
    break;                                                                  \
  }
#define SIMD_DESTRUCTIVE_BINOP_CASE(Op, Instr, FORMAT) \
case Op: { \
VRegister dst = i.OutputSimd128Register().V##FORMAT(); \
@ -2192,29 +2214,23 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Mov(dst, i.InputInt8(1), i.InputSimd128Register(2).Format(f), 0);
break;
}
SIMD_BINOP_LANE_SIZE_CASE(kArm64FEq, Fcmeq);
SIMD_FCM_L_CASE(kArm64FEq, eq, eq);
case kArm64FNe: {
VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode));
VRegister dst = i.OutputSimd128Register().Format(f);
if (instr->InputCount() == 1) {
__ Fcmeq(dst, i.InputSimd128Register(0).Format(f), +0.0);
} else {
__ Fcmeq(dst, i.InputSimd128Register(0).Format(f),
i.InputSimd128Register(1).Format(f));
}
__ Mvn(dst, dst);
break;
}
case kArm64FLt: {
VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode));
__ Fcmgt(i.OutputSimd128Register().Format(f),
i.InputSimd128Register(1).Format(f),
i.InputSimd128Register(0).Format(f));
break;
}
case kArm64FLe: {
VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode));
__ Fcmge(i.OutputSimd128Register().Format(f),
i.InputSimd128Register(1).Format(f),
i.InputSimd128Register(0).Format(f));
break;
}
SIMD_FCM_L_CASE(kArm64FLt, lt, gt);
SIMD_FCM_L_CASE(kArm64FLe, le, ge);
SIMD_FCM_G_CASE(kArm64FGt, gt);
SIMD_FCM_G_CASE(kArm64FGe, ge);
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F64x2Qfma, Fmla, 2D);
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F64x2Qfms, Fmls, 2D);
case kArm64F64x2Pmin: {

View File

@ -210,6 +210,8 @@ namespace compiler {
V(Arm64FNe) \
V(Arm64FLt) \
V(Arm64FLe) \
V(Arm64FGt) \
V(Arm64FGe) \
V(Arm64F64x2Qfma) \
V(Arm64F64x2Qfms) \
V(Arm64F64x2Pmin) \

View File

@ -170,6 +170,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64FNe:
case kArm64FLt:
case kArm64FLe:
case kArm64FGt:
case kArm64FGe:
case kArm64F64x2Qfma:
case kArm64F64x2Qfms:
case kArm64F64x2Pmin:

View File

@ -3538,19 +3538,11 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
V(F64x2Add, kArm64FAdd, 64) \
V(F64x2Sub, kArm64FSub, 64) \
V(F64x2Div, kArm64FDiv, 64) \
V(F64x2Eq, kArm64FEq, 64) \
V(F64x2Ne, kArm64FNe, 64) \
V(F64x2Lt, kArm64FLt, 64) \
V(F64x2Le, kArm64FLe, 64) \
V(F32x4Min, kArm64FMin, 32) \
V(F32x4Max, kArm64FMax, 32) \
V(F32x4Add, kArm64FAdd, 32) \
V(F32x4Sub, kArm64FSub, 32) \
V(F32x4Div, kArm64FDiv, 32) \
V(F32x4Eq, kArm64FEq, 32) \
V(F32x4Ne, kArm64FNe, 32) \
V(F32x4Lt, kArm64FLt, 32) \
V(F32x4Le, kArm64FLe, 32) \
V(I64x2Sub, kArm64ISub, 64) \
V(I64x2Eq, kArm64IEq, 64) \
V(I64x2Ne, kArm64INe, 64) \
@ -3951,6 +3943,44 @@ VISIT_SIMD_SUB(I32x4, 32)
VISIT_SIMD_SUB(I16x8, 16)
#undef VISIT_SIMD_SUB
namespace {
// Returns true when `node` is known to be an all-zero 128-bit vector: either a
// V128 constant with a resolved value whose immediate bytes are all zero, or
// a node whose opcode is kS128Zero. The operand generator `g` is not
// consulted.
bool isSimdZero(Arm64OperandGenerator& g, Node* node) {
  V128ConstMatcher matcher(node);
  if (!matcher.HasResolvedValue()) {
    // Not a resolved constant: only an explicit S128Zero node qualifies.
    return node->opcode() == IrOpcode::kS128Zero;
  }
  auto bytes = matcher.ResolvedValue().immediate();
  for (uint8_t b : bytes) {
    if (b != 0) return false;
  }
  return true;
}
} // namespace
// Defines InstructionSelector::Visit<Type><CmOp> for a floating-point SIMD
// comparison with lanes of `LaneSize` bits.
//  - Left input is a SIMD zero: emit kArm64F<CmOpposite> with the right input
//    as the only operand.
//  - Right input is a SIMD zero: emit kArm64F<CmOp> with the left input as the
//    only operand.
//  - Otherwise: emit the regular two-register kArm64F<CmOp> via VisitRRR.
#define VISIT_SIMD_FCM(Type, CmOp, CmOpposite, LaneSize)                     \
  void InstructionSelector::Visit##Type##CmOp(Node* node) {                  \
    Arm64OperandGenerator g(this);                                           \
    Node* lhs = node->InputAt(0);                                            \
    Node* rhs = node->InputAt(1);                                            \
    if (isSimdZero(g, lhs)) {                                                \
      /* zero on the left: use the opposite comparison on rhs alone */       \
      Emit(kArm64F##CmOpposite | LaneSizeField::encode(LaneSize),            \
           g.DefineAsRegister(node), g.UseRegister(rhs));                    \
    } else if (isSimdZero(g, rhs)) {                                         \
      /* zero on the right: same comparison on lhs alone */                  \
      Emit(kArm64F##CmOp | LaneSizeField::encode(LaneSize),                  \
           g.DefineAsRegister(node), g.UseRegister(lhs));                    \
    } else {                                                                 \
      VisitRRR(this, kArm64F##CmOp | LaneSizeField::encode(LaneSize), node); \
    }                                                                        \
  }
// Instantiate the visitors for F64x2 and F32x4 Eq/Ne/Lt/Le. The third argument
// names the opcode emitted when the zero vector is the LEFT input: Eq and Ne
// map to themselves, while Lt and Le map to Gt and Ge respectively.
VISIT_SIMD_FCM(F64x2, Eq, Eq, 64)
VISIT_SIMD_FCM(F64x2, Ne, Ne, 64)
VISIT_SIMD_FCM(F64x2, Lt, Gt, 64)
VISIT_SIMD_FCM(F64x2, Le, Ge, 64)
VISIT_SIMD_FCM(F32x4, Eq, Eq, 32)
VISIT_SIMD_FCM(F32x4, Ne, Ne, 32)
VISIT_SIMD_FCM(F32x4, Lt, Gt, 32)
VISIT_SIMD_FCM(F32x4, Le, Ge, 32)
#undef VISIT_SIMD_FCM
void InstructionSelector::VisitS128Select(Node* node) {
Arm64OperandGenerator g(this);
Emit(kArm64S128Select, g.DefineSameAsFirst(node),

View File

@ -357,6 +357,16 @@ inline base::Vector<const int64_t> ValueHelper::GetVector() {
return int64_vector();
}
// GetVector<const float>: forwards to ValueHelper::float32_vector().
template <>
inline base::Vector<const float> ValueHelper::GetVector() {
return float32_vector();
}
// GetVector<const double>: forwards to ValueHelper::float64_vector().
template <>
inline base::Vector<const double> ValueHelper::GetVector() {
return float64_vector();
}
// Helper macros that can be used in FOR_INT32_INPUTS(i) { ... i ... }
// FOR_INPUTS expands to a range-based for header that declares `var` with type
// `ctype` and iterates over ValueHelper::<itype>_vector().
#define FOR_INPUTS(ctype, itype, var) \
for (ctype var : ::v8::internal::compiler::ValueHelper::itype##_vector())

View File

@ -351,6 +351,48 @@ WASM_SIMD_TEST(F32x4ConvertI32x4) {
}
}
// Checks a floating-point SIMD comparison in which one operand is a v128
// constant.
//
// For every representable x from ValueHelper::GetVector<FloatType>(), a
// function is built that splats its parameter into a local and compares it
// with a constant whose lanes all hold x: once with the constant as the left
// operand (result stored to global 0) and once as the right operand (result
// stored to global 1). The function is then called for every representable y
// and each lane of both globals is checked against expected_op(x, y) and
// expected_op(y, x) respectively.
//
//   cmp_opcode    comparison opcode under test.
//   splat_opcode  splat opcode matching FloatType.
//   expected_op   scalar reference producing the expected lane mask.
template <typename FloatType, typename ScalarType>
void RunF128CompareOpConstImmTest(
    TestExecutionTier execution_tier, WasmOpcode cmp_opcode,
    WasmOpcode splat_opcode, ScalarType (*expected_op)(FloatType, FloatType)) {
  const size_t float_lanes = kSimd128Size / sizeof(FloatType);
  const size_t result_lanes = kSimd128Size / sizeof(ScalarType);

  for (FloatType x : compiler::ValueHelper::GetVector<FloatType>()) {
    if (!PlatformCanRepresent(x)) continue;

    WasmRunner<int32_t, FloatType> r(execution_tier);
    // Global 0 receives the mask for "constant <op> splat", global 1 the mask
    // for "splat <op> constant".
    ScalarType* const_lhs_mask =
        r.builder().template AddGlobal<ScalarType>(kWasmS128);
    ScalarType* const_rhs_mask =
        r.builder().template AddGlobal<ScalarType>(kWasmS128);

    byte param_index = 0;
    byte splat_local = r.AllocateLocal(kWasmS128);

    // Replicate the bit pattern of x into every lane of the constant.
    uint8_t splat_bytes[kSimd128Size];
    for (size_t lane = 0; lane < float_lanes; ++lane) {
      memcpy(&splat_bytes[lane * sizeof(FloatType)], &x, sizeof(FloatType));
    }

    // Splat the parameter, compare it against the constant on both sides and
    // write the two result masks to the globals.
    BUILD(r,
          WASM_LOCAL_SET(splat_local, WASM_SIMD_OPN(splat_opcode,
                                                    WASM_LOCAL_GET(param_index))),
          WASM_GLOBAL_SET(
              0, WASM_SIMD_BINOP(cmp_opcode, WASM_SIMD_CONSTANT(splat_bytes),
                                 WASM_LOCAL_GET(splat_local))),
          WASM_GLOBAL_SET(
              1, WASM_SIMD_BINOP(cmp_opcode, WASM_LOCAL_GET(splat_local),
                                 WASM_SIMD_CONSTANT(splat_bytes))),
          WASM_ONE);

    for (FloatType y : compiler::ValueHelper::GetVector<FloatType>()) {
      if (!PlatformCanRepresent(y)) continue;
      // Model comparison as subtraction.
      FloatType diff = x - y;
      if (!PlatformCanRepresent(diff)) continue;

      r.Call(y);
      const ScalarType want_const_lhs = expected_op(x, y);
      const ScalarType want_const_rhs = expected_op(y, x);
      for (size_t lane = 0; lane < result_lanes; ++lane) {
        CHECK_EQ(want_const_lhs, LANE(const_lhs_mask, lane));
        CHECK_EQ(want_const_rhs, LANE(const_rhs_mask, lane));
      }
    }
  }
}
// Runs the F32x4 unary-op test helper for kExprF32x4Abs, passing std::abs
// (presumably the scalar reference implementation; the helper is not shown
// here).
WASM_SIMD_TEST(F32x4Abs) {
RunF32x4UnOpTest(execution_tier, kExprF32x4Abs, std::abs);
}
@ -470,6 +512,36 @@ void RunShiftAddTestSequence(TestExecutionTier execution_tier,
}
}
// F32x4 comparisons against a v128 constant operand, one test per comparison
// opcode (Eq, Ne, Gt, Ge, Lt, Le). Despite the "Zero" suffix, the helper
// RunF128CompareOpConstImmTest iterates over every value returned by
// ValueHelper::GetVector<float>() as the constant, so zero is one of the
// covered cases rather than the only one.
WASM_SIMD_TEST(F32x4EqZero) {
RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Eq,
kExprF32x4Splat, Equal);
}
WASM_SIMD_TEST(F32x4NeZero) {
RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Ne,
kExprF32x4Splat, NotEqual);
}
WASM_SIMD_TEST(F32x4GtZero) {
RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Gt,
kExprF32x4Splat, Greater);
}
WASM_SIMD_TEST(F32x4GeZero) {
RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Ge,
kExprF32x4Splat, GreaterEqual);
}
WASM_SIMD_TEST(F32x4LtZero) {
RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Lt,
kExprF32x4Splat, Less);
}
WASM_SIMD_TEST(F32x4LeZero) {
RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Le,
kExprF32x4Splat, LessEqual);
}
WASM_SIMD_TEST(I64x2Splat) {
WasmRunner<int32_t, int64_t> r(execution_tier);
// Set up a global to hold output vector.
@ -858,6 +930,36 @@ WASM_SIMD_TEST(F64x2Le) {
RunF64x2CompareOpTest(execution_tier, kExprF64x2Le, LessEqual);
}
// F64x2 comparisons against a v128 constant operand, one test per comparison
// opcode (Eq, Ne, Gt, Ge, Lt, Le). Despite the "Zero" suffix, the helper
// RunF128CompareOpConstImmTest iterates over every value returned by
// ValueHelper::GetVector<double>() as the constant, so zero is one of the
// covered cases rather than the only one.
WASM_SIMD_TEST(F64x2EqZero) {
RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Eq,
kExprF64x2Splat, Equal);
}
WASM_SIMD_TEST(F64x2NeZero) {
RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Ne,
kExprF64x2Splat, NotEqual);
}
WASM_SIMD_TEST(F64x2GtZero) {
RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Gt,
kExprF64x2Splat, Greater);
}
WASM_SIMD_TEST(F64x2GeZero) {
RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Ge,
kExprF64x2Splat, GreaterEqual);
}
WASM_SIMD_TEST(F64x2LtZero) {
RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Lt,
kExprF64x2Splat, Less);
}
WASM_SIMD_TEST(F64x2LeZero) {
RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Le,
kExprF64x2Splat, LessEqual);
}
// Runs the F64x2 binary-op test helper for kExprF64x2Min, passing JSMin
// (presumably the scalar reference implementation; the helper is not shown
// here).
WASM_SIMD_TEST(F64x2Min) {
RunF64x2BinOpTest(execution_tier, kExprF64x2Min, JSMin);
}

View File

@ -5454,6 +5454,91 @@ TEST_F(InstructionSelectorTest, PokePairPrepareArgumentsSimd128) {
expected_poke_pair, expected_poke);
}
// One parameter set for InstructionSelectorSIMDConstZeroFcmTest. Instances are
// brace-initialized positionally, so the field order must not change.
struct SIMDConstZeroFcmTest {
// True to leave the S128 constant all-zero; when false the test sets its
// first byte to 0xff.
const bool is_zero;
// Expected lane size, compared against LaneSizeField::decode() of the
// selected instruction's opcode.
const uint8_t lane_size;
// MachineOperatorBuilder member function returning the comparison operator
// under test.
const Operator* (MachineOperatorBuilder::*fcm_operator)();
// Expected arch opcode when the constant is the left input.
const ArchOpcode expected_op_left;
// Expected arch opcode when the constant is the right input.
const ArchOpcode expected_op_right;
// Expected number of instructions in the built stream.
const size_t size;
};
// Parameter table for the zero-constant floating-point compare selection test.
// Columns: is_zero, lane_size, operator, expected opcode with the constant on
// the left, expected opcode with the constant on the right, expected
// instruction count.
//  - is_zero rows expect a single instruction; for Lt/Le with the constant on
//    the left the expected opcode is the opposite comparison
//    (kArm64FGt/kArm64FGe).
//  - Non-zero rows expect two instructions (the test checks for
//    kArm64S128Const followed by the unmodified comparison).
static const SIMDConstZeroFcmTest SIMDConstZeroFcmTests[] = {
{true, 64, &MachineOperatorBuilder::F64x2Eq, kArm64FEq, kArm64FEq, 1},
{true, 64, &MachineOperatorBuilder::F64x2Ne, kArm64FNe, kArm64FNe, 1},
{true, 64, &MachineOperatorBuilder::F64x2Lt, kArm64FGt, kArm64FLt, 1},
{true, 64, &MachineOperatorBuilder::F64x2Le, kArm64FGe, kArm64FLe, 1},
{false, 64, &MachineOperatorBuilder::F64x2Eq, kArm64FEq, kArm64FEq, 2},
{false, 64, &MachineOperatorBuilder::F64x2Ne, kArm64FNe, kArm64FNe, 2},
{false, 64, &MachineOperatorBuilder::F64x2Lt, kArm64FLt, kArm64FLt, 2},
{false, 64, &MachineOperatorBuilder::F64x2Le, kArm64FLe, kArm64FLe, 2},
{true, 32, &MachineOperatorBuilder::F32x4Eq, kArm64FEq, kArm64FEq, 1},
{true, 32, &MachineOperatorBuilder::F32x4Ne, kArm64FNe, kArm64FNe, 1},
{true, 32, &MachineOperatorBuilder::F32x4Lt, kArm64FGt, kArm64FLt, 1},
{true, 32, &MachineOperatorBuilder::F32x4Le, kArm64FGe, kArm64FLe, 1},
{false, 32, &MachineOperatorBuilder::F32x4Eq, kArm64FEq, kArm64FEq, 2},
{false, 32, &MachineOperatorBuilder::F32x4Ne, kArm64FNe, kArm64FNe, 2},
{false, 32, &MachineOperatorBuilder::F32x4Lt, kArm64FLt, kArm64FLt, 2},
{false, 32, &MachineOperatorBuilder::F32x4Le, kArm64FLe, kArm64FLe, 2},
};
using InstructionSelectorSIMDConstZeroFcmTest =
    InstructionSelectorTestWithParam<SIMDConstZeroFcmTest>;

// Builds the parameterized comparison twice, with an S128 constant as the
// left and then as the right input, and checks the selected instructions.
//  - One expected instruction: it must be the expected compare opcode with a
//    single input.
//  - Otherwise: the stream must start with kArm64S128Const, followed by the
//    expected compare opcode taking two inputs.
TEST_P(InstructionSelectorSIMDConstZeroFcmTest, ConstZero) {
  const SIMDConstZeroFcmTest param = GetParam();
  // All-zero by default; a non-zero first byte makes the constant non-zero.
  byte const_bytes[16] = {};
  if (!param.is_zero) const_bytes[0] = 0xff;

  // Const node on the left
  {
    StreamBuilder m(this, MachineType::Simd128(), MachineType::Simd128());
    Node* constant = m.S128Const(const_bytes);
    Node* compare = m.AddNode((m.machine()->*param.fcm_operator)(), constant,
                              m.Parameter(0));
    m.Return(compare);
    Stream stream = m.Build();
    ASSERT_EQ(param.size, stream.size());
    if (param.size != 1) {
      EXPECT_EQ(kArm64S128Const, stream[0]->arch_opcode());
      const auto* cmp_instr = stream[1];
      EXPECT_EQ(param.expected_op_left, cmp_instr->arch_opcode());
      EXPECT_EQ(2U, cmp_instr->InputCount());
      EXPECT_EQ(1U, cmp_instr->OutputCount());
      EXPECT_EQ(param.lane_size, LaneSizeField::decode(cmp_instr->opcode()));
    } else {
      const auto* cmp_instr = stream[0];
      EXPECT_EQ(param.expected_op_left, cmp_instr->arch_opcode());
      EXPECT_EQ(1U, cmp_instr->InputCount());
      EXPECT_EQ(1U, cmp_instr->OutputCount());
      EXPECT_EQ(param.lane_size, LaneSizeField::decode(cmp_instr->opcode()));
    }
  }

  // Const node on the right
  {
    StreamBuilder m(this, MachineType::Simd128(), MachineType::Simd128());
    Node* constant = m.S128Const(const_bytes);
    Node* compare = m.AddNode((m.machine()->*param.fcm_operator)(),
                              m.Parameter(0), constant);
    m.Return(compare);
    Stream stream = m.Build();
    ASSERT_EQ(param.size, stream.size());
    if (param.size != 1) {
      EXPECT_EQ(kArm64S128Const, stream[0]->arch_opcode());
      const auto* cmp_instr = stream[1];
      EXPECT_EQ(param.expected_op_right, cmp_instr->arch_opcode());
      EXPECT_EQ(2U, cmp_instr->InputCount());
      EXPECT_EQ(1U, cmp_instr->OutputCount());
      EXPECT_EQ(param.lane_size, LaneSizeField::decode(cmp_instr->opcode()));
    } else {
      const auto* cmp_instr = stream[0];
      EXPECT_EQ(param.expected_op_right, cmp_instr->arch_opcode());
      EXPECT_EQ(1U, cmp_instr->InputCount());
      EXPECT_EQ(1U, cmp_instr->OutputCount());
      EXPECT_EQ(param.lane_size, LaneSizeField::decode(cmp_instr->opcode()));
    }
  }
}
// Run InstructionSelectorSIMDConstZeroFcmTest once per entry of
// SIMDConstZeroFcmTests.
INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSIMDConstZeroFcmTest,
::testing::ValuesIn(SIMDConstZeroFcmTests));
} // namespace
} // namespace compiler
} // namespace internal