[arm64][wasm-simd] Use Fcm(0) for floating point comparison with zero.

Use an immediate zero operand for floating point comparison nodes when
possible. This results in up to 20-25% runtime improvement in some
microbenchmarks, as well as 1-1.5% runtime improvement in some real-use
benchmarks on Cortex-A55 and Neoverse N1.

Change-Id: I39d10871a08a037dbe8c0877d789d110476e1a58
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3133143
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Commit-Queue: Martyn Capewell <martyn.capewell@arm.com>
Cr-Commit-Position: refs/heads/main@{#76749}

parent 7ad60c2784
commit 66bfcdcb43
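The win comes from folding the zero constant into the compare itself. A rough sketch of the codegen difference for an f32x4 greater-than against zero, in MacroAssembler-style calls (the register names and the exact pre-change sequence are illustrative assumptions, not taken from this CL):

  // Before (sketch): the zero vector had to be materialized in a register.
  __ Movi(v2.V16B(), 0);                   // splat zeroes into a temp
  __ Fcmgt(v0.V4S(), v1.V4S(), v2.V4S());  // per-lane v1 > v2

  // After (sketch): NEON compares have a zero-immediate encoding, so no
  // register is burned on the constant.
  __ Fcmgt(v0.V4S(), v1.V4S(), +0.0);      // per-lane v1 > 0.0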
@@ -2077,6 +2077,28 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                   i.InputSimd128Register(1).Format(f));                \
     break;                                                             \
   }
+#define SIMD_FCM_L_CASE(Op, ImmOp, RegOp)                              \
+  case Op: {                                                           \
+    VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode)); \
+    if (instr->InputCount() == 1) {                                    \
+      __ Fcm##ImmOp(i.OutputSimd128Register().Format(f),               \
+                    i.InputSimd128Register(0).Format(f), +0.0);        \
+    } else {                                                           \
+      __ Fcm##RegOp(i.OutputSimd128Register().Format(f),               \
+                    i.InputSimd128Register(1).Format(f),               \
+                    i.InputSimd128Register(0).Format(f));              \
+    }                                                                  \
+    break;                                                             \
+  }
+#define SIMD_FCM_G_CASE(Op, ImmOp)                                     \
+  case Op: {                                                           \
+    VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode)); \
+    /* Currently Gt/Ge instructions are only used with zero */         \
+    DCHECK_EQ(instr->InputCount(), 1);                                 \
+    __ Fcm##ImmOp(i.OutputSimd128Register().Format(f),                 \
+                  i.InputSimd128Register(0).Format(f), +0.0);          \
+    break;                                                             \
+  }
 #define SIMD_DESTRUCTIVE_BINOP_CASE(Op, Instr, FORMAT)                 \
   case Op: {                                                           \
     VRegister dst = i.OutputSimd128Register().V##FORMAT();             \
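To see what the macro buys, SIMD_FCM_L_CASE(kArm64FLt, lt, gt), used in the next hunk, expands to roughly the following (hand-expanded here for illustration):

  case kArm64FLt: {
    VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode));
    if (instr->InputCount() == 1) {
      // The selector dropped the zero operand: compare against +0.0.
      __ Fcmlt(i.OutputSimd128Register().Format(f),
               i.InputSimd128Register(0).Format(f), +0.0);
    } else {
      // Register form: only Fcmgt exists, so the operands are swapped.
      __ Fcmgt(i.OutputSimd128Register().Format(f),
               i.InputSimd128Register(1).Format(f),
               i.InputSimd128Register(0).Format(f));
    }
    break;
  }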
@@ -2192,29 +2214,23 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ Mov(dst, i.InputInt8(1), i.InputSimd128Register(2).Format(f), 0);
       break;
     }
-      SIMD_BINOP_LANE_SIZE_CASE(kArm64FEq, Fcmeq);
+      SIMD_FCM_L_CASE(kArm64FEq, eq, eq);
     case kArm64FNe: {
       VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode));
       VRegister dst = i.OutputSimd128Register().Format(f);
-      __ Fcmeq(dst, i.InputSimd128Register(0).Format(f),
-               i.InputSimd128Register(1).Format(f));
+      if (instr->InputCount() == 1) {
+        __ Fcmeq(dst, i.InputSimd128Register(0).Format(f), +0.0);
+      } else {
+        __ Fcmeq(dst, i.InputSimd128Register(0).Format(f),
+                 i.InputSimd128Register(1).Format(f));
+      }
       __ Mvn(dst, dst);
       break;
     }
-    case kArm64FLt: {
-      VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode));
-      __ Fcmgt(i.OutputSimd128Register().Format(f),
-               i.InputSimd128Register(1).Format(f),
-               i.InputSimd128Register(0).Format(f));
-      break;
-    }
-    case kArm64FLe: {
-      VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode));
-      __ Fcmge(i.OutputSimd128Register().Format(f),
-               i.InputSimd128Register(1).Format(f),
-               i.InputSimd128Register(0).Format(f));
-      break;
-    }
+      SIMD_FCM_L_CASE(kArm64FLt, lt, gt);
+      SIMD_FCM_L_CASE(kArm64FLe, le, ge);
+      SIMD_FCM_G_CASE(kArm64FGt, gt);
+      SIMD_FCM_G_CASE(kArm64FGe, ge);
       SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F64x2Qfma, Fmla, 2D);
       SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F64x2Qfms, Fmls, 2D);
     case kArm64F64x2Pmin: {
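Two details worth noting in this hunk. NEON has no lane-wise not-equal compare, so kArm64FNe is still computed as Fcmeq followed by Mvn; the change is only that the single-input case uses the zero-immediate Fcmeq. And the less-than/less-equal compares (FCMLT, FCMLE) exist only in their compare-with-zero encodings, which is why SIMD_FCM_L_CASE takes a separate RegOp: the two-register form falls back to FCMGT/FCMGE with swapped operands.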
@@ -210,6 +210,8 @@ namespace compiler {
   V(Arm64FNe)                      \
   V(Arm64FLt)                      \
   V(Arm64FLe)                      \
+  V(Arm64FGt)                      \
+  V(Arm64FGe)                      \
   V(Arm64F64x2Qfma)                \
   V(Arm64F64x2Qfms)                \
   V(Arm64F64x2Pmin)                \
@@ -170,6 +170,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kArm64FNe:
     case kArm64FLt:
     case kArm64FLe:
+    case kArm64FGt:
+    case kArm64FGe:
     case kArm64F64x2Qfma:
    case kArm64F64x2Qfms:
     case kArm64F64x2Pmin:
@@ -3538,19 +3538,11 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
   V(F64x2Add, kArm64FAdd, 64)              \
   V(F64x2Sub, kArm64FSub, 64)              \
   V(F64x2Div, kArm64FDiv, 64)              \
-  V(F64x2Eq, kArm64FEq, 64)                \
-  V(F64x2Ne, kArm64FNe, 64)                \
-  V(F64x2Lt, kArm64FLt, 64)                \
-  V(F64x2Le, kArm64FLe, 64)                \
   V(F32x4Min, kArm64FMin, 32)              \
   V(F32x4Max, kArm64FMax, 32)              \
   V(F32x4Add, kArm64FAdd, 32)              \
   V(F32x4Sub, kArm64FSub, 32)              \
   V(F32x4Div, kArm64FDiv, 32)              \
-  V(F32x4Eq, kArm64FEq, 32)                \
-  V(F32x4Ne, kArm64FNe, 32)                \
-  V(F32x4Lt, kArm64FLt, 32)                \
-  V(F32x4Le, kArm64FLe, 32)                \
   V(I64x2Sub, kArm64ISub, 64)              \
   V(I64x2Eq, kArm64IEq, 64)                \
   V(I64x2Ne, kArm64INe, 64)                \
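The eight floating point compares leave the generic SIMD_BINOP_LANE_SIZE_LIST here because they now get dedicated visitors (the VISIT_SIMD_FCM macro in the next hunk) that can recognize a constant-zero operand before falling back to the plain register-register emit.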
@@ -3951,6 +3943,44 @@ VISIT_SIMD_SUB(I32x4, 32)
 VISIT_SIMD_SUB(I16x8, 16)
 #undef VISIT_SIMD_SUB
 
+namespace {
+bool isSimdZero(Arm64OperandGenerator& g, Node* node) {
+  auto m = V128ConstMatcher(node);
+  if (m.HasResolvedValue()) {
+    auto imms = m.ResolvedValue().immediate();
+    return (std::all_of(imms.begin(), imms.end(), std::logical_not<uint8_t>()));
+  }
+  return node->opcode() == IrOpcode::kS128Zero;
+}
+}  // namespace
+
+#define VISIT_SIMD_FCM(Type, CmOp, CmOpposite, LaneSize)                   \
+  void InstructionSelector::Visit##Type##CmOp(Node* node) {                \
+    Arm64OperandGenerator g(this);                                         \
+    Node* left = node->InputAt(0);                                         \
+    Node* right = node->InputAt(1);                                        \
+    if (isSimdZero(g, left)) {                                             \
+      Emit(kArm64F##CmOpposite | LaneSizeField::encode(LaneSize),          \
+           g.DefineAsRegister(node), g.UseRegister(right));                \
+      return;                                                              \
+    } else if (isSimdZero(g, right)) {                                     \
+      Emit(kArm64F##CmOp | LaneSizeField::encode(LaneSize),                \
+           g.DefineAsRegister(node), g.UseRegister(left));                 \
+      return;                                                              \
+    }                                                                      \
+    VisitRRR(this, kArm64F##CmOp | LaneSizeField::encode(LaneSize), node); \
+  }
+
+VISIT_SIMD_FCM(F64x2, Eq, Eq, 64)
+VISIT_SIMD_FCM(F64x2, Ne, Ne, 64)
+VISIT_SIMD_FCM(F64x2, Lt, Gt, 64)
+VISIT_SIMD_FCM(F64x2, Le, Ge, 64)
+VISIT_SIMD_FCM(F32x4, Eq, Eq, 32)
+VISIT_SIMD_FCM(F32x4, Ne, Ne, 32)
+VISIT_SIMD_FCM(F32x4, Lt, Gt, 32)
+VISIT_SIMD_FCM(F32x4, Le, Ge, 32)
+#undef VISIT_SIMD_FCM
+
 void InstructionSelector::VisitS128Select(Node* node) {
   Arm64OperandGenerator g(this);
   Emit(kArm64S128Select, g.DefineSameAsFirst(node),
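As a usage sketch, in the style of the wasm test macros used later in this CL (a hypothetical snippet, not part of the change): a compare against an all-zeros v128 constant now matches isSimdZero, so the selector emits the opcode with a single input and the code generator picks the immediate form.

  // Hypothetical: selects kArm64FLt with one input, lowered to
  // fcmlt ..., #0.0 rather than a compare against a materialized zero.
  byte zeroes[kSimd128Size] = {0};
  BUILD(r,
        WASM_GLOBAL_SET(0, WASM_SIMD_BINOP(kExprF32x4Lt, WASM_LOCAL_GET(temp),
                                           WASM_SIMD_CONSTANT(zeroes))),
        WASM_ONE);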
@@ -357,6 +357,16 @@ inline base::Vector<const int64_t> ValueHelper::GetVector() {
   return int64_vector();
 }
 
+template <>
+inline base::Vector<const float> ValueHelper::GetVector() {
+  return float32_vector();
+}
+
+template <>
+inline base::Vector<const double> ValueHelper::GetVector() {
+  return float64_vector();
+}
+
 // Helper macros that can be used in FOR_INT32_INPUTS(i) { ... i ... }
 #define FOR_INPUTS(ctype, itype, var) \
   for (ctype var : ::v8::internal::compiler::ValueHelper::itype##_vector())
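These specializations exist so the new templated test below can iterate test inputs generically, e.g.:

  // FloatType is float or double; GetVector<FloatType>() now resolves.
  for (FloatType x : compiler::ValueHelper::GetVector<FloatType>()) {
    // ... run one case per interesting input value ...
  }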
@@ -351,6 +351,48 @@ WASM_SIMD_TEST(F32x4ConvertI32x4) {
   }
 }
 
+template <typename FloatType, typename ScalarType>
+void RunF128CompareOpConstImmTest(
+    TestExecutionTier execution_tier, WasmOpcode cmp_opcode,
+    WasmOpcode splat_opcode, ScalarType (*expected_op)(FloatType, FloatType)) {
+  for (FloatType x : compiler::ValueHelper::GetVector<FloatType>()) {
+    if (!PlatformCanRepresent(x)) continue;
+    WasmRunner<int32_t, FloatType> r(execution_tier);
+    // Set up globals to hold mask output for left and right cases
+    ScalarType* g1 = r.builder().template AddGlobal<ScalarType>(kWasmS128);
+    ScalarType* g2 = r.builder().template AddGlobal<ScalarType>(kWasmS128);
+    // Build fn to splat test values, perform compare op on both sides, and
+    // write the result.
+    byte value = 0;
+    byte temp = r.AllocateLocal(kWasmS128);
+    uint8_t const_buffer[kSimd128Size];
+    for (size_t i = 0; i < kSimd128Size / sizeof(FloatType); i++) {
+      memcpy(&const_buffer[i * sizeof(FloatType)], &x, sizeof(FloatType));
+    }
+    BUILD(r,
+          WASM_LOCAL_SET(temp,
+                         WASM_SIMD_OPN(splat_opcode, WASM_LOCAL_GET(value))),
+          WASM_GLOBAL_SET(
+              0, WASM_SIMD_BINOP(cmp_opcode, WASM_SIMD_CONSTANT(const_buffer),
+                                 WASM_LOCAL_GET(temp))),
+          WASM_GLOBAL_SET(1, WASM_SIMD_BINOP(cmp_opcode, WASM_LOCAL_GET(temp),
+                                             WASM_SIMD_CONSTANT(const_buffer))),
+          WASM_ONE);
+    for (FloatType y : compiler::ValueHelper::GetVector<FloatType>()) {
+      if (!PlatformCanRepresent(y)) continue;
+      FloatType diff = x - y;  // Model comparison as subtraction.
+      if (!PlatformCanRepresent(diff)) continue;
+      r.Call(y);
+      ScalarType expected1 = expected_op(x, y);
+      ScalarType expected2 = expected_op(y, x);
+      for (size_t i = 0; i < kSimd128Size / sizeof(ScalarType); i++) {
+        CHECK_EQ(expected1, LANE(g1, i));
+        CHECK_EQ(expected2, LANE(g2, i));
+      }
+    }
+  }
+}
+
 WASM_SIMD_TEST(F32x4Abs) {
   RunF32x4UnOpTest(execution_tier, kExprF32x4Abs, std::abs);
 }
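Note the helper writes two globals so that each call exercises both operand orders, constant on the left and constant on the right. That matters because the selector swaps Lt to Gt and Le to Ge when the zero is on the left, and both paths need coverage.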
@@ -470,6 +512,36 @@ void RunShiftAddTestSequence(TestExecutionTier execution_tier,
   }
 }
 
+WASM_SIMD_TEST(F32x4EqZero) {
+  RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Eq,
+                                               kExprF32x4Splat, Equal);
+}
+
+WASM_SIMD_TEST(F32x4NeZero) {
+  RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Ne,
+                                               kExprF32x4Splat, NotEqual);
+}
+
+WASM_SIMD_TEST(F32x4GtZero) {
+  RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Gt,
+                                               kExprF32x4Splat, Greater);
+}
+
+WASM_SIMD_TEST(F32x4GeZero) {
+  RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Ge,
+                                               kExprF32x4Splat, GreaterEqual);
+}
+
+WASM_SIMD_TEST(F32x4LtZero) {
+  RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Lt,
+                                               kExprF32x4Splat, Less);
+}
+
+WASM_SIMD_TEST(F32x4LeZero) {
+  RunF128CompareOpConstImmTest<float, int32_t>(execution_tier, kExprF32x4Le,
+                                               kExprF32x4Splat, LessEqual);
+}
+
 WASM_SIMD_TEST(I64x2Splat) {
   WasmRunner<int32_t, int64_t> r(execution_tier);
   // Set up a global to hold output vector.
@@ -858,6 +930,36 @@ WASM_SIMD_TEST(F64x2Le) {
   RunF64x2CompareOpTest(execution_tier, kExprF64x2Le, LessEqual);
 }
 
+WASM_SIMD_TEST(F64x2EqZero) {
+  RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Eq,
+                                                kExprF64x2Splat, Equal);
+}
+
+WASM_SIMD_TEST(F64x2NeZero) {
+  RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Ne,
+                                                kExprF64x2Splat, NotEqual);
+}
+
+WASM_SIMD_TEST(F64x2GtZero) {
+  RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Gt,
+                                                kExprF64x2Splat, Greater);
+}
+
+WASM_SIMD_TEST(F64x2GeZero) {
+  RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Ge,
+                                                kExprF64x2Splat, GreaterEqual);
+}
+
+WASM_SIMD_TEST(F64x2LtZero) {
+  RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Lt,
+                                                kExprF64x2Splat, Less);
+}
+
+WASM_SIMD_TEST(F64x2LeZero) {
+  RunF128CompareOpConstImmTest<double, int64_t>(execution_tier, kExprF64x2Le,
+                                                kExprF64x2Splat, LessEqual);
+}
+
 WASM_SIMD_TEST(F64x2Min) {
   RunF64x2BinOpTest(execution_tier, kExprF64x2Min, JSMin);
 }
@@ -5454,6 +5454,91 @@ TEST_F(InstructionSelectorTest, PokePairPrepareArgumentsSimd128) {
                           expected_poke_pair, expected_poke);
 }
 
+struct SIMDConstZeroFcmTest {
+  const bool is_zero;
+  const uint8_t lane_size;
+  const Operator* (MachineOperatorBuilder::*fcm_operator)();
+  const ArchOpcode expected_op_left;
+  const ArchOpcode expected_op_right;
+  const size_t size;
+};
+
+static const SIMDConstZeroFcmTest SIMDConstZeroFcmTests[] = {
+    {true, 64, &MachineOperatorBuilder::F64x2Eq, kArm64FEq, kArm64FEq, 1},
+    {true, 64, &MachineOperatorBuilder::F64x2Ne, kArm64FNe, kArm64FNe, 1},
+    {true, 64, &MachineOperatorBuilder::F64x2Lt, kArm64FGt, kArm64FLt, 1},
+    {true, 64, &MachineOperatorBuilder::F64x2Le, kArm64FGe, kArm64FLe, 1},
+    {false, 64, &MachineOperatorBuilder::F64x2Eq, kArm64FEq, kArm64FEq, 2},
+    {false, 64, &MachineOperatorBuilder::F64x2Ne, kArm64FNe, kArm64FNe, 2},
+    {false, 64, &MachineOperatorBuilder::F64x2Lt, kArm64FLt, kArm64FLt, 2},
+    {false, 64, &MachineOperatorBuilder::F64x2Le, kArm64FLe, kArm64FLe, 2},
+    {true, 32, &MachineOperatorBuilder::F32x4Eq, kArm64FEq, kArm64FEq, 1},
+    {true, 32, &MachineOperatorBuilder::F32x4Ne, kArm64FNe, kArm64FNe, 1},
+    {true, 32, &MachineOperatorBuilder::F32x4Lt, kArm64FGt, kArm64FLt, 1},
+    {true, 32, &MachineOperatorBuilder::F32x4Le, kArm64FGe, kArm64FLe, 1},
+    {false, 32, &MachineOperatorBuilder::F32x4Eq, kArm64FEq, kArm64FEq, 2},
+    {false, 32, &MachineOperatorBuilder::F32x4Ne, kArm64FNe, kArm64FNe, 2},
+    {false, 32, &MachineOperatorBuilder::F32x4Lt, kArm64FLt, kArm64FLt, 2},
+    {false, 32, &MachineOperatorBuilder::F32x4Le, kArm64FLe, kArm64FLe, 2},
+};
+
+using InstructionSelectorSIMDConstZeroFcmTest =
+    InstructionSelectorTestWithParam<SIMDConstZeroFcmTest>;
+
+TEST_P(InstructionSelectorSIMDConstZeroFcmTest, ConstZero) {
+  const SIMDConstZeroFcmTest param = GetParam();
+  byte data[16] = {};
+  if (!param.is_zero) data[0] = 0xff;
+  // Const node on the left
+  {
+    StreamBuilder m(this, MachineType::Simd128(), MachineType::Simd128());
+    Node* cnst = m.S128Const(data);
+    Node* fcm =
+        m.AddNode((m.machine()->*param.fcm_operator)(), cnst, m.Parameter(0));
+    m.Return(fcm);
+    Stream s = m.Build();
+    ASSERT_EQ(param.size, s.size());
+    if (param.size == 1) {
+      EXPECT_EQ(param.expected_op_left, s[0]->arch_opcode());
+      EXPECT_EQ(1U, s[0]->InputCount());
+      EXPECT_EQ(1U, s[0]->OutputCount());
+      EXPECT_EQ(param.lane_size, LaneSizeField::decode(s[0]->opcode()));
+    } else {
+      EXPECT_EQ(kArm64S128Const, s[0]->arch_opcode());
+      EXPECT_EQ(param.expected_op_left, s[1]->arch_opcode());
+      EXPECT_EQ(2U, s[1]->InputCount());
+      EXPECT_EQ(1U, s[1]->OutputCount());
+      EXPECT_EQ(param.lane_size, LaneSizeField::decode(s[1]->opcode()));
+    }
+  }
+  // Const node on the right
+  {
+    StreamBuilder m(this, MachineType::Simd128(), MachineType::Simd128());
+    Node* cnst = m.S128Const(data);
+    Node* fcm =
+        m.AddNode((m.machine()->*param.fcm_operator)(), m.Parameter(0), cnst);
+    m.Return(fcm);
+    Stream s = m.Build();
+    ASSERT_EQ(param.size, s.size());
+    if (param.size == 1) {
+      EXPECT_EQ(param.expected_op_right, s[0]->arch_opcode());
+      EXPECT_EQ(1U, s[0]->InputCount());
+      EXPECT_EQ(1U, s[0]->OutputCount());
+      EXPECT_EQ(param.lane_size, LaneSizeField::decode(s[0]->opcode()));
+    } else {
+      EXPECT_EQ(kArm64S128Const, s[0]->arch_opcode());
+      EXPECT_EQ(param.expected_op_right, s[1]->arch_opcode());
+      EXPECT_EQ(2U, s[1]->InputCount());
+      EXPECT_EQ(1U, s[1]->OutputCount());
+      EXPECT_EQ(param.lane_size, LaneSizeField::decode(s[1]->opcode()));
+    }
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
+                         InstructionSelectorSIMDConstZeroFcmTest,
+                         ::testing::ValuesIn(SIMDConstZeroFcmTests));
+
 }  // namespace
 }  // namespace compiler
 }  // namespace internal
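One reading aid for the parameterized test: size is the expected instruction count, 1 when the all-zero constant is folded into the compare and 2 when a non-zero constant still needs a separate kArm64S128Const. expected_op_left differs from expected_op_right only for Lt and Le, where a zero on the left makes the selector emit the swapped Gt/Ge opcode.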