[relaxed-simd][arm64] Optimize DotAdd instruction

- Add CPU detection for Dot product instructions
- Use sdot for I32x4DotI8x16AddS operation

Bug: v8:13197
Change-Id: Ie3d52a7625246abaa371c2f4f4bdc8907d6889ee
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4190522
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Commit-Queue: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/main@{#85526}
This commit is contained in:
Deepti Gandluri 2023-01-27 10:58:13 -08:00 committed by V8 LUCI CQ
parent 76a817e03a
commit 74085b2af4
9 changed files with 66 additions and 15 deletions

View File

@ -404,6 +404,7 @@ CPU::CPU()
has_vfp3_(false),
has_vfp3_d32_(false),
has_jscvt_(false),
has_dot_prod_(false),
is_fp64_mode_(false),
has_non_stop_time_stamp_counter_(false),
is_running_in_vm_(false),
@ -726,20 +727,27 @@ CPU::CPU()
#if !defined(PF_ARM_V83_JSCVT_INSTRUCTIONS_AVAILABLE)
constexpr int PF_ARM_V83_JSCVT_INSTRUCTIONS_AVAILABLE = 44;
#endif
#if !defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
constexpr int PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE = 43;
#endif
has_jscvt_ =
IsProcessorFeaturePresent(PF_ARM_V83_JSCVT_INSTRUCTIONS_AVAILABLE);
has_dot_prod_ =
IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE);
#elif V8_OS_LINUX
// Try to extract the list of CPU features from ELF hwcaps.
uint32_t hwcaps = ReadELFHWCaps();
if (hwcaps != 0) {
has_jscvt_ = (hwcaps & HWCAP_JSCVT) != 0;
has_dot_prod_ = (hwcaps & HWCAP_ASIMDDP) != 0;
} else {
// Try to fallback to "Features" CPUInfo field
CPUInfo cpu_info;
char* features = cpu_info.ExtractField("Features");
has_jscvt_ = HasListItem(features, "jscvt");
has_dot_prod_ = HasListItem(features, "asimddp");
delete[] features;
}
#elif V8_OS_DARWIN
@ -752,9 +760,18 @@ CPU::CPU()
} else {
has_jscvt_ = feat_jscvt;
}
int64_t feat_dot_prod = 0;
size_t feat_dot_prod_size = sizeof(feat_dot_prod);
if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &feat_dot_prod,
&feat_dot_prod_size, nullptr, 0) == -1) {
has_dot_prod_ = false;
} else {
has_dot_prod_ = feat_dot_prod;
}
#else
// ARM64 Macs always have JSCVT.
// ARM64 Macs always have JSCVT and ASIMDDP
has_jscvt_ = true;
has_dot_prod_ = true;
#endif // V8_OS_IOS
#endif // V8_OS_WIN

View File

@ -123,6 +123,7 @@ class V8_BASE_EXPORT CPU final {
bool has_vfp3() const { return has_vfp3_; }
bool has_vfp3_d32() const { return has_vfp3_d32_; }
bool has_jscvt() const { return has_jscvt_; }
bool has_dot_prod() const { return has_dot_prod_; }
// mips features
bool is_fp64_mode() const { return is_fp64_mode_; }
@ -176,6 +177,7 @@ class V8_BASE_EXPORT CPU final {
bool has_vfp3_;
bool has_vfp3_d32_;
bool has_jscvt_;
bool has_dot_prod_;
bool is_fp64_mode_;
bool has_non_stop_time_stamp_counter_;
bool is_running_in_vm_;

View File

@ -66,6 +66,9 @@ constexpr unsigned CpuFeaturesFromCompiler() {
unsigned features = 0;
#if defined(__ARM_FEATURE_JCVT)
features |= 1u << JSCVT;
#endif
#if defined(__ARM_FEATURE_DOTPROD)
features |= 1u << DOTPROD;
#endif
return features;
}
@ -75,6 +78,7 @@ constexpr unsigned CpuFeaturesFromTargetOS() {
#if defined(V8_TARGET_OS_MACOS) && !defined(V8_TARGET_OS_IOS)
// TODO(v8:13004): Detect if an iPhone is new enough to support jscvt.
features |= 1u << JSCVT;
features |= 1u << DOTPROD;
#endif
return features;
}
@ -106,6 +110,9 @@ void CpuFeatures::ProbeImpl(bool cross_compile) {
if (cpu.has_jscvt()) {
runtime |= 1u << JSCVT;
}
if (cpu.has_dot_prod()) {
runtime |= 1u << DOTPROD;
}
// Use the best of the features found by CPU detection and those inferred from
// the build system.
@ -1424,6 +1431,14 @@ void Assembler::stlxrh(const Register& rs, const Register& rt,
Emit(STLXR_h | Rs(rs) | Rt2(x31) | RnSP(rn) | Rt(rt));
}
// Emits a NEON SDOT (signed dot product) instruction with destination vd and
// source operands vn and vm.
//
// Preconditions (debug-checked only):
//   - the DOTPROD CPU feature is reported as supported,
//   - vd is in 4S format and vn is in 16B format,
//   - vn and vm have the same format.
void Assembler::sdot(const VRegister& vd, const VRegister& vn,
const VRegister& vm) {
DCHECK(CpuFeatures::IsSupported(DOTPROD));
DCHECK(vn.Is16B() && vd.Is4S());
DCHECK(AreSameFormat(vn, vm));
// NEON_Q is set unconditionally: only the 16B/4S operand form is accepted by
// the checks above, so no other size variant is ever encoded here.
Emit(NEON_Q | NEON_SDOT | Rm(vm) | Rn(vn) | Rd(vd));
}
void Assembler::NEON3DifferentL(const VRegister& vd, const VRegister& vn,
const VRegister& vm, NEON3DifferentOp vop) {
DCHECK(AreSameFormat(vn, vm));

View File

@ -1219,6 +1219,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
// Signed minimum across vector.
void sminv(const VRegister& vd, const VRegister& vn);
// Signed dot product (vd: 4S; vn, vm: 16B). Requires the DOTPROD feature.
void sdot(const VRegister& vd, const VRegister& vn, const VRegister& vm);
// One-element structure store from one register.
void st1(const VRegister& vt, const MemOperand& src);

View File

@ -1653,6 +1653,7 @@ constexpr NEON3SameOp NEON_BSL = NEON3SameLogicalFixed | 0x20400000;
// NEON instructions with three different-type operands.
using NEON3DifferentOp = uint32_t;
constexpr NEON3DifferentOp NEON3DifferentFixed = 0x0E200000;
constexpr NEON3DifferentOp NEON3DifferentDot = 0x0E800000;
constexpr NEON3DifferentOp NEON3DifferentFMask = 0x9F200C00;
constexpr NEON3DifferentOp NEON3DifferentMask = 0xFF20FC00;
constexpr NEON3DifferentOp NEON_ADDHN = NEON3DifferentFixed | 0x00004000;
@ -1671,6 +1672,7 @@ constexpr NEON3DifferentOp NEON_SADDL = NEON3DifferentFixed | 0x00000000;
constexpr NEON3DifferentOp NEON_SADDL2 = NEON_SADDL | NEON_Q;
constexpr NEON3DifferentOp NEON_SADDW = NEON3DifferentFixed | 0x00001000;
constexpr NEON3DifferentOp NEON_SADDW2 = NEON_SADDW | NEON_Q;
constexpr NEON3DifferentOp NEON_SDOT = NEON3DifferentDot | 0x00009400;
constexpr NEON3DifferentOp NEON_SMLAL = NEON3DifferentFixed | 0x00008000;
constexpr NEON3DifferentOp NEON_SMLAL2 = NEON_SMLAL | NEON_Q;
constexpr NEON3DifferentOp NEON_SMLSL = NEON3DifferentFixed | 0x0000A000;

View File

@ -407,6 +407,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
V(saddl, Saddl) \
V(saddw2, Saddw2) \
V(saddw, Saddw) \
V(sdot, Sdot) \
V(shadd, Shadd) \
V(shsub, Shsub) \
V(smaxp, Smaxp) \

View File

@ -43,6 +43,7 @@ enum CpuFeature {
#elif V8_TARGET_ARCH_ARM64
JSCVT,
DOTPROD,
#elif V8_TARGET_ARCH_MIPS64
FPU,

View File

@ -2508,17 +2508,25 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kArm64I32x4DotI8x16AddS: {
// Code generation for the I32x4 dot-product-with-accumulate instruction.
// Inputs 0 and 1 are the byte vectors; input 2 is the 4S accumulator.
//
// NOTE(review): this hunk is rendered without diff markers. The unconditional
// sequence directly below appears to be the pre-change code, and the if/else
// that follows its replacement (whose else-branch repeats the same sequence)
// - confirm against the repository before relying on this text as code.
UseScratchRegisterScope scope(tasm());
VRegister lhs = i.InputSimd128Register(0);
VRegister rhs = i.InputSimd128Register(1);
VRegister tmp1 = scope.AcquireV(kFormat8H);
VRegister tmp2 = scope.AcquireV(kFormat8H);
// Multiply the low (V8B) and high (V16B via Smull2) byte halves into 8H
// temporaries, pairwise-add them, widen pairwise into 4S, then add input 2.
__ Smull(tmp1, lhs.V8B(), rhs.V8B());
__ Smull2(tmp2, lhs.V16B(), rhs.V16B());
__ Addp(tmp1, tmp1, tmp2);
__ Saddlp(tmp1.V4S(), tmp1);
__ Add(i.OutputSimd128Register().V4S(), tmp1.V4S(),
i.InputSimd128Register(2).V4S());
if (CpuFeatures::IsSupported(DOTPROD)) {
// Single-instruction path: Sdot is given input 2's register as its
// destination, so the output must have been allocated to that same register;
// the DCHECK verifies that constraint.
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(2));
__ Sdot(i.InputSimd128Register(2).V4S(),
i.InputSimd128Register(0).V16B(),
i.InputSimd128Register(1).V16B());
} else {
// Fallback without DOTPROD: multiply/pairwise-add sequence using two scratch
// vector registers, finishing with an explicit add of input 2.
UseScratchRegisterScope scope(tasm());
VRegister lhs = i.InputSimd128Register(0);
VRegister rhs = i.InputSimd128Register(1);
VRegister tmp1 = scope.AcquireV(kFormat8H);
VRegister tmp2 = scope.AcquireV(kFormat8H);
__ Smull(tmp1, lhs.V8B(), rhs.V8B());
__ Smull2(tmp2, lhs.V16B(), rhs.V16B());
__ Addp(tmp1, tmp1, tmp2);
__ Saddlp(tmp1.V4S(), tmp1);
__ Add(i.OutputSimd128Register().V4S(), tmp1.V4S(),
i.InputSimd128Register(2).V4S());
}
break;
}
case kArm64IExtractLaneU: {

View File

@ -3886,9 +3886,11 @@ void InstructionSelector::VisitS128Zero(Node* node) {
// Selects kArm64I32x4DotI8x16AddS for the given node, passing the node's
// three inputs in registers.
void InstructionSelector::VisitI32x4DotI8x16I7x16AddS(Node* node) {
Arm64OperandGenerator g(this);
// NOTE(review): this hunk is rendered without diff markers. The Emit call
// directly below (output always defined as a fresh register) appears to be
// the pre-change statement, superseded by the `output`-based Emit further
// down - confirm against the repository.
Emit(
kArm64I32x4DotI8x16AddS, g.DefineAsRegister(node), g.UseRegister(node->InputAt(0)),
g.UseRegister(node->InputAt(1)), g.UseRegister(node->InputAt(2)));
// With DOTPROD the output is constrained to the same register as input 2,
// which is what the code generator's Sdot path asserts; without DOTPROD the
// output may be any register.
InstructionOperand output = CpuFeatures::IsSupported(DOTPROD)
? g.DefineSameAsInput(node, 2)
: g.DefineAsRegister(node);
Emit(kArm64I32x4DotI8x16AddS, output, g.UseRegister(node->InputAt(0)),
g.UseRegister(node->InputAt(1)), g.UseRegister(node->InputAt(2)));
}
#define SIMD_VISIT_EXTRACT_LANE(Type, T, Sign, LaneSize) \