[relaxed-simd][arm64] Optimize DotAdd instruction
- Add CPU detection for Dot product instructions
- Use sdot for I32x4DotI8x16AddS operation

Bug: v8:13197
Change-Id: Ie3d52a7625246abaa371c2f4f4bdc8907d6889ee
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4190522
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Commit-Queue: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/main@{#85526}
This commit is contained in:
parent
76a817e03a
commit
74085b2af4
@ -404,6 +404,7 @@ CPU::CPU()
|
||||
has_vfp3_(false),
|
||||
has_vfp3_d32_(false),
|
||||
has_jscvt_(false),
|
||||
has_dot_prod_(false),
|
||||
is_fp64_mode_(false),
|
||||
has_non_stop_time_stamp_counter_(false),
|
||||
is_running_in_vm_(false),
|
||||
@ -726,20 +727,27 @@ CPU::CPU()
|
||||
#if !defined(PF_ARM_V83_JSCVT_INSTRUCTIONS_AVAILABLE)
|
||||
constexpr int PF_ARM_V83_JSCVT_INSTRUCTIONS_AVAILABLE = 44;
|
||||
#endif
|
||||
#if !defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
|
||||
constexpr int PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE = 43;
|
||||
#endif
|
||||
|
||||
has_jscvt_ =
|
||||
IsProcessorFeaturePresent(PF_ARM_V83_JSCVT_INSTRUCTIONS_AVAILABLE);
|
||||
has_dot_prod_ =
|
||||
IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE);
|
||||
|
||||
#elif V8_OS_LINUX
|
||||
// Try to extract the list of CPU features from ELF hwcaps.
|
||||
uint32_t hwcaps = ReadELFHWCaps();
|
||||
if (hwcaps != 0) {
|
||||
has_jscvt_ = (hwcaps & HWCAP_JSCVT) != 0;
|
||||
has_dot_prod_ = (hwcaps & HWCAP_ASIMDDP) != 0;
|
||||
} else {
|
||||
// Try to fallback to "Features" CPUInfo field
|
||||
CPUInfo cpu_info;
|
||||
char* features = cpu_info.ExtractField("Features");
|
||||
has_jscvt_ = HasListItem(features, "jscvt");
|
||||
has_dot_prod_ = HasListItem(features, "asimddp");
|
||||
delete[] features;
|
||||
}
|
||||
#elif V8_OS_DARWIN
|
||||
@ -752,9 +760,18 @@ CPU::CPU()
|
||||
} else {
|
||||
has_jscvt_ = feat_jscvt;
|
||||
}
|
||||
int64_t feat_dot_prod = 0;
|
||||
size_t feat_dot_prod_size = sizeof(feat_dot_prod);
|
||||
if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &feat_dot_prod,
|
||||
&feat_dot_prod_size, nullptr, 0) == -1) {
|
||||
has_dot_prod_ = false;
|
||||
} else {
|
||||
has_dot_prod_ = feat_dot_prod;
|
||||
}
|
||||
#else
|
||||
// ARM64 Macs always have JSCVT.
|
||||
// ARM64 Macs always have JSCVT and ASIMDDP
|
||||
has_jscvt_ = true;
|
||||
has_dot_prod_ = true;
|
||||
#endif // V8_OS_IOS
|
||||
#endif // V8_OS_WIN
|
||||
|
||||
|
@ -123,6 +123,7 @@ class V8_BASE_EXPORT CPU final {
|
||||
bool has_vfp3() const { return has_vfp3_; }
|
||||
bool has_vfp3_d32() const { return has_vfp3_d32_; }
|
||||
bool has_jscvt() const { return has_jscvt_; }
|
||||
bool has_dot_prod() const { return has_dot_prod_; }
|
||||
|
||||
// mips features
|
||||
bool is_fp64_mode() const { return is_fp64_mode_; }
|
||||
@ -176,6 +177,7 @@ class V8_BASE_EXPORT CPU final {
|
||||
bool has_vfp3_;
|
||||
bool has_vfp3_d32_;
|
||||
bool has_jscvt_;
|
||||
bool has_dot_prod_;
|
||||
bool is_fp64_mode_;
|
||||
bool has_non_stop_time_stamp_counter_;
|
||||
bool is_running_in_vm_;
|
||||
|
@ -66,6 +66,9 @@ constexpr unsigned CpuFeaturesFromCompiler() {
|
||||
unsigned features = 0;
|
||||
#if defined(__ARM_FEATURE_JCVT)
|
||||
features |= 1u << JSCVT;
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
features |= 1u << DOTPROD;
|
||||
#endif
|
||||
return features;
|
||||
}
|
||||
@ -75,6 +78,7 @@ constexpr unsigned CpuFeaturesFromTargetOS() {
|
||||
#if defined(V8_TARGET_OS_MACOS) && !defined(V8_TARGET_OS_IOS)
|
||||
// TODO(v8:13004): Detect if an iPhone is new enough to support jscvt.
|
||||
features |= 1u << JSCVT;
|
||||
features |= 1u << DOTPROD;
|
||||
#endif
|
||||
return features;
|
||||
}
|
||||
@ -106,6 +110,9 @@ void CpuFeatures::ProbeImpl(bool cross_compile) {
|
||||
if (cpu.has_jscvt()) {
|
||||
runtime |= 1u << JSCVT;
|
||||
}
|
||||
if (cpu.has_dot_prod()) {
|
||||
runtime |= 1u << DOTPROD;
|
||||
}
|
||||
|
||||
// Use the best of the features found by CPU detection and those inferred from
|
||||
// the build system.
|
||||
@ -1424,6 +1431,14 @@ void Assembler::stlxrh(const Register& rs, const Register& rt,
|
||||
Emit(STLXR_h | Rs(rs) | Rt2(x31) | RnSP(rn) | Rt(rt));
|
||||
}
|
||||
|
||||
// Emits NEON_SDOT with the NEON_Q bit set, using vm, vn and vd as the Rm, Rn
// and Rd fields. DCHECKs that DOTPROD is supported, that vd is 4S, and that
// vn and vm are both 16B.
void Assembler::sdot(const VRegister& vd, const VRegister& vn,
|
||||
const VRegister& vm) {
|
||||
DCHECK(CpuFeatures::IsSupported(DOTPROD));
|
||||
DCHECK(vn.Is16B() && vd.Is4S());
|
||||
DCHECK(AreSameFormat(vn, vm));
|
||||
Emit(NEON_Q | NEON_SDOT | Rm(vm) | Rn(vn) | Rd(vd));
|
||||
}
|
||||
|
||||
void Assembler::NEON3DifferentL(const VRegister& vd, const VRegister& vn,
|
||||
const VRegister& vm, NEON3DifferentOp vop) {
|
||||
DCHECK(AreSameFormat(vn, vm));
|
||||
|
@ -1219,6 +1219,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
|
||||
// Signed minimum across vector.
|
||||
void sminv(const VRegister& vd, const VRegister& vn);
|
||||
|
||||
// Signed dot product
|
||||
void sdot(const VRegister& vd, const VRegister& vn, const VRegister& vm);
|
||||
|
||||
// One-element structure store from one register.
|
||||
void st1(const VRegister& vt, const MemOperand& src);
|
||||
|
||||
|
@ -1653,6 +1653,7 @@ constexpr NEON3SameOp NEON_BSL = NEON3SameLogicalFixed | 0x20400000;
|
||||
// NEON instructions with three different-type operands.
|
||||
using NEON3DifferentOp = uint32_t;
|
||||
constexpr NEON3DifferentOp NEON3DifferentFixed = 0x0E200000;
|
||||
constexpr NEON3DifferentOp NEON3DifferentDot = 0x0E800000;
|
||||
constexpr NEON3DifferentOp NEON3DifferentFMask = 0x9F200C00;
|
||||
constexpr NEON3DifferentOp NEON3DifferentMask = 0xFF20FC00;
|
||||
constexpr NEON3DifferentOp NEON_ADDHN = NEON3DifferentFixed | 0x00004000;
|
||||
@ -1671,6 +1672,7 @@ constexpr NEON3DifferentOp NEON_SADDL = NEON3DifferentFixed | 0x00000000;
|
||||
constexpr NEON3DifferentOp NEON_SADDL2 = NEON_SADDL | NEON_Q;
|
||||
constexpr NEON3DifferentOp NEON_SADDW = NEON3DifferentFixed | 0x00001000;
|
||||
constexpr NEON3DifferentOp NEON_SADDW2 = NEON_SADDW | NEON_Q;
|
||||
constexpr NEON3DifferentOp NEON_SDOT = NEON3DifferentDot | 0x00009400;
|
||||
constexpr NEON3DifferentOp NEON_SMLAL = NEON3DifferentFixed | 0x00008000;
|
||||
constexpr NEON3DifferentOp NEON_SMLAL2 = NEON_SMLAL | NEON_Q;
|
||||
constexpr NEON3DifferentOp NEON_SMLSL = NEON3DifferentFixed | 0x0000A000;
|
||||
|
@ -407,6 +407,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
|
||||
V(saddl, Saddl) \
|
||||
V(saddw2, Saddw2) \
|
||||
V(saddw, Saddw) \
|
||||
V(sdot, Sdot) \
|
||||
V(shadd, Shadd) \
|
||||
V(shsub, Shsub) \
|
||||
V(smaxp, Smaxp) \
|
||||
|
@ -43,6 +43,7 @@ enum CpuFeature {
|
||||
|
||||
#elif V8_TARGET_ARCH_ARM64
|
||||
JSCVT,
|
||||
DOTPROD,
|
||||
|
||||
#elif V8_TARGET_ARCH_MIPS64
|
||||
FPU,
|
||||
|
@ -2508,17 +2508,25 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
break;
|
||||
}
|
||||
case kArm64I32x4DotI8x16AddS: {
|
||||
UseScratchRegisterScope scope(tasm());
|
||||
VRegister lhs = i.InputSimd128Register(0);
|
||||
VRegister rhs = i.InputSimd128Register(1);
|
||||
VRegister tmp1 = scope.AcquireV(kFormat8H);
|
||||
VRegister tmp2 = scope.AcquireV(kFormat8H);
|
||||
__ Smull(tmp1, lhs.V8B(), rhs.V8B());
|
||||
__ Smull2(tmp2, lhs.V16B(), rhs.V16B());
|
||||
__ Addp(tmp1, tmp1, tmp2);
|
||||
__ Saddlp(tmp1.V4S(), tmp1);
|
||||
__ Add(i.OutputSimd128Register().V4S(), tmp1.V4S(),
|
||||
i.InputSimd128Register(2).V4S());
|
||||
if (CpuFeatures::IsSupported(DOTPROD)) {
|
||||
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(2));
|
||||
__ Sdot(i.InputSimd128Register(2).V4S(),
|
||||
i.InputSimd128Register(0).V16B(),
|
||||
i.InputSimd128Register(1).V16B());
|
||||
|
||||
} else {
|
||||
UseScratchRegisterScope scope(tasm());
|
||||
VRegister lhs = i.InputSimd128Register(0);
|
||||
VRegister rhs = i.InputSimd128Register(1);
|
||||
VRegister tmp1 = scope.AcquireV(kFormat8H);
|
||||
VRegister tmp2 = scope.AcquireV(kFormat8H);
|
||||
__ Smull(tmp1, lhs.V8B(), rhs.V8B());
|
||||
__ Smull2(tmp2, lhs.V16B(), rhs.V16B());
|
||||
__ Addp(tmp1, tmp1, tmp2);
|
||||
__ Saddlp(tmp1.V4S(), tmp1);
|
||||
__ Add(i.OutputSimd128Register().V4S(), tmp1.V4S(),
|
||||
i.InputSimd128Register(2).V4S());
|
||||
}
|
||||
break;
|
||||
}
|
||||
case kArm64IExtractLaneU: {
|
||||
|
@ -3886,9 +3886,11 @@ void InstructionSelector::VisitS128Zero(Node* node) {
|
||||
|
||||
// Selects kArm64I32x4DotI8x16AddS for |node|, passing node inputs 0, 1 and 2
// as register operands.
void InstructionSelector::VisitI32x4DotI8x16I7x16AddS(Node* node) {
|
||||
Arm64OperandGenerator g(this);
|
||||
Emit(
|
||||
kArm64I32x4DotI8x16AddS, g.DefineAsRegister(node), g.UseRegister(node->InputAt(0)),
|
||||
g.UseRegister(node->InputAt(1)), g.UseRegister(node->InputAt(2)));
|
||||
// When DOTPROD is supported, the code generator's Sdot path writes its result
// into input 2's register and DCHECKs that the output register equals it, so
// the output is defined as same-as-input 2. Otherwise the output is defined
// as a plain register.
InstructionOperand output = CpuFeatures::IsSupported(DOTPROD)
|
||||
? g.DefineSameAsInput(node, 2)
|
||||
: g.DefineAsRegister(node);
|
||||
Emit(kArm64I32x4DotI8x16AddS, output, g.UseRegister(node->InputAt(0)),
|
||||
g.UseRegister(node->InputAt(1)), g.UseRegister(node->InputAt(2)));
|
||||
}
|
||||
|
||||
#define SIMD_VISIT_EXTRACT_LANE(Type, T, Sign, LaneSize) \
|
||||
|
Loading…
Reference in New Issue
Block a user