From f3ba901f71457bda5a1c4b80f2329c340465d861 Mon Sep 17 00:00:00 2001 From: Jamie Reece Wilson Date: Sun, 5 May 2024 19:42:10 +0100 Subject: [PATCH] [+] Zen3 on top of AlderLake optimizations [*] Minor alderlake adjustments --- Source/Threading/AuSleep.cpp | 79 +++++++++++++++++++++--- Source/Threading/Primitives/SMTYield.cpp | 6 ++ Source/Threading/Primitives/SMTYield.hpp | 64 ++++++++++++++----- 3 files changed, 125 insertions(+), 24 deletions(-) diff --git a/Source/Threading/AuSleep.cpp b/Source/Threading/AuSleep.cpp index d126dca8..fe923969 100644 --- a/Source/Threading/AuSleep.cpp +++ b/Source/Threading/AuSleep.cpp @@ -111,12 +111,43 @@ namespace Aurora::Threading { if (uEndTimeSteadyNS2 - uNowNS <= 100000ull) { - for (AU_ITERATE_N(i, 32)) + auto uNow = AuAtomicAdd(&AuThreadPrimitives::gSpinAdaptiveCurrentCount, 1u); + if (!AuThreadPrimitives::gSpinAdaptiveThreshold || uNow <= AuThreadPrimitives::gSpinAdaptiveThreshold) { - AuThreadPrimitives::SMPPause(); - } + #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)) + if (AuThreadPrimitives::ThrdCfg::gIsIntelAlderLakeOrGreater) + { + _tpause(0, __rdtsc() + 10000); /* TPAUSE takes an absolute TSC deadline, not a relative tick count; a bare constant is already in the past and returns immediately */ + } + else + #endif + { + #if 0 + for (AU_ITERATE_N(i, 32)) + { + AuThreadPrimitives::SMPPause(); + } + #else + // manually unrolled: the compiler won't unroll the loop above + AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); + AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); + AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); + AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); + // 32 or 16? 
+ AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); + AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); + AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); + AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); + #endif + } - continue; + AuAtomicSub(&AuThreadPrimitives::gSpinAdaptiveCurrentCount, 1u); + continue; + } + else + { + AuAtomicSub(&AuThreadPrimitives::gSpinAdaptiveCurrentCount, 1u); + } } } @@ -131,12 +162,44 @@ namespace Aurora::Threading { if (AuThreadPrimitives::ThrdCfg::gPlatformIsSMPProcessorOptimized) { - for (AU_ITERATE_N(i, 32)) + auto uNow = AuAtomicAdd(&AuThreadPrimitives::gSpinAdaptiveCurrentCount, 1u); + if (!AuThreadPrimitives::gSpinAdaptiveThreshold || uNow <= AuThreadPrimitives::gSpinAdaptiveThreshold) { - AuThreadPrimitives::SMPPause(); - } + #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)) + if (AuThreadPrimitives::ThrdCfg::gIsIntelAlderLakeOrGreater) + { + _tpause(0, __rdtsc() + 1000); /* TPAUSE takes an absolute TSC deadline, not a relative tick count; a bare constant is already in the past and returns immediately */ + } + else + #endif + { + #if 0 + for (AU_ITERATE_N(i, 32)) + { + AuThreadPrimitives::SMPPause(); + } + #else + // manually unrolled: the compiler won't unroll the loop above + AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); + AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); + AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); + AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); + // 32 or 16? 
+ AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); + AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); + AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); + AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); AuThreadPrimitives::SMPPause(); + #endif + } - continue; + AuAtomicSub(&AuThreadPrimitives::gSpinAdaptiveCurrentCount, 1u); + continue; + } + else + { + AuAtomicSub(&AuThreadPrimitives::gSpinAdaptiveCurrentCount, 1u); + break; + } } else { diff --git a/Source/Threading/Primitives/SMTYield.cpp b/Source/Threading/Primitives/SMTYield.cpp index c5041689..6e2c6fc2 100644 --- a/Source/Threading/Primitives/SMTYield.cpp +++ b/Source/Threading/Primitives/SMTYield.cpp @@ -101,8 +101,14 @@ namespace Aurora::Threading::Primitives #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) { + #if 0 auto cpuId = AuHwInfo::cpuid(7); ThrdCfg::gIsIntelAlderLakeOrGreater = (cpuId.ecx >> 5) & 1; + #else + auto &cpuId = AuHwInfo::GetCPUInfo().cpuId; + ThrdCfg::gIsIntelAlderLakeOrGreater = AuBitTest(cpuId.f_7_ECX, 5); + ThrdCfg::gIsZen3OrGreater = AuBitTest(cpuId.f_81_ECX, 29); + #endif } #endif diff --git a/Source/Threading/Primitives/SMTYield.hpp b/Source/Threading/Primitives/SMTYield.hpp index d926e361..1441029c 100644 --- a/Source/Threading/Primitives/SMTYield.hpp +++ b/Source/Threading/Primitives/SMTYield.hpp @@ -85,6 +85,7 @@ namespace Aurora::Threading::Primitives inline bool gAlwaysRWLockWriteBiasOnReadLock {}; inline bool gEnableRWLockWriteBiasOnReadLock {}; inline AuUInt32 gIsIntelAlderLakeOrGreater {}; + inline AuUInt32 gIsZen3OrGreater {}; inline AuUInt8 gCountOfPCores {}; } @@ -204,7 +205,7 @@ namespace Aurora::Threading::Primitives AuAtomicStore(&gCoreTable[uWord], 1u); auto uNow = 
AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u); - if (uNow <= gSpinAdaptiveThreshold) + if (!gSpinAdaptiveThreshold || uNow <= gSpinAdaptiveThreshold) { auto uCount = spin; @@ -216,23 +217,40 @@ namespace Aurora::Threading::Primitives { uCount += tlsSpinCountLocal; } - - #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT) - auto perfCounter = __rdtsc() + uCount; - while (__rdtsc() < perfCounter) - #else - while (uCount > 0) - #endif + + #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) + if (ThrdCfg::gIsIntelAlderLakeOrGreater) { if (callback()) { bRet = true; - break; } else { - SMPPause(); - uCount--; + _umwait(1, __rdtsc() + uCount); + bRet = callback(); + } + } + else + #endif + { + #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT) + auto perfCounter = __rdtsc() + uCount; + while (__rdtsc() < perfCounter) + #else + while (uCount > 0) + #endif + { + if (callback()) + { + bRet = true; + break; + } + else + { + SMPPause(); + uCount--; + } } } } @@ -279,7 +297,7 @@ namespace Aurora::Threading::Primitives if (gHasThreadLocalTimeout) { auto uCount = tlsSpinCountLocal; - #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) + #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT) auto perfCounter = __rdtsc() + uCount; while (__rdtsc() < perfCounter) #else @@ -304,7 +322,7 @@ namespace Aurora::Threading::Primitives else if (uNow <= (gSpinAdaptiveThreadCount / 4 * 3)) { auto uCount = (spin) / 3; - #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) + #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT) auto perfCounter = __rdtsc() + uCount; while (__rdtsc() < perfCounter) #else @@ -403,7 +421,7 @@ namespace Aurora::Threading::Primitives AuAtomicStore(&gCoreTable[uWord], 1u); auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u); - if (uNow <= gSpinAdaptiveThreshold) + if (!gSpinAdaptiveThreshold || uNow <= gSpinAdaptiveThreshold) { auto uCount = spin; bool 
bSMTProbablyHit {}; @@ -433,6 +451,20 @@ namespace Aurora::Threading::Primitives bRet = callback(); } } + else if (ThrdCfg::gIsZen3OrGreater) + { + _mm_monitorx((void *)pWord, 0, 0); + + if (callback()) + { + bRet = true; + } + else + { + _mm_mwaitx(2, 0, uCount); + bRet = callback(); + } + } else #endif { @@ -499,7 +531,7 @@ namespace Aurora::Threading::Primitives if (gHasThreadLocalTimeout) { auto uCount = tlsSpinCountLocal; - #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) + #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT) auto perfCounter = __rdtsc() + uCount; while (__rdtsc() < perfCounter) #else @@ -524,7 +556,7 @@ namespace Aurora::Threading::Primitives else if (uNow <= (gSpinAdaptiveThreadCount / 4 * 3)) { auto uCount = (spin) / 3; - #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) + #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT) auto perfCounter = __rdtsc() + uCount; while (__rdtsc() < perfCounter) #else