[*] Optimize primitives SMTYield for Alderlake+ user-space, BIOS-ring mwait, and AARCH

This commit is contained in:
Reece Wilson 2024-05-03 12:14:52 +01:00
parent a35c1f165a
commit 134816e128
19 changed files with 314 additions and 52 deletions

View File

@ -218,10 +218,10 @@ namespace Aurora::IO::Loop
bool LSLocalEvent::TryTakeSpin() bool LSLocalEvent::TryTakeSpin()
{ {
return Threading::Primitives::DoTryIf([&] return Threading::Primitives::DoTryIfAlderLake([&]
{ {
return this->TryTakeNoSpin(); return this->TryTakeNoSpin();
}); }, &this->state_);
} }
bool LSLocalEvent::IsSignaledNoSpinIfUserland() bool LSLocalEvent::IsSignaledNoSpinIfUserland()

View File

@ -108,10 +108,10 @@ namespace Aurora::IO::Loop
bool LSLocalMutex::TryTakeSpin() bool LSLocalMutex::TryTakeSpin()
{ {
return Threading::Primitives::DoTryIf([&] return Threading::Primitives::DoTryIfAlderLake([&]
{ {
return this->TryTakeNoSpin(); return this->TryTakeNoSpin();
}); }, &this->uAtomicWord);
} }
bool LSLocalMutex::TryTake() bool LSLocalMutex::TryTake()

View File

@ -150,10 +150,10 @@ namespace Aurora::IO::Loop
bool LSLocalSemaphore::TryTakeSpin() bool LSLocalSemaphore::TryTakeSpin()
{ {
return Threading::Primitives::DoTryIf([&] return Threading::Primitives::DoTryIfAlderLake([&]
{ {
return this->TryTakeNoSpin(); return this->TryTakeNoSpin();
}); }, &this->uAtomicSemaphore);
} }
bool LSLocalSemaphore::TryTake() bool LSLocalSemaphore::TryTake()

View File

@ -58,20 +58,20 @@ namespace Aurora::Threading
{ {
if (gShouldSpinOnlyInCPU == 0) if (gShouldSpinOnlyInCPU == 0)
{ {
while (!Primitives::DoTryIf([&]() while (!Primitives::DoTryIfAlderLake([&]()
{ {
return AuAtomicTestAndSet(uPointer, 0) == 0; return AuAtomicTestAndSet(uPointer, 0) == 0;
})) }, uPointer))
{ {
} }
} }
else if (gShouldSpinOnlyInCPU == 1) else if (gShouldSpinOnlyInCPU == 1)
{ {
while (!Primitives::DoTryIf([&]() while (!Primitives::DoTryIfAlderLake([&]()
{ {
return AuAtomicTestAndSet(uPointer, 0) == 0; return AuAtomicTestAndSet(uPointer, 0) == 0;
})) }, uPointer))
{ {
ContextYield(); ContextYield();
} }
@ -920,10 +920,10 @@ namespace Aurora::Threading
{ {
uMS = AuNSToMS<AuUInt32>(uAbsTimeSteadyClock - uNow); uMS = AuNSToMS<AuUInt32>(uAbsTimeSteadyClock - uNow);
if (Primitives::DoTryIf([&]() if (Primitives::DoTryIfAlderLake([&]()
{ {
return !WaitBuffer::Compare2<EWaitMethod::eNotEqual, true>(pTargetAddress, uWordSize, pCompareAddress); return !WaitBuffer::Compare2<EWaitMethod::eNotEqual, true>(pTargetAddress, uWordSize, pCompareAddress);
})) }, pTargetAddress))
{ {
// hit it within the span of 1 << SpinLoopPowerA SMT stalls // hit it within the span of 1 << SpinLoopPowerA SMT stalls
return true; return true;
@ -1175,10 +1175,10 @@ namespace Aurora::Threading
const void *pCompareAddress, const void *pCompareAddress,
AuUInt8 uWordSize) AuUInt8 uWordSize)
{ {
return Primitives::DoTryIf([&]() return Primitives::DoTryIfAlderLake([&]()
{ {
return !WaitBuffer::Compare2<T, true>(pTargetAddress, uWordSize, pCompareAddress); return !WaitBuffer::Compare2<T, true>(pTargetAddress, uWordSize, pCompareAddress);
}); }, pTargetAddress);
} }
WOAFASTPUB bool TryWaitOnAddress(const void *pTargetAddress, WOAFASTPUB bool TryWaitOnAddress(const void *pTargetAddress,
@ -1208,7 +1208,7 @@ namespace Aurora::Threading
return TryWaitOnAddress(pTargetAddress, pCompareAddress, uWordSize); return TryWaitOnAddress(pTargetAddress, pCompareAddress, uWordSize);
} }
return Primitives::DoTryIf([&]() return Primitives::DoTryIfAlderLake([&]()
{ {
if (WaitBuffer::Compare2<EWaitMethod::eNotEqual, true>(pTargetAddress, uWordSize, pCompareAddress)) if (WaitBuffer::Compare2<EWaitMethod::eNotEqual, true>(pTargetAddress, uWordSize, pCompareAddress))
{ {
@ -1216,7 +1216,7 @@ namespace Aurora::Threading
} }
return check(pTargetAddress, pCompareAddress, uWordSize); return check(pTargetAddress, pCompareAddress, uWordSize);
}); }, pTargetAddress);
} }
template <EWaitMethod T> template <EWaitMethod T>
@ -1225,7 +1225,7 @@ namespace Aurora::Threading
AuUInt8 uWordSize, AuUInt8 uWordSize,
const AuFunction<bool(const void *, const void *, AuUInt8)> &check) const AuFunction<bool(const void *, const void *, AuUInt8)> &check)
{ {
return Primitives::DoTryIf([&]() return Primitives::DoTryIfAlderLake([&]()
{ {
if (WaitBuffer::Compare2<T, true>(pTargetAddress, uWordSize, pCompareAddress)) if (WaitBuffer::Compare2<T, true>(pTargetAddress, uWordSize, pCompareAddress))
{ {
@ -1233,7 +1233,7 @@ namespace Aurora::Threading
} }
return check(pTargetAddress, pCompareAddress, uWordSize); return check(pTargetAddress, pCompareAddress, uWordSize);
}); }, pTargetAddress);
} }
WOAFASTPUB bool TryWaitOnAddressSpecialEx(EWaitMethod eMethod, WOAFASTPUB bool TryWaitOnAddressSpecialEx(EWaitMethod eMethod,

View File

@ -79,10 +79,10 @@ namespace Aurora::Threading::Primitives
bool GenericConditionMutex::TryLockHeavy() bool GenericConditionMutex::TryLockHeavy()
{ {
return DoTryIf([=]() return DoTryIfAlderLake([=]()
{ {
return this->TryLockNoSpin(); return this->TryLockNoSpin();
}); }, &this->uState_);
} }
bool GenericConditionMutex::LockAbsNS(AuUInt64 uEndTime) bool GenericConditionMutex::LockAbsNS(AuUInt64 uEndTime)

View File

@ -134,10 +134,10 @@ namespace Aurora::Threading::Primitives
bool LinuxConditionMutex::TryLockHeavy() bool LinuxConditionMutex::TryLockHeavy()
{ {
return DoTryIf([=]() return DoTryIfAlderLake([=]()
{ {
return TryLockNoSpin(); return TryLockNoSpin();
}); }, &this->uState_);
} }
void LinuxConditionMutex::Lock() void LinuxConditionMutex::Lock()

View File

@ -46,10 +46,10 @@ namespace Aurora::Threading::Primitives
bool Win32ConditionMutex::TryLockHeavy() bool Win32ConditionMutex::TryLockHeavy()
{ {
return DoTryIf([=]() return DoTryIfAlderLake([=]()
{ {
return this->TryLockNoSpin(); return this->TryLockNoSpin();
}); }, &this->lock_);
} }
bool Win32ConditionMutex::TryLockNoSpin() bool Win32ConditionMutex::TryLockNoSpin()

View File

@ -162,10 +162,10 @@ namespace Aurora::Threading::Primitives
return this->TryTakeOneNoSpin(); return this->TryTakeOneNoSpin();
} }
return DoTryIf([=]() return DoTryIfAlderLake([=]()
{ {
return this->TryTakeOneNoSpin(); return this->TryTakeOneNoSpin();
}); }, &this->uState_);
} }
AUKN_SYM IConditionVariable *ConditionVariableNew(const AuSPtr<IConditionMutex> &pMutex) AUKN_SYM IConditionVariable *ConditionVariableNew(const AuSPtr<IConditionMutex> &pMutex)

View File

@ -32,15 +32,20 @@ namespace Aurora::Threading::Primitives
bool ConditionVariableLinux::TryTakeOneSpin() bool ConditionVariableLinux::TryTakeOneSpin()
{ {
if (ThrdCfg::gPreferLinuxPrimitivesFutexNoSpin) if (this->TryTakeOneNoSpin())
{ {
return this->TryTakeOneNoSpin(); return true;
} }
return DoTryIf([=]() if (ThrdCfg::gPreferLinuxPrimitivesFutexNoSpin)
{
return false;
}
return DoTryIfAlderLake([=]()
{ {
return this->TryTakeOneNoSpin(); return this->TryTakeOneNoSpin();
}); }, &this->uState_);
} }
bool ConditionVariableLinux::WaitOne(AuUInt64 qwTimeoutRelative, bool ConditionVariableLinux::WaitOne(AuUInt64 qwTimeoutRelative,

View File

@ -343,10 +343,10 @@ namespace Aurora::Threading::Primitives
#if defined(AURORA_FORCE_SRW_LOCKS) #if defined(AURORA_FORCE_SRW_LOCKS)
return false; return false;
#else #else
return DoTryIf([&]() return DoTryIfAlderLake([&]()
{ {
return this->CheckOutNoSpin(); return this->CheckOutNoSpin();
}); }, &this->signalCount);
#endif #endif
} }

View File

@ -53,10 +53,10 @@ namespace Aurora::Threading::Primitives
bool MutexGenericImpl::TryLockHeavy() bool MutexGenericImpl::TryLockHeavy()
{ {
return DoTryIf([=]() return DoTryIfAlderLake([=]()
{ {
return this->TryLockNoSpin(); return this->TryLockNoSpin();
}); }, &this->state_);
} }
bool MutexGenericImpl::LockMS(AuUInt64 uTimeout) bool MutexGenericImpl::LockMS(AuUInt64 uTimeout)

View File

@ -36,14 +36,17 @@ namespace Aurora::Threading::Primitives
bool MutexImpl::TryLock() bool MutexImpl::TryLock()
{ {
if (ThrdCfg::gPreferLinuxMutexSpinTryLock) if (this->TryLockNoSpin())
{ {
return this->TryLockHeavy(); return true;
} }
else
if (!ThrdCfg::gPreferLinuxMutexSpinTryLock)
{ {
return this->TryLockNoSpin(); return false;
} }
return this->TryLockHeavy();
} }
bool MutexImpl::TryLockNoSpin() bool MutexImpl::TryLockNoSpin()
@ -53,10 +56,10 @@ namespace Aurora::Threading::Primitives
bool MutexImpl::TryLockHeavy() bool MutexImpl::TryLockHeavy()
{ {
return DoTryIf([=]() return DoTryIfAlderLake([=]()
{ {
return this->TryLockNoSpin(); return this->TryLockNoSpin();
}); }, &this->state_);
} }
bool MutexImpl::LockMS(AuUInt64 uTimeout) bool MutexImpl::LockMS(AuUInt64 uTimeout)

View File

@ -46,10 +46,10 @@ namespace Aurora::Threading::Primitives
bool MutexImpl::TryLockHeavy() bool MutexImpl::TryLockHeavy()
{ {
return DoTryIf([=]() return DoTryIfAlderLake([=]()
{ {
return this->TryLockNoSpin(); return this->TryLockNoSpin();
}); }, &this->state_);
} }
bool MutexImpl::TryLock() bool MutexImpl::TryLock()

View File

@ -314,10 +314,10 @@ namespace Aurora::Threading::Primitives
if (gUseFutexRWLock) if (gUseFutexRWLock)
{ {
if (DoTryIf([=]() if (DoTryIfAlderLake([=]()
{ {
return this->TryLockWriteNoSpin(); return this->TryLockWriteNoSpin();
})) }, &this->iState_))
{ {
return true; return true;
} }
@ -615,10 +615,10 @@ namespace Aurora::Threading::Primitives
if (ThrdCfg::gPreferRWLockReadLockSpin && if (ThrdCfg::gPreferRWLockReadLockSpin &&
AuAtomicLoad(&this->dwWritersPending_) == 0) AuAtomicLoad(&this->dwWritersPending_) == 0)
{ {
return DoTryIf([=]() return DoTryIfAlderLake([=]()
{ {
return this->TryLockReadNoSpin<true>(); return this->TryLockReadNoSpin<true>();
}); }, &this->iState_);
} }
return false; return false;

View File

@ -32,10 +32,10 @@ namespace Aurora::Threading::Primitives
bool SemaphoreGeneric::TryLockHeavy() bool SemaphoreGeneric::TryLockHeavy()
{ {
return DoTryIf([=]() return DoTryIfAlderLake([=]()
{ {
return this->TryLockNoSpin(); return this->TryLockNoSpin();
}); }, &this->uAtomicState);
} }
bool SemaphoreGeneric::TryLock() bool SemaphoreGeneric::TryLock()

View File

@ -52,10 +52,10 @@ namespace Aurora::Threading::Primitives
bool SemaphoreImpl::TryLockHeavy() bool SemaphoreImpl::TryLockHeavy()
{ {
return DoTryIf([=]() return DoTryIfAlderLake([=]()
{ {
return this->TryLockNoSpin(); return this->TryLockNoSpin();
}); }, &this->dwState_);
} }
bool SemaphoreImpl::LockMS(AuUInt64 uTimeout) bool SemaphoreImpl::LockMS(AuUInt64 uTimeout)

View File

@ -50,10 +50,10 @@ namespace Aurora::Threading::Primitives
bool SemaphoreImpl::TryLockHeavy() bool SemaphoreImpl::TryLockHeavy()
{ {
return DoTryIf([=]() return DoTryIfAlderLake([=]()
{ {
return this->TryLockNoSpin(); return this->TryLockNoSpin();
}); }, &this->dwState_);
} }
bool SemaphoreImpl::TryLock() bool SemaphoreImpl::TryLock()

View File

@ -99,6 +99,13 @@ namespace Aurora::Threading::Primitives
return; return;
} }
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
{
// Query CPUID leaf 7 and latch ECX bit 5 into the global flag that gates the
// _umonitor/_umwait spin path.
// NOTE(review): per the Intel SDM this bit is the WAITPKG feature flag
// (UMONITOR/UMWAIT/TPAUSE), not a microarchitecture identifier, so the flag may
// also be set on non-Alder-Lake parts that implement WAITPKG - confirm naming.
// NOTE(review): presumably the caller/cpuid helper guarantees leaf 7 is supported - verify.
auto cpuId = AuHwInfo::cpuid(7);
ThrdCfg::gIsIntelAlderLakeOrGreater = (cpuId.ecx >> 5) & 1;
}
#endif
if (!ThrdCfg::gForceEnableAdaptiveSpin) if (!ThrdCfg::gForceEnableAdaptiveSpin)
{ {
gSpinAdaptiveThreshold = 0; gSpinAdaptiveThreshold = 0;

View File

@ -84,6 +84,7 @@ namespace Aurora::Threading::Primitives
inline bool gPreferUnixPrimitivesNoSpin {}; inline bool gPreferUnixPrimitivesNoSpin {};
inline bool gAlwaysRWLockWriteBiasOnReadLock {}; inline bool gAlwaysRWLockWriteBiasOnReadLock {};
inline bool gEnableRWLockWriteBiasOnReadLock {}; inline bool gEnableRWLockWriteBiasOnReadLock {};
inline AuUInt32 gIsIntelAlderLakeOrGreater {};
inline AuUInt8 gCountOfPCores {}; inline AuUInt8 gCountOfPCores {};
} }
@ -379,6 +380,226 @@ namespace Aurora::Threading::Primitives
return callback(); return callback();
} }
/**
 * Polls `callback` for a bounded amount of time, optionally parking the logical
 * processor with user-mode monitor/wait on the cache line containing `pWord`.
 *
 * @param spin     Spin budget. On builds where the loops are bounded by __rdtsc()
 *                 it is added to the current timestamp to form a deadline; otherwise
 *                 it is used as an iteration count.
 * @param callback Predicate invoked repeatedly; a `true` result ends the wait.
 * @param pWord    Address of the word the predicate observes. Only used to arm
 *                 _umonitor when ThrdCfg::gIsIntelAlderLakeOrGreater is set.
 * @return `true` if any invocation of `callback` returned true, otherwise `false`.
 *
 * Every AuAtomicAdd on gSpinAdaptiveCurrentCount is paired with exactly one
 * AuAtomicSub on every exit path.
 */
template <typename T>
bool auline YieldToSharedCoreAlderLake(long spin, T callback, const void *pWord)
{
    // Fast path: no spinning bookkeeping if the predicate already holds.
    if (callback())
    {
        return true;
    }

#if defined(AURORA_ARCH_ARM)
    // Not referenced by name below.
    // NOTE(review): may be consumed by an ALT_RDT substitute for __rdtsc() - confirm before removing.
    AuUInt64 uClockFreq { ArmQueryClockFrequency() };
#endif

    if (kEnableSmartScheduling)
    {
        bool bRet { false };
        auto uWord = SMTGetAPICNumber();

        // Only logical processors that have a slot in the table and fall inside
        // the P-core count take part; everyone else returns false immediately.
        if (uWord < AuArraySize(gCoreTable) &&
            uWord < ThrdCfg::gCountOfPCores)
        {
            // Advertise that this logical processor is spinning.
            AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 1u);

            auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
            if (uNow <= gSpinAdaptiveThreshold)
            {
                auto uCount = spin;
                bool bSMTProbablyHit {};

                // NOTE(review): assumes (uWord ^ 1) is also a valid index, i.e. the
                // table has an even number of entries - confirm.
                if (AuAtomicLoad(&gCoreTable[uWord ^ 1]))
                {
                    // The paired slot is spinning as well: shrink our budget.
                    uCount /= 5;
                    bSMTProbablyHit = true;
                }
                else if (gHasThreadLocalTimeout)
                {
                    uCount += tlsSpinCountLocal;
                }

#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
                if (ThrdCfg::gIsIntelAlderLakeOrGreater)
                {
                    // Arm the monitor first, then re-test the predicate, so a write
                    // landing between the test and the wait is not missed.
                    _umonitor((void *)AuPageRound<AuUInt>(AuUInt(pWord), AuHWInfo::GetCPUInfo().dwCacheLine));

                    if (callback())
                    {
                        bRet = true;
                    }
                    else
                    {
                        // First argument is the umwait control word (1 when the paired
                        // slot is busy, else 0); second is the TSC deadline.
                        _umwait(bSMTProbablyHit ? 1 : 0, __rdtsc() + uCount);
                        bRet = callback();
                    }
                }
                else
#endif
                {
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                    auto perfCounter = __rdtsc() + uCount;
                    while (__rdtsc() < perfCounter)
#else
                    while (uCount > 0)
#endif
                    {
                        if (callback())
                        {
                            bRet = true;
                            break;
                        }
                        else
                        {
                            SMPPause();
                            uCount--;
                        }
                    }
                }
            }

            // Single release point for this branch: clear the slot and drop the count.
            AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 0u);
            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
        }

        return bRet;
    }
    else if (gSpinAdaptiveThreshold)
    {
        auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);

        if (uNow <= gSpinAdaptiveThreshold)
        {
            // Under the threshold: spend the full budget.
            auto uCount = spin;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
            auto perfCounter = __rdtsc() + uCount;
            while (__rdtsc() < perfCounter)
#else
            while (uCount > 0)
#endif
            {
                if (callback())
                {
                    AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                    return true;
                }
                else
                {
#if defined(SPIN_FOUR) && SPIN_FOUR == 1
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    uCount -= 4;
#else
                    SMPPause();
                    uCount -= 1;
#endif
                }
            }

            if (gHasThreadLocalTimeout)
            {
                // Extra per-thread budget on top of the global one.
                auto uCount = tlsSpinCountLocal;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
#else
                while (uCount > 0)
#endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }

            // No decrement here: the shared AuAtomicSub after this if/else chain
            // releases the count. Decrementing in both places would subtract twice
            // for a single AuAtomicAdd and let the counter drift/underflow.
        }
        else if (uNow <= (gSpinAdaptiveThreadCount / 4 * 3))
        {
            // Moderately contended: spend only a third of the budget.
            auto uCount = (spin) / 3;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
            auto perfCounter = __rdtsc() + uCount;
            while (__rdtsc() < perfCounter)
#else
            while (uCount > 0)
#endif
            {
                if (callback())
                {
                    AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                    return true;
                }
                else
                {
                    SMPPause();
                    uCount--;
                }
            }
        }

        // Common release for every fall-through path of this branch.
        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
    }
    else
    {
        // Adaptive accounting disabled: plain bounded spin.
        auto uCount = spin;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
        auto perfCounter = __rdtsc() + uCount;
        while (__rdtsc() < perfCounter)
#else
        while (uCount > 0)
#endif
        {
            if (callback())
            {
                return true;
            }
            else
            {
#if defined(SPIN_FOUR) && SPIN_FOUR == 1
                SMPPause();
                SMPPause();
                SMPPause();
                SMPPause();
                uCount -= 4;
#else
                SMPPause();
                uCount -= 1;
#endif
            }
        }

        if (gHasThreadLocalTimeout)
        {
            auto uCount = tlsSpinCountLocal;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
            auto perfCounter = __rdtsc() + uCount;
            while (__rdtsc() < perfCounter)
#else
            while (uCount > 0)
#endif
            {
                if (callback())
                {
                    return true;
                }
                else
                {
                    SMPPause();
                    uCount--;
                }
            }
        }
    }

    // Last attempt after the spin budget has been exhausted.
    return callback();
}
template <typename T> template <typename T>
bool auline DoTryIf(T callback) bool auline DoTryIf(T callback)
@ -392,4 +613,30 @@ namespace Aurora::Threading::Primitives
return callback(); return callback();
} }
} }
/**
 * Attempts `callback`, spinning via YieldToSharedCoreAlderLake when
 * ThrdCfg::gPlatformIsSMPProcessorOptimized is set; otherwise makes one attempt.
 *
 * @param callback Predicate to evaluate; `true` means the attempt succeeded.
 * @param pWord    Address of the word the predicate observes (forwarded as-is).
 * @return Result of the spin helper, or of the single `callback()` invocation.
 */
template <typename T>
bool auline DoTryIfAlderLake(T callback, const void *pWord)
{
    // Platform not flagged for spinning: a single attempt is all we do.
    if (!ThrdCfg::gPlatformIsSMPProcessorOptimized)
    {
        return callback();
    }

    return YieldToSharedCoreAlderLake(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback, pWord);
}
/**
 * Overload of DoTryIfAlderLake for words declared `volatile`.
 *
 * Behaves like the `const void *` overload: spins via YieldToSharedCoreAlderLake
 * when ThrdCfg::gPlatformIsSMPProcessorOptimized is set, otherwise evaluates
 * `callback` once. The volatile qualifier is dropped before forwarding the pointer.
 *
 * @param callback Predicate to evaluate; `true` means the attempt succeeded.
 * @param pWord    Address of the (volatile) word the predicate observes.
 * @return Result of the spin helper, or of the single `callback()` invocation.
 */
template <typename T>
bool auline DoTryIfAlderLake(T callback, const volatile void *pWord)
{
    // Platform not flagged for spinning: a single attempt is all we do.
    if (!ThrdCfg::gPlatformIsSMPProcessorOptimized)
    {
        return callback();
    }

    const void *pPlainWord = const_cast<const void *>(pWord);
    return YieldToSharedCoreAlderLake(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback, pPlainWord);
}
} }