[*] Optimize primitives SMTYield for Alderlake+ user-space, BIOS-ring mwait, and AARCH

This commit is contained in:
Reece Wilson 2024-05-03 12:14:52 +01:00
parent a35c1f165a
commit 134816e128
19 changed files with 314 additions and 52 deletions

View File

@ -218,10 +218,10 @@ namespace Aurora::IO::Loop
bool LSLocalEvent::TryTakeSpin()
{
return Threading::Primitives::DoTryIf([&]
return Threading::Primitives::DoTryIfAlderLake([&]
{
return this->TryTakeNoSpin();
});
}, &this->state_);
}
bool LSLocalEvent::IsSignaledNoSpinIfUserland()

View File

@ -108,10 +108,10 @@ namespace Aurora::IO::Loop
bool LSLocalMutex::TryTakeSpin()
{
return Threading::Primitives::DoTryIf([&]
return Threading::Primitives::DoTryIfAlderLake([&]
{
return this->TryTakeNoSpin();
});
}, &this->uAtomicWord);
}
bool LSLocalMutex::TryTake()

View File

@ -150,10 +150,10 @@ namespace Aurora::IO::Loop
bool LSLocalSemaphore::TryTakeSpin()
{
return Threading::Primitives::DoTryIf([&]
return Threading::Primitives::DoTryIfAlderLake([&]
{
return this->TryTakeNoSpin();
});
}, &this->uAtomicSemaphore);
}
bool LSLocalSemaphore::TryTake()

View File

@ -58,20 +58,20 @@ namespace Aurora::Threading
{
if (gShouldSpinOnlyInCPU == 0)
{
while (!Primitives::DoTryIf([&]()
while (!Primitives::DoTryIfAlderLake([&]()
{
return AuAtomicTestAndSet(uPointer, 0) == 0;
}))
}, uPointer))
{
}
}
else if (gShouldSpinOnlyInCPU == 1)
{
while (!Primitives::DoTryIf([&]()
while (!Primitives::DoTryIfAlderLake([&]()
{
return AuAtomicTestAndSet(uPointer, 0) == 0;
}))
}, uPointer))
{
ContextYield();
}
@ -920,10 +920,10 @@ namespace Aurora::Threading
{
uMS = AuNSToMS<AuUInt32>(uAbsTimeSteadyClock - uNow);
if (Primitives::DoTryIf([&]()
if (Primitives::DoTryIfAlderLake([&]()
{
return !WaitBuffer::Compare2<EWaitMethod::eNotEqual, true>(pTargetAddress, uWordSize, pCompareAddress);
}))
}, pTargetAddress))
{
// hit it within the span of 1 << SpinLoopPowerA SMT stalls
return true;
@ -1175,10 +1175,10 @@ namespace Aurora::Threading
const void *pCompareAddress,
AuUInt8 uWordSize)
{
return Primitives::DoTryIf([&]()
return Primitives::DoTryIfAlderLake([&]()
{
return !WaitBuffer::Compare2<T, true>(pTargetAddress, uWordSize, pCompareAddress);
});
}, pTargetAddress);
}
WOAFASTPUB bool TryWaitOnAddress(const void *pTargetAddress,
@ -1208,7 +1208,7 @@ namespace Aurora::Threading
return TryWaitOnAddress(pTargetAddress, pCompareAddress, uWordSize);
}
return Primitives::DoTryIf([&]()
return Primitives::DoTryIfAlderLake([&]()
{
if (WaitBuffer::Compare2<EWaitMethod::eNotEqual, true>(pTargetAddress, uWordSize, pCompareAddress))
{
@ -1216,7 +1216,7 @@ namespace Aurora::Threading
}
return check(pTargetAddress, pCompareAddress, uWordSize);
});
}, pTargetAddress);
}
template <EWaitMethod T>
@ -1225,7 +1225,7 @@ namespace Aurora::Threading
AuUInt8 uWordSize,
const AuFunction<bool(const void *, const void *, AuUInt8)> &check)
{
return Primitives::DoTryIf([&]()
return Primitives::DoTryIfAlderLake([&]()
{
if (WaitBuffer::Compare2<T, true>(pTargetAddress, uWordSize, pCompareAddress))
{
@ -1233,7 +1233,7 @@ namespace Aurora::Threading
}
return check(pTargetAddress, pCompareAddress, uWordSize);
});
}, pTargetAddress);
}
WOAFASTPUB bool TryWaitOnAddressSpecialEx(EWaitMethod eMethod,

View File

@ -79,10 +79,10 @@ namespace Aurora::Threading::Primitives
bool GenericConditionMutex::TryLockHeavy()
{
return DoTryIf([=]()
return DoTryIfAlderLake([=]()
{
return this->TryLockNoSpin();
});
}, &this->uState_);
}
bool GenericConditionMutex::LockAbsNS(AuUInt64 uEndTime)

View File

@ -134,10 +134,10 @@ namespace Aurora::Threading::Primitives
bool LinuxConditionMutex::TryLockHeavy()
{
return DoTryIf([=]()
return DoTryIfAlderLake([=]()
{
return TryLockNoSpin();
});
}, &this->uState_);
}
void LinuxConditionMutex::Lock()

View File

@ -46,10 +46,10 @@ namespace Aurora::Threading::Primitives
bool Win32ConditionMutex::TryLockHeavy()
{
return DoTryIf([=]()
return DoTryIfAlderLake([=]()
{
return this->TryLockNoSpin();
});
}, &this->lock_);
}
bool Win32ConditionMutex::TryLockNoSpin()

View File

@ -162,10 +162,10 @@ namespace Aurora::Threading::Primitives
return this->TryTakeOneNoSpin();
}
return DoTryIf([=]()
return DoTryIfAlderLake([=]()
{
return this->TryTakeOneNoSpin();
});
}, &this->uState_);
}
AUKN_SYM IConditionVariable *ConditionVariableNew(const AuSPtr<IConditionMutex> &pMutex)

View File

@ -32,15 +32,20 @@ namespace Aurora::Threading::Primitives
bool ConditionVariableLinux::TryTakeOneSpin()
{
if (ThrdCfg::gPreferLinuxPrimitivesFutexNoSpin)
if (this->TryTakeOneNoSpin())
{
return this->TryTakeOneNoSpin();
return true;
}
return DoTryIf([=]()
if (ThrdCfg::gPreferLinuxPrimitivesFutexNoSpin)
{
return false;
}
return DoTryIfAlderLake([=]()
{
return this->TryTakeOneNoSpin();
});
}, &this->uState_);
}
bool ConditionVariableLinux::WaitOne(AuUInt64 qwTimeoutRelative,

View File

@ -343,10 +343,10 @@ namespace Aurora::Threading::Primitives
#if defined(AURORA_FORCE_SRW_LOCKS)
return false;
#else
return DoTryIf([&]()
return DoTryIfAlderLake([&]()
{
return this->CheckOutNoSpin();
});
}, &this->signalCount);
#endif
}

View File

@ -53,10 +53,10 @@ namespace Aurora::Threading::Primitives
bool MutexGenericImpl::TryLockHeavy()
{
return DoTryIf([=]()
return DoTryIfAlderLake([=]()
{
return this->TryLockNoSpin();
});
}, &this->state_);
}
bool MutexGenericImpl::LockMS(AuUInt64 uTimeout)

View File

@ -36,14 +36,17 @@ namespace Aurora::Threading::Primitives
bool MutexImpl::TryLock()
{
if (ThrdCfg::gPreferLinuxMutexSpinTryLock)
if (this->TryLockNoSpin())
{
return this->TryLockHeavy();
return true;
}
else
if (!ThrdCfg::gPreferLinuxMutexSpinTryLock)
{
return this->TryLockNoSpin();
return false;
}
return this->TryLockHeavy();
}
bool MutexImpl::TryLockNoSpin()
@ -53,10 +56,10 @@ namespace Aurora::Threading::Primitives
bool MutexImpl::TryLockHeavy()
{
return DoTryIf([=]()
return DoTryIfAlderLake([=]()
{
return this->TryLockNoSpin();
});
}, &this->state_);
}
bool MutexImpl::LockMS(AuUInt64 uTimeout)

View File

@ -46,10 +46,10 @@ namespace Aurora::Threading::Primitives
bool MutexImpl::TryLockHeavy()
{
return DoTryIf([=]()
return DoTryIfAlderLake([=]()
{
return this->TryLockNoSpin();
});
}, &this->state_);
}
bool MutexImpl::TryLock()

View File

@ -314,10 +314,10 @@ namespace Aurora::Threading::Primitives
if (gUseFutexRWLock)
{
if (DoTryIf([=]()
if (DoTryIfAlderLake([=]()
{
return this->TryLockWriteNoSpin();
}))
}, &this->iState_))
{
return true;
}
@ -615,10 +615,10 @@ namespace Aurora::Threading::Primitives
if (ThrdCfg::gPreferRWLockReadLockSpin &&
AuAtomicLoad(&this->dwWritersPending_) == 0)
{
return DoTryIf([=]()
return DoTryIfAlderLake([=]()
{
return this->TryLockReadNoSpin<true>();
});
}, &this->iState_);
}
return false;

View File

@ -32,10 +32,10 @@ namespace Aurora::Threading::Primitives
bool SemaphoreGeneric::TryLockHeavy()
{
return DoTryIf([=]()
return DoTryIfAlderLake([=]()
{
return this->TryLockNoSpin();
});
}, &this->uAtomicState);
}
bool SemaphoreGeneric::TryLock()

View File

@ -52,10 +52,10 @@ namespace Aurora::Threading::Primitives
bool SemaphoreImpl::TryLockHeavy()
{
return DoTryIf([=]()
return DoTryIfAlderLake([=]()
{
return this->TryLockNoSpin();
});
}, &this->dwState_);
}
bool SemaphoreImpl::LockMS(AuUInt64 uTimeout)

View File

@ -50,10 +50,10 @@ namespace Aurora::Threading::Primitives
bool SemaphoreImpl::TryLockHeavy()
{
return DoTryIf([=]()
return DoTryIfAlderLake([=]()
{
return this->TryLockNoSpin();
});
}, &this->dwState_);
}
bool SemaphoreImpl::TryLock()

View File

@ -99,6 +99,13 @@ namespace Aurora::Threading::Primitives
return;
}
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
{
auto cpuId = AuHwInfo::cpuid(7);
ThrdCfg::gIsIntelAlderLakeOrGreater = (cpuId.ecx >> 5) & 1;
}
#endif
if (!ThrdCfg::gForceEnableAdaptiveSpin)
{
gSpinAdaptiveThreshold = 0;

View File

@ -84,6 +84,7 @@ namespace Aurora::Threading::Primitives
inline bool gPreferUnixPrimitivesNoSpin {};
inline bool gAlwaysRWLockWriteBiasOnReadLock {};
inline bool gEnableRWLockWriteBiasOnReadLock {};
inline AuUInt32 gIsIntelAlderLakeOrGreater {};
inline AuUInt8 gCountOfPCores {};
}
@ -379,6 +380,226 @@ namespace Aurora::Threading::Primitives
return callback();
}
/**
 * Polls `callback` until it succeeds or a spin budget runs out, optionally parking the
 * hardware thread with _umonitor/_umwait on `pWord` instead of busy-looping.
 *
 * Three strategies are selected at runtime:
 *  1. kEnableSmartScheduling: publishes this core in gCoreTable, checks the neighbouring
 *     entry (uWord ^ 1) and either does a monitored wait (when
 *     ThrdCfg::gIsIntelAlderLakeOrGreater is set) or an SMPPause() loop.
 *     This path returns its own result and never reaches the trailing callback().
 *  2. gSpinAdaptiveThreshold != 0: the number of concurrent spinners is tracked in
 *     gSpinAdaptiveCurrentCount and the budget is reduced (or skipped) as it grows.
 *  3. Otherwise: a plain SMPPause() loop followed by the optional thread-local budget.
 *
 * @param spin      Spin budget. Added to __rdtsc() to form a deadline where __rdtsc() is
 *                  used; otherwise used as a loop counter.
 * @param callback  Predicate to poll; a true result ends the wait immediately.
 * @param pWord     Address handed to _umonitor after AuPageRound() with the CPU's
 *                  dwCacheLine value. Only read on the _umonitor path.
 * @return          true if `callback` reported success; in path 1 false is returned
 *                  without a final poll, in paths 2 and 3 the result of a final callback().
 */
template <typename T>
bool auline YieldToSharedCoreAlderLake(long spin, T callback, const void *pWord)
{
    // Fast path: the condition may already hold.
    if (callback())
    {
        return true;
    }

#if defined(AURORA_ARCH_ARM)
    // NOTE(review): uClockFreq is not read anywhere below - confirm whether it is still needed.
    AuUInt64 uClockFreq { ArmQueryClockFrequency() };
#endif

    if (kEnableSmartScheduling)
    {
        bool bRet { false };
        auto uWord = SMTGetAPICNumber();

        if (uWord < AuArraySize(gCoreTable) &&
            uWord < ThrdCfg::gCountOfPCores)
        {
            // Advertise that this core is spinning so its neighbour can shorten its own budget.
            AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 1u);

            auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
            if (uNow <= gSpinAdaptiveThreshold)
            {
                auto uCount = spin;
                bool bSMTProbablyHit {};

                // NOTE(review): uWord ^ 1 is only bounds-checked through uWord itself;
                // presumably gCoreTable has an even number of entries - confirm.
                if (AuAtomicLoad(&gCoreTable[uWord ^ 1]))
                {
                    // The neighbouring entry is spinning as well: use a fifth of the budget.
                    uCount /= 5;
                    bSMTProbablyHit = true;
                }
                else if (gHasThreadLocalTimeout)
                {
                    uCount += tlsSpinCountLocal;
                }

#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
                if (ThrdCfg::gIsIntelAlderLakeOrGreater)
                {
                    // Arm the monitor first, then re-check, so a write that lands between the
                    // check and the wait is not missed.
                    _umonitor((void *)AuPageRound<AuUInt>(AuUInt(pWord), AuHWInfo::GetCPUInfo().dwCacheLine));

                    if (callback())
                    {
                        bRet = true;
                    }
                    else
                    {
                        // Control value 1 is used when the neighbour is busy, 0 otherwise;
                        // the deadline is an absolute TSC value.
                        _umwait(/*0*/ /*1*/ bSMTProbablyHit ? 1 : 0, __rdtsc() + uCount);
                        bRet = callback();
                    }
                }
                else
#endif
                {
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                    auto perfCounter = __rdtsc() + uCount;
                    while (__rdtsc() < perfCounter)
#else
                    while (uCount > 0)
#endif
                    {
                        if (callback())
                        {
                            bRet = true;
                            break;
                        }
                        else
                        {
                            SMPPause();
                            uCount--;
                        }
                    }
                }
            }

            // Balance the store and the add performed on entry to this branch.
            AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 0u);
            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
        }

        return bRet;
    }
    else if (gSpinAdaptiveThreshold)
    {
        // Every exit from this branch must undo this increment exactly once:
        // early returns decrement inline, all fall-through paths share the decrement at the end.
        auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);

        if (uNow <= gSpinAdaptiveThreshold)
        {
            auto uCount = spin;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
            auto perfCounter = __rdtsc() + uCount;
            while (__rdtsc() < perfCounter)
#else
            while (uCount > 0)
#endif
            {
                if (callback())
                {
                    AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                    return true;
                }
                else
                {
#if defined(SPIN_FOUR) && SPIN_FOUR == 1
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    uCount -= 4;
#else
                    SMPPause();
                    uCount -= 1;
#endif
                }
            }

            if (gHasThreadLocalTimeout)
            {
                auto uCount = tlsSpinCountLocal;
                // NOTE(review): this guard omits ALT_RDT unlike the loop above - confirm intent.
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
#else
                while (uCount > 0)
#endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }

            // No decrement here: the shared one after the if/else chain covers this path.
            // Decrementing in both places released the counter twice for one increment.
        }
        else if (uNow <= (gSpinAdaptiveThreadCount / 4 * 3))
        {
            // Above the threshold but within 3/4 of gSpinAdaptiveThreadCount: spin for a third of the budget.
            auto uCount = (spin) / 3;
            // NOTE(review): this guard omits ALT_RDT unlike the other loops - confirm intent.
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
            auto perfCounter = __rdtsc() + uCount;
            while (__rdtsc() < perfCounter)
#else
            while (uCount > 0)
#endif
            {
                if (callback())
                {
                    AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                    return true;
                }
                else
                {
                    SMPPause();
                    uCount--;
                }
            }
        }

        // Single fall-through decrement matching the AuAtomicAdd above.
        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
    }
    else
    {
        auto uCount = spin;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
        auto perfCounter = __rdtsc() + uCount;
        while (__rdtsc() < perfCounter)
#else
        while (uCount > 0)
#endif
        {
            if (callback())
            {
                return true;
            }
            else
            {
#if defined(SPIN_FOUR) && SPIN_FOUR == 1
                SMPPause();
                SMPPause();
                SMPPause();
                SMPPause();
                uCount -= 4;
#else
                SMPPause();
                uCount -= 1;
#endif
            }
        }

        if (gHasThreadLocalTimeout)
        {
            auto uCount = tlsSpinCountLocal;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
            auto perfCounter = __rdtsc() + uCount;
            while (__rdtsc() < perfCounter)
#else
            while (uCount > 0)
#endif
            {
                if (callback())
                {
                    return true;
                }
                else
                {
                    SMPPause();
                    uCount--;
                }
            }
        }
    }

    // Last attempt after the budget has been spent.
    return callback();
}
template <typename T>
bool auline DoTryIf(T callback)
@ -392,4 +613,30 @@ namespace Aurora::Threading::Primitives
return callback();
}
}
/**
 * Attempts `callback`, spinning via YieldToSharedCoreAlderLake when
 * ThrdCfg::gPlatformIsSMPProcessorOptimized is set.
 *
 * @param callback  Predicate to attempt; returns true on success.
 * @param pWord     Address forwarded to YieldToSharedCoreAlderLake as the word to watch.
 * @return          Result of the single attempt, or of the spin helper.
 */
template <typename T>
bool auline DoTryIfAlderLake(T callback, const void *pWord)
{
    // Spinning is disabled for this platform: make exactly one attempt.
    if (!ThrdCfg::gPlatformIsSMPProcessorOptimized)
    {
        return callback();
    }

    return YieldToSharedCoreAlderLake(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback, pWord);
}
/**
 * Overload of DoTryIfAlderLake for call sites whose state word is volatile-qualified.
 * The volatile qualifier is dropped before the address is forwarded to
 * YieldToSharedCoreAlderLake.
 *
 * @param callback  Predicate to attempt; returns true on success.
 * @param pWord     Volatile-qualified address of the word to watch.
 * @return          Result of the single attempt, or of the spin helper.
 */
template <typename T>
bool auline DoTryIfAlderLake(T callback, const volatile void *pWord)
{
    // Spinning is disabled for this platform: make exactly one attempt.
    if (!ThrdCfg::gPlatformIsSMPProcessorOptimized)
    {
        return callback();
    }

    const void *pPlainWord = (const void *)pWord;
    return YieldToSharedCoreAlderLake(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback, pPlainWord);
}
}