[+] WaitOnAddress[Steady](..., AuOptional<bool> optAlreadySpun = {}) arguments

[+] ...slight UWP optimization?
[*] Lift WoA limitation
Reece Wilson 2023-10-30 14:50:28 +00:00
parent 77253a7654
commit e071b3d509
9 changed files with 126 additions and 155 deletions

View File

@ -380,7 +380,8 @@ namespace Aurora
#else
AuUInt64 bPreferEmulatedWakeOnAddress : 1 { !AuBuild::kIsNtDerived /*everybody else requires us to hit the kernel. */ };
#endif
AuUInt64 bPreferWaitOnAddressAlwaysSpin : 1 { true }; // ..., if emulated! if double-spinning under higher level locks, disable me.
AuUInt64 bPreferWaitOnAddressAlwaysSpinNative : 1 { false }; // ..., if not emulated! noting that most kernels and user-schedulers will spin for you
AuUInt64 bPreferRWLockReadLockSpin : 1 { true };
AuUInt64 bUWPNanosecondEmulationCheckFirst : 1 { false };
AuUInt64 uUWPNanosecondEmulationMaxYields : 7 { 12 };
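Example (not part of the diff): a minimal sketch of how these bits could be tuned, assuming the ThreadingConfig struct shown above is reachable through the usual runtime start-up configuration; the variable name cfg is illustrative.
Aurora::ThreadingConfig cfg {};
cfg.bPreferWaitOnAddressAlwaysSpin       = true;  // spin before sleeping in the emulated WoA path
cfg.bPreferWaitOnAddressAlwaysSpinNative = false; // skip the extra spin when the kernel/user-scheduler already spins for us
cfg.bPreferEmulatedWakeOnAddress         = !AuBuild::kIsNtDerived; // default: emulate everywhere except NT-derived targets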

View File

@ -184,37 +184,12 @@ namespace Aurora::Threading::Waitables
return true;
}
#if (defined(AURORA_ARCH_X86) || defined(AURORA_ARCH_X64)) && !defined(AURORA_RUNTIME_FORCE_ADAPTIVE_FUTEX)
AuUInt uCount(GetTotalSpinCountTime());
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
auto uPerfCounter = __rdtsc() + uCount;
while (__rdtsc() < uPerfCounter)
#else
for (AU_ITERATE_N(i, uCount))
#endif
{
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
_mm_pause();
_mm_pause();
_mm_pause();
_mm_pause();
#else
// TBD
#endif
if (TryLock3())
{
return true;
}
}
#else
static const AuUInt32 kRef { 0 };
if (TryWaitOnAddress(&this->uAtomicState, &kRef, sizeof(kRef)))
if (TryWaitOnAddress((const void *)&this->uAtomicState, &kRef, sizeof(kRef)))
{
return TryLock3();
}
#endif
return false;
}
@ -227,7 +202,7 @@ namespace Aurora::Threading::Waitables
{
bool bStatus {};
bStatus = WaitOnAddressSteady((void *)&this->uAtomicState, &kRef, sizeof(kRef), qwTimeout);
bStatus = WaitOnAddressSteady((const void *)&this->uAtomicState, &kRef, sizeof(kRef), qwTimeout, true);
if (!bStatus)
{

View File

@ -32,36 +32,11 @@ namespace Aurora::Threading::Waitables
return true;
}
#if (defined(AURORA_ARCH_X86) || defined(AURORA_ARCH_X64)) && !defined(AURORA_RUNTIME_FORCE_ADAPTIVE_FUTEX)
AuUInt uCount(GetTotalSpinCountTime());
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
auto uPerfCounter = __rdtsc() + uCount;
while (__rdtsc() < uPerfCounter)
#else
for (AU_ITERATE_N(i, uCount))
#endif
{
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
_mm_pause();
_mm_pause();
_mm_pause();
_mm_pause();
#else
// TBD
#endif
if (TryLockNoSpin())
{
return true;
}
}
#else
static const AuUInt32 kRef { 0 };
if (TryWaitOnAddress(&this->uAtomicState, &kRef, sizeof(kRef)))
if (TryWaitOnAddress((const void *)&this->uAtomicState, &kRef, sizeof(kRef)))
{
return TryLockNoSpin();
}
#endif
return false;
}
@ -108,7 +83,7 @@ namespace Aurora::Threading::Waitables
while (!TryLockNoSpin())
{
AuAtomicAdd(&this->uAtomicSleeping, 1u);
WaitOnAddress((void *)&this->uAtomicState, &kRef, sizeof(kRef), 0);
WaitOnAddress((const void *)&this->uAtomicState, &kRef, sizeof(kRef), 0, true);
AuAtomicSub(&this->uAtomicSleeping, 1u);
}
}
@ -144,7 +119,7 @@ namespace Aurora::Threading::Waitables
bool bStatus {};
AuAtomicAdd(&this->uAtomicSleeping, 1u);
bStatus = WaitOnAddressSteady((void *)&this->uAtomicState, &kRef, sizeof(kRef), qwEndTime);
bStatus = WaitOnAddressSteady((const void *)&this->uAtomicState, &kRef, sizeof(kRef), qwEndTime, true);
AuAtomicSub(&this->uAtomicSleeping, 1u);
if (!bStatus)
@ -165,7 +140,7 @@ namespace Aurora::Threading::Waitables
bool bStatus {};
AuAtomicAdd(&this->uAtomicSleeping, 1u);
bStatus = WaitOnAddressSteady((void *)&this->uAtomicState, &kRef, sizeof(kRef), qwTimeoutAbs);
bStatus = WaitOnAddressSteady((const void *)&this->uAtomicState, &kRef, sizeof(kRef), qwTimeoutAbs, true);
AuAtomicSub(&this->uAtomicSleeping, 1u);
if (!bStatus)

View File

@ -31,39 +31,14 @@ namespace Aurora::Threading::Waitables
return true;
}
#if (defined(AURORA_ARCH_X86) || defined(AURORA_ARCH_X64)) && !defined(AURORA_RUNTIME_FORCE_ADAPTIVE_FUTEX)
AuUInt uCount(GetTotalSpinCountTime());
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
auto uPerfCounter = __rdtsc() + uCount;
while (__rdtsc() < uPerfCounter)
#else
for (AU_ITERATE_N(i, uCount))
#endif
{
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
_mm_pause();
_mm_pause();
_mm_pause();
_mm_pause();
#else
// TBD
#endif
if (TryLockNoSpin())
{
return true;
}
}
#else
static const AuUInt32 kRef { 1 };
if (TryWaitOnAddress((void *)&this->uAtomicState, &kRef, sizeof(kRef)))
if (TryWaitOnAddress((const void *)&this->uAtomicState, &kRef, sizeof(kRef)))
{
if (TryLockNoSpin())
{
return true;
}
}
#endif
return false;
}
@ -100,7 +75,7 @@ namespace Aurora::Threading::Waitables
while (!TryLockNoSpin())
{
AuAtomicAdd(&this->uAtomicSleeping, 1u);
WaitOnAddress((void *)&this->uAtomicState, &kRef, sizeof(kRef), 0);
WaitOnAddress((const void *)&this->uAtomicState, &kRef, sizeof(kRef), 0, true);
AuAtomicSub(&this->uAtomicSleeping, 1u);
}
}
@ -136,7 +111,7 @@ namespace Aurora::Threading::Waitables
bool bStatus {};
AuAtomicAdd(&this->uAtomicSleeping, 1u);
bStatus = WaitOnAddressSteady((void *)&this->uAtomicState, &kRef, sizeof(kRef), qwEndTime);
bStatus = WaitOnAddressSteady((const void *)&this->uAtomicState, &kRef, sizeof(kRef), qwEndTime, true);
AuAtomicSub(&this->uAtomicSleeping, 1u);
if (!bStatus)
@ -162,7 +137,7 @@ namespace Aurora::Threading::Waitables
bool bStatus {};
AuAtomicAdd(&this->uAtomicSleeping, 1u);
bStatus = WaitOnAddressSteady((void *)&this->uAtomicState, &kRef, sizeof(kRef), qwTimeoutAbs);
bStatus = WaitOnAddressSteady((const void *)&this->uAtomicState, &kRef, sizeof(kRef), qwTimeoutAbs, true);
AuAtomicSub(&this->uAtomicSleeping, 1u);
if (!bStatus)

View File

@ -4,6 +4,25 @@
File: WakeOnAddress.hpp
Date: 2023-3-11
Author: Reece
Note: In emulation mode (*):
1: Wakes occur in FIFO order
2: uWordSize can be any length not exceeding 32 bytes
otherwise
1: Wakes are orderless
2: uWordSize must be less than or equal to 8 bytes
3: only the least significant 32 bits are guaranteed to be used as wake signals
* By default: UNIXes and targets at or below Windows 7 run in userland emulation mode for performance reasons.
* Linux and other targets can interface directly with their futex API under a thinner wrapper;
* however, those paths are limited to the internal synchronization primitives. The added bloat
* of the WaitOnAddress/futex/atomic-wait emulation layer improves performance in real-world dumb
* code with spurious wakes, odd word sizes, and odd pointer alignments. On top of that, some targets
* are stuck with semaphores or condition variables to start with, and therefore need this layer
* for the sake of porting modern applications. The internal synchronization primitives are
* written with OS-specific optimizations in mind, and therefore treat the emulation layer as bloat.
* Clearing bPreferEmulatedWakeOnAddress disables the emulation layer when a reasonable native
* interface is available.
* Defaults to: ThreadingConfig::bPreferEmulatedWakeOnAddress = !AuBuild::kIsNtDerived
***/
#pragma once
@ -20,15 +39,17 @@ namespace Aurora::Threading
const void *pCompareAddress,
AuUInt8 uWordSize);
// Relative timeout variant of nanosecond resolution WoA. nanoseconds in steady clock time. 0 = indefinite
// Relative timeout variant of nanosecond resolution WoA. 0 = indefinite
AUKN_SYM bool WaitOnAddress(const void *pTargetAddress,
const void *pCompareAddress,
AuUInt8 uWordSize,
AuUInt64 qwNanoseconds);
AuUInt64 qwNanoseconds,
AuOptional<bool> optAlreadySpun = {} /*hint: do not spin before switching. subject to global config.*/);
// Absolute timeout variant of nanosecond resolution WoA. Nanoseconds are in steady clock time. 0 = indefinite
AUKN_SYM bool WaitOnAddressSteady(const void *pTargetAddress,
const void *pCompareAddress,
AuUInt8 uWordSize,
AuUInt64 qwNanoseconds);
AuUInt64 qwNanoseconds,
AuOptional<bool> optAlreadySpun = {} /*hint: do not spin before switching. subject to global config.*/);
}
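Example (not part of the diff): how a caller that has already spun can use the new optAlreadySpun hint so the implementation does not spin a second time before switching to the kernel. A minimal sketch; kUnlocked, uAtomicState, and WaitUntilStateChanges are illustrative names, not part of the API.
static const AuUInt32 kUnlocked { 0 };
AuUInt32 uAtomicState { 0 };

void WaitUntilStateChanges()
{
    // spin at the call site first; returns true if the word changed during the spin
    if (Aurora::Threading::TryWaitOnAddress(&uAtomicState, &kUnlocked, sizeof(kUnlocked)))
    {
        return;
    }
    // then sleep indefinitely (0 = no timeout), telling the implementation the spin already happened
    Aurora::Threading::WaitOnAddress(&uAtomicState, &kUnlocked, sizeof(kUnlocked), 0, true);
}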

View File

@ -112,7 +112,7 @@ namespace Aurora::Threading
if (state.qwNanosecondsAbs)
{
if (!WaitBuffer::From(this->pAddress, this->uSize).Compare(state))
if (!WaitBuffer::Compare(this->pAddress, this->uSize, state))
{
return true;
}
@ -122,7 +122,7 @@ namespace Aurora::Threading
while (uNow < uEndTime)
{
if (!WaitBuffer::From(this->pAddress, this->uSize).Compare(state))
if (!WaitBuffer::Compare(this->pAddress, this->uSize, state))
{
return true;
}
@ -138,11 +138,11 @@ namespace Aurora::Threading
uNow = AuTime::SteadyClockNS();
}
return !WaitBuffer::From(this->pAddress, this->uSize).Compare(state);
return !WaitBuffer::Compare(this->pAddress, this->uSize, state);
}
else
{
while (WaitBuffer::From(this->pAddress, this->uSize).Compare(state))
while (WaitBuffer::Compare(this->pAddress, this->uSize, state))
{
this->variable.WaitForSignalNsEx(&this->mutex, 0);
}
@ -215,6 +215,28 @@ namespace Aurora::Threading
return AuMove(wait);
}
bool WaitBuffer::Compare(const void *pBuf, AuUInt8 uSize, WaitState &state)
{
if (!state.uDownsizeMask)
{
return AuMemcmp(pBuf, state.compare.buffer, AuMin(uSize, state.compare.uSize)) == 0;
}
else
{
auto uMask = state.uDownsizeMask.value();
auto &uSrcWord = *AuReinterpretCast<const AuUInt32 *>(pBuf);
auto &uCmpWord = *AuReinterpretCast<const AuUInt32 *>(state.compare.buffer);
return (uSrcWord & uMask) == (uCmpWord & uMask);
}
}
bool WaitBuffer::Compare(const void *pBuf, AuUInt8 uSize, const void *pBuf2)
{
return AuMemcmp(pBuf, pBuf2, uSize) == 0;
}
bool WaitBuffer::Compare(const void *pBuf)
{
return AuMemcmp(this->buffer, pBuf, this->uSize) == 0;
@ -230,8 +252,8 @@ namespace Aurora::Threading
{
auto uMask = state.uDownsizeMask.value();
auto &uSrcWord = *AuReinterpretCast<AuUInt32 *>(this->buffer);
auto &uCmpWord = *AuReinterpretCast<AuUInt32 *>(state.compare.buffer);
auto &uSrcWord = *AuReinterpretCast<const AuUInt32 *>(this->buffer);
auto &uCmpWord = *AuReinterpretCast<const AuUInt32 *>(state.compare.buffer);
return (uSrcWord & uMask) == (uCmpWord & uMask);
}
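Example (not part of the diff): a self-contained illustration of the masked compare the new static WaitBuffer::Compare overloads perform when uDownsizeMask is set, i.e. when a sub-32-bit word has been decoded onto an aligned 4-byte cell. MaskedCompare is a stand-in helper, not runtime API.
#include <cstdint>
#include <cstring>

static bool MaskedCompare(const void *pSrc, const void *pCmp, std::uint32_t uMask)
{
    std::uint32_t uSrcWord {}, uCmpWord {};
    std::memcpy(&uSrcWord, pSrc, sizeof(uSrcWord));
    std::memcpy(&uCmpWord, pCmp, sizeof(uCmpWord));
    // only the bytes selected by the downsize mask take part in the wait/wake decision
    return (uSrcWord & uMask) == (uCmpWord & uMask);
}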
@ -406,7 +428,7 @@ namespace Aurora::Threading
)
{
WaitState state;
SysAssertDbg(uWordSize <= 8);
SysAssertDbg(uWordSize <= 32);
auto pWaitEntry = gProcessWaitables.WaitBufferFrom(pTargetAddress, uWordSize);
state.compare = WaitBuffer::From(pCompareAddress, uWordSize);
@ -524,7 +546,8 @@ namespace Aurora::Threading
AuUInt8 uWordSize,
AuUInt64 uAbsTimeSteadyClock,
AuUInt64 uRelativeNanoseconds,
AuOptional<AuUInt64> uAbsTimeAltClock /* hint */)
AuOptional<AuUInt64> uAbsTimeAltClock /* hint */,
bool bSpun = false)
{
#if defined(AURORA_IS_MODERNNT_DERIVED)
@ -547,7 +570,7 @@ namespace Aurora::Threading
{
if (uAbsTimeSteadyClock <= uNow)
{
return !WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress);
return !expect.Compare(pTargetAddress);
}
word.QuadPart = -(AuInt64(uAbsTimeSteadyClock - uNow) / 100ull);
@ -589,7 +612,7 @@ namespace Aurora::Threading
if (iDelta <= 0)
{
return !WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress);
return !WaitBuffer::Compare(pCompareAddress, uWordSize, pTargetAddress);
}
uRelativeNanoseconds = iDelta;
@ -604,11 +627,14 @@ namespace Aurora::Threading
{
// take a copy
auto expect = WaitBuffer::From(pCompareAddress, uWordSize);
// first: cpu spin to avoid the kernel all together
if (TryWaitOnAddress((void *)pTargetAddress, (void *)pCompareAddress, uWordSize))
if (!bSpun)
{
return true;
if (TryWaitOnAddress((void *)pTargetAddress, (void *)pCompareAddress, uWordSize))
{
return true;
}
}
// second: yield
@ -637,8 +663,11 @@ namespace Aurora::Threading
// first: wait on the address with an ms scale timeout
(void)pWaitOnAddress((void *)pTargetAddress, (void *)pCompareAddress, uWordSize, uMS);
// take a copy
auto expect = WaitBuffer::From(pCompareAddress, uWordSize);
// never trust the error value/status provided by wait addresses - instead, do a quick compare
if (!WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress))
if (!expect.Compare(pTargetAddress))
{
// best case: we woke up during the ms-res waitonaddress
return true;
@ -651,9 +680,9 @@ namespace Aurora::Threading
{
uMS = AuNSToMS<AuUInt32>(uAbsTimeSteadyClock - uNow);
if (Primitives::DoTryIf([=]()
if (Primitives::DoTryIf([&]()
{
return !WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress);
return !expect.Compare(pTargetAddress);
}))
{
// hit it within the span of 1 << SpinLoopPowerA SMT stalls
@ -722,14 +751,14 @@ namespace Aurora::Threading
#endif
return !WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress);
return !WaitBuffer::Compare(pCompareAddress, uWordSize, pTargetAddress);
}
static void RunOSWaitOnAddressNoTimedNoErrors(const void *pTargetAddress,
const void *pCompareAddress,
WaitState &state)
{
while (WaitBuffer::From(pTargetAddress, state.uWordSize).Compare(state))
while (WaitBuffer::Compare(pTargetAddress, state.uWordSize, state))
{
if (!RunOSWaitOnAddressNoTimed(pTargetAddress, pCompareAddress, state.uWordSize))
{
@ -739,16 +768,17 @@ namespace Aurora::Threading
}
static bool RunOSWaitOnAddressTimedSteady(const void *pTargetAddress,
const void *pCompareAddress,
WaitState &state)
const void *pCompareAddress,
WaitState &state,
bool bSpun = false)
{
if (!WaitBuffer::From(pTargetAddress, state.uWordSize).Compare(state))
if (!WaitBuffer::Compare(pTargetAddress, state.uWordSize, state))
{
return true;
}
(void)RunOSWaitOnAddressTimed(pTargetAddress, pCompareAddress, state.uWordSize, state.qwNanosecondsAbs.value(), { }, { });
return !WaitBuffer::From(pTargetAddress, state.uWordSize).Compare(state);
(void)RunOSWaitOnAddressTimed(pTargetAddress, pCompareAddress, state.uWordSize, state.qwNanosecondsAbs.value(), { }, { }, bSpun);
return !WaitBuffer::Compare(pTargetAddress, state.uWordSize, state);
}
static void RunOSWakeNOnAddress(const void *pAddress,
@ -828,7 +858,7 @@ namespace Aurora::Threading
else
{
state.qwNanosecondsAbs = qwNanosecondsAbs;
return RunOSWaitOnAddressTimedSteady(pWaitAddress, pCompareAddress2, state);
return RunOSWaitOnAddressTimedSteady(pWaitAddress, pCompareAddress2, state, true);
}
}
@ -853,55 +883,30 @@ namespace Aurora::Threading
AUKN_SYM bool WaitOnAddress(const void *pTargetAddress,
const void *pCompareAddress,
AuUInt8 uWordSize,
AuUInt64 qwNanoseconds)
AuUInt64 qwNanoseconds,
AuOptional<bool> optAlreadySpun)
{
bool bWaitOnAddress = IsWaitOnRecommended();
if (bWaitOnAddress)
// Avoid SteadyTime syscall in the event of HAL retardation (missing KUSER QPC, Linux vDSO, etc)
if (!WaitBuffer::Compare(pTargetAddress, uWordSize, pCompareAddress))
{
auto [pWaitAddress, uDelta, uMask] = DecodeAddress(pTargetAddress, uWordSize);
auto pCompareAddress2 = AuReinterpretCast<const char *>(pCompareAddress) - uDelta;
WaitState state;
state.uDownsizeMask = uMask;
state.compare = uMask ?
WaitBuffer::From(pCompareAddress2, 4) :
WaitBuffer::From(pCompareAddress2, uWordSize);
state.uWordSize = uMask ? 4 : uWordSize;
if (!qwNanoseconds)
{
RunOSWaitOnAddressNoTimedNoErrors(pWaitAddress, pCompareAddress2, state);
return true;
}
else
{
state.qwNanosecondsAbs = qwNanoseconds + AuTime::SteadyClockNS();
return RunOSWaitOnAddressTimedSteady(pWaitAddress, pCompareAddress2, state);
}
}
else
{
if (Primitives::ThrdCfg::gPreferWaitOnAddressAlwaysSpin)
{
if (TryWaitOnAddress(pTargetAddress, pCompareAddress, uWordSize))
{
return true;
}
}
return WaitOnAddressWide(pTargetAddress, pCompareAddress, uWordSize, qwNanoseconds, {}, false);
return true;
}
return false;
return WaitOnAddressSteady(pTargetAddress,
pCompareAddress,
uWordSize,
qwNanoseconds ? qwNanoseconds + AuTime::SteadyClockNS() : 0,
optAlreadySpun);
}
AUKN_SYM bool TryWaitOnAddress(const void *pTargetAddress,
const void *pCompareAddress,
AuUInt8 uWordSize)
{
return Primitives::DoTryIf([=]()
auto expect = WaitBuffer::From(pCompareAddress, uWordSize);
return Primitives::DoTryIf([&]()
{
return !WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress);
return !expect.Compare(pTargetAddress);
});
}
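Example (not part of the diff): the rewritten relative-timeout WaitOnAddress above now does a cheap compare first and then forwards to WaitOnAddressSteady, converting the relative timeout into an absolute steady-clock deadline. A caller building its own deadline would do the same; a one-line sketch mirroring the forwarding shown above:
AuUInt64 RelativeToSteadyDeadline(AuUInt64 qwRelativeNs)
{
    // 0 keeps its "wait indefinitely" meaning across both APIs
    return qwRelativeNs ? qwRelativeNs + AuTime::SteadyClockNS() : 0;
}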
@ -959,7 +964,8 @@ namespace Aurora::Threading
AUKN_SYM bool WaitOnAddressSteady(const void *pTargetAddress,
const void *pCompareAddress,
AuUInt8 uWordSize,
AuUInt64 qwNanoseconds)
AuUInt64 qwNanoseconds,
AuOptional<bool> optAlreadySpun)
{
bool bWaitOnAddress = IsWaitOnRecommended();
if (bWaitOnAddress)
@ -974,6 +980,18 @@ namespace Aurora::Threading
WaitBuffer::From(pCompareAddress2, uWordSize);
state.uWordSize = uMask ? 4 : uWordSize;
bool bSpun {};
if (Primitives::ThrdCfg::gPreferWaitOnAddressAlwaysSpinNative &&
optAlreadySpun.value_or(false))
{
if (TryWaitOnAddress(pTargetAddress, pCompareAddress, uWordSize))
{
return true;
}
bSpun = true;
}
if (!qwNanoseconds)
{
RunOSWaitOnAddressNoTimedNoErrors(pWaitAddress, pCompareAddress2, state);
@ -982,12 +1000,13 @@ namespace Aurora::Threading
else
{
state.qwNanosecondsAbs = qwNanoseconds;
return RunOSWaitOnAddressTimedSteady(pWaitAddress, pCompareAddress2, state);
return RunOSWaitOnAddressTimedSteady(pWaitAddress, pCompareAddress2, state, bSpun);
}
}
else
{
if (Primitives::ThrdCfg::gPreferWaitOnAddressAlwaysSpin)
if (Primitives::ThrdCfg::gPreferWaitOnAddressAlwaysSpin &&
optAlreadySpun.value_or(false))
{
if (TryWaitOnAddress(pTargetAddress, pCompareAddress, uWordSize))
{

View File

@ -18,10 +18,13 @@ namespace Aurora::Threading
struct WaitBuffer
{
char buffer[8];
char buffer[32];
AuUInt8 uSize;
static WaitBuffer From(const void *pBuf, AuUInt8 uSize);
static bool Compare(const void *pBuf, AuUInt8 uSize, WaitState &state);
static bool Compare(const void *pBuf, AuUInt8 uSize, const void *pBuf2);
bool Compare(const void *pBuf);
bool Compare(WaitState &state);
};
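Example (not part of the diff): the lifted limitation in practice. With the backing buffer grown from 8 to 32 bytes, emulation mode can now wait on words wider than 8 bytes, while native kernel paths remain capped at 8 bytes per the header note. GenerationBlob, gShared, and WaitForNewGeneration are illustrative names.
struct GenerationBlob
{
    AuUInt64 uLow {};
    AuUInt64 uHigh {};
};

GenerationBlob gShared {};

void WaitForNewGeneration(const GenerationBlob &expected)
{
    // blocks until the 16-byte word no longer matches `expected` (emulation mode only)
    Aurora::Threading::WaitOnAddress(&gShared, &expected, sizeof(GenerationBlob), 0);
}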

View File

@ -164,6 +164,7 @@ namespace Aurora::Threading::Primitives
ThrdCfg::gPreferLinuxCondMutexSpinTryLock = gRuntimeConfig.threadingConfig.bPreferLinuxCondMutexSpinTryLock;
ThrdCfg::gPreferEmulatedWakeOnAddress = gRuntimeConfig.threadingConfig.bPreferEmulatedWakeOnAddress;
ThrdCfg::gPreferWaitOnAddressAlwaysSpin = gRuntimeConfig.threadingConfig.bPreferWaitOnAddressAlwaysSpin;
ThrdCfg::gPreferWaitOnAddressAlwaysSpinNative = gRuntimeConfig.threadingConfig.bPreferWaitOnAddressAlwaysSpinNative;
ThrdCfg::gPreferRWLockReadLockSpin = gRuntimeConfig.threadingConfig.bPreferRWLockReadLockSpin;
ThrdCfg::gUWPNanosecondEmulationCheckFirst = gRuntimeConfig.threadingConfig.bUWPNanosecondEmulationCheckFirst;
ThrdCfg::gUWPNanosecondEmulationMaxYields = gRuntimeConfig.threadingConfig.uUWPNanosecondEmulationMaxYields;

View File

@ -32,6 +32,7 @@ namespace Aurora::Threading::Primitives
inline bool gPreferLinuxCondMutexSpinTryLock {};
inline bool gPreferEmulatedWakeOnAddress {};
inline bool gPreferWaitOnAddressAlwaysSpin {};
inline bool gPreferWaitOnAddressAlwaysSpinNative {};
inline bool gPreferRWLockReadLockSpin {};
inline bool gUWPNanosecondEmulationCheckFirst {};
inline AuUInt32 gUWPNanosecondEmulationMaxYields {};