[+] Improve WoA on Windows 8+
[+] AuThreading::WaitOnAddressSteady
This commit is contained in:
parent
17c50eff64
commit
28201db2d7
@ -345,13 +345,15 @@ namespace Aurora
|
||||
bool bNoThreadNames { false };
|
||||
bool bPlatformIsSMPProcessorOptimized { true }; // Whether to attempt to using mm_pause or similar before yielding into the kernel
|
||||
AuUInt8 uSpinLoopPowerA { 7 }; // Nudgable spinloop power. This is our local userland niceness factor; where 1 << n is the amount of yield instructions to stall for
|
||||
bool bPreferNt51XpMutexesOver81 { true }; // Fun Fact: Undocumented Windows XP APIs are still better than whatever the fuck shit fest they sharted out under Windows Vista and 8.1
|
||||
}; // Wth the former set of apis, we are still nothing more than a futex intended for nothing more than x86 bittestandset with undefined
|
||||
// bahviour on the higher bits, and we're crippled by some annoying thread switch function. Windows Vista superseded the dumb kernel-io
|
||||
|
||||
bool bPreferNt51XpMutexesOver8 { false }; // Fun Fact: Undocumented Windows XP APIs are still better than whatever the fuck shit fest they sharted out under Windows Vista and maybe 8.1
|
||||
bool bPerferNt51XpCondvarsOver8 { false }; // Wth the former set of apis, we are still nothing more than a futex intended for nothing more than x86 bittestandset with undefined
|
||||
}; // bahviour on the higher bits, and we're crippled by some annoying thread switch function. Windows Vista superseded the dumb kernel-io
|
||||
// based switching apis everyone thought they had to use with bloat on top of this very same 5.1 era api.
|
||||
// And to end it all off, Windows 8.1 wait/wake on address forces relative millisecond precision, in the first (?) MS OS to drop tick based [re]scheduling.
|
||||
// ~~ And to end it all off, Windows 8.1 wait/wake on address forces relative millisecond precision, in the first (?) MS OS to drop tick based [re]scheduling. ~~ (officially)
|
||||
// Our main mutex is one edge case where undocumented XP era scheduling apis are better than the garbage Microsoft wants you to use in <current year>.
|
||||
|
||||
|
||||
struct RuntimeStartInfo
|
||||
{
|
||||
ConsoleConfig console;
|
||||
|
@ -20,8 +20,15 @@ namespace Aurora::Threading
|
||||
void *pCompareAddress,
|
||||
AuUInt8 uWordSize);
|
||||
|
||||
// Relative timeout variant of nanosecond resolution WoA. nanoseconds in steady clock time. 0 = indefinite
|
||||
AUKN_SYM bool WaitOnAddress(void *pTargetAddress,
|
||||
void *pCompareAddress,
|
||||
AuUInt8 uWordSize,
|
||||
AuUInt64 qwNanoseconds);
|
||||
|
||||
// Absolute timeout variant of nanosecond resolution WoA. Nanoseconds are in steady clock time. 0 = indefinite
|
||||
AUKN_SYM bool WaitOnAddressSteady(void *pTargetAddress,
|
||||
void *pCompareAddress,
|
||||
AuUInt8 uWordSize,
|
||||
AuUInt64 qwNanoseconds);
|
||||
}
|
@ -51,6 +51,7 @@ namespace Aurora
|
||||
ADD_GET_PROC(Nt, NtReleaseKeyedEvent)
|
||||
ADD_GET_PROC(Nt, NtOpenKeyedEvent)
|
||||
ADD_GET_PROC(Nt, NtCreateKeyedEvent)
|
||||
ADD_GET_PROC(Nt, RtlWaitOnAddress)
|
||||
|
||||
ADD_GET_PROC_BI(Kernel32, KernelBase, VirtualAlloc2)
|
||||
ADD_GET_PROC_BI(Kernel32, KernelBase, MapViewOfFile3)
|
||||
@ -79,5 +80,15 @@ namespace Aurora
|
||||
|
||||
pNtDelayExecution = nullptr /* ... (you dont need it, but it'll help a ton) */;
|
||||
#endif
|
||||
|
||||
gUseNativeWaitMutex = (pWaitOnAddress &&
|
||||
!gRuntimeConfig.threadingConfig.bPreferNt51XpMutexesOver8 &&
|
||||
(pRtlWaitOnAddress || AuBuild::kCurrentPlatform != AuBuild::EPlatform::ePlatformWin32)) ||
|
||||
!pNtWaitForKeyedEvent;
|
||||
|
||||
gUseNativeWaitCondvar = (pWaitOnAddress &&
|
||||
!gRuntimeConfig.threadingConfig.bPerferNt51XpCondvarsOver8 &&
|
||||
(pRtlWaitOnAddress || AuBuild::kCurrentPlatform != AuBuild::EPlatform::ePlatformWin32)) ||
|
||||
!pNtWaitForKeyedEvent;
|
||||
}
|
||||
}
|
@ -92,9 +92,18 @@ namespace Aurora
|
||||
ULONG Flags
|
||||
);
|
||||
|
||||
inline NTSTATUS(__stdcall *pRtlWaitOnAddress)(
|
||||
const void *addr,
|
||||
const void *cmp,
|
||||
SIZE_T size,
|
||||
const LARGE_INTEGER *timeout);
|
||||
|
||||
#if defined(AURORA_PLATFORM_WIN32)
|
||||
inline NTSTATUS(_stdcall *pRtlGetVersion)(
|
||||
PRTL_OSVERSIONINFOW lpVersionInformation
|
||||
);
|
||||
#endif
|
||||
|
||||
inline bool gUseNativeWaitMutex {};
|
||||
inline bool gUseNativeWaitCondvar {};
|
||||
}
|
@ -15,6 +15,12 @@ namespace Aurora
|
||||
{
|
||||
void InitProcAddresses()
|
||||
{
|
||||
static bool gDumbInitOnce {};
|
||||
if (AuExchange(gDumbInitOnce, true))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
#if defined(AURORA_IS_MODERNNT_DERIVED)
|
||||
InitNTAddresses();
|
||||
#endif
|
||||
|
17
Source/Threading/AuWakeInternal.hpp
Normal file
17
Source/Threading/AuWakeInternal.hpp
Normal file
@ -0,0 +1,17 @@
|
||||
/***
|
||||
Copyright (C) 2023 J Reece Wilson (a/k/a "Reece"). All rights reserved.
|
||||
|
||||
File: AuWakeInternal.hpp
|
||||
Date: 2023-6-15
|
||||
Author: Reece
|
||||
Note: Defines an LTS prototype for the nanosecond-resolution WoA API for Desktop class Win32 primitives
|
||||
***/
|
||||
#pragma once
|
||||
|
||||
namespace Aurora::Threading
|
||||
{
|
||||
bool InternalLTSWaitOnAddressHighRes(void *pTargetAddress,
|
||||
void *pCompareAddress,
|
||||
AuUInt8 uWordSize,
|
||||
AuUInt64 qwNanosecondsAbs);
|
||||
}
|
@ -11,6 +11,7 @@
|
||||
#if defined(AURORA_PLATFORM_WIN32)
|
||||
#include <timeapi.h>
|
||||
#endif
|
||||
#include <Time/Time.hpp>
|
||||
|
||||
namespace Aurora::Threading
|
||||
{
|
||||
@ -124,15 +125,20 @@ namespace Aurora::Threading
|
||||
{
|
||||
AU_LOCK_GUARD(this->mutex);
|
||||
|
||||
if (state.qwNanoseconds)
|
||||
if (state.qwNanosecondsAbs)
|
||||
{
|
||||
if (!WaitBuffer::From(this->pAddress, this->uSize).Compare(state))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
AuUInt64 uEndTime {};
|
||||
auto uNow = AuTime::SteadyClockNS();
|
||||
auto uEndTime = uNow + state.qwNanoseconds.value();
|
||||
|
||||
if (state.qwNanosecondsAbs)
|
||||
{
|
||||
uEndTime = state.qwNanosecondsAbs.value();
|
||||
}
|
||||
|
||||
#if defined(AURORA_IS_POSIX_DERIVED)
|
||||
struct timespec tspec;
|
||||
@ -588,6 +594,7 @@ namespace Aurora::Threading
|
||||
void *pCompareAddress,
|
||||
AuUInt8 uWordSize,
|
||||
AuOptional<AuUInt64> qwNanoseconds,
|
||||
AuOptional<AuUInt64> qwNanosecondsAbs,
|
||||
bool bOSSupportsWait
|
||||
)
|
||||
{
|
||||
@ -595,7 +602,16 @@ namespace Aurora::Threading
|
||||
SysAssertDbg(uWordSize <= 8);
|
||||
auto pWaitEntry = gProcessWaitables.WaitBufferFrom(pTargetAddress, uWordSize);
|
||||
state.compare = WaitBuffer::From(pCompareAddress, uWordSize);
|
||||
state.qwNanoseconds = qwNanoseconds ? AuOptionalEx<AuUInt64> { qwNanoseconds.value() } : AuOptionalEx<AuUInt64> {}; // from default/zeroable optional, to boolean suffix
|
||||
|
||||
if (qwNanoseconds)
|
||||
{
|
||||
state.qwNanosecondsAbs = AuTime::SteadyClockNS() + qwNanoseconds.value();
|
||||
}
|
||||
else if (qwNanosecondsAbs)
|
||||
{
|
||||
state.qwNanosecondsAbs = qwNanosecondsAbs.value();
|
||||
}
|
||||
|
||||
auto bResult = pWaitEntry->SleepOn(state);
|
||||
#if defined(WOA_USE_DEFERRED_REL)
|
||||
pWaitEntry->Release();
|
||||
@ -687,56 +703,148 @@ namespace Aurora::Threading
|
||||
static bool RunOSWaitOnAddressTimed(const void *pTargetAddress,
|
||||
const void *pCompareAddress,
|
||||
AuUInt8 uWordSize,
|
||||
AuUInt64 uAbsTime,
|
||||
AuUInt32 uNanoseconds)
|
||||
AuUInt64 uAbsTimeSteadyClock,
|
||||
AuUInt64 uRelativeNanoseconds,
|
||||
AuOptional<AuUInt64> uAbsTimeAltClock /* hint */)
|
||||
{
|
||||
#if defined(AURORA_IS_MODERNNT_DERIVED)
|
||||
|
||||
auto uMS = AuNSToMS<AuUInt32>(uNanoseconds);
|
||||
if (!uMS)
|
||||
if (pRtlWaitOnAddress)
|
||||
{
|
||||
auto expect = WaitBuffer::From(pCompareAddress, uWordSize);
|
||||
do
|
||||
|
||||
AuUInt64 uNow {};
|
||||
while (uAbsTimeSteadyClock ?
|
||||
(uAbsTimeSteadyClock > (uNow = AuTime::SteadyClockNS())) :
|
||||
true)
|
||||
{
|
||||
LARGE_INTEGER word {};
|
||||
|
||||
if (uAbsTimeAltClock)
|
||||
{
|
||||
word.QuadPart = AuTime::ConvertTimestampNs(uAbsTimeAltClock.value());
|
||||
}
|
||||
else if (uAbsTimeSteadyClock)
|
||||
{
|
||||
if (uAbsTimeSteadyClock <= uNow)
|
||||
{
|
||||
return !WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress);
|
||||
}
|
||||
|
||||
word.QuadPart = -(AuInt64(uAbsTimeSteadyClock - uNow) / 100ull);
|
||||
}
|
||||
|
||||
if (expect.Compare(pTargetAddress))
|
||||
{
|
||||
AuThreading::ContextYield();
|
||||
pRtlWaitOnAddress(pTargetAddress, pCompareAddress, uWordSize, &word);
|
||||
|
||||
if (!expect.Compare(pTargetAddress))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
else if (!uAbsTimeSteadyClock)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
while (uAbsTime > AuTime::SteadyClockNS());
|
||||
}
|
||||
else
|
||||
{
|
||||
(void)pWaitOnAddress((void *)pTargetAddress, (void *)pCompareAddress, uWordSize, uMS);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress))
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
// ~~some paths might miss the uRelativeNanoseconds, like cas loops.~~
|
||||
// most paths will now skimp on the relative values
|
||||
if (uAbsTimeSteadyClock && !uRelativeNanoseconds)
|
||||
{
|
||||
AuInt64 iDelta = uAbsTimeSteadyClock;
|
||||
iDelta -= AuTime::SteadyClockNS();
|
||||
|
||||
if (iDelta <= 0)
|
||||
{
|
||||
return !WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress);
|
||||
}
|
||||
|
||||
uRelativeNanoseconds = iDelta;
|
||||
}
|
||||
|
||||
// LockN(<1MS) on a platform without that resolution of yielding... damn
|
||||
auto uMS = AuNSToMS<AuUInt32>(uRelativeNanoseconds);
|
||||
if (!uMS)
|
||||
{
|
||||
// take a copy
|
||||
auto expect = WaitBuffer::From(pCompareAddress, uWordSize);
|
||||
|
||||
// first: cpu spin to avoid the kernel all together
|
||||
if (TryWaitOnAddress((void *)pTargetAddress, (void *)pCompareAddress, uWordSize))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
AuUInt64 uNow {};
|
||||
while (uAbsTime > (uNow = AuTime::SteadyClockNS()))
|
||||
// second: yield
|
||||
do
|
||||
{
|
||||
uMS = AuNSToMS<AuUInt32>(uAbsTime - uNow);
|
||||
if (!expect.Compare(pTargetAddress))
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
AuThreading::ContextYield();
|
||||
}
|
||||
while (uAbsTimeSteadyClock > AuTime::SteadyClockNS()); // ...until times up
|
||||
}
|
||||
else // high level lock function was called with ms scale resolution
|
||||
{
|
||||
// first: wait on the address with an ms scale timeout
|
||||
(void)pWaitOnAddress((void *)pTargetAddress, (void *)pCompareAddress, uWordSize, uMS);
|
||||
|
||||
// never trust the error value/status provided by wait addresses - instead, do a quick compare
|
||||
if (!WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress))
|
||||
{
|
||||
// best case: we woke up during the ms-res waitonaddress
|
||||
return true;
|
||||
}
|
||||
|
||||
// attempt to yield again, potentially context switching a few times to hit any NS remainder
|
||||
AuUInt64 uNow {};
|
||||
unsigned uLimit {};
|
||||
while (uAbsTimeSteadyClock > (uNow = AuTime::SteadyClockNS()))
|
||||
{
|
||||
uMS = AuNSToMS<AuUInt32>(uAbsTimeSteadyClock - uNow);
|
||||
|
||||
if (Primitives::DoTryIf([=]()
|
||||
{
|
||||
return !WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress);
|
||||
}))
|
||||
{
|
||||
// hit it within the span of 1 << SpinLoopPowerA SMT stalls
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!uMS)
|
||||
{
|
||||
// burn off any remainder cycles by switching contexts (this isnt a very long time usually)
|
||||
if (uLimit++ < 4)
|
||||
{
|
||||
AuThreading::ContextYield();
|
||||
}
|
||||
else
|
||||
{
|
||||
// do not burn the cpu to meet the timeout. we'll just undershoot.
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
(void)pWaitOnAddress((void *)pTargetAddress, (void *)pCompareAddress, uWordSize, uMS);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@ -754,7 +862,7 @@ namespace Aurora::Threading
|
||||
auto uCurrent = *(AuUInt32 *)pCompareAddress;
|
||||
|
||||
struct timespec tspec;
|
||||
Time::auabsns2ts(&tspec, uAbsTime);
|
||||
Time::auabsns2ts(&tspec, uAbsTimeAltClock ? uAbsTimeAltClock.value() : uAbsTimeSteadyClock);
|
||||
|
||||
do
|
||||
{
|
||||
@ -795,7 +903,7 @@ namespace Aurora::Threading
|
||||
}
|
||||
}
|
||||
|
||||
static bool RunOSWaitOnAddressTimedNoErrors(const void *pTargetAddress,
|
||||
static bool RunOSWaitOnAddressTimedSteady(const void *pTargetAddress,
|
||||
const void *pCompareAddress,
|
||||
WaitState &state)
|
||||
{
|
||||
@ -804,7 +912,7 @@ namespace Aurora::Threading
|
||||
return true;
|
||||
}
|
||||
|
||||
(void)RunOSWaitOnAddressTimed(pTargetAddress, pCompareAddress, state.uWordSize, AuTime::SteadyClockNS() + state.qwNanoseconds.value(), state.qwNanoseconds.value());
|
||||
(void)RunOSWaitOnAddressTimed(pTargetAddress, pCompareAddress, state.uWordSize, state.qwNanosecondsAbs.value(), 0, { });
|
||||
return !WaitBuffer::From(pTargetAddress, state.uWordSize).Compare(state);
|
||||
}
|
||||
|
||||
@ -834,6 +942,37 @@ namespace Aurora::Threading
|
||||
#endif
|
||||
}
|
||||
|
||||
// Windows 8+ thread primitives might use me instead of the public API
|
||||
// it does work on Linux and Windows 8+
|
||||
// it does not, however, work on emulated platforms
|
||||
// this is intentional
|
||||
bool InternalLTSWaitOnAddressHighRes(void *pTargetAddress,
|
||||
void *pCompareAddress,
|
||||
AuUInt8 uWordSize,
|
||||
AuUInt64 qwNanosecondsAbs)
|
||||
{
|
||||
auto [pWaitAddress, uDelta, uMask] = DecodeAddress(pTargetAddress, uWordSize);
|
||||
auto pCompareAddress2 = AuReinterpretCast<char *>(pCompareAddress) - uDelta;
|
||||
|
||||
WaitState state;
|
||||
state.uDownsizeMask = uMask;
|
||||
state.compare = uMask ?
|
||||
WaitBuffer::From(pCompareAddress2, 4) :
|
||||
WaitBuffer::From(pCompareAddress2, uWordSize);
|
||||
state.uWordSize = uMask ? 4 : uWordSize;
|
||||
|
||||
if (!qwNanosecondsAbs)
|
||||
{
|
||||
RunOSWaitOnAddressNoTimedNoErrors(pWaitAddress, pCompareAddress2, state);
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
state.qwNanosecondsAbs = qwNanosecondsAbs;
|
||||
return RunOSWaitOnAddressTimedSteady(pWaitAddress, pCompareAddress2, state);
|
||||
}
|
||||
}
|
||||
|
||||
AUKN_SYM bool WaitOnAddress(void *pTargetAddress,
|
||||
void *pCompareAddress,
|
||||
AuUInt8 uWordSize,
|
||||
@ -859,8 +998,8 @@ namespace Aurora::Threading
|
||||
}
|
||||
else
|
||||
{
|
||||
state.qwNanoseconds = qwNanoseconds;
|
||||
return RunOSWaitOnAddressTimedNoErrors(pWaitAddress, pCompareAddress2, state);
|
||||
state.qwNanosecondsAbs = qwNanoseconds + AuTime::GetSteadyClock();
|
||||
return RunOSWaitOnAddressTimedSteady(pWaitAddress, pCompareAddress2, state);
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -870,7 +1009,7 @@ namespace Aurora::Threading
|
||||
return true;
|
||||
}
|
||||
|
||||
return WaitOnAddressWide(pTargetAddress, pCompareAddress, uWordSize, qwNanoseconds, false);
|
||||
return WaitOnAddressWide(pTargetAddress, pCompareAddress, uWordSize, qwNanoseconds, {}, false);
|
||||
}
|
||||
|
||||
return false;
|
||||
@ -956,6 +1095,48 @@ namespace Aurora::Threading
|
||||
}
|
||||
}
|
||||
|
||||
// Absolute timeout variant of the nanosecond resolution wait-on-address API.
//
// pTargetAddress  - address being waited on
// pCompareAddress - address of the value the target is compared against
// uWordSize       - size of the compared word
// qwNanoseconds   - absolute deadline in steady clock nanoseconds; 0 = indefinite
//
// When IsWaitOnRecommended() reports that the OS wait path should be used, the work is
// handed to InternalLTSWaitOnAddressHighRes, which performs exactly the address decode,
// WaitState setup and timed/untimed dispatch that used to be duplicated here.
// Otherwise a TryWaitOnAddress attempt is made first, and only if that fails do we
// fall back to WaitOnAddressWide with the absolute deadline.
AUKN_SYM bool WaitOnAddressSteady(void *pTargetAddress,
                                  void *pCompareAddress,
                                  AuUInt8 uWordSize,
                                  AuUInt64 qwNanoseconds)
{
    if (IsWaitOnRecommended())
    {
        return InternalLTSWaitOnAddressHighRes(pTargetAddress, pCompareAddress, uWordSize, qwNanoseconds);
    }

    if (TryWaitOnAddress(pTargetAddress, pCompareAddress, uWordSize))
    {
        return true;
    }

    // Relative timeout slot is left empty; the deadline travels in the absolute slot
    return WaitOnAddressWide(pTargetAddress, pCompareAddress, uWordSize, {}, qwNanoseconds, false);
}
|
||||
|
||||
// Future (Reece): AuThread aware (safe force-terminate)
|
||||
// There are three ways we can go about this:
|
||||
// Shared pointers
|
||||
|
@ -29,7 +29,8 @@ namespace Aurora::Threading
|
||||
struct WaitState
|
||||
{
|
||||
WaitBuffer compare;
|
||||
AuOptionalEx<AuUInt64> qwNanoseconds;
|
||||
//AuOptionalEx<AuUInt64> qwNanoseconds;
|
||||
AuOptionalEx<AuUInt64> qwNanosecondsAbs;
|
||||
AuOptionalEx<AuUInt32> uDownsizeMask;
|
||||
AuUInt32 uWordSize {};
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user