[+] Improve WoA on Windows 8+

[+] AuThreading::WaitOnAddressSteady
This commit is contained in:
Reece Wilson 2023-06-15 20:44:27 +01:00
parent 17c50eff64
commit 28201db2d7
8 changed files with 287 additions and 53 deletions

View File

@ -345,13 +345,15 @@ namespace Aurora
bool bNoThreadNames { false };
bool bPlatformIsSMPProcessorOptimized { true }; // Whether to attempt to using mm_pause or similar before yielding into the kernel
AuUInt8 uSpinLoopPowerA { 7 }; // Nudgable spinloop power. This is our local userland niceness factor; where 1 << n is the amount of yield instructions to stall for
bool bPreferNt51XpMutexesOver81 { true }; // Fun Fact: Undocumented Windows XP APIs are still better than whatever the fuck shit fest they sharted out under Windows Vista and 8.1
}; // Wth the former set of apis, we are still nothing more than a futex intended for nothing more than x86 bittestandset with undefined
// bahviour on the higher bits, and we're crippled by some annoying thread switch function. Windows Vista superseded the dumb kernel-io
bool bPreferNt51XpMutexesOver8 { false }; // Fun Fact: Undocumented Windows XP APIs are still better than whatever the fuck shit fest they sharted out under Windows Vista and maybe 8.1
bool bPerferNt51XpCondvarsOver8 { false }; // Wth the former set of apis, we are still nothing more than a futex intended for nothing more than x86 bittestandset with undefined
}; // bahviour on the higher bits, and we're crippled by some annoying thread switch function. Windows Vista superseded the dumb kernel-io
// based switching apis everyone thought they had to use with bloat on top of this very same 5.1 era api.
// And to end it all off, Windows 8.1 wait/wake on address forces relative millisecond precision, in the first (?) MS OS to drop tick based [re]scheduling.
// ~~ And to end it all off, Windows 8.1 wait/wake on address forces relative millisecond precision, in the first (?) MS OS to drop tick based [re]scheduling. ~~ (officially)
// Our main mutex is one edge case where undocumented XP era scheduling apis are better than the APIs Microsoft wants you to use in <current year>.
struct RuntimeStartInfo
{
ConsoleConfig console;

View File

@ -20,8 +20,15 @@ namespace Aurora::Threading
void *pCompareAddress,
AuUInt8 uWordSize);
// Relative timeout variant of nanosecond resolution WoA (wait-on-address).
//  pTargetAddress:  the address to wait on
//  pCompareAddress: points to the value the target word is compared against
//  uWordSize:       size of the compared word, in bytes
//  qwNanoseconds:   timeout relative to now, in nanoseconds of steady clock time. 0 = indefinite
AUKN_SYM bool WaitOnAddress(void *pTargetAddress,
void *pCompareAddress,
AuUInt8 uWordSize,
AuUInt64 qwNanoseconds);
// Absolute timeout variant of nanosecond resolution WoA (wait-on-address).
//  pTargetAddress:  the address to wait on
//  pCompareAddress: points to the value the target word is compared against
//  uWordSize:       size of the compared word, in bytes
//  qwNanoseconds:   absolute deadline, expressed in nanoseconds of steady clock time
//                   (i.e. a timestamp, not a duration). 0 = indefinite
AUKN_SYM bool WaitOnAddressSteady(void *pTargetAddress,
void *pCompareAddress,
AuUInt8 uWordSize,
AuUInt64 qwNanoseconds);
}

View File

@ -51,6 +51,7 @@ namespace Aurora
ADD_GET_PROC(Nt, NtReleaseKeyedEvent)
ADD_GET_PROC(Nt, NtOpenKeyedEvent)
ADD_GET_PROC(Nt, NtCreateKeyedEvent)
ADD_GET_PROC(Nt, RtlWaitOnAddress)
ADD_GET_PROC_BI(Kernel32, KernelBase, VirtualAlloc2)
ADD_GET_PROC_BI(Kernel32, KernelBase, MapViewOfFile3)
@ -79,5 +80,15 @@ namespace Aurora
pNtDelayExecution = nullptr /* ... (you dont need it, but it'll help a ton) */;
#endif
gUseNativeWaitMutex = (pWaitOnAddress &&
!gRuntimeConfig.threadingConfig.bPreferNt51XpMutexesOver8 &&
(pRtlWaitOnAddress || AuBuild::kCurrentPlatform != AuBuild::EPlatform::ePlatformWin32)) ||
!pNtWaitForKeyedEvent;
gUseNativeWaitCondvar = (pWaitOnAddress &&
!gRuntimeConfig.threadingConfig.bPerferNt51XpCondvarsOver8 &&
(pRtlWaitOnAddress || AuBuild::kCurrentPlatform != AuBuild::EPlatform::ePlatformWin32)) ||
!pNtWaitForKeyedEvent;
}
}

View File

@ -92,9 +92,18 @@ namespace Aurora
ULONG Flags
);
// Function pointer for RtlWaitOnAddress, registered for lookup via ADD_GET_PROC(Nt, RtlWaitOnAddress).
// May be null when the symbol is not resolved; users test the pointer before calling through it.
//  addr:    address to wait on
//  cmp:     points to the value the word at addr is compared against
//  size:    size of the compared word
//  timeout: the timed wait path passes either a negative relative value in 100ns units
//           or a converted absolute timestamp
inline NTSTATUS(__stdcall *pRtlWaitOnAddress)(
const void *addr,
const void *cmp,
SIZE_T size,
const LARGE_INTEGER *timeout);
#if defined(AURORA_PLATFORM_WIN32)
inline NTSTATUS(_stdcall *pRtlGetVersion)(
PRTL_OSVERSIONINFOW lpVersionInformation
);
#endif
// Backend selection flags computed once the NT procedure addresses have been resolved.
// Each is true when pWaitOnAddress is available, the matching "prefer NT 5.1" threading config
// option (bPreferNt51XpMutexesOver8 / bPerferNt51XpCondvarsOver8) is off, and either
// pRtlWaitOnAddress resolved or the platform is not Win32. Each is also forced to true when
// pNtWaitForKeyedEvent is unavailable.
inline bool gUseNativeWaitMutex {};
inline bool gUseNativeWaitCondvar {};
}

View File

@ -15,6 +15,12 @@ namespace Aurora
{
void InitProcAddresses()
{
static bool gDumbInitOnce {};
if (AuExchange(gDumbInitOnce, true))
{
return;
}
#if defined(AURORA_IS_MODERNNT_DERIVED)
InitNTAddresses();
#endif

View File

@ -0,0 +1,17 @@
/***
Copyright (C) 2023 J Reece Wilson (a/k/a "Reece"). All rights reserved.
File: AuWakeInternal.hpp
Date: 2023-6-15
Author: Reece
Note: Defines an LTS prototype for nanosecond-resolution WoA API for Desktop class Win32 primitives
***/
#pragma once
namespace Aurora::Threading
{
// Internal wait-on-address entry point that always goes through the OS wait helpers.
//  pTargetAddress:   the address to wait on
//  pCompareAddress:  points to the value the target word is compared against
//  uWordSize:        size of the compared word, in bytes
//  qwNanosecondsAbs: absolute deadline in nanoseconds; 0 requests an untimed wait
//                    (presumably steady clock time, matching WaitOnAddressSteady - confirm)
// Returns true after an untimed wait; for a timed wait, returns the result of the timed OS wait helper.
bool InternalLTSWaitOnAddressHighRes(void *pTargetAddress,
void *pCompareAddress,
AuUInt8 uWordSize,
AuUInt64 qwNanosecondsAbs);
}

View File

@ -11,6 +11,7 @@
#if defined(AURORA_PLATFORM_WIN32)
#include <timeapi.h>
#endif
#include <Time/Time.hpp>
namespace Aurora::Threading
{
@ -124,15 +125,20 @@ namespace Aurora::Threading
{
AU_LOCK_GUARD(this->mutex);
if (state.qwNanoseconds)
if (state.qwNanosecondsAbs)
{
if (!WaitBuffer::From(this->pAddress, this->uSize).Compare(state))
{
return true;
}
AuUInt64 uEndTime {};
auto uNow = AuTime::SteadyClockNS();
auto uEndTime = uNow + state.qwNanoseconds.value();
if (state.qwNanosecondsAbs)
{
uEndTime = state.qwNanosecondsAbs.value();
}
#if defined(AURORA_IS_POSIX_DERIVED)
struct timespec tspec;
@ -588,6 +594,7 @@ namespace Aurora::Threading
void *pCompareAddress,
AuUInt8 uWordSize,
AuOptional<AuUInt64> qwNanoseconds,
AuOptional<AuUInt64> qwNanosecondsAbs,
bool bOSSupportsWait
)
{
@ -595,7 +602,16 @@ namespace Aurora::Threading
SysAssertDbg(uWordSize <= 8);
auto pWaitEntry = gProcessWaitables.WaitBufferFrom(pTargetAddress, uWordSize);
state.compare = WaitBuffer::From(pCompareAddress, uWordSize);
state.qwNanoseconds = qwNanoseconds ? AuOptionalEx<AuUInt64> { qwNanoseconds.value() } : AuOptionalEx<AuUInt64> {}; // from default/zeroable optional, to boolean suffix
if (qwNanoseconds)
{
state.qwNanosecondsAbs = AuTime::SteadyClockNS() + qwNanoseconds.value();
}
else if (qwNanosecondsAbs)
{
state.qwNanosecondsAbs = qwNanosecondsAbs.value();
}
auto bResult = pWaitEntry->SleepOn(state);
#if defined(WOA_USE_DEFERRED_REL)
pWaitEntry->Release();
@ -687,56 +703,148 @@ namespace Aurora::Threading
static bool RunOSWaitOnAddressTimed(const void *pTargetAddress,
const void *pCompareAddress,
AuUInt8 uWordSize,
AuUInt64 uAbsTime,
AuUInt32 uNanoseconds)
AuUInt64 uAbsTimeSteadyClock,
AuUInt64 uRelativeNanoseconds,
AuOptional<AuUInt64> uAbsTimeAltClock /* hint */)
{
#if defined(AURORA_IS_MODERNNT_DERIVED)
auto uMS = AuNSToMS<AuUInt32>(uNanoseconds);
if (!uMS)
if (pRtlWaitOnAddress)
{
auto expect = WaitBuffer::From(pCompareAddress, uWordSize);
do
AuUInt64 uNow {};
while (uAbsTimeSteadyClock ?
(uAbsTimeSteadyClock > (uNow = AuTime::SteadyClockNS())) :
true)
{
LARGE_INTEGER word {};
if (uAbsTimeAltClock)
{
word.QuadPart = AuTime::ConvertTimestampNs(uAbsTimeAltClock.value());
}
else if (uAbsTimeSteadyClock)
{
if (uAbsTimeSteadyClock <= uNow)
{
return !WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress);
}
word.QuadPart = -(AuInt64(uAbsTimeSteadyClock - uNow) / 100ull);
}
if (expect.Compare(pTargetAddress))
{
AuThreading::ContextYield();
pRtlWaitOnAddress(pTargetAddress, pCompareAddress, uWordSize, &word);
if (!expect.Compare(pTargetAddress))
{
return true;
}
else if (!uAbsTimeSteadyClock)
{
return false;
}
while (uAbsTime > AuTime::SteadyClockNS());
}
else
{
(void)pWaitOnAddress((void *)pTargetAddress, (void *)pCompareAddress, uWordSize, uMS);
return true;
}
}
if (!WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress))
return false;
}
else
{
// ~~some paths might miss the uRelativeNanoseconds, like cas loops.~~
// most paths will now skimp on the relative values
if (uAbsTimeSteadyClock && !uRelativeNanoseconds)
{
AuInt64 iDelta = uAbsTimeSteadyClock;
iDelta -= AuTime::SteadyClockNS();
if (iDelta <= 0)
{
return !WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress);
}
uRelativeNanoseconds = iDelta;
}
// LockN(<1MS) on a platform without that resolution of yielding... damn
auto uMS = AuNSToMS<AuUInt32>(uRelativeNanoseconds);
if (!uMS)
{
// take a copy
auto expect = WaitBuffer::From(pCompareAddress, uWordSize);
// first: cpu spin to avoid the kernel all together
if (TryWaitOnAddress((void *)pTargetAddress, (void *)pCompareAddress, uWordSize))
{
return true;
}
AuUInt64 uNow {};
while (uAbsTime > (uNow = AuTime::SteadyClockNS()))
// second: yield
do
{
uMS = AuNSToMS<AuUInt32>(uAbsTime - uNow);
if (!expect.Compare(pTargetAddress))
{
break;
}
AuThreading::ContextYield();
}
while (uAbsTimeSteadyClock > AuTime::SteadyClockNS()); // ...until times up
}
else // high level lock function was called with ms scale resolution
{
// first: wait on the address with an ms scale timeout
(void)pWaitOnAddress((void *)pTargetAddress, (void *)pCompareAddress, uWordSize, uMS);
// never trust the error value/status provided by wait addresses - instead, do a quick compare
if (!WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress))
{
// best case: we woke up during the ms-res waitonaddress
return true;
}
// attempt to yield again, potentially context switching a few times to hit any NS remainder
AuUInt64 uNow {};
unsigned uLimit {};
while (uAbsTimeSteadyClock > (uNow = AuTime::SteadyClockNS()))
{
uMS = AuNSToMS<AuUInt32>(uAbsTimeSteadyClock - uNow);
if (Primitives::DoTryIf([=]()
{
return !WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress);
}))
{
// hit it within the span of 1 << SpinLoopPowerA SMT stalls
return true;
}
if (!uMS)
{
// burn off any remainder cycles by switching contexts (this isnt a very long time usually)
if (uLimit++ < 4)
{
AuThreading::ContextYield();
}
else
{
// do not burn the cpu to meet the timeout. we'll just undershoot.
return false;
}
}
else
{
(void)pWaitOnAddress((void *)pTargetAddress, (void *)pCompareAddress, uWordSize, uMS);
}
}
}
}
#endif
@ -754,7 +862,7 @@ namespace Aurora::Threading
auto uCurrent = *(AuUInt32 *)pCompareAddress;
struct timespec tspec;
Time::auabsns2ts(&tspec, uAbsTime);
Time::auabsns2ts(&tspec, uAbsTimeAltClock ? uAbsTimeAltClock.value() : uAbsTimeSteadyClock);
do
{
@ -795,7 +903,7 @@ namespace Aurora::Threading
}
}
static bool RunOSWaitOnAddressTimedNoErrors(const void *pTargetAddress,
static bool RunOSWaitOnAddressTimedSteady(const void *pTargetAddress,
const void *pCompareAddress,
WaitState &state)
{
@ -804,7 +912,7 @@ namespace Aurora::Threading
return true;
}
(void)RunOSWaitOnAddressTimed(pTargetAddress, pCompareAddress, state.uWordSize, AuTime::SteadyClockNS() + state.qwNanoseconds.value(), state.qwNanoseconds.value());
(void)RunOSWaitOnAddressTimed(pTargetAddress, pCompareAddress, state.uWordSize, state.qwNanosecondsAbs.value(), 0, { });
return !WaitBuffer::From(pTargetAddress, state.uWordSize).Compare(state);
}
@ -834,6 +942,37 @@ namespace Aurora::Threading
#endif
}
// Windows 8+ thread primitives might use me instead of the public API
// it does work on Linux and Windows 8+
// it does not, however, work on emulated platforms
// this is intentional
bool InternalLTSWaitOnAddressHighRes(void *pTargetAddress,
void *pCompareAddress,
AuUInt8 uWordSize,
AuUInt64 qwNanosecondsAbs)
{
auto [pWaitAddress, uDelta, uMask] = DecodeAddress(pTargetAddress, uWordSize);
auto pCompareAddress2 = AuReinterpretCast<char *>(pCompareAddress) - uDelta;
WaitState state;
state.uDownsizeMask = uMask;
state.compare = uMask ?
WaitBuffer::From(pCompareAddress2, 4) :
WaitBuffer::From(pCompareAddress2, uWordSize);
state.uWordSize = uMask ? 4 : uWordSize;
if (!qwNanosecondsAbs)
{
RunOSWaitOnAddressNoTimedNoErrors(pWaitAddress, pCompareAddress2, state);
return true;
}
else
{
state.qwNanosecondsAbs = qwNanosecondsAbs;
return RunOSWaitOnAddressTimedSteady(pWaitAddress, pCompareAddress2, state);
}
}
AUKN_SYM bool WaitOnAddress(void *pTargetAddress,
void *pCompareAddress,
AuUInt8 uWordSize,
@ -859,8 +998,8 @@ namespace Aurora::Threading
}
else
{
state.qwNanoseconds = qwNanoseconds;
return RunOSWaitOnAddressTimedNoErrors(pWaitAddress, pCompareAddress2, state);
state.qwNanosecondsAbs = qwNanoseconds + AuTime::GetSteadyClock();
return RunOSWaitOnAddressTimedSteady(pWaitAddress, pCompareAddress2, state);
}
}
else
@ -870,7 +1009,7 @@ namespace Aurora::Threading
return true;
}
return WaitOnAddressWide(pTargetAddress, pCompareAddress, uWordSize, qwNanoseconds, false);
return WaitOnAddressWide(pTargetAddress, pCompareAddress, uWordSize, qwNanoseconds, {}, false);
}
return false;
@ -956,6 +1095,48 @@ namespace Aurora::Threading
}
}
// Absolute-deadline wait-on-address.
//
//  pTargetAddress:  the address to wait on
//  pCompareAddress: points to the value the target word is compared against
//  uWordSize:       size of the compared word, in bytes
//  qwNanoseconds:   absolute deadline in nanoseconds of steady clock time; 0 = indefinite
//
// When IsWaitOnRecommended() is false, this tries TryWaitOnAddress first and then
// falls back to WaitOnAddressWide with the absolute deadline. Otherwise it uses the
// RunOSWaitOnAddress* helpers directly.
AUKN_SYM bool WaitOnAddressSteady(void *pTargetAddress,
                                  void *pCompareAddress,
                                  AuUInt8 uWordSize,
                                  AuUInt64 qwNanoseconds)
{
    if (!IsWaitOnRecommended())
    {
        if (TryWaitOnAddress(pTargetAddress, pCompareAddress, uWordSize))
        {
            return true;
        }

        // Relative timeout left empty; the deadline travels in the absolute slot.
        return WaitOnAddressWide(pTargetAddress, pCompareAddress, uWordSize, {}, qwNanoseconds, false);
    }

    auto [pDecodedTarget, uByteOffset, uDownsize] = DecodeAddress(pTargetAddress, uWordSize);

    // Shift the compare pointer back by the same delta DecodeAddress reported for the target.
    auto pDecodedCompare = AuReinterpretCast<char *>(pCompareAddress) - uByteOffset;

    WaitState waitState;
    waitState.uDownsizeMask = uDownsize;

    if (uDownsize)
    {
        // A mask was produced: compare a full 4 byte word rather than the requested size.
        waitState.compare = WaitBuffer::From(pDecodedCompare, 4);
        waitState.uWordSize = 4;
    }
    else
    {
        waitState.compare = WaitBuffer::From(pDecodedCompare, uWordSize);
        waitState.uWordSize = uWordSize;
    }

    if (qwNanoseconds)
    {
        waitState.qwNanosecondsAbs = qwNanoseconds;
        return RunOSWaitOnAddressTimedSteady(pDecodedTarget, pDecodedCompare, waitState);
    }

    // No deadline: block until the untimed helper returns.
    RunOSWaitOnAddressNoTimedNoErrors(pDecodedTarget, pDecodedCompare, waitState);
    return true;
}
// Future (Reece): AuThread aware (safe force-terminate)
// There are three ways we can go about this:
// Shared pointers

View File

@ -29,7 +29,8 @@ namespace Aurora::Threading
// Per-wait parameters shared by the wait-on-address sleep and compare paths.
struct WaitState
{
// Snapshot of the caller's comparison value, built with WaitBuffer::From.
WaitBuffer compare;
// NOTE(review): relative timeout field; it appears alongside a commented-out copy of itself
// and the users shown here read qwNanosecondsAbs instead - confirm which declaration is current.
AuOptionalEx<AuUInt64> qwNanoseconds;
//AuOptionalEx<AuUInt64> qwNanoseconds;
// Absolute deadline in steady clock nanoseconds; left unset for an untimed wait.
AuOptionalEx<AuUInt64> qwNanosecondsAbs;
// Mask reported by DecodeAddress; when set, the wait compares a 4 byte word instead of the requested size.
AuOptionalEx<AuUInt32> uDownsizeMask;
// Size in bytes of the compared word (4 when uDownsizeMask is set, otherwise the caller's word size).
AuUInt32 uWordSize {};
};