[+] Improve WoA on Windows 8+

[+] AuThreading::WaitOnAddressSteady
This commit is contained in:
Reece Wilson 2023-06-15 20:44:27 +01:00
parent 17c50eff64
commit 28201db2d7
8 changed files with 287 additions and 53 deletions

View File

@ -343,14 +343,16 @@ namespace Aurora
struct ThreadingConfig struct ThreadingConfig
{ {
bool bNoThreadNames { false }; bool bNoThreadNames { false };
bool bPlatformIsSMPProcessorOptimized { true }; // Whether to attempt to using mm_pause or similar before yielding into the kernel bool bPlatformIsSMPProcessorOptimized { true }; // Whether to attempt to using mm_pause or similar before yielding into the kernel
AuUInt8 uSpinLoopPowerA { 7 }; // Nudgable spinloop power. This is our local userland niceness factor; where 1 << n is the amount of yield instructions to stall for AuUInt8 uSpinLoopPowerA { 7 }; // Nudgable spinloop power. This is our local userland niceness factor; where 1 << n is the amount of yield instructions to stall for
bool bPreferNt51XpMutexesOver81 { true }; // Fun Fact: Undocumented Windows XP APIs are still better than whatever the fuck shit fest they sharted out under Windows Vista and 8.1
}; // Wth the former set of apis, we are still nothing more than a futex intended for nothing more than x86 bittestandset with undefined bool bPreferNt51XpMutexesOver8 { false }; // Fun Fact: Undocumented Windows XP APIs are still better than whatever the fuck shit fest they sharted out under Windows Vista and maybe 8.1
// bahviour on the higher bits, and we're crippled by some annoying thread switch function. Windows Vista superseded the dumb kernel-io bool bPerferNt51XpCondvarsOver8 { false }; // Wth the former set of apis, we are still nothing more than a futex intended for nothing more than x86 bittestandset with undefined
// based switching apis everyone thought they had to use with bloat on top of this very same 5.1 era api. }; // bahviour on the higher bits, and we're crippled by some annoying thread switch function. Windows Vista superseded the dumb kernel-io
// And to end it all off, Windows 8.1 wait/wake on address forces relative millisecond precision, in the first (?) MS OS to drop tick based [re]scheduling. // based switching apis everyone thought they had to use with bloat on top of this very same 5.1 era api.
// Our main mutex is one edge case where undcoumented XP era scheduling apis are better than the garbage indiasoft wants you to use in <current year>. // ~~ And to end it all off, Windows 8.1 wait/wake on address forces relative millisecond precision, in the first (?) MS OS to drop tick based [re]scheduling. ~~ (officially)
// Our main mutex is one edge case where undcoumented XP era scheduling apis are better than the garbage indiasoft wants you to use in <current year>.
struct RuntimeStartInfo struct RuntimeStartInfo
{ {

View File

@ -20,8 +20,15 @@ namespace Aurora::Threading
void *pCompareAddress, void *pCompareAddress,
AuUInt8 uWordSize); AuUInt8 uWordSize);
// Relative timeout variant of the nanosecond-resolution WoA. The timeout is a duration in nanoseconds, measured against the steady clock. 0 = indefinite
AUKN_SYM bool WaitOnAddress(void *pTargetAddress, AUKN_SYM bool WaitOnAddress(void *pTargetAddress,
void *pCompareAddress, void *pCompareAddress,
AuUInt8 uWordSize, AuUInt8 uWordSize,
AuUInt64 qwNanoseconds); AuUInt64 qwNanoseconds);
// Absolute timeout variant of the nanosecond-resolution WoA.
// Despite its name, qwNanoseconds is not a duration here: it is an absolute deadline expressed
// in steady clock nanoseconds (the same time base as AuTime::SteadyClockNS()). 0 = indefinite
AUKN_SYM bool WaitOnAddressSteady(void *pTargetAddress,
void *pCompareAddress,
AuUInt8 uWordSize,
AuUInt64 qwNanoseconds);
} }

View File

@ -51,6 +51,7 @@ namespace Aurora
ADD_GET_PROC(Nt, NtReleaseKeyedEvent) ADD_GET_PROC(Nt, NtReleaseKeyedEvent)
ADD_GET_PROC(Nt, NtOpenKeyedEvent) ADD_GET_PROC(Nt, NtOpenKeyedEvent)
ADD_GET_PROC(Nt, NtCreateKeyedEvent) ADD_GET_PROC(Nt, NtCreateKeyedEvent)
ADD_GET_PROC(Nt, RtlWaitOnAddress)
ADD_GET_PROC_BI(Kernel32, KernelBase, VirtualAlloc2) ADD_GET_PROC_BI(Kernel32, KernelBase, VirtualAlloc2)
ADD_GET_PROC_BI(Kernel32, KernelBase, MapViewOfFile3) ADD_GET_PROC_BI(Kernel32, KernelBase, MapViewOfFile3)
@ -79,5 +80,15 @@ namespace Aurora
pNtDelayExecution = nullptr /* ... (you dont need it, but it'll help a ton) */; pNtDelayExecution = nullptr /* ... (you dont need it, but it'll help a ton) */;
#endif #endif
gUseNativeWaitMutex = (pWaitOnAddress &&
!gRuntimeConfig.threadingConfig.bPreferNt51XpMutexesOver8 &&
(pRtlWaitOnAddress || AuBuild::kCurrentPlatform != AuBuild::EPlatform::ePlatformWin32)) ||
!pNtWaitForKeyedEvent;
gUseNativeWaitCondvar = (pWaitOnAddress &&
!gRuntimeConfig.threadingConfig.bPerferNt51XpCondvarsOver8 &&
(pRtlWaitOnAddress || AuBuild::kCurrentPlatform != AuBuild::EPlatform::ePlatformWin32)) ||
!pNtWaitForKeyedEvent;
} }
} }

View File

@ -92,9 +92,18 @@ namespace Aurora
ULONG Flags ULONG Flags
); );
// Runtime-resolved pointer to RtlWaitOnAddress (looked up via ADD_GET_PROC(Nt, RtlWaitOnAddress)).
// Callers test it for null before use and fall back to pWaitOnAddress when it is unavailable.
// addr/cmp/size describe the word to wait on and the value it is compared against.
// timeout is a LARGE_INTEGER; RunOSWaitOnAddressTimed passes a negated count of 100ns units
// when it wants a wait relative to now.
inline NTSTATUS(__stdcall *pRtlWaitOnAddress)(
const void *addr,
const void *cmp,
SIZE_T size,
const LARGE_INTEGER *timeout);
#if defined(AURORA_PLATFORM_WIN32) #if defined(AURORA_PLATFORM_WIN32)
inline NTSTATUS(_stdcall *pRtlGetVersion)( inline NTSTATUS(_stdcall *pRtlGetVersion)(
PRTL_OSVERSIONINFOW lpVersionInformation PRTL_OSVERSIONINFOW lpVersionInformation
); );
#endif #endif
// Backend selection flags, assigned alongside the ADD_GET_PROC lookups.
// Each one is true when pWaitOnAddress resolved, the matching ThreadingConfig override
// (bPreferNt51XpMutexesOver8 / bPerferNt51XpCondvarsOver8) is off, and either pRtlWaitOnAddress
// resolved or the build platform is not Win32. Each is also forced to true whenever
// pNtWaitForKeyedEvent is missing.
// NOTE(review): presumably read by the mutex / condvar primitives to pick wait-on-address over keyed events - confirm against the consumers.
inline bool gUseNativeWaitMutex {};
inline bool gUseNativeWaitCondvar {};
} }

View File

@ -15,6 +15,12 @@ namespace Aurora
{ {
void InitProcAddresses() void InitProcAddresses()
{ {
static bool gDumbInitOnce {};
if (AuExchange(gDumbInitOnce, true))
{
return;
}
#if defined(AURORA_IS_MODERNNT_DERIVED) #if defined(AURORA_IS_MODERNNT_DERIVED)
InitNTAddresses(); InitNTAddresses();
#endif #endif

View File

@ -0,0 +1,17 @@
/***
Copyright (C) 2023 J Reece Wilson (a/k/a "Reece"). All rights reserved.
File: AuWakeInternal.hpp
Date: 2023-6-15
Author: Reece
Note: Defines an LTS prototype of the nanosecond-resolution WoA API for Desktop class Win32 primitives
***/
#pragma once
namespace Aurora::Threading
{
// Internal wait-on-address entry point taking an absolute deadline.
// pTargetAddress: address to wait on
// pCompareAddress: value the target is compared against
// uWordSize: size of the compared word in bytes
// qwNanosecondsAbs: absolute deadline in steady clock nanoseconds; 0 requests an untimed wait
// Unlike the public WaitOnAddressSteady, the definition does not consult IsWaitOnRecommended()
// and has no emulated fallback; it always goes to the OS wait path.
bool InternalLTSWaitOnAddressHighRes(void *pTargetAddress,
void *pCompareAddress,
AuUInt8 uWordSize,
AuUInt64 qwNanosecondsAbs);
}

View File

@ -11,6 +11,7 @@
#if defined(AURORA_PLATFORM_WIN32) #if defined(AURORA_PLATFORM_WIN32)
#include <timeapi.h> #include <timeapi.h>
#endif #endif
#include <Time/Time.hpp>
namespace Aurora::Threading namespace Aurora::Threading
{ {
@ -124,15 +125,20 @@ namespace Aurora::Threading
{ {
AU_LOCK_GUARD(this->mutex); AU_LOCK_GUARD(this->mutex);
if (state.qwNanoseconds) if (state.qwNanosecondsAbs)
{ {
if (!WaitBuffer::From(this->pAddress, this->uSize).Compare(state)) if (!WaitBuffer::From(this->pAddress, this->uSize).Compare(state))
{ {
return true; return true;
} }
AuUInt64 uEndTime {};
auto uNow = AuTime::SteadyClockNS(); auto uNow = AuTime::SteadyClockNS();
auto uEndTime = uNow + state.qwNanoseconds.value();
if (state.qwNanosecondsAbs)
{
uEndTime = state.qwNanosecondsAbs.value();
}
#if defined(AURORA_IS_POSIX_DERIVED) #if defined(AURORA_IS_POSIX_DERIVED)
struct timespec tspec; struct timespec tspec;
@ -588,6 +594,7 @@ namespace Aurora::Threading
void *pCompareAddress, void *pCompareAddress,
AuUInt8 uWordSize, AuUInt8 uWordSize,
AuOptional<AuUInt64> qwNanoseconds, AuOptional<AuUInt64> qwNanoseconds,
AuOptional<AuUInt64> qwNanosecondsAbs,
bool bOSSupportsWait bool bOSSupportsWait
) )
{ {
@ -595,7 +602,16 @@ namespace Aurora::Threading
SysAssertDbg(uWordSize <= 8); SysAssertDbg(uWordSize <= 8);
auto pWaitEntry = gProcessWaitables.WaitBufferFrom(pTargetAddress, uWordSize); auto pWaitEntry = gProcessWaitables.WaitBufferFrom(pTargetAddress, uWordSize);
state.compare = WaitBuffer::From(pCompareAddress, uWordSize); state.compare = WaitBuffer::From(pCompareAddress, uWordSize);
state.qwNanoseconds = qwNanoseconds ? AuOptionalEx<AuUInt64> { qwNanoseconds.value() } : AuOptionalEx<AuUInt64> {}; // from default/zeroable optional, to boolean suffix
if (qwNanoseconds)
{
state.qwNanosecondsAbs = AuTime::SteadyClockNS() + qwNanoseconds.value();
}
else if (qwNanosecondsAbs)
{
state.qwNanosecondsAbs = qwNanosecondsAbs.value();
}
auto bResult = pWaitEntry->SleepOn(state); auto bResult = pWaitEntry->SleepOn(state);
#if defined(WOA_USE_DEFERRED_REL) #if defined(WOA_USE_DEFERRED_REL)
pWaitEntry->Release(); pWaitEntry->Release();
@ -687,53 +703,145 @@ namespace Aurora::Threading
static bool RunOSWaitOnAddressTimed(const void *pTargetAddress, static bool RunOSWaitOnAddressTimed(const void *pTargetAddress,
const void *pCompareAddress, const void *pCompareAddress,
AuUInt8 uWordSize, AuUInt8 uWordSize,
AuUInt64 uAbsTime, AuUInt64 uAbsTimeSteadyClock,
AuUInt32 uNanoseconds) AuUInt64 uRelativeNanoseconds,
AuOptional<AuUInt64> uAbsTimeAltClock /* hint */)
{ {
#if defined(AURORA_IS_MODERNNT_DERIVED) #if defined(AURORA_IS_MODERNNT_DERIVED)
auto uMS = AuNSToMS<AuUInt32>(uNanoseconds); if (pRtlWaitOnAddress)
if (!uMS)
{ {
auto expect = WaitBuffer::From(pCompareAddress, uWordSize); auto expect = WaitBuffer::From(pCompareAddress, uWordSize);
do
AuUInt64 uNow {};
while (uAbsTimeSteadyClock ?
(uAbsTimeSteadyClock > (uNow = AuTime::SteadyClockNS())) :
true)
{ {
LARGE_INTEGER word {};
if (uAbsTimeAltClock)
{
word.QuadPart = AuTime::ConvertTimestampNs(uAbsTimeAltClock.value());
}
else if (uAbsTimeSteadyClock)
{
if (uAbsTimeSteadyClock <= uNow)
{
return !WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress);
}
word.QuadPart = -(AuInt64(uAbsTimeSteadyClock - uNow) / 100ull);
}
if (expect.Compare(pTargetAddress)) if (expect.Compare(pTargetAddress))
{ {
AuThreading::ContextYield(); pRtlWaitOnAddress(pTargetAddress, pCompareAddress, uWordSize, &word);
if (!expect.Compare(pTargetAddress))
{
return true;
}
else if (!uAbsTimeSteadyClock)
{
return false;
}
}
else
{
return true;
} }
} }
while (uAbsTime > AuTime::SteadyClockNS());
return false;
} }
else else
{ {
(void)pWaitOnAddress((void *)pTargetAddress, (void *)pCompareAddress, uWordSize, uMS); // ~~some paths might miss the uRelativeNanoseconds, like cas loops.~~
// most paths will now skimp on the relative values
if (!WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress)) if (uAbsTimeSteadyClock && !uRelativeNanoseconds)
{ {
return true; AuInt64 iDelta = uAbsTimeSteadyClock;
} iDelta -= AuTime::SteadyClockNS();
AuUInt64 uNow {}; if (iDelta <= 0)
while (uAbsTime > (uNow = AuTime::SteadyClockNS()))
{
uMS = AuNSToMS<AuUInt32>(uAbsTime - uNow);
if (Primitives::DoTryIf([=]()
{ {
return !WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress); return !WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress);
})) }
uRelativeNanoseconds = iDelta;
}
// LockN(<1MS) on a platform without that resolution of yielding... damn
auto uMS = AuNSToMS<AuUInt32>(uRelativeNanoseconds);
if (!uMS)
{
// take a copy
auto expect = WaitBuffer::From(pCompareAddress, uWordSize);
// first: cpu spin to avoid the kernel all together
if (TryWaitOnAddress((void *)pTargetAddress, (void *)pCompareAddress, uWordSize))
{ {
return true; return true;
} }
if (!uMS) // second: yield
do
{ {
if (!expect.Compare(pTargetAddress))
{
break;
}
AuThreading::ContextYield(); AuThreading::ContextYield();
} }
else while (uAbsTimeSteadyClock > AuTime::SteadyClockNS()); // ...until times up
}
else // high level lock function was called with ms scale resolution
{
// first: wait on the address with an ms scale timeout
(void)pWaitOnAddress((void *)pTargetAddress, (void *)pCompareAddress, uWordSize, uMS);
// never trust the error value/status provided by wait addresses - instead, do a quick compare
if (!WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress))
{ {
(void)pWaitOnAddress((void *)pTargetAddress, (void *)pCompareAddress, uWordSize, uMS); // best case: we woke up during the ms-res waitonaddress
return true;
}
// attempt to yield again, potentially context switching a few times to hit any NS remainder
AuUInt64 uNow {};
unsigned uLimit {};
while (uAbsTimeSteadyClock > (uNow = AuTime::SteadyClockNS()))
{
uMS = AuNSToMS<AuUInt32>(uAbsTimeSteadyClock - uNow);
if (Primitives::DoTryIf([=]()
{
return !WaitBuffer::From(pCompareAddress, uWordSize).Compare(pTargetAddress);
}))
{
// hit it within the span of 1 << SpinLoopPowerA SMT stalls
return true;
}
if (!uMS)
{
// burn off any remainder cycles by switching contexts (this isnt a very long time usually)
if (uLimit++ < 4)
{
AuThreading::ContextYield();
}
else
{
// do not burn the cpu to meet the timeout. we'll just undershoot.
return false;
}
}
else
{
(void)pWaitOnAddress((void *)pTargetAddress, (void *)pCompareAddress, uWordSize, uMS);
}
} }
} }
} }
@ -754,7 +862,7 @@ namespace Aurora::Threading
auto uCurrent = *(AuUInt32 *)pCompareAddress; auto uCurrent = *(AuUInt32 *)pCompareAddress;
struct timespec tspec; struct timespec tspec;
Time::auabsns2ts(&tspec, uAbsTime); Time::auabsns2ts(&tspec, uAbsTimeAltClock ? uAbsTimeAltClock.value() : uAbsTimeSteadyClock);
do do
{ {
@ -795,16 +903,16 @@ namespace Aurora::Threading
} }
} }
static bool RunOSWaitOnAddressTimedNoErrors(const void *pTargetAddress, static bool RunOSWaitOnAddressTimedSteady(const void *pTargetAddress,
const void *pCompareAddress, const void *pCompareAddress,
WaitState &state) WaitState &state)
{ {
if (!WaitBuffer::From(pTargetAddress, state.uWordSize).Compare(state)) if (!WaitBuffer::From(pTargetAddress, state.uWordSize).Compare(state))
{ {
return true; return true;
} }
(void)RunOSWaitOnAddressTimed(pTargetAddress, pCompareAddress, state.uWordSize, AuTime::SteadyClockNS() + state.qwNanoseconds.value(), state.qwNanoseconds.value()); (void)RunOSWaitOnAddressTimed(pTargetAddress, pCompareAddress, state.uWordSize, state.qwNanosecondsAbs.value(), 0, { });
return !WaitBuffer::From(pTargetAddress, state.uWordSize).Compare(state); return !WaitBuffer::From(pTargetAddress, state.uWordSize).Compare(state);
} }
@ -834,6 +942,37 @@ namespace Aurora::Threading
#endif #endif
} }
// Internal, absolute-deadline wait-on-address that always uses the OS wait path.
// Windows 8+ thread primitives might call this instead of the public API.
// It works on Linux and Windows 8+; it intentionally does not work on emulated platforms,
// because IsWaitOnRecommended() is never consulted and there is no software fallback here.
//
// qwNanosecondsAbs is an absolute steady clock deadline in nanoseconds; 0 means wait untimed.
// Returns true after an untimed wait; otherwise returns what RunOSWaitOnAddressTimedSteady reports.
bool InternalLTSWaitOnAddressHighRes(void *pTargetAddress,
                                     void *pCompareAddress,
                                     AuUInt8 uWordSize,
                                     AuUInt64 qwNanosecondsAbs)
{
    // DecodeAddress yields the address to actually wait on, the byte offset that was applied to
    // reach it, and an optional mask; the compare pointer is shifted back by the same offset.
    auto [pWaitAddress, uDelta, uMask] = DecodeAddress(pTargetAddress, uWordSize);
    auto pShiftedCompare = AuReinterpretCast<char *>(pCompareAddress) - uDelta;

    WaitState state;
    state.uDownsizeMask = uMask;

    if (uMask)
    {
        // a mask was produced: the wait is carried out on a 4 byte word
        state.compare = WaitBuffer::From(pShiftedCompare, 4);
        state.uWordSize = 4;
    }
    else
    {
        state.compare = WaitBuffer::From(pShiftedCompare, uWordSize);
        state.uWordSize = uWordSize;
    }

    if (qwNanosecondsAbs)
    {
        state.qwNanosecondsAbs = qwNanosecondsAbs;
        return RunOSWaitOnAddressTimedSteady(pWaitAddress, pShiftedCompare, state);
    }

    // no deadline: block until woken
    RunOSWaitOnAddressNoTimedNoErrors(pWaitAddress, pShiftedCompare, state);
    return true;
}
AUKN_SYM bool WaitOnAddress(void *pTargetAddress, AUKN_SYM bool WaitOnAddress(void *pTargetAddress,
void *pCompareAddress, void *pCompareAddress,
AuUInt8 uWordSize, AuUInt8 uWordSize,
@ -859,8 +998,8 @@ namespace Aurora::Threading
} }
else else
{ {
state.qwNanoseconds = qwNanoseconds; state.qwNanosecondsAbs = qwNanoseconds + AuTime::GetSteadyClock();
return RunOSWaitOnAddressTimedNoErrors(pWaitAddress, pCompareAddress2, state); return RunOSWaitOnAddressTimedSteady(pWaitAddress, pCompareAddress2, state);
} }
} }
else else
@ -870,7 +1009,7 @@ namespace Aurora::Threading
return true; return true;
} }
return WaitOnAddressWide(pTargetAddress, pCompareAddress, uWordSize, qwNanoseconds, false); return WaitOnAddressWide(pTargetAddress, pCompareAddress, uWordSize, qwNanoseconds, {}, false);
} }
return false; return false;
@ -956,6 +1095,48 @@ namespace Aurora::Threading
} }
} }
// Absolute-deadline variant of WaitOnAddress.
//
// pTargetAddress: address to wait on
// pCompareAddress: value the target is compared against
// uWordSize: size of the compared word in bytes
// qwNanoseconds: despite the name, an absolute deadline in steady clock nanoseconds;
//                0 requests an untimed wait
//
// When IsWaitOnRecommended() holds, the wait goes to the OS path. That path is exactly what
// InternalLTSWaitOnAddressHighRes implements, so it is called here rather than duplicated.
// Otherwise a TryWaitOnAddress attempt is made first, then the call falls back to
// WaitOnAddressWide with the deadline passed in its absolute-time slot.
AUKN_SYM bool WaitOnAddressSteady(void *pTargetAddress,
                                  void *pCompareAddress,
                                  AuUInt8 uWordSize,
                                  AuUInt64 qwNanoseconds)
{
    if (IsWaitOnRecommended())
    {
        return InternalLTSWaitOnAddressHighRes(pTargetAddress, pCompareAddress, uWordSize, qwNanoseconds);
    }

    // cheap attempt before committing to the fallback wait
    if (TryWaitOnAddress(pTargetAddress, pCompareAddress, uWordSize))
    {
        return true;
    }

    // {} = no relative timeout; qwNanoseconds is handed over as the absolute deadline
    return WaitOnAddressWide(pTargetAddress, pCompareAddress, uWordSize, {}, qwNanoseconds, false);
}
// Future (Reece): AuThread aware (safe force-terminate) // Future (Reece): AuThread aware (safe force-terminate)
// There are three ways we can go about this: // There are three ways we can go about this:
// Shared pointers // Shared pointers

View File

@ -29,7 +29,8 @@ namespace Aurora::Threading
struct WaitState struct WaitState
{ {
WaitBuffer compare; WaitBuffer compare;
AuOptionalEx<AuUInt64> qwNanoseconds; //AuOptionalEx<AuUInt64> qwNanoseconds;
AuOptionalEx<AuUInt64> qwNanosecondsAbs;
AuOptionalEx<AuUInt32> uDownsizeMask; AuOptionalEx<AuUInt32> uDownsizeMask;
AuUInt32 uWordSize {}; AuUInt32 uWordSize {};
}; };