AuroraRuntime/Source/Threading/AuWakeOnAddress.hpp
Jamie Reece Wilson 035d822ec1 [*] Explicit memory order access barrier when reading the WOA_SEMAPHORE_MODE-less bAlive under weakly ordered systems. (5b193411 cont: "[*] Improve regressed AuWoA time to wake")
In all other cases, the memory is either thread-local and write-local, or is followed up by an indirect acquire/release of the processor's pipeline and L1 cache by virtue of the container's dumb spinlock ::Lock, ::Unlock (...release, ...barrier).
Clang doesn't have /volatile:ms anymore, so we can't rely on that.
Assuming MSVC-like or x86 isn't good enough.

(And volatile is a fine keyword for this job, spec pedantry aside: the point is to control over-optimization of de facto weakly ordered access between explicit lockless semaphore yields.)
2024-06-23 04:29:21 +01:00
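
A minimal sketch of the access pattern this commit describes, assuming only C++11 <atomic>/<cstdint>; the helper name is hypothetical, and the real recheck lives in WaitEntry::SleepOn in the implementation file:

    #include <atomic>
    #include <cstdint>

    // Reader side of the WOA_SEMAPHORE_MODE-less path: the flag is volatile so the
    // compiler cannot hoist or fold the load across the yield loop, and the explicit
    // fence gives that plain load acquire semantics on weakly ordered targets
    // (ARM/AArch64), since Clang no longer offers /volatile:ms.
    static bool IsEntryStillQueued(volatile std::uint8_t &bAlive)
    {
        std::uint8_t uAlive = bAlive;                         // volatile, untorn byte load
        std::atomic_thread_fence(std::memory_order_acquire);  // order subsequent reads after it
        return uAlive != 0;                                   // false: detached; reschedule with inverted order
    }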


/***
Copyright (C) 2023 J Reece Wilson (a/k/a "Reece"). All rights reserved.
File: AuWakeOnAddress.hpp
Date: 2023-3-10
Author: Reece
***/
#pragma once
#include "Primitives/AuWoASemaphore.hpp"
#include "Primitives/AuConditionMutex.Generic.hpp"
#include "Primitives/AuConditionVariable.Generic.hpp"
#include "Primitives/AuSemaphore.Generic.hpp"
#if defined(AURORA_COMPILER_MSVC)
#define WOAFAST __declspec(safebuffers) auline
#define WOAFASTPUB AUKN_SYM __declspec(safebuffers) auline
#else
#define WOAFAST auline
#define WOAFASTPUB AUKN_SYM
#endif
namespace Aurora::Threading
{
    static const auto kDefaultWaitPerProcess = 128;
    static const auto kMax64 = 0xFFFFFFFFFFFFFFFFull;
    static const auto kPlatformFutexNoForcedAlignedU32 = AuBuild::kIsNTDerived;

    struct WaitState;

    struct WaitBuffer
    {
        char buffer[32];
        AuUInt8 uSize;

        WOAFAST static WaitBuffer From(const void *pBuf, AuUInt8 uSize);
        WOAFAST static bool Compare(const void *pHotAddress, AuUInt8 uSize, WaitState &state);
        WOAFAST static bool Compare(const void *pHotAddress, AuUInt8 uSize, const void *pCompare, AuUInt64 uMask, EWaitMethod eMethod);

        // returns false when valid
        template <EWaitMethod eMethod, bool bFast = false>
        WOAFAST static bool Compare2(const void *pHotAddress, AuUInt8 uSize, const void *pReference, AuUInt64 uMask = 0xFFFFFFFFFFFFFFFF);

        template <EWaitMethod eMethod, bool bFast = false>
        WOAFAST static bool Compare2(const volatile void *pHotAddress, AuUInt8 uSize, const void *pReference, AuUInt64 uMask = 0xFFFFFFFFFFFFFFFF);
    };
    struct WaitState
    {
        WaitBuffer compare;
        //AuOptionalEx<AuUInt64> qwNanoseconds;
        AuOptionalEx<AuUInt64> qwNanosecondsAbs;
        AuUInt64 uDownsizeMask { 0xFFFFFFFFFFFFFFFF };
        AuUInt32 uWordSize {};
        const void *pCompare2 {};
        EWaitMethod eWaitMethod { EWaitMethod::eNotEqual };
    };
    struct WaitEntry
    {
        WaitEntry();
        ~WaitEntry();

        WaitEntry * volatile pNext {};
        WaitEntry * volatile pBefore {};

        // synch
        #if defined(WOA_SEMAPHORE_MODE)

        #if !defined(WOA_SEMAPHORE_SEMAPHORE)
        Primitives::Semaphore semaphore;
        #else
        // Recommended for XNU targets:
        WOA_SEMAPHORE_SEMAPHORE semaphore;
        #endif

        #else

        // Recommended (we can better filter spurious wakes for the cost of a barrier on signal):
        // !!! we also prefer to hold the container's mutex while we signal each thread individually !!!
        // !!! for the sake of optimizing for Windows XP - 7, it's far nicer to keep the entire signal and wait operations under a container lock than it is to buffer shared pointers or externally managed memory out of the lock scope !!!
        // !!! also note: the container spinlocks are not the same lock as WaitEntry::mutex !!!
        #if !defined(WOA_CONDVAR_MUTEX)
        Primitives::ConditionMutexInternal mutex;       // the mutex ctor must run before the condition variable's
        Primitives::ConditionVariableInternal variable; // ...built on something every 2007+ micro- and monolithic kernel should have: an event or semaphore primitive on which we can form a crude condvar
        #else
        WOA_CONDVAR_MUTEX mutex;
        WOA_CONDVAR_VARIABLE variable;
        #endif

        #endif

        // state
        const void *pAddress {};
        AuUInt8 uSize {};
        const void *pCompareAddress {};
        EWaitMethod eWaitMethod { EWaitMethod::eNotEqual };

        // bookkeeping (parent container)
        volatile AuUInt8 bAlive {}; // wait entry validity. must be rechecked on each spurious or expected wake, if the comparison doesn't break the yield loop.
                                    // if false, and we're still yielding under pCompare == pAddress, we must reschedule with inverted order (so as to steal the next signal, as opposed to waiting last)

        void Release();

        template <EWaitMethod eMethod>
        bool SleepOn(WaitState &state);

        bool TrySignalAddress(const void *pAddress);
    };
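    // Illustrative only, not part of the API: under the default (WOA_SEMAPHORE_MODE-less)
    // build, a SleepOn<eMethod> implementation is expected to follow this rough shape;
    // no real primitive methods are named here.
    //
    //     while (the comparison against pAddress still says "keep waiting")
    //     {
    //         // lock this->mutex and block on this->variable (timed via qwNanosecondsAbs)
    //         // on wake: re-read this->bAlive with acquire semantics (see commit note above)
    //         //   - still alive: loop and re-run the comparison
    //         //   - not alive:   a signaler detached us; reschedule through the parent
    //         //                  container with inverted order to steal the next signal
    //     }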
    struct ProcessListWait
    {
        WaitEntry *pHead {};
        WaitEntry *pTail {};
    };
    struct ProcessWaitNodeContainer
    {
        AuUInt32 uAtomic {};
        ProcessListWait waitList;

        WaitEntry *WaitBufferFrom(const void *pAddress, AuUInt8 uSize, bool bScheduleFirst, const void *pAddressCompare, EWaitMethod eWaitMethod);

        template <typename T>
        bool IterateWake(T callback);

        void RemoveSelf(WaitEntry *pSelf);
        void RemoveEntry(WaitEntry *pSelf, bool bAllUnderLock);

        void Lock();
        void Unlock();
    };
    struct ProcessWaitContainer
    {
        ProcessWaitNodeContainer list[kDefaultWaitPerProcess];

        WaitEntry *WaitBufferFrom(const void *pAddress, AuUInt8 uSize, bool bScheduleFirst, const void *pAddressCompare, EWaitMethod eWaitMethod);

        template <typename T>
        bool IterateWake(const void *pAddress, T callback);

        void RemoveSelf(const void *pAddress, WaitEntry *pSelf);
    };
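
    // Illustrative only: the per-process flow implied by the two containers above.
    // A waiter obtains a WaitEntry keyed on pAddress via ProcessWaitContainer::WaitBufferFrom,
    // sleeps on it, and calls RemoveSelf on wake or timeout; a waker calls
    // IterateWake(pAddress, callback) and signals matching entries while the bucket's
    // spinlock (ProcessWaitNodeContainer::Lock/Unlock) is held, per the notes in WaitEntry.
    // The bucket is presumably selected by hashing pAddress into list[kDefaultWaitPerProcess];
    // that mapping lives in the implementation file, not here.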
}