[+] By-raw pointer WOA lists

(also they are now fairer)
[+] Steps towards future proofing NT (not the future proofing itself)
This commit is contained in:
Reece Wilson 2023-06-12 18:31:10 +01:00
parent 50413f36e5
commit 1a8acbdde5
4 changed files with 273 additions and 41 deletions

View File

@ -97,11 +97,13 @@ namespace Aurora::Threading
void WaitEntry::Release()
{
#if 0
if (this->bOverflow)
{
gProcessWaitables.Remove(this);
this->bOverflow = false;
}
#endif
AuResetMember(this->uSize);
AuResetMember(this->pAddress);
@ -273,6 +275,7 @@ namespace Aurora::Threading
WaitEntry *ProcessWaitContainer::WaitBufferFrom(void *pAddress, AuUInt8 uSize)
{
#if defined(WOA_ENABLE_OLD_SHORT_LIST)
for (AU_ITERATE_N(i, kDefaultWaitPerProcess))
{
if (this->entries[i].TryAcquire(pAddress, uSize))
@ -280,13 +283,27 @@ namespace Aurora::Threading
return &this->entries[i];
}
}
#endif
auto pReturn = &tlsWaitEntry;
pReturn->bReleaseOnWake = true;
pReturn->pAddress = pAddress;
pReturn->uSize = uSize;
pReturn->uAtomic = 0;
{
Lock();
this->overflow.push_back(pReturn);
pReturn->bOverflow = true;
if (auto pLoadFromMemory = this->waitList.pHead)
{
pReturn->pNext = pLoadFromMemory;
pLoadFromMemory->pBefore = pReturn;
}
else
{
this->waitList.pTail = pReturn;
}
this->waitList.pHead = pReturn;
Unlock();
}
@ -296,6 +313,7 @@ namespace Aurora::Threading
template <typename T>
bool ProcessWaitContainer::IterateAll(T callback)
{
#if defined(WOA_ENABLE_OLD_SHORT_LIST)
for (AU_ITERATE_N(i, kDefaultWaitPerProcess))
{
auto &entry = this->entries[i];
@ -319,17 +337,25 @@ namespace Aurora::Threading
}
}
}
#endif
Lock();
for (auto &overflow : this->overflow)
{
AU_LOCK_GUARD(overflow->mutex);
if (!callback(*overflow.get()))
Lock();
auto pCurrentHead = this->waitList.pHead;
while (pCurrentHead)
{
return false;
AU_LOCK_GUARD(pCurrentHead->mutex);
if (!callback(pCurrentHead))
{
Unlock();
return false;
}
pCurrentHead = pCurrentHead->pNext;
}
Unlock();
}
Unlock();
return true;
}
@ -339,6 +365,7 @@ namespace Aurora::Threading
{
bool bRetStatus { true };
#if defined(WOA_ENABLE_OLD_SHORT_LIST)
for (AU_ITERATE_N(i, kDefaultWaitPerProcess))
{
auto &entry = this->entries[i];
@ -360,21 +387,118 @@ namespace Aurora::Threading
}
}
}
#endif
Lock();
for (auto &overflow : this->overflow)
{
AU_LOCK_GUARD(overflow->mutex);
if (!callback(*overflow))
Lock();
auto pCurrentHead = this->waitList.pHead;
while (pCurrentHead)
{
bRetStatus = false;
break;
AU_LOCK_GUARD(pCurrentHead->mutex);
if (!callback(*pCurrentHead))
{
bRetStatus = false;
break;
}
pCurrentHead = pCurrentHead->pNext;
}
Unlock();
}
#if defined(WOA_ENABLE_OLD_SHORT_LIST)
for (AU_ITERATE_N(i, kDefaultWaitPerProcess))
{
auto &entry = this->entries[i];
{
entry.uAtomic = 0;
}
}
#endif
return bRetStatus;
}
template <typename T>
bool ProcessWaitContainer::IterateWake(T callback)
{
bool bRetStatus { true };
Lock();
{
// FIFO
auto pCurrentHead = this->waitList.pTail;
decltype(pCurrentHead) pLast {};
while (pCurrentHead)
{
AU_LOCK_GUARD(pCurrentHead->mutex);
auto [bCont, bRemove] = callback(*pCurrentHead);
if (bRemove)
{
if (pLast)
{
pLast->pNext = pCurrentHead->pNext;
}
else if (this->waitList.pHead == pCurrentHead)
{
this->waitList.pHead = pCurrentHead->pNext;
}
if (pCurrentHead->pNext)
{
pCurrentHead->pNext->pBefore = pCurrentHead->pBefore;
}
if (this->waitList.pTail == pCurrentHead)
{
this->waitList.pTail = pLast;
}
}
if (!bCont)
{
bRetStatus = false;
break;
}
pLast = pCurrentHead;
pCurrentHead = pCurrentHead->pBefore;
}
}
Unlock();
// meh - just so i can experiment with changes
#if defined(WOA_ENABLE_OLD_SHORT_LIST)
for (AU_ITERATE_N(i, kDefaultWaitPerProcess))
{
auto &entry = this->entries[i];
{
DoSpinLockOnVar(&entry.uAtomic);
if (entry.pAddress)
{
AU_LOCK_GUARD(entry.mutex);
auto [bCont, bRemove] = callback(*entry);
if (!bCont)
{
for (AU_ITERATE_N(z, i + 1))
{
this->entries[z].uAtomic = 0;
}
return false;
}
}
}
}
for (AU_ITERATE_N(i, kDefaultWaitPerProcess))
{
auto &entry = this->entries[i];
@ -382,6 +506,7 @@ namespace Aurora::Threading
entry.uAtomic = 0;
}
}
#endif
return bRetStatus;
}
@ -399,17 +524,37 @@ namespace Aurora::Threading
void ProcessWaitContainer::Remove(WaitEntry *pParent)
{
Lock();
for (auto itr = this->overflow.begin();
itr != this->overflow.end();
)
{
if ((*itr) == pParent)
auto pCurrent = this->waitList.pHead;
decltype(pCurrent) pLast {};
while (pCurrent)
{
itr = this->overflow.erase(itr);
}
else
{
itr++;
if (pCurrent == pParent)
{
if (pLast)
{
pLast->pNext = pCurrent->pNext;
}
else if (this->waitList.pHead == pCurrent)
{
this->waitList.pHead = pCurrent->pNext;
}
if (pCurrent->pNext)
{
pCurrent->pNext->pBefore = pCurrent->pBefore;
}
if (this->waitList.pTail == pParent)
{
this->waitList.pTail = pLast;
}
break;
}
pLast = pCurrent;
pCurrent = pCurrent->pNext;
}
}
Unlock();
@ -451,7 +596,9 @@ namespace Aurora::Threading
state.compare = WaitBuffer::From(pCompareAddress, uWordSize);
state.qwNanoseconds = qwNanoseconds ? AuOptionalEx<AuUInt64> { qwNanoseconds.value() } : AuOptionalEx<AuUInt64> {}; // from default/zeroable optional, to boolean suffix
auto bResult = pWaitEntry->SleepOn(state);
#if defined(WOA_USE_DEFERRED_REL)
pWaitEntry->Release();
#endif
return bResult;
}
@ -747,19 +894,34 @@ namespace Aurora::Threading
}
else
{
#if defined(WOA_USE_DEFERRED_REL)
(void)gProcessWaitables.IterateForceNoCreateDuringOp([&](WaitEntry &entry) -> bool
#else
(void)gProcessWaitables.IterateWake([&](WaitEntry &entry) -> AuPair<bool, bool>
#endif
{
if (!uNMaximumThreads)
{
#if defined(WOA_USE_DEFERRED_REL)
return false;
#else
return AuMakePair(false, false);
#endif
}
bool bWake {};
if (entry.TryWakeNoLockNoReallyNoLock(pTargetAddress))
{
bWake = true;
uNMaximumThreads--;
}
return uNMaximumThreads != 0;
bool bCont = uNMaximumThreads != 0;
#if defined(WOA_USE_DEFERRED_REL)
return bCont;
#else
return AuMakePair(bCont, bWake);
#endif
});
}
}
@ -777,11 +939,31 @@ namespace Aurora::Threading
}
else
{
(void)gProcessWaitables.IterateForceNoCreateDuringOp([=](WaitEntry &entry) -> bool
#if defined(WOA_USE_DEFERRED_REL)
(void)gProcessWaitables.IterateForceNoCreateDuringOp([&](WaitEntry &entry) -> bool
#else
(void)gProcessWaitables.IterateWake([&](WaitEntry &entry) -> AuPair<bool, bool>
#endif
{
#if defined(WOA_USE_DEFERRED_REL)
entry.TryWakeNoLockNoReallyNoLock(pTargetAddress);
return true;
#else
return AuMakePair(true, entry.TryWakeNoLockNoReallyNoLock(pTargetAddress));
#endif
});
}
}
// Future (Reece): AuThread aware (safe force-terminate)
// There are three ways we can go about this:
// Shared pointers
// Shared pointers such that we don't need to remove the raw pointer optimization
// Callback on thread death
//
// 1st would increase overhead for a case i don't want to condone
// 2nd would work but would probably require a callback on death
// 3rd would work.
//
// to be addressed later
}

View File

@ -39,6 +39,9 @@ namespace Aurora::Threading
WaitEntry();
~WaitEntry();
WaitEntry *pNext {};
WaitEntry *pBefore {};
// synch
AuUInt32 uAtomic {}; // fastpath
Primitives::ConditionMutexImpl mutex; // mutex ctor must come before var
@ -50,6 +53,7 @@ namespace Aurora::Threading
// bookkeeping (parent container)
bool bOverflow {};
bool bReleaseOnWake {};
bool TryAcquire(const void *pAddress, AuUInt8 uSize);
void Release();
@ -60,11 +64,19 @@ namespace Aurora::Threading
bool TryWakeNoLockNoReallyNoLock(const void *pAddress);
};
struct ProcessListWait
{
WaitEntry *pHead {};
WaitEntry *pTail {};
};
struct ProcessWaitContainer
{
AuUInt32 uAtomic {};
#if defined(WOA_ENABLE_OLD_SHORT_LIST)
WaitEntry entries[kDefaultWaitPerProcess];
AuList<WaitEntry *> overflow;
#endif
ProcessListWait waitList;
WaitEntry *WaitBufferFrom(void *pAddress, AuUInt8 uSize);
@ -74,6 +86,9 @@ namespace Aurora::Threading
template <typename T>
bool IterateForceNoCreateDuringOp(T callback);
template <typename T>
bool IterateWake(T callback);
void Lock();
void Unlock();

View File

@ -73,8 +73,8 @@ namespace Aurora::Threading::Primitives
while (true)
{
auto uNow = this->wlist;
auto waiting = uNow >> 2u;
auto uNext = ((waiting + 1) << 2u) | (!bool(waiting)) | (uNow & 1);
auto waiting = uNow >> kShiftCountByBits;
auto uNext = ((waiting + 1) << kShiftCountByBits) | (!bool(waiting)) | (uNow & 1);
if (AuAtomicCompareExchange(&this->wlist, uNext, uNow) == uNow)
{
@ -104,7 +104,7 @@ namespace Aurora::Threading::Primitives
#endif
{
auto uNow = this->wlist;
auto uOld = (uNow >> 2u);
auto uOld = (uNow >> kShiftCountByBits);
if (uOld == 0)
{
@ -125,7 +125,7 @@ namespace Aurora::Threading::Primitives
// go for an atomic decrement while racing against ::Signal and ::Broadcast
auto waiting = uOld - 1u;
auto uNext = waiting << 2u;
auto uNext = waiting << kShiftCountByBits;
if (AuAtomicCompareExchange(&this->wlist, uNext, uNow) == uNow)
{
@ -158,8 +158,8 @@ namespace Aurora::Threading::Primitives
while (true)
{
auto uNow = this->wlist;
auto waiting = uNow >> 2u;
auto uNext = ((waiting + 1) << 2u) | (!bool(waiting)) | (uNow & 1);
auto waiting = uNow >> kShiftCountByBits;
auto uNext = ((waiting + 1) << kShiftCountByBits) | (!bool(waiting)) | (uNow & 1);
if (AuAtomicCompareExchange(&this->wlist, uNext, uNow) == uNow)
{
@ -239,9 +239,12 @@ namespace Aurora::Threading::Primitives
return false;
}
if (uSignalNext == 0)
if constexpr (kBoolRequiredLateSet)
{
InterlockedOr((volatile LONG*)&this->wlist, 1);
if (uSignalNext == 0)
{
InterlockedOr((volatile LONG *)&this->wlist, 1);
}
}
return true;
@ -254,7 +257,7 @@ namespace Aurora::Threading::Primitives
#if !defined(AURORA_FORCE_SRW_LOCKS)
auto original = this->wlist;
auto expected = original;
expected = expected >> 2;
expected = expected >> kShiftCountByBits;
if (expected)
{
@ -262,14 +265,14 @@ namespace Aurora::Threading::Primitives
while (expected)
{
if (AuAtomicCompareExchange(&this->wlist, ((expected - 1) << 2) /*intentional clear*/, original) == original)
if (AuAtomicCompareExchange(&this->wlist, ((expected - 1) << kShiftCountByBits) /*intentional clear*/, original) == original)
{
pNtReleaseKeyedEvent(gKeyedEventHandle, &this->wlist, FALSE, nullptr);
return;
}
original = this->wlist;
expected = original >> 2;
expected = original >> kShiftCountByBits;
}
}
#else
@ -282,7 +285,7 @@ namespace Aurora::Threading::Primitives
#if !defined(AURORA_FORCE_SRW_LOCKS)
auto original = this->wlist;
auto expected = original;
expected = expected >> 2;
expected = expected >> kShiftCountByBits;
auto uBroadcastIterations = expected;
@ -293,7 +296,7 @@ namespace Aurora::Threading::Primitives
while (expected && uBroadcastIterations)
{
bool bBreak {};
if (AuAtomicCompareExchange(&this->wlist, ((expected - 1) << 2) /*intentional clear*/, original) == original)
if (AuAtomicCompareExchange(&this->wlist, ((expected - 1) << kShiftCountByBits) /*intentional clear*/, original) == original)
{
pNtReleaseKeyedEvent(gKeyedEventHandle, &this->wlist, FALSE, nullptr);
@ -302,7 +305,7 @@ namespace Aurora::Threading::Primitives
}
original = this->wlist;
expected = original >> 2;
expected = original >> kShiftCountByBits;
if (bBreak)
{

View File

@ -34,5 +34,37 @@ namespace Aurora::Threading::Primitives
std::shared_ptr<Win32ConditionMutex> mutex_;
};
static const auto kBoolRequiredLateSet = true;
// Future (Reece): I got future plans
static const auto kShiftCountByBits = 8u;
// ...otherwise
// assume undefined behaviour past:
// * bit zero is used for atomic bit test and yield loops
// ( keyed events are an optimization mechanism for Windows XP's spinloop i had accidentally recreated in xenus. )
// ( originally, nt yielding sucked with the most barebones spinlock being a dumb hypervisor-unaware, smt-aware spinner. )
// ( keyed events would then go in these spinners to serve as an early futex as early back as the year 2000 (?). )
// ( that does, in fact, mean the free-toddlers crying about how 'windows stole muh kernels totally original idea' is entirely wrong at each sub-point. )
// ( though, keyed didn't see much use until Windows Vista's synch primitives were built on top of them. )
// ( infamously missing 100ns-scale timeouts and an inability to lock with a timeout. )
// ( raymond chen once claimed they didn't make it to xp because they weren't fast enough )
// ( raymond chen once claimed a "con" of keyedevents was that they were linear )
// ( problem is, as far as i can tell, they didn't really change. what's worse, WakeOnAddress (windows 8+)
// ( ...inherits the issue of not having relative/abs nanosecond scale timeouts AND the issue of the primitives sucking. )
// ( WakeOnAddress is nothing more than keyed events 2.0 - but with userland list keeping. )
// ( scratch the concept of how i implement WakeOnAddress with lists, how older nts waited with lists under lock, )
// ( they use hashmaps with "le meme lockless" interactions which are surely less expensive than reusing tls allocations ?! )
// ( whether or not its even faster is still up for debate. its just easier to use. )
// * bit one might be used under some niche versions of windows
// (hearsay paranoia)
// i actually have zero reason to believe windows ever implemented lock-awareness into the kernel
// i think it might be fine to skip the whole bit zero thing, but still, im going to say keep the min=2
// worst case scenario, we end up using these bits.
// ....
// =8 is future proof
// =2 is recommended
// =0 would require a bit of a rework. i think this is how other people use keyed events nowadays
}
#endif