[+] By-raw pointer WOA lists

(also they are now fairer)
[+] Steps towards future proofing NT (not the future proofing itself)
This commit is contained in:
Reece Wilson 2023-06-12 18:31:10 +01:00
parent 50413f36e5
commit 1a8acbdde5
4 changed files with 273 additions and 41 deletions

View File

@ -97,11 +97,13 @@ namespace Aurora::Threading
void WaitEntry::Release()
{
#if 0
if (this->bOverflow)
{
gProcessWaitables.Remove(this);
this->bOverflow = false;
}
#endif
AuResetMember(this->uSize);
AuResetMember(this->pAddress);
@ -273,6 +275,7 @@ namespace Aurora::Threading
WaitEntry *ProcessWaitContainer::WaitBufferFrom(void *pAddress, AuUInt8 uSize)
{
#if defined(WOA_ENABLE_OLD_SHORT_LIST)
for (AU_ITERATE_N(i, kDefaultWaitPerProcess))
{
if (this->entries[i].TryAcquire(pAddress, uSize))
@ -280,13 +283,27 @@ namespace Aurora::Threading
return &this->entries[i];
}
}
#endif
auto pReturn = &tlsWaitEntry;
pReturn->bReleaseOnWake = true;
pReturn->pAddress = pAddress;
pReturn->uSize = uSize;
pReturn->uAtomic = 0;
{
Lock();
this->overflow.push_back(pReturn);
pReturn->bOverflow = true;
if (auto pLoadFromMemory = this->waitList.pHead)
{
pReturn->pNext = pLoadFromMemory;
pLoadFromMemory->pBefore = pReturn;
}
else
{
this->waitList.pTail = pReturn;
}
this->waitList.pHead = pReturn;
Unlock();
}
@ -296,6 +313,7 @@ namespace Aurora::Threading
template <typename T>
bool ProcessWaitContainer::IterateAll(T callback)
{
#if defined(WOA_ENABLE_OLD_SHORT_LIST)
for (AU_ITERATE_N(i, kDefaultWaitPerProcess))
{
auto &entry = this->entries[i];
@ -319,17 +337,25 @@ namespace Aurora::Threading
}
}
}
#endif
Lock();
for (auto &overflow : this->overflow)
{
AU_LOCK_GUARD(overflow->mutex);
if (!callback(*overflow.get()))
Lock();
auto pCurrentHead = this->waitList.pHead;
while (pCurrentHead)
{
return false;
AU_LOCK_GUARD(pCurrentHead->mutex);
if (!callback(pCurrentHead))
{
Unlock();
return false;
}
pCurrentHead = pCurrentHead->pNext;
}
Unlock();
}
Unlock();
return true;
}
@ -339,6 +365,7 @@ namespace Aurora::Threading
{
bool bRetStatus { true };
#if defined(WOA_ENABLE_OLD_SHORT_LIST)
for (AU_ITERATE_N(i, kDefaultWaitPerProcess))
{
auto &entry = this->entries[i];
@ -360,21 +387,118 @@ namespace Aurora::Threading
}
}
}
#endif
Lock();
for (auto &overflow : this->overflow)
{
AU_LOCK_GUARD(overflow->mutex);
if (!callback(*overflow))
Lock();
auto pCurrentHead = this->waitList.pHead;
while (pCurrentHead)
{
bRetStatus = false;
break;
AU_LOCK_GUARD(pCurrentHead->mutex);
if (!callback(*pCurrentHead))
{
bRetStatus = false;
break;
}
pCurrentHead = pCurrentHead->pNext;
}
Unlock();
}
#if defined(WOA_ENABLE_OLD_SHORT_LIST)
for (AU_ITERATE_N(i, kDefaultWaitPerProcess))
{
auto &entry = this->entries[i];
{
entry.uAtomic = 0;
}
}
#endif
return bRetStatus;
}
template <typename T>
bool ProcessWaitContainer::IterateWake(T callback)
{
bool bRetStatus { true };
Lock();
{
// FIFO
auto pCurrentHead = this->waitList.pTail;
decltype(pCurrentHead) pLast {};
while (pCurrentHead)
{
AU_LOCK_GUARD(pCurrentHead->mutex);
auto [bCont, bRemove] = callback(*pCurrentHead);
if (bRemove)
{
if (pLast)
{
pLast->pNext = pCurrentHead->pNext;
}
else if (this->waitList.pHead == pCurrentHead)
{
this->waitList.pHead = pCurrentHead->pNext;
}
if (pCurrentHead->pNext)
{
pCurrentHead->pNext->pBefore = pCurrentHead->pBefore;
}
if (this->waitList.pTail == pCurrentHead)
{
this->waitList.pTail = pLast;
}
}
if (!bCont)
{
bRetStatus = false;
break;
}
pLast = pCurrentHead;
pCurrentHead = pCurrentHead->pBefore;
}
}
Unlock();
// meh - just so i can experiment with changes
#if defined(WOA_ENABLE_OLD_SHORT_LIST)
for (AU_ITERATE_N(i, kDefaultWaitPerProcess))
{
auto &entry = this->entries[i];
{
DoSpinLockOnVar(&entry.uAtomic);
if (entry.pAddress)
{
AU_LOCK_GUARD(entry.mutex);
auto [bCont, bRemove] = callback(*entry);
if (!bCont)
{
for (AU_ITERATE_N(z, i + 1))
{
this->entries[z].uAtomic = 0;
}
return false;
}
}
}
}
for (AU_ITERATE_N(i, kDefaultWaitPerProcess))
{
auto &entry = this->entries[i];
@ -382,6 +506,7 @@ namespace Aurora::Threading
entry.uAtomic = 0;
}
}
#endif
return bRetStatus;
}
@ -399,17 +524,37 @@ namespace Aurora::Threading
void ProcessWaitContainer::Remove(WaitEntry *pParent)
{
Lock();
for (auto itr = this->overflow.begin();
itr != this->overflow.end();
)
{
if ((*itr) == pParent)
auto pCurrent = this->waitList.pHead;
decltype(pCurrent) pLast {};
while (pCurrent)
{
itr = this->overflow.erase(itr);
}
else
{
itr++;
if (pCurrent == pParent)
{
if (pLast)
{
pLast->pNext = pCurrent->pNext;
}
else if (this->waitList.pHead == pCurrent)
{
this->waitList.pHead = pCurrent->pNext;
}
if (pCurrent->pNext)
{
pCurrent->pNext->pBefore = pCurrent->pBefore;
}
if (this->waitList.pTail == pParent)
{
this->waitList.pTail = pLast;
}
break;
}
pLast = pCurrent;
pCurrent = pCurrent->pNext;
}
}
Unlock();
@ -451,7 +596,9 @@ namespace Aurora::Threading
state.compare = WaitBuffer::From(pCompareAddress, uWordSize);
state.qwNanoseconds = qwNanoseconds ? AuOptionalEx<AuUInt64> { qwNanoseconds.value() } : AuOptionalEx<AuUInt64> {}; // from default/zeroable optional, to boolean suffix
auto bResult = pWaitEntry->SleepOn(state);
#if defined(WOA_USE_DEFERRED_REL)
pWaitEntry->Release();
#endif
return bResult;
}
@ -747,19 +894,34 @@ namespace Aurora::Threading
}
else
{
#if defined(WOA_USE_DEFERRED_REL)
(void)gProcessWaitables.IterateForceNoCreateDuringOp([&](WaitEntry &entry) -> bool
#else
(void)gProcessWaitables.IterateWake([&](WaitEntry &entry) -> AuPair<bool, bool>
#endif
{
if (!uNMaximumThreads)
{
#if defined(WOA_USE_DEFERRED_REL)
return false;
#else
return AuMakePair(false, false);
#endif
}
bool bWake {};
if (entry.TryWakeNoLockNoReallyNoLock(pTargetAddress))
{
bWake = true;
uNMaximumThreads--;
}
return uNMaximumThreads != 0;
bool bCont = uNMaximumThreads != 0;
#if defined(WOA_USE_DEFERRED_REL)
return bCont;
#else
return AuMakePair(bCont, bWake);
#endif
});
}
}
@ -777,11 +939,31 @@ namespace Aurora::Threading
}
else
{
(void)gProcessWaitables.IterateForceNoCreateDuringOp([=](WaitEntry &entry) -> bool
#if defined(WOA_USE_DEFERRED_REL)
(void)gProcessWaitables.IterateForceNoCreateDuringOp([&](WaitEntry &entry) -> bool
#else
(void)gProcessWaitables.IterateWake([&](WaitEntry &entry) -> AuPair<bool, bool>
#endif
{
#if defined(WOA_USE_DEFERRED_REL)
entry.TryWakeNoLockNoReallyNoLock(pTargetAddress);
return true;
#else
return AuMakePair(true, entry.TryWakeNoLockNoReallyNoLock(pTargetAddress));
#endif
});
}
}
// Future (Reece): AuThread aware (safe force-terminate)
// There are three ways we can go about this:
// Shared pointers
// Shared pointers such that we don't need to remove the raw pointer optimization
// Callback on thread death
//
// 1st would increase overhead for a case i don't want to condone
// 2nd would work but would probably require a callback on death
// 3rd would work.
//
// to be addressed later
}

View File

@ -39,6 +39,9 @@ namespace Aurora::Threading
WaitEntry();
~WaitEntry();
WaitEntry *pNext {};
WaitEntry *pBefore {};
// synch
AuUInt32 uAtomic {}; // fastpath
Primitives::ConditionMutexImpl mutex; // mutex ctor must come before var
@ -50,6 +53,7 @@ namespace Aurora::Threading
// bookkeeping (parent container)
bool bOverflow {};
bool bReleaseOnWake {};
bool TryAcquire(const void *pAddress, AuUInt8 uSize);
void Release();
@ -60,11 +64,19 @@ namespace Aurora::Threading
bool TryWakeNoLockNoReallyNoLock(const void *pAddress);
};
struct ProcessListWait
{
WaitEntry *pHead {};
WaitEntry *pTail {};
};
struct ProcessWaitContainer
{
AuUInt32 uAtomic {};
#if defined(WOA_ENABLE_OLD_SHORT_LIST)
WaitEntry entries[kDefaultWaitPerProcess];
AuList<WaitEntry *> overflow;
#endif
ProcessListWait waitList;
WaitEntry *WaitBufferFrom(void *pAddress, AuUInt8 uSize);
@ -74,6 +86,9 @@ namespace Aurora::Threading
template <typename T>
bool IterateForceNoCreateDuringOp(T callback);
template <typename T>
bool IterateWake(T callback);
void Lock();
void Unlock();

View File

@ -73,8 +73,8 @@ namespace Aurora::Threading::Primitives
while (true)
{
auto uNow = this->wlist;
auto waiting = uNow >> 2u;
auto uNext = ((waiting + 1) << 2u) | (!bool(waiting)) | (uNow & 1);
auto waiting = uNow >> kShiftCountByBits;
auto uNext = ((waiting + 1) << kShiftCountByBits) | (!bool(waiting)) | (uNow & 1);
if (AuAtomicCompareExchange(&this->wlist, uNext, uNow) == uNow)
{
@ -104,7 +104,7 @@ namespace Aurora::Threading::Primitives
#endif
{
auto uNow = this->wlist;
auto uOld = (uNow >> 2u);
auto uOld = (uNow >> kShiftCountByBits);
if (uOld == 0)
{
@ -125,7 +125,7 @@ namespace Aurora::Threading::Primitives
// go for an atomic decrement while racing against ::Signal and ::Broadcast
auto waiting = uOld - 1u;
auto uNext = waiting << 2u;
auto uNext = waiting << kShiftCountByBits;
if (AuAtomicCompareExchange(&this->wlist, uNext, uNow) == uNow)
{
@ -158,8 +158,8 @@ namespace Aurora::Threading::Primitives
while (true)
{
auto uNow = this->wlist;
auto waiting = uNow >> 2u;
auto uNext = ((waiting + 1) << 2u) | (!bool(waiting)) | (uNow & 1);
auto waiting = uNow >> kShiftCountByBits;
auto uNext = ((waiting + 1) << kShiftCountByBits) | (!bool(waiting)) | (uNow & 1);
if (AuAtomicCompareExchange(&this->wlist, uNext, uNow) == uNow)
{
@ -239,9 +239,12 @@ namespace Aurora::Threading::Primitives
return false;
}
if (uSignalNext == 0)
if constexpr (kBoolRequiredLateSet)
{
InterlockedOr((volatile LONG*)&this->wlist, 1);
if (uSignalNext == 0)
{
InterlockedOr((volatile LONG *)&this->wlist, 1);
}
}
return true;
@ -254,7 +257,7 @@ namespace Aurora::Threading::Primitives
#if !defined(AURORA_FORCE_SRW_LOCKS)
auto original = this->wlist;
auto expected = original;
expected = expected >> 2;
expected = expected >> kShiftCountByBits;
if (expected)
{
@ -262,14 +265,14 @@ namespace Aurora::Threading::Primitives
while (expected)
{
if (AuAtomicCompareExchange(&this->wlist, ((expected - 1) << 2) /*intentional clear*/, original) == original)
if (AuAtomicCompareExchange(&this->wlist, ((expected - 1) << kShiftCountByBits) /*intentional clear*/, original) == original)
{
pNtReleaseKeyedEvent(gKeyedEventHandle, &this->wlist, FALSE, nullptr);
return;
}
original = this->wlist;
expected = original >> 2;
expected = original >> kShiftCountByBits;
}
}
#else
@ -282,7 +285,7 @@ namespace Aurora::Threading::Primitives
#if !defined(AURORA_FORCE_SRW_LOCKS)
auto original = this->wlist;
auto expected = original;
expected = expected >> 2;
expected = expected >> kShiftCountByBits;
auto uBroadcastIterations = expected;
@ -293,7 +296,7 @@ namespace Aurora::Threading::Primitives
while (expected && uBroadcastIterations)
{
bool bBreak {};
if (AuAtomicCompareExchange(&this->wlist, ((expected - 1) << 2) /*intentional clear*/, original) == original)
if (AuAtomicCompareExchange(&this->wlist, ((expected - 1) << kShiftCountByBits) /*intentional clear*/, original) == original)
{
pNtReleaseKeyedEvent(gKeyedEventHandle, &this->wlist, FALSE, nullptr);
@ -302,7 +305,7 @@ namespace Aurora::Threading::Primitives
}
original = this->wlist;
expected = original >> 2;
expected = original >> kShiftCountByBits;
if (bBreak)
{

View File

@ -34,5 +34,37 @@ namespace Aurora::Threading::Primitives
std::shared_ptr<Win32ConditionMutex> mutex_;
};
static const auto kBoolRequiredLateSet = true;
// Future (Reece): I got future plans
static const auto kShiftCountByBits = 8u;
// ...otherwise
// assume undefined behaviour past:
// * bit zero is used for atomic bit test and yield loops
// ( keyed events are an optimization mechanism for Windows XP's spinloop i had accidentally recreated in xenus. )
// ( originally, nt yielding sucked with the most barebones spinlock being a dumb hypervisor-unaware, smt-aware spinner. )
// ( keyed events would then go in these spinners to serve as an early futex as early back as the year 2000 (?). )
// ( that does, in fact, mean the free-toddlers crying about how 'windows stole muh kernels totally original idea' is entirely wrong at each sub-point. )
// ( though, keyed didn't see much use until Windows Vista's synch primitives were built on top of them. )
// ( infamously missing 100ns-scale timeouts and an inability to lock with a timeout. )
// ( raymond chen once claimed they didn't make it to xp because they weren't fast enough )
// ( raymond chen once claimed a "con" of keyedevents was that they were linear )
// ( problem is, as far as i can tell, they didn't really change. what's worse, WakeOnAddress (windows 8+)
// ( ...inherits the issue of not having relative/abs nanosecond scale timeouts AND the issue of the primitives sucking. )
// ( WakeOnAddress is nothing more than keyed events 2.0 - but with userland list keeping. )
// ( scratch the concept of how i implement WakeOnAddress with lists, how older nts waited with lists under lock, )
// ( they use hashmaps with "le meme lockless" interactions which are surely less expensive than reusing tls allocations ?! )
// ( whether or not its even faster is still up for debate. its just easier to use. )
// * bit one might be used under some niche versions of windows
// (hearsay paranoia)
// i actually have zero reason to believe windows ever implemented lock-awareness into the kernel
// i think it might be fine to skip the whole bit zero thing, but still, im going to say keep the min=2
// worst case scenario, we end up using these bits.
// ....
// =8 is future proof
// =2 is recommended
// =0 would require a bit of a rework. i think this is how other people use keyed events nowadays
}
#endif