[*] Trivial XP+ semaphore optimization for high wake counts

2023-10-12 16:58:08 +01:00 · 2023-10-12 16:58:08 +01:00 · 23ddcf9ba1
commit 23ddcf9ba1
parent e3ba8cf1c5
3 changed files with 89 additions and 18 deletions
--- a/Source/Threading/Primitives/AuConditionVariable.NT.cpp
+++ b/Source/Threading/Primitives/AuConditionVariable.NT.cpp
@ -425,6 +425,82 @@ namespace Aurora::Threading::Primitives
    #endif
    }

+    void ConditionVariableNT::BroadcastN(AuUInt32 nBroadcast)
+    {
+    #if !defined(AURORA_FORCE_SRW_LOCKS)
+        // Wake at most nBroadcast entries of the count kept in wlist above kShiftCountByBits (presumably the number of pending waiters — confirm against the wait path).
+        if (gUseNativeWaitCondvar)
+        {
+            auto original = this->wlist;
+            auto expected = original;
+            expected      = AuMin(nBroadcast, expected >> kShiftCountByBits);
+            // Nothing to do when nBroadcast is 0 or the shifted wlist count is 0.
+            if (!expected)
+            {
+                return;
+            }
+            // The clamped wake count is added to signalCount once, before wlist is updated; it is not adjusted again if the CAS below has to retry.
+            AuAtomicAdd(&this->signalCount, expected);
+            // uAwoken starts equal to expected, so uCount evaluates to 0 on the first CAS attempt.
+            auto uAwoken = expected;
+            // NOTE(review): the CAS stores uCount without shifting by kShiftCountByBits, and InternalLTSWakeCount receives uCount rather than uAwoken (0 on the first pass) — confirm this is intended.
+            while (true)
+            {
+                auto uCount = expected - uAwoken;
+                // Presumably (target, new value, comparand) returning the previous value; equality with original is treated as success.
+                if (AuAtomicCompareExchange(&this->wlist,
+                                            uCount,
+                                            original) == original)
+                {
+                    InternalLTSWakeCount((void *)&this->wlist, uCount);
+                    return;
+                }
+                else
+                {
+                    original = this->wlist; // wlist changed underneath us: reload and retry
+                    expected = original >> kShiftCountByBits; // now the full shifted count, no longer clamped by nBroadcast
+                    uAwoken = AuMin<AuUInt32>(uAwoken, expected); // never exceed what wlist currently counts
+                }
+            }
+        }
+        else
+        {
+            auto original = this->wlist;
+            auto expected = original;
+            expected = expected >> kShiftCountByBits;
+            // Keyed-event path: one release per successful decrement of the wlist count, capped at nBroadcast.
+            auto uBroadcastIterations = AuMin(nBroadcast, expected);
+
+            while (expected && uBroadcastIterations)
+            {
+                AuAtomicAdd(&this->signalCount, 1u); // bumped once per outer pass, ahead of the release attempt
+                // Retry the decrement until it succeeds or the reloaded count reaches 0.
+                while (expected && uBroadcastIterations)
+                {
+                    bool bBreak {}; // set once this pass has released one waiter
+                    if (AuAtomicCompareExchange(&this->wlist, ((expected - 1) << kShiftCountByBits)  /*intentional clear*/, original) == original)
+                    {
+                        pNtReleaseKeyedEvent(gKeyedEventHandle, (void *)&this->wlist, FALSE, nullptr);
+                        // The address of wlist is presumably the keyed-event key the waiter blocks on — confirm against the wait path.
+                        uBroadcastIterations--;
+                        bBreak = true;
+                    }
+                    // Reload whether or not the CAS succeeded so both loop conditions see the current wlist value.
+                    original = this->wlist;
+                    expected = original >> kShiftCountByBits;
+
+                    if (bBreak)
+                    {
+                        break;
+                    }
+                }
+            }
+        }
+    #else
+        ::WakeAllConditionVariable(&this->winCond_); // SRW build: nBroadcast is ignored and every waiter on winCond_ is woken
+    #endif
+    }
+
    AUKN_SYM IConditionVariable *ConditionVariableNew(const AuSPtr<IConditionMutex> &pMutex)
    {
        return _new ConditionVariableImpl(pMutex);
--- a/Source/Threading/Primitives/AuConditionVariable.NT.hpp
+++ b/Source/Threading/Primitives/AuConditionVariable.NT.hpp
@ -19,6 +19,7 @@ namespace Aurora::Threading::Primitives
        bool WaitForSignalNsEx(Win32ConditionMutex *pMutex, AuUInt64 timeout);
        void Signal();
        void Broadcast();
+        void BroadcastN(AuUInt32 nBroadcast); // wake up to nBroadcast waiters (every waiter when AURORA_FORCE_SRW_LOCKS is defined)

        auline bool CheckOut();

--- a/Source/Threading/Primitives/AuSemaphore.NT.cpp
+++ b/Source/Threading/Primitives/AuSemaphore.NT.cpp
@ -214,6 +214,11 @@ namespace Aurora::Threading::Primitives
    
    void SemaphoreImpl::Unlock(AuUInt16 uCount)
    {
+        if (uCount == 0)
+        {
+            return;
+        }
+
        if (gUseNativeWaitSemapahore)
        {
            AuAtomicAdd<AuUInt32>(&this->dwState_, uCount);
@ -236,9 +241,12 @@ namespace Aurora::Threading::Primitives
            // we would have to expand our already oversized by 8, 24-byte x86_64 semaphore for a trivial perf boost
            // we cant efficiently access the conditions state or atomic guarantees...
            
-            this->mutex.Lock(); // do not [re]move this lock fence
            AuAtomicAdd<AuUInt32>(&this->dwState_, uCount); // this could be moved anywhere above the unlock, including above the lock.
+
+            {
+                this->mutex.Lock(); // do not [re]move this lock fence
                this->mutex.Unlock();
+            }
            
            if (uCount == 1)
            {
@ -246,21 +254,7 @@ namespace Aurora::Threading::Primitives
            }
            else
            {
-                if (uCount >= 3) // ...this is the only optimization we can hope to achieve
-                {
-                    // we can always save a few cycles by doing an atomic broadcast on a contended semaphore
-                    // waking up the wrong amount of threads probably doesn't matter at this point, on these target platforms
-                    this->var.Broadcast();
-                }
-                else // ...otherwise, do the handshake just a few times
-                {
-                    // doing the condvar handshake for the exact amount of threads you need, once the cond is contended, can pay off
-                    for (AU_ITERATE_N(i, uCount))
-                    {
-                        (void)i;
-                        this->var.Signal();
-                    }
-                }
+                this->var.BroadcastN(uCount);
            }
        }
    }