[*] Minor NT optimization: move branch

2023-07-10 18:51:28 +01:00 · 2023-07-10 18:51:28 +01:00 · c90a13ad95
commit c90a13ad95
parent bdec6ff8ba
2 changed files with 76 additions and 78 deletions
--- a/Source/Threading/Primitives/AuConditionMutex.NT.cpp
+++ b/Source/Threading/Primitives/AuConditionMutex.NT.cpp
@ -111,44 +111,32 @@ namespace Aurora::Threading::Primitives
    #else
        InterlockedAndRelease((volatile LONG *)&uValueRef, ~0xFF);
    #endif
-
-        if (gUseNativeWaitCondvar)
+    
+        while (true)
        {
-            while (true)
+            auto uOld = uValueRef;
+            auto uValue = uOld;
+
+            if (uValue & 1)
            {
-                auto uValue = uValueRef;
+                return;
+            }

-                if (uValue < kFutexBitWait)
-                {
-                    return;
-                }
+            if (uValue < kFutexBitWait)
+            {
+                return;
+            }

+            if (gUseNativeWaitCondvar)
+            {
                if (AuAtomicCompareExchange(&uValueRef, uValue - kFutexBitWait, uValue) == uValue)
                {
-                    pWakeByAddressSingle((void *)&this->lock_.uWaitCount);
+                    pWakeByAddressSingle((void *)&uValueRef);
                    return;
                }
-
-                SMPPause();
            }
-        }
-        else
-        {
-            while (true)
+            else
            {
-                auto uOld = uValueRef;
-                auto uValue = uOld;
-
-                if (uValue & 1)
-                {
-                    return;
-                }
-
-                if (uValue < kFutexBitWait)
-                {
-                    return;
-                }
-
                if (uValue & kFutexBitWake)
                {
                    if (AuAtomicCompareExchange(&uValueRef, uValue, uValue) == uValue)
@ -167,9 +155,9 @@ namespace Aurora::Threading::Primitives
                    pNtReleaseKeyedEvent(gKeyedEventHandle, (void *)&uValueRef, 0, NULL);
                    return;
                }
-
-                SMPPause();
            }
+  
+            SMPPause();
        }
    #endif
    }
--- a/Source/Threading/Primitives/AuMutex.NT.cpp
+++ b/Source/Threading/Primitives/AuMutex.NT.cpp
@ -70,12 +70,12 @@ namespace Aurora::Threading::Primitives

    bool MutexImpl::LockMS(AuUInt64 uTimeout)
    {
-        if (AuAtomicTestAndSet(&this->state_, 0) == 0)
+        if (this->TryLockNoSpin())
        {
            return true;
        }

-        return LockNS(AuMSToNS<AuUInt64>(uTimeout));
+        return this->LockNS(AuMSToNS<AuUInt64>(uTimeout));
    }

    bool MutexImpl::LockNS(AuUInt64 uTimeout)
@ -224,6 +224,8 @@ namespace Aurora::Threading::Primitives

    void MutexImpl::Unlock()
    {
+    #if defined(AURORA_FORCE_SRW_LOCKS)
+
        if (gUseNativeWaitMutex)
        {
            auto &uValueRef = this->state_;
@ -250,61 +252,69 @@ namespace Aurora::Threading::Primitives

            return;
        }
-        else
-        {
-        #if defined(AURORA_FORCE_SRW_LOCKS)
-            ::AcquireSRWLockExclusive(&this->atomicHolder_);
-            this->state_ = 0;
-            ::ReleaseSRWLockExclusive(&this->atomicHolder_);
-            ::WakeAllConditionVariable(&this->wakeup_);
-        #else

-            auto &uValueRef = this->state_;
+        ::AcquireSRWLockExclusive(&this->atomicHolder_);
+        this->state_ = 0;
+        ::ReleaseSRWLockExclusive(&this->atomicHolder_);
+        ::WakeAllConditionVariable(&this->wakeup_);
+    #else

-        #if defined(AURORA_ARCH_X86) || defined(AURORA_ARCH_X64)
-            //  Intel 64 and IA - 32 Architectures Software Developer's Manual, Volume 3A: Section: 8.2.3.1
-            *(AuUInt8 *)&uValueRef = 0;
+        auto &uValueRef = this->state_;

-            // From this point onwards, our thread could be subject to StoreLoad re-ordering
-            // ...but it should not matter.
+    #if defined(AURORA_ARCH_X86) || defined(AURORA_ARCH_X64)
+        //  Intel 64 and IA - 32 Architectures Software Developer's Manual, Volume 3A: Section: 8.2.3.1
+        *(AuUInt8 *)&uValueRef = 0;
+
+        // From this point onwards, our thread could be subject to StoreLoad re-ordering
+        // ...but it should not matter.
            
-            // Given the memory model of x86[64], we can only really expect to be out of order during an unfenced load operation, which in this class, can only be expected under this function before the CAS.
-            // No other place reads.
+        // Given the memory model of x86[64], we can only really expect to be out of order during an unfenced load operation, which in this class, can only be expected under this function before the CAS.
+        // No other place reads.

-            // Re-ordering race condition 1: one thread wins an atomic bit set, that we dont catch until the CAS, resulting in: a slow implicit fence under the cas, a mm_pause stall, a compare, and a return
-            //                                                             alt: uValueRef reads zero, resulting in a preemptive return while no threads need to be awoken
-            // Re-ordering race condition 2: we unlock, multiple threads enter ::Lock(), we somehow read `uValue = uValueRef` as zero, and then the first atomic bitsetandtest winner thread signals the keyed mutex
-            // I fail to see how:
-            // *byte = 0; |                                    |
-            //            | interlocked atomicbitset           | interlocked atomicbitset fail 
-            //            | [logic]                            | interlocked atomic set kFutexBitWait
-            //            | *byte = 0;                         | yield
-            //            | auto uValue =[acquire]= uValueRef 
-            // ...would result in the second thread missing the third threads atomic set kFutexBitWait (cst (?) on the account of 8.2.3.1, 8.2.3.8, etc)
+        // Re-ordering race condition 1: one thread wins an atomic bit set, that we don't catch until the CAS, resulting in: a slow implicit fence under the cas, a mm_pause stall, a compare, and a return
+        //                                                             alt: uValueRef reads zero, resulting in a preemptive return while no threads need to be awoken
+        // Re-ordering race condition 2: we unlock, multiple threads enter ::Lock(), we somehow read `uValue = uValueRef` as zero, and then the first atomic bitsetandtest winner thread signals the keyed mutex
+        // I fail to see how:
+        // *byte = 0; |                                    |
+        //            | interlocked atomicbitset           | interlocked atomicbitset fail 
+        //            | [logic]                            | interlocked atomic set kFutexBitWait
+        //            | *byte = 0;                         | yield
+        //            | auto uValue =[acquire]= uValueRef 
+        // ...would result in the second thread missing the third thread's atomic set kFutexBitWait (cst (?) on account of 8.2.3.1, 8.2.3.8, etc)

-            // Also note: mfence is far too expensive and the _ReadWriteBarrier() intrinsics do absolutely nothing
-        #else
-            InterlockedAndRelease((volatile LONG *)&uValueRef, ~0xFF);
-        #endif
+        // Also note: mfence is far too expensive, and the _ReadWriteBarrier() intrinsic emits no CPU fence (it only constrains compiler reordering)
+        _ReadWriteBarrier();
+    #else
+        InterlockedAndRelease((volatile LONG *)&uValueRef, ~0xFF);
+    #endif

-            while (true)
+        while (true)
+        {
+            auto uValue = uValueRef;
+
+            if (uValue < kFutexBitWait)
            {
-                auto uValue = uValueRef;
+                return;
+            }

-                //
-                if (uValue < kFutexBitWait)
+            // StoreLoad race-conditions here cannot result in a return
+            // We should see StoreLoads of at least our *pByte = 0
+            // or we should at least see the CST of kFutexBitWait being applied
+            if (uValue & 1)
+            {
+                return;
+            }
+
+            if (gUseNativeWaitMutex)
+            {
+                if (AuAtomicCompareExchange(&uValueRef, uValue - kFutexBitWait, uValue) == uValue)
                {
+                    pWakeByAddressSingle((void *)&this->state_);
                    return;
                }
-
-                // StoreLoad race-conditions here cannot result in a return
-                // We should see StoreLoads of at least our *pByte = 0
-                // or we should at least see the CST of kFutexBitWait being applied
-                if (uValue & 1)
-                {
-                    return;
-                }
-
+            }
+            else
+            {
                if (uValue & kFutexBitWake)
                {
                    // StoreLoad paranoia
@ -324,12 +334,12 @@ namespace Aurora::Threading::Primitives
                    pNtReleaseKeyedEvent(gKeyedEventHandle, (void *)&uValueRef, 0, NULL);
                    return;
                }
-
-                SMPPause();
            }

-        #endif
+            SMPPause();
        }
+
+    #endif
    }
    
    AUKN_SYM IHyperWaitable *MutexNew()