diff --git a/Source/Threading/Primitives/AuMutex.NT.cpp b/Source/Threading/Primitives/AuMutex.NT.cpp index 093d8583..f9541693 100644 --- a/Source/Threading/Primitives/AuMutex.NT.cpp +++ b/Source/Threading/Primitives/AuMutex.NT.cpp @@ -1,4 +1,4 @@ -/*** +/*** Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved. File: AuMutex.NT.cpp @@ -245,22 +245,44 @@ namespace Aurora::Threading::Primitives auto &uValueRef = this->state_; #if defined(AURORA_ARCH_X86) || defined(AURORA_ARCH_X64) - // Intel 64 and IA - 32 Architectures Software Developer's Manual, Volume 3A - // Section: 9.1.1 + // Intel 64 and IA - 32 Architectures Software Developer's Manual, Volume 3A: Section: 9.1.1 *(AuUInt8 *)&uValueRef = 0; + + // From this point onwards, our thread could be subject to StoreLoad re-ordering + // ...but it should not matter. + + // Given the memory model of x86[64], we can only really expect to be out of order during an unfenced load operation, which in this class, can only be expected under this function before the CAS. + // No other place reads. + + // Re-ordering race condition 1: one thread wins an atomic bit set, that we dont catch until the CAS, resulting in: a slow implicit fence under the cas, a mm_pause stall, a compare, and a return + // alt: uValueRef reads zero, resulting in a preemptive return while no threads need to be awoken + // Re-ordering race condition 2: we unlock, multiple threads enter ::Lock(), we somehow read `uValue = uValueRef` as zero, and then the first atomic bitsetandtest winner thread signals the keyed mutex + // I fail to see how: + // *byte = 0; | | + // | interlocked atomicbitset | interlocked atomicbitset fail + // | [logic] | interlocked atomic set kFutexBitWait + // | *byte = 0; | yield + // | auto uValue =[acquire]= uValueRef + // ...would result in the second thread missing the third threads atomic set kFutexBitWait (cst (?) on the account of 8.2.3.1, paragraph 3) + + // Also note: mfence is far too expensive and the _ReadWriteBarrier() intrinsics do absolutely nothing #else - AuAtomicAnd(&uValueRef, ~0xFFu); + InterlockedAndRelease((volatile LONG *)&uValueRef, ~0xFF); #endif - + while (true) { auto uValue = uValueRef; + // if (uValue < kFutexBitWait) { return; } + // StoreLoad race-conditions here cannot result in a return + // We should see StoreLoads of at least our *pByte = 0 + // or we should at least see the CST of kFutexBitWait being applied if (uValue & 1) { return; @@ -268,7 +290,16 @@ namespace Aurora::Threading::Primitives if (uValue & kFutexBitWake) { - return; + // StoreLoad paranoia + if (AuAtomicCompareExchange(&uValueRef, uValue, uValue) == uValue) + { + return; + } + else + { + SMPPause(); + continue; + } } if (AuAtomicCompareExchange(&uValueRef, uValue - kFutexBitWait + kFutexBitWake, uValue) == uValue)