diff --git a/Source/Threading/Primitives/AuConditionMutex.NT.cpp b/Source/Threading/Primitives/AuConditionMutex.NT.cpp index d3208f75..90ce1e74 100644 --- a/Source/Threading/Primitives/AuConditionMutex.NT.cpp +++ b/Source/Threading/Primitives/AuConditionMutex.NT.cpp @@ -118,12 +118,16 @@ namespace Aurora::Threading::Primitives auto &uValueRef = this->lock_.uWaitCount; - #if defined(AURORA_ARCH_X86) || defined(AURORA_ARCH_X64) - *(AuUInt8 *)&uValueRef = 0; + #if defined(AURORA_COMPILER_MSVC) + #if defined(AURORA_ARCH_X86) || defined(AURORA_ARCH_X64) + *(AuUInt8 *)&uValueRef = 0; + #else + InterlockedAndRelease((volatile LONG *)&uValueRef, ~0xFF); + #endif #else - InterlockedAndRelease((volatile LONG *)&uValueRef, ~0xFF); + __sync_lock_release((AuUInt8 *)&uValueRef); #endif - + while (true) { auto uOld = uValueRef; diff --git a/Source/Threading/Primitives/AuMutex.NT.cpp b/Source/Threading/Primitives/AuMutex.NT.cpp index 3cb1e9c4..c6814966 100644 --- a/Source/Threading/Primitives/AuMutex.NT.cpp +++ b/Source/Threading/Primitives/AuMutex.NT.cpp @@ -273,31 +273,35 @@ namespace Aurora::Threading::Primitives auto &uValueRef = this->state_; - #if defined(AURORA_ARCH_X86) || defined(AURORA_ARCH_X64) - // Intel 64 and IA - 32 Architectures Software Developer's Manual, Volume 3A: Section: 8.2.3.1 - *(AuUInt8 *)&uValueRef = 0; + #if defined(AURORA_COMPILER_MSVC) + #if defined(AURORA_ARCH_X86) || defined(AURORA_ARCH_X64) + // Intel 64 and IA - 32 Architectures Software Developer's Manual, Volume 3A: Section: 8.2.3.1 + *(AuUInt8 *)&uValueRef = 0; - // From this point onwards, our thread could be subject to StoreLoad re-ordering - // ...but it should not matter. - - // Given the memory model of x86[64], we can only really expect to be out of order during an unfenced load operation, which in this class, can only be expected under this function before the CAS. - // No other place reads. + // From this point onwards, our thread could be subject to StoreLoad re-ordering + // ...but it should not matter. + + // Given the memory model of x86[64], we can only really expect to be out of order during an unfenced load operation, which in this class, can only be expected under this function before the CAS. + // No other place reads. - // Re-ordering race condition 1: one thread wins an atomic bit set, that we dont catch until the CAS, resulting in: a slow implicit fence under the cas, a mm_pause stall, a compare, and a return - // alt: uValueRef reads zero, resulting in a preemptive return while no threads need to be awoken - // Re-ordering race condition 2: we unlock, multiple threads enter ::Lock(), we somehow read `uValue = uValueRef` as zero, and then the first atomic bitsetandtest winner thread signals the keyed mutex - // I fail to see how: - // *byte = 0; | | - // | interlocked atomicbitset | interlocked atomicbitset fail - // | [logic] | interlocked atomic set kFutexBitWait - // | *byte = 0; | yield - // | auto uValue =[acquire]= uValueRef - // ...would result in the second thread missing the third threads atomic set kFutexBitWait (cst (?) on the account of 8.2.3.1, 8.2.3.8, etc) + // Re-ordering race condition 1: one thread wins an atomic bit set, that we dont catch until the CAS, resulting in: a slow implicit fence under the cas, a mm_pause stall, a compare, and a return + // alt: uValueRef reads zero, resulting in a preemptive return while no threads need to be awoken + // Re-ordering race condition 2: we unlock, multiple threads enter ::Lock(), we somehow read `uValue = uValueRef` as zero, and then the first atomic bitsetandtest winner thread signals the keyed mutex + // I fail to see how: + // *byte = 0; | | + // | interlocked atomicbitset | interlocked atomicbitset fail + // | [logic] | interlocked atomic set kFutexBitWait + // | *byte = 0; | yield + // | auto uValue =[acquire]= uValueRef + // ...would result in the second thread missing the third threads atomic set kFutexBitWait (cst (?) on the account of 8.2.3.1, 8.2.3.8, etc) - // Also note: mfence is far too expensive and the _ReadWriteBarrier() intrinsics do absolutely nothing - _ReadWriteBarrier(); + // Also note: mfence is far too expensive and the _ReadWriteBarrier() intrinsics do absolutely nothing + _ReadWriteBarrier(); + #else + InterlockedAndRelease((volatile LONG *)&uValueRef, ~0xFF); + #endif #else - InterlockedAndRelease((volatile LONG *)&uValueRef, ~0xFF); + __sync_lock_release((AuUInt8 *)&uValueRef); // __atomic_store_explicit((AuUInt8 *)&uValueRef, 0, __ATOMIC_RELEASE) #endif while (true)