[*] Minor NT optimization: move branch
This commit is contained in:
parent
bdec6ff8ba
commit
c90a13ad95
@ -111,44 +111,32 @@ namespace Aurora::Threading::Primitives
|
||||
#else
|
||||
InterlockedAndRelease((volatile LONG *)&uValueRef, ~0xFF);
|
||||
#endif
|
||||
|
||||
if (gUseNativeWaitCondvar)
|
||||
|
||||
while (true)
|
||||
{
|
||||
while (true)
|
||||
auto uOld = uValueRef;
|
||||
auto uValue = uOld;
|
||||
|
||||
if (uValue & 1)
|
||||
{
|
||||
auto uValue = uValueRef;
|
||||
return;
|
||||
}
|
||||
|
||||
if (uValue < kFutexBitWait)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (uValue < kFutexBitWait)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (gUseNativeWaitCondvar)
|
||||
{
|
||||
if (AuAtomicCompareExchange(&uValueRef, uValue - kFutexBitWait, uValue) == uValue)
|
||||
{
|
||||
pWakeByAddressSingle((void *)&this->lock_.uWaitCount);
|
||||
pWakeByAddressSingle((void *)&uValueRef);
|
||||
return;
|
||||
}
|
||||
|
||||
SMPPause();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
while (true)
|
||||
else
|
||||
{
|
||||
auto uOld = uValueRef;
|
||||
auto uValue = uOld;
|
||||
|
||||
if (uValue & 1)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (uValue < kFutexBitWait)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (uValue & kFutexBitWake)
|
||||
{
|
||||
if (AuAtomicCompareExchange(&uValueRef, uValue, uValue) == uValue)
|
||||
@ -167,9 +155,9 @@ namespace Aurora::Threading::Primitives
|
||||
pNtReleaseKeyedEvent(gKeyedEventHandle, (void *)&uValueRef, 0, NULL);
|
||||
return;
|
||||
}
|
||||
|
||||
SMPPause();
|
||||
}
|
||||
|
||||
SMPPause();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -70,12 +70,12 @@ namespace Aurora::Threading::Primitives
|
||||
|
||||
// MutexImpl::LockMS: lock entry point taking a timeout in `uTimeout`.
// Returns true straight away when the initial non-blocking attempt succeeds; otherwise the
// timeout is passed through AuMSToNS<AuUInt64>() and the result of LockNS() is returned.
// NOTE(review): this span comes from a rendered diff hunk, so two variants of the fast-path
// check and two variants of the final return appear back to back. Presumably the first of each
// pair is the removed line and the second is its replacement -- confirm against the repository.
bool MutexImpl::LockMS(AuUInt64 uTimeout)
|
||||
{
|
||||
// Fast-path variant 1: atomic test-and-set on bit 0 of state_; a result of 0 is treated as success.
if (AuAtomicTestAndSet(&this->state_, 0) == 0)
|
||||
// Fast-path variant 2: the same early-out expressed through the TryLockNoSpin() member.
if (this->TryLockNoSpin())
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
// Slow-path variant 1: convert the timeout with AuMSToNS and defer to LockNS (unqualified call).
return LockNS(AuMSToNS<AuUInt64>(uTimeout));
|
||||
// Slow-path variant 2: identical call, spelled with an explicit this-> qualifier.
return this->LockNS(AuMSToNS<AuUInt64>(uTimeout));
|
||||
}
|
||||
|
||||
bool MutexImpl::LockNS(AuUInt64 uTimeout)
|
||||
@ -224,6 +224,8 @@ namespace Aurora::Threading::Primitives
|
||||
|
||||
void MutexImpl::Unlock()
|
||||
{
|
||||
#if defined(AURORA_FORCE_SRW_LOCKS)
|
||||
|
||||
if (gUseNativeWaitMutex)
|
||||
{
|
||||
auto &uValueRef = this->state_;
|
||||
@ -250,61 +252,69 @@ namespace Aurora::Threading::Primitives
|
||||
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
#if defined(AURORA_FORCE_SRW_LOCKS)
|
||||
::AcquireSRWLockExclusive(&this->atomicHolder_);
|
||||
this->state_ = 0;
|
||||
::ReleaseSRWLockExclusive(&this->atomicHolder_);
|
||||
::WakeAllConditionVariable(&this->wakeup_);
|
||||
#else
|
||||
|
||||
auto &uValueRef = this->state_;
|
||||
::AcquireSRWLockExclusive(&this->atomicHolder_);
|
||||
this->state_ = 0;
|
||||
::ReleaseSRWLockExclusive(&this->atomicHolder_);
|
||||
::WakeAllConditionVariable(&this->wakeup_);
|
||||
#else
|
||||
|
||||
#if defined(AURORA_ARCH_X86) || defined(AURORA_ARCH_X64)
|
||||
// Intel 64 and IA - 32 Architectures Software Developer's Manual, Volume 3A: Section: 8.2.3.1
|
||||
*(AuUInt8 *)&uValueRef = 0;
|
||||
auto &uValueRef = this->state_;
|
||||
|
||||
// From this point onwards, our thread could be subject to StoreLoad re-ordering
|
||||
// ...but it should not matter.
|
||||
#if defined(AURORA_ARCH_X86) || defined(AURORA_ARCH_X64)
|
||||
// Intel 64 and IA - 32 Architectures Software Developer's Manual, Volume 3A: Section: 8.2.3.1
|
||||
*(AuUInt8 *)&uValueRef = 0;
|
||||
|
||||
// From this point onwards, our thread could be subject to StoreLoad re-ordering
|
||||
// ...but it should not matter.
|
||||
|
||||
// Given the memory model of x86[64], we can only really expect to be out of order during an unfenced load operation, which in this class, can only be expected under this function before the CAS.
|
||||
// No other place reads.
|
||||
// Given the memory model of x86[64], we can only really expect to be out of order during an unfenced load operation, which in this class, can only be expected under this function before the CAS.
|
||||
// No other place reads.
|
||||
|
||||
// Re-ordering race condition 1: one thread wins an atomic bit set, that we dont catch until the CAS, resulting in: a slow implicit fence under the cas, a mm_pause stall, a compare, and a return
|
||||
// alt: uValueRef reads zero, resulting in a preemptive return while no threads need to be awoken
|
||||
// Re-ordering race condition 2: we unlock, multiple threads enter ::Lock(), we somehow read `uValue = uValueRef` as zero, and then the first atomic bitsetandtest winner thread signals the keyed mutex
|
||||
// I fail to see how:
|
||||
// *byte = 0; | |
|
||||
// | interlocked atomicbitset | interlocked atomicbitset fail
|
||||
// | [logic] | interlocked atomic set kFutexBitWait
|
||||
// | *byte = 0; | yield
|
||||
// | auto uValue =[acquire]= uValueRef
|
||||
// ...would result in the second thread missing the third threads atomic set kFutexBitWait (cst (?) on the account of 8.2.3.1, 8.2.3.8, etc)
|
||||
// Re-ordering race condition 1: one thread wins an atomic bit set, that we dont catch until the CAS, resulting in: a slow implicit fence under the cas, a mm_pause stall, a compare, and a return
|
||||
// alt: uValueRef reads zero, resulting in a preemptive return while no threads need to be awoken
|
||||
// Re-ordering race condition 2: we unlock, multiple threads enter ::Lock(), we somehow read `uValue = uValueRef` as zero, and then the first atomic bitsetandtest winner thread signals the keyed mutex
|
||||
// I fail to see how:
|
||||
// *byte = 0; | |
|
||||
// | interlocked atomicbitset | interlocked atomicbitset fail
|
||||
// | [logic] | interlocked atomic set kFutexBitWait
|
||||
// | *byte = 0; | yield
|
||||
// | auto uValue =[acquire]= uValueRef
|
||||
// ...would result in the second thread missing the third threads atomic set kFutexBitWait (cst (?) on the account of 8.2.3.1, 8.2.3.8, etc)
|
||||
|
||||
// Also note: mfence is far too expensive and the _ReadWriteBarrier() intrinsics do absolutely nothing
|
||||
#else
|
||||
InterlockedAndRelease((volatile LONG *)&uValueRef, ~0xFF);
|
||||
#endif
|
||||
// Also note: mfence is far too expensive and the _ReadWriteBarrier() intrinsics do absolutely nothing
|
||||
_ReadWriteBarrier();
|
||||
#else
|
||||
InterlockedAndRelease((volatile LONG *)&uValueRef, ~0xFF);
|
||||
#endif
|
||||
|
||||
while (true)
|
||||
while (true)
|
||||
{
|
||||
auto uValue = uValueRef;
|
||||
|
||||
if (uValue < kFutexBitWait)
|
||||
{
|
||||
auto uValue = uValueRef;
|
||||
return;
|
||||
}
|
||||
|
||||
//
|
||||
if (uValue < kFutexBitWait)
|
||||
// StoreLoad race-conditions here cannot result in a return
|
||||
// We should see StoreLoads of at least our *pByte = 0
|
||||
// or we should at least see the CST of kFutexBitWait being applied
|
||||
if (uValue & 1)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (gUseNativeWaitMutex)
|
||||
{
|
||||
if (AuAtomicCompareExchange(&uValueRef, uValue - kFutexBitWait, uValue) == uValue)
|
||||
{
|
||||
pWakeByAddressSingle((void *)&this->state_);
|
||||
return;
|
||||
}
|
||||
|
||||
// StoreLoad race-conditions here cannot result in a return
|
||||
// We should see StoreLoads of at least our *pByte = 0
|
||||
// or we should at least see the CST of kFutexBitWait being applied
|
||||
if (uValue & 1)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
if (uValue & kFutexBitWake)
|
||||
{
|
||||
// StoreLoad paranoia
|
||||
@ -324,12 +334,12 @@ namespace Aurora::Threading::Primitives
|
||||
pNtReleaseKeyedEvent(gKeyedEventHandle, (void *)&uValueRef, 0, NULL);
|
||||
return;
|
||||
}
|
||||
|
||||
SMPPause();
|
||||
}
|
||||
|
||||
#endif
|
||||
SMPPause();
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
AUKN_SYM IHyperWaitable *MutexNew()
|
||||
|
Loading…
Reference in New Issue
Block a user