[*] Trivial XP+ semaphore optimization for high wake counts
This commit is contained in:
parent
e3ba8cf1c5
commit
23ddcf9ba1
@ -425,6 +425,82 @@ namespace Aurora::Threading::Primitives
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
 * Releases up to nBroadcast of the waiters currently counted in this->wlist.
 *
 * The waiter count is read as (wlist >> kShiftCountByBits); whatever sits in
 * the bits below the shift is cleared by every successful update here, which
 * matches the "intentional clear" of the keyed-event path.
 *
 * @param nBroadcast upper bound on the number of waiters to wake. In the
 *                   non-SRW paths nothing is signalled when this is zero or
 *                   when no waiter is counted. The AURORA_FORCE_SRW_LOCKS
 *                   build ignores the bound and wakes every waiter.
 */
void ConditionVariableNT::BroadcastN(AuUInt32 nBroadcast)
{
#if !defined(AURORA_FORCE_SRW_LOCKS)

    if (gUseNativeWaitCondvar)
    {
        auto original = this->wlist;
        decltype(original) uWaiters = original >> kShiftCountByBits;
        decltype(original) uToWake  = AuMin<AuUInt32>(nBroadcast, uWaiters);

        if (!uToWake)
        {
            return;
        }

        // Publish the signals before touching the waiter count.
        // NOTE(review): if a retry below lowers uToWake (waiters left in the
        // meantime), signalCount keeps the surplus; this matches the previous
        // behaviour, confirm the wait side tolerates it.
        AuAtomicAdd(&this->signalCount, uToWake);

        while (true)
        {
            // Previously this stored (expected - uAwoken) unshifted, which is
            // zero on the first pass: the whole waiter count was discarded
            // and InternalLTSWakeCount was asked to wake zero threads.
            // Store the remaining waiters in the count field, mirroring the
            // keyed-event path, and wake exactly the number removed.
            if (AuAtomicCompareExchange(&this->wlist,
                                        ((uWaiters - uToWake) << kShiftCountByBits) /*intentional clear*/,
                                        original) == original)
            {
                InternalLTSWakeCount((void *)&this->wlist, uToWake);
                return;
            }

            // Lost the race: re-read the word and never wake more threads
            // than are still counted as waiting.
            original = this->wlist;
            uWaiters = original >> kShiftCountByBits;
            uToWake  = AuMin<AuUInt32>(uToWake, uWaiters);
        }
    }
    else
    {
        auto original = this->wlist;
        auto expected = original;
        expected = expected >> kShiftCountByBits;

        auto uBroadcastIterations = AuMin(nBroadcast, expected);

        // One signal + one keyed-event release per waiter, at most
        // uBroadcastIterations times, stopping early once no waiter is left.
        while (expected && uBroadcastIterations)
        {
            AuAtomicAdd(&this->signalCount, 1u);

            // Retry the decrement until it lands or the waiters are gone.
            while (expected && uBroadcastIterations)
            {
                bool bBreak {};

                if (AuAtomicCompareExchange(&this->wlist, ((expected - 1) << kShiftCountByBits) /*intentional clear*/, original) == original)
                {
                    pNtReleaseKeyedEvent(gKeyedEventHandle, (void *)&this->wlist, FALSE, nullptr);

                    uBroadcastIterations--;
                    bBreak = true;
                }

                // Always refresh the snapshot: it feeds both the retry and
                // the outer loop condition.
                original = this->wlist;
                expected = original >> kShiftCountByBits;

                if (bBreak)
                {
                    break;
                }
            }
        }
    }

#else
    // nBroadcast is not used on this path; every waiter on winCond_ is woken.
    ::WakeAllConditionVariable(&this->winCond_);
#endif
}
|
||||
|
||||
AUKN_SYM IConditionVariable *ConditionVariableNew(const AuSPtr<IConditionMutex> &pMutex)
|
||||
{
|
||||
return _new ConditionVariableImpl(pMutex);
|
||||
|
@ -19,6 +19,7 @@ namespace Aurora::Threading::Primitives
|
||||
bool WaitForSignalNsEx(Win32ConditionMutex *pMutex, AuUInt64 timeout);
|
||||
void Signal();
|
||||
void Broadcast();
|
||||
void BroadcastN(AuUInt32 nBroadcast);
|
||||
|
||||
auline bool CheckOut();
|
||||
|
||||
|
@ -214,6 +214,11 @@ namespace Aurora::Threading::Primitives
|
||||
|
||||
void SemaphoreImpl::Unlock(AuUInt16 uCount)
|
||||
{
|
||||
if (uCount == 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (gUseNativeWaitSemapahore)
|
||||
{
|
||||
AuAtomicAdd<AuUInt32>(&this->dwState_, uCount);
|
||||
@ -235,32 +240,21 @@ namespace Aurora::Threading::Primitives
|
||||
// realistically, we cant use the sleep counter optimization trick under windows 7
|
||||
// we would have to expand our already oversized by 8, 24-byte x86_64 semaphore for a trivial perf boost
|
||||
// we cant efficiently access the conditions state or atomic guarantees...
|
||||
|
||||
this->mutex.Lock(); // do not [re]move this lock fence
|
||||
|
||||
AuAtomicAdd<AuUInt32>(&this->dwState_, uCount); // this could be moved anywhere above the unlock, including above the lock.
|
||||
this->mutex.Unlock();
|
||||
|
||||
{
|
||||
this->mutex.Lock(); // do not [re]move this lock fence
|
||||
this->mutex.Unlock();
|
||||
}
|
||||
|
||||
if (uCount == 1)
|
||||
{
|
||||
this->var.Signal();
|
||||
}
|
||||
else
|
||||
{
|
||||
if (uCount >= 3) // ...this is the only optimization we can hope to achieve
|
||||
{
|
||||
// we can always save a few cycles by doing an atomic broadcast on a contended semaphore
|
||||
// waking up the wrong amount of threads probably doesn't matter at this point, on these target platforms
|
||||
this->var.Broadcast();
|
||||
}
|
||||
else // ...otherwise, do the handshake just a few times
|
||||
{
|
||||
// doing the condvar handshake for the exact amount of threads you need, once the cond is contended, can pay off
|
||||
for (AU_ITERATE_N(i, uCount))
|
||||
{
|
||||
(void)i;
|
||||
this->var.Signal();
|
||||
}
|
||||
}
|
||||
this->var.BroadcastN(uCount);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user