[*] Trivial XP+ semaphore optimization for high wake counts

This commit is contained in:
Reece Wilson 2023-10-12 16:58:08 +01:00
parent e3ba8cf1c5
commit 23ddcf9ba1
3 changed files with 89 additions and 18 deletions

View File

@ -425,6 +425,82 @@ namespace Aurora::Threading::Primitives
#endif
}
/// Wakes at most nBroadcast of the threads currently counted in wlist.
///
/// The waiter count is stored in the bits of wlist above kShiftCountByBits;
/// every read below shifts right by kShiftCountByBits and every write shifts
/// the new count back left.
///
/// @param nBroadcast upper bound on the number of waiters to release.
///                   Nothing happens when it is zero or when no waiter is
///                   registered. The AURORA_FORCE_SRW_LOCKS build ignores the
///                   bound and wakes every waiter.
void ConditionVariableNT::BroadcastN(AuUInt32 nBroadcast)
{
#if !defined(AURORA_FORCE_SRW_LOCKS)
    if (gUseNativeWaitCondvar)
    {
        auto original = this->wlist;

        // Number of threads registered at the time of the snapshot.
        decltype(original) uWaiters = original >> kShiftCountByBits;

        // Number of threads this call is going to release.
        decltype(original) uAwoken = AuMin(nBroadcast, uWaiters);

        if (!uAwoken)
        {
            return;
        }

        // Signals are published before any thread is woken.
        // NOTE(review): if the CAS below loses a race and uAwoken shrinks,
        // signalCount stays at the larger value - confirm the wait side
        // tolerates surplus signals.
        AuAtomicAdd(&this->signalCount, uAwoken);

        while (true)
        {
            // Waiters that must stay registered after this call.
            auto uRemaining = uWaiters - uAwoken;

            // Store the remaining count in the same shifted encoding the
            // reads use, so waiters beyond nBroadcast are not dropped.
            if (AuAtomicCompareExchange(&this->wlist,
                                        uRemaining << kShiftCountByBits,
                                        original) == original)
            {
                // Wake the threads that were removed, not the ones left behind.
                InternalLTSWakeCount((void *)&this->wlist, uAwoken);
                return;
            }

            // wlist changed under us: re-read and never wake more threads
            // than are still registered.
            original = this->wlist;
            uWaiters = original >> kShiftCountByBits;
            uAwoken  = AuMin<AuUInt32>(uAwoken, uWaiters);
        }
    }
    else
    {
        // Keyed-event path: waiters are released one at a time.
        auto original = this->wlist;
        auto expected = original;
        expected = expected >> kShiftCountByBits;

        auto uBroadcastIterations = AuMin(nBroadcast, expected);

        while (expected && uBroadcastIterations)
        {
            // One signal per intended release.
            AuAtomicAdd(&this->signalCount, 1u);

            // Retry until one waiter has been removed from wlist, or until
            // no waiter is left.
            while (expected && uBroadcastIterations)
            {
                bool bBreak {};

                if (AuAtomicCompareExchange(&this->wlist, ((expected - 1) << kShiftCountByBits) /*intentional clear*/, original) == original)
                {
                    pNtReleaseKeyedEvent(gKeyedEventHandle, (void *)&this->wlist, FALSE, nullptr);
                    uBroadcastIterations--;
                    bBreak = true;
                }

                // Refresh the snapshot for the next attempt or release.
                original = this->wlist;
                expected = original >> kShiftCountByBits;

                if (bBreak)
                {
                    break;
                }
            }
        }
    }
#else
    // The SRW build has no counted wake here, so every waiter is woken.
    ::WakeAllConditionVariable(&this->winCond_);
#endif
}
AUKN_SYM IConditionVariable *ConditionVariableNew(const AuSPtr<IConditionMutex> &pMutex)
{
return _new ConditionVariableImpl(pMutex);

View File

@ -19,6 +19,7 @@ namespace Aurora::Threading::Primitives
bool WaitForSignalNsEx(Win32ConditionMutex *pMutex, AuUInt64 timeout);
void Signal();
void Broadcast();
// Wakes up to nBroadcast waiting threads; does nothing if no thread is waiting.
// When built with AURORA_FORCE_SRW_LOCKS, every waiter is woken instead.
void BroadcastN(AuUInt32 nBroadcast);
auline bool CheckOut();

View File

@ -214,6 +214,11 @@ namespace Aurora::Threading::Primitives
void SemaphoreImpl::Unlock(AuUInt16 uCount)
{
if (uCount == 0)
{
return;
}
if (gUseNativeWaitSemapahore)
{
AuAtomicAdd<AuUInt32>(&this->dwState_, uCount);
@ -236,9 +241,12 @@ namespace Aurora::Threading::Primitives
// we would have to expand our already oversized by 8, 24-byte x86_64 semaphore for a trivial perf boost
// we can't efficiently access the condition's state or atomic guarantees...
this->mutex.Lock(); // do not [re]move this lock fence
AuAtomicAdd<AuUInt32>(&this->dwState_, uCount); // this could be moved anywhere above the unlock, including above the lock.
{
this->mutex.Lock(); // do not [re]move this lock fence
this->mutex.Unlock();
}
if (uCount == 1)
{
@ -246,21 +254,7 @@ namespace Aurora::Threading::Primitives
}
else
{
if (uCount >= 3) // ...this is the only optimization we can hope to achieve
{
// we can always save a few cycles by doing an atomic broadcast on a contended semaphore
// waking up the wrong amount of threads probably doesn't matter at this point, on these target platforms
this->var.Broadcast();
}
else // ...otherwise, do the handshake just a few times
{
// doing the condvar handshake for the exact amount of threads you need, once the cond is contended, can pay off
for (AU_ITERATE_N(i, uCount))
{
(void)i;
this->var.Signal();
}
}
this->var.BroadcastN(uCount);
}
}
}