/***
Copyright (C) 2023-2024 J Reece Wilson (a/k/a "Reece"). All rights reserved.
File: SMTYield.hpp
Date: 2023-3-12
Author: Reece
***/
#pragma once
namespace Aurora::Threading
{
inline AuUInt32 gHasThreadLocalTimeout {};
inline thread_local AuUInt32 tlsSpinCountLocal {};
}
extern "C"
{
AuUInt32 SMTGetAPICNumber(void);
}
#include <Source/Extensions/Clocks.aarch64.hpp>
#define SPIN_FOUR 1
#define while_bc___(exp, ex) \
AuUInt32 __wsc ## ex {}; \
while ((exp) && ((__wsc ## ex++) < 2))
#define while_bc__(exp, ex) while_bc___(exp, ex)
#define while_bc_(exp, ex) while_bc__(exp, AU_WHAT(ex))
#if defined(AURORA_RUNTIME_ALWAYS_SPIN_ON_BROADCAST)
#define while_bc(exp) while (exp)
#elif defined(AURORA_RUNTIME_ALWAYS_CHECK_ONCE_ON_BROADCAST)
#define while_bc(exp) if (exp)
#elif defined(__COUNTER__)
#define while_bc(exp) while_bc_(exp, __COUNTER__)
#else
#define while_bc(exp) while_bc_(exp, __LINE__)
#endif
// Replace condition-variable broadcast loops of the form `if/while (waiters) { signal(); }` with while_bc.
// while_bc will attempt to rebroadcast if another sleeper turns up. On unicore systems,
// depending on the scheduler, spinning in this fashion in a tight enough wait loop may
// result in a deadlock. On other systems, a `while (waiters) { signal(); }` loop may help
// improve performance when other threads are tying up the system scheduler on the wake side:
// some threads can arrive late, and we don't have to worry so much about a wake-up spin lock
// forming under real-world use cases. To strike a balance between the two, we accept a little
// extra branching overhead to ensure we don't rebroadcast more than 2-3 times.
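//
// Usage sketch (illustrative only; `uSleepingCount` and `SignalOne()` are hypothetical
// stand-ins for a primitive's waiter counter and wake call, not APIs defined here):
//
//     // before: keeps rebroadcasting for as long as new sleepers keep arriving
//     while (AuAtomicLoad(&uSleepingCount))
//     {
//         SignalOne();
//     }
//
//     // after: runs the signal at most twice, then gives up even if waiters remain
//     while_bc (AuAtomicLoad(&uSleepingCount))
//     {
//         SignalOne();
//     }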
namespace Aurora::Threading::Primitives
{
namespace ThrdCfg
{
inline bool gPlatformIsSMPProcessorOptimized {}; // to include or not to include 🤔
inline bool gEnableAggressiveScheduling {};
inline bool gEnableAgrSchedulingRatelimit {};
inline bool gPreferNtCondvarModernWinSpin {};
inline bool gPreferNtCondvarOlderWinSpin {};
inline bool gPreferNtSemaphoreSpinTryLock {};
inline bool gPreferNtMutexSpinTryLock {};
inline bool gPreferNtCondMutexSpinTryLock {};
inline bool gPreferLinuxSemaphoreSpinTryLock {};
inline bool gPreferLinuxMutexSpinTryLock {};
inline bool gPreferLinuxCondMutexSpinTryLock {};
inline bool gPreferEmulatedWakeOnAddress {};
inline bool gPreferWaitOnAddressAlwaysSpin {};
inline bool gPreferWaitOnAddressAlwaysSpinNative {};
inline bool gPreferRWLockReadLockSpin {};
inline bool gUWPNanosecondEmulationCheckFirst {};
inline AuUInt32 gUWPNanosecondEmulationMaxYields {};
inline bool gForceEnableAdaptiveSpin {};
inline bool gPreferEnableAdaptiveSpin {};
inline bool gPreferLinuxAdaptiveSpin {};
inline bool gPreferOldWin32AdaptiveSpin {};
inline bool gPreferNewWin32AdaptiveSpin {};
inline AuUInt32 gAdaptiveSpinCUCnt0 {};
inline AuUInt32 gAdaptiveSpinCUCnt4 {};
inline AuUInt32 gAdaptiveSpinCUCnt8 {};
inline AuUInt32 gAdaptiveSpinCUCnt16 {};
inline bool gPreferFutexRWLock {};
inline bool gPreferFutexEvent {};
inline bool gWinXpThrough7BlazeOptimizerPower {};
inline bool gPreferLinuxPrimitivesFutexNoSpin {};
inline bool gPreferUnixPrimitivesNoSpin {};
inline bool gAlwaysRWLockWriteBiasOnReadLock {};
inline bool gEnableRWLockWriteBiasOnReadLock {};
inline AuUInt8 gCountOfPCores {};
}
#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
using SMTAtomic_t = AuUInt8;
#else
using SMTAtomic_t = AuUInt32;
#endif
inline SMTAtomic_t gCoreTable[256] {};
inline AuUInt32 gSpinAdaptiveThreshold {};
inline AuUInt32 gSpinAdaptiveCurrentCount {};
inline AuUInt32 gSpinAdaptiveThreadCount {};
inline AuUInt32 gUseFutexRWLock {};
inline AuUInt32 gPreferFutexEvent {};
void InitAdaptiveThreshold();
void InitAdaptiveThresholdFirstTime();
void InitCfg();
static const bool kEnableSmartScheduling =
#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
true;
#else
// tbd by arch and os
false;
#endif
static auline void SMPPause()
{
#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
_mm_pause();
#elif defined(AURORA_ARCH_ARM)
#if defined(AURORA_COMPILER_GCC)
asm volatile("yield");
#else
__yield();
#endif
#else
// TODO: your platform here
AuThreading::ContextYield();
#endif
}
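// Minimal sketch of the pattern SMPPause() backs (illustrative only; `uReadyWord` is a
// hypothetical flag, not state owned by this header): a bounded busy-wait that hints the
// core to relax between polls before the caller falls back to a real wait.
//
//     AuUInt32 uBudget { 64 };
//     while (!AuAtomicLoad(&uReadyWord) && uBudget--)
//     {
//         SMPPause();
//     }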
#if defined(AURORA_ARCH_ARM)
static AuUInt64 ConvertArmTicks(AuUInt64 uCounter, AuUInt64 uFreq)
{
if (uFreq == 10000000)
{
return uCounter * 100ull;
}
else if (uFreq == 1000000)
{
return uCounter * 1000ull;
}
else if (uFreq == 100000)
{
return uCounter * 10000ull;
}
else if (uFreq == 100000000ull)
{
return uCounter * 10ull;
}
else if (uFreq == 1000000000ull)
{
return uCounter;
}
else
{
const long long uWhole = (uCounter / uFreq) * 1'000'000'000ull;
const long long uPart = (uCounter % uFreq) * 1'000'000'000ull / uFreq;
return uWhole + uPart;
}
}
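// Worked example of the fallback branch (assuming a 24 MHz CNTFRQ_EL0, a common generic
// timer frequency on Arm SoCs): uCounter = 48'000'123, uFreq = 24'000'000
//   uWhole = (48'000'123 / 24'000'000) * 1'000'000'000 = 2'000'000'000 ns
//   uPart  = (48'000'123 % 24'000'000) * 1'000'000'000 / 24'000'000 = 5'125 ns
//   total  = 2'000'005'125 ns (i.e. ~2.000005125 seconds of counter time)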
static AuUInt64 RdtscArmEmulated(AuUInt64 uClockFreq)
{
return ConvertArmTicks(ArmQueryClockCounter(), uClockFreq) * 4;
// context:
// Intel recommends we spin, considering the potential for exponential back-offs later on, with a coefficient based on the CPUID brand of the processor.
// On most processors, RDTSC is not an instruction counter. That would be worthless; modern processors ingest hundreds of instructions to speculate on.
// Instead, RDTSC reads back a steady system-wide clock (*). It doesn't scale per core, nor can you overclock it.
// Back to Intel's recommendation: instead of spamming your process's execution pipeline with _mm_pause calls in a loop, you should query RDTSC to solve the ABA problem and normalize for changes in the micro-architecture.
// This does allow Intel to decrease this potentially-NOP _mm_pause sleep period by changing the stated base clock.
// On the AArch64 side of things, we should be able to match the exact Intel behaviour by:
// * Reading the system-wide clock (CNTVCT_EL0)
// * Normalizing to nanoseconds with the given frequency (CNTFRQ_EL0)
// * Scaling by approximately "3.6 GHz" worth of ticks per nanosecond (rounded to x4 above)
// *: Ok, technically you can/need to verify Invariant TSC: CPUID.80000007H:EDX[8], but who actually cares?
}
#define __rdtsc() RdtscArmEmulated(uClockFreq)
#define ALT_RDT
#endif
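// Note: on ARM the __rdtsc() macro above expands to RdtscArmEmulated(uClockFreq), so it can
// only be used where a local `uClockFreq` is in scope; YieldToSharedCore below declares one
// from ArmQueryClockFrequency() before entering its spin loops.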
template <typename T>
bool auline YieldToSharedCore(long spin, T callback)
{
if (callback())
{
return true;
}
#if defined(AURORA_ARCH_ARM)
AuUInt64 uClockFreq { ArmQueryClockFrequency() };
#endif
if (kEnableSmartScheduling)
{
bool bRet { false };
auto uWord = SMTGetAPICNumber();
if (uWord < AuArraySize(gCoreTable) &&
uWord < ThrdCfg::gCountOfPCores)
{
AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 1u);
auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
if (uNow <= gSpinAdaptiveThreshold)
{
auto uCount = spin;
if (AuAtomicLoad(&gCoreTable[uWord ^ 1]))
{
uCount /= 5;
}
else if (gHasThreadLocalTimeout)
{
uCount += tlsSpinCountLocal;
}
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
bRet = true;
break;
}
else
{
SMPPause();
uCount--;
}
}
}
AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 0u);
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
}
return bRet;
}
else if (gSpinAdaptiveThreshold)
{
auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
if (uNow <= gSpinAdaptiveThreshold)
{
auto uCount = spin;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
return true;
}
else
{
#if defined(SPIN_FOUR) && SPIN_FOUR == 1
SMPPause();
SMPPause();
SMPPause();
SMPPause();
uCount -= 4;
#else
SMPPause();
uCount -= 1;
#endif
}
}
if (gHasThreadLocalTimeout)
{
auto uCount = tlsSpinCountLocal;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
return true;
}
else
{
SMPPause();
uCount--;
}
}
}
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
}
else if (uNow <= (gSpinAdaptiveThreadCount / 4 * 3))
{
auto uCount = (spin) / 3;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
return true;
}
else
{
SMPPause();
uCount--;
}
}
}
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
}
else
{
auto uCount = spin;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
return true;
}
else
{
#if defined(SPIN_FOUR) && SPIN_FOUR == 1
SMPPause();
SMPPause();
SMPPause();
SMPPause();
uCount -= 4;
#else
SMPPause();
uCount -= 1;
#endif
}
}
if (gHasThreadLocalTimeout)
{
auto uCount = tlsSpinCountLocal;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
return true;
}
else
{
SMPPause();
uCount--;
}
}
}
}
return callback();
}
template <typename T>
bool auline DoTryIf(T callback)
{
if (ThrdCfg::gPlatformIsSMPProcessorOptimized)
{
return YieldToSharedCore(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback);
}
else
{
return callback();
}
}
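// Usage sketch (illustrative only; `uLockWord` and the exact AuAtomicCompareExchange
// signature are assumptions, not guarantees made by this header): a primitive's
// try-acquire path hands its attempt to DoTryIf as a predicate, and the SMT/adaptive
// spin logic above decides how long to keep retrying before the caller falls back to
// a kernel wait.
//
//     bool TrySpinAcquire(AuUInt32 &uLockWord)
//     {
//         return DoTryIf([&]()
//         {
//             // attempt the 0 -> 1 transition; succeed only if we observed 0
//             return AuAtomicCompareExchange(&uLockWord, 1u, 0u) == 0u;
//         });
//     }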
}