// AuroraRuntime/Source/Threading/Primitives/SMTYield.hpp
/***
Copyright (C) 2023-2024 J Reece Wilson (a/k/a "Reece"). All rights reserved.
File: SMTYield.hpp
Date: 2023-3-12
Author: Reece
***/
#pragma once
// Whatever; I'll use this header to bleed these flags in.
// It's the easiest way to reach every translation unit that compiles thread primitives.
// ...not strictly required.
#if defined(AURORA_COMPILER_MSVC)
#pragma strict_gs_check(off)
#pragma check_stack(off)
#endif
// The compiler insists on emitting stack checks in a non-zero number of my thread primitives.
// "Misguided" doesn't quite do it justice when it believes a spinlock acquire built on an atomic bit test and set is worth a stack check.
#if defined(AURORA_COMPILER_CLANG) || defined(AURORA_COMPILER_GCC)
#pragma GCC optimize("no-stack-protector")
#endif
#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)) && \
!defined(AURORA_COMPILER_MSVC) && \
!defined(AURORA_COMPILER_INTEL) && \
!defined(AURORA_A_GOOD_COMPILER_PLS)
// Even if clang (and GCC) expose these intrinsics, they must be enabled globally at build time, unlike SSE for some reason.
// We already branch around SSE4 paths at runtime without issue, so there is no reason hand-selected intrinsics
// should be gated behind a baseline feature flag.
// No, the compiler may not use these in its baseline feature set (or inside STL locks); yes, we still want to emit them by hand.
// If these encodings end up being wrong, blame the clang/GNU gating that forced the hand-rolled byte sequences below.
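// Encodings used below (believed correct per the vendor manuals; double-check before trusting them blindly):
//   MONITORX = 0F 01 FA       EAX = address, ECX = extensions, EDX = hints
//   MWAITX   = 0F 01 FB       EAX = hints, EBX = timeout, ECX = extensions (bit 1 enables the EBX timeout)
//   UMONITOR = F3 0F AE /6    register operand holds the address to monitor
//   UMWAIT   = F2 0F AE /6    EDX:EAX = TSC deadline, register operand = control (bit 0: 1 = C0.1 only, 0 = allow C0.2)
//   TPAUSE   = 66 0F AE /6    EDX:EAX = TSC deadline, register operand = control (same bit 0 meaning as UMWAIT)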
static auline void __mm_monitorx(void * __p, unsigned __extensions, unsigned __hints)
{
asm volatile(".byte 0x0f, 0x01, 0xfa;" :
: "a"(__p),
"c"(__extensions),
"d"(__hints));
}
static auline void __mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
{
asm volatile(".byte 0x0f, 0x01, 0xfb;" :
: "a"(__hints),
"b"(__clock),
"c"(__extensions));
}
static auline void __umonitor(void * __address)
{
__asm__ volatile(".byte 0xF3, 0x0F, 0xAE, 0x01;" :
: "a"(__address)
: );
}
static auline unsigned char __umwait(unsigned int __control, unsigned long long __counter)
{
AuUInt32 uTimeHi = AuUInt32(__counter >> 32);
AuUInt32 uTimeLo = AuUInt32(__counter & 0xffffffff);
char flag;
__asm__ volatile(".byte 0xF2, 0x0F, 0xAE, 0xF1\n"
"setb %0"
: "=r"(flag)
: "a"(uTimeLo),
"d"(uTimeHi),
"c"(__control)
: );
return flag;
}
static auline unsigned char __tpause(unsigned int __control, unsigned long long __counter)
{
AuUInt32 uTimeHi = AuUInt32(__counter >> 32);
AuUInt32 uTimeLo = AuUInt32(__counter & 0xffffffff);
char flag;
__asm__ volatile(".byte 0x66, 0x0F, 0xAE, 0xF1\n"
"setb %0"
: "=r"(flag)
: "a"(uTimeHi),
"d"(uTimeHi),
"c"(__control)
: );
return flag;
}
#define _mm_monitorx __mm_monitorx
#define _mm_mwaitx __mm_mwaitx
#define _umonitor __umonitor
#define _umwait __umwait
#define _tpause __tpause
#endif
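// Rough usage sketch of the wrappers above, mirroring the yield loops later in this header.
// uSpinCycles and gSomeWaitWord are illustrative names only; kMWAITXUseTSC is defined further down in Primitives.
//
//     AuUInt64 uSpinCycles = 4000;
//     _tpause(0, __rdtsc() + uSpinCycles);                  // Intel WAITPKG: pause until a TSC deadline
//     _mm_monitorx((void *)&gSomeWaitWord, 0U, 0U);         // AMD MONITORX: arm the monitor on a wait word
//     _mm_mwaitx(kMWAITXUseTSC, 0, (unsigned)uSpinCycles);  // AMD MWAITX: sleep until a store or the TSC timeout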
namespace Aurora::Threading
{
inline AuUInt32 gHasThreadLocalTimeout {};
inline thread_local AuUInt32 tlsSpinCountLocal {};
}
extern "C"
{
AuUInt32 SMTGetAPICNumber(void);
}
#include <Source/Extensions/Clocks.aarch64.hpp>
#define SPIN_FOUR 1
#define while_bc___(exp, ex) \
AuUInt32 __wsc ## ex {}; \
while ((exp) && ((__wsc ## ex++) < 2))
#define while_bc__(exp, ex) while_bc___(exp, ex)
#define while_bc_(exp, ex) while_bc__(exp, AU_WHAT(ex))
#if defined(AURORA_RUNTIME_ALWAYS_SPIN_ON_BROADCAST)
#define while_bc(exp) while (exp)
#elif defined(AURORA_RUNTIME_ALWAYS_CHECK_ONCE_ON_BROADCAST)
#define while_bc(exp) if (exp)
#elif defined(__COUNTER__)
#define while_bc(exp) while_bc_(exp, __COUNTER__)
#else
#define while_bc(exp) while_bc_(exp, __LINE__)
#endif
// Replace condition-variable broadcast loops of the form `if/while (waiters) { signal(); }` with while_bc.
// while_bc will attempt to rebroadcast if another sleeper turns up. On unicore systems,
// depending on the scheduler, spinning in this fashion may result in a deadlock in a tight
// enough wait loop. On other systems, having a `while (waiters) { signal(); }` loop may help
// improve performance when other threads are tying up the system scheduler on the wake side:
// some threads can arrive late, and we don't have to worry so much about a wake-up spin lock
// forming under real-world use. To strike a balance between the two conditions, while_bc adds
// a little extra branching overhead to ensure we don't spin more than two or three times.
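// Minimal sketch of the intended pattern (ConditionVariable, uSleeping, and Signal() are illustrative
// names, not part of this header):
//
//     void ConditionVariable::Broadcast()
//     {
//         while_bc (AuAtomicLoad(&this->uSleeping))
//         {
//             this->Signal();
//         }
//     }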
namespace Aurora::Threading::Primitives
{
namespace ThrdCfg
{
inline bool gPlatformIsSMPProcessorOptimized {}; // to include or not to include 🤔
inline bool gEnableAggressiveScheduling {};
inline bool gEnableAgrSchedulingRatelimit {};
inline bool gPreferNtCondvarModernWinSpin {};
inline bool gPreferNtCondvarOlderWinSpin {};
inline bool gPreferNtSemaphoreSpinTryLock {};
inline bool gPreferNtMutexSpinTryLock {};
inline bool gPreferNtCondMutexSpinTryLock {};
inline bool gPreferLinuxSemaphoreSpinTryLock {};
inline bool gPreferLinuxMutexSpinTryLock {};
inline bool gPreferLinuxCondMutexSpinTryLock {};
inline bool gPreferEmulatedWakeOnAddress {};
inline bool gPreferWaitOnAddressAlwaysSpin {};
inline bool gPreferWaitOnAddressAlwaysSpinNative {};
inline bool gPreferRWLockReadLockSpin {};
inline bool gUWPNanosecondEmulationCheckFirst {};
inline AuUInt32 gUWPNanosecondEmulationMaxYields {};
inline bool gForceEnableAdaptiveSpin {};
inline bool gPreferEnableAdaptiveSpin {};
inline bool gPreferLinuxAdaptiveSpin {};
inline bool gPreferOldWin32AdaptiveSpin {};
inline bool gPreferNewWin32AdaptiveSpin {};
inline AuUInt32 gAdaptiveSpinCUCnt0 {};
inline AuUInt32 gAdaptiveSpinCUCnt4 {};
inline AuUInt32 gAdaptiveSpinCUCnt8 {};
inline AuUInt32 gAdaptiveSpinCUCnt16 {};
inline bool gPreferFutexRWLock {};
inline bool gPreferFutexEvent {};
inline bool gWinXpThrough7BlazeOptimizerPower {};
inline bool gPreferLinuxPrimitivesFutexNoSpin {};
inline bool gPreferUnixPrimitivesNoSpin {};
inline bool gAlwaysRWLockWriteBiasOnReadLock {};
inline bool gEnableRWLockWriteBiasOnReadLock {};
inline AuUInt32 gIsIntelAlderLakeOrGreater {};
inline AuUInt32 gIsZen3OrGreater {};
inline AuUInt8 gCountOfPCores {};
}
#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
using SMTAtomic_t = AuUInt8;
#else
using SMTAtomic_t = AuUInt32;
#endif
inline SMTAtomic_t gCoreTable[256] {};
inline AuUInt32 gSpinAdaptiveThreshold {};
inline AuUInt32 gSpinAdaptiveCurrentCount {};
inline AuUInt32 gSpinAdaptiveThreadCount {};
inline AuUInt32 gUseFutexRWLock {};
inline AuUInt32 gPreferFutexEvent {};
static constexpr AuUInt32 kMWAITXUseTSC = (1 << 1);
static constexpr AuUInt32 kMWAITXAllowInterrupts = (1 << 0);
static constexpr AuUInt32 kMWAITXFaultGP0 = (1 << 31);
static constexpr AuUInt32 kMWAITXWaitOnStore = 0;
static constexpr AuUInt32 kMWAITXWaitOnStoreTimed = kMWAITXWaitOnStore | kMWAITXUseTSC;
struct Blocky
{
SMTAtomic_t a;
};
inline const AuAlignTo<16, Blocky> kMassiveBlock;
void InitAdaptiveThreshold();
void InitAdaptiveThresholdFirstTime();
void InitCfg();
static const bool kEnableSmartScheduling =
#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
true;
#else
// TBD by architecture and OS
false;
#endif
static auline void SMPPause()
{
#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
_mm_pause();
#elif defined(AURORA_ARCH_ARM)
#if defined(AURORA_COMPILER_GCC)
asm volatile("yield");
#else
__yield();
#endif
#else
// TODO: your platform here
AuThreading::ContextYield();
#endif
}
#if defined(AURORA_ARCH_ARM)
static AuUInt64 ConvertArmTicks(AuUInt64 uCounter, AuUInt64 uFreq)
{
if (uFreq == 10000000)
{
return uCounter * 100ull;
}
else if (uFreq == 1000000)
{
return uCounter * 1000ull;
}
else if (uFreq == 100000)
{
return uCounter * 10000ull;
}
else if (uFreq == 100000000ull)
{
return uCounter * 10ull;
}
else if (uFreq == 1000000000ull)
{
return uCounter;
}
else
{
const long long uWhole = (uCounter / uFreq) * 1'000'000'000ull;
const long long uPart = (uCounter % uFreq) * 1'000'000'000ull / uFreq;
return uWhole + uPart;
}
}
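// Worked example of the generic fallback path above, assuming a hypothetical CNTFRQ_EL0 of 24 MHz:
//   uCounter = 60'000'000 ticks, uFreq = 24'000'000
//   uWhole = (60'000'000 / 24'000'000) * 1'000'000'000 = 2'000'000'000 ns
//   uPart  = (60'000'000 % 24'000'000) * 1'000'000'000 / 24'000'000 = 500'000'000 ns
//   total  = 2'500'000'000 ns, i.e. exactly uCounter / uFreq = 2.5 seconds.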
static AuUInt64 RdtscArmEmulated(AuUInt64 uClockFreq)
{
return ConvertArmTicks(ArmQueryClockCounter(), uClockFreq) * 4;
// context:
// Intel recommends we spin, considering the potential for exponential back-offs later on, with a coefficient based on the CPUID brand of the processor.
// On most processors, RDTSC is not an instruction counter. That would be worthless; modern processors ingest hundreds of instructions to speculate over.
// Instead, RDTSC reads back a steady system-wide clock (*). It doesn't scale per core, nor can you overclock it.
// Back to Intel's recommendation: instead of spamming the execution pipeline with _mm_pauses in a loop, you should query RDTSC to solve the ABA problem and normalize for changes in the micro-architecture.
// This also allows Intel to shrink the potentially-NOP _mm_pause sleep period by changing the stated base clock.
// On the aarch64 side of things, we should be able to match the Intel behaviour exactly by:
// * Reading the system-wide clock (CNTVCT_EL0)
// * Normalizing to nanoseconds with the given frequency (CNTFRQ_EL0)
// * Dividing by approximately "3.6 GHz" ops/ns
// *: Ok, technically you can/need to verify Invariant TSC: CPUID.80000007H:EDX[8], but who actually cares?
}
#define __rdtsc() RdtscArmEmulated(uClockFreq)
#define ALT_RDT
#endif
template <typename T>
bool auline YieldToSharedCore(long spin, T callback)
{
if (callback())
{
return true;
}
#if defined(AURORA_ARCH_ARM)
AuUInt64 uClockFreq { ArmQueryClockFrequency() };
#endif
if (kEnableSmartScheduling)
{
bool bRet { false };
auto uWord = SMTGetAPICNumber();
if (uWord < AuArraySize(gCoreTable) &&
uWord < ThrdCfg::gCountOfPCores)
{
AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 1u);
auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
if (!gSpinAdaptiveThreshold || uNow <= gSpinAdaptiveThreshold)
{
auto uCount = spin;
if (AuAtomicLoad(&gCoreTable[uWord ^ 1]))
{
uCount /= 5;
}
else if (gHasThreadLocalTimeout)
{
uCount += tlsSpinCountLocal;
}
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
if (ThrdCfg::gIsIntelAlderLakeOrGreater)
{
if (callback())
{
bRet = true;
}
else
{
_tpause(0, __rdtsc() + uCount);
bRet = callback();
}
}
else if (ThrdCfg::gIsZen3OrGreater)
{
if (callback())
{
bRet = true;
}
else
{
_mm_monitorx((void *)&kMassiveBlock, 0U, 0U);
_mm_mwaitx(kMWAITXUseTSC, 0, uCount);
bRet = callback();
}
}
else
#endif
{
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
bRet = true;
break;
}
else
{
SMPPause();
uCount--;
}
}
}
}
AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 0u);
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
}
return bRet;
}
else if (gSpinAdaptiveThreshold)
{
auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
if (uNow <= gSpinAdaptiveThreshold)
{
auto uCount = spin;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
return true;
}
else
{
#if defined(SPIN_FOUR) && SPIN_FOUR == 1
SMPPause();
SMPPause();
SMPPause();
SMPPause();
uCount -= 4;
#else
SMPPause();
uCount -= 1;
#endif
}
}
if (gHasThreadLocalTimeout)
{
auto uCount = tlsSpinCountLocal;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
return true;
}
else
{
SMPPause();
uCount--;
}
}
}
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
}
else if (uNow <= (gSpinAdaptiveThreadCount / 4 * 3))
{
auto uCount = (spin) / 3;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
return true;
}
else
{
SMPPause();
uCount--;
}
}
}
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
}
else
{
auto uCount = spin;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
return true;
}
else
{
#if defined(SPIN_FOUR) && SPIN_FOUR == 1
SMPPause();
SMPPause();
SMPPause();
SMPPause();
uCount -= 4;
#else
SMPPause();
uCount -= 1;
#endif
}
}
if (gHasThreadLocalTimeout)
{
auto uCount = tlsSpinCountLocal;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
return true;
}
else
{
SMPPause();
uCount--;
}
}
}
}
return callback();
}
template <typename T>
bool auline YieldToSharedCoreAlderLake(long spin, T callback, const void *pWord)
{
if (callback())
{
return true;
}
#if defined(AURORA_ARCH_ARM)
AuUInt64 uClockFreq { ArmQueryClockFrequency() };
#endif
if (kEnableSmartScheduling)
{
bool bRet { false };
auto uWord = SMTGetAPICNumber();
if (uWord < AuArraySize(gCoreTable) &&
uWord < ThrdCfg::gCountOfPCores)
{
AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 1u);
auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
if (!gSpinAdaptiveThreshold || uNow <= gSpinAdaptiveThreshold)
{
auto uCount = spin;
bool bSMTProbablyHit {};
if (AuAtomicLoad(&gCoreTable[uWord ^ 1]))
{
uCount /= 5;
bSMTProbablyHit = true;
}
else if (gHasThreadLocalTimeout)
{
uCount += tlsSpinCountLocal;
}
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
if (ThrdCfg::gIsIntelAlderLakeOrGreater)
{
_umonitor((void *)AuPageRound<AuUInt>(AuUInt(pWord), AuHWInfo::GetCPUInfo().dwCacheLine));
if (callback())
{
bRet = true;
}
else
{
_umwait(bSMTProbablyHit ? 0 : 1 /* 0 = allow deeper C0.2 when the SMT sibling is busy, 1 = C0.1 only */, __rdtsc() + uCount);
bRet = callback();
}
}
else if (ThrdCfg::gIsZen3OrGreater)
{
_mm_monitorx((void *)pWord, 0U, 0U);
if (callback())
{
bRet = true;
}
else
{
_mm_mwaitx(kMWAITXWaitOnStoreTimed, 0, uCount);
bRet = callback();
}
}
else
#endif
{
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
bRet = true;
break;
}
else
{
SMPPause();
uCount--;
}
}
}
}
AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 0u);
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
}
return bRet;
}
else if (gSpinAdaptiveThreshold)
{
auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
if (uNow <= gSpinAdaptiveThreshold)
{
auto uCount = spin;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
return true;
}
else
{
#if defined(SPIN_FOUR) && SPIN_FOUR == 1
SMPPause();
SMPPause();
SMPPause();
SMPPause();
uCount -= 4;
#else
SMPPause();
uCount -= 1;
#endif
}
}
if (gHasThreadLocalTimeout)
{
auto uCount = tlsSpinCountLocal;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
return true;
}
else
{
SMPPause();
uCount--;
}
}
}
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
}
else if (uNow <= (gSpinAdaptiveThreadCount / 4 * 3))
{
auto uCount = (spin) / 3;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
return true;
}
else
{
SMPPause();
uCount--;
}
}
}
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
}
else
{
auto uCount = spin;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
return true;
}
else
{
#if defined(SPIN_FOUR) && SPIN_FOUR == 1
SMPPause();
SMPPause();
SMPPause();
SMPPause();
uCount -= 4;
#else
SMPPause();
uCount -= 1;
#endif
}
}
if (gHasThreadLocalTimeout)
{
auto uCount = tlsSpinCountLocal;
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
return true;
}
else
{
SMPPause();
uCount--;
}
}
}
}
return callback();
}
template <typename T>
bool auline DoTryIf(T callback)
{
if (ThrdCfg::gPlatformIsSMPProcessorOptimized)
{
return YieldToSharedCore(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback);
}
else
{
return callback();
}
}
template <typename T>
bool auline DoTryIfAlderLake(T callback, const void *pWord)
{
if (ThrdCfg::gPlatformIsSMPProcessorOptimized)
{
return YieldToSharedCoreAlderLake(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback, pWord);
}
else
{
return callback();
}
}
template <typename T>
bool auline DoTryIfAlderLake(T callback, const volatile void *pWord)
{
if (ThrdCfg::gPlatformIsSMPProcessorOptimized)
{
return YieldToSharedCoreAlderLake(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback, (const void *)pWord);
}
else
{
return callback();
}
}
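    // Minimal usage sketch for the DoTryIf* helpers (hypothetical caller; uSignalWord is illustrative):
    //
    //     inline AuUInt32 uSignalWord {};
    //
    //     bool WaitForSignalHint()
    //     {
    //         // The lambda must be cheap and side-effect free: it is re-evaluated throughout the spin.
    //         return DoTryIfAlderLake([&]()
    //         {
    //             return AuAtomicLoad(&uSignalWord) != 0;
    //         }, &uSignalWord);
    //     }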
}