/***
    Copyright (C) 2023-2024 J Reece Wilson (a/k/a "Reece"). All rights reserved.

    File: SMTYield.hpp
    Date: 2023-3-12
    Author: Reece
***/
#pragma once

// Whatever, I'll use this header to bleed these flags in.
// It's the easiest way to get at all of the translation units compiling thread primitives.

// ...not required
#if defined(AURORA_COMPILER_MSVC)
    #pragma strict_gs_check(off)
    #pragma check_stack(off)
#endif

// dumbshit compiler is emitting stack checks under a non-zero amount of my thread primitives.
// "dumbshit compiler" doesn't quite do it justice when it believes a fucking spinlock-lock with an atomic bit test and set is worth a stack check.
#if defined(AURORA_COMPILER_CLANG) || defined(AURORA_COMPILER_GCC)
    #pragma GCC optimize("no-stack-protector")
#endif

#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)) && \
    !defined(AURORA_COMPILER_MSVC) &&                          \
    !defined(AURORA_COMPILER_INTEL) &&                         \
    !defined(AURORA_A_GOOD_COMPILER_PLS)

// Even if clang (and gcc) have these intrins available, you must enable them globally, unlike SSE for some fucking reason.
// I mean, we can do runtime branching around SSE4 paths no problem. Why all of a sudden am I being gated out of the intrins I'm electing to use by hand?
// No, you (the compiler) may not use these in your baseline feature set (or incl in stl locks). Yes, I still want them. Now fuck off.
// If these end up being wrong, blame clang and gnu for being cunts, not me.

static auline void __mm_monitorx(void *__p, unsigned __extensions, unsigned __hints)
{
    asm volatile(".byte 0x0f, 0x01, 0xfa;"
                 :
                 : "a"(__p), "c"(__extensions), "d"(__hints));
}

static auline void __mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
{
    asm volatile(".byte 0x0f, 0x01, 0xfb;"
                 :
                 : "a"(__hints), "b"(__clock), "c"(__extensions));
}

static auline void __umonitor(void *__address)
{
    __asm__ volatile(".byte 0xF3, 0x0F, 0xAE, 0x01;"
                     :
                     : "a"(__address)
                     :);
}

static auline unsigned char __umwait(unsigned int __control, unsigned long long __counter)
{
    AuUInt32 uTimeHi = AuUInt32(__counter >> 32);
    AuUInt32 uTimeLo = AuUInt32(__counter & 0xffffffff);
    char flag;
    __asm__ volatile(".byte 0xF2, 0x0F, 0xAE, 0xF1\n"
                     "setb %0"
                     : "=r"(flag)
                     : "a"(uTimeLo), "d"(uTimeHi), "c"(__control)
                     :);
    return flag;
}

static auline unsigned char __tpause(unsigned int __control, unsigned long long __counter)
{
    AuUInt32 uTimeHi = AuUInt32(__counter >> 32);
    AuUInt32 uTimeLo = AuUInt32(__counter & 0xffffffff);
    char flag;
    __asm__ volatile(".byte 0x66, 0x0F, 0xAE, 0xF1\n"
                     "setb %0"
                     : "=r"(flag)
                     : "a"(uTimeLo), "d"(uTimeHi), "c"(__control)
                     :);
    return flag;
}

#define _mm_monitorx __mm_monitorx
#define _mm_mwaitx   __mm_mwaitx
#define _umonitor    __umonitor
#define _umwait      __umwait
#define _tpause      __tpause

#endif

namespace Aurora::Threading
{
    inline AuUInt32 gHasThreadLocalTimeout {};
    inline thread_local AuUInt32 tlsSpinCountLocal {};
}

extern "C"
{
    AuUInt32 SMTGetAPICNumber(void);
}

#include

#define SPIN_FOUR 1

#define while_bc___(exp, ex)      \
    AuUInt32 __wsc ## ex {};      \
    while ((exp) && ((__wsc ## ex++) < 2))
#define while_bc__(exp, ex) while_bc___(exp, ex)
#define while_bc_(exp, ex)  while_bc__(exp, AU_WHAT(ex))

#if defined(AURORA_RUNTIME_ALWAYS_SPIN_ON_BROADCAST)
    #define while_bc(exp) while (exp)
#elif defined(AURORA_RUNTIME_ALWAYS_CHECK_ONCE_ON_BROADCAST)
    #define while_bc(exp) if (exp)
#elif defined(__COUNTER__)
    #define while_bc(exp) while_bc_(exp, __COUNTER__)
#else
    #define while_bc(exp) while_bc_(exp, __LINE__)
#endif

// Replace condition variable broadcast loops of the form `if/while (waiters) { signal() }` with while_bc.
// while_bc will attempt to rebroadcast if another sleeper turns up. On unicore systems,
// depending on the scheduler, spinning in this fashion may result in a deadlock in a tight
// enough wait loop. On other systems, having a `while (waiters) { signal(); }` may help
// improve performance when other threads are tying up the system scheduler on the wake side.
// That way some threads can be late in, and we don't have to worry so much about there being a
// wake-up spin lock under real world use cases. To strike a balance between the two conditions,
// we add a little bit of extra branching overhead to ensure we don't spin more than 2-3 times.
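// A minimal usage sketch of while_bc (illustrative only: `uSleepers` and `WakeOne()` are
// hypothetical stand-ins for a waiter counter and an OS wake call, not names defined here):
//
//     // before: unbounded rebroadcast on the wake side
//     //     while (AuAtomicLoad(&uSleepers)) { WakeOne(); }
//     // after: rebroadcast at most a couple of times, then let the sleepers catch up on their own
//     while_bc (AuAtomicLoad(&uSleepers))
//     {
//         WakeOne();
//     }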
namespace Aurora::Threading::Primitives
{
    namespace ThrdCfg
    {
        inline bool gPlatformIsSMPProcessorOptimized {}; // to include or not to include 🤔
        inline bool gEnableAggressiveScheduling {};
        inline bool gEnableAgrSchedulingRatelimit {};
        inline bool gPreferNtCondvarModernWinSpin {};
        inline bool gPreferNtCondvarOlderWinSpin {};
        inline bool gPreferNtSemaphoreSpinTryLock {};
        inline bool gPreferNtMutexSpinTryLock {};
        inline bool gPreferNtCondMutexSpinTryLock {};
        inline bool gPreferLinuxSemaphoreSpinTryLock {};
        inline bool gPreferLinuxMutexSpinTryLock {};
        inline bool gPreferLinuxCondMutexSpinTryLock {};
        inline bool gPreferEmulatedWakeOnAddress {};
        inline bool gPreferWaitOnAddressAlwaysSpin {};
        inline bool gPreferWaitOnAddressAlwaysSpinNative {};
        inline bool gPreferRWLockReadLockSpin {};
        inline bool gUWPNanosecondEmulationCheckFirst {};
        inline AuUInt32 gUWPNanosecondEmulationMaxYields {};
        inline bool gForceEnableAdaptiveSpin {};
        inline bool gPreferEnableAdaptiveSpin {};
        inline bool gPreferLinuxAdaptiveSpin {};
        inline bool gPreferOldWin32AdaptiveSpin {};
        inline bool gPreferNewWin32AdaptiveSpin {};
        inline AuUInt32 gAdaptiveSpinCUCnt0 {};
        inline AuUInt32 gAdaptiveSpinCUCnt4 {};
        inline AuUInt32 gAdaptiveSpinCUCnt8 {};
        inline AuUInt32 gAdaptiveSpinCUCnt16 {};
        inline bool gPreferFutexRWLock {};
        inline bool gPreferFutexEvent {};
        inline bool gWinXpThrough7BlazeOptimizerPower {};
        inline bool gPreferLinuxPrimitivesFutexNoSpin {};
        inline bool gPreferUnixPrimitivesNoSpin {};
        inline bool gAlwaysRWLockWriteBiasOnReadLock {};
        inline bool gEnableRWLockWriteBiasOnReadLock {};
        inline AuUInt32 gIsIntelAlderLakeOrGreater {};
        inline AuUInt32 gIsZen3OrGreater {};
        inline AuUInt8 gCountOfPCores {};
    }

    #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
    using SMTAtomic_t = AuUInt8;
    #else
    using SMTAtomic_t = AuUInt32;
    #endif

    inline SMTAtomic_t gCoreTable[256] {};

    inline AuUInt32 gSpinAdaptiveThreshold {};
    inline AuUInt32 gSpinAdaptiveCurrentCount {};
    inline AuUInt32 gSpinAdaptiveThreadCount {};

    inline AuUInt32 gUseFutexRWLock {};
    inline AuUInt32 gPreferFutexEvent {};

    static constexpr AuUInt32 kMWAITXUseTSC           = (1 << 1);
    static constexpr AuUInt32 kMWAITXAllowInterrupts  = (1 << 0);
    static constexpr AuUInt32 kMWAITXFaultGP0         = (1 << 31);
    static constexpr AuUInt32 kMWAITXWaitOnStore      = 0;
    static constexpr AuUInt32 kMWAITXWaitOnStoreTimed = kMWAITXWaitOnStore | kMWAITXUseTSC;

    struct Blocky
    {
        SMTAtomic_t a;
    };

    inline const AuAlignTo<16, Blocky> kMassiveBlock;

    void InitAdaptiveThreshold();
    void InitAdaptiveThresholdFirstTime();
    void InitCfg();

    static const bool kEnableSmartScheduling =
    #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
        true;
    #else
        // tbd by arch and os
        false;
    #endif

    static auline void SMPPause()
    {
    #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
        _mm_pause();
    #elif defined(AURORA_ARCH_ARM)
        #if defined(AURORA_COMPILER_GCC)
        asm volatile("yield");
        #else
        __yield();
        #endif
    #else
        // TODO: your platform here
        AuThreading::ContextYield();
    #endif
    }
volatile("yield"); #else __yield(); #endif #else // TODO: your platform here AuThreading::ContextYield(); #endif } #if defined(AURORA_ARCH_ARM) static AuUInt64 ConvertArmTicks(AuUInt64 uCounter, AuUInt64 uFreq) { if (uFreq == 10000000) { return uCounter * 100ull; } else if (uFreq == 1000000) { return uCounter * 1000ull; } else if (uFreq == 100000) { return uCounter * 10000ull; } else if (uFreq == 100000000ull) { return uCounter * 10ull; } else if (uFreq == 1000000000ull) { return uCounter; } else { const long long uWhole = (uCounter / uFreq) * 1'000'000'000ull; const long long uPart = (uCounter % uFreq) * 1'000'000'000ull / uFreq; return uWhole + uPart; } } static AuUInt64 RdtscArmEmulated(AuUInt64 uClockFreq) { return ConvertArmTicks(ArmQueryClockCounter(), uClockFreq) * 4; // context: // Intel recommends we spin, considering the potential for exponential back-offs later on, with a coefficient based on the CPUID brand of the processor. // Under most processors, RDTSC is not that of the instruction counter. That'd be worthless; modern processors are ingesting hundreds of instructions to speculate on. // Instead, RDTSC reads back a steady system-wide clock (*). It doesn't scale per core, nor can you overclock it. // Back to Intels recommentation, instead of spamming your processes execution pipeline with mm_pauses in a loop, you should query RDTSC to solve the ABA problem and normalize for changes in the micro-architecture. // This does allow Intel to decrease this potentially-NOP mm_pause sleep period by changing the stated base clock. // On the aarch side of things, we should be able to match the exact Intel behaviour by: // * Reading the system wide clock (CNTVCT_EL0) // * Normalizing to nanoseconds with the given frequency (CNTFRQ_EL0) // * Divide by approx "3.6 Ghz" ops/ns // *: Ok, techincally you can/need to verify Invariant TSC: CPUID.80000007H:EDX[8], but who actually cares? 
    #define __rdtsc() RdtscArmEmulated(uClockFreq)
    #define ALT_RDT
    #endif

    template <typename T>
    bool auline YieldToSharedCore(long spin, T callback)
    {
        if (callback())
        {
            return true;
        }

    #if defined(AURORA_ARCH_ARM)
        AuUInt64 uClockFreq { ArmQueryClockFrequency() };
    #endif

        if (kEnableSmartScheduling)
        {
            bool bRet { false };
            auto uWord = SMTGetAPICNumber();
            if (uWord < AuArraySize(gCoreTable) && uWord < ThrdCfg::gCountOfPCores)
            {
                AuAtomicStore(&gCoreTable[uWord], 1u);

                auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
                if (!gSpinAdaptiveThreshold || uNow <= gSpinAdaptiveThreshold)
                {
                    auto uCount = spin;
                    if (AuAtomicLoad(&gCoreTable[uWord ^ 1]))
                    {
                        uCount /= 5;
                    }
                    else if (gHasThreadLocalTimeout)
                    {
                        uCount += tlsSpinCountLocal;
                    }

                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
                    if (ThrdCfg::gIsIntelAlderLakeOrGreater)
                    {
                        if (callback())
                        {
                            bRet = true;
                        }
                        else
                        {
                            _tpause(0, __rdtsc() + uCount);
                            bRet = callback();
                        }
                    }
                    else if (ThrdCfg::gIsZen3OrGreater)
                    {
                        if (callback())
                        {
                            bRet = true;
                        }
                        else
                        {
                            _mm_monitorx((void *)&kMassiveBlock, 0U, 0U);
                            _mm_mwaitx(kMWAITXUseTSC, 0, uCount);
                            bRet = callback();
                        }
                    }
                    else
                #endif
                    {
                    #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                        auto perfCounter = __rdtsc() + uCount;
                        while (__rdtsc() < perfCounter)
                    #else
                        while (uCount > 0)
                    #endif
                        {
                            if (callback())
                            {
                                bRet = true;
                                break;
                            }
                            else
                            {
                                SMPPause();
                                uCount--;
                            }
                        }
                    }
                }

                AuAtomicStore(&gCoreTable[uWord], 0u);
                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
            }

            return bRet;
        }
        else if (gSpinAdaptiveThreshold)
        {
            auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);

            if (uNow <= gSpinAdaptiveThreshold)
            {
                auto uCount = spin;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                    #if defined(SPIN_FOUR) && SPIN_FOUR == 1
                        SMPPause();
                        SMPPause();
                        SMPPause();
                        SMPPause();
                        uCount -= 4;
                    #else
                        SMPPause();
                        uCount -= 1;
                    #endif
                    }
                }

                if (gHasThreadLocalTimeout)
                {
                    auto uCount = tlsSpinCountLocal;
                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                    auto perfCounter = __rdtsc() + uCount;
                    while (__rdtsc() < perfCounter)
                #else
                    while (uCount > 0)
                #endif
                    {
                        if (callback())
                        {
                            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                            return true;
                        }
                        else
                        {
                            SMPPause();
                            uCount--;
                        }
                    }
                }

                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
            }
            else if (uNow <= (gSpinAdaptiveThreadCount / 4 * 3))
            {
                auto uCount = (spin) / 3;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }

            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
        }
        else
        {
            auto uCount = spin;
        #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
            auto perfCounter = __rdtsc() + uCount;
            while (__rdtsc() < perfCounter)
        #else
            while (uCount > 0)
        #endif
            {
                if (callback())
                {
                    return true;
                }
                else
                {
                #if defined(SPIN_FOUR) && SPIN_FOUR == 1
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    uCount -= 4;
                #else
                    SMPPause();
                    uCount -= 1;
                #endif
                }
            }

            if (gHasThreadLocalTimeout)
            {
                auto uCount = tlsSpinCountLocal;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }
        }

        return callback();
    }
    template <typename T>
    bool auline YieldToSharedCoreAlderLake(long spin, T callback, const void *pWord)
    {
        if (callback())
        {
            return true;
        }

    #if defined(AURORA_ARCH_ARM)
        AuUInt64 uClockFreq { ArmQueryClockFrequency() };
    #endif

        if (kEnableSmartScheduling)
        {
            bool bRet { false };
            auto uWord = SMTGetAPICNumber();
            if (uWord < AuArraySize(gCoreTable) && uWord < ThrdCfg::gCountOfPCores)
            {
                AuAtomicStore(&gCoreTable[uWord], 1u);

                auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
                if (!gSpinAdaptiveThreshold || uNow <= gSpinAdaptiveThreshold)
                {
                    auto uCount = spin;
                    bool bSMTProbablyHit {};
                    if (AuAtomicLoad(&gCoreTable[uWord ^ 1]))
                    {
                        uCount /= 5;
                        bSMTProbablyHit = true;
                    }
                    else if (gHasThreadLocalTimeout)
                    {
                        uCount += tlsSpinCountLocal;
                    }

                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
                    if (ThrdCfg::gIsIntelAlderLakeOrGreater)
                    {
                        _umonitor((void *)AuPageRound(AuUInt(pWord), AuHWInfo::GetCPUInfo().dwCacheLine));

                        if (callback())
                        {
                            bRet = true;
                        }
                        else
                        {
                            _umwait(bSMTProbablyHit ? 0 : 1, __rdtsc() + uCount);
                            bRet = callback();
                        }
                    }
                    else if (ThrdCfg::gIsZen3OrGreater)
                    {
                        _mm_monitorx((void *)pWord, 0U, 0U);

                        if (callback())
                        {
                            bRet = true;
                        }
                        else
                        {
                            _mm_mwaitx(kMWAITXWaitOnStoreTimed, 0, uCount);
                            bRet = callback();
                        }
                    }
                    else
                #endif
                    {
                    #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                        auto perfCounter = __rdtsc() + uCount;
                        while (__rdtsc() < perfCounter)
                    #else
                        while (uCount > 0)
                    #endif
                        {
                            if (callback())
                            {
                                bRet = true;
                                break;
                            }
                            else
                            {
                                SMPPause();
                                uCount--;
                            }
                        }
                    }
                }

                AuAtomicStore(&gCoreTable[uWord], 0u);
                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
            }

            return bRet;
        }
        else if (gSpinAdaptiveThreshold)
        {
            auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);

            if (uNow <= gSpinAdaptiveThreshold)
            {
                auto uCount = spin;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                    #if defined(SPIN_FOUR) && SPIN_FOUR == 1
                        SMPPause();
                        SMPPause();
                        SMPPause();
                        SMPPause();
                        uCount -= 4;
                    #else
                        SMPPause();
                        uCount -= 1;
                    #endif
                    }
                }

                if (gHasThreadLocalTimeout)
                {
                    auto uCount = tlsSpinCountLocal;
                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                    auto perfCounter = __rdtsc() + uCount;
                    while (__rdtsc() < perfCounter)
                #else
                    while (uCount > 0)
                #endif
                    {
                        if (callback())
                        {
                            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                            return true;
                        }
                        else
                        {
                            SMPPause();
                            uCount--;
                        }
                    }
                }

                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
            }
            else if (uNow <= (gSpinAdaptiveThreadCount / 4 * 3))
            {
                auto uCount = (spin) / 3;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }

            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
        }
        else
        {
            auto uCount = spin;
        #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
            auto perfCounter = __rdtsc() + uCount;
            while (__rdtsc() < perfCounter)
        #else
            while (uCount > 0)
        #endif
            {
                if (callback())
                {
                    return true;
                }
                else
                {
                #if defined(SPIN_FOUR) && SPIN_FOUR == 1
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    uCount -= 4;
                #else
                    SMPPause();
                    uCount -= 1;
                #endif
                }
            }

            if (gHasThreadLocalTimeout)
            {
                auto uCount = tlsSpinCountLocal;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }
        }

        return callback();
    }
    template <typename T>
    bool auline DoTryIf(T callback)
    {
        if (ThrdCfg::gPlatformIsSMPProcessorOptimized)
        {
            return YieldToSharedCore(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback);
        }
        else
        {
            return callback();
        }
    }

    template <typename T>
    bool auline DoTryIfAlderLake(T callback, const void *pWord)
    {
        if (ThrdCfg::gPlatformIsSMPProcessorOptimized)
        {
            return YieldToSharedCoreAlderLake(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback, pWord);
        }
        else
        {
            return callback();
        }
    }

    template <typename T>
    bool auline DoTryIfAlderLake(T callback, const volatile void *pWord)
    {
        if (ThrdCfg::gPlatformIsSMPProcessorOptimized)
        {
            return YieldToSharedCoreAlderLake(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback, (const void *)pWord);
        }
        else
        {
            return callback();
        }
    }
}
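// A minimal usage sketch for the try-spin helpers above, as called from a primitive
// implementation inside Aurora::Threading::Primitives (illustrative only: `uAtomicState`
// and `TryAcquire()` are hypothetical names; only DoTryIfAlderLake itself is defined here):
//
//     bool bAcquired = DoTryIfAlderLake([&]()
//     {
//         return TryAcquire();     // fast-path attempt; returning true ends the spin early
//     }, &this->uAtomicState);     // the word umonitor/monitorx watches for stores
//
//     if (!bAcquired)
//     {
//         // fall back to the OS wait path (futex, keyed event, etc.)
//     }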