/***
    Copyright (C) 2023-2024 J Reece Wilson (a/k/a "Reece"). All rights reserved.

    File: SMTYield.hpp
    Date: 2023-3-12
    Author: Reece
***/
#pragma once

// Whatever, I'll use this header to bleed these flags in.
// It's the easiest way to get at all of the translation units compiling thread primitives.
// ...not required
#if defined(AURORA_COMPILER_MSVC)
    #pragma strict_gs_check(off)
    #pragma check_stack(off)
#endif

// dumbshit compiler is emitting stack checks under a non-zero amount of my thread primitives.
// "dumbshit compiler" doesn't quite do it justice when it believes a fucking spinlock lock with an atomic bit-test-and-set is worth a stack check.
#if defined(AURORA_COMPILER_CLANG) || defined(AURORA_COMPILER_GCC)
    #pragma GCC optimize("no-stack-protector")
#endif

#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)) && \
    !defined(AURORA_COMPILER_MSVC) && \
    !defined(AURORA_COMPILER_INTEL) && \
    !defined(AURORA_A_GOOD_COMPILER_PLS)

// Even if clang (and gcc) have these intrins available, you must enable them globally, unlike SSE, for some fucking reason.
// I mean, we can do runtime branching around SSE4 paths no problem. Why all of a sudden am I being gated out of the intrins I'm electing to use by hand?
// No, you (the compiler) may not use these in your baseline feature set (or incl in stl locks). Yes, I still want them. Now fuck off.
// If these end up being wrong, blame clang and gnu for being cunts, not me.

static auline void __mm_monitorx(void * __p, unsigned __extensions, unsigned __hints)
{
    // AMD MONITORX (0F 01 FA): arm a monitor on the cache line containing __p (RAX), extensions in ECX, hints in EDX
    asm volatile(".byte 0x0f, 0x01, 0xfa;" :
                 : "a"(__p),
                   "c"(__extensions),
                   "d"(__hints));
}

static auline void __mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
{
    // AMD MWAITX (0F 01 FB): wait on the armed monitor; hints in EAX, timeout in EBX, extensions in ECX
    asm volatile(".byte 0x0f, 0x01, 0xfb;" :
                 : "a"(__hints),
                   "b"(__clock),
                   "c"(__extensions));
}

static auline void __umonitor(void * __address)
{
    // Intel UMONITOR (F3 0F AE /6), monitored address in RAX; ModRM 0xF0 = mod 11, reg /6, rm RAX
    __asm__ volatile(".byte 0xF3, 0x0F, 0xAE, 0xF0;"
                     :
                     : "a"(__address)
                     : );
}

static auline unsigned char __umwait(unsigned int __control, unsigned long long __counter)
{
    AuUInt32 uTimeHi = AuUInt32(__counter >> 32);
    AuUInt32 uTimeLo = AuUInt32(__counter & 0xffffffff);
    char flag;

    // Intel UMWAIT (F2 0F AE /6), control in ECX, TSC deadline in EDX:EAX; setb captures RFLAGS.CF as the return value
    __asm__ volatile(".byte 0xF2, 0x0F, 0xAE, 0xF1\n"
                     "setb %0"
                     : "=r"(flag)
                     : "a"(uTimeLo),
                       "d"(uTimeHi),
                       "c"(__control)
                     : );

    return flag;
}

static auline unsigned char __tpause(unsigned int __control, unsigned long long __counter)
{
    AuUInt32 uTimeHi = AuUInt32(__counter >> 32);
    AuUInt32 uTimeLo = AuUInt32(__counter & 0xffffffff);
    char flag;

    // Intel TPAUSE (66 0F AE /6), control in ECX, TSC deadline in EDX:EAX; setb captures RFLAGS.CF as the return value
    __asm__ volatile(".byte 0x66, 0x0F, 0xAE, 0xF1\n"
                     "setb %0"
                     : "=r"(flag)
                     : "a"(uTimeLo),
                       "d"(uTimeHi),
                       "c"(__control)
                     : );

    return flag;
}

#define _mm_monitorx __mm_monitorx
#define _mm_mwaitx __mm_mwaitx
#define _umonitor __umonitor
#define _umwait __umwait
#define _tpause __tpause

#endif

namespace Aurora::Threading
{
    inline AuUInt32 gHasThreadLocalTimeout {};
    inline thread_local AuUInt32 tlsSpinCountLocal {};
}

extern "C"
{
    AuUInt32 SMTGetAPICNumber(void);
}

#include <Source/Extensions/Clocks.aarch64.hpp>

#define SPIN_FOUR 1

#define while_bc___(exp, ex) \
    AuUInt32 __wsc ## ex {}; \
    while ((exp) && ((__wsc ## ex++) < 2))
#define while_bc__(exp, ex) while_bc___(exp, ex)
#define while_bc_(exp, ex) while_bc__(exp, AU_WHAT(ex))

#if defined(AURORA_RUNTIME_ALWAYS_SPIN_ON_BROADCAST)
    #define while_bc(exp) while (exp)
#elif defined(AURORA_RUNTIME_ALWAYS_CHECK_ONCE_ON_BROADCAST)
    #define while_bc(exp) if (exp)
#elif defined(__COUNTER__)
    #define while_bc(exp) while_bc_(exp, __COUNTER__)
#else
    #define while_bc(exp) while_bc_(exp, __LINE__)
#endif

// Replace condition variable broadcasts: `if/while (waiters) { signal() }` loops with while_bc.
// while_bc will attempt to rebroadcast if another sleeper turns up. On unicore systems,
// depending on the scheduler, spinning in this fashion may result in a deadlock in a tight
// enough wait loop. On other systems, having a `while (waiters) { signal(); }` may help
// improve performance when other threads are tying up the system scheduler on the wake side.
// That way some threads can be late in, and we don't have to worry so much about there being a
// wake-up spin lock under real-world use cases. To strike a balance between the two conditions,
// we add a little bit of extra branching overhead to ensure we don't spin more than 2-3 times.
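//
// Usage sketch (illustrative, not from this header): a broadcast path where `uSleeping` is a
// hypothetical waiter counter and `Signal()` is a hypothetical wake-one call; while_bc caps the
// rebroadcast loop at a couple of passes:
//
//     while_bc (AuAtomicLoad(&uSleeping))
//     {
//         Signal();
//     }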

namespace Aurora::Threading::Primitives
{
    namespace ThrdCfg
    {
        inline bool gPlatformIsSMPProcessorOptimized {}; // to include or not to include 🤔
        inline bool gEnableAggressiveScheduling {};
        inline bool gEnableAgrSchedulingRatelimit {};
        inline bool gPreferNtCondvarModernWinSpin {};
        inline bool gPreferNtCondvarOlderWinSpin {};
        inline bool gPreferNtSemaphoreSpinTryLock {};
        inline bool gPreferNtMutexSpinTryLock {};
        inline bool gPreferNtCondMutexSpinTryLock {};
        inline bool gPreferLinuxSemaphoreSpinTryLock {};
        inline bool gPreferLinuxMutexSpinTryLock {};
        inline bool gPreferLinuxCondMutexSpinTryLock {};
        inline bool gPreferEmulatedWakeOnAddress {};
        inline bool gPreferWaitOnAddressAlwaysSpin {};
        inline bool gPreferWaitOnAddressAlwaysSpinNative {};
        inline bool gPreferRWLockReadLockSpin {};
        inline bool gUWPNanosecondEmulationCheckFirst {};
        inline AuUInt32 gUWPNanosecondEmulationMaxYields {};
        inline bool gForceEnableAdaptiveSpin {};
        inline bool gPreferEnableAdaptiveSpin {};
        inline bool gPreferLinuxAdaptiveSpin {};
        inline bool gPreferOldWin32AdaptiveSpin {};
        inline bool gPreferNewWin32AdaptiveSpin {};
        inline AuUInt32 gAdaptiveSpinCUCnt0 {};
        inline AuUInt32 gAdaptiveSpinCUCnt4 {};
        inline AuUInt32 gAdaptiveSpinCUCnt8 {};
        inline AuUInt32 gAdaptiveSpinCUCnt16 {};
        inline bool gPreferFutexRWLock {};
        inline bool gPreferFutexEvent {};
        inline bool gWinXpThrough7BlazeOptimizerPower {};
        inline bool gPreferLinuxPrimitivesFutexNoSpin {};
        inline bool gPreferUnixPrimitivesNoSpin {};
        inline bool gAlwaysRWLockWriteBiasOnReadLock {};
        inline bool gEnableRWLockWriteBiasOnReadLock {};
        inline AuUInt32 gIsIntelAlderLakeOrGreater {};
        inline AuUInt32 gIsZen3OrGreater {};
        inline AuUInt8 gCountOfPCores {};
    }

    #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
    using SMTAtomic_t = AuUInt8;
    #else
    using SMTAtomic_t = AuUInt32;
    #endif
    inline SMTAtomic_t gCoreTable[256] {};

    inline AuUInt32 gSpinAdaptiveThreshold {};
    inline AuUInt32 gSpinAdaptiveCurrentCount {};
    inline AuUInt32 gSpinAdaptiveThreadCount {};

    inline AuUInt32 gUseFutexRWLock {};
    inline AuUInt32 gPreferFutexEvent {};

    static constexpr AuUInt32 kMWAITXUseTSC           = (1 << 1);
    static constexpr AuUInt32 kMWAITXAllowInterrupts  = (1 << 0);
    static constexpr AuUInt32 kMWAITXFaultGP0         = (1 << 31);
    static constexpr AuUInt32 kMWAITXWaitOnStore      = 0;
    static constexpr AuUInt32 kMWAITXWaitOnStoreTimed = kMWAITXWaitOnStore | kMWAITXUseTSC;

    struct Blocky
    {
        SMTAtomic_t a;
    };

    inline const AuAlignTo<16, Blocky> kMassiveBlock;

    void InitAdaptiveThreshold();
    void InitAdaptiveThresholdFirstTime();
    void InitCfg();

    static const bool kEnableSmartScheduling =
    #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
        true;
    #else
        // tbd by arch and os
        false;
    #endif

    static auline void SMPPause()
    {
    #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
        _mm_pause();
    #elif defined(AURORA_ARCH_ARM)
        #if defined(AURORA_COMPILER_GCC)
        asm volatile("yield");
        #else
        __yield();
        #endif
    #else
        // TODO: your platform here
        AuThreading::ContextYield();
    #endif
    }

    #if defined(AURORA_ARCH_ARM)

    static AuUInt64 ConvertArmTicks(AuUInt64 uCounter, AuUInt64 uFreq)
    {
        // fast paths for common CNTFRQ_EL0 values; otherwise fall back to an exact 64-bit conversion to nanoseconds
        if (uFreq == 10000000)
        {
            return uCounter * 100ull;
        }
        else if (uFreq == 1000000)
        {
            return uCounter * 1000ull;
        }
        else if (uFreq == 100000)
        {
            return uCounter * 10000ull;
        }
        else if (uFreq == 100000000ull)
        {
            return uCounter * 10ull;
        }
        else if (uFreq == 1000000000ull)
        {
            return uCounter;
        }
        else
        {
            const long long uWhole = (uCounter / uFreq) * 1'000'000'000ull;
            const long long uPart  = (uCounter % uFreq) * 1'000'000'000ull / uFreq;
            return uWhole + uPart;
        }
    }

    static AuUInt64 RdtscArmEmulated(AuUInt64 uClockFreq)
    {
        return ConvertArmTicks(ArmQueryClockCounter(), uClockFreq) * 4;
        // context:
        // Intel recommends we spin with a coefficient based on the CPUID brand of the processor, considering the potential for exponential back-offs later on.
        // On most processors, RDTSC is not an instruction counter. That'd be worthless; modern processors are ingesting hundreds of instructions to speculate on.
        // Instead, RDTSC reads back a steady system-wide clock (*). It doesn't scale per core, nor can you overclock it.
        // Back to Intel's recommendation: instead of spamming your processor's execution pipeline with mm_pauses in a loop, you should query RDTSC to solve the ABA problem and normalize for changes in the micro-architecture.
        // This does allow Intel to decrease this potentially-NOP mm_pause sleep period by changing the stated base clock.
        // On the aarch64 side of things, we should be able to match the exact Intel behaviour by:
        //  * Reading the system-wide clock (CNTVCT_EL0)
        //  * Normalizing to nanoseconds with the given frequency (CNTFRQ_EL0)
        //  * Dividing by approx "3.6 GHz" ops/ns
        // *: OK, technically you can/need to verify invariant TSC: CPUID.80000007H:EDX[8], but who actually cares?
    }
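
    // Worked example of the general branch in ConvertArmTicks (assuming CNTFRQ_EL0 reports
    // 24 MHz, a common Armv8 value; the real frequency comes from ArmQueryClockFrequency()):
    //     uCounter = 36'000'000 ticks, uFreq = 24'000'000 Hz
    //     uWhole = (36'000'000 / 24'000'000) * 1'000'000'000         = 1'000'000'000 ns
    //     uPart  = (36'000'000 % 24'000'000) * 1'000'000'000 / uFreq =   500'000'000 ns
    //     RdtscArmEmulated(uFreq) then returns (uWhole + uPart) * 4  = 6'000'000'000.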

    #define __rdtsc() RdtscArmEmulated(uClockFreq)
    #define ALT_RDT
    #endif

    template <typename T>
    bool auline YieldToSharedCore(long spin, T callback)
    {
        if (callback())
        {
            return true;
        }

    #if defined(AURORA_ARCH_ARM)
        AuUInt64 uClockFreq { ArmQueryClockFrequency() };
    #endif

        if (kEnableSmartScheduling)
        {
            bool bRet { false };
            auto uWord = SMTGetAPICNumber();
            if (uWord < AuArraySize(gCoreTable) &&
                uWord < ThrdCfg::gCountOfPCores)
            {
                AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 1u);

                auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
                if (!gSpinAdaptiveThreshold || uNow <= gSpinAdaptiveThreshold)
                {
                    auto uCount = spin;

                    if (AuAtomicLoad(&gCoreTable[uWord ^ 1]))
                    {
                        uCount /= 5;
                    }
                    else if (gHasThreadLocalTimeout)
                    {
                        uCount += tlsSpinCountLocal;
                    }

                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
                    if (ThrdCfg::gIsIntelAlderLakeOrGreater)
                    {
                        if (callback())
                        {
                            bRet = true;
                        }
                        else
                        {
                            _tpause(0, __rdtsc() + uCount);
                            bRet = callback();
                        }
                    }
                    else if (ThrdCfg::gIsZen3OrGreater)
                    {
                        if (callback())
                        {
                            bRet = true;
                        }
                        else
                        {
                            _mm_monitorx((void *)&kMassiveBlock, 0U, 0U);
                            _mm_mwaitx(kMWAITXUseTSC, 0, uCount);
                            bRet = callback();
                        }
                    }
                    else
                #endif
                    {
                    #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                        auto perfCounter = __rdtsc() + uCount;
                        while (__rdtsc() < perfCounter)
                    #else
                        while (uCount > 0)
                    #endif
                        {
                            if (callback())
                            {
                                bRet = true;
                                break;
                            }
                            else
                            {
                                SMPPause();
                                uCount--;
                            }
                        }
                    }
                }
                AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 0u);
                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
            }

            return bRet;
        }
        else if (gSpinAdaptiveThreshold)
        {
            auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);

            if (uNow <= gSpinAdaptiveThreshold)
            {
                auto uCount = spin;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                    #if defined(SPIN_FOUR) && SPIN_FOUR == 1
                        SMPPause();
                        SMPPause();
                        SMPPause();
                        SMPPause();
                        uCount -= 4;
                    #else
                        SMPPause();
                        uCount -= 1;
                    #endif
                    }
                }

                if (gHasThreadLocalTimeout)
                {
                    auto uCount = tlsSpinCountLocal;
                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                    auto perfCounter = __rdtsc() + uCount;
                    while (__rdtsc() < perfCounter)
                #else
                    while (uCount > 0)
                #endif
                    {
                        if (callback())
                        {
                            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                            return true;
                        }
                        else
                        {
                            SMPPause();
                            uCount--;
                        }
                    }
                }

                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
            }
            else if (uNow <= (gSpinAdaptiveThreadCount / 4 * 3))
            {
                auto uCount = (spin) / 3;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }

            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
        }
        else
        {
            auto uCount = spin;
        #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
            auto perfCounter = __rdtsc() + uCount;
            while (__rdtsc() < perfCounter)
        #else
            while (uCount > 0)
        #endif
            {
                if (callback())
                {
                    return true;
                }
                else
                {
                #if defined(SPIN_FOUR) && SPIN_FOUR == 1
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    uCount -= 4;
                #else
                    SMPPause();
                    uCount -= 1;
                #endif
                }
            }

            if (gHasThreadLocalTimeout)
            {
                auto uCount = tlsSpinCountLocal;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }
        }

        return callback();
    }

    template <typename T>
    bool auline YieldToSharedCoreAlderLake(long spin, T callback, const void *pWord)
    {
        if (callback())
        {
            return true;
        }

    #if defined(AURORA_ARCH_ARM)
        AuUInt64 uClockFreq { ArmQueryClockFrequency() };
    #endif

        if (kEnableSmartScheduling)
        {
            bool bRet { false };
            auto uWord = SMTGetAPICNumber();
            if (uWord < AuArraySize(gCoreTable) &&
                uWord < ThrdCfg::gCountOfPCores)
            {
                AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 1u);

                auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
                if (!gSpinAdaptiveThreshold || uNow <= gSpinAdaptiveThreshold)
                {
                    auto uCount = spin;
                    bool bSMTProbablyHit {};

                    if (AuAtomicLoad(&gCoreTable[uWord ^ 1]))
                    {
                        uCount /= 5;
                        bSMTProbablyHit = true;
                    }
                    else if (gHasThreadLocalTimeout)
                    {
                        uCount += tlsSpinCountLocal;
                    }

                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
                    if (ThrdCfg::gIsIntelAlderLakeOrGreater)
                    {
                        _umonitor((void *)AuPageRound<AuUInt>(AuUInt(pWord), AuHWInfo::GetCPUInfo().dwCacheLine));

                        if (callback())
                        {
                            bRet = true;
                        }
                        else
                        {
                            _umwait(bSMTProbablyHit ? 0 : 1, __rdtsc() + uCount); // control bit 0: 0 = C0.2 (deeper state), 1 = C0.1 (faster wake-up)
                            bRet = callback();
                        }
                    }
                    else if (ThrdCfg::gIsZen3OrGreater)
                    {
                        _mm_monitorx((void *)pWord, 0U, 0U);

                        if (callback())
                        {
                            bRet = true;
                        }
                        else
                        {
                            _mm_mwaitx(kMWAITXWaitOnStoreTimed, 0, uCount);
                            bRet = callback();
                        }
                    }
                    else
                #endif
                    {
                    #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                        auto perfCounter = __rdtsc() + uCount;
                        while (__rdtsc() < perfCounter)
                    #else
                        while (uCount > 0)
                    #endif
                        {
                            if (callback())
                            {
                                bRet = true;
                                break;
                            }
                            else
                            {
                                SMPPause();
                                uCount--;
                            }
                        }
                    }
                }
                AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 0u);
                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
            }

            return bRet;
        }
        else if (gSpinAdaptiveThreshold)
        {
            auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);

            if (uNow <= gSpinAdaptiveThreshold)
            {
                auto uCount = spin;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                    #if defined(SPIN_FOUR) && SPIN_FOUR == 1
                        SMPPause();
                        SMPPause();
                        SMPPause();
                        SMPPause();
                        uCount -= 4;
                    #else
                        SMPPause();
                        uCount -= 1;
                    #endif
                    }
                }

                if (gHasThreadLocalTimeout)
                {
                    auto uCount = tlsSpinCountLocal;
                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                    auto perfCounter = __rdtsc() + uCount;
                    while (__rdtsc() < perfCounter)
                #else
                    while (uCount > 0)
                #endif
                    {
                        if (callback())
                        {
                            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                            return true;
                        }
                        else
                        {
                            SMPPause();
                            uCount--;
                        }
                    }
                }

                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
            }
            else if (uNow <= (gSpinAdaptiveThreadCount / 4 * 3))
            {
                auto uCount = (spin) / 3;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }

            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
        }
        else
        {
            auto uCount = spin;
        #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
            auto perfCounter = __rdtsc() + uCount;
            while (__rdtsc() < perfCounter)
        #else
            while (uCount > 0)
        #endif
            {
                if (callback())
                {
                    return true;
                }
                else
                {
                #if defined(SPIN_FOUR) && SPIN_FOUR == 1
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    uCount -= 4;
                #else
                    SMPPause();
                    uCount -= 1;
                #endif
                }
            }

            if (gHasThreadLocalTimeout)
            {
                auto uCount = tlsSpinCountLocal;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }
        }

        return callback();
    }

    template <typename T>
    bool auline DoTryIf(T callback)
    {
        if (ThrdCfg::gPlatformIsSMPProcessorOptimized)
        {
            return YieldToSharedCore(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback);
        }
        else
        {
            return callback();
        }
    }

    template <typename T>
    bool auline DoTryIfAlderLake(T callback, const void *pWord)
    {
        if (ThrdCfg::gPlatformIsSMPProcessorOptimized)
        {
            return YieldToSharedCoreAlderLake(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback, pWord);
        }
        else
        {
            return callback();
        }
    }

    template <typename T>
    bool auline DoTryIfAlderLake(T callback, const volatile void *pWord)
    {
        if (ThrdCfg::gPlatformIsSMPProcessorOptimized)
        {
            return YieldToSharedCoreAlderLake(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback, (const void *)pWord);
        }
        else
        {
            return callback();
        }
    }
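
    // Usage sketch (illustrative, not from this header): the spin-then-give-up fast path of a
    // hypothetical mutex, where `TryLockNoSpin()` and `uState` stand in for the real primitive:
    //
    //     bool Mutex::LockTryWithSpin()
    //     {
    //         return DoTryIfAlderLake([&]()
    //         {
    //             return this->TryLockNoSpin();
    //         }, &this->uState);
    //     }
    //
    // The callback must be cheap and side-effect free on failure: it is retried throughout the
    // spin window, and one final attempt decides the return value.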
}