J Reece Wilson
67894b399b
Even worse, im just going to fucking nuke all clang related checks from orbit in our global build_scripts (8b00dc69fceea62ecbbf5a21255a41e2f23921a4), because they admit they cause a 2x slowdown.
/***
    Copyright (C) 2023-2024 J Reece Wilson (a/k/a "Reece"). All rights reserved.

    File: SMTYield.hpp
    Date: 2023-3-12
    Author: Reece
***/
#pragma once

// Whatever, I'll use this header to bleed these flags in.
// It's the easiest way to get at all of the translation units compiling thread primitives.
// ...not required
#if defined(AURORA_COMPILER_MSVC)
    #pragma strict_gs_check(off)
    #pragma check_stack(off)
#endif

// The dumbshit compiler is emitting stack checks in a non-zero number of my thread primitives.
// "Dumbshit compiler" doesn't quite do it justice when it believes a fucking spinlock-lock with an atomic bit test and set is worth a stack check.
#if defined(AURORA_COMPILER_CLANG) || defined(AURORA_COMPILER_GCC)
    #pragma GCC optimize("no-stack-protector")
#endif

#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)) && \
    !defined(AURORA_COMPILER_MSVC) && \
    !defined(AURORA_COMPILER_INTEL) && \
    !defined(AURORA_A_GOOD_COMPILER_PLS)

    // Even if clang (and gcc) have these intrins available, you must enable them globally, unlike SSE for some fucking reason.
    // I mean, we can do runtime branching around SSE4 paths no problem. Why all of a sudden am I being gated out of the intrins I'm electing to use by hand?
    // No, you (the compiler) may not use these in your baseline feature set (or include them in STL locks). Yes, I still want them. Now fuck off.
    // If these end up being wrong, blame clang and gnu for being cunts, not me.

    // MONITORX (0F 01 FA): arm the monitor on the address in EAX/RAX.
    static auline void __mm_monitorx(void * __p, unsigned __extensions, unsigned __hints)
    {
        asm volatile(".byte 0x0f, 0x01, 0xfa;" :
                     : "a"(__p),
                       "c"(__extensions),
                       "d"(__hints));
    }

    // MWAITX (0F 01 FB): wait on the armed monitor; EBX bounds the wait when ECX requests the TSC timer.
    static auline void __mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
    {
        asm volatile(".byte 0x0f, 0x01, 0xfb;" :
                     : "a"(__hints),
                       "b"(__clock),
                       "c"(__extensions));
    }

    // UMONITOR (F3 0F AE /6): arm the user-mode monitor on the address in RAX (ModRM F0 = RAX).
    static auline void __umonitor(void * __address)
    {
        __asm__ volatile(".byte 0xF3, 0x0F, 0xAE, 0xF0;" :
                         : "a"(__address)
                         : );
    }

    // UMWAIT (F2 0F AE /6, ECX operand): wait for a store to the monitored line or the EDX:EAX TSC deadline; CF reports a timeout.
    static auline unsigned char __umwait(unsigned int __control, unsigned long long __counter)
    {
        AuUInt32 uTimeHi = AuUInt32(__counter >> 32);
        AuUInt32 uTimeLo = AuUInt32(__counter & 0xffffffff);
        char flag;
        __asm__ volatile(".byte 0xF2, 0x0F, 0xAE, 0xF1\n"
                         "setb %0"
                         : "=r"(flag)
                         : "a"(uTimeLo),
                           "d"(uTimeHi),
                           "c"(__control)
                         : );
        return flag;
    }

    // TPAUSE (66 0F AE /6, ECX operand): pause until the EDX:EAX TSC deadline; low dword in EAX, high dword in EDX.
    static auline unsigned char __tpause(unsigned int __control, unsigned long long __counter)
    {
        AuUInt32 uTimeHi = AuUInt32(__counter >> 32);
        AuUInt32 uTimeLo = AuUInt32(__counter & 0xffffffff);
        char flag;
        __asm__ volatile(".byte 0x66, 0x0F, 0xAE, 0xF1\n"
                         "setb %0"
                         : "=r"(flag)
                         : "a"(uTimeLo),
                           "d"(uTimeHi),
                           "c"(__control)
                         : );
        return flag;
    }

    #define _mm_monitorx __mm_monitorx
    #define _mm_mwaitx __mm_mwaitx
    #define _umonitor __umonitor
    #define _umwait __umwait
    #define _tpause __tpause

#endif
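
// A minimal detection sketch (not part of the original header): if a caller wants to gate the
// wrappers above at runtime, the documented CPUID bits are MONITORX/MWAITX in leaf 8000_0001h
// ECX[29] and WAITPKG (UMONITOR/UMWAIT/TPAUSE) in leaf 07h ECX[5]. The helpers below assume a
// GCC/clang-style <cpuid.h>; verify the leaf/bit positions against your targets before relying
// on this. In practice the runtime gates these paths via gIsIntelAlderLakeOrGreater and
// gIsZen3OrGreater instead.
//
//     #include <cpuid.h>
//
//     static bool HasMwaitx()
//     {
//         unsigned eax {}, ebx {}, ecx {}, edx {};
//         return __get_cpuid(0x80000001u, &eax, &ebx, &ecx, &edx) && (ecx & (1u << 29));
//     }
//
//     static bool HasWaitPkg()
//     {
//         unsigned eax {}, ebx {}, ecx {}, edx {};
//         return __get_cpuid_count(7u, 0u, &eax, &ebx, &ecx, &edx) && (ecx & (1u << 5));
//     }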

namespace Aurora::Threading
{
    inline AuUInt32 gHasThreadLocalTimeout {};
    inline thread_local AuUInt32 tlsSpinCountLocal {};
}

extern "C"
{
    AuUInt32 SMTGetAPICNumber(void);
}

#include <Source/Extensions/Clocks.aarch64.hpp>

#define SPIN_FOUR 1

#define while_bc___(exp, ex) \
    AuUInt32 __wsc ## ex {}; \
    while ((exp) && ((__wsc ## ex++) < 2))
#define while_bc__(exp, ex) while_bc___(exp, ex)
#define while_bc_(exp, ex) while_bc__(exp, AU_WHAT(ex))

#if defined(AURORA_RUNTIME_ALWAYS_SPIN_ON_BROADCAST)
    #define while_bc(exp) while (exp)
#elif defined(AURORA_RUNTIME_ALWAYS_CHECK_ONCE_ON_BROADCAST)
    #define while_bc(exp) if (exp)
#elif defined(__COUNTER__)
    #define while_bc(exp) while_bc_(exp, __COUNTER__)
#else
    #define while_bc(exp) while_bc_(exp, __LINE__)
#endif

// Replace condition variable broadcasts: `if/while (waiters) { signal() }` loops with while_bc.
// while_bc will attempt to rebroadcast if another sleeper turns up. On unicore systems,
// depending on the scheduler, spinning in this fashion may result in a deadlock in a tight
// enough wait loop. On other systems, having a `while (waiters) { signal(); }` loop may help
// improve performance when other threads are tying up the system scheduler on the wake side.
// That way some threads can be late in, and we don't have to worry so much about there being a
// wake-up spin lock under real-world use cases. To strike a balance between the two conditions,
// we add a little bit of extra branching overhead to ensure we don't spin more than 2-3 times.
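//
// A minimal usage sketch (hypothetical names; the real waiter bookkeeping lives in the condvar
// and event implementations that include this header): the broadcast path checks the sleeper
// count through while_bc instead of a bare while, e.g.
//
//     while_bc (AuAtomicLoad(&this->uSleepers))
//     {
//         this->WakeOne();
//     }
//
// Outside of the ALWAYS_SPIN / CHECK_ONCE overrides above, the body re-runs at most twice per
// broadcast before control returns to the caller.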

namespace Aurora::Threading::Primitives
{
    namespace ThrdCfg
    {
        inline bool gPlatformIsSMPProcessorOptimized {}; // to include or not to include 🤔
        inline bool gEnableAggressiveScheduling {};
        inline bool gEnableAgrSchedulingRatelimit {};
        inline bool gPreferNtCondvarModernWinSpin {};
        inline bool gPreferNtCondvarOlderWinSpin {};
        inline bool gPreferNtSemaphoreSpinTryLock {};
        inline bool gPreferNtMutexSpinTryLock {};
        inline bool gPreferNtCondMutexSpinTryLock {};
        inline bool gPreferLinuxSemaphoreSpinTryLock {};
        inline bool gPreferLinuxMutexSpinTryLock {};
        inline bool gPreferLinuxCondMutexSpinTryLock {};
        inline bool gPreferEmulatedWakeOnAddress {};
        inline bool gPreferWaitOnAddressAlwaysSpin {};
        inline bool gPreferWaitOnAddressAlwaysSpinNative {};
        inline bool gPreferRWLockReadLockSpin {};
        inline bool gUWPNanosecondEmulationCheckFirst {};
        inline AuUInt32 gUWPNanosecondEmulationMaxYields {};
        inline bool gForceEnableAdaptiveSpin {};
        inline bool gPreferEnableAdaptiveSpin {};
        inline bool gPreferLinuxAdaptiveSpin {};
        inline bool gPreferOldWin32AdaptiveSpin {};
        inline bool gPreferNewWin32AdaptiveSpin {};
        inline AuUInt32 gAdaptiveSpinCUCnt0 {};
        inline AuUInt32 gAdaptiveSpinCUCnt4 {};
        inline AuUInt32 gAdaptiveSpinCUCnt8 {};
        inline AuUInt32 gAdaptiveSpinCUCnt16 {};
        inline bool gPreferFutexRWLock {};
        inline bool gPreferFutexEvent {};
        inline bool gWinXpThrough7BlazeOptimizerPower {};
        inline bool gPreferLinuxPrimitivesFutexNoSpin {};
        inline bool gPreferUnixPrimitivesNoSpin {};
        inline bool gAlwaysRWLockWriteBiasOnReadLock {};
        inline bool gEnableRWLockWriteBiasOnReadLock {};
        inline AuUInt32 gIsIntelAlderLakeOrGreater {};
        inline AuUInt32 gIsZen3OrGreater {};
        inline AuUInt8 gCountOfPCores {};
    }

    #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
    using SMTAtomic_t = AuUInt8;
    #else
    using SMTAtomic_t = AuUInt32;
    #endif
    inline SMTAtomic_t gCoreTable[256] {};

    inline AuUInt32 gSpinAdaptiveThreshold {};
    inline AuUInt32 gSpinAdaptiveCurrentCount {};
    inline AuUInt32 gSpinAdaptiveThreadCount {};

    inline AuUInt32 gUseFutexRWLock {};
    inline AuUInt32 gPreferFutexEvent {};

    static constexpr AuUInt32 kMWAITXUseTSC = (1 << 1);
    static constexpr AuUInt32 kMWAITXAllowInterrupts = (1 << 0);
    static constexpr AuUInt32 kMWAITXFaultGP0 = (1 << 31);
    static constexpr AuUInt32 kMWAITXWaitOnStore = 0;
    static constexpr AuUInt32 kMWAITXWaitOnStoreTimed = kMWAITXWaitOnStore | kMWAITXUseTSC;

    struct Blocky
    {
        SMTAtomic_t a;
    };

    inline const AuAlignTo<16, Blocky> kMassiveBlock;

    void InitAdaptiveThreshold();
    void InitAdaptiveThresholdFirstTime();
    void InitCfg();

    static const bool kEnableSmartScheduling =
    #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
        true;
    #else
        // tbd by arch and os
        false;
    #endif

    static auline void SMPPause()
    {
        #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
        _mm_pause();
        #elif defined(AURORA_ARCH_ARM)
            #if defined(AURORA_COMPILER_GCC)
            asm volatile("yield");
            #else
            __yield();
            #endif
        #else
        // TODO: your platform here
        AuThreading::ContextYield();
        #endif
    }

    #if defined(AURORA_ARCH_ARM)

    static AuUInt64 ConvertArmTicks(AuUInt64 uCounter, AuUInt64 uFreq)
    {
        if (uFreq == 10000000)
        {
            return uCounter * 100ull;
        }
        else if (uFreq == 1000000)
        {
            return uCounter * 1000ull;
        }
        else if (uFreq == 100000)
        {
            return uCounter * 10000ull;
        }
        else if (uFreq == 100000000ull)
        {
            return uCounter * 10ull;
        }
        else if (uFreq == 1000000000ull)
        {
            return uCounter;
        }
        else
        {
            const long long uWhole = (uCounter / uFreq) * 1'000'000'000ull;
            const long long uPart = (uCounter % uFreq) * 1'000'000'000ull / uFreq;
            return uWhole + uPart;
        }
    }

    static AuUInt64 RdtscArmEmulated(AuUInt64 uClockFreq)
    {
        return ConvertArmTicks(ArmQueryClockCounter(), uClockFreq) * 4;
        // context:
        // Intel recommends we spin, considering the potential for exponential back-offs later on, with a coefficient based on the CPUID brand of the processor.
        // On most processors, RDTSC is not an instruction counter. That'd be worthless; modern processors are ingesting hundreds of instructions to speculate on.
        // Instead, RDTSC reads back a steady system-wide clock (*). It doesn't scale per core, nor can you overclock it.
        // Back to Intel's recommendation: instead of spamming your process's execution pipeline with mm_pauses in a loop, you should query RDTSC to solve the ABA problem and normalize for changes in the micro-architecture.
        // This does allow Intel to decrease this potentially-NOP mm_pause sleep period by changing the stated base clock.
        // On the aarch64 side of things, we should be able to match the exact Intel behaviour by:
        // * Reading the system-wide clock (CNTVCT_EL0)
        // * Normalizing to nanoseconds with the given frequency (CNTFRQ_EL0)
        // * Dividing by approx "3.6 GHz" ops/ns
        // *: Ok, technically you can/need to verify Invariant TSC: CPUID.80000007H:EDX[8], but who actually cares?
    }
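
    // Worked example of the conversion above (illustrative only; a 24 MHz CNTFRQ_EL0 is assumed
    // purely for the arithmetic): one counter tick is 1e9 / 24e6 ≈ 41.67 ns, so
    // ConvertArmTicks(uCounter, 24000000) ≈ uCounter * 41.67 ns, and the `* 4` above then maps
    // those nanoseconds onto a ~4 GHz TSC-like tick rate, i.e. one ARM tick ≈ 167 emulated ticks.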

    #define __rdtsc() RdtscArmEmulated(uClockFreq)
    #define ALT_RDT
    #endif

    template <typename T>
    bool auline YieldToSharedCore(long spin, T callback)
    {
        if (callback())
        {
            return true;
        }

        #if defined(AURORA_ARCH_ARM)
        AuUInt64 uClockFreq { ArmQueryClockFrequency() };
        #endif

        if (kEnableSmartScheduling)
        {
            bool bRet { false };
            auto uWord = SMTGetAPICNumber();
            if (uWord < AuArraySize(gCoreTable) &&
                uWord < ThrdCfg::gCountOfPCores)
            {
                AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 1u);

                auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
                if (!gSpinAdaptiveThreshold || uNow <= gSpinAdaptiveThreshold)
                {
                    auto uCount = spin;

                    if (AuAtomicLoad(&gCoreTable[uWord ^ 1]))
                    {
                        uCount /= 5;
                    }
                    else if (gHasThreadLocalTimeout)
                    {
                        uCount += tlsSpinCountLocal;
                    }

                    #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
                    if (ThrdCfg::gIsIntelAlderLakeOrGreater)
                    {
                        if (callback())
                        {
                            bRet = true;
                        }
                        else
                        {
                            _tpause(0, __rdtsc() + uCount);
                            bRet = callback();
                        }
                    }
                    else if (ThrdCfg::gIsZen3OrGreater)
                    {
                        if (callback())
                        {
                            bRet = true;
                        }
                        else
                        {
                            _mm_monitorx((void *)&kMassiveBlock, 0U, 0U);
                            _mm_mwaitx(kMWAITXUseTSC, 0, uCount);
                            bRet = callback();
                        }
                    }
                    else
                    #endif
                    {
                        #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                        auto perfCounter = __rdtsc() + uCount;
                        while (__rdtsc() < perfCounter)
                        #else
                        while (uCount > 0)
                        #endif
                        {
                            if (callback())
                            {
                                bRet = true;
                                break;
                            }
                            else
                            {
                                SMPPause();
                                uCount--;
                            }
                        }
                    }
                }
                AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 0u);
                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
            }

            return bRet;
        }
        else if (gSpinAdaptiveThreshold)
        {
            auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);

            if (uNow <= gSpinAdaptiveThreshold)
            {
                auto uCount = spin;
                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
                #else
                while (uCount > 0)
                #endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                        #if defined(SPIN_FOUR) && SPIN_FOUR == 1
                        SMPPause();
                        SMPPause();
                        SMPPause();
                        SMPPause();
                        uCount -= 4;
                        #else
                        SMPPause();
                        uCount -= 1;
                        #endif
                    }
                }

                if (gHasThreadLocalTimeout)
                {
                    auto uCount = tlsSpinCountLocal;
                    #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                    auto perfCounter = __rdtsc() + uCount;
                    while (__rdtsc() < perfCounter)
                    #else
                    while (uCount > 0)
                    #endif
                    {
                        if (callback())
                        {
                            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                            return true;
                        }
                        else
                        {
                            SMPPause();
                            uCount--;
                        }
                    }
                }

                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
            }
            else if (uNow <= (gSpinAdaptiveThreadCount / 4 * 3))
            {
                auto uCount = (spin) / 3;
                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
                #else
                while (uCount > 0)
                #endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }

            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
        }
        else
        {
            auto uCount = spin;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
            auto perfCounter = __rdtsc() + uCount;
            while (__rdtsc() < perfCounter)
            #else
            while (uCount > 0)
            #endif
            {
                if (callback())
                {
                    return true;
                }
                else
                {
                    #if defined(SPIN_FOUR) && SPIN_FOUR == 1
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    uCount -= 4;
                    #else
                    SMPPause();
                    uCount -= 1;
                    #endif
                }
            }

            if (gHasThreadLocalTimeout)
            {
                auto uCount = tlsSpinCountLocal;
                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
                #else
                while (uCount > 0)
                #endif
                {
                    if (callback())
                    {
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }
        }

        return callback();
    }

    template <typename T>
    bool auline YieldToSharedCoreAlderLake(long spin, T callback, const void *pWord)
    {
        if (callback())
        {
            return true;
        }

        #if defined(AURORA_ARCH_ARM)
        AuUInt64 uClockFreq { ArmQueryClockFrequency() };
        #endif

        if (kEnableSmartScheduling)
        {
            bool bRet { false };
            auto uWord = SMTGetAPICNumber();
            if (uWord < AuArraySize(gCoreTable) &&
                uWord < ThrdCfg::gCountOfPCores)
            {
                AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 1u);

                auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
                if (!gSpinAdaptiveThreshold || uNow <= gSpinAdaptiveThreshold)
                {
                    auto uCount = spin;
                    bool bSMTProbablyHit {};

                    if (AuAtomicLoad(&gCoreTable[uWord ^ 1]))
                    {
                        uCount /= 5;
                        bSMTProbablyHit = true;
                    }
                    else if (gHasThreadLocalTimeout)
                    {
                        uCount += tlsSpinCountLocal;
                    }

                    #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
                    if (ThrdCfg::gIsIntelAlderLakeOrGreater)
                    {
                        _umonitor((void *)AuPageRound<AuUInt>(AuUInt(pWord), AuHWInfo::GetCPUInfo().dwCacheLine));

                        if (callback())
                        {
                            bRet = true;
                        }
                        else
                        {
                            _umwait(/*0*/ /*1*/ bSMTProbablyHit ? 0 : 1, __rdtsc() + uCount);
                            bRet = callback();
                        }
                    }
                    else if (ThrdCfg::gIsZen3OrGreater)
                    {
                        _mm_monitorx((void *)pWord, 0U, 0U);

                        if (callback())
                        {
                            bRet = true;
                        }
                        else
                        {
                            _mm_mwaitx(kMWAITXWaitOnStoreTimed, 0, uCount);
                            bRet = callback();
                        }
                    }
                    else
                    #endif
                    {
                        #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                        auto perfCounter = __rdtsc() + uCount;
                        while (__rdtsc() < perfCounter)
                        #else
                        while (uCount > 0)
                        #endif
                        {
                            if (callback())
                            {
                                bRet = true;
                                break;
                            }
                            else
                            {
                                SMPPause();
                                uCount--;
                            }
                        }
                    }
                }
                AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 0u);
                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
            }

            return bRet;
        }
        else if (gSpinAdaptiveThreshold)
        {
            auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);

            if (uNow <= gSpinAdaptiveThreshold)
            {
                auto uCount = spin;
                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
                #else
                while (uCount > 0)
                #endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                        #if defined(SPIN_FOUR) && SPIN_FOUR == 1
                        SMPPause();
                        SMPPause();
                        SMPPause();
                        SMPPause();
                        uCount -= 4;
                        #else
                        SMPPause();
                        uCount -= 1;
                        #endif
                    }
                }

                if (gHasThreadLocalTimeout)
                {
                    auto uCount = tlsSpinCountLocal;
                    #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                    auto perfCounter = __rdtsc() + uCount;
                    while (__rdtsc() < perfCounter)
                    #else
                    while (uCount > 0)
                    #endif
                    {
                        if (callback())
                        {
                            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                            return true;
                        }
                        else
                        {
                            SMPPause();
                            uCount--;
                        }
                    }
                }

                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
            }
            else if (uNow <= (gSpinAdaptiveThreadCount / 4 * 3))
            {
                auto uCount = (spin) / 3;
                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
                #else
                while (uCount > 0)
                #endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }

            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
        }
        else
        {
            auto uCount = spin;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
            auto perfCounter = __rdtsc() + uCount;
            while (__rdtsc() < perfCounter)
            #else
            while (uCount > 0)
            #endif
            {
                if (callback())
                {
                    return true;
                }
                else
                {
                    #if defined(SPIN_FOUR) && SPIN_FOUR == 1
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    uCount -= 4;
                    #else
                    SMPPause();
                    uCount -= 1;
                    #endif
                }
            }

            if (gHasThreadLocalTimeout)
            {
                auto uCount = tlsSpinCountLocal;
                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
                #else
                while (uCount > 0)
                #endif
                {
                    if (callback())
                    {
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }
        }

        return callback();
    }

    template <typename T>
    bool auline DoTryIf(T callback)
    {
        if (ThrdCfg::gPlatformIsSMPProcessorOptimized)
        {
            return YieldToSharedCore(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback);
        }
        else
        {
            return callback();
        }
    }

    template <typename T>
    bool auline DoTryIfAlderLake(T callback, const void *pWord)
    {
        if (ThrdCfg::gPlatformIsSMPProcessorOptimized)
        {
            return YieldToSharedCoreAlderLake(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback, pWord);
        }
        else
        {
            return callback();
        }
    }

    template <typename T>
    bool auline DoTryIfAlderLake(T callback, const volatile void *pWord)
    {
        if (ThrdCfg::gPlatformIsSMPProcessorOptimized)
        {
            return YieldToSharedCoreAlderLake(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback, (const void *)pWord);
        }
        else
        {
            return callback();
        }
    }
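
    // Usage sketch (hypothetical caller; WaitUntilSignaled and uState are invented for
    // illustration): primitives funnel their fast paths through DoTryIf/DoTryIfAlderLake so the
    // shared spin budget, SMT awareness, and UMWAIT/MWAITX paths above apply uniformly, e.g.
    //
    //     bool WaitUntilSignaled(AuUInt32 *pState)
    //     {
    //         return DoTryIfAlderLake([&] {
    //             return AuAtomicLoad(pState) != 0;
    //         }, pState);
    //     }
    //
    // A false return tells the caller to fall through to its OS-level wait path.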
}