/***
    Copyright (C) 2023 - 2024 J Reece Wilson (a/k/a "Reece"). All rights reserved.

    File: SMTYield.hpp
    Date: 2023-3-12
    Author: Reece
***/
#pragma once

// Whatever, I'll use this header to bleed these flags in.
// It's the easiest way to reach all of the translation units compiling thread primitives.
// ...not required
#if defined(AURORA_COMPILER_MSVC)
    #pragma strict_gs_check(off)
    #pragma check_stack(off)
#endif
// The compiler is emitting stack checks under a non-zero amount of my thread primitives.
// "Overcautious" doesn't quite cover it when a spinlock-lock built on an atomic
// bit-test-and-set is deemed worth a stack check.

#if defined(AURORA_COMPILER_CLANG) || defined(AURORA_COMPILER_GCC)
    #pragma GCC optimize("no-stack-protector")
#endif

#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)) && \
    !defined(AURORA_COMPILER_MSVC) && \
    !defined(AURORA_COMPILER_INTEL) && \
    !defined(AURORA_A_GOOD_COMPILER_PLS)

// Even if Clang (and GCC) have these intrinsics available, you must enable them
// globally, unlike SSE for some reason. We can do runtime branching around SSE4
// paths no problem; why, all of a sudden, are we gated out of the intrinsics we
// elect to use by hand? No, compiler, you may not use these in your baseline
// feature set (or in STL locks); yes, we still want them. I will not raise our
// baseline requirements above Ivy Bridge, and I will not expose feature macros
// to the STL (et al) that would bump those requirements up to the latest Intel
// and AMD parts. If these encodings end up being wrong, blame the Clang and GNU
// toolchains, not me.
static auline void __mm_monitorx(void *__p, unsigned __extensions, unsigned __hints)
{
    // MONITORX (0F 01 FA): arm the AMD address-range monitor on the address in rAX.
    asm volatile(".byte 0x0f, 0x01, 0xfa;" :
                 : "a"(__p),
                   "c"(__extensions),
                   "d"(__hints));
}

static auline void __mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
{
    // MWAITX (0F 01 FB): wait on the armed monitor; EBX carries the optional
    // TSC-relative timeout when ECX bit 1 (kMWAITXUseTSC) is set.
    asm volatile(".byte 0x0f, 0x01, 0xfb;" :
                 : "a"(__hints),
                   "b"(__clock),
                   "c"(__extensions));
}
static auline void __umonitor(void *__address)
{
    // UMONITOR rax (F3 0F AE F0): arm the user-mode monitor on the address in rAX.
    // (ModRM must be 0xF0 for the rAX register form of F3 0F AE /6.)
    __asm__ volatile(".byte 0xF3, 0x0F, 0xAE, 0xF0;" :
                     : "a"(__address)
                     :);
}
static auline unsigned char __umwait(unsigned int __control, unsigned long long __counter)
{
    AuUInt32 uTimeHi = AuUInt32(__counter >> 32);
    AuUInt32 uTimeLo = AuUInt32(__counter & 0xffffffff);
    char flag;
    // UMWAIT rcx (F2 0F AE F1): wait for a store to the monitored line or the TSC
    // deadline in EDX:EAX; ECX selects the requested C-state. SETB captures CF,
    // which reports how the wait ended.
    __asm__ volatile(".byte 0xF2, 0x0F, 0xAE, 0xF1 \n"
                     "setb %0"
                     : "=r"(flag)
                     : "a"(uTimeLo),
                       "d"(uTimeHi),
                       "c"(__control)
                     :);
    return flag;
}

static auline unsigned char __tpause(unsigned int __control, unsigned long long __counter)
{
    AuUInt32 uTimeHi = AuUInt32(__counter >> 32);
    AuUInt32 uTimeLo = AuUInt32(__counter & 0xffffffff);
    char flag;
    // TPAUSE rcx (66 0F AE F1): pause until the TSC deadline in EDX:EAX, in the
    // C-state requested by ECX; CF (via SETB) reports how the wait ended.
    __asm__ volatile(".byte 0x66, 0x0F, 0xAE, 0xF1 \n"
                     "setb %0"
                     : "=r"(flag)
                     : "a"(uTimeLo),
                       "d"(uTimeHi),
                       "c"(__control)
                     :);
    return flag;
}

#define _mm_monitorx __mm_monitorx
#define _mm_mwaitx __mm_mwaitx
#define _umonitor __umonitor
#define _umwait __umwait
#define _tpause __tpause

#endif
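
// Usage sketch (illustrative only, not part of this header's API): a bounded,
// monitor-armed wait on a flag word. Assumes WAITPKG support has already been
// verified at runtime; `IsSet` is a hypothetical predicate over the word.
//
//  static bool WaitOnWordBriefly(const AuUInt32 &word)
//  {
//      _umonitor((void *)&word);                       // arm the monitor on word's cache line
//      if (IsSet(word))
//      {
//          return true;                                // already signalled; skip the wait
//      }
//      _umwait(1u /* C0.1 */, __rdtsc() + 100000ull);  // wake on store or TSC deadline
//      return IsSet(word);
//  }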

namespace Aurora::Threading
{
    inline AuUInt32 gHasThreadLocalTimeout {};
    inline thread_local AuUInt32 tlsSpinCountLocal {};
}

extern "C"
{
    AuUInt32 SMTGetAPICNumber(void);
}

#include <Source/Extensions/Clocks.aarch64.hpp>

#define SPIN_FOUR 1

#define while_bc___(exp, ex) \
    AuUInt32 __wsc##ex {}; \
    while ((exp) && ((__wsc##ex++) < 2))
#define while_bc__(exp, ex) while_bc___(exp, ex)
#define while_bc_(exp, ex) while_bc__(exp, AU_WHAT(ex))

#if defined(AURORA_RUNTIME_ALWAYS_SPIN_ON_BROADCAST)
    #define while_bc(exp) while (exp)
#elif defined(AURORA_RUNTIME_ALWAYS_CHECK_ONCE_ON_BROADCAST)
    #define while_bc(exp) if (exp)
#elif defined(__COUNTER__)
    #define while_bc(exp) while_bc_(exp, __COUNTER__)
#else
    #define while_bc(exp) while_bc_(exp, __LINE__)
#endif

// while_bc replaces condition-variable broadcast loops of the form
// `if/while (waiters) { signal(); }`. It attempts to rebroadcast if another
// sleeper turns up. On unicore systems, depending on the scheduler, spinning in
// this fashion can deadlock in a tight enough wait loop. On other systems,
// `while (waiters) { signal(); }` can improve performance when other threads are
// tying up the system scheduler on the wake side: some threads can arrive late,
// and we don't have to worry so much about a wake-up spin lock under real-world
// use. To strike a balance between the two conditions, we accept a little extra
// branching overhead to ensure we don't spin more than 2-3 times.
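//
// Example (illustrative; uSleepers and SignalAll are hypothetical members of a
// condvar-like primitive): wake sleepers, rebroadcasting a bounded number of
// times if new waiters race in:
//
//  while_bc (AuAtomicLoad(&this->uSleepers))
//  {
//      this->SignalAll();
//  }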

namespace Aurora::Threading::Primitives
{
    namespace ThrdCfg
    {
        inline bool gPlatformIsSMPProcessorOptimized {}; // to include or not to include 🤔
        inline bool gEnableAggressiveScheduling {};
        inline bool gEnableAgrSchedulingRatelimit {};
        inline bool gPreferNtCondvarModernWinSpin {};
        inline bool gPreferNtCondvarOlderWinSpin {};
        inline bool gPreferNtSemaphoreSpinTryLock {};
        inline bool gPreferNtMutexSpinTryLock {};
        inline bool gPreferNtCondMutexSpinTryLock {};
        inline bool gPreferLinuxSemaphoreSpinTryLock {};
        inline bool gPreferLinuxMutexSpinTryLock {};
        inline bool gPreferLinuxCondMutexSpinTryLock {};
        inline bool gPreferEmulatedWakeOnAddress {};
        inline bool gPreferWaitOnAddressAlwaysSpin {};
        inline bool gPreferWaitOnAddressAlwaysSpinNative {};
        inline bool gPreferRWLockReadLockSpin {};
        inline bool gUWPNanosecondEmulationCheckFirst {};
        inline AuUInt32 gUWPNanosecondEmulationMaxYields {};
        inline bool gForceEnableAdaptiveSpin {};
        inline bool gPreferEnableAdaptiveSpin {};
        inline bool gPreferLinuxAdaptiveSpin {};
        inline bool gPreferOldWin32AdaptiveSpin {};
        inline bool gPreferNewWin32AdaptiveSpin {};
        inline AuUInt32 gAdaptiveSpinCUCnt0 {};
        inline AuUInt32 gAdaptiveSpinCUCnt4 {};
        inline AuUInt32 gAdaptiveSpinCUCnt8 {};
        inline AuUInt32 gAdaptiveSpinCUCnt16 {};
        inline bool gPreferFutexRWLock {};
        inline bool gPreferFutexEvent {};
        inline bool gWinXpThrough7BlazeOptimizerPower {};
        inline bool gPreferLinuxPrimitivesFutexNoSpin {};
        inline bool gPreferUnixPrimitivesNoSpin {};
        inline bool gAlwaysRWLockWriteBiasOnReadLock {};
        inline bool gEnableRWLockWriteBiasOnReadLock {};
        inline AuUInt32 gIsIntelAlderLakeOrGreater {};
        inline AuUInt32 gIsZen3OrGreater {};
        inline AuUInt8 gCountOfPCores {};
    }

    #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
    using SMTAtomic_t = AuUInt8;
    #else
    using SMTAtomic_t = AuUInt32;
    #endif

    inline SMTAtomic_t gCoreTable[256] {};
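
    // A gCoreTable slot is set to 1 while a thread spins on that logical core
    // (indexed by APIC number). The `uWord ^ 1` lookups below assume SMT siblings
    // occupy adjacent APIC IDs, so a set neighbour slot means our hyperthread
    // twin is busy spinning and our own spin budget should be cut down.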

    inline AuUInt32 gSpinAdaptiveThreshold {};
    inline AuUInt32 gSpinAdaptiveCurrentCount {};
    inline AuUInt32 gSpinAdaptiveThreadCount {};

    inline AuUInt32 gUseFutexRWLock {};
    inline AuUInt32 gPreferFutexEvent {};
    static constexpr AuUInt32 kMWAITXUseTSC = (1 << 1);          // ECX bit 1: honour the EBX TSC-relative timeout
    static constexpr AuUInt32 kMWAITXAllowInterrupts = (1 << 0); // ECX bit 0: allow interrupts to wake us
    static constexpr AuUInt32 kMWAITXFaultGP0 = (1u << 31);      // reserved bit: setting it raises #GP(0)
    static constexpr AuUInt32 kMWAITXWaitOnStore = 0;
    static constexpr AuUInt32 kMWAITXWaitOnStoreTimed = kMWAITXWaitOnStore | kMWAITXUseTSC;

    // A dummy, aligned cache line to park MONITORX on when there is no meaningful
    // address to watch.
    struct Blocky
    {
        SMTAtomic_t a;
    };
    inline const AuAlignTo<16, Blocky> kMassiveBlock;

    void InitAdaptiveThreshold();
    void InitAdaptiveThresholdFirstTime();
    void InitCfg();
    static const bool kEnableSmartScheduling =
    #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
        true;
    #else
        // TBD by arch and OS
        false;
    #endif

    static auline void SMPPause()
    {
    #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
        _mm_pause();
    #elif defined(AURORA_ARCH_ARM)
        #if defined(AURORA_COMPILER_GCC)
        asm volatile("yield");
        #else
        __yield();
        #endif
    #else
        // TODO: your platform here
        AuThreading::ContextYield();
    #endif
    }

    #if defined(AURORA_ARCH_ARM)
    static AuUInt64 ConvertArmTicks(AuUInt64 uCounter, AuUInt64 uFreq)
    {
        if (uFreq == 10000000)
        {
            return uCounter * 100ull;
        }
        else if (uFreq == 1000000)
        {
            return uCounter * 1000ull;
        }
        else if (uFreq == 100000)
        {
            return uCounter * 10000ull;
        }
        else if (uFreq == 100000000ull)
        {
            return uCounter * 10ull;
        }
        else if (uFreq == 1000000000ull)
        {
            return uCounter;
        }
        else
        {
            const long long uWhole = (uCounter / uFreq) * 1'000'000'000ull;
            const long long uPart = (uCounter % uFreq) * 1'000'000'000ull / uFreq;
            return uWhole + uPart;
        }
    }
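
    // For example, a 24 MHz CNTFRQ_EL0 (a common consumer aarch64 configuration)
    // takes the generic path: 24'000'000 ticks -> uWhole = 1'000'000'000 ns, uPart = 0.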

    static AuUInt64 RdtscArmEmulated(AuUInt64 uClockFreq)
    {
        return ConvertArmTicks(ArmQueryClockCounter(), uClockFreq) * 4;
        // Context:
        // Intel recommends we spin, considering the potential for exponential
        // back-offs later on, with a coefficient based on the CPUID brand of the
        // processor. On most processors, RDTSC is not an instruction counter;
        // that'd be worthless, given that modern processors ingest hundreds of
        // instructions to speculate on. Instead, RDTSC reads back a steady
        // system-wide clock (*). It doesn't scale per core, nor can you overclock it.
        // Back to Intel's recommendation: instead of spamming the execution
        // pipeline with _mm_pauses in a loop, you should query RDTSC to solve the
        // ABA problem and normalize for changes in the micro-architecture. This
        // also allows Intel to shorten this potentially-NOP _mm_pause sleep
        // period by changing the stated base clock.
        // On the aarch64 side of things, we should be able to match the exact
        // Intel behaviour by:
        // * Reading the system-wide clock (CNTVCT_EL0)
        // * Normalizing to nanoseconds with the given frequency (CNTFRQ_EL0)
        // * Dividing by approx "3.6 GHz" ops/ns
        // (*): Ok, technically you can/need to verify Invariant TSC
        //      (CPUID.80000007H:EDX[8]), but who actually cares?
    }
    // NB: this macro expands against a local uClockFreq in scope at the call site.
    #define __rdtsc() RdtscArmEmulated(uClockFreq)
    #define ALT_RDT
    #endif
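
    // Spins on `callback` for up to roughly `spin` iterations (a TSC deadline on
    // x86), cutting the budget when the SMT sibling core is already spinning and
    // backing off as gSpinAdaptiveThreshold is approached or exceeded.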
    template <typename T>
    bool auline YieldToSharedCore(long spin, T callback)
    {
        if (callback())
        {
            return true;
        }

    #if defined(AURORA_ARCH_ARM)
        AuUInt64 uClockFreq { ArmQueryClockFrequency() };
    #endif

        if (kEnableSmartScheduling)
        {
            bool bRet { false };
            auto uWord = SMTGetAPICNumber();
            if (uWord < AuArraySize(gCoreTable) &&
                uWord < ThrdCfg::gCountOfPCores)
            {
                AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 1u);
                auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);

                if (!gSpinAdaptiveThreshold || uNow <= gSpinAdaptiveThreshold)
                {
                    auto uCount = spin;
                    if (AuAtomicLoad(&gCoreTable[uWord ^ 1]))
                    {
                        uCount /= 5;
                    }
                    else if (gHasThreadLocalTimeout)
                    {
                        uCount += tlsSpinCountLocal;
                    }
                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
                    if (ThrdCfg::gIsIntelAlderLakeOrGreater)
                    {
                        if (callback())
                        {
                            bRet = true;
                        }
                        else
                        {
                            _tpause(0, __rdtsc() + uCount);
                            bRet = callback();
                        }
                    }
                    else if (ThrdCfg::gIsZen3OrGreater)
                    {
                        if (callback())
                        {
                            bRet = true;
                        }
                        else
                        {
                            _mm_monitorx((void *)&kMassiveBlock, 0U, 0U);
                            _mm_mwaitx(kMWAITXUseTSC, 0, uCount);
                            bRet = callback();
                        }
                    }
                    else
                #endif
                    {
                    #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                        auto perfCounter = __rdtsc() + uCount;
                        while (__rdtsc() < perfCounter)
                    #else
                        while (uCount > 0)
                    #endif
                        {
                            if (callback())
                            {
                                bRet = true;
                                break;
                            }
                            else
                            {
                                SMPPause();
                                uCount--;
                            }
                        }
                    }
                }

                AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 0u);
                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
            }

            return bRet;
        }
        else if (gSpinAdaptiveThreshold)
        {
            auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
            if (uNow <= gSpinAdaptiveThreshold)
            {
                auto uCount = spin;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                    #if defined(SPIN_FOUR) && SPIN_FOUR == 1
                        SMPPause();
                        SMPPause();
                        SMPPause();
                        SMPPause();
                        uCount -= 4;
                    #else
                        SMPPause();
                        uCount -= 1;
                    #endif
                    }
                }

                if (gHasThreadLocalTimeout)
                {
                    auto uCount = tlsSpinCountLocal;
                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                    auto perfCounter = __rdtsc() + uCount;
                    while (__rdtsc() < perfCounter)
                #else
                    while (uCount > 0)
                #endif
                    {
                        if (callback())
                        {
                            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                            return true;
                        }
                        else
                        {
                            SMPPause();
                            uCount--;
                        }
                    }
                }

                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
            }
            else if (uNow <= (gSpinAdaptiveThreadCount / 4 * 3))
            {
                auto uCount = (spin) / 3;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }

            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
        }
        else
        {
            auto uCount = spin;
        #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
            auto perfCounter = __rdtsc() + uCount;
            while (__rdtsc() < perfCounter)
        #else
            while (uCount > 0)
        #endif
            {
                if (callback())
                {
                    return true;
                }
                else
                {
                #if defined(SPIN_FOUR) && SPIN_FOUR == 1
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    uCount -= 4;
                #else
                    SMPPause();
                    uCount -= 1;
                #endif
                }
            }

            if (gHasThreadLocalTimeout)
            {
                auto uCount = tlsSpinCountLocal;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }
        }

        return callback();
    }
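
    // As YieldToSharedCore, but arms a hardware monitor on pWord where available
    // (UMONITOR on Alder Lake+, MONITORX on Zen 3+), so the wait can end early on
    // a store to the watched cache line rather than only on the TSC deadline.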
    template <typename T>
    bool auline YieldToSharedCoreAlderLake(long spin, T callback, const void *pWord)
    {
        if (callback())
        {
            return true;
        }

    #if defined(AURORA_ARCH_ARM)
        AuUInt64 uClockFreq { ArmQueryClockFrequency() };
    #endif

        if (kEnableSmartScheduling)
        {
            bool bRet { false };
            auto uWord = SMTGetAPICNumber();
            if (uWord < AuArraySize(gCoreTable) &&
                uWord < ThrdCfg::gCountOfPCores)
            {
                AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 1u);
                auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);

                if (!gSpinAdaptiveThreshold || uNow <= gSpinAdaptiveThreshold)
                {
                    auto uCount = spin;
                    bool bSMTProbablyHit {};
                    if (AuAtomicLoad(&gCoreTable[uWord ^ 1]))
                    {
                        uCount /= 5;
                        bSMTProbablyHit = true;
                    }
                    else if (gHasThreadLocalTimeout)
                    {
                        uCount += tlsSpinCountLocal;
                    }
                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
                    if (ThrdCfg::gIsIntelAlderLakeOrGreater)
                    {
                        _umonitor((void *)AuPageRound<AuUInt>(AuUInt(pWord), AuHWInfo::GetCPUInfo().dwCacheLine));
                        if (callback())
                        {
                            bRet = true;
                        }
                        else
                        {
                            // Control bit 0: 1 requests the lighter C0.1 state, 0
                            // the deeper C0.2. Sleep deeper when the SMT sibling
                            // is busy.
                            _umwait(bSMTProbablyHit ? 0 : 1, __rdtsc() + uCount);
                            bRet = callback();
                        }
                    }
2024-05-05 18:42:10 +00:00
else if ( ThrdCfg : : gIsZen3OrGreater )
{
2024-05-07 13:41:16 +00:00
_mm_monitorx ( ( void * ) pWord , 0U , 0U ) ;
2024-05-05 18:42:10 +00:00
if ( callback ( ) )
{
bRet = true ;
}
else
{
2024-05-06 21:04:26 +00:00
_mm_mwaitx ( kMWAITXWaitOnStoreTimed , 0 , uCount ) ;
2024-05-05 18:42:10 +00:00
bRet = callback ( ) ;
}
}
2024-05-03 11:14:52 +00:00
else
# endif
                    {
                    #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                        auto perfCounter = __rdtsc() + uCount;
                        while (__rdtsc() < perfCounter)
                    #else
                        while (uCount > 0)
                    #endif
                        {
                            if (callback())
                            {
                                bRet = true;
                                break;
                            }
                            else
                            {
                                SMPPause();
                                uCount--;
                            }
                        }
                    }
                }

                AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 0u);
                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
            }

            return bRet;
        }
        else if (gSpinAdaptiveThreshold)
        {
            auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
            if (uNow <= gSpinAdaptiveThreshold)
            {
                auto uCount = spin;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                    #if defined(SPIN_FOUR) && SPIN_FOUR == 1
                        SMPPause();
                        SMPPause();
                        SMPPause();
                        SMPPause();
                        uCount -= 4;
                    #else
                        SMPPause();
                        uCount -= 1;
                    #endif
                    }
                }

                if (gHasThreadLocalTimeout)
                {
                    auto uCount = tlsSpinCountLocal;
                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                    auto perfCounter = __rdtsc() + uCount;
                    while (__rdtsc() < perfCounter)
                #else
                    while (uCount > 0)
                #endif
                    {
                        if (callback())
                        {
                            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                            return true;
                        }
                        else
                        {
                            SMPPause();
                            uCount--;
                        }
                    }
                }

                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
            }
            else if (uNow <= (gSpinAdaptiveThreadCount / 4 * 3))
            {
                auto uCount = (spin) / 3;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }

            AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
        }
        else
        {
            auto uCount = spin;
        #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
            auto perfCounter = __rdtsc() + uCount;
            while (__rdtsc() < perfCounter)
        #else
            while (uCount > 0)
        #endif
            {
                if (callback())
                {
                    return true;
                }
                else
                {
                #if defined(SPIN_FOUR) && SPIN_FOUR == 1
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    SMPPause();
                    uCount -= 4;
                #else
                    SMPPause();
                    uCount -= 1;
                #endif
                }
            }

            if (gHasThreadLocalTimeout)
            {
                auto uCount = tlsSpinCountLocal;
            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                auto perfCounter = __rdtsc() + uCount;
                while (__rdtsc() < perfCounter)
            #else
                while (uCount > 0)
            #endif
                {
                    if (callback())
                    {
                        return true;
                    }
                    else
                    {
                        SMPPause();
                        uCount--;
                    }
                }
            }
        }

        return callback();
    }

    template <typename T>
    bool auline DoTryIf(T callback)
    {
        if (ThrdCfg::gPlatformIsSMPProcessorOptimized)
        {
            return YieldToSharedCore(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback);
        }
        else
        {
            return callback();
        }
    }

    template <typename T>
    bool auline DoTryIfAlderLake(T callback, const void *pWord)
    {
        if (ThrdCfg::gPlatformIsSMPProcessorOptimized)
        {
            return YieldToSharedCoreAlderLake(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback, pWord);
        }
        else
        {
            return callback();
        }
    }

    template <typename T>
    bool auline DoTryIfAlderLake(T callback, const volatile void *pWord)
    {
        if (ThrdCfg::gPlatformIsSMPProcessorOptimized)
        {
            return YieldToSharedCoreAlderLake(gRuntimeConfig.threadingConfig.uSpinLoopPowerA, callback, (const void *)pWord);
        }
        else
        {
            return callback();
        }
    }
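
    // Usage sketch (illustrative only; std::atomic keeps the example self-contained):
    // a spin-then-give-up acquire over a lock word, of the sort a futex-backed
    // primitive would attempt before sleeping in the kernel.
    //
    //  static bool TrySpinAcquire(std::atomic<AuUInt32> &uLockWord)
    //  {
    //      return DoTryIfAlderLake([&]()
    //      {
    //          return uLockWord.exchange(1u, std::memory_order_acquire) == 0u;
    //      }, &uLockWord);
    //  }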
}