From 49a617301106e9f2dca26d1d805a9b1496d05a00 Mon Sep 17 00:00:00 2001
From: Jamie Reece Wilson
Date: Tue, 2 Jan 2024 02:49:23 +0000
Subject: [PATCH] [+] Improved SMT yielding
 [+] Clocks.aarch64.[h/c]pp

---
 Source/Extensions/Clocks.aarch64.cpp     |  30 +++++
 Source/Extensions/Clocks.aarch64.hpp     |  11 ++
 Source/HWInfo/AuCpuId.cpp                |  20 +---
 Source/HWInfo/AuCpuId.hpp                |  14 +++
 Source/Threading/Primitives/SMTYield.cpp |   8 ++
 Source/Threading/Primitives/SMTYield.hpp | 135 ++++++++++++++++++++++-
 6 files changed, 198 insertions(+), 20 deletions(-)
 create mode 100644 Source/Extensions/Clocks.aarch64.cpp
 create mode 100644 Source/Extensions/Clocks.aarch64.hpp

diff --git a/Source/Extensions/Clocks.aarch64.cpp b/Source/Extensions/Clocks.aarch64.cpp
new file mode 100644
index 00000000..8fb1f46e
--- /dev/null
+++ b/Source/Extensions/Clocks.aarch64.cpp
@@ -0,0 +1,30 @@
+/***
+    Copyright (C) 2024 J Reece Wilson (a/k/a "Reece"). All rights reserved.
+
+    File: Clocks.aarch64.cpp
+    Date: 2024-01-02
+    Author: Reece
+***/
+#include 
+
+AUKN_SYM AuUInt64 ArmQueryClockCounter()
+{
+    AuUInt64 uValue {};
+#if defined(AURORA_COMPILER_MSVC)
+    uValue = _ReadStatusReg(CNTVCT_EL0);
+#else
+    asm volatile ("mrs %0, CNTVCT_EL0; isb; " : "=r" (uValue));
+#endif
+    return uValue;
+}
+
+AUKN_SYM AuUInt64 ArmQueryClockFrequency()
+{
+    AuUInt64 uClockFreq {};
+#if defined(AURORA_COMPILER_MSVC)
+    uClockFreq = _ReadStatusReg(CNTFRQ_EL0);
+#else
+    asm volatile ("mrs %0, CNTFRQ_EL0; isb; " : "=r" (uClockFreq));
+#endif
+    return uClockFreq;
+}
diff --git a/Source/Extensions/Clocks.aarch64.hpp b/Source/Extensions/Clocks.aarch64.hpp
new file mode 100644
index 00000000..3c0a2ef7
--- /dev/null
+++ b/Source/Extensions/Clocks.aarch64.hpp
@@ -0,0 +1,11 @@
+/***
+    Copyright (C) 2024 J Reece Wilson (a/k/a "Reece"). All rights reserved.
+
+    File: Clocks.aarch64.hpp
+    Date: 2024-01-02
+    Author: Reece
+***/
+#pragma once
+
+AUKN_SYM AuUInt64 ArmQueryClockCounter();
+AUKN_SYM AuUInt64 ArmQueryClockFrequency();
\ No newline at end of file
diff --git a/Source/HWInfo/AuCpuId.cpp b/Source/HWInfo/AuCpuId.cpp
index e41551f9..a23661d1 100644
--- a/Source/HWInfo/AuCpuId.cpp
+++ b/Source/HWInfo/AuCpuId.cpp
@@ -20,41 +20,29 @@ namespace Aurora::HWInfo
     static AuUInt32 gGuessedCores {};
     static AuUInt32 gGuessedThreads {};
 
-    union CPUIdContext
-    {
-        struct
-        {
-            AuUInt32 eax;
-            AuUInt32 ebx;
-            AuUInt32 ecx;
-            AuUInt32 edx;
-        };
-        int regs[4];
-    };
-
 #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
 #if defined(AURORA_COMPILER_MSVC)
-    static CPUIdContext cpuid(AuUInt32 a)
+    CPUIdContext cpuid(AuUInt32 a)
     {
         CPUIdContext context;
         __cpuid(context.regs, a);
         return context;
     }
 #elif defined(AURORA_COMPILER_CLANG) || defined(AURORA_COMPILER_GCC)
-    static CPUIdContext cpuid(AuUInt32 a)
+    CPUIdContext cpuid(AuUInt32 a)
    {
         CPUIdContext context;
         __get_cpuid(a, &context.eax, &context.ebx, &context.ecx, &context.edx);
         return context;
     }
 #else
-    static CPUIdContext cpuid(AuUInt32 a)
+    CPUIdContext cpuid(AuUInt32 a)
     {
         return {};
     }
 #endif
 #else
-    static CPUIdContext cpuid(AuUInt32 a)
+    CPUIdContext cpuid(AuUInt32 a)
     {
         return {};
     }
diff --git a/Source/HWInfo/AuCpuId.hpp b/Source/HWInfo/AuCpuId.hpp
index 9a30845d..b0029306 100644
--- a/Source/HWInfo/AuCpuId.hpp
+++ b/Source/HWInfo/AuCpuId.hpp
@@ -9,5 +9,19 @@
 namespace Aurora::HWInfo
 {
+    union CPUIdContext
+    {
+        struct
+        {
+            AuUInt32 eax;
+            AuUInt32 ebx;
+            AuUInt32 ecx;
+            AuUInt32 edx;
+        };
+        int regs[4];
+    };
+
+    CPUIdContext cpuid(AuUInt32 a);
+
     void SetCpuId();
 }
\ No newline at end of file
diff --git a/Source/Threading/Primitives/SMTYield.cpp b/Source/Threading/Primitives/SMTYield.cpp
index 11fc6b55..f9c65e50 100644
--- a/Source/Threading/Primitives/SMTYield.cpp
+++ b/Source/Threading/Primitives/SMTYield.cpp
@@ -7,6 +7,7 @@
 ***/
 #include 
 #include "SMTYield.hpp"
+#include 
 
 namespace Aurora::Threading
 {
@@ -79,6 +80,8 @@ namespace Aurora::Threading::Primitives
     {
         auto uCores = AuHwInfo::GetCPUInfo().uThreads;
 
+        ThrdCfg::gCountOfPCores = AuHwInfo::GetCPUInfo().maskPCores.CpuBitCount();
+
         bool bPermitWOAInternal = IsNativeWaitOnSupported();
 
         gUseFutexRWLock = ThrdCfg::gPreferFutexRWLock &&
@@ -185,4 +188,9 @@ namespace Aurora::Threading::Primitives
         ThrdCfg::gEnableRWLockWriteBiasOnReadLock = gRuntimeConfig.threadingConfig.bEnableRWLockWriteBiasOnReadLock;
         ThrdCfg::gPreferFutexEvent = gRuntimeConfig.threadingConfig.bPreferFutexEvent;
     }
-}
\ No newline at end of file
+}
+
+extern "C" DWORD SMTGetAPICNumber(void)
+{
+    return AuHwInfo::cpuid(1).ebx >> 24;
+}
\ No newline at end of file
diff --git a/Source/Threading/Primitives/SMTYield.hpp b/Source/Threading/Primitives/SMTYield.hpp
index ca67b36c..969f42f9 100644
--- a/Source/Threading/Primitives/SMTYield.hpp
+++ b/Source/Threading/Primitives/SMTYield.hpp
@@ -13,6 +13,13 @@ namespace Aurora::Threading
     inline thread_local AuUInt32 tlsSpinCountLocal {};
 }
 
+extern "C"
+{
+    DWORD SMTGetAPICNumber(void);
+}
+
+#include 
+
 #define SPIN_FOUR 1
 
 namespace Aurora::Threading::Primitives
@@ -52,7 +59,15 @@ namespace Aurora::Threading::Primitives
     inline bool gPreferUnixPrimitivesNoSpin {};
     inline bool gAlwaysRWLockWriteBiasOnReadLock {};
     inline bool gEnableRWLockWriteBiasOnReadLock {};
+    inline AuUInt8 gCountOfPCores {};
 }
+
+#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
+    using SMTAtomic_t = AuUInt8;
+#else
+    using SMTAtomic_t = AuUInt32;
+#endif
+
+    inline SMTAtomic_t gCoreTable[256] {};
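+    // gCoreTable marks, per logical processor, whether a thread is currently
+    // spinning in YieldToSharedCore below. It is indexed by the initial APIC ID
+    // reported by SMTGetAPICNumber() (CPUID.01H:EBX[31:24] on x86). On common
+    // 2-way SMT parts, the two siblings of a physical core differ only in the
+    // lowest APIC ID bit, so gCoreTable[uWord ^ 1] is a cheap "is my SMT sibling
+    // also spinning?" probe. A one-byte flag suffices on x86, where single-byte
+    // loads and stores are naturally atomic; other targets use a 32-bit word.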
     inline AuUInt32 gSpinAdaptiveThreshold {};
     inline AuUInt32 gSpinAdaptiveCurrentCount {};
@@ -65,6 +80,14 @@
     void InitAdaptiveThresholdFirstTime();
     void InitCfg();
 
+    static const bool kEnableSmartScheduling =
+    #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
+        true;
+    #else
+        // tbd by arch and os
+        false;
+    #endif
+
     static auline void SMPPause()
     {
 #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
@@ -81,6 +104,58 @@
         #endif
     }
 
+#if defined(AURORA_ARCH_ARM)
+
+    static AuUInt64 ConvertArmTicks(AuUInt64 uCounter, AuUInt64 uFreq)
+    {
+        if (uFreq == 10000000)
+        {
+            return uCounter * 100ull;
+        }
+        else if (uFreq == 1000000)
+        {
+            return uCounter * 1000ull;
+        }
+        else if (uFreq == 100000)
+        {
+            return uCounter * 10000ull;
+        }
+        else if (uFreq == 100000000ull)
+        {
+            return uCounter * 10ull;
+        }
+        else if (uFreq == 1000000000ull)
+        {
+            return uCounter;
+        }
+        else
+        {
+            const long long uWhole = (uCounter / uFreq) * 1'000'000'000ull;
+            const long long uPart = (uCounter % uFreq) * 1'000'000'000ull / uFreq;
+            return uWhole + uPart;
+        }
+    }
+
+    static AuUInt64 RdtscArmEmulated(AuUInt64 uClockFreq)
+    {
+        return ConvertArmTicks(ArmQueryClockCounter(), uClockFreq) * 4;
+        // context:
+        // Intel recommends we spin, with the potential for exponential back-off later on, using a coefficient based on the CPUID brand of the processor.
+        // On most processors, RDTSC is not an instruction counter. That would be worthless; modern processors ingest hundreds of instructions to speculate on.
+        // Instead, RDTSC reads back a steady system-wide clock (*). It doesn't scale per core, nor can you overclock it.
+        // Back to Intel's recommendation: instead of spamming your process's execution pipeline with mm_pauses in a loop, you should query RDTSC to solve the ABA problem and normalize for changes in the micro-architecture.
+        // This also lets Intel shorten this potentially-NOP mm_pause sleep period by changing the stated base clock.
+        // On the aarch64 side of things, we should be able to match the Intel behaviour exactly by:
+        // * Reading the system-wide clock (CNTVCT_EL0)
+        // * Normalizing to nanoseconds with the given frequency (CNTFRQ_EL0)
+        // * Scaling nanoseconds by ~4 ticks/ns (the "* 4" above), approximating a modern "~4 GHz" TSC rate
+        // *: Ok, technically you should verify Invariant TSC (CPUID.80000007H:EDX[8]) first, but who actually cares?
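+        //
+        // worked example, with illustrative numbers: if CNTFRQ_EL0 reads 24000000
+        // (24 MHz, a common SoC timer frequency), ConvertArmTicks takes the
+        // general path:
+        //   uWhole = (uCounter / 24000000) * 1'000'000'000
+        //   uPart  = (uCounter % 24000000) * 1'000'000'000 / 24000000
+        // i.e. ~41.67 ns per tick; the "* 4" then rescales nanoseconds into
+        // 0.25 ns units, matching the tick rate of a hypothetical 4 GHz TSC.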
+    }
+
+    #define __rdtsc() RdtscArmEmulated(uClockFreq)
+    #define ALT_RDT
+#endif
+
     template <typename T>
     bool auline YieldToSharedCore(long spin, T callback)
     {
@@ -88,15 +163,67 @@
         {
             return true;
         }
+
+        #if defined(AURORA_ARCH_ARM)
+        AuUInt64 uClockFreq { ArmQueryClockFrequency() };
+        #endif
+
+        if (kEnableSmartScheduling)
+        {
+            bool bRet { false };
+            auto uWord = SMTGetAPICNumber();
+            if (uWord < AuArraySize(gCoreTable) &&
+                uWord < ThrdCfg::gCountOfPCores)
+            {
+                AuAtomicStore(&gCoreTable[uWord], 1u);
 
-        if (gSpinAdaptiveThreshold)
+                auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
+                if (uNow <= gSpinAdaptiveThreshold)
+                {
+                    auto uCount = spin;
+
+                    if (AuAtomicLoad(&gCoreTable[uWord ^ 1]))
+                    {
+                        uCount /= 5;
+                    }
+                    else if (gHasThreadLocalTimeout)
+                    {
+                        uCount += tlsSpinCountLocal;
+                    }
+
+                    #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
+                    auto perfCounter = __rdtsc() + uCount;
+                    while (__rdtsc() < perfCounter)
+                    #else
+                    while (uCount > 0)
+                    #endif
+                    {
+                        if (callback())
+                        {
+                            bRet = true;
+                            break;
+                        }
+                        else
+                        {
+                            SMPPause();
+                            uCount--;
+                        }
+                    }
+                }
+
+                AuAtomicStore(&gCoreTable[uWord], 0u);
+                AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
+            }
+
+            return bRet;
+        }
+        else if (gSpinAdaptiveThreshold)
         {
             auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
             if (uNow <= gSpinAdaptiveThreshold)
             {
                 auto uCount = spin;
 
-                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
+                #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
                 auto perfCounter = __rdtsc() + uCount;
                 while (__rdtsc() < perfCounter)
                 #else
@@ -176,7 +303,7 @@
         else
         {
             auto uCount = spin;
-            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
+            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
             auto perfCounter = __rdtsc() + uCount;
             while (__rdtsc() < perfCounter)
             #else
@@ -205,7 +332,7 @@
         if (gHasThreadLocalTimeout)
         {
             auto uCount = tlsSpinCountLocal;
-            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
+            #if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
             auto perfCounter = __rdtsc() + uCount;
             while (__rdtsc() < perfCounter)
             #else