[+] Improved SMT yielding

[+] Clocks.aarch64.[h/c]pp
Reece Wilson 2024-01-02 02:49:23 +00:00
parent 63050b2262
commit 49a6173011
6 changed files with 198 additions and 20 deletions

View File

@@ -0,0 +1,30 @@
/***
Copyright (C) 2024 J Reece Wilson (a/k/a "Reece"). All rights reserved.
File: Clocks.aarch64.cpp
Date: 2024-01-02
Author: Reece
***/
#include <Source/RuntimeInternal.hpp>
AUKN_SYM AuUInt64 ArmQueryClockCounter()
{
AuUInt64 uValue {};
#if defined(AURORA_COMPILER_MSVC)
uValue = _ReadStatusReg(CNTVCT_EL0);
#else
asm volatile ("mrs %0, CNTVCT_EL0; isb; " : "=r" (uValue));
#endif
return uValue;
}
AUKN_SYM AuUInt64 ArmQueryClockFrequency()
{
AuUInt64 uClockFreq {};
#if defined(AURORA_COMPILER_MSVC)
uClockFreq = _ReadStatusReg(CNTFRQ_EL0);
#else
asm volatile ("mrs %0, CNTFRQ_EL0; isb; " : "=r" (uClockFreq));
#endif
return uClockFreq;
}
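Taken together, the two accessors are enough to derive a monotonic nanosecond clock. A minimal sketch (the helper name QueryArmMonotonicNS is hypothetical; the overflow-safe whole/remainder split mirrors the generic path of ConvertArmTicks later in this diff):
// Sketch only, not part of this commit.
static AuUInt64 QueryArmMonotonicNS()
{
const AuUInt64 uTicks = ArmQueryClockCounter();  // CNTVCT_EL0 ticks
const AuUInt64 uFreq = ArmQueryClockFrequency(); // ticks per second
// Split whole seconds from the remainder to avoid 64-bit overflow.
return (uTicks / uFreq) * 1'000'000'000ull +
(uTicks % uFreq) * 1'000'000'000ull / uFreq;
}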

View File

@@ -0,0 +1,11 @@
/***
Copyright (C) 2024 J Reece Wilson (a/k/a "Reece"). All rights reserved.
File: Clocks.aarch64.hpp
Date: 2024-01-02
Author: Reece
***/
#pragma once
AUKN_SYM AuUInt64 ArmQueryClockCounter();
AUKN_SYM AuUInt64 ArmQueryClockFrequency();

View File

@@ -20,41 +20,29 @@ namespace Aurora::HWInfo
static AuUInt32 gGuessedCores {};
static AuUInt32 gGuessedThreads {};
union CPUIdContext
{
struct
{
AuUInt32 eax;
AuUInt32 ebx;
AuUInt32 ecx;
AuUInt32 edx;
};
int regs[4];
};
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
#if defined(AURORA_COMPILER_MSVC)
-static CPUIdContext cpuid(AuUInt32 a)
+CPUIdContext cpuid(AuUInt32 a)
{
CPUIdContext context;
__cpuid(context.regs, a);
return context;
}
#elif defined(AURORA_COMPILER_CLANG) || defined(AURORA_COMPILER_GCC)
-static CPUIdContext cpuid(AuUInt32 a)
+CPUIdContext cpuid(AuUInt32 a)
{
CPUIdContext context;
__get_cpuid(a, &context.eax, &context.ebx, &context.ecx, &context.edx);
return context;
}
#else
-static CPUIdContext cpuid(AuUInt32 a)
+CPUIdContext cpuid(AuUInt32 a)
{
return {};
}
#endif
#else
-static CPUIdContext cpuid(AuUInt32 a)
+CPUIdContext cpuid(AuUInt32 a)
{
return {};
}

View File

@@ -9,5 +9,19 @@
namespace Aurora::HWInfo
{
union CPUIdContext
{
struct
{
AuUInt32 eax;
AuUInt32 ebx;
AuUInt32 ecx;
AuUInt32 edx;
};
int regs[4];
};
CPUIdContext cpuid(AuUInt32 a);
void SetCpuId();
}
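A usage sketch of the now-shared helper (the function, buffer handling, and std::memcpy/std::printf calls are illustrative, not from this commit):
// Sketch only: identify the vendor via leaf 0.
// On x86, the vendor string spans ebx/edx/ecx; the non-x86 stubs above
// return zeroed registers, so the buffer stays empty there.
#include <cstdio>
#include <cstring>
static void PrintCpuVendor()
{
char vendor[13] {};
auto ctx = Aurora::HWInfo::cpuid(0);
std::memcpy(vendor + 0, &ctx.ebx, 4);
std::memcpy(vendor + 4, &ctx.edx, 4);
std::memcpy(vendor + 8, &ctx.ecx, 4);
std::printf("%s\n", vendor); // e.g. "GenuineIntel" / "AuthenticAMD"
}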

View File

@@ -7,6 +7,7 @@
***/
#include <Source/RuntimeInternal.hpp>
#include "SMTYield.hpp"
#include <Source/HWInfo/AuCpuId.hpp>
namespace Aurora::Threading
{
@@ -79,6 +80,8 @@ namespace Aurora::Threading::Primitives
{
auto uCores = AuHwInfo::GetCPUInfo().uThreads;
ThrdCfg::gCountOfPCores = AuHwInfo::GetCPUInfo().maskPCores.CpuBitCount();
bool bPermitWOAInternal = IsNativeWaitOnSupported();
gUseFutexRWLock = ThrdCfg::gPreferFutexRWLock &&
@@ -185,4 +188,9 @@ namespace Aurora::Threading::Primitives
ThrdCfg::gEnableRWLockWriteBiasOnReadLock = gRuntimeConfig.threadingConfig.bEnableRWLockWriteBiasOnReadLock;
ThrdCfg::gPreferFutexEvent = gRuntimeConfig.threadingConfig.bPreferFutexEvent;
}
}
extern "C" DWORD SMTGetAPICNumber(void)
{
return AuHwInfo::cpuid(1).ebx >> 24;
}
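For reference, a minimal sketch of how this return value is consumed (the sibling mapping is an assumption about common SMT-2 topologies, not something this commit guarantees):
// Sketch only, hypothetical illustration.
// CPUID.01H:EBX[31:24] is the initial APIC ID of the calling thread.
// On typical SMT-2 parts, the two hyperthreads of a physical core differ
// in the low bit, which is the assumption behind the gCoreTable[uWord ^ 1]
// probe in SMTYield.hpp later in this diff.
DWORD uWord = SMTGetAPICNumber();
DWORD uSibling = uWord ^ 1; // assumed SMT sibling on the same physical core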

View File

@@ -13,6 +13,13 @@ namespace Aurora::Threading
inline thread_local AuUInt32 tlsSpinCountLocal {};
}
extern "C"
{
DWORD SMTGetAPICNumber(void);
}
#include <Source/Extensions/Clocks.aarch64.hpp>
#define SPIN_FOUR 1
namespace Aurora::Threading::Primitives
@@ -52,7 +59,15 @@ namespace Aurora::Threading::Primitives
inline bool gPreferUnixPrimitivesNoSpin {};
inline bool gAlwaysRWLockWriteBiasOnReadLock {};
inline bool gEnableRWLockWriteBiasOnReadLock {};
inline AuUInt8 gCountOfPCores {};
}
#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
using SMTAtomic_t = AuUInt8;
#else
using SMTAtomic_t = AuUInt32;
#endif
inline SMTAtomic_t gCoreTable[256] {};
inline AuUInt32 gSpinAdaptiveThreshold {};
inline AuUInt32 gSpinAdaptiveCurrentCount {};
@@ -65,6 +80,14 @@ namespace Aurora::Threading::Primitives
void InitAdaptiveThresholdFirstTime();
void InitCfg();
static const bool kEnableSmartScheduling =
#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
true;
#else
// tbd by arch and os
false;
#endif
static auline void SMPPause()
{
#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
@@ -81,6 +104,58 @@ namespace Aurora::Threading::Primitives
#endif
}
#if defined(AURORA_ARCH_ARM)
static AuUInt64 ConvertArmTicks(AuUInt64 uCounter, AuUInt64 uFreq)
{
if (uFreq == 10000000)
{
return uCounter * 100ull;
}
else if (uFreq == 1000000)
{
return uCounter * 1000ull;
}
else if (uFreq == 100000)
{
return uCounter * 10000ull;
}
else if (uFreq == 100000000ull)
{
return uCounter * 10ull;
}
else if (uFreq == 1000000000ull)
{
return uCounter;
}
else
{
const AuUInt64 uWhole = (uCounter / uFreq) * 1'000'000'000ull;
const AuUInt64 uPart = (uCounter % uFreq) * 1'000'000'000ull / uFreq;
return uWhole + uPart;
}
}
static AuUInt64 RdtscArmEmulated(AuUInt64 uClockFreq)
{
return ConvertArmTicks(ArmQueryClockCounter(), uClockFreq) * 4;
// context:
// Intel recommends we spin, considering the potential for exponential back-offs later on, with a coefficient based on the CPUID brand of the processor.
// On most processors, RDTSC does not read an instruction counter. That would be worthless; modern processors ingest hundreds of instructions to speculate on.
// Instead, RDTSC reads back a steady system-wide clock (*). It doesn't scale per core, nor can you overclock it.
// Back to Intel's recommendation: instead of spamming your process's execution pipeline with mm_pauses in a loop, you should query RDTSC to solve the ABA problem and normalize for changes in the micro-architecture.
// This does allow Intel to decrease this potentially-NOP mm_pause sleep period by changing the stated base clock.
// On the aarch64 side of things, we should be able to match the exact Intel behaviour by:
// * Reading the system-wide clock (CNTVCT_EL0)
// * Normalizing to nanoseconds with the given frequency (CNTFRQ_EL0)
// * Dividing by approx "3.6 GHz" ops/ns
// *: Ok, technically you can/need to verify invariant TSC (CPUID.80000007H:EDX[8]), but who actually cares?
}
#define __rdtsc() RdtscArmEmulated(uClockFreq)
#define ALT_RDT
#endif
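As a sanity check of the math above, one worked conversion (the 24 MHz frequency is an assumed, commonly seen CNTFRQ_EL0 value; it is not special-cased, so it takes the generic divide/modulo path):
// Worked example with hypothetical values, not part of this commit:
// uCounter = 48'000'000 ticks, uFreq = 24'000'000 (24 MHz generic timer),
// i.e. exactly two seconds of uptime.
// uWhole = (48'000'000 / 24'000'000) * 1'000'000'000 = 2'000'000'000 ns
// uPart = (48'000'000 % 24'000'000) * 1'000'000'000 / 24'000'000 = 0 ns
// RdtscArmEmulated then scales by 4: 2'000'000'000 * 4 = 8'000'000'000,
// roughly what a ~4 GHz invariant TSC would read after two seconds.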
template <typename T>
bool auline YieldToSharedCore(long spin, T callback)
{
@@ -88,15 +163,67 @@ namespace Aurora::Threading::Primitives
{
return true;
}
#if defined(AURORA_ARCH_ARM)
AuUInt64 uClockFreq { ArmQueryClockFrequency() };
#endif
if (kEnableSmartScheduling)
{
bool bRet { false };
auto uWord = SMTGetAPICNumber();
if (uWord < AuArraySize(gCoreTable) &&
uWord < ThrdCfg::gCountOfPCores)
{
AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 1u);
if (gSpinAdaptiveThreshold)
{
auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
if (uNow <= gSpinAdaptiveThreshold)
{
auto uCount = spin;
if (AuAtomicLoad(&gCoreTable[uWord ^ 1]))
{
uCount /= 5;
}
else if (gHasThreadLocalTimeout)
{
uCount += tlsSpinCountLocal;
}
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
while (uCount > 0)
#endif
{
if (callback())
{
bRet = true;
break;
}
else
{
SMPPause();
uCount--;
}
}
}
AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 0u);
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
}
}
return bRet;
}
else if (gSpinAdaptiveThreshold)
{
auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
if (uNow <= gSpinAdaptiveThreshold)
{
auto uCount = spin;
-#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
+#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
@@ -176,7 +303,7 @@ namespace Aurora::Threading::Primitives
else
{
auto uCount = spin;
-#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
+#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else
@@ -205,7 +332,7 @@ namespace Aurora::Threading::Primitives
if (gHasThreadLocalTimeout)
{
auto uCount = tlsSpinCountLocal;
-#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
+#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc() + uCount;
while (__rdtsc() < perfCounter)
#else