[+] Improved SMT yielding
[+] Clocks.aarch64.[h/c]pp
This commit is contained in:
parent
63050b2262
commit
49a6173011
30
Source/Extensions/Clocks.aarch64.cpp
Normal file
30
Source/Extensions/Clocks.aarch64.cpp
Normal file
@ -0,0 +1,30 @@
|
||||
/***
|
||||
Copyright (C) 2024 J Reece Wilson (a/k/a "Reece"). All rights reserved.
|
||||
|
||||
File: Clocks.aarch64.cpp
|
||||
Date: 2024-01-02
|
||||
Author: Reece
|
||||
***/
|
||||
#include <Source/RuntimeInternal.hpp>
|
||||
|
||||
// Reads the AArch64 virtual counter (CNTVCT_EL0): a system-wide,
// monotonic tick source shared by all cores.
AUKN_SYM AuUInt64 ArmQueryClockCounter()
{
#if defined(AURORA_COMPILER_MSVC)
    // MSVC exposes system registers through the _ReadStatusReg intrinsic.
    return _ReadStatusReg(CNTVCT_EL0);
#else
    AuUInt64 uCounter {};
    asm volatile ("mrs %0, CNTVCT_EL0; isb; " : "=r" (uCounter));
    return uCounter;
#endif
}
|
||||
|
||||
// Reads the AArch64 generic-timer frequency (CNTFRQ_EL0) in Hz, i.e. the
// rate at which the counter returned by ArmQueryClockCounter() advances.
AUKN_SYM AuUInt64 ArmQueryClockFrequency()
{
    AuUInt64 uClockFreq {};
    // Fix: was `#if defined(defined(AURORA_COMPILER_MSVC))` — nested
    // defined() is ill-formed preprocessor syntax, so MSVC builds fell
    // through to the GNU inline-asm branch and failed to compile.
#if defined(AURORA_COMPILER_MSVC)
    uClockFreq = _ReadStatusReg(CNTFRQ_EL0);
#else
    asm volatile ("mrs %0, CNTFRQ_EL0; isb; " : "=r" (uClockFreq));
#endif
    return uClockFreq;
}
|
11
Source/Extensions/Clocks.aarch64.hpp
Normal file
11
Source/Extensions/Clocks.aarch64.hpp
Normal file
@ -0,0 +1,11 @@
|
||||
/***
|
||||
Copyright (C) 2024 J Reece Wilson (a/k/a "Reece"). All rights reserved.
|
||||
|
||||
File: Clocks.aarch64.hpp
|
||||
Date: 2024-01-02
|
||||
Author: Reece
|
||||
***/
|
||||
#pragma once
|
||||
|
||||
AUKN_SYM AuUInt64 ArmQueryClockCounter();
|
||||
AUKN_SYM AuUInt64 ArmQueryClockFrequency();
|
@ -20,41 +20,29 @@ namespace Aurora::HWInfo
|
||||
static AuUInt32 gGuessedCores {};
|
||||
static AuUInt32 gGuessedThreads {};
|
||||
|
||||
// Register file produced by one CPUID invocation. The anonymous struct
// aliases regs[] so both fill paths land in the same storage: MSVC's
// __cpuid writes the int[4] array, while GCC/Clang's __get_cpuid writes
// the four named AuUInt32 outputs (see the cpuid() wrappers below).
union CPUIdContext
{
    struct
    {
        AuUInt32 eax;   // CPUID output register EAX
        AuUInt32 ebx;   // CPUID output register EBX
        AuUInt32 ecx;   // CPUID output register ECX
        AuUInt32 edx;   // CPUID output register EDX
    };
    int regs[4];        // same bytes, in __cpuid's int[4] layout
};
|
||||
|
||||
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
|
||||
#if defined(AURORA_COMPILER_MSVC)
|
||||
static CPUIdContext cpuid(AuUInt32 a)
|
||||
CPUIdContext cpuid(AuUInt32 a)
|
||||
{
|
||||
CPUIdContext context;
|
||||
__cpuid(context.regs, a);
|
||||
return context;
|
||||
}
|
||||
#elif defined(AURORA_COMPILER_CLANG) || defined(AURORA_COMPILER_GCC)
|
||||
static CPUIdContext cpuid(AuUInt32 a)
|
||||
CPUIdContext cpuid(AuUInt32 a)
|
||||
{
|
||||
CPUIdContext context;
|
||||
__get_cpuid(a, &context.eax, &context.ebx, &context.ecx, &context.edx);
|
||||
return context;
|
||||
}
|
||||
#else
|
||||
static CPUIdContext cpuid(AuUInt32 a)
|
||||
CPUIdContext cpuid(AuUInt32 a)
|
||||
{
|
||||
return {};
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
static CPUIdContext cpuid(AuUInt32 a)
|
||||
CPUIdContext cpuid(AuUInt32 a)
|
||||
{
|
||||
return {};
|
||||
}
|
||||
|
@ -9,5 +9,19 @@
|
||||
|
||||
namespace Aurora::HWInfo
|
||||
{
|
||||
union CPUIdContext
|
||||
{
|
||||
struct
|
||||
{
|
||||
AuUInt32 eax;
|
||||
AuUInt32 ebx;
|
||||
AuUInt32 ecx;
|
||||
AuUInt32 edx;
|
||||
};
|
||||
int regs[4];
|
||||
};
|
||||
|
||||
CPUIdContext cpuid(AuUInt32 a);
|
||||
|
||||
void SetCpuId();
|
||||
}
|
@ -7,6 +7,7 @@
|
||||
***/
|
||||
#include <Source/RuntimeInternal.hpp>
|
||||
#include "SMTYield.hpp"
|
||||
#include <Source/HWInfo/AuCpuId.hpp>
|
||||
|
||||
namespace Aurora::Threading
|
||||
{
|
||||
@ -79,6 +80,8 @@ namespace Aurora::Threading::Primitives
|
||||
{
|
||||
auto uCores = AuHwInfo::GetCPUInfo().uThreads;
|
||||
|
||||
ThrdCfg::gCountOfPCores = AuHwInfo::GetCPUInfo().maskPCores.CpuBitCount();
|
||||
|
||||
bool bPermitWOAInternal = IsNativeWaitOnSupported();
|
||||
|
||||
gUseFutexRWLock = ThrdCfg::gPreferFutexRWLock &&
|
||||
@ -186,3 +189,8 @@ namespace Aurora::Threading::Primitives
|
||||
ThrdCfg::gPreferFutexEvent = gRuntimeConfig.threadingConfig.bPreferFutexEvent;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the calling thread's initial APIC ID: CPUID leaf 1 places it
// in the top byte of EBX. extern "C" so asm/other-TU callers can link it.
extern "C" DWORD SMTGetAPICNumber(void)
{
    const auto uLeaf1Ebx = AuHwInfo::cpuid(1).ebx;
    return uLeaf1Ebx >> 24;
}
|
@ -13,6 +13,13 @@ namespace Aurora::Threading
|
||||
inline thread_local AuUInt32 tlsSpinCountLocal {};
|
||||
}
|
||||
|
||||
extern "C"
|
||||
{
|
||||
DWORD SMTGetAPICNumber(void);
|
||||
}
|
||||
|
||||
#include <Source/Extensions/Clocks.aarch64.hpp>
|
||||
|
||||
#define SPIN_FOUR 1
|
||||
|
||||
namespace Aurora::Threading::Primitives
|
||||
@ -52,8 +59,16 @@ namespace Aurora::Threading::Primitives
|
||||
inline bool gPreferUnixPrimitivesNoSpin {};
|
||||
inline bool gAlwaysRWLockWriteBiasOnReadLock {};
|
||||
inline bool gEnableRWLockWriteBiasOnReadLock {};
|
||||
inline AuUInt8 gCountOfPCores {};
|
||||
}
|
||||
|
||||
#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
|
||||
using SMTAtomic_t = AuUInt8;
|
||||
#else
|
||||
using SMTAtomic_t = AuUInt32;
|
||||
#endif
|
||||
inline SMTAtomic_t gCoreTable[256] {};
|
||||
|
||||
inline AuUInt32 gSpinAdaptiveThreshold {};
|
||||
inline AuUInt32 gSpinAdaptiveCurrentCount {};
|
||||
inline AuUInt32 gSpinAdaptiveThreadCount {};
|
||||
@ -65,6 +80,14 @@ namespace Aurora::Threading::Primitives
|
||||
void InitAdaptiveThresholdFirstTime();
|
||||
void InitCfg();
|
||||
|
||||
static const bool kEnableSmartScheduling =
|
||||
#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
|
||||
true;
|
||||
#else
|
||||
// tbd by arch and os
|
||||
false;
|
||||
#endif
|
||||
|
||||
static auline void SMPPause()
|
||||
{
|
||||
#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
|
||||
@ -81,6 +104,58 @@ namespace Aurora::Threading::Primitives
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(AURORA_ARCH_ARM)
|
||||
|
||||
static AuUInt64 ConvertArmTicks(AuUInt64 uCounter, AuUInt64 uFreq)
|
||||
{
|
||||
if (uFreq == 10000000)
|
||||
{
|
||||
return uCounter * 100ull;
|
||||
}
|
||||
else if (uFreq == 1000000)
|
||||
{
|
||||
return uCounter * 1000ull;
|
||||
}
|
||||
else if (uFreq == 100000)
|
||||
{
|
||||
return uCounter * 10000ull;
|
||||
}
|
||||
else if (uFreq == 100000000ull)
|
||||
{
|
||||
return uCounter * 10ull;
|
||||
}
|
||||
else if (uFreq == 1000000000ull)
|
||||
{
|
||||
return uCounter;
|
||||
}
|
||||
else
|
||||
{
|
||||
const long long uWhole = (uCounter / uFreq) * 1'000'000'000ull;
|
||||
const long long uPart = (uCounter % uFreq) * 1'000'000'000ull / uFreq;
|
||||
return uWhole + uPart;
|
||||
}
|
||||
}
|
||||
|
||||
    // Emulates x86 __rdtsc() on AArch64 so the spin loops below can share one
    // code path (see the __rdtsc()/ALT_RDT macros that follow this function).
    // uClockFreq: CNTFRQ_EL0 value, sampled once by the caller.
    // NOTE(review): the *4 scales normalized nanoseconds to approximate the
    // tick rate of a multi-GHz invariant TSC (per the "3.6 Ghz" note below) —
    // coefficient looks heuristic; confirm against the tuned spin counts.
    static AuUInt64 RdtscArmEmulated(AuUInt64 uClockFreq)
    {
        return ConvertArmTicks(ArmQueryClockCounter(), uClockFreq) * 4;
        // context:
        // Intel recommends we spin, considering the potential for exponential back-offs later on, with a coefficient based on the CPUID brand of the processor.
        // Under most processors, RDTSC is not that of the instruction counter. That'd be worthless; modern processors are ingesting hundreds of instructions to speculate on.
        // Instead, RDTSC reads back a steady system-wide clock (*). It doesn't scale per core, nor can you overclock it.
        // Back to Intel's recommendation, instead of spamming your processes execution pipeline with mm_pauses in a loop, you should query RDTSC to solve the ABA problem and normalize for changes in the micro-architecture.
        // This does allow Intel to decrease this potentially-NOP mm_pause sleep period by changing the stated base clock.
        // On the aarch side of things, we should be able to match the exact Intel behaviour by:
        // * Reading the system wide clock (CNTVCT_EL0)
        // * Normalizing to nanoseconds with the given frequency (CNTFRQ_EL0)
        // * Divide by approx "3.6 Ghz" ops/ns
        // *: Ok, technically you can/need to verify Invariant TSC: CPUID.80000007H:EDX[8], but who actually cares?
    }
|
||||
|
||||
#define __rdtsc() RdtscArmEmulated(uClockFreq)
|
||||
#define ALT_RDT
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
bool auline YieldToSharedCore(long spin, T callback)
|
||||
{
|
||||
@ -89,14 +164,66 @@ namespace Aurora::Threading::Primitives
|
||||
return true;
|
||||
}
|
||||
|
||||
if (gSpinAdaptiveThreshold)
|
||||
#if defined(AURORA_ARCH_ARM)
|
||||
AuUInt64 uClockFreq { ArmQueryClockFrequency() };
|
||||
#endif
|
||||
|
||||
if (kEnableSmartScheduling)
|
||||
{
|
||||
bool bRet { false };
|
||||
auto uWord = SMTGetAPICNumber();
|
||||
if (uWord < AuArraySize(gCoreTable) &&
|
||||
uWord < ThrdCfg::gCountOfPCores)
|
||||
{
|
||||
AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 1u);
|
||||
|
||||
auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
|
||||
if (uNow <= gSpinAdaptiveThreshold)
|
||||
{
|
||||
auto uCount = spin;
|
||||
|
||||
if (AuAtomicLoad(&gCoreTable[uWord ^ 1]))
|
||||
{
|
||||
uCount /= 5;
|
||||
}
|
||||
else if (gHasThreadLocalTimeout)
|
||||
{
|
||||
uCount += tlsSpinCountLocal;
|
||||
}
|
||||
|
||||
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
|
||||
auto perfCounter = __rdtsc() + uCount;
|
||||
while (__rdtsc() < perfCounter)
|
||||
#else
|
||||
while (uCount > 0)
|
||||
#endif
|
||||
{
|
||||
if (callback())
|
||||
{
|
||||
bRet = true;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
SMPPause();
|
||||
uCount--;
|
||||
}
|
||||
}
|
||||
}
|
||||
AuAtomicStore<SMTAtomic_t>(&gCoreTable[uWord], 0u);
|
||||
AuAtomicSub(&gSpinAdaptiveCurrentCount, 1u);
|
||||
}
|
||||
|
||||
return bRet;
|
||||
}
|
||||
else if (gSpinAdaptiveThreshold)
|
||||
{
|
||||
auto uNow = AuAtomicAdd(&gSpinAdaptiveCurrentCount, 1u);
|
||||
|
||||
if (uNow <= gSpinAdaptiveThreshold)
|
||||
{
|
||||
auto uCount = spin;
|
||||
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
|
||||
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
|
||||
auto perfCounter = __rdtsc() + uCount;
|
||||
while (__rdtsc() < perfCounter)
|
||||
#else
|
||||
@ -176,7 +303,7 @@ namespace Aurora::Threading::Primitives
|
||||
else
|
||||
{
|
||||
auto uCount = spin;
|
||||
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
|
||||
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
|
||||
auto perfCounter = __rdtsc() + uCount;
|
||||
while (__rdtsc() < perfCounter)
|
||||
#else
|
||||
@ -205,7 +332,7 @@ namespace Aurora::Threading::Primitives
|
||||
if (gHasThreadLocalTimeout)
|
||||
{
|
||||
auto uCount = tlsSpinCountLocal;
|
||||
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
|
||||
#if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
|
||||
auto perfCounter = __rdtsc() + uCount;
|
||||
while (__rdtsc() < perfCounter)
|
||||
#else
|
||||
|
Loading…
Reference in New Issue
Block a user