mirror of
https://sourceware.org/git/glibc.git
synced 2025-01-18 22:50:07 +00:00
x86: Optimizing memcpy for AMD Zen architecture.
Modifying the shareable cache '__x86_shared_cache_size', which is a factor in computing the non-temporal threshold parameter '__x86_shared_non_temporal_threshold', to optimize memcpy for AMD Zen architectures. In the existing implementation, the shareable cache is computed as 'L3 per thread, L2 per core'. Recomputing this shareable cache as 'L3 per CCX(Core-Complex)' has brought in performance gains. As per the large bench variant results, this patch also addresses the regression problem on AMD Zen architectures. Backport of commit 59803e81f9
upstream, with the fix from cb3a749a22
("x86: Restore processing of cache size tunables in init_cacheinfo") applied. Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com> Co-Authored-by: Florian Weimer <fweimer@redhat.com>
This commit is contained in:
parent
8d730cb25a
commit
48cf525f4b
@ -722,7 +722,7 @@ intel_bug_no_cache_info:
|
||||
threads = 1 << ((ecx >> 12) & 0x0f);
|
||||
}
|
||||
|
||||
if (threads == 0)
|
||||
if (threads == 0 || cpu_features->basic.family >= 0x17)
|
||||
{
|
||||
/* If APIC ID width is not available, use logical
|
||||
processor count. */
|
||||
@ -737,9 +737,23 @@ intel_bug_no_cache_info:
|
||||
if (threads > 0)
|
||||
shared /= threads;
|
||||
|
||||
/* Get shared cache per ccx for Zen architectures. */
|
||||
if (cpu_features->basic.family >= 0x17)
|
||||
{
|
||||
unsigned int eax;
|
||||
|
||||
/* Get number of threads share the L3 cache in CCX. */
|
||||
__cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);
|
||||
|
||||
unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
|
||||
shared *= threads_per_ccx;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Account for exclusive L2 and L3 caches. */
|
||||
shared += core;
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef DISABLE_PREFETCHW
|
||||
if (max_cpuid_ex >= 0x80000001)
|
||||
|
Loading…
Reference in New Issue
Block a user