mirror of
https://sourceware.org/git/glibc.git
synced 2025-01-13 20:50:08 +00:00
x86: Use Avoid_Non_Temporal_Memset
to control non-temporal path
This is just a refactor and there should be no behavioral change from this commit. The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob for controlling whether we use non-temporal memset rather than having extra logic based on vendor. Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
parent
7da0886247
commit
b93dddfaf4
@ -756,6 +756,12 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||
unsigned int stepping = 0;
|
||||
enum cpu_features_kind kind;
|
||||
|
||||
/* Default is to avoid non-temporal memset for non Intel/AMD hardware. This is because,
|
||||
as of writing this, we only have benchmarks indicating its profitability
|
||||
on Intel/AMD. */
|
||||
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||||
|= bit_arch_Avoid_Non_Temporal_Memset;
|
||||
|
||||
cpu_features->cachesize_non_temporal_divisor = 4;
|
||||
#if !HAS_CPUID
|
||||
if (__get_cpuid_max (0, 0) == 0)
|
||||
@ -781,6 +787,11 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||
|
||||
update_active (cpu_features);
|
||||
|
||||
/* Benchmarks indicate non-temporal memset can be profitable on Intel
|
||||
hardware. */
|
||||
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||||
&= ~bit_arch_Avoid_Non_Temporal_Memset;
|
||||
|
||||
if (family == 0x06)
|
||||
{
|
||||
model += extended_model;
|
||||
@ -992,6 +1003,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
|
||||
|
||||
ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
|
||||
|
||||
/* Benchmarks indicate non-temporal memset can be profitable on AMD
|
||||
hardware. */
|
||||
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||||
&= ~bit_arch_Avoid_Non_Temporal_Memset;
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
|
||||
{
|
||||
/* Since the FMA4 bit is in CPUID_INDEX_80000001 and
|
||||
|
@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
|
||||
rep_movsb_threshold = 2112;
|
||||
|
||||
/* Non-temporal stores are more performant on Intel and AMD hardware above
|
||||
non_temporal_threshold. Enable this for both Intel and AMD hardware. */
|
||||
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
||||
if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
|
||||
&& (cpu_features->basic.kind == arch_kind_intel
|
||||
|| cpu_features->basic.kind == arch_kind_amd))
|
||||
memset_non_temporal_threshold = non_temporal_threshold;
|
||||
|
||||
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
||||
cases slower than the vectorized path (and for some alignments,
|
||||
it is really slow, check BZ #30994). */
|
||||
@ -1017,6 +1009,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||
if (tunable_size != 0)
|
||||
shared = tunable_size;
|
||||
|
||||
/* Non-temporal stores are more performant on some hardware above
|
||||
non_temporal_threshold. Currently Avoid_Non_Temporal_Memset is cleared for both
|
||||
Intel and AMD hardware. */
|
||||
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
||||
if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
|
||||
memset_non_temporal_threshold = non_temporal_threshold;
|
||||
|
||||
tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
|
||||
if (tunable_size > minimum_non_temporal_threshold
|
||||
&& tunable_size <= maximum_non_temporal_threshold)
|
||||
|
Loading…
Reference in New Issue
Block a user