x86: Use Avoid_Non_Temporal_Memset to control non-temporal path

This is just a refactor and there should be no behavioral change from
this commit.

The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob
for controlling whether we use non-temporal memset, rather than keeping
extra vendor-based logic alongside it.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
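
In effect (a condensed sketch juxtaposing the removed and added
dl-cacheinfo.h hunks shown below, not a verbatim quote of the file), the
consumer-side check in dl_init_cacheinfo shrinks from a vendor-qualified
test to a plain feature-bit test:

    /* Before: gate on the feature bit AND the CPU vendor.  */
    if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
        && (cpu_features->basic.kind == arch_kind_intel
            || cpu_features->basic.kind == arch_kind_amd))
      memset_non_temporal_threshold = non_temporal_threshold;

    /* After: the bit alone decides.  init_cpu_features now sets the bit
       by default and clears it on Intel/AMD, so the outcome is
       identical.  */
    if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
      memset_non_temporal_threshold = non_temporal_threshold;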
Author: Noah Goldstein
Date: 2024-08-14 14:37:30 +08:00
Committer: H.J. Lu
Commit: b93dddfaf4 (parent: 7da0886247)
2 changed files with 23 additions and 8 deletions

sysdeps/x86/cpu-features.c

@@ -756,6 +756,12 @@ init_cpu_features (struct cpu_features *cpu_features)
unsigned int stepping = 0;
enum cpu_features_kind kind;
/* Default is to avoid non-temporal memset on non-Intel/AMD hardware. This is
because, as of writing this, we only have benchmarks indicating its
profitability on Intel/AMD. */
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|= bit_arch_Avoid_Non_Temporal_Memset;
cpu_features->cachesize_non_temporal_divisor = 4;
#if !HAS_CPUID
if (__get_cpuid_max (0, 0) == 0)
@@ -781,6 +787,11 @@ init_cpu_features (struct cpu_features *cpu_features)
update_active (cpu_features);
/* Benchmarks indicate non-temporal memset can be profitable on Intel
hardware. */
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
&= ~bit_arch_Avoid_Non_Temporal_Memset;
if (family == 0x06)
{
model += extended_model;
@@ -992,6 +1003,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
/* Benchmarks indicate non-temporal memset can be profitable on AMD
hardware. */
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
&= ~bit_arch_Avoid_Non_Temporal_Memset;
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
{
/* Since the FMA4 bit is in CPUID_INDEX_80000001 and

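As a self-contained illustration of the set-then-clear bit pattern used in
the hunks above (the index_arch_*/bit_arch_* names follow glibc's
convention, but the values and surrounding scaffolding here are made up
for the sketch):

    #include <stdio.h>

    /* Illustrative stand-ins for glibc's generated index/bit macros;
       the real values come from glibc's build machinery.  */
    #define index_arch_Avoid_Non_Temporal_Memset 0
    #define bit_arch_Avoid_Non_Temporal_Memset (1u << 3)

    static unsigned int preferred[1];

    int
    main (void)
    {
      /* Default: avoid non-temporal memset (bit set), regardless of
         vendor.  */
      preferred[index_arch_Avoid_Non_Temporal_Memset]
          |= bit_arch_Avoid_Non_Temporal_Memset;

      /* On Intel/AMD, clear the bit to re-enable the non-temporal
         path.  */
      preferred[index_arch_Avoid_Non_Temporal_Memset]
          &= ~bit_arch_Avoid_Non_Temporal_Memset;

      printf ("avoid non-temporal memset: %s\n",
              (preferred[index_arch_Avoid_Non_Temporal_Memset]
               & bit_arch_Avoid_Non_Temporal_Memset) ? "yes" : "no");
      return 0;
    }
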
sysdeps/x86/dl-cacheinfo.h

@@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
/* Non-temporal stores are more performant on Intel and AMD hardware above
non_temporal_threshold. Enable this for both Intel and AMD hardware. */
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
&& (cpu_features->basic.kind == arch_kind_intel
|| cpu_features->basic.kind == arch_kind_amd))
memset_non_temporal_threshold = non_temporal_threshold;
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
cases slower than the vectorized path (and for some alignments,
it is really slow, check BZ #30994). */
@@ -1017,6 +1009,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (tunable_size != 0)
shared = tunable_size;
/* Non-temporal stores are more performant on some hardware above
non_temporal_threshold. Currently Avoid_Non_Temporal_Memset is cleared for
both Intel and AMD hardware. */
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
memset_non_temporal_threshold = non_temporal_threshold;
tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
if (tunable_size > minimum_non_temporal_threshold
&& tunable_size <= maximum_non_temporal_threshold)
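
Since memset_non_temporal_threshold defaults to SIZE_MAX, leaving
Avoid_Non_Temporal_Memset set disables the non-temporal path outright: no
request size can reach the threshold. A minimal sketch of how such a
threshold is typically consumed by a memset implementation (illustrative
only; glibc's real dispatch happens in hand-written assembly, and
fill_non_temporal here is a hypothetical stand-in):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical stand-in for a MOVNT-based fill loop.  */
    static void
    fill_non_temporal (void *dst, int c, size_t n)
    {
      memset (dst, c, n);   /* placeholder body for the sketch */
    }

    /* Threshold as dl_init_cacheinfo would leave it when the Avoid bit
       is set: SIZE_MAX means "never take the non-temporal path".  */
    static size_t memset_non_temporal_threshold = SIZE_MAX;

    void *
    sketch_memset (void *dst, int c, size_t n)
    {
      if (n >= memset_non_temporal_threshold)
        /* Huge fill: bypass the caches instead of evicting useful
           data.  */
        fill_non_temporal (dst, c, n);
      else
        memset (dst, c, n);
      return dst;
    }

On Intel/AMD, where the bit is cleared, the threshold instead becomes
non_temporal_threshold, which users can still override through the
glibc.cpu.x86_non_temporal_threshold tunable read in the hunk above.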