x86: Add thresholds for "rep movsb/stosb" to tunables

Add x86_rep_movsb_threshold and x86_rep_stosb_threshold to tunables
to update thresholds for "rep movsb" and "rep stosb" at run-time.

Note that a user-specified threshold for "rep movsb" that is smaller
than the minimum threshold will be ignored.

Reviewed-by: Carlos O'Donell <carlos@redhat.com>
This commit is contained in:
H.J. Lu 2020-07-06 11:48:09 -07:00
parent 6c010c5dde
commit 3f4b61a0b8
7 changed files with 86 additions and 26 deletions

View File

@@ -396,6 +396,22 @@ to set threshold in bytes for non temporal store.
This tunable is specific to i386 and x86-64.
@end deftp
@deftp Tunable glibc.cpu.x86_rep_movsb_threshold
The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
set threshold in bytes to start using "rep movsb". The value must be
greater than zero, and currently defaults to 2048 bytes.
This tunable is specific to i386 and x86-64.
@end deftp
@deftp Tunable glibc.cpu.x86_rep_stosb_threshold
The @code{glibc.cpu.x86_rep_stosb_threshold} tunable allows the user to
set threshold in bytes to start using "rep stosb". The value must be
greater than zero, and currently defaults to 2048 bytes.
This tunable is specific to i386 and x86-64.
@end deftp
@deftp Tunable glibc.cpu.x86_ibt
The @code{glibc.cpu.x86_ibt} tunable allows the user to control how
indirect branch tracking (IBT) should be enabled. Accepted values are

View File

@@ -530,6 +530,12 @@ long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;
/* Threshold to use non temporal store. */
long int __x86_shared_non_temporal_threshold attribute_hidden;
/* Threshold to use Enhanced REP MOVSB. */
long int __x86_rep_movsb_threshold attribute_hidden = 2048;
/* Threshold to use Enhanced REP STOSB. */
long int __x86_rep_stosb_threshold attribute_hidden = 2048;
#ifndef DISABLE_PREFETCHW
/* PREFETCHW support flag for use in memory and string routines. */
int __x86_prefetchw attribute_hidden;
@@ -872,6 +878,36 @@ init_cacheinfo (void)
= (cpu_features->non_temporal_threshold != 0
? cpu_features->non_temporal_threshold
: __x86_shared_cache_size * threads * 3 / 4);
/* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */
unsigned int minimum_rep_movsb_threshold;
/* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */
unsigned int rep_movsb_threshold;
if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
{
rep_movsb_threshold = 2048 * (64 / 16);
minimum_rep_movsb_threshold = 64 * 8;
}
else if (CPU_FEATURES_ARCH_P (cpu_features,
AVX_Fast_Unaligned_Load))
{
rep_movsb_threshold = 2048 * (32 / 16);
minimum_rep_movsb_threshold = 32 * 8;
}
else
{
rep_movsb_threshold = 2048 * (16 / 16);
minimum_rep_movsb_threshold = 16 * 8;
}
if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold)
__x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
else
__x86_rep_movsb_threshold = rep_movsb_threshold;
# if HAVE_TUNABLES
__x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
# endif
}
#endif

View File

@@ -606,6 +606,10 @@ no_cpuid:
TUNABLE_GET (hwcaps, tunable_val_t *, TUNABLE_CALLBACK (set_hwcaps));
cpu_features->non_temporal_threshold
= TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
cpu_features->rep_movsb_threshold
= TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
cpu_features->rep_stosb_threshold
= TUNABLE_GET (x86_rep_stosb_threshold, long int, NULL);
cpu_features->data_cache_size
= TUNABLE_GET (x86_data_cache_size, long int, NULL);
cpu_features->shared_cache_size

View File

@@ -102,6 +102,10 @@ struct cpu_features
unsigned long int shared_cache_size;
/* Threshold to use non temporal store. */
unsigned long int non_temporal_threshold;
/* Threshold to use "rep movsb". */
unsigned long int rep_movsb_threshold;
/* Threshold to use "rep stosb". */
unsigned long int rep_stosb_threshold;
};
/* Used from outside of glibc to get access to the CPU features

View File

@@ -30,6 +30,30 @@ glibc {
x86_non_temporal_threshold {
type: SIZE_T
}
x86_rep_movsb_threshold {
type: SIZE_T
# Since there is overhead to set up REP MOVSB operation, REP MOVSB
# isn't faster on short data. The memcpy micro benchmark in glibc
# shows that 2KB is the approximate value above which REP MOVSB
# becomes faster than SSE2 optimization on processors with Enhanced
# REP MOVSB. Since larger register size can move more data with a
# single load and store, the threshold is higher with larger register
# size. Note: Since the REP MOVSB threshold must be greater than 8
# times the vector size, the minimum value must be updated at run-time.
minval: 1
default: 2048
}
x86_rep_stosb_threshold {
type: SIZE_T
# Since there is overhead to set up REP STOSB operation, REP STOSB
# isn't faster on short data. The memset micro benchmark in glibc
# shows that 2KB is the approximate value above which REP STOSB
# becomes faster on processors with Enhanced REP STOSB. Since the
# stored value is fixed, larger register size has minimal impact
# on threshold.
minval: 1
default: 2048
}
x86_data_cache_size {
type: SIZE_T
}

View File

@@ -56,17 +56,6 @@
# endif
#endif
/* Threshold to use Enhanced REP MOVSB. Since there is overhead to set
up REP MOVSB operation, REP MOVSB isn't faster on short data. The
memcpy micro benchmark in glibc shows that 2KB is the approximate
value above which REP MOVSB becomes faster than SSE2 optimization
on processors with Enhanced REP MOVSB. Since larger register size
can move more data with a single load and store, the threshold is
higher with larger register size. */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
#endif
#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif
@@ -253,9 +242,6 @@ L(movsb):
leaq (%rsi,%rdx), %r9
cmpq %r9, %rdi
/* Avoid slow backward REP MOVSB. */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
# error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
jb L(more_8x_vec_backward)
1:
mov %RDX_LP, %RCX_LP
@@ -331,7 +317,7 @@ L(between_2_3):
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
cmpq $REP_MOVSB_THRESHOLD, %rdx
cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
ja L(movsb)
#endif
L(more_2x_vec):

View File

@@ -58,16 +58,6 @@
# endif
#endif
/* Threshold to use Enhanced REP STOSB. Since there is overhead to set
up REP STOSB operation, REP STOSB isn't faster on short data. The
memset micro benchmark in glibc shows that 2KB is the approximate
value above which REP STOSB becomes faster on processors with
Enhanced REP STOSB. Since the stored value is fixed, larger register
size has minimal impact on threshold. */
#ifndef REP_STOSB_THRESHOLD
# define REP_STOSB_THRESHOLD 2048
#endif
#ifndef SECTION
# error SECTION is not defined!
#endif
@@ -181,7 +171,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
ret
L(stosb_more_2x_vec):
cmpq $REP_STOSB_THRESHOLD, %rdx
cmp __x86_rep_stosb_threshold(%rip), %RDX_LP
ja L(stosb)
#endif
L(more_2x_vec):