mirror of
https://sourceware.org/git/glibc.git
synced 2024-11-27 07:20:11 +00:00
475b63702e
No bug.

This patch doubles the rep_movsb_threshold when using ERMS. Based on
benchmarks, the vector copy loop, especially now that it handles 4k
aliasing, is better for these medium-range sizes.

On Skylake with ERMS:

Size, Align1, Align2, dst>src, (rep movsb) / (vec copy)
4096, 0, 0, 0, 0.975
4096, 0, 0, 1, 0.953
4096, 12, 0, 0, 0.969
4096, 12, 0, 1, 0.872
4096, 44, 0, 0, 0.979
4096, 44, 0, 1, 0.83
4096, 0, 12, 0, 1.006
4096, 0, 12, 1, 0.989
4096, 0, 44, 0, 0.739
4096, 0, 44, 1, 0.942
4096, 12, 12, 0, 1.009
4096, 12, 12, 1, 0.973
4096, 44, 44, 0, 0.791
4096, 44, 44, 1, 0.961
4096, 2048, 0, 0, 0.978
4096, 2048, 0, 1, 0.951
4096, 2060, 0, 0, 0.986
4096, 2060, 0, 1, 0.963
4096, 2048, 12, 0, 0.971
4096, 2048, 12, 1, 0.941
4096, 2060, 12, 0, 0.977
4096, 2060, 12, 1, 0.949
8192, 0, 0, 0, 0.85
8192, 0, 0, 1, 0.845
8192, 13, 0, 0, 0.937
8192, 13, 0, 1, 0.939
8192, 45, 0, 0, 0.932
8192, 45, 0, 1, 0.927
8192, 0, 13, 0, 0.621
8192, 0, 13, 1, 0.62
8192, 0, 45, 0, 0.53
8192, 0, 45, 1, 0.516
8192, 13, 13, 0, 0.664
8192, 13, 13, 1, 0.659
8192, 45, 45, 0, 0.593
8192, 45, 45, 1, 0.575
8192, 2048, 0, 0, 0.854
8192, 2048, 0, 1, 0.834
8192, 2061, 0, 0, 0.863
8192, 2061, 0, 1, 0.857
8192, 2048, 13, 0, 0.63
8192, 2048, 13, 1, 0.629
8192, 2061, 13, 0, 0.627
8192, 2061, 13, 1, 0.62

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
71 lines
2.6 KiB
Plaintext
71 lines
2.6 KiB
Plaintext
# x86 specific tunables.
|
|
# Copyright (C) 2017-2021 Free Software Foundation, Inc.
|
|
# This file is part of the GNU C Library.
|
|
|
|
# The GNU C Library is free software; you can redistribute it and/or
|
|
# modify it under the terms of the GNU Lesser General Public
|
|
# License as published by the Free Software Foundation; either
|
|
# version 2.1 of the License, or (at your option) any later version.
|
|
|
|
# The GNU C Library is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
# Lesser General Public License for more details.
|
|
|
|
# You should have received a copy of the GNU Lesser General Public
|
|
# License along with the GNU C Library; if not, see
|
|
# <https://www.gnu.org/licenses/>.
|
|
|
|
glibc {
|
|
cpu {
|
|
hwcaps {
|
|
type: STRING
|
|
}
|
|
x86_ibt {
|
|
type: STRING
|
|
}
|
|
x86_shstk {
|
|
type: STRING
|
|
}
|
|
x86_non_temporal_threshold {
|
|
type: SIZE_T
|
|
}
|
|
x86_rep_movsb_threshold {
|
|
type: SIZE_T
|
|
# Since there is overhead to set up REP MOVSB operation, REP
|
|
# MOVSB isn't faster on short data. The memcpy micro benchmark
|
|
# in glibc shows that 2KB is the approximate value above which
|
|
# REP MOVSB becomes faster than SSE2 optimization on processors
|
|
# with Enhanced REP MOVSB. Since larger register size can move
|
|
# more data with a single load and store, the threshold is
|
|
# higher with larger register size. Micro benchmarks show AVX
|
|
# REP MOVSB becomes faster approximately at 8KB. The AVX512
|
|
# threshold is extrapolated to 16KB. For machines with FSRM the
|
|
# threshold is universally set at 2112 bytes. Note: Since the
|
|
# REP MOVSB threshold must be greater than 8 times the vector
|
|
# size and the default value is 4096 * (vector size / 16), the
|
|
# default value and the minimum value must be updated at
|
|
# run-time. NB: Don't set the default value since we can't tell
|
|
# if the tunable value is set by user or not [BZ #27069].
|
|
minval: 1
|
|
}
|
|
x86_rep_stosb_threshold {
|
|
type: SIZE_T
|
|
# Since there is overhead to set up REP STOSB operation, REP STOSB
|
|
# isn't faster on short data. The memset micro benchmark in glibc
|
|
# shows that 2KB is the approximate value above which REP STOSB
|
|
# becomes faster on processors with Enhanced REP STOSB. Since the
|
|
# stored value is fixed, larger register size has minimal impact
|
|
# on threshold.
|
|
minval: 1
|
|
default: 2048
|
|
}
|
|
x86_data_cache_size {
|
|
type: SIZE_T
|
|
}
|
|
x86_shared_cache_size {
|
|
type: SIZE_T
|
|
}
|
|
}
|
|
}
|