glibc/sysdeps/x86/cpu-features.h
H.J. Lu b52b0d793d x86-64: Use fxsave/xsave/xsavec in _dl_runtime_resolve [BZ #21265]
In _dl_runtime_resolve, use fxsave/xsave/xsavec to preserve all vector,
mask and bound registers.  It simplifies _dl_runtime_resolve and supports
different calling conventions.  ld.so code size is reduced by more than
1 KB.  However, use fxsave/xsave/xsavec takes a little bit more cycles
than saving and restoring vector and bound registers individually.

Latency for _dl_runtime_resolve to lookup the function, foo, from one
shared library plus libc.so:

                             Before    After     Change

Westmere (SSE)/fxsave         345      866       151%
IvyBridge (AVX)/xsave         420      643       53%
Haswell (AVX)/xsave           713      1252      75%
Skylake (AVX+MPX)/xsavec      559      719       28%
Skylake (AVX512+MPX)/xsavec   145      272       87%
Ryzen (AVX)/xsavec            280      553       97%

This is the worst case where portion of time spent for saving and
restoring registers is bigger than majority of cases.  With smaller
_dl_runtime_resolve code size, overall performance impact is negligible.

On IvyBridge, differences in build and test time of binutils with lazy
binding GCC and binutils are noises.  On Westmere, differences in
bootstrap and "makc check" time of GCC 7 with lazy binding GCC and
binutils are also noises.

	[BZ #21265]
	* sysdeps/x86/cpu-features-offsets.sym (XSAVE_STATE_SIZE_OFFSET):
	New.
	* sysdeps/x86/cpu-features.c: Include <libc-pointer-arith.h>.
	(get_common_indeces): Set xsave_state_size, xsave_state_full_size
	and bit_arch_XSAVEC_Usable if needed.
	(init_cpu_features): Remove bit_arch_Use_dl_runtime_resolve_slow
	and bit_arch_Use_dl_runtime_resolve_opt.
	* sysdeps/x86/cpu-features.h (bit_arch_Use_dl_runtime_resolve_opt):
	Removed.
	(bit_arch_Use_dl_runtime_resolve_slow): Likewise.
	(bit_arch_Prefer_No_AVX512): Updated.
	(bit_arch_MathVec_Prefer_No_AVX512): Likewise.
	(bit_arch_XSAVEC_Usable): New.
	(STATE_SAVE_OFFSET): Likewise.
	(STATE_SAVE_MASK): Likewise.
	[__ASSEMBLER__]: Include <cpu-features-offsets.h>.
	(cpu_features): Add xsave_state_size and xsave_state_full_size.
	(index_arch_Use_dl_runtime_resolve_opt): Removed.
	(index_arch_Use_dl_runtime_resolve_slow): Likewise.
	(index_arch_XSAVEC_Usable): New.
	* sysdeps/x86/cpu-tunables.c (TUNABLE_CALLBACK (set_hwcaps)):
	Support XSAVEC_Usable.  Remove Use_dl_runtime_resolve_slow.
	* sysdeps/x86_64/Makefile (tst-x86_64-1-ENV): New if tunables
	is enabled.
	* sysdeps/x86_64/dl-machine.h (elf_machine_runtime_setup):
	Replace _dl_runtime_resolve_sse, _dl_runtime_resolve_avx,
	_dl_runtime_resolve_avx_slow, _dl_runtime_resolve_avx_opt,
	_dl_runtime_resolve_avx512 and _dl_runtime_resolve_avx512_opt
	with _dl_runtime_resolve_fxsave, _dl_runtime_resolve_xsave and
	_dl_runtime_resolve_xsavec.
	* sysdeps/x86_64/dl-trampoline.S (DL_RUNTIME_UNALIGNED_VEC_SIZE):
	Removed.
	(DL_RUNTIME_RESOLVE_REALIGN_STACK): Check STATE_SAVE_ALIGNMENT
	instead of VEC_SIZE.
	(REGISTER_SAVE_BND0): Removed.
	(REGISTER_SAVE_BND1): Likewise.
	(REGISTER_SAVE_BND3): Likewise.
	(REGISTER_SAVE_RAX): Always defined to 0.
	(VMOV): Removed.
	(_dl_runtime_resolve_avx): Likewise.
	(_dl_runtime_resolve_avx_slow): Likewise.
	(_dl_runtime_resolve_avx_opt): Likewise.
	(_dl_runtime_resolve_avx512): Likewise.
	(_dl_runtime_resolve_avx512_opt): Likewise.
	(_dl_runtime_resolve_sse): Likewise.
	(_dl_runtime_resolve_sse_vex): Likewise.
	(USE_FXSAVE): New.
	(_dl_runtime_resolve_fxsave): Likewise.
	(USE_XSAVE): Likewise.
	(_dl_runtime_resolve_xsave): Likewise.
	(USE_XSAVEC): Likewise.
	(_dl_runtime_resolve_xsavec): Likewise.
	* sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_avx512):
	Removed.
	(_dl_runtime_resolve_avx512_opt): Likewise.
	(_dl_runtime_resolve_avx): Likewise.
	(_dl_runtime_resolve_avx_opt): Likewise.
	(_dl_runtime_resolve_sse): Likewise.
	(_dl_runtime_resolve_sse_vex): Likewise.
	(_dl_runtime_resolve_fxsave): New.
	(_dl_runtime_resolve_xsave): Likewise.
	(_dl_runtime_resolve_xsavec): Likewise.
2017-10-20 11:00:34 -07:00

296 lines
10 KiB
C

/* This file is part of the GNU C Library.
Copyright (C) 2008-2017 Free Software Foundation, Inc.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#ifndef cpu_features_h
#define cpu_features_h
#define bit_arch_Fast_Rep_String (1 << 0)
#define bit_arch_Fast_Copy_Backward (1 << 1)
#define bit_arch_Slow_BSF (1 << 2)
#define bit_arch_Fast_Unaligned_Load (1 << 4)
#define bit_arch_Prefer_PMINUB_for_stringop (1 << 5)
#define bit_arch_AVX_Usable (1 << 6)
#define bit_arch_FMA_Usable (1 << 7)
#define bit_arch_FMA4_Usable (1 << 8)
#define bit_arch_Slow_SSE4_2 (1 << 9)
#define bit_arch_AVX2_Usable (1 << 10)
#define bit_arch_AVX_Fast_Unaligned_Load (1 << 11)
#define bit_arch_AVX512F_Usable (1 << 12)
#define bit_arch_AVX512DQ_Usable (1 << 13)
#define bit_arch_I586 (1 << 14)
#define bit_arch_I686 (1 << 15)
#define bit_arch_Prefer_MAP_32BIT_EXEC (1 << 16)
#define bit_arch_Prefer_No_VZEROUPPER (1 << 17)
#define bit_arch_Fast_Unaligned_Copy (1 << 18)
#define bit_arch_Prefer_ERMS (1 << 19)
#define bit_arch_Prefer_No_AVX512 (1 << 20)
#define bit_arch_MathVec_Prefer_No_AVX512 (1 << 21)
#define bit_arch_XSAVEC_Usable (1 << 22)
/* CPUID Feature flags. */
/* COMMON_CPUID_INDEX_1. */
#define bit_cpu_CX8 (1 << 8)
#define bit_cpu_CMOV (1 << 15)
#define bit_cpu_SSE (1 << 25)
#define bit_cpu_SSE2 (1 << 26)
#define bit_cpu_SSSE3 (1 << 9)
#define bit_cpu_SSE4_1 (1 << 19)
#define bit_cpu_SSE4_2 (1 << 20)
#define bit_cpu_OSXSAVE (1 << 27)
#define bit_cpu_AVX (1 << 28)
#define bit_cpu_POPCOUNT (1 << 23)
#define bit_cpu_FMA (1 << 12)
#define bit_cpu_FMA4 (1 << 16)
#define bit_cpu_HTT (1 << 28)
#define bit_cpu_LZCNT (1 << 5)
#define bit_cpu_MOVBE (1 << 22)
#define bit_cpu_POPCNT (1 << 23)
/* COMMON_CPUID_INDEX_7. */
#define bit_cpu_BMI1 (1 << 3)
#define bit_cpu_BMI2 (1 << 8)
#define bit_cpu_ERMS (1 << 9)
#define bit_cpu_RTM (1 << 11)
#define bit_cpu_AVX2 (1 << 5)
#define bit_cpu_AVX512F (1 << 16)
#define bit_cpu_AVX512DQ (1 << 17)
#define bit_cpu_AVX512PF (1 << 26)
#define bit_cpu_AVX512ER (1 << 27)
#define bit_cpu_AVX512CD (1 << 28)
#define bit_cpu_AVX512BW (1 << 30)
#define bit_cpu_AVX512VL (1u << 31)
#define bit_cpu_IBT (1u << 20)
#define bit_cpu_SHSTK (1u << 7)
/* XCR0 Feature flags. */
#define bit_XMM_state (1 << 1)
#define bit_YMM_state (1 << 2)
#define bit_Opmask_state (1 << 5)
#define bit_ZMM0_15_state (1 << 6)
#define bit_ZMM16_31_state (1 << 7)
/* The integer bit array index for the first set of internal feature bits. */
#define FEATURE_INDEX_1 0
/* The current maximum size of the feature integer bit array. */
#define FEATURE_INDEX_MAX 1
/* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need
space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be
aligned to 16 bytes for fxsave and 64 bytes for xsave. */
#define STATE_SAVE_OFFSET (8 * 7 + 8)
/* Save SSE, AVX, AVX512, mask and bound registers. */
#define STATE_SAVE_MASK \
((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
#ifdef __ASSEMBLER__
# include <cpu-features-offsets.h>
#else /* __ASSEMBLER__ */
enum
{
COMMON_CPUID_INDEX_1 = 0,
COMMON_CPUID_INDEX_7,
COMMON_CPUID_INDEX_80000001, /* for AMD */
/* Keep the following line at the end. */
COMMON_CPUID_INDEX_MAX
};
struct cpu_features
{
enum cpu_features_kind
{
arch_kind_unknown = 0,
arch_kind_intel,
arch_kind_amd,
arch_kind_other
} kind;
int max_cpuid;
struct cpuid_registers
{
unsigned int eax;
unsigned int ebx;
unsigned int ecx;
unsigned int edx;
} cpuid[COMMON_CPUID_INDEX_MAX];
unsigned int family;
unsigned int model;
/* The state size for XSAVEC or XSAVE. The type must be unsigned long
int so that we use
sub xsave_state_size_offset(%rip) %RSP_LP
in _dl_runtime_resolve. */
unsigned long int xsave_state_size;
/* The full state size for XSAVE when XSAVEC is disabled by
GLIBC_TUNABLES=glibc.tune.hwcaps=-XSAVEC_Usable
*/
unsigned int xsave_state_full_size;
unsigned int feature[FEATURE_INDEX_MAX];
/* Data cache size for use in memory and string routines, typically
L1 size. */
unsigned long int data_cache_size;
/* Shared cache size for use in memory and string routines, typically
L2 or L3 size. */
unsigned long int shared_cache_size;
/* Threshold to use non temporal store. */
unsigned long int non_temporal_threshold;
};
/* Used from outside of glibc to get access to the CPU features
structure. */
extern const struct cpu_features *__get_cpu_features (void)
__attribute__ ((const));
# if defined (_LIBC) && !IS_IN (nonlib)
/* Unused for x86. */
# define INIT_ARCH()
# define __get_cpu_features() (&GLRO(dl_x86_cpu_features))
# endif
/* Only used directly in cpu-features.c. */
# define CPU_FEATURES_CPU_P(ptr, name) \
((ptr->cpuid[index_cpu_##name].reg_##name & (bit_cpu_##name)) != 0)
# define CPU_FEATURES_ARCH_P(ptr, name) \
((ptr->feature[index_arch_##name] & (bit_arch_##name)) != 0)
/* HAS_* evaluates to true if we may use the feature at runtime. */
# define HAS_CPU_FEATURE(name) \
CPU_FEATURES_CPU_P (__get_cpu_features (), name)
# define HAS_ARCH_FEATURE(name) \
CPU_FEATURES_ARCH_P (__get_cpu_features (), name)
# define index_cpu_CX8 COMMON_CPUID_INDEX_1
# define index_cpu_CMOV COMMON_CPUID_INDEX_1
# define index_cpu_SSE COMMON_CPUID_INDEX_1
# define index_cpu_SSE2 COMMON_CPUID_INDEX_1
# define index_cpu_SSSE3 COMMON_CPUID_INDEX_1
# define index_cpu_SSE4_1 COMMON_CPUID_INDEX_1
# define index_cpu_SSE4_2 COMMON_CPUID_INDEX_1
# define index_cpu_AVX COMMON_CPUID_INDEX_1
# define index_cpu_AVX2 COMMON_CPUID_INDEX_7
# define index_cpu_AVX512F COMMON_CPUID_INDEX_7
# define index_cpu_AVX512DQ COMMON_CPUID_INDEX_7
# define index_cpu_AVX512PF COMMON_CPUID_INDEX_7
# define index_cpu_AVX512ER COMMON_CPUID_INDEX_7
# define index_cpu_AVX512CD COMMON_CPUID_INDEX_7
# define index_cpu_AVX512BW COMMON_CPUID_INDEX_7
# define index_cpu_AVX512VL COMMON_CPUID_INDEX_7
# define index_cpu_ERMS COMMON_CPUID_INDEX_7
# define index_cpu_RTM COMMON_CPUID_INDEX_7
# define index_cpu_FMA COMMON_CPUID_INDEX_1
# define index_cpu_FMA4 COMMON_CPUID_INDEX_80000001
# define index_cpu_POPCOUNT COMMON_CPUID_INDEX_1
# define index_cpu_OSXSAVE COMMON_CPUID_INDEX_1
# define index_cpu_HTT COMMON_CPUID_INDEX_1
# define index_cpu_BMI1 COMMON_CPUID_INDEX_7
# define index_cpu_BMI2 COMMON_CPUID_INDEX_7
# define index_cpu_LZCNT COMMON_CPUID_INDEX_1
# define index_cpu_MOVBE COMMON_CPUID_INDEX_1
# define index_cpu_POPCNT COMMON_CPUID_INDEX_1
# define index_cpu_IBT COMMON_CPUID_INDEX_7
# define index_cpu_SHSTK COMMON_CPUID_INDEX_7
# define reg_CX8 edx
# define reg_CMOV edx
# define reg_SSE edx
# define reg_SSE2 edx
# define reg_SSSE3 ecx
# define reg_SSE4_1 ecx
# define reg_SSE4_2 ecx
# define reg_AVX ecx
# define reg_AVX2 ebx
# define reg_AVX512F ebx
# define reg_AVX512DQ ebx
# define reg_AVX512PF ebx
# define reg_AVX512ER ebx
# define reg_AVX512CD ebx
# define reg_AVX512BW ebx
# define reg_AVX512VL ebx
# define reg_ERMS ebx
# define reg_RTM ebx
# define reg_FMA ecx
# define reg_FMA4 ecx
# define reg_POPCOUNT ecx
# define reg_OSXSAVE ecx
# define reg_HTT edx
# define reg_BMI1 ebx
# define reg_BMI2 ebx
# define reg_LZCNT ecx
# define reg_MOVBE ecx
# define reg_POPCNT ecx
# define reg_IBT edx
# define reg_SHSTK ecx
# define index_arch_Fast_Rep_String FEATURE_INDEX_1
# define index_arch_Fast_Copy_Backward FEATURE_INDEX_1
# define index_arch_Slow_BSF FEATURE_INDEX_1
# define index_arch_Fast_Unaligned_Load FEATURE_INDEX_1
# define index_arch_Prefer_PMINUB_for_stringop FEATURE_INDEX_1
# define index_arch_AVX_Usable FEATURE_INDEX_1
# define index_arch_FMA_Usable FEATURE_INDEX_1
# define index_arch_FMA4_Usable FEATURE_INDEX_1
# define index_arch_Slow_SSE4_2 FEATURE_INDEX_1
# define index_arch_AVX2_Usable FEATURE_INDEX_1
# define index_arch_AVX_Fast_Unaligned_Load FEATURE_INDEX_1
# define index_arch_AVX512F_Usable FEATURE_INDEX_1
# define index_arch_AVX512DQ_Usable FEATURE_INDEX_1
# define index_arch_I586 FEATURE_INDEX_1
# define index_arch_I686 FEATURE_INDEX_1
# define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1
# define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
# define index_arch_Fast_Unaligned_Copy FEATURE_INDEX_1
# define index_arch_Prefer_ERMS FEATURE_INDEX_1
# define index_arch_Prefer_No_AVX512 FEATURE_INDEX_1
# define index_arch_MathVec_Prefer_No_AVX512 FEATURE_INDEX_1
# define index_arch_XSAVEC_Usable FEATURE_INDEX_1
#endif /* !__ASSEMBLER__ */
#ifdef __x86_64__
# define HAS_CPUID 1
#elif defined __i586__ || defined __pentium__
# define HAS_CPUID 1
# define HAS_I586 1
# define HAS_I686 HAS_ARCH_FEATURE (I686)
#elif (defined __i686__ || defined __pentiumpro__ \
|| defined __pentium4__ || defined __nocona__ \
|| defined __atom__ || defined __core2__ \
|| defined __corei7__ || defined __corei7_avx__ \
|| defined __core_avx2__ || defined __nehalem__ \
|| defined __sandybridge__ || defined __haswell__ \
|| defined __knl__ || defined __bonnell__ \
|| defined __silvermont__ \
|| defined __k6__ || defined __k8__ \
|| defined __athlon__ || defined __amdfam10__ \
|| defined __bdver1__ || defined __bdver2__ \
|| defined __bdver3__ || defined __bdver4__ \
|| defined __btver1__ || defined __btver2__)
# define HAS_CPUID 1
# define HAS_I586 1
# define HAS_I686 1
#else
# define HAS_CPUID 0
# define HAS_I586 HAS_ARCH_FEATURE (I586)
# define HAS_I686 HAS_ARCH_FEATURE (I686)
#endif
#endif /* cpu_features_h */