glibc/sysdeps/x86_64/Makefile
H.J. Lu b52b0d793d x86-64: Use fxsave/xsave/xsavec in _dl_runtime_resolve [BZ #21265]
In _dl_runtime_resolve, use fxsave/xsave/xsavec to preserve all vector,
mask and bound registers.  It simplifies _dl_runtime_resolve and supports
different calling conventions.  ld.so code size is reduced by more than
1 KB.  However, use fxsave/xsave/xsavec takes a little bit more cycles
than saving and restoring vector and bound registers individually.

Latency for _dl_runtime_resolve to lookup the function, foo, from one
shared library plus libc.so:

                             Before    After     Change

Westmere (SSE)/fxsave         345      866       151%
IvyBridge (AVX)/xsave         420      643       53%
Haswell (AVX)/xsave           713      1252      75%
Skylake (AVX+MPX)/xsavec      559      719       28%
Skylake (AVX512+MPX)/xsavec   145      272       87%
Ryzen (AVX)/xsavec            280      553       97%

This is the worst case where portion of time spent for saving and
restoring registers is bigger than majority of cases.  With smaller
_dl_runtime_resolve code size, overall performance impact is negligible.

On IvyBridge, differences in build and test time of binutils with lazy
binding GCC and binutils are noises.  On Westmere, differences in
bootstrap and "makc check" time of GCC 7 with lazy binding GCC and
binutils are also noises.

	[BZ #21265]
	* sysdeps/x86/cpu-features-offsets.sym (XSAVE_STATE_SIZE_OFFSET):
	New.
	* sysdeps/x86/cpu-features.c: Include <libc-pointer-arith.h>.
	(get_common_indeces): Set xsave_state_size, xsave_state_full_size
	and bit_arch_XSAVEC_Usable if needed.
	(init_cpu_features): Remove bit_arch_Use_dl_runtime_resolve_slow
	and bit_arch_Use_dl_runtime_resolve_opt.
	* sysdeps/x86/cpu-features.h (bit_arch_Use_dl_runtime_resolve_opt):
	Removed.
	(bit_arch_Use_dl_runtime_resolve_slow): Likewise.
	(bit_arch_Prefer_No_AVX512): Updated.
	(bit_arch_MathVec_Prefer_No_AVX512): Likewise.
	(bit_arch_XSAVEC_Usable): New.
	(STATE_SAVE_OFFSET): Likewise.
	(STATE_SAVE_MASK): Likewise.
	[__ASSEMBLER__]: Include <cpu-features-offsets.h>.
	(cpu_features): Add xsave_state_size and xsave_state_full_size.
	(index_arch_Use_dl_runtime_resolve_opt): Removed.
	(index_arch_Use_dl_runtime_resolve_slow): Likewise.
	(index_arch_XSAVEC_Usable): New.
	* sysdeps/x86/cpu-tunables.c (TUNABLE_CALLBACK (set_hwcaps)):
	Support XSAVEC_Usable.  Remove Use_dl_runtime_resolve_slow.
	* sysdeps/x86_64/Makefile (tst-x86_64-1-ENV): New if tunables
	is enabled.
	* sysdeps/x86_64/dl-machine.h (elf_machine_runtime_setup):
	Replace _dl_runtime_resolve_sse, _dl_runtime_resolve_avx,
	_dl_runtime_resolve_avx_slow, _dl_runtime_resolve_avx_opt,
	_dl_runtime_resolve_avx512 and _dl_runtime_resolve_avx512_opt
	with _dl_runtime_resolve_fxsave, _dl_runtime_resolve_xsave and
	_dl_runtime_resolve_xsavec.
	* sysdeps/x86_64/dl-trampoline.S (DL_RUNTIME_UNALIGNED_VEC_SIZE):
	Removed.
	(DL_RUNTIME_RESOLVE_REALIGN_STACK): Check STATE_SAVE_ALIGNMENT
	instead of VEC_SIZE.
	(REGISTER_SAVE_BND0): Removed.
	(REGISTER_SAVE_BND1): Likewise.
	(REGISTER_SAVE_BND3): Likewise.
	(REGISTER_SAVE_RAX): Always defined to 0.
	(VMOV): Removed.
	(_dl_runtime_resolve_avx): Likewise.
	(_dl_runtime_resolve_avx_slow): Likewise.
	(_dl_runtime_resolve_avx_opt): Likewise.
	(_dl_runtime_resolve_avx512): Likewise.
	(_dl_runtime_resolve_avx512_opt): Likewise.
	(_dl_runtime_resolve_sse): Likewise.
	(_dl_runtime_resolve_sse_vex): Likewise.
	(USE_FXSAVE): New.
	(_dl_runtime_resolve_fxsave): Likewise.
	(USE_XSAVE): Likewise.
	(_dl_runtime_resolve_xsave): Likewise.
	(USE_XSAVEC): Likewise.
	(_dl_runtime_resolve_xsavec): Likewise.
	* sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_avx512):
	Removed.
	(_dl_runtime_resolve_avx512_opt): Likewise.
	(_dl_runtime_resolve_avx): Likewise.
	(_dl_runtime_resolve_avx_opt): Likewise.
	(_dl_runtime_resolve_sse): Likewise.
	(_dl_runtime_resolve_sse_vex): Likewise.
	(_dl_runtime_resolve_fxsave): New.
	(_dl_runtime_resolve_xsave): Likewise.
	(_dl_runtime_resolve_xsavec): Likewise.
2017-10-20 11:00:34 -07:00

168 lines
5.3 KiB
Makefile

# The i387 `long double' is a distinct type we support.
long-double-fcts = yes
ifeq ($(subdir),csu)
gen-as-const-headers += link-defines.sym
endif
ifeq ($(subdir),gmon)
sysdep_routines += _mcount
# We cannot compile _mcount.S with -pg because that would create
# recursive calls when ENTRY is used. Just copy the normal static
# object.
sysdep_noprof += _mcount
endif
ifeq ($(subdir),malloc)
tests += tst-mallocalign1
endif
ifeq ($(subdir),string)
sysdep_routines += cacheinfo strcasecmp_l-nonascii strncase_l-nonascii
gen-as-const-headers += locale-defines.sym
endif
ifeq ($(subdir),elf)
# There is no good reason to use MMX in x86-64 ld.so with GCC.
CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
-mno-mmx)
sysdep-dl-routines += tlsdesc dl-tlsdesc tls_get_addr
tests += ifuncmain8
modules-names += ifuncmod8
$(objpfx)ifuncmain8: $(objpfx)ifuncmod8.so
tests += tst-quad1 tst-quad2
modules-names += tst-quadmod1 tst-quadmod2
$(objpfx)tst-quad1: $(objpfx)tst-quadmod1.so
$(objpfx)tst-quad2: $(objpfx)tst-quadmod2.so
quad-pie-test += tst-quad1pie tst-quad2pie
tests += $(quad-pie-test)
tests-pie += $(quad-pie-test)
test-extras += tst-quadmod1pie tst-quadmod2pie
extra-test-objs += tst-quadmod1pie.o tst-quadmod2pie.o
$(objpfx)tst-quad1pie: $(objpfx)tst-quadmod1pie.o
$(objpfx)tst-quad2pie: $(objpfx)tst-quadmod2pie.o
CFLAGS-tst-quad1pie.c = $(PIE-ccflag)
CFLAGS-tst-quad2pie.c = $(PIE-ccflag)
tests += tst-x86_64-1
modules-names += x86_64/tst-x86_64mod-1
LDFLAGS-tst-x86_64mod-1.so = -Wl,-soname,tst-x86_64mod-1.so
ifneq (no,$(have-tunables))
# Test the state size for XSAVE when XSAVEC is disabled.
tst-x86_64-1-ENV = GLIBC_TUNABLES=glibc.tune.hwcaps=-XSAVEC_Usable
endif
$(objpfx)tst-x86_64-1: $(objpfx)x86_64/tst-x86_64mod-1.so
ifneq (no,$(have-tunables))
tests += tst-platform-1
modules-names += tst-platformmod-1 x86_64/tst-platformmod-2
CFLAGS-tst-platform-1.c = -mno-avx
CFLAGS-tst-platformmod-1.c = -mno-avx
CFLAGS-tst-platformmod-2.c = -mno-avx
LDFLAGS-tst-platformmod-2.so = -Wl,-soname,tst-platformmod-2.so
$(objpfx)tst-platform-1: $(objpfx)tst-platformmod-1.so
$(objpfx)tst-platform-1.out: $(objpfx)x86_64/tst-platformmod-2.so
# Turn off AVX512F_Usable and AVX2_Usable so that GLRO(dl_platform) is
# always set to x86_64.
tst-platform-1-ENV = LD_PRELOAD=$(objpfx)\$$PLATFORM/tst-platformmod-2.so \
GLIBC_TUNABLES=glibc.tune.hwcaps=-AVX512F_Usable,-AVX2_Usable
endif
tests += tst-audit3 tst-audit4 tst-audit5 tst-audit6 tst-audit7 \
tst-audit10 tst-sse tst-avx tst-avx512
test-extras += tst-audit4-aux tst-audit10-aux \
tst-avx-aux tst-avx512-aux
extra-test-objs += tst-audit4-aux.o tst-audit10-aux.o \
tst-avx-aux.o tst-avx512-aux.o
ifeq ($(have-insert),yes)
tests += tst-split-dynreloc
LDFLAGS-tst-split-dynreloc = -Wl,-T,$(..)sysdeps/x86_64/tst-split-dynreloc.lds
tst-split-dynreloc-ENV = LD_BIND_NOW=1
endif
modules-names += tst-auditmod3a tst-auditmod3b \
tst-auditmod4a tst-auditmod4b \
tst-auditmod5a tst-auditmod5b \
tst-auditmod6a tst-auditmod6b tst-auditmod6c \
tst-auditmod7a tst-auditmod7b \
tst-auditmod10a tst-auditmod10b \
tst-ssemod tst-avxmod tst-avx512mod
$(objpfx)tst-audit3: $(objpfx)tst-auditmod3a.so
$(objpfx)tst-audit3.out: $(objpfx)tst-auditmod3b.so
tst-audit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod3b.so
$(objpfx)tst-audit4: $(objpfx)tst-audit4-aux.o $(objpfx)tst-auditmod4a.so
$(objpfx)tst-audit4.out: $(objpfx)tst-auditmod4b.so
tst-audit4-ENV = LD_AUDIT=$(objpfx)tst-auditmod4b.so
$(objpfx)tst-audit5: $(objpfx)tst-auditmod5a.so
$(objpfx)tst-audit5.out: $(objpfx)tst-auditmod5b.so
tst-audit5-ENV = LD_AUDIT=$(objpfx)tst-auditmod5b.so
$(objpfx)tst-audit6: $(objpfx)tst-auditmod6a.so
$(objpfx)tst-audit6.out: $(objpfx)tst-auditmod6b.so \
$(objpfx)tst-auditmod6c.so
tst-audit6-ENV = LD_AUDIT=$(objpfx)tst-auditmod6b.so:$(objpfx)tst-auditmod6c.so
$(objpfx)tst-audit7: $(objpfx)tst-auditmod7a.so
$(objpfx)tst-audit7.out: $(objpfx)tst-auditmod7b.so
tst-audit7-ENV = LD_AUDIT=$(objpfx)tst-auditmod7b.so
$(objpfx)tst-audit10: $(objpfx)tst-audit10-aux.o $(objpfx)tst-auditmod10a.so
$(objpfx)tst-audit10.out: $(objpfx)tst-auditmod10b.so
tst-audit10-ENV = LD_AUDIT=$(objpfx)tst-auditmod10b.so
$(objpfx)tst-sse: $(objpfx)tst-ssemod.so
$(objpfx)tst-avx: $(objpfx)tst-avx-aux.o $(objpfx)tst-avxmod.so
$(objpfx)tst-avx512: $(objpfx)tst-avx512-aux.o $(objpfx)tst-avx512mod.so
AVX-CFLAGS=-mavx -mno-vzeroupper
CFLAGS-tst-audit4-aux.c += $(AVX-CFLAGS)
CFLAGS-tst-auditmod4a.c += $(AVX-CFLAGS)
CFLAGS-tst-auditmod4b.c += $(AVX-CFLAGS)
CFLAGS-tst-auditmod6b.c += $(AVX-CFLAGS)
CFLAGS-tst-auditmod6c.c += $(AVX-CFLAGS)
CFLAGS-tst-auditmod7b.c += $(AVX-CFLAGS)
CFLAGS-tst-avx-aux.c += $(AVX-CFLAGS)
CFLAGS-tst-avxmod.c += $(AVX-CFLAGS)
ifeq (yes,$(config-cflags-avx512))
AVX512-CFLAGS = -mavx512f
CFLAGS-tst-audit10-aux.c += $(AVX512-CFLAGS)
CFLAGS-tst-auditmod10a.c += $(AVX512-CFLAGS)
CFLAGS-tst-auditmod10b.c += $(AVX512-CFLAGS)
CFLAGS-tst-avx512-aux.c += $(AVX512-CFLAGS)
CFLAGS-tst-avx512mod.c += $(AVX512-CFLAGS)
endif
endif
ifeq ($(subdir),csu)
gen-as-const-headers += tlsdesc.sym rtld-offsets.sym
endif
$(objpfx)x86_64/tst-x86_64mod-1.os: $(objpfx)tst-x86_64mod-1.os
$(make-target-directory)
rm -f $@
ln $< $@
do-tests-clean common-mostlyclean: tst-x86_64-1-clean
.PHONY: tst-x86_64-1-clean
tst-x86_64-1-clean:
-rm -rf $(objpfx)x86_64
$(objpfx)x86_64/tst-platformmod-2.os: $(objpfx)tst-platformmod-2.os
$(make-target-directory)
rm -f $@
ln $< $@