mirror of
https://sourceware.org/git/glibc.git
synced 2025-01-16 05:40:08 +00:00
b52b0d793d
In _dl_runtime_resolve, use fxsave/xsave/xsavec to preserve all vector, mask and bound registers. It simplifies _dl_runtime_resolve and supports different calling conventions. ld.so code size is reduced by more than 1 KB. However, using fxsave/xsave/xsavec takes a few more cycles than saving and restoring vector and bound registers individually.

Latency for _dl_runtime_resolve to look up the function, foo, from one shared library plus libc.so:

                                 Before     After     Change
Westmere (SSE)/fxsave             345        866       151%
IvyBridge (AVX)/xsave             420        643        53%
Haswell (AVX)/xsave               713       1252        75%
Skylake (AVX+MPX)/xsavec          559        719        28%
Skylake (AVX512+MPX)/xsavec       145        272        87%
Ryzen (AVX)/xsavec                280        553        97%

This is the worst case, where the portion of time spent saving and restoring registers is larger than in the majority of cases. With smaller _dl_runtime_resolve code size, the overall performance impact is negligible.

On IvyBridge, differences in build and test time of binutils with lazy binding GCC and binutils are within the noise. On Westmere, differences in bootstrap and "make check" time of GCC 7 with lazy binding GCC and binutils are also within the noise.

[BZ #21265] * sysdeps/x86/cpu-features-offsets.sym (XSAVE_STATE_SIZE_OFFSET): New. * sysdeps/x86/cpu-features.c: Include <libc-pointer-arith.h>. (get_common_indeces): Set xsave_state_size, xsave_state_full_size and bit_arch_XSAVEC_Usable if needed. (init_cpu_features): Remove bit_arch_Use_dl_runtime_resolve_slow and bit_arch_Use_dl_runtime_resolve_opt. * sysdeps/x86/cpu-features.h (bit_arch_Use_dl_runtime_resolve_opt): Removed. (bit_arch_Use_dl_runtime_resolve_slow): Likewise. (bit_arch_Prefer_No_AVX512): Updated. (bit_arch_MathVec_Prefer_No_AVX512): Likewise. (bit_arch_XSAVEC_Usable): New. (STATE_SAVE_OFFSET): Likewise. (STATE_SAVE_MASK): Likewise. [__ASSEMBLER__]: Include <cpu-features-offsets.h>. (cpu_features): Add xsave_state_size and xsave_state_full_size. (index_arch_Use_dl_runtime_resolve_opt): Removed. (index_arch_Use_dl_runtime_resolve_slow): Likewise.
(index_arch_XSAVEC_Usable): New. * sysdeps/x86/cpu-tunables.c (TUNABLE_CALLBACK (set_hwcaps)): Support XSAVEC_Usable. Remove Use_dl_runtime_resolve_slow. * sysdeps/x86_64/Makefile (tst-x86_64-1-ENV): New if tunables is enabled. * sysdeps/x86_64/dl-machine.h (elf_machine_runtime_setup): Replace _dl_runtime_resolve_sse, _dl_runtime_resolve_avx, _dl_runtime_resolve_avx_slow, _dl_runtime_resolve_avx_opt, _dl_runtime_resolve_avx512 and _dl_runtime_resolve_avx512_opt with _dl_runtime_resolve_fxsave, _dl_runtime_resolve_xsave and _dl_runtime_resolve_xsavec. * sysdeps/x86_64/dl-trampoline.S (DL_RUNTIME_UNALIGNED_VEC_SIZE): Removed. (DL_RUNTIME_RESOLVE_REALIGN_STACK): Check STATE_SAVE_ALIGNMENT instead of VEC_SIZE. (REGISTER_SAVE_BND0): Removed. (REGISTER_SAVE_BND1): Likewise. (REGISTER_SAVE_BND3): Likewise. (REGISTER_SAVE_RAX): Always defined to 0. (VMOV): Removed. (_dl_runtime_resolve_avx): Likewise. (_dl_runtime_resolve_avx_slow): Likewise. (_dl_runtime_resolve_avx_opt): Likewise. (_dl_runtime_resolve_avx512): Likewise. (_dl_runtime_resolve_avx512_opt): Likewise. (_dl_runtime_resolve_sse): Likewise. (_dl_runtime_resolve_sse_vex): Likewise. (USE_FXSAVE): New. (_dl_runtime_resolve_fxsave): Likewise. (USE_XSAVE): Likewise. (_dl_runtime_resolve_xsave): Likewise. (USE_XSAVEC): Likewise. (_dl_runtime_resolve_xsavec): Likewise. * sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_avx512): Removed. (_dl_runtime_resolve_avx512_opt): Likewise. (_dl_runtime_resolve_avx): Likewise. (_dl_runtime_resolve_avx_opt): Likewise. (_dl_runtime_resolve_sse): Likewise. (_dl_runtime_resolve_sse_vex): Likewise. (_dl_runtime_resolve_fxsave): New. (_dl_runtime_resolve_xsave): Likewise. (_dl_runtime_resolve_xsavec): Likewise.
168 lines
5.3 KiB
Makefile
168 lines
5.3 KiB
Makefile
# The i387 `long double' is a distinct type we support.
long-double-fcts = yes

# Settings that apply only when building the csu subdirectory.
ifeq ($(subdir),csu)
gen-as-const-headers += link-defines.sym
endif

# Settings that apply only when building the gmon subdirectory.
ifeq ($(subdir),gmon)
sysdep_routines += _mcount
# We cannot compile _mcount.S with -pg because that would create
# recursive calls when ENTRY is used.  Just copy the normal static
# object.
sysdep_noprof += _mcount
endif

# Settings that apply only when building the malloc subdirectory.
ifeq ($(subdir),malloc)
tests += tst-mallocalign1
endif

# Settings that apply only when building the string subdirectory.
ifeq ($(subdir),string)
sysdep_routines += cacheinfo strcasecmp_l-nonascii strncase_l-nonascii
gen-as-const-headers += locale-defines.sym
endif

# Settings that apply only when building the elf subdirectory: extra
# dynamic-linker routines plus the x86-64 specific tests, their helper
# modules, link dependencies, environments and compiler flags.
ifeq ($(subdir),elf)
# There is no good reason to use MMX in x86-64 ld.so with GCC.
# -mno-mmx is added only when the object being built ($(@F)) is one of
# the $(all-rtld-routines) .os files.
CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
                   -mno-mmx)

sysdep-dl-routines += tlsdesc dl-tlsdesc tls_get_addr

tests += ifuncmain8
modules-names += ifuncmod8

$(objpfx)ifuncmain8: $(objpfx)ifuncmod8.so

tests += tst-quad1 tst-quad2
modules-names += tst-quadmod1 tst-quadmod2

$(objpfx)tst-quad1: $(objpfx)tst-quadmod1.so
$(objpfx)tst-quad2: $(objpfx)tst-quadmod2.so

# PIE variants of the tst-quad tests; they depend on plain object files
# rather than on shared objects.
quad-pie-test += tst-quad1pie tst-quad2pie
tests += $(quad-pie-test)
tests-pie += $(quad-pie-test)
test-extras += tst-quadmod1pie tst-quadmod2pie
extra-test-objs += tst-quadmod1pie.o tst-quadmod2pie.o

$(objpfx)tst-quad1pie: $(objpfx)tst-quadmod1pie.o
$(objpfx)tst-quad2pie: $(objpfx)tst-quadmod2pie.o

CFLAGS-tst-quad1pie.c = $(PIE-ccflag)
CFLAGS-tst-quad2pie.c = $(PIE-ccflag)

# The module for tst-x86_64-1 is listed under the x86_64/ subdirectory
# while its soname is kept as plain tst-x86_64mod-1.so.
tests += tst-x86_64-1
modules-names += x86_64/tst-x86_64mod-1
LDFLAGS-tst-x86_64mod-1.so = -Wl,-soname,tst-x86_64mod-1.so
ifneq (no,$(have-tunables))
# Test the state size for XSAVE when XSAVEC is disabled.
tst-x86_64-1-ENV = GLIBC_TUNABLES=glibc.tune.hwcaps=-XSAVEC_Usable
endif

$(objpfx)tst-x86_64-1: $(objpfx)x86_64/tst-x86_64mod-1.so

# tst-platform-1 is only built when tunables are available, because its
# environment is set up through GLIBC_TUNABLES.
ifneq (no,$(have-tunables))
tests += tst-platform-1
modules-names += tst-platformmod-1 x86_64/tst-platformmod-2
CFLAGS-tst-platform-1.c = -mno-avx
CFLAGS-tst-platformmod-1.c = -mno-avx
CFLAGS-tst-platformmod-2.c = -mno-avx
LDFLAGS-tst-platformmod-2.so = -Wl,-soname,tst-platformmod-2.so
$(objpfx)tst-platform-1: $(objpfx)tst-platformmod-1.so
$(objpfx)tst-platform-1.out: $(objpfx)x86_64/tst-platformmod-2.so
# Turn off AVX512F_Usable and AVX2_Usable so that GLRO(dl_platform) is
# always set to x86_64.
# NOTE(review): `\$$PLATFORM' reaches the test environment as
# `\$PLATFORM'; presumably the backslash keeps the shell from expanding
# it so that the dynamic loader sees the literal $PLATFORM token --
# confirm against the test harness.
tst-platform-1-ENV = LD_PRELOAD=$(objpfx)\$$PLATFORM/tst-platformmod-2.so \
        GLIBC_TUNABLES=glibc.tune.hwcaps=-AVX512F_Usable,-AVX2_Usable
endif

# Audit and vector-register tests, together with their auxiliary
# objects.
tests += tst-audit3 tst-audit4 tst-audit5 tst-audit6 tst-audit7 \
         tst-audit10 tst-sse tst-avx tst-avx512
test-extras += tst-audit4-aux tst-audit10-aux \
               tst-avx-aux tst-avx512-aux
extra-test-objs += tst-audit4-aux.o tst-audit10-aux.o \
                   tst-avx-aux.o tst-avx512-aux.o

# tst-split-dynreloc is linked with a dedicated linker script and run
# with LD_BIND_NOW=1; it is only enabled when have-insert is yes.
ifeq ($(have-insert),yes)
tests += tst-split-dynreloc
LDFLAGS-tst-split-dynreloc = -Wl,-T,$(..)sysdeps/x86_64/tst-split-dynreloc.lds
tst-split-dynreloc-ENV = LD_BIND_NOW=1
endif

modules-names += tst-auditmod3a tst-auditmod3b \
                 tst-auditmod4a tst-auditmod4b \
                 tst-auditmod5a tst-auditmod5b \
                 tst-auditmod6a tst-auditmod6b tst-auditmod6c \
                 tst-auditmod7a tst-auditmod7b \
                 tst-auditmod10a tst-auditmod10b \
                 tst-ssemod tst-avxmod tst-avx512mod

# For each audit test: the test program depends on its "a" module, the
# test output additionally depends on the audit module(s), which are
# passed to the test run through LD_AUDIT.
$(objpfx)tst-audit3: $(objpfx)tst-auditmod3a.so
$(objpfx)tst-audit3.out: $(objpfx)tst-auditmod3b.so
tst-audit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod3b.so

$(objpfx)tst-audit4: $(objpfx)tst-audit4-aux.o $(objpfx)tst-auditmod4a.so
$(objpfx)tst-audit4.out: $(objpfx)tst-auditmod4b.so
tst-audit4-ENV = LD_AUDIT=$(objpfx)tst-auditmod4b.so

$(objpfx)tst-audit5: $(objpfx)tst-auditmod5a.so
$(objpfx)tst-audit5.out: $(objpfx)tst-auditmod5b.so
tst-audit5-ENV = LD_AUDIT=$(objpfx)tst-auditmod5b.so

$(objpfx)tst-audit6: $(objpfx)tst-auditmod6a.so
$(objpfx)tst-audit6.out: $(objpfx)tst-auditmod6b.so \
                         $(objpfx)tst-auditmod6c.so
tst-audit6-ENV = LD_AUDIT=$(objpfx)tst-auditmod6b.so:$(objpfx)tst-auditmod6c.so

$(objpfx)tst-audit7: $(objpfx)tst-auditmod7a.so
$(objpfx)tst-audit7.out: $(objpfx)tst-auditmod7b.so
tst-audit7-ENV = LD_AUDIT=$(objpfx)tst-auditmod7b.so

$(objpfx)tst-audit10: $(objpfx)tst-audit10-aux.o $(objpfx)tst-auditmod10a.so
$(objpfx)tst-audit10.out: $(objpfx)tst-auditmod10b.so
tst-audit10-ENV = LD_AUDIT=$(objpfx)tst-auditmod10b.so

$(objpfx)tst-sse: $(objpfx)tst-ssemod.so
$(objpfx)tst-avx: $(objpfx)tst-avx-aux.o $(objpfx)tst-avxmod.so
$(objpfx)tst-avx512: $(objpfx)tst-avx512-aux.o $(objpfx)tst-avx512mod.so

# Sources that are compiled with AVX enabled.
AVX-CFLAGS = -mavx -mno-vzeroupper
CFLAGS-tst-audit4-aux.c += $(AVX-CFLAGS)
CFLAGS-tst-auditmod4a.c += $(AVX-CFLAGS)
CFLAGS-tst-auditmod4b.c += $(AVX-CFLAGS)
CFLAGS-tst-auditmod6b.c += $(AVX-CFLAGS)
CFLAGS-tst-auditmod6c.c += $(AVX-CFLAGS)
CFLAGS-tst-auditmod7b.c += $(AVX-CFLAGS)
CFLAGS-tst-avx-aux.c += $(AVX-CFLAGS)
CFLAGS-tst-avxmod.c += $(AVX-CFLAGS)
# The AVX512 flags are added only when config-cflags-avx512 is yes.
ifeq (yes,$(config-cflags-avx512))
AVX512-CFLAGS = -mavx512f
CFLAGS-tst-audit10-aux.c += $(AVX512-CFLAGS)
CFLAGS-tst-auditmod10a.c += $(AVX512-CFLAGS)
CFLAGS-tst-auditmod10b.c += $(AVX512-CFLAGS)
CFLAGS-tst-avx512-aux.c += $(AVX512-CFLAGS)
CFLAGS-tst-avx512mod.c += $(AVX512-CFLAGS)
endif
endif

# Additional generated assembler-constant headers for the csu
# subdirectory.
ifeq ($(subdir),csu)
gen-as-const-headers += tlsdesc.sym rtld-offsets.sym
endif

# Provide x86_64/tst-x86_64mod-1.os as a hard link to the object built
# directly in the object directory.  Any stale target is removed first
# because plain `ln' fails when the destination already exists.
# NOTE(review): $(make-target-directory) is defined elsewhere;
# presumably it creates the directory that will hold $@ -- confirm.
$(objpfx)x86_64/tst-x86_64mod-1.os: $(objpfx)tst-x86_64mod-1.os
	$(make-target-directory)
	rm -f $@
	ln $< $@

# Hook removal of the x86_64 subdirectory of the object directory into
# the do-tests-clean and common-mostlyclean targets.  The leading `-'
# on the recipe line makes make ignore a failing rm.
do-tests-clean common-mostlyclean: tst-x86_64-1-clean

.PHONY: tst-x86_64-1-clean
tst-x86_64-1-clean:
	-rm -rf $(objpfx)x86_64

# Provide x86_64/tst-platformmod-2.os as a hard link to the object built
# directly in the object directory.  Any stale target is removed first
# because plain `ln' fails when the destination already exists.
# NOTE(review): $(make-target-directory) is defined elsewhere;
# presumably it creates the directory that will hold $@ -- confirm.
$(objpfx)x86_64/tst-platformmod-2.os: $(objpfx)tst-platformmod-2.os
	$(make-target-directory)
	rm -f $@
	ln $< $@