mirror of
https://sourceware.org/git/glibc.git
synced 2024-12-23 11:20:07 +00:00
x86-64: Improve branch prediction in _dl_runtime_resolve_avx512_opt [BZ #21258]
On Skylake server, _dl_runtime_resolve_avx512_opt is used to preserve the first 8 vector registers. The code layout is:

  if only %xmm0 - %xmm7 registers are used
    preserve %xmm0 - %xmm7 registers
  if only %ymm0 - %ymm7 registers are used
    preserve %ymm0 - %ymm7 registers
  preserve %zmm0 - %zmm7 registers

Branch prediction always executes the fallthrough code path to preserve %zmm0 - %zmm7 registers speculatively, even though only %xmm0 - %xmm7 registers are used. This leads to lower CPU frequency on Skylake server. This patch changes the fallthrough code path to preserve %xmm0 - %xmm7 registers instead:

  if whole %zmm0 - %zmm7 registers are used
    preserve %zmm0 - %zmm7 registers
  if only %ymm0 - %ymm7 registers are used
    preserve %ymm0 - %ymm7 registers
  preserve %xmm0 - %xmm7 registers

Tested on Skylake server.

  [BZ #21258]
  * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt): Define only if _dl_runtime_resolve is defined to _dl_runtime_resolve_sse_vex.
  * sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt): Fallthrough to _dl_runtime_resolve_sse_vex.
This commit is contained in:
parent
a640393a18
commit
c15f8eb50c
@ -1,3 +1,12 @@
|
|||||||
|
2017-03-21 H.J. Lu <hongjiu.lu@intel.com>
|
||||||
|
|
||||||
|
[BZ #21258]
|
||||||
|
* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt):
|
||||||
|
Define only if _dl_runtime_resolve is defined to
|
||||||
|
_dl_runtime_resolve_sse_vex.
|
||||||
|
* sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt):
|
||||||
|
Fallthrough to _dl_runtime_resolve_sse_vex.
|
||||||
|
|
||||||
2017-03-21 Joseph Myers <joseph@codesourcery.com>
|
2017-03-21 Joseph Myers <joseph@codesourcery.com>
|
||||||
|
|
||||||
* INSTALL: Regenerated.
|
* INSTALL: Regenerated.
|
||||||
|
@ -87,11 +87,9 @@
|
|||||||
#endif
|
#endif
|
||||||
#define VEC(i) zmm##i
|
#define VEC(i) zmm##i
|
||||||
#define _dl_runtime_resolve _dl_runtime_resolve_avx512
|
#define _dl_runtime_resolve _dl_runtime_resolve_avx512
|
||||||
#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt
|
|
||||||
#define _dl_runtime_profile _dl_runtime_profile_avx512
|
#define _dl_runtime_profile _dl_runtime_profile_avx512
|
||||||
#include "dl-trampoline.h"
|
#include "dl-trampoline.h"
|
||||||
#undef _dl_runtime_resolve
|
#undef _dl_runtime_resolve
|
||||||
#undef _dl_runtime_resolve_opt
|
|
||||||
#undef _dl_runtime_profile
|
#undef _dl_runtime_profile
|
||||||
#undef VEC
|
#undef VEC
|
||||||
#undef VMOV
|
#undef VMOV
|
||||||
@ -145,4 +143,5 @@
|
|||||||
# define VMOV vmovdqu
|
# define VMOV vmovdqu
|
||||||
#endif
|
#endif
|
||||||
#define _dl_runtime_resolve _dl_runtime_resolve_sse_vex
|
#define _dl_runtime_resolve _dl_runtime_resolve_sse_vex
|
||||||
|
#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt
|
||||||
#include "dl-trampoline.h"
|
#include "dl-trampoline.h"
|
||||||
|
@ -129,19 +129,20 @@ _dl_runtime_resolve_opt:
|
|||||||
# YMM state isn't in use.
|
# YMM state isn't in use.
|
||||||
PRESERVE_BND_REGS_PREFIX
|
PRESERVE_BND_REGS_PREFIX
|
||||||
jz _dl_runtime_resolve_sse_vex
|
jz _dl_runtime_resolve_sse_vex
|
||||||
# elif VEC_SIZE == 64
|
# elif VEC_SIZE == 16
|
||||||
# For ZMM registers, check if YMM state and ZMM state are in
|
# For ZMM registers, check if YMM state and ZMM state are in
|
||||||
# use.
|
# use.
|
||||||
andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
|
andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
|
||||||
cmpl $bit_YMM_state, %r11d
|
cmpl $bit_YMM_state, %r11d
|
||||||
# Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
|
# Preserve %zmm0 - %zmm7 registers if ZMM state is in use.
|
||||||
# neither YMM state nor ZMM state are in use.
|
|
||||||
PRESERVE_BND_REGS_PREFIX
|
PRESERVE_BND_REGS_PREFIX
|
||||||
jl _dl_runtime_resolve_sse_vex
|
jg _dl_runtime_resolve_avx512
|
||||||
# Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if
|
# Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if
|
||||||
# ZMM state isn't in use.
|
# ZMM state isn't in use.
|
||||||
PRESERVE_BND_REGS_PREFIX
|
PRESERVE_BND_REGS_PREFIX
|
||||||
je _dl_runtime_resolve_avx
|
je _dl_runtime_resolve_avx
|
||||||
|
# Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
|
||||||
|
# neither YMM state nor ZMM state are in use.
|
||||||
# else
|
# else
|
||||||
# error Unsupported VEC_SIZE!
|
# error Unsupported VEC_SIZE!
|
||||||
# endif
|
# endif
|
||||||
|
Loading…
Reference in New Issue
Block a user