x86-64: Improve branch prediction in _dl_runtime_resolve_avx512_opt [BZ #21258]

On Skylake server, _dl_runtime_resolve_avx512_opt is used to preserve
the first 8 vector registers.  The code layout is

  if only %xmm0 - %xmm7 registers are used
     preserve %xmm0 - %xmm7 registers
  if only %ymm0 - %ymm7 registers are used
     preserve %ymm0 - %ymm7 registers
  preserve %zmm0 - %zmm7 registers

Branch prediction speculatively executes the fallthrough code path, which
preserves %zmm0 - %zmm7 registers, even when only %xmm0 - %xmm7 registers
are used.  This leads to lower CPU frequency on Skylake server.  This
patch changes the fallthrough code path to preserve %xmm0 - %xmm7
registers instead:

  if whole %zmm0 - %zmm7 registers are used
     preserve %zmm0 - %zmm7 registers
  if only %ymm0 - %ymm7 registers are used
     preserve %ymm0 - %ymm7 registers
  preserve %xmm0 - %xmm7 registers

Tested on Skylake server.

	[BZ #21258]
	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt):
	Define only if _dl_runtime_resolve is defined to
	_dl_runtime_resolve_sse_vex.
	* sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt):
	Fallthrough to _dl_runtime_resolve_sse_vex.
This commit is contained in:
H.J. Lu 2017-03-21 10:59:31 -07:00
parent a640393a18
commit c15f8eb50c
3 changed files with 15 additions and 6 deletions

View File

@ -1,3 +1,12 @@
2017-03-21 H.J. Lu <hongjiu.lu@intel.com>
[BZ #21258]
* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt):
Define only if _dl_runtime_resolve is defined to
_dl_runtime_resolve_sse_vex.
* sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt):
Fallthrough to _dl_runtime_resolve_sse_vex.
2017-03-21 Joseph Myers <joseph@codesourcery.com>
* INSTALL: Regenerated.

View File

@ -87,11 +87,9 @@
#endif
#define VEC(i) zmm##i
#define _dl_runtime_resolve _dl_runtime_resolve_avx512
#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt
#define _dl_runtime_profile _dl_runtime_profile_avx512
#include "dl-trampoline.h"
#undef _dl_runtime_resolve
#undef _dl_runtime_resolve_opt
#undef _dl_runtime_profile
#undef VEC
#undef VMOV
@ -145,4 +143,5 @@
# define VMOV vmovdqu
#endif
#define _dl_runtime_resolve _dl_runtime_resolve_sse_vex
#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt
#include "dl-trampoline.h"

View File

@ -129,19 +129,20 @@ _dl_runtime_resolve_opt:
# YMM state isn't in use.
PRESERVE_BND_REGS_PREFIX
jz _dl_runtime_resolve_sse_vex
# elif VEC_SIZE == 64
# elif VEC_SIZE == 16
# For ZMM registers, check if YMM state and ZMM state are in
# use.
andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
cmpl $bit_YMM_state, %r11d
# Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
# neither YMM state nor ZMM state are in use.
# Preserve %zmm0 - %zmm7 registers if ZMM state is in use.
PRESERVE_BND_REGS_PREFIX
jl _dl_runtime_resolve_sse_vex
jg _dl_runtime_resolve_avx512
# Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if
# ZMM state isn't in use.
PRESERVE_BND_REGS_PREFIX
je _dl_runtime_resolve_avx
# Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
# neither YMM state nor ZMM state are in use.
# else
# error Unsupported VEC_SIZE!
# endif