glibc/sysdeps/x86_64/multiarch/memmove.S
H.J. Lu f43cb35c9b Require binutils 2.24 to build x86-64 glibc [BZ #20139]
If the assembler doesn't support AVX512DQ, _dl_runtime_resolve_avx is used
to save the first 8 vector registers for lazy binding, but it only saves
the lower 256 bits of each vector register.  When it is called on an AVX512
platform, the upper 256 bits of the ZMM registers are clobbered, so parameters
passed in ZMM registers will be wrong when the function is called the
first time.  This patch requires binutils 2.24, whose assembler can store
and load ZMM registers, to build x86-64 glibc.  Since the mathvec library
needs assembler support for AVX512DQ, we disable mathvec if the assembler
doesn't support AVX512DQ.

	[BZ #20139]
	* config.h.in (HAVE_AVX512_ASM_SUPPORT): Renamed to ...
	(HAVE_AVX512DQ_ASM_SUPPORT): This.
	* sysdeps/x86_64/configure.ac: Require assembler from binutils
	2.24 or above.
	(HAVE_AVX512_ASM_SUPPORT): Removed.
	(HAVE_AVX512DQ_ASM_SUPPORT): New.
	* sysdeps/x86_64/configure: Regenerated.
	* sysdeps/x86_64/dl-trampoline.S: Make HAVE_AVX512_ASM_SUPPORT
	check unconditional.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Likewise.
	* sysdeps/x86_64/multiarch/memcpy.S: Likewise.
	* sysdeps/x86_64/multiarch/memcpy_chk.S: Likewise.
	* sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S:
	Likewise.
	* sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S:
	Likewise.
	* sysdeps/x86_64/multiarch/memmove.S: Likewise.
	* sysdeps/x86_64/multiarch/memmove_chk.S: Likewise.
	* sysdeps/x86_64/multiarch/mempcpy.S: Likewise.
	* sysdeps/x86_64/multiarch/mempcpy_chk.S: Likewise.
	* sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S:
	Likewise.
	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
	Likewise.
	* sysdeps/x86_64/multiarch/memset.S: Likewise.
	* sysdeps/x86_64/multiarch/memset_chk.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S: Check
	HAVE_AVX512DQ_ASM_SUPPORT instead of HAVE_AVX512_ASM_SUPPORT.
	* sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S:
	Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S:
	Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S:
	Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S:
	Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S:
	Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S:
	Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S:
	Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S:
	Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S:
	Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S:
	Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S:
	Likewise.
2016-07-01 06:03:05 -07:00

100 lines
3.0 KiB
ArmAsm

/* Multiple versions of memmove
All versions must be listed in ifunc-impl-list.c.
Copyright (C) 2016 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include <init-arch.h>
/* Define multiple versions only for the definition in lib and for
DSO. */
#if IS_IN (libc)
.text
/* IFUNC selector for memmove.

   The .type directive below marks __libc_memmove as a GNU indirect
   function.  The body runs a chain of feature tests and returns with the
   address of the selected implementation in %RAX_LP.  Each candidate
   address is loaded into %RAX_LP before the test that decides whether
   to return it, so a branch to label 2 returns the most recently
   loaded candidate.

   LOAD_RTLD_GLOBAL_RO_RDX, HAS_ARCH_FEATURE and HAS_CPU_FEATURE are
   macros defined outside this file (presumably in <init-arch.h>).  The
   comments here assume that each HAS_..._FEATURE test leaves ZF clear
   when the feature bit is set, so that jnz means "feature present" and
   jz means "feature absent" -- TODO confirm against the macro
   definitions.

   Selection order, under that assumption:
     1. Prefer_ERMS                    -> __memmove_erms
     2. AVX512F_Usable, then
          Prefer_No_VZEROUPPER         -> __memmove_avx512_no_vzeroupper
          ERMS                         -> __memmove_avx512_unaligned_erms
          otherwise                    -> __memmove_avx512_unaligned
     3. AVX_Fast_Unaligned_Load, then
          ERMS                         -> __memmove_avx_unaligned_erms
          otherwise                    -> __memmove_avx_unaligned
     4. Fast_Unaligned_Copy, then
          ERMS                         -> __memmove_sse2_unaligned_erms
          otherwise                    -> __memmove_sse2_unaligned
     5. SSSE3 not available            -> __memmove_sse2_unaligned
     6. SSSE3 and Fast_Copy_Backward   -> __memmove_ssse3_back
     7. SSSE3 otherwise                -> __memmove_ssse3

   All implementation symbols are defined outside this file and, per the
   note in the file header, must also be listed in ifunc-impl-list.c.  */
ENTRY(__libc_memmove)
.type __libc_memmove, @gnu_indirect_function
/* Presumably makes the feature data reachable through %rdx for the
   HAS_..._FEATURE tests below -- verify against the macro definition.  */
LOAD_RTLD_GLOBAL_RO_RDX
/* Step 1: an explicit preference for ERMS overrides everything else.  */
lea __memmove_erms(%rip), %RAX_LP
HAS_ARCH_FEATURE (Prefer_ERMS)
jnz 2f
/* Step 2: AVX512 variants; skip to label 1 when AVX512F is not usable.  */
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
lea __memmove_avx512_no_vzeroupper(%rip), %RAX_LP
HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
jnz 2f
lea __memmove_avx512_unaligned_erms(%rip), %RAX_LP
HAS_CPU_FEATURE (ERMS)
jnz 2f
lea __memmove_avx512_unaligned(%rip), %RAX_LP
ret
/* Step 3: AVX variants.  The ERMS test here uses jz, so the plain
   __memmove_avx_unaligned already in %RAX_LP is returned when ERMS is
   absent and the _erms variant is loaded on the fall-through path.  */
1: lea __memmove_avx_unaligned(%rip), %RAX_LP
HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
jz L(Fast_Unaligned_Load)
HAS_CPU_FEATURE (ERMS)
jz 2f
lea __memmove_avx_unaligned_erms(%rip), %RAX_LP
ret
/* Step 4: SSE2 unaligned variants.  Note that this label is the target
   of the jz taken after the AVX_Fast_Unaligned_Load test above, i.e. it
   is reached when that test yields ZF set, despite what the label name
   might suggest.  */
L(Fast_Unaligned_Load):
lea __memmove_sse2_unaligned(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
jz L(SSSE3)
HAS_CPU_FEATURE (ERMS)
jz 2f
lea __memmove_sse2_unaligned_erms(%rip), %RAX_LP
ret
/* Steps 5-7: %RAX_LP still holds __memmove_sse2_unaligned on entry, so
   the first jz below returns that implementation.  */
L(SSSE3):
HAS_CPU_FEATURE (SSSE3)
jz 2f
lea __memmove_ssse3_back(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Copy_Backward)
jnz 2f
lea __memmove_ssse3(%rip), %RAX_LP
/* Common exit: return whatever candidate is currently in %RAX_LP.  */
2: ret
END(__libc_memmove)
#endif
#if IS_IN (libc)
/* MEMMOVE_SYMBOL (p, s) pastes "_sse2_" between its arguments, e.g.
   MEMMOVE_SYMBOL (__memmove, unaligned) becomes __memmove_sse2_unaligned.
   Presumably consumed by the "../memmove.S" include later in this file so
   that the code it assembles gets the _sse2_ names -- verify there.  */
# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
# ifdef SHARED
/* libc_hidden_ver is defined outside this file; presumably each line
   binds the libc-internal hidden alias of the second argument to the
   SSE2 unaligned implementation named by the first -- TODO confirm.
   Both mempcpy and __mempcpy are tied to __mempcpy_sse2_unaligned.  */
libc_hidden_ver (__memmove_sse2_unaligned, memmove)
libc_hidden_ver (__memcpy_sse2_unaligned, memcpy)
libc_hidden_ver (__mempcpy_sse2_unaligned, mempcpy)
libc_hidden_ver (__mempcpy_sse2_unaligned, __mempcpy)
# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal memmove calls through a PLT.
The speedup we get from using SSE2 instructions is likely eaten away
by the indirect call in the PLT. */
/* Hence libc_hidden_builtin_def is redefined here with an empty
   expansion for the remainder of this file.  */
# define libc_hidden_builtin_def
# endif
/* Make memmove an alias of the __libc_memmove IFUNC selector.  */
strong_alias (__libc_memmove, memmove)
#endif
/* In every configuration other than the shared libc build (SHARED not
   defined, or not building libc itself), provide mempcpy as a weak alias
   of __mempcpy.  weak_alias is defined outside this file.  */
#if !defined SHARED || !IS_IN (libc)
weak_alias (__mempcpy, mempcpy)
#endif
#include "../memmove.S"
/* Shared libc only: compatibility export of memcpy at symbol version
   GLIBC_2_2_5.  SHLIB_COMPAT and compat_symbol come from <shlib-compat.h>;
   presumably the guard is true when the library must still provide
   symbols from versions in the range GLIBC_2_2_5 up to GLIBC_2_14 --
   TODO confirm against that header.  */
#if defined SHARED && IS_IN (libc)
# include <shlib-compat.h>
# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
/* Use __memmove_sse2_unaligned to support overlapping addresses. */
/* That is, the GLIBC_2_2_5 memcpy symbol is bound to a memmove
   implementation rather than to a memcpy one.  */
compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5);
# endif
#endif