mirror of
https://sourceware.org/git/glibc.git
synced 2025-01-05 01:00:14 +00:00
Added memset optimized with AVX512 for KNL hardware.
It shows an improvement of up to 28% over the AVX2 memset (performance results are attached at <https://sourceware.org/ml/libc-alpha/2015-12/msg00052.html>). * sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: New file. * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Added new file. * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Added new tests. * sysdeps/x86_64/multiarch/memset.S: Added new IFUNC branch. * sysdeps/x86_64/multiarch/memset_chk.S: Likewise. * sysdeps/x86/cpu-features.h (bit_Prefer_No_VZEROUPPER, index_Prefer_No_VZEROUPPER): New. * sysdeps/x86/cpu-features.c (init_cpu_features): Set the Prefer_No_VZEROUPPER for Knights Landing.
This commit is contained in:
parent
794950ed1d
commit
83d776f979
14
ChangeLog
14
ChangeLog
@ -1,4 +1,16 @@
|
|||||||
2015-12-18 Torvald Riegel <triegel@redhat.com>
|
2015-12-19 Andrew Senkevich <andrew.senkevich@intel.com>
|
||||||
|
|
||||||
|
* sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: New file.
|
||||||
|
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Added new file.
|
||||||
|
* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Added new tests.
|
||||||
|
* sysdeps/x86_64/multiarch/memset.S: Added new IFUNC branch.
|
||||||
|
* sysdeps/x86_64/multiarch/memset_chk.S: Likewise.
|
||||||
|
* sysdeps/x86/cpu-features.h (bit_Prefer_No_VZEROUPPER,
|
||||||
|
index_Prefer_No_VZEROUPPER): New feature.
|
||||||
|
* sysdeps/x86/cpu-features.c (init_cpu_features): Set the
|
||||||
|
Prefer_No_VZEROUPPER for Knights Landing.
|
||||||
|
|
||||||
|
2015-12-18 Torvald Riegel <triegel@redhat.com>
|
||||||
|
|
||||||
* math/atest-exp2.c (mp_exp_m1): Remove.
|
* math/atest-exp2.c (mp_exp_m1): Remove.
|
||||||
|
|
||||||
|
@ -80,6 +80,8 @@ init_cpu_features (struct cpu_features *cpu_features)
|
|||||||
|
|
||||||
case 0x57:
|
case 0x57:
|
||||||
/* Knights Landing. Enable Silvermont optimizations. */
|
/* Knights Landing. Enable Silvermont optimizations. */
|
||||||
|
cpu_features->feature[index_Prefer_No_VZEROUPPER]
|
||||||
|
|= bit_Prefer_No_VZEROUPPER;
|
||||||
|
|
||||||
case 0x37:
|
case 0x37:
|
||||||
case 0x4a:
|
case 0x4a:
|
||||||
|
@ -34,6 +34,7 @@
|
|||||||
#define bit_I586 (1 << 14)
|
#define bit_I586 (1 << 14)
|
||||||
#define bit_I686 (1 << 15)
|
#define bit_I686 (1 << 15)
|
||||||
#define bit_Prefer_MAP_32BIT_EXEC (1 << 16)
|
#define bit_Prefer_MAP_32BIT_EXEC (1 << 16)
|
||||||
|
#define bit_Prefer_No_VZEROUPPER (1 << 17)
|
||||||
|
|
||||||
/* CPUID Feature flags. */
|
/* CPUID Feature flags. */
|
||||||
|
|
||||||
@ -99,6 +100,8 @@
|
|||||||
# define index_I586 FEATURE_INDEX_1*FEATURE_SIZE
|
# define index_I586 FEATURE_INDEX_1*FEATURE_SIZE
|
||||||
# define index_I686 FEATURE_INDEX_1*FEATURE_SIZE
|
# define index_I686 FEATURE_INDEX_1*FEATURE_SIZE
|
||||||
# define index_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE
|
# define index_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE
|
||||||
|
# define index_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE
|
||||||
|
|
||||||
|
|
||||||
# if defined (_LIBC) && !IS_IN (nonlib)
|
# if defined (_LIBC) && !IS_IN (nonlib)
|
||||||
# ifdef __x86_64__
|
# ifdef __x86_64__
|
||||||
@ -251,6 +254,7 @@ extern const struct cpu_features *__get_cpu_features (void)
|
|||||||
# define index_I586 FEATURE_INDEX_1
|
# define index_I586 FEATURE_INDEX_1
|
||||||
# define index_I686 FEATURE_INDEX_1
|
# define index_I686 FEATURE_INDEX_1
|
||||||
# define index_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1
|
# define index_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1
|
||||||
|
# define index_Prefer_No_VZEROUPPER FEATURE_INDEX_1
|
||||||
|
|
||||||
#endif /* !__ASSEMBLER__ */
|
#endif /* !__ASSEMBLER__ */
|
||||||
|
|
||||||
|
@ -18,7 +18,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
|
|||||||
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
|
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
|
||||||
strcat-sse2-unaligned strncat-sse2-unaligned \
|
strcat-sse2-unaligned strncat-sse2-unaligned \
|
||||||
strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
|
strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
|
||||||
strcspn-c strpbrk-c strspn-c varshift memset-avx2
|
strcspn-c strpbrk-c strspn-c varshift memset-avx2 \
|
||||||
|
memset-avx512-no-vzeroupper
|
||||||
CFLAGS-varshift.c += -msse4
|
CFLAGS-varshift.c += -msse4
|
||||||
CFLAGS-strcspn-c.c += -msse4
|
CFLAGS-strcspn-c.c += -msse4
|
||||||
CFLAGS-strpbrk-c.c += -msse4
|
CFLAGS-strpbrk-c.c += -msse4
|
||||||
|
@ -20,6 +20,7 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <wchar.h>
|
#include <wchar.h>
|
||||||
#include <ifunc-impl-list.h>
|
#include <ifunc-impl-list.h>
|
||||||
|
#include <sysdep.h>
|
||||||
#include "init-arch.h"
|
#include "init-arch.h"
|
||||||
|
|
||||||
/* Maximum number of IFUNC implementations. */
|
/* Maximum number of IFUNC implementations. */
|
||||||
@ -76,14 +77,26 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||||||
__memset_chk_sse2)
|
__memset_chk_sse2)
|
||||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||||
__memset_chk_avx2))
|
__memset_chk_avx2)
|
||||||
|
#ifdef HAVE_AVX512_ASM_SUPPORT
|
||||||
|
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||||
|
HAS_ARCH_FEATURE (AVX512F_Usable),
|
||||||
|
__memset_chk_avx512_no_vzeroupper)
|
||||||
|
#endif
|
||||||
|
)
|
||||||
|
|
||||||
/* Support sysdeps/x86_64/multiarch/memset.S. */
|
/* Support sysdeps/x86_64/multiarch/memset.S. */
|
||||||
IFUNC_IMPL (i, name, memset,
|
IFUNC_IMPL (i, name, memset,
|
||||||
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
|
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
|
||||||
IFUNC_IMPL_ADD (array, i, memset,
|
IFUNC_IMPL_ADD (array, i, memset,
|
||||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||||
__memset_avx2))
|
__memset_avx2)
|
||||||
|
#ifdef HAVE_AVX512_ASM_SUPPORT
|
||||||
|
IFUNC_IMPL_ADD (array, i, memset,
|
||||||
|
HAS_ARCH_FEATURE (AVX512F_Usable),
|
||||||
|
__memset_avx512_no_vzeroupper)
|
||||||
|
#endif
|
||||||
|
)
|
||||||
|
|
||||||
/* Support sysdeps/x86_64/multiarch/stpncpy.S. */
|
/* Support sysdeps/x86_64/multiarch/stpncpy.S. */
|
||||||
IFUNC_IMPL (i, name, stpncpy,
|
IFUNC_IMPL (i, name, stpncpy,
|
||||||
|
194
sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
Normal file
194
sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
Normal file
@ -0,0 +1,194 @@
|
|||||||
|
/* memset optimized with AVX512 for KNL hardware.
|
||||||
|
Copyright (C) 2015 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<http://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
|
||||||
|
#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc)
|
||||||
|
|
||||||
|
#include "asm-syntax.h"
|
||||||
|
#ifndef MEMSET
|
||||||
|
# define MEMSET __memset_avx512_no_vzeroupper
|
||||||
|
# define MEMSET_CHK __memset_chk_avx512_no_vzeroupper
|
||||||
|
#endif
|
||||||
|
|
||||||
|
.section .text,"ax",@progbits
|
||||||
|
#if defined PIC
|
||||||
|
ENTRY (MEMSET_CHK)
|
||||||
|
cmpq %rdx, %rcx
|
||||||
|
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||||
|
END (MEMSET_CHK)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ENTRY (MEMSET)
|
||||||
|
vpxor %xmm0, %xmm0, %xmm0
|
||||||
|
vmovd %esi, %xmm1
|
||||||
|
lea (%rdi, %rdx), %rsi
|
||||||
|
mov %rdi, %rax
|
||||||
|
vpshufb %xmm0, %xmm1, %xmm0
|
||||||
|
cmp $16, %rdx
|
||||||
|
jb L(less_16bytes)
|
||||||
|
cmp $512, %rdx
|
||||||
|
vbroadcastss %xmm0, %zmm2
|
||||||
|
ja L(512bytesormore)
|
||||||
|
cmp $256, %rdx
|
||||||
|
jb L(less_256bytes)
|
||||||
|
vmovups %zmm2, (%rdi)
|
||||||
|
vmovups %zmm2, 0x40(%rdi)
|
||||||
|
vmovups %zmm2, 0x80(%rdi)
|
||||||
|
vmovups %zmm2, 0xC0(%rdi)
|
||||||
|
vmovups %zmm2, -0x100(%rsi)
|
||||||
|
vmovups %zmm2, -0xC0(%rsi)
|
||||||
|
vmovups %zmm2, -0x80(%rsi)
|
||||||
|
vmovups %zmm2, -0x40(%rsi)
|
||||||
|
ret
|
||||||
|
|
||||||
|
L(less_256bytes):
|
||||||
|
cmp $128, %dl
|
||||||
|
jb L(less_128bytes)
|
||||||
|
vmovups %zmm2, (%rdi)
|
||||||
|
vmovups %zmm2, 0x40(%rdi)
|
||||||
|
vmovups %zmm2, -0x80(%rsi)
|
||||||
|
vmovups %zmm2, -0x40(%rsi)
|
||||||
|
ret
|
||||||
|
|
||||||
|
L(less_128bytes):
|
||||||
|
cmp $64, %dl
|
||||||
|
jb L(less_64bytes)
|
||||||
|
vmovups %zmm2, (%rdi)
|
||||||
|
vmovups %zmm2, -0x40(%rsi)
|
||||||
|
ret
|
||||||
|
|
||||||
|
L(less_64bytes):
|
||||||
|
cmp $32, %dl
|
||||||
|
jb L(less_32bytes)
|
||||||
|
vmovdqu %ymm2, (%rdi)
|
||||||
|
vmovdqu %ymm2, -0x20(%rsi)
|
||||||
|
ret
|
||||||
|
|
||||||
|
L(less_32bytes):
|
||||||
|
vmovdqu %xmm0, (%rdi)
|
||||||
|
vmovdqu %xmm0, -0x10(%rsi)
|
||||||
|
ret
|
||||||
|
|
||||||
|
L(less_16bytes):
|
||||||
|
cmp $8, %dl
|
||||||
|
jb L(less_8bytes)
|
||||||
|
vmovq %xmm0, (%rdi)
|
||||||
|
vmovq %xmm0, -0x08(%rsi)
|
||||||
|
ret
|
||||||
|
|
||||||
|
L(less_8bytes):
|
||||||
|
vmovd %xmm0, %ecx
|
||||||
|
cmp $4, %dl
|
||||||
|
jb L(less_4bytes)
|
||||||
|
mov %ecx, (%rdi)
|
||||||
|
mov %ecx, -0x04(%rsi)
|
||||||
|
ret
|
||||||
|
|
||||||
|
L(less_4bytes):
|
||||||
|
cmp $2, %dl
|
||||||
|
jb L(less_2bytes)
|
||||||
|
mov %cx, (%rdi)
|
||||||
|
mov %cx, -0x02(%rsi)
|
||||||
|
ret
|
||||||
|
|
||||||
|
L(less_2bytes):
|
||||||
|
cmp $1, %dl
|
||||||
|
jb L(less_1bytes)
|
||||||
|
mov %cl, (%rdi)
|
||||||
|
L(less_1bytes):
|
||||||
|
ret
|
||||||
|
|
||||||
|
L(512bytesormore):
|
||||||
|
mov __x86_shared_cache_size_half(%rip), %rcx
|
||||||
|
cmp %rcx, %rdx
|
||||||
|
ja L(preloop_large)
|
||||||
|
cmp $1024, %rdx
|
||||||
|
ja L(1024bytesormore)
|
||||||
|
|
||||||
|
vmovups %zmm2, (%rdi)
|
||||||
|
vmovups %zmm2, 0x40(%rdi)
|
||||||
|
vmovups %zmm2, 0x80(%rdi)
|
||||||
|
vmovups %zmm2, 0xC0(%rdi)
|
||||||
|
vmovups %zmm2, 0x100(%rdi)
|
||||||
|
vmovups %zmm2, 0x140(%rdi)
|
||||||
|
vmovups %zmm2, 0x180(%rdi)
|
||||||
|
vmovups %zmm2, 0x1C0(%rdi)
|
||||||
|
vmovups %zmm2, -0x200(%rsi)
|
||||||
|
vmovups %zmm2, -0x1C0(%rsi)
|
||||||
|
vmovups %zmm2, -0x180(%rsi)
|
||||||
|
vmovups %zmm2, -0x140(%rsi)
|
||||||
|
vmovups %zmm2, -0x100(%rsi)
|
||||||
|
vmovups %zmm2, -0xC0(%rsi)
|
||||||
|
vmovups %zmm2, -0x80(%rsi)
|
||||||
|
vmovups %zmm2, -0x40(%rsi)
|
||||||
|
ret
|
||||||
|
|
||||||
|
/* Align on 64 and loop with aligned stores. */
|
||||||
|
L(1024bytesormore):
|
||||||
|
sub $0x100, %rsi
|
||||||
|
vmovups %zmm2, (%rax)
|
||||||
|
and $-0x40, %rdi
|
||||||
|
add $0x40, %rdi
|
||||||
|
|
||||||
|
L(gobble_256bytes_loop):
|
||||||
|
vmovaps %zmm2, (%rdi)
|
||||||
|
vmovaps %zmm2, 0x40(%rdi)
|
||||||
|
vmovaps %zmm2, 0x80(%rdi)
|
||||||
|
vmovaps %zmm2, 0xC0(%rdi)
|
||||||
|
add $0x100, %rdi
|
||||||
|
cmp %rsi, %rdi
|
||||||
|
jb L(gobble_256bytes_loop)
|
||||||
|
vmovups %zmm2, (%rsi)
|
||||||
|
vmovups %zmm2, 0x40(%rsi)
|
||||||
|
vmovups %zmm2, 0x80(%rsi)
|
||||||
|
vmovups %zmm2, 0xC0(%rsi)
|
||||||
|
ret
|
||||||
|
|
||||||
|
/* Align on 128 and loop with non-temporal stores. */
|
||||||
|
L(preloop_large):
|
||||||
|
and $-0x80, %rdi
|
||||||
|
add $0x80, %rdi
|
||||||
|
vmovups %zmm2, (%rax)
|
||||||
|
vmovups %zmm2, 0x40(%rax)
|
||||||
|
sub $0x200, %rsi
|
||||||
|
|
||||||
|
L(gobble_512bytes_nt_loop):
|
||||||
|
vmovntdq %zmm2, (%rdi)
|
||||||
|
vmovntdq %zmm2, 0x40(%rdi)
|
||||||
|
vmovntdq %zmm2, 0x80(%rdi)
|
||||||
|
vmovntdq %zmm2, 0xC0(%rdi)
|
||||||
|
vmovntdq %zmm2, 0x100(%rdi)
|
||||||
|
vmovntdq %zmm2, 0x140(%rdi)
|
||||||
|
vmovntdq %zmm2, 0x180(%rdi)
|
||||||
|
vmovntdq %zmm2, 0x1C0(%rdi)
|
||||||
|
add $0x200, %rdi
|
||||||
|
cmp %rsi, %rdi
|
||||||
|
jb L(gobble_512bytes_nt_loop)
|
||||||
|
sfence
|
||||||
|
vmovups %zmm2, (%rsi)
|
||||||
|
vmovups %zmm2, 0x40(%rsi)
|
||||||
|
vmovups %zmm2, 0x80(%rsi)
|
||||||
|
vmovups %zmm2, 0xC0(%rsi)
|
||||||
|
vmovups %zmm2, 0x100(%rsi)
|
||||||
|
vmovups %zmm2, 0x140(%rsi)
|
||||||
|
vmovups %zmm2, 0x180(%rsi)
|
||||||
|
vmovups %zmm2, 0x1C0(%rsi)
|
||||||
|
ret
|
||||||
|
END (MEMSET)
|
||||||
|
#endif
|
@ -30,6 +30,13 @@ ENTRY(memset)
|
|||||||
HAS_ARCH_FEATURE (AVX2_Usable)
|
HAS_ARCH_FEATURE (AVX2_Usable)
|
||||||
jz 2f
|
jz 2f
|
||||||
leaq __memset_avx2(%rip), %rax
|
leaq __memset_avx2(%rip), %rax
|
||||||
|
#ifdef HAVE_AVX512_ASM_SUPPORT
|
||||||
|
HAS_ARCH_FEATURE (AVX512F_Usable)
|
||||||
|
jz 2f
|
||||||
|
HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
|
||||||
|
jz 2f
|
||||||
|
leaq __memset_avx512_no_vzeroupper(%rip), %rax
|
||||||
|
#endif
|
||||||
2: ret
|
2: ret
|
||||||
END(memset)
|
END(memset)
|
||||||
#endif
|
#endif
|
||||||
|
@ -30,6 +30,13 @@ ENTRY(__memset_chk)
|
|||||||
HAS_ARCH_FEATURE (AVX2_Usable)
|
HAS_ARCH_FEATURE (AVX2_Usable)
|
||||||
jz 2f
|
jz 2f
|
||||||
leaq __memset_chk_avx2(%rip), %rax
|
leaq __memset_chk_avx2(%rip), %rax
|
||||||
|
#ifdef HAVE_AVX512_ASM_SUPPORT
|
||||||
|
HAS_ARCH_FEATURE (AVX512F_Usable)
|
||||||
|
jz 2f
|
||||||
|
HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
|
||||||
|
jz 2f
|
||||||
|
leaq __memset_chk_avx512_no_vzeroupper(%rip), %rax
|
||||||
|
#endif
|
||||||
2: ret
|
2: ret
|
||||||
END(__memset_chk)
|
END(__memset_chk)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user