diff --git a/ChangeLog b/ChangeLog index bc8bc31a22..841d55e340 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2014-06-19 Ling Ma + H.J. Lu + + * sysdeps/x86_64/multiarch/Makefile: Add memset-avx2. + * sysdeps/x86_64/multiarch/memset-avx2.S: New file. + * sysdeps/x86_64/multiarch/memset.S: Likewise. + * sysdeps/x86_64/multiarch/memset_chk.S: Likewise. + * sysdeps/x86_64/multiarch/rtld-memset.S: Likewise. + 2014-06-19 Andreas Schwab [BZ #17069] diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 57a3c13e8a..42df96f636 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -17,7 +17,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ strcpy-sse2-unaligned strncpy-sse2-unaligned \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ strcat-sse2-unaligned strncat-sse2-unaligned \ - strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned + strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ + memset-avx2 + ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c varshift CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S new file mode 100644 index 0000000000..b45f8a0d53 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-avx2.S @@ -0,0 +1,168 @@ +/* memset with AVX2 + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc
+
+#include "asm-syntax.h"
+#ifndef MEMSET
+# define MEMSET	__memset_avx2
+# define MEMSET_CHK	__memset_chk_avx2
+#endif
+
+	.section .text.avx2,"ax",@progbits
+#if defined PIC
+ENTRY (MEMSET_CHK)	/* Checked entry: reject rcx < rdx, otherwise continue into MEMSET.  */
+	cmpq	%rdx, %rcx	/* rcx is presumably the destination object size (4th argument) -- confirm against callers.  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* Unsigned rcx < n: the fill would not fit.  */
+END (MEMSET_CHK)	/* No ret here: the accepted case falls through into MEMSET.  */
+#endif
+
+ENTRY (MEMSET)	/* In: rdi = dst, esi = fill value (low byte used), rdx = n.  Out: rax = dst.  */
+	vpxor	%xmm0, %xmm0, %xmm0	/* xmm0 = 0, used as an all-zero shuffle control.  */
+	vmovd	%esi, %xmm1	/* xmm1[31:0] = fill value.  */
+	lea	(%rdi, %rdx), %rsi	/* rsi = dst + n, one past the last byte to write.  */
+	mov	%rdi, %rax	/* Return value for every path below 256 bytes.  */
+	vpshufb	%xmm0, %xmm1, %xmm0	/* Zero control selects byte 0 everywhere: xmm0 = 16 copies of the fill byte.  */
+	cmp	$16, %rdx
+	jb	L(less_16bytes)
+	cmp	$256, %rdx
+	jae	L(256bytesormore)
+	cmp	$128, %dl	/* n < 256 on this path, so %dl holds all of n.  */
+	jb	L(less_128bytes)
+	vmovdqu	%xmm0, (%rdi)	/* 128 <= n < 256: 128 bytes from the start ...  */
+	vmovdqu	%xmm0, 0x10(%rdi)
+	vmovdqu	%xmm0, 0x20(%rdi)
+	vmovdqu	%xmm0, 0x30(%rdi)
+	vmovdqu	%xmm0, 0x40(%rdi)
+	vmovdqu	%xmm0, 0x50(%rdi)
+	vmovdqu	%xmm0, 0x60(%rdi)
+	vmovdqu	%xmm0, 0x70(%rdi)
+	vmovdqu	%xmm0, -0x80(%rsi)	/* ... and 128 bytes ending at dst + n; the two runs may overlap.  */
+	vmovdqu	%xmm0, -0x70(%rsi)
+	vmovdqu	%xmm0, -0x60(%rsi)
+	vmovdqu	%xmm0, -0x50(%rsi)
+	vmovdqu	%xmm0, -0x40(%rsi)
+	vmovdqu	%xmm0, -0x30(%rsi)
+	vmovdqu	%xmm0, -0x20(%rsi)
+	vmovdqu	%xmm0, -0x10(%rsi)
+	ret
+
+	.p2align 4
+L(less_128bytes):
+	cmp	$64, %dl
+	jb	L(less_64bytes)
+	vmovdqu	%xmm0, (%rdi)	/* 64 <= n < 128: 64 bytes from the start ...  */
+	vmovdqu	%xmm0, 0x10(%rdi)
+	vmovdqu	%xmm0, 0x20(%rdi)
+	vmovdqu	%xmm0, 0x30(%rdi)
+	vmovdqu	%xmm0, -0x40(%rsi)	/* ... and 64 bytes ending at dst + n.  */
+	vmovdqu	%xmm0, -0x30(%rsi)
+	vmovdqu	%xmm0, -0x20(%rsi)
+	vmovdqu	%xmm0, -0x10(%rsi)
+	ret
+
+	.p2align 4
+L(less_64bytes):
+	cmp	$32, %dl
+	jb	L(less_32bytes)
+	vmovdqu	%xmm0, (%rdi)	/* 32 <= n < 64: 32 bytes from the start ...  */
+	vmovdqu	%xmm0, 0x10(%rdi)
+	vmovdqu	%xmm0, -0x20(%rsi)	/* ... and 32 bytes ending at dst + n.  */
+	vmovdqu	%xmm0, -0x10(%rsi)
+	ret
+
+	.p2align 4
+L(less_32bytes):
+	vmovdqu	%xmm0, (%rdi)	/* 16 <= n < 32: first 16 bytes ...  */
+	vmovdqu	%xmm0, -0x10(%rsi)	/* ... and last 16 bytes.  */
+	ret
+
+	.p2align 4
+L(less_16bytes):
+	cmp	$8, %dl	/* n < 16 on this path.  */
+	jb	L(less_8bytes)
+	vmovq	%xmm0, (%rdi)	/* 8 <= n < 16: first 8 bytes ...  */
+	vmovq	%xmm0, -0x08(%rsi)	/* ... and last 8 bytes.  */
+	ret
+
+	.p2align 4
+L(less_8bytes):
+	vmovd	%xmm0, %ecx	/* ecx = four copies of the fill byte, for the GPR stores below.  */
+	cmp	$4, %dl
+	jb	L(less_4bytes)
+	mov	%ecx, (%rdi)	/* 4 <= n < 8: first 4 bytes ...  */
+	mov	%ecx, -0x04(%rsi)	/* ... and last 4 bytes.  */
+	ret
+
+	.p2align 4
+L(less_4bytes):
+	cmp	$2, %dl
+	jb	L(less_2bytes)
+	mov	%cx, (%rdi)	/* 2 <= n < 4: first 2 bytes ...  */
+	mov	%cx, -0x02(%rsi)	/* ... and last 2 bytes.  */
+	ret
+
+	.p2align 4
+L(less_2bytes):
+	cmp	$1, %dl
+	jb	L(less_1bytes)	/* n == 0: nothing to store.  */
+	mov	%cl, (%rdi)	/* n == 1.  */
+L(less_1bytes):
+	ret
+
+	.p2align 4
+L(256bytesormore):
+	vinserti128 $1, %xmm0, %ymm0, %ymm0	/* Copy the low half into the high half: ymm0 = 32 copies of the fill byte.  */
+	and	$-0x20, %rdi
+	add	$0x20, %rdi	/* rdi = first 32-byte boundary strictly above dst.  */
+	vmovdqu	%ymm0, (%rax)	/* Unaligned 32-byte store at dst covers the bytes below rdi.  */
+	sub	%rdi, %rax	/* rax = dst - rdi, in [-32, -1].  */
+	lea	-0x80(%rax, %rdx), %rcx	/* rcx = (dst + n - rdi) - 128: bytes left from rdi, biased by -128.  */
+	cmp	$4096, %rcx
+	ja	L(gobble_data)	/* Larger fills use rep stosb instead of the loop.  */
+L(gobble_128_loop):
+	vmovdqa	%ymm0, (%rdi)	/* Aligned stores: rdi is a multiple of 32 here.  */
+	vmovdqa	%ymm0, 0x20(%rdi)
+	vmovdqa	%ymm0, 0x40(%rdi)
+	vmovdqa	%ymm0, 0x60(%rdi)
+	sub	$-0x80, %rdi	/* rdi += 128.  */
+	add	$-0x80, %ecx	/* ecx -= 128; carry is set iff ecx was >= 128 (rcx <= 4096, so 32 bits suffice).  */
+	jb	L(gobble_128_loop)	/* Loop while at least 128 bytes remain past rdi.  */
+	mov	%rsi, %rax
+	vmovdqu	%ymm0, -0x80(%rsi)	/* Tail: the final 128 bytes ending at dst + n, possibly overlapping the loop's stores.  */
+	vmovdqu	%ymm0, -0x60(%rsi)
+	vmovdqu	%ymm0, -0x40(%rsi)
+	vmovdqu	%ymm0, -0x20(%rsi)
+	sub	%rdx, %rax	/* rax = (dst + n) - n = dst, the return value.  */
+	vzeroupper	/* ymm0's upper half was written; clear it before returning.  */
+	ret
+
+	.p2align 4
+L(gobble_data):
+	sub	$-0x80, %rcx	/* rcx += 128, undoing the bias: rcx = bytes from rdi to dst + n.  */
+	vmovd	%xmm0, %eax	/* al = fill byte, as rep stosb expects.  */
+	rep stosb	/* Store al to rcx bytes starting at rdi.  */
+	mov	%rsi, %rax
+	sub	%rdx, %rax	/* rax = (dst + n) - n = dst, the return value.  */
+	vzeroupper
+	ret
+
+END (MEMSET)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
new file mode 100644
index 0000000000..3113d1cbc0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -0,0 +1,59 @@
+/* Multiple versions of memset
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.
*/
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#ifndef NOT_IN_libc
+ENTRY(memset)	/* IFUNC resolver: returns in rax the address of the memset implementation to use.  */
+	.type	memset, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)	/* A zero kind field presumably means the CPU features are not initialised yet -- confirm in init-arch.h.  */
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__memset_sse2(%rip), %rax	/* Default choice: the SSE2 version.  */
+	testl	$bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	jz	2f	/* AVX2 not usable: keep the default.  */
+	leaq	__memset_avx2(%rip), %rax	/* AVX2 usable: choose the AVX2 version.  */
+2:	ret
+END(memset)
+#endif
+
+#if !defined NOT_IN_libc
+# undef memset
+# define memset __memset_sse2	/* The file included below then defines __memset_sse2 wherever it says memset.  */
+
+# undef __memset_chk
+# define __memset_chk __memset_chk_sse2	/* Likewise for the checked entry point.  */
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memset calls through a PLT.
+   The speedup we get from using GPR instruction is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memset; __GI_memset = __memset_sse2
+# endif
+
+# undef strong_alias
+# define strong_alias(original, alias)	/* Expands to nothing, so the included file creates no aliases.  */
+#endif
+
+#include "../memset.S"
diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S
new file mode 100644
index 0000000000..2182780822
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset_chk.S
@@ -0,0 +1,44 @@
+/* Multiple versions of memset_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+ENTRY(__memset_chk)	/* IFUNC resolver: returns in rax the address of the __memset_chk implementation to use.  */
+	.type	__memset_chk, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)	/* A zero kind field presumably means the CPU features are not initialised yet -- confirm in init-arch.h.  */
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__memset_chk_sse2(%rip), %rax	/* Default choice: the SSE2 version.  */
+	testl	$bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	jz	2f	/* AVX2 not usable: keep the default.  */
+	leaq	__memset_chk_avx2(%rip), %rax	/* AVX2 usable: choose the AVX2 version.  */
+2:	ret
+END(__memset_chk)
+
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+	.section .gnu.warning.__memset_zero_constant_len_parameter	/* By GNU ld convention, text in this section is reported when the named symbol is referenced.  */
+	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
+# else
+# include "../memset_chk.S"	/* Non-SHARED builds use the parent directory's file without a resolver.  */
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/rtld-memset.S b/sysdeps/x86_64/multiarch/rtld-memset.S
new file mode 100644
index 0000000000..8092aa07da
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-memset.S
@@ -0,0 +1 @@
+#include "../rtld-memset.S"	/* Reuses the parent directory's rtld-memset.S unchanged.  */