mirror of
https://sourceware.org/git/glibc.git
synced 2024-11-21 20:40:05 +00:00
Improve 64-bit memcpy performance for Haswell CPUs with AVX instructions
This patch takes advantage of Haswell (HSW) memory bandwidth, reduces branch mispredictions by avoiding branch instructions, and forces the destination to be aligned for the AVX store instructions. The CPU2006 403.gcc benchmark indicates that this patch improves performance by 2% to 10%.
This commit is contained in:
parent
a53fbd8e6c
commit
05f3633da4
20
ChangeLog
20
ChangeLog
@ -1,3 +1,23 @@
|
||||
2014-07-30 Ling Ma <ling.ml@alibaba-inc.com>
|
||||
|
||||
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
|
||||
memmove-avx-unaligned, memcpy-avx-unaligned and
|
||||
mempcpy-avx-unaligned.
|
||||
* sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
|
||||
Add tests for AVX memcpy functions.
|
||||
* sysdeps/x86_64/multiarch/memcpy.S: Add support for AVX memcpy.
|
||||
* sysdeps/x86_64/multiarch/memcpy_chk.S: Add support for AVX
|
||||
memcpy_chk.
|
||||
* sysdeps/x86_64/multiarch/memmove.c: Add support for AVX memmove.
|
||||
* sysdeps/x86_64/multiarch/memmove_chk.c: Add support for AVX
|
||||
memmove_chk.
|
||||
* sysdeps/x86_64/multiarch/mempcpy.S: Add support for AVX mempcpy.
|
||||
* sysdeps/x86_64/multiarch/mempcpy_chk.S: Add support for AVX
|
||||
mempcpy_chk.
|
||||
* sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S: New file.
|
||||
* sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S: New file.
|
||||
* sysdeps/x86_64/multiarch/memmove-avx-unaligned.S: New file.
|
||||
|
||||
2013-07-29 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
|
||||
|
||||
[BZ #17213]
|
||||
|
@ -11,6 +11,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
|
||||
memcmp-sse4 memcpy-ssse3 \
|
||||
memcpy-sse2-unaligned mempcpy-ssse3 \
|
||||
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
|
||||
memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
|
||||
memmove-ssse3-back strcasecmp_l-ssse3 \
|
||||
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
|
||||
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
|
||||
|
@ -46,6 +46,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
|
||||
/* Support sysdeps/x86_64/multiarch/memmove_chk.S. */
|
||||
IFUNC_IMPL (i, name, __memmove_chk,
|
||||
IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX,
|
||||
__memmove_chk_avx_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
|
||||
__memmove_chk_ssse3_back)
|
||||
IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
|
||||
@ -55,6 +57,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
|
||||
/* Support sysdeps/x86_64/multiarch/memmove.S. */
|
||||
IFUNC_IMPL (i, name, memmove,
|
||||
IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX,
|
||||
__memmove_avx_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
|
||||
__memmove_ssse3_back)
|
||||
IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
|
||||
@ -214,6 +218,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
#ifdef SHARED
|
||||
/* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */
|
||||
IFUNC_IMPL (i, name, __memcpy_chk,
|
||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX,
|
||||
__memcpy_chk_avx_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
|
||||
__memcpy_chk_ssse3_back)
|
||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
|
||||
@ -223,6 +229,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
|
||||
/* Support sysdeps/x86_64/multiarch/memcpy.S. */
|
||||
IFUNC_IMPL (i, name, memcpy,
|
||||
IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX,
|
||||
__memcpy_avx_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
|
||||
__memcpy_ssse3_back)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
|
||||
@ -231,6 +239,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
|
||||
/* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
|
||||
IFUNC_IMPL (i, name, __mempcpy_chk,
|
||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX,
|
||||
__mempcpy_chk_avx_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
|
||||
__mempcpy_chk_ssse3_back)
|
||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
|
||||
@ -240,6 +250,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
|
||||
/* Support sysdeps/x86_64/multiarch/mempcpy.S. */
|
||||
IFUNC_IMPL (i, name, mempcpy,
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX,
|
||||
__mempcpy_avx_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
|
||||
__mempcpy_ssse3_back)
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
|
||||
|
376
sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
Normal file
376
sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
Normal file
@ -0,0 +1,376 @@
|
||||
/* memcpy with AVX
|
||||
Copyright (C) 2014 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
#if !defined NOT_IN_libc \
|
||||
&& (defined SHARED \
|
||||
|| defined USE_AS_MEMMOVE \
|
||||
|| !defined USE_MULTIARCH)
|
||||
|
||||
#include "asm-syntax.h"
|
||||
#ifndef MEMCPY
|
||||
# define MEMCPY __memcpy_avx_unaligned
|
||||
# define MEMCPY_CHK __memcpy_chk_avx_unaligned
|
||||
#endif
|
||||
|
||||
.section .text.avx,"ax",@progbits
|
||||
#if !defined USE_AS_BCOPY
|
||||
/* Checked entry point.  Compares %rcx with %rdx as unsigned values and
   jumps to __chk_fail when %rcx is below %rdx.  There is no return or
   unconditional jump before END, so otherwise execution continues into
   the code that immediately follows this entry.
   NOTE(review): presumably %rdx is the copy length and %rcx the size of
   the destination object, as for __memcpy_chk -- confirm with callers.  */
ENTRY (MEMCPY_CHK)
|
||||
cmpq %rdx, %rcx
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMCPY_CHK)
|
||||
#endif
|
||||
|
||||
/* Copy routine entry.  As used below, %rdi is the destination pointer,
   %rsi the source pointer and %rdx the byte count.  The value returned in
   %rax is the incoming %rdi, or %rdi + %rdx when USE_AS_MEMPCPY is
   defined.  Counts of 256 and above branch to L(256bytesormore); smaller
   counts are dispatched by size class and copied without loops.  */
ENTRY (MEMCPY)
|
||||
mov %rdi, %rax
|
||||
#ifdef USE_AS_MEMPCPY
|
||||
add %rdx, %rax
|
||||
#endif
|
||||
cmp $256, %rdx
|
||||
jae L(256bytesormore)
|
||||
/* From here on %rdx is below 256, so its low byte %dl holds the whole
   count and the byte-sized unsigned compares are sufficient.  */
cmp $16, %dl
|
||||
jb L(less_16bytes)
|
||||
cmp $128, %dl
|
||||
jb L(less_128bytes)
|
||||
/* 128..255 bytes: load the first 128 bytes and the last 128 bytes (the
   two ranges may overlap) into %xmm0-%xmm15.  Every load is issued before
   the first store.  %rcx is the end of the source.  */
vmovdqu (%rsi), %xmm0
|
||||
lea (%rsi, %rdx), %rcx
|
||||
vmovdqu 0x10(%rsi), %xmm1
|
||||
vmovdqu 0x20(%rsi), %xmm2
|
||||
vmovdqu 0x30(%rsi), %xmm3
|
||||
vmovdqu 0x40(%rsi), %xmm4
|
||||
vmovdqu 0x50(%rsi), %xmm5
|
||||
vmovdqu 0x60(%rsi), %xmm6
|
||||
vmovdqu 0x70(%rsi), %xmm7
|
||||
vmovdqu -0x80(%rcx), %xmm8
|
||||
vmovdqu -0x70(%rcx), %xmm9
|
||||
vmovdqu -0x60(%rcx), %xmm10
|
||||
vmovdqu -0x50(%rcx), %xmm11
|
||||
vmovdqu -0x40(%rcx), %xmm12
|
||||
vmovdqu -0x30(%rcx), %xmm13
|
||||
vmovdqu -0x20(%rcx), %xmm14
|
||||
vmovdqu -0x10(%rcx), %xmm15
|
||||
/* %rdx is turned into the end of the destination; the count itself is
   no longer needed.  */
lea (%rdi, %rdx), %rdx
|
||||
vmovdqu %xmm0, (%rdi)
|
||||
vmovdqu %xmm1, 0x10(%rdi)
|
||||
vmovdqu %xmm2, 0x20(%rdi)
|
||||
vmovdqu %xmm3, 0x30(%rdi)
|
||||
vmovdqu %xmm4, 0x40(%rdi)
|
||||
vmovdqu %xmm5, 0x50(%rdi)
|
||||
vmovdqu %xmm6, 0x60(%rdi)
|
||||
vmovdqu %xmm7, 0x70(%rdi)
|
||||
vmovdqu %xmm8, -0x80(%rdx)
|
||||
vmovdqu %xmm9, -0x70(%rdx)
|
||||
vmovdqu %xmm10, -0x60(%rdx)
|
||||
vmovdqu %xmm11, -0x50(%rdx)
|
||||
vmovdqu %xmm12, -0x40(%rdx)
|
||||
vmovdqu %xmm13, -0x30(%rdx)
|
||||
vmovdqu %xmm14, -0x20(%rdx)
|
||||
vmovdqu %xmm15, -0x10(%rdx)
|
||||
ret
|
||||
.p2align 4
|
||||
/* Counts below 128 (and at least 16).  Counts below 64 are passed on to
   L(less_64bytes); for 64..127 bytes the first 64 and the last 64 bytes
   (possibly overlapping) are loaded into %xmm0-%xmm7 before any store.
   %rcx is the end of the source and %rdx becomes the end of the
   destination.  */
L(less_128bytes):
|
||||
cmp $64, %dl
|
||||
jb L(less_64bytes)
|
||||
vmovdqu (%rsi), %xmm0
|
||||
lea (%rsi, %rdx), %rcx
|
||||
vmovdqu 0x10(%rsi), %xmm1
|
||||
vmovdqu 0x20(%rsi), %xmm2
|
||||
lea (%rdi, %rdx), %rdx
|
||||
vmovdqu 0x30(%rsi), %xmm3
|
||||
vmovdqu -0x40(%rcx), %xmm4
|
||||
vmovdqu -0x30(%rcx), %xmm5
|
||||
vmovdqu -0x20(%rcx), %xmm6
|
||||
vmovdqu -0x10(%rcx), %xmm7
|
||||
vmovdqu %xmm0, (%rdi)
|
||||
vmovdqu %xmm1, 0x10(%rdi)
|
||||
vmovdqu %xmm2, 0x20(%rdi)
|
||||
vmovdqu %xmm3, 0x30(%rdi)
|
||||
vmovdqu %xmm4, -0x40(%rdx)
|
||||
vmovdqu %xmm5, -0x30(%rdx)
|
||||
vmovdqu %xmm6, -0x20(%rdx)
|
||||
vmovdqu %xmm7, -0x10(%rdx)
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Counts below 64 (and at least 16).  Counts below 32 are passed on to
   L(less_32bytes); for 32..63 bytes the first 32 and the last 32 bytes
   (possibly overlapping) are loaded before any store.  */
L(less_64bytes):
|
||||
cmp $32, %dl
|
||||
jb L(less_32bytes)
|
||||
vmovdqu (%rsi), %xmm0
|
||||
vmovdqu 0x10(%rsi), %xmm1
|
||||
vmovdqu -0x20(%rsi, %rdx), %xmm6
|
||||
vmovdqu -0x10(%rsi, %rdx), %xmm7
|
||||
vmovdqu %xmm0, (%rdi)
|
||||
vmovdqu %xmm1, 0x10(%rdi)
|
||||
vmovdqu %xmm6, -0x20(%rdi, %rdx)
|
||||
vmovdqu %xmm7, -0x10(%rdi, %rdx)
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* 16..31 bytes (smaller counts were already routed to L(less_16bytes)):
   copy the first 16 and the last 16 bytes, which may overlap.  Both loads
   happen before the stores.  */
L(less_32bytes):
|
||||
vmovdqu (%rsi), %xmm0
|
||||
vmovdqu -0x10(%rsi, %rdx), %xmm7
|
||||
vmovdqu %xmm0, (%rdi)
|
||||
vmovdqu %xmm7, -0x10(%rdi, %rdx)
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Counts below 16.  Counts below 8 are passed on to L(less_8bytes); for
   8..15 bytes the first 8 and the last 8 bytes (possibly overlapping) are
   copied through general-purpose registers.  The last quadword is loaded
   first because %rsi is then overwritten with the first quadword.  */
L(less_16bytes):
|
||||
cmp $8, %dl
|
||||
jb L(less_8bytes)
|
||||
movq -0x08(%rsi, %rdx), %rcx
|
||||
movq (%rsi), %rsi
|
||||
movq %rsi, (%rdi)
|
||||
movq %rcx, -0x08(%rdi, %rdx)
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Counts below 8.  Counts below 4 are passed on to L(less_4bytes); for
   4..7 bytes the first 4 and the last 4 bytes (possibly overlapping) are
   copied.  The last doubleword is loaded before %esi is overwritten with
   the first one.  */
L(less_8bytes):
|
||||
cmp $4, %dl
|
||||
jb L(less_4bytes)
|
||||
mov -0x04(%rsi, %rdx), %ecx
|
||||
mov (%rsi), %esi
|
||||
mov %esi, (%rdi)
|
||||
mov %ecx, -0x04(%rdi, %rdx)
|
||||
ret
|
||||
|
||||
/* Counts below 4.  Counts of 0 or 1 go to L(less_2bytes); for 2..3 bytes
   the first 2 and the last 2 bytes (possibly overlapping) are copied.  */
L(less_4bytes):
|
||||
cmp $1, %dl
|
||||
jbe L(less_2bytes)
|
||||
mov -0x02(%rsi, %rdx), %cx
|
||||
mov (%rsi), %si
|
||||
mov %si, (%rdi)
|
||||
mov %cx, -0x02(%rdi, %rdx)
|
||||
ret
|
||||
|
||||
/* The flags are still those of "cmp $1, %dl" above: below means the
   count is 0 and nothing is copied, otherwise exactly one byte is.  */
L(less_2bytes):
|
||||
jb L(less_0bytes)
|
||||
mov (%rsi), %cl
|
||||
mov %cl, (%rdi)
|
||||
L(less_0bytes):
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Counts of 256 bytes and above.  */
L(256bytesormore):
|
||||
#ifdef USE_AS_MEMMOVE
|
||||
/* If (destination - source), taken as an unsigned value, is below the
   count, the destination starts inside the source range; copy backward
   in that case.  */
mov %rdi, %rcx
|
||||
sub %rsi, %rcx
|
||||
cmp %rdx, %rcx
|
||||
jc L(copy_backward)
|
||||
#endif
|
||||
/* Counts of 2048 and above are handled at L(gobble_data_movsb).  */
cmp $2048, %rdx
|
||||
jae L(gobble_data_movsb)
|
||||
/* 256..2047 bytes.  The return value is parked in %r8 because %rax is
   reused as the constant 0x80 loop stride.  %r10 keeps the original
   destination, %rcx is the end of the source.  The last 128 source bytes
   go to %xmm5-%xmm12 and the first 32 to %ymm4; they are stored after
   the loop to cover the tail and the unaligned head.  */
mov %rax, %r8
|
||||
lea (%rsi, %rdx), %rcx
|
||||
mov %rdi, %r10
|
||||
vmovdqu -0x80(%rcx), %xmm5
|
||||
vmovdqu -0x70(%rcx), %xmm6
|
||||
mov $0x80, %rax
|
||||
/* Round %rdi up to the next 32-byte boundary (it always advances by
   1..32 bytes); %r11 is the number of bytes skipped, which is also
   applied to the count and the source pointer.  */
and $-32, %rdi
|
||||
add $32, %rdi
|
||||
vmovdqu -0x60(%rcx), %xmm7
|
||||
vmovdqu -0x50(%rcx), %xmm8
|
||||
mov %rdi, %r11
|
||||
sub %r10, %r11
|
||||
vmovdqu -0x40(%rcx), %xmm9
|
||||
vmovdqu -0x30(%rcx), %xmm10
|
||||
sub %r11, %rdx
|
||||
vmovdqu -0x20(%rcx), %xmm11
|
||||
vmovdqu -0x10(%rcx), %xmm12
|
||||
vmovdqu (%rsi), %ymm4
|
||||
add %r11, %rsi
|
||||
/* Bias the remaining count by -0x80.  32-bit arithmetic is enough here
   because the count is below 2048 on this path.  */
sub %eax, %edx
|
||||
/* Copy 128 bytes per iteration with unaligned loads and 32-byte aligned
   stores.  The loop repeats as long as subtracting 0x80 does not
   borrow.  */
L(goble_128_loop):
|
||||
vmovdqu (%rsi), %ymm0
|
||||
vmovdqu 0x20(%rsi), %ymm1
|
||||
vmovdqu 0x40(%rsi), %ymm2
|
||||
vmovdqu 0x60(%rsi), %ymm3
|
||||
add %rax, %rsi
|
||||
vmovdqa %ymm0, (%rdi)
|
||||
vmovdqa %ymm1, 0x20(%rdi)
|
||||
vmovdqa %ymm2, 0x40(%rdi)
|
||||
vmovdqa %ymm3, 0x60(%rdi)
|
||||
add %rax, %rdi
|
||||
sub %eax, %edx
|
||||
jae L(goble_128_loop)
|
||||
/* Undo the bias so %edx is the number of bytes left (below 0x80), then
   turn %rdx into the end of the destination.  */
add %eax, %edx
|
||||
add %rdi, %rdx
|
||||
/* Store the saved head to the original destination; no %ymm register
   is used after vzeroupper.  */
vmovdqu %ymm4, (%r10)
|
||||
vzeroupper
|
||||
/* Store the saved last 128 bytes, ending exactly at the end of the
   destination.  */
vmovdqu %xmm5, -0x80(%rdx)
|
||||
vmovdqu %xmm6, -0x70(%rdx)
|
||||
vmovdqu %xmm7, -0x60(%rdx)
|
||||
vmovdqu %xmm8, -0x50(%rdx)
|
||||
vmovdqu %xmm9, -0x40(%rdx)
|
||||
vmovdqu %xmm10, -0x30(%rdx)
|
||||
vmovdqu %xmm11, -0x20(%rdx)
|
||||
vmovdqu %xmm12, -0x10(%rdx)
|
||||
/* Restore the return value saved in %r8.  */
mov %r8, %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Forward copy of 2048 bytes and above.  %rcx is loaded with the
   shared-cache-size-half value (compile-time constant if
   SHARED_CACHE_SIZE_HALF is defined, otherwise the variable
   __x86_shared_cache_size_half) and multiplied by 8.  Counts at or above
   that threshold go to L(gobble_big_data_fwd); smaller counts are copied
   with a single "rep movsb".  %rax already holds the return value and is
   not modified here.  */
L(gobble_data_movsb):
|
||||
#ifdef SHARED_CACHE_SIZE_HALF
|
||||
mov $SHARED_CACHE_SIZE_HALF, %rcx
|
||||
#else
|
||||
mov __x86_shared_cache_size_half(%rip), %rcx
|
||||
#endif
|
||||
shl $3, %rcx
|
||||
cmp %rcx, %rdx
|
||||
jae L(gobble_big_data_fwd)
|
||||
/* "rep movsb" takes its byte count from %rcx.  */
mov %rdx, %rcx
|
||||
rep movsb
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Forward copy for counts at or above the threshold computed at
   L(gobble_data_movsb), using non-temporal stores.  The first 32 source
   bytes (%ymm4) and the last 128 source bytes (%xmm5-%xmm12) are loaded
   up front and stored after the loop, covering the unaligned head and the
   tail.  %rax is not modified, so it still holds the return value.  */
L(gobble_big_data_fwd):
|
||||
lea (%rsi, %rdx), %rcx
|
||||
vmovdqu (%rsi), %ymm4
|
||||
vmovdqu -0x80(%rsi,%rdx), %xmm5
|
||||
vmovdqu -0x70(%rcx), %xmm6
|
||||
vmovdqu -0x60(%rcx), %xmm7
|
||||
vmovdqu -0x50(%rcx), %xmm8
|
||||
vmovdqu -0x40(%rcx), %xmm9
|
||||
vmovdqu -0x30(%rcx), %xmm10
|
||||
vmovdqu -0x20(%rcx), %xmm11
|
||||
vmovdqu -0x10(%rcx), %xmm12
|
||||
/* %r8 keeps the original destination.  %rdi is rounded up to the next
   32-byte boundary (advancing by 1..32 bytes); %r10 is the number of
   bytes skipped, applied to the count and the source pointer as well.  */
mov %rdi, %r8
|
||||
and $-32, %rdi
|
||||
add $32, %rdi
|
||||
mov %rdi, %r10
|
||||
sub %r8, %r10
|
||||
sub %r10, %rdx
|
||||
add %r10, %rsi
|
||||
/* %rcx is reused as the end of the destination.  */
lea (%rdi, %rdx), %rcx
|
||||
/* Bias the remaining count by -0x80.  At the bottom of the loop
   "add $-0x80" sets the carry flag while at least 0x80 bytes remain,
   which is what "jb" tests.  */
add $-0x80, %rdx
|
||||
L(gobble_mem_fwd_loop):
|
||||
prefetchnta 0x1c0(%rsi)
|
||||
prefetchnta 0x280(%rsi)
|
||||
vmovdqu (%rsi), %ymm0
|
||||
vmovdqu 0x20(%rsi), %ymm1
|
||||
vmovdqu 0x40(%rsi), %ymm2
|
||||
vmovdqu 0x60(%rsi), %ymm3
|
||||
/* "sub $-0x80" advances the pointer by 0x80.  */
sub $-0x80, %rsi
|
||||
vmovntdq %ymm0, (%rdi)
|
||||
vmovntdq %ymm1, 0x20(%rdi)
|
||||
vmovntdq %ymm2, 0x40(%rdi)
|
||||
vmovntdq %ymm3, 0x60(%rdi)
|
||||
sub $-0x80, %rdi
|
||||
add $-0x80, %rdx
|
||||
jb L(gobble_mem_fwd_loop)
|
||||
/* Store fence after the non-temporal stores of the loop.  */
sfence
|
||||
/* Store the saved head to the original destination; no %ymm register
   is used after vzeroupper.  */
vmovdqu %ymm4, (%r8)
|
||||
vzeroupper
|
||||
/* Store the saved last 128 bytes, ending exactly at the end of the
   destination.  */
vmovdqu %xmm5, -0x80(%rcx)
|
||||
vmovdqu %xmm6, -0x70(%rcx)
|
||||
vmovdqu %xmm7, -0x60(%rcx)
|
||||
vmovdqu %xmm8, -0x50(%rcx)
|
||||
vmovdqu %xmm9, -0x40(%rcx)
|
||||
vmovdqu %xmm10, -0x30(%rcx)
|
||||
vmovdqu %xmm11, -0x20(%rcx)
|
||||
vmovdqu %xmm12, -0x10(%rcx)
|
||||
ret
|
||||
|
||||
#ifdef USE_AS_MEMMOVE
|
||||
.p2align 4
|
||||
/* Backward copy, reached from L(256bytesormore) when the unsigned
   difference (destination - source) is below the count.  The data is
   copied from the end toward the start in 128-byte blocks.  The first
   128 source bytes are saved in %xmm5-%xmm12 and the last 32 in %ymm4
   before any store; they are written last.  */
L(copy_backward):
|
||||
#ifdef SHARED_CACHE_SIZE_HALF
|
||||
mov $SHARED_CACHE_SIZE_HALF, %rcx
|
||||
#else
|
||||
mov __x86_shared_cache_size_half(%rip), %rcx
|
||||
#endif
|
||||
/* %rcx = 8 * shared-cache-size-half: counts above it use the
   non-temporal loop at L(gobble_big_data_bwd).  */
shl $3, %rcx
|
||||
vmovdqu (%rsi), %xmm5
|
||||
vmovdqu 0x10(%rsi), %xmm6
|
||||
/* %rdi becomes the end of the destination.  */
add %rdx, %rdi
|
||||
vmovdqu 0x20(%rsi), %xmm7
|
||||
vmovdqu 0x30(%rsi), %xmm8
|
||||
/* %r10 is the address of the last 32 destination bytes; %r11 will hold
   the low five bits of the destination end (0..31).  */
lea -0x20(%rdi), %r10
|
||||
mov %rdi, %r11
|
||||
vmovdqu 0x40(%rsi), %xmm9
|
||||
vmovdqu 0x50(%rsi), %xmm10
|
||||
and $0x1f, %r11
|
||||
vmovdqu 0x60(%rsi), %xmm11
|
||||
vmovdqu 0x70(%rsi), %xmm12
|
||||
/* Clearing those low bits rounds %rdi down to a 32-byte boundary.  */
xor %r11, %rdi
|
||||
/* %rsi becomes the end of the source; save the last 32 source bytes,
   then move the source end and the count back by the same %r11 bytes.  */
add %rdx, %rsi
|
||||
vmovdqu -0x20(%rsi), %ymm4
|
||||
sub %r11, %rsi
|
||||
sub %r11, %rdx
|
||||
cmp %rcx, %rdx
|
||||
ja L(gobble_big_data_bwd)
|
||||
/* Bias the remaining count by -0x80.  At the bottom of the loop
   "add $-0x80" sets the carry flag while at least 0x80 bytes remain,
   which is what "jb" tests.  */
add $-0x80, %rdx
|
||||
L(gobble_mem_bwd_llc):
|
||||
vmovdqu -0x20(%rsi), %ymm0
|
||||
vmovdqu -0x40(%rsi), %ymm1
|
||||
vmovdqu -0x60(%rsi), %ymm2
|
||||
vmovdqu -0x80(%rsi), %ymm3
|
||||
lea -0x80(%rsi), %rsi
|
||||
vmovdqa %ymm0, -0x20(%rdi)
|
||||
vmovdqa %ymm1, -0x40(%rdi)
|
||||
vmovdqa %ymm2, -0x60(%rdi)
|
||||
vmovdqa %ymm3, -0x80(%rdi)
|
||||
lea -0x80(%rdi), %rdi
|
||||
add $-0x80, %rdx
|
||||
jb L(gobble_mem_bwd_llc)
|
||||
/* Store the saved last 32 bytes; no %ymm register is used after
   vzeroupper.  */
vmovdqu %ymm4, (%r10)
|
||||
vzeroupper
|
||||
/* Store the saved first 128 bytes through %rax, which was set to the
   incoming %rdi at entry.
   NOTE(review): this relies on USE_AS_MEMPCPY not being defined together
   with USE_AS_MEMMOVE (otherwise %rax would be %rdi + %rdx) -- confirm.  */
vmovdqu %xmm5, (%rax)
|
||||
vmovdqu %xmm6, 0x10(%rax)
|
||||
vmovdqu %xmm7, 0x20(%rax)
|
||||
vmovdqu %xmm8, 0x30(%rax)
|
||||
vmovdqu %xmm9, 0x40(%rax)
|
||||
vmovdqu %xmm10, 0x50(%rax)
|
||||
vmovdqu %xmm11, 0x60(%rax)
|
||||
vmovdqu %xmm12, 0x70(%rax)
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Backward copy with non-temporal stores, entered from L(copy_backward)
   when the remaining count exceeds the threshold in %rcx.  On entry %rdi
   is the 32-byte aligned destination end, %rsi the matching source end,
   %rdx the remaining count, %r10 the address of the last 32 destination
   bytes, %ymm4 the last 32 source bytes and %xmm5-%xmm12 the first 128
   source bytes.  */
L(gobble_big_data_bwd):
|
||||
/* Bias the remaining count by -0x80.  At the bottom of the loop
   "add $-0x80" sets the carry flag while at least 0x80 bytes remain,
   which is what "jb" tests.  */
add $-0x80, %rdx
|
||||
L(gobble_mem_bwd_loop):
|
||||
prefetchnta -0x1c0(%rsi)
|
||||
prefetchnta -0x280(%rsi)
|
||||
vmovdqu -0x20(%rsi), %ymm0
|
||||
vmovdqu -0x40(%rsi), %ymm1
|
||||
vmovdqu -0x60(%rsi), %ymm2
|
||||
vmovdqu -0x80(%rsi), %ymm3
|
||||
lea -0x80(%rsi), %rsi
|
||||
vmovntdq %ymm0, -0x20(%rdi)
|
||||
vmovntdq %ymm1, -0x40(%rdi)
|
||||
vmovntdq %ymm2, -0x60(%rdi)
|
||||
vmovntdq %ymm3, -0x80(%rdi)
|
||||
lea -0x80(%rdi), %rdi
|
||||
add $-0x80, %rdx
|
||||
jb L(gobble_mem_bwd_loop)
|
||||
/* Store fence after the non-temporal stores of the loop.  */
sfence
|
||||
/* Store the saved last 32 bytes; no %ymm register is used after
   vzeroupper.  */
vmovdqu %ymm4, (%r10)
|
||||
vzeroupper
|
||||
/* Store the saved first 128 bytes through %rax, which was set to the
   incoming %rdi at entry.
   NOTE(review): this relies on USE_AS_MEMPCPY not being defined together
   with USE_AS_MEMMOVE (otherwise %rax would be %rdi + %rdx) -- confirm.  */
vmovdqu %xmm5, (%rax)
|
||||
vmovdqu %xmm6, 0x10(%rax)
|
||||
vmovdqu %xmm7, 0x20(%rax)
|
||||
vmovdqu %xmm8, 0x30(%rax)
|
||||
vmovdqu %xmm9, 0x40(%rax)
|
||||
vmovdqu %xmm10, 0x50(%rax)
|
||||
vmovdqu %xmm11, 0x60(%rax)
|
||||
vmovdqu %xmm12, 0x70(%rax)
|
||||
ret
|
||||
#endif
|
||||
END (MEMCPY)
|
||||
#endif
|
@ -32,6 +32,10 @@ ENTRY(__new_memcpy)
|
||||
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
||||
jne 1f
|
||||
call __init_cpu_features
|
||||
1: leaq __memcpy_avx_unaligned(%rip), %rax
|
||||
testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
|
||||
jz 1f
|
||||
ret
|
||||
1: leaq __memcpy_sse2(%rip), %rax
|
||||
testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
|
||||
jnz 2f
|
||||
|
@ -39,6 +39,9 @@ ENTRY(__memcpy_chk)
|
||||
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
|
||||
jz 2f
|
||||
leaq __memcpy_chk_ssse3_back(%rip), %rax
|
||||
testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
|
||||
jz 2f
|
||||
leaq __memcpy_chk_avx_unaligned(%rip), %rax
|
||||
2: ret
|
||||
END(__memcpy_chk)
|
||||
# else
|
||||
|
22
sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
Normal file
22
sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
Normal file
@ -0,0 +1,22 @@
|
||||
/* memmove with AVX
|
||||
Copyright (C) 2014 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
/* Build the code in memcpy-avx-unaligned.S under the memmove names.
   Defining USE_AS_MEMMOVE enables that file's direction check for counts
   of 256 bytes and above and its backward-copy path; MEMCPY and
   MEMCPY_CHK override the default entry-point names.  */
#define USE_AS_MEMMOVE
|
||||
#define MEMCPY __memmove_avx_unaligned
|
||||
#define MEMCPY_CHK __memmove_chk_avx_unaligned
|
||||
#include "memcpy-avx-unaligned.S"
|
@ -35,6 +35,8 @@
|
||||
extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
|
||||
extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
|
||||
extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
|
||||
extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
|
||||
|
||||
#endif
|
||||
|
||||
#include "string/memmove.c"
|
||||
@ -47,10 +49,12 @@ extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
|
||||
ifunc symbol properly. */
|
||||
extern __typeof (__redirect_memmove) __libc_memmove;
|
||||
libc_ifunc (__libc_memmove,
|
||||
HAS_SSSE3
|
||||
? (HAS_FAST_COPY_BACKWARD
|
||||
? __memmove_ssse3_back : __memmove_ssse3)
|
||||
: __memmove_sse2)
|
||||
HAS_AVX
|
||||
? __memmove_avx_unaligned
|
||||
: (HAS_SSSE3
|
||||
? (HAS_FAST_COPY_BACKWARD
|
||||
? __memmove_ssse3_back : __memmove_ssse3)
|
||||
: __memmove_sse2));
|
||||
|
||||
strong_alias (__libc_memmove, memmove)
|
||||
|
||||
|
@ -25,11 +25,13 @@
|
||||
extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
|
||||
extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
|
||||
extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden;
|
||||
extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
|
||||
|
||||
#include "debug/memmove_chk.c"
|
||||
|
||||
libc_ifunc (__memmove_chk,
|
||||
HAS_SSSE3
|
||||
HAS_AVX ? __memmove_chk_avx_unaligned :
|
||||
(HAS_SSSE3
|
||||
? (HAS_FAST_COPY_BACKWARD
|
||||
? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
|
||||
: __memmove_chk_sse2);
|
||||
: __memmove_chk_sse2));
|
||||
|
22
sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
Normal file
22
sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
Normal file
@ -0,0 +1,22 @@
|
||||
/* mempcpy with AVX
|
||||
Copyright (C) 2014 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
/* Build the code in memcpy-avx-unaligned.S under the mempcpy names.
   Defining USE_AS_MEMPCPY makes that file add the byte count to the
   returned destination pointer at entry; MEMCPY and MEMCPY_CHK override
   the default entry-point names.  */
#define USE_AS_MEMPCPY
|
||||
#define MEMCPY __mempcpy_avx_unaligned
|
||||
#define MEMCPY_CHK __mempcpy_chk_avx_unaligned
|
||||
#include "memcpy-avx-unaligned.S"
|
@ -37,6 +37,9 @@ ENTRY(__mempcpy)
|
||||
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
|
||||
jz 2f
|
||||
leaq __mempcpy_ssse3_back(%rip), %rax
|
||||
testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
|
||||
jz 2f
|
||||
leaq __mempcpy_avx_unaligned(%rip), %rax
|
||||
2: ret
|
||||
END(__mempcpy)
|
||||
|
||||
|
@ -39,6 +39,9 @@ ENTRY(__mempcpy_chk)
|
||||
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
|
||||
jz 2f
|
||||
leaq __mempcpy_chk_ssse3_back(%rip), %rax
|
||||
testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
|
||||
jz 2f
|
||||
leaq __mempcpy_chk_avx_unaligned(%rip), %rax
|
||||
2: ret
|
||||
END(__mempcpy_chk)
|
||||
# else
|
||||
|
Loading…
Reference in New Issue
Block a user