mirror of
https://sourceware.org/git/glibc.git
synced 2025-01-10 03:10:09 +00:00
6fb8cbcb58
This patch includes optimized 64bit memcpy/memmove for Atom, Core 2 and Core i7. It improves memcpy by up to 3X on Atom, up to 4X on Core 2 and up to 1X on Core i7. It also improves memmove by up to 3X on Atom, up to 4X on Core 2 and up to 2X on Core i7.
575 lines
9.6 KiB
ArmAsm
575 lines
9.6 KiB
ArmAsm
/*
|
|
Optimized memcpy for x86-64.
|
|
|
|
Copyright (C) 2007 Free Software Foundation, Inc.
|
|
Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.
|
|
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library; if not, write to the Free
|
|
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
|
02111-1307 USA.
|
|
*/
|
|
|
|
#include <sysdep.h>
|
|
#include "asm-syntax.h"
|
|
|
|
/* Stack slots in the red-zone.
   Every offset below is applied to %rsp.  The code in this file never
   adjusts %rsp, so the slots with negative offsets live below the stack
   pointer.
   RETVAL holds the value to be returned by memcpy (the original
   destination); it is only stored to in the non-mempcpy build.
   SAVE0..SAVE3 hold %r14, %r13, %r12 and %rbx respectively while the
   large-block loops use those registers as copy temporaries.  */
|
|
|
|
#ifdef USE_AS_MEMPCPY
|
|
# define RETVAL (0) /* no return-value slot is written for mempcpy */
|
|
#else
|
|
# define RETVAL (-8) /* first 8-byte slot below %rsp */
|
|
#endif
|
|
#define SAVE0 (RETVAL - 8) /* %r14 */
|
|
#define SAVE1 (SAVE0 - 8) /* %r13 */
|
|
#define SAVE2 (SAVE1 - 8) /* %r12 */
|
|
#define SAVE3 (SAVE2 - 8) /* %rbx */
|
|
|
|
.text
|
|
|
|
/* Checked entry point, built only for the PIC libc build.
   Compares %rcx against the byte count in %rdx (unsigned) and jumps to
   __chk_fail when %rcx is the smaller of the two.
   NOTE(review): %rcx is presumably the size of the destination object
   passed as a fourth argument -- confirm against the caller.
   NOTE(review): there is no return or jump after the check, so when it
   passes, execution presumably falls through into the code that
   follows this stub; this assumes END_CHK emits no instructions --
   confirm in sysdep.h.  */
#if defined PIC && !defined NOT_IN_libc
|
|
ENTRY_CHK (__memcpy_chk)
|
|
|
|
cmpq %rdx, %rcx /* flags = %rcx - %rdx */
|
|
jb HIDDEN_JUMPTARGET (__chk_fail) /* unsigned: %rcx < %rdx */
|
|
|
|
END_CHK (__memcpy_chk)
|
|
#endif
|
|
|
|
/* memcpy, or mempcpy when USE_AS_MEMPCPY is defined.
   In:   %rdi = destination, %rsi = source, %rdx = byte count.
   Out:  %rax = original destination (memcpy build) or the advanced
         destination pointer, i.e. %rdi after the copy (mempcpy build).
   This first stage copies counts below 32 directly and also serves as
   the shared tail: the later stages jump to L(1) with the residual
   count in %edx once their bulk loop is done.  */
ENTRY(memcpy) /* (void *, const void*, size_t) */
|
|
|
|
/* Handle tiny blocks. */
|
|
|
|
L(1try): /* fewer than 32 bytes */
|
|
cmpq $32, %rdx
|
|
#ifndef USE_AS_MEMPCPY
|
|
movq %rdi, %rax /* save return value; mov leaves the cmpq flags intact */
|
|
#endif
|
|
jae L(1after) /* count >= 32 (unsigned): use the bulk stages */
|
|
|
|
/* Bits 0..3 of the count are tested one at a time; each set bit copies
   one chunk of that size and advances both pointers.  */
L(1): /* 1-byte once */
|
|
testb $1, %dl
|
|
jz L(1a)
|
|
|
|
movzbl (%rsi), %ecx
|
|
movb %cl, (%rdi)
|
|
|
|
incq %rsi
|
|
incq %rdi
|
|
|
|
.p2align 4,, 4
|
|
|
|
L(1a): /* 2-byte once */
|
|
testb $2, %dl
|
|
jz L(1b)
|
|
|
|
movzwl (%rsi), %ecx
|
|
movw %cx, (%rdi)
|
|
|
|
addq $2, %rsi
|
|
addq $2, %rdi
|
|
|
|
.p2align 4,, 4
|
|
|
|
L(1b): /* 4-byte once */
|
|
testb $4, %dl
|
|
jz L(1c)
|
|
|
|
movl (%rsi), %ecx
|
|
movl %ecx, (%rdi)
|
|
|
|
addq $4, %rsi
|
|
addq $4, %rdi
|
|
|
|
.p2align 4,, 4
|
|
|
|
L(1c): /* 8-byte once */
|
|
testb $8, %dl
|
|
jz L(1d)
|
|
|
|
movq (%rsi), %rcx
|
|
movq %rcx, (%rdi)
|
|
|
|
addq $8, %rsi
|
|
addq $8, %rdi
|
|
|
|
.p2align 4,, 4
|
|
|
|
L(1d): /* 16-byte loop */
|
|
andl $0xf0, %edx /* keep bits 4..7: the remaining multiple of 16 */
|
|
jz L(exit)
|
|
|
|
.p2align 4
|
|
|
|
L(1loop):
|
|
movq (%rsi), %rcx
|
|
movq 8(%rsi), %r8
|
|
movq %rcx, (%rdi)
|
|
movq %r8, 8(%rdi)
|
|
|
|
subl $16, %edx /* sets ZF for the jnz below */
|
|
|
|
leaq 16(%rsi), %rsi /* lea advances the pointers without touching flags */
|
|
leaq 16(%rdi), %rdi
|
|
|
|
jnz L(1loop)
|
|
|
|
.p2align 4,, 4
|
|
|
|
L(exit): /* exit */
|
|
#ifdef USE_AS_MEMPCPY
|
|
movq %rdi, %rax /* return value */
|
|
#else
|
|
/* NOTE(review): the rep prefix applies to the retq below; presumably
   the "rep ret" idiom that avoids a branch-prediction penalty on some
   AMD processors when ret is a jump target -- confirm.  */
rep
|
|
#endif
|
|
retq
|
|
|
|
.p2align 4
|
|
|
|
/* Reached from L(1try) when the count is at least 32.  */
L(1after):
|
|
#ifndef USE_AS_MEMPCPY
|
|
/* %rax is used as a scratch register by the loops that follow, so the
   return value is parked in its slot below %rsp.  */
movq %rax, RETVAL(%rsp) /* save return value */
|
|
#endif
|
|
|
|
/* Align to the natural word size. */
|
|
|
|
L(aligntry):
|
|
movl %esi, %ecx /* align by source */
|
|
|
|
andl $7, %ecx /* %ecx = source address modulo 8 */
|
|
jz L(alignafter) /* already aligned */
|
|
|
|
/* The loop below copies 8 - %ecx bytes.  %rdx is reduced by that
   amount up front, and %ecx becomes a negative counter that is
   incremented up to zero.  */
L(align): /* align */
|
|
leaq -8(%rcx, %rdx), %rdx /* calculate remaining bytes */
|
|
subl $8, %ecx /* %ecx = -(bytes to copy) */
|
|
|
|
.p2align 4
|
|
|
|
L(alignloop): /* 1-byte alignment loop */
|
|
movzbl (%rsi), %eax
|
|
movb %al, (%rdi)
|
|
|
|
incl %ecx /* sets ZF for the jnz below */
|
|
|
|
leaq 1(%rsi), %rsi /* lea leaves the flags untouched */
|
|
leaq 1(%rdi), %rdi
|
|
|
|
jnz L(alignloop)
|
|
|
|
.p2align 4
|
|
|
|
L(alignafter):
|
|
|
|
/* Handle mid-sized blocks. */
|
|
|
|
L(32try): /* up to 1KB */
|
|
cmpq $1024, %rdx
|
|
ja L(32after) /* more than 1024 bytes remain: use the later stages */
|
|
|
|
L(32): /* 32-byte loop */
|
|
movl %edx, %ecx
|
|
shrl $5, %ecx /* %ecx = number of whole 32-byte chunks */
|
|
jz L(32skip)
|
|
|
|
.p2align 4
|
|
|
|
/* Unrolled twice: each half copies 32 bytes through %rax, %r8, %r9
   and %r10.  The decl at the top of each half sets ZF; the movq and
   leaq instructions in between do not modify flags, so the branch at
   the end of the half still sees the decl result.  */
L(32loop):
|
|
decl %ecx
|
|
|
|
movq (%rsi), %rax
|
|
movq 8(%rsi), %r8
|
|
movq 16(%rsi), %r9
|
|
movq 24(%rsi), %r10
|
|
|
|
movq %rax, (%rdi)
|
|
movq %r8, 8(%rdi)
|
|
movq %r9, 16(%rdi)
|
|
movq %r10, 24(%rdi)
|
|
|
|
leaq 32(%rsi), %rsi
|
|
leaq 32(%rdi), %rdi
|
|
|
|
jz L(32skip) /* help out smaller blocks */
|
|
|
|
decl %ecx
|
|
|
|
movq (%rsi), %rax
|
|
movq 8(%rsi), %r8
|
|
movq 16(%rsi), %r9
|
|
movq 24(%rsi), %r10
|
|
|
|
movq %rax, (%rdi)
|
|
movq %r8, 8(%rdi)
|
|
movq %r9, 16(%rdi)
|
|
movq %r10, 24(%rdi)
|
|
|
|
leaq 32(%rsi), %rsi
|
|
leaq 32(%rdi), %rdi
|
|
|
|
jnz L(32loop)
|
|
|
|
.p2align 4
|
|
|
|
L(32skip):
|
|
andl $31, %edx /* check for leftovers */
|
|
#ifdef USE_AS_MEMPCPY
|
|
jnz L(1) /* residual bytes: finish in the tiny-copy tail */
|
|
|
|
movq %rdi, %rax /* mempcpy returns the advanced destination */
|
|
#else
|
|
movq RETVAL(%rsp), %rax /* reload return value; mov keeps the andl flags */
|
|
jnz L(1) /* residual bytes: finish in the tiny-copy tail */
|
|
|
|
rep /* prefix for the retq below */
|
|
#endif
|
|
retq /* exit */
|
|
|
|
.p2align 4
|
|
|
|
/* Reached from L(32try) when more than 1024 bytes remain.  */
L(32after):
|
|
|
|
/*
|
|
In order to minimize code-size in RTLD, algorithms specific for
|
|
larger blocks are excluded when building for RTLD.
|
|
*/
|
|
|
|
/* Handle blocks smaller than 1/2 L1. */
|
|
|
|
L(fasttry): /* first 1/2 L1 */
|
|
#ifndef NOT_IN_libc /* only up to this algorithm outside of libc.so */
|
|
movq __x86_64_data_cache_size_half(%rip), %r11
|
|
cmpq %rdx, %r11 /* calculate the smaller of */
|
|
cmovaq %rdx, %r11 /* remaining bytes and 1/2 L1 */
|
|
#endif
|
|
|
|
/* In the libc build %r11 is the byte budget for this stage; when
   NOT_IN_libc is defined the whole remaining count in %rdx is used.  */
L(fast): /* good ol' MOVS */
|
|
#ifndef NOT_IN_libc
|
|
movq %r11, %rcx
|
|
andq $-8, %r11 /* %r11 = bytes that rep movsq will copy (multiple of 8) */
|
|
#else
|
|
movq %rdx, %rcx
|
|
#endif
|
|
shrq $3, %rcx /* %rcx = number of 8-byte words */
|
|
jz L(fastskip)
|
|
|
|
/* The rep prefix applies to the movsq that follows.  */
rep
|
|
movsq
|
|
|
|
.p2align 4,, 4
|
|
|
|
L(fastskip):
|
|
#ifndef NOT_IN_libc
|
|
subq %r11, %rdx /* check for more */
|
|
testq $-8, %rdx /* any bit above bit 2 set, i.e. 8 or more bytes left? */
|
|
jnz L(fastafter)
|
|
#endif
|
|
|
|
andl $7, %edx /* check for leftovers */
|
|
#ifdef USE_AS_MEMPCPY
|
|
jnz L(1) /* residual bytes: finish in the tiny-copy tail */
|
|
|
|
movq %rdi, %rax /* mempcpy returns the advanced destination */
|
|
#else
|
|
movq RETVAL(%rsp), %rax /* reload return value; mov keeps the andl flags */
|
|
jnz L(1) /* residual bytes: finish in the tiny-copy tail */
|
|
|
|
rep /* prefix for the retq below */
|
|
#endif
|
|
retq /* exit */
|
|
|
|
#ifndef NOT_IN_libc /* none of the algorithms below for RTLD */
|
|
|
|
.p2align 4
|
|
|
|
/* Reached from L(fastskip) when 8 or more bytes remain after the
   rep movsq stage.  */
L(fastafter):
|
|
|
|
/* Handle large blocks smaller than 1/2 L2. */
|
|
|
|
L(pretry): /* first 1/2 L2 */
|
|
movq __x86_64_shared_cache_size_half (%rip), %r8
|
|
cmpq %rdx, %r8 /* calculate the lesser of */
|
|
cmovaq %rdx, %r8 /* remaining bytes and 1/2 L2 */
|
|
|
|
L(pre): /* 64-byte with prefetching */
|
|
movq %r8, %rcx
|
|
andq $-64, %r8 /* %r8 = bytes this stage will copy (multiple of 64) */
|
|
shrq $6, %rcx /* %rcx = number of 64-byte chunks */
|
|
jz L(preskip)
|
|
|
|
/* The loops below use %rbx, %r12, %r13 and %r14 as copy temporaries,
   so their values are stored in the slots below %rsp first and
   reloaded at L(prebail).  */
movq %r14, SAVE0(%rsp)
|
|
cfi_rel_offset (%r14, SAVE0)
|
|
movq %r13, SAVE1(%rsp)
|
|
cfi_rel_offset (%r13, SAVE1)
|
|
movq %r12, SAVE2(%rsp)
|
|
cfi_rel_offset (%r12, SAVE2)
|
|
movq %rbx, SAVE3(%rsp)
|
|
cfi_rel_offset (%rbx, SAVE3)
|
|
|
|
cmpl $0, __x86_64_prefetchw(%rip)
|
|
jz L(preloop) /* check if PREFETCHW OK */
|
|
|
|
.p2align 4
|
|
|
|
/* ... when PREFETCHW is available (less cache-probe traffic in MP systems). */
|
|
|
|
/* Unrolled twice, 64 bytes per half.  The first half prefetches the
   source 896 and 960 bytes ahead; the second half issues prefetchw on
   the destination 832 and 896 bytes ahead.  Each branch consumes the
   ZF set by the decq at the top of its half: the movq, leaq and
   prefetch instructions in between do not modify flags.  */
L(prewloop): /* cache-line in state M */
|
|
decq %rcx
|
|
|
|
movq (%rsi), %rax
|
|
movq 8 (%rsi), %rbx
|
|
movq 16 (%rsi), %r9
|
|
movq 24 (%rsi), %r10
|
|
movq 32 (%rsi), %r11
|
|
movq 40 (%rsi), %r12
|
|
movq 48 (%rsi), %r13
|
|
movq 56 (%rsi), %r14
|
|
|
|
prefetcht0 0 + 896 (%rsi)
|
|
prefetcht0 64 + 896 (%rsi)
|
|
|
|
movq %rax, (%rdi)
|
|
movq %rbx, 8(%rdi)
|
|
movq %r9, 16(%rdi)
|
|
movq %r10, 24(%rdi)
|
|
movq %r11, 32(%rdi)
|
|
movq %r12, 40(%rdi)
|
|
movq %r13, 48(%rdi)
|
|
movq %r14, 56(%rdi)
|
|
|
|
leaq 64(%rsi), %rsi
|
|
leaq 64(%rdi), %rdi
|
|
|
|
jz L(prebail) /* chunk count reached zero after the first half */
|
|
|
|
decq %rcx
|
|
|
|
movq (%rsi), %rax
|
|
movq 8(%rsi), %rbx
|
|
movq 16(%rsi), %r9
|
|
movq 24(%rsi), %r10
|
|
movq 32(%rsi), %r11
|
|
movq 40(%rsi), %r12
|
|
movq 48(%rsi), %r13
|
|
movq 56(%rsi), %r14
|
|
|
|
movq %rax, (%rdi)
|
|
movq %rbx, 8(%rdi)
|
|
movq %r9, 16(%rdi)
|
|
movq %r10, 24(%rdi)
|
|
movq %r11, 32(%rdi)
|
|
movq %r12, 40(%rdi)
|
|
movq %r13, 48(%rdi)
|
|
movq %r14, 56(%rdi)
|
|
|
|
prefetchw 896 - 64(%rdi)
|
|
prefetchw 896 - 0(%rdi)
|
|
|
|
leaq 64(%rsi), %rsi
|
|
leaq 64(%rdi), %rdi
|
|
|
|
jnz L(prewloop)
|
|
jmp L(prebail) /* skip over the non-PREFETCHW variant */
|
|
|
|
.p2align 4
|
|
|
|
/* ... when PREFETCHW is not available. */
|
|
|
|
/* Same structure as L(prewloop), except that the destination is
   prefetched with prefetcht0 instead of prefetchw.  */
L(preloop): /* cache-line in state E */
|
|
decq %rcx
|
|
|
|
movq (%rsi), %rax
|
|
movq 8(%rsi), %rbx
|
|
movq 16(%rsi), %r9
|
|
movq 24(%rsi), %r10
|
|
movq 32(%rsi), %r11
|
|
movq 40(%rsi), %r12
|
|
movq 48(%rsi), %r13
|
|
movq 56(%rsi), %r14
|
|
|
|
prefetcht0 896 + 0(%rsi)
|
|
prefetcht0 896 + 64(%rsi)
|
|
|
|
movq %rax, (%rdi)
|
|
movq %rbx, 8(%rdi)
|
|
movq %r9, 16(%rdi)
|
|
movq %r10, 24(%rdi)
|
|
movq %r11, 32(%rdi)
|
|
movq %r12, 40(%rdi)
|
|
movq %r13, 48(%rdi)
|
|
movq %r14, 56(%rdi)
|
|
|
|
leaq 64 (%rsi), %rsi
|
|
leaq 64 (%rdi), %rdi
|
|
|
|
jz L(prebail) /* chunk count reached zero after the first half */
|
|
|
|
decq %rcx
|
|
|
|
movq (%rsi), %rax
|
|
movq 8(%rsi), %rbx
|
|
movq 16(%rsi), %r9
|
|
movq 24(%rsi), %r10
|
|
movq 32(%rsi), %r11
|
|
movq 40(%rsi), %r12
|
|
movq 48(%rsi), %r13
|
|
movq 56(%rsi), %r14
|
|
|
|
prefetcht0 896 - 64(%rdi)
|
|
prefetcht0 896 - 0(%rdi)
|
|
|
|
movq %rax, (%rdi)
|
|
movq %rbx, 8(%rdi)
|
|
movq %r9, 16(%rdi)
|
|
movq %r10, 24(%rdi)
|
|
movq %r11, 32(%rdi)
|
|
movq %r12, 40(%rdi)
|
|
movq %r13, 48(%rdi)
|
|
movq %r14, 56(%rdi)
|
|
|
|
leaq 64(%rsi), %rsi
|
|
leaq 64(%rdi), %rdi
|
|
|
|
jnz L(preloop)
|
|
|
|
/* Reload the four registers saved before the loops.  */
L(prebail):
|
|
movq SAVE3(%rsp), %rbx
|
|
cfi_restore (%rbx)
|
|
movq SAVE2(%rsp), %r12
|
|
cfi_restore (%r12)
|
|
movq SAVE1(%rsp), %r13
|
|
cfi_restore (%r13)
|
|
movq SAVE0(%rsp), %r14
|
|
cfi_restore (%r14)
|
|
|
|
/* .p2align 4 */
|
|
|
|
L(preskip):
|
|
subq %r8, %rdx /* check for more */
|
|
testq $-64, %rdx /* any bit above bit 5 set, i.e. 64 or more bytes left? */
|
|
jnz L(preafter)
|
|
|
|
andl $63, %edx /* check for leftovers */
|
|
#ifdef USE_AS_MEMPCPY
|
|
jnz L(1) /* residual bytes: finish in the tiny-copy tail */
|
|
|
|
movq %rdi, %rax /* mempcpy returns the advanced destination */
|
|
#else
|
|
movq RETVAL(%rsp), %rax /* reload return value; mov keeps the andl flags */
|
|
jnz L(1) /* residual bytes: finish in the tiny-copy tail */
|
|
|
|
rep /* prefix for the retq below */
|
|
#endif
|
|
retq /* exit */
|
|
|
|
.p2align 4
|
|
|
|
/* Reached from L(preskip) when 64 or more bytes remain after the
   prefetching stage.  */
L(preafter):
|
|
|
|
/* Handle huge blocks. */
|
|
|
|
L(NTtry):
|
|
|
|
L(NT): /* non-temporal 128-byte */
|
|
movq %rdx, %rcx
|
|
shrq $7, %rcx /* %rcx = number of 128-byte chunks */
|
|
jz L(NTskip)
|
|
|
|
/* The loop uses %r12, %r13 and %r14 as copy temporaries, so their
   values are stored in the slots below %rsp first and reloaded after
   the loop.  %rbx is not used by this loop.  */
movq %r14, SAVE0(%rsp)
|
|
cfi_rel_offset (%r14, SAVE0)
|
|
movq %r13, SAVE1(%rsp)
|
|
cfi_rel_offset (%r13, SAVE1)
|
|
movq %r12, SAVE2(%rsp)
|
|
cfi_rel_offset (%r12, SAVE2)
|
|
|
|
.p2align 4
|
|
|
|
/* Each iteration copies 128 bytes as two groups of eight quadwords,
   loading with movq and storing with movntiq, after prefetching the
   source 768 and 832 bytes ahead with prefetchnta.  The jnz at the
   bottom consumes the ZF set by decq: none of the instructions in
   between modify flags.  */
L(NTloop):
|
|
prefetchnta 768(%rsi)
|
|
prefetchnta 832(%rsi)
|
|
|
|
decq %rcx
|
|
|
|
movq (%rsi), %rax
|
|
movq 8(%rsi), %r8
|
|
movq 16(%rsi), %r9
|
|
movq 24(%rsi), %r10
|
|
movq 32(%rsi), %r11
|
|
movq 40(%rsi), %r12
|
|
movq 48(%rsi), %r13
|
|
movq 56(%rsi), %r14
|
|
|
|
movntiq %rax, (%rdi)
|
|
movntiq %r8, 8(%rdi)
|
|
movntiq %r9, 16(%rdi)
|
|
movntiq %r10, 24(%rdi)
|
|
movntiq %r11, 32(%rdi)
|
|
movntiq %r12, 40(%rdi)
|
|
movntiq %r13, 48(%rdi)
|
|
movntiq %r14, 56(%rdi)
|
|
|
|
movq 64(%rsi), %rax
|
|
movq 72(%rsi), %r8
|
|
movq 80(%rsi), %r9
|
|
movq 88(%rsi), %r10
|
|
movq 96(%rsi), %r11
|
|
movq 104(%rsi), %r12
|
|
movq 112(%rsi), %r13
|
|
movq 120(%rsi), %r14
|
|
|
|
movntiq %rax, 64(%rdi)
|
|
movntiq %r8, 72(%rdi)
|
|
movntiq %r9, 80(%rdi)
|
|
movntiq %r10, 88(%rdi)
|
|
movntiq %r11, 96(%rdi)
|
|
movntiq %r12, 104(%rdi)
|
|
movntiq %r13, 112(%rdi)
|
|
movntiq %r14, 120(%rdi)
|
|
|
|
leaq 128(%rsi), %rsi
|
|
leaq 128(%rdi), %rdi
|
|
|
|
jnz L(NTloop)
|
|
|
|
sfence /* serialize memory stores */
|
|
|
|
movq SAVE2(%rsp), %r12
|
|
cfi_restore (%r12)
|
|
movq SAVE1(%rsp), %r13
|
|
cfi_restore (%r13)
|
|
movq SAVE0(%rsp), %r14
|
|
cfi_restore (%r14)
|
|
|
|
L(NTskip):
|
|
andl $127, %edx /* check for leftovers */
|
|
#ifdef USE_AS_MEMPCPY
|
|
jnz L(1) /* residual bytes: finish in the tiny-copy tail */
|
|
|
|
movq %rdi, %rax /* mempcpy returns the advanced destination */
|
|
#else
|
|
movq RETVAL(%rsp), %rax /* reload return value; mov keeps the andl flags */
|
|
jnz L(1) /* residual bytes: finish in the tiny-copy tail */
|
|
|
|
rep /* prefix for the retq below */
|
|
#endif
|
|
retq /* exit */
|
|
|
|
#endif /* !NOT_IN_libc */
|
|
|
|
END(memcpy)
|
|
|
|
/* Emitted only when this file is built as memcpy, not as mempcpy.
   NOTE(review): libc_hidden_builtin_def is defined outside this file;
   presumably it creates the hidden internal alias for memcpy --
   confirm in the libc symbol-visibility headers.  */
#ifndef USE_AS_MEMPCPY
|
|
libc_hidden_builtin_def (memcpy)
|
|
#endif
|