mirror of
https://sourceware.org/git/glibc.git
synced 2025-01-15 21:40:05 +00:00
96fbb9a328
This patch introduces the new arch13 ifunc variant for memmove. For the forward or non-overlapping case it simply uses memcpy. For the backward case it relies on the new instruction mvcrl, which copies up to 256 bytes at once. In case of an overlap, mvcrl copies the bytes as if they were copied one by one from right to left.
ChangeLog:
* sysdeps/s390/ifunc-memcpy.h (HAVE_MEMMOVE_ARCH13, MEMMOVE_ARCH13, HAVE_MEMMOVE_IFUNC_AND_ARCH13_SUPPORT): New defines.
* sysdeps/s390/memcpy-z900.S: Add arch13 memmove implementation.
* sysdeps/s390/memmove.c (memmove): Add arch13 variant in ifunc selector.
* sysdeps/s390/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add ifunc variant for arch13 memmove.
* sysdeps/s390/multiarch/ifunc-resolve.h (S390_STFLE_BITS_ARCH13_MIE3, S390_IS_ARCH13_MIE3): New defines.
367 lines
10 KiB
ArmAsm
367 lines
10 KiB
ArmAsm
/* memcpy - copy a block from source to destination. 31/64 bit S/390 version.
|
|
Copyright (C) 2012-2019 Free Software Foundation, Inc.
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library; if not, see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
|
|
#include <sysdep.h>
|
|
#include "asm-syntax.h"
|
|
#include <ifunc-memcpy.h>
|
|
|
|
/* INPUT PARAMETERS
|
|
%r2 = address of destination memory area
|
|
%r3 = address of source memory area
|
|
%r4 = number of bytes to copy. */
|
|
|
|
/* Everything below is assembled into the text section.  */
	.text

/* Width-neutral mnemonics used by the z900/g5 memcpy variant and by
   __memcpy_mvcle.  On 64 bit (__s390x__) they expand to the 64 bit
   register instructions, otherwise to their 32 bit counterparts:
     LTGR   load and test register
     CGHI   compare with halfword immediate
     LGR    load register
     AGHI   add halfword immediate
     BRCTG  branch relative on count
   They are #undef'ed again after __memcpy_mvcle.  */
#if defined __s390x__
# define LTGR	ltgr
# define CGHI	cghi
# define LGR	lgr
# define AGHI	aghi
# define BRCTG	brctg
#else
# define LTGR	ltr
# define CGHI	chi
# define LGR	lr
# define AGHI	ahi
# define BRCTG	brct
#endif /* ! defined __s390x__  */
|
|
|
|
#if HAVE_MEMCPY_Z900_G5
|
|
/* mempcpy, z900 (64 bit) / g5 (31 bit) variant.
   In:  %r2 = dest, %r3 = src, %r4 = n.
   Out: %r2 = dest + n.
   Only prepares the registers and then continues in MEMCPY_Z900_G5 at
   .L_Z900_G5_start, which performs the actual copy and returns.  */
ENTRY(MEMPCPY_Z900_G5)
# if defined __s390x__
	.machine "z900"
# else
	.machine "g5"
# endif /* ! defined __s390x__  */
	LGR	%r1,%r2			/* r1: working dest pointer.  */
	la	%r2,0(%r4,%r2)		/* r2: return value dest + n.  */
	j	.L_Z900_G5_start
END(MEMPCPY_Z900_G5)
|
|
|
|
/* memcpy, z900 (64 bit) / g5 (31 bit) variant.
   In:  %r2 = dest, %r3 = src, %r4 = n.
   Out: %r2 = dest (left untouched).
   Register use inside the function:
     %r1 = running destination pointer
     %r3 = running source pointer
     %r4 = n - 1 (mvc encodes its length as length - 1)
     %r5 = number of full 256 byte blocks; later the address used by ex
   .L_Z900_G5_start is also the entry point used by MEMPCPY_Z900_G5.  */
ENTRY(MEMCPY_Z900_G5)
# if defined __s390x__
	.machine "z900"
# else
	.machine "g5"
# endif /* ! defined __s390x__  */
	LGR	%r1,%r2			/* r1: Use as dest ; r2: Return dest.  */
.L_Z900_G5_start:
	LTGR	%r4,%r4
	je	.L_Z900_G5_4		/* Nothing to do for n == 0.  */
	AGHI	%r4,-1			/* r4 = n - 1.  */
# if defined __s390x__
	srlg	%r5,%r4,8		/* r5 = (n - 1) / 256.  */
# else
	lr	%r5,%r4
	srl	%r5,8			/* r5 = (n - 1) / 256.  */
# endif /* ! defined __s390x__  */
	LTGR	%r5,%r5
	jne	.L_Z900_G5_13		/* At least one full 256 byte block.  */
.L_Z900_G5_3:
	/* Copy the remaining ((n - 1) & 0xff) + 1 bytes: ex runs the mvc
	   template at .L_Z900_G5_15 with the low byte of r4 or'ed into its
	   length field.  r5 provides the address of the template (64 bit:
	   directly via larl; 31 bit: basr yields the current address and
	   Z900_G5_EX_D is the distance to the template).  */
# if defined __s390x__
	larl	%r5,.L_Z900_G5_15
#  define Z900_G5_EX_D 0
# else
	basr	%r5,0
.L_Z900_G5_14:
#  define Z900_G5_EX_D .L_Z900_G5_15-.L_Z900_G5_14
# endif /* ! defined __s390x__  */
	ex	%r4,Z900_G5_EX_D(%r5)
.L_Z900_G5_4:
	br	%r14
.L_Z900_G5_13:
	CGHI	%r5,4096		/* Switch to mvcle for copies >1MB.  */
	jh	__memcpy_mvcle
.L_Z900_G5_12:
	/* Copy r5 blocks of 256 bytes, then finish with the tail above.  */
	mvc	0(256,%r1),0(%r3)
	la	%r1,256(%r1)
	la	%r3,256(%r3)
	BRCTG	%r5,.L_Z900_G5_12
	j	.L_Z900_G5_3
.L_Z900_G5_15:
	/* Template for ex only; never reached by falling through.  */
	mvc	0(1,%r1),0(%r3)
END(MEMCPY_Z900_G5)
|
|
#endif /* HAVE_MEMCPY_Z900_G5 */
|
|
|
|
/* Copy helper for very large blocks, reached by a jump from the memcpy
   variants in this file.
   In:  %r1 = dest, %r2 = value to return (dest or dest + n),
        %r3 = src, %r4 = n - 1.
   Out: %r2 = the value passed in %r2.
   Using this as a standalone function will give unexpected results,
   since the length is incremented by 1 here in order to compensate the
   decrement already done by the callers above.  */
ENTRY(__memcpy_mvcle)
	LGR	%r0,%r2			/* Backup return dest [ + n ].  */
	AGHI	%r4,1			/* length + 1  */
	/* mvcle works on the register pairs r2/r3 (destination address and
	   length) and r4/r5 (source address and length).  */
	LGR	%r5,%r4			/* source length  */
	LGR	%r4,%r3			/* source address  */
	LGR	%r2,%r1			/* destination address  */
	LGR	%r3,%r5			/* destination length = source length  */
.L_MVCLE_1:
	mvcle	%r2,%r4,0		/* Pad byte 0; lengths are equal.  */
	jo	.L_MVCLE_1		/* cc 3: not finished yet, resume.  */
	LGR	%r2,%r0			/* return destination address  */
	br	%r14
END(__memcpy_mvcle)
|
|
|
|
/* The width-neutral mnemonics are only needed by the code above; the
   variants below use the 64 bit instructions directly.  */
#undef LTGR
#undef CGHI
#undef LGR
#undef AGHI
#undef BRCTG
|
|
|
|
#if HAVE_MEMCPY_Z10
|
|
/* mempcpy, z10 variant.
   In:  %r2 = dest, %r3 = src, %r4 = n.
   Out: %r2 = dest + n.
   Only prepares the registers and then continues in MEMCPY_Z10 at
   .L_Z10_start, which performs the actual copy and returns.  */
ENTRY(MEMPCPY_Z10)
	.machine "z10"
	.machinemode "zarch_nohighgprs"
	lgr	%r1,%r2			/* r1: working dest pointer.  */
	la	%r2,0(%r4,%r2)		/* r2: return value dest + n.  */
	j	.L_Z10_start
END(MEMPCPY_Z10)
|
|
|
|
/* memcpy, z10 variant.
   In:  %r2 = dest, %r3 = src, %r4 = n.
   Out: %r2 = dest (left untouched).
   Register use inside the function:
     %r1 = running destination pointer
     %r3 = running source pointer
     %r4 = n - 1 (mvc encodes its length as length - 1)
     %r5 = number of full 256 byte blocks
   .L_Z10_start is also the entry point used by MEMPCPY_Z10.  */
ENTRY(MEMCPY_Z10)
	.machine "z10"
	.machinemode "zarch_nohighgprs"
	lgr	%r1,%r2			/* r1: Use as dest ; r2: Return dest.  */
.L_Z10_start:
# if !defined __s390x__
	/* 31 bit: zero-extend the length, as the 64 bit instructions below
	   look at the whole register.  */
	llgfr	%r4,%r4
# endif /* !defined __s390x__  */
	cgije	%r4,0,.L_Z10_4		/* Nothing to do for n == 0.  */
	aghi	%r4,-1			/* r4 = n - 1.  */
	srlg	%r5,%r4,8		/* r5 = (n - 1) / 256.  */
	cgijlh	%r5,0,.L_Z10_13		/* At least one full 256 byte block.  */
.L_Z10_3:
	/* Copy the remaining ((n - 1) & 0xff) + 1 bytes: exrl runs the mvc
	   template at .L_Z10_15 with the low byte of r4 or'ed into its
	   length field.  */
	exrl	%r4,.L_Z10_15
.L_Z10_4:
	br	%r14
.L_Z10_13:
	cgfi	%r5,65535		/* Switch to mvcle for copies >16MB.  */
	jh	__memcpy_mvcle
.L_Z10_12:
	/* Copy r5 blocks of 256 bytes, prefetching 768 bytes ahead
	   (code 1: for fetch access, code 2: for store access).  */
	pfd	1,768(%r3)
	pfd	2,768(%r1)
	mvc	0(256,%r1),0(%r3)
	la	%r1,256(%r1)
	la	%r3,256(%r3)
	brctg	%r5,.L_Z10_12
	j	.L_Z10_3
.L_Z10_15:
	/* Template for exrl only; never reached by falling through.  */
	mvc	0(1,%r1),0(%r3)
END(MEMCPY_Z10)
|
|
#endif /* HAVE_MEMCPY_Z10 */
|
|
|
|
#if HAVE_MEMCPY_Z196
|
|
/* mempcpy, z196 variant.
   In:  %r2 = dest, %r3 = src, %r4 = n.
   Out: %r2 = dest + n.
   Only prepares the registers and then continues in MEMCPY_Z196 at
   .L_Z196_start, which performs the actual copy and returns.  */
ENTRY(MEMPCPY_Z196)
	.machine "z196"
	.machinemode "zarch_nohighgprs"
	lgr	%r1,%r2			/* r1: working dest pointer.  */
	la	%r2,0(%r4,%r2)		/* r2: return value dest + n.  */
	j	.L_Z196_start
END(MEMPCPY_Z196)
|
|
|
|
/* memcpy, z196 variant.
   In:  %r2 = dest, %r3 = src, %r4 = n.
   Out: %r2 = dest (left untouched).
   Register use inside the function:
     %r1 = running destination pointer
     %r3 = running source pointer
     %r4 = n - 1 (mvc encodes its length as length - 1)
     %r5 = number of full 256 byte blocks
   Additional entry points:
     .L_Z196_start   used by MEMPCPY_Z196.
     .L_Z196_start2  used by the memmove variants in this file for the
		     forward case; expects r1 = dest, r2 = return value,
		     r3 = src and a zero-extended, non-zero r4 = n.  */
ENTRY(MEMCPY_Z196)
	.machine "z196"
	.machinemode "zarch_nohighgprs"
	lgr	%r1,%r2			/* r1: Use as dest ; r2: Return dest.  */
.L_Z196_start:
# if !defined __s390x__
	/* 31 bit: zero-extend the length, as the 64 bit instructions below
	   look at the whole register.  */
	llgfr	%r4,%r4
# endif /* !defined __s390x__  */
	ltgr	%r4,%r4
	je	.L_Z196_4		/* Nothing to do for n == 0.  */
.L_Z196_start2:
	aghi	%r4,-1			/* r4 = n - 1.  */
	srlg	%r5,%r4,8		/* r5 = (n - 1) / 256.  */
	ltgr	%r5,%r5
	jne	.L_Z196_5		/* At least one full 256 byte block.  */
.L_Z196_3:
	/* Copy the remaining ((n - 1) & 0xff) + 1 bytes: exrl runs the mvc
	   template at .L_Z196_14 with the low byte of r4 or'ed into its
	   length field.  */
	exrl	%r4,.L_Z196_14
.L_Z196_4:
	br	%r14
.L_Z196_5:
	cgfi	%r5,262144		/* Switch to mvcle for copies >64MB.  */
	jh	__memcpy_mvcle
.L_Z196_2:
	/* Copy r5 blocks of 256 bytes, prefetching 768 bytes ahead
	   (code 1: for fetch access, code 2: for store access).  */
	pfd	1,768(%r3)
	pfd	2,768(%r1)
	mvc	0(256,%r1),0(%r3)
	aghi	%r5,-1			/* Sets the cc tested by jne below ...  */
	la	%r1,256(%r1)		/* ... as la leaves the cc unchanged.  */
	la	%r3,256(%r3)
	jne	.L_Z196_2
	j	.L_Z196_3
.L_Z196_14:
	/* Template for exrl only; never reached by falling through.  */
	mvc	0(1,%r1),0(%r3)
END(MEMCPY_Z196)
|
|
#endif /* HAVE_MEMCPY_Z196 */
|
|
|
|
#if HAVE_MEMMOVE_Z13
|
|
/* memmove, z13 variant (uses vector instructions).
   In:  %r2 = dest, %r3 = src, %r4 = n.
   Out: %r2 = dest (never modified here).
   Strategy:
     n <= 16            one vll/vstl pair (load everything, then store).
     dest - src >= n    (unsigned) forward copy is safe: continue in
			MEMCPY_Z196 at .L_Z196_start2.
     otherwise          copy backwards from the end of the buffers in
			64 byte and 16 byte blocks; the remaining bytes
			at the start are done by the vll/vstl pair.
   Register use in the backward case:
     %r4 = offset of the block that is copied next
     %r5 = number of 16 byte blocks still to copy
     %r0 = number of 64 byte blocks still to copy  */
ENTRY(MEMMOVE_Z13)
	.machine "z13"
	.machinemode "zarch_nohighgprs"
# if !defined __s390x__
	/* Note: The 31bit dst and src pointers are prefixed with zeroes.  */
	llgfr	%r4,%r4
	llgfr	%r3,%r3
	llgfr	%r2,%r2
# endif /* !defined __s390x__ */
	sgrk	%r0,%r2,%r3		/* r0 = dst - src.  */
	clgijh	%r4,16,.L_MEMMOVE_Z13_LARGE
	aghik	%r5,%r4,-1		/* r5 = highest index; sets the cc.  */
.L_MEMMOVE_Z13_SMALL:
	/* Expects r5 = number of bytes to copy - 1 and the cc of the aghik
	   which computed it.  */
	jl	.L_MEMMOVE_Z13_END	/* Jump away if len was zero.  */
	/* Store up to 16 bytes with vll/vstl which needs the index
	   instead of lengths.  */
	vll	%v16,%r5,0(%r3)
	vstl	%v16,%r5,0(%r2)
.L_MEMMOVE_Z13_END:
	br	%r14
.L_MEMMOVE_Z13_LARGE:
	lgr	%r1,%r2			/* For memcpy: r1: Use as dest ;
					   r2: Return dest  */
	/* The unsigned comparison (dst - src >= len) determines if we can
	   execute the forward case with memcpy.  */
#if ! HAVE_MEMCPY_Z196
# error The z13 variant of memmove needs the z196 variant of memcpy!
#endif
	clgrjhe	%r0,%r4,.L_Z196_start2
	/* Backward case.  */
	risbgn	%r5,%r4,4,128+63,60	/* r5 = r4 / 16  */
	aghi	%r4,-16			/* r4 = offset of the last 16 bytes.  */
	clgijhe	%r5,8,.L_MEMMOVE_Z13_LARGE_64B
.L_MEMMOVE_Z13_LARGE_16B_LOOP:
	/* Store at least 16 bytes with vl/vst.  The number of 16byte blocks
	   is stored in r5.  */
	vl	%v16,0(%r4,%r3)
	vst	%v16,0(%r4,%r2)
	aghi	%r4,-16
	brctg	%r5,.L_MEMMOVE_Z13_LARGE_16B_LOOP
	/* r4 is now (remaining bytes - 16), so r5 becomes the highest index
	   of the remaining bytes and the cc is "low" if none are left.  */
	aghik	%r5,%r4,15
	j	.L_MEMMOVE_Z13_SMALL
.L_MEMMOVE_Z13_LARGE_64B:
	/* Store at least 128 bytes with 4x vl/vst.  The number of 64byte blocks
	   will be stored in r0.  */
	aghi	%r4,-48			/* r4 = offset of the last 64 bytes.  */
	srlg	%r0,%r5,2		/* r0 = r5 / 4
					   => Number of 64byte blocks.  */
.L_MEMMOVE_Z13_LARGE_64B_LOOP:
	/* All four loads are done before the first store.  */
	vl	%v20,48(%r4,%r3)
	vl	%v19,32(%r4,%r3)
	vl	%v18,16(%r4,%r3)
	vl	%v17,0(%r4,%r3)
	vst	%v20,48(%r4,%r2)
	vst	%v19,32(%r4,%r2)
	vst	%v18,16(%r4,%r2)
	vst	%v17,0(%r4,%r2)
	aghi	%r4,-64
	brctg	%r0,.L_MEMMOVE_Z13_LARGE_64B_LOOP
	aghi	%r4,48			/* Back to a 16 byte block offset.  */
	/* Recalculate the number of 16byte blocks.  */
	risbg	%r5,%r5,62,128+63,0	/* r5 = r5 & 3
					   => Remaining 16byte blocks.  */
	jne	.L_MEMMOVE_Z13_LARGE_16B_LOOP
	aghik	%r5,%r4,15		/* Highest index of remaining bytes.  */
	j	.L_MEMMOVE_Z13_SMALL
END(MEMMOVE_Z13)
|
|
#endif /* HAVE_MEMMOVE_Z13 */
|
|
|
|
#if HAVE_MEMMOVE_ARCH13
|
|
/* memmove, arch13 variant (uses mvcrl for the backward case).
   In:  %r2 = dest, %r3 = src, %r4 = n.
   Out: %r2 = dest (never modified here).
   Strategy:
     n <= 16            one vll/vstl pair (load everything, then store).
     dest - src >= n    (unsigned) forward copy is safe: continue in
			MEMCPY_Z196 at .L_Z196_start2.
     n <= 256           a single mvcrl.
     otherwise          mvcrl for the last ((n - 1) & 0xff) + 1 bytes,
			then (n - 1) / 256 blocks of 256 bytes, walking
			from the end of the buffers to their start.
   mvcrl takes its length (number of bytes - 1) from r0.  */
ENTRY(MEMMOVE_ARCH13)
	.machine "arch13"
	.machinemode "zarch_nohighgprs"
# if ! defined __s390x__
	/* Note: The 31bit dst and src pointers are prefixed with zeroes.  */
	llgfr	%r4,%r4
	llgfr	%r3,%r3
	llgfr	%r2,%r2
# endif /* ! defined __s390x__ */
	sgrk	%r5,%r2,%r3		/* r5 = dst - src.  */
	aghik	%r0,%r4,-1		/* Both vstl and mvcrl needs highest index.  */
	clgijh	%r4,16,.L_MEMMOVE_ARCH13_LARGE
.L_MEMMOVE_ARCH13_SMALL:
	jl	.L_MEMMOVE_ARCH13_END	/* Return if len was zero (cc of aghik).  */
	/* Store up to 16 bytes with vll/vstl (needs highest index).  */
	vll	%v16,%r0,0(%r3)
	vstl	%v16,%r0,0(%r2)
.L_MEMMOVE_ARCH13_END:
	br	%r14
.L_MEMMOVE_ARCH13_LARGE:
	lgr	%r1,%r2		/* For memcpy: r1: Use as dest ; r2: Return dest  */
	/* The unsigned comparison (dst - src >= len) determines if we can
	   execute the forward case with memcpy.  */
#if ! HAVE_MEMCPY_Z196
# error The arch13 variant of memmove needs the z196 variant of memcpy!
#endif
	clgrjhe	%r5,%r4,.L_Z196_start2
	/* Backward case.  */
	clgijh	%r0,255,.L_MEMMOVE_ARCH13_LARGER_256B
	/* Move up to 256bytes with mvcrl (move right to left).  */
	mvcrl	0(%r1),0(%r3)	/* Move (r0 + 1) bytes from r3 to r1.  */
	br	%r14
.L_MEMMOVE_ARCH13_LARGER_256B:
	/* First move the "remaining" block of up to 256 bytes at the end of
	   src/dst buffers.  Then move blocks of 256bytes in a loop starting
	   with the block at the end.
	   (If src/dst pointers are aligned e.g. to 256 bytes, then the pointers
	   passed to mvcrl instructions are aligned, too)  */
	risbgn	%r5,%r0,8,128+63,56	/* r5 = r0 / 256  */
	risbgn	%r0,%r0,56,128+63,0	/* r0 = r0 & 0xFF  */
	slgr	%r4,%r0			/* r4 = n - r0  */
	lay	%r1,-1(%r4,%r1)		/* r1 = dst + n - (r0 + 1)  */
	lay	%r3,-1(%r4,%r3)		/* r3 = src + n - (r0 + 1)  */
	mvcrl	0(%r1),0(%r3)	/* Move (r0 + 1) bytes from r3 to r1.  */
	lghi	%r0,255		/* Always copy 256 bytes in the loop below!  */
.L_MEMMOVE_ARCH13_LARGE_256B_LOOP:
	aghi	%r1,-256
	aghi	%r3,-256
	mvcrl	0(%r1),0(%r3)	/* Move (r0 + 1) bytes from r3 to r1.  */
	brctg	%r5,.L_MEMMOVE_ARCH13_LARGE_256B_LOOP
	br	%r14
END(MEMMOVE_ARCH13)
|
|
#endif /* HAVE_MEMMOVE_ARCH13 */
|
|
|
|
#if ! HAVE_MEMCPY_IFUNC
/* If we don't use ifunc, define an alias for mem[p]cpy here.
   Otherwise see sysdeps/s390/mem[p]cpy.c.
   memcpy and __mempcpy become strong aliases of the default variants;
   mempcpy is a weak alias of __mempcpy.  */
strong_alias (MEMCPY_DEFAULT, memcpy)
strong_alias (MEMPCPY_DEFAULT, __mempcpy)
weak_alias (__mempcpy, mempcpy)
#endif
|
|
|
|
#if ! HAVE_MEMMOVE_IFUNC
/* If we don't use ifunc, define an alias for memmove here.
   Otherwise see sysdeps/s390/memmove.c.  */
# if ! HAVE_MEMMOVE_C
/* If the c variant is needed, then sysdeps/s390/memmove-c.c
   defines memmove.
   Otherwise MEMMOVE_DEFAULT is implemented here and we have to define it.  */
strong_alias (MEMMOVE_DEFAULT, memmove)
# endif
#endif
|
|
|
|
#if defined SHARED && IS_IN (libc)
/* Defines the internal symbols.
   Compare to libc_hidden_[builtin_]def (mem[p]cpy) in string/mem[p]cpy.c.
   Each __GI_* name is a strong alias of the corresponding default
   variant.  */
strong_alias (MEMCPY_DEFAULT, __GI_memcpy)
strong_alias (MEMPCPY_DEFAULT, __GI_mempcpy)
strong_alias (MEMPCPY_DEFAULT, __GI___mempcpy)
# if ! HAVE_MEMMOVE_C
/* If the c variant is needed, then sysdeps/s390/memmove-c.c
   defines the internal symbol.
   Otherwise MEMMOVE_DEFAULT is implemented here and we have to define it.  */
strong_alias (MEMMOVE_DEFAULT, __GI_memmove)
# endif
#endif
|