diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h
index bc8d842238..6c0a3fe4e1 100644
--- a/sysdeps/aarch64/cpu-features.h
+++ b/sysdeps/aarch64/cpu-features.h
@@ -40,14 +40,6 @@
 #define MIDR_IMPLEMENTOR(midr)	\
 (((midr) & MIDR_IMPLEMENTOR_MASK) >> MIDR_IMPLEMENTOR_SHIFT)
 
-#define IS_THUNDERX(midr) (MIDR_IMPLEMENTOR(midr) == 'C'	\
-			   && MIDR_PARTNUM(midr) == 0x0a1)
-
-#define IS_THUNDERX2PA(midr) (MIDR_IMPLEMENTOR(midr) == 'B'	\
-			      && MIDR_PARTNUM(midr) == 0x516)
-#define IS_THUNDERX2(midr) (MIDR_IMPLEMENTOR(midr) == 'C'	\
-			    && MIDR_PARTNUM(midr) == 0xaf)
-
 #define IS_EMAG(midr) (MIDR_IMPLEMENTOR(midr) == 'P'	\
 		       && MIDR_PARTNUM(midr) == 0x000)
 
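The IS_THUNDERX* macros deleted above classified the running CPU by two MIDR_EL1 fields. As background, here is a minimal standalone C sketch of the same checks; it follows the architectural MIDR_EL1 layout (implementer code in bits 31:24, part number in bits 15:4), and the helper names are illustrative rather than glibc's:

#include <stdint.h>
#include <stdio.h>

/* Architectural MIDR_EL1 field extraction (sketch, not glibc code).  */
static unsigned midr_implementor (uint64_t midr) { return (midr >> 24) & 0xff; }
static unsigned midr_partnum (uint64_t midr) { return (midr >> 4) & 0xfff; }

/* The checks the removed macros performed: 'C' (0x43) is the Cavium
   implementer code and 'B' (0x42) is Broadcom, which is why ThunderX2
   parts can appear under either ID.  */
static int is_thunderx (uint64_t midr)
{
  return midr_implementor (midr) == 'C' && midr_partnum (midr) == 0x0a1;
}

static int is_thunderx2 (uint64_t midr)
{
  return (midr_implementor (midr) == 'C' && midr_partnum (midr) == 0xaf)
	 || (midr_implementor (midr) == 'B' && midr_partnum (midr) == 0x516);
}

int main (void)
{
  uint64_t midr = ((uint64_t) 'C' << 24) | (0x0a1 << 4);  /* ThunderX */
  printf ("thunderx=%d thunderx2=%d\n", is_thunderx (midr), is_thunderx2 (midr));
  return 0;
}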
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 3e251cc234..772b16a358 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -7,8 +7,6 @@ sysdep_routines += \
   memcpy_mops \
   memcpy_oryon1 \
   memcpy_sve \
-  memcpy_thunderx \
-  memcpy_thunderx2 \
   memmove_mops \
   memset_a64fx \
   memset_emag \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index b2fda541f9..4a981e931d 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -35,9 +35,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/aarch64/multiarch/memcpy.c, memmove.c and memset.c.  */
   IFUNC_IMPL (i, name, memcpy,
-	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_oryon1)
-	      IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2)
 #if HAVE_AARCH64_SVE_ASM
 	      IFUNC_IMPL_ADD (array, i, memcpy, sve && !bti, __memcpy_a64fx)
 	      IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_sve)
@@ -45,9 +43,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy, mops, __memcpy_mops)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
-	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_oryon1)
-	      IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2)
 #if HAVE_AARCH64_SVE_ASM
 	      IFUNC_IMPL_ADD (array, i, memmove, sve && !bti, __memmove_a64fx)
 	      IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_sve)
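Removing the IFUNC_IMPL_ADD lines keeps the introspection table consumed by the string tests in sync with the resolvers changed below; an entry left here for a routine that no longer exists would fail to link. Conceptually, each IFUNC_IMPL_ADD appends a {name, selector condition, function} record to a caller-provided array, roughly like this simplified model (field and function names are illustrative, not glibc's exact definitions):

#include <stdbool.h>
#include <stddef.h>

struct impl_record
{
  const char *name;	/* e.g. "__memcpy_oryon1" */
  void *fn;		/* the candidate implementation */
  bool usable;		/* selector condition, e.g. sve && !bti */
};

/* Record a candidate if the array still has room; always count it so
   the caller can detect truncation.  */
static size_t
add_impl (struct impl_record *array, size_t i, size_t max,
	  const char *name, bool usable, void *fn)
{
  if (i < max)
    array[i] = (struct impl_record) { name, fn, usable };
  return i + 1;
}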
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 15c954778b..9251297ee6 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -30,8 +30,6 @@
 extern __typeof (__redirect_memcpy) __libc_memcpy;
 
 extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
-extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
-extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_sve attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_mops attribute_hidden;
@@ -55,12 +53,6 @@ select_memcpy_ifunc (void)
   if (IS_ORYON1 (midr))
     return __memcpy_oryon1;
 
-  if (IS_THUNDERX (midr))
-    return __memcpy_thunderx;
-
-  if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
-    return __memcpy_thunderx2;
-
   return __memcpy_generic;
 }
 
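select_memcpy_ifunc above is the resolver behind an ELF indirect function (ifunc): the dynamic linker calls it once while resolving memcpy and binds its return value as the real entry point, so the MIDR test costs nothing per call. A self-contained sketch of the mechanism using the GNU ifunc attribute (nothing below is glibc's actual code; a real resolver would branch on the MIDR the way the hunk above does):

#include <stddef.h>
#include <string.h>

/* One candidate implementation; glibc has several.  */
static void *
memcpy_generic_impl (void *dst, const void *src, size_t n)
{
  return memcpy (dst, src, n);
}

/* The resolver runs at relocation time and returns the chosen
   implementation.  After this patch, anything that is not a
   recognized special case falls back to the generic copy.  */
static void *(*resolve_my_memcpy (void)) (void *, const void *, size_t)
{
  return memcpy_generic_impl;
}

void *my_memcpy (void *dst, const void *src, size_t n)
     __attribute__ ((ifunc ("resolve_my_memcpy")));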
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
deleted file mode 100644
index 5d8438a82e..0000000000
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S
+++ /dev/null
@@ -1,305 +0,0 @@
-/* A Thunderx Optimized memcpy implementation for AARCH64.
-   Copyright (C) 2017-2024 Free Software Foundation, Inc.
-
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-/* The actual code in this memcpy and memmove should be identical to the
-   generic version except for the code under '#ifdef THUNDERX'.  This is
-   to make it easier to keep this version and the generic version in sync
-   for changes that are not specific to thunderx.  */
-
-#include <sysdep.h>
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, unaligned accesses.
- *
- */
-
-#define dstin	x0
-#define src	x1
-#define count	x2
-#define dst	x3
-#define srcend	x4
-#define dstend	x5
-#define A_l	x6
-#define A_lw	w6
-#define A_h	x7
-#define A_hw	w7
-#define B_l	x8
-#define B_lw	w8
-#define B_h	x9
-#define C_l	x10
-#define C_h	x11
-#define D_l	x12
-#define D_h	x13
-#define E_l	src
-#define E_h	count
-#define F_l	srcend
-#define F_h	dst
-#define G_l	count
-#define G_h	dst
-#define tmp1	x14
-
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
-   medium copies of 17..96 bytes which are fully unrolled.  Large copies
-   of more than 96 bytes align the destination and use an unrolled loop
-   processing 64 bytes per iteration.
-   In order to share code with memmove, small and medium copies read all
-   data before writing, allowing any kind of overlap.  So small, medium
-   and large backwards memmoves are handled by falling through into memcpy.
-   Overlapping large forward memmoves use a loop that copies backwards.
-*/
-
-ENTRY (__memmove_thunderx)
-
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
-
-	sub	tmp1, dstin, src
-	cmp	count, 96
-	ccmp	tmp1, count, 2, hi
-	b.lo	L(move_long)
-
-	/* Common case falls through into memcpy.  */
-END (__memmove_thunderx)
-
-ENTRY (__memcpy_thunderx)
-
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
-
-	prfm	PLDL1KEEP, [src]
-	add	srcend, src, count
-	add	dstend, dstin, count
-	cmp	count, 16
-	b.ls	L(copy16)
-	cmp	count, 96
-	b.hi	L(copy_long)
-
-	/* Medium copies: 17..96 bytes.  */
-	sub	tmp1, count, 1
-	ldp	A_l, A_h, [src]
-	tbnz	tmp1, 6, L(copy96)
-	ldp	D_l, D_h, [srcend, -16]
-	tbz	tmp1, 5, 1f
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [srcend, -32]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstend, -32]
-1:
-	stp	A_l, A_h, [dstin]
-	stp	D_l, D_h, [dstend, -16]
-	ret
-
-	.p2align 4
-	/* Small copies: 0..16 bytes.  */
-L(copy16):
-	cmp	count, 8
-	b.lo	1f
-	ldr	A_l, [src]
-	ldr	A_h, [srcend, -8]
-	str	A_l, [dstin]
-	str	A_h, [dstend, -8]
-	ret
-	.p2align 4
-1:
-	tbz	count, 2, 1f
-	ldr	A_lw, [src]
-	ldr	A_hw, [srcend, -4]
-	str	A_lw, [dstin]
-	str	A_hw, [dstend, -4]
-	ret
-
-	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
-	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
-1:
-	cbz	count, 2f
-	lsr	tmp1, count, 1
-	ldrb	A_lw, [src]
-	ldrb	A_hw, [srcend, -1]
-	ldrb	B_lw, [src, tmp1]
-	strb	A_lw, [dstin]
-	strb	B_lw, [dstin, tmp1]
-	strb	A_hw, [dstend, -1]
-2:	ret
-
-	.p2align 4
-	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
-	   32 bytes from the end.  */
-L(copy96):
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [src, 32]
-	ldp	D_l, D_h, [src, 48]
-	ldp	E_l, E_h, [srcend, -32]
-	ldp	F_l, F_h, [srcend, -16]
-	stp	A_l, A_h, [dstin]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstin, 32]
-	stp	D_l, D_h, [dstin, 48]
-	stp	E_l, E_h, [dstend, -32]
-	stp	F_l, F_h, [dstend, -16]
-	ret
-
-	/* Align DST to 16 byte alignment so that we don't cross cache line
-	   boundaries on both loads and stores.  There are at least 96 bytes
-	   to copy, so copy 16 bytes unaligned and then align.  The loop
-	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
-
-	.p2align 4
-L(copy_long):
-
-	/* On thunderx, large memcpy's are helped by software prefetching.
-	   This loop is identical to the one below it but with prefetching
-	   instructions included.  For loops that are less than 32768 bytes,
-	   the prefetching does not help and slows the code down, so we only
-	   use the prefetching loop for the largest memcpys.  */
-
-	cmp	count, #32768
-	b.lo	L(copy_long_without_prefetch)
-	and	tmp1, dstin, 15
-	bic	dst, dstin, 15
-	ldp	D_l, D_h, [src]
-	sub	src, src, tmp1
-	prfm	pldl1strm, [src, 384]
-	add	count, count, tmp1	/* Count is now 16 too large.  */
-	ldp	A_l, A_h, [src, 16]
-	stp	D_l, D_h, [dstin]
-	ldp	B_l, B_h, [src, 32]
-	ldp	C_l, C_h, [src, 48]
-	ldp	D_l, D_h, [src, 64]!
-	subs	count, count, 128 + 16	/* Test and readjust count.  */
-
-L(prefetch_loop64):
-	tbz	src, #6, 1f
-	prfm	pldl1strm, [src, 512]
-1:
-	stp	A_l, A_h, [dst, 16]
-	ldp	A_l, A_h, [src, 16]
-	stp	B_l, B_h, [dst, 32]
-	ldp	B_l, B_h, [src, 32]
-	stp	C_l, C_h, [dst, 48]
-	ldp	C_l, C_h, [src, 48]
-	stp	D_l, D_h, [dst, 64]!
-	ldp	D_l, D_h, [src, 64]!
-	subs	count, count, 64
-	b.hi	L(prefetch_loop64)
-	b	L(last64)
-
-L(copy_long_without_prefetch):
-
-	and	tmp1, dstin, 15
-	bic	dst, dstin, 15
-	ldp	D_l, D_h, [src]
-	sub	src, src, tmp1
-	add	count, count, tmp1	/* Count is now 16 too large.  */
-	ldp	A_l, A_h, [src, 16]
-	stp	D_l, D_h, [dstin]
-	ldp	B_l, B_h, [src, 32]
-	ldp	C_l, C_h, [src, 48]
-	ldp	D_l, D_h, [src, 64]!
-	subs	count, count, 128 + 16	/* Test and readjust count.  */
-	b.ls	L(last64)
-L(loop64):
-	stp	A_l, A_h, [dst, 16]
-	ldp	A_l, A_h, [src, 16]
-	stp	B_l, B_h, [dst, 32]
-	ldp	B_l, B_h, [src, 32]
-	stp	C_l, C_h, [dst, 48]
-	ldp	C_l, C_h, [src, 48]
-	stp	D_l, D_h, [dst, 64]!
-	ldp	D_l, D_h, [src, 64]!
-	subs	count, count, 64
-	b.hi	L(loop64)
-
-	/* Write the last full set of 64 bytes.  The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the end even if
-	   there is just 1 byte left.  */
-L(last64):
-	ldp	E_l, E_h, [srcend, -64]
-	stp	A_l, A_h, [dst, 16]
-	ldp	A_l, A_h, [srcend, -48]
-	stp	B_l, B_h, [dst, 32]
-	ldp	B_l, B_h, [srcend, -32]
-	stp	C_l, C_h, [dst, 48]
-	ldp	C_l, C_h, [srcend, -16]
-	stp	D_l, D_h, [dst, 64]
-	stp	E_l, E_h, [dstend, -64]
-	stp	A_l, A_h, [dstend, -48]
-	stp	B_l, B_h, [dstend, -32]
-	stp	C_l, C_h, [dstend, -16]
-	ret
-
-	.p2align 4
-L(move_long):
-	cbz	tmp1, 3f
-
-	add	srcend, src, count
-	add	dstend, dstin, count
-
-	/* Align dstend to 16 byte alignment so that we don't cross cache line
-	   boundaries on both loads and stores.  There are at least 96 bytes
-	   to copy, so copy 16 bytes unaligned and then align.  The loop
-	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
-
-	and	tmp1, dstend, 15
-	ldp	D_l, D_h, [srcend, -16]
-	sub	srcend, srcend, tmp1
-	sub	count, count, tmp1
-	ldp	A_l, A_h, [srcend, -16]
-	stp	D_l, D_h, [dstend, -16]
-	ldp	B_l, B_h, [srcend, -32]
-	ldp	C_l, C_h, [srcend, -48]
-	ldp	D_l, D_h, [srcend, -64]!
-	sub	dstend, dstend, tmp1
-	subs	count, count, 128
-	b.ls	2f
-
-	nop
-1:
-	stp	A_l, A_h, [dstend, -16]
-	ldp	A_l, A_h, [srcend, -16]
-	stp	B_l, B_h, [dstend, -32]
-	ldp	B_l, B_h, [srcend, -32]
-	stp	C_l, C_h, [dstend, -48]
-	ldp	C_l, C_h, [srcend, -48]
-	stp	D_l, D_h, [dstend, -64]!
-	ldp	D_l, D_h, [srcend, -64]!
-	subs	count, count, 64
-	b.hi	1b
-
-	/* Write the last full set of 64 bytes.  The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the start even if
-	   there is just 1 byte left.  */
-2:
-	ldp	G_l, G_h, [src, 48]
-	stp	A_l, A_h, [dstend, -16]
-	ldp	A_l, A_h, [src, 32]
-	stp	B_l, B_h, [dstend, -32]
-	ldp	B_l, B_h, [src, 16]
-	stp	C_l, C_h, [dstend, -48]
-	ldp	C_l, C_h, [src]
-	stp	D_l, D_h, [dstend, -64]
-	stp	G_l, G_h, [dstin, 48]
-	stp	A_l, A_h, [dstin, 32]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstin]
-3:	ret
-
-END (__memcpy_thunderx)
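One idiom from the file just removed is worth spelling out: the 0..3 byte tail is branchless with respect to the exact size. In C terms (a sketch, not the glibc code), it writes byte 0, byte count/2 and byte count-1, and lets the stores overlap so that count==1 writes the same byte three times and count==2 writes the second byte twice:

#include <stddef.h>

static void
copy_0_to_3 (unsigned char *dst, const unsigned char *src, size_t count)
{
  if (count == 0)
    return;
  size_t mid = count >> 1;	/* 0 if count==1, else 1.  */
  unsigned char a = src[0];
  unsigned char b = src[mid];
  unsigned char c = src[count - 1];
  dst[0] = a;			/* All loads happen before any store,   */
  dst[mid] = b;			/* which is what lets memmove reuse the  */
  dst[count - 1] = c;		/* same path for overlapping buffers.    */
}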
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
deleted file mode 100644
index a3d79aafcd..0000000000
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ /dev/null
@@ -1,457 +0,0 @@
-/* A Thunderx2 Optimized memcpy implementation for AARCH64.
-   Copyright (C) 2018-2024 Free Software Foundation, Inc.
-
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, unaligned accesses.
- *
- */
-
-#define dstin	x0
-#define src	x1
-#define count	x2
-#define dst	x3
-#define srcend	x4
-#define dstend	x5
-#define tmp2	x6
-#define tmp3	x7
-#define tmp3w	w7
-#define A_l	x6
-#define A_lw	w6
-#define A_h	x7
-#define A_hw	w7
-#define B_l	x8
-#define B_lw	w8
-#define B_h	x9
-#define C_l	x10
-#define C_h	x11
-#define D_l	x12
-#define D_h	x13
-#define E_l	src
-#define E_h	count
-#define F_l	srcend
-#define F_h	dst
-#define G_l	count
-#define G_h	dst
-#define tmp1	x14
-
-#define A_q	q0
-#define B_q	q1
-#define C_q	q2
-#define D_q	q3
-#define E_q	q4
-#define F_q	q5
-#define G_q	q6
-#define H_q	q7
-#define I_q	q16
-#define J_q	q17
-
-#define A_v	v0
-#define B_v	v1
-#define C_v	v2
-#define D_v	v3
-#define E_v	v4
-#define F_v	v5
-#define G_v	v6
-#define H_v	v7
-#define I_v	v16
-#define J_v	v17
-
-/* Overlapping large forward memmoves use a loop that copies backwards.
-   Otherwise memcpy is used.  Small moves branch to memcopy16 directly.
-   The longer memcpy cases fall through to the memcpy head.
-*/
-
-ENTRY (__memmove_thunderx2)
-
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
-
-	add	srcend, src, count
-	cmp	count, 16
-	b.ls	L(memcopy16)
-	sub	tmp1, dstin, src
-	cmp	count, 96
-	ccmp	tmp1, count, 2, hi
-	b.lo	L(move_long)
-
-END (__memmove_thunderx2)
-
-
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
-   medium copies of 17..96 bytes which are fully unrolled.  Large copies
-   of more than 96 bytes align the destination and use a load-and-merge
-   approach when the src and dst addresses are not evenly aligned, so
-   that the actual loads and stores are always aligned.
-   Large copies use loops processing 64 bytes per iteration for the
-   unaligned case and 128 bytes per iteration for aligned ones.
-*/
-
-#define MEMCPY_PREFETCH_LDR 640
-
-ENTRY (__memcpy_thunderx2)
-
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
-
-	add	srcend, src, count
-	cmp	count, 16
-	b.ls	L(memcopy16)
-	ldr	A_q, [src], #16
-	add	dstend, dstin, count
-	and	tmp1, src, 15
-	cmp	count, 96
-	b.hi	L(memcopy_long)
-
-	/* Medium copies: 17..96 bytes.  */
-	ldr	E_q, [srcend, -16]
-	cmp	count, 64
-	b.gt	L(memcpy_copy96)
-	cmp	count, 48
-	b.le	L(bytes_17_to_48)
-	/* 49..64 bytes */
-	ldp	B_q, C_q, [src]
-	str	E_q, [dstend, -16]
-	stp	A_q, B_q, [dstin]
-	str	C_q, [dstin, 32]
-	ret
-
-L(bytes_17_to_48):
-	/* 17..48 bytes */
-	cmp	count, 32
-	b.gt	L(bytes_32_to_48)
-	/* 17..32 bytes */
-	str	A_q, [dstin]
-	str	E_q, [dstend, -16]
-	ret
-
-L(bytes_32_to_48):
-	/* 32..48 */
-	ldr	B_q, [src]
-	str	A_q, [dstin]
-	str	E_q, [dstend, -16]
-	str	B_q, [dstin, 16]
-	ret
-
-	.p2align 4
-	/* Small copies: 0..16 bytes.  */
-L(memcopy16):
-	cmp	count, 8
-	b.lo	L(bytes_0_to_8)
-	ldr	A_l, [src]
-	ldr	A_h, [srcend, -8]
-	add	dstend, dstin, count
-	str	A_l, [dstin]
-	str	A_h, [dstend, -8]
-	ret
-	.p2align 4
-
-L(bytes_0_to_8):
-	tbz	count, 2, L(bytes_0_to_3)
-	ldr	A_lw, [src]
-	ldr	A_hw, [srcend, -4]
-	add	dstend, dstin, count
-	str	A_lw, [dstin]
-	str	A_hw, [dstend, -4]
-	ret
-
-	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
-	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
-L(bytes_0_to_3):
-	cbz	count, 1f
-	lsr	tmp1, count, 1
-	ldrb	A_lw, [src]
-	ldrb	A_hw, [srcend, -1]
-	add	dstend, dstin, count
-	ldrb	B_lw, [src, tmp1]
-	strb	B_lw, [dstin, tmp1]
-	strb	A_hw, [dstend, -1]
-	strb	A_lw, [dstin]
-1:
-	ret
-
-	.p2align 4
-
-L(memcpy_copy96):
-	/* Copying 65..96 bytes.  A_q (first 16 bytes) and
-	   E_q (last 16 bytes) are already loaded.  The size
-	   is large enough to benefit from aligned loads.  */
-	bic	src, src, 15
-	ldp	B_q, C_q, [src]
-	/* Loaded 64 bytes; the second 16-byte chunk can
-	   overlap the first chunk by tmp1 bytes.
-	   Stored 16 bytes.  */
-	sub	dst, dstin, tmp1
-	add	count, count, tmp1
-	/* The range of count being [65..96] becomes [65..111]
-	   after tmp [0..15] gets added to it,
-	   count now is <bytes-left-to-load>+48.  */
-	cmp	count, 80
-	b.gt	L(copy96_medium)
-	ldr	D_q, [src, 32]
-	stp	B_q, C_q, [dst, 16]
-	str	D_q, [dst, 48]
-	str	A_q, [dstin]
-	str	E_q, [dstend, -16]
-	ret
-
-	.p2align 4
-L(copy96_medium):
-	ldp	D_q, G_q, [src, 32]
-	cmp	count, 96
-	b.gt	L(copy96_large)
-	stp	B_q, C_q, [dst, 16]
-	stp	D_q, G_q, [dst, 48]
-	str	A_q, [dstin]
-	str	E_q, [dstend, -16]
-	ret
-
-L(copy96_large):
-	ldr	F_q, [src, 64]
-	str	B_q, [dst, 16]
-	stp	C_q, D_q, [dst, 32]
-	stp	G_q, F_q, [dst, 64]
-	str	A_q, [dstin]
-	str	E_q, [dstend, -16]
-	ret
-
-	.p2align 4
-L(memcopy_long):
-	bic	src, src, 15
-	ldp	B_q, C_q, [src], #32
-	sub	dst, dstin, tmp1
-	add	count, count, tmp1
-	add	dst, dst, 16
-	and	tmp1, dst, 15
-	ldp	D_q, E_q, [src], #32
-	str	A_q, [dstin]
-
-	/* Already loaded 64+16 bytes.  Check if at
-	   least 64 more bytes are left.  */
-	subs	count, count, 64+64+16
-	b.lt	L(loop128_exit0)
-	cmp	count, MEMCPY_PREFETCH_LDR + 64 + 32
-	b.lt	L(loop128)
-	cbnz	tmp1, L(dst_unaligned)
-	sub	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
-
-	.p2align 4
-
-L(loop128_prefetch):
-	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
-	ldp	F_q, G_q, [src], #32
-	stp	B_q, C_q, [dst], #32
-	ldp	H_q, I_q, [src], #32
-	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
-	ldp	B_q, C_q, [src], #32
-	stp	D_q, E_q, [dst], #32
-	ldp	D_q, E_q, [src], #32
-	stp	F_q, G_q, [dst], #32
-	stp	H_q, I_q, [dst], #32
-	subs	count, count, 128
-	b.ge	L(loop128_prefetch)
-
-	add	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
-	.p2align 4
-L(loop128):
-	ldp	F_q, G_q, [src], #32
-	ldp	H_q, I_q, [src], #32
-	stp	B_q, C_q, [dst], #32
-	stp	D_q, E_q, [dst], #32
-	subs	count, count, 64
-	b.lt	L(loop128_exit1)
-	ldp	B_q, C_q, [src], #32
-	ldp	D_q, E_q, [src], #32
-	stp	F_q, G_q, [dst], #32
-	stp	H_q, I_q, [dst], #32
-	subs	count, count, 64
-	b.ge	L(loop128)
-L(loop128_exit0):
-	ldp	F_q, G_q, [srcend, -64]
-	ldp	H_q, I_q, [srcend, -32]
-	stp	B_q, C_q, [dst], #32
-	stp	D_q, E_q, [dst]
-	stp	F_q, G_q, [dstend, -64]
-	stp	H_q, I_q, [dstend, -32]
-	ret
-L(loop128_exit1):
-	ldp	B_q, C_q, [srcend, -64]
-	ldp	D_q, E_q, [srcend, -32]
-	stp	F_q, G_q, [dst], #32
-	stp	H_q, I_q, [dst]
-	stp	B_q, C_q, [dstend, -64]
-	stp	D_q, E_q, [dstend, -32]
-	ret
-
-L(dst_unaligned_tail):
-	ldp	C_q, D_q, [srcend, -64]
-	ldp	E_q, F_q, [srcend, -32]
-	stp	A_q, B_q, [dst], #32
-	stp	H_q, I_q, [dst], #16
-	str	G_q, [dst, tmp1]
-	stp	C_q, D_q, [dstend, -64]
-	stp	E_q, F_q, [dstend, -32]
-	ret
-
-L(dst_unaligned):
-	/* For the unaligned store case the code loads two
-	   aligned chunks and then merges them using the ext
-	   instruction.  This can be up to 30% faster than
-	   the simple unaligned store access.
-
-	   Current state: tmp1 = dst % 16; C_q, D_q, E_q
-	   contain data yet to be stored.  src and dst point
-	   to the next-to-be-processed data.  A_q, B_q contain
-	   data already stored before, count = bytes left to
-	   be loaded, decremented by 64.
-
-	   Control is passed here if at least 64 bytes are left
-	   to be loaded.  The code does two aligned loads and then
-	   extracts (16-tmp1) bytes from the first register and
-	   tmp1 bytes from the next register, forming the value
-	   for the aligned store.
-
-	   As the ext instruction can only have its index encoded
-	   as an immediate, 15 code chunks process each possible
-	   index value.  A computed goto is used to reach the
-	   required code.  */
-
-	/* Store the 16 bytes to dst and align dst for further
-	   operations; several bytes will be stored at this
-	   address once more.  */
-
-	ldp	F_q, G_q, [src], #32
-	stp	B_q, C_q, [dst], #32
-	bic	dst, dst, 15
-	sub	count, count, 32
-	adrp	tmp2, L(ext_table)
-	add	tmp2, tmp2, :lo12:L(ext_table)
-	add	tmp2, tmp2, tmp1, LSL #2
-	ldr	tmp3w, [tmp2]
-	add	tmp2, tmp2, tmp3w, SXTW
-	br	tmp2
-
-.p2align 4
-	/* to make the loop in each chunk 16-bytes aligned */
-	nop
-#define EXT_CHUNK(shft) \
-L(ext_size_ ## shft):;\
-	ext	A_v.16b, C_v.16b, D_v.16b, 16-shft;\
-	ext	B_v.16b, D_v.16b, E_v.16b, 16-shft;\
-	ext	H_v.16b, E_v.16b, F_v.16b, 16-shft;\
-1:;\
-	stp	A_q, B_q, [dst], #32;\
-	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
-	ldp	C_q, D_q, [src], #32;\
-	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	stp	H_q, I_q, [dst], #32;\
-	ext	A_v.16b, G_v.16b, C_v.16b, 16-shft;\
-	ext	B_v.16b, C_v.16b, D_v.16b, 16-shft;\
-	ldp	F_q, G_q, [src], #32;\
-	ext	H_v.16b, D_v.16b, F_v.16b, 16-shft;\
-	subs	count, count, 64;\
-	b.ge	1b;\
-2:;\
-	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	b	L(dst_unaligned_tail);
-
-EXT_CHUNK(1)
-EXT_CHUNK(2)
-EXT_CHUNK(3)
-EXT_CHUNK(4)
-EXT_CHUNK(5)
-EXT_CHUNK(6)
-EXT_CHUNK(7)
-EXT_CHUNK(8)
-EXT_CHUNK(9)
-EXT_CHUNK(10)
-EXT_CHUNK(11)
-EXT_CHUNK(12)
-EXT_CHUNK(13)
-EXT_CHUNK(14)
-EXT_CHUNK(15)
-
-L(move_long):
-	.p2align 4
-1:
-	cbz	tmp1, 3f
-
-	add	srcend, src, count
-	add	dstend, dstin, count
-
-	and	tmp1, srcend, 15
-	ldr	D_q, [srcend, -16]
-	sub	srcend, srcend, tmp1
-	sub	count, count, tmp1
-	ldp	A_q, B_q, [srcend, -32]
-	str	D_q, [dstend, -16]
-	ldp	C_q, D_q, [srcend, -64]!
-	sub	dstend, dstend, tmp1
-	subs	count, count, 128
-	b.ls	2f
-
-	.p2align 4
-1:
-	subs	count, count, 64
-	stp	A_q, B_q, [dstend, -32]
-	ldp	A_q, B_q, [srcend, -32]
-	stp	C_q, D_q, [dstend, -64]!
-	ldp	C_q, D_q, [srcend, -64]!
-	b.hi	1b
-
-	/* Write the last full set of 64 bytes.  The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the start even if
-	   there is just 1 byte left.  */
-2:
-	ldp	E_q, F_q, [src, 32]
-	ldp	G_q, H_q, [src]
-	stp	A_q, B_q, [dstend, -32]
-	stp	C_q, D_q, [dstend, -64]
-	stp	E_q, F_q, [dstin, 32]
-	stp	G_q, H_q, [dstin]
-3:	ret
-
-
-END (__memcpy_thunderx2)
-	.section .rodata
-	.p2align 4
-
-L(ext_table):
-	/* The first entry is for the alignment of 0 and is never
-	   actually used (could be any value).  */
-	.word	0
-	.word	L(ext_size_1) -.
-	.word	L(ext_size_2) -.
-	.word	L(ext_size_3) -.
-	.word	L(ext_size_4) -.
-	.word	L(ext_size_5) -.
-	.word	L(ext_size_6) -.
-	.word	L(ext_size_7) -.
-	.word	L(ext_size_8) -.
-	.word	L(ext_size_9) -.
-	.word	L(ext_size_10) -.
-	.word	L(ext_size_11) -.
-	.word	L(ext_size_12) -.
-	.word	L(ext_size_13) -.
-	.word	L(ext_size_14) -.
-	.word	L(ext_size_15) -.
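The EXT_CHUNK machinery deleted above implements a load-and-merge copy: when the destination is 16-byte aligned but the source is offset within its 16-byte chunk, two aligned loads are combined into one aligned store instead of issuing unaligned stores. A plain-C sketch of the merge step (the NEON ext instruction performs this byte extraction in a single operation, which is why the assembly needs one code chunk per possible immediate):

#include <string.h>

/* Build one aligned 16-byte store from two adjacent aligned source
   chunks, where the wanted data starts `shift` (1..15) bytes into the
   first chunk.  Sketch only; the real code keeps everything in vector
   registers.  */
static void
merge_store (unsigned char *dst16,	     /* 16-byte aligned dst */
	     const unsigned char *lo16,	     /* aligned chunk i */
	     const unsigned char *hi16,	     /* aligned chunk i+1 */
	     unsigned shift)		     /* src % 16 */
{
  unsigned char merged[16];
  memcpy (merged, lo16 + shift, 16 - shift);	/* tail of chunk i */
  memcpy (merged + (16 - shift), hi16, shift);	/* head of chunk i+1 */
  memcpy (dst16, merged, 16);			/* single aligned store */
}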
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index fe95037be3..106011acec 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -29,8 +29,6 @@
 extern __typeof (__redirect_memmove) __libc_memmove;
 
 extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
-extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
-extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_sve attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_mops attribute_hidden;
@@ -50,12 +48,6 @@ select_memmove_ifunc (void)
       return prefer_sve_ifuncs ? __memmove_sve : __memmove_generic;
     }
 
-  if (IS_THUNDERX (midr))
-    return __memmove_thunderx;
-
-  if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
-    return __memmove_thunderx2;
-
   return __memmove_generic;
 }
 
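Both deleted memmove entry points guarded the fall-through into memcpy with the same sub/cmp/ccmp/b.lo sequence. Written out in C (a sketch of the logic, not glibc code), the predicate is: a forward copy is safe when the destination is not inside the source's next `count` bytes, evaluated with an unsigned wraparound subtraction, and small/medium sizes are safe regardless because they load all data before storing any:

#include <stddef.h>
#include <stdint.h>

static int
forward_copy_is_safe (const void *dst, const void *src, size_t count)
{
  /* Unsigned difference; wraps modulo 2^64 when dst < src, so every
     backwards overlap (dst below src) also counts as safe.  */
  uintptr_t diff = (uintptr_t) dst - (uintptr_t) src;
  return count <= 96 || diff >= count;
}

Only when this predicate is false did the deleted code take the L(move_long) backwards-copy path.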