glibc/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S
Noah Goldstein f049f52dfe x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
Optimizations are:
    1. Use more overlapping stores to avoid branches.
    2. Reduce how unrolled the aligning copies are (this is more of a
       code-size save; it's a negative for some sizes in terms of
       perf).
    3. Improve the loop a bit (similar to what we do in strlen with
       2x vpminu + kortest instead of 3x vpminu + kmov + test).
    4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
       number that are taken.

Performance Changes:

    Times are from N = 10 runs of the benchmark suite and are
    reported as geometric mean of all ratios of
    New Implementation / Old Implementation.

    stpcpy-evex      -> 0.922
    strcat-evex      -> 0.985
    strcpy-evex      -> 0.880

    strncpy-evex     -> 0.831
    stpncpy-evex     -> 0.780

    strncat-evex     -> 0.958

Code Size Changes:
    function         -> Bytes New / Bytes Old -> Ratio

    strcat-evex      ->  819 / 1874 -> 0.437
    strcpy-evex      ->  700 / 1074 -> 0.652
    stpcpy-evex      ->  735 / 1094 -> 0.672

    strncpy-evex     -> 1397 / 2611 -> 0.535
    stpncpy-evex     -> 1489 / 2691 -> 0.553

    strncat-evex     -> 1184 / 2832 -> 0.418

Notes:
    1. Because of the significant difference between the
       implementations they are split into three files.

           strcpy-evex.S    -> strcpy, stpcpy, strcat
           strncpy-evex.S   -> strncpy
           strncat-evex.S   -> strncat

       I couldn't find a way to merge them without making the
       ifdefs incredibly difficult to follow.

    2. All implementations can be made evex512 by including
       "x86-evex512-vecs.h" at the top.

    3. All implementations have an optional define:
        `USE_EVEX_MASKED_STORE`
       Setting it to one uses evex-masked stores for handling short
       strings.  This saves code size and branches.  It's disabled
       for all implementations at the moment, as there are some
       serious drawbacks to masked stores in certain cases, but
       that may be fixed on future architectures.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
2022-11-08 19:22:33 -08:00

111 lines
2.7 KiB
ArmAsm

/* strlen used for beginning of str{n}cat using EVEX 256/512.
Copyright (C) 2011-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* NOTE: This file is meant to be included by strcat-evex or
strncat-evex and does not standalone. Before including %rdi
must be saved in %rax. */
/* Simple strlen implementation that ends at
L(strcat_strlen_done). */
vpxorq %VZERO_128, %VZERO_128, %VZERO_128
movq %rdi, %r8
andq $(VEC_SIZE * -1), %r8
VPCMPEQ (%r8), %VZERO, %k0
KMOV %k0, %VRCX
#ifdef USE_AS_WCSCPY
subl %r8d, %edi
shrl $2, %edi
#endif
shrx %VRDI, %VRCX, %VRCX
#ifdef USE_AS_WCSCPY
movq %rax, %rdi
#endif
test %VRCX, %VRCX
jnz L(bsf_and_done_v0)
VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0
KMOV %k0, %VRCX
leaq (VEC_SIZE)(%r8), %rdi
test %VRCX, %VRCX
jnz L(bsf_and_done_v0)
VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(bsf_and_done_v1)
VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(bsf_and_done_v2)
VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(bsf_and_done_v3)
andq $-(VEC_SIZE * 4), %rdi
.p2align 4,, 8
L(loop_2x_vec):
VMOVA (VEC_SIZE * 4)(%rdi), %VMM(0)
VPMIN (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
VMOVA (VEC_SIZE * 6)(%rdi), %VMM(2)
VPMIN (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
VPTESTN %VMM(1), %VMM(1), %k1
VPTESTN %VMM(3), %VMM(3), %k3
subq $(VEC_SIZE * -4), %rdi
KORTEST %k1, %k3
jz L(loop_2x_vec)
VPTESTN %VMM(0), %VMM(0), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(bsf_and_done_v0)
KMOV %k1, %VRCX
test %VRCX, %VRCX
jnz L(bsf_and_done_v1)
VPTESTN %VMM(2), %VMM(2), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(bsf_and_done_v2)
KMOV %k3, %VRCX
L(bsf_and_done_v3):
addq $VEC_SIZE, %rdi
L(bsf_and_done_v2):
bsf %VRCX, %VRCX
leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
jmp L(strcat_strlen_done)
.p2align 4,, 4
L(bsf_and_done_v1):
addq $VEC_SIZE, %rdi
L(bsf_and_done_v0):
bsf %VRCX, %VRCX
#ifdef USE_AS_WCSCPY
leaq (%rdi, %rcx, CHAR_SIZE), %rdi
#else
addq %rcx, %rdi
#endif
L(strcat_strlen_done):