mirror of
https://sourceware.org/git/glibc.git
synced 2025-01-07 10:00:07 +00:00
f5bff979d0
Just a few small QOL changes. 1. Prefer `add` > `lea` as it has high execution units it can run on. 2. Don't break macro-fusion between `test` and `jcc` geometric_mean(N=20) of all benchmarks New / Original: 0.973 All string/memory tests pass. Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
552 lines
9.4 KiB
ArmAsm
552 lines
9.4 KiB
ArmAsm
/* wcscpy with SSSE3
|
|
Copyright (C) 2011-2022 Free Software Foundation, Inc.
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library; if not, see
|
|
<https://www.gnu.org/licenses/>. */
|
|
|
|
#if IS_IN (libc)
|
|
# include <sysdep.h>
|
|
|
|
.section .text.ssse3,"ax",@progbits
|
|
ENTRY (__wcscpy_ssse3)
|
|
|
|
mov %rsi, %rcx
|
|
mov %rdi, %rdx
|
|
|
|
cmpl $0, (%rcx)
|
|
jz L(Exit4)
|
|
cmpl $0, 4(%rcx)
|
|
jz L(Exit8)
|
|
cmpl $0, 8(%rcx)
|
|
jz L(Exit12)
|
|
cmpl $0, 12(%rcx)
|
|
jz L(Exit16)
|
|
|
|
lea 16(%rcx), %rsi
|
|
and $-16, %rsi
|
|
|
|
pxor %xmm0, %xmm0
|
|
mov (%rcx), %r9
|
|
mov %r9, (%rdx)
|
|
|
|
pcmpeqd (%rsi), %xmm0
|
|
mov 8(%rcx), %r9
|
|
mov %r9, 8(%rdx)
|
|
|
|
pmovmskb %xmm0, %rax
|
|
sub %rcx, %rsi
|
|
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
mov %rdx, %rax
|
|
addq $16, %rdx
|
|
and $-16, %rdx
|
|
sub %rdx, %rax
|
|
sub %rax, %rcx
|
|
mov %rcx, %rax
|
|
and $0xf, %rax
|
|
mov $0, %rsi
|
|
|
|
/* case: rcx_offset == rdx_offset */
|
|
|
|
jz L(Align16Both)
|
|
|
|
cmp $4, %rax
|
|
je L(Shl4)
|
|
cmp $8, %rax
|
|
je L(Shl8)
|
|
jmp L(Shl12)
|
|
|
|
L(Align16Both):
|
|
movaps (%rcx), %xmm1
|
|
movaps 16(%rcx), %xmm2
|
|
movaps %xmm1, (%rdx)
|
|
pcmpeqd %xmm2, %xmm0
|
|
pmovmskb %xmm0, %eax
|
|
addq $16, %rsi
|
|
|
|
test %eax, %eax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
movaps 16(%rcx, %rsi), %xmm3
|
|
movaps %xmm2, (%rdx, %rsi)
|
|
pcmpeqd %xmm3, %xmm0
|
|
pmovmskb %xmm0, %eax
|
|
addq $16, %rsi
|
|
|
|
test %eax, %eax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
movaps 16(%rcx, %rsi), %xmm4
|
|
movaps %xmm3, (%rdx, %rsi)
|
|
pcmpeqd %xmm4, %xmm0
|
|
pmovmskb %xmm0, %eax
|
|
addq $16, %rsi
|
|
|
|
test %eax, %eax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
movaps 16(%rcx, %rsi), %xmm1
|
|
movaps %xmm4, (%rdx, %rsi)
|
|
pcmpeqd %xmm1, %xmm0
|
|
pmovmskb %xmm0, %eax
|
|
addq $16, %rsi
|
|
|
|
test %eax, %eax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
movaps 16(%rcx, %rsi), %xmm2
|
|
movaps %xmm1, (%rdx, %rsi)
|
|
pcmpeqd %xmm2, %xmm0
|
|
pmovmskb %xmm0, %eax
|
|
addq $16, %rsi
|
|
|
|
test %eax, %eax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
movaps 16(%rcx, %rsi), %xmm3
|
|
movaps %xmm2, (%rdx, %rsi)
|
|
pcmpeqd %xmm3, %xmm0
|
|
pmovmskb %xmm0, %eax
|
|
addq $16, %rsi
|
|
|
|
test %eax, %eax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
movaps %xmm3, (%rdx, %rsi)
|
|
mov %rcx, %rax
|
|
lea 16(%rcx, %rsi), %rcx
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
sub %rax, %rdx
|
|
|
|
mov $-0x40, %rsi
|
|
|
|
.p2align 4
|
|
L(Aligned64Loop):
|
|
movaps (%rcx), %xmm2
|
|
movaps %xmm2, %xmm4
|
|
movaps 16(%rcx), %xmm5
|
|
movaps 32(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 48(%rcx), %xmm7
|
|
pminub %xmm5, %xmm2
|
|
pminub %xmm7, %xmm3
|
|
pminub %xmm2, %xmm3
|
|
pcmpeqd %xmm0, %xmm3
|
|
pmovmskb %xmm3, %eax
|
|
addq $64, %rdx
|
|
addq $64, %rcx
|
|
testl %eax, %eax
|
|
jnz L(Aligned64Leave)
|
|
movaps %xmm4, -64(%rdx)
|
|
movaps %xmm5, -48(%rdx)
|
|
movaps %xmm6, -32(%rdx)
|
|
movaps %xmm7, -16(%rdx)
|
|
jmp L(Aligned64Loop)
|
|
|
|
L(Aligned64Leave):
|
|
pcmpeqd %xmm4, %xmm0
|
|
pmovmskb %xmm0, %eax
|
|
test %eax, %eax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
pcmpeqd %xmm5, %xmm0
|
|
|
|
pmovmskb %xmm0, %eax
|
|
movaps %xmm4, -64(%rdx)
|
|
addq $16, %rsi
|
|
test %eax, %eax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
pcmpeqd %xmm6, %xmm0
|
|
|
|
pmovmskb %xmm0, %eax
|
|
movaps %xmm5, -48(%rdx)
|
|
addq $16, %rsi
|
|
test %eax, %eax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
movaps %xmm6, -32(%rdx)
|
|
pcmpeqd %xmm7, %xmm0
|
|
|
|
pmovmskb %xmm0, %eax
|
|
addq $16, %rsi
|
|
test %eax, %eax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
mov $-0x40, %rsi
|
|
movaps %xmm7, -16(%rdx)
|
|
jmp L(Aligned64Loop)
|
|
|
|
.p2align 4
|
|
L(Shl4):
|
|
movaps -4(%rcx), %xmm1
|
|
movaps 12(%rcx), %xmm2
|
|
L(Shl4Start):
|
|
pcmpeqd %xmm2, %xmm0
|
|
pmovmskb %xmm0, %eax
|
|
movaps %xmm2, %xmm3
|
|
|
|
test %eax, %eax
|
|
jnz L(Shl4LoopExit)
|
|
|
|
palignr $4, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 28(%rcx), %xmm2
|
|
|
|
pcmpeqd %xmm2, %xmm0
|
|
addq $16, %rdx
|
|
pmovmskb %xmm0, %eax
|
|
addq $16, %rcx
|
|
movaps %xmm2, %xmm1
|
|
|
|
test %eax, %eax
|
|
jnz L(Shl4LoopExit)
|
|
|
|
palignr $4, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 28(%rcx), %xmm2
|
|
|
|
pcmpeqd %xmm2, %xmm0
|
|
addq $16, %rdx
|
|
pmovmskb %xmm0, %eax
|
|
addq $16, %rcx
|
|
movaps %xmm2, %xmm3
|
|
|
|
test %eax, %eax
|
|
jnz L(Shl4LoopExit)
|
|
|
|
palignr $4, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 28(%rcx), %xmm2
|
|
|
|
pcmpeqd %xmm2, %xmm0
|
|
addq $16, %rdx
|
|
pmovmskb %xmm0, %eax
|
|
addq $16, %rcx
|
|
|
|
test %eax, %eax
|
|
jnz L(Shl4LoopExit)
|
|
|
|
palignr $4, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
addq $28, %rcx
|
|
addq $16, %rdx
|
|
|
|
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
addq $-12, %rcx
|
|
sub %rax, %rdx
|
|
|
|
movaps -4(%rcx), %xmm1
|
|
|
|
.p2align 4
|
|
L(Shl4LoopStart):
|
|
movaps 12(%rcx), %xmm2
|
|
movaps 28(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 44(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 60(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqd %xmm0, %xmm7
|
|
pmovmskb %xmm7, %eax
|
|
movaps %xmm5, %xmm7
|
|
palignr $4, %xmm4, %xmm5
|
|
palignr $4, %xmm3, %xmm4
|
|
test %eax, %eax
|
|
jnz L(Shl4Start)
|
|
|
|
palignr $4, %xmm2, %xmm3
|
|
addq $64, %rcx
|
|
palignr $4, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
addq $64, %rdx
|
|
jmp L(Shl4LoopStart)
|
|
|
|
L(Shl4LoopExit):
|
|
movdqu -4(%rcx), %xmm1
|
|
mov $12, %rsi
|
|
movdqu %xmm1, -4(%rdx)
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
.p2align 4
|
|
L(Shl8):
|
|
movaps -8(%rcx), %xmm1
|
|
movaps 8(%rcx), %xmm2
|
|
L(Shl8Start):
|
|
pcmpeqd %xmm2, %xmm0
|
|
pmovmskb %xmm0, %eax
|
|
movaps %xmm2, %xmm3
|
|
|
|
test %eax, %eax
|
|
jnz L(Shl8LoopExit)
|
|
|
|
palignr $8, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 24(%rcx), %xmm2
|
|
|
|
pcmpeqd %xmm2, %xmm0
|
|
addq $16, %rdx
|
|
pmovmskb %xmm0, %eax
|
|
addq $16, %rcx
|
|
movaps %xmm2, %xmm1
|
|
|
|
test %eax, %eax
|
|
jnz L(Shl8LoopExit)
|
|
|
|
palignr $8, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 24(%rcx), %xmm2
|
|
|
|
pcmpeqd %xmm2, %xmm0
|
|
addq $16, %rdx
|
|
pmovmskb %xmm0, %eax
|
|
addq $16, %rcx
|
|
movaps %xmm2, %xmm3
|
|
|
|
test %eax, %eax
|
|
jnz L(Shl8LoopExit)
|
|
|
|
palignr $8, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 24(%rcx), %xmm2
|
|
|
|
pcmpeqd %xmm2, %xmm0
|
|
addq $16, %rdx
|
|
pmovmskb %xmm0, %eax
|
|
addq $16, %rcx
|
|
|
|
test %eax, %eax
|
|
jnz L(Shl8LoopExit)
|
|
|
|
palignr $8, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
addq $24, %rcx
|
|
addq $16, %rdx
|
|
|
|
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
addq $-8, %rcx
|
|
sub %rax, %rdx
|
|
|
|
movaps -8(%rcx), %xmm1
|
|
|
|
.p2align 4
|
|
L(Shl8LoopStart):
|
|
movaps 8(%rcx), %xmm2
|
|
movaps 24(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 40(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 56(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqd %xmm0, %xmm7
|
|
pmovmskb %xmm7, %eax
|
|
movaps %xmm5, %xmm7
|
|
palignr $8, %xmm4, %xmm5
|
|
palignr $8, %xmm3, %xmm4
|
|
test %eax, %eax
|
|
jnz L(Shl8Start)
|
|
|
|
palignr $8, %xmm2, %xmm3
|
|
addq $64, %rcx
|
|
palignr $8, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
addq $64, %rdx
|
|
jmp L(Shl8LoopStart)
|
|
|
|
L(Shl8LoopExit):
|
|
mov (%rcx), %r9
|
|
mov $8, %rsi
|
|
mov %r9, (%rdx)
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
.p2align 4
|
|
L(Shl12):
|
|
movaps -12(%rcx), %xmm1
|
|
movaps 4(%rcx), %xmm2
|
|
L(Shl12Start):
|
|
pcmpeqd %xmm2, %xmm0
|
|
pmovmskb %xmm0, %eax
|
|
movaps %xmm2, %xmm3
|
|
|
|
test %eax, %eax
|
|
jnz L(Shl12LoopExit)
|
|
|
|
palignr $12, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 20(%rcx), %xmm2
|
|
|
|
pcmpeqd %xmm2, %xmm0
|
|
addq $16, %rdx
|
|
pmovmskb %xmm0, %eax
|
|
addq $16, %rcx
|
|
movaps %xmm2, %xmm1
|
|
|
|
test %eax, %eax
|
|
jnz L(Shl12LoopExit)
|
|
|
|
palignr $12, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 20(%rcx), %xmm2
|
|
|
|
pcmpeqd %xmm2, %xmm0
|
|
addq $16, %rdx
|
|
pmovmskb %xmm0, %eax
|
|
addq $16, %rcx
|
|
movaps %xmm2, %xmm3
|
|
|
|
test %eax, %eax
|
|
jnz L(Shl12LoopExit)
|
|
|
|
palignr $12, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 20(%rcx), %xmm2
|
|
|
|
pcmpeqd %xmm2, %xmm0
|
|
addq $16, %rdx
|
|
pmovmskb %xmm0, %eax
|
|
addq $16, %rcx
|
|
|
|
test %eax, %eax
|
|
jnz L(Shl12LoopExit)
|
|
|
|
palignr $12, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
addq $20, %rcx
|
|
addq $16, %rdx
|
|
|
|
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
addq $-4, %rcx
|
|
sub %rax, %rdx
|
|
|
|
movaps -12(%rcx), %xmm1
|
|
|
|
.p2align 4
|
|
L(Shl12LoopStart):
|
|
movaps 4(%rcx), %xmm2
|
|
movaps 20(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 36(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 52(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqd %xmm0, %xmm7
|
|
pmovmskb %xmm7, %eax
|
|
movaps %xmm5, %xmm7
|
|
palignr $12, %xmm4, %xmm5
|
|
palignr $12, %xmm3, %xmm4
|
|
test %eax, %eax
|
|
jnz L(Shl12Start)
|
|
palignr $12, %xmm2, %xmm3
|
|
addq $64, %rcx
|
|
palignr $12, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
addq $64, %rdx
|
|
jmp L(Shl12LoopStart)
|
|
|
|
L(Shl12LoopExit):
|
|
mov (%rcx), %r9d
|
|
mov $4, %rsi
|
|
mov %r9d, (%rdx)
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
.p2align 4
|
|
L(CopyFrom1To16Bytes):
|
|
add %rsi, %rdx
|
|
add %rsi, %rcx
|
|
|
|
test %al, %al
|
|
jz L(ExitHigh)
|
|
test $0x01, %al
|
|
jnz L(Exit4)
|
|
|
|
mov (%rcx), %rax
|
|
mov %rax, (%rdx)
|
|
mov %rdi, %rax
|
|
ret
|
|
|
|
.p2align 4
|
|
L(ExitHigh):
|
|
test $0x01, %ah
|
|
jnz L(Exit12)
|
|
|
|
mov (%rcx), %rax
|
|
mov %rax, (%rdx)
|
|
mov 8(%rcx), %rax
|
|
mov %rax, 8(%rdx)
|
|
mov %rdi, %rax
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Exit4):
|
|
movl (%rcx), %eax
|
|
movl %eax, (%rdx)
|
|
mov %rdi, %rax
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Exit8):
|
|
mov (%rcx), %rax
|
|
mov %rax, (%rdx)
|
|
mov %rdi, %rax
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Exit12):
|
|
mov (%rcx), %rax
|
|
mov %rax, (%rdx)
|
|
mov 8(%rcx), %eax
|
|
mov %eax, 8(%rdx)
|
|
mov %rdi, %rax
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Exit16):
|
|
mov (%rcx), %rax
|
|
mov %rax, (%rdx)
|
|
mov 8(%rcx), %rax
|
|
mov %rax, 8(%rdx)
|
|
mov %rdi, %rax
|
|
ret
|
|
|
|
END(__wcscpy_ssse3)
|
|
#endif
|