glibc/sysdeps/i386/i686/multiarch/strcat-sse2.S
2012-02-09 23:18:22 +00:00

1244 lines
24 KiB
ArmAsm

/* strcat with SSE2
Copyright (C) 2011-2012 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#ifndef NOT_IN_libc
# include <sysdep.h>
/* Unwind-annotation helpers built on the cfi_* macros that <sysdep.h>
   is expected to provide.
   CFI_PUSH (REG): note that 4 more bytes are on the stack and that REG
   was saved at offset 0 from the current stack pointer.
   CFI_POP (REG): note that those 4 bytes are gone and REG holds its
   original value again.  */
# define CFI_PUSH(REG) \
cfi_adjust_cfa_offset (4); \
cfi_rel_offset (REG, 0)
# define CFI_POP(REG) \
cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
/* Push/pop a 32-bit register and emit the matching unwind annotation.  */
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)
# ifdef SHARED
/* Position-independent build: a table entry is the offset of label I
   relative to the table base B.  */
# define JMPTBL(I, B) I - B
/* Load an entry in a jump table into ECX and branch to it. TABLE is a
jump table with relative offsets. INDEX is a register that contains the
index into the jump table. SCALE is the scale of INDEX.
Note that this variant overwrites ECX. */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
/* We first load PC into ECX. */ \
SETUP_PIC_REG(cx); \
/* Get the address of the jump table. */ \
addl $(TABLE - .), %ecx; \
/* Get the entry and convert the relative offset to the \
absolute address. */ \
addl (%ecx,INDEX,SCALE), %ecx; \
/* We loaded the jump table entry and adjusted ECX. Go. */ \
jmp *%ecx
# else
/* Non-PIC build: a table entry is simply the address of label I.  */
# define JMPTBL(I, B) I
/* Branch to an entry in a jump table. TABLE is a jump table with
absolute offsets. INDEX is a register that contains the index into the
jump table. SCALE is the scale of INDEX. */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
jmp *TABLE(,INDEX,SCALE)
# endif
/* Default symbol name; a file that includes this one may predefine
   STRCAT (and USE_AS_STRNCAT) to build another variant.  */
# ifndef STRCAT
# define STRCAT __strcat_sse2
# endif
/* Stack offsets of the arguments, relative to %esp at the point where
   each one is used:
   PARMS  skips the return address.
   STR1   first argument, read after %esi has been pushed.
   STR2   second argument, read after %esi has been pushed.
   LEN    third argument (strncat only), read after %ebx has also been
          pushed, hence the extra 4 bytes.
   STR3   first argument as seen from the exit stubs, which reload it
          as the return value; with USE_AS_STRNCAT two registers are on
          the stack there, otherwise one.  */
# define PARMS 4
# define STR1 PARMS+4
# define STR2 STR1+4
# ifdef USE_AS_STRNCAT
# define LEN STR2+8
# define STR3 STR1+4
# else
# define STR3 STR1
# endif
/* Defined unconditionally; nothing in this file tests it, so it is
   presumably meant for code that shares these macros -- TODO confirm.  */
# define USE_AS_STRCAT
/* RETURN: restore the callee-saved registers pushed in the prologue and
   return.  The CFI_PUSH annotations after the `ret' emit no code; they
   re-establish the unwind state for the instructions that follow the
   macro, which are reached by jumps with the registers still pushed.
   They must therefore be replayed in the prologue's push order
   (%esi first, then %ebx for strncat) so that each register is
   described at the stack slot it really occupies.  */
# ifdef USE_AS_STRNCAT
# define RETURN POP(%ebx); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%ebx);
# else
# define RETURN POP(%esi); ret; CFI_PUSH(%esi);
# endif
.text
/* Entry point.  Arguments are taken from the stack: destination,
   source and, for the strncat variant, the maximum number of source
   bytes to append.
   Register roles used throughout the function:
     %eax  destination cursor (first the scan for the end of the
           destination string, then the write position)
     %esi  source cursor
     %ebx  remaining length budget (strncat variant only)
     %ecx, %edx, %xmm0-%xmm7  scratch
   %esi (and %ebx for strncat) are saved here and restored by RETURN.  */
ENTRY (STRCAT)
PUSH (%esi)
mov STR1(%esp), %eax /* eax = destination */
mov STR2(%esp), %esi /* esi = source */
# ifdef USE_AS_STRNCAT
PUSH (%ebx)
movl LEN(%esp), %ebx /* ebx = length limit */
test %ebx, %ebx
jz L(ExitZero) /* n == 0: nothing to append */
# endif
cmpb $0, (%esi)
/* The two moves below do not alter the flags set by the cmpb.  */
mov %esi, %ecx
mov %eax, %edx
jz L(ExitZero) /* empty source: nothing to append */
/* ecx/edx = offsets of source/destination inside their 64-byte block.  */
and $63, %ecx
and $63, %edx
/* With a source offset above 32, the 32 bytes starting at the source
   would run past the end of its 64-byte block, so take the path that
   only uses aligned source loads.  */
cmp $32, %ecx
ja L(StrlenCore7_1)
/* Likewise an unaligned 16-byte load from the destination is only done
   when it stays inside the destination's 64-byte block.  */
cmp $48, %edx
ja L(alignment_prolog)
pxor %xmm0, %xmm0
pxor %xmm4, %xmm4
pxor %xmm7, %xmm7
movdqu (%eax), %xmm1 /* first 16 destination bytes */
movdqu (%esi), %xmm5 /* source bytes 0..15 */
pcmpeqb %xmm1, %xmm0 /* xmm0 = NUL mask of destination bytes */
movdqu 16(%esi), %xmm6 /* source bytes 16..31 */
pmovmskb %xmm0, %ecx
pcmpeqb %xmm5, %xmm4 /* xmm4 = NUL mask of source bytes 0..15 */
pcmpeqb %xmm6, %xmm7 /* xmm7 = NUL mask of source bytes 16..31 */
test %ecx, %ecx
jnz L(exit_less16_) /* destination ends within its first 16 bytes */
/* No NUL found, so xmm0 is all-zero again, as the scan loop requires.  */
mov %eax, %ecx
and $-16, %eax /* continue the scan from the aligned address */
jmp L(loop_prolog)
/* Destination scan prologue for a destination close to the end of its
   64-byte block: check the aligned 16-byte block that contains the
   start of the destination and discard the mask bits belonging to
   bytes before the string.  The first 32 source bytes are loaded and
   compared against zero here as well (xmm5/xmm6 data, xmm4/xmm7 NUL
   masks) for use by L(StartStrcpyPart).  Falls through into
   L(loop_prolog).  */
L(alignment_prolog):
pxor %xmm0, %xmm0
pxor %xmm4, %xmm4
mov %edx, %ecx
pxor %xmm7, %xmm7
and $15, %ecx /* ecx = destination offset within its 16-byte block */
and $-16, %eax /* eax = destination rounded down to 16 */
pcmpeqb (%eax), %xmm0
movdqu (%esi), %xmm5
movdqu 16(%esi), %xmm6
pmovmskb %xmm0, %edx
pcmpeqb %xmm5, %xmm4
shr %cl, %edx /* drop bits for bytes in front of the string */
pcmpeqb %xmm6, %xmm7
test %edx, %edx
jnz L(exit_less16)
add %eax, %ecx /* ecx = original destination address */
/* xmm0 may hold matches for bytes before the string; clear it so the
   scan loop starts from an all-zero register.  */
pxor %xmm0, %xmm0
/* Scan the destination for its terminating NUL, 64 bytes per
   iteration, using aligned 16-byte compares.
   On entry: eax is 16-byte aligned, the block at (%eax) has already
   been checked, and xmm0 is all-zero.  Each xmm register used as a
   compare destination stays all-zero as long as no NUL is found, so
   the registers need no re-clearing between iterations.
   On exit to L(StartStrcpyPart): eax = address of the destination's
   terminating NUL.  */
L(loop_prolog):
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
.p2align 4
L(align16_loop):
pcmpeqb 16(%eax), %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit16)
pcmpeqb 32(%eax), %xmm1
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit32)
pcmpeqb 48(%eax), %xmm2
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit48)
pcmpeqb 64(%eax), %xmm3
pmovmskb %xmm3, %edx
lea 64(%eax), %eax /* advance; lea leaves the flags alone */
test %edx, %edx
jz L(align16_loop)
/* NUL found in the block now at (%eax).  */
bsf %edx, %edx
add %edx, %eax
jmp L(StartStrcpyPart)
/* Exits of the destination scan.  In each case edx (ecx for
   L(exit_less16_)) is the NUL bit mask of one 16-byte block; bsf turns
   it into the byte index of the first NUL, and eax is set to the
   address of that NUL before continuing at L(StartStrcpyPart).  */
.p2align 4
L(exit16):
bsf %edx, %edx
lea 16(%eax, %edx), %eax
jmp L(StartStrcpyPart)
.p2align 4
L(exit32):
bsf %edx, %edx
lea 32(%eax, %edx), %eax
jmp L(StartStrcpyPart)
.p2align 4
L(exit48):
bsf %edx, %edx
lea 48(%eax, %edx), %eax
jmp L(StartStrcpyPart)
.p2align 4
/* From L(alignment_prolog): eax is the aligned base, ecx the offset of
   the destination within that block, and the mask was already shifted
   right by ecx, so the index is relative to the destination start.  */
L(exit_less16):
bsf %edx, %edx
add %ecx, %eax
add %edx, %eax
jmp L(StartStrcpyPart)
.p2align 4
/* From the entry fast path: eax is still the unaligned destination and
   the mask is in ecx.  Continues into L(StartStrcpyPart), which follows
   directly.  */
L(exit_less16_):
bsf %ecx, %ecx
add %ecx, %eax
/* Start of the copy phase when the first 32 source bytes have already
   been loaded without crossing a 64-byte boundary.
   On entry: eax = address of the destination's terminating NUL,
   esi = source, xmm5 = source bytes 0..15, xmm4 = their NUL mask,
   xmm7 = NUL mask of source bytes 16..31, ebx = length limit (strncat
   only).
   Short strings are finished through the exit tables; otherwise the
   source is rounded down to a 16-byte boundary, the destination cursor
   is moved back by the same amount, and L(Unalign16Both) continues
   with aligned loads.  */
.p2align 4
L(StartStrcpyPart):
pmovmskb %xmm4, %edx
# ifdef USE_AS_STRNCAT
cmp $16, %ebx
jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
# endif
test %edx, %edx
jnz L(CopyFrom1To16BytesTail1)
movdqu %xmm5, (%eax) /* copy source bytes 0..15 */
pmovmskb %xmm7, %edx
# ifdef USE_AS_STRNCAT
cmp $32, %ebx
jbe L(CopyFrom1To32Bytes1Case2OrCase3)
# endif
test %edx, %edx
jnz L(CopyFrom1To32Bytes1)
mov %esi, %ecx
and $-16, %esi /* esi = source rounded down to 16 */
and $15, %ecx /* ecx = source misalignment */
pxor %xmm0, %xmm0
# ifdef USE_AS_STRNCAT
/* The length budget is now counted from the aligned source address.
   Saturate the addition at 0xffffffff: a limit close to SIZE_MAX would
   otherwise wrap to a small value and be mistaken for an exhausted
   budget.  edx is zero here (no NUL was found above) and is recomputed
   at L(Unalign16Both) before its next use, so it is free to hold the
   carry mask.  */
add %ecx, %ebx
sbb %edx, %edx /* edx = -1 if the addition carried, else 0 */
or %edx, %ebx
# endif
sub %ecx, %eax /* keep eax in step with the aligned esi */
jmp L(Unalign16Both)
/* Destination scan used when the source lies in the last part of its
   64-byte block (offset above 32).  Only the destination is examined
   here; no source bytes are preloaded.  The first aligned block is
   checked with the bits for bytes before the string shifted out, then
   the scan proceeds 64 bytes per iteration.  All exits continue at
   L(StartStrcpyPart_1) with eax = address of the destination's
   terminating NUL.  */
L(StrlenCore7_1):
mov %eax, %ecx
pxor %xmm0, %xmm0
and $15, %ecx /* ecx = destination offset within its 16-byte block */
and $-16, %eax /* eax = destination rounded down to 16 */
pcmpeqb (%eax), %xmm0
pmovmskb %xmm0, %edx
shr %cl, %edx /* drop bits for bytes in front of the string */
test %edx, %edx
jnz L(exit_less16_1)
add %eax, %ecx
/* Compare registers must be all-zero when the loop starts; they stay
   zero for as long as no NUL is found.  */
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
.p2align 4
L(align16_loop_1):
pcmpeqb 16(%eax), %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit16_1)
pcmpeqb 32(%eax), %xmm1
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit32_1)
pcmpeqb 48(%eax), %xmm2
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit48_1)
pcmpeqb 64(%eax), %xmm3
pmovmskb %xmm3, %edx
lea 64(%eax), %eax /* advance; lea leaves the flags alone */
test %edx, %edx
jz L(align16_loop_1)
/* NUL found in the block now at (%eax).  */
bsf %edx, %edx
add %edx, %eax
jmp L(StartStrcpyPart_1)
/* Exits of the L(StrlenCore7_1) scan.  edx is the NUL bit mask of one
   16-byte block; bsf converts it to the index of the first NUL and eax
   is set to the address of that NUL.  */
.p2align 4
L(exit16_1):
bsf %edx, %edx
lea 16(%eax, %edx), %eax
jmp L(StartStrcpyPart_1)
.p2align 4
L(exit32_1):
bsf %edx, %edx
lea 32(%eax, %edx), %eax
jmp L(StartStrcpyPart_1)
.p2align 4
L(exit48_1):
bsf %edx, %edx
lea 48(%eax, %edx), %eax
jmp L(StartStrcpyPart_1)
.p2align 4
/* NUL inside the first aligned block: eax is the aligned base, ecx the
   destination offset inside it, and the mask was shifted right by ecx.
   Continues into L(StartStrcpyPart_1), which follows directly.  */
L(exit_less16_1):
bsf %edx, %edx
add %ecx, %eax
add %edx, %eax
/* Start of the copy phase when no source bytes have been preloaded.
   On entry: eax = address of the destination's terminating NUL,
   esi = source, ebx = length limit (strncat only).
   The source is rounded down to 16 bytes and inspected with aligned
   compares; ecx keeps the misalignment.  For strncat, limits above 48
   are handled by L(BigN), which does not add ecx to ebx until 48 has
   been subtracted; here ebx is at most 48, so ebx + ecx cannot wrap.
   Falls through into L(Unalign16Both).  */
.p2align 4
L(StartStrcpyPart_1):
mov %esi, %ecx
and $15, %ecx /* ecx = source misalignment */
and $-16, %esi /* esi = source rounded down to 16 */
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
# ifdef USE_AS_STRNCAT
cmp $48, %ebx
ja L(BigN)
# endif
pcmpeqb (%esi), %xmm1
# ifdef USE_AS_STRNCAT
add %ecx, %ebx /* budget now counted from the aligned source */
# endif
pmovmskb %xmm1, %edx
shr %cl, %edx /* drop bits for bytes in front of the source */
# ifdef USE_AS_STRNCAT
cmp $16, %ebx
jbe L(CopyFrom1To16BytesTailCase2OrCase3)
# endif
test %edx, %edx
jnz L(CopyFrom1To16BytesTail)
pcmpeqb 16(%esi), %xmm0
pmovmskb %xmm0, %edx
# ifdef USE_AS_STRNCAT
cmp $32, %ebx
jbe L(CopyFrom1To32BytesCase2OrCase3)
# endif
test %edx, %edx
jnz L(CopyFrom1To32Bytes)
/* No NUL in the first two aligned blocks: copy the first 16 source
   bytes from their real (unaligned) address.  */
movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */
movdqu %xmm1, (%eax)
sub %ecx, %eax /* keep eax in step with the aligned esi */
/* Copy with aligned 16-byte source loads and unaligned stores.
   On entry: esi is 16-byte aligned, eax has been moved back by the
   source misalignment so that offset k from eax corresponds to offset
   k from esi, xmm0 is all-zero, the aligned block at 16(%esi) is
   already known to contain no NUL, and (strncat) ebx is the length
   budget counted from esi.
   Each step stores the block checked by the previous step, loads the
   next one and tests it for NUL; ecx is the running offset.  For
   strncat the budget is reduced first and the step leaves through
   L(CopyFrom1To16BytesCase2OrCase3) once it is used up.  After six
   blocks the code switches to the 64-byte loop.  */
.p2align 4
L(Unalign16Both):
mov $16, %ecx
movdqa (%esi, %ecx), %xmm1
movaps 16(%esi, %ecx), %xmm2
movdqu %xmm1, (%eax, %ecx)
pcmpeqb %xmm2, %xmm0
pmovmskb %xmm0, %edx
add $16, %ecx
# ifdef USE_AS_STRNCAT
sub $48, %ebx
jbe L(CopyFrom1To16BytesCase2OrCase3)
# endif
test %edx, %edx
jnz L(CopyFrom1To16Bytes)
/* L(BigN) rejoins here after performing the first step itself.  */
L(Unalign16BothBigN):
movaps 16(%esi, %ecx), %xmm3
movdqu %xmm2, (%eax, %ecx)
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
add $16, %ecx
# ifdef USE_AS_STRNCAT
sub $16, %ebx
jbe L(CopyFrom1To16BytesCase2OrCase3)
# endif
test %edx, %edx
jnz L(CopyFrom1To16Bytes)
movaps 16(%esi, %ecx), %xmm4
movdqu %xmm3, (%eax, %ecx)
pcmpeqb %xmm4, %xmm0
pmovmskb %xmm0, %edx
add $16, %ecx
# ifdef USE_AS_STRNCAT
sub $16, %ebx
jbe L(CopyFrom1To16BytesCase2OrCase3)
# endif
test %edx, %edx
jnz L(CopyFrom1To16Bytes)
movaps 16(%esi, %ecx), %xmm1
movdqu %xmm4, (%eax, %ecx)
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %edx
add $16, %ecx
# ifdef USE_AS_STRNCAT
sub $16, %ebx
jbe L(CopyFrom1To16BytesCase2OrCase3)
# endif
test %edx, %edx
jnz L(CopyFrom1To16Bytes)
movaps 16(%esi, %ecx), %xmm2
movdqu %xmm1, (%eax, %ecx)
pcmpeqb %xmm2, %xmm0
pmovmskb %xmm0, %edx
add $16, %ecx
# ifdef USE_AS_STRNCAT
sub $16, %ebx
jbe L(CopyFrom1To16BytesCase2OrCase3)
# endif
test %edx, %edx
jnz L(CopyFrom1To16Bytes)
movaps 16(%esi, %ecx), %xmm3
movdqu %xmm2, (%eax, %ecx)
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
add $16, %ecx
# ifdef USE_AS_STRNCAT
sub $16, %ebx
jbe L(CopyFrom1To16BytesCase2OrCase3)
# endif
test %edx, %edx
jnz L(CopyFrom1To16Bytes)
movdqu %xmm3, (%eax, %ecx)
/* Switch to 64-byte steps: round the source position down to a 64-byte
   boundary and move eax (and the strncat budget) by the same distance,
   so some already-copied bytes are copied again.  */
mov %esi, %edx
lea 16(%esi, %ecx), %esi
and $-0x40, %esi
sub %esi, %edx /* edx = old esi - new esi */
sub %edx, %eax
# ifdef USE_AS_STRNCAT
lea 128(%ebx, %edx), %ebx /* budget counted from the new esi */
# endif
/* Load 64 bytes into xmm4..xmm7 and fold them with byte-wise minimum:
   the minimum has a zero byte exactly when one of the four blocks
   does.  */
movaps (%esi), %xmm2
movaps %xmm2, %xmm4
movaps 16(%esi), %xmm5
movaps 32(%esi), %xmm3
movaps %xmm3, %xmm6
movaps 48(%esi), %xmm7
pminub %xmm5, %xmm2
pminub %xmm7, %xmm3
pminub %xmm2, %xmm3
pcmpeqb %xmm0, %xmm3
pmovmskb %xmm3, %edx
# ifdef USE_AS_STRNCAT
sub $64, %ebx
jbe L(UnalignedLeaveCase2OrCase3)
# endif
test %edx, %edx
jnz L(Unaligned64Leave)
/* Main loop: 64 source bytes per iteration.
   On entry to each iteration xmm4..xmm7 hold the 64 bytes at the
   current esi, already known to contain no NUL, and xmm0 is all-zero.
   The loop stores them, loads the next 64 bytes into the same
   registers and tests those with a byte-wise minimum followed by a
   compare against zero.  Loads and stores are interleaved.  */
.p2align 4
L(Unaligned64Loop_start):
add $64, %eax
add $64, %esi
movdqu %xmm4, -64(%eax)
movaps (%esi), %xmm2
movdqa %xmm2, %xmm4
movdqu %xmm5, -48(%eax)
movaps 16(%esi), %xmm5
pminub %xmm5, %xmm2
movaps 32(%esi), %xmm3
movdqu %xmm6, -32(%eax)
movaps %xmm3, %xmm6
movdqu %xmm7, -16(%eax)
movaps 48(%esi), %xmm7
pminub %xmm7, %xmm3
pminub %xmm2, %xmm3
pcmpeqb %xmm0, %xmm3
pmovmskb %xmm3, %edx
# ifdef USE_AS_STRNCAT
sub $64, %ebx
jbe L(UnalignedLeaveCase2OrCase3)
# endif
test %edx, %edx
jz L(Unaligned64Loop_start)
/* A NUL is somewhere in xmm4..xmm7 (bytes 0..63 from esi), none of
   which has been stored yet.  Find the 16-byte block that holds it,
   store the complete blocks in front of it and let the exit table copy
   the remainder up to and including the NUL.  */
L(Unaligned64Leave):
pxor %xmm1, %xmm1
pcmpeqb %xmm4, %xmm0
pcmpeqb %xmm5, %xmm1
pmovmskb %xmm0, %edx
pmovmskb %xmm1, %ecx
test %edx, %edx
jnz L(CopyFrom1To16BytesUnaligned_0)
test %ecx, %ecx
jnz L(CopyFrom1To16BytesUnaligned_16)
/* Both masks were empty, so xmm0 and xmm1 are all-zero again.  */
pcmpeqb %xmm6, %xmm0
pcmpeqb %xmm7, %xmm1
pmovmskb %xmm0, %edx
pmovmskb %xmm1, %ecx
test %edx, %edx
jnz L(CopyFrom1To16BytesUnaligned_32)
/* The NUL is in the fourth block.  */
bsf %ecx, %edx
movdqu %xmm4, (%eax)
movdqu %xmm5, 16(%eax)
movdqu %xmm6, 32(%eax)
add $48, %esi
add $48, %eax
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
# ifdef USE_AS_STRNCAT
/* strncat only: variant of L(StartStrcpyPart_1) for a length limit
   above 48.  With such a limit the budget cannot run out within the
   first 48 bytes, so no length checks are made, and 48 is subtracted
   from ebx before the source misalignment (ecx) is added, which keeps
   the sum from wrapping for very large limits.  The first step of
   L(Unalign16Both) is repeated here without its length test, then
   control joins L(Unalign16BothBigN).
   On entry: esi = source rounded down to 16, ecx = source
   misalignment, xmm0 and xmm1 all-zero, eax = write position.  */
.p2align 4
L(BigN):
pcmpeqb (%esi), %xmm1
pmovmskb %xmm1, %edx
shr %cl, %edx /* drop bits for bytes in front of the source */
test %edx, %edx
jnz L(CopyFrom1To16BytesTail)
pcmpeqb 16(%esi), %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(CopyFrom1To32Bytes)
movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */
movdqu %xmm1, (%eax)
sub %ecx, %eax /* keep eax in step with the aligned esi */
sub $48, %ebx
add %ecx, %ebx
mov $16, %ecx
movdqa (%esi, %ecx), %xmm1
movaps 16(%esi, %ecx), %xmm2
movdqu %xmm1, (%eax, %ecx)
pcmpeqb %xmm2, %xmm0
pmovmskb %xmm0, %edx
add $16, %ecx
test %edx, %edx
jnz L(CopyFrom1To16Bytes)
jmp L(Unalign16BothBigN)
# endif
/*------------end of main part-------------------------------*/
/* Case1: a NUL was found in the source and no length limit applies.
   Each stub points esi/eax at the first bytes still to be copied,
   turns the NUL mask into the index of the NUL and jumps through
   L(ExitTable); entry k of that table copies k + 1 bytes, i.e. up to
   and including the NUL.  */
.p2align 4
/* From the 16-byte steps: ecx is the offset of the block with the
   NUL.  */
L(CopyFrom1To16Bytes):
add %ecx, %eax
add %ecx, %esi
bsf %edx, %edx
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
.p2align 4
/* NUL in the first aligned block; esi is aligned, ecx is the source
   misalignment, eax is the real write position and the mask was
   already shifted by ecx.  */
L(CopyFrom1To16BytesTail):
add %ecx, %esi
bsf %edx, %edx
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
.p2align 4
/* From L(StartStrcpyPart): NUL in source bytes 16..31, bytes 0..15
   already stored.  */
L(CopyFrom1To32Bytes1):
add $16, %esi
add $16, %eax
/* From L(StartStrcpyPart): NUL in source bytes 0..15.  */
L(CopyFrom1To16BytesTail1):
bsf %edx, %edx
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
.p2align 4
/* NUL in the second aligned block, nothing stored yet; the index
   relative to the real source start is 16 + bit index - ecx.  */
L(CopyFrom1To32Bytes):
bsf %edx, %edx
add %ecx, %esi
add $16, %edx
sub %ecx, %edx
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
.p2align 4
/* From L(Unaligned64Leave): NUL in the first, second or third block of
   the 64 bytes held in xmm4..xmm7; complete blocks before it are
   stored here.  */
L(CopyFrom1To16BytesUnaligned_0):
bsf %edx, %edx
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
.p2align 4
L(CopyFrom1To16BytesUnaligned_16):
bsf %ecx, %edx
movdqu %xmm4, (%eax)
add $16, %esi
add $16, %eax
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
.p2align 4
L(CopyFrom1To16BytesUnaligned_32):
bsf %edx, %edx
movdqu %xmm4, (%eax)
movdqu %xmm5, 16(%eax)
add $32, %esi
add $32, %eax
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
# ifdef USE_AS_STRNCAT
/* strncat only: tails taken when the length budget runs out in the
   block being examined.
   Case2: the block also contains a NUL.  If the NUL comes before the
   limit, copy through the NUL via L(ExitTable); otherwise copy ebx
   bytes and append a NUL via L(ExitStrncatTable).
   Case3: no NUL in the block; copy ebx bytes and append a NUL.
   In every stub ebx is first converted into the number of bytes still
   allowed from the position esi/eax are set to.  */
.p2align 4
/* Shared target: edx already holds the index of the NUL.  */
L(CopyFrom1To16BytesExit):
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
/* Case2 */
.p2align 4
/* From the 16-byte steps: ebx became <= 0 after the last subtraction;
   adding 16 gives the bytes still allowed in the block at offset
   ecx.  */
L(CopyFrom1To16BytesCase2):
add $16, %ebx
add %ecx, %eax
add %ecx, %esi
bsf %edx, %edx
cmp %ebx, %edx
jb L(CopyFrom1To16BytesExit)
BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
.p2align 4
/* NUL in the second aligned block; ebx had the source misalignment
   (ecx) added, so remove it again.  */
L(CopyFrom1To32BytesCase2):
sub %ecx, %ebx
add %ecx, %esi
bsf %edx, %edx
add $16, %edx
sub %ecx, %edx
cmp %ebx, %edx
jb L(CopyFrom1To16BytesExit)
BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
/* NUL in the first aligned block (mask already shifted by ecx).  */
L(CopyFrom1To16BytesTailCase2):
sub %ecx, %ebx
add %ecx, %esi
bsf %edx, %edx
cmp %ebx, %edx
jb L(CopyFrom1To16BytesExit)
BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
/* From L(StartStrcpyPart): esi/eax/ebx need no adjustment.  */
L(CopyFrom1To16BytesTail1Case2):
bsf %edx, %edx
cmp %ebx, %edx
jb L(CopyFrom1To16BytesExit)
BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
/* Case2 or Case3, Case3 */
.p2align 4
/* Dispatchers: a non-empty NUL mask in edx selects Case2, otherwise
   Case3 is handled in place.  */
L(CopyFrom1To16BytesCase2OrCase3):
test %edx, %edx
jnz L(CopyFrom1To16BytesCase2)
L(CopyFrom1To16BytesCase3):
add $16, %ebx
add %ecx, %eax
add %ecx, %esi
BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
.p2align 4
L(CopyFrom1To32BytesCase2OrCase3):
test %edx, %edx
jnz L(CopyFrom1To32BytesCase2)
sub %ecx, %ebx
add %ecx, %esi
BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
.p2align 4
L(CopyFrom1To16BytesTailCase2OrCase3):
test %edx, %edx
jnz L(CopyFrom1To16BytesTailCase2)
sub %ecx, %ebx
add %ecx, %esi
BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
.p2align 4
/* From L(StartStrcpyPart) after source bytes 0..15 were stored: skip
   them and charge them to the budget.  */
L(CopyFrom1To32Bytes1Case2OrCase3):
add $16, %eax
add $16, %esi
sub $16, %ebx
L(CopyFrom1To16BytesTail1Case2OrCase3):
test %edx, %edx
jnz L(CopyFrom1To16BytesTail1Case2)
BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
# endif
/* Exit stubs for 0..8 bytes.
   L(ExitN) copies N bytes from (%esi) to (%eax), reloads the original
   destination pointer from the stack as the return value and returns.
   It is reached through L(ExitTable) with edx = N - 1 (the index of
   the NUL), so the last byte copied is the terminator; with edx below
   32, dh is zero, which the strcat build uses as the NUL byte for some
   odd sizes.
   L(StrncatExitN) (strncat only) is reached through
   L(ExitStrncatTable) with ebx = N, so bh is zero; it stores that zero
   at offset N and then runs into L(ExitN) to copy the N data bytes.
   Because edx is not an index on that path, the strncat build loads
   the odd trailing byte from the source into dh instead.
   Sizes that are not a power of two use two overlapping moves.  */
# ifdef USE_AS_STRNCAT
.p2align 4
L(StrncatExit0):
movb %bh, (%eax) /* nothing to copy, just terminate */
mov STR3(%esp), %eax
RETURN
# endif
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit1):
movb %bh, 1(%eax)
# endif
L(Exit1):
# ifdef USE_AS_STRNCAT
movb (%esi), %dh
# endif
movb %dh, (%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit2):
movb %bh, 2(%eax)
# endif
L(Exit2):
movw (%esi), %dx
movw %dx, (%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit3):
movb %bh, 3(%eax)
# endif
L(Exit3):
movw (%esi), %cx
movw %cx, (%eax)
# ifdef USE_AS_STRNCAT
movb 2(%esi), %dh
# endif
movb %dh, 2(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit4):
movb %bh, 4(%eax)
# endif
L(Exit4):
movl (%esi), %edx
movl %edx, (%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit5):
movb %bh, 5(%eax)
# endif
L(Exit5):
movl (%esi), %ecx
# ifdef USE_AS_STRNCAT
movb 4(%esi), %dh
# endif
movb %dh, 4(%eax)
movl %ecx, (%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit6):
movb %bh, 6(%eax)
# endif
L(Exit6):
movl (%esi), %ecx
movw 4(%esi), %dx
movl %ecx, (%eax)
movw %dx, 4(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit7):
movb %bh, 7(%eax)
# endif
L(Exit7):
movl (%esi), %ecx
movl 3(%esi), %edx /* bytes 3..6, overlapping the first move */
movl %ecx, (%eax)
movl %edx, 3(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit8):
movb %bh, 8(%eax)
# endif
L(Exit8):
movlpd (%esi), %xmm0
movlpd %xmm0, (%eax)
mov STR3(%esp), %eax
RETURN
/* Exit stubs for 9..16 bytes.
   L(ExitN) copies N bytes from (%esi) to (%eax), reloads the original
   destination pointer from the stack as the return value and returns.
   When entered through L(ExitTable), edx = N - 1 and dh is therefore
   zero; the strcat build stores dh as the trailing NUL in L(Exit9).
   L(StrncatExitN) (strncat only) is entered with ebx = N, so bh is
   zero; it stores that zero at offset N and continues into L(ExitN).
   Since edx is not an index on that path, the strncat build loads the
   9th byte from the source instead.  Other sizes use two moves that
   may overlap.  */
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit9):
movb %bh, 9(%eax)
# endif
L(Exit9):
movlpd (%esi), %xmm0
# ifdef USE_AS_STRNCAT
movb 8(%esi), %dh
# endif
movb %dh, 8(%eax)
movlpd %xmm0, (%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit10):
movb %bh, 10(%eax)
# endif
L(Exit10):
movlpd (%esi), %xmm0
movw 8(%esi), %dx
movlpd %xmm0, (%eax)
movw %dx, 8(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit11):
movb %bh, 11(%eax)
# endif
L(Exit11):
movlpd (%esi), %xmm0
movl 7(%esi), %edx /* bytes 7..10, overlapping byte 7 */
movlpd %xmm0, (%eax)
movl %edx, 7(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit12):
movb %bh, 12(%eax)
# endif
L(Exit12):
movlpd (%esi), %xmm0
movl 8(%esi), %edx
movlpd %xmm0, (%eax)
movl %edx, 8(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit13):
movb %bh, 13(%eax)
# endif
L(Exit13):
movlpd (%esi), %xmm0
movlpd 5(%esi), %xmm1 /* bytes 5..12, overlapping bytes 5..7 */
movlpd %xmm0, (%eax)
movlpd %xmm1, 5(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit14):
movb %bh, 14(%eax)
# endif
L(Exit14):
movlpd (%esi), %xmm0
movlpd 6(%esi), %xmm1
movlpd %xmm0, (%eax)
movlpd %xmm1, 6(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit15):
movb %bh, 15(%eax)
# endif
L(Exit15):
movlpd (%esi), %xmm0
movlpd 7(%esi), %xmm1
movlpd %xmm0, (%eax)
movlpd %xmm1, 7(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit16):
movb %bh, 16(%eax)
# endif
L(Exit16):
movdqu (%esi), %xmm0
movdqu %xmm0, (%eax)
mov STR3(%esp), %eax
RETURN
/* Exit stubs for 17..24 bytes.
   L(ExitN) copies N bytes from (%esi) to (%eax), reloads the original
   destination pointer from the stack as the return value and returns.
   When entered through L(ExitTable), edx = N - 1 and dh is therefore
   zero; the strcat build stores dh as the trailing NUL in L(Exit17)
   and L(Exit21).
   L(StrncatExitN) (strncat only) is entered with ebx = N, so bh is
   zero; it stores that zero at offset N and continues into L(ExitN).
   Since edx is not an index on that path, the strncat build loads the
   odd trailing byte from the source instead.  The first 16 bytes
   always go through xmm0; the rest uses one or two smaller moves that
   may overlap it.  */
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit17):
movb %bh, 17(%eax)
# endif
L(Exit17):
movdqu (%esi), %xmm0
# ifdef USE_AS_STRNCAT
movb 16(%esi), %dh
# endif
movdqu %xmm0, (%eax)
movb %dh, 16(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit18):
movb %bh, 18(%eax)
# endif
L(Exit18):
movdqu (%esi), %xmm0
movw 16(%esi), %cx
movdqu %xmm0, (%eax)
movw %cx, 16(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit19):
movb %bh, 19(%eax)
# endif
L(Exit19):
movdqu (%esi), %xmm0
movl 15(%esi), %ecx /* bytes 15..18, overlapping byte 15 */
movdqu %xmm0, (%eax)
movl %ecx, 15(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit20):
movb %bh, 20(%eax)
# endif
L(Exit20):
movdqu (%esi), %xmm0
movl 16(%esi), %ecx
movdqu %xmm0, (%eax)
movl %ecx, 16(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit21):
movb %bh, 21(%eax)
# endif
L(Exit21):
movdqu (%esi), %xmm0
movl 16(%esi), %ecx
# ifdef USE_AS_STRNCAT
movb 20(%esi), %dh
# endif
movdqu %xmm0, (%eax)
movl %ecx, 16(%eax)
movb %dh, 20(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit22):
movb %bh, 22(%eax)
# endif
L(Exit22):
movdqu (%esi), %xmm0
movlpd 14(%esi), %xmm3 /* bytes 14..21, overlapping bytes 14..15 */
movdqu %xmm0, (%eax)
movlpd %xmm3, 14(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit23):
movb %bh, 23(%eax)
# endif
L(Exit23):
movdqu (%esi), %xmm0
movlpd 15(%esi), %xmm3
movdqu %xmm0, (%eax)
movlpd %xmm3, 15(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit24):
movb %bh, 24(%eax)
# endif
L(Exit24):
movdqu (%esi), %xmm0
movlpd 16(%esi), %xmm2
movdqu %xmm0, (%eax)
movlpd %xmm2, 16(%eax)
mov STR3(%esp), %eax
RETURN
/* Exit stubs for 25..32 bytes.
   L(ExitN) copies N bytes from (%esi) to (%eax), reloads the original
   destination pointer from the stack as the return value and returns.
   When entered through L(ExitTable), edx = N - 1 and dh is therefore
   zero; the strcat build stores dh as the trailing NUL in L(Exit25).
   L(StrncatExitN) (strncat only) is entered with ebx = N, so bh is
   zero; it stores that zero at offset N and continues into L(ExitN).
   Since edx is not an index on that path, the strncat build loads the
   25th byte from the source instead.  Bytes 0..15 go through xmm0; the
   remainder uses further moves that may overlap earlier ones.  */
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit25):
movb %bh, 25(%eax)
# endif
L(Exit25):
movdqu (%esi), %xmm0
movlpd 16(%esi), %xmm2
# ifdef USE_AS_STRNCAT
movb 24(%esi), %dh
# endif
movdqu %xmm0, (%eax)
movlpd %xmm2, 16(%eax)
movb %dh, 24(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit26):
movb %bh, 26(%eax)
# endif
L(Exit26):
movdqu (%esi), %xmm0
movlpd 16(%esi), %xmm2
movw 24(%esi), %cx
movdqu %xmm0, (%eax)
movlpd %xmm2, 16(%eax)
movw %cx, 24(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit27):
movb %bh, 27(%eax)
# endif
L(Exit27):
movdqu (%esi), %xmm0
movlpd 16(%esi), %xmm2
movl 23(%esi), %ecx /* bytes 23..26, overlapping byte 23 */
movdqu %xmm0, (%eax)
movlpd %xmm2, 16(%eax)
movl %ecx, 23(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit28):
movb %bh, 28(%eax)
# endif
L(Exit28):
movdqu (%esi), %xmm0
movlpd 16(%esi), %xmm2
movl 24(%esi), %ecx
movdqu %xmm0, (%eax)
movlpd %xmm2, 16(%eax)
movl %ecx, 24(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit29):
movb %bh, 29(%eax)
# endif
L(Exit29):
movdqu (%esi), %xmm0
movdqu 13(%esi), %xmm2 /* bytes 13..28, overlapping bytes 13..15 */
movdqu %xmm0, (%eax)
movdqu %xmm2, 13(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit30):
movb %bh, 30(%eax)
# endif
L(Exit30):
movdqu (%esi), %xmm0
movdqu 14(%esi), %xmm2
movdqu %xmm0, (%eax)
movdqu %xmm2, 14(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit31):
movb %bh, 31(%eax)
# endif
L(Exit31):
movdqu (%esi), %xmm0
movdqu 15(%esi), %xmm2
movdqu %xmm0, (%eax)
movdqu %xmm2, 15(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
# ifdef USE_AS_STRNCAT
L(StrncatExit32):
movb %bh, 32(%eax)
# endif
L(Exit32):
movdqu (%esi), %xmm0
movdqu 16(%esi), %xmm2
movdqu %xmm0, (%eax)
movdqu %xmm2, 16(%eax)
mov STR3(%esp), %eax
RETURN
# ifdef USE_AS_STRNCAT
/* strncat only: leave the 64-byte loop because the length budget ends
   within the 64 bytes held in xmm4..xmm7 (not yet stored).
   On entry ebx = (bytes still allowed from esi) - 64, i.e. a value in
   -63..0, and edx is the NUL mask of the byte-wise minimum of the four
   blocks.  */
.p2align 4
L(UnalignedLeaveCase2OrCase3):
test %edx, %edx
jnz L(Unaligned64LeaveCase2)
/* Case3: no NUL in these 64 bytes.  Store whole 16-byte blocks while
   the budget covers them, then let L(CopyFrom1To16BytesCase3) copy the
   remaining bytes and append the NUL.  ecx is precomputed as the
   offset of the block in which the budget ends.  */
L(Unaligned64LeaveCase3):
lea 64(%ebx), %ecx /* ecx = bytes still allowed */
and $-16, %ecx /* ... rounded down to a block offset */
add $48, %ebx
jl L(CopyFrom1To16BytesCase3) /* fewer than 16 bytes allowed */
movdqu %xmm4, (%eax)
sub $16, %ebx
jb L(CopyFrom1To16BytesCase3)
movdqu %xmm5, 16(%eax)
sub $16, %ebx
jb L(CopyFrom1To16BytesCase3)
movdqu %xmm6, 32(%eax)
sub $16, %ebx
jb L(CopyFrom1To16BytesCase3)
/* Exactly 64 bytes allowed: store the last block and terminate.  */
movdqu %xmm7, 48(%eax)
xor %bh, %bh
movb %bh, 64(%eax)
mov STR3(%esp), %eax
RETURN
.p2align 4
/* Case2: a NUL is present as well.  Walk the four blocks in order,
   tracking the block offset in ecx; at each block leave through the
   strncat tail if the budget ends there, or through the plain tail if
   the NUL is there, storing the block otherwise.  xmm0 is all-zero on
   entry and after every compare that finds no NUL.  */
L(Unaligned64LeaveCase2):
xor %ecx, %ecx
pcmpeqb %xmm4, %xmm0
pmovmskb %xmm0, %edx
add $48, %ebx
jle L(CopyFrom1To16BytesCase2OrCase3)
test %edx, %edx
jnz L(CopyFrom1To16Bytes)
pcmpeqb %xmm5, %xmm0
pmovmskb %xmm0, %edx
movdqu %xmm4, (%eax)
add $16, %ecx
sub $16, %ebx
jbe L(CopyFrom1To16BytesCase2OrCase3)
test %edx, %edx
jnz L(CopyFrom1To16Bytes)
pcmpeqb %xmm6, %xmm0
pmovmskb %xmm0, %edx
movdqu %xmm5, 16(%eax)
add $16, %ecx
sub $16, %ebx
jbe L(CopyFrom1To16BytesCase2OrCase3)
test %edx, %edx
jnz L(CopyFrom1To16Bytes)
/* Fourth block: it must hold the NUL; ebx is the number of bytes still
   allowed in it.  */
pcmpeqb %xmm7, %xmm0
pmovmskb %xmm0, %edx
movdqu %xmm6, 32(%eax)
lea 16(%eax, %ecx), %eax
lea 16(%esi, %ecx), %esi
bsf %edx, %edx
cmp %ebx, %edx
jb L(CopyFrom1To16BytesExit)
BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
# endif
/* Nothing to append (empty source, or a zero length limit for
   strncat).  eax still holds the destination pointer loaded at entry,
   which is the return value.  */
.p2align 4
L(ExitZero):
RETURN
END (STRCAT)
/* Jump tables used by BRANCH_TO_JMPTBL_ENTRY.
   L(ExitTable): entry k is the stub that copies k + 1 bytes; it is
   indexed by the position of the source's NUL within the final chunk.
   L(ExitStrncatTable) (strncat only): entry k is the stub that copies
   k bytes and appends a NUL; it is indexed by the remaining length
   budget.
   The alignment directive is issued after switching sections so that
   it aligns the tables in .rodata rather than padding .text.  */
.section .rodata
.p2align 4
L(ExitTable):
.int JMPTBL(L(Exit1), L(ExitTable))
.int JMPTBL(L(Exit2), L(ExitTable))
.int JMPTBL(L(Exit3), L(ExitTable))
.int JMPTBL(L(Exit4), L(ExitTable))
.int JMPTBL(L(Exit5), L(ExitTable))
.int JMPTBL(L(Exit6), L(ExitTable))
.int JMPTBL(L(Exit7), L(ExitTable))
.int JMPTBL(L(Exit8), L(ExitTable))
.int JMPTBL(L(Exit9), L(ExitTable))
.int JMPTBL(L(Exit10), L(ExitTable))
.int JMPTBL(L(Exit11), L(ExitTable))
.int JMPTBL(L(Exit12), L(ExitTable))
.int JMPTBL(L(Exit13), L(ExitTable))
.int JMPTBL(L(Exit14), L(ExitTable))
.int JMPTBL(L(Exit15), L(ExitTable))
.int JMPTBL(L(Exit16), L(ExitTable))
.int JMPTBL(L(Exit17), L(ExitTable))
.int JMPTBL(L(Exit18), L(ExitTable))
.int JMPTBL(L(Exit19), L(ExitTable))
.int JMPTBL(L(Exit20), L(ExitTable))
.int JMPTBL(L(Exit21), L(ExitTable))
.int JMPTBL(L(Exit22), L(ExitTable))
.int JMPTBL(L(Exit23), L(ExitTable))
.int JMPTBL(L(Exit24), L(ExitTable))
.int JMPTBL(L(Exit25), L(ExitTable))
.int JMPTBL(L(Exit26), L(ExitTable))
.int JMPTBL(L(Exit27), L(ExitTable))
.int JMPTBL(L(Exit28), L(ExitTable))
.int JMPTBL(L(Exit29), L(ExitTable))
.int JMPTBL(L(Exit30), L(ExitTable))
.int JMPTBL(L(Exit31), L(ExitTable))
.int JMPTBL(L(Exit32), L(ExitTable))
# ifdef USE_AS_STRNCAT
L(ExitStrncatTable):
.int JMPTBL(L(StrncatExit0), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit1), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit2), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit3), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit4), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit5), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit6), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit7), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit8), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit9), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit10), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit11), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit12), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit13), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit14), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit15), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit16), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit17), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit18), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit19), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit20), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit21), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit22), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit23), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit24), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit25), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit26), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit27), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit28), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit29), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit30), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit31), L(ExitStrncatTable))
.int JMPTBL(L(StrncatExit32), L(ExitStrncatTable))
# endif
#endif