glibc/sysdeps/x86_64/multiarch/strcmp.S

1882 lines
46 KiB
ArmAsm

/* strcmp with SSE4.2
Copyright (C) 2009, 2010 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <init-arch.h>
/* Build-time variant selection.  The same source is assembled as
strncmp (USE_AS_STRNCMP), strcasecmp_l (USE_AS_STRCASECMP_L) or, with
neither defined, plain strcmp.  Each variant names its SSE4.2, SSSE3
and SSE2 implementations and its __GI_ alias.  Only the plain strcmp
branch defines STRCMP itself; the other two variants presumably get it
from the file that includes this one -- TODO confirm.  */
#ifdef USE_AS_STRNCMP
/* UPDATE_STRNCMP_COUNTER: account for the first, partial 16-byte block.
In: %rcx = start offset inside the block, %r11 = bytes left to compare.
Out: %r11 = %r11 - (16 - %rcx).  Clobbers %r9 and the flags.
Since the counter, %r11, is unsigned, we branch to strcmp_exitz
if the new counter > the old one (it wrapped) or is 0. */
# define UPDATE_STRNCMP_COUNTER \
/* %r9 = bytes left after this block: %r11 - (16 - %rcx) */ \
lea -16(%rcx, %r11), %r9; \
cmp %r9, %r11; \
jb LABEL(strcmp_exitz_sse4_2); \
test %r9, %r9; \
je LABEL(strcmp_exitz_sse4_2); \
mov %r9, %r11
# define STRCMP_SSE42 __strncmp_sse42
# define STRCMP_SSSE3 __strncmp_ssse3
# define STRCMP_SSE2 __strncmp_sse2
# define __GI_STRCMP __GI_strncmp
#elif defined USE_AS_STRCASECMP_L
# include "locale-defines.h"
/* No length counter in this variant: the macro expands to nothing. */
# define UPDATE_STRNCMP_COUNTER
# define STRCMP_SSE42 __strcasecmp_l_sse42
# define STRCMP_SSSE3 __strcasecmp_l_ssse3
# define STRCMP_SSE2 __strcasecmp_l_sse2
# define __GI_STRCMP __GI___strcasecmp_l
#else
/* Plain strcmp: no length counter either. */
# define UPDATE_STRNCMP_COUNTER
# ifndef STRCMP
# define STRCMP strcmp
# define STRCMP_SSE42 __strcmp_sse42
# define STRCMP_SSSE3 __strcmp_ssse3
# define STRCMP_SSE2 __strcmp_sse2
# define __GI_STRCMP __GI_strcmp
# endif
#endif
/* LABEL wraps a local label name in L() unless the includer has
already supplied its own definition. */
#ifndef LABEL
# define LABEL(l) L(l)
#endif
/* Define multiple versions only for the definition in libc. Don't
define multiple versions for strncmp in the static library, since
strncmp is needed there before the initialization has happened. */
#if (defined SHARED || !defined USE_AS_STRNCMP) && !defined NOT_IN_libc
.text
/* Indirect-function resolver for STRCMP.  It takes no string arguments;
it returns in %rax the address of the implementation to use:
STRCMP_SSE42 if the SSE4.2 feature bit is set, else STRCMP_SSSE3 if the
SSSE3 bit is set, else STRCMP_SSE2.  */
ENTRY(STRCMP)
.type STRCMP, @gnu_indirect_function
/* A zero "kind" word sends us through __init_cpu_features first. */
cmpl $0, __cpu_features+KIND_OFFSET(%rip)
jne 1f
call __init_cpu_features
1:
/* Load the preferred candidate, then keep it only if its feature
bit is set; otherwise fall down to the next candidate. */
leaq STRCMP_SSE42(%rip), %rax
testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
jnz 2f
leaq STRCMP_SSSE3(%rip), %rax
testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jnz 2f
/* Neither feature bit set: use the SSE2 version. */
leaq STRCMP_SSE2(%rip), %rax
2: ret
END(STRCMP)
# ifdef USE_AS_STRCASECMP_L
/* Indirect-function resolver for __strcasecmp (weakly aliased to
strcasecmp below).  Same selection order as the STRCMP resolver:
returns in %rax __strcasecmp_sse42, __strcasecmp_ssse3 or
__strcasecmp_sse2 depending on the feature bits.  */
ENTRY(__strcasecmp)
.type __strcasecmp, @gnu_indirect_function
/* A zero "kind" word sends us through __init_cpu_features first. */
cmpl $0, __cpu_features+KIND_OFFSET(%rip)
jne 1f
call __init_cpu_features
1:
leaq __strcasecmp_sse42(%rip), %rax
testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
jnz 2f
leaq __strcasecmp_ssse3(%rip), %rax
testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jnz 2f
/* Neither feature bit set: use the SSE2 version. */
leaq __strcasecmp_sse2(%rip), %rax
2: ret
END(__strcasecmp)
weak_alias (__strcasecmp, strcasecmp)
# endif
/* We use 0x1a:
_SIDD_SBYTE_OPS
| _SIDD_CMP_EQUAL_EACH
| _SIDD_NEGATIVE_POLARITY
| _SIDD_LEAST_SIGNIFICANT
on pcmpistri to find out if two 16byte data elements are the same
and the offset of the first different byte. There are 4 cases:
1. Both 16byte data elements are valid and identical.
2. Both 16byte data elements have EOS and identical.
3. Both 16byte data elements are valid and they differ at offset X.
4. At least one 16byte data element has EOS at offset X. Two 16byte
data elements must differ at or before offset X.
Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
case ECX CFlag ZFlag SFlag
1 16 0 0 0
2 16 0 1 1
3 X 1 0 0
4 0 <= X 1 0/1 0/1
We exit from the loop for cases 2, 3 and 4 with jbe which branches
when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
case 2. */
/* Put all SSE 4.2 functions together. */
.section .text.sse4.2,"ax",@progbits
.align 16
.type STRCMP_SSE42, @function
#ifdef USE_AS_STRCASECMP_L
/* 5-byte NOP. */
/* NOTE(review): presumably padding that positions the code following
the two-instruction stub below -- confirm against the ENTRY macro,
which is not visible here. */
.byte 0x0f,0x1f,0x44,0x00,0x00
/* __strcasecmp_sse42 (s1 in %rdi, s2 in %rsi): strcasecmp without an
explicit locale.  It reads the thread-local __libc_tsd_LOCALE value
into %rdx, where the code below reads its locale pointer, and then
runs straight into STRCMP_SSE42 (there is no ret).  */
ENTRY (__strcasecmp_sse42)
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* TLS offset of the variable */
movq %fs:(%rax),%rdx /* %rdx = its value for this thread */
END (__strcasecmp_sse42)
/* FALLTHROUGH to strcasecmp_l. */
#endif
/*
 * STRCMP_SSE42 -- SSE4.2 body shared by the strcmp, strncmp and
 * strcasecmp_l builds of this file.
 *
 * In:  %rdi, %rsi  the two strings
 *      %rdx        byte count (USE_AS_STRNCMP) or locale pointer
 *                  (USE_AS_STRCASECMP_L)
 *
 * This prologue sends locales it cannot handle to the generic code,
 * deals with the trivial strncmp counts, and compares the first 16
 * bytes with unaligned loads when neither load can cross a 64-byte
 * cache line.  If those 16 bytes are equal and contain no NUL it
 * advances both pointers and falls through to the alignment dispatch
 * that follows; otherwise it branches away.
 */
STRCMP_SSE42:
cfi_startproc
CALL_MCOUNT
/*
* This implementation uses SSE to compare up to 16 bytes at a time.
*/
#ifdef USE_AS_STRCASECMP_L
/* We have to fall back on the C implementation for locales
with encodings not matching ASCII for single bytes. */
# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
movq LOCALE_T___LOCALES+LC_CTYPE*8(%rdx), %rax
# else
movq (%rdx), %rax
# endif
/* Test the flag with a non-zero mask.  A mask of 0 always yields ZF=1,
which would make the jne below dead code and run the ASCII-only
vector path for every locale. */
testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
jne __strcasecmp_l_nonascii
#endif
#ifdef USE_AS_STRNCMP
test %rdx, %rdx
je LABEL(strcmp_exitz_sse4_2) /* n == 0: strings compare equal */
cmp $1, %rdx
je LABEL(Byte0_sse4_2) /* n == 1: only the first byte matters */
mov %rdx, %r11 /* %r11 = bytes left to compare */
#endif
mov %esi, %ecx
mov %edi, %eax
/* Use 64bit AND here to avoid long NOP padding. */
and $0x3f, %rcx /* rsi alignment in cache line */
and $0x3f, %rax /* rdi alignment in cache line */
#ifdef USE_AS_STRCASECMP_L
/* Constants for TOLOWER: a byte b is an ASCII upper-case letter when
0x40 < b < 0x5b; OR-ing in 0x20 turns it into lower case. */
.section .rodata.cst16,"aM",@progbits,16
.align 16
.Lbelowupper_sse4:
.quad 0x4040404040404040
.quad 0x4040404040404040
.Ltopupper_sse4:
.quad 0x5b5b5b5b5b5b5b5b
.quad 0x5b5b5b5b5b5b5b5b
.Ltouppermask_sse4:
.quad 0x2020202020202020
.quad 0x2020202020202020
.previous
movdqa .Lbelowupper_sse4(%rip), %xmm4
# define UCLOW_reg %xmm4
movdqa .Ltopupper_sse4(%rip), %xmm5
# define UCHIGH_reg %xmm5
movdqa .Ltouppermask_sse4(%rip), %xmm6
# define LCQWORD_reg %xmm6
#endif
/* An offset above 0x30 inside the 64-byte line means a 16-byte load
would run past the end of the line. */
cmp $0x30, %ecx
ja LABEL(crosscache_sse4_2)/* rsi: 16-byte load will cross cache line */
cmp $0x30, %eax
ja LABEL(crosscache_sse4_2)/* rdi: 16-byte load will cross cache line */
movdqu (%rdi), %xmm1
movdqu (%rsi), %xmm2
# ifdef USE_AS_STRCASECMP_L
/* TOLOWER (reg1, reg2): set bit 0x20 in every byte of reg1 and reg2
that lies strictly between 0x40 and 0x5b (signed byte compare), i.e.
fold 'A'..'Z' to lower case.  Uses %xmm7-%xmm10 as scratch and reads
the three constant registers loaded above. */
# define TOLOWER(reg1, reg2) \
movdqa reg1, %xmm7; \
movdqa UCHIGH_reg, %xmm8; \
movdqa reg2, %xmm9; \
movdqa UCHIGH_reg, %xmm10; \
pcmpgtb UCLOW_reg, %xmm7; \
pcmpgtb reg1, %xmm8; \
pcmpgtb UCLOW_reg, %xmm9; \
pcmpgtb reg2, %xmm10; \
pand %xmm8, %xmm7; \
pand %xmm10, %xmm9; \
pand LCQWORD_reg, %xmm7; \
pand LCQWORD_reg, %xmm9; \
por %xmm7, reg1; \
por %xmm9, reg2
TOLOWER (%xmm1, %xmm2)
# else
/* Case-sensitive builds: TOLOWER expands to nothing. */
# define TOLOWER(reg1, reg2)
# endif
pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
psubb %xmm0, %xmm1 /* packed sub of comparison results */
pmovmskb %xmm1, %edx /* bit i set <=> byte i equal and not NUL */
sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
jnz LABEL(less16bytes_sse4_2)/* If not, find different value or null char */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)/* finish comparison */
#endif
add $16, %rsi /* prepare to search next 16 bytes */
add $16, %rdi /* prepare to search next 16 bytes */
/*
* Determine source and destination string offsets from 16-byte alignment.
* Use relative offset difference between the two to determine which case
* below to use.
*
* In:  %rsi, %rdi string pointers; %ecx, %eax their low address bits
*      (only the low four bits are used from here on).
* Out: %rsi, %rdi rounded down to 16 bytes, %ecx/%eax = offsets inside
*      those blocks with %ecx >= %eax, %edx = 0xffff, %r8d = 0 or
*      0xffff when the two strings were exchanged to get that order.
*/
.p2align 4
LABEL(crosscache_sse4_2):
and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
mov $0xffff, %edx /* for equivalent offset */
xor %r8d, %r8d /* 0 = strings not exchanged */
and $0xf, %ecx /* offset of rsi */
and $0xf, %eax /* offset of rdi */
cmp %eax, %ecx
je LABEL(ashr_0_sse4_2) /* rsi and rdi relative offset same */
ja LABEL(bigger_sse4_2) /* rsi offset already the larger one */
/* rdi has the larger offset: exchange the roles of the two strings and
record that in %r8d. */
mov %edx, %r8d /* r8d is offset flag for exit tail */
xchg %ecx, %eax
xchg %rsi, %rdi
LABEL(bigger_sse4_2):
/* %r9 = 15 + %rax - %rcx is the index into a table of 32-bit signed
offsets that are relative to the start of the table itself. */
lea 15(%rax), %r9
sub %rcx, %r9
lea LABEL(unaligned_table_sse4_2)(%rip), %r10
movslq (%r10, %r9,4), %r9
lea (%r10, %r9), %r10
jmp *%r10 /* jump to corresponding case */
/*
* The following cases will be handled by ashr_0
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(0~15) n(0~15) 15(15+ n-n) ashr_0
*
* Both strings start at the same offset (%ecx) inside their 16-byte
* blocks.  The first block is compared with the bytes in front of the
* strings masked off; after that both sides are read with aligned loads.
*/
.p2align 4
LABEL(ashr_0_sse4_2):
movdqa (%rsi), %xmm1
pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
#ifndef USE_AS_STRCASECMP_L
pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
#else
movdqa (%rdi), %xmm2
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
#endif
psubb %xmm0, %xmm1 /* packed sub of comparison results*/
pmovmskb %xmm1, %r9d /* bit i set <=> byte i equal and not NUL */
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
sub %r9d, %edx
/*
* edx must be the same with r9d if in left byte (16-rcx) is equal to
* the start from (16-rax) and no null char was seen.
*/
jne LABEL(less32bytes_sse4_2) /* mismatch or null char */
UPDATE_STRNCMP_COUNTER
mov $16, %rcx
mov $16, %r9
pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
/*
* Now both strings are aligned at 16-byte boundary. Loop over strings
* checking 32-bytes per iteration.
*/
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
.p2align 4
LABEL(ashr_0_use_sse4_2):
movdqa (%rdi,%rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
/* lea advances the index without touching the flags that the jbe
below still needs from pcmpistri. */
lea 16(%rdx), %rdx
jbe LABEL(ashr_0_use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
/* Second half of the unrolled iteration. */
movdqa (%rdi,%rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
lea 16(%rdx), %rdx
jbe LABEL(ashr_0_use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
jmp LABEL(ashr_0_use_sse4_2)
/* Loop exit: the flags are those of the last pcmpistri and %ecx is
the index it reported; %rdx has already been advanced by 16. */
.p2align 4
LABEL(ashr_0_use_sse4_2_exit):
jnc LABEL(strcmp_exitz_sse4_2) /* CFlag == 0: both ended, equal */
#ifdef USE_AS_STRNCMP
sub %rcx, %r11
jbe LABEL(strcmp_exitz_sse4_2) /* difference lies beyond the count */
#endif
lea -16(%rdx, %rcx), %rcx /* byte offset of the difference */
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %edx
# ifdef USE_AS_STRCASECMP_L
/* Map both bytes through a table of 32-bit entries before subtracting.
NOTE(review): the +128*4 bias presumably makes index 0 the entry for
character 0 -- confirm against the table definition. */
leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
movl (%rcx,%rax,4), %eax
movl (%rcx,%rdx,4), %edx
# endif
sub %edx, %eax /* return value = difference of the two bytes */
ret
/*
 * The following cases will be handled by ashr_1
 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
 * n(15) n -15 0(15 +(n-15) - n) ashr_1
 *
 * Entered with %rsi/%rdi rounded down to 16 bytes, %ecx = start offset
 * of the %rsi string inside its block and %edx = 0xffff.  Each aligned
 * %rsi block is compared with 16 bytes re-assembled from two adjacent
 * %rdi blocks by palignr $1.
 */
.p2align 4
LABEL(ashr_1_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $15, %xmm2 /* shift first string to align with second */
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* packed sub of comparison results */
pmovmskb %xmm2, %r9d /* bit i set <=> byte i equal and not NUL */
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
sub %r9d, %edx
jnz LABEL(less32bytes_sse4_2) /* mismatch or null char seen */
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $1, %r9d /* byte position left over from less32bytes case */
/*
 * Setup %r10 value allows us to detect crossing a page boundary.
 * When %r10 goes positive we have crossed a page boundary and
 * need to do a nibble.
 */
lea 1(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop */
.p2align 4
LABEL(loop_ashr_1_use_sse4_2):
add $16, %r10
jg LABEL(nibble_ashr_1_use_sse4_2)
/*
 * The nibble check resumes here rather than at the loop head: %r10 has
 * already been advanced for this block, and adding 16 a second time
 * would let a later page boundary go undetected.
 */
LABEL(nibble_ashr_1_restart_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $1, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
/* Second half of the unrolled iteration. */
add $16, %r10
jg LABEL(nibble_ashr_1_use_sse4_2)
movdqa (%rdi, %rdx), %xmm0
palignr $1, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_1_use_sse4_2)
/*
 * The next %rdi block starts a new page.  Before touching it, make sure
 * the not yet compared bytes of the previous block hold no NUL (and,
 * for strncmp, that the remaining count reaches past them).
 */
.p2align 4
LABEL(nibble_ashr_1_use_sse4_2):
sub $0x1000, %r10 /* re-arm the counter for the new page */
movdqa -16(%rdi, %rdx), %xmm0
psrldq $1, %xmm0 /* keep only the 15 not yet compared bytes */
pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first NUL */
#ifdef USE_AS_STRNCMP
cmp %r11, %rcx
jae LABEL(nibble_ashr_use_sse4_2_exit) /* count <= index of first NUL */
#endif
cmp $14, %ecx
ja LABEL(nibble_ashr_1_restart_use_sse4_2) /* no NUL: resume the loop */
jmp LABEL(nibble_ashr_use_sse4_2_exit)
/*
 * The following cases will be handled by ashr_2
 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
 *
 * Entered with %rsi/%rdi rounded down to 16 bytes, %ecx = start offset
 * of the %rsi string inside its block and %edx = 0xffff.  Each aligned
 * %rsi block is compared with 16 bytes re-assembled from two adjacent
 * %rdi blocks by palignr $2.
 */
.p2align 4
LABEL(ashr_2_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $14, %xmm2 /* shift first string to align with second */
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* packed sub of comparison results */
pmovmskb %xmm2, %r9d /* bit i set <=> byte i equal and not NUL */
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
sub %r9d, %edx
jnz LABEL(less32bytes_sse4_2) /* mismatch or null char seen */
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $2, %r9d /* byte position left over from less32bytes case */
/*
 * Setup %r10 value allows us to detect crossing a page boundary.
 * When %r10 goes positive we have crossed a page boundary and
 * need to do a nibble.
 */
lea 2(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop */
.p2align 4
LABEL(loop_ashr_2_use_sse4_2):
add $16, %r10
jg LABEL(nibble_ashr_2_use_sse4_2)
/*
 * The nibble check resumes here rather than at the loop head: %r10 has
 * already been advanced for this block, and adding 16 a second time
 * would let a later page boundary go undetected.
 */
LABEL(nibble_ashr_2_restart_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $2, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
/* Second half of the unrolled iteration. */
add $16, %r10
jg LABEL(nibble_ashr_2_use_sse4_2)
movdqa (%rdi, %rdx), %xmm0
palignr $2, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_2_use_sse4_2)
/*
 * The next %rdi block starts a new page.  Before touching it, make sure
 * the not yet compared bytes of the previous block hold no NUL (and,
 * for strncmp, that the remaining count reaches past them).
 */
.p2align 4
LABEL(nibble_ashr_2_use_sse4_2):
sub $0x1000, %r10 /* re-arm the counter for the new page */
movdqa -16(%rdi, %rdx), %xmm0
psrldq $2, %xmm0 /* keep only the 14 not yet compared bytes */
pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first NUL */
#ifdef USE_AS_STRNCMP
cmp %r11, %rcx
jae LABEL(nibble_ashr_use_sse4_2_exit) /* count <= index of first NUL */
#endif
cmp $13, %ecx
ja LABEL(nibble_ashr_2_restart_use_sse4_2) /* no NUL: resume the loop */
jmp LABEL(nibble_ashr_use_sse4_2_exit)
/*
 * The following cases will be handled by ashr_3
 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
 *
 * Entered with %rsi/%rdi rounded down to 16 bytes, %ecx = start offset
 * of the %rsi string inside its block and %edx = 0xffff.  Each aligned
 * %rsi block is compared with 16 bytes re-assembled from two adjacent
 * %rdi blocks by palignr $3.
 */
.p2align 4
LABEL(ashr_3_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $13, %xmm2 /* shift first string to align with second */
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* packed sub of comparison results */
pmovmskb %xmm2, %r9d /* bit i set <=> byte i equal and not NUL */
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
sub %r9d, %edx
jnz LABEL(less32bytes_sse4_2) /* mismatch or null char seen */
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $3, %r9d /* byte position left over from less32bytes case */
/*
 * Setup %r10 value allows us to detect crossing a page boundary.
 * When %r10 goes positive we have crossed a page boundary and
 * need to do a nibble.
 */
lea 3(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop */
/* Align the loop head like every other ashr_N loop in this file. */
.p2align 4
LABEL(loop_ashr_3_use_sse4_2):
add $16, %r10
jg LABEL(nibble_ashr_3_use_sse4_2)
/*
 * The nibble check resumes here rather than at the loop head: %r10 has
 * already been advanced for this block, and adding 16 a second time
 * would let a later page boundary go undetected.
 */
LABEL(nibble_ashr_3_restart_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $3, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
/* Second half of the unrolled iteration. */
add $16, %r10
jg LABEL(nibble_ashr_3_use_sse4_2)
movdqa (%rdi, %rdx), %xmm0
palignr $3, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_3_use_sse4_2)
/*
 * The next %rdi block starts a new page.  Before touching it, make sure
 * the not yet compared bytes of the previous block hold no NUL (and,
 * for strncmp, that the remaining count reaches past them).
 */
.p2align 4
LABEL(nibble_ashr_3_use_sse4_2):
sub $0x1000, %r10 /* re-arm the counter for the new page */
movdqa -16(%rdi, %rdx), %xmm0
psrldq $3, %xmm0 /* keep only the 13 not yet compared bytes */
pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first NUL */
#ifdef USE_AS_STRNCMP
cmp %r11, %rcx
jae LABEL(nibble_ashr_use_sse4_2_exit) /* count <= index of first NUL */
#endif
cmp $12, %ecx
ja LABEL(nibble_ashr_3_restart_use_sse4_2) /* no NUL: resume the loop */
jmp LABEL(nibble_ashr_use_sse4_2_exit)
/*
 * The following cases will be handled by ashr_4
 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
 *
 * Entered with %rsi/%rdi rounded down to 16 bytes, %ecx = start offset
 * of the %rsi string inside its block and %edx = 0xffff.  Each aligned
 * %rsi block is compared with 16 bytes re-assembled from two adjacent
 * %rdi blocks by palignr $4.
 */
.p2align 4
LABEL(ashr_4_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $12, %xmm2 /* shift first string to align with second */
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* packed sub of comparison results */
pmovmskb %xmm2, %r9d /* bit i set <=> byte i equal and not NUL */
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
sub %r9d, %edx
jnz LABEL(less32bytes_sse4_2) /* mismatch or null char seen */
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $4, %r9d /* byte position left over from less32bytes case */
/*
 * Setup %r10 value allows us to detect crossing a page boundary.
 * When %r10 goes positive we have crossed a page boundary and
 * need to do a nibble.
 */
lea 4(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop */
.p2align 4
LABEL(loop_ashr_4_use_sse4_2):
add $16, %r10
jg LABEL(nibble_ashr_4_use_sse4_2)
/*
 * The nibble check resumes here rather than at the loop head: %r10 has
 * already been advanced for this block, and adding 16 a second time
 * would let a later page boundary go undetected.
 */
LABEL(nibble_ashr_4_restart_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $4, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
/* Second half of the unrolled iteration. */
add $16, %r10
jg LABEL(nibble_ashr_4_use_sse4_2)
movdqa (%rdi, %rdx), %xmm0
palignr $4, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_4_use_sse4_2)
/*
 * The next %rdi block starts a new page.  Before touching it, make sure
 * the not yet compared bytes of the previous block hold no NUL (and,
 * for strncmp, that the remaining count reaches past them).
 */
.p2align 4
LABEL(nibble_ashr_4_use_sse4_2):
sub $0x1000, %r10 /* re-arm the counter for the new page */
movdqa -16(%rdi, %rdx), %xmm0
psrldq $4, %xmm0 /* keep only the 12 not yet compared bytes */
pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first NUL */
#ifdef USE_AS_STRNCMP
cmp %r11, %rcx
jae LABEL(nibble_ashr_use_sse4_2_exit) /* count <= index of first NUL */
#endif
cmp $11, %ecx
ja LABEL(nibble_ashr_4_restart_use_sse4_2) /* no NUL: resume the loop */
jmp LABEL(nibble_ashr_use_sse4_2_exit)
/*
 * The following cases will be handled by ashr_5
 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
 *
 * Entered with %rsi/%rdi rounded down to 16 bytes, %ecx = start offset
 * of the %rsi string inside its block and %edx = 0xffff.  Each aligned
 * %rsi block is compared with 16 bytes re-assembled from two adjacent
 * %rdi blocks by palignr $5.
 */
.p2align 4
LABEL(ashr_5_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $11, %xmm2 /* shift first string to align with second */
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* packed sub of comparison results */
pmovmskb %xmm2, %r9d /* bit i set <=> byte i equal and not NUL */
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
sub %r9d, %edx
jnz LABEL(less32bytes_sse4_2) /* mismatch or null char seen */
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $5, %r9d /* byte position left over from less32bytes case */
/*
 * Setup %r10 value allows us to detect crossing a page boundary.
 * When %r10 goes positive we have crossed a page boundary and
 * need to do a nibble.
 */
lea 5(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop */
.p2align 4
LABEL(loop_ashr_5_use_sse4_2):
add $16, %r10
jg LABEL(nibble_ashr_5_use_sse4_2)
/*
 * The nibble check resumes here rather than at the loop head: %r10 has
 * already been advanced for this block, and adding 16 a second time
 * would let a later page boundary go undetected.
 */
LABEL(nibble_ashr_5_restart_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $5, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
/* Second half of the unrolled iteration. */
add $16, %r10
jg LABEL(nibble_ashr_5_use_sse4_2)
movdqa (%rdi, %rdx), %xmm0
palignr $5, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_5_use_sse4_2)
/*
 * The next %rdi block starts a new page.  Before touching it, make sure
 * the not yet compared bytes of the previous block hold no NUL (and,
 * for strncmp, that the remaining count reaches past them).
 */
.p2align 4
LABEL(nibble_ashr_5_use_sse4_2):
sub $0x1000, %r10 /* re-arm the counter for the new page */
movdqa -16(%rdi, %rdx), %xmm0
psrldq $5, %xmm0 /* keep only the 11 not yet compared bytes */
pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first NUL */
#ifdef USE_AS_STRNCMP
cmp %r11, %rcx
jae LABEL(nibble_ashr_use_sse4_2_exit) /* count <= index of first NUL */
#endif
cmp $10, %ecx
ja LABEL(nibble_ashr_5_restart_use_sse4_2) /* no NUL: resume the loop */
jmp LABEL(nibble_ashr_use_sse4_2_exit)
/*
 * The following cases will be handled by ashr_6
 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
 *
 * Entered with %rsi/%rdi rounded down to 16 bytes, %ecx = start offset
 * of the %rsi string inside its block and %edx = 0xffff.  Each aligned
 * %rsi block is compared with 16 bytes re-assembled from two adjacent
 * %rdi blocks by palignr $6.
 */
.p2align 4
LABEL(ashr_6_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $10, %xmm2 /* shift first string to align with second */
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* packed sub of comparison results */
pmovmskb %xmm2, %r9d /* bit i set <=> byte i equal and not NUL */
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
sub %r9d, %edx
jnz LABEL(less32bytes_sse4_2) /* mismatch or null char seen */
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $6, %r9d /* byte position left over from less32bytes case */
/*
 * Setup %r10 value allows us to detect crossing a page boundary.
 * When %r10 goes positive we have crossed a page boundary and
 * need to do a nibble.
 */
lea 6(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop */
.p2align 4
LABEL(loop_ashr_6_use_sse4_2):
add $16, %r10
jg LABEL(nibble_ashr_6_use_sse4_2)
/*
 * The nibble check resumes here rather than at the loop head: %r10 has
 * already been advanced for this block, and adding 16 a second time
 * would let a later page boundary go undetected.
 */
LABEL(nibble_ashr_6_restart_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $6, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
/* Second half of the unrolled iteration. */
add $16, %r10
jg LABEL(nibble_ashr_6_use_sse4_2)
movdqa (%rdi, %rdx), %xmm0
palignr $6, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_6_use_sse4_2)
/*
 * The next %rdi block starts a new page.  Before touching it, make sure
 * the not yet compared bytes of the previous block hold no NUL (and,
 * for strncmp, that the remaining count reaches past them).
 */
.p2align 4
LABEL(nibble_ashr_6_use_sse4_2):
sub $0x1000, %r10 /* re-arm the counter for the new page */
movdqa -16(%rdi, %rdx), %xmm0
psrldq $6, %xmm0 /* keep only the 10 not yet compared bytes */
pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first NUL */
#ifdef USE_AS_STRNCMP
cmp %r11, %rcx
jae LABEL(nibble_ashr_use_sse4_2_exit) /* count <= index of first NUL */
#endif
cmp $9, %ecx
ja LABEL(nibble_ashr_6_restart_use_sse4_2) /* no NUL: resume the loop */
jmp LABEL(nibble_ashr_use_sse4_2_exit)
/*
 * The following cases will be handled by ashr_7
 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
 *
 * Entered with %rsi/%rdi rounded down to 16 bytes, %ecx = start offset
 * of the %rsi string inside its block and %edx = 0xffff.  Each aligned
 * %rsi block is compared with 16 bytes re-assembled from two adjacent
 * %rdi blocks by palignr $7.
 */
.p2align 4
LABEL(ashr_7_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $9, %xmm2 /* shift first string to align with second */
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* packed sub of comparison results */
pmovmskb %xmm2, %r9d /* bit i set <=> byte i equal and not NUL */
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
sub %r9d, %edx
jnz LABEL(less32bytes_sse4_2) /* mismatch or null char seen */
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $7, %r9d /* byte position left over from less32bytes case */
/*
 * Setup %r10 value allows us to detect crossing a page boundary.
 * When %r10 goes positive we have crossed a page boundary and
 * need to do a nibble.
 */
lea 7(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop */
.p2align 4
LABEL(loop_ashr_7_use_sse4_2):
add $16, %r10
jg LABEL(nibble_ashr_7_use_sse4_2)
/*
 * The nibble check resumes here rather than at the loop head: %r10 has
 * already been advanced for this block, and adding 16 a second time
 * would let a later page boundary go undetected.
 */
LABEL(nibble_ashr_7_restart_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $7, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
/* Second half of the unrolled iteration. */
add $16, %r10
jg LABEL(nibble_ashr_7_use_sse4_2)
movdqa (%rdi, %rdx), %xmm0
palignr $7, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_7_use_sse4_2)
/*
 * The next %rdi block starts a new page.  Before touching it, make sure
 * the not yet compared bytes of the previous block hold no NUL (and,
 * for strncmp, that the remaining count reaches past them).
 */
.p2align 4
LABEL(nibble_ashr_7_use_sse4_2):
sub $0x1000, %r10 /* re-arm the counter for the new page */
movdqa -16(%rdi, %rdx), %xmm0
psrldq $7, %xmm0 /* keep only the 9 not yet compared bytes */
pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first NUL */
#ifdef USE_AS_STRNCMP
cmp %r11, %rcx
jae LABEL(nibble_ashr_use_sse4_2_exit) /* count <= index of first NUL */
#endif
cmp $8, %ecx
ja LABEL(nibble_ashr_7_restart_use_sse4_2) /* no NUL: resume the loop */
jmp LABEL(nibble_ashr_use_sse4_2_exit)
/*
 * The following cases will be handled by ashr_8
 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
 *
 * Entered with %rsi/%rdi rounded down to 16 bytes, %ecx = start offset
 * of the %rsi string inside its block and %edx = 0xffff.  Each aligned
 * %rsi block is compared with 16 bytes re-assembled from two adjacent
 * %rdi blocks by palignr $8.
 */
.p2align 4
LABEL(ashr_8_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $8, %xmm2 /* shift first string to align with second */
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* packed sub of comparison results */
pmovmskb %xmm2, %r9d /* bit i set <=> byte i equal and not NUL */
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
sub %r9d, %edx
jnz LABEL(less32bytes_sse4_2) /* mismatch or null char seen */
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $8, %r9d /* byte position left over from less32bytes case */
/*
 * Setup %r10 value allows us to detect crossing a page boundary.
 * When %r10 goes positive we have crossed a page boundary and
 * need to do a nibble.
 */
lea 8(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop */
.p2align 4
LABEL(loop_ashr_8_use_sse4_2):
add $16, %r10
jg LABEL(nibble_ashr_8_use_sse4_2)
/*
 * The nibble check resumes here rather than at the loop head: %r10 has
 * already been advanced for this block, and adding 16 a second time
 * would let a later page boundary go undetected.
 */
LABEL(nibble_ashr_8_restart_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $8, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
/* Second half of the unrolled iteration. */
add $16, %r10
jg LABEL(nibble_ashr_8_use_sse4_2)
movdqa (%rdi, %rdx), %xmm0
palignr $8, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_8_use_sse4_2)
/*
 * The next %rdi block starts a new page.  Before touching it, make sure
 * the not yet compared bytes of the previous block hold no NUL (and,
 * for strncmp, that the remaining count reaches past them).
 */
.p2align 4
LABEL(nibble_ashr_8_use_sse4_2):
sub $0x1000, %r10 /* re-arm the counter for the new page */
movdqa -16(%rdi, %rdx), %xmm0
psrldq $8, %xmm0 /* keep only the 8 not yet compared bytes */
pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first NUL */
#ifdef USE_AS_STRNCMP
cmp %r11, %rcx
jae LABEL(nibble_ashr_use_sse4_2_exit) /* count <= index of first NUL */
#endif
cmp $7, %ecx
ja LABEL(nibble_ashr_8_restart_use_sse4_2) /* no NUL: resume the loop */
jmp LABEL(nibble_ashr_use_sse4_2_exit)
/*
 * The following cases will be handled by ashr_9
 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
 *
 * Entered with %rsi/%rdi rounded down to 16 bytes, %ecx = start offset
 * of the %rsi string inside its block and %edx = 0xffff.  Each aligned
 * %rsi block is compared with 16 bytes re-assembled from two adjacent
 * %rdi blocks by palignr $9.
 */
.p2align 4
LABEL(ashr_9_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $7, %xmm2 /* shift first string to align with second */
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* packed sub of comparison results */
pmovmskb %xmm2, %r9d /* bit i set <=> byte i equal and not NUL */
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
sub %r9d, %edx
jnz LABEL(less32bytes_sse4_2) /* mismatch or null char seen */
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $9, %r9d /* byte position left over from less32bytes case */
/*
 * Setup %r10 value allows us to detect crossing a page boundary.
 * When %r10 goes positive we have crossed a page boundary and
 * need to do a nibble.
 */
lea 9(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop */
.p2align 4
LABEL(loop_ashr_9_use_sse4_2):
add $16, %r10
jg LABEL(nibble_ashr_9_use_sse4_2)
/*
 * The nibble check resumes here rather than at the loop head: %r10 has
 * already been advanced for this block, and adding 16 a second time
 * would let a later page boundary go undetected.
 */
LABEL(nibble_ashr_9_restart_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $9, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
/* Second half of the unrolled iteration. */
add $16, %r10
jg LABEL(nibble_ashr_9_use_sse4_2)
movdqa (%rdi, %rdx), %xmm0
palignr $9, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_9_use_sse4_2)
/*
 * The next %rdi block starts a new page.  Before touching it, make sure
 * the not yet compared bytes of the previous block hold no NUL (and,
 * for strncmp, that the remaining count reaches past them).
 */
.p2align 4
LABEL(nibble_ashr_9_use_sse4_2):
sub $0x1000, %r10 /* re-arm the counter for the new page */
movdqa -16(%rdi, %rdx), %xmm0
psrldq $9, %xmm0 /* keep only the 7 not yet compared bytes */
pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first NUL */
#ifdef USE_AS_STRNCMP
cmp %r11, %rcx
jae LABEL(nibble_ashr_use_sse4_2_exit) /* count <= index of first NUL */
#endif
cmp $6, %ecx
ja LABEL(nibble_ashr_9_restart_use_sse4_2) /* no NUL: resume the loop */
jmp LABEL(nibble_ashr_use_sse4_2_exit)
/*
 * The following cases will be handled by ashr_10
 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
 *
 * Entered with %rsi/%rdi rounded down to 16 bytes, %ecx = start offset
 * of the %rsi string inside its block and %edx = 0xffff.  Each aligned
 * %rsi block is compared with 16 bytes re-assembled from two adjacent
 * %rdi blocks by palignr $10.
 */
.p2align 4
LABEL(ashr_10_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $6, %xmm2 /* shift first string to align with second */
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* packed sub of comparison results */
pmovmskb %xmm2, %r9d /* bit i set <=> byte i equal and not NUL */
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
sub %r9d, %edx
jnz LABEL(less32bytes_sse4_2) /* mismatch or null char seen */
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $10, %r9d /* byte position left over from less32bytes case */
/*
 * Setup %r10 value allows us to detect crossing a page boundary.
 * When %r10 goes positive we have crossed a page boundary and
 * need to do a nibble.
 */
lea 10(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop */
.p2align 4
LABEL(loop_ashr_10_use_sse4_2):
add $16, %r10
jg LABEL(nibble_ashr_10_use_sse4_2)
/*
 * The nibble check resumes here rather than at the loop head: %r10 has
 * already been advanced for this block, and adding 16 a second time
 * would let a later page boundary go undetected.
 */
LABEL(nibble_ashr_10_restart_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $10, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
/* Second half of the unrolled iteration. */
add $16, %r10
jg LABEL(nibble_ashr_10_use_sse4_2)
movdqa (%rdi, %rdx), %xmm0
palignr $10, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_10_use_sse4_2)
/*
 * The next %rdi block starts a new page.  Before touching it, make sure
 * the not yet compared bytes of the previous block hold no NUL (and,
 * for strncmp, that the remaining count reaches past them).
 */
.p2align 4
LABEL(nibble_ashr_10_use_sse4_2):
sub $0x1000, %r10 /* re-arm the counter for the new page */
movdqa -16(%rdi, %rdx), %xmm0
psrldq $10, %xmm0 /* keep only the 6 not yet compared bytes */
pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first NUL */
#ifdef USE_AS_STRNCMP
cmp %r11, %rcx
jae LABEL(nibble_ashr_use_sse4_2_exit) /* count <= index of first NUL */
#endif
cmp $5, %ecx
ja LABEL(nibble_ashr_10_restart_use_sse4_2) /* no NUL: resume the loop */
jmp LABEL(nibble_ashr_use_sse4_2_exit)
/*
 * The following cases will be handled by ashr_11
 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
 *
 * Entered with %rsi/%rdi rounded down to 16 bytes, %ecx = start offset
 * of the %rsi string inside its block and %edx = 0xffff.  Each aligned
 * %rsi block is compared with 16 bytes re-assembled from two adjacent
 * %rdi blocks by palignr $11.
 */
.p2align 4
LABEL(ashr_11_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $5, %xmm2 /* shift first string to align with second */
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* packed sub of comparison results */
pmovmskb %xmm2, %r9d /* bit i set <=> byte i equal and not NUL */
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
sub %r9d, %edx
jnz LABEL(less32bytes_sse4_2) /* mismatch or null char seen */
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $11, %r9d /* byte position left over from less32bytes case */
/*
 * Setup %r10 value allows us to detect crossing a page boundary.
 * When %r10 goes positive we have crossed a page boundary and
 * need to do a nibble.
 */
lea 11(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop */
.p2align 4
LABEL(loop_ashr_11_use_sse4_2):
add $16, %r10
jg LABEL(nibble_ashr_11_use_sse4_2)
/*
 * The nibble check resumes here rather than at the loop head: %r10 has
 * already been advanced for this block, and adding 16 a second time
 * would let a later page boundary go undetected.
 */
LABEL(nibble_ashr_11_restart_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $11, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
/* Second half of the unrolled iteration. */
add $16, %r10
jg LABEL(nibble_ashr_11_use_sse4_2)
movdqa (%rdi, %rdx), %xmm0
palignr $11, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_11_use_sse4_2)
/*
 * The next %rdi block starts a new page.  Before touching it, make sure
 * the not yet compared bytes of the previous block hold no NUL (and,
 * for strncmp, that the remaining count reaches past them).
 */
.p2align 4
LABEL(nibble_ashr_11_use_sse4_2):
sub $0x1000, %r10 /* re-arm the counter for the new page */
movdqa -16(%rdi, %rdx), %xmm0
psrldq $11, %xmm0 /* keep only the 5 not yet compared bytes */
pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first NUL */
#ifdef USE_AS_STRNCMP
cmp %r11, %rcx
jae LABEL(nibble_ashr_use_sse4_2_exit) /* count <= index of first NUL */
#endif
cmp $4, %ecx
ja LABEL(nibble_ashr_11_restart_use_sse4_2) /* no NUL: resume the loop */
jmp LABEL(nibble_ashr_use_sse4_2_exit)
/*
 * The following cases will be handled by ashr_12:
 *   rcx (offset of rsi)  : n (4~15)
 *   rax (offset of rdi)  : n - 4
 *   relative offset      : 11 (15 + (n - 4) - n)
 *   corresponding case   : ashr_12
 */
.p2align 4
LABEL(ashr_12_sse4_2):
/*
 * Case ashr_12: compare the first 16-byte blocks of both strings, then
 * loop over the following blocks with the (%rdi) stream realigned by
 * 12 bytes using palignr.
 * NOTE(review): %rax, %rcx, %edx, %r8d and (for strncmp) %r11 are set
 * up before this chunk; their roles below are inferred from how they
 * are used here -- confirm against the dispatch code.
 */
/* %xmm0 = 0xff in every byte position where the (%rsi) block is NUL. */
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
/* Shift the (%rdi) block up by 4 (= 16 - 12) bytes. */
pslldq $4, %xmm2
/* TOLOWER is defined outside this chunk; presumably it lower-cases both
   operands in the strcasecmp build and expands to nothing otherwise. */
TOLOWER (%xmm1, %xmm2)
/* After psubb the top bit of a byte is set only where the two blocks are
   equal and the (%rsi) byte is not NUL; pmovmskb collects those bits. */
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
/* Drop the low %cl bits of both masks; any remaining difference means a
   mismatch or terminator inside this first block. */
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
jnz LABEL(less32bytes_sse4_2)
/* NOTE(review): %xmm3 is not read again in the visible part of this case. */
movdqa (%rdi), %xmm3
/* strncmp only (empty otherwise): %r11 += %rcx - 16; returns 0 through
   strcmp_exitz_sse4_2 if the counter wraps or reaches 0. Clobbers %r9. */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $12, %r9d /* byte position left over from less32bytes case */
/*
 * Set up %r10 so that crossing a page boundary can be detected:
 * %r10 = ((%rdi + 12) & 0xfff) - 0x1000 is negative here and grows by
 * 16 per block.  When %r10 goes positive we have crossed a page
 * boundary and need to do a nibble.
 */
lea 12(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
.p2align 4
LABEL(loop_ashr_12_use_sse4_2):
/* Main loop, unrolled twice; each half handles one 16-byte block at
   offset %rdx. */
add $16, %r10
jg LABEL(nibble_ashr_12_use_sse4_2)
/* %xmm0 = last 4 bytes of the previous (%rdi) block followed by the
   first 12 bytes of the current one. */
movdqa (%rdi, %rdx), %xmm0
palignr $12, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
/* pcmpistri sets CF when a byte differs (index in %ecx) and ZF when the
   (%rsi) block contains a NUL; leave the loop on either. */
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
/* 16 more bytes consumed; the strings compare equal if the count runs out. */
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
/* Second unrolled half: same steps for the next block. */
add $16, %r10
jg LABEL(nibble_ashr_12_use_sse4_2)
movdqa (%rdi, %rdx), %xmm0
palignr $12, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_12_use_sse4_2)
.p2align 4
LABEL(nibble_ashr_12_use_sse4_2):
/* Page-boundary check, entered when %r10 has gone positive. */
sub $0x1000, %r10 /* re-arm the page counter */
/* Keep only bytes 12..15 of the previous (%rdi) block, i.e. the 4 bytes
   that have not been compared yet; the rest of %xmm0 becomes zero. */
movdqa -16(%rdi, %rdx), %xmm0
psrldq $12, %xmm0
/* Mode 0x3a with both operands equal leaves in %ecx the index of the
   first NUL byte of %xmm0 (at most 4 here). */
pcmpistri $0x3a,%xmm0, %xmm0
#ifdef USE_AS_STRNCMP
/* Take the exit path when that index is not below the remaining count. */
cmp %r11, %rcx
jae LABEL(nibble_ashr_use_sse4_2_exit)
#endif
/* %ecx > 3: none of the 4 pending bytes is NUL, so resume the loop;
   otherwise finish through the shared nibble exit. */
cmp $3, %ecx
ja LABEL(loop_ashr_12_use_sse4_2)
jmp LABEL(nibble_ashr_use_sse4_2_exit)
/*
 * The following cases will be handled by ashr_13:
 *   rcx (offset of rsi)  : n (3~15)
 *   rax (offset of rdi)  : n - 3
 *   relative offset      : 12 (15 + (n - 3) - n)
 *   corresponding case   : ashr_13
 */
.p2align 4
LABEL(ashr_13_sse4_2):
/*
 * Case ashr_13: compare the first 16-byte blocks of both strings, then
 * loop over the following blocks with the (%rdi) stream realigned by
 * 13 bytes using palignr.
 * NOTE(review): %rax, %rcx, %edx, %r8d and (for strncmp) %r11 are set
 * up before this chunk; their roles below are inferred from how they
 * are used here -- confirm against the dispatch code.
 */
/* %xmm0 = 0xff in every byte position where the (%rsi) block is NUL. */
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
/* Shift the (%rdi) block up by 3 (= 16 - 13) bytes. */
pslldq $3, %xmm2
/* TOLOWER is defined outside this chunk; presumably it lower-cases both
   operands in the strcasecmp build and expands to nothing otherwise. */
TOLOWER (%xmm1, %xmm2)
/* After psubb the top bit of a byte is set only where the two blocks are
   equal and the (%rsi) byte is not NUL; pmovmskb collects those bits. */
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
/* Drop the low %cl bits of both masks; any remaining difference means a
   mismatch or terminator inside this first block. */
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
jnz LABEL(less32bytes_sse4_2)
/* NOTE(review): %xmm3 is not read again in the visible part of this case. */
movdqa (%rdi), %xmm3
/* strncmp only (empty otherwise): %r11 += %rcx - 16; returns 0 through
   strcmp_exitz_sse4_2 if the counter wraps or reaches 0. Clobbers %r9. */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $13, %r9d /* byte position left over from less32bytes case */
/*
 * Set up %r10 so that crossing a page boundary can be detected:
 * %r10 = ((%rdi + 13) & 0xfff) - 0x1000 is negative here and grows by
 * 16 per block.  When %r10 goes positive we have crossed a page
 * boundary and need to do a nibble.
 */
lea 13(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
.p2align 4
LABEL(loop_ashr_13_use_sse4_2):
/* Main loop, unrolled twice; each half handles one 16-byte block at
   offset %rdx. */
add $16, %r10
jg LABEL(nibble_ashr_13_use_sse4_2)
/* %xmm0 = last 3 bytes of the previous (%rdi) block followed by the
   first 13 bytes of the current one. */
movdqa (%rdi, %rdx), %xmm0
palignr $13, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
/* pcmpistri sets CF when a byte differs (index in %ecx) and ZF when the
   (%rsi) block contains a NUL; leave the loop on either. */
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
/* 16 more bytes consumed; the strings compare equal if the count runs out. */
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
/* Second unrolled half: same steps for the next block. */
add $16, %r10
jg LABEL(nibble_ashr_13_use_sse4_2)
movdqa (%rdi, %rdx), %xmm0
palignr $13, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_13_use_sse4_2)
.p2align 4
LABEL(nibble_ashr_13_use_sse4_2):
/* Page-boundary check, entered when %r10 has gone positive. */
sub $0x1000, %r10 /* re-arm the page counter */
/* Keep only bytes 13..15 of the previous (%rdi) block, i.e. the 3 bytes
   that have not been compared yet; the rest of %xmm0 becomes zero. */
movdqa -16(%rdi, %rdx), %xmm0
psrldq $13, %xmm0
/* Mode 0x3a with both operands equal leaves in %ecx the index of the
   first NUL byte of %xmm0 (at most 3 here). */
pcmpistri $0x3a,%xmm0, %xmm0
#ifdef USE_AS_STRNCMP
/* Take the exit path when that index is not below the remaining count. */
cmp %r11, %rcx
jae LABEL(nibble_ashr_use_sse4_2_exit)
#endif
/* %ecx > 2: none of the 3 pending bytes is NUL, so resume the loop;
   otherwise finish through the shared nibble exit. */
cmp $2, %ecx
ja LABEL(loop_ashr_13_use_sse4_2)
jmp LABEL(nibble_ashr_use_sse4_2_exit)
/*
 * The following cases will be handled by ashr_14:
 *   rcx (offset of rsi)  : n (2~15)
 *   rax (offset of rdi)  : n - 2
 *   relative offset      : 13 (15 + (n - 2) - n)
 *   corresponding case   : ashr_14
 */
.p2align 4
LABEL(ashr_14_sse4_2):
/*
 * Case ashr_14: compare the first 16-byte blocks of both strings, then
 * loop over the following blocks with the (%rdi) stream realigned by
 * 14 bytes using palignr.
 * NOTE(review): %rax, %rcx, %edx, %r8d and (for strncmp) %r11 are set
 * up before this chunk; their roles below are inferred from how they
 * are used here -- confirm against the dispatch code.
 */
/* %xmm0 = 0xff in every byte position where the (%rsi) block is NUL. */
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
/* Shift the (%rdi) block up by 2 (= 16 - 14) bytes. */
pslldq $2, %xmm2
/* TOLOWER is defined outside this chunk; presumably it lower-cases both
   operands in the strcasecmp build and expands to nothing otherwise. */
TOLOWER (%xmm1, %xmm2)
/* After psubb the top bit of a byte is set only where the two blocks are
   equal and the (%rsi) byte is not NUL; pmovmskb collects those bits. */
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
/* Drop the low %cl bits of both masks; any remaining difference means a
   mismatch or terminator inside this first block. */
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
jnz LABEL(less32bytes_sse4_2)
/* NOTE(review): %xmm3 is not read again in the visible part of this case. */
movdqa (%rdi), %xmm3
/* strncmp only (empty otherwise): %r11 += %rcx - 16; returns 0 through
   strcmp_exitz_sse4_2 if the counter wraps or reaches 0. Clobbers %r9. */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $14, %r9d /* byte position left over from less32bytes case */
/*
 * Set up %r10 so that crossing a page boundary can be detected:
 * %r10 = ((%rdi + 14) & 0xfff) - 0x1000 is negative here and grows by
 * 16 per block.  When %r10 goes positive we have crossed a page
 * boundary and need to do a nibble.
 */
lea 14(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
.p2align 4
LABEL(loop_ashr_14_use_sse4_2):
/* Main loop, unrolled twice; each half handles one 16-byte block at
   offset %rdx. */
add $16, %r10
jg LABEL(nibble_ashr_14_use_sse4_2)
/* %xmm0 = last 2 bytes of the previous (%rdi) block followed by the
   first 14 bytes of the current one. */
movdqa (%rdi, %rdx), %xmm0
palignr $14, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
/* pcmpistri sets CF when a byte differs (index in %ecx) and ZF when the
   (%rsi) block contains a NUL; leave the loop on either. */
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
/* 16 more bytes consumed; the strings compare equal if the count runs out. */
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
/* Second unrolled half: same steps for the next block. */
add $16, %r10
jg LABEL(nibble_ashr_14_use_sse4_2)
movdqa (%rdi, %rdx), %xmm0
palignr $14, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_14_use_sse4_2)
.p2align 4
LABEL(nibble_ashr_14_use_sse4_2):
/* Page-boundary check, entered when %r10 has gone positive. */
sub $0x1000, %r10 /* re-arm the page counter */
/* Keep only bytes 14..15 of the previous (%rdi) block, i.e. the 2 bytes
   that have not been compared yet; the rest of %xmm0 becomes zero. */
movdqa -16(%rdi, %rdx), %xmm0
psrldq $14, %xmm0
/* Mode 0x3a with both operands equal leaves in %ecx the index of the
   first NUL byte of %xmm0 (at most 2 here). */
pcmpistri $0x3a,%xmm0, %xmm0
#ifdef USE_AS_STRNCMP
/* Take the exit path when that index is not below the remaining count. */
cmp %r11, %rcx
jae LABEL(nibble_ashr_use_sse4_2_exit)
#endif
/* %ecx > 1: neither of the 2 pending bytes is NUL, so resume the loop;
   otherwise finish through the shared nibble exit. */
cmp $1, %ecx
ja LABEL(loop_ashr_14_use_sse4_2)
jmp LABEL(nibble_ashr_use_sse4_2_exit)
/*
 * The following cases will be handled by ashr_15:
 *   rcx (offset of rsi)  : n (1~15)
 *   rax (offset of rdi)  : n - 1
 *   relative offset      : 14 (15 + (n - 1) - n)
 *   corresponding case   : ashr_15
 */
.p2align 4
LABEL(ashr_15_sse4_2):
/*
 * Case ashr_15: compare the first 16-byte blocks of both strings, then
 * loop over the following blocks with the (%rdi) stream realigned by
 * 15 bytes using palignr.
 * NOTE(review): %rax, %rcx, %edx, %r8d and (for strncmp) %r11 are set
 * up before this chunk; their roles below are inferred from how they
 * are used here -- confirm against the dispatch code.
 */
/* %xmm0 = 0xff in every byte position where the (%rsi) block is NUL. */
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
/* Shift the (%rdi) block up by 1 (= 16 - 15) byte. */
pslldq $1, %xmm2
/* TOLOWER is defined outside this chunk; presumably it lower-cases both
   operands in the strcasecmp build and expands to nothing otherwise. */
TOLOWER (%xmm1, %xmm2)
/* After psubb the top bit of a byte is set only where the two blocks are
   equal and the (%rsi) byte is not NUL; pmovmskb collects those bits. */
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
/* Drop the low %cl bits of both masks; any remaining difference means a
   mismatch or terminator inside this first block. */
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
jnz LABEL(less32bytes_sse4_2)
/* NOTE(review): %xmm3 is not read again in the visible part of this case. */
movdqa (%rdi), %xmm3
/* strncmp only (empty otherwise): %r11 += %rcx - 16; returns 0 through
   strcmp_exitz_sse4_2 if the counter wraps or reaches 0. Clobbers %r9. */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $15, %r9d /* byte position left over from less32bytes case */
/*
 * Set up %r10 so that crossing a page boundary can be detected:
 * %r10 = ((%rdi + 15) & 0xfff) - 0x1000 is negative here and grows by
 * 16 per block.  When %r10 goes positive we have crossed a page
 * boundary and need to do a nibble.
 */
lea 15(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
.p2align 4
LABEL(loop_ashr_15_use_sse4_2):
/* Main loop, unrolled twice; each half handles one 16-byte block at
   offset %rdx. */
add $16, %r10
jg LABEL(nibble_ashr_15_use_sse4_2)
/* %xmm0 = last byte of the previous (%rdi) block followed by the
   first 15 bytes of the current one. */
movdqa (%rdi, %rdx), %xmm0
palignr $15, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
/* pcmpistri sets CF when a byte differs (index in %ecx) and ZF when the
   (%rsi) block contains a NUL; leave the loop on either. */
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
/* 16 more bytes consumed; the strings compare equal if the count runs out. */
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
/* Second unrolled half: same steps for the next block. */
add $16, %r10
jg LABEL(nibble_ashr_15_use_sse4_2)
movdqa (%rdi, %rdx), %xmm0
palignr $15, -16(%rdi, %rdx), %xmm0
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_15_use_sse4_2)
.p2align 4
LABEL(nibble_ashr_15_use_sse4_2):
/* Page-boundary check, entered when %r10 has gone positive. */
sub $0x1000, %r10 /* re-arm the page counter */
/* Keep only byte 15 of the previous (%rdi) block, i.e. the single byte
   that has not been compared yet; the rest of %xmm0 becomes zero. */
movdqa -16(%rdi, %rdx), %xmm0
psrldq $15, %xmm0
/* Mode 0x3a with both operands equal leaves in %ecx the index of the
   first NUL byte of %xmm0 (at most 1 here). */
pcmpistri $0x3a,%xmm0, %xmm0
#ifdef USE_AS_STRNCMP
/* Take the exit path when that index is not below the remaining count. */
cmp %r11, %rcx
jae LABEL(nibble_ashr_use_sse4_2_exit)
#endif
/* %ecx > 0: the pending byte is not NUL, so resume the loop; otherwise
   fall through into nibble_ashr_use_sse4_2_exit, which follows directly. */
cmp $0, %ecx
ja LABEL(loop_ashr_15_use_sse4_2)
LABEL(nibble_ashr_use_sse4_2_exit):
/*
 * Shared tail of the nibble_ashr_N page-boundary checks.  On entry
 * %xmm0 holds the not-yet-compared bytes of the (%rdi) stream, shifted
 * down to byte 0 and zero-filled above.  Compare them with the current
 * (%rsi) block.
 *
 * The strcasecmp build must lower-case both operands here exactly as
 * the main loops do.  Comparing the raw bytes would report a byte that
 * differs only in case as the mismatch; the table-based subtraction in
 * use_sse4_2_ret_sse4_2 would then yield 0 and the function would
 * return "equal" even when the strings differ further on.
 */
#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rsi,%rdx), %xmm1
TOLOWER (%xmm0, %xmm1)
pcmpistri $0x1a, %xmm1, %xmm0
#endif
.p2align 4
LABEL(use_sse4_2_exit):
/*
 * Common exit of the pcmpistri loops.  Flags and %ecx come from the
 * last pcmpistri $0x1a: CF set means a byte differs at index %ecx.
 * No carry means the terminator was reached without a difference.
 */
jnc LABEL(strcmp_exitz_sse4_2)
#ifdef USE_AS_STRNCMP
/* The difference lies at or beyond the remaining count: equal. */
sub %rcx, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
/* %rdx = offset of the differing byte in the (%rsi) stream. */
add %rcx, %rdx
/* Undo the palignr realignment: the (%rdi) stream runs %r9 - 16 bytes
   ahead of the block offset used in the loops. */
lea -16(%rdi, %r9), %rdi
movzbl (%rdi, %rdx), %eax
movzbl (%rsi, %rdx), %edx
/* A non-zero %r8d appears to mark that the two operands were swapped
   before this chunk (it is set outside it); swap the bytes back so the
   sign of the result matches the caller's argument order. */
test %r8d, %r8d
jz LABEL(use_sse4_2_ret_sse4_2)
xchg %eax, %edx
LABEL(use_sse4_2_ret_sse4_2):
# ifdef USE_AS_STRCASECMP_L
/* Map both bytes through the tolower table before subtracting. */
leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
movl (%rcx,%rdx,4), %edx
movl (%rcx,%rax,4), %eax
# endif
sub %edx, %eax
ret
LABEL(less32bytes_sse4_2):
/*
 * Exit used when the difference or terminator lies in the first block
 * pair examined by an ashr_N prologue.  On entry %edx holds a non-zero
 * bit mask computed there; the lowest set bit is taken below as the
 * byte offset of interest.
 * NOTE(review): %rax and %rcx are presumably the offsets of the two
 * strings within their aligned blocks, set before this chunk.
 */
lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
test %r8d, %r8d
jz LABEL(ret_sse4_2)
xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
.p2align 4
LABEL(ret_sse4_2):
LABEL(less16bytes_sse4_2):
/* Both labels name the same return sequence: %rdi / %rsi point at the
   strings and %rdx holds the bit mask. */
bsf %rdx, %rdx /* find and store bit index in %rdx */
#ifdef USE_AS_STRNCMP
/* The byte lies at or beyond the remaining count: strings are equal. */
sub %rdx, %r11
jbe LABEL(strcmp_exitz_sse4_2)
#endif
movzbl (%rsi, %rdx), %ecx
movzbl (%rdi, %rdx), %eax
# ifdef USE_AS_STRCASECMP_L
/* Map both bytes through the tolower table before subtracting. */
leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
movl (%rdx,%rcx,4), %ecx
movl (%rdx,%rax,4), %eax
# endif
/* Result = (%rdi) byte - (%rsi) byte. */
sub %ecx, %eax
ret
LABEL(strcmp_exitz_sse4_2):
/* Return 0: the strings compare equal (or the strncmp count ran out). */
xor %eax, %eax
ret
.p2align 4
// XXX Same as code above
LABEL(Byte0_sse4_2):
/*
 * Return the difference of the very first bytes: (%rdi)[0] - (%rsi)[0].
 * In the strcasecmp build both bytes are first mapped through the
 * tolower table.
 */
movzbl (%rsi), %ecx
movzbl (%rdi), %eax
# ifdef USE_AS_STRCASECMP_L
leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
/* The two look-ups are independent of each other. */
movl (%rdx,%rax,4), %eax
movl (%rdx,%rcx,4), %ecx
# endif
sub %ecx, %eax
ret
/* End of the SSE4.2 implementation: close its CFI region and record the
   size of the STRCMP_SSE42 symbol. */
cfi_endproc
.size STRCMP_SSE42, .-STRCMP_SSE42
/* Remove the helper macros (defined earlier in the file) so they are
   not visible to the code that follows. */
# undef UCLOW_reg
# undef UCHIGH_reg
# undef LCQWORD_reg
# undef TOLOWER
/* Put all SSE 4.2 functions together. */
/*
 * Dispatch table for the ashr_N cases.  Each entry is a 32-bit offset
 * of an entry point relative to the start of the table, so it needs no
 * absolute relocations.  Entries 0..14 are ashr_1..ashr_15; entry 15
 * is ashr_0.  The code that indexes the table is outside this chunk.
 */
.section .rodata.sse4.2,"a",@progbits
.p2align 3
LABEL(unaligned_table_sse4_2):
.int LABEL(ashr_1_sse4_2) - LABEL(unaligned_table_sse4_2)
.int LABEL(ashr_2_sse4_2) - LABEL(unaligned_table_sse4_2)
.int LABEL(ashr_3_sse4_2) - LABEL(unaligned_table_sse4_2)
.int LABEL(ashr_4_sse4_2) - LABEL(unaligned_table_sse4_2)
.int LABEL(ashr_5_sse4_2) - LABEL(unaligned_table_sse4_2)
.int LABEL(ashr_6_sse4_2) - LABEL(unaligned_table_sse4_2)
.int LABEL(ashr_7_sse4_2) - LABEL(unaligned_table_sse4_2)
.int LABEL(ashr_8_sse4_2) - LABEL(unaligned_table_sse4_2)
.int LABEL(ashr_9_sse4_2) - LABEL(unaligned_table_sse4_2)
.int LABEL(ashr_10_sse4_2) - LABEL(unaligned_table_sse4_2)
.int LABEL(ashr_11_sse4_2) - LABEL(unaligned_table_sse4_2)
.int LABEL(ashr_12_sse4_2) - LABEL(unaligned_table_sse4_2)
.int LABEL(ashr_13_sse4_2) - LABEL(unaligned_table_sse4_2)
.int LABEL(ashr_14_sse4_2) - LABEL(unaligned_table_sse4_2)
.int LABEL(ashr_15_sse4_2) - LABEL(unaligned_table_sse4_2)
.int LABEL(ashr_0_sse4_2) - LABEL(unaligned_table_sse4_2)
/* Redefine ENTRY so that it opens the function STRCMP_SSE2 whatever
   name is passed: 16-byte aligned, typed as a function, with CFI
   started and the profiling hook CALL_MCOUNT.  No .globl is emitted.
   Presumably this renames the generic implementation included at the
   end of this file -- confirm against ../strcmp.S. */
# undef ENTRY
# define ENTRY(name) \
.type STRCMP_SSE2, @function; \
.align 16; \
STRCMP_SSE2: cfi_startproc; \
CALL_MCOUNT
/* Matching END: closes the CFI region and sets the size of STRCMP_SSE2;
   the name argument is ignored. */
# undef END
# define END(name) \
cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2
/* strcasecmp_l build only: ENTRY2/END2 open and close a second function
   under the fixed name __strcasecmp_sse2, in the same way ENTRY/END do
   for STRCMP_SSE2.  The name argument is ignored. */
# ifdef USE_AS_STRCASECMP_L
# define ENTRY2(name) \
.type __strcasecmp_sse2, @function; \
.align 16; \
__strcasecmp_sse2: cfi_startproc; \
CALL_MCOUNT
# define END2(name) \
cfi_endproc; .size __strcasecmp_sse2, .-__strcasecmp_sse2
# endif
# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal strcmp calls through a PLT.
The speedup we get from using SSE4.2 instruction is likely eaten away
by the indirect call in the PLT.
Instead, make the internal alias __GI_STRCMP a global symbol equal to
STRCMP_SSE2; the name argument is ignored. */
# define libc_hidden_builtin_def(name) \
.globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2
#endif
#include "../strcmp.S"