From d561fbb041fe6aa205f652aecefe4bb84fd124a5 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Tue, 12 Jul 2022 12:28:06 -0700 Subject: [PATCH] x86: Move strcmp SSE2 implementation to multiarch/strcmp-sse2.S This commit doesn't affect libc.so.6, it's just housekeeping to prepare for adding explicit ISA level support. Because strcmp-sse2.S implements so many functions (more from avx2/evex/sse42) add a new file 'strcmp-naming.h' to assist in getting the correct symbol name for all the functions across multiarch/non-multiarch builds. Tested build on x86_64 and x86_32 with/without multiarch. --- sysdeps/x86_64/multiarch/rtld-strcmp.S | 18 + sysdeps/x86_64/multiarch/rtld-strncmp.S | 18 + sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S | 5 +- sysdeps/x86_64/multiarch/strcmp-naming.h | 68 + sysdeps/x86_64/multiarch/strcmp-sse2.S | 2140 ++++++++++++++++- sysdeps/x86_64/multiarch/strncase_l-sse2.S | 5 +- sysdeps/x86_64/multiarch/strncmp-sse2.S | 12 +- sysdeps/x86_64/strcasecmp_l.S | 11 +- sysdeps/x86_64/strcmp.S | 2147 +----------------- sysdeps/x86_64/strncase_l.S | 11 +- sysdeps/x86_64/strncmp.S | 7 +- 11 files changed, 2264 insertions(+), 2178 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/rtld-strcmp.S create mode 100644 sysdeps/x86_64/multiarch/rtld-strncmp.S create mode 100644 sysdeps/x86_64/multiarch/strcmp-naming.h diff --git a/sysdeps/x86_64/multiarch/rtld-strcmp.S b/sysdeps/x86_64/multiarch/rtld-strcmp.S new file mode 100644 index 0000000000..207078bdcc --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-strcmp.S @@ -0,0 +1,18 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "../strcmp.S" diff --git a/sysdeps/x86_64/multiarch/rtld-strncmp.S b/sysdeps/x86_64/multiarch/rtld-strncmp.S new file mode 100644 index 0000000000..ac32150406 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-strncmp.S @@ -0,0 +1,18 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "../strncmp.S" diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S index 2360d104dd..a2b5741399 100644 --- a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S @@ -16,8 +16,5 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
*/ -#define STRCMP __strcasecmp_l_sse2 #define USE_AS_STRCASECMP_L -#define NO_NOLOCALE_ALIAS -#define __strcasecmp __strcasecmp_sse2 -#include +#include "strcmp-sse2.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-naming.h b/sysdeps/x86_64/multiarch/strcmp-naming.h new file mode 100644 index 0000000000..6a7529b6a4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-naming.h @@ -0,0 +1,68 @@ +#ifndef _STRCMP_NAMING_H_ +#define _STRCMP_NAMING_H_ + +/* Utility macros. */ +#define STRCMP_SUFFIX(x, y) x##y +#define STRCMP_NAME(x, y) STRCMP_SUFFIX (x, y) + +/* Setup base of all definitions. */ +#define STRNCASECMP_BASE __strncasecmp +#define STRCASECMP_BASE __strcasecmp +#define WCSCMP_BASE __wcscmp + +#if defined USE_MULTIARCH && IS_IN (libc) +# define WCSNCMP_BASE __wcsncmp +# define STRNCMP_BASE __strncmp +# define STRCMP_BASE __strcmp + +#else +/* Covers IS_IN (rtld) or non-multiarch build. */ +# define WCSNCMP_BASE wcsncmp +# define STRNCMP_BASE strncmp +# define STRCMP_BASE strcmp + +# undef STRCMP_ISA +# define STRCMP_ISA +#endif + +#if IS_IN (rtld) || defined USE_MULTIARCH +# define ISA_HIDDEN_JUMPTARGET(...) __VA_ARGS__ +#else +# define ISA_HIDDEN_JUMPTARGET(...) HIDDEN_JUMPTARGET (__VA_ARGS__) +#endif + +/* Get correct symbol for OVERFLOW_STRCMP, STRCMP, and + STRCASECMP. 
*/ +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + +# if defined USE_AS_WCSCMP || defined USE_AS_WCSNCMP +# define OVERFLOW_STRCMP_SYM WCSCMP_BASE +# define STRCMP_SYM WCSNCMP_BASE +# elif defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define OVERFLOW_STRCMP_SYM STRCMP_NAME (STRCASECMP_BASE, _l) +# define STRCMP_SYM STRCMP_NAME (STRNCASECMP_BASE, _l) +# else +# define OVERFLOW_STRCMP_SYM STRCMP_BASE +# define STRCMP_SYM STRNCMP_BASE +# endif + +# define STRCASECMP_SYM STRNCASECMP_BASE +# define OVERFLOW_STRCMP \ + ISA_HIDDEN_JUMPTARGET (STRCMP_NAME (OVERFLOW_STRCMP_SYM, STRCMP_ISA)) +#else +# ifdef USE_AS_WCSCMP +# define STRCMP_SYM WCSCMP_BASE +# elif defined USE_AS_STRCASECMP_L +# define STRCMP_SYM STRCMP_NAME (STRCASECMP_BASE, _l) +# else +# define STRCMP_SYM STRCMP_BASE +# endif + +# define STRCASECMP_SYM STRCASECMP_BASE +#endif + +#define STRCASECMP_L_NONASCII STRCMP_NAME (STRCASECMP_SYM, _l_nonascii) +#define STRCASECMP STRCMP_NAME (STRCASECMP_SYM, STRCMP_ISA) +#define STRCMP STRCMP_NAME (STRCMP_SYM, STRCMP_ISA) + +#endif diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2.S b/sysdeps/x86_64/multiarch/strcmp-sse2.S index b8f95e59cf..b1220231ab 100644 --- a/sysdeps/x86_64/multiarch/strcmp-sse2.S +++ b/sysdeps/x86_64/multiarch/strcmp-sse2.S @@ -16,13 +16,2141 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#if IS_IN (libc) +#if IS_IN (libc) || IS_IN (rtld) + +# define STRCMP_ISA _sse2 +# include "strcmp-naming.h" + # include <sysdep.h> -# define STRCMP __strcmp_sse2 +# undef UPDATE_STRNCMP_COUNTER -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(strcmp) +# ifndef LABEL +# define LABEL(l) L(l) +# endif + +# ifdef USE_AS_STRNCMP +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. 
*/ +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 + +# elif defined USE_AS_STRCASECMP_L +# include "locale-defines.h" + +# define UPDATE_STRNCMP_COUNTER +# elif defined USE_AS_STRNCASECMP_L +# include "locale-defines.h" + +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 +# else +# define UPDATE_STRNCMP_COUNTER +# endif + + .text +# ifdef USE_AS_STRCASECMP_L +# ifndef ENTRY2 +# define ENTRY2(name) ENTRY (name) +# define END2(name) END (name) +# endif + +ENTRY2 (STRCASECMP) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ + .p2align 4 +END2 (STRCASECMP) + /* FALLTHROUGH to strcasecmp_l. */ +# elif defined USE_AS_STRNCASECMP_L +# ifndef ENTRY2 +# define ENTRY2(name) ENTRY (name) +# define END2(name) END (name) +# endif + +ENTRY2 (STRCASECMP) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ + .p2align 4 +END2 (STRCASECMP) + /* FALLTHROUGH to strncasecmp_l. */ +# endif + +ENTRY (STRCMP) +# ifdef USE_AS_STRCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. */ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP +# else + mov (%rdx), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strcasecmp_l_nonascii +# elif defined USE_AS_STRNCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. 
*/ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP +# else + mov (%rcx), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strncasecmp_l_nonascii +# endif + +/* + * This implementation uses SSE to compare up to 16 bytes at a time. + */ +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + test %RDX_LP, %RDX_LP + je LABEL(strcmp_exitz) + cmp $1, %RDX_LP + je LABEL(Byte0) + mov %RDX_LP, %R11_LP +# endif + mov %esi, %ecx + mov %edi, %eax +/* Use 64bit AND here to avoid long NOP padding. */ + and $0x3f, %rcx /* rsi alignment in cache line */ + and $0x3f, %rax /* rdi alignment in cache line */ +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +.Llcase_min: + .quad 0x3f3f3f3f3f3f3f3f + .quad 0x3f3f3f3f3f3f3f3f +.Llcase_max: + .quad 0x9999999999999999 + .quad 0x9999999999999999 +.Lcase_add: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + movdqa .Llcase_min(%rip), %xmm5 +# define LCASE_MIN_reg %xmm5 + movdqa .Llcase_max(%rip), %xmm6 +# define LCASE_MAX_reg %xmm6 + movdqa .Lcase_add(%rip), %xmm7 +# define CASE_ADD_reg %xmm7 +# endif + cmp $0x30, %ecx + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ + cmp $0x30, %eax + ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ + movlpd (%rdi), %xmm1 + movlpd (%rsi), %xmm2 + movhpd 8(%rdi), %xmm1 + movhpd 8(%rsi), %xmm2 +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define TOLOWER(reg1, reg2) \ + movdqa LCASE_MIN_reg, %xmm8; \ + movdqa LCASE_MIN_reg, %xmm9; \ + paddb reg1, %xmm8; \ + paddb reg2, %xmm9; \ + pcmpgtb LCASE_MAX_reg, %xmm8; \ + pcmpgtb LCASE_MAX_reg, %xmm9; \ + pandn CASE_ADD_reg, %xmm8; \ + pandn CASE_ADD_reg, %xmm9; \ + paddb %xmm8, reg1; \ + paddb %xmm9, reg2 + TOLOWER (%xmm1, %xmm2) +# else +# define TOLOWER(reg1, reg2) +# endif + pxor %xmm0, %xmm0 /* clear 
%xmm0 for null char checks */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes) /* If not, find different value or null char */ +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) /* finish comparision */ +# endif + add $16, %rsi /* prepare to search next 16 bytes */ + add $16, %rdi /* prepare to search next 16 bytes */ + + /* + * Determine source and destination string offsets from 16-byte alignment. + * Use relative offset difference between the two to determine which case + * below to use. + */ + .p2align 4 +LABEL(crosscache): + and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ + and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ + mov $0xffff, %edx /* for equivalent offset */ + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) + mov %edx, %r8d /* r8d is offset flag for exit tail */ + xchg %ecx, %eax + xchg %rsi, %rdi +LABEL(bigger): + lea 15(%rax), %r9 + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 + lea (%r10, %r9), %r10 + _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ + +/* + * The following cases will be handled by ashr_0 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +LABEL(ashr_0): + + movdqa (%rsi), %xmm1 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ +# else + movdqa (%rdi), %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ +# endif + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + /* + * edx must be the same with r9d if in left byte (16-rcx) is equal to + * the start from (16-rax) and no null char was seen. + */ + jne LABEL(less32bytes) /* mismatch or null char */ + UPDATE_STRNCMP_COUNTER + mov $16, %rcx + mov $16, %r9 + pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ + + /* + * Now both strings are aligned at 16-byte boundary. Loop over strings + * checking 32-bytes per iteration. + */ + .p2align 4 +LABEL(loop_ashr_0): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) /* mismatch or null char seen */ + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rcx + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rcx + jmp LABEL(loop_ashr_0) + +/* + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +LABEL(ashr_1): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ + pslldq $15, %xmm2 /* shift first string to align with second */ + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + jnz LABEL(less32bytes) /* mismatch or null char seen */ + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads*/ + mov $1, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 1(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_1): + add $16, %r10 + jg LABEL(nibble_ashr_1) /* cross page boundary */ + +LABEL(gobble_ashr_1): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + + psrldq $1, %xmm3 + pslldq $15, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_1) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + + psrldq $1, %xmm3 + pslldq $15, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, 
%r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_1) + + /* + * Nibble avoids loads across page boundary. This is to avoid a potential + * access into unmapped memory. + */ + .p2align 4 +LABEL(nibble_ashr_1): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/ + pmovmskb %xmm0, %edx + test $0xfffe, %edx + jnz LABEL(ashr_1_exittail) /* find null char*/ + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $15, %r11 + jbe LABEL(ashr_1_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* substract 4K from %r10 */ + jmp LABEL(gobble_ashr_1) + + /* + * Once find null char, determine if there is a string mismatch + * before the null char. + */ + .p2align 4 +LABEL(ashr_1_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $1, %xmm0 + psrldq $1, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +LABEL(ashr_2): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $14, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $2, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 2(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_2): + add $16, %r10 + jg LABEL(nibble_ashr_2) + +LABEL(gobble_ashr_2): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $2, %xmm3 + pslldq $14, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_2) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $2, %xmm3 + pslldq $14, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_2) + + .p2align 4 +LABEL(nibble_ashr_2): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfffc, %edx + jnz LABEL(ashr_2_exittail) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $14, %r11 + jbe LABEL(ashr_2_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_2) + + .p2align 4 +LABEL(ashr_2_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $2, %xmm0 + psrldq $2, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +LABEL(ashr_3): + pxor %xmm0, 
%xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $13, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $3, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 3(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_3): + add $16, %r10 + jg LABEL(nibble_ashr_3) + +LABEL(gobble_ashr_3): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $3, %xmm3 + pslldq $13, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_3) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $3, %xmm3 + pslldq $13, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_3) + + .p2align 4 +LABEL(nibble_ashr_3): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, 
%edx + test $0xfff8, %edx + jnz LABEL(ashr_3_exittail) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $13, %r11 + jbe LABEL(ashr_3_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_3) + + .p2align 4 +LABEL(ashr_3_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $3, %xmm0 + psrldq $3, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +LABEL(ashr_4): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $12, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $4, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 4(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_4): + add $16, %r10 + jg LABEL(nibble_ashr_4) + +LABEL(gobble_ashr_4): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $4, %xmm3 + pslldq $12, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_4) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $4, %xmm3 + pslldq $12, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_4) + + .p2align 4 +LABEL(nibble_ashr_4): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfff0, %edx + jnz LABEL(ashr_4_exittail) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $12, %r11 + jbe LABEL(ashr_4_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_4) + + .p2align 4 +LABEL(ashr_4_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $4, %xmm0 + psrldq $4, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +LABEL(ashr_5): + pxor %xmm0, 
%xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $11, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $5, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 5(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_5): + add $16, %r10 + jg LABEL(nibble_ashr_5) + +LABEL(gobble_ashr_5): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $5, %xmm3 + pslldq $11, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_5) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $5, %xmm3 + pslldq $11, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_5) + + .p2align 4 +LABEL(nibble_ashr_5): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, 
%edx + test $0xffe0, %edx + jnz LABEL(ashr_5_exittail) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $11, %r11 + jbe LABEL(ashr_5_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_5) + + .p2align 4 +LABEL(ashr_5_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $5, %xmm0 + psrldq $5, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 + */ + .p2align 4 +LABEL(ashr_6): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $10, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $6, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 6(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_6): + add $16, %r10 + jg LABEL(nibble_ashr_6) + +LABEL(gobble_ashr_6): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $6, %xmm3 + pslldq $10, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_6) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $6, %xmm3 + pslldq $10, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_6) + + .p2align 4 +LABEL(nibble_ashr_6): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xffc0, %edx + jnz LABEL(ashr_6_exittail) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $10, %r11 + jbe LABEL(ashr_6_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_6) + + .p2align 4 +LABEL(ashr_6_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $6, %xmm0 + psrldq $6, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 + */ + .p2align 4 +LABEL(ashr_7): + pxor %xmm0, 
%xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $9, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $7, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 7(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_7): + add $16, %r10 + jg LABEL(nibble_ashr_7) + +LABEL(gobble_ashr_7): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $7, %xmm3 + pslldq $9, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_7) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $7, %xmm3 + pslldq $9, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_7) + + .p2align 4 +LABEL(nibble_ashr_7): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx 
+ test $0xff80, %edx + jnz LABEL(ashr_7_exittail) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $9, %r11 + jbe LABEL(ashr_7_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_7) + + .p2align 4 +LABEL(ashr_7_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $7, %xmm0 + psrldq $7, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 + */ + .p2align 4 +LABEL(ashr_8): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $8, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $8, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 8(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_8): + add $16, %r10 + jg LABEL(nibble_ashr_8) + +LABEL(gobble_ashr_8): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $8, %xmm3 + pslldq $8, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_8) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $8, %xmm3 + pslldq $8, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_8) + + .p2align 4 +LABEL(nibble_ashr_8): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xff00, %edx + jnz LABEL(ashr_8_exittail) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $8, %r11 + jbe LABEL(ashr_8_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_8) + + .p2align 4 +LABEL(ashr_8_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $8, %xmm0 + psrldq $8, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 + */ + .p2align 4 +LABEL(ashr_9): + pxor %xmm0, %xmm0 + 
movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $7, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $9, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 9(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_9): + add $16, %r10 + jg LABEL(nibble_ashr_9) + +LABEL(gobble_ashr_9): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $9, %xmm3 + pslldq $7, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_9) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $9, %xmm3 + pslldq $7, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 /* store for next cycle */ + jmp LABEL(loop_ashr_9) + + .p2align 4 +LABEL(nibble_ashr_9): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + 
pmovmskb %xmm0, %edx + test $0xfe00, %edx + jnz LABEL(ashr_9_exittail) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, %r11 + jbe LABEL(ashr_9_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_9) + + .p2align 4 +LABEL(ashr_9_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $9, %xmm0 + psrldq $9, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 + */ + .p2align 4 +LABEL(ashr_10): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $6, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $10, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 10(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_10): + add $16, %r10 + jg LABEL(nibble_ashr_10) + +LABEL(gobble_ashr_10): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $10, %xmm3 + pslldq $6, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_10) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $10, %xmm3 + pslldq $6, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_10) + + .p2align 4 +LABEL(nibble_ashr_10): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfc00, %edx + jnz LABEL(ashr_10_exittail) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $6, %r11 + jbe LABEL(ashr_10_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_10) + + .p2align 4 +LABEL(ashr_10_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $10, %xmm0 + psrldq $10, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 + */ + .p2align 4 +LABEL(ashr_11): + 
pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $5, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $11, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 11(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_11): + add $16, %r10 + jg LABEL(nibble_ashr_11) + +LABEL(gobble_ashr_11): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $11, %xmm3 + pslldq $5, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_11) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $11, %xmm3 + pslldq $5, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_11) + + .p2align 4 +LABEL(nibble_ashr_11): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 
+ pmovmskb %xmm0, %edx + test $0xf800, %edx + jnz LABEL(ashr_11_exittail) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $5, %r11 + jbe LABEL(ashr_11_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_11) + + .p2align 4 +LABEL(ashr_11_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $11, %xmm0 + psrldq $11, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 + */ + .p2align 4 +LABEL(ashr_12): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $4, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $12, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 12(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_12): + add $16, %r10 + jg LABEL(nibble_ashr_12) + +LABEL(gobble_ashr_12): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $12, %xmm3 + pslldq $4, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_12) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $12, %xmm3 + pslldq $4, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_12) + + .p2align 4 +LABEL(nibble_ashr_12): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xf000, %edx + jnz LABEL(ashr_12_exittail) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, %r11 + jbe LABEL(ashr_12_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_12) + + .p2align 4 +LABEL(ashr_12_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $12, %xmm0 + psrldq $12, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 + */ + .p2align 4 +LABEL(ashr_13): + 
pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $3, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $13, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 13(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_13): + add $16, %r10 + jg LABEL(nibble_ashr_13) + +LABEL(gobble_ashr_13): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $13, %xmm3 + pslldq $3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_13) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $13, %xmm3 + pslldq $3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_13) + + .p2align 4 +LABEL(nibble_ashr_13): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 
+ pmovmskb %xmm0, %edx + test $0xe000, %edx + jnz LABEL(ashr_13_exittail) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $3, %r11 + jbe LABEL(ashr_13_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_13) + + .p2align 4 +LABEL(ashr_13_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $13, %xmm0 + psrldq $13, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 + */ + .p2align 4 +LABEL(ashr_14): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $2, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $14, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 14(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_14): + add $16, %r10 + jg LABEL(nibble_ashr_14) /* cross page boundary */ + +LABEL(gobble_ashr_14): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* keep the unshifted load; it becomes %xmm3 for the next round */ + + psrldq $14, %xmm3 + pslldq $2, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* zero only when all 16 bytes match and none is null */ + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_14) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $14, %xmm3 + pslldq $2, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_14) + + .p2align 4 +LABEL(nibble_ashr_14): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xc000, %edx /* bits 14-15: the two bytes of %xmm3 that survive psrldq $14 */ + jnz LABEL(ashr_14_exittail) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $2, %r11 + jbe LABEL(ashr_14_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_14) + + .p2align 4 +LABEL(ashr_14_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $14, %xmm0 + psrldq $14, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_15 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 + */ + .p2align 4 +LABEL(ashr_15): +
pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $1, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $15, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 15(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_15): + add $16, %r10 + jg LABEL(nibble_ashr_15) + +LABEL(gobble_ashr_15): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $15, %xmm3 + pslldq $1, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_15) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $15, %xmm3 + pslldq $1, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_15) + + .p2align 4 +LABEL(nibble_ashr_15): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char 
*/ + pmovmskb %xmm0, %edx + test $0x8000, %edx + jnz LABEL(ashr_15_exittail) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmpq $1, %r11 + jbe LABEL(ashr_15_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_15) + + .p2align 4 +LABEL(ashr_15_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $15, %xmm3 + psrldq $15, %xmm0 + + .p2align 4 +LABEL(aftertail): + TOLOWER (%xmm1, %xmm3) + pcmpeqb %xmm3, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + not %edx /* invert: set bits now mark a mismatching or null byte */ + + .p2align 4 +LABEL(exit): + lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */ +LABEL(less32bytes): + lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ + lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ + test %r8d, %r8d + jz LABEL(ret) /* %r8d == 0: nothing to swap back */ + xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ + + .p2align 4 +LABEL(ret): +LABEL(less16bytes): + bsf %rdx, %rdx /* find and store bit index in %rdx */ + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %rdx, %r11 /* remaining count minus index of the deciding byte */ + jbe LABEL(strcmp_exitz) /* deciding byte is at or past the limit: return 0 */ +# endif + movzbl (%rsi, %rdx), %ecx /* byte from second operand */ + movzbl (%rdi, %rdx), %eax /* byte from first operand */ + +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* map both bytes through this table of 4-byte entries */ + movl (%rdx,%rcx,4), %ecx + movl (%rdx,%rax,4), %eax +# endif + + sub %ecx, %eax /* result = first byte - second byte */ + ret + +LABEL(strcmp_exitz): + xor %eax, %eax /* return 0 */ + ret + + .p2align 4 +LABEL(Byte0): + movzbl (%rsi), %ecx /* compare only the first byte of each operand */ + movzbl (%rdi), %eax + +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx + movl (%rdx,%rcx,4), %ecx + movl (%rdx,%rax,4), %eax +# endif + + sub %ecx, %eax + ret +END (STRCMP) + + .section .rodata,"a",@progbits + .p2align 3 +LABEL(unaligned_table): /* 32-bit offsets of each ashr_N entry, relative to this table */ + .int LABEL(ashr_1) - LABEL(unaligned_table) + .int LABEL(ashr_2) - LABEL(unaligned_table) + .int LABEL(ashr_3) - LABEL(unaligned_table) + .int LABEL(ashr_4) - LABEL(unaligned_table) + .int LABEL(ashr_5) -
LABEL(unaligned_table) + .int LABEL(ashr_6) - LABEL(unaligned_table) + .int LABEL(ashr_7) - LABEL(unaligned_table) + .int LABEL(ashr_8) - LABEL(unaligned_table) + .int LABEL(ashr_9) - LABEL(unaligned_table) + .int LABEL(ashr_10) - LABEL(unaligned_table) + .int LABEL(ashr_11) - LABEL(unaligned_table) + .int LABEL(ashr_12) - LABEL(unaligned_table) + .int LABEL(ashr_13) - LABEL(unaligned_table) + .int LABEL(ashr_14) - LABEL(unaligned_table) + .int LABEL(ashr_15) - LABEL(unaligned_table) + .int LABEL(ashr_0) - LABEL(unaligned_table) #endif - -#include diff --git a/sysdeps/x86_64/multiarch/strncase_l-sse2.S b/sysdeps/x86_64/multiarch/strncase_l-sse2.S index 0ca4c836b2..fd8ad07450 100644 --- a/sysdeps/x86_64/multiarch/strncase_l-sse2.S +++ b/sysdeps/x86_64/multiarch/strncase_l-sse2.S @@ -16,8 +16,5 @@ License along with the GNU C Library; if not, see . */ -#define STRCMP __strncasecmp_l_sse2 -#define NO_NOLOCALE_ALIAS #define USE_AS_STRNCASECMP_L -#define __strncasecmp __strncasecmp_sse2 -#include +#include "strcmp-sse2.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-sse2.S b/sysdeps/x86_64/multiarch/strncmp-sse2.S index e3ba94f926..2152b8dc3d 100644 --- a/sysdeps/x86_64/multiarch/strncmp-sse2.S +++ b/sysdeps/x86_64/multiarch/strncmp-sse2.S @@ -16,15 +16,5 @@ License along with the GNU C Library; if not, see . */ -#include - -#if IS_IN (libc) -# define STRCMP __strncmp_sse2 -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(strcmp) -#else -# define STRCMP strncmp -#endif - #define USE_AS_STRNCMP -#include +#include "strcmp-sse2.S" diff --git a/sysdeps/x86_64/strcasecmp_l.S b/sysdeps/x86_64/strcasecmp_l.S index 5456b3a49e..84fd7fdfd3 100644 --- a/sysdeps/x86_64/strcasecmp_l.S +++ b/sysdeps/x86_64/strcasecmp_l.S @@ -1,6 +1,11 @@ -#define STRCMP __strcasecmp_l -#define USE_AS_STRCASECMP_L -#include "strcmp.S" +/* Symbols = __strcasecmp_l and __strcasecmp. 
*/ + +#include "multiarch/strcasecmp_l-sse2.S" + +libc_hidden_builtin_def (__strcasecmp_l) weak_alias (__strcasecmp_l, strcasecmp_l) libc_hidden_def (strcasecmp_l) + +weak_alias (__strcasecmp, strcasecmp) +libc_hidden_def (__strcasecmp) diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S index c38dc627f9..19e54bd3a7 100644 --- a/sysdeps/x86_64/strcmp.S +++ b/sysdeps/x86_64/strcmp.S @@ -16,2148 +16,7 @@ License along with the GNU C Library; if not, see . */ -#include -#include "asm-syntax.h" +/* Symbol = strcmp. */ -#undef UPDATE_STRNCMP_COUNTER - -#ifndef LABEL -#define LABEL(l) L(l) -#endif - -#ifdef USE_AS_STRNCMP -/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz - if the new counter > the old one or is 0. */ -# define UPDATE_STRNCMP_COUNTER \ - /* calculate left number to compare */ \ - lea -16(%rcx, %r11), %r9; \ - cmp %r9, %r11; \ - jb LABEL(strcmp_exitz); \ - test %r9, %r9; \ - je LABEL(strcmp_exitz); \ - mov %r9, %r11 - -#elif defined USE_AS_STRCASECMP_L -# include "locale-defines.h" - -# define UPDATE_STRNCMP_COUNTER -#elif defined USE_AS_STRNCASECMP_L -# include "locale-defines.h" - -# define UPDATE_STRNCMP_COUNTER \ - /* calculate left number to compare */ \ - lea -16(%rcx, %r11), %r9; \ - cmp %r9, %r11; \ - jb LABEL(strcmp_exitz); \ - test %r9, %r9; \ - je LABEL(strcmp_exitz); \ - mov %r9, %r11 -#else -# define UPDATE_STRNCMP_COUNTER -# ifndef STRCMP -# define STRCMP strcmp -# endif -#endif - - .text -#ifdef USE_AS_STRCASECMP_L -# ifndef ENTRY2 -# define ENTRY2(name) ENTRY (name) -# define END2(name) END (name) -# endif - -ENTRY2 (__strcasecmp) - movq __libc_tsd_LOCALE@gottpoff(%rip),%rax - mov %fs:(%rax),%RDX_LP - - /* Either 1 or 5 bytes (dependeing if CET is enabled). */ - .p2align 4 -END2 (__strcasecmp) -# ifndef NO_NOLOCALE_ALIAS -weak_alias (__strcasecmp, strcasecmp) -libc_hidden_def (__strcasecmp) -# endif - /* FALLTHROUGH to strcasecmp_l. 
*/ -#elif defined USE_AS_STRNCASECMP_L -# ifndef ENTRY2 -# define ENTRY2(name) ENTRY (name) -# define END2(name) END (name) -# endif - -ENTRY2 (__strncasecmp) - movq __libc_tsd_LOCALE@gottpoff(%rip),%rax - mov %fs:(%rax),%RCX_LP - - /* Either 1 or 5 bytes (dependeing if CET is enabled). */ - .p2align 4 -END2 (__strncasecmp) -# ifndef NO_NOLOCALE_ALIAS -weak_alias (__strncasecmp, strncasecmp) -libc_hidden_def (__strncasecmp) -# endif - /* FALLTHROUGH to strncasecmp_l. */ -#endif - -ENTRY (STRCMP) -#ifdef USE_AS_STRCASECMP_L - /* We have to fall back on the C implementation for locales - with encodings not matching ASCII for single bytes. */ -# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 - mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP -# else - mov (%rdx), %RAX_LP -# endif - testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) - jne __strcasecmp_l_nonascii -#elif defined USE_AS_STRNCASECMP_L - /* We have to fall back on the C implementation for locales - with encodings not matching ASCII for single bytes. */ -# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 - mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP -# else - mov (%rcx), %RAX_LP -# endif - testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) - jne __strncasecmp_l_nonascii -#endif - -/* - * This implementation uses SSE to compare up to 16 bytes at a time. - */ -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - test %RDX_LP, %RDX_LP - je LABEL(strcmp_exitz) - cmp $1, %RDX_LP - je LABEL(Byte0) - mov %RDX_LP, %R11_LP -#endif - mov %esi, %ecx - mov %edi, %eax -/* Use 64bit AND here to avoid long NOP padding. 
*/ - and $0x3f, %rcx /* rsi alignment in cache line */ - and $0x3f, %rax /* rdi alignment in cache line */ -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L - .section .rodata.cst16,"aM",@progbits,16 - .align 16 -.Llcase_min: - .quad 0x3f3f3f3f3f3f3f3f - .quad 0x3f3f3f3f3f3f3f3f -.Llcase_max: - .quad 0x9999999999999999 - .quad 0x9999999999999999 -.Lcase_add: - .quad 0x2020202020202020 - .quad 0x2020202020202020 - .previous - movdqa .Llcase_min(%rip), %xmm5 -# define LCASE_MIN_reg %xmm5 - movdqa .Llcase_max(%rip), %xmm6 -# define LCASE_MAX_reg %xmm6 - movdqa .Lcase_add(%rip), %xmm7 -# define CASE_ADD_reg %xmm7 -#endif - cmp $0x30, %ecx - ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ - cmp $0x30, %eax - ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ - movlpd (%rdi), %xmm1 - movlpd (%rsi), %xmm2 - movhpd 8(%rdi), %xmm1 - movhpd 8(%rsi), %xmm2 -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L -# define TOLOWER(reg1, reg2) \ - movdqa LCASE_MIN_reg, %xmm8; \ - movdqa LCASE_MIN_reg, %xmm9; \ - paddb reg1, %xmm8; \ - paddb reg2, %xmm9; \ - pcmpgtb LCASE_MAX_reg, %xmm8; \ - pcmpgtb LCASE_MAX_reg, %xmm9; \ - pandn CASE_ADD_reg, %xmm8; \ - pandn CASE_ADD_reg, %xmm9; \ - paddb %xmm8, reg1; \ - paddb %xmm9, reg2 - TOLOWER (%xmm1, %xmm2) -#else -# define TOLOWER(reg1, reg2) -#endif - pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ - pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ - pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ - psubb %xmm0, %xmm1 /* packed sub of comparison results*/ - pmovmskb %xmm1, %edx - sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ - jnz LABEL(less16bytes) /* If not, find different value or null char */ -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) /* finish comparision */ -#endif - add $16, %rsi /* prepare to search next 16 bytes */ - add $16, %rdi /* prepare to search next 16 bytes */ - - /* - * Determine source and destination string offsets from 16-byte alignment. - * Use relative offset difference between the two to determine which case - * below to use. - */ - .p2align 4 -LABEL(crosscache): - and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ - and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ - mov $0xffff, %edx /* for equivalent offset */ - xor %r8d, %r8d - and $0xf, %ecx /* offset of rsi */ - and $0xf, %eax /* offset of rdi */ - cmp %eax, %ecx - je LABEL(ashr_0) /* rsi and rdi relative offset same */ - ja LABEL(bigger) - mov %edx, %r8d /* r8d is offset flag for exit tail */ - xchg %ecx, %eax - xchg %rsi, %rdi -LABEL(bigger): - lea 15(%rax), %r9 - sub %rcx, %r9 - lea LABEL(unaligned_table)(%rip), %r10 - movslq (%r10, %r9,4), %r9 - lea (%r10, %r9), %r10 - _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ - -/* - * The following cases will be handled by ashr_0 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(0~15) n(0~15) 15(15+ n-n) ashr_0 - */ - .p2align 4 -LABEL(ashr_0): - - movdqa (%rsi), %xmm1 - pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ - pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ -#else - movdqa (%rdi), %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ -#endif - psubb %xmm0, %xmm1 /* packed sub of comparison results*/ - pmovmskb %xmm1, %r9d - shr %cl, %edx /* adjust 0xffff for offset */ - shr %cl, %r9d /* adjust for 16-byte offset */ - sub %r9d, %edx - /* - * edx must be the same with r9d if in left byte (16-rcx) is equal to - * the start from (16-rax) and no null char was seen. - */ - jne LABEL(less32bytes) /* mismatch or null char */ - UPDATE_STRNCMP_COUNTER - mov $16, %rcx - mov $16, %r9 - pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ - - /* - * Now both strings are aligned at 16-byte boundary. Loop over strings - * checking 32-bytes per iteration. - */ - .p2align 4 -LABEL(loop_ashr_0): - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) /* mismatch or null char seen */ - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rcx - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rcx - jmp LABEL(loop_ashr_0) - -/* - * The following cases will be handled by ashr_1 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(15) n -15 0(15 +(n-15) - n) ashr_1 - */ - .p2align 4 -LABEL(ashr_1): - pxor %xmm0, %xmm0 - movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ - pslldq $15, %xmm2 /* shift first string to align with second */ - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ - psubb %xmm0, %xmm2 /* packed sub of comparison results*/ - pmovmskb %xmm2, %r9d - shr %cl, %edx /* adjust 0xffff for offset */ - shr %cl, %r9d /* adjust for 16-byte offset */ - sub %r9d, %edx - jnz LABEL(less32bytes) /* mismatch or null char seen */ - movdqa (%rdi), %xmm3 - UPDATE_STRNCMP_COUNTER - - pxor %xmm0, %xmm0 - mov $16, %rcx /* index for loads*/ - mov $1, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 1(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - .p2align 4 -LABEL(loop_ashr_1): - add $16, %r10 - jg LABEL(nibble_ashr_1) /* cross page boundary */ - -LABEL(gobble_ashr_1): - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 /* store for next cycle */ - - psrldq $1, %xmm3 - pslldq $15, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rcx - movdqa %xmm4, %xmm3 - - add $16, %r10 - jg LABEL(nibble_ashr_1) /* cross page boundary */ - - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 /* store for next cycle */ - - psrldq $1, %xmm3 - pslldq $15, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, 
%r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rcx - movdqa %xmm4, %xmm3 - jmp LABEL(loop_ashr_1) - - /* - * Nibble avoids loads across page boundary. This is to avoid a potential - * access into unmapped memory. - */ - .p2align 4 -LABEL(nibble_ashr_1): - pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/ - pmovmskb %xmm0, %edx - test $0xfffe, %edx - jnz LABEL(ashr_1_exittail) /* find null char*/ - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp $15, %r11 - jbe LABEL(ashr_1_exittail) -#endif - - pxor %xmm0, %xmm0 - sub $0x1000, %r10 /* substract 4K from %r10 */ - jmp LABEL(gobble_ashr_1) - - /* - * Once find null char, determine if there is a string mismatch - * before the null char. - */ - .p2align 4 -LABEL(ashr_1_exittail): - movdqa (%rsi, %rcx), %xmm1 - psrldq $1, %xmm0 - psrldq $1, %xmm3 - jmp LABEL(aftertail) - -/* - * The following cases will be handled by ashr_2 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 - */ - .p2align 4 -LABEL(ashr_2): - pxor %xmm0, %xmm0 - movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pslldq $14, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - UPDATE_STRNCMP_COUNTER - - pxor %xmm0, %xmm0 - mov $16, %rcx /* index for loads */ - mov $2, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. 
- */ - lea 2(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - .p2align 4 -LABEL(loop_ashr_2): - add $16, %r10 - jg LABEL(nibble_ashr_2) - -LABEL(gobble_ashr_2): - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $2, %xmm3 - pslldq $14, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - - add $16, %r10 - jg LABEL(nibble_ashr_2) /* cross page boundary */ - - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $2, %xmm3 - pslldq $14, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - jmp LABEL(loop_ashr_2) - - .p2align 4 -LABEL(nibble_ashr_2): - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ - pmovmskb %xmm0, %edx - test $0xfffc, %edx - jnz LABEL(ashr_2_exittail) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp $14, %r11 - jbe LABEL(ashr_2_exittail) -#endif - - pxor %xmm0, %xmm0 - sub $0x1000, %r10 - jmp LABEL(gobble_ashr_2) - - .p2align 4 -LABEL(ashr_2_exittail): - movdqa (%rsi, %rcx), %xmm1 - psrldq $2, %xmm0 - psrldq $2, %xmm3 - jmp LABEL(aftertail) - -/* - * The following cases will be handled by ashr_3 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 - */ - .p2align 4 -LABEL(ashr_3): - pxor %xmm0, %xmm0 - 
movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pslldq $13, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - pxor %xmm0, %xmm0 - mov $16, %rcx /* index for loads */ - mov $3, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 3(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - .p2align 4 -LABEL(loop_ashr_3): - add $16, %r10 - jg LABEL(nibble_ashr_3) - -LABEL(gobble_ashr_3): - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $3, %xmm3 - pslldq $13, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - - add $16, %r10 - jg LABEL(nibble_ashr_3) /* cross page boundary */ - - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $3, %xmm3 - pslldq $13, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - jmp LABEL(loop_ashr_3) - - .p2align 4 -LABEL(nibble_ashr_3): - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ - pmovmskb %xmm0, %edx - test 
$0xfff8, %edx - jnz LABEL(ashr_3_exittail) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp $13, %r11 - jbe LABEL(ashr_3_exittail) -#endif - - pxor %xmm0, %xmm0 - sub $0x1000, %r10 - jmp LABEL(gobble_ashr_3) - - .p2align 4 -LABEL(ashr_3_exittail): - movdqa (%rsi, %rcx), %xmm1 - psrldq $3, %xmm0 - psrldq $3, %xmm3 - jmp LABEL(aftertail) - -/* - * The following cases will be handled by ashr_4 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 - */ - .p2align 4 -LABEL(ashr_4): - pxor %xmm0, %xmm0 - movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pslldq $12, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - pxor %xmm0, %xmm0 - mov $16, %rcx /* index for loads */ - mov $4, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. 
- */ - lea 4(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - .p2align 4 -LABEL(loop_ashr_4): - add $16, %r10 - jg LABEL(nibble_ashr_4) - -LABEL(gobble_ashr_4): - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $4, %xmm3 - pslldq $12, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - - add $16, %r10 - jg LABEL(nibble_ashr_4) /* cross page boundary */ - - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $4, %xmm3 - pslldq $12, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - jmp LABEL(loop_ashr_4) - - .p2align 4 -LABEL(nibble_ashr_4): - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ - pmovmskb %xmm0, %edx - test $0xfff0, %edx - jnz LABEL(ashr_4_exittail) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp $12, %r11 - jbe LABEL(ashr_4_exittail) -#endif - - pxor %xmm0, %xmm0 - sub $0x1000, %r10 - jmp LABEL(gobble_ashr_4) - - .p2align 4 -LABEL(ashr_4_exittail): - movdqa (%rsi, %rcx), %xmm1 - psrldq $4, %xmm0 - psrldq $4, %xmm3 - jmp LABEL(aftertail) - -/* - * The following cases will be handled by ashr_5 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 - */ - .p2align 4 -LABEL(ashr_5): - pxor %xmm0, %xmm0 - 
movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pslldq $11, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - pxor %xmm0, %xmm0 - mov $16, %rcx /* index for loads */ - mov $5, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 5(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - .p2align 4 -LABEL(loop_ashr_5): - add $16, %r10 - jg LABEL(nibble_ashr_5) - -LABEL(gobble_ashr_5): - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $5, %xmm3 - pslldq $11, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - - add $16, %r10 - jg LABEL(nibble_ashr_5) /* cross page boundary */ - - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $5, %xmm3 - pslldq $11, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - jmp LABEL(loop_ashr_5) - - .p2align 4 -LABEL(nibble_ashr_5): - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ - pmovmskb %xmm0, %edx - test 
$0xffe0, %edx - jnz LABEL(ashr_5_exittail) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp $11, %r11 - jbe LABEL(ashr_5_exittail) -#endif - - pxor %xmm0, %xmm0 - sub $0x1000, %r10 - jmp LABEL(gobble_ashr_5) - - .p2align 4 -LABEL(ashr_5_exittail): - movdqa (%rsi, %rcx), %xmm1 - psrldq $5, %xmm0 - psrldq $5, %xmm3 - jmp LABEL(aftertail) - -/* - * The following cases will be handled by ashr_6 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 - */ - .p2align 4 -LABEL(ashr_6): - pxor %xmm0, %xmm0 - movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pslldq $10, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - pxor %xmm0, %xmm0 - mov $16, %rcx /* index for loads */ - mov $6, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. 
- */ - lea 6(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - .p2align 4 -LABEL(loop_ashr_6): - add $16, %r10 - jg LABEL(nibble_ashr_6) - -LABEL(gobble_ashr_6): - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $6, %xmm3 - pslldq $10, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - - add $16, %r10 - jg LABEL(nibble_ashr_6) /* cross page boundary */ - - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $6, %xmm3 - pslldq $10, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - jmp LABEL(loop_ashr_6) - - .p2align 4 -LABEL(nibble_ashr_6): - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ - pmovmskb %xmm0, %edx - test $0xffc0, %edx - jnz LABEL(ashr_6_exittail) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp $10, %r11 - jbe LABEL(ashr_6_exittail) -#endif - - pxor %xmm0, %xmm0 - sub $0x1000, %r10 - jmp LABEL(gobble_ashr_6) - - .p2align 4 -LABEL(ashr_6_exittail): - movdqa (%rsi, %rcx), %xmm1 - psrldq $6, %xmm0 - psrldq $6, %xmm3 - jmp LABEL(aftertail) - -/* - * The following cases will be handled by ashr_7 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 - */ - .p2align 4 -LABEL(ashr_7): - pxor %xmm0, %xmm0 - 
movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pslldq $9, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - pxor %xmm0, %xmm0 - mov $16, %rcx /* index for loads */ - mov $7, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 7(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - .p2align 4 -LABEL(loop_ashr_7): - add $16, %r10 - jg LABEL(nibble_ashr_7) - -LABEL(gobble_ashr_7): - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $7, %xmm3 - pslldq $9, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - - add $16, %r10 - jg LABEL(nibble_ashr_7) /* cross page boundary */ - - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $7, %xmm3 - pslldq $9, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - jmp LABEL(loop_ashr_7) - - .p2align 4 -LABEL(nibble_ashr_7): - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ - pmovmskb %xmm0, %edx - test 
$0xff80, %edx - jnz LABEL(ashr_7_exittail) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp $9, %r11 - jbe LABEL(ashr_7_exittail) -#endif - - pxor %xmm0, %xmm0 - sub $0x1000, %r10 - jmp LABEL(gobble_ashr_7) - - .p2align 4 -LABEL(ashr_7_exittail): - movdqa (%rsi, %rcx), %xmm1 - psrldq $7, %xmm0 - psrldq $7, %xmm3 - jmp LABEL(aftertail) - -/* - * The following cases will be handled by ashr_8 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 - */ - .p2align 4 -LABEL(ashr_8): - pxor %xmm0, %xmm0 - movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pslldq $8, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - pxor %xmm0, %xmm0 - mov $16, %rcx /* index for loads */ - mov $8, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. 
- */ - lea 8(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - .p2align 4 -LABEL(loop_ashr_8): - add $16, %r10 - jg LABEL(nibble_ashr_8) - -LABEL(gobble_ashr_8): - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $8, %xmm3 - pslldq $8, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - - add $16, %r10 - jg LABEL(nibble_ashr_8) /* cross page boundary */ - - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $8, %xmm3 - pslldq $8, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - jmp LABEL(loop_ashr_8) - - .p2align 4 -LABEL(nibble_ashr_8): - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ - pmovmskb %xmm0, %edx - test $0xff00, %edx - jnz LABEL(ashr_8_exittail) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp $8, %r11 - jbe LABEL(ashr_8_exittail) -#endif - - pxor %xmm0, %xmm0 - sub $0x1000, %r10 - jmp LABEL(gobble_ashr_8) - - .p2align 4 -LABEL(ashr_8_exittail): - movdqa (%rsi, %rcx), %xmm1 - psrldq $8, %xmm0 - psrldq $8, %xmm3 - jmp LABEL(aftertail) - -/* - * The following cases will be handled by ashr_9 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 - */ - .p2align 4 -LABEL(ashr_9): - pxor %xmm0, %xmm0 - 
movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pslldq $7, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - pxor %xmm0, %xmm0 - mov $16, %rcx /* index for loads */ - mov $9, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 9(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - .p2align 4 -LABEL(loop_ashr_9): - add $16, %r10 - jg LABEL(nibble_ashr_9) - -LABEL(gobble_ashr_9): - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $9, %xmm3 - pslldq $7, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - - add $16, %r10 - jg LABEL(nibble_ashr_9) /* cross page boundary */ - - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $9, %xmm3 - pslldq $7, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 /* store for next cycle */ - jmp LABEL(loop_ashr_9) - - .p2align 4 -LABEL(nibble_ashr_9): - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ - 
pmovmskb %xmm0, %edx - test $0xfe00, %edx - jnz LABEL(ashr_9_exittail) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp $7, %r11 - jbe LABEL(ashr_9_exittail) -#endif - - pxor %xmm0, %xmm0 - sub $0x1000, %r10 - jmp LABEL(gobble_ashr_9) - - .p2align 4 -LABEL(ashr_9_exittail): - movdqa (%rsi, %rcx), %xmm1 - psrldq $9, %xmm0 - psrldq $9, %xmm3 - jmp LABEL(aftertail) - -/* - * The following cases will be handled by ashr_10 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 - */ - .p2align 4 -LABEL(ashr_10): - pxor %xmm0, %xmm0 - movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pslldq $6, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - pxor %xmm0, %xmm0 - mov $16, %rcx /* index for loads */ - mov $10, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. 
- */ - lea 10(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - .p2align 4 -LABEL(loop_ashr_10): - add $16, %r10 - jg LABEL(nibble_ashr_10) - -LABEL(gobble_ashr_10): - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $10, %xmm3 - pslldq $6, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - - add $16, %r10 - jg LABEL(nibble_ashr_10) /* cross page boundary */ - - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $10, %xmm3 - pslldq $6, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - jmp LABEL(loop_ashr_10) - - .p2align 4 -LABEL(nibble_ashr_10): - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ - pmovmskb %xmm0, %edx - test $0xfc00, %edx - jnz LABEL(ashr_10_exittail) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp $6, %r11 - jbe LABEL(ashr_10_exittail) -#endif - - pxor %xmm0, %xmm0 - sub $0x1000, %r10 - jmp LABEL(gobble_ashr_10) - - .p2align 4 -LABEL(ashr_10_exittail): - movdqa (%rsi, %rcx), %xmm1 - psrldq $10, %xmm0 - psrldq $10, %xmm3 - jmp LABEL(aftertail) - -/* - * The following cases will be handled by ashr_11 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 - */ - .p2align 4 -LABEL(ashr_11): - pxor 
%xmm0, %xmm0 - movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pslldq $5, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - pxor %xmm0, %xmm0 - mov $16, %rcx /* index for loads */ - mov $11, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 11(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - .p2align 4 -LABEL(loop_ashr_11): - add $16, %r10 - jg LABEL(nibble_ashr_11) - -LABEL(gobble_ashr_11): - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $11, %xmm3 - pslldq $5, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - - add $16, %r10 - jg LABEL(nibble_ashr_11) /* cross page boundary */ - - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $11, %xmm3 - pslldq $5, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - jmp LABEL(loop_ashr_11) - - .p2align 4 -LABEL(nibble_ashr_11): - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ - 
pmovmskb %xmm0, %edx - test $0xf800, %edx - jnz LABEL(ashr_11_exittail) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp $5, %r11 - jbe LABEL(ashr_11_exittail) -#endif - - pxor %xmm0, %xmm0 - sub $0x1000, %r10 - jmp LABEL(gobble_ashr_11) - - .p2align 4 -LABEL(ashr_11_exittail): - movdqa (%rsi, %rcx), %xmm1 - psrldq $11, %xmm0 - psrldq $11, %xmm3 - jmp LABEL(aftertail) - -/* - * The following cases will be handled by ashr_12 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 - */ - .p2align 4 -LABEL(ashr_12): - pxor %xmm0, %xmm0 - movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pslldq $4, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - pxor %xmm0, %xmm0 - mov $16, %rcx /* index for loads */ - mov $12, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. 
- */ - lea 12(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - .p2align 4 -LABEL(loop_ashr_12): - add $16, %r10 - jg LABEL(nibble_ashr_12) - -LABEL(gobble_ashr_12): - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $12, %xmm3 - pslldq $4, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - - add $16, %r10 - jg LABEL(nibble_ashr_12) /* cross page boundary */ - - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $12, %xmm3 - pslldq $4, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - jmp LABEL(loop_ashr_12) - - .p2align 4 -LABEL(nibble_ashr_12): - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ - pmovmskb %xmm0, %edx - test $0xf000, %edx - jnz LABEL(ashr_12_exittail) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp $4, %r11 - jbe LABEL(ashr_12_exittail) -#endif - - pxor %xmm0, %xmm0 - sub $0x1000, %r10 - jmp LABEL(gobble_ashr_12) - - .p2align 4 -LABEL(ashr_12_exittail): - movdqa (%rsi, %rcx), %xmm1 - psrldq $12, %xmm0 - psrldq $12, %xmm3 - jmp LABEL(aftertail) - -/* - * The following cases will be handled by ashr_13 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 - */ - .p2align 4 -LABEL(ashr_13): - pxor 
%xmm0, %xmm0 - movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pslldq $3, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - pxor %xmm0, %xmm0 - mov $16, %rcx /* index for loads */ - mov $13, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 13(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - .p2align 4 -LABEL(loop_ashr_13): - add $16, %r10 - jg LABEL(nibble_ashr_13) - -LABEL(gobble_ashr_13): - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $13, %xmm3 - pslldq $3, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - - add $16, %r10 - jg LABEL(nibble_ashr_13) /* cross page boundary */ - - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $13, %xmm3 - pslldq $3, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - jmp LABEL(loop_ashr_13) - - .p2align 4 -LABEL(nibble_ashr_13): - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ - 
pmovmskb %xmm0, %edx - test $0xe000, %edx - jnz LABEL(ashr_13_exittail) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp $3, %r11 - jbe LABEL(ashr_13_exittail) -#endif - - pxor %xmm0, %xmm0 - sub $0x1000, %r10 - jmp LABEL(gobble_ashr_13) - - .p2align 4 -LABEL(ashr_13_exittail): - movdqa (%rsi, %rcx), %xmm1 - psrldq $13, %xmm0 - psrldq $13, %xmm3 - jmp LABEL(aftertail) - -/* - * The following cases will be handled by ashr_14 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 - */ - .p2align 4 -LABEL(ashr_14): - pxor %xmm0, %xmm0 - movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pslldq $2, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - pxor %xmm0, %xmm0 - mov $16, %rcx /* index for loads */ - mov $14, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. 
- */ - lea 14(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - .p2align 4 -LABEL(loop_ashr_14): - add $16, %r10 - jg LABEL(nibble_ashr_14) - -LABEL(gobble_ashr_14): - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $14, %xmm3 - pslldq $2, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - - add $16, %r10 - jg LABEL(nibble_ashr_14) /* cross page boundary */ - - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $14, %xmm3 - pslldq $2, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP | defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - jmp LABEL(loop_ashr_14) - - .p2align 4 -LABEL(nibble_ashr_14): - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ - pmovmskb %xmm0, %edx - test $0xc000, %edx - jnz LABEL(ashr_14_exittail) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp $2, %r11 - jbe LABEL(ashr_14_exittail) -#endif - - pxor %xmm0, %xmm0 - sub $0x1000, %r10 - jmp LABEL(gobble_ashr_14) - - .p2align 4 -LABEL(ashr_14_exittail): - movdqa (%rsi, %rcx), %xmm1 - psrldq $14, %xmm0 - psrldq $14, %xmm3 - jmp LABEL(aftertail) - -/* - * The following cases will be handled by ashr_15 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 - */ - .p2align 4 -LABEL(ashr_15): - pxor 
%xmm0, %xmm0 - movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pslldq $1, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - pxor %xmm0, %xmm0 - mov $16, %rcx /* index for loads */ - mov $15, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 15(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - - sub $0x1000, %r10 /* subtract 4K pagesize */ - - .p2align 4 -LABEL(loop_ashr_15): - add $16, %r10 - jg LABEL(nibble_ashr_15) - -LABEL(gobble_ashr_15): - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $15, %xmm3 - pslldq $1, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - - add $16, %r10 - jg LABEL(nibble_ashr_15) /* cross page boundary */ - - movdqa (%rsi, %rcx), %xmm1 - movdqa (%rdi, %rcx), %xmm2 - movdqa %xmm2, %xmm4 - - psrldq $15, %xmm3 - pslldq $1, %xmm2 - por %xmm3, %xmm2 /* merge into one 16byte value */ - - TOLOWER (%xmm1, %xmm2) - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - sub $0xffff, %edx - jnz LABEL(exit) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rcx - movdqa %xmm4, %xmm3 - jmp LABEL(loop_ashr_15) - - .p2align 4 -LABEL(nibble_ashr_15): - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ - 
pmovmskb %xmm0, %edx - test $0x8000, %edx - jnz LABEL(ashr_15_exittail) - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmpq $1, %r11 - jbe LABEL(ashr_15_exittail) -#endif - - pxor %xmm0, %xmm0 - sub $0x1000, %r10 - jmp LABEL(gobble_ashr_15) - - .p2align 4 -LABEL(ashr_15_exittail): - movdqa (%rsi, %rcx), %xmm1 - psrldq $15, %xmm3 - psrldq $15, %xmm0 - - .p2align 4 -LABEL(aftertail): - TOLOWER (%xmm1, %xmm3) - pcmpeqb %xmm3, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - not %edx - - .p2align 4 -LABEL(exit): - lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */ -LABEL(less32bytes): - lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ - lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ - test %r8d, %r8d - jz LABEL(ret) - xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ - - .p2align 4 -LABEL(ret): -LABEL(less16bytes): - bsf %rdx, %rdx /* find and store bit index in %rdx */ - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub %rdx, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzbl (%rsi, %rdx), %ecx - movzbl (%rdi, %rdx), %eax - -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx - movl (%rdx,%rcx,4), %ecx - movl (%rdx,%rax,4), %eax -#endif - - sub %ecx, %eax - ret - -LABEL(strcmp_exitz): - xor %eax, %eax - ret - - .p2align 4 -LABEL(Byte0): - movzbl (%rsi), %ecx - movzbl (%rdi), %eax - -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx - movl (%rdx,%rcx,4), %ecx - movl (%rdx,%rax,4), %eax -#endif - - sub %ecx, %eax - ret -END (STRCMP) - - .section .rodata,"a",@progbits - .p2align 3 -LABEL(unaligned_table): - .int LABEL(ashr_1) - LABEL(unaligned_table) - .int LABEL(ashr_2) - LABEL(unaligned_table) - .int LABEL(ashr_3) - LABEL(unaligned_table) - .int LABEL(ashr_4) - LABEL(unaligned_table) - .int LABEL(ashr_5) - LABEL(unaligned_table) 
- .int LABEL(ashr_6) - LABEL(unaligned_table) - .int LABEL(ashr_7) - LABEL(unaligned_table) - .int LABEL(ashr_8) - LABEL(unaligned_table) - .int LABEL(ashr_9) - LABEL(unaligned_table) - .int LABEL(ashr_10) - LABEL(unaligned_table) - .int LABEL(ashr_11) - LABEL(unaligned_table) - .int LABEL(ashr_12) - LABEL(unaligned_table) - .int LABEL(ashr_13) - LABEL(unaligned_table) - .int LABEL(ashr_14) - LABEL(unaligned_table) - .int LABEL(ashr_15) - LABEL(unaligned_table) - .int LABEL(ashr_0) - LABEL(unaligned_table) -libc_hidden_builtin_def (STRCMP) +#include "multiarch/strcmp-sse2.S" +libc_hidden_builtin_def (strcmp) diff --git a/sysdeps/x86_64/strncase_l.S b/sysdeps/x86_64/strncase_l.S index c725cd85b3..3780fc50b1 100644 --- a/sysdeps/x86_64/strncase_l.S +++ b/sysdeps/x86_64/strncase_l.S @@ -1,6 +1,11 @@ -#define STRCMP __strncasecmp_l -#define USE_AS_STRNCASECMP_L -#include "strcmp.S" +/* Symbols = __strncasecmp_l and __strncasecmp. */ + +#include "multiarch/strncase_l-sse2.S" + +libc_hidden_builtin_def (__strncasecmp_l) weak_alias (__strncasecmp_l, strncasecmp_l) libc_hidden_def (strncasecmp_l) + +weak_alias (__strncasecmp, strncasecmp) +libc_hidden_def (__strncasecmp) diff --git a/sysdeps/x86_64/strncmp.S b/sysdeps/x86_64/strncmp.S index 0af34e7f15..13d9e82ee2 100644 --- a/sysdeps/x86_64/strncmp.S +++ b/sysdeps/x86_64/strncmp.S @@ -1,3 +1,4 @@ -#define STRCMP strncmp -#define USE_AS_STRNCMP -#include "strcmp.S" +/* Symbol = strncmp. */ + +#include "multiarch/strncmp-sse2.S" +libc_hidden_builtin_def (strncmp)