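/* Listing metadata kept from the page this file was copied from:
   path glibc/sysdeps/powerpc/powerpc64/power8/strcasecmp.S,
   458 lines, 11 KiB.  The listing labelled the syntax "ArmAsm"; the
   content is PowerPC64 assembly for the GNU assembler, run through
   the C preprocessor.  */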

/* Optimized strcasecmp implementation for PowerPC64.
Copyright (C) 2016-2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include <locale-defines.h>
/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) */
/* Pick the symbol names this translation unit provides.  The same source
   builds the length-limited variant when USE_AS_STRNCASECMP is defined
   and the unbounded one otherwise.  __STRCASECMP names the entry point;
   STRCASECMP is the name it is weakly aliased to at the end of the file.  */
#ifdef USE_AS_STRNCASECMP
# define __STRCASECMP __strncasecmp
# define STRCASECMP strncasecmp
#else
# define __STRCASECMP __strcasecmp
# define STRCASECMP strcasecmp
#endif
/* Convert 16 bytes to lowercase and compare.
   Inputs:   v4, v5 = the two 16-byte chunks to fold and compare.
             v1, v2, v3 = byte-splat constants prepared by the caller
             (its setup code documents them as 0xbf, 0x19 and 0x20).
   Outputs:  v4, v5 = case-folded chunks.
             v7 = per-byte equality mask of the folded v5 and v4.
             cr6 = set by the record-form compare on the last line; the
             callers branch on it ("blt cr6" when every byte matched).
   Clobbers: v7, v8.
   With the constants above, (byte + 0xbf) modulo 256 equals byte - 0x41,
   so the unsigned test "> 0x19" is false only for bytes 0x41..0x5a; those
   bytes receive byte + 0x20 and every other byte is kept unchanged.  */
#define TOLOWER() \
vaddubm v8, v4, v1; /* v8 = v4 + v1, per byte, modulo 256 */ \
vaddubm v7, v4, v3; /* v7 = v4 + v3: the lowercase candidates */ \
vcmpgtub v8, v8, v2; /* v8 = 0xff where v8 > v2 (unsigned): keep byte */ \
vsel v4, v7, v4, v8; /* v4 = v4 where the mask is set, else v7 */ \
vaddubm v8, v5, v1; /* same four steps for the second chunk, v5 */ \
vaddubm v7, v5, v3; \
vcmpgtub v8, v8, v2; \
vsel v5, v7, v5, v8; \
vcmpequb. v7, v5, v4; /* v7 = (v5 == v4) per byte; updates cr6 */
/*
* Get 16 bytes for unaligned case.
* reg1: Vector to hold next 16 bytes.
* reg2: Address to read from.
* reg3: Permute control vector.
* v8: Tmp vector used to mask unwanted bytes.
* v9: Tmp vector, 0 when null is found on first 16 bytes.
*
* Expects v0 to hold all-zero bytes: it is the comparand of the null test.
* Clobbers v8, v9 and cr6, plus r6 on the path that performs the second
* load.  Uses the numeric local labels 1 and 2.
*
* Sequence:
*   1. lvx the quadword addressed by reg2 into reg1.
*   2. Permute it with an all-ones vector through reg3, so that the bytes
*      that are not part of the string become 0xff, and compare the result
*      with zero.
*   3. If no byte compared equal to zero, lvx the quadword at reg2 + 16
*      into v9.  Otherwise the second load is skipped and v9 is zeroed.
*   4. Permute reg1 and v9 through reg3 to form the 16 bytes in reg1.
*
* The two variants differ only in the operand order of the vperm
* instructions; the caller is expected to build reg3 with lvsr on little
* endian and lvsl on big endian, as the function in this file does.
*/
#ifdef __LITTLE_ENDIAN__
#define GET16BYTES(reg1, reg2, reg3) \
lvx reg1, 0, reg2; /* first quadword */ \
vspltisb v8, -1; /* v8 = 0xff in every byte */ \
vperm v8, v8, reg1, reg3; /* string bytes of reg1, 0xff elsewhere */ \
vcmpequb. v8, v0, v8; /* look for a zero byte; updates cr6 */ \
beq cr6, 1f; /* none found: go fetch the second quadword */ \
vspltisb v9, 0; /* zero byte found: use zeros instead of loading */ \
b 2f; \
.align 4; \
1: \
addi r6, reg2, 16; /* r6 = address of the following quadword */ \
lvx v9, 0, r6; \
2: \
vperm reg1, v9, reg1, reg3; /* merge both quadwords into reg1 */
#else
#define GET16BYTES(reg1, reg2, reg3) \
lvx reg1, 0, reg2; /* first quadword */ \
vspltisb v8, -1; /* v8 = 0xff in every byte */ \
vperm v8, reg1, v8, reg3; /* string bytes of reg1, 0xff elsewhere */ \
vcmpequb. v8, v0, v8; /* look for a zero byte; updates cr6 */ \
beq cr6, 1f; /* none found: go fetch the second quadword */ \
vspltisb v9, 0; /* zero byte found: use zeros instead of loading */ \
b 2f; \
.align 4; \
1: \
addi r6, reg2, 16; /* r6 = address of the following quadword */ \
lvx v9, 0, r6; \
2: \
vperm reg1, reg1, v9, reg3; /* merge both quadwords into reg1 */
#endif
/* Check null in v4, v5 and convert to lower.
   Control flow, exactly as coded:
     - v5 contains no zero byte            -> label 3, run TOLOWER.
     - otherwise, v4 contains no zero byte -> label 3, run TOLOWER.
     - otherwise (both contain a zero byte) -> branch to L(null_found),
       with v7 holding the zero-byte mask of v4.
   On the fall-through path the results are those of TOLOWER: v4 and v5
   folded, v7 = equality mask, cr6 set for the caller's branch.
   Expects v0 to hold all-zero bytes.  Clobbers v7 and cr6, plus v8 via
   TOLOWER.  Uses the numeric local label 3 and requires L(null_found)
   to be defined by the enclosing function.  */
#define CHECKNULLANDCONVERT() \
vcmpequb. v7, v0, v5; /* zero bytes in v5? */ \
beq cr6, 3f; /* none: compare the chunks */ \
vcmpequb. v7, v0, v4; /* zero bytes in v4? */ \
beq cr6, 3f; /* none: compare the chunks */ \
b L(null_found); /* both chunks contain a terminator */ \
.align 4; \
3: \
TOLOWER()
/* ISA 2.07 instructions needed by the result computation.  Every macro
   hard-codes its operands, so the name is only a hint:
     VCLZD_V8_v7     v8 = count of leading zeros in each doubleword of v7
     MFVRD_R3_V1     r3 = doubleword 0 of v1
     VSUBUDM_V9_V8   v9 = v9 - v8, per doubleword
     VPOPCNTD_V8_V8  v8 = population count of each doubleword of v8
     VADDUQM_V7_V8   v9 = v7 + v8 as a 128-bit integer (writes v9)
   When _ARCH_PWR8 is not defined the instructions are emitted as raw
   words; otherwise the mnemonics are used.
   NOTE(review): the function that uses these macros is assembled under
   ".machine power7"; confirm that the assembler in use accepts the
   mnemonic forms in that mode.  */
#ifndef _ARCH_PWR8
# define VCLZD_V8_v7 .long 0x11003fc2 /* vclzd v8, v7 */
# define MFVRD_R3_V1 .long 0x7c230067 /* mfvrd r3, v1 */
# define VSUBUDM_V9_V8 .long 0x112944c0 /* vsubudm v9, v9, v8 */
# define VPOPCNTD_V8_V8 .long 0x110047c3 /* vpopcntd v8, v8 */
# define VADDUQM_V7_V8 .long 0x11274100 /* vadduqm v9, v7, v8 */
#else
# define VCLZD_V8_v7 vclzd v8, v7;
# define MFVRD_R3_V1 mfvrd r3, v1;
# define VSUBUDM_V9_V8 vsubudm v9, v9, v8;
# define VPOPCNTD_V8_V8 vpopcntd v8, v8;
# define VADDUQM_V7_V8 vadduqm v9, v7, v8;
#endif
/* int __STRCASECMP (const char *s1 [r3], const char *s2 [r4])
   and, when USE_AS_STRNCASECMP is defined,
   int __STRCASECMP (const char *s1 [r3], const char *s2 [r4], size_t n [r5])

   Case-insensitive comparison.  r3 returns 0 when the strings compare
   equal, otherwise the sign-extended difference of the case-folded
   bytes at the first mismatch.

   Two strategies are used:
     - Vector path: 16 bytes per step, folding only bytes 0x41..0x5a.
       It is entered only when neither string has a terminator in the
       first chunk examined, the locale value _NL_CTYPE_NONASCII_CASE is
       not 1 and, for the length-limited variant, n >= 16.
     - L(bytebybyte): one byte per step through the locale tolower
       table, unrolled four times.

   The length in r5 is a size_t, so it is compared against 16 with the
   unsigned cmpldi.  A signed compare would treat any n >= 2^63 (for
   example SIZE_MAX) as negative and send the whole string down the
   byte-by-byte path.  r5 never underflows: it is >= 16 before each
   subtraction of at most 16.

   Registers written by the code below: r0, r3-r4, r6-r12, cr0, cr1,
   cr6, cr7, v0-v10; the length-limited variant also writes r5 and ctr.
   v0 is kept at zero and v3 at 0x20 per byte throughout the vector
   path.  */
	.machine power7
ENTRY (__STRCASECMP)
#ifdef USE_AS_STRNCASECMP
	CALL_MCOUNT 3
#else
	CALL_MCOUNT 2
#endif
#define rRTN r3 /* Return value */
#define rSTR1 r10 /* 1st string */
#define rSTR2 r4 /* 2nd string */
#define rCHAR1 r6 /* Byte read from 1st string */
#define rCHAR2 r7 /* Byte read from 2nd string */
#define rADDR1 r8 /* Address of tolower(rCHAR1) */
#define rADDR2 r12 /* Address of tolower(rCHAR2) */
#define rLWR1 r8 /* Word tolower(rCHAR1) */
#define rLWR2 r12 /* Word tolower(rCHAR2) */
#define rTMP r9
#define rLOC r11 /* Default locale address */
	cmpd cr7, rRTN, rSTR2	/* cr7.eq = (s1 == s2), consumed by beqlr below.  */
	/* Get locale address. */
	ld rTMP, __libc_tsd_LOCALE@got@tprel(r2)
	add rLOC, rTMP, __libc_tsd_LOCALE@tls
	ld rLOC, 0(rLOC)
	mr rSTR1, rRTN		/* Free r3 for the return value.  */
	li rRTN, 0
	beqlr cr7		/* Same pointer: return 0.  */
#ifdef USE_AS_STRNCASECMP
	cmpdi cr7, r5, 0
	beq cr7, L(retnull)	/* n == 0: nothing to compare.  */
	cmpldi cr7, r5, 16	/* Unsigned: n is a size_t.  */
	blt cr7, L(bytebybyte)	/* Fewer than 16 bytes: scalar path.  */
#endif
	vspltisb v0, 0		/* v0 = 0, comparand for every null test.  */
	vspltisb v8, -1		/* v8 = 0xff per byte, used as padding.  */
	/* Check for null in initial characters.
	   Check max of 16 char depending on the alignment.
	   If null is present, proceed byte by byte. */
	lvx v4, 0, rSTR1
#ifdef __LITTLE_ENDIAN__
	lvsr v10, 0, rSTR1 /* Compute mask. */
	vperm v9, v8, v4, v10 /* Mask bits that are not part of string. */
#else
	lvsl v10, 0, rSTR1
	vperm v9, v4, v8, v10
#endif
	vcmpequb. v9, v0, v9 /* Check for null bytes. */
	bne cr6, L(bytebybyte)
	lvx v5, 0, rSTR2
	/* Calculate alignment. */
#ifdef __LITTLE_ENDIAN__
	lvsr v6, 0, rSTR2
	vperm v9, v8, v5, v6 /* Mask bits that are not part of string. */
#else
	lvsl v6, 0, rSTR2
	vperm v9, v5, v8, v6
#endif
	vcmpequb. v9, v0, v9 /* Check for null bytes. */
	bne cr6, L(bytebybyte)
	/* Check if locale has non ascii characters. */
	ld rTMP, 0(rLOC)
	addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
	lwz rTMP, 0(r6)
	cmpdi cr7, rTMP, 1
	beq cr7, L(bytebybyte)
	/* Load vector registers with values used for TOLOWER. */
	/* Load v1 = 0xbf, v2 = 0x19 v3 = 0x20 in each byte. */
	vspltisb v3, 2
	vspltisb v9, 4
	vsl v3, v3, v9		/* 0x02 << 4 = 0x20.  */
	vaddubm v1, v3, v3	/* 0x40 ...  */
	vnor v1, v1, v1		/* ... complemented: 0xbf.  */
	vspltisb v2, 7
	vsububm v2, v3, v2	/* 0x20 - 7 = 0x19.  */
	andi. rADDR1, rSTR1, 0xF
	beq cr0, L(align)	/* s1 is 16-byte aligned: v4 is complete.  */
	addi r6, rSTR1, 16
	lvx v9, 0, r6
	/* Compute 16 bytes from previous two loads. */
#ifdef __LITTLE_ENDIAN__
	vperm v4, v9, v4, v10
#else
	vperm v4, v4, v9, v10
#endif
L(align):
	andi. rADDR2, rSTR2, 0xF
	beq cr0, L(align1)	/* s2 is 16-byte aligned: v5 is complete.  */
	addi r6, rSTR2, 16
	lvx v9, 0, r6
	/* Compute 16 bytes from previous two loads. */
#ifdef __LITTLE_ENDIAN__
	vperm v5, v9, v5, v6
#else
	vperm v5, v5, v9, v6
#endif
L(align1):
	CHECKNULLANDCONVERT()
	blt cr6, L(match)	/* All 16 folded bytes are equal.  */
	b L(different)
	.align 4
	/* The first 16 bytes matched.  Advance both pointers by the distance
	   that brings s1 to a 16-byte boundary (1..16 bytes; bytes already
	   compared may be compared again), then choose the loop according
	   to the alignment of s2.  */
L(match):
	clrldi r6, rSTR1, 60	/* r6 = s1 & 15.  */
	subfic r7, r6, 16	/* r7 = 16 - r6.  */
#ifdef USE_AS_STRNCASECMP
	sub r5, r5, r7		/* n >= 16 >= r7 here, so no underflow.  */
#endif
	add rSTR1, rSTR1, r7
	add rSTR2, rSTR2, r7
	andi. rADDR2, rSTR2, 0xF	/* cr0.eq = s2 is now aligned as well.  */
	addi rSTR1, rSTR1, -16	/* Both loops add 16 before loading.  */
	addi rSTR2, rSTR2, -16
	beq cr0, L(aligned)
#ifdef __LITTLE_ENDIAN__
	lvsr v6, 0, rSTR2
#else
	lvsl v6, 0, rSTR2
#endif
	/* There are 2 loops depending on the input alignment.
	   Each loop gets 16 bytes from s1 and s2, check for null,
	   convert to lowercase and compare. Loop till difference
	   or null occurs. */
L(s1_align):
	addi rSTR1, rSTR1, 16
	addi rSTR2, rSTR2, 16
#ifdef USE_AS_STRNCASECMP
	cmpldi cr7, r5, 16	/* Unsigned: remaining length.  */
	blt cr7, L(bytebybyte)	/* Fewer than 16 bytes left.  */
	addi r5, r5, -16
#endif
	lvx v4, 0, rSTR1
	GET16BYTES(v5, rSTR2, v6)
	CHECKNULLANDCONVERT()
	blt cr6, L(s1_align)
	b L(different)
	.align 4
L(aligned):
	addi rSTR1, rSTR1, 16
	addi rSTR2, rSTR2, 16
#ifdef USE_AS_STRNCASECMP
	cmpldi cr7, r5, 16	/* Unsigned: remaining length.  */
	blt cr7, L(bytebybyte)	/* Fewer than 16 bytes left.  */
	addi r5, r5, -16
#endif
	lvx v4, 0, rSTR1
	lvx v5, 0, rSTR2
	CHECKNULLANDCONVERT()
	blt cr6, L(aligned)
	/* Calculate and return the difference. */
	/* On entry v4 and v5 hold the folded chunks and v7 their equality
	   mask.  v1 is reloaded with 0x40 per byte and serves both as the
	   value 64 in the count test and as an 8-byte octet shift amount.  */
L(different):
	vaddubm v1, v3, v3
	vcmpequb v7, v0, v7	/* Invert: 0xff where the chunks differ.  */
#ifdef __LITTLE_ENDIAN__
	/* Count trailing zero. */
	vspltisb v8, -1
	VADDUQM_V7_V8		/* v9 = v7 - 1 as a 128-bit value.  */
	vandc v8, v9, v7	/* Ones below the lowest set bit of v7.  */
	VPOPCNTD_V8_V8
	vspltb v6, v8, 15
	vcmpequb. v6, v6, v1	/* Is the low doubleword count 64?  */
	blt cr6, L(shift8)
#else
	/* Count leading zero. */
	VCLZD_V8_v7
	vspltb v6, v8, 7
	vcmpequb. v6, v6, v1	/* Is the high doubleword count 64?  */
	blt cr6, L(shift8)
	vsro v8, v8, v1
#endif
	b L(skipsum)
	.align 4
L(shift8):
	vsumsws v8, v8, v0	/* Add up the per-doubleword counts.  */
L(skipsum):
#ifdef __LITTLE_ENDIAN__
	/* Shift registers based on trailing zero count. */
	vsro v6, v5, v8
	vsro v7, v4, v8
	/* Merge and move to GPR. */
	vmrglb v6, v6, v7
	vslo v1, v6, v1
	MFVRD_R3_V1
	/* Place the characters that are different in first position. */
	/* rSTR1 and rSTR2 are reused as scratch from here on.  */
	sldi rSTR2, rRTN, 56
	srdi rSTR2, rSTR2, 56
	sldi rSTR1, rRTN, 48
	srdi rSTR1, rSTR1, 56
#else
	/* Shift registers based on leading zero count.  */
	vslo v6, v5, v8
	vslo v7, v4, v8
	vmrghb v1, v6, v7
	MFVRD_R3_V1
	/* rSTR1 and rSTR2 are reused as scratch from here on.  */
	srdi rSTR2, rRTN, 48
	sldi rSTR2, rSTR2, 56
	srdi rSTR2, rSTR2, 56
	srdi rSTR1, rRTN, 56
#endif
	subf rRTN, rSTR1, rSTR2	/* rRTN = rSTR2 - rSTR1.  */
	extsw rRTN, rRTN
	blr
	.align 4
	/* OK. We've hit the end of the string. We need to be careful that
	   we don't compare two strings as different because of junk beyond
	   the end of the strings... */
	/* On entry v7 is the zero-byte mask of v4, left there by
	   CHECKNULLANDCONVERT.  v10 is used for the value 0x40 instead of
	   v1 because TOLOWER below still needs v1.  */
L(null_found):
	vaddubm v10, v3, v3
#ifdef __LITTLE_ENDIAN__
	/* Count trailing zero. */
	vspltisb v8, -1
	VADDUQM_V7_V8
	vandc v8, v9, v7
	VPOPCNTD_V8_V8
	vspltb v6, v8, 15
	vcmpequb. v6, v6, v10
	blt cr6, L(shift_8)
#else
	/* Count leading zero. */
	VCLZD_V8_v7
	vspltb v6, v8, 7
	vcmpequb. v6, v6, v10
	blt cr6, L(shift_8)
	vsro v8, v8, v10
#endif
	b L(skipsum1)
	.align 4
L(shift_8):
	vsumsws v8, v8, v0
L(skipsum1):
	/* Calculate shift count based on count of zero. */
	/* v9 = 128 - count - 8, built from the constants 0x80 and 8.  */
	vspltisb v10, 7
	vslb v10, v10, v10	/* 7 << 7, truncated to a byte: 0x80.  */
	vsldoi v9, v0, v10, 1	/* 0x80 in the last byte only.  */
	VSUBUDM_V9_V8
	vspltisb v8, 8
	vsldoi v8, v0, v8, 1	/* 8 in the last byte only.  */
	VSUBUDM_V9_V8
	/* Shift and remove junk after null character. */
#ifdef __LITTLE_ENDIAN__
	vslo v5, v5, v9
	vslo v4, v4, v9
#else
	vsro v5, v5, v9
	vsro v4, v4, v9
#endif
	/* Convert and compare 16 bytes. */
	TOLOWER()
	blt cr6, L(retnull)	/* Equal up to the terminator.  */
	b L(different)
	.align 4
L(retnull):
	li rRTN, 0
	blr
	.align 4
L(bytebybyte):
	/* Unrolling loop for POWER: loads are done with 'lbz' plus
	   offset and string descriptors are only updated in the end
	   of loop unrolling. */
	ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC)	/* rLOC = tolower table.  */
	lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
	lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
#ifdef USE_AS_STRNCASECMP
	rldicl rTMP, r5, 62, 2	/* rTMP = n / 4, unsigned.  */
	cmpdi cr7, rTMP, 0
	beq cr7, L(lessthan4)
	mtctr rTMP		/* ctr = number of 4-byte rounds.  */
#endif
	/* Each step leaves through L(done) when *s1 is NUL or when the
	   folded characters differ.  */
L(loop):
	cmpdi rCHAR1, 0 /* *s1 == '\0' ? */
	sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */
	sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */
	lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */
	lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */
	cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? */
	crorc 4*cr1+eq,eq,4*cr1+eq /* cr1.eq = (*s1 == NUL) || (r == 0) */
	beq cr1, L(done)
	lbz rCHAR1, 1(rSTR1)
	lbz rCHAR2, 1(rSTR2)
	cmpdi rCHAR1, 0
	sldi rADDR1, rCHAR1, 2
	sldi rADDR2, rCHAR2, 2
	lwzx rLWR1, rLOC, rADDR1
	lwzx rLWR2, rLOC, rADDR2
	cmpw cr1, rLWR1, rLWR2
	crorc 4*cr1+eq,eq,4*cr1+eq
	beq cr1, L(done)
	lbz rCHAR1, 2(rSTR1)
	lbz rCHAR2, 2(rSTR2)
	cmpdi rCHAR1, 0
	sldi rADDR1, rCHAR1, 2
	sldi rADDR2, rCHAR2, 2
	lwzx rLWR1, rLOC, rADDR1
	lwzx rLWR2, rLOC, rADDR2
	cmpw cr1, rLWR1, rLWR2
	crorc 4*cr1+eq,eq,4*cr1+eq
	beq cr1, L(done)
	lbz rCHAR1, 3(rSTR1)
	lbz rCHAR2, 3(rSTR2)
	cmpdi rCHAR1, 0
	/* Increment both string descriptors */
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
	sldi rADDR1, rCHAR1, 2
	sldi rADDR2, rCHAR2, 2
	lwzx rLWR1, rLOC, rADDR1
	lwzx rLWR2, rLOC, rADDR2
	cmpw cr1, rLWR1, rLWR2
	crorc 4*cr1+eq,eq,4*cr1+eq
	beq cr1, L(done)
	lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
	lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
#ifdef USE_AS_STRNCASECMP
	bdnz L(loop)
#else
	b L(loop)
#endif
#ifdef USE_AS_STRNCASECMP
	/* Up to three bytes remain.  rCHAR1 and rCHAR2 already hold the
	   next byte of each string.  */
L(lessthan4):
	clrldi r5, r5, 62	/* r5 = n & 3.  */
	cmpdi cr7, r5, 0
	beq cr7, L(retnull)
	mtctr r5
L(loop1):
	cmpdi rCHAR1, 0
	sldi rADDR1, rCHAR1, 2
	sldi rADDR2, rCHAR2, 2
	lwzx rLWR1, rLOC, rADDR1
	lwzx rLWR2, rLOC, rADDR2
	cmpw cr1, rLWR1, rLWR2
	crorc 4*cr1+eq,eq,4*cr1+eq
	beq cr1, L(done)
	addi rSTR1, rSTR1, 1
	addi rSTR2, rSTR2, 1
	lbz rCHAR1, 0(rSTR1)
	lbz rCHAR2, 0(rSTR2)
	bdnz L(loop1)
	/* Count exhausted: the last pair compared equal, so the
	   subtraction at L(done) yields 0.  */
#endif
L(done):
	subf r0, rLWR2, rLWR1	/* r0 = tolower(*s1) - tolower(*s2).  */
	extsw rRTN, r0
	blr
END (__STRCASECMP)
weak_alias (__STRCASECMP, STRCASECMP)
libc_hidden_builtin_def (__STRCASECMP)