glibc/sysdeps/powerpc/powerpc64/power8/strnlen.S

403 lines
13 KiB
ArmAsm

/* Optimized strnlen implementation for POWER8 using a vmx loop.
Copyright (C) 2017-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* The following heuristic is implemented:
1. Case maxlen <= 32: align the pointer to 8 bytes and loop through
reading doublewords. Uses the POWER7 algorithm.
2. Case maxlen > 32: check for null bytes in the first 16 bytes using
unaligned accesses. Return the length if found. (If those 16 bytes
could cross a PAGE_SIZE boundary, the code of case 1 is used instead.)
Otherwise align the pointer up to 16 bytes, deduct the bytes already
covered from maxlen and:
2.1 Case remaining maxlen < 64: loop through reading quadwords
until null bytes are found or maxlen is reached.
2.2 Case remaining maxlen >= 64: read quadwords until the pointer
is 64 bytes aligned, then set up a counter to loop through
reading in strides of 64 bytes. If the loop finishes with null
bytes not found, process the remaining bytes by switching to the
loop of heuristic 2.1. */
#include <sysdep.h>
/* Define default page size to 4KB. */
#define PAGE_SIZE 4096
/* size_t [r3] strnlen (const char *s [r3], size_t maxlen [r4]) */
/* Entry point. On entry r3 == s and r4 == maxlen; the length is
returned in r3. This part selects the strategy from maxlen and from
the position of s inside its 4KB page, then probes the first 16 bytes
of s with two unaligned doubleword loads. */
.machine power8
ENTRY_TOCLESS (__strnlen)
CALL_MCOUNT 2
dcbt 0,r3 /* Cache-touch hint for the start of s. */
cmpldi r4,32 /* Check if maxlen <= 32. */
ble L(small_range) /* If maxlen <= 32. */
/* The upcoming 16-byte unaligned access must not cross a page boundary,
otherwise it may raise a memory access error. The access is
disallowed when:
((size_t) s) % PAGE_SIZE > (PAGE_SIZE - 16)
In that case switch to the code that handles the string when
maxlen <= 32; it only issues aligned doubleword loads and accepts
any maxlen. */
clrldi r10,r3,52 /* r10 = low 12 bits of s, i.e. s % 4096. */
cmpldi cr7,r10,PAGE_SIZE-16
bgt cr7,L(small_range) /* If less than 16B of page end. */
li r7,0 /* r7 = 0: the byte value cmpb searches for. */
/* Build in r8 the bpermd selector used by L(early_find). Each of its
bytes is the index (0x00, 0x08, ..., 0x38) of the most significant bit
of one byte of a cmpb result, so bpermd packs one bit per string byte
into an 8-bit mask. The byte order depends on endianness so that the
first byte in memory always lands in the most significant mask bit,
which lets L(early_find) use a count of leading zeros.
Little-endian value: 0x3830282018100800
Big-endian value: 0x0008101820283038 */
#ifdef __LITTLE_ENDIAN__
li r8,0x2820
oris r8,r8,0x3830
sldi r8,r8,32
ori r8,r8,0x0800
oris r8,r8,0x1810
#else
li r8,0x1018
oris r8,r8,0x0008
sldi r8,r8,32
ori r8,r8,0x3038
oris r8,r8,0x2028
#endif
/* maxlen > 32. Optimistically check for null bytes in the first
16 bytes of the string using unaligned accesses. */
ld r5,0(r3) /* DWORD1: bytes 0-7 of s. */
ld r6,8(r3) /* DWORD2: bytes 8-15 of s. */
cmpb r10,r7,r5 /* Check for null bytes in DWORD1: r10 has 0xff in
every byte position where DWORD1 is zero. */
cmpb r11,r7,r6 /* Check for null bytes in DWORD2. */
or. r7,r10,r11 /* Non-zero (cr0 not EQ) if either has a match. */
bne cr0, L(early_find) /* If found null bytes. */
/* At this point maxlen > 32 and no null byte was found in the first
16 bytes. Prepare for the loops using VMX. */
/* r3 == s, r4 == maxlen. All other volatile regs are unused now. */
addi r5,r3,16 /* Step over the 16B already checked; the result
is rounded down to a quadword boundary below. */
li r0,15
and r7,r5,r0 /* r7 = (s + 16) % 16: offset into 16B alignment. */
andc r5,r5,r0 /* r5 = s + 16 rounded down to 16B, i.e. the first
quadword boundary after s. */
li r0,16
subf r0,r7,r0 /* r0 = 16 - r7 = r5 - s: bytes of s before r5. */
subf r4,r0,r4 /* Deduct those bytes from maxlen. */
/* Compute offsets for vmx loads, and precompute the vbpermq
constants for both the 64B and 16B loops. */
li r6,0 /* r6 = 0: index register for the lvx loads. */
vspltisb v0,0 /* v0 = 0 in every byte: the value searched for. */
vspltisb v10,3 /* Shift count of 3 in every byte. */
lvsl v11,r6,r6 /* Address 0 gives v11 = bytes 0x00, 0x01, ..., 0x0f. */
vslb v10,v11,v10 /* v10 = bytes 0x00, 0x08, ..., 0x78: the bit index
of the first bit of each byte, for vbpermq. */
cmpldi r4,64 /* Check remaining maxlen < 64. */
blt L(smaller) /* If remaining maxlen < 64. */
/* The 64B loop needs a 64-byte aligned pointer. r5 is already 16B
aligned, so read quadwords until it is 64B aligned or null bytes are
found. At most three reads are needed, so the step is unrolled three
times to avoid a counter. Since r4 >= 64 here, every quadword read
below lies entirely within maxlen. */
andi. r7,r5,63 /* Check if is 64 bytes aligned. */
beq cr0,L(preloop_64B) /* If it is already 64B aligned. */
lvx v1,r5,r6
vcmpequb. v1,v1,v0 /* v1 = 0xff where the byte is null; cr6 is set
from the result. */
addi r5,r5,16 /* Advance before branching; the found path undoes
this. */
addi r4,r4,-16 /* Decrement maxlen in 16 bytes. */
bne cr6,L(found_aligning64B) /* If found null bytes. */
/* Second copy of the step above. */
andi. r7,r5,63
beq cr0,L(preloop_64B)
lvx v1,r5,r6
vcmpequb. v1,v1,v0
addi r5,r5,16
addi r4,r4,-16
bne cr6,L(found_aligning64B)
/* Third and last copy; afterwards r5 is 64B aligned and execution falls
through into L(preloop_64B). */
andi. r7,r5,63
beq cr0,L(preloop_64B)
lvx v1,r5,r6
vcmpequb. v1,v1,v0
addi r5,r5,16
addi r4,r4,-16
bne cr6,L(found_aligning64B)
/* At this point r5 is 64 bytes aligned.
Prepare for the 64B loop.
Registers: r3 == s, r4 == remaining maxlen, r5 == current pointer,
r6 == 0, v0 == 0 in every byte. */
.p2align 4
L(preloop_64B):
/* Check if the remaining maxlen has become less than 64 while aligning,
which disallows the 64B loop. If so, switch to the 16B loop code. */
cmpldi r4,64 /* Check if maxlen < 64. */
blt L(smaller) /* If maxlen < 64. */
/* Offsets of the second, third and fourth quadword of a 64B block. */
li r7,16
li r10,32
li r9,48
/* Compute the number of 64 bytes iterations needed. */
srdi r11,r4,6 /* Compute loop count (maxlen / 64). */
andi. r4,r4,63 /* Set maxlen the remainder (maxlen % 64). */
mtctr r11 /* Move loop count to counter register. */
/* Handle maxlen >= 64. Loop over the bytes in strides of 64B. */
.p2align 4
L(loop_64B):
lvx v1,r5,r6 /* r5 is the current pointer into s; r6 == 0. */
lvx v2,r5,r7
lvx v3,r5,r10
lvx v4,r5,r9
/* Reduce the four 16B vectors with a byte-wise unsigned minimum. If
any of the 64 bytes is null, the matching byte lane of v7 is zero. */
vminub v5,v1,v2
vminub v6,v3,v4
vminub v7,v5,v6
vcmpequb. v7,v7,v0 /* Check for null bytes. */
addi r5,r5,64 /* Advance to the next block; L(found_64B) undoes
this. */
bne cr6,L(found_64B) /* If found null bytes. */
bdnz L(loop_64B) /* Continue the loop if count > 0. */
/* Hit loop end without null match. Fall through to handle the
remainder. */
/* The 16B loop below handles two cases:
1. The remaining maxlen was already below 64 before the 64B loop
could start (the branches to L(smaller) above).
2. maxlen >= 64, and the 64B loop ended with null bytes not found.
Thus handle the remainder bytes (maxlen % 64) here.
In both cases 0 <= r4 < 64, r5 is 16B aligned and r6 == 0. */
.p2align 4
L(smaller):
cmpldi r4,0 /* Check maxlen is zero. */
beq L(done) /* If maxlen is zero, r5 is s + maxlen. */
/* Place the remaining byte count into every byte of a vmx register,
and use some vector tricks to minimize branching. */
mtvrd v7,r4 /* copy maxlen from gpr to vector register. */
vspltisb v5,1
vspltisb v6,15
vspltb v2,v7,7 /* v2 = low byte of r4 (r4 < 64) in every byte. */
vaddubs v3,v5,v6 /* v3 = 1 + 15 = 16 in every byte. */
#ifdef __LITTLE_ENDIAN__
vspltish v5,1 /* v5 = 1 in each halfword, used after the loop to
form the trailing-zeros mask. */
#endif
/* Loop in 16B aligned increments now. */
.p2align 4
L(loop_16B):
lvx v1,r5,r6 /* Load quadword into vector register. */
addi r5,r5,16 /* Increment address to next 16B block. */
vor v7,v2,v2 /* Save the count before this block (v2) into v7. */
vsububs v2,v2,v3 /* Subtract 16B from count, saturate at 0. */
vminub v4,v1,v2 /* A byte of v4 is zero if the same byte of v1 is
null, or if the count has reached zero. */
vcmpequb. v4,v4,v0 /* Checking for null bytes. */
beq cr6,L(loop_16B) /* If null bytes not found and bytes remain. */
/* Either a null byte is in v1 or this was the last block. Compute the
offset of the first null byte in the block, 16 if there is none. */
vcmpequb v1,v1,v0 /* 0xff where the byte is null. */
vbpermq v1,v1,v10 /* Gather one bit per byte into a 16-bit mask. */
#ifdef __LITTLE_ENDIAN__
vsubuhm v2,v1,v5 /* Form a mask of trailing zeros. */
vandc v2,v2,v1
vpopcnth v1,v2 /* count of trailing zeros, 16 if none. */
#else
vclzh v1,v1 /* count the leading zeros, 16 if none. */
#endif
/* Truncate to maximum allowable offset. */
vcmpgtub v2,v1,v7 /* Is the offset beyond the bytes that were left
(v7) when this block was loaded? */
vsel v1,v1,v7,v2 /* If so use v7 instead. 0-16 is now in byte 7. */
mfvrd r0,v1
addi r5,r5,-16 /* Undo speculative bump. */
extsb r0,r0 /* Keep the low byte only; clears whatever gunk is in
the high 56b. */
add r5,r5,r0 /* Add the offset of whatever was found. */
L(done):
subf r3,r3,r5 /* Length = r5 (address of the null byte, or of the
maxlen limit) minus the pointer to s. */
blr /* Done. */
/* Handle case of maxlen >= 64 and null bytes found in the last block
of 64 bytes read.
Registers: v1-v4 == the four quadwords of that block, v0 == 0,
v10 == vbpermq bit indexes, r5 == address just past the block,
r3 == pointer to s. */
.p2align 4
L(found_64B):
/* A zero was found. Reduce the result. */
vcmpequb v1,v1,v0 /* 0xff in every byte position holding a null. */
vcmpequb v2,v2,v0
vcmpequb v3,v3,v0
vcmpequb v4,v4,v0
/* Permute the first bit of each byte into bits 48-63. */
vbpermq v1,v1,v10
vbpermq v2,v2,v10
vbpermq v3,v3,v10
vbpermq v4,v4,v10
/* Shift each 16-bit mask into its own position of the upper doubleword
so that the four can be merged into one 64-bit mask. The quadword at
the lowest address goes to the least significant end on little-endian
and to the most significant end on big-endian. */
#ifdef __LITTLE_ENDIAN__
vsldoi v2,v2,v2,2
vsldoi v3,v3,v3,4
vsldoi v4,v4,v4,6
#else
vsldoi v1,v1,v1,6
vsldoi v2,v2,v2,4
vsldoi v3,v3,v3,2
#endif
/* Merge the results and move to a GPR. */
vor v1,v2,v1
vor v2,v3,v4
vor v4,v1,v2
/* Adjust address to the start of the current 64B block. */
addi r5,r5,-64
mfvrd r10,v4 /* r10 = 64-bit mask, one bit per byte of the block. */
#ifdef __LITTLE_ENDIAN__
addi r9,r10,-1 /* Form a mask from trailing zeros. */
andc r9,r9,r10
popcntd r0,r9 /* Count the bits in the mask. */
#else
cntlzd r0,r10 /* Count leading zeros before the match. */
#endif
/* r0 == offset (0-63) of the first null byte within the block. */
subf r5,r3,r5 /* Offset of the block from s. */
add r3,r5,r0 /* Compute final length. */
blr /* Done. */
/* Handle case where null bytes were found while aligning
as a preparation for the 64B loop.
Registers: v1 == vcmpequb result for the quadword just read (0xff where
the byte is null), v10 == vbpermq bit indexes, r5 == address just past
that quadword, r3 == pointer to s. */
.p2align 4
L(found_aligning64B):
vbpermq v1,v1,v10 /* Gather one bit per byte into a 16-bit mask. */
#ifdef __LITTLE_ENDIAN__
mfvrd r10,v1
addi r9,r10,-1 /* Form a mask from trailing zeros. */
andc r9,r9,r10
popcntd r0,r9 /* Count the bits in the mask. */
#else
vsldoi v1,v1,v1,6 /* Move the 16-bit mask to the top of the upper
doubleword so that cntlzd counts from it. */
mfvrd r10,v1
cntlzd r0,r10 /* Count leading zeros before the match. */
#endif
addi r5,r5,-16 /* Adjust address back to the start of the last
16 bytes read. */
/* Length = offset of that quadword from s, plus the number of bytes
before the match inside it (r0). */
subf r5,r3,r5
add r3,r5,r0
blr /* Done. */
/* Handle case of maxlen > 32 and a null byte found within the first
16 bytes of s.
Registers: r8 == bpermd selector, r10 == cmpb result for bytes 0-7 of s,
r11 == cmpb result for bytes 8-15 of s.
The result is below 16 and maxlen > 32, so no comparison against maxlen
is needed. */
.p2align 4
L(early_find):
bpermd r5,r8,r10 /* r8 contains the bit permute constants; r5 gets
an 8-bit mask, one bit per byte of r10. */
bpermd r6,r8,r11 /* Same for the second doubleword. */
sldi r5,r5,8 /* First doubleword goes to the upper 8 bits. */
or r5,r5,r6 /* r5 now holds a 16-bit mask of the null byte
positions, first byte of s in the top bit. */
cntlzd r5,r5 /* Count leading zeros. */
addi r3,r5,-48 /* Deduct the 48 leading zeros always
present above the 16-bit mask. */
blr /* Done. */
/* Handle case of maxlen <= 32. Use the POWER7 algorithm. This code is
also reached with a larger maxlen when s is within 16 bytes of the end
of a 4KB page, because it only issues aligned doubleword loads. */
.p2align 4
L(small_range):
clrrdi r8,r3,3 /* Align the pointer to 8B. */
li r0,0 /* r0 = 0: the byte value cmpb searches for. */
/* Register's content at this point:
r3 == pointer to s, r4 == maxlen, r8 == pointer to s aligned to 8B.
r7 is computed below as the last acceptable address. */
cmpldi r4,0 /* Check if maxlen is zero. */
beq L(end_max) /* If maxlen is zero. */
/* Calculate the last acceptable address and check for possible
addition overflow by using saturated math:
r7 = r3 + r4
r7 |= -(r7 < r3) */
add r7,r3,r4
subfc r6,r3,r7 /* CA = 1 if r7 >= r3, i.e. the add did not wrap. */
subfe r9,r9,r9 /* r9 = CA - 1: 0 if no wrap, -1 if wrapped. The
previous content of r9 does not matter. */
extsw r6,r9
or r7,r7,r6 /* Saturate r7 to all ones on overflow. */
addi r7,r7,-1 /* Address of the last byte that may be read. */
clrrdi r7,r7,3 /* Align to 8B address of last
acceptable address. */
rlwinm r6,r3,3,26,28 /* Calculate padding: r6 = (s % 8) * 8, the
number of bits taken by bytes before s. */
ld r12,0(r8) /* Load aligned doubleword. */
cmpb r10,r12,r0 /* Check for null bytes. */
/* Discard matches in the bytes that precede s in this doubleword. */
#ifdef __LITTLE_ENDIAN__
srd r10,r10,r6
sld r10,r10,r6
#else
sld r10,r10,r6
srd r10,r10,r6
#endif /* __LITTLE_ENDIAN__ */
cmpldi cr7,r10,0
bne cr7,L(done_small) /* If found null byte. */
cmpld r8,r7 /* Check if reached maxlen. */
beq L(end_max) /* If reached maxlen. */
/* Still handling the small_range case. Read aligned doublewords until
null bytes are found or maxlen is reached. */
.p2align 4
L(loop_small):
ldu r12,8(r8) /* Load next doubleword and update r8. */
cmpb r10,r12,r0 /* Check for null bytes. */
cmpldi cr6,r10,0
bne cr6,L(done_small) /* If found null bytes. */
cmpld r8,r7 /* Check if reached maxlen. */
bne L(loop_small) /* If it has more bytes to read. */
mr r3,r4 /* Reached maxlen with null bytes not found.
Length is equal to maxlen. */
blr /* Done. */
/* Still handling the small_range case. Found null bytes.
Registers: r10 == match bits within doubleword, r8 == address of
last doubleword read, r3 == pointer to s, r4 == maxlen.
The match may lie beyond maxlen inside the last doubleword, so the
result is clamped to maxlen. */
.p2align 4
L(done_small):
#ifdef __LITTLE_ENDIAN__
/* Count trailing zeros. */
addi r0,r10,-1
andc r0,r0,r10
popcntd r0,r0
#else
cntlzd r0,r10 /* Count leading zeros before the match. */
#endif
sub r3,r8,r3 /* Offset of the last doubleword read from s; this
is negative when that doubleword starts before s. */
srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
add r3,r3,r0 /* Length until the match. */
cmpld r3,r4 /* Check length is greater than maxlen. */
blelr /* Return the length if it is <= maxlen. */
mr r3,r4 /* If length is greater than maxlen, return
maxlen. */
blr
/* Handle case of reached maxlen with null bytes not found. Also used
when maxlen is zero, in which case the returned length is zero. */
.p2align 4
L(end_max):
mr r3,r4 /* Length is equal to maxlen. */
blr /* Done. */
END (__strnlen)
/* The macros below are defined outside this file; judging by their
names they make strnlen a weak alias of __strnlen and add hidden
definitions for both symbols. */
libc_hidden_def (__strnlen)
weak_alias (__strnlen, strnlen)
libc_hidden_def (strnlen)