PowerPC LE strlen

http://sourceware.org/ml/libc-alpha/2013-08/msg00097.html

This is the first of nine patches adding little-endian support to the
existing optimised string and memory functions.  I did spend some
time with a power7 simulator looking at cycle by cycle behaviour for
memchr, but most of these patches have not been run on cpu simulators
to check that we are going as fast as possible.  I'm sure PowerPC can
do better.  However, the little-endian support mostly leaves main
loops unchanged, so I'm banking on previous authors having done a
good job on big-endian.  As with most code you stare at long enough,
I found some improvements for big-endian too.

Little-endian support for strlen.  Like most of the string functions,
I leave the main word or multiple-word loops substantially unchanged,
just needing to modify the tail.

Removing the branch in the power7 functions is just a tidy-up, since
.align produces a branch anyway.  Modifying regs in the non-power7
functions is to suit the new little-endian tail.

	* sysdeps/powerpc/powerpc64/power7/strlen.S (strlen): Add little-endian
	support.  Don't branch over align.
	* sysdeps/powerpc/powerpc32/power7/strlen.S: Likewise.
	* sysdeps/powerpc/powerpc64/strlen.S (strlen): Add little-endian support.
	Rearrange tmp reg use to suit.  Comment.
	* sysdeps/powerpc/powerpc32/strlen.S: Likewise.
This commit is contained in:
Alan Modra 2013-08-17 18:40:11 +09:30
parent f7c399cff5
commit db9b4570c5
5 changed files with 140 additions and 47 deletions

View File

@ -1,3 +1,12 @@
2013-10-04 Alan Modra <amodra@gmail.com>
* sysdeps/powerpc/powerpc64/power7/strlen.S (strlen): Add little-endian
support. Don't branch over align.
* sysdeps/powerpc/powerpc32/power7/strlen.S: Likewise.
* sysdeps/powerpc/powerpc64/strlen.S (strlen): Add little-endian
support. Rearrange tmp reg use to suit. Comment.
* sysdeps/powerpc/powerpc32/strlen.S: Likewise.
2013-10-04 Alan Modra <amodra@gmail.com>
* sysdeps/unix/sysv/linux/powerpc/bits/sigstack.h: New file.

View File

@ -29,7 +29,11 @@ ENTRY (strlen)
li r0,0 /* Word with null chars to use with cmpb. */
li r5,-1 /* MASK = 0xffffffffffffffff. */
lwz r12,0(r4) /* Load word from memory. */
#ifdef __LITTLE_ENDIAN__
slw r5,r5,r6
#else
srw r5,r5,r6 /* MASK = MASK >> padding. */
#endif
orc r9,r12,r5 /* Mask bits that are not part of the string. */
cmpb r10,r9,r0 /* Check for null bytes in WORD1. */
cmpwi cr7,r10,0 /* If r10 == 0, no null's have been found. */
@ -47,9 +51,6 @@ ENTRY (strlen)
cmpb r10,r12,r0
cmpwi cr7,r10,0
bne cr7,L(done)
b L(loop) /* We branch here (rather than falling through)
to skip the nops due to heavy alignment
of the loop below. */
/* Main loop to look for the end of the string. Since it's a
small loop (< 8 instructions), align it to 32-bytes. */
@ -86,9 +87,15 @@ L(loop):
0xff in the same position as the null byte in the original
word from the string. Use that to calculate the length. */
L(done):
cntlzw r0,r10 /* Count leading zeroes before the match. */
#ifdef __LITTLE_ENDIAN__
addi r9, r10, -1 /* Form a mask from trailing zeros. */
andc r9, r9, r10
popcntw r0, r9 /* Count the bits in the mask. */
#else
cntlzw r0,r10 /* Count leading zeros before the match. */
#endif
subf r5,r3,r4
srwi r0,r0,3 /* Convert leading zeroes to bytes. */
srwi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r5,r0 /* Compute final length. */
blr
END (strlen)

View File

@ -29,7 +29,12 @@
1 is subtracted you get a value in the range 0x00-0x7f, none of which
have their high bit set. The expression here is
(x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
there were no 0x00 bytes in the word.
there were no 0x00 bytes in the word. You get 0x80 in bytes that
match, but possibly false 0x80 matches in the next more significant
byte to a true match due to carries. For little-endian this is
of no consequence since the least significant match is the one
we're interested in, but big-endian needs method 2 to find which
byte matches.
2) Given a word 'x', we can test to see _which_ byte was zero by
calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
@ -72,7 +77,7 @@
ENTRY (strlen)
#define rTMP1 r0
#define rTMP4 r0
#define rRTN r3 /* incoming STR arg, outgoing result */
#define rSTR r4 /* current string position */
#define rPADN r5 /* number of padding bits we prepend to the
@ -82,9 +87,9 @@ ENTRY (strlen)
#define rWORD1 r8 /* current string word */
#define rWORD2 r9 /* next string word */
#define rMASK r9 /* mask for first string word */
#define rTMP2 r10
#define rTMP3 r11
#define rTMP4 r12
#define rTMP1 r10
#define rTMP2 r11
#define rTMP3 r12
clrrwi rSTR, rRTN, 2
@ -93,15 +98,20 @@ ENTRY (strlen)
lwz rWORD1, 0(rSTR)
li rMASK, -1
addi r7F7F, r7F7F, 0x7f7f
/* That's the setup done, now do the first pair of words.
We make an exception and use method (2) on the first two words, to reduce
overhead. */
/* We use method (2) on the first two words, because rFEFE isn't
required which reduces setup overhead. Also gives a faster return
for small strings on big-endian due to needing to recalculate with
method (2) anyway. */
#ifdef __LITTLE_ENDIAN__
slw rMASK, rMASK, rPADN
#else
srw rMASK, rMASK, rPADN
#endif
and rTMP1, r7F7F, rWORD1
or rTMP2, r7F7F, rWORD1
add rTMP1, rTMP1, r7F7F
nor rTMP1, rTMP2, rTMP1
and. rWORD1, rTMP1, rMASK
nor rTMP3, rTMP2, rTMP1
and. rTMP3, rTMP3, rMASK
mtcrf 0x01, rRTN
bne L(done0)
lis rFEFE, -0x101
@ -110,11 +120,12 @@ ENTRY (strlen)
bt 29, L(loop)
/* Handle second word of pair. */
/* Perhaps use method (1) here for little-endian, saving one instruction? */
lwzu rWORD1, 4(rSTR)
and rTMP1, r7F7F, rWORD1
or rTMP2, r7F7F, rWORD1
add rTMP1, rTMP1, r7F7F
nor. rWORD1, rTMP2, rTMP1
nor. rTMP3, rTMP2, rTMP1
bne L(done0)
/* The loop. */
@ -128,28 +139,52 @@ L(loop):
add rTMP3, rFEFE, rWORD2
nor rTMP4, r7F7F, rWORD2
bne L(done1)
and. rTMP1, rTMP3, rTMP4
and. rTMP3, rTMP3, rTMP4
beq L(loop)
#ifndef __LITTLE_ENDIAN__
and rTMP1, r7F7F, rWORD2
add rTMP1, rTMP1, r7F7F
andc rWORD1, rTMP4, rTMP1
andc rTMP3, rTMP4, rTMP1
b L(done0)
L(done1):
and rTMP1, r7F7F, rWORD1
subi rSTR, rSTR, 4
add rTMP1, rTMP1, r7F7F
andc rWORD1, rTMP2, rTMP1
andc rTMP3, rTMP2, rTMP1
/* When we get to here, rSTR points to the first word in the string that
contains a zero byte, and the most significant set bit in rWORD1 is in that
byte. */
contains a zero byte, and rTMP3 has 0x80 for bytes that are zero,
and 0x00 otherwise. */
L(done0):
cntlzw rTMP3, rWORD1
cntlzw rTMP3, rTMP3
subf rTMP1, rRTN, rSTR
srwi rTMP3, rTMP3, 3
add rRTN, rTMP1, rTMP3
blr
#else
L(done0):
addi rTMP1, rTMP3, -1 /* Form a mask from trailing zeros. */
andc rTMP1, rTMP1, rTMP3
cntlzw rTMP1, rTMP1 /* Count bits not in the mask. */
subf rTMP3, rRTN, rSTR
subfic rTMP1, rTMP1, 32-7
srwi rTMP1, rTMP1, 3
add rRTN, rTMP1, rTMP3
blr
L(done1):
addi rTMP3, rTMP1, -1
andc rTMP3, rTMP3, rTMP1
cntlzw rTMP3, rTMP3
subf rTMP1, rRTN, rSTR
subfic rTMP3, rTMP3, 32-7-32
srawi rTMP3, rTMP3, 3
add rRTN, rTMP1, rTMP3
blr
#endif
END (strlen)
libc_hidden_builtin_def (strlen)

View File

@ -30,7 +30,11 @@ ENTRY (strlen)
with cmpb. */
li r5,-1 /* MASK = 0xffffffffffffffff. */
ld r12,0(r4) /* Load doubleword from memory. */
#ifdef __LITTLE_ENDIAN__
sld r5,r5,r6
#else
srd r5,r5,r6 /* MASK = MASK >> padding. */
#endif
orc r9,r12,r5 /* Mask bits that are not part of the string. */
cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */
@ -48,9 +52,6 @@ ENTRY (strlen)
cmpb r10,r12,r0
cmpdi cr7,r10,0
bne cr7,L(done)
b L(loop) /* We branch here (rather than falling through)
to skip the nops due to heavy alignment
of the loop below. */
/* Main loop to look for the end of the string. Since it's a
small loop (< 8 instructions), align it to 32-bytes. */
@ -87,9 +88,15 @@ L(loop):
0xff in the same position as the null byte in the original
doubleword from the string. Use that to calculate the length. */
L(done):
cntlzd r0,r10 /* Count leading zeroes before the match. */
#ifdef __LITTLE_ENDIAN__
addi r9, r10, -1 /* Form a mask from trailing zeros. */
andc r9, r9, r10
popcntd r0, r9 /* Count the bits in the mask. */
#else
cntlzd r0,r10 /* Count leading zeros before the match. */
#endif
subf r5,r3,r4
srdi r0,r0,3 /* Convert leading zeroes to bytes. */
srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
add r3,r5,r0 /* Compute final length. */
blr
END (strlen)

View File

@ -29,7 +29,12 @@
1 is subtracted you get a value in the range 0x00-0x7f, none of which
have their high bit set. The expression here is
(x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
there were no 0x00 bytes in the word.
there were no 0x00 bytes in the word. You get 0x80 in bytes that
match, but possibly false 0x80 matches in the next more significant
byte to a true match due to carries. For little-endian this is
of no consequence since the least significant match is the one
we're interested in, but big-endian needs method 2 to find which
byte matches.
2) Given a word 'x', we can test to see _which_ byte was zero by
calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
@ -62,7 +67,7 @@
Answer:
1) Added a Data Cache Block Touch early to prefetch the first 128
byte cache line. Adding dcbt instructions to the loop would not be
effective since most strings will be shorter than the cache line.*/
effective since most strings will be shorter than the cache line. */
/* Some notes on register usage: Under the SVR4 ABI, we can use registers
0 and 3 through 12 (so long as we don't call any procedures) without
@ -78,7 +83,7 @@
ENTRY (strlen)
CALL_MCOUNT 1
#define rTMP1 r0
#define rTMP4 r0
#define rRTN r3 /* incoming STR arg, outgoing result */
#define rSTR r4 /* current string position */
#define rPADN r5 /* number of padding bits we prepend to the
@ -88,9 +93,9 @@ ENTRY (strlen)
#define rWORD1 r8 /* current string doubleword */
#define rWORD2 r9 /* next string doubleword */
#define rMASK r9 /* mask for first string doubleword */
#define rTMP2 r10
#define rTMP3 r11
#define rTMP4 r12
#define rTMP1 r10
#define rTMP2 r11
#define rTMP3 r12
dcbt 0,rRTN
clrrdi rSTR, rRTN, 3
@ -100,30 +105,36 @@ ENTRY (strlen)
addi r7F7F, r7F7F, 0x7f7f
li rMASK, -1
insrdi r7F7F, r7F7F, 32, 0
/* That's the setup done, now do the first pair of doublewords.
We make an exception and use method (2) on the first two doublewords,
to reduce overhead. */
/* We use method (2) on the first two doublewords, because rFEFE isn't
required which reduces setup overhead. Also gives a faster return
for small strings on big-endian due to needing to recalculate with
method (2) anyway. */
#ifdef __LITTLE_ENDIAN__
sld rMASK, rMASK, rPADN
#else
srd rMASK, rMASK, rPADN
#endif
and rTMP1, r7F7F, rWORD1
or rTMP2, r7F7F, rWORD1
lis rFEFE, -0x101
add rTMP1, rTMP1, r7F7F
addi rFEFE, rFEFE, -0x101
nor rTMP1, rTMP2, rTMP1
and. rWORD1, rTMP1, rMASK
nor rTMP3, rTMP2, rTMP1
and. rTMP3, rTMP3, rMASK
mtcrf 0x01, rRTN
bne L(done0)
sldi rTMP1, rFEFE, 32
add rFEFE, rFEFE, rTMP1
sldi rTMP1, rFEFE, 32
add rFEFE, rFEFE, rTMP1
/* Are we now aligned to a doubleword boundary? */
bt 28, L(loop)
/* Handle second doubleword of pair. */
/* Perhaps use method (1) here for little-endian, saving one instruction? */
ldu rWORD1, 8(rSTR)
and rTMP1, r7F7F, rWORD1
or rTMP2, r7F7F, rWORD1
add rTMP1, rTMP1, r7F7F
nor. rWORD1, rTMP2, rTMP1
nor. rTMP3, rTMP2, rTMP1
bne L(done0)
/* The loop. */
@ -137,28 +148,52 @@ L(loop):
add rTMP3, rFEFE, rWORD2
nor rTMP4, r7F7F, rWORD2
bne L(done1)
and. rTMP1, rTMP3, rTMP4
and. rTMP3, rTMP3, rTMP4
beq L(loop)
#ifndef __LITTLE_ENDIAN__
and rTMP1, r7F7F, rWORD2
add rTMP1, rTMP1, r7F7F
andc rWORD1, rTMP4, rTMP1
andc rTMP3, rTMP4, rTMP1
b L(done0)
L(done1):
and rTMP1, r7F7F, rWORD1
subi rSTR, rSTR, 8
add rTMP1, rTMP1, r7F7F
andc rWORD1, rTMP2, rTMP1
andc rTMP3, rTMP2, rTMP1
/* When we get to here, rSTR points to the first doubleword in the string that
contains a zero byte, and the most significant set bit in rWORD1 is in that
byte. */
contains a zero byte, and rTMP3 has 0x80 for bytes that are zero, and 0x00
otherwise. */
L(done0):
cntlzd rTMP3, rWORD1
cntlzd rTMP3, rTMP3
subf rTMP1, rRTN, rSTR
srdi rTMP3, rTMP3, 3
add rRTN, rTMP1, rTMP3
blr
#else
L(done0):
addi rTMP1, rTMP3, -1 /* Form a mask from trailing zeros. */
andc rTMP1, rTMP1, rTMP3
cntlzd rTMP1, rTMP1 /* Count bits not in the mask. */
subf rTMP3, rRTN, rSTR
subfic rTMP1, rTMP1, 64-7
srdi rTMP1, rTMP1, 3
add rRTN, rTMP1, rTMP3
blr
L(done1):
addi rTMP3, rTMP1, -1
andc rTMP3, rTMP3, rTMP1
cntlzd rTMP3, rTMP3
subf rTMP1, rRTN, rSTR
subfic rTMP3, rTMP3, 64-7-64
sradi rTMP3, rTMP3, 3
add rRTN, rTMP1, rTMP3
blr
#endif
END (strlen)
libc_hidden_builtin_def (strlen)