PowerPC LE strlen

http://sourceware.org/ml/libc-alpha/2013-08/msg00097.html This is the first of nine patches adding little-endian support to the existing optimised string and memory functions. I did spend some time with a power7 simulator looking at cycle by cycle behaviour for memchr, but most of these patches have not been run on cpu simulators to check that we are going as fast as possible. I'm sure PowerPC can do better. However, the little-endian support mostly leaves main loops unchanged, so I'm banking on previous authors having done a good job on big-endian.. As with most code you stare at long enough, I found some improvements for big-endian too. Little-endian support for strlen. Like most of the string functions, I leave the main word or multiple-word loops substantially unchanged, just needing to modify the tail. Removing the branch in the power7 functions is just a tidy. .align produces a branch anyway. Modifying regs in the non-power7 functions is to suit the new little-endian tail. * sysdeps/powerpc/powerpc64/power7/strlen.S (strlen): Add little-endian support. Don't branch over align. * sysdeps/powerpc/powerpc32/power7/strlen.S: Likewise. * sysdeps/powerpc/powerpc64/strlen.S (strlen): Add little-endian support. Rearrange tmp reg use to suit. Comment. * sysdeps/powerpc/powerpc32/strlen.S: Likewise.
2024-12-23 11:20:07 +00:00 · 2013-08-17 18:40:11 +09:30 · 2013-08-17 18:40:11 +09:30 · db9b4570c5
commit db9b4570c5
parent f7c399cff5
5 changed files with 140 additions and 47 deletions
--- a/9
+++ b/9
@ -1,3 +1,12 @@
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
+	* sysdeps/powerpc/powerpc64/power7/strlen.S (strlen): Add little-endian
+	support.  Don't branch over align.
+	* sysdeps/powerpc/powerpc32/power7/strlen.S: Likewise.
+	* sysdeps/powerpc/powerpc64/strlen.S (strlen): Add little-endian
+	support.  Rearrange tmp reg use to suit.  Comment.
+	* sysdeps/powerpc/powerpc32/strlen.S: Likewise.
+
 2013-10-04  Alan Modra  <amodra@gmail.com>

 	* sysdeps/unix/sysv/linux/powerpc/bits/sigstack.h: New file.
--- a/sysdeps/powerpc/powerpc32/power7/strlen.S
+++ b/sysdeps/powerpc/powerpc32/power7/strlen.S
@ -29,7 +29,11 @@ ENTRY (strlen)
 	li	r0,0	      /* Word with null chars to use with cmpb.  */
 	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
 	lwz	r12,0(r4)     /* Load word from memory.  */
+#ifdef __LITTLE_ENDIAN__
+	slw	r5,r5,r6
+#else
 	srw	r5,r5,r6      /* MASK = MASK >> padding.  */
+#endif
 	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
 	cmpb	r10,r9,r0     /* Check for null bytes in WORD1.  */
 	cmpwi	cr7,r10,0     /* If r10 == 0, no null's have been found.  */
@ -47,9 +51,6 @@ ENTRY (strlen)
 	cmpb	r10,r12,r0
 	cmpwi	cr7,r10,0
 	bne	cr7,L(done)
-	b	L(loop)	      /* We branch here (rather than falling through)
-				 to skip the nops due to heavy alignment
-				 of the loop below.  */

 	/* Main loop to look for the end of the string.  Since it's a
 	   small loop (< 8 instructions), align it to 32-bytes.  */
@ -86,9 +87,15 @@ L(loop):
 	   0xff in the same position as the null byte in the original
 	   word from the string.  Use that to calculate the length.  */
 L(done):
-	cntlzw	r0,r10	      /* Count leading zeroes before the match.  */
+#ifdef __LITTLE_ENDIAN__
+	addi	r9, r10, -1   /* Form a mask from trailing zeros.  */
+	andc	r9, r9, r10
+	popcntw r0, r9	      /* Count the bits in the mask.  */
+#else
+	cntlzw	r0,r10	      /* Count leading zeros before the match.  */
+#endif
 	subf	r5,r3,r4
-	srwi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
+	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r5,r0      /* Compute final length.  */
 	blr
 END (strlen)
--- a/sysdeps/powerpc/powerpc32/strlen.S
+++ b/sysdeps/powerpc/powerpc32/strlen.S
@ -29,7 +29,12 @@
      1 is subtracted you get a value in the range 0x00-0x7f, none of which
      have their high bit set. The expression here is
      (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
-      there were no 0x00 bytes in the word.
+      there were no 0x00 bytes in the word.  You get 0x80 in bytes that
+      match, but possibly false 0x80 matches in the next more significant
+      byte to a true match due to carries.  For little-endian this is
+      of no consequence since the least significant match is the one
+      we're interested in, but big-endian needs method 2 to find which
+      byte matches.

   2) Given a word 'x', we can test to see _which_ byte was zero by
      calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
@ -72,7 +77,7 @@

 ENTRY (strlen)

-#define rTMP1	r0
+#define rTMP4	r0
 #define rRTN	r3	/* incoming STR arg, outgoing result */
 #define rSTR	r4	/* current string position */
 #define rPADN	r5	/* number of padding bits we prepend to the
@ -82,9 +87,9 @@ ENTRY (strlen)
 #define rWORD1	r8	/* current string word */
 #define rWORD2	r9	/* next string word */
 #define rMASK	r9	/* mask for first string word */
-#define rTMP2	r10
-#define rTMP3	r11
-#define rTMP4	r12
+#define rTMP1	r10
+#define rTMP2	r11
+#define rTMP3	r12


 	clrrwi	rSTR, rRTN, 2
@ -93,15 +98,20 @@ ENTRY (strlen)
 	lwz	rWORD1, 0(rSTR)
 	li	rMASK, -1
 	addi	r7F7F, r7F7F, 0x7f7f
-/* That's the setup done, now do the first pair of words.
-   We make an exception and use method (2) on the first two words, to reduce
-   overhead.  */
+/* We use method (2) on the first two words, because rFEFE isn't
+   required which reduces setup overhead.  Also gives a faster return
+   for small strings on big-endian due to needing to recalculate with
+   method (2) anyway.  */
+#ifdef __LITTLE_ENDIAN__
+	slw	rMASK, rMASK, rPADN
+#else
 	srw	rMASK, rMASK, rPADN
+#endif
 	and	rTMP1, r7F7F, rWORD1
 	or	rTMP2, r7F7F, rWORD1
 	add	rTMP1, rTMP1, r7F7F
-	nor	rTMP1, rTMP2, rTMP1
-	and.	rWORD1, rTMP1, rMASK
+	nor	rTMP3, rTMP2, rTMP1
+	and.	rTMP3, rTMP3, rMASK
 	mtcrf	0x01, rRTN
 	bne	L(done0)
 	lis	rFEFE, -0x101
@ -110,11 +120,12 @@ ENTRY (strlen)
 	bt	29, L(loop)

 /* Handle second word of pair.  */
+/* Perhaps use method (1) here for little-endian, saving one instruction?  */
 	lwzu	rWORD1, 4(rSTR)
 	and	rTMP1, r7F7F, rWORD1
 	or	rTMP2, r7F7F, rWORD1
 	add	rTMP1, rTMP1, r7F7F
-	nor.	rWORD1, rTMP2, rTMP1
+	nor.	rTMP3, rTMP2, rTMP1
 	bne	L(done0)

 /* The loop.  */
@ -128,28 +139,52 @@ L(loop):
 	add	rTMP3, rFEFE, rWORD2
 	nor	rTMP4, r7F7F, rWORD2
 	bne	L(done1)
-	and.	rTMP1, rTMP3, rTMP4
+	and.	rTMP3, rTMP3, rTMP4
 	beq	L(loop)

+#ifndef __LITTLE_ENDIAN__
 	and	rTMP1, r7F7F, rWORD2
 	add	rTMP1, rTMP1, r7F7F
-	andc	rWORD1, rTMP4, rTMP1
+	andc	rTMP3, rTMP4, rTMP1
 	b	L(done0)

 L(done1):
 	and	rTMP1, r7F7F, rWORD1
 	subi	rSTR, rSTR, 4
 	add	rTMP1, rTMP1, r7F7F
-	andc	rWORD1, rTMP2, rTMP1
+	andc	rTMP3, rTMP2, rTMP1

 /* When we get to here, rSTR points to the first word in the string that
-   contains a zero byte, and the most significant set bit in rWORD1 is in that
-   byte.  */
+   contains a zero byte, and rTMP3 has 0x80 for bytes that are zero,
+   and 0x00 otherwise.  */
 L(done0):
-	cntlzw	rTMP3, rWORD1
+	cntlzw	rTMP3, rTMP3
 	subf	rTMP1, rRTN, rSTR
 	srwi	rTMP3, rTMP3, 3
 	add	rRTN, rTMP1, rTMP3
 	blr
+#else
+
+L(done0):
+	addi	rTMP1, rTMP3, -1	/* Form a mask from trailing zeros.  */
+	andc	rTMP1, rTMP1, rTMP3
+	cntlzw	rTMP1, rTMP1		/* Count bits not in the mask.  */
+	subf	rTMP3, rRTN, rSTR
+	subfic	rTMP1, rTMP1, 32-7
+	srwi	rTMP1, rTMP1, 3
+	add	rRTN, rTMP1, rTMP3
+	blr
+
+L(done1):
+	addi	rTMP3, rTMP1, -1
+	andc	rTMP3, rTMP3, rTMP1
+	cntlzw	rTMP3, rTMP3
+	subf	rTMP1, rRTN, rSTR
+	subfic	rTMP3, rTMP3, 32-7-32
+	srawi	rTMP3, rTMP3, 3
+	add	rRTN, rTMP1, rTMP3
+	blr
+#endif
+
 END (strlen)
 libc_hidden_builtin_def (strlen)
--- a/sysdeps/powerpc/powerpc64/power7/strlen.S
+++ b/sysdeps/powerpc/powerpc64/power7/strlen.S
@ -30,7 +30,11 @@ ENTRY (strlen)
 				 with cmpb.  */
 	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
 	ld	r12,0(r4)     /* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+	sld	r5,r5,r6
+#else
 	srd	r5,r5,r6      /* MASK = MASK >> padding.  */
+#endif
 	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
 	cmpb	r10,r9,r0     /* Check for null bytes in DWORD1.  */
 	cmpdi	cr7,r10,0     /* If r10 == 0, no null's have been found.  */
@ -48,9 +52,6 @@ ENTRY (strlen)
 	cmpb	r10,r12,r0
 	cmpdi	cr7,r10,0
 	bne	cr7,L(done)
-	b	L(loop)	      /* We branch here (rather than falling through)
-				 to skip the nops due to heavy alignment
-				 of the loop below.  */

 	/* Main loop to look for the end of the string.  Since it's a
 	   small loop (< 8 instructions), align it to 32-bytes.  */
@ -87,9 +88,15 @@ L(loop):
 	   0xff in the same position as the null byte in the original
 	   doubleword from the string.  Use that to calculate the length.  */
 L(done):
-	cntlzd	r0,r10	      /* Count leading zeroes before the match.  */
+#ifdef __LITTLE_ENDIAN__
+	addi	r9, r10, -1   /* Form a mask from trailing zeros.  */
+	andc	r9, r9, r10
+	popcntd r0, r9	      /* Count the bits in the mask.  */
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
+#endif
 	subf	r5,r3,r4
-	srdi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
 	add	r3,r5,r0      /* Compute final length.  */
 	blr
 END (strlen)
--- a/sysdeps/powerpc/powerpc64/strlen.S
+++ b/sysdeps/powerpc/powerpc64/strlen.S
@ -29,7 +29,12 @@
      1 is subtracted you get a value in the range 0x00-0x7f, none of which
      have their high bit set. The expression here is
      (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
-      there were no 0x00 bytes in the word.
+      there were no 0x00 bytes in the word.  You get 0x80 in bytes that
+      match, but possibly false 0x80 matches in the next more significant
+      byte to a true match due to carries.  For little-endian this is
+      of no consequence since the least significant match is the one
+      we're interested in, but big-endian needs method 2 to find which
+      byte matches.

   2) Given a word 'x', we can test to see _which_ byte was zero by
      calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
@ -62,7 +67,7 @@
   Answer:
   1) Added a Data Cache Block Touch early to prefetch the first 128
   byte cache line. Adding dcbt instructions to the loop would not be
-   effective since most strings will be shorter than the cache line.*/
+   effective since most strings will be shorter than the cache line.  */

 /* Some notes on register usage: Under the SVR4 ABI, we can use registers
   0 and 3 through 12 (so long as we don't call any procedures) without
@ -78,7 +83,7 @@
 ENTRY (strlen)
 	CALL_MCOUNT 1

-#define rTMP1	r0
+#define rTMP4	r0
 #define rRTN	r3	/* incoming STR arg, outgoing result */
 #define rSTR	r4	/* current string position */
 #define rPADN	r5	/* number of padding bits we prepend to the
@ -88,9 +93,9 @@ ENTRY (strlen)
 #define rWORD1	r8	/* current string doubleword */
 #define rWORD2	r9	/* next string doubleword */
 #define rMASK	r9	/* mask for first string doubleword */
-#define rTMP2	r10
-#define rTMP3	r11
-#define rTMP4	r12
+#define rTMP1	r10
+#define rTMP2	r11
+#define rTMP3	r12

 	dcbt	0,rRTN
 	clrrdi	rSTR, rRTN, 3
@ -100,30 +105,36 @@ ENTRY (strlen)
 	addi	r7F7F, r7F7F, 0x7f7f
 	li	rMASK, -1
 	insrdi	r7F7F, r7F7F, 32, 0
-/* That's the setup done, now do the first pair of doublewords.
-   We make an exception and use method (2) on the first two doublewords,
-   to reduce overhead.  */
+/* We use method (2) on the first two doublewords, because rFEFE isn't
+   required which reduces setup overhead.  Also gives a faster return
+   for small strings on big-endian due to needing to recalculate with
+   method (2) anyway.  */
+#ifdef __LITTLE_ENDIAN__
+	sld	rMASK, rMASK, rPADN
+#else
 	srd	rMASK, rMASK, rPADN
+#endif
 	and	rTMP1, r7F7F, rWORD1
 	or	rTMP2, r7F7F, rWORD1
 	lis	rFEFE, -0x101
 	add	rTMP1, rTMP1, r7F7F
 	addi	rFEFE, rFEFE, -0x101
-	nor	rTMP1, rTMP2, rTMP1
-	and.	rWORD1, rTMP1, rMASK
+	nor	rTMP3, rTMP2, rTMP1
+	and.	rTMP3, rTMP3, rMASK
 	mtcrf	0x01, rRTN
 	bne	L(done0)
-	sldi  rTMP1, rFEFE, 32
-	add  rFEFE, rFEFE, rTMP1
+	sldi	rTMP1, rFEFE, 32
+	add	rFEFE, rFEFE, rTMP1
 /* Are we now aligned to a doubleword boundary?  */
 	bt	28, L(loop)

 /* Handle second doubleword of pair.  */
+/* Perhaps use method (1) here for little-endian, saving one instruction?  */
 	ldu	rWORD1, 8(rSTR)
 	and	rTMP1, r7F7F, rWORD1
 	or	rTMP2, r7F7F, rWORD1
 	add	rTMP1, rTMP1, r7F7F
-	nor.	rWORD1, rTMP2, rTMP1
+	nor.	rTMP3, rTMP2, rTMP1
 	bne	L(done0)

 /* The loop.  */
@ -137,28 +148,52 @@ L(loop):
 	add	rTMP3, rFEFE, rWORD2
 	nor	rTMP4, r7F7F, rWORD2
 	bne	L(done1)
-	and.	rTMP1, rTMP3, rTMP4
+	and.	rTMP3, rTMP3, rTMP4
 	beq	L(loop)

+#ifndef __LITTLE_ENDIAN__
 	and	rTMP1, r7F7F, rWORD2
 	add	rTMP1, rTMP1, r7F7F
-	andc	rWORD1, rTMP4, rTMP1
+	andc	rTMP3, rTMP4, rTMP1
 	b	L(done0)

 L(done1):
 	and	rTMP1, r7F7F, rWORD1
 	subi	rSTR, rSTR, 8
 	add	rTMP1, rTMP1, r7F7F
-	andc	rWORD1, rTMP2, rTMP1
+	andc	rTMP3, rTMP2, rTMP1

 /* When we get to here, rSTR points to the first doubleword in the string that
-   contains a zero byte, and the most significant set bit in rWORD1 is in that
-   byte.  */
+   contains a zero byte, and rTMP3 has 0x80 for bytes that are zero, and 0x00
+   otherwise.  */
 L(done0):
-	cntlzd	rTMP3, rWORD1
+	cntlzd	rTMP3, rTMP3
 	subf	rTMP1, rRTN, rSTR
 	srdi	rTMP3, rTMP3, 3
 	add	rRTN, rTMP1, rTMP3
 	blr
+#else
+
+L(done0):
+	addi	rTMP1, rTMP3, -1	/* Form a mask from trailing zeros.  */
+	andc	rTMP1, rTMP1, rTMP3
+	cntlzd	rTMP1, rTMP1		/* Count bits not in the mask.  */
+	subf	rTMP3, rRTN, rSTR
+	subfic	rTMP1, rTMP1, 64-7
+	srdi	rTMP1, rTMP1, 3
+	add	rRTN, rTMP1, rTMP3
+	blr
+
+L(done1):
+	addi	rTMP3, rTMP1, -1
+	andc	rTMP3, rTMP3, rTMP1
+	cntlzd	rTMP3, rTMP3
+	subf	rTMP1, rRTN, rSTR
+	subfic	rTMP3, rTMP3, 64-7-64
+	sradi	rTMP3, rTMP3, 3
+	add	rRTN, rTMP1, rTMP3
+	blr
+#endif
+
 END (strlen)
 libc_hidden_builtin_def (strlen)