Optimize the strlen implementation by using a page cross check and a fast check

for nul bytes which reverts to separate loop when a non-ASCII char is encountered.
Speedup on test-strlen is ~10%, long ASCII strings are processed ~60% faster,
and on random tests it is ~80% better.
This commit is contained in:
Wilco Dijkstra 2015-07-13 12:38:12 +01:00
parent 70249b57e1
commit c435989f52
2 changed files with 171 additions and 67 deletions

View File

@ -1,3 +1,7 @@
2015-07-13 Wilco Dijkstra <wdijkstr@arm.com>
* sysdeps/aarch64/strlen.S (strlen): Optimize strlen.
2015-07-11 H.J. Lu <hongjiu.lu@intel.com>
* stdio-common/tst-fmemopen2.c (do_test_without_buffer): Replace

View File

@ -20,9 +20,13 @@
/* Assumptions:
*
* ARMv8-a, AArch64
* ARMv8-a, AArch64, unaligned accesses, min page size 4k.
*/
/* To test the page crossing code path more thoroughly, compile with
-DTEST_PAGE_CROSS - this will force all calls through the slower
entry path. This option is not intended for production use. */
/* Arguments and results. */
#define srcin x0
#define len x0
@ -31,87 +35,183 @@
#define src x1
#define data1 x2
#define data2 x3
#define data2a x4
#define has_nul1 x5
#define has_nul2 x6
#define tmp1 x7
#define tmp2 x8
#define tmp3 x9
#define tmp4 x10
#define zeroones x11
#define pos x12
#define has_nul1 x4
#define has_nul2 x5
#define tmp1 x4
#define tmp2 x5
#define tmp3 x6
#define tmp4 x7
#define zeroones x8
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. A faster check
(X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
false hits for characters 129..255. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
/* Start of critial section -- keep to one 64Byte cache line. */
ENTRY_ALIGN (strlen, 6)
mov zeroones, #REP8_01
bic src, srcin, #15
ands tmp1, srcin, #15
b.ne L(misaligned)
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
/* The inner loop deals with two Dwords at a time. This has a
slightly higher start-up cost, but we should win quite quickly,
especially on cores with a high number of issue slots per
cycle, as we get much better parallelism out of the operations. */
L(loop):
ldp data1, data2, [src], #16
L(realigned):
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
bic has_nul1, tmp1, tmp2
bics has_nul2, tmp3, tmp4
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
b.eq L(loop)
/* End of critical section -- keep to one 64Byte cache line. */
sub len, src, srcin
cbz has_nul1, L(nul_in_data2)
#ifdef __AARCH64EB__
mov data2, data1
#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 15
#else
# define MIN_PAGE_SIZE 4096
#endif
sub len, len, #8
mov has_nul2, has_nul1
L(nul_in_data2):
/* Since strings are short on average, we check the first 16 bytes
of the string for a NUL character. In order to do an unaligned ldp
safely we have to do a page cross check first. If there is a NUL
byte we calculate the length from the 2 8-byte words using
conditional select to reduce branch mispredictions (it is unlikely
strlen will be repeatedly called on strings with the same length).
If the string is longer than 16 bytes, we align src so don't need
further page cross checks, and process 32 bytes per iteration
using the fast NUL check. If we encounter non-ASCII characters,
fallback to a second loop using the full NUL check.
If the page cross check fails, we read 16 bytes from an aligned
address, remove any characters before the string, and continue
in the main loop using aligned loads. Since strings crossing a
page in the first 16 bytes are rare (probability of
16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
AArch64 systems have a minimum page size of 4k. We don't bother
checking for larger page sizes - the cost of setting up the correct
page size is just not worth the extra gain from a small reduction in
the cases taking the slow path. Note that we only care about
whether the first fetch, which may be misaligned, crosses a page
boundary. */
ENTRY_ALIGN (strlen, 6)
and tmp1, srcin, MIN_PAGE_SIZE - 1
mov zeroones, REP8_01
cmp tmp1, MIN_PAGE_SIZE - 16
b.gt L(page_cross)
ldp data1, data2, [srcin]
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul directly. The
string is 0x01) means we cannot use has_nul1/2 directly.
Since we expect strings to be small and early-exit,
byte-swap the data now so has_null1/2 will be correct. */
rev data1, data1
rev data2, data2
#endif
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
beq L(main_loop_entry)
/* Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
mov len, 8
rev has_nul1, has_nul1
clz tmp1, has_nul1
csel len, xzr, len, cc
add len, len, tmp1, lsr 3
ret
/* The inner loop processes 32 bytes per iteration and uses the fast
NUL check. If we encounter non-ASCII characters, use a second
loop with the accurate NUL check. */
.p2align 4
L(main_loop_entry):
bic src, srcin, 15
sub src, src, 16
L(main_loop):
ldp data1, data2, [src, 32]!
L(page_cross_entry):
sub tmp1, data1, zeroones
sub tmp3, data2, zeroones
orr tmp2, tmp1, tmp3
tst tmp2, zeroones, lsl 7
bne 1f
ldp data1, data2, [src, 16]
sub tmp1, data1, zeroones
sub tmp3, data2, zeroones
orr tmp2, tmp1, tmp3
tst tmp2, zeroones, lsl 7
beq L(main_loop)
add src, src, 16
1:
/* The fast check failed, so do the slower, accurate NUL check. */
orr tmp2, data1, REP8_7f
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
beq L(nonascii_loop)
/* Enter with C = has_nul1 == 0. */
L(tail):
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul1/2 directly. The
easiest way to get the correct byte is to byte-swap the data
and calculate the syndrome a second time. */
rev data2, data2
sub tmp1, data2, zeroones
orr tmp2, data2, #REP8_7f
bic has_nul2, tmp1, tmp2
csel data1, data1, data2, cc
rev data1, data1
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
bic has_nul1, tmp1, tmp2
#else
csel has_nul1, has_nul1, has_nul2, cc
#endif
sub len, len, #8
rev has_nul2, has_nul2
clz pos, has_nul2
add len, len, pos, lsr #3 /* Bits to bytes. */
RET
sub len, src, srcin
rev has_nul1, has_nul1
add tmp2, len, 8
clz tmp1, has_nul1
csel len, len, tmp2, cc
add len, len, tmp1, lsr 3
ret
L(misaligned):
cmp tmp1, #8
neg tmp1, tmp1
ldp data1, data2, [src], #16
lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
mov tmp2, #~0
L(nonascii_loop):
ldp data1, data2, [src, 16]!
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
bne L(tail)
ldp data1, data2, [src, 16]!
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
beq L(nonascii_loop)
b L(tail)
/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
srcin to 0x7f, so we ignore any NUL bytes before the string.
Then continue in the aligned loop. */
L(page_cross):
bic src, srcin, 15
ldp data1, data2, [src]
lsl tmp1, srcin, 3
mov tmp4, -1
#ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */
lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
/* Big-endian. Early bytes are at MSB. */
lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
#else
/* Little-endian. Early bytes are at LSB. */
lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
#endif
orr data1, data1, tmp2
orr data2a, data2, tmp2
csinv data1, data1, xzr, le
csel data2, data2, data2a, le
b L(realigned)
orr tmp1, tmp1, REP8_80
orn data1, data1, tmp1
orn tmp2, data2, tmp1
tst srcin, 8
csel data1, data1, tmp4, eq
csel data2, data2, tmp2, eq
b L(page_cross_entry)
END (strlen)
libc_hidden_builtin_def (strlen)