aarch64: Optimize string functions with shrn instruction
We found that the string functions were using AND+ADDP
to compute the nibble/syndrome mask, but there is a simpler
alternative: `SHRN dst.8b, src.8h, 4` (shift each 16-bit lane
right by 4 and narrow to 8 bits), which has the same latency
as ADDP on all ARMv8 SIMD targets. There are also possible
gains for memcmp, but that is left for another patch.

We see 10-20% savings for small and mid-sized cases (<= 128 bytes),
which are the primary cases in general workloads.
(cherry picked from commit 3c99806989)
parent 3393c72eb0
commit c4f4b53eee
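For illustration only (this is not code from the patch), the C sketch below uses Arm NEON intrinsics from <arm_neon.h> to compute the 64-bit nibble mask both ways on a 16-byte chunk: the old AND (repeating 0xf00f mask) + ADDP sequence and the new single SHRN by 4. The helper names and the test data are invented for this example; it is meant to build with GCC or Clang on an AArch64 target. Counting trailing zeros of the mask and dividing by four recovers the index of the first matching byte on little-endian.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Old sequence: AND with a per-halfword 0xf00f mask, then ADDP to merge
   adjacent bytes and narrow the 128-bit compare result to 64 bits.  */
static uint64_t
syndrome_and_addp (uint8x16_t matches)
{
  uint8x16_t repmask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
  uint8x16_t masked = vandq_u8 (matches, repmask);
  uint8x16_t merged = vpaddq_u8 (masked, masked);	/* 128->64 */
  return vgetq_lane_u64 (vreinterpretq_u64_u8 (merged), 0);
}

/* New sequence: one SHRN (shift right by 4, narrow 8h->8b) keeps the high
   nibble of each even byte and the low nibble of each odd byte, so the
   mask constant and the AND disappear.  */
static uint64_t
syndrome_shrn (uint8x16_t matches)
{
  uint8x8_t narrowed = vshrn_n_u16 (vreinterpretq_u16_u8 (matches), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (narrowed), 0);
}

int
main (void)
{
  /* 16-byte chunk with 'x' at byte 5 and byte 12 (made-up test data).  */
  uint8_t chunk[16] = "hellox worldxab";
  uint8x16_t data = vld1q_u8 (chunk);
  uint8x16_t matches = vceqq_u8 (data, vdupq_n_u8 ('x'));

  uint64_t a = syndrome_and_addp (matches);
  uint64_t b = syndrome_shrn (matches);
  printf ("and+addp: %016llx\n", (unsigned long long) a);
  printf ("shrn:     %016llx\n", (unsigned long long) b);

  /* Four bits per byte, in string order: trailing zeros / 4 gives the
     index of the first match on little-endian.  */
  printf ("first 'x' at byte %d\n", __builtin_ctzll (b) >> 2);
  return 0;
}

Both helpers return the same mask, which is why the patch can drop the vrepmask register, the mask constant and the AND in every function below.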
sysdeps/aarch64/memchr.S
@@ -41,24 +41,21 @@
 #define synd		x5
 #define shift		x6
 #define tmp		x7
-#define wtmp		w7
 
 #define vrepchr		v0
 #define qdata		q1
 #define vdata		v1
 #define vhas_chr	v2
-#define vrepmask	v3
-#define vend		v4
-#define dend		d4
+#define vend		v3
+#define dend		d3
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (MEMCHR)
 	PTR_ARG (0)
@@ -67,12 +64,9 @@ ENTRY (MEMCHR)
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0xf00f
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	lsl	shift, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -111,8 +105,7 @@ L(loop32_2):
 	fmov	synd, dend
 	cbz	synd, L(loop32)
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	add	tmp, srcin, cntin
 	sub	cntrem, tmp, src

sysdeps/aarch64/memrchr.S
@@ -37,7 +37,6 @@
 #define synd		x5
 #define shift		x6
 #define tmp		x7
-#define wtmp		w7
 #define end		x8
 #define endm1		x9
 
@@ -45,18 +44,16 @@
 #define qdata		q1
 #define vdata		v1
 #define vhas_chr	v2
-#define vrepmask	v3
-#define vend		v4
-#define dend		d4
+#define vend		v3
+#define dend		d3
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__memrchr)
 	PTR_ARG (0)
@@ -67,12 +64,9 @@ ENTRY (__memrchr)
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0xf00f
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	neg	shift, end, lsl 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsl	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -109,8 +103,7 @@ L(loop32_2):
 	fmov	synd, dend
 	cbz	synd, L(loop32)
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 
 	add	tmp, src, 15

sysdeps/aarch64/strchrnul.S
@@ -33,38 +33,32 @@
 #define src		x2
 #define tmp1		x1
 #define tmp2		x3
-#define tmp2w		w3
 
 #define vrepchr		v0
 #define vdata		v1
 #define qdata		q1
 #define vhas_nul	v2
 #define vhas_chr	v3
-#define vrepmask	v4
-#define vend		v5
-#define dend		d5
+#define vend		v4
+#define dend		d4
 
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strchrnul)
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
-	mov	tmp2w, 0xf00f
-	dup	vrepmask.8h, tmp2w
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
 	lsl	tmp2, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
 	cbz	tmp1, L(loop)
@@ -83,8 +77,7 @@ L(loop):
 	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
 
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 #ifndef __AARCH64EB__
 	rbit	tmp1, tmp1

sysdeps/aarch64/strcpy.S
@@ -40,7 +40,6 @@
 #define len		x4
 #define synd		x4
 #define tmp		x5
-#define wtmp		w5
 #define shift		x5
 #define data1		x6
 #define dataw1		w6
@@ -50,9 +49,8 @@
 #define dataq		q0
 #define vdata		v0
 #define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 #define dataq2		q1
 
 #ifdef BUILD_STPCPY
@@ -63,34 +61,29 @@
 # define IFSTPCPY(X,...)
 #endif
 
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRCPY)
 	PTR_ARG (0)
 	PTR_ARG (1)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbnz	synd, L(tail)
 
 	ldr	dataq, [src, 16]!
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(start_loop)
 
@@ -162,8 +155,7 @@ L(loop):
 	fmov	synd, dend
 	cbz	synd, L(loop)
 
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 #ifndef __AARCH64EB__
 	rbit	synd, synd

sysdeps/aarch64/strlen.S
@@ -34,35 +34,29 @@
 #define src		x1
 #define synd		x2
 #define tmp		x3
-#define wtmp		w3
 #define shift		x4
 
 #define data		q0
 #define vdata		v0
 #define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 
 /* Core algorithm:
 
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting trailing zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRLEN)
 	PTR_ARG (0)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(loop)
@@ -80,8 +74,7 @@ L(loop):
 	fmov	synd, dend
 	cbz	synd, L(loop)
 
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	sub	result, src, srcin
 	fmov	synd, dend
 #ifndef __AARCH64EB__

sysdeps/aarch64/strnlen.S
@@ -33,39 +33,33 @@
 #define src		x2
 #define synd		x3
 #define shift		x4
-#define wtmp		w4
 #define tmp		x4
 #define cntrem		x5
 
 #define qdata		q0
 #define vdata		v0
 #define vhas_chr	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 
 /*
    Core algorithm:
 
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting trailing zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strnlen)
 	PTR_ARG (0)
 	SIZE_ARG (1)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src], 16
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -103,8 +97,7 @@ L(loop32_2):
 	cbz	synd, L(loop32)
 
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	sub	src, src, 16
 	mov	synd, vend.d[0]
 	sub	result, src, srcin