2013-12-13 19:31:41 +00:00
|
|
|
# Guard: everything up to the matching endif takes effect only when
# $(subdir) expands to "string".
ifeq ($(subdir),string)
|
2017-12-11 19:39:42 +00:00
|
|
|
# Append the per-CPU variants of the memory/string routines to
# sysdep_routines.  Each word is named <routine>-<cpu>; the list is
# continued with backslashes, so no comment may be placed inside it.
sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
|
|
|
|
memcpy-cell memcpy-power4 memcpy-ppc64 \
|
2017-05-18 05:51:20 +00:00
|
|
|
memcmp-power8 memcmp-power7 memcmp-power4 memcmp-ppc64 \
|
|
|
|
memset-power7 memset-power6 memset-power4 \
|
2014-07-15 16:19:09 +00:00
|
|
|
memset-ppc64 memset-power8 \
|
2017-06-21 05:25:12 +00:00
|
|
|
mempcpy-power7 mempcpy-ppc64 \
|
|
|
|
memchr-power8 memchr-power7 memchr-ppc64 \
|
2017-10-02 12:01:13 +00:00
|
|
|
memrchr-power8 memrchr-power7 memrchr-ppc64 \
|
|
|
|
rawmemchr-power7 rawmemchr-ppc64 \
|
|
|
|
strlen-power7 strlen-ppc64 \
|
2017-04-05 13:24:24 +00:00
|
|
|
strnlen-power8 strnlen-power7 strnlen-ppc64 \
|
|
|
|
strcasecmp-power7 strcasecmp_l-power7 \
|
2015-01-09 21:04:26 +00:00
|
|
|
strncase-power7 strncase_l-power7 \
|
2018-08-16 06:42:02 +00:00
|
|
|
strncmp-power8 strncmp-power7 \
|
2016-12-13 05:23:42 +00:00
|
|
|
strncmp-power4 strncmp-ppc64 \
|
2016-12-27 19:48:37 +00:00
|
|
|
strchr-power8 strchr-power7 strchr-ppc64 \
|
|
|
|
strchrnul-power8 strchrnul-power7 strchrnul-ppc64 \
|
2014-12-23 11:59:44 +00:00
|
|
|
strcpy-power8 strcpy-power7 strcpy-ppc64 stpcpy-power8 \
|
|
|
|
stpcpy-power7 stpcpy-ppc64 \
|
2017-04-18 05:58:56 +00:00
|
|
|
strrchr-power8 strrchr-power7 strrchr-ppc64 \
|
2017-04-13 05:59:20 +00:00
|
|
|
strncat-power8 strncat-power7 strncat-ppc64 \
|
2014-11-19 21:27:56 +00:00
|
|
|
strncpy-power7 strncpy-ppc64 \
|
2014-12-31 16:47:41 +00:00
|
|
|
stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
|
2018-08-16 06:42:02 +00:00
|
|
|
strcmp-power8 strcmp-power7 strcmp-ppc64 \
|
2015-01-21 12:41:46 +00:00
|
|
|
strcat-power8 strcat-power7 strcat-ppc64 \
|
|
|
|
memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \
|
2016-03-14 21:40:46 +00:00
|
|
|
strncpy-power8 strstr-power7 strstr-ppc64 \
|
2016-04-25 14:11:02 +00:00
|
|
|
strspn-power8 strspn-ppc64 strcspn-power8 strcspn-ppc64 \
|
2016-06-14 09:21:16 +00:00
|
|
|
strlen-power8 strcasestr-power8 strcasestr-ppc64 \
|
|
|
|
strcasecmp-ppc64 strcasecmp-power8 strncase-ppc64 \
|
|
|
|
strncase-power8
|
2013-12-13 19:40:28 +00:00
|
|
|
|
2018-08-16 06:42:02 +00:00
|
|
|
# True when $(config-machine) contains a word ending in "le"
# ($(filter %le,...) is then non-empty).
# NOTE(review): presumably this selects little-endian targets -- confirm.
ifneq (,$(filter %le,$(config-machine)))
|
2021-04-30 21:12:08 +00:00
|
|
|
# Additional power9/power10 variants, appended only when the enclosing
# ifneq condition holds.
sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
|
2021-04-30 21:12:08 +00:00
|
|
|
strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
|
powerpc: Add optimized strlen for POWER10
Improvements compared to POWER9 version:
1. Take into account first 16B comparison for aligned strings
The previous version compares the first 16B and increments r4 by the number
of bytes until the address is 16B-aligned, then starts doing aligned loads at
that address. For aligned strings, this causes the first 16B to be compared
twice, because the increment is 0. Here we calculate the next 16B-aligned
address differently, which avoids that issue.
2. Use simple comparisons for the first ~192 bytes
The main loop is good for big strings, but comparing 16B each time is better
for smaller strings. So after aligning the address to 16 Bytes, we check
176B more in 16B chunks. There may be some overlaps with the main loop for
unaligned strings, but we avoid using the more aggressive strategy too soon,
and also allow the loop to start at a 64B-aligned address. This greatly
benefits smaller strings and avoids overlapping checks if the string is
already aligned at a 64B boundary.
3. Reduce dependencies between load blocks caused by address calculation in the loop
Precise time tracing of the code showed that many loads in the loop were
stalled waiting for updates to r4 from previous code blocks. This
implementation avoids that as much as possible by using 2 registers (r4 and
r5) to hold addresses to be used by different parts of the code.
Also, the previous code aligned the address to 16B, then to 64B by doing a
few 48B loops (if needed) until the address was aligned. The main loop could
not start until that 48B loop had finished and r4 was updated with the
current address. Here we calculate the address used by the loop very early,
so it can start sooner.
The main loop now uses 2 pointers 128B apart to make pointer updates less
frequent, and also unrolls 1 iteration to guarantee there is enough time
between iterations to update the pointers, reducing stalled cycles.
4. Use new P10 instructions
lxvp is used to load 32B with a single instruction, reducing contention in
the load queue.
vextractbm allows simplifying the tail code for the loop, replacing
vbpermq and avoiding having to generate a permute control vector.
Reviewed-by: Paul E Murphy <murphyp@linux.ibm.com>
Reviewed-by: Raphael M Zinsly <rzinsly@linux.ibm.com>
Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
2020-09-29 18:40:08 +00:00
|
|
|
rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
|
|
|
|
strlen-power10
|
2018-08-16 06:42:02 +00:00
|
|
|
# End of the $(config-machine) %le conditional.
endif
|
2013-12-13 19:40:28 +00:00
|
|
|
# Extra compiler flags for strncase-power7.c only: target power7 and
# enable loop unrolling.
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
|
|
|
|
# Extra compiler flags for strncase_l-power7.c only: target power7 and
# enable loop unrolling.
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
|
2015-01-20 20:41:38 +00:00
|
|
|
# End of the $(subdir) == string conditional.
endif
|