x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S

Slightly faster method of doing TOLOWER that saves an
instruction.

Also replace the hard-coded 5-byte NOP with .p2align 4. On builds with
CET enabled, the hard-coded NOP misaligned the entry to strcasecmp.

geometric_mean(N=40) of all benchmarks New / Original: .920

All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit d154758e61)
This commit is contained in:
Noah Goldstein 2022-03-23 16:57:38 -05:00 committed by Sunil K Pandey
parent 5997011826
commit 3605c74407

View File

@ -89,9 +89,8 @@ ENTRY (GLABEL(__strcasecmp))
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RDX_LP mov %fs:(%rax),%RDX_LP
// XXX 5 byte should be before the function /* Either 1 or 5 bytes (depending on whether CET is enabled). */
/* 5-byte NOP. */ .p2align 4
.byte 0x0f,0x1f,0x44,0x00,0x00
END (GLABEL(__strcasecmp)) END (GLABEL(__strcasecmp))
/* FALLTHROUGH to strcasecmp_l. */ /* FALLTHROUGH to strcasecmp_l. */
#endif #endif
@ -100,9 +99,8 @@ ENTRY (GLABEL(__strncasecmp))
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RCX_LP mov %fs:(%rax),%RCX_LP
// XXX 5 byte should be before the function /* Either 1 or 5 bytes (depending on whether CET is enabled). */
/* 5-byte NOP. */ .p2align 4
.byte 0x0f,0x1f,0x44,0x00,0x00
END (GLABEL(__strncasecmp)) END (GLABEL(__strncasecmp))
/* FALLTHROUGH to strncasecmp_l. */ /* FALLTHROUGH to strncasecmp_l. */
#endif #endif
@ -170,27 +168,22 @@ STRCMP_SSE42:
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
.section .rodata.cst16,"aM",@progbits,16 .section .rodata.cst16,"aM",@progbits,16
.align 16 .align 16
LABEL(belowupper): LABEL(lcase_min):
.quad 0x4040404040404040 .quad 0x3f3f3f3f3f3f3f3f
.quad 0x4040404040404040 .quad 0x3f3f3f3f3f3f3f3f
LABEL(topupper): LABEL(lcase_max):
# ifdef USE_AVX .quad 0x9999999999999999
.quad 0x5a5a5a5a5a5a5a5a .quad 0x9999999999999999
.quad 0x5a5a5a5a5a5a5a5a LABEL(case_add):
# else
.quad 0x5b5b5b5b5b5b5b5b
.quad 0x5b5b5b5b5b5b5b5b
# endif
LABEL(touppermask):
.quad 0x2020202020202020 .quad 0x2020202020202020
.quad 0x2020202020202020 .quad 0x2020202020202020
.previous .previous
movdqa LABEL(belowupper)(%rip), %xmm4 movdqa LABEL(lcase_min)(%rip), %xmm4
# define UCLOW_reg %xmm4 # define LCASE_MIN_reg %xmm4
movdqa LABEL(topupper)(%rip), %xmm5 movdqa LABEL(lcase_max)(%rip), %xmm5
# define UCHIGH_reg %xmm5 # define LCASE_MAX_reg %xmm5
movdqa LABEL(touppermask)(%rip), %xmm6 movdqa LABEL(case_add)(%rip), %xmm6
# define LCQWORD_reg %xmm6 # define CASE_ADD_reg %xmm6
#endif #endif
cmp $0x30, %ecx cmp $0x30, %ecx
ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
@ -201,32 +194,26 @@ LABEL(touppermask):
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
# ifdef USE_AVX # ifdef USE_AVX
# define TOLOWER(reg1, reg2) \ # define TOLOWER(reg1, reg2) \
vpcmpgtb UCLOW_reg, reg1, %xmm7; \ vpaddb LCASE_MIN_reg, reg1, %xmm7; \
vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ vpaddb LCASE_MIN_reg, reg2, %xmm8; \
vpcmpgtb UCLOW_reg, reg2, %xmm9; \ vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \
vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \
vpandn %xmm7, %xmm8, %xmm8; \ vpandn CASE_ADD_reg, %xmm7, %xmm7; \
vpandn %xmm9, %xmm10, %xmm10; \ vpandn CASE_ADD_reg, %xmm8, %xmm8; \
vpand LCQWORD_reg, %xmm8, %xmm8; \ vpaddb %xmm7, reg1, reg1; \
vpand LCQWORD_reg, %xmm10, %xmm10; \ vpaddb %xmm8, reg2, reg2
vpor reg1, %xmm8, reg1; \
vpor reg2, %xmm10, reg2
# else # else
# define TOLOWER(reg1, reg2) \ # define TOLOWER(reg1, reg2) \
movdqa reg1, %xmm7; \ movdqa LCASE_MIN_reg, %xmm7; \
movdqa UCHIGH_reg, %xmm8; \ movdqa LCASE_MIN_reg, %xmm8; \
movdqa reg2, %xmm9; \ paddb reg1, %xmm7; \
movdqa UCHIGH_reg, %xmm10; \ paddb reg2, %xmm8; \
pcmpgtb UCLOW_reg, %xmm7; \ pcmpgtb LCASE_MAX_reg, %xmm7; \
pcmpgtb reg1, %xmm8; \ pcmpgtb LCASE_MAX_reg, %xmm8; \
pcmpgtb UCLOW_reg, %xmm9; \ pandn CASE_ADD_reg, %xmm7; \
pcmpgtb reg2, %xmm10; \ pandn CASE_ADD_reg, %xmm8; \
pand %xmm8, %xmm7; \ paddb %xmm7, reg1; \
pand %xmm10, %xmm9; \ paddb %xmm8, reg2
pand LCQWORD_reg, %xmm7; \
pand LCQWORD_reg, %xmm9; \
por %xmm7, reg1; \
por %xmm9, reg2
# endif # endif
TOLOWER (%xmm1, %xmm2) TOLOWER (%xmm1, %xmm2)
#else #else