mirror of
https://sourceware.org/git/glibc.git
synced 2024-12-24 11:41:07 +00:00
x86: Cleanup page cross code in memcmp-avx2-movbe.S
Old code was both inefficient and wasted code size. New code (-62 bytes) and comparable or better performance in the page cross case. geometric_mean(N=20) of page cross cases New / Original: 0.960 size, align0, align1, ret, New Time/Old Time 1, 4095, 0, 0, 1.001 1, 4095, 0, 1, 0.999 1, 4095, 0, -1, 1.0 2, 4094, 0, 0, 1.0 2, 4094, 0, 1, 1.0 2, 4094, 0, -1, 1.0 3, 4093, 0, 0, 1.0 3, 4093, 0, 1, 1.0 3, 4093, 0, -1, 1.0 4, 4092, 0, 0, 0.987 4, 4092, 0, 1, 1.0 4, 4092, 0, -1, 1.0 5, 4091, 0, 0, 0.984 5, 4091, 0, 1, 1.002 5, 4091, 0, -1, 1.005 6, 4090, 0, 0, 0.993 6, 4090, 0, 1, 1.001 6, 4090, 0, -1, 1.003 7, 4089, 0, 0, 0.991 7, 4089, 0, 1, 1.0 7, 4089, 0, -1, 1.001 8, 4088, 0, 0, 0.875 8, 4088, 0, 1, 0.881 8, 4088, 0, -1, 0.888 9, 4087, 0, 0, 0.872 9, 4087, 0, 1, 0.879 9, 4087, 0, -1, 0.883 10, 4086, 0, 0, 0.878 10, 4086, 0, 1, 0.886 10, 4086, 0, -1, 0.873 11, 4085, 0, 0, 0.878 11, 4085, 0, 1, 0.881 11, 4085, 0, -1, 0.879 12, 4084, 0, 0, 0.873 12, 4084, 0, 1, 0.889 12, 4084, 0, -1, 0.875 13, 4083, 0, 0, 0.873 13, 4083, 0, 1, 0.863 13, 4083, 0, -1, 0.863 14, 4082, 0, 0, 0.838 14, 4082, 0, 1, 0.869 14, 4082, 0, -1, 0.877 15, 4081, 0, 0, 0.841 15, 4081, 0, 1, 0.869 15, 4081, 0, -1, 0.876 16, 4080, 0, 0, 0.988 16, 4080, 0, 1, 0.99 16, 4080, 0, -1, 0.989 17, 4079, 0, 0, 0.978 17, 4079, 0, 1, 0.981 17, 4079, 0, -1, 0.98 18, 4078, 0, 0, 0.981 18, 4078, 0, 1, 0.98 18, 4078, 0, -1, 0.985 19, 4077, 0, 0, 0.977 19, 4077, 0, 1, 0.979 19, 4077, 0, -1, 0.986 20, 4076, 0, 0, 0.977 20, 4076, 0, 1, 0.986 20, 4076, 0, -1, 0.984 21, 4075, 0, 0, 0.977 21, 4075, 0, 1, 0.983 21, 4075, 0, -1, 0.988 22, 4074, 0, 0, 0.983 22, 4074, 0, 1, 0.994 22, 4074, 0, -1, 0.993 23, 4073, 0, 0, 0.98 23, 4073, 0, 1, 0.992 23, 4073, 0, -1, 0.995 24, 4072, 0, 0, 0.989 24, 4072, 0, 1, 0.989 24, 4072, 0, -1, 0.991 25, 4071, 0, 0, 0.99 25, 4071, 0, 1, 0.999 25, 4071, 0, -1, 0.996 26, 4070, 0, 0, 0.993 26, 4070, 0, 1, 0.995 26, 4070, 0, -1, 0.998 27, 4069, 0, 0, 0.993 27, 4069, 0, 1, 0.999 27, 4069, 0, -1, 1.0 28, 4068, 0, 0, 0.997 28, 4068, 0, 1, 1.0 28, 4068, 0, -1, 0.999 29, 4067, 0, 0, 0.996 29, 4067, 0, 1, 0.999 29, 4067, 0, -1, 0.999 30, 4066, 0, 0, 0.991 30, 4066, 0, 1, 1.001 30, 4066, 0, -1, 0.999 31, 4065, 0, 0, 0.988 31, 4065, 0, 1, 0.998 31, 4065, 0, -1, 0.998 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
parent
7cbc03d030
commit
23102686ec
@ -429,22 +429,21 @@ L(page_cross_less_vec):
|
||||
# ifndef USE_AS_WMEMCMP
|
||||
cmpl $8, %edx
|
||||
jae L(between_8_15)
|
||||
/* Fall through for [4, 7]. */
|
||||
cmpl $4, %edx
|
||||
jae L(between_4_7)
|
||||
jb L(between_2_3)
|
||||
|
||||
/* Load as big endian to avoid branches. */
|
||||
movzwl (%rdi), %eax
|
||||
movzwl (%rsi), %ecx
|
||||
shll $8, %eax
|
||||
shll $8, %ecx
|
||||
bswap %eax
|
||||
bswap %ecx
|
||||
movzbl -1(%rdi, %rdx), %edi
|
||||
movzbl -1(%rsi, %rdx), %esi
|
||||
orl %edi, %eax
|
||||
orl %esi, %ecx
|
||||
/* Subtraction is okay because the upper 8 bits are zero. */
|
||||
subl %ecx, %eax
|
||||
movbe (%rdi), %eax
|
||||
movbe (%rsi), %ecx
|
||||
shlq $32, %rax
|
||||
shlq $32, %rcx
|
||||
movbe -4(%rdi, %rdx), %edi
|
||||
movbe -4(%rsi, %rdx), %esi
|
||||
orq %rdi, %rax
|
||||
orq %rsi, %rcx
|
||||
subq %rcx, %rax
|
||||
/* Fast path for return zero. */
|
||||
jnz L(ret_nonzero)
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
|
||||
@ -457,9 +456,33 @@ L(one_or_less):
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
|
||||
.p2align 4,, 5
|
||||
L(ret_nonzero):
|
||||
sbbl %eax, %eax
|
||||
orl $1, %eax
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
|
||||
.p2align 4,, 2
|
||||
L(zero):
|
||||
xorl %eax, %eax
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(between_8_15):
|
||||
# endif
|
||||
movbe (%rdi), %rax
|
||||
movbe (%rsi), %rcx
|
||||
subq %rcx, %rax
|
||||
jnz L(ret_nonzero)
|
||||
movbe -8(%rdi, %rdx), %rax
|
||||
movbe -8(%rsi, %rdx), %rcx
|
||||
subq %rcx, %rax
|
||||
/* Fast path for return zero. */
|
||||
jnz L(ret_nonzero)
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
# else
|
||||
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
|
||||
vmovq (%rdi), %xmm1
|
||||
vmovq (%rsi), %xmm2
|
||||
@ -475,16 +498,13 @@ L(between_8_15):
|
||||
VPCMPEQ %xmm1, %xmm2, %xmm2
|
||||
vpmovmskb %xmm2, %eax
|
||||
subl $0xffff, %eax
|
||||
/* Fast path for return zero. */
|
||||
jnz L(return_vec_0)
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
# endif
|
||||
|
||||
.p2align 4
|
||||
L(zero):
|
||||
xorl %eax, %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
.p2align 4,, 10
|
||||
L(between_16_31):
|
||||
/* From 16 to 31 bytes. No branch when size == 16. */
|
||||
vmovdqu (%rsi), %xmm2
|
||||
@ -501,11 +521,17 @@ L(between_16_31):
|
||||
VPCMPEQ (%rdi), %xmm2, %xmm2
|
||||
vpmovmskb %xmm2, %eax
|
||||
subl $0xffff, %eax
|
||||
/* Fast path for return zero. */
|
||||
jnz L(return_vec_0)
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
.p2align 4,, 2
|
||||
L(zero):
|
||||
xorl %eax, %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(one_or_less):
|
||||
jb L(zero)
|
||||
@ -520,22 +546,20 @@ L(one_or_less):
|
||||
# else
|
||||
|
||||
.p2align 4
|
||||
L(between_4_7):
|
||||
/* Load as big endian with overlapping movbe to avoid branches.
|
||||
*/
|
||||
movbe (%rdi), %eax
|
||||
movbe (%rsi), %ecx
|
||||
shlq $32, %rax
|
||||
shlq $32, %rcx
|
||||
movbe -4(%rdi, %rdx), %edi
|
||||
movbe -4(%rsi, %rdx), %esi
|
||||
orq %rdi, %rax
|
||||
orq %rsi, %rcx
|
||||
subq %rcx, %rax
|
||||
jz L(zero_4_7)
|
||||
sbbl %eax, %eax
|
||||
orl $1, %eax
|
||||
L(zero_4_7):
|
||||
L(between_2_3):
|
||||
/* Load as big endian to avoid branches. */
|
||||
movzwl (%rdi), %eax
|
||||
movzwl (%rsi), %ecx
|
||||
bswap %eax
|
||||
bswap %ecx
|
||||
shrl %eax
|
||||
shrl %ecx
|
||||
movzbl -1(%rdi, %rdx), %edi
|
||||
movzbl -1(%rsi, %rdx), %esi
|
||||
orl %edi, %eax
|
||||
orl %esi, %ecx
|
||||
/* Subtraction is okay because the upper bit is zero. */
|
||||
subl %ecx, %eax
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
# endif
|
||||
|
Loading…
Reference in New Issue
Block a user