Use .p2align instead of ALIGN

Ondřej Bílka 2013-10-08 15:46:48 +02:00
parent 41500766f7
commit e7044ea76b
10 changed files with 301 additions and 323 deletions

ChangeLog

@ -1,3 +1,16 @@
2013-10-08 Ondřej Bílka <neleai@seznam.cz>
* sysdeps/x86_64/memset.S (ALIGN): Macro removed.
Use .p2align directive instead, throughout.
* sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise.
* sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise.
* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: Likewise.
* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Likewise.
* sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise.
* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Likewise.
* sysdeps/x86_64/strchr.S: Likewise.
* sysdeps/x86_64/strrchr.S: Likewise.
2013-10-08 Siddhesh Poyarekar <siddhesh@redhat.com> 2013-10-08 Siddhesh Poyarekar <siddhesh@redhat.com>
* sysdeps/ieee754/dbl-64/e_pow.c: Fix code formatting. * sysdeps/ieee754/dbl-64/e_pow.c: Fix code formatting.
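
For readers unfamiliar with the directive, here is a minimal sketch (an assumed example, not part of the patch) of the pattern the commit removes. Each affected file previously carried a fallback ALIGN macro that expanded to the GAS .p2align directive; call sites now use the directive directly. .p2align n pads up to the next 2^n-byte boundary, so ALIGN (4) and .p2align 4 both request 16-byte alignment and the emitted padding is unchanged. The labels below are made up for illustration; the snippet assembles on its own with, e.g., gcc -c align-sketch.S.

	/* Old spelling: preprocessor macro wrapping the directive
	   (the fallback definition this commit deletes).  */
	#ifndef ALIGN
	# define ALIGN(n) .p2align n
	#endif

		.text
		ALIGN (4)		/* expands to .p2align 4: 16-byte boundary */
	old_style_label:
		ret

	/* New spelling: the directive written out directly.  */
		.p2align 4		/* 2^4 = 16-byte boundary */
	new_style_label:
		ret

Dropping the macro removes one level of indirection without changing the bytes the assembler emits.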

sysdeps/x86_64/memset.S

@ -19,10 +19,6 @@
#include <sysdep.h> #include <sysdep.h>
#ifndef ALIGN
# define ALIGN(n) .p2align n
#endif
.text .text
#if !defined NOT_IN_libc #if !defined NOT_IN_libc
ENTRY(__bzero) ENTRY(__bzero)
@ -71,12 +67,12 @@ L(entry_from_bzero):
L(return): L(return):
rep rep
ret ret
ALIGN (4) .p2align 4
L(between_32_64_bytes): L(between_32_64_bytes):
movdqu %xmm8, 16(%rdi) movdqu %xmm8, 16(%rdi)
movdqu %xmm8, -32(%rdi,%rdx) movdqu %xmm8, -32(%rdi,%rdx)
ret ret
ALIGN (4) .p2align 4
L(loop_start): L(loop_start):
leaq 64(%rdi), %rcx leaq 64(%rdi), %rcx
movdqu %xmm8, (%rdi) movdqu %xmm8, (%rdi)
@ -92,7 +88,7 @@ L(loop_start):
andq $-64, %rdx andq $-64, %rdx
cmpq %rdx, %rcx cmpq %rdx, %rcx
je L(return) je L(return)
ALIGN (4) .p2align 4
L(loop): L(loop):
movdqa %xmm8, (%rcx) movdqa %xmm8, (%rcx)
movdqa %xmm8, 16(%rcx) movdqa %xmm8, 16(%rcx)

sysdeps/x86_64/multiarch/memcmp-sse4.S

@ -25,10 +25,6 @@
# define MEMCMP __memcmp_sse4_1 # define MEMCMP __memcmp_sse4_1
# endif # endif
# ifndef ALIGN
# define ALIGN(n) .p2align n
# endif
# define JMPTBL(I, B) (I - B) # define JMPTBL(I, B) (I - B)
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
@ -60,7 +56,7 @@ ENTRY (MEMCMP)
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
# ifndef USE_AS_WMEMCMP # ifndef USE_AS_WMEMCMP
ALIGN (4) .p2align 4
L(firstbyte): L(firstbyte):
movzbl (%rdi), %eax movzbl (%rdi), %eax
movzbl (%rsi), %ecx movzbl (%rsi), %ecx
@ -68,7 +64,7 @@ L(firstbyte):
ret ret
# endif # endif
ALIGN (4) .p2align 4
L(79bytesormore): L(79bytesormore):
movdqu (%rsi), %xmm1 movdqu (%rsi), %xmm1
movdqu (%rdi), %xmm2 movdqu (%rdi), %xmm2
@ -316,7 +312,7 @@ L(less32bytesin256):
add %rdx, %rdi add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
ALIGN (4) .p2align 4
L(512bytesormore): L(512bytesormore):
# ifdef DATA_CACHE_SIZE_HALF # ifdef DATA_CACHE_SIZE_HALF
mov $DATA_CACHE_SIZE_HALF, %R8_LP mov $DATA_CACHE_SIZE_HALF, %R8_LP
@ -329,7 +325,7 @@ L(512bytesormore):
cmp %r8, %rdx cmp %r8, %rdx
ja L(L2_L3_cache_unaglined) ja L(L2_L3_cache_unaglined)
sub $64, %rdx sub $64, %rdx
ALIGN (4) .p2align 4
L(64bytesormore_loop): L(64bytesormore_loop):
movdqu (%rdi), %xmm2 movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2 pxor (%rsi), %xmm2
@ -361,7 +357,7 @@ L(64bytesormore_loop):
L(L2_L3_cache_unaglined): L(L2_L3_cache_unaglined):
sub $64, %rdx sub $64, %rdx
ALIGN (4) .p2align 4
L(L2_L3_unaligned_128bytes_loop): L(L2_L3_unaligned_128bytes_loop):
prefetchnta 0x1c0(%rdi) prefetchnta 0x1c0(%rdi)
prefetchnta 0x1c0(%rsi) prefetchnta 0x1c0(%rsi)
@ -396,7 +392,7 @@ L(L2_L3_unaligned_128bytes_loop):
/* /*
* This case is for machines which are sensitive for unaligned instructions. * This case is for machines which are sensitive for unaligned instructions.
*/ */
ALIGN (4) .p2align 4
L(2aligned): L(2aligned):
cmp $128, %rdx cmp $128, %rdx
ja L(128bytesormorein2aligned) ja L(128bytesormorein2aligned)
@ -444,7 +440,7 @@ L(less32bytesin64in2alinged):
add %rdx, %rdi add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
ALIGN (4) .p2align 4
L(128bytesormorein2aligned): L(128bytesormorein2aligned):
cmp $512, %rdx cmp $512, %rdx
ja L(512bytesormorein2aligned) ja L(512bytesormorein2aligned)
@ -519,7 +515,7 @@ L(less32bytesin128in2aligned):
add %rdx, %rdi add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
ALIGN (4) .p2align 4
L(256bytesormorein2aligned): L(256bytesormorein2aligned):
sub $256, %rdx sub $256, %rdx
@ -632,7 +628,7 @@ L(less32bytesin256in2alinged):
add %rdx, %rdi add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
ALIGN (4) .p2align 4
L(512bytesormorein2aligned): L(512bytesormorein2aligned):
# ifdef DATA_CACHE_SIZE_HALF # ifdef DATA_CACHE_SIZE_HALF
mov $DATA_CACHE_SIZE_HALF, %R8_LP mov $DATA_CACHE_SIZE_HALF, %R8_LP
@ -646,7 +642,7 @@ L(512bytesormorein2aligned):
ja L(L2_L3_cache_aglined) ja L(L2_L3_cache_aglined)
sub $64, %rdx sub $64, %rdx
ALIGN (4) .p2align 4
L(64bytesormore_loopin2aligned): L(64bytesormore_loopin2aligned):
movdqa (%rdi), %xmm2 movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2 pxor (%rsi), %xmm2
@ -678,7 +674,7 @@ L(64bytesormore_loopin2aligned):
L(L2_L3_cache_aglined): L(L2_L3_cache_aglined):
sub $64, %rdx sub $64, %rdx
ALIGN (4) .p2align 4
L(L2_L3_aligned_128bytes_loop): L(L2_L3_aligned_128bytes_loop):
prefetchnta 0x1c0(%rdi) prefetchnta 0x1c0(%rdi)
prefetchnta 0x1c0(%rsi) prefetchnta 0x1c0(%rsi)
@ -711,7 +707,7 @@ L(L2_L3_aligned_128bytes_loop):
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
ALIGN (4) .p2align 4
L(64bytesormore_loop_end): L(64bytesormore_loop_end):
add $16, %rdi add $16, %rdi
add $16, %rsi add $16, %rsi
@ -806,7 +802,7 @@ L(8bytes):
xor %eax, %eax xor %eax, %eax
ret ret
ALIGN (4) .p2align 4
L(12bytes): L(12bytes):
mov -12(%rdi), %rax mov -12(%rdi), %rax
mov -12(%rsi), %rcx mov -12(%rsi), %rcx
@ -827,7 +823,7 @@ L(0bytes):
# ifndef USE_AS_WMEMCMP # ifndef USE_AS_WMEMCMP
/* unreal case for wmemcmp */ /* unreal case for wmemcmp */
ALIGN (4) .p2align 4
L(65bytes): L(65bytes):
movdqu -65(%rdi), %xmm1 movdqu -65(%rdi), %xmm1
movdqu -65(%rsi), %xmm2 movdqu -65(%rsi), %xmm2
@ -864,7 +860,7 @@ L(9bytes):
sub %edx, %eax sub %edx, %eax
ret ret
ALIGN (4) .p2align 4
L(13bytes): L(13bytes):
mov -13(%rdi), %rax mov -13(%rdi), %rax
mov -13(%rsi), %rcx mov -13(%rsi), %rcx
@ -877,7 +873,7 @@ L(13bytes):
xor %eax, %eax xor %eax, %eax
ret ret
ALIGN (4) .p2align 4
L(5bytes): L(5bytes):
mov -5(%rdi), %eax mov -5(%rdi), %eax
mov -5(%rsi), %ecx mov -5(%rsi), %ecx
@ -888,7 +884,7 @@ L(5bytes):
sub %edx, %eax sub %edx, %eax
ret ret
ALIGN (4) .p2align 4
L(66bytes): L(66bytes):
movdqu -66(%rdi), %xmm1 movdqu -66(%rdi), %xmm1
movdqu -66(%rsi), %xmm2 movdqu -66(%rsi), %xmm2
@ -929,7 +925,7 @@ L(10bytes):
sub %ecx, %eax sub %ecx, %eax
ret ret
ALIGN (4) .p2align 4
L(14bytes): L(14bytes):
mov -14(%rdi), %rax mov -14(%rdi), %rax
mov -14(%rsi), %rcx mov -14(%rsi), %rcx
@ -942,7 +938,7 @@ L(14bytes):
xor %eax, %eax xor %eax, %eax
ret ret
ALIGN (4) .p2align 4
L(6bytes): L(6bytes):
mov -6(%rdi), %eax mov -6(%rdi), %eax
mov -6(%rsi), %ecx mov -6(%rsi), %ecx
@ -958,7 +954,7 @@ L(2bytes):
sub %ecx, %eax sub %ecx, %eax
ret ret
ALIGN (4) .p2align 4
L(67bytes): L(67bytes):
movdqu -67(%rdi), %xmm2 movdqu -67(%rdi), %xmm2
movdqu -67(%rsi), %xmm1 movdqu -67(%rsi), %xmm1
@ -997,7 +993,7 @@ L(11bytes):
xor %eax, %eax xor %eax, %eax
ret ret
ALIGN (4) .p2align 4
L(15bytes): L(15bytes):
mov -15(%rdi), %rax mov -15(%rdi), %rax
mov -15(%rsi), %rcx mov -15(%rsi), %rcx
@ -1010,7 +1006,7 @@ L(15bytes):
xor %eax, %eax xor %eax, %eax
ret ret
ALIGN (4) .p2align 4
L(7bytes): L(7bytes):
mov -7(%rdi), %eax mov -7(%rdi), %eax
mov -7(%rsi), %ecx mov -7(%rsi), %ecx
@ -1023,7 +1019,7 @@ L(7bytes):
xor %eax, %eax xor %eax, %eax
ret ret
ALIGN (4) .p2align 4
L(3bytes): L(3bytes):
movzwl -3(%rdi), %eax movzwl -3(%rdi), %eax
movzwl -3(%rsi), %ecx movzwl -3(%rsi), %ecx
@ -1036,7 +1032,7 @@ L(1bytes):
ret ret
# endif # endif
ALIGN (4) .p2align 4
L(68bytes): L(68bytes):
movdqu -68(%rdi), %xmm2 movdqu -68(%rdi), %xmm2
movdqu -68(%rsi), %xmm1 movdqu -68(%rsi), %xmm1
@ -1079,7 +1075,7 @@ L(20bytes):
# ifndef USE_AS_WMEMCMP # ifndef USE_AS_WMEMCMP
/* unreal cases for wmemcmp */ /* unreal cases for wmemcmp */
ALIGN (4) .p2align 4
L(69bytes): L(69bytes):
movdqu -69(%rsi), %xmm1 movdqu -69(%rsi), %xmm1
movdqu -69(%rdi), %xmm2 movdqu -69(%rdi), %xmm2
@ -1115,7 +1111,7 @@ L(21bytes):
xor %eax, %eax xor %eax, %eax
ret ret
ALIGN (4) .p2align 4
L(70bytes): L(70bytes):
movdqu -70(%rsi), %xmm1 movdqu -70(%rsi), %xmm1
movdqu -70(%rdi), %xmm2 movdqu -70(%rdi), %xmm2
@ -1151,7 +1147,7 @@ L(22bytes):
xor %eax, %eax xor %eax, %eax
ret ret
ALIGN (4) .p2align 4
L(71bytes): L(71bytes):
movdqu -71(%rsi), %xmm1 movdqu -71(%rsi), %xmm1
movdqu -71(%rdi), %xmm2 movdqu -71(%rdi), %xmm2
@ -1188,7 +1184,7 @@ L(23bytes):
ret ret
# endif # endif
ALIGN (4) .p2align 4
L(72bytes): L(72bytes):
movdqu -72(%rsi), %xmm1 movdqu -72(%rsi), %xmm1
movdqu -72(%rdi), %xmm2 movdqu -72(%rdi), %xmm2
@ -1227,7 +1223,7 @@ L(24bytes):
# ifndef USE_AS_WMEMCMP # ifndef USE_AS_WMEMCMP
/* unreal cases for wmemcmp */ /* unreal cases for wmemcmp */
ALIGN (4) .p2align 4
L(73bytes): L(73bytes):
movdqu -73(%rsi), %xmm1 movdqu -73(%rsi), %xmm1
movdqu -73(%rdi), %xmm2 movdqu -73(%rdi), %xmm2
@ -1265,7 +1261,7 @@ L(25bytes):
sub %ecx, %eax sub %ecx, %eax
ret ret
ALIGN (4) .p2align 4
L(74bytes): L(74bytes):
movdqu -74(%rsi), %xmm1 movdqu -74(%rsi), %xmm1
movdqu -74(%rdi), %xmm2 movdqu -74(%rdi), %xmm2
@ -1302,7 +1298,7 @@ L(26bytes):
movzwl -2(%rsi), %ecx movzwl -2(%rsi), %ecx
jmp L(diffin2bytes) jmp L(diffin2bytes)
ALIGN (4) .p2align 4
L(75bytes): L(75bytes):
movdqu -75(%rsi), %xmm1 movdqu -75(%rsi), %xmm1
movdqu -75(%rdi), %xmm2 movdqu -75(%rdi), %xmm2
@ -1342,7 +1338,7 @@ L(27bytes):
xor %eax, %eax xor %eax, %eax
ret ret
# endif # endif
ALIGN (4) .p2align 4
L(76bytes): L(76bytes):
movdqu -76(%rsi), %xmm1 movdqu -76(%rsi), %xmm1
movdqu -76(%rdi), %xmm2 movdqu -76(%rdi), %xmm2
@ -1388,7 +1384,7 @@ L(28bytes):
# ifndef USE_AS_WMEMCMP # ifndef USE_AS_WMEMCMP
/* unreal cases for wmemcmp */ /* unreal cases for wmemcmp */
ALIGN (4) .p2align 4
L(77bytes): L(77bytes):
movdqu -77(%rsi), %xmm1 movdqu -77(%rsi), %xmm1
movdqu -77(%rdi), %xmm2 movdqu -77(%rdi), %xmm2
@ -1430,7 +1426,7 @@ L(29bytes):
xor %eax, %eax xor %eax, %eax
ret ret
ALIGN (4) .p2align 4
L(78bytes): L(78bytes):
movdqu -78(%rsi), %xmm1 movdqu -78(%rsi), %xmm1
movdqu -78(%rdi), %xmm2 movdqu -78(%rdi), %xmm2
@ -1470,7 +1466,7 @@ L(30bytes):
xor %eax, %eax xor %eax, %eax
ret ret
ALIGN (4) .p2align 4
L(79bytes): L(79bytes):
movdqu -79(%rsi), %xmm1 movdqu -79(%rsi), %xmm1
movdqu -79(%rdi), %xmm2 movdqu -79(%rdi), %xmm2
@ -1510,7 +1506,7 @@ L(31bytes):
xor %eax, %eax xor %eax, %eax
ret ret
# endif # endif
ALIGN (4) .p2align 4
L(64bytes): L(64bytes):
movdqu -64(%rdi), %xmm2 movdqu -64(%rdi), %xmm2
movdqu -64(%rsi), %xmm1 movdqu -64(%rsi), %xmm1
@ -1548,7 +1544,7 @@ L(32bytes):
/* /*
* Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block.
*/ */
ALIGN (3) .p2align 3
L(less16bytes): L(less16bytes):
movsbq %dl, %rdx movsbq %dl, %rdx
mov (%rsi, %rdx), %rcx mov (%rsi, %rdx), %rcx
@ -1585,7 +1581,7 @@ L(diffin2bytes):
sub %ecx, %eax sub %ecx, %eax
ret ret
ALIGN (4) .p2align 4
L(end): L(end):
and $0xff, %eax and $0xff, %eax
and $0xff, %ecx and $0xff, %ecx
@ -1599,7 +1595,7 @@ L(end):
neg %eax neg %eax
ret ret
ALIGN (4) .p2align 4
L(nequal_bigger): L(nequal_bigger):
ret ret
@ -1611,7 +1607,7 @@ L(unreal_case):
END (MEMCMP) END (MEMCMP)
.section .rodata.sse4.1,"a",@progbits .section .rodata.sse4.1,"a",@progbits
ALIGN (3) .p2align 3
# ifndef USE_AS_WMEMCMP # ifndef USE_AS_WMEMCMP
L(table_64bytes): L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes)) .int JMPTBL (L(0bytes), L(table_64bytes))

sysdeps/x86_64/multiarch/memcmp-ssse3.S

@ -25,10 +25,6 @@
# define MEMCMP __memcmp_ssse3 # define MEMCMP __memcmp_ssse3
# endif # endif
# ifndef ALIGN
# define ALIGN(n) .p2align n
# endif
/* Warning! /* Warning!
wmemcmp has to use SIGNED comparison for elements. wmemcmp has to use SIGNED comparison for elements.
memcmp has to use UNSIGNED comparison for elemnts. memcmp has to use UNSIGNED comparison for elemnts.
@ -50,7 +46,7 @@ ENTRY (MEMCMP)
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
/* ECX >= 32. */ /* ECX >= 32. */
L(48bytesormore): L(48bytesormore):
movdqu (%rdi), %xmm3 movdqu (%rdi), %xmm3
@ -90,7 +86,7 @@ L(48bytesormore):
je L(shr_6) je L(shr_6)
jmp L(shr_7) jmp L(shr_7)
ALIGN (2) .p2align 2
L(next_unaligned_table): L(next_unaligned_table):
cmp $8, %edx cmp $8, %edx
je L(shr_8) je L(shr_8)
@ -117,7 +113,7 @@ L(next_unaligned_table):
jmp L(shr_12) jmp L(shr_12)
# endif # endif
ALIGN (4) .p2align 4
L(shr_0): L(shr_0):
cmp $80, %rcx cmp $80, %rcx
lea -48(%rcx), %rcx lea -48(%rcx), %rcx
@ -137,7 +133,7 @@ L(shr_0):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_0_gobble): L(shr_0_gobble):
movdqa (%rsi), %xmm0 movdqa (%rsi), %xmm0
xor %eax, %eax xor %eax, %eax
@ -180,7 +176,7 @@ L(next):
# ifndef USE_AS_WMEMCMP # ifndef USE_AS_WMEMCMP
ALIGN (4) .p2align 4
L(shr_1): L(shr_1):
cmp $80, %rcx cmp $80, %rcx
lea -48(%rcx), %rcx lea -48(%rcx), %rcx
@ -207,7 +203,7 @@ L(shr_1):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_1_gobble): L(shr_1_gobble):
sub $32, %rcx sub $32, %rcx
movdqa 16(%rsi), %xmm0 movdqa 16(%rsi), %xmm0
@ -258,7 +254,7 @@ L(shr_1_gobble_next):
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_2): L(shr_2):
cmp $80, %rcx cmp $80, %rcx
lea -48(%rcx), %rcx lea -48(%rcx), %rcx
@ -285,7 +281,7 @@ L(shr_2):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_2_gobble): L(shr_2_gobble):
sub $32, %rcx sub $32, %rcx
movdqa 16(%rsi), %xmm0 movdqa 16(%rsi), %xmm0
@ -335,7 +331,7 @@ L(shr_2_gobble_next):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_3): L(shr_3):
cmp $80, %rcx cmp $80, %rcx
lea -48(%rcx), %rcx lea -48(%rcx), %rcx
@ -362,7 +358,7 @@ L(shr_3):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_3_gobble): L(shr_3_gobble):
sub $32, %rcx sub $32, %rcx
movdqa 16(%rsi), %xmm0 movdqa 16(%rsi), %xmm0
@ -414,7 +410,7 @@ L(shr_3_gobble_next):
# endif # endif
ALIGN (4) .p2align 4
L(shr_4): L(shr_4):
cmp $80, %rcx cmp $80, %rcx
lea -48(%rcx), %rcx lea -48(%rcx), %rcx
@ -441,7 +437,7 @@ L(shr_4):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_4_gobble): L(shr_4_gobble):
sub $32, %rcx sub $32, %rcx
movdqa 16(%rsi), %xmm0 movdqa 16(%rsi), %xmm0
@ -493,7 +489,7 @@ L(shr_4_gobble_next):
# ifndef USE_AS_WMEMCMP # ifndef USE_AS_WMEMCMP
ALIGN (4) .p2align 4
L(shr_5): L(shr_5):
cmp $80, %rcx cmp $80, %rcx
lea -48(%rcx), %rcx lea -48(%rcx), %rcx
@ -520,7 +516,7 @@ L(shr_5):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_5_gobble): L(shr_5_gobble):
sub $32, %rcx sub $32, %rcx
movdqa 16(%rsi), %xmm0 movdqa 16(%rsi), %xmm0
@ -570,7 +566,7 @@ L(shr_5_gobble_next):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_6): L(shr_6):
cmp $80, %rcx cmp $80, %rcx
lea -48(%rcx), %rcx lea -48(%rcx), %rcx
@ -597,7 +593,7 @@ L(shr_6):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_6_gobble): L(shr_6_gobble):
sub $32, %rcx sub $32, %rcx
movdqa 16(%rsi), %xmm0 movdqa 16(%rsi), %xmm0
@ -647,7 +643,7 @@ L(shr_6_gobble_next):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_7): L(shr_7):
cmp $80, %rcx cmp $80, %rcx
lea -48(%rcx), %rcx lea -48(%rcx), %rcx
@ -674,7 +670,7 @@ L(shr_7):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_7_gobble): L(shr_7_gobble):
sub $32, %rcx sub $32, %rcx
movdqa 16(%rsi), %xmm0 movdqa 16(%rsi), %xmm0
@ -726,7 +722,7 @@ L(shr_7_gobble_next):
# endif # endif
ALIGN (4) .p2align 4
L(shr_8): L(shr_8):
cmp $80, %rcx cmp $80, %rcx
lea -48(%rcx), %rcx lea -48(%rcx), %rcx
@ -753,7 +749,7 @@ L(shr_8):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_8_gobble): L(shr_8_gobble):
sub $32, %rcx sub $32, %rcx
movdqa 16(%rsi), %xmm0 movdqa 16(%rsi), %xmm0
@ -805,7 +801,7 @@ L(shr_8_gobble_next):
# ifndef USE_AS_WMEMCMP # ifndef USE_AS_WMEMCMP
ALIGN (4) .p2align 4
L(shr_9): L(shr_9):
cmp $80, %rcx cmp $80, %rcx
lea -48(%rcx), %rcx lea -48(%rcx), %rcx
@ -832,7 +828,7 @@ L(shr_9):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_9_gobble): L(shr_9_gobble):
sub $32, %rcx sub $32, %rcx
movdqa 16(%rsi), %xmm0 movdqa 16(%rsi), %xmm0
@ -882,7 +878,7 @@ L(shr_9_gobble_next):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_10): L(shr_10):
cmp $80, %rcx cmp $80, %rcx
lea -48(%rcx), %rcx lea -48(%rcx), %rcx
@ -909,7 +905,7 @@ L(shr_10):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_10_gobble): L(shr_10_gobble):
sub $32, %rcx sub $32, %rcx
movdqa 16(%rsi), %xmm0 movdqa 16(%rsi), %xmm0
@ -959,7 +955,7 @@ L(shr_10_gobble_next):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_11): L(shr_11):
cmp $80, %rcx cmp $80, %rcx
lea -48(%rcx), %rcx lea -48(%rcx), %rcx
@ -986,7 +982,7 @@ L(shr_11):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_11_gobble): L(shr_11_gobble):
sub $32, %rcx sub $32, %rcx
movdqa 16(%rsi), %xmm0 movdqa 16(%rsi), %xmm0
@ -1038,7 +1034,7 @@ L(shr_11_gobble_next):
# endif # endif
ALIGN (4) .p2align 4
L(shr_12): L(shr_12):
cmp $80, %rcx cmp $80, %rcx
lea -48(%rcx), %rcx lea -48(%rcx), %rcx
@ -1065,7 +1061,7 @@ L(shr_12):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_12_gobble): L(shr_12_gobble):
sub $32, %rcx sub $32, %rcx
movdqa 16(%rsi), %xmm0 movdqa 16(%rsi), %xmm0
@ -1117,7 +1113,7 @@ L(shr_12_gobble_next):
# ifndef USE_AS_WMEMCMP # ifndef USE_AS_WMEMCMP
ALIGN (4) .p2align 4
L(shr_13): L(shr_13):
cmp $80, %rcx cmp $80, %rcx
lea -48(%rcx), %rcx lea -48(%rcx), %rcx
@ -1144,7 +1140,7 @@ L(shr_13):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_13_gobble): L(shr_13_gobble):
sub $32, %rcx sub $32, %rcx
movdqa 16(%rsi), %xmm0 movdqa 16(%rsi), %xmm0
@ -1194,7 +1190,7 @@ L(shr_13_gobble_next):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_14): L(shr_14):
cmp $80, %rcx cmp $80, %rcx
lea -48(%rcx), %rcx lea -48(%rcx), %rcx
@ -1221,7 +1217,7 @@ L(shr_14):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_14_gobble): L(shr_14_gobble):
sub $32, %rcx sub $32, %rcx
movdqa 16(%rsi), %xmm0 movdqa 16(%rsi), %xmm0
@ -1271,7 +1267,7 @@ L(shr_14_gobble_next):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_15): L(shr_15):
cmp $80, %rcx cmp $80, %rcx
lea -48(%rcx), %rcx lea -48(%rcx), %rcx
@ -1298,7 +1294,7 @@ L(shr_15):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
ALIGN (4) .p2align 4
L(shr_15_gobble): L(shr_15_gobble):
sub $32, %rcx sub $32, %rcx
movdqa 16(%rsi), %xmm0 movdqa 16(%rsi), %xmm0
@ -1348,7 +1344,7 @@ L(shr_15_gobble_next):
add %rcx, %rdi add %rcx, %rdi
jmp L(less48bytes) jmp L(less48bytes)
# endif # endif
ALIGN (4) .p2align 4
L(exit): L(exit):
pmovmskb %xmm1, %r8d pmovmskb %xmm1, %r8d
sub $0xffff, %r8d sub $0xffff, %r8d
@ -1389,56 +1385,56 @@ L(less16bytes):
sub %edx, %eax sub %edx, %eax
ret ret
ALIGN (4) .p2align 4
L(Byte16): L(Byte16):
movzbl -16(%rdi), %eax movzbl -16(%rdi), %eax
movzbl -16(%rsi), %edx movzbl -16(%rsi), %edx
sub %edx, %eax sub %edx, %eax
ret ret
ALIGN (4) .p2align 4
L(Byte17): L(Byte17):
movzbl -15(%rdi), %eax movzbl -15(%rdi), %eax
movzbl -15(%rsi), %edx movzbl -15(%rsi), %edx
sub %edx, %eax sub %edx, %eax
ret ret
ALIGN (4) .p2align 4
L(Byte18): L(Byte18):
movzbl -14(%rdi), %eax movzbl -14(%rdi), %eax
movzbl -14(%rsi), %edx movzbl -14(%rsi), %edx
sub %edx, %eax sub %edx, %eax
ret ret
ALIGN (4) .p2align 4
L(Byte19): L(Byte19):
movzbl -13(%rdi), %eax movzbl -13(%rdi), %eax
movzbl -13(%rsi), %edx movzbl -13(%rsi), %edx
sub %edx, %eax sub %edx, %eax
ret ret
ALIGN (4) .p2align 4
L(Byte20): L(Byte20):
movzbl -12(%rdi), %eax movzbl -12(%rdi), %eax
movzbl -12(%rsi), %edx movzbl -12(%rsi), %edx
sub %edx, %eax sub %edx, %eax
ret ret
ALIGN (4) .p2align 4
L(Byte21): L(Byte21):
movzbl -11(%rdi), %eax movzbl -11(%rdi), %eax
movzbl -11(%rsi), %edx movzbl -11(%rsi), %edx
sub %edx, %eax sub %edx, %eax
ret ret
ALIGN (4) .p2align 4
L(Byte22): L(Byte22):
movzbl -10(%rdi), %eax movzbl -10(%rdi), %eax
movzbl -10(%rsi), %edx movzbl -10(%rsi), %edx
sub %edx, %eax sub %edx, %eax
ret ret
ALIGN (4) .p2align 4
L(next_24_bytes): L(next_24_bytes):
lea 8(%rdi), %rdi lea 8(%rdi), %rdi
lea 8(%rsi), %rsi lea 8(%rsi), %rsi
@ -1479,14 +1475,14 @@ L(next_24_bytes):
jne L(find_diff) jne L(find_diff)
ret ret
ALIGN (4) .p2align 4
L(second_double_word): L(second_double_word):
mov -12(%rdi), %eax mov -12(%rdi), %eax
cmp -12(%rsi), %eax cmp -12(%rsi), %eax
jne L(find_diff) jne L(find_diff)
ret ret
ALIGN (4) .p2align 4
L(next_two_double_words): L(next_two_double_words):
and $15, %dh and $15, %dh
jz L(fourth_double_word) jz L(fourth_double_word)
@ -1495,7 +1491,7 @@ L(next_two_double_words):
jne L(find_diff) jne L(find_diff)
ret ret
ALIGN (4) .p2align 4
L(fourth_double_word): L(fourth_double_word):
mov -4(%rdi), %eax mov -4(%rdi), %eax
cmp -4(%rsi), %eax cmp -4(%rsi), %eax
@ -1503,7 +1499,7 @@ L(fourth_double_word):
ret ret
# endif # endif
ALIGN (4) .p2align 4
L(less48bytes): L(less48bytes):
cmp $8, %ecx cmp $8, %ecx
jae L(more8bytes) jae L(more8bytes)
@ -1527,7 +1523,7 @@ L(less48bytes):
jmp L(4bytes) jmp L(4bytes)
# endif # endif
ALIGN (4) .p2align 4
L(more8bytes): L(more8bytes):
cmp $16, %ecx cmp $16, %ecx
jae L(more16bytes) jae L(more16bytes)
@ -1551,7 +1547,7 @@ L(more8bytes):
jmp L(12bytes) jmp L(12bytes)
# endif # endif
ALIGN (4) .p2align 4
L(more16bytes): L(more16bytes):
cmp $24, %ecx cmp $24, %ecx
jae L(more24bytes) jae L(more24bytes)
@ -1575,7 +1571,7 @@ L(more16bytes):
jmp L(20bytes) jmp L(20bytes)
# endif # endif
ALIGN (4) .p2align 4
L(more24bytes): L(more24bytes):
cmp $32, %ecx cmp $32, %ecx
jae L(more32bytes) jae L(more32bytes)
@ -1599,7 +1595,7 @@ L(more24bytes):
jmp L(28bytes) jmp L(28bytes)
# endif # endif
ALIGN (4) .p2align 4
L(more32bytes): L(more32bytes):
cmp $40, %ecx cmp $40, %ecx
jae L(more40bytes) jae L(more40bytes)
@ -1623,7 +1619,7 @@ L(more32bytes):
jmp L(36bytes) jmp L(36bytes)
# endif # endif
ALIGN (4) .p2align 4
L(more40bytes): L(more40bytes):
cmp $40, %ecx cmp $40, %ecx
je L(40bytes) je L(40bytes)
@ -1642,7 +1638,7 @@ L(more40bytes):
je L(46bytes) je L(46bytes)
jmp L(47bytes) jmp L(47bytes)
ALIGN (4) .p2align 4
L(44bytes): L(44bytes):
movl -44(%rdi), %eax movl -44(%rdi), %eax
movl -44(%rsi), %ecx movl -44(%rsi), %ecx
@ -1702,7 +1698,7 @@ L(0bytes):
xor %eax, %eax xor %eax, %eax
ret ret
# else # else
ALIGN (4) .p2align 4
L(44bytes): L(44bytes):
movl -44(%rdi), %eax movl -44(%rdi), %eax
cmp -44(%rsi), %eax cmp -44(%rsi), %eax
@ -1753,7 +1749,7 @@ L(0bytes):
# endif # endif
# ifndef USE_AS_WMEMCMP # ifndef USE_AS_WMEMCMP
ALIGN (4) .p2align 4
L(45bytes): L(45bytes):
movl -45(%rdi), %eax movl -45(%rdi), %eax
movl -45(%rsi), %ecx movl -45(%rsi), %ecx
@ -1816,7 +1812,7 @@ L(1bytes):
xor %eax, %eax xor %eax, %eax
ret ret
ALIGN (4) .p2align 4
L(46bytes): L(46bytes):
movl -46(%rdi), %eax movl -46(%rdi), %eax
movl -46(%rsi), %ecx movl -46(%rsi), %ecx
@ -1882,7 +1878,7 @@ L(2bytes):
xor %eax, %eax xor %eax, %eax
ret ret
ALIGN (4) .p2align 4
L(47bytes): L(47bytes):
movl -47(%rdi), %eax movl -47(%rdi), %eax
movl -47(%rsi), %ecx movl -47(%rsi), %ecx
@ -1951,7 +1947,7 @@ L(3bytes):
xor %eax, %eax xor %eax, %eax
ret ret
ALIGN (4) .p2align 4
L(find_diff): L(find_diff):
cmpb %cl, %al cmpb %cl, %al
jne L(set) jne L(set)
@ -1973,19 +1969,19 @@ L(set):
# else # else
/* for wmemcmp */ /* for wmemcmp */
ALIGN (4) .p2align 4
L(find_diff): L(find_diff):
mov $1, %eax mov $1, %eax
jg L(find_diff_bigger) jg L(find_diff_bigger)
neg %eax neg %eax
ret ret
ALIGN (4) .p2align 4
L(find_diff_bigger): L(find_diff_bigger):
ret ret
# endif # endif
ALIGN (4) .p2align 4
L(equal): L(equal):
xor %eax, %eax xor %eax, %eax
ret ret

sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S

@ -20,10 +20,6 @@
#include "asm-syntax.h" #include "asm-syntax.h"
#ifndef ALIGN
# define ALIGN(n) .p2align n
#endif
ENTRY(__memcpy_sse2_unaligned) ENTRY(__memcpy_sse2_unaligned)
movq %rsi, %rax movq %rsi, %rax
@ -44,7 +40,7 @@ L(return):
movq %rdi, %rax movq %rdi, %rax
ret ret
.p2align 4,,10 .p2align 4,,10
ALIGN(4) .p2align 4
.L31: .L31:
movdqu 16(%rsi), %xmm8 movdqu 16(%rsi), %xmm8
cmpq $64, %rdx cmpq $64, %rdx
@ -77,7 +73,7 @@ L(return):
leaq 32(%r10), %r8 leaq 32(%r10), %r8
leaq 48(%r10), %rax leaq 48(%r10), %rax
.p2align 4,,10 .p2align 4,,10
ALIGN(4) .p2align 4
L(loop): L(loop):
movdqu (%rcx,%r10), %xmm8 movdqu (%rcx,%r10), %xmm8
movdqa %xmm8, (%rcx) movdqa %xmm8, (%rcx)
@ -151,7 +147,7 @@ L(less_16):
.L3: .L3:
leaq -1(%rdx), %rax leaq -1(%rdx), %rax
.p2align 4,,10 .p2align 4,,10
ALIGN(4) .p2align 4
.L11: .L11:
movzbl (%rsi,%rax), %edx movzbl (%rsi,%rax), %edx
movb %dl, (%rdi,%rax) movb %dl, (%rdi,%rax)

sysdeps/x86_64/multiarch/memcpy-ssse3-back.S

@ -31,10 +31,6 @@
# define MEMCPY_CHK __memcpy_chk_ssse3_back # define MEMCPY_CHK __memcpy_chk_ssse3_back
#endif #endif
#ifndef ALIGN
# define ALIGN(n) .p2align n
#endif
#define JMPTBL(I, B) I - B #define JMPTBL(I, B) I - B
/* Branch to an entry in a jump table. TABLE is a jump table with /* Branch to an entry in a jump table. TABLE is a jump table with
@ -87,7 +83,7 @@ L(bk_write):
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
#endif #endif
ALIGN (4) .p2align 4
L(144bytesormore): L(144bytesormore):
#ifndef USE_AS_MEMMOVE #ifndef USE_AS_MEMMOVE
@ -119,7 +115,7 @@ L(144bytesormore):
jmp *%r9 jmp *%r9
ud2 ud2
ALIGN (4) .p2align 4
L(copy_backward): L(copy_backward):
#ifdef DATA_CACHE_SIZE #ifdef DATA_CACHE_SIZE
mov $DATA_CACHE_SIZE, %RCX_LP mov $DATA_CACHE_SIZE, %RCX_LP
@ -149,7 +145,7 @@ L(copy_backward):
jmp *%r9 jmp *%r9
ud2 ud2
ALIGN (4) .p2align 4
L(shl_0): L(shl_0):
mov %rdx, %r9 mov %rdx, %r9
@ -162,7 +158,7 @@ L(shl_0):
#endif #endif
jae L(gobble_mem_fwd) jae L(gobble_mem_fwd)
sub $0x80, %rdx sub $0x80, %rdx
ALIGN (4) .p2align 4
L(shl_0_loop): L(shl_0_loop):
movdqa (%rsi), %xmm1 movdqa (%rsi), %xmm1
movdqa %xmm1, (%rdi) movdqa %xmm1, (%rdi)
@ -190,7 +186,7 @@ L(shl_0_loop):
add %rdx, %rdi add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_0_bwd): L(shl_0_bwd):
sub $0x80, %rdx sub $0x80, %rdx
L(copy_backward_loop): L(copy_backward_loop):
@ -221,7 +217,7 @@ L(copy_backward_loop):
sub %rdx, %rsi sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_1): L(shl_1):
sub $0x80, %rdx sub $0x80, %rdx
movaps -0x01(%rsi), %xmm1 movaps -0x01(%rsi), %xmm1
@ -258,7 +254,7 @@ L(shl_1):
add %rdx, %rsi add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_1_bwd): L(shl_1_bwd):
movaps -0x01(%rsi), %xmm1 movaps -0x01(%rsi), %xmm1
@ -304,7 +300,7 @@ L(shl_1_bwd):
sub %rdx, %rsi sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_2): L(shl_2):
sub $0x80, %rdx sub $0x80, %rdx
movaps -0x02(%rsi), %xmm1 movaps -0x02(%rsi), %xmm1
@ -341,7 +337,7 @@ L(shl_2):
add %rdx, %rsi add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_2_bwd): L(shl_2_bwd):
movaps -0x02(%rsi), %xmm1 movaps -0x02(%rsi), %xmm1
@ -387,7 +383,7 @@ L(shl_2_bwd):
sub %rdx, %rsi sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_3): L(shl_3):
sub $0x80, %rdx sub $0x80, %rdx
movaps -0x03(%rsi), %xmm1 movaps -0x03(%rsi), %xmm1
@ -424,7 +420,7 @@ L(shl_3):
add %rdx, %rsi add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_3_bwd): L(shl_3_bwd):
movaps -0x03(%rsi), %xmm1 movaps -0x03(%rsi), %xmm1
@ -470,7 +466,7 @@ L(shl_3_bwd):
sub %rdx, %rsi sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_4): L(shl_4):
sub $0x80, %rdx sub $0x80, %rdx
movaps -0x04(%rsi), %xmm1 movaps -0x04(%rsi), %xmm1
@ -507,7 +503,7 @@ L(shl_4):
add %rdx, %rsi add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_4_bwd): L(shl_4_bwd):
movaps -0x04(%rsi), %xmm1 movaps -0x04(%rsi), %xmm1
@ -553,7 +549,7 @@ L(shl_4_bwd):
sub %rdx, %rsi sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_5): L(shl_5):
sub $0x80, %rdx sub $0x80, %rdx
movaps -0x05(%rsi), %xmm1 movaps -0x05(%rsi), %xmm1
@ -590,7 +586,7 @@ L(shl_5):
add %rdx, %rsi add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_5_bwd): L(shl_5_bwd):
movaps -0x05(%rsi), %xmm1 movaps -0x05(%rsi), %xmm1
@ -636,7 +632,7 @@ L(shl_5_bwd):
sub %rdx, %rsi sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_6): L(shl_6):
sub $0x80, %rdx sub $0x80, %rdx
movaps -0x06(%rsi), %xmm1 movaps -0x06(%rsi), %xmm1
@ -673,7 +669,7 @@ L(shl_6):
add %rdx, %rsi add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_6_bwd): L(shl_6_bwd):
movaps -0x06(%rsi), %xmm1 movaps -0x06(%rsi), %xmm1
@ -719,7 +715,7 @@ L(shl_6_bwd):
sub %rdx, %rsi sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_7): L(shl_7):
sub $0x80, %rdx sub $0x80, %rdx
movaps -0x07(%rsi), %xmm1 movaps -0x07(%rsi), %xmm1
@ -756,7 +752,7 @@ L(shl_7):
add %rdx, %rsi add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_7_bwd): L(shl_7_bwd):
movaps -0x07(%rsi), %xmm1 movaps -0x07(%rsi), %xmm1
@ -802,7 +798,7 @@ L(shl_7_bwd):
sub %rdx, %rsi sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_8): L(shl_8):
sub $0x80, %rdx sub $0x80, %rdx
movaps -0x08(%rsi), %xmm1 movaps -0x08(%rsi), %xmm1
@ -839,7 +835,7 @@ L(shl_8):
add %rdx, %rsi add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_8_bwd): L(shl_8_bwd):
movaps -0x08(%rsi), %xmm1 movaps -0x08(%rsi), %xmm1
@ -886,7 +882,7 @@ L(shl_8_end_bwd):
sub %rdx, %rsi sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_9): L(shl_9):
sub $0x80, %rdx sub $0x80, %rdx
movaps -0x09(%rsi), %xmm1 movaps -0x09(%rsi), %xmm1
@ -923,7 +919,7 @@ L(shl_9):
add %rdx, %rsi add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_9_bwd): L(shl_9_bwd):
movaps -0x09(%rsi), %xmm1 movaps -0x09(%rsi), %xmm1
@ -969,7 +965,7 @@ L(shl_9_bwd):
sub %rdx, %rsi sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_10): L(shl_10):
sub $0x80, %rdx sub $0x80, %rdx
movaps -0x0a(%rsi), %xmm1 movaps -0x0a(%rsi), %xmm1
@ -1006,7 +1002,7 @@ L(shl_10):
add %rdx, %rsi add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_10_bwd): L(shl_10_bwd):
movaps -0x0a(%rsi), %xmm1 movaps -0x0a(%rsi), %xmm1
@ -1052,7 +1048,7 @@ L(shl_10_bwd):
sub %rdx, %rsi sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_11): L(shl_11):
sub $0x80, %rdx sub $0x80, %rdx
movaps -0x0b(%rsi), %xmm1 movaps -0x0b(%rsi), %xmm1
@ -1089,7 +1085,7 @@ L(shl_11):
add %rdx, %rsi add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_11_bwd): L(shl_11_bwd):
movaps -0x0b(%rsi), %xmm1 movaps -0x0b(%rsi), %xmm1
@ -1135,7 +1131,7 @@ L(shl_11_bwd):
sub %rdx, %rsi sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_12): L(shl_12):
sub $0x80, %rdx sub $0x80, %rdx
movdqa -0x0c(%rsi), %xmm1 movdqa -0x0c(%rsi), %xmm1
@ -1173,7 +1169,7 @@ L(shl_12):
add %rdx, %rsi add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_12_bwd): L(shl_12_bwd):
movaps -0x0c(%rsi), %xmm1 movaps -0x0c(%rsi), %xmm1
@ -1219,7 +1215,7 @@ L(shl_12_bwd):
sub %rdx, %rsi sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_13): L(shl_13):
sub $0x80, %rdx sub $0x80, %rdx
movaps -0x0d(%rsi), %xmm1 movaps -0x0d(%rsi), %xmm1
@ -1256,7 +1252,7 @@ L(shl_13):
add %rdx, %rsi add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_13_bwd): L(shl_13_bwd):
movaps -0x0d(%rsi), %xmm1 movaps -0x0d(%rsi), %xmm1
@ -1302,7 +1298,7 @@ L(shl_13_bwd):
sub %rdx, %rsi sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_14): L(shl_14):
sub $0x80, %rdx sub $0x80, %rdx
movaps -0x0e(%rsi), %xmm1 movaps -0x0e(%rsi), %xmm1
@ -1339,7 +1335,7 @@ L(shl_14):
add %rdx, %rsi add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_14_bwd): L(shl_14_bwd):
movaps -0x0e(%rsi), %xmm1 movaps -0x0e(%rsi), %xmm1
@ -1385,7 +1381,7 @@ L(shl_14_bwd):
sub %rdx, %rsi sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_15): L(shl_15):
sub $0x80, %rdx sub $0x80, %rdx
movaps -0x0f(%rsi), %xmm1 movaps -0x0f(%rsi), %xmm1
@ -1422,7 +1418,7 @@ L(shl_15):
add %rdx, %rsi add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(shl_15_bwd): L(shl_15_bwd):
movaps -0x0f(%rsi), %xmm1 movaps -0x0f(%rsi), %xmm1
@ -1468,7 +1464,7 @@ L(shl_15_bwd):
sub %rdx, %rsi sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
ALIGN (4) .p2align 4
L(gobble_mem_fwd): L(gobble_mem_fwd):
movdqu (%rsi), %xmm1 movdqu (%rsi), %xmm1
movdqu %xmm0, (%r8) movdqu %xmm0, (%r8)
@ -1570,7 +1566,7 @@ L(gobble_mem_fwd_end):
add %rdx, %rdi add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
ALIGN (4) .p2align 4
L(gobble_mem_bwd): L(gobble_mem_bwd):
add %rdx, %rsi add %rdx, %rsi
add %rdx, %rdi add %rdx, %rdi
@ -2833,7 +2829,7 @@ L(bwd_write_1bytes):
END (MEMCPY) END (MEMCPY)
.section .rodata.ssse3,"a",@progbits .section .rodata.ssse3,"a",@progbits
ALIGN (3) .p2align 3
L(table_144_bytes_bwd): L(table_144_bytes_bwd):
.int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
.int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
@ -2980,7 +2976,7 @@ L(table_144_bytes_bwd):
.int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
.int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
ALIGN (3) .p2align 3
L(table_144_bytes_fwd): L(table_144_bytes_fwd):
.int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd)) .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
.int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd)) .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
@ -3127,7 +3123,7 @@ L(table_144_bytes_fwd):
.int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd)) .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
.int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd)) .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
ALIGN (3) .p2align 3
L(shl_table_fwd): L(shl_table_fwd):
.int JMPTBL (L(shl_0), L(shl_table_fwd)) .int JMPTBL (L(shl_0), L(shl_table_fwd))
.int JMPTBL (L(shl_1), L(shl_table_fwd)) .int JMPTBL (L(shl_1), L(shl_table_fwd))
@ -3146,7 +3142,7 @@ L(shl_table_fwd):
.int JMPTBL (L(shl_14), L(shl_table_fwd)) .int JMPTBL (L(shl_14), L(shl_table_fwd))
.int JMPTBL (L(shl_15), L(shl_table_fwd)) .int JMPTBL (L(shl_15), L(shl_table_fwd))
ALIGN (3) .p2align 3
L(shl_table_bwd): L(shl_table_bwd):
.int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
.int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))

sysdeps/x86_64/multiarch/memcpy-ssse3.S (diff suppressed because it is too large)

sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S

@ -17,7 +17,6 @@
<http://www.gnu.org/licenses/>. */ <http://www.gnu.org/licenses/>. */
#include "sysdep.h" #include "sysdep.h"
#define ALIGN(x) .p2align x
ENTRY ( __strcmp_sse2_unaligned) ENTRY ( __strcmp_sse2_unaligned)
movl %edi, %eax movl %edi, %eax
@ -43,7 +42,7 @@ L(return):
subl %edx, %eax subl %edx, %eax
ret ret
ALIGN (4) .p2align 4
L(next_48_bytes): L(next_48_bytes):
movdqu 16(%rdi), %xmm6 movdqu 16(%rdi), %xmm6
movdqu 16(%rsi), %xmm3 movdqu 16(%rsi), %xmm3
@ -85,7 +84,7 @@ L(main_loop_header):
movq %rcx, %rsi movq %rcx, %rsi
jmp L(loop_start) jmp L(loop_start)
ALIGN (4) .p2align 4
L(loop): L(loop):
addq $64, %rax addq $64, %rax
addq $64, %rdx addq $64, %rdx
@ -141,7 +140,7 @@ L(back_to_loop):
subl %edx, %eax subl %edx, %eax
ret ret
ALIGN (4) .p2align 4
L(loop_cross_page): L(loop_cross_page):
xor %r10, %r10 xor %r10, %r10
movq %rdx, %r9 movq %rdx, %r9
@ -191,7 +190,7 @@ L(loop_cross_page):
subl %edx, %eax subl %edx, %eax
ret ret
ALIGN (4) .p2align 4
L(cross_page_loop): L(cross_page_loop):
cmpb %cl, %al cmpb %cl, %al
jne L(different) jne L(different)

sysdeps/x86_64/strchr.S

@ -19,11 +19,6 @@
#include <sysdep.h> #include <sysdep.h>
# ifndef ALIGN
# define ALIGN(n) .p2align n
# endif
.text .text
ENTRY (strchr) ENTRY (strchr)
movd %esi, %xmm1 movd %esi, %xmm1
@ -54,7 +49,7 @@ ENTRY (strchr)
#endif #endif
ret ret
ALIGN(3) .p2align 3
L(next_48_bytes): L(next_48_bytes):
movdqu 16(%rdi), %xmm0 movdqu 16(%rdi), %xmm0
movdqa %xmm0, %xmm4 movdqa %xmm0, %xmm4
@ -83,10 +78,10 @@ ENTRY (strchr)
L(loop_start): L(loop_start):
/* We use this alignment to force loop be aligned to 8 but not /* We use this alignment to force loop be aligned to 8 but not
16 bytes. This gives better sheduling on AMD processors. */ 16 bytes. This gives better sheduling on AMD processors. */
ALIGN(4) .p2align 4
pxor %xmm6, %xmm6 pxor %xmm6, %xmm6
andq $-64, %rdi andq $-64, %rdi
ALIGN(3) .p2align 3
L(loop64): L(loop64):
addq $64, %rdi addq $64, %rdi
movdqa (%rdi), %xmm5 movdqa (%rdi), %xmm5
@ -129,7 +124,7 @@ L(loop64):
orq %rcx, %rax orq %rcx, %rax
salq $48, %rdx salq $48, %rdx
orq %rdx, %rax orq %rdx, %rax
ALIGN(3) .p2align 3
L(return): L(return):
bsfq %rax, %rax bsfq %rax, %rax
#ifdef AS_STRCHRNUL #ifdef AS_STRCHRNUL
@ -141,7 +136,7 @@ L(return):
cmovne %rdx, %rax cmovne %rdx, %rax
#endif #endif
ret ret
ALIGN(4) .p2align 4
L(cross_page): L(cross_page):
movq %rdi, %rdx movq %rdi, %rdx
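
The comment kept in the strchr.S hunk above explains why that file mixes .p2align 4 and .p2align 3 around L(loop64): the goal is a loop entry aligned to 8 but deliberately not to 16 bytes, which the comment says schedules better on AMD processors. A minimal sketch of how that pairing works (illustrative only; the label name and the reliance on these exact 4-byte encodings are assumptions, not taken from the patch):

		.text
		.p2align 4		/* here: 16-byte boundary           */
		pxor	%xmm6, %xmm6	/* 4-byte instruction               */
		andq	$-64, %rdi	/* 4-byte instruction -> offset 8   */
		.p2align 3		/* already 8-aligned, emits nothing */
	loop64:				/* lands at an address = 8 mod 16   */
		ret

Because the two instructions between the directives occupy exactly 8 bytes, the second directive adds no padding and the loop label ends up 8 bytes past a 16-byte boundary, which is what the original ALIGN(4)/ALIGN(3) pair achieved before the conversion.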

sysdeps/x86_64/strrchr.S

@ -19,11 +19,6 @@
#include <sysdep.h> #include <sysdep.h>
# ifndef ALIGN
# define ALIGN(n) .p2align n
# endif
.text .text
ENTRY (strrchr) ENTRY (strrchr)
movd %esi, %xmm1 movd %esi, %xmm1
@ -51,7 +46,7 @@ ENTRY (strrchr)
addq %rdi, %rax addq %rdi, %rax
ret ret
ALIGN(4) .p2align 4
L(next_48_bytes): L(next_48_bytes):
movdqu 16(%rdi), %xmm4 movdqu 16(%rdi), %xmm4
movdqa %xmm4, %xmm5 movdqa %xmm4, %xmm5
@ -91,7 +86,7 @@ L(next_48_bytes):
leaq (%rdi,%rsi), %rax leaq (%rdi,%rsi), %rax
ret ret
ALIGN(4) .p2align 4
L(loop_header2): L(loop_header2):
testq %rsi, %rsi testq %rsi, %rsi
movq %rdi, %rcx movq %rdi, %rcx
@ -102,7 +97,7 @@ L(loop_header):
andq $-64, %rdi andq $-64, %rdi
jmp L(loop_entry) jmp L(loop_entry)
ALIGN(4) .p2align 4
L(loop64): L(loop64):
testq %rdx, %rdx testq %rdx, %rdx
cmovne %rdx, %rsi cmovne %rdx, %rsi
@ -163,18 +158,18 @@ L(loop_entry):
leaq (%rcx,%rsi), %rax leaq (%rcx,%rsi), %rax
ret ret
ALIGN(4) .p2align 4
L(no_c_found): L(no_c_found):
movl $1, %esi movl $1, %esi
xorl %ecx, %ecx xorl %ecx, %ecx
jmp L(loop_header) jmp L(loop_header)
ALIGN(4) .p2align 4
L(exit): L(exit):
xorl %eax, %eax xorl %eax, %eax
ret ret
ALIGN(4) .p2align 4
L(cross_page): L(cross_page):
movq %rdi, %rax movq %rdi, %rax
pxor %xmm0, %xmm0 pxor %xmm0, %xmm0