x86_64: Fix svml_d_cosh4_core_avx2.S code formatting

This commit contains following formatting changes

1. Instructions preceded by a tab.
2. Instruction less than 8 characters in length have a tab
   between it and the first operand.
3. Instruction greater than 7 characters in length have a
   space between it and the first operand.
4. Tabs after `#define`d names and their value.
5. 8 spaces at the beginning of a line replaced by a tab.
6. Indent comments with code.
7. Remove redundant .text section.
8. 1 space between line content and line comment.
9. Space after all commas.

Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
This commit is contained in:
Sunil K Pandey 2022-03-07 10:47:11 -08:00
parent 23ff8b384f
commit e4d257bdf9

View File

@ -50,8 +50,7 @@
#include <sysdep.h>
.text
.section .text.avx2,"ax",@progbits
.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN4v_cosh_avx2)
pushq %rbp
cfi_def_cfa_offset(16)
@ -64,33 +63,33 @@ ENTRY(_ZGVdN4v_cosh_avx2)
vmovupd _dSign+__svml_dcosh_data_internal(%rip), %ymm8
vmovupd _dbShifter+__svml_dcosh_data_internal(%rip), %ymm6
/*
/*
* Load argument
* dM = x*2^K/log(2) + RShifter
*/
vmovupd _dbInvLn2+__svml_dcosh_data_internal(%rip), %ymm3
/*
/*
* trick
* 256=-iIndex
*/
vmovups _iMaxIndex+__svml_dcosh_data_internal(%rip), %xmm14
/* dXSign=0x001000000000 */
/* dXSign=0x001000000000 */
vpsrlq $11, %ymm8, %ymm5
vmovapd %ymm0, %ymm7
/* Abs argument */
/* Abs argument */
vandnpd %ymm7, %ymm8, %ymm4
vfmadd213pd %ymm6, %ymm4, %ymm3
/* Index and lookup */
/* Index and lookup */
vextractf128 $1, %ymm3, %xmm12
vshufps $136, %xmm12, %xmm3, %xmm13
vpand _iIndexMask+__svml_dcosh_data_internal(%rip), %xmm13, %xmm15
vpsubd %xmm15, %xmm14, %xmm0
/* iDomainRange*=3 */
/* iDomainRange*=3 */
vpslld $3, %xmm0, %xmm2
vmovd %xmm2, %r9d
vpextrd $2, %xmm2, %r11d
@ -98,21 +97,21 @@ ENTRY(_ZGVdN4v_cosh_avx2)
vpextrd $1, %xmm2, %r10d
movslq %r11d, %r11
movslq %r10d, %r10
vmovsd (%rax,%r9), %xmm12
vmovsd (%rax, %r9), %xmm12
/*
/*
* Check for overflow\underflow
*
*/
vextractf128 $1, %ymm4, %xmm9
vmovsd (%rax,%r11), %xmm14
vmovhpd (%rax,%r10), %xmm12, %xmm13
vmovsd (%rax, %r11), %xmm14
vmovhpd (%rax, %r10), %xmm12, %xmm13
vshufps $221, %xmm9, %xmm4, %xmm10
/* iIndex*=3 */
/* iIndex*=3 */
vpslld $3, %xmm15, %xmm9
/*
/*
* R
* dN = dM - RShifter
*/
@ -121,83 +120,83 @@ ENTRY(_ZGVdN4v_cosh_avx2)
vpcmpgtd _iDomainRange+__svml_dcosh_data_internal(%rip), %xmm10, %xmm11
vmovupd _dbLn2hi+__svml_dcosh_data_internal(%rip), %ymm6
/*
* G1,G2,G3: dTdif,dTn * 2^N,2^(-N)
/*
* G1, G2, G3: dTdif, dTn * 2^N, 2^(-N)
* NB: copied from sinh_la - to be optimized!!!!!
*/
vpsllq $44, %ymm3, %ymm3
vmovmskps %xmm11, %edx
/* dR = dX - dN*Log2_hi/2^K */
/* dR = dX - dN*Log2_hi/2^K */
vfnmadd231pd %ymm6, %ymm15, %ymm4
/* lM now is an EXP(2^N) */
/* lM now is an EXP(2^N) */
vpand _lExpMask+__svml_dcosh_data_internal(%rip), %ymm3, %ymm3
/* dR = (dX - dN*Log2_hi/2^K) - dN*Log2_lo/2^K */
/* dR = (dX - dN*Log2_hi/2^K) - dN*Log2_lo/2^K */
vfnmadd231pd _dbLn2lo+__svml_dcosh_data_internal(%rip), %ymm15, %ymm4
movslq %ecx, %rcx
vpextrd $2, %xmm9, %edi
vpextrd $1, %xmm9, %esi
movslq %edi, %rdi
vmovsd (%rax,%rcx), %xmm1
vmovsd (%rax, %rcx), %xmm1
vpextrd $3, %xmm9, %r8d
vpextrd $3, %xmm2, %ecx
movslq %esi, %rsi
movslq %r8d, %r8
movslq %ecx, %rcx
/* dR2 = dR^2 */
/* dR2 = dR^2 */
vmulpd %ymm4, %ymm4, %ymm0
vmovsd (%rax,%rdi), %xmm10
vmovhpd (%rax,%rsi), %xmm1, %xmm8
vmovhpd (%rax,%r8), %xmm10, %xmm11
vmovhpd (%rax,%rcx), %xmm14, %xmm2
vmovsd (%rax, %rdi), %xmm10
vmovhpd (%rax, %rsi), %xmm1, %xmm8
vmovhpd (%rax, %r8), %xmm10, %xmm11
vmovhpd (%rax, %rcx), %xmm14, %xmm2
vinsertf128 $1, %xmm11, %ymm8, %ymm1
vinsertf128 $1, %xmm2, %ymm13, %ymm2
vpaddq %ymm3, %ymm1, %ymm6
/* */
/* */
vpsubq %ymm3, %ymm2, %ymm1
/*
/*
* sinh(r) = r +r*r^2*a3 ....
* dSinh_r = r^2*a3
*/
vmulpd _dPC3+__svml_dcosh_data_internal(%rip), %ymm0, %ymm2
/* lX- = EXP(1/2) */
/* lX- = EXP(1/2) */
vpsubq %ymm5, %ymm1, %ymm5
/* dSinh_r = r + r*r^2*a3 */
/* dSinh_r = r + r*r^2*a3 */
vfmadd213pd %ymm4, %ymm4, %ymm2
/* poly(r) = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */
/* poly(r) = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */
vmovupd _dPC4+__svml_dcosh_data_internal(%rip), %ymm4
/* dTn = dTn*2^N - dTn*2^-N */
/* dTn = dTn*2^N - dTn*2^-N */
vsubpd %ymm5, %ymm6, %ymm1
/* dTp = dTn*2^N + dTn*2^-N */
/* dTp = dTn*2^N + dTn*2^-N */
vaddpd %ymm5, %ymm6, %ymm3
vfmadd213pd _dPC2+__svml_dcosh_data_internal(%rip), %ymm0, %ymm4
vmulpd %ymm2, %ymm1, %ymm1
vmulpd %ymm4, %ymm0, %ymm0
/* dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */
/* dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */
vfmadd213pd %ymm1, %ymm3, %ymm0
/* _VRES1 = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */
/* _VRES1 = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */
vaddpd %ymm0, %ymm3, %ymm0
/* Ret H */
/* Ret H */
testl %edx, %edx
/* Go to special inputs processing branch */
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx ymm0 ymm7
/* Restore registers
/* Restore registers
* and exit the function
*/
@ -210,7 +209,7 @@ L(EXIT):
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
/* Branch to process
* special inputs
*/
@ -236,18 +235,18 @@ L(SPECIAL_VALUES_BRANCH):
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
/* Special inputs
* processing loop
*/
@ -255,7 +254,7 @@ L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $4, %r12d
/* Check bits in range mask */
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
@ -267,7 +266,7 @@ L(SPECIAL_VALUES_LOOP):
cfi_restore(14)
vmovupd 64(%rsp), %ymm0
/* Go to exit */
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
@ -277,19 +276,19 @@ L(SPECIAL_VALUES_LOOP):
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 ymm0
/* Scalar math fucntion call
/* Scalar math fucntion call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movsd 32(%rsp,%r14,8), %xmm0
movsd 32(%rsp, %r14, 8), %xmm0
call cosh@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movsd %xmm0, 64(%rsp,%r14,8)
movsd %xmm0, 64(%rsp, %r14, 8)
/* Process special inputs in loop */
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVdN4v_cosh_avx2)
@ -299,25 +298,24 @@ END(_ZGVdN4v_cosh_avx2)
#ifdef __svml_dcosh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(32)) VUINT32 _dbT[(1 + (1<<8))][2]; //dTpj ONLY!
typedef struct {
__declspec(align(32)) VUINT32 _dbT[(1+(1<<8))][2]; // dTpj ONLY!
__declspec(align(32)) VUINT32 _dbInvLn2[4][2];
__declspec(align(32)) VUINT32 _dbLn2hi[4][2];
__declspec(align(32)) VUINT32 _dbLn2lo[4][2];
__declspec(align(32)) VUINT32 _dbShifter[4][2];
__declspec(align(32)) VUINT32 _iIndexMask[8][1]; //(1<<K)1-
__declspec(align(32)) VUINT32 _iIndexMask[8][1]; // (1<<K)1-
__declspec(align(32)) VUINT32 _dPC2[4][2];
__declspec(align(32)) VUINT32 _dPC3[4][2];
__declspec(align(32)) VUINT32 _dPC4[4][2];
__declspec(align(32)) VUINT32 _iMaxIndex[8][1]; //(1<<K)
__declspec(align(32)) VUINT32 _iMaxIndex[8][1]; // (1<<K)
__declspec(align(32)) VUINT32 _lExpMask[4][2];
__declspec(align(32)) VUINT32 _dSign[4][2]; //0x8000000000000000
__declspec(align(32)) VUINT32 _dSign[4][2]; // 0x8000000000000000
__declspec(align(32)) VUINT32 _iDomainRange[8][1];
} __svml_dcosh_data_internal;
#endif
__svml_dcosh_data_internal:
/*== _dbT ==*/
/* _dbT */
.quad 0x3fe0000000000000, 0x3fe00b1afa5abcbf, 0x3fe0163da9fb3335, 0x3fe02168143b0281
.quad 0x3fe02c9a3e778061, 0x3fe037d42e11bbcc, 0x3fe04315e86e7f85, 0x3fe04e5f72f654b1
.quad 0x3fe059b0d3158574, 0x3fe0650a0e3c1f89, 0x3fe0706b29ddf6de, 0x3fe07bd42b72a836
@ -386,9 +384,9 @@ __svml_dcosh_data_internal:
.align 32
.quad 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe /* _dbInvLn2 = 1/log(2) */
.align 32
.quad 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000 /* _dbLn2hi = log(2) hi*/
.quad 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000 /* _dbLn2hi = log(2) hi */
.align 32
.quad 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899 /* _dbLn2lo = log(2) lo*/
.quad 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899 /* _dbLn2lo = log(2) lo */
.align 32
.quad 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000 /* _dbShifter */
.align 32
@ -404,9 +402,9 @@ __svml_dcosh_data_internal:
.align 32
.quad 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 /* _lExpMask */
.align 32
.quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000 /* _dSign*/
.quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000 /* _dSign */
.align 32
.long 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99 /* _iDomainRange 0x40861d9ac12a3e85 =(1021*2^K-0.5)*log(2)/2^K -needed for quick exp*/
.long 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99 /* _iDomainRange 0x40861d9ac12a3e85 =(1021*2^K-0.5)*log(2)/2^K -needed for quick exp */
.align 32
.type __svml_dcosh_data_internal,@object
.size __svml_dcosh_data_internal,.-__svml_dcosh_data_internal
.type __svml_dcosh_data_internal, @object
.size __svml_dcosh_data_internal, .-__svml_dcosh_data_internal