x86_64: Fix svml_d_tan8_core_avx512.S code formatting

This commit contains the following formatting changes, illustrated by a
short before/after excerpt after the list:

1. Instructions are preceded by a tab.
2. Instructions shorter than 8 characters have a tab
   between the mnemonic and the first operand.
3. Instructions 8 characters or longer have a single
   space between the mnemonic and the first operand.
4. Tabs between `#define`d names and their values.
5. Eight spaces at the beginning of a line replaced by a tab.
6. Indent comments with the code.
7. Remove the redundant .text section.
8. One space between line content and a line comment.
9. Space after all commas.
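
For a quick sense of how these rules read in practice, here is a sketch of the
change around the scalar tan call (one of the hunks below). The diff view
appears to hide pure-whitespace changes, so the exact before/after indentation
is an assumption based on rules 1, 2, 5 and 6; only the added spaces after the
commas are directly visible in the hunks:

Before (8-space indent, padded mnemonic, comment at column 0, no space after
the inner commas):

        movsd     %xmm0, 128(%rsp,%r14,8)
/* Process special inputs in loop */

After (tab indent, tab after the short mnemonic `movsd`, comment aligned with
the code, a space after every comma):

	movsd	%xmm0, 128(%rsp, %r14, 8)
	/* Process special inputs in loop */

The same comma-spacing change shows up in the vgatherdpd operands and the
.long data lines further down.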

Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
Author: Sunil K Pandey
Date:   2022-03-07 10:47:15 -08:00
Parent: 7425f0c1e5
Commit: 8589dee1f2

@@ -55,8 +55,7 @@
#include <sysdep.h>
.text
.section .text.evex512,"ax",@progbits
.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN8v_tan_skx)
pushq %rbp
cfi_def_cfa_offset(16)
@@ -67,10 +66,10 @@ ENTRY(_ZGVeN8v_tan_skx)
subq $192, %rsp
xorl %edx, %edx
/* Large values check */
/* Large values check */
vmovups _dReductionRangeVal+__svml_dtan_data_internal(%rip), %zmm1
/*
/*
*
* Main path
*
@@ -85,7 +84,7 @@ ENTRY(_ZGVeN8v_tan_skx)
vcmppd $22, {sae}, %zmm1, %zmm0, %k1
vmovups __svml_dtan_data_internal(%rip), %zmm1
/*
/*
*
* End of main path
*/
@@ -97,11 +96,11 @@ ENTRY(_ZGVeN8v_tan_skx)
vfnmadd231pd {rn-sae}, %zmm8, %zmm3, %zmm5
vfnmadd213pd {rn-sae}, %zmm5, %zmm4, %zmm8
/* Go to auxilary branch */
/* Go to auxilary branch */
jne L(AUX_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1 zmm8 zmm11 k1
/* Return from auxilary branch
/* Return from auxilary branch
* for out of main path inputs
*/
@@ -121,40 +120,40 @@ L(AUX_BRANCH_RETURN):
vmulpd {rn-sae}, %zmm8, %zmm5, %zmm7
vfmadd213pd {rn-sae}, %zmm8, %zmm6, %zmm7
/*
/*
* Computer Denominator:
* dDenominator - dDlow ~= 1-(dTh+dTl)*(dP+dPlow)
*/
vmovups _dOne_uisa+__svml_dtan_data_internal(%rip), %zmm8
/*
/*
* Compute Numerator:
* dNumerator + dNlow ~= dTh+dTl+dP+dPlow
*/
vaddpd {rn-sae}, %zmm0, %zmm7, %zmm9
vfnmadd213pd {rn-sae}, %zmm8, %zmm7, %zmm0
/*
/*
* Now computes (dNumerator + dNlow)/(dDenominator - dDlow)
* Choose NR iteration instead of hardware division
*/
vrcp14pd %zmm0, %zmm10
/* One NR iteration to refine dRcp */
/* One NR iteration to refine dRcp */
vfnmadd231pd {rn-sae}, %zmm10, %zmm0, %zmm8
vfmadd213pd {rn-sae}, %zmm10, %zmm8, %zmm10
vmulpd {rn-sae}, %zmm9, %zmm10, %zmm12
/* One NR iteration to refine dQuotient */
/* One NR iteration to refine dQuotient */
vfmsub213pd {rn-sae}, %zmm9, %zmm12, %zmm0
vfnmadd213pd {rn-sae}, %zmm12, %zmm10, %zmm0
testl %edx, %edx
/* Go to special inputs processing branch */
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
/* Restore registers
/* Restore registers
* and exit the function
*/
@@ -167,7 +166,7 @@ L(EXIT):
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
/* Branch to process
* special inputs
*/
@@ -193,18 +192,18 @@ L(SPECIAL_VALUES_BRANCH):
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
/* Special inputs
* processing loop
*/
@@ -212,7 +211,7 @@ L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $8, %r12d
/* Check bits in range mask */
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
@@ -224,7 +223,7 @@ L(SPECIAL_VALUES_LOOP):
cfi_restore(14)
vmovups 128(%rsp), %zmm0
/* Go to exit */
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
@@ -234,33 +233,33 @@ L(SPECIAL_VALUES_LOOP):
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 zmm0
/* Scalar math fucntion call
/* Scalar math fucntion call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movsd 64(%rsp,%r14,8), %xmm0
movsd 64(%rsp, %r14, 8), %xmm0
call tan@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movsd %xmm0, 128(%rsp,%r14,8)
movsd %xmm0, 128(%rsp, %r14, 8)
/* Process special inputs in loop */
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
cfi_restore(12)
cfi_restore(13)
cfi_restore(14)
# LOE rbx r15 r12d r13d
/* Auxilary branch
/* Auxilary branch
* for out of main path inputs
*/
L(AUX_BRANCH):
vmovups _dRangeVal+__svml_dtan_data_internal(%rip), %zmm14
/*
/*
* Get the (2^a / 2pi) mod 1 values from the table.
* Because doesn't have L-type gather, we need a trivial cast
*/
@@ -280,7 +279,7 @@ L(AUX_BRANCH):
vpandnq %zmm6, %zmm6, %zmm5{%k2}
vcmppd $3, {sae}, %zmm5, %zmm5, %k0
/*
/*
* Break the P_xxx and m into 32-bit chunks ready for
* the long multiplication via 32x32->64 multiplications
*/
@@ -292,14 +291,14 @@ L(AUX_BRANCH):
vpxord %zmm2, %zmm2, %zmm2
vpxord %zmm1, %zmm1, %zmm1
vpxord %zmm8, %zmm8, %zmm8
vgatherdpd (%rax,%ymm3), %zmm2{%k3}
vgatherdpd 8(%rax,%ymm3), %zmm1{%k4}
vgatherdpd 16(%rax,%ymm3), %zmm8{%k5}
vgatherdpd (%rax, %ymm3), %zmm2{%k3}
vgatherdpd 8(%rax, %ymm3), %zmm1{%k4}
vgatherdpd 16(%rax, %ymm3), %zmm8{%k5}
vpsrlq $32, %zmm2, %zmm5
vpsrlq $32, %zmm1, %zmm0
vpsrlq $32, %zmm8, %zmm13
/*
/*
* Also get the significand as an integer
* NB: adding in the integer bit is wrong for denorms!
* To make this work for denorms we should do something slightly different
@@ -313,7 +312,7 @@ L(AUX_BRANCH):
vpandq %zmm6, %zmm8, %zmm15
vpandq %zmm6, %zmm14, %zmm14
/* Now do the big multiplication and carry propagation */
/* Now do the big multiplication and carry propagation */
vpmullq %zmm10, %zmm7, %zmm4
vpmullq %zmm12, %zmm7, %zmm2
vpmullq %zmm13, %zmm7, %zmm1
@@ -340,7 +339,7 @@ L(AUX_BRANCH):
vpaddq %zmm2, %zmm10, %zmm1
vpaddq %zmm1, %zmm0, %zmm8
/*
/*
* Now round at the 2^-9 bit position for reduction mod pi/2^8
* instead of the original 2pi (but still with the same 2pi scaling).
* Use a shifter of 2^43 + 2^42.
@@ -359,7 +358,7 @@ L(AUX_BRANCH):
vpsrlq $32, %zmm15, %zmm12
vpaddq %zmm13, %zmm12, %zmm5
/* Assemble reduced argument from the pieces */
/* Assemble reduced argument from the pieces */
vpandq %zmm6, %zmm14, %zmm10
vpandq %zmm6, %zmm15, %zmm7
vpsllq $32, %zmm5, %zmm6
@@ -368,7 +367,7 @@ L(AUX_BRANCH):
vpaddq %zmm10, %zmm5, %zmm10
vpsrlq $12, %zmm4, %zmm6
/*
/*
* We want to incorporate the original sign now too.
* Do it here for convenience in getting the right N value,
* though we could wait right to the end if we were prepared
@@ -379,7 +378,7 @@ L(AUX_BRANCH):
vpandq .FLT_25(%rip){1to8}, %zmm10, %zmm13
vpsllq $28, %zmm13, %zmm14
/*
/*
* Create floating-point high part, implicitly adding integer bit 1
* Incorporate overall sign at this stage too.
*/
@@ -389,7 +388,7 @@ L(AUX_BRANCH):
vsubpd {rn-sae}, %zmm1, %zmm12, %zmm3
vsubpd {rn-sae}, %zmm3, %zmm2, %zmm7
/*
/*
* Create floating-point low and medium parts, respectively
* lo_23, ... lo_0, 0, ..., 0
* hi_11, ... hi_0, lo_63, ..., lo_24
@@ -402,7 +401,7 @@ L(AUX_BRANCH):
vpandq .FLT_27(%rip){1to8}, %zmm4, %zmm4
vsubpd {rn-sae}, %zmm6, %zmm15, %zmm8
/*
/*
* If the magnitude of the input is <= 2^-20, then
* just pass through the input, since no reduction will be needed and
* the main path will only work accurately if the reduced argument is
@@ -419,24 +418,24 @@ L(AUX_BRANCH):
vporq %zmm1, %zmm0, %zmm4
vsubpd {rn-sae}, %zmm1, %zmm4, %zmm2
/* Now add them up into 2 reasonably aligned pieces */
/* Now add them up into 2 reasonably aligned pieces */
vaddpd {rn-sae}, %zmm2, %zmm7, %zmm13
vsubpd {rn-sae}, %zmm13, %zmm7, %zmm7
vaddpd {rn-sae}, %zmm7, %zmm2, %zmm3
vaddpd {rn-sae}, %zmm8, %zmm3, %zmm0
vpbroadcastq .FLT_29(%rip), %zmm8
/* Grab our final N value as an integer, appropriately masked mod 2^9 */
/* Grab our final N value as an integer, appropriately masked mod 2^9 */
vpandq .FLT_23(%rip){1to8}, %zmm12, %zmm5
/*
/*
* Now multiply those numbers all by 2 pi, reasonably accurately.
* (RHi + RLo) * (pi_lead + pi_trail) ~=
* RHi * pi_lead + (RHi * pi_trail + RLo * pi_lead)
*/
vpbroadcastq .FLT_28(%rip), %zmm12
/* The output is _VRES_Z (high) + _VRES_E (low), and the integer part is _VRES_IND */
/* The output is _VRES_Z (high) + _VRES_E (low), and the integer part is _VRES_IND */
vpmovqd %zmm5, %ymm4
vmulpd {rn-sae}, %zmm12, %zmm13, %zmm6
vmovaps %zmm12, %zmm10
@@ -458,7 +457,7 @@ L(AUX_BRANCH):
vpsrld $31, %ymm5, %ymm1
vpsubd %ymm1, %ymm0, %ymm2
/*
/*
*
* End of large arguments path
*
@@ -477,7 +476,7 @@ L(AUX_BRANCH):
vfmadd213pd {rn-sae}, %zmm13, %zmm12, %zmm15
vblendmpd %zmm15, %zmm8, %zmm8{%k1}
/* Return to main vector processing path */
/* Return to main vector processing path */
jmp L(AUX_BRANCH_RETURN)
# LOE rbx r12 r13 r14 r15 edx zmm1 zmm8 zmm11
END(_ZGVeN8v_tan_skx)
@@ -487,8 +486,7 @@ END(_ZGVeN8v_tan_skx)
#ifdef __svml_dtan_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
typedef struct {
__declspec(align(64)) VUINT32 _dInvPI_uisa[8][2];
__declspec(align(64)) VUINT32 _dPI1_uisa[8][2];
__declspec(align(64)) VUINT32 _dPI2_uisa[8][2];
@@ -507,7 +505,7 @@ typedef unsigned int VUINT32;
__declspec(align(64)) VUINT32 _dPI1[8][2];
__declspec(align(64)) VUINT32 _dPI2[8][2];
__declspec(align(64)) VUINT32 _dPI3[8][2];
} __svml_dtan_data_internal;
} __svml_dtan_data_internal;
#endif
__svml_dtan_data_internal:
/* UISA */
@@ -518,7 +516,7 @@ __svml_dtan_data_internal:
.quad 0x3c61a62633145c06, 0x3c61a62633145c06, 0x3c61a62633145c06, 0x3c61a62633145c06, 0x3c61a62633145c06, 0x3c61a62633145c06, 0x3c61a62633145c06, 0x3c61a62633145c06 /* _dPI2_uisa */
.align 64
.quad 0x391c1cd129024e09, 0x391c1cd129024e09, 0x391c1cd129024e09, 0x391c1cd129024e09, 0x391c1cd129024e09, 0x391c1cd129024e09, 0x391c1cd129024e09, 0x391c1cd129024e09 /* _dPI3_uisa */
/*== Th_tbl_uisa ==*/
/* Th_tbl_uisa */
.align 64
.quad 0x8000000000000000, 0x3fc975f5e0553158, 0x3fda827999fcef32, 0x3fe561b82ab7f990
.quad 0x3ff0000000000000, 0x3ff7f218e25a7461, 0x4003504f333f9de6, 0x40141bfee2424771
@@ -552,14 +550,13 @@ __svml_dtan_data_internal:
.align 64
.quad 0x3B298A2E03707345, 0x3B298A2E03707345, 0x3B298A2E03707345, 0x3B298A2E03707345, 0x3B298A2E03707345, 0x3B298A2E03707345, 0x3B298A2E03707345, 0x3B298A2E03707345 /* _dPI3 */
.align 64
.type __svml_dtan_data_internal,@object
.size __svml_dtan_data_internal,.-__svml_dtan_data_internal
.type __svml_dtan_data_internal, @object
.size __svml_dtan_data_internal, .-__svml_dtan_data_internal
.align 64
#ifdef __svml_dtan_reduction_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
typedef struct {
__declspec(align(64)) VUINT32 _dPtable[2048][3][2];
} __svml_dtan_reduction_data_internal;
#endif
@@ -2614,120 +2611,120 @@ __svml_dtan_reduction_data_internal:
.quad 0x4F758FD7CBE2F67A, 0x0E73EF14A525D4D7, 0xF6BF623F1ABA10AC /* 2046 */
.quad 0x9EEB1FAF97C5ECF4, 0x1CE7DE294A4BA9AF, 0xED7EC47E35742158 /* 2047 */
.align 64
.type __svml_dtan_reduction_data_internal,@object
.size __svml_dtan_reduction_data_internal,.-__svml_dtan_reduction_data_internal
.type __svml_dtan_reduction_data_internal, @object
.size __svml_dtan_reduction_data_internal, .-__svml_dtan_reduction_data_internal
.space 512, 0x00
.align 32
.FLT_32:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .FLT_32,@object
.size .FLT_32,32
.long 0x00000008, 0x00000008, 0x00000008, 0x00000008, 0x00000008, 0x00000008, 0x00000008, 0x00000008
.type .FLT_32, @object
.size .FLT_32, 32
.align 8
.FLT_16:
.long 0x00000000,0x7ff00000
.type .FLT_16,@object
.size .FLT_16,8
.long 0x00000000, 0x7ff00000
.type .FLT_16, @object
.size .FLT_16, 8
.align 8
.FLT_17:
.long 0xffffffff,0x000fffff
.type .FLT_17,@object
.size .FLT_17,8
.long 0xffffffff, 0x000fffff
.type .FLT_17, @object
.size .FLT_17, 8
.align 8
.FLT_18:
.long 0x00000000,0x00100000
.type .FLT_18,@object
.size .FLT_18,8
.long 0x00000000, 0x00100000
.type .FLT_18, @object
.size .FLT_18, 8
.align 8
.FLT_19:
.long 0xffffffff,0x00000000
.type .FLT_19,@object
.size .FLT_19,8
.long 0xffffffff, 0x00000000
.type .FLT_19, @object
.size .FLT_19, 8
.align 8
.FLT_20:
.long 0x00000000,0x80000000
.type .FLT_20,@object
.size .FLT_20,8
.long 0x00000000, 0x80000000
.type .FLT_20, @object
.size .FLT_20, 8
.align 8
.FLT_21:
.long 0x00000000,0x3ff00000
.type .FLT_21,@object
.size .FLT_21,8
.long 0x00000000, 0x3ff00000
.type .FLT_21, @object
.size .FLT_21, 8
.align 8
.FLT_22:
.long 0x00000000,0x42a80000
.type .FLT_22,@object
.size .FLT_22,8
.long 0x00000000, 0x42a80000
.type .FLT_22, @object
.size .FLT_22, 8
.align 8
.FLT_23:
.long 0x000001ff,0x00000000
.type .FLT_23,@object
.size .FLT_23,8
.long 0x000001ff, 0x00000000
.type .FLT_23, @object
.size .FLT_23, 8
.align 8
.FLT_24:
.long 0x00000000,0x39700000
.type .FLT_24,@object
.size .FLT_24,8
.long 0x00000000, 0x39700000
.type .FLT_24, @object
.size .FLT_24, 8
.align 8
.FLT_25:
.long 0x00ffffff,0x00000000
.type .FLT_25,@object
.size .FLT_25,8
.long 0x00ffffff, 0x00000000
.type .FLT_25, @object
.size .FLT_25, 8
.align 8
.FLT_26:
.long 0x00000000,0x3cb00000
.type .FLT_26,@object
.size .FLT_26,8
.long 0x00000000, 0x3cb00000
.type .FLT_26, @object
.size .FLT_26, 8
.align 8
.FLT_27:
.long 0x00000fff,0x00000000
.type .FLT_27,@object
.size .FLT_27,8
.long 0x00000fff, 0x00000000
.type .FLT_27, @object
.size .FLT_27, 8
.align 8
.FLT_28:
.long 0x54442d18,0x401921fb
.type .FLT_28,@object
.size .FLT_28,8
.long 0x54442d18, 0x401921fb
.type .FLT_28, @object
.size .FLT_28, 8
.align 8
.FLT_29:
.long 0x33145c07,0x3cb1a626
.type .FLT_29,@object
.size .FLT_29,8
.long 0x33145c07, 0x3cb1a626
.type .FLT_29, @object
.size .FLT_29, 8
.align 8
.FLT_30:
.long 0xffffffff,0x7fffffff
.type .FLT_30,@object
.size .FLT_30,8
.long 0xffffffff, 0x7fffffff
.type .FLT_30, @object
.size .FLT_30, 8
.align 8
.FLT_31:
.long 0x00000000,0x3eb00000
.type .FLT_31,@object
.size .FLT_31,8
.long 0x00000000, 0x3eb00000
.type .FLT_31, @object
.size .FLT_31, 8
.align 8
.FLT_33:
.long 0x54442d18,0x3f8921fb
.type .FLT_33,@object
.size .FLT_33,8
.long 0x54442d18, 0x3f8921fb
.type .FLT_33, @object
.size .FLT_33, 8
.align 8
.FLT_34:
.long 0x33145c07,0x3c21a626
.type .FLT_34,@object
.size .FLT_34,8
.long 0x33145c07, 0x3c21a626
.type .FLT_34, @object
.size .FLT_34, 8