x86_64: Fix svml_d_tan8_core_avx512.S code formatting

This commit contains the following formatting changes, illustrated by a
short before/after excerpt after the list:

1. Instructions are preceded by a tab.
2. Instructions shorter than 8 characters have a tab
   between the mnemonic and the first operand.
3. Instructions 8 characters or longer have a single
   space between the mnemonic and the first operand.
4. Tabs between `#define`d names and their values.
5. Eight spaces at the beginning of a line replaced by a tab.
6. Indent comments with the code.
7. Remove the redundant .text section.
8. One space between line content and a line comment.
9. Space after all commas.
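
For a quick sense of how these rules read in practice, here is a sketch of the
change around the scalar tan call (one of the hunks below). The diff view
appears to hide pure-whitespace changes, so the exact before/after indentation
is an assumption based on rules 1, 2, 5 and 6; only the added spaces after the
commas are directly visible in the hunks:

Before (8-space indent, padded mnemonic, comment at column 0, no space after
the inner commas):

        movsd     %xmm0, 128(%rsp,%r14,8)
/* Process special inputs in loop */

After (tab indent, tab after the short mnemonic `movsd`, comment aligned with
the code, a space after every comma):

	movsd	%xmm0, 128(%rsp, %r14, 8)
	/* Process special inputs in loop */

The same comma-spacing change shows up in the vgatherdpd operands and the
.long data lines further down.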

Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
Author: Sunil K Pandey
Date:   2022-03-07 10:47:15 -08:00
Parent: 7425f0c1e5
Commit: 8589dee1f2

@@ -55,8 +55,7 @@
#include <sysdep.h>
.text
.section .text.evex512,"ax",@progbits
.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN8v_tan_skx)
pushq %rbp
cfi_def_cfa_offset(16)
@@ -67,10 +66,10 @@ ENTRY(_ZGVeN8v_tan_skx)
subq $192, %rsp
xorl %edx, %edx
/* Large values check */
/* Large values check */
vmovups _dReductionRangeVal+__svml_dtan_data_internal(%rip), %zmm1
/*
/*
*
* Main path
*
@@ -85,7 +84,7 @@ ENTRY(_ZGVeN8v_tan_skx)
vcmppd $22, {sae}, %zmm1, %zmm0, %k1
vmovups __svml_dtan_data_internal(%rip), %zmm1
/*
/*
*
* End of main path
*/
@@ -97,11 +96,11 @@ ENTRY(_ZGVeN8v_tan_skx)
vfnmadd231pd {rn-sae}, %zmm8, %zmm3, %zmm5
vfnmadd213pd {rn-sae}, %zmm5, %zmm4, %zmm8
/* Go to auxilary branch */
/* Go to auxilary branch */
jne L(AUX_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1 zmm8 zmm11 k1
/* Return from auxilary branch
/* Return from auxilary branch
* for out of main path inputs
*/
@@ -121,40 +120,40 @@ L(AUX_BRANCH_RETURN):
vmulpd {rn-sae}, %zmm8, %zmm5, %zmm7
vfmadd213pd {rn-sae}, %zmm8, %zmm6, %zmm7
/*
/*
* Computer Denominator:
* dDenominator - dDlow ~= 1-(dTh+dTl)*(dP+dPlow)
*/
vmovups _dOne_uisa+__svml_dtan_data_internal(%rip), %zmm8
/*
/*
* Compute Numerator:
* dNumerator + dNlow ~= dTh+dTl+dP+dPlow
*/
vaddpd {rn-sae}, %zmm0, %zmm7, %zmm9
vfnmadd213pd {rn-sae}, %zmm8, %zmm7, %zmm0
/*
/*
* Now computes (dNumerator + dNlow)/(dDenominator - dDlow)
* Choose NR iteration instead of hardware division
*/
vrcp14pd %zmm0, %zmm10
/* One NR iteration to refine dRcp */
/* One NR iteration to refine dRcp */
vfnmadd231pd {rn-sae}, %zmm10, %zmm0, %zmm8
vfmadd213pd {rn-sae}, %zmm10, %zmm8, %zmm10
vmulpd {rn-sae}, %zmm9, %zmm10, %zmm12
/* One NR iteration to refine dQuotient */
/* One NR iteration to refine dQuotient */
vfmsub213pd {rn-sae}, %zmm9, %zmm12, %zmm0
vfnmadd213pd {rn-sae}, %zmm12, %zmm10, %zmm0
testl %edx, %edx
/* Go to special inputs processing branch */
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
/* Restore registers
/* Restore registers
* and exit the function
*/
@@ -167,7 +166,7 @@ L(EXIT):
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
/* Branch to process
* special inputs
*/
@@ -193,18 +192,18 @@ L(SPECIAL_VALUES_BRANCH):
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
/* Special inputs
* processing loop
*/
@@ -212,7 +211,7 @@ L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $8, %r12d
/* Check bits in range mask */
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
@@ -224,7 +223,7 @@ L(SPECIAL_VALUES_LOOP):
cfi_restore(14)
vmovups 128(%rsp), %zmm0
/* Go to exit */
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
@@ -234,33 +233,33 @@ L(SPECIAL_VALUES_LOOP):
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 zmm0
/* Scalar math fucntion call
/* Scalar math fucntion call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movsd 64(%rsp,%r14,8), %xmm0
movsd 64(%rsp, %r14, 8), %xmm0
call tan@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movsd %xmm0, 128(%rsp,%r14,8)
movsd %xmm0, 128(%rsp, %r14, 8)
/* Process special inputs in loop */
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
cfi_restore(12)
cfi_restore(13)
cfi_restore(14)
# LOE rbx r15 r12d r13d
/* Auxilary branch
/* Auxilary branch
* for out of main path inputs
*/
L(AUX_BRANCH):
vmovups _dRangeVal+__svml_dtan_data_internal(%rip), %zmm14
/*
/*
* Get the (2^a / 2pi) mod 1 values from the table.
* Because doesn't have L-type gather, we need a trivial cast
*/
@@ -280,7 +279,7 @@ L(AUX_BRANCH):
vpandnq %zmm6, %zmm6, %zmm5{%k2}
vcmppd $3, {sae}, %zmm5, %zmm5, %k0
/*
/*
* Break the P_xxx and m into 32-bit chunks ready for
* the long multiplication via 32x32->64 multiplications
*/
@@ -292,14 +291,14 @@ L(AUX_BRANCH):
vpxord %zmm2, %zmm2, %zmm2
vpxord %zmm1, %zmm1, %zmm1
vpxord %zmm8, %zmm8, %zmm8
vgatherdpd (%rax,%ymm3), %zmm2{%k3}
vgatherdpd 8(%rax,%ymm3), %zmm1{%k4}
vgatherdpd 16(%rax,%ymm3), %zmm8{%k5}
vgatherdpd (%rax, %ymm3), %zmm2{%k3}
vgatherdpd 8(%rax, %ymm3), %zmm1{%k4}
vgatherdpd 16(%rax, %ymm3), %zmm8{%k5}
vpsrlq $32, %zmm2, %zmm5
vpsrlq $32, %zmm1, %zmm0
vpsrlq $32, %zmm8, %zmm13
/*
/*
* Also get the significand as an integer
* NB: adding in the integer bit is wrong for denorms!
* To make this work for denorms we should do something slightly different
@@ -313,7 +312,7 @@ L(AUX_BRANCH):
vpandq %zmm6, %zmm8, %zmm15
vpandq %zmm6, %zmm14, %zmm14
/* Now do the big multiplication and carry propagation */
/* Now do the big multiplication and carry propagation */
vpmullq %zmm10, %zmm7, %zmm4
vpmullq %zmm12, %zmm7, %zmm2
vpmullq %zmm13, %zmm7, %zmm1
@@ -340,7 +339,7 @@ L(AUX_BRANCH):
vpaddq %zmm2, %zmm10, %zmm1
vpaddq %zmm1, %zmm0, %zmm8
/*
/*
* Now round at the 2^-9 bit position for reduction mod pi/2^8
* instead of the original 2pi (but still with the same 2pi scaling).
* Use a shifter of 2^43 + 2^42.
@@ -359,7 +358,7 @@ L(AUX_BRANCH):
vpsrlq $32, %zmm15, %zmm12
vpaddq %zmm13, %zmm12, %zmm5
/* Assemble reduced argument from the pieces */
/* Assemble reduced argument from the pieces */
vpandq %zmm6, %zmm14, %zmm10
vpandq %zmm6, %zmm15, %zmm7
vpsllq $32, %zmm5, %zmm6
@@ -368,7 +367,7 @@ L(AUX_BRANCH):
vpaddq %zmm10, %zmm5, %zmm10
vpsrlq $12, %zmm4, %zmm6
/*
/*
* We want to incorporate the original sign now too.
* Do it here for convenience in getting the right N value,
* though we could wait right to the end if we were prepared
@@ -379,7 +378,7 @@ L(AUX_BRANCH):
vpandq .FLT_25(%rip){1to8}, %zmm10, %zmm13
vpsllq $28, %zmm13, %zmm14
/*
/*
* Create floating-point high part, implicitly adding integer bit 1
* Incorporate overall sign at this stage too.
*/
@@ -389,7 +388,7 @@ L(AUX_BRANCH):
vsubpd {rn-sae}, %zmm1, %zmm12, %zmm3
vsubpd {rn-sae}, %zmm3, %zmm2, %zmm7
/*
/*
* Create floating-point low and medium parts, respectively
* lo_23, ... lo_0, 0, ..., 0
* hi_11, ... hi_0, lo_63, ..., lo_24
@@ -402,7 +401,7 @@ L(AUX_BRANCH):
vpandq .FLT_27(%rip){1to8}, %zmm4, %zmm4
vsubpd {rn-sae}, %zmm6, %zmm15, %zmm8
/*
/*
* If the magnitude of the input is <= 2^-20, then
* just pass through the input, since no reduction will be needed and
* the main path will only work accurately if the reduced argument is
@@ -419,24 +418,24 @@ L(AUX_BRANCH):
vporq %zmm1, %zmm0, %zmm4
vsubpd {rn-sae}, %zmm1, %zmm4, %zmm2
/* Now add them up into 2 reasonably aligned pieces */
/* Now add them up into 2 reasonably aligned pieces */
vaddpd {rn-sae}, %zmm2, %zmm7, %zmm13
vsubpd {rn-sae}, %zmm13, %zmm7, %zmm7
vaddpd {rn-sae}, %zmm7, %zmm2, %zmm3
vaddpd {rn-sae}, %zmm8, %zmm3, %zmm0
vpbroadcastq .FLT_29(%rip), %zmm8
/* Grab our final N value as an integer, appropriately masked mod 2^9 */
/* Grab our final N value as an integer, appropriately masked mod 2^9 */
vpandq .FLT_23(%rip){1to8}, %zmm12, %zmm5
/*
/*
* Now multiply those numbers all by 2 pi, reasonably accurately.
* (RHi + RLo) * (pi_lead + pi_trail) ~=
* RHi * pi_lead + (RHi * pi_trail + RLo * pi_lead)
*/
vpbroadcastq .FLT_28(%rip), %zmm12
/* The output is _VRES_Z (high) + _VRES_E (low), and the integer part is _VRES_IND */
/* The output is _VRES_Z (high) + _VRES_E (low), and the integer part is _VRES_IND */
vpmovqd %zmm5, %ymm4
vmulpd {rn-sae}, %zmm12, %zmm13, %zmm6
vmovaps %zmm12, %zmm10
@@ -458,7 +457,7 @@ L(AUX_BRANCH):
vpsrld $31, %ymm5, %ymm1
vpsubd %ymm1, %ymm0, %ymm2
/*
/*
*
* End of large arguments path
*
@@ -477,7 +476,7 @@ L(AUX_BRANCH):
vfmadd213pd {rn-sae}, %zmm13, %zmm12, %zmm15
vblendmpd %zmm15, %zmm8, %zmm8{%k1}
/* Return to main vector processing path */
/* Return to main vector processing path */
jmp L(AUX_BRANCH_RETURN)
# LOE rbx r12 r13 r14 r15 edx zmm1 zmm8 zmm11
END(_ZGVeN8v_tan_skx)
@@ -487,8 +486,7 @@ END(_ZGVeN8v_tan_skx)
#ifdef __svml_dtan_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
typedef struct {
__declspec(align(64)) VUINT32 _dInvPI_uisa[8][2];
__declspec(align(64)) VUINT32 _dPI1_uisa[8][2];
__declspec(align(64)) VUINT32 _dPI2_uisa[8][2];
@@ -507,7 +505,7 @@ typedef unsigned int VUINT32;
__declspec(align(64)) VUINT32 _dPI1[8][2];
__declspec(align(64)) VUINT32 _dPI2[8][2];
__declspec(align(64)) VUINT32 _dPI3[8][2];
} __svml_dtan_data_internal;
} __svml_dtan_data_internal;
#endif
__svml_dtan_data_internal:
/* UISA */
@@ -518,7 +516,7 @@ __svml_dtan_data_internal:
.quad 0x3c61a62633145c06, 0x3c61a62633145c06, 0x3c61a62633145c06, 0x3c61a62633145c06, 0x3c61a62633145c06, 0x3c61a62633145c06, 0x3c61a62633145c06, 0x3c61a62633145c06 /* _dPI2_uisa */
.align 64
.quad 0x391c1cd129024e09, 0x391c1cd129024e09, 0x391c1cd129024e09, 0x391c1cd129024e09, 0x391c1cd129024e09, 0x391c1cd129024e09, 0x391c1cd129024e09, 0x391c1cd129024e09 /* _dPI3_uisa */
/*== Th_tbl_uisa ==*/
/* Th_tbl_uisa */
.align 64
.quad 0x8000000000000000, 0x3fc975f5e0553158, 0x3fda827999fcef32, 0x3fe561b82ab7f990
.quad 0x3ff0000000000000, 0x3ff7f218e25a7461, 0x4003504f333f9de6, 0x40141bfee2424771
@@ -552,14 +550,13 @@ __svml_dtan_data_internal:
.align 64
.quad 0x3B298A2E03707345, 0x3B298A2E03707345, 0x3B298A2E03707345, 0x3B298A2E03707345, 0x3B298A2E03707345, 0x3B298A2E03707345, 0x3B298A2E03707345, 0x3B298A2E03707345 /* _dPI3 */
.align 64
.type __svml_dtan_data_internal,@object
.size __svml_dtan_data_internal,.-__svml_dtan_data_internal
.type __svml_dtan_data_internal, @object
.size __svml_dtan_data_internal, .-__svml_dtan_data_internal
.align 64
#ifdef __svml_dtan_reduction_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
typedef struct {
__declspec(align(64)) VUINT32 _dPtable[2048][3][2];
} __svml_dtan_reduction_data_internal;
#endif
@@ -2614,120 +2611,120 @@ __svml_dtan_reduction_data_internal:
.quad 0x4F758FD7CBE2F67A, 0x0E73EF14A525D4D7, 0xF6BF623F1ABA10AC /* 2046 */
.quad 0x9EEB1FAF97C5ECF4, 0x1CE7DE294A4BA9AF, 0xED7EC47E35742158 /* 2047 */
.align 64
.type __svml_dtan_reduction_data_internal,@object
.size __svml_dtan_reduction_data_internal,.-__svml_dtan_reduction_data_internal
.type __svml_dtan_reduction_data_internal, @object
.size __svml_dtan_reduction_data_internal, .-__svml_dtan_reduction_data_internal
.space 512, 0x00
.align 32
.FLT_32:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .FLT_32,@object
.size .FLT_32,32
.long 0x00000008, 0x00000008, 0x00000008, 0x00000008, 0x00000008, 0x00000008, 0x00000008, 0x00000008
.type .FLT_32, @object
.size .FLT_32, 32
.align 8
.FLT_16:
.long 0x00000000,0x7ff00000
.type .FLT_16,@object
.size .FLT_16,8
.long 0x00000000, 0x7ff00000
.type .FLT_16, @object
.size .FLT_16, 8
.align 8
.FLT_17:
.long 0xffffffff,0x000fffff
.type .FLT_17,@object
.size .FLT_17,8
.long 0xffffffff, 0x000fffff
.type .FLT_17, @object
.size .FLT_17, 8
.align 8
.FLT_18:
.long 0x00000000,0x00100000
.type .FLT_18,@object
.size .FLT_18,8
.long 0x00000000, 0x00100000
.type .FLT_18, @object
.size .FLT_18, 8
.align 8
.FLT_19:
.long 0xffffffff,0x00000000
.type .FLT_19,@object
.size .FLT_19,8
.long 0xffffffff, 0x00000000
.type .FLT_19, @object
.size .FLT_19, 8
.align 8
.FLT_20:
.long 0x00000000,0x80000000
.type .FLT_20,@object
.size .FLT_20,8
.long 0x00000000, 0x80000000
.type .FLT_20, @object
.size .FLT_20, 8
.align 8
.FLT_21:
.long 0x00000000,0x3ff00000
.type .FLT_21,@object
.size .FLT_21,8
.long 0x00000000, 0x3ff00000
.type .FLT_21, @object
.size .FLT_21, 8
.align 8
.FLT_22:
.long 0x00000000,0x42a80000
.type .FLT_22,@object
.size .FLT_22,8
.long 0x00000000, 0x42a80000
.type .FLT_22, @object
.size .FLT_22, 8
.align 8
.FLT_23:
.long 0x000001ff,0x00000000
.type .FLT_23,@object
.size .FLT_23,8
.long 0x000001ff, 0x00000000
.type .FLT_23, @object
.size .FLT_23, 8
.align 8
.FLT_24:
.long 0x00000000,0x39700000
.type .FLT_24,@object
.size .FLT_24,8
.long 0x00000000, 0x39700000
.type .FLT_24, @object
.size .FLT_24, 8
.align 8
.FLT_25:
.long 0x00ffffff,0x00000000
.type .FLT_25,@object
.size .FLT_25,8
.long 0x00ffffff, 0x00000000
.type .FLT_25, @object
.size .FLT_25, 8
.align 8
.FLT_26:
.long 0x00000000,0x3cb00000
.type .FLT_26,@object
.size .FLT_26,8
.long 0x00000000, 0x3cb00000
.type .FLT_26, @object
.size .FLT_26, 8
.align 8
.FLT_27:
.long 0x00000fff,0x00000000
.type .FLT_27,@object
.size .FLT_27,8
.long 0x00000fff, 0x00000000
.type .FLT_27, @object
.size .FLT_27, 8
.align 8
.FLT_28:
.long 0x54442d18,0x401921fb
.type .FLT_28,@object
.size .FLT_28,8
.long 0x54442d18, 0x401921fb
.type .FLT_28, @object
.size .FLT_28, 8
.align 8
.FLT_29:
.long 0x33145c07,0x3cb1a626
.type .FLT_29,@object
.size .FLT_29,8
.long 0x33145c07, 0x3cb1a626
.type .FLT_29, @object
.size .FLT_29, 8
.align 8
.FLT_30:
.long 0xffffffff,0x7fffffff
.type .FLT_30,@object
.size .FLT_30,8
.long 0xffffffff, 0x7fffffff
.type .FLT_30, @object
.size .FLT_30, 8
.align 8
.FLT_31:
.long 0x00000000,0x3eb00000
.type .FLT_31,@object
.size .FLT_31,8
.long 0x00000000, 0x3eb00000
.type .FLT_31, @object
.size .FLT_31, 8
.align 8
.FLT_33:
.long 0x54442d18,0x3f8921fb
.type .FLT_33,@object
.size .FLT_33,8
.long 0x54442d18, 0x3f8921fb
.type .FLT_33, @object
.size .FLT_33, 8
.align 8
.FLT_34:
.long 0x33145c07,0x3c21a626
.type .FLT_34,@object
.size .FLT_34,8
.long 0x33145c07, 0x3c21a626
.type .FLT_34, @object
.size .FLT_34, 8