x86_64: Fix svml_s_atanf16_core_avx512.S code formatting

This commit contains the following formatting changes:

1. Instructions preceded by a tab.
2. Instructions less than 8 characters in length have a tab
   between them and the first operand.
3. Instructions greater than 7 characters in length have a
   space between them and the first operand.
4. Tabs after `#define`d names and their value.
5. 8 spaces at the beginning of a line replaced by a tab.
6. Indent comments with code.
7. Remove redundant .text section.
8. 1 space between line content and line comment.
9. Space after all commas.

Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
This commit is contained in:
Sunil K Pandey 2022-03-07 10:47:09 -08:00
parent f285711506
commit 67a8f9b86f

View File

@ -30,145 +30,144 @@
/* Offsets for data table __svml_satan_data_internal_avx512
*/
#define AbsMask 0
#define Shifter 64
#define MaxThreshold 128
#define MOne 192
#define One 256
#define LargeX 320
#define Zero 384
#define Tbl_H 448
#define Pi2 576
#define coeff_1 640
#define coeff_2 704
#define coeff_3 768
#define AbsMask 0
#define Shifter 64
#define MaxThreshold 128
#define MOne 192
#define One 256
#define LargeX 320
#define Zero 384
#define Tbl_H 448
#define Pi2 576
#define coeff_1 640
#define coeff_2 704
#define coeff_3 768
#include <sysdep.h>
.text
.section .text.exex512,"ax",@progbits
.section .text.exex512, "ax", @progbits
ENTRY(_ZGVeN16v_atanf_skx)
vandps __svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7
vmovups MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3
vmovups One+__svml_satan_data_internal_avx512(%rip), %zmm8
vandps __svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7
vmovups MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3
vmovups One+__svml_satan_data_internal_avx512(%rip), %zmm8
/* round to 2 bits after binary point */
vreduceps $40, {sae}, %zmm7, %zmm5
/* round to 2 bits after binary point */
vreduceps $40, {sae}, %zmm7, %zmm5
/* saturate X range */
vmovups LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6
vmovups Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2
vcmpps $29, {sae}, %zmm3, %zmm7, %k1
/* saturate X range */
vmovups LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6
vmovups Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2
vcmpps $29, {sae}, %zmm3, %zmm7, %k1
/* table lookup sequence */
vmovups Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3
vsubps {rn-sae}, %zmm5, %zmm7, %zmm4
vaddps {rn-sae}, %zmm2, %zmm7, %zmm1
vxorps %zmm0, %zmm7, %zmm0
vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
vmovups coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4
/* table lookup sequence */
vmovups Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3
vsubps {rn-sae}, %zmm5, %zmm7, %zmm4
vaddps {rn-sae}, %zmm2, %zmm7, %zmm1
vxorps %zmm0, %zmm7, %zmm0
vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
vmovups coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4
/* if|X|>=MaxThreshold, set DiffX=-1 */
vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
vmovups coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5
/* if|X|>=MaxThreshold, set DiffX=-1 */
vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
vmovups coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5
/* if|X|>=MaxThreshold, set Y=X */
vminps {sae}, %zmm7, %zmm6, %zmm8{%k1}
/* if|X|>=MaxThreshold, set Y=X */
vminps {sae}, %zmm7, %zmm6, %zmm8{%k1}
/* R+Rl = DiffX/Y */
vgetmantps $0, {sae}, %zmm9, %zmm12
vgetexpps {sae}, %zmm9, %zmm10
vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3
vgetmantps $0, {sae}, %zmm8, %zmm15
vgetexpps {sae}, %zmm8, %zmm11
vmovups coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1
/* R+Rl = DiffX/Y */
vgetmantps $0, {sae}, %zmm9, %zmm12
vgetexpps {sae}, %zmm9, %zmm10
vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3
vgetmantps $0, {sae}, %zmm8, %zmm15
vgetexpps {sae}, %zmm8, %zmm11
vmovups coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1
/* set table value to Pi/2 for large X */
vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
vrcp14ps %zmm15, %zmm13
vsubps {rn-sae}, %zmm11, %zmm10, %zmm2
vmulps {rn-sae}, %zmm13, %zmm12, %zmm14
vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15
vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15
vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7
/* set table value to Pi/2 for large X */
vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
vrcp14ps %zmm15, %zmm13
vsubps {rn-sae}, %zmm11, %zmm10, %zmm2
vmulps {rn-sae}, %zmm13, %zmm12, %zmm14
vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15
vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15
vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7
/* polynomial evaluation */
vmulps {rn-sae}, %zmm7, %zmm7, %zmm8
vmulps {rn-sae}, %zmm7, %zmm8, %zmm6
vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4
vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8
vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8
vaddps {rn-sae}, %zmm9, %zmm8, %zmm10
vxorps %zmm0, %zmm10, %zmm0
ret
/* polynomial evaluation */
vmulps {rn-sae}, %zmm7, %zmm7, %zmm8
vmulps {rn-sae}, %zmm7, %zmm8, %zmm6
vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4
vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8
vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8
vaddps {rn-sae}, %zmm9, %zmm8, %zmm10
vxorps %zmm0, %zmm10, %zmm0
ret
END(_ZGVeN16v_atanf_skx)
.section .rodata, "a"
.align 64
.section .rodata, "a"
.align 64
#ifdef __svml_satan_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(64)) VUINT32 AbsMask[16][1];
__declspec(align(64)) VUINT32 Shifter[16][1];
__declspec(align(64)) VUINT32 MaxThreshold[16][1];
__declspec(align(64)) VUINT32 MOne[16][1];
__declspec(align(64)) VUINT32 One[16][1];
__declspec(align(64)) VUINT32 LargeX[16][1];
__declspec(align(64)) VUINT32 Zero[16][1];
__declspec(align(64)) VUINT32 Tbl_H[32][1];
__declspec(align(64)) VUINT32 Pi2[16][1];
__declspec(align(64)) VUINT32 coeff[3][16][1];
} __svml_satan_data_internal_avx512;
__declspec(align(64)) VUINT32 AbsMask[16][1];
__declspec(align(64)) VUINT32 Shifter[16][1];
__declspec(align(64)) VUINT32 MaxThreshold[16][1];
__declspec(align(64)) VUINT32 MOne[16][1];
__declspec(align(64)) VUINT32 One[16][1];
__declspec(align(64)) VUINT32 LargeX[16][1];
__declspec(align(64)) VUINT32 Zero[16][1];
__declspec(align(64)) VUINT32 Tbl_H[32][1];
__declspec(align(64)) VUINT32 Pi2[16][1];
__declspec(align(64)) VUINT32 coeff[3][16][1];
} __svml_satan_data_internal_avx512;
#endif
__svml_satan_data_internal_avx512:
/*== AbsMask ==*/
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/*== Shifter ==*/
.align 64
.long 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
/*== MaxThreshold ==*/
.align 64
.long 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
/*== MOne ==*/
.align 64
.long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
/*== One ==*/
.align 64
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/*== LargeX ==*/
.align 64
.long 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
/*== Zero ==*/
.align 64
.long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
/*== Tbl_H ==*/
.align 64
.long 0x00000000, 0x3e7adbb0
.long 0x3eed6338, 0x3f24bc7d
.long 0x3f490fdb, 0x3f6563e3
.long 0x3f7b985f, 0x3f869c79
.long 0x3f8db70d, 0x3f93877b
.long 0x3f985b6c, 0x3f9c6b53
.long 0x3f9fe0bb, 0x3fa2daa4
.long 0x3fa57088, 0x3fa7b46f
.long 0x3fa9b465, 0x3fab7b7a
.long 0x3fad1283, 0x3fae809e
.long 0x3fafcb99, 0x3fb0f836
.long 0x3fb20a6a, 0x3fb30581
.long 0x3fb3ec43, 0x3fb4c10a
.long 0x3fb585d7, 0x3fb63c64
.long 0x3fb6e62c, 0x3fb78478
.long 0x3fb81868, 0x3fb8a2f5
/*== Pi2 ==*/
.align 64
.long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
/*== coeff3 ==*/
.align 64
.long 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
.long 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
.long 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
.align 64
.type __svml_satan_data_internal_avx512,@object
.size __svml_satan_data_internal_avx512,.-__svml_satan_data_internal_avx512
/* AbsMask */
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/* Shifter */
.align 64
.long 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
/* MaxThreshold */
.align 64
.long 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
/* MOne */
.align 64
.long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
/* One */
.align 64
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/* LargeX */
.align 64
.long 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
/* Zero */
.align 64
.long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
/* Tbl_H */
.align 64
.long 0x00000000, 0x3e7adbb0
.long 0x3eed6338, 0x3f24bc7d
.long 0x3f490fdb, 0x3f6563e3
.long 0x3f7b985f, 0x3f869c79
.long 0x3f8db70d, 0x3f93877b
.long 0x3f985b6c, 0x3f9c6b53
.long 0x3f9fe0bb, 0x3fa2daa4
.long 0x3fa57088, 0x3fa7b46f
.long 0x3fa9b465, 0x3fab7b7a
.long 0x3fad1283, 0x3fae809e
.long 0x3fafcb99, 0x3fb0f836
.long 0x3fb20a6a, 0x3fb30581
.long 0x3fb3ec43, 0x3fb4c10a
.long 0x3fb585d7, 0x3fb63c64
.long 0x3fb6e62c, 0x3fb78478
.long 0x3fb81868, 0x3fb8a2f5
/* Pi2 */
.align 64
.long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
/* coeff3 */
.align 64
.long 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
.long 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
.long 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
.align 64
.type __svml_satan_data_internal_avx512, @object
.size __svml_satan_data_internal_avx512, .-__svml_satan_data_internal_avx512