mirror of
https://sourceware.org/git/glibc.git
synced 2024-12-26 04:31:03 +00:00
x86_64: Fix svml_d_tan4_core_avx2.S code formatting
This commit contains the following formatting changes: 1. Instructions are preceded by a tab. 2. Instructions less than 8 characters in length have a tab between them and the first operand. 3. Instructions greater than 7 characters in length have a space between them and the first operand. 4. Tabs after `#define`d names and their values. 5. 8 spaces at the beginning of a line replaced by a tab. 6. Indent comments with code. 7. Remove redundant .text section. 8. 1 space between line content and line comment. 9. Space after all commas. Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
This commit is contained in:
parent
7f852d2592
commit
7425f0c1e5
@ -69,8 +69,7 @@
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.text
|
||||
.section .text.avx2,"ax",@progbits
|
||||
.section .text.avx2, "ax", @progbits
|
||||
ENTRY(_ZGVdN4v_tan_avx2)
|
||||
pushq %rbp
|
||||
cfi_def_cfa_offset(16)
|
||||
@ -80,7 +79,7 @@ ENTRY(_ZGVdN4v_tan_avx2)
|
||||
andq $-32, %rsp
|
||||
subq $96, %rsp
|
||||
|
||||
/* Legacy Code */
|
||||
/* Legacy Code */
|
||||
xorl %r9d, %r9d
|
||||
vmovupd _dAbsMask+__svml_dtan_data_internal(%rip), %ymm14
|
||||
vmovupd _dRShift+__svml_dtan_data_internal(%rip), %ymm2
|
||||
@ -89,20 +88,20 @@ ENTRY(_ZGVdN4v_tan_avx2)
|
||||
vmovupd _dP3+__svml_dtan_data_internal(%rip), %ymm9
|
||||
vmovapd %ymm0, %ymm15
|
||||
|
||||
/* b) Remove sign using AND 0x7fffffffffffffff operation */
|
||||
/* b) Remove sign using AND 0x7fffffffffffffff operation */
|
||||
vandpd %ymm14, %ymm15, %ymm1
|
||||
|
||||
/*
|
||||
/*
|
||||
* c) Getting octant Y by 2/Pi multiplication
|
||||
* d) Add "Right Shifter" (0x4330000000000000) value
|
||||
*/
|
||||
vfmadd213pd %ymm2, %ymm1, %ymm6
|
||||
|
||||
/* g) Subtract "Right Shifter" (0x4330000000000000) value */
|
||||
/* g) Subtract "Right Shifter" (0x4330000000000000) value */
|
||||
vsubpd %ymm2, %ymm6, %ymm11
|
||||
vpsllq $62, %ymm6, %ymm7
|
||||
|
||||
/*
|
||||
/*
|
||||
* e) Treat obtained value as integer for destination sign setting.
|
||||
* Shift first bit of this value to the last (sign) position (S << 63)
|
||||
* f) Change destination sign if source sign is negative
|
||||
@ -110,16 +109,16 @@ ENTRY(_ZGVdN4v_tan_avx2)
|
||||
*/
|
||||
vpsllq $63, %ymm6, %ymm4
|
||||
|
||||
/* X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; */
|
||||
/* X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; */
|
||||
vfnmadd213pd %ymm1, %ymm11, %ymm3
|
||||
vfnmadd231pd _dPI2_FMA+__svml_dtan_data_internal(%rip), %ymm11, %ymm3
|
||||
vfnmadd132pd _dPI3_FMA+__svml_dtan_data_internal(%rip), %ymm3, %ymm11
|
||||
vmovupd _dQ3+__svml_dtan_data_internal(%rip), %ymm3
|
||||
|
||||
/* a) Calculate X^2 = X * X */
|
||||
/* a) Calculate X^2 = X * X */
|
||||
vmulpd %ymm11, %ymm11, %ymm10
|
||||
|
||||
/*
|
||||
/*
|
||||
* b) Calculate 2 polynomials:
|
||||
* P = X * (P0 + X^2 * (P1 + x^2 * (P2 + x^2 * (P3))));
|
||||
* Q = Q0 + X^2 * (Q1 + x^2 * (Q2 + x^2 * (Q3)));
|
||||
@ -133,15 +132,15 @@ ENTRY(_ZGVdN4v_tan_avx2)
|
||||
vfmadd213pd _dQ0+__svml_dtan_data_internal(%rip), %ymm10, %ymm3
|
||||
vfmadd213pd %ymm11, %ymm11, %ymm9
|
||||
|
||||
/* 1) Range reduction to [-Pi/4; +Pi/4] interval */
|
||||
/* 1) Range reduction to [-Pi/4; +Pi/4] interval */
|
||||
vxorpd %ymm8, %ymm8, %ymm8
|
||||
vcmpneqpd %ymm8, %ymm7, %ymm2
|
||||
|
||||
/* a) Grab sign from source argument and save it. */
|
||||
/* a) Grab sign from source argument and save it. */
|
||||
vandnpd %ymm15, %ymm14, %ymm5
|
||||
vxorpd %ymm5, %ymm4, %ymm0
|
||||
|
||||
/*
|
||||
/*
|
||||
* c) Swap P and Q if first bit of obtained value after
|
||||
* Right Shifting is set to 1. Using And, Andnot & Or operations.
|
||||
*/
|
||||
@ -152,14 +151,14 @@ ENTRY(_ZGVdN4v_tan_avx2)
|
||||
vorpd %ymm13, %ymm12, %ymm6
|
||||
vorpd %ymm5, %ymm4, %ymm7
|
||||
|
||||
/* d) Divide R = P / Q; */
|
||||
/* d) Divide R = P / Q; */
|
||||
vdivpd %ymm7, %ymm6, %ymm8
|
||||
|
||||
/* Large values check */
|
||||
/* Large values check */
|
||||
vcmpnle_uqpd _dReductionRangeVal+__svml_dtan_data_internal(%rip), %ymm1, %ymm14
|
||||
vmovmskpd %ymm14, %eax
|
||||
|
||||
/*
|
||||
/*
|
||||
* 3) Destination sign setting
|
||||
* a) Set shifted destination sign using XOR operation:
|
||||
* R = XOR( R, S );
|
||||
@ -167,22 +166,22 @@ ENTRY(_ZGVdN4v_tan_avx2)
|
||||
vxorpd %ymm0, %ymm8, %ymm0
|
||||
testl %eax, %eax
|
||||
|
||||
/* Go to auxiliary branch */
|
||||
/* Go to auxiliary branch */
|
||||
jne L(AUX_BRANCH)
|
||||
# LOE rbx r12 r13 r14 r15 r9d ymm0 ymm1 ymm14 ymm15
|
||||
|
||||
/* Return from auxiliary branch
|
||||
/* Return from auxiliary branch
|
||||
* for out of main path inputs
|
||||
*/
|
||||
|
||||
L(AUX_BRANCH_RETURN):
|
||||
testl %r9d, %r9d
|
||||
|
||||
/* Go to special inputs processing branch */
|
||||
/* Go to special inputs processing branch */
|
||||
jne L(SPECIAL_VALUES_BRANCH)
|
||||
# LOE rbx r12 r13 r14 r15 r9d ymm0 ymm15
|
||||
|
||||
/* Restore registers
|
||||
/* Restore registers
|
||||
* and exit the function
|
||||
*/
|
||||
|
||||
@ -195,7 +194,7 @@ L(EXIT):
|
||||
cfi_def_cfa(6, 16)
|
||||
cfi_offset(6, -16)
|
||||
|
||||
/* Branch to process
|
||||
/* Branch to process
|
||||
* special inputs
|
||||
*/
|
||||
|
||||
@ -221,18 +220,18 @@ L(SPECIAL_VALUES_BRANCH):
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE rbx r15 r12d r13d
|
||||
|
||||
/* Range mask
|
||||
/* Range mask
|
||||
* bits check
|
||||
*/
|
||||
|
||||
L(RANGEMASK_CHECK):
|
||||
btl %r12d, %r13d
|
||||
|
||||
/* Call scalar math function */
|
||||
/* Call scalar math function */
|
||||
jc L(SCALAR_MATH_CALL)
|
||||
# LOE rbx r15 r12d r13d
|
||||
|
||||
/* Special inputs
|
||||
/* Special inputs
|
||||
* processing loop
|
||||
*/
|
||||
|
||||
@ -240,7 +239,7 @@ L(SPECIAL_VALUES_LOOP):
|
||||
incl %r12d
|
||||
cmpl $4, %r12d
|
||||
|
||||
/* Check bits in range mask */
|
||||
/* Check bits in range mask */
|
||||
jl L(RANGEMASK_CHECK)
|
||||
# LOE rbx r15 r12d r13d
|
||||
|
||||
@ -252,7 +251,7 @@ L(SPECIAL_VALUES_LOOP):
|
||||
cfi_restore(14)
|
||||
vmovupd 64(%rsp), %ymm0
|
||||
|
||||
/* Go to exit */
|
||||
/* Go to exit */
|
||||
jmp L(EXIT)
|
||||
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
|
||||
@ -262,33 +261,33 @@ L(SPECIAL_VALUES_LOOP):
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE rbx r12 r13 r14 r15 ymm0
|
||||
|
||||
/* Scalar math function call
|
||||
/* Scalar math function call
|
||||
* to process special input
|
||||
*/
|
||||
|
||||
L(SCALAR_MATH_CALL):
|
||||
movl %r12d, %r14d
|
||||
movsd 32(%rsp,%r14,8), %xmm0
|
||||
movsd 32(%rsp, %r14, 8), %xmm0
|
||||
call tan@PLT
|
||||
# LOE rbx r14 r15 r12d r13d xmm0
|
||||
|
||||
movsd %xmm0, 64(%rsp,%r14,8)
|
||||
movsd %xmm0, 64(%rsp, %r14, 8)
|
||||
|
||||
/* Process special inputs in loop */
|
||||
/* Process special inputs in loop */
|
||||
jmp L(SPECIAL_VALUES_LOOP)
|
||||
cfi_restore(12)
|
||||
cfi_restore(13)
|
||||
cfi_restore(14)
|
||||
# LOE rbx r15 r12d r13d
|
||||
|
||||
/* Auxiliary branch
|
||||
/* Auxiliary branch
|
||||
* for out of main path inputs
|
||||
*/
|
||||
|
||||
L(AUX_BRANCH):
|
||||
vpand .FLT_17(%rip), %ymm15, %ymm4
|
||||
|
||||
/*
|
||||
/*
|
||||
* Get the (2^a / 2pi) mod 1 values from the table.
|
||||
* Because doesn't have L-type gather, we need a trivial cast
|
||||
*/
|
||||
@ -303,41 +302,41 @@ L(AUX_BRANCH):
|
||||
vmovd %xmm13, %r10d
|
||||
vmovd %xmm8, %edx
|
||||
vpextrd $2, %xmm13, %r11d
|
||||
lea (%r10,%r10,2), %edi
|
||||
lea (%r10, %r10, 2), %edi
|
||||
vpextrd $2, %xmm8, %eax
|
||||
lea (%rdx,%rdx,2), %ecx
|
||||
lea (%rdx, %rdx, 2), %ecx
|
||||
shll $3, %edi
|
||||
shll $3, %ecx
|
||||
|
||||
/*
|
||||
/*
|
||||
* Also get the significand as an integer
|
||||
* NB: adding in the integer bit is wrong for denorms!
|
||||
* To make this work for denorms we should do something slightly different
|
||||
*/
|
||||
vpand .FLT_18(%rip), %ymm15, %ymm13
|
||||
lea (%r11,%r11,2), %esi
|
||||
vmovq (%rdi,%r8), %xmm10
|
||||
vmovq (%rcx,%r8), %xmm5
|
||||
vmovq 8(%rdi,%r8), %xmm3
|
||||
lea (%r11, %r11, 2), %esi
|
||||
vmovq (%rdi, %r8), %xmm10
|
||||
vmovq (%rcx, %r8), %xmm5
|
||||
vmovq 8(%rdi, %r8), %xmm3
|
||||
shll $3, %esi
|
||||
lea (%rax,%rax,2), %r10d
|
||||
lea (%rax, %rax, 2), %r10d
|
||||
shll $3, %r10d
|
||||
vmovhpd (%rsi,%r8), %xmm10, %xmm9
|
||||
vmovhpd (%r10,%r8), %xmm5, %xmm12
|
||||
vmovhpd (%rsi, %r8), %xmm10, %xmm9
|
||||
vmovhpd (%r10, %r8), %xmm5, %xmm12
|
||||
vpaddq .FLT_19(%rip), %ymm13, %ymm8
|
||||
vmovq 16(%rdi,%r8), %xmm1
|
||||
vmovq 8(%rcx,%r8), %xmm7
|
||||
vmovq 16(%rcx,%r8), %xmm11
|
||||
vmovhpd 8(%rsi,%r8), %xmm3, %xmm6
|
||||
vmovq 16(%rdi, %r8), %xmm1
|
||||
vmovq 8(%rcx, %r8), %xmm7
|
||||
vmovq 16(%rcx, %r8), %xmm11
|
||||
vmovhpd 8(%rsi, %r8), %xmm3, %xmm6
|
||||
|
||||
/* Load constants (not all needed at once) */
|
||||
/* Load constants (not all needed at once) */
|
||||
lea _dCoeffs+96+__svml_dtan_data_internal(%rip), %rcx
|
||||
vmovhpd 16(%rsi,%r8), %xmm1, %xmm4
|
||||
vmovhpd 8(%r10,%r8), %xmm7, %xmm2
|
||||
vmovhpd 16(%r10,%r8), %xmm11, %xmm0
|
||||
vmovhpd 16(%rsi, %r8), %xmm1, %xmm4
|
||||
vmovhpd 8(%r10, %r8), %xmm7, %xmm2
|
||||
vmovhpd 16(%r10, %r8), %xmm11, %xmm0
|
||||
vinsertf128 $1, %xmm12, %ymm9, %ymm10
|
||||
|
||||
/*
|
||||
/*
|
||||
* Break the P_xxx and m into 32-bit chunks ready for
|
||||
* the long multiplication via 32x32->64 multiplications
|
||||
*/
|
||||
@ -381,7 +380,7 @@ L(AUX_BRANCH):
|
||||
vpaddq %ymm13, %ymm1, %ymm3
|
||||
vpmuludq %ymm10, %ymm6, %ymm6
|
||||
|
||||
/* Now do the big multiplication and carry propagation */
|
||||
/* Now do the big multiplication and carry propagation */
|
||||
vpmuludq %ymm7, %ymm9, %ymm7
|
||||
vpaddq %ymm3, %ymm0, %ymm2
|
||||
vpand %ymm12, %ymm6, %ymm10
|
||||
@ -390,11 +389,11 @@ L(AUX_BRANCH):
|
||||
vpand %ymm12, %ymm2, %ymm6
|
||||
vpaddq %ymm9, %ymm10, %ymm8
|
||||
|
||||
/* Assemble reduced argument from the pieces */
|
||||
/* Assemble reduced argument from the pieces */
|
||||
vpand %ymm12, %ymm11, %ymm12
|
||||
vpaddq %ymm8, %ymm13, %ymm13
|
||||
|
||||
/*
|
||||
/*
|
||||
* We want to incorporate the original sign now too.
|
||||
* Do it here for convenience in getting the right N value,
|
||||
* though we could wait right to the end if we were prepared
|
||||
@ -404,7 +403,7 @@ L(AUX_BRANCH):
|
||||
vpand .FLT_21(%rip), %ymm15, %ymm8
|
||||
vpaddq %ymm12, %ymm5, %ymm9
|
||||
|
||||
/*
|
||||
/*
|
||||
* Now round at the 2^-9 bit position for reduction mod pi/2^8
|
||||
* instead of the original 2pi (but still with the same 2pi scaling).
|
||||
* Use a shifter of 2^43 + 2^42.
|
||||
@ -416,13 +415,13 @@ L(AUX_BRANCH):
|
||||
vmovupd .FLT_23(%rip), %ymm5
|
||||
vpsllq $32, %ymm13, %ymm3
|
||||
|
||||
/*
|
||||
/*
|
||||
* Create floating-point high part, implicitly adding integer bit 1
|
||||
* Incorporate overall sign at this stage too.
|
||||
*/
|
||||
vpxor .FLT_22(%rip), %ymm8, %ymm10
|
||||
|
||||
/*
|
||||
/*
|
||||
* Create floating-point low and medium parts, respectively
|
||||
* lo_23, ... lo_0, 0, ..., 0
|
||||
* hi_11, ... hi_0, lo_63, ..., lo_24
|
||||
@ -447,7 +446,7 @@ L(AUX_BRANCH):
|
||||
vpsllq $40, %ymm8, %ymm3
|
||||
vpor %ymm9, %ymm3, %ymm11
|
||||
|
||||
/*
|
||||
/*
|
||||
* If the magnitude of the input is <= 2^-20, then
|
||||
* just pass through the input, since no reduction will be needed and
|
||||
* the main path will only work accurately if the reduced argument is
|
||||
@ -457,12 +456,12 @@ L(AUX_BRANCH):
|
||||
vpor %ymm5, %ymm11, %ymm9
|
||||
vsubpd %ymm5, %ymm9, %ymm8
|
||||
|
||||
/* Now add them up into 2 reasonably aligned pieces */
|
||||
/* Now add them up into 2 reasonably aligned pieces */
|
||||
vaddpd %ymm8, %ymm0, %ymm12
|
||||
vsubpd %ymm12, %ymm0, %ymm0
|
||||
vaddpd %ymm0, %ymm8, %ymm9
|
||||
|
||||
/*
|
||||
/*
|
||||
* Now multiply those numbers all by 2 pi, reasonably accurately.
|
||||
* (RHi + RLo) * (pi_lead + pi_trail) ~=
|
||||
* RHi * pi_lead + (RHi * pi_trail + RLo * pi_lead)
|
||||
@ -473,10 +472,10 @@ L(AUX_BRANCH):
|
||||
vmovapd %ymm8, %ymm13
|
||||
vfmsub213pd %ymm7, %ymm12, %ymm13
|
||||
|
||||
/* Grab our final N value as an integer, appropriately masked mod 2^9 */
|
||||
/* Grab our final N value as an integer, appropriately masked mod 2^9 */
|
||||
vandps .FLT_32(%rip), %ymm10, %ymm10
|
||||
|
||||
/*
|
||||
/*
|
||||
* Argument reduction is now finished: x = n * pi/256 + r
|
||||
* where n = lIndex and r = dZ
|
||||
* But we have n modulo 512, needed for sin/cos with period 2pi
|
||||
@ -493,7 +492,7 @@ L(AUX_BRANCH):
|
||||
vorpd %ymm1, %ymm2, %ymm0
|
||||
vandpd %ymm11, %ymm4, %ymm13
|
||||
|
||||
/*
|
||||
/*
|
||||
* The output is _VRES_Z (high) + _VRES_E (low), and the integer part is _VRES_IND
|
||||
* Simply absorb E into Z instead of keeping a 2-part result.
|
||||
*/
|
||||
@ -503,52 +502,52 @@ L(AUX_BRANCH):
|
||||
vmovd %xmm5, %r11d
|
||||
imull $104, %edi, %edx
|
||||
imull $104, %r11d, %esi
|
||||
vmovq -96(%rdx,%rcx), %xmm9
|
||||
vmovq -80(%rdx,%rcx), %xmm6
|
||||
vmovq -96(%rdx, %rcx), %xmm9
|
||||
vmovq -80(%rdx, %rcx), %xmm6
|
||||
vpextrd $2, %xmm10, %r8d
|
||||
vpextrd $2, %xmm5, %edi
|
||||
imull $104, %r8d, %eax
|
||||
imull $104, %edi, %r8d
|
||||
vmovq -96(%rsi,%rcx), %xmm12
|
||||
vmovq -72(%rdx,%rcx), %xmm4
|
||||
vmovq -80(%rsi,%rcx), %xmm2
|
||||
vmovhpd -96(%rax,%rcx), %xmm9, %xmm8
|
||||
vmovhpd -96(%r8,%rcx), %xmm12, %xmm3
|
||||
vmovq -56(%rdx,%rcx), %xmm9
|
||||
vmovq -72(%rsi,%rcx), %xmm0
|
||||
vmovq -56(%rsi,%rcx), %xmm5
|
||||
vmovhpd -80(%rax,%rcx), %xmm6, %xmm7
|
||||
vmovhpd -72(%rax,%rcx), %xmm4, %xmm11
|
||||
vmovhpd -80(%r8,%rcx), %xmm2, %xmm1
|
||||
vmovhpd -72(%r8,%rcx), %xmm0, %xmm10
|
||||
vmovq -48(%rdx,%rcx), %xmm6
|
||||
vmovq -48(%rsi,%rcx), %xmm0
|
||||
vmovq -96(%rsi, %rcx), %xmm12
|
||||
vmovq -72(%rdx, %rcx), %xmm4
|
||||
vmovq -80(%rsi, %rcx), %xmm2
|
||||
vmovhpd -96(%rax, %rcx), %xmm9, %xmm8
|
||||
vmovhpd -96(%r8, %rcx), %xmm12, %xmm3
|
||||
vmovq -56(%rdx, %rcx), %xmm9
|
||||
vmovq -72(%rsi, %rcx), %xmm0
|
||||
vmovq -56(%rsi, %rcx), %xmm5
|
||||
vmovhpd -80(%rax, %rcx), %xmm6, %xmm7
|
||||
vmovhpd -72(%rax, %rcx), %xmm4, %xmm11
|
||||
vmovhpd -80(%r8, %rcx), %xmm2, %xmm1
|
||||
vmovhpd -72(%r8, %rcx), %xmm0, %xmm10
|
||||
vmovq -48(%rdx, %rcx), %xmm6
|
||||
vmovq -48(%rsi, %rcx), %xmm0
|
||||
vinsertf128 $1, %xmm3, %ymm8, %ymm12
|
||||
vmovhpd -56(%rax,%rcx), %xmm9, %xmm8
|
||||
vmovhpd -56(%r8,%rcx), %xmm5, %xmm3
|
||||
vmovq -40(%rdx,%rcx), %xmm9
|
||||
vmovq -40(%rsi,%rcx), %xmm5
|
||||
vmovhpd -56(%rax, %rcx), %xmm9, %xmm8
|
||||
vmovhpd -56(%r8, %rcx), %xmm5, %xmm3
|
||||
vmovq -40(%rdx, %rcx), %xmm9
|
||||
vmovq -40(%rsi, %rcx), %xmm5
|
||||
vinsertf128 $1, %xmm1, %ymm7, %ymm2
|
||||
vinsertf128 $1, %xmm10, %ymm11, %ymm1
|
||||
vinsertf128 $1, %xmm3, %ymm8, %ymm4
|
||||
vmovhpd -48(%rax,%rcx), %xmm6, %xmm7
|
||||
vmovhpd -40(%rax,%rcx), %xmm9, %xmm8
|
||||
vmovq -32(%rdx,%rcx), %xmm11
|
||||
vmovhpd -48(%r8,%rcx), %xmm0, %xmm10
|
||||
vmovhpd -40(%r8,%rcx), %xmm5, %xmm6
|
||||
vmovq -32(%rsi,%rcx), %xmm9
|
||||
vmovhpd -32(%rax,%rcx), %xmm11, %xmm0
|
||||
vmovq -24(%rsi,%rcx), %xmm11
|
||||
vmovq -24(%rdx,%rcx), %xmm5
|
||||
vmovhpd -48(%rax, %rcx), %xmm6, %xmm7
|
||||
vmovhpd -40(%rax, %rcx), %xmm9, %xmm8
|
||||
vmovq -32(%rdx, %rcx), %xmm11
|
||||
vmovhpd -48(%r8, %rcx), %xmm0, %xmm10
|
||||
vmovhpd -40(%r8, %rcx), %xmm5, %xmm6
|
||||
vmovq -32(%rsi, %rcx), %xmm9
|
||||
vmovhpd -32(%rax, %rcx), %xmm11, %xmm0
|
||||
vmovq -24(%rsi, %rcx), %xmm11
|
||||
vmovq -24(%rdx, %rcx), %xmm5
|
||||
|
||||
/*
|
||||
/*
|
||||
* Sum of dominant component(s)
|
||||
* Compute C0_hi + C1_hi * Z + Recip_hi = H4
|
||||
* H2 = C0_hi + C1_hi * Z (exact since C1_hi is 1 bit)
|
||||
*/
|
||||
vfmadd213pd %ymm1, %ymm13, %ymm4
|
||||
|
||||
/*
|
||||
/*
|
||||
* Higher polynomial terms
|
||||
* Stage 1 (with unlimited parallelism)
|
||||
* Z2 = Z^2
|
||||
@ -556,40 +555,40 @@ L(AUX_BRANCH):
|
||||
vmulpd %ymm13, %ymm13, %ymm1
|
||||
vinsertf128 $1, %xmm10, %ymm7, %ymm3
|
||||
vinsertf128 $1, %xmm6, %ymm8, %ymm10
|
||||
vmovhpd -32(%r8,%rcx), %xmm9, %xmm8
|
||||
vmovhpd -24(%r8,%rcx), %xmm11, %xmm9
|
||||
vmovq -16(%rsi,%rcx), %xmm11
|
||||
vmovhpd -24(%rax,%rcx), %xmm5, %xmm7
|
||||
vmovhpd -32(%r8, %rcx), %xmm9, %xmm8
|
||||
vmovhpd -24(%r8, %rcx), %xmm11, %xmm9
|
||||
vmovq -16(%rsi, %rcx), %xmm11
|
||||
vmovhpd -24(%rax, %rcx), %xmm5, %xmm7
|
||||
|
||||
/* P4 = C1_lo + C2 * Z */
|
||||
/* P4 = C1_lo + C2 * Z */
|
||||
vfmadd213pd %ymm3, %ymm13, %ymm10
|
||||
vinsertf128 $1, %xmm8, %ymm0, %ymm6
|
||||
vmovq -16(%rdx,%rcx), %xmm8
|
||||
vmovhpd -16(%rax,%rcx), %xmm8, %xmm5
|
||||
vmovhpd -16(%r8,%rcx), %xmm11, %xmm0
|
||||
vmovq -8(%rdx,%rcx), %xmm8
|
||||
vmovq -8(%rsi,%rcx), %xmm11
|
||||
vmovq -16(%rdx, %rcx), %xmm8
|
||||
vmovhpd -16(%rax, %rcx), %xmm8, %xmm5
|
||||
vmovhpd -16(%r8, %rcx), %xmm11, %xmm0
|
||||
vmovq -8(%rdx, %rcx), %xmm8
|
||||
vmovq -8(%rsi, %rcx), %xmm11
|
||||
vinsertf128 $1, %xmm9, %ymm7, %ymm9
|
||||
vinsertf128 $1, %xmm0, %ymm5, %ymm7
|
||||
vmovhpd -8(%rax,%rcx), %xmm8, %xmm5
|
||||
vmovhpd -8(%r8,%rcx), %xmm11, %xmm0
|
||||
vmovhpd -8(%rax, %rcx), %xmm8, %xmm5
|
||||
vmovhpd -8(%r8, %rcx), %xmm11, %xmm0
|
||||
|
||||
/* P5 = C3 + C4 * Z */
|
||||
/* P5 = C3 + C4 * Z */
|
||||
vfmadd213pd %ymm6, %ymm13, %ymm9
|
||||
|
||||
/* P10 = C1_lo + C2 * Z + C3 * Z^2 + C4 * Z^3 */
|
||||
/* P10 = C1_lo + C2 * Z + C3 * Z^2 + C4 * Z^3 */
|
||||
vfmadd213pd %ymm10, %ymm1, %ymm9
|
||||
vinsertf128 $1, %xmm0, %ymm5, %ymm8
|
||||
vmovq (%rdx,%rcx), %xmm5
|
||||
vmovq (%rsi,%rcx), %xmm0
|
||||
vmovhpd (%rax,%rcx), %xmm5, %xmm11
|
||||
vmovhpd (%r8,%rcx), %xmm0, %xmm5
|
||||
vmovq (%rdx, %rcx), %xmm5
|
||||
vmovq (%rsi, %rcx), %xmm0
|
||||
vmovhpd (%rax, %rcx), %xmm5, %xmm11
|
||||
vmovhpd (%r8, %rcx), %xmm0, %xmm5
|
||||
|
||||
/* P6 = C5 + C6 * Z */
|
||||
/* P6 = C5 + C6 * Z */
|
||||
vfmadd213pd %ymm7, %ymm13, %ymm8
|
||||
vinsertf128 $1, %xmm5, %ymm11, %ymm5
|
||||
|
||||
/*
|
||||
/*
|
||||
* Compute reciprocal component
|
||||
* Construct a separate reduced argument modulo pi near pi/2 multiples.
|
||||
* i.e. (pi/2 - x) mod pi, simply by subtracting the reduced argument
|
||||
@ -597,21 +596,21 @@ L(AUX_BRANCH):
|
||||
*/
|
||||
vsubpd %ymm13, %ymm12, %ymm11
|
||||
|
||||
/* P9 = C5 + C6 * Z + C7 * Z^2 */
|
||||
/* P9 = C5 + C6 * Z + C7 * Z^2 */
|
||||
vfmadd213pd %ymm8, %ymm1, %ymm5
|
||||
|
||||
/* Now compute an approximate reciprocal to mix into the computation. */
|
||||
/* Now compute an approximate reciprocal to mix into the computation. */
|
||||
vcvtpd2ps %ymm11, %xmm12
|
||||
vrcpps %xmm12, %xmm12
|
||||
vcvtps2pd %xmm12, %ymm0
|
||||
|
||||
/*
|
||||
/*
|
||||
* Now compute the error dEr where dRecip_hi = (1/R_full) * (1 - dEr)
|
||||
* so that we can compensate for it.
|
||||
*/
|
||||
vfnmadd213pd .FLT_34(%rip), %ymm0, %ymm11
|
||||
|
||||
/*
|
||||
/*
|
||||
* Get a working-precision reciprocal 1/dR_full
|
||||
* using a fourth-order polynomial approximation
|
||||
* R + (E*R) * (1 + E) * (1 + E^2)
|
||||
@ -621,7 +620,7 @@ L(AUX_BRANCH):
|
||||
vfmadd213pd %ymm0, %ymm0, %ymm12
|
||||
vfmadd213pd %ymm0, %ymm11, %ymm12
|
||||
|
||||
/*
|
||||
/*
|
||||
*
|
||||
* End of large arguments path
|
||||
*
|
||||
@ -629,29 +628,29 @@ L(AUX_BRANCH):
|
||||
*/
|
||||
vmovupd (%rsp), %ymm0
|
||||
|
||||
/*
|
||||
/*
|
||||
* dRecip_hi is only used when dTau is one (cotangent path)
|
||||
* H4 = C0_hi + C1_hi * Z + Recip_hi
|
||||
*/
|
||||
vfmadd213pd %ymm4, %ymm2, %ymm12
|
||||
|
||||
/*
|
||||
/*
|
||||
* Stage 2 (with unlimited parallelism)
|
||||
* Z4 = Z^4
|
||||
*/
|
||||
vmulpd %ymm1, %ymm1, %ymm2
|
||||
|
||||
/*
|
||||
/*
|
||||
* Stage 3 (with unlimited parallelism)
|
||||
* P12 = C1_lo + C2 * Z + ... + C7 * Z^6
|
||||
*/
|
||||
vfmadd213pd %ymm9, %ymm2, %ymm5
|
||||
|
||||
/* And now the very final summation */
|
||||
/* And now the very final summation */
|
||||
vfmadd213pd %ymm12, %ymm13, %ymm5
|
||||
vblendvpd %ymm14, %ymm5, %ymm0, %ymm0
|
||||
|
||||
/* Return to main vector processing path */
|
||||
/* Return to main vector processing path */
|
||||
jmp L(AUX_BRANCH_RETURN)
|
||||
# LOE rbx r12 r13 r14 r15 r9d ymm0 ymm15
|
||||
END(_ZGVdN4v_tan_avx2)
|
||||
@ -660,117 +659,116 @@ END(_ZGVdN4v_tan_avx2)
|
||||
.align 32
|
||||
|
||||
.FLT_17:
|
||||
.long 0x00000000,0x7ff00000,0x00000000,0x7ff00000,0x00000000,0x7ff00000,0x00000000,0x7ff00000
|
||||
.type .FLT_17,@object
|
||||
.size .FLT_17,32
|
||||
.long 0x00000000, 0x7ff00000, 0x00000000, 0x7ff00000, 0x00000000, 0x7ff00000, 0x00000000, 0x7ff00000
|
||||
.type .FLT_17, @object
|
||||
.size .FLT_17, 32
|
||||
.align 32
|
||||
|
||||
.FLT_18:
|
||||
.long 0xffffffff,0x000fffff,0xffffffff,0x000fffff,0xffffffff,0x000fffff,0xffffffff,0x000fffff
|
||||
.type .FLT_18,@object
|
||||
.size .FLT_18,32
|
||||
.long 0xffffffff, 0x000fffff, 0xffffffff, 0x000fffff, 0xffffffff, 0x000fffff, 0xffffffff, 0x000fffff
|
||||
.type .FLT_18, @object
|
||||
.size .FLT_18, 32
|
||||
.align 32
|
||||
|
||||
.FLT_19:
|
||||
.long 0x00000000,0x00100000,0x00000000,0x00100000,0x00000000,0x00100000,0x00000000,0x00100000
|
||||
.type .FLT_19,@object
|
||||
.size .FLT_19,32
|
||||
.long 0x00000000, 0x00100000, 0x00000000, 0x00100000, 0x00000000, 0x00100000, 0x00000000, 0x00100000
|
||||
.type .FLT_19, @object
|
||||
.size .FLT_19, 32
|
||||
.align 32
|
||||
|
||||
.FLT_20:
|
||||
.long 0xffffffff,0x00000000,0xffffffff,0x00000000,0xffffffff,0x00000000,0xffffffff,0x00000000
|
||||
.type .FLT_20,@object
|
||||
.size .FLT_20,32
|
||||
.long 0xffffffff, 0x00000000, 0xffffffff, 0x00000000, 0xffffffff, 0x00000000, 0xffffffff, 0x00000000
|
||||
.type .FLT_20, @object
|
||||
.size .FLT_20, 32
|
||||
.align 32
|
||||
|
||||
.FLT_21:
|
||||
.long 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000
|
||||
.type .FLT_21,@object
|
||||
.size .FLT_21,32
|
||||
.long 0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000
|
||||
.type .FLT_21, @object
|
||||
.size .FLT_21, 32
|
||||
.align 32
|
||||
|
||||
.FLT_22:
|
||||
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
|
||||
.type .FLT_22,@object
|
||||
.size .FLT_22,32
|
||||
.long 0x00000000, 0x3ff00000, 0x00000000, 0x3ff00000, 0x00000000, 0x3ff00000, 0x00000000, 0x3ff00000
|
||||
.type .FLT_22, @object
|
||||
.size .FLT_22, 32
|
||||
.align 32
|
||||
|
||||
.FLT_23:
|
||||
.long 0x00000000,0x42a80000,0x00000000,0x42a80000,0x00000000,0x42a80000,0x00000000,0x42a80000
|
||||
.type .FLT_23,@object
|
||||
.size .FLT_23,32
|
||||
.long 0x00000000, 0x42a80000, 0x00000000, 0x42a80000, 0x00000000, 0x42a80000, 0x00000000, 0x42a80000
|
||||
.type .FLT_23, @object
|
||||
.size .FLT_23, 32
|
||||
.align 32
|
||||
|
||||
.FLT_24:
|
||||
.long 0x00000000,0x39700000,0x00000000,0x39700000,0x00000000,0x39700000,0x00000000,0x39700000
|
||||
.type .FLT_24,@object
|
||||
.size .FLT_24,32
|
||||
.long 0x00000000, 0x39700000, 0x00000000, 0x39700000, 0x00000000, 0x39700000, 0x00000000, 0x39700000
|
||||
.type .FLT_24, @object
|
||||
.size .FLT_24, 32
|
||||
.align 32
|
||||
|
||||
.FLT_25:
|
||||
.long 0x00ffffff,0x00000000,0x00ffffff,0x00000000,0x00ffffff,0x00000000,0x00ffffff,0x00000000
|
||||
.type .FLT_25,@object
|
||||
.size .FLT_25,32
|
||||
.long 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000
|
||||
.type .FLT_25, @object
|
||||
.size .FLT_25, 32
|
||||
.align 32
|
||||
|
||||
.FLT_26:
|
||||
.long 0x00000000,0x3cb00000,0x00000000,0x3cb00000,0x00000000,0x3cb00000,0x00000000,0x3cb00000
|
||||
.type .FLT_26,@object
|
||||
.size .FLT_26,32
|
||||
.long 0x00000000, 0x3cb00000, 0x00000000, 0x3cb00000, 0x00000000, 0x3cb00000, 0x00000000, 0x3cb00000
|
||||
.type .FLT_26, @object
|
||||
.size .FLT_26, 32
|
||||
.align 32
|
||||
|
||||
.FLT_27:
|
||||
.long 0x00000fff,0x00000000,0x00000fff,0x00000000,0x00000fff,0x00000000,0x00000fff,0x00000000
|
||||
.type .FLT_27,@object
|
||||
.size .FLT_27,32
|
||||
.long 0x00000fff, 0x00000000, 0x00000fff, 0x00000000, 0x00000fff, 0x00000000, 0x00000fff, 0x00000000
|
||||
.type .FLT_27, @object
|
||||
.size .FLT_27, 32
|
||||
.align 32
|
||||
|
||||
.FLT_28:
|
||||
.long 0x54442d18,0x401921fb,0x54442d18,0x401921fb,0x54442d18,0x401921fb,0x54442d18,0x401921fb
|
||||
.type .FLT_28,@object
|
||||
.size .FLT_28,32
|
||||
.long 0x54442d18, 0x401921fb, 0x54442d18, 0x401921fb, 0x54442d18, 0x401921fb, 0x54442d18, 0x401921fb
|
||||
.type .FLT_28, @object
|
||||
.size .FLT_28, 32
|
||||
.align 32
|
||||
|
||||
.FLT_29:
|
||||
.long 0x33145c07,0x3cb1a626,0x33145c07,0x3cb1a626,0x33145c07,0x3cb1a626,0x33145c07,0x3cb1a626
|
||||
.type .FLT_29,@object
|
||||
.size .FLT_29,32
|
||||
.long 0x33145c07, 0x3cb1a626, 0x33145c07, 0x3cb1a626, 0x33145c07, 0x3cb1a626, 0x33145c07, 0x3cb1a626
|
||||
.type .FLT_29, @object
|
||||
.size .FLT_29, 32
|
||||
.align 32
|
||||
|
||||
.FLT_30:
|
||||
.long 0xffffffff,0x7fffffff,0xffffffff,0x7fffffff,0xffffffff,0x7fffffff,0xffffffff,0x7fffffff
|
||||
.type .FLT_30,@object
|
||||
.size .FLT_30,32
|
||||
.long 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff
|
||||
.type .FLT_30, @object
|
||||
.size .FLT_30, 32
|
||||
.align 32
|
||||
|
||||
.FLT_31:
|
||||
.long 0x00000000,0x3eb00000,0x00000000,0x3eb00000,0x00000000,0x3eb00000,0x00000000,0x3eb00000
|
||||
.type .FLT_31,@object
|
||||
.size .FLT_31,32
|
||||
.long 0x00000000, 0x3eb00000, 0x00000000, 0x3eb00000, 0x00000000, 0x3eb00000, 0x00000000, 0x3eb00000
|
||||
.type .FLT_31, @object
|
||||
.size .FLT_31, 32
|
||||
.align 32
|
||||
|
||||
.FLT_32:
|
||||
.long 0x000001ff,0x00000000,0x000001ff,0x00000000,0x000001ff,0x00000000,0x000001ff,0x00000000
|
||||
.type .FLT_32,@object
|
||||
.size .FLT_32,32
|
||||
.long 0x000001ff, 0x00000000, 0x000001ff, 0x00000000, 0x000001ff, 0x00000000, 0x000001ff, 0x00000000
|
||||
.type .FLT_32, @object
|
||||
.size .FLT_32, 32
|
||||
.align 32
|
||||
|
||||
.FLT_33:
|
||||
.long 0x000000ff,0x00000000,0x000000ff,0x00000000,0x000000ff,0x00000000,0x000000ff,0x00000000
|
||||
.type .FLT_33,@object
|
||||
.size .FLT_33,32
|
||||
.long 0x000000ff, 0x00000000, 0x000000ff, 0x00000000, 0x000000ff, 0x00000000, 0x000000ff, 0x00000000
|
||||
.type .FLT_33, @object
|
||||
.size .FLT_33, 32
|
||||
.align 32
|
||||
|
||||
.FLT_34:
|
||||
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
|
||||
.type .FLT_34,@object
|
||||
.size .FLT_34,32
|
||||
.long 0x00000000, 0x3ff00000, 0x00000000, 0x3ff00000, 0x00000000, 0x3ff00000, 0x00000000, 0x3ff00000
|
||||
.type .FLT_34, @object
|
||||
.size .FLT_34, 32
|
||||
.align 32
|
||||
|
||||
#ifdef __svml_dtan_data_internal_typedef
|
||||
typedef unsigned int VUINT32;
|
||||
typedef struct
|
||||
{
|
||||
typedef struct {
|
||||
__declspec(align(32)) VUINT32 _dAbsMask[4][2];
|
||||
__declspec(align(32)) VUINT32 _dRangeVal[4][2];
|
||||
__declspec(align(32)) VUINT32 _dRShift[4][2];
|
||||
@ -790,7 +788,7 @@ typedef unsigned int VUINT32;
|
||||
__declspec(align(32)) VUINT32 _dQ1[4][2];
|
||||
__declspec(align(32)) VUINT32 _dQ2[4][2];
|
||||
__declspec(align(32)) VUINT32 _dQ3[4][2];
|
||||
} __svml_dtan_data_internal;
|
||||
} __svml_dtan_data_internal;
|
||||
#endif
|
||||
__svml_dtan_data_internal:
|
||||
/* Shared value*/
|
||||
@ -4161,14 +4159,13 @@ __svml_dtan_data_internal:
|
||||
.align 32
|
||||
.quad 0xbf2b525b03bc92a6, 0xbf2b525b03bc92a6, 0xbf2b525b03bc92a6, 0xbf2b525b03bc92a6 /* _dQ3 */
|
||||
.align 32
|
||||
.type __svml_dtan_data_internal,@object
|
||||
.size __svml_dtan_data_internal,.-__svml_dtan_data_internal
|
||||
.type __svml_dtan_data_internal, @object
|
||||
.size __svml_dtan_data_internal, .-__svml_dtan_data_internal
|
||||
.align 32
|
||||
|
||||
#ifdef __svml_dtan_reduction_data_internal_typedef
|
||||
typedef unsigned int VUINT32;
|
||||
typedef struct
|
||||
{
|
||||
typedef struct {
|
||||
__declspec(align(32)) VUINT32 _dPtable[2048][3][2];
|
||||
} __svml_dtan_reduction_data_internal;
|
||||
#endif
|
||||
@ -6223,5 +6220,5 @@ __svml_dtan_reduction_data_internal:
|
||||
.quad 0x4F758FD7CBE2F67A, 0x0E73EF14A525D4D7, 0xF6BF623F1ABA10AC /* 2046 */
|
||||
.quad 0x9EEB1FAF97C5ECF4, 0x1CE7DE294A4BA9AF, 0xED7EC47E35742158 /* 2047 */
|
||||
.align 32
|
||||
.type __svml_dtan_reduction_data_internal,@object
|
||||
.size __svml_dtan_reduction_data_internal,.-__svml_dtan_reduction_data_internal
|
||||
.type __svml_dtan_reduction_data_internal, @object
|
||||
.size __svml_dtan_reduction_data_internal, .-__svml_dtan_reduction_data_internal
|
||||
|
Loading…
Reference in New Issue
Block a user