diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S index 8936a7150b..25df252108 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S @@ -34,379 +34,377 @@ /* Offsets for data table __svml_dcosh_data_internal */ -#define _dbT 0 -#define _dbInvLn2 2080 -#define _dbLn2hi 2112 -#define _dbLn2lo 2144 -#define _dbShifter 2176 -#define _iIndexMask 2208 -#define _dPC2 2240 -#define _dPC3 2272 -#define _dPC4 2304 -#define _iMaxIndex 2336 -#define _lExpMask 2368 -#define _dSign 2400 -#define _iDomainRange 2432 +#define _dbT 0 +#define _dbInvLn2 2080 +#define _dbLn2hi 2112 +#define _dbLn2lo 2144 +#define _dbShifter 2176 +#define _iIndexMask 2208 +#define _dPC2 2240 +#define _dPC3 2272 +#define _dPC4 2304 +#define _iMaxIndex 2336 +#define _lExpMask 2368 +#define _dSign 2400 +#define _iDomainRange 2432 #include - .text - .section .text.avx2,"ax",@progbits + .section .text.avx2, "ax", @progbits ENTRY(_ZGVdN4v_cosh_avx2) - pushq %rbp - cfi_def_cfa_offset(16) - movq %rsp, %rbp - cfi_def_cfa(6, 16) - cfi_offset(6, -16) - andq $-32, %rsp - subq $96, %rsp - lea _dbT+__svml_dcosh_data_internal(%rip), %rax - vmovupd _dSign+__svml_dcosh_data_internal(%rip), %ymm8 - vmovupd _dbShifter+__svml_dcosh_data_internal(%rip), %ymm6 + pushq %rbp + cfi_def_cfa_offset(16) + movq %rsp, %rbp + cfi_def_cfa(6, 16) + cfi_offset(6, -16) + andq $-32, %rsp + subq $96, %rsp + lea _dbT+__svml_dcosh_data_internal(%rip), %rax + vmovupd _dSign+__svml_dcosh_data_internal(%rip), %ymm8 + vmovupd _dbShifter+__svml_dcosh_data_internal(%rip), %ymm6 -/* - * Load argument - * dM = x*2^K/log(2) + RShifter - */ - vmovupd _dbInvLn2+__svml_dcosh_data_internal(%rip), %ymm3 + /* + * Load argument + * dM = x*2^K/log(2) + RShifter + */ + vmovupd _dbInvLn2+__svml_dcosh_data_internal(%rip), %ymm3 -/* - * trick - * 256=-iIndex - */ - vmovups _iMaxIndex+__svml_dcosh_data_internal(%rip), %xmm14 + /* + * trick + * 256=-iIndex + */ + vmovups _iMaxIndex+__svml_dcosh_data_internal(%rip), %xmm14 -/* dXSign=0x001000000000 */ - vpsrlq $11, %ymm8, %ymm5 - vmovapd %ymm0, %ymm7 + /* dXSign=0x001000000000 */ + vpsrlq $11, %ymm8, %ymm5 + vmovapd %ymm0, %ymm7 -/* Abs argument */ - vandnpd %ymm7, %ymm8, %ymm4 - vfmadd213pd %ymm6, %ymm4, %ymm3 + /* Abs argument */ + vandnpd %ymm7, %ymm8, %ymm4 + vfmadd213pd %ymm6, %ymm4, %ymm3 -/* Index and lookup */ - vextractf128 $1, %ymm3, %xmm12 - vshufps $136, %xmm12, %xmm3, %xmm13 - vpand _iIndexMask+__svml_dcosh_data_internal(%rip), %xmm13, %xmm15 - vpsubd %xmm15, %xmm14, %xmm0 + /* Index and lookup */ + vextractf128 $1, %ymm3, %xmm12 + vshufps $136, %xmm12, %xmm3, %xmm13 + vpand _iIndexMask+__svml_dcosh_data_internal(%rip), %xmm13, %xmm15 + vpsubd %xmm15, %xmm14, %xmm0 -/* iDomainRange*=3 */ - vpslld $3, %xmm0, %xmm2 - vmovd %xmm2, %r9d - vpextrd $2, %xmm2, %r11d - movslq %r9d, %r9 - vpextrd $1, %xmm2, %r10d - movslq %r11d, %r11 - movslq %r10d, %r10 - vmovsd (%rax,%r9), %xmm12 + /* iDomainRange*=3 */ + vpslld $3, %xmm0, %xmm2 + vmovd %xmm2, %r9d + vpextrd $2, %xmm2, %r11d + movslq %r9d, %r9 + vpextrd $1, %xmm2, %r10d + movslq %r11d, %r11 + movslq %r10d, %r10 + vmovsd (%rax, %r9), %xmm12 -/* - * Check for overflow\underflow - * - */ - vextractf128 $1, %ymm4, %xmm9 - vmovsd (%rax,%r11), %xmm14 - vmovhpd (%rax,%r10), %xmm12, %xmm13 - vshufps $221, %xmm9, %xmm4, %xmm10 + /* + * Check for overflow\underflow + * + */ + vextractf128 $1, %ymm4, %xmm9 + vmovsd (%rax, %r11), %xmm14 + vmovhpd (%rax, %r10), %xmm12, %xmm13 + vshufps $221, %xmm9, %xmm4, %xmm10 -/* iIndex*=3 */ - vpslld $3, %xmm15, %xmm9 + /* iIndex*=3 */ + vpslld $3, %xmm15, %xmm9 -/* - * R - * dN = dM - RShifter - */ - vsubpd %ymm6, %ymm3, %ymm15 - vmovd %xmm9, %ecx - vpcmpgtd _iDomainRange+__svml_dcosh_data_internal(%rip), %xmm10, %xmm11 - vmovupd _dbLn2hi+__svml_dcosh_data_internal(%rip), %ymm6 + /* + * R + * dN = dM - RShifter + */ + vsubpd %ymm6, %ymm3, %ymm15 + vmovd %xmm9, %ecx + vpcmpgtd _iDomainRange+__svml_dcosh_data_internal(%rip), %xmm10, %xmm11 + vmovupd _dbLn2hi+__svml_dcosh_data_internal(%rip), %ymm6 -/* - * G1,G2,G3: dTdif,dTn * 2^N,2^(-N) - * NB: copied from sinh_la - to be optimized!!!!! - */ - vpsllq $44, %ymm3, %ymm3 - vmovmskps %xmm11, %edx + /* + * G1, G2, G3: dTdif, dTn * 2^N, 2^(-N) + * NB: copied from sinh_la - to be optimized!!!!! + */ + vpsllq $44, %ymm3, %ymm3 + vmovmskps %xmm11, %edx -/* dR = dX - dN*Log2_hi/2^K */ - vfnmadd231pd %ymm6, %ymm15, %ymm4 + /* dR = dX - dN*Log2_hi/2^K */ + vfnmadd231pd %ymm6, %ymm15, %ymm4 -/* lM now is an EXP(2^N) */ - vpand _lExpMask+__svml_dcosh_data_internal(%rip), %ymm3, %ymm3 + /* lM now is an EXP(2^N) */ + vpand _lExpMask+__svml_dcosh_data_internal(%rip), %ymm3, %ymm3 -/* dR = (dX - dN*Log2_hi/2^K) - dN*Log2_lo/2^K */ - vfnmadd231pd _dbLn2lo+__svml_dcosh_data_internal(%rip), %ymm15, %ymm4 - movslq %ecx, %rcx - vpextrd $2, %xmm9, %edi - vpextrd $1, %xmm9, %esi - movslq %edi, %rdi - vmovsd (%rax,%rcx), %xmm1 - vpextrd $3, %xmm9, %r8d - vpextrd $3, %xmm2, %ecx - movslq %esi, %rsi - movslq %r8d, %r8 - movslq %ecx, %rcx + /* dR = (dX - dN*Log2_hi/2^K) - dN*Log2_lo/2^K */ + vfnmadd231pd _dbLn2lo+__svml_dcosh_data_internal(%rip), %ymm15, %ymm4 + movslq %ecx, %rcx + vpextrd $2, %xmm9, %edi + vpextrd $1, %xmm9, %esi + movslq %edi, %rdi + vmovsd (%rax, %rcx), %xmm1 + vpextrd $3, %xmm9, %r8d + vpextrd $3, %xmm2, %ecx + movslq %esi, %rsi + movslq %r8d, %r8 + movslq %ecx, %rcx -/* dR2 = dR^2 */ - vmulpd %ymm4, %ymm4, %ymm0 - vmovsd (%rax,%rdi), %xmm10 - vmovhpd (%rax,%rsi), %xmm1, %xmm8 - vmovhpd (%rax,%r8), %xmm10, %xmm11 - vmovhpd (%rax,%rcx), %xmm14, %xmm2 - vinsertf128 $1, %xmm11, %ymm8, %ymm1 - vinsertf128 $1, %xmm2, %ymm13, %ymm2 - vpaddq %ymm3, %ymm1, %ymm6 + /* dR2 = dR^2 */ + vmulpd %ymm4, %ymm4, %ymm0 + vmovsd (%rax, %rdi), %xmm10 + vmovhpd (%rax, %rsi), %xmm1, %xmm8 + vmovhpd (%rax, %r8), %xmm10, %xmm11 + vmovhpd (%rax, %rcx), %xmm14, %xmm2 + vinsertf128 $1, %xmm11, %ymm8, %ymm1 + vinsertf128 $1, %xmm2, %ymm13, %ymm2 + vpaddq %ymm3, %ymm1, %ymm6 -/* */ - vpsubq %ymm3, %ymm2, %ymm1 + /* */ + vpsubq %ymm3, %ymm2, %ymm1 -/* - * sinh(r) = r +r*r^2*a3 .... - * dSinh_r = r^2*a3 - */ - vmulpd _dPC3+__svml_dcosh_data_internal(%rip), %ymm0, %ymm2 + /* + * sinh(r) = r +r*r^2*a3 .... + * dSinh_r = r^2*a3 + */ + vmulpd _dPC3+__svml_dcosh_data_internal(%rip), %ymm0, %ymm2 -/* lX- = EXP(1/2) */ - vpsubq %ymm5, %ymm1, %ymm5 + /* lX- = EXP(1/2) */ + vpsubq %ymm5, %ymm1, %ymm5 -/* dSinh_r = r + r*r^2*a3 */ - vfmadd213pd %ymm4, %ymm4, %ymm2 + /* dSinh_r = r + r*r^2*a3 */ + vfmadd213pd %ymm4, %ymm4, %ymm2 -/* poly(r) = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */ - vmovupd _dPC4+__svml_dcosh_data_internal(%rip), %ymm4 + /* poly(r) = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */ + vmovupd _dPC4+__svml_dcosh_data_internal(%rip), %ymm4 -/* dTn = dTn*2^N - dTn*2^-N */ - vsubpd %ymm5, %ymm6, %ymm1 + /* dTn = dTn*2^N - dTn*2^-N */ + vsubpd %ymm5, %ymm6, %ymm1 -/* dTp = dTn*2^N + dTn*2^-N */ - vaddpd %ymm5, %ymm6, %ymm3 - vfmadd213pd _dPC2+__svml_dcosh_data_internal(%rip), %ymm0, %ymm4 - vmulpd %ymm2, %ymm1, %ymm1 - vmulpd %ymm4, %ymm0, %ymm0 + /* dTp = dTn*2^N + dTn*2^-N */ + vaddpd %ymm5, %ymm6, %ymm3 + vfmadd213pd _dPC2+__svml_dcosh_data_internal(%rip), %ymm0, %ymm4 + vmulpd %ymm2, %ymm1, %ymm1 + vmulpd %ymm4, %ymm0, %ymm0 -/* dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */ - vfmadd213pd %ymm1, %ymm3, %ymm0 + /* dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */ + vfmadd213pd %ymm1, %ymm3, %ymm0 -/* _VRES1 = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */ - vaddpd %ymm0, %ymm3, %ymm0 + /* _VRES1 = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */ + vaddpd %ymm0, %ymm3, %ymm0 -/* Ret H */ - testl %edx, %edx + /* Ret H */ + testl %edx, %edx -/* Go to special inputs processing branch */ - jne L(SPECIAL_VALUES_BRANCH) - # LOE rbx r12 r13 r14 r15 edx ymm0 ymm7 + /* Go to special inputs processing branch */ + jne L(SPECIAL_VALUES_BRANCH) + # LOE rbx r12 r13 r14 r15 edx ymm0 ymm7 -/* Restore registers - * and exit the function - */ + /* Restore registers + * and exit the function + */ L(EXIT): - movq %rbp, %rsp - popq %rbp - cfi_def_cfa(7, 8) - cfi_restore(6) - ret - cfi_def_cfa(6, 16) - cfi_offset(6, -16) + movq %rbp, %rsp + popq %rbp + cfi_def_cfa(7, 8) + cfi_restore(6) + ret + cfi_def_cfa(6, 16) + cfi_offset(6, -16) -/* Branch to process - * special inputs - */ + /* Branch to process + * special inputs + */ L(SPECIAL_VALUES_BRANCH): - vmovupd %ymm7, 32(%rsp) - vmovupd %ymm0, 64(%rsp) - # LOE rbx r12 r13 r14 r15 edx ymm0 + vmovupd %ymm7, 32(%rsp) + vmovupd %ymm0, 64(%rsp) + # LOE rbx r12 r13 r14 r15 edx ymm0 - xorl %eax, %eax - # LOE rbx r12 r13 r14 r15 eax edx + xorl %eax, %eax + # LOE rbx r12 r13 r14 r15 eax edx - vzeroupper - movq %r12, 16(%rsp) - /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */ - .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22 - movl %eax, %r12d - movq %r13, 8(%rsp) - /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */ - .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22 - movl %edx, %r13d - movq %r14, (%rsp) - /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */ - .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22 - # LOE rbx r15 r12d r13d + vzeroupper + movq %r12, 16(%rsp) + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */ + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22 + movl %eax, %r12d + movq %r13, 8(%rsp) + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */ + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22 + movl %edx, %r13d + movq %r14, (%rsp) + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */ + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22 + # LOE rbx r15 r12d r13d -/* Range mask - * bits check - */ + /* Range mask + * bits check + */ L(RANGEMASK_CHECK): - btl %r12d, %r13d + btl %r12d, %r13d -/* Call scalar math function */ - jc L(SCALAR_MATH_CALL) - # LOE rbx r15 r12d r13d + /* Call scalar math function */ + jc L(SCALAR_MATH_CALL) + # LOE rbx r15 r12d r13d -/* Special inputs - * processing loop - */ + /* Special inputs + * processing loop + */ L(SPECIAL_VALUES_LOOP): - incl %r12d - cmpl $4, %r12d + incl %r12d + cmpl $4, %r12d -/* Check bits in range mask */ - jl L(RANGEMASK_CHECK) - # LOE rbx r15 r12d r13d + /* Check bits in range mask */ + jl L(RANGEMASK_CHECK) + # LOE rbx r15 r12d r13d - movq 16(%rsp), %r12 - cfi_restore(12) - movq 8(%rsp), %r13 - cfi_restore(13) - movq (%rsp), %r14 - cfi_restore(14) - vmovupd 64(%rsp), %ymm0 + movq 16(%rsp), %r12 + cfi_restore(12) + movq 8(%rsp), %r13 + cfi_restore(13) + movq (%rsp), %r14 + cfi_restore(14) + vmovupd 64(%rsp), %ymm0 -/* Go to exit */ - jmp L(EXIT) - /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */ - .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22 - /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */ - .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22 - /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */ - .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22 - # LOE rbx r12 r13 r14 r15 ymm0 + /* Go to exit */ + jmp L(EXIT) + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */ + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22 + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */ + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22 + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */ + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22 + # LOE rbx r12 r13 r14 r15 ymm0 -/* Scalar math fucntion call - * to process special input - */ + /* Scalar math fucntion call + * to process special input + */ L(SCALAR_MATH_CALL): - movl %r12d, %r14d - movsd 32(%rsp,%r14,8), %xmm0 - call cosh@PLT - # LOE rbx r14 r15 r12d r13d xmm0 + movl %r12d, %r14d + movsd 32(%rsp, %r14, 8), %xmm0 + call cosh@PLT + # LOE rbx r14 r15 r12d r13d xmm0 - movsd %xmm0, 64(%rsp,%r14,8) + movsd %xmm0, 64(%rsp, %r14, 8) -/* Process special inputs in loop */ - jmp L(SPECIAL_VALUES_LOOP) - # LOE rbx r15 r12d r13d + /* Process special inputs in loop */ + jmp L(SPECIAL_VALUES_LOOP) + # LOE rbx r15 r12d r13d END(_ZGVdN4v_cosh_avx2) - .section .rodata, "a" - .align 32 + .section .rodata, "a" + .align 32 #ifdef __svml_dcosh_data_internal_typedef typedef unsigned int VUINT32; -typedef struct -{ - __declspec(align(32)) VUINT32 _dbT[(1 + (1<<8))][2]; //dTpj ONLY! - __declspec(align(32)) VUINT32 _dbInvLn2[4][2]; - __declspec(align(32)) VUINT32 _dbLn2hi[4][2]; - __declspec(align(32)) VUINT32 _dbLn2lo[4][2]; - __declspec(align(32)) VUINT32 _dbShifter[4][2]; - __declspec(align(32)) VUINT32 _iIndexMask[8][1]; //(1<