x86-64: Add vector cbrt/cbrtf implementation to libmvec

Implement vectorized cbrt/cbrtf containing SSE, AVX, AVX2 and
AVX512 versions for libmvec as per vector ABI.  It also contains
accuracy and ABI tests for vector cbrt/cbrtf with regenerated ulps.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
Sunil K Pandey 2021-12-29 09:11:23 -08:00
parent aa1809a1df
commit 2bf02c5843
50 changed files with 3031 additions and 1 deletions

View File

@ -197,4 +197,15 @@
#define __DECL_SIMD_sinhf32x
#define __DECL_SIMD_sinhf64x
#define __DECL_SIMD_sinhf128x
/* cbrt family: every macro below expands to nothing, so by default no
   SIMD declaration is attached.  An architecture header may #undef and
   redefine the ones it supports (the x86_64 math-vector.h does so for
   __DECL_SIMD_cbrt and __DECL_SIMD_cbrtf).  */
#define __DECL_SIMD_cbrt
#define __DECL_SIMD_cbrtf
#define __DECL_SIMD_cbrtl
#define __DECL_SIMD_cbrtf16
#define __DECL_SIMD_cbrtf32
#define __DECL_SIMD_cbrtf64
#define __DECL_SIMD_cbrtf128
#define __DECL_SIMD_cbrtf32x
#define __DECL_SIMD_cbrtf64x
#define __DECL_SIMD_cbrtf128x
#endif

View File

@ -149,7 +149,7 @@ __MATHCALL_VEC (hypot,, (_Mdouble_ __x, _Mdouble_ __y));
#if defined __USE_XOPEN_EXTENDED || defined __USE_ISOC99
/* Return the cube root of X. */
__MATHCALL (cbrt,, (_Mdouble_ __x));
__MATHCALL_VEC (cbrt,, (_Mdouble_ __x));
#endif

View File

@ -49,6 +49,7 @@ GLIBC_2.22 _ZGVeN8vvv_sincos F
GLIBC_2.35 _ZGVbN2v_acos F
GLIBC_2.35 _ZGVbN2v_asin F
GLIBC_2.35 _ZGVbN2v_atan F
GLIBC_2.35 _ZGVbN2v_cbrt F
GLIBC_2.35 _ZGVbN2v_cosh F
GLIBC_2.35 _ZGVbN2v_exp10 F
GLIBC_2.35 _ZGVbN2v_exp2 F
@ -58,6 +59,7 @@ GLIBC_2.35 _ZGVbN2vv_hypot F
GLIBC_2.35 _ZGVbN4v_acosf F
GLIBC_2.35 _ZGVbN4v_asinf F
GLIBC_2.35 _ZGVbN4v_atanf F
GLIBC_2.35 _ZGVbN4v_cbrtf F
GLIBC_2.35 _ZGVbN4v_coshf F
GLIBC_2.35 _ZGVbN4v_exp10f F
GLIBC_2.35 _ZGVbN4v_exp2f F
@ -67,6 +69,7 @@ GLIBC_2.35 _ZGVbN4vv_hypotf F
GLIBC_2.35 _ZGVcN4v_acos F
GLIBC_2.35 _ZGVcN4v_asin F
GLIBC_2.35 _ZGVcN4v_atan F
GLIBC_2.35 _ZGVcN4v_cbrt F
GLIBC_2.35 _ZGVcN4v_cosh F
GLIBC_2.35 _ZGVcN4v_exp10 F
GLIBC_2.35 _ZGVcN4v_exp2 F
@ -76,6 +79,7 @@ GLIBC_2.35 _ZGVcN4vv_hypot F
GLIBC_2.35 _ZGVcN8v_acosf F
GLIBC_2.35 _ZGVcN8v_asinf F
GLIBC_2.35 _ZGVcN8v_atanf F
GLIBC_2.35 _ZGVcN8v_cbrtf F
GLIBC_2.35 _ZGVcN8v_coshf F
GLIBC_2.35 _ZGVcN8v_exp10f F
GLIBC_2.35 _ZGVcN8v_exp2f F
@ -85,6 +89,7 @@ GLIBC_2.35 _ZGVcN8vv_hypotf F
GLIBC_2.35 _ZGVdN4v_acos F
GLIBC_2.35 _ZGVdN4v_asin F
GLIBC_2.35 _ZGVdN4v_atan F
GLIBC_2.35 _ZGVdN4v_cbrt F
GLIBC_2.35 _ZGVdN4v_cosh F
GLIBC_2.35 _ZGVdN4v_exp10 F
GLIBC_2.35 _ZGVdN4v_exp2 F
@ -94,6 +99,7 @@ GLIBC_2.35 _ZGVdN4vv_hypot F
GLIBC_2.35 _ZGVdN8v_acosf F
GLIBC_2.35 _ZGVdN8v_asinf F
GLIBC_2.35 _ZGVdN8v_atanf F
GLIBC_2.35 _ZGVdN8v_cbrtf F
GLIBC_2.35 _ZGVdN8v_coshf F
GLIBC_2.35 _ZGVdN8v_exp10f F
GLIBC_2.35 _ZGVdN8v_exp2f F
@ -103,6 +109,7 @@ GLIBC_2.35 _ZGVdN8vv_hypotf F
GLIBC_2.35 _ZGVeN16v_acosf F
GLIBC_2.35 _ZGVeN16v_asinf F
GLIBC_2.35 _ZGVeN16v_atanf F
GLIBC_2.35 _ZGVeN16v_cbrtf F
GLIBC_2.35 _ZGVeN16v_coshf F
GLIBC_2.35 _ZGVeN16v_exp10f F
GLIBC_2.35 _ZGVeN16v_exp2f F
@ -112,6 +119,7 @@ GLIBC_2.35 _ZGVeN16vv_hypotf F
GLIBC_2.35 _ZGVeN8v_acos F
GLIBC_2.35 _ZGVeN8v_asin F
GLIBC_2.35 _ZGVeN8v_atan F
GLIBC_2.35 _ZGVeN8v_cbrt F
GLIBC_2.35 _ZGVeN8v_cosh F
GLIBC_2.35 _ZGVeN8v_exp10 F
GLIBC_2.35 _ZGVeN8v_exp2 F

View File

@ -94,6 +94,10 @@
# define __DECL_SIMD_sinh __DECL_SIMD_x86_64
# undef __DECL_SIMD_sinhf
# define __DECL_SIMD_sinhf __DECL_SIMD_x86_64
/* Replace the empty default cbrt/cbrtf stubs with __DECL_SIMD_x86_64.  */
# undef __DECL_SIMD_cbrt
# define __DECL_SIMD_cbrt __DECL_SIMD_x86_64
# undef __DECL_SIMD_cbrtf
# define __DECL_SIMD_cbrtf __DECL_SIMD_x86_64
# endif
#endif

View File

@ -46,6 +46,8 @@
!GCC$ builtin (expm1f) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (sinh) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (sinhf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (cbrt) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (cbrtf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
!GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
@ -77,3 +79,5 @@
!GCC$ builtin (expm1f) attributes simd (notinbranch) if('x32')
!GCC$ builtin (sinh) attributes simd (notinbranch) if('x32')
!GCC$ builtin (sinhf) attributes simd (notinbranch) if('x32')
!GCC$ builtin (cbrt) attributes simd (notinbranch) if('x32')
!GCC$ builtin (cbrtf) attributes simd (notinbranch) if('x32')

View File

@ -25,6 +25,7 @@ libmvec-funcs = \
acos \
asin \
atan \
cbrt \
cos \
cosh \
exp \

View File

@ -17,6 +17,7 @@ libmvec {
_ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
_ZGVbN2v_asin; _ZGVcN4v_asin; _ZGVdN4v_asin; _ZGVeN8v_asin;
_ZGVbN2v_atan; _ZGVcN4v_atan; _ZGVdN4v_atan; _ZGVeN8v_atan;
_ZGVbN2v_cbrt; _ZGVcN4v_cbrt; _ZGVdN4v_cbrt; _ZGVeN8v_cbrt;
_ZGVbN2v_cosh; _ZGVcN4v_cosh; _ZGVdN4v_cosh; _ZGVeN8v_cosh;
_ZGVbN2v_exp10; _ZGVcN4v_exp10; _ZGVdN4v_exp10; _ZGVeN8v_exp10;
_ZGVbN2v_exp2; _ZGVcN4v_exp2; _ZGVdN4v_exp2; _ZGVeN8v_exp2;
@ -26,6 +27,7 @@ libmvec {
_ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
_ZGVbN4v_asinf; _ZGVcN8v_asinf; _ZGVdN8v_asinf; _ZGVeN16v_asinf;
_ZGVbN4v_atanf; _ZGVcN8v_atanf; _ZGVdN8v_atanf; _ZGVeN16v_atanf;
_ZGVbN4v_cbrtf; _ZGVcN8v_cbrtf; _ZGVdN8v_cbrtf; _ZGVeN16v_cbrtf;
_ZGVbN4v_coshf; _ZGVcN8v_coshf; _ZGVdN8v_coshf; _ZGVeN16v_coshf;
_ZGVbN4v_exp10f; _ZGVcN8v_exp10f; _ZGVdN8v_exp10f; _ZGVeN16v_exp10f;
_ZGVbN4v_exp2f; _ZGVcN8v_exp2f; _ZGVdN8v_exp2f; _ZGVeN16v_exp2f;

View File

@ -583,6 +583,26 @@ float: 1
float128: 1
ldouble: 1
Function: "cbrt_vlen16":
float: 1
Function: "cbrt_vlen2":
double: 1
Function: "cbrt_vlen4":
double: 1
float: 2
Function: "cbrt_vlen4_avx2":
double: 1
Function: "cbrt_vlen8":
double: 1
float: 2
Function: "cbrt_vlen8_avx2":
float: 2
Function: Real part of "ccos":
double: 1
float: 1

View File

@ -0,0 +1,20 @@
/* SSE2 version of vectorized cbrt, vector length is 2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the entry point before pulling in the generic core: every use
   of _ZGVbN2v_cbrt in the file included below is preprocessed to
   _ZGVbN2v_cbrt_sse2, so the code is emitted under the _sse2 name.  */
#define _ZGVbN2v_cbrt _ZGVbN2v_cbrt_sse2
#include "../svml_d_cbrt2_core.S"

View File

@ -0,0 +1,27 @@
/* Multiple versions of vectorized cbrt, vector length is 2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Public symbol this file provides.  NOTE(review): presumably the
   header included below derives REDIRECT_NAME and IFUNC_SELECTOR from
   SYMBOL_NAME - their definitions are outside this view; confirm.  */
#define SYMBOL_NAME _ZGVbN2v_cbrt
#include "ifunc-mathvec-sse4_1.h"
/* Define SYMBOL_NAME through libc_ifunc_redirected, passing the result
   of IFUNC_SELECTOR () as the implementation to use.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Shared builds only: declare __GI__ZGVbN2v_cbrt with hidden
   visibility.  NOTE(review): presumably an internal alias so calls
   from inside the library bind locally - confirm against
   __hidden_ver1.  */
__hidden_ver1 (_ZGVbN2v_cbrt, __GI__ZGVbN2v_cbrt, __redirect__ZGVbN2v_cbrt)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,467 @@
/* Function cbrt vectorized with SSE4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/*
* ALGORITHM DESCRIPTION:
*
* x=2^{3*k+j} * 1.b1 b2 ... b5 b6 ... b52
* Let r=(x*2^{-3k-j} - 1.b1 b2 ... b5 1)* rcp[b1 b2 ..b5],
* where rcp[b1 b2 .. b5]=1/(1.b1 b2 b3 b4 b5 1) in double precision
* cbrt(2^j * 1. b1 b2 .. b5 1) is approximated as T[j][b1..b5]
* (T stores the high 53 bits; this implementation keeps no separate
* table D of low order bits, so no 2^k*D term is added)
* Result=2^k*T+(2^k*T*r)*P
* where P=p1+p2*r+..+p7*r^6 (coefficients _dA1 .. _dA7)
*
*/
/* Byte offsets into the data table __svml_dcbrt_data_internal.
   _dRcp holds 32 quadwords (256 bytes) and _dCbrtHiLo holds 96
   quadwords (768 bytes); every later constant occupies one 16-byte
   slot.  Keep these in step with the table emitted at the end of this
   file.  */
#define _dRcp 0
#define _dCbrtHiLo 256
#define _dA7 1024
#define _dA6 1040
#define _dA5 1056
#define _dA4 1072
#define _dA3 1088
#define _dA2 1104
#define _dA1 1120
#define _dNeg65Div64 1136
#define _dSgnf6Mask 1152
#define _dNegOne 1168
#define _dMantissaMask 1184
#define _lExpHiMask 1200
#define _lExpLoMask 1216
#define _l1556 1232
#define _iRcpIndexMask 1248
#define _iAbsMask 1264
#define _iSignMask 1280
#define _iBias 1296
#define _iSub 1312
#define _iCmp 1328
#include <sysdep.h>
.text
.section .text.sse4,"ax",@progbits
/*
 * _ZGVbN2v_cbrt_sse4 - cube root of two packed doubles.
 *
 * In:     %xmm0 = { x[0], x[1] }
 * Out:    %xmm0 = { cbrt (x[0]), cbrt (x[1]) }
 * Stack:  72 bytes of locals; together with the 8-byte return address
 *         this keeps %rsp 16-byte aligned at the `call cbrt@PLT' on the
 *         slow path.  Used by the slow path only:
 *            0(%rsp)  saved %r14
 *            8(%rsp)  saved %r13
 *           16(%rsp)  saved %r12
 *           32(%rsp)  copy of the input vector
 *           48(%rsp)  copy of the result vector
 *
 * Fast path: table lookups indexed by the exponent and the top five
 * mantissa bits, followed by a polynomial with coefficients
 * _dA1 .. _dA7 in the reduced argument.
 *
 * Slow path: a lane is flagged when the high dword of its input, with
 * the sign bit cleared, is below 0x00100000 or at/above 0x7FF00000
 * (exponent field all zeros or all ones).  Flagged lanes are
 * recomputed one at a time by the scalar cbrt.
 */
ENTRY(_ZGVbN2v_cbrt_sse4)
subq $72, %rsp
cfi_def_cfa_offset(80)
/* Calculate CbrtIndex */
movaps %xmm0, %xmm10
psrlq $52, %xmm10
/* Load 1/(1+iRcpIndex/32+1/64) reciprocal table value */
lea __svml_dcbrt_data_internal(%rip), %r8
/* %xmm10 = biased exponent of each lane (sign bit masked off) */
pand _lExpLoMask+__svml_dcbrt_data_internal(%rip), %xmm10
movdqu _l1556+__svml_dcbrt_data_internal(%rip), %xmm9
/* exponent * 0x1556; shifted right by 14 below, which divides by
   roughly 3 (0x1556 / 2^14 is just over 1/3) */
pmuludq %xmm10, %xmm9
/* If the exponent field is zero - go to callout to process denormals */
movq _iAbsMask+__svml_dcbrt_data_internal(%rip), %xmm7
/* Calculate Rcp table index */
movq _iRcpIndexMask+__svml_dcbrt_data_internal(%rip), %xmm13
/* Get iX - high part of argument */
pshufd $221, %xmm0, %xmm4
/*
* Declarations
* Load constants
*/
movq _iSignMask+__svml_dcbrt_data_internal(%rip), %xmm1
/* %xmm7 = high dwords with the sign cleared (range check input) */
pand %xmm4, %xmm7
/* %xmm13 = top five mantissa bits, still at bits 19..15 */
pand %xmm4, %xmm13
/* Compute 2^k */
psrld $20, %xmm4
movq _iBias+__svml_dcbrt_data_internal(%rip), %xmm2
/* keep only the sign (bit 11 after the shift) and merge in the bias */
pand %xmm1, %xmm4
pshufd $136, %xmm9, %xmm15
por %xmm2, %xmm4
/* %xmm15 = (exponent * 0x1556) >> 14 */
psrld $14, %xmm15
/* %xmm13 = byte offset of the lane's _dRcp entry (index * 8) */
psrld $12, %xmm13
paddd %xmm15, %xmm4
pxor %xmm2, %xmm2
/* move sign and new exponent into the top 12 bits of a high dword */
pslld $20, %xmm4
movdqa %xmm15, %xmm11
movd %xmm13, %edx
/* %xmm11 = 2 * %xmm15, subtracted from the exponent further down */
paddd %xmm15, %xmm11
pshufd $1, %xmm13, %xmm8
/* %xmm2 = per lane { low dword 0, high dword sign|exponent }: the
   signed power of two that scales the table value */
punpckldq %xmm4, %xmm2
/*
* VAND( L, l2k, = l2k, lExpHiMask );
* Argument reduction Z
*/
movups _dMantissaMask+__svml_dcbrt_data_internal(%rip), %xmm1
movups _dSgnf6Mask+__svml_dcbrt_data_internal(%rip), %xmm4
andps %xmm0, %xmm1
movd %xmm8, %ecx
andps %xmm0, %xmm4
/* %xmm1 = -(1.mantissa) */
orps _dNegOne+__svml_dcbrt_data_internal(%rip), %xmm1
/* %xmm4 = -(1.b1..b5 1): top mantissa bits with the sixth forced to 1 */
orps _dNeg65Div64+__svml_dcbrt_data_internal(%rip), %xmm4
movslq %edx, %rdx
subpd %xmm4, %xmm1
movslq %ecx, %rcx
/* _dRcp sits at offset 0 of the table, so no displacement is needed */
movsd (%r8,%rdx), %xmm3
movq _iSub+__svml_dcbrt_data_internal(%rip), %xmm5
psubd %xmm5, %xmm7
movhpd (%r8,%rcx), %xmm3
/* %xmm3 = reduced argument: both factors are negative, so the signs
   cancel */
mulpd %xmm1, %xmm3
/* Polynomial */
movups _dA7+__svml_dcbrt_data_internal(%rip), %xmm5
mulpd %xmm3, %xmm5
addpd _dA6+__svml_dcbrt_data_internal(%rip), %xmm5
mulpd %xmm3, %xmm5
addpd _dA5+__svml_dcbrt_data_internal(%rip), %xmm5
mulpd %xmm3, %xmm5
addpd _dA4+__svml_dcbrt_data_internal(%rip), %xmm5
mulpd %xmm3, %xmm5
addpd _dA3+__svml_dcbrt_data_internal(%rip), %xmm5
/* exponent - 3 * ((exponent * 0x1556) >> 14), scaled by 256 bytes
   (32 table entries) and added to the _dRcp byte offset */
pshufd $136, %xmm10, %xmm12
psubd %xmm15, %xmm12
psubd %xmm11, %xmm12
mulpd %xmm3, %xmm5
pslld $8, %xmm12
paddd %xmm12, %xmm13
/* Load cbrt(2^j*(1+iRcpIndex/32+1/64)) Hi & Lo values */
movd %xmm13, %esi
pshufd $1, %xmm13, %xmm14
movq _iCmp+__svml_dcbrt_data_internal(%rip), %xmm6
movd %xmm14, %edi
/* signed compare: lanes that need the scalar fallback become all ones */
pcmpgtd %xmm6, %xmm7
movmskps %xmm7, %eax
addpd _dA2+__svml_dcbrt_data_internal(%rip), %xmm5
movslq %esi, %rsi
movslq %edi, %rdi
mulpd %xmm3, %xmm5
/* named table offset instead of the bare displacement 256 */
movsd _dCbrtHiLo(%r8,%rsi), %xmm6
movhpd _dCbrtHiLo(%r8,%rdi), %xmm6
/* THi*2^k, TLo*2^k */
mulpd %xmm2, %xmm6
addpd _dA1+__svml_dcbrt_data_internal(%rip), %xmm5
/* THi*2^k*Z */
mulpd %xmm6, %xmm3
/* Final reconstruction */
mulpd %xmm3, %xmm5
addpd %xmm5, %xmm6
/* only the two low mask bits correspond to real lanes; this also sets
   ZF for the branch below */
andl $3, %eax
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm6
/* Restore registers
* and exit the function
*/
L(EXIT):
movaps %xmm6, %xmm0
addq $72, %rsp
cfi_def_cfa_offset(8)
ret
cfi_def_cfa_offset(80)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
/* spill input and fast-path result so lanes can be patched in memory */
movups %xmm0, 32(%rsp)
movups %xmm6, 48(%rsp)
# LOE rbx rbp r12 r13 r14 r15 eax xmm6
xorl %edx, %edx
/* %r12d = lane counter, %r13d = lane mask, %r14 = lane index across the
   call; all three are callee-saved, so save them first */
movq %r12, 16(%rsp)
cfi_offset(12, -64)
movl %edx, %r12d
movq %r13, 8(%rsp)
cfi_offset(13, -72)
movl %eax, %r13d
movq %r14, (%rsp)
cfi_offset(14, -80)
# LOE rbx rbp r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
/* CF = bit %r12d of the lane mask */
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx rbp r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $2, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx rbp r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
/* reload the result vector, now including the patched lanes */
movups 48(%rsp), %xmm6
/* Go to exit */
jmp L(EXIT)
cfi_offset(12, -64)
cfi_offset(13, -72)
cfi_offset(14, -80)
# LOE rbx rbp r12 r13 r14 r15 xmm6
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
/* 32-bit move zero-extends, so %r14 is usable as a 64-bit index */
movl %r12d, %r14d
movsd 32(%rsp,%r14,8), %xmm0
call cbrt@PLT
# LOE rbx rbp r14 r15 r12d r13d xmm0
/* overwrite this lane of the spilled result */
movsd %xmm0, 48(%rsp,%r14,8)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx rbp r15 r12d r13d
END(_ZGVbN2v_cbrt_sse4)
.section .rodata, "a"
.align 16
/* Layout reference for the table that follows.  This C typedef is only
   seen by the assembler if __svml_dcbrt_data_internal_typedef is
   defined.  NOTE(review): nothing in the visible part of this file
   defines it, so it presumably serves as documentation only - confirm.
   Elements are 32-bit words, so an [N][2] member is N quadwords and a
   [4][1] member is four dwords.  */
#ifdef __svml_dcbrt_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(16)) VUINT32 _dRcp[32][2];
__declspec(align(16)) VUINT32 _dCbrtHiLo[96][2];
__declspec(align(16)) VUINT32 _dA7[2][2];
__declspec(align(16)) VUINT32 _dA6[2][2];
__declspec(align(16)) VUINT32 _dA5[2][2];
__declspec(align(16)) VUINT32 _dA4[2][2];
__declspec(align(16)) VUINT32 _dA3[2][2];
__declspec(align(16)) VUINT32 _dA2[2][2];
__declspec(align(16)) VUINT32 _dA1[2][2];
__declspec(align(16)) VUINT32 _dNeg65Div64[2][2];
__declspec(align(16)) VUINT32 _dSgnf6Mask[2][2];
__declspec(align(16)) VUINT32 _dNegOne[2][2];
__declspec(align(16)) VUINT32 _dMantissaMask[2][2];
__declspec(align(16)) VUINT32 _lExpHiMask[2][2];
__declspec(align(16)) VUINT32 _lExpLoMask[2][2];
__declspec(align(16)) VUINT32 _l1556[2][2];
__declspec(align(16)) VUINT32 _iRcpIndexMask[4][1];
__declspec(align(16)) VUINT32 _iAbsMask[4][1];
__declspec(align(16)) VUINT32 _iSignMask[4][1];
__declspec(align(16)) VUINT32 _iBias[4][1];
__declspec(align(16)) VUINT32 _iSub[4][1];
__declspec(align(16)) VUINT32 _iCmp[4][1];
} __svml_dcbrt_data_internal;
#endif
__svml_dcbrt_data_internal:
/*== _dRcp: 32 negated reciprocals -1/(1+i/32+1/64), i = 0..31 (note the sign bit in every entry) ==*/
.quad 0xBFEF81F81F81F820 /* (1/(1+0/32+1/64)) = -.984615 */
.quad 0xBFEE9131ABF0B767 /* (1/(1+1/32+1/64)) = -.955224 */
.quad 0xBFEDAE6076B981DB /* (1/(1+2/32+1/64)) = -.927536 */
.quad 0xBFECD85689039B0B /* (1/(1+3/32+1/64)) = -.901408 */
.quad 0xBFEC0E070381C0E0 /* (1/(1+4/32+1/64)) = -.876712 */
.quad 0xBFEB4E81B4E81B4F /* (1/(1+5/32+1/64)) = -.853333 */
.quad 0xBFEA98EF606A63BE /* (1/(1+6/32+1/64)) = -.831169 */
.quad 0xBFE9EC8E951033D9 /* (1/(1+7/32+1/64)) = -.810127 */
.quad 0xBFE948B0FCD6E9E0 /* (1/(1+8/32+1/64)) = -.790123 */
.quad 0xBFE8ACB90F6BF3AA /* (1/(1+9/32+1/64)) = -.771084 */
.quad 0xBFE8181818181818 /* (1/(1+10/32+1/64)) = -.752941 */
.quad 0xBFE78A4C8178A4C8 /* (1/(1+11/32+1/64)) = -.735632 */
.quad 0xBFE702E05C0B8170 /* (1/(1+12/32+1/64)) = -.719101 */
.quad 0xBFE6816816816817 /* (1/(1+13/32+1/64)) = -.703297 */
.quad 0xBFE6058160581606 /* (1/(1+14/32+1/64)) = -.688172 */
.quad 0xBFE58ED2308158ED /* (1/(1+15/32+1/64)) = -.673684 */
.quad 0xBFE51D07EAE2F815 /* (1/(1+16/32+1/64)) = -.659794 */
.quad 0xBFE4AFD6A052BF5B /* (1/(1+17/32+1/64)) = -.646465 */
.quad 0xBFE446F86562D9FB /* (1/(1+18/32+1/64)) = -.633663 */
.quad 0xBFE3E22CBCE4A902 /* (1/(1+19/32+1/64)) = -.621359 */
.quad 0xBFE3813813813814 /* (1/(1+20/32+1/64)) = -.609524 */
.quad 0xBFE323E34A2B10BF /* (1/(1+21/32+1/64)) = -.598131 */
.quad 0xBFE2C9FB4D812CA0 /* (1/(1+22/32+1/64)) = -.587156 */
.quad 0xBFE27350B8812735 /* (1/(1+23/32+1/64)) = -.576577 */
.quad 0xBFE21FB78121FB78 /* (1/(1+24/32+1/64)) = -.566372 */
.quad 0xBFE1CF06ADA2811D /* (1/(1+25/32+1/64)) = -.556522 */
.quad 0xBFE1811811811812 /* (1/(1+26/32+1/64)) = -.547009 */
.quad 0xBFE135C81135C811 /* (1/(1+27/32+1/64)) = -.537815 */
.quad 0xBFE0ECF56BE69C90 /* (1/(1+28/32+1/64)) = -.528926 */
.quad 0xBFE0A6810A6810A7 /* (1/(1+29/32+1/64)) = -.520325 */
.quad 0xBFE0624DD2F1A9FC /* (1/(1+30/32+1/64)) = -.512 */
.quad 0xBFE0204081020408 /* (1/(1+31/32+1/64)) = -.503937 */
/*== _dCbrtHiLo: high parts only; three groups (2^0, 2^1, 2^2) of 32 entries each ==*/
.align 16
.quad 0x3FF01539221D4C97 /* HI((2^0*(1+0/32+1/64))^(1/3)) = 1.005181 */
.quad 0x3FF03F06771A2E33 /* HI((2^0*(1+1/32+1/64))^(1/3)) = 1.015387 */
.quad 0x3FF06800E629D671 /* HI((2^0*(1+2/32+1/64))^(1/3)) = 1.025391 */
.quad 0x3FF090328731DEB2 /* HI((2^0*(1+3/32+1/64))^(1/3)) = 1.035204 */
.quad 0x3FF0B7A4B1BD64AC /* HI((2^0*(1+4/32+1/64))^(1/3)) = 1.044835 */
.quad 0x3FF0DE601024FB87 /* HI((2^0*(1+5/32+1/64))^(1/3)) = 1.054291 */
.quad 0x3FF1046CB0597000 /* HI((2^0*(1+6/32+1/64))^(1/3)) = 1.06358 */
.quad 0x3FF129D212A9BA9B /* HI((2^0*(1+7/32+1/64))^(1/3)) = 1.07271 */
.quad 0x3FF14E9736CDAF38 /* HI((2^0*(1+8/32+1/64))^(1/3)) = 1.081687 */
.quad 0x3FF172C2A772F507 /* HI((2^0*(1+9/32+1/64))^(1/3)) = 1.090518 */
.quad 0x3FF1965A848001D3 /* HI((2^0*(1+10/32+1/64))^(1/3)) = 1.099207 */
.quad 0x3FF1B9648C38C55D /* HI((2^0*(1+11/32+1/64))^(1/3)) = 1.107762 */
.quad 0x3FF1DBE6236A0C45 /* HI((2^0*(1+12/32+1/64))^(1/3)) = 1.116186 */
.quad 0x3FF1FDE45CBB1F9F /* HI((2^0*(1+13/32+1/64))^(1/3)) = 1.124485 */
.quad 0x3FF21F63FF409042 /* HI((2^0*(1+14/32+1/64))^(1/3)) = 1.132664 */
.quad 0x3FF240698C6746E5 /* HI((2^0*(1+15/32+1/64))^(1/3)) = 1.140726 */
.quad 0x3FF260F9454BB99B /* HI((2^0*(1+16/32+1/64))^(1/3)) = 1.148675 */
.quad 0x3FF281172F8E7073 /* HI((2^0*(1+17/32+1/64))^(1/3)) = 1.156516 */
.quad 0x3FF2A0C719B4B6D0 /* HI((2^0*(1+18/32+1/64))^(1/3)) = 1.164252 */
.quad 0x3FF2C00C9F2263EC /* HI((2^0*(1+19/32+1/64))^(1/3)) = 1.171887 */
.quad 0x3FF2DEEB2BB7FB78 /* HI((2^0*(1+20/32+1/64))^(1/3)) = 1.179423 */
.quad 0x3FF2FD65FF1EFBBC /* HI((2^0*(1+21/32+1/64))^(1/3)) = 1.186865 */
.quad 0x3FF31B802FCCF6A2 /* HI((2^0*(1+22/32+1/64))^(1/3)) = 1.194214 */
.quad 0x3FF3393CADC50708 /* HI((2^0*(1+23/32+1/64))^(1/3)) = 1.201474 */
.quad 0x3FF3569E451E4C2A /* HI((2^0*(1+24/32+1/64))^(1/3)) = 1.208647 */
.quad 0x3FF373A7A0554CDE /* HI((2^0*(1+25/32+1/64))^(1/3)) = 1.215736 */
.quad 0x3FF3905B4A6D76CE /* HI((2^0*(1+26/32+1/64))^(1/3)) = 1.222743 */
.quad 0x3FF3ACBBB0E756B6 /* HI((2^0*(1+27/32+1/64))^(1/3)) = 1.229671 */
.quad 0x3FF3C8CB258FA340 /* HI((2^0*(1+28/32+1/64))^(1/3)) = 1.236522 */
.quad 0x3FF3E48BE02AC0CE /* HI((2^0*(1+29/32+1/64))^(1/3)) = 1.243297 */
.quad 0x3FF4000000000000 /* HI((2^0*(1+30/32+1/64))^(1/3)) = 1.25 */
.quad 0x3FF41B298D47800E /* HI((2^0*(1+31/32+1/64))^(1/3)) = 1.256631 */
.quad 0x3FF443604B34D9B2 /* HI((2^1*(1+0/32+1/64))^(1/3)) = 1.266449 */
.quad 0x3FF4780B20906571 /* HI((2^1*(1+1/32+1/64))^(1/3)) = 1.279307 */
.quad 0x3FF4ABAC3EE06706 /* HI((2^1*(1+2/32+1/64))^(1/3)) = 1.291912 */
.quad 0x3FF4DE505DA66B8D /* HI((2^1*(1+3/32+1/64))^(1/3)) = 1.304276 */
.quad 0x3FF51003420A5C07 /* HI((2^1*(1+4/32+1/64))^(1/3)) = 1.316409 */
.quad 0x3FF540CFD6FD11C1 /* HI((2^1*(1+5/32+1/64))^(1/3)) = 1.328323 */
.quad 0x3FF570C04260716B /* HI((2^1*(1+6/32+1/64))^(1/3)) = 1.340027 */
.quad 0x3FF59FDDF7A45F38 /* HI((2^1*(1+7/32+1/64))^(1/3)) = 1.35153 */
.quad 0x3FF5CE31C83539DF /* HI((2^1*(1+8/32+1/64))^(1/3)) = 1.36284 */
.quad 0x3FF5FBC3F20966A4 /* HI((2^1*(1+9/32+1/64))^(1/3)) = 1.373966 */
.quad 0x3FF6289C2C8F1B70 /* HI((2^1*(1+10/32+1/64))^(1/3)) = 1.384915 */
.quad 0x3FF654C1B4316DCF /* HI((2^1*(1+11/32+1/64))^(1/3)) = 1.395693 */
.quad 0x3FF6803B54A34E44 /* HI((2^1*(1+12/32+1/64))^(1/3)) = 1.406307 */
.quad 0x3FF6AB0F72182659 /* HI((2^1*(1+13/32+1/64))^(1/3)) = 1.416763 */
.quad 0x3FF6D544118C08BC /* HI((2^1*(1+14/32+1/64))^(1/3)) = 1.427067 */
.quad 0x3FF6FEDEE0388D4A /* HI((2^1*(1+15/32+1/64))^(1/3)) = 1.437224 */
.quad 0x3FF727E53A4F645E /* HI((2^1*(1+16/32+1/64))^(1/3)) = 1.44724 */
.quad 0x3FF7505C31104114 /* HI((2^1*(1+17/32+1/64))^(1/3)) = 1.457119 */
.quad 0x3FF77848904CD549 /* HI((2^1*(1+18/32+1/64))^(1/3)) = 1.466866 */
.quad 0x3FF79FAEE36B2534 /* HI((2^1*(1+19/32+1/64))^(1/3)) = 1.476485 */
.quad 0x3FF7C69379F4605B /* HI((2^1*(1+20/32+1/64))^(1/3)) = 1.48598 */
.quad 0x3FF7ECFA6BBCA391 /* HI((2^1*(1+21/32+1/64))^(1/3)) = 1.495356 */
.quad 0x3FF812E79CAE7EB9 /* HI((2^1*(1+22/32+1/64))^(1/3)) = 1.504615 */
.quad 0x3FF8385EC043C71D /* HI((2^1*(1+23/32+1/64))^(1/3)) = 1.513762 */
.quad 0x3FF85D635CB41B9D /* HI((2^1*(1+24/32+1/64))^(1/3)) = 1.5228 */
.quad 0x3FF881F8CDE083DB /* HI((2^1*(1+25/32+1/64))^(1/3)) = 1.531731 */
.quad 0x3FF8A6224802B8A8 /* HI((2^1*(1+26/32+1/64))^(1/3)) = 1.54056 */
.quad 0x3FF8C9E2DA25E5E4 /* HI((2^1*(1+27/32+1/64))^(1/3)) = 1.549289 */
.quad 0x3FF8ED3D706E1010 /* HI((2^1*(1+28/32+1/64))^(1/3)) = 1.55792 */
.quad 0x3FF91034D632B6DF /* HI((2^1*(1+29/32+1/64))^(1/3)) = 1.566457 */
.quad 0x3FF932CBB7F0CF2D /* HI((2^1*(1+30/32+1/64))^(1/3)) = 1.574901 */
.quad 0x3FF95504A517BF3A /* HI((2^1*(1+31/32+1/64))^(1/3)) = 1.583256 */
.quad 0x3FF987AF34F8BB19 /* HI((2^2*(1+0/32+1/64))^(1/3)) = 1.595626 */
.quad 0x3FF9CA0A8337B317 /* HI((2^2*(1+1/32+1/64))^(1/3)) = 1.611826 */
.quad 0x3FFA0B1709CC13D5 /* HI((2^2*(1+2/32+1/64))^(1/3)) = 1.627708 */
.quad 0x3FFA4AE4CE6419ED /* HI((2^2*(1+3/32+1/64))^(1/3)) = 1.643285 */
.quad 0x3FFA8982A5567031 /* HI((2^2*(1+4/32+1/64))^(1/3)) = 1.658572 */
.quad 0x3FFAC6FE500AB570 /* HI((2^2*(1+5/32+1/64))^(1/3)) = 1.673582 */
.quad 0x3FFB036497A15A17 /* HI((2^2*(1+6/32+1/64))^(1/3)) = 1.688328 */
.quad 0x3FFB3EC164671755 /* HI((2^2*(1+7/32+1/64))^(1/3)) = 1.702821 */
.quad 0x3FFB791FD288C46F /* HI((2^2*(1+8/32+1/64))^(1/3)) = 1.717071 */
.quad 0x3FFBB28A44693BE4 /* HI((2^2*(1+9/32+1/64))^(1/3)) = 1.731089 */
.quad 0x3FFBEB0A72EB6E31 /* HI((2^2*(1+10/32+1/64))^(1/3)) = 1.744883 */
.quad 0x3FFC22A97BF5F697 /* HI((2^2*(1+11/32+1/64))^(1/3)) = 1.758462 */
.quad 0x3FFC596FEF6AF983 /* HI((2^2*(1+12/32+1/64))^(1/3)) = 1.771835 */
.quad 0x3FFC8F65DAC655A3 /* HI((2^2*(1+13/32+1/64))^(1/3)) = 1.785009 */
.quad 0x3FFCC492D38CE8D9 /* HI((2^2*(1+14/32+1/64))^(1/3)) = 1.797992 */
.quad 0x3FFCF8FE00B19367 /* HI((2^2*(1+15/32+1/64))^(1/3)) = 1.810789 */
.quad 0x3FFD2CAE230F8709 /* HI((2^2*(1+16/32+1/64))^(1/3)) = 1.823408 */
.quad 0x3FFD5FA99D15208F /* HI((2^2*(1+17/32+1/64))^(1/3)) = 1.835855 */
.quad 0x3FFD91F679B6E505 /* HI((2^2*(1+18/32+1/64))^(1/3)) = 1.848135 */
.quad 0x3FFDC39A72BF2302 /* HI((2^2*(1+19/32+1/64))^(1/3)) = 1.860255 */
.quad 0x3FFDF49AF68C1570 /* HI((2^2*(1+20/32+1/64))^(1/3)) = 1.872218 */
.quad 0x3FFE24FD2D4C23B8 /* HI((2^2*(1+21/32+1/64))^(1/3)) = 1.884031 */
.quad 0x3FFE54C5FDC5EC73 /* HI((2^2*(1+22/32+1/64))^(1/3)) = 1.895697 */
.quad 0x3FFE83FA11B81DBB /* HI((2^2*(1+23/32+1/64))^(1/3)) = 1.907221 */
.quad 0x3FFEB29DD9DBAF25 /* HI((2^2*(1+24/32+1/64))^(1/3)) = 1.918608 */
.quad 0x3FFEE0B59191D374 /* HI((2^2*(1+25/32+1/64))^(1/3)) = 1.929861 */
.quad 0x3FFF0E454245E4BF /* HI((2^2*(1+26/32+1/64))^(1/3)) = 1.940984 */
.quad 0x3FFF3B50C68A9DD3 /* HI((2^2*(1+27/32+1/64))^(1/3)) = 1.951981 */
.quad 0x3FFF67DBCCF922DC /* HI((2^2*(1+28/32+1/64))^(1/3)) = 1.962856 */
.quad 0x3FFF93E9DAD7A4A6 /* HI((2^2*(1+29/32+1/64))^(1/3)) = 1.973612 */
.quad 0x3FFFBF7E4E8CC9CB /* HI((2^2*(1+30/32+1/64))^(1/3)) = 1.984251 */
.quad 0x3FFFEA9C61E47CD3 /* HI((2^2*(1+31/32+1/64))^(1/3)) = 1.994778 */
.align 16
.quad 0x3F93750AD588F115, 0x3F93750AD588F115 /* _dA7 */
.align 16
.quad 0xBF98090D6221A247, 0xBF98090D6221A247 /* _dA6 */
.align 16
.quad 0x3F9EE7113506AC12, 0x3F9EE7113506AC12 /* _dA5 */
.align 16
.quad 0xBFA511E8D2B3183B, 0xBFA511E8D2B3183B /* _dA4 */
.align 16
.quad 0x3FAF9ADD3C0CA458, 0x3FAF9ADD3C0CA458 /* _dA3 */
.align 16
.quad 0xBFBC71C71C71C71C, 0xBFBC71C71C71C71C /* _dA2 */
.align 16
.quad 0x3FD5555555555555, 0x3FD5555555555555 /* _dA1 */
.align 16
.quad 0xBFF0400000000000, 0xBFF0400000000000 /* _dNeg65Div64 */
.align 16
.quad 0x000FC00000000000, 0x000FC00000000000 /* _dSgnf6Mask */
.align 16
.quad 0xBFF0000000000000, 0xBFF0000000000000 /* _dNegOne */
.align 16
.quad 0x000FFFFFFFFFFFFF, 0x000FFFFFFFFFFFFF /* _dMantissaMask */
.align 16
.quad 0xFFF0000000000000, 0xFFF0000000000000 /* _lExpHiMask */
.align 16
.quad 0x00000000000007FF, 0x00000000000007FF /* _lExpLoMask */
.align 16
.quad 0x0000000000001556, 0x0000000000001556 /* _l1556 */
.align 16
.long 0x000F8000, 0x000F8000, 0x000F8000, 0x000F8000 /* _iRcpIndexMask */
.align 16
.long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF /* _iAbsMask */
.align 16
.long 0x00000800, 0x00000800, 0x00000800, 0x00000800 /* _iSignMask */
.align 16
.long 0x000002AA, 0x000002AA, 0x000002AA, 0x000002AA /* _iBias */
.align 16
.long 0x80100000, 0x80100000, 0x80100000, 0x80100000 /* _iSub */
.align 16
.long 0xffdfffff, 0xffdfffff, 0xffdfffff, 0xffdfffff /* _iCmp */
.align 16
.type __svml_dcbrt_data_internal,@object
.size __svml_dcbrt_data_internal,.-__svml_dcbrt_data_internal

View File

@ -0,0 +1,20 @@
/* SSE version of vectorized cbrt, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the entry point before pulling in the generic core: every use
   of _ZGVdN4v_cbrt in the file included below is preprocessed to
   _ZGVdN4v_cbrt_sse_wrapper, so the code is emitted under that name.  */
#define _ZGVdN4v_cbrt _ZGVdN4v_cbrt_sse_wrapper
#include "../svml_d_cbrt4_core.S"

View File

@ -0,0 +1,27 @@
/* Multiple versions of vectorized cbrt, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Public symbol this file provides.  NOTE(review): presumably the
   header included below derives REDIRECT_NAME and IFUNC_SELECTOR from
   SYMBOL_NAME - their definitions are outside this view; confirm.  */
#define SYMBOL_NAME _ZGVdN4v_cbrt
#include "ifunc-mathvec-avx2.h"
/* Define SYMBOL_NAME through libc_ifunc_redirected, passing the result
   of IFUNC_SELECTOR () as the implementation to use.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Shared builds only: declare __GI__ZGVdN4v_cbrt with hidden
   visibility.  NOTE(review): presumably an internal alias so calls
   from inside the library bind locally - confirm against
   __hidden_ver1.  */
__hidden_ver1 (_ZGVdN4v_cbrt, __GI__ZGVdN4v_cbrt, __redirect__ZGVdN4v_cbrt)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,505 @@
/* Function cbrt vectorized with AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/*
* ALGORITHM DESCRIPTION:
*
* x=2^{3*k+j} * 1.b1 b2 ... b5 b6 ... b52
* Let r=(x*2^{-3k-j} - 1.b1 b2 ... b5 1)* rcp[b1 b2 ..b5],
* where rcp[b1 b2 .. b5]=1/(1.b1 b2 b3 b4 b5 1) in double precision
* cbrt(2^j * 1. b1 b2 .. b5 1) is approximated as T[j][b1..b5]+D[j][b1..b5]
* (T stores the high 53 bits, D stores the low order bits)
* Result=2^k*T+(2^k*T*r)*P+2^k*D
* where P=p1+p2*r+..+p8*r^7
*
*/
/* Byte offsets into the data table __svml_dcbrt_data_internal.
   The two lookup tables come first (_dRcp at 0, _dCbrtHiLo at 256);
   every constant from _dA7 onwards is spaced 32 bytes apart.
   NOTE(review): presumably one 32-byte (YMM-wide) slot each - the
   table itself is outside this view; confirm the layout matches.  */
#define _dRcp 0
#define _dCbrtHiLo 256
#define _dA7 1024
#define _dA6 1056
#define _dA5 1088
#define _dA4 1120
#define _dA3 1152
#define _dA2 1184
#define _dA1 1216
#define _dNeg65Div64 1248
#define _dSgnf6Mask 1280
#define _dNegOne 1312
#define _dMantissaMask 1344
#define _lExpHiMask 1376
#define _lExpLoMask 1408
#define _l1556 1440
#define _iRcpIndexMask 1472
#define _iAbsMask 1504
#define _iSignMask 1536
#define _iBias 1568
#define _iSub 1600
#define _iCmp 1632
#include <sysdep.h>
.text
.section .text.avx2,"ax",@progbits
/* _ZGVdN4v_cbrt_avx2: cube root of four packed doubles.
 *
 * In:     ymm0 = x (4 x double)
 * Out:    ymm0 = per-lane result
 * Stack:  rbp frame; rsp is realigned to 32 bytes and 96 bytes are
 *         reserved.  On the slow path 0/8/16(%rsp) hold r14/r13/r12,
 *         32(%rsp) holds a copy of x and 64(%rsp) the result vector.
 * Flow:   the vector path computes all four lanes unconditionally.
 *         edx collects one bit per lane whose exponent field is all
 *         zeros or all ones; when edx != 0 those lanes are recomputed
 *         one at a time by calling the scalar cbrt.
 */
ENTRY(_ZGVdN4v_cbrt_avx2)
        pushq %rbp
        cfi_def_cfa_offset(16)
        movq %rsp, %rbp
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
        andq $-32, %rsp
        subq $96, %rsp
        /* rax = table base; it is the base register for the two indexed
           lookups (_dRcp and _dCbrtHiLo) below.  */
        lea __svml_dcbrt_data_internal(%rip), %rax
        /* ymm5 = x, kept intact for the special-value path.  */
        vmovapd %ymm0, %ymm5
        /*
         * Get iX - high dword of every lane, packed into xmm4
         */
        vextractf128 $1, %ymm5, %xmm6
        /* ymm15 = x >> 52: sign and biased exponent of every lane */
        vpsrlq $52, %ymm5, %ymm15
        vshufps $221, %xmm6, %xmm5, %xmm4
        /* Calculate Rcp table index: xmm3 = (b1..b5) * 8, i.e. a byte offset */
        vandps _iRcpIndexMask+__svml_dcbrt_data_internal(%rip), %xmm4, %xmm10
        vpsrld $12, %xmm10, %xmm3
        vmovd %xmm3, %ecx
        /* Special-input test: (|iX| - _iSub) > _iCmp (signed) holds exactly
           when the exponent field is all zeros (zero/denormal) or all ones
           (Inf/NaN); such lanes are redone by the scalar callout.  */
        vandps _iAbsMask+__svml_dcbrt_data_internal(%rip), %xmm4, %xmm7
        /* Compute 2^k: xmm4 = iX >> 20 (sign + biased exponent) */
        vpsrld $20, %xmm4, %xmm4
        vpsubd _iSub+__svml_dcbrt_data_internal(%rip), %xmm7, %xmm8
        /* ymm0 = biased exponent e; ymm6 = e * 0x1556 (e/3 after >> 14) */
        vandps _lExpLoMask+__svml_dcbrt_data_internal(%rip), %ymm15, %ymm0
        vpmuludq _l1556+__svml_dcbrt_data_internal(%rip), %ymm0, %ymm6
        vpextrd $2, %xmm3, %edi
        movslq %ecx, %rcx
        vpextrd $1, %xmm3, %esi
        movslq %edi, %rdi
        vpextrd $3, %xmm3, %r8d
        movslq %esi, %rsi
        movslq %r8d, %r8
        vpcmpgtd _iCmp+__svml_dcbrt_data_internal(%rip), %xmm8, %xmm9
        /* Gather the four reciprocals (stored negated in the table) */
        vmovsd _dRcp(%rax,%rcx), %xmm11
        /* edx = 4-bit mask of lanes that need the scalar callout */
        vmovmskps %xmm9, %edx
        vmovsd _dRcp(%rax,%rdi), %xmm13
        vmovhpd _dRcp(%rax,%rsi), %xmm11, %xmm12
        vmovhpd _dRcp(%rax,%r8), %xmm13, %xmm14
        /* Pack the low dwords of the four products into xmm8 */
        vextractf128 $1, %ymm6, %xmm7
        vshufps $136, %xmm7, %xmm6, %xmm8
        vmovups __VUNPACK_ODD_ind1.613.0.1(%rip), %ymm7
        /* Pack the four biased exponents into xmm9 */
        vextractf128 $1, %ymm0, %xmm1
        vshufps $136, %xmm1, %xmm0, %xmm9
        /* xmm1 = k = e / 3; xmm10 = e - k; xmm11 = 2 * k */
        vpsrld $14, %xmm8, %xmm1
        vpsubd %xmm1, %xmm9, %xmm10
        vpaddd %xmm1, %xmm1, %xmm11
        /*
         * Argument reduction Z
         * ymm9 = mantissa bits of x
         */
        vandpd _dMantissaMask+__svml_dcbrt_data_internal(%rip), %ymm5, %ymm9
        vinsertf128 $1, %xmm14, %ymm12, %ymm2
        /* xmm12 = j = e - 3 * k; xmm15 = j * 256 + Rcp byte offset */
        vpsubd %xmm11, %xmm10, %xmm12
        vpslld $8, %xmm12, %xmm13
        vpaddd %xmm13, %xmm3, %xmm15
        /* Load cbrt(2^j*(1+iRcpIndex/32+1/64)); one double (the Hi value)
           is loaded per lane */
        vmovd %xmm15, %r9d
        vpextrd $2, %xmm15, %r11d
        movslq %r9d, %r9
        vpextrd $1, %xmm15, %r10d
        movslq %r11d, %r11
        vpextrd $3, %xmm15, %ecx
        movslq %r10d, %r10
        movslq %ecx, %rcx
        vmovsd _dCbrtHiLo(%rax,%r9), %xmm3
        vmovsd _dCbrtHiLo(%rax,%r11), %xmm0
        vandpd _dSgnf6Mask+__svml_dcbrt_data_internal(%rip), %ymm5, %ymm10
        vmovhpd _dCbrtHiLo(%rax,%r10), %xmm3, %xmm14
        vmovhpd _dCbrtHiLo(%rax,%rcx), %xmm0, %xmm3
        /* ymm11 = -(1.b1..b52); ymm12 = -(1.b1..b5 1); their difference
           times the negated reciprocal gives the reduced argument in ymm2 */
        vorpd _dNegOne+__svml_dcbrt_data_internal(%rip), %ymm9, %ymm11
        vorpd _dNeg65Div64+__svml_dcbrt_data_internal(%rip), %ymm10, %ymm12
        vsubpd %ymm12, %ymm11, %ymm13
        vmulpd %ymm13, %ymm2, %ymm2
        vinsertf128 $1, %xmm3, %ymm14, %ymm0
        /* Build the high dword of the scale factor:
           ((sign bit | _iBias) + k) << 20 */
        vpand _iSignMask+__svml_dcbrt_data_internal(%rip), %xmm4, %xmm3
        vpor _iBias+__svml_dcbrt_data_internal(%rip), %xmm3, %xmm4
        vpaddd %xmm1, %xmm4, %xmm1
        vpslld $20, %xmm1, %xmm6
        /* Polynomial: Horner evaluation of A1 + A2*Z + ... + A7*Z^6 */
        vmovupd _dA7+__svml_dcbrt_data_internal(%rip), %ymm1
        vfmadd213pd _dA6+__svml_dcbrt_data_internal(%rip), %ymm2, %ymm1
        vfmadd213pd _dA5+__svml_dcbrt_data_internal(%rip), %ymm2, %ymm1
        vfmadd213pd _dA4+__svml_dcbrt_data_internal(%rip), %ymm2, %ymm1
        vfmadd213pd _dA3+__svml_dcbrt_data_internal(%rip), %ymm2, %ymm1
        vfmadd213pd _dA2+__svml_dcbrt_data_internal(%rip), %ymm2, %ymm1
        vfmadd213pd _dA1+__svml_dcbrt_data_internal(%rip), %ymm2, %ymm1
        /* Move each scale dword into the high half of its qword and
           clear the low half */
        vpermps %ymm6, %ymm7, %ymm8
        vandps __VUNPACK_ODD_mask.613.0.1(%rip), %ymm8, %ymm14
        /* THi*2^k */
        vmulpd %ymm14, %ymm0, %ymm0
        /* THi*2^k*Z */
        vmulpd %ymm0, %ymm2, %ymm2
        /* Final reconstruction */
        vmulpd %ymm2, %ymm1, %ymm3
        vaddpd %ymm3, %ymm0, %ymm0
        testl %edx, %edx
        /* Go to special inputs processing branch */
        jne L(SPECIAL_VALUES_BRANCH)
        # LOE rbx r12 r13 r14 r15 edx ymm0 ymm5
        /* Restore registers
         * and exit the function
         */
L(EXIT):
        movq %rbp, %rsp
        popq %rbp
        cfi_def_cfa(7, 8)
        cfi_restore(6)
        ret
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
        /* Branch to process
         * special inputs
         */
L(SPECIAL_VALUES_BRANCH):
        /* Spill the input and the vector result so that single lanes can
           be read and overwritten through memory.  */
        vmovupd %ymm5, 32(%rsp)
        vmovupd %ymm0, 64(%rsp)
        # LOE rbx r12 r13 r14 r15 edx ymm0
        xorl %eax, %eax
        # LOE rbx r12 r13 r14 r15 eax edx
        vzeroupper
        /* r12d = lane counter, r13d = lane mask, r14 = lane index during
           the call; save their previous contents first.  */
        movq %r12, 16(%rsp)
        /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
        movl %eax, %r12d
        movq %r13, 8(%rsp)
        /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
        movl %edx, %r13d
        movq %r14, (%rsp)
        /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
        # LOE rbx r15 r12d r13d
        /* Range mask
         * bits check
         */
L(RANGEMASK_CHECK):
        btl %r12d, %r13d
        /* Call scalar math function */
        jc L(SCALAR_MATH_CALL)
        # LOE rbx r15 r12d r13d
        /* Special inputs
         * processing loop
         */
L(SPECIAL_VALUES_LOOP):
        incl %r12d
        cmpl $4, %r12d
        /* Check bits in range mask */
        jl L(RANGEMASK_CHECK)
        # LOE rbx r15 r12d r13d
        movq 16(%rsp), %r12
        cfi_restore(12)
        movq 8(%rsp), %r13
        cfi_restore(13)
        movq (%rsp), %r14
        cfi_restore(14)
        /* Reload the result vector, now holding the patched lanes */
        vmovupd 64(%rsp), %ymm0
        /* Go to exit */
        jmp L(EXIT)
        /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
        /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
        /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
        # LOE rbx r12 r13 r14 r15 ymm0
        /* Scalar math function call
         * to process special input
         */
L(SCALAR_MATH_CALL):
        /* The 32-bit move zero-extends the lane number so that it can be
           used as a 64-bit index register.  */
        movl %r12d, %r14d
        movsd 32(%rsp,%r14,8), %xmm0
        call cbrt@PLT
        # LOE rbx r14 r15 r12d r13d xmm0
        movsd %xmm0, 64(%rsp,%r14,8)
        /* Process special inputs in loop */
        jmp L(SPECIAL_VALUES_LOOP)
        # LOE rbx r15 r12d r13d
END(_ZGVdN4v_cbrt_avx2)
/* Helper constants for the vpermps/vandps pair in _ZGVdN4v_cbrt_avx2.
   The index vector sends source dword i to destination dword 2*i+1;
   the mask then keeps only the odd dwords, so each 32-bit value ends
   up as the high half of a qword whose low half is zero.  */
.section .rodata, "a"
.align 32
__VUNPACK_ODD_ind1.613.0.1:
.long 0, 0, 0, 1, 0, 2, 0, 3
.align 32
__VUNPACK_ODD_mask.613.0.1:
.long 0, -1, 0, -1, 0, -1, 0, -1
.section .rodata, "a"
.align 32
/* Layout description of the table that follows, written as a C struct.
   It is only seen by the compiler/assembler if
   __svml_dcbrt_data_internal_typedef is defined, which nothing in this
   file does, so it serves as documentation.  A field declared [N][2]
   is N 64-bit entries expressed as pairs of VUINT32; a field declared
   [8][1] is eight 32-bit entries.  */
#ifdef __svml_dcbrt_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(32)) VUINT32 _dRcp[32][2];
__declspec(align(32)) VUINT32 _dCbrtHiLo[96][2];
__declspec(align(32)) VUINT32 _dA7[4][2];
__declspec(align(32)) VUINT32 _dA6[4][2];
__declspec(align(32)) VUINT32 _dA5[4][2];
__declspec(align(32)) VUINT32 _dA4[4][2];
__declspec(align(32)) VUINT32 _dA3[4][2];
__declspec(align(32)) VUINT32 _dA2[4][2];
__declspec(align(32)) VUINT32 _dA1[4][2];
__declspec(align(32)) VUINT32 _dNeg65Div64[4][2];
__declspec(align(32)) VUINT32 _dSgnf6Mask[4][2];
__declspec(align(32)) VUINT32 _dNegOne[4][2];
__declspec(align(32)) VUINT32 _dMantissaMask[4][2];
__declspec(align(32)) VUINT32 _lExpHiMask[4][2];
__declspec(align(32)) VUINT32 _lExpLoMask[4][2];
__declspec(align(32)) VUINT32 _l1556[4][2];
__declspec(align(32)) VUINT32 _iRcpIndexMask[8][1];
__declspec(align(32)) VUINT32 _iAbsMask[8][1];
__declspec(align(32)) VUINT32 _iSignMask[8][1];
__declspec(align(32)) VUINT32 _iBias[8][1];
__declspec(align(32)) VUINT32 _iSub[8][1];
__declspec(align(32)) VUINT32 _iCmp[8][1];
} __svml_dcbrt_data_internal;
#endif
/* Constant table for _ZGVdN4v_cbrt_avx2.  Field order and sizes must
   stay in sync with the offset macros defined ahead of the function:
   32 reciprocals (256 bytes), 96 cube-root values (768 bytes), then
   one 32-byte broadcast constant per remaining field.  */
__svml_dcbrt_data_internal:
/*== _dRcp ==*/
/* Entry i is the negated reciprocal of 1+i/32+1/64.  */
.quad 0xBFEF81F81F81F820 /* (1/(1+0/32+1/64)) = -.984615 */
.quad 0xBFEE9131ABF0B767 /* (1/(1+1/32+1/64)) = -.955224 */
.quad 0xBFEDAE6076B981DB /* (1/(1+2/32+1/64)) = -.927536 */
.quad 0xBFECD85689039B0B /* (1/(1+3/32+1/64)) = -.901408 */
.quad 0xBFEC0E070381C0E0 /* (1/(1+4/32+1/64)) = -.876712 */
.quad 0xBFEB4E81B4E81B4F /* (1/(1+5/32+1/64)) = -.853333 */
.quad 0xBFEA98EF606A63BE /* (1/(1+6/32+1/64)) = -.831169 */
.quad 0xBFE9EC8E951033D9 /* (1/(1+7/32+1/64)) = -.810127 */
.quad 0xBFE948B0FCD6E9E0 /* (1/(1+8/32+1/64)) = -.790123 */
.quad 0xBFE8ACB90F6BF3AA /* (1/(1+9/32+1/64)) = -.771084 */
.quad 0xBFE8181818181818 /* (1/(1+10/32+1/64)) = -.752941 */
.quad 0xBFE78A4C8178A4C8 /* (1/(1+11/32+1/64)) = -.735632 */
.quad 0xBFE702E05C0B8170 /* (1/(1+12/32+1/64)) = -.719101 */
.quad 0xBFE6816816816817 /* (1/(1+13/32+1/64)) = -.703297 */
.quad 0xBFE6058160581606 /* (1/(1+14/32+1/64)) = -.688172 */
.quad 0xBFE58ED2308158ED /* (1/(1+15/32+1/64)) = -.673684 */
.quad 0xBFE51D07EAE2F815 /* (1/(1+16/32+1/64)) = -.659794 */
.quad 0xBFE4AFD6A052BF5B /* (1/(1+17/32+1/64)) = -.646465 */
.quad 0xBFE446F86562D9FB /* (1/(1+18/32+1/64)) = -.633663 */
.quad 0xBFE3E22CBCE4A902 /* (1/(1+19/32+1/64)) = -.621359 */
.quad 0xBFE3813813813814 /* (1/(1+20/32+1/64)) = -.609524 */
.quad 0xBFE323E34A2B10BF /* (1/(1+21/32+1/64)) = -.598131 */
.quad 0xBFE2C9FB4D812CA0 /* (1/(1+22/32+1/64)) = -.587156 */
.quad 0xBFE27350B8812735 /* (1/(1+23/32+1/64)) = -.576577 */
.quad 0xBFE21FB78121FB78 /* (1/(1+24/32+1/64)) = -.566372 */
.quad 0xBFE1CF06ADA2811D /* (1/(1+25/32+1/64)) = -.556522 */
.quad 0xBFE1811811811812 /* (1/(1+26/32+1/64)) = -.547009 */
.quad 0xBFE135C81135C811 /* (1/(1+27/32+1/64)) = -.537815 */
.quad 0xBFE0ECF56BE69C90 /* (1/(1+28/32+1/64)) = -.528926 */
.quad 0xBFE0A6810A6810A7 /* (1/(1+29/32+1/64)) = -.520325 */
.quad 0xBFE0624DD2F1A9FC /* (1/(1+30/32+1/64)) = -.512 */
.quad 0xBFE0204081020408 /* (1/(1+31/32+1/64)) = -.503937 */
/*== _dCbrtHiLo ==*/
/* Three groups of 32 entries, one group per j = 0, 1, 2; the entry at
   position j*32+i is cbrt(2^j*(1+i/32+1/64)).  Only HI values are
   stored.  */
.align 32
.quad 0x3FF01539221D4C97 /* HI((2^0*(1+0/32+1/64))^(1/3)) = 1.005181 */
.quad 0x3FF03F06771A2E33 /* HI((2^0*(1+1/32+1/64))^(1/3)) = 1.015387 */
.quad 0x3FF06800E629D671 /* HI((2^0*(1+2/32+1/64))^(1/3)) = 1.025391 */
.quad 0x3FF090328731DEB2 /* HI((2^0*(1+3/32+1/64))^(1/3)) = 1.035204 */
.quad 0x3FF0B7A4B1BD64AC /* HI((2^0*(1+4/32+1/64))^(1/3)) = 1.044835 */
.quad 0x3FF0DE601024FB87 /* HI((2^0*(1+5/32+1/64))^(1/3)) = 1.054291 */
.quad 0x3FF1046CB0597000 /* HI((2^0*(1+6/32+1/64))^(1/3)) = 1.06358 */
.quad 0x3FF129D212A9BA9B /* HI((2^0*(1+7/32+1/64))^(1/3)) = 1.07271 */
.quad 0x3FF14E9736CDAF38 /* HI((2^0*(1+8/32+1/64))^(1/3)) = 1.081687 */
.quad 0x3FF172C2A772F507 /* HI((2^0*(1+9/32+1/64))^(1/3)) = 1.090518 */
.quad 0x3FF1965A848001D3 /* HI((2^0*(1+10/32+1/64))^(1/3)) = 1.099207 */
.quad 0x3FF1B9648C38C55D /* HI((2^0*(1+11/32+1/64))^(1/3)) = 1.107762 */
.quad 0x3FF1DBE6236A0C45 /* HI((2^0*(1+12/32+1/64))^(1/3)) = 1.116186 */
.quad 0x3FF1FDE45CBB1F9F /* HI((2^0*(1+13/32+1/64))^(1/3)) = 1.124485 */
.quad 0x3FF21F63FF409042 /* HI((2^0*(1+14/32+1/64))^(1/3)) = 1.132664 */
.quad 0x3FF240698C6746E5 /* HI((2^0*(1+15/32+1/64))^(1/3)) = 1.140726 */
.quad 0x3FF260F9454BB99B /* HI((2^0*(1+16/32+1/64))^(1/3)) = 1.148675 */
.quad 0x3FF281172F8E7073 /* HI((2^0*(1+17/32+1/64))^(1/3)) = 1.156516 */
.quad 0x3FF2A0C719B4B6D0 /* HI((2^0*(1+18/32+1/64))^(1/3)) = 1.164252 */
.quad 0x3FF2C00C9F2263EC /* HI((2^0*(1+19/32+1/64))^(1/3)) = 1.171887 */
.quad 0x3FF2DEEB2BB7FB78 /* HI((2^0*(1+20/32+1/64))^(1/3)) = 1.179423 */
.quad 0x3FF2FD65FF1EFBBC /* HI((2^0*(1+21/32+1/64))^(1/3)) = 1.186865 */
.quad 0x3FF31B802FCCF6A2 /* HI((2^0*(1+22/32+1/64))^(1/3)) = 1.194214 */
.quad 0x3FF3393CADC50708 /* HI((2^0*(1+23/32+1/64))^(1/3)) = 1.201474 */
.quad 0x3FF3569E451E4C2A /* HI((2^0*(1+24/32+1/64))^(1/3)) = 1.208647 */
.quad 0x3FF373A7A0554CDE /* HI((2^0*(1+25/32+1/64))^(1/3)) = 1.215736 */
.quad 0x3FF3905B4A6D76CE /* HI((2^0*(1+26/32+1/64))^(1/3)) = 1.222743 */
.quad 0x3FF3ACBBB0E756B6 /* HI((2^0*(1+27/32+1/64))^(1/3)) = 1.229671 */
.quad 0x3FF3C8CB258FA340 /* HI((2^0*(1+28/32+1/64))^(1/3)) = 1.236522 */
.quad 0x3FF3E48BE02AC0CE /* HI((2^0*(1+29/32+1/64))^(1/3)) = 1.243297 */
.quad 0x3FF4000000000000 /* HI((2^0*(1+30/32+1/64))^(1/3)) = 1.25 */
.quad 0x3FF41B298D47800E /* HI((2^0*(1+31/32+1/64))^(1/3)) = 1.256631 */
.quad 0x3FF443604B34D9B2 /* HI((2^1*(1+0/32+1/64))^(1/3)) = 1.266449 */
.quad 0x3FF4780B20906571 /* HI((2^1*(1+1/32+1/64))^(1/3)) = 1.279307 */
.quad 0x3FF4ABAC3EE06706 /* HI((2^1*(1+2/32+1/64))^(1/3)) = 1.291912 */
.quad 0x3FF4DE505DA66B8D /* HI((2^1*(1+3/32+1/64))^(1/3)) = 1.304276 */
.quad 0x3FF51003420A5C07 /* HI((2^1*(1+4/32+1/64))^(1/3)) = 1.316409 */
.quad 0x3FF540CFD6FD11C1 /* HI((2^1*(1+5/32+1/64))^(1/3)) = 1.328323 */
.quad 0x3FF570C04260716B /* HI((2^1*(1+6/32+1/64))^(1/3)) = 1.340027 */
.quad 0x3FF59FDDF7A45F38 /* HI((2^1*(1+7/32+1/64))^(1/3)) = 1.35153 */
.quad 0x3FF5CE31C83539DF /* HI((2^1*(1+8/32+1/64))^(1/3)) = 1.36284 */
.quad 0x3FF5FBC3F20966A4 /* HI((2^1*(1+9/32+1/64))^(1/3)) = 1.373966 */
.quad 0x3FF6289C2C8F1B70 /* HI((2^1*(1+10/32+1/64))^(1/3)) = 1.384915 */
.quad 0x3FF654C1B4316DCF /* HI((2^1*(1+11/32+1/64))^(1/3)) = 1.395693 */
.quad 0x3FF6803B54A34E44 /* HI((2^1*(1+12/32+1/64))^(1/3)) = 1.406307 */
.quad 0x3FF6AB0F72182659 /* HI((2^1*(1+13/32+1/64))^(1/3)) = 1.416763 */
.quad 0x3FF6D544118C08BC /* HI((2^1*(1+14/32+1/64))^(1/3)) = 1.427067 */
.quad 0x3FF6FEDEE0388D4A /* HI((2^1*(1+15/32+1/64))^(1/3)) = 1.437224 */
.quad 0x3FF727E53A4F645E /* HI((2^1*(1+16/32+1/64))^(1/3)) = 1.44724 */
.quad 0x3FF7505C31104114 /* HI((2^1*(1+17/32+1/64))^(1/3)) = 1.457119 */
.quad 0x3FF77848904CD549 /* HI((2^1*(1+18/32+1/64))^(1/3)) = 1.466866 */
.quad 0x3FF79FAEE36B2534 /* HI((2^1*(1+19/32+1/64))^(1/3)) = 1.476485 */
.quad 0x3FF7C69379F4605B /* HI((2^1*(1+20/32+1/64))^(1/3)) = 1.48598 */
.quad 0x3FF7ECFA6BBCA391 /* HI((2^1*(1+21/32+1/64))^(1/3)) = 1.495356 */
.quad 0x3FF812E79CAE7EB9 /* HI((2^1*(1+22/32+1/64))^(1/3)) = 1.504615 */
.quad 0x3FF8385EC043C71D /* HI((2^1*(1+23/32+1/64))^(1/3)) = 1.513762 */
.quad 0x3FF85D635CB41B9D /* HI((2^1*(1+24/32+1/64))^(1/3)) = 1.5228 */
.quad 0x3FF881F8CDE083DB /* HI((2^1*(1+25/32+1/64))^(1/3)) = 1.531731 */
.quad 0x3FF8A6224802B8A8 /* HI((2^1*(1+26/32+1/64))^(1/3)) = 1.54056 */
.quad 0x3FF8C9E2DA25E5E4 /* HI((2^1*(1+27/32+1/64))^(1/3)) = 1.549289 */
.quad 0x3FF8ED3D706E1010 /* HI((2^1*(1+28/32+1/64))^(1/3)) = 1.55792 */
.quad 0x3FF91034D632B6DF /* HI((2^1*(1+29/32+1/64))^(1/3)) = 1.566457 */
.quad 0x3FF932CBB7F0CF2D /* HI((2^1*(1+30/32+1/64))^(1/3)) = 1.574901 */
.quad 0x3FF95504A517BF3A /* HI((2^1*(1+31/32+1/64))^(1/3)) = 1.583256 */
.quad 0x3FF987AF34F8BB19 /* HI((2^2*(1+0/32+1/64))^(1/3)) = 1.595626 */
.quad 0x3FF9CA0A8337B317 /* HI((2^2*(1+1/32+1/64))^(1/3)) = 1.611826 */
.quad 0x3FFA0B1709CC13D5 /* HI((2^2*(1+2/32+1/64))^(1/3)) = 1.627708 */
.quad 0x3FFA4AE4CE6419ED /* HI((2^2*(1+3/32+1/64))^(1/3)) = 1.643285 */
.quad 0x3FFA8982A5567031 /* HI((2^2*(1+4/32+1/64))^(1/3)) = 1.658572 */
.quad 0x3FFAC6FE500AB570 /* HI((2^2*(1+5/32+1/64))^(1/3)) = 1.673582 */
.quad 0x3FFB036497A15A17 /* HI((2^2*(1+6/32+1/64))^(1/3)) = 1.688328 */
.quad 0x3FFB3EC164671755 /* HI((2^2*(1+7/32+1/64))^(1/3)) = 1.702821 */
.quad 0x3FFB791FD288C46F /* HI((2^2*(1+8/32+1/64))^(1/3)) = 1.717071 */
.quad 0x3FFBB28A44693BE4 /* HI((2^2*(1+9/32+1/64))^(1/3)) = 1.731089 */
.quad 0x3FFBEB0A72EB6E31 /* HI((2^2*(1+10/32+1/64))^(1/3)) = 1.744883 */
.quad 0x3FFC22A97BF5F697 /* HI((2^2*(1+11/32+1/64))^(1/3)) = 1.758462 */
.quad 0x3FFC596FEF6AF983 /* HI((2^2*(1+12/32+1/64))^(1/3)) = 1.771835 */
.quad 0x3FFC8F65DAC655A3 /* HI((2^2*(1+13/32+1/64))^(1/3)) = 1.785009 */
.quad 0x3FFCC492D38CE8D9 /* HI((2^2*(1+14/32+1/64))^(1/3)) = 1.797992 */
.quad 0x3FFCF8FE00B19367 /* HI((2^2*(1+15/32+1/64))^(1/3)) = 1.810789 */
.quad 0x3FFD2CAE230F8709 /* HI((2^2*(1+16/32+1/64))^(1/3)) = 1.823408 */
.quad 0x3FFD5FA99D15208F /* HI((2^2*(1+17/32+1/64))^(1/3)) = 1.835855 */
.quad 0x3FFD91F679B6E505 /* HI((2^2*(1+18/32+1/64))^(1/3)) = 1.848135 */
.quad 0x3FFDC39A72BF2302 /* HI((2^2*(1+19/32+1/64))^(1/3)) = 1.860255 */
.quad 0x3FFDF49AF68C1570 /* HI((2^2*(1+20/32+1/64))^(1/3)) = 1.872218 */
.quad 0x3FFE24FD2D4C23B8 /* HI((2^2*(1+21/32+1/64))^(1/3)) = 1.884031 */
.quad 0x3FFE54C5FDC5EC73 /* HI((2^2*(1+22/32+1/64))^(1/3)) = 1.895697 */
.quad 0x3FFE83FA11B81DBB /* HI((2^2*(1+23/32+1/64))^(1/3)) = 1.907221 */
.quad 0x3FFEB29DD9DBAF25 /* HI((2^2*(1+24/32+1/64))^(1/3)) = 1.918608 */
.quad 0x3FFEE0B59191D374 /* HI((2^2*(1+25/32+1/64))^(1/3)) = 1.929861 */
.quad 0x3FFF0E454245E4BF /* HI((2^2*(1+26/32+1/64))^(1/3)) = 1.940984 */
.quad 0x3FFF3B50C68A9DD3 /* HI((2^2*(1+27/32+1/64))^(1/3)) = 1.951981 */
.quad 0x3FFF67DBCCF922DC /* HI((2^2*(1+28/32+1/64))^(1/3)) = 1.962856 */
.quad 0x3FFF93E9DAD7A4A6 /* HI((2^2*(1+29/32+1/64))^(1/3)) = 1.973612 */
.quad 0x3FFFBF7E4E8CC9CB /* HI((2^2*(1+30/32+1/64))^(1/3)) = 1.984251 */
.quad 0x3FFFEA9C61E47CD3 /* HI((2^2*(1+31/32+1/64))^(1/3)) = 1.994778 */
/* Polynomial coefficients, each broadcast to four lanes.  */
.align 32
.quad 0x3F93750AD588F115, 0x3F93750AD588F115, 0x3F93750AD588F115, 0x3F93750AD588F115 /* _dA7 */
.align 32
.quad 0xBF98090D6221A247, 0xBF98090D6221A247, 0xBF98090D6221A247, 0xBF98090D6221A247 /* _dA6 */
.align 32
.quad 0x3F9EE7113506AC12, 0x3F9EE7113506AC12, 0x3F9EE7113506AC12, 0x3F9EE7113506AC12 /* _dA5 */
.align 32
.quad 0xBFA511E8D2B3183B, 0xBFA511E8D2B3183B, 0xBFA511E8D2B3183B, 0xBFA511E8D2B3183B /* _dA4 */
.align 32
.quad 0x3FAF9ADD3C0CA458, 0x3FAF9ADD3C0CA458, 0x3FAF9ADD3C0CA458, 0x3FAF9ADD3C0CA458 /* _dA3 */
.align 32
.quad 0xBFBC71C71C71C71C, 0xBFBC71C71C71C71C, 0xBFBC71C71C71C71C, 0xBFBC71C71C71C71C /* _dA2 */
.align 32
.quad 0x3FD5555555555555, 0x3FD5555555555555, 0x3FD5555555555555, 0x3FD5555555555555 /* _dA1 */
/* Double-precision constants and masks for the argument reduction.  */
.align 32
.quad 0xBFF0400000000000, 0xBFF0400000000000, 0xBFF0400000000000, 0xBFF0400000000000 /* _dNeg65Div64 */
.align 32
.quad 0x000FC00000000000, 0x000FC00000000000, 0x000FC00000000000, 0x000FC00000000000 /* _dSgnf6Mask */
.align 32
.quad 0xBFF0000000000000, 0xBFF0000000000000, 0xBFF0000000000000, 0xBFF0000000000000 /* _dNegOne */
.align 32
.quad 0x000FFFFFFFFFFFFF, 0x000FFFFFFFFFFFFF, 0x000FFFFFFFFFFFFF, 0x000FFFFFFFFFFFFF /* _dMantissaMask */
/* 64-bit integer constants for the exponent arithmetic.  */
.align 32
.quad 0xFFF0000000000000, 0xFFF0000000000000, 0xFFF0000000000000, 0xFFF0000000000000 /* _lExpHiMask */
.align 32
.quad 0x00000000000007FF, 0x00000000000007FF, 0x00000000000007FF, 0x00000000000007FF /* _lExpLoMask */
.align 32
.quad 0x0000000000001556, 0x0000000000001556, 0x0000000000001556, 0x0000000000001556 /* _l1556 */
/* 32-bit constants applied to the high dword of each input lane.  */
.align 32
.long 0x000F8000, 0x000F8000, 0x000F8000, 0x000F8000, 0x000F8000, 0x000F8000, 0x000F8000, 0x000F8000 /* _iRcpIndexMask */
.align 32
.long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF /* _iAbsMask */
.align 32
.long 0x00000800, 0x00000800, 0x00000800, 0x00000800, 0x00000800, 0x00000800, 0x00000800, 0x00000800 /* _iSignMask */
.align 32
.long 0x000002AA, 0x000002AA, 0x000002AA, 0x000002AA, 0x000002AA, 0x000002AA, 0x000002AA, 0x000002AA /* _iBias */
.align 32
.long 0x80100000, 0x80100000, 0x80100000, 0x80100000, 0x80100000, 0x80100000, 0x80100000, 0x80100000 /* _iSub */
.align 32
.long 0xffdfffff, 0xffdfffff, 0xffdfffff, 0xffdfffff, 0xffdfffff, 0xffdfffff, 0xffdfffff, 0xffdfffff /* _iCmp */
.align 32
.type __svml_dcbrt_data_internal,@object
.size __svml_dcbrt_data_internal,.-__svml_dcbrt_data_internal

View File

@ -0,0 +1,20 @@
/* AVX2 version of vectorized cbrt, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the token _ZGVeN8v_cbrt for the included file, so that whatever
   ../svml_d_cbrt8_core.S defines under that name is emitted as
   _ZGVeN8v_cbrt_avx2_wrapper instead.
   NOTE(review): the included file is not visible here; presumably it is
   the generic 8-lane entry built on top of the AVX2 kernel -- confirm.  */
#define _ZGVeN8v_cbrt _ZGVeN8v_cbrt_avx2_wrapper
#include "../svml_d_cbrt8_core.S"

View File

@ -0,0 +1,27 @@
/* Multiple versions of vectorized cbrt, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Name of the entry point being multi-versioned.  It is used by the
   libc_ifunc_redirected invocation below.  */
#define SYMBOL_NAME _ZGVeN8v_cbrt
#include "ifunc-mathvec-avx512-skx.h"
/* Emit SYMBOL_NAME via libc_ifunc_redirected, passing IFUNC_SELECTOR ()
   as the resolver expression.
   NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined in
   this file; presumably they come from the header included above --
   confirm.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* SHARED builds only: declare __GI__ZGVeN8v_cbrt with hidden visibility.
   NOTE(review): the exact aliasing performed by __hidden_ver1 is defined
   elsewhere in libc -- confirm there.  */
__hidden_ver1 (_ZGVeN8v_cbrt, __GI__ZGVeN8v_cbrt, __redirect__ZGVeN8v_cbrt)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,253 @@
/* Function cbrt vectorized with AVX-512.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
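/*
* ALGORITHM DESCRIPTION:
*
* x=2^{3*k+j} * 1.b1 b2 ... b5 b6 ... b52
* Let r=(x*2^{-3k-j} - 1.b1 b2 ... b5 1)* rcp[b1 b2 ..b5],
* where rcp[b1 b2 .. b5]=1/(1.b1 b2 b3 b4 b5 1) in double precision
* cbrt(2^j * 1. b1 b2 .. b5 1) is approximated as T[j][b1..b5]+D[j][b1..b5]
* (T stores the high 53 bits, D stores the low order bits)
* Result=2^k*T+(2^k*T*r)*P+2^k*D
* where P=p1+p2*r+..+p10*r^9 (poly_coeff1..poly_coeff10)
*
* Note: this AVX-512 variant does not use an rcp[] table.  The
* reciprocal comes from vrcp14pd followed by vrndscalepd, the reduced
* argument is R = DblRcp*Mantissa - 1, and the result is assembled as
* scaled_Th*(Sh+Sl+Sh*R*Poly) with the sign bit of x OR-ed back in.
*
*/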
/* Byte offsets of the fields of data table
   __svml_dcbrt_data_internal_avx512.  Every field is one 64-byte
   vector except cbrt_tbl_H, which spans two (128 bytes).  */
/* Permutation tables.  */
#define etbl_H 0
#define etbl_L 64
#define cbrt_tbl_H 128
/* Constants for the exponent arithmetic and argument reduction.  */
#define BiasL 256
#define SZero 320
#define OneThird 384
#define Bias3 448
#define Three 512
#define One 576
/* Polynomial coefficients, highest degree first.  */
#define poly_coeff10 640
#define poly_coeff9 704
#define poly_coeff8 768
#define poly_coeff7 832
#define poly_coeff6 896
#define poly_coeff5 960
#define poly_coeff4 1024
#define poly_coeff3 1088
#define poly_coeff2 1152
#define poly_coeff1 1216
#include <sysdep.h>
.text
.section .text.evex512,"ax",@progbits
/* _ZGVeN8v_cbrt_skx: cube root of eight packed doubles.
 *
 * In:     zmm0 = x (8 x double)
 * Out:    zmm0 = per-lane result
 * Stack:  none; the routine is straight-line code with no branches
 *         and no calls.
 * Uses:   zmm0-zmm15 as scratch.
 */
ENTRY(_ZGVeN8v_cbrt_skx)
        /* zmm14 = mantissa of x normalized to magnitude [1,2) */
        vgetmantpd $0, {sae}, %zmm0, %zmm14
        /* GetExp(x) */
        vgetexppd {sae}, %zmm0, %zmm7
        vmovups BiasL+__svml_dcbrt_data_internal_avx512(%rip), %zmm8
        /* exponent/3 */
        vmovups OneThird+__svml_dcbrt_data_internal_avx512(%rip), %zmm9
        vmovups Bias3+__svml_dcbrt_data_internal_avx512(%rip), %zmm10
        /* Reduced argument: R = DblRcp*Mantissa - 1 */
        vmovups One+__svml_dcbrt_data_internal_avx512(%rip), %zmm2
        /* exponent%3 (to be used as index) */
        vmovups Three+__svml_dcbrt_data_internal_avx512(%rip), %zmm11
        /* DblRcp ~ 1/Mantissa */
        vrcp14pd %zmm14, %zmm13
        /* zmm12 = exponent + BiasL */
        vaddpd {rn-sae}, %zmm8, %zmm7, %zmm12
        /* zmm6 = sign bit of x, OR-ed into the result at the end */
        vandpd SZero+__svml_dcbrt_data_internal_avx512(%rip), %zmm0, %zmm6
        /* round DblRcp to 3 fractional bits (RN mode, no Precision exception) */
        vrndscalepd $72, {sae}, %zmm13, %zmm15
        /* zmm10 = (exponent + BiasL)*OneThird - Bias3 */
        vfmsub231pd {rn-sae}, %zmm12, %zmm9, %zmm10
        /* polynomial */
        vmovups poly_coeff10+__svml_dcbrt_data_internal_avx512(%rip), %zmm0
        vmovups poly_coeff8+__svml_dcbrt_data_internal_avx512(%rip), %zmm7
        vmovups poly_coeff7+__svml_dcbrt_data_internal_avx512(%rip), %zmm9
        /* zmm2 = R = DblRcp*Mantissa - 1 */
        vfmsub231pd {rn-sae}, %zmm15, %zmm14, %zmm2
        /* zmm5 = zmm10 rounded toward -inf (no Precision exception) */
        vrndscalepd $9, {sae}, %zmm10, %zmm5
        /* Table lookup */
        vmovups cbrt_tbl_H+__svml_dcbrt_data_internal_avx512(%rip), %zmm10
        vmovups poly_coeff6+__svml_dcbrt_data_internal_avx512(%rip), %zmm8
        vmovups poly_coeff3+__svml_dcbrt_data_internal_avx512(%rip), %zmm13
        /* zmm9 = c7 + c8*R */
        vfmadd231pd {rn-sae}, %zmm2, %zmm7, %zmm9
        /* zmm12 = (exponent + BiasL) - 3*zmm5 */
        vfnmadd231pd {rn-sae}, %zmm5, %zmm11, %zmm12
        vmovups poly_coeff5+__svml_dcbrt_data_internal_avx512(%rip), %zmm11
        vmovups poly_coeff1+__svml_dcbrt_data_internal_avx512(%rip), %zmm14
        /* Prepare table index */
        vpsrlq $49, %zmm15, %zmm1
        /* Table lookup: 2^(exponent%3) */
        vpermpd etbl_H+__svml_dcbrt_data_internal_avx512(%rip), %zmm12, %zmm4
        vpermpd etbl_L+__svml_dcbrt_data_internal_avx512(%rip), %zmm12, %zmm3
        /* zmm10 = cbrt_tbl_H entry selected by the index in zmm1; both
           64-byte halves of the table take part in the permute */
        vpermt2pd cbrt_tbl_H+64+__svml_dcbrt_data_internal_avx512(%rip), %zmm1, %zmm10
        vmovups poly_coeff9+__svml_dcbrt_data_internal_avx512(%rip), %zmm1
        /* zmm11 = c5 + c6*R */
        vfmadd231pd {rn-sae}, %zmm2, %zmm8, %zmm11
        vmovups poly_coeff2+__svml_dcbrt_data_internal_avx512(%rip), %zmm12
        /* zmm15 = scaled_Th = table value * 2^zmm5 */
        vscalefpd {rn-sae}, %zmm5, %zmm10, %zmm15
        /* zmm1 = c9 + c10*R */
        vfmadd231pd {rn-sae}, %zmm2, %zmm0, %zmm1
        vmovups poly_coeff4+__svml_dcbrt_data_internal_avx512(%rip), %zmm5
        /* zmm14 = c1 + c2*R */
        vfmadd231pd {rn-sae}, %zmm2, %zmm12, %zmm14
        /* zmm0 = R^2 */
        vmulpd {rn-sae}, %zmm2, %zmm2, %zmm0
        /* zmm13 = c3 + c4*R */
        vfmadd231pd {rn-sae}, %zmm2, %zmm5, %zmm13
        /* Sh*R */
        vmulpd {rn-sae}, %zmm2, %zmm4, %zmm2
        /* Combine the five coefficient pairs by Horner's rule in R^2 */
        vfmadd213pd {rn-sae}, %zmm9, %zmm0, %zmm1
        vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1
        vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm1
        vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm1
        /* Sl + (Sh*R)*Poly */
        vfmadd213pd {rn-sae}, %zmm3, %zmm1, %zmm2
        /*
         * branch-free
         * scaled_Th*(Sh+Sl+Sh*R*Poly)
         */
        vaddpd {rn-sae}, %zmm4, %zmm2, %zmm3
        vmulpd {rn-sae}, %zmm15, %zmm3, %zmm4
        /* Restore the sign of x */
        vorpd %zmm6, %zmm4, %zmm0
        ret
END(_ZGVeN8v_cbrt_skx)
.section .rodata, "a"
.align 64
/* Layout description of the table that follows, written as a C struct.
   It is only seen by the compiler/assembler if
   __svml_dcbrt_data_internal_avx512_typedef is defined, which nothing
   in this file does, so it serves as documentation.  A field declared
   [N][2] is N 64-bit entries expressed as pairs of VUINT32.  */
#ifdef __svml_dcbrt_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(64)) VUINT32 etbl_H[8][2];
__declspec(align(64)) VUINT32 etbl_L[8][2];
__declspec(align(64)) VUINT32 cbrt_tbl_H[16][2];
__declspec(align(64)) VUINT32 BiasL[8][2];
__declspec(align(64)) VUINT32 SZero[8][2];
__declspec(align(64)) VUINT32 OneThird[8][2];
__declspec(align(64)) VUINT32 Bias3[8][2];
__declspec(align(64)) VUINT32 Three[8][2];
__declspec(align(64)) VUINT32 One[8][2];
__declspec(align(64)) VUINT32 poly_coeff10[8][2];
__declspec(align(64)) VUINT32 poly_coeff9[8][2];
__declspec(align(64)) VUINT32 poly_coeff8[8][2];
__declspec(align(64)) VUINT32 poly_coeff7[8][2];
__declspec(align(64)) VUINT32 poly_coeff6[8][2];
__declspec(align(64)) VUINT32 poly_coeff5[8][2];
__declspec(align(64)) VUINT32 poly_coeff4[8][2];
__declspec(align(64)) VUINT32 poly_coeff3[8][2];
__declspec(align(64)) VUINT32 poly_coeff2[8][2];
__declspec(align(64)) VUINT32 poly_coeff1[8][2];
} __svml_dcbrt_data_internal_avx512;
#endif
/* Constant table for _ZGVeN8v_cbrt_skx.  Field order and sizes must
   stay in sync with the offset macros defined ahead of the function:
   every field is one 64-byte vector except cbrt_tbl_H (128 bytes).  */
__svml_dcbrt_data_internal_avx512:
/*== etbl_H ==*/
/* High parts: 1, 2^(1/3), 2^(2/3), 0, then the same values negated.  */
.quad 0x3ff0000000000000
.quad 0x3ff428a2f98d728b
.quad 0x3ff965fea53d6e3d
.quad 0x0000000000000000
.quad 0xbff0000000000000
.quad 0xbff428a2f98d728b
.quad 0xbff965fea53d6e3d
.quad 0x0000000000000000
/*== etbl_L ==*/
/* Low-order corrections paired entry by entry with etbl_H.  */
.align 64
.quad 0x0000000000000000
.quad 0xbc7ddc22548ea41e
.quad 0xbc9f53e999952f09
.quad 0x0000000000000000
.quad 0x0000000000000000
.quad 0x3c7ddc22548ea41e
.quad 0x3c9f53e999952f09
.quad 0x0000000000000000
/*== cbrt_tbl_H ==*/
/* Nine used entries running from 2^(1/3) down to 1.0; the remaining
   seven slots are zero.  */
.align 64
.quad 0x3ff428a2f98d728b
.quad 0x3ff361f35ca116ff
.quad 0x3ff2b6b5edf6b54a
.quad 0x3ff220e6dd675180
.quad 0x3ff19c3b38e975a8
.quad 0x3ff12589c21fb842
.quad 0x3ff0ba6ee5f9aad4
.quad 0x3ff059123d3a9848
.quad 0x3ff0000000000000
.quad 0x0000000000000000
.quad 0x0000000000000000
.quad 0x0000000000000000
.quad 0x0000000000000000
.quad 0x0000000000000000
.quad 0x0000000000000000
.quad 0x0000000000000000
/*== BiasL ==*/
.align 64
.quad 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000
/*== SZero ==*/
/* Sign-bit mask.  */
.align 64
.quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
/*== OneThird ==*/
.align 64
.quad 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556
/*== Bias3 ==*/
.align 64
.quad 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000
/*== Three ==*/
.align 64
.quad 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000
/*== One ==*/
.align 64
.quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
/*== poly_coeff10 ==*/
.align 64
.quad 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62
/*== poly_coeff9 ==*/
.align 64
.quad 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875
/*== poly_coeff8 ==*/
.align 64
.quad 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f
/*== poly_coeff7 ==*/
.align 64
.quad 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914
/*== poly_coeff6 ==*/
.align 64
.quad 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e
/*== poly_coeff5 ==*/
.align 64
.quad 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569
/*== poly_coeff4 ==*/
.align 64
.quad 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e
/*== poly_coeff3 ==*/
.align 64
.quad 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31
/*== poly_coeff2 ==*/
.align 64
.quad 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741
/*== poly_coeff1 ==*/
.align 64
.quad 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557
.align 64
.type __svml_dcbrt_data_internal_avx512,@object
.size __svml_dcbrt_data_internal_avx512,.-__svml_dcbrt_data_internal_avx512

View File

@ -0,0 +1,20 @@
/* AVX2 version of vectorized cbrtf.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the token _ZGVeN16v_cbrtf for the included file, so that
   whatever ../svml_s_cbrtf16_core.S defines under that name is emitted
   as _ZGVeN16v_cbrtf_avx2_wrapper instead.
   NOTE(review): the included file is not visible here; presumably it is
   the generic 16-lane entry built on top of the AVX2 kernel -- confirm.  */
#define _ZGVeN16v_cbrtf _ZGVeN16v_cbrtf_avx2_wrapper
#include "../svml_s_cbrtf16_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized cbrtf, vector length is 16.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME is set before the include.  REDIRECT_NAME and IFUNC_SELECTOR
   are not defined in this file; presumably the included header derives or
   provides them (TODO confirm against ifunc-mathvec-avx512-skx.h).  */
#define SYMBOL_NAME _ZGVeN16v_cbrtf
#include "ifunc-mathvec-avx512-skx.h"
/* Bind SYMBOL_NAME to the result of IFUNC_SELECTOR () through the
   libc_ifunc_redirected macro (defined elsewhere).  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Only when SHARED is defined: declare the __GI__ZGVeN16v_cbrtf name with
   hidden visibility via __hidden_ver1 (macro defined elsewhere).  */
__hidden_ver1 (_ZGVeN16v_cbrtf, __GI__ZGVeN16v_cbrtf,
__redirect__ZGVeN16v_cbrtf)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,235 @@
/* Function cbrtf vectorized with AVX-512.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* x=2^{3*k+j} * 1.b1 b2 ... b5 b6 ... b23
* Let r=(x*2^{-3k-j} - 1.b1 b2 ... b5 1)* rcp[b1 b2 ..b5],
* where rcp[b1 b2 .. b5]=1/(1.b1 b2 b3 b4 b5 1) in single precision
* cbrtf(2^j * 1. b1 b2 .. b5 1) is approximated as T[j][b1..b5]+D[j][b1..b5]
* (T stores the high 24 bits, D stores the low order bits)
* Result=2^k*T+(2^k*T*r)*P+2^k*D
* where P=p1+p2*r+..
*
*/
/* Offsets for data table __svml_scbrt_data_internal_avx512
*
* Byte offsets from the start of the table.  Each field occupies 64 bytes
* (16 single-precision lanes), except cbrt_tbl_H which occupies 128 bytes
* (32 entries); the values must stay in step with the .align 64 layout of
* the table itself.
*/
/* Two 16-entry lookup tables indexed by exponent%3 (high and low parts) */
#define etbl_H 0
#define etbl_L 64
/* 32-entry table looked up with the rounded reciprocal of the mantissa */
#define cbrt_tbl_H 128
/* Constants used for the exponent/3 and exponent%3 computation */
#define BiasL 256
/* Sign-bit mask */
#define SZero 320
#define OneThird 384
#define Bias3 448
#define Three 512
#define One 576
/* Polynomial coefficients, highest degree first */
#define poly_coeff3 640
#define poly_coeff2 704
#define poly_coeff1 768
#include <sysdep.h>
.text
.section .text.exex512,"ax",@progbits
/*
 * _ZGVeN16v_cbrtf_skx -- cube root of 16 packed single-precision values.
 *
 * In:   zmm0 = x
 * Out:  zmm0 = result
 * Uses: zmm0-zmm15 as scratch; no stack, no calls, no branches.
 *
 * Outline of what the code below computes (names follow the inline
 * comments):
 *   mant = getmant(x), exponent = getexp(x)
 *   k    = floor(exponent/3)
 *   j    = exponent - 3*k              (index into etbl_H / etbl_L)
 *   rcp  = vrcp14ps(mant) rounded by vrndscaleps
 *   R    = rcp*mant - 1
 *   Poly = poly_coeff1 + R*(poly_coeff2 + R*poly_coeff3)
 *   Th   = cbrt_tbl_H[index taken from the bits of rcp], scaled by 2^k
 *   result = sign(x) | scaled_Th*(Sh + Sl + Sh*R*Poly)
 */
ENTRY(_ZGVeN16v_cbrtf_skx)
/* Mantissa of x -> zmm8 (imm8 = 0: normalised to the [1,2) interval) */
vgetmantps $0, {sae}, %zmm0, %zmm8
/* GetExp(x) */
vgetexpps {sae}, %zmm0, %zmm1
vmovups BiasL+__svml_scbrt_data_internal_avx512(%rip), %zmm2
/* exponent/3 */
vmovups OneThird+__svml_scbrt_data_internal_avx512(%rip), %zmm3
vmovups Bias3+__svml_scbrt_data_internal_avx512(%rip), %zmm4
vmovups One+__svml_scbrt_data_internal_avx512(%rip), %zmm15
/* exponent%3 (to be used as index) */
vmovups Three+__svml_scbrt_data_internal_avx512(%rip), %zmm5
/* polynomial */
vmovups poly_coeff3+__svml_scbrt_data_internal_avx512(%rip), %zmm11
vmovups poly_coeff1+__svml_scbrt_data_internal_avx512(%rip), %zmm14
/* Table lookup: first 16 entries of cbrt_tbl_H (the second 16 are supplied
   as the memory operand of vpermt2ps below) */
vmovups cbrt_tbl_H+__svml_scbrt_data_internal_avx512(%rip), %zmm12
/* DblRcp ~ 1/Mantissa */
vrcp14ps %zmm8, %zmm7
/* zmm6 = exponent + BiasL */
vaddps {rn-sae}, %zmm2, %zmm1, %zmm6
/* zmm0 = sign bit of x (SZero is 0x80000000 in every lane) */
vandps SZero+__svml_scbrt_data_internal_avx512(%rip), %zmm0, %zmm0
/* round DblRcp to 5 fractional bits (imm8 = 0x58: imm8[7:4] = 5, RN mode,
   no Precision exception).  A value in [0.5,1] rounded this way has 17
   possible values, matching the 17 populated cbrt_tbl_H entries. */
vrndscaleps $88, {sae}, %zmm7, %zmm9
/* zmm4 = OneThird*(exponent + BiasL) - Bias3 */
vfmsub231ps {rn-sae}, %zmm6, %zmm3, %zmm4
vmovups poly_coeff2+__svml_scbrt_data_internal_avx512(%rip), %zmm7
/* Reduced argument: R = DblRcp*Mantissa - 1 */
vfmsub231ps {rn-sae}, %zmm9, %zmm8, %zmm15
/* k = floor(zmm4) (imm8 = 9: round down, no Precision exception) */
vrndscaleps $9, {sae}, %zmm4, %zmm13
/* Prepare table index */
vpsrld $19, %zmm9, %zmm10
/* zmm7 = poly_coeff3*R + poly_coeff2 */
vfmadd231ps {rn-sae}, %zmm15, %zmm11, %zmm7
/* zmm6 = (exponent + BiasL) - Three*k; used as the etbl_H/etbl_L index */
vfnmadd231ps {rn-sae}, %zmm13, %zmm5, %zmm6
/* zmm12 = Th = cbrt_tbl_H[zmm10] (two-source permute over all 32 entries) */
vpermt2ps cbrt_tbl_H+64+__svml_scbrt_data_internal_avx512(%rip), %zmm10, %zmm12
/* Poly = (poly_coeff3*R + poly_coeff2)*R + poly_coeff1 */
vfmadd213ps {rn-sae}, %zmm14, %zmm15, %zmm7
/* scaled_Th = Th * 2^k */
vscalefps {rn-sae}, %zmm13, %zmm12, %zmm2
/* Table lookup: 2^(exponent%3) -- Sh from etbl_H, Sl from etbl_L */
vpermps __svml_scbrt_data_internal_avx512(%rip), %zmm6, %zmm1
vpermps etbl_L+__svml_scbrt_data_internal_avx512(%rip), %zmm6, %zmm6
/* Sh*R */
vmulps {rn-sae}, %zmm15, %zmm1, %zmm14
/* Sl + (Sh*R)*Poly */
vfmadd213ps {rn-sae}, %zmm6, %zmm7, %zmm14
/*
* branch-free
* scaled_Th*(Sh+Sl+Sh*R*Poly)
*/
vaddps {rn-sae}, %zmm1, %zmm14, %zmm15
vmulps {rn-sae}, %zmm2, %zmm15, %zmm3
/* Re-attach the sign of x saved in zmm0 */
vorps %zmm0, %zmm3, %zmm0
ret
END(_ZGVeN16v_cbrtf_skx)
/* Read-only constant table for _ZGVeN16v_cbrtf_skx.  Every field starts on
   a 64-byte boundary; the field order and sizes must match the offset
   macros (etbl_H ... poly_coeff1) defined at the top of this file.  */
.section .rodata, "a"
.align 64
/* Layout description only: compiled solely when
   __svml_scbrt_data_internal_avx512_typedef is defined.  */
#ifdef __svml_scbrt_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(64)) VUINT32 etbl_H[16][1];
__declspec(align(64)) VUINT32 etbl_L[16][1];
__declspec(align(64)) VUINT32 cbrt_tbl_H[32][1];
__declspec(align(64)) VUINT32 BiasL[16][1];
__declspec(align(64)) VUINT32 SZero[16][1];
__declspec(align(64)) VUINT32 OneThird[16][1];
__declspec(align(64)) VUINT32 Bias3[16][1];
__declspec(align(64)) VUINT32 Three[16][1];
__declspec(align(64)) VUINT32 One[16][1];
__declspec(align(64)) VUINT32 poly_coeff3[16][1];
__declspec(align(64)) VUINT32 poly_coeff2[16][1];
__declspec(align(64)) VUINT32 poly_coeff1[16][1];
} __svml_scbrt_data_internal_avx512;
#endif
__svml_scbrt_data_internal_avx512:
/*== etbl_H ==*/
/* Only the first three entries are non-zero: 1.0, ~2^(1/3), ~2^(2/3).
   The rest pad the field to a full 16-lane vector.  */
.long 0x3f800000
.long 0x3fa14518
.long 0x3fcb2ff5
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
/*== etbl_L ==*/
/* Low-order parts paired with the etbl_H entries (the "Sl" term in the
   code); again only the first three entries are used.  */
.align 64
.long 0x00000000
.long 0xb2ce51af
.long 0x32a7adc8
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
/*== cbrt_tbl_H ==*/
/* 17 populated entries, decreasing from ~2^(1/3) down to 1.0, followed by
   zero padding up to 32 entries.  */
.align 64
.long 0x3fa14518
.long 0x3f9e0b2b
.long 0x3f9b0f9b
.long 0x3f984a9a
.long 0x3f95b5af
.long 0x3f934b6c
.long 0x3f910737
.long 0x3f8ee526
.long 0x3f8ce1da
.long 0x3f8afa6a
.long 0x3f892c4e
.long 0x3f87754e
.long 0x3f85d377
.long 0x3f844510
.long 0x3f82c892
.long 0x3f815c9f
.long 0x3f800000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
.long 0x00000000
/*== BiasL ==*/
/* 1.5 * 2^23 */
.align 64
.long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000
/*== SZero ==*/
/* Sign-bit mask */
.align 64
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
/*== OneThird ==*/
.align 64
.long 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab
/*== Bias3 ==*/
/* 2^22, i.e. BiasL/3 */
.align 64
.long 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000
/*== Three ==*/
/* 3.0 */
.align 64
.long 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000
/*== One ==*/
/* 1.0 */
.align 64
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/*== poly_coeff3 ==*/
/* ~ 0.0618 */
.align 64
.long 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c
/*== poly_coeff2 ==*/
/* ~ -0.1112 */
.align 64
.long 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363
/*== poly_coeff1 ==*/
/* ~ 1/3 */
.align 64
.long 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa
.align 64
.type __svml_scbrt_data_internal_avx512,@object
.size __svml_scbrt_data_internal_avx512,.-__svml_scbrt_data_internal_avx512

View File

@ -0,0 +1,20 @@
/* SSE2 version of vectorized cbrtf, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Every occurrence of the token _ZGVbN4v_cbrtf in the core file included
   below is replaced by the preprocessor, so whatever that file defines
   under the generic name is assembled here as _ZGVbN4v_cbrtf_sse2.  */
#define _ZGVbN4v_cbrtf _ZGVbN4v_cbrtf_sse2
#include "../svml_s_cbrtf4_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized cbrtf, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME is set before the include.  REDIRECT_NAME and IFUNC_SELECTOR
   are not defined in this file; presumably the included header derives or
   provides them (TODO confirm against ifunc-mathvec-sse4_1.h).  */
#define SYMBOL_NAME _ZGVbN4v_cbrtf
#include "ifunc-mathvec-sse4_1.h"
/* Bind SYMBOL_NAME to the result of IFUNC_SELECTOR () through the
   libc_ifunc_redirected macro (defined elsewhere).  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Only when SHARED is defined: declare the __GI__ZGVbN4v_cbrtf name with
   hidden visibility via __hidden_ver1 (macro defined elsewhere).  */
__hidden_ver1 (_ZGVbN4v_cbrtf, __GI__ZGVbN4v_cbrtf,
__redirect__ZGVbN4v_cbrtf)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,490 @@
/* Function cbrtf vectorized with SSE4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* x=2^{3*k+j} * 1.b1 b2 ... b5 b6 ... b23
* Let r=(x*2^{-3k-j} - 1.b1 b2 ... b5 1)* rcp[b1 b2 ..b5],
* where rcp[b1 b2 .. b5]=1/(1.b1 b2 b3 b4 b5 1) in single precision
* cbrtf(2^j * 1. b1 b2 .. b5 1) is approximated as T[j][b1..b5]+D[j][b1..b5]
* (T stores the high 24 bits, D stores the low order bits)
* Result=2^k*T+(2^k*T*r)*P+2^k*D
* where P=p1+p2*r+..
*
*/
/* Offsets for data table __svml_scbrt_data_internal
*
* Byte offsets from the start of the table.  _sRcp is 32 floats (128
* bytes), _sCbrtHL is 96 floats (384 bytes), and every remaining field is
* one 16-byte vector (4 lanes).  The values must stay in step with the
* .align 16 layout of the table itself.
*/
/* Lookup tables */
#define _sRcp 0
#define _sCbrtHL 128
/* Polynomial coefficients */
#define _sP2 512
#define _sP1 528
/* Masks and exponent patterns used for the argument reduction */
#define _sMantissaMask 544
#define _sMantissaMask1 560
#define _sExpMask 576
#define _sExpMask1 592
/* Integer masks/constants for index and exponent arithmetic */
#define _iRcpIndexMask 608
#define _iBExpMask 624
#define _iSignMask 640
#define _iBias 656
#define _iOne 672
#define _i555 688
/* Constants for the special-input range check */
#define _iAbsMask 704
#define _iSubConst 720
#define _iCmpConst 736
#include <sysdep.h>
.text
.section .text.sse4,"ax",@progbits
/*
 * _ZGVbN4v_cbrtf_sse4 -- cube root of 4 packed single-precision values.
 *
 * In:   xmm0 = x
 * Out:  xmm0 = result
 *
 * Stack frame (72 bytes, touched only on the special-input path):
 *    0(%rsp)  saved r14
 *    8(%rsp)  saved r13
 *   16(%rsp)  saved r12
 *   32(%rsp)  copy of the input vector
 *   48(%rsp)  result vector (patched lane by lane)
 *
 * Fast path: table-driven evaluation for all four lanes.  Lanes flagged by
 * the _iAbsMask/_iSubConst/_iCmpConst range check are recomputed one at a
 * time with scalar cbrtf.  From the constants, the flagged inputs appear
 * to be zero, denormal, Inf and NaN encodings -- confirm if relied upon.
 */
ENTRY(_ZGVbN4v_cbrtf_sse4)
subq $72, %rsp
cfi_def_cfa_offset(80)
/*
* Load constants
* Reciprocal index calculation
*/
movaps %xmm0, %xmm2
movdqu _iRcpIndexMask+__svml_scbrt_data_internal(%rip), %xmm3
psrld $16, %xmm2
/* xmm3 = (x >> 16) & 0x7c: byte offset (4*l) into _sRcp per lane */
pand %xmm2, %xmm3
/* Load reciprocal value */
lea __svml_scbrt_data_internal(%rip), %rdx
pshufd $1, %xmm3, %xmm5
/* Get signed biased exponent */
psrld $7, %xmm2
/* eax / ecx = _sRcp byte offsets for lanes 0 and 1 */
movd %xmm3, %eax
movd %xmm5, %ecx
/* Get absolute biased exponent */
movdqu _iBExpMask+__svml_scbrt_data_internal(%rip), %xmm15
/*
* Calculate exponent/3
* i555Exp=(2^{12}-1)/3*exponent
*/
movdqu _i555+__svml_scbrt_data_internal(%rip), %xmm14
pand %xmm2, %xmm15
movslq %eax, %rax
movdqa %xmm14, %xmm5
movslq %ecx, %rcx
/* pmuludq multiplies only the even dword lanes, so the odd lanes are
   shifted down and multiplied separately, then merged back below */
psrlq $32, %xmm14
pmuludq %xmm15, %xmm5
/* Gather _sRcp entries for lanes 0 and 1 */
movd (%rdx,%rax), %xmm4
movd (%rdx,%rcx), %xmm6
punpckldq %xmm6, %xmm4
/* xmm6 = copy of the biased exponent (used for J below) */
movdqa %xmm15, %xmm6
psrlq $32, %xmm15
pmuludq %xmm14, %xmm15
pshufd $2, %xmm3, %xmm7
/* Move the odd-lane products back into the odd dword positions */
psllq $32, %xmm15
pshufd $3, %xmm3, %xmm8
/* esi / edi = _sRcp byte offsets for lanes 2 and 3 */
movd %xmm7, %esi
movd %xmm8, %edi
/* Argument reduction */
movups _sMantissaMask+__svml_scbrt_data_internal(%rip), %xmm12
movups _sMantissaMask1+__svml_scbrt_data_internal(%rip), %xmm11
andps %xmm0, %xmm12
/* Keep the low 32 bits of each even-lane 64-bit product */
pand .FLT_17(%rip), %xmm5
andps %xmm0, %xmm11
movslq %esi, %rsi
/* xmm5 = 0x555 * biased exponent, all four lanes */
por %xmm15, %xmm5
movslq %edi, %rdi
/* Get K (exponent=3*k+j) */
psrld $12, %xmm5
/* y  = mantissa bits of x combined with the bit pattern of -1.0
   y` = leading mantissa bits combined with 0xBF820000
   Both values are negative; _sRcp stores negated reciprocals, which
   cancels the sign in the product below. */
orps _sExpMask+__svml_scbrt_data_internal(%rip), %xmm12
orps _sExpMask1+__svml_scbrt_data_internal(%rip), %xmm11
psubd _iOne+__svml_scbrt_data_internal(%rip), %xmm6
/* r=y-y` */
subps %xmm11, %xmm12
/* Get J: (biased exponent - 1) - 3*K, done as three subtractions */
psubd %xmm5, %xmm6
movdqu _iAbsMask+__svml_scbrt_data_internal(%rip), %xmm1
psubd %xmm5, %xmm6
/* Gather _sRcp entries for lanes 2 and 3 */
movd (%rdx,%rsi), %xmm10
/* xmm1 = |x| bit pattern, input to the range check */
pand %xmm0, %xmm1
movd (%rdx,%rdi), %xmm9
psubd %xmm5, %xmm6
punpckldq %xmm9, %xmm10
/* Get 128*J */
pslld $7, %xmm6
/* xmm4 = gathered _sRcp values for all four lanes */
punpcklqdq %xmm10, %xmm4
/*
* iCbrtIndex=4*l+128*j
* Zero index if callout expected
*/
paddd %xmm6, %xmm3
/* Range check: xmm1 becomes an all-ones mask in lanes that need the
   scalar fallback (signed compare of |x| - _iSubConst against _iCmpConst) */
psubd _iSubConst+__svml_scbrt_data_internal(%rip), %xmm1
pcmpgtd _iCmpConst+__svml_scbrt_data_internal(%rip), %xmm1
/* r=(y-y`)*rcp_table(y`) */
mulps %xmm12, %xmm4
/* eax = one bit per lane that needs the scalar fallback */
movmskps %xmm1, %eax
/* Biased exponent-1 */
movdqu _iSignMask+__svml_scbrt_data_internal(%rip), %xmm13
/* xmm1 = _sCbrtHL byte offset, forced to 0 in fallback lanes */
pandn %xmm3, %xmm1
/*
* Add 2/3*(bias-1)+1 to (k+1/3*(bias-1))
* Attach sign to exponent
*/
movdqu _iBias+__svml_scbrt_data_internal(%rip), %xmm12
/* xmm2 = sign of x, located at bit 8 of (x >> 23) */
pand %xmm13, %xmm2
paddd %xmm5, %xmm12
/* Load Cbrt table Hi & Lo values */
movd %xmm1, %r8d
por %xmm2, %xmm12
pshufd $1, %xmm1, %xmm2
/* Shift sign|exponent into float position: xmm12 = +/- 2^k */
pslld $23, %xmm12
pshufd $2, %xmm1, %xmm7
pshufd $3, %xmm1, %xmm1
movd %xmm2, %r9d
movd %xmm7, %r10d
movd %xmm1, %r11d
/* Polynomial: P = p1 + p2*r (only _sP1 and _sP2 are used here) */
movups _sP2+__svml_scbrt_data_internal(%rip), %xmm11
mulps %xmm4, %xmm11
movslq %r8d, %r8
addps _sP1+__svml_scbrt_data_internal(%rip), %xmm11
movslq %r9d, %r9
movslq %r10d, %r10
movslq %r11d, %r11
/* Gather the four _sCbrtHL entries (the table starts 128 bytes into the
   data block) */
movd 128(%rdx,%r8), %xmm10
movd 128(%rdx,%r9), %xmm3
movd 128(%rdx,%r10), %xmm9
movd 128(%rdx,%r11), %xmm8
punpckldq %xmm3, %xmm10
punpckldq %xmm8, %xmm9
punpcklqdq %xmm9, %xmm10
/* sCbrtHi *= 2^k */
mulps %xmm10, %xmm12
/* T`*r */
mulps %xmm12, %xmm4
/* (T`*r)*P */
mulps %xmm4, %xmm11
/*
* result = T` + T`*r*P
* (no separate D` term is added in this code path)
*/
addps %xmm11, %xmm12
testl %eax, %eax
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm12
/* Restore registers
* and exit the function
*/
L(EXIT):
movaps %xmm12, %xmm0
addq $72, %rsp
cfi_def_cfa_offset(8)
ret
cfi_def_cfa_offset(80)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
/* Spill the input and the fast-path result so single lanes can be read
   and overwritten through memory */
movups %xmm0, 32(%rsp)
movups %xmm12, 48(%rsp)
# LOE rbx rbp r12 r13 r14 r15 eax
xorl %edx, %edx
/* r12d = lane counter (starts at 0), r13d = fallback lane bitmask; both
   are callee-saved registers, so they survive the cbrtf call */
movq %r12, 16(%rsp)
cfi_offset(12, -64)
movl %edx, %r12d
movq %r13, 8(%rsp)
cfi_offset(13, -72)
movl %eax, %r13d
movq %r14, (%rsp)
cfi_offset(14, -80)
# LOE rbx rbp r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
/* CF = bit r12d of the mask */
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx rbp r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $4, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx rbp r15 r12d r13d
/* All four lanes visited: restore callee-saved registers and reload the
   patched result vector */
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
movups 48(%rsp), %xmm12
/* Go to exit */
jmp L(EXIT)
cfi_offset(12, -64)
cfi_offset(13, -72)
cfi_offset(14, -80)
# LOE rbx rbp r12 r13 r14 r15 xmm12
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
/* r14 = zero-extended lane index, used to address the spill slots */
movl %r12d, %r14d
movss 32(%rsp,%r14,4), %xmm0
call cbrtf@PLT
# LOE rbx rbp r14 r15 r12d r13d xmm0
/* Overwrite this lane of the saved result with the scalar value */
movss %xmm0, 48(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx rbp r15 r12d r13d
END(_ZGVbN4v_cbrtf_sse4)
/* Read-only constant table for _ZGVbN4v_cbrtf_sse4.  The field order and
   sizes must match the offset macros (_sRcp ... _iCmpConst) defined at
   the top of this file.  */
.section .rodata, "a"
.align 16
/* Layout description only: compiled solely when
   __svml_scbrt_data_internal_typedef is defined.  */
#ifdef __svml_scbrt_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(16)) VUINT32 _sRcp[32][1];
__declspec(align(16)) VUINT32 _sCbrtHL[96][1];
__declspec(align(16)) VUINT32 _sP2[4][1];
__declspec(align(16)) VUINT32 _sP1[4][1];
__declspec(align(16)) VUINT32 _sMantissaMask[4][1];
__declspec(align(16)) VUINT32 _sMantissaMask1[4][1];
__declspec(align(16)) VUINT32 _sExpMask[4][1];
__declspec(align(16)) VUINT32 _sExpMask1[4][1];
__declspec(align(16)) VUINT32 _iRcpIndexMask[4][1];
__declspec(align(16)) VUINT32 _iBExpMask[4][1];
__declspec(align(16)) VUINT32 _iSignMask[4][1];
__declspec(align(16)) VUINT32 _iBias[4][1];
__declspec(align(16)) VUINT32 _iOne[4][1];
__declspec(align(16)) VUINT32 _i555[4][1];
__declspec(align(16)) VUINT32 _iAbsMask[4][1];
__declspec(align(16)) VUINT32 _iSubConst[4][1];
__declspec(align(16)) VUINT32 _iCmpConst[4][1];
} __svml_scbrt_data_internal;
#endif
__svml_scbrt_data_internal:
/*== _sRcp ==*/
/* 32 entries, one per value of the five leading mantissa bits.  Note that
   the stored values are the NEGATED reciprocals (sign bit set), as the
   decimal value at the end of each comment shows; the formula in
   parentheses gives the magnitude only.  */
.long 0xBF7C0FC1 /* (1/(1+0/32+1/64)) = -.984615 */
.long 0xBF74898D /* (1/(1+1/32+1/64)) = -.955224 */
.long 0xBF6D7304 /* (1/(1+2/32+1/64)) = -.927536 */
.long 0xBF66C2B4 /* (1/(1+3/32+1/64)) = -.901408 */
.long 0xBF607038 /* (1/(1+4/32+1/64)) = -.876712 */
.long 0xBF5A740E /* (1/(1+5/32+1/64)) = -.853333 */
.long 0xBF54C77B /* (1/(1+6/32+1/64)) = -.831169 */
.long 0xBF4F6475 /* (1/(1+7/32+1/64)) = -.810127 */
.long 0xBF4A4588 /* (1/(1+8/32+1/64)) = -.790123 */
.long 0xBF4565C8 /* (1/(1+9/32+1/64)) = -.771084 */
.long 0xBF40C0C1 /* (1/(1+10/32+1/64)) = -.752941 */
.long 0xBF3C5264 /* (1/(1+11/32+1/64)) = -.735632 */
.long 0xBF381703 /* (1/(1+12/32+1/64)) = -.719101 */
.long 0xBF340B41 /* (1/(1+13/32+1/64)) = -.703297 */
.long 0xBF302C0B /* (1/(1+14/32+1/64)) = -.688172 */
.long 0xBF2C7692 /* (1/(1+15/32+1/64)) = -.673684 */
.long 0xBF28E83F /* (1/(1+16/32+1/64)) = -.659794 */
.long 0xBF257EB5 /* (1/(1+17/32+1/64)) = -.646465 */
.long 0xBF2237C3 /* (1/(1+18/32+1/64)) = -.633663 */
.long 0xBF1F1166 /* (1/(1+19/32+1/64)) = -.621359 */
.long 0xBF1C09C1 /* (1/(1+20/32+1/64)) = -.609524 */
.long 0xBF191F1A /* (1/(1+21/32+1/64)) = -.598131 */
.long 0xBF164FDA /* (1/(1+22/32+1/64)) = -.587156 */
.long 0xBF139A86 /* (1/(1+23/32+1/64)) = -.576577 */
.long 0xBF10FDBC /* (1/(1+24/32+1/64)) = -.566372 */
.long 0xBF0E7835 /* (1/(1+25/32+1/64)) = -.556522 */
.long 0xBF0C08C1 /* (1/(1+26/32+1/64)) = -.547009 */
.long 0xBF09AE41 /* (1/(1+27/32+1/64)) = -.537815 */
.long 0xBF0767AB /* (1/(1+28/32+1/64)) = -.528926 */
.long 0xBF053408 /* (1/(1+29/32+1/64)) = -.520325 */
.long 0xBF03126F /* (1/(1+30/32+1/64)) = -.512 */
.long 0xBF010204 /* (1/(1+31/32+1/64)) = -.503937 */
/*== _sCbrtHL ==*/
/* 96 entries = 3 groups (2^0, 2^1, 2^2) of 32 mantissa intervals, addressed
   by the byte offset 4*l + 128*j.  Only HI values are stored here.  */
.align 16
.long 0x3F80A9C9 /* HI((2^0*(1+0/32+1/64))^(1/3)) = 1.005181 */
.long 0x3F81F833 /* HI((2^0*(1+1/32+1/64))^(1/3)) = 1.015387 */
.long 0x3F834007 /* HI((2^0*(1+2/32+1/64))^(1/3)) = 1.025391 */
.long 0x3F848194 /* HI((2^0*(1+3/32+1/64))^(1/3)) = 1.035204 */
.long 0x3F85BD25 /* HI((2^0*(1+4/32+1/64))^(1/3)) = 1.044835 */
.long 0x3F86F300 /* HI((2^0*(1+5/32+1/64))^(1/3)) = 1.054291 */
.long 0x3F882365 /* HI((2^0*(1+6/32+1/64))^(1/3)) = 1.06358 */
.long 0x3F894E90 /* HI((2^0*(1+7/32+1/64))^(1/3)) = 1.07271 */
.long 0x3F8A74B9 /* HI((2^0*(1+8/32+1/64))^(1/3)) = 1.081687 */
.long 0x3F8B9615 /* HI((2^0*(1+9/32+1/64))^(1/3)) = 1.090518 */
.long 0x3F8CB2D4 /* HI((2^0*(1+10/32+1/64))^(1/3)) = 1.099207 */
.long 0x3F8DCB24 /* HI((2^0*(1+11/32+1/64))^(1/3)) = 1.107762 */
.long 0x3F8EDF31 /* HI((2^0*(1+12/32+1/64))^(1/3)) = 1.116186 */
.long 0x3F8FEF22 /* HI((2^0*(1+13/32+1/64))^(1/3)) = 1.124485 */
.long 0x3F90FB1F /* HI((2^0*(1+14/32+1/64))^(1/3)) = 1.132664 */
.long 0x3F92034C /* HI((2^0*(1+15/32+1/64))^(1/3)) = 1.140726 */
.long 0x3F9307CA /* HI((2^0*(1+16/32+1/64))^(1/3)) = 1.148675 */
.long 0x3F9408B9 /* HI((2^0*(1+17/32+1/64))^(1/3)) = 1.156516 */
.long 0x3F950638 /* HI((2^0*(1+18/32+1/64))^(1/3)) = 1.164252 */
.long 0x3F960064 /* HI((2^0*(1+19/32+1/64))^(1/3)) = 1.171887 */
.long 0x3F96F759 /* HI((2^0*(1+20/32+1/64))^(1/3)) = 1.179423 */
.long 0x3F97EB2F /* HI((2^0*(1+21/32+1/64))^(1/3)) = 1.186865 */
.long 0x3F98DC01 /* HI((2^0*(1+22/32+1/64))^(1/3)) = 1.194214 */
.long 0x3F99C9E5 /* HI((2^0*(1+23/32+1/64))^(1/3)) = 1.201474 */
.long 0x3F9AB4F2 /* HI((2^0*(1+24/32+1/64))^(1/3)) = 1.208647 */
.long 0x3F9B9D3D /* HI((2^0*(1+25/32+1/64))^(1/3)) = 1.215736 */
.long 0x3F9C82DA /* HI((2^0*(1+26/32+1/64))^(1/3)) = 1.222743 */
.long 0x3F9D65DD /* HI((2^0*(1+27/32+1/64))^(1/3)) = 1.229671 */
.long 0x3F9E4659 /* HI((2^0*(1+28/32+1/64))^(1/3)) = 1.236522 */
.long 0x3F9F245F /* HI((2^0*(1+29/32+1/64))^(1/3)) = 1.243297 */
.long 0x3FA00000 /* HI((2^0*(1+30/32+1/64))^(1/3)) = 1.25 */
.long 0x3FA0D94C /* HI((2^0*(1+31/32+1/64))^(1/3)) = 1.256631 */
.long 0x3FA21B02 /* HI((2^1*(1+0/32+1/64))^(1/3)) = 1.266449 */
.long 0x3FA3C059 /* HI((2^1*(1+1/32+1/64))^(1/3)) = 1.279307 */
.long 0x3FA55D61 /* HI((2^1*(1+2/32+1/64))^(1/3)) = 1.291912 */
.long 0x3FA6F282 /* HI((2^1*(1+3/32+1/64))^(1/3)) = 1.304276 */
.long 0x3FA8801A /* HI((2^1*(1+4/32+1/64))^(1/3)) = 1.316409 */
.long 0x3FAA067E /* HI((2^1*(1+5/32+1/64))^(1/3)) = 1.328323 */
.long 0x3FAB8602 /* HI((2^1*(1+6/32+1/64))^(1/3)) = 1.340027 */
.long 0x3FACFEEF /* HI((2^1*(1+7/32+1/64))^(1/3)) = 1.35153 */
.long 0x3FAE718E /* HI((2^1*(1+8/32+1/64))^(1/3)) = 1.36284 */
.long 0x3FAFDE1F /* HI((2^1*(1+9/32+1/64))^(1/3)) = 1.373966 */
.long 0x3FB144E1 /* HI((2^1*(1+10/32+1/64))^(1/3)) = 1.384915 */
.long 0x3FB2A60D /* HI((2^1*(1+11/32+1/64))^(1/3)) = 1.395692 */
.long 0x3FB401DA /* HI((2^1*(1+12/32+1/64))^(1/3)) = 1.406307 */
.long 0x3FB5587B /* HI((2^1*(1+13/32+1/64))^(1/3)) = 1.416763 */
.long 0x3FB6AA20 /* HI((2^1*(1+14/32+1/64))^(1/3)) = 1.427067 */
.long 0x3FB7F6F7 /* HI((2^1*(1+15/32+1/64))^(1/3)) = 1.437224 */
.long 0x3FB93F29 /* HI((2^1*(1+16/32+1/64))^(1/3)) = 1.44724 */
.long 0x3FBA82E1 /* HI((2^1*(1+17/32+1/64))^(1/3)) = 1.457119 */
.long 0x3FBBC244 /* HI((2^1*(1+18/32+1/64))^(1/3)) = 1.466866 */
.long 0x3FBCFD77 /* HI((2^1*(1+19/32+1/64))^(1/3)) = 1.476485 */
.long 0x3FBE349B /* HI((2^1*(1+20/32+1/64))^(1/3)) = 1.48598 */
.long 0x3FBF67D3 /* HI((2^1*(1+21/32+1/64))^(1/3)) = 1.495356 */
.long 0x3FC0973C /* HI((2^1*(1+22/32+1/64))^(1/3)) = 1.504615 */
.long 0x3FC1C2F6 /* HI((2^1*(1+23/32+1/64))^(1/3)) = 1.513762 */
.long 0x3FC2EB1A /* HI((2^1*(1+24/32+1/64))^(1/3)) = 1.5228 */
.long 0x3FC40FC6 /* HI((2^1*(1+25/32+1/64))^(1/3)) = 1.531731 */
.long 0x3FC53112 /* HI((2^1*(1+26/32+1/64))^(1/3)) = 1.54056 */
.long 0x3FC64F16 /* HI((2^1*(1+27/32+1/64))^(1/3)) = 1.549289 */
.long 0x3FC769EB /* HI((2^1*(1+28/32+1/64))^(1/3)) = 1.55792 */
.long 0x3FC881A6 /* HI((2^1*(1+29/32+1/64))^(1/3)) = 1.566457 */
.long 0x3FC9965D /* HI((2^1*(1+30/32+1/64))^(1/3)) = 1.574901 */
.long 0x3FCAA825 /* HI((2^1*(1+31/32+1/64))^(1/3)) = 1.583256 */
.long 0x3FCC3D79 /* HI((2^2*(1+0/32+1/64))^(1/3)) = 1.595626 */
.long 0x3FCE5054 /* HI((2^2*(1+1/32+1/64))^(1/3)) = 1.611826 */
.long 0x3FD058B8 /* HI((2^2*(1+2/32+1/64))^(1/3)) = 1.627707 */
.long 0x3FD25726 /* HI((2^2*(1+3/32+1/64))^(1/3)) = 1.643285 */
.long 0x3FD44C15 /* HI((2^2*(1+4/32+1/64))^(1/3)) = 1.658572 */
.long 0x3FD637F2 /* HI((2^2*(1+5/32+1/64))^(1/3)) = 1.673582 */
.long 0x3FD81B24 /* HI((2^2*(1+6/32+1/64))^(1/3)) = 1.688328 */
.long 0x3FD9F60B /* HI((2^2*(1+7/32+1/64))^(1/3)) = 1.702821 */
.long 0x3FDBC8FE /* HI((2^2*(1+8/32+1/64))^(1/3)) = 1.717071 */
.long 0x3FDD9452 /* HI((2^2*(1+9/32+1/64))^(1/3)) = 1.731089 */
.long 0x3FDF5853 /* HI((2^2*(1+10/32+1/64))^(1/3)) = 1.744883 */
.long 0x3FE1154B /* HI((2^2*(1+11/32+1/64))^(1/3)) = 1.758462 */
.long 0x3FE2CB7F /* HI((2^2*(1+12/32+1/64))^(1/3)) = 1.771835 */
.long 0x3FE47B2E /* HI((2^2*(1+13/32+1/64))^(1/3)) = 1.785009 */
.long 0x3FE62496 /* HI((2^2*(1+14/32+1/64))^(1/3)) = 1.797992 */
.long 0x3FE7C7F0 /* HI((2^2*(1+15/32+1/64))^(1/3)) = 1.810789 */
.long 0x3FE96571 /* HI((2^2*(1+16/32+1/64))^(1/3)) = 1.823408 */
.long 0x3FEAFD4C /* HI((2^2*(1+17/32+1/64))^(1/3)) = 1.835855 */
.long 0x3FEC8FB3 /* HI((2^2*(1+18/32+1/64))^(1/3)) = 1.848135 */
.long 0x3FEE1CD3 /* HI((2^2*(1+19/32+1/64))^(1/3)) = 1.860255 */
.long 0x3FEFA4D7 /* HI((2^2*(1+20/32+1/64))^(1/3)) = 1.872218 */
.long 0x3FF127E9 /* HI((2^2*(1+21/32+1/64))^(1/3)) = 1.88403 */
.long 0x3FF2A62F /* HI((2^2*(1+22/32+1/64))^(1/3)) = 1.895697 */
.long 0x3FF41FD0 /* HI((2^2*(1+23/32+1/64))^(1/3)) = 1.907221 */
.long 0x3FF594EE /* HI((2^2*(1+24/32+1/64))^(1/3)) = 1.918607 */
.long 0x3FF705AC /* HI((2^2*(1+25/32+1/64))^(1/3)) = 1.929861 */
.long 0x3FF8722A /* HI((2^2*(1+26/32+1/64))^(1/3)) = 1.940984 */
.long 0x3FF9DA86 /* HI((2^2*(1+27/32+1/64))^(1/3)) = 1.951981 */
.long 0x3FFB3EDE /* HI((2^2*(1+28/32+1/64))^(1/3)) = 1.962856 */
.long 0x3FFC9F4E /* HI((2^2*(1+29/32+1/64))^(1/3)) = 1.973612 */
.long 0x3FFDFBF2 /* HI((2^2*(1+30/32+1/64))^(1/3)) = 1.984251 */
.long 0x3FFF54E3 /* HI((2^2*(1+31/32+1/64))^(1/3)) = 1.994778 */
/* From here on every field is a single 16-byte vector with the same value
   replicated in all four lanes.  */
.align 16
.long 0xBDE3A962, 0xBDE3A962, 0xBDE3A962, 0xBDE3A962 /* _sP2 */
.align 16
.long 0x3EAAAC91, 0x3EAAAC91, 0x3EAAAC91, 0x3EAAAC91 /* _sP1 */
.align 16
.long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff /* _sMantissaMask (EXP_MSK3) */
.align 16
.long 0x007e0000, 0x007e0000, 0x007e0000, 0x007e0000 /* _sMantissaMask1 (SIG_MASK) */
.align 16
.long 0xBF800000, 0xBF800000, 0xBF800000, 0xBF800000 /* _sExpMask (EXP_MASK) */
.align 16
.long 0xBF820000, 0xBF820000, 0xBF820000, 0xBF820000 /* _sExpMask1 (EXP_MASK2) */
.align 16
.long 0x0000007c, 0x0000007c, 0x0000007c, 0x0000007c /* _iRcpIndexMask */
.align 16
.long 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff /* _iBExpMask */
.align 16
.long 0x00000100, 0x00000100, 0x00000100, 0x00000100 /* _iSignMask */
.align 16
.long 0x00000055, 0x00000055, 0x00000055, 0x00000055 /* _iBias */
.align 16
.long 0x00000001, 0x00000001, 0x00000001, 0x00000001 /* _iOne */
.align 16
.long 0x00000555, 0x00000555, 0x00000555, 0x00000555 /* _i555 */
.align 16
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _iAbsMask */
.align 16
.long 0x80800000, 0x80800000, 0x80800000, 0x80800000 /* _iSubConst */
.align 16
.long 0xFEFFFFFF, 0xFEFFFFFF, 0xFEFFFFFF, 0xFEFFFFFF /* _iCmpConst */
.align 16
.type __svml_scbrt_data_internal,@object
.size __svml_scbrt_data_internal,.-__svml_scbrt_data_internal
/* Mask that keeps dword lanes 0 and 2 and clears lanes 1 and 3.  The code
   applies it with pand to the even-lane pmuludq products so that only the
   low 32 bits of each 64-bit product remain.  The 16-byte alignment is
   required because it is used as a pand memory operand.  */
.align 16
.FLT_17:
.long 0xffffffff,0x00000000,0xffffffff,0x00000000
.type .FLT_17,@object
.size .FLT_17,16

View File

@ -0,0 +1,20 @@
/* SSE version of vectorized cbrtf, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Every occurrence of the token _ZGVdN8v_cbrtf in the core file included
   below is replaced by the preprocessor, so whatever that file defines
   under the generic name is assembled here as _ZGVdN8v_cbrtf_sse_wrapper.  */
#define _ZGVdN8v_cbrtf _ZGVdN8v_cbrtf_sse_wrapper
#include "../svml_s_cbrtf8_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized cbrtf, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME is set before the include.  REDIRECT_NAME and IFUNC_SELECTOR
   are not defined in this file; presumably the included header derives or
   provides them (TODO confirm against ifunc-mathvec-avx2.h).  */
#define SYMBOL_NAME _ZGVdN8v_cbrtf
#include "ifunc-mathvec-avx2.h"
/* Bind SYMBOL_NAME to the result of IFUNC_SELECTOR () through the
   libc_ifunc_redirected macro (defined elsewhere).  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Only when SHARED is defined: declare the __GI__ZGVdN8v_cbrtf name with
   hidden visibility via __hidden_ver1 (macro defined elsewhere).  */
__hidden_ver1 (_ZGVdN8v_cbrtf, __GI__ZGVdN8v_cbrtf,
__redirect__ZGVdN8v_cbrtf)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,509 @@
/* Function cbrtf vectorized with AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* x=2^{3*k+j} * 1.b1 b2 ... b5 b6 ... b23
* Let r=(x*2^{-3k-j} - 1.b1 b2 ... b5 1)* rcp[b1 b2 ..b5],
* where rcp[b1 b2 .. b5]=1/(1.b1 b2 b3 b4 b5 1) in single precision
* cbrtf(2^j * 1. b1 b2 .. b5 1) is approximated as T[j][b1..b5]+D[j][b1..b5]
* (T stores the high 24 bits, D stores the low order bits)
* Result=2^k*T+(2^k*T*r)*P+2^k*D
* where P=p1+p2*r+..
*
*/
/* Offsets for data table __svml_scbrt_data_internal
*/
#define _sRcp 0
#define _sCbrtHL 128
#define _sP2 512
#define _sP1 544
#define _sMantissaMask 576
#define _sMantissaMask1 608
#define _sExpMask 640
#define _sExpMask1 672
#define _iRcpIndexMask 704
#define _iBExpMask 736
#define _iSignMask 768
#define _iBias 800
#define _iOne 832
#define _i555 864
#define _iAbsMask 896
#define _iSubConst 928
#define _iCmpConst 960
#include <sysdep.h>
.text
.section .text.avx2,"ax",@progbits
/* 8-lane single-precision cube root, AVX2 implementation.
   In:   ymm0 = eight packed floats x.
   Out:  ymm0 = eight packed cbrtf (x).
   Lanes whose |x| lies outside the normal range (zero/denormal, or
   Inf/NaN -- see the _iSubConst/_iCmpConst test) are recomputed one by
   one with the scalar cbrtf.
   Frame (rbp-based, rsp aligned down to 32, 96 bytes of locals):
     0/8/16(%rsp)  saved r14/r13/r12 (special path only)
     32(%rsp)      copy of the input vector
     64(%rsp)      result vector patched by the scalar fallback  */
ENTRY(_ZGVdN8v_cbrtf_avx2)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-32, %rsp
	subq	$96, %rsp
	/* rdx = base of the data table, used for all indexed table loads
	   (RIP-relative addressing cannot take an index register).  */
	lea	__svml_scbrt_data_internal(%rip), %rdx
	/* Keep the original argument in ymm5; ymm0 is reused as scratch.  */
	vmovaps	%ymm0, %ymm5
	/*
	 * Reciprocal index calculation:
	 * ymm4 = (x >> 16) & _iRcpIndexMask = 4 * (top five mantissa bits),
	 * i.e. a byte offset into the 32-entry _sRcp table.
	 */
	vpsrld	$16, %ymm5, %ymm3
	vpand	_iRcpIndexMask+__svml_scbrt_data_internal(%rip), %ymm3, %ymm4
	/* Move the eight offsets to GPRs: there is no gather here, each
	   table entry is fetched with a scalar vmovd.  */
	vextractf128 $1, %ymm4, %xmm15
	vmovd	%xmm4, %eax
	vmovd	%xmm15, %r8d
	vpextrd	$1, %xmm15, %r9d
	vpextrd	$2, %xmm15, %r10d
	vpextrd	$3, %xmm15, %r11d
	movslq	%r8d, %r8
	movslq	%r9d, %r9
	movslq	%r10d, %r10
	movslq	%r11d, %r11
	vpextrd	$1, %xmm4, %ecx
	vpextrd	$2, %xmm4, %esi
	vpextrd	$3, %xmm4, %edi
	movslq	%eax, %rax
	movslq	%ecx, %rcx
	movslq	%esi, %rsi
	movslq	%edi, %rdi
	/* Reciprocal table entries for lanes 4..7.  */
	vmovd	_sRcp(%rdx,%r8), %xmm13
	vmovd	_sRcp(%rdx,%r9), %xmm14
	vmovd	_sRcp(%rdx,%r10), %xmm1
	vmovd	_sRcp(%rdx,%r11), %xmm0
	vpunpckldq %xmm14, %xmm13, %xmm2
	vpunpckldq %xmm0, %xmm1, %xmm13
	/* ymm0 = x >> 23: sign bit (bit 8) followed by the biased exponent.  */
	vpsrld	$7, %ymm3, %ymm0
	/* Reciprocal table entries for lanes 0..3.  */
	vmovd	_sRcp(%rdx,%rax), %xmm6
	vmovd	_sRcp(%rdx,%rcx), %xmm7
	vmovd	_sRcp(%rdx,%rsi), %xmm8
	vmovd	_sRcp(%rdx,%rdi), %xmm9
	vpunpckldq %xmm7, %xmm6, %xmm10
	vpunpckldq %xmm9, %xmm8, %xmm11
	vpunpcklqdq %xmm11, %xmm10, %xmm12
	vpunpcklqdq %xmm13, %xmm2, %xmm6
	/* ymm3 = |x| (integer view), input to the special-case test.  */
	vandps	_iAbsMask+__svml_scbrt_data_internal(%rip), %ymm5, %ymm3
	/*
	 * Argument reduction:
	 * y  = full mantissa of x under the sign/exponent of _sExpMask
	 * y` = leading mantissa bits of x under _sExpMask1
	 */
	vandps	_sMantissaMask+__svml_scbrt_data_internal(%rip), %ymm5, %ymm8
	vandps	_sMantissaMask1+__svml_scbrt_data_internal(%rip), %ymm5, %ymm9
	/* Special-case test, part 1: ymm7 = |x| - _iSubConst (wrapping).  */
	vpsubd	_iSubConst+__svml_scbrt_data_internal(%rip), %ymm3, %ymm7
	vorps	_sExpMask+__svml_scbrt_data_internal(%rip), %ymm8, %ymm10
	vorps	_sExpMask1+__svml_scbrt_data_internal(%rip), %ymm9, %ymm11
	/* r=y-y` */
	vsubps	%ymm11, %ymm10, %ymm15
	/* Isolate the sign bit (bit 8 of x >> 23).  */
	vpand	_iSignMask+__svml_scbrt_data_internal(%rip), %ymm0, %ymm8
	/* Special-case test, part 2: signed compare against _iCmpConst;
	   eax receives one mask bit per lane needing the scalar fallback.  */
	vpcmpgtd _iCmpConst+__svml_scbrt_data_internal(%rip), %ymm7, %ymm2
	vmovmskps %ymm2, %eax
	/* ymm14 = the eight gathered reciprocal values.  */
	vinsertf128 $1, %xmm6, %ymm12, %ymm14
	/* Get absolute biased exponent */
	vpand	_iBExpMask+__svml_scbrt_data_internal(%rip), %ymm0, %ymm6
	/* r=(y-y`)*rcp_table(y`) */
	vmulps	%ymm15, %ymm14, %ymm1
	/* Biased exponent-1 */
	vpsubd	_iOne+__svml_scbrt_data_internal(%rip), %ymm6, %ymm10
	/*
	 * Calculate exponent/3
	 * i555Exp=(2^{12}-1)/3*exponent
	 */
	vpmulld	_i555+__svml_scbrt_data_internal(%rip), %ymm6, %ymm3
	/* Get K (exponent=3*k+j) */
	vpsrld	$12, %ymm3, %ymm13
	/* Get J = (exponent-1) - 3*K, formed by three successive
	   subtractions of K (interleaved with the exponent rebuild).  */
	vpsubd	%ymm13, %ymm10, %ymm11
	/* Add 2/3*(bias-1)+1 to (k+1/3*(bias-1)) */
	vpaddd	_iBias+__svml_scbrt_data_internal(%rip), %ymm13, %ymm7
	vpsubd	%ymm13, %ymm11, %ymm12
	/* Attach sign to exponent */
	vpor	%ymm8, %ymm7, %ymm9
	vpsubd	%ymm13, %ymm12, %ymm14
	/* Shift sign+exponent into float position: ymm0 = +/-2^k scale.  */
	vpslld	$23, %ymm9, %ymm0
	/* Get 128*J */
	vpslld	$7, %ymm14, %ymm15
	/* iCbrtIndex=4*l+128*j */
	vpaddd	%ymm15, %ymm4, %ymm4
	/* Zero index if callout expected, so the table load of a special
	   lane always stays inside the table.  */
	vpandn	%ymm4, %ymm2, %ymm4
	/* Load cube-root table values (Hi part only), one lane at a time.  */
	vmovd	%xmm4, %ecx
	vextractf128 $1, %ymm4, %xmm13
	vpextrd	$1, %xmm4, %esi
	movslq	%ecx, %rcx
	movslq	%esi, %rsi
	vmovd	%xmm13, %r9d
	vmovd	_sCbrtHL(%rdx,%rcx), %xmm2
	vpextrd	$2, %xmm4, %edi
	vpextrd	$3, %xmm4, %r8d
	vmovd	_sCbrtHL(%rdx,%rsi), %xmm3
	vpextrd	$1, %xmm13, %r10d
	vpextrd	$2, %xmm13, %ecx
	vpextrd	$3, %xmm13, %esi
	movslq	%edi, %rdi
	movslq	%r8d, %r8
	movslq	%r9d, %r9
	movslq	%r10d, %r10
	movslq	%ecx, %rcx
	movslq	%esi, %rsi
	vmovd	_sCbrtHL(%rdx,%rdi), %xmm6
	vmovd	_sCbrtHL(%rdx,%r8), %xmm7
	vmovd	_sCbrtHL(%rdx,%r9), %xmm11
	vmovd	_sCbrtHL(%rdx,%r10), %xmm12
	vmovd	_sCbrtHL(%rdx,%rcx), %xmm14
	vmovd	_sCbrtHL(%rdx,%rsi), %xmm15
	vpunpckldq %xmm3, %xmm2, %xmm8
	vpunpckldq %xmm7, %xmm6, %xmm9
	vpunpckldq %xmm12, %xmm11, %xmm4
	vpunpckldq %xmm15, %xmm14, %xmm11
	vpunpcklqdq %xmm9, %xmm8, %xmm10
	vpunpcklqdq %xmm11, %xmm4, %xmm2
	vinsertf128 $1, %xmm2, %ymm10, %ymm3
	/* T` = sCbrtHi * 2^k (sign included) */
	vmulps	%ymm3, %ymm0, %ymm2
	/* Polynomial: P = p1 + p2*r */
	vmovups	_sP2+__svml_scbrt_data_internal(%rip), %ymm0
	vfmadd213ps _sP1+__svml_scbrt_data_internal(%rip), %ymm1, %ymm0
	/* T`*r */
	vmulps	%ymm2, %ymm1, %ymm1
	/* (T`*r)*P */
	vmulps	%ymm1, %ymm0, %ymm0
	/* result = T` + (T`*r)*P */
	vaddps	%ymm0, %ymm2, %ymm0
	testl	%eax, %eax
	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 eax ymm0 ymm5
	/* Restore the frame and return; ymm0 holds the result.  */
L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	/* Unwind state for the code below, which still runs inside the
	   rbp frame.  */
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	/* Special inputs: spill the argument and the vector result so that
	   flagged lanes can be patched individually.  */
L(SPECIAL_VALUES_BRANCH):
	vmovups	%ymm5, 32(%rsp)
	vmovups	%ymm0, 64(%rsp)
	# LOE rbx r12 r13 r14 r15 eax ymm0
	xorl	%edx, %edx
	# LOE rbx r12 r13 r14 r15 eax edx
	/* Clear the upper ymm halves before calling scalar (SSE) code.  */
	vzeroupper
	/* r12 = lane counter, r13 = special-lane mask, r14 = lane index
	   for addressing; all three are callee-saved, so save them first.  */
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r12d
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r13d
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d
	/* Test bit r12d of the special-lane mask.  */
L(RANGEMASK_CHECK):
	btl	%r12d, %r13d
	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d
	/* Advance to the next lane; stop after all eight.  */
L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d
	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d
	/* All lanes handled: restore callee-saved registers and reload
	   the patched result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	64(%rsp), %ymm0
	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 ymm0
	/* Scalar math function call to process one special input:
	   load lane r12 of the saved argument, call cbrtf, and store the
	   value returned in xmm0 into the same lane of the saved result.  */
L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	movss	32(%rsp,%r14,4), %xmm0
	call	cbrtf@PLT
	# LOE rbx r14 r15 r12d r13d xmm0
	movss	%xmm0, 64(%rsp,%r14,4)
	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVdN8v_cbrtf_avx2)
/* Read-only constant table for the AVX2 cbrtf kernel.  Every field starts
   on a 32-byte boundary so the _sXXX/_iXXX byte offsets defined at the top
   of the file address it directly.  */
.section .rodata, "a"
.align 32
/* Layout description of the table below, in C notation.
   NOTE(review): nothing in this file defines
   __svml_scbrt_data_internal_typedef, so this appears to be
   documentation only -- confirm.  */
#ifdef __svml_scbrt_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(32)) VUINT32 _sRcp[32][1];
__declspec(align(32)) VUINT32 _sCbrtHL[96][1];
__declspec(align(32)) VUINT32 _sP2[8][1];
__declspec(align(32)) VUINT32 _sP1[8][1];
__declspec(align(32)) VUINT32 _sMantissaMask[8][1];
__declspec(align(32)) VUINT32 _sMantissaMask1[8][1];
__declspec(align(32)) VUINT32 _sExpMask[8][1];
__declspec(align(32)) VUINT32 _sExpMask1[8][1];
__declspec(align(32)) VUINT32 _iRcpIndexMask[8][1];
__declspec(align(32)) VUINT32 _iBExpMask[8][1];
__declspec(align(32)) VUINT32 _iSignMask[8][1];
__declspec(align(32)) VUINT32 _iBias[8][1];
__declspec(align(32)) VUINT32 _iOne[8][1];
__declspec(align(32)) VUINT32 _i555[8][1];
__declspec(align(32)) VUINT32 _iAbsMask[8][1];
__declspec(align(32)) VUINT32 _iSubConst[8][1];
__declspec(align(32)) VUINT32 _iCmpConst[8][1];
} __svml_scbrt_data_internal;
#endif
__svml_scbrt_data_internal:
/*== _sRcp ==*/
/* 32 single-precision reciprocals, one per value of the top five
   mantissa bits.  The entries are stored NEGATED (sign bit set): the
   reduced operands are built with the negative exponent masks
   _sExpMask/_sExpMask1, so the product with y-y` comes out positive.  */
.long 0xBF7C0FC1 /* -(1/(1+0/32+1/64)) = -.984615 */
.long 0xBF74898D /* -(1/(1+1/32+1/64)) = -.955224 */
.long 0xBF6D7304 /* -(1/(1+2/32+1/64)) = -.927536 */
.long 0xBF66C2B4 /* -(1/(1+3/32+1/64)) = -.901408 */
.long 0xBF607038 /* -(1/(1+4/32+1/64)) = -.876712 */
.long 0xBF5A740E /* -(1/(1+5/32+1/64)) = -.853333 */
.long 0xBF54C77B /* -(1/(1+6/32+1/64)) = -.831169 */
.long 0xBF4F6475 /* -(1/(1+7/32+1/64)) = -.810127 */
.long 0xBF4A4588 /* -(1/(1+8/32+1/64)) = -.790123 */
.long 0xBF4565C8 /* -(1/(1+9/32+1/64)) = -.771084 */
.long 0xBF40C0C1 /* -(1/(1+10/32+1/64)) = -.752941 */
.long 0xBF3C5264 /* -(1/(1+11/32+1/64)) = -.735632 */
.long 0xBF381703 /* -(1/(1+12/32+1/64)) = -.719101 */
.long 0xBF340B41 /* -(1/(1+13/32+1/64)) = -.703297 */
.long 0xBF302C0B /* -(1/(1+14/32+1/64)) = -.688172 */
.long 0xBF2C7692 /* -(1/(1+15/32+1/64)) = -.673684 */
.long 0xBF28E83F /* -(1/(1+16/32+1/64)) = -.659794 */
.long 0xBF257EB5 /* -(1/(1+17/32+1/64)) = -.646465 */
.long 0xBF2237C3 /* -(1/(1+18/32+1/64)) = -.633663 */
.long 0xBF1F1166 /* -(1/(1+19/32+1/64)) = -.621359 */
.long 0xBF1C09C1 /* -(1/(1+20/32+1/64)) = -.609524 */
.long 0xBF191F1A /* -(1/(1+21/32+1/64)) = -.598131 */
.long 0xBF164FDA /* -(1/(1+22/32+1/64)) = -.587156 */
.long 0xBF139A86 /* -(1/(1+23/32+1/64)) = -.576577 */
.long 0xBF10FDBC /* -(1/(1+24/32+1/64)) = -.566372 */
.long 0xBF0E7835 /* -(1/(1+25/32+1/64)) = -.556522 */
.long 0xBF0C08C1 /* -(1/(1+26/32+1/64)) = -.547009 */
.long 0xBF09AE41 /* -(1/(1+27/32+1/64)) = -.537815 */
.long 0xBF0767AB /* -(1/(1+28/32+1/64)) = -.528926 */
.long 0xBF053408 /* -(1/(1+29/32+1/64)) = -.520325 */
.long 0xBF03126F /* -(1/(1+30/32+1/64)) = -.512 */
.long 0xBF010204 /* -(1/(1+31/32+1/64)) = -.503937 */
/*== _sCbrtHL ==*/
/* Cube roots of 2^j * (1 + l/32 + 1/64) for j = 0, 1, 2 and l = 0..31:
   three consecutive groups of 32 floats (128 bytes per group), so the
   byte offset of an entry is 4*l + 128*j.  Only the high (single
   precision) part is stored.  */
.align 32
.long 0x3F80A9C9 /* HI((2^0*(1+0/32+1/64))^(1/3)) = 1.005181 */
.long 0x3F81F833 /* HI((2^0*(1+1/32+1/64))^(1/3)) = 1.015387 */
.long 0x3F834007 /* HI((2^0*(1+2/32+1/64))^(1/3)) = 1.025391 */
.long 0x3F848194 /* HI((2^0*(1+3/32+1/64))^(1/3)) = 1.035204 */
.long 0x3F85BD25 /* HI((2^0*(1+4/32+1/64))^(1/3)) = 1.044835 */
.long 0x3F86F300 /* HI((2^0*(1+5/32+1/64))^(1/3)) = 1.054291 */
.long 0x3F882365 /* HI((2^0*(1+6/32+1/64))^(1/3)) = 1.06358 */
.long 0x3F894E90 /* HI((2^0*(1+7/32+1/64))^(1/3)) = 1.07271 */
.long 0x3F8A74B9 /* HI((2^0*(1+8/32+1/64))^(1/3)) = 1.081687 */
.long 0x3F8B9615 /* HI((2^0*(1+9/32+1/64))^(1/3)) = 1.090518 */
.long 0x3F8CB2D4 /* HI((2^0*(1+10/32+1/64))^(1/3)) = 1.099207 */
.long 0x3F8DCB24 /* HI((2^0*(1+11/32+1/64))^(1/3)) = 1.107762 */
.long 0x3F8EDF31 /* HI((2^0*(1+12/32+1/64))^(1/3)) = 1.116186 */
.long 0x3F8FEF22 /* HI((2^0*(1+13/32+1/64))^(1/3)) = 1.124485 */
.long 0x3F90FB1F /* HI((2^0*(1+14/32+1/64))^(1/3)) = 1.132664 */
.long 0x3F92034C /* HI((2^0*(1+15/32+1/64))^(1/3)) = 1.140726 */
.long 0x3F9307CA /* HI((2^0*(1+16/32+1/64))^(1/3)) = 1.148675 */
.long 0x3F9408B9 /* HI((2^0*(1+17/32+1/64))^(1/3)) = 1.156516 */
.long 0x3F950638 /* HI((2^0*(1+18/32+1/64))^(1/3)) = 1.164252 */
.long 0x3F960064 /* HI((2^0*(1+19/32+1/64))^(1/3)) = 1.171887 */
.long 0x3F96F759 /* HI((2^0*(1+20/32+1/64))^(1/3)) = 1.179423 */
.long 0x3F97EB2F /* HI((2^0*(1+21/32+1/64))^(1/3)) = 1.186865 */
.long 0x3F98DC01 /* HI((2^0*(1+22/32+1/64))^(1/3)) = 1.194214 */
.long 0x3F99C9E5 /* HI((2^0*(1+23/32+1/64))^(1/3)) = 1.201474 */
.long 0x3F9AB4F2 /* HI((2^0*(1+24/32+1/64))^(1/3)) = 1.208647 */
.long 0x3F9B9D3D /* HI((2^0*(1+25/32+1/64))^(1/3)) = 1.215736 */
.long 0x3F9C82DA /* HI((2^0*(1+26/32+1/64))^(1/3)) = 1.222743 */
.long 0x3F9D65DD /* HI((2^0*(1+27/32+1/64))^(1/3)) = 1.229671 */
.long 0x3F9E4659 /* HI((2^0*(1+28/32+1/64))^(1/3)) = 1.236522 */
.long 0x3F9F245F /* HI((2^0*(1+29/32+1/64))^(1/3)) = 1.243297 */
.long 0x3FA00000 /* HI((2^0*(1+30/32+1/64))^(1/3)) = 1.25 */
.long 0x3FA0D94C /* HI((2^0*(1+31/32+1/64))^(1/3)) = 1.256631 */
.long 0x3FA21B02 /* HI((2^1*(1+0/32+1/64))^(1/3)) = 1.266449 */
.long 0x3FA3C059 /* HI((2^1*(1+1/32+1/64))^(1/3)) = 1.279307 */
.long 0x3FA55D61 /* HI((2^1*(1+2/32+1/64))^(1/3)) = 1.291912 */
.long 0x3FA6F282 /* HI((2^1*(1+3/32+1/64))^(1/3)) = 1.304276 */
.long 0x3FA8801A /* HI((2^1*(1+4/32+1/64))^(1/3)) = 1.316409 */
.long 0x3FAA067E /* HI((2^1*(1+5/32+1/64))^(1/3)) = 1.328323 */
.long 0x3FAB8602 /* HI((2^1*(1+6/32+1/64))^(1/3)) = 1.340027 */
.long 0x3FACFEEF /* HI((2^1*(1+7/32+1/64))^(1/3)) = 1.35153 */
.long 0x3FAE718E /* HI((2^1*(1+8/32+1/64))^(1/3)) = 1.36284 */
.long 0x3FAFDE1F /* HI((2^1*(1+9/32+1/64))^(1/3)) = 1.373966 */
.long 0x3FB144E1 /* HI((2^1*(1+10/32+1/64))^(1/3)) = 1.384915 */
.long 0x3FB2A60D /* HI((2^1*(1+11/32+1/64))^(1/3)) = 1.395692 */
.long 0x3FB401DA /* HI((2^1*(1+12/32+1/64))^(1/3)) = 1.406307 */
.long 0x3FB5587B /* HI((2^1*(1+13/32+1/64))^(1/3)) = 1.416763 */
.long 0x3FB6AA20 /* HI((2^1*(1+14/32+1/64))^(1/3)) = 1.427067 */
.long 0x3FB7F6F7 /* HI((2^1*(1+15/32+1/64))^(1/3)) = 1.437224 */
.long 0x3FB93F29 /* HI((2^1*(1+16/32+1/64))^(1/3)) = 1.44724 */
.long 0x3FBA82E1 /* HI((2^1*(1+17/32+1/64))^(1/3)) = 1.457119 */
.long 0x3FBBC244 /* HI((2^1*(1+18/32+1/64))^(1/3)) = 1.466866 */
.long 0x3FBCFD77 /* HI((2^1*(1+19/32+1/64))^(1/3)) = 1.476485 */
.long 0x3FBE349B /* HI((2^1*(1+20/32+1/64))^(1/3)) = 1.48598 */
.long 0x3FBF67D3 /* HI((2^1*(1+21/32+1/64))^(1/3)) = 1.495356 */
.long 0x3FC0973C /* HI((2^1*(1+22/32+1/64))^(1/3)) = 1.504615 */
.long 0x3FC1C2F6 /* HI((2^1*(1+23/32+1/64))^(1/3)) = 1.513762 */
.long 0x3FC2EB1A /* HI((2^1*(1+24/32+1/64))^(1/3)) = 1.5228 */
.long 0x3FC40FC6 /* HI((2^1*(1+25/32+1/64))^(1/3)) = 1.531731 */
.long 0x3FC53112 /* HI((2^1*(1+26/32+1/64))^(1/3)) = 1.54056 */
.long 0x3FC64F16 /* HI((2^1*(1+27/32+1/64))^(1/3)) = 1.549289 */
.long 0x3FC769EB /* HI((2^1*(1+28/32+1/64))^(1/3)) = 1.55792 */
.long 0x3FC881A6 /* HI((2^1*(1+29/32+1/64))^(1/3)) = 1.566457 */
.long 0x3FC9965D /* HI((2^1*(1+30/32+1/64))^(1/3)) = 1.574901 */
.long 0x3FCAA825 /* HI((2^1*(1+31/32+1/64))^(1/3)) = 1.583256 */
.long 0x3FCC3D79 /* HI((2^2*(1+0/32+1/64))^(1/3)) = 1.595626 */
.long 0x3FCE5054 /* HI((2^2*(1+1/32+1/64))^(1/3)) = 1.611826 */
.long 0x3FD058B8 /* HI((2^2*(1+2/32+1/64))^(1/3)) = 1.627707 */
.long 0x3FD25726 /* HI((2^2*(1+3/32+1/64))^(1/3)) = 1.643285 */
.long 0x3FD44C15 /* HI((2^2*(1+4/32+1/64))^(1/3)) = 1.658572 */
.long 0x3FD637F2 /* HI((2^2*(1+5/32+1/64))^(1/3)) = 1.673582 */
.long 0x3FD81B24 /* HI((2^2*(1+6/32+1/64))^(1/3)) = 1.688328 */
.long 0x3FD9F60B /* HI((2^2*(1+7/32+1/64))^(1/3)) = 1.702821 */
.long 0x3FDBC8FE /* HI((2^2*(1+8/32+1/64))^(1/3)) = 1.717071 */
.long 0x3FDD9452 /* HI((2^2*(1+9/32+1/64))^(1/3)) = 1.731089 */
.long 0x3FDF5853 /* HI((2^2*(1+10/32+1/64))^(1/3)) = 1.744883 */
.long 0x3FE1154B /* HI((2^2*(1+11/32+1/64))^(1/3)) = 1.758462 */
.long 0x3FE2CB7F /* HI((2^2*(1+12/32+1/64))^(1/3)) = 1.771835 */
.long 0x3FE47B2E /* HI((2^2*(1+13/32+1/64))^(1/3)) = 1.785009 */
.long 0x3FE62496 /* HI((2^2*(1+14/32+1/64))^(1/3)) = 1.797992 */
.long 0x3FE7C7F0 /* HI((2^2*(1+15/32+1/64))^(1/3)) = 1.810789 */
.long 0x3FE96571 /* HI((2^2*(1+16/32+1/64))^(1/3)) = 1.823408 */
.long 0x3FEAFD4C /* HI((2^2*(1+17/32+1/64))^(1/3)) = 1.835855 */
.long 0x3FEC8FB3 /* HI((2^2*(1+18/32+1/64))^(1/3)) = 1.848135 */
.long 0x3FEE1CD3 /* HI((2^2*(1+19/32+1/64))^(1/3)) = 1.860255 */
.long 0x3FEFA4D7 /* HI((2^2*(1+20/32+1/64))^(1/3)) = 1.872218 */
.long 0x3FF127E9 /* HI((2^2*(1+21/32+1/64))^(1/3)) = 1.88403 */
.long 0x3FF2A62F /* HI((2^2*(1+22/32+1/64))^(1/3)) = 1.895697 */
.long 0x3FF41FD0 /* HI((2^2*(1+23/32+1/64))^(1/3)) = 1.907221 */
.long 0x3FF594EE /* HI((2^2*(1+24/32+1/64))^(1/3)) = 1.918607 */
.long 0x3FF705AC /* HI((2^2*(1+25/32+1/64))^(1/3)) = 1.929861 */
.long 0x3FF8722A /* HI((2^2*(1+26/32+1/64))^(1/3)) = 1.940984 */
.long 0x3FF9DA86 /* HI((2^2*(1+27/32+1/64))^(1/3)) = 1.951981 */
.long 0x3FFB3EDE /* HI((2^2*(1+28/32+1/64))^(1/3)) = 1.962856 */
.long 0x3FFC9F4E /* HI((2^2*(1+29/32+1/64))^(1/3)) = 1.973612 */
.long 0x3FFDFBF2 /* HI((2^2*(1+30/32+1/64))^(1/3)) = 1.984251 */
.long 0x3FFF54E3 /* HI((2^2*(1+31/32+1/64))^(1/3)) = 1.994778 */
/* Broadcast constants: each value is repeated eight times so that one
   32-byte load fills every lane of a ymm register.  */
/* Polynomial coefficients; the kernel evaluates P = _sP1 + _sP2*r.  */
.align 32
.long 0xBDE3A962, 0xBDE3A962, 0xBDE3A962, 0xBDE3A962, 0xBDE3A962, 0xBDE3A962, 0xBDE3A962, 0xBDE3A962 /* _sP2 */
.align 32
.long 0x3EAAAC91, 0x3EAAAC91, 0x3EAAAC91, 0x3EAAAC91, 0x3EAAAC91, 0x3EAAAC91, 0x3EAAAC91, 0x3EAAAC91 /* _sP1 */
/* Selects all 23 mantissa bits of x.  */
.align 32
.long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff /* _sMantissaMask (EXP_MSK3) */
/* Selects mantissa bits 17..22 of x; bit 17 is subsequently forced on by
   _sExpMask1.  */
.align 32
.long 0x007e0000, 0x007e0000, 0x007e0000, 0x007e0000, 0x007e0000, 0x007e0000, 0x007e0000, 0x007e0000 /* _sMantissaMask1 (SIG_MASK) */
/* Bit pattern of -1.0f: OR-ed onto the masked mantissa to rebuild y with
   a fixed (negative) sign and exponent.  */
.align 32
.long 0xBF800000, 0xBF800000, 0xBF800000, 0xBF800000, 0xBF800000, 0xBF800000, 0xBF800000, 0xBF800000 /* _sExpMask (EXP_MASK) */
/* Same as _sExpMask with mantissa bit 17 set: used to rebuild y`.  */
.align 32
.long 0xBF820000, 0xBF820000, 0xBF820000, 0xBF820000, 0xBF820000, 0xBF820000, 0xBF820000, 0xBF820000 /* _sExpMask1 (EXP_MASK2) */
/* Applied to x >> 16: yields 4 * (top five mantissa bits), the byte
   offset into _sRcp.  */
.align 32
.long 0x0000007c, 0x0000007c, 0x0000007c, 0x0000007c, 0x0000007c, 0x0000007c, 0x0000007c, 0x0000007c /* _iRcpIndexMask */
/* Applied to x >> 23: the 8-bit biased exponent.  */
.align 32
.long 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff /* _iBExpMask */
/* Applied to x >> 23: the sign bit (bit 8 after the shift).  */
.align 32
.long 0x00000100, 0x00000100, 0x00000100, 0x00000100, 0x00000100, 0x00000100, 0x00000100, 0x00000100 /* _iSignMask */
/* Added to K when rebuilding the result exponent.  */
.align 32
.long 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055 /* _iBias */
/* Subtracted from the biased exponent before J is derived.  */
.align 32
.long 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 /* _iOne */
/* (2^12-1)/3: multiplying by it and shifting right by 12 approximates
   division of the exponent by 3.  */
.align 32
.long 0x00000555, 0x00000555, 0x00000555, 0x00000555, 0x00000555, 0x00000555, 0x00000555, 0x00000555 /* _i555 */
/* Clears the sign bit, giving |x|.  */
.align 32
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _iAbsMask */
/* Special-input test: a lane takes the scalar fallback when
   (|x| - _iSubConst) > _iCmpConst as signed 32-bit integers.  */
.align 32
.long 0x80800000, 0x80800000, 0x80800000, 0x80800000, 0x80800000, 0x80800000, 0x80800000, 0x80800000 /* _iSubConst */
.align 32
.long 0xFEFFFFFF, 0xFEFFFFFF, 0xFEFFFFFF, 0xFEFFFFFF, 0xFEFFFFFF, 0xFEFFFFFF, 0xFEFFFFFF, 0xFEFFFFFF /* _iCmpConst */
.align 32
/* Record the symbol type and total size of the table.  */
.type __svml_scbrt_data_internal,@object
.size __svml_scbrt_data_internal,.-__svml_scbrt_data_internal

View File

@ -0,0 +1,29 @@
/* Function cbrt vectorized with SSE2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* SSE2 vector cbrt entry point.  The whole body comes from
   WRAPPER_IMPL_SSE2 (svml_d_wrapper_impl.h), which is given the scalar
   routine name `cbrt'.
   NOTE(review): presumably the macro calls that routine once per lane --
   confirm against the macro definition.  */
ENTRY (_ZGVbN2v_cbrt)
WRAPPER_IMPL_SSE2 cbrt
END (_ZGVbN2v_cbrt)
/* The hidden definition is emitted only when multiarch is disabled.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN2v_cbrt)
#endif

View File

@ -0,0 +1,29 @@
/* Function cbrt vectorized with AVX2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* AVX2 vector cbrt entry point, wrapper version.  The whole body comes
   from WRAPPER_IMPL_AVX (svml_d_wrapper_impl.h), which is given the SSE
   variant _ZGVbN2v_cbrt as the callee.
   NOTE(review): presumably the macro applies the callee to each 128-bit
   half of the argument -- confirm against the macro definition.  */
ENTRY (_ZGVdN4v_cbrt)
WRAPPER_IMPL_AVX _ZGVbN2v_cbrt
END (_ZGVdN4v_cbrt)
/* The hidden definition is emitted only when multiarch is disabled.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVdN4v_cbrt)
#endif

View File

@ -0,0 +1,25 @@
/* Function cbrt vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* AVX vector cbrt entry point.  The whole body comes from
   WRAPPER_IMPL_AVX (svml_d_wrapper_impl.h), which is given the SSE
   variant _ZGVbN2v_cbrt as the callee.
   NOTE(review): presumably the macro applies the callee to each 128-bit
   half of the argument -- confirm against the macro definition.  */
ENTRY (_ZGVcN4v_cbrt)
WRAPPER_IMPL_AVX _ZGVbN2v_cbrt
END (_ZGVcN4v_cbrt)

View File

@ -0,0 +1,25 @@
/* Function cbrt vectorized with AVX-512, wrapper to AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* AVX-512 vector cbrt entry point.  The whole body comes from
   WRAPPER_IMPL_AVX512 (svml_d_wrapper_impl.h), which is given the AVX2
   variant _ZGVdN4v_cbrt as the callee.
   NOTE(review): presumably the macro applies the callee to each 256-bit
   half of the argument -- confirm against the macro definition.  */
ENTRY (_ZGVeN8v_cbrt)
WRAPPER_IMPL_AVX512 _ZGVdN4v_cbrt
END (_ZGVeN8v_cbrt)

View File

@ -0,0 +1,25 @@
/* Function cbrtf vectorized with AVX-512. Wrapper to AVX2 version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* AVX-512 vector cbrtf entry point.  The whole body comes from
   WRAPPER_IMPL_AVX512 (svml_s_wrapper_impl.h), which is given the AVX2
   variant _ZGVdN8v_cbrtf as the callee.
   NOTE(review): presumably the macro applies the callee to each 256-bit
   half of the argument -- confirm against the macro definition.  */
ENTRY (_ZGVeN16v_cbrtf)
WRAPPER_IMPL_AVX512 _ZGVdN8v_cbrtf
END (_ZGVeN16v_cbrtf)

View File

@ -0,0 +1,29 @@
/* Function cbrtf vectorized with SSE2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* SSE2 vector cbrtf entry point, wrapper version.  The whole body comes
   from WRAPPER_IMPL_SSE2 (svml_s_wrapper_impl.h), which is given the
   scalar routine name `cbrtf'.
   NOTE(review): presumably the macro calls that routine once per lane --
   confirm against the macro definition.  */
ENTRY (_ZGVbN4v_cbrtf)
WRAPPER_IMPL_SSE2 cbrtf
END (_ZGVbN4v_cbrtf)
/* The hidden definition is emitted only when multiarch is disabled.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN4v_cbrtf)
#endif

View File

@ -0,0 +1,29 @@
/* Function cbrtf vectorized with AVX2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* AVX2 vector cbrtf entry point, wrapper version.  The whole body comes
   from WRAPPER_IMPL_AVX (svml_s_wrapper_impl.h), which is given the SSE
   variant _ZGVbN4v_cbrtf as the callee.
   NOTE(review): presumably the macro applies the callee to each 128-bit
   half of the argument -- confirm against the macro definition.  */
ENTRY (_ZGVdN8v_cbrtf)
WRAPPER_IMPL_AVX _ZGVbN4v_cbrtf
END (_ZGVdN8v_cbrtf)
/* The hidden definition is emitted only when multiarch is disabled.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVdN8v_cbrtf)
#endif

View File

@ -0,0 +1,25 @@
/* Function cbrtf vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* AVX vector cbrtf entry point.  The whole body comes from
   WRAPPER_IMPL_AVX (svml_s_wrapper_impl.h), which is given the SSE
   variant _ZGVbN4v_cbrtf as the callee.
   NOTE(review): presumably the macro applies the callee to each 128-bit
   half of the argument -- confirm against the macro definition.  */
ENTRY (_ZGVcN8v_cbrtf)
WRAPPER_IMPL_AVX _ZGVbN4v_cbrtf
END (_ZGVcN8v_cbrtf)

View File

@ -0,0 +1 @@
#include "test-double-libmvec-cbrt.c"

View File

@ -0,0 +1 @@
#include "test-double-libmvec-cbrt.c"

View File

@ -0,0 +1 @@
#include "test-double-libmvec-cbrt.c"

View File

@ -0,0 +1,3 @@
/* Parameters for the shared one-argument vector ABI test: element type
   and the scalar function name.
   NOTE(review): presumably test-vector-abi-arg1.h expands these into the
   actual test body -- confirm against the header.  */
#define LIBMVEC_TYPE double
#define LIBMVEC_FUNC cbrt
#include "test-vector-abi-arg1.h"

View File

@ -36,6 +36,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (exp10), _ZGVbN2v_exp10)
VECTOR_WRAPPER (WRAPPER_NAME (cosh), _ZGVbN2v_cosh)
VECTOR_WRAPPER (WRAPPER_NAME (expm1), _ZGVbN2v_expm1)
VECTOR_WRAPPER (WRAPPER_NAME (sinh), _ZGVbN2v_sinh)
VECTOR_WRAPPER (WRAPPER_NAME (cbrt), _ZGVbN2v_cbrt)
#define VEC_INT_TYPE __m128i

View File

@ -39,6 +39,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (exp10), _ZGVdN4v_exp10)
VECTOR_WRAPPER (WRAPPER_NAME (cosh), _ZGVdN4v_cosh)
VECTOR_WRAPPER (WRAPPER_NAME (expm1), _ZGVdN4v_expm1)
VECTOR_WRAPPER (WRAPPER_NAME (sinh), _ZGVdN4v_sinh)
VECTOR_WRAPPER (WRAPPER_NAME (cbrt), _ZGVdN4v_cbrt)
#ifndef __ILP32__
# define VEC_INT_TYPE __m256i

View File

@ -36,6 +36,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (exp10), _ZGVcN4v_exp10)
VECTOR_WRAPPER (WRAPPER_NAME (cosh), _ZGVcN4v_cosh)
VECTOR_WRAPPER (WRAPPER_NAME (expm1), _ZGVcN4v_expm1)
VECTOR_WRAPPER (WRAPPER_NAME (sinh), _ZGVcN4v_sinh)
VECTOR_WRAPPER (WRAPPER_NAME (cbrt), _ZGVcN4v_cbrt)
#define VEC_INT_TYPE __m128i

View File

@ -36,6 +36,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (exp10), _ZGVeN8v_exp10)
VECTOR_WRAPPER (WRAPPER_NAME (cosh), _ZGVeN8v_cosh)
VECTOR_WRAPPER (WRAPPER_NAME (expm1), _ZGVeN8v_expm1)
VECTOR_WRAPPER (WRAPPER_NAME (sinh), _ZGVeN8v_sinh)
VECTOR_WRAPPER (WRAPPER_NAME (cbrt), _ZGVeN8v_cbrt)
#ifndef __ILP32__
# define VEC_INT_TYPE __m512i

View File

@ -0,0 +1 @@
#include "test-float-libmvec-cbrtf.c"

View File

@ -0,0 +1 @@
#include "test-float-libmvec-cbrtf.c"

View File

@ -0,0 +1 @@
#include "test-float-libmvec-cbrtf.c"

View File

@ -0,0 +1,3 @@
/* Parameters for the shared one-argument vector ABI test: element type
   and the scalar function name.
   NOTE(review): presumably test-vector-abi-arg1.h expands these into the
   actual test body -- confirm against the header.  */
#define LIBMVEC_TYPE float
#define LIBMVEC_FUNC cbrtf
#include "test-vector-abi-arg1.h"

View File

@ -36,6 +36,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (exp10f), _ZGVeN16v_exp10f)
VECTOR_WRAPPER (WRAPPER_NAME (coshf), _ZGVeN16v_coshf)
VECTOR_WRAPPER (WRAPPER_NAME (expm1f), _ZGVeN16v_expm1f)
VECTOR_WRAPPER (WRAPPER_NAME (sinhf), _ZGVeN16v_sinhf)
VECTOR_WRAPPER (WRAPPER_NAME (cbrtf), _ZGVeN16v_cbrtf)
#define VEC_INT_TYPE __m512i

View File

@ -36,6 +36,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (exp10f), _ZGVbN4v_exp10f)
VECTOR_WRAPPER (WRAPPER_NAME (coshf), _ZGVbN4v_coshf)
VECTOR_WRAPPER (WRAPPER_NAME (expm1f), _ZGVbN4v_expm1f)
VECTOR_WRAPPER (WRAPPER_NAME (sinhf), _ZGVbN4v_sinhf)
VECTOR_WRAPPER (WRAPPER_NAME (cbrtf), _ZGVbN4v_cbrtf)
#define VEC_INT_TYPE __m128i

View File

@ -39,6 +39,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (exp10f), _ZGVdN8v_exp10f)
VECTOR_WRAPPER (WRAPPER_NAME (coshf), _ZGVdN8v_coshf)
VECTOR_WRAPPER (WRAPPER_NAME (expm1f), _ZGVdN8v_expm1f)
VECTOR_WRAPPER (WRAPPER_NAME (sinhf), _ZGVdN8v_sinhf)
VECTOR_WRAPPER (WRAPPER_NAME (cbrtf), _ZGVdN8v_cbrtf)
/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */
#undef VECTOR_WRAPPER_fFF

View File

@ -36,6 +36,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (exp10f), _ZGVcN8v_exp10f)
VECTOR_WRAPPER (WRAPPER_NAME (coshf), _ZGVcN8v_coshf)
VECTOR_WRAPPER (WRAPPER_NAME (expm1f), _ZGVcN8v_expm1f)
VECTOR_WRAPPER (WRAPPER_NAME (sinhf), _ZGVcN8v_sinhf)
VECTOR_WRAPPER (WRAPPER_NAME (cbrtf), _ZGVcN8v_cbrtf)
#define VEC_INT_TYPE __m128i