x86-64: Add vector atan/atanf implementation to libmvec

Implement vectorized atan/atanf containing SSE, AVX, AVX2 and
AVX512 versions for libmvec as per vector ABI.  It also contains
accuracy and ABI tests for vector atan/atanf with regenerated ulps.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
Sunil K Pandey 2021-12-29 08:23:33 -08:00
parent 5d28a8962d
commit 146310177a
50 changed files with 1741 additions and 1 deletions

View File

@ -109,4 +109,15 @@
#define __DECL_SIMD_acosf32x
#define __DECL_SIMD_acosf64x
#define __DECL_SIMD_acosf128x
/* Stub SIMD-declaration macros for the atan family (double, float,
   long double and the _FloatN/_FloatNx variants).  Each one is defined
   with empty replacement text, so it expands to nothing wherever it is
   used.  */
#define __DECL_SIMD_atan
#define __DECL_SIMD_atanf
#define __DECL_SIMD_atanl
#define __DECL_SIMD_atanf16
#define __DECL_SIMD_atanf32
#define __DECL_SIMD_atanf64
#define __DECL_SIMD_atanf128
#define __DECL_SIMD_atanf32x
#define __DECL_SIMD_atanf64x
#define __DECL_SIMD_atanf128x
#endif

View File

@ -54,7 +54,7 @@ __MATHCALL_VEC (acos,, (_Mdouble_ __x));
/* Arc sine of X. */
__MATHCALL (asin,, (_Mdouble_ __x));
/* Arc tangent of X. */
__MATHCALL (atan,, (_Mdouble_ __x));
__MATHCALL_VEC (atan,, (_Mdouble_ __x));
/* Arc tangent of Y/X. */
__MATHCALL (atan2,, (_Mdouble_ __y, _Mdouble_ __x));

View File

@ -47,10 +47,18 @@ GLIBC_2.22 _ZGVeN8v_sin F
GLIBC_2.22 _ZGVeN8vv_pow F
GLIBC_2.22 _ZGVeN8vvv_sincos F
GLIBC_2.35 _ZGVbN2v_acos F
GLIBC_2.35 _ZGVbN2v_atan F
GLIBC_2.35 _ZGVbN4v_acosf F
GLIBC_2.35 _ZGVbN4v_atanf F
GLIBC_2.35 _ZGVcN4v_acos F
GLIBC_2.35 _ZGVcN4v_atan F
GLIBC_2.35 _ZGVcN8v_acosf F
GLIBC_2.35 _ZGVcN8v_atanf F
GLIBC_2.35 _ZGVdN4v_acos F
GLIBC_2.35 _ZGVdN4v_atan F
GLIBC_2.35 _ZGVdN8v_acosf F
GLIBC_2.35 _ZGVdN8v_atanf F
GLIBC_2.35 _ZGVeN16v_acosf F
GLIBC_2.35 _ZGVeN16v_atanf F
GLIBC_2.35 _ZGVeN8v_acos F
GLIBC_2.35 _ZGVeN8v_atan F

View File

@ -62,6 +62,10 @@
# define __DECL_SIMD_acos __DECL_SIMD_x86_64
# undef __DECL_SIMD_acosf
# define __DECL_SIMD_acosf __DECL_SIMD_x86_64
/* Replace any earlier definition of the atan/atanf SIMD-declaration
   macros with __DECL_SIMD_x86_64, following the same pattern used for
   acos/acosf just above.  */
# undef __DECL_SIMD_atan
# define __DECL_SIMD_atan __DECL_SIMD_x86_64
# undef __DECL_SIMD_atanf
# define __DECL_SIMD_atanf __DECL_SIMD_x86_64
# endif
#endif

View File

@ -30,6 +30,8 @@
!GCC$ builtin (powf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (acos) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (acosf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (atan) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (atanf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
!GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
@ -45,3 +47,5 @@
!GCC$ builtin (powf) attributes simd (notinbranch) if('x32')
!GCC$ builtin (acos) attributes simd (notinbranch) if('x32')
!GCC$ builtin (acosf) attributes simd (notinbranch) if('x32')
!GCC$ builtin (atan) attributes simd (notinbranch) if('x32')
!GCC$ builtin (atanf) attributes simd (notinbranch) if('x32')

View File

@ -23,6 +23,7 @@ postclean-generated += libmvec.mk
# Define for both math and mathvec directories.
libmvec-funcs = \
acos \
atan \
cos \
exp \
log \

View File

@ -15,6 +15,8 @@ libmvec {
}
GLIBC_2.35 {
_ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
_ZGVbN2v_atan; _ZGVcN4v_atan; _ZGVdN4v_atan; _ZGVeN8v_atan;
_ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
_ZGVbN4v_atanf; _ZGVcN8v_atanf; _ZGVdN8v_atanf; _ZGVeN16v_atanf;
}
}

View File

@ -164,6 +164,26 @@ float: 2
float128: 2
ldouble: 1
Function: "atan_vlen16":
float: 1
Function: "atan_vlen2":
double: 1
Function: "atan_vlen4":
double: 1
float: 1
Function: "atan_vlen4_avx2":
double: 1
Function: "atan_vlen8":
double: 1
float: 1
Function: "atan_vlen8_avx2":
float: 1
Function: "atanh":
double: 2
float: 2

View File

@ -0,0 +1,20 @@
/* SSE2 version of vectorized atan, vector length is 2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the entry point before including the shared core source, so
   that every use of _ZGVbN2v_atan in the included file is assembled
   under the name _ZGVbN2v_atan_sse2 instead.
   NOTE(review): presumably this is the baseline variant chosen by the
   ifunc selector when SSE4.1 is unavailable -- confirm against
   ifunc-mathvec-sse4_1.h.  */
#define _ZGVbN2v_atan _ZGVbN2v_atan_sse2
#include "../svml_d_atan2_core.S"

View File

@ -0,0 +1,27 @@
/* Multiple versions of vectorized atan, vector length is 2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME is the public 2-lane double atan entry point.  It must be
   defined before the include because the selector header is
   parameterised on it.  */
#define SYMBOL_NAME _ZGVbN2v_atan
#include "ifunc-mathvec-sse4_1.h"

/* Define SYMBOL_NAME via the ifunc helper macro, using IFUNC_SELECTOR ()
   from the header above to pick the implementation.
   NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined in this
   file; presumably they come from the included header -- confirm.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());

#ifdef SHARED
/* Shared-library builds only: declare the __GI_-prefixed version of the
   symbol with hidden visibility.
   NOTE(review): presumably so intra-library calls bind locally -- confirm
   against the __hidden_ver1 definition.  */
__hidden_ver1 (_ZGVbN2v_atan, __GI__ZGVbN2v_atan, __redirect__ZGVbN2v_atan)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,245 @@
/* Function atan vectorized with SSE4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
* For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
* For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
* For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
* For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
* Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
*
*/
/* Offsets for data table __svml_datan_data_internal_avx512
*/
/* Byte offsets of each field from the start of the table.  The table
   keeps the "avx512" name although this file is the SSE4 variant.
   Broadcast constants are 16 bytes apart (two doubles, one xmm load).
   Tbl_H and Tbl_L each span 256 bytes (32 doubles).  coeff is the first
   of six consecutive 16-byte polynomial coefficients, addressed as
   coeff, coeff+16, ... coeff+80.  */
#define AbsMask 0
#define Shifter 16
#define MaxThreshold 32
#define MOne 48
#define One 64
#define LargeX 80
#define Zero 96
#define Tbl_H 112
#define Tbl_L 368
#define dIndexMed 624
#define Pi2 640
#define Pi2_low 656
#define coeff 672
#include <sysdep.h>
.text
.section .text.sse4,"ax",@progbits
/* atan for two packed doubles.
   Input:  %xmm0 = x (2 lanes).
   Output: %xmm0 = atan(x) (2 lanes).
   Scratch: %rax, %rcx, %rdx, %xmm1-%xmm15.  No stack is used.
   Outline: split x into sign and |x|; round |x| to a grid point c by
   adding and subtracting Shifter; form R = (|x|-c)/(1+c*|x|), or
   R = -1/min(|x|,LargeX) when |x| >= MaxThreshold; evaluate
   R + R^3*Poly(R^2); add the Tbl_H entry for c (or Pi2 for large |x|);
   finally restore the sign.  */
ENTRY(_ZGVbN2v_atan_sse4)
/* rcx -> middle of Tbl_H (entry 16), so -128(%rcx) is entry 0.  */
lea Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rcx
/* xmm4 = AbsMask (offset 0), xmm3 = Shifter.  */
movups __svml_datan_data_internal_avx512(%rip), %xmm4
movups Shifter+__svml_datan_data_internal_avx512(%rip), %xmm3
/* xmm4 = |x|.  */
andps %xmm0, %xmm4
movaps %xmm3, %xmm12
movaps %xmm4, %xmm5
/* xmm12 = |x| + Shifter; its low mantissa bits hold the table index.  */
addpd %xmm4, %xmm12
movaps %xmm12, %xmm7
/*
* table lookup sequence
* VPERMUTE not available
*/
movaps %xmm12, %xmm10
/* xmm7 = c = (|x| + Shifter) - Shifter, i.e. |x| rounded to the grid.  */
subpd %xmm3, %xmm7
/* xmm5 = DiffX = |x| - c.  */
subpd %xmm7, %xmm5
/* xmm7 = c * |x|.  */
mulpd %xmm4, %xmm7
movups MaxThreshold+__svml_datan_data_internal_avx512(%rip), %xmm2
/* Scale the index bits by 8 to turn them into a byte offset.  */
psllq $3, %xmm10
/* saturate X range */
movups LargeX+__svml_datan_data_internal_avx512(%rip), %xmm8
/* xmm0 = x ^ |x| = sign bit of x only.  */
pxor %xmm4, %xmm0
/* xmm2 = LargeMask: all-ones in lanes where MaxThreshold <= |x|.  */
cmplepd %xmm4, %xmm2
/* xmm7 = 1 + c * |x|.  */
addpd One+__svml_datan_data_internal_avx512(%rip), %xmm7
/* xmm8 = min(LargeX, |x|).  */
minpd %xmm4, %xmm8
movups MOne+__svml_datan_data_internal_avx512(%rip), %xmm6
movaps %xmm2, %xmm1
movaps %xmm2, %xmm9
/* Mask-select numerator and denominator:
   xmm1 = LargeMask ? -1.0 : DiffX
   xmm9 = LargeMask ? min(LargeX, |x|) : 1 + c * |x|.  */
andnps %xmm5, %xmm1
andps %xmm2, %xmm6
andnps %xmm7, %xmm9
andps %xmm2, %xmm8
orps %xmm6, %xmm1
orps %xmm8, %xmm9
/* R+Rl = DiffX/Y */
divpd %xmm9, %xmm1
/* Keep only bits 0x78 of each 64-bit lane: (index & 15) * 8.  */
pand .FLT_11(%rip), %xmm10
/* set table value to Pi/2 for large X */
movups Pi2+__svml_datan_data_internal_avx512(%rip), %xmm4
/* eax = table byte offset for lane 0.  */
movd %xmm10, %eax
/* xmm4 = LargeMask & Pi2.  */
andps %xmm2, %xmm4
/* Move lane 1's low dword into position 0 so movd can extract it.  */
pshufd $2, %xmm10, %xmm11
/* xmm10 = LargeMask, saved because xmm2 is reused below.  */
movaps %xmm2, %xmm10
/* polynomial evaluation */
movaps %xmm1, %xmm2
/* xmm2 = R^2.  */
mulpd %xmm1, %xmm2
/* edx = table byte offset for lane 1.  */
movd %xmm11, %edx
/* With c0..c5 the six coefficients at coeff+0 .. coeff+80, the sequence
   below computes
   Poly = ((c0*R2 + c1)*R4 + (c2*R2 + c3))*R4 + (c4*R2 + c5)
   with R2 = R^2 and R4 = R^4.  */
movups coeff+__svml_datan_data_internal_avx512(%rip), %xmm5
movaps %xmm2, %xmm7
movups coeff+32+__svml_datan_data_internal_avx512(%rip), %xmm6
movaps %xmm2, %xmm9
mulpd %xmm2, %xmm5
/* xmm7 = R^4.  */
mulpd %xmm2, %xmm7
addpd coeff+16+__svml_datan_data_internal_avx512(%rip), %xmm5
mulpd %xmm2, %xmm6
mulpd %xmm7, %xmm5
addpd coeff+48+__svml_datan_data_internal_avx512(%rip), %xmm6
/* xmm9 = R^3.  */
mulpd %xmm1, %xmm9
addpd %xmm5, %xmm6
movups coeff+64+__svml_datan_data_internal_avx512(%rip), %xmm8
mulpd %xmm2, %xmm8
mulpd %xmm6, %xmm7
addpd coeff+80+__svml_datan_data_internal_avx512(%rip), %xmm8
/* xmm8 = Poly(R^2).  */
addpd %xmm7, %xmm8
/* xmm9 = R^3 * Poly(R^2).  */
mulpd %xmm8, %xmm9
movups dIndexMed+__svml_datan_data_internal_avx512(%rip), %xmm14
/* xmm14 = mask of lanes where dIndexMed <= |x| + Shifter, i.e. the
   index is in the upper half of Tbl_H (entries 16..31).  */
cmplepd %xmm12, %xmm14
/* xmm1 = R + R^3 * Poly(R^2), approximating atan(R).  */
addpd %xmm9, %xmm1
movslq %eax, %rax
movaps %xmm14, %xmm3
movslq %edx, %rdx
/* Load both candidates per lane: xmm13 from entries 0..15 and xmm15 from
   entries 16..31.  The low half of each register is lane 0, the high
   half is lane 1.  */
movsd -128(%rax,%rcx), %xmm13
movsd (%rcx,%rax), %xmm15
movhpd -128(%rdx,%rcx), %xmm13
movhpd (%rcx,%rdx), %xmm15
/* xmm3 = upper-half mask ? xmm15 : xmm13.  */
andnps %xmm13, %xmm3
andps %xmm14, %xmm15
orps %xmm15, %xmm3
/* xmm10 = LargeMask ? Pi2 : table value.  */
andnps %xmm3, %xmm10
orps %xmm4, %xmm10
/* xmm10 = table value + atan(R) = atan(|x|).  */
addpd %xmm1, %xmm10
/* Re-apply the sign of x.  */
pxor %xmm10, %xmm0
ret
END(_ZGVbN2v_atan_sse4)
.section .rodata, "a"
.align 16
#ifdef __svml_datan_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(16)) VUINT32 AbsMask[2][2];
__declspec(align(16)) VUINT32 Shifter[2][2];
__declspec(align(16)) VUINT32 MaxThreshold[2][2];
__declspec(align(16)) VUINT32 MOne[2][2];
__declspec(align(16)) VUINT32 One[2][2];
__declspec(align(16)) VUINT32 LargeX[2][2];
__declspec(align(16)) VUINT32 Zero[2][2];
__declspec(align(16)) VUINT32 Tbl_H[32][2];
__declspec(align(16)) VUINT32 Tbl_L[32][2];
__declspec(align(16)) VUINT32 dIndexMed[2][2];
__declspec(align(16)) VUINT32 Pi2[2][2];
__declspec(align(16)) VUINT32 Pi2_low[2][2];
__declspec(align(16)) VUINT32 coeff[6][2][2];
} __svml_datan_data_internal_avx512;
#endif
__svml_datan_data_internal_avx512:
/*== AbsMask ==*/
.quad 0x7fffffffffffffff, 0x7fffffffffffffff
/*== Shifter ==*/
.align 16
.quad 0x4318000000000000, 0x4318000000000000
/*== MaxThreshold ==*/
.align 16
.quad 0x401f800000000000, 0x401f800000000000
/*== MOne ==*/
.align 16
.quad 0xbff0000000000000, 0xbff0000000000000
/*== One ==*/
.align 16
.quad 0x3ff0000000000000, 0x3ff0000000000000
/*== LargeX ==*/
.align 16
.quad 0x47f0000000000000, 0x47f0000000000000
/*== Zero ==*/
.align 16
.quad 0x0000000000000000, 0x0000000000000000
/*== Tbl_H ==*/
.align 16
.quad 0x0000000000000000, 0x3fcf5b75f92c80dd
.quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1
.quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e
.quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
.quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
.quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
.quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0
.quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
.quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
.quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd
.quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89
.quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06
.quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053
.quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
.quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
.quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4
/*== Tbl_L ==*/
.align 16
.quad 0x0000000000000000, 0x3c68ab6e3cf7afbd
.quad 0x3c7a2b7f222f65e2, 0x3c72419a87f2a458
.quad 0x3c81a62633145c07, 0x3c80dae13ad18a6b
.quad 0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70
.quad 0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb
.quad 0x3c96254cb03bb199, 0xbc812c77e8a80f5c
.quad 0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4
.quad 0xbc93b03e8a27f555, 0x3c9934f9f2b0020e
.quad 0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b
.quad 0x3c78c2d0c89de218, 0x3c9f82bba194dd5d
.quad 0xbc831151a43b51ca, 0xbc8487d50bceb1a5
.quad 0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f
.quad 0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3
.quad 0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2
.quad 0xbc929c86447928e7, 0xbc8957a7170df016
.quad 0xbc7cbe1896221608, 0xbc9fda5797b32a0b
/*== dIndexMed ==*/
.align 16
.quad 0x4318000000000010, 0x4318000000000010
/*== Pi2 ==*/
.align 16
.quad 0x3ff921fb54442d18, 0x3ff921fb54442d18
/*== Pi2_low ==*/
.align 16
.quad 0x3c91a62633145c07, 0x3c91a62633145c07
/*== coeff6 ==*/
.align 16
.quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
.quad 0xbfb74257c46790cc, 0xbfb74257c46790cc
.quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
.quad 0xbfc249248eef04da, 0xbfc249248eef04da
.quad 0x3fc999999998741e, 0x3fc999999998741e
.quad 0xbfd555555555554d, 0xbfd555555555554d
.align 16
.type __svml_datan_data_internal_avx512,@object
.size __svml_datan_data_internal_avx512,.-__svml_datan_data_internal_avx512
.align 16
.FLT_11:
.long 0x00000078,0x00000000,0x00000078,0x00000000
.type .FLT_11,@object
.size .FLT_11,16

View File

@ -0,0 +1,20 @@
/* SSE version of vectorized atan, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the entry point before including the shared core source, so
   that every use of _ZGVdN4v_atan in the included file is assembled
   under the name _ZGVdN4v_atan_sse_wrapper instead.
   NOTE(review): the suffix suggests the included core wraps narrower
   SSE calls; that file is not visible here -- confirm.  */
#define _ZGVdN4v_atan _ZGVdN4v_atan_sse_wrapper
#include "../svml_d_atan4_core.S"

View File

@ -0,0 +1,27 @@
/* Multiple versions of vectorized atan, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME is the public 4-lane double atan entry point.  It must be
   defined before the include because the selector header is
   parameterised on it.  */
#define SYMBOL_NAME _ZGVdN4v_atan
#include "ifunc-mathvec-avx2.h"

/* Define SYMBOL_NAME via the ifunc helper macro, using IFUNC_SELECTOR ()
   from the header above to pick the implementation.
   NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined in this
   file; presumably they come from the included header -- confirm.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());

#ifdef SHARED
/* Shared-library builds only: declare the __GI_-prefixed version of the
   symbol with hidden visibility.
   NOTE(review): presumably so intra-library calls bind locally -- confirm
   against the __hidden_ver1 definition.  */
__hidden_ver1 (_ZGVdN4v_atan, __GI__ZGVdN4v_atan, __redirect__ZGVdN4v_atan)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,225 @@
/* Function atan vectorized with AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
* For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
* For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
* For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
* For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
* Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
*
*/
/* Offsets for data table __svml_datan_data_internal_avx512
*/
/* Byte offsets of each field from the start of the table.  The table
   keeps the "avx512" name although this file is the AVX2 variant.
   Broadcast constants are 32 bytes apart (four doubles, one ymm load).
   Tbl_H and Tbl_L each span 256 bytes (32 doubles).  coeff is the first
   of six consecutive 32-byte polynomial coefficients, addressed as
   coeff, coeff+32, ... coeff+160.  */
#define AbsMask 0
#define Shifter 32
#define MaxThreshold 64
#define MOne 96
#define One 128
#define LargeX 160
#define Zero 192
#define Tbl_H 224
#define Tbl_L 480
#define dIndexMed 736
#define Pi2 768
#define Pi2_low 800
#define coeff 832
#include <sysdep.h>
.text
.section .text.avx2,"ax",@progbits
/* atan for four packed doubles.
   Input:  %ymm0 = x (4 lanes).
   Output: %ymm0 = atan(x) (4 lanes).
   Scratch: %rax, %rcx, %rdx, %rsi, %rdi, %ymm1-%ymm15.  No stack is used.
   Outline: split x into sign and |x|; round |x| to a grid point c by
   adding and subtracting Shifter; form R = (|x|-c)/(1+c*|x|), or
   R = -1/min(|x|,LargeX) when |x| >= MaxThreshold; evaluate
   R + R^3*Poly(R^2); add the Tbl_H entry for c (or Pi2 for large |x|);
   finally restore the sign.  */
ENTRY(_ZGVdN4v_atan_avx2)
/* rdi -> middle of Tbl_H (entry 16), so -128(%rdi) is entry 0.  */
lea Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rdi
vmovupd Shifter+__svml_datan_data_internal_avx512(%rip), %ymm4
vmovupd One+__svml_datan_data_internal_avx512(%rip), %ymm9
/* saturate X range */
vmovupd LargeX+__svml_datan_data_internal_avx512(%rip), %ymm6
/* ymm7 = |x| (AbsMask is at offset 0).  */
vandpd __svml_datan_data_internal_avx512(%rip), %ymm0, %ymm7
/* ymm2 = |x| + Shifter; its low mantissa bits hold the table index.  */
vaddpd %ymm4, %ymm7, %ymm2
/* ymm3 = LargeMask: lanes where |x| >= MaxThreshold.  */
vcmpge_oqpd MaxThreshold+__svml_datan_data_internal_avx512(%rip), %ymm7, %ymm3
/* ymm10 = min(LargeX, |x|).  */
vminpd %ymm7, %ymm6, %ymm10
/* ymm5 = c = (|x| + Shifter) - Shifter, i.e. |x| rounded to the grid.  */
vsubpd %ymm4, %ymm2, %ymm5
/*
* table lookup sequence
* VPERMUTE not available
*/
/* Scale the index bits by 8 to turn them into a byte offset.  */
vpsllq $3, %ymm2, %ymm13
/* ymm8 = DiffX = |x| - c.  */
vsubpd %ymm5, %ymm7, %ymm8
/* ymm2 = mask of lanes whose index is in the upper half of Tbl_H.  */
vcmpge_oqpd dIndexMed+__svml_datan_data_internal_avx512(%rip), %ymm2, %ymm2
/* ymm9 = 1 + c * |x|.  */
vfmadd231pd %ymm7, %ymm5, %ymm9
/* Keep only bits 0x78 of each 64-bit lane: (index & 15) * 8.  */
vpand .FLT_11(%rip), %ymm13, %ymm14
/* ymm11 = LargeMask ? -1.0 : DiffX                    (numerator)
   ymm12 = LargeMask ? min(LargeX, |x|) : 1 + c * |x|  (denominator).  */
vblendvpd %ymm3, MOne+__svml_datan_data_internal_avx512(%rip), %ymm8, %ymm11
vblendvpd %ymm3, %ymm10, %ymm9, %ymm12
/* ymm1 = x ^ |x| = sign bit of x only.  */
vxorpd %ymm0, %ymm7, %ymm1
/* R+Rl = DiffX/Y */
vdivpd %ymm12, %ymm11, %ymm0
/* Extract the four byte offsets into general registers:
   eax = lane 0, edx = lane 1, ecx = lane 2, esi = lane 3.  */
vextractf128 $1, %ymm14, %xmm4
vmovd %xmm14, %eax
vmovd %xmm4, %ecx
movslq %eax, %rax
vpextrd $2, %xmm14, %edx
movslq %ecx, %rcx
vpextrd $2, %xmm4, %esi
movslq %edx, %rdx
movslq %esi, %rsi
/* Gather both candidates per lane: the -128 displacement reads entries
   0..15, zero displacement reads entries 16..31.
   xmm15 = lower-half values for lanes 0,1; xmm6 = same for lanes 2,3.
   xmm8 = upper-half values for lanes 0,1; xmm10 = same for lanes 2,3.  */
vmovsd -128(%rax,%rdi), %xmm15
vmovsd (%rdi,%rax), %xmm7
vmovsd -128(%rcx,%rdi), %xmm5
vmovsd (%rdi,%rcx), %xmm9
vmovhpd -128(%rdx,%rdi), %xmm15, %xmm15
vmovhpd (%rdi,%rdx), %xmm7, %xmm8
vmovhpd -128(%rsi,%rdi), %xmm5, %xmm6
vmovhpd (%rdi,%rsi), %xmm9, %xmm10
/* polynomial evaluation */
/* ymm5 = R^2, ymm4 = R^4.  */
vmulpd %ymm0, %ymm0, %ymm5
vmulpd %ymm5, %ymm5, %ymm4
/* Assemble 4-lane vectors: ymm11 = lower-half candidates, ymm12 =
   upper-half candidates; ymm13 = the one selected by the ymm2 mask.  */
vinsertf128 $1, %xmm6, %ymm15, %ymm11
vinsertf128 $1, %xmm10, %ymm8, %ymm12
vblendvpd %ymm2, %ymm12, %ymm11, %ymm13
/* With c0..c5 the six coefficients at coeff+0 .. coeff+160, the FMA
   chain below computes
   Poly = ((c0*R2 + c1)*R4 + (c2*R2 + c3))*R4 + (c4*R2 + c5).  */
vmovupd coeff+__svml_datan_data_internal_avx512(%rip), %ymm8
vmovupd coeff+64+__svml_datan_data_internal_avx512(%rip), %ymm2
/* ymm6 = R^3.  */
vmulpd %ymm5, %ymm0, %ymm6
vfmadd213pd coeff+32+__svml_datan_data_internal_avx512(%rip), %ymm5, %ymm8
vfmadd213pd coeff+96+__svml_datan_data_internal_avx512(%rip), %ymm5, %ymm2
/* set table value to Pi/2 for large X */
vblendvpd %ymm3, Pi2+__svml_datan_data_internal_avx512(%rip), %ymm13, %ymm7
vmovupd coeff+128+__svml_datan_data_internal_avx512(%rip), %ymm3
vfmadd213pd %ymm2, %ymm4, %ymm8
vfmadd213pd coeff+160+__svml_datan_data_internal_avx512(%rip), %ymm3, %ymm5
vfmadd213pd %ymm5, %ymm4, %ymm8
/* ymm8 = R + R^3 * Poly(R^2), approximating atan(R).  */
vfmadd213pd %ymm0, %ymm6, %ymm8
/* ymm0 = table value (or Pi2) + atan(R) = atan(|x|).  */
vaddpd %ymm8, %ymm7, %ymm0
/* Re-apply the sign of x.  */
vxorpd %ymm1, %ymm0, %ymm0
ret
END(_ZGVdN4v_atan_avx2)
.section .rodata, "a"
.align 32
.FLT_11:
.long 0x00000078,0x00000000,0x00000078,0x00000000,0x00000078,0x00000000,0x00000078,0x00000000
.type .FLT_11,@object
.size .FLT_11,32
.align 32
#ifdef __svml_datan_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(32)) VUINT32 AbsMask[4][2];
__declspec(align(32)) VUINT32 Shifter[4][2];
__declspec(align(32)) VUINT32 MaxThreshold[4][2];
__declspec(align(32)) VUINT32 MOne[4][2];
__declspec(align(32)) VUINT32 One[4][2];
__declspec(align(32)) VUINT32 LargeX[4][2];
__declspec(align(32)) VUINT32 Zero[4][2];
__declspec(align(32)) VUINT32 Tbl_H[32][2];
__declspec(align(32)) VUINT32 Tbl_L[32][2];
__declspec(align(32)) VUINT32 dIndexMed[4][2];
__declspec(align(32)) VUINT32 Pi2[4][2];
__declspec(align(32)) VUINT32 Pi2_low[4][2];
__declspec(align(32)) VUINT32 coeff[6][4][2];
} __svml_datan_data_internal_avx512;
#endif
__svml_datan_data_internal_avx512:
/*== AbsMask ==*/
.quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
/*== Shifter ==*/
.align 32
.quad 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000
/*== MaxThreshold ==*/
.align 32
.quad 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000
/*== MOne ==*/
.align 32
.quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
/*== One ==*/
.align 32
.quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
/*== LargeX ==*/
.align 32
.quad 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000
/*== Zero ==*/
.align 32
.quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
/*== Tbl_H ==*/
.align 32
.quad 0x0000000000000000, 0x3fcf5b75f92c80dd
.quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1
.quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e
.quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
.quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
.quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
.quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0
.quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
.quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
.quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd
.quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89
.quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06
.quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053
.quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
.quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
.quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4
/*== Tbl_L ==*/
.align 32
.quad 0x0000000000000000, 0x3c68ab6e3cf7afbd
.quad 0x3c7a2b7f222f65e2, 0x3c72419a87f2a458
.quad 0x3c81a62633145c07, 0x3c80dae13ad18a6b
.quad 0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70
.quad 0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb
.quad 0x3c96254cb03bb199, 0xbc812c77e8a80f5c
.quad 0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4
.quad 0xbc93b03e8a27f555, 0x3c9934f9f2b0020e
.quad 0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b
.quad 0x3c78c2d0c89de218, 0x3c9f82bba194dd5d
.quad 0xbc831151a43b51ca, 0xbc8487d50bceb1a5
.quad 0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f
.quad 0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3
.quad 0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2
.quad 0xbc929c86447928e7, 0xbc8957a7170df016
.quad 0xbc7cbe1896221608, 0xbc9fda5797b32a0b
/*== dIndexMed ==*/
.align 32
.quad 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010
/*== Pi2 ==*/
.align 32
.quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
/*== Pi2_low ==*/
.align 32
.quad 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07
/*== coeff6 ==*/
.align 32
.quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
.quad 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc
.quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
.quad 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da
.quad 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e
.quad 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d
.align 32
.type __svml_datan_data_internal_avx512,@object
.size __svml_datan_data_internal_avx512,.-__svml_datan_data_internal_avx512

View File

@ -0,0 +1,20 @@
/* AVX2 version of vectorized atan, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the entry point before including the shared core source, so
   that every use of _ZGVeN8v_atan in the included file is assembled
   under the name _ZGVeN8v_atan_avx2_wrapper instead.
   NOTE(review): the suffix suggests the included core wraps narrower
   AVX2 calls; that file is not visible here -- confirm.  */
#define _ZGVeN8v_atan _ZGVeN8v_atan_avx2_wrapper
#include "../svml_d_atan8_core.S"

View File

@ -0,0 +1,27 @@
/* Multiple versions of vectorized atan, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME is the public 8-lane double atan entry point.  It must be
   defined before the include because the selector header is
   parameterised on it.  */
#define SYMBOL_NAME _ZGVeN8v_atan
#include "ifunc-mathvec-avx512-skx.h"

/* Define SYMBOL_NAME via the ifunc helper macro, using IFUNC_SELECTOR ()
   from the header above to pick the implementation.
   NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined in this
   file; presumably they come from the included header -- confirm.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());

#ifdef SHARED
/* Shared-library builds only: declare the __GI_-prefixed version of the
   symbol with hidden visibility.
   NOTE(review): presumably so intra-library calls bind locally -- confirm
   against the __hidden_ver1 definition.  */
__hidden_ver1 (_ZGVeN8v_atan, __GI__ZGVeN8v_atan, __redirect__ZGVeN8v_atan)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,213 @@
/* Function atan vectorized with AVX-512.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
* For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
* For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
* For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
* For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
* Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
*
*/
/* Offsets for data table __svml_datan_data_internal_avx512
*/
/* Byte offsets of each field from the start of the table.  Broadcast
   constants are 64 bytes apart (eight doubles, one zmm load).  Tbl_H
   spans 256 bytes (32 doubles, 448..703).  This variant has no Tbl_L or
   Pi2_low field.  The six polynomial coefficients get individual
   offsets, coeff_1 .. coeff_6, each 64 bytes wide.  */
#define AbsMask 0
#define Shifter 64
#define MaxThreshold 128
#define MOne 192
#define One 256
#define LargeX 320
#define Zero 384
#define Tbl_H 448
#define dIndexMed 704
#define Pi2 768
#define coeff_1 832
#define coeff_2 896
#define coeff_3 960
#define coeff_4 1024
#define coeff_5 1088
#define coeff_6 1152
#include <sysdep.h>
.text
.section .text.evex512,"ax",@progbits
/* atan for eight packed doubles.
   Input:  %zmm0 = x (8 lanes).
   Output: %zmm0 = atan(x) (8 lanes).
   Scratch: %zmm1-%zmm15, %k1, %k2.  No stack or general registers used.
   Outline: same reduction as described in the header comment, but the
   table lookup uses vpermt2pd and the quotient DiffX/Y is built from
   vrcp14pd plus FMA refinement instead of a divide.  The quotient is
   computed on mantissas only, and the exponent difference is applied at
   the end with vscalefpd.  */
ENTRY(_ZGVeN8v_atan_skx)
vmovups Shifter+__svml_datan_data_internal_avx512(%rip), %zmm4
vmovups MaxThreshold+__svml_datan_data_internal_avx512(%rip), %zmm3
vmovups One+__svml_datan_data_internal_avx512(%rip), %zmm9
/* saturate X range */
vmovups LargeX+__svml_datan_data_internal_avx512(%rip), %zmm7
/* zmm8 = |x| (AbsMask is at offset 0).  */
vandpd __svml_datan_data_internal_avx512(%rip), %zmm0, %zmm8
/* R+Rl = DiffX/Y */
/* NOTE(review): .FLT_10 is defined outside the lines visible here; its
   use below as the "1" in 1 - rcp*Y suggests it holds 1.0 -- confirm.  */
vbroadcastsd .FLT_10(%rip), %zmm15
/* zmm2 = |x| + Shifter; its low mantissa bits hold the table index.  */
vaddpd {rn-sae}, %zmm4, %zmm8, %zmm2
/* zmm1 = x ^ |x| = sign bit of x only.  */
vxorpd %zmm0, %zmm8, %zmm1
/* k2 = LargeMask: lanes where |x| >= MaxThreshold.  */
vcmppd $29, {sae}, %zmm3, %zmm8, %k2
/* round to 2 bits after binary point */
/* zmm6 = DiffX = |x| minus |x| rounded to a multiple of 1/4.  */
vreducepd $40, {sae}, %zmm8, %zmm6
/* zmm5 = c = (|x| + Shifter) - Shifter, the rounded grid point.  */
vsubpd {rn-sae}, %zmm4, %zmm2, %zmm5
/*
* if|X|>=MaxThreshold, set DiffX=-1
* VMSUB(D, DiffX, LargeMask, Zero, One);
*/
vblendmpd MOne+__svml_datan_data_internal_avx512(%rip), %zmm6, %zmm10{%k2}
/* zmm9 = Y = 1 + c * |x|.  */
vfmadd231pd {rn-sae}, %zmm8, %zmm5, %zmm9
vmovups dIndexMed+__svml_datan_data_internal_avx512(%rip), %zmm5
/* table lookup sequence */
/* zmm6 = Tbl_H entries 0..7, first source for the permute below.  */
vmovups Tbl_H+__svml_datan_data_internal_avx512(%rip), %zmm6
/* Split the numerator: zmm14 = mantissa, zmm11 = exponent.  */
vgetmantpd $0, {sae}, %zmm10, %zmm14
vgetexppd {sae}, %zmm10, %zmm11
vmovups coeff_5+__svml_datan_data_internal_avx512(%rip), %zmm10
/*
* if|X|>=MaxThreshold, set Y=X
* VMADD(D, Y, LargeMask, X, Zero);
*/
/* Merge-masked: only LargeMask lanes become min(LargeX, |x|).  */
vminpd {sae}, %zmm8, %zmm7, %zmm9{%k2}
/* k1 = lanes whose index is in the upper half of Tbl_H (16..31).  */
vcmppd $29, {sae}, %zmm5, %zmm2, %k1
/* zmm7 = Tbl_H entries 16..23.  */
vmovups Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %zmm7
vmovups coeff_1+__svml_datan_data_internal_avx512(%rip), %zmm8
/* Split the denominator: zmm3 = mantissa, zmm12 = exponent.  */
vgetmantpd $0, {sae}, %zmm9, %zmm3
vgetexppd {sae}, %zmm9, %zmm12
vmovups coeff_3+__svml_datan_data_internal_avx512(%rip), %zmm9
/* zmm6 = lookup in entries 0..15, indexed by the low bits of zmm2.  */
vpermt2pd Tbl_H+64+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm6
/* zmm4 = exponent(numerator) - exponent(denominator).  */
vsubpd {rn-sae}, %zmm12, %zmm11, %zmm4
/* zmm7 = lookup in entries 16..31, same index bits.  */
vpermt2pd Tbl_H+192+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm7
/* zmm13 = rcp, approximate reciprocal of the denominator mantissa.  */
vrcp14pd %zmm3, %zmm13
vmovups coeff_4+__svml_datan_data_internal_avx512(%rip), %zmm12
vmovups coeff_6+__svml_datan_data_internal_avx512(%rip), %zmm11
/* zmm2 = k1 ? upper-half lookup : lower-half lookup.  */
vblendmpd %zmm7, %zmm6, %zmm2{%k1}
/* zmm0 = q0 = numerator mantissa * rcp (first quotient estimate).  */
vmulpd {rn-sae}, %zmm13, %zmm14, %zmm0
/* zmm15 = e = .FLT_10 - rcp * denominator mantissa.  */
vfnmadd231pd {rn-sae}, %zmm3, %zmm13, %zmm15
/* zmm3 = residual = numerator mantissa - q0 * denominator mantissa.  */
vfnmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm3
/* zmm15 = e + e^2.  */
vfmadd213pd {rn-sae}, %zmm15, %zmm15, %zmm15
/* zmm15 = rcp + rcp * (e + e^2), the refined reciprocal.  */
vfmadd213pd {rn-sae}, %zmm13, %zmm13, %zmm15
/* zmm3 = q0 + residual * refined reciprocal.  */
vfmadd213pd {rn-sae}, %zmm0, %zmm15, %zmm3
/* zmm0 = R = mantissa quotient scaled by 2^(exponent difference).  */
vscalefpd {rn-sae}, %zmm4, %zmm3, %zmm0
/* set table value to Pi/2 for large X */
vblendmpd Pi2+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm3{%k2}
vmovups coeff_2+__svml_datan_data_internal_avx512(%rip), %zmm2
/* polynomial evaluation */
/* zmm14 = R^2, zmm13 = R^4, zmm15 = R^3.  */
vmulpd {rn-sae}, %zmm0, %zmm0, %zmm14
vmulpd {rn-sae}, %zmm14, %zmm14, %zmm13
vmulpd {rn-sae}, %zmm0, %zmm14, %zmm15
/* Three independent pairs:
   zmm2 = coeff_1*R2 + coeff_2
   zmm12 = coeff_3*R2 + coeff_4
   zmm14 = coeff_5*R2 + coeff_6.  */
vfmadd231pd {rn-sae}, %zmm14, %zmm8, %zmm2
vfmadd231pd {rn-sae}, %zmm14, %zmm9, %zmm12
vfmadd213pd {rn-sae}, %zmm11, %zmm10, %zmm14
/* Combine the pairs: Poly = (zmm2*R4 + zmm12)*R4 + zmm14.  */
vfmadd213pd {rn-sae}, %zmm12, %zmm13, %zmm2
vfmadd213pd {rn-sae}, %zmm14, %zmm13, %zmm2
/* zmm2 = R + R^3 * Poly(R^2), approximating atan(R).  */
vfmadd213pd {rn-sae}, %zmm0, %zmm15, %zmm2
/* zmm0 = table value (or Pi2) + atan(R) = atan(|x|).  */
vaddpd {rn-sae}, %zmm3, %zmm2, %zmm0
/* Re-apply the sign of x.  */
vxorpd %zmm1, %zmm0, %zmm0
ret
END(_ZGVeN8v_atan_skx)
/* Read-only constant table for the AVX-512 double-precision atan kernel.
   Every field starts on a 64-byte boundary; scalar constants are broadcast
   to all 8 double lanes so they can be used directly as 512-bit operands.  */
.section .rodata, "a"
.align 64
/* Layout description only: this C-style struct is compiled out unless
   __svml_datan_data_internal_avx512_typedef is defined.  Each [8][2] field
   is 8 lanes of two 32-bit words, i.e. 64 bytes.  */
#ifdef __svml_datan_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(64)) VUINT32 AbsMask[8][2];
__declspec(align(64)) VUINT32 Shifter[8][2];
__declspec(align(64)) VUINT32 MaxThreshold[8][2];
__declspec(align(64)) VUINT32 MOne[8][2];
__declspec(align(64)) VUINT32 One[8][2];
__declspec(align(64)) VUINT32 LargeX[8][2];
__declspec(align(64)) VUINT32 Zero[8][2];
__declspec(align(64)) VUINT32 Tbl_H[32][2];
__declspec(align(64)) VUINT32 dIndexMed[8][2];
__declspec(align(64)) VUINT32 Pi2[8][2];
__declspec(align(64)) VUINT32 coeff[6][8][2];
} __svml_datan_data_internal_avx512;
#endif
__svml_datan_data_internal_avx512:
/*== AbsMask: every bit except the sign bit of a double ==*/
.quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
/*== Shifter: 1.5 * 2^50, whose unit in the last place is 0.25 ==*/
.align 64
.quad 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000
/*== MaxThreshold: 7.875 ==*/
.align 64
.quad 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000
/*== MOne: -1.0 ==*/
.align 64
.quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
/*== One: 1.0 ==*/
.align 64
.quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
/*== LargeX: 2^128 ==*/
.align 64
.quad 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000
/*== Zero: +0.0 ==*/
.align 64
.quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
/*== Tbl_H: 32 doubles (four 64-byte rows).  The values match atan(i/4)
     for i = 0..31: entry 2 is atan(0.5) and entry 4 is Pi/4.  ==*/
.align 64
.quad 0x0000000000000000, 0x3fcf5b75f92c80dd
.quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1
.quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e
.quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
.quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
.quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
.quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0
.quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
.quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
.quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd
.quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89
.quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06
.quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053
.quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
.quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
.quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4
/*== dIndexMed: the Shifter bit pattern with 16 in the low mantissa bits.
     NOTE(review): presumably the shifted index at which the lookup moves
     to the upper half of Tbl_H -- confirm against the kernel.  ==*/
.align 64
.quad 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010
/*== Pi2: Pi/2 ==*/
.align 64
.quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
/*== coeff6: six polynomial coefficients, one 64-byte broadcast row each.
     The signs alternate and the last four rows are close to 1/9, -1/7,
     1/5 and -1/3 respectively.  ==*/
.align 64
.quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
.quad 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc
.quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
.quad 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da
.quad 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e
.quad 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d
.align 64
/* Record the symbol as a data object whose size is everything emitted
   since the label.  */
.type __svml_datan_data_internal_avx512,@object
.size __svml_datan_data_internal_avx512,.-__svml_datan_data_internal_avx512
/* 8-byte local constant: the two 32-bit words are the low and high halves
   of the double 1.0 (0x3ff0000000000000) in little-endian order.
   NOTE(review): no reference to .FLT_10 is visible near this definition;
   confirm that the kernel still uses it.  */
.align 8
.FLT_10:
.long 0x00000000,0x3ff00000
.type .FLT_10,@object
.size .FLT_10,8

View File

@ -0,0 +1,20 @@
/* AVX2 version of vectorized atanf.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the entry point before including the shared source, so that the
   code it defines as _ZGVeN16v_atanf is emitted under the name
   _ZGVeN16v_atanf_avx2_wrapper instead.  */
#define _ZGVeN16v_atanf _ZGVeN16v_atanf_avx2_wrapper
#include "../svml_s_atanf16_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized atanf, vector length is 16.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME must be set before the selector header is included;
   REDIRECT_NAME and IFUNC_SELECTOR are not defined in this file and
   presumably come from that header.  */
#define SYMBOL_NAME _ZGVeN16v_atanf
#include "ifunc-mathvec-avx512-skx.h"

/* Define _ZGVeN16v_atanf through the ifunc helper macro, passing it the
   implementation chooser IFUNC_SELECTOR ().  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());

#ifdef SHARED
/* Shared builds only: tie the __GI_ name to the redirected symbol and give
   it hidden visibility.  NOTE(review): exact aliasing semantics are those
   of __hidden_ver1, which is defined elsewhere.  */
__hidden_ver1 (_ZGVeN16v_atanf, __GI__ZGVeN16v_atanf,
__redirect__ZGVeN16v_atanf)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,174 @@
/* Function atanf vectorized with AVX-512.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/*
* ALGORITHM DESCRIPTION:
*
* For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
* For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
* For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
* For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
* For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
* Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
*
*/
/* Byte offsets of the fields of __svml_satan_data_internal_avx512.
   Every field starts on a 64-byte boundary.  Tbl_H holds 32 floats
   (128 bytes), which is why Pi2 follows it at 448 + 128 = 576.  The three
   polynomial coefficient rows are addressed individually as
   coeff_1..coeff_3.  */
#define AbsMask 0
#define Shifter 64
#define MaxThreshold 128
#define MOne 192
#define One 256
#define LargeX 320
#define Zero 384
#define Tbl_H 448
#define Pi2 576
#define coeff_1 640
#define coeff_2 704
#define coeff_3 768
#include <sysdep.h>
.text
/* Section name fixed: it previously read ".text.exex512", a typo for the
   EVEX-512 code section (compare .text.sse4 / .text.avx2 used by the other
   kernels).  */
.section .text.evex512,"ax",@progbits
/* _ZGVeN16v_atanf_skx: atanf of the 16 floats in %zmm0, result in %zmm0.
   The routine is branch-free and touches no memory other than the constant
   table __svml_satan_data_internal_avx512.

   Outline, per lane, with a = |x|:
     a <  MaxThreshold:  k = a rounded to a multiple of 1/4,
                         DiffX = a - k,  Y = 1 + a*k,
                         base  = Tbl_H[4*k]
     a >= MaxThreshold:  DiffX = -1,  Y = min (a, LargeX),  base = Pi2
     R      = DiffX / Y
     result = sign(x) ^ (base + R + R^3 * ((c1*R^2 + c2)*R^2 + c3))  */
ENTRY(_ZGVeN16v_atanf_skx)
/* zmm7 = |x| (AbsMask is at offset 0 of the table).  */
vandps __svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7
vmovups MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3
vmovups One+__svml_satan_data_internal_avx512(%rip), %zmm8
/* round to 2 bits after binary point:
   zmm5 = DiffX = |x| - round-to-nearest(|x|, 2 fraction bits).  */
vreduceps $40, {sae}, %zmm7, %zmm5
/* saturate X range */
vmovups LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6
vmovups Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2
/* k1 = (|x| >= MaxThreshold); predicate 29 is GE_OQ.  */
vcmpps $29, {sae}, %zmm3, %zmm7, %k1
/* table lookup sequence: zmm3 = lower 16 entries of Tbl_H.  */
vmovups Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3
/* zmm4 = |x| - DiffX, i.e. |x| rounded to a multiple of 1/4.  */
vsubps {rn-sae}, %zmm5, %zmm7, %zmm4
/* Adding Shifter leaves the rounded value of 4*|x| in the low mantissa
   bits of zmm1, where vpermt2ps below reads it as the table index.  */
vaddps {rn-sae}, %zmm2, %zmm7, %zmm1
/* zmm0 = x ^ |x| = sign bit of x, kept until the end.  */
vxorps %zmm0, %zmm7, %zmm0
/* zmm8 = Y = 1 + |x| * rounded(|x|).  */
vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
vmovups coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4
/* if|X|>=MaxThreshold, set DiffX=-1 */
vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
vmovups coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5
/* if|X|>=MaxThreshold, set Y=X (clamped to LargeX) */
vminps {sae}, %zmm7, %zmm6, %zmm8{%k1}
/* R+Rl = DiffX/Y
   The quotient is formed on mantissas (zmm12 / zmm15) and the exponents
   (zmm10 - zmm11) are re-applied afterwards with vscalefps.  */
vgetmantps $0, {sae}, %zmm9, %zmm12
vgetexpps {sae}, %zmm9, %zmm10
/* zmm3 = Tbl_H[index]: indices 0..15 select from zmm3 (lower half),
   16..31 from the upper half at Tbl_H+64.  */
vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3
vgetmantps $0, {sae}, %zmm8, %zmm15
vgetexpps {sae}, %zmm8, %zmm11
vmovups coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1
/* set table value to Pi/2 for large X */
vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
/* zmm13 ~= 1 / mant(Y) (approximate reciprocal).  */
vrcp14ps %zmm15, %zmm13
/* zmm2 = exp(DiffX) - exp(Y).  */
vsubps {rn-sae}, %zmm11, %zmm10, %zmm2
/* zmm14 = q0 = mant(DiffX) * rcp.  */
vmulps {rn-sae}, %zmm13, %zmm12, %zmm14
/* zmm15 = residual = mant(DiffX) - q0 * mant(Y).  */
vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15
/* zmm15 = q0 + rcp * residual (one correction step).  */
vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15
/* zmm7 = R = refined quotient scaled by 2^(exponent difference).  */
vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7
/* polynomial evaluation: zmm8 = R^2, zmm6 = R^3.  */
vmulps {rn-sae}, %zmm7, %zmm7, %zmm8
vmulps {rn-sae}, %zmm7, %zmm8, %zmm6
/* zmm4 = c1*R^2 + c2.  */
vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4
/* zmm8 = (c1*R^2 + c2)*R^2 + c3.  */
vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8
/* zmm8 = R + R^3 * poly.  */
vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8
/* Add the table value (or Pi/2) and restore the sign of x.  */
vaddps {rn-sae}, %zmm9, %zmm8, %zmm10
vxorps %zmm0, %zmm10, %zmm0
ret
END(_ZGVeN16v_atanf_skx)
/* Read-only constant table for the AVX-512 single-precision atan kernel.
   Every field starts on a 64-byte boundary; scalar constants are broadcast
   to all 16 float lanes.  The field order must match the byte offsets
   #defined at the top of this file.  */
.section .rodata, "a"
.align 64
/* Layout description only: this C-style struct is compiled out unless
   __svml_satan_data_internal_avx512_typedef is defined.  */
#ifdef __svml_satan_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(64)) VUINT32 AbsMask[16][1];
__declspec(align(64)) VUINT32 Shifter[16][1];
__declspec(align(64)) VUINT32 MaxThreshold[16][1];
__declspec(align(64)) VUINT32 MOne[16][1];
__declspec(align(64)) VUINT32 One[16][1];
__declspec(align(64)) VUINT32 LargeX[16][1];
__declspec(align(64)) VUINT32 Zero[16][1];
__declspec(align(64)) VUINT32 Tbl_H[32][1];
__declspec(align(64)) VUINT32 Pi2[16][1];
__declspec(align(64)) VUINT32 coeff[3][16][1];
} __svml_satan_data_internal_avx512;
#endif
__svml_satan_data_internal_avx512:
/*== AbsMask: every bit except the sign bit of a float ==*/
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/*== Shifter: 2^21, whose unit in the last place is 0.25 ==*/
.align 64
.long 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
/*== MaxThreshold: 7.75 ==*/
.align 64
.long 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
/*== MOne: -1.0 ==*/
.align 64
.long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
/*== One: 1.0 ==*/
.align 64
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/*== LargeX: 2^32 ==*/
.align 64
.long 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
/*== Zero: +0.0 ==*/
.align 64
.long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
/*== Tbl_H: 32 floats (two 64-byte rows, read as Tbl_H and Tbl_H+64).
     The values match atan(i/4) for i = 0..31: entry 4 is Pi/4.  ==*/
.align 64
.long 0x00000000, 0x3e7adbb0
.long 0x3eed6338, 0x3f24bc7d
.long 0x3f490fdb, 0x3f6563e3
.long 0x3f7b985f, 0x3f869c79
.long 0x3f8db70d, 0x3f93877b
.long 0x3f985b6c, 0x3f9c6b53
.long 0x3f9fe0bb, 0x3fa2daa4
.long 0x3fa57088, 0x3fa7b46f
.long 0x3fa9b465, 0x3fab7b7a
.long 0x3fad1283, 0x3fae809e
.long 0x3fafcb99, 0x3fb0f836
.long 0x3fb20a6a, 0x3fb30581
.long 0x3fb3ec43, 0x3fb4c10a
.long 0x3fb585d7, 0x3fb63c64
.long 0x3fb6e62c, 0x3fb78478
.long 0x3fb81868, 0x3fb8a2f5
/*== Pi2: Pi/2 ==*/
.align 64
.long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
/*== coeff3: three polynomial coefficient rows, addressed as coeff_1,
     coeff_2 and coeff_3 in that order (about -0.14, about 0.2 and
     about -1/3).  ==*/
.align 64
.long 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
.long 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
.long 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
.align 64
/* Record the symbol as a data object whose size is everything emitted
   since the label.  */
.type __svml_satan_data_internal_avx512,@object
.size __svml_satan_data_internal_avx512,.-__svml_satan_data_internal_avx512

View File

@ -0,0 +1,20 @@
/* SSE2 version of vectorized atanf, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the entry point before including the shared source, so that the
   code it defines as _ZGVbN4v_atanf is emitted under the name
   _ZGVbN4v_atanf_sse2 instead.  */
#define _ZGVbN4v_atanf _ZGVbN4v_atanf_sse2
#include "../svml_s_atanf4_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized atanf, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME must be set before the selector header is included;
   REDIRECT_NAME and IFUNC_SELECTOR are not defined in this file and
   presumably come from that header.  */
#define SYMBOL_NAME _ZGVbN4v_atanf
#include "ifunc-mathvec-sse4_1.h"

/* Define _ZGVbN4v_atanf through the ifunc helper macro, passing it the
   implementation chooser IFUNC_SELECTOR ().  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());

#ifdef SHARED
/* Shared builds only: tie the __GI_ name to the redirected symbol and give
   it hidden visibility.  NOTE(review): exact aliasing semantics are those
   of __hidden_ver1, which is defined elsewhere.  */
__hidden_ver1 (_ZGVbN4v_atanf, __GI__ZGVbN4v_atanf,
__redirect__ZGVbN4v_atanf)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,164 @@
/* Function atanf vectorized with SSE4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/*
* ALGORITHM DESCRIPTION:
*
* For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
* For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
* For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
* For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
* For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
* Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
*
*/
/* Byte offsets of the fields of __svml_satan_data_internal.
   Each field is one 16-byte row (4 floats), so consecutive offsets differ
   by 16.  The order must match the order in which the rows are emitted in
   the .rodata table at the end of this file.  */
#define _sSIGN_MASK 0
#define _sABS_MASK 16
#define _sONE 32
#define _sPIO2 48
#define _sPC8 64
#define _sPC7 80
#define _sPC6 96
#define _sPC5 112
#define _sPC4 128
#define _sPC3 144
#define _sPC2 160
#define _sPC1 176
#define _sPC0 192
#include <sysdep.h>
.text
.section .text.sse4,"ax",@progbits
/* _ZGVbN4v_atanf_sse4: atanf of the 4 floats in %xmm0, result in %xmm0.
   Branch-free; the only memory read is the constant table
   __svml_satan_data_internal.

   Per lane, with a = |x|:
     r    = min (a, 1) / max (a, 1), with the sign of x, negated if a > 1
            (so r = x when a <= 1 and r = -1/x otherwise)
     base = (a <= 1) ? 0 : Pi/2 with the sign of x
     result = base + r * (PC0 + r^2 * P(r^2)), P having coefficients
              PC1..PC8.  */
ENTRY(_ZGVbN4v_atanf_sse4)
/*
* To use minps\maxps operations for argument reduction
* uncomment _AT_USEMINMAX_ definition
* Declarations
* Variables
* Constants
*/
movups _sABS_MASK+__svml_satan_data_internal(%rip), %xmm2
/*
* 1) If x>1, then r=-1/x, PIO2=Pi/2
* 2) If -1<=x<=1, then r=x, PIO2=0
* 3) If x<-1, then r=-1/x, PIO2=-Pi/2
*/
movups _sONE+__svml_satan_data_internal(%rip), %xmm1
/* xmm2 = |x|.  */
andps %xmm0, %xmm2
movaps %xmm2, %xmm9
movaps %xmm1, %xmm3
/* xmm9 = all-ones mask in lanes where |x| <= 1.  */
cmpleps %xmm1, %xmm9
/* xmm3 = max (1, |x|), xmm1 = min (1, |x|), then xmm1 = min / max.  */
maxps %xmm2, %xmm3
minps %xmm2, %xmm1
divps %xmm3, %xmm1
/* xmm4 = sign-bit mask (_sSIGN_MASK is at offset 0).  */
movups __svml_satan_data_internal(%rip), %xmm4
/* Keep a copy of the |x| <= 1 mask for selecting Pi/2 later.  */
movaps %xmm9, %xmm10
/* xmm0 = sign bit of x.  */
andps %xmm4, %xmm0
/* xmm9 = sign bit set in lanes where |x| > 1 (the extra negation).  */
andnps %xmm4, %xmm9
/* Combine both sign contributions and apply them to the quotient:
   xmm9 = r.  */
pxor %xmm0, %xmm9
pxor %xmm1, %xmm9
/* Polynomial.
   xmm8 = r^2, xmm7 = r^4.  The even-indexed coefficients (PC8, PC6, PC4,
   PC2) accumulate in xmm6 and the odd-indexed ones (PC7, PC5, PC3, PC1)
   in xmm5/xmm7, each as a Horner chain in r^4; the two chains are
   interleaved.  */
movaps %xmm9, %xmm8
mulps %xmm9, %xmm8
movaps %xmm8, %xmm7
mulps %xmm8, %xmm7
movups _sPC8+__svml_satan_data_internal(%rip), %xmm6
mulps %xmm7, %xmm6
movups _sPC7+__svml_satan_data_internal(%rip), %xmm5
mulps %xmm7, %xmm5
addps _sPC6+__svml_satan_data_internal(%rip), %xmm6
mulps %xmm7, %xmm6
addps _sPC5+__svml_satan_data_internal(%rip), %xmm5
mulps %xmm7, %xmm5
addps _sPC4+__svml_satan_data_internal(%rip), %xmm6
mulps %xmm7, %xmm6
addps _sPC3+__svml_satan_data_internal(%rip), %xmm5
/* From here on xmm7 carries the odd chain (r^4 is no longer needed).  */
mulps %xmm5, %xmm7
addps _sPC2+__svml_satan_data_internal(%rip), %xmm6
/* Even chain gets the extra factor r^2 before the chains are merged.  */
mulps %xmm8, %xmm6
addps _sPC1+__svml_satan_data_internal(%rip), %xmm7
/* xmm10 = Pi/2 in lanes where |x| > 1, zero elsewhere.  */
andnps _sPIO2+__svml_satan_data_internal(%rip), %xmm10
/* xmm7 = P(r^2) = PC1 + PC2*r^2 + ... + PC8*r^14.  */
addps %xmm6, %xmm7
mulps %xmm7, %xmm8
/* Give the Pi/2 term the sign of x.  */
pxor %xmm0, %xmm10
/* xmm8 = PC0 + r^2 * P(r^2).  */
addps _sPC0+__svml_satan_data_internal(%rip), %xmm8
/* Reconstruction: result = base + r * xmm8.  */
mulps %xmm8, %xmm9
addps %xmm9, %xmm10
movaps %xmm10, %xmm0
ret
END(_ZGVbN4v_atanf_sse4)
/* Read-only constant table for the SSE4 single-precision atan kernel.
   Each row is one value broadcast to 4 float lanes (16 bytes, 16-byte
   aligned).  The row order must match the _s* byte offsets #defined at
   the top of this file.  */
.section .rodata, "a"
.align 16
/* Layout description only: this C-style struct is compiled out unless
   __svml_satan_data_internal_typedef is defined.  */
#ifdef __svml_satan_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(16)) VUINT32 _sSIGN_MASK[4][1];
__declspec(align(16)) VUINT32 _sABS_MASK[4][1];
__declspec(align(16)) VUINT32 _sONE[4][1];
__declspec(align(16)) VUINT32 _sPIO2[4][1];
__declspec(align(16)) VUINT32 _sPC8[4][1];
__declspec(align(16)) VUINT32 _sPC7[4][1];
__declspec(align(16)) VUINT32 _sPC6[4][1];
__declspec(align(16)) VUINT32 _sPC5[4][1];
__declspec(align(16)) VUINT32 _sPC4[4][1];
__declspec(align(16)) VUINT32 _sPC3[4][1];
__declspec(align(16)) VUINT32 _sPC2[4][1];
__declspec(align(16)) VUINT32 _sPC1[4][1];
__declspec(align(16)) VUINT32 _sPC0[4][1];
} __svml_satan_data_internal;
#endif
__svml_satan_data_internal:
/* Bit masks: sign bit only, then everything but the sign bit.  */
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000 //_sSIGN_MASK
.align 16
.long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF //_sABS_MASK
.align 16
/* 1.0 and Pi/2.  */
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 //_sONE
.align 16
.long 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB //_sPIO2
.align 16
/* Polynomial coefficients, highest order (_sPC8) first, with alternating
   signs; _sPC1 is close to -1/3 and _sPC0 is exactly 1.0.  */
.long 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 //_sPC8
.align 16
.long 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 //_sPC7
.align 16
.long 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 //_sPC6
.align 16
.long 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 //_sPC5
.align 16
.long 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 //_sPC4
.align 16
.long 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 //_sPC3
.align 16
.long 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F //_sPC2
.align 16
.long 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 //_sPC1
.align 16
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 //_sPC0
.align 16
/* Record the symbol as a data object whose size is everything emitted
   since the label.  */
.type __svml_satan_data_internal,@object
.size __svml_satan_data_internal,.-__svml_satan_data_internal

View File

@ -0,0 +1,20 @@
/* SSE version of vectorized atanf, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the entry point before including the shared source, so that the
   code it defines as _ZGVdN8v_atanf is emitted under the name
   _ZGVdN8v_atanf_sse_wrapper instead.  */
#define _ZGVdN8v_atanf _ZGVdN8v_atanf_sse_wrapper
#include "../svml_s_atanf8_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized atanf, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME must be set before the selector header is included;
   REDIRECT_NAME and IFUNC_SELECTOR are not defined in this file and
   presumably come from that header.  */
#define SYMBOL_NAME _ZGVdN8v_atanf
#include "ifunc-mathvec-avx2.h"

/* Define _ZGVdN8v_atanf through the ifunc helper macro, passing it the
   implementation chooser IFUNC_SELECTOR ().  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());

#ifdef SHARED
/* Shared builds only: tie the __GI_ name to the redirected symbol and give
   it hidden visibility.  NOTE(review): exact aliasing semantics are those
   of __hidden_ver1, which is defined elsewhere.  */
__hidden_ver1 (_ZGVdN8v_atanf, __GI__ZGVdN8v_atanf,
__redirect__ZGVdN8v_atanf)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,148 @@
/* Function atanf vectorized with AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/*
* ALGORITHM DESCRIPTION:
*
* For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
* For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
* For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
* For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
* For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
* Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
*
*/
/* Byte offsets of the fields of __svml_satan_data_internal.
   Each field is one 32-byte row (8 floats), so consecutive offsets differ
   by 32.  The order must match the order in which the rows are emitted in
   the .rodata table at the end of this file.  */
#define _sSIGN_MASK 0
#define _sABS_MASK 32
#define _sONE 64
#define _sPIO2 96
#define _sPC8 128
#define _sPC7 160
#define _sPC6 192
#define _sPC5 224
#define _sPC4 256
#define _sPC3 288
#define _sPC2 320
#define _sPC1 352
#define _sPC0 384
#include <sysdep.h>
.text
.section .text.avx2,"ax",@progbits
/* _ZGVdN8v_atanf_avx2: atanf of the 8 floats in %ymm0, result in %ymm0.
   Branch-free; the only memory read is the constant table
   __svml_satan_data_internal.

   Per lane, with a = |x|:
     r    = min (a, 1) / max (a, 1), with the sign of x, negated if a > 1
            (so r = x when a <= 1 and r = -1/x otherwise)
     base = (a <= 1) ? 0 : Pi/2 with the sign of x
     result = base + r * (PC0 + r^2 * P(r^2)), P having coefficients
              PC1..PC8.  */
ENTRY(_ZGVdN8v_atanf_avx2)
/*
* 1) If x>1, then r=-1/x, PIO2=Pi/2
* 2) If -1<=x<=1, then r=x, PIO2=0
* 3) If x<-1, then r=-1/x, PIO2=-Pi/2
*/
vmovups _sONE+__svml_satan_data_internal(%rip), %ymm2
/* ymm7 = sign-bit mask (_sSIGN_MASK is at offset 0).  */
vmovups __svml_satan_data_internal(%rip), %ymm7
/* ymm13 starts the odd-coefficient chain with PC7.  */
vmovups _sPC7+__svml_satan_data_internal(%rip), %ymm13
/*
* To use minps\maxps operations for argument reduction
* uncomment _AT_USEMINMAX_ definition
* Declarations
* Variables
* Constants
*/
/* ymm3 = |x|.  */
vandps _sABS_MASK+__svml_satan_data_internal(%rip), %ymm0, %ymm3
/* ymm5 = max (1, |x|), ymm4 = min (1, |x|).  */
vmaxps %ymm3, %ymm2, %ymm5
vminps %ymm3, %ymm2, %ymm4
/* ymm6 = all-ones mask in lanes where |x| <= 1.  */
vcmple_oqps %ymm2, %ymm3, %ymm6
/* ymm11 = min / max.  */
vdivps %ymm5, %ymm4, %ymm11
/* ymm9 = sign bit of x.  */
vandps %ymm7, %ymm0, %ymm9
/* ymm8 = sign bit set in lanes where |x| > 1 (the extra negation).  */
vandnps %ymm7, %ymm6, %ymm8
/* Combine both sign contributions and apply them to the quotient:
   ymm15 = r.  */
vxorps %ymm9, %ymm8, %ymm10
vxorps %ymm11, %ymm10, %ymm15
/* Polynomial.
   ymm14 = r^2, ymm12 = r^4.  ymm0 runs the even-indexed chain
   (PC8, PC6, PC4, PC2) and ymm13 the odd-indexed chain
   (PC7, PC5, PC3, PC1), each a Horner chain in r^4.  */
vmulps %ymm15, %ymm15, %ymm14
vmovups _sPC8+__svml_satan_data_internal(%rip), %ymm0
vmulps %ymm14, %ymm14, %ymm12
vfmadd213ps _sPC6+__svml_satan_data_internal(%rip), %ymm12, %ymm0
vfmadd213ps _sPC5+__svml_satan_data_internal(%rip), %ymm12, %ymm13
vfmadd213ps _sPC4+__svml_satan_data_internal(%rip), %ymm12, %ymm0
vfmadd213ps _sPC3+__svml_satan_data_internal(%rip), %ymm12, %ymm13
vfmadd213ps _sPC2+__svml_satan_data_internal(%rip), %ymm12, %ymm0
vfmadd213ps _sPC1+__svml_satan_data_internal(%rip), %ymm12, %ymm13
/* Merge the chains: ymm0 = even * r^2 + odd = P(r^2).  */
vfmadd213ps %ymm13, %ymm14, %ymm0
/* ymm0 = PC0 + r^2 * P(r^2).  */
vfmadd213ps _sPC0+__svml_satan_data_internal(%rip), %ymm14, %ymm0
/* ymm1 = Pi/2 in lanes where |x| > 1, zero elsewhere, then given the
   sign of x.  */
vandnps _sPIO2+__svml_satan_data_internal(%rip), %ymm6, %ymm1
vxorps %ymm9, %ymm1, %ymm1
/* Reconstruction: result = r * ymm0 + base.  */
vfmadd213ps %ymm1, %ymm15, %ymm0
ret
END(_ZGVdN8v_atanf_avx2)
/* Read-only constant table for the AVX2 single-precision atan kernel.
   Each row is one value broadcast to 8 float lanes (32 bytes, 32-byte
   aligned).  The row order must match the _s* byte offsets #defined at
   the top of this file.  */
.section .rodata, "a"
.align 32
/* Layout description only: this C-style struct is compiled out unless
   __svml_satan_data_internal_typedef is defined.  */
#ifdef __svml_satan_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(32)) VUINT32 _sSIGN_MASK[8][1];
__declspec(align(32)) VUINT32 _sABS_MASK[8][1];
__declspec(align(32)) VUINT32 _sONE[8][1];
__declspec(align(32)) VUINT32 _sPIO2[8][1];
__declspec(align(32)) VUINT32 _sPC8[8][1];
__declspec(align(32)) VUINT32 _sPC7[8][1];
__declspec(align(32)) VUINT32 _sPC6[8][1];
__declspec(align(32)) VUINT32 _sPC5[8][1];
__declspec(align(32)) VUINT32 _sPC4[8][1];
__declspec(align(32)) VUINT32 _sPC3[8][1];
__declspec(align(32)) VUINT32 _sPC2[8][1];
__declspec(align(32)) VUINT32 _sPC1[8][1];
__declspec(align(32)) VUINT32 _sPC0[8][1];
} __svml_satan_data_internal;
#endif
__svml_satan_data_internal:
/* Bit masks: sign bit only, then everything but the sign bit.  */
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 //_sSIGN_MASK
.align 32
.long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF //_sABS_MASK
.align 32
/* 1.0 and Pi/2.  */
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 //_sONE
.align 32
.long 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB //_sPIO2
.align 32
/* Polynomial coefficients, highest order (_sPC8) first, with alternating
   signs; _sPC1 is close to -1/3 and _sPC0 is exactly 1.0.  */
.long 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 //_sPC8
.align 32
.long 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 //_sPC7
.align 32
.long 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 //_sPC6
.align 32
.long 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 //_sPC5
.align 32
.long 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 //_sPC4
.align 32
.long 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 //_sPC3
.align 32
.long 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F //_sPC2
.align 32
.long 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 //_sPC1
.align 32
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 //_sPC0
.align 32
/* Record the symbol as a data object whose size is everything emitted
   since the label.  */
.type __svml_satan_data_internal,@object
.size __svml_satan_data_internal,.-__svml_satan_data_internal

View File

@ -0,0 +1,29 @@
/* Function atan vectorized with SSE2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* _ZGVbN2v_atan has no hand-written body: it is generated by the
   WRAPPER_IMPL_SSE2 macro (from svml_d_wrapper_impl.h), which is handed the
   scalar function name atan.  NOTE(review): the macro presumably calls atan
   once per vector element -- see its definition.  */
ENTRY (_ZGVbN2v_atan)
WRAPPER_IMPL_SSE2 atan
END (_ZGVbN2v_atan)
/* The hidden definition is emitted only when USE_MULTIARCH is not
   defined.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN2v_atan)
#endif

View File

@ -0,0 +1,29 @@
/* Function atan vectorized with AVX2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* _ZGVdN4v_atan has no hand-written body: it is generated by the
   WRAPPER_IMPL_AVX macro (from svml_d_wrapper_impl.h), which is handed the
   narrower vector routine _ZGVbN2v_atan to build on.  */
ENTRY (_ZGVdN4v_atan)
WRAPPER_IMPL_AVX _ZGVbN2v_atan
END (_ZGVdN4v_atan)
/* The hidden definition is emitted only when USE_MULTIARCH is not
   defined.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVdN4v_atan)
#endif

View File

@ -0,0 +1,25 @@
/* Function atan vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* _ZGVcN4v_atan has no hand-written body: it is generated by the
   WRAPPER_IMPL_AVX macro (from svml_d_wrapper_impl.h), which is handed the
   narrower vector routine _ZGVbN2v_atan to build on.  */
ENTRY (_ZGVcN4v_atan)
WRAPPER_IMPL_AVX _ZGVbN2v_atan
END (_ZGVcN4v_atan)

View File

@ -0,0 +1,25 @@
/* Function atan vectorized with AVX-512, wrapper to AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* _ZGVeN8v_atan has no hand-written body: it is generated by the
   WRAPPER_IMPL_AVX512 macro (from svml_d_wrapper_impl.h), which is handed
   the narrower vector routine _ZGVdN4v_atan to build on.  */
ENTRY (_ZGVeN8v_atan)
WRAPPER_IMPL_AVX512 _ZGVdN4v_atan
END (_ZGVeN8v_atan)

View File

@ -0,0 +1,25 @@
/* Function atanf vectorized with AVX-512. Wrapper to AVX2 version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* AVX-512 ("e") entry point for vector atanf on floats.  The body is
   whatever WRAPPER_IMPL_AVX512 expands to when handed the AVX2 ("d")
   symbol _ZGVdN8v_atanf; the macro is defined outside this file
   (presumably in svml_s_wrapper_impl.h).  */
ENTRY (_ZGVeN16v_atanf)
WRAPPER_IMPL_AVX512 _ZGVdN8v_atanf
END (_ZGVeN16v_atanf)

View File

@ -0,0 +1,29 @@
/* Function atanf vectorized with SSE2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* SSE ("b") entry point for vector atanf on floats.  Unlike the wider
   wrappers, WRAPPER_IMPL_SSE2 is handed the scalar symbol atanf rather
   than another vector variant.  The macro's expansion is not visible
   here (presumably from svml_s_wrapper_impl.h).  */
ENTRY (_ZGVbN4v_atanf)
WRAPPER_IMPL_SSE2 atanf
END (_ZGVbN4v_atanf)
/* The hidden definition is emitted only when USE_MULTIARCH is not
   defined.  NOTE(review): presumably a multiarch build provides it
   from a different file -- confirm against the multiarch sources.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN4v_atanf)
#endif

View File

@ -0,0 +1,29 @@
/* Function atanf vectorized with AVX2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* AVX2 ("d") entry point for vector atanf on floats.  It is built with
   WRAPPER_IMPL_AVX, the same macro the AVX ("c") variant uses, and is
   handed the SSE ("b") symbol _ZGVbN4v_atanf.  The macro's expansion
   is not visible here (presumably from svml_s_wrapper_impl.h).  */
ENTRY (_ZGVdN8v_atanf)
WRAPPER_IMPL_AVX _ZGVbN4v_atanf
END (_ZGVdN8v_atanf)
/* The hidden definition is emitted only when USE_MULTIARCH is not
   defined.  NOTE(review): presumably a multiarch build provides it
   from a different file -- confirm against the multiarch sources.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVdN8v_atanf)
#endif

View File

@ -0,0 +1,25 @@
/* Function atanf vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* AVX ("c") entry point for vector atanf on floats.  No math is done
   here: WRAPPER_IMPL_AVX is given the SSE ("b") symbol _ZGVbN4v_atanf
   as the routine to wrap.  The macro's expansion is not visible in
   this file; presumably it comes from svml_s_wrapper_impl.h.  */
ENTRY (_ZGVcN8v_atanf)
WRAPPER_IMPL_AVX _ZGVbN4v_atanf
END (_ZGVcN8v_atanf)

View File

@ -0,0 +1 @@
#include "test-double-libmvec-atan.c"

View File

@ -0,0 +1 @@
#include "test-double-libmvec-atan.c"

View File

@ -0,0 +1 @@
#include "test-double-libmvec-atan.c"

View File

@ -0,0 +1,3 @@
/* Parameters for the shared one-argument vector ABI test: the element
   type and the scalar function name must be defined before the
   template header is included.  What the header does with them is not
   visible here.  */
#define LIBMVEC_TYPE double
#define LIBMVEC_FUNC atan
#include "test-vector-abi-arg1.h"

View File

@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
/* Pair the WRAPPER_NAME of each double-precision function with its
   SSE ("b") libmvec symbol.  VECTOR_WRAPPER and VECTOR_WRAPPER_ff are
   defined outside this excerpt, so their expansion is not shown.  */
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow)
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVbN2v_acos)
VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVbN2v_atan)
#define VEC_INT_TYPE __m128i

View File

@ -31,6 +31,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log)
/* Pair the WRAPPER_NAME of each double-precision function with its
   AVX2 ("d") libmvec symbol.  VECTOR_WRAPPER and VECTOR_WRAPPER_ff are
   defined outside this excerpt, so their expansion is not shown.  */
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp)
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow)
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVdN4v_acos)
VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVdN4v_atan)
#ifndef __ILP32__
# define VEC_INT_TYPE __m256i

View File

@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log)
/* Pair the WRAPPER_NAME of each double-precision function with its
   AVX ("c") libmvec symbol.  VECTOR_WRAPPER and VECTOR_WRAPPER_ff are
   defined outside this excerpt, so their expansion is not shown.  */
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp)
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow)
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVcN4v_acos)
VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVcN4v_atan)
#define VEC_INT_TYPE __m128i

View File

@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log)
/* Pair the WRAPPER_NAME of each double-precision function with its
   AVX-512 ("e") libmvec symbol.  VECTOR_WRAPPER and VECTOR_WRAPPER_ff
   are defined outside this excerpt, so their expansion is not shown.  */
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp)
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow)
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVeN8v_acos)
VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVeN8v_atan)
#ifndef __ILP32__
# define VEC_INT_TYPE __m512i

View File

@ -0,0 +1 @@
#include "test-float-libmvec-atanf.c"

View File

@ -0,0 +1 @@
#include "test-float-libmvec-atanf.c"

View File

@ -0,0 +1 @@
#include "test-float-libmvec-atanf.c"

View File

@ -0,0 +1,3 @@
/* Parameters for the shared one-argument vector ABI test: the element
   type and the scalar function name must be defined before the
   template header is included.  What the header does with them is not
   visible here.  */
#define LIBMVEC_TYPE float
#define LIBMVEC_FUNC atanf
#include "test-vector-abi-arg1.h"

View File

@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf)
/* Pair the WRAPPER_NAME of each single-precision function with its
   AVX-512 ("e") libmvec symbol.  VECTOR_WRAPPER and VECTOR_WRAPPER_ff
   are defined outside this excerpt, so their expansion is not shown.  */
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf)
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVeN16v_acosf)
VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVeN16v_atanf)
#define VEC_INT_TYPE __m512i

View File

@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf)
/* Pair the WRAPPER_NAME of each single-precision function with its
   SSE ("b") libmvec symbol.  VECTOR_WRAPPER and VECTOR_WRAPPER_ff are
   defined outside this excerpt, so their expansion is not shown.  */
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf)
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVbN4v_acosf)
VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVbN4v_atanf)
#define VEC_INT_TYPE __m128i

View File

@ -31,6 +31,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf)
/* Pair the WRAPPER_NAME of each single-precision function with its
   AVX2 ("d") libmvec symbol.  VECTOR_WRAPPER and VECTOR_WRAPPER_ff are
   defined outside this excerpt, so their expansion is not shown.  */
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf)
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVdN8v_acosf)
VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVdN8v_atanf)
/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */
#undef VECTOR_WRAPPER_fFF

View File

@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf)
/* Pair the WRAPPER_NAME of each single-precision function with its
   AVX ("c") libmvec symbol.  VECTOR_WRAPPER and VECTOR_WRAPPER_ff are
   defined outside this excerpt, so their expansion is not shown.  */
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf)
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVcN8v_acosf)
VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVcN8v_atanf)
#define VEC_INT_TYPE __m128i