mirror of
https://sourceware.org/git/glibc.git
synced 2024-11-21 20:40:05 +00:00
x86-64: Add vector atan/atanf implementation to libmvec
Implement vectorized atan/atanf containing SSE, AVX, AVX2 and AVX512 versions for libmvec as per vector ABI. It also contains accuracy and ABI tests for vector atan/atanf with regenerated ulps. Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
parent
5d28a8962d
commit
146310177a
@ -109,4 +109,15 @@
|
|||||||
#define __DECL_SIMD_acosf32x
|
#define __DECL_SIMD_acosf32x
|
||||||
#define __DECL_SIMD_acosf64x
|
#define __DECL_SIMD_acosf64x
|
||||||
#define __DECL_SIMD_acosf128x
|
#define __DECL_SIMD_acosf128x
|
||||||
|
|
||||||
|
#define __DECL_SIMD_atan
|
||||||
|
#define __DECL_SIMD_atanf
|
||||||
|
#define __DECL_SIMD_atanl
|
||||||
|
#define __DECL_SIMD_atanf16
|
||||||
|
#define __DECL_SIMD_atanf32
|
||||||
|
#define __DECL_SIMD_atanf64
|
||||||
|
#define __DECL_SIMD_atanf128
|
||||||
|
#define __DECL_SIMD_atanf32x
|
||||||
|
#define __DECL_SIMD_atanf64x
|
||||||
|
#define __DECL_SIMD_atanf128x
|
||||||
#endif
|
#endif
|
||||||
|
@ -54,7 +54,7 @@ __MATHCALL_VEC (acos,, (_Mdouble_ __x));
|
|||||||
/* Arc sine of X. */
|
/* Arc sine of X. */
|
||||||
__MATHCALL (asin,, (_Mdouble_ __x));
|
__MATHCALL (asin,, (_Mdouble_ __x));
|
||||||
/* Arc tangent of X. */
|
/* Arc tangent of X. */
|
||||||
__MATHCALL (atan,, (_Mdouble_ __x));
|
__MATHCALL_VEC (atan,, (_Mdouble_ __x));
|
||||||
/* Arc tangent of Y/X. */
|
/* Arc tangent of Y/X. */
|
||||||
__MATHCALL (atan2,, (_Mdouble_ __y, _Mdouble_ __x));
|
__MATHCALL (atan2,, (_Mdouble_ __y, _Mdouble_ __x));
|
||||||
|
|
||||||
|
@ -47,10 +47,18 @@ GLIBC_2.22 _ZGVeN8v_sin F
|
|||||||
GLIBC_2.22 _ZGVeN8vv_pow F
|
GLIBC_2.22 _ZGVeN8vv_pow F
|
||||||
GLIBC_2.22 _ZGVeN8vvv_sincos F
|
GLIBC_2.22 _ZGVeN8vvv_sincos F
|
||||||
GLIBC_2.35 _ZGVbN2v_acos F
|
GLIBC_2.35 _ZGVbN2v_acos F
|
||||||
|
GLIBC_2.35 _ZGVbN2v_atan F
|
||||||
GLIBC_2.35 _ZGVbN4v_acosf F
|
GLIBC_2.35 _ZGVbN4v_acosf F
|
||||||
|
GLIBC_2.35 _ZGVbN4v_atanf F
|
||||||
GLIBC_2.35 _ZGVcN4v_acos F
|
GLIBC_2.35 _ZGVcN4v_acos F
|
||||||
|
GLIBC_2.35 _ZGVcN4v_atan F
|
||||||
GLIBC_2.35 _ZGVcN8v_acosf F
|
GLIBC_2.35 _ZGVcN8v_acosf F
|
||||||
|
GLIBC_2.35 _ZGVcN8v_atanf F
|
||||||
GLIBC_2.35 _ZGVdN4v_acos F
|
GLIBC_2.35 _ZGVdN4v_acos F
|
||||||
|
GLIBC_2.35 _ZGVdN4v_atan F
|
||||||
GLIBC_2.35 _ZGVdN8v_acosf F
|
GLIBC_2.35 _ZGVdN8v_acosf F
|
||||||
|
GLIBC_2.35 _ZGVdN8v_atanf F
|
||||||
GLIBC_2.35 _ZGVeN16v_acosf F
|
GLIBC_2.35 _ZGVeN16v_acosf F
|
||||||
|
GLIBC_2.35 _ZGVeN16v_atanf F
|
||||||
GLIBC_2.35 _ZGVeN8v_acos F
|
GLIBC_2.35 _ZGVeN8v_acos F
|
||||||
|
GLIBC_2.35 _ZGVeN8v_atan F
|
||||||
|
@ -62,6 +62,10 @@
|
|||||||
# define __DECL_SIMD_acos __DECL_SIMD_x86_64
|
# define __DECL_SIMD_acos __DECL_SIMD_x86_64
|
||||||
# undef __DECL_SIMD_acosf
|
# undef __DECL_SIMD_acosf
|
||||||
# define __DECL_SIMD_acosf __DECL_SIMD_x86_64
|
# define __DECL_SIMD_acosf __DECL_SIMD_x86_64
|
||||||
|
# undef __DECL_SIMD_atan
|
||||||
|
# define __DECL_SIMD_atan __DECL_SIMD_x86_64
|
||||||
|
# undef __DECL_SIMD_atanf
|
||||||
|
# define __DECL_SIMD_atanf __DECL_SIMD_x86_64
|
||||||
|
|
||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
@ -30,6 +30,8 @@
|
|||||||
!GCC$ builtin (powf) attributes simd (notinbranch) if('x86_64')
|
!GCC$ builtin (powf) attributes simd (notinbranch) if('x86_64')
|
||||||
!GCC$ builtin (acos) attributes simd (notinbranch) if('x86_64')
|
!GCC$ builtin (acos) attributes simd (notinbranch) if('x86_64')
|
||||||
!GCC$ builtin (acosf) attributes simd (notinbranch) if('x86_64')
|
!GCC$ builtin (acosf) attributes simd (notinbranch) if('x86_64')
|
||||||
|
!GCC$ builtin (atan) attributes simd (notinbranch) if('x86_64')
|
||||||
|
!GCC$ builtin (atanf) attributes simd (notinbranch) if('x86_64')
|
||||||
|
|
||||||
!GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
|
!GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
|
||||||
!GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
|
!GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
|
||||||
@ -45,3 +47,5 @@
|
|||||||
!GCC$ builtin (powf) attributes simd (notinbranch) if('x32')
|
!GCC$ builtin (powf) attributes simd (notinbranch) if('x32')
|
||||||
!GCC$ builtin (acos) attributes simd (notinbranch) if('x32')
|
!GCC$ builtin (acos) attributes simd (notinbranch) if('x32')
|
||||||
!GCC$ builtin (acosf) attributes simd (notinbranch) if('x32')
|
!GCC$ builtin (acosf) attributes simd (notinbranch) if('x32')
|
||||||
|
!GCC$ builtin (atan) attributes simd (notinbranch) if('x32')
|
||||||
|
!GCC$ builtin (atanf) attributes simd (notinbranch) if('x32')
|
||||||
|
@ -23,6 +23,7 @@ postclean-generated += libmvec.mk
|
|||||||
# Define for both math and mathvec directories.
|
# Define for both math and mathvec directories.
|
||||||
libmvec-funcs = \
|
libmvec-funcs = \
|
||||||
acos \
|
acos \
|
||||||
|
atan \
|
||||||
cos \
|
cos \
|
||||||
exp \
|
exp \
|
||||||
log \
|
log \
|
||||||
|
@ -15,6 +15,8 @@ libmvec {
|
|||||||
}
|
}
|
||||||
GLIBC_2.35 {
|
GLIBC_2.35 {
|
||||||
_ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
|
_ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
|
||||||
|
_ZGVbN2v_atan; _ZGVcN4v_atan; _ZGVdN4v_atan; _ZGVeN8v_atan;
|
||||||
_ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
|
_ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
|
||||||
|
_ZGVbN4v_atanf; _ZGVcN8v_atanf; _ZGVdN8v_atanf; _ZGVeN16v_atanf;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -164,6 +164,26 @@ float: 2
|
|||||||
float128: 2
|
float128: 2
|
||||||
ldouble: 1
|
ldouble: 1
|
||||||
|
|
||||||
|
Function: "atan_vlen16":
|
||||||
|
float: 1
|
||||||
|
|
||||||
|
Function: "atan_vlen2":
|
||||||
|
double: 1
|
||||||
|
|
||||||
|
Function: "atan_vlen4":
|
||||||
|
double: 1
|
||||||
|
float: 1
|
||||||
|
|
||||||
|
Function: "atan_vlen4_avx2":
|
||||||
|
double: 1
|
||||||
|
|
||||||
|
Function: "atan_vlen8":
|
||||||
|
double: 1
|
||||||
|
float: 1
|
||||||
|
|
||||||
|
Function: "atan_vlen8_avx2":
|
||||||
|
float: 1
|
||||||
|
|
||||||
Function: "atanh":
|
Function: "atanh":
|
||||||
double: 2
|
double: 2
|
||||||
float: 2
|
float: 2
|
||||||
|
20
sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core-sse2.S
Normal file
20
sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core-sse2.S
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
/* SSE2 version of vectorized atan, vector length is 2.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#define _ZGVbN2v_atan _ZGVbN2v_atan_sse2
|
||||||
|
#include "../svml_d_atan2_core.S"
|
27
sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core.c
Normal file
27
sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core.c
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
/* Multiple versions of vectorized atan, vector length is 2.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#define SYMBOL_NAME _ZGVbN2v_atan /* Public symbol this file provides (2-lane double atan).  */
|
||||||
|
#include "ifunc-mathvec-sse4_1.h" /* NOTE(review): presumably supplies REDIRECT_NAME and IFUNC_SELECTOR used below - confirm in that header.  */
|
||||||
|
|
||||||
|
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); /* NOTE(review): macro defined outside this view; appears to bind SYMBOL_NAME to the variant chosen by IFUNC_SELECTOR.  */
|
||||||
|
|
||||||
|
#ifdef SHARED /* The hidden alias below is only emitted when SHARED is defined.  */
|
||||||
|
__hidden_ver1 (_ZGVbN2v_atan, __GI__ZGVbN2v_atan, __redirect__ZGVbN2v_atan) /* Relates the public name, its __GI_ name and the __redirect_ name.  */
|
||||||
|
__attribute__ ((visibility ("hidden"))); /* The resulting declaration gets hidden visibility.  */
|
||||||
|
#endif
|
245
sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core_sse4.S
Normal file
245
sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core_sse4.S
Normal file
@ -0,0 +1,245 @@
|
|||||||
|
/* Function atan vectorized with SSE4.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ALGORITHM DESCRIPTION:
|
||||||
|
*
|
||||||
|
* For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
|
||||||
|
* For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
|
||||||
|
* For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
|
||||||
|
* For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
|
||||||
|
* For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
|
||||||
|
* Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Offsets for data table __svml_datan_data_internal_avx512
|
||||||
|
*/
|
||||||
|
#define AbsMask 0
|
||||||
|
#define Shifter 16
|
||||||
|
#define MaxThreshold 32
|
||||||
|
#define MOne 48
|
||||||
|
#define One 64
|
||||||
|
#define LargeX 80
|
||||||
|
#define Zero 96
|
||||||
|
#define Tbl_H 112
|
||||||
|
#define Tbl_L 368
|
||||||
|
#define dIndexMed 624
|
||||||
|
#define Pi2 640
|
||||||
|
#define Pi2_low 656
|
||||||
|
#define coeff 672
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
|
||||||
|
.text
|
||||||
|
.section .text.sse4,"ax",@progbits
|
||||||
|
ENTRY(_ZGVbN2v_atan_sse4) /* In: xmm0 = two doubles x.  Out: xmm0 = table value + R + R^3*Poly(R^2), with the sign bit of x re-applied.  */
|
||||||
|
lea Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rcx /* rcx = address 128 bytes into Tbl_H, so -128(rcx) is the first entry */
|
||||||
|
movups __svml_datan_data_internal_avx512(%rip), %xmm4 /* offset 0 of the table = AbsMask */
|
||||||
|
movups Shifter+__svml_datan_data_internal_avx512(%rip), %xmm3 /* xmm3 = Shifter */
|
||||||
|
andps %xmm0, %xmm4 /* xmm4 = |x| */
|
||||||
|
movaps %xmm3, %xmm12
|
||||||
|
movaps %xmm4, %xmm5 /* xmm5 = |x| (becomes DiffX below) */
|
||||||
|
addpd %xmm4, %xmm12 /* xmm12 = |x| + Shifter; kept for the dIndexMed compare */
|
||||||
|
movaps %xmm12, %xmm7
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* table lookup sequence
|
||||||
|
* VPERMUTE not available
|
||||||
|
*/
|
||||||
|
movaps %xmm12, %xmm10 /* copy of |x| + Shifter whose bits become the table offset */
|
||||||
|
subpd %xmm3, %xmm7 /* xmm7 = (|x| + Shifter) - Shifter = rounded |x| (call it c) */
|
||||||
|
subpd %xmm7, %xmm5 /* xmm5 = DiffX = |x| - c */
|
||||||
|
mulpd %xmm4, %xmm7 /* xmm7 = c * |x| */
|
||||||
|
movups MaxThreshold+__svml_datan_data_internal_avx512(%rip), %xmm2
|
||||||
|
psllq $3, %xmm10 /* shift the raw bits left by 3 (multiply the integer pattern by 8) */
|
||||||
|
|
||||||
|
|
||||||
|
/* saturate X range */
|
||||||
|
movups LargeX+__svml_datan_data_internal_avx512(%rip), %xmm8
|
||||||
|
pxor %xmm4, %xmm0 /* xmm0 = x ^ |x| = sign bit of x only */
|
||||||
|
cmplepd %xmm4, %xmm2 /* xmm2 = all-ones where MaxThreshold <= |x| ("large" mask) */
|
||||||
|
addpd One+__svml_datan_data_internal_avx512(%rip), %xmm7 /* xmm7 = 1 + c*|x| */
|
||||||
|
minpd %xmm4, %xmm8 /* xmm8 = min(LargeX, |x|) */
|
||||||
|
movups MOne+__svml_datan_data_internal_avx512(%rip), %xmm6
|
||||||
|
movaps %xmm2, %xmm1
|
||||||
|
movaps %xmm2, %xmm9
|
||||||
|
andnps %xmm5, %xmm1 /* xmm1 = DiffX where not large */
|
||||||
|
andps %xmm2, %xmm6 /* xmm6 = MOne where large */
|
||||||
|
andnps %xmm7, %xmm9 /* xmm9 = 1 + c*|x| where not large */
|
||||||
|
andps %xmm2, %xmm8 /* xmm8 = min(LargeX, |x|) where large */
|
||||||
|
orps %xmm6, %xmm1 /* numerator: large ? MOne : DiffX */
|
||||||
|
orps %xmm8, %xmm9 /* denominator: large ? min(LargeX, |x|) : 1 + c*|x| */
|
||||||
|
|
||||||
|
|
||||||
|
/* R+Rl = DiffX/Y */
|
||||||
|
divpd %xmm9, %xmm1 /* xmm1 = R = numerator / denominator */
|
||||||
|
pand .FLT_11(%rip), %xmm10 /* keep only the bits selected by the .FLT_11 mask in each lane: a byte offset into the table */
|
||||||
|
|
||||||
|
|
||||||
|
/* set table value to Pi/2 for large X */
|
||||||
|
movups Pi2+__svml_datan_data_internal_avx512(%rip), %xmm4
|
||||||
|
movd %xmm10, %eax /* eax = table offset for lane 0 */
|
||||||
|
andps %xmm2, %xmm4 /* xmm4 = Pi2 where large, else 0 */
|
||||||
|
pshufd $2, %xmm10, %xmm11 /* move dword 2 (low half of lane 1) down to dword 0 */
|
||||||
|
movaps %xmm2, %xmm10 /* xmm10 = large mask (xmm2 is reused for R^2 next) */
|
||||||
|
|
||||||
|
|
||||||
|
/* polynomial evaluation */
|
||||||
|
movaps %xmm1, %xmm2
|
||||||
|
mulpd %xmm1, %xmm2 /* xmm2 = R^2 */
|
||||||
|
movd %xmm11, %edx /* edx = table offset for lane 1 */
|
||||||
|
movups coeff+__svml_datan_data_internal_avx512(%rip), %xmm5 /* xmm5 = coeff[0] */
|
||||||
|
movaps %xmm2, %xmm7
|
||||||
|
movups coeff+32+__svml_datan_data_internal_avx512(%rip), %xmm6 /* xmm6 = coeff[2] */
|
||||||
|
movaps %xmm2, %xmm9
|
||||||
|
mulpd %xmm2, %xmm5 /* coeff[0]*R^2 */
|
||||||
|
mulpd %xmm2, %xmm7 /* xmm7 = R^4 */
|
||||||
|
addpd coeff+16+__svml_datan_data_internal_avx512(%rip), %xmm5 /* coeff[0]*R^2 + coeff[1] */
|
||||||
|
mulpd %xmm2, %xmm6 /* coeff[2]*R^2 */
|
||||||
|
mulpd %xmm7, %xmm5 /* (coeff[0]*R^2 + coeff[1]) * R^4 */
|
||||||
|
addpd coeff+48+__svml_datan_data_internal_avx512(%rip), %xmm6 /* coeff[2]*R^2 + coeff[3] */
|
||||||
|
mulpd %xmm1, %xmm9 /* xmm9 = R^3 */
|
||||||
|
addpd %xmm5, %xmm6 /* merge the two partial sums */
|
||||||
|
movups coeff+64+__svml_datan_data_internal_avx512(%rip), %xmm8 /* xmm8 = coeff[4] */
|
||||||
|
mulpd %xmm2, %xmm8 /* coeff[4]*R^2 */
|
||||||
|
mulpd %xmm6, %xmm7 /* R^4 * (higher-order part) */
|
||||||
|
addpd coeff+80+__svml_datan_data_internal_avx512(%rip), %xmm8 /* coeff[4]*R^2 + coeff[5] */
|
||||||
|
addpd %xmm7, %xmm8 /* xmm8 = Poly(R^2) */
|
||||||
|
mulpd %xmm8, %xmm9 /* xmm9 = R^3 * Poly(R^2) */
|
||||||
|
movups dIndexMed+__svml_datan_data_internal_avx512(%rip), %xmm14
|
||||||
|
cmplepd %xmm12, %xmm14 /* xmm14 = all-ones where dIndexMed <= |x| + Shifter (selects the entry 128 bytes further on) */
|
||||||
|
addpd %xmm9, %xmm1 /* xmm1 = R + R^3 * Poly(R^2) */
|
||||||
|
movslq %eax, %rax
|
||||||
|
movaps %xmm14, %xmm3
|
||||||
|
movslq %edx, %rdx
|
||||||
|
movsd -128(%rax,%rcx), %xmm13 /* lane 0: candidate from the first 128 bytes of Tbl_H */
|
||||||
|
movsd (%rcx,%rax), %xmm15 /* lane 0: candidate from the second 128 bytes of Tbl_H */
|
||||||
|
movhpd -128(%rdx,%rcx), %xmm13 /* lane 1: first-half candidate */
|
||||||
|
movhpd (%rcx,%rdx), %xmm15 /* lane 1: second-half candidate */
|
||||||
|
andnps %xmm13, %xmm3 /* keep first-half candidate where the dIndexMed mask is clear */
|
||||||
|
andps %xmm14, %xmm15 /* keep second-half candidate where the mask is set */
|
||||||
|
orps %xmm15, %xmm3 /* xmm3 = selected Tbl_H value per lane */
|
||||||
|
andnps %xmm3, %xmm10 /* drop the table value in large lanes */
|
||||||
|
orps %xmm4, %xmm10 /* xmm10 = large ? Pi2 : table value */
|
||||||
|
addpd %xmm1, %xmm10 /* add the polynomial part */
|
||||||
|
pxor %xmm10, %xmm0 /* re-apply the saved sign bit of x */
|
||||||
|
ret
|
||||||
|
|
||||||
|
END(_ZGVbN2v_atan_sse4)
|
||||||
|
|
||||||
|
.section .rodata, "a"
|
||||||
|
.align 16
|
||||||
|
|
||||||
|
#ifdef __svml_datan_data_internal_avx512_typedef
|
||||||
|
typedef unsigned int VUINT32;
|
||||||
|
typedef struct {
|
||||||
|
__declspec(align(16)) VUINT32 AbsMask[2][2];
|
||||||
|
__declspec(align(16)) VUINT32 Shifter[2][2];
|
||||||
|
__declspec(align(16)) VUINT32 MaxThreshold[2][2];
|
||||||
|
__declspec(align(16)) VUINT32 MOne[2][2];
|
||||||
|
__declspec(align(16)) VUINT32 One[2][2];
|
||||||
|
__declspec(align(16)) VUINT32 LargeX[2][2];
|
||||||
|
__declspec(align(16)) VUINT32 Zero[2][2];
|
||||||
|
__declspec(align(16)) VUINT32 Tbl_H[32][2];
|
||||||
|
__declspec(align(16)) VUINT32 Tbl_L[32][2];
|
||||||
|
__declspec(align(16)) VUINT32 dIndexMed[2][2];
|
||||||
|
__declspec(align(16)) VUINT32 Pi2[2][2];
|
||||||
|
__declspec(align(16)) VUINT32 Pi2_low[2][2];
|
||||||
|
__declspec(align(16)) VUINT32 coeff[6][2][2];
|
||||||
|
} __svml_datan_data_internal_avx512;
|
||||||
|
#endif
|
||||||
|
__svml_datan_data_internal_avx512:
|
||||||
|
/*== AbsMask ==*/
|
||||||
|
.quad 0x7fffffffffffffff, 0x7fffffffffffffff
|
||||||
|
/*== Shifter ==*/
|
||||||
|
.align 16
|
||||||
|
.quad 0x4318000000000000, 0x4318000000000000
|
||||||
|
/*== MaxThreshold ==*/
|
||||||
|
.align 16
|
||||||
|
.quad 0x401f800000000000, 0x401f800000000000
|
||||||
|
/*== MOne ==*/
|
||||||
|
.align 16
|
||||||
|
.quad 0xbff0000000000000, 0xbff0000000000000
|
||||||
|
/*== One ==*/
|
||||||
|
.align 16
|
||||||
|
.quad 0x3ff0000000000000, 0x3ff0000000000000
|
||||||
|
/*== LargeX ==*/
|
||||||
|
.align 16
|
||||||
|
.quad 0x47f0000000000000, 0x47f0000000000000
|
||||||
|
/*== Zero ==*/
|
||||||
|
.align 16
|
||||||
|
.quad 0x0000000000000000, 0x0000000000000000
|
||||||
|
/*== Tbl_H ==*/
|
||||||
|
.align 16
|
||||||
|
.quad 0x0000000000000000, 0x3fcf5b75f92c80dd
|
||||||
|
.quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1
|
||||||
|
.quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e
|
||||||
|
.quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
|
||||||
|
.quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
|
||||||
|
.quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
|
||||||
|
.quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0
|
||||||
|
.quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
|
||||||
|
.quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
|
||||||
|
.quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd
|
||||||
|
.quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89
|
||||||
|
.quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06
|
||||||
|
.quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053
|
||||||
|
.quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
|
||||||
|
.quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
|
||||||
|
.quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4
|
||||||
|
/*== Tbl_L ==*/
|
||||||
|
.align 16
|
||||||
|
.quad 0x0000000000000000, 0x3c68ab6e3cf7afbd
|
||||||
|
.quad 0x3c7a2b7f222f65e2, 0x3c72419a87f2a458
|
||||||
|
.quad 0x3c81a62633145c07, 0x3c80dae13ad18a6b
|
||||||
|
.quad 0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70
|
||||||
|
.quad 0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb
|
||||||
|
.quad 0x3c96254cb03bb199, 0xbc812c77e8a80f5c
|
||||||
|
.quad 0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4
|
||||||
|
.quad 0xbc93b03e8a27f555, 0x3c9934f9f2b0020e
|
||||||
|
.quad 0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b
|
||||||
|
.quad 0x3c78c2d0c89de218, 0x3c9f82bba194dd5d
|
||||||
|
.quad 0xbc831151a43b51ca, 0xbc8487d50bceb1a5
|
||||||
|
.quad 0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f
|
||||||
|
.quad 0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3
|
||||||
|
.quad 0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2
|
||||||
|
.quad 0xbc929c86447928e7, 0xbc8957a7170df016
|
||||||
|
.quad 0xbc7cbe1896221608, 0xbc9fda5797b32a0b
|
||||||
|
/*== dIndexMed ==*/
|
||||||
|
.align 16
|
||||||
|
.quad 0x4318000000000010, 0x4318000000000010
|
||||||
|
/*== Pi2 ==*/
|
||||||
|
.align 16
|
||||||
|
.quad 0x3ff921fb54442d18, 0x3ff921fb54442d18
|
||||||
|
/*== Pi2_low ==*/
|
||||||
|
.align 16
|
||||||
|
.quad 0x3c91a62633145c07, 0x3c91a62633145c07
|
||||||
|
/*== coeff6 ==*/
|
||||||
|
.align 16
|
||||||
|
.quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
|
||||||
|
.quad 0xbfb74257c46790cc, 0xbfb74257c46790cc
|
||||||
|
.quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
|
||||||
|
.quad 0xbfc249248eef04da, 0xbfc249248eef04da
|
||||||
|
.quad 0x3fc999999998741e, 0x3fc999999998741e
|
||||||
|
.quad 0xbfd555555555554d, 0xbfd555555555554d
|
||||||
|
.align 16
|
||||||
|
.type __svml_datan_data_internal_avx512,@object
|
||||||
|
.size __svml_datan_data_internal_avx512,.-__svml_datan_data_internal_avx512
|
||||||
|
.align 16
|
||||||
|
|
||||||
|
.FLT_11:
|
||||||
|
.long 0x00000078,0x00000000,0x00000078,0x00000000
|
||||||
|
.type .FLT_11,@object
|
||||||
|
.size .FLT_11,16
|
20
sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core-sse.S
Normal file
20
sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core-sse.S
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
/* SSE version of vectorized atan, vector length is 4.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#define _ZGVdN4v_atan _ZGVdN4v_atan_sse_wrapper
|
||||||
|
#include "../svml_d_atan4_core.S"
|
27
sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core.c
Normal file
27
sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core.c
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
/* Multiple versions of vectorized atan, vector length is 4.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#define SYMBOL_NAME _ZGVdN4v_atan /* Public symbol this file provides (4-lane double atan).  */
|
||||||
|
#include "ifunc-mathvec-avx2.h" /* NOTE(review): presumably supplies REDIRECT_NAME and IFUNC_SELECTOR used below - confirm in that header.  */
|
||||||
|
|
||||||
|
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); /* NOTE(review): macro defined outside this view; appears to bind SYMBOL_NAME to the variant chosen by IFUNC_SELECTOR.  */
|
||||||
|
|
||||||
|
#ifdef SHARED /* The hidden alias below is only emitted when SHARED is defined.  */
|
||||||
|
__hidden_ver1 (_ZGVdN4v_atan, __GI__ZGVdN4v_atan, __redirect__ZGVdN4v_atan) /* Relates the public name, its __GI_ name and the __redirect_ name.  */
|
||||||
|
__attribute__ ((visibility ("hidden"))); /* The resulting declaration gets hidden visibility.  */
|
||||||
|
#endif
|
225
sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core_avx2.S
Normal file
225
sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core_avx2.S
Normal file
@ -0,0 +1,225 @@
|
|||||||
|
/* Function atan vectorized with AVX2.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ALGORITHM DESCRIPTION:
|
||||||
|
*
|
||||||
|
* For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
|
||||||
|
* For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
|
||||||
|
* For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
|
||||||
|
* For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
|
||||||
|
* For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
|
||||||
|
* Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Offsets for data table __svml_datan_data_internal_avx512
|
||||||
|
*/
|
||||||
|
#define AbsMask 0
|
||||||
|
#define Shifter 32
|
||||||
|
#define MaxThreshold 64
|
||||||
|
#define MOne 96
|
||||||
|
#define One 128
|
||||||
|
#define LargeX 160
|
||||||
|
#define Zero 192
|
||||||
|
#define Tbl_H 224
|
||||||
|
#define Tbl_L 480
|
||||||
|
#define dIndexMed 736
|
||||||
|
#define Pi2 768
|
||||||
|
#define Pi2_low 800
|
||||||
|
#define coeff 832
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
|
||||||
|
.text
|
||||||
|
.section .text.avx2,"ax",@progbits
|
||||||
|
ENTRY(_ZGVdN4v_atan_avx2) /* In: ymm0 = four doubles x.  Out: ymm0 = table value + R + R^3*Poly(R^2), with the sign bit of x re-applied.  */
|
||||||
|
lea Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rdi /* rdi = address 128 bytes into Tbl_H, so -128(rdi) is the first entry */
|
||||||
|
vmovupd Shifter+__svml_datan_data_internal_avx512(%rip), %ymm4 /* ymm4 = Shifter */
|
||||||
|
vmovupd One+__svml_datan_data_internal_avx512(%rip), %ymm9 /* ymm9 = One (accumulator for 1 + c*|x|) */
|
||||||
|
|
||||||
|
|
||||||
|
/* saturate X range */
|
||||||
|
vmovupd LargeX+__svml_datan_data_internal_avx512(%rip), %ymm6
|
||||||
|
vandpd __svml_datan_data_internal_avx512(%rip), %ymm0, %ymm7 /* ymm7 = |x| (offset 0 of the table = AbsMask) */
|
||||||
|
vaddpd %ymm4, %ymm7, %ymm2 /* ymm2 = |x| + Shifter */
|
||||||
|
vcmpge_oqpd MaxThreshold+__svml_datan_data_internal_avx512(%rip), %ymm7, %ymm3 /* ymm3 = all-ones where |x| >= MaxThreshold ("large" mask) */
|
||||||
|
vminpd %ymm7, %ymm6, %ymm10 /* ymm10 = min(LargeX, |x|) */
|
||||||
|
vsubpd %ymm4, %ymm2, %ymm5 /* ymm5 = (|x| + Shifter) - Shifter = rounded |x| (call it c) */
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* table lookup sequence
|
||||||
|
* VPERMUTE not available
|
||||||
|
*/
|
||||||
|
vpsllq $3, %ymm2, %ymm13 /* shift the raw bits of |x| + Shifter left by 3 (multiply the integer pattern by 8) */
|
||||||
|
vsubpd %ymm5, %ymm7, %ymm8 /* ymm8 = DiffX = |x| - c */
|
||||||
|
vcmpge_oqpd dIndexMed+__svml_datan_data_internal_avx512(%rip), %ymm2, %ymm2 /* ymm2 = all-ones where |x| + Shifter >= dIndexMed (selects the entry 128 bytes further on) */
|
||||||
|
vfmadd231pd %ymm7, %ymm5, %ymm9 /* ymm9 = c*|x| + 1 */
|
||||||
|
vpand .FLT_11(%rip), %ymm13, %ymm14 /* keep only the bits selected by the .FLT_11 mask in each lane: a byte offset into the table */
|
||||||
|
vblendvpd %ymm3, MOne+__svml_datan_data_internal_avx512(%rip), %ymm8, %ymm11 /* numerator: large ? MOne : DiffX */
|
||||||
|
vblendvpd %ymm3, %ymm10, %ymm9, %ymm12 /* denominator: large ? min(LargeX, |x|) : 1 + c*|x| */
|
||||||
|
vxorpd %ymm0, %ymm7, %ymm1 /* ymm1 = x ^ |x| = sign bit of x only */
|
||||||
|
|
||||||
|
|
||||||
|
/* R+Rl = DiffX/Y */
|
||||||
|
vdivpd %ymm12, %ymm11, %ymm0 /* ymm0 = R = numerator / denominator */
|
||||||
|
vextractf128 $1, %ymm14, %xmm4 /* xmm4 = offsets for lanes 2 and 3 */
|
||||||
|
vmovd %xmm14, %eax /* eax = offset for lane 0 */
|
||||||
|
vmovd %xmm4, %ecx /* ecx = offset for lane 2 */
|
||||||
|
movslq %eax, %rax
|
||||||
|
vpextrd $2, %xmm14, %edx /* edx = offset for lane 1 */
|
||||||
|
movslq %ecx, %rcx
|
||||||
|
vpextrd $2, %xmm4, %esi /* esi = offset for lane 3 */
|
||||||
|
movslq %edx, %rdx
|
||||||
|
movslq %esi, %rsi
|
||||||
|
vmovsd -128(%rax,%rdi), %xmm15 /* lane 0: candidate from the first 128 bytes of Tbl_H */
|
||||||
|
vmovsd (%rdi,%rax), %xmm7 /* lane 0: candidate from the second 128 bytes of Tbl_H */
|
||||||
|
vmovsd -128(%rcx,%rdi), %xmm5 /* lane 2: first-half candidate */
|
||||||
|
vmovsd (%rdi,%rcx), %xmm9 /* lane 2: second-half candidate */
|
||||||
|
vmovhpd -128(%rdx,%rdi), %xmm15, %xmm15 /* lanes 0-1: first-half candidates */
|
||||||
|
vmovhpd (%rdi,%rdx), %xmm7, %xmm8 /* lanes 0-1: second-half candidates */
|
||||||
|
vmovhpd -128(%rsi,%rdi), %xmm5, %xmm6 /* lanes 2-3: first-half candidates */
|
||||||
|
vmovhpd (%rdi,%rsi), %xmm9, %xmm10 /* lanes 2-3: second-half candidates */
|
||||||
|
|
||||||
|
|
||||||
|
/* polynomial evaluation */
|
||||||
|
vmulpd %ymm0, %ymm0, %ymm5 /* ymm5 = R^2 */
|
||||||
|
vmulpd %ymm5, %ymm5, %ymm4 /* ymm4 = R^4 */
|
||||||
|
vinsertf128 $1, %xmm6, %ymm15, %ymm11 /* ymm11 = first-half candidates for all four lanes */
|
||||||
|
vinsertf128 $1, %xmm10, %ymm8, %ymm12 /* ymm12 = second-half candidates for all four lanes */
|
||||||
|
vblendvpd %ymm2, %ymm12, %ymm11, %ymm13 /* ymm13 = selected Tbl_H value per lane */
|
||||||
|
vmovupd coeff+__svml_datan_data_internal_avx512(%rip), %ymm8 /* ymm8 = coeff[0] */
|
||||||
|
vmovupd coeff+64+__svml_datan_data_internal_avx512(%rip), %ymm2 /* ymm2 = coeff[2] */
|
||||||
|
vmulpd %ymm5, %ymm0, %ymm6 /* ymm6 = R^3 */
|
||||||
|
vfmadd213pd coeff+32+__svml_datan_data_internal_avx512(%rip), %ymm5, %ymm8 /* ymm8 = coeff[0]*R^2 + coeff[1] */
|
||||||
|
vfmadd213pd coeff+96+__svml_datan_data_internal_avx512(%rip), %ymm5, %ymm2 /* ymm2 = coeff[2]*R^2 + coeff[3] */
|
||||||
|
|
||||||
|
|
||||||
|
/* set table value to Pi/2 for large X */
|
||||||
|
vblendvpd %ymm3, Pi2+__svml_datan_data_internal_avx512(%rip), %ymm13, %ymm7 /* ymm7 = large ? Pi2 : table value */
|
||||||
|
vmovupd coeff+128+__svml_datan_data_internal_avx512(%rip), %ymm3 /* ymm3 = coeff[4] (large mask no longer needed) */
|
||||||
|
vfmadd213pd %ymm2, %ymm4, %ymm8 /* ymm8 = ymm8*R^4 + ymm2 */
|
||||||
|
vfmadd213pd coeff+160+__svml_datan_data_internal_avx512(%rip), %ymm3, %ymm5 /* ymm5 = coeff[4]*R^2 + coeff[5] */
|
||||||
|
vfmadd213pd %ymm5, %ymm4, %ymm8 /* ymm8 = ymm8*R^4 + ymm5 = Poly(R^2) */
|
||||||
|
vfmadd213pd %ymm0, %ymm6, %ymm8 /* ymm8 = R^3*Poly(R^2) + R */
|
||||||
|
vaddpd %ymm8, %ymm7, %ymm0 /* add the table (or Pi2) term */
|
||||||
|
vxorpd %ymm1, %ymm0, %ymm0 /* re-apply the saved sign bit of x */
|
||||||
|
ret
|
||||||
|
|
||||||
|
END(_ZGVdN4v_atan_avx2)
|
||||||
|
|
||||||
|
.section .rodata, "a"
|
||||||
|
.align 32
|
||||||
|
|
||||||
|
.FLT_11:
|
||||||
|
.long 0x00000078,0x00000000,0x00000078,0x00000000,0x00000078,0x00000000,0x00000078,0x00000000
|
||||||
|
.type .FLT_11,@object
|
||||||
|
.size .FLT_11,32
|
||||||
|
.align 32
|
||||||
|
|
||||||
|
#ifdef __svml_datan_data_internal_avx512_typedef
|
||||||
|
typedef unsigned int VUINT32;
|
||||||
|
typedef struct {
|
||||||
|
__declspec(align(32)) VUINT32 AbsMask[4][2];
|
||||||
|
__declspec(align(32)) VUINT32 Shifter[4][2];
|
||||||
|
__declspec(align(32)) VUINT32 MaxThreshold[4][2];
|
||||||
|
__declspec(align(32)) VUINT32 MOne[4][2];
|
||||||
|
__declspec(align(32)) VUINT32 One[4][2];
|
||||||
|
__declspec(align(32)) VUINT32 LargeX[4][2];
|
||||||
|
__declspec(align(32)) VUINT32 Zero[4][2];
|
||||||
|
__declspec(align(32)) VUINT32 Tbl_H[32][2];
|
||||||
|
__declspec(align(32)) VUINT32 Tbl_L[32][2];
|
||||||
|
__declspec(align(32)) VUINT32 dIndexMed[4][2];
|
||||||
|
__declspec(align(32)) VUINT32 Pi2[4][2];
|
||||||
|
__declspec(align(32)) VUINT32 Pi2_low[4][2];
|
||||||
|
__declspec(align(32)) VUINT32 coeff[6][4][2];
|
||||||
|
} __svml_datan_data_internal_avx512;
|
||||||
|
#endif
|
||||||
|
__svml_datan_data_internal_avx512:
|
||||||
|
/*== AbsMask ==*/
|
||||||
|
.quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
|
||||||
|
/*== Shifter ==*/
|
||||||
|
.align 32
|
||||||
|
.quad 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000
|
||||||
|
/*== MaxThreshold ==*/
|
||||||
|
.align 32
|
||||||
|
.quad 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000
|
||||||
|
/*== MOne ==*/
|
||||||
|
.align 32
|
||||||
|
.quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
|
||||||
|
/*== One ==*/
|
||||||
|
.align 32
|
||||||
|
.quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
|
||||||
|
/*== LargeX ==*/
|
||||||
|
.align 32
|
||||||
|
.quad 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000
|
||||||
|
/*== Zero ==*/
|
||||||
|
.align 32
|
||||||
|
.quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
|
||||||
|
/*== Tbl_H ==*/
|
||||||
|
.align 32
|
||||||
|
.quad 0x0000000000000000, 0x3fcf5b75f92c80dd
|
||||||
|
.quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1
|
||||||
|
.quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e
|
||||||
|
.quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
|
||||||
|
.quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
|
||||||
|
.quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
|
||||||
|
.quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0
|
||||||
|
.quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
|
||||||
|
.quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
|
||||||
|
.quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd
|
||||||
|
.quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89
|
||||||
|
.quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06
|
||||||
|
.quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053
|
||||||
|
.quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
|
||||||
|
.quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
|
||||||
|
.quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4
|
||||||
|
/*== Tbl_L ==*/
|
||||||
|
.align 32
|
||||||
|
.quad 0x0000000000000000, 0x3c68ab6e3cf7afbd
|
||||||
|
.quad 0x3c7a2b7f222f65e2, 0x3c72419a87f2a458
|
||||||
|
.quad 0x3c81a62633145c07, 0x3c80dae13ad18a6b
|
||||||
|
.quad 0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70
|
||||||
|
.quad 0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb
|
||||||
|
.quad 0x3c96254cb03bb199, 0xbc812c77e8a80f5c
|
||||||
|
.quad 0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4
|
||||||
|
.quad 0xbc93b03e8a27f555, 0x3c9934f9f2b0020e
|
||||||
|
.quad 0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b
|
||||||
|
.quad 0x3c78c2d0c89de218, 0x3c9f82bba194dd5d
|
||||||
|
.quad 0xbc831151a43b51ca, 0xbc8487d50bceb1a5
|
||||||
|
.quad 0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f
|
||||||
|
.quad 0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3
|
||||||
|
.quad 0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2
|
||||||
|
.quad 0xbc929c86447928e7, 0xbc8957a7170df016
|
||||||
|
.quad 0xbc7cbe1896221608, 0xbc9fda5797b32a0b
|
||||||
|
/*== dIndexMed ==*/
|
||||||
|
.align 32
|
||||||
|
.quad 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010
|
||||||
|
/*== Pi2 ==*/
|
||||||
|
.align 32
|
||||||
|
.quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
|
||||||
|
/*== Pi2_low ==*/
|
||||||
|
.align 32
|
||||||
|
.quad 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07
|
||||||
|
/*== coeff6 ==*/
|
||||||
|
.align 32
|
||||||
|
.quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
|
||||||
|
.quad 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc
|
||||||
|
.quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
|
||||||
|
.quad 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da
|
||||||
|
.quad 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e
|
||||||
|
.quad 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d
|
||||||
|
.align 32
|
||||||
|
.type __svml_datan_data_internal_avx512,@object
|
||||||
|
.size __svml_datan_data_internal_avx512,.-__svml_datan_data_internal_avx512
|
20
sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core-avx2.S
Normal file
20
sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core-avx2.S
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
/* AVX2 version of vectorized atan, vector length is 8.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#define _ZGVeN8v_atan _ZGVeN8v_atan_avx2_wrapper
|
||||||
|
#include "../svml_d_atan8_core.S"
|
27
sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core.c
Normal file
27
sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core.c
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
/* Multiple versions of vectorized atan, vector length is 8.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#define SYMBOL_NAME _ZGVeN8v_atan
|
||||||
|
#include "ifunc-mathvec-avx512-skx.h"
|
||||||
|
|
||||||
|
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
|
||||||
|
|
||||||
|
#ifdef SHARED
|
||||||
|
__hidden_ver1 (_ZGVeN8v_atan, __GI__ZGVeN8v_atan, __redirect__ZGVeN8v_atan)
|
||||||
|
__attribute__ ((visibility ("hidden")));
|
||||||
|
#endif
|
213
sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core_avx512.S
Normal file
213
sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core_avx512.S
Normal file
@ -0,0 +1,213 @@
|
|||||||
|
/* Function atan vectorized with AVX-512.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ALGORITHM DESCRIPTION:
|
||||||
|
*
|
||||||
|
* For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
|
||||||
|
* For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
|
||||||
|
* For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
|
||||||
|
* For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
|
||||||
|
* For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
|
||||||
|
* Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Offsets for data table __svml_datan_data_internal_avx512
|
||||||
|
*/
|
||||||
|
#define AbsMask 0
|
||||||
|
#define Shifter 64
|
||||||
|
#define MaxThreshold 128
|
||||||
|
#define MOne 192
|
||||||
|
#define One 256
|
||||||
|
#define LargeX 320
|
||||||
|
#define Zero 384
|
||||||
|
#define Tbl_H 448
|
||||||
|
#define dIndexMed 704
|
||||||
|
#define Pi2 768
|
||||||
|
#define coeff_1 832
|
||||||
|
#define coeff_2 896
|
||||||
|
#define coeff_3 960
|
||||||
|
#define coeff_4 1024
|
||||||
|
#define coeff_5 1088
|
||||||
|
#define coeff_6 1152
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
|
||||||
|
.text
|
||||||
|
.section .text.evex512,"ax",@progbits
|
||||||
|
ENTRY(_ZGVeN8v_atan_skx)
|
||||||
|
vmovups Shifter+__svml_datan_data_internal_avx512(%rip), %zmm4
|
||||||
|
vmovups MaxThreshold+__svml_datan_data_internal_avx512(%rip), %zmm3
|
||||||
|
vmovups One+__svml_datan_data_internal_avx512(%rip), %zmm9
|
||||||
|
|
||||||
|
/* saturate X range */
|
||||||
|
vmovups LargeX+__svml_datan_data_internal_avx512(%rip), %zmm7
|
||||||
|
vandpd __svml_datan_data_internal_avx512(%rip), %zmm0, %zmm8
|
||||||
|
|
||||||
|
/* R+Rl = DiffX/Y */
|
||||||
|
vbroadcastsd .FLT_10(%rip), %zmm15
|
||||||
|
vaddpd {rn-sae}, %zmm4, %zmm8, %zmm2
|
||||||
|
vxorpd %zmm0, %zmm8, %zmm1
|
||||||
|
vcmppd $29, {sae}, %zmm3, %zmm8, %k2
|
||||||
|
|
||||||
|
/* round to 2 bits after binary point */
|
||||||
|
vreducepd $40, {sae}, %zmm8, %zmm6
|
||||||
|
vsubpd {rn-sae}, %zmm4, %zmm2, %zmm5
|
||||||
|
|
||||||
|
/*
|
||||||
|
* if|X|>=MaxThreshold, set DiffX=-1
|
||||||
|
* VMSUB(D, DiffX, LargeMask, Zero, One);
|
||||||
|
*/
|
||||||
|
vblendmpd MOne+__svml_datan_data_internal_avx512(%rip), %zmm6, %zmm10{%k2}
|
||||||
|
vfmadd231pd {rn-sae}, %zmm8, %zmm5, %zmm9
|
||||||
|
vmovups dIndexMed+__svml_datan_data_internal_avx512(%rip), %zmm5
|
||||||
|
|
||||||
|
/* table lookup sequence */
|
||||||
|
vmovups Tbl_H+__svml_datan_data_internal_avx512(%rip), %zmm6
|
||||||
|
vgetmantpd $0, {sae}, %zmm10, %zmm14
|
||||||
|
vgetexppd {sae}, %zmm10, %zmm11
|
||||||
|
vmovups coeff_5+__svml_datan_data_internal_avx512(%rip), %zmm10
|
||||||
|
|
||||||
|
/*
|
||||||
|
* if|X|>=MaxThreshold, set Y=X
|
||||||
|
* VMADD(D, Y, LargeMask, X, Zero);
|
||||||
|
*/
|
||||||
|
vminpd {sae}, %zmm8, %zmm7, %zmm9{%k2}
|
||||||
|
vcmppd $29, {sae}, %zmm5, %zmm2, %k1
|
||||||
|
vmovups Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %zmm7
|
||||||
|
vmovups coeff_1+__svml_datan_data_internal_avx512(%rip), %zmm8
|
||||||
|
vgetmantpd $0, {sae}, %zmm9, %zmm3
|
||||||
|
vgetexppd {sae}, %zmm9, %zmm12
|
||||||
|
vmovups coeff_3+__svml_datan_data_internal_avx512(%rip), %zmm9
|
||||||
|
vpermt2pd Tbl_H+64+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm6
|
||||||
|
vsubpd {rn-sae}, %zmm12, %zmm11, %zmm4
|
||||||
|
vpermt2pd Tbl_H+192+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm7
|
||||||
|
vrcp14pd %zmm3, %zmm13
|
||||||
|
vmovups coeff_4+__svml_datan_data_internal_avx512(%rip), %zmm12
|
||||||
|
vmovups coeff_6+__svml_datan_data_internal_avx512(%rip), %zmm11
|
||||||
|
vblendmpd %zmm7, %zmm6, %zmm2{%k1}
|
||||||
|
vmulpd {rn-sae}, %zmm13, %zmm14, %zmm0
|
||||||
|
vfnmadd231pd {rn-sae}, %zmm3, %zmm13, %zmm15
|
||||||
|
vfnmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm3
|
||||||
|
vfmadd213pd {rn-sae}, %zmm15, %zmm15, %zmm15
|
||||||
|
vfmadd213pd {rn-sae}, %zmm13, %zmm13, %zmm15
|
||||||
|
vfmadd213pd {rn-sae}, %zmm0, %zmm15, %zmm3
|
||||||
|
vscalefpd {rn-sae}, %zmm4, %zmm3, %zmm0
|
||||||
|
|
||||||
|
/* set table value to Pi/2 for large X */
|
||||||
|
vblendmpd Pi2+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm3{%k2}
|
||||||
|
vmovups coeff_2+__svml_datan_data_internal_avx512(%rip), %zmm2
|
||||||
|
|
||||||
|
/* polynomial evaluation */
|
||||||
|
vmulpd {rn-sae}, %zmm0, %zmm0, %zmm14
|
||||||
|
vmulpd {rn-sae}, %zmm14, %zmm14, %zmm13
|
||||||
|
vmulpd {rn-sae}, %zmm0, %zmm14, %zmm15
|
||||||
|
vfmadd231pd {rn-sae}, %zmm14, %zmm8, %zmm2
|
||||||
|
vfmadd231pd {rn-sae}, %zmm14, %zmm9, %zmm12
|
||||||
|
vfmadd213pd {rn-sae}, %zmm11, %zmm10, %zmm14
|
||||||
|
vfmadd213pd {rn-sae}, %zmm12, %zmm13, %zmm2
|
||||||
|
vfmadd213pd {rn-sae}, %zmm14, %zmm13, %zmm2
|
||||||
|
vfmadd213pd {rn-sae}, %zmm0, %zmm15, %zmm2
|
||||||
|
vaddpd {rn-sae}, %zmm3, %zmm2, %zmm0
|
||||||
|
vxorpd %zmm1, %zmm0, %zmm0
|
||||||
|
ret
|
||||||
|
|
||||||
|
END(_ZGVeN8v_atan_skx)
|
||||||
|
|
||||||
|
.section .rodata, "a"
|
||||||
|
.align 64
|
||||||
|
|
||||||
|
#ifdef __svml_datan_data_internal_avx512_typedef
|
||||||
|
typedef unsigned int VUINT32;
|
||||||
|
typedef struct {
|
||||||
|
__declspec(align(64)) VUINT32 AbsMask[8][2];
|
||||||
|
__declspec(align(64)) VUINT32 Shifter[8][2];
|
||||||
|
__declspec(align(64)) VUINT32 MaxThreshold[8][2];
|
||||||
|
__declspec(align(64)) VUINT32 MOne[8][2];
|
||||||
|
__declspec(align(64)) VUINT32 One[8][2];
|
||||||
|
__declspec(align(64)) VUINT32 LargeX[8][2];
|
||||||
|
__declspec(align(64)) VUINT32 Zero[8][2];
|
||||||
|
__declspec(align(64)) VUINT32 Tbl_H[32][2];
|
||||||
|
__declspec(align(64)) VUINT32 dIndexMed[8][2];
|
||||||
|
__declspec(align(64)) VUINT32 Pi2[8][2];
|
||||||
|
__declspec(align(64)) VUINT32 coeff[6][8][2];
|
||||||
|
} __svml_datan_data_internal_avx512;
|
||||||
|
#endif
|
||||||
|
__svml_datan_data_internal_avx512:
|
||||||
|
/*== AbsMask ==*/
|
||||||
|
.quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
|
||||||
|
/*== Shifter ==*/
|
||||||
|
.align 64
|
||||||
|
.quad 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000
|
||||||
|
/*== MaxThreshold ==*/
|
||||||
|
.align 64
|
||||||
|
.quad 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000
|
||||||
|
/*== MOne ==*/
|
||||||
|
.align 64
|
||||||
|
.quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
|
||||||
|
/*== One ==*/
|
||||||
|
.align 64
|
||||||
|
.quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
|
||||||
|
/*== LargeX ==*/
|
||||||
|
.align 64
|
||||||
|
.quad 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000
|
||||||
|
/*== Zero ==*/
|
||||||
|
.align 64
|
||||||
|
.quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
|
||||||
|
/*== Tbl_H ==*/
|
||||||
|
.align 64
|
||||||
|
.quad 0x0000000000000000, 0x3fcf5b75f92c80dd
|
||||||
|
.quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1
|
||||||
|
.quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e
|
||||||
|
.quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
|
||||||
|
.quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
|
||||||
|
.quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
|
||||||
|
.quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0
|
||||||
|
.quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
|
||||||
|
.quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
|
||||||
|
.quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd
|
||||||
|
.quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89
|
||||||
|
.quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06
|
||||||
|
.quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053
|
||||||
|
.quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
|
||||||
|
.quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
|
||||||
|
.quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4
|
||||||
|
/*== dIndexMed ==*/
|
||||||
|
.align 64
|
||||||
|
.quad 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010
|
||||||
|
/*== Pi2 ==*/
|
||||||
|
.align 64
|
||||||
|
.quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
|
||||||
|
/*== coeff_1 .. coeff_6: six coefficient rows of 64 bytes each ==*/
|
||||||
|
.align 64
|
||||||
|
.quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
|
||||||
|
.quad 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc
|
||||||
|
.quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
|
||||||
|
.quad 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da
|
||||||
|
.quad 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e
|
||||||
|
.quad 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d
|
||||||
|
.align 64
|
||||||
|
.type __svml_datan_data_internal_avx512,@object
|
||||||
|
.size __svml_datan_data_internal_avx512,.-__svml_datan_data_internal_avx512
|
||||||
|
.align 8
|
||||||
|
|
||||||
|
.FLT_10:
|
||||||
|
.long 0x00000000,0x3ff00000
|
||||||
|
.type .FLT_10,@object
|
||||||
|
.size .FLT_10,8
|
20
sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core-avx2.S
Normal file
20
sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core-avx2.S
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
/* AVX2 version of vectorized atanf, vector length is 16.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#define _ZGVeN16v_atanf _ZGVeN16v_atanf_avx2_wrapper
|
||||||
|
#include "../svml_s_atanf16_core.S"
|
28
sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core.c
Normal file
28
sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core.c
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
/* Multiple versions of vectorized atanf, vector length is 16.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#define SYMBOL_NAME _ZGVeN16v_atanf
|
||||||
|
#include "ifunc-mathvec-avx512-skx.h"
|
||||||
|
|
||||||
|
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
|
||||||
|
|
||||||
|
#ifdef SHARED
|
||||||
|
__hidden_ver1 (_ZGVeN16v_atanf, __GI__ZGVeN16v_atanf,
|
||||||
|
__redirect__ZGVeN16v_atanf)
|
||||||
|
__attribute__ ((visibility ("hidden")));
|
||||||
|
#endif
|
174
sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
Normal file
174
sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
Normal file
@ -0,0 +1,174 @@
|
|||||||
|
/* Function atanf vectorized with AVX-512.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ALGORITHM DESCRIPTION:
|
||||||
|
*
|
||||||
|
* For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
|
||||||
|
* For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
|
||||||
|
* For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
|
||||||
|
* For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
|
||||||
|
* For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
|
||||||
|
* Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Offsets for data table __svml_satan_data_internal_avx512
|
||||||
|
*/
|
||||||
|
#define AbsMask 0
|
||||||
|
#define Shifter 64
|
||||||
|
#define MaxThreshold 128
|
||||||
|
#define MOne 192
|
||||||
|
#define One 256
|
||||||
|
#define LargeX 320
|
||||||
|
#define Zero 384
|
||||||
|
#define Tbl_H 448
|
||||||
|
#define Pi2 576
|
||||||
|
#define coeff_1 640
|
||||||
|
#define coeff_2 704
|
||||||
|
#define coeff_3 768
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
|
||||||
|
.text
|
||||||
|
.section .text.evex512,"ax",@progbits
|
||||||
|
ENTRY(_ZGVeN16v_atanf_skx)
|
||||||
|
vandps __svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7
|
||||||
|
vmovups MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3
|
||||||
|
vmovups One+__svml_satan_data_internal_avx512(%rip), %zmm8
|
||||||
|
|
||||||
|
/* round to 2 bits after binary point */
|
||||||
|
vreduceps $40, {sae}, %zmm7, %zmm5
|
||||||
|
|
||||||
|
/* saturate X range */
|
||||||
|
vmovups LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6
|
||||||
|
vmovups Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2
|
||||||
|
vcmpps $29, {sae}, %zmm3, %zmm7, %k1
|
||||||
|
|
||||||
|
/* table lookup sequence */
|
||||||
|
vmovups Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3
|
||||||
|
vsubps {rn-sae}, %zmm5, %zmm7, %zmm4
|
||||||
|
vaddps {rn-sae}, %zmm2, %zmm7, %zmm1
|
||||||
|
vxorps %zmm0, %zmm7, %zmm0
|
||||||
|
vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
|
||||||
|
vmovups coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4
|
||||||
|
|
||||||
|
/* if|X|>=MaxThreshold, set DiffX=-1 */
|
||||||
|
vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
|
||||||
|
vmovups coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5
|
||||||
|
|
||||||
|
/* if|X|>=MaxThreshold, set Y=X */
|
||||||
|
vminps {sae}, %zmm7, %zmm6, %zmm8{%k1}
|
||||||
|
|
||||||
|
/* R+Rl = DiffX/Y */
|
||||||
|
vgetmantps $0, {sae}, %zmm9, %zmm12
|
||||||
|
vgetexpps {sae}, %zmm9, %zmm10
|
||||||
|
vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3
|
||||||
|
vgetmantps $0, {sae}, %zmm8, %zmm15
|
||||||
|
vgetexpps {sae}, %zmm8, %zmm11
|
||||||
|
vmovups coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1
|
||||||
|
|
||||||
|
/* set table value to Pi/2 for large X */
|
||||||
|
vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
|
||||||
|
vrcp14ps %zmm15, %zmm13
|
||||||
|
vsubps {rn-sae}, %zmm11, %zmm10, %zmm2
|
||||||
|
vmulps {rn-sae}, %zmm13, %zmm12, %zmm14
|
||||||
|
vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15
|
||||||
|
vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15
|
||||||
|
vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7
|
||||||
|
|
||||||
|
/* polynomial evaluation */
|
||||||
|
vmulps {rn-sae}, %zmm7, %zmm7, %zmm8
|
||||||
|
vmulps {rn-sae}, %zmm7, %zmm8, %zmm6
|
||||||
|
vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4
|
||||||
|
vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8
|
||||||
|
vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8
|
||||||
|
vaddps {rn-sae}, %zmm9, %zmm8, %zmm10
|
||||||
|
vxorps %zmm0, %zmm10, %zmm0
|
||||||
|
ret
|
||||||
|
|
||||||
|
END(_ZGVeN16v_atanf_skx)
|
||||||
|
|
||||||
|
.section .rodata, "a"
|
||||||
|
.align 64
|
||||||
|
|
||||||
|
#ifdef __svml_satan_data_internal_avx512_typedef
|
||||||
|
typedef unsigned int VUINT32;
|
||||||
|
typedef struct {
|
||||||
|
__declspec(align(64)) VUINT32 AbsMask[16][1];
|
||||||
|
__declspec(align(64)) VUINT32 Shifter[16][1];
|
||||||
|
__declspec(align(64)) VUINT32 MaxThreshold[16][1];
|
||||||
|
__declspec(align(64)) VUINT32 MOne[16][1];
|
||||||
|
__declspec(align(64)) VUINT32 One[16][1];
|
||||||
|
__declspec(align(64)) VUINT32 LargeX[16][1];
|
||||||
|
__declspec(align(64)) VUINT32 Zero[16][1];
|
||||||
|
__declspec(align(64)) VUINT32 Tbl_H[32][1];
|
||||||
|
__declspec(align(64)) VUINT32 Pi2[16][1];
|
||||||
|
__declspec(align(64)) VUINT32 coeff[3][16][1];
|
||||||
|
} __svml_satan_data_internal_avx512;
|
||||||
|
#endif
|
||||||
|
__svml_satan_data_internal_avx512:
|
||||||
|
/*== AbsMask ==*/
|
||||||
|
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
|
||||||
|
/*== Shifter ==*/
|
||||||
|
.align 64
|
||||||
|
.long 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
|
||||||
|
/*== MaxThreshold ==*/
|
||||||
|
.align 64
|
||||||
|
.long 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
|
||||||
|
/*== MOne ==*/
|
||||||
|
.align 64
|
||||||
|
.long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
|
||||||
|
/*== One ==*/
|
||||||
|
.align 64
|
||||||
|
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
|
||||||
|
/*== LargeX ==*/
|
||||||
|
.align 64
|
||||||
|
.long 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
|
||||||
|
/*== Zero ==*/
|
||||||
|
.align 64
|
||||||
|
.long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
|
||||||
|
/*== Tbl_H ==*/
|
||||||
|
.align 64
|
||||||
|
.long 0x00000000, 0x3e7adbb0
|
||||||
|
.long 0x3eed6338, 0x3f24bc7d
|
||||||
|
.long 0x3f490fdb, 0x3f6563e3
|
||||||
|
.long 0x3f7b985f, 0x3f869c79
|
||||||
|
.long 0x3f8db70d, 0x3f93877b
|
||||||
|
.long 0x3f985b6c, 0x3f9c6b53
|
||||||
|
.long 0x3f9fe0bb, 0x3fa2daa4
|
||||||
|
.long 0x3fa57088, 0x3fa7b46f
|
||||||
|
.long 0x3fa9b465, 0x3fab7b7a
|
||||||
|
.long 0x3fad1283, 0x3fae809e
|
||||||
|
.long 0x3fafcb99, 0x3fb0f836
|
||||||
|
.long 0x3fb20a6a, 0x3fb30581
|
||||||
|
.long 0x3fb3ec43, 0x3fb4c10a
|
||||||
|
.long 0x3fb585d7, 0x3fb63c64
|
||||||
|
.long 0x3fb6e62c, 0x3fb78478
|
||||||
|
.long 0x3fb81868, 0x3fb8a2f5
|
||||||
|
/*== Pi2 ==*/
|
||||||
|
.align 64
|
||||||
|
.long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
|
||||||
|
/*== coeff_1 .. coeff_3: three coefficient rows of 64 bytes each ==*/
|
||||||
|
.align 64
|
||||||
|
.long 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
|
||||||
|
.long 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
|
||||||
|
.long 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
|
||||||
|
.align 64
|
||||||
|
.type __svml_satan_data_internal_avx512,@object
|
||||||
|
.size __svml_satan_data_internal_avx512,.-__svml_satan_data_internal_avx512
|
20
sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core-sse2.S
Normal file
20
sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core-sse2.S
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
/* SSE2 version of vectorized atanf, vector length is 4.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#define _ZGVbN4v_atanf _ZGVbN4v_atanf_sse2
|
||||||
|
#include "../svml_s_atanf4_core.S"
|
28
sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core.c
Normal file
28
sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core.c
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
/* Multiple versions of vectorized atanf, vector length is 4.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#define SYMBOL_NAME _ZGVbN4v_atanf
|
||||||
|
#include "ifunc-mathvec-sse4_1.h"
|
||||||
|
|
||||||
|
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
|
||||||
|
|
||||||
|
#ifdef SHARED
|
||||||
|
__hidden_ver1 (_ZGVbN4v_atanf, __GI__ZGVbN4v_atanf,
|
||||||
|
__redirect__ZGVbN4v_atanf)
|
||||||
|
__attribute__ ((visibility ("hidden")));
|
||||||
|
#endif
|
164
sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S
Normal file
164
sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S
Normal file
@ -0,0 +1,164 @@
|
|||||||
|
/* Function atanf vectorized with SSE4.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ALGORITHM DESCRIPTION:
|
||||||
|
*
|
||||||
|
* For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
|
||||||
|
* For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
|
||||||
|
* For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
|
||||||
|
* For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
|
||||||
|
* For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
|
||||||
|
* Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/0.16.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Offsets for data table __svml_satan_data_internal
|
||||||
|
*/
|
||||||
|
#define _sSIGN_MASK 0
|
||||||
|
#define _sABS_MASK 16
|
||||||
|
#define _sONE 32
|
||||||
|
#define _sPIO2 48
|
||||||
|
#define _sPC8 64
|
||||||
|
#define _sPC7 80
|
||||||
|
#define _sPC6 96
|
||||||
|
#define _sPC5 112
|
||||||
|
#define _sPC4 128
|
||||||
|
#define _sPC3 144
|
||||||
|
#define _sPC2 160
|
||||||
|
#define _sPC1 176
|
||||||
|
#define _sPC0 192
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
|
||||||
|
.text
|
||||||
|
.section .text.sse4,"ax",@progbits
|
||||||
|
/* _ZGVbN4v_atanf_sse4: packed single-precision atan.  The argument is read
   from %xmm0 and the result is returned in %xmm0.
   Scheme implemented by the instructions below:
     r      = x                       in lanes where |x| <= 1
     r      = -1/x                    in the remaining lanes
     offset = sign(x) * 0             in lanes where |x| <= 1
     offset = sign(x) * _sPIO2        in the remaining lanes
     result = offset + r * P(r^2)
   with P(t) = _sPC0 + _sPC1*t + ... + _sPC8*t^8.  */
ENTRY(_ZGVbN4v_atanf_sse4)
|
||||||
|
/*
|
||||||
|
* To use minps\maxps operations for argument reduction
|
||||||
|
* uncomment _AT_USEMINMAX_ definition
|
||||||
|
* Declarations
|
||||||
|
* Variables
|
||||||
|
* Constants
|
||||||
|
*/
|
||||||
|
/* xmm2 <- _sABS_MASK; it becomes |x| after the andps further down.  */
movups _sABS_MASK+__svml_satan_data_internal(%rip), %xmm2
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 1) If x>1, then r=-1/x, PIO2=Pi/2
|
||||||
|
* 2) If -1<=x<=1, then r=x, PIO2=0
|
||||||
|
* 3) If x<-1, then r=-1/x, PIO2=-Pi/2
|
||||||
|
*/
|
||||||
|
/* xmm1 <- _sONE.  */
movups _sONE+__svml_satan_data_internal(%rip), %xmm1
|
||||||
|
/* xmm2 = |x|.  */
andps %xmm0, %xmm2
|
||||||
|
movaps %xmm2, %xmm9
|
||||||
|
movaps %xmm1, %xmm3
|
||||||
|
/* xmm9 = lane mask: all ones where |x| <= _sONE, zero otherwise.  */
cmpleps %xmm1, %xmm9
|
||||||
|
/* xmm3 = max(_sONE, |x|), xmm1 = min(_sONE, |x|).  */
maxps %xmm2, %xmm3
|
||||||
|
minps %xmm2, %xmm1
|
||||||
|
/* xmm1 = min / max: |x| where |x| <= 1, 1/|x| where |x| > 1.  */
divps %xmm3, %xmm1
|
||||||
|
/* xmm4 <- _sSIGN_MASK (table offset 0).  */
movups __svml_satan_data_internal(%rip), %xmm4
|
||||||
|
/* Keep a copy of the lane mask in xmm10 for the _sPIO2 selection below.  */
movaps %xmm9, %xmm10
|
||||||
|
/* xmm0 = sign bit of x (the original argument is no longer needed).  */
andps %xmm4, %xmm0
|
||||||
|
/* xmm9 = sign bit set only in lanes where the mask is clear.  */
andnps %xmm4, %xmm9
|
||||||
|
/* Fold in sign(x), then apply the combined sign to the quotient:
   xmm9 = r (x where the mask is set, -1/x where it is clear).  */
pxor %xmm0, %xmm9
|
||||||
|
pxor %xmm1, %xmm9
|
||||||
|
|
||||||
|
/* Polynomial. */
|
||||||
|
/* xmm8 = r^2, xmm7 = r^4.  */
movaps %xmm9, %xmm8
|
||||||
|
mulps %xmm9, %xmm8
|
||||||
|
movaps %xmm8, %xmm7
|
||||||
|
mulps %xmm8, %xmm7
|
||||||
|
/* Two interleaved Horner chains in r^4: xmm6 accumulates the
   even-numbered coefficients (PC8, PC6, PC4, PC2), xmm5 the odd-numbered
   ones (PC7, PC5, PC3, PC1).  */
movups _sPC8+__svml_satan_data_internal(%rip), %xmm6
|
||||||
|
mulps %xmm7, %xmm6
|
||||||
|
movups _sPC7+__svml_satan_data_internal(%rip), %xmm5
|
||||||
|
mulps %xmm7, %xmm5
|
||||||
|
addps _sPC6+__svml_satan_data_internal(%rip), %xmm6
|
||||||
|
mulps %xmm7, %xmm6
|
||||||
|
addps _sPC5+__svml_satan_data_internal(%rip), %xmm5
|
||||||
|
mulps %xmm7, %xmm5
|
||||||
|
addps _sPC4+__svml_satan_data_internal(%rip), %xmm6
|
||||||
|
mulps %xmm7, %xmm6
|
||||||
|
addps _sPC3+__svml_satan_data_internal(%rip), %xmm5
|
||||||
|
/* Last use of r^4: from here on xmm7 carries the odd chain.  */
mulps %xmm5, %xmm7
|
||||||
|
addps _sPC2+__svml_satan_data_internal(%rip), %xmm6
|
||||||
|
/* The even chain is scaled by r^2 before it is merged with the odd one.  */
mulps %xmm8, %xmm6
|
||||||
|
addps _sPC1+__svml_satan_data_internal(%rip), %xmm7
|
||||||
|
/* xmm10 = _sPIO2 in lanes where the mask is clear, 0 elsewhere.  */
andnps _sPIO2+__svml_satan_data_internal(%rip), %xmm10
|
||||||
|
/* xmm7 = PC1 + PC2*r^2 + PC3*r^4 + ... + PC8*r^14.  */
addps %xmm6, %xmm7
|
||||||
|
mulps %xmm7, %xmm8
|
||||||
|
/* Give the offset the sign of x.  */
pxor %xmm0, %xmm10
|
||||||
|
/* xmm8 = P(r^2) = PC0 + PC1*r^2 + ... + PC8*r^16.  */
addps _sPC0+__svml_satan_data_internal(%rip), %xmm8
|
||||||
|
|
||||||
|
/* Reconstruction. */
|
||||||
|
/* result = offset + r * P(r^2), moved into xmm0 for the return.  */
mulps %xmm8, %xmm9
|
||||||
|
addps %xmm9, %xmm10
|
||||||
|
movaps %xmm10, %xmm0
|
||||||
|
ret
|
||||||
|
|
||||||
|
END(_ZGVbN4v_atanf_sse4)
|
||||||
|
|
||||||
|
.section .rodata, "a"
|
||||||
|
.align 16
|
||||||
|
|
||||||
|
#ifdef __svml_satan_data_internal_typedef
|
||||||
|
typedef unsigned int VUINT32;
|
||||||
|
typedef struct {
|
||||||
|
__declspec(align(16)) VUINT32 _sSIGN_MASK[4][1];
|
||||||
|
__declspec(align(16)) VUINT32 _sABS_MASK[4][1];
|
||||||
|
__declspec(align(16)) VUINT32 _sONE[4][1];
|
||||||
|
__declspec(align(16)) VUINT32 _sPIO2[4][1];
|
||||||
|
__declspec(align(16)) VUINT32 _sPC8[4][1];
|
||||||
|
__declspec(align(16)) VUINT32 _sPC7[4][1];
|
||||||
|
__declspec(align(16)) VUINT32 _sPC6[4][1];
|
||||||
|
__declspec(align(16)) VUINT32 _sPC5[4][1];
|
||||||
|
__declspec(align(16)) VUINT32 _sPC4[4][1];
|
||||||
|
__declspec(align(16)) VUINT32 _sPC3[4][1];
|
||||||
|
__declspec(align(16)) VUINT32 _sPC2[4][1];
|
||||||
|
__declspec(align(16)) VUINT32 _sPC1[4][1];
|
||||||
|
__declspec(align(16)) VUINT32 _sPC0[4][1];
|
||||||
|
} __svml_satan_data_internal;
|
||||||
|
#endif
|
||||||
|
__svml_satan_data_internal:
|
||||||
|
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000 //_sSIGN_MASK
|
||||||
|
.align 16
|
||||||
|
.long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF //_sABS_MASK
|
||||||
|
.align 16
|
||||||
|
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 //_sONE
|
||||||
|
.align 16
|
||||||
|
.long 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB //_sPIO2
|
||||||
|
.align 16
|
||||||
|
.long 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 //_sPC8
|
||||||
|
.align 16
|
||||||
|
.long 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 //_sPC7
|
||||||
|
.align 16
|
||||||
|
.long 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 //_sPC6
|
||||||
|
.align 16
|
||||||
|
.long 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 //_sPC5
|
||||||
|
.align 16
|
||||||
|
.long 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 //_sPC4
|
||||||
|
.align 16
|
||||||
|
.long 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 //_sPC3
|
||||||
|
.align 16
|
||||||
|
.long 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F //_sPC2
|
||||||
|
.align 16
|
||||||
|
.long 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 //_sPC1
|
||||||
|
.align 16
|
||||||
|
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 //_sPC0
|
||||||
|
.align 16
|
||||||
|
.type __svml_satan_data_internal,@object
|
||||||
|
.size __svml_satan_data_internal,.-__svml_satan_data_internal
|
20
sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core-sse.S
Normal file
20
sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core-sse.S
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
/* SSE version of vectorized atanf, vector length is 8.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#define _ZGVdN8v_atanf _ZGVdN8v_atanf_sse_wrapper
|
||||||
|
#include "../svml_s_atanf8_core.S"
|
28
sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core.c
Normal file
28
sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core.c
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
/* Multiple versions of vectorized atanf, vector length is 8.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
/* SYMBOL_NAME is defined before the include below.  REDIRECT_NAME and
   IFUNC_SELECTOR are not defined in this file; presumably they come from
   ifunc-mathvec-avx2.h and are derived from SYMBOL_NAME -- TODO confirm.  */
#define SYMBOL_NAME _ZGVdN8v_atanf
|
||||||
|
#include "ifunc-mathvec-avx2.h"
|
||||||
|
|
||||||
|
/* Bind _ZGVdN8v_atanf to whatever IFUNC_SELECTOR () returns.
   NOTE(review): presumably this picks between the _sse_wrapper and _avx2
   variants based on CPU features -- confirm against the selector header.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
|
||||||
|
|
||||||
|
/* Shared builds only: hidden-visibility declaration tying
   __GI__ZGVdN8v_atanf to the redirected symbol.  NOTE(review): exact alias
   semantics are defined by __hidden_ver1, which is not visible here.  */
#ifdef SHARED
|
||||||
|
__hidden_ver1 (_ZGVdN8v_atanf, __GI__ZGVdN8v_atanf,
|
||||||
|
__redirect__ZGVdN8v_atanf)
|
||||||
|
__attribute__ ((visibility ("hidden")));
|
||||||
|
#endif
|
148
sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core_avx2.S
Normal file
148
sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core_avx2.S
Normal file
@ -0,0 +1,148 @@
|
|||||||
|
/* Function atanf vectorized with AVX2.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ALGORITHM DESCRIPTION:
|
||||||
|
*
|
||||||
|
* For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
|
||||||
|
* For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
|
||||||
|
* For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
|
||||||
|
* For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
|
||||||
|
* For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
|
||||||
|
* Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Offsets for data table __svml_satan_data_internal
|
||||||
|
*/
|
||||||
|
#define _sSIGN_MASK 0
|
||||||
|
#define _sABS_MASK 32
|
||||||
|
#define _sONE 64
|
||||||
|
#define _sPIO2 96
|
||||||
|
#define _sPC8 128
|
||||||
|
#define _sPC7 160
|
||||||
|
#define _sPC6 192
|
||||||
|
#define _sPC5 224
|
||||||
|
#define _sPC4 256
|
||||||
|
#define _sPC3 288
|
||||||
|
#define _sPC2 320
|
||||||
|
#define _sPC1 352
|
||||||
|
#define _sPC0 384
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
|
||||||
|
.text
|
||||||
|
.section .text.avx2,"ax",@progbits
|
||||||
|
/* _ZGVdN8v_atanf_avx2: packed single-precision atan.  The argument is read
   from %ymm0 and the result is returned in %ymm0.
   Scheme implemented by the instructions below:
     r      = x                       in lanes where |x| <= 1
     r      = -1/x                    in the remaining lanes
     offset = sign(x) * 0             in lanes where |x| <= 1
     offset = sign(x) * _sPIO2        in the remaining lanes
     result = offset + r * P(r^2)
   with P(t) = _sPC0 + _sPC1*t + ... + _sPC8*t^8, evaluated with fused
   multiply-adds.  */
ENTRY(_ZGVdN8v_atanf_avx2)
|
||||||
|
/*
|
||||||
|
* 1) If x>1, then r=-1/x, PIO2=Pi/2
|
||||||
|
* 2) If -1<=x<=1, then r=x, PIO2=0
|
||||||
|
* 3) If x<-1, then r=-1/x, PIO2=-Pi/2
|
||||||
|
*/
|
||||||
|
/* ymm2 <- _sONE, ymm7 <- _sSIGN_MASK (table offset 0), ymm13 <- _sPC7
   (seed of the odd-coefficient chain used in the polynomial).  */
vmovups _sONE+__svml_satan_data_internal(%rip), %ymm2
|
||||||
|
vmovups __svml_satan_data_internal(%rip), %ymm7
|
||||||
|
vmovups _sPC7+__svml_satan_data_internal(%rip), %ymm13
|
||||||
|
|
||||||
|
/*
|
||||||
|
* To use minps\maxps operations for argument reduction
|
||||||
|
* uncomment _AT_USEMINMAX_ definition
|
||||||
|
* Declarations
|
||||||
|
* Variables
|
||||||
|
* Constants
|
||||||
|
*/
|
||||||
|
/* ymm3 = |x|.  */
vandps _sABS_MASK+__svml_satan_data_internal(%rip), %ymm0, %ymm3
|
||||||
|
/* ymm5 = max(_sONE, |x|), ymm4 = min(_sONE, |x|).  */
vmaxps %ymm3, %ymm2, %ymm5
|
||||||
|
vminps %ymm3, %ymm2, %ymm4
|
||||||
|
/* ymm6 = lane mask: all ones where |x| <= _sONE, zero otherwise.  */
vcmple_oqps %ymm2, %ymm3, %ymm6
|
||||||
|
/* ymm11 = min / max: |x| where |x| <= 1, 1/|x| where |x| > 1.  */
vdivps %ymm5, %ymm4, %ymm11
|
||||||
|
/* ymm9 = sign bit of x.  */
vandps %ymm7, %ymm0, %ymm9
|
||||||
|
/* ymm8 = sign bit set only in lanes where the mask is clear.  */
vandnps %ymm7, %ymm6, %ymm8
|
||||||
|
/* Fold in sign(x), then apply the combined sign to the quotient:
   ymm15 = r (x where the mask is set, -1/x where it is clear).  */
vxorps %ymm9, %ymm8, %ymm10
|
||||||
|
vxorps %ymm11, %ymm10, %ymm15
|
||||||
|
|
||||||
|
/* Polynomial. */
|
||||||
|
/* ymm14 = r^2.  */
vmulps %ymm15, %ymm15, %ymm14
|
||||||
|
/* ymm0 <- _sPC8 (seed of the even-coefficient chain); the input value in
   ymm0 is no longer needed at this point.  */
vmovups _sPC8+__svml_satan_data_internal(%rip), %ymm0
|
||||||
|
/* ymm12 = r^4.  */
vmulps %ymm14, %ymm14, %ymm12
|
||||||
|
/* Two interleaved Horner chains in r^4, each step being acc = acc*r^4 + c:
   ymm0 runs PC8, PC6, PC4, PC2 and ymm13 runs PC7, PC5, PC3, PC1.  */
vfmadd213ps _sPC6+__svml_satan_data_internal(%rip), %ymm12, %ymm0
|
||||||
|
vfmadd213ps _sPC5+__svml_satan_data_internal(%rip), %ymm12, %ymm13
|
||||||
|
vfmadd213ps _sPC4+__svml_satan_data_internal(%rip), %ymm12, %ymm0
|
||||||
|
vfmadd213ps _sPC3+__svml_satan_data_internal(%rip), %ymm12, %ymm13
|
||||||
|
vfmadd213ps _sPC2+__svml_satan_data_internal(%rip), %ymm12, %ymm0
|
||||||
|
vfmadd213ps _sPC1+__svml_satan_data_internal(%rip), %ymm12, %ymm13
|
||||||
|
/* Merge the chains: ymm0 = even*r^2 + odd
   = PC1 + PC2*r^2 + PC3*r^4 + ... + PC8*r^14.  */
vfmadd213ps %ymm13, %ymm14, %ymm0
|
||||||
|
/* ymm0 = P(r^2) = PC0 + PC1*r^2 + ... + PC8*r^16.  */
vfmadd213ps _sPC0+__svml_satan_data_internal(%rip), %ymm14, %ymm0
|
||||||
|
/* ymm1 = _sPIO2 in lanes where the mask is clear, 0 elsewhere; then give
   it the sign of x.  */
vandnps _sPIO2+__svml_satan_data_internal(%rip), %ymm6, %ymm1
|
||||||
|
vxorps %ymm9, %ymm1, %ymm1
|
||||||
|
|
||||||
|
/* Reconstruction. */
|
||||||
|
/* result = r * P(r^2) + offset, produced directly in ymm0.  */
vfmadd213ps %ymm1, %ymm15, %ymm0
|
||||||
|
ret
|
||||||
|
|
||||||
|
END(_ZGVdN8v_atanf_avx2)
|
||||||
|
|
||||||
|
.section .rodata, "a"
|
||||||
|
.align 32
|
||||||
|
|
||||||
|
#ifdef __svml_satan_data_internal_typedef
|
||||||
|
typedef unsigned int VUINT32;
|
||||||
|
typedef struct {
|
||||||
|
__declspec(align(32)) VUINT32 _sSIGN_MASK[8][1];
|
||||||
|
__declspec(align(32)) VUINT32 _sABS_MASK[8][1];
|
||||||
|
__declspec(align(32)) VUINT32 _sONE[8][1];
|
||||||
|
__declspec(align(32)) VUINT32 _sPIO2[8][1];
|
||||||
|
__declspec(align(32)) VUINT32 _sPC8[8][1];
|
||||||
|
__declspec(align(32)) VUINT32 _sPC7[8][1];
|
||||||
|
__declspec(align(32)) VUINT32 _sPC6[8][1];
|
||||||
|
__declspec(align(32)) VUINT32 _sPC5[8][1];
|
||||||
|
__declspec(align(32)) VUINT32 _sPC4[8][1];
|
||||||
|
__declspec(align(32)) VUINT32 _sPC3[8][1];
|
||||||
|
__declspec(align(32)) VUINT32 _sPC2[8][1];
|
||||||
|
__declspec(align(32)) VUINT32 _sPC1[8][1];
|
||||||
|
__declspec(align(32)) VUINT32 _sPC0[8][1];
|
||||||
|
} __svml_satan_data_internal;
|
||||||
|
#endif
|
||||||
|
__svml_satan_data_internal:
|
||||||
|
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 //_sSIGN_MASK
|
||||||
|
.align 32
|
||||||
|
.long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF //_sABS_MASK
|
||||||
|
.align 32
|
||||||
|
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 //_sONE
|
||||||
|
.align 32
|
||||||
|
.long 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB //_sPIO2
|
||||||
|
.align 32
|
||||||
|
.long 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 //_sPC8
|
||||||
|
.align 32
|
||||||
|
.long 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 //_sPC7
|
||||||
|
.align 32
|
||||||
|
.long 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 //_sPC6
|
||||||
|
.align 32
|
||||||
|
.long 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 //_sPC5
|
||||||
|
.align 32
|
||||||
|
.long 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 //_sPC4
|
||||||
|
.align 32
|
||||||
|
.long 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 //_sPC3
|
||||||
|
.align 32
|
||||||
|
.long 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F //_sPC2
|
||||||
|
.align 32
|
||||||
|
.long 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 //_sPC1
|
||||||
|
.align 32
|
||||||
|
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 //_sPC0
|
||||||
|
.align 32
|
||||||
|
.type __svml_satan_data_internal,@object
|
||||||
|
.size __svml_satan_data_internal,.-__svml_satan_data_internal
|
29
sysdeps/x86_64/fpu/svml_d_atan2_core.S
Normal file
29
sysdeps/x86_64/fpu/svml_d_atan2_core.S
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
/* Function atan vectorized with SSE2, wrapper version.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
#include "svml_d_wrapper_impl.h"
|
||||||
|
|
||||||
|
.text
|
||||||
|
ENTRY (_ZGVbN2v_atan)
|
||||||
|
WRAPPER_IMPL_SSE2 atan
|
||||||
|
END (_ZGVbN2v_atan)
|
||||||
|
|
||||||
|
#ifndef USE_MULTIARCH
|
||||||
|
libmvec_hidden_def (_ZGVbN2v_atan)
|
||||||
|
#endif
|
29
sysdeps/x86_64/fpu/svml_d_atan4_core.S
Normal file
29
sysdeps/x86_64/fpu/svml_d_atan4_core.S
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
/* Function atan vectorized with AVX2, wrapper version.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
#include "svml_d_wrapper_impl.h"
|
||||||
|
|
||||||
|
.text
|
||||||
|
ENTRY (_ZGVdN4v_atan)
|
||||||
|
WRAPPER_IMPL_AVX _ZGVbN2v_atan
|
||||||
|
END (_ZGVdN4v_atan)
|
||||||
|
|
||||||
|
#ifndef USE_MULTIARCH
|
||||||
|
libmvec_hidden_def (_ZGVdN4v_atan)
|
||||||
|
#endif
|
25
sysdeps/x86_64/fpu/svml_d_atan4_core_avx.S
Normal file
25
sysdeps/x86_64/fpu/svml_d_atan4_core_avx.S
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
/* Function atan vectorized in AVX ISA as wrapper to SSE4 ISA version.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
#include "svml_d_wrapper_impl.h"
|
||||||
|
|
||||||
|
.text
|
||||||
|
ENTRY (_ZGVcN4v_atan)
|
||||||
|
WRAPPER_IMPL_AVX _ZGVbN2v_atan
|
||||||
|
END (_ZGVcN4v_atan)
|
25
sysdeps/x86_64/fpu/svml_d_atan8_core.S
Normal file
25
sysdeps/x86_64/fpu/svml_d_atan8_core.S
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
/* Function atan vectorized with AVX-512, wrapper to AVX2.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
#include "svml_d_wrapper_impl.h"
|
||||||
|
|
||||||
|
.text
|
||||||
|
ENTRY (_ZGVeN8v_atan)
|
||||||
|
WRAPPER_IMPL_AVX512 _ZGVdN4v_atan
|
||||||
|
END (_ZGVeN8v_atan)
|
25
sysdeps/x86_64/fpu/svml_s_atanf16_core.S
Normal file
25
sysdeps/x86_64/fpu/svml_s_atanf16_core.S
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
/* Function atanf vectorized with AVX-512. Wrapper to AVX2 version.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
#include "svml_s_wrapper_impl.h"
|
||||||
|
|
||||||
|
.text
|
||||||
|
ENTRY (_ZGVeN16v_atanf)
|
||||||
|
WRAPPER_IMPL_AVX512 _ZGVdN8v_atanf
|
||||||
|
END (_ZGVeN16v_atanf)
|
29
sysdeps/x86_64/fpu/svml_s_atanf4_core.S
Normal file
29
sysdeps/x86_64/fpu/svml_s_atanf4_core.S
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
/* Function atanf vectorized with SSE2, wrapper version.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
#include "svml_s_wrapper_impl.h"
|
||||||
|
|
||||||
|
.text
|
||||||
|
ENTRY (_ZGVbN4v_atanf)
|
||||||
|
WRAPPER_IMPL_SSE2 atanf
|
||||||
|
END (_ZGVbN4v_atanf)
|
||||||
|
|
||||||
|
#ifndef USE_MULTIARCH
|
||||||
|
libmvec_hidden_def (_ZGVbN4v_atanf)
|
||||||
|
#endif
|
29
sysdeps/x86_64/fpu/svml_s_atanf8_core.S
Normal file
29
sysdeps/x86_64/fpu/svml_s_atanf8_core.S
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
/* Function atanf vectorized with AVX2, wrapper version.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
#include "svml_s_wrapper_impl.h"
|
||||||
|
|
||||||
|
.text
|
||||||
|
ENTRY (_ZGVdN8v_atanf)
|
||||||
|
WRAPPER_IMPL_AVX _ZGVbN4v_atanf
|
||||||
|
END (_ZGVdN8v_atanf)
|
||||||
|
|
||||||
|
#ifndef USE_MULTIARCH
|
||||||
|
libmvec_hidden_def (_ZGVdN8v_atanf)
|
||||||
|
#endif
|
25
sysdeps/x86_64/fpu/svml_s_atanf8_core_avx.S
Normal file
25
sysdeps/x86_64/fpu/svml_s_atanf8_core_avx.S
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
/* Function atanf vectorized in AVX ISA as wrapper to SSE4 ISA version.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
#include "svml_s_wrapper_impl.h"
|
||||||
|
|
||||||
|
.text
|
||||||
|
ENTRY (_ZGVcN8v_atanf)
|
||||||
|
WRAPPER_IMPL_AVX _ZGVbN4v_atanf
|
||||||
|
END (_ZGVcN8v_atanf)
|
1
sysdeps/x86_64/fpu/test-double-libmvec-atan-avx.c
Normal file
1
sysdeps/x86_64/fpu/test-double-libmvec-atan-avx.c
Normal file
@ -0,0 +1 @@
|
|||||||
|
#include "test-double-libmvec-atan.c"
|
1
sysdeps/x86_64/fpu/test-double-libmvec-atan-avx2.c
Normal file
1
sysdeps/x86_64/fpu/test-double-libmvec-atan-avx2.c
Normal file
@ -0,0 +1 @@
|
|||||||
|
#include "test-double-libmvec-atan.c"
|
1
sysdeps/x86_64/fpu/test-double-libmvec-atan-avx512f.c
Normal file
1
sysdeps/x86_64/fpu/test-double-libmvec-atan-avx512f.c
Normal file
@ -0,0 +1 @@
|
|||||||
|
#include "test-double-libmvec-atan.c"
|
3
sysdeps/x86_64/fpu/test-double-libmvec-atan.c
Normal file
3
sysdeps/x86_64/fpu/test-double-libmvec-atan.c
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
#define LIBMVEC_TYPE double
|
||||||
|
#define LIBMVEC_FUNC atan
|
||||||
|
#include "test-vector-abi-arg1.h"
|
@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
|
|||||||
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
|
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
|
||||||
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow)
|
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow)
|
||||||
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVbN2v_acos)
|
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVbN2v_acos)
|
||||||
|
VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVbN2v_atan)
|
||||||
|
|
||||||
#define VEC_INT_TYPE __m128i
|
#define VEC_INT_TYPE __m128i
|
||||||
|
|
||||||
|
@ -31,6 +31,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log)
|
|||||||
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp)
|
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp)
|
||||||
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow)
|
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow)
|
||||||
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVdN4v_acos)
|
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVdN4v_acos)
|
||||||
|
VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVdN4v_atan)
|
||||||
|
|
||||||
#ifndef __ILP32__
|
#ifndef __ILP32__
|
||||||
# define VEC_INT_TYPE __m256i
|
# define VEC_INT_TYPE __m256i
|
||||||
|
@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log)
|
|||||||
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp)
|
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp)
|
||||||
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow)
|
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow)
|
||||||
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVcN4v_acos)
|
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVcN4v_acos)
|
||||||
|
VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVcN4v_atan)
|
||||||
|
|
||||||
#define VEC_INT_TYPE __m128i
|
#define VEC_INT_TYPE __m128i
|
||||||
|
|
||||||
|
@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log)
|
|||||||
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp)
|
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp)
|
||||||
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow)
|
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow)
|
||||||
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVeN8v_acos)
|
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVeN8v_acos)
|
||||||
|
VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVeN8v_atan)
|
||||||
|
|
||||||
#ifndef __ILP32__
|
#ifndef __ILP32__
|
||||||
# define VEC_INT_TYPE __m512i
|
# define VEC_INT_TYPE __m512i
|
||||||
|
1
sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx.c
Normal file
1
sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx.c
Normal file
@ -0,0 +1 @@
|
|||||||
|
#include "test-float-libmvec-atanf.c"
|
1
sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx2.c
Normal file
1
sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx2.c
Normal file
@ -0,0 +1 @@
|
|||||||
|
#include "test-float-libmvec-atanf.c"
|
1
sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx512f.c
Normal file
1
sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx512f.c
Normal file
@ -0,0 +1 @@
|
|||||||
|
#include "test-float-libmvec-atanf.c"
|
3
sysdeps/x86_64/fpu/test-float-libmvec-atanf.c
Normal file
3
sysdeps/x86_64/fpu/test-float-libmvec-atanf.c
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
#define LIBMVEC_TYPE float
|
||||||
|
#define LIBMVEC_FUNC atanf
|
||||||
|
#include "test-vector-abi-arg1.h"
|
@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf)
|
|||||||
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf)
|
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf)
|
||||||
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf)
|
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf)
|
||||||
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVeN16v_acosf)
|
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVeN16v_acosf)
|
||||||
|
VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVeN16v_atanf)
|
||||||
|
|
||||||
#define VEC_INT_TYPE __m512i
|
#define VEC_INT_TYPE __m512i
|
||||||
|
|
||||||
|
@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf)
|
|||||||
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
|
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
|
||||||
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf)
|
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf)
|
||||||
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVbN4v_acosf)
|
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVbN4v_acosf)
|
||||||
|
VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVbN4v_atanf)
|
||||||
|
|
||||||
#define VEC_INT_TYPE __m128i
|
#define VEC_INT_TYPE __m128i
|
||||||
|
|
||||||
|
@ -31,6 +31,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf)
|
|||||||
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf)
|
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf)
|
||||||
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf)
|
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf)
|
||||||
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVdN8v_acosf)
|
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVdN8v_acosf)
|
||||||
|
VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVdN8v_atanf)
|
||||||
|
|
||||||
/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */
|
/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */
|
||||||
#undef VECTOR_WRAPPER_fFF
|
#undef VECTOR_WRAPPER_fFF
|
||||||
|
@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf)
|
|||||||
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf)
|
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf)
|
||||||
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf)
|
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf)
|
||||||
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVcN8v_acosf)
|
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVcN8v_acosf)
|
||||||
|
VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVcN8v_atanf)
|
||||||
|
|
||||||
#define VEC_INT_TYPE __m128i
|
#define VEC_INT_TYPE __m128i
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user