x86-64: Add vector tanh/tanhf implementation to libmvec

Implement vectorized tanh/tanhf containing SSE, AVX, AVX2 and
AVX512 versions for libmvec as per vector ABI.  It also contains
accuracy and ABI tests for vector tanh/tanhf with regenerated ulps.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
Sunil K Pandey 2021-12-29 10:00:47 -08:00
parent f9ce13fdac
commit c0f36fc303
50 changed files with 5647 additions and 1 deletions

View File

@ -285,4 +285,15 @@
#define __DECL_SIMD_erff32x
#define __DECL_SIMD_erff64x
#define __DECL_SIMD_erff128x
#define __DECL_SIMD_tanh
#define __DECL_SIMD_tanhf
#define __DECL_SIMD_tanhl
#define __DECL_SIMD_tanhf16
#define __DECL_SIMD_tanhf32
#define __DECL_SIMD_tanhf64
#define __DECL_SIMD_tanhf128
#define __DECL_SIMD_tanhf32x
#define __DECL_SIMD_tanhf64x
#define __DECL_SIMD_tanhf128x
#endif

View File

@ -72,7 +72,7 @@ __MATHCALL_VEC (cosh,, (_Mdouble_ __x));
/* Hyperbolic sine of X. */
__MATHCALL_VEC (sinh,, (_Mdouble_ __x));
/* Hyperbolic tangent of X. */
__MATHCALL (tanh,, (_Mdouble_ __x));
__MATHCALL_VEC (tanh,, (_Mdouble_ __x));
#ifdef __USE_GNU
/* Cosine and sine of X. */

View File

@ -61,6 +61,7 @@ GLIBC_2.35 _ZGVbN2v_log10 F
GLIBC_2.35 _ZGVbN2v_log1p F
GLIBC_2.35 _ZGVbN2v_log2 F
GLIBC_2.35 _ZGVbN2v_sinh F
GLIBC_2.35 _ZGVbN2v_tanh F
GLIBC_2.35 _ZGVbN2vv_atan2 F
GLIBC_2.35 _ZGVbN2vv_hypot F
GLIBC_2.35 _ZGVbN4v_acosf F
@ -78,6 +79,7 @@ GLIBC_2.35 _ZGVbN4v_log10f F
GLIBC_2.35 _ZGVbN4v_log1pf F
GLIBC_2.35 _ZGVbN4v_log2f F
GLIBC_2.35 _ZGVbN4v_sinhf F
GLIBC_2.35 _ZGVbN4v_tanhf F
GLIBC_2.35 _ZGVbN4vv_atan2f F
GLIBC_2.35 _ZGVbN4vv_hypotf F
GLIBC_2.35 _ZGVcN4v_acos F
@ -95,6 +97,7 @@ GLIBC_2.35 _ZGVcN4v_log10 F
GLIBC_2.35 _ZGVcN4v_log1p F
GLIBC_2.35 _ZGVcN4v_log2 F
GLIBC_2.35 _ZGVcN4v_sinh F
GLIBC_2.35 _ZGVcN4v_tanh F
GLIBC_2.35 _ZGVcN4vv_atan2 F
GLIBC_2.35 _ZGVcN4vv_hypot F
GLIBC_2.35 _ZGVcN8v_acosf F
@ -112,6 +115,7 @@ GLIBC_2.35 _ZGVcN8v_log10f F
GLIBC_2.35 _ZGVcN8v_log1pf F
GLIBC_2.35 _ZGVcN8v_log2f F
GLIBC_2.35 _ZGVcN8v_sinhf F
GLIBC_2.35 _ZGVcN8v_tanhf F
GLIBC_2.35 _ZGVcN8vv_atan2f F
GLIBC_2.35 _ZGVcN8vv_hypotf F
GLIBC_2.35 _ZGVdN4v_acos F
@ -129,6 +133,7 @@ GLIBC_2.35 _ZGVdN4v_log10 F
GLIBC_2.35 _ZGVdN4v_log1p F
GLIBC_2.35 _ZGVdN4v_log2 F
GLIBC_2.35 _ZGVdN4v_sinh F
GLIBC_2.35 _ZGVdN4v_tanh F
GLIBC_2.35 _ZGVdN4vv_atan2 F
GLIBC_2.35 _ZGVdN4vv_hypot F
GLIBC_2.35 _ZGVdN8v_acosf F
@ -146,6 +151,7 @@ GLIBC_2.35 _ZGVdN8v_log10f F
GLIBC_2.35 _ZGVdN8v_log1pf F
GLIBC_2.35 _ZGVdN8v_log2f F
GLIBC_2.35 _ZGVdN8v_sinhf F
GLIBC_2.35 _ZGVdN8v_tanhf F
GLIBC_2.35 _ZGVdN8vv_atan2f F
GLIBC_2.35 _ZGVdN8vv_hypotf F
GLIBC_2.35 _ZGVeN16v_acosf F
@ -163,6 +169,7 @@ GLIBC_2.35 _ZGVeN16v_log10f F
GLIBC_2.35 _ZGVeN16v_log1pf F
GLIBC_2.35 _ZGVeN16v_log2f F
GLIBC_2.35 _ZGVeN16v_sinhf F
GLIBC_2.35 _ZGVeN16v_tanhf F
GLIBC_2.35 _ZGVeN16vv_atan2f F
GLIBC_2.35 _ZGVeN16vv_hypotf F
GLIBC_2.35 _ZGVeN8v_acos F
@ -180,5 +187,6 @@ GLIBC_2.35 _ZGVeN8v_log10 F
GLIBC_2.35 _ZGVeN8v_log1p F
GLIBC_2.35 _ZGVeN8v_log2 F
GLIBC_2.35 _ZGVeN8v_sinh F
GLIBC_2.35 _ZGVeN8v_tanh F
GLIBC_2.35 _ZGVeN8vv_atan2 F
GLIBC_2.35 _ZGVeN8vv_hypot F

View File

@ -126,6 +126,10 @@
# define __DECL_SIMD_erf __DECL_SIMD_x86_64
# undef __DECL_SIMD_erff
# define __DECL_SIMD_erff __DECL_SIMD_x86_64
# undef __DECL_SIMD_tanh
# define __DECL_SIMD_tanh __DECL_SIMD_x86_64
# undef __DECL_SIMD_tanhf
# define __DECL_SIMD_tanhf __DECL_SIMD_x86_64
# endif
#endif

View File

@ -62,6 +62,8 @@
!GCC$ builtin (acoshf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (erf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (erff) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (tanh) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (tanhf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
!GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
@ -109,3 +111,5 @@
!GCC$ builtin (acoshf) attributes simd (notinbranch) if('x32')
!GCC$ builtin (erf) attributes simd (notinbranch) if('x32')
!GCC$ builtin (erff) attributes simd (notinbranch) if('x32')
!GCC$ builtin (tanh) attributes simd (notinbranch) if('x32')
!GCC$ builtin (tanhf) attributes simd (notinbranch) if('x32')

View File

@ -45,6 +45,7 @@ libmvec-funcs = \
sin \
sincos \
sinh \
tanh \
# Define libmvec function for benchtests directory.
libmvec-bench-funcs = \

View File

@ -29,6 +29,7 @@ libmvec {
_ZGVbN2v_log1p; _ZGVcN4v_log1p; _ZGVdN4v_log1p; _ZGVeN8v_log1p;
_ZGVbN2v_log2; _ZGVcN4v_log2; _ZGVdN4v_log2; _ZGVeN8v_log2;
_ZGVbN2v_sinh; _ZGVcN4v_sinh; _ZGVdN4v_sinh; _ZGVeN8v_sinh;
_ZGVbN2v_tanh; _ZGVcN4v_tanh; _ZGVdN4v_tanh; _ZGVeN8v_tanh;
_ZGVbN2vv_atan2; _ZGVcN4vv_atan2; _ZGVdN4vv_atan2; _ZGVeN8vv_atan2;
_ZGVbN2vv_hypot; _ZGVcN4vv_hypot; _ZGVdN4vv_hypot; _ZGVeN8vv_hypot;
_ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
@ -46,6 +47,7 @@ libmvec {
_ZGVbN4v_log1pf; _ZGVcN8v_log1pf; _ZGVdN8v_log1pf; _ZGVeN16v_log1pf;
_ZGVbN4v_log2f; _ZGVcN8v_log2f; _ZGVdN8v_log2f; _ZGVeN16v_log2f;
_ZGVbN4v_sinhf; _ZGVcN8v_sinhf; _ZGVdN8v_sinhf; _ZGVeN16v_sinhf;
_ZGVbN4v_tanhf; _ZGVcN8v_tanhf; _ZGVdN8v_tanhf; _ZGVeN16v_tanhf;
_ZGVbN4vv_atan2f; _ZGVcN8vv_atan2f; _ZGVdN8vv_atan2f; _ZGVeN16vv_atan2f;
_ZGVbN4vv_hypotf; _ZGVcN8vv_hypotf; _ZGVdN8vv_hypotf; _ZGVeN16vv_hypotf;
}

View File

@ -2067,6 +2067,21 @@ float: 3
float128: 3
ldouble: 4
Function: "tanh_vlen16":
float: 1
Function: "tanh_vlen2":
double: 1
Function: "tanh_vlen4":
double: 1
Function: "tanh_vlen4_avx2":
double: 1
Function: "tanh_vlen8":
double: 1
Function: "tgamma":
double: 9
float: 8

View File

@ -0,0 +1,20 @@
/* SSE2 version of vectorized tanh, vector length is 2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Any use of the name _ZGVbN2v_tanh in the core file included below is
   rewritten by the preprocessor to _ZGVbN2v_tanh_sse2, so that file is
   assembled under the suffixed name instead of the public one.  */
#define _ZGVbN2v_tanh _ZGVbN2v_tanh_sse2
#include "../svml_d_tanh2_core.S"

View File

@ -0,0 +1,27 @@
/* Multiple versions of vectorized tanh, vector length is 2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Public symbol handled by this file.  SYMBOL_NAME is defined before
   the include so the header can refer to it.
   NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined here;
   presumably they come from ifunc-mathvec-sse4_1.h -- confirm.  */
#define SYMBOL_NAME _ZGVbN2v_tanh
#include "ifunc-mathvec-sse4_1.h"
/* Emit the ifunc definition for SYMBOL_NAME using the selector.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Only when SHARED is defined: declare the __GI_ name with hidden
   visibility.  */
__hidden_ver1 (_ZGVbN2v_tanh, __GI__ZGVbN2v_tanh, __redirect__ZGVbN2v_tanh)
__attribute__ ((visibility ("hidden")));
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,20 @@
/* SSE version of vectorized tanh, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Any use of the name _ZGVdN4v_tanh in the core file included below is
   rewritten by the preprocessor to _ZGVdN4v_tanh_sse_wrapper, so that
   file is assembled under the suffixed name instead of the public one.  */
#define _ZGVdN4v_tanh _ZGVdN4v_tanh_sse_wrapper
#include "../svml_d_tanh4_core.S"

View File

@ -0,0 +1,27 @@
/* Multiple versions of vectorized tanh, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Public symbol handled by this file.  SYMBOL_NAME is defined before
   the include so the header can refer to it.
   NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined here;
   presumably they come from ifunc-mathvec-avx2.h -- confirm.  */
#define SYMBOL_NAME _ZGVdN4v_tanh
#include "ifunc-mathvec-avx2.h"
/* Emit the ifunc definition for SYMBOL_NAME using the selector.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Only when SHARED is defined: declare the __GI_ name with hidden
   visibility.  */
__hidden_ver1 (_ZGVdN4v_tanh, __GI__ZGVdN4v_tanh, __redirect__ZGVdN4v_tanh)
__attribute__ ((visibility ("hidden")));
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,20 @@
/* AVX2 version of vectorized tanh, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Any use of the name _ZGVeN8v_tanh in the core file included below is
   rewritten by the preprocessor to _ZGVeN8v_tanh_avx2_wrapper, so that
   file is assembled under the suffixed name instead of the public one.  */
#define _ZGVeN8v_tanh _ZGVeN8v_tanh_avx2_wrapper
#include "../svml_d_tanh8_core.S"

View File

@ -0,0 +1,27 @@
/* Multiple versions of vectorized tanh, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Public symbol handled by this file.  SYMBOL_NAME is defined before
   the include so the header can refer to it.
   NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined here;
   presumably they come from ifunc-mathvec-avx512-skx.h -- confirm.  */
#define SYMBOL_NAME _ZGVeN8v_tanh
#include "ifunc-mathvec-avx512-skx.h"
/* Emit the ifunc definition for SYMBOL_NAME using the selector.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Only when SHARED is defined: declare the __GI_ name with hidden
   visibility.  */
__hidden_ver1 (_ZGVeN8v_tanh, __GI__ZGVeN8v_tanh, __redirect__ZGVeN8v_tanh)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,472 @@
/* Function tanh vectorized with AVX-512.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* NOTE: Since the hyperbolic tangent function is odd
* (tanh(x) = -tanh(-x)), below algorithm deals with the absolute
* value of the argument |x|: tanh(x) = sign(x) * tanh(|x|)
*
* We use a table lookup method to compute tanh(|x|).
* The basic idea is to split the input range into a number of subintervals
* and to approximate tanh(.) with a polynomial on each of them.
*
* IEEE SPECIAL CONDITIONS:
* x = [+,-]0, r = [+,-]0
* x = +Inf, r = +1
* x = -Inf, r = -1
* x = QNaN, r = QNaN
* x = SNaN, r = QNaN
*
*
* ALGORITHM DETAILS
* We handle special values in a callout function, aside from main path
* computations. "Special" for this algorithm are:
* INF, NAN, |x| > HUGE_THRESHOLD
*
*
* Main path computations are organized as follows:
* Actually we split the interval [0, SATURATION_THRESHOLD)
* into a number of subintervals. On each subinterval we approximate tanh(.)
* with a minimax polynomial of pre-defined degree. Polynomial coefficients
* are computed beforehand and stored in table. We also use
*
* y := |x| + B,
*
* here B depends on subinterval and is used to make argument
* closer to zero.
* We also add large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD],
* where 1.0 + 0.0*y + 0.0*y^2 ... coefficients are stored - just to
* preserve main path computation logic but return 1.0 for all arguments.
*
* Hence reconstruction looks as follows:
* we extract proper polynomial and range reduction coefficients
* (Pj and B), corresponding to subinterval, to which |x| belongs,
* and return
*
* r := sign(x) * (P0 + P1 * y + ... + Pn * y^n)
*
* NOTE: we use multiprecision technique to multiply and sum the first
* K terms of the polynomial. So Pj, j = 0..K are stored in
* table each as a pair of target precision numbers (Pj and PLj) to
* achieve wider than target precision.
*
*
*/
/* Byte offsets of the fields of the data table
   __svml_dtanh_data_internal.

   _dC and _dP0 .. _dP17 are spaced 128 bytes apart (16 doubles per
   field, one per subinterval).  The mask fields that follow are spaced
   64 bytes apart (one full zmm register each).

   _dP1, _iExpMantMask, _iMinIdxOfsMask and _iMaxIdxMask are defined here
   but are not referenced by _ZGVeN8v_tanh_skx in this file.  */
#define _dC 0
#define _dP0 128
#define _dP1 256
#define _dP2 384
#define _dP3 512
#define _dP4 640
#define _dP5 768
#define _dP6 896
#define _dP7 1024
#define _dP8 1152
#define _dP9 1280
#define _dP10 1408
#define _dP11 1536
#define _dP12 1664
#define _dP13 1792
#define _dP14 1920
#define _dP15 2048
#define _dP16 2176
#define _dP17 2304
/* Masks applied to the high 32 bits of each double by the kernel.  */
#define _iExpMantMask_UISA 2432
#define _iMinIdxOfsMask_UISA 2496
#define _iMaxIdxMask_UISA 2560
/* Full-width (64-bit per lane) sign and absolute-value masks.  */
#define _dbSignMask 2624
#define _dbAbsMask 2688
/* Of the four offsets below only _iExpMask is used by the kernel (as
   the threshold for the special-value test).  */
#define _iExpMantMask 2752
#define _iExpMask 2816
#define _iMinIdxOfsMask 2880
#define _iMaxIdxMask 2944
#include <sysdep.h>
.text
.section .text.evex512,"ax",@progbits
/* _ZGVeN8v_tanh_skx: tanh of eight doubles.
   In:   %zmm0 = eight double-precision arguments.
   Out:  %zmm0 = result for each argument.

   Fast path: each lane picks one of 16 table rows from the high word of
   its argument, forms y = |x| - C[row] and evaluates a polynomial in y
   by Horner's scheme (P17 down to P2, then P0; the _dP1 row is not
   used), then ORs the sign of x back in.
   Slow path: lanes flagged in %edx are recomputed one by one through
   the scalar tanh.

   Stack frame (64-byte aligned, 320 bytes):
     (%rsp)     copy of the input; on the slow path reused for the
                saved %r14 / %r13 / %r12 at offsets 0 / 8 / 16
     64(%rsp)   sign bits of the input; on the slow path the input copy
     128(%rsp)  selected P0 row; on the slow path the result vector
     192(%rsp)  selected P2 row
     256(%rsp)  selected P3 row  */
ENTRY(_ZGVeN8v_tanh_skx)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Align the frame to 64 bytes so the zmm spill slots are aligned.  */
andq $-64, %rsp
subq $320, %rsp
/* zmm4 = high 32 bits of each double, moved to the low half of its
   lane; packed down to eight dwords in ymm5 by vpmovqd below.  */
vpsrlq $32, %zmm0, %zmm4
/* Keep the original input for the special-value path.  */
vmovups %zmm0, (%rsp)
/* zmm14 / zmm15 = first eight entries of the _dC / _dP0 rows.  */
vmovups __svml_dtanh_data_internal(%rip), %zmm14
vmovups _dP0+__svml_dtanh_data_internal(%rip), %zmm15
vpmovqd %zmm4, %ymm5
/* Constant loading */
/* zmm13 = |x|, zmm3 = sign bit of x.  */
vandpd _dbAbsMask+__svml_dtanh_data_internal(%rip), %zmm0, %zmm13
vandpd _dbSignMask+__svml_dtanh_data_internal(%rip), %zmm0, %zmm3
/* Here huge arguments, INF and NaNs are filtered out to callout. */
/* ymm7 = high word with the sign bit and low mantissa bits cleared.  */
vpand _iExpMantMask_UISA+__svml_dtanh_data_internal(%rip), %ymm5, %ymm7
vmovups _dP2+__svml_dtanh_data_internal(%rip), %zmm0
vmovups _dP16+__svml_dtanh_data_internal(%rip), %zmm4
vmovups _dP15+__svml_dtanh_data_internal(%rip), %zmm5
/* Spill the sign bits; zmm3 is reused for the P3 row next.  */
vmovups %zmm3, 64(%rsp)
vmovups _dP3+__svml_dtanh_data_internal(%rip), %zmm3
/* Table index: subtract the minimum offset, clamp to
   [0, _iMaxIdxMask_UISA], then shift right by 19 to get 0..15.  */
vpsubd _iMinIdxOfsMask_UISA+__svml_dtanh_data_internal(%rip), %ymm7, %ymm8
/* if VMIN, VMAX is defined for I type */
vxorps %ymm9, %ymm9, %ymm9
vpmaxsd %ymm9, %ymm8, %ymm10
vpminsd _iMaxIdxMask_UISA+__svml_dtanh_data_internal(%rip), %ymm10, %ymm11
vpsrld $19, %ymm11, %ymm12
vmovups _dP12+__svml_dtanh_data_internal(%rip), %zmm8
vmovups _dP11+__svml_dtanh_data_internal(%rip), %zmm9
vmovups _dP10+__svml_dtanh_data_internal(%rip), %zmm10
vmovups _dP9+__svml_dtanh_data_internal(%rip), %zmm11
/* zmm2 = per-lane row index widened to 64 bits for vpermt2pd.  */
vpmovzxdq %ymm12, %zmm2
vmovups _dP8+__svml_dtanh_data_internal(%rip), %zmm12
/* Each vpermt2pd selects, per lane, one of the 16 entries of a row:
   entries 0-7 come from the destination register loaded earlier,
   entries 8-15 from the memory operand at row+64.  */
vpermt2pd _dP2+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm0
vpermt2pd _dC+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm14
vpermt2pd _dP16+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm4
vpermt2pd _dP15+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm5
/* zmm1 = y = |x| - C[row].  */
vsubpd {rn-sae}, %zmm14, %zmm13, %zmm1
vpermt2pd _dP12+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm8
vpermt2pd _dP11+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm9
vpermt2pd _dP10+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm10
vpermt2pd _dP9+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm11
vpermt2pd _dP8+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm12
vpermt2pd _dP3+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm3
vpermt2pd _dP0+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm15
/* Spill the selected P2, P3 and P0 values so that zmm0, zmm3 and zmm15
   can be reused for P17, P5 and P4.  */
vmovups %zmm0, 192(%rsp)
vmovups _dP17+__svml_dtanh_data_internal(%rip), %zmm0
vmovups _dP7+__svml_dtanh_data_internal(%rip), %zmm13
vmovups _dP6+__svml_dtanh_data_internal(%rip), %zmm14
vmovups %zmm3, 256(%rsp)
vmovups _dP5+__svml_dtanh_data_internal(%rip), %zmm3
vmovups %zmm15, 128(%rsp)
vmovups _dP4+__svml_dtanh_data_internal(%rip), %zmm15
vpermt2pd _dP17+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm0
vpermt2pd _dP7+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm13
vpermt2pd _dP6+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm14
vpermt2pd _dP5+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm3
vpermt2pd _dP4+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm15
/* Horner evaluation: every vfmadd213pd computes zmm0 = zmm0 * y + Pk.
   This first step yields P17 * y + P16.  */
vfmadd213pd {rn-sae}, %zmm4, %zmm1, %zmm0
/* edx = 8-bit mask of lanes whose masked high word is greater than
   _iExpMask; those lanes take the special-value path.  */
vpcmpgtd _iExpMask+__svml_dtanh_data_internal(%rip), %ymm7, %ymm6
vmovmskps %ymm6, %edx
vmovups _dP14+__svml_dtanh_data_internal(%rip), %zmm6
vfmadd213pd {rn-sae}, %zmm5, %zmm1, %zmm0
vmovups _dP13+__svml_dtanh_data_internal(%rip), %zmm7
vpermt2pd _dP14+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm6
vpermt2pd _dP13+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm7
vfmadd213pd {rn-sae}, %zmm6, %zmm1, %zmm0
/* The index register is no longer needed; reload the spilled P3.  */
vmovups 256(%rsp), %zmm2
vfmadd213pd {rn-sae}, %zmm7, %zmm1, %zmm0
vfmadd213pd {rn-sae}, %zmm8, %zmm1, %zmm0
vfmadd213pd {rn-sae}, %zmm9, %zmm1, %zmm0
vfmadd213pd {rn-sae}, %zmm10, %zmm1, %zmm0
vfmadd213pd {rn-sae}, %zmm11, %zmm1, %zmm0
vfmadd213pd {rn-sae}, %zmm12, %zmm1, %zmm0
vfmadd213pd {rn-sae}, %zmm13, %zmm1, %zmm0
vfmadd213pd {rn-sae}, %zmm14, %zmm1, %zmm0
vfmadd213pd {rn-sae}, %zmm3, %zmm1, %zmm0
/* Reload the spilled P0.  */
vmovups 128(%rsp), %zmm3
vfmadd213pd {rn-sae}, %zmm15, %zmm1, %zmm0
vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
/* Reload the spilled P2.  */
vmovups 192(%rsp), %zmm2
vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
/* Final step adds P0 directly after P2; no step uses _dP1.  */
vfmadd213pd {rn-sae}, %zmm3, %zmm1, %zmm0
/* Put the sign of the input back onto the result.  */
vorpd 64(%rsp), %zmm0, %zmm0
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0
/* Restore registers
* and exit the function
*/
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
/* The code below is reached only by the jump above, with the frame
   still set up, so the frame CFI is re-established here.  */
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
/* Move the saved input to 64(%rsp) and store the vector result at
   128(%rsp); the bottom of the frame is then free for register saves.  */
vmovups (%rsp), %zmm1
vmovups %zmm0, 128(%rsp)
vmovups %zmm1, 64(%rsp)
# LOE rbx r12 r13 r14 r15 edx zmm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
/* Clear the upper vector state before the scalar calls below.  */
vzeroupper
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -304; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd0, 0xfe, 0xff, 0xff, 0x22
/* r12d = lane counter, starting at 0.  */
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -312; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xfe, 0xff, 0xff, 0x22
/* r13d = mask of lanes that need the scalar call.  */
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -320; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xfe, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
/* CF = bit r12d of the lane mask.  */
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
/* Eight lanes per vector.  */
cmpl $8, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
/* All lanes done: restore the saved registers and reload the patched
   result vector.  */
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovups 128(%rsp), %zmm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -304; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd0, 0xfe, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -312; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xfe, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -320; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xfe, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 zmm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
/* The 32-bit move zero-extends, so r14 can be used as a 64-bit index.
   Load input lane r14, call scalar tanh, store into result lane r14.  */
movl %r12d, %r14d
movsd 64(%rsp,%r14,8), %xmm0
call tanh@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movsd %xmm0, 128(%rsp,%r14,8)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVeN8v_tanh_skx)
.section .rodata, "a"
.align 64
/* C-style description of the layout of the table that follows.  It is
   only emitted when __svml_dtanh_data_internal_typedef is defined.
   NOTE(review): presumably kept as documentation only, since this is
   not assembler syntax -- confirm.

   Field shapes, in VUINT32 (unsigned int) units:
     [16][2]  16 entries of two words each (one 64-bit value per entry)
     [16][1]  16 single-word entries
     [8][2]   8 entries of two words each (one 64-bit value per entry)  */
#ifdef __svml_dtanh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(64)) VUINT32 _dC[16][2];
__declspec(align(64)) VUINT32 _dP0[16][2];
__declspec(align(64)) VUINT32 _dP1[16][2];
__declspec(align(64)) VUINT32 _dP2[16][2];
__declspec(align(64)) VUINT32 _dP3[16][2];
__declspec(align(64)) VUINT32 _dP4[16][2];
__declspec(align(64)) VUINT32 _dP5[16][2];
__declspec(align(64)) VUINT32 _dP6[16][2];
__declspec(align(64)) VUINT32 _dP7[16][2];
__declspec(align(64)) VUINT32 _dP8[16][2];
__declspec(align(64)) VUINT32 _dP9[16][2];
__declspec(align(64)) VUINT32 _dP10[16][2];
__declspec(align(64)) VUINT32 _dP11[16][2];
__declspec(align(64)) VUINT32 _dP12[16][2];
__declspec(align(64)) VUINT32 _dP13[16][2];
__declspec(align(64)) VUINT32 _dP14[16][2];
__declspec(align(64)) VUINT32 _dP15[16][2];
__declspec(align(64)) VUINT32 _dP16[16][2];
__declspec(align(64)) VUINT32 _dP17[16][2];
__declspec(align(64)) VUINT32 _iExpMantMask_UISA[16][1];
__declspec(align(64)) VUINT32 _iMinIdxOfsMask_UISA[16][1];
__declspec(align(64)) VUINT32 _iMaxIdxMask_UISA[16][1];
__declspec(align(64)) VUINT32 _dbSignMask[8][2];
__declspec(align(64)) VUINT32 _dbAbsMask[8][2];
__declspec(align(64)) VUINT32 _iExpMantMask[16][1];
__declspec(align(64)) VUINT32 _iExpMask[16][1];
__declspec(align(64)) VUINT32 _iMinIdxOfsMask[16][1];
__declspec(align(64)) VUINT32 _iMaxIdxMask[16][1];
} __svml_dtanh_data_internal;
#endif
/* Data table for the tanh kernel in this file.

   _dC and each coefficient row p0 .. p17 hold 16 doubles.  Entry i of
   every row belongs to the same subinterval: _dC[i] is the value
   subtracted from |x| and pK[i] are the polynomial coefficients.

   Entry 0 has C = 0, p0 = 0 and p2 = 1.0.  Entry 15 has C = 0,
   p0 = 1.0 and every other coefficient 0, so it evaluates to exactly
   1.0 (the saturation interval).

   The rows are followed by the integer and sign/abs masks, each
   replicated to fill 64 bytes.  */
__svml_dtanh_data_internal:
/*== _dC ==*/
.quad 0x0000000000000000, 0x3fcc000000000000, 0x3fd4000000000000, 0x3fdc000000000000
.quad 0x3fe4000000000000, 0x3fec000000000000, 0x3ff4000000000000, 0x3ffc000000000000
.quad 0x4004000000000000, 0x400c000000000000, 0x4014000000000000, 0x401c000000000000
.quad 0x4024000000000000, 0x402c000000000000, 0x4034000000000000, 0x0000000000000000
/*== p0 ==*/
.align 64
.quad 0x0000000000000000, 0x3fcb8fd0416a7c92, 0x3fd35f98a0ea650e, 0x3fda5729ee488037
.quad 0x3fe1bf47eabb8f95, 0x3fe686650b8c2015, 0x3feb2523bb6b2dee, 0x3fee1fbf97e33527
.quad 0x3fef9258260a71c2, 0x3feff112c63a9077, 0x3fefff419668df11, 0x3feffffc832750f2
.quad 0x3feffffffdc96f35, 0x3fefffffffffcf58, 0x3ff0000000000000, 0x3ff0000000000000
/*== p1 ==*/
/* NOTE(review): this row is not read by _ZGVeN8v_tanh_skx; presumably
   it is the low-order part of p0 described in the algorithm notes.  */
.align 64
.quad 0x0000000000000000, 0x3c65e23ebcd3bcbe, 0xbc4c600bac3adf00, 0x3c6c44091785d040
.quad 0x3c8221d7a6e3674b, 0x3c69f89d2cf6b85c, 0x3c73b3e9ec0b8f1c, 0xbc7f8d4b0428aada
.quad 0xbc7c52d880cf43c0, 0x3c7dd36e37096480, 0x3c7b4f6380c442ca, 0xbc729755de470096
.quad 0x3c84cf852845efbd, 0x3c6fc4fb440a5378, 0xbc63981083b55870, 0x0000000000000000
/*== p2 ==*/
.align 64
.quad 0x3ff0000000000000, 0x3fee842ca3f08532, 0x3fed11574af58f1b, 0x3fea945b9c24e4f9
.quad 0x3fe6284c3374f815, 0x3fe02500a09f8d6e, 0x3fd1f25131e3a8c0, 0x3fbd22ca1c24a139
.quad 0x3f9b3afe1fba5c76, 0x3f6dd37d19b22b21, 0x3f27ccec13a9ef96, 0x3ecbe6c3f33250ae
.quad 0x3e41b4865394f75f, 0x3d8853f01bda5f28, 0x3c73953c0197ef58, 0x0000000000000000
/*== p3 ==*/
.align 64
.quad 0xbbf0b3ea3fdfaa19, 0xbfca48aaeb53bc21, 0xbfd19921f4329916, 0xbfd5e0f09bef8011
.quad 0xbfd893b59c35c882, 0xbfd6ba7cb7576538, 0xbfce7291743d7555, 0xbfbb6d85a01efb80
.quad 0xbf9addae58c7141a, 0xbf6dc59376c7aa19, 0xbf27cc5e74677410, 0xbecbe6c0e8b4cc87
.quad 0xbe41b486526b0565, 0xbd8853f01bef63a4, 0xbc73955be519be31, 0x0000000000000000
/*== p4 ==*/
.align 64
.quad 0xbfd5555555555555, 0xbfd183afc292ba11, 0xbfcc1a4b039c9bfa, 0xbfc16e1e6d8d0be6
.quad 0xbf92426c751e48a2, 0x3fb4f152b2bad124, 0x3fbbba40cbef72be, 0x3fb01ba038be6a3d
.quad 0x3f916df44871efc8, 0x3f63c6869dfc8870, 0x3f1fb9aef915d828, 0x3ec299d1e27c6e11
.quad 0x3e379b5ddcca334c, 0x3d8037f57bc62c9a, 0x3c6a2d4b50a2cff7, 0x0000000000000000
/*== p5 ==*/
.align 64
.quad 0xbce6863ee44ed636, 0x3fc04dcd0476c75e, 0x3fc43d3449a80f08, 0x3fc5c26f3699b7e7
.quad 0x3fc1a686f6ab2533, 0x3faf203c316ce730, 0xbf89c7a02788557c, 0xbf98157e26e0d541
.quad 0xbf807b55c1c7d278, 0xbf53a18d5843190f, 0xbf0fb6bbc89b1a5b, 0xbeb299c9c684a963
.quad 0xbe279b5dd4fb3d01, 0xbd7037f57ae72aa6, 0xbc5a2ca2bba78e86, 0x0000000000000000
/*== p6 ==*/
.align 64
.quad 0x3fc1111111112ab5, 0x3fb5c19efdfc08ad, 0x3fa74c98dc34fbac, 0xbf790d6a8eff0a77
.quad 0xbfac3c021789a786, 0xbfae2196b7326859, 0xbf93a7a011ff8c2a, 0x3f6e4709c7e8430e
.quad 0x3f67682afa611151, 0x3f3ef2ee77717cbf, 0x3ef95a4482f180b7, 0x3e9dc2c27da3b603
.quad 0x3e12e2afd9f7433e, 0x3d59f320348679ba, 0x3c44b61d9bbcc940, 0x0000000000000000
/*== p7 ==*/
.align 64
.quad 0xbda1ea19ddddb3b4, 0xbfb0b8df995ce4df, 0xbfb2955cf41e8164, 0xbfaf9d05c309f7c6
.quad 0xbf987d27ccff4291, 0x3f8b2ca62572b098, 0x3f8f1cf6c7f5b00a, 0x3f60379811e43dd5
.quad 0xbf4793826f78537e, 0xbf2405695e36240f, 0xbee0e08de39ce756, 0xbe83d709ba5f714e
.quad 0xbdf92e3fc5ee63e0, 0xbd414cc030f2110e, 0xbc2ba022e8d82a87, 0x0000000000000000
/*== p8 ==*/
.align 64
.quad 0xbfaba1ba1990520b, 0xbf96e37bba52f6fc, 0x3ecff7df18455399, 0x3f97362834d33a4e
.quad 0x3f9e7f8380184b45, 0x3f869543e7c420d4, 0xbf7326bd4914222a, 0xbf5fc15b0a9d98fa
.quad 0x3f14cffcfa69fbb6, 0x3f057e48e5b79d10, 0x3ec33b66d7d77264, 0x3e66ac4e578b9b10
.quad 0x3ddcc74b8d3d5c42, 0x3d23c589137f92b4, 0x3c107f8e2c8707a1, 0x0000000000000000
/*== p9 ==*/
.align 64
.quad 0xbe351ca7f096011f, 0x3f9eaaf3320c3851, 0x3f9cf823fe761fc1, 0x3f9022271754ff1f
.quad 0xbf731fe77c9c60af, 0xbf84a6046865ec7d, 0xbf4ca3f1f2b9192b, 0x3f4c77dee0afd227
.quad 0x3f04055bce68597a, 0xbee2bf0cb4a71647, 0xbea31eaafe73efd5, 0xbe46abb02c4368ed
.quad 0xbdbcc749ca8079dd, 0xbd03c5883836b9d2, 0xbbf07a5416264aec, 0x0000000000000000
/*== p10 ==*/
.align 64
.quad 0x3f9664f94e6ac14e, 0xbf94d3343bae39dd, 0xbf7bc748e60df843, 0xbf8c89372b43ba85
.quad 0xbf8129a092de747a, 0x3f60c85b4d538746, 0x3f5be9392199ec18, 0xbf2a0c68a4489f10
.quad 0xbf00462601dc2faa, 0x3eb7b6a219dea9f4, 0x3e80cbcc8d4c5c8a, 0x3e2425bb231a5e29
.quad 0x3d9992a4beac8662, 0x3ce191ba5ed3fb67, 0x3bc892450bad44c4, 0x0000000000000000
/*== p11 ==*/
.align 64
.quad 0xbea8c4c1fd7852fe, 0xbfccce16b1046f13, 0xbf81a16f224bb7b6, 0xbf62cbf00406bc09
.quad 0x3f75b29bb02cf69b, 0x3f607df0f9f90c17, 0xbf4b852a6e0758d5, 0xbf0078c63d1b8445
.quad 0x3eec12eadd55be7a, 0xbe6fa600f593181b, 0xbe5a3c935dce3f7d, 0xbe001c6d95e3ae96
.quad 0xbd74755a00ea1fd3, 0xbcbc1c6c063bb7ac, 0xbba3be9a4460fe00, 0x0000000000000000
/*== p12 ==*/
.align 64
.quad 0xbf822404577aa9dd, 0x403d8b07f7a82aa3, 0xbf9f44ab92fbab0a, 0x3fb2eac604473d6a
.quad 0x3f45f87d903aaac8, 0xbf5e104671036300, 0x3f19bc98ddf0f340, 0x3f0d4304bc9246e8
.quad 0xbed13c415f7b9d41, 0xbe722b8d9720cdb0, 0x3e322666d739bec0, 0x3dd76a553d7e7918
.quad 0x3d4de0fa59416a39, 0x3c948716cf3681b4, 0x3b873f9f2d2fda99, 0x0000000000000000
/*== p13 ==*/
.align 64
.quad 0xbefdd99a221ed573, 0x4070593a3735bab4, 0xbfccab654e44835e, 0x3fd13ed80037dbac
.quad 0xbf6045b9076cc487, 0x3f2085ee7e8ac170, 0x3f23524622610430, 0xbeff12a6626911b4
.quad 0x3eab9008bca408af, 0x3e634df71865f620, 0xbe05bb1bcf83ca73, 0xbdaf2ac143fb6762
.quad 0xbd23eae52a3dbf57, 0xbc6b5e3e9ca0955e, 0xbb5eca68e2c1ba2e, 0x0000000000000000
/*== p14 ==*/
.align 64
.quad 0x3f6e3be689423841, 0xc0d263511f5baac1, 0x40169f73b15ebe5c, 0xc025c1dd41cd6cb5
.quad 0xbf58fd89fe05e0d1, 0x3f73f7af01d5af7a, 0xbf1e40bdead17e6b, 0x3ee224cd6c4513e5
.quad 0xbe24b645e68eeaa3, 0xbe4abfebfb72bc83, 0x3dd51c38f8695ed3, 0x3d8313ac38c6832b
.quad 0x3cf7787935626685, 0x3c401ffc49c6bc29, 0xbabf0b21acfa52ab, 0x0000000000000000
/*== p15 ==*/
.align 64
.quad 0xbf2a1306713a4f3a, 0xc1045e509116b066, 0x4041fab9250984ce, 0xc0458d090ec3de95
.quad 0xbf74949d60113d63, 0x3f7c9fd6200d0ade, 0x3f02cd40e0ad0a9f, 0xbe858ab8e019f311
.quad 0xbe792fa6323b7cf8, 0x3e2df04d67876402, 0xbd95c72be95e4d2c, 0xbd55a89c30203106
.quad 0xbccad6b3bb9eff65, 0xbc12705ccd3dd884, 0xba8e0a4c47ae75f5, 0x0000000000000000
/*== p16 ==*/
.align 64
.quad 0xbf55d7e76dc56871, 0x41528c38809c90c7, 0xc076d57fb5190b02, 0x4085f09f888f8ada
.quad 0x3fa246332a2fcba5, 0xbfb29d851a896fcd, 0x3ed9065ae369b212, 0xbeb8e1ba4c98a030
.quad 0x3e6ffd0766ad4016, 0xbe0c63c29f505f5b, 0xbd7fab216b9e0e49, 0x3d2826b62056aa27
.quad 0x3ca313e31762f523, 0x3bea37aa21895319, 0x3ae5c7f1fd871496, 0x0000000000000000
/*== p17 ==*/
.align 64
.quad 0x3f35e67ab76a26e7, 0x41848ee0627d8206, 0xc0a216d618b489ec, 0x40a5b89107c8af4f
.quad 0x3fb69d8374520eda, 0xbfbded519f981716, 0xbef02d288b5b3371, 0x3eb290981209c1a6
.quad 0xbe567e924bf5ff6e, 0x3de3f7f7de6b0eb6, 0x3d69ed18bae3ebbc, 0xbcf7534c4f3dfa71
.quad 0xbc730b73f1eaff20, 0xbbba2cff8135d462, 0xbab5a71b5f7d9035, 0x0000000000000000
/* Masks.  The name of each mask follows its values on the same line.  */
.align 64
.long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask_UISA */
.align 64
.long 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000 /* _iMinIdxOfsMask_UISA */
.align 64
.long 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000 /* _iMaxIdxMask_UISA */
.align 64
.quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000 /* _dbSignMask */
.align 64
.quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff /* _dbAbsMask */
.align 64
.long 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000 /* _iExpMantMask */
.align 64
.long 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000 /* _iExpMask */
.align 64
.long 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000 /* _iMinIdxOfsMask */
.align 64
.long 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000 /* _iMaxIdxMask */
.align 64
.type __svml_dtanh_data_internal,@object
.size __svml_dtanh_data_internal,.-__svml_dtanh_data_internal

View File

@ -0,0 +1,20 @@
/* AVX2 wrapper version of vectorized tanhf, vector length is 16.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the entry point before pulling in the 16-lane core file, so that
   every use of _ZGVeN16v_tanhf in it expands to the _avx2_wrapper name.
   NOTE(review): assumes the included file defines _ZGVeN16v_tanhf - confirm.  */
#define _ZGVeN16v_tanhf _ZGVeN16v_tanhf_avx2_wrapper
#include "../svml_s_tanhf16_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized tanhf, vector length is 16.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Public vector symbol this file provides.  */
#define SYMBOL_NAME _ZGVeN16v_tanhf
/* NOTE(review): presumably supplies REDIRECT_NAME and IFUNC_SELECTOR used
   below - confirm against the header.  */
#include "ifunc-mathvec-avx512-skx.h"
/* Define SYMBOL_NAME as an indirect function; the implementation is whatever
   IFUNC_SELECTOR () returns.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Shared builds only: hidden-visibility __GI_ alias tied to the redirect name.
   NOTE(review): exact aliasing is done by __hidden_ver1 - confirm there.  */
__hidden_ver1 (_ZGVeN16v_tanhf, __GI__ZGVeN16v_tanhf,
__redirect__ZGVeN16v_tanhf)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,381 @@
/* Function tanhf vectorized with AVX-512.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/*
* ALGORITHM DESCRIPTION:
*
* NOTE: Since the hyperbolic tangent function is odd
* (tanh(x) = -tanh(-x)), below algorithm deals with the absolute
* value of the argument |x|: tanh(x) = sign(x) * tanh(|x|)
*
* We use a table lookup method to compute tanh(|x|).
* The basic idea is to split the input range into a number of subintervals
* and to approximate tanh(.) with a polynomial on each of them.
*
* IEEE SPECIAL CONDITIONS:
* x = [+,-]0, r = [+,-]0
* x = +Inf, r = +1
* x = -Inf, r = -1
* x = QNaN, r = QNaN
* x = SNaN, r = QNaN
*
*
* ALGORITHM DETAILS
* We handle special values in a callout function, aside from main path
* computations. "Special" for this algorithm are:
* INF, NAN, |x| > HUGE_THRESHOLD
*
*
* Main path computations are organized as follows:
* Actually we split the interval [0, SATURATION_THRESHOLD)
* into a number of subintervals. On each subinterval we approximate tanh(.)
* with a minimax polynomial of pre-defined degree. Polynomial coefficients
* are computed beforehand and stored in table. We also use
*
* y := |x| + B,
*
* here B depends on subinterval and is used to make argument
* closer to zero.
* We also add large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD],
* where 1.0 + 0.0*y + 0.0*y^2 ... coefficients are stored - just to
* preserve main path computation logic but return 1.0 for all arguments.
*
* Hence reconstruction looks as follows:
* we extract proper polynomial and range reduction coefficients
* (Pj and B), corresponding to subinterval, to which |x| belongs,
* and return
*
* r := sign(x) * (P0 + P1 * y + ... + Pn * y^n)
*
* NOTE: we use multiprecision technique to multiply and sum the first
* K terms of the polynomial. So Pj, j = 0..K are stored in
* table each as a pair of target precision numbers (Pj and PLj) to
* achieve wider than target precision.
*
*
*/
/* Offsets for data table __svml_stanh_data_internal
*/
#define _sC 0
#define _sP0 128
#define _sP2 256
#define _sP3 384
#define _sP4 512
#define _sP5 640
#define _sP6 768
#define _sP7 896
#define _iExpMantMask_UISA 1024
#define _iMinIdxOfsMask_UISA 1088
#define _iMaxIdxMask_UISA 1152
#define _sSignMask 1216
#define _sAbsMask 1280
#define _iExpMantMask 1344
#define _iExpMask 1408
#define _iMinIdxOfsMask 1472
#define _iMaxIdxMask 1536
#include <sysdep.h>
/* _ZGVeN16v_tanhf_skx: tanhf on 16 packed floats (AVX-512).
 *
 * In:    zmm0 = x[0..15]
 * Out:   zmm0 = result for each lane
 * Uses:  zmm0-zmm15, k0, k1, eax, edx.  r12, r13 and r14 are saved and
 *        restored around the scalar fallback loop.
 * Stack: rbp frame; rsp is aligned down to 64 and 192 bytes are reserved:
 *          0..23    saved r14 / r13 / r12
 *          64..127  copy of the input vector
 *          128..191 result vector (patched lane by lane in the fallback)
 *
 * Main path, per lane: the masked exponent/mantissa bits of x are turned
 * into a 5-bit interval index, C and P0, P2..P7 for that interval are
 * fetched from the 32-entry tables, and
 *     P0 + y*(P2 + y*(P3 + y*(P4 + y*(P5 + y*(P6 + y*P7))))),  y = |x| - C
 * is evaluated with FMAs; the sign bit of x is then OR-ed back in.
 * Lanes whose masked bits exceed _iExpMask are recomputed one at a time
 * by calling scalar tanhf.
 */
.text
/* EVEX-encoded 512-bit code goes in its own text subsection.  */
.section .text.evex512,"ax",@progbits
ENTRY(_ZGVeN16v_tanhf_skx)
/* Set up the rbp frame, then align rsp to 64 and reserve the scratch area.  */
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-64, %rsp
subq $192, %rsp
/* zmm1 keeps the untouched input: needed for the sign and for the
   scalar fallback.  */
vmovaps %zmm0, %zmm1
/* Preload entries 0..15 of the C and P6..P2 tables.  Entries 16..31 are
   supplied as the memory operand of the vpermt2ps instructions below.  */
vmovups __svml_stanh_data_internal(%rip), %zmm9
vmovups _sP6+__svml_stanh_data_internal(%rip), %zmm11
vmovups _sP5+__svml_stanh_data_internal(%rip), %zmm12
vmovups _sP4+__svml_stanh_data_internal(%rip), %zmm13
vmovups _sP3+__svml_stanh_data_internal(%rip), %zmm14
vmovups _sP2+__svml_stanh_data_internal(%rip), %zmm15
/* zmm2 = all ones; lanes that pass the range check are cleared later.  */
vpternlogd $255, %zmm2, %zmm2, %zmm2
/* zmm8 = |x|, zmm0 = sign bit of x.  */
vandps _sAbsMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm8
vandps _sSignMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm0
/* Here huge arguments, INF and NaNs are filtered out to callout. */
/* zmm3 = masked bits of x; zmm4 = zmm3 minus the index bias;
   k1 = lanes with zmm3 <= _iExpMask (signed compare, predicate 2 = LE).  */
vpandd _iExpMantMask_UISA+__svml_stanh_data_internal(%rip), %zmm1, %zmm3
vpsubd _iMinIdxOfsMask_UISA+__svml_stanh_data_internal(%rip), %zmm3, %zmm4
vpcmpd $2, _iExpMask+__svml_stanh_data_internal(%rip), %zmm3, %k1
/*
* small table specific variables *
* Constant loading
*/
vpxord %zmm5, %zmm5, %zmm5
/* if VMIN, VMAX is defined for I type */
/* Clamp the biased value to [0, _iMaxIdxMask_UISA] and shift it down to
   the interval index kept in zmm10.  */
vpmaxsd %zmm5, %zmm4, %zmm6
vpminsd _iMaxIdxMask_UISA+__svml_stanh_data_internal(%rip), %zmm6, %zmm7
vpsrld $21, %zmm7, %zmm10
vmovups _sP7+__svml_stanh_data_internal(%rip), %zmm4
/* Two-source permutes: each register already holds table entries 0..15,
   the memory operand holds entries 16..31, and zmm10 selects per lane.  */
vpermt2ps _sC+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm9
vpermt2ps _sP6+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm11
vpermt2ps _sP7+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm4
vpermt2ps _sP5+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm12
vpermt2ps _sP4+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm13
vpermt2ps _sP3+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm14
vpermt2ps _sP2+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm15
/* In-range lanes (k1) get zmm3 & ~zmm3 = 0; the others keep all ones.
   k0 / edx therefore carry one bit per lane that needs the fallback.  */
vpandnd %zmm3, %zmm3, %zmm2{%k1}
vptestmd %zmm2, %zmm2, %k0
vmovups _sP0+__svml_stanh_data_internal(%rip), %zmm3
/* zmm2 = y = |x| - C.  */
vsubps {rn-sae}, %zmm9, %zmm8, %zmm2
kmovw %k0, %edx
/* Horner evaluation: zmm4 = zmm4 * y + next coefficient, P7 down to P0.  */
vfmadd213ps {rn-sae}, %zmm11, %zmm2, %zmm4
vpermt2ps _sP0+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm3
vfmadd213ps {rn-sae}, %zmm12, %zmm2, %zmm4
vfmadd213ps {rn-sae}, %zmm13, %zmm2, %zmm4
vfmadd213ps {rn-sae}, %zmm14, %zmm2, %zmm4
vfmadd213ps {rn-sae}, %zmm15, %zmm2, %zmm4
vfmadd213ps {rn-sae}, %zmm3, %zmm2, %zmm4
/* Re-apply the sign of the input.  */
vorps %zmm0, %zmm4, %zmm0
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
/* Restore registers
* and exit the function
*/
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
/* CFI for the code after the ret, which still runs inside the rbp frame.  */
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
/* Spill the input and the main-path result so that single lanes can be
   read and overwritten through memory.  */
vmovups %zmm1, 64(%rsp)
vmovups %zmm0, 128(%rsp)
# LOE rbx r12 r13 r14 r15 edx zmm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
/* Clear the upper vector state before calling into scalar code.  */
vzeroupper
/* r12d = lane counter (starts at 0), r13d = lane mask, r14 = lane index
   kept across the call; all three are callee-saved, so save them first.  */
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
/* CF = bit r12d of the lane mask.  */
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $16, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
/* All 16 lanes visited: restore the saved registers and reload the
   patched result vector.  */
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovups 128(%rsp), %zmm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 zmm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
/* Load lane r12d of the saved input, call scalar tanhf, and store the
   result into the same lane of the saved output.  */
movl %r12d, %r14d
movss 64(%rsp,%r14,4), %xmm0
call tanhf@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movss %xmm0, 128(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVeN16v_tanhf_skx)
/* Read-only constant table for the AVX-512 tanhf kernel.
   The layout has to agree with the _sC / _sP* / _i* / _s* byte offsets
   defined near the top of this file: eight 32-entry float tables of
   128 bytes each, followed by nine 16-lane broadcast masks of 64 bytes
   each, every member aligned to 64.  */
.section .rodata, "a"
.align 64
/* Reference-only C view of the layout; it is seen by the assembler only if
   __svml_stanh_data_internal_typedef is defined.  */
#ifdef __svml_stanh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(64)) VUINT32 _sC[32][1];
__declspec(align(64)) VUINT32 _sP0[32][1];
__declspec(align(64)) VUINT32 _sP2[32][1];
__declspec(align(64)) VUINT32 _sP3[32][1];
__declspec(align(64)) VUINT32 _sP4[32][1];
__declspec(align(64)) VUINT32 _sP5[32][1];
__declspec(align(64)) VUINT32 _sP6[32][1];
__declspec(align(64)) VUINT32 _sP7[32][1];
__declspec(align(64)) VUINT32 _iExpMantMask_UISA[16][1];
__declspec(align(64)) VUINT32 _iMinIdxOfsMask_UISA[16][1];
__declspec(align(64)) VUINT32 _iMaxIdxMask_UISA[16][1];
__declspec(align(64)) VUINT32 _sSignMask[16][1];
__declspec(align(64)) VUINT32 _sAbsMask[16][1];
__declspec(align(64)) VUINT32 _iExpMantMask[16][1];
__declspec(align(64)) VUINT32 _iExpMask[16][1];
__declspec(align(64)) VUINT32 _iMinIdxOfsMask[16][1];
__declspec(align(64)) VUINT32 _iMaxIdxMask[16][1];
} __svml_stanh_data_internal;
#endif
__svml_stanh_data_internal:
/* Per-interval value C that the kernel subtracts from |x|.
   Entry 31 is 0; in that entry P0 is 0x3f800000 (1.0f) and P2..P7 are 0.  */
/*== _sC ==*/
.long 0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
.long 0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
.long 0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
.long 0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
.long 0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
.long 0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
.long 0x40500000, 0x40700000, 0x40900000, 0x40b00000
.long 0x40d00000, 0x40f00000, 0x41100000, 0x00000000
/* Constant term of the per-interval polynomial.  */
/*== p0 ==*/
.align 64
.long 0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
.long 0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
.long 0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
.long 0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
.long 0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
.long 0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
.long 0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
.long 0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
/* Coefficient of y (the kernel has no separate P1 table).  */
/*== p2 ==*/
.align 64
.long 0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
.long 0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
.long 0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
.long 0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
.long 0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
.long 0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
.long 0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
.long 0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
/*== p3 ==*/
.align 64
.long 0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
.long 0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
.long 0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
.long 0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
.long 0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
.long 0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
.long 0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
.long 0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
/*== p4 ==*/
.align 64
.long 0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
.long 0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
.long 0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
.long 0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
.long 0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
.long 0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
.long 0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
.long 0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
/*== p5 ==*/
.align 64
.long 0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
.long 0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
.long 0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
.long 0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
.long 0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
.long 0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
.long 0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
.long 0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
/*== p6 ==*/
.align 64
.long 0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
.long 0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
.long 0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
.long 0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
.long 0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
.long 0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
.long 0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
.long 0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
/* Leading (highest-order) coefficient; the Horner chain starts here.  */
/*== p7 ==*/
.align 64
.long 0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
.long 0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
.long 0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
.long 0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
.long 0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
.long 0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
.long 0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
.long 0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
/* Broadcast masks, 16 identical lanes each.  The three *_UISA masks,
   _sSignMask, _sAbsMask and _iExpMask are the ones read by
   _ZGVeN16v_tanhf_skx.  */
.align 64
.long 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000 /* _iExpMantMask_UISA */
.align 64
.long 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000 /* _iMinIdxOfsMask_UISA */
.align 64
.long 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000 /* _iMaxIdxMask_UISA */
.align 64
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
.align 64
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
/* The non-UISA _iExpMantMask, _iMinIdxOfsMask and _iMaxIdxMask entries are
   not referenced by _ZGVeN16v_tanhf_skx; they still occupy their slots so
   that the offsets of the later members stay valid.  */
.align 64
.long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
.align 64
.long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
.align 64
.long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
.align 64
.long 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
.align 64
/* Mark the label as a data object and record its size (label to here).  */
.type __svml_stanh_data_internal,@object
.size __svml_stanh_data_internal,.-__svml_stanh_data_internal

View File

@ -0,0 +1,20 @@
/* SSE2 version of vectorized tanhf, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the entry point before pulling in the 4-lane core file, so that
   every use of _ZGVbN4v_tanhf in it expands to the _sse2 name.
   NOTE(review): assumes the included file defines _ZGVbN4v_tanhf - confirm.  */
#define _ZGVbN4v_tanhf _ZGVbN4v_tanhf_sse2
#include "../svml_s_tanhf4_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized tanhf, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Public vector symbol this file provides.  */
#define SYMBOL_NAME _ZGVbN4v_tanhf
/* NOTE(review): presumably supplies REDIRECT_NAME and IFUNC_SELECTOR used
   below - confirm against the header.  */
#include "ifunc-mathvec-sse4_1.h"
/* Define SYMBOL_NAME as an indirect function; the implementation is whatever
   IFUNC_SELECTOR () returns.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Shared builds only: hidden-visibility __GI_ alias tied to the redirect name.
   NOTE(review): exact aliasing is done by __hidden_ver1 - confirm there.  */
__hidden_ver1 (_ZGVbN4v_tanhf, __GI__ZGVbN4v_tanhf,
__redirect__ZGVbN4v_tanhf)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,832 @@
/* Function tanhf vectorized with SSE4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/*
* ALGORITHM DESCRIPTION:
*
* NOTE: Since the hyperbolic tangent function is odd
* (tanh(x) = -tanh(-x)), below algorithm deals with the absolute
* value of the argument |x|: tanh(x) = sign(x) * tanh(|x|)
*
* We use a table lookup method to compute tanh(|x|).
* The basic idea is to split the input range into a number of subintervals
* and to approximate tanh(.) with a polynomial on each of them.
*
* IEEE SPECIAL CONDITIONS:
* x = [+,-]0, r = [+,-]0
* x = +Inf, r = +1
* x = -Inf, r = -1
* x = QNaN, r = QNaN
* x = SNaN, r = QNaN
*
*
* ALGORITHM DETAILS
* We handle special values in a callout function, aside from main path
* computations. "Special" for this algorithm are:
* INF, NAN, |x| > HUGE_THRESHOLD
*
*
* Main path computations are organized as follows:
* Actually we split the interval [0, SATURATION_THRESHOLD)
* into a number of subintervals. On each subinterval we approximate tanh(.)
* with a minimax polynomial of pre-defined degree. Polynomial coefficients
* are computed beforehand and stored in table. We also use
*
* y := |x| + B,
*
* here B depends on subinterval and is used to make argument
* closer to zero.
* We also add large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD],
* where 1.0 + 0.0*y + 0.0*y^2 ... coefficients are stored - just to
* preserve main path computation logic but return 1.0 for all arguments.
*
* Hence reconstruction looks as follows:
* we extract proper polynomial and range reduction coefficients
* (Pj and B), corresponding to subinterval, to which |x| belongs,
* and return
*
* r := sign(x) * (P0 + P1 * y + ... + Pn * y^n)
*
* NOTE: we use multiprecision technique to multiply and sum the first
* K terms of the polynomial. So Pj, j = 0..K are stored in
* table each as a pair of target precision numbers (Pj and PLj) to
* achieve wider than target precision.
*
*
*/
/* Offsets for data table __svml_stanh_data_internal
*/
#define _dbP 0
#define _sSignMask 4288
#define _sAbsMask 4304
#define _iExpMantMask 4320
#define _iExpMask 4336
#define _iMinIdxOfsMask 4352
#define _iMaxIdxMask 4368
#include <sysdep.h>
.text
.section .text.sse4,"ax",@progbits
/* _ZGVbN4v_tanhf_sse4: tanhf on 4 packed floats (SSE4).
 *
 * In:    xmm0 = x[0..3]
 * Out:   xmm0 = result for each lane
 * Uses:  xmm0-xmm15, eax, ecx, edx, esi, edi, r8.  r12, r13 and r14 are
 *        saved and restored around the scalar fallback loop.
 * Stack: 72 bytes reserved: 0..23 saved r14 / r13 / r12, 32..47 copy of
 *        the input, 48..63 result (patched lane by lane in the fallback).
 *
 * Main path, per lane: the masked bits of x are biased, clamped and
 * shifted into a byte offset into the _dbP table; the four doubles
 * A00..A03 at that offset are loaded and
 *     A00 + |x|*(A01 + |x|*(A02 + |x|*A03))
 * is evaluated in double precision, converted back to float, and the
 * sign bit of x is OR-ed in.  Lanes whose masked bits are greater than
 * _iExpMask are recomputed one at a time by calling scalar tanhf.
 */
ENTRY(_ZGVbN4v_tanhf_sse4)
subq $72, %rsp
cfi_def_cfa_offset(80)
/* xmm5 keeps the untouched input for the sign and the scalar fallback.  */
movaps %xmm0, %xmm5
/* Here huge arguments, INF and NaNs are filtered out to callout. */
movdqu _iExpMantMask+__svml_stanh_data_internal(%rip), %xmm9
/* r8 = table base + 16, so -16(idx,r8) reads {A00,A01} and (idx,r8)
   reads {A02,A03} of the selected polynomial.  */
lea _dbP+16+__svml_stanh_data_internal(%rip), %r8
pand %xmm5, %xmm9
/* if VMIN, VMAX is defined for I type */
pxor %xmm7, %xmm7
/* xmm6 = masked bits (kept for the range check); xmm9 = biased value.  */
movdqa %xmm9, %xmm6
psubd _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %xmm9
/*
* small table specific variables *
* Constant loading
*/
/* Clamp xmm9 to [0, _iMaxIdxMask] using compare masks:
   xmm11 = (value > max), xmm8 = (value > 0), xmm14 = clamped value.  */
movdqu _iMaxIdxMask+__svml_stanh_data_internal(%rip), %xmm10
movdqa %xmm9, %xmm11
movdqa %xmm9, %xmm8
pcmpgtd %xmm10, %xmm11
pcmpgtd %xmm7, %xmm8
movdqa %xmm11, %xmm14
pand %xmm8, %xmm9
andps %xmm11, %xmm10
andnps %xmm9, %xmm14
orps %xmm10, %xmm14
/* Shift the clamped value down to a table byte offset, then extract the
   four per-lane offsets into edx / ecx / esi / edi (lanes 0..3).  */
psrld $14, %xmm14
movd %xmm14, %edx
pshufd $1, %xmm14, %xmm12
pshufd $2, %xmm14, %xmm13
movd %xmm12, %ecx
pshufd $3, %xmm14, %xmm15
/* xmm3 = |x|.  */
movups _sAbsMask+__svml_stanh_data_internal(%rip), %xmm3
movslq %edx, %rdx
andps %xmm5, %xmm3
movslq %ecx, %rcx
/* xmm6 = lanes whose masked bits are > _iExpMask; eax collects one bit
   per such lane via movmskps below.  */
pcmpgtd _iExpMask+__svml_stanh_data_internal(%rip), %xmm6
movd %xmm13, %esi
/* Lanes 0 and 1: load {A00,A01} and transpose into
   xmm0 = {A00_0, A00_1}, xmm2 = {A01_0, A01_1}.  */
movups -16(%rdx,%r8), %xmm2
movaps %xmm2, %xmm0
movd %xmm15, %edi
movmskps %xmm6, %eax
movups -16(%rcx,%r8), %xmm6
unpcklpd %xmm6, %xmm0
unpckhpd %xmm6, %xmm2
/* Widen |x| to double: xmm6 = lanes 0,1 and xmm3 = lanes 2,3.  */
cvtps2pd %xmm3, %xmm6
movhlps %xmm3, %xmm3
cvtps2pd %xmm3, %xmm3
movslq %esi, %rsi
movslq %edi, %rdi
/* Load {A02,A03} for all four lanes and transpose into
   xmm12 / xmm13 = A03 (lanes 0,1 / 2,3), xmm10 / xmm11 = A02.  */
movups (%rcx,%r8), %xmm8
movups (%rdx,%r8), %xmm12
movups (%rsi,%r8), %xmm13
movaps %xmm12, %xmm10
movups (%rdi,%r8), %xmm9
movaps %xmm13, %xmm11
unpckhpd %xmm8, %xmm12
unpckhpd %xmm9, %xmm13
/* Horner steps: ((A03*|x| + A02)*|x| + A01)*|x| + A00, two lanes per
   register.  */
mulpd %xmm6, %xmm12
mulpd %xmm3, %xmm13
unpcklpd %xmm8, %xmm10
unpcklpd %xmm9, %xmm11
addpd %xmm10, %xmm12
addpd %xmm11, %xmm13
mulpd %xmm6, %xmm12
mulpd %xmm3, %xmm13
addpd %xmm2, %xmm12
/* Lanes 2 and 3: load {A00,A01}; xmm1 = A01 pair, xmm14 = A00 pair.  */
movups -16(%rsi,%r8), %xmm1
movups -16(%rdi,%r8), %xmm7
movaps %xmm1, %xmm14
unpckhpd %xmm7, %xmm1
addpd %xmm1, %xmm13
mulpd %xmm12, %xmm6
mulpd %xmm13, %xmm3
addpd %xmm0, %xmm6
unpcklpd %xmm7, %xmm14
addpd %xmm14, %xmm3
/* Narrow both halves back to float and merge them into xmm0.  */
cvtpd2ps %xmm6, %xmm0
cvtpd2ps %xmm3, %xmm1
movups _sSignMask+__svml_stanh_data_internal(%rip), %xmm4
movlhps %xmm1, %xmm0
/* Re-apply the sign of the input.  */
andps %xmm5, %xmm4
orps %xmm4, %xmm0
testl %eax, %eax
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm5
/* Restore registers
* and exit the function
*/
L(EXIT):
addq $72, %rsp
cfi_def_cfa_offset(8)
ret
/* CFI for the code after the ret, which still runs with the 72-byte frame.  */
cfi_def_cfa_offset(80)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
/* Spill the input and the main-path result so that single lanes can be
   read and overwritten through memory.  */
movups %xmm5, 32(%rsp)
movups %xmm0, 48(%rsp)
# LOE rbx rbp r12 r13 r14 r15 eax
xorl %edx, %edx
/* r12d = lane counter (starts at 0), r13d = lane mask, r14 = lane index
   kept across the call; all three are callee-saved, so save them first.  */
movq %r12, 16(%rsp)
cfi_offset(12, -64)
movl %edx, %r12d
movq %r13, 8(%rsp)
cfi_offset(13, -72)
movl %eax, %r13d
movq %r14, (%rsp)
cfi_offset(14, -80)
# LOE rbx rbp r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
/* CF = bit r12d of the lane mask.  */
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx rbp r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $4, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx rbp r15 r12d r13d
/* All 4 lanes visited: restore the saved registers and reload the
   patched result vector.  */
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
movups 48(%rsp), %xmm0
/* Go to exit */
jmp L(EXIT)
cfi_offset(12, -64)
cfi_offset(13, -72)
cfi_offset(14, -80)
# LOE rbx rbp r12 r13 r14 r15 xmm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
/* Load lane r12d of the saved input, call scalar tanhf, and store the
   result into the same lane of the saved output.  */
movl %r12d, %r14d
movss 32(%rsp,%r14,4), %xmm0
call tanhf@PLT
# LOE rbx rbp r14 r15 r12d r13d xmm0
movss %xmm0, 48(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx rbp r15 r12d r13d
END(_ZGVbN4v_tanhf_sse4)
/* Read-only constant block for the single-precision tanh kernel:
   a polynomial coefficient table followed by six lane masks.
   NOTE(review): presumably consumed by _ZGVbN4v_tanhf_sse4; the loads
   that reference it are outside this excerpt -- confirm.  */
.section .rodata, "a"
.align 16
/* The typedef below is compiled only when
   __svml_stanh_data_internal_typedef is defined (presumably never in a
   regular build -- TODO confirm).  It serves as a description of the
   order and alignment of the data emitted after the label.  */
#ifdef __svml_stanh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
/* 134*4 coefficient slots; each slot is two 32-bit words, i.e. one
   64-bit .quad in the table that follows.  */
__declspec(align(16)) VUINT32 _dbP[(134*4)][2];
/* Six masks, each four 32-bit lanes wide and 16-byte aligned.  */
__declspec(align(16)) VUINT32 _sSignMask[4][1];
__declspec(align(16)) VUINT32 _sAbsMask[4][1];
__declspec(align(16)) VUINT32 _iExpMantMask[4][1];
__declspec(align(16)) VUINT32 _iExpMask[4][1];
__declspec(align(16)) VUINT32 _iMinIdxOfsMask[4][1];
__declspec(align(16)) VUINT32 _iMaxIdxMask[4][1];
} __svml_stanh_data_internal;
#endif
/* Start of the data object; its size is recorded by the .size directive
   at the end of the block.  */
__svml_stanh_data_internal:
/* Pol_000: err=7.93e-09, x in [0.0000000; 0.0312500]. */
.quad 0x0000000000000000 /* A00 = +0.000000000000000000000e-01 */
.quad 0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00 */
.quad 0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06 */
.quad 0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01 */
.quad 0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08 */
.quad 0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00 */
.quad 0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05 */
.quad 0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01 */
.quad 0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08 */
.quad 0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00 */
.quad 0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04 */
.quad 0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01 */
.quad 0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08 */
.quad 0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00 */
.quad 0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04 */
.quad 0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01 */
.quad 0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08 */
.quad 0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00 */
.quad 0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04 */
.quad 0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01 */
.quad 0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08 */
.quad 0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00 */
.quad 0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04 */
.quad 0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01 */
.quad 0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08 */
.quad 0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00 */
.quad 0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04 */
.quad 0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01 */
.quad 0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08 */
.quad 0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00 */
.quad 0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04 */
.quad 0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01 */
.quad 0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07 */
.quad 0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00 */
.quad 0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04 */
.quad 0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01 */
.quad 0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07 */
.quad 0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00 */
.quad 0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04 */
.quad 0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01 */
.quad 0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07 */
.quad 0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00 */
.quad 0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04 */
.quad 0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01 */
.quad 0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07 */
.quad 0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00 */
.quad 0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04 */
.quad 0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01 */
.quad 0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07 */
.quad 0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00 */
.quad 0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04 */
.quad 0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01 */
.quad 0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07 */
.quad 0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00 */
.quad 0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04 */
.quad 0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01 */
.quad 0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07 */
.quad 0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00 */
.quad 0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04 */
.quad 0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01 */
.quad 0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07 */
.quad 0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00 */
.quad 0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04 */
.quad 0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01 */
.quad 0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07 */
.quad 0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00 */
.quad 0xBF44376634149405 /* A02 = -6.169556656102642569831e-04 */
.quad 0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01 */
.quad 0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07 */
.quad 0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00 */
.quad 0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04 */
.quad 0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01 */
.quad 0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07 */
.quad 0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00 */
.quad 0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04 */
.quad 0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01 */
.quad 0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06 */
.quad 0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00 */
.quad 0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04 */
.quad 0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01 */
.quad 0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06 */
.quad 0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00 */
.quad 0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03 */
.quad 0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01 */
.quad 0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06 */
.quad 0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00 */
.quad 0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03 */
.quad 0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01 */
.quad 0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06 */
.quad 0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00 */
.quad 0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03 */
.quad 0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01 */
.quad 0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06 */
.quad 0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00 */
.quad 0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03 */
.quad 0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01 */
.quad 0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06 */
.quad 0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00 */
.quad 0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03 */
.quad 0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01 */
.quad 0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06 */
.quad 0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00 */
.quad 0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03 */
.quad 0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01 */
.quad 0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06 */
.quad 0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00 */
.quad 0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03 */
.quad 0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01 */
.quad 0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06 */
.quad 0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00 */
.quad 0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03 */
.quad 0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01 */
.quad 0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06 */
.quad 0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00 */
.quad 0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03 */
.quad 0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01 */
.quad 0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06 */
.quad 0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00 */
.quad 0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03 */
.quad 0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01 */
.quad 0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05 */
.quad 0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00 */
.quad 0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03 */
.quad 0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01 */
.quad 0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05 */
.quad 0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00 */
.quad 0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03 */
.quad 0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01 */
.quad 0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05 */
.quad 0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00 */
.quad 0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03 */
.quad 0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01 */
.quad 0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05 */
.quad 0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00 */
.quad 0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03 */
.quad 0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01 */
.quad 0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05 */
.quad 0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00 */
.quad 0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03 */
.quad 0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01 */
.quad 0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05 */
.quad 0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00 */
.quad 0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03 */
.quad 0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01 */
.quad 0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05 */
.quad 0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00 */
.quad 0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03 */
.quad 0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01 */
.quad 0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05 */
.quad 0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00 */
.quad 0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02 */
.quad 0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01 */
.quad 0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05 */
.quad 0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00 */
.quad 0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02 */
.quad 0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01 */
.quad 0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05 */
.quad 0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00 */
.quad 0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02 */
.quad 0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01 */
.quad 0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04 */
.quad 0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00 */
.quad 0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02 */
.quad 0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01 */
.quad 0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04 */
.quad 0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00 */
.quad 0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02 */
.quad 0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01 */
.quad 0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04 */
.quad 0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00 */
.quad 0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02 */
.quad 0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01 */
.quad 0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04 */
.quad 0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00 */
.quad 0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02 */
.quad 0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01 */
.quad 0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04 */
.quad 0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00 */
.quad 0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02 */
.quad 0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01 */
.quad 0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04 */
.quad 0x3FF012577608921B /* A01 = +1.004477940624503018441e+00 */
.quad 0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02 */
.quad 0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01 */
.quad 0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04 */
.quad 0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00 */
.quad 0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02 */
.quad 0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01 */
.quad 0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04 */
.quad 0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00 */
.quad 0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02 */
.quad 0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01 */
.quad 0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04 */
.quad 0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00 */
.quad 0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02 */
.quad 0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01 */
.quad 0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04 */
.quad 0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00 */
.quad 0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02 */
.quad 0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01 */
.quad 0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04 */
.quad 0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00 */
.quad 0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02 */
.quad 0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01 */
.quad 0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04 */
.quad 0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00 */
.quad 0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02 */
.quad 0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01 */
.quad 0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03 */
.quad 0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00 */
.quad 0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02 */
.quad 0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01 */
.quad 0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03 */
.quad 0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00 */
.quad 0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02 */
.quad 0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01 */
.quad 0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03 */
.quad 0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00 */
.quad 0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02 */
.quad 0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01 */
.quad 0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03 */
.quad 0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00 */
.quad 0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02 */
.quad 0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01 */
.quad 0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03 */
.quad 0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00 */
.quad 0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01 */
.quad 0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01 */
.quad 0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03 */
.quad 0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00 */
.quad 0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01 */
.quad 0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01 */
.quad 0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03 */
.quad 0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00 */
.quad 0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01 */
.quad 0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01 */
.quad 0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03 */
.quad 0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00 */
.quad 0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01 */
.quad 0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01 */
.quad 0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03 */
.quad 0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00 */
.quad 0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01 */
.quad 0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01 */
.quad 0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03 */
.quad 0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00 */
.quad 0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01 */
.quad 0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01 */
.quad 0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03 */
.quad 0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00 */
.quad 0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01 */
.quad 0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01 */
.quad 0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03 */
.quad 0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00 */
.quad 0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01 */
.quad 0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01 */
.quad 0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03 */
.quad 0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00 */
.quad 0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01 */
.quad 0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02 */
.quad 0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02 */
.quad 0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00 */
.quad 0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01 */
.quad 0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02 */
.quad 0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02 */
.quad 0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00 */
.quad 0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01 */
.quad 0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02 */
.quad 0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02 */
.quad 0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00 */
.quad 0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01 */
.quad 0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02 */
.quad 0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02 */
.quad 0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00 */
.quad 0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01 */
.quad 0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02 */
.quad 0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02 */
.quad 0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00 */
.quad 0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01 */
.quad 0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03 */
.quad 0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02 */
.quad 0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00 */
.quad 0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01 */
.quad 0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03 */
.quad 0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02 */
.quad 0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00 */
.quad 0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01 */
.quad 0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02 */
.quad 0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02 */
.quad 0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00 */
.quad 0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01 */
.quad 0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02 */
.quad 0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02 */
.quad 0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00 */
.quad 0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01 */
.quad 0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02 */
.quad 0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02 */
.quad 0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00 */
.quad 0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01 */
.quad 0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02 */
.quad 0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02 */
.quad 0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00 */
.quad 0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01 */
.quad 0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02 */
.quad 0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02 */
.quad 0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00 */
.quad 0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01 */
.quad 0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02 */
.quad 0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02 */
.quad 0x3FF528650340490B /* A01 = +1.322361958217302513319e+00 */
.quad 0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01 */
.quad 0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02 */
.quad 0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02 */
.quad 0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00 */
.quad 0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01 */
.quad 0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02 */
.quad 0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02 */
.quad 0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00 */
.quad 0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01 */
.quad 0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02 */
.quad 0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02 */
.quad 0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00 */
.quad 0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01 */
.quad 0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01 */
.quad 0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02 */
.quad 0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00 */
.quad 0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01 */
.quad 0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01 */
.quad 0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02 */
.quad 0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00 */
.quad 0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01 */
.quad 0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01 */
.quad 0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02 */
.quad 0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00 */
.quad 0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01 */
.quad 0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01 */
.quad 0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02 */
.quad 0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00 */
.quad 0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01 */
.quad 0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01 */
.quad 0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02 */
.quad 0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00 */
.quad 0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01 */
.quad 0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01 */
.quad 0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02 */
.quad 0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00 */
.quad 0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01 */
.quad 0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01 */
.quad 0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02 */
.quad 0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00 */
.quad 0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01 */
.quad 0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02 */
.quad 0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02 */
.quad 0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00 */
.quad 0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01 */
.quad 0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02 */
.quad 0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02 */
.quad 0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00 */
.quad 0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01 */
.quad 0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02 */
.quad 0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03 */
.quad 0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00 */
.quad 0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01 */
.quad 0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02 */
.quad 0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02 */
.quad 0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00 */
.quad 0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01 */
.quad 0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02 */
.quad 0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02 */
.quad 0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00 */
.quad 0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01 */
.quad 0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02 */
.quad 0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02 */
.quad 0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00 */
.quad 0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01 */
.quad 0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02 */
.quad 0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01 */
.quad 0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01 */
.quad 0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01 */
.quad 0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02 */
.quad 0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01 */
.quad 0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01 */
.quad 0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01 */
.quad 0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02 */
.quad 0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01 */
.quad 0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01 */
.quad 0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01 */
.quad 0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02 */
.quad 0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01 */
.quad 0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01 */
.quad 0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01 */
.quad 0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02 */
.quad 0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01 */
.quad 0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01 */
.quad 0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01 */
.quad 0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02 */
.quad 0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01 */
.quad 0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01 */
.quad 0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01 */
.quad 0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02 */
.quad 0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01 */
.quad 0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01 */
.quad 0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01 */
.quad 0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02 */
.quad 0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01 */
.quad 0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01 */
.quad 0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01 */
.quad 0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02 */
.quad 0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01 */
.quad 0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01 */
.quad 0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01 */
.quad 0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02 */
.quad 0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01 */
.quad 0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01 */
.quad 0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02 */
.quad 0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03 */
.quad 0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01 */
.quad 0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01 */
.quad 0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02 */
.quad 0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03 */
.quad 0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01 */
.quad 0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01 */
.quad 0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02 */
.quad 0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03 */
.quad 0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01 */
.quad 0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01 */
.quad 0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02 */
.quad 0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03 */
.quad 0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01 */
.quad 0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01 */
.quad 0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02 */
.quad 0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03 */
.quad 0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01 */
.quad 0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01 */
.quad 0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02 */
.quad 0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03 */
.quad 0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01 */
.quad 0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01 */
.quad 0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02 */
.quad 0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03 */
.quad 0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01 */
.quad 0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02 */
.quad 0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02 */
.quad 0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03 */
.quad 0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01 */
.quad 0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02 */
.quad 0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02 */
.quad 0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03 */
.quad 0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01 */
.quad 0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02 */
.quad 0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02 */
.quad 0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03 */
.quad 0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01 */
.quad 0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02 */
.quad 0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03 */
.quad 0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04 */
.quad 0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01 */
.quad 0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02 */
.quad 0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03 */
.quad 0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04 */
.quad 0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01 */
.quad 0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02 */
.quad 0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03 */
.quad 0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04 */
.quad 0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01 */
.quad 0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02 */
.quad 0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03 */
.quad 0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04 */
.quad 0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01 */
.quad 0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03 */
.quad 0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03 */
.quad 0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05 */
.quad 0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01 */
.quad 0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03 */
.quad 0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03 */
.quad 0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05 */
.quad 0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01 */
.quad 0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03 */
.quad 0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04 */
.quad 0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05 */
.quad 0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01 */
.quad 0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03 */
.quad 0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04 */
.quad 0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05 */
.quad 0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01 */
.quad 0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03 */
.quad 0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04 */
.quad 0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05 */
.quad 0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01 */
.quad 0x3F5229111365657E /* A01 = +1.108423877289460134782e-03 */
.quad 0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04 */
.quad 0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06 */
.quad 0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01 */
.quad 0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04 */
.quad 0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04 */
.quad 0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06 */
.quad 0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01 */
.quad 0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04 */
.quad 0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05 */
.quad 0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06 */
.quad 0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01 */
.quad 0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04 */
.quad 0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05 */
.quad 0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06 */
.quad 0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01 */
.quad 0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04 */
.quad 0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05 */
.quad 0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06 */
.quad 0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01 */
.quad 0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04 */
.quad 0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05 */
.quad 0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07 */
.quad 0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01 */
.quad 0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05 */
.quad 0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06 */
.quad 0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07 */
.quad 0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01 */
.quad 0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05 */
.quad 0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06 */
.quad 0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07 */
.quad 0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01 */
.quad 0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05 */
.quad 0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06 */
.quad 0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08 */
.quad 0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01 */
.quad 0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06 */
.quad 0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07 */
.quad 0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08 */
.quad 0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01 */
.quad 0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06 */
.quad 0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07 */
.quad 0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09 */
/* Last (134th) coefficient group of _dbP: A00 = 1.0
   (0x3ff0000000000000 as an IEEE double), A01 = A02 = A03 = 0, so the
   polynomial is the constant 1.  NOTE(review): looks like the saturated
   range where the magnitude of the result is 1 -- confirm against the
   kernel code, which is outside this excerpt.  */
.quad 0x3ff0000000000000
.quad 0x0000000000000000
.quad 0x0000000000000000
.quad 0x0000000000000000
.align 16
/* Bit 31 only: the binary32 sign bit, replicated in four lanes.  */
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
.align 16
/* Bits 0-30: every bit except the sign bit.  */
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
.align 16
/* Bits 19-30: the 8 exponent bits plus the top 4 mantissa bits.  */
.long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
.align 16
/* Bits 24-30: the top 7 exponent bits.  */
.long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
.align 16
/* 0x3cf80000 lies one 0x00080000 step below 0x3d000000, the binary32
   encoding of 2^-5 = 0.03125.  Presumably subtracted from the masked
   input bits to form the table index -- TODO confirm in the kernel.  */
.long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
.align 16
/* 0x04280000 = 133 * 0x00080000, which matches the 134 coefficient
   groups (indices 0..133) when one index step is bit 19.  Presumably the
   upper clamp for the index -- TODO confirm in the kernel.  */
.long 0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
.align 16
/* Mark the symbol as a data object and record its size, measured from
   the label to this point (the alignment padding above is included).  */
.type __svml_stanh_data_internal,@object
.size __svml_stanh_data_internal,.-__svml_stanh_data_internal

View File

@ -0,0 +1,20 @@
/* SSE version of vectorized tanhf, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename _ZGVdN8v_tanhf before pulling in ../svml_s_tanhf8_core.S, so that
   whatever that file defines under the name _ZGVdN8v_tanhf is emitted here
   as _ZGVdN8v_tanhf_sse_wrapper instead.
   NOTE(review): presumably this is the non-AVX2 variant chosen by the
   ifunc selector for _ZGVdN8v_tanhf -- confirm against
   ifunc-mathvec-avx2.h.  */
#define _ZGVdN8v_tanhf _ZGVdN8v_tanhf_sse_wrapper
#include "../svml_s_tanhf8_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized tanhf, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Name of the public entry point that this file defines as an indirect
   function.  */
#define SYMBOL_NAME _ZGVdN8v_tanhf
/* NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined in this
   file; presumably they are provided by the header below -- confirm.  */
#include "ifunc-mathvec-avx2.h"
/* Define SYMBOL_NAME via libc_ifunc_redirected, handing it the value of
   IFUNC_SELECTOR () as the implementation to bind to.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Shared builds only: introduce __GI__ZGVdN8v_tanhf with hidden
   visibility through __hidden_ver1.
   NOTE(review): presumably this gives library-internal callers a local
   binding for the symbol -- confirm against the macro definition.  */
__hidden_ver1 (_ZGVdN8v_tanhf, __GI__ZGVdN8v_tanhf,
__redirect__ZGVdN8v_tanhf)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,844 @@
/* Function tanhf vectorized with AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/*
* ALGORITHM DESCRIPTION:
*
* NOTE: Since the hyperbolic tangent function is odd
* (tanh(x) = -tanh(-x)), below algorithm deals with the absolute
* value of the argument |x|: tanh(x) = sign(x) * tanh(|x|)
*
* We use a table lookup method to compute tanh(|x|).
* The basic idea is to split the input range into a number of subintervals
* and to approximate tanh(.) with a polynomial on each of them.
*
* IEEE SPECIAL CONDITIONS:
* x = [+,-]0, r = [+,-]0
* x = +Inf, r = +1
* x = -Inf, r = -1
* x = QNaN, r = QNaN
* x = SNaN, r = QNaN
*
*
* ALGORITHM DETAILS
* We handle special values in a callout function, aside from main path
* computations. "Special" for this algorithm are:
* INF, NAN, |x| > HUGE_THRESHOLD
*
*
* Main path computations are organized as follows:
* We split the interval [0, SATURATION_THRESHOLD)
* into a number of subintervals. On each subinterval we approximate tanh(.)
* with a minimax polynomial of pre-defined degree. Polynomial coefficients
* are computed beforehand and stored in a table. We also use
*
* y := |x| + B,
*
* here B depends on subinterval and is used to make argument
* closer to zero.
* We also add large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD],
* where 1.0 + 0.0*y + 0.0*y^2 ... coefficients are stored - just to
* preserve main path computation logic but return 1.0 for all arguments.
*
* Hence reconstruction looks as follows:
* we extract proper polynomial and range reduction coefficients
* (Pj and B), corresponding to subinterval, to which |x| belongs,
* and return
*
* r := sign(x) * (P0 + P1 * y + ... + Pn * y^n)
*
* NOTE: we use multiprecision technique to multiply and sum the first
* K terms of the polynomial. So Pj, j = 0..K are stored in
* table each as a pair of target precision numbers (Pj and PLj) to
* achieve wider than target precision.
*
*
*/
/* Byte offsets of the fields of the data table __svml_stanh_data_internal.
 * They are added to the table symbol in RIP-relative operands, so they
 * must stay plain integer literals.
 *
 * The values follow the layout given by the typedef next to the table:
 * _dbP holds 134*4 coefficients of 8 bytes each (4288 bytes), and every
 * mask that follows occupies 32 bytes (eight 32-bit words).
 */
/* Polynomial coefficient table, four 8-byte coefficients per entry.  */
#define _dbP 0
/* 0 + 134*4*8 = 4288.  */
#define _sSignMask 4288
/* Each following field starts 32 bytes after the previous one.  */
#define _sAbsMask 4320
#define _iExpMantMask 4352
#define _iExpMask 4384
#define _iMinIdxOfsMask 4416
#define _iMaxIdxMask 4448
#include <sysdep.h>
.text
.section .text.avx2,"ax",@progbits
/*
 * _ZGVdN8v_tanhf_avx2: tanhf applied to eight packed floats.
 *
 * In:   %ymm0 = eight single-precision arguments x[0..7].
 * Out:  %ymm0 = eight single-precision results.
 *
 * Main path, per lane:
 *   1. Mask x with _iExpMantMask, subtract _iMinIdxOfsMask, clamp the
 *      result to [0, _iMaxIdxMask] and shift right by 14.  The result is
 *      used unscaled as a byte offset into the _dbP coefficient table.
 *   2. Fetch the four doubles A00..A03 stored at that offset.
 *   3. Evaluate ((A03*|x| + A02)*|x| + A01)*|x| + A00 in double precision,
 *      convert back to single precision and OR in (x & _sSignMask).
 *
 * Lanes where (x & _iExpMantMask) > _iExpMask (signed compare) are
 * recomputed one at a time by calling scalar tanhf.
 *
 * Long-lived registers on the main path:
 *   %ymm12 = original x        %ymm10 = x & _sAbsMask
 *   %ymm11 = x & _sSignMask    %r11d  = bit mask of lanes for scalar tanhf
 *   %r10   = address of the table + _dbP + 16
 *
 * Stack frame (%rsp is a multiple of 32 after the prologue):
 *     0(%rsp)   saved %r14 (special path only)
 *     8(%rsp)   saved %r13 (special path only)
 *    32(%rsp)   copy of the eight inputs (special path only)
 *    64(%rsp)   copy of the eight results (special path only)
 *   120(%rsp)   saved %r12
 */
ENTRY(_ZGVdN8v_tanhf_avx2)
/* Build an %rbp frame and align %rsp down to 32 bytes.  The push of %r12
 * (8 bytes) plus the 120 bytes reserved below total 128, so %rsp is again
 * a multiple of 32 afterwards.
 */
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-32, %rsp
pushq %r12
subq $120, %rsp
/* %r10 points 16 bytes into the coefficient table, so that for a byte
 * offset idx, -16(idx,%r10) addresses the first two doubles of an entry
 * and (idx,%r10) the last two.
 */
lea _dbP+16+__svml_stanh_data_internal(%rip), %r10
/* Keep the unmodified arguments in %ymm12; %ymm0 is reused as scratch. */
vmovaps %ymm0, %ymm12
/* Here huge arguments, INF and NaNs are filtered out to callout.
 * %ymm14 = x & _iExpMantMask feeds both the table index below and the
 * special-lane test further down.
 */
vpand _iExpMantMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm14
/*
* small table specific variables *
* Constant loading
*/
vmovups _iMaxIdxMask+__svml_stanh_data_internal(%rip), %ymm8
vpsubd _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm9
/* if VMIN, VMAX is defined for I type */
/* Clamp %ymm9 to [0, _iMaxIdxMask] with signed compares: lanes that are
 * not greater than zero become 0 (%ymm7), lanes greater than the maximum
 * are replaced by the maximum (%ymm3).
 */
vxorps %ymm15, %ymm15, %ymm15
vpcmpgtd %ymm15, %ymm9, %ymm0
vpand %ymm0, %ymm9, %ymm7
vpcmpgtd %ymm8, %ymm9, %ymm6
vblendvps %ymm6, %ymm8, %ymm7, %ymm3
/* %ymm1 = per-lane byte offset into the coefficient table. */
vpsrld $14, %ymm3, %ymm1
/* %r11d gets one bit per lane where (x & _iExpMantMask) > _iExpMask;
 * those lanes are handled by the scalar fallback.
 */
vpcmpgtd _iExpMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm13
vmovmskps %ymm13, %r11d
/* %ymm10 = x & _sAbsMask (polynomial argument), %ymm11 = x & _sSignMask
 * (ORed back into the result at the end).
 */
vandps _sAbsMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm10
vandps _sSignMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm11
/* Move the eight table offsets to general-purpose registers and
 * sign-extend them to 64 bits for use in addresses:
 *   lane 0 -> %r9    lane 1 -> %r8    lane 2 -> %rdi   lane 3 -> %rsi
 *   lane 4 -> %rcx   lane 5 -> %rdx   lane 6 -> %rax   lane 7 -> %r12
 */
vextractf128 $1, %ymm1, %xmm2
vmovd %xmm1, %r9d
vmovd %xmm2, %ecx
vpextrd $1, %xmm2, %edx
vpextrd $1, %xmm1, %r8d
movslq %r9d, %r9
movslq %edx, %rdx
movslq %r8d, %r8
vpextrd $2, %xmm1, %edi
movslq %ecx, %rcx
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
/* First write to %r12 (callee-saved, pushed in the prologue). */
vpextrd $3, %xmm2, %r12d
vpextrd $3, %xmm1, %esi
vpextrd $2, %xmm2, %eax
movslq %edi, %rdi
movslq %r12d, %r12
movslq %esi, %rsi
movslq %eax, %rax
/* Gather the coefficients.  Each 16-byte load fetches one (A00,A01) or
 * (A02,A03) pair; vinsertf128 combines the pairs of two lanes into one
 * ymm register, and vunpcklpd/vunpckhpd then transpose them so that each
 * ymm register holds a single coefficient for four consecutive lanes.
 * The loads are interleaved with the arithmetic below.
 */
vmovupd -16(%r9,%r10), %xmm5
vmovupd -16(%rdx,%r10), %xmm14
vmovupd -16(%rcx,%r10), %xmm13
vmovupd (%r9,%r10), %xmm1
vmovupd (%r8,%r10), %xmm2
vmovupd -16(%r8,%r10), %xmm4
vinsertf128 $1, -16(%rdi,%r10), %ymm5, %ymm15
vinsertf128 $1, -16(%r12,%r10), %ymm14, %ymm3
vinsertf128 $1, -16(%rax,%r10), %ymm13, %ymm6
vinsertf128 $1, (%rdi,%r10), %ymm1, %ymm5
vinsertf128 $1, (%rsi,%r10), %ymm2, %ymm14
/* Lanes 4-7: %ymm8 = A00, %ymm6 = A01. */
vunpcklpd %ymm3, %ymm6, %ymm8
vunpckhpd %ymm3, %ymm6, %ymm6
/* Lanes 0-3: %ymm3 = A02, %ymm2 = A03. */
vunpcklpd %ymm14, %ymm5, %ymm3
vunpckhpd %ymm14, %ymm5, %ymm2
vmovupd (%rcx,%r10), %xmm13
/* %ymm5 = (x & _sAbsMask) of lanes 0-3 widened to double; %xmm10 then
 * receives lanes 4-7 for the later conversion into %ymm4.
 */
vcvtps2pd %xmm10, %ymm5
vextractf128 $1, %ymm10, %xmm10
/* Lanes 0-3, Horner step 1: %ymm2 = A03*y + A02. */
vfmadd213pd %ymm3, %ymm5, %ymm2
vinsertf128 $1, -16(%rsi,%r10), %ymm4, %ymm0
vmovupd (%rdx,%r10), %xmm4
/* Lanes 0-3: %ymm9 = A00, %ymm7 = A01. */
vunpcklpd %ymm0, %ymm15, %ymm9
vunpckhpd %ymm0, %ymm15, %ymm7
/* Lanes 0-3, Horner steps 2 and 3: %ymm2 = (%ymm2*y + A01)*y + A00. */
vfmadd213pd %ymm7, %ymm5, %ymm2
vfmadd213pd %ymm9, %ymm5, %ymm2
vinsertf128 $1, (%r12,%r10), %ymm4, %ymm0
vcvtps2pd %xmm10, %ymm4
vinsertf128 $1, (%rax,%r10), %ymm13, %ymm15
/* Lanes 4-7: %ymm1 = A02, %ymm0 = A03. */
vunpcklpd %ymm0, %ymm15, %ymm1
vunpckhpd %ymm0, %ymm15, %ymm0
/* Lanes 4-7, Horner step 1: %ymm0 = A03*y + A02. */
vfmadd213pd %ymm1, %ymm4, %ymm0
/* Lanes 0-3 are complete: narrow them to single precision in %xmm1. */
vcvtpd2ps %ymm2, %xmm1
/* Lanes 4-7, Horner steps 2 and 3, then narrow to single precision. */
vfmadd213pd %ymm6, %ymm4, %ymm0
vfmadd213pd %ymm8, %ymm4, %ymm0
vcvtpd2ps %ymm0, %xmm0
/* Reassemble all eight results and OR in the bits kept in %ymm11. */
vinsertf128 $1, %xmm0, %ymm1, %ymm2
vorps %ymm11, %ymm2, %ymm0
/* Any bit set in %r11d means at least one lane needs scalar tanhf. */
testl %r11d, %r11d
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r13 r14 r15 r11d ymm0 ymm12
/* Restore registers
* and exit the function
*/
L(EXIT):
addq $120, %rsp
cfi_restore(12)
popq %r12
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
/* The code below is only reached by the jump above, with the frame still
 * set up, so the unwind state of the function body is re-established.
 */
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
/* Spill the inputs to 32(%rsp) and the main-path results to 64(%rsp) so
 * that single lanes can be read and patched through memory.
 */
vmovups %ymm12, 32(%rsp)
vmovups %ymm0, 64(%rsp)
# LOE rbx r13 r14 r15 r11d ymm0
/* %r12d = index of the lane being examined, starting at 0. */
xorl %r12d, %r12d
# LOE rbx r13 r14 r15 r11d r12d
/* Clear the upper halves of the ymm registers before calling scalar code. */
vzeroupper
/* %r13 and %r14 are callee-saved; save them before they are reused for
 * the lane mask and the lane index.
 */
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
/* %r13d = mask of special lanes; unlike %r11d it survives the calls. */
movl %r11d, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
/* CF = bit number %r12d of the lane mask in %r13d. */
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
/* Advance to the next lane; all eight lanes are visited. */
incl %r12d
cmpl $8, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
/* All lanes done: restore %r13/%r14 and reload the patched results. */
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovups 64(%rsp), %ymm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
# LOE rbx r13 r14 r15 ymm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
/* %r14 = zero-extended lane index, used to address the spilled vectors. */
movl %r12d, %r14d
/* Load the original argument of this lane from the input copy. */
movss 32(%rsp,%r14,4), %xmm0
call tanhf@PLT
# LOE rbx r14 r15 r12d r13d xmm0
/* Overwrite this lane of the spilled result with the scalar result. */
movss %xmm0, 64(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVdN8v_tanhf_avx2)
.section .rodata, "a"
.align 32
/* C-style description of the layout of the table that follows.  It is
 * only seen by the assembler if __svml_stanh_data_internal_typedef is
 * defined.
 * NOTE(review): presumably that macro is never defined in normal builds
 * and this block is layout documentation only -- confirm.
 */
#ifdef __svml_stanh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
/* 134*4 coefficients, each stored as two 32-bit words (8 bytes), i.e.
 * 4288 bytes; this matches the _sSignMask offset of 4288.
 */
__declspec(align(32)) VUINT32 _dbP[(134*4)][2];
/* Each mask below is eight 32-bit words (32 bytes), matching the 32-byte
 * spacing of the _sSignMask ... _iMaxIdxMask offsets.
 */
__declspec(align(32)) VUINT32 _sSignMask[8][1];
__declspec(align(32)) VUINT32 _sAbsMask[8][1];
__declspec(align(32)) VUINT32 _iExpMantMask[8][1];
__declspec(align(32)) VUINT32 _iExpMask[8][1];
__declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
__declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
} __svml_stanh_data_internal;
#endif
__svml_stanh_data_internal:
/* Pol_000: err=7.93e-09, x in [0.0000000; 0.0312500]. */
.quad 0x0000000000000000 /* A00 = +0.000000000000000000000e-01 */
.quad 0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00 */
.quad 0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06 */
.quad 0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01 */
.quad 0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08 */
.quad 0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00 */
.quad 0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05 */
.quad 0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01 */
.quad 0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08 */
.quad 0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00 */
.quad 0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04 */
.quad 0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01 */
.quad 0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08 */
.quad 0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00 */
.quad 0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04 */
.quad 0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01 */
.quad 0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08 */
.quad 0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00 */
.quad 0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04 */
.quad 0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01 */
.quad 0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08 */
.quad 0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00 */
.quad 0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04 */
.quad 0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01 */
.quad 0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08 */
.quad 0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00 */
.quad 0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04 */
.quad 0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01 */
.quad 0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08 */
.quad 0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00 */
.quad 0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04 */
.quad 0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01 */
.quad 0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07 */
.quad 0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00 */
.quad 0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04 */
.quad 0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01 */
.quad 0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07 */
.quad 0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00 */
.quad 0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04 */
.quad 0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01 */
.quad 0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07 */
.quad 0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00 */
.quad 0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04 */
.quad 0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01 */
.quad 0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07 */
.quad 0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00 */
.quad 0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04 */
.quad 0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01 */
.quad 0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07 */
.quad 0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00 */
.quad 0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04 */
.quad 0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01 */
.quad 0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07 */
.quad 0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00 */
.quad 0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04 */
.quad 0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01 */
.quad 0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07 */
.quad 0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00 */
.quad 0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04 */
.quad 0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01 */
.quad 0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07 */
.quad 0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00 */
.quad 0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04 */
.quad 0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01 */
.quad 0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07 */
.quad 0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00 */
.quad 0xBF44376634149405 /* A02 = -6.169556656102642569831e-04 */
.quad 0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01 */
.quad 0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07 */
.quad 0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00 */
.quad 0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04 */
.quad 0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01 */
.quad 0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07 */
.quad 0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00 */
.quad 0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04 */
.quad 0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01 */
.quad 0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06 */
.quad 0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00 */
.quad 0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04 */
.quad 0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01 */
.quad 0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06 */
.quad 0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00 */
.quad 0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03 */
.quad 0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01 */
.quad 0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06 */
.quad 0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00 */
.quad 0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03 */
.quad 0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01 */
.quad 0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06 */
.quad 0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00 */
.quad 0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03 */
.quad 0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01 */
.quad 0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06 */
.quad 0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00 */
.quad 0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03 */
.quad 0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01 */
.quad 0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06 */
.quad 0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00 */
.quad 0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03 */
.quad 0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01 */
.quad 0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06 */
.quad 0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00 */
.quad 0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03 */
.quad 0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01 */
.quad 0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06 */
.quad 0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00 */
.quad 0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03 */
.quad 0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01 */
.quad 0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06 */
.quad 0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00 */
.quad 0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03 */
.quad 0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01 */
.quad 0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06 */
.quad 0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00 */
.quad 0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03 */
.quad 0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01 */
.quad 0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06 */
.quad 0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00 */
.quad 0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03 */
.quad 0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01 */
.quad 0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05 */
.quad 0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00 */
.quad 0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03 */
.quad 0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01 */
.quad 0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05 */
.quad 0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00 */
.quad 0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03 */
.quad 0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01 */
.quad 0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05 */
.quad 0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00 */
.quad 0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03 */
.quad 0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01 */
.quad 0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05 */
.quad 0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00 */
.quad 0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03 */
.quad 0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01 */
.quad 0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05 */
.quad 0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00 */
.quad 0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03 */
.quad 0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01 */
.quad 0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05 */
.quad 0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00 */
.quad 0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03 */
.quad 0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01 */
.quad 0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05 */
.quad 0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00 */
.quad 0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03 */
.quad 0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01 */
.quad 0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05 */
.quad 0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00 */
.quad 0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02 */
.quad 0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01 */
.quad 0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05 */
.quad 0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00 */
.quad 0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02 */
.quad 0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01 */
.quad 0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05 */
.quad 0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00 */
.quad 0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02 */
.quad 0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01 */
.quad 0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04 */
.quad 0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00 */
.quad 0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02 */
.quad 0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01 */
.quad 0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04 */
.quad 0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00 */
.quad 0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02 */
.quad 0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01 */
.quad 0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04 */
.quad 0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00 */
.quad 0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02 */
.quad 0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01 */
.quad 0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04 */
.quad 0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00 */
.quad 0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02 */
.quad 0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01 */
.quad 0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04 */
.quad 0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00 */
.quad 0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02 */
.quad 0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01 */
.quad 0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04 */
.quad 0x3FF012577608921B /* A01 = +1.004477940624503018441e+00 */
.quad 0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02 */
.quad 0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01 */
.quad 0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04 */
.quad 0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00 */
.quad 0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02 */
.quad 0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01 */
.quad 0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04 */
.quad 0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00 */
.quad 0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02 */
.quad 0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01 */
.quad 0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04 */
.quad 0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00 */
.quad 0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02 */
.quad 0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01 */
.quad 0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04 */
.quad 0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00 */
.quad 0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02 */
.quad 0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01 */
.quad 0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04 */
.quad 0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00 */
.quad 0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02 */
.quad 0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01 */
.quad 0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04 */
.quad 0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00 */
.quad 0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02 */
.quad 0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01 */
.quad 0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03 */
.quad 0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00 */
.quad 0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02 */
.quad 0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01 */
.quad 0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03 */
.quad 0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00 */
.quad 0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02 */
.quad 0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01 */
.quad 0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03 */
.quad 0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00 */
.quad 0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02 */
.quad 0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01 */
.quad 0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03 */
.quad 0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00 */
.quad 0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02 */
.quad 0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01 */
.quad 0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03 */
.quad 0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00 */
.quad 0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01 */
.quad 0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01 */
.quad 0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03 */
.quad 0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00 */
.quad 0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01 */
.quad 0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01 */
.quad 0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03 */
.quad 0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00 */
.quad 0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01 */
.quad 0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01 */
.quad 0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03 */
.quad 0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00 */
.quad 0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01 */
.quad 0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01 */
.quad 0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03 */
.quad 0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00 */
.quad 0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01 */
.quad 0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01 */
.quad 0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03 */
.quad 0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00 */
.quad 0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01 */
.quad 0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01 */
.quad 0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03 */
.quad 0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00 */
.quad 0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01 */
.quad 0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01 */
.quad 0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03 */
.quad 0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00 */
.quad 0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01 */
.quad 0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01 */
.quad 0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03 */
.quad 0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00 */
.quad 0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01 */
.quad 0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02 */
.quad 0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02 */
.quad 0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00 */
.quad 0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01 */
.quad 0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02 */
.quad 0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02 */
.quad 0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00 */
.quad 0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01 */
.quad 0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02 */
.quad 0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02 */
.quad 0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00 */
.quad 0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01 */
.quad 0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02 */
.quad 0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02 */
.quad 0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00 */
.quad 0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01 */
.quad 0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02 */
.quad 0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02 */
.quad 0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00 */
.quad 0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01 */
.quad 0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03 */
.quad 0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02 */
.quad 0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00 */
.quad 0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01 */
.quad 0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03 */
.quad 0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02 */
.quad 0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00 */
.quad 0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01 */
.quad 0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02 */
.quad 0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02 */
.quad 0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00 */
.quad 0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01 */
.quad 0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02 */
.quad 0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02 */
.quad 0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00 */
.quad 0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01 */
.quad 0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02 */
.quad 0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02 */
.quad 0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00 */
.quad 0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01 */
.quad 0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02 */
.quad 0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02 */
.quad 0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00 */
.quad 0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01 */
.quad 0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02 */
.quad 0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02 */
.quad 0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00 */
.quad 0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01 */
.quad 0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02 */
.quad 0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02 */
.quad 0x3FF528650340490B /* A01 = +1.322361958217302513319e+00 */
.quad 0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01 */
.quad 0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02 */
.quad 0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02 */
.quad 0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00 */
.quad 0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01 */
.quad 0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02 */
.quad 0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02 */
.quad 0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00 */
.quad 0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01 */
.quad 0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02 */
.quad 0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02 */
.quad 0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00 */
.quad 0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01 */
.quad 0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01 */
.quad 0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02 */
.quad 0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00 */
.quad 0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01 */
.quad 0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01 */
.quad 0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02 */
.quad 0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00 */
.quad 0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01 */
.quad 0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01 */
.quad 0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02 */
.quad 0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00 */
.quad 0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01 */
.quad 0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01 */
.quad 0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02 */
.quad 0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00 */
.quad 0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01 */
.quad 0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01 */
.quad 0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02 */
.quad 0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00 */
.quad 0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01 */
.quad 0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01 */
.quad 0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02 */
.quad 0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00 */
.quad 0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01 */
.quad 0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01 */
.quad 0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02 */
.quad 0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00 */
.quad 0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01 */
.quad 0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02 */
.quad 0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02 */
.quad 0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00 */
.quad 0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01 */
.quad 0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02 */
.quad 0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02 */
.quad 0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00 */
.quad 0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01 */
.quad 0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02 */
.quad 0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03 */
.quad 0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00 */
.quad 0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01 */
.quad 0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02 */
.quad 0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02 */
.quad 0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00 */
.quad 0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01 */
.quad 0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02 */
.quad 0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02 */
.quad 0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00 */
.quad 0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01 */
.quad 0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02 */
.quad 0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02 */
.quad 0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00 */
.quad 0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01 */
.quad 0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02 */
.quad 0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01 */
.quad 0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01 */
.quad 0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01 */
.quad 0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02 */
.quad 0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01 */
.quad 0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01 */
.quad 0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01 */
.quad 0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02 */
.quad 0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01 */
.quad 0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01 */
.quad 0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01 */
.quad 0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02 */
.quad 0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01 */
.quad 0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01 */
.quad 0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01 */
.quad 0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02 */
.quad 0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01 */
.quad 0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01 */
.quad 0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01 */
.quad 0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02 */
.quad 0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01 */
.quad 0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01 */
.quad 0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01 */
.quad 0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02 */
.quad 0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01 */
.quad 0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01 */
.quad 0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01 */
.quad 0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02 */
.quad 0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01 */
.quad 0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01 */
.quad 0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01 */
.quad 0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02 */
.quad 0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01 */
.quad 0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01 */
.quad 0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01 */
.quad 0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02 */
.quad 0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01 */
.quad 0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01 */
.quad 0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02 */
.quad 0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03 */
.quad 0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01 */
.quad 0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01 */
.quad 0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02 */
.quad 0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03 */
.quad 0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01 */
.quad 0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01 */
.quad 0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02 */
.quad 0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03 */
.quad 0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01 */
.quad 0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01 */
.quad 0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02 */
.quad 0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03 */
.quad 0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01 */
.quad 0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01 */
.quad 0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02 */
.quad 0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03 */
.quad 0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01 */
.quad 0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01 */
.quad 0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02 */
.quad 0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03 */
.quad 0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01 */
.quad 0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01 */
.quad 0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02 */
.quad 0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03 */
.quad 0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01 */
.quad 0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02 */
.quad 0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02 */
.quad 0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03 */
.quad 0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01 */
.quad 0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02 */
.quad 0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02 */
.quad 0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03 */
.quad 0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01 */
.quad 0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02 */
.quad 0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02 */
.quad 0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03 */
.quad 0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01 */
.quad 0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02 */
.quad 0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03 */
.quad 0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04 */
.quad 0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01 */
.quad 0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02 */
.quad 0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03 */
.quad 0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04 */
.quad 0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01 */
.quad 0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02 */
.quad 0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03 */
.quad 0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04 */
.quad 0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01 */
.quad 0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02 */
.quad 0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03 */
.quad 0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04 */
.quad 0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01 */
.quad 0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03 */
.quad 0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03 */
.quad 0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05 */
.quad 0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01 */
.quad 0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03 */
.quad 0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03 */
.quad 0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05 */
.quad 0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01 */
.quad 0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03 */
.quad 0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04 */
.quad 0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05 */
.quad 0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01 */
.quad 0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03 */
.quad 0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04 */
.quad 0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05 */
.quad 0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01 */
.quad 0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03 */
.quad 0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04 */
.quad 0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05 */
.quad 0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01 */
.quad 0x3F5229111365657E /* A01 = +1.108423877289460134782e-03 */
.quad 0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04 */
.quad 0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06 */
.quad 0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01 */
.quad 0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04 */
.quad 0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04 */
.quad 0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06 */
.quad 0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01 */
.quad 0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04 */
.quad 0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05 */
.quad 0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06 */
.quad 0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01 */
.quad 0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04 */
.quad 0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05 */
.quad 0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06 */
.quad 0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01 */
.quad 0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04 */
.quad 0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05 */
.quad 0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06 */
.quad 0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01 */
.quad 0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04 */
.quad 0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05 */
.quad 0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07 */
.quad 0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01 */
.quad 0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05 */
.quad 0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06 */
.quad 0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07 */
.quad 0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01 */
.quad 0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05 */
.quad 0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06 */
.quad 0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07 */
.quad 0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01 */
.quad 0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05 */
.quad 0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06 */
.quad 0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08 */
.quad 0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01 */
.quad 0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06 */
.quad 0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07 */
.quad 0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08 */
.quad 0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01 */
.quad 0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06 */
.quad 0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07 */
.quad 0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09 */
.quad 0x3ff0000000000000
.quad 0x0000000000000000
.quad 0x0000000000000000
.quad 0x0000000000000000
.align 32
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
.align 32
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
.align 32
.long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
.align 32
.long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
.align 32
.long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
.align 32
.long 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
.align 32
.type __svml_stanh_data_internal,@object
.size __svml_stanh_data_internal,.-__svml_stanh_data_internal

View File

@ -0,0 +1,29 @@
/* Function tanh vectorized with SSE2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* SSE2 variant of vector tanh (double).  The whole body is produced by
   WRAPPER_IMPL_SSE2, which is handed the scalar routine name `tanh'.
   NOTE(review): the macro is presumably supplied by
   svml_d_wrapper_impl.h and presumably applies the scalar routine to
   each lane -- confirm against that header.  */
ENTRY (_ZGVbN2v_tanh)
	WRAPPER_IMPL_SSE2 tanh
END (_ZGVbN2v_tanh)

/* The hidden definition is emitted only when USE_MULTIARCH is not
   defined.  NOTE(review): presumably the multiarch build provides it
   from its own dispatch file -- confirm.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN2v_tanh)
#endif

View File

@ -0,0 +1,29 @@
/* Function tanh vectorized with AVX2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* AVX2 variant of vector tanh (double), built as a wrapper: the body is
   produced by WRAPPER_IMPL_AVX, which is handed the SSE-level symbol
   _ZGVbN2v_tanh as the routine to delegate to.
   NOTE(review): how the macro splits and recombines the input is not
   visible here -- confirm in svml_d_wrapper_impl.h.  */
ENTRY (_ZGVdN4v_tanh)
	WRAPPER_IMPL_AVX _ZGVbN2v_tanh
END (_ZGVdN4v_tanh)

/* The hidden definition is emitted only when USE_MULTIARCH is not
   defined.  NOTE(review): presumably the multiarch build provides it
   from its own dispatch file -- confirm.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVdN4v_tanh)
#endif

View File

@ -0,0 +1,25 @@
/* Function tanh vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* AVX variant of vector tanh (double).  As the file header states, this
   is a wrapper around the SSE4 version: WRAPPER_IMPL_AVX is handed
   _ZGVbN2v_tanh as the routine to delegate to.
   NOTE(review): the macro body lives outside this file -- confirm its
   calling sequence in svml_d_wrapper_impl.h.  */
ENTRY (_ZGVcN4v_tanh)
	WRAPPER_IMPL_AVX _ZGVbN2v_tanh
END (_ZGVcN4v_tanh)

View File

@ -0,0 +1,25 @@
/* Function tanh vectorized with AVX-512, wrapper to AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* AVX-512 variant of vector tanh (double).  As the file header states,
   this is a wrapper around the AVX2 version: WRAPPER_IMPL_AVX512 is
   handed _ZGVdN4v_tanh as the routine to delegate to.
   NOTE(review): the macro body lives outside this file -- confirm its
   calling sequence in svml_d_wrapper_impl.h.  */
ENTRY (_ZGVeN8v_tanh)
	WRAPPER_IMPL_AVX512 _ZGVdN4v_tanh
END (_ZGVeN8v_tanh)

View File

@ -0,0 +1,25 @@
/* Function tanhf vectorized with AVX-512. Wrapper to AVX2 version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* AVX-512 variant of vector tanhf (float).  As the file header states,
   this is a wrapper around the AVX2 version: WRAPPER_IMPL_AVX512 is
   handed _ZGVdN8v_tanhf as the routine to delegate to.
   NOTE(review): the macro body lives outside this file -- confirm its
   calling sequence in svml_s_wrapper_impl.h.  */
ENTRY (_ZGVeN16v_tanhf)
	WRAPPER_IMPL_AVX512 _ZGVdN8v_tanhf
END (_ZGVeN16v_tanhf)

View File

@ -0,0 +1,29 @@
/* Function tanhf vectorized with SSE2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* SSE2 variant of vector tanhf (float), wrapper version.  The whole
   body is produced by WRAPPER_IMPL_SSE2, which is handed the scalar
   routine name `tanhf'.
   NOTE(review): the macro is presumably supplied by
   svml_s_wrapper_impl.h and presumably applies the scalar routine to
   each lane -- confirm against that header.  */
ENTRY (_ZGVbN4v_tanhf)
	WRAPPER_IMPL_SSE2 tanhf
END (_ZGVbN4v_tanhf)

/* The hidden definition is emitted only when USE_MULTIARCH is not
   defined.  NOTE(review): presumably the multiarch build provides it
   from its own dispatch file -- confirm.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN4v_tanhf)
#endif

View File

@ -0,0 +1,29 @@
/* Function tanhf vectorized with AVX2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* AVX2 variant of vector tanhf (float), built as a wrapper: the body is
   produced by WRAPPER_IMPL_AVX, which is handed the SSE-level symbol
   _ZGVbN4v_tanhf as the routine to delegate to.
   NOTE(review): how the macro splits and recombines the input is not
   visible here -- confirm in svml_s_wrapper_impl.h.  */
ENTRY (_ZGVdN8v_tanhf)
	WRAPPER_IMPL_AVX _ZGVbN4v_tanhf
END (_ZGVdN8v_tanhf)

/* The hidden definition is emitted only when USE_MULTIARCH is not
   defined.  NOTE(review): presumably the multiarch build provides it
   from its own dispatch file -- confirm.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVdN8v_tanhf)
#endif

View File

@ -0,0 +1,25 @@
/* Function tanhf vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* AVX variant of vector tanhf (float).  As the file header states, this
   is a wrapper around the SSE4 version: WRAPPER_IMPL_AVX is handed
   _ZGVbN4v_tanhf as the routine to delegate to.
   NOTE(review): the macro body lives outside this file -- confirm its
   calling sequence in svml_s_wrapper_impl.h.  */
ENTRY (_ZGVcN8v_tanhf)
	WRAPPER_IMPL_AVX _ZGVbN4v_tanhf
END (_ZGVcN8v_tanhf)

View File

@ -0,0 +1 @@
#include "test-double-libmvec-tanh.c"

View File

@ -0,0 +1 @@
#include "test-double-libmvec-tanh.c"

View File

@ -0,0 +1 @@
#include "test-double-libmvec-tanh.c"

View File

@ -0,0 +1,3 @@
/* Configuration for the shared test body included below: the element
   type is double and the function under test is tanh.
   NOTE(review): how test-vector-abi-arg1.h consumes these two macros
   is not visible here -- confirm against that header.  */
#define LIBMVEC_TYPE double
#define LIBMVEC_FUNC tanh
#include "test-vector-abi-arg1.h"

View File

@ -44,6 +44,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log1p), _ZGVbN2v_log1p)
VECTOR_WRAPPER (WRAPPER_NAME (atanh), _ZGVbN2v_atanh)
VECTOR_WRAPPER (WRAPPER_NAME (acosh), _ZGVbN2v_acosh)
VECTOR_WRAPPER (WRAPPER_NAME (erf), _ZGVbN2v_erf)
VECTOR_WRAPPER (WRAPPER_NAME (tanh), _ZGVbN2v_tanh)
#define VEC_INT_TYPE __m128i

View File

@ -47,6 +47,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log1p), _ZGVdN4v_log1p)
VECTOR_WRAPPER (WRAPPER_NAME (atanh), _ZGVdN4v_atanh)
VECTOR_WRAPPER (WRAPPER_NAME (acosh), _ZGVdN4v_acosh)
VECTOR_WRAPPER (WRAPPER_NAME (erf), _ZGVdN4v_erf)
VECTOR_WRAPPER (WRAPPER_NAME (tanh), _ZGVdN4v_tanh)
#ifndef __ILP32__
# define VEC_INT_TYPE __m256i

View File

@ -44,6 +44,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log1p), _ZGVcN4v_log1p)
VECTOR_WRAPPER (WRAPPER_NAME (atanh), _ZGVcN4v_atanh)
VECTOR_WRAPPER (WRAPPER_NAME (acosh), _ZGVcN4v_acosh)
VECTOR_WRAPPER (WRAPPER_NAME (erf), _ZGVcN4v_erf)
VECTOR_WRAPPER (WRAPPER_NAME (tanh), _ZGVcN4v_tanh)
#define VEC_INT_TYPE __m128i

View File

@ -44,6 +44,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log1p), _ZGVeN8v_log1p)
VECTOR_WRAPPER (WRAPPER_NAME (atanh), _ZGVeN8v_atanh)
VECTOR_WRAPPER (WRAPPER_NAME (acosh), _ZGVeN8v_acosh)
VECTOR_WRAPPER (WRAPPER_NAME (erf), _ZGVeN8v_erf)
VECTOR_WRAPPER (WRAPPER_NAME (tanh), _ZGVeN8v_tanh)
#ifndef __ILP32__
# define VEC_INT_TYPE __m512i

View File

@ -0,0 +1 @@
#include "test-float-libmvec-tanhf.c"

View File

@ -0,0 +1 @@
#include "test-float-libmvec-tanhf.c"

View File

@ -0,0 +1 @@
#include "test-float-libmvec-tanhf.c"

View File

@ -0,0 +1,3 @@
/* Configuration for the shared test body included below: the element
   type is float and the function under test is tanhf.
   NOTE(review): how test-vector-abi-arg1.h consumes these two macros
   is not visible here -- confirm against that header.  */
#define LIBMVEC_TYPE float
#define LIBMVEC_FUNC tanhf
#include "test-vector-abi-arg1.h"

View File

@ -44,6 +44,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log1pf), _ZGVeN16v_log1pf)
VECTOR_WRAPPER (WRAPPER_NAME (atanhf), _ZGVeN16v_atanhf)
VECTOR_WRAPPER (WRAPPER_NAME (acoshf), _ZGVeN16v_acoshf)
VECTOR_WRAPPER (WRAPPER_NAME (erff), _ZGVeN16v_erff)
VECTOR_WRAPPER (WRAPPER_NAME (tanhf), _ZGVeN16v_tanhf)
#define VEC_INT_TYPE __m512i

View File

@ -44,6 +44,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log1pf), _ZGVbN4v_log1pf)
VECTOR_WRAPPER (WRAPPER_NAME (atanhf), _ZGVbN4v_atanhf)
VECTOR_WRAPPER (WRAPPER_NAME (acoshf), _ZGVbN4v_acoshf)
VECTOR_WRAPPER (WRAPPER_NAME (erff), _ZGVbN4v_erff)
VECTOR_WRAPPER (WRAPPER_NAME (tanhf), _ZGVbN4v_tanhf)
#define VEC_INT_TYPE __m128i

View File

@ -47,6 +47,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log1pf), _ZGVdN8v_log1pf)
VECTOR_WRAPPER (WRAPPER_NAME (atanhf), _ZGVdN8v_atanhf)
VECTOR_WRAPPER (WRAPPER_NAME (acoshf), _ZGVdN8v_acoshf)
VECTOR_WRAPPER (WRAPPER_NAME (erff), _ZGVdN8v_erff)
VECTOR_WRAPPER (WRAPPER_NAME (tanhf), _ZGVdN8v_tanhf)
/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */
#undef VECTOR_WRAPPER_fFF

View File

@ -44,6 +44,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log1pf), _ZGVcN8v_log1pf)
VECTOR_WRAPPER (WRAPPER_NAME (atanhf), _ZGVcN8v_atanhf)
VECTOR_WRAPPER (WRAPPER_NAME (acoshf), _ZGVcN8v_acoshf)
VECTOR_WRAPPER (WRAPPER_NAME (erff), _ZGVcN8v_erff)
VECTOR_WRAPPER (WRAPPER_NAME (tanhf), _ZGVcN8v_tanhf)
#define VEC_INT_TYPE __m128i