x86-64: Add vector cosh/coshf implementation to libmvec

Implement vectorized cosh/coshf containing SSE, AVX, AVX2 and
AVX512 versions for libmvec as per vector ABI.  It also contains
accuracy and ABI tests for vector cosh/coshf with regenerated ulps.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
Sunil K Pandey 2021-12-29 08:53:16 -08:00
parent 8b726453d5
commit ef7ea9c132
50 changed files with 2637 additions and 1 deletions

View File

@ -164,4 +164,15 @@
#define __DECL_SIMD_exp10f32x
#define __DECL_SIMD_exp10f64x
#define __DECL_SIMD_exp10f128x
#define __DECL_SIMD_cosh
#define __DECL_SIMD_coshf
#define __DECL_SIMD_coshl
#define __DECL_SIMD_coshf16
#define __DECL_SIMD_coshf32
#define __DECL_SIMD_coshf64
#define __DECL_SIMD_coshf128
#define __DECL_SIMD_coshf32x
#define __DECL_SIMD_coshf64x
#define __DECL_SIMD_coshf128x
#endif

View File

@ -68,7 +68,7 @@ __MATHCALL (tan,, (_Mdouble_ __x));
/* Hyperbolic functions. */
/* Hyperbolic cosine of X. */
__MATHCALL (cosh,, (_Mdouble_ __x));
__MATHCALL_VEC (cosh,, (_Mdouble_ __x));
/* Hyperbolic sine of X. */
__MATHCALL (sinh,, (_Mdouble_ __x));
/* Hyperbolic tangent of X. */

View File

@ -49,48 +49,56 @@ GLIBC_2.22 _ZGVeN8vvv_sincos F
GLIBC_2.35 _ZGVbN2v_acos F
GLIBC_2.35 _ZGVbN2v_asin F
GLIBC_2.35 _ZGVbN2v_atan F
GLIBC_2.35 _ZGVbN2v_cosh F
GLIBC_2.35 _ZGVbN2v_exp10 F
GLIBC_2.35 _ZGVbN2v_exp2 F
GLIBC_2.35 _ZGVbN2vv_hypot F
GLIBC_2.35 _ZGVbN4v_acosf F
GLIBC_2.35 _ZGVbN4v_asinf F
GLIBC_2.35 _ZGVbN4v_atanf F
GLIBC_2.35 _ZGVbN4v_coshf F
GLIBC_2.35 _ZGVbN4v_exp10f F
GLIBC_2.35 _ZGVbN4v_exp2f F
GLIBC_2.35 _ZGVbN4vv_hypotf F
GLIBC_2.35 _ZGVcN4v_acos F
GLIBC_2.35 _ZGVcN4v_asin F
GLIBC_2.35 _ZGVcN4v_atan F
GLIBC_2.35 _ZGVcN4v_cosh F
GLIBC_2.35 _ZGVcN4v_exp10 F
GLIBC_2.35 _ZGVcN4v_exp2 F
GLIBC_2.35 _ZGVcN4vv_hypot F
GLIBC_2.35 _ZGVcN8v_acosf F
GLIBC_2.35 _ZGVcN8v_asinf F
GLIBC_2.35 _ZGVcN8v_atanf F
GLIBC_2.35 _ZGVcN8v_coshf F
GLIBC_2.35 _ZGVcN8v_exp10f F
GLIBC_2.35 _ZGVcN8v_exp2f F
GLIBC_2.35 _ZGVcN8vv_hypotf F
GLIBC_2.35 _ZGVdN4v_acos F
GLIBC_2.35 _ZGVdN4v_asin F
GLIBC_2.35 _ZGVdN4v_atan F
GLIBC_2.35 _ZGVdN4v_cosh F
GLIBC_2.35 _ZGVdN4v_exp10 F
GLIBC_2.35 _ZGVdN4v_exp2 F
GLIBC_2.35 _ZGVdN4vv_hypot F
GLIBC_2.35 _ZGVdN8v_acosf F
GLIBC_2.35 _ZGVdN8v_asinf F
GLIBC_2.35 _ZGVdN8v_atanf F
GLIBC_2.35 _ZGVdN8v_coshf F
GLIBC_2.35 _ZGVdN8v_exp10f F
GLIBC_2.35 _ZGVdN8v_exp2f F
GLIBC_2.35 _ZGVdN8vv_hypotf F
GLIBC_2.35 _ZGVeN16v_acosf F
GLIBC_2.35 _ZGVeN16v_asinf F
GLIBC_2.35 _ZGVeN16v_atanf F
GLIBC_2.35 _ZGVeN16v_coshf F
GLIBC_2.35 _ZGVeN16v_exp10f F
GLIBC_2.35 _ZGVeN16v_exp2f F
GLIBC_2.35 _ZGVeN16vv_hypotf F
GLIBC_2.35 _ZGVeN8v_acos F
GLIBC_2.35 _ZGVeN8v_asin F
GLIBC_2.35 _ZGVeN8v_atan F
GLIBC_2.35 _ZGVeN8v_cosh F
GLIBC_2.35 _ZGVeN8v_exp10 F
GLIBC_2.35 _ZGVeN8v_exp2 F
GLIBC_2.35 _ZGVeN8vv_hypot F

View File

@ -82,6 +82,10 @@
# define __DECL_SIMD_exp10 __DECL_SIMD_x86_64
# undef __DECL_SIMD_exp10f
# define __DECL_SIMD_exp10f __DECL_SIMD_x86_64
# undef __DECL_SIMD_cosh
# define __DECL_SIMD_cosh __DECL_SIMD_x86_64
# undef __DECL_SIMD_coshf
# define __DECL_SIMD_coshf __DECL_SIMD_x86_64
# endif
#endif

View File

@ -40,6 +40,8 @@
!GCC$ builtin (exp2f) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (exp10) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (exp10f) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (cosh) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (coshf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
!GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
@ -65,3 +67,5 @@
!GCC$ builtin (exp2f) attributes simd (notinbranch) if('x32')
!GCC$ builtin (exp10) attributes simd (notinbranch) if('x32')
!GCC$ builtin (exp10f) attributes simd (notinbranch) if('x32')
!GCC$ builtin (cosh) attributes simd (notinbranch) if('x32')
!GCC$ builtin (coshf) attributes simd (notinbranch) if('x32')

View File

@ -26,6 +26,7 @@ libmvec-funcs = \
asin \
atan \
cos \
cosh \
exp \
exp10 \
exp2 \

View File

@ -17,12 +17,14 @@ libmvec {
_ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
_ZGVbN2v_asin; _ZGVcN4v_asin; _ZGVdN4v_asin; _ZGVeN8v_asin;
_ZGVbN2v_atan; _ZGVcN4v_atan; _ZGVdN4v_atan; _ZGVeN8v_atan;
_ZGVbN2v_cosh; _ZGVcN4v_cosh; _ZGVdN4v_cosh; _ZGVeN8v_cosh;
_ZGVbN2v_exp10; _ZGVcN4v_exp10; _ZGVdN4v_exp10; _ZGVeN8v_exp10;
_ZGVbN2v_exp2; _ZGVcN4v_exp2; _ZGVdN4v_exp2; _ZGVeN8v_exp2;
_ZGVbN2vv_hypot; _ZGVcN4vv_hypot; _ZGVdN4vv_hypot; _ZGVeN8vv_hypot;
_ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
_ZGVbN4v_asinf; _ZGVcN8v_asinf; _ZGVdN8v_asinf; _ZGVeN16v_asinf;
_ZGVbN4v_atanf; _ZGVcN8v_atanf; _ZGVdN8v_atanf; _ZGVeN16v_atanf;
_ZGVbN4v_coshf; _ZGVcN8v_coshf; _ZGVdN8v_coshf; _ZGVeN16v_coshf;
_ZGVbN4v_exp10f; _ZGVcN8v_exp10f; _ZGVdN8v_exp10f; _ZGVeN16v_exp10f;
_ZGVbN4v_exp2f; _ZGVcN8v_exp2f; _ZGVdN8v_exp2f; _ZGVeN16v_exp2f;
_ZGVbN4vv_hypotf; _ZGVcN8vv_hypotf; _ZGVdN8vv_hypotf; _ZGVeN16vv_hypotf;

View File

@ -891,6 +891,26 @@ float: 2
float128: 3
ldouble: 3
Function: "cosh_vlen16":
float: 2
Function: "cosh_vlen2":
double: 2
Function: "cosh_vlen4":
double: 2
float: 2
Function: "cosh_vlen4_avx2":
double: 2
Function: "cosh_vlen8":
double: 2
float: 2
Function: "cosh_vlen8_avx2":
float: 2
Function: Real part of "cpow":
double: 2
float: 5

View File

@ -0,0 +1,20 @@
/* SSE2 version of vectorized cosh, vector length is 2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the entry point before pulling in the generic source, so that
   whatever ../svml_d_cosh2_core.S defines under the name _ZGVbN2v_cosh
   is assembled here as _ZGVbN2v_cosh_sse2.
   NOTE(review): the included file is not visible here -- confirm it
   defines _ZGVbN2v_cosh.  */
#define _ZGVbN2v_cosh _ZGVbN2v_cosh_sse2
#include "../svml_d_cosh2_core.S"

View File

@ -0,0 +1,27 @@
/* Multiple versions of vectorized cosh, vector length is 2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Public symbol being dispatched.  It is defined ahead of the include
   below, presumably so that the ifunc header can derive its helper
   names from it.  */
#define SYMBOL_NAME _ZGVbN2v_cosh
#include "ifunc-mathvec-sse4_1.h"
/* NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined in this
   file; presumably they come from ifunc-mathvec-sse4_1.h -- confirm.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Shared-library builds only.  Presumably sets up the hidden internal
   alias __GI__ZGVbN2v_cosh for the redirected symbol -- confirm against
   the definition of __hidden_ver1.  */
__hidden_ver1 (_ZGVbN2v_cosh, __GI__ZGVbN2v_cosh, __redirect__ZGVbN2v_cosh)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,396 @@
/* Function cosh vectorized with SSE4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* Compute cosh(x) as (exp(x)+exp(-x))/2,
* where exp is calculated as
* exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
*
* Special cases:
*
* cosh(NaN) = quiet NaN, and raise invalid exception
* cosh(INF) = that INF
* cosh(0) = 1
* cosh(x) overflows for big x, i.e. |x| > MAXLOG+log(2)
*
*/
/* Byte offsets of each field inside __svml_dcosh_data_internal.
 * _dbT holds 1 + (1<<8) = 257 eight-byte entries (2056 bytes); the
 * table is followed by ".align 16", so the next field starts at 2064.
 * Every later field occupies one 16-byte slot, which is why the
 * offsets advance by 16.  These numbers must stay in step with the
 * data emitted after the label __svml_dcosh_data_internal.
 */
#define _dbT 0
#define _dbInvLn2 2064
#define _dbLn2hi 2080
#define _dbLn2lo 2096
#define _dbShifter 2112
#define _iIndexMask 2128
#define _dPC2 2144
#define _dPC3 2160
#define _dPC4 2176
#define _iMaxIndex 2192
#define _lExpMask 2208
#define _dSign 2224
#define _iDomainRange 2240
#include <sysdep.h>
.text
.section .text.sse4,"ax",@progbits
/* _ZGVbN2v_cosh_sse4: cosh of two packed doubles.
 *
 * In:    xmm0 = two double arguments
 * Out:   xmm0 = two double results
 * Stack: 72-byte frame.  On the special-values path:
 *          32(%rsp)  copy of the original arguments
 *          48(%rsp)  results (patched lane by lane)
 *          16/8/0(%rsp)  saved r12 / r13 / r14
 *
 * Fast path: table lookup in _dbT plus a short polynomial in the
 * reduced argument.  Any lane whose |x| has a high dword greater than
 * _iDomainRange (signed compare) is recomputed by calling scalar cosh.
 */
ENTRY(_ZGVbN2v_cosh_sse4)
subq $72, %rsp
cfi_def_cfa_offset(80)
/* xmm4 keeps the untouched argument for the special-values path */
movaps %xmm0, %xmm4
movups _dSign+__svml_dcosh_data_internal(%rip), %xmm2
/* r8 = base address of the _dbT lookup table */
lea _dbT+__svml_dcosh_data_internal(%rip), %r8
/* Abs argument: xmm5 = sign mask, cleared from x by the andnps below */
movaps %xmm2, %xmm5
/* dXSign = _dSign >> 11 = 0x0010000000000000 (lowest exponent-field bit) */
psrlq $11, %xmm2
/*
 * Load argument
 * dM = |x|*_dbInvLn2 + RShifter
 */
movups _dbInvLn2+__svml_dcosh_data_internal(%rip), %xmm3
andnps %xmm4, %xmm5
mulpd %xmm5, %xmm3
movups _dbShifter+__svml_dcosh_data_internal(%rip), %xmm1
addpd %xmm1, %xmm3
/*
 * R
 * dN = dM - RShifter
 */
movaps %xmm3, %xmm15
subpd %xmm1, %xmm15
/* dN*Log2_hi, subtracted from |x| below */
movups _dbLn2hi+__svml_dcosh_data_internal(%rip), %xmm14
mulpd %xmm15, %xmm14
/* dN*Log2_lo, subtracted from the partial dR below */
movups _dbLn2lo+__svml_dcosh_data_internal(%rip), %xmm1
mulpd %xmm15, %xmm1
/*
 * Check for overflow\underflow:
 * gather the high dwords of |x| (both lanes) for the range compare
 */
pshufd $221, %xmm5, %xmm7
/* dR = |x| - dN*Log2_hi */
subpd %xmm14, %xmm5
movq _iIndexMask+__svml_dcosh_data_internal(%rip), %xmm8
/* Index and lookup: gather the low dwords of dM (both lanes) */
pshufd $136, %xmm3, %xmm9
/*
 * G1,G2,G3: dTdif,dTn * 2^N,2^(-N)
 * NB: copied from sinh_la - to be optimized!!!!!
 * Shift dM left by 44; after the _lExpMask AND below only bits 8..18
 * of dM remain, positioned in the exponent field (bits 52..62).
 */
psllq $44, %xmm3
/*
 * trick
 * 256 - iIndex
 */
movq _iMaxIndex+__svml_dcosh_data_internal(%rip), %xmm12
/* iIndex = low dword of dM & _iIndexMask */
pand %xmm8, %xmm9
/* dR = (|x| - dN*Log2_hi) - dN*Log2_lo */
subpd %xmm1, %xmm5
psubd %xmm9, %xmm12
/* keep a copy of iIndex for the direct lookup */
movdqa %xmm9, %xmm10
/* (256 - iIndex) <<= 3 and iIndex <<= 3: byte offsets into _dbT (8 bytes per entry) */
pslld $3, %xmm12
pslld $3, %xmm10
movd %xmm12, %esi
pshufd $1, %xmm12, %xmm13
movq _iDomainRange+__svml_dcosh_data_internal(%rip), %xmm6
movd %xmm13, %edi
/* per-lane mask: high dword of |x| > _iDomainRange (signed compare) */
pcmpgtd %xmm6, %xmm7
movmskps %xmm7, %eax
/* dR2 = dR^2 (multiply happens a few instructions below) */
movaps %xmm5, %xmm7
/* lM now is an EXP(2^N) */
pand _lExpMask+__svml_dcosh_data_internal(%rip), %xmm3
pshufd $1, %xmm10, %xmm11
movslq %esi, %rsi
mulpd %xmm5, %xmm7
movd %xmm10, %edx
/* xmm6 = { _dbT[256 - iIndex0], _dbT[256 - iIndex1] } */
movsd (%r8,%rsi), %xmm6
movd %xmm11, %ecx
movslq %edi, %rdi
movslq %edx, %rdx
movslq %ecx, %rcx
movhpd (%r8,%rdi), %xmm6
/* scale by 2^(-N): subtract lM from the exponent field */
psubq %xmm3, %xmm6
/* subtract dXSign as well, i.e. decrement the biased exponent by 1 */
psubq %xmm2, %xmm6
/*
 * sinh(r) = r +r*r^2*a3 ....
 * dSinh_r = r^2*a3
 */
movups _dPC3+__svml_dcosh_data_internal(%rip), %xmm2
mulpd %xmm7, %xmm2
/* dSinh_r = r*r^2*a3 (r is added below) */
mulpd %xmm5, %xmm2
/* xmm0 = { _dbT[iIndex0], _dbT[iIndex1] }, scaled by 2^N via paddq */
movsd (%r8,%rdx), %xmm0
movhpd (%r8,%rcx), %xmm0
paddq %xmm3, %xmm0
/* dSinh_r = r + r*r^2*a3 */
addpd %xmm2, %xmm5
/* dTn = dTn*2^N - dTn*2^-N */
movaps %xmm0, %xmm3
subpd %xmm6, %xmm3
/* dTp = dTn*2^N + dTn*2^-N */
addpd %xmm6, %xmm0
/* dTn*sinh(dR) */
mulpd %xmm5, %xmm3
/* poly(r) = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */
movups _dPC4+__svml_dcosh_data_internal(%rip), %xmm5
mulpd %xmm7, %xmm5
addpd _dPC2+__svml_dcosh_data_internal(%rip), %xmm5
mulpd %xmm5, %xmm7
/* dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */
mulpd %xmm0, %xmm7
addpd %xmm7, %xmm3
/* _VRES1 = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */
addpd %xmm3, %xmm0
/* keep the two per-lane range bits; movmskps produced four (the high two are duplicates) */
andl $3, %eax
/* Ret H */
/* Go to special inputs processing branch (flags come from the andl above) */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm4
/* Restore registers
 * and exit the function
 */
L(EXIT):
addq $72, %rsp
cfi_def_cfa_offset(8)
ret
cfi_def_cfa_offset(80)
/* Branch to process
 * special inputs
 */
L(SPECIAL_VALUES_BRANCH):
/* spill arguments and fast-path results so lanes can be patched in memory */
movups %xmm4, 32(%rsp)
movups %xmm0, 48(%rsp)
# LOE rbx rbp r12 r13 r14 r15 eax xmm0
xorl %edx, %edx
/* r12d = lane counter, r13d = range mask, r14 = lane index across the call */
movq %r12, 16(%rsp)
cfi_offset(12, -64)
movl %edx, %r12d
movq %r13, 8(%rsp)
cfi_offset(13, -72)
movl %eax, %r13d
movq %r14, (%rsp)
cfi_offset(14, -80)
# LOE rbx rbp r15 r12d r13d
/* Range mask
 * bits check
 */
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx rbp r15 r12d r13d
/* Special inputs
 * processing loop
 */
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $2, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx rbp r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
/* reload the patched result vector */
movups 48(%rsp), %xmm0
/* Go to exit */
jmp L(EXIT)
cfi_offset(12, -64)
cfi_offset(13, -72)
cfi_offset(14, -80)
# LOE rbx rbp r12 r13 r14 r15 xmm0
/* Scalar math function call
 * to process special input
 */
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
/* load lane r14 of the saved argument, overwrite lane r14 of the result */
movsd 32(%rsp,%r14,8), %xmm0
call cosh@PLT
# LOE rbx rbp r14 r15 r12d r13d xmm0
movsd %xmm0, 48(%rsp,%r14,8)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx rbp r15 r12d r13d
END(_ZGVbN2v_cosh_sse4)
.section .rodata, "a"
.align 16
#ifdef __svml_dcosh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(16)) VUINT32 _dbT[(1 + (1<<8))][2]; // dTpj only: 257 entries of 8 bytes
__declspec(align(16)) VUINT32 _dbInvLn2[2][2];
__declspec(align(16)) VUINT32 _dbLn2hi[2][2];
__declspec(align(16)) VUINT32 _dbLn2lo[2][2];
__declspec(align(16)) VUINT32 _dbShifter[2][2];
__declspec(align(16)) VUINT32 _iIndexMask[4][1]; // (1<<K)-1
__declspec(align(16)) VUINT32 _dPC2[2][2];
__declspec(align(16)) VUINT32 _dPC3[2][2];
__declspec(align(16)) VUINT32 _dPC4[2][2];
__declspec(align(16)) VUINT32 _iMaxIndex[4][1]; // (1<<K)
__declspec(align(16)) VUINT32 _lExpMask[2][2];
__declspec(align(16)) VUINT32 _dSign[2][2]; // 0x8000000000000000
__declspec(align(16)) VUINT32 _iDomainRange[4][1];
} __svml_dcosh_data_internal;
#endif
__svml_dcosh_data_internal:
/*== _dbT ==*/
.quad 0x3fe0000000000000, 0x3fe00b1afa5abcbf, 0x3fe0163da9fb3335, 0x3fe02168143b0281
.quad 0x3fe02c9a3e778061, 0x3fe037d42e11bbcc, 0x3fe04315e86e7f85, 0x3fe04e5f72f654b1
.quad 0x3fe059b0d3158574, 0x3fe0650a0e3c1f89, 0x3fe0706b29ddf6de, 0x3fe07bd42b72a836
.quad 0x3fe0874518759bc8, 0x3fe092bdf66607e0, 0x3fe09e3ecac6f383, 0x3fe0a9c79b1f3919
.quad 0x3fe0b5586cf9890f, 0x3fe0c0f145e46c85, 0x3fe0cc922b7247f7, 0x3fe0d83b23395dec
.quad 0x3fe0e3ec32d3d1a2, 0x3fe0efa55fdfa9c5, 0x3fe0fb66affed31b, 0x3fe1073028d7233e
.quad 0x3fe11301d0125b51, 0x3fe11edbab5e2ab6, 0x3fe12abdc06c31cc, 0x3fe136a814f204ab
.quad 0x3fe1429aaea92de0, 0x3fe14e95934f312e, 0x3fe15a98c8a58e51, 0x3fe166a45471c3c2
.quad 0x3fe172b83c7d517b, 0x3fe17ed48695bbc0, 0x3fe18af9388c8dea, 0x3fe1972658375d2f
.quad 0x3fe1a35beb6fcb75, 0x3fe1af99f8138a1c, 0x3fe1bbe084045cd4, 0x3fe1c82f95281c6b
.quad 0x3fe1d4873168b9aa, 0x3fe1e0e75eb44027, 0x3fe1ed5022fcd91d, 0x3fe1f9c18438ce4d
.quad 0x3fe2063b88628cd6, 0x3fe212be3578a819, 0x3fe21f49917ddc96, 0x3fe22bdda27912d1
.quad 0x3fe2387a6e756238, 0x3fe2451ffb82140a, 0x3fe251ce4fb2a63f, 0x3fe25e85711ece75
.quad 0x3fe26b4565e27cdd, 0x3fe2780e341ddf29, 0x3fe284dfe1f56381, 0x3fe291ba7591bb70
.quad 0x3fe29e9df51fdee1, 0x3fe2ab8a66d10f13, 0x3fe2b87fd0dad990, 0x3fe2c57e39771b2f
.quad 0x3fe2d285a6e4030b, 0x3fe2df961f641589, 0x3fe2ecafa93e2f56, 0x3fe2f9d24abd886b
.quad 0x3fe306fe0a31b715, 0x3fe31432edeeb2fd, 0x3fe32170fc4cd831, 0x3fe32eb83ba8ea32
.quad 0x3fe33c08b26416ff, 0x3fe3496266e3fa2d, 0x3fe356c55f929ff1, 0x3fe36431a2de883b
.quad 0x3fe371a7373aa9cb, 0x3fe37f26231e754a, 0x3fe38cae6d05d866, 0x3fe39a401b7140ef
.quad 0x3fe3a7db34e59ff7, 0x3fe3b57fbfec6cf4, 0x3fe3c32dc313a8e5, 0x3fe3d0e544ede173
.quad 0x3fe3dea64c123422, 0x3fe3ec70df1c5175, 0x3fe3fa4504ac801c, 0x3fe40822c367a024
.quad 0x3fe4160a21f72e2a, 0x3fe423fb2709468a, 0x3fe431f5d950a897, 0x3fe43ffa3f84b9d4
.quad 0x3fe44e086061892d, 0x3fe45c2042a7d232, 0x3fe46a41ed1d0057, 0x3fe4786d668b3237
.quad 0x3fe486a2b5c13cd0, 0x3fe494e1e192aed2, 0x3fe4a32af0d7d3de, 0x3fe4b17dea6db7d7
.quad 0x3fe4bfdad5362a27, 0x3fe4ce41b817c114, 0x3fe4dcb299fddd0d, 0x3fe4eb2d81d8abff
.quad 0x3fe4f9b2769d2ca7, 0x3fe508417f4531ee, 0x3fe516daa2cf6642, 0x3fe5257de83f4eef
.quad 0x3fe5342b569d4f82, 0x3fe542e2f4f6ad27, 0x3fe551a4ca5d920f, 0x3fe56070dde910d2
.quad 0x3fe56f4736b527da, 0x3fe57e27dbe2c4cf, 0x3fe58d12d497c7fd, 0x3fe59c0827ff07cc
.quad 0x3fe5ab07dd485429, 0x3fe5ba11fba87a03, 0x3fe5c9268a5946b7, 0x3fe5d84590998b93
.quad 0x3fe5e76f15ad2148, 0x3fe5f6a320dceb71, 0x3fe605e1b976dc09, 0x3fe6152ae6cdf6f4
.quad 0x3fe6247eb03a5585, 0x3fe633dd1d1929fd, 0x3fe6434634ccc320, 0x3fe652b9febc8fb7
.quad 0x3fe6623882552225, 0x3fe671c1c70833f6, 0x3fe68155d44ca973, 0x3fe690f4b19e9538
.quad 0x3fe6a09e667f3bcd, 0x3fe6b052fa75173e, 0x3fe6c012750bdabf, 0x3fe6cfdcddd47645
.quad 0x3fe6dfb23c651a2f, 0x3fe6ef9298593ae5, 0x3fe6ff7df9519484, 0x3fe70f7466f42e87
.quad 0x3fe71f75e8ec5f74, 0x3fe72f8286ead08a, 0x3fe73f9a48a58174, 0x3fe74fbd35d7cbfd
.quad 0x3fe75feb564267c9, 0x3fe77024b1ab6e09, 0x3fe780694fde5d3f, 0x3fe790b938ac1cf6
.quad 0x3fe7a11473eb0187, 0x3fe7b17b0976cfdb, 0x3fe7c1ed0130c132, 0x3fe7d26a62ff86f0
.quad 0x3fe7e2f336cf4e62, 0x3fe7f3878491c491, 0x3fe80427543e1a12, 0x3fe814d2add106d9
.quad 0x3fe82589994cce13, 0x3fe8364c1eb941f7, 0x3fe8471a4623c7ad, 0x3fe857f4179f5b21
.quad 0x3fe868d99b4492ed, 0x3fe879cad931a436, 0x3fe88ac7d98a6699, 0x3fe89bd0a478580f
.quad 0x3fe8ace5422aa0db, 0x3fe8be05bad61778, 0x3fe8cf3216b5448c, 0x3fe8e06a5e0866d9
.quad 0x3fe8f1ae99157736, 0x3fe902fed0282c8a, 0x3fe9145b0b91ffc6, 0x3fe925c353aa2fe2
.quad 0x3fe93737b0cdc5e5, 0x3fe948b82b5f98e5, 0x3fe95a44cbc8520f, 0x3fe96bdd9a7670b3
.quad 0x3fe97d829fde4e50, 0x3fe98f33e47a22a2, 0x3fe9a0f170ca07ba, 0x3fe9b2bb4d53fe0d
.quad 0x3fe9c49182a3f090, 0x3fe9d674194bb8d5, 0x3fe9e86319e32323, 0x3fe9fa5e8d07f29e
.quad 0x3fea0c667b5de565, 0x3fea1e7aed8eb8bb, 0x3fea309bec4a2d33, 0x3fea42c980460ad8
.quad 0x3fea5503b23e255d, 0x3fea674a8af46052, 0x3fea799e1330b358, 0x3fea8bfe53c12e59
.quad 0x3fea9e6b5579fdbf, 0x3feab0e521356eba, 0x3feac36bbfd3f37a, 0x3fead5ff3a3c2774
.quad 0x3feae89f995ad3ad, 0x3feafb4ce622f2ff, 0x3feb0e07298db666, 0x3feb20ce6c9a8952
.quad 0x3feb33a2b84f15fb, 0x3feb468415b749b1, 0x3feb59728de5593a, 0x3feb6c6e29f1c52a
.quad 0x3feb7f76f2fb5e47, 0x3feb928cf22749e4, 0x3feba5b030a1064a, 0x3febb8e0b79a6f1f
.quad 0x3febcc1e904bc1d2, 0x3febdf69c3f3a207, 0x3febf2c25bd71e09, 0x3fec06286141b33d
.quad 0x3fec199bdd85529c, 0x3fec2d1cd9fa652c, 0x3fec40ab5fffd07a, 0x3fec544778fafb22
.quad 0x3fec67f12e57d14b, 0x3fec7ba88988c933, 0x3fec8f6d9406e7b5, 0x3feca3405751c4db
.quad 0x3fecb720dcef9069, 0x3feccb0f2e6d1675, 0x3fecdf0b555dc3fa, 0x3fecf3155b5bab74
.quad 0x3fed072d4a07897c, 0x3fed1b532b08c968, 0x3fed2f87080d89f2, 0x3fed43c8eacaa1d6
.quad 0x3fed5818dcfba487, 0x3fed6c76e862e6d3, 0x3fed80e316c98398, 0x3fed955d71ff6075
.quad 0x3feda9e603db3285, 0x3fedbe7cd63a8315, 0x3fedd321f301b460, 0x3fede7d5641c0658
.quad 0x3fedfc97337b9b5f, 0x3fee11676b197d17, 0x3fee264614f5a129, 0x3fee3b333b16ee12
.quad 0x3fee502ee78b3ff6, 0x3fee653924676d76, 0x3fee7a51fbc74c83, 0x3fee8f7977cdb740
.quad 0x3feea4afa2a490da, 0x3feeb9f4867cca6e, 0x3feecf482d8e67f1, 0x3feee4aaa2188510
.quad 0x3feefa1bee615a27, 0x3fef0f9c1cb6412a, 0x3fef252b376bba97, 0x3fef3ac948dd7274
.quad 0x3fef50765b6e4540, 0x3fef6632798844f8, 0x3fef7bfdad9cbe14, 0x3fef91d802243c89
.quad 0x3fefa7c1819e90d8, 0x3fefbdba3692d514, 0x3fefd3c22b8f71f1, 0x3fefe9d96b2a23d9
.quad 0x3ff0000000000000
.align 16
.quad 0x3ff71547652b82fe, 0x3ff71547652b82fe /* _dbInvLn2 = 1/log(2) */
.align 16
.quad 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000 /* _dbLn2hi = log(2) hi*/
.align 16
.quad 0xBDAC610CA86C3899, 0xBDAC610CA86C3899 /* _dbLn2lo = log(2) lo*/
.align 16
.quad 0x42B8000000000000, 0x42B8000000000000 /* _dbShifter */
.align 16
.long 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF /* _iIndexMask */
.align 16
.quad 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD /* _dPC2 */
.align 16
.quad 0x3FC5555570813E14, 0x3FC5555570813E14 /* _dPC3 */
.align 16
.quad 0x3FA55555CF16D299, 0x3FA55555CF16D299 /* _dPC4 */
.align 16
.long 0x00000100, 0x00000100, 0x00000100, 0x00000100 /* _iMaxIndex */
.align 16
.quad 0x7ff0000000000000, 0x7ff0000000000000 /* _lExpMask */
.align 16
.quad 0x8000000000000000, 0x8000000000000000 /* _dSign*/
.align 16
.long 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99 /* _iDomainRange 0x40861d9ac12a3e85 =(1021*2^K-0.5)*log(2)/2^K -needed for quick exp*/
.align 16
.type __svml_dcosh_data_internal,@object
.size __svml_dcosh_data_internal,.-__svml_dcosh_data_internal

View File

@ -0,0 +1,20 @@
/* SSE version of vectorized cosh, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the entry point before pulling in the generic source, so that
   whatever ../svml_d_cosh4_core.S defines under the name _ZGVdN4v_cosh
   is assembled here as _ZGVdN4v_cosh_sse_wrapper.
   NOTE(review): the included file is not visible here -- confirm it
   defines _ZGVdN4v_cosh.  */
#define _ZGVdN4v_cosh _ZGVdN4v_cosh_sse_wrapper
#include "../svml_d_cosh4_core.S"

View File

@ -0,0 +1,27 @@
/* Multiple versions of vectorized cosh, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Public symbol being dispatched.  It is defined ahead of the include
   below, presumably so that the ifunc header can derive its helper
   names from it.  */
#define SYMBOL_NAME _ZGVdN4v_cosh
#include "ifunc-mathvec-avx2.h"
/* NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined in this
   file; presumably they come from ifunc-mathvec-avx2.h -- confirm.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Shared-library builds only.  Presumably sets up the hidden internal
   alias __GI__ZGVdN4v_cosh for the redirected symbol -- confirm against
   the definition of __hidden_ver1.  */
__hidden_ver1 (_ZGVdN4v_cosh, __GI__ZGVdN4v_cosh, __redirect__ZGVdN4v_cosh)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,412 @@
/* Function cosh vectorized with AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* Compute cosh(x) as (exp(x)+exp(-x))/2,
* where exp is calculated as
* exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
*
* Special cases:
*
* cosh(NaN) = quiet NaN, and raise invalid exception
* cosh(INF) = that INF
* cosh(0) = 1
* cosh(x) overflows for big x, i.e. |x| > MAXLOG+log(2)
*
*/
/* Byte offsets of each field inside __svml_dcosh_data_internal.
 * Per the layout typedef later in this file, _dbT holds
 * 1 + (1<<8) = 257 eight-byte entries (2056 bytes) and every field is
 * 32-byte aligned, so the field after _dbT starts at 2080 and each
 * later field occupies one 32-byte slot.  These numbers must stay in
 * step with the data emitted after the label
 * __svml_dcosh_data_internal.
 */
#define _dbT 0
#define _dbInvLn2 2080
#define _dbLn2hi 2112
#define _dbLn2lo 2144
#define _dbShifter 2176
#define _iIndexMask 2208
#define _dPC2 2240
#define _dPC3 2272
#define _dPC4 2304
#define _iMaxIndex 2336
#define _lExpMask 2368
#define _dSign 2400
#define _iDomainRange 2432
#include <sysdep.h>
.text
.section .text.avx2,"ax",@progbits
/* _ZGVdN4v_cosh_avx2: cosh of four packed doubles.
 *
 * In:    ymm0 = four double arguments
 * Out:   ymm0 = four double results
 * Stack: rbp-based frame, rsp realigned to 32 bytes, 96 bytes reserved.
 *        On the special-values path:
 *          32(%rsp)  copy of the original arguments
 *          64(%rsp)  results (patched lane by lane)
 *          16/8/0(%rsp)  saved r12 / r13 / r14
 *
 * Fast path: table lookup in _dbT plus a short polynomial in the
 * reduced argument.  Any lane whose |x| has a high dword greater than
 * _iDomainRange (signed compare) is recomputed by calling scalar cosh.
 */
ENTRY(_ZGVdN4v_cosh_avx2)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* realign the stack to 32 bytes and reserve the 96-byte scratch area */
andq $-32, %rsp
subq $96, %rsp
/* rax = base address of the _dbT lookup table */
lea _dbT+__svml_dcosh_data_internal(%rip), %rax
vmovupd _dSign+__svml_dcosh_data_internal(%rip), %ymm8
vmovupd _dbShifter+__svml_dcosh_data_internal(%rip), %ymm6
/*
 * Load argument
 * dM = |x|*_dbInvLn2 + RShifter
 */
vmovupd _dbInvLn2+__svml_dcosh_data_internal(%rip), %ymm3
/*
 * trick
 * 256 - iIndex
 */
vmovups _iMaxIndex+__svml_dcosh_data_internal(%rip), %xmm14
/* dXSign = _dSign >> 11 (0x0010000000000000 for the sign mask noted in the layout typedef) */
vpsrlq $11, %ymm8, %ymm5
/* ymm7 keeps the untouched argument for the special-values path */
vmovapd %ymm0, %ymm7
/* Abs argument */
vandnpd %ymm7, %ymm8, %ymm4
vfmadd213pd %ymm6, %ymm4, %ymm3
/* Index and lookup: gather the low dwords of dM (all four lanes) */
vextractf128 $1, %ymm3, %xmm12
vshufps $136, %xmm12, %xmm3, %xmm13
/* iIndex = low dword of dM & _iIndexMask */
vpand _iIndexMask+__svml_dcosh_data_internal(%rip), %xmm13, %xmm15
vpsubd %xmm15, %xmm14, %xmm0
/* (256 - iIndex) <<= 3: byte offsets into _dbT (8 bytes per entry) */
vpslld $3, %xmm0, %xmm2
vmovd %xmm2, %r9d
vpextrd $2, %xmm2, %r11d
movslq %r9d, %r9
vpextrd $1, %xmm2, %r10d
movslq %r11d, %r11
movslq %r10d, %r10
vmovsd (%rax,%r9), %xmm12
/*
 * Check for overflow\underflow:
 * gather the high dwords of |x| (all four lanes) for the range compare
 */
vextractf128 $1, %ymm4, %xmm9
vmovsd (%rax,%r11), %xmm14
vmovhpd (%rax,%r10), %xmm12, %xmm13
vshufps $221, %xmm9, %xmm4, %xmm10
/* iIndex <<= 3: byte offsets into _dbT (8 bytes per entry) */
vpslld $3, %xmm15, %xmm9
/*
 * R
 * dN = dM - RShifter
 */
vsubpd %ymm6, %ymm3, %ymm15
vmovd %xmm9, %ecx
/* per-lane mask: high dword of |x| > _iDomainRange (signed compare) */
vpcmpgtd _iDomainRange+__svml_dcosh_data_internal(%rip), %xmm10, %xmm11
vmovupd _dbLn2hi+__svml_dcosh_data_internal(%rip), %ymm6
/*
 * G1,G2,G3: dTdif,dTn * 2^N,2^(-N)
 * NB: copied from sinh_la - to be optimized!!!!!
 * Shift dM left by 44 so the bits above the 8-bit index move up
 * toward the exponent field; masked with _lExpMask below.
 */
vpsllq $44, %ymm3, %ymm3
/* edx = 4-bit mask of lanes needing the scalar fallback */
vmovmskps %xmm11, %edx
/* dR = dX - dN*Log2_hi/2^K */
vfnmadd231pd %ymm6, %ymm15, %ymm4
/* lM now is an EXP(2^N) */
vpand _lExpMask+__svml_dcosh_data_internal(%rip), %ymm3, %ymm3
/* dR = (dX - dN*Log2_hi/2^K) - dN*Log2_lo/2^K */
vfnmadd231pd _dbLn2lo+__svml_dcosh_data_internal(%rip), %ymm15, %ymm4
movslq %ecx, %rcx
vpextrd $2, %xmm9, %edi
vpextrd $1, %xmm9, %esi
movslq %edi, %rdi
vmovsd (%rax,%rcx), %xmm1
vpextrd $3, %xmm9, %r8d
vpextrd $3, %xmm2, %ecx
movslq %esi, %rsi
movslq %r8d, %r8
movslq %ecx, %rcx
/* dR2 = dR^2 */
vmulpd %ymm4, %ymm4, %ymm0
/* assemble ymm1 = _dbT[iIndex] and ymm2 = _dbT[256 - iIndex] for all four lanes */
vmovsd (%rax,%rdi), %xmm10
vmovhpd (%rax,%rsi), %xmm1, %xmm8
vmovhpd (%rax,%r8), %xmm10, %xmm11
vmovhpd (%rax,%rcx), %xmm14, %xmm2
vinsertf128 $1, %xmm11, %ymm8, %ymm1
vinsertf128 $1, %xmm2, %ymm13, %ymm2
/* scale _dbT[iIndex] by 2^N: add lM to the exponent field */
vpaddq %ymm3, %ymm1, %ymm6
/* scale _dbT[256 - iIndex] by 2^(-N): subtract lM from the exponent field */
vpsubq %ymm3, %ymm2, %ymm1
/*
 * sinh(r) = r +r*r^2*a3 ....
 * dSinh_r = r^2*a3
 */
vmulpd _dPC3+__svml_dcosh_data_internal(%rip), %ymm0, %ymm2
/* subtract dXSign as well, i.e. decrement the biased exponent by 1 */
vpsubq %ymm5, %ymm1, %ymm5
/* dSinh_r = r + r*r^2*a3 */
vfmadd213pd %ymm4, %ymm4, %ymm2
/* poly(r) = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */
vmovupd _dPC4+__svml_dcosh_data_internal(%rip), %ymm4
/* dTn = dTn*2^N - dTn*2^-N */
vsubpd %ymm5, %ymm6, %ymm1
/* dTp = dTn*2^N + dTn*2^-N */
vaddpd %ymm5, %ymm6, %ymm3
/* a2 + a4*dR2 */
vfmadd213pd _dPC2+__svml_dcosh_data_internal(%rip), %ymm0, %ymm4
/* dTn*sinh(dR) */
vmulpd %ymm2, %ymm1, %ymm1
/* dR2*(a2 + a4*dR2) */
vmulpd %ymm4, %ymm0, %ymm0
/* dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */
vfmadd213pd %ymm1, %ymm3, %ymm0
/* _VRES1 = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */
vaddpd %ymm0, %ymm3, %ymm0
/* Ret H */
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx ymm0 ymm7
/* Restore registers
 * and exit the function
 */
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
 * special inputs
 */
L(SPECIAL_VALUES_BRANCH):
/* spill arguments and fast-path results so lanes can be patched in memory */
vmovupd %ymm7, 32(%rsp)
vmovupd %ymm0, 64(%rsp)
# LOE rbx r12 r13 r14 r15 edx ymm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
/* clear the upper ymm halves before the scalar call sequence below */
vzeroupper
/* r12d = lane counter, r13d = range mask, r14 = lane index across the call */
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
 * bits check
 */
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
 * processing loop
 */
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $4, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
/* reload the patched result vector */
vmovupd 64(%rsp), %ymm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 ymm0
/* Scalar math function call
 * to process special input
 */
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
/* load lane r14 of the saved argument, overwrite lane r14 of the result */
movsd 32(%rsp,%r14,8), %xmm0
call cosh@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movsd %xmm0, 64(%rsp,%r14,8)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVdN4v_cosh_avx2)
.section .rodata, "a"
.align 32
#ifdef __svml_dcosh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(32)) VUINT32 _dbT[(1 + (1<<8))][2]; // dTpj only: 257 entries of 8 bytes
__declspec(align(32)) VUINT32 _dbInvLn2[4][2];
__declspec(align(32)) VUINT32 _dbLn2hi[4][2];
__declspec(align(32)) VUINT32 _dbLn2lo[4][2];
__declspec(align(32)) VUINT32 _dbShifter[4][2];
__declspec(align(32)) VUINT32 _iIndexMask[8][1]; // (1<<K)-1
__declspec(align(32)) VUINT32 _dPC2[4][2];
__declspec(align(32)) VUINT32 _dPC3[4][2];
__declspec(align(32)) VUINT32 _dPC4[4][2];
__declspec(align(32)) VUINT32 _iMaxIndex[8][1]; // (1<<K)
__declspec(align(32)) VUINT32 _lExpMask[4][2];
__declspec(align(32)) VUINT32 _dSign[4][2]; // 0x8000000000000000
__declspec(align(32)) VUINT32 _iDomainRange[8][1];
} __svml_dcosh_data_internal;
#endif
__svml_dcosh_data_internal:
/*== _dbT ==*/
.quad 0x3fe0000000000000, 0x3fe00b1afa5abcbf, 0x3fe0163da9fb3335, 0x3fe02168143b0281
.quad 0x3fe02c9a3e778061, 0x3fe037d42e11bbcc, 0x3fe04315e86e7f85, 0x3fe04e5f72f654b1
.quad 0x3fe059b0d3158574, 0x3fe0650a0e3c1f89, 0x3fe0706b29ddf6de, 0x3fe07bd42b72a836
.quad 0x3fe0874518759bc8, 0x3fe092bdf66607e0, 0x3fe09e3ecac6f383, 0x3fe0a9c79b1f3919
.quad 0x3fe0b5586cf9890f, 0x3fe0c0f145e46c85, 0x3fe0cc922b7247f7, 0x3fe0d83b23395dec
.quad 0x3fe0e3ec32d3d1a2, 0x3fe0efa55fdfa9c5, 0x3fe0fb66affed31b, 0x3fe1073028d7233e
.quad 0x3fe11301d0125b51, 0x3fe11edbab5e2ab6, 0x3fe12abdc06c31cc, 0x3fe136a814f204ab
.quad 0x3fe1429aaea92de0, 0x3fe14e95934f312e, 0x3fe15a98c8a58e51, 0x3fe166a45471c3c2
.quad 0x3fe172b83c7d517b, 0x3fe17ed48695bbc0, 0x3fe18af9388c8dea, 0x3fe1972658375d2f
.quad 0x3fe1a35beb6fcb75, 0x3fe1af99f8138a1c, 0x3fe1bbe084045cd4, 0x3fe1c82f95281c6b
.quad 0x3fe1d4873168b9aa, 0x3fe1e0e75eb44027, 0x3fe1ed5022fcd91d, 0x3fe1f9c18438ce4d
.quad 0x3fe2063b88628cd6, 0x3fe212be3578a819, 0x3fe21f49917ddc96, 0x3fe22bdda27912d1
.quad 0x3fe2387a6e756238, 0x3fe2451ffb82140a, 0x3fe251ce4fb2a63f, 0x3fe25e85711ece75
.quad 0x3fe26b4565e27cdd, 0x3fe2780e341ddf29, 0x3fe284dfe1f56381, 0x3fe291ba7591bb70
.quad 0x3fe29e9df51fdee1, 0x3fe2ab8a66d10f13, 0x3fe2b87fd0dad990, 0x3fe2c57e39771b2f
.quad 0x3fe2d285a6e4030b, 0x3fe2df961f641589, 0x3fe2ecafa93e2f56, 0x3fe2f9d24abd886b
.quad 0x3fe306fe0a31b715, 0x3fe31432edeeb2fd, 0x3fe32170fc4cd831, 0x3fe32eb83ba8ea32
.quad 0x3fe33c08b26416ff, 0x3fe3496266e3fa2d, 0x3fe356c55f929ff1, 0x3fe36431a2de883b
.quad 0x3fe371a7373aa9cb, 0x3fe37f26231e754a, 0x3fe38cae6d05d866, 0x3fe39a401b7140ef
.quad 0x3fe3a7db34e59ff7, 0x3fe3b57fbfec6cf4, 0x3fe3c32dc313a8e5, 0x3fe3d0e544ede173
.quad 0x3fe3dea64c123422, 0x3fe3ec70df1c5175, 0x3fe3fa4504ac801c, 0x3fe40822c367a024
.quad 0x3fe4160a21f72e2a, 0x3fe423fb2709468a, 0x3fe431f5d950a897, 0x3fe43ffa3f84b9d4
.quad 0x3fe44e086061892d, 0x3fe45c2042a7d232, 0x3fe46a41ed1d0057, 0x3fe4786d668b3237
.quad 0x3fe486a2b5c13cd0, 0x3fe494e1e192aed2, 0x3fe4a32af0d7d3de, 0x3fe4b17dea6db7d7
.quad 0x3fe4bfdad5362a27, 0x3fe4ce41b817c114, 0x3fe4dcb299fddd0d, 0x3fe4eb2d81d8abff
.quad 0x3fe4f9b2769d2ca7, 0x3fe508417f4531ee, 0x3fe516daa2cf6642, 0x3fe5257de83f4eef
.quad 0x3fe5342b569d4f82, 0x3fe542e2f4f6ad27, 0x3fe551a4ca5d920f, 0x3fe56070dde910d2
.quad 0x3fe56f4736b527da, 0x3fe57e27dbe2c4cf, 0x3fe58d12d497c7fd, 0x3fe59c0827ff07cc
.quad 0x3fe5ab07dd485429, 0x3fe5ba11fba87a03, 0x3fe5c9268a5946b7, 0x3fe5d84590998b93
.quad 0x3fe5e76f15ad2148, 0x3fe5f6a320dceb71, 0x3fe605e1b976dc09, 0x3fe6152ae6cdf6f4
.quad 0x3fe6247eb03a5585, 0x3fe633dd1d1929fd, 0x3fe6434634ccc320, 0x3fe652b9febc8fb7
.quad 0x3fe6623882552225, 0x3fe671c1c70833f6, 0x3fe68155d44ca973, 0x3fe690f4b19e9538
.quad 0x3fe6a09e667f3bcd, 0x3fe6b052fa75173e, 0x3fe6c012750bdabf, 0x3fe6cfdcddd47645
.quad 0x3fe6dfb23c651a2f, 0x3fe6ef9298593ae5, 0x3fe6ff7df9519484, 0x3fe70f7466f42e87
.quad 0x3fe71f75e8ec5f74, 0x3fe72f8286ead08a, 0x3fe73f9a48a58174, 0x3fe74fbd35d7cbfd
.quad 0x3fe75feb564267c9, 0x3fe77024b1ab6e09, 0x3fe780694fde5d3f, 0x3fe790b938ac1cf6
.quad 0x3fe7a11473eb0187, 0x3fe7b17b0976cfdb, 0x3fe7c1ed0130c132, 0x3fe7d26a62ff86f0
.quad 0x3fe7e2f336cf4e62, 0x3fe7f3878491c491, 0x3fe80427543e1a12, 0x3fe814d2add106d9
.quad 0x3fe82589994cce13, 0x3fe8364c1eb941f7, 0x3fe8471a4623c7ad, 0x3fe857f4179f5b21
.quad 0x3fe868d99b4492ed, 0x3fe879cad931a436, 0x3fe88ac7d98a6699, 0x3fe89bd0a478580f
.quad 0x3fe8ace5422aa0db, 0x3fe8be05bad61778, 0x3fe8cf3216b5448c, 0x3fe8e06a5e0866d9
.quad 0x3fe8f1ae99157736, 0x3fe902fed0282c8a, 0x3fe9145b0b91ffc6, 0x3fe925c353aa2fe2
.quad 0x3fe93737b0cdc5e5, 0x3fe948b82b5f98e5, 0x3fe95a44cbc8520f, 0x3fe96bdd9a7670b3
.quad 0x3fe97d829fde4e50, 0x3fe98f33e47a22a2, 0x3fe9a0f170ca07ba, 0x3fe9b2bb4d53fe0d
.quad 0x3fe9c49182a3f090, 0x3fe9d674194bb8d5, 0x3fe9e86319e32323, 0x3fe9fa5e8d07f29e
.quad 0x3fea0c667b5de565, 0x3fea1e7aed8eb8bb, 0x3fea309bec4a2d33, 0x3fea42c980460ad8
.quad 0x3fea5503b23e255d, 0x3fea674a8af46052, 0x3fea799e1330b358, 0x3fea8bfe53c12e59
.quad 0x3fea9e6b5579fdbf, 0x3feab0e521356eba, 0x3feac36bbfd3f37a, 0x3fead5ff3a3c2774
.quad 0x3feae89f995ad3ad, 0x3feafb4ce622f2ff, 0x3feb0e07298db666, 0x3feb20ce6c9a8952
.quad 0x3feb33a2b84f15fb, 0x3feb468415b749b1, 0x3feb59728de5593a, 0x3feb6c6e29f1c52a
.quad 0x3feb7f76f2fb5e47, 0x3feb928cf22749e4, 0x3feba5b030a1064a, 0x3febb8e0b79a6f1f
.quad 0x3febcc1e904bc1d2, 0x3febdf69c3f3a207, 0x3febf2c25bd71e09, 0x3fec06286141b33d
.quad 0x3fec199bdd85529c, 0x3fec2d1cd9fa652c, 0x3fec40ab5fffd07a, 0x3fec544778fafb22
.quad 0x3fec67f12e57d14b, 0x3fec7ba88988c933, 0x3fec8f6d9406e7b5, 0x3feca3405751c4db
.quad 0x3fecb720dcef9069, 0x3feccb0f2e6d1675, 0x3fecdf0b555dc3fa, 0x3fecf3155b5bab74
.quad 0x3fed072d4a07897c, 0x3fed1b532b08c968, 0x3fed2f87080d89f2, 0x3fed43c8eacaa1d6
.quad 0x3fed5818dcfba487, 0x3fed6c76e862e6d3, 0x3fed80e316c98398, 0x3fed955d71ff6075
.quad 0x3feda9e603db3285, 0x3fedbe7cd63a8315, 0x3fedd321f301b460, 0x3fede7d5641c0658
.quad 0x3fedfc97337b9b5f, 0x3fee11676b197d17, 0x3fee264614f5a129, 0x3fee3b333b16ee12
.quad 0x3fee502ee78b3ff6, 0x3fee653924676d76, 0x3fee7a51fbc74c83, 0x3fee8f7977cdb740
.quad 0x3feea4afa2a490da, 0x3feeb9f4867cca6e, 0x3feecf482d8e67f1, 0x3feee4aaa2188510
.quad 0x3feefa1bee615a27, 0x3fef0f9c1cb6412a, 0x3fef252b376bba97, 0x3fef3ac948dd7274
.quad 0x3fef50765b6e4540, 0x3fef6632798844f8, 0x3fef7bfdad9cbe14, 0x3fef91d802243c89
.quad 0x3fefa7c1819e90d8, 0x3fefbdba3692d514, 0x3fefd3c22b8f71f1, 0x3fefe9d96b2a23d9
.quad 0x3ff0000000000000
.align 32
.quad 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe /* _dbInvLn2 = 1/log(2) */
.align 32
.quad 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000 /* _dbLn2hi = log(2) hi*/
.align 32
.quad 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899 /* _dbLn2lo = log(2) lo*/
.align 32
.quad 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000 /* _dbShifter */
.align 32
.long 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF /* _iIndexMask */
.align 32
.quad 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD /* _dPC2 */
.align 32
.quad 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14 /* _dPC3 */
.align 32
.quad 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299 /* _dPC4 */
.align 32
.long 0x00000100, 0x00000100, 0x00000100, 0x00000100, 0x00000100, 0x00000100, 0x00000100, 0x00000100 /* _iMaxIndex */
.align 32
.quad 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 /* _lExpMask */
.align 32
.quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000 /* _dSign*/
.align 32
.long 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99 /* _iDomainRange 0x40861d9ac12a3e85 =(1021*2^K-0.5)*log(2)/2^K -needed for quick exp*/
.align 32
.type __svml_dcosh_data_internal,@object
.size __svml_dcosh_data_internal,.-__svml_dcosh_data_internal

View File

@ -0,0 +1,20 @@
/* AVX2 version of vectorized cosh, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the symbol before pulling in the generic core file, so that
   whatever the core emits under the name _ZGVeN8v_cosh is emitted here
   as _ZGVeN8v_cosh_avx2_wrapper instead.
   NOTE(review): assumes ../svml_d_cosh8_core.S defines its entry point
   as _ZGVeN8v_cosh; that file is not visible from here.  */
#define _ZGVeN8v_cosh _ZGVeN8v_cosh_avx2_wrapper
#include "../svml_d_cosh8_core.S"

View File

@ -0,0 +1,27 @@
/* Multiple versions of vectorized cosh, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Name of the public vector function this file provides a dispatcher
   for.  It must be defined before the selector header is included.  */
#define SYMBOL_NAME _ZGVeN8v_cosh
/* NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined in this
   file; presumably they come from this header -- confirm.  */
#include "ifunc-mathvec-avx512-skx.h"

/* Emit SYMBOL_NAME as an indirect function whose implementation is
   chosen by IFUNC_SELECTOR ().  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());

#ifdef SHARED
/* Shared-library builds only: declare the hidden-visibility internal
   alias __GI__ZGVeN8v_cosh for the redirected symbol.  */
__hidden_ver1 (_ZGVeN8v_cosh, __GI__ZGVeN8v_cosh, __redirect__ZGVeN8v_cosh)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,323 @@
/* Function cosh vectorized with AVX-512.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* Compute cosh(x) as (exp(x)+exp(-x))/2,
* where exp is calculated as
* exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
*
* Special cases:
*
* cosh(NaN) = quiet NaN, and raise invalid exception
* cosh(INF) = that INF
* cosh(0) = 1
* cosh(x) overflows for big x and returns MAXLOG+log(2)
*
*/
/* Byte offsets into the data table __svml_dcosh_data_internal defined at
 * the end of this file.
 *
 * _dTp_h and _dTn_h are 16-entry tables of doubles (128 bytes each);
 * every other field is one 64-byte vector (8 quads or 16 longs), which
 * is why the remaining offsets advance in steps of 64.
 *
 * The kernel in this file does not reference _dbShifter, _dPC2, _dPC3
 * or _dPC4; it uses the *_UISA variants instead.
 */
#define _dTp_h 0
#define _dTn_h 128
#define _dbShifter_UISA 256
#define _dPC2_UISA 320
#define _dPC3_UISA 384
#define _dPC4_UISA 448
#define _dPC5_UISA 512
#define _dPC6_UISA 576
#define _dPC7_UISA 640
#define _dbInvLn2 704
#define _dbLn2hi 768
#define _dbLn2lo 832
#define _dbShifter 896
#define _dPC2 960
#define _dPC3 1024
#define _dPC4 1088
#define _lExpMask 1152
#define _dSign 1216
#define _iDomainRange 1280
#include <sysdep.h>
/* _ZGVeN8v_cosh_skx -- cosh of 8 packed doubles using 512-bit registers.
 *
 * In:   zmm0 = x (8 doubles)
 * Out:  zmm0 = cosh(x) for every lane
 *
 * Register roles on the main path:
 *   zmm10        = saved copy of x (only read by the special-value path)
 *   zmm5         = |x|, later the reduced argument r
 *   zmm4         = dM = |x|/log(2) + shifter; its bit pattern also
 *                  provides the table index and the exponent adjustment
 *   zmm1         = dN, later r^2
 *   zmm11 / zmm0 = looked-up _dTp_h / _dTn_h entries
 *   edx          = bitmask of lanes to recompute with scalar cosh
 *
 * Stack frame (rsp aligned down to 64 bytes, then 192 bytes reserved):
 *   (%rsp), 8(%rsp), 16(%rsp) = saved r14, r13, r12 (special path only)
 *   64(%rsp)  = the 8 input values
 *   128(%rsp) = the 8 results
 */
.text
.section .text.evex512,"ax",@progbits
ENTRY(_ZGVeN8v_cosh_skx)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-64, %rsp
subq $192, %rsp
vmovups _dSign+__svml_dcosh_data_internal(%rip), %zmm11
vmovups _dbShifter_UISA+__svml_dcosh_data_internal(%rip), %zmm15
/*
 * Load the constants for
 * dM = |x|/log(2) + RShifter
 * and the polynomial coefficients.
 */
vmovups _dbInvLn2+__svml_dcosh_data_internal(%rip), %zmm4
vmovups _dbLn2hi+__svml_dcosh_data_internal(%rip), %zmm2
vmovups _dbLn2lo+__svml_dcosh_data_internal(%rip), %zmm3
vmovups _dPC7_UISA+__svml_dcosh_data_internal(%rip), %zmm8
vmovups _dPC6_UISA+__svml_dcosh_data_internal(%rip), %zmm9
vmovups _dPC2_UISA+__svml_dcosh_data_internal(%rip), %zmm7
vmovups _dPC3_UISA+__svml_dcosh_data_internal(%rip), %zmm6
/* Keep the original argument for the special-value path */
vmovaps %zmm0, %zmm10
/* Abs argument: zmm5 = x & ~sign_mask */
vandnpd %zmm10, %zmm11, %zmm5
/* First halves (entries 0..7) of the two lookup tables */
vmovups __svml_dcosh_data_internal(%rip), %zmm11
vmovups _dTn_h+__svml_dcosh_data_internal(%rip), %zmm0
/* dM = |x| * (1/log(2)) + RShifter */
vfmadd213pd {rn-sae}, %zmm15, %zmm5, %zmm4
/*
 * Range check, step 1: move the high dword of |x| into the low
 * dword of every qword lane.
 */
vpsrlq $32, %zmm5, %zmm12
/* dN = dM - RShifter */
vsubpd {rn-sae}, %zmm15, %zmm4, %zmm1
/* Range check, step 2: pack the 8 high dwords into ymm13 */
vpmovqd %zmm12, %ymm13
/*
 * Table lookup: the low bits of each qword of dM select one of the
 * 16 entries (first half in the register, second half from memory).
 */
vpermt2pd _dTn_h+64+__svml_dcosh_data_internal(%rip), %zmm4, %zmm0
vpermt2pd _dTp_h+64+__svml_dcosh_data_internal(%rip), %zmm4, %zmm11
/* dR = |x| - dN*Log2_hi */
vfnmadd231pd {rn-sae}, %zmm2, %zmm1, %zmm5
/*
 * With r2 = r^2, Tp/Tn the looked-up table entries scaled by 2^N and
 * 2^(-N), G2 = Tp + Tn and G1 = Tp - Tn, the result is
 * G2*(1 + r2*(a2 + r2*(a4 + r2*a6)))
 * + G1*(r + r*r2*(a3 + r2*(a5 + r2*a7)))
 */
vmovups _dPC5_UISA+__svml_dcosh_data_internal(%rip), %zmm12
/* Shift the bits of dM above the 4 index bits up to the exponent field */
vpsllq $48, %zmm4, %zmm2
/* dR = (|x| - dN*Log2_hi) - dN*Log2_lo */
vfnmadd231pd {rn-sae}, %zmm3, %zmm1, %zmm5
/* zmm1 = r^2 */
vmulpd {rn-sae}, %zmm5, %zmm5, %zmm1
/* zmm12 = a5 + a7*r^2 */
vfmadd231pd {rn-sae}, %zmm1, %zmm8, %zmm12
vmovups _dPC4_UISA+__svml_dcosh_data_internal(%rip), %zmm8
/* zmm12 = a3 + r^2*(a5 + a7*r^2) */
vfmadd213pd {rn-sae}, %zmm6, %zmm1, %zmm12
/* zmm8 = a4 + a6*r^2 */
vfmadd231pd {rn-sae}, %zmm1, %zmm9, %zmm8
/* zmm8 = a2 + r^2*(a4 + a6*r^2) */
vfmadd213pd {rn-sae}, %zmm7, %zmm1, %zmm8
/*
 * Range check, step 3: a lane is special when the high dword of |x|
 * is greater (signed compare) than _iDomainRange; edx collects one
 * bit per lane.
 */
vpcmpgtd _iDomainRange+__svml_dcosh_data_internal(%rip), %ymm13, %ymm14
vmovmskps %ymm14, %edx
/* dOut = r^2*(a2 + r^2*(a4 + a6*r^2)) */
vmulpd {rn-sae}, %zmm1, %zmm8, %zmm6
/* lM now is an EXP(2^N): keep only the exponent field */
vpandq _lExpMask+__svml_dcosh_data_internal(%rip), %zmm2, %zmm3
/* Integer add/sub on the exponent field: Tp *= 2^N, Tn *= 2^(-N) */
vpaddq %zmm3, %zmm11, %zmm4
vpsubq %zmm3, %zmm0, %zmm0
/* G1 = Tp - Tn (zmm14), G2 = Tp + Tn (zmm13) */
vsubpd {rn-sae}, %zmm0, %zmm4, %zmm14
vaddpd {rn-sae}, %zmm0, %zmm4, %zmm13
/* dM = r^2*(a3 + r^2*(a5 + a7*r^2)) */
vmulpd {rn-sae}, %zmm1, %zmm12, %zmm0
/* zmm6 = G2 + G2*dOut */
vfmadd213pd {rn-sae}, %zmm13, %zmm13, %zmm6
/* dM = r + r*dM */
vfmadd213pd {rn-sae}, %zmm5, %zmm5, %zmm0
/* result = G1*dM + zmm6 */
vfmadd213pd {rn-sae}, %zmm6, %zmm14, %zmm0
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm10
/* Restore registers
* and exit the function
*/
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
/* Spill the inputs and the vector results so single lanes can be patched */
vmovups %zmm10, 64(%rsp)
vmovups %zmm0, 128(%rsp)
# LOE rbx r12 r13 r14 r15 edx zmm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
/* Clear the upper vector state before calling scalar code */
vzeroupper
/* r12 = lane index, r13 = lane mask, r14 = index kept across the call */
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check: CF = bit r12d of the lane mask
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop: advance to the next of the 8 lanes
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $8, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
/* All lanes handled: restore callee-saved registers, reload the result */
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovups 128(%rsp), %zmm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 zmm0
/* Scalar math function call
* to process special input: result[i] = cosh(x[i])
*/
L(SCALAR_MATH_CALL):
/* The 32-bit move zero-extends the lane index for use as an address index */
movl %r12d, %r14d
movsd 64(%rsp,%r14,8), %xmm0
call cosh@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movsd %xmm0, 128(%rsp,%r14,8)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVeN8v_cosh_skx)
/* Read-only constant table used by _ZGVeN8v_cosh_skx.
 * Two 16-entry lookup tables come first, followed by constants that are
 * each replicated to fill a 64-byte vector.  Every field is aligned to
 * 64 bytes.
 */
.section .rodata, "a"
.align 64
/* Layout description only: this typedef is seen by the compiler solely
 * when __svml_dcosh_data_internal_typedef is defined.  Its field order
 * matches the order of the data emitted below.
 */
#ifdef __svml_dcosh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(64)) VUINT32 _dTp_h[(1<<4)][2];
__declspec(align(64)) VUINT32 _dTn_h[(1<<4)][2];
__declspec(align(64)) VUINT32 _dbShifter_UISA[8][2];
__declspec(align(64)) VUINT32 _dPC2_UISA[8][2];
__declspec(align(64)) VUINT32 _dPC3_UISA[8][2];
__declspec(align(64)) VUINT32 _dPC4_UISA[8][2];
__declspec(align(64)) VUINT32 _dPC5_UISA[8][2];
__declspec(align(64)) VUINT32 _dPC6_UISA[8][2];
__declspec(align(64)) VUINT32 _dPC7_UISA[8][2];
__declspec(align(64)) VUINT32 _dbInvLn2[8][2];
__declspec(align(64)) VUINT32 _dbLn2hi[8][2];
__declspec(align(64)) VUINT32 _dbLn2lo[8][2];
__declspec(align(64)) VUINT32 _dbShifter[8][2];
__declspec(align(64)) VUINT32 _dPC2[8][2];
__declspec(align(64)) VUINT32 _dPC3[8][2];
__declspec(align(64)) VUINT32 _dPC4[8][2];
__declspec(align(64)) VUINT32 _lExpMask[8][2];
__declspec(align(64)) VUINT32 _dSign[8][2]; //0x8000000000000000
__declspec(align(64)) VUINT32 _iDomainRange[16][1];
} __svml_dcosh_data_internal;
#endif
__svml_dcosh_data_internal:
/*== _dTp_h: 2^(j/16 - 1), j = 0..15 ==*/
.quad 0x3fe0000000000000, 0x3fe0b5586cf9890f, 0x3fe172b83c7d517b, 0x3fe2387a6e756238
.quad 0x3fe306fe0a31b715, 0x3fe3dea64c123422, 0x3fe4bfdad5362a27, 0x3fe5ab07dd485429
.quad 0x3fe6a09e667f3bcd, 0x3fe7a11473eb0187, 0x3fe8ace5422aa0db, 0x3fe9c49182a3f090
.quad 0x3feae89f995ad3ad, 0x3fec199bdd85529c, 0x3fed5818dcfba487, 0x3feea4afa2a490da
/*== _dTn_h: 2^(-j/16 - 1), j = 0..15 ==*/
.align 64
.quad 0x3fe0000000000000, 0x3fdea4afa2a490da, 0x3fdd5818dcfba487, 0x3fdc199bdd85529c
.quad 0x3fdae89f995ad3ad, 0x3fd9c49182a3f090, 0x3fd8ace5422aa0db, 0x3fd7a11473eb0187
.quad 0x3fd6a09e667f3bcd, 0x3fd5ab07dd485429, 0x3fd4bfdad5362a27, 0x3fd3dea64c123422
.quad 0x3fd306fe0a31b715, 0x3fd2387a6e756238, 0x3fd172b83c7d517b, 0x3fd0b5586cf9890f
.align 64
/* 0x42F8000000000000 is 1.5*2^48; at that magnitude one double ulp is
 * 2^-4, which matches the 16-entry tables above.
 */
.quad 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000 /* _dbShifter_UISA */
.align 64
/* _dPC2_UISA.._dPC7_UISA are close to 1/2!, 1/3!, ..., 1/7!. */
.quad 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004 /* _dPC2_UISA */
.align 64
.quad 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543 /* _dPC3_UISA */
.align 64
.quad 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37 /* _dPC4_UISA */
.align 64
.quad 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c /* _dPC5_UISA */
.align 64
.quad 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116 /* _dPC6_UISA */
.align 64
.quad 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da /* _dPC7_UISA */
/*== Remaining broadcast constants.  There is no _dbT table in this
 *   layout; the lookup tables are _dTp_h and _dTn_h above. ==*/
.align 64
.quad 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe /* _dbInvLn2 = 1/log(2) */
.align 64
.quad 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000 /* _dbLn2hi = log(2) hi*/
.align 64
.quad 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899 /* _dbLn2lo = log(2) lo*/
.align 64
/* _dbShifter and _dPC2.._dPC4 are not referenced by _ZGVeN8v_cosh_skx. */
.quad 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000 /* _dbShifter */
.align 64
.quad 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD /* _dPC2 */
.align 64
.quad 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14 /* _dPC3 */
.align 64
.quad 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299 /* _dPC4 */
.align 64
.quad 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 /* _lExpMask */
.align 64
.quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000 /* _dSign*/
.align 64
/* The kernel compares the high dword of |x| against this word with a
 * signed "greater than" (vpcmpgtd on a 256-bit register, so only the
 * first 8 of the 16 longs are read).  The stored 0x40861d99 is one less
 * than the high dword of the double quoted below.
 */
.long 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99 /* _iDomainRange 0x40861d9ac12a3e85 =(1021*2^K-0.5)*log(2)/2^K -needed for quick exp*/
.align 64
.type __svml_dcosh_data_internal,@object
.size __svml_dcosh_data_internal,.-__svml_dcosh_data_internal

View File

@ -0,0 +1,20 @@
/* AVX2 version of vectorized coshf.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the symbol before pulling in the generic core file, so that
   whatever the core emits under the name _ZGVeN16v_coshf is emitted
   here as _ZGVeN16v_coshf_avx2_wrapper instead.
   NOTE(review): assumes ../svml_s_coshf16_core.S defines its entry
   point as _ZGVeN16v_coshf; that file is not visible from here.  */
#define _ZGVeN16v_coshf _ZGVeN16v_coshf_avx2_wrapper
#include "../svml_s_coshf16_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized coshf, vector length is 16.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Name of the public vector function this file provides a dispatcher
   for.  It must be defined before the selector header is included.  */
#define SYMBOL_NAME _ZGVeN16v_coshf
/* NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined in this
   file; presumably they come from this header -- confirm.  */
#include "ifunc-mathvec-avx512-skx.h"

/* Emit SYMBOL_NAME as an indirect function whose implementation is
   chosen by IFUNC_SELECTOR ().  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());

#ifdef SHARED
/* Shared-library builds only: declare the hidden-visibility internal
   alias __GI__ZGVeN16v_coshf for the redirected symbol.  */
__hidden_ver1 (_ZGVeN16v_coshf, __GI__ZGVeN16v_coshf,
__redirect__ZGVeN16v_coshf)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,321 @@
/* Function coshf vectorized with AVX-512.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* Compute cosh(x) as (exp(x)+exp(-x))/2,
* where exp is calculated as
* exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
*
* Special cases:
*
* cosh(NaN) = quiet NaN, and raise invalid exception
* cosh(INF) = that INF
* cosh(0) = 1
* cosh(x) overflows for big x and returns MAXLOG+log(2)
*
*/
/* Byte offsets into the data table __svml_scosh_data_internal defined at
 * the end of this file.
 *
 * _sExp_tbl_PH and _sExp_tbl_NH are 32-entry tables of floats
 * (128 bytes each); the remaining offsets advance in steps of 64, one
 * 64-byte vector per constant.
 *
 * The kernel in this file does not reference _sPC1_UISA, _sShifter,
 * _iDomainRange, _sPC1, _sPC2 or _sPC3.
 */
#define _sExp_tbl_PH 0
#define _sExp_tbl_NH 128
#define _sShifter_UISA 256
#define _iDomainRange_UISA 320
#define _sPC1_UISA 384
#define _sPC2_UISA 448
#define _sPC3_UISA 512
#define _sInvLn2 576
#define _sLn2hi 640
#define _sLn2lo 704
#define _sSign 768
#define _iExpMask 832
#define _sShifter 896
#define _iDomainRange 960
#define _sPC1 1024
#define _sPC2 1088
#define _sPC3 1152
#include <sysdep.h>
/* _ZGVeN16v_coshf_skx -- coshf of 16 packed floats using 512-bit registers.
 *
 * In:   zmm0 = x (16 floats); left untouched on the main path so the
 *       special-value path can spill it
 * Out:  zmm0 = coshf(x) for every lane (computed in zmm6, copied at EXIT)
 *
 * Register roles on the main path:
 *   zmm1         = |x|, later the reduced argument r, later r^2*a2
 *   zmm10        = sM = |x|/log(2) + shifter; its bit pattern also
 *                  provides the table index and the exponent adjustment
 *   zmm8         = sN = sM - shifter
 *   zmm12 / zmm13 = looked-up _sExp_tbl_PH / _sExp_tbl_NH entries
 *   zmm5         = all-ones in out-of-range lanes, zero elsewhere
 *   edx          = bitmask of lanes to recompute with scalar coshf
 *
 * Stack frame (rsp aligned down to 64 bytes, then 192 bytes reserved):
 *   (%rsp), 8(%rsp), 16(%rsp) = saved r14, r13, r12 (special path only)
 *   64(%rsp)  = the 16 input values
 *   128(%rsp) = the 16 results
 */
.text
.section .text.evex512,"ax",@progbits
ENTRY(_ZGVeN16v_coshf_skx)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-64, %rsp
subq $192, %rsp
vmovups _sSign+__svml_scosh_data_internal(%rip), %zmm4
vmovups _sShifter_UISA+__svml_scosh_data_internal(%rip), %zmm6
/*
 * Load the constants for
 * sM = |x|/log(2) + RShifter
 */
vmovups _sInvLn2+__svml_scosh_data_internal(%rip), %zmm10
vmovups _sLn2hi+__svml_scosh_data_internal(%rip), %zmm7
vmovups _sLn2lo+__svml_scosh_data_internal(%rip), %zmm9
/* a3: coefficient of the odd (sinh) polynomial */
vmovups _sPC3_UISA+__svml_scosh_data_internal(%rip), %zmm2
/* a2: coefficient of the even (cosh) polynomial */
vmovups _sPC2_UISA+__svml_scosh_data_internal(%rip), %zmm3
/* First halves (entries 0..15) of the two lookup tables */
vmovups __svml_scosh_data_internal(%rip), %zmm12
vmovups _sExp_tbl_NH+__svml_scosh_data_internal(%rip), %zmm13
/*
 * Implementation
 * Abs argument: zmm1 = x & ~sign_mask
 */
vandnps %zmm0, %zmm4, %zmm1
/* Range check: start with all lanes flagged (zmm5 = all ones) */
vpternlogd $255, %zmm5, %zmm5, %zmm5
/* sM = |x| * (1/log(2)) + RShifter */
vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm10
/* k1 = lanes whose |x| bit pattern is below _iDomainRange_UISA (signed) */
vpcmpd $1, _iDomainRange_UISA+__svml_scosh_data_internal(%rip), %zmm1, %k1
/* Shift the bits of sM above the 5 index bits up to the exponent field */
vpslld $18, %zmm10, %zmm11
/*
 * R
 * sN = sM - RShifter
 */
vsubps {rn-sae}, %zmm6, %zmm10, %zmm8
/*
 * Table lookup: the low bits of each dword of sM select one of the
 * 32 entries (first half in the register, second half from memory).
 */
vpermt2ps _sExp_tbl_PH+64+__svml_scosh_data_internal(%rip), %zmm10, %zmm12
vpermt2ps _sExp_tbl_NH+64+__svml_scosh_data_internal(%rip), %zmm10, %zmm13
/* Clear the flag in the in-range lanes (~v & v == 0 under mask k1) */
vpandnd %zmm1, %zmm1, %zmm5{%k1}
/* sR = sX - sN*Log2_hi */
vfnmadd231ps {rn-sae}, %zmm7, %zmm8, %zmm1
/* k0 = lanes still flagged, i.e. out of range */
vptestmd %zmm5, %zmm5, %k0
/* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
vfnmadd231ps {rn-sae}, %zmm9, %zmm8, %zmm1
kmovw %k0, %edx
/* zmm4 = r^2 */
vmulps {rn-sae}, %zmm1, %zmm1, %zmm4
/* zmm2 = r^2*a3 */
vmulps {rn-sae}, %zmm4, %zmm2, %zmm2
/* sSinh_r = r + r*(r^2*(a3)) */
vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm2
/* sOut = r^2*(a2) */
vmulps {rn-sae}, %zmm4, %zmm3, %zmm1
/* iM now is an EXP(2^N): mask the shifted bits with _iExpMask */
vpandd _iExpMask+__svml_scosh_data_internal(%rip), %zmm11, %zmm14
/* Integer add/sub on the exponent field: Th *= 2^N, T_h *= 2^(-N) */
vpaddd %zmm14, %zmm12, %zmm15
vpsubd %zmm14, %zmm13, %zmm10
/* sG2 = 2^N*Th + 2^(-N)*T_h */
vaddps {rn-sae}, %zmm10, %zmm15, %zmm5
/* sG1 = 2^N*Th - 2^(-N)*T_h */
vsubps {rn-sae}, %zmm10, %zmm15, %zmm6
/* res = sG1*(r + r*(r^2*(a3))) + sG2*(1+r^2*(a2)) */
vfmadd213ps {rn-sae}, %zmm5, %zmm5, %zmm1
vfmadd213ps {rn-sae}, %zmm1, %zmm2, %zmm6
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm6
/* Restore registers
* and exit the function
*/
L(EXIT):
/* The result was accumulated in zmm6; return it in zmm0 */
vmovaps %zmm6, %zmm0
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
/* Spill the inputs and the vector results so single lanes can be patched */
vmovups %zmm0, 64(%rsp)
vmovups %zmm6, 128(%rsp)
# LOE rbx r12 r13 r14 r15 edx zmm6
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
/* Clear the upper vector state before calling scalar code */
vzeroupper
/* r12 = lane index, r13 = lane mask, r14 = index kept across the call */
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check: CF = bit r12d of the lane mask
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop: advance to the next of the 16 lanes
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $16, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
/* All lanes handled: restore callee-saved registers, reload the result */
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovups 128(%rsp), %zmm6
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 zmm6
/* Scalar math function call
* to process special input: result[i] = coshf(x[i])
*/
L(SCALAR_MATH_CALL):
/* The 32-bit move zero-extends the lane index for use as an address index */
movl %r12d, %r14d
movss 64(%rsp,%r14,4), %xmm0
call coshf@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movss %xmm0, 128(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVeN16v_coshf_skx)
/* Read-only constant table __svml_scosh_data_internal (16-lane layout).
* Presumably consumed by the _ZGVeN16v_coshf_skx kernel whose END directive
* precedes this table - the loads themselves are outside this view.
* Layout, in order: two 32-entry lookup tables (_sExp_tbl_PH, _sExp_tbl_NH)
* followed by broadcast constants of 16 identical 32-bit lanes each.  Every
* field starts on a 64-byte boundary.  The typedef below is only seen when
* __svml_scosh_data_internal_typedef is defined; it lists the fields in the
* same order as the data and reads as layout documentation.
*/
.section .rodata, "a"
.align 64
#ifdef __svml_scosh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(64)) VUINT32 _sExp_tbl_PH[32][1];
__declspec(align(64)) VUINT32 _sExp_tbl_NH[32][1];
__declspec(align(64)) VUINT32 _sShifter_UISA[16][1];
__declspec(align(64)) VUINT32 _iDomainRange_UISA[16][1];
__declspec(align(64)) VUINT32 _sPC1_UISA[16][1];
__declspec(align(64)) VUINT32 _sPC2_UISA[16][1];
__declspec(align(64)) VUINT32 _sPC3_UISA[16][1];
__declspec(align(64)) VUINT32 _sInvLn2[16][1];
__declspec(align(64)) VUINT32 _sLn2hi[16][1];
__declspec(align(64)) VUINT32 _sLn2lo[16][1];
__declspec(align(64)) VUINT32 _sSign[16][1];
__declspec(align(64)) VUINT32 _iExpMask[16][1];
__declspec(align(64)) VUINT32 _sShifter[16][1];
__declspec(align(64)) VUINT32 _iDomainRange[16][1];
__declspec(align(64)) VUINT32 _sPC1[16][1];
__declspec(align(64)) VUINT32 _sPC2[16][1];
__declspec(align(64)) VUINT32 _sPC3[16][1];
} __svml_scosh_data_internal;
#endif
__svml_scosh_data_internal:
/* _sExp_tbl_PH 2^(i/32-1), i=0..31 */
.long 0x3f000000, 0x3f02cd87, 0x3f05aac3, 0x3f08980f
.long 0x3f0b95c2, 0x3f0ea43a, 0x3f11c3d3, 0x3f14f4f0
.long 0x3f1837f0, 0x3f1b8d3a, 0x3f1ef532, 0x3f227043
.long 0x3f25fed7, 0x3f29a15b, 0x3f2d583f, 0x3f3123f6
.long 0x3f3504f3, 0x3f38fbaf, 0x3f3d08a4, 0x3f412c4d
.long 0x3f45672a, 0x3f49b9be, 0x3f4e248c, 0x3f52a81e
.long 0x3f5744fd, 0x3f5bfbb8, 0x3f60ccdf, 0x3f65b907
.long 0x3f6ac0c7, 0x3f6fe4ba, 0x3f75257d, 0x3f7a83b3
/* _sExp_tbl_NH 2^(-i/32-1), i=0..31 */
.align 64
.long 0x3f000000, 0x3efa83b3, 0x3ef5257d, 0x3eefe4ba
.long 0x3eeac0c7, 0x3ee5b907, 0x3ee0ccdf, 0x3edbfbb8
.long 0x3ed744fd, 0x3ed2a81e, 0x3ece248c, 0x3ec9b9be
.long 0x3ec5672a, 0x3ec12c4d, 0x3ebd08a4, 0x3eb8fbaf
.long 0x3eb504f3, 0x3eb123f6, 0x3ead583f, 0x3ea9a15b
.long 0x3ea5fed7, 0x3ea27043, 0x3e9ef532, 0x3e9b8d3a
.long 0x3e9837f0, 0x3e94f4f0, 0x3e91c3d3, 0x3e8ea43a
.long 0x3e8b95c2, 0x3e88980f, 0x3e85aac3, 0x3e82cd87
/* Broadcast constants: one value repeated in all 16 lanes per field. */
.align 64
.long 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000 /* 1.5*2^18 _sShifter_UISA */
.align 64
.long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange_UISA */
.align 64
.long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1_UISA=1 */
.align 64
.long 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f /* _sPC2_UISA */
.align 64
.long 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd /* _sPC3_UISA */
.align 64
.long 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ //k=0
.align 64
.long 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */
.align 64
.long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4 /* _sLn2lo */
.align 64
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */
.align 64
.long 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 /* _iExpMask */
.align 64
.long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
.align 64
.long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */
.align 64
.long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
.align 64
.long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */
.align 64
.long 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */
/* Trailing alignment pads the table to a multiple of 64 bytes before .size is computed. */
.align 64
.type __svml_scosh_data_internal,@object
.size __svml_scosh_data_internal,.-__svml_scosh_data_internal

View File

@ -0,0 +1,20 @@
/* SSE2 version of vectorized coshf, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the generic entry point before pulling in the generic source, so
that the code assembled from the included file is emitted under the
_sse2-suffixed symbol instead of _ZGVbN4v_coshf.  */
#define _ZGVbN4v_coshf _ZGVbN4v_coshf_sse2
#include "../svml_s_coshf4_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized coshf, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME must be defined before the selector header is included; the
header (not shown here) presumably derives REDIRECT_NAME and
IFUNC_SELECTOR from it - confirm against ifunc-mathvec-sse4_1.h.  */
#define SYMBOL_NAME _ZGVbN4v_coshf
#include "ifunc-mathvec-sse4_1.h"
/* Define the public symbol as an IFUNC resolved through IFUNC_SELECTOR ().  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Shared builds only: declare the hidden-visibility __GI_ alias of the
symbol, tied to the __redirect_ name.  */
__hidden_ver1 (_ZGVbN4v_coshf, __GI__ZGVbN4v_coshf,
__redirect__ZGVbN4v_coshf)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,305 @@
/* Function coshf vectorized with SSE4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* Compute cosh(x) as (exp(x)+exp(-x))/2,
* where exp is calculated as
* exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
*
* Special cases:
*
* cosh(NaN) = quiet NaN, and raise invalid exception
* cosh(+/-INF) = +INF (cosh is an even function)
* cosh(0) = 1
* cosh(x) overflows once |x| exceeds MAXLOG+log(2)
*
*/
/* Byte offsets of the fields of data table __svml_scosh_data_internal.
* Each field is one 16-byte vector (four 32-bit lanes), so consecutive
* offsets differ by 16 and must stay in the same order as the .long rows
* of the table at the end of this file.
* _sPC1 is defined for completeness; the kernel below never loads it.
*/
#define _sInvLn2 0
#define _sLn2hi 16
#define _sLn2lo 32
#define _sSign 48
#define _sShifter 64
#define _iDomainRange 80
#define _sPC1 96
#define _sPC2 112
#define _sPC3 128
#define _sPC4 144
#define _sPC5 160
#define _sPC6 176
#define _iHalf 192
#include <sysdep.h>
.text
.section .text.sse4,"ax",@progbits
/* _ZGVbN4v_coshf_sse4: cosh of four packed single-precision values.
* In:  xmm0 = the four arguments.
* Out: xmm0 = the four results (copied from xmm2 at L(EXIT)).
* Fast path: every lane is evaluated with the range reduction and
* polynomial scheme commented below.
* Slow path: lanes whose |x| bit pattern is >= _iDomainRange are
* recomputed one at a time by calling the scalar coshf.
* Frame (72 bytes): 0/8/16(%rsp) = saved r14/r13/r12 (slow path only),
* 32(%rsp) = copy of the arguments, 48(%rsp) = result vector.
*/
ENTRY(_ZGVbN4v_coshf_sse4)
subq $72, %rsp
cfi_def_cfa_offset(80)
/*
* Implementation
* Abs argument
*/
movups _sSign+__svml_scosh_data_internal(%rip), %xmm1
/*
* Load argument
* dM = x/log(2) + RShifter
*/
movups _sInvLn2+__svml_scosh_data_internal(%rip), %xmm9
/* xmm1 = ~sign_mask & x = |x| */
andnps %xmm0, %xmm1
mulps %xmm1, %xmm9
/* Check for overflow\underflow */
movaps %xmm1, %xmm3
movups _sShifter+__svml_scosh_data_internal(%rip), %xmm4
movups _sLn2hi+__svml_scosh_data_internal(%rip), %xmm5
addps %xmm4, %xmm9
/*
* R
* sN = sM - RShifter
*/
movaps %xmm9, %xmm6
/*
* G1,G2 2^N,2^(-N)
* iM now is an EXP(2^N)
*/
pslld $23, %xmm9
movups _sLn2lo+__svml_scosh_data_internal(%rip), %xmm7
subps %xmm4, %xmm6
/* sR = sX - sN*Log2_hi */
mulps %xmm6, %xmm5
/* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
mulps %xmm6, %xmm7
movdqu _iDomainRange+__svml_scosh_data_internal(%rip), %xmm2
/* Signed 32-bit integer compares on the raw bit patterns of |x|:
* xmm3 = (|x| > _iDomainRange), xmm2 = (|x| == _iDomainRange).
*/
pcmpgtd %xmm2, %xmm3
pcmpeqd %xmm1, %xmm2
/*
* sinh(r) = r*((a1=1)+r^2*(a3+r^2*(a5+{v1 r^2*a7})))) = r + r*(r^2*(a3+r^2*(a5+r^2*a7))) ....
* sSinh_r = (a3+r^2*a5)
*/
movups _sPC5+__svml_scosh_data_internal(%rip), %xmm10
/* xmm3 = (|x| >= _iDomainRange): lanes that need the scalar fallback */
por %xmm2, %xmm3
/*
* cosh(X) = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2)
* sOut = (a4 +a6*sR2)
*/
movups _sPC6+__svml_scosh_data_internal(%rip), %xmm11
subps %xmm5, %xmm1
/* edx = one bit per lane flagged for the scalar fallback */
movmskps %xmm3, %edx
movdqu _iHalf+__svml_scosh_data_internal(%rip), %xmm8
subps %xmm7, %xmm1
/* sR2 = sR^2,shuffled */
movaps %xmm1, %xmm13
movdqa %xmm8, %xmm2
mulps %xmm1, %xmm13
/* xmm2 = _iHalf with N added to the exponent field: 2^(N-1) */
paddd %xmm9, %xmm2
mulps %xmm13, %xmm10
/* xmm8 = _iHalf with N subtracted from the exponent field: 2^(-N-1) */
psubd %xmm9, %xmm8
mulps %xmm13, %xmm11
addps _sPC3+__svml_scosh_data_internal(%rip), %xmm10
addps _sPC4+__svml_scosh_data_internal(%rip), %xmm11
/* sSinh_r = r^2*(a3+r^2*a5) */
mulps %xmm13, %xmm10
/* sOut = a2+sR2*(a4+a6*sR2) */
mulps %xmm13, %xmm11
/* sSinh_r = r + r*(r^2*(a3+r^2*a5)) */
mulps %xmm1, %xmm10
addps _sPC2+__svml_scosh_data_internal(%rip), %xmm11
addps %xmm10, %xmm1
/* sOut = sR2*(a2+sR2*(a4+a6*sR2) */
mulps %xmm11, %xmm13
/* sG1 = 2^(N-1)-2^(-N-1) */
movdqa %xmm2, %xmm12
/* sG2 = 2^(N-1)+2^(-N-1) */
addps %xmm8, %xmm2
subps %xmm8, %xmm12
/* sOut = sG2*sR2*(a2+sR2*(a4+a6*sR2) */
mulps %xmm2, %xmm13
/* sOut = sG1*sinh(dR)+sG2*sR2*(a2+sR2*(a4+a6*sR2) */
mulps %xmm1, %xmm12
addps %xmm12, %xmm13
/* sOut = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2) */
addps %xmm13, %xmm2
/* Ret H */
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm2
/* Restore registers
* and exit the function
*/
L(EXIT):
movaps %xmm2, %xmm0
addq $72, %rsp
cfi_def_cfa_offset(8)
ret
cfi_def_cfa_offset(80)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
/* Spill the original arguments and the fast-path results so that single
* lanes can be read and overwritten through memory.
*/
movups %xmm0, 32(%rsp)
movups %xmm2, 48(%rsp)
# LOE rbx rbp r12 r13 r14 r15 edx
xorl %eax, %eax
/* r12, r13 and r14 are saved because they carry the loop state across
* the coshf calls: r12d = lane index (starts at 0), r13d = lane mask,
* r14 = zero-extended lane index used for addressing.
*/
movq %r12, 16(%rsp)
cfi_offset(12, -64)
movl %eax, %r12d
movq %r13, 8(%rsp)
cfi_offset(13, -72)
movl %edx, %r13d
movq %r14, (%rsp)
cfi_offset(14, -80)
# LOE rbx rbp r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
/* CF = bit r12d of the lane mask */
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx rbp r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $4, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx rbp r15 r12d r13d
/* All four lanes visited: restore the saved registers and reload the
* (possibly patched) result vector.
*/
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
movups 48(%rsp), %xmm2
/* Go to exit */
jmp L(EXIT)
cfi_offset(12, -64)
cfi_offset(13, -72)
cfi_offset(14, -80)
# LOE rbx rbp r12 r13 r14 r15 xmm2
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
/* The 32-bit move zero-extends the lane index into r14 */
movl %r12d, %r14d
/* Load lane r14 of the saved arguments */
movss 32(%rsp,%r14,4), %xmm0
call coshf@PLT
# LOE rbx rbp r14 r15 r12d r13d xmm0
/* Overwrite lane r14 of the saved result vector */
movss %xmm0, 48(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx rbp r15 r12d r13d
END(_ZGVbN4v_coshf_sse4)
/* Read-only constant table for _ZGVbN4v_coshf_sse4.
* Every field is one value broadcast to four 32-bit lanes and starts on a
* 16-byte boundary; the field order must match the offset macros
* (_sInvLn2 ... _iHalf) defined at the top of this file.  The typedef
* below is only seen when __svml_scosh_data_internal_typedef is defined
* and lists the fields in the same order as the data.
*/
.section .rodata, "a"
.align 16
#ifdef __svml_scosh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(16)) VUINT32 _sInvLn2[4][1];
__declspec(align(16)) VUINT32 _sLn2hi[4][1];
__declspec(align(16)) VUINT32 _sLn2lo[4][1];
__declspec(align(16)) VUINT32 _sSign[4][1];
__declspec(align(16)) VUINT32 _sShifter[4][1];
__declspec(align(16)) VUINT32 _iDomainRange[4][1];
__declspec(align(16)) VUINT32 _sPC1[4][1];
__declspec(align(16)) VUINT32 _sPC2[4][1];
__declspec(align(16)) VUINT32 _sPC3[4][1];
__declspec(align(16)) VUINT32 _sPC4[4][1];
__declspec(align(16)) VUINT32 _sPC5[4][1];
__declspec(align(16)) VUINT32 _sPC6[4][1];
__declspec(align(16)) VUINT32 _iHalf[4][1];
} __svml_scosh_data_internal;
#endif
__svml_scosh_data_internal:
.long 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ //k=0
.align 16
.long 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */
.align 16
.long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4 /* _sLn2lo */
.align 16
/* Sign-bit mask; the kernel uses it with andnps to form |x| */
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */
.align 16
.long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
.align 16
/* Threshold compared as an integer against the bit pattern of |x| */
.long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */
.align 16
.long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
.align 16
.long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */
.align 16
.long 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */
.align 16
.long 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72 /* _sPC4 */
.align 16
.long 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461 /* _sPC5 */
.align 16
.long 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3 /* _sPC6 */
// Integer constants
.align 16
/* Bit pattern of 0.5f; the kernel adds/subtracts the shifted N to its exponent field */
.long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _iHalf*/
.align 16
.type __svml_scosh_data_internal,@object
.size __svml_scosh_data_internal,.-__svml_scosh_data_internal

View File

@ -0,0 +1,20 @@
/* SSE version of vectorized coshf, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the generic entry point before pulling in the generic source, so
that the code assembled from the included file is emitted under the
_sse_wrapper-suffixed symbol instead of _ZGVdN8v_coshf.  */
#define _ZGVdN8v_coshf _ZGVdN8v_coshf_sse_wrapper
#include "../svml_s_coshf8_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized coshf, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME must be defined before the selector header is included; the
header (not shown here) presumably derives REDIRECT_NAME and
IFUNC_SELECTOR from it - confirm against ifunc-mathvec-avx2.h.  */
#define SYMBOL_NAME _ZGVdN8v_coshf
#include "ifunc-mathvec-avx2.h"
/* Define the public symbol as an IFUNC resolved through IFUNC_SELECTOR ().  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Shared builds only: declare the hidden-visibility __GI_ alias of the
symbol, tied to the __redirect_ name.  */
__hidden_ver1 (_ZGVdN8v_coshf, __GI__ZGVdN8v_coshf,
__redirect__ZGVdN8v_coshf)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,308 @@
/* Function coshf vectorized with AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* Compute cosh(x) as (exp(x)+exp(-x))/2,
* where exp is calculated as
* exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
*
* Special cases:
*
* cosh(NaN) = quiet NaN, and raise invalid exception
* cosh(+/-INF) = +INF (cosh is an even function)
* cosh(0) = 1
* cosh(x) overflows once |x| exceeds MAXLOG+log(2)
*
*/
/* Byte offsets of the fields of data table __svml_scosh_data_internal.
* Each field is one 32-byte vector (eight 32-bit lanes), so consecutive
* offsets differ by 32 and must stay in the same order as the .long rows
* of the table at the end of this file.
* _sPC1 is defined for completeness; the kernel below never loads it.
*/
#define _sInvLn2 0
#define _sLn2hi 32
#define _sLn2lo 64
#define _sSign 96
#define _sShifter 128
#define _iDomainRange 160
#define _sPC1 192
#define _sPC2 224
#define _sPC3 256
#define _sPC4 288
#define _sPC5 320
#define _sPC6 352
#define _iHalf 384
#include <sysdep.h>
.text
.section .text.avx2,"ax",@progbits
/* _ZGVdN8v_coshf_avx2: cosh of eight packed single-precision values.
* In:  ymm0 = the eight arguments (kept in ymm1 for the slow path).
* Out: ymm0 = the eight results.
* Fast path: every lane is evaluated with the range reduction and
* polynomial scheme commented below, using FMA instructions.
* Slow path: lanes whose |x| bit pattern is >= _iDomainRange are
* recomputed one at a time by calling the scalar coshf.
* Frame: rbp-based, rsp realigned to 32 bytes, then 96 bytes reserved:
* 0/8/16(%rsp) = saved r14/r13/r12 (slow path only),
* 32(%rsp) = copy of the arguments, 64(%rsp) = result vector.
*/
ENTRY(_ZGVdN8v_coshf_avx2)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-32, %rsp
subq $96, %rsp
vmovups _sSign+__svml_scosh_data_internal(%rip), %ymm2
vmovups _sShifter+__svml_scosh_data_internal(%rip), %ymm7
/*
* Load argument
* dM = x/log(2) + RShifter
*/
vmovups _sInvLn2+__svml_scosh_data_internal(%rip), %ymm10
vmovups _sLn2hi+__svml_scosh_data_internal(%rip), %ymm8
vmovups _iDomainRange+__svml_scosh_data_internal(%rip), %ymm3
/*
* sinh(r) = r*((a1=1)+r^2*(a3+r^2*(a5+{v1 r^2*a7})))) = r + r*(r^2*(a3+r^2*(a5+r^2*a7))) ....
* sSinh_r = (a3+r^2*a5)
*/
vmovups _sPC5+__svml_scosh_data_internal(%rip), %ymm15
vmovups _iHalf+__svml_scosh_data_internal(%rip), %ymm11
/* Keep the untouched arguments in ymm1; ymm0 is reused as a work register */
vmovaps %ymm0, %ymm1
/*
* Implementation
* Abs argument
*/
vandnps %ymm1, %ymm2, %ymm0
/* ymm10 = |x| * _sInvLn2 + _sShifter */
vfmadd213ps %ymm7, %ymm0, %ymm10
/*
* R
* sN = sM - RShifter
*/
vsubps %ymm7, %ymm10, %ymm9
/*
* G1,G2 2^N,2^(-N)
* iM now is an EXP(2^N)
*/
vpslld $23, %ymm10, %ymm12
/* Check for overflow\underflow */
/* Signed 32-bit integer compares on the raw bit patterns of |x|:
* ymm4 = (|x| > _iDomainRange), ymm5 = (|x| == _iDomainRange).
*/
vpcmpgtd %ymm3, %ymm0, %ymm4
vpcmpeqd %ymm3, %ymm0, %ymm5
/* sR = sX - sN*Log2_hi */
vfnmadd231ps %ymm8, %ymm9, %ymm0
/* ymm13 = _iHalf with N added to the exponent field: 2^(N-1) */
vpaddd %ymm12, %ymm11, %ymm13
/* ymm14 = _iHalf with N subtracted from the exponent field: 2^(-N-1) */
vpsubd %ymm12, %ymm11, %ymm14
/* ymm6 = (|x| >= _iDomainRange): lanes that need the scalar fallback */
vpor %ymm5, %ymm4, %ymm6
/* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
vfnmadd231ps _sLn2lo+__svml_scosh_data_internal(%rip), %ymm9, %ymm0
/* sG1 = 2^(N-1)-2^(-N-1) */
vsubps %ymm14, %ymm13, %ymm4
/* sG2 = 2^(N-1)+2^(-N-1) */
vaddps %ymm14, %ymm13, %ymm3
/* sR2 = sR^2,shuffled */
vmulps %ymm0, %ymm0, %ymm2
vfmadd213ps _sPC3+__svml_scosh_data_internal(%rip), %ymm2, %ymm15
/* sSinh_r = r^2*(a3+r^2*a5) */
vmulps %ymm15, %ymm2, %ymm13
/* sSinh_r = r + r*(r^2*(a3+r^2*a5)) */
vfmadd213ps %ymm0, %ymm0, %ymm13
/*
* cosh(X) = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2)
* sOut = (a4 +a6*sR2)
*/
vmovups _sPC6+__svml_scosh_data_internal(%rip), %ymm0
vfmadd213ps _sPC4+__svml_scosh_data_internal(%rip), %ymm2, %ymm0
/* sOut = a2+sR2*(a4+a6*sR2) */
vfmadd213ps _sPC2+__svml_scosh_data_internal(%rip), %ymm2, %ymm0
/* sOut = sR2*(a2+sR2*(a4+a6*sR2) */
vmulps %ymm0, %ymm2, %ymm15
/* sOut = sG2*sR2*(a2+sR2*(a4+a6*sR2) */
vmulps %ymm15, %ymm3, %ymm14
/* sOut = sG1*sinh(dR)+sG2*sR2*(a2+sR2*(a4+a6*sR2) */
vfmadd213ps %ymm14, %ymm13, %ymm4
/* edx = one bit per lane flagged for the scalar fallback */
vmovmskps %ymm6, %edx
/* sOut = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2) */
vaddps %ymm4, %ymm3, %ymm0
/* Ret H */
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx ymm0 ymm1
/* Restore registers
* and exit the function
*/
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
/* Spill the original arguments and the fast-path results so that single
* lanes can be read and overwritten through memory.
*/
vmovups %ymm1, 32(%rsp)
vmovups %ymm0, 64(%rsp)
# LOE rbx r12 r13 r14 r15 edx ymm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
/* Clear the upper ymm halves before the legacy-SSE movss and the calls
* to scalar coshf further down.
*/
vzeroupper
/* r12, r13 and r14 are saved because they carry the loop state across
* the coshf calls: r12d = lane index (starts at 0), r13d = lane mask,
* r14 = zero-extended lane index used for addressing.
*/
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
/* CF = bit r12d of the lane mask */
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $8, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
/* All eight lanes visited: restore the saved registers and reload the
* (possibly patched) result vector.
*/
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovups 64(%rsp), %ymm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 ymm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
/* The 32-bit move zero-extends the lane index into r14 */
movl %r12d, %r14d
/* Load lane r14 of the saved arguments */
movss 32(%rsp,%r14,4), %xmm0
call coshf@PLT
# LOE rbx r14 r15 r12d r13d xmm0
/* Overwrite lane r14 of the saved result vector */
movss %xmm0, 64(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVdN8v_coshf_avx2)
/* Read-only constant table for _ZGVdN8v_coshf_avx2.
* Every field is one value broadcast to eight 32-bit lanes and starts on a
* 32-byte boundary; the field order must match the offset macros
* (_sInvLn2 ... _iHalf) defined at the top of this file.  The typedef
* below is only seen when __svml_scosh_data_internal_typedef is defined
* and lists the fields in the same order as the data.
*/
.section .rodata, "a"
.align 32
#ifdef __svml_scosh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(32)) VUINT32 _sInvLn2[8][1];
__declspec(align(32)) VUINT32 _sLn2hi[8][1];
__declspec(align(32)) VUINT32 _sLn2lo[8][1];
__declspec(align(32)) VUINT32 _sSign[8][1];
__declspec(align(32)) VUINT32 _sShifter[8][1];
__declspec(align(32)) VUINT32 _iDomainRange[8][1];
__declspec(align(32)) VUINT32 _sPC1[8][1];
__declspec(align(32)) VUINT32 _sPC2[8][1];
__declspec(align(32)) VUINT32 _sPC3[8][1];
__declspec(align(32)) VUINT32 _sPC4[8][1];
__declspec(align(32)) VUINT32 _sPC5[8][1];
__declspec(align(32)) VUINT32 _sPC6[8][1];
__declspec(align(32)) VUINT32 _iHalf[8][1];
} __svml_scosh_data_internal;
#endif
__svml_scosh_data_internal:
.long 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ //k=0
.align 32
.long 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */
.align 32
.long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4 /* _sLn2lo */
.align 32
/* Sign-bit mask; the kernel uses it with vandnps to form |x| */
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */
.align 32
.long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
.align 32
/* Threshold compared as an integer against the bit pattern of |x| */
.long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */
.align 32
.long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
.align 32
.long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */
.align 32
.long 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */
.align 32
.long 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72 /* _sPC4 */
.align 32
.long 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461 /* _sPC5 */
.align 32
.long 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3 /* _sPC6 */
// Integer constants
.align 32
/* Bit pattern of 0.5f; the kernel adds/subtracts the shifted N to its exponent field */
.long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _iHalf*/
.align 32
.type __svml_scosh_data_internal,@object
.size __svml_scosh_data_internal,.-__svml_scosh_data_internal

View File

@ -0,0 +1,29 @@
/* Function cosh vectorized with SSE2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* The whole body of _ZGVbN2v_cosh is generated by WRAPPER_IMPL_SSE2 from
svml_d_wrapper_impl.h (macro not shown here), instantiated with the symbol
cosh - presumably the scalar libm routine, called once per lane; confirm
against the macro definition.  */
ENTRY (_ZGVbN2v_cosh)
WRAPPER_IMPL_SSE2 cosh
END (_ZGVbN2v_cosh)
/* The hidden definition is emitted only when USE_MULTIARCH is not defined.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN2v_cosh)
#endif

View File

@ -0,0 +1,29 @@
/* Function cosh vectorized with AVX2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* The whole body of _ZGVdN4v_cosh is generated by WRAPPER_IMPL_AVX from
svml_d_wrapper_impl.h (macro not shown here), instantiated with the
2-lane entry point _ZGVbN2v_cosh - presumably invoked on each half of the
4-lane input; confirm against the macro definition.  */
ENTRY (_ZGVdN4v_cosh)
WRAPPER_IMPL_AVX _ZGVbN2v_cosh
END (_ZGVdN4v_cosh)
/* The hidden definition is emitted only when USE_MULTIARCH is not defined.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVdN4v_cosh)
#endif

View File

@ -0,0 +1,25 @@
/* Function cosh vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* The whole body of _ZGVcN4v_cosh is generated by WRAPPER_IMPL_AVX from
svml_d_wrapper_impl.h (macro not shown here), instantiated with the
2-lane entry point _ZGVbN2v_cosh.  No hidden definition is emitted for
this symbol in this file.  */
ENTRY (_ZGVcN4v_cosh)
WRAPPER_IMPL_AVX _ZGVbN2v_cosh
END (_ZGVcN4v_cosh)

View File

@ -0,0 +1,25 @@
/* Function cosh vectorized with AVX-512, wrapper to AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* The whole body of _ZGVeN8v_cosh is generated by WRAPPER_IMPL_AVX512 from
svml_d_wrapper_impl.h (macro not shown here), instantiated with the
4-lane entry point _ZGVdN4v_cosh.  No hidden definition is emitted for
this symbol in this file.  */
ENTRY (_ZGVeN8v_cosh)
WRAPPER_IMPL_AVX512 _ZGVdN4v_cosh
END (_ZGVeN8v_cosh)

View File

@ -0,0 +1,25 @@
/* Function coshf vectorized with AVX-512. Wrapper to AVX2 version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* The whole body of _ZGVeN16v_coshf is generated by WRAPPER_IMPL_AVX512
from svml_s_wrapper_impl.h (macro not shown here), instantiated with the
8-lane entry point _ZGVdN8v_coshf.  No hidden definition is emitted for
this symbol in this file.  */
ENTRY (_ZGVeN16v_coshf)
WRAPPER_IMPL_AVX512 _ZGVdN8v_coshf
END (_ZGVeN16v_coshf)

View File

@ -0,0 +1,29 @@
/* Function coshf vectorized with SSE2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* The whole body of _ZGVbN4v_coshf is generated by WRAPPER_IMPL_SSE2 from
svml_s_wrapper_impl.h (macro not shown here), instantiated with the symbol
coshf - presumably the scalar libm routine, called once per lane; confirm
against the macro definition.  */
ENTRY (_ZGVbN4v_coshf)
WRAPPER_IMPL_SSE2 coshf
END (_ZGVbN4v_coshf)
/* The hidden definition is emitted only when USE_MULTIARCH is not defined.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN4v_coshf)
#endif

View File

@ -0,0 +1,29 @@
/* Function coshf vectorized with AVX2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* The whole body of _ZGVdN8v_coshf is generated by WRAPPER_IMPL_AVX from
svml_s_wrapper_impl.h (macro not shown here), instantiated with the
4-lane entry point _ZGVbN4v_coshf - presumably invoked on each half of
the 8-lane input; confirm against the macro definition.  */
ENTRY (_ZGVdN8v_coshf)
WRAPPER_IMPL_AVX _ZGVbN4v_coshf
END (_ZGVdN8v_coshf)
/* The hidden definition is emitted only when USE_MULTIARCH is not defined.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVdN8v_coshf)
#endif

View File

@ -0,0 +1,25 @@
/* Function coshf vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
/* Entry point for the AVX coshf variant; it shares its wrapper macro and
   callee with the AVX2 wrapper and differs only in the exported name.  */
.text
ENTRY (_ZGVcN8v_coshf)
/* The body is produced by WRAPPER_IMPL_AVX, which is handed the SSE
   variant _ZGVbN4v_coshf as its callee.
   NOTE(review): the macro is not defined in this file (presumably it
   comes from svml_s_wrapper_impl.h) -- confirm how it splits the input
   and recombines the results.  */
WRAPPER_IMPL_AVX _ZGVbN4v_coshf
END (_ZGVcN8v_coshf)

View File

@ -0,0 +1 @@
#include "test-double-libmvec-cosh.c"

View File

@ -0,0 +1 @@
#include "test-double-libmvec-cosh.c"

View File

@ -0,0 +1 @@
#include "test-double-libmvec-cosh.c"

View File

@ -0,0 +1,3 @@
/* Vector ABI test parameters for cosh: select the element type and the
   function name, then include the shared test body.
   NOTE(review): how test-vector-abi-arg1.h consumes these two macros is
   not visible here -- confirm against that header.  */
#define LIBMVEC_TYPE double
#define LIBMVEC_FUNC cosh
#include "test-vector-abi-arg1.h"

View File

@ -33,6 +33,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (asin), _ZGVbN2v_asin)
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypot), _ZGVbN2vv_hypot)
VECTOR_WRAPPER (WRAPPER_NAME (exp2), _ZGVbN2v_exp2)
VECTOR_WRAPPER (WRAPPER_NAME (exp10), _ZGVbN2v_exp10)
VECTOR_WRAPPER (WRAPPER_NAME (cosh), _ZGVbN2v_cosh)
#define VEC_INT_TYPE __m128i

View File

@ -36,6 +36,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (asin), _ZGVdN4v_asin)
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypot), _ZGVdN4vv_hypot)
VECTOR_WRAPPER (WRAPPER_NAME (exp2), _ZGVdN4v_exp2)
VECTOR_WRAPPER (WRAPPER_NAME (exp10), _ZGVdN4v_exp10)
VECTOR_WRAPPER (WRAPPER_NAME (cosh), _ZGVdN4v_cosh)
#ifndef __ILP32__
# define VEC_INT_TYPE __m256i

View File

@ -33,6 +33,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (asin), _ZGVcN4v_asin)
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypot), _ZGVcN4vv_hypot)
VECTOR_WRAPPER (WRAPPER_NAME (exp2), _ZGVcN4v_exp2)
VECTOR_WRAPPER (WRAPPER_NAME (exp10), _ZGVcN4v_exp10)
VECTOR_WRAPPER (WRAPPER_NAME (cosh), _ZGVcN4v_cosh)
#define VEC_INT_TYPE __m128i

View File

@ -33,6 +33,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (asin), _ZGVeN8v_asin)
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypot), _ZGVeN8vv_hypot)
VECTOR_WRAPPER (WRAPPER_NAME (exp2), _ZGVeN8v_exp2)
VECTOR_WRAPPER (WRAPPER_NAME (exp10), _ZGVeN8v_exp10)
VECTOR_WRAPPER (WRAPPER_NAME (cosh), _ZGVeN8v_cosh)
#ifndef __ILP32__
# define VEC_INT_TYPE __m512i

View File

@ -0,0 +1 @@
#include "test-float-libmvec-coshf.c"

View File

@ -0,0 +1 @@
#include "test-float-libmvec-coshf.c"

View File

@ -0,0 +1 @@
#include "test-float-libmvec-coshf.c"

View File

@ -0,0 +1,3 @@
/* Vector ABI test parameters for coshf: select the element type and the
   function name, then include the shared test body.
   NOTE(review): how test-vector-abi-arg1.h consumes these two macros is
   not visible here -- confirm against that header.  */
#define LIBMVEC_TYPE float
#define LIBMVEC_FUNC coshf
#include "test-vector-abi-arg1.h"

View File

@ -33,6 +33,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (asinf), _ZGVeN16v_asinf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypotf), _ZGVeN16vv_hypotf)
VECTOR_WRAPPER (WRAPPER_NAME (exp2f), _ZGVeN16v_exp2f)
VECTOR_WRAPPER (WRAPPER_NAME (exp10f), _ZGVeN16v_exp10f)
VECTOR_WRAPPER (WRAPPER_NAME (coshf), _ZGVeN16v_coshf)
#define VEC_INT_TYPE __m512i

View File

@ -33,6 +33,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (asinf), _ZGVbN4v_asinf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypotf), _ZGVbN4vv_hypotf)
VECTOR_WRAPPER (WRAPPER_NAME (exp2f), _ZGVbN4v_exp2f)
VECTOR_WRAPPER (WRAPPER_NAME (exp10f), _ZGVbN4v_exp10f)
VECTOR_WRAPPER (WRAPPER_NAME (coshf), _ZGVbN4v_coshf)
#define VEC_INT_TYPE __m128i

View File

@ -36,6 +36,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (asinf), _ZGVdN8v_asinf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypotf), _ZGVdN8vv_hypotf)
VECTOR_WRAPPER (WRAPPER_NAME (exp2f), _ZGVdN8v_exp2f)
VECTOR_WRAPPER (WRAPPER_NAME (exp10f), _ZGVdN8v_exp10f)
VECTOR_WRAPPER (WRAPPER_NAME (coshf), _ZGVdN8v_coshf)
/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */
#undef VECTOR_WRAPPER_fFF

View File

@ -33,6 +33,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (asinf), _ZGVcN8v_asinf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypotf), _ZGVcN8vv_hypotf)
VECTOR_WRAPPER (WRAPPER_NAME (exp2f), _ZGVcN8v_exp2f)
VECTOR_WRAPPER (WRAPPER_NAME (exp10f), _ZGVcN8v_exp10f)
VECTOR_WRAPPER (WRAPPER_NAME (coshf), _ZGVcN8v_coshf)
#define VEC_INT_TYPE __m128i