x86-64: Add vector exp10/exp10f implementation to libmvec

Implement vectorized exp10/exp10f for libmvec, providing SSE, AVX, AVX2
and AVX512 versions as required by the vector ABI.  The patch also adds
accuracy and ABI tests for vector exp10/exp10f, with regenerated ulps.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Sunil K Pandey 2021-12-29 08:47:16 -08:00
parent 3fc9ccc20b
commit 8b726453d5
50 changed files with 2617 additions and 1 deletion
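
As a usage illustration (not part of the patch), the small program below shows how the new interface is meant to be reached from user code: with the __DECL_SIMD_exp10 declarations in <math.h>, GCC can replace the scalar exp10 calls in the loop with vector entry points such as _ZGVdN4v_exp10.  The file name, array sizes and build flags here are assumptions for the sketch (roughly: gcc -O3 -ffast-math, linked with -lmvec -lm).

/* vec_exp10_demo.c -- hypothetical usage sketch, not part of this commit.
   Assumed build: gcc -O3 -ffast-math vec_exp10_demo.c -lmvec -lm  */
#define _GNU_SOURCE 1          /* exp10 is a GNU/IEC 60559 extension */
#include <math.h>
#include <stdio.h>

#define N 1024

static double in[N], out[N];

int
main (void)
{
  for (int i = 0; i < N; i++)
    in[i] = i * 0.01;

  /* Candidate loop: expected to vectorize into _ZGVdN4v_exp10 (AVX2)
     or another variant, depending on the selected ISA.  */
  for (int i = 0; i < N; i++)
    out[i] = exp10 (in[i]);

  printf ("exp10 (%g) = %g\n", in[N - 1], out[N - 1]);
  return 0;
}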

@@ -153,4 +153,15 @@
#define __DECL_SIMD_exp2f32x
#define __DECL_SIMD_exp2f64x
#define __DECL_SIMD_exp2f128x
#define __DECL_SIMD_exp10
#define __DECL_SIMD_exp10f
#define __DECL_SIMD_exp10l
#define __DECL_SIMD_exp10f16
#define __DECL_SIMD_exp10f32
#define __DECL_SIMD_exp10f64
#define __DECL_SIMD_exp10f128
#define __DECL_SIMD_exp10f32x
#define __DECL_SIMD_exp10f64x
#define __DECL_SIMD_exp10f128x
#endif

@@ -111,7 +111,7 @@ __MATHCALL (modf,, (_Mdouble_ __x, _Mdouble_ *__iptr)) __nonnull ((2));
#if __GLIBC_USE (IEC_60559_FUNCS_EXT_C2X)
/* Compute exponent to base ten. */
__MATHCALL (exp10,, (_Mdouble_ __x));
__MATHCALL_VEC (exp10,, (_Mdouble_ __x));
#endif
#if defined __USE_XOPEN_EXTENDED || defined __USE_ISOC99

@@ -49,40 +49,48 @@ GLIBC_2.22 _ZGVeN8vvv_sincos F
GLIBC_2.35 _ZGVbN2v_acos F
GLIBC_2.35 _ZGVbN2v_asin F
GLIBC_2.35 _ZGVbN2v_atan F
GLIBC_2.35 _ZGVbN2v_exp10 F
GLIBC_2.35 _ZGVbN2v_exp2 F
GLIBC_2.35 _ZGVbN2vv_hypot F
GLIBC_2.35 _ZGVbN4v_acosf F
GLIBC_2.35 _ZGVbN4v_asinf F
GLIBC_2.35 _ZGVbN4v_atanf F
GLIBC_2.35 _ZGVbN4v_exp10f F
GLIBC_2.35 _ZGVbN4v_exp2f F
GLIBC_2.35 _ZGVbN4vv_hypotf F
GLIBC_2.35 _ZGVcN4v_acos F
GLIBC_2.35 _ZGVcN4v_asin F
GLIBC_2.35 _ZGVcN4v_atan F
GLIBC_2.35 _ZGVcN4v_exp10 F
GLIBC_2.35 _ZGVcN4v_exp2 F
GLIBC_2.35 _ZGVcN4vv_hypot F
GLIBC_2.35 _ZGVcN8v_acosf F
GLIBC_2.35 _ZGVcN8v_asinf F
GLIBC_2.35 _ZGVcN8v_atanf F
GLIBC_2.35 _ZGVcN8v_exp10f F
GLIBC_2.35 _ZGVcN8v_exp2f F
GLIBC_2.35 _ZGVcN8vv_hypotf F
GLIBC_2.35 _ZGVdN4v_acos F
GLIBC_2.35 _ZGVdN4v_asin F
GLIBC_2.35 _ZGVdN4v_atan F
GLIBC_2.35 _ZGVdN4v_exp10 F
GLIBC_2.35 _ZGVdN4v_exp2 F
GLIBC_2.35 _ZGVdN4vv_hypot F
GLIBC_2.35 _ZGVdN8v_acosf F
GLIBC_2.35 _ZGVdN8v_asinf F
GLIBC_2.35 _ZGVdN8v_atanf F
GLIBC_2.35 _ZGVdN8v_exp10f F
GLIBC_2.35 _ZGVdN8v_exp2f F
GLIBC_2.35 _ZGVdN8vv_hypotf F
GLIBC_2.35 _ZGVeN16v_acosf F
GLIBC_2.35 _ZGVeN16v_asinf F
GLIBC_2.35 _ZGVeN16v_atanf F
GLIBC_2.35 _ZGVeN16v_exp10f F
GLIBC_2.35 _ZGVeN16v_exp2f F
GLIBC_2.35 _ZGVeN16vv_hypotf F
GLIBC_2.35 _ZGVeN8v_acos F
GLIBC_2.35 _ZGVeN8v_asin F
GLIBC_2.35 _ZGVeN8v_atan F
GLIBC_2.35 _ZGVeN8v_exp10 F
GLIBC_2.35 _ZGVeN8v_exp2 F
GLIBC_2.35 _ZGVeN8vv_hypot F

@@ -78,6 +78,10 @@
# define __DECL_SIMD_exp2 __DECL_SIMD_x86_64
# undef __DECL_SIMD_exp2f
# define __DECL_SIMD_exp2f __DECL_SIMD_x86_64
# undef __DECL_SIMD_exp10
# define __DECL_SIMD_exp10 __DECL_SIMD_x86_64
# undef __DECL_SIMD_exp10f
# define __DECL_SIMD_exp10f __DECL_SIMD_x86_64
# endif
#endif
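
For context, __DECL_SIMD_x86_64 expands (for GCC) to an OpenMP "declare simd" pragma or to the __simd__ attribute, so the net effect of the two redirections above is roughly the sketch below; the exact guards (on __FAST_MATH__, _OPENMP and the GCC version) live in this header and in bits/mathcalls.h and are simplified here.

/* Simplified sketch of the effect of the redirection above.  */
#define __DECL_SIMD_x86_64 _Pragma ("omp declare simd notinbranch")
#define __DECL_SIMD_exp10  __DECL_SIMD_x86_64
#define __DECL_SIMD_exp10f __DECL_SIMD_x86_64

/* bits/mathcalls.h (via __MATHCALL_VEC) then attaches it to the
   declarations, telling the compiler that vector variants such as
   _ZGVbN2v_exp10 exist per the x86_64 vector ABI:  */
__DECL_SIMD_exp10  extern double exp10 (double __x);
__DECL_SIMD_exp10f extern float exp10f (float __x);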

@@ -38,6 +38,8 @@
!GCC$ builtin (hypotf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (exp2) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (exp2f) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (exp10) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (exp10f) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
!GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
@@ -61,3 +63,5 @@
!GCC$ builtin (hypotf) attributes simd (notinbranch) if('x32')
!GCC$ builtin (exp2) attributes simd (notinbranch) if('x32')
!GCC$ builtin (exp2f) attributes simd (notinbranch) if('x32')
!GCC$ builtin (exp10) attributes simd (notinbranch) if('x32')
!GCC$ builtin (exp10f) attributes simd (notinbranch) if('x32')

@@ -27,6 +27,7 @@ libmvec-funcs = \
atan \
cos \
exp \
exp10 \
exp2 \
hypot \
log \

@@ -17,11 +17,13 @@ libmvec {
_ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
_ZGVbN2v_asin; _ZGVcN4v_asin; _ZGVdN4v_asin; _ZGVeN8v_asin;
_ZGVbN2v_atan; _ZGVcN4v_atan; _ZGVdN4v_atan; _ZGVeN8v_atan;
_ZGVbN2v_exp10; _ZGVcN4v_exp10; _ZGVdN4v_exp10; _ZGVeN8v_exp10;
_ZGVbN2v_exp2; _ZGVcN4v_exp2; _ZGVdN4v_exp2; _ZGVeN8v_exp2;
_ZGVbN2vv_hypot; _ZGVcN4vv_hypot; _ZGVdN4vv_hypot; _ZGVeN8vv_hypot;
_ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
_ZGVbN4v_asinf; _ZGVcN8v_asinf; _ZGVdN8v_asinf; _ZGVeN16v_asinf;
_ZGVbN4v_atanf; _ZGVcN8v_atanf; _ZGVdN8v_atanf; _ZGVeN16v_atanf;
_ZGVbN4v_exp10f; _ZGVcN8v_exp10f; _ZGVdN8v_exp10f; _ZGVeN16v_exp10f;
_ZGVbN4v_exp2f; _ZGVcN8v_exp2f; _ZGVdN8v_exp2f; _ZGVeN16v_exp2f;
_ZGVbN4vv_hypotf; _ZGVcN8vv_hypotf; _ZGVdN8vv_hypotf; _ZGVeN16vv_hypotf;
}

@@ -1252,6 +1252,26 @@ float: 1
float128: 3
ldouble: 2
Function: "exp10_vlen16":
float: 3
Function: "exp10_vlen2":
double: 1
Function: "exp10_vlen4":
double: 1
float: 1
Function: "exp10_vlen4_avx2":
double: 1
Function: "exp10_vlen8":
double: 1
float: 1
Function: "exp10_vlen8_avx2":
float: 1
Function: "exp2":
double: 1
float: 1

@@ -0,0 +1,20 @@
/* SSE2 version of vectorized exp10, vector length is 2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define _ZGVbN2v_exp10 _ZGVbN2v_exp10_sse2
#include "../svml_d_exp102_core.S"

@@ -0,0 +1,27 @@
/* Multiple versions of vectorized exp10, vector length is 2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define SYMBOL_NAME _ZGVbN2v_exp10
#include "ifunc-mathvec-sse4_1.h"
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
__hidden_ver1 (_ZGVbN2v_exp10, __GI__ZGVbN2v_exp10, __redirect__ZGVbN2v_exp10)
__attribute__ ((visibility ("hidden")));
#endif

@@ -0,0 +1,418 @@
/* Function exp10 vectorized with SSE4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* exp10(x) = 2^(x/log10(2)) = 2^n * (1 + T[j]) * (1 + P(y))
* where
* x = m*log10(2)/K + y, y in [-log10(2)/K..log10(2)/K]
* m = n*K + j, m,n,j - signed integer, j in [-K/2..K/2]
*
* values of 2^(j/K) are tabulated
*
* P(y) is a minimax polynomial approximation of exp10(x)-1
* on small interval [-log10(2)/K..log10(2)/K]
*
* Special cases:
*
* exp10(NaN) = NaN
* exp10(+INF) = +INF
* exp10(-INF) = 0
* exp10(x) = 1 for subnormals
* For IEEE double
* if x > 3.39782712893383973096e+02 then exp10(x) overflow
* if x < -3.45133219101941108420e+02 then exp10(x) underflow
*
*/
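
To make the reduction above concrete, here is a scalar C sketch of the same scheme with K = 128.  It is an illustrative outline only: the table value 2^(j/128) is recomputed with exp2 instead of being loaded from _dbT, nearbyint stands in for the right-shifter trick, and the hex constants are the ones stored in __svml_dexp10_data_internal below.

/* Scalar outline of the table-based reduction (illustration only).  */
#include <math.h>

static double
exp10_table_sketch (double x)
{
  const double lg2_10_k = 0x1.a934f0979a371p+8;  /* log2(10)*128 (_dbLg2_10)            */
  const double inv_hi = 0x1.3441350ap-9;         /* log10(2)/128, high (_dbInvLg2_10hi) */
  const double inv_lo = -0x1.0c0219dc1da99p-46;  /* log10(2)/128, low  (_dbInvLg2_10lo) */
  const double c1 = 0x1.26bb1bbb55516p+1;        /* _dPC1, ~ln(10) */
  const double c2 = 0x1.53524c73ce8e3p+1;        /* _dPC2 */
  const double c3 = 0x1.0470591ccea8bp+1;        /* _dPC3 */
  const double c4 = 0x1.2bd767584db59p+0;        /* _dPC4 */
  const double c5 = 0x1.144c03efafb54p-1;        /* _dPC5 */

  double m = nearbyint (x * lg2_10_k);           /* m = n*128 + j          */
  double y = (x - m * inv_hi) - m * inv_lo;      /* reduced argument       */
  int j = (int) m & 127;                         /* table index            */
  int n = ((int) m - j) / 128;                   /* scale exponent         */
  double p = ((((c5 * y + c4) * y + c3) * y + c2) * y + c1) * y;  /* P(y)  */
  double tj = exp2 (j / 128.0);                  /* stands in for _dbT[j]  */
  return ldexp (tj + tj * p, n);                 /* 2^n * T[j] * (1+P(y))  */
}

The SSE4 code below performs the same steps on two lanes at a time: the 2^n scaling is done by adding n directly into the exponent field (psllq/pand/paddq), and lanes flagged by the overflow/underflow check are diverted to the scalar exp10 fallback.
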
/* Offsets for data table __svml_dexp10_data_internal
*/
#define _dbT 0
#define _dbLg2_10 1024
#define _dbShifter 1040
#define _dbInvLg2_10hi 1056
#define _dbInvLg2_10lo 1072
#define _dPC1 1088
#define _dPC2 1104
#define _dPC3 1120
#define _dPC4 1136
#define _dPC5 1152
#define _lExpMask 1168
#define _iIndexMask 1184
#define _iAbsMask 1200
#define _iDomainRange 1216
#include <sysdep.h>
.text
.section .text.sse4,"ax",@progbits
ENTRY(_ZGVbN2v_exp10_sse4)
subq $72, %rsp
cfi_def_cfa_offset(80)
/* R */
movaps %xmm0, %xmm12
/* Load argument */
movups _dbLg2_10+__svml_dexp10_data_internal(%rip), %xmm13
lea __svml_dexp10_data_internal(%rip), %rsi
mulpd %xmm0, %xmm13
movups _dbShifter+__svml_dexp10_data_internal(%rip), %xmm1
addpd %xmm1, %xmm13
movaps %xmm13, %xmm9
subpd %xmm1, %xmm9
movups _dbInvLg2_10hi+__svml_dexp10_data_internal(%rip), %xmm8
mulpd %xmm9, %xmm8
movups _dbInvLg2_10lo+__svml_dexp10_data_internal(%rip), %xmm10
mulpd %xmm9, %xmm10
subpd %xmm8, %xmm12
subpd %xmm10, %xmm12
/*
* Polynomial
* poly(dN) = a1*dR+...+a5*dR^5
*/
movups _dPC5+__svml_dexp10_data_internal(%rip), %xmm11
mulpd %xmm12, %xmm11
addpd _dPC4+__svml_dexp10_data_internal(%rip), %xmm11
mulpd %xmm12, %xmm11
addpd _dPC3+__svml_dexp10_data_internal(%rip), %xmm11
mulpd %xmm12, %xmm11
addpd _dPC2+__svml_dexp10_data_internal(%rip), %xmm11
/* a1+...+a5*dR^4 ! */
mulpd %xmm12, %xmm11
addpd _dPC1+__svml_dexp10_data_internal(%rip), %xmm11
movq _iIndexMask+__svml_dexp10_data_internal(%rip), %xmm5
/* Index and lookup */
pshufd $136, %xmm13, %xmm6
/* 2^N */
psllq $45, %xmm13
pand %xmm5, %xmm6
/* iIndex*=sizeof(D); */
pslld $3, %xmm6
movd %xmm6, %eax
pshufd $1, %xmm6, %xmm7
movq _iAbsMask+__svml_dexp10_data_internal(%rip), %xmm2
/* a1*dR+...+a5*dR^5 */
mulpd %xmm11, %xmm12
movd %xmm7, %ecx
/* Check for overflow/underflow */
pshufd $221, %xmm0, %xmm4
movq _iDomainRange+__svml_dexp10_data_internal(%rip), %xmm3
pand %xmm2, %xmm4
movslq %eax, %rax
pcmpgtd %xmm3, %xmm4
movslq %ecx, %rcx
movmskps %xmm4, %edx
/* lM==EXP(2^N) */
pand _lExpMask+__svml_dexp10_data_internal(%rip), %xmm13
movsd (%rsi,%rax), %xmm1
movhpd (%rsi,%rcx), %xmm1
/* Tj*poly */
mulpd %xmm1, %xmm12
addpd %xmm12, %xmm1
/* quick 2^N */
paddq %xmm13, %xmm1
andl $3, %edx
/* Finish */
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1
/* Restore registers
* and exit the function
*/
L(EXIT):
movaps %xmm1, %xmm0
addq $72, %rsp
cfi_def_cfa_offset(8)
ret
cfi_def_cfa_offset(80)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
movups %xmm0, 32(%rsp)
movups %xmm1, 48(%rsp)
# LOE rbx rbp r12 r13 r14 r15 edx xmm1
xorl %eax, %eax
movq %r12, 16(%rsp)
cfi_offset(12, -64)
movl %eax, %r12d
movq %r13, 8(%rsp)
cfi_offset(13, -72)
movl %edx, %r13d
movq %r14, (%rsp)
cfi_offset(14, -80)
# LOE rbx rbp r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx rbp r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $2, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx rbp r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
movups 48(%rsp), %xmm1
/* Go to exit */
jmp L(EXIT)
cfi_offset(12, -64)
cfi_offset(13, -72)
cfi_offset(14, -80)
# LOE rbx rbp r12 r13 r14 r15 xmm1
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movsd 32(%rsp,%r14,8), %xmm0
call exp10@PLT
# LOE rbx rbp r14 r15 r12d r13d xmm0
movsd %xmm0, 48(%rsp,%r14,8)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx rbp r15 r12d r13d
END(_ZGVbN2v_exp10_sse4)
.section .rodata, "a"
.align 16
#ifdef __svml_dexp10_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(16)) VUINT32 _dbT[(1<<7)][2];
__declspec(align(16)) VUINT32 _dbLg2_10[2][2];
__declspec(align(16)) VUINT32 _dbShifter[2][2];
__declspec(align(16)) VUINT32 _dbInvLg2_10hi[2][2];
__declspec(align(16)) VUINT32 _dbInvLg2_10lo[2][2];
__declspec(align(16)) VUINT32 _dPC1[2][2];
__declspec(align(16)) VUINT32 _dPC2[2][2];
__declspec(align(16)) VUINT32 _dPC3[2][2];
__declspec(align(16)) VUINT32 _dPC4[2][2];
__declspec(align(16)) VUINT32 _dPC5[2][2];
__declspec(align(16)) VUINT32 _lExpMask[2][2];
__declspec(align(16)) VUINT32 _iIndexMask[4][1];
__declspec(align(16)) VUINT32 _iAbsMask[4][1];
__declspec(align(16)) VUINT32 _iDomainRange[4][1];
} __svml_dexp10_data_internal;
#endif
__svml_dexp10_data_internal:
/*== _dbT ==*/
.quad 0x3ff0000000000000 /*2^( 0 /128)*/
.quad 0x3ff0163da9fb3335 /*2^( 1 /128)*/
.quad 0x3ff02c9a3e778061 /*2^( 2 /128)*/
.quad 0x3ff04315e86e7f85 /*2^( 3 /128)*/
.quad 0x3ff059b0d3158574 /*2^( 4 /128)*/
.quad 0x3ff0706b29ddf6de /*2^( 5 /128)*/
.quad 0x3ff0874518759bc8 /*2^( 6 /128)*/
.quad 0x3ff09e3ecac6f383 /*2^( 7 /128)*/
.quad 0x3ff0b5586cf9890f /*2^( 8 /128)*/
.quad 0x3ff0cc922b7247f7 /*2^( 9 /128)*/
.quad 0x3ff0e3ec32d3d1a2 /*2^( 10 /128)*/
.quad 0x3ff0fb66affed31b /*2^( 11 /128)*/
.quad 0x3ff11301d0125b51 /*2^( 12 /128)*/
.quad 0x3ff12abdc06c31cc /*2^( 13 /128)*/
.quad 0x3ff1429aaea92de0 /*2^( 14 /128)*/
.quad 0x3ff15a98c8a58e51 /*2^( 15 /128)*/
.quad 0x3ff172b83c7d517b /*2^( 16 /128)*/
.quad 0x3ff18af9388c8dea /*2^( 17 /128)*/
.quad 0x3ff1a35beb6fcb75 /*2^( 18 /128)*/
.quad 0x3ff1bbe084045cd4 /*2^( 19 /128)*/
.quad 0x3ff1d4873168b9aa /*2^( 20 /128)*/
.quad 0x3ff1ed5022fcd91d /*2^( 21 /128)*/
.quad 0x3ff2063b88628cd6 /*2^( 22 /128)*/
.quad 0x3ff21f49917ddc96 /*2^( 23 /128)*/
.quad 0x3ff2387a6e756238 /*2^( 24 /128)*/
.quad 0x3ff251ce4fb2a63f /*2^( 25 /128)*/
.quad 0x3ff26b4565e27cdd /*2^( 26 /128)*/
.quad 0x3ff284dfe1f56381 /*2^( 27 /128)*/
.quad 0x3ff29e9df51fdee1 /*2^( 28 /128)*/
.quad 0x3ff2b87fd0dad990 /*2^( 29 /128)*/
.quad 0x3ff2d285a6e4030b /*2^( 30 /128)*/
.quad 0x3ff2ecafa93e2f56 /*2^( 31 /128)*/
.quad 0x3ff306fe0a31b715 /*2^( 32 /128)*/
.quad 0x3ff32170fc4cd831 /*2^( 33 /128)*/
.quad 0x3ff33c08b26416ff /*2^( 34 /128)*/
.quad 0x3ff356c55f929ff1 /*2^( 35 /128)*/
.quad 0x3ff371a7373aa9cb /*2^( 36 /128)*/
.quad 0x3ff38cae6d05d866 /*2^( 37 /128)*/
.quad 0x3ff3a7db34e59ff7 /*2^( 38 /128)*/
.quad 0x3ff3c32dc313a8e5 /*2^( 39 /128)*/
.quad 0x3ff3dea64c123422 /*2^( 40 /128)*/
.quad 0x3ff3fa4504ac801c /*2^( 41 /128)*/
.quad 0x3ff4160a21f72e2a /*2^( 42 /128)*/
.quad 0x3ff431f5d950a897 /*2^( 43 /128)*/
.quad 0x3ff44e086061892d /*2^( 44 /128)*/
.quad 0x3ff46a41ed1d0057 /*2^( 45 /128)*/
.quad 0x3ff486a2b5c13cd0 /*2^( 46 /128)*/
.quad 0x3ff4a32af0d7d3de /*2^( 47 /128)*/
.quad 0x3ff4bfdad5362a27 /*2^( 48 /128)*/
.quad 0x3ff4dcb299fddd0d /*2^( 49 /128)*/
.quad 0x3ff4f9b2769d2ca7 /*2^( 50 /128)*/
.quad 0x3ff516daa2cf6642 /*2^( 51 /128)*/
.quad 0x3ff5342b569d4f82 /*2^( 52 /128)*/
.quad 0x3ff551a4ca5d920f /*2^( 53 /128)*/
.quad 0x3ff56f4736b527da /*2^( 54 /128)*/
.quad 0x3ff58d12d497c7fd /*2^( 55 /128)*/
.quad 0x3ff5ab07dd485429 /*2^( 56 /128)*/
.quad 0x3ff5c9268a5946b7 /*2^( 57 /128)*/
.quad 0x3ff5e76f15ad2148 /*2^( 58 /128)*/
.quad 0x3ff605e1b976dc09 /*2^( 59 /128)*/
.quad 0x3ff6247eb03a5585 /*2^( 60 /128)*/
.quad 0x3ff6434634ccc320 /*2^( 61 /128)*/
.quad 0x3ff6623882552225 /*2^( 62 /128)*/
.quad 0x3ff68155d44ca973 /*2^( 63 /128)*/
.quad 0x3ff6a09e667f3bcd /*2^( 64 /128)*/
.quad 0x3ff6c012750bdabf /*2^( 65 /128)*/
.quad 0x3ff6dfb23c651a2f /*2^( 66 /128)*/
.quad 0x3ff6ff7df9519484 /*2^( 67 /128)*/
.quad 0x3ff71f75e8ec5f74 /*2^( 68 /128)*/
.quad 0x3ff73f9a48a58174 /*2^( 69 /128)*/
.quad 0x3ff75feb564267c9 /*2^( 70 /128)*/
.quad 0x3ff780694fde5d3f /*2^( 71 /128)*/
.quad 0x3ff7a11473eb0187 /*2^( 72 /128)*/
.quad 0x3ff7c1ed0130c132 /*2^( 73 /128)*/
.quad 0x3ff7e2f336cf4e62 /*2^( 74 /128)*/
.quad 0x3ff80427543e1a12 /*2^( 75 /128)*/
.quad 0x3ff82589994cce13 /*2^( 76 /128)*/
.quad 0x3ff8471a4623c7ad /*2^( 77 /128)*/
.quad 0x3ff868d99b4492ed /*2^( 78 /128)*/
.quad 0x3ff88ac7d98a6699 /*2^( 79 /128)*/
.quad 0x3ff8ace5422aa0db /*2^( 80 /128)*/
.quad 0x3ff8cf3216b5448c /*2^( 81 /128)*/
.quad 0x3ff8f1ae99157736 /*2^( 82 /128)*/
.quad 0x3ff9145b0b91ffc6 /*2^( 83 /128)*/
.quad 0x3ff93737b0cdc5e5 /*2^( 84 /128)*/
.quad 0x3ff95a44cbc8520f /*2^( 85 /128)*/
.quad 0x3ff97d829fde4e50 /*2^( 86 /128)*/
.quad 0x3ff9a0f170ca07ba /*2^( 87 /128)*/
.quad 0x3ff9c49182a3f090 /*2^( 88 /128)*/
.quad 0x3ff9e86319e32323 /*2^( 89 /128)*/
.quad 0x3ffa0c667b5de565 /*2^( 90 /128)*/
.quad 0x3ffa309bec4a2d33 /*2^( 91 /128)*/
.quad 0x3ffa5503b23e255d /*2^( 92 /128)*/
.quad 0x3ffa799e1330b358 /*2^( 93 /128)*/
.quad 0x3ffa9e6b5579fdbf /*2^( 94 /128)*/
.quad 0x3ffac36bbfd3f37a /*2^( 95 /128)*/
.quad 0x3ffae89f995ad3ad /*2^( 96 /128)*/
.quad 0x3ffb0e07298db666 /*2^( 97 /128)*/
.quad 0x3ffb33a2b84f15fb /*2^( 98 /128)*/
.quad 0x3ffb59728de5593a /*2^( 99 /128)*/
.quad 0x3ffb7f76f2fb5e47 /*2^( 100 /128)*/
.quad 0x3ffba5b030a1064a /*2^( 101 /128)*/
.quad 0x3ffbcc1e904bc1d2 /*2^( 102 /128)*/
.quad 0x3ffbf2c25bd71e09 /*2^( 103 /128)*/
.quad 0x3ffc199bdd85529c /*2^( 104 /128)*/
.quad 0x3ffc40ab5fffd07a /*2^( 105 /128)*/
.quad 0x3ffc67f12e57d14b /*2^( 106 /128)*/
.quad 0x3ffc8f6d9406e7b5 /*2^( 107 /128)*/
.quad 0x3ffcb720dcef9069 /*2^( 108 /128)*/
.quad 0x3ffcdf0b555dc3fa /*2^( 109 /128)*/
.quad 0x3ffd072d4a07897c /*2^( 110 /128)*/
.quad 0x3ffd2f87080d89f2 /*2^( 111 /128)*/
.quad 0x3ffd5818dcfba487 /*2^( 112 /128)*/
.quad 0x3ffd80e316c98398 /*2^( 113 /128)*/
.quad 0x3ffda9e603db3285 /*2^( 114 /128)*/
.quad 0x3ffdd321f301b460 /*2^( 115 /128)*/
.quad 0x3ffdfc97337b9b5f /*2^( 116 /128)*/
.quad 0x3ffe264614f5a129 /*2^( 117 /128)*/
.quad 0x3ffe502ee78b3ff6 /*2^( 118 /128)*/
.quad 0x3ffe7a51fbc74c83 /*2^( 119 /128)*/
.quad 0x3ffea4afa2a490da /*2^( 120 /128)*/
.quad 0x3ffecf482d8e67f1 /*2^( 121 /128)*/
.quad 0x3ffefa1bee615a27 /*2^( 122 /128)*/
.quad 0x3fff252b376bba97 /*2^( 123 /128)*/
.quad 0x3fff50765b6e4540 /*2^( 124 /128)*/
.quad 0x3fff7bfdad9cbe14 /*2^( 125 /128)*/
.quad 0x3fffa7c1819e90d8 /*2^( 126 /128)*/
.quad 0x3fffd3c22b8f71f1 /*2^( 127 /128)*/
.align 16
.quad 0x407a934f0979a371, 0x407a934f0979a371 /* _dbLg2_10*2^K */
.align 16
.quad 0x4338800000000000, 0x4338800000000000 /* _dbShifter */
.align 16
.quad 0x3f63441350a00000, 0x3f63441350a00000 /* _dbInvLg2_10hi/2^K 53-11-K bits*/
.align 16
.quad 0xbd10c0219dc1da99, 0xbd10c0219dc1da99 /* _dbInvLg2_10lo/2^K */
//PC0 = 1.0
.align 16
.quad 0x40026bb1bbb55516, 0x40026bb1bbb55516 /* _dPC1 */
.align 16
.quad 0x40053524c73ce8e3, 0x40053524c73ce8e3 /* _dPC2 */
.align 16
.quad 0x4000470591ccea8b, 0x4000470591ccea8b /* _dPC3 */
.align 16
.quad 0x3ff2bd767584db59, 0x3ff2bd767584db59 /* _dPC4 */
.align 16
.quad 0x3fe144c03efafb54, 0x3fe144c03efafb54 /* _dPC5 */
.align 16
.quad 0xfff0000000000000, 0xfff0000000000000 /* _lExpMask */
.align 16
.long 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f /* _iIndexMask =(2^K-1)*/
//common
.align 16
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _iAbsMask */
.align 16
.long 0x40733a70, 0x40733a70, 0x40733a70, 0x40733a70 /* _iDomainRange */
.align 16
.type __svml_dexp10_data_internal,@object
.size __svml_dexp10_data_internal,.-__svml_dexp10_data_internal

@@ -0,0 +1,20 @@
/* SSE version of vectorized exp10, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define _ZGVdN4v_exp10 _ZGVdN4v_exp10_sse_wrapper
#include "../svml_d_exp104_core.S"

@@ -0,0 +1,27 @@
/* Multiple versions of vectorized exp10, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define SYMBOL_NAME _ZGVdN4v_exp10
#include "ifunc-mathvec-avx2.h"
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
__hidden_ver1 (_ZGVdN4v_exp10, __GI__ZGVdN4v_exp10, __redirect__ZGVdN4v_exp10)
__attribute__ ((visibility ("hidden")));
#endif

@@ -0,0 +1,429 @@
/* Function exp10 vectorized with AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* exp10(x) = 2^(x/log10(2)) = 2^n * (1 + T[j]) * (1 + P(y))
* where
* x = m*log10(2)/K + y, y in [-log10(2)/K..log10(2)/K]
* m = n*K + j, m,n,j - signed integer, j in [-K/2..K/2]
*
* values of 2^(j/K) are tabulated
*
* P(y) is a minimax polynomial approximation of exp10(x)-1
* on small interval [-log10(2)/K..log10(2)/K]
*
* Special cases:
*
* exp10(NaN) = NaN
* exp10(+INF) = +INF
* exp10(-INF) = 0
* exp10(x) = 1 for subnormals
* For IEEE double
* if x > 3.39782712893383973096e+02 then exp10(x) overflow
* if x < -3.45133219101941108420e+02 then exp10(x) underflow
*
*/
/* Offsets for data table __svml_dexp10_data_internal
*/
#define _dbT 0
#define _dbLg2_10 1024
#define _dbShifter 1056
#define _dbInvLg2_10hi 1088
#define _dbInvLg2_10lo 1120
#define _dPC1 1152
#define _dPC2 1184
#define _dPC3 1216
#define _dPC4 1248
#define _dPC5 1280
#define _lExpMask 1312
#define _iIndexMask 1344
#define _iAbsMask 1376
#define _iDomainRange 1408
#include <sysdep.h>
.text
.section .text.avx2,"ax",@progbits
ENTRY(_ZGVdN4v_exp10_avx2)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-32, %rsp
subq $96, %rsp
lea __svml_dexp10_data_internal(%rip), %r8
vmovapd %ymm0, %ymm2
vmovupd _dbShifter+__svml_dexp10_data_internal(%rip), %ymm3
/* Load argument */
vmovupd _dbLg2_10+__svml_dexp10_data_internal(%rip), %ymm0
vfmadd213pd %ymm3, %ymm2, %ymm0
vsubpd %ymm3, %ymm0, %ymm1
/* R */
vmovupd _dbInvLg2_10hi+__svml_dexp10_data_internal(%rip), %ymm3
vfnmadd213pd %ymm2, %ymm1, %ymm3
/* Check for overflow/underflow */
vextractf128 $1, %ymm2, %xmm4
vfnmadd132pd _dbInvLg2_10lo+__svml_dexp10_data_internal(%rip), %ymm3, %ymm1
vshufps $221, %xmm4, %xmm2, %xmm5
vandps _iAbsMask+__svml_dexp10_data_internal(%rip), %xmm5, %xmm6
vpcmpgtd _iDomainRange+__svml_dexp10_data_internal(%rip), %xmm6, %xmm7
/*
* Polynomial
* poly(dN) = a1*dR+...+a5*dR^5
*/
vmovupd _dPC5+__svml_dexp10_data_internal(%rip), %ymm4
vmovmskps %xmm7, %eax
vfmadd213pd _dPC4+__svml_dexp10_data_internal(%rip), %ymm1, %ymm4
vfmadd213pd _dPC3+__svml_dexp10_data_internal(%rip), %ymm1, %ymm4
vfmadd213pd _dPC2+__svml_dexp10_data_internal(%rip), %ymm1, %ymm4
/* a1+...+a5*dR^4 ! */
vfmadd213pd _dPC1+__svml_dexp10_data_internal(%rip), %ymm1, %ymm4
/* a1*dR+...+a5*dR^5 */
vmulpd %ymm4, %ymm1, %ymm1
/* Index and lookup */
vextractf128 $1, %ymm0, %xmm8
vshufps $136, %xmm8, %xmm0, %xmm9
vandps _iIndexMask+__svml_dexp10_data_internal(%rip), %xmm9, %xmm10
/* iIndex*=sizeof(D); */
vpslld $3, %xmm10, %xmm13
vmovd %xmm13, %edx
/* 2^N */
vpsllq $45, %ymm0, %ymm0
vpextrd $2, %xmm13, %esi
movslq %edx, %rdx
vpextrd $1, %xmm13, %ecx
movslq %esi, %rsi
vpextrd $3, %xmm13, %edi
movslq %ecx, %rcx
movslq %edi, %rdi
vmovsd (%r8,%rdx), %xmm11
vmovsd (%r8,%rsi), %xmm14
vmovhpd (%r8,%rcx), %xmm11, %xmm12
vmovhpd (%r8,%rdi), %xmm14, %xmm15
/* lM==EXP(2^N) */
vpand _lExpMask+__svml_dexp10_data_internal(%rip), %ymm0, %ymm6
vinsertf128 $1, %xmm15, %ymm12, %ymm5
/* Tj*poly */
vfmadd213pd %ymm5, %ymm5, %ymm1
/* quick 2^N */
vpaddq %ymm6, %ymm1, %ymm0
/* Finish */
testl %eax, %eax
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 eax ymm0 ymm2
/* Restore registers
* and exit the function
*/
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
vmovupd %ymm2, 32(%rsp)
vmovupd %ymm0, 64(%rsp)
# LOE rbx r12 r13 r14 r15 eax ymm0
xorl %edx, %edx
# LOE rbx r12 r13 r14 r15 eax edx
vzeroupper
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
movl %edx, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
movl %eax, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $4, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovupd 64(%rsp), %ymm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 ymm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movsd 32(%rsp,%r14,8), %xmm0
call exp10@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movsd %xmm0, 64(%rsp,%r14,8)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVdN4v_exp10_avx2)
.section .rodata, "a"
.align 32
#ifdef __svml_dexp10_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(32)) VUINT32 _dbT[(1<<7)][2];
__declspec(align(32)) VUINT32 _dbLg2_10[4][2];
__declspec(align(32)) VUINT32 _dbShifter[4][2];
__declspec(align(32)) VUINT32 _dbInvLg2_10hi[4][2];
__declspec(align(32)) VUINT32 _dbInvLg2_10lo[4][2];
__declspec(align(32)) VUINT32 _dPC1[4][2];
__declspec(align(32)) VUINT32 _dPC2[4][2];
__declspec(align(32)) VUINT32 _dPC3[4][2];
__declspec(align(32)) VUINT32 _dPC4[4][2];
__declspec(align(32)) VUINT32 _dPC5[4][2];
__declspec(align(32)) VUINT32 _lExpMask[4][2];
__declspec(align(32)) VUINT32 _iIndexMask[8][1];
__declspec(align(32)) VUINT32 _iAbsMask[8][1];
__declspec(align(32)) VUINT32 _iDomainRange[8][1];
} __svml_dexp10_data_internal;
#endif
__svml_dexp10_data_internal:
/*== _dbT ==*/
.quad 0x3ff0000000000000 /*2^( 0 /128)*/
.quad 0x3ff0163da9fb3335 /*2^( 1 /128)*/
.quad 0x3ff02c9a3e778061 /*2^( 2 /128)*/
.quad 0x3ff04315e86e7f85 /*2^( 3 /128)*/
.quad 0x3ff059b0d3158574 /*2^( 4 /128)*/
.quad 0x3ff0706b29ddf6de /*2^( 5 /128)*/
.quad 0x3ff0874518759bc8 /*2^( 6 /128)*/
.quad 0x3ff09e3ecac6f383 /*2^( 7 /128)*/
.quad 0x3ff0b5586cf9890f /*2^( 8 /128)*/
.quad 0x3ff0cc922b7247f7 /*2^( 9 /128)*/
.quad 0x3ff0e3ec32d3d1a2 /*2^( 10 /128)*/
.quad 0x3ff0fb66affed31b /*2^( 11 /128)*/
.quad 0x3ff11301d0125b51 /*2^( 12 /128)*/
.quad 0x3ff12abdc06c31cc /*2^( 13 /128)*/
.quad 0x3ff1429aaea92de0 /*2^( 14 /128)*/
.quad 0x3ff15a98c8a58e51 /*2^( 15 /128)*/
.quad 0x3ff172b83c7d517b /*2^( 16 /128)*/
.quad 0x3ff18af9388c8dea /*2^( 17 /128)*/
.quad 0x3ff1a35beb6fcb75 /*2^( 18 /128)*/
.quad 0x3ff1bbe084045cd4 /*2^( 19 /128)*/
.quad 0x3ff1d4873168b9aa /*2^( 20 /128)*/
.quad 0x3ff1ed5022fcd91d /*2^( 21 /128)*/
.quad 0x3ff2063b88628cd6 /*2^( 22 /128)*/
.quad 0x3ff21f49917ddc96 /*2^( 23 /128)*/
.quad 0x3ff2387a6e756238 /*2^( 24 /128)*/
.quad 0x3ff251ce4fb2a63f /*2^( 25 /128)*/
.quad 0x3ff26b4565e27cdd /*2^( 26 /128)*/
.quad 0x3ff284dfe1f56381 /*2^( 27 /128)*/
.quad 0x3ff29e9df51fdee1 /*2^( 28 /128)*/
.quad 0x3ff2b87fd0dad990 /*2^( 29 /128)*/
.quad 0x3ff2d285a6e4030b /*2^( 30 /128)*/
.quad 0x3ff2ecafa93e2f56 /*2^( 31 /128)*/
.quad 0x3ff306fe0a31b715 /*2^( 32 /128)*/
.quad 0x3ff32170fc4cd831 /*2^( 33 /128)*/
.quad 0x3ff33c08b26416ff /*2^( 34 /128)*/
.quad 0x3ff356c55f929ff1 /*2^( 35 /128)*/
.quad 0x3ff371a7373aa9cb /*2^( 36 /128)*/
.quad 0x3ff38cae6d05d866 /*2^( 37 /128)*/
.quad 0x3ff3a7db34e59ff7 /*2^( 38 /128)*/
.quad 0x3ff3c32dc313a8e5 /*2^( 39 /128)*/
.quad 0x3ff3dea64c123422 /*2^( 40 /128)*/
.quad 0x3ff3fa4504ac801c /*2^( 41 /128)*/
.quad 0x3ff4160a21f72e2a /*2^( 42 /128)*/
.quad 0x3ff431f5d950a897 /*2^( 43 /128)*/
.quad 0x3ff44e086061892d /*2^( 44 /128)*/
.quad 0x3ff46a41ed1d0057 /*2^( 45 /128)*/
.quad 0x3ff486a2b5c13cd0 /*2^( 46 /128)*/
.quad 0x3ff4a32af0d7d3de /*2^( 47 /128)*/
.quad 0x3ff4bfdad5362a27 /*2^( 48 /128)*/
.quad 0x3ff4dcb299fddd0d /*2^( 49 /128)*/
.quad 0x3ff4f9b2769d2ca7 /*2^( 50 /128)*/
.quad 0x3ff516daa2cf6642 /*2^( 51 /128)*/
.quad 0x3ff5342b569d4f82 /*2^( 52 /128)*/
.quad 0x3ff551a4ca5d920f /*2^( 53 /128)*/
.quad 0x3ff56f4736b527da /*2^( 54 /128)*/
.quad 0x3ff58d12d497c7fd /*2^( 55 /128)*/
.quad 0x3ff5ab07dd485429 /*2^( 56 /128)*/
.quad 0x3ff5c9268a5946b7 /*2^( 57 /128)*/
.quad 0x3ff5e76f15ad2148 /*2^( 58 /128)*/
.quad 0x3ff605e1b976dc09 /*2^( 59 /128)*/
.quad 0x3ff6247eb03a5585 /*2^( 60 /128)*/
.quad 0x3ff6434634ccc320 /*2^( 61 /128)*/
.quad 0x3ff6623882552225 /*2^( 62 /128)*/
.quad 0x3ff68155d44ca973 /*2^( 63 /128)*/
.quad 0x3ff6a09e667f3bcd /*2^( 64 /128)*/
.quad 0x3ff6c012750bdabf /*2^( 65 /128)*/
.quad 0x3ff6dfb23c651a2f /*2^( 66 /128)*/
.quad 0x3ff6ff7df9519484 /*2^( 67 /128)*/
.quad 0x3ff71f75e8ec5f74 /*2^( 68 /128)*/
.quad 0x3ff73f9a48a58174 /*2^( 69 /128)*/
.quad 0x3ff75feb564267c9 /*2^( 70 /128)*/
.quad 0x3ff780694fde5d3f /*2^( 71 /128)*/
.quad 0x3ff7a11473eb0187 /*2^( 72 /128)*/
.quad 0x3ff7c1ed0130c132 /*2^( 73 /128)*/
.quad 0x3ff7e2f336cf4e62 /*2^( 74 /128)*/
.quad 0x3ff80427543e1a12 /*2^( 75 /128)*/
.quad 0x3ff82589994cce13 /*2^( 76 /128)*/
.quad 0x3ff8471a4623c7ad /*2^( 77 /128)*/
.quad 0x3ff868d99b4492ed /*2^( 78 /128)*/
.quad 0x3ff88ac7d98a6699 /*2^( 79 /128)*/
.quad 0x3ff8ace5422aa0db /*2^( 80 /128)*/
.quad 0x3ff8cf3216b5448c /*2^( 81 /128)*/
.quad 0x3ff8f1ae99157736 /*2^( 82 /128)*/
.quad 0x3ff9145b0b91ffc6 /*2^( 83 /128)*/
.quad 0x3ff93737b0cdc5e5 /*2^( 84 /128)*/
.quad 0x3ff95a44cbc8520f /*2^( 85 /128)*/
.quad 0x3ff97d829fde4e50 /*2^( 86 /128)*/
.quad 0x3ff9a0f170ca07ba /*2^( 87 /128)*/
.quad 0x3ff9c49182a3f090 /*2^( 88 /128)*/
.quad 0x3ff9e86319e32323 /*2^( 89 /128)*/
.quad 0x3ffa0c667b5de565 /*2^( 90 /128)*/
.quad 0x3ffa309bec4a2d33 /*2^( 91 /128)*/
.quad 0x3ffa5503b23e255d /*2^( 92 /128)*/
.quad 0x3ffa799e1330b358 /*2^( 93 /128)*/
.quad 0x3ffa9e6b5579fdbf /*2^( 94 /128)*/
.quad 0x3ffac36bbfd3f37a /*2^( 95 /128)*/
.quad 0x3ffae89f995ad3ad /*2^( 96 /128)*/
.quad 0x3ffb0e07298db666 /*2^( 97 /128)*/
.quad 0x3ffb33a2b84f15fb /*2^( 98 /128)*/
.quad 0x3ffb59728de5593a /*2^( 99 /128)*/
.quad 0x3ffb7f76f2fb5e47 /*2^( 100 /128)*/
.quad 0x3ffba5b030a1064a /*2^( 101 /128)*/
.quad 0x3ffbcc1e904bc1d2 /*2^( 102 /128)*/
.quad 0x3ffbf2c25bd71e09 /*2^( 103 /128)*/
.quad 0x3ffc199bdd85529c /*2^( 104 /128)*/
.quad 0x3ffc40ab5fffd07a /*2^( 105 /128)*/
.quad 0x3ffc67f12e57d14b /*2^( 106 /128)*/
.quad 0x3ffc8f6d9406e7b5 /*2^( 107 /128)*/
.quad 0x3ffcb720dcef9069 /*2^( 108 /128)*/
.quad 0x3ffcdf0b555dc3fa /*2^( 109 /128)*/
.quad 0x3ffd072d4a07897c /*2^( 110 /128)*/
.quad 0x3ffd2f87080d89f2 /*2^( 111 /128)*/
.quad 0x3ffd5818dcfba487 /*2^( 112 /128)*/
.quad 0x3ffd80e316c98398 /*2^( 113 /128)*/
.quad 0x3ffda9e603db3285 /*2^( 114 /128)*/
.quad 0x3ffdd321f301b460 /*2^( 115 /128)*/
.quad 0x3ffdfc97337b9b5f /*2^( 116 /128)*/
.quad 0x3ffe264614f5a129 /*2^( 117 /128)*/
.quad 0x3ffe502ee78b3ff6 /*2^( 118 /128)*/
.quad 0x3ffe7a51fbc74c83 /*2^( 119 /128)*/
.quad 0x3ffea4afa2a490da /*2^( 120 /128)*/
.quad 0x3ffecf482d8e67f1 /*2^( 121 /128)*/
.quad 0x3ffefa1bee615a27 /*2^( 122 /128)*/
.quad 0x3fff252b376bba97 /*2^( 123 /128)*/
.quad 0x3fff50765b6e4540 /*2^( 124 /128)*/
.quad 0x3fff7bfdad9cbe14 /*2^( 125 /128)*/
.quad 0x3fffa7c1819e90d8 /*2^( 126 /128)*/
.quad 0x3fffd3c22b8f71f1 /*2^( 127 /128)*/
.align 32
.quad 0x407a934f0979a371, 0x407a934f0979a371, 0x407a934f0979a371, 0x407a934f0979a371 /* _dbLg2_10*2^K */
.align 32
.quad 0x4338800000000000, 0x4338800000000000, 0x4338800000000000, 0x4338800000000000 /* _dbShifter */
.align 32
.quad 0x3f63441350a00000, 0x3f63441350a00000, 0x3f63441350a00000, 0x3f63441350a00000 /* _dbInvLg2_10hi/2^K 53-11-K bits*/
.align 32
.quad 0xbd10c0219dc1da99, 0xbd10c0219dc1da99, 0xbd10c0219dc1da99, 0xbd10c0219dc1da99 /* _dbInvLg2_10lo/2^K */
//PC0 = 1.0
.align 32
.quad 0x40026bb1bbb55516, 0x40026bb1bbb55516, 0x40026bb1bbb55516, 0x40026bb1bbb55516 /* _dPC1 */
.align 32
.quad 0x40053524c73ce8e3, 0x40053524c73ce8e3, 0x40053524c73ce8e3, 0x40053524c73ce8e3 /* _dPC2 */
.align 32
.quad 0x4000470591ccea8b, 0x4000470591ccea8b, 0x4000470591ccea8b, 0x4000470591ccea8b /* _dPC3 */
.align 32
.quad 0x3ff2bd767584db59, 0x3ff2bd767584db59, 0x3ff2bd767584db59, 0x3ff2bd767584db59 /* _dPC4 */
.align 32
.quad 0x3fe144c03efafb54, 0x3fe144c03efafb54, 0x3fe144c03efafb54, 0x3fe144c03efafb54 /* _dPC5 */
.align 32
.quad 0xfff0000000000000, 0xfff0000000000000, 0xfff0000000000000, 0xfff0000000000000 /* _lExpMask */
.align 32
.long 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f /* _iIndexMask =(2^K-1)*/
//common
.align 32
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _iAbsMask */
.align 32
.long 0x40733a70, 0x40733a70, 0x40733a70, 0x40733a70, 0x40733a70, 0x40733a70, 0x40733a70, 0x40733a70 /* _iDomainRange */
.align 32
.type __svml_dexp10_data_internal,@object
.size __svml_dexp10_data_internal,.-__svml_dexp10_data_internal

@@ -0,0 +1,20 @@
/* AVX2 version of vectorized exp10, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define _ZGVeN8v_exp10 _ZGVeN8v_exp10_avx2_wrapper
#include "../svml_d_exp108_core.S"

@@ -0,0 +1,27 @@
/* Multiple versions of vectorized exp10, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define SYMBOL_NAME _ZGVeN8v_exp10
#include "ifunc-mathvec-avx512-skx.h"
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
__hidden_ver1 (_ZGVeN8v_exp10, __GI__ZGVeN8v_exp10, __redirect__ZGVeN8v_exp10)
__attribute__ ((visibility ("hidden")));
#endif

@@ -0,0 +1,287 @@
/* Function exp10 vectorized with AVX-512.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
* Typical exp10() implementation, except that:
* - tables are small (16 elements), allowing for fast gathers
* - all arguments processed in the main path
* - final VSCALEF assists branch-free design (correct overflow/underflow and special case responses)
* - a VAND is used to ensure the reduced argument |R|<2, even for large inputs
* - RZ mode used to avoid overflow to +/-Inf for x*log2(10); helps with special case handling
* - SAE used to avoid spurious flag settings
*
*/
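
In scalar terms, the branch-free main path described above behaves roughly like the sketch below.  This is an illustration only: scalbn/floor stand in for VSCALEF, pow (10.0, r) stands in for the degree-6 polynomial in poly_coeff1..6, and the hex constants are the L2E/L2H/L2L values from __svml_dexp10_data_internal_avx512 below.

/* Rough scalar outline of the AVX-512 main path (illustration only).  */
#include <math.h>

static double
exp10_scalef_sketch (double x)
{
  const double log2_10 = 0x1.a934f0979a371p+1;      /* L2E */
  const double l2h = 0x1.34413509f79ffp-2;          /* log10(2), high part (L2H) */
  const double l2l = -0x1.9dc1da994fd21p-59;        /* log10(2), low part  (L2L) */

  double z0 = nearbyint (x * log2_10 * 16.0) / 16.0; /* Z0, 4 fractional bits      */
  double r = (x - z0 * l2h) - z0 * l2l;              /* R = x - Z0*log10(2)        */
  int j = (int) lrint (z0 * 16.0) & 15;              /* index into Exp_tbl_H       */
  double th = exp2 (j / 16.0);                       /* 2^(j/16), the 16-entry table */
  double p = pow (10.0, r);        /* the real code uses a short polynomial here  */
  return scalbn (th * p, (int) floor (z0));          /* VSCALEF: scale by 2^floor(Z0) */
}
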
/* Offsets for data table __svml_dexp10_data_internal_avx512
*/
#define Exp_tbl_H 0
#define L2E 128
#define Shifter 192
#define L2H 256
#define L2L 320
#define EMask 384
#define poly_coeff6 448
#define poly_coeff5 512
#define poly_coeff4 576
#define poly_coeff3 640
#define poly_coeff2 704
#define poly_coeff1 768
#define AbsMask 832
#define Threshold 896
#include <sysdep.h>
.text
.section .text.evex512,"ax",@progbits
ENTRY(_ZGVeN8v_exp10_skx)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-64, %rsp
subq $192, %rsp
vmovups L2E+__svml_dexp10_data_internal_avx512(%rip), %zmm4
vmovups Shifter+__svml_dexp10_data_internal_avx512(%rip), %zmm2
vmovups L2H+__svml_dexp10_data_internal_avx512(%rip), %zmm5
vmovups L2L+__svml_dexp10_data_internal_avx512(%rip), %zmm3
/* polynomial */
vmovups poly_coeff6+__svml_dexp10_data_internal_avx512(%rip), %zmm6
vmovups poly_coeff4+__svml_dexp10_data_internal_avx512(%rip), %zmm7
vmovups poly_coeff3+__svml_dexp10_data_internal_avx512(%rip), %zmm9
vmovups poly_coeff2+__svml_dexp10_data_internal_avx512(%rip), %zmm8
vmovups poly_coeff1+__svml_dexp10_data_internal_avx512(%rip), %zmm11
vmovups Threshold+__svml_dexp10_data_internal_avx512(%rip), %zmm14
vmovaps %zmm0, %zmm1
/* 2^(52-4)*1.5 + x * log2(10) */
vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm4
vandpd AbsMask+__svml_dexp10_data_internal_avx512(%rip), %zmm1, %zmm13
/* Z0 ~ x*log2(10), rounded down to 4 fractional bits */
vsubpd {rn-sae}, %zmm2, %zmm4, %zmm0
/* Table lookup: Th */
vmovups __svml_dexp10_data_internal_avx512(%rip), %zmm2
vcmppd $29, {sae}, %zmm14, %zmm13, %k0
/* R = x - Z0*log10(2) */
vfnmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm5
vpermt2pd Exp_tbl_H+64+__svml_dexp10_data_internal_avx512(%rip), %zmm4, %zmm2
kmovw %k0, %edx
vfnmadd231pd {rn-sae}, %zmm0, %zmm3, %zmm5
vmovups poly_coeff5+__svml_dexp10_data_internal_avx512(%rip), %zmm3
/* ensure |R|<2 even for special cases */
vandpd EMask+__svml_dexp10_data_internal_avx512(%rip), %zmm5, %zmm12
vmulpd {rn-sae}, %zmm12, %zmm12, %zmm10
vmulpd {rn-sae}, %zmm12, %zmm2, %zmm15
vfmadd231pd {rn-sae}, %zmm12, %zmm6, %zmm3
vfmadd231pd {rn-sae}, %zmm12, %zmm7, %zmm9
vfmadd231pd {rn-sae}, %zmm12, %zmm8, %zmm11
vfmadd213pd {rn-sae}, %zmm9, %zmm10, %zmm3
vfmadd213pd {rn-sae}, %zmm11, %zmm10, %zmm3
vfmadd213pd {rn-sae}, %zmm2, %zmm15, %zmm3
vscalefpd {rn-sae}, %zmm0, %zmm3, %zmm0
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
/* Restore registers
* and exit the function
*/
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
vmovups %zmm1, 64(%rsp)
vmovups %zmm0, 128(%rsp)
# LOE rbx r12 r13 r14 r15 edx zmm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
vzeroupper
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $8, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovups 128(%rsp), %zmm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 zmm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movsd 64(%rsp,%r14,8), %xmm0
call exp10@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movsd %xmm0, 128(%rsp,%r14,8)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVeN8v_exp10_skx)
.section .rodata, "a"
.align 64
#ifdef __svml_dexp10_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(64)) VUINT32 Exp_tbl_H[16][2];
__declspec(align(64)) VUINT32 L2E[8][2];
__declspec(align(64)) VUINT32 Shifter[8][2];
__declspec(align(64)) VUINT32 L2H[8][2];
__declspec(align(64)) VUINT32 L2L[8][2];
__declspec(align(64)) VUINT32 EMask[8][2];
__declspec(align(64)) VUINT32 poly_coeff6[8][2];
__declspec(align(64)) VUINT32 poly_coeff5[8][2];
__declspec(align(64)) VUINT32 poly_coeff4[8][2];
__declspec(align(64)) VUINT32 poly_coeff3[8][2];
__declspec(align(64)) VUINT32 poly_coeff2[8][2];
__declspec(align(64)) VUINT32 poly_coeff1[8][2];
__declspec(align(64)) VUINT32 AbsMask[8][2];
__declspec(align(64)) VUINT32 Threshold[8][2];
} __svml_dexp10_data_internal_avx512;
#endif
__svml_dexp10_data_internal_avx512:
/*== Exp_tbl_H ==*/
.quad 0x3ff0000000000000
.quad 0x3ff0b5586cf9890f
.quad 0x3ff172b83c7d517b
.quad 0x3ff2387a6e756238
.quad 0x3ff306fe0a31b715
.quad 0x3ff3dea64c123422
.quad 0x3ff4bfdad5362a27
.quad 0x3ff5ab07dd485429
.quad 0x3ff6a09e667f3bcd
.quad 0x3ff7a11473eb0187
.quad 0x3ff8ace5422aa0db
.quad 0x3ff9c49182a3f090
.quad 0x3ffae89f995ad3ad
.quad 0x3ffc199bdd85529c
.quad 0x3ffd5818dcfba487
.quad 0x3ffea4afa2a490da
/*== log2(10) ==*/
.align 64
.quad 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371
/*== Shifter=2^(52-4)*1.5 ==*/
.align 64
.quad 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0
/*== L2H = log10(2)_high ==*/
.align 64
.quad 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff
/*== L2L = log10(2)_low ==*/
.align 64
.quad 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21
/*== EMask ==*/
.align 64
.quad 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff
/*== poly_coeff6 ==*/
.align 64
.quad 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020
/*== poly_coeff5 ==*/
.align 64
.quad 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424
/*== poly_coeff4 ==*/
.align 64
.quad 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d
/*== poly_coeff3 ==*/
.align 64
.quad 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8
/*== poly_coeff2 ==*/
.align 64
.quad 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25
/*== poly_coeff1 ==*/
.align 64
.quad 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2
/*== AbsMask ==*/
.align 64
.quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
/*== Threshold ==*/
.align 64
.quad 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41
.align 64
.type __svml_dexp10_data_internal_avx512,@object
.size __svml_dexp10_data_internal_avx512,.-__svml_dexp10_data_internal_avx512

@@ -0,0 +1,20 @@
/* AVX2 version of vectorized exp10f.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define _ZGVeN16v_exp10f _ZGVeN16v_exp10f_avx2_wrapper
#include "../svml_s_exp10f16_core.S"

@@ -0,0 +1,28 @@
/* Multiple versions of vectorized exp10f, vector length is 16.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define SYMBOL_NAME _ZGVeN16v_exp10f
#include "ifunc-mathvec-avx512-skx.h"
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
__hidden_ver1 (_ZGVeN16v_exp10f, __GI__ZGVeN16v_exp10f,
__redirect__ZGVeN16v_exp10f)
__attribute__ ((visibility ("hidden")));
#endif

@@ -0,0 +1,269 @@
/* Function exp10f vectorized with AVX-512.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
* Typical exp10() implementation, except that:
* - tables are small (16 elements), allowing for fast gathers
* - all arguments processed in the main path
* - final VSCALEF assists branch-free design (correct overflow/underflow and special case responses)
* - a VAND is used to ensure the reduced argument |R|<2, even for large inputs
* - RZ mode used to avoid overflow to +/-Inf for x*log2(10); helps with special case handling
* - SAE used to avoid spurious flag settings
*
*/
/* Offsets for data table __svml_sexp10_data_internal_avx512
*/
#define Exp_tbl_L 0
#define Exp_tbl_H 128
#define L2E 256
#define Shifter 320
#define L2H 384
#define L2L 448
#define EMask 512
#define AbsMask 576
#define Threshold 640
#define poly_coeff2 704
#define poly_coeff1 768
#include <sysdep.h>
.text
.section .text.evex512,"ax",@progbits
ENTRY(_ZGVeN16v_exp10f_skx)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-64, %rsp
subq $192, %rsp
vmovups L2E+__svml_sexp10_data_internal_avx512(%rip), %zmm2
vmovups Shifter+__svml_sexp10_data_internal_avx512(%rip), %zmm1
vmovups L2H+__svml_sexp10_data_internal_avx512(%rip), %zmm5
vmovups L2L+__svml_sexp10_data_internal_avx512(%rip), %zmm4
/* ensure |R|<2 even for special cases */
vmovups EMask+__svml_sexp10_data_internal_avx512(%rip), %zmm6
vmovups poly_coeff2+__svml_sexp10_data_internal_avx512(%rip), %zmm9
/* 2^(23-10)*1.5 + x * log2(10) */
vfmadd213ps {rz-sae}, %zmm1, %zmm0, %zmm2
vmovups poly_coeff1+__svml_sexp10_data_internal_avx512(%rip), %zmm10
vmovups __svml_sexp10_data_internal_avx512(%rip), %zmm8
vmovups Exp_tbl_H+__svml_sexp10_data_internal_avx512(%rip), %zmm15
vmovups Threshold+__svml_sexp10_data_internal_avx512(%rip), %zmm13
vpsrld $5, %zmm2, %zmm3
/* Z0 ~ x*log2(10), rounded down to 6 fractional bits */
vsubps {rn-sae}, %zmm1, %zmm2, %zmm1
vpermt2ps Exp_tbl_L+64+__svml_sexp10_data_internal_avx512(%rip), %zmm2, %zmm8
vpermt2ps Exp_tbl_H+64+__svml_sexp10_data_internal_avx512(%rip), %zmm3, %zmm15
vandps AbsMask+__svml_sexp10_data_internal_avx512(%rip), %zmm0, %zmm12
/* R = x - Z0*log10(2) */
vfnmadd213ps {rn-sae}, %zmm0, %zmm1, %zmm5
vcmpps $29, {sae}, %zmm13, %zmm12, %k0
vfnmadd231ps {rn-sae}, %zmm1, %zmm4, %zmm5
kmovw %k0, %edx
vrangeps $2, {sae}, %zmm6, %zmm5, %zmm11
vfmadd231ps {rn-sae}, %zmm11, %zmm9, %zmm10
vmulps {rn-sae}, %zmm11, %zmm10, %zmm14
/* x!=0? */
vpxord %zmm7, %zmm7, %zmm7
vcmpps $4, {sae}, %zmm7, %zmm0, %k1
/* Th*Tl */
vmulps {rn-sae}, %zmm8, %zmm15, %zmm15{%k1}
vfmadd213ps {rn-sae}, %zmm15, %zmm14, %zmm15
vscalefps {rn-sae}, %zmm1, %zmm15, %zmm1
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
/* Restore registers
* and exit the function
*/
L(EXIT):
vmovaps %zmm1, %zmm0
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
vmovups %zmm0, 64(%rsp)
vmovups %zmm1, 128(%rsp)
# LOE rbx r12 r13 r14 r15 edx zmm1
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
vzeroupper
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $16, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovups 128(%rsp), %zmm1
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 zmm1
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movss 64(%rsp,%r14,4), %xmm0
call exp10f@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movss %xmm0, 128(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVeN16v_exp10f_skx)
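The SPECIAL_VALUES path above is a per-lane fallback: the input and result vectors are spilled to the stack, each set bit of the range mask selects a lane whose input is fed to the scalar exp10f, and the scalar result overwrites that lane before the vector result is reloaded. A hedged C sketch of the idea (the array interface and helper name are illustrative, not the actual stack layout):

#define _GNU_SOURCE /* for the exp10f declaration */
#include <math.h>

/* Re-evaluate flagged lanes with the scalar routine, which supplies
   the full overflow/underflow/NaN and errno handling.  */
static void
fixup_special_lanes (const float in[16], float out[16], unsigned mask)
{
  for (int i = 0; i < 16; i++)
    if (mask & (1u << i))
      out[i] = exp10f (in[i]);
}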
.section .rodata, "a"
.align 64
#ifdef __svml_sexp10_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(64)) VUINT32 Exp_tbl_L[32][1];
__declspec(align(64)) VUINT32 Exp_tbl_H[32][1];
__declspec(align(64)) VUINT32 L2E[16][1];
__declspec(align(64)) VUINT32 Shifter[16][1];
__declspec(align(64)) VUINT32 L2H[16][1];
__declspec(align(64)) VUINT32 L2L[16][1];
__declspec(align(64)) VUINT32 EMask[16][1];
__declspec(align(64)) VUINT32 AbsMask[16][1];
__declspec(align(64)) VUINT32 Threshold[16][1];
__declspec(align(64)) VUINT32 poly_coeff2[16][1];
__declspec(align(64)) VUINT32 poly_coeff1[16][1];
} __svml_sexp10_data_internal_avx512;
#endif
__svml_sexp10_data_internal_avx512:
/*== Exp_tbl_L ==*/
.long 0x3f800001, 0x3f801631, 0x3f802c65, 0x3f80429d
.long 0x3f8058d9, 0x3f806f18, 0x3f80855c, 0x3f809ba3
.long 0x3f80b1ee, 0x3f80c83d, 0x3f80de90, 0x3f80f4e7
.long 0x3f810b42, 0x3f8121a0, 0x3f813803, 0x3f814e69
.long 0x3f8164d3, 0x3f817b41, 0x3f8191b3, 0x3f81a829
.long 0x3f81bea2, 0x3f81d520, 0x3f81eba2, 0x3f820227
.long 0x3f8218b0, 0x3f822f3d, 0x3f8245cf, 0x3f825c64
.long 0x3f8272fd, 0x3f828999, 0x3f82a03a, 0x3f82b6df
/*== Exp_tbl_H ==*/
.align 64
.long 0x3f800000, 0x3f82cd87, 0x3f85aac3, 0x3f88980f
.long 0x3f8b95c2, 0x3f8ea43a, 0x3f91c3d3, 0x3f94f4f0
.long 0x3f9837f0, 0x3f9b8d3a, 0x3f9ef532, 0x3fa27043
.long 0x3fa5fed7, 0x3fa9a15b, 0x3fad583f, 0x3fb123f6
.long 0x3fb504f3, 0x3fb8fbaf, 0x3fbd08a4, 0x3fc12c4d
.long 0x3fc5672a, 0x3fc9b9be, 0x3fce248c, 0x3fd2a81e
.long 0x3fd744fd, 0x3fdbfbb8, 0x3fe0ccdf, 0x3fe5b907
.long 0x3feac0c7, 0x3fefe4ba, 0x3ff5257d, 0x3ffa83b3
/*== log2(10) ==*/
.align 64
.long 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78
/*== Shifter=2^(23-10)*1.5 ==*/
.align 64
.long 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000
/*== L2H = log10(2)_high ==*/
.align 64
.long 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b
/*== L2L = log10(2)_low ==*/
.align 64
.long 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860
/*== EMask ==*/
.align 64
.long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
/*== AbsMask ==*/
.align 64
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/*== Threshold ==*/
.align 64
.long 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818
/*== poly_coeff2 ==*/
.align 64
.long 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA
/*== poly_coeff1 ==*/
.align 64
.long 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D
.align 64
.type __svml_sexp10_data_internal_avx512,@object
.size __svml_sexp10_data_internal_avx512,.-__svml_sexp10_data_internal_avx512

View File

@ -0,0 +1,20 @@
/* SSE2 version of vectorized exp10f, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define _ZGVbN4v_exp10f _ZGVbN4v_exp10f_sse2
#include "../svml_s_exp10f4_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized exp10f, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define SYMBOL_NAME _ZGVbN4v_exp10f
#include "ifunc-mathvec-sse4_1.h"
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
__hidden_ver1 (_ZGVbN4v_exp10f, __GI__ZGVbN4v_exp10f,
__redirect__ZGVbN4v_exp10f)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,311 @@
/* Function exp10f vectorized with SSE4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/*
* ALGORITHM DESCRIPTION:
*
* exp10(x) = 2^(x/log10(2)) = 2^n * (1 + T[j]) * (1 + P(y))
* where
* x = m*log10(2)/K + y, y in [-log10(2)/K..log10(2)/K]
* m = n*K + j; m, n, j are signed integers, j in [-K/2..K/2]
*
* values of 2^(j/K) are tabulated
*
* P(y) is a minimax polynomial approximation of exp10(y)-1
* on a small interval [-log10(2)/K..log10(2)/K]
*
* Special cases:
*
* exp10(NaN) = NaN
* exp10(+INF) = +INF
* exp10(-INF) = 0
* exp10(x) = 1 for subnormals
* For IEEE float
* if x > 38.5318412780761720 then exp10f(x) overflow
* if x < -45.4555282592773440 then exp10f(x) underflow
*
*/
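As a reading aid, the reduction just described can be written as a scalar C sketch (K = 32). The table T[], the coefficients c1/c2 and the helper name exp10f_sketch are assumptions made for illustration, not the constants or code of this file; the real kernel additionally splits log10(2)/32 into hi/lo parts and funnels the special cases listed above to the scalar exp10f.

#include <math.h>

#define LOG2_10 3.3219280948873623f /* 1/log10(2) */
#define LOG10_2 0.3010299956639812f /* log10(2) */

/* Scalar sketch: exp10f(x) ~ 2^n * T[j] * (1 + P(y)), T[j] ~ 2^(j/32),
   c1/c2 standing in for minimax coefficients of 10^y - 1.  */
static float
exp10f_sketch (float x, const float T[32], float c1, float c2)
{
  float m = roundf (x * LOG2_10 * 32.0f); /* m = n*32 + j */
  int mi = (int) m;
  int j = mi & 31;                        /* table index */
  int n = (mi - j) / 32;                  /* power of two */
  float y = x - m * (LOG10_2 / 32.0f);    /* reduced argument */
  float p = y * (c1 + y * c2);            /* P(y) ~ 10^y - 1 */
  return ldexpf (T[j] * (1.0f + p), n);   /* scale by 2^n */
}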
/* Offsets for data table __svml_sexp10_data_internal
*/
#define _sT 0
#define _sLg2_10 128
#define _sShifter 144
#define _sInvLg2_10hi 160
#define _sInvLg2_10lo 176
#define _sPC0 192
#define _sPC1 208
#define _sPC2 224
#define _iIndexMask 240
#define _iAbsMask 256
#define _iDomainRange 272
#include <sysdep.h>
.text
.section .text.sse4,"ax",@progbits
ENTRY(_ZGVbN4v_exp10f_sse4)
subq $72, %rsp
cfi_def_cfa_offset(80)
movaps %xmm0, %xmm4
/* Load argument */
movups _sLg2_10+__svml_sexp10_data_internal(%rip), %xmm2
lea __svml_sexp10_data_internal(%rip), %r8
mulps %xmm4, %xmm2
movups _sShifter+__svml_sexp10_data_internal(%rip), %xmm5
/* R */
movups _sInvLg2_10hi+__svml_sexp10_data_internal(%rip), %xmm14
addps %xmm5, %xmm2
movaps %xmm2, %xmm1
movups _sInvLg2_10lo+__svml_sexp10_data_internal(%rip), %xmm15
subps %xmm5, %xmm1
mulps %xmm1, %xmm14
movaps %xmm4, %xmm5
mulps %xmm1, %xmm15
subps %xmm14, %xmm5
/*
* Polynomial
* exp10 = 2^N*(Tj+Tj*poly)
* poly(sN) = {1+later} a0+a1*sR
*/
movups _sPC2+__svml_sexp10_data_internal(%rip), %xmm1
subps %xmm15, %xmm5
mulps %xmm5, %xmm1
movdqu _iIndexMask+__svml_sexp10_data_internal(%rip), %xmm3
/* Index and lookup */
movdqa %xmm3, %xmm10
/* remove index bits */
pandn %xmm2, %xmm3
pand %xmm2, %xmm10
/* 2^N */
pslld $18, %xmm3
/* iIndex *= sizeof(S); */
pslld $2, %xmm10
addps _sPC1+__svml_sexp10_data_internal(%rip), %xmm1
movd %xmm10, %edx
pshufd $1, %xmm10, %xmm7
pshufd $2, %xmm10, %xmm9
pshufd $3, %xmm10, %xmm11
movd %xmm7, %ecx
movd %xmm9, %esi
movd %xmm11, %edi
/* Check for overflow/underflow */
movdqu _iAbsMask+__svml_sexp10_data_internal(%rip), %xmm6
pand %xmm4, %xmm6
mulps %xmm1, %xmm5
movslq %edx, %rdx
addps _sPC0+__svml_sexp10_data_internal(%rip), %xmm5
movslq %ecx, %rcx
movslq %esi, %rsi
movslq %edi, %rdi
movd (%r8,%rdx), %xmm0
movd (%r8,%rcx), %xmm8
movd (%r8,%rsi), %xmm13
movd (%r8,%rdi), %xmm12
punpckldq %xmm8, %xmm0
punpckldq %xmm12, %xmm13
punpcklqdq %xmm13, %xmm0
/* Tj_l+Tj_h*poly */
mulps %xmm0, %xmm5
pcmpgtd _iDomainRange+__svml_sexp10_data_internal(%rip), %xmm6
addps %xmm5, %xmm0
movmskps %xmm6, %eax
/* quick mul 2^N */
paddd %xmm3, %xmm0
/* Finish */
testl %eax, %eax
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm4
/* Restore registers
* and exit the function
*/
L(EXIT):
addq $72, %rsp
cfi_def_cfa_offset(8)
ret
cfi_def_cfa_offset(80)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
movups %xmm4, 32(%rsp)
movups %xmm0, 48(%rsp)
# LOE rbx rbp r12 r13 r14 r15 eax
xorl %edx, %edx
movq %r12, 16(%rsp)
cfi_offset(12, -64)
movl %edx, %r12d
movq %r13, 8(%rsp)
cfi_offset(13, -72)
movl %eax, %r13d
movq %r14, (%rsp)
cfi_offset(14, -80)
# LOE rbx rbp r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx rbp r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $4, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx rbp r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
movups 48(%rsp), %xmm0
/* Go to exit */
jmp L(EXIT)
cfi_offset(12, -64)
cfi_offset(13, -72)
cfi_offset(14, -80)
# LOE rbx rbp r12 r13 r14 r15 xmm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movss 32(%rsp,%r14,4), %xmm0
call exp10f@PLT
# LOE rbx rbp r14 r15 r12d r13d xmm0
movss %xmm0, 48(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx rbp r15 r12d r13d
END(_ZGVbN4v_exp10f_sse4)
.section .rodata, "a"
.align 16
#ifdef __svml_sexp10_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(16)) VUINT32 _sT[(1<<5)][1];
__declspec(align(16)) VUINT32 _sLg2_10[4][1];
__declspec(align(16)) VUINT32 _sShifter[4][1];
__declspec(align(16)) VUINT32 _sInvLg2_10hi[4][1];
__declspec(align(16)) VUINT32 _sInvLg2_10lo[4][1];
__declspec(align(16)) VUINT32 _sPC0[4][1];
__declspec(align(16)) VUINT32 _sPC1[4][1];
__declspec(align(16)) VUINT32 _sPC2[4][1];
__declspec(align(16)) VUINT32 _iIndexMask[4][1];
__declspec(align(16)) VUINT32 _iAbsMask[4][1];
__declspec(align(16)) VUINT32 _iDomainRange[4][1];
} __svml_sexp10_data_internal;
#endif
__svml_sexp10_data_internal:
/*== _sT ==*/
.long 0x3f800000 // 2^( 0 /32 )
.long 0x3f82cd87 // 2^( 1 /32 )
.long 0x3f85aac3 // 2^( 2 /32 )
.long 0x3f88980f // 2^( 3 /32 )
.long 0x3f8b95c2 // 2^( 4 /32 )
.long 0x3f8ea43a // 2^( 5 /32 )
.long 0x3f91c3d3 // 2^( 6 /32 )
.long 0x3f94f4f0 // 2^( 7 /32 )
.long 0x3f9837f0 // 2^( 8 /32 )
.long 0x3f9b8d3a // 2^( 9 /32 )
.long 0x3f9ef532 // 2^( 10/32 )
.long 0x3fa27043 // 2^( 11/32 )
.long 0x3fa5fed7 // 2^( 12/32 )
.long 0x3fa9a15b // 2^( 13/32 )
.long 0x3fad583f // 2^( 14/32 )
.long 0x3fb123f6 // 2^( 15/32 )
.long 0x3fb504f3 // 2^( 16/32 )
.long 0x3fb8fbaf // 2^( 17/32 )
.long 0x3fbd08a4 // 2^( 18/32 )
.long 0x3fc12c4d // 2^( 19/32 )
.long 0x3fc5672a // 2^( 20/32 )
.long 0x3fc9b9be // 2^( 21/32 )
.long 0x3fce248c // 2^( 22/32 )
.long 0x3fd2a81e // 2^( 23/32 )
.long 0x3fd744fd // 2^( 24/32 )
.long 0x3fdbfbb8 // 2^( 25/32 )
.long 0x3fe0ccdf // 2^( 26/32 )
.long 0x3fe5b907 // 2^( 27/32 )
.long 0x3feac0c7 // 2^( 28/32 )
.long 0x3fefe4ba // 2^( 29/32 )
.long 0x3ff5257d // 2^( 30/32 )
.long 0x3ffa83b3 // 2^( 31/32 )
.align 16
.long 0x42d49a78, 0x42d49a78, 0x42d49a78, 0x42d49a78 /* _sLg2_10*2^K */
.align 16
.long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
.align 16
.long 0x3c1a2000, 0x3c1a2000, 0x3c1a2000, 0x3c1a2000 /* _sInvLg2_10hi/2^K hi (24-K-7) bits*/
.align 16
.long 0x341a84fc, 0x341a84fc, 0x341a84fc, 0x341a84fc /* _sInvLg2_10lo/2^K lo bits */
// otherwise exp10(0) won't produce exact 1.0 (see the reduction sketch after this table)
.align 16
.long 0x2fecc868, 0x2fecc868, 0x2fecc868, 0x2fecc868 /* _sPC0 */
.align 16
.long 0x40135e1b, 0x40135e1b, 0x40135e1b, 0x40135e1b /* _sPC1 */
.align 16
.long 0x4029a8d2, 0x4029a8d2, 0x4029a8d2, 0x4029a8d2 /* _sPC2 */
.align 16
.long 0x0000001f, 0x0000001f, 0x0000001f, 0x0000001f /* _iIndexMask =(2^K-1)*/
//common
.align 16
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _iAbsMask */
.align 16
.long 0x4217b818, 0x4217b818, 0x4217b818, 0x4217b818 /* _iDomainRange=-log10(max_denormal=0x007fffff) RZ */
.align 16
.type __svml_sexp10_data_internal,@object
.size __svml_sexp10_data_internal,.-__svml_sexp10_data_internal
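The _sInvLg2_10hi/_sInvLg2_10lo pair is a Cody-Waite style split of log10(2)/32: the hi word keeps only the leading bits (with trailing zeros), so its product with the integer-valued m is exact, and the lo word restores the discarded bits; without the lo term the reduction error would be large enough that exp10f(0) could stray from 1.0, as the table comment notes. Decoding the two words gives the sketch below (the hex-float constants are the exact values stored above; the helper name is illustrative):

/* r = x - m*(c_hi + c_lo); m*c_hi is exact for in-range m.  */
static float
reduce_exp10f_arg (float x, float m)
{
  const float c_hi = 0x1.344p-7f;     /* 0x3c1a2000: leading bits of log10(2)/32 */
  const float c_lo = 0x1.3509f8p-23f; /* 0x341a84fc: low-order correction */
  return (x - m * c_hi) - m * c_lo;
}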

View File

@ -0,0 +1,20 @@
/* SSE version of vectorized exp10f, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define _ZGVdN8v_exp10f _ZGVdN8v_exp10f_sse_wrapper
#include "../svml_s_exp10f8_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized exp10f, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define SYMBOL_NAME _ZGVdN8v_exp10f
#include "ifunc-mathvec-avx2.h"
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
__hidden_ver1 (_ZGVdN8v_exp10f, __GI__ZGVdN8v_exp10f,
__redirect__ZGVdN8v_exp10f)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,331 @@
/* Function exp10f vectorized with AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/*
* ALGORITHM DESCRIPTION:
*
* exp10(x) = 2^(x/log10(2)) = 2^n * (1 + T[j]) * (1 + P(y))
* where
* x = m*log10(2)/K + y, y in [-log10(2)/K..log10(2)/K]
* m = n*K + j; m, n, j are signed integers, j in [-K/2..K/2]
*
* values of 2^(j/K) are tabulated
*
* P(y) is a minimax polynomial approximation of exp10(y)-1
* on a small interval [-log10(2)/K..log10(2)/K]
*
* Special cases:
*
* exp10(NaN) = NaN
* exp10(+INF) = +INF
* exp10(-INF) = 0
* exp10(x) = 1 for subnormals
* For IEEE float
* if x > 38.5318412780761720 then exp10f(x) overflow
* if x < -45.4555282592773440 then exp10f(x) underflow
*
*/
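One detail worth spelling out: the code below multiplies x by log2(10)*2^K and adds the Shifter constant 0x4b400000 (1.5*2^23). In round-to-nearest mode that sum carries round(x*log2(10)*32) in its low mantissa bits, so the 5-bit table index (iIndexMask) and the 2^N exponent bits (the remaining bits, shifted left by 23-5 = 18 and added with vpaddd as a "quick mul 2^N") come straight from the float's bit pattern. A minimal scalar illustration of the rounding trick, assuming default rounding and a suitably small argument:

/* Round v to the nearest integer with the 1.5*2^23 shifter trick;
   valid only for |v| well below 2^22 under round-to-nearest.  */
static inline float
fast_round (float v)
{
  const float shifter = 0x1.8p23f; /* 1.5 * 2^23 = 12582912.0f */
  float s = v + shifter;           /* low mantissa bits of s hold round(v) */
  return s - shifter;              /* recover the rounded value */
}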
/* Offsets for data table __svml_sexp10_data_internal
*/
#define _sT 0
#define _sLg2_10 128
#define _sShifter 160
#define _sInvLg2_10hi 192
#define _sInvLg2_10lo 224
#define _sPC0 256
#define _sPC1 288
#define _sPC2 320
#define _iIndexMask 352
#define _iAbsMask 384
#define _iDomainRange 416
#include <sysdep.h>
.text
.section .text.avx2,"ax",@progbits
ENTRY(_ZGVdN8v_exp10f_avx2)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-32, %rsp
subq $96, %rsp
lea __svml_sexp10_data_internal(%rip), %rax
vmovups _sShifter+__svml_sexp10_data_internal(%rip), %ymm4
/* Load argument */
vmovups _sLg2_10+__svml_sexp10_data_internal(%rip), %ymm1
vmovups _iIndexMask+__svml_sexp10_data_internal(%rip), %ymm2
vmovaps %ymm0, %ymm3
vfmadd213ps %ymm4, %ymm3, %ymm1
/* Index and lookup */
vandps %ymm2, %ymm1, %ymm7
/* iIndex *= sizeof(S); */
vpslld $2, %ymm7, %ymm10
vsubps %ymm4, %ymm1, %ymm0
/* Check for overflow/underflow */
vandps _iAbsMask+__svml_sexp10_data_internal(%rip), %ymm3, %ymm5
vpcmpgtd _iDomainRange+__svml_sexp10_data_internal(%rip), %ymm5, %ymm6
vmovmskps %ymm6, %edx
vmovd %xmm10, %ecx
vextractf128 $1, %ymm10, %xmm6
vpextrd $1, %xmm10, %esi
vpextrd $2, %xmm10, %edi
vpextrd $3, %xmm10, %r8d
movslq %ecx, %rcx
movslq %esi, %rsi
movslq %edi, %rdi
movslq %r8d, %r8
vmovd (%rax,%rcx), %xmm8
vmovd (%rax,%rsi), %xmm9
vmovd (%rax,%rdi), %xmm11
vmovd (%rax,%r8), %xmm12
vpunpckldq %xmm9, %xmm8, %xmm13
vpunpckldq %xmm12, %xmm11, %xmm14
vpunpcklqdq %xmm14, %xmm13, %xmm15
/* R */
vmovups _sInvLg2_10hi+__svml_sexp10_data_internal(%rip), %ymm13
vmovd %xmm6, %r9d
vfnmadd213ps %ymm3, %ymm0, %ymm13
vpextrd $1, %xmm6, %r10d
movslq %r9d, %r9
movslq %r10d, %r10
vfnmadd132ps _sInvLg2_10lo+__svml_sexp10_data_internal(%rip), %ymm13, %ymm0
vmovd (%rax,%r9), %xmm4
vmovd (%rax,%r10), %xmm5
vpunpckldq %xmm5, %xmm4, %xmm9
/*
* Polynomial
* exp10 = 2^N*(Tj+Tj*poly)
* poly(sN) = {1+later} a0+a1*sR
*/
vmovups _sPC2+__svml_sexp10_data_internal(%rip), %ymm4
vfmadd213ps _sPC1+__svml_sexp10_data_internal(%rip), %ymm0, %ymm4
vpextrd $2, %xmm6, %r11d
vpextrd $3, %xmm6, %ecx
movslq %r11d, %r11
movslq %ecx, %rcx
vfmadd213ps _sPC0+__svml_sexp10_data_internal(%rip), %ymm0, %ymm4
vmovd (%rax,%r11), %xmm7
vmovd (%rax,%rcx), %xmm8
vpunpckldq %xmm8, %xmm7, %xmm11
/* remove index bits */
vpandn %ymm1, %ymm2, %ymm0
vpunpcklqdq %xmm11, %xmm9, %xmm12
/* 2^N */
vpslld $18, %ymm0, %ymm1
vinsertf128 $1, %xmm12, %ymm15, %ymm14
/* Tj_l+Tj_h*poly */
vfmadd213ps %ymm14, %ymm14, %ymm4
/* quick mul 2^N */
vpaddd %ymm1, %ymm4, %ymm0
/* Finish */
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx ymm0 ymm3
/* Restore registers
* and exit the function
*/
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
vmovups %ymm3, 32(%rsp)
vmovups %ymm0, 64(%rsp)
# LOE rbx r12 r13 r14 r15 edx ymm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
vzeroupper
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $8, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovups 64(%rsp), %ymm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 ymm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movss 32(%rsp,%r14,4), %xmm0
call exp10f@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movss %xmm0, 64(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVdN8v_exp10f_avx2)
.section .rodata, "a"
.align 32
#ifdef __svml_sexp10_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(32)) VUINT32 _sT[(1<<5)][1];
__declspec(align(32)) VUINT32 _sLg2_10[8][1];
__declspec(align(32)) VUINT32 _sShifter[8][1];
__declspec(align(32)) VUINT32 _sInvLg2_10hi[8][1];
__declspec(align(32)) VUINT32 _sInvLg2_10lo[8][1];
__declspec(align(32)) VUINT32 _sPC0[8][1];
__declspec(align(32)) VUINT32 _sPC1[8][1];
__declspec(align(32)) VUINT32 _sPC2[8][1];
__declspec(align(32)) VUINT32 _iIndexMask[8][1];
__declspec(align(32)) VUINT32 _iAbsMask[8][1];
__declspec(align(32)) VUINT32 _iDomainRange[8][1];
} __svml_sexp10_data_internal;
#endif
__svml_sexp10_data_internal:
/*== _sT ==*/
.long 0x3f800000 // 2^( 0 /32 )
.long 0x3f82cd87 // 2^( 1 /32 )
.long 0x3f85aac3 // 2^( 2 /32 )
.long 0x3f88980f // 2^( 3 /32 )
.long 0x3f8b95c2 // 2^( 4 /32 )
.long 0x3f8ea43a // 2^( 5 /32 )
.long 0x3f91c3d3 // 2^( 6 /32 )
.long 0x3f94f4f0 // 2^( 7 /32 )
.long 0x3f9837f0 // 2^( 8 /32 )
.long 0x3f9b8d3a // 2^( 9 /32 )
.long 0x3f9ef532 // 2^( 10/32 )
.long 0x3fa27043 // 2^( 11/32 )
.long 0x3fa5fed7 // 2^( 12/32 )
.long 0x3fa9a15b // 2^( 13/32 )
.long 0x3fad583f // 2^( 14/32 )
.long 0x3fb123f6 // 2^( 15/32 )
.long 0x3fb504f3 // 2^( 16/32 )
.long 0x3fb8fbaf // 2^( 17/32 )
.long 0x3fbd08a4 // 2^( 18/32 )
.long 0x3fc12c4d // 2^( 19/32 )
.long 0x3fc5672a // 2^( 20/32 )
.long 0x3fc9b9be // 2^( 21/32 )
.long 0x3fce248c // 2^( 22/32 )
.long 0x3fd2a81e // 2^( 23/32 )
.long 0x3fd744fd // 2^( 24/32 )
.long 0x3fdbfbb8 // 2^( 25/32 )
.long 0x3fe0ccdf // 2^( 26/32 )
.long 0x3fe5b907 // 2^( 27/32 )
.long 0x3feac0c7 // 2^( 28/32 )
.long 0x3fefe4ba // 2^( 29/32 )
.long 0x3ff5257d // 2^( 30/32 )
.long 0x3ffa83b3 // 2^( 31/32 )
.align 32
.long 0x42d49a78, 0x42d49a78, 0x42d49a78, 0x42d49a78, 0x42d49a78, 0x42d49a78, 0x42d49a78, 0x42d49a78 /* _sLg2_10*2^K */
.align 32
.long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
.align 32
.long 0x3c1a2000, 0x3c1a2000, 0x3c1a2000, 0x3c1a2000, 0x3c1a2000, 0x3c1a2000, 0x3c1a2000, 0x3c1a2000 /* _sInvLg2_10hi/2^K hi (24-K-7) bits*/
.align 32
.long 0x341a84fc, 0x341a84fc, 0x341a84fc, 0x341a84fc, 0x341a84fc, 0x341a84fc, 0x341a84fc, 0x341a84fc /* _sInvLg2_10lo/2^K lo bits */
// otherwise exp10(0) won't produce exact 1.0
.align 32
.long 0x2fecc868, 0x2fecc868, 0x2fecc868, 0x2fecc868, 0x2fecc868, 0x2fecc868, 0x2fecc868, 0x2fecc868 /* _sPC0 */
.align 32
.long 0x40135e1b, 0x40135e1b, 0x40135e1b, 0x40135e1b, 0x40135e1b, 0x40135e1b, 0x40135e1b, 0x40135e1b /* _sPC1 */
.align 32
.long 0x4029a8d2, 0x4029a8d2, 0x4029a8d2, 0x4029a8d2, 0x4029a8d2, 0x4029a8d2, 0x4029a8d2, 0x4029a8d2 /* _sPC2 */
.align 32
.long 0x0000001f, 0x0000001f, 0x0000001f, 0x0000001f, 0x0000001f, 0x0000001f, 0x0000001f, 0x0000001f /* _iIndexMask =(2^K-1)*/
//common
.align 32
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _iAbsMask */
.align 32
.long 0x4217b818, 0x4217b818, 0x4217b818, 0x4217b818, 0x4217b818, 0x4217b818, 0x4217b818, 0x4217b818 /* _iDomainRange=-log10(max_denormal=0x007fffff) RZ */
.align 32
.type __svml_sexp10_data_internal,@object
.size __svml_sexp10_data_internal,.-__svml_sexp10_data_internal

View File

@ -0,0 +1,29 @@
/* Function exp10 vectorized with SSE2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
ENTRY (_ZGVbN2v_exp10)
WRAPPER_IMPL_SSE2 exp10
END (_ZGVbN2v_exp10)
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN2v_exp10)
#endif

View File

@ -0,0 +1,29 @@
/* Function exp10 vectorized with AVX2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
ENTRY (_ZGVdN4v_exp10)
WRAPPER_IMPL_AVX _ZGVbN2v_exp10
END (_ZGVdN4v_exp10)
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVdN4v_exp10)
#endif

View File

@ -0,0 +1,25 @@
/* Function exp10 vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
ENTRY (_ZGVcN4v_exp10)
WRAPPER_IMPL_AVX _ZGVbN2v_exp10
END (_ZGVcN4v_exp10)

View File

@ -0,0 +1,25 @@
/* Function exp10 vectorized with AVX-512, wrapper to AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
ENTRY (_ZGVeN8v_exp10)
WRAPPER_IMPL_AVX512 _ZGVdN4v_exp10
END (_ZGVeN8v_exp10)

View File

@ -0,0 +1,25 @@
/* Function exp10f vectorized with AVX-512. Wrapper to AVX2 version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
ENTRY (_ZGVeN16v_exp10f)
WRAPPER_IMPL_AVX512 _ZGVdN8v_exp10f
END (_ZGVeN16v_exp10f)
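This wrapper implements _ZGVeN16v_exp10f in terms of the eight-lane AVX2 symbol: conceptually, WRAPPER_IMPL_AVX512 runs the narrower kernel on each 256-bit half of the input and stitches the results back into one 512-bit vector. A rough sketch of that shape (the pointer interface is only an illustration; the real macro works on registers per the vector ABI):

/* Sketch: evaluate 16 lanes with an 8-lane kernel, one half at a time.  */
static void
exp10f_n16_via_n8 (const float in[16], float out[16],
                   void (*kernel8) (const float *in8, float *out8))
{
  kernel8 (in, out);         /* lower eight lanes */
  kernel8 (in + 8, out + 8); /* upper eight lanes */
}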

View File

@ -0,0 +1,29 @@
/* Function exp10f vectorized with SSE2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
ENTRY (_ZGVbN4v_exp10f)
WRAPPER_IMPL_SSE2 exp10f
END (_ZGVbN4v_exp10f)
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN4v_exp10f)
#endif

View File

@ -0,0 +1,29 @@
/* Function exp10f vectorized with AVX2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
ENTRY (_ZGVdN8v_exp10f)
WRAPPER_IMPL_AVX _ZGVbN4v_exp10f
END (_ZGVdN8v_exp10f)
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVdN8v_exp10f)
#endif

View File

@ -0,0 +1,25 @@
/* Function exp10f vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
ENTRY (_ZGVcN8v_exp10f)
WRAPPER_IMPL_AVX _ZGVbN4v_exp10f
END (_ZGVcN8v_exp10f)

View File

@ -0,0 +1 @@
#include "test-double-libmvec-exp10.c"

View File

@ -0,0 +1 @@
#include "test-double-libmvec-exp10.c"

View File

@ -0,0 +1 @@
#include "test-double-libmvec-exp10.c"

View File

@ -0,0 +1,3 @@
#define LIBMVEC_TYPE double
#define LIBMVEC_FUNC exp10
#include "test-vector-abi-arg1.h"

View File

@ -32,6 +32,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVbN2v_atan)
VECTOR_WRAPPER (WRAPPER_NAME (asin), _ZGVbN2v_asin)
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypot), _ZGVbN2vv_hypot)
VECTOR_WRAPPER (WRAPPER_NAME (exp2), _ZGVbN2v_exp2)
VECTOR_WRAPPER (WRAPPER_NAME (exp10), _ZGVbN2v_exp10)
#define VEC_INT_TYPE __m128i

View File

@ -35,6 +35,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVdN4v_atan)
VECTOR_WRAPPER (WRAPPER_NAME (asin), _ZGVdN4v_asin)
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypot), _ZGVdN4vv_hypot)
VECTOR_WRAPPER (WRAPPER_NAME (exp2), _ZGVdN4v_exp2)
VECTOR_WRAPPER (WRAPPER_NAME (exp10), _ZGVdN4v_exp10)
#ifndef __ILP32__
# define VEC_INT_TYPE __m256i

View File

@ -32,6 +32,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVcN4v_atan)
VECTOR_WRAPPER (WRAPPER_NAME (asin), _ZGVcN4v_asin)
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypot), _ZGVcN4vv_hypot)
VECTOR_WRAPPER (WRAPPER_NAME (exp2), _ZGVcN4v_exp2)
VECTOR_WRAPPER (WRAPPER_NAME (exp10), _ZGVcN4v_exp10)
#define VEC_INT_TYPE __m128i

View File

@ -32,6 +32,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVeN8v_atan)
VECTOR_WRAPPER (WRAPPER_NAME (asin), _ZGVeN8v_asin)
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypot), _ZGVeN8vv_hypot)
VECTOR_WRAPPER (WRAPPER_NAME (exp2), _ZGVeN8v_exp2)
VECTOR_WRAPPER (WRAPPER_NAME (exp10), _ZGVeN8v_exp10)
#ifndef __ILP32__
# define VEC_INT_TYPE __m512i

View File

@ -0,0 +1 @@
#include "test-float-libmvec-exp10f.c"

View File

@ -0,0 +1 @@
#include "test-float-libmvec-exp10f.c"

View File

@ -0,0 +1 @@
#include "test-float-libmvec-exp10f.c"

View File

@ -0,0 +1,3 @@
#define LIBMVEC_TYPE float
#define LIBMVEC_FUNC exp10f
#include "test-vector-abi-arg1.h"

View File

@ -32,6 +32,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVeN16v_atanf)
VECTOR_WRAPPER (WRAPPER_NAME (asinf), _ZGVeN16v_asinf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypotf), _ZGVeN16vv_hypotf)
VECTOR_WRAPPER (WRAPPER_NAME (exp2f), _ZGVeN16v_exp2f)
VECTOR_WRAPPER (WRAPPER_NAME (exp10f), _ZGVeN16v_exp10f)
#define VEC_INT_TYPE __m512i

View File

@ -32,6 +32,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVbN4v_atanf)
VECTOR_WRAPPER (WRAPPER_NAME (asinf), _ZGVbN4v_asinf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypotf), _ZGVbN4vv_hypotf)
VECTOR_WRAPPER (WRAPPER_NAME (exp2f), _ZGVbN4v_exp2f)
VECTOR_WRAPPER (WRAPPER_NAME (exp10f), _ZGVbN4v_exp10f)
#define VEC_INT_TYPE __m128i

View File

@ -35,6 +35,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVdN8v_atanf)
VECTOR_WRAPPER (WRAPPER_NAME (asinf), _ZGVdN8v_asinf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypotf), _ZGVdN8vv_hypotf)
VECTOR_WRAPPER (WRAPPER_NAME (exp2f), _ZGVdN8v_exp2f)
VECTOR_WRAPPER (WRAPPER_NAME (exp10f), _ZGVdN8v_exp10f)
/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */
#undef VECTOR_WRAPPER_fFF

View File

@ -32,6 +32,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVcN8v_atanf)
VECTOR_WRAPPER (WRAPPER_NAME (asinf), _ZGVcN8v_asinf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypotf), _ZGVcN8vv_hypotf)
VECTOR_WRAPPER (WRAPPER_NAME (exp2f), _ZGVcN8v_exp2f)
VECTOR_WRAPPER (WRAPPER_NAME (exp10f), _ZGVcN8v_exp10f)
#define VEC_INT_TYPE __m128i