x86-64: Add vector expm1/expm1f implementation to libmvec

Implement vectorized expm1/expm1f with SSE, AVX, AVX2 and AVX512
versions for libmvec, as required by the vector ABI.  The patch also
adds accuracy and ABI tests for vector expm1/expm1f, with regenerated
ulps.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Author: Sunil K Pandey
Date:   2021-12-29 08:59:16 -08:00
Parent: ef7ea9c132
Commit: 76ddc74e86
50 changed files with 2725 additions and 1 deletion
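
Editorial illustration (not part of the patch): once these declarations are
in place, a GCC that relaxes math error handling can auto-vectorize ordinary
calls to expm1 into the new _ZGV*_expm1 entry points and resolve them from
libmvec.  A minimal sketch, assuming glibc >= 2.35 and a command line along
the lines of "gcc -O3 -ffast-math map_expm1.c -lm -lmvec":

#include <math.h>

void
map_expm1 (double *restrict out, const double *restrict in, int n)
{
  /* With auto-vectorization enabled this loop may be compiled into calls
     such as _ZGVdN4v_expm1 (AVX2) instead of scalar expm1.  */
  for (int i = 0; i < n; i++)
    out[i] = expm1 (in[i]);
}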


@@ -175,4 +175,15 @@
#define __DECL_SIMD_coshf32x
#define __DECL_SIMD_coshf64x
#define __DECL_SIMD_coshf128x
#define __DECL_SIMD_expm1
#define __DECL_SIMD_expm1f
#define __DECL_SIMD_expm1l
#define __DECL_SIMD_expm1f16
#define __DECL_SIMD_expm1f32
#define __DECL_SIMD_expm1f64
#define __DECL_SIMD_expm1f128
#define __DECL_SIMD_expm1f32x
#define __DECL_SIMD_expm1f64x
#define __DECL_SIMD_expm1f128x
#endif


@@ -116,7 +116,7 @@ __MATHCALL_VEC (exp10,, (_Mdouble_ __x));
#if defined __USE_XOPEN_EXTENDED || defined __USE_ISOC99
/* Return exp(X) - 1. */
__MATHCALL (expm1,, (_Mdouble_ __x));
__MATHCALL_VEC (expm1,, (_Mdouble_ __x));
/* Return log(1 + X). */
__MATHCALL (log1p,, (_Mdouble_ __x));


@@ -52,6 +52,7 @@ GLIBC_2.35 _ZGVbN2v_atan F
GLIBC_2.35 _ZGVbN2v_cosh F
GLIBC_2.35 _ZGVbN2v_exp10 F
GLIBC_2.35 _ZGVbN2v_exp2 F
GLIBC_2.35 _ZGVbN2v_expm1 F
GLIBC_2.35 _ZGVbN2vv_hypot F
GLIBC_2.35 _ZGVbN4v_acosf F
GLIBC_2.35 _ZGVbN4v_asinf F
@@ -59,6 +60,7 @@ GLIBC_2.35 _ZGVbN4v_atanf F
GLIBC_2.35 _ZGVbN4v_coshf F
GLIBC_2.35 _ZGVbN4v_exp10f F
GLIBC_2.35 _ZGVbN4v_exp2f F
GLIBC_2.35 _ZGVbN4v_expm1f F
GLIBC_2.35 _ZGVbN4vv_hypotf F
GLIBC_2.35 _ZGVcN4v_acos F
GLIBC_2.35 _ZGVcN4v_asin F
@@ -66,6 +68,7 @@ GLIBC_2.35 _ZGVcN4v_atan F
GLIBC_2.35 _ZGVcN4v_cosh F
GLIBC_2.35 _ZGVcN4v_exp10 F
GLIBC_2.35 _ZGVcN4v_exp2 F
GLIBC_2.35 _ZGVcN4v_expm1 F
GLIBC_2.35 _ZGVcN4vv_hypot F
GLIBC_2.35 _ZGVcN8v_acosf F
GLIBC_2.35 _ZGVcN8v_asinf F
@@ -73,6 +76,7 @@ GLIBC_2.35 _ZGVcN8v_atanf F
GLIBC_2.35 _ZGVcN8v_coshf F
GLIBC_2.35 _ZGVcN8v_exp10f F
GLIBC_2.35 _ZGVcN8v_exp2f F
GLIBC_2.35 _ZGVcN8v_expm1f F
GLIBC_2.35 _ZGVcN8vv_hypotf F
GLIBC_2.35 _ZGVdN4v_acos F
GLIBC_2.35 _ZGVdN4v_asin F
@@ -80,6 +84,7 @@ GLIBC_2.35 _ZGVdN4v_atan F
GLIBC_2.35 _ZGVdN4v_cosh F
GLIBC_2.35 _ZGVdN4v_exp10 F
GLIBC_2.35 _ZGVdN4v_exp2 F
GLIBC_2.35 _ZGVdN4v_expm1 F
GLIBC_2.35 _ZGVdN4vv_hypot F
GLIBC_2.35 _ZGVdN8v_acosf F
GLIBC_2.35 _ZGVdN8v_asinf F
@@ -87,6 +92,7 @@ GLIBC_2.35 _ZGVdN8v_atanf F
GLIBC_2.35 _ZGVdN8v_coshf F
GLIBC_2.35 _ZGVdN8v_exp10f F
GLIBC_2.35 _ZGVdN8v_exp2f F
GLIBC_2.35 _ZGVdN8v_expm1f F
GLIBC_2.35 _ZGVdN8vv_hypotf F
GLIBC_2.35 _ZGVeN16v_acosf F
GLIBC_2.35 _ZGVeN16v_asinf F
@@ -94,6 +100,7 @@ GLIBC_2.35 _ZGVeN16v_atanf F
GLIBC_2.35 _ZGVeN16v_coshf F
GLIBC_2.35 _ZGVeN16v_exp10f F
GLIBC_2.35 _ZGVeN16v_exp2f F
GLIBC_2.35 _ZGVeN16v_expm1f F
GLIBC_2.35 _ZGVeN16vv_hypotf F
GLIBC_2.35 _ZGVeN8v_acos F
GLIBC_2.35 _ZGVeN8v_asin F
@@ -101,4 +108,5 @@ GLIBC_2.35 _ZGVeN8v_atan F
GLIBC_2.35 _ZGVeN8v_cosh F
GLIBC_2.35 _ZGVeN8v_exp10 F
GLIBC_2.35 _ZGVeN8v_exp2 F
GLIBC_2.35 _ZGVeN8v_expm1 F
GLIBC_2.35 _ZGVeN8vv_hypot F
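
Editorial note (not part of the patch): the names above follow the x86_64
vector function ABI mangling _ZGV<isa><mask><len><args>_<name>; the ISA
letter is b (SSE), c (AVX), d (AVX2) or e (AVX-512), the mask letter N means
an unmasked ("notinbranch") variant, <len> is the lane count, and each v
marks a vector argument.  The compiler normally emits these calls itself,
but a variant can also be declared and called directly; a minimal sketch,
assuming a link against libmvec:

#include <immintrin.h>

/* AVX2 variant added by this patch: four doubles per call.  */
extern __m256d _ZGVdN4v_expm1 (__m256d);

__m256d
expm1_x4 (__m256d x)
{
  return _ZGVdN4v_expm1 (x);
}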


@@ -86,6 +86,10 @@
# define __DECL_SIMD_cosh __DECL_SIMD_x86_64
# undef __DECL_SIMD_coshf
# define __DECL_SIMD_coshf __DECL_SIMD_x86_64
# undef __DECL_SIMD_expm1
# define __DECL_SIMD_expm1 __DECL_SIMD_x86_64
# undef __DECL_SIMD_expm1f
# define __DECL_SIMD_expm1f __DECL_SIMD_x86_64
# endif
#endif


@@ -42,6 +42,8 @@
!GCC$ builtin (exp10f) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (cosh) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (coshf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (expm1) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (expm1f) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
!GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
@@ -69,3 +71,5 @@
!GCC$ builtin (exp10f) attributes simd (notinbranch) if('x32')
!GCC$ builtin (cosh) attributes simd (notinbranch) if('x32')
!GCC$ builtin (coshf) attributes simd (notinbranch) if('x32')
!GCC$ builtin (expm1) attributes simd (notinbranch) if('x32')
!GCC$ builtin (expm1f) attributes simd (notinbranch) if('x32')


@@ -30,6 +30,7 @@ libmvec-funcs = \
exp \
exp10 \
exp2 \
expm1 \
hypot \
log \
pow \


@@ -20,6 +20,7 @@ libmvec {
_ZGVbN2v_cosh; _ZGVcN4v_cosh; _ZGVdN4v_cosh; _ZGVeN8v_cosh;
_ZGVbN2v_exp10; _ZGVcN4v_exp10; _ZGVdN4v_exp10; _ZGVeN8v_exp10;
_ZGVbN2v_exp2; _ZGVcN4v_exp2; _ZGVdN4v_exp2; _ZGVeN8v_exp2;
_ZGVbN2v_expm1; _ZGVcN4v_expm1; _ZGVdN4v_expm1; _ZGVeN8v_expm1;
_ZGVbN2vv_hypot; _ZGVcN4vv_hypot; _ZGVdN4vv_hypot; _ZGVeN8vv_hypot;
_ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
_ZGVbN4v_asinf; _ZGVcN8v_asinf; _ZGVdN8v_asinf; _ZGVeN16v_asinf;
@@ -27,6 +28,7 @@ libmvec {
_ZGVbN4v_coshf; _ZGVcN8v_coshf; _ZGVdN8v_coshf; _ZGVeN16v_coshf;
_ZGVbN4v_exp10f; _ZGVcN8v_exp10f; _ZGVdN8v_exp10f; _ZGVeN16v_exp10f;
_ZGVbN4v_exp2f; _ZGVcN8v_exp2f; _ZGVdN8v_exp2f; _ZGVeN16v_exp2f;
_ZGVbN4v_expm1f; _ZGVcN8v_expm1f; _ZGVdN8v_expm1f; _ZGVeN16v_expm1f;
_ZGVbN4vv_hypotf; _ZGVcN8vv_hypotf; _ZGVdN8vv_hypotf; _ZGVeN16vv_hypotf;
}
}


@@ -1395,6 +1395,26 @@ float: 1
float128: 3
ldouble: 4
Function: "expm1_vlen16":
float: 1
Function: "expm1_vlen2":
double: 1
Function: "expm1_vlen4":
double: 1
float: 1
Function: "expm1_vlen4_avx2":
double: 1
Function: "expm1_vlen8":
double: 1
float: 1
Function: "expm1_vlen8_avx2":
float: 1
Function: "gamma":
double: 4
float: 7


@@ -0,0 +1,20 @@
/* SSE2 version of vectorized expm1, vector length is 2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define _ZGVbN2v_expm1 _ZGVbN2v_expm1_sse2
#include "../svml_d_expm12_core.S"


@@ -0,0 +1,27 @@
/* Multiple versions of vectorized expm1, vector length is 2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define SYMBOL_NAME _ZGVbN2v_expm1
#include "ifunc-mathvec-sse4_1.h"
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
__hidden_ver1 (_ZGVbN2v_expm1, __GI__ZGVbN2v_expm1, __redirect__ZGVbN2v_expm1)
__attribute__ ((visibility ("hidden")));
#endif
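
Editorial note (not part of the patch): libc_ifunc_redirected above turns
_ZGVbN2v_expm1 into an IFUNC symbol whose resolver, supplied by
ifunc-mathvec-sse4_1.h, binds the _sse4 body when SSE4.1 is usable and the
_sse2 fallback otherwise.  A stand-alone sketch of the same dispatch idea,
with hypothetical names and stand-in bodies, using GCC's ifunc attribute:

#include <stdio.h>

static double expm1_impl_sse2 (double x) { return x; }  /* stand-in body */
static double expm1_impl_sse4 (double x) { return x; }  /* stand-in body */

typedef double (*expm1_fn) (double);

/* The resolver runs once, when the dynamic linker binds the symbol.  */
static expm1_fn
expm1_ifunc_resolver (void)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("sse4.1")
         ? expm1_impl_sse4 : expm1_impl_sse2;
}

double expm1_demo (double) __attribute__ ((ifunc ("expm1_ifunc_resolver")));

int
main (void)
{
  printf ("%g\n", expm1_demo (0.5));
  return 0;
}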


@@ -0,0 +1,421 @@
/* Function expm1 vectorized with SSE4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>.  */
/*
* ALGORITHM DESCRIPTION:
*
* N = (int)(x*2^k/log(2.0)), R = x - N*log(2)/2^k
* exp(x) = 2^(N/2^k) * poly(R) is computed in high-low parts
* expm1(x) = exp(x)-1 is then obtained via multi-precision computation
*
*
*/
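
Editorial sketch (not part of the patch): a simplified scalar C model of the
reduction described above, with k = 7 to match the 128-entry Expm1_HA_table
and the Log2e/L2H/L2L constants below.  The real code additionally splits the
table entries and log(2)/2^7 into high and low parts, and routes out-of-range
lanes to the scalar expm1 through the special-values path.

#include <math.h>

double
expm1_model (double x)
{
  double n = nearbyint (x * (128.0 / M_LN2));   /* N = round (x * 2^7 / ln 2) */
  double r = x - n * (M_LN2 / 128.0);           /* reduced argument R         */
  int j = (int) n & 127;                        /* table index: N mod 2^7     */
  int m = ((int) n - j) >> 7;                   /* exponent part: N div 2^7   */
  double t = ldexp (exp2 (j / 128.0), m);       /* ~ 2^m * Expm1_HA_table[j]  */
  double p = r + r * r / 2 + r * r * r / 6;     /* poly (R) ~ exp (R) - 1     */
  return (t - 1.0) + t * p;                     /* expm1 = (T - 1) + T * poly */
}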
/* Offsets for data table __svml_dexpm1_data_internal
*/
#define Expm1_HA_table 0
#define poly_coeff 2048
#define Log2e 2112
#define L2H 2128
#define L2L 2144
#define ExpAddConst 2160
#define IndexMask 2176
#define ExpMask 2192
#define MOne 2208
#define AbsMask 2224
#define Threshold 2240
#define L2 2256
#include <sysdep.h>
.text
.section .text.sse4,"ax",@progbits
ENTRY(_ZGVbN2v_expm1_sse4)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-32, %rsp
subq $64, %rsp
movaps %xmm0, %xmm2
movups Log2e+__svml_dexpm1_data_internal(%rip), %xmm7
lea __svml_dexpm1_data_internal(%rip), %rsi
mulpd %xmm0, %xmm7
movups .FLT_10(%rip), %xmm3
addpd %xmm3, %xmm7
subpd %xmm3, %xmm7
/* argument reduction */
movups L2H+__svml_dexpm1_data_internal(%rip), %xmm4
mulpd %xmm7, %xmm4
movups L2L+__svml_dexpm1_data_internal(%rip), %xmm5
mulpd %xmm7, %xmm5
subpd %xmm4, %xmm2
subpd %xmm5, %xmm2
/* polynomial */
movups poly_coeff+__svml_dexpm1_data_internal(%rip), %xmm12
movaps %xmm2, %xmm14
mulpd %xmm2, %xmm12
mulpd %xmm2, %xmm14
addpd poly_coeff+16+__svml_dexpm1_data_internal(%rip), %xmm12
movups ExpAddConst+__svml_dexpm1_data_internal(%rip), %xmm15
addpd %xmm7, %xmm15
mulpd %xmm14, %xmm12
movups poly_coeff+32+__svml_dexpm1_data_internal(%rip), %xmm13
mulpd %xmm2, %xmm13
/* table lookup */
movdqu IndexMask+__svml_dexpm1_data_internal(%rip), %xmm8
pand %xmm15, %xmm8
movups AbsMask+__svml_dexpm1_data_internal(%rip), %xmm1
pshufd $2, %xmm8, %xmm9
movaps %xmm1, %xmm6
movd %xmm8, %eax
andps %xmm0, %xmm6
movd %xmm9, %ecx
andnps %xmm0, %xmm1
movdqu ExpMask+__svml_dexpm1_data_internal(%rip), %xmm11
pand %xmm11, %xmm15
cmpnlepd Threshold+__svml_dexpm1_data_internal(%rip), %xmm6
addpd poly_coeff+48+__svml_dexpm1_data_internal(%rip), %xmm13
movmskpd %xmm6, %edx
psllq $41, %xmm15
/* T-1 */
movups MOne+__svml_dexpm1_data_internal(%rip), %xmm4
movslq %eax, %rax
movslq %ecx, %rcx
addpd %xmm12, %xmm13
movups (%rsi,%rax), %xmm3
movups (%rsi,%rcx), %xmm10
movaps %xmm3, %xmm6
unpckhpd %xmm10, %xmm3
/* Th1 = (Th-1) + Tl */
mulpd %xmm15, %xmm3
mulpd %xmm13, %xmm14
unpcklpd %xmm10, %xmm6
orps %xmm15, %xmm6
addpd %xmm4, %xmm6
addpd %xmm14, %xmm2
addpd %xmm3, %xmm6
/* T = Th+Tl */
movaps %xmm6, %xmm5
subpd %xmm4, %xmm5
mulpd %xmm5, %xmm2
addpd %xmm2, %xmm6
orps %xmm1, %xmm6
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx xmm0 xmm6
/* Restore registers
* and exit the function
*/
L(EXIT):
movaps %xmm6, %xmm0
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
movups %xmm0, 32(%rsp)
movups %xmm6, 48(%rsp)
# LOE rbx r12 r13 r14 r15 edx
xorl %eax, %eax
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -48; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd0, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -56; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -64; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $2, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
movups 48(%rsp), %xmm6
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -48; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd0, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -56; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -64; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 xmm6
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movsd 32(%rsp,%r14,8), %xmm0
call expm1@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movsd %xmm0, 48(%rsp,%r14,8)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVbN2v_expm1_sse4)
.section .rodata, "a"
.align 16
#ifdef __svml_dexpm1_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(16)) VUINT32 Expm1_HA_table[(1<<8)][2];
__declspec(align(16)) VUINT32 poly_coeff[4][2][2];
__declspec(align(16)) VUINT32 Log2e[2][2];
__declspec(align(16)) VUINT32 L2H[2][2];
__declspec(align(16)) VUINT32 L2L[2][2];
__declspec(align(16)) VUINT32 ExpAddConst[2][2];
__declspec(align(16)) VUINT32 IndexMask[2][2];
__declspec(align(16)) VUINT32 ExpMask[2][2];
__declspec(align(16)) VUINT32 MOne[2][2];
__declspec(align(16)) VUINT32 AbsMask[2][2];
__declspec(align(16)) VUINT32 Threshold[2][2];
__declspec(align(16)) VUINT32 L2[2][2];
} __svml_dexpm1_data_internal;
#endif
__svml_dexpm1_data_internal:
/* Expm1_HA_table */
.quad 0x0000000000000000, 0x0000000000000000
.quad 0x0000163da8000000, 0x3e3fb33356d84a67
.quad 0x00002c9a40000000, 0xbe3887f9f1190835
.quad 0x00004315e8000000, 0x3e1b9fe12f5ce3e7
.quad 0x000059b0d0000000, 0x3e48ac2ba1d73e2a
.quad 0x0000706b28000000, 0x3e3ddf6ddc6dc404
.quad 0x0000874518000000, 0x3e1d66f20230d7c9
.quad 0x00009e3ec8000000, 0x3e46379c1a290f03
.quad 0x0000b55870000000, 0xbe4833b784eb3a37
.quad 0x0000cc9228000000, 0x3e4b923fba03db83
.quad 0x0000e3ec30000000, 0x3e469e8d10103a17
.quad 0x0000fb66b0000000, 0xbdb2ce50dcdf6e22
.quad 0x00011301d0000000, 0x3df25b50a4ebbf1b
.quad 0x00012abdc0000000, 0x3e1b0c72fee4aeb5
.quad 0x0001429ab0000000, 0xbe356d2204cbefe7
.quad 0x00015a98c8000000, 0x3e24b1ca24901aae
.quad 0x000172b840000000, 0xbe4c15742919041c
.quad 0x00018af938000000, 0x3e2191bd3777ee17
.quad 0x0001a35be8000000, 0x3e4b7e5ba9e5b4c8
.quad 0x0001bbe088000000, 0xbe4fdd19632a70c7
.quad 0x0001d48730000000, 0x3e368b9aa7805b80
.quad 0x0001ed5020000000, 0x3e47e6c8e5c40d00
.quad 0x0002063b88000000, 0x3e18a3358ee3bac1
.quad 0x00021f4990000000, 0x3e37ddc962552fd3
.quad 0x0002387a70000000, 0xbe38a9dc7993e052
.quad 0x000251ce50000000, 0xbe135670329f5521
.quad 0x00026b4568000000, 0xbe40ec1916d42cc6
.quad 0x000284dfe0000000, 0x3e3f5638096cf15d
.quad 0x00029e9df8000000, 0xbe470108f69ed175
.quad 0x0002b87fd0000000, 0x3e2b5b31ffbbd48d
.quad 0x0002d285a8000000, 0xbe31bfcf4bff6e2b
.quad 0x0002ecafa8000000, 0x3e33e2f5611ca0f4
.quad 0x000306fe08000000, 0x3e418db8a96f46ad
.quad 0x0003217100000000, 0xbe4d993e76563187
.quad 0x00033c08b0000000, 0x3e4320b7fa64e431
.quad 0x000356c560000000, 0xbe1b5803cdae772e
.quad 0x000371a738000000, 0xbe28aac6ab1d7560
.quad 0x00038cae70000000, 0xbe47d13cd3d2b1a8
.quad 0x0003a7db38000000, 0xbe48d30048af21b7
.quad 0x0003c32dc0000000, 0x3e489d47242000f9
.quad 0x0003dea650000000, 0xbe4f6e5eee525f6f
.quad 0x0003fa4508000000, 0xbe4a9bff22fa047f
.quad 0x0004160a20000000, 0x3e3f72e29f84325c
.quad 0x000431f5d8000000, 0x3e350a896dc70444
.quad 0x00044e0860000000, 0x3e18624b40c4dbd0
.quad 0x00046a41f0000000, 0xbe4717fd446d7686
.quad 0x000486a2b8000000, 0xbe41f6197f61f2e2
.quad 0x0004a32af0000000, 0x3e2afa7bcce5b17a
.quad 0x0004bfdad8000000, 0xbe464eaec715e343
.quad 0x0004dcb298000000, 0x3e3fddd0d63b36ef
.quad 0x0004f9b278000000, 0xbe362d35952cc275
.quad 0x000516daa0000000, 0x3e467b320e0897a9
.quad 0x0005342b58000000, 0xbe362b07e20f57c4
.quad 0x000551a4c8000000, 0x3e42ec9076297631
.quad 0x00056f4738000000, 0xbe34ad8259913500
.quad 0x00058d12d8000000, 0xbe4b41c016d6a1ea
.quad 0x0005ab07e0000000, 0xbe45bd5eb539b67f
.quad 0x0005c92688000000, 0x3e42ca35b80e258e
.quad 0x0005e76f18000000, 0xbe4296f5bc8b20da
.quad 0x000605e1b8000000, 0x3e376dc08b076f59
.quad 0x0006247eb0000000, 0x3e0d2ac258f87d03
.quad 0x0006434638000000, 0xbe4999e701c483c7
.quad 0x0006623880000000, 0x3e42a91124893ecf
.quad 0x00068155d8000000, 0xbe4d9ab467bf1d47
.quad 0x0006a09e68000000, 0xbe380c4336f74d05
.quad 0x0006c01278000000, 0xbe47a12a08944ab3
.quad 0x0006dfb240000000, 0xbe4cd72e886ef8ea
.quad 0x0006ff7df8000000, 0x3e3519483cf87e1b
.quad 0x00071f75e8000000, 0x3e2d8bee7ba46e1e
.quad 0x00073f9a48000000, 0x3e24b02e77ab934a
.quad 0x00075feb58000000, 0xbe3bd98374091656
.quad 0x0007806950000000, 0xbe00d1604f328fec
.quad 0x0007a11470000000, 0x3e4f580c36bea881
.quad 0x0007c1ed00000000, 0x3e330c1327c49334
.quad 0x0007e2f338000000, 0xbe330b19defa2fd4
.quad 0x0008042758000000, 0xbe4e0f2f724f90cc
.quad 0x0008258998000000, 0x3e34cce128acf88b
.quad 0x0008471a48000000, 0xbe3dc385331ad094
.quad 0x000868d998000000, 0x3e4a2497640720ed
.quad 0x00088ac7d8000000, 0x3e38a669966530bd
.quad 0x0008ace540000000, 0x3e415506dadd3e2b
.quad 0x0008cf3218000000, 0xbe34abb7410d55e3
.quad 0x0008f1ae98000000, 0x3e31577362b98274
.quad 0x0009145b08000000, 0x3e4c8ffe2c4530da
.quad 0x00093737b0000000, 0x3e29b8bc9e8a0388
.quad 0x00095a44c8000000, 0x3e4e4290774da41b
.quad 0x00097d82a0000000, 0xbe00d8d83a30b6f8
.quad 0x0009a0f170000000, 0x3e2940f737462137
.quad 0x0009c49180000000, 0x3e451f8480e3e236
.quad 0x0009e86318000000, 0x3e3e323231824ca8
.quad 0x000a0c6678000000, 0x3e4aef2b2594d6d4
.quad 0x000a309bf0000000, 0xbe4dae966539f470
.quad 0x000a5503b0000000, 0x3e41f12ae45a1225
.quad 0x000a799e10000000, 0x3e49859ac3796fd9
.quad 0x000a9e6b58000000, 0xbe44301205e0a6de
.quad 0x000ac36bc0000000, 0xbe0606431f9234cb
.quad 0x000ae89f98000000, 0x3e35ad3ad5e8734d
.quad 0x000b0e0728000000, 0x3e38db66590842ad
.quad 0x000b33a2b8000000, 0x3e13c57ebdaff43a
.quad 0x000b597290000000, 0xbe40d536338e3bf7
.quad 0x000b7f76f0000000, 0x3e47daf237553d84
.quad 0x000ba5b030000000, 0x3e2420c930819679
.quad 0x000bcc1e90000000, 0x3e12f074891ee83d
.quad 0x000bf2c258000000, 0x3e4eb8f0442046b8
.quad 0x000c199be0000000, 0xbe43d56b1eeef9a7
.quad 0x000c40ab60000000, 0xbd87c2c975903ef8
.quad 0x000c67f130000000, 0xbe3a82eb4b5dec80
.quad 0x000c8f6d98000000, 0xbe4fc8c257729a1e
.quad 0x000cb720e0000000, 0xbe48837cb757e1a1
.quad 0x000cdf0b58000000, 0xbe4511e031dd83b5
.quad 0x000d072d48000000, 0x3e403c4bdc687918
.quad 0x000d2f8708000000, 0x3deb13e315bc2473
.quad 0x000d5818e0000000, 0xbe4822dbc6d12fd3
.quad 0x000d80e318000000, 0xbe3367c68447b063
.quad 0x000da9e600000000, 0x3e4ed9942b84600d
.quad 0x000dd321f0000000, 0x3e480da3025b4aef
.quad 0x000dfc9730000000, 0x3e4bdcdaf5cb4656
.quad 0x000e264618000000, 0xbe4852f6baf6c4f0
.quad 0x000e502ee8000000, 0xbe1d30027630bb40
.quad 0x000e7a51f8000000, 0x3e4e3a641a5aa459
.quad 0x000ea4afa0000000, 0x3e452486cc2c7b9d
.quad 0x000ecf4830000000, 0xbe438cc07b927e77
.quad 0x000efa1bf0000000, 0xbe39ea5d888e02de
.quad 0x000f252b38000000, 0xbe2288ad162f2d20
.quad 0x000f507658000000, 0x3e4b722a033a7c26
.quad 0x000f7bfdb0000000, 0xbe431a0f63b7625a
.quad 0x000fa7c180000000, 0x3e39e90d82e90a7e
.quad 0x000fd3c228000000, 0x3e4c7b8f884badd2
/*== poly_coeff[4] ==*/
.align 16
.quad 0x3f81111168877F38, 0x3f81111168877F38 /* coeff5 */
.quad 0x3fa55555C2A9C0F3, 0x3fa55555C2A9C0F3 /* coeff4 */
.quad 0x3fc555555555541D, 0x3fc555555555541D /* coeff3 */
.quad 0x3fdFFFFFFFFFFE5C, 0x3fdFFFFFFFFFFE5C /* coeff2 */
/*== Log2e ==*/
.align 16
.quad 0x40671547652B82FE, 0x40671547652B82FE
/*== L2H ==*/
.align 16
.quad 0x3f762e42fef80000, 0x3f762e42fef80000
/*== L2L ==*/
.align 16
.quad 0x3d41cf79abc9e3b4, 0x3d41cf79abc9e3b4
/*== ExpAddConst ==*/
.align 16
.quad 0x42f80000001ff800, 0x42f80000001ff800
/*== IndexMask ==*/
.align 16
.quad 0x00000000000007f0, 0x00000000000007f0
/*== ExpMask ==*/
.align 16
.quad 0x00000000003ff800, 0x00000000003ff800
/*== MOne ==*/
.align 16
.quad 0xbff0000000000000, 0xbff0000000000000
/*== AbsMask ==*/
.align 16
.quad 0x7fffffffffffffff, 0x7fffffffffffffff
/*== Threshold ==*/
.align 16
.quad 0x40861DA04CBAFE43, 0x40861DA04CBAFE43
/*== L2 ==*/
.align 16
.quad 0x3f762e42fefa39ef, 0x3f762e42fefa39ef
.align 16
.type __svml_dexpm1_data_internal,@object
.size __svml_dexpm1_data_internal,.-__svml_dexpm1_data_internal
.align 16
.FLT_10:
.long 0x00000000,0x43380000,0x00000000,0x43380000
.type .FLT_10,@object
.size .FLT_10,16


@@ -0,0 +1,20 @@
/* SSE version of vectorized expm1, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define _ZGVdN4v_expm1 _ZGVdN4v_expm1_sse_wrapper
#include "../svml_d_expm14_core.S"


@@ -0,0 +1,27 @@
/* Multiple versions of vectorized expm1, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define SYMBOL_NAME _ZGVdN4v_expm1
#include "ifunc-mathvec-avx2.h"
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
__hidden_ver1 (_ZGVdN4v_expm1, __GI__ZGVdN4v_expm1, __redirect__ZGVdN4v_expm1)
__attribute__ ((visibility ("hidden")));
#endif


@@ -0,0 +1,408 @@
/* Function expm1 vectorized with AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>.  */
/*
* ALGORITHM DESCRIPTION:
*
* N = (int)(x*2^k/log(2.0)), R = x - N*log(2)/2^k
* exp(x) = 2^(N/2^k) * poly(R) is computed in high-low parts
* expm1(x) = exp(x)-1 is then obtained via multi-precision computation
*
*
*/
/* Offsets for data table __svml_dexpm1_data_internal
*/
#define Expm1_HA_table 0
#define poly_coeff 2048
#define Log2e 2176
#define L2H 2208
#define L2L 2240
#define ExpAddConst 2272
#define IndexMask 2304
#define ExpMask 2336
#define MOne 2368
#define AbsMask 2400
#define Threshold 2432
#define L2 2464
#include <sysdep.h>
.text
.section .text.avx2,"ax",@progbits
ENTRY(_ZGVdN4v_expm1_avx2)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-32, %rsp
subq $96, %rsp
lea __svml_dexpm1_data_internal(%rip), %r8
vmovapd %ymm0, %ymm3
vmulpd Log2e+__svml_dexpm1_data_internal(%rip), %ymm3, %ymm4
/* argument reduction */
vmovupd L2H+__svml_dexpm1_data_internal(%rip), %ymm2
vmovupd AbsMask+__svml_dexpm1_data_internal(%rip), %ymm5
vroundpd $0, %ymm4, %ymm8
vaddpd ExpAddConst+__svml_dexpm1_data_internal(%rip), %ymm8, %ymm0
vfnmadd213pd %ymm3, %ymm8, %ymm2
/* table lookup */
vandps IndexMask+__svml_dexpm1_data_internal(%rip), %ymm0, %ymm9
vandpd %ymm5, %ymm3, %ymm6
vcmpnle_uqpd Threshold+__svml_dexpm1_data_internal(%rip), %ymm6, %ymm7
vfnmadd231pd L2L+__svml_dexpm1_data_internal(%rip), %ymm8, %ymm2
vandnpd %ymm3, %ymm5, %ymm1
vmovmskpd %ymm7, %eax
vmovupd poly_coeff+64+__svml_dexpm1_data_internal(%rip), %ymm7
vmulpd %ymm2, %ymm2, %ymm8
vfmadd213pd poly_coeff+96+__svml_dexpm1_data_internal(%rip), %ymm2, %ymm7
vandps ExpMask+__svml_dexpm1_data_internal(%rip), %ymm0, %ymm0
vextractf128 $1, %ymm9, %xmm10
vmovd %xmm9, %edx
vmovd %xmm10, %esi
vpextrd $2, %xmm9, %ecx
vpextrd $2, %xmm10, %edi
movslq %edx, %rdx
movslq %ecx, %rcx
movslq %esi, %rsi
movslq %edi, %rdi
vmovupd (%r8,%rdx), %xmm13
vmovupd (%r8,%rcx), %xmm14
vmovupd (%r8,%rsi), %xmm4
vmovupd (%r8,%rdi), %xmm5
vunpcklpd %xmm14, %xmm13, %xmm11
vunpcklpd %xmm5, %xmm4, %xmm12
vpsllq $41, %ymm0, %ymm10
vunpckhpd %xmm14, %xmm13, %xmm15
vunpckhpd %xmm5, %xmm4, %xmm13
vinsertf128 $1, %xmm12, %ymm11, %ymm6
/* polynomial */
vmovupd poly_coeff+__svml_dexpm1_data_internal(%rip), %ymm12
/* T-1 */
vmovupd MOne+__svml_dexpm1_data_internal(%rip), %ymm11
vfmadd213pd poly_coeff+32+__svml_dexpm1_data_internal(%rip), %ymm2, %ymm12
vfmadd213pd %ymm7, %ymm8, %ymm12
vorpd %ymm10, %ymm6, %ymm9
vfmadd213pd %ymm2, %ymm8, %ymm12
vaddpd %ymm11, %ymm9, %ymm2
vinsertf128 $1, %xmm13, %ymm15, %ymm14
/* Th1 = (Th-1) + Tl */
vfmadd213pd %ymm2, %ymm10, %ymm14
/* T = Th+Tl */
vsubpd %ymm11, %ymm14, %ymm0
vfmadd213pd %ymm14, %ymm12, %ymm0
vorpd %ymm1, %ymm0, %ymm0
testl %eax, %eax
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 eax ymm0 ymm3
/* Restore registers
* and exit the function
*/
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
vmovupd %ymm3, 32(%rsp)
vmovupd %ymm0, 64(%rsp)
# LOE rbx r12 r13 r14 r15 eax ymm0
xorl %edx, %edx
# LOE rbx r12 r13 r14 r15 eax edx
vzeroupper
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
movl %edx, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
movl %eax, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $4, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovupd 64(%rsp), %ymm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 ymm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movsd 32(%rsp,%r14,8), %xmm0
call expm1@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movsd %xmm0, 64(%rsp,%r14,8)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVdN4v_expm1_avx2)
.section .rodata, "a"
.align 32
#ifdef __svml_dexpm1_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(32)) VUINT32 Expm1_HA_table[(1<<8)][2];
__declspec(align(32)) VUINT32 poly_coeff[4][4][2];
__declspec(align(32)) VUINT32 Log2e[4][2];
__declspec(align(32)) VUINT32 L2H[4][2];
__declspec(align(32)) VUINT32 L2L[4][2];
__declspec(align(32)) VUINT32 ExpAddConst[4][2];
__declspec(align(32)) VUINT32 IndexMask[4][2];
__declspec(align(32)) VUINT32 ExpMask[4][2];
__declspec(align(32)) VUINT32 MOne[4][2];
__declspec(align(32)) VUINT32 AbsMask[4][2];
__declspec(align(32)) VUINT32 Threshold[4][2];
__declspec(align(32)) VUINT32 L2[4][2];
} __svml_dexpm1_data_internal;
#endif
__svml_dexpm1_data_internal:
/* Expm1_HA_table */
.quad 0x0000000000000000, 0x0000000000000000
.quad 0x0000163da8000000, 0x3e3fb33356d84a67
.quad 0x00002c9a40000000, 0xbe3887f9f1190835
.quad 0x00004315e8000000, 0x3e1b9fe12f5ce3e7
.quad 0x000059b0d0000000, 0x3e48ac2ba1d73e2a
.quad 0x0000706b28000000, 0x3e3ddf6ddc6dc404
.quad 0x0000874518000000, 0x3e1d66f20230d7c9
.quad 0x00009e3ec8000000, 0x3e46379c1a290f03
.quad 0x0000b55870000000, 0xbe4833b784eb3a37
.quad 0x0000cc9228000000, 0x3e4b923fba03db83
.quad 0x0000e3ec30000000, 0x3e469e8d10103a17
.quad 0x0000fb66b0000000, 0xbdb2ce50dcdf6e22
.quad 0x00011301d0000000, 0x3df25b50a4ebbf1b
.quad 0x00012abdc0000000, 0x3e1b0c72fee4aeb5
.quad 0x0001429ab0000000, 0xbe356d2204cbefe7
.quad 0x00015a98c8000000, 0x3e24b1ca24901aae
.quad 0x000172b840000000, 0xbe4c15742919041c
.quad 0x00018af938000000, 0x3e2191bd3777ee17
.quad 0x0001a35be8000000, 0x3e4b7e5ba9e5b4c8
.quad 0x0001bbe088000000, 0xbe4fdd19632a70c7
.quad 0x0001d48730000000, 0x3e368b9aa7805b80
.quad 0x0001ed5020000000, 0x3e47e6c8e5c40d00
.quad 0x0002063b88000000, 0x3e18a3358ee3bac1
.quad 0x00021f4990000000, 0x3e37ddc962552fd3
.quad 0x0002387a70000000, 0xbe38a9dc7993e052
.quad 0x000251ce50000000, 0xbe135670329f5521
.quad 0x00026b4568000000, 0xbe40ec1916d42cc6
.quad 0x000284dfe0000000, 0x3e3f5638096cf15d
.quad 0x00029e9df8000000, 0xbe470108f69ed175
.quad 0x0002b87fd0000000, 0x3e2b5b31ffbbd48d
.quad 0x0002d285a8000000, 0xbe31bfcf4bff6e2b
.quad 0x0002ecafa8000000, 0x3e33e2f5611ca0f4
.quad 0x000306fe08000000, 0x3e418db8a96f46ad
.quad 0x0003217100000000, 0xbe4d993e76563187
.quad 0x00033c08b0000000, 0x3e4320b7fa64e431
.quad 0x000356c560000000, 0xbe1b5803cdae772e
.quad 0x000371a738000000, 0xbe28aac6ab1d7560
.quad 0x00038cae70000000, 0xbe47d13cd3d2b1a8
.quad 0x0003a7db38000000, 0xbe48d30048af21b7
.quad 0x0003c32dc0000000, 0x3e489d47242000f9
.quad 0x0003dea650000000, 0xbe4f6e5eee525f6f
.quad 0x0003fa4508000000, 0xbe4a9bff22fa047f
.quad 0x0004160a20000000, 0x3e3f72e29f84325c
.quad 0x000431f5d8000000, 0x3e350a896dc70444
.quad 0x00044e0860000000, 0x3e18624b40c4dbd0
.quad 0x00046a41f0000000, 0xbe4717fd446d7686
.quad 0x000486a2b8000000, 0xbe41f6197f61f2e2
.quad 0x0004a32af0000000, 0x3e2afa7bcce5b17a
.quad 0x0004bfdad8000000, 0xbe464eaec715e343
.quad 0x0004dcb298000000, 0x3e3fddd0d63b36ef
.quad 0x0004f9b278000000, 0xbe362d35952cc275
.quad 0x000516daa0000000, 0x3e467b320e0897a9
.quad 0x0005342b58000000, 0xbe362b07e20f57c4
.quad 0x000551a4c8000000, 0x3e42ec9076297631
.quad 0x00056f4738000000, 0xbe34ad8259913500
.quad 0x00058d12d8000000, 0xbe4b41c016d6a1ea
.quad 0x0005ab07e0000000, 0xbe45bd5eb539b67f
.quad 0x0005c92688000000, 0x3e42ca35b80e258e
.quad 0x0005e76f18000000, 0xbe4296f5bc8b20da
.quad 0x000605e1b8000000, 0x3e376dc08b076f59
.quad 0x0006247eb0000000, 0x3e0d2ac258f87d03
.quad 0x0006434638000000, 0xbe4999e701c483c7
.quad 0x0006623880000000, 0x3e42a91124893ecf
.quad 0x00068155d8000000, 0xbe4d9ab467bf1d47
.quad 0x0006a09e68000000, 0xbe380c4336f74d05
.quad 0x0006c01278000000, 0xbe47a12a08944ab3
.quad 0x0006dfb240000000, 0xbe4cd72e886ef8ea
.quad 0x0006ff7df8000000, 0x3e3519483cf87e1b
.quad 0x00071f75e8000000, 0x3e2d8bee7ba46e1e
.quad 0x00073f9a48000000, 0x3e24b02e77ab934a
.quad 0x00075feb58000000, 0xbe3bd98374091656
.quad 0x0007806950000000, 0xbe00d1604f328fec
.quad 0x0007a11470000000, 0x3e4f580c36bea881
.quad 0x0007c1ed00000000, 0x3e330c1327c49334
.quad 0x0007e2f338000000, 0xbe330b19defa2fd4
.quad 0x0008042758000000, 0xbe4e0f2f724f90cc
.quad 0x0008258998000000, 0x3e34cce128acf88b
.quad 0x0008471a48000000, 0xbe3dc385331ad094
.quad 0x000868d998000000, 0x3e4a2497640720ed
.quad 0x00088ac7d8000000, 0x3e38a669966530bd
.quad 0x0008ace540000000, 0x3e415506dadd3e2b
.quad 0x0008cf3218000000, 0xbe34abb7410d55e3
.quad 0x0008f1ae98000000, 0x3e31577362b98274
.quad 0x0009145b08000000, 0x3e4c8ffe2c4530da
.quad 0x00093737b0000000, 0x3e29b8bc9e8a0388
.quad 0x00095a44c8000000, 0x3e4e4290774da41b
.quad 0x00097d82a0000000, 0xbe00d8d83a30b6f8
.quad 0x0009a0f170000000, 0x3e2940f737462137
.quad 0x0009c49180000000, 0x3e451f8480e3e236
.quad 0x0009e86318000000, 0x3e3e323231824ca8
.quad 0x000a0c6678000000, 0x3e4aef2b2594d6d4
.quad 0x000a309bf0000000, 0xbe4dae966539f470
.quad 0x000a5503b0000000, 0x3e41f12ae45a1225
.quad 0x000a799e10000000, 0x3e49859ac3796fd9
.quad 0x000a9e6b58000000, 0xbe44301205e0a6de
.quad 0x000ac36bc0000000, 0xbe0606431f9234cb
.quad 0x000ae89f98000000, 0x3e35ad3ad5e8734d
.quad 0x000b0e0728000000, 0x3e38db66590842ad
.quad 0x000b33a2b8000000, 0x3e13c57ebdaff43a
.quad 0x000b597290000000, 0xbe40d536338e3bf7
.quad 0x000b7f76f0000000, 0x3e47daf237553d84
.quad 0x000ba5b030000000, 0x3e2420c930819679
.quad 0x000bcc1e90000000, 0x3e12f074891ee83d
.quad 0x000bf2c258000000, 0x3e4eb8f0442046b8
.quad 0x000c199be0000000, 0xbe43d56b1eeef9a7
.quad 0x000c40ab60000000, 0xbd87c2c975903ef8
.quad 0x000c67f130000000, 0xbe3a82eb4b5dec80
.quad 0x000c8f6d98000000, 0xbe4fc8c257729a1e
.quad 0x000cb720e0000000, 0xbe48837cb757e1a1
.quad 0x000cdf0b58000000, 0xbe4511e031dd83b5
.quad 0x000d072d48000000, 0x3e403c4bdc687918
.quad 0x000d2f8708000000, 0x3deb13e315bc2473
.quad 0x000d5818e0000000, 0xbe4822dbc6d12fd3
.quad 0x000d80e318000000, 0xbe3367c68447b063
.quad 0x000da9e600000000, 0x3e4ed9942b84600d
.quad 0x000dd321f0000000, 0x3e480da3025b4aef
.quad 0x000dfc9730000000, 0x3e4bdcdaf5cb4656
.quad 0x000e264618000000, 0xbe4852f6baf6c4f0
.quad 0x000e502ee8000000, 0xbe1d30027630bb40
.quad 0x000e7a51f8000000, 0x3e4e3a641a5aa459
.quad 0x000ea4afa0000000, 0x3e452486cc2c7b9d
.quad 0x000ecf4830000000, 0xbe438cc07b927e77
.quad 0x000efa1bf0000000, 0xbe39ea5d888e02de
.quad 0x000f252b38000000, 0xbe2288ad162f2d20
.quad 0x000f507658000000, 0x3e4b722a033a7c26
.quad 0x000f7bfdb0000000, 0xbe431a0f63b7625a
.quad 0x000fa7c180000000, 0x3e39e90d82e90a7e
.quad 0x000fd3c228000000, 0x3e4c7b8f884badd2
/*== poly_coeff[4] ==*/
.align 32
.quad 0x3f81111168877F38, 0x3f81111168877F38, 0x3f81111168877F38, 0x3f81111168877F38 /* coeff5 */
.quad 0x3fa55555C2A9C0F3, 0x3fa55555C2A9C0F3, 0x3fa55555C2A9C0F3, 0x3fa55555C2A9C0F3 /* coeff4 */
.quad 0x3fc555555555541D, 0x3fc555555555541D, 0x3fc555555555541D, 0x3fc555555555541D /* coeff3 */
.quad 0x3fdFFFFFFFFFFE5C, 0x3fdFFFFFFFFFFE5C, 0x3fdFFFFFFFFFFE5C, 0x3fdFFFFFFFFFFE5C /* coeff2 */
/*== Log2e ==*/
.align 32
.quad 0x40671547652B82FE, 0x40671547652B82FE, 0x40671547652B82FE, 0x40671547652B82FE
/*== L2H ==*/
.align 32
.quad 0x3f762e42fef80000, 0x3f762e42fef80000, 0x3f762e42fef80000, 0x3f762e42fef80000
/*== L2L ==*/
.align 32
.quad 0x3d41cf79abc9e3b4, 0x3d41cf79abc9e3b4, 0x3d41cf79abc9e3b4, 0x3d41cf79abc9e3b4
/*== ExpAddConst ==*/
.align 32
.quad 0x42f80000001ff800, 0x42f80000001ff800, 0x42f80000001ff800, 0x42f80000001ff800
/*== IndexMask ==*/
.align 32
.quad 0x00000000000007f0, 0x00000000000007f0, 0x00000000000007f0, 0x00000000000007f0
/*== ExpMask ==*/
.align 32
.quad 0x00000000003ff800, 0x00000000003ff800, 0x00000000003ff800, 0x00000000003ff800
/*== MOne ==*/
.align 32
.quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
/*== AbsMask ==*/
.align 32
.quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
/*== Threshold ==*/
.align 32
.quad 0x40861DA04CBAFE43, 0x40861DA04CBAFE43, 0x40861DA04CBAFE43, 0x40861DA04CBAFE43
/*== L2 ==*/
.align 32
.quad 0x3f762e42fefa39ef, 0x3f762e42fefa39ef, 0x3f762e42fefa39ef, 0x3f762e42fefa39ef
.align 32
.type __svml_dexpm1_data_internal,@object
.size __svml_dexpm1_data_internal,.-__svml_dexpm1_data_internal


@@ -0,0 +1,20 @@
/* AVX2 version of vectorized expm1, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define _ZGVeN8v_expm1 _ZGVeN8v_expm1_avx2_wrapper
#include "../svml_d_expm18_core.S"


@@ -0,0 +1,27 @@
/* Multiple versions of vectorized expm1, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define SYMBOL_NAME _ZGVeN8v_expm1
#include "ifunc-mathvec-avx512-skx.h"
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
__hidden_ver1 (_ZGVeN8v_expm1, __GI__ZGVeN8v_expm1, __redirect__ZGVeN8v_expm1)
__attribute__ ((visibility ("hidden")));
#endif


@@ -0,0 +1,334 @@
/* Function expm1 vectorized with AVX-512.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>.  */
/*
* ALGORITHM DESCRIPTION:
* After computing exp(x) in high-low parts, an accurate computation is performed to obtain exp(x)-1
* Typical exp() implementation, except that:
* - tables are small (16 elements), allowing for fast gathers
* - all arguments processed in the main path
* - final VSCALEF assists branch-free design (correct overflow/underflow and special case responses)
* - a VAND is used to ensure the reduced argument |R|<2, even for large inputs
* - RZ mode used to avoid overflow to +/-Inf for x*log2(e); helps with special case handling
*
*
*/
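
Editorial sketch (not from the patch): a scalar C model of the branch-free
scheme above.  The "shifter" rounds x*log2(e) to 4 fractional bits, the
fractional bits select 2^(j/16) from the 16-entry table, and a scalef-style
step supplies the 2^floor(Z0) factor, so overflow and underflow fall out of
the scaling rather than out of branches (special inputs still go to the
scalar expm1 in the real code).

#include <math.h>

double
expm1_avx512_model (double x)
{
  const double shifter = 0x1.8p48;              /* 2^(52-4) * 1.5             */
  double s = x * M_LOG2E + shifter;
  double z0 = s - shifter;                      /* x*log2(e), 4 fraction bits */
  double r = x - z0 * M_LN2;                    /* R = x - Z0*log(2)          */
  int j = (int) (z0 * 16.0) & 15;               /* low 4 fractional bits      */
  double th = scalbn (exp2 (j / 16.0), (int) floor (z0));  /* ~ vscalefpd     */
  double p = r + r * r / 2;                     /* truncated poly (R)         */
  return (th - 1.0) + th * p;                   /* (Th-1) + Th*(R + R*poly)   */
}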
/* Offsets for data table __svml_dexpm1_data_internal_avx512
*/
#define Exp_tbl_H 0
#define Exp_tbl_L 128
#define L2E 256
#define Shifter 320
#define Threshold 384
#define SgnMask 448
#define L2H 512
#define L2L 576
#define ZThres 640
#define EMask 704
#define poly_coeff7 768
#define poly_coeff6 832
#define poly_coeff5 896
#define poly_coeff4 960
#define poly_coeff3 1024
#define poly_coeff2 1088
#define One 1152
#include <sysdep.h>
.text
.section .text.evex512,"ax",@progbits
ENTRY(_ZGVeN8v_expm1_skx)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-64, %rsp
subq $192, %rsp
vmovups L2E+__svml_dexpm1_data_internal_avx512(%rip), %zmm6
vmovups Shifter+__svml_dexpm1_data_internal_avx512(%rip), %zmm4
vmovups L2H+__svml_dexpm1_data_internal_avx512(%rip), %zmm11
vmovups L2L+__svml_dexpm1_data_internal_avx512(%rip), %zmm5
vmovups Threshold+__svml_dexpm1_data_internal_avx512(%rip), %zmm3
vmovups poly_coeff5+__svml_dexpm1_data_internal_avx512(%rip), %zmm13
vmovups poly_coeff4+__svml_dexpm1_data_internal_avx512(%rip), %zmm15
/* polynomial */
vmovups poly_coeff7+__svml_dexpm1_data_internal_avx512(%rip), %zmm12
/* set Z0=max(Z0, -128.0) */
vmovups ZThres+__svml_dexpm1_data_internal_avx512(%rip), %zmm8
vmovups poly_coeff3+__svml_dexpm1_data_internal_avx512(%rip), %zmm14
vmovups __svml_dexpm1_data_internal_avx512(%rip), %zmm9
vmovaps %zmm0, %zmm2
/* 2^(52-4)*1.5 + x * log2(e) */
vfmadd213pd {rn-sae}, %zmm4, %zmm2, %zmm6
vmovups Exp_tbl_L+__svml_dexpm1_data_internal_avx512(%rip), %zmm0
vcmppd $21, {sae}, %zmm3, %zmm2, %k0
/* Z0 ~ x*log2(e), rounded to 4 fractional bits */
vsubpd {rn-sae}, %zmm4, %zmm6, %zmm7
vpermt2pd Exp_tbl_H+64+__svml_dexpm1_data_internal_avx512(%rip), %zmm6, %zmm9
vpermt2pd Exp_tbl_L+64+__svml_dexpm1_data_internal_avx512(%rip), %zmm6, %zmm0
vandpd SgnMask+__svml_dexpm1_data_internal_avx512(%rip), %zmm2, %zmm1
/* R = x - Z0*log(2) */
vfnmadd213pd {rn-sae}, %zmm2, %zmm7, %zmm11
vmaxpd {sae}, %zmm8, %zmm7, %zmm10
vfnmadd231pd {rn-sae}, %zmm7, %zmm5, %zmm11
kmovw %k0, %edx
/* ensure |R|<2 even for special cases */
vandpd EMask+__svml_dexpm1_data_internal_avx512(%rip), %zmm11, %zmm3
vmovups poly_coeff6+__svml_dexpm1_data_internal_avx512(%rip), %zmm11
/* scale Th */
vscalefpd {rn-sae}, %zmm10, %zmm9, %zmm4
vfmadd231pd {rn-sae}, %zmm3, %zmm13, %zmm15
vfmadd231pd {rn-sae}, %zmm3, %zmm12, %zmm11
vmovups poly_coeff2+__svml_dexpm1_data_internal_avx512(%rip), %zmm12
vmulpd {rn-sae}, %zmm3, %zmm3, %zmm13
vfmadd231pd {rn-sae}, %zmm3, %zmm14, %zmm12
vfmadd213pd {rn-sae}, %zmm15, %zmm13, %zmm11
vfmadd213pd {rn-sae}, %zmm12, %zmm13, %zmm11
/* Tlr + R+ R*Poly */
vfmadd213pd {rn-sae}, %zmm0, %zmm13, %zmm11
/* Th - 1 */
vmovups One+__svml_dexpm1_data_internal_avx512(%rip), %zmm0
vaddpd {rn-sae}, %zmm3, %zmm11, %zmm14
vsubpd {rn-sae}, %zmm0, %zmm4, %zmm15
/* (Th-1)+Th*(Tlr + R+ R*Poly) */
vfmadd213pd {rn-sae}, %zmm15, %zmm14, %zmm4
vorpd %zmm1, %zmm4, %zmm0
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm2
/* Restore registers
* and exit the function
*/
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
vmovups %zmm2, 64(%rsp)
vmovups %zmm0, 128(%rsp)
# LOE rbx r12 r13 r14 r15 edx zmm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
vzeroupper
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $8, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovups 128(%rsp), %zmm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 zmm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movsd 64(%rsp,%r14,8), %xmm0
call expm1@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movsd %xmm0, 128(%rsp,%r14,8)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVeN8v_expm1_skx)
.section .rodata, "a"
.align 64
#ifdef __svml_dexpm1_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(64)) VUINT32 Exp_tbl_H[16][2];
__declspec(align(64)) VUINT32 Exp_tbl_L[16][2];
__declspec(align(64)) VUINT32 L2E[8][2];
__declspec(align(64)) VUINT32 Shifter[8][2];
__declspec(align(64)) VUINT32 Threshold[8][2];
__declspec(align(64)) VUINT32 SgnMask[8][2];
__declspec(align(64)) VUINT32 L2H[8][2];
__declspec(align(64)) VUINT32 L2L[8][2];
__declspec(align(64)) VUINT32 ZThres[8][2];
__declspec(align(64)) VUINT32 EMask[8][2];
__declspec(align(64)) VUINT32 poly_coeff7[8][2];
__declspec(align(64)) VUINT32 poly_coeff6[8][2];
__declspec(align(64)) VUINT32 poly_coeff5[8][2];
__declspec(align(64)) VUINT32 poly_coeff4[8][2];
__declspec(align(64)) VUINT32 poly_coeff3[8][2];
__declspec(align(64)) VUINT32 poly_coeff2[8][2];
__declspec(align(64)) VUINT32 One[8][2];
} __svml_dexpm1_data_internal_avx512;
#endif
__svml_dexpm1_data_internal_avx512:
/*== Exp_tbl_H ==*/
.quad 0x3ff0000000000000
.quad 0x3ff0b5586cf9890f
.quad 0x3ff172b83c7d517b
.quad 0x3ff2387a6e756238
.quad 0x3ff306fe0a31b715
.quad 0x3ff3dea64c123422
.quad 0x3ff4bfdad5362a27
.quad 0x3ff5ab07dd485429
.quad 0x3ff6a09e667f3bcd
.quad 0x3ff7a11473eb0187
.quad 0x3ff8ace5422aa0db
.quad 0x3ff9c49182a3f090
.quad 0x3ffae89f995ad3ad
.quad 0x3ffc199bdd85529c
.quad 0x3ffd5818dcfba487
.quad 0x3ffea4afa2a490da
/*== Exp_tbl_L ==*/
.align 64
.quad 0x0000000000000000
.quad 0x3c979aa65d837b6d
.quad 0xbc801b15eaa59348
.quad 0x3c968efde3a8a894
.quad 0x3c834d754db0abb6
.quad 0x3c859f48a72a4c6d
.quad 0x3c7690cebb7aafb0
.quad 0x3c9063e1e21c5409
.quad 0xbc93b3efbf5e2228
.quad 0xbc7b32dcb94da51d
.quad 0x3c8db72fc1f0eab4
.quad 0x3c71affc2b91ce27
.quad 0x3c8c1a7792cb3387
.quad 0x3c736eae30af0cb3
.quad 0x3c74a385a63d07a7
.quad 0xbc8ff7128fd391f0
/*== log2(e) ==*/
.align 64
.quad 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE
/*== Shifter=2^(52-4)*1.5 ==*/
.align 64
.quad 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0
/*== Threshold ==*/
.align 64
.quad 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44
/*== Sgn ==*/
.align 64
.quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
/*== L2H = log(2)_high ==*/
.align 64
.quad 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef
/*== L2L = log(2)_low ==*/
.align 64
.quad 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f
/*== ZThres ==*/
.align 64
.quad 0xc060000000000000, 0xc060000000000000, 0xc060000000000000, 0xc060000000000000, 0xc060000000000000, 0xc060000000000000, 0xc060000000000000, 0xc060000000000000
/*== EMask ==*/
.align 64
.quad 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff
/*== poly_coeff7 ==*/
.align 64
.quad 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a
/*== poly_coeff6 ==*/
.align 64
.quad 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f
/*== poly_coeff5 ==*/
.align 64
.quad 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214
/*== poly_coeff4 ==*/
.align 64
.quad 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06
/*== poly_coeff3 ==*/
.align 64
.quad 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656
/*== poly_coeff2 ==*/
.align 64
.quad 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2
/*== One ==*/
.align 64
.quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
.align 64
.type __svml_dexpm1_data_internal_avx512,@object
.size __svml_dexpm1_data_internal_avx512,.-__svml_dexpm1_data_internal_avx512


@@ -0,0 +1,20 @@
/* AVX2 version of vectorized expm1f, vector length is 16.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define _ZGVeN16v_expm1f _ZGVeN16v_expm1f_avx2_wrapper
#include "../svml_s_expm1f16_core.S"


@@ -0,0 +1,28 @@
/* Multiple versions of vectorized expm1f, vector length is 16.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define SYMBOL_NAME _ZGVeN16v_expm1f
#include "ifunc-mathvec-avx512-skx.h"
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
__hidden_ver1 (_ZGVeN16v_expm1f, __GI__ZGVeN16v_expm1f,
__redirect__ZGVeN16v_expm1f)
__attribute__ ((visibility ("hidden")));
#endif


@@ -0,0 +1,281 @@
/* Function expm1f vectorized with AVX-512.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>.  */
/*
* ALGORITHM DESCRIPTION:
* After computing exp(x) in high-low parts, an accurate computation is performed to obtain exp(x)-1
* Typical exp() implementation, except that:
* - tables are small (32 elements), allowing for fast gathers
* - all arguments processed in the main path
* - final VSCALEF assists branch-free design (correct overflow/underflow and special case responses)
* - a VAND is used to ensure the reduced argument |R|<2, even for large inputs
* - RZ mode used to avoid overflow to +/-Inf for x*log2(e); helps with special case handling
*
*
*/
/* Offsets for data table __svml_sexpm1_data_internal_avx512
*/
#define Exp_tbl_H 0
#define Exp_tbl_L 128
#define L2E 256
#define Shifter 320
#define Threshold 384
#define SgnMask 448
#define L2H 512
#define L2L 576
#define EMask 640
#define poly_coeff3 704
#define poly_coeff2 768
#define One 832
#include <sysdep.h>
.text
.section .text.evex512,"ax",@progbits
ENTRY(_ZGVeN16v_expm1f_skx)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-64, %rsp
subq $192, %rsp
vmovups L2E+__svml_sexpm1_data_internal_avx512(%rip), %zmm5
vmovups Shifter+__svml_sexpm1_data_internal_avx512(%rip), %zmm3
vmovups L2H+__svml_sexpm1_data_internal_avx512(%rip), %zmm8
vmovups L2L+__svml_sexpm1_data_internal_avx512(%rip), %zmm4
vmovups __svml_sexpm1_data_internal_avx512(%rip), %zmm6
/* polynomial */
vmovups poly_coeff3+__svml_sexpm1_data_internal_avx512(%rip), %zmm9
vmovups poly_coeff2+__svml_sexpm1_data_internal_avx512(%rip), %zmm12
vmovups Exp_tbl_L+__svml_sexpm1_data_internal_avx512(%rip), %zmm11
vmovups Threshold+__svml_sexpm1_data_internal_avx512(%rip), %zmm2
/* Th - 1 */
vmovups One+__svml_sexpm1_data_internal_avx512(%rip), %zmm14
vmovaps %zmm0, %zmm1
/* 2^(23-5)*1.5 + x * log2(e) */
vfmadd213ps {rn-sae}, %zmm3, %zmm1, %zmm5
vcmpps $29, {sae}, %zmm2, %zmm1, %k0
/* Z0 ~ x*log2(e), rounded to 5 fractional bits */
vsubps {rn-sae}, %zmm3, %zmm5, %zmm7
vpermt2ps Exp_tbl_H+64+__svml_sexpm1_data_internal_avx512(%rip), %zmm5, %zmm6
vpermt2ps Exp_tbl_L+64+__svml_sexpm1_data_internal_avx512(%rip), %zmm5, %zmm11
vandps SgnMask+__svml_sexpm1_data_internal_avx512(%rip), %zmm1, %zmm0
/* R = x - Z0*log(2) */
vfnmadd213ps {rn-sae}, %zmm1, %zmm7, %zmm8
/* scale Th */
vscalefps {rn-sae}, %zmm7, %zmm6, %zmm2
vfnmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
kmovw %k0, %edx
/* ensure |R|<2 even for special cases */
vandps EMask+__svml_sexpm1_data_internal_avx512(%rip), %zmm8, %zmm13
vsubps {rn-sae}, %zmm14, %zmm2, %zmm8
vmulps {rn-sae}, %zmm13, %zmm13, %zmm10
vfmadd231ps {rn-sae}, %zmm13, %zmm9, %zmm12
/* Tlr + R + R2*Poly */
vfmadd213ps {rn-sae}, %zmm11, %zmm10, %zmm12
vaddps {rn-sae}, %zmm13, %zmm12, %zmm15
/* (Th-1) + Th*(Tlr + R + R2*Poly) */
vfmadd213ps {rn-sae}, %zmm8, %zmm15, %zmm2
vorps %zmm0, %zmm2, %zmm0
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
/* Restore registers
* and exit the function
*/
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
vmovups %zmm1, 64(%rsp)
vmovups %zmm0, 128(%rsp)
# LOE rbx r12 r13 r14 r15 edx zmm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
vzeroupper
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $16, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovups 128(%rsp), %zmm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 zmm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movss 64(%rsp,%r14,4), %xmm0
call expm1f@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movss %xmm0, 128(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVeN16v_expm1f_skx)
.section .rodata, "a"
.align 64
#ifdef __svml_sexpm1_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(64)) VUINT32 Exp_tbl_H[32][1];
__declspec(align(64)) VUINT32 Exp_tbl_L[32][1];
__declspec(align(64)) VUINT32 L2E[16][1];
__declspec(align(64)) VUINT32 Shifter[16][1];
__declspec(align(64)) VUINT32 Threshold[16][1];
__declspec(align(64)) VUINT32 SgnMask[16][1];
__declspec(align(64)) VUINT32 L2H[16][1];
__declspec(align(64)) VUINT32 L2L[16][1];
__declspec(align(64)) VUINT32 EMask[16][1];
__declspec(align(64)) VUINT32 poly_coeff3[16][1];
__declspec(align(64)) VUINT32 poly_coeff2[16][1];
__declspec(align(64)) VUINT32 One[16][1];
} __svml_sexpm1_data_internal_avx512;
#endif
__svml_sexpm1_data_internal_avx512:
/*== Exp_tbl_H ==*/
.long 0x3f800000, 0x3f82cd87, 0x3f85aac3, 0x3f88980f
.long 0x3f8b95c2, 0x3f8ea43a, 0x3f91c3d3, 0x3f94f4f0
.long 0x3f9837f0, 0x3f9b8d3a, 0x3f9ef532, 0x3fa27043
.long 0x3fa5fed7, 0x3fa9a15b, 0x3fad583f, 0x3fb123f6
.long 0x3fb504f3, 0x3fb8fbaf, 0x3fbd08a4, 0x3fc12c4d
.long 0x3fc5672a, 0x3fc9b9be, 0x3fce248c, 0x3fd2a81e
.long 0x3fd744fd, 0x3fdbfbb8, 0x3fe0ccdf, 0x3fe5b907
.long 0x3feac0c7, 0x3fefe4ba, 0x3ff5257d, 0x3ffa83b3
/*== Exp_tbl_L ==*/
.align 64
.long 0x00000000, 0xb34a3a0a, 0x3346cb6a, 0xb36ed17e
.long 0xb24e0611, 0xb3517dd9, 0x334b2482, 0xb31586de
.long 0x33092801, 0xb2e6f467, 0x331b85f2, 0x3099b6f1
.long 0xb3051aa8, 0xb2e2a0da, 0xb2006c56, 0xb3365942
.long 0x329302ae, 0x32c595dc, 0xb302e5a2, 0xb28e10a1
.long 0x31b3d0e5, 0xb31a472b, 0x31d1daf2, 0xb305bf64
.long 0xb27ce182, 0xb2f26443, 0xb1b4b0da, 0xb1da8a8f
.long 0xb1d290be, 0xb2d5b899, 0x31b0a147, 0xb2156afc
/*== log2(e) ==*/
.align 64
.long 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B
/*== Shifter=2^(23-5)*1.5 ==*/
.align 64
.long 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000
/*== Threshold ==*/
.align 64
.long 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B
/*== Sgn ==*/
.align 64
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
/*== L2H = log(2)_high ==*/
.align 64
.long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
/*== L2L = log(2)_low ==*/
.align 64
.long 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308
/*== EMask ==*/
.align 64
.long 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff
/*== poly_coeff3 ==*/
.align 64
.long 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3
/*== poly_coeff2 ==*/
.align 64
.long 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6
/*== One ==*/
.align 64
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
.align 64
.type __svml_sexpm1_data_internal_avx512,@object
.size __svml_sexpm1_data_internal_avx512,.-__svml_sexpm1_data_internal_avx512

View File

@ -0,0 +1,20 @@
/* SSE2 version of vectorized expm1f, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define _ZGVbN4v_expm1f _ZGVbN4v_expm1f_sse2
#include "../svml_s_expm1f4_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized expm1f, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define SYMBOL_NAME _ZGVbN4v_expm1f
#include "ifunc-mathvec-sse4_1.h"
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
__hidden_ver1 (_ZGVbN4v_expm1f, __GI__ZGVbN4v_expm1f,
__redirect__ZGVbN4v_expm1f)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,358 @@
/* Function expm1f vectorized with SSE4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/*
* ALGORITHM DESCRIPTION:
*
* N = (int)(x*2^k/log(2.0)), R = x - N*log(2)/2^k
* exp(x) = 2^(N/2^k) * poly(R) is computed in high-low parts
* expm1(x) = exp(x)-1 is then obtained via multi-precision computation
*
*
*/
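A scalar reference for this reduction (with k = 6, matching the 64-entry Expm1_HA_table below) makes the identities concrete. Note the hedges: the 2^(N/2^k) factor is computed directly with exp2f instead of through the table plus the exponent-insertion trick the kernel uses, the polynomial uses textbook coefficients 1/2 and 1/6, and expm1f_ref is an illustrative name; only the split constants mirror the Log2e, L2H and L2L entries in the data section below.

#include <math.h>
#include <stdio.h>

#define K 6                                   /* 2^K = 64 table entries */

static float
expm1f_ref (float x)
{
  const float log2e_scaled = 0x1.715476p+6f;  /* 2^K/log(2), cf. Log2e */
  const float ln2k_hi = 0x1.63p-7f;           /* log(2)/2^K high, cf. L2H */
  const float ln2k_lo = -0x1.bd0106p-19f;     /* log(2)/2^K low,  cf. L2L */

  int n = (int) lrintf (x * log2e_scaled);    /* N = (int)(x*2^K/log(2)) */
  float r = fmaf ((float) -n, ln2k_hi, x);    /* R = x - N*log(2)/2^K,   */
  r = fmaf ((float) -n, ln2k_lo, r);          /* done in two pieces      */

  float t = exp2f ((float) n / (float) (1 << K));   /* T = 2^(N/2^K) */

  /* exp(R) - 1 ~= R + R^2*(1/2 + R/6) for |R| <= log(2)/2^(K+1).  */
  float e = r + r * r * (0.5f + r * (1.0f / 6.0f));

  return (t - 1.0f) + t * e;                  /* T*exp(R) - 1 = expm1(x) */
}

int
main (void)
{
  for (float x = -3.0f; x <= 3.0f; x += 0.75f)
    printf ("x=% .2f  ref=% .7g  libm=% .7g\n", x, expm1f_ref (x), expm1f (x));
  return 0;
}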
/* Offsets for data table __svml_sexpm1_data_internal
*/
#define Expm1_HA_table 0
#define poly_coeff 512
#define Log2e 576
#define L2H 592
#define L2L 608
#define ExpAddConst 624
#define IndexMask 640
#define ExpMask 656
#define MOne 672
#define AbsMask 688
#define Threshold 704
#define L2 720
#include <sysdep.h>
.text
.section .text.sse4,"ax",@progbits
ENTRY(_ZGVbN4v_expm1f_sse4)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-32, %rsp
subq $64, %rsp
movaps %xmm0, %xmm4
movups Log2e+__svml_sexpm1_data_internal(%rip), %xmm9
lea __svml_sexpm1_data_internal(%rip), %r8
mulps %xmm0, %xmm9
movups .FLT_10(%rip), %xmm5
movups ExpAddConst+__svml_sexpm1_data_internal(%rip), %xmm2
addps %xmm5, %xmm9
/* argument reduction */
movups L2H+__svml_sexpm1_data_internal(%rip), %xmm6
subps %xmm5, %xmm9
mulps %xmm9, %xmm6
addps %xmm9, %xmm2
/* table lookup */
movdqu IndexMask+__svml_sexpm1_data_internal(%rip), %xmm12
subps %xmm6, %xmm4
pand %xmm2, %xmm12
movups L2L+__svml_sexpm1_data_internal(%rip), %xmm7
movups AbsMask+__svml_sexpm1_data_internal(%rip), %xmm3
pshufd $1, %xmm12, %xmm10
movaps %xmm3, %xmm8
mulps %xmm9, %xmm7
andps %xmm0, %xmm8
cmpnleps Threshold+__svml_sexpm1_data_internal(%rip), %xmm8
movd %xmm12, %edx
subps %xmm7, %xmm4
movd %xmm10, %ecx
movmskps %xmm8, %eax
pshufd $2, %xmm12, %xmm11
movaps %xmm4, %xmm7
pshufd $3, %xmm12, %xmm13
andnps %xmm0, %xmm3
movd %xmm11, %esi
movd %xmm13, %edi
/* polynomial */
movups poly_coeff+__svml_sexpm1_data_internal(%rip), %xmm8
movdqu ExpMask+__svml_sexpm1_data_internal(%rip), %xmm6
movslq %edx, %rdx
pand %xmm6, %xmm2
movslq %ecx, %rcx
pslld $14, %xmm2
movslq %esi, %rsi
movslq %edi, %rdi
movq (%r8,%rdx), %xmm1
movq (%r8,%rcx), %xmm14
movq (%r8,%rsi), %xmm5
movq (%r8,%rdi), %xmm15
unpcklps %xmm14, %xmm1
mulps %xmm4, %xmm8
movaps %xmm1, %xmm10
mulps %xmm4, %xmm7
addps poly_coeff+16+__svml_sexpm1_data_internal(%rip), %xmm8
unpcklps %xmm15, %xmm5
movlhps %xmm5, %xmm10
shufps $238, %xmm5, %xmm1
orps %xmm2, %xmm10
/* T-1 */
movups MOne+__svml_sexpm1_data_internal(%rip), %xmm9
mulps %xmm2, %xmm1
addps %xmm9, %xmm10
mulps %xmm7, %xmm8
addps %xmm1, %xmm10
addps %xmm8, %xmm4
movaps %xmm10, %xmm1
subps %xmm9, %xmm1
mulps %xmm1, %xmm4
addps %xmm4, %xmm10
orps %xmm3, %xmm10
testl %eax, %eax
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 eax xmm0 xmm10
/* Restore registers
* and exit the function
*/
L(EXIT):
movaps %xmm10, %xmm0
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
movups %xmm0, 32(%rsp)
movups %xmm10, 48(%rsp)
# LOE rbx r12 r13 r14 r15 eax
xorl %edx, %edx
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -48; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd0, 0xff, 0xff, 0xff, 0x22
movl %edx, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -56; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xff, 0xff, 0xff, 0x22
movl %eax, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -64; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $4, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
movups 48(%rsp), %xmm10
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -48; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd0, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -56; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -64; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 xmm10
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movss 32(%rsp,%r14,4), %xmm0
call expm1f@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movss %xmm0, 48(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVbN4v_expm1f_sse4)
.section .rodata, "a"
.align 16
#ifdef __svml_sexpm1_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(16)) VUINT32 Expm1_HA_table[(1<<7)][1];
__declspec(align(16)) VUINT32 poly_coeff[4][4][1];
__declspec(align(16)) VUINT32 Log2e[4][1];
__declspec(align(16)) VUINT32 L2H[4][1];
__declspec(align(16)) VUINT32 L2L[4][1];
__declspec(align(16)) VUINT32 ExpAddConst[4][1];
__declspec(align(16)) VUINT32 IndexMask[4][1];
__declspec(align(16)) VUINT32 ExpMask[4][1];
__declspec(align(16)) VUINT32 MOne[4][1];
__declspec(align(16)) VUINT32 AbsMask[4][1];
__declspec(align(16)) VUINT32 Threshold[4][1];
__declspec(align(16)) VUINT32 L2[4][1];
} __svml_sexpm1_data_internal;
#endif
__svml_sexpm1_data_internal:
/* Expm1_HA_table */
.long 0x00000000, 0x00000000
.long 0x00016000, 0x391a3e78
.long 0x0002d000, 0xb89e59d5
.long 0x00044000, 0xb93ae78a
.long 0x0005b000, 0xb9279306
.long 0x00072000, 0xb79e6961
.long 0x0008a000, 0xb97e2fee
.long 0x000a1000, 0x391aaea9
.long 0x000b9000, 0x39383c7d
.long 0x000d2000, 0xb9241490
.long 0x000ea000, 0x39073169
.long 0x00103000, 0x386e218a
.long 0x0011c000, 0x38f4dceb
.long 0x00136000, 0xb93a9a1e
.long 0x0014f000, 0x391df520
.long 0x00169000, 0x3905a6e4
.long 0x00183000, 0x397e0a32
.long 0x0019e000, 0x370b2641
.long 0x001b9000, 0xb8b1918b
.long 0x001d4000, 0xb8132c6a
.long 0x001ef000, 0x39264c12
.long 0x0020b000, 0x37221f73
.long 0x00227000, 0x37060619
.long 0x00243000, 0x3922b5c1
.long 0x00260000, 0xb814ab27
.long 0x0027d000, 0xb89b12c6
.long 0x0029a000, 0x382d5a75
.long 0x002b8000, 0xb938c94b
.long 0x002d6000, 0xb97822b8
.long 0x002f4000, 0xb910ea53
.long 0x00312000, 0x38fd6075
.long 0x00331000, 0x38620955
.long 0x00350000, 0x391e667f
.long 0x00370000, 0xb89b8736
.long 0x00390000, 0xb90a1714
.long 0x003b0000, 0xb7a54ded
.long 0x003d1000, 0xb96b8c15
.long 0x003f1000, 0x397336cf
.long 0x00413000, 0xb8eccd66
.long 0x00434000, 0x39599b45
.long 0x00456000, 0x3965422b
.long 0x00479000, 0xb8a2cdd5
.long 0x0049c000, 0xb9484f32
.long 0x004bf000, 0xb8fac043
.long 0x004e2000, 0x391182a4
.long 0x00506000, 0x38ccf6bc
.long 0x0052b000, 0xb97c4dc2
.long 0x0054f000, 0x38d6aaf4
.long 0x00574000, 0x391f995b
.long 0x0059a000, 0xb8ba8f62
.long 0x005c0000, 0xb9090d05
.long 0x005e6000, 0x37f4825e
.long 0x0060d000, 0xb8c844f5
.long 0x00634000, 0xb76d1a83
.long 0x0065c000, 0xb95f2310
.long 0x00684000, 0xb952b5f8
.long 0x006ac000, 0x37c6e7dd
.long 0x006d5000, 0xb7cfe126
.long 0x006fe000, 0x3917337c
.long 0x00728000, 0x383b9e2d
.long 0x00752000, 0x392fa2a5
.long 0x0077d000, 0x37df730b
.long 0x007a8000, 0x38ecb6dd
.long 0x007d4000, 0xb879f986
/*== poly_coeff[4] ==*/
.align 16
.long 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF /* coeff3 */
.long 0x3f00000F, 0x3f00000F, 0x3f00000F, 0x3f00000F /* coeff2 */
/* 32 Byte Padding */
.zero 32
/*== Log2e ==*/
.align 16
.long 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B
/*== L2H ==*/
.align 16
.long 0x3c318000, 0x3c318000, 0x3c318000, 0x3c318000
/*== L2L ==*/
.align 16
.long 0xb65e8083, 0xb65e8083, 0xb65e8083, 0xb65e8083
/*== ExpAddConst ==*/
.align 16
.long 0x49f0fe00, 0x49f0fe00, 0x49f0fe00, 0x49f0fe00
/*== IndexMask ==*/
.align 16
.long 0x000001f8, 0x000001f8, 0x000001f8, 0x000001f8
/*== ExpMask ==*/
.align 16
.long 0x0001fe00, 0x0001fe00, 0x0001fe00, 0x0001fe00
/*== MOne ==*/
.align 16
.long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
/*== AbsMask ==*/
.align 16
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/*== Threshold ==*/
.align 16
.long 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B // 86.643394
/*== L2 ==*/
.align 16
.long 0x3cb17218, 0x3cb17218, 0x3cb17218, 0x3cb17218
.align 16
.type __svml_sexpm1_data_internal,@object
.size __svml_sexpm1_data_internal,.-__svml_sexpm1_data_internal
.align 16
.FLT_10:
.long 0x4b400000,0x4b400000,0x4b400000,0x4b400000
.type .FLT_10,@object
.size .FLT_10,16

View File

@ -0,0 +1,20 @@
/* SSE version of vectorized expm1f, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define _ZGVdN8v_expm1f _ZGVdN8v_expm1f_sse_wrapper
#include "../svml_s_expm1f8_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized expm1f, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define SYMBOL_NAME _ZGVdN8v_expm1f
#include "ifunc-mathvec-avx2.h"
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
__hidden_ver1 (_ZGVdN8v_expm1f, __GI__ZGVdN8v_expm1f,
__redirect__ZGVdN8v_expm1f)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,351 @@
/* Function expm1f vectorized with AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/*
* ALGORITHM DESCRIPTION:
*
* N = (int)(x*2^k/log(2.0)), R = x - N*log(2)/2^k
* exp(x) = 2^(N/2^k) * poly(R) is computed in high-low parts
* expm1(x) = exp(x)-1 is then obtained via multi-precision computation
*
*
*/
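The AVX2 kernel, like the SSE4 one, relies on the ExpAddConst/IndexMask/ExpMask constants defined just below: adding ExpAddConst to the rounded N leaves the table byte offset in bits 3..8 of the float's raw encoding and biased floor(N/64) in bits 9..16, which the "vpslld $14" shift moves into the exponent field before it is OR-ed with the table value. The scalar program below is a demonstration of that bit trick derived from the constants in the data section; it is an illustration, not the kernel's data flow.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int
main (void)
{
  const uint32_t expaddconst_bits = 0x49f0fe00;   /* ExpAddConst below */
  float expaddconst;
  memcpy (&expaddconst, &expaddconst_bits, sizeof expaddconst);

  for (int n = -130; n <= 130; n += 65)           /* N = round(x*64/log(2)) */
    {
      float biased = (float) n + expaddconst;     /* exact: ulp here is 1/8 */
      uint32_t bits;
      memcpy (&bits, &biased, sizeof bits);

      uint32_t byte_off = bits & 0x1f8;           /* IndexMask: (N mod 64)*8 */
      uint32_t exp_field = (bits & 0x1fe00) << 14;   /* ExpMask, pslld $14 */
      float scale;
      memcpy (&scale, &exp_field, sizeof scale);  /* scale = 2^floor(N/64) */

      printf ("N=%4d  table byte offset=%3u  scale=%g\n",
              n, byte_off, scale);
    }
  return 0;
}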
/* Offsets for data table __svml_sexpm1_data_internal
*/
#define Expm1_HA_table 0
#define poly_coeff 512
#define Log2e 640
#define L2H 672
#define L2L 704
#define ExpAddConst 736
#define IndexMask 768
#define ExpMask 800
#define MOne 832
#define AbsMask 864
#define Threshold 896
#define L2 928
#include <sysdep.h>
.text
.section .text.avx2,"ax",@progbits
ENTRY(_ZGVdN8v_expm1f_avx2)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-32, %rsp
subq $96, %rsp
lea __svml_sexpm1_data_internal(%rip), %rax
vmovaps %ymm0, %ymm3
vmulps Log2e+__svml_sexpm1_data_internal(%rip), %ymm3, %ymm4
/* argument reduction */
vmovups L2H+__svml_sexpm1_data_internal(%rip), %ymm2
vmovups AbsMask+__svml_sexpm1_data_internal(%rip), %ymm5
vroundps $0, %ymm4, %ymm8
vaddps ExpAddConst+__svml_sexpm1_data_internal(%rip), %ymm8, %ymm0
vfnmadd213ps %ymm3, %ymm8, %ymm2
/* table lookup */
vandps IndexMask+__svml_sexpm1_data_internal(%rip), %ymm0, %ymm9
vandps %ymm5, %ymm3, %ymm6
vcmpnle_uqps Threshold+__svml_sexpm1_data_internal(%rip), %ymm6, %ymm7
vfnmadd231ps L2L+__svml_sexpm1_data_internal(%rip), %ymm8, %ymm2
vandps ExpMask+__svml_sexpm1_data_internal(%rip), %ymm0, %ymm0
vandnps %ymm3, %ymm5, %ymm1
vpslld $14, %ymm0, %ymm0
vmovmskps %ymm7, %edx
vmovd %xmm9, %ecx
vextractf128 $1, %ymm9, %xmm10
movslq %ecx, %rcx
vmovd %xmm10, %r9d
vpextrd $1, %xmm9, %esi
vpextrd $2, %xmm9, %edi
vpextrd $3, %xmm9, %r8d
vmovq (%rax,%rcx), %xmm11
vpextrd $1, %xmm10, %r10d
vpextrd $2, %xmm10, %r11d
vpextrd $3, %xmm10, %ecx
movslq %esi, %rsi
movslq %edi, %rdi
movslq %r8d, %r8
movslq %r9d, %r9
movslq %r10d, %r10
movslq %r11d, %r11
movslq %ecx, %rcx
vmovq (%rax,%rsi), %xmm13
vmovq (%rax,%rdi), %xmm12
vmovq (%rax,%r8), %xmm14
vmovq (%rax,%r9), %xmm15
vmovq (%rax,%r10), %xmm5
vmovq (%rax,%r11), %xmm4
vmovq (%rax,%rcx), %xmm6
vunpcklps %xmm12, %xmm11, %xmm7
vunpcklps %xmm14, %xmm13, %xmm8
vunpcklps %xmm4, %xmm15, %xmm15
vunpcklps %xmm6, %xmm5, %xmm9
vmulps %ymm2, %ymm2, %ymm13
vinsertf128 $1, %xmm15, %ymm7, %ymm10
vinsertf128 $1, %xmm9, %ymm8, %ymm11
vunpcklps %ymm11, %ymm10, %ymm12
vorps %ymm0, %ymm12, %ymm14
/* polynomial */
vmovups poly_coeff+__svml_sexpm1_data_internal(%rip), %ymm12
vfmadd213ps poly_coeff+32+__svml_sexpm1_data_internal(%rip), %ymm2, %ymm12
vfmadd213ps %ymm2, %ymm13, %ymm12
/* T-1 */
vmovups MOne+__svml_sexpm1_data_internal(%rip), %ymm13
vaddps %ymm13, %ymm14, %ymm2
vunpckhps %ymm11, %ymm10, %ymm4
vfmadd213ps %ymm2, %ymm0, %ymm4
vsubps %ymm13, %ymm4, %ymm0
vfmadd213ps %ymm4, %ymm12, %ymm0
vorps %ymm1, %ymm0, %ymm0
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx ymm0 ymm3
/* Restore registers
* and exit the function
*/
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
vmovups %ymm3, 32(%rsp)
vmovups %ymm0, 64(%rsp)
# LOE rbx r12 r13 r14 r15 edx ymm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
vzeroupper
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $8, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovups 64(%rsp), %ymm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 ymm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movss 32(%rsp,%r14,4), %xmm0
call expm1f@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movss %xmm0, 64(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVdN8v_expm1f_avx2)
.section .rodata, "a"
.align 32
#ifdef __svml_sexpm1_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(32)) VUINT32 Expm1_HA_table[(1<<7)][1];
__declspec(align(32)) VUINT32 poly_coeff[4][8][1];
__declspec(align(32)) VUINT32 Log2e[8][1];
__declspec(align(32)) VUINT32 L2H[8][1];
__declspec(align(32)) VUINT32 L2L[8][1];
__declspec(align(32)) VUINT32 ExpAddConst[8][1];
__declspec(align(32)) VUINT32 IndexMask[8][1];
__declspec(align(32)) VUINT32 ExpMask[8][1];
__declspec(align(32)) VUINT32 MOne[8][1];
__declspec(align(32)) VUINT32 AbsMask[8][1];
__declspec(align(32)) VUINT32 Threshold[8][1];
__declspec(align(32)) VUINT32 L2[8][1];
} __svml_sexpm1_data_internal;
#endif
__svml_sexpm1_data_internal:
/* Expm1_HA_table */
.long 0x00000000, 0x00000000
.long 0x00016000, 0x391a3e78
.long 0x0002d000, 0xb89e59d5
.long 0x00044000, 0xb93ae78a
.long 0x0005b000, 0xb9279306
.long 0x00072000, 0xb79e6961
.long 0x0008a000, 0xb97e2fee
.long 0x000a1000, 0x391aaea9
.long 0x000b9000, 0x39383c7d
.long 0x000d2000, 0xb9241490
.long 0x000ea000, 0x39073169
.long 0x00103000, 0x386e218a
.long 0x0011c000, 0x38f4dceb
.long 0x00136000, 0xb93a9a1e
.long 0x0014f000, 0x391df520
.long 0x00169000, 0x3905a6e4
.long 0x00183000, 0x397e0a32
.long 0x0019e000, 0x370b2641
.long 0x001b9000, 0xb8b1918b
.long 0x001d4000, 0xb8132c6a
.long 0x001ef000, 0x39264c12
.long 0x0020b000, 0x37221f73
.long 0x00227000, 0x37060619
.long 0x00243000, 0x3922b5c1
.long 0x00260000, 0xb814ab27
.long 0x0027d000, 0xb89b12c6
.long 0x0029a000, 0x382d5a75
.long 0x002b8000, 0xb938c94b
.long 0x002d6000, 0xb97822b8
.long 0x002f4000, 0xb910ea53
.long 0x00312000, 0x38fd6075
.long 0x00331000, 0x38620955
.long 0x00350000, 0x391e667f
.long 0x00370000, 0xb89b8736
.long 0x00390000, 0xb90a1714
.long 0x003b0000, 0xb7a54ded
.long 0x003d1000, 0xb96b8c15
.long 0x003f1000, 0x397336cf
.long 0x00413000, 0xb8eccd66
.long 0x00434000, 0x39599b45
.long 0x00456000, 0x3965422b
.long 0x00479000, 0xb8a2cdd5
.long 0x0049c000, 0xb9484f32
.long 0x004bf000, 0xb8fac043
.long 0x004e2000, 0x391182a4
.long 0x00506000, 0x38ccf6bc
.long 0x0052b000, 0xb97c4dc2
.long 0x0054f000, 0x38d6aaf4
.long 0x00574000, 0x391f995b
.long 0x0059a000, 0xb8ba8f62
.long 0x005c0000, 0xb9090d05
.long 0x005e6000, 0x37f4825e
.long 0x0060d000, 0xb8c844f5
.long 0x00634000, 0xb76d1a83
.long 0x0065c000, 0xb95f2310
.long 0x00684000, 0xb952b5f8
.long 0x006ac000, 0x37c6e7dd
.long 0x006d5000, 0xb7cfe126
.long 0x006fe000, 0x3917337c
.long 0x00728000, 0x383b9e2d
.long 0x00752000, 0x392fa2a5
.long 0x0077d000, 0x37df730b
.long 0x007a8000, 0x38ecb6dd
.long 0x007d4000, 0xb879f986
/*== poly_coeff[4] ==*/
.align 32
.long 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF /* coeff3 */
.long 0x3f00000F, 0x3f00000F, 0x3f00000F, 0x3f00000F, 0x3f00000F, 0x3f00000F, 0x3f00000F, 0x3f00000F /* coeff2 */
/* 64 Byte Padding */
.zero 64
/*== Log2e ==*/
.align 32
.long 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B
/*== L2H ==*/
.align 32
.long 0x3c318000, 0x3c318000, 0x3c318000, 0x3c318000, 0x3c318000, 0x3c318000, 0x3c318000, 0x3c318000
/*== L2L ==*/
.align 32
.long 0xb65e8083, 0xb65e8083, 0xb65e8083, 0xb65e8083, 0xb65e8083, 0xb65e8083, 0xb65e8083, 0xb65e8083
/*== ExpAddConst ==*/
.align 32
.long 0x49f0fe00, 0x49f0fe00, 0x49f0fe00, 0x49f0fe00, 0x49f0fe00, 0x49f0fe00, 0x49f0fe00, 0x49f0fe00
/*== IndexMask ==*/
.align 32
.long 0x000001f8, 0x000001f8, 0x000001f8, 0x000001f8, 0x000001f8, 0x000001f8, 0x000001f8, 0x000001f8
/*== ExpMask ==*/
.align 32
.long 0x0001fe00, 0x0001fe00, 0x0001fe00, 0x0001fe00, 0x0001fe00, 0x0001fe00, 0x0001fe00, 0x0001fe00
/*== MOne ==*/
.align 32
.long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
/*== AbsMask ==*/
.align 32
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/*== Threshold ==*/
.align 32
.long 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B // 86.643394
/*== L2 ==*/
.align 32
.long 0x3cb17218, 0x3cb17218, 0x3cb17218, 0x3cb17218, 0x3cb17218, 0x3cb17218, 0x3cb17218, 0x3cb17218
.align 32
.type __svml_sexpm1_data_internal,@object
.size __svml_sexpm1_data_internal,.-__svml_sexpm1_data_internal

View File

@ -0,0 +1,29 @@
/* Function expm1 vectorized with SSE2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
ENTRY (_ZGVbN2v_expm1)
WRAPPER_IMPL_SSE2 expm1
END (_ZGVbN2v_expm1)
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN2v_expm1)
#endif

View File

@ -0,0 +1,29 @@
/* Function expm1 vectorized with AVX2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
ENTRY (_ZGVdN4v_expm1)
WRAPPER_IMPL_AVX _ZGVbN2v_expm1
END (_ZGVdN4v_expm1)
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVdN4v_expm1)
#endif

View File

@ -0,0 +1,25 @@
/* Function expm1 vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
ENTRY (_ZGVcN4v_expm1)
WRAPPER_IMPL_AVX _ZGVbN2v_expm1
END (_ZGVcN4v_expm1)

View File

@ -0,0 +1,25 @@
/* Function expm1 vectorized with AVX-512, wrapper to AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
ENTRY (_ZGVeN8v_expm1)
WRAPPER_IMPL_AVX512 _ZGVdN4v_expm1
END (_ZGVeN8v_expm1)

View File

@ -0,0 +1,25 @@
/* Function expm1f vectorized with AVX-512. Wrapper to AVX2 version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
ENTRY (_ZGVeN16v_expm1f)
WRAPPER_IMPL_AVX512 _ZGVdN8v_expm1f
END (_ZGVeN16v_expm1f)

View File

@ -0,0 +1,29 @@
/* Function expm1f vectorized with SSE2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
ENTRY (_ZGVbN4v_expm1f)
WRAPPER_IMPL_SSE2 expm1f
END (_ZGVbN4v_expm1f)
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN4v_expm1f)
#endif

View File

@ -0,0 +1,29 @@
/* Function expm1f vectorized with AVX2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
ENTRY (_ZGVdN8v_expm1f)
WRAPPER_IMPL_AVX _ZGVbN4v_expm1f
END (_ZGVdN8v_expm1f)
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVdN8v_expm1f)
#endif

View File

@ -0,0 +1,25 @@
/* Function expm1f vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
ENTRY (_ZGVcN8v_expm1f)
WRAPPER_IMPL_AVX _ZGVbN4v_expm1f
END (_ZGVcN8v_expm1f)

View File

@ -0,0 +1 @@
#include "test-double-libmvec-expm1.c"

View File

@ -0,0 +1 @@
#include "test-double-libmvec-expm1.c"

View File

@ -0,0 +1 @@
#include "test-double-libmvec-expm1.c"

View File

@ -0,0 +1,3 @@
#define LIBMVEC_TYPE double
#define LIBMVEC_FUNC expm1
#include "test-vector-abi-arg1.h"

View File

@ -34,6 +34,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (hypot), _ZGVbN2vv_hypot)
VECTOR_WRAPPER (WRAPPER_NAME (exp2), _ZGVbN2v_exp2)
VECTOR_WRAPPER (WRAPPER_NAME (exp10), _ZGVbN2v_exp10)
VECTOR_WRAPPER (WRAPPER_NAME (cosh), _ZGVbN2v_cosh)
VECTOR_WRAPPER (WRAPPER_NAME (expm1), _ZGVbN2v_expm1)
#define VEC_INT_TYPE __m128i

View File

@ -37,6 +37,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (hypot), _ZGVdN4vv_hypot)
VECTOR_WRAPPER (WRAPPER_NAME (exp2), _ZGVdN4v_exp2)
VECTOR_WRAPPER (WRAPPER_NAME (exp10), _ZGVdN4v_exp10)
VECTOR_WRAPPER (WRAPPER_NAME (cosh), _ZGVdN4v_cosh)
VECTOR_WRAPPER (WRAPPER_NAME (expm1), _ZGVdN4v_expm1)
#ifndef __ILP32__
# define VEC_INT_TYPE __m256i

View File

@ -34,6 +34,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (hypot), _ZGVcN4vv_hypot)
VECTOR_WRAPPER (WRAPPER_NAME (exp2), _ZGVcN4v_exp2)
VECTOR_WRAPPER (WRAPPER_NAME (exp10), _ZGVcN4v_exp10)
VECTOR_WRAPPER (WRAPPER_NAME (cosh), _ZGVcN4v_cosh)
VECTOR_WRAPPER (WRAPPER_NAME (expm1), _ZGVcN4v_expm1)
#define VEC_INT_TYPE __m128i

View File

@ -34,6 +34,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (hypot), _ZGVeN8vv_hypot)
VECTOR_WRAPPER (WRAPPER_NAME (exp2), _ZGVeN8v_exp2)
VECTOR_WRAPPER (WRAPPER_NAME (exp10), _ZGVeN8v_exp10)
VECTOR_WRAPPER (WRAPPER_NAME (cosh), _ZGVeN8v_cosh)
VECTOR_WRAPPER (WRAPPER_NAME (expm1), _ZGVeN8v_expm1)
#ifndef __ILP32__
# define VEC_INT_TYPE __m512i

View File

@ -0,0 +1 @@
#include "test-float-libmvec-expm1f.c"

View File

@ -0,0 +1 @@
#include "test-float-libmvec-expm1f.c"

View File

@ -0,0 +1 @@
#include "test-float-libmvec-expm1f.c"

View File

@ -0,0 +1,3 @@
#define LIBMVEC_TYPE float
#define LIBMVEC_FUNC expm1f
#include "test-vector-abi-arg1.h"

View File

@ -34,6 +34,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (hypotf), _ZGVeN16vv_hypotf)
VECTOR_WRAPPER (WRAPPER_NAME (exp2f), _ZGVeN16v_exp2f)
VECTOR_WRAPPER (WRAPPER_NAME (exp10f), _ZGVeN16v_exp10f)
VECTOR_WRAPPER (WRAPPER_NAME (coshf), _ZGVeN16v_coshf)
VECTOR_WRAPPER (WRAPPER_NAME (expm1f), _ZGVeN16v_expm1f)
#define VEC_INT_TYPE __m512i

View File

@ -34,6 +34,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (hypotf), _ZGVbN4vv_hypotf)
VECTOR_WRAPPER (WRAPPER_NAME (exp2f), _ZGVbN4v_exp2f)
VECTOR_WRAPPER (WRAPPER_NAME (exp10f), _ZGVbN4v_exp10f)
VECTOR_WRAPPER (WRAPPER_NAME (coshf), _ZGVbN4v_coshf)
VECTOR_WRAPPER (WRAPPER_NAME (expm1f), _ZGVbN4v_expm1f)
#define VEC_INT_TYPE __m128i

View File

@ -37,6 +37,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (hypotf), _ZGVdN8vv_hypotf)
VECTOR_WRAPPER (WRAPPER_NAME (exp2f), _ZGVdN8v_exp2f)
VECTOR_WRAPPER (WRAPPER_NAME (exp10f), _ZGVdN8v_exp10f)
VECTOR_WRAPPER (WRAPPER_NAME (coshf), _ZGVdN8v_coshf)
VECTOR_WRAPPER (WRAPPER_NAME (expm1f), _ZGVdN8v_expm1f)
/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */
#undef VECTOR_WRAPPER_fFF

View File

@ -34,6 +34,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (hypotf), _ZGVcN8vv_hypotf)
VECTOR_WRAPPER (WRAPPER_NAME (exp2f), _ZGVcN8v_exp2f)
VECTOR_WRAPPER (WRAPPER_NAME (exp10f), _ZGVcN8v_exp10f)
VECTOR_WRAPPER (WRAPPER_NAME (coshf), _ZGVcN8v_coshf)
VECTOR_WRAPPER (WRAPPER_NAME (expm1f), _ZGVcN8v_expm1f)
#define VEC_INT_TYPE __m128i