x86-64: Add vector log1p/log1pf implementation to libmvec

Implement vectorized log1p/log1pf containing SSE, AVX, AVX2 and
AVX512 versions for libmvec as per vector ABI.  It also contains
accuracy and ABI tests for vector log1p/log1pf with regenerated ulps.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
Sunil K Pandey 2021-12-29 09:35:53 -08:00
parent 7e1722fec8
commit 74265c16ab
50 changed files with 4447 additions and 1 deletions

View File

@ -241,4 +241,15 @@
#define __DECL_SIMD_log2f32x
#define __DECL_SIMD_log2f64x
#define __DECL_SIMD_log2f128x
#define __DECL_SIMD_log1p
#define __DECL_SIMD_log1pf
#define __DECL_SIMD_log1pl
#define __DECL_SIMD_log1pf16
#define __DECL_SIMD_log1pf32
#define __DECL_SIMD_log1pf64
#define __DECL_SIMD_log1pf128
#define __DECL_SIMD_log1pf32x
#define __DECL_SIMD_log1pf64x
#define __DECL_SIMD_log1pf128x
#endif

View File

@ -119,7 +119,7 @@ __MATHCALL_VEC (exp10,, (_Mdouble_ __x));
__MATHCALL_VEC (expm1,, (_Mdouble_ __x));
/* Return log(1 + X). */
__MATHCALL (log1p,, (_Mdouble_ __x));
__MATHCALL_VEC (log1p,, (_Mdouble_ __x));
/* Return the base 2 signed integral exponent of X. */
__MATHCALL (logb,, (_Mdouble_ __x));

View File

@ -55,6 +55,7 @@ GLIBC_2.35 _ZGVbN2v_exp10 F
GLIBC_2.35 _ZGVbN2v_exp2 F
GLIBC_2.35 _ZGVbN2v_expm1 F
GLIBC_2.35 _ZGVbN2v_log10 F
GLIBC_2.35 _ZGVbN2v_log1p F
GLIBC_2.35 _ZGVbN2v_log2 F
GLIBC_2.35 _ZGVbN2v_sinh F
GLIBC_2.35 _ZGVbN2vv_atan2 F
@ -68,6 +69,7 @@ GLIBC_2.35 _ZGVbN4v_exp10f F
GLIBC_2.35 _ZGVbN4v_exp2f F
GLIBC_2.35 _ZGVbN4v_expm1f F
GLIBC_2.35 _ZGVbN4v_log10f F
GLIBC_2.35 _ZGVbN4v_log1pf F
GLIBC_2.35 _ZGVbN4v_log2f F
GLIBC_2.35 _ZGVbN4v_sinhf F
GLIBC_2.35 _ZGVbN4vv_atan2f F
@ -81,6 +83,7 @@ GLIBC_2.35 _ZGVcN4v_exp10 F
GLIBC_2.35 _ZGVcN4v_exp2 F
GLIBC_2.35 _ZGVcN4v_expm1 F
GLIBC_2.35 _ZGVcN4v_log10 F
GLIBC_2.35 _ZGVcN4v_log1p F
GLIBC_2.35 _ZGVcN4v_log2 F
GLIBC_2.35 _ZGVcN4v_sinh F
GLIBC_2.35 _ZGVcN4vv_atan2 F
@ -94,6 +97,7 @@ GLIBC_2.35 _ZGVcN8v_exp10f F
GLIBC_2.35 _ZGVcN8v_exp2f F
GLIBC_2.35 _ZGVcN8v_expm1f F
GLIBC_2.35 _ZGVcN8v_log10f F
GLIBC_2.35 _ZGVcN8v_log1pf F
GLIBC_2.35 _ZGVcN8v_log2f F
GLIBC_2.35 _ZGVcN8v_sinhf F
GLIBC_2.35 _ZGVcN8vv_atan2f F
@ -107,6 +111,7 @@ GLIBC_2.35 _ZGVdN4v_exp10 F
GLIBC_2.35 _ZGVdN4v_exp2 F
GLIBC_2.35 _ZGVdN4v_expm1 F
GLIBC_2.35 _ZGVdN4v_log10 F
GLIBC_2.35 _ZGVdN4v_log1p F
GLIBC_2.35 _ZGVdN4v_log2 F
GLIBC_2.35 _ZGVdN4v_sinh F
GLIBC_2.35 _ZGVdN4vv_atan2 F
@ -120,6 +125,7 @@ GLIBC_2.35 _ZGVdN8v_exp10f F
GLIBC_2.35 _ZGVdN8v_exp2f F
GLIBC_2.35 _ZGVdN8v_expm1f F
GLIBC_2.35 _ZGVdN8v_log10f F
GLIBC_2.35 _ZGVdN8v_log1pf F
GLIBC_2.35 _ZGVdN8v_log2f F
GLIBC_2.35 _ZGVdN8v_sinhf F
GLIBC_2.35 _ZGVdN8vv_atan2f F
@ -133,6 +139,7 @@ GLIBC_2.35 _ZGVeN16v_exp10f F
GLIBC_2.35 _ZGVeN16v_exp2f F
GLIBC_2.35 _ZGVeN16v_expm1f F
GLIBC_2.35 _ZGVeN16v_log10f F
GLIBC_2.35 _ZGVeN16v_log1pf F
GLIBC_2.35 _ZGVeN16v_log2f F
GLIBC_2.35 _ZGVeN16v_sinhf F
GLIBC_2.35 _ZGVeN16vv_atan2f F
@ -146,6 +153,7 @@ GLIBC_2.35 _ZGVeN8v_exp10 F
GLIBC_2.35 _ZGVeN8v_exp2 F
GLIBC_2.35 _ZGVeN8v_expm1 F
GLIBC_2.35 _ZGVeN8v_log10 F
GLIBC_2.35 _ZGVeN8v_log1p F
GLIBC_2.35 _ZGVeN8v_log2 F
GLIBC_2.35 _ZGVeN8v_sinh F
GLIBC_2.35 _ZGVeN8vv_atan2 F

View File

@ -110,6 +110,10 @@
# define __DECL_SIMD_log2 __DECL_SIMD_x86_64
# undef __DECL_SIMD_log2f
# define __DECL_SIMD_log2f __DECL_SIMD_x86_64
# undef __DECL_SIMD_log1p
# define __DECL_SIMD_log1p __DECL_SIMD_x86_64
# undef __DECL_SIMD_log1pf
# define __DECL_SIMD_log1pf __DECL_SIMD_x86_64
# endif
#endif

View File

@ -54,6 +54,8 @@
!GCC$ builtin (log10f) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (log2) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (log2f) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (log1p) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (log1pf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
!GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
@ -93,3 +95,5 @@
!GCC$ builtin (log10f) attributes simd (notinbranch) if('x32')
!GCC$ builtin (log2) attributes simd (notinbranch) if('x32')
!GCC$ builtin (log2f) attributes simd (notinbranch) if('x32')
!GCC$ builtin (log1p) attributes simd (notinbranch) if('x32')
!GCC$ builtin (log1pf) attributes simd (notinbranch) if('x32')

View File

@ -36,6 +36,7 @@ libmvec-funcs = \
hypot \
log \
log10 \
log1p \
log2 \
pow \
sin \

View File

@ -23,6 +23,7 @@ libmvec {
_ZGVbN2v_exp2; _ZGVcN4v_exp2; _ZGVdN4v_exp2; _ZGVeN8v_exp2;
_ZGVbN2v_expm1; _ZGVcN4v_expm1; _ZGVdN4v_expm1; _ZGVeN8v_expm1;
_ZGVbN2v_log10; _ZGVcN4v_log10; _ZGVdN4v_log10; _ZGVeN8v_log10;
_ZGVbN2v_log1p; _ZGVcN4v_log1p; _ZGVdN4v_log1p; _ZGVeN8v_log1p;
_ZGVbN2v_log2; _ZGVcN4v_log2; _ZGVdN4v_log2; _ZGVeN8v_log2;
_ZGVbN2v_sinh; _ZGVcN4v_sinh; _ZGVdN4v_sinh; _ZGVeN8v_sinh;
_ZGVbN2vv_atan2; _ZGVcN4vv_atan2; _ZGVdN4vv_atan2; _ZGVeN8vv_atan2;
@ -36,6 +37,7 @@ libmvec {
_ZGVbN4v_exp2f; _ZGVcN8v_exp2f; _ZGVdN8v_exp2f; _ZGVeN16v_exp2f;
_ZGVbN4v_expm1f; _ZGVcN8v_expm1f; _ZGVdN8v_expm1f; _ZGVeN16v_expm1f;
_ZGVbN4v_log10f; _ZGVcN8v_log10f; _ZGVdN8v_log10f; _ZGVeN16v_log10f;
_ZGVbN4v_log1pf; _ZGVcN8v_log1pf; _ZGVdN8v_log1pf; _ZGVeN16v_log1pf;
_ZGVbN4v_log2f; _ZGVcN8v_log2f; _ZGVdN8v_log2f; _ZGVeN16v_log2f;
_ZGVbN4v_sinhf; _ZGVcN8v_sinhf; _ZGVdN8v_sinhf; _ZGVeN16v_sinhf;
_ZGVbN4vv_atan2f; _ZGVcN8vv_atan2f; _ZGVdN8vv_atan2f; _ZGVeN16vv_atan2f;

View File

@ -1685,6 +1685,26 @@ float: 2
float128: 2
ldouble: 3
Function: "log1p_vlen16":
float: 2
Function: "log1p_vlen2":
double: 1
Function: "log1p_vlen4":
double: 1
float: 2
Function: "log1p_vlen4_avx2":
double: 1
Function: "log1p_vlen8":
double: 1
float: 2
Function: "log1p_vlen8_avx2":
float: 2
Function: "log2":
double: 2
float: 1

View File

@ -0,0 +1,20 @@
/* SSE2 version of vectorized log1p, vector length is 2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Redefine the public name so that every use of _ZGVbN2v_log1p in the
   included core file expands to _ZGVbN2v_log1p_sse2 instead.  */
#define _ZGVbN2v_log1p _ZGVbN2v_log1p_sse2
#include "../svml_d_log1p2_core.S"

View File

@ -0,0 +1,27 @@
/* Multiple versions of vectorized log1p, vector length is 2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME must be set before the include.  REDIRECT_NAME and
   IFUNC_SELECTOR used below are not defined in this file; presumably
   they come from the included header -- confirm.  */
#define SYMBOL_NAME _ZGVbN2v_log1p
#include "ifunc-mathvec-sse4_1.h"
/* Define _ZGVbN2v_log1p via the libc_ifunc_redirected macro, handing it
   the value returned by IFUNC_SELECTOR ().  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Shared builds only: declare the __GI_ name with hidden visibility.  */
__hidden_ver1 (_ZGVbN2v_log1p, __GI__ZGVbN2v_log1p, __redirect__ZGVbN2v_log1p)
__attribute__ ((visibility ("hidden")));
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,20 @@
/* SSE version of vectorized log1p, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Redefine the public name so that every use of _ZGVdN4v_log1p in the
   included core file expands to _ZGVdN4v_log1p_sse_wrapper instead.  */
#define _ZGVdN4v_log1p _ZGVdN4v_log1p_sse_wrapper
#include "../svml_d_log1p4_core.S"

View File

@ -0,0 +1,27 @@
/* Multiple versions of vectorized log1p, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME must be set before the include.  REDIRECT_NAME and
   IFUNC_SELECTOR used below are not defined in this file; presumably
   they come from the included header -- confirm.  */
#define SYMBOL_NAME _ZGVdN4v_log1p
#include "ifunc-mathvec-avx2.h"
/* Define _ZGVdN4v_log1p via the libc_ifunc_redirected macro, handing it
   the value returned by IFUNC_SELECTOR ().  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Shared builds only: declare the __GI_ name with hidden visibility.  */
__hidden_ver1 (_ZGVdN4v_log1p, __GI__ZGVdN4v_log1p, __redirect__ZGVdN4v_log1p)
__attribute__ ((visibility ("hidden")));
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,20 @@
/* AVX2 version of vectorized log1p, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Redefine the public name so that every use of _ZGVeN8v_log1p in the
   included core file expands to _ZGVeN8v_log1p_avx2_wrapper instead.  */
#define _ZGVeN8v_log1p _ZGVeN8v_log1p_avx2_wrapper
#include "../svml_d_log1p8_core.S"

View File

@ -0,0 +1,27 @@
/* Multiple versions of vectorized log1p, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME must be set before the include.  REDIRECT_NAME and
   IFUNC_SELECTOR used below are not defined in this file; presumably
   they come from the included header -- confirm.  */
#define SYMBOL_NAME _ZGVeN8v_log1p
#include "ifunc-mathvec-avx512-skx.h"
/* Define _ZGVeN8v_log1p via the libc_ifunc_redirected macro, handing it
   the value returned by IFUNC_SELECTOR ().  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Shared builds only: declare the __GI_ name with hidden visibility.  */
__hidden_ver1 (_ZGVeN8v_log1p, __GI__ZGVeN8v_log1p, __redirect__ZGVeN8v_log1p)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,317 @@
/* Function log1p vectorized with AVX-512.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* 1+x = 2^k*(xh + xl) is computed in high-low parts; xh in [1,2)
* Get short reciprocal approximation Rcp ~ 1/xh
* R = (Rcp*xh - 1.0) + Rcp*xl
* log1p(x) = k*log(2.0) - log(Rcp) + poly(R)
* log(Rcp) is tabulated
*
*
*/
/* Offsets for data table __svml_dlog1p_data_internal_avx512
*/
/* Byte offsets from the start of __svml_dlog1p_data_internal_avx512.
   Log_tbl spans 128 bytes (16 .quad entries); each later field is 64
   bytes, i.e. one .quad value repeated 8 times.  */
#define Log_tbl 0
#define One 128
#define SgnMask 192
#define C075 256
#define poly_coeff9 320
#define poly_coeff8 384
#define poly_coeff7 448
#define poly_coeff6 512
#define poly_coeff5 576
#define poly_coeff4 640
#define poly_coeff3 704
#define poly_coeff2 768
#define L2 832
#include <sysdep.h>
.text
.section .text.evex512,"ax",@progbits
/* _ZGVeN8v_log1p_skx: log1p of the 8 doubles in %zmm0, result in %zmm0.
 *
 * Main path: straight-line AVX-512 evaluation (see the algorithm
 * description at the top of the file).  Lanes whose reduced argument R
 * is NaN are collected as a bitmask in %edx; if any bit is set, those
 * lanes are recomputed one at a time with the scalar log1p.
 *
 * Stack frame after 64-byte realignment (192 bytes reserved):
 *   (%rsp), 8(%rsp), 16(%rsp)  saved %r14, %r13, %r12 (special path only)
 *   64(%rsp)                   copy of the input vector
 *   128(%rsp)                  result vector, patched lane by lane
 */
ENTRY(_ZGVeN8v_log1p_skx)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-64, %rsp
subq $192, %rsp
vmovups One+__svml_dlog1p_data_internal_avx512(%rip), %zmm7
vmovups SgnMask+__svml_dlog1p_data_internal_avx512(%rip), %zmm14
/* Keep the original input in zmm9; the special path stores it later */
vmovaps %zmm0, %zmm9
/* zmm11 = 1 + x; zmm8 = sign bit of x (ORed into the result at the end) */
vaddpd {rn-sae}, %zmm9, %zmm7, %zmm11
vandpd %zmm14, %zmm9, %zmm8
/* compute 1+x as high, low parts */
vmaxpd {sae}, %zmm9, %zmm7, %zmm10
vminpd {sae}, %zmm9, %zmm7, %zmm12
/* GetMant(x), normalized to [1,2) for x>=0, NaN for x<0 */
vgetmantpd $8, {sae}, %zmm11, %zmm6
/* GetExp(x) */
vgetexppd {sae}, %zmm11, %zmm5
/* zmm13 = (1+x) - max(1,x) */
vsubpd {rn-sae}, %zmm10, %zmm11, %zmm13
/* DblRcp ~ 1/Mantissa */
vrcp14pd %zmm6, %zmm15
/* Start polynomial evaluation */
vmovups poly_coeff9+__svml_dlog1p_data_internal_avx512(%rip), %zmm10
vmovups poly_coeff7+__svml_dlog1p_data_internal_avx512(%rip), %zmm11
/* Xl */
vsubpd {rn-sae}, %zmm13, %zmm12, %zmm2
/* zmm3 = Expon with its sign bit flipped, i.e. -Expon */
vxorpd %zmm14, %zmm5, %zmm3
/* round DblRcp to 4 fractional bits (RN mode, no Precision exception) */
vrndscalepd $88, {sae}, %zmm15, %zmm4
vmovups poly_coeff5+__svml_dlog1p_data_internal_avx512(%rip), %zmm12
vmovups poly_coeff6+__svml_dlog1p_data_internal_avx512(%rip), %zmm14
vmovups poly_coeff3+__svml_dlog1p_data_internal_avx512(%rip), %zmm13
/* Xl*2^(-Expon) */
vscalefpd {rn-sae}, %zmm3, %zmm2, %zmm1
/* Reduced argument: R = DblRcp*(Mantissa+Xl) - 1 */
vfmsub213pd {rn-sae}, %zmm7, %zmm4, %zmm6
/* First 8 Log_tbl entries; the other 8 are supplied to vpermt2pd below */
vmovups __svml_dlog1p_data_internal_avx512(%rip), %zmm3
/*
* Table lookup
* Prepare exponent correction: DblRcp<0.75?
*/
vmovups C075+__svml_dlog1p_data_internal_avx512(%rip), %zmm2
/* Prepare table index */
vpsrlq $48, %zmm4, %zmm0
vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm6
vmovups poly_coeff8+__svml_dlog1p_data_internal_avx512(%rip), %zmm1
/* k1 = lanes with DblRcp < 0.75 */
vcmppd $17, {sae}, %zmm2, %zmm4, %k1
/* k0 = lanes where R != R, i.e. R is NaN: these go to the scalar fallback */
vcmppd $4, {sae}, %zmm6, %zmm6, %k0
vfmadd231pd {rn-sae}, %zmm6, %zmm10, %zmm1
vmovups poly_coeff4+__svml_dlog1p_data_internal_avx512(%rip), %zmm10
vfmadd231pd {rn-sae}, %zmm6, %zmm11, %zmm14
vmovups L2+__svml_dlog1p_data_internal_avx512(%rip), %zmm4
vpermt2pd Log_tbl+64+__svml_dlog1p_data_internal_avx512(%rip), %zmm0, %zmm3
/* add 1 to Expon if DblRcp<0.75 */
vaddpd {rn-sae}, %zmm7, %zmm5, %zmm5{%k1}
/* R^2 */
vmulpd {rn-sae}, %zmm6, %zmm6, %zmm0
vfmadd231pd {rn-sae}, %zmm6, %zmm12, %zmm10
vmovups poly_coeff2+__svml_dlog1p_data_internal_avx512(%rip), %zmm12
/* R^4 */
vmulpd {rn-sae}, %zmm0, %zmm0, %zmm15
vfmadd231pd {rn-sae}, %zmm6, %zmm13, %zmm12
vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm1
/* edx = bitmask of special lanes, tested after the result is formed */
kmovw %k0, %edx
vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm10
/* polynomial */
vfmadd213pd {rn-sae}, %zmm10, %zmm15, %zmm1
vfmadd213pd {rn-sae}, %zmm6, %zmm0, %zmm1
/* result = Expon*L2 + (table value + polynomial), then restore sign of x */
vaddpd {rn-sae}, %zmm1, %zmm3, %zmm6
vfmadd213pd {rn-sae}, %zmm6, %zmm4, %zmm5
vorpd %zmm8, %zmm5, %zmm0
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm9
/* Restore registers
* and exit the function
*/
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
/* Spill input and vector result so single lanes can be patched in memory */
vmovups %zmm9, 64(%rsp)
vmovups %zmm0, 128(%rsp)
# LOE rbx r12 r13 r14 r15 edx zmm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
vzeroupper
/* r12d = lane index (starts at 0), r13d = special-lane bitmask; r14 is
* saved because it carries the lane index across the scalar call
*/
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
/* 8 lanes per vector */
cmpl $8, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
/* Reload the patched result vector */
vmovups 128(%rsp), %zmm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 zmm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
/* r14 = zero-extended lane index; load that lane of the saved input */
movl %r12d, %r14d
movsd 64(%rsp,%r14,8), %xmm0
call log1p@PLT
# LOE rbx r14 r15 r12d r13d xmm0
/* Overwrite the same lane of the saved result */
movsd %xmm0, 128(%rsp,%r14,8)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVeN8v_log1p_skx)
.section .rodata, "a"
.align 64
#ifdef __svml_dlog1p_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(64)) VUINT32 Log_tbl[16][2];
__declspec(align(64)) VUINT32 One[8][2];
__declspec(align(64)) VUINT32 SgnMask[8][2];
__declspec(align(64)) VUINT32 C075[8][2];
__declspec(align(64)) VUINT32 poly_coeff9[8][2];
__declspec(align(64)) VUINT32 poly_coeff8[8][2];
__declspec(align(64)) VUINT32 poly_coeff7[8][2];
__declspec(align(64)) VUINT32 poly_coeff6[8][2];
__declspec(align(64)) VUINT32 poly_coeff5[8][2];
__declspec(align(64)) VUINT32 poly_coeff4[8][2];
__declspec(align(64)) VUINT32 poly_coeff3[8][2];
__declspec(align(64)) VUINT32 poly_coeff2[8][2];
__declspec(align(64)) VUINT32 L2[8][2];
} __svml_dlog1p_data_internal_avx512;
#endif
/* Constant table read by _ZGVeN8v_log1p_skx.  Fields are addressed by
   adding the byte offsets #defined near the top of the file to this
   label.  */
__svml_dlog1p_data_internal_avx512:
/*== Log_tbl: 16 entries.  The kernel loads the first 8 into a register
   and passes the second 8 (Log_tbl+64) to vpermt2pd as a memory operand ==*/
.quad 0x0000000000000000
.quad 0xbfaf0a30c01162a6
.quad 0xbfbe27076e2af2e6
.quad 0xbfc5ff3070a793d4
.quad 0xbfcc8ff7c79a9a22
.quad 0xbfd1675cababa60e
.quad 0xbfd4618bc21c5ec2
.quad 0xbfd739d7f6bbd007
.quad 0x3fd269621134db92
.quad 0x3fcf991c6cb3b379
.quad 0x3fca93ed3c8ad9e3
.quad 0x3fc5bf406b543db2
.quad 0x3fc1178e8227e47c
.quad 0x3fb9335e5d594989
.quad 0x3fb08598b59e3a07
.quad 0x3fa0415d89e74444
/*== One ==*/
.align 64
.quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
/*== SgnMask: only the sign bit set ==*/
.align 64
.quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
/*== C075 0.75 ==*/
.align 64
.quad 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000
/*== poly_coeff9 ==*/
.align 64
.quad 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70
/*== poly_coeff8 ==*/
.align 64
.quad 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62
/*== poly_coeff7 ==*/
.align 64
.quad 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF
/*== poly_coeff6 ==*/
.align 64
.quad 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06
/*== poly_coeff5 ==*/
.align 64
.quad 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C
/*== poly_coeff4 ==*/
.align 64
.quad 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD
/*== poly_coeff3 ==*/
.align 64
.quad 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466
/*== poly_coeff2 ==*/
.align 64
.quad 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6
/*== L2 = log(2) ==*/
.align 64
.quad 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF
.align 64
.type __svml_dlog1p_data_internal_avx512,@object
.size __svml_dlog1p_data_internal_avx512,.-__svml_dlog1p_data_internal_avx512

View File

@ -0,0 +1,20 @@
/* AVX2 version of vectorized log1pf, vector length is 16.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Redefine the public name so that every use of _ZGVeN16v_log1pf in the
   included core file expands to _ZGVeN16v_log1pf_avx2_wrapper instead.  */
#define _ZGVeN16v_log1pf _ZGVeN16v_log1pf_avx2_wrapper
#include "../svml_s_log1pf16_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized log1pf, vector length is 16.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME must be set before the include.  REDIRECT_NAME and
   IFUNC_SELECTOR used below are not defined in this file; presumably
   they come from the included header -- confirm.  */
#define SYMBOL_NAME _ZGVeN16v_log1pf
#include "ifunc-mathvec-avx512-skx.h"
/* Define _ZGVeN16v_log1pf via the libc_ifunc_redirected macro, handing it
   the value returned by IFUNC_SELECTOR ().  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
/* Shared builds only: declare the __GI_ name with hidden visibility.  */
__hidden_ver1 (_ZGVeN16v_log1pf, __GI__ZGVeN16v_log1pf,
__redirect__ZGVeN16v_log1pf)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,271 @@
/* Function log1pf vectorized with AVX-512.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
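* ALGORITHM DESCRIPTION:
*
* 1+x = 2^k*(xh + xl) is computed in high-low parts; xh in [1,2)
* Get short reciprocal approximation Rcp ~ 1/xh
* R = (Rcp*xh - 1.0) + Rcp*xl
* log1p(x) = k*log(2.0) - log(Rcp) + poly(R)
* log(Rcp) is tabulated
*
* NOTE(review): the single-precision code in this file contains no
* reciprocal instruction and no log(Rcp) table.  It takes k and a
* mantissa m from the bits of 1+x rebased by iBrkValue, forms
* R = (m - 1.0) + xl*2^(-k), and returns k*sLn2 + (R + R^2*poly(R)).
* The Rcp steps above look carried over from the double-precision
* variant -- confirm.
*
*/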
/* Offsets for data table __svml_slog1p_data_internal
*/
/* Byte offsets from the start of __svml_slog1p_data_internal; consecutive
   fields are 64 bytes apart.  */
#define SgnMask 0
#define sOne 64
#define sPoly_1 128
#define sPoly_2 192
#define sPoly_3 256
#define sPoly_4 320
#define sPoly_5 384
#define sPoly_6 448
#define sPoly_7 512
#define sPoly_8 576
#define iHiDelta 640
#define iLoRange 704
#define iBrkValue 768
#define iOffExpoMask 832
#define sLn2 896
#include <sysdep.h>
.text
/* Place the kernel in .text.evex512, the section name used by the
   double-precision AVX-512 log1p kernel (the previous spelling
   ".text.exex512" was a typo).  */
.section .text.evex512,"ax",@progbits
/* _ZGVeN16v_log1pf_skx: log1pf of the 16 floats in %zmm0, result in %zmm0.
 *
 * Main path: straight-line AVX-512 evaluation.  With hi = 1+x, lanes for
 * which the integer value bits(hi) + iHiDelta is below iLoRange (signed
 * compare) are collected as a bitmask in %edx; if any bit is set, those
 * lanes are recomputed one at a time with the scalar log1pf.
 *
 * Stack frame after 64-byte realignment (192 bytes reserved):
 *   (%rsp), 8(%rsp), 16(%rsp)  saved %r14, %r13, %r12 (special path only)
 *   64(%rsp)                   copy of the input vector
 *   128(%rsp)                  result vector, patched lane by lane
 */
ENTRY(_ZGVeN16v_log1pf_skx)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-64, %rsp
subq $192, %rsp
vmovups sOne+__svml_slog1p_data_internal(%rip), %zmm2
/* reduction: compute r,n */
vmovups iBrkValue+__svml_slog1p_data_internal(%rip), %zmm12
vmovups SgnMask+__svml_slog1p_data_internal(%rip), %zmm4
/* Keep the original input in zmm3; the special path stores it later */
vmovaps %zmm0, %zmm3
/* compute 1+x as high, low parts */
vmaxps {sae}, %zmm3, %zmm2, %zmm5
vminps {sae}, %zmm3, %zmm2, %zmm7
/* zmm1 = x & ~SgnMask: the sign bit of x, ORed into the result at the end */
vandnps %zmm3, %zmm4, %zmm1
/* zmm4 = all ones; lanes that pass the range check are cleared below */
vpternlogd $255, %zmm4, %zmm4, %zmm4
/* zmm9 = high part of 1+x */
vaddps {rn-sae}, %zmm7, %zmm5, %zmm9
vpsubd %zmm12, %zmm9, %zmm10
vsubps {rn-sae}, %zmm9, %zmm5, %zmm6
/* check argument value ranges */
vpaddd iHiDelta+__svml_slog1p_data_internal(%rip), %zmm9, %zmm8
/* zmm13 = n: arithmetic shift of (bits(hi) - iBrkValue) by 23 */
vpsrad $23, %zmm10, %zmm13
vmovups sPoly_5+__svml_slog1p_data_internal(%rip), %zmm9
/* k1 = lanes with zmm8 >= iLoRange (signed), i.e. the in-range lanes */
vpcmpd $5, iLoRange+__svml_slog1p_data_internal(%rip), %zmm8, %k1
vpslld $23, %zmm13, %zmm14
/* zmm15 = low part of 1+x: (max - hi) + min */
vaddps {rn-sae}, %zmm7, %zmm6, %zmm15
/* zmm0 = (float) n */
vcvtdq2ps {rn-sae}, %zmm13, %zmm0
/* zmm13 = bits(1.0) - (n << 23), used as the scale factor 2^(-n) */
vpsubd %zmm14, %zmm2, %zmm13
vmovups sPoly_8+__svml_slog1p_data_internal(%rip), %zmm7
vmovups sPoly_1+__svml_slog1p_data_internal(%rip), %zmm14
/* zmm6 = low part * 2^(-n) */
vmulps {rn-sae}, %zmm13, %zmm15, %zmm6
/* zmm5 = reduced mantissa: masked bits rebased by iBrkValue */
vpandd iOffExpoMask+__svml_slog1p_data_internal(%rip), %zmm10, %zmm11
vpaddd %zmm12, %zmm11, %zmm5
vmovups sPoly_4+__svml_slog1p_data_internal(%rip), %zmm10
vmovups sPoly_3+__svml_slog1p_data_internal(%rip), %zmm11
vmovups sPoly_2+__svml_slog1p_data_internal(%rip), %zmm12
/* polynomial evaluation */
/* zmm15 = R = (mantissa - 1.0) + scaled low part */
vsubps {rn-sae}, %zmm2, %zmm5, %zmm2
vaddps {rn-sae}, %zmm6, %zmm2, %zmm15
vmovups sPoly_7+__svml_slog1p_data_internal(%rip), %zmm2
vfmadd231ps {rn-sae}, %zmm15, %zmm7, %zmm2
/* Clear zmm4 in the in-range lanes; the other lanes stay all ones */
vpandnd %zmm8, %zmm8, %zmm4{%k1}
vmovups sPoly_6+__svml_slog1p_data_internal(%rip), %zmm8
/* combine and get argument value range mask */
vptestmd %zmm4, %zmm4, %k0
vfmadd213ps {rn-sae}, %zmm8, %zmm15, %zmm2
/* edx = bitmask of special lanes, tested after the result is formed */
kmovw %k0, %edx
vfmadd213ps {rn-sae}, %zmm9, %zmm15, %zmm2
vfmadd213ps {rn-sae}, %zmm10, %zmm15, %zmm2
vfmadd213ps {rn-sae}, %zmm11, %zmm15, %zmm2
vfmadd213ps {rn-sae}, %zmm12, %zmm15, %zmm2
vfmadd213ps {rn-sae}, %zmm14, %zmm15, %zmm2
/* zmm4 = R + R^2 * poly(R) */
vmulps {rn-sae}, %zmm15, %zmm2, %zmm4
vfmadd213ps {rn-sae}, %zmm15, %zmm15, %zmm4
/* final reconstruction */
vmovups sLn2+__svml_slog1p_data_internal(%rip), %zmm15
/* zmm0 = n*sLn2 + zmm4, then restore the sign of x */
vfmadd213ps {rn-sae}, %zmm4, %zmm15, %zmm0
vorps %zmm1, %zmm0, %zmm0
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm3
/* Restore registers
* and exit the function
*/
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
/* Spill input and vector result so single lanes can be patched in memory */
vmovups %zmm3, 64(%rsp)
vmovups %zmm0, 128(%rsp)
# LOE rbx r12 r13 r14 r15 edx zmm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
vzeroupper
/* r12d = lane index (starts at 0), r13d = special-lane bitmask; r14 is
* saved because it carries the lane index across the scalar call
*/
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
/* 16 lanes per vector */
cmpl $16, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
/* Reload the patched result vector */
vmovups 128(%rsp), %zmm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 zmm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
/* r14 = zero-extended lane index; load that lane of the saved input */
movl %r12d, %r14d
movss 64(%rsp,%r14,4), %xmm0
call log1pf@PLT
# LOE rbx r14 r15 r12d r13d xmm0
/* Overwrite the same lane of the saved result */
movss %xmm0, 128(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVeN16v_log1pf_skx)
/* Constant table for the 16-lane (512-bit) single-precision log1p kernel.
   Every field is a 64-byte-aligned row of sixteen identical 32-bit values,
   i.e. one scalar constant replicated across a full vector.  */
.section .rodata, "a"
.align 64
/* Layout description in C syntax.  It is only passed through when
   __svml_slog1p_data_internal_typedef is defined; the .long rows below are
   what actually populate the table.  */
#ifdef __svml_slog1p_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(64)) VUINT32 SgnMask[16][1];
__declspec(align(64)) VUINT32 sOne[16][1];
__declspec(align(64)) VUINT32 sPoly[8][16][1];
__declspec(align(64)) VUINT32 iHiDelta[16][1];
__declspec(align(64)) VUINT32 iLoRange[16][1];
__declspec(align(64)) VUINT32 iBrkValue[16][1];
__declspec(align(64)) VUINT32 iOffExpoMask[16][1];
__declspec(align(64)) VUINT32 sLn2[16][1];
} __svml_slog1p_data_internal;
#endif
__svml_slog1p_data_internal:
/*== SgnMask = every bit except the IEEE sign bit ==*/
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/*== sOne = SP 1.0 ==*/
.align 64
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/*== sPoly[] = SP polynomial, eight rows holding coefficients P0..P7 ==*/
.align 64
.long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
.long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
.long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
.long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
.long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
.long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
.long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
.long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
/*== iHiDelta = SP 80000000-7f000000 (= 0x01000000) ==*/
.align 64
.long 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000
/*== iLoRange = SP 00800000+iHiDelta (= 0x01800000) ==*/
.align 64
.long 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000
/*== iBrkValue = SP 2/3 ==*/
.align 64
.long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
/*== iOffExpoMask = SP significand mask (low 23 bits) ==*/
.align 64
.long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
/*== sLn2 = SP ln(2) ==*/
.align 64
.long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
/* Pad to a whole 64-byte row, then record the symbol as a data object whose
   size is the distance from its label to this point.  */
.align 64
.type __svml_slog1p_data_internal,@object
.size __svml_slog1p_data_internal,.-__svml_slog1p_data_internal

View File

@ -0,0 +1,20 @@
/* SSE2 version of vectorized log1pf, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the entry point before pulling in the generic core file, so that
   whatever it defines as _ZGVbN4v_log1pf is emitted as
   _ZGVbN4v_log1pf_sse2 instead.  Presumably this is one of the candidates
   the ifunc selector picks from -- confirm against the selector header.  */
#define _ZGVbN4v_log1pf _ZGVbN4v_log1pf_sse2
#include "../svml_s_log1pf4_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized log1pf, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Public vector-ABI symbol that this file resolves through an ifunc.  */
#define SYMBOL_NAME _ZGVbN4v_log1pf
/* REDIRECT_NAME and IFUNC_SELECTOR are not defined in this file; presumably
   they come from this header (SSE4.1-based selection, judging by its name
   -- confirm).  */
#include "ifunc-mathvec-sse4_1.h"

/* Define SYMBOL_NAME as an indirect function whose implementation is the
   one returned by IFUNC_SELECTOR ().  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());

/* Shared-library builds only: declare the __GI_ internal name for the
   symbol with hidden visibility.  NOTE(review): presumably so that calls
   from inside the library bind locally -- confirm against the
   __hidden_ver1 definition.  */
#ifdef SHARED
__hidden_ver1 (_ZGVbN4v_log1pf, __GI__ZGVbN4v_log1pf,
__redirect__ZGVbN4v_log1pf)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,252 @@
/* Function log1pf vectorized with SSE4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
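/*
* ALGORITHM DESCRIPTION:
*
* 1+x is computed in high-low parts xh + xl, where xh is the rounded
* sum max(1,x) + min(1,x) and xl is the low-order correction
* xh = 2^k*m, with k chosen so that the reduced mantissa m is in [2/3, 4/3)
* R = (m - 1.0) + 2^(-k)*xl
* log1p(x) = k*log(2.0) + R + R^2*poly(R)
* No reciprocal approximation or log table is used in this code; lanes
* whose 1+x falls outside the fast-path range are recomputed with the
* scalar log1pf
*
*/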
/* Byte offsets of the fields of __svml_slog1p_data_internal.
   Each field is one 16-byte row (four 32-bit lanes).  sPoly occupies eight
   consecutive rows (coefficients P0..P7), which is why the next field,
   iHiDelta, starts at 32 + 8*16 = 160.  These values must stay in sync with
   the order of the rows in the table definition.
*/
#define SgnMask 0
#define sOne 16
#define sPoly 32
#define iHiDelta 160
#define iLoRange 176
#define iBrkValue 192
#define iOffExpoMask 208
#define sLn2 224
#include <sysdep.h>
.text
.section .text.sse4,"ax",@progbits
/* _ZGVbN4v_log1pf_sse4: log1pf for four packed single-precision lanes.
   In:    %xmm0 = x[0..3]
   Out:   %xmm0 = log1pf (x[i]) in each lane
   Uses:  %xmm1-%xmm11, %eax, %edx, flags.  %r12-%r14 are used only on the
          special-input path, where they are saved and restored.
   Frame: 72 bytes.  0/8/16(%rsp) hold the saved %r14/%r13/%r12,
          32(%rsp) a copy of the input vector and 48(%rsp) the result
          vector; the last two are only written on the special-input path.
   Lanes flagged in the range mask are recomputed one at a time with the
   scalar log1pf.  */
ENTRY(_ZGVbN4v_log1pf_sse4)
/* 72-byte frame plus the 8-byte return address gives a CFA offset of 80.
   Assuming the usual SysV alignment on entry, this also leaves %rsp
   16-byte aligned for the log1pf call further down.  */
subq $72, %rsp
cfi_def_cfa_offset(80)
/* xmm7 = 1.0 in every lane */
movups sOne+__svml_slog1p_data_internal(%rip), %xmm7
/* compute 1+x as high, low parts:
   xmm1 = max (1.0, x), xmm5 = min (1.0, x), xmm4 = xh = max + min */
movaps %xmm7, %xmm1
movaps %xmm7, %xmm5
maxps %xmm0, %xmm1
minps %xmm0, %xmm5
movaps %xmm1, %xmm4
/* check argument value ranges */
movdqu iHiDelta+__svml_slog1p_data_internal(%rip), %xmm2
addps %xmm5, %xmm4
/* reduction: compute r,n */
movdqu iBrkValue+__svml_slog1p_data_internal(%rip), %xmm3
/* xmm2 = bits (xh) + iHiDelta; compared against iLoRange below */
paddd %xmm4, %xmm2
movdqu iOffExpoMask+__svml_slog1p_data_internal(%rip), %xmm8
/* xmm1 = max - xh; adding min to it yields the low part xl (in xmm5) */
subps %xmm4, %xmm1
/* xmm4 = bits (xh) - bits (2/3), an integer subtraction */
psubd %xmm3, %xmm4
addps %xmm1, %xmm5
/* xmm8 = low 23 bits of that difference */
pand %xmm4, %xmm8
/* xmm4 = k (arithmetic shift keeps the sign), xmm10 = (float) k */
psrad $23, %xmm4
cvtdq2ps %xmm4, %xmm10
/* xmm4 = k << 23, i.e. k positioned in the exponent field */
pslld $23, %xmm4
movaps %xmm7, %xmm1
/* xmm8 = reduced mantissa m = bits (2/3) + low bits, so m is in [2/3, 4/3) */
paddd %xmm3, %xmm8
/* xmm1 = bits (1.0) - (k << 23), which is the float 2^(-k) */
psubd %xmm4, %xmm1
/* xmm1 = 2^(-k) * xl */
mulps %xmm5, %xmm1
/* polynomial evaluation */
/* xmm8 = m - 1.0 */
subps %xmm7, %xmm8
/* final reconstruction */
/* xmm10 = k * ln(2) */
mulps sLn2+__svml_slog1p_data_internal(%rip), %xmm10
/* xmm1 = R = (m - 1.0) + 2^(-k) * xl */
addps %xmm8, %xmm1
/* Horner evaluation of P7*R^7 + ... + P1*R + P0 in xmm9, starting at P7
   (sPoly+112) and stepping down one 16-byte row per coefficient.  The
   range-mask and sign-bit computations are interleaved with it.  */
movups sPoly+112+__svml_slog1p_data_internal(%rip), %xmm9
mulps %xmm1, %xmm9
movdqu iLoRange+__svml_slog1p_data_internal(%rip), %xmm6
/* xmm6 = lanes where iLoRange > bits (xh) + iHiDelta as signed integers.
   With the table constants this flags a 1+x that is zero/denormal,
   negative, or too large (including Inf/NaN).  */
pcmpgtd %xmm2, %xmm6
addps sPoly+96+__svml_slog1p_data_internal(%rip), %xmm9
/* combine and get argument value range mask */
/* edx = one bit per flagged lane */
movmskps %xmm6, %edx
movups SgnMask+__svml_slog1p_data_internal(%rip), %xmm11
mulps %xmm1, %xmm9
/* xmm11 = ~SgnMask & x = the sign bit of x only */
andnps %xmm0, %xmm11
addps sPoly+80+__svml_slog1p_data_internal(%rip), %xmm9
mulps %xmm1, %xmm9
addps sPoly+64+__svml_slog1p_data_internal(%rip), %xmm9
mulps %xmm1, %xmm9
addps sPoly+48+__svml_slog1p_data_internal(%rip), %xmm9
mulps %xmm1, %xmm9
addps sPoly+32+__svml_slog1p_data_internal(%rip), %xmm9
mulps %xmm1, %xmm9
addps sPoly+16+__svml_slog1p_data_internal(%rip), %xmm9
mulps %xmm1, %xmm9
addps sPoly+__svml_slog1p_data_internal(%rip), %xmm9
/* xmm9 = R^2 * poly (R) */
mulps %xmm1, %xmm9
mulps %xmm1, %xmm9
/* xmm1 = R + R^2 * poly (R) + k * ln(2) */
addps %xmm9, %xmm1
addps %xmm10, %xmm1
/* OR the sign bit of x into the result */
orps %xmm11, %xmm1
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1
/* Restore registers
* and exit the function
*/
L(EXIT):
/* The result is kept in xmm1 on both paths; move it to the return register */
movaps %xmm1, %xmm0
addq $72, %rsp
cfi_def_cfa_offset(8)
ret
/* The code below runs with the 72-byte frame still in place */
cfi_def_cfa_offset(80)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
/* Spill the input (32(%rsp)) and the fast-path result (48(%rsp)) so that
   individual lanes can be read and patched through memory.  */
movups %xmm0, 32(%rsp)
movups %xmm1, 48(%rsp)
# LOE rbx rbp r12 r13 r14 r15 edx
xorl %eax, %eax
/* Save the callee-saved registers used by the loop.  Loop state:
   r12d = lane index (starts at 0), r13d = range mask, r14 = lane index
   used for addressing.  */
movq %r12, 16(%rsp)
cfi_offset(12, -64)
movl %eax, %r12d
movq %r13, 8(%rsp)
cfi_offset(13, -72)
movl %edx, %r13d
movq %r14, (%rsp)
cfi_offset(14, -80)
# LOE rbx rbp r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
/* CF = bit r12d of the range mask */
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx rbp r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
/* Four lanes in total */
cmpl $4, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx rbp r15 r12d r13d
/* All lanes visited: restore the saved registers and reload the patched
   result vector.  */
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
movups 48(%rsp), %xmm1
/* Go to exit */
jmp L(EXIT)
/* The scalar-call code below is reached with r12-r14 still saved on the
   stack, so their save locations are declared again.  */
cfi_offset(12, -64)
cfi_offset(13, -72)
cfi_offset(14, -80)
# LOE rbx rbp r12 r13 r14 r15 xmm1
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
/* The 32-bit move zero-extends the lane index into r14 for addressing */
movl %r12d, %r14d
/* Load lane r14 of the saved input */
movss 32(%rsp,%r14,4), %xmm0
call log1pf@PLT
# LOE rbx rbp r14 r15 r12d r13d xmm0
/* Overwrite lane r14 of the saved result with the scalar value */
movss %xmm0, 48(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx rbp r15 r12d r13d
END(_ZGVbN4v_log1pf_sse4)
/* Constant table for the 4-lane (128-bit) single-precision log1p kernel.
   Every field is a 16-byte-aligned row of four identical 32-bit values,
   i.e. one scalar constant replicated across a full vector.  The rows must
   stay in the order assumed by the offset macros used by the code.  */
.section .rodata, "a"
.align 16
/* Layout description in C syntax.  It is only passed through when
   __svml_slog1p_data_internal_typedef is defined; the .long rows below are
   what actually populate the table.  */
#ifdef __svml_slog1p_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(16)) VUINT32 SgnMask[4][1];
__declspec(align(16)) VUINT32 sOne[4][1];
__declspec(align(16)) VUINT32 sPoly[8][4][1];
__declspec(align(16)) VUINT32 iHiDelta[4][1];
__declspec(align(16)) VUINT32 iLoRange[4][1];
__declspec(align(16)) VUINT32 iBrkValue[4][1];
__declspec(align(16)) VUINT32 iOffExpoMask[4][1];
__declspec(align(16)) VUINT32 sLn2[4][1];
} __svml_slog1p_data_internal;
#endif
__svml_slog1p_data_internal:
/*== SgnMask = every bit except the IEEE sign bit ==*/
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/*== sOne = SP 1.0 ==*/
.align 16
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/*== sPoly[] = SP polynomial, eight rows holding coefficients P0..P7 ==*/
.align 16
.long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
.long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
.long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
.long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
.long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
.long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
.long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
.long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
/*== iHiDelta = SP 80000000-7f000000 (= 0x01000000) ==*/
.align 16
.long 0x01000000, 0x01000000, 0x01000000, 0x01000000
/*== iLoRange = SP 00800000+iHiDelta (= 0x01800000) ==*/
.align 16
.long 0x01800000, 0x01800000, 0x01800000, 0x01800000
/*== iBrkValue = SP 2/3 ==*/
.align 16
.long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
/*== iOffExpoMask = SP significand mask (low 23 bits) ==*/
.align 16
.long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
/*== sLn2 = SP ln(2) ==*/
.align 16
.long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
/* Pad to a whole 16-byte row, then record the symbol as a data object whose
   size is the distance from its label to this point.  */
.align 16
.type __svml_slog1p_data_internal,@object
.size __svml_slog1p_data_internal,.-__svml_slog1p_data_internal

View File

@ -0,0 +1,20 @@
/* SSE version of vectorized log1pf, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Rename the entry point before pulling in the generic core file, so that
   whatever it defines as _ZGVdN8v_log1pf is emitted as
   _ZGVdN8v_log1pf_sse_wrapper instead.  Presumably this is the fallback
   candidate of the ifunc selector -- confirm against the selector header.  */
#define _ZGVdN8v_log1pf _ZGVdN8v_log1pf_sse_wrapper
#include "../svml_s_log1pf8_core.S"

View File

@ -0,0 +1,28 @@
/* Multiple versions of vectorized log1pf, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Public vector-ABI symbol that this file resolves through an ifunc.  */
#define SYMBOL_NAME _ZGVdN8v_log1pf
/* REDIRECT_NAME and IFUNC_SELECTOR are not defined in this file; presumably
   they come from this header (AVX2-based selection, judging by its name
   -- confirm).  */
#include "ifunc-mathvec-avx2.h"

/* Define SYMBOL_NAME as an indirect function whose implementation is the
   one returned by IFUNC_SELECTOR ().  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());

/* Shared-library builds only: declare the __GI_ internal name for the
   symbol with hidden visibility.  NOTE(review): presumably so that calls
   from inside the library bind locally -- confirm against the
   __hidden_ver1 definition.  */
#ifdef SHARED
__hidden_ver1 (_ZGVdN8v_log1pf, __GI__ZGVdN8v_log1pf,
__redirect__ZGVdN8v_log1pf)
__attribute__ ((visibility ("hidden")));
#endif

View File

@ -0,0 +1,254 @@
/* Function log1pf vectorized with AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
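/*
* ALGORITHM DESCRIPTION:
*
* 1+x is computed in high-low parts xh + xl, where xh is the rounded
* sum max(1,x) + min(1,x) and xl is the low-order correction
* xh = 2^k*m, with k chosen so that the reduced mantissa m is in [2/3, 4/3)
* R = (m - 1.0) + 2^(-k)*xl
* log1p(x) = k*log(2.0) + R + R^2*poly(R)
* No reciprocal approximation or log table is used in this code; lanes
* whose 1+x falls outside the fast-path range are recomputed with the
* scalar log1pf
*
*/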
/* Byte offsets of the fields of __svml_slog1p_data_internal.
   Each field is one 32-byte row (eight 32-bit lanes).  sPoly occupies eight
   consecutive rows (coefficients P0..P7), which is why the next field,
   iHiDelta, starts at 64 + 8*32 = 320.  These values must stay in sync with
   the order of the rows in the table definition.
*/
#define SgnMask 0
#define sOne 32
#define sPoly 64
#define iHiDelta 320
#define iLoRange 352
#define iBrkValue 384
#define iOffExpoMask 416
#define sLn2 448
#include <sysdep.h>
.text
.section .text.avx2,"ax",@progbits
/* _ZGVdN8v_log1pf_avx2: log1pf for eight packed single-precision lanes.
   In:    %ymm0 = x[0..7]
   Out:   %ymm0 = log1pf (x[i]) in each lane
   Uses:  %ymm1-%ymm15, %eax, %edx, flags.  %r12-%r14 are used only on the
          special-input path, where they are saved and restored.
   Frame: %rbp-based; %rsp is aligned down to 32 bytes and then lowered by
          96.  0/8/16(%rsp) hold the saved %r14/%r13/%r12, 32(%rsp) a copy
          of the input vector and 64(%rsp) the result vector; the last two
          are only written on the special-input path.
   Lanes flagged in the range mask are recomputed one at a time with the
   scalar log1pf.  */
ENTRY(_ZGVdN8v_log1pf_avx2)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
/* From here on the CFA is described relative to %rbp (DWARF register 6),
   because %rsp is about to be realigned.  */
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-32, %rsp
subq $96, %rsp
/* ymm2 = 1.0 in every lane */
vmovups sOne+__svml_slog1p_data_internal(%rip), %ymm2
/* reduction: compute r,n */
vmovups iBrkValue+__svml_slog1p_data_internal(%rip), %ymm13
vmovups SgnMask+__svml_slog1p_data_internal(%rip), %ymm4
vmovups iLoRange+__svml_slog1p_data_internal(%rip), %ymm8
/* Keep the input in ymm3; ymm0 is reused for k below */
vmovaps %ymm0, %ymm3
/* compute 1+x as high, low parts:
   ymm5 = max (1.0, x), ymm6 = min (1.0, x), ymm10 = xh = max + min */
vmaxps %ymm3, %ymm2, %ymm5
vminps %ymm3, %ymm2, %ymm6
vaddps %ymm6, %ymm5, %ymm10
/* ymm11 = bits (xh) - bits (2/3), an integer subtraction */
vpsubd %ymm13, %ymm10, %ymm11
/* check argument value ranges */
/* ymm9 = bits (xh) + iHiDelta; compared against iLoRange below */
vpaddd iHiDelta+__svml_slog1p_data_internal(%rip), %ymm10, %ymm9
/* ymm7 = max - xh; adding min to it yields the low part xl below */
vsubps %ymm10, %ymm5, %ymm7
/* ymm14 = k (arithmetic shift keeps the sign) */
vpsrad $23, %ymm11, %ymm14
/* ymm12 = low 23 bits of bits (xh) - bits (2/3) */
vpand iOffExpoMask+__svml_slog1p_data_internal(%rip), %ymm11, %ymm12
/* ymm15 = k << 23, ymm0 = (float) k */
vpslld $23, %ymm14, %ymm15
vcvtdq2ps %ymm14, %ymm0
/* ymm14 = bits (1.0) - (k << 23), which is the float 2^(-k) */
vpsubd %ymm15, %ymm2, %ymm14
/* ymm1 = ~SgnMask & x = the sign bit of x only */
vandnps %ymm3, %ymm4, %ymm1
/* ymm4 = xl */
vaddps %ymm7, %ymm6, %ymm4
/* ymm6 = reduced mantissa m = bits (2/3) + low bits, so m is in [2/3, 4/3) */
vpaddd %ymm13, %ymm12, %ymm6
/* ymm7 = 2^(-k) * xl */
vmulps %ymm4, %ymm14, %ymm7
/* polynomial evaluation */
/* ymm2 = m - 1.0 */
vsubps %ymm2, %ymm6, %ymm2
/* ymm5 = lanes where iLoRange > bits (xh) + iHiDelta as signed integers.
   With the table constants this flags a 1+x that is zero/denormal,
   negative, or too large (including Inf/NaN).  */
vpcmpgtd %ymm9, %ymm8, %ymm5
/* Horner evaluation of P7*R^7 + ... + P1*R + P0 in ymm8, starting at P7
   (sPoly+224) and stepping down one 32-byte row per coefficient.  */
vmovups sPoly+224+__svml_slog1p_data_internal(%rip), %ymm8
/* ymm9 = R = (m - 1.0) + 2^(-k) * xl */
vaddps %ymm2, %ymm7, %ymm9
vfmadd213ps sPoly+192+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8
vfmadd213ps sPoly+160+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8
vfmadd213ps sPoly+128+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8
vfmadd213ps sPoly+96+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8
vfmadd213ps sPoly+64+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8
vfmadd213ps sPoly+32+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8
vfmadd213ps sPoly+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8
/* ymm10 = R * poly (R), then ymm10 = R * ymm10 + R = R + R^2 * poly (R) */
vmulps %ymm8, %ymm9, %ymm10
vfmadd213ps %ymm9, %ymm9, %ymm10
/* final reconstruction */
/* ymm0 = k * ln(2) + ymm10 */
vfmadd132ps sLn2+__svml_slog1p_data_internal(%rip), %ymm10, %ymm0
/* combine and get argument value range mask */
/* edx = one bit per flagged lane; then OR the sign bit of x into the result */
vmovmskps %ymm5, %edx
vorps %ymm1, %ymm0, %ymm0
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx ymm0 ymm3
/* Restore registers
* and exit the function
*/
L(EXIT):
movq %rbp, %rsp
popq %rbp
/* Back to an %rsp-based CFA (DWARF register 7) */
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
/* The code below runs with the %rbp frame still in place */
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
L(SPECIAL_VALUES_BRANCH):
/* Spill the input (32(%rsp)) and the fast-path result (64(%rsp)) so that
   individual lanes can be read and patched through memory.  */
vmovups %ymm3, 32(%rsp)
vmovups %ymm0, 64(%rsp)
# LOE rbx r12 r13 r14 r15 edx ymm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
/* Clear the upper YMM halves before the legacy-SSE scalar path
   (movss and the log1pf call) runs.  */
vzeroupper
/* Save the callee-saved registers used by the loop.  Loop state:
   r12d = lane index (starts at 0), r13d = range mask, r14 = lane index
   used for addressing.  Because %rsp was realigned, each save slot is
   described to the unwinder with a DWARF expression based on the CFA.  */
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
/* CF = bit r12d of the range mask */
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
L(SPECIAL_VALUES_LOOP):
incl %r12d
/* Eight lanes in total */
cmpl $8, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
/* All lanes visited: restore the saved registers and reload the patched
   result vector.  */
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovups 64(%rsp), %ymm0
/* Go to exit */
jmp L(EXIT)
/* The scalar-call code below is reached with r12-r14 still saved on the
   stack, so their save locations are declared again.  */
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 ymm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
/* The 32-bit move zero-extends the lane index into r14 for addressing */
movl %r12d, %r14d
/* Load lane r14 of the saved input */
movss 32(%rsp,%r14,4), %xmm0
call log1pf@PLT
# LOE rbx r14 r15 r12d r13d xmm0
/* Overwrite lane r14 of the saved result with the scalar value */
movss %xmm0, 64(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVdN8v_log1pf_avx2)
/* Constant table for the 8-lane (256-bit) single-precision log1p kernel.
   Every field is a 32-byte-aligned row of eight identical 32-bit values,
   i.e. one scalar constant replicated across a full vector.  The rows must
   stay in the order assumed by the offset macros used by the code.  */
.section .rodata, "a"
.align 32
/* Layout description in C syntax.  It is only passed through when
   __svml_slog1p_data_internal_typedef is defined; the .long rows below are
   what actually populate the table.  */
#ifdef __svml_slog1p_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(32)) VUINT32 SgnMask[8][1];
__declspec(align(32)) VUINT32 sOne[8][1];
__declspec(align(32)) VUINT32 sPoly[8][8][1];
__declspec(align(32)) VUINT32 iHiDelta[8][1];
__declspec(align(32)) VUINT32 iLoRange[8][1];
__declspec(align(32)) VUINT32 iBrkValue[8][1];
__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
__declspec(align(32)) VUINT32 sLn2[8][1];
} __svml_slog1p_data_internal;
#endif
__svml_slog1p_data_internal:
/*== SgnMask = every bit except the IEEE sign bit ==*/
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/*== sOne = SP 1.0 ==*/
.align 32
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/*== sPoly[] = SP polynomial, eight rows holding coefficients P0..P7 ==*/
.align 32
.long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
.long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
.long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
.long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
.long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
.long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
.long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
.long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
/*== iHiDelta = SP 80000000-7f000000 (= 0x01000000) ==*/
.align 32
.long 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000
/*== iLoRange = SP 00800000+iHiDelta (= 0x01800000) ==*/
.align 32
.long 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000
/*== iBrkValue = SP 2/3 ==*/
.align 32
.long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
/*== iOffExpoMask = SP significand mask (low 23 bits) ==*/
.align 32
.long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
/*== sLn2 = SP ln(2) ==*/
.align 32
.long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
/* Pad to a whole 32-byte row, then record the symbol as a data object whose
   size is the distance from its label to this point.  */
.align 32
.type __svml_slog1p_data_internal,@object
.size __svml_slog1p_data_internal,.-__svml_slog1p_data_internal

View File

@ -0,0 +1,29 @@
/* Function log1p vectorized with SSE2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* Entry point _ZGVbN2v_log1p (double precision; N2 in the mangled name
   denotes two lanes).  The whole body comes from WRAPPER_IMPL_SSE2, which
   is defined in svml_d_wrapper_impl.h and not shown here; presumably it
   evaluates the scalar log1p for each lane -- confirm in that header.  */
ENTRY (_ZGVbN2v_log1p)
WRAPPER_IMPL_SSE2 log1p
END (_ZGVbN2v_log1p)
/* The hidden definition is emitted only when USE_MULTIARCH is not defined.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN2v_log1p)
#endif

View File

@ -0,0 +1,29 @@
/* Function log1p vectorized with AVX2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* Entry point _ZGVdN4v_log1p (double precision; N4 in the mangled name
   denotes four lanes).  The body comes from WRAPPER_IMPL_AVX, defined in
   svml_d_wrapper_impl.h and not shown here; presumably it services the
   request with calls to the narrower _ZGVbN2v_log1p -- confirm in that
   header.  */
ENTRY (_ZGVdN4v_log1p)
WRAPPER_IMPL_AVX _ZGVbN2v_log1p
END (_ZGVdN4v_log1p)
/* The hidden definition is emitted only when USE_MULTIARCH is not defined.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVdN4v_log1p)
#endif

View File

@ -0,0 +1,25 @@
/* Function log1p vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* Entry point _ZGVcN4v_log1p (double precision; N4 in the mangled name
   denotes four lanes).  The body comes from WRAPPER_IMPL_AVX, defined in
   svml_d_wrapper_impl.h and not shown here; presumably it services the
   request with calls to the narrower _ZGVbN2v_log1p -- confirm in that
   header.  No hidden definition is emitted for this variant.  */
ENTRY (_ZGVcN4v_log1p)
WRAPPER_IMPL_AVX _ZGVbN2v_log1p
END (_ZGVcN4v_log1p)

View File

@ -0,0 +1,25 @@
/* Function log1p vectorized with AVX-512, wrapper to AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
/* Entry point _ZGVeN8v_log1p (double precision; N8 in the mangled name
   denotes eight lanes).  The body comes from WRAPPER_IMPL_AVX512, defined
   in svml_d_wrapper_impl.h and not shown here; presumably it services the
   request with calls to the narrower _ZGVdN4v_log1p -- confirm in that
   header.  No hidden definition is emitted for this variant.  */
ENTRY (_ZGVeN8v_log1p)
WRAPPER_IMPL_AVX512 _ZGVdN4v_log1p
END (_ZGVeN8v_log1p)

View File

@ -0,0 +1,25 @@
/* Function log1pf vectorized with AVX-512. Wrapper to AVX2 version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* Entry point _ZGVeN16v_log1pf (single precision; N16 in the mangled name
   denotes sixteen lanes).  The body comes from WRAPPER_IMPL_AVX512, defined
   in svml_s_wrapper_impl.h and not shown here; presumably it services the
   request with calls to the narrower _ZGVdN8v_log1pf -- confirm in that
   header.  No hidden definition is emitted for this variant.  */
ENTRY (_ZGVeN16v_log1pf)
WRAPPER_IMPL_AVX512 _ZGVdN8v_log1pf
END (_ZGVeN16v_log1pf)

View File

@ -0,0 +1,29 @@
/* Function log1pf vectorized with SSE2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* Entry point _ZGVbN4v_log1pf (single precision; N4 in the mangled name
   denotes four lanes).  The whole body comes from WRAPPER_IMPL_SSE2, which
   is defined in svml_s_wrapper_impl.h and not shown here; presumably it
   evaluates the scalar log1pf for each lane -- confirm in that header.  */
ENTRY (_ZGVbN4v_log1pf)
WRAPPER_IMPL_SSE2 log1pf
END (_ZGVbN4v_log1pf)
/* The hidden definition is emitted only when USE_MULTIARCH is not defined.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN4v_log1pf)
#endif

View File

@ -0,0 +1,29 @@
/* Function log1pf vectorized with AVX2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* Entry point _ZGVdN8v_log1pf (single precision; N8 in the mangled name
   denotes eight lanes).  The body comes from WRAPPER_IMPL_AVX, defined in
   svml_s_wrapper_impl.h and not shown here; presumably it services the
   request with calls to the narrower _ZGVbN4v_log1pf -- confirm in that
   header.  */
ENTRY (_ZGVdN8v_log1pf)
WRAPPER_IMPL_AVX _ZGVbN4v_log1pf
END (_ZGVdN8v_log1pf)
/* The hidden definition is emitted only when USE_MULTIARCH is not defined.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVdN8v_log1pf)
#endif

View File

@ -0,0 +1,25 @@
/* Function log1pf vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* Entry point _ZGVcN8v_log1pf (single precision; N8 in the mangled name
   denotes eight lanes).  The body comes from WRAPPER_IMPL_AVX, defined in
   svml_s_wrapper_impl.h and not shown here; presumably it services the
   request with calls to the narrower _ZGVbN4v_log1pf -- confirm in that
   header.  No hidden definition is emitted for this variant.  */
ENTRY (_ZGVcN8v_log1pf)
WRAPPER_IMPL_AVX _ZGVbN4v_log1pf
END (_ZGVcN8v_log1pf)

View File

@ -0,0 +1 @@
#include "test-double-libmvec-log1p.c"

View File

@ -0,0 +1 @@
#include "test-double-libmvec-log1p.c"

View File

@ -0,0 +1 @@
#include "test-double-libmvec-log1p.c"

View File

@ -0,0 +1,3 @@
/* Parameters for the shared one-argument vector ABI test template:
   the element type and the scalar function name under test.  The template
   itself is test-vector-abi-arg1.h, which is not shown here.  */
#define LIBMVEC_TYPE double
#define LIBMVEC_FUNC log1p
#include "test-vector-abi-arg1.h"

View File

@ -40,6 +40,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cbrt), _ZGVbN2v_cbrt)
VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2), _ZGVbN2vv_atan2)
VECTOR_WRAPPER (WRAPPER_NAME (log10), _ZGVbN2v_log10)
VECTOR_WRAPPER (WRAPPER_NAME (log2), _ZGVbN2v_log2)
VECTOR_WRAPPER (WRAPPER_NAME (log1p), _ZGVbN2v_log1p)
#define VEC_INT_TYPE __m128i

View File

@ -43,6 +43,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cbrt), _ZGVdN4v_cbrt)
VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2), _ZGVdN4vv_atan2)
VECTOR_WRAPPER (WRAPPER_NAME (log10), _ZGVdN4v_log10)
VECTOR_WRAPPER (WRAPPER_NAME (log2), _ZGVdN4v_log2)
VECTOR_WRAPPER (WRAPPER_NAME (log1p), _ZGVdN4v_log1p)
#ifndef __ILP32__
# define VEC_INT_TYPE __m256i

View File

@ -40,6 +40,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cbrt), _ZGVcN4v_cbrt)
VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2), _ZGVcN4vv_atan2)
VECTOR_WRAPPER (WRAPPER_NAME (log10), _ZGVcN4v_log10)
VECTOR_WRAPPER (WRAPPER_NAME (log2), _ZGVcN4v_log2)
VECTOR_WRAPPER (WRAPPER_NAME (log1p), _ZGVcN4v_log1p)
#define VEC_INT_TYPE __m128i

View File

@ -40,6 +40,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cbrt), _ZGVeN8v_cbrt)
VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2), _ZGVeN8vv_atan2)
VECTOR_WRAPPER (WRAPPER_NAME (log10), _ZGVeN8v_log10)
VECTOR_WRAPPER (WRAPPER_NAME (log2), _ZGVeN8v_log2)
VECTOR_WRAPPER (WRAPPER_NAME (log1p), _ZGVeN8v_log1p)
#ifndef __ILP32__
# define VEC_INT_TYPE __m512i

View File

@ -0,0 +1 @@
#include "test-float-libmvec-log1pf.c"

View File

@ -0,0 +1 @@
#include "test-float-libmvec-log1pf.c"

View File

@ -0,0 +1 @@
#include "test-float-libmvec-log1pf.c"

View File

@ -0,0 +1,3 @@
/* Select the element type (float) and the function name (log1pf) before
   pulling in the shared test body.
   NOTE(review): both macros are presumably read by test-vector-abi-arg1.h
   to build the test for a one-argument vector function -- confirm against
   that header.  */
#define LIBMVEC_TYPE float
#define LIBMVEC_FUNC log1pf
#include "test-vector-abi-arg1.h"

View File

@ -40,6 +40,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cbrtf), _ZGVeN16v_cbrtf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2f), _ZGVeN16vv_atan2f)
VECTOR_WRAPPER (WRAPPER_NAME (log10f), _ZGVeN16v_log10f)
VECTOR_WRAPPER (WRAPPER_NAME (log2f), _ZGVeN16v_log2f)
VECTOR_WRAPPER (WRAPPER_NAME (log1pf), _ZGVeN16v_log1pf)
#define VEC_INT_TYPE __m512i

View File

@ -40,6 +40,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cbrtf), _ZGVbN4v_cbrtf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2f), _ZGVbN4vv_atan2f)
VECTOR_WRAPPER (WRAPPER_NAME (log10f), _ZGVbN4v_log10f)
VECTOR_WRAPPER (WRAPPER_NAME (log2f), _ZGVbN4v_log2f)
VECTOR_WRAPPER (WRAPPER_NAME (log1pf), _ZGVbN4v_log1pf)
#define VEC_INT_TYPE __m128i

View File

@ -43,6 +43,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cbrtf), _ZGVdN8v_cbrtf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2f), _ZGVdN8vv_atan2f)
VECTOR_WRAPPER (WRAPPER_NAME (log10f), _ZGVdN8v_log10f)
VECTOR_WRAPPER (WRAPPER_NAME (log2f), _ZGVdN8v_log2f)
VECTOR_WRAPPER (WRAPPER_NAME (log1pf), _ZGVdN8v_log1pf)
/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */
#undef VECTOR_WRAPPER_fFF

View File

@ -40,6 +40,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cbrtf), _ZGVcN8v_cbrtf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2f), _ZGVcN8vv_atan2f)
VECTOR_WRAPPER (WRAPPER_NAME (log10f), _ZGVcN8v_log10f)
VECTOR_WRAPPER (WRAPPER_NAME (log2f), _ZGVcN8v_log2f)
VECTOR_WRAPPER (WRAPPER_NAME (log1pf), _ZGVcN8v_log1pf)
#define VEC_INT_TYPE __m128i