x86-64: Add vector log1p/log1pf implementation to libmvec

Implement vectorized log1p/log1pf containing SSE, AVX, AVX2 and AVX512 versions for libmvec as per vector ABI. It also contains accuracy and ABI tests for vector log1p/log1pf with regenerated ulps. Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2024-11-22 04:50:07 +00:00 · 2021-12-29 09:35:53 -08:00 · 2021-12-29 09:35:53 -08:00 · 74265c16ab
commit 74265c16ab
parent 7e1722fec8
50 changed files with 4447 additions and 1 deletions
--- a/bits/libm-simd-decl-stubs.h
+++ b/bits/libm-simd-decl-stubs.h
@ -241,4 +241,15 @@
 #define __DECL_SIMD_log2f32x
 #define __DECL_SIMD_log2f64x
 #define __DECL_SIMD_log2f128x
 #define __DECL_SIMD_log1p
 #define __DECL_SIMD_log1pf
 #define __DECL_SIMD_log1pl
 #define __DECL_SIMD_log1pf16
 #define __DECL_SIMD_log1pf32
 #define __DECL_SIMD_log1pf64
 #define __DECL_SIMD_log1pf128
 #define __DECL_SIMD_log1pf32x
 #define __DECL_SIMD_log1pf64x
 #define __DECL_SIMD_log1pf128x
 #endif
--- a/math/bits/mathcalls.h
+++ b/math/bits/mathcalls.h
@ -119,7 +119,7 @@ __MATHCALL_VEC (exp10,, (_Mdouble_ __x));
 __MATHCALL_VEC (expm1,, (_Mdouble_ __x));
 /* Return log(1 + X).  */
-__MATHCALL (log1p,, (_Mdouble_ __x));
+__MATHCALL_VEC (log1p,, (_Mdouble_ __x));
 /* Return the base 2 signed integral exponent of X.  */
 __MATHCALL (logb,, (_Mdouble_ __x));
--- a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
@ -55,6 +55,7 @@ GLIBC_2.35 _ZGVbN2v_exp10 F
 GLIBC_2.35 _ZGVbN2v_exp2 F
 GLIBC_2.35 _ZGVbN2v_expm1 F
 GLIBC_2.35 _ZGVbN2v_log10 F
 GLIBC_2.35 _ZGVbN2v_log1p F
 GLIBC_2.35 _ZGVbN2v_log2 F
 GLIBC_2.35 _ZGVbN2v_sinh F
 GLIBC_2.35 _ZGVbN2vv_atan2 F
@ -68,6 +69,7 @@ GLIBC_2.35 _ZGVbN4v_exp10f F
 GLIBC_2.35 _ZGVbN4v_exp2f F
 GLIBC_2.35 _ZGVbN4v_expm1f F
 GLIBC_2.35 _ZGVbN4v_log10f F
 GLIBC_2.35 _ZGVbN4v_log1pf F
 GLIBC_2.35 _ZGVbN4v_log2f F
 GLIBC_2.35 _ZGVbN4v_sinhf F
 GLIBC_2.35 _ZGVbN4vv_atan2f F
@ -81,6 +83,7 @@ GLIBC_2.35 _ZGVcN4v_exp10 F
 GLIBC_2.35 _ZGVcN4v_exp2 F
 GLIBC_2.35 _ZGVcN4v_expm1 F
 GLIBC_2.35 _ZGVcN4v_log10 F
 GLIBC_2.35 _ZGVcN4v_log1p F
 GLIBC_2.35 _ZGVcN4v_log2 F
 GLIBC_2.35 _ZGVcN4v_sinh F
 GLIBC_2.35 _ZGVcN4vv_atan2 F
@ -94,6 +97,7 @@ GLIBC_2.35 _ZGVcN8v_exp10f F
 GLIBC_2.35 _ZGVcN8v_exp2f F
 GLIBC_2.35 _ZGVcN8v_expm1f F
 GLIBC_2.35 _ZGVcN8v_log10f F
 GLIBC_2.35 _ZGVcN8v_log1pf F
 GLIBC_2.35 _ZGVcN8v_log2f F
 GLIBC_2.35 _ZGVcN8v_sinhf F
 GLIBC_2.35 _ZGVcN8vv_atan2f F
@ -107,6 +111,7 @@ GLIBC_2.35 _ZGVdN4v_exp10 F
 GLIBC_2.35 _ZGVdN4v_exp2 F
 GLIBC_2.35 _ZGVdN4v_expm1 F
 GLIBC_2.35 _ZGVdN4v_log10 F
 GLIBC_2.35 _ZGVdN4v_log1p F
 GLIBC_2.35 _ZGVdN4v_log2 F
 GLIBC_2.35 _ZGVdN4v_sinh F
 GLIBC_2.35 _ZGVdN4vv_atan2 F
@ -120,6 +125,7 @@ GLIBC_2.35 _ZGVdN8v_exp10f F
 GLIBC_2.35 _ZGVdN8v_exp2f F
 GLIBC_2.35 _ZGVdN8v_expm1f F
 GLIBC_2.35 _ZGVdN8v_log10f F
 GLIBC_2.35 _ZGVdN8v_log1pf F
 GLIBC_2.35 _ZGVdN8v_log2f F
 GLIBC_2.35 _ZGVdN8v_sinhf F
 GLIBC_2.35 _ZGVdN8vv_atan2f F
@ -133,6 +139,7 @@ GLIBC_2.35 _ZGVeN16v_exp10f F
 GLIBC_2.35 _ZGVeN16v_exp2f F
 GLIBC_2.35 _ZGVeN16v_expm1f F
 GLIBC_2.35 _ZGVeN16v_log10f F
 GLIBC_2.35 _ZGVeN16v_log1pf F
 GLIBC_2.35 _ZGVeN16v_log2f F
 GLIBC_2.35 _ZGVeN16v_sinhf F
 GLIBC_2.35 _ZGVeN16vv_atan2f F
@ -146,6 +153,7 @@ GLIBC_2.35 _ZGVeN8v_exp10 F
 GLIBC_2.35 _ZGVeN8v_exp2 F
 GLIBC_2.35 _ZGVeN8v_expm1 F
 GLIBC_2.35 _ZGVeN8v_log10 F
 GLIBC_2.35 _ZGVeN8v_log1p F
 GLIBC_2.35 _ZGVeN8v_log2 F
 GLIBC_2.35 _ZGVeN8v_sinh F
 GLIBC_2.35 _ZGVeN8vv_atan2 F
--- a/sysdeps/x86/fpu/bits/math-vector.h
+++ b/sysdeps/x86/fpu/bits/math-vector.h
@ -110,6 +110,10 @@
 #  define __DECL_SIMD_log2 __DECL_SIMD_x86_64
 #  undef __DECL_SIMD_log2f
 #  define __DECL_SIMD_log2f __DECL_SIMD_x86_64
 #  undef __DECL_SIMD_log1p
 #  define __DECL_SIMD_log1p __DECL_SIMD_x86_64
 #  undef __DECL_SIMD_log1pf
 #  define __DECL_SIMD_log1pf __DECL_SIMD_x86_64
 # endif
 #endif
--- a/sysdeps/x86/fpu/finclude/math-vector-fortran.h
+++ b/sysdeps/x86/fpu/finclude/math-vector-fortran.h
@ -54,6 +54,8 @@
 !GCC$ builtin (log10f) attributes simd (notinbranch) if('x86_64')
 !GCC$ builtin (log2) attributes simd (notinbranch) if('x86_64')
 !GCC$ builtin (log2f) attributes simd (notinbranch) if('x86_64')
 !GCC$ builtin (log1p) attributes simd (notinbranch) if('x86_64')
 !GCC$ builtin (log1pf) attributes simd (notinbranch) if('x86_64')
 !GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
 !GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
@ -93,3 +95,5 @@
 !GCC$ builtin (log10f) attributes simd (notinbranch) if('x32')
 !GCC$ builtin (log2) attributes simd (notinbranch) if('x32')
 !GCC$ builtin (log2f) attributes simd (notinbranch) if('x32')
 !GCC$ builtin (log1p) attributes simd (notinbranch) if('x32')
 !GCC$ builtin (log1pf) attributes simd (notinbranch) if('x32')
--- a/sysdeps/x86_64/fpu/Makeconfig
+++ b/sysdeps/x86_64/fpu/Makeconfig
@ -36,6 +36,7 @@ libmvec-funcs = \
  hypot \
  log \
  log10 \
  log1p \
  log2 \
  pow \
  sin \
--- a/sysdeps/x86_64/fpu/Versions
+++ b/sysdeps/x86_64/fpu/Versions
@ -23,6 +23,7 @@ libmvec {
    _ZGVbN2v_exp2; _ZGVcN4v_exp2; _ZGVdN4v_exp2; _ZGVeN8v_exp2;
    _ZGVbN2v_expm1; _ZGVcN4v_expm1; _ZGVdN4v_expm1; _ZGVeN8v_expm1;
    _ZGVbN2v_log10; _ZGVcN4v_log10; _ZGVdN4v_log10; _ZGVeN8v_log10;
    _ZGVbN2v_log1p; _ZGVcN4v_log1p; _ZGVdN4v_log1p; _ZGVeN8v_log1p;
    _ZGVbN2v_log2; _ZGVcN4v_log2; _ZGVdN4v_log2; _ZGVeN8v_log2;
    _ZGVbN2v_sinh; _ZGVcN4v_sinh; _ZGVdN4v_sinh; _ZGVeN8v_sinh;
    _ZGVbN2vv_atan2; _ZGVcN4vv_atan2; _ZGVdN4vv_atan2; _ZGVeN8vv_atan2;
@ -36,6 +37,7 @@ libmvec {
    _ZGVbN4v_exp2f; _ZGVcN8v_exp2f; _ZGVdN8v_exp2f; _ZGVeN16v_exp2f;
    _ZGVbN4v_expm1f; _ZGVcN8v_expm1f; _ZGVdN8v_expm1f; _ZGVeN16v_expm1f;
    _ZGVbN4v_log10f; _ZGVcN8v_log10f; _ZGVdN8v_log10f; _ZGVeN16v_log10f;
    _ZGVbN4v_log1pf; _ZGVcN8v_log1pf; _ZGVdN8v_log1pf; _ZGVeN16v_log1pf;
    _ZGVbN4v_log2f; _ZGVcN8v_log2f; _ZGVdN8v_log2f; _ZGVeN16v_log2f;
    _ZGVbN4v_sinhf; _ZGVcN8v_sinhf; _ZGVdN8v_sinhf; _ZGVeN16v_sinhf;
    _ZGVbN4vv_atan2f; _ZGVcN8vv_atan2f; _ZGVdN8vv_atan2f; _ZGVeN16vv_atan2f;
--- a/sysdeps/x86_64/fpu/libm-test-ulps
+++ b/sysdeps/x86_64/fpu/libm-test-ulps
@ -1685,6 +1685,26 @@ float: 2
 float128: 2
 ldouble: 3
 Function: "log1p_vlen16":
 float: 2
 Function: "log1p_vlen2":
 double: 1
 Function: "log1p_vlen4":
 double: 1
 float: 2
 Function: "log1p_vlen4_avx2":
 double: 1
 Function: "log1p_vlen8":
 double: 1
 float: 2
 Function: "log1p_vlen8_avx2":
 float: 2
 Function: "log2":
 double: 2
 float: 1
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p2_core-sse2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p2_core-sse2.S
@ -0,0 +1,20 @@
 /* SSE2 version of vectorized log1p, vector length is 2.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Rename the symbol before pulling in the generic vector-length-2 source,
    so whatever that file emits as _ZGVbN2v_log1p is emitted here as
    _ZGVbN2v_log1p_sse2 instead.
    NOTE(review): the included file is not visible here; presumably it
    defines _ZGVbN2v_log1p -- confirm.  */
 #define _ZGVbN2v_log1p _ZGVbN2v_log1p_sse2
 #include "../svml_d_log1p2_core.S"
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p2_core.c
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p2_core.c
@ -0,0 +1,27 @@
 /* Multiple versions of vectorized log1p, vector length is 2.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Public symbol this translation unit provides.  */
 #define SYMBOL_NAME _ZGVbN2v_log1p
 /* NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined in this
    file; presumably they come from the header below -- confirm.  */
 #include "ifunc-mathvec-sse4_1.h"
 /* Define SYMBOL_NAME as an indirect function whose implementation is
    picked by IFUNC_SELECTOR ().  */
 libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
 #ifdef SHARED
 /* Shared builds only: declare the hidden-visibility __GI_ alias.  */
 __hidden_ver1 (_ZGVbN2v_log1p, __GI__ZGVbN2v_log1p, __redirect__ZGVbN2v_log1p)
  __attribute__ ((visibility ("hidden")));
 #endif
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p2_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p2_core_sse4.S
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core-sse.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core-sse.S
@ -0,0 +1,20 @@
 /* SSE version of vectorized log1p, vector length is 4.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Rename the symbol before pulling in the generic vector-length-4 source,
    so whatever that file emits as _ZGVdN4v_log1p is emitted here as
    _ZGVdN4v_log1p_sse_wrapper instead.
    NOTE(review): the included file is not visible here; presumably it
    defines _ZGVdN4v_log1p -- confirm.  */
 #define _ZGVdN4v_log1p _ZGVdN4v_log1p_sse_wrapper
 #include "../svml_d_log1p4_core.S"
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core.c
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core.c
@ -0,0 +1,27 @@
 /* Multiple versions of vectorized log1p, vector length is 4.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Public symbol this translation unit provides.  */
 #define SYMBOL_NAME _ZGVdN4v_log1p
 /* NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined in this
    file; presumably they come from the header below -- confirm.  */
 #include "ifunc-mathvec-avx2.h"
 /* Define SYMBOL_NAME as an indirect function whose implementation is
    picked by IFUNC_SELECTOR ().  */
 libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
 #ifdef SHARED
 /* Shared builds only: declare the hidden-visibility __GI_ alias.  */
 __hidden_ver1 (_ZGVdN4v_log1p, __GI__ZGVdN4v_log1p, __redirect__ZGVdN4v_log1p)
  __attribute__ ((visibility ("hidden")));
 #endif
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core-avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core-avx2.S
@ -0,0 +1,20 @@
 /* AVX2 version of vectorized log1p, vector length is 8.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Rename the symbol before pulling in the generic vector-length-8 source,
    so whatever that file emits as _ZGVeN8v_log1p is emitted here as
    _ZGVeN8v_log1p_avx2_wrapper instead.
    NOTE(review): the included file is not visible here; presumably it
    defines _ZGVeN8v_log1p -- confirm.  */
 #define _ZGVeN8v_log1p _ZGVeN8v_log1p_avx2_wrapper
 #include "../svml_d_log1p8_core.S"
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core.c
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core.c
@ -0,0 +1,27 @@
 /* Multiple versions of vectorized log1p, vector length is 8.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Public symbol this translation unit provides.  */
 #define SYMBOL_NAME _ZGVeN8v_log1p
 /* NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined in this
    file; presumably they come from the header below -- confirm.  */
 #include "ifunc-mathvec-avx512-skx.h"
 /* Define SYMBOL_NAME as an indirect function whose implementation is
    picked by IFUNC_SELECTOR ().  */
 libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
 #ifdef SHARED
 /* Shared builds only: declare the hidden-visibility __GI_ alias.  */
 __hidden_ver1 (_ZGVeN8v_log1p, __GI__ZGVeN8v_log1p, __redirect__ZGVeN8v_log1p)
  __attribute__ ((visibility ("hidden")));
 #endif
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
@ -0,0 +1,317 @@
 /* Function log1p vectorized with AVX-512.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */
 /*
 * ALGORITHM DESCRIPTION:
 *
 *    1+x = 2^k*(xh + xl) is computed in high-low parts; xh in [1,2)
 *    Get short reciprocal approximation Rcp ~ 1/xh
 *    R = (Rcp*xh - 1.0) + Rcp*xl
 *    log1p(x) = k*log(2.0) - log(Rcp) + poly(R)
 *       log(Rcp) is tabulated
 *
 *
 */
 /* Byte offsets of each field inside the data table
    __svml_dlog1p_data_internal_avx512.  Log_tbl holds 16 quadwords
    (128 bytes); every field after it is a 64-byte row of 8 identical
    quadwords, so the offsets advance in steps of 64.  The kernel adds
    these offsets to the table symbol for its RIP-relative loads.  */
 #define Log_tbl                       	0
 #define One                           	128
 #define SgnMask                       	192
 #define C075                          	256
 #define poly_coeff9                   	320
 #define poly_coeff8                   	384
 #define poly_coeff7                   	448
 #define poly_coeff6                   	512
 #define poly_coeff5                   	576
 #define poly_coeff4                   	640
 #define poly_coeff3                   	704
 #define poly_coeff2                   	768
 #define L2                            	832
 #include <sysdep.h>
        .text
 	.section .text.evex512,"ax",@progbits
 /* _ZGVeN8v_log1p_skx: log1p of the eight doubles passed in %zmm0, with
    the result returned in %zmm0.  The main path evaluates all lanes with
    the table-plus-polynomial scheme described at the top of the file.
    Lanes flagged in mask %k0 are afterwards recomputed one at a time by
    the scalar log1p in L(SPECIAL_VALUES_BRANCH).  */
 ENTRY(_ZGVeN8v_log1p_skx)
 /* Frame setup: keep the incoming %rsp in %rbp, then align the stack to
    64 bytes and reserve 192 bytes of scratch space.  */
        pushq     %rbp
        cfi_def_cfa_offset(16)
        movq      %rsp, %rbp
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
        andq      $-64, %rsp
        subq      $192, %rsp
        vmovups   One+__svml_dlog1p_data_internal_avx512(%rip), %zmm7
        vmovups   SgnMask+__svml_dlog1p_data_internal_avx512(%rip), %zmm14
 /* Keep the original x in zmm9; the special-value path stores it to the
    stack for the scalar fallback.  */
        vmovaps   %zmm0, %zmm9
 /* zmm11 = 1 + x */
        vaddpd    {rn-sae}, %zmm9, %zmm7, %zmm11
 /* zmm8 = x & SgnMask, ORed into the result at the very end */
        vandpd    %zmm14, %zmm9, %zmm8
 /* compute 1+x as high, low parts: zmm10 = max(1,x), zmm12 = min(1,x) */
        vmaxpd    {sae}, %zmm9, %zmm7, %zmm10
        vminpd    {sae}, %zmm9, %zmm7, %zmm12
 /* GetMant(1+x), normalized to [1,2) for 1+x>=0, NaN for 1+x<0 */
        vgetmantpd $8, {sae}, %zmm11, %zmm6
 /* GetExp(1+x) */
        vgetexppd {sae}, %zmm11, %zmm5
 /* zmm13 = (1+x) - max(1,x) */
        vsubpd    {rn-sae}, %zmm10, %zmm11, %zmm13
 /* DblRcp ~ 1/Mantissa */
        vrcp14pd  %zmm6, %zmm15
 /* Start polynomial evaluation */
        vmovups   poly_coeff9+__svml_dlog1p_data_internal_avx512(%rip), %zmm10
        vmovups   poly_coeff7+__svml_dlog1p_data_internal_avx512(%rip), %zmm11
 /* Xl = min(1,x) - ((1+x) - max(1,x)) */
        vsubpd    {rn-sae}, %zmm13, %zmm12, %zmm2
 /* zmm3 = Expon with its sign bit flipped, used as the scale below */
        vxorpd    %zmm14, %zmm5, %zmm3
 /* round DblRcp to 4 fractional bits (RN mode, no Precision exception) */
        vrndscalepd $88, {sae}, %zmm15, %zmm4
        vmovups   poly_coeff5+__svml_dlog1p_data_internal_avx512(%rip), %zmm12
        vmovups   poly_coeff6+__svml_dlog1p_data_internal_avx512(%rip), %zmm14
        vmovups   poly_coeff3+__svml_dlog1p_data_internal_avx512(%rip), %zmm13
 /* Xl*2^(-Expon) */
        vscalefpd {rn-sae}, %zmm3, %zmm2, %zmm1
 /* Reduced argument: R = DblRcp*(Mantissa+Xl) - 1 */
        vfmsub213pd {rn-sae}, %zmm7, %zmm4, %zmm6
 /* First 64 bytes of Log_tbl; the second 64 bytes are supplied as the
    memory operand of the vpermt2pd below.  */
        vmovups   __svml_dlog1p_data_internal_avx512(%rip), %zmm3
 /*
 * Table lookup
 * Prepare exponent correction: DblRcp<0.75?
 */
        vmovups   C075+__svml_dlog1p_data_internal_avx512(%rip), %zmm2
 /* Prepare table index */
        vpsrlq    $48, %zmm4, %zmm0
 /* R += DblRcp * (Xl*2^(-Expon)) */
        vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm6
        vmovups   poly_coeff8+__svml_dlog1p_data_internal_avx512(%rip), %zmm1
 /* k1 = lanes needing the exponent correction (DblRcp vs. C075) */
        vcmppd    $17, {sae}, %zmm2, %zmm4, %k1
 /* k0 = R compared against itself with predicate 4; any lane set here
    sends the call through the scalar fallback.
    NOTE(review): presumably this flags lanes where R is NaN -- confirm
    against the VCMPPD predicate table.  */
        vcmppd    $4, {sae}, %zmm6, %zmm6, %k0
        vfmadd231pd {rn-sae}, %zmm6, %zmm10, %zmm1
        vmovups   poly_coeff4+__svml_dlog1p_data_internal_avx512(%rip), %zmm10
        vfmadd231pd {rn-sae}, %zmm6, %zmm11, %zmm14
        vmovups   L2+__svml_dlog1p_data_internal_avx512(%rip), %zmm4
 /* zmm3 = Log_tbl entry selected by the index in zmm0 */
        vpermt2pd Log_tbl+64+__svml_dlog1p_data_internal_avx512(%rip), %zmm0, %zmm3
 /* add 1 to Expon if DblRcp<0.75 */
        vaddpd    {rn-sae}, %zmm7, %zmm5, %zmm5{%k1}
 /* R^2 */
        vmulpd    {rn-sae}, %zmm6, %zmm6, %zmm0
        vfmadd231pd {rn-sae}, %zmm6, %zmm12, %zmm10
        vmovups   poly_coeff2+__svml_dlog1p_data_internal_avx512(%rip), %zmm12
 /* R^4 */
        vmulpd    {rn-sae}, %zmm0, %zmm0, %zmm15
        vfmadd231pd {rn-sae}, %zmm6, %zmm13, %zmm12
        vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm1
 /* Move the special-lane mask to a GPR for the test below */
        kmovw     %k0, %edx
        vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm10
 /* polynomial */
        vfmadd213pd {rn-sae}, %zmm10, %zmm15, %zmm1
        vfmadd213pd {rn-sae}, %zmm6, %zmm0, %zmm1
 /* table value + polynomial */
        vaddpd    {rn-sae}, %zmm1, %zmm3, %zmm6
 /* Expon*L2 + (table value + polynomial) */
        vfmadd213pd {rn-sae}, %zmm6, %zmm4, %zmm5
 /* OR in the sign bit of x saved in zmm8; zmm0 is the vector result */
        vorpd     %zmm8, %zmm5, %zmm0
        testl     %edx, %edx
 /* Go to special inputs processing branch */
        jne       L(SPECIAL_VALUES_BRANCH)
                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm9
 /* Restore registers
 * and exit the function
 */
 L(EXIT):
        movq      %rbp, %rsp
        popq      %rbp
        cfi_def_cfa(7, 8)
        cfi_restore(6)
        ret
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
 /* Branch to process
 * special inputs
 *
 * Scratch layout: 64(%rsp) holds the eight inputs, 128(%rsp) the eight
 * results, and (%rsp)..16(%rsp) the saved r14, r13, r12.
 */
 L(SPECIAL_VALUES_BRANCH):
        vmovups   %zmm9, 64(%rsp)
        vmovups   %zmm0, 128(%rsp)
                                # LOE rbx r12 r13 r14 r15 edx zmm0
        xorl      %eax, %eax
                                # LOE rbx r12 r13 r14 r15 eax edx
        vzeroupper
        movq      %r12, 16(%rsp)
        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
 /* r12d = lane index, starting at 0 */
        movl      %eax, %r12d
        movq      %r13, 8(%rsp)
        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
 /* r13d = mask of lanes that need the scalar call */
        movl      %edx, %r13d
        movq      %r14, (%rsp)
        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
                                # LOE rbx r15 r12d r13d
 /* Range mask
 * bits check
 */
 L(RANGEMASK_CHECK):
        btl       %r12d, %r13d
 /* Call scalar math function */
        jc        L(SCALAR_MATH_CALL)
                                # LOE rbx r15 r12d r13d
 /* Special inputs
 * processing loop
 */
 L(SPECIAL_VALUES_LOOP):
        incl      %r12d
        cmpl      $8, %r12d
 /* Check bits in range mask */
        jl        L(RANGEMASK_CHECK)
                                # LOE rbx r15 r12d r13d
 /* All eight lanes visited: restore the saved registers and reload the
    patched result vector.  */
        movq      16(%rsp), %r12
        cfi_restore(12)
        movq      8(%rsp), %r13
        cfi_restore(13)
        movq      (%rsp), %r14
        cfi_restore(14)
        vmovups   128(%rsp), %zmm0
 /* Go to exit */
        jmp       L(EXIT)
        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
                                # LOE rbx r12 r13 r14 r15 zmm0
 /* Scalar math function call
 * to process special input
 *
 * Loads input lane r12d from the saved inputs, calls log1p, and writes
 * the returned value over the same lane of the saved results.
 */
 L(SCALAR_MATH_CALL):
        movl      %r12d, %r14d
        movsd     64(%rsp,%r14,8), %xmm0
        call      log1p@PLT
                                # LOE rbx r14 r15 r12d r13d xmm0
        movsd     %xmm0, 128(%rsp,%r14,8)
 /* Process special inputs in loop */
        jmp       L(SPECIAL_VALUES_LOOP)
                                # LOE rbx r15 r12d r13d
 END(_ZGVeN8v_log1p_skx)
        .section .rodata, "a"
        .align 64
 /* C-style description of the table layout that follows.  It is seen by
    the preprocessor only when __svml_dlog1p_data_internal_avx512_typedef
    is defined.  Each double is written as a pair of 32-bit words, and
    the fields appear in the same order as the offset macros at the top
    of the file.  */
 #ifdef __svml_dlog1p_data_internal_avx512_typedef
 typedef unsigned int VUINT32;
 typedef struct {
        __declspec(align(64)) VUINT32 Log_tbl[16][2];
        __declspec(align(64)) VUINT32 One[8][2];
        __declspec(align(64)) VUINT32 SgnMask[8][2];
        __declspec(align(64)) VUINT32 C075[8][2];
        __declspec(align(64)) VUINT32 poly_coeff9[8][2];
        __declspec(align(64)) VUINT32 poly_coeff8[8][2];
        __declspec(align(64)) VUINT32 poly_coeff7[8][2];
        __declspec(align(64)) VUINT32 poly_coeff6[8][2];
        __declspec(align(64)) VUINT32 poly_coeff5[8][2];
        __declspec(align(64)) VUINT32 poly_coeff4[8][2];
        __declspec(align(64)) VUINT32 poly_coeff3[8][2];
        __declspec(align(64)) VUINT32 poly_coeff2[8][2];
        __declspec(align(64)) VUINT32 L2[8][2];
   } __svml_dlog1p_data_internal_avx512;
 #endif
 __svml_dlog1p_data_internal_avx512:
        /*== Log_tbl ==*/
        .quad 0x0000000000000000
        .quad 0xbfaf0a30c01162a6
        .quad 0xbfbe27076e2af2e6
        .quad 0xbfc5ff3070a793d4
        .quad 0xbfcc8ff7c79a9a22
        .quad 0xbfd1675cababa60e
        .quad 0xbfd4618bc21c5ec2
        .quad 0xbfd739d7f6bbd007
        .quad 0x3fd269621134db92
        .quad 0x3fcf991c6cb3b379
        .quad 0x3fca93ed3c8ad9e3
        .quad 0x3fc5bf406b543db2
        .quad 0x3fc1178e8227e47c
        .quad 0x3fb9335e5d594989
        .quad 0x3fb08598b59e3a07
        .quad 0x3fa0415d89e74444
        /*== One ==*/
        .align 64
        .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
        /*== SgnMask ==*/
        .align 64
        .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
        /*== C075 0.75 ==*/
        .align 64
        .quad 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000
        /*== poly_coeff9 ==*/
        .align 64
        .quad 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70
        /*== poly_coeff8 ==*/
        .align 64
        .quad 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62
        /*== poly_coeff7 ==*/
        .align 64
        .quad 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF
        /*== poly_coeff6 ==*/
        .align 64
        .quad 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06
        /*== poly_coeff5 ==*/
        .align 64
        .quad 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C
        /*== poly_coeff4 ==*/
        .align 64
        .quad 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD
        /*== poly_coeff3 ==*/
        .align 64
        .quad 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466
        /*== poly_coeff2 ==*/
        .align 64
        .quad 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6
        /*== L2 = log(2) ==*/
        .align 64
        .quad 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF
        .align 64
        .type	__svml_dlog1p_data_internal_avx512,@object
        .size	__svml_dlog1p_data_internal_avx512,.-__svml_dlog1p_data_internal_avx512
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core-avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core-avx2.S
@ -0,0 +1,20 @@
 /* AVX2 version of vectorized log1pf.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Rename the symbol before pulling in the generic vector-length-16
    source, so whatever that file emits as _ZGVeN16v_log1pf is emitted
    here as _ZGVeN16v_log1pf_avx2_wrapper instead.
    NOTE(review): the included file is not visible here; presumably it
    defines _ZGVeN16v_log1pf -- confirm.  */
 #define _ZGVeN16v_log1pf _ZGVeN16v_log1pf_avx2_wrapper
 #include "../svml_s_log1pf16_core.S"
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core.c
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core.c
@ -0,0 +1,28 @@
 /* Multiple versions of vectorized log1pf, vector length is 16.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Public symbol this translation unit provides.  */
 #define SYMBOL_NAME _ZGVeN16v_log1pf
 /* NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined in this
    file; presumably they come from the header below -- confirm.  */
 #include "ifunc-mathvec-avx512-skx.h"
 /* Define SYMBOL_NAME as an indirect function whose implementation is
    picked by IFUNC_SELECTOR ().  */
 libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
 #ifdef SHARED
 /* Shared builds only: declare the hidden-visibility __GI_ alias.  */
 __hidden_ver1 (_ZGVeN16v_log1pf, __GI__ZGVeN16v_log1pf,
 	       __redirect__ZGVeN16v_log1pf)
  __attribute__ ((visibility ("hidden")));
 #endif
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
@ -0,0 +1,271 @@
 /* Function log1pf vectorized with AVX-512.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /*
 * ALGORITHM DESCRIPTION:
 *
 *    1+x is computed in high-low parts: xh + xl
 *    xh = 2^k * m with the split taken at 2/3, so m is in [2/3, 4/3)
 *    R = (m - 1.0) + 2^(-k)*xl
 *    log1p(x) = k*log(2.0) + R + R^2*poly(R)
 *    No reciprocal or table lookup is used in this single-precision variant
 *    Lanes failing the range check on 1+x are redone with scalar log1pf
 *
 */
 /* Byte offsets of the fields of the data table __svml_slog1p_data_internal.
    Every field is one 64-byte row (16 x 32-bit values); sPoly_1 .. sPoly_8
    address the eight consecutive rows of the polynomial coefficients, which
    the table comments label P0 .. P7.
 */
 #define SgnMask                       	0
 #define sOne                          	64
 #define sPoly_1                       	128
 #define sPoly_2                       	192
 #define sPoly_3                       	256
 #define sPoly_4                       	320
 #define sPoly_5                       	384
 #define sPoly_6                       	448
 #define sPoly_7                       	512
 #define sPoly_8                       	576
 #define iHiDelta                      	640
 #define iLoRange                      	704
 #define iBrkValue                     	768
 #define iOffExpoMask                  	832
 #define sLn2                          	896
 #include <sysdep.h>
        .text
 	.section .text.evex512,"ax",@progbits
 /* Vector log1pf for 16 packed single-precision lanes.
    Input:  x in %zmm0 (a copy is kept in %zmm3 for the fallback path).
    Output: log1p(x) per lane in %zmm0.
    The fast path evaluates every lane.  Lanes that fail the integer range
    check on 1+x are afterwards recomputed one at a time with scalar log1pf.
    Stack frame (after aligning %rsp to 64 and reserving 192 bytes):
      (%rsp), 8(%rsp), 16(%rsp)   saved %r14, %r13, %r12 (fallback path only)
      64(%rsp)                    copy of the input vector
      128(%rsp)                   result vector, patched lane by lane.  */
 ENTRY(_ZGVeN16v_log1pf_skx)
        pushq     %rbp
        cfi_def_cfa_offset(16)
        movq      %rsp, %rbp
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
        andq      $-64, %rsp
        subq      $192, %rsp
        vmovups   sOne+__svml_slog1p_data_internal(%rip), %zmm2
 /* reduction: compute r,n */
        vmovups   iBrkValue+__svml_slog1p_data_internal(%rip), %zmm12
        vmovups   SgnMask+__svml_slog1p_data_internal(%rip), %zmm4
        vmovaps   %zmm0, %zmm3
 /* compute 1+x as high, low parts */
        vmaxps    {sae}, %zmm3, %zmm2, %zmm5
        vminps    {sae}, %zmm3, %zmm2, %zmm7
 /* zmm1 = x & ~SgnMask, i.e. only the sign bit of x */
        vandnps   %zmm3, %zmm4, %zmm1
 /* zmm4 = all ones: every lane starts out marked as special */
        vpternlogd $255, %zmm4, %zmm4, %zmm4
 /* zmm9 = xh = max(1,x) + min(1,x) */
        vaddps    {rn-sae}, %zmm7, %zmm5, %zmm9
 /* integer subtract of the break value from the bits of xh */
        vpsubd    %zmm12, %zmm9, %zmm10
        vsubps    {rn-sae}, %zmm9, %zmm5, %zmm6
 /* check argument value ranges */
        vpaddd    iHiDelta+__svml_slog1p_data_internal(%rip), %zmm9, %zmm8
 /* zmm13 = k (arithmetic shift keeps the sign of the exponent) */
        vpsrad    $23, %zmm10, %zmm13
        vmovups   sPoly_5+__svml_slog1p_data_internal(%rip), %zmm9
 /* predicate 5 = not-less-than: k1 marks lanes that pass the range check */
        vpcmpd    $5, iLoRange+__svml_slog1p_data_internal(%rip), %zmm8, %k1
        vpslld    $23, %zmm13, %zmm14
 /* zmm15 = xl = (max - xh) + min */
        vaddps    {rn-sae}, %zmm7, %zmm6, %zmm15
        vcvtdq2ps {rn-sae}, %zmm13, %zmm0
 /* zmm13 = bits of 1.0 minus (k << 23), i.e. 2^(-k) */
        vpsubd    %zmm14, %zmm2, %zmm13
        vmovups   sPoly_8+__svml_slog1p_data_internal(%rip), %zmm7
        vmovups   sPoly_1+__svml_slog1p_data_internal(%rip), %zmm14
 /* zmm6 = 2^(-k) * xl */
        vmulps    {rn-sae}, %zmm13, %zmm15, %zmm6
 /* zmm5 = m: significand bits of (xh - break) plus the break value */
        vpandd    iOffExpoMask+__svml_slog1p_data_internal(%rip), %zmm10, %zmm11
        vpaddd    %zmm12, %zmm11, %zmm5
        vmovups   sPoly_4+__svml_slog1p_data_internal(%rip), %zmm10
        vmovups   sPoly_3+__svml_slog1p_data_internal(%rip), %zmm11
        vmovups   sPoly_2+__svml_slog1p_data_internal(%rip), %zmm12
 /* polynomial evaluation */
 /* zmm15 = R = (m - 1.0) + 2^(-k)*xl */
        vsubps    {rn-sae}, %zmm2, %zmm5, %zmm2
        vaddps    {rn-sae}, %zmm6, %zmm2, %zmm15
        vmovups   sPoly_7+__svml_slog1p_data_internal(%rip), %zmm2
        vfmadd231ps {rn-sae}, %zmm15, %zmm7, %zmm2
 /* clear the special marker in the lanes selected by k1 */
        vpandnd   %zmm8, %zmm8, %zmm4{%k1}
        vmovups   sPoly_6+__svml_slog1p_data_internal(%rip), %zmm8
 /* combine and get argument value range mask */
        vptestmd  %zmm4, %zmm4, %k0
        vfmadd213ps {rn-sae}, %zmm8, %zmm15, %zmm2
        kmovw     %k0, %edx
        vfmadd213ps {rn-sae}, %zmm9, %zmm15, %zmm2
        vfmadd213ps {rn-sae}, %zmm10, %zmm15, %zmm2
        vfmadd213ps {rn-sae}, %zmm11, %zmm15, %zmm2
        vfmadd213ps {rn-sae}, %zmm12, %zmm15, %zmm2
        vfmadd213ps {rn-sae}, %zmm14, %zmm15, %zmm2
 /* zmm4 = R + R^2*poly(R) */
        vmulps    {rn-sae}, %zmm15, %zmm2, %zmm4
        vfmadd213ps {rn-sae}, %zmm15, %zmm15, %zmm4
 /* final reconstruction */
        vmovups   sLn2+__svml_slog1p_data_internal(%rip), %zmm15
 /* zmm0 = k*ln(2) + zmm4, then OR in the sign bit of x */
        vfmadd213ps {rn-sae}, %zmm4, %zmm15, %zmm0
        vorps     %zmm1, %zmm0, %zmm0
        testl     %edx, %edx
 /* Go to special inputs processing branch */
        jne       L(SPECIAL_VALUES_BRANCH)
                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm3
 /* Restore registers
 * and exit the function
 */
 L(EXIT):
        movq      %rbp, %rsp
        popq      %rbp
        cfi_def_cfa(7, 8)
        cfi_restore(6)
        ret
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
 /* Branch to process
 * special inputs
 */
 L(SPECIAL_VALUES_BRANCH):
 /* spill input and fast-path result so single lanes can be read and patched */
        vmovups   %zmm3, 64(%rsp)
        vmovups   %zmm0, 128(%rsp)
                                # LOE rbx r12 r13 r14 r15 edx zmm0
        xorl      %eax, %eax
                                # LOE rbx r12 r13 r14 r15 eax edx
        vzeroupper
 /* r12 = lane index, r13 = special-lane mask, r14 = scratch index */
        movq      %r12, 16(%rsp)
        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
        movl      %eax, %r12d
        movq      %r13, 8(%rsp)
        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
        movl      %edx, %r13d
        movq      %r14, (%rsp)
        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
                                # LOE rbx r15 r12d r13d
 /* Range mask
 * bits check
 */
 L(RANGEMASK_CHECK):
        btl       %r12d, %r13d
 /* Call scalar math function */
        jc        L(SCALAR_MATH_CALL)
                                # LOE rbx r15 r12d r13d
 /* Special inputs
 * processing loop
 */
 L(SPECIAL_VALUES_LOOP):
        incl      %r12d
        cmpl      $16, %r12d
 /* Check bits in range mask */
        jl        L(RANGEMASK_CHECK)
                                # LOE rbx r15 r12d r13d
        movq      16(%rsp), %r12
        cfi_restore(12)
        movq      8(%rsp), %r13
        cfi_restore(13)
        movq      (%rsp), %r14
        cfi_restore(14)
 /* reload the patched result vector */
        vmovups   128(%rsp), %zmm0
 /* Go to exit */
        jmp       L(EXIT)
        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
                                # LOE rbx r12 r13 r14 r15 zmm0
 /* Scalar math function call
 * to process special input
 */
 L(SCALAR_MATH_CALL):
        movl      %r12d, %r14d
 /* load input lane r14, call log1pf, store into result lane r14 */
        movss     64(%rsp,%r14,4), %xmm0
        call      log1pf@PLT
                                # LOE rbx r14 r15 r12d r13d xmm0
        movss     %xmm0, 128(%rsp,%r14,4)
 /* Process special inputs in loop */
        jmp       L(SPECIAL_VALUES_LOOP)
                                # LOE rbx r15 r12d r13d
 END(_ZGVeN16v_log1pf_skx)
        .section .rodata, "a"
        .align 64
 /* Constant table for the 16-lane log1pf kernel.  Every field is one
    64-byte row holding the same 32-bit value in all 16 lanes; the row
    order must match the offset macros defined earlier in this file.
    The typedef below is guarded by __svml_slog1p_data_internal_typedef,
    which is not defined anywhere in the visible source, so it appears to
    serve only as a description of the layout.  */
 #ifdef __svml_slog1p_data_internal_typedef
 typedef unsigned int VUINT32;
 typedef struct {
        __declspec(align(64)) VUINT32 SgnMask[16][1];
        __declspec(align(64)) VUINT32 sOne[16][1];
        __declspec(align(64)) VUINT32 sPoly[8][16][1];
        __declspec(align(64)) VUINT32 iHiDelta[16][1];
        __declspec(align(64)) VUINT32 iLoRange[16][1];
        __declspec(align(64)) VUINT32 iBrkValue[16][1];
        __declspec(align(64)) VUINT32 iOffExpoMask[16][1];
        __declspec(align(64)) VUINT32 sLn2[16][1];
 } __svml_slog1p_data_internal;
 #endif
 __svml_slog1p_data_internal:
        /*== SgnMask: every bit except the sign bit ==*/
        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
        /*== sOne = SP 1.0 ==*/
        .align 64
        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        /*== sPoly[] = SP polynomial; rows P0..P7 are addressed as sPoly_1..sPoly_8 ==*/
        .align 64
        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
        .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /*  3.3333265781402587890625000e-01 P1 */
        .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
        .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /*  2.0007920265197753906250000e-01 P3 */
        .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
        .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /*  1.4042308926582336425781250e-01 P5 */
        .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
        .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /*  1.3820238411426544189453125e-01 P7 */
        /*== iHiDelta = SP 80000000-7f000000 ==*/
        .align 64
        .long 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000
        /*== iLoRange = SP 00800000+iHiDelta ==*/
        .align 64
        .long 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000
        /*== iBrkValue = SP 2/3 ==*/
        .align 64
        .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
        /*== iOffExpoMask = SP significand mask ==*/
        .align 64
        .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
        /*== sLn2 = SP ln(2) ==*/
        .align 64
        .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
        .align 64
        .type	__svml_slog1p_data_internal,@object
        .size	__svml_slog1p_data_internal,.-__svml_slog1p_data_internal
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf4_core-sse2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf4_core-sse2.S
@ -0,0 +1,20 @@
 /* SSE2 version of vectorized log1pf, vector length is 4.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Assemble the included generic core under the name _ZGVbN4v_log1pf_sse2
    instead of _ZGVbN4v_log1pf.  NOTE(review): presumably this is the
    baseline candidate picked by the ifunc selector -- confirm against
    ifunc-mathvec-sse4_1.h.  */
 #define _ZGVbN4v_log1pf _ZGVbN4v_log1pf_sse2
 #include "../svml_s_log1pf4_core.S"
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf4_core.c
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf4_core.c
@ -0,0 +1,28 @@
 /* Multiple versions of vectorized log1pf, vector length is 4.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Public name of the 4-lane entry point.  NOTE(review): presumably
    consumed by the selector header included next -- confirm.  */
 #define SYMBOL_NAME _ZGVbN4v_log1pf
 #include "ifunc-mathvec-sse4_1.h"
 /* Set up SYMBOL_NAME through libc_ifunc_redirected.  REDIRECT_NAME and
    IFUNC_SELECTOR are not defined in this file, so they are expected to
    come from the headers pulled in above.  */
 libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
 #ifdef SHARED
 /* Shared builds only: declare the __GI_ version of the symbol with
    hidden visibility, tied to __redirect__ZGVbN4v_log1pf.  */
 __hidden_ver1 (_ZGVbN4v_log1pf, __GI__ZGVbN4v_log1pf,
 	       __redirect__ZGVbN4v_log1pf)
  __attribute__ ((visibility ("hidden")));
 #endif
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf4_core_sse4.S
@ -0,0 +1,252 @@
 /* Function log1pf vectorized with SSE4.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /*
 * ALGORITHM DESCRIPTION:
 *
 *    1+x is computed in high-low parts: xh + xl
 *    xh = 2^k * m with the split taken at 2/3, so m is in [2/3, 4/3)
 *    R = (m - 1.0) + 2^(-k)*xl
 *    log1p(x) = k*log(2.0) + R + R^2*poly(R)
 *    No reciprocal or table lookup is used in this single-precision variant
 *    Lanes failing the range check on 1+x are redone with scalar log1pf
 *
 */
 /* Byte offsets of the fields of the data table __svml_slog1p_data_internal.
    Every field is one 16-byte row (4 x 32-bit values); sPoly is the base of
    eight consecutive coefficient rows, addressed as sPoly+16*i.
 */
 #define SgnMask                       	0
 #define sOne                          	16
 #define sPoly                         	32
 #define iHiDelta                      	160
 #define iLoRange                      	176
 #define iBrkValue                     	192
 #define iOffExpoMask                  	208
 #define sLn2                          	224
 #include <sysdep.h>
        .text
 	.section .text.sse4,"ax",@progbits
 /* Vector log1pf for 4 packed single-precision lanes.
    Input:  x in %xmm0 (left untouched by the fast path).
    Output: log1p(x) per lane in %xmm0.
    The fast path evaluates every lane into %xmm1.  Lanes that fail the
    integer range check on 1+x are afterwards recomputed one at a time
    with scalar log1pf.
    Stack frame (72 bytes reserved):
      (%rsp), 8(%rsp), 16(%rsp)   saved %r14, %r13, %r12 (fallback path only)
      32(%rsp)                    copy of the input vector
      48(%rsp)                    result vector, patched lane by lane.  */
 ENTRY(_ZGVbN4v_log1pf_sse4)
        subq      $72, %rsp
        cfi_def_cfa_offset(80)
        movups    sOne+__svml_slog1p_data_internal(%rip), %xmm7
 /* compute 1+x as high, low parts */
        movaps    %xmm7, %xmm1
        movaps    %xmm7, %xmm5
 /* xmm1 = max(1,x), xmm5 = min(1,x) */
        maxps     %xmm0, %xmm1
        minps     %xmm0, %xmm5
        movaps    %xmm1, %xmm4
 /* check argument value ranges */
        movdqu    iHiDelta+__svml_slog1p_data_internal(%rip), %xmm2
 /* xmm4 = xh = max + min */
        addps     %xmm5, %xmm4
 /* reduction: compute r,n */
        movdqu    iBrkValue+__svml_slog1p_data_internal(%rip), %xmm3
        paddd     %xmm4, %xmm2
        movdqu    iOffExpoMask+__svml_slog1p_data_internal(%rip), %xmm8
 /* xmm5 = xl = (max - xh) + min */
        subps     %xmm4, %xmm1
 /* integer subtract of the break value from the bits of xh */
        psubd     %xmm3, %xmm4
        addps     %xmm1, %xmm5
        pand      %xmm4, %xmm8
 /* xmm4 = k (arithmetic shift), xmm10 = (float) k */
        psrad     $23, %xmm4
        cvtdq2ps  %xmm4, %xmm10
        pslld     $23, %xmm4
        movaps    %xmm7, %xmm1
 /* xmm8 = m: significand bits of (xh - break) plus the break value */
        paddd     %xmm3, %xmm8
 /* xmm1 = bits of 1.0 minus (k << 23), i.e. 2^(-k); then scaled xl */
        psubd     %xmm4, %xmm1
        mulps     %xmm5, %xmm1
 /* polynomial evaluation */
        subps     %xmm7, %xmm8
 /* final reconstruction */
        mulps     sLn2+__svml_slog1p_data_internal(%rip), %xmm10
 /* xmm1 = R = (m - 1.0) + 2^(-k)*xl */
        addps     %xmm8, %xmm1
        movups    sPoly+112+__svml_slog1p_data_internal(%rip), %xmm9
        mulps     %xmm1, %xmm9
        movdqu    iLoRange+__svml_slog1p_data_internal(%rip), %xmm6
 /* xmm6 = all ones in lanes where iLoRange > xh + iHiDelta (signed) */
        pcmpgtd   %xmm2, %xmm6
        addps     sPoly+96+__svml_slog1p_data_internal(%rip), %xmm9
 /* combine and get argument value range mask */
        movmskps  %xmm6, %edx
        movups    SgnMask+__svml_slog1p_data_internal(%rip), %xmm11
        mulps     %xmm1, %xmm9
 /* xmm11 = x & ~SgnMask, i.e. only the sign bit of x */
        andnps    %xmm0, %xmm11
        addps     sPoly+80+__svml_slog1p_data_internal(%rip), %xmm9
        mulps     %xmm1, %xmm9
        addps     sPoly+64+__svml_slog1p_data_internal(%rip), %xmm9
        mulps     %xmm1, %xmm9
        addps     sPoly+48+__svml_slog1p_data_internal(%rip), %xmm9
        mulps     %xmm1, %xmm9
        addps     sPoly+32+__svml_slog1p_data_internal(%rip), %xmm9
        mulps     %xmm1, %xmm9
        addps     sPoly+16+__svml_slog1p_data_internal(%rip), %xmm9
        mulps     %xmm1, %xmm9
        addps     sPoly+__svml_slog1p_data_internal(%rip), %xmm9
 /* xmm1 = R + R^2*poly(R) + k*ln(2), then OR in the sign bit of x */
        mulps     %xmm1, %xmm9
        mulps     %xmm1, %xmm9
        addps     %xmm9, %xmm1
        addps     %xmm10, %xmm1
        orps      %xmm11, %xmm1
        testl     %edx, %edx
 /* Go to special inputs processing branch */
        jne       L(SPECIAL_VALUES_BRANCH)
                                # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1
 /* Restore registers
 * and exit the function
 */
 L(EXIT):
        movaps    %xmm1, %xmm0
        addq      $72, %rsp
        cfi_def_cfa_offset(8)
        ret
        cfi_def_cfa_offset(80)
 /* Branch to process
 * special inputs
 */
 L(SPECIAL_VALUES_BRANCH):
 /* spill input and fast-path result so single lanes can be read and patched */
        movups    %xmm0, 32(%rsp)
        movups    %xmm1, 48(%rsp)
                                # LOE rbx rbp r12 r13 r14 r15 edx
        xorl      %eax, %eax
 /* r12 = lane index, r13 = special-lane mask, r14 = scratch index */
        movq      %r12, 16(%rsp)
        cfi_offset(12, -64)
        movl      %eax, %r12d
        movq      %r13, 8(%rsp)
        cfi_offset(13, -72)
        movl      %edx, %r13d
        movq      %r14, (%rsp)
        cfi_offset(14, -80)
                                # LOE rbx rbp r15 r12d r13d
 /* Range mask
 * bits check
 */
 L(RANGEMASK_CHECK):
        btl       %r12d, %r13d
 /* Call scalar math function */
        jc        L(SCALAR_MATH_CALL)
                                # LOE rbx rbp r15 r12d r13d
 /* Special inputs
 * processing loop
 */
 L(SPECIAL_VALUES_LOOP):
        incl      %r12d
        cmpl      $4, %r12d
 /* Check bits in range mask */
        jl        L(RANGEMASK_CHECK)
                                # LOE rbx rbp r15 r12d r13d
        movq      16(%rsp), %r12
        cfi_restore(12)
        movq      8(%rsp), %r13
        cfi_restore(13)
        movq      (%rsp), %r14
        cfi_restore(14)
 /* reload the patched result vector */
        movups    48(%rsp), %xmm1
 /* Go to exit */
        jmp       L(EXIT)
        cfi_offset(12, -64)
        cfi_offset(13, -72)
        cfi_offset(14, -80)
                                # LOE rbx rbp r12 r13 r14 r15 xmm1
 /* Scalar math function call
 * to process special input
 */
 L(SCALAR_MATH_CALL):
        movl      %r12d, %r14d
 /* load input lane r14, call log1pf, store into result lane r14 */
        movss     32(%rsp,%r14,4), %xmm0
        call      log1pf@PLT
                                # LOE rbx rbp r14 r15 r12d r13d xmm0
        movss     %xmm0, 48(%rsp,%r14,4)
 /* Process special inputs in loop */
        jmp       L(SPECIAL_VALUES_LOOP)
                                # LOE rbx rbp r15 r12d r13d
 END(_ZGVbN4v_log1pf_sse4)
        .section .rodata, "a"
        .align 16
 /* Constant table for the 4-lane log1pf kernel.  Every field is one
    16-byte row holding the same 32-bit value in all 4 lanes; the row
    order must match the offset macros defined earlier in this file.
    The typedef below is guarded by __svml_slog1p_data_internal_typedef,
    which is not defined anywhere in the visible source, so it appears to
    serve only as a description of the layout.  */
 #ifdef __svml_slog1p_data_internal_typedef
 typedef unsigned int VUINT32;
 typedef struct {
        __declspec(align(16)) VUINT32 SgnMask[4][1];
        __declspec(align(16)) VUINT32 sOne[4][1];
        __declspec(align(16)) VUINT32 sPoly[8][4][1];
        __declspec(align(16)) VUINT32 iHiDelta[4][1];
        __declspec(align(16)) VUINT32 iLoRange[4][1];
        __declspec(align(16)) VUINT32 iBrkValue[4][1];
        __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
        __declspec(align(16)) VUINT32 sLn2[4][1];
 } __svml_slog1p_data_internal;
 #endif
 __svml_slog1p_data_internal:
        /*== SgnMask: every bit except the sign bit ==*/
        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
        /*== sOne = SP 1.0 ==*/
        .align 16
        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        /*== sPoly[] = SP polynomial; row Pi is read at sPoly+16*i ==*/
        .align 16
        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
        .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /*  3.3333265781402587890625000e-01 P1 */
        .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
        .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /*  2.0007920265197753906250000e-01 P3 */
        .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
        .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /*  1.4042308926582336425781250e-01 P5 */
        .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
        .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /*  1.3820238411426544189453125e-01 P7 */
        /*== iHiDelta = SP 80000000-7f000000 ==*/
        .align 16
        .long 0x01000000, 0x01000000, 0x01000000, 0x01000000
        /*== iLoRange = SP 00800000+iHiDelta ==*/
        .align 16
        .long 0x01800000, 0x01800000, 0x01800000, 0x01800000
        /*== iBrkValue = SP 2/3 ==*/
        .align 16
        .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
        /*== iOffExpoMask = SP significand mask ==*/
        .align 16
        .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
        /*== sLn2 = SP ln(2) ==*/
        .align 16
        .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
        .align 16
        .type	__svml_slog1p_data_internal,@object
        .size	__svml_slog1p_data_internal,.-__svml_slog1p_data_internal
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core-sse.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core-sse.S
@ -0,0 +1,20 @@
 /* SSE version of vectorized log1pf, vector length is 8.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Assemble the included generic core under the name
    _ZGVdN8v_log1pf_sse_wrapper instead of _ZGVdN8v_log1pf.
    NOTE(review): presumably this is the non-AVX2 candidate picked by the
    ifunc selector -- confirm against ifunc-mathvec-avx2.h.  */
 #define _ZGVdN8v_log1pf _ZGVdN8v_log1pf_sse_wrapper
 #include "../svml_s_log1pf8_core.S"
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core.c
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core.c
@ -0,0 +1,28 @@
 /* Multiple versions of vectorized log1pf, vector length is 8.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Public name of the 8-lane entry point.  NOTE(review): presumably
    consumed by the selector header included next -- confirm.  */
 #define SYMBOL_NAME _ZGVdN8v_log1pf
 #include "ifunc-mathvec-avx2.h"
 /* Set up SYMBOL_NAME through libc_ifunc_redirected.  REDIRECT_NAME and
    IFUNC_SELECTOR are not defined in this file, so they are expected to
    come from the headers pulled in above.  */
 libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
 #ifdef SHARED
 /* Shared builds only: declare the __GI_ version of the symbol with
    hidden visibility, tied to __redirect__ZGVdN8v_log1pf.  */
 __hidden_ver1 (_ZGVdN8v_log1pf, __GI__ZGVdN8v_log1pf,
 	       __redirect__ZGVdN8v_log1pf)
  __attribute__ ((visibility ("hidden")));
 #endif
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
@ -0,0 +1,254 @@
 /* Function log1pf vectorized with AVX2.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /*
 * ALGORITHM DESCRIPTION:
 *
 *    1+x is computed in high-low parts: xh + xl
 *    xh = 2^k * m with the split taken at 2/3, so m is in [2/3, 4/3)
 *    R = (m - 1.0) + 2^(-k)*xl
 *    log1p(x) = k*log(2.0) + R + R^2*poly(R)
 *    No reciprocal or table lookup is used in this single-precision variant
 *    Lanes failing the range check on 1+x are redone with scalar log1pf
 *
 */
 /* Byte offsets of the fields of the data table __svml_slog1p_data_internal.
    The offsets advance in 32-byte steps; sPoly spans 64..319, which the
    code addresses as eight rows at sPoly+32*i.
    NOTE(review): the table itself must use the same row order -- confirm.
 */
 #define SgnMask                       	0
 #define sOne                          	32
 #define sPoly                         	64
 #define iHiDelta                      	320
 #define iLoRange                      	352
 #define iBrkValue                     	384
 #define iOffExpoMask                  	416
 #define sLn2                          	448
 #include <sysdep.h>
        .text
 	.section .text.avx2,"ax",@progbits
 ENTRY(_ZGVdN8v_log1pf_avx2)
        pushq     %rbp
        cfi_def_cfa_offset(16)
        movq      %rsp, %rbp
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
        andq      $-32, %rsp
        subq      $96, %rsp
        vmovups   sOne+__svml_slog1p_data_internal(%rip), %ymm2
 /* reduction: compute r,n */
        vmovups   iBrkValue+__svml_slog1p_data_internal(%rip), %ymm13
        vmovups   SgnMask+__svml_slog1p_data_internal(%rip), %ymm4
        vmovups   iLoRange+__svml_slog1p_data_internal(%rip), %ymm8
        vmovaps   %ymm0, %ymm3
 /* compute 1+x as high, low parts */
        vmaxps    %ymm3, %ymm2, %ymm5
        vminps    %ymm3, %ymm2, %ymm6
        vaddps    %ymm6, %ymm5, %ymm10
        vpsubd    %ymm13, %ymm10, %ymm11
 /* check argument value ranges */
        vpaddd    iHiDelta+__svml_slog1p_data_internal(%rip), %ymm10, %ymm9
        vsubps    %ymm10, %ymm5, %ymm7
        vpsrad    $23, %ymm11, %ymm14
        vpand     iOffExpoMask+__svml_slog1p_data_internal(%rip), %ymm11, %ymm12
        vpslld    $23, %ymm14, %ymm15
        vcvtdq2ps %ymm14, %ymm0
        vpsubd    %ymm15, %ymm2, %ymm14
        vandnps   %ymm3, %ymm4, %ymm1
        vaddps    %ymm7, %ymm6, %ymm4
        vpaddd    %ymm13, %ymm12, %ymm6
        vmulps    %ymm4, %ymm14, %ymm7
 /* polynomial evaluation */
        vsubps    %ymm2, %ymm6, %ymm2
        vpcmpgtd  %ymm9, %ymm8, %ymm5
        vmovups   sPoly+224+__svml_slog1p_data_internal(%rip), %ymm8
        vaddps    %ymm2, %ymm7, %ymm9
        vfmadd213ps sPoly+192+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8
        vfmadd213ps sPoly+160+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8
        vfmadd213ps sPoly+128+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8
        vfmadd213ps sPoly+96+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8
        vfmadd213ps sPoly+64+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8
        vfmadd213ps sPoly+32+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8
        vfmadd213ps sPoly+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8
        vmulps    %ymm8, %ymm9, %ymm10
        vfmadd213ps %ymm9, %ymm9, %ymm10
 /* final reconstruction */
        vfmadd132ps sLn2+__svml_slog1p_data_internal(%rip), %ymm10, %ymm0
 /* combine and get argument value range mask */
        vmovmskps %ymm5, %edx
        vorps     %ymm1, %ymm0, %ymm0
        testl     %edx, %edx
 /* Go to special inputs processing branch */
        jne       L(SPECIAL_VALUES_BRANCH)
                                # LOE rbx r12 r13 r14 r15 edx ymm0 ymm3
 /* Restore registers
 * and exit the function
 */
 L(EXIT):
        movq      %rbp, %rsp
        popq      %rbp
        cfi_def_cfa(7, 8)
        cfi_restore(6)
        ret
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
 /* Branch to process
 * special inputs
 */
 L(SPECIAL_VALUES_BRANCH):
        vmovups   %ymm3, 32(%rsp)
        vmovups   %ymm0, 64(%rsp)
                                # LOE rbx r12 r13 r14 r15 edx ymm0
        xorl      %eax, %eax
                                # LOE rbx r12 r13 r14 r15 eax edx
        vzeroupper
        movq      %r12, 16(%rsp)
        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
        movl      %eax, %r12d
        movq      %r13, 8(%rsp)
        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
        movl      %edx, %r13d
        movq      %r14, (%rsp)
        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
                                # LOE rbx r15 r12d r13d
 /* Range mask
 * bits check
 */
 L(RANGEMASK_CHECK):
        btl       %r12d, %r13d
 /* Call scalar math function */
        jc        L(SCALAR_MATH_CALL)
                                # LOE rbx r15 r12d r13d
 /* Special inputs
 * processing loop
 */
 L(SPECIAL_VALUES_LOOP):
        incl      %r12d
        cmpl      $8, %r12d
 /* Check bits in range mask */
        jl        L(RANGEMASK_CHECK)
                                # LOE rbx r15 r12d r13d
        movq      16(%rsp), %r12
        cfi_restore(12)
        movq      8(%rsp), %r13
        cfi_restore(13)
        movq      (%rsp), %r14
        cfi_restore(14)
        vmovups   64(%rsp), %ymm0
 /* Go to exit */
        jmp       L(EXIT)
        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
                                # LOE rbx r12 r13 r14 r15 ymm0
 /* Scalar math function call
 * to process special input
 */
 L(SCALAR_MATH_CALL):
        movl      %r12d, %r14d
        movss     32(%rsp,%r14,4), %xmm0
        call      log1pf@PLT
                                # LOE rbx r14 r15 r12d r13d xmm0
        movss     %xmm0, 64(%rsp,%r14,4)
 /* Process special inputs in loop */
        jmp       L(SPECIAL_VALUES_LOOP)
                                # LOE rbx r15 r12d r13d
 END(_ZGVdN8v_log1pf_avx2)
        .section .rodata, "a"
        .align 32
 /* Constant table for the single-precision log1p kernel above.
    Every field is stored as 32-byte rows: one 32-bit value replicated
    eight times, so a row can be used directly as a 256-bit memory
    operand (as the vfmadd instructions in the kernel do).
    As laid out below, the rows start at these byte offsets from the label:
      SgnMask 0, sOne 32, sPoly 64 (eight rows, 256 bytes), iHiDelta 320,
      iLoRange 352, iBrkValue 384, iOffExpoMask 416, sLn2 448.
    NOTE(review): the kernel addresses rows through offset symbols such as
    sPoly and sLn2 that are defined outside this view; confirm they match
    this layout before reordering or resizing anything here.  */
 #ifdef __svml_slog1p_data_internal_typedef
 /* C-style description of the same layout.  It is only kept by the
    preprocessor when __svml_slog1p_data_internal_typedef is defined;
    nothing in this view defines it -- presumably reference only,
    TODO confirm.  */
 typedef unsigned int VUINT32;
 typedef struct {
        __declspec(align(32)) VUINT32 SgnMask[8][1];
        __declspec(align(32)) VUINT32 sOne[8][1];
        __declspec(align(32)) VUINT32 sPoly[8][8][1];
        __declspec(align(32)) VUINT32 iHiDelta[8][1];
        __declspec(align(32)) VUINT32 iLoRange[8][1];
        __declspec(align(32)) VUINT32 iBrkValue[8][1];
        __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
        __declspec(align(32)) VUINT32 sLn2[8][1];
 } __svml_slog1p_data_internal;
 #endif
 __svml_slog1p_data_internal:
        /*== SgnMask: 0x7fffffff, i.e. every bit set except bit 31.
             NOTE(review): despite the name, ANDing with this value would
             clear the sign bit rather than isolate it; its use is not
             visible in this chunk -- confirm against the kernel.  ==*/
        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
        /*== sOne = SP 1.0 ==*/
        .align 32
        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        /*== sPoly[] = SP polynomial coefficients, one row per coefficient,
             P0 at sPoly+0 up to P7 at sPoly+224.  The visible tail of the
             kernel applies rows P4 down to P0 with successive vfmadd213ps
             (Horner form, highest offset first) and then combines the
             polynomial value p with ymm9 as ymm9 + ymm9*ymm9*p.  ==*/
        .align 32
        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
        .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /*  3.3333265781402587890625000e-01 P1 */
        .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
        .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /*  2.0007920265197753906250000e-01 P3 */
        .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
        .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /*  1.4042308926582336425781250e-01 P5 */
        .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
        .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /*  1.3820238411426544189453125e-01 P7 */
        /*== iHiDelta = SP 0x80000000 - 0x7f000000 = 0x01000000 ==*/
        .align 32
        .long 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000
        /*== iLoRange = SP 0x00800000 + iHiDelta = 0x01800000 ==*/
        .align 32
        .long 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000
        /*== iBrkValue = SP 2/3 ==*/
        .align 32
        .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
        /*== iOffExpoMask = SP significand mask (low 23 bits set) ==*/
        .align 32
        .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
        /*== sLn2 = SP ln(2).  The kernel's final reconstruction step
             multiplies ymm0 by this row and adds the polynomial result
             (vfmadd132ps).  ==*/
        .align 32
        .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
        /* Pad to a 32-byte boundary, then record the label as a data
           object whose size runs from the label to this point.  */
        .align 32
        .type	__svml_slog1p_data_internal,@object
        .size	__svml_slog1p_data_internal,.-__svml_slog1p_data_internal
--- a/sysdeps/x86_64/fpu/svml_d_log1p2_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_log1p2_core.S
@ -0,0 +1,29 @@
 /* Function log1p vectorized with SSE2, wrapper version.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 #include "svml_d_wrapper_impl.h"
 	.text
 /* _ZGVbN2v_log1p: the SSE2 vector variant of double-precision log1p.
    The entire body is produced by WRAPPER_IMPL_SSE2, which is handed the
    name of the scalar routine log1p.
    NOTE(review): the macro comes from svml_d_wrapper_impl.h, not shown
    here; presumably it calls log1p once per vector lane -- confirm.  */
 ENTRY (_ZGVbN2v_log1p)
 WRAPPER_IMPL_SSE2 log1p
 END (_ZGVbN2v_log1p)
 /* Only when USE_MULTIARCH is not defined, libmvec_hidden_def is applied to
    the symbol (macro defined outside this view).  */
 #ifndef USE_MULTIARCH
 libmvec_hidden_def (_ZGVbN2v_log1p)
 #endif
--- a/sysdeps/x86_64/fpu/svml_d_log1p4_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_log1p4_core.S
@ -0,0 +1,29 @@
 /* Function log1p vectorized with AVX2, wrapper version.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 #include "svml_d_wrapper_impl.h"
 	.text
 /* _ZGVdN4v_log1p: the AVX2 vector variant of double-precision log1p,
    wrapper version.  The entire body is produced by WRAPPER_IMPL_AVX,
    which is handed the name of the narrower variant _ZGVbN2v_log1p.
    NOTE(review): the macro comes from svml_d_wrapper_impl.h, not shown
    here; presumably it runs the narrower variant on each half of the
    input vector -- confirm.  */
 ENTRY (_ZGVdN4v_log1p)
 WRAPPER_IMPL_AVX _ZGVbN2v_log1p
 END (_ZGVdN4v_log1p)
 /* Only when USE_MULTIARCH is not defined, libmvec_hidden_def is applied to
    the symbol (macro defined outside this view).  */
 #ifndef USE_MULTIARCH
 libmvec_hidden_def (_ZGVdN4v_log1p)
 #endif
--- a/sysdeps/x86_64/fpu/svml_d_log1p4_core_avx.S
+++ b/sysdeps/x86_64/fpu/svml_d_log1p4_core_avx.S
@ -0,0 +1,25 @@
 /* Function log1p vectorized in AVX ISA as wrapper to SSE4 ISA version.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 #include "svml_d_wrapper_impl.h"
 	.text
 /* _ZGVcN4v_log1p: the AVX vector variant of double-precision log1p.
    The entire body is produced by WRAPPER_IMPL_AVX, which is handed the
    name of the narrower variant _ZGVbN2v_log1p.
    NOTE(review): the macro comes from svml_d_wrapper_impl.h, not shown
    here; presumably it runs the narrower variant on each half of the
    input vector -- confirm.  */
 ENTRY (_ZGVcN4v_log1p)
 WRAPPER_IMPL_AVX _ZGVbN2v_log1p
 END (_ZGVcN4v_log1p)
--- a/sysdeps/x86_64/fpu/svml_d_log1p8_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_log1p8_core.S
@ -0,0 +1,25 @@
 /* Function log1p vectorized with AVX-512, wrapper to AVX2.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 #include "svml_d_wrapper_impl.h"
 	.text
 /* _ZGVeN8v_log1p: the AVX-512 vector variant of double-precision log1p.
    The entire body is produced by WRAPPER_IMPL_AVX512, which is handed
    the name of the AVX2 variant _ZGVdN4v_log1p.
    NOTE(review): the macro comes from svml_d_wrapper_impl.h, not shown
    here; presumably it runs the AVX2 variant on each half of the input
    vector -- confirm.  */
 ENTRY (_ZGVeN8v_log1p)
 WRAPPER_IMPL_AVX512 _ZGVdN4v_log1p
 END (_ZGVeN8v_log1p)
--- a/sysdeps/x86_64/fpu/svml_s_log1pf16_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_log1pf16_core.S
@ -0,0 +1,25 @@
 /* Function log1pf vectorized with AVX-512. Wrapper to AVX2 version.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 #include "svml_s_wrapper_impl.h"
 	.text
 /* _ZGVeN16v_log1pf: the AVX-512 vector variant of single-precision
    log1pf.  The entire body is produced by WRAPPER_IMPL_AVX512, which is
    handed the name of the AVX2 variant _ZGVdN8v_log1pf.
    NOTE(review): the macro comes from svml_s_wrapper_impl.h, not shown
    here; presumably it runs the AVX2 variant on each half of the input
    vector -- confirm.  */
 ENTRY (_ZGVeN16v_log1pf)
 WRAPPER_IMPL_AVX512 _ZGVdN8v_log1pf
 END (_ZGVeN16v_log1pf)
--- a/sysdeps/x86_64/fpu/svml_s_log1pf4_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_log1pf4_core.S
@ -0,0 +1,29 @@
 /* Function log1pf vectorized with SSE2, wrapper version.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 #include "svml_s_wrapper_impl.h"
 	.text
 /* _ZGVbN4v_log1pf: the SSE2 vector variant of single-precision log1pf,
    wrapper version.  The entire body is produced by WRAPPER_IMPL_SSE2,
    which is handed the name of the scalar routine log1pf.
    NOTE(review): the macro comes from svml_s_wrapper_impl.h, not shown
    here; presumably it calls log1pf once per vector lane -- confirm.  */
 ENTRY (_ZGVbN4v_log1pf)
 WRAPPER_IMPL_SSE2 log1pf
 END (_ZGVbN4v_log1pf)
 /* Only when USE_MULTIARCH is not defined, libmvec_hidden_def is applied to
    the symbol (macro defined outside this view).  */
 #ifndef USE_MULTIARCH
 libmvec_hidden_def (_ZGVbN4v_log1pf)
 #endif
--- a/sysdeps/x86_64/fpu/svml_s_log1pf8_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_log1pf8_core.S
@ -0,0 +1,29 @@
 /* Function log1pf vectorized with AVX2, wrapper version.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 #include "svml_s_wrapper_impl.h"
 	.text
 /* _ZGVdN8v_log1pf: the AVX2 vector variant of single-precision log1pf,
    wrapper version.  The entire body is produced by WRAPPER_IMPL_AVX,
    which is handed the name of the narrower variant _ZGVbN4v_log1pf.
    NOTE(review): the macro comes from svml_s_wrapper_impl.h, not shown
    here; presumably it runs the narrower variant on each half of the
    input vector -- confirm.  */
 ENTRY (_ZGVdN8v_log1pf)
 WRAPPER_IMPL_AVX _ZGVbN4v_log1pf
 END (_ZGVdN8v_log1pf)
 /* Only when USE_MULTIARCH is not defined, libmvec_hidden_def is applied to
    the symbol (macro defined outside this view).  */
 #ifndef USE_MULTIARCH
 libmvec_hidden_def (_ZGVdN8v_log1pf)
 #endif
--- a/sysdeps/x86_64/fpu/svml_s_log1pf8_core_avx.S
+++ b/sysdeps/x86_64/fpu/svml_s_log1pf8_core_avx.S
@ -0,0 +1,25 @@
 /* Function log1pf vectorized in AVX ISA as wrapper to SSE4 ISA version.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 #include "svml_s_wrapper_impl.h"
        .text
 /* _ZGVcN8v_log1pf: the AVX vector variant of single-precision log1pf.
    The entire body is produced by WRAPPER_IMPL_AVX, which is handed the
    name of the narrower variant _ZGVbN4v_log1pf.
    NOTE(review): the macro comes from svml_s_wrapper_impl.h, not shown
    here; presumably it runs the narrower variant on each half of the
    input vector -- confirm.  */
 ENTRY (_ZGVcN8v_log1pf)
 WRAPPER_IMPL_AVX _ZGVbN4v_log1pf
 END (_ZGVcN8v_log1pf)
--- a/sysdeps/x86_64/fpu/test-double-libmvec-log1p-avx.c
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-log1p-avx.c
@ -0,0 +1 @@
 #include "test-double-libmvec-log1p.c"
--- a/sysdeps/x86_64/fpu/test-double-libmvec-log1p-avx2.c
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-log1p-avx2.c
@ -0,0 +1 @@
 #include "test-double-libmvec-log1p.c"
--- a/sysdeps/x86_64/fpu/test-double-libmvec-log1p-avx512f.c
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-log1p-avx512f.c
@ -0,0 +1 @@
 #include "test-double-libmvec-log1p.c"
--- a/sysdeps/x86_64/fpu/test-double-libmvec-log1p.c
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-log1p.c
@ -0,0 +1,3 @@
 /* Test configuration for the vector variants of double-precision log1p:
    choose the element type and the function name, then include the shared
    test body.
    NOTE(review): test-vector-abi-arg1.h is not shown here; presumably it
    is the ABI test template for one-argument vector functions and reads
    the two macros below -- confirm.  */
 #define LIBMVEC_TYPE double
 #define LIBMVEC_FUNC log1p
 #include "test-vector-abi-arg1.h"
--- a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
@ -40,6 +40,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cbrt), _ZGVbN2v_cbrt)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2), _ZGVbN2vv_atan2)
 VECTOR_WRAPPER (WRAPPER_NAME (log10), _ZGVbN2v_log10)
 VECTOR_WRAPPER (WRAPPER_NAME (log2), _ZGVbN2v_log2)
 VECTOR_WRAPPER (WRAPPER_NAME (log1p), _ZGVbN2v_log1p)
 #define VEC_INT_TYPE __m128i
--- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
@ -43,6 +43,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cbrt), _ZGVdN4v_cbrt)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2), _ZGVdN4vv_atan2)
 VECTOR_WRAPPER (WRAPPER_NAME (log10), _ZGVdN4v_log10)
 VECTOR_WRAPPER (WRAPPER_NAME (log2), _ZGVdN4v_log2)
 VECTOR_WRAPPER (WRAPPER_NAME (log1p), _ZGVdN4v_log1p)
 #ifndef __ILP32__
 # define VEC_INT_TYPE __m256i
--- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
@ -40,6 +40,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cbrt), _ZGVcN4v_cbrt)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2), _ZGVcN4vv_atan2)
 VECTOR_WRAPPER (WRAPPER_NAME (log10), _ZGVcN4v_log10)
 VECTOR_WRAPPER (WRAPPER_NAME (log2), _ZGVcN4v_log2)
 VECTOR_WRAPPER (WRAPPER_NAME (log1p), _ZGVcN4v_log1p)
 #define VEC_INT_TYPE __m128i
--- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
@ -40,6 +40,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cbrt), _ZGVeN8v_cbrt)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2), _ZGVeN8vv_atan2)
 VECTOR_WRAPPER (WRAPPER_NAME (log10), _ZGVeN8v_log10)
 VECTOR_WRAPPER (WRAPPER_NAME (log2), _ZGVeN8v_log2)
 VECTOR_WRAPPER (WRAPPER_NAME (log1p), _ZGVeN8v_log1p)
 #ifndef __ILP32__
 # define VEC_INT_TYPE __m512i
--- a/sysdeps/x86_64/fpu/test-float-libmvec-log1pf-avx.c
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-log1pf-avx.c
@ -0,0 +1 @@
 #include "test-float-libmvec-log1pf.c"
--- a/sysdeps/x86_64/fpu/test-float-libmvec-log1pf-avx2.c
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-log1pf-avx2.c
@ -0,0 +1 @@
 #include "test-float-libmvec-log1pf.c"
--- a/sysdeps/x86_64/fpu/test-float-libmvec-log1pf-avx512f.c
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-log1pf-avx512f.c
@ -0,0 +1 @@
 #include "test-float-libmvec-log1pf.c"
--- a/sysdeps/x86_64/fpu/test-float-libmvec-log1pf.c
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-log1pf.c
@ -0,0 +1,3 @@
 /* Test configuration for the vector variants of single-precision log1pf:
    choose the element type and the function name, then include the shared
    test body.
    NOTE(review): test-vector-abi-arg1.h is not shown here; presumably it
    is the ABI test template for one-argument vector functions and reads
    the two macros below -- confirm.  */
 #define LIBMVEC_TYPE float
 #define LIBMVEC_FUNC log1pf
 #include "test-vector-abi-arg1.h"
--- a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
@ -40,6 +40,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cbrtf), _ZGVeN16v_cbrtf)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2f), _ZGVeN16vv_atan2f)
 VECTOR_WRAPPER (WRAPPER_NAME (log10f), _ZGVeN16v_log10f)
 VECTOR_WRAPPER (WRAPPER_NAME (log2f), _ZGVeN16v_log2f)
 VECTOR_WRAPPER (WRAPPER_NAME (log1pf), _ZGVeN16v_log1pf)
 #define VEC_INT_TYPE __m512i
--- a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
@ -40,6 +40,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cbrtf), _ZGVbN4v_cbrtf)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2f), _ZGVbN4vv_atan2f)
 VECTOR_WRAPPER (WRAPPER_NAME (log10f), _ZGVbN4v_log10f)
 VECTOR_WRAPPER (WRAPPER_NAME (log2f), _ZGVbN4v_log2f)
 VECTOR_WRAPPER (WRAPPER_NAME (log1pf), _ZGVbN4v_log1pf)
 #define VEC_INT_TYPE __m128i
--- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
@ -43,6 +43,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cbrtf), _ZGVdN8v_cbrtf)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2f), _ZGVdN8vv_atan2f)
 VECTOR_WRAPPER (WRAPPER_NAME (log10f), _ZGVdN8v_log10f)
 VECTOR_WRAPPER (WRAPPER_NAME (log2f), _ZGVdN8v_log2f)
 VECTOR_WRAPPER (WRAPPER_NAME (log1pf), _ZGVdN8v_log1pf)
 /* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf.  */
 #undef VECTOR_WRAPPER_fFF
--- a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
@ -40,6 +40,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cbrtf), _ZGVcN8v_cbrtf)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2f), _ZGVcN8vv_atan2f)
 VECTOR_WRAPPER (WRAPPER_NAME (log10f), _ZGVcN8v_log10f)
 VECTOR_WRAPPER (WRAPPER_NAME (log2f), _ZGVcN8v_log2f)
 VECTOR_WRAPPER (WRAPPER_NAME (log1pf), _ZGVcN8v_log1pf)
 #define VEC_INT_TYPE __m128i