x86-64: Add vector asinh/asinhf implementation to libmvec

Implement vectorized asinh/asinhf containing SSE, AVX, AVX2 and AVX512 versions for libmvec as per vector ABI. It also contains accuracy and ABI tests for vector asinh/asinhf with regenerated ulps. Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2024-11-12 16:20:06 +00:00 · 2021-12-29 10:07:02 -08:00 · 2021-12-29 10:07:02 -08:00 · e682d01578
commit e682d01578
parent c0f36fc303
50 changed files with 5784 additions and 1 deletions
--- a/bits/libm-simd-decl-stubs.h
+++ b/bits/libm-simd-decl-stubs.h
@ -296,4 +296,15 @@
 #define __DECL_SIMD_tanhf32x
 #define __DECL_SIMD_tanhf64x
 #define __DECL_SIMD_tanhf128x
 #define __DECL_SIMD_asinh
 #define __DECL_SIMD_asinhf
 #define __DECL_SIMD_asinhl
 #define __DECL_SIMD_asinhf16
 #define __DECL_SIMD_asinhf32
 #define __DECL_SIMD_asinhf64
 #define __DECL_SIMD_asinhf128
 #define __DECL_SIMD_asinhf32x
 #define __DECL_SIMD_asinhf64x
 #define __DECL_SIMD_asinhf128x
 #endif
--- a/math/bits/mathcalls.h
+++ b/math/bits/mathcalls.h
@ -84,7 +84,7 @@ __MATHDECL_VEC (void,sincos,,
 /* Hyperbolic arc cosine of X.  */
 __MATHCALL_VEC (acosh,, (_Mdouble_ __x));
 /* Hyperbolic arc sine of X.  */
-__MATHCALL (asinh,, (_Mdouble_ __x));
+__MATHCALL_VEC (asinh,, (_Mdouble_ __x));
 /* Hyperbolic arc tangent of X.  */
 __MATHCALL_VEC (atanh,, (_Mdouble_ __x));
 #endif
--- a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
@ -49,6 +49,7 @@ GLIBC_2.22 _ZGVeN8vvv_sincos F
 GLIBC_2.35 _ZGVbN2v_acos F
 GLIBC_2.35 _ZGVbN2v_acosh F
 GLIBC_2.35 _ZGVbN2v_asin F
 GLIBC_2.35 _ZGVbN2v_asinh F
 GLIBC_2.35 _ZGVbN2v_atan F
 GLIBC_2.35 _ZGVbN2v_atanh F
 GLIBC_2.35 _ZGVbN2v_cbrt F
@ -67,6 +68,7 @@ GLIBC_2.35 _ZGVbN2vv_hypot F
 GLIBC_2.35 _ZGVbN4v_acosf F
 GLIBC_2.35 _ZGVbN4v_acoshf F
 GLIBC_2.35 _ZGVbN4v_asinf F
 GLIBC_2.35 _ZGVbN4v_asinhf F
 GLIBC_2.35 _ZGVbN4v_atanf F
 GLIBC_2.35 _ZGVbN4v_atanhf F
 GLIBC_2.35 _ZGVbN4v_cbrtf F
@ -85,6 +87,7 @@ GLIBC_2.35 _ZGVbN4vv_hypotf F
 GLIBC_2.35 _ZGVcN4v_acos F
 GLIBC_2.35 _ZGVcN4v_acosh F
 GLIBC_2.35 _ZGVcN4v_asin F
 GLIBC_2.35 _ZGVcN4v_asinh F
 GLIBC_2.35 _ZGVcN4v_atan F
 GLIBC_2.35 _ZGVcN4v_atanh F
 GLIBC_2.35 _ZGVcN4v_cbrt F
@ -103,6 +106,7 @@ GLIBC_2.35 _ZGVcN4vv_hypot F
 GLIBC_2.35 _ZGVcN8v_acosf F
 GLIBC_2.35 _ZGVcN8v_acoshf F
 GLIBC_2.35 _ZGVcN8v_asinf F
 GLIBC_2.35 _ZGVcN8v_asinhf F
 GLIBC_2.35 _ZGVcN8v_atanf F
 GLIBC_2.35 _ZGVcN8v_atanhf F
 GLIBC_2.35 _ZGVcN8v_cbrtf F
@ -121,6 +125,7 @@ GLIBC_2.35 _ZGVcN8vv_hypotf F
 GLIBC_2.35 _ZGVdN4v_acos F
 GLIBC_2.35 _ZGVdN4v_acosh F
 GLIBC_2.35 _ZGVdN4v_asin F
 GLIBC_2.35 _ZGVdN4v_asinh F
 GLIBC_2.35 _ZGVdN4v_atan F
 GLIBC_2.35 _ZGVdN4v_atanh F
 GLIBC_2.35 _ZGVdN4v_cbrt F
@ -139,6 +144,7 @@ GLIBC_2.35 _ZGVdN4vv_hypot F
 GLIBC_2.35 _ZGVdN8v_acosf F
 GLIBC_2.35 _ZGVdN8v_acoshf F
 GLIBC_2.35 _ZGVdN8v_asinf F
 GLIBC_2.35 _ZGVdN8v_asinhf F
 GLIBC_2.35 _ZGVdN8v_atanf F
 GLIBC_2.35 _ZGVdN8v_atanhf F
 GLIBC_2.35 _ZGVdN8v_cbrtf F
@ -157,6 +163,7 @@ GLIBC_2.35 _ZGVdN8vv_hypotf F
 GLIBC_2.35 _ZGVeN16v_acosf F
 GLIBC_2.35 _ZGVeN16v_acoshf F
 GLIBC_2.35 _ZGVeN16v_asinf F
 GLIBC_2.35 _ZGVeN16v_asinhf F
 GLIBC_2.35 _ZGVeN16v_atanf F
 GLIBC_2.35 _ZGVeN16v_atanhf F
 GLIBC_2.35 _ZGVeN16v_cbrtf F
@ -175,6 +182,7 @@ GLIBC_2.35 _ZGVeN16vv_hypotf F
 GLIBC_2.35 _ZGVeN8v_acos F
 GLIBC_2.35 _ZGVeN8v_acosh F
 GLIBC_2.35 _ZGVeN8v_asin F
 GLIBC_2.35 _ZGVeN8v_asinh F
 GLIBC_2.35 _ZGVeN8v_atan F
 GLIBC_2.35 _ZGVeN8v_atanh F
 GLIBC_2.35 _ZGVeN8v_cbrt F
--- a/sysdeps/x86/fpu/bits/math-vector.h
+++ b/sysdeps/x86/fpu/bits/math-vector.h
@ -130,6 +130,10 @@
 #  define __DECL_SIMD_tanh __DECL_SIMD_x86_64
 #  undef __DECL_SIMD_tanhf
 #  define __DECL_SIMD_tanhf __DECL_SIMD_x86_64
 #  undef __DECL_SIMD_asinh
 #  define __DECL_SIMD_asinh __DECL_SIMD_x86_64
 #  undef __DECL_SIMD_asinhf
 #  define __DECL_SIMD_asinhf __DECL_SIMD_x86_64
 # endif
 #endif
--- a/sysdeps/x86/fpu/finclude/math-vector-fortran.h
+++ b/sysdeps/x86/fpu/finclude/math-vector-fortran.h
@ -64,6 +64,8 @@
 !GCC$ builtin (erff) attributes simd (notinbranch) if('x86_64')
 !GCC$ builtin (tanh) attributes simd (notinbranch) if('x86_64')
 !GCC$ builtin (tanhf) attributes simd (notinbranch) if('x86_64')
 !GCC$ builtin (asinh) attributes simd (notinbranch) if('x86_64')
 !GCC$ builtin (asinhf) attributes simd (notinbranch) if('x86_64')
 !GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
 !GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
@ -113,3 +115,5 @@
 !GCC$ builtin (erff) attributes simd (notinbranch) if('x32')
 !GCC$ builtin (tanh) attributes simd (notinbranch) if('x32')
 !GCC$ builtin (tanhf) attributes simd (notinbranch) if('x32')
 !GCC$ builtin (asinh) attributes simd (notinbranch) if('x32')
 !GCC$ builtin (asinhf) attributes simd (notinbranch) if('x32')
--- a/sysdeps/x86_64/fpu/Makeconfig
+++ b/sysdeps/x86_64/fpu/Makeconfig
@ -25,6 +25,7 @@ libmvec-funcs = \
  acos \
  acosh \
  asin \
  asinh \
  atan \
  atan2 \
  atanh \
--- a/sysdeps/x86_64/fpu/Versions
+++ b/sysdeps/x86_64/fpu/Versions
@ -17,6 +17,7 @@ libmvec {
    _ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
    _ZGVbN2v_acosh; _ZGVcN4v_acosh; _ZGVdN4v_acosh; _ZGVeN8v_acosh;
    _ZGVbN2v_asin; _ZGVcN4v_asin; _ZGVdN4v_asin; _ZGVeN8v_asin;
    _ZGVbN2v_asinh; _ZGVcN4v_asinh; _ZGVdN4v_asinh; _ZGVeN8v_asinh;
    _ZGVbN2v_atan; _ZGVcN4v_atan; _ZGVdN4v_atan; _ZGVeN8v_atan;
    _ZGVbN2v_atanh; _ZGVcN4v_atanh; _ZGVdN4v_atanh; _ZGVeN8v_atanh;
    _ZGVbN2v_cbrt; _ZGVcN4v_cbrt; _ZGVdN4v_cbrt; _ZGVeN8v_cbrt;
@ -35,6 +36,7 @@ libmvec {
    _ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
    _ZGVbN4v_acoshf; _ZGVcN8v_acoshf; _ZGVdN8v_acoshf; _ZGVeN16v_acoshf;
    _ZGVbN4v_asinf; _ZGVcN8v_asinf; _ZGVdN8v_asinf; _ZGVeN16v_asinf;
    _ZGVbN4v_asinhf; _ZGVcN8v_asinhf; _ZGVdN8v_asinhf; _ZGVeN16v_asinhf;
    _ZGVbN4v_atanf; _ZGVcN8v_atanf; _ZGVdN8v_atanf; _ZGVeN16v_atanf;
    _ZGVbN4v_atanhf; _ZGVcN8v_atanhf; _ZGVdN8v_atanhf; _ZGVeN16v_atanhf;
    _ZGVbN4v_cbrtf; _ZGVcN8v_cbrtf; _ZGVdN8v_cbrtf; _ZGVeN16v_cbrtf;
--- a/sysdeps/x86_64/fpu/libm-test-ulps
+++ b/sysdeps/x86_64/fpu/libm-test-ulps
@ -157,6 +157,23 @@ float: 3
 float128: 4
 ldouble: 5
 Function: "asinh_vlen2":
 double: 1
 Function: "asinh_vlen4":
 double: 1
 float: 1
 Function: "asinh_vlen4_avx2":
 double: 1
 Function: "asinh_vlen8":
 double: 1
 float: 1
 Function: "asinh_vlen8_avx2":
 float: 1
 Function: "atan":
 double: 1
 float: 1
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh2_core-sse2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh2_core-sse2.S
@ -0,0 +1,20 @@
 /* SSE2 version of vectorized asinh, vector length is 2.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Rename the entry point before pulling in the parent-directory source,
    so that whatever ../svml_d_asinh2_core.S defines as _ZGVbN2v_asinh is
    emitted under the _sse2-suffixed name instead (presumably one of the
    candidates the ifunc for _ZGVbN2v_asinh selects from -- confirm).  */
 #define _ZGVbN2v_asinh _ZGVbN2v_asinh_sse2
 #include "../svml_d_asinh2_core.S"
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh2_core.c
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh2_core.c
@ -0,0 +1,27 @@
 /* Multiple versions of vectorized asinh, vector length is 2.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* SYMBOL_NAME is defined ahead of the ifunc header; REDIRECT_NAME and
    IFUNC_SELECTOR used below are not defined in this file and presumably
    come from that header, keyed on SYMBOL_NAME -- confirm.  */
 #define SYMBOL_NAME _ZGVbN2v_asinh
 #include "ifunc-mathvec-sse4_1.h"
 /* Presumably emits _ZGVbN2v_asinh as an indirect function whose target is
    whatever IFUNC_SELECTOR () returns.  */
 libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
 #ifdef SHARED
 /* Shared builds only: hidden-visibility __GI_ counterpart of the symbol.
    NOTE(review): __hidden_ver1 is defined elsewhere -- confirm semantics.  */
 __hidden_ver1 (_ZGVbN2v_asinh, __GI__ZGVbN2v_asinh, __redirect__ZGVbN2v_asinh)
  __attribute__ ((visibility ("hidden")));
 #endif
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh2_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh2_core_sse4.S
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core-sse.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core-sse.S
@ -0,0 +1,20 @@
 /* SSE version of vectorized asinh, vector length is 4.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Rename the entry point before pulling in the parent-directory source,
    so that whatever ../svml_d_asinh4_core.S defines as _ZGVdN4v_asinh is
    emitted under the _sse_wrapper-suffixed name instead (presumably one of
    the candidates the ifunc for _ZGVdN4v_asinh selects from -- confirm).  */
 #define _ZGVdN4v_asinh _ZGVdN4v_asinh_sse_wrapper
 #include "../svml_d_asinh4_core.S"
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core.c
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core.c
@ -0,0 +1,27 @@
 /* Multiple versions of vectorized asinh, vector length is 4.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* SYMBOL_NAME is defined ahead of the ifunc header; REDIRECT_NAME and
    IFUNC_SELECTOR used below are not defined in this file and presumably
    come from that header, keyed on SYMBOL_NAME -- confirm.  */
 #define SYMBOL_NAME _ZGVdN4v_asinh
 #include "ifunc-mathvec-avx2.h"
 /* Presumably emits _ZGVdN4v_asinh as an indirect function whose target is
    whatever IFUNC_SELECTOR () returns.  */
 libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
 #ifdef SHARED
 /* Shared builds only: hidden-visibility __GI_ counterpart of the symbol.
    NOTE(review): __hidden_ver1 is defined elsewhere -- confirm semantics.  */
 __hidden_ver1 (_ZGVdN4v_asinh, __GI__ZGVdN4v_asinh, __redirect__ZGVdN4v_asinh)
  __attribute__ ((visibility ("hidden")));
 #endif
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core-avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core-avx2.S
@ -0,0 +1,20 @@
 /* AVX2 version of vectorized asinh, vector length is 8.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Rename the entry point before pulling in the parent-directory source,
    so that whatever ../svml_d_asinh8_core.S defines as _ZGVeN8v_asinh is
    emitted under the _avx2_wrapper-suffixed name instead (presumably one of
    the candidates the ifunc for _ZGVeN8v_asinh selects from -- confirm).  */
 #define _ZGVeN8v_asinh _ZGVeN8v_asinh_avx2_wrapper
 #include "../svml_d_asinh8_core.S"
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core.c
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core.c
@ -0,0 +1,27 @@
 /* Multiple versions of vectorized asinh, vector length is 8.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* SYMBOL_NAME is defined ahead of the ifunc header; REDIRECT_NAME and
    IFUNC_SELECTOR used below are not defined in this file and presumably
    come from that header, keyed on SYMBOL_NAME -- confirm.  */
 #define SYMBOL_NAME _ZGVeN8v_asinh
 #include "ifunc-mathvec-avx512-skx.h"
 /* Presumably emits _ZGVeN8v_asinh as an indirect function whose target is
    whatever IFUNC_SELECTOR () returns.  */
 libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
 #ifdef SHARED
 /* Shared builds only: hidden-visibility __GI_ counterpart of the symbol.
    NOTE(review): __hidden_ver1 is defined elsewhere -- confirm semantics.  */
 __hidden_ver1 (_ZGVeN8v_asinh, __GI__ZGVeN8v_asinh, __redirect__ZGVeN8v_asinh)
  __attribute__ ((visibility ("hidden")));
 #endif
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
@ -0,0 +1,510 @@
 /* Function asinh vectorized with AVX-512.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute asinh(x) as log(x + sqrt(x*x + 1))
 *   using RSQRT instructions for starting the
 *   square root approximation, and small table lookups for log
 *   that map to AVX-512 permute instructions
 *
 *   Special cases:
 *
 *   asinh(NaN) = quiet NaN, and raise invalid exception
 *   asinh(INF) = that INF
 *   asinh(0)   = that 0
 *
 */
 /* Byte offsets of the fields of the data table
  * __svml_dasinh_data_internal_avx512 defined in .rodata at the end of
  * this file.  Log_tbl_H and Log_tbl_L hold 16 quadwords (128 bytes)
  * each; every other field is 8 quadwords (64 bytes, one zmm vector) and
  * starts on a 64-byte boundary.  These values must be kept in the same
  * order and spacing as the table entries.
  */
 #define Log_tbl_H                     	0
 #define Log_tbl_L                     	128
 #define One                           	256
 #define AbsMask                       	320
 #define SmallThreshold                	384
 #define Threshold                     	448
 #define LargeThreshold                	512
 #define ca2                           	576
 #define ca1                           	640
 #define c4s                           	704
 #define c3s                           	768
 #define c2s                           	832
 #define c1s                           	896
 #define AddB5                         	960
 #define RcpBitMask                    	1024
 #define OneEighth                     	1088
 #define Four                          	1152
 #define poly_coeff9                   	1216
 #define poly_coeff8                   	1280
 #define poly_coeff7                   	1344
 #define poly_coeff6                   	1408
 #define poly_coeff5                   	1472
 #define poly_coeff4                   	1536
 #define poly_coeff3                   	1600
 #define poly_coeff2                   	1664
 #define poly_coeff1                   	1728
 #define L2H                           	1792
 #define L2L                           	1856
 #include <sysdep.h>
        .text
 	.section .text.evex512,"ax",@progbits
 /* AVX-512 entry point for vector asinh on packed doubles.
  *
  * In:  %zmm0 - inputs (copied to %zmm3 and kept there for the whole
  *              routine).
  * Out: %zmm0 - per-lane results.
  *
  * Mask usage, as set up by the three vcmppd instructions below:
  *   %k2 - |x| compared against SmallThreshold; the log-based result is
  *         merged only into %k2 lanes, the remaining lanes keep the
  *         small-input polynomial held in %zmm4.
  *   %k1 - |x| compared against Threshold; selects the "very large
  *         input" fixup (input scaled by OneEighth, low part zeroed,
  *         Four subtracted from the exponent term).
  *   %k0 - |x| compared against LargeThreshold; copied to %edx, and
  *         every set bit sends that lane through a scalar asinh call
  *         in the special-values path.
  */
 ENTRY(_ZGVeN8v_asinh_skx)
        pushq     %rbp
        cfi_def_cfa_offset(16)
        movq      %rsp, %rbp
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
 /* 64-byte aligned scratch frame, touched only by the special-values
  * path: 0..23(%rsp) hold r14/r13/r12, 64(%rsp) the input vector and
  * 128(%rsp) the result vector.
  */
        andq      $-64, %rsp
        subq      $192, %rsp
        vmovaps   %zmm0, %zmm3
 /* x^2 */
        vmulpd    {rn-sae}, %zmm3, %zmm3, %zmm14
        vmovups   One+__svml_dasinh_data_internal_avx512(%rip), %zmm9
 /* polynomial computation for small inputs */
        vmovups   ca2+__svml_dasinh_data_internal_avx512(%rip), %zmm10
        vmovups   ca1+__svml_dasinh_data_internal_avx512(%rip), %zmm11
 /* not a very small input ? */
        vmovups   SmallThreshold+__svml_dasinh_data_internal_avx512(%rip), %zmm0
 /* A=max(x^2, 1); */
        vmaxpd    {sae}, %zmm14, %zmm9, %zmm4
 /* B=min(x^2, 1); */
        vminpd    {sae}, %zmm14, %zmm9, %zmm5
        vfmadd231pd {rn-sae}, %zmm14, %zmm10, %zmm11
 /* 1+x^2 */
        vaddpd    {rn-sae}, %zmm9, %zmm14, %zmm8
 /* |input| */
        vandpd    AbsMask+__svml_dasinh_data_internal_avx512(%rip), %zmm3, %zmm1
        vrsqrt14pd %zmm8, %zmm6
        vcmppd    $21, {sae}, %zmm0, %zmm1, %k2
 /* B_high */
        vsubpd    {rn-sae}, %zmm4, %zmm8, %zmm7
 /* sign bit */
        vxorpd    %zmm3, %zmm1, %zmm2
        vmulpd    {rn-sae}, %zmm14, %zmm11, %zmm4
 /* B_low */
        vsubpd    {rn-sae}, %zmm7, %zmm5, %zmm13
        vmovups   c2s+__svml_dasinh_data_internal_avx512(%rip), %zmm5
        vmovups   c1s+__svml_dasinh_data_internal_avx512(%rip), %zmm7
 /* polynomial computation for small inputs */
        vfmadd213pd {rn-sae}, %zmm1, %zmm1, %zmm4
 /* (x^2)_low */
        vmovaps   %zmm3, %zmm15
        vfmsub213pd {rn-sae}, %zmm14, %zmm3, %zmm15
 /* Sh ~sqrt(1+x^2) */
        vmulpd    {rn-sae}, %zmm6, %zmm8, %zmm14
 /* Yl = (x^2)_low + B_low */
        vaddpd    {rn-sae}, %zmm15, %zmm13, %zmm13
 /* very large inputs ? */
        vmovups   Threshold+__svml_dasinh_data_internal_avx512(%rip), %zmm15
 /* (Yh*R0)_low */
        vfmsub213pd {rn-sae}, %zmm14, %zmm6, %zmm8
        vcmppd    $21, {sae}, %zmm15, %zmm1, %k1
 /* Sl = (Yh*R0)_low+(R0*Yl) */
        vfmadd213pd {rn-sae}, %zmm8, %zmm6, %zmm13
        vmovups   LargeThreshold+__svml_dasinh_data_internal_avx512(%rip), %zmm8
 /* rel. error term: Eh=1-Sh*R0 */
        vmovaps   %zmm9, %zmm12
        vfnmadd231pd {rn-sae}, %zmm14, %zmm6, %zmm12
 /* special-input mask: lanes flagged here are redone by scalar asinh */
        vcmppd    $22, {sae}, %zmm8, %zmm1, %k0
 /* rel. error term: Eh=(1-Sh*R0)-Sl*R0 */
        vfnmadd231pd {rn-sae}, %zmm13, %zmm6, %zmm12
 /*
 * sqrt(1+x^2) ~ Sh + Sl + Sh*Eh*poly_s
 * poly_s = c1+c2*Eh+c3*Eh^2+c4*Eh^3
 */
        vmovups   c4s+__svml_dasinh_data_internal_avx512(%rip), %zmm6
        vmovups   c3s+__svml_dasinh_data_internal_avx512(%rip), %zmm8
 /* Sh*Eh */
        vmulpd    {rn-sae}, %zmm12, %zmm14, %zmm11
        vfmadd231pd {rn-sae}, %zmm12, %zmm6, %zmm8
 /* Sh+x */
        vaddpd    {rn-sae}, %zmm1, %zmm14, %zmm6
        kmovw     %k0, %edx
        vfmadd213pd {rn-sae}, %zmm5, %zmm12, %zmm8
        vfmadd213pd {rn-sae}, %zmm7, %zmm12, %zmm8
 /* Xh */
        vsubpd    {rn-sae}, %zmm14, %zmm6, %zmm12
 /* Sl + Sh*Eh*poly_s */
        vfmadd213pd {rn-sae}, %zmm13, %zmm8, %zmm11
 /* fixup for very large inputs */
        vmovups   OneEighth+__svml_dasinh_data_internal_avx512(%rip), %zmm8
 /* Xl */
        vsubpd    {rn-sae}, %zmm12, %zmm1, %zmm12
 /* Xin0+Sl+Sh*Eh*poly_s ~ x+sqrt(1+x^2) */
        vaddpd    {rn-sae}, %zmm11, %zmm6, %zmm10
 /* Sl_high */
        vsubpd    {rn-sae}, %zmm6, %zmm10, %zmm5
        vmulpd    {rn-sae}, %zmm8, %zmm1, %zmm10{%k1}
 /* Table lookups */
        vmovups   __svml_dasinh_data_internal_avx512(%rip), %zmm6
 /* Sl_l */
        vsubpd    {rn-sae}, %zmm5, %zmm11, %zmm7
        vrcp14pd  %zmm10, %zmm13
 /* Xin_low */
        vaddpd    {rn-sae}, %zmm12, %zmm7, %zmm14
        vmovups   Log_tbl_L+__svml_dasinh_data_internal_avx512(%rip), %zmm7
        vmovups   poly_coeff6+__svml_dasinh_data_internal_avx512(%rip), %zmm12
 /* round reciprocal to 1+4b mantissas */
        vpaddq    AddB5+__svml_dasinh_data_internal_avx512(%rip), %zmm13, %zmm11
 /* fixup for very large inputs */
        vxorpd    %zmm14, %zmm14, %zmm14{%k1}
        vmovups   poly_coeff5+__svml_dasinh_data_internal_avx512(%rip), %zmm13
        vandpd    RcpBitMask+__svml_dasinh_data_internal_avx512(%rip), %zmm11, %zmm15
        vmovups   poly_coeff7+__svml_dasinh_data_internal_avx512(%rip), %zmm11
 /* Prepare table index */
        vpsrlq    $48, %zmm15, %zmm5
 /* reduced argument for log(): (Rcp*Xin-1)+Rcp*Xin_low */
        vfmsub231pd {rn-sae}, %zmm15, %zmm10, %zmm9
 /* exponents */
        vgetexppd {sae}, %zmm15, %zmm8
        vmovups   Four+__svml_dasinh_data_internal_avx512(%rip), %zmm10
 /* second halves of the 16-entry tables supplied as memory operands */
        vpermt2pd Log_tbl_H+64+__svml_dasinh_data_internal_avx512(%rip), %zmm5, %zmm6
        vpermt2pd Log_tbl_L+64+__svml_dasinh_data_internal_avx512(%rip), %zmm5, %zmm7
        vsubpd    {rn-sae}, %zmm10, %zmm8, %zmm8{%k1}
        vfmadd231pd {rn-sae}, %zmm15, %zmm14, %zmm9
 /* polynomials */
        vmovups   poly_coeff9+__svml_dasinh_data_internal_avx512(%rip), %zmm10
        vmovups   poly_coeff8+__svml_dasinh_data_internal_avx512(%rip), %zmm5
        vmovups   poly_coeff4+__svml_dasinh_data_internal_avx512(%rip), %zmm14
 /* -K*L2H + Th */
        vmovups   L2H+__svml_dasinh_data_internal_avx512(%rip), %zmm15
        vfmadd231pd {rn-sae}, %zmm9, %zmm10, %zmm5
 /* -K*L2L + Tl */
        vmovups   L2L+__svml_dasinh_data_internal_avx512(%rip), %zmm10
        vfnmadd231pd {rn-sae}, %zmm8, %zmm15, %zmm6
        vfmadd213pd {rn-sae}, %zmm11, %zmm9, %zmm5
        vfnmadd213pd {rn-sae}, %zmm7, %zmm10, %zmm8
        vmovups   poly_coeff3+__svml_dasinh_data_internal_avx512(%rip), %zmm7
        vmovups   poly_coeff1+__svml_dasinh_data_internal_avx512(%rip), %zmm10
 /* R^2 */
        vmulpd    {rn-sae}, %zmm9, %zmm9, %zmm11
        vfmadd213pd {rn-sae}, %zmm12, %zmm9, %zmm5
        vfmadd213pd {rn-sae}, %zmm13, %zmm9, %zmm5
        vfmadd213pd {rn-sae}, %zmm14, %zmm9, %zmm5
        vfmadd213pd {rn-sae}, %zmm7, %zmm9, %zmm5
        vmovups   poly_coeff2+__svml_dasinh_data_internal_avx512(%rip), %zmm7
        vfmadd213pd {rn-sae}, %zmm7, %zmm9, %zmm5
        vfmadd213pd {rn-sae}, %zmm10, %zmm9, %zmm5
 /* Tl + R^2*Poly */
        vfmadd213pd {rn-sae}, %zmm8, %zmm11, %zmm5
 /* R+Tl + R^2*Poly */
        vaddpd    {rn-sae}, %zmm9, %zmm5, %zmm9
 /* merge log result into %k2 lanes; other lanes keep the small-input value */
        vaddpd    {rn-sae}, %zmm9, %zmm6, %zmm4{%k2}
 /* reapply the sign of the input */
        vxorpd    %zmm2, %zmm4, %zmm0
        testl     %edx, %edx
 /* Go to special inputs processing branch */
        jne       L(SPECIAL_VALUES_BRANCH)
                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm3
 /* Restore registers
 * and exit the function
 */
 L(EXIT):
        movq      %rbp, %rsp
        popq      %rbp
        cfi_def_cfa(7, 8)
        cfi_restore(6)
        ret
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
 /* Branch to process
 * special inputs
 */
 L(SPECIAL_VALUES_BRANCH):
 /* spill input and vector result so lanes can be patched one at a time */
        vmovups   %zmm3, 64(%rsp)
        vmovups   %zmm0, 128(%rsp)
                                # LOE rbx r12 r13 r14 r15 edx zmm0
        xorl      %eax, %eax
                                # LOE rbx r12 r13 r14 r15 eax edx
        vzeroupper
        movq      %r12, 16(%rsp)
        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
 /* r12d = current lane index, starting at 0 */
        movl      %eax, %r12d
        movq      %r13, 8(%rsp)
        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
 /* r13d = bit mask of lanes that need the scalar call */
        movl      %edx, %r13d
        movq      %r14, (%rsp)
        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
                                # LOE rbx r15 r12d r13d
 /* Range mask
 * bits check
 */
 L(RANGEMASK_CHECK):
        btl       %r12d, %r13d
 /* Call scalar math function */
        jc        L(SCALAR_MATH_CALL)
                                # LOE rbx r15 r12d r13d
 /* Special inputs
 * processing loop
 */
 L(SPECIAL_VALUES_LOOP):
        incl      %r12d
        cmpl      $8, %r12d
 /* Check bits in range mask */
        jl        L(RANGEMASK_CHECK)
                                # LOE rbx r15 r12d r13d
        movq      16(%rsp), %r12
        cfi_restore(12)
        movq      8(%rsp), %r13
        cfi_restore(13)
        movq      (%rsp), %r14
        cfi_restore(14)
 /* reload the result vector, now including the scalar-patched lanes */
        vmovups   128(%rsp), %zmm0
 /* Go to exit */
        jmp       L(EXIT)
        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
                                # LOE rbx r12 r13 r14 r15 zmm0
 /* Scalar math function call
 * to process special input
 */
 L(SCALAR_MATH_CALL):
 /* r14 = lane index, used to address the spilled input and result */
        movl      %r12d, %r14d
        movsd     64(%rsp,%r14,8), %xmm0
        call      asinh@PLT
                                # LOE rbx r14 r15 r12d r13d xmm0
        movsd     %xmm0, 128(%rsp,%r14,8)
 /* Process special inputs in loop */
        jmp       L(SPECIAL_VALUES_LOOP)
                                # LOE rbx r15 r12d r13d
 END(_ZGVeN8v_asinh_skx)
        .section .rodata, "a"
        .align 64
 /* Reference layout of the table that follows.  The typedef is only seen
  * when __svml_dasinh_data_internal_avx512_typedef is defined, which
  * nothing in the visible part of this file does (presumably it is kept
  * purely as documentation -- confirm).  Each [N][2] VUINT32 row stands
  * for one 64-bit .quad in the table, so the [16][2] fields are 128 bytes
  * and the [8][2] fields 64 bytes, matching the offset macros near the
  * top of the file.
  */
 #ifdef __svml_dasinh_data_internal_avx512_typedef
 typedef unsigned int VUINT32;
 typedef struct {
        __declspec(align(64)) VUINT32 Log_tbl_H[16][2];
        __declspec(align(64)) VUINT32 Log_tbl_L[16][2];
        __declspec(align(64)) VUINT32 One[8][2];
        __declspec(align(64)) VUINT32 AbsMask[8][2];
        __declspec(align(64)) VUINT32 SmallThreshold[8][2];
        __declspec(align(64)) VUINT32 Threshold[8][2];
        __declspec(align(64)) VUINT32 LargeThreshold[8][2];
        __declspec(align(64)) VUINT32 ca2[8][2];
        __declspec(align(64)) VUINT32 ca1[8][2];
        __declspec(align(64)) VUINT32 c4s[8][2];
        __declspec(align(64)) VUINT32 c3s[8][2];
        __declspec(align(64)) VUINT32 c2s[8][2];
        __declspec(align(64)) VUINT32 c1s[8][2];
        __declspec(align(64)) VUINT32 AddB5[8][2];
        __declspec(align(64)) VUINT32 RcpBitMask[8][2];
        __declspec(align(64)) VUINT32 OneEighth[8][2];
        __declspec(align(64)) VUINT32 Four[8][2];
        __declspec(align(64)) VUINT32 poly_coeff9[8][2];
        __declspec(align(64)) VUINT32 poly_coeff8[8][2];
        __declspec(align(64)) VUINT32 poly_coeff7[8][2];
        __declspec(align(64)) VUINT32 poly_coeff6[8][2];
        __declspec(align(64)) VUINT32 poly_coeff5[8][2];
        __declspec(align(64)) VUINT32 poly_coeff4[8][2];
        __declspec(align(64)) VUINT32 poly_coeff3[8][2];
        __declspec(align(64)) VUINT32 poly_coeff2[8][2];
        __declspec(align(64)) VUINT32 poly_coeff1[8][2];
        __declspec(align(64)) VUINT32 L2H[8][2];
        __declspec(align(64)) VUINT32 L2L[8][2];
    } __svml_dasinh_data_internal_avx512;
 #endif
 __svml_dasinh_data_internal_avx512:
        /*== Log_tbl_H ==*/
        .quad 0x0000000000000000
        .quad 0xbfaf0a30c0120000
        .quad 0xbfbe27076e2b0000
        .quad 0xbfc5ff3070a78000
        .quad 0xbfcc8ff7c79a8000
        .quad 0xbfd1675cababc000
        .quad 0xbfd4618bc21c4000
        .quad 0xbfd739d7f6bbc000
        .quad 0xbfd9f323ecbf8000
        .quad 0xbfdc8ff7c79a8000
        .quad 0xbfdf128f5faf0000
        .quad 0xbfe0be72e4252000
        .quad 0xbfe1e85f5e704000
        .quad 0xbfe307d7334f2000
        .quad 0xbfe41d8fe8468000
        .quad 0xbfe52a2d265bc000
        /*== Log_tbl_L ==*/
        .align 64
        .quad 0x0000000000000000
        .quad 0x3d53ab33d066d1d2
        .quad 0x3d2a342c2af0003c
        .quad 0xbd43d3c873e20a07
        .quad 0xbd4a21ac25d81ef3
        .quad 0x3d59f1fc63382a8f
        .quad 0xbd5ec27d0b7b37b3
        .quad 0xbd50069ce24c53fb
        .quad 0xbd584bf2b68d766f
        .quad 0xbd5a21ac25d81ef3
        .quad 0xbd3bb2cd720ec44c
        .quad 0xbd55056d312f7668
        .quad 0xbd1a07bd8b34be7c
        .quad 0x3d5e83c094debc15
        .quad 0x3d5aa33736867a17
        .quad 0xbd46abb9df22bc57
        /*== One ==*/
        .align 64
        .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
        /*== AbsMask ==*/
        .align 64
        .quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
        /*== SmallThreshold ==*/
        .align 64
        .quad 0x3f70000000000000, 0x3f70000000000000, 0x3f70000000000000, 0x3f70000000000000, 0x3f70000000000000, 0x3f70000000000000, 0x3f70000000000000, 0x3f70000000000000
        /*== Threshold ==*/
        .align 64
        .quad 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000
        /*== LargeThreshold ==*/
        .align 64
        .quad 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff
        /*== ca2 ==*/
        .align 64
        .quad 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7
        /*== ca1 ==*/
        .align 64
        .quad 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e
        /*== c4s ==*/
        .align 64
        .quad 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612
        /*== c3s ==*/
        .align 64
        .quad 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000
        /*== c2s ==*/
        .align 64
        .quad 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000
        /*== c1s ==*/
        .align 64
        .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
        /*== AddB5 ==*/
        .align 64
        .quad 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000
        /*== RcpBitMask ==*/
        .align 64
        .quad 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000
        /*== OneEighth ==*/
        .align 64
        .quad 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000
        /*== Four ==*/
        .align 64
        .quad 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000
        /*== poly_coeff9 ==*/
        .align 64
        .quad 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368
        /*== poly_coeff8 ==*/
        .align 64
        .quad 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778
        /*== poly_coeff7 ==*/
        .align 64
        .quad 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9
        /*== poly_coeff6 ==*/
        .align 64
        .quad 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1
        /*== poly_coeff5 ==*/
        .align 64
        .quad 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736
        /*== poly_coeff4 ==*/
        .align 64
        .quad 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af
        /*== poly_coeff3 ==*/
        .align 64
        .quad 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65
        /*== poly_coeff2 ==*/
        .align 64
        .quad 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1
        /*== poly_coeff1 ==*/
        .align 64
        .quad 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000
        /*== L2H = log(2)_high ==*/
        .align 64
        .quad 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000
        /*== L2L = log(2)_low ==*/
        .align 64
        .quad 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000
        .align 64
        .type	__svml_dasinh_data_internal_avx512,@object
        .size	__svml_dasinh_data_internal_avx512,.-__svml_dasinh_data_internal_avx512
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core-avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core-avx2.S
@ -0,0 +1,20 @@
 /* AVX2 version of vectorized asinhf.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Rename the symbol defined by the included file so that this build of
    it is emitted as _ZGVeN16v_asinhf_avx2_wrapper instead of the public
    name _ZGVeN16v_asinhf.
    NOTE(review): presumably ../svml_s_asinhf16_core.S is the generic
    16-lane entry point built on narrower vectors, and this renamed copy
    is the fallback chosen by the ifunc selector when the AVX-512 kernel
    is not usable -- confirm against that file and the selector header.  */
 #define _ZGVeN16v_asinhf _ZGVeN16v_asinhf_avx2_wrapper
 #include "../svml_s_asinhf16_core.S"
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core.c
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core.c
@ -0,0 +1,28 @@
 /* Multiple versions of vectorized asinhf, vector length is 16.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Public vector-ABI symbol that this file multi-versions.  It must be
    defined before the selector header is included.  */
 #define SYMBOL_NAME _ZGVeN16v_asinhf
 /* NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined in this
    file; presumably they come from this header -- confirm.  */
 #include "ifunc-mathvec-avx512-skx.h"
 /* Bind SYMBOL_NAME to whatever implementation IFUNC_SELECTOR () returns.
    NOTE(review): presumably this creates an IFUNC symbol resolved at load
    time -- confirm against the libc_ifunc_redirected definition.  */
 libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
 #ifdef SHARED
 /* Shared builds only: declare the internal __GI_ alias of the symbol
    with hidden visibility.  */
 __hidden_ver1 (_ZGVeN16v_asinhf, __GI__ZGVeN16v_asinhf,
 	       __redirect__ZGVeN16v_asinhf)
  __attribute__ ((visibility ("hidden")));
 #endif
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
@ -0,0 +1,476 @@
 /* Function asinhf vectorized with AVX-512.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute asinh(x) as log(x + sqrt(x*x + 1))
 *   using RSQRT instructions for starting the
 *   square root approximation, and small table lookups for log
 *   that map to AVX-512 permute instructions
 *
 *   Special cases:
 *
 *   asinh(NaN) = quiet NaN, and raise invalid exception
 *   asinh(INF) = that INF
 *   asinh(0)   = that 0
 *
 */
 /* Byte offsets into the data table __svml_sasinh_data_internal_avx512.
    Log_tbl_H and Log_tbl_L each hold 32 floats (128 bytes); every other
    entry is one 64-byte vector holding 16 copies of a single 32-bit
    constant.  These values must stay in sync with the order and the
    ".align 64" padding of the table defined at the end of this file.
 */
 #define Log_tbl_H                     	0
 #define Log_tbl_L                     	128
 #define One                           	256
 #define AbsMask                       	320
 #define SmallThreshold                	384
 #define Threshold                     	448
 #define LargeThreshold                	512
 #define ca1                           	576
 #define c2s                           	640
 #define c1s                           	704
 #define AddB5                         	768
 #define RcpBitMask                    	832
 #define OneEighth                     	896
 #define Four                          	960
 #define poly_coeff3                   	1024
 #define poly_coeff2                   	1088
 #define poly_coeff1                   	1152
 #define L2H                           	1216
 #define L2L                           	1280
 #include <sysdep.h>
        /* Emit the function into the EVEX-512 text subsection.  The name
           was previously misspelled ".text.exex512"; any ".text.*" input
           section still ends up in .text, but the misspelled name is not
           grouped with other ".text.evex512" code by the linker.  */
        .text
 	.section .text.evex512,"ax",@progbits
 /* Vector asinhf for 16 packed floats (AVX-512).
    In:  %zmm0 = x (kept in %zmm10 for the whole function).
    Out: %zmm0 = result, with the sign of x re-applied at the end.

    Lane masks, all computed on |x|:
      %k2  !(|x| < SmallThreshold): lanes that receive the log-based
           result; the other lanes keep the small-input polynomial
           |x| + |x| * x^2 * ca1.
      %k1  !(|x| < Threshold): lanes where |x| * OneEighth is fed to the
           log path, the low part is zeroed and the exponent is adjusted
           by Four.
      %k0  !(|x| <= LargeThreshold), which is also true for NaN: lanes
           that are recomputed one by one with scalar asinhf.  The mask
           is copied to %edx.

    Stack frame (64-byte aligned, 192 bytes; only the special-value path
    touches it):
      0(%rsp), 8(%rsp), 16(%rsp)   saved %r14, %r13, %r12
      64(%rsp)                     copy of the input vector
      128(%rsp)                    result vector, patched lane by lane.  */
 ENTRY(_ZGVeN16v_asinhf_skx)
        pushq     %rbp
        cfi_def_cfa_offset(16)
        movq      %rsp, %rbp
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
 /* Align the stack to 64 bytes and reserve the 192-byte frame.  */
        andq      $-64, %rsp
        subq      $192, %rsp
 /* Keep the original input in zmm10; zmm0 is reused as scratch.  */
        vmovaps   %zmm0, %zmm10
 /* x^2 */
        vmulps    {rn-sae}, %zmm10, %zmm10, %zmm0
        vmovups   One+__svml_sasinh_data_internal_avx512(%rip), %zmm2
 /* polynomial computation for small inputs */
        vmovups   ca1+__svml_sasinh_data_internal_avx512(%rip), %zmm1
 /* not a very small input ? */
        vmovups   SmallThreshold+__svml_sasinh_data_internal_avx512(%rip), %zmm11
 /* 1+x^2 */
        vaddps    {rn-sae}, %zmm2, %zmm0, %zmm7
 /* |input| */
        vandps    AbsMask+__svml_sasinh_data_internal_avx512(%rip), %zmm10, %zmm12
 /* A=max(x^2, 1); */
        vmaxps    {sae}, %zmm0, %zmm2, %zmm14
 /* R0 ~ 1/sqrt(1+x^2), approximate starting value */
        vrsqrt14ps %zmm7, %zmm8
 /* B=min(x^2, 1); */
        vminps    {sae}, %zmm0, %zmm2, %zmm15
 /* k2 = !(|x| < SmallThreshold); predicate 21 is NLT_UQ */
        vcmpps    $21, {sae}, %zmm11, %zmm12, %k2
 /* B_high */
        vsubps    {rn-sae}, %zmm14, %zmm7, %zmm9
 /* sign bit */
        vxorps    %zmm10, %zmm12, %zmm13
 /* Sh ~sqrt(1+x^2) */
        vmulps    {rn-sae}, %zmm8, %zmm7, %zmm6
        vmovups   LargeThreshold+__svml_sasinh_data_internal_avx512(%rip), %zmm14
 /* B_low */
        vsubps    {rn-sae}, %zmm9, %zmm15, %zmm3
 /* Sh+x */
        vaddps    {rn-sae}, %zmm12, %zmm6, %zmm15
 /* (Yh*R0)_low */
        vfmsub213ps {rn-sae}, %zmm6, %zmm8, %zmm7
 /* x^2 * ca1, first step of the small-input polynomial */
        vmulps    {rn-sae}, %zmm1, %zmm0, %zmm9
 /* k0 = !(|x| <= LargeThreshold); predicate 22 is NLE_UQ, so NaN lanes
    are flagged as well */
        vcmpps    $22, {sae}, %zmm14, %zmm12, %k0
        vmovups   c1s+__svml_sasinh_data_internal_avx512(%rip), %zmm1
 /* polynomial computation for small inputs: |x| + |x| * (x^2 * ca1) */
        vfmadd213ps {rn-sae}, %zmm12, %zmm12, %zmm9
 /* special-lane bitmask for the scalar fallback, tested before EXIT */
        kmovw     %k0, %edx
 /* (x^2)_low */
        vmovaps   %zmm10, %zmm4
        vfmsub213ps {rn-sae}, %zmm0, %zmm10, %zmm4
 /* Yl = (x^2)_low + B_low */
        vaddps    {rn-sae}, %zmm4, %zmm3, %zmm5
 /* rel. error term: Eh=1-Sh*R0 */
        vmovaps   %zmm2, %zmm0
        vfnmadd231ps {rn-sae}, %zmm6, %zmm8, %zmm0
 /* Sl = (Yh*R0)_low+(R0*Yl) */
        vfmadd213ps {rn-sae}, %zmm7, %zmm8, %zmm5
 /* very large inputs ? */
        vmovups   Threshold+__svml_sasinh_data_internal_avx512(%rip), %zmm7
 /* rel. error term: Eh=(1-Sh*R0)-Sl*R0 */
        vfnmadd231ps {rn-sae}, %zmm5, %zmm8, %zmm0
 /* sqrt(1+x^2) ~ Sh + Sl + Sh*Eh*poly_s */
        vmovups   c2s+__svml_sasinh_data_internal_avx512(%rip), %zmm8
 /* k1 = !(|x| < Threshold) */
        vcmpps    $21, {sae}, %zmm7, %zmm12, %k1
 /* Sh*Eh */
        vmulps    {rn-sae}, %zmm0, %zmm6, %zmm4
 /* poly_s = c1s + c2s*Eh */
        vfmadd231ps {rn-sae}, %zmm0, %zmm8, %zmm1
 /* Sl + Sh*Eh*poly_s */
        vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm4
 /* Xh */
        vsubps    {rn-sae}, %zmm6, %zmm15, %zmm5
 /* fixup for very large inputs */
        vmovups   OneEighth+__svml_sasinh_data_internal_avx512(%rip), %zmm6
 /* Xin0+Sl+Sh*Eh*poly_s ~ x+sqrt(1+x^2) */
        vaddps    {rn-sae}, %zmm4, %zmm15, %zmm3
 /* Xl */
        vsubps    {rn-sae}, %zmm5, %zmm12, %zmm5
 /* Sl_high */
        vsubps    {rn-sae}, %zmm15, %zmm3, %zmm0
 /* k1 lanes: replace the log argument by |x| * OneEighth */
        vmulps    {rn-sae}, %zmm6, %zmm12, %zmm3{%k1}
 /* -K*L2H + Th */
        vmovups   L2H+__svml_sasinh_data_internal_avx512(%rip), %zmm15
 /* Sl_l */
        vsubps    {rn-sae}, %zmm0, %zmm4, %zmm1
 /* Rcp ~ 1/Xin, approximate */
        vrcp14ps  %zmm3, %zmm6
 /* Table lookups */
        vmovups   __svml_sasinh_data_internal_avx512(%rip), %zmm0
 /* Xin_low */
        vaddps    {rn-sae}, %zmm5, %zmm1, %zmm7
 /* round reciprocal to its top 5 mantissa bits: add AddB5 (bit 17), then
    mask with RcpBitMask (0xfffc0000) below */
        vpaddd    AddB5+__svml_sasinh_data_internal_avx512(%rip), %zmm6, %zmm4
        vmovups   poly_coeff1+__svml_sasinh_data_internal_avx512(%rip), %zmm5
        vandps    RcpBitMask+__svml_sasinh_data_internal_avx512(%rip), %zmm4, %zmm8
 /* fixup for very large inputs: Xin_low = 0 in the k1 lanes */
        vxorps    %zmm7, %zmm7, %zmm7{%k1}
 /* polynomial */
        vmovups   poly_coeff3+__svml_sasinh_data_internal_avx512(%rip), %zmm4
 /* reduced argument for log(): (Rcp*Xin-1)+Rcp*Xin_low */
        vfmsub231ps {rn-sae}, %zmm8, %zmm3, %zmm2
        vmovups   Four+__svml_sasinh_data_internal_avx512(%rip), %zmm3
 /* exponents */
        vgetexpps {sae}, %zmm8, %zmm1
 /* Prepare table index: the 5 kept mantissa bits of Rcp become the low
    bits of each dword */
        vpsrld    $18, %zmm8, %zmm14
        vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm2
        vmovups   poly_coeff2+__svml_sasinh_data_internal_avx512(%rip), %zmm7
 /* k1 lanes: subtract Four from the exponent of Rcp */
        vsubps    {rn-sae}, %zmm3, %zmm1, %zmm1{%k1}
 /* Th: 32-entry lookup, entries 0-15 in zmm0 and 16-31 from memory */
        vpermt2ps Log_tbl_H+64+__svml_sasinh_data_internal_avx512(%rip), %zmm14, %zmm0
        vmovups   Log_tbl_L+__svml_sasinh_data_internal_avx512(%rip), %zmm3
        vfmadd231ps {rn-sae}, %zmm2, %zmm4, %zmm7
        vfnmadd231ps {rn-sae}, %zmm1, %zmm15, %zmm0
 /* R^2 */
        vmulps    {rn-sae}, %zmm2, %zmm2, %zmm6
        vfmadd213ps {rn-sae}, %zmm5, %zmm2, %zmm7
 /* Tl: same lookup on the low-part table */
        vpermt2ps Log_tbl_L+64+__svml_sasinh_data_internal_avx512(%rip), %zmm14, %zmm3
 /* -K*L2L + Tl */
        vmovups   L2L+__svml_sasinh_data_internal_avx512(%rip), %zmm14
        vfnmadd213ps {rn-sae}, %zmm3, %zmm14, %zmm1
 /* Tl + R^2*Poly */
        vfmadd213ps {rn-sae}, %zmm1, %zmm6, %zmm7
 /* R+Tl + R^2*Poly */
        vaddps    {rn-sae}, %zmm2, %zmm7, %zmm2
 /* Merge: k2 lanes take the log result, the others keep the small-input
    polynomial already in zmm9 */
        vaddps    {rn-sae}, %zmm2, %zmm0, %zmm9{%k2}
 /* Re-apply the sign of the input */
        vxorps    %zmm13, %zmm9, %zmm0
        testl     %edx, %edx
 /* Go to special inputs processing branch */
        jne       L(SPECIAL_VALUES_BRANCH)
                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm10
 /* Restore registers
 * and exit the function
 */
 L(EXIT):
        movq      %rbp, %rsp
        popq      %rbp
        cfi_def_cfa(7, 8)
        cfi_restore(6)
        ret
 /* CFI for the code below, which still runs on the %rbp-based frame.  */
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
 /* Branch to process
 * special inputs
 */
 L(SPECIAL_VALUES_BRANCH):
 /* Spill the input and the vector result so single lanes can be read
    and overwritten from memory.  */
        vmovups   %zmm10, 64(%rsp)
        vmovups   %zmm0, 128(%rsp)
                                # LOE rbx r12 r13 r14 r15 edx zmm0
        xorl      %eax, %eax
                                # LOE rbx r12 r13 r14 r15 eax edx
        vzeroupper
 /* Save r12-r14, which hold the loop state across the scalar calls:
    r12d = lane index (starts at 0), r13d = special-lane mask,
    r14 = index of the lane currently being recomputed.  */
        movq      %r12, 16(%rsp)
        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
        movl      %eax, %r12d
        movq      %r13, 8(%rsp)
        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
        movl      %edx, %r13d
        movq      %r14, (%rsp)
        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
                                # LOE rbx r15 r12d r13d
 /* Range mask
 * bits check
 */
 L(RANGEMASK_CHECK):
 /* CF = bit r12d of the special-lane mask */
        btl       %r12d, %r13d
 /* Call scalar math function */
        jc        L(SCALAR_MATH_CALL)
                                # LOE rbx r15 r12d r13d
 /* Special inputs
 * processing loop
 */
 L(SPECIAL_VALUES_LOOP):
        incl      %r12d
        cmpl      $16, %r12d
 /* Check bits in range mask */
        jl        L(RANGEMASK_CHECK)
                                # LOE rbx r15 r12d r13d
 /* All 16 lanes visited: restore the saved registers and reload the
    patched result vector.  */
        movq      16(%rsp), %r12
        cfi_restore(12)
        movq      8(%rsp), %r13
        cfi_restore(13)
        movq      (%rsp), %r14
        cfi_restore(14)
        vmovups   128(%rsp), %zmm0
 /* Go to exit */
        jmp       L(EXIT)
        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
                                # LOE rbx r12 r13 r14 r15 zmm0
 /* Scalar math function call
 * to process special input
 */
 L(SCALAR_MATH_CALL):
 /* Load lane r12d of the saved input, call asinhf on it and store the
    result into the same lane of the saved result vector.  */
        movl      %r12d, %r14d
        movss     64(%rsp,%r14,4), %xmm0
        call      asinhf@PLT
                                # LOE rbx r14 r15 r12d r13d xmm0
        movss     %xmm0, 128(%rsp,%r14,4)
 /* Process special inputs in loop */
        jmp       L(SPECIAL_VALUES_LOOP)
                                # LOE rbx r15 r12d r13d
 END(_ZGVeN16v_asinhf_skx)
 /* Read-only constant table for _ZGVeN16v_asinhf_skx.  Every entry
    starts on a 64-byte boundary; the byte offsets are the macros defined
    at the top of this file and must match the order used here.  */
        .section .rodata, "a"
        .align 64
 /* Layout description only: this typedef is seen by the compiler only if
    __svml_sasinh_data_internal_avx512_typedef is defined.  */
 #ifdef __svml_sasinh_data_internal_avx512_typedef
 typedef unsigned int VUINT32;
 typedef struct {
        __declspec(align(64)) VUINT32 Log_tbl_H[32][1];
        __declspec(align(64)) VUINT32 Log_tbl_L[32][1];
        __declspec(align(64)) VUINT32 One[16][1];
        __declspec(align(64)) VUINT32 AbsMask[16][1];
        __declspec(align(64)) VUINT32 SmallThreshold[16][1];
        __declspec(align(64)) VUINT32 Threshold[16][1];
        __declspec(align(64)) VUINT32 LargeThreshold[16][1];
        __declspec(align(64)) VUINT32 ca1[16][1];
        __declspec(align(64)) VUINT32 c2s[16][1];
        __declspec(align(64)) VUINT32 c1s[16][1];
        __declspec(align(64)) VUINT32 AddB5[16][1];
        __declspec(align(64)) VUINT32 RcpBitMask[16][1];
        __declspec(align(64)) VUINT32 OneEighth[16][1];
        __declspec(align(64)) VUINT32 Four[16][1];
        __declspec(align(64)) VUINT32 poly_coeff3[16][1];
        __declspec(align(64)) VUINT32 poly_coeff2[16][1];
        __declspec(align(64)) VUINT32 poly_coeff1[16][1];
        __declspec(align(64)) VUINT32 L2H[16][1];
        __declspec(align(64)) VUINT32 L2L[16][1];
    } __svml_sasinh_data_internal_avx512;
 #endif
 __svml_sasinh_data_internal_avx512:
        /*== Log_tbl_H ==*/
        /* 32 entries selected by the top 5 mantissa bits of the rounded
           reciprocal.  NOTE(review): the values look like the high parts
           of -log(1 + i/32), i = 0..31 -- confirm.  */
        .long 0x00000000
        .long 0xbcfc0000
        .long 0xbd788000
        .long 0xbdb78000
        .long 0xbdf14000
        .long 0xbe14a000
        .long 0xbe300000
        .long 0xbe4aa000
        .long 0xbe648000
        .long 0xbe7dc000
        .long 0xbe8b4000
        .long 0xbe974000
        .long 0xbea31000
        .long 0xbeae9000
        .long 0xbeb9d000
        .long 0xbec4d000
        .long 0xbecfa000
        .long 0xbeda2000
        .long 0xbee48000
        .long 0xbeeea000
        .long 0xbef89000
        .long 0xbf012800
        .long 0xbf05f000
        .long 0xbf0aa800
        .long 0xbf0f4000
        .long 0xbf13c800
        .long 0xbf184000
        .long 0xbf1ca000
        .long 0xbf20f000
        .long 0xbf252800
        .long 0xbf295000
        .long 0xbf2d6800
        /*== Log_tbl_L ==*/
        /* Low-order companions of the Log_tbl_H entries, looked up with
           the same index.  */
        .align 64
        .long 0x80000000
        .long 0xb726c39e
        .long 0x3839e7fe
        .long 0xb7528ae5
        .long 0x377891d5
        .long 0xb8297c10
        .long 0x37cf8f58
        .long 0x3852b186
        .long 0x35838656
        .long 0xb80c36af
        .long 0x38235454
        .long 0xb862bae1
        .long 0x37e87bc7
        .long 0x37848150
        .long 0x37202511
        .long 0xb74e1b05
        .long 0x385c1340
        .long 0xb8777bcd
        .long 0x36038656
        .long 0xb7d40984
        .long 0xb80f5faf
        .long 0xb8254b4c
        .long 0xb865c84a
        .long 0x37f0b42d
        .long 0xb83ebce1
        .long 0xb83c2513
        .long 0x37a332c4
        .long 0x3779654f
        .long 0x38602f73
        .long 0x367449f8
        .long 0xb7b4996f
        .long 0xb800986b
        /*== One = 1.0 ==*/
        .align 64
        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        /*== AbsMask: clears the sign bit ==*/
        .align 64
        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
        /*== SmallThreshold = 2^-6 ==*/
        .align 64
        .long 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000
        /*== Threshold = 2^63 ==*/
        .align 64
        .long 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000
        /*== LargeThreshold = largest finite float ==*/
        .align 64
        .long 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff
        /*== ca1 (approximately -1/6): small-input polynomial coefficient ==*/
        .align 64
        .long 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE
        /*== c2s = 0.375 ==*/
        .align 64
        .long 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000
        /*== c1s = 0.5 ==*/
        .align 64
        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
        /*== AddB5: bit 17, added before masking with RcpBitMask ==*/
        .align 64
        .long 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
        /*== RcpBitMask: keeps sign, exponent and top 5 mantissa bits ==*/
        .align 64
        .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
        /*== OneEighth = 0.125 ==*/
        .align 64
        .long 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000
        /*== Four = 4.0 ==*/
        .align 64
        .long 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000
        /*== poly_coeff3 (approximately -0.25) ==*/
        .align 64
        .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
        /*== poly_coeff2 (approximately 1/3) ==*/
        .align 64
        .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
        /*== poly_coeff1 = -0.5 ==*/
        .align 64
        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
        /*== L2H = log(2)_high ==*/
        .align 64
        .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
        /*== L2L = log(2)_low ==*/
        .align 64
        .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
        .align 64
        .type	__svml_sasinh_data_internal_avx512,@object
        .size	__svml_sasinh_data_internal_avx512,.-__svml_sasinh_data_internal_avx512
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf4_core-sse2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf4_core-sse2.S
@ -0,0 +1,20 @@
 /* SSE2 version of vectorized asinhf, vector length is 4.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Rename the symbol defined by the included file so that this build of
    it is emitted as _ZGVbN4v_asinhf_sse2 instead of the public name
    _ZGVbN4v_asinhf.
    NOTE(review): presumably ../svml_s_asinhf4_core.S is the baseline
    4-lane implementation and this copy is the fallback picked by the
    ifunc selector when the SSE4 kernel is not usable -- confirm against
    that file and the selector header.  */
 #define _ZGVbN4v_asinhf _ZGVbN4v_asinhf_sse2
 #include "../svml_s_asinhf4_core.S"
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf4_core.c
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf4_core.c
@ -0,0 +1,28 @@
 /* Multiple versions of vectorized asinhf, vector length is 4.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Public vector-ABI symbol that this file multi-versions.  It must be
    defined before the selector header is included.  */
 #define SYMBOL_NAME _ZGVbN4v_asinhf
 /* NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined in this
    file; presumably they come from this header -- confirm.  */
 #include "ifunc-mathvec-sse4_1.h"
 /* Bind SYMBOL_NAME to whatever implementation IFUNC_SELECTOR () returns.
    NOTE(review): presumably this creates an IFUNC symbol resolved at load
    time -- confirm against the libc_ifunc_redirected definition.  */
 libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
 #ifdef SHARED
 /* Shared builds only: declare the internal __GI_ alias of the symbol
    with hidden visibility.  */
 __hidden_ver1 (_ZGVbN4v_asinhf, __GI__ZGVbN4v_asinhf,
 	       __redirect__ZGVbN4v_asinhf)
  __attribute__ ((visibility ("hidden")));
 #endif
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf4_core_sse4.S
@ -0,0 +1,509 @@
 /* Function asinhf vectorized with SSE4.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute asinh(x) as log(x + sqrt(x*x + 1))
 *
 *   Special cases:
 *
 *   asinh(NaN) = quiet NaN, and raise invalid exception
 *   asinh(INF) = that INF
 *   asinh(0)   = that 0
 *
 */
 /* Byte offsets into the data table __svml_sasinh_data_internal.
    Consecutive entries are 16 bytes apart (one 128-bit vector), except
    sPoly, which spans 128 bytes (offsets 32..159); the code addresses
    individual coefficient vectors as sPoly+80, sPoly+96 and sPoly+112.
    NOTE(review): these values must match the layout of the table
    definition for this file -- confirm when editing either one.
 */
 #define SgnMask                       	0
 #define sOne                          	16
 #define sPoly                         	32
 #define iBrkValue                     	160
 #define iOffExpoMask                  	176
 #define sBigThreshold                 	192
 #define sC2                           	208
 #define sC3                           	224
 #define sHalf                         	240
 #define sLargestFinite                	256
 #define sLittleThreshold              	272
 #define sSign                         	288
 #define sThirtyOne                    	304
 #define sTopMask11                    	320
 #define sTopMask8                     	336
 #define XScale                        	352
 #define sLn2                          	368
 #include <sysdep.h>
        .text
 	.section .text.sse4,"ax",@progbits
 ENTRY(_ZGVbN4v_asinhf_sse4)
        subq      $72, %rsp
        cfi_def_cfa_offset(80)
        movaps    %xmm0, %xmm8
 /*
 * Split X into high and low parts, XHi (<= 11 bits) and XLo (<= 13 bits)
 * We could use either X or |X| here, but it doesn't seem to matter
 */
        movups    sTopMask11+__svml_sasinh_data_internal(%rip), %xmm10
        movaps    %xmm8, %xmm2
        andps     %xmm8, %xmm10
 /*
 * Compute X^2 = (XHi + XLo)^2 = XHi^2 + XLo * (X + XHi)
 * The two parts are shifted off by around 11 bits. So even though
 * the low bit will not in general be exact, it's near enough
 */
        movaps    %xmm10, %xmm3
        subps     %xmm10, %xmm2
        mulps     %xmm10, %xmm3
        addps     %xmm8, %xmm10
 /* Load the constant 1 and a sign mask */
        movups    sOne+__svml_sasinh_data_internal(%rip), %xmm7
 /*
 * Finally, express Y + W = X^2 + 1 accurately where Y has <= 8 bits.
 * If |X| <= 1 then |XHi| <= 1 and so |X2Hi| <= 1, so we can treat 1
 * as the dominant component in the compensated summation. Otherwise,
 * if |X| >= 1, then since X2Hi only has 22 significant bits, the basic
 * addition will be exact anyway until we get to |X| >= 2^24. But by
 * that time the log function is well-conditioned enough that the
 * rounding error doesn't matter. Hence we can treat 1 as dominant even
 * if it literally isn't.
 */
        movaps    %xmm7, %xmm11
        movaps    %xmm7, %xmm4
        movups    sTopMask8+__svml_sasinh_data_internal(%rip), %xmm12
        addps     %xmm3, %xmm11
        mulps     %xmm10, %xmm2
        subps     %xmm11, %xmm4
        movaps    %xmm12, %xmm0
        addps     %xmm3, %xmm4
 /*
 * Unfortunately, we can still be in trouble if |X| <= 2^-5, since
 * the absolute error 2^-(7+24)-ish in sqrt(1 + X^2) gets scaled up
 * by 1/X and comes close to our threshold. Hence if |X| <= 2^-4,
 * perform an alternative computation
 * sqrt(1 + X^2) - 1 = X^2/2 - X^4/8 + X^6/16
 * X2 = X^2
 */
        addps     %xmm2, %xmm3
        addps     %xmm2, %xmm4
        andps     %xmm11, %xmm0
 /*
 * Compute R = 1/sqrt(Y + W) * (1 + d)
 * Force R to <= 8 significant bits.
 * This means that R * Y and R^2 * Y are exactly representable.
 */
        rsqrtps   %xmm0, %xmm14
        subps     %xmm0, %xmm11
        andps     %xmm12, %xmm14
        addps     %xmm11, %xmm4
 /*
 * Compute S = (Y/sqrt(Y + W)) * (1 + d)
 * and T = (W/sqrt(Y + W)) * (1 + d)
 * so that S + T = sqrt(Y + W) * (1 + d)
 * S is exact, and the rounding error in T is OK.
 */
        mulps     %xmm14, %xmm0
        mulps     %xmm14, %xmm4
 /*
 * Get the absolute value of the input, since we will exploit antisymmetry
 * and mostly assume X >= 0 in the core computation
 */
        movups    SgnMask+__svml_sasinh_data_internal(%rip), %xmm6
 /*
 * Compute e = -(2 * d + d^2)
 * The first FMR is exact, and the rounding error in the other is acceptable
 * since d and e are ~ 2^-8
 */
        movaps    %xmm14, %xmm13
        andps     %xmm8, %xmm6
 /*
 * Obtain sqrt(1 + X^2) - 1 in two pieces
 * sqrt(1 + X^2) - 1
 * = sqrt(Y + W) - 1
 * = (S + T) * (1 + Corr) - 1
 * = [S - 1] + [T + (S + T) * Corr]
 * We need a compensated summation for the last part. We treat S - 1
 * as the larger part; it certainly is until about X < 2^-4, and in that
 * case, the error is affordable since X dominates over sqrt(1 + X^2) - 1
 * Final sum is dTmp5 (hi) + dTmp7 (lo)
 */
        movaps    %xmm0, %xmm1
 /*
 * Check whether the input is finite, by checking |X| <= MaxFloat
 * Otherwise set the rangemask so that the callout will get used.
 * Note that this will also use the callout for NaNs since not(NaN <= MaxFloat)
 */
        movaps    %xmm6, %xmm9
 /*
 * The following computation can go wrong for very large X, basically
 * because X^2 overflows. But for large X we have
 * asinh(X) / log(2 X) - 1 =~= 1/(4 * X^2), so for X >= 2^30
 * we can just later stick X back into the log and tweak up the exponent.
 * Actually we scale X by 2^-30 and tweak the exponent up by 31,
 * to stay in the safe range for the later log computation.
 * Compute a flag now telling us when to do this.
 */
        movaps    %xmm6, %xmm5
        cmpnleps  sLargestFinite+__svml_sasinh_data_internal(%rip), %xmm9
        cmpltps   sBigThreshold+__svml_sasinh_data_internal(%rip), %xmm5
        mulps     %xmm0, %xmm13
        addps     %xmm4, %xmm1
        subps     %xmm7, %xmm0
        mulps     %xmm4, %xmm14
        movmskps  %xmm9, %edx
        movaps    %xmm7, %xmm9
 /*
 * Now       1 / (1 + d)
 * = 1 / (1 + (sqrt(1 - e) - 1))
 * = 1 / sqrt(1 - e)
 * = 1 + 1/2 * e + 3/8 * e^2 + 5/16 * e^3 + 35/128 * e^4 + ...
 * So compute the first three nonconstant terms of that, so that
 * we have a relative correction (1 + Corr) to apply to S etc.
 * C1 = 1/2
 * C2 = 3/8
 * C3 = 5/16
 */
        movups    sC3+__svml_sasinh_data_internal(%rip), %xmm15
        subps     %xmm13, %xmm9
        movups    sHalf+__svml_sasinh_data_internal(%rip), %xmm10
        subps     %xmm14, %xmm9
 /* sX2over2 = X^2/2 */
        mulps     %xmm10, %xmm3
        mulps     %xmm9, %xmm15
 /* sX46 = -X^4/4 + X^6/8 */
        movaps    %xmm3, %xmm2
        movaps    %xmm3, %xmm12
 /*
 * Now do another compensated sum to add |X| + [sqrt(1 + X^2) - 1].
 * It's always safe to assume |X| is larger.
 * This is the final 2-part argument to the log1p function
 */
        movaps    %xmm6, %xmm14
        addps     sC2+__svml_sasinh_data_internal(%rip), %xmm15
        mulps     %xmm9, %xmm15
        addps     %xmm10, %xmm15
        mulps     %xmm15, %xmm9
        mulps     %xmm1, %xmm9
 /* Now multiplex to the case X = 2^-30 * input, Xl = sL = 0 in the "big" case. */
        movups    XScale+__svml_sasinh_data_internal(%rip), %xmm15
        addps     %xmm9, %xmm4
        movaps    %xmm4, %xmm11
        addps     %xmm0, %xmm11
        subps     %xmm11, %xmm0
        addps     %xmm0, %xmm4
 /* sX4over4 = X^4/4 */
        movaps    %xmm3, %xmm0
        mulps     %xmm3, %xmm0
        mulps     %xmm0, %xmm2
        subps     %xmm0, %xmm2
 /*
 * Now we feed into the log1p code, using H in place of _VARG1 and
 * also adding L into Xl.
 * compute 1+x as high, low parts
 */
        movaps    %xmm7, %xmm0
 /* sX46over2 = -X^4/8 + x^6/16 */
        mulps     %xmm2, %xmm10
        movaps    %xmm7, %xmm2
        addps     %xmm10, %xmm12
        subps     %xmm12, %xmm3
        addps     %xmm3, %xmm10
 /* Now multiplex the two possible computations */
        movaps    %xmm6, %xmm3
        cmpleps   sLittleThreshold+__svml_sasinh_data_internal(%rip), %xmm3
        movaps    %xmm3, %xmm13
        andps     %xmm3, %xmm12
        andnps    %xmm11, %xmm13
        movaps    %xmm3, %xmm1
        orps      %xmm12, %xmm13
        andnps    %xmm4, %xmm1
        andps     %xmm3, %xmm10
        movaps    %xmm6, %xmm4
        orps      %xmm10, %xmm1
        addps     %xmm13, %xmm14
        mulps     %xmm15, %xmm6
        maxps     %xmm14, %xmm0
        minps     %xmm14, %xmm2
        subps     %xmm14, %xmm4
        movaps    %xmm0, %xmm3
        addps     %xmm4, %xmm13
        addps     %xmm2, %xmm3
        addps     %xmm13, %xmm1
        subps     %xmm3, %xmm0
        movaps    %xmm5, %xmm4
        andps     %xmm5, %xmm3
        andnps    %xmm6, %xmm4
        addps     %xmm0, %xmm2
 /*
 * Now resume the main code.
 * reduction: compute r,n
 */
        movdqu    iBrkValue+__svml_sasinh_data_internal(%rip), %xmm6
        orps      %xmm3, %xmm4
        psubd     %xmm6, %xmm4
        movaps    %xmm7, %xmm0
        addps     %xmm2, %xmm1
        movdqu    iOffExpoMask+__svml_sasinh_data_internal(%rip), %xmm2
        pand      %xmm4, %xmm2
        psrad     $23, %xmm4
        cvtdq2ps  %xmm4, %xmm3
        pslld     $23, %xmm4
        andps     %xmm5, %xmm1
        paddd     %xmm6, %xmm2
        psubd     %xmm4, %xmm0
        mulps     %xmm0, %xmm1
 /* polynomial evaluation */
        subps     %xmm7, %xmm2
        movups    sPoly+112+__svml_sasinh_data_internal(%rip), %xmm7
        addps     %xmm2, %xmm1
        mulps     %xmm1, %xmm7
        movaps    %xmm5, %xmm2
 /* Add 31 to the exponent in the "large" case to get log(2 * input) */
        movups    sThirtyOne+__svml_sasinh_data_internal(%rip), %xmm0
        addps     sPoly+96+__svml_sasinh_data_internal(%rip), %xmm7
        addps     %xmm3, %xmm0
        mulps     %xmm1, %xmm7
        andnps    %xmm0, %xmm2
        andps     %xmm5, %xmm3
        orps      %xmm3, %xmm2
        addps     sPoly+80+__svml_sasinh_data_internal(%rip), %xmm7
 /* final reconstruction */
        mulps     sLn2+__svml_sasinh_data_internal(%rip), %xmm2
        mulps     %xmm1, %xmm7
 /* Finally, reincorporate the original sign. */
        movups    sSign+__svml_sasinh_data_internal(%rip), %xmm0
        andps     %xmm8, %xmm0
        addps     sPoly+64+__svml_sasinh_data_internal(%rip), %xmm7
        mulps     %xmm1, %xmm7
        addps     sPoly+48+__svml_sasinh_data_internal(%rip), %xmm7
        mulps     %xmm1, %xmm7
        addps     sPoly+32+__svml_sasinh_data_internal(%rip), %xmm7
        mulps     %xmm1, %xmm7
        addps     sPoly+16+__svml_sasinh_data_internal(%rip), %xmm7
        mulps     %xmm1, %xmm7
        addps     sPoly+__svml_sasinh_data_internal(%rip), %xmm7
        mulps     %xmm1, %xmm7
        mulps     %xmm1, %xmm7
        addps     %xmm7, %xmm1
        addps     %xmm2, %xmm1
        pxor      %xmm1, %xmm0
        testl     %edx, %edx
 /* Go to special inputs processing branch */
        jne       L(SPECIAL_VALUES_BRANCH)
                                # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm8
 /* Restore registers
 * and exit the function
 */
 L(EXIT):
        addq      $72, %rsp
        cfi_def_cfa_offset(8)
        ret
        cfi_def_cfa_offset(80)
 /* Branch to process
 * special inputs
 */
 L(SPECIAL_VALUES_BRANCH):
        movups    %xmm8, 32(%rsp)
        movups    %xmm0, 48(%rsp)
                                # LOE rbx rbp r12 r13 r14 r15 edx
        xorl      %eax, %eax
        movq      %r12, 16(%rsp)
        cfi_offset(12, -64)
        movl      %eax, %r12d
        movq      %r13, 8(%rsp)
        cfi_offset(13, -72)
        movl      %edx, %r13d
        movq      %r14, (%rsp)
        cfi_offset(14, -80)
                                # LOE rbx rbp r15 r12d r13d
 /* Range mask
 * bits check
 */
 L(RANGEMASK_CHECK):
        btl       %r12d, %r13d
 /* Call scalar math function */
        jc        L(SCALAR_MATH_CALL)
                                # LOE rbx rbp r15 r12d r13d
 /* Special inputs
 * processing loop
 */
 L(SPECIAL_VALUES_LOOP):
        incl      %r12d
        cmpl      $4, %r12d
 /* Check bits in range mask */
        jl        L(RANGEMASK_CHECK)
                                # LOE rbx rbp r15 r12d r13d
        movq      16(%rsp), %r12
        cfi_restore(12)
        movq      8(%rsp), %r13
        cfi_restore(13)
        movq      (%rsp), %r14
        cfi_restore(14)
        movups    48(%rsp), %xmm0
 /* Go to exit */
        jmp       L(EXIT)
        cfi_offset(12, -64)
        cfi_offset(13, -72)
        cfi_offset(14, -80)
                                # LOE rbx rbp r12 r13 r14 r15 xmm0
 /* Scalar math function call
 * to process special input
 */
 L(SCALAR_MATH_CALL):
        movl      %r12d, %r14d
        movss     32(%rsp,%r14,4), %xmm0
        call      asinhf@PLT
                                # LOE rbx rbp r14 r15 r12d r13d xmm0
        movss     %xmm0, 48(%rsp,%r14,4)
 /* Process special inputs in loop */
        jmp       L(SPECIAL_VALUES_LOOP)
                                # LOE rbx rbp r15 r12d r13d
 END(_ZGVbN4v_asinhf_sse4)
        .section .rodata, "a"
        .align 16
 /*
  * Constant table for the 4-lane (16-byte vector) asinhf kernel.
  * Every entry is one 16-byte row holding the same 32-bit pattern in all
  * four lanes; sPoly is eight consecutive rows.
  *
  * The typedef below mirrors that layout in C.  Its guard macro is not
  * defined anywhere in the visible source, so it appears to serve as
  * documentation only -- TODO confirm.
  */
 #ifdef __svml_sasinh_data_internal_typedef
 typedef unsigned int VUINT32;
 typedef struct {
        __declspec(align(16)) VUINT32 SgnMask[4][1];
        __declspec(align(16)) VUINT32 sOne[4][1];
        __declspec(align(16)) VUINT32 sPoly[8][4][1];
        __declspec(align(16)) VUINT32 iBrkValue[4][1];
        __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
        __declspec(align(16)) VUINT32 sBigThreshold[4][1];
        __declspec(align(16)) VUINT32 sC2[4][1];
        __declspec(align(16)) VUINT32 sC3[4][1];
        __declspec(align(16)) VUINT32 sHalf[4][1];
        __declspec(align(16)) VUINT32 sLargestFinite[4][1];
        __declspec(align(16)) VUINT32 sLittleThreshold[4][1];
        __declspec(align(16)) VUINT32 sSign[4][1];
        __declspec(align(16)) VUINT32 sThirtyOne[4][1];
        __declspec(align(16)) VUINT32 sTopMask11[4][1];
        __declspec(align(16)) VUINT32 sTopMask8[4][1];
        __declspec(align(16)) VUINT32 XScale[4][1];
        __declspec(align(16)) VUINT32 sLn2[4][1];
 } __svml_sasinh_data_internal;
 #endif
 __svml_sasinh_data_internal:
        /*== SgnMask = every bit except the sign bit (bit 31) ==*/
        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
        /*== sOne = SP 1.0 ==*/
        .align 16
        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        /*== sPoly[] = SP polynomial ==*/
        /* Rows P0..P7 sit at sPoly+0 .. sPoly+112; the kernel reads them
           starting with P7 (sPoly+112) and ending with P0 (sPoly+0).  */
        .align 16
        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
        .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /*  3.3333265781402587890625000e-01 P1 */
        .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
        .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /*  2.0007920265197753906250000e-01 P3 */
        .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
        .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /*  1.4042308926582336425781250e-01 P5 */
        .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
        .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /*  1.3820238411426544189453125e-01 P7 */
        /*== iBrkValue = SP 2/3 ==*/
        .align 16
        .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
        /*== iOffExpoMask = SP significand mask ==*/
        /* Low 23 bits set: selects the stored significand field.  */
        .align 16
        .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
        /*== sBigThreshold = SP 2^30 ==*/
        .align 16
        .long 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000
        /*== sC2 = SP 0.375 (3/8) ==*/
        .align 16
        .long 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000
        /*== sC3 = SP 0.3125 (5/16) ==*/
        .align 16
        .long 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000
        /*== sHalf = SP 0.5 ==*/
        .align 16
        .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
        /*== sLargestFinite = largest finite SP value ==*/
        .align 16
        .long 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF
        /*== sLittleThreshold = SP 2^-4 (0.0625) ==*/
        .align 16
        .long 0x3D800000, 0x3D800000, 0x3D800000, 0x3D800000
        /*== sSign = sign bit only (bit 31) ==*/
        .align 16
        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
        /*== sThirtyOne = SP 31.0 ==*/
        .align 16
        .long 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000
        /*== sTopMask11 ==*/
        /* Clears the low 13 bits: keeps sign, exponent and the top 10 stored
           significand bits.  */
        .align 16
        .long 0xFFFFE000, 0xFFFFE000, 0xFFFFE000, 0xFFFFE000
        /*== sTopMask8 ==*/
        /* Clears the low 16 bits: keeps sign, exponent and the top 7 stored
           significand bits.  */
        .align 16
        .long 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000
        /*== XScale = SP 2^-30 ==*/
        .align 16
        .long 0x30800000, 0x30800000, 0x30800000, 0x30800000
        /*== sLn2 = SP ln(2) ==*/
        .align 16
        .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
        .align 16
        .type	__svml_sasinh_data_internal,@object
        .size	__svml_sasinh_data_internal,.-__svml_sasinh_data_internal
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core-sse.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core-sse.S
@ -0,0 +1,20 @@
 /* SSE version of vectorized asinhf, vector length is 8.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Rename the symbol before pulling in the generic length-8 source: every
    use of _ZGVdN8v_asinhf inside ../svml_s_asinhf8_core.S is replaced by
    _ZGVdN8v_asinhf_sse_wrapper, so that file's code is emitted here under
    the wrapper name.  */
 #define _ZGVdN8v_asinhf _ZGVdN8v_asinhf_sse_wrapper
 #include "../svml_s_asinhf8_core.S"
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core.c
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core.c
@ -0,0 +1,28 @@
 /* Multiple versions of vectorized asinhf, vector length is 8.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /* Name of the public symbol the dispatcher below is created for.  It must
    be defined before the header is included.  */
 #define SYMBOL_NAME _ZGVdN8v_asinhf
 #include "ifunc-mathvec-avx2.h"
 /* REDIRECT_NAME and IFUNC_SELECTOR are not defined in this file; presumably
    they come from ifunc-mathvec-avx2.h and choose the implementation at
    load time -- confirm against that header.  */
 libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
 #ifdef SHARED
 /* Shared builds only.  Presumably declares __GI__ZGVdN8v_asinhf as a
    hidden-visibility alias of the symbol -- confirm against the definition
    of __hidden_ver1.  */
 __hidden_ver1 (_ZGVdN8v_asinhf, __GI__ZGVdN8v_asinhf,
 	       __redirect__ZGVdN8v_asinhf)
  __attribute__ ((visibility ("hidden")));
 #endif
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
@ -0,0 +1,457 @@
 /* Function asinhf vectorized with AVX2.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 /*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute asinh(x) as log(x + sqrt(x*x + 1))
 *
 *   Special cases:
 *
 *   asinh(NaN) = quiet NaN, and raise invalid exception
 *   asinh(INF) = that INF
 *   asinh(0)   = that 0
 *
 */
 /* Byte offsets of the fields of data table __svml_sasinh_data_internal.
  * Each field is one 32-byte row (eight 32-bit lanes) except sPoly, which
  * spans eight rows (bytes 64..319).  The order and spacing must match the
  * table emitted in .rodata at the end of this file.
  */
 #define SgnMask                       	0
 #define sOne                          	32
 #define sPoly                         	64
 #define iBrkValue                     	320
 #define iOffExpoMask                  	352
 #define sBigThreshold                 	384
 #define sC2                           	416
 #define sC3                           	448
 #define sHalf                         	480
 #define sLargestFinite                	512
 #define sLittleThreshold              	544
 #define sSign                         	576
 #define sThirtyOne                    	608
 #define sTopMask8                     	640
 #define XScale                        	672
 #define sLn2                          	704
 #include <sysdep.h>
        .text
 	.section .text.avx2,"ax",@progbits
 /*
  * _ZGVdN8v_asinhf_avx2: asinhf of the eight packed floats in %ymm0; the
  * result is returned in %ymm0.
  *
  * Fast path: every lane is computed in registers.
  * Slow path: each lane whose |x| is not <= sLargestFinite sets a bit in
  * %edx; once the vector result is ready, those lanes are overwritten one
  * at a time with the result of a call to scalar asinhf.
  *
  * Stack frame (96 bytes below a 32-byte aligned %rsp):
  *    0(%rsp)  saved %r14
  *    8(%rsp)  saved %r13
  *   16(%rsp)  saved %r12
  *   32(%rsp)  copy of the input vector
  *   64(%rsp)  result vector, patched lane by lane on the slow path
  */
 ENTRY(_ZGVdN8v_asinhf_avx2)
        pushq     %rbp
        cfi_def_cfa_offset(16)
        movq      %rsp, %rbp
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
        andq      $-32, %rsp
        subq      $96, %rsp
 /* Keep the untouched input in ymm9: it supplies the sign at the end and is
  * what the scalar fallback re-reads.
  */
        vmovaps   %ymm0, %ymm9
 /* Load the constant 1.0 into ymm8 */
        vmovups   sOne+__svml_sasinh_data_internal(%rip), %ymm8
 /* No need to split X when FMA is available in hardware. */
        vmulps    %ymm9, %ymm9, %ymm5
 /* ymm1 = sTopMask8, used below to truncate both Y and R */
        vmovups   sTopMask8+__svml_sasinh_data_internal(%rip), %ymm1
 /*
 * Finally, express Y + W = X^2 + 1 accurately where Y has <= 8 bits.
 * If |X| <= 1 then |XHi| <= 1 and so |X2Hi| <= 1, so we can treat 1
 * as the dominant component in the compensated summation. Otherwise,
 * if |X| >= 1, then since X2Hi only has 22 significant bits, the basic
 * addition will be exact anyway until we get to |X| >= 2^24. But by
 * that time the log function is well-conditioned enough that the
 * rounding error doesn't matter. Hence we can treat 1 as dominant even
 * if it literally isn't.
 */
        vaddps    %ymm5, %ymm8, %ymm13
        vandps    %ymm1, %ymm13, %ymm2
        vmovaps   %ymm9, %ymm4
        vsubps    %ymm13, %ymm8, %ymm11
        vsubps    %ymm2, %ymm13, %ymm15
 /*
 * Compute R = 1/sqrt(Y + W) * (1 + d)
 * Force R to <= 8 significant bits.
 * This means that R * Y and R^2 * Y are exactly representable.
 */
        vrsqrtps  %ymm2, %ymm0
        vfmsub213ps %ymm5, %ymm9, %ymm4
        vaddps    %ymm11, %ymm5, %ymm12
 /*
 * Get the absolute value of the input, since we will exploit antisymmetry
 * and mostly assume X >= 0 in the core computation
 */
        vandps    SgnMask+__svml_sasinh_data_internal(%rip), %ymm9, %ymm6
 /*
 * Check whether the input is finite, by checking |X| <= MaxFloat
 * Otherwise set the rangemask so that the callout will get used.
 * Note that this will also use the callout for NaNs since not(NaN <= MaxFloat)
 */
        vcmpnle_uqps sLargestFinite+__svml_sasinh_data_internal(%rip), %ymm6, %ymm10
        vaddps    %ymm12, %ymm4, %ymm14
 /*
 * Unfortunately, we can still be in trouble if |X| <= 2^-5, since
 * the absolute error 2^-(7+24)-ish in sqrt(1 + X^2) gets scaled up
 * by 1/X and comes close to our threshold. Hence if |X| <= 2^-4,
 * perform an alternative computation
 * sqrt(1 + X^2) - 1 = X^2/2 - X^4/8 + X^6/16
 * X2 = X^2
 */
        vaddps    %ymm4, %ymm5, %ymm4
 /*
 * The following computation can go wrong for very large X, basically
 * because X^2 overflows. But for large X we have
 * asinh(X) / log(2 X) - 1 =~= 1/(4 * X^2), so for X >= 2^30
 * we can just later stick X back into the log and tweak up the exponent.
 * Actually we scale X by 2^-30 and tweak the exponent up by 31,
 * to stay in the safe range for the later log computation.
 * Compute a flag now telling us when to do this.
 * (ymm7 is set in the lanes where |X| < sBigThreshold, i.e. the lanes
 * that do NOT need the scaling.)
 */
        vcmplt_oqps sBigThreshold+__svml_sasinh_data_internal(%rip), %ymm6, %ymm7
        vaddps    %ymm15, %ymm14, %ymm3
 /*
 * Now       1 / (1 + d)
 * = 1 / (1 + (sqrt(1 - e) - 1))
 * = 1 / sqrt(1 - e)
 * = 1 + 1/2 * e + 3/8 * e^2 + 5/16 * e^3 + 35/128 * e^4 + ...
 * So compute the first three nonconstant terms of that, so that
 * we have a relative correction (1 + Corr) to apply to S etc.
 * C1 = 1/2
 * C2 = 3/8
 * C3 = 5/16
 */
        vmovups   sC3+__svml_sasinh_data_internal(%rip), %ymm12
 /* edx = one bit per lane that failed the finiteness test above */
        vmovmskps %ymm10, %edx
        vandps    %ymm1, %ymm0, %ymm10
 /*
 * Compute S = (Y/sqrt(Y + W)) * (1 + d)
 * and T = (W/sqrt(Y + W)) * (1 + d)
 * so that S + T = sqrt(Y + W) * (1 + d)
 * S is exact, and the rounding error in T is OK.
 */
        vmulps    %ymm10, %ymm2, %ymm15
        vmulps    %ymm3, %ymm10, %ymm14
        vmovups   sHalf+__svml_sasinh_data_internal(%rip), %ymm3
        vsubps    %ymm8, %ymm15, %ymm0
 /*
 * Obtain sqrt(1 + X^2) - 1 in two pieces
 * sqrt(1 + X^2) - 1
 * = sqrt(Y + W) - 1
 * = (S + T) * (1 + Corr) - 1
 * = [S - 1] + [T + (S + T) * Corr]
 * We need a compensated summation for the last part. We treat S - 1
 * as the larger part; it certainly is until about X < 2^-4, and in that
 * case, the error is affordable since X dominates over sqrt(1 + X^2) - 1
 * Final sum is dTmp5 (hi) + dTmp7 (lo)
 */
        vaddps    %ymm14, %ymm15, %ymm13
 /*
 * Compute e = -(2 * d + d^2)
 * The first FMR is exact, and the rounding error in the other is acceptable
 * since d and e are ~ 2^-8
 */
        vmovaps   %ymm8, %ymm11
        vfnmadd231ps %ymm15, %ymm10, %ymm11
        vfnmadd231ps %ymm14, %ymm10, %ymm11
        vfmadd213ps sC2+__svml_sasinh_data_internal(%rip), %ymm11, %ymm12
        vfmadd213ps %ymm3, %ymm11, %ymm12
        vmulps    %ymm12, %ymm11, %ymm1
 /* Now multiplex the two possible computations */
 /* ymm11 is set in the lanes where |X| <= sLittleThreshold */
        vcmple_oqps sLittleThreshold+__svml_sasinh_data_internal(%rip), %ymm6, %ymm11
        vfmadd213ps %ymm14, %ymm13, %ymm1
        vaddps    %ymm0, %ymm1, %ymm2
        vsubps    %ymm2, %ymm0, %ymm10
 /* sX2over2 = X^2/2 */
        vmulps    %ymm4, %ymm3, %ymm0
        vaddps    %ymm10, %ymm1, %ymm1
 /* sX4over4 = X^4/4 */
        vmulps    %ymm0, %ymm0, %ymm5
 /* sX46 = -X^4/4 + X^6/8 */
        vfmsub231ps %ymm0, %ymm5, %ymm5
 /* sX46over2 = -X^4/8 + x^6/16 */
        vmulps    %ymm5, %ymm3, %ymm3
        vaddps    %ymm3, %ymm0, %ymm5
        vblendvps %ymm11, %ymm5, %ymm2, %ymm2
        vsubps    %ymm5, %ymm0, %ymm4
 /*
 * Now do another compensated sum to add |X| + [sqrt(1 + X^2) - 1].
 * It's always safe to assume |X| is larger.
 * This is the final 2-part argument to the log1p function
 */
        vaddps    %ymm2, %ymm6, %ymm14
 /*
 * Now resume the main code.
 * reduction: compute r,n
 */
        vmovups   iBrkValue+__svml_sasinh_data_internal(%rip), %ymm5
        vaddps    %ymm4, %ymm3, %ymm10
 /*
 * Now we feed into the log1p code, using H in place of _VARG1 and
 * also adding L into Xl.
 * compute 1+x as high, low parts
 */
        vmaxps    %ymm14, %ymm8, %ymm15
        vminps    %ymm14, %ymm8, %ymm0
        vblendvps %ymm11, %ymm10, %ymm1, %ymm12
        vsubps    %ymm14, %ymm6, %ymm1
        vaddps    %ymm0, %ymm15, %ymm3
 /* Now multiplex to the case X = 2^-30 * input, Xl = sL = 0 in the "big" case. */
        vmulps    XScale+__svml_sasinh_data_internal(%rip), %ymm6, %ymm6
        vaddps    %ymm1, %ymm2, %ymm13
        vsubps    %ymm3, %ymm15, %ymm15
        vaddps    %ymm13, %ymm12, %ymm1
        vaddps    %ymm15, %ymm0, %ymm2
        vblendvps %ymm7, %ymm3, %ymm6, %ymm0
        vaddps    %ymm2, %ymm1, %ymm4
 /*
 * The next instructions work on the bit pattern as integers:
 * ymm6  = (bits - iBrkValue) >> 23 (arithmetic shift), converted to float
 *         in ymm0 further down;
 * ymm13 = ((bits - iBrkValue) & iOffExpoMask) + iBrkValue;
 * ymm12 = bits(1.0) - (ymm6 << 23), i.e. 1.0 with its exponent field
 *         lowered by ymm6.
 */
        vpsubd    %ymm5, %ymm0, %ymm1
        vpsrad    $23, %ymm1, %ymm6
        vpand     iOffExpoMask+__svml_sasinh_data_internal(%rip), %ymm1, %ymm2
        vmovups   sPoly+224+__svml_sasinh_data_internal(%rip), %ymm1
        vpslld    $23, %ymm6, %ymm10
        vpaddd    %ymm5, %ymm2, %ymm13
        vcvtdq2ps %ymm6, %ymm0
        vpsubd    %ymm10, %ymm8, %ymm12
 /* polynomial evaluation */
        vsubps    %ymm8, %ymm13, %ymm8
 /* Add 31 to the exponent in the "large" case to get log(2 * input) */
        vaddps    sThirtyOne+__svml_sasinh_data_internal(%rip), %ymm0, %ymm3
 /* The low part is zeroed in the "big" lanes (ymm7 clear there) */
        vandps    %ymm7, %ymm4, %ymm11
        vmulps    %ymm12, %ymm11, %ymm14
        vblendvps %ymm7, %ymm0, %ymm3, %ymm0
        vaddps    %ymm8, %ymm14, %ymm2
 /* Horner chain over sPoly, from the row at +224 down to the row at +0 */
        vfmadd213ps sPoly+192+__svml_sasinh_data_internal(%rip), %ymm2, %ymm1
        vfmadd213ps sPoly+160+__svml_sasinh_data_internal(%rip), %ymm2, %ymm1
        vfmadd213ps sPoly+128+__svml_sasinh_data_internal(%rip), %ymm2, %ymm1
        vfmadd213ps sPoly+96+__svml_sasinh_data_internal(%rip), %ymm2, %ymm1
        vfmadd213ps sPoly+64+__svml_sasinh_data_internal(%rip), %ymm2, %ymm1
        vfmadd213ps sPoly+32+__svml_sasinh_data_internal(%rip), %ymm2, %ymm1
        vfmadd213ps sPoly+__svml_sasinh_data_internal(%rip), %ymm2, %ymm1
        vmulps    %ymm1, %ymm2, %ymm4
        vfmadd213ps %ymm2, %ymm2, %ymm4
 /* final reconstruction */
        vfmadd132ps sLn2+__svml_sasinh_data_internal(%rip), %ymm4, %ymm0
 /* Finally, reincorporate the original sign. */
        vandps    sSign+__svml_sasinh_data_internal(%rip), %ymm9, %ymm7
        vxorps    %ymm0, %ymm7, %ymm0
 /* Any bit set in edx means at least one lane needs the scalar fallback */
        testl     %edx, %edx
 /* Go to special inputs processing branch */
        jne       L(SPECIAL_VALUES_BRANCH)
                                # LOE rbx r12 r13 r14 r15 edx ymm0 ymm9
 /* Restore registers
 * and exit the function
 */
 L(EXIT):
        movq      %rbp, %rsp
        popq      %rbp
        cfi_def_cfa(7, 8)
        cfi_restore(6)
        ret
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
 /* Branch to process
 * special inputs
 */
 L(SPECIAL_VALUES_BRANCH):
 /* Spill the input (ymm9) and the vector result (ymm0) so that single
  * lanes can be read and patched through memory.
  */
        vmovups   %ymm9, 32(%rsp)
        vmovups   %ymm0, 64(%rsp)
                                # LOE rbx r12 r13 r14 r15 edx ymm0
        xorl      %eax, %eax
                                # LOE rbx r12 r13 r14 r15 eax edx
 /* Presumably clears the upper ymm halves to avoid AVX/SSE transition
  * penalties in the scalar code called below -- TODO confirm.
  */
        vzeroupper
 /* r12, r13 and r14 are saved in the frame and then reused:
  * r12d = lane index (starts at 0), r13d = range mask from edx,
  * r14  = lane index held across the scalar call.
  */
        movq      %r12, 16(%rsp)
        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
        movl      %eax, %r12d
        movq      %r13, 8(%rsp)
        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
        movl      %edx, %r13d
        movq      %r14, (%rsp)
        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
                                # LOE rbx r15 r12d r13d
 /* Range mask
 * bits check
 */
 L(RANGEMASK_CHECK):
 /* CF = bit r12d of the range mask */
        btl       %r12d, %r13d
 /* Call scalar math function */
        jc        L(SCALAR_MATH_CALL)
                                # LOE rbx r15 r12d r13d
 /* Special inputs
 * processing loop
 */
 L(SPECIAL_VALUES_LOOP):
        incl      %r12d
 /* Eight lanes per vector */
        cmpl      $8, %r12d
 /* Check bits in range mask */
        jl        L(RANGEMASK_CHECK)
                                # LOE rbx r15 r12d r13d
        movq      16(%rsp), %r12
        cfi_restore(12)
        movq      8(%rsp), %r13
        cfi_restore(13)
        movq      (%rsp), %r14
        cfi_restore(14)
 /* Reload the patched result vector as the return value */
        vmovups   64(%rsp), %ymm0
 /* Go to exit */
        jmp       L(EXIT)
        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
                                # LOE rbx r12 r13 r14 r15 ymm0
 /* Scalar math function call
 * to process special input
 */
 L(SCALAR_MATH_CALL):
 /* r14 = lane index; the 32-bit move zero-extends it for use as an index */
        movl      %r12d, %r14d
 /* Load lane r14 of the saved input, call asinhf, store into the same lane
  * of the saved result.
  */
        movss     32(%rsp,%r14,4), %xmm0
        call      asinhf@PLT
                                # LOE rbx r14 r15 r12d r13d xmm0
        movss     %xmm0, 64(%rsp,%r14,4)
 /* Process special inputs in loop */
        jmp       L(SPECIAL_VALUES_LOOP)
                                # LOE rbx r15 r12d r13d
 END(_ZGVdN8v_asinhf_avx2)
        .section .rodata, "a"
        .align 32
 /*
  * Constant table for the 8-lane (32-byte vector) asinhf kernel.
  * Every entry is one 32-byte row holding the same 32-bit pattern in all
  * eight lanes; sPoly is eight consecutive rows.  The row order must agree
  * with the offset #defines near the top of this file.
  *
  * The typedef below mirrors that layout in C.  Its guard macro is not
  * defined anywhere in the visible source, so it appears to serve as
  * documentation only -- TODO confirm.
  */
 #ifdef __svml_sasinh_data_internal_typedef
 typedef unsigned int VUINT32;
 typedef struct {
        __declspec(align(32)) VUINT32 SgnMask[8][1];
        __declspec(align(32)) VUINT32 sOne[8][1];
        __declspec(align(32)) VUINT32 sPoly[8][8][1];
        __declspec(align(32)) VUINT32 iBrkValue[8][1];
        __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
        __declspec(align(32)) VUINT32 sBigThreshold[8][1];
        __declspec(align(32)) VUINT32 sC2[8][1];
        __declspec(align(32)) VUINT32 sC3[8][1];
        __declspec(align(32)) VUINT32 sHalf[8][1];
        __declspec(align(32)) VUINT32 sLargestFinite[8][1];
        __declspec(align(32)) VUINT32 sLittleThreshold[8][1];
        __declspec(align(32)) VUINT32 sSign[8][1];
        __declspec(align(32)) VUINT32 sThirtyOne[8][1];
        __declspec(align(32)) VUINT32 sTopMask8[8][1];
        __declspec(align(32)) VUINT32 XScale[8][1];
        __declspec(align(32)) VUINT32 sLn2[8][1];
 } __svml_sasinh_data_internal;
 #endif
 __svml_sasinh_data_internal:
        /*== SgnMask = every bit except the sign bit (bit 31) ==*/
        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
        /*== sOne = SP 1.0 ==*/
        .align 32
        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        /*== sPoly[] = SP polynomial ==*/
        /* Rows P0..P7 sit at sPoly+0 .. sPoly+224; the kernel reads them
           starting with P7 (sPoly+224) and ending with P0 (sPoly+0).  */
        .align 32
        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
        .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /*  3.3333265781402587890625000e-01 P1 */
        .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
        .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /*  2.0007920265197753906250000e-01 P3 */
        .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
        .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /*  1.4042308926582336425781250e-01 P5 */
        .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
        .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /*  1.3820238411426544189453125e-01 P7 */
        /*== iBrkValue = SP 2/3 ==*/
        .align 32
        .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
        /*== iOffExpoMask = SP significand mask ==*/
        /* Low 23 bits set: selects the stored significand field.  */
        .align 32
        .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
        /*== sBigThreshold = SP 2^30 ==*/
        .align 32
        .long 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000
        /*== sC2 = SP 0.375 (3/8) ==*/
        .align 32
        .long 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000
        /*== sC3 = SP 0.3125 (5/16) ==*/
        .align 32
        .long 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000
        /*== sHalf = SP 0.5 ==*/
        .align 32
        .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
        /*== sLargestFinite = largest finite SP value ==*/
        .align 32
        .long 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF
        /*== sLittleThreshold = SP 2^-4 (0.0625) ==*/
        .align 32
        .long 0x3D800000, 0x3D800000, 0x3D800000, 0x3D800000, 0x3D800000, 0x3D800000, 0x3D800000, 0x3D800000
        /*== sSign = sign bit only (bit 31) ==*/
        .align 32
        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
        /*== sThirtyOne = SP 31.0 ==*/
        .align 32
        .long 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000
        /*== sTopMask8 ==*/
        /* Clears the low 16 bits: keeps sign, exponent and the top 7 stored
           significand bits.  */
        .align 32
        .long 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000
        /*== XScale = SP 2^-30 ==*/
        .align 32
        .long 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000
        /*== sLn2 = SP ln(2) ==*/
        .align 32
        .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
        .align 32
        .type	__svml_sasinh_data_internal,@object
        .size	__svml_sasinh_data_internal,.-__svml_sasinh_data_internal
--- a/sysdeps/x86_64/fpu/svml_d_asinh2_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_asinh2_core.S
@ -0,0 +1,29 @@
 /* Function asinh vectorized with SSE2.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 #include "svml_d_wrapper_impl.h"
 	.text
 /* The whole body of _ZGVbN2v_asinh comes from the WRAPPER_IMPL_SSE2 macro
    (svml_d_wrapper_impl.h), instantiated with the scalar function asinh.
    The macro body is not visible here; presumably it calls asinh once per
    vector lane -- confirm against that header.  */
 ENTRY (_ZGVbN2v_asinh)
 WRAPPER_IMPL_SSE2 asinh
 END (_ZGVbN2v_asinh)
 /* The hidden definition is emitted only when USE_MULTIARCH is not defined;
    presumably the multiarch build supplies it elsewhere.  */
 #ifndef USE_MULTIARCH
 libmvec_hidden_def (_ZGVbN2v_asinh)
 #endif
--- a/sysdeps/x86_64/fpu/svml_d_asinh4_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_asinh4_core.S
@ -0,0 +1,29 @@
 /* Function asinh vectorized with AVX2, wrapper version.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 #include "svml_d_wrapper_impl.h"
 	.text
 /* The whole body of _ZGVdN4v_asinh comes from the WRAPPER_IMPL_AVX macro
    (svml_d_wrapper_impl.h), instantiated with the 2-lane _ZGVbN2v_asinh.
    The macro body is not visible here; presumably it applies the 2-lane
    function to each half of the 4-lane input -- confirm against that
    header.  */
 ENTRY (_ZGVdN4v_asinh)
 WRAPPER_IMPL_AVX _ZGVbN2v_asinh
 END (_ZGVdN4v_asinh)
 /* The hidden definition is emitted only when USE_MULTIARCH is not defined;
    presumably the multiarch build supplies it elsewhere.  */
 #ifndef USE_MULTIARCH
 libmvec_hidden_def (_ZGVdN4v_asinh)
 #endif
--- a/sysdeps/x86_64/fpu/svml_d_asinh4_core_avx.S
+++ b/sysdeps/x86_64/fpu/svml_d_asinh4_core_avx.S
@ -0,0 +1,25 @@
 /* Function asinh vectorized in AVX ISA as wrapper to SSE4 ISA version.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 #include "svml_d_wrapper_impl.h"
 	.text
 /* Entry point _ZGVcN4v_asinh.  The whole body is the WRAPPER_IMPL_AVX
    macro expanded with _ZGVbN2v_asinh as its argument.  No hidden
    definition is emitted for this symbol.
    NOTE(review): the macro is presumably provided by
    svml_d_wrapper_impl.h -- confirm against that header.  */
 ENTRY (_ZGVcN4v_asinh)
 WRAPPER_IMPL_AVX _ZGVbN2v_asinh
 END (_ZGVcN4v_asinh)
--- a/sysdeps/x86_64/fpu/svml_d_asinh8_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_asinh8_core.S
@ -0,0 +1,25 @@
 /* Function asinh vectorized with AVX-512, wrapper to AVX2.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 #include "svml_d_wrapper_impl.h"
 	.text
 /* Entry point _ZGVeN8v_asinh.  The whole body is the WRAPPER_IMPL_AVX512
    macro expanded with _ZGVdN4v_asinh as its argument, i.e. this variant
    is built on top of the four-element AVX2 variant.  No hidden
    definition is emitted for this symbol.
    NOTE(review): the macro is presumably provided by
    svml_d_wrapper_impl.h -- confirm against that header.  */
 ENTRY (_ZGVeN8v_asinh)
 WRAPPER_IMPL_AVX512 _ZGVdN4v_asinh
 END (_ZGVeN8v_asinh)
--- a/sysdeps/x86_64/fpu/svml_s_asinhf16_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_asinhf16_core.S
@ -0,0 +1,25 @@
 /* Function asinhf vectorized with AVX-512. Wrapper to AVX2 version.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 #include "svml_s_wrapper_impl.h"
 	.text
 /* Entry point _ZGVeN16v_asinhf.  The whole body is the
    WRAPPER_IMPL_AVX512 macro expanded with _ZGVdN8v_asinhf as its
    argument, i.e. this variant is built on top of the eight-element AVX2
    variant.  No hidden definition is emitted for this symbol.
    NOTE(review): the macro is presumably provided by
    svml_s_wrapper_impl.h -- confirm against that header.  */
 ENTRY (_ZGVeN16v_asinhf)
 WRAPPER_IMPL_AVX512 _ZGVdN8v_asinhf
 END (_ZGVeN16v_asinhf)
--- a/sysdeps/x86_64/fpu/svml_s_asinhf4_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_asinhf4_core.S
@ -0,0 +1,29 @@
 /* Function asinhf vectorized with SSE2, wrapper version.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 #include "svml_s_wrapper_impl.h"
 	.text
 /* Entry point _ZGVbN4v_asinhf.  The whole body is the WRAPPER_IMPL_SSE2
    macro expanded with asinhf as its argument.
    NOTE(review): the macro is presumably provided by
    svml_s_wrapper_impl.h and presumably calls asinhf once per vector
    element -- confirm against that header.  */
 ENTRY (_ZGVbN4v_asinhf)
 WRAPPER_IMPL_SSE2 asinhf
 END (_ZGVbN4v_asinhf)
 /* The hidden definition is emitted only when USE_MULTIARCH is not
    defined.  */
 #ifndef USE_MULTIARCH
 libmvec_hidden_def (_ZGVbN4v_asinhf)
 #endif
--- a/sysdeps/x86_64/fpu/svml_s_asinhf8_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_asinhf8_core.S
@ -0,0 +1,29 @@
 /* Function asinhf vectorized with AVX2, wrapper version.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 #include "svml_s_wrapper_impl.h"
 	.text
 /* Entry point _ZGVdN8v_asinhf.  The whole body is the WRAPPER_IMPL_AVX
    macro expanded with _ZGVbN4v_asinhf as its argument, i.e. this variant
    is built on top of the four-element variant.
    NOTE(review): the macro is presumably provided by
    svml_s_wrapper_impl.h; how it splits and recombines the input is not
    visible here -- confirm against that header.  */
 ENTRY (_ZGVdN8v_asinhf)
 WRAPPER_IMPL_AVX _ZGVbN4v_asinhf
 END (_ZGVdN8v_asinhf)
 /* The hidden definition is emitted only when USE_MULTIARCH is not
    defined.  */
 #ifndef USE_MULTIARCH
 libmvec_hidden_def (_ZGVdN8v_asinhf)
 #endif
--- a/sysdeps/x86_64/fpu/svml_s_asinhf8_core_avx.S
+++ b/sysdeps/x86_64/fpu/svml_s_asinhf8_core_avx.S
@ -0,0 +1,25 @@
 /* Function asinhf vectorized in AVX ISA as wrapper to SSE4 ISA version.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 #include "svml_s_wrapper_impl.h"
        .text
 ENTRY (_ZGVcN8v_asinhf)
 WRAPPER_IMPL_AVX _ZGVbN4v_asinhf
 END (_ZGVcN8v_asinhf)
--- a/sysdeps/x86_64/fpu/test-double-libmvec-asinh-avx.c
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-asinh-avx.c
@ -0,0 +1 @@
 #include "test-double-libmvec-asinh.c"
--- a/sysdeps/x86_64/fpu/test-double-libmvec-asinh-avx2.c
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-asinh-avx2.c
@ -0,0 +1 @@
 #include "test-double-libmvec-asinh.c"
--- a/sysdeps/x86_64/fpu/test-double-libmvec-asinh-avx512f.c
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-asinh-avx512f.c
@ -0,0 +1 @@
 #include "test-double-libmvec-asinh.c"
--- a/sysdeps/x86_64/fpu/test-double-libmvec-asinh.c
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-asinh.c
@ -0,0 +1,3 @@
 /* Configuration for the shared one-argument test template included
    below: LIBMVEC_TYPE is set to double and LIBMVEC_FUNC to asinh.
    NOTE(review): both macros are presumably read by
    test-vector-abi-arg1.h -- confirm against that header.  */
 #define LIBMVEC_TYPE double
 #define LIBMVEC_FUNC asinh
 #include "test-vector-abi-arg1.h"
--- a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
@ -45,6 +45,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (atanh), _ZGVbN2v_atanh)
 VECTOR_WRAPPER (WRAPPER_NAME (acosh), _ZGVbN2v_acosh)
 VECTOR_WRAPPER (WRAPPER_NAME (erf), _ZGVbN2v_erf)
 VECTOR_WRAPPER (WRAPPER_NAME (tanh), _ZGVbN2v_tanh)
 VECTOR_WRAPPER (WRAPPER_NAME (asinh), _ZGVbN2v_asinh)
 #define VEC_INT_TYPE __m128i
--- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
@ -48,6 +48,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (atanh), _ZGVdN4v_atanh)
 VECTOR_WRAPPER (WRAPPER_NAME (acosh), _ZGVdN4v_acosh)
 VECTOR_WRAPPER (WRAPPER_NAME (erf), _ZGVdN4v_erf)
 VECTOR_WRAPPER (WRAPPER_NAME (tanh), _ZGVdN4v_tanh)
 VECTOR_WRAPPER (WRAPPER_NAME (asinh), _ZGVdN4v_asinh)
 #ifndef __ILP32__
 # define VEC_INT_TYPE __m256i
--- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
@ -45,6 +45,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (atanh), _ZGVcN4v_atanh)
 VECTOR_WRAPPER (WRAPPER_NAME (acosh), _ZGVcN4v_acosh)
 VECTOR_WRAPPER (WRAPPER_NAME (erf), _ZGVcN4v_erf)
 VECTOR_WRAPPER (WRAPPER_NAME (tanh), _ZGVcN4v_tanh)
 VECTOR_WRAPPER (WRAPPER_NAME (asinh), _ZGVcN4v_asinh)
 #define VEC_INT_TYPE __m128i
--- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
@ -45,6 +45,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (atanh), _ZGVeN8v_atanh)
 VECTOR_WRAPPER (WRAPPER_NAME (acosh), _ZGVeN8v_acosh)
 VECTOR_WRAPPER (WRAPPER_NAME (erf), _ZGVeN8v_erf)
 VECTOR_WRAPPER (WRAPPER_NAME (tanh), _ZGVeN8v_tanh)
 VECTOR_WRAPPER (WRAPPER_NAME (asinh), _ZGVeN8v_asinh)
 #ifndef __ILP32__
 # define VEC_INT_TYPE __m512i
--- a/sysdeps/x86_64/fpu/test-float-libmvec-asinhf-avx.c
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-asinhf-avx.c
@ -0,0 +1 @@
 #include "test-float-libmvec-asinhf.c"
--- a/sysdeps/x86_64/fpu/test-float-libmvec-asinhf-avx2.c
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-asinhf-avx2.c
@ -0,0 +1 @@
 #include "test-float-libmvec-asinhf.c"
--- a/sysdeps/x86_64/fpu/test-float-libmvec-asinhf-avx512f.c
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-asinhf-avx512f.c
@ -0,0 +1 @@
 #include "test-float-libmvec-asinhf.c"
--- a/sysdeps/x86_64/fpu/test-float-libmvec-asinhf.c
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-asinhf.c
@ -0,0 +1,3 @@
 /* Configuration for the shared one-argument test template included
    below: LIBMVEC_TYPE is set to float and LIBMVEC_FUNC to asinhf.
    NOTE(review): both macros are presumably read by
    test-vector-abi-arg1.h -- confirm against that header.  */
 #define LIBMVEC_TYPE float
 #define LIBMVEC_FUNC asinhf
 #include "test-vector-abi-arg1.h"
--- a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
@ -45,6 +45,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (atanhf), _ZGVeN16v_atanhf)
 VECTOR_WRAPPER (WRAPPER_NAME (acoshf), _ZGVeN16v_acoshf)
 VECTOR_WRAPPER (WRAPPER_NAME (erff), _ZGVeN16v_erff)
 VECTOR_WRAPPER (WRAPPER_NAME (tanhf), _ZGVeN16v_tanhf)
 VECTOR_WRAPPER (WRAPPER_NAME (asinhf), _ZGVeN16v_asinhf)
 #define VEC_INT_TYPE __m512i
--- a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
@ -45,6 +45,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (atanhf), _ZGVbN4v_atanhf)
 VECTOR_WRAPPER (WRAPPER_NAME (acoshf), _ZGVbN4v_acoshf)
 VECTOR_WRAPPER (WRAPPER_NAME (erff), _ZGVbN4v_erff)
 VECTOR_WRAPPER (WRAPPER_NAME (tanhf), _ZGVbN4v_tanhf)
 VECTOR_WRAPPER (WRAPPER_NAME (asinhf), _ZGVbN4v_asinhf)
 #define VEC_INT_TYPE __m128i
--- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
@ -48,6 +48,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (atanhf), _ZGVdN8v_atanhf)
 VECTOR_WRAPPER (WRAPPER_NAME (acoshf), _ZGVdN8v_acoshf)
 VECTOR_WRAPPER (WRAPPER_NAME (erff), _ZGVdN8v_erff)
 VECTOR_WRAPPER (WRAPPER_NAME (tanhf), _ZGVdN8v_tanhf)
 VECTOR_WRAPPER (WRAPPER_NAME (asinhf), _ZGVdN8v_asinhf)
 /* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf.  */
 #undef VECTOR_WRAPPER_fFF
--- a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
@ -45,6 +45,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (atanhf), _ZGVcN8v_atanhf)
 VECTOR_WRAPPER (WRAPPER_NAME (acoshf), _ZGVcN8v_acoshf)
 VECTOR_WRAPPER (WRAPPER_NAME (erff), _ZGVcN8v_erff)
 VECTOR_WRAPPER (WRAPPER_NAME (tanhf), _ZGVcN8v_tanhf)
 VECTOR_WRAPPER (WRAPPER_NAME (asinhf), _ZGVcN8v_asinhf)
 #define VEC_INT_TYPE __m128i