mirror of
https://sourceware.org/git/glibc.git
synced 2024-11-09 23:00:07 +00:00
x86-64: Add vector hypot/hypotf implementation to libmvec
Implement vectorized hypot/hypotf containing SSE, AVX, AVX2 and AVX512 versions for libmvec as per vector ABI. It also contains accuracy and ABI tests for vector hypot/hypotf with regenerated ulps. Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
parent
11c01de14c
commit
37475ba883
@ -131,4 +131,15 @@
|
||||
#define __DECL_SIMD_asinf32x
|
||||
#define __DECL_SIMD_asinf64x
|
||||
#define __DECL_SIMD_asinf128x
|
||||
|
||||
#define __DECL_SIMD_hypot
|
||||
#define __DECL_SIMD_hypotf
|
||||
#define __DECL_SIMD_hypotl
|
||||
#define __DECL_SIMD_hypotf16
|
||||
#define __DECL_SIMD_hypotf32
|
||||
#define __DECL_SIMD_hypotf64
|
||||
#define __DECL_SIMD_hypotf128
|
||||
#define __DECL_SIMD_hypotf32x
|
||||
#define __DECL_SIMD_hypotf64x
|
||||
#define __DECL_SIMD_hypotf128x
|
||||
#endif
|
||||
|
@ -144,7 +144,7 @@ __MATHCALL (sqrt,, (_Mdouble_ __x));
|
||||
|
||||
#if defined __USE_XOPEN || defined __USE_ISOC99
|
||||
/* Return `sqrt(X*X + Y*Y)'. */
|
||||
__MATHCALL (hypot,, (_Mdouble_ __x, _Mdouble_ __y));
|
||||
__MATHCALL_VEC (hypot,, (_Mdouble_ __x, _Mdouble_ __y));
|
||||
#endif
|
||||
|
||||
#if defined __USE_XOPEN_EXTENDED || defined __USE_ISOC99
|
||||
|
@ -49,24 +49,32 @@ GLIBC_2.22 _ZGVeN8vvv_sincos F
|
||||
GLIBC_2.35 _ZGVbN2v_acos F
|
||||
GLIBC_2.35 _ZGVbN2v_asin F
|
||||
GLIBC_2.35 _ZGVbN2v_atan F
|
||||
GLIBC_2.35 _ZGVbN2vv_hypot F
|
||||
GLIBC_2.35 _ZGVbN4v_acosf F
|
||||
GLIBC_2.35 _ZGVbN4v_asinf F
|
||||
GLIBC_2.35 _ZGVbN4v_atanf F
|
||||
GLIBC_2.35 _ZGVbN4vv_hypotf F
|
||||
GLIBC_2.35 _ZGVcN4v_acos F
|
||||
GLIBC_2.35 _ZGVcN4v_asin F
|
||||
GLIBC_2.35 _ZGVcN4v_atan F
|
||||
GLIBC_2.35 _ZGVcN4vv_hypot F
|
||||
GLIBC_2.35 _ZGVcN8v_acosf F
|
||||
GLIBC_2.35 _ZGVcN8v_asinf F
|
||||
GLIBC_2.35 _ZGVcN8v_atanf F
|
||||
GLIBC_2.35 _ZGVcN8vv_hypotf F
|
||||
GLIBC_2.35 _ZGVdN4v_acos F
|
||||
GLIBC_2.35 _ZGVdN4v_asin F
|
||||
GLIBC_2.35 _ZGVdN4v_atan F
|
||||
GLIBC_2.35 _ZGVdN4vv_hypot F
|
||||
GLIBC_2.35 _ZGVdN8v_acosf F
|
||||
GLIBC_2.35 _ZGVdN8v_asinf F
|
||||
GLIBC_2.35 _ZGVdN8v_atanf F
|
||||
GLIBC_2.35 _ZGVdN8vv_hypotf F
|
||||
GLIBC_2.35 _ZGVeN16v_acosf F
|
||||
GLIBC_2.35 _ZGVeN16v_asinf F
|
||||
GLIBC_2.35 _ZGVeN16v_atanf F
|
||||
GLIBC_2.35 _ZGVeN16vv_hypotf F
|
||||
GLIBC_2.35 _ZGVeN8v_acos F
|
||||
GLIBC_2.35 _ZGVeN8v_asin F
|
||||
GLIBC_2.35 _ZGVeN8v_atan F
|
||||
GLIBC_2.35 _ZGVeN8vv_hypot F
|
||||
|
@ -70,6 +70,10 @@
|
||||
# define __DECL_SIMD_asin __DECL_SIMD_x86_64
|
||||
# undef __DECL_SIMD_asinf
|
||||
# define __DECL_SIMD_asinf __DECL_SIMD_x86_64
|
||||
# undef __DECL_SIMD_hypot
|
||||
# define __DECL_SIMD_hypot __DECL_SIMD_x86_64
|
||||
# undef __DECL_SIMD_hypotf
|
||||
# define __DECL_SIMD_hypotf __DECL_SIMD_x86_64
|
||||
|
||||
# endif
|
||||
#endif
|
||||
|
@ -34,6 +34,8 @@
|
||||
!GCC$ builtin (atanf) attributes simd (notinbranch) if('x86_64')
|
||||
!GCC$ builtin (asin) attributes simd (notinbranch) if('x86_64')
|
||||
!GCC$ builtin (asinf) attributes simd (notinbranch) if('x86_64')
|
||||
!GCC$ builtin (hypot) attributes simd (notinbranch) if('x86_64')
|
||||
!GCC$ builtin (hypotf) attributes simd (notinbranch) if('x86_64')
|
||||
|
||||
!GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
|
||||
!GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
|
||||
@ -53,3 +55,5 @@
|
||||
!GCC$ builtin (atanf) attributes simd (notinbranch) if('x32')
|
||||
!GCC$ builtin (asin) attributes simd (notinbranch) if('x32')
|
||||
!GCC$ builtin (asinf) attributes simd (notinbranch) if('x32')
|
||||
!GCC$ builtin (hypot) attributes simd (notinbranch) if('x32')
|
||||
!GCC$ builtin (hypotf) attributes simd (notinbranch) if('x32')
|
||||
|
@ -27,6 +27,7 @@ libmvec-funcs = \
|
||||
atan \
|
||||
cos \
|
||||
exp \
|
||||
hypot \
|
||||
log \
|
||||
pow \
|
||||
sin \
|
||||
|
@ -17,8 +17,10 @@ libmvec {
|
||||
_ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
|
||||
_ZGVbN2v_asin; _ZGVcN4v_asin; _ZGVdN4v_asin; _ZGVeN8v_asin;
|
||||
_ZGVbN2v_atan; _ZGVcN4v_atan; _ZGVdN4v_atan; _ZGVeN8v_atan;
|
||||
_ZGVbN2vv_hypot; _ZGVcN4vv_hypot; _ZGVdN4vv_hypot; _ZGVeN8vv_hypot;
|
||||
_ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
|
||||
_ZGVbN4v_asinf; _ZGVcN8v_asinf; _ZGVdN8v_asinf; _ZGVeN16v_asinf;
|
||||
_ZGVbN4v_atanf; _ZGVcN8v_atanf; _ZGVdN8v_atanf; _ZGVeN16v_atanf;
|
||||
_ZGVbN4vv_hypotf; _ZGVcN8vv_hypotf; _ZGVdN8vv_hypotf; _ZGVeN16vv_hypotf;
|
||||
}
|
||||
}
|
||||
|
@ -1375,6 +1375,26 @@ double: 1
|
||||
float128: 1
|
||||
ldouble: 1
|
||||
|
||||
Function: "hypot_vlen16":
|
||||
float: 1
|
||||
|
||||
Function: "hypot_vlen2":
|
||||
double: 1
|
||||
|
||||
Function: "hypot_vlen4":
|
||||
double: 1
|
||||
float: 1
|
||||
|
||||
Function: "hypot_vlen4_avx2":
|
||||
double: 1
|
||||
|
||||
Function: "hypot_vlen8":
|
||||
double: 1
|
||||
float: 1
|
||||
|
||||
Function: "hypot_vlen8_avx2":
|
||||
float: 1
|
||||
|
||||
Function: "j0":
|
||||
double: 3
|
||||
float: 9
|
||||
|
20
sysdeps/x86_64/fpu/multiarch/svml_d_hypot2_core-sse2.S
Normal file
20
sysdeps/x86_64/fpu/multiarch/svml_d_hypot2_core-sse2.S
Normal file
@ -0,0 +1,20 @@
|
||||
/* SSE2 version of vectorized hypot.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#define _ZGVbN2vv_hypot _ZGVbN2vv_hypot_sse2
|
||||
#include "../svml_d_hypot2_core.S"
|
28
sysdeps/x86_64/fpu/multiarch/svml_d_hypot2_core.c
Normal file
28
sysdeps/x86_64/fpu/multiarch/svml_d_hypot2_core.c
Normal file
@ -0,0 +1,28 @@
|
||||
/* Multiple versions of vectorized hypot, vector length is 2.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#define SYMBOL_NAME _ZGVbN2vv_hypot
|
||||
#include "ifunc-mathvec-sse4_1.h"
|
||||
|
||||
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
|
||||
|
||||
#ifdef SHARED
|
||||
__hidden_ver1 (_ZGVbN2vv_hypot, __GI__ZGVbN2vv_hypot,
|
||||
__redirect__ZGVbN2vv_hypot)
|
||||
__attribute__ ((visibility ("hidden")));
|
||||
#endif
|
279
sysdeps/x86_64/fpu/multiarch/svml_d_hypot2_core_sse4.S
Normal file
279
sysdeps/x86_64/fpu/multiarch/svml_d_hypot2_core_sse4.S
Normal file
@ -0,0 +1,279 @@
|
||||
/* Function hypot vectorized with SSE4.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
https://www.gnu.org/licenses/. */
|
||||
|
||||
/*
|
||||
* ALGORITHM DESCRIPTION:
|
||||
*
|
||||
* HIGH LEVEL OVERVIEW
|
||||
*
|
||||
* Calculate z = (x*x+y*y)
|
||||
* Calculate reciplicle sqrt (z)
|
||||
* Calculate error = z*(rsqrt(z)*rsqrt(z)) - 1
|
||||
* Calculate fixing part p with polynom
|
||||
* Fix answer with sqrt(z) = z * rsqrt(z) + error * p * z
|
||||
*
|
||||
* ALGORITHM DETAILS
|
||||
*
|
||||
* Multiprecision branch for _HA_ only
|
||||
* Remove sigm from both arguments
|
||||
* Find maximum (_x) and minimum (_y) (by abs value) between arguments
|
||||
* Split _x int _a and _b for multiprecision
|
||||
* If _x >> _y we will we will not split _y for multiprecision
|
||||
* all _y will be put into lower part (_d) and higher part (_c = 0)
|
||||
* Fixing _hilo_mask for the case _x >> _y
|
||||
* Split _y into _c and _d for multiprecision with fixed mask
|
||||
*
|
||||
* compute Hi and Lo parts of _z = _x*_x + _y*_y
|
||||
*
|
||||
* _zHi = _a*_a + _c*_c
|
||||
* _zLo = (_x + _a)*_b + _d*_y + _d*_c
|
||||
* _z = _zHi + _zLo
|
||||
*
|
||||
* No multiprecision branch for _LA_ and _EP_
|
||||
* _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
|
||||
*
|
||||
* Check _z exponent to be withing borders [3BC ; 441] else goto Callout
|
||||
*
|
||||
* _s ~ 1.0/sqrt(_z)
|
||||
* _s2 ~ 1.0/(sqrt(_z)*sqrt(_z)) ~ 1.0/_z = (1.0/_z + O)
|
||||
* _e[rror] = (1.0/_z + O) * _z - 1.0
|
||||
* calculate fixing part _p
|
||||
* _p = (((_POLY_C5*_e + _POLY_C4)*_e +_POLY_C3)*_e +_POLY_C2)*_e + _POLY_C1
|
||||
* some parts of polynom are skipped for lower flav
|
||||
*
|
||||
* result = _z * (1.0/sqrt(_z) + O) + _p * _e[rror] * _z
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/* Offsets for data table __svml_dhypot_data_internal
|
||||
*/
|
||||
#define _dHiLoMask 0
|
||||
#define _dAbsMask 16
|
||||
#define _dOne 32
|
||||
#define _POLY_C5 48
|
||||
#define _POLY_C4 64
|
||||
#define _POLY_C3 80
|
||||
#define _POLY_C2 96
|
||||
#define _POLY_C1 112
|
||||
#define _LowBoundary 128
|
||||
#define _HighBoundary 144
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.text
|
||||
.section .text.sse4,"ax",@progbits
|
||||
ENTRY(_ZGVbN2vv_hypot_sse4)
|
||||
subq $88, %rsp
|
||||
cfi_def_cfa_offset(96)
|
||||
|
||||
/*
|
||||
* Defines
|
||||
* Implementation
|
||||
* Multiprecision branch for _HA_ only
|
||||
* _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
|
||||
*/
|
||||
movaps %xmm0, %xmm10
|
||||
movaps %xmm1, %xmm2
|
||||
mulpd %xmm0, %xmm10
|
||||
mulpd %xmm1, %xmm2
|
||||
addpd %xmm2, %xmm10
|
||||
|
||||
/*
|
||||
* _s ~ 1.0/sqrt(_z)
|
||||
* _s2 ~ 1.0/(sqrt(_z)*sqrt(_z)) ~ 1.0/_z
|
||||
*/
|
||||
cvtpd2ps %xmm10, %xmm7
|
||||
movlhps %xmm7, %xmm7
|
||||
rsqrtps %xmm7, %xmm8
|
||||
cvtps2pd %xmm8, %xmm11
|
||||
movaps %xmm11, %xmm2
|
||||
mulpd %xmm11, %xmm2
|
||||
|
||||
/* _e[rror] ~ (1.0/_z + O) * _z - 1.0 */
|
||||
mulpd %xmm10, %xmm2
|
||||
subpd _dOne+__svml_dhypot_data_internal(%rip), %xmm2
|
||||
|
||||
/*
|
||||
* calculate fixing part _p
|
||||
* _p = (((_POLY_C5*_e + _POLY_C4)*_e +_POLY_C3)*_e +_POLY_C2)*_e + _POLY_C1
|
||||
* some parts of polynom are skipped for lower flav
|
||||
*/
|
||||
movups _POLY_C4+__svml_dhypot_data_internal(%rip), %xmm9
|
||||
mulpd %xmm2, %xmm9
|
||||
addpd _POLY_C3+__svml_dhypot_data_internal(%rip), %xmm9
|
||||
mulpd %xmm2, %xmm9
|
||||
addpd _POLY_C2+__svml_dhypot_data_internal(%rip), %xmm9
|
||||
mulpd %xmm2, %xmm9
|
||||
addpd _POLY_C1+__svml_dhypot_data_internal(%rip), %xmm9
|
||||
|
||||
/* result = _z * (1.0/sqrt(_z) + O) + _p * _e[rror] * _z */
|
||||
mulpd %xmm9, %xmm2
|
||||
mulpd %xmm11, %xmm2
|
||||
mulpd %xmm10, %xmm11
|
||||
mulpd %xmm10, %xmm2
|
||||
|
||||
/* Check _z exponent to be withing borders [3BC ; 441] else goto Callout */
|
||||
movq _LowBoundary+__svml_dhypot_data_internal(%rip), %xmm5
|
||||
movq _HighBoundary+__svml_dhypot_data_internal(%rip), %xmm3
|
||||
pshufd $221, %xmm10, %xmm4
|
||||
pcmpgtd %xmm4, %xmm5
|
||||
pcmpgtd %xmm3, %xmm4
|
||||
por %xmm4, %xmm5
|
||||
pshufd $80, %xmm5, %xmm6
|
||||
movmskpd %xmm6, %edx
|
||||
addpd %xmm11, %xmm2
|
||||
|
||||
/* The end of implementation */
|
||||
testl %edx, %edx
|
||||
|
||||
/* Go to special inputs processing branch */
|
||||
jne L(SPECIAL_VALUES_BRANCH)
|
||||
# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1 xmm2
|
||||
|
||||
/* Restore registers
|
||||
* and exit the function
|
||||
*/
|
||||
|
||||
L(EXIT):
|
||||
movaps %xmm2, %xmm0
|
||||
addq $88, %rsp
|
||||
cfi_def_cfa_offset(8)
|
||||
ret
|
||||
cfi_def_cfa_offset(96)
|
||||
|
||||
/* Branch to process
|
||||
* special inputs
|
||||
*/
|
||||
|
||||
L(SPECIAL_VALUES_BRANCH):
|
||||
movups %xmm0, 32(%rsp)
|
||||
movups %xmm1, 48(%rsp)
|
||||
movups %xmm2, 64(%rsp)
|
||||
# LOE rbx rbp r12 r13 r14 r15 edx
|
||||
|
||||
xorl %eax, %eax
|
||||
movq %r12, 16(%rsp)
|
||||
cfi_offset(12, -80)
|
||||
movl %eax, %r12d
|
||||
movq %r13, 8(%rsp)
|
||||
cfi_offset(13, -88)
|
||||
movl %edx, %r13d
|
||||
movq %r14, (%rsp)
|
||||
cfi_offset(14, -96)
|
||||
# LOE rbx rbp r15 r12d r13d
|
||||
|
||||
/* Range mask
|
||||
* bits check
|
||||
*/
|
||||
|
||||
L(RANGEMASK_CHECK):
|
||||
btl %r12d, %r13d
|
||||
|
||||
/* Call scalar math function */
|
||||
jc L(SCALAR_MATH_CALL)
|
||||
# LOE rbx rbp r15 r12d r13d
|
||||
|
||||
/* Special inputs
|
||||
* processing loop
|
||||
*/
|
||||
|
||||
L(SPECIAL_VALUES_LOOP):
|
||||
incl %r12d
|
||||
cmpl $2, %r12d
|
||||
|
||||
/* Check bits in range mask */
|
||||
jl L(RANGEMASK_CHECK)
|
||||
# LOE rbx rbp r15 r12d r13d
|
||||
|
||||
movq 16(%rsp), %r12
|
||||
cfi_restore(12)
|
||||
movq 8(%rsp), %r13
|
||||
cfi_restore(13)
|
||||
movq (%rsp), %r14
|
||||
cfi_restore(14)
|
||||
movups 64(%rsp), %xmm2
|
||||
|
||||
/* Go to exit */
|
||||
jmp L(EXIT)
|
||||
cfi_offset(12, -80)
|
||||
cfi_offset(13, -88)
|
||||
cfi_offset(14, -96)
|
||||
# LOE rbx rbp r12 r13 r14 r15 xmm2
|
||||
|
||||
/* Scalar math fucntion call
|
||||
* to process special input
|
||||
*/
|
||||
|
||||
L(SCALAR_MATH_CALL):
|
||||
movl %r12d, %r14d
|
||||
movsd 32(%rsp,%r14,8), %xmm0
|
||||
movsd 48(%rsp,%r14,8), %xmm1
|
||||
call hypot@PLT
|
||||
# LOE rbx rbp r14 r15 r12d r13d xmm0
|
||||
|
||||
movsd %xmm0, 64(%rsp,%r14,8)
|
||||
|
||||
/* Process special inputs in loop */
|
||||
jmp L(SPECIAL_VALUES_LOOP)
|
||||
# LOE rbx rbp r15 r12d r13d
|
||||
END(_ZGVbN2vv_hypot_sse4)
|
||||
|
||||
.section .rodata, "a"
|
||||
.align 16
|
||||
|
||||
#ifdef __svml_dhypot_data_internal_typedef
|
||||
typedef unsigned int VUINT32;
|
||||
typedef struct
|
||||
{
|
||||
__declspec(align(16)) VUINT32 _dHiLoMask[2][2];
|
||||
__declspec(align(16)) VUINT32 _dAbsMask[2][2];
|
||||
__declspec(align(16)) VUINT32 _dOne[2][2];
|
||||
__declspec(align(16)) VUINT32 _POLY_C5[2][2];
|
||||
__declspec(align(16)) VUINT32 _POLY_C4[2][2];
|
||||
__declspec(align(16)) VUINT32 _POLY_C3[2][2];
|
||||
__declspec(align(16)) VUINT32 _POLY_C2[2][2];
|
||||
__declspec(align(16)) VUINT32 _POLY_C1[2][2];
|
||||
__declspec(align(16)) VUINT32 _LowBoundary[4][1];
|
||||
__declspec(align(16)) VUINT32 _HighBoundary[4][1];
|
||||
} __svml_dhypot_data_internal;
|
||||
#endif
|
||||
__svml_dhypot_data_internal:
|
||||
/* legacy algorithm */
|
||||
.quad 0xffffc00000000000, 0xffffc00000000000 /* _dHiLoMask */
|
||||
.align 16
|
||||
.quad 0x7fffffffffffffff, 0x7fffffffffffffff /* _dAbsMask */
|
||||
.align 16
|
||||
.quad 0x3FF0000000000000, 0x3FF0000000000000 /* _dOne */
|
||||
.align 16
|
||||
.quad 0xBFCF800000000000, 0xBFCF800000000000 /* _POLY_C5 */
|
||||
.align 16
|
||||
.quad 0x3FD1800000000000, 0x3FD1800000000000 /* _POLY_C4 */
|
||||
.align 16
|
||||
.quad 0xBFD4000000000000, 0xBFD4000000000000 /* _POLY_C3 */
|
||||
.align 16
|
||||
.quad 0x3FD8000000000000, 0x3FD8000000000000 /* _POLY_C2 */
|
||||
.align 16
|
||||
.quad 0xBFE0000000000000, 0xBFE0000000000000 /* _POLY_C1 */
|
||||
.align 16
|
||||
.long 0x3BC00000, 0x3BC00000, 0x3BC00000, 0x3BC00000 /* _LowBoundary */
|
||||
.align 16
|
||||
.long 0x44100000, 0x44100000, 0x44100000, 0x44100000 /* _HighBoundary */
|
||||
.align 16
|
||||
.type __svml_dhypot_data_internal,@object
|
||||
.size __svml_dhypot_data_internal,.-__svml_dhypot_data_internal
|
20
sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core-sse.S
Normal file
20
sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core-sse.S
Normal file
@ -0,0 +1,20 @@
|
||||
/* SSE version of vectorized hypot.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#define _ZGVdN4vv_hypot _ZGVdN4vv_hypot_sse_wrapper
|
||||
#include "../svml_d_hypot4_core.S"
|
28
sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core.c
Normal file
28
sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core.c
Normal file
@ -0,0 +1,28 @@
|
||||
/* Multiple versions of vectorized hypot, vector length is 4.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#define SYMBOL_NAME _ZGVdN4vv_hypot
|
||||
#include "ifunc-mathvec-avx2.h"
|
||||
|
||||
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
|
||||
|
||||
#ifdef SHARED
|
||||
__hidden_ver1 (_ZGVdN4vv_hypot, __GI__ZGVdN4vv_hypot,
|
||||
__redirect__ZGVdN4vv_hypot)
|
||||
__attribute__ ((visibility ("hidden")));
|
||||
#endif
|
289
sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
Normal file
289
sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
Normal file
@ -0,0 +1,289 @@
|
||||
/* Function hypot vectorized with AVX2.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
https://www.gnu.org/licenses/. */
|
||||
|
||||
/*
|
||||
* ALGORITHM DESCRIPTION:
|
||||
*
|
||||
* HIGH LEVEL OVERVIEW
|
||||
*
|
||||
* Calculate z = (x*x+y*y)
|
||||
* Calculate reciplicle sqrt (z)
|
||||
* Calculate error = z*(rsqrt(z)*rsqrt(z)) - 1
|
||||
* Calculate fixing part p with polynom
|
||||
* Fix answer with sqrt(z) = z * rsqrt(z) + error * p * z
|
||||
*
|
||||
* ALGORITHM DETAILS
|
||||
*
|
||||
* Multiprecision branch for _HA_ only
|
||||
* Remove sigm from both arguments
|
||||
* Find maximum (_x) and minimum (_y) (by abs value) between arguments
|
||||
* Split _x int _a and _b for multiprecision
|
||||
* If _x >> _y we will we will not split _y for multiprecision
|
||||
* all _y will be put into lower part (_d) and higher part (_c = 0)
|
||||
* Fixing _hilo_mask for the case _x >> _y
|
||||
* Split _y into _c and _d for multiprecision with fixed mask
|
||||
*
|
||||
* compute Hi and Lo parts of _z = _x*_x + _y*_y
|
||||
*
|
||||
* _zHi = _a*_a + _c*_c
|
||||
* _zLo = (_x + _a)*_b + _d*_y + _d*_c
|
||||
* _z = _zHi + _zLo
|
||||
*
|
||||
* No multiprecision branch for _LA_ and _EP_
|
||||
* _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
|
||||
*
|
||||
* Check _z exponent to be withing borders [3BC ; 441] else goto Callout
|
||||
*
|
||||
* _s ~ 1.0/sqrt(_z)
|
||||
* _s2 ~ 1.0/(sqrt(_z)*sqrt(_z)) ~ 1.0/_z = (1.0/_z + O)
|
||||
* _e[rror] = (1.0/_z + O) * _z - 1.0
|
||||
* calculate fixing part _p
|
||||
* _p = (((_POLY_C5*_e + _POLY_C4)*_e +_POLY_C3)*_e +_POLY_C2)*_e + _POLY_C1
|
||||
* some parts of polynom are skipped for lower flav
|
||||
*
|
||||
* result = _z * (1.0/sqrt(_z) + O) + _p * _e[rror] * _z
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/* Offsets for data table __svml_dhypot_data_internal
|
||||
*/
|
||||
#define _dHiLoMask 0
|
||||
#define _dAbsMask 32
|
||||
#define _dOne 64
|
||||
#define _POLY_C5 96
|
||||
#define _POLY_C4 128
|
||||
#define _POLY_C3 160
|
||||
#define _POLY_C2 192
|
||||
#define _POLY_C1 224
|
||||
#define _LowBoundary 256
|
||||
#define _HighBoundary 288
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.text
|
||||
.section .text.avx2,"ax",@progbits
|
||||
ENTRY(_ZGVdN4vv_hypot_avx2)
|
||||
pushq %rbp
|
||||
cfi_def_cfa_offset(16)
|
||||
movq %rsp, %rbp
|
||||
cfi_def_cfa(6, 16)
|
||||
cfi_offset(6, -16)
|
||||
andq $-32, %rsp
|
||||
subq $128, %rsp
|
||||
vmovapd %ymm1, %ymm2
|
||||
vmovapd %ymm0, %ymm1
|
||||
|
||||
/*
|
||||
* Defines
|
||||
* Implementation
|
||||
* Multiprecision branch for _HA_ only
|
||||
* _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
|
||||
*/
|
||||
vmulpd %ymm1, %ymm1, %ymm0
|
||||
|
||||
/*
|
||||
* calculate fixing part _p
|
||||
* _p = (((_POLY_C5*_e + _POLY_C4)*_e +_POLY_C3)*_e +_POLY_C2)*_e + _POLY_C1
|
||||
* some parts of polynom are skipped for lower flav
|
||||
*/
|
||||
vmovupd _POLY_C4+__svml_dhypot_data_internal(%rip), %ymm15
|
||||
vmovups _LowBoundary+__svml_dhypot_data_internal(%rip), %xmm4
|
||||
vfmadd231pd %ymm2, %ymm2, %ymm0
|
||||
|
||||
/*
|
||||
* _s ~ 1.0/sqrt(_z)
|
||||
* _s2 ~ 1.0/(sqrt(_z)*sqrt(_z)) ~ 1.0/_z
|
||||
*/
|
||||
vcvtpd2ps %ymm0, %xmm12
|
||||
|
||||
/* Check _z exponent to be withing borders [3BC ; 441] else goto Callout */
|
||||
vextractf128 $1, %ymm0, %xmm3
|
||||
vrsqrtps %xmm12, %xmm13
|
||||
vshufps $221, %xmm3, %xmm0, %xmm5
|
||||
vcvtps2pd %xmm13, %ymm3
|
||||
vpcmpgtd %xmm5, %xmm4, %xmm6
|
||||
vpcmpgtd _HighBoundary+__svml_dhypot_data_internal(%rip), %xmm5, %xmm7
|
||||
vpor %xmm7, %xmm6, %xmm9
|
||||
vpshufd $80, %xmm9, %xmm8
|
||||
vmulpd %ymm3, %ymm3, %ymm14
|
||||
vpshufd $250, %xmm9, %xmm10
|
||||
|
||||
/* _e[rror] ~ (1.0/_z + O) * _z - 1.0 */
|
||||
vfmsub213pd _dOne+__svml_dhypot_data_internal(%rip), %ymm0, %ymm14
|
||||
vfmadd213pd _POLY_C3+__svml_dhypot_data_internal(%rip), %ymm14, %ymm15
|
||||
vfmadd213pd _POLY_C2+__svml_dhypot_data_internal(%rip), %ymm14, %ymm15
|
||||
vfmadd213pd _POLY_C1+__svml_dhypot_data_internal(%rip), %ymm14, %ymm15
|
||||
|
||||
/* result = _z * (1.0/sqrt(_z) + O) + _p * _e[rror] * _z */
|
||||
vmulpd %ymm15, %ymm14, %ymm14
|
||||
vmulpd %ymm14, %ymm3, %ymm15
|
||||
vmulpd %ymm15, %ymm0, %ymm4
|
||||
vfmadd213pd %ymm4, %ymm3, %ymm0
|
||||
vinsertf128 $1, %xmm10, %ymm8, %ymm11
|
||||
vmovmskpd %ymm11, %edx
|
||||
|
||||
/* The end of implementation */
|
||||
testl %edx, %edx
|
||||
|
||||
/* Go to special inputs processing branch */
|
||||
jne L(SPECIAL_VALUES_BRANCH)
|
||||
# LOE rbx r12 r13 r14 r15 edx ymm0 ymm1 ymm2
|
||||
|
||||
/* Restore registers
|
||||
* and exit the function
|
||||
*/
|
||||
|
||||
L(EXIT):
|
||||
movq %rbp, %rsp
|
||||
popq %rbp
|
||||
cfi_def_cfa(7, 8)
|
||||
cfi_restore(6)
|
||||
ret
|
||||
cfi_def_cfa(6, 16)
|
||||
cfi_offset(6, -16)
|
||||
|
||||
/* Branch to process
|
||||
* special inputs
|
||||
*/
|
||||
|
||||
L(SPECIAL_VALUES_BRANCH):
|
||||
vmovupd %ymm1, 32(%rsp)
|
||||
vmovupd %ymm2, 64(%rsp)
|
||||
vmovupd %ymm0, 96(%rsp)
|
||||
# LOE rbx r12 r13 r14 r15 edx ymm0
|
||||
|
||||
xorl %eax, %eax
|
||||
# LOE rbx r12 r13 r14 r15 eax edx
|
||||
|
||||
vzeroupper
|
||||
movq %r12, 16(%rsp)
|
||||
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -112; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xff, 0xff, 0xff, 0x22
|
||||
movl %eax, %r12d
|
||||
movq %r13, 8(%rsp)
|
||||
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
|
||||
movl %edx, %r13d
|
||||
movq %r14, (%rsp)
|
||||
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE rbx r15 r12d r13d
|
||||
|
||||
/* Range mask
|
||||
* bits check
|
||||
*/
|
||||
|
||||
L(RANGEMASK_CHECK):
|
||||
btl %r12d, %r13d
|
||||
|
||||
/* Call scalar math function */
|
||||
jc L(SCALAR_MATH_CALL)
|
||||
# LOE rbx r15 r12d r13d
|
||||
|
||||
/* Special inputs
|
||||
* processing loop
|
||||
*/
|
||||
|
||||
L(SPECIAL_VALUES_LOOP):
|
||||
incl %r12d
|
||||
cmpl $4, %r12d
|
||||
|
||||
/* Check bits in range mask */
|
||||
jl L(RANGEMASK_CHECK)
|
||||
# LOE rbx r15 r12d r13d
|
||||
|
||||
movq 16(%rsp), %r12
|
||||
cfi_restore(12)
|
||||
movq 8(%rsp), %r13
|
||||
cfi_restore(13)
|
||||
movq (%rsp), %r14
|
||||
cfi_restore(14)
|
||||
vmovupd 96(%rsp), %ymm0
|
||||
|
||||
/* Go to exit */
|
||||
jmp L(EXIT)
|
||||
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -112; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xff, 0xff, 0xff, 0x22
|
||||
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
|
||||
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE rbx r12 r13 r14 r15 ymm0
|
||||
|
||||
/* Scalar math fucntion call
|
||||
* to process special input
|
||||
*/
|
||||
|
||||
L(SCALAR_MATH_CALL):
|
||||
movl %r12d, %r14d
|
||||
movsd 32(%rsp,%r14,8), %xmm0
|
||||
movsd 64(%rsp,%r14,8), %xmm1
|
||||
call hypot@PLT
|
||||
# LOE rbx r14 r15 r12d r13d xmm0
|
||||
|
||||
movsd %xmm0, 96(%rsp,%r14,8)
|
||||
|
||||
/* Process special inputs in loop */
|
||||
jmp L(SPECIAL_VALUES_LOOP)
|
||||
# LOE rbx r15 r12d r13d
|
||||
END(_ZGVdN4vv_hypot_avx2)
|
||||
|
||||
.section .rodata, "a"
|
||||
.align 32
|
||||
|
||||
#ifdef __svml_dhypot_data_internal_typedef
|
||||
typedef unsigned int VUINT32;
|
||||
typedef struct
|
||||
{
|
||||
__declspec(align(32)) VUINT32 _dHiLoMask[4][2];
|
||||
__declspec(align(32)) VUINT32 _dAbsMask[4][2];
|
||||
__declspec(align(32)) VUINT32 _dOne[4][2];
|
||||
__declspec(align(32)) VUINT32 _POLY_C5[4][2];
|
||||
__declspec(align(32)) VUINT32 _POLY_C4[4][2];
|
||||
__declspec(align(32)) VUINT32 _POLY_C3[4][2];
|
||||
__declspec(align(32)) VUINT32 _POLY_C2[4][2];
|
||||
__declspec(align(32)) VUINT32 _POLY_C1[4][2];
|
||||
__declspec(align(32)) VUINT32 _LowBoundary[8][1];
|
||||
__declspec(align(32)) VUINT32 _HighBoundary[8][1];
|
||||
} __svml_dhypot_data_internal;
|
||||
#endif
|
||||
__svml_dhypot_data_internal:
|
||||
/* legacy algorithm */
|
||||
.quad 0xffffc00000000000, 0xffffc00000000000, 0xffffc00000000000, 0xffffc00000000000 /* _dHiLoMask */
|
||||
.align 32
|
||||
.quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff /* _dAbsMask */
|
||||
.align 32
|
||||
.quad 0x3FF0000000000000, 0x3FF0000000000000, 0x3FF0000000000000, 0x3FF0000000000000 /* _dOne */
|
||||
.align 32
|
||||
.quad 0xBFCF800000000000, 0xBFCF800000000000, 0xBFCF800000000000, 0xBFCF800000000000 /* _POLY_C5 */
|
||||
.align 32
|
||||
.quad 0x3FD1800000000000, 0x3FD1800000000000, 0x3FD1800000000000, 0x3FD1800000000000 /* _POLY_C4 */
|
||||
.align 32
|
||||
.quad 0xBFD4000000000000, 0xBFD4000000000000, 0xBFD4000000000000, 0xBFD4000000000000 /* _POLY_C3 */
|
||||
.align 32
|
||||
.quad 0x3FD8000000000000, 0x3FD8000000000000, 0x3FD8000000000000, 0x3FD8000000000000 /* _POLY_C2 */
|
||||
.align 32
|
||||
.quad 0xBFE0000000000000, 0xBFE0000000000000, 0xBFE0000000000000, 0xBFE0000000000000 /* _POLY_C1 */
|
||||
.align 32
|
||||
.long 0x3BC00000, 0x3BC00000, 0x3BC00000, 0x3BC00000, 0x3BC00000, 0x3BC00000, 0x3BC00000, 0x3BC00000 /* _LowBoundary */
|
||||
.align 32
|
||||
.long 0x44100000, 0x44100000, 0x44100000, 0x44100000, 0x44100000, 0x44100000, 0x44100000, 0x44100000 /* _HighBoundary */
|
||||
.align 32
|
||||
.type __svml_dhypot_data_internal,@object
|
||||
.size __svml_dhypot_data_internal,.-__svml_dhypot_data_internal
|
20
sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core-avx2.S
Normal file
20
sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core-avx2.S
Normal file
@ -0,0 +1,20 @@
|
||||
/* AVX2 version of vectorized hypot.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#define _ZGVeN8vv_hypot _ZGVeN8vv_hypot_avx2_wrapper
|
||||
#include "../svml_d_hypot8_core.S"
|
28
sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core.c
Normal file
28
sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core.c
Normal file
@ -0,0 +1,28 @@
|
||||
/* Multiple versions of vectorized hypot, vector length is 8.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#define SYMBOL_NAME _ZGVeN8vv_hypot
|
||||
#include "ifunc-mathvec-avx512-skx.h"
|
||||
|
||||
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
|
||||
|
||||
#ifdef SHARED
|
||||
__hidden_ver1 (_ZGVeN8vv_hypot, __GI__ZGVeN8vv_hypot,
|
||||
__redirect__ZGVeN8vv_hypot)
|
||||
__attribute__ ((visibility ("hidden")));
|
||||
#endif
|
235
sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
Normal file
235
sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
Normal file
@ -0,0 +1,235 @@
|
||||
/* Function hypot vectorized with AVX-512.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
https://www.gnu.org/licenses/. */
|
||||
|
||||
/*
|
||||
* ALGORITHM DESCRIPTION:
|
||||
*
|
||||
* HIGH LEVEL OVERVIEW
|
||||
*
|
||||
* Calculate z = (x*x+y*y)
|
||||
* Calculate reciplicle sqrt (z)
|
||||
* Calculate error = z*(rsqrt(z)*rsqrt(z)) - 1
|
||||
* Calculate fixing part p with polynom
|
||||
* Fix answer with sqrt(z) = z * rsqrt(z) + error * p * z
|
||||
*
|
||||
* ALGORITHM DETAILS
|
||||
*
|
||||
* Multiprecision branch for _HA_ only
|
||||
* Remove sigm from both arguments
|
||||
* Find maximum (_x) and minimum (_y) (by abs value) between arguments
|
||||
* Split _x int _a and _b for multiprecision
|
||||
* If _x >> _y we will we will not split _y for multiprecision
|
||||
* all _y will be put into lower part (_d) and higher part (_c = 0)
|
||||
* Fixing _hilo_mask for the case _x >> _y
|
||||
* Split _y into _c and _d for multiprecision with fixed mask
|
||||
*
|
||||
* compute Hi and Lo parts of _z = _x*_x + _y*_y
|
||||
*
|
||||
* _zHi = _a*_a + _c*_c
|
||||
* _zLo = (_x + _a)*_b + _d*_y + _d*_c
|
||||
* _z = _zHi + _zLo
|
||||
*
|
||||
* No multiprecision branch for _LA_ and _EP_
|
||||
* _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
|
||||
*
|
||||
* Check _z exponent to be withing borders [3BC ; 441] else goto Callout
|
||||
*
|
||||
* _s ~ 1.0/sqrt(_z)
|
||||
* _s2 ~ 1.0/(sqrt(_z)*sqrt(_z)) ~ 1.0/_z = (1.0/_z + O)
|
||||
* _e[rror] = (1.0/_z + O) * _z - 1.0
|
||||
* calculate fixing part _p
|
||||
* _p = (((_POLY_C5*_e + _POLY_C4)*_e +_POLY_C3)*_e +_POLY_C2)*_e + _POLY_C1
|
||||
* some parts of polynom are skipped for lower flav
|
||||
*
|
||||
* result = _z * (1.0/sqrt(_z) + O) + _p * _e[rror] * _z
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/* Offsets for data table __svml_dhypot_data_internal
|
||||
*/
|
||||
#define _dAbsMask 0
|
||||
#define _lExpBound_uisa 64
|
||||
#define _lExpBound 128
|
||||
#define _dHalf 192
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.text
|
||||
.section .text.evex512,"ax",@progbits
|
||||
ENTRY(_ZGVeN8vv_hypot_skx)
|
||||
pushq %rbp
|
||||
cfi_def_cfa_offset(16)
|
||||
movq %rsp, %rbp
|
||||
cfi_def_cfa(6, 16)
|
||||
cfi_offset(6, -16)
|
||||
andq $-64, %rsp
|
||||
subq $256, %rsp
|
||||
vgetexppd {sae}, %zmm0, %zmm2
|
||||
vgetexppd {sae}, %zmm1, %zmm3
|
||||
vmovups _dHalf+__svml_dhypot_data_internal(%rip), %zmm9
|
||||
vmaxpd {sae}, %zmm3, %zmm2, %zmm4
|
||||
vmulpd {rn-sae}, %zmm0, %zmm0, %zmm2
|
||||
vandpd _dAbsMask+__svml_dhypot_data_internal(%rip), %zmm4, %zmm5
|
||||
vfmadd231pd {rn-sae}, %zmm1, %zmm1, %zmm2
|
||||
|
||||
/* Select exponent bound so that no scaling is needed */
|
||||
vpcmpq $5, _lExpBound_uisa+__svml_dhypot_data_internal(%rip), %zmm5, %k0
|
||||
vrsqrt14pd %zmm2, %zmm6
|
||||
kmovw %k0, %edx
|
||||
vmulpd {rn-sae}, %zmm6, %zmm2, %zmm7
|
||||
vmulpd {rn-sae}, %zmm6, %zmm9, %zmm8
|
||||
vfnmadd231pd {rn-sae}, %zmm7, %zmm8, %zmm9
|
||||
vfmadd231pd {rn-sae}, %zmm9, %zmm8, %zmm8
|
||||
vfmadd213pd {rn-sae}, %zmm7, %zmm7, %zmm9
|
||||
vfnmadd231pd {rn-sae}, %zmm9, %zmm9, %zmm2
|
||||
vfmadd213pd {rn-sae}, %zmm9, %zmm8, %zmm2
|
||||
|
||||
/* The end of implementation */
|
||||
testl %edx, %edx
|
||||
|
||||
/* Go to special inputs processing branch */
|
||||
jne L(SPECIAL_VALUES_BRANCH)
|
||||
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1 zmm2
|
||||
|
||||
/* Restore registers
|
||||
* and exit the function
|
||||
*/
|
||||
|
||||
L(EXIT):
|
||||
vmovaps %zmm2, %zmm0
|
||||
movq %rbp, %rsp
|
||||
popq %rbp
|
||||
cfi_def_cfa(7, 8)
|
||||
cfi_restore(6)
|
||||
ret
|
||||
cfi_def_cfa(6, 16)
|
||||
cfi_offset(6, -16)
|
||||
|
||||
/* Branch to process
|
||||
* special inputs
|
||||
*/
|
||||
|
||||
L(SPECIAL_VALUES_BRANCH):
|
||||
vmovups %zmm0, 64(%rsp)
|
||||
vmovups %zmm1, 128(%rsp)
|
||||
vmovups %zmm2, 192(%rsp)
|
||||
# LOE rbx r12 r13 r14 r15 edx zmm2
|
||||
|
||||
xorl %eax, %eax
|
||||
# LOE rbx r12 r13 r14 r15 eax edx
|
||||
|
||||
vzeroupper
|
||||
movq %r12, 16(%rsp)
|
||||
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -240; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x10, 0xff, 0xff, 0xff, 0x22
|
||||
movl %eax, %r12d
|
||||
movq %r13, 8(%rsp)
|
||||
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -248; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22
|
||||
movl %edx, %r13d
|
||||
movq %r14, (%rsp)
|
||||
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -256; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x00, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE rbx r15 r12d r13d
|
||||
|
||||
/* Range mask
|
||||
* bits check
|
||||
*/
|
||||
|
||||
L(RANGEMASK_CHECK):
|
||||
btl %r12d, %r13d
|
||||
|
||||
/* Call scalar math function */
|
||||
jc L(SCALAR_MATH_CALL)
|
||||
# LOE rbx r15 r12d r13d
|
||||
|
||||
/* Special inputs
|
||||
* processing loop
|
||||
*/
|
||||
|
||||
L(SPECIAL_VALUES_LOOP):
|
||||
incl %r12d
|
||||
cmpl $8, %r12d
|
||||
|
||||
/* Check bits in range mask */
|
||||
jl L(RANGEMASK_CHECK)
|
||||
# LOE rbx r15 r12d r13d
|
||||
|
||||
movq 16(%rsp), %r12
|
||||
cfi_restore(12)
|
||||
movq 8(%rsp), %r13
|
||||
cfi_restore(13)
|
||||
movq (%rsp), %r14
|
||||
cfi_restore(14)
|
||||
vmovups 192(%rsp), %zmm2
|
||||
|
||||
/* Go to exit */
|
||||
jmp L(EXIT)
|
||||
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -240; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x10, 0xff, 0xff, 0xff, 0x22
|
||||
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -248; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22
|
||||
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -256; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x00, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE rbx r12 r13 r14 r15 zmm2
|
||||
|
||||
/* Scalar math fucntion call
|
||||
* to process special input
|
||||
*/
|
||||
|
||||
L(SCALAR_MATH_CALL):
|
||||
movl %r12d, %r14d
|
||||
movsd 64(%rsp,%r14,8), %xmm0
|
||||
movsd 128(%rsp,%r14,8), %xmm1
|
||||
call hypot@PLT
|
||||
# LOE rbx r14 r15 r12d r13d xmm0
|
||||
|
||||
movsd %xmm0, 192(%rsp,%r14,8)
|
||||
|
||||
/* Process special inputs in loop */
|
||||
jmp L(SPECIAL_VALUES_LOOP)
|
||||
# LOE rbx r15 r12d r13d
|
||||
END(_ZGVeN8vv_hypot_skx)
|
||||
|
||||
.section .rodata, "a"
|
||||
.align 64
|
||||
|
||||
#ifdef __svml_dhypot_data_internal_typedef
|
||||
typedef unsigned int VUINT32;
|
||||
typedef struct
|
||||
{
|
||||
__declspec(align(64)) VUINT32 _dAbsMask[8][2];
|
||||
__declspec(align(64)) VUINT32 _lExpBound_uisa[8][2];
|
||||
__declspec(align(64)) VUINT32 _lExpBound[8][2];
|
||||
__declspec(align(64)) VUINT32 _dHalf[8][2];
|
||||
} __svml_dhypot_data_internal;
|
||||
#endif
|
||||
__svml_dhypot_data_internal:
|
||||
/* legacy algorithm */
|
||||
.quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff /* _dAbsMask */
|
||||
/* fma based algorithm*/
|
||||
.align 64
|
||||
.quad 0x407ff00000000000, 0x407ff00000000000, 0x407ff00000000000, 0x407ff00000000000, 0x407ff00000000000, 0x407ff00000000000, 0x407ff00000000000, 0x407ff00000000000 /* _lExpBound_uisa */
|
||||
.align 64
|
||||
.quad 0x404f800000000000, 0x404f800000000000, 0x404f800000000000, 0x404f800000000000, 0x404f800000000000, 0x404f800000000000, 0x404f800000000000, 0x404f800000000000 /* _lExpBound */
|
||||
.align 64
|
||||
.quad 0x3FE0000000000000, 0x3FE0000000000000, 0x3FE0000000000000, 0x3FE0000000000000, 0x3FE0000000000000, 0x3FE0000000000000, 0x3FE0000000000000, 0x3FE0000000000000 /* _dHalf */
|
||||
.align 64
|
||||
.type __svml_dhypot_data_internal,@object
|
||||
.size __svml_dhypot_data_internal,.-__svml_dhypot_data_internal
|
20
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core-avx2.S
Normal file
20
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core-avx2.S
Normal file
@ -0,0 +1,20 @@
|
||||
/* AVX2 version of vectorized hypotf.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#define _ZGVeN16vv_hypotf _ZGVeN16vv_hypotf_avx2_wrapper
|
||||
#include "../svml_s_hypotf16_core.S"
|
28
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core.c
Normal file
28
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core.c
Normal file
@ -0,0 +1,28 @@
|
||||
/* Multiple versions of vectorized hypotf, vector length is 16.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#define SYMBOL_NAME _ZGVeN16vv_hypotf
|
||||
#include "ifunc-mathvec-avx512-skx.h"
|
||||
|
||||
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
|
||||
|
||||
#ifdef SHARED
|
||||
__hidden_ver1 (_ZGVeN16vv_hypotf, __GI__ZGVeN16vv_hypotf,
|
||||
__redirect__ZGVeN16vv_hypotf)
|
||||
__attribute__ ((visibility ("hidden")));
|
||||
#endif
|
239
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
Normal file
239
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
Normal file
@ -0,0 +1,239 @@
|
||||
/* Function hypotf vectorized with AVX-512.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
https://www.gnu.org/licenses/. */
|
||||
|
||||
/*
|
||||
* ALGORITHM DESCRIPTION:
|
||||
*
|
||||
* HIGH LEVEL OVERVIEW
|
||||
*
|
||||
* Calculate z = (x*x+y*y)
|
||||
* Calculate reciplicle sqrt (z)
|
||||
* Calculate make two NR iterations
|
||||
*
|
||||
* ALGORITHM DETAILS
|
||||
*
|
||||
* Multiprecision branch for _HA_ only
|
||||
* Remove sigm from both arguments
|
||||
* Find maximum (_x) and minimum (_y) (by abs value) between arguments
|
||||
* Split _x int _a and _b for multiprecision
|
||||
* If _x >> _y we will we will not split _y for multiprecision
|
||||
* all _y will be put into lower part (_d) and higher part (_c = 0)
|
||||
* Fixing _hilo_mask for the case _x >> _y
|
||||
* Split _y into _c and _d for multiprecision with fixed mask
|
||||
*
|
||||
* compute Hi and Lo parts of _z = _x*_x + _y*_y
|
||||
*
|
||||
* _zHi = _a*_a + _c*_c
|
||||
* _zLo = (_x + _a)*_b + _d*_y + _d*_c
|
||||
* _z = _zHi + _zLo
|
||||
*
|
||||
* No multiprecision branch for _LA_ and _EP_
|
||||
* _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
|
||||
*
|
||||
* Check _z exponent to be withing borders [1E3 ; 60A] else goto Callout
|
||||
*
|
||||
* Compute resciplicle sqrt s0 ~ 1.0/sqrt(_z),
|
||||
* that multiplied by _z, is final result for _EP_ version.
|
||||
*
|
||||
* First iteration (or zero iteration):
|
||||
* s = z * s0
|
||||
* h = .5 * s0
|
||||
* d = s * h - .5
|
||||
*
|
||||
* Second iteration:
|
||||
* h = d * h + h
|
||||
* s = s * d + s
|
||||
* d = s * s - z (in multiprecision for _HA_)
|
||||
*
|
||||
* result = s - h * d
|
||||
*
|
||||
* EP version of the function can be implemented as y[i]=sqrt(a[i]^2+b[i]^2)
|
||||
* with all intermediate operations done in target precision for i=1,..,n.
|
||||
* It can return result y[i]=0 in case a[i]^2 and b[i]^2 underflow in target
|
||||
* precision (for some i). It can return result y[i]=NAN in case
|
||||
* a[i]^2+b[i]^2 overflow in target precision, for some i. It can return
|
||||
* result y[i]=NAN in case a[i] or b[i] is infinite, for some i.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/* Offsets for data table __svml_shypot_data_internal
|
||||
*/
|
||||
#define _sAbsMask 0
|
||||
#define _sHalf 64
|
||||
#define _iExpBound 128
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.text
|
||||
.section .text.exex512,"ax",@progbits
|
||||
ENTRY(_ZGVeN16vv_hypotf_skx)
|
||||
pushq %rbp
|
||||
cfi_def_cfa_offset(16)
|
||||
movq %rsp, %rbp
|
||||
cfi_def_cfa(6, 16)
|
||||
cfi_offset(6, -16)
|
||||
andq $-64, %rsp
|
||||
subq $256, %rsp
|
||||
vgetexpps {sae}, %zmm0, %zmm2
|
||||
vgetexpps {sae}, %zmm1, %zmm3
|
||||
vmovups _sHalf+__svml_shypot_data_internal(%rip), %zmm6
|
||||
vmaxps {sae}, %zmm3, %zmm2, %zmm4
|
||||
vmulps {rn-sae}, %zmm0, %zmm0, %zmm2
|
||||
vandps _sAbsMask+__svml_shypot_data_internal(%rip), %zmm4, %zmm5
|
||||
vfmadd231ps {rn-sae}, %zmm1, %zmm1, %zmm2
|
||||
vpcmpd $5, _iExpBound+__svml_shypot_data_internal(%rip), %zmm5, %k0
|
||||
vrsqrt14ps %zmm2, %zmm7
|
||||
kmovw %k0, %edx
|
||||
vmulps {rn-sae}, %zmm7, %zmm2, %zmm9
|
||||
vmulps {rn-sae}, %zmm7, %zmm6, %zmm8
|
||||
vfnmadd231ps {rn-sae}, %zmm9, %zmm9, %zmm2
|
||||
vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm2
|
||||
|
||||
/*
|
||||
* VSCALEF( S, _VRES1, _VRES1, sExp );
|
||||
* The end of implementation
|
||||
*/
|
||||
testl %edx, %edx
|
||||
|
||||
/* Go to special inputs processing branch */
|
||||
jne L(SPECIAL_VALUES_BRANCH)
|
||||
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1 zmm2
|
||||
|
||||
/* Restore registers
|
||||
* and exit the function
|
||||
*/
|
||||
|
||||
L(EXIT):
|
||||
vmovaps %zmm2, %zmm0
|
||||
movq %rbp, %rsp
|
||||
popq %rbp
|
||||
cfi_def_cfa(7, 8)
|
||||
cfi_restore(6)
|
||||
ret
|
||||
cfi_def_cfa(6, 16)
|
||||
cfi_offset(6, -16)
|
||||
|
||||
/* Branch to process
|
||||
* special inputs
|
||||
*/
|
||||
|
||||
L(SPECIAL_VALUES_BRANCH):
|
||||
vmovups %zmm0, 64(%rsp)
|
||||
vmovups %zmm1, 128(%rsp)
|
||||
vmovups %zmm2, 192(%rsp)
|
||||
# LOE rbx r12 r13 r14 r15 edx zmm2
|
||||
|
||||
xorl %eax, %eax
|
||||
# LOE rbx r12 r13 r14 r15 eax edx
|
||||
|
||||
vzeroupper
|
||||
movq %r12, 16(%rsp)
|
||||
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -240; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x10, 0xff, 0xff, 0xff, 0x22
|
||||
movl %eax, %r12d
|
||||
movq %r13, 8(%rsp)
|
||||
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -248; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22
|
||||
movl %edx, %r13d
|
||||
movq %r14, (%rsp)
|
||||
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -256; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x00, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE rbx r15 r12d r13d
|
||||
|
||||
/* Range mask
|
||||
* bits check
|
||||
*/
|
||||
|
||||
L(RANGEMASK_CHECK):
|
||||
btl %r12d, %r13d
|
||||
|
||||
/* Call scalar math function */
|
||||
jc L(SCALAR_MATH_CALL)
|
||||
# LOE rbx r15 r12d r13d
|
||||
|
||||
/* Special inputs
|
||||
* processing loop
|
||||
*/
|
||||
|
||||
L(SPECIAL_VALUES_LOOP):
|
||||
incl %r12d
|
||||
cmpl $16, %r12d
|
||||
|
||||
/* Check bits in range mask */
|
||||
jl L(RANGEMASK_CHECK)
|
||||
# LOE rbx r15 r12d r13d
|
||||
|
||||
movq 16(%rsp), %r12
|
||||
cfi_restore(12)
|
||||
movq 8(%rsp), %r13
|
||||
cfi_restore(13)
|
||||
movq (%rsp), %r14
|
||||
cfi_restore(14)
|
||||
vmovups 192(%rsp), %zmm2
|
||||
|
||||
/* Go to exit */
|
||||
jmp L(EXIT)
|
||||
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -240; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x10, 0xff, 0xff, 0xff, 0x22
|
||||
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -248; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22
|
||||
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -256; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x00, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE rbx r12 r13 r14 r15 zmm2
|
||||
|
||||
/* Scalar math fucntion call
|
||||
* to process special input
|
||||
*/
|
||||
|
||||
L(SCALAR_MATH_CALL):
|
||||
movl %r12d, %r14d
|
||||
movss 64(%rsp,%r14,4), %xmm0
|
||||
movss 128(%rsp,%r14,4), %xmm1
|
||||
call hypotf@PLT
|
||||
# LOE rbx r14 r15 r12d r13d xmm0
|
||||
|
||||
movss %xmm0, 192(%rsp,%r14,4)
|
||||
|
||||
/* Process special inputs in loop */
|
||||
jmp L(SPECIAL_VALUES_LOOP)
|
||||
# LOE rbx r15 r12d r13d
|
||||
END(_ZGVeN16vv_hypotf_skx)
|
||||
|
||||
.section .rodata, "a"
|
||||
.align 64
|
||||
|
||||
#ifdef __svml_shypot_data_internal_typedef
|
||||
typedef unsigned int VUINT32;
|
||||
typedef struct
|
||||
{
|
||||
__declspec(align(64)) VUINT32 _sAbsMask[16][1];
|
||||
__declspec(align(64)) VUINT32 _sHalf[16][1];
|
||||
__declspec(align(64)) VUINT32 _iExpBound[16][1];
|
||||
} __svml_shypot_data_internal;
|
||||
#endif
|
||||
__svml_shypot_data_internal:
|
||||
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
|
||||
.align 64
|
||||
.long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sHalf */
|
||||
/* fma based algorithm*/
|
||||
.align 64
|
||||
.long 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000 /* _iExpBound */
|
||||
.align 64
|
||||
.type __svml_shypot_data_internal,@object
|
||||
.size __svml_shypot_data_internal,.-__svml_shypot_data_internal
|
20
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf4_core-sse2.S
Normal file
20
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf4_core-sse2.S
Normal file
@ -0,0 +1,20 @@
|
||||
/* SSE2 version of vectorized hypotf.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#define _ZGVbN4vv_hypotf _ZGVbN4vv_hypotf_sse2
|
||||
#include "../svml_s_hypotf4_core.S"
|
28
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf4_core.c
Normal file
28
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf4_core.c
Normal file
@ -0,0 +1,28 @@
|
||||
/* Multiple versions of vectorized hypotf, vector length is 4.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#define SYMBOL_NAME _ZGVbN4vv_hypotf
|
||||
#include "ifunc-mathvec-sse4_1.h"
|
||||
|
||||
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
|
||||
|
||||
#ifdef SHARED
|
||||
__hidden_ver1 (_ZGVbN4vv_hypotf, __GI__ZGVbN4vv_hypotf,
|
||||
__redirect__ZGVbN4vv_hypotf)
|
||||
__attribute__ ((visibility ("hidden")));
|
||||
#endif
|
265
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf4_core_sse4.S
Normal file
265
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf4_core_sse4.S
Normal file
@ -0,0 +1,265 @@
|
||||
/* Function hypotf vectorized with SSE4.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
https://www.gnu.org/licenses/. */
|
||||
|
||||
/*
|
||||
* ALGORITHM DESCRIPTION:
|
||||
*
|
||||
* HIGH LEVEL OVERVIEW
|
||||
*
|
||||
* Calculate z = (x*x+y*y)
|
||||
* Calculate reciplicle sqrt (z)
|
||||
* Calculate make two NR iterations
|
||||
*
|
||||
* ALGORITHM DETAILS
|
||||
*
|
||||
* Multiprecision branch for _HA_ only
|
||||
* Remove sigm from both arguments
|
||||
* Find maximum (_x) and minimum (_y) (by abs value) between arguments
|
||||
* Split _x int _a and _b for multiprecision
|
||||
* If _x >> _y we will we will not split _y for multiprecision
|
||||
* all _y will be put into lower part (_d) and higher part (_c = 0)
|
||||
* Fixing _hilo_mask for the case _x >> _y
|
||||
* Split _y into _c and _d for multiprecision with fixed mask
|
||||
*
|
||||
* compute Hi and Lo parts of _z = _x*_x + _y*_y
|
||||
*
|
||||
* _zHi = _a*_a + _c*_c
|
||||
* _zLo = (_x + _a)*_b + _d*_y + _d*_c
|
||||
* _z = _zHi + _zLo
|
||||
*
|
||||
* No multiprecision branch for _LA_ and _EP_
|
||||
* _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
|
||||
*
|
||||
* Check _z exponent to be withing borders [1E3 ; 60A] else goto Callout
|
||||
*
|
||||
* Compute resciplicle sqrt s0 ~ 1.0/sqrt(_z),
|
||||
* that multiplied by _z, is final result for _EP_ version.
|
||||
*
|
||||
* First iteration (or zero iteration):
|
||||
* s = z * s0
|
||||
* h = .5 * s0
|
||||
* d = s * h - .5
|
||||
*
|
||||
* Second iteration:
|
||||
* h = d * h + h
|
||||
* s = s * d + s
|
||||
* d = s * s - z (in multiprecision for _HA_)
|
||||
*
|
||||
* result = s - h * d
|
||||
*
|
||||
* EP version of the function can be implemented as y[i]=sqrt(a[i]^2+b[i]^2)
|
||||
* with all intermediate operations done in target precision for i=1,..,n.
|
||||
* It can return result y[i]=0 in case a[i]^2 and b[i]^2 underflow in target
|
||||
* precision (for some i). It can return result y[i]=NAN in case
|
||||
* a[i]^2+b[i]^2 overflow in target precision, for some i. It can return
|
||||
* result y[i]=NAN in case a[i] or b[i] is infinite, for some i.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/* Offsets for data table __svml_shypot_data_internal
|
||||
*/
|
||||
#define _sHiLoMask 0
|
||||
#define _sAbsMask 16
|
||||
#define _sHalf 32
|
||||
#define _LowBoundary 48
|
||||
#define _HighBoundary 64
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.text
|
||||
.section .text.sse4,"ax",@progbits
|
||||
ENTRY(_ZGVbN4vv_hypotf_sse4)
|
||||
subq $88, %rsp
|
||||
cfi_def_cfa_offset(96)
|
||||
|
||||
/*
|
||||
* Implementation
|
||||
* Multiprecision branch for _HA_ only
|
||||
* No multiprecision branch for _LA_
|
||||
* _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
|
||||
*/
|
||||
movaps %xmm0, %xmm8
|
||||
movaps %xmm1, %xmm2
|
||||
mulps %xmm0, %xmm8
|
||||
mulps %xmm1, %xmm2
|
||||
|
||||
/*
|
||||
* Variables
|
||||
* Defines
|
||||
* Constants loading
|
||||
*/
|
||||
movups _sHalf+__svml_shypot_data_internal(%rip), %xmm5
|
||||
addps %xmm2, %xmm8
|
||||
|
||||
/* _s0 ~ 1.0/sqrt(_z) */
|
||||
rsqrtps %xmm8, %xmm10
|
||||
|
||||
/* First iteration */
|
||||
movaps %xmm10, %xmm2
|
||||
movaps %xmm8, %xmm3
|
||||
mulps %xmm8, %xmm2
|
||||
mulps %xmm5, %xmm10
|
||||
movaps %xmm2, %xmm6
|
||||
mulps %xmm10, %xmm6
|
||||
|
||||
/* Check _z exponent to be withing borders [1E3 ; 60A] else goto Callout */
|
||||
movdqu _LowBoundary+__svml_shypot_data_internal(%rip), %xmm4
|
||||
subps %xmm6, %xmm5
|
||||
|
||||
/* Second iteration */
|
||||
movaps %xmm5, %xmm7
|
||||
pcmpgtd %xmm8, %xmm4
|
||||
mulps %xmm2, %xmm5
|
||||
mulps %xmm10, %xmm7
|
||||
addps %xmm5, %xmm2
|
||||
addps %xmm7, %xmm10
|
||||
|
||||
/* Finish second iteration in native precision for _LA_ */
|
||||
movaps %xmm2, %xmm9
|
||||
mulps %xmm2, %xmm9
|
||||
pcmpgtd _HighBoundary+__svml_shypot_data_internal(%rip), %xmm3
|
||||
subps %xmm8, %xmm9
|
||||
mulps %xmm9, %xmm10
|
||||
por %xmm3, %xmm4
|
||||
movmskps %xmm4, %edx
|
||||
subps %xmm10, %xmm2
|
||||
|
||||
/* The end of implementation */
|
||||
testl %edx, %edx
|
||||
|
||||
/* Go to special inputs processing branch */
|
||||
jne L(SPECIAL_VALUES_BRANCH)
|
||||
# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1 xmm2
|
||||
|
||||
/* Restore registers
|
||||
* and exit the function
|
||||
*/
|
||||
|
||||
L(EXIT):
|
||||
movaps %xmm2, %xmm0
|
||||
addq $88, %rsp
|
||||
cfi_def_cfa_offset(8)
|
||||
ret
|
||||
cfi_def_cfa_offset(96)
|
||||
|
||||
/* Branch to process
|
||||
* special inputs
|
||||
*/
|
||||
|
||||
L(SPECIAL_VALUES_BRANCH):
|
||||
movups %xmm0, 32(%rsp)
|
||||
movups %xmm1, 48(%rsp)
|
||||
movups %xmm2, 64(%rsp)
|
||||
# LOE rbx rbp r12 r13 r14 r15 edx
|
||||
|
||||
xorl %eax, %eax
|
||||
movq %r12, 16(%rsp)
|
||||
cfi_offset(12, -80)
|
||||
movl %eax, %r12d
|
||||
movq %r13, 8(%rsp)
|
||||
cfi_offset(13, -88)
|
||||
movl %edx, %r13d
|
||||
movq %r14, (%rsp)
|
||||
cfi_offset(14, -96)
|
||||
# LOE rbx rbp r15 r12d r13d
|
||||
|
||||
/* Range mask
|
||||
* bits check
|
||||
*/
|
||||
|
||||
L(RANGEMASK_CHECK):
|
||||
btl %r12d, %r13d
|
||||
|
||||
/* Call scalar math function */
|
||||
jc L(SCALAR_MATH_CALL)
|
||||
# LOE rbx rbp r15 r12d r13d
|
||||
|
||||
/* Special inputs
|
||||
* processing loop
|
||||
*/
|
||||
|
||||
L(SPECIAL_VALUES_LOOP):
|
||||
incl %r12d
|
||||
cmpl $4, %r12d
|
||||
|
||||
/* Check bits in range mask */
|
||||
jl L(RANGEMASK_CHECK)
|
||||
# LOE rbx rbp r15 r12d r13d
|
||||
|
||||
movq 16(%rsp), %r12
|
||||
cfi_restore(12)
|
||||
movq 8(%rsp), %r13
|
||||
cfi_restore(13)
|
||||
movq (%rsp), %r14
|
||||
cfi_restore(14)
|
||||
movups 64(%rsp), %xmm2
|
||||
|
||||
/* Go to exit */
|
||||
jmp L(EXIT)
|
||||
cfi_offset(12, -80)
|
||||
cfi_offset(13, -88)
|
||||
cfi_offset(14, -96)
|
||||
# LOE rbx rbp r12 r13 r14 r15 xmm2
|
||||
|
||||
/* Scalar math fucntion call
|
||||
* to process special input
|
||||
*/
|
||||
|
||||
L(SCALAR_MATH_CALL):
|
||||
movl %r12d, %r14d
|
||||
movss 32(%rsp,%r14,4), %xmm0
|
||||
movss 48(%rsp,%r14,4), %xmm1
|
||||
call hypotf@PLT
|
||||
# LOE rbx rbp r14 r15 r12d r13d xmm0
|
||||
|
||||
movss %xmm0, 64(%rsp,%r14,4)
|
||||
|
||||
/* Process special inputs in loop */
|
||||
jmp L(SPECIAL_VALUES_LOOP)
|
||||
# LOE rbx rbp r15 r12d r13d
|
||||
END(_ZGVbN4vv_hypotf_sse4)
|
||||
|
||||
.section .rodata, "a"
|
||||
.align 16
|
||||
|
||||
#ifdef __svml_shypot_data_internal_typedef
|
||||
typedef unsigned int VUINT32;
|
||||
typedef struct
|
||||
{
|
||||
__declspec(align(16)) VUINT32 _sHiLoMask[4][1];
|
||||
__declspec(align(16)) VUINT32 _sAbsMask[4][1];
|
||||
__declspec(align(16)) VUINT32 _sHalf[4][1];
|
||||
__declspec(align(16)) VUINT32 _LowBoundary[4][1];
|
||||
__declspec(align(16)) VUINT32 _HighBoundary[4][1];
|
||||
} __svml_shypot_data_internal;
|
||||
#endif
|
||||
__svml_shypot_data_internal:
|
||||
/* legacy algorithm */
|
||||
.long 0xFFF80000, 0xFFF80000, 0xFFF80000, 0xFFF80000 /* _sHiLoMask */
|
||||
.align 16
|
||||
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
|
||||
.align 16
|
||||
.long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sHalf */
|
||||
.align 16
|
||||
.long 0x1E300000, 0x1E300000, 0x1E300000, 0x1E300000 /* _LowBoundary */
|
||||
.align 16
|
||||
.long 0x60A00000, 0x60A00000, 0x60A00000, 0x60A00000 /* _HighBoundary */
|
||||
.align 16
|
||||
.type __svml_shypot_data_internal,@object
|
||||
.size __svml_shypot_data_internal,.-__svml_shypot_data_internal
|
20
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core-sse.S
Normal file
20
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core-sse.S
Normal file
@ -0,0 +1,20 @@
|
||||
/* SSE version of vectorized hypotf.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#define _ZGVdN8vv_hypotf _ZGVdN8vv_hypotf_sse_wrapper
|
||||
#include "../svml_s_hypotf8_core.S"
|
28
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core.c
Normal file
28
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core.c
Normal file
@ -0,0 +1,28 @@
|
||||
/* Multiple versions of vectorized sinf, vector length is 8.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#define SYMBOL_NAME _ZGVdN8vv_hypotf
|
||||
#include "ifunc-mathvec-avx2.h"
|
||||
|
||||
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
|
||||
|
||||
#ifdef SHARED
|
||||
__hidden_ver1 (_ZGVdN8vv_hypotf, __GI__ZGVdN8vv_hypotf,
|
||||
__redirect__ZGVdN8vv_hypotf)
|
||||
__attribute__ ((visibility ("hidden")));
|
||||
#endif
|
269
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
Normal file
269
sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
Normal file
@ -0,0 +1,269 @@
|
||||
/* Function hypotf vectorized with AVX2.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
https://www.gnu.org/licenses/. */
|
||||
|
||||
/*
|
||||
* ALGORITHM DESCRIPTION:
|
||||
*
|
||||
* HIGH LEVEL OVERVIEW
|
||||
*
|
||||
* Calculate z = (x*x+y*y)
|
||||
* Calculate reciplicle sqrt (z)
|
||||
* Calculate make two NR iterations
|
||||
*
|
||||
* ALGORITHM DETAILS
|
||||
*
|
||||
* Multiprecision branch for _HA_ only
|
||||
* Remove sigm from both arguments
|
||||
* Find maximum (_x) and minimum (_y) (by abs value) between arguments
|
||||
* Split _x int _a and _b for multiprecision
|
||||
* If _x >> _y we will we will not split _y for multiprecision
|
||||
* all _y will be put into lower part (_d) and higher part (_c = 0)
|
||||
* Fixing _hilo_mask for the case _x >> _y
|
||||
* Split _y into _c and _d for multiprecision with fixed mask
|
||||
*
|
||||
* compute Hi and Lo parts of _z = _x*_x + _y*_y
|
||||
*
|
||||
* _zHi = _a*_a + _c*_c
|
||||
* _zLo = (_x + _a)*_b + _d*_y + _d*_c
|
||||
* _z = _zHi + _zLo
|
||||
*
|
||||
* No multiprecision branch for _LA_ and _EP_
|
||||
* _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
|
||||
*
|
||||
* Check _z exponent to be withing borders [1E3 ; 60A] else goto Callout
|
||||
*
|
||||
* Compute resciplicle sqrt s0 ~ 1.0/sqrt(_z),
|
||||
* that multiplied by _z, is final result for _EP_ version.
|
||||
*
|
||||
* First iteration (or zero iteration):
|
||||
* s = z * s0
|
||||
* h = .5 * s0
|
||||
* d = s * h - .5
|
||||
*
|
||||
* Second iteration:
|
||||
* h = d * h + h
|
||||
* s = s * d + s
|
||||
* d = s * s - z (in multiprecision for _HA_)
|
||||
*
|
||||
* result = s - h * d
|
||||
*
|
||||
* EP version of the function can be implemented as y[i]=sqrt(a[i]^2+b[i]^2)
|
||||
* with all intermediate operations done in target precision for i=1,..,n.
|
||||
* It can return result y[i]=0 in case a[i]^2 and b[i]^2 underflow in target
|
||||
* precision (for some i). It can return result y[i]=NAN in case
|
||||
* a[i]^2+b[i]^2 overflow in target precision, for some i. It can return
|
||||
* result y[i]=NAN in case a[i] or b[i] is infinite, for some i.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/* Offsets for data table __svml_shypot_data_internal
|
||||
*/
|
||||
#define _sHiLoMask 0
|
||||
#define _sAbsMask 32
|
||||
#define _sHalf 64
|
||||
#define _LowBoundary 96
|
||||
#define _HighBoundary 128
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.text
|
||||
.section .text.avx2,"ax",@progbits
|
||||
ENTRY(_ZGVdN8vv_hypotf_avx2)
|
||||
pushq %rbp
|
||||
cfi_def_cfa_offset(16)
|
||||
movq %rsp, %rbp
|
||||
cfi_def_cfa(6, 16)
|
||||
cfi_offset(6, -16)
|
||||
andq $-32, %rsp
|
||||
subq $128, %rsp
|
||||
|
||||
/*
|
||||
* Implementation
|
||||
* Multiprecision branch for _HA_ only
|
||||
* No multiprecision branch for _LA_
|
||||
* _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
|
||||
*/
|
||||
vmulps %ymm0, %ymm0, %ymm8
|
||||
|
||||
/*
|
||||
* Variables
|
||||
* Defines
|
||||
* Constants loading
|
||||
*/
|
||||
vmovups _sHalf+__svml_shypot_data_internal(%rip), %ymm7
|
||||
|
||||
/* Check _z exponent to be withing borders [1E3 ; 60A] else goto Callout */
|
||||
vmovups _LowBoundary+__svml_shypot_data_internal(%rip), %ymm2
|
||||
vfmadd231ps %ymm1, %ymm1, %ymm8
|
||||
|
||||
/* _s0 ~ 1.0/sqrt(_z) */
|
||||
vrsqrtps %ymm8, %ymm6
|
||||
vpcmpgtd %ymm8, %ymm2, %ymm3
|
||||
|
||||
/* First iteration */
|
||||
vmulps %ymm8, %ymm6, %ymm9
|
||||
vmulps %ymm7, %ymm6, %ymm2
|
||||
vfnmadd231ps %ymm9, %ymm2, %ymm7
|
||||
vfmadd213ps %ymm9, %ymm7, %ymm9
|
||||
|
||||
/* Second iteration */
|
||||
vfmadd132ps %ymm7, %ymm2, %ymm2
|
||||
vpcmpgtd _HighBoundary+__svml_shypot_data_internal(%rip), %ymm8, %ymm4
|
||||
vpor %ymm4, %ymm3, %ymm5
|
||||
|
||||
/* Finish second iteration in native precision for _LA_ */
|
||||
vfmsub231ps %ymm9, %ymm9, %ymm8
|
||||
vmovmskps %ymm5, %edx
|
||||
vfnmadd213ps %ymm9, %ymm8, %ymm2
|
||||
|
||||
/* The end of implementation */
|
||||
testl %edx, %edx
|
||||
|
||||
/* Go to special inputs processing branch */
|
||||
jne L(SPECIAL_VALUES_BRANCH)
|
||||
# LOE rbx r12 r13 r14 r15 edx ymm0 ymm1 ymm2
|
||||
|
||||
/* Restore registers
|
||||
* and exit the function
|
||||
*/
|
||||
|
||||
L(EXIT):
|
||||
vmovaps %ymm2, %ymm0
|
||||
movq %rbp, %rsp
|
||||
popq %rbp
|
||||
cfi_def_cfa(7, 8)
|
||||
cfi_restore(6)
|
||||
ret
|
||||
cfi_def_cfa(6, 16)
|
||||
cfi_offset(6, -16)
|
||||
|
||||
/* Branch to process
|
||||
* special inputs
|
||||
*/
|
||||
|
||||
L(SPECIAL_VALUES_BRANCH):
|
||||
vmovups %ymm0, 32(%rsp)
|
||||
vmovups %ymm1, 64(%rsp)
|
||||
vmovups %ymm2, 96(%rsp)
|
||||
# LOE rbx r12 r13 r14 r15 edx ymm2
|
||||
|
||||
xorl %eax, %eax
|
||||
# LOE rbx r12 r13 r14 r15 eax edx
|
||||
|
||||
vzeroupper
|
||||
movq %r12, 16(%rsp)
|
||||
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -112; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xff, 0xff, 0xff, 0x22
|
||||
movl %eax, %r12d
|
||||
movq %r13, 8(%rsp)
|
||||
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
|
||||
movl %edx, %r13d
|
||||
movq %r14, (%rsp)
|
||||
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE rbx r15 r12d r13d
|
||||
|
||||
/* Range mask
|
||||
* bits check
|
||||
*/
|
||||
|
||||
L(RANGEMASK_CHECK):
|
||||
btl %r12d, %r13d
|
||||
|
||||
/* Call scalar math function */
|
||||
jc L(SCALAR_MATH_CALL)
|
||||
# LOE rbx r15 r12d r13d
|
||||
|
||||
/* Special inputs
|
||||
* processing loop
|
||||
*/
|
||||
|
||||
L(SPECIAL_VALUES_LOOP):
|
||||
incl %r12d
|
||||
cmpl $8, %r12d
|
||||
|
||||
/* Check bits in range mask */
|
||||
jl L(RANGEMASK_CHECK)
|
||||
# LOE rbx r15 r12d r13d
|
||||
|
||||
movq 16(%rsp), %r12
|
||||
cfi_restore(12)
|
||||
movq 8(%rsp), %r13
|
||||
cfi_restore(13)
|
||||
movq (%rsp), %r14
|
||||
cfi_restore(14)
|
||||
vmovups 96(%rsp), %ymm2
|
||||
|
||||
/* Go to exit */
|
||||
jmp L(EXIT)
|
||||
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -112; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xff, 0xff, 0xff, 0x22
|
||||
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
|
||||
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus) */
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE rbx r12 r13 r14 r15 ymm2
|
||||
|
||||
/* Scalar math fucntion call
|
||||
* to process special input
|
||||
*/
|
||||
|
||||
L(SCALAR_MATH_CALL):
|
||||
movl %r12d, %r14d
|
||||
movss 32(%rsp,%r14,4), %xmm0
|
||||
movss 64(%rsp,%r14,4), %xmm1
|
||||
call hypotf@PLT
|
||||
# LOE rbx r14 r15 r12d r13d xmm0
|
||||
|
||||
movss %xmm0, 96(%rsp,%r14,4)
|
||||
|
||||
/* Process special inputs in loop */
|
||||
jmp L(SPECIAL_VALUES_LOOP)
|
||||
# LOE rbx r15 r12d r13d
|
||||
END(_ZGVdN8vv_hypotf_avx2)
|
||||
|
||||
.section .rodata, "a"
|
||||
.align 32
|
||||
|
||||
#ifdef __svml_shypot_data_internal_typedef
|
||||
typedef unsigned int VUINT32;
|
||||
typedef struct
|
||||
{
|
||||
__declspec(align(32)) VUINT32 _sHiLoMask[8][1];
|
||||
__declspec(align(32)) VUINT32 _sAbsMask[8][1];
|
||||
__declspec(align(32)) VUINT32 _sHalf[8][1];
|
||||
__declspec(align(32)) VUINT32 _LowBoundary[8][1];
|
||||
__declspec(align(32)) VUINT32 _HighBoundary[8][1];
|
||||
} __svml_shypot_data_internal;
|
||||
#endif
|
||||
__svml_shypot_data_internal:
|
||||
/* legacy algorithm */
|
||||
.long 0xFFF80000, 0xFFF80000, 0xFFF80000, 0xFFF80000, 0xFFF80000, 0xFFF80000, 0xFFF80000, 0xFFF80000 /* _sHiLoMask */
|
||||
.align 32
|
||||
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
|
||||
.align 32
|
||||
.long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sHalf */
|
||||
.align 32
|
||||
.long 0x1E300000, 0x1E300000, 0x1E300000, 0x1E300000, 0x1E300000, 0x1E300000, 0x1E300000, 0x1E300000 /* _LowBoundary */
|
||||
.align 32
|
||||
.long 0x60A00000, 0x60A00000, 0x60A00000, 0x60A00000, 0x60A00000, 0x60A00000, 0x60A00000, 0x60A00000 /* _HighBoundary */
|
||||
.align 32
|
||||
.type __svml_shypot_data_internal,@object
|
||||
.size __svml_shypot_data_internal,.-__svml_shypot_data_internal
|
29
sysdeps/x86_64/fpu/svml_d_hypot2_core.S
Normal file
29
sysdeps/x86_64/fpu/svml_d_hypot2_core.S
Normal file
@ -0,0 +1,29 @@
|
||||
/* Function hypot vectorized with SSE2.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
#include "svml_d_wrapper_impl.h"
|
||||
|
||||
.text
|
||||
ENTRY (_ZGVbN2vv_hypot)
|
||||
WRAPPER_IMPL_SSE2_ff hypot
|
||||
END (_ZGVbN2vv_hypot)
|
||||
|
||||
#ifndef USE_MULTIARCH
|
||||
libmvec_hidden_def (_ZGVbN2vv_hypot)
|
||||
#endif
|
29
sysdeps/x86_64/fpu/svml_d_hypot4_core.S
Normal file
29
sysdeps/x86_64/fpu/svml_d_hypot4_core.S
Normal file
@ -0,0 +1,29 @@
|
||||
/* Function hypot vectorized with AVX2, wrapper version.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
#include "svml_d_wrapper_impl.h"
|
||||
|
||||
.text
|
||||
ENTRY (_ZGVdN4vv_hypot)
|
||||
WRAPPER_IMPL_AVX_ff _ZGVbN2vv_hypot
|
||||
END (_ZGVdN4vv_hypot)
|
||||
|
||||
#ifndef USE_MULTIARCH
|
||||
libmvec_hidden_def (_ZGVdN4vv_hypot)
|
||||
#endif
|
25
sysdeps/x86_64/fpu/svml_d_hypot4_core_avx.S
Normal file
25
sysdeps/x86_64/fpu/svml_d_hypot4_core_avx.S
Normal file
@ -0,0 +1,25 @@
|
||||
/* Function hypot vectorized in AVX ISA as wrapper to SSE4 ISA version.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
#include "svml_d_wrapper_impl.h"
|
||||
|
||||
.text
|
||||
ENTRY (_ZGVcN4vv_hypot)
|
||||
WRAPPER_IMPL_AVX_ff _ZGVbN2vv_hypot
|
||||
END (_ZGVcN4vv_hypot)
|
25
sysdeps/x86_64/fpu/svml_d_hypot8_core.S
Normal file
25
sysdeps/x86_64/fpu/svml_d_hypot8_core.S
Normal file
@ -0,0 +1,25 @@
|
||||
/* Function hypot vectorized with AVX-512. Wrapper to AVX2 version.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
#include "svml_d_wrapper_impl.h"
|
||||
|
||||
.text
|
||||
ENTRY (_ZGVeN8vv_hypot)
|
||||
WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_hypot
|
||||
END (_ZGVeN8vv_hypot)
|
25
sysdeps/x86_64/fpu/svml_s_hypotf16_core.S
Normal file
25
sysdeps/x86_64/fpu/svml_s_hypotf16_core.S
Normal file
@ -0,0 +1,25 @@
|
||||
/* Function hypotf vectorized with AVX-512. Wrapper to AVX2 version.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
#include "svml_s_wrapper_impl.h"
|
||||
|
||||
.text
|
||||
ENTRY (_ZGVeN16vv_hypotf)
|
||||
WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_hypotf
|
||||
END (_ZGVeN16vv_hypotf)
|
29
sysdeps/x86_64/fpu/svml_s_hypotf4_core.S
Normal file
29
sysdeps/x86_64/fpu/svml_s_hypotf4_core.S
Normal file
@ -0,0 +1,29 @@
|
||||
/* Function hypotf vectorized with SSE2.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
#include "svml_s_wrapper_impl.h"
|
||||
|
||||
.text
|
||||
ENTRY (_ZGVbN4vv_hypotf)
|
||||
WRAPPER_IMPL_SSE2_ff hypotf
|
||||
END (_ZGVbN4vv_hypotf)
|
||||
|
||||
#ifndef USE_MULTIARCH
|
||||
libmvec_hidden_def (_ZGVbN4vv_hypotf)
|
||||
#endif
|
29
sysdeps/x86_64/fpu/svml_s_hypotf8_core.S
Normal file
29
sysdeps/x86_64/fpu/svml_s_hypotf8_core.S
Normal file
@ -0,0 +1,29 @@
|
||||
/* Function hypotf vectorized with AVX2, wrapper version.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
#include "svml_s_wrapper_impl.h"
|
||||
|
||||
.text
|
||||
ENTRY (_ZGVdN8vv_hypotf)
|
||||
WRAPPER_IMPL_AVX_ff _ZGVbN4vv_hypotf
|
||||
END (_ZGVdN8vv_hypotf)
|
||||
|
||||
#ifndef USE_MULTIARCH
|
||||
libmvec_hidden_def (_ZGVdN8vv_hypotf)
|
||||
#endif
|
25
sysdeps/x86_64/fpu/svml_s_hypotf8_core_avx.S
Normal file
25
sysdeps/x86_64/fpu/svml_s_hypotf8_core_avx.S
Normal file
@ -0,0 +1,25 @@
|
||||
/* Function hypotf vectorized in AVX ISA as wrapper to SSE4 ISA version.
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
#include "svml_s_wrapper_impl.h"
|
||||
|
||||
.text
|
||||
ENTRY(_ZGVcN8vv_hypotf)
|
||||
WRAPPER_IMPL_AVX_ff _ZGVbN4vv_hypotf
|
||||
END(_ZGVcN8vv_hypotf)
|
1
sysdeps/x86_64/fpu/test-double-libmvec-hypot-avx.c
Normal file
1
sysdeps/x86_64/fpu/test-double-libmvec-hypot-avx.c
Normal file
@ -0,0 +1 @@
|
||||
#include "test-double-libmvec-hypot.c"
|
1
sysdeps/x86_64/fpu/test-double-libmvec-hypot-avx2.c
Normal file
1
sysdeps/x86_64/fpu/test-double-libmvec-hypot-avx2.c
Normal file
@ -0,0 +1 @@
|
||||
#include "test-double-libmvec-hypot.c"
|
1
sysdeps/x86_64/fpu/test-double-libmvec-hypot-avx512f.c
Normal file
1
sysdeps/x86_64/fpu/test-double-libmvec-hypot-avx512f.c
Normal file
@ -0,0 +1 @@
|
||||
#include "test-double-libmvec-hypot.c"
|
3
sysdeps/x86_64/fpu/test-double-libmvec-hypot.c
Normal file
3
sysdeps/x86_64/fpu/test-double-libmvec-hypot.c
Normal file
@ -0,0 +1,3 @@
|
||||
#define LIBMVEC_TYPE double
|
||||
#define LIBMVEC_FUNC hypot
|
||||
#include "test-vector-abi-arg2.h"
|
@ -30,6 +30,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVbN2v_acos)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVbN2v_atan)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (asin), _ZGVbN2v_asin)
|
||||
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypot), _ZGVbN2vv_hypot)
|
||||
|
||||
#define VEC_INT_TYPE __m128i
|
||||
|
||||
|
@ -33,6 +33,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVdN4v_acos)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVdN4v_atan)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (asin), _ZGVdN4v_asin)
|
||||
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypot), _ZGVdN4vv_hypot)
|
||||
|
||||
#ifndef __ILP32__
|
||||
# define VEC_INT_TYPE __m256i
|
||||
|
@ -30,6 +30,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVcN4v_acos)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVcN4v_atan)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (asin), _ZGVcN4v_asin)
|
||||
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypot), _ZGVcN4vv_hypot)
|
||||
|
||||
#define VEC_INT_TYPE __m128i
|
||||
|
||||
|
@ -30,6 +30,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVeN8v_acos)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVeN8v_atan)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (asin), _ZGVeN8v_asin)
|
||||
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypot), _ZGVeN8vv_hypot)
|
||||
|
||||
#ifndef __ILP32__
|
||||
# define VEC_INT_TYPE __m512i
|
||||
|
1
sysdeps/x86_64/fpu/test-float-libmvec-hypotf-avx.c
Normal file
1
sysdeps/x86_64/fpu/test-float-libmvec-hypotf-avx.c
Normal file
@ -0,0 +1 @@
|
||||
#include "test-float-libmvec-hypotf.c"
|
1
sysdeps/x86_64/fpu/test-float-libmvec-hypotf-avx2.c
Normal file
1
sysdeps/x86_64/fpu/test-float-libmvec-hypotf-avx2.c
Normal file
@ -0,0 +1 @@
|
||||
#include "test-float-libmvec-hypotf.c"
|
1
sysdeps/x86_64/fpu/test-float-libmvec-hypotf-avx512f.c
Normal file
1
sysdeps/x86_64/fpu/test-float-libmvec-hypotf-avx512f.c
Normal file
@ -0,0 +1 @@
|
||||
#include "test-float-libmvec-hypotf.c"
|
3
sysdeps/x86_64/fpu/test-float-libmvec-hypotf.c
Normal file
3
sysdeps/x86_64/fpu/test-float-libmvec-hypotf.c
Normal file
@ -0,0 +1,3 @@
|
||||
#define LIBMVEC_TYPE float
|
||||
#define LIBMVEC_FUNC hypotf
|
||||
#include "test-vector-abi-arg2.h"
|
@ -30,6 +30,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVeN16v_acosf)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVeN16v_atanf)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (asinf), _ZGVeN16v_asinf)
|
||||
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypotf), _ZGVeN16vv_hypotf)
|
||||
|
||||
#define VEC_INT_TYPE __m512i
|
||||
|
||||
|
@ -30,6 +30,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVbN4v_acosf)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVbN4v_atanf)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (asinf), _ZGVbN4v_asinf)
|
||||
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypotf), _ZGVbN4vv_hypotf)
|
||||
|
||||
#define VEC_INT_TYPE __m128i
|
||||
|
||||
|
@ -33,6 +33,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVdN8v_acosf)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVdN8v_atanf)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (asinf), _ZGVdN8v_asinf)
|
||||
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypotf), _ZGVdN8vv_hypotf)
|
||||
|
||||
/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */
|
||||
#undef VECTOR_WRAPPER_fFF
|
||||
|
@ -30,6 +30,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVcN8v_acosf)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVcN8v_atanf)
|
||||
VECTOR_WRAPPER (WRAPPER_NAME (asinf), _ZGVcN8v_asinf)
|
||||
VECTOR_WRAPPER_ff (WRAPPER_NAME (hypotf), _ZGVcN8vv_hypotf)
|
||||
|
||||
#define VEC_INT_TYPE __m128i
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user