mirror of
https://sourceware.org/git/glibc.git
synced 2024-11-15 09:30:06 +00:00
powerpc: Add a POWER8-optimized version of expf()
This implementation is based on the one already used at sysdeps/x86_64/fpu/e_expf.S. This implementation improves the performance by ~14% on average in synthetic benchmarks at the cost of decreasing accuracy to 1 ULP.
This commit is contained in:
parent
9765ffa710
commit
35da2541c3
11
ChangeLog
11
ChangeLog
@ -1,3 +1,14 @@
|
||||
2016-06-30 Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
|
||||
|
||||
* sysdeps/powerpc/fpu/libm-test-ulps: Update.
|
||||
* sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
|
||||
[$(subdir) = math] (libm-sysdep_routines): Add e_expf-power8 and
|
||||
e_expf-ppc64.
|
||||
* sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf-power8.S: New file.
|
||||
* sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf-ppc64.c: Likewise.
|
||||
* sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf.c: Likewise.
|
||||
* sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S: Likewise.
|
||||
|
||||
2016-06-30 John David Anglin <danglin@gcc.gnu.org>
|
||||
|
||||
[BZ #20277]
|
||||
|
@ -1588,8 +1588,10 @@ ildouble: 2
|
||||
ldouble: 2
|
||||
|
||||
Function: "exp_upward":
|
||||
float: 1
|
||||
double: 1
|
||||
idouble: 1
|
||||
ifloat: 1
|
||||
ildouble: 1
|
||||
ldouble: 1
|
||||
|
||||
|
@ -24,7 +24,8 @@ libm-sysdep_routines += s_isnan-power7 s_isnan-power6x s_isnan-power6 \
|
||||
s_modff-power5+ s_modff-ppc64 e_hypot-ppc64 \
|
||||
e_hypot-power7 e_hypotf-ppc64 e_hypotf-power7 \
|
||||
s_isnan-power8 s_isinf-power8 s_finite-power8 \
|
||||
s_llrint-power8 s_llround-power8
|
||||
s_llrint-power8 s_llround-power8 \
|
||||
e_expf-power8 e_expf-ppc64
|
||||
|
||||
CFLAGS-s_logbf-power7.c = -mcpu=power7
|
||||
CFLAGS-s_logbl-power7.c = -mcpu=power7
|
||||
|
26
sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf-power8.S
Normal file
26
sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf-power8.S
Normal file
@ -0,0 +1,26 @@
|
||||
/* __ieee754_expf() POWER8 version.
|
||||
Copyright (C) 2016 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
#undef strong_alias
|
||||
#define strong_alias(a, b)
|
||||
|
||||
#define __ieee754_expf __ieee754_expf_power8
|
||||
|
||||
#include <sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S>
|
24
sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf-ppc64.c
Normal file
24
sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf-ppc64.c
Normal file
@ -0,0 +1,24 @@
|
||||
/* __ieee_expf() PowerPC64 version.
|
||||
Copyright (C) 2016 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#undef strong_alias
|
||||
#define strong_alias(a, b)
|
||||
|
||||
#define __ieee754_expf __ieee754_expf_ppc64
|
||||
|
||||
#include <sysdeps/ieee754/flt-32/e_expf.c>
|
31
sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf.c
Normal file
31
sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf.c
Normal file
@ -0,0 +1,31 @@
|
||||
/* Multiple versions of ieee754_expf.
|
||||
Copyright (C) 2016 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <math.h>
|
||||
#include <math_ldbl_opt.h>
|
||||
#include "init-arch.h"
|
||||
|
||||
extern __typeof (__ieee754_expf) __ieee754_expf_ppc64 attribute_hidden;
|
||||
extern __typeof (__ieee754_expf) __ieee754_expf_power8 attribute_hidden;
|
||||
|
||||
libc_ifunc (__ieee754_expf,
|
||||
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
|
||||
? __ieee754_expf_power8
|
||||
: __ieee754_expf_ppc64);
|
||||
|
||||
strong_alias (__ieee754_expf, __expf_finite)
|
303
sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S
Normal file
303
sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S
Normal file
@ -0,0 +1,303 @@
|
||||
/* Optimized expf(). PowerPC64/POWER8 version.
|
||||
Copyright (C) 2016 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
/* Short algorithm description:
|
||||
*
|
||||
* Let K = 64 (table size).
|
||||
* e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
|
||||
* where:
|
||||
* x = m*log(2)/K + y, y in [0.0..log(2)/K]
|
||||
* m = n*K + j, m,n,j - signed integer, j in [0..K-1]
|
||||
* values of 2^(j/K) are tabulated as T[j].
|
||||
*
|
||||
* P(y) is a minimax polynomial approximation of expf(y)-1
|
||||
* on small interval [0.0..log(2)/K].
|
||||
*
|
||||
* P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
|
||||
* z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y
|
||||
*
|
||||
* Special cases:
|
||||
* expf(NaN) = NaN
|
||||
* expf(+INF) = +INF
|
||||
* expf(-INF) = 0
|
||||
* expf(x) = 1 for subnormals
|
||||
* for finite argument, only expf(0)=1 is exact
|
||||
* expf(x) overflows if x>88.7228317260742190
|
||||
* expf(x) underflows if x<-103.972076416015620
|
||||
*/
|
||||
|
||||
#define C1 0x42ad496b /* Single precision 125*log(2). */
|
||||
#define C2 0x31800000 /* Single precision 2^(-28). */
|
||||
#define SP_INF 0x7f800000 /* Single precision Inf. */
|
||||
#define SP_EXP_BIAS 0x1fc0 /* Single precision exponent bias. */
|
||||
|
||||
#define DATA_OFFSET r9
|
||||
|
||||
/* Implements the function
|
||||
|
||||
float [fp1] expf (float [fp1] x) */
|
||||
|
||||
.machine power8
|
||||
EALIGN(__ieee754_expf, 4, 0)
|
||||
addis DATA_OFFSET,r2,.Lanchor@toc@ha
|
||||
addi DATA_OFFSET,DATA_OFFSET,.Lanchor@toc@l
|
||||
|
||||
xscvdpspn v0,v1
|
||||
mfvsrd r8,v0 /* r8 = x */
|
||||
lfd fp2,(.KLN2-.Lanchor)(DATA_OFFSET)
|
||||
lfd fp3,(.P2-.Lanchor)(DATA_OFFSET)
|
||||
rldicl r3,r8,32,33 /* r3 = |x| */
|
||||
lis r4,C1@ha /* r4 = 125*log(2) */
|
||||
ori r4,r4,C1@l
|
||||
cmpw r3,r4
|
||||
lfd fp5,(.P3-.Lanchor)(DATA_OFFSET)
|
||||
lfd fp4,(.RS-.Lanchor)(DATA_OFFSET)
|
||||
fmadd fp2,fp1,fp2,fp4 /* fp2 = x * K/log(2) + (2^23 + 2^22) */
|
||||
bge L(special_paths) /* |x| >= 125*log(2) ? */
|
||||
|
||||
lis r4,C2@ha
|
||||
ori r4,r4,C2@l
|
||||
cmpw r3,r4
|
||||
blt L(small_args) /* |x| < 2^(-28) ? */
|
||||
|
||||
/* Main path: here if 2^(-28) <= |x| < 125*log(2) */
|
||||
frsp fp6,fp2
|
||||
xscvdpsp v2,v2
|
||||
mfvsrd r8,v2
|
||||
mr r3,r8 /* r3 = m */
|
||||
rldicl r8,r8,32,58 /* r8 = j */
|
||||
lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET)
|
||||
fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2) */
|
||||
srdi r3,r3,32
|
||||
clrrwi r3,r3,6 /* r3 = n */
|
||||
lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET)
|
||||
fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */
|
||||
fmul fp2,fp0,fp0 /* fp2 = z = y^2 */
|
||||
lfd fp4,(.P1-.Lanchor)(DATA_OFFSET)
|
||||
lfd fp6,(.P0-.Lanchor)(DATA_OFFSET)
|
||||
lis r4,SP_EXP_BIAS@ha
|
||||
ori r4,r4,SP_EXP_BIAS@l
|
||||
add r3,r3,r4
|
||||
rldic r3,r3,49,1 /* r3 = 2^n */
|
||||
fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */
|
||||
fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */
|
||||
mtvsrd v1,r3
|
||||
xscvspdp v1,v1
|
||||
fmul fp4,fp4,fp2 /* fp4 = (P3 * z + P1)*z */
|
||||
fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */
|
||||
sldi r8,r8,3 /* Access doublewords from T[j]. */
|
||||
addi r6,DATA_OFFSET,(.Ttable-.Lanchor)
|
||||
lfdx fp3,r6,r8
|
||||
fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + P(y)) */
|
||||
fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + P(y)) */
|
||||
frsp fp1,fp1
|
||||
blr
|
||||
|
||||
.align 4
|
||||
/* x is either underflow, overflow, infinite or NaN. */
|
||||
L(special_paths):
|
||||
srdi r8,r8,32
|
||||
rlwinm r8,r8,3,29,29 /* r8 = 0, if x positive.
|
||||
r8 = 4, otherwise. */
|
||||
addi r6,DATA_OFFSET,(.SPRANGE-.Lanchor)
|
||||
lwzx r4,r6,r8 /* r4 = .SPRANGE[signbit(x)] */
|
||||
cmpw r3,r4
|
||||
/* |x| <= .SPRANGE[signbit(x)] */
|
||||
ble L(near_under_or_overflow)
|
||||
|
||||
lis r4,SP_INF@ha
|
||||
ori r4,r4,SP_INF@l
|
||||
cmpw r3,r4
|
||||
bge L(arg_inf_or_nan) /* |x| > Infinite ? */
|
||||
|
||||
addi r6,DATA_OFFSET,(.SPLARGE_SMALL-.Lanchor)
|
||||
lfsx fp1,r6,r8
|
||||
fmuls fp1,fp1,fp1
|
||||
blr
|
||||
|
||||
|
||||
.align 4
|
||||
L(small_args):
|
||||
/* expf(x) = 1.0, where |x| < |2^(-28)| */
|
||||
lfs fp2,(.SPone-.Lanchor)(DATA_OFFSET)
|
||||
fadds fp1,fp1,fp2
|
||||
blr
|
||||
|
||||
|
||||
.align 4
|
||||
L(arg_inf_or_nan:)
|
||||
bne L(arg_nan)
|
||||
|
||||
/* expf(+INF) = +INF
|
||||
expf(-INF) = 0 */
|
||||
addi r6,DATA_OFFSET,(.INF_ZERO-.Lanchor)
|
||||
lfsx fp1,r6,r8
|
||||
blr
|
||||
|
||||
|
||||
.align 4
|
||||
L(arg_nan):
|
||||
/* expf(NaN) = NaN */
|
||||
fadd fp1,fp1,fp1
|
||||
frsp fp1,fp1
|
||||
blr
|
||||
|
||||
.align 4
|
||||
L(near_under_or_overflow):
|
||||
frsp fp6,fp2
|
||||
xscvdpsp v2,v2
|
||||
mfvsrd r8,v2
|
||||
mr r3,r8 /* r3 = m */
|
||||
rldicl r8,r8,32,58 /* r8 = j */
|
||||
lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET)
|
||||
fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2) */
|
||||
srdi r3,r3,32
|
||||
clrrwi r3,r3,6 /* r3 = n */
|
||||
lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET)
|
||||
fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */
|
||||
fmul fp2,fp0,fp0 /* fp2 = z = y^2 */
|
||||
lfd fp4,(.P1-.Lanchor)(DATA_OFFSET)
|
||||
lfd fp6,(.P0-.Lanchor)(DATA_OFFSET)
|
||||
ld r4,(.DP_EXP_BIAS-.Lanchor)(DATA_OFFSET)
|
||||
add r3,r3,r4
|
||||
rldic r3,r3,46,1 /* r3 = 2 */
|
||||
fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */
|
||||
fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */
|
||||
mtvsrd v1,r3
|
||||
fmul fp4,fp4,fp2 /* fp4 = (P3*z + P1)*z */
|
||||
fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */
|
||||
sldi r8,r8,3 /* Access doublewords from T[j]. */
|
||||
addi r6,DATA_OFFSET,(.Ttable-.Lanchor)
|
||||
lfdx fp3,r6,r8
|
||||
fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + T[j]) */
|
||||
fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + T[j]) */
|
||||
frsp fp1,fp1
|
||||
blr
|
||||
END(__ieee754_expf)
|
||||
|
||||
.section .rodata, "a",@progbits
|
||||
.Lanchor:
|
||||
.balign 8
|
||||
/* Table T[j] = 2^(j/K). Double precision. */
|
||||
.Ttable:
|
||||
.8byte 0x3ff0000000000000
|
||||
.8byte 0x3ff02c9a3e778061
|
||||
.8byte 0x3ff059b0d3158574
|
||||
.8byte 0x3ff0874518759bc8
|
||||
.8byte 0x3ff0b5586cf9890f
|
||||
.8byte 0x3ff0e3ec32d3d1a2
|
||||
.8byte 0x3ff11301d0125b51
|
||||
.8byte 0x3ff1429aaea92de0
|
||||
.8byte 0x3ff172b83c7d517b
|
||||
.8byte 0x3ff1a35beb6fcb75
|
||||
.8byte 0x3ff1d4873168b9aa
|
||||
.8byte 0x3ff2063b88628cd6
|
||||
.8byte 0x3ff2387a6e756238
|
||||
.8byte 0x3ff26b4565e27cdd
|
||||
.8byte 0x3ff29e9df51fdee1
|
||||
.8byte 0x3ff2d285a6e4030b
|
||||
.8byte 0x3ff306fe0a31b715
|
||||
.8byte 0x3ff33c08b26416ff
|
||||
.8byte 0x3ff371a7373aa9cb
|
||||
.8byte 0x3ff3a7db34e59ff7
|
||||
.8byte 0x3ff3dea64c123422
|
||||
.8byte 0x3ff4160a21f72e2a
|
||||
.8byte 0x3ff44e086061892d
|
||||
.8byte 0x3ff486a2b5c13cd0
|
||||
.8byte 0x3ff4bfdad5362a27
|
||||
.8byte 0x3ff4f9b2769d2ca7
|
||||
.8byte 0x3ff5342b569d4f82
|
||||
.8byte 0x3ff56f4736b527da
|
||||
.8byte 0x3ff5ab07dd485429
|
||||
.8byte 0x3ff5e76f15ad2148
|
||||
.8byte 0x3ff6247eb03a5585
|
||||
.8byte 0x3ff6623882552225
|
||||
.8byte 0x3ff6a09e667f3bcd
|
||||
.8byte 0x3ff6dfb23c651a2f
|
||||
.8byte 0x3ff71f75e8ec5f74
|
||||
.8byte 0x3ff75feb564267c9
|
||||
.8byte 0x3ff7a11473eb0187
|
||||
.8byte 0x3ff7e2f336cf4e62
|
||||
.8byte 0x3ff82589994cce13
|
||||
.8byte 0x3ff868d99b4492ed
|
||||
.8byte 0x3ff8ace5422aa0db
|
||||
.8byte 0x3ff8f1ae99157736
|
||||
.8byte 0x3ff93737b0cdc5e5
|
||||
.8byte 0x3ff97d829fde4e50
|
||||
.8byte 0x3ff9c49182a3f090
|
||||
.8byte 0x3ffa0c667b5de565
|
||||
.8byte 0x3ffa5503b23e255d
|
||||
.8byte 0x3ffa9e6b5579fdbf
|
||||
.8byte 0x3ffae89f995ad3ad
|
||||
.8byte 0x3ffb33a2b84f15fb
|
||||
.8byte 0x3ffb7f76f2fb5e47
|
||||
.8byte 0x3ffbcc1e904bc1d2
|
||||
.8byte 0x3ffc199bdd85529c
|
||||
.8byte 0x3ffc67f12e57d14b
|
||||
.8byte 0x3ffcb720dcef9069
|
||||
.8byte 0x3ffd072d4a07897c
|
||||
.8byte 0x3ffd5818dcfba487
|
||||
.8byte 0x3ffda9e603db3285
|
||||
.8byte 0x3ffdfc97337b9b5f
|
||||
.8byte 0x3ffe502ee78b3ff6
|
||||
.8byte 0x3ffea4afa2a490da
|
||||
.8byte 0x3ffefa1bee615a27
|
||||
.8byte 0x3fff50765b6e4540
|
||||
.8byte 0x3fffa7c1819e90d8
|
||||
|
||||
.KLN2:
|
||||
.8byte 0x40571547652b82fe /* Double precision K/log(2). */
|
||||
|
||||
/* Double precision polynomial coefficients. */
|
||||
.P0:
|
||||
.8byte 0x3fefffffffffe7c6
|
||||
.P1:
|
||||
.8byte 0x3fe00000008d6118
|
||||
.P2:
|
||||
.8byte 0x3fc55550da752d4f
|
||||
.P3:
|
||||
.8byte 0x3fa56420eb78fa85
|
||||
|
||||
.RS:
|
||||
.8byte 0x4168000000000000 /* Double precision 2^23 + 2^22. */
|
||||
.NLN2K:
|
||||
.8byte 0xbf862e42fefa39ef /* Double precision -log(2)/K. */
|
||||
.DP_EXP_BIAS:
|
||||
.8byte 0x000000000000ffc0 /* Double precision exponent bias. */
|
||||
|
||||
.balign 4
|
||||
.SPone:
|
||||
.4byte 0x3f800000 /* Single precision 1.0. */
|
||||
.SP_RS:
|
||||
.4byte 0x4b400000 /* Single precision 2^23 + 2^22. */
|
||||
|
||||
.SPRANGE: /* Single precision overflow/underflow bounds. */
|
||||
.4byte 0x42b17217 /* if x>this bound, then result overflows. */
|
||||
.4byte 0x42cff1b4 /* if x<this bound, then result underflows. */
|
||||
|
||||
.SPLARGE_SMALL:
|
||||
.4byte 0x71800000 /* 2^100. */
|
||||
.4byte 0x0d800000 /* 2^-100. */
|
||||
|
||||
.INF_ZERO:
|
||||
.4byte 0x7f800000 /* Single precision Inf. */
|
||||
.4byte 0 /* Single precision zero. */
|
||||
|
||||
strong_alias (__ieee754_expf, __expf_finite)
|
Loading…
Reference in New Issue
Block a user