i386: Replace assembly versions of e_powf with generic e_powf.c

This patch replaces i386 assembly versions of e_powf with generic
e_powf.c.  For workload-spec2017.wrf, on Nehalem, it improves
performance by:

                           Before            After     Improvement
reciprocal-throughput      230.855          78.3358       194%
latency                    231.685          94.1259       146%

On Skylake, it improves performance by:

                           Before            After     Improvement
reciprocal-throughput      239.858          47.4713       405%
latency                    247.57           93.8798       163%

On IvyBridge with --disable-multi-arch, it improves performance by:

                           Before            After     Improvement
reciprocal-throughput      269.078          63.3758       324%
latency                    271.473          102.091       165%

	* sysdeps/i386/fpu/e_powf.S: Removed.
	* sysdeps/i386/fpu/e_powf_log2_data.c: Likewise.
	* sysdeps/i386/fpu/w_powf.c: Likewise.
	* sysdeps/i386/fpu/libm-test-ulps: Updated for generic e_powf.c.
	* sysdeps/i386/i686/fpu/multiarch/libm-test-ulps: Likewise.
	* sysdeps/i386/i686/fpu/multiarch/Makefile (libm-sysdep_routines):
	Add e_powf-sse2.
	(CFLAGS-e_powf-sse2.c): New.
	* sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c: New file.
	* sysdeps/i386/i686/fpu/multiarch/e_powf.c: Likewise.
This commit is contained in:
H.J. Lu 2017-10-22 08:11:15 -07:00
parent 6089a3ee24
commit 5313581cb5
9 changed files with 79 additions and 401 deletions

View File

@ -1,3 +1,16 @@
2017-10-22 H.J. Lu <hongjiu.lu@intel.com>
* sysdeps/i386/fpu/e_powf.S: Removed.
* sysdeps/i386/fpu/e_powf_log2_data.c: Likewise.
* sysdeps/i386/fpu/w_powf.c: Likewise.
* sysdeps/i386/fpu/libm-test-ulps: Updated for generic e_powf.c.
* sysdeps/i386/i686/fpu/multiarch/libm-test-ulps: Likewise.
* sysdeps/i386/i686/fpu/multiarch/Makefile (libm-sysdep_routines):
Add e_powf-sse2.
(CFLAGS-e_powf-sse2.c): New.
* sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c: New file.
* sysdeps/i386/i686/fpu/multiarch/e_powf.c: Likewise.
2017-10-22 H.J. Lu <hongjiu.lu@intel.com>
* sysdeps/i386/fpu/e_log2f.S: Removed.

View File

@ -1,392 +0,0 @@
/* ix87 specific implementation of powf function.
Copyright (C) 1996-2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <machine/asm.h>
#include <i386-math-asm.h>
/* Read-only double-precision constants used by __ieee754_powf.  */
.section .rodata.cst8,"aM",@progbits,8
.p2align 3
.type one,@object
one: .double 1.0
ASM_SIZE_DIRECTIVE(one)
/* Threshold on |x-1|: the code uses fyl2xp1 on x-1 when |x-1| is not
   above this value and fyl2x on x otherwise.  */
.type limit,@object
limit: .double 0.29
ASM_SIZE_DIRECTIVE(limit)
/* 2^31 as a little-endian double (0x41e0000000000000); |y| is compared
   against it before fistpl is used to convert y to a 32-bit integer.  */
.type p31,@object
p31: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41
ASM_SIZE_DIRECTIVE(p31)
.section .rodata.cst16,"aM",@progbits,16
.p2align 3
/* Two-entry table { +inf, +0.0 }: the code indexes it from inf_zero
   with a byte offset of 0 or 8.  */
.type infinity,@object
inf_zero:
infinity:
.byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f
ASM_SIZE_DIRECTIVE(infinity)
.type zero,@object
zero: .double 0.0
ASM_SIZE_DIRECTIVE(zero)
/* Two-entry table { -inf, -0.0 }, indexed from minf_mzero the same way.  */
.type minf_mzero,@object
minf_mzero:
minfinity:
.byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff
mzero:
.byte 0, 0, 0, 0, 0, 0, 0, 0x80
ASM_SIZE_DIRECTIVE(minf_mzero)
/* Macro from <i386-math-asm.h> (not shown here); presumably emits the
   FLT_MIN constant needed by FLT_NARROW_EVAL_UFLOW_NONNAN -- confirm.  */
DEFINE_FLT_MIN
/* Memory-operand helpers for the constants above.
   MO(op) names the object op; MOX(op,x,f) names op indexed by register x
   scaled by f.  When PIC is defined both address the object as a @GOTOFF
   offset from %ecx, which the function loads with LOAD_PIC_REG (cx);
   otherwise they use the symbol directly.  */
#ifdef PIC
# define MO(op) op##@GOTOFF(%ecx)
# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
#else
# define MO(op) op
# define MOX(op,x,f) op(,x,f)
#endif
/* __ieee754_powf: x87 computation of x raised to the power y.

   Inputs are single-precision values read from the stack: x at 4(%esp)
   and y at 8(%esp).  Every exit path leaves the result in %st(0).

   Register usage visible in this routine:
     %ecx  PIC base, loaded by LOAD_PIC_REG only when PIC is defined.
     %dl   fxam condition byte of y (bit 1, mask 0x02, is the sign).
     %dh   fxam condition byte of x (same layout).
     %edx  later overwritten with int(y) on the integer-exponent paths.
     %eax  scratch for fnstsw/sahf.

   After `andb $0x45, %ah' the fxam class is tested as
     0x40 = zero, 0x05 = infinity, 0x01 = NaN.

   Once x has been loaded, a 4-byte scratch slot is reserved on the stack
   for the fistpl/fildl round trip; each path releases it with popl or
   addl $4, %esp before returning.  */
.text
ENTRY(__ieee754_powf)
/* Classify y first; %dl keeps the unmasked condition byte.  */
flds 8(%esp) // y
fxam
#ifdef PIC
LOAD_PIC_REG (cx)
#endif
fnstsw
movb %ah, %dl
andb $0x45, %ah
cmpb $0x40, %ah // is y == 0 ?
je 11f
cmpb $0x05, %ah // is y == ±inf ?
je 12f
cmpb $0x01, %ah // is y == NaN ?
je 30f
/* Load x, reserve the scratch slot and classify x; %dh keeps the
   unmasked condition byte.  */
flds 4(%esp) // x : y
subl $4, %esp
cfi_adjust_cfa_offset (4)
fxam
fnstsw
movb %ah, %dh
andb $0x45, %ah
cmpb $0x40, %ah
je 20f // x is ±0
cmpb $0x05, %ah
je 15f // x is ±inf
cmpb $0x01, %ah
je 33f // x is NaN
fxch // y : x
/* fistpl raises invalid exception for |y| >= 1L<<31. */
fld %st // y : y : x
fabs // |y| : y : x
fcompl MO(p31) // y : x
fnstsw
sahf
jnc 2f
/* First see whether `y' is a natural number. In this case we
can use a more precise algorithm. */
fld %st // y : y : x
fistpl (%esp) // y : x
fildl (%esp) // int(y) : y : x
fucomp %st(1) // y : x
fnstsw
sahf
jne 3f
/* OK, we have an integer value for y. */
/* %edx = int(y); the pop also releases the scratch slot.  The orl
   leaves %edx unchanged and only sets the flags for the jns below.  */
popl %edx
cfi_adjust_cfa_offset (-4)
orl $0, %edx
fstp %st(0) // x
jns 4f // y >= 0, jump
fdivrl MO(one) // 1/x (now referred to as x)
negl %edx
4: fldl MO(one) // 1 : x
fxch
/* If y is even, take the absolute value of x. Otherwise,
ensure all intermediate values that might overflow have the
sign of x. */
testb $1, %dl
jnz 6f
fabs
/* Square-and-multiply over the bits of %edx: when the bit shifted out
   is set, the accumulator (ST in the comments) is multiplied by the
   current power of x; the power of x is then squared.  The loop ends
   when %edx becomes zero.  */
6: shrl $1, %edx
jnc 5f
fxch
fabs
fmul %st(1) // x : ST*x
fxch
5: fld %st // x : x : ST*x
fabs // |x| : x : ST*x
fmulp // |x|*x : ST*x
testl %edx, %edx
jnz 6b
fstp %st(0) // ST*x
/* Macro from <i386-math-asm.h> (not shown here); presumably narrows the
   result to float and raises underflow where required -- confirm.  */
FLT_NARROW_EVAL_UFLOW_NONNAN
ret
/* y is ±NAN */
/* Compare x with 1.0: when the compare sets ZF (x == 1.0, or the compare
   was unordered) x is returned, otherwise y is returned.  The scratch
   slot has not been reserved on this path.  */
30: flds 4(%esp) // x : y
fldl MO(one) // 1.0 : x : y
fucomp %st(1) // x : y
fnstsw
sahf
je 31f
fxch // y : x
31: fstp %st(1)
ret
cfi_adjust_cfa_offset (4)
.align ALIGNARG(4)
2: /* y is a large integer (so even). */
fxch // x : y
fabs // |x| : y
fxch // y : x
.align ALIGNARG(4)
3: /* y is a real number. */
/* General path: compute y*log2(x), choosing fyl2xp1 on x-1 when
   |x-1| is not above `limit' and fyl2x on x otherwise, then form
   2^(y*log2(x)) from its integer and fractional parts.  */
fxch // x : y
fldl MO(one) // 1.0 : x : y
fldl MO(limit) // 0.29 : 1.0 : x : y
fld %st(2) // x : 0.29 : 1.0 : x : y
fsub %st(2) // x-1 : 0.29 : 1.0 : x : y
fabs // |x-1| : 0.29 : 1.0 : x : y
fucompp // 1.0 : x : y
fnstsw
fxch // x : 1.0 : y
sahf
ja 7f
fsub %st(1) // x-1 : 1.0 : y
fyl2xp1 // log2(x) : y
jmp 8f
7: fyl2x // log2(x) : y
8: fmul %st(1) // y*log2(x) : y
fst %st(1) // y*log2(x) : y*log2(x)
frndint // int(y*log2(x)) : y*log2(x)
fsubr %st, %st(1) // int(y*log2(x)) : fract(y*log2(x))
fxch // fract(y*log2(x)) : int(y*log2(x))
f2xm1 // 2^fract(y*log2(x))-1 : int(y*log2(x))
faddl MO(one) // 2^fract(y*log2(x)) : int(y*log2(x))
fscale // 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x))
/* NOTE(review): no jump to label 32 is visible in this file; it is
   reached by fall-through from the code above.  */
32: addl $4, %esp
cfi_adjust_cfa_offset (-4)
fstp %st(1) // 2^fract(y*log2(x))*2^int(y*log2(x))
FLT_NARROW_EVAL_UFLOW_NONNAN
ret
/* x is NaN. */
/* Stack is x : y here; release the scratch slot and return x.  */
cfi_adjust_cfa_offset (4)
33: addl $4, %esp
cfi_adjust_cfa_offset (-4)
fstp %st(1)
ret
// pow(x,±0) = 1
.align ALIGNARG(4)
11: fstp %st(0) // pop y
fldl MO(one)
ret
// y == ±inf
/* Compare |x| with 1.  A NaN x is returned unchanged and |x| == 1 gives
   1.  Otherwise bit 1 of %edx becomes (|x| < 1) XOR (y negative);
   scaled by 4 it selects +inf (offset 0) or 0.0 (offset 8) from
   inf_zero.  */
.align ALIGNARG(4)
12: fstp %st(0) // pop y
fldl MO(one) // 1
flds 4(%esp) // x : 1
fabs // abs(x) : 1
fucompp // < 1, == 1, or > 1
fnstsw
andb $0x45, %ah
cmpb $0x45, %ah
je 13f // jump if x is NaN
cmpb $0x40, %ah
je 14f // jump if |x| == 1
shlb $1, %ah
xorb %ah, %dl
andl $2, %edx
fldl MOX(inf_zero, %edx, 4)
ret
.align ALIGNARG(4)
14: fldl MO(one)
ret
.align ALIGNARG(4)
13: flds 4(%esp) // load x == NaN
ret
cfi_adjust_cfa_offset (4)
.align ALIGNARG(4)
// x is ±inf
/* For x == +inf, or when |y| >= 2^31, the result depends only on the
   sign of y (label 16).  For x == -inf the code checks whether y is an
   odd integer to decide the sign of the infinity or zero returned.  */
15: fstp %st(0) // y
testb $2, %dh
jz 16f // jump if x == +inf
// fistpl raises invalid exception for |y| >= 1L<<31, so test
// that (in which case y is certainly even) before testing
// whether y is odd.
fld %st // y : y
fabs // |y| : y
fcompl MO(p31) // y
fnstsw
sahf
jnc 16f
// We must find out whether y is an odd integer.
fld %st // y : y
fistpl (%esp) // y
fildl (%esp) // int(y) : y
fucompp // <empty>
fnstsw
sahf
jne 17f
// OK, the value is an integer.
popl %edx
cfi_adjust_cfa_offset (-4)
testb $1, %dl
jz 18f // jump if not odd
// It's an odd integer.
/* The sign bit of int(y) selects -inf (y > 0) or -0.0 (y < 0).  */
shrl $31, %edx
fldl MOX(minf_mzero, %edx, 8)
ret
cfi_adjust_cfa_offset (4)
.align ALIGNARG(4)
/* Compare y with 0.0; the C0 bit of the status word (set when y < 0) is
   moved to bit 3 of %eax, giving byte offset 8 (0.0) for negative y and
   offset 0 (+inf) otherwise.  */
16: fcompl MO(zero)
addl $4, %esp
cfi_adjust_cfa_offset (-4)
fnstsw
shrl $5, %eax
andl $8, %eax
fldl MOX(inf_zero, %eax, 1)
ret
cfi_adjust_cfa_offset (4)
.align ALIGNARG(4)
/* Label 17: y is not an integer, so use the fxam sign bit of y saved in
   %dl.  Label 18: y is an even integer, so use the sign bit of int(y).
   Either way the sign selects +inf (non-negative y) or 0.0.  */
17: shll $30, %edx // sign bit for y in right position
addl $4, %esp
cfi_adjust_cfa_offset (-4)
18: shrl $31, %edx
fldl MOX(inf_zero, %edx, 8)
ret
cfi_adjust_cfa_offset (4)
.align ALIGNARG(4)
// x is ±0
20: fstp %st(0) // y
testb $2, %dl
jz 21f // y > 0
// x is ±0 and y is < 0. We must find out whether y is an odd integer.
testb $2, %dh
jz 25f
// fistpl raises invalid exception for |y| >= 1L<<31, so test
// that (in which case y is certainly even) before testing
// whether y is odd.
fld %st // y : y
fabs // |y| : y
fcompl MO(p31) // y
fnstsw
sahf
jnc 25f
fld %st // y : y
fistpl (%esp) // y
fildl (%esp) // int(y) : y
fucompp // <empty>
fnstsw
sahf
jne 26f
// OK, the value is an integer.
popl %edx
cfi_adjust_cfa_offset (-4)
testb $1, %dl
jz 27f // jump if not odd
// It's an odd integer.
// Raise divide-by-zero exception and get minus infinity value.
fldl MO(one)
fdivl MO(zero)
fchs
ret
cfi_adjust_cfa_offset (4)
/* Labels 25/26/27 share one tail and differ only in what still has to
   be cleaned up: 25 pops y from the FPU stack, 26 releases the scratch
   slot, 27 needs neither.  */
25: fstp %st(0)
26: addl $4, %esp
cfi_adjust_cfa_offset (-4)
27: // Raise divide-by-zero exception and get infinity value.
fldl MO(one)
fdivl MO(zero)
ret
cfi_adjust_cfa_offset (4)
.align ALIGNARG(4)
// x is ±0 and y is > 0. We must find out whether y is an odd integer.
21: testb $2, %dh
jz 22f
// fistpl raises invalid exception for |y| >= 1L<<31, so test
// that (in which case y is certainly even) before testing
// whether y is odd.
/* y is known to be positive here, so it is compared directly (no fabs)
   and fcoml leaves it on the stack.  */
fcoml MO(p31) // y
fnstsw
sahf
jnc 22f
fld %st // y : y
fistpl (%esp) // y
fildl (%esp) // int(y) : y
fucompp // <empty>
fnstsw
sahf
jne 23f
// OK, the value is an integer.
popl %edx
cfi_adjust_cfa_offset (-4)
testb $1, %dl
jz 24f // jump if not odd
// It's an odd integer.
fldl MO(mzero)
ret
cfi_adjust_cfa_offset (4)
/* Labels 22/23/24 mirror 25/26/27: pop y, release the scratch slot,
   then return +0.0.  */
22: fstp %st(0)
23: addl $4, %esp // Don't use pop.
cfi_adjust_cfa_offset (-4)
24: fldl MO(zero)
ret
END(__ieee754_powf)
/* __powf_finite is a second name for the same entry point.  */
strong_alias (__ieee754_powf, __powf_finite)

View File

@ -1 +0,0 @@
/* Not needed. */

View File

@ -2370,24 +2370,30 @@ ldouble: 1
Function: "pow_downward":
double: 1
float: 1
float128: 2
idouble: 1
ifloat: 1
ifloat128: 2
ildouble: 4
ldouble: 4
Function: "pow_towardzero":
double: 1
float: 1
float128: 2
idouble: 1
ifloat: 1
ifloat128: 2
ildouble: 4
ldouble: 4
Function: "pow_upward":
double: 1
float: 1
float128: 2
idouble: 1
ifloat: 1
ifloat128: 2
ildouble: 4
ldouble: 4

View File

@ -1 +0,0 @@
#include <sysdeps/../math/w_powf.c>

View File

@ -1,9 +1,10 @@
# When building the math subdirectory, add the *-sse2 builds of the
# float routines to the libm objects.
ifeq ($(subdir),math)
libm-sysdep_routines += e_exp2f-sse2 e_expf-sse2 e_logf-sse2 e_log2f-sse2 \
s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
e_powf-sse2 s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
# Per-file flags: enable SSE2 and select SSE for floating-point math.
CFLAGS-e_exp2f-sse2.c = -msse2 -mfpmath=sse
CFLAGS-e_expf-sse2.c = -msse2 -mfpmath=sse
CFLAGS-e_log2f-sse2.c = -msse2 -mfpmath=sse
CFLAGS-e_logf-sse2.c = -msse2 -mfpmath=sse
CFLAGS-e_powf-sse2.c = -msse2 -mfpmath=sse
endif

View File

@ -0,0 +1,3 @@
/* Build the generic flt-32 powf implementation under the name
   __powf_sse2: the rename must precede the #include so that the
   included source defines the renamed symbol.  */
#define __powf __powf_sse2
#include <sysdeps/ieee754/flt-32/e_powf.c>

View File

@ -0,0 +1,43 @@
/* Multiple versions of powf.
Copyright (C) 2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* Include <math.h> with powf renamed, so that its prototype is declared
   as __redirect_powf rather than powf.  */
#define powf __redirect_powf
/* Defined empty; presumably this stops <math.h> from attaching a SIMD
   declaration attribute to the renamed prototype -- confirm against the
   math-vector headers.  */
#define __DECL_SIMD___redirect_powf
#include <math.h>
#undef powf

/* SYMBOL_NAME is consumed by ifunc-sse2.h (not shown here); presumably
   the selector uses it to name the variants it chooses between.  */
#define SYMBOL_NAME powf
#include "ifunc-sse2.h"

/* Define __powf through the ifunc machinery, with the implementation
   picked by IFUNC_SELECTOR ().  */
libc_ifunc_redirected (__redirect_powf, __powf, IFUNC_SELECTOR ());

#ifdef SHARED
/* Hidden-visibility declaration tying __powf_ia32 to __GI___powf;
   NOTE(review): exact semantics come from __hidden_ver1, defined
   elsewhere.  */
__hidden_ver1 (__powf_ia32, __GI___powf, __redirect_powf)
__attribute__ ((visibility ("hidden")));

# include <shlib-compat.h>
/* Shared builds export __powf as powf at symbol version GLIBC_2_27.  */
versioned_symbol (libm, __powf, powf, GLIBC_2_27);
#else
/* Otherwise powf is a weak alias for __powf.  */
weak_alias (__powf, powf)
#endif

/* Additional names for the same entry point.  */
strong_alias (__powf, __ieee754_powf)
strong_alias (__powf, __powf_finite)

/* Build the generic flt-32 implementation under the name __powf_ia32;
   the rename must precede the #include.  */
#define __powf __powf_ia32
#include <sysdeps/ieee754/flt-32/e_powf.c>

View File

@ -2370,24 +2370,30 @@ ldouble: 1
Function: "pow_downward":
double: 1
float: 1
float128: 2
idouble: 1
ifloat: 1
ifloat128: 2
ildouble: 4
ldouble: 4
Function: "pow_towardzero":
double: 1
float: 1
float128: 2
idouble: 1
ifloat: 1
ifloat128: 2
ildouble: 4
ldouble: 4
Function: "pow_upward":
double: 1
float: 1
float128: 2
idouble: 1
ifloat: 1
ifloat128: 2
ildouble: 4
ldouble: 4
@ -2577,30 +2583,30 @@ ldouble: 5
Function: "tgamma_downward":
double: 3
float: 4
float: 5
float128: 5
idouble: 3
ifloat: 4
ifloat: 5
ifloat128: 5
ildouble: 5
ldouble: 5
Function: "tgamma_towardzero":
double: 4
float: 4
float: 5
float128: 5
idouble: 4
ifloat: 4
ifloat: 5
ifloat128: 5
ildouble: 5
ldouble: 5
Function: "tgamma_upward":
double: 4
float: 4
float: 6
float128: 4
idouble: 4
ifloat: 4
ifloat: 6
ifloat128: 4
ildouble: 5
ldouble: 5