i386: Replace assembly versions of e_powf with generic e_powf.c

This patch replaces i386 assembly versions of e_powf with generic e_powf.c.

For workload-spec2017.wrf, on Nehalem, it improves performance by:

                        Before    After     Improvement
reciprocal-throughput   230.855   78.3358   194%
latency                 231.685   94.1259   146%

On Skylake, it improves performance by:

                        Before    After     Improvement
reciprocal-throughput   239.858   47.4713   405%
latency                 247.57    93.8798   163%

On IvyBridge with --disable-multi-arch, it improves performance by:

                        Before    After     Improvement
reciprocal-throughput   269.078   63.3758   324%
latency                 271.473   102.091   165%

	* sysdeps/i386/fpu/e_powf.S: Removed.
	* sysdeps/i386/fpu/e_powf_log2_data.c: Likewise.
	* sysdeps/i386/fpu/w_powf.c: Likewise.
	* sysdeps/i386/fpu/libm-test-ulps: Updated for generic e_powf.c.
	* sysdeps/i386/i686/fpu/multiarch/libm-test-ulps: Likewise.
	* sysdeps/i386/i686/fpu/multiarch/Makefile (libm-sysdep_routines):
	Add e_powf-sse2.
	(CFLAGS-e_powf-sse2.c): New.
	* sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c: New file.
	* sysdeps/i386/i686/fpu/multiarch/e_powf.c: Likewise.
2025-01-11 03:40:06 +00:00 · 2017-10-22 08:11:15 -07:00 · 2017-10-22 08:11:15 -07:00 · 5313581cb5
commit 5313581cb5
parent 6089a3ee24
9 changed files with 79 additions and 401 deletions
--- a/13
+++ b/13
@ -1,3 +1,16 @@
 2017-10-22  H.J. Lu  <hongjiu.lu@intel.com>
 	* sysdeps/i386/fpu/e_powf.S: Removed.
 	* sysdeps/i386/fpu/e_powf_log2_data.c: Likewise.
 	* sysdeps/i386/fpu/w_powf.c: Likewise.
 	* sysdeps/i386/fpu/libm-test-ulps: Updated for generic e_powf.c.
 	* sysdeps/i386/i686/fpu/multiarch/libm-test-ulps: Likewise.
 	* sysdeps/i386/i686/fpu/multiarch/Makefile (libm-sysdep_routines):
 	Add e_powf-sse2.
 	(CFLAGS-e_powf-sse2.c): New.
 	* sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c: New file.
 	* sysdeps/i386/i686/fpu/multiarch/e_powf.c: Likewise.
 2017-10-22  H.J. Lu  <hongjiu.lu@intel.com>
 	* sysdeps/i386/fpu/e_log2f.S: Removed.
--- a/sysdeps/i386/fpu/e_powf.S
+++ b/sysdeps/i386/fpu/e_powf.S
@ -1,392 +0,0 @@
 /* ix87 specific implementation of pow function.
   Copyright (C) 1996-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
 #include <machine/asm.h>
 #include <i386-math-asm.h>
 	/* 8-byte double-precision constants used by __ieee754_powf.  */
 	.section .rodata.cst8,"aM",@progbits,8
 	.p2align 3
 	.type one,@object
 one:	.double 1.0
 	ASM_SIZE_DIRECTIVE(one)
 	/* Threshold on |x-1|: at or below it the code uses fyl2xp1 on x-1,
 	   above it fyl2x on x.  */
 	.type limit,@object
 limit:	.double 0.29
 	ASM_SIZE_DIRECTIVE(limit)
 	/* 0x41e0000000000000 (little-endian bytes) = 2^31 as a double; |y| is
 	   compared against it before any fistpl conversion.  */
 	.type p31,@object
 p31:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41
 	ASM_SIZE_DIRECTIVE(p31)
 	/* Two-entry lookup tables of doubles, indexed via MOX.  */
 	.section .rodata.cst16,"aM",@progbits,16
 	.p2align 3
 	/* inf_zero[0] = +inf (0x7ff0000000000000), inf_zero[1] = +0.0.  */
 	.type infinity,@object
 inf_zero:
 infinity:
 	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f
 	ASM_SIZE_DIRECTIVE(infinity)
 	.type zero,@object
 zero:	.double 0.0
 	ASM_SIZE_DIRECTIVE(zero)
 	/* minf_mzero[0] = -inf (0xfff0000000000000),
 	   minf_mzero[1] = -0.0 (0x8000000000000000).  */
 	.type minf_mzero,@object
 minf_mzero:
 minfinity:
 	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff
 mzero:
 	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
 	ASM_SIZE_DIRECTIVE(minf_mzero)
 /* NOTE(review): DEFINE_FLT_MIN comes from i386-math-asm.h; presumably it
    emits the constant needed by FLT_NARROW_EVAL_UFLOW_NONNAN -- confirm.  */
 DEFINE_FLT_MIN
 /* Memory-operand helpers for the constants above.
    MO(op)      - address of symbol OP.
    MOX(op,x,f) - address of symbol OP indexed by register X scaled by F.
    When PIC is defined both forms are @GOTOFF references relative to
    %ecx, so %ecx must hold the PIC base (loaded with LOAD_PIC_REG (cx) in
    the function); otherwise the symbol is addressed directly.  */
 #ifdef PIC
 # define MO(op) op##@GOTOFF(%ecx)
 # define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
 #else
 # define MO(op) op
 # define MOX(op,x,f) op(,x,f)
 #endif
 	.text
 /* __ieee754_powf: x87 implementation of powf (x raised to the power y).

    In:   x = single-precision value at 4(%esp) on entry
          y = single-precision value at 8(%esp) on entry
    Out:  result on top of the x87 register stack.

    Register roles:
      %ax  - scratch for the FPU status word (fnstsw)
      %dl  - fxam status byte of y; bit 1 (0x02) is the sign of y
      %dh  - fxam status byte of x; bit 1 (0x02) is the sign of x
      %ecx - PIC base when PIC is defined (used by MO/MOX)
      %edx - reused for y converted to a 32-bit integer once popped

    The fxam class is read from %ah masked with 0x45:
      0x40 = zero, 0x05 = infinity, 0x01 = NaN.

    Stack invariant: once y is known to be neither zero, infinite nor NaN,
    a 4-byte scratch slot is reserved (subl $4, %esp).  It is the target of
    the fistpl/fildl round trips that test whether y is an integer, and
    every exit path after that point releases it with popl or addl $4.

    Dispatch labels:
      11 - y is +-0             12 - y is +-inf        30 - y is NaN
      20 - x is +-0             15 - x is +-inf        33 - x is NaN
       2 - |y| >= 2^31           3 - y is not an integer
       4/5/6 - integer y: square-and-multiply loop.  */
 ENTRY(__ieee754_powf)
 	flds	8(%esp)	// y
 	fxam
 #ifdef	PIC
 	LOAD_PIC_REG (cx)
 #endif
 	fnstsw
 	// Keep the unmasked status byte of y; its sign bit is tested later.
 	movb	%ah, %dl
 	andb	$0x45, %ah
 	cmpb	$0x40, %ah	// is y == 0 ?
 	je	11f
 	cmpb	$0x05, %ah	// is y == ±inf ?
 	je	12f
 	cmpb	$0x01, %ah	// is y == NaN ?
 	je	30f
 	flds	4(%esp)		// x : y
 	// Reserve the scratch slot used by fistpl/fildl below.
 	subl	$4, %esp
 	cfi_adjust_cfa_offset (4)
 	fxam
 	fnstsw
 	// Keep the unmasked status byte of x; its sign bit is tested later.
 	movb	%ah, %dh
 	andb	$0x45, %ah
 	cmpb	$0x40, %ah
 	je	20f		// x is ±0
 	cmpb	$0x05, %ah
 	je	15f		// x is ±inf
 	cmpb	$0x01, %ah
 	je	33f		// x is NaN
 	fxch			// y : x
 	/* fistpl raises invalid exception for |y| >= 1L<<31.  */
 	fld	%st		// y : y : x
 	fabs			// |y| : y : x
 	fcompl	MO(p31)		// y : x
 	fnstsw
 	sahf
 	jnc	2f
 	/* First see whether `y' is a natural number.  In this case we
 	   can use a more precise algorithm.  */
 	fld	%st		// y : y : x
 	fistpl	(%esp)		// y : x
 	fildl	(%esp)		// int(y) : y : x
 	fucomp	%st(1)		// y : x
 	fnstsw
 	sahf
 	jne	3f
 	/* OK, we have an integer value for y.  */
 	popl	%edx
 	cfi_adjust_cfa_offset (-4)
 	// Set the sign flag from the integer y without changing its value.
 	orl	$0, %edx
 	fstp	%st(0)		// x
 	jns	4f		// y >= 0, jump
 	fdivrl	MO(one)		// 1/x		(now referred to as x)
 	negl	%edx
 4:	fldl	MO(one)		// 1 : x
 	fxch
 	/* If y is even, take the absolute value of x.  Otherwise,
 	   ensure all intermediate values that might overflow have the
 	   sign of x.  */
 	testb	$1, %dl
 	jnz	6f
 	fabs
 	// Loop: shift out one bit of the exponent per iteration; multiply
 	// the accumulator when the bit is set, then square x.
 6:	shrl	$1, %edx
 	jnc	5f
 	fxch
 	fabs
 	fmul	%st(1)		// x : ST*x
 	fxch
 5:	fld	%st		// x : x : ST*x
 	fabs			// |x| : x : ST*x
 	fmulp			// |x|*x : ST*x
 	testl	%edx, %edx
 	jnz	6b
 	fstp	%st(0)		// ST*x
 	FLT_NARROW_EVAL_UFLOW_NONNAN
 	ret
 	/* y is ±NAN */
 	// Return 1.0 (the value of x) when x == 1.0, otherwise return y.
 30:	flds	4(%esp)		// x : y
 	fldl	MO(one)		// 1.0 : x : y
 	fucomp	%st(1)		// x : y
 	fnstsw
 	sahf
 	je	31f
 	fxch			// y : x
 31:	fstp	%st(1)
 	ret
 	cfi_adjust_cfa_offset (4)
 	.align ALIGNARG(4)
 2:	/* y is a large integer (so even).  */
 	fxch			// x : y
 	fabs			// |x| : y
 	fxch			// y : x
 	.align ALIGNARG(4)
 3:	/* y is a real number.  */
 	fxch			// x : y
 	fldl	MO(one)		// 1.0 : x : y
 	fldl	MO(limit)	// 0.29 : 1.0 : x : y
 	fld	%st(2)		// x : 0.29 : 1.0 : x : y
 	fsub	%st(2)		// x-1 : 0.29 : 1.0 : x : y
 	fabs			// |x-1| : 0.29 : 1.0 : x : y
 	fucompp			// 1.0 : x : y
 	fnstsw
 	fxch			// x : 1.0 : y
 	sahf
 	// |x-1| > limit: use fyl2x on x; otherwise fyl2xp1 on x-1.
 	ja	7f
 	fsub	%st(1)		// x-1 : 1.0 : y
 	fyl2xp1			// log2(x) : y
 	jmp	8f
 7:	fyl2x			// log2(x) : y
 8:	fmul	%st(1)		// y*log2(x) : y
 	fst	%st(1)		// y*log2(x) : y*log2(x)
 	frndint			// int(y*log2(x)) : y*log2(x)
 	fsubr	%st, %st(1)	// int(y*log2(x)) : fract(y*log2(x))
 	fxch			// fract(y*log2(x)) : int(y*log2(x))
 	f2xm1			// 2^fract(y*log2(x))-1 : int(y*log2(x))
 	faddl	MO(one)		// 2^fract(y*log2(x)) : int(y*log2(x))
 	fscale			// 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x))
 32:	addl	$4, %esp
 	cfi_adjust_cfa_offset (-4)
 	fstp	%st(1)		// 2^fract(y*log2(x))*2^int(y*log2(x))
 	FLT_NARROW_EVAL_UFLOW_NONNAN
 	ret
 	/* x is NaN.  */
 	cfi_adjust_cfa_offset (4)
 33:	addl	$4, %esp
 	cfi_adjust_cfa_offset (-4)
 	fstp	%st(1)
 	ret
 	// pow(x,±0) = 1
 	.align ALIGNARG(4)
 11:	fstp	%st(0)		// pop y
 	fldl	MO(one)
 	ret
 	// y == ±inf
 	.align ALIGNARG(4)
 12:	fstp	%st(0)		// pop y
 	fldl	MO(one)		// 1
 	flds	4(%esp)		// x : 1
 	fabs			// abs(x) : 1
 	fucompp			// < 1, == 1, or > 1
 	fnstsw
 	andb	$0x45, %ah
 	cmpb	$0x45, %ah
 	je	13f		// jump if x is NaN
 	cmpb	$0x40, %ah
 	je	14f		// jump if |x| == 1
 	// %ah is now 0x01 if |x| < 1 and 0x00 if |x| > 1.  Move that bit
 	// onto the sign-bit position of %dl and xor: %edx becomes 2 iff
 	// (|x| < 1) xor (y < 0), selecting 0.0 (offset 8) over +inf
 	// (offset 0) in inf_zero.
 	shlb	$1, %ah
 	xorb	%ah, %dl
 	andl	$2, %edx
 	fldl	MOX(inf_zero, %edx, 4)
 	ret
 	.align ALIGNARG(4)
 14:	fldl	MO(one)
 	ret
 	.align ALIGNARG(4)
 13:	flds	4(%esp)		// load x == NaN
 	ret
 	cfi_adjust_cfa_offset (4)
 	.align ALIGNARG(4)
 	// x is ±inf
 15:	fstp	%st(0)		// y
 	testb	$2, %dh
 	jz	16f		// jump if x == +inf
 	// fistpl raises invalid exception for |y| >= 1L<<31, so test
 	// that (in which case y is certainly even) before testing
 	// whether y is odd.
 	fld	%st		// y : y
 	fabs			// |y| : y
 	fcompl	MO(p31)		// y
 	fnstsw
 	sahf
 	jnc	16f
 	// We must find out whether y is an odd integer.
 	fld	%st		// y : y
 	fistpl	(%esp)		// y
 	fildl	(%esp)		// int(y) : y
 	fucompp			// <empty>
 	fnstsw
 	sahf
 	jne	17f
 	// OK, the value is an integer.
 	popl	%edx
 	cfi_adjust_cfa_offset (-4)
 	testb	$1, %dl
 	jz	18f		// jump if not odd
 	// It's an odd integer.
 	// Sign bit of the integer y picks -inf (y > 0) or -0.0 (y < 0).
 	shrl	$31, %edx
 	fldl	MOX(minf_mzero, %edx, 8)
 	ret
 	cfi_adjust_cfa_offset (4)
 	.align ALIGNARG(4)
 16:	fcompl	MO(zero)
 	addl	$4, %esp
 	cfi_adjust_cfa_offset (-4)
 	fnstsw
 	// C0 (bit 8 of %ax) is set when y < 0; shift it down to bit 3 so
 	// %eax is 8 (selects 0.0) or 0 (selects +inf) as a byte offset.
 	shrl	$5, %eax
 	andl	$8, %eax
 	fldl	MOX(inf_zero, %eax, 1)
 	ret
 	cfi_adjust_cfa_offset (4)
 	.align ALIGNARG(4)
 17:	shll	$30, %edx	// sign bit for y in right position
 	addl	$4, %esp
 	cfi_adjust_cfa_offset (-4)
 18:	shrl	$31, %edx
 	fldl	MOX(inf_zero, %edx, 8)
 	ret
 	cfi_adjust_cfa_offset (4)
 	.align ALIGNARG(4)
 	// x is ±0
 20:	fstp	%st(0)		// y
 	testb	$2, %dl
 	jz	21f		// y > 0
 	// x is ±0 and y is < 0.  We must find out whether y is an odd integer.
 	// For x == +0 the sign of the result does not depend on y's parity.
 	testb	$2, %dh
 	jz	25f
 	// fistpl raises invalid exception for |y| >= 1L<<31, so test
 	// that (in which case y is certainly even) before testing
 	// whether y is odd.
 	fld	%st		// y : y
 	fabs			// |y| : y
 	fcompl	MO(p31)		// y
 	fnstsw
 	sahf
 	jnc	25f
 	fld	%st		// y : y
 	fistpl	(%esp)		// y
 	fildl	(%esp)		// int(y) : y
 	fucompp			// <empty>
 	fnstsw
 	sahf
 	jne	26f
 	// OK, the value is an integer.
 	popl	%edx
 	cfi_adjust_cfa_offset (-4)
 	testb	$1, %dl
 	jz	27f		// jump if not odd
 	// It's an odd integer.
 	// Raise divide-by-zero exception and get minus infinity value.
 	fldl	MO(one)
 	fdivl	MO(zero)
 	fchs
 	ret
 	cfi_adjust_cfa_offset (4)
 	// 25: y still on the FPU stack and scratch slot still allocated.
 	// 26: FPU stack already empty, scratch slot still allocated.
 	// 27: both already released.
 25:	fstp	%st(0)
 26:	addl	$4, %esp
 	cfi_adjust_cfa_offset (-4)
 27:	// Raise divide-by-zero exception and get infinity value.
 	fldl	MO(one)
 	fdivl	MO(zero)
 	ret
 	cfi_adjust_cfa_offset (4)
 	.align ALIGNARG(4)
 	// x is ±0 and y is > 0.  We must find out whether y is an odd integer.
 21:	testb	$2, %dh
 	jz	22f
 	// fistpl raises invalid exception for |y| >= 1L<<31, so test
 	// that (in which case y is certainly even) before testing
 	// whether y is odd.
 	// y > 0 here, so no fabs is needed before the comparison.
 	fcoml	MO(p31)		// y
 	fnstsw
 	sahf
 	jnc	22f
 	fld	%st		// y : y
 	fistpl	(%esp)		// y
 	fildl	(%esp)		// int(y) : y
 	fucompp			// <empty>
 	fnstsw
 	sahf
 	jne	23f
 	// OK, the value is an integer.
 	popl	%edx
 	cfi_adjust_cfa_offset (-4)
 	testb	$1, %dl
 	jz	24f		// jump if not odd
 	// It's an odd integer.
 	fldl	MO(mzero)
 	ret
 	cfi_adjust_cfa_offset (4)
 	// 22: y still on the FPU stack and scratch slot still allocated.
 	// 23: FPU stack already empty, scratch slot still allocated.
 	// 24: both already released.
 22:	fstp	%st(0)
 23:	addl	$4, %esp	// Don't use pop.
 	cfi_adjust_cfa_offset (-4)
 24:	fldl	MO(zero)
 	ret
 END(__ieee754_powf)
 /* Export the same entry point under the name __powf_finite.  */
 strong_alias (__ieee754_powf, __powf_finite)
--- a/sysdeps/i386/fpu/e_powf_log2_data.c
+++ b/sysdeps/i386/fpu/e_powf_log2_data.c
@ -1 +0,0 @@
 /* Not needed.  */
--- a/sysdeps/i386/fpu/libm-test-ulps
+++ b/sysdeps/i386/fpu/libm-test-ulps
@ -2370,24 +2370,30 @@ ldouble: 1
 Function: "pow_downward":
 double: 1
 float: 1
 float128: 2
 idouble: 1
 ifloat: 1
 ifloat128: 2
 ildouble: 4
 ldouble: 4
 Function: "pow_towardzero":
 double: 1
 float: 1
 float128: 2
 idouble: 1
 ifloat: 1
 ifloat128: 2
 ildouble: 4
 ldouble: 4
 Function: "pow_upward":
 double: 1
 float: 1
 float128: 2
 idouble: 1
 ifloat: 1
 ifloat128: 2
 ildouble: 4
 ldouble: 4
--- a/sysdeps/i386/fpu/w_powf.c
+++ b/sysdeps/i386/fpu/w_powf.c
@ -1 +0,0 @@
 #include <sysdeps/../math/w_powf.c>
--- a/sysdeps/i386/i686/fpu/multiarch/Makefile
+++ b/sysdeps/i386/i686/fpu/multiarch/Makefile
@ -1,9 +1,10 @@
 ifeq ($(subdir),math)
 libm-sysdep_routines += e_exp2f-sse2 e_expf-sse2 e_logf-sse2 e_log2f-sse2 \
-			s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
+			e_powf-sse2 s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
 CFLAGS-e_exp2f-sse2.c = -msse2 -mfpmath=sse
 CFLAGS-e_expf-sse2.c = -msse2 -mfpmath=sse
 CFLAGS-e_log2f-sse2.c = -msse2 -mfpmath=sse
 CFLAGS-e_logf-sse2.c = -msse2 -mfpmath=sse
 CFLAGS-e_powf-sse2.c = -msse2 -mfpmath=sse
 endif
--- a/sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c
+++ b/sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c
@ -0,0 +1,3 @@
 /* Build the generic flt-32 powf under the name __powf_sse2.  The
    multiarch Makefile compiles this file with -msse2 -mfpmath=sse.  */
 #define __powf __powf_sse2
 #include <sysdeps/ieee754/flt-32/e_powf.c>
--- a/sysdeps/i386/i686/fpu/multiarch/e_powf.c
+++ b/sysdeps/i386/i686/fpu/multiarch/e_powf.c
@ -0,0 +1,43 @@
 /* Multiple versions of powf.
   Copyright (C) 2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
 /* Include <math.h> with powf renamed, so that its prototype is declared
    as __redirect_powf and the name powf stays free to be defined here.  */
 #define powf __redirect_powf
 /* NOTE(review): presumably empties the SIMD declaration attribute that
    <math.h> would otherwise attach to the renamed prototype -- confirm.  */
 #define __DECL_SIMD___redirect_powf
 #include <math.h>
 #undef powf
 /* NOTE(review): ifunc-sse2.h is expected to use SYMBOL_NAME to declare
    the candidate implementations and to provide IFUNC_SELECTOR -- confirm
    against that header.  */
 #define SYMBOL_NAME powf
 #include "ifunc-sse2.h"
 /* Define __powf via libc_ifunc_redirected, typed after __redirect_powf,
    with IFUNC_SELECTOR () supplying the implementation to use.  */
 libc_ifunc_redirected (__redirect_powf, __powf, IFUNC_SELECTOR ());
 #ifdef SHARED
 /* NOTE(review): presumably binds the hidden internal name __GI___powf
    to __powf_ia32 for calls made inside the library -- confirm.  */
 __hidden_ver1 (__powf_ia32, __GI___powf, __redirect_powf)
  __attribute__ ((visibility ("hidden")));
 # include <shlib-compat.h>
 /* Export __powf from libm as powf at symbol version GLIBC_2_27.  */
 versioned_symbol (libm, __powf, powf, GLIBC_2_27);
 #else
 /* Non-shared builds: powf is a plain weak alias of __powf.  */
 weak_alias (__powf, powf)
 #endif
 /* Names that the removed assembly e_powf.S used to provide.  */
 strong_alias (__powf, __ieee754_powf)
 strong_alias (__powf, __powf_finite)
 /* Build the generic flt-32 powf under the name __powf_ia32.  */
 #define __powf __powf_ia32
 #include <sysdeps/ieee754/flt-32/e_powf.c>
--- a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
+++ b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
@ -2370,24 +2370,30 @@ ldouble: 1
 Function: "pow_downward":
 double: 1
 float: 1
 float128: 2
 idouble: 1
 ifloat: 1
 ifloat128: 2
 ildouble: 4
 ldouble: 4
 Function: "pow_towardzero":
 double: 1
 float: 1
 float128: 2
 idouble: 1
 ifloat: 1
 ifloat128: 2
 ildouble: 4
 ldouble: 4
 Function: "pow_upward":
 double: 1
 float: 1
 float128: 2
 idouble: 1
 ifloat: 1
 ifloat128: 2
 ildouble: 4
 ldouble: 4
@ -2577,30 +2583,30 @@ ldouble: 5
 Function: "tgamma_downward":
 double: 3
-float: 4
+float: 5
 float128: 5
 idouble: 3
-ifloat: 4
+ifloat: 5
 ifloat128: 5
 ildouble: 5
 ldouble: 5
 Function: "tgamma_towardzero":
 double: 4
-float: 4
+float: 5
 float128: 5
 idouble: 4
-ifloat: 4
+ifloat: 5
 ifloat128: 5
 ildouble: 5
 ldouble: 5
 Function: "tgamma_upward":
 double: 4
-float: 4
+float: 6
 float128: 4
 idouble: 4
-ifloat: 4
+ifloat: 6
 ifloat128: 4
 ildouble: 5
 ldouble: 5
		`@ -0,0 +1,3 @@`
							`#define __powf __powf_sse2`

							`#include <sysdeps/ieee754/flt-32/e_powf.c>`