x86-64: Compile branred.c with -mprefer-vector-width=128 [BZ #24603]

When compiled with -O3 and AVX, GCC 8 and 9 optimize some loops in
sysdeps/ieee754/dbl-64/branred.c with 256-bit vector instructions,
which leads to store-forwarding stalls:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90579

There is no easy fix in the compiler.  This patch limits the vector
width to 128 bits to work around this issue.  It improves the performance
of sin and cos by more than 40% on Skylake when compiled with
-O3 -march=skylake.

Tested with GCC 7/8/9 on x86-64.

	[BZ #24603]
	* sysdeps/x86_64/configure.ac: Check if -mprefer-vector-width=128
	works.
	* sysdeps/x86_64/configure: Regenerated.
	* sysdeps/x86_64/fpu/Makefile (CFLAGS-branred.c): New.  Set
	to -mprefer-vector-width=128 if supported.
This commit is contained in:
H.J. Lu 2019-07-24 14:48:33 -07:00
parent 82c664ed75
commit 7e681561a3
4 changed files with 52 additions and 0 deletions

View File

@ -1,3 +1,12 @@
2019-07-24 H.J. Lu <hongjiu.lu@intel.com>
[BZ #24603]
* sysdeps/x86_64/configure.ac: Check if -mprefer-vector-width=128
works.
* sysdeps/x86_64/configure: Regenerated.
* sysdeps/x86_64/fpu/Makefile (CFLAGS-branred.c): New. Set
to -mprefer-vector-width=128 if supported.
2019-07-24 Florian Weimer <fweimer@redhat.com>
* scripts/build-many-glibcs.py (Context.checkout): Default to

View File

@ -54,6 +54,28 @@ fi
config_vars="$config_vars
config-cflags-avx512 = $libc_cv_cc_avx512"
# Check whether the C compiler accepts -mprefer-vector-width=128.
# The "checking ..." message is written to descriptor 5 and, without a
# trailing newline, to descriptor 6 (presumably the configure log and the
# user-visible output respectively -- confirm against the top of configure).
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking -mprefer-vector-width=128" >&5
$as_echo_n "checking -mprefer-vector-width=128... " >&6; }
# If libc_cv_cc_mprefer_vector_width is already set, reuse that value and
# only report "(cached)"; otherwise run the compile trial below.
if ${libc_cv_cc_mprefer_vector_width+:} false; then :
$as_echo_n "(cached) " >&6
else
# Compile an empty C input (/dev/null) to assembly with the option and
# discard the output.  The command line and its exit status are logged to
# descriptor 5; an exit status of 0 means the option is accepted.
if { ac_try='${CC-cc} -mprefer-vector-width=128 -xc /dev/null -S -o /dev/null'
{ { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
(eval $ac_try) 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; }; then :
libc_cv_cc_mprefer_vector_width=yes
else
libc_cv_cc_mprefer_vector_width=no
fi
fi
# Report the yes/no result on both descriptors.
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_mprefer_vector_width" >&5
$as_echo "$libc_cv_cc_mprefer_vector_width" >&6; }
# Append "config-cflags-mprefer-vector-width = yes|no" as a new line of
# config_vars so the result is available under that name.
config_vars="$config_vars
config-cflags-mprefer-vector-width = $libc_cv_cc_mprefer_vector_width"
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Intel MPX support" >&5
$as_echo_n "checking for Intel MPX support... " >&6; }
if ${libc_cv_asm_mpx+:} false; then :

View File

@ -25,6 +25,15 @@ if test $libc_cv_cc_avx512 = yes; then
fi
LIBC_CONFIG_VAR([config-cflags-avx512], [$libc_cv_cc_avx512])
dnl Check if -mprefer-vector-width=128 works.
dnl The yes/no answer is cached in libc_cv_cc_mprefer_vector_width and
dnl then recorded as the config variable config-cflags-mprefer-vector-width
dnl so that makefiles can test it.
AC_CACHE_CHECK(-mprefer-vector-width=128, libc_cv_cc_mprefer_vector_width, [dnl
LIBC_TRY_CC_OPTION([-mprefer-vector-width=128],
[libc_cv_cc_mprefer_vector_width=yes],
[libc_cv_cc_mprefer_vector_width=no])
])
LIBC_CONFIG_VAR([config-cflags-mprefer-vector-width],
[$libc_cv_cc_mprefer_vector_width])
dnl Check whether asm supports Intel MPX
AC_CACHE_CHECK(for Intel MPX support, libc_cv_asm_mpx, [dnl
cat > conftest.s <<\EOF

View File

@ -237,3 +237,15 @@ CFLAGS-test-float-libmvec-sincosf-avx512.c = -DREQUIRE_AVX512F
CFLAGS-test-float-libmvec-sincosf-avx512-main.c = $(libmvec-sincos-cflags) $(float-vlen16-arch-ext-cflags)
endif
endif
ifeq ($(subdir)$(config-cflags-mprefer-vector-width),mathyes)
# Applies only when building the math subdirectory and only when configure
# found that the compiler accepts -mprefer-vector-width=128.
#
# When compiled with -O3 -march=skylake, GCC 8 and 9 optimize some loops
# in branred.c with 256-bit vector instructions, which leads to
# store-forwarding stalls:
#
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90579
#
# Limit the vector width to 128 bits to work around this issue.  It
# improves performance of sin and cos by more than 40% on Skylake.
#
# Append with += instead of assigning with =, so that any flags another
# makefile has already added to CFLAGS-branred.c are kept.
CFLAGS-branred.c += -mprefer-vector-width=128
endif