mirror of
https://sourceware.org/git/glibc.git
synced 2024-12-12 22:30:12 +00:00
424c4f60ed
The algorithm is exp(y * log(x)), where log(x) is computed with about 1.3*2^-68 relative error (1.5*2^-68 without fma), returning the result in two doubles, and the exp part uses the same algorithm (and lookup tables) as exp, but takes the input as two doubles and a sign (to handle negative bases with odd integer exponent). The __exp1 internal symbol is no longer necessary. There is a separate code path when fma is not available but the worst-case error is about 0.54 ULP in both cases. The lookup table and constants for log are 4168 bytes. The .rodata+.text is decreased by 37908 bytes on aarch64. The non-nearest rounding error is less than 1 ULP. Improvements on Cortex-A72 compared to current glibc master: pow throughput: 2.40x in [0.01 11.1]x[0.01 11.1] pow latency: 1.84x in [0.01 11.1]x[0.01 11.1] Tested on aarch64-linux-gnu (defined __FP_FAST_FMA, TOINT_INTRINSICS) and arm-linux-gnueabihf (!defined __FP_FAST_FMA, !TOINT_INTRINSICS) and x86_64-linux-gnu (!defined __FP_FAST_FMA, !TOINT_INTRINSICS) and powerpc64le-linux-gnu (defined __FP_FAST_FMA, !TOINT_INTRINSICS) targets. * NEWS: Mention pow improvements. * math/Makefile (type-double-routines): Add e_pow_log_data. * sysdeps/generic/math_private.h (__exp1): Remove. * sysdeps/i386/fpu/e_pow_log_data.c: New file. * sysdeps/ia64/fpu/e_pow_log_data.c: New file. * sysdeps/ieee754/dbl-64/Makefile (CFLAGS-e_pow.c): Allow fma contraction. * sysdeps/ieee754/dbl-64/e_exp.c (__exp1): Remove. (exp_inline): Remove. (__ieee754_exp): Only single double input is handled. * sysdeps/ieee754/dbl-64/e_pow.c: Rewrite. * sysdeps/ieee754/dbl-64/e_pow_log_data.c: New file. * sysdeps/ieee754/dbl-64/math_config.h (issignaling_inline): Define. (__pow_log_data): Define. * sysdeps/ieee754/dbl-64/upow.h: Remove. * sysdeps/ieee754/dbl-64/upow.tbl: Remove. * sysdeps/m68k/m680x0/fpu/e_pow_log_data.c: New file. * sysdeps/x86_64/fpu/multiarch/Makefile (CFLAGS-e_pow-fma.c): Allow fma contraction. (CFLAGS-e_pow-fma4.c): Likewise.
134 lines
4.7 KiB
Makefile
134 lines
4.7 KiB
Makefile
# Extra libm objects and their per-file compiler flags.  Everything in
# this conditional takes effect only when $(subdir) expands to `math'.
#
# libm-sysdep_routines is only appended to here, and each
# CFLAGS-<name>.c variable is only assigned; both are consumed by make
# rules outside this file (presumably the shared math build rules --
# confirm against the including Makefile).
ifeq ($(subdir),math)
# Rounding functions, `-c' variants.
libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \
  s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \
  s_trunc-c s_truncf-c

# Rounding functions, `-sse4_1' variants.
libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
  s_floorf-sse4_1 s_nearbyint-sse4_1 \
  s_nearbyintf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
  s_trunc-sse4_1 s_truncf-sse4_1

# `-fma' variants.  Every object named here has a matching CFLAGS entry
# below that enables FMA and AVX2 code generation for that source file.
libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
  e_asin-fma e_atan2-fma s_sin-fma s_tan-fma \
  mpa-fma \
  sincos32-fma doasin-fma dosincos-fma \
  mpatan2-fma mpatan-fma mpsqrt-fma mptan-fma

CFLAGS-doasin-fma.c = -mfma -mavx2
CFLAGS-dosincos-fma.c = -mfma -mavx2
CFLAGS-e_asin-fma.c = -mfma -mavx2
CFLAGS-e_atan2-fma.c = -mfma -mavx2
CFLAGS-e_exp-fma.c = -mfma -mavx2
CFLAGS-e_log-fma.c = -mfma -mavx2
CFLAGS-e_pow-fma.c = -mfma -mavx2
CFLAGS-mpa-fma.c = -mfma -mavx2
CFLAGS-mpatan-fma.c = -mfma -mavx2
CFLAGS-mpatan2-fma.c = -mfma -mavx2
CFLAGS-mpsqrt-fma.c = -mfma -mavx2
CFLAGS-mptan-fma.c = -mfma -mavx2
CFLAGS-s_atan-fma.c = -mfma -mavx2
CFLAGS-sincos32-fma.c = -mfma -mavx2
CFLAGS-s_sin-fma.c = -mfma -mavx2
CFLAGS-s_tan-fma.c = -mfma -mavx2

# `-sse2' variants of sinf/cosf/sincosf.  No per-file flags are set for
# them in this file.
libm-sysdep_routines += s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2

# `-fma' variants of the `f'-suffixed routines, with the same
# -mfma -mavx2 flags as the group above.
libm-sysdep_routines += e_exp2f-fma e_expf-fma e_log2f-fma e_logf-fma \
  e_powf-fma s_sinf-fma s_cosf-fma s_sincosf-fma

CFLAGS-e_exp2f-fma.c = -mfma -mavx2
CFLAGS-e_expf-fma.c = -mfma -mavx2
CFLAGS-e_log2f-fma.c = -mfma -mavx2
CFLAGS-e_logf-fma.c = -mfma -mavx2
CFLAGS-e_powf-fma.c = -mfma -mavx2
CFLAGS-s_sinf-fma.c = -mfma -mavx2
CFLAGS-s_cosf-fma.c = -mfma -mavx2
CFLAGS-s_sincosf-fma.c = -mfma -mavx2

# `-fma4' variants: the same routine set as the first `-fma' group, but
# compiled with -mfma4 instead of -mfma -mavx2.
libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \
  e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \
  mpa-fma4 \
  sincos32-fma4 doasin-fma4 dosincos-fma4 \
  mpatan2-fma4 mpatan-fma4 mpsqrt-fma4 mptan-fma4

CFLAGS-doasin-fma4.c = -mfma4
CFLAGS-dosincos-fma4.c = -mfma4
CFLAGS-e_asin-fma4.c = -mfma4
CFLAGS-e_atan2-fma4.c = -mfma4
CFLAGS-e_exp-fma4.c = -mfma4
CFLAGS-e_log-fma4.c = -mfma4
CFLAGS-e_pow-fma4.c = -mfma4
CFLAGS-mpa-fma4.c = -mfma4
CFLAGS-mpatan-fma4.c = -mfma4
CFLAGS-mpatan2-fma4.c = -mfma4
CFLAGS-mpsqrt-fma4.c = -mfma4
CFLAGS-mptan-fma4.c = -mfma4
CFLAGS-s_atan-fma4.c = -mfma4
CFLAGS-sincos32-fma4.c = -mfma4
CFLAGS-s_sin-fma4.c = -mfma4
CFLAGS-s_tan-fma4.c = -mfma4

# `-avx' variants.  Each is compiled with -msse2avx and with the
# SSE2AVX preprocessor macro defined; how the sources use that macro is
# not visible from this file.
libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \
  e_atan2-avx s_sin-avx s_tan-avx \
  mpa-avx

CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX
CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX
CFLAGS-e_log-avx.c = -msse2avx -DSSE2AVX
CFLAGS-mpa-avx.c = -msse2avx -DSSE2AVX
CFLAGS-s_atan-avx.c = -msse2avx -DSSE2AVX
CFLAGS-s_sin-avx.c = -msse2avx -DSSE2AVX
CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX
endif

# Extra libmvec objects.  This conditional takes effect only when
# $(subdir) expands to `mathvec'.
#
# libmvec-sysdep_routines is only appended to here; it is consumed by
# make rules outside this file.  No per-file CFLAGS are set in this
# block.  The list holds two naming families: `_core_sse4', `_core_avx2'
# and `_core_avx512' first, then `_core-sse2', `_core-sse' and
# `_core-avx2'.
#
# The whole list is one logical line: every line except the last must
# end in a backslash, and no comment may appear inside it.
ifeq ($(subdir),mathvec)
libmvec-sysdep_routines += svml_d_cos2_core_sse4 svml_d_cos4_core_avx2 \
  svml_d_cos8_core_avx512 svml_d_sin2_core_sse4 \
  svml_d_sin4_core_avx2 svml_d_sin8_core_avx512 \
  svml_d_log2_core_sse4 svml_d_log4_core_avx2 \
  svml_d_log8_core_avx512 svml_d_sincos2_core_sse4 \
  svml_d_sincos4_core_avx2 svml_d_sincos8_core_avx512 \
  svml_s_cosf4_core_sse4 svml_s_cosf8_core_avx2 \
  svml_s_cosf16_core_avx512 svml_s_sinf4_core_sse4 \
  svml_s_sinf8_core_avx2 svml_s_sinf16_core_avx512 \
  svml_s_logf4_core_sse4 svml_s_logf8_core_avx2 \
  svml_s_logf16_core_avx512 svml_d_exp2_core_sse4 \
  svml_d_exp4_core_avx2 svml_d_exp8_core_avx512 \
  svml_s_expf4_core_sse4 svml_s_expf8_core_avx2 \
  svml_s_expf16_core_avx512 svml_d_pow2_core_sse4 \
  svml_d_pow4_core_avx2 svml_d_pow8_core_avx512 \
  svml_s_powf4_core_sse4 svml_s_powf8_core_avx2 \
  svml_s_powf16_core_avx512 svml_s_sincosf4_core_sse4 \
  svml_s_sincosf8_core_avx2 \
  svml_s_sincosf16_core_avx512 \
  svml_d_cos2_core-sse2 svml_d_cos4_core-sse \
  svml_d_cos8_core-avx2 svml_d_exp2_core-sse2 \
  svml_d_exp4_core-sse svml_d_exp8_core-avx2 \
  svml_d_log2_core-sse2 svml_d_log4_core-sse \
  svml_d_log8_core-avx2 svml_d_pow2_core-sse2 \
  svml_d_pow4_core-sse svml_d_pow8_core-avx2 \
  svml_d_sin2_core-sse2 svml_d_sin4_core-sse \
  svml_d_sin8_core-avx2 \
  svml_d_sincos2_core-sse2 \
  svml_d_sincos4_core-sse \
  svml_d_sincos8_core-avx2 \
  svml_s_cosf16_core-avx2 \
  svml_s_cosf4_core-sse2 \
  svml_s_cosf8_core-sse \
  svml_s_expf16_core-avx2 \
  svml_s_expf4_core-sse2 \
  svml_s_expf8_core-sse \
  svml_s_logf16_core-avx2 \
  svml_s_logf4_core-sse2 \
  svml_s_logf8_core-sse \
  svml_s_powf16_core-avx2 \
  svml_s_powf4_core-sse2 \
  svml_s_powf8_core-sse \
  svml_s_sincosf16_core-avx2 \
  svml_s_sincosf4_core-sse2 \
  svml_s_sincosf8_core-sse \
  svml_s_sinf16_core-avx2 \
  svml_s_sinf4_core-sse2 \
  svml_s_sinf8_core-sse
endif