math: Add new exp10 implementation

New implementation is based on the existing exp/exp2, with different
reduction constants and polynomial. Worst-case error in round-to-
nearest is 0.513 ULP.

The exp/exp2 shared table is reused for exp10 - .rodata size of
e_exp_data increases by 64 bytes.

As for exp/exp2, targets with single-instruction rounding/conversion
intrinsics can use them by toggling TOINT_INTRINSICS=1 and adding the
necessary code to their math_private.h.

Improvements on Neoverse V1 compared to current GLIBC master:
exp10 thruput: 3.3x in [-0x1.439b746e36b52p+8 0x1.34413509f79ffp+8]
exp10 latency: 1.8x in [-0x1.439b746e36b52p+8 0x1.34413509f79ffp+8]

Tested on:
    aarch64-linux-gnu (TOINT_INTRINSICS, fma contraction) and
    x86_64-linux-gnu (!TOINT_INTRINSICS, no fma contraction)

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
This commit is contained in:
Joe Ramsay 2023-12-01 09:49:45 +00:00 committed by Szabolcs Nagy
parent 8e755f5bc8
commit 63d0a35d5f
3 changed files with 134 additions and 23 deletions

View File

@ -16,36 +16,132 @@
<https://www.gnu.org/licenses/>. */
#include <math.h>
#include <math-barriers.h>
#include <math-narrow-eval.h>
#include <math_private.h>
#include <float.h>
#include <libm-alias-finite.h>
#include "math_config.h"
static const double log10_high = 0x2.4d7637p0;
static const double log10_low = 0x7.6aaa2b05ba95cp-28;
#define N (1 << EXP_TABLE_BITS)
#define IndexMask (N - 1)
#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX). */
#define UFlowBound -0x1.5ep+8 /* -350. */
#define SmallTop 0x3c6 /* top12(0x1p-57). */
#define BigTop 0x407 /* top12(0x1p8). */
#define Thresh 0x41 /* BigTop - SmallTop. */
#define Shift __exp_data.shift
#define C(i) __exp_data.exp10_poly[i]
double
__ieee754_exp10 (double arg)
static double
special_case (uint64_t sbits, double_t tmp, uint64_t ki)
{
int32_t lx;
double arg_high, arg_low;
double exp_high, exp_low;
double_t scale, y;
if (!isfinite (arg))
return __ieee754_exp (arg);
if (arg < DBL_MIN_10_EXP - DBL_DIG - 10)
return DBL_MIN * DBL_MIN;
else if (arg > DBL_MAX_10_EXP + 1)
return DBL_MAX * DBL_MAX;
else if (fabs (arg) < 0x1p-56)
return 1.0;
if (ki - (1ull << 16) < 0x80000000)
{
/* The exponent of scale might have overflowed by 1. */
sbits -= 1ull << 52;
scale = asdouble (sbits);
y = 2 * (scale + scale * tmp);
return check_oflow (y);
}
GET_LOW_WORD (lx, arg);
lx &= 0xf8000000;
arg_high = arg;
SET_LOW_WORD (arg_high, lx);
arg_low = arg - arg_high;
exp_high = arg_high * log10_high;
exp_low = arg_high * log10_low + arg_low * M_LN10;
return __ieee754_exp (exp_high) * __ieee754_exp (exp_low);
/* n < 0, need special care in the subnormal range. */
sbits += 1022ull << 52;
scale = asdouble (sbits);
y = scale + scale * tmp;
if (y < 1.0)
{
/* Round y to the right precision before scaling it into the subnormal
range to avoid double rounding that can cause 0.5+E/2 ulp error where
E is the worst-case ulp error outside the subnormal range. So this
is only useful if the goal is better than 1 ulp worst-case error. */
double_t lo = scale - y + scale * tmp;
double_t hi = 1.0 + y;
lo = 1.0 - hi + y + lo;
y = math_narrow_eval (hi + lo) - 1.0;
/* Avoid -0.0 with downward rounding. */
if (WANT_ROUNDING && y == 0.0)
y = 0.0;
/* The underflow exception needs to be signaled explicitly. */
math_force_eval (math_opt_barrier (0x1p-1022) * 0x1p-1022);
}
y = 0x1p-1022 * y;
return check_uflow (y);
}
/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP. */
double
__ieee754_exp10 (double x)
{
uint64_t ix = asuint64 (x);
uint32_t abstop = (ix >> 52) & 0x7ff;
if (__glibc_unlikely (abstop - SmallTop >= Thresh))
{
if (abstop - SmallTop >= 0x80000000)
/* Avoid spurious underflow for tiny x.
Note: 0 is common input. */
return x + 1;
if (abstop == 0x7ff)
return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0;
if (x >= OFlowBound)
return __math_oflow (0);
if (x < UFlowBound)
return __math_uflow (0);
/* Large x is special-cased below. */
abstop = 0;
}
/* Reduce x: z = x * N / log10(2), k = round(z). */
double_t z = __exp_data.invlog10_2N * x;
double_t kd;
int64_t ki;
#if TOINT_INTRINSICS
kd = roundtoint (z);
ki = converttoint (z);
#else
kd = math_narrow_eval (z + Shift);
kd -= Shift;
ki = kd;
#endif
/* r = x - k * log10(2), r in [-0.5, 0.5]. */
double_t r = x;
r = __exp_data.neglog10_2hiN * kd + r;
r = __exp_data.neglog10_2loN * kd + r;
/* exp10(x) = 2^(k/N) * 2^(r/N).
Approximate the two components separately. */
/* s = 2^(k/N), using lookup table. */
uint64_t e = ki << (52 - EXP_TABLE_BITS);
uint64_t i = (ki & IndexMask) * 2;
uint64_t u = __exp_data.tab[i + 1];
uint64_t sbits = u + e;
double_t tail = asdouble (__exp_data.tab[i]);
/* 2^(r/N) ~= 1 + r * Poly(r). */
double_t r2 = r * r;
double_t p = C (0) + r * C (1);
double_t y = C (2) + r * C (3);
y = y + r2 * C (4);
y = p + r2 * y;
y = tail + y * r;
if (__glibc_unlikely (abstop == 0))
return special_case (sbits, y, ki);
/* Assemble components:
y = 2^(r/N) * 2^(k/N)
~= (y + 1) * s. */
double_t s = asdouble (sbits);
return s * y + s;
}
libm_alias_finite (__ieee754_exp10, __exp10)

View File

@ -58,6 +58,17 @@ const struct exp_data __exp_data = {
0x1.5d7e09b4e3a84p-10,
#endif
},
.invlog10_2N = 0x1.a934f0979a371p1 * N,
.neglog10_2hiN = -0x1.3441350ap-2 / N,
.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N,
.exp10_poly = {
/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ]. */
0x1.26bb1bbb55516p1,
0x1.53524c73ce9fep1,
0x1.0470591ce4b26p1,
0x1.2bd76577fe684p0,
0x1.1446eeccd0efbp-1
},
// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
// tab[2*k] = asuint64(T[k])
// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N

View File

@ -201,6 +201,10 @@ extern const struct exp_data
double poly[4]; /* Last four coefficients. */
double exp2_shift;
double exp2_poly[EXP2_POLY_ORDER];
double invlog10_2N;
double neglog10_2hiN;
double neglog10_2loN;
double exp10_poly[5];
uint64_t tab[2*(1 << EXP_TABLE_BITS)];
} __exp_data attribute_hidden;