/* Mirror of https://sourceware.org/git/glibc.git, synced 2024-12-01 17:30:07
   +00:00 at commit 0fed0b250f.  Commit note: plus a small amount of moving
   includes around in order to be able to remove duplicate definition of
   asuint64.  Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>.  */
/* Single-precision vector (SVE) pow function

   Copyright (C) 2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include "../ieee754/flt-32/math_config.h"
|
|
#include "sv_math.h"
|
|
|
|
/* The following data is used in the SVE pow core computation
|
|
and special case detection. */
|
|
#define Tinvc __v_powf_data.invc
|
|
#define Tlogc __v_powf_data.logc
|
|
#define Texp __v_powf_data.scale
|
|
#define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
|
|
#define Shift 0x1.8p52
|
|
#define Norm 0x1p23f /* 0x4b000000. */
|
|
|
|
/* Overall ULP error bound for pow is 2.6 ulp
|
|
~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */
|
|
static const struct data
|
|
{
|
|
double log_poly[4];
|
|
double exp_poly[3];
|
|
float uflow_bound, oflow_bound, small_bound;
|
|
uint32_t sign_bias, sign_mask, subnormal_bias, off;
|
|
} data = {
|
|
/* rel err: 1.5 * 2^-30. Each coefficients is multiplied the value of
|
|
V_POWF_EXP2_N. */
|
|
.log_poly = { -0x1.6ff5daa3b3d7cp+3, 0x1.ec81d03c01aebp+3,
|
|
-0x1.71547bb43f101p+4, 0x1.7154764a815cbp+5 },
|
|
/* rel err: 1.69 * 2^-34. */
|
|
.exp_poly = {
|
|
0x1.c6af84b912394p-20, /* A0 / V_POWF_EXP2_N^3. */
|
|
0x1.ebfce50fac4f3p-13, /* A1 / V_POWF_EXP2_N^2. */
|
|
0x1.62e42ff0c52d6p-6, /* A3 / V_POWF_EXP2_N. */
|
|
},
|
|
.uflow_bound = -0x1.2cp+12f, /* -150.0 * V_POWF_EXP2_N. */
|
|
.oflow_bound = 0x1p+12f, /* 128.0 * V_POWF_EXP2_N. */
|
|
.small_bound = 0x1p-126f,
|
|
.off = 0x3f35d000,
|
|
.sign_bias = SignBias,
|
|
.sign_mask = 0x80000000,
|
|
.subnormal_bias = 0x0b800000, /* 23 << 23. */
|
|
};
|
|
|
|
#define A(i) sv_f64 (d->log_poly[i])
|
|
#define C(i) sv_f64 (d->exp_poly[i])
|
|
|
|
/* Check if x is an integer. */
|
|
static inline svbool_t
|
|
svisint (svbool_t pg, svfloat32_t x)
|
|
{
|
|
return svcmpeq (pg, svrintz_z (pg, x), x);
|
|
}
|
|
|
|
/* Check if x is real not integer valued. */
|
|
static inline svbool_t
|
|
svisnotint (svbool_t pg, svfloat32_t x)
|
|
{
|
|
return svcmpne (pg, svrintz_z (pg, x), x);
|
|
}
|
|
|
|
/* Check if x is an odd integer. */
|
|
static inline svbool_t
|
|
svisodd (svbool_t pg, svfloat32_t x)
|
|
{
|
|
svfloat32_t y = svmul_x (pg, x, 0.5f);
|
|
return svisnotint (pg, y);
|
|
}
|
|
|
|
/* Check if zero, inf or nan. */
|
|
static inline svbool_t
|
|
sv_zeroinfnan (svbool_t pg, svuint32_t i)
|
|
{
|
|
return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
|
|
2u * 0x7f800000 - 1);
|
|
}
|
|
|
|
/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is
|
|
the bit representation of a non-zero finite floating-point value. */
|
|
static inline int
|
|
checkint (uint32_t iy)
|
|
{
|
|
int e = iy >> 23 & 0xff;
|
|
if (e < 0x7f)
|
|
return 0;
|
|
if (e > 0x7f + 23)
|
|
return 2;
|
|
if (iy & ((1 << (0x7f + 23 - e)) - 1))
|
|
return 0;
|
|
if (iy & (1 << (0x7f + 23 - e)))
|
|
return 1;
|
|
return 2;
|
|
}
|
|
|
|
/* Check if zero, inf or nan. */
|
|
static inline int
|
|
zeroinfnan (uint32_t ix)
|
|
{
|
|
return 2 * ix - 1 >= 2u * 0x7f800000 - 1;
|
|
}
|
|
|
|
/* A scalar subroutine used to fix main power special cases. Similar to the
|
|
preamble of scalar powf except that we do not update ix and sign_bias. This
|
|
is done in the preamble of the SVE powf. */
|
|
static inline float
|
|
powf_specialcase (float x, float y, float z)
|
|
{
|
|
uint32_t ix = asuint (x);
|
|
uint32_t iy = asuint (y);
|
|
/* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan). */
|
|
if (__glibc_unlikely (zeroinfnan (iy)))
|
|
{
|
|
if (2 * iy == 0)
|
|
return issignalingf_inline (x) ? x + y : 1.0f;
|
|
if (ix == 0x3f800000)
|
|
return issignalingf_inline (y) ? x + y : 1.0f;
|
|
if (2 * ix > 2u * 0x7f800000 || 2 * iy > 2u * 0x7f800000)
|
|
return x + y;
|
|
if (2 * ix == 2 * 0x3f800000)
|
|
return 1.0f;
|
|
if ((2 * ix < 2 * 0x3f800000) == !(iy & 0x80000000))
|
|
return 0.0f; /* |x|<1 && y==inf or |x|>1 && y==-inf. */
|
|
return y * y;
|
|
}
|
|
if (__glibc_unlikely (zeroinfnan (ix)))
|
|
{
|
|
float_t x2 = x * x;
|
|
if (ix & 0x80000000 && checkint (iy) == 1)
|
|
x2 = -x2;
|
|
return iy & 0x80000000 ? 1 / x2 : x2;
|
|
}
|
|
/* We need a return here in case x<0 and y is integer, but all other tests
|
|
need to be run. */
|
|
return z;
|
|
}
|
|
|
|
/* Scalar fallback for special case routines with custom signature. */
|
|
static inline svfloat32_t
|
|
sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
|
|
{
|
|
svbool_t p = svpfirst (cmp, svpfalse ());
|
|
while (svptest_any (cmp, p))
|
|
{
|
|
float sx1 = svclastb (p, 0, x1);
|
|
float sx2 = svclastb (p, 0, x2);
|
|
float elem = svclastb (p, 0, y);
|
|
elem = powf_specialcase (sx1, sx2, elem);
|
|
svfloat32_t y2 = sv_f32 (elem);
|
|
y = svsel (p, y2, y);
|
|
p = svpnext_b32 (cmp, p);
|
|
}
|
|
return y;
|
|
}
|
|
|
|
/* Compute core for half of the lanes in double precision. */
|
|
static inline svfloat64_t
|
|
sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
|
|
svfloat64_t y, svuint64_t sign_bias, svfloat64_t *pylogx,
|
|
const struct data *d)
|
|
{
|
|
svfloat64_t invc = svld1_gather_index (pg, Tinvc, i);
|
|
svfloat64_t logc = svld1_gather_index (pg, Tlogc, i);
|
|
|
|
/* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */
|
|
svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), z, invc);
|
|
svfloat64_t y0 = svadd_x (pg, logc, svcvt_f64_x (pg, k));
|
|
|
|
/* Polynomial to approximate log1p(r)/ln2. */
|
|
svfloat64_t logx = A (0);
|
|
logx = svmla_x (pg, A (1), r, logx);
|
|
logx = svmla_x (pg, A (2), r, logx);
|
|
logx = svmla_x (pg, A (3), r, logx);
|
|
logx = svmla_x (pg, y0, r, logx);
|
|
*pylogx = svmul_x (pg, y, logx);
|
|
|
|
/* z - kd is in [-1, 1] in non-nearest rounding modes. */
|
|
svfloat64_t kd = svadd_x (pg, *pylogx, Shift);
|
|
svuint64_t ki = svreinterpret_u64 (kd);
|
|
kd = svsub_x (pg, kd, Shift);
|
|
|
|
r = svsub_x (pg, *pylogx, kd);
|
|
|
|
/* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
|
|
svuint64_t t
|
|
= svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1));
|
|
svuint64_t ski = svadd_x (pg, ki, sign_bias);
|
|
t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS));
|
|
svfloat64_t s = svreinterpret_f64 (t);
|
|
|
|
svfloat64_t p = C (0);
|
|
p = svmla_x (pg, C (1), p, r);
|
|
p = svmla_x (pg, C (2), p, r);
|
|
p = svmla_x (pg, s, p, svmul_x (pg, s, r));
|
|
|
|
return p;
|
|
}
|
|
|
|
/* Widen vector to double precision and compute core on both halves of the
|
|
vector. Lower cost of promotion by considering all lanes active. */
|
|
static inline svfloat32_t
|
|
sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
|
|
svfloat32_t y, svuint32_t sign_bias, svfloat32_t *pylogx,
|
|
const struct data *d)
|
|
{
|
|
const svbool_t ptrue = svptrue_b64 ();
|
|
|
|
/* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in
|
|
order to perform core computation in double precision. */
|
|
const svbool_t pg_lo = svunpklo (pg);
|
|
const svbool_t pg_hi = svunpkhi (pg);
|
|
svfloat64_t y_lo = svcvt_f64_x (
|
|
ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
|
|
svfloat64_t y_hi = svcvt_f64_x (
|
|
ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
|
|
svfloat32_t z = svreinterpret_f32 (iz);
|
|
svfloat64_t z_lo = svcvt_f64_x (
|
|
ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z))));
|
|
svfloat64_t z_hi = svcvt_f64_x (
|
|
ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z))));
|
|
svuint64_t i_lo = svunpklo (i);
|
|
svuint64_t i_hi = svunpkhi (i);
|
|
svint64_t k_lo = svunpklo (k);
|
|
svint64_t k_hi = svunpkhi (k);
|
|
svuint64_t sign_bias_lo = svunpklo (sign_bias);
|
|
svuint64_t sign_bias_hi = svunpkhi (sign_bias);
|
|
|
|
/* Compute each part in double precision. */
|
|
svfloat64_t ylogx_lo, ylogx_hi;
|
|
svfloat64_t lo = sv_powf_core_ext (pg_lo, i_lo, z_lo, k_lo, y_lo,
|
|
sign_bias_lo, &ylogx_lo, d);
|
|
svfloat64_t hi = sv_powf_core_ext (pg_hi, i_hi, z_hi, k_hi, y_hi,
|
|
sign_bias_hi, &ylogx_hi, d);
|
|
|
|
/* Convert back to single-precision and interleave. */
|
|
svfloat32_t ylogx_lo_32 = svcvt_f32_x (ptrue, ylogx_lo);
|
|
svfloat32_t ylogx_hi_32 = svcvt_f32_x (ptrue, ylogx_hi);
|
|
*pylogx = svuzp1 (ylogx_lo_32, ylogx_hi_32);
|
|
svfloat32_t lo_32 = svcvt_f32_x (ptrue, lo);
|
|
svfloat32_t hi_32 = svcvt_f32_x (ptrue, hi);
|
|
return svuzp1 (lo_32, hi_32);
|
|
}
|
|
|
|
/* Implementation of SVE powf.
|
|
Provides the same accuracy as AdvSIMD powf, since it relies on the same
|
|
algorithm. The theoretical maximum error is under 2.60 ULPs.
|
|
Maximum measured error is 2.56 ULPs:
|
|
SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127
|
|
want 0x1.fd4b06p+127. */
|
|
svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
|
|
{
|
|
const struct data *d = ptr_barrier (&data);
|
|
|
|
svuint32_t vix0 = svreinterpret_u32 (x);
|
|
svuint32_t viy0 = svreinterpret_u32 (y);
|
|
|
|
/* Negative x cases. */
|
|
svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask);
|
|
svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask);
|
|
|
|
/* Set sign_bias and ix depending on sign of x and nature of y. */
|
|
svbool_t yisnotint_xisneg = svpfalse_b ();
|
|
svuint32_t sign_bias = sv_u32 (0);
|
|
svuint32_t vix = vix0;
|
|
if (__glibc_unlikely (svptest_any (pg, xisneg)))
|
|
{
|
|
/* Determine nature of y. */
|
|
yisnotint_xisneg = svisnotint (xisneg, y);
|
|
svbool_t yisint_xisneg = svisint (xisneg, y);
|
|
svbool_t yisodd_xisneg = svisodd (xisneg, y);
|
|
/* ix set to abs(ix) if y is integer. */
|
|
vix = svand_m (yisint_xisneg, vix0, 0x7fffffff);
|
|
/* Set to SignBias if x is negative and y is odd. */
|
|
sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
|
|
}
|
|
|
|
/* Special cases of x or y: zero, inf and nan. */
|
|
svbool_t xspecial = sv_zeroinfnan (pg, vix0);
|
|
svbool_t yspecial = sv_zeroinfnan (pg, viy0);
|
|
svbool_t cmp = svorr_z (pg, xspecial, yspecial);
|
|
|
|
/* Small cases of x: |x| < 0x1p-126. */
|
|
svbool_t xsmall = svaclt (pg, x, d->small_bound);
|
|
if (__glibc_unlikely (svptest_any (pg, xsmall)))
|
|
{
|
|
/* Normalize subnormal x so exponent becomes negative. */
|
|
svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
|
|
vix_norm = svand_x (xsmall, vix_norm, 0x7fffffff);
|
|
vix_norm = svsub_x (xsmall, vix_norm, d->subnormal_bias);
|
|
vix = svsel (xsmall, vix_norm, vix);
|
|
}
|
|
/* Part of core computation carried in working precision. */
|
|
svuint32_t tmp = svsub_x (pg, vix, d->off);
|
|
svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
|
|
V_POWF_LOG2_N - 1);
|
|
svuint32_t top = svand_x (pg, tmp, 0xff800000);
|
|
svuint32_t iz = svsub_x (pg, vix, top);
|
|
svint32_t k
|
|
= svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));
|
|
|
|
/* Compute core in extended precision and return intermediate ylogx results to
|
|
handle cases of underflow and underflow in exp. */
|
|
svfloat32_t ylogx;
|
|
svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);
|
|
|
|
/* Handle exp special cases of underflow and overflow. */
|
|
svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
|
|
svfloat32_t ret_oflow
|
|
= svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
|
|
svfloat32_t ret_uflow = svreinterpret_f32 (sign);
|
|
ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
|
|
ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);
|
|
|
|
/* Cases of finite y and finite negative x. */
|
|
ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);
|
|
|
|
if (__glibc_unlikely (svptest_any (pg, cmp)))
|
|
return sv_call_powf_sc (x, y, ret, cmp);
|
|
|
|
return ret;
|
|
}
|