mirror of
https://sourceware.org/git/glibc.git
synced 2024-11-21 12:30:06 +00:00
AArch64: Improve codegen in users of ADVSIMD expm1f helper
Rearrange operations so MOV is not necessary in reduction or around the special-case handler. Reduce memory access by using more indexed MLAs in polynomial. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
This commit is contained in:
parent
5bc100bd4b
commit
7900ac490d
@ -18,27 +18,18 @@
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "v_math.h"
|
||||
#include "poly_advsimd_f32.h"
|
||||
#include "v_expm1f_inline.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[5];
|
||||
float invln2_and_ln2[4];
|
||||
float32x4_t shift;
|
||||
int32x4_t exponent_bias;
|
||||
struct v_expm1f_data d;
|
||||
#if WANT_SIMD_EXCEPT
|
||||
uint32x4_t thresh;
|
||||
#else
|
||||
float32x4_t oflow_bound;
|
||||
#endif
|
||||
} data = {
|
||||
/* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
|
||||
.poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
|
||||
V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
|
||||
/* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
|
||||
.invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
|
||||
.shift = V4 (0x1.8p23f),
|
||||
.exponent_bias = V4 (0x3f800000),
|
||||
.d = V_EXPM1F_DATA,
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
/* Value above which expm1f(x) should overflow. Absolute value of the
|
||||
underflow bound is greater than this, so it catches both cases - there is
|
||||
@ -55,67 +46,38 @@ static const struct data
|
||||
#define TinyBound v_u32 (0x34000000 << 1)
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
||||
special_case (float32x4_t x, uint32x4_t special, const struct data *d)
|
||||
{
|
||||
return v_call_f32 (expm1f, x, y, special);
|
||||
return v_call_f32 (
|
||||
expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
|
||||
}
|
||||
|
||||
/* Single-precision vector exp(x) - 1 function.
|
||||
The maximum error is 1.51 ULP:
|
||||
_ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
|
||||
want 0x1.e2fb94p-2. */
|
||||
The maximum error is 1.62 ULP:
|
||||
_ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
|
||||
want 0x1.da9f44p-2. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
uint32x4_t ix = vreinterpretq_u32_f32 (x);
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
uint32x4_t ix = vreinterpretq_u32_f32 (x);
|
||||
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|
||||
|x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
|
||||
shift-left by 1, and compare with thresh which was left-shifted offline -
|
||||
this is effectively an absolute compare. */
|
||||
uint32x4_t special
|
||||
= vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
|
||||
if (__glibc_unlikely (v_any_u32 (special)))
|
||||
x = v_zerofy_f32 (x, special);
|
||||
#else
|
||||
/* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
|
||||
uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
|
||||
#endif
|
||||
|
||||
/* Reduce argument to smaller range:
|
||||
Let i = round(x / ln2)
|
||||
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
|
||||
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
|
||||
where 2^i is exact because i is an integer. */
|
||||
float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
|
||||
float32x4_t j
|
||||
= vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
|
||||
int32x4_t i = vcvtq_s32_f32 (j);
|
||||
float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
|
||||
f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
|
||||
|
||||
/* Approximate expm1(f) using polynomial.
|
||||
Taylor expansion for expm1(x) has the form:
|
||||
x + ax^2 + bx^3 + cx^4 ....
|
||||
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
|
||||
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
|
||||
float32x4_t p = v_horner_4_f32 (f, d->poly);
|
||||
p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
|
||||
|
||||
/* Assemble the result.
|
||||
expm1(x) ~= 2^i * (p + 1) - 1
|
||||
Let t = 2^i. */
|
||||
int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
|
||||
float32x4_t t = vreinterpretq_f32_s32 (u);
|
||||
|
||||
if (__glibc_unlikely (v_any_u32 (special)))
|
||||
return special_case (vreinterpretq_f32_u32 (ix),
|
||||
vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t),
|
||||
special);
|
||||
return special_case (x, special, d);
|
||||
|
||||
/* expm1(x) ~= p * t + (t - 1). */
|
||||
return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
|
||||
return expm1f_inline (x, &d->d);
|
||||
}
|
||||
libmvec_hidden_def (V_NAME_F1 (expm1))
|
||||
HALF_WIDTH_ALIAS_F1 (expm1)
|
||||
|
@ -23,15 +23,13 @@
|
||||
static const struct data
|
||||
{
|
||||
struct v_expm1f_data expm1f_consts;
|
||||
uint32x4_t halff;
|
||||
#if WANT_SIMD_EXCEPT
|
||||
uint32x4_t tiny_bound, thresh;
|
||||
#else
|
||||
uint32x4_t oflow_bound;
|
||||
float32x4_t oflow_bound;
|
||||
#endif
|
||||
} data = {
|
||||
.expm1f_consts = V_EXPM1F_DATA,
|
||||
.halff = V4 (0x3f000000),
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* 0x1.6a09e8p-32, below which expm1f underflows. */
|
||||
.tiny_bound = V4 (0x2fb504f4),
|
||||
@ -39,14 +37,15 @@ static const struct data
|
||||
.thresh = V4 (0x12fbbbb3),
|
||||
#else
|
||||
/* 0x1.61814ep+6, above which expm1f helper overflows. */
|
||||
.oflow_bound = V4 (0x42b0c0a7),
|
||||
.oflow_bound = V4 (0x1.61814ep+6),
|
||||
#endif
|
||||
};
|
||||
|
||||
static float32x4_t NOINLINE VPCS_ATTR
|
||||
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
||||
special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
|
||||
uint32x4_t special)
|
||||
{
|
||||
return v_call_f32 (sinhf, x, y, special);
|
||||
return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
|
||||
}
|
||||
|
||||
/* Approximation for vector single-precision sinh(x) using expm1.
|
||||
@ -60,15 +59,15 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
|
||||
|
||||
uint32x4_t ix = vreinterpretq_u32_f32 (x);
|
||||
float32x4_t ax = vabsq_f32 (x);
|
||||
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
||||
uint32x4_t sign = veorq_u32 (ix, iax);
|
||||
float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
|
||||
float32x4_t halfsign = vreinterpretq_f32_u32 (
|
||||
vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
|
||||
uint32x4_t special = vcgeq_u32 (
|
||||
vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
|
||||
ax = v_zerofy_f32 (ax, special);
|
||||
#else
|
||||
uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
|
||||
uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
|
||||
#endif
|
||||
|
||||
/* Up to the point that expm1f overflows, we can use it to calculate sinhf
|
||||
@ -80,7 +79,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
|
||||
/* Fall back to the scalar variant for any lanes that should trigger an
|
||||
exception. */
|
||||
if (__glibc_unlikely (v_any_u32 (special)))
|
||||
return special_case (x, vmulq_f32 (t, halfsign), special);
|
||||
return special_case (x, t, halfsign, special);
|
||||
|
||||
return vmulq_f32 (t, halfsign);
|
||||
}
|
||||
|
@ -28,13 +28,16 @@ static const struct data
|
||||
/* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
|
||||
.boring_bound = V4 (0x41102cb3),
|
||||
.large_bound = V4 (0x7f800000),
|
||||
.onef = V4 (0x3f800000),
|
||||
};
|
||||
|
||||
static float32x4_t NOINLINE VPCS_ATTR
|
||||
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
||||
special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
|
||||
float32x4_t q, uint32x4_t special)
|
||||
{
|
||||
return v_call_f32 (tanhf, x, y, special);
|
||||
return v_call_f32 (
|
||||
tanhf, x,
|
||||
vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
|
||||
special);
|
||||
}
|
||||
|
||||
/* Approximation for single-precision vector tanh(x), using a simplified
|
||||
@ -50,7 +53,9 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
|
||||
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
||||
uint32x4_t sign = veorq_u32 (ix, iax);
|
||||
uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
|
||||
float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
|
||||
/* expm1 exponent bias is 1.0f reinterpreted to int. */
|
||||
float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
|
||||
sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* If fp exceptions are to be triggered properly, set all special and boring
|
||||
@ -66,10 +71,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
|
||||
|
||||
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
|
||||
float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
|
||||
float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
|
||||
|
||||
if (__glibc_unlikely (v_any_u32 (special)))
|
||||
return special_case (vreinterpretq_f32_u32 (ix),
|
||||
vbslq_f32 (is_boring, boring, y), special);
|
||||
return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
|
||||
special);
|
||||
|
||||
float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
|
||||
return vbslq_f32 (is_boring, boring, y);
|
||||
}
|
||||
libmvec_hidden_def (V_NAME_F1 (tanh))
|
||||
|
@ -21,48 +21,47 @@
|
||||
#define AARCH64_FPU_V_EXPM1F_INLINE_H
|
||||
|
||||
#include "v_math.h"
|
||||
#include "poly_advsimd_f32.h"
|
||||
#include "math_config.h"
|
||||
|
||||
struct v_expm1f_data
|
||||
{
|
||||
float32x4_t poly[5];
|
||||
float invln2_and_ln2[4];
|
||||
float32x4_t shift;
|
||||
float32x4_t c0, c2;
|
||||
int32x4_t exponent_bias;
|
||||
float c1, c3, inv_ln2, c4;
|
||||
float ln2_hi, ln2_lo;
|
||||
};
|
||||
|
||||
/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
|
||||
log(2)/2]. Exponent bias is asuint(1.0f).
|
||||
invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0. */
|
||||
log(2)/2]. Exponent bias is asuint(1.0f). */
|
||||
#define V_EXPM1F_DATA \
|
||||
{ \
|
||||
.poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \
|
||||
V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \
|
||||
.shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \
|
||||
.invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
|
||||
.c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \
|
||||
.c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
|
||||
.exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \
|
||||
.ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
|
||||
}
|
||||
|
||||
static inline float32x4_t
|
||||
expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
|
||||
{
|
||||
/* Helper routine for calculating exp(x) - 1.
|
||||
Copied from v_expm1f_1u6.c, with all special-case handling removed - the
|
||||
calling routine should handle special values if required. */
|
||||
/* Helper routine for calculating exp(x) - 1. */
|
||||
|
||||
float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
|
||||
float32x4_t lane_consts = vld1q_f32 (&d->c1);
|
||||
|
||||
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
|
||||
float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
|
||||
float32x4_t j
|
||||
= vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
|
||||
float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
|
||||
int32x4_t i = vcvtq_s32_f32 (j);
|
||||
float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
|
||||
f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
|
||||
float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
|
||||
f = vfmsq_lane_f32 (f, j, ln2, 1);
|
||||
|
||||
/* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
|
||||
Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
|
||||
Horner. */
|
||||
/* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */
|
||||
float32x4_t f2 = vmulq_f32 (f, f);
|
||||
float32x4_t f4 = vmulq_f32 (f2, f2);
|
||||
float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly);
|
||||
float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
|
||||
float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
|
||||
float32x4_t p = vfmaq_f32 (p01, f2, p23);
|
||||
p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
|
||||
p = vfmaq_f32 (f, f2, p);
|
||||
|
||||
/* t = 2^i. */
|
||||
|
Loading…
Reference in New Issue
Block a user