AArch64: Improve codegen in users of ADVSIMD expm1f helper

Rearrange operations so MOV is not necessary in reduction or around
the special-case handler.  Reduce memory access by using more indexed
MLAs in polynomial.

Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
This commit is contained in:
Joe Ramsay 2024-09-23 15:32:53 +01:00 committed by Wilco Dijkstra
parent 5bc100bd4b
commit 7900ac490d
4 changed files with 58 additions and 91 deletions

View File

@ -18,27 +18,18 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
#include "poly_advsimd_f32.h"
#include "v_expm1f_inline.h"
static const struct data
{
float32x4_t poly[5];
float invln2_and_ln2[4];
float32x4_t shift;
int32x4_t exponent_bias;
struct v_expm1f_data d;
#if WANT_SIMD_EXCEPT
uint32x4_t thresh;
#else
float32x4_t oflow_bound;
#endif
} data = {
/* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
.poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
/* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
.invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
.shift = V4 (0x1.8p23f),
.exponent_bias = V4 (0x3f800000),
.d = V_EXPM1F_DATA,
#if !WANT_SIMD_EXCEPT
/* Value above which expm1f(x) should overflow. Absolute value of the
underflow bound is greater than this, so it catches both cases - there is
@ -55,67 +46,38 @@ static const struct data
#define TinyBound v_u32 (0x34000000 << 1)
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
special_case (float32x4_t x, uint32x4_t special, const struct data *d)
{
return v_call_f32 (expm1f, x, y, special);
return v_call_f32 (
expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
}
/* Single-precision vector exp(x) - 1 function.
The maximum error is 1.51 ULP:
_ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
want 0x1.e2fb94p-2. */
The maximum error is 1.62 ULP:
_ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
want 0x1.da9f44p-2. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
#if WANT_SIMD_EXCEPT
uint32x4_t ix = vreinterpretq_u32_f32 (x);
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
shift-left by 1, and compare with thresh which was left-shifted offline -
this is effectively an absolute compare. */
uint32x4_t special
= vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
if (__glibc_unlikely (v_any_u32 (special)))
x = v_zerofy_f32 (x, special);
#else
/* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
#endif
/* Reduce argument to smaller range:
Let i = round(x / ln2)
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
where 2^i is exact because i is an integer. */
float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
float32x4_t j
= vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
int32x4_t i = vcvtq_s32_f32 (j);
float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
/* Approximate expm1(f) using polynomial.
Taylor expansion for expm1(x) has the form:
x + ax^2 + bx^3 + cx^4 ....
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
float32x4_t p = v_horner_4_f32 (f, d->poly);
p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
/* Assemble the result.
expm1(x) ~= 2^i * (p + 1) - 1
Let t = 2^i. */
int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
float32x4_t t = vreinterpretq_f32_s32 (u);
if (__glibc_unlikely (v_any_u32 (special)))
return special_case (vreinterpretq_f32_u32 (ix),
vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t),
special);
return special_case (x, special, d);
/* expm1(x) ~= p * t + (t - 1). */
return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
return expm1f_inline (x, &d->d);
}
libmvec_hidden_def (V_NAME_F1 (expm1))
HALF_WIDTH_ALIAS_F1 (expm1)

View File

@ -23,15 +23,13 @@
static const struct data
{
struct v_expm1f_data expm1f_consts;
uint32x4_t halff;
#if WANT_SIMD_EXCEPT
uint32x4_t tiny_bound, thresh;
#else
uint32x4_t oflow_bound;
float32x4_t oflow_bound;
#endif
} data = {
.expm1f_consts = V_EXPM1F_DATA,
.halff = V4 (0x3f000000),
#if WANT_SIMD_EXCEPT
/* 0x1.6a09e8p-32, below which expm1f underflows. */
.tiny_bound = V4 (0x2fb504f4),
@ -39,14 +37,15 @@ static const struct data
.thresh = V4 (0x12fbbbb3),
#else
/* 0x1.61814ep+6, above which expm1f helper overflows. */
.oflow_bound = V4 (0x42b0c0a7),
.oflow_bound = V4 (0x1.61814ep+6),
#endif
};
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
uint32x4_t special)
{
return v_call_f32 (sinhf, x, y, special);
return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
}
/* Approximation for vector single-precision sinh(x) using expm1.
@ -60,15 +59,15 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
uint32x4_t ix = vreinterpretq_u32_f32 (x);
float32x4_t ax = vabsq_f32 (x);
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t sign = veorq_u32 (ix, iax);
float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
float32x4_t halfsign = vreinterpretq_f32_u32 (
vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
#if WANT_SIMD_EXCEPT
uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
uint32x4_t special = vcgeq_u32 (
vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
ax = v_zerofy_f32 (ax, special);
#else
uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
#endif
/* Up to the point that expm1f overflows, we can use it to calculate sinhf
@ -80,7 +79,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
/* Fall back to the scalar variant for any lanes that should trigger an
exception. */
if (__glibc_unlikely (v_any_u32 (special)))
return special_case (x, vmulq_f32 (t, halfsign), special);
return special_case (x, t, halfsign, special);
return vmulq_f32 (t, halfsign);
}

View File

@ -28,13 +28,16 @@ static const struct data
/* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
.boring_bound = V4 (0x41102cb3),
.large_bound = V4 (0x7f800000),
.onef = V4 (0x3f800000),
};
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
float32x4_t q, uint32x4_t special)
{
return v_call_f32 (tanhf, x, y, special);
return v_call_f32 (
tanhf, x,
vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
special);
}
/* Approximation for single-precision vector tanh(x), using a simplified
@ -50,7 +53,9 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t sign = veorq_u32 (ix, iax);
uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
/* expm1 exponent bias is 1.0f reinterpreted to int. */
float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
#if WANT_SIMD_EXCEPT
/* If fp exceptions are to be triggered properly, set all special and boring
@ -66,10 +71,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
if (__glibc_unlikely (v_any_u32 (special)))
return special_case (vreinterpretq_f32_u32 (ix),
vbslq_f32 (is_boring, boring, y), special);
return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
special);
float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
return vbslq_f32 (is_boring, boring, y);
}
libmvec_hidden_def (V_NAME_F1 (tanh))

View File

@ -21,48 +21,47 @@
#define AARCH64_FPU_V_EXPM1F_INLINE_H
#include "v_math.h"
#include "poly_advsimd_f32.h"
#include "math_config.h"
struct v_expm1f_data
{
float32x4_t poly[5];
float invln2_and_ln2[4];
float32x4_t shift;
float32x4_t c0, c2;
int32x4_t exponent_bias;
float c1, c3, inv_ln2, c4;
float ln2_hi, ln2_lo;
};
/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
log(2)/2]. Exponent bias is asuint(1.0f).
invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0. */
log(2)/2]. Exponent bias is asuint(1.0f). */
#define V_EXPM1F_DATA \
{ \
.poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \
V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \
.shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \
.invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
.c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \
.c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
.exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \
.ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
}
static inline float32x4_t
expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
{
/* Helper routine for calculating exp(x) - 1.
Copied from v_expm1f_1u6.c, with all special-case handling removed - the
calling routine should handle special values if required. */
/* Helper routine for calculating exp(x) - 1. */
float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
float32x4_t lane_consts = vld1q_f32 (&d->c1);
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
float32x4_t j
= vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
int32x4_t i = vcvtq_s32_f32 (j);
float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
f = vfmsq_lane_f32 (f, j, ln2, 1);
/* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
Horner. */
/* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */
float32x4_t f2 = vmulq_f32 (f, f);
float32x4_t f4 = vmulq_f32 (f2, f2);
float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly);
float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
float32x4_t p = vfmaq_f32 (p01, f2, p23);
p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
p = vfmaq_f32 (f, f2, p);
/* t = 2^i. */