glibc/sysdeps/aarch64/fpu/v_log1pf_inline.h
Joe Ramsay 5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper
log1pf is quite register-intensive - use fewer registers for the
polynomial, and make various changes to shorten dependency chains in
parent routines.  There is now no spilling with GCC 14.  Accuracy moves
around a little - comments adjusted accordingly but does not require
regen-ulps.

Use the helper in log1pf as well, instead of having separate
implementations.  The more accurate polynomial means special-casing can
be simplified, and the shorter dependency chain avoids the usual dance
around v0, which is otherwise difficult.

There is a small duplication of vectors containing 1.0f (or 0x3f800000) -
GCC is not currently able to efficiently handle values which fit in FMOV
but not MOVI, and are reinterpreted to integer.  There may be potential
for more optimisation if this is fixed.

Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
2024-09-23 15:44:07 +01:00

106 lines
4.1 KiB
C

/* Helper for single-precision Advanced SIMD routines which depend on log1p
Copyright (C) 2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef AARCH64_FPU_V_LOG1PF_INLINE_H
#define AARCH64_FPU_V_LOG1PF_INLINE_H
#include "v_math.h"
#include "poly_advsimd_f32.h"
struct v_log1pf_data
{
uint32x4_t four;
int32x4_t three_quarters;
float c0, c3, c5, c7;
float32x4_t c4, c6, c1, c2, ln2;
};
/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
(1, -0.5) are not stored as they can be generated more efficiently. */
#define V_LOG1PF_CONSTANTS_TABLE \
{ \
.c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \
.c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \
.c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \
.c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \
.ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
.three_quarters = V4 (0x3f400000) \
}
static inline float32x4_t
eval_poly (float32x4_t m, const struct v_log1pf_data *d)
{
/* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
float32x4_t c0357 = vld1q_f32 (&d->c0);
float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0);
float32x4_t m2 = vmulq_f32 (m, m);
float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3);
float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2);
float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1);
float32x4_t p = vfmaq_f32 (p45, m2, p67);
p = vfmaq_f32 (p23, m2, p);
p = vfmaq_f32 (d->c1, m, p);
p = vmulq_f32 (m2, p);
p = vfmaq_f32 (m, m2, p);
return vfmaq_f32 (p, m2, q);
}
static inline float32x4_t
log1pf_inline (float32x4_t x, const struct v_log1pf_data *d)
{
/* Helper for calculating log(x + 1). */
/* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
is in [-0.25, 0.5]):
log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
We approximate log1p(m) with a polynomial, then scale by
k*log(2). Instead of doing this directly, we use an intermediate
scale factor s = 4*k*log(2) to ensure the scale is representable
as a normalised fp32 number. */
float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
/* Choose k to scale x to the range [-1/4, 1/2]. */
int32x4_t k
= vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
v_s32 (0xff800000));
uint32x4_t ku = vreinterpretq_u32_s32 (k);
/* Scale up to ensure that the scale factor is representable as normalised
fp32 number, and scale m down accordingly. */
float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
/* Scale x by exponent manipulation. */
float32x4_t m_scale
= vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
/* Evaluate polynomial on the reduced interval. */
float32x4_t p = eval_poly (m_scale, d);
/* The scale factor to be applied back at the end - by multiplying float(k)
by 2^-23 we get the unbiased exponent of k. */
float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
/* Apply the scaling back. */
return vfmaq_f32 (p, scale_back, d->ln2);
}
#endif