glibc/sysdeps/aarch64/fpu/v_log1pf_inline.h

/* Helper for single-precision Advanced SIMD routines which depend on log1p

   Copyright (C) 2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#ifndef AARCH64_FPU_V_LOG1PF_INLINE_H
#define AARCH64_FPU_V_LOG1PF_INLINE_H

#include "v_math.h"
#include "poly_advsimd_f32.h"

struct v_log1pf_data
{
  uint32x4_t four;
  int32x4_t three_quarters;
  float c0, c3, c5, c7;
  float32x4_t c4, c6, c1, c2, ln2;
};

/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
   (1, -0.5) are not stored as they can be generated more efficiently.  */
#define V_LOG1PF_CONSTANTS_TABLE                                              \
  {                                                                           \
    .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f),                         \
    .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f,                         \
    .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f,                          \
    .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f,                          \
    .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000),                       \
    .three_quarters = V4 (0x3f400000)                                         \
  }

static inline float32x4_t
eval_poly (float32x4_t m, const struct v_log1pf_data *d)
{
  /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner.  */
  float32x4_t c0357 = vld1q_f32 (&d->c0);
  float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0);
  float32x4_t m2 = vmulq_f32 (m, m);
  float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3);
  float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2);
  float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1);
  float32x4_t p = vfmaq_f32 (p45, m2, p67);
  p = vfmaq_f32 (p23, m2, p);
  p = vfmaq_f32 (d->c1, m, p);
  p = vmulq_f32 (m2, p);
  p = vfmaq_f32 (m, m2, p);
  return vfmaq_f32 (p, m2, q);
}

static inline float32x4_t
log1pf_inline (float32x4_t x, const struct v_log1pf_data *d)
{
  /* Helper for calculating log(x + 1).  */

  /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
			   is in [-0.25, 0.5]):
     log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).

     We approximate log1p(m) with a polynomial, then scale by
     k*log(2). Instead of doing this directly, we use an intermediate
     scale factor s = 4*k*log(2) to ensure the scale is representable
     as a normalised fp32 number.  */
  float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));

  /* Choose k to scale x to the range [-1/4, 1/2].  */
  int32x4_t k
      = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
		   v_s32 (0xff800000));
  uint32x4_t ku = vreinterpretq_u32_s32 (k);

  /* Scale up to ensure that the scale factor is representable as normalised
     fp32 number, and scale m down accordingly.  */
  float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));

  /* Scale x by exponent manipulation.  */
  float32x4_t m_scale
      = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
  m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));

  /* Evaluate polynomial on the reduced interval.  */
  float32x4_t p = eval_poly (m_scale, d);

  /* The scale factor to be applied back at the end - by multiplying float(k)
     by 2^-23 we get the unbiased exponent of k.  */
  float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));

  /* Apply the scaling back.  */
  return vfmaq_f32 (p, scale_back, d->ln2);
}

#endif
aarch64/fpu: Add vector variants of acosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:40 +00:00			`/* Helper for single-precision Advanced SIMD routines which depend on log1p`

			`Copyright (C) 2024 Free Software Foundation, Inc.`
			`This file is part of the GNU C Library.`

			`The GNU C Library is free software; you can redistribute it and/or`
			`modify it under the terms of the GNU Lesser General Public`
			`License as published by the Free Software Foundation; either`
			`version 2.1 of the License, or (at your option) any later version.`

			`The GNU C Library is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`Lesser General Public License for more details.`

			`You should have received a copy of the GNU Lesser General Public`
			`License along with the GNU C Library; if not, see`
			`<https://www.gnu.org/licenses/>. */`

			`#ifndef AARCH64_FPU_V_LOG1PF_INLINE_H`
			`#define AARCH64_FPU_V_LOG1PF_INLINE_H`

			`#include "v_math.h"`
			`#include "poly_advsimd_f32.h"`

			`struct v_log1pf_data`
			`{`
			`uint32x4_t four;`
			`int32x4_t three_quarters;`
AArch64: Improve codegen in users of AdvSIMD log1pf helper log1pf is quite register-intensive - use fewer registers for the polynomial, and make various changes to shorten dependency chains in parent routines. There is now no spilling with GCC 14. Accuracy moves around a little - comments adjusted accordingly but does not require regen-ulps. Use the helper in log1pf as well, instead of having separate implementations. The more accurate polynomial means special-casing can be simplified, and the shorter dependency chain avoids the usual dance around v0, which is otherwise difficult. There is a small duplication of vectors containing 1.0f (or 0x3f800000) - GCC is not currently able to efficiently handle values which fit in FMOV but not MOVI, and are reinterpreted to integer. There may be potential for more optimisation if this is fixed. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:32:14 +00:00			`float c0, c3, c5, c7;`
			`float32x4_t c4, c6, c1, c2, ln2;`
aarch64/fpu: Add vector variants of acosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:40 +00:00			`};`

			`/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients`
			`(1, -0.5) are not stored as they can be generated more efficiently. */`
			`#define V_LOG1PF_CONSTANTS_TABLE \`
			`{ \`
AArch64: Improve codegen in users of AdvSIMD log1pf helper log1pf is quite register-intensive - use fewer registers for the polynomial, and make various changes to shorten dependency chains in parent routines. There is now no spilling with GCC 14. Accuracy moves around a little - comments adjusted accordingly but does not require regen-ulps. Use the helper in log1pf as well, instead of having separate implementations. The more accurate polynomial means special-casing can be simplified, and the shorter dependency chain avoids the usual dance around v0, which is otherwise difficult. There is a small duplication of vectors containing 1.0f (or 0x3f800000) - GCC is not currently able to efficiently handle values which fit in FMOV but not MOVI, and are reinterpreted to integer. There may be potential for more optimisation if this is fixed. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:32:14 +00:00			`.c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \`
			`.c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \`
			`.c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \`
			`.c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \`
			`.ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \`
			`.three_quarters = V4 (0x3f400000) \`
aarch64/fpu: Add vector variants of acosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:40 +00:00			`}`

			`static inline float32x4_t`
AArch64: Improve codegen in users of AdvSIMD log1pf helper log1pf is quite register-intensive - use fewer registers for the polynomial, and make various changes to shorten dependency chains in parent routines. There is now no spilling with GCC 14. Accuracy moves around a little - comments adjusted accordingly but does not require regen-ulps. Use the helper in log1pf as well, instead of having separate implementations. The more accurate polynomial means special-casing can be simplified, and the shorter dependency chain avoids the usual dance around v0, which is otherwise difficult. There is a small duplication of vectors containing 1.0f (or 0x3f800000) - GCC is not currently able to efficiently handle values which fit in FMOV but not MOVI, and are reinterpreted to integer. There may be potential for more optimisation if this is fixed. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:32:14 +00:00			`eval_poly (float32x4_t m, const struct v_log1pf_data *d)`
aarch64/fpu: Add vector variants of acosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:40 +00:00			`{`
AArch64: Improve codegen in users of AdvSIMD log1pf helper log1pf is quite register-intensive - use fewer registers for the polynomial, and make various changes to shorten dependency chains in parent routines. There is now no spilling with GCC 14. Accuracy moves around a little - comments adjusted accordingly but does not require regen-ulps. Use the helper in log1pf as well, instead of having separate implementations. The more accurate polynomial means special-casing can be simplified, and the shorter dependency chain avoids the usual dance around v0, which is otherwise difficult. There is a small duplication of vectors containing 1.0f (or 0x3f800000) - GCC is not currently able to efficiently handle values which fit in FMOV but not MOVI, and are reinterpreted to integer. There may be potential for more optimisation if this is fixed. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:32:14 +00:00			`/* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */`
			`float32x4_t c0357 = vld1q_f32 (&d->c0);`
			`float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0);`
aarch64/fpu: Add vector variants of acosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:40 +00:00			`float32x4_t m2 = vmulq_f32 (m, m);`
AArch64: Improve codegen in users of AdvSIMD log1pf helper log1pf is quite register-intensive - use fewer registers for the polynomial, and make various changes to shorten dependency chains in parent routines. There is now no spilling with GCC 14. Accuracy moves around a little - comments adjusted accordingly but does not require regen-ulps. Use the helper in log1pf as well, instead of having separate implementations. The more accurate polynomial means special-casing can be simplified, and the shorter dependency chain avoids the usual dance around v0, which is otherwise difficult. There is a small duplication of vectors containing 1.0f (or 0x3f800000) - GCC is not currently able to efficiently handle values which fit in FMOV but not MOVI, and are reinterpreted to integer. There may be potential for more optimisation if this is fixed. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:32:14 +00:00			`float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3);`
			`float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2);`
			`float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1);`
			`float32x4_t p = vfmaq_f32 (p45, m2, p67);`
			`p = vfmaq_f32 (p23, m2, p);`
			`p = vfmaq_f32 (d->c1, m, p);`
aarch64/fpu: Add vector variants of acosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:40 +00:00			`p = vmulq_f32 (m2, p);`
AArch64: Improve codegen in users of AdvSIMD log1pf helper log1pf is quite register-intensive - use fewer registers for the polynomial, and make various changes to shorten dependency chains in parent routines. There is now no spilling with GCC 14. Accuracy moves around a little - comments adjusted accordingly but does not require regen-ulps. Use the helper in log1pf as well, instead of having separate implementations. The more accurate polynomial means special-casing can be simplified, and the shorter dependency chain avoids the usual dance around v0, which is otherwise difficult. There is a small duplication of vectors containing 1.0f (or 0x3f800000) - GCC is not currently able to efficiently handle values which fit in FMOV but not MOVI, and are reinterpreted to integer. There may be potential for more optimisation if this is fixed. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:32:14 +00:00			`p = vfmaq_f32 (m, m2, p);`
			`return vfmaq_f32 (p, m2, q);`
aarch64/fpu: Add vector variants of acosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:40 +00:00			`}`

			`static inline float32x4_t`
AArch64: Improve codegen in users of AdvSIMD log1pf helper log1pf is quite register-intensive - use fewer registers for the polynomial, and make various changes to shorten dependency chains in parent routines. There is now no spilling with GCC 14. Accuracy moves around a little - comments adjusted accordingly but does not require regen-ulps. Use the helper in log1pf as well, instead of having separate implementations. The more accurate polynomial means special-casing can be simplified, and the shorter dependency chain avoids the usual dance around v0, which is otherwise difficult. There is a small duplication of vectors containing 1.0f (or 0x3f800000) - GCC is not currently able to efficiently handle values which fit in FMOV but not MOVI, and are reinterpreted to integer. There may be potential for more optimisation if this is fixed. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:32:14 +00:00			`log1pf_inline (float32x4_t x, const struct v_log1pf_data *d)`
aarch64/fpu: Add vector variants of acosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:40 +00:00			`{`
AArch64: Improve codegen in users of AdvSIMD log1pf helper log1pf is quite register-intensive - use fewer registers for the polynomial, and make various changes to shorten dependency chains in parent routines. There is now no spilling with GCC 14. Accuracy moves around a little - comments adjusted accordingly but does not require regen-ulps. Use the helper in log1pf as well, instead of having separate implementations. The more accurate polynomial means special-casing can be simplified, and the shorter dependency chain avoids the usual dance around v0, which is otherwise difficult. There is a small duplication of vectors containing 1.0f (or 0x3f800000) - GCC is not currently able to efficiently handle values which fit in FMOV but not MOVI, and are reinterpreted to integer. There may be potential for more optimisation if this is fixed. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:32:14 +00:00			`/* Helper for calculating log(x + 1). */`

			`/* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m`
			`is in [-0.25, 0.5]):`
			`log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).`

			`We approximate log1p(m) with a polynomial, then scale by`
			`k*log(2). Instead of doing this directly, we use an intermediate`
			`scale factor s = 4klog(2) to ensure the scale is representable`
			`as a normalised fp32 number. */`
aarch64/fpu: Add vector variants of acosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:40 +00:00			`float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));`
AArch64: Improve codegen in users of AdvSIMD log1pf helper log1pf is quite register-intensive - use fewer registers for the polynomial, and make various changes to shorten dependency chains in parent routines. There is now no spilling with GCC 14. Accuracy moves around a little - comments adjusted accordingly but does not require regen-ulps. Use the helper in log1pf as well, instead of having separate implementations. The more accurate polynomial means special-casing can be simplified, and the shorter dependency chain avoids the usual dance around v0, which is otherwise difficult. There is a small duplication of vectors containing 1.0f (or 0x3f800000) - GCC is not currently able to efficiently handle values which fit in FMOV but not MOVI, and are reinterpreted to integer. There may be potential for more optimisation if this is fixed. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:32:14 +00:00
			`/* Choose k to scale x to the range [-1/4, 1/2]. */`
aarch64/fpu: Add vector variants of acosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:40 +00:00			`int32x4_t k`
AArch64: Improve codegen in users of AdvSIMD log1pf helper log1pf is quite register-intensive - use fewer registers for the polynomial, and make various changes to shorten dependency chains in parent routines. There is now no spilling with GCC 14. Accuracy moves around a little - comments adjusted accordingly but does not require regen-ulps. Use the helper in log1pf as well, instead of having separate implementations. The more accurate polynomial means special-casing can be simplified, and the shorter dependency chain avoids the usual dance around v0, which is otherwise difficult. There is a small duplication of vectors containing 1.0f (or 0x3f800000) - GCC is not currently able to efficiently handle values which fit in FMOV but not MOVI, and are reinterpreted to integer. There may be potential for more optimisation if this is fixed. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:32:14 +00:00			`= vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),`
aarch64/fpu: Add vector variants of acosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:40 +00:00			`v_s32 (0xff800000));`
			`uint32x4_t ku = vreinterpretq_u32_s32 (k);`
AArch64: Improve codegen in users of AdvSIMD log1pf helper log1pf is quite register-intensive - use fewer registers for the polynomial, and make various changes to shorten dependency chains in parent routines. There is now no spilling with GCC 14. Accuracy moves around a little - comments adjusted accordingly but does not require regen-ulps. Use the helper in log1pf as well, instead of having separate implementations. The more accurate polynomial means special-casing can be simplified, and the shorter dependency chain avoids the usual dance around v0, which is otherwise difficult. There is a small duplication of vectors containing 1.0f (or 0x3f800000) - GCC is not currently able to efficiently handle values which fit in FMOV but not MOVI, and are reinterpreted to integer. There may be potential for more optimisation if this is fixed. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:32:14 +00:00
			`/* Scale up to ensure that the scale factor is representable as normalised`
			`fp32 number, and scale m down accordingly. */`
			`float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));`

			`/* Scale x by exponent manipulation. */`
aarch64/fpu: Add vector variants of acosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:40 +00:00			`float32x4_t m_scale`
			`= vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));`
			`m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));`
AArch64: Improve codegen in users of AdvSIMD log1pf helper log1pf is quite register-intensive - use fewer registers for the polynomial, and make various changes to shorten dependency chains in parent routines. There is now no spilling with GCC 14. Accuracy moves around a little - comments adjusted accordingly but does not require regen-ulps. Use the helper in log1pf as well, instead of having separate implementations. The more accurate polynomial means special-casing can be simplified, and the shorter dependency chain avoids the usual dance around v0, which is otherwise difficult. There is a small duplication of vectors containing 1.0f (or 0x3f800000) - GCC is not currently able to efficiently handle values which fit in FMOV but not MOVI, and are reinterpreted to integer. There may be potential for more optimisation if this is fixed. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:32:14 +00:00
			`/* Evaluate polynomial on the reduced interval. */`
			`float32x4_t p = eval_poly (m_scale, d);`

			`/* The scale factor to be applied back at the end - by multiplying float(k)`
			`by 2^-23 we get the unbiased exponent of k. */`
aarch64/fpu: Add vector variants of acosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:40 +00:00			`float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));`
AArch64: Improve codegen in users of AdvSIMD log1pf helper log1pf is quite register-intensive - use fewer registers for the polynomial, and make various changes to shorten dependency chains in parent routines. There is now no spilling with GCC 14. Accuracy moves around a little - comments adjusted accordingly but does not require regen-ulps. Use the helper in log1pf as well, instead of having separate implementations. The more accurate polynomial means special-casing can be simplified, and the shorter dependency chain avoids the usual dance around v0, which is otherwise difficult. There is a small duplication of vectors containing 1.0f (or 0x3f800000) - GCC is not currently able to efficiently handle values which fit in FMOV but not MOVI, and are reinterpreted to integer. There may be potential for more optimisation if this is fixed. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:32:14 +00:00
			`/* Apply the scaling back. */`
			`return vfmaq_f32 (p, scale_back, d->ln2);`
aarch64/fpu: Add vector variants of acosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:40 +00:00			`}`

			`#endif`