aarch64: Fix AdvSIMD libmvec routines for big-endian

Previously many routines used * to load from vector types stored in the data table. This is emitted as ldr, which byte-swaps the entire vector register, and causes bugs for big-endian when not all lanes contain the same value. When a vector is to be used this way, it has been replaced with an array and the load with an explicit ld1 intrinsic, which byte-swaps only within lanes. In addition, many routines previously used non-standard GCC syntax for vector operations, such as indexing into vector types with [] and assembling vectors using {}. This syntax should not be mixed with ACLE, as the former does not respect endianness whereas the latter does. Such uses have been replaced with, for instance, vcombine_* and vgetq_lane* intrinsics. Helpers which only use the GCC syntax, such as the v_call helpers, do not need changing as they do not use intrinsics. Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
2024-11-21 12:30:06 +00:00 · 2024-05-02 16:43:13 +01:00 · 2024-05-02 16:43:13 +01:00 · 90a6ca8b28
commit 90a6ca8b28
parent ec6ed525f1
17 changed files with 119 additions and 85 deletions
--- a/sysdeps/aarch64/fpu/asinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinh_advsimd.c
@ -22,6 +22,7 @@

 #define A(i) v_f64 (__v_log_data.poly[i])
 #define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)

 const static struct data
 {
@ -63,11 +64,15 @@ struct entry
 static inline struct entry
 lookup (uint64x2_t i)
 {
-  float64x2_t e0 = vld1q_f64 (
-      &__v_log_data.table[(i[0] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
-  float64x2_t e1 = vld1q_f64 (
-      &__v_log_data.table[(i[1] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
-  return (struct entry){ vuzp1q_f64 (e0, e1), vuzp2q_f64 (e0, e1) };
+  /* Since N is a power of 2, n % N = n & (N - 1).  */
+  struct entry e;
+  uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+  float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+  e.invc = vuzp1q_f64 (e0, e1);
+  e.logc = vuzp2q_f64 (e0, e1);
+  return e;
 }

 static inline float64x2_t
--- a/sysdeps/aarch64/fpu/cosh_advsimd.c
+++ b/sysdeps/aarch64/fpu/cosh_advsimd.c
@ -22,7 +22,9 @@
 static const struct data
 {
  float64x2_t poly[3];
-  float64x2_t inv_ln2, ln2, shift, thres;
+  float64x2_t inv_ln2;
+  double ln2[2];
+  float64x2_t shift, thres;
  uint64x2_t index_mask, special_bound;
 } data = {
  .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
@ -58,8 +60,9 @@ exp_inline (float64x2_t x)
  float64x2_t n = vsubq_f64 (z, d->shift);

  /* r = x - n*ln2/N.  */
-  float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0);
-  r = vfmaq_laneq_f64 (r, n, d->ln2, 1);
+  float64x2_t ln2 = vld1q_f64 (d->ln2);
+  float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0);
+  r = vfmaq_laneq_f64 (r, n, ln2, 1);

  uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
  uint64x2_t i = vandq_u64 (u, d->index_mask);
--- a/sysdeps/aarch64/fpu/erf_advsimd.c
+++ b/sysdeps/aarch64/fpu/erf_advsimd.c
@ -56,8 +56,8 @@ static inline struct entry
 lookup (uint64x2_t i)
 {
  struct entry e;
-  float64x2_t e1 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[0])),
-	      e2 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[1]));
+  float64x2_t e1 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
+	      e2 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
  e.erf = vuzp1q_f64 (e1, e2);
  e.scale = vuzp2q_f64 (e1, e2);
  return e;
--- a/sysdeps/aarch64/fpu/erfc_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfc_advsimd.c
@ -26,7 +26,7 @@ static const struct data
  float64x2_t max, shift;
  float64x2_t p20, p40, p41, p42;
  float64x2_t p51, p52;
-  float64x2_t qr5, qr6, qr7, qr8, qr9;
+  double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
 #if WANT_SIMD_EXCEPT
  float64x2_t uflow_bound;
 #endif
@ -68,8 +68,10 @@ static inline struct entry
 lookup (uint64x2_t i)
 {
  struct entry e;
-  float64x2_t e1 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[0])),
-	      e2 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[1]));
+  float64x2_t e1
+      = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
+  float64x2_t e2
+      = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
  e.erfc = vuzp1q_f64 (e1, e2);
  e.scale = vuzp2q_f64 (e1, e2);
  return e;
@ -161,16 +163,19 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
  p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
  /* Compute p_i using recurrence relation:
     p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}.  */
-  float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, dat->qr5, 0));
-  p6 = vmulq_laneq_f64 (p6, dat->qr5, 1);
-  float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, dat->qr6, 0));
-  p7 = vmulq_laneq_f64 (p7, dat->qr6, 1);
-  float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, dat->qr7, 0));
-  p8 = vmulq_laneq_f64 (p8, dat->qr7, 1);
-  float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, dat->qr8, 0));
-  p9 = vmulq_laneq_f64 (p9, dat->qr8, 1);
-  float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, dat->qr9, 0));
-  p10 = vmulq_laneq_f64 (p10, dat->qr9, 1);
+  float64x2_t qr5 = vld1q_f64 (dat->qr5), qr6 = vld1q_f64 (dat->qr6),
+	      qr7 = vld1q_f64 (dat->qr7), qr8 = vld1q_f64 (dat->qr8),
+	      qr9 = vld1q_f64 (dat->qr9);
+  float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, qr5, 0));
+  p6 = vmulq_laneq_f64 (p6, qr5, 1);
+  float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, qr6, 0));
+  p7 = vmulq_laneq_f64 (p7, qr6, 1);
+  float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, qr7, 0));
+  p8 = vmulq_laneq_f64 (p8, qr7, 1);
+  float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, qr8, 0));
+  p9 = vmulq_laneq_f64 (p9, qr8, 1);
+  float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, qr9, 0));
+  p10 = vmulq_laneq_f64 (p10, qr9, 1);
  /* Compute polynomial in d using pairwise Horner scheme.  */
  float64x2_t p90 = vfmaq_f64 (p9, d, p10);
  float64x2_t p78 = vfmaq_f64 (p7, d, p8);
--- a/sysdeps/aarch64/fpu/erfcf_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfcf_advsimd.c
@ -23,7 +23,8 @@ static const struct data
 {
  uint32x4_t offset, table_scale;
  float32x4_t max, shift;
-  float32x4_t coeffs, third, two_over_five, tenth;
+  float coeffs[4];
+  float32x4_t third, two_over_five, tenth;
 #if WANT_SIMD_EXCEPT
  float32x4_t uflow_bound;
 #endif
@ -37,7 +38,7 @@ static const struct data
  .shift = V4 (0x1p17f),
  /* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and
     fmas.  */
-  .coeffs = (float32x4_t){ 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
+  .coeffs = { 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
  .third = V4 (0x1.555556p-2f),
  .two_over_five = V4 (-0x1.99999ap-2f),
  .tenth = V4 (-0x1.99999ap-4f),
@ -60,12 +61,16 @@ static inline struct entry
 lookup (uint32x4_t i)
 {
  struct entry e;
-  float64_t t0 = *((float64_t *) (__erfcf_data.tab - Off + i[0]));
-  float64_t t1 = *((float64_t *) (__erfcf_data.tab - Off + i[1]));
-  float64_t t2 = *((float64_t *) (__erfcf_data.tab - Off + i[2]));
-  float64_t t3 = *((float64_t *) (__erfcf_data.tab - Off + i[3]));
-  float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
-  float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
+  float32x2_t t0
+      = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
+  float32x2_t t1
+      = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
+  float32x2_t t2
+      = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
+  float32x2_t t3
+      = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
+  float32x4_t e1 = vcombine_f32 (t0, t1);
+  float32x4_t e2 = vcombine_f32 (t2, t3);
  e.erfc = vuzp1q_f32 (e1, e2);
  e.scale = vuzp2q_f32 (e1, e2);
  return e;
@ -140,10 +145,11 @@ float32x4_t NOINLINE V_NAME_F1 (erfc) (float32x4_t x)
  float32x4_t r2 = vmulq_f32 (r, r);

  float32x4_t p1 = r;
-  float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, dat->coeffs, 1);
+  float32x4_t coeffs = vld1q_f32 (dat->coeffs);
+  float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, coeffs, 1);
  float32x4_t p3
-      = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, dat->coeffs, 0));
-  float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, dat->coeffs, 2);
+      = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, coeffs, 0));
+  float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, coeffs, 2);
  p4 = vfmsq_f32 (dat->tenth, r2, p4);

  float32x4_t y = vfmaq_f32 (p3, d, p4);
--- a/sysdeps/aarch64/fpu/erff_advsimd.c
+++ b/sysdeps/aarch64/fpu/erff_advsimd.c
@ -47,12 +47,12 @@ static inline struct entry
 lookup (uint32x4_t i)
 {
  struct entry e;
-  float64_t t0 = *((float64_t *) (__erff_data.tab + i[0]));
-  float64_t t1 = *((float64_t *) (__erff_data.tab + i[1]));
-  float64_t t2 = *((float64_t *) (__erff_data.tab + i[2]));
-  float64_t t3 = *((float64_t *) (__erff_data.tab + i[3]));
-  float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
-  float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
+  float32x2_t t0 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
+  float32x2_t t1 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
+  float32x2_t t2 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
+  float32x2_t t3 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
+  float32x4_t e1 = vcombine_f32 (t0, t1);
+  float32x4_t e2 = vcombine_f32 (t2, t3);
  e.erf = vuzp1q_f32 (e1, e2);
  e.scale = vuzp2q_f32 (e1, e2);
  return e;
--- a/sysdeps/aarch64/fpu/exp10f_advsimd.c
+++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c
@ -25,7 +25,8 @@
 static const struct data
 {
  float32x4_t poly[5];
-  float32x4_t log10_2_and_inv, shift;
+  float log10_2_and_inv[4];
+  float32x4_t shift;

 #if !WANT_SIMD_EXCEPT
  float32x4_t scale_thresh;
@ -111,10 +112,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
  /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
     with poly(r) in [1/sqrt(2), sqrt(2)] and
     x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2].  */
-  float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0);
+  float32x4_t log10_2_and_inv = vld1q_f32 (d->log10_2_and_inv);
+  float32x4_t z = vfmaq_laneq_f32 (d->shift, x, log10_2_and_inv, 0);
  float32x4_t n = vsubq_f32 (z, d->shift);
-  float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1);
-  r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2);
+  float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_and_inv, 1);
+  r = vfmsq_laneq_f32 (r, n, log10_2_and_inv, 2);
  uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);

  float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
--- a/sysdeps/aarch64/fpu/expm1_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1_advsimd.c
@ -23,7 +23,9 @@
 static const struct data
 {
  float64x2_t poly[11];
-  float64x2_t invln2, ln2, shift;
+  float64x2_t invln2;
+  double ln2[2];
+  float64x2_t shift;
  int64x2_t exponent_bias;
 #if WANT_SIMD_EXCEPT
  uint64x2_t thresh, tiny_bound;
@ -92,8 +94,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
     where 2^i is exact because i is an integer.  */
  float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
  int64x2_t i = vcvtq_s64_f64 (n);
-  float64x2_t f = vfmsq_laneq_f64 (x, n, d->ln2, 0);
-  f = vfmsq_laneq_f64 (f, n, d->ln2, 1);
+  float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
+  float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0);
+  f = vfmsq_laneq_f64 (f, n, ln2, 1);

  /* Approximate expm1(f) using polynomial.
     Taylor expansion for expm1(x) has the form:
--- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
@ -23,7 +23,7 @@
 static const struct data
 {
  float32x4_t poly[5];
-  float32x4_t invln2_and_ln2;
+  float invln2_and_ln2[4];
  float32x4_t shift;
  int32x4_t exponent_bias;
 #if WANT_SIMD_EXCEPT
@ -88,11 +88,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
     and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
     where 2^i is exact because i is an integer.  */
-  float32x4_t j = vsubq_f32 (
-      vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
+  float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
+  float32x4_t j
+      = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
  int32x4_t i = vcvtq_s32_f32 (j);
-  float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
-  f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
+  float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
+  f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);

  /* Approximate expm1(f) using polynomial.
     Taylor expansion for expm1(x) has the form:
--- a/sysdeps/aarch64/fpu/log10_advsimd.c
+++ b/sysdeps/aarch64/fpu/log10_advsimd.c
@ -58,8 +58,10 @@ static inline struct entry
 lookup (uint64x2_t i)
 {
  struct entry e;
-  uint64_t i0 = (i[0] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
-  uint64_t i1 = (i[1] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
+  uint64_t i0
+      = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
+  uint64_t i1
+      = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
  float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc);
  float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc);
  e.invc = vuzp1q_f64 (e0, e1);
--- a/sysdeps/aarch64/fpu/log2_advsimd.c
+++ b/sysdeps/aarch64/fpu/log2_advsimd.c
@ -55,8 +55,10 @@ static inline struct entry
 lookup (uint64x2_t i)
 {
  struct entry e;
-  uint64_t i0 = (i[0] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
-  uint64_t i1 = (i[1] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
+  uint64_t i0
+      = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
+  uint64_t i1
+      = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
  float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc);
  float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc);
  e.invc = vuzp1q_f64 (e0, e1);
--- a/sysdeps/aarch64/fpu/log_advsimd.c
+++ b/sysdeps/aarch64/fpu/log_advsimd.c
@ -54,17 +54,12 @@ lookup (uint64x2_t i)
 {
  /* Since N is a power of 2, n % N = n & (N - 1).  */
  struct entry e;
-  uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
-  uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
  float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
  float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
-#if __BYTE_ORDER == __LITTLE_ENDIAN
  e.invc = vuzp1q_f64 (e0, e1);
  e.logc = vuzp2q_f64 (e0, e1);
-#else
-  e.invc = vuzp1q_f64 (e1, e0);
-  e.logc = vuzp2q_f64 (e1, e0);
-#endif
  return e;
 }

--- a/sysdeps/aarch64/fpu/sinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinh_advsimd.c
@ -22,8 +22,9 @@

 static const struct data
 {
-  float64x2_t poly[11];
-  float64x2_t inv_ln2, m_ln2, shift;
+  float64x2_t poly[11], inv_ln2;
+  double m_ln2[2];
+  float64x2_t shift;
  uint64x2_t halff;
  int64x2_t onef;
 #if WANT_SIMD_EXCEPT
@ -40,7 +41,7 @@ static const struct data
 	    V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },

  .inv_ln2 = V2 (0x1.71547652b82fep0),
-  .m_ln2 = (float64x2_t) {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
+  .m_ln2 = {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
  .shift = V2 (0x1.8p52),

  .halff = V2 (0x3fe0000000000000),
@ -67,8 +68,10 @@ expm1_inline (float64x2_t x)
     and   f = x - i * ln2 (f in [-ln2/2, ln2/2]).  */
  float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
  int64x2_t i = vcvtq_s64_f64 (j);
-  float64x2_t f = vfmaq_laneq_f64 (x, j, d->m_ln2, 0);
-  f = vfmaq_laneq_f64 (f, j, d->m_ln2, 1);
+
+  float64x2_t m_ln2 = vld1q_f64 (d->m_ln2);
+  float64x2_t f = vfmaq_laneq_f64 (x, j, m_ln2, 0);
+  f = vfmaq_laneq_f64 (f, j, m_ln2, 1);
  /* Approximate expm1(f) using polynomial.  */
  float64x2_t f2 = vmulq_f64 (f, f);
  float64x2_t f4 = vmulq_f64 (f2, f2);
--- a/sysdeps/aarch64/fpu/tan_advsimd.c
+++ b/sysdeps/aarch64/fpu/tan_advsimd.c
@ -23,7 +23,8 @@
 static const struct data
 {
  float64x2_t poly[9];
-  float64x2_t half_pi, two_over_pi, shift;
+  double half_pi[2];
+  float64x2_t two_over_pi, shift;
 #if !WANT_SIMD_EXCEPT
  float64x2_t range_val;
 #endif
@ -81,8 +82,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
  /* Use q to reduce x to r in [-pi/4, pi/4], by:
     r = x - q * pi/2, in extended precision.  */
  float64x2_t r = x;
-  r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0);
-  r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1);
+  float64x2_t half_pi = vld1q_f64 (dat->half_pi);
+  r = vfmsq_laneq_f64 (r, q, half_pi, 0);
+  r = vfmsq_laneq_f64 (r, q, half_pi, 1);
  /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
     formula.  */
  r = vmulq_n_f64 (r, 0.5);
--- a/sysdeps/aarch64/fpu/tanf_advsimd.c
+++ b/sysdeps/aarch64/fpu/tanf_advsimd.c
@ -23,7 +23,7 @@
 static const struct data
 {
  float32x4_t poly[6];
-  float32x4_t pi_consts;
+  float pi_consts[4];
  float32x4_t shift;
 #if !WANT_SIMD_EXCEPT
  float32x4_t range_val;
@ -95,16 +95,17 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
 #endif

  /* n = rint(x/(pi/2)).  */
-  float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3);
+  float32x4_t pi_consts = vld1q_f32 (d->pi_consts);
+  float32x4_t q = vfmaq_laneq_f32 (d->shift, x, pi_consts, 3);
  float32x4_t n = vsubq_f32 (q, d->shift);
  /* Determine if x lives in an interval, where |tan(x)| grows to infinity.  */
  uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));

  /* r = x - n * (pi/2)  (range reduction into -pi./4 .. pi/4).  */
  float32x4_t r;
-  r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0);
-  r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1);
-  r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2);
+  r = vfmaq_laneq_f32 (x, n, pi_consts, 0);
+  r = vfmaq_laneq_f32 (r, n, pi_consts, 1);
+  r = vfmaq_laneq_f32 (r, n, pi_consts, 2);

  /* If x lives in an interval, where |tan(x)|
     - is finite, then use a polynomial approximation of the form
--- a/sysdeps/aarch64/fpu/v_expf_inline.h
+++ b/sysdeps/aarch64/fpu/v_expf_inline.h
@ -25,7 +25,8 @@
 struct v_expf_data
 {
  float32x4_t poly[5];
-  float32x4_t shift, invln2_and_ln2;
+  float32x4_t shift;
+  float invln2_and_ln2[4];
 };

 /* maxerr: 1.45358 +0.5 ulp.  */
@ -50,10 +51,11 @@ v_expf_inline (float32x4_t x, const struct v_expf_data *d)
  /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
     x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
  float32x4_t n, r, z;
-  z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0);
+  float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
+  z = vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0);
  n = vsubq_f32 (z, d->shift);
-  r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1);
-  r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2);
+  r = vfmsq_laneq_f32 (x, n, invln2_and_ln2, 1);
+  r = vfmsq_laneq_f32 (r, n, invln2_and_ln2, 2);
  uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
  float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));

--- a/sysdeps/aarch64/fpu/v_expm1f_inline.h
+++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
@ -26,7 +26,8 @@
 struct v_expm1f_data
 {
  float32x4_t poly[5];
-  float32x4_t invln2_and_ln2, shift;
+  float invln2_and_ln2[4];
+  float32x4_t shift;
  int32x4_t exponent_bias;
 };

@ -49,11 +50,12 @@ expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
     calling routine should handle special values if required.  */

  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
-  float32x4_t j = vsubq_f32 (
-      vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
+  float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
+  float32x4_t j
+      = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
  int32x4_t i = vcvtq_s32_f32 (j);
-  float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
-  f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
+  float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
+  f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);

  /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
     Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses