AArch64: Remove SVE erf and erfc tables

By using a combination of mask-and-add instead of the shift-based index calculation the routines can share the same table as other variants with no performance degradation. The tables change name because of other changes in downstream AOR. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
2024-11-26 23:10:06 +00:00 · 2024-11-01 15:48:54 +00:00 · 2024-11-01 15:48:54 +00:00 · 2d82d781a5
commit 2d82d781a5
parent 6d477b8de8
16 changed files with 50 additions and 2691 deletions
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@ -41,8 +41,6 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
                  v_log10_data \
                  erf_data \
                  erff_data \
-                  sv_erf_data \
-                  sv_erff_data \
                  v_exp_tail_data \
                  erfc_data \
                  erfcf_data \
--- a/sysdeps/aarch64/fpu/erf_advsimd.c
+++ b/sysdeps/aarch64/fpu/erf_advsimd.c
@ -58,8 +58,8 @@ static inline struct entry
 lookup (uint64x2_t i)
 {
  struct entry e;
-  float64x2_t e1 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
-	      e2 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
+  float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
+	      e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
  e.erf = vuzp1q_f64 (e1, e2);
  e.scale = vuzp2q_f64 (e1, e2);
  return e;
--- a/sysdeps/aarch64/fpu/erf_data.c
+++ b/sysdeps/aarch64/fpu/erf_data.c
@ -19,14 +19,14 @@

 #include "vecmath_config.h"

-/* Lookup table used in erf.
+/* Lookup table used in vector erf.
   For each possible rounded input r (multiples of 1/128), between
   r = 0.0 and r = 6.0 (769 values):
-   - the first entry __erff_data.tab.erf contains the values of erf(r),
-   - the second entry __erff_data.tab.scale contains the values of
+   - the first entry __v_erff_data.tab.erf contains the values of erf(r),
+   - the second entry __v_erff_data.tab.scale contains the values of
   2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
   algorithm, since lookup is performed only for x >= 1/64-1/512.  */
-const struct erf_data __erf_data = {
+const struct v_erf_data __v_erf_data = {
  .tab = { { 0x0.0000000000000p+0, 0x1.20dd750429b6dp+0 },
 	   { 0x1.20dbf3deb1340p-7, 0x1.20d8f1975c85dp+0 },
 	   { 0x1.20d77083f17a0p-6, 0x1.20cb67bd452c7p+0 },
--- a/sysdeps/aarch64/fpu/erf_sve.c
+++ b/sysdeps/aarch64/fpu/erf_sve.c
@ -67,14 +67,16 @@ svfloat64_t SV_NAME_D1 (erf) (svfloat64_t x, const svbool_t pg)
  svfloat64_t a = svabs_x (pg, x);
  svfloat64_t shift = sv_f64 (dat->shift);
  svfloat64_t z = svadd_x (pg, a, shift);
-  svuint64_t i
-      = svsub_x (pg, svreinterpret_u64 (z), svreinterpret_u64 (shift));
+  svuint64_t i = svand_x (pg, svreinterpret_u64 (z), 0xfff);
+  i = svadd_x (pg, i, i);

  /* Lookup without shortcut for small values but with predicate to avoid
     segfault for large values and NaNs.  */
  svfloat64_t r = svsub_x (pg, z, shift);
-  svfloat64_t erfr = svld1_gather_index (a_lt_max, __sv_erf_data.erf, i);
-  svfloat64_t scale = svld1_gather_index (a_lt_max, __sv_erf_data.scale, i);
+  svfloat64_t erfr
+      = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].erf, i);
+  svfloat64_t scale
+      = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].scale, i);

  /* erf(x) ~ erf(r) + scale * d * poly (r, d).  */
  svfloat64_t d = svsub_x (pg, a, r);
--- a/sysdeps/aarch64/fpu/erfc_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfc_advsimd.c
@ -69,9 +69,9 @@ lookup (uint64x2_t i)
 {
  struct entry e;
  float64x2_t e1
-      = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
+      = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
  float64x2_t e2
-      = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
+      = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
  e.erfc = vuzp1q_f64 (e1, e2);
  e.scale = vuzp2q_f64 (e1, e2);
  return e;
--- a/sysdeps/aarch64/fpu/erfc_data.c
+++ b/sysdeps/aarch64/fpu/erfc_data.c
@ -19,14 +19,14 @@

 #include "vecmath_config.h"

-/* Lookup table used in erfc.
+/* Lookup table used in vector erfc.
   For each possible rounded input r (multiples of 1/128), between
   r = 0.0 and r = ~27.0 (3488 values):
-   - the first entry __erfc_data.tab.erfc contains the values of erfc(r),
-   - the second entry __erfc_data.tab.scale contains the values of
+   - the first entry __v_erfc_data.tab.erfc contains the values of erfc(r),
+   - the second entry __v_erfc_data.tab.scale contains the values of
   2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore
   they are scaled by a large enough value 2^128 (fits in 8bit).  */
-const struct erfc_data __erfc_data = {
+const struct v_erfc_data __v_erfc_data = {
  .tab = { { 0x1p128, 0x1.20dd750429b6dp128 },
 	   { 0x1.fb7c9030853b3p127, 0x1.20d8f1975c85dp128 },
 	   { 0x1.f6f9447be0743p127, 0x1.20cb67bd452c7p128 },
--- a/sysdeps/aarch64/fpu/erfc_sve.c
+++ b/sysdeps/aarch64/fpu/erfc_sve.c
@ -104,7 +104,7 @@ svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg)

  /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables.  */
  i = svadd_x (pg, i, i);
-  const float64_t *p = &__erfc_data.tab[0].erfc - 2 * dat->off_arr;
+  const float64_t *p = &__v_erfc_data.tab[0].erfc - 2 * dat->off_arr;
  svfloat64_t erfcr = svld1_gather_index (pg, p, i);
  svfloat64_t scale = svld1_gather_index (pg, p + 1, i);

--- a/sysdeps/aarch64/fpu/erfcf_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfcf_advsimd.c
@ -62,13 +62,13 @@ lookup (uint32x4_t i)
 {
  struct entry e;
  float32x2_t t0
-      = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
+      = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
  float32x2_t t1
-      = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
+      = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
  float32x2_t t2
-      = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
+      = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
  float32x2_t t3
-      = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
+      = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
  float32x4_t e1 = vcombine_f32 (t0, t1);
  float32x4_t e2 = vcombine_f32 (t2, t3);
  e.erfc = vuzp1q_f32 (e1, e2);
--- a/sysdeps/aarch64/fpu/erfcf_data.c
+++ b/sysdeps/aarch64/fpu/erfcf_data.c
@ -19,14 +19,14 @@

 #include "vecmath_config.h"

-/* Lookup table used in erfcf.
+/* Lookup table used in vector erfcf.
   For each possible rounded input r (multiples of 1/64), between
   r = 0.0 and r = 10.0625 (645 values):
-   - the first entry __erfcf_data.tab.erfc contains the values of erfc(r),
-   - the second entry __erfcf_data.tab.scale contains the values of
+   - the first entry __v_erfcf_data.tab.erfc contains the values of erfc(r),
+   - the second entry __v_erfcf_data.tab.scale contains the values of
   2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore
   they are scaled by a large enough value 2^47 (fits in 8 bits).  */
-const struct erfcf_data __erfcf_data = {
+const struct v_erfcf_data __v_erfcf_data = {
  .tab = { { 0x1p47, 0x1.20dd76p47 },
 	   { 0x1.f6f944p46, 0x1.20cb68p47 },
 	   { 0x1.edf3aap46, 0x1.209546p47 },
--- a/sysdeps/aarch64/fpu/erfcf_sve.c
+++ b/sysdeps/aarch64/fpu/erfcf_sve.c
@ -77,7 +77,7 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)

  /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables.  */
  i = svmul_x (pg, i, 2);
-  const float32_t *p = &__erfcf_data.tab[0].erfc - 2 * dat->off_arr;
+  const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr;
  svfloat32_t erfcr = svld1_gather_index (pg, p, i);
  svfloat32_t scale = svld1_gather_index (pg, p + 1, i);

--- a/sysdeps/aarch64/fpu/erff_advsimd.c
+++ b/sysdeps/aarch64/fpu/erff_advsimd.c
@ -47,10 +47,10 @@ static inline struct entry
 lookup (uint32x4_t i)
 {
  struct entry e;
-  float32x2_t t0 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
-  float32x2_t t1 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
-  float32x2_t t2 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
-  float32x2_t t3 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
+  float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
+  float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
+  float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
+  float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
  float32x4_t e1 = vcombine_f32 (t0, t1);
  float32x4_t e2 = vcombine_f32 (t2, t3);
  e.erf = vuzp1q_f32 (e1, e2);
--- a/sysdeps/aarch64/fpu/erff_data.c
+++ b/sysdeps/aarch64/fpu/erff_data.c
@ -19,14 +19,14 @@

 #include "vecmath_config.h"

-/* Lookup table used in erff.
+/* Lookup table used in vector erff.
   For each possible rounded input r (multiples of 1/128), between
   r = 0.0 and r = 4.0 (513 values):
-   - the first entry __erff_data.tab.erf contains the values of erf(r),
-   - the second entry __erff_data.tab.scale contains the values of
+   - the first entry __v_erff_data.tab.erf contains the values of erf(r),
+   - the second entry __v_erff_data.tab.scale contains the values of
   2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
   algorithm, since lookup is performed only for x >= 1/64-1/512.  */
-const struct erff_data __erff_data = {
+const struct v_erff_data __v_erff_data = {
  .tab = { { 0x0.000000p+0, 0x1.20dd76p+0 },
 	   { 0x1.20dbf4p-7, 0x1.20d8f2p+0 },
 	   { 0x1.20d770p-6, 0x1.20cb68p+0 },
--- a/sysdeps/aarch64/fpu/erff_sve.c
+++ b/sysdeps/aarch64/fpu/erff_sve.c
@ -62,18 +62,17 @@ svfloat32_t SV_NAME_F1 (erf) (svfloat32_t x, const svbool_t pg)

  svfloat32_t shift = sv_f32 (dat->shift);
  svfloat32_t z = svadd_x (pg, a, shift);
-  svuint32_t i
-      = svsub_x (pg, svreinterpret_u32 (z), svreinterpret_u32 (shift));
-
-  /* Saturate lookup index.  */
-  i = svsel (a_ge_max, sv_u32 (512), i);
+  svuint32_t i = svand_x (pg, svreinterpret_u32 (z), 0xfff);
+  i = svadd_x (pg, i, i);

  /* r and erf(r) set to 0 for |x| below min.  */
  svfloat32_t r = svsub_z (a_gt_min, z, shift);
-  svfloat32_t erfr = svld1_gather_index (a_gt_min, __sv_erff_data.erf, i);
+  svfloat32_t erfr
+      = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].erf, i);

  /* scale set to 2/sqrt(pi) for |x| below min.  */
-  svfloat32_t scale = svld1_gather_index (a_gt_min, __sv_erff_data.scale, i);
+  svfloat32_t scale
+      = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].scale, i);
  scale = svsel (a_gt_min, scale, sv_f32 (dat->scale));

  /* erf(x) ~ erf(r) + scale * d * (1 - r * d + 1/3 * d^2).  */
--- a/sysdeps/aarch64/fpu/sv_erf_data.c
+++ b/sysdeps/aarch64/fpu/sv_erf_data.c
--- a/sysdeps/aarch64/fpu/sv_erff_data.c
+++ b/sysdeps/aarch64/fpu/sv_erff_data.c
--- a/sysdeps/aarch64/fpu/vecmath_config.h
+++ b/sysdeps/aarch64/fpu/vecmath_config.h
@ -75,49 +75,37 @@ extern const struct v_log10_data
  } table[1 << V_LOG10_TABLE_BITS];
 } __v_log10_data attribute_hidden;

-extern const struct erff_data
+extern const struct v_erff_data
 {
  struct
  {
    float erf, scale;
  } tab[513];
-} __erff_data attribute_hidden;
+} __v_erff_data attribute_hidden;

-extern const struct sv_erff_data
-{
-  float erf[513];
-  float scale[513];
-} __sv_erff_data attribute_hidden;
-
-extern const struct erf_data
+extern const struct v_erf_data
 {
  struct
  {
    double erf, scale;
  } tab[769];
-} __erf_data attribute_hidden;
+} __v_erf_data attribute_hidden;

-extern const struct sv_erf_data
-{
-  double erf[769];
-  double scale[769];
-} __sv_erf_data attribute_hidden;
-
-extern const struct erfc_data
+extern const struct v_erfc_data
 {
  struct
  {
    double erfc, scale;
  } tab[3488];
-} __erfc_data attribute_hidden;
+} __v_erfc_data attribute_hidden;

-extern const struct erfcf_data
+extern const struct v_erfcf_data
 {
  struct
  {
    float erfc, scale;
  } tab[645];
-} __erfcf_data attribute_hidden;
+} __v_erfcf_data attribute_hidden;

 /* Some data for AdvSIMD and SVE pow's internal exp and log.  */
 #define V_POW_EXP_TABLE_BITS 8