aarch64: Add vector implementations of log routines

Optimised implementations for single and double precision, Advanced SIMD and SVE, copied from Arm Optimized Routines. Log lookup table added as HIDDEN symbol to allow it to be shared between AdvSIMD and SVE variants. As previously, data tables are used via a barrier to prevent overly aggressive constant inlining. Special-case handlers are marked NOINLINE to avoid incurring the penalty of switching call standards unnecessarily. Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
2024-11-10 07:10:06 +00:00 · 2023-06-28 12:19:38 +01:00 · 2023-06-28 12:19:38 +01:00 · 78c01a5cbe
commit 78c01a5cbe
parent 3bb1af2051
15 changed files with 563 additions and 1 deletions
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@ -1,4 +1,5 @@
 libmvec-supported-funcs = cos \
+                          log \
                          sin

 float-advsimd-funcs = $(libmvec-supported-funcs)
@ -10,7 +11,8 @@ ifeq ($(subdir),mathvec)
 libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
                  $(addsuffix _advsimd,$(double-advsimd-funcs)) \
                  $(addsuffix f_sve,$(float-sve-funcs)) \
-                  $(addsuffix _sve,$(double-sve-funcs))
+                  $(addsuffix _sve,$(double-sve-funcs)) \
+                  v_log_data
 endif

 sve-cflags = -march=armv8-a+sve
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@ -1,11 +1,15 @@
 libmvec {
  GLIBC_2.38 {
    _ZGVnN2v_cos;
+    _ZGVnN2v_log;
    _ZGVnN2v_sin;
    _ZGVnN4v_cosf;
+    _ZGVnN4v_logf;
    _ZGVnN4v_sinf;
    _ZGVsMxv_cos;
    _ZGVsMxv_cosf;
+    _ZGVsMxv_log;
+    _ZGVsMxv_logf;
    _ZGVsMxv_sin;
    _ZGVsMxv_sinf;
  }
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@ -50,9 +50,11 @@ typedef __SVBool_t __sv_bool_t;
 #  define __vpcs __attribute__ ((__aarch64_vector_pcs__))

 __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);

 __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);

 #  undef __ADVSIMD_VEC_MATH_SUPPORTED
@ -61,9 +63,11 @@ __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
 #ifdef __SVE_VEC_MATH_SUPPORTED

 __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);

 __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);

 #  undef __SVE_VEC_MATH_SUPPORTED
--- a/sysdeps/aarch64/fpu/log_advsimd.c
+++ b/sysdeps/aarch64/fpu/log_advsimd.c
@ -0,0 +1,105 @@
+/* Double-precision vector (Advanced SIMD) log function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_math.h"
+
+static const struct data
+{
+  float64x2_t poly[5];
+  float64x2_t ln2;
+  uint64x2_t min_norm, special_bound, sign_exp_mask;
+} data = {
+  /* Worst-case error: 1.17 + 0.5 ulp.
+     Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ].  */
+  .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2),
+	    V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3),
+	    V2 (-0x1.554e550bd501ep-3) },
+  .ln2 = V2 (0x1.62e42fefa39efp-1),
+  .min_norm = V2 (0x0010000000000000),
+  .special_bound = V2 (0x7fe0000000000000), /* asuint64(inf) - min_norm.  */
+  .sign_exp_mask = V2 (0xfff0000000000000)
+};
+
+#define A(i) d->poly[i]
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+#define Off v_u64 (0x3fe6900900000000)
+
+struct entry
+{
+  float64x2_t invc;
+  float64x2_t logc;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+  /* Since N is a power of 2, n % N = n & (N - 1).  */
+  struct entry e;
+  e.invc[0] = __v_log_data.invc[i[0] & IndexMask];
+  e.logc[0] = __v_log_data.logc[i[0] & IndexMask];
+  e.invc[1] = __v_log_data.invc[i[1] & IndexMask];
+  e.logc[1] = __v_log_data.logc[i[1] & IndexMask];
+  return e;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+  return v_call_f64 (log, x, y, cmp);
+}
+
+float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float64x2_t z, r, r2, p, y, kd, hi;
+  uint64x2_t ix, iz, tmp, cmp;
+  int64x2_t k;
+  struct entry e;
+
+  ix = vreinterpretq_u64_f64 (x);
+  cmp = vcgeq_u64 (vsubq_u64 (ix, d->min_norm), d->special_bound);
+
+  /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  tmp = vsubq_u64 (ix, Off);
+  k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift.  */
+  iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
+  z = vreinterpretq_f64_u64 (iz);
+  e = lookup (vshrq_n_u64 (tmp, 52 - V_LOG_TABLE_BITS));
+
+  /* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */
+  r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+  kd = vcvtq_f64_s64 (k);
+
+  /* hi = r + log(c) + k*Ln2.  */
+  hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+  /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi.  */
+  r2 = vmulq_f64 (r, r);
+  y = vfmaq_f64 (A (2), A (3), r);
+  p = vfmaq_f64 (A (0), A (1), r);
+  y = vfmaq_f64 (y, A (4), r2);
+  y = vfmaq_f64 (p, y, r2);
+  y = vfmaq_f64 (hi, y, r2);
+
+  if (__glibc_unlikely (v_any_u64 (cmp)))
+    return special_case (x, y, cmp);
+  return y;
+}
--- a/sysdeps/aarch64/fpu/log_sve.c
+++ b/sysdeps/aarch64/fpu/log_sve.c
@ -0,0 +1,80 @@
+/* Double-precision vector (SVE) log function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+
+#define P(i) sv_f64 (__v_log_data.poly[i])
+#define N (1 << V_LOG_TABLE_BITS)
+#define Off (0x3fe6900900000000)
+#define MaxTop (0x7ff)
+#define MinTop (0x001)
+#define ThreshTop (0x7fe) /* MaxTop - MinTop.  */
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
+{
+  return sv_call_f64 (log, x, y, cmp);
+}
+
+/* SVE port of AdvSIMD log algorithm.
+   Maximum measured error is 2.17 ulp:
+   SV_NAME_D1 (log)(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
+					 want 0x1.ffffff1cca045p-2.  */
+svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg)
+{
+  svuint64_t ix = svreinterpret_u64_f64 (x);
+  svuint64_t top = svlsr_n_u64_x (pg, ix, 52);
+  svbool_t cmp
+      = svcmpge_u64 (pg, svsub_n_u64_x (pg, top, MinTop), sv_u64 (ThreshTop));
+
+  /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  svuint64_t tmp = svsub_n_u64_x (pg, ix, Off);
+  /* Equivalent to (tmp >> (52 - V_LOG_TABLE_BITS)) % N, since N is a power
+     of 2.  */
+  svuint64_t i = svand_n_u64_x (
+      pg, svlsr_n_u64_x (pg, tmp, (52 - V_LOG_TABLE_BITS)), N - 1);
+  svint64_t k = svasr_n_s64_x (pg, svreinterpret_s64_u64 (tmp),
+			       52); /* Arithmetic shift.  */
+  svuint64_t iz
+      = svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52));
+  svfloat64_t z = svreinterpret_f64_u64 (iz);
+  /* Lookup in 2 global lists (length N).  */
+  svfloat64_t invc = svld1_gather_u64index_f64 (pg, __v_log_data.invc, i);
+  svfloat64_t logc = svld1_gather_u64index_f64 (pg, __v_log_data.logc, i);
+
+  /* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */
+  svfloat64_t r = svmad_n_f64_x (pg, invc, z, -1);
+  svfloat64_t kd = svcvt_f64_s64_x (pg, k);
+  /* hi = r + log(c) + k*Ln2.  */
+  svfloat64_t hi
+      = svmla_n_f64_x (pg, svadd_f64_x (pg, logc, r), kd, __v_log_data.ln2);
+  /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi.  */
+  svfloat64_t r2 = svmul_f64_x (pg, r, r);
+  svfloat64_t y = svmla_f64_x (pg, P (2), r, P (3));
+  svfloat64_t p = svmla_f64_x (pg, P (0), r, P (1));
+  y = svmla_f64_x (pg, y, r2, P (4));
+  y = svmla_f64_x (pg, p, r2, y);
+  y = svmla_f64_x (pg, hi, r2, y);
+
+  if (__glibc_unlikely (svptest_any (pg, cmp)))
+    return special_case (x, y, cmp);
+  return y;
+}
--- a/sysdeps/aarch64/fpu/logf_advsimd.c
+++ b/sysdeps/aarch64/fpu/logf_advsimd.c
@ -0,0 +1,81 @@
+/* Single-precision vector (Advanced SIMD) log function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_math.h"
+
+static const struct data
+{
+  float32x4_t poly[7];
+  float32x4_t ln2, tiny_bound;
+  uint32x4_t min_norm, special_bound, off, mantissa_mask;
+} data = {
+  /* 3.34 ulp error.  */
+  .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
+	    V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
+	    V4 (-0x1.ffffc8p-2f) },
+  .ln2 = V4 (0x1.62e43p-1f),
+  .tiny_bound = V4 (0x1p-126),
+  .min_norm = V4 (0x00800000),
+  .special_bound = V4 (0x7f000000), /* asuint32(inf) - min_norm.  */
+  .off = V4 (0x3f2aaaab),	    /* 0.666667.  */
+  .mantissa_mask = V4 (0x007fffff)
+};
+
+#define P(i) d->poly[7 - i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+  /* Fall back to scalar code.  */
+  return v_call_f32 (logf, x, y, cmp);
+}
+
+float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float32x4_t n, p, q, r, r2, y;
+  uint32x4_t u, cmp;
+
+  u = vreinterpretq_u32_f32 (x);
+  cmp = vcgeq_u32 (vsubq_u32 (u, d->min_norm), d->special_bound);
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
+  u = vsubq_u32 (u, d->off);
+  n = vcvtq_f32_s32 (
+      vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend.  */
+  u = vandq_u32 (u, d->mantissa_mask);
+  u = vaddq_u32 (u, d->off);
+  r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+  /* y = log(1+r) + n*ln2.  */
+  r2 = vmulq_f32 (r, r);
+  /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))).  */
+  p = vfmaq_f32 (P (5), P (6), r);
+  q = vfmaq_f32 (P (3), P (4), r);
+  y = vfmaq_f32 (P (1), P (2), r);
+  p = vfmaq_f32 (p, P (7), r2);
+  q = vfmaq_f32 (q, p, r2);
+  y = vfmaq_f32 (y, q, r2);
+  p = vfmaq_f32 (r, d->ln2, n);
+  y = vfmaq_f32 (p, y, r2);
+
+  if (__glibc_unlikely (v_any_u32 (cmp)))
+    return special_case (x, y, cmp);
+  return y;
+}
--- a/sysdeps/aarch64/fpu/logf_sve.c
+++ b/sysdeps/aarch64/fpu/logf_sve.c
@ -0,0 +1,87 @@
+/* Single-precision vector (SVE) log function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+
+static const struct data
+{
+  float poly_0135[4];
+  float poly_246[3];
+  float ln2;
+} data = {
+  .poly_0135 = {
+    /* Coefficients copied from the AdvSIMD routine in math/, then rearranged so
+       that coeffs 0, 1, 3 and 5 can be loaded as a single quad-word, hence used
+       with _lane variant of MLA intrinsic.  */
+    -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f
+  },
+  .poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f },
+  .ln2 = 0x1.62e43p-1f
+};
+
+#define Min (0x00800000)
+#define Max (0x7f800000)
+#define Thresh (0x7f000000) /* Max - Min.  */
+#define Mask (0x007fffff)
+#define Off (0x3f2aaaab) /* 0.666667.  */
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+{
+  return sv_call_f32 (logf, x, y, cmp);
+}
+
+/* Optimised implementation of SVE logf, using the same algorithm and
+   polynomial as the AdvSIMD routine. Maximum error is 3.34 ULPs:
+   SV_NAME_F1 (log)(0x1.557298p+0) got 0x1.26edecp-2
+				  want 0x1.26ede6p-2.  */
+svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svuint32_t u = svreinterpret_u32_f32 (x);
+  svbool_t cmp = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, u, Min), Thresh);
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
+  u = svsub_n_u32_x (pg, u, Off);
+  svfloat32_t n
+      = svcvt_f32_s32_x (pg, svasr_n_s32_x (pg, svreinterpret_s32_u32 (u),
+					    23)); /* Sign-extend.  */
+  u = svand_n_u32_x (pg, u, Mask);
+  u = svadd_n_u32_x (pg, u, Off);
+  svfloat32_t r = svsub_n_f32_x (pg, svreinterpret_f32_u32 (u), 1.0f);
+
+  /* y = log(1+r) + n*ln2.  */
+  svfloat32_t r2 = svmul_f32_x (pg, r, r);
+  /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))).  */
+  svfloat32_t p_0135 = svld1rq_f32 (svptrue_b32 (), &d->poly_0135[0]);
+  svfloat32_t p = svmla_lane_f32 (sv_f32 (d->poly_246[0]), r, p_0135, 1);
+  svfloat32_t q = svmla_lane_f32 (sv_f32 (d->poly_246[1]), r, p_0135, 2);
+  svfloat32_t y = svmla_lane_f32 (sv_f32 (d->poly_246[2]), r, p_0135, 3);
+  p = svmla_lane_f32 (p, r2, p_0135, 0);
+
+  q = svmla_f32_x (pg, q, r2, p);
+  y = svmla_f32_x (pg, y, r2, q);
+  p = svmla_n_f32_x (pg, r, n, d->ln2);
+  y = svmla_f32_x (pg, p, r2, y);
+
+  if (__glibc_unlikely (svptest_any (pg, cmp)))
+    return special_case (x, y, cmp);
+  return y;
+}
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@ -24,4 +24,5 @@
 #define VEC_TYPE float64x2_t

 VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
+VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log)
 VPCS_VECTOR_WRAPPER (sin_advsimd, _ZGVnN2v_sin)
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@ -33,4 +33,5 @@
  }

 SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
+SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log)
 SVE_VECTOR_WRAPPER (sin_sve, _ZGVsMxv_sin)
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@ -24,4 +24,5 @@
 #define VEC_TYPE float32x4_t

 VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
+VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf)
 VPCS_VECTOR_WRAPPER (sinf_advsimd, _ZGVnN4v_sinf)
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@ -33,4 +33,5 @@
  }

 SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
+SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf)
 SVE_VECTOR_WRAPPER (sinf_sve, _ZGVsMxv_sinf)
--- a/sysdeps/aarch64/fpu/v_log_data.c
+++ b/sysdeps/aarch64/fpu/v_log_data.c
@ -0,0 +1,173 @@
+/* Lookup table for double-precision log(x) vector function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "vecmath_config.h"
+
+const struct v_log_data __v_log_data = {
+  /* Worst-case error: 1.17 + 0.5 ulp.
+     Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ].  */
+  .poly = { -0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2,
+	    0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3 },
+  .ln2 = 0x1.62e42fefa39efp-1,
+  /* Algorithm:
+
+	x = 2^k z
+	log(x) = k ln2 + log(c) + poly(z/c - 1)
+
+     where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,
+     N=128) and log(c) and 1/c for the ith subinterval comes from two lookup
+     tables:
+
+	invc[i] = 1/c
+	logc[i] = (double)log(c)
+
+     where c is near the center of the subinterval and is chosen by trying
+     several floating point invc candidates around 1/center and selecting one
+     for which the error in (double)log(c) is minimized (< 0x1p-74), except the
+     subinterval that contains 1 and the previous one got tweaked to avoid
+     cancellation.  */
+  .invc = { 0x1.6a133d0dec120p+0, 0x1.6815f2f3e42edp+0,
+	    0x1.661e39be1ac9ep+0, 0x1.642bfa30ac371p+0,
+	    0x1.623f1d916f323p+0, 0x1.60578da220f65p+0,
+	    0x1.5e75349dea571p+0, 0x1.5c97fd387a75ap+0,
+	    0x1.5abfd2981f200p+0, 0x1.58eca051dc99cp+0,
+	    0x1.571e526d9df12p+0, 0x1.5554d555b3fcbp+0,
+	    0x1.539015e2a20cdp+0, 0x1.51d0014ee0164p+0,
+	    0x1.50148538cd9eep+0, 0x1.4e5d8f9f698a1p+0,
+	    0x1.4cab0edca66bep+0, 0x1.4afcf1a9db874p+0,
+	    0x1.495327136e16fp+0, 0x1.47ad9e84af28fp+0,
+	    0x1.460c47b39ae15p+0, 0x1.446f12b278001p+0,
+	    0x1.42d5efdd720ecp+0, 0x1.4140cfe001a0fp+0,
+	    0x1.3fafa3b421f69p+0, 0x1.3e225c9c8ece5p+0,
+	    0x1.3c98ec29a211ap+0, 0x1.3b13442a413fep+0,
+	    0x1.399156baa3c54p+0, 0x1.38131639b4cdbp+0,
+	    0x1.36987540fbf53p+0, 0x1.352166b648f61p+0,
+	    0x1.33adddb3eb575p+0, 0x1.323dcd99fc1d3p+0,
+	    0x1.30d129fefc7d2p+0, 0x1.2f67e6b72fe7dp+0,
+	    0x1.2e01f7cf8b187p+0, 0x1.2c9f518ddc86ep+0,
+	    0x1.2b3fe86e5f413p+0, 0x1.29e3b1211b25cp+0,
+	    0x1.288aa08b373cfp+0, 0x1.2734abcaa8467p+0,
+	    0x1.25e1c82459b81p+0, 0x1.2491eb1ad59c5p+0,
+	    0x1.23450a54048b5p+0, 0x1.21fb1bb09e578p+0,
+	    0x1.20b415346d8f7p+0, 0x1.1f6fed179a1acp+0,
+	    0x1.1e2e99b93c7b3p+0, 0x1.1cf011a7a882ap+0,
+	    0x1.1bb44b97dba5ap+0, 0x1.1a7b3e66cdd4fp+0,
+	    0x1.1944e11dc56cdp+0, 0x1.18112aebb1a6ep+0,
+	    0x1.16e013231b7e9p+0, 0x1.15b1913f156cfp+0,
+	    0x1.14859cdedde13p+0, 0x1.135c2dc68cfa4p+0,
+	    0x1.12353bdb01684p+0, 0x1.1110bf25b85b4p+0,
+	    0x1.0feeafd2f8577p+0, 0x1.0ecf062c51c3bp+0,
+	    0x1.0db1baa076c8bp+0, 0x1.0c96c5bb3048ep+0,
+	    0x1.0b7e20263e070p+0, 0x1.0a67c2acd0ce3p+0,
+	    0x1.0953a6391e982p+0, 0x1.0841c3caea380p+0,
+	    0x1.07321489b13eap+0, 0x1.062491aee9904p+0,
+	    0x1.05193497a7cc5p+0, 0x1.040ff6b5f5e9fp+0,
+	    0x1.0308d19aa6127p+0, 0x1.0203beedb0c67p+0,
+	    0x1.010037d38bcc2p+0, 1.0,
+	    0x1.fc06d493cca10p-1, 0x1.f81e6ac3b918fp-1,
+	    0x1.f44546ef18996p-1, 0x1.f07b10382c84bp-1,
+	    0x1.ecbf7070e59d4p-1, 0x1.e91213f715939p-1,
+	    0x1.e572a9a75f7b7p-1, 0x1.e1e0e2c530207p-1,
+	    0x1.de5c72d8a8be3p-1, 0x1.dae50fa5658ccp-1,
+	    0x1.d77a71145a2dap-1, 0x1.d41c51166623ep-1,
+	    0x1.d0ca6ba0bb29fp-1, 0x1.cd847e8e59681p-1,
+	    0x1.ca4a499693e00p-1, 0x1.c71b8e399e821p-1,
+	    0x1.c3f80faf19077p-1, 0x1.c0df92dc2b0ecp-1,
+	    0x1.bdd1de3cbb542p-1, 0x1.baceb9e1007a3p-1,
+	    0x1.b7d5ef543e55ep-1, 0x1.b4e749977d953p-1,
+	    0x1.b20295155478ep-1, 0x1.af279f8e82be2p-1,
+	    0x1.ac5638197fdf3p-1, 0x1.a98e2f102e087p-1,
+	    0x1.a6cf5606d05c1p-1, 0x1.a4197fc04d746p-1,
+	    0x1.a16c80293dc01p-1, 0x1.9ec82c4dc5bc9p-1,
+	    0x1.9c2c5a491f534p-1, 0x1.9998e1480b618p-1,
+	    0x1.970d9977c6c2dp-1, 0x1.948a5c023d212p-1,
+	    0x1.920f0303d6809p-1, 0x1.8f9b698a98b45p-1,
+	    0x1.8d2f6b81726f6p-1, 0x1.8acae5bb55badp-1,
+	    0x1.886db5d9275b8p-1, 0x1.8617ba567c13cp-1,
+	    0x1.83c8d27487800p-1, 0x1.8180de3c5dbe7p-1,
+	    0x1.7f3fbe71cdb71p-1, 0x1.7d055498071c1p-1,
+	    0x1.7ad182e54f65ap-1, 0x1.78a42c3c90125p-1,
+	    0x1.767d342f76944p-1, 0x1.745c7ef26b00ap-1,
+	    0x1.7241f15769d0fp-1, 0x1.702d70d396e41p-1,
+	    0x1.6e1ee3700cd11p-1, 0x1.6c162fc9cbe02p-1 },
+  .logc = { -0x1.62fe995eb963ap-2, -0x1.5d5a48dad6b67p-2,
+	    -0x1.57bde257d2769p-2, -0x1.52294fbf2af55p-2,
+	    -0x1.4c9c7b598aa38p-2, -0x1.47174fc5ff560p-2,
+	    -0x1.4199b7fa7b5cap-2, -0x1.3c239f48cfb99p-2,
+	    -0x1.36b4f154d2aebp-2, -0x1.314d9a0ff32fbp-2,
+	    -0x1.2bed85cca3cffp-2, -0x1.2694a11421af9p-2,
+	    -0x1.2142d8d014fb2p-2, -0x1.1bf81a2c77776p-2,
+	    -0x1.16b452a39c6a4p-2, -0x1.11776ffa6c67ep-2,
+	    -0x1.0c416035020e0p-2, -0x1.071211aa10fdap-2,
+	    -0x1.01e972e293b1bp-2, -0x1.f98ee587fd434p-3,
+	    -0x1.ef5800ad716fbp-3, -0x1.e52e160484698p-3,
+	    -0x1.db1104b19352ep-3, -0x1.d100ac59e0bd6p-3,
+	    -0x1.c6fced287c3bdp-3, -0x1.bd05a7b317c29p-3,
+	    -0x1.b31abd229164fp-3, -0x1.a93c0edadb0a3p-3,
+	    -0x1.9f697ee30d7ddp-3, -0x1.95a2efa9aa40ap-3,
+	    -0x1.8be843d796044p-3, -0x1.82395ecc477edp-3,
+	    -0x1.7896240966422p-3, -0x1.6efe77aca8c55p-3,
+	    -0x1.65723e117ec5cp-3, -0x1.5bf15c0955706p-3,
+	    -0x1.527bb6c111da1p-3, -0x1.491133c939f8fp-3,
+	    -0x1.3fb1b90c7fc58p-3, -0x1.365d2cc485f8dp-3,
+	    -0x1.2d13758970de7p-3, -0x1.23d47a721fd47p-3,
+	    -0x1.1aa0229f25ec2p-3, -0x1.117655ddebc3bp-3,
+	    -0x1.0856fbf83ab6bp-3, -0x1.fe83fabbaa106p-4,
+	    -0x1.ec6e8507a56cdp-4, -0x1.da6d68c7cc2eap-4,
+	    -0x1.c88078462be0cp-4, -0x1.b6a786a423565p-4,
+	    -0x1.a4e2676ac7f85p-4, -0x1.9330eea777e76p-4,
+	    -0x1.8192f134d5ad9p-4, -0x1.70084464f0538p-4,
+	    -0x1.5e90bdec5cb1fp-4, -0x1.4d2c3433c5536p-4,
+	    -0x1.3bda7e219879ap-4, -0x1.2a9b732d27194p-4,
+	    -0x1.196eeb2b10807p-4, -0x1.0854be8ef8a7ep-4,
+	    -0x1.ee998cb277432p-5, -0x1.ccadb79919fb9p-5,
+	    -0x1.aae5b1d8618b0p-5, -0x1.89413015d7442p-5,
+	    -0x1.67bfe7bf158dep-5, -0x1.46618f83941bep-5,
+	    -0x1.2525df1b0618ap-5, -0x1.040c8e2f77c6ap-5,
+	    -0x1.c62aad39f738ap-6, -0x1.847fe3bdead9cp-6,
+	    -0x1.43183683400acp-6, -0x1.01f31c4e1d544p-6,
+	    -0x1.82201d1e6b69ap-7, -0x1.00dd0f3e1bfd6p-7,
+	    -0x1.ff6fe1feb4e53p-9, 0.0,
+	    0x1.fe91885ec8e20p-8,  0x1.fc516f716296dp-7,
+	    0x1.7bb4dd70a015bp-6,  0x1.f84c99b34b674p-6,
+	    0x1.39f9ce4fb2d71p-5,  0x1.7756c0fd22e78p-5,
+	    0x1.b43ee82db8f3ap-5,  0x1.f0b3fced60034p-5,
+	    0x1.165bd78d4878ep-4,  0x1.3425d2715ebe6p-4,
+	    0x1.51b8bd91b7915p-4,  0x1.6f15632c76a47p-4,
+	    0x1.8c3c88ecbe503p-4,  0x1.a92ef077625dap-4,
+	    0x1.c5ed5745fa006p-4,  0x1.e27876de1c993p-4,
+	    0x1.fed104fce4cdcp-4,  0x1.0d7bd9c17d78bp-3,
+	    0x1.1b76986cef97bp-3,  0x1.295913d24f750p-3,
+	    0x1.37239fa295d17p-3,  0x1.44d68dd78714bp-3,
+	    0x1.52722ebe5d780p-3,  0x1.5ff6d12671f98p-3,
+	    0x1.6d64c2389484bp-3,  0x1.7abc4da40fddap-3,
+	    0x1.87fdbda1e8452p-3,  0x1.95295b06a5f37p-3,
+	    0x1.a23f6d34abbc5p-3,  0x1.af403a28e04f2p-3,
+	    0x1.bc2c06a85721ap-3,  0x1.c903161240163p-3,
+	    0x1.d5c5aa93287ebp-3,  0x1.e274051823fa9p-3,
+	    0x1.ef0e656300c16p-3,  0x1.fb9509f05aa2ap-3,
+	    0x1.04041821f37afp-2,  0x1.0a340a49b3029p-2,
+	    0x1.105a7918a126dp-2,  0x1.1677819812b84p-2,
+	    0x1.1c8b405b40c0ep-2,  0x1.2295d16cfa6b1p-2,
+	    0x1.28975066318a2p-2,  0x1.2e8fd855d86fcp-2,
+	    0x1.347f83d605e59p-2,  0x1.3a666d1244588p-2,
+	    0x1.4044adb6f8ec4p-2,  0x1.461a5f077558cp-2,
+	    0x1.4be799e20b9c8p-2,  0x1.51ac76a6b79dfp-2,
+	    0x1.57690d5744a45p-2,  0x1.5d1d758e45217p-2 }
+};
--- a/sysdeps/aarch64/fpu/vecmath_config.h
+++ b/sysdeps/aarch64/fpu/vecmath_config.h
@ -35,4 +35,14 @@
    __ptr;                                                                    \
  })

+#define V_LOG_POLY_ORDER 6
+#define V_LOG_TABLE_BITS 7
+extern const struct v_log_data
+{
+  /* Shared data for vector log and log-derived routines (e.g. asinh).  */
+  double poly[V_LOG_POLY_ORDER - 1];
+  double ln2;
+  double invc[1 << V_LOG_TABLE_BITS];
+  double logc[1 << V_LOG_TABLE_BITS];
+} __v_log_data attribute_hidden;
 #endif
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@ -1219,10 +1219,18 @@ double: 3
 float: 3
 ldouble: 1

+Function: "log_advsimd":
+double: 1
+float: 3
+
 Function: "log_downward":
 float: 2
 ldouble: 1

+Function: "log_sve":
+double: 1
+float: 3
+
 Function: "log_towardzero":
 float: 2
 ldouble: 2
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@ -1,8 +1,12 @@
 GLIBC_2.38 _ZGVnN2v_cos F
+GLIBC_2.38 _ZGVnN2v_log F
 GLIBC_2.38 _ZGVnN2v_sin F
 GLIBC_2.38 _ZGVnN4v_cosf F
+GLIBC_2.38 _ZGVnN4v_logf F
 GLIBC_2.38 _ZGVnN4v_sinf F
 GLIBC_2.38 _ZGVsMxv_cos F
 GLIBC_2.38 _ZGVsMxv_cosf F
+GLIBC_2.38 _ZGVsMxv_log F
+GLIBC_2.38 _ZGVsMxv_logf F
 GLIBC_2.38 _ZGVsMxv_sin F
 GLIBC_2.38 _ZGVsMxv_sinf F