aarch64: Add half-width versions of AdvSIMD f32 libmvec routines

Compilers may emit calls to 'half-width' routines (two-lane
single-precision variants). These have been added in the form of
wrappers around the full-width versions, where the low half of the
vector is simply duplicated. This will perform poorly when one lane
triggers the special-case handler, as there will be a redundant call
to the scalar version; however, this is expected to be rare at -Ofast.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
This commit is contained in:
Joe Ramsay 2023-12-19 16:44:01 +00:00 committed by Szabolcs Nagy
parent 3150cc0c90
commit cc0d77ba94
20 changed files with 125 additions and 14 deletions

View File

@ -600,8 +600,10 @@ for linking")
#endif
/* Hidden-symbol helpers for libmvec.  When this translation unit is
   being built as part of libmvec they forward to hidden_proto and
   hidden_def; in every other module they expand to nothing.  */
#if IS_IN (libmvec)
# define libmvec_hidden_proto(name, attrs...) hidden_proto (name, ##attrs)
# define libmvec_hidden_def(name) hidden_def (name)
#else
# define libmvec_hidden_proto(name, attrs...)
# define libmvec_hidden_def(name)
#endif

View File

@ -18,47 +18,62 @@ libmvec {
_ZGVsMxv_sinf;
}
GLIBC_2.39 {
_ZGVnN2v_cosf;
_ZGVnN2v_expf;
_ZGVnN2v_logf;
_ZGVnN2v_sinf;
_ZGVnN4v_acosf;
_ZGVnN2v_acosf;
_ZGVnN2v_acos;
_ZGVsMxv_acosf;
_ZGVsMxv_acos;
_ZGVnN4v_asinf;
_ZGVnN2v_asinf;
_ZGVnN2v_asin;
_ZGVsMxv_asinf;
_ZGVsMxv_asin;
_ZGVnN4v_atanf;
_ZGVnN2v_atanf;
_ZGVnN2v_atan;
_ZGVsMxv_atanf;
_ZGVsMxv_atan;
_ZGVnN4vv_atan2f;
_ZGVnN2vv_atan2f;
_ZGVnN2vv_atan2;
_ZGVsMxvv_atan2f;
_ZGVsMxvv_atan2;
_ZGVnN4v_exp10f;
_ZGVnN2v_exp10f;
_ZGVnN2v_exp10;
_ZGVsMxv_exp10f;
_ZGVsMxv_exp10;
_ZGVnN4v_exp2f;
_ZGVnN2v_exp2f;
_ZGVnN2v_exp2;
_ZGVsMxv_exp2f;
_ZGVsMxv_exp2;
_ZGVnN4v_expm1f;
_ZGVnN2v_expm1f;
_ZGVnN2v_expm1;
_ZGVsMxv_expm1f;
_ZGVsMxv_expm1;
_ZGVnN4v_log10f;
_ZGVnN2v_log10f;
_ZGVnN2v_log10;
_ZGVsMxv_log10f;
_ZGVsMxv_log10;
_ZGVnN4v_log1pf;
_ZGVnN2v_log1pf;
_ZGVnN2v_log1p;
_ZGVsMxv_log1pf;
_ZGVsMxv_log1p;
_ZGVnN4v_log2f;
_ZGVnN2v_log2f;
_ZGVnN2v_log2;
_ZGVsMxv_log2f;
_ZGVsMxv_log2;
_ZGVnN4v_tanf;
_ZGVnN2v_tanf;
_ZGVnN2v_tan;
_ZGVsMxv_tanf;
_ZGVsMxv_tan;

View File

@ -68,7 +68,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
The largest observed error in this region is 1.32 ulps,
_ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
want 0x1.feb32ep-1. */
float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@ -111,3 +111,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
return vfmaq_f32 (add, mul, y);
}
libmvec_hidden_def (V_NAME_F1(acos))
HALF_WIDTH_ALIAS_F1 (acos)

View File

@ -0,0 +1,34 @@
/* Hidden prototypes for single-precision AdvSIMD routines
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* One hidden prototype per full-width single-precision routine.  The
   symbol names are built with V_NAME_F1 (one vector argument) and
   V_NAME_F2 (two vector arguments), neither of which is defined in this
   file, so the includer must define them first.  */
libmvec_hidden_proto (V_NAME_F1(acos));
libmvec_hidden_proto (V_NAME_F1(asin));
libmvec_hidden_proto (V_NAME_F1(atan));
libmvec_hidden_proto (V_NAME_F1(cos));
libmvec_hidden_proto (V_NAME_F1(exp10));
libmvec_hidden_proto (V_NAME_F1(exp2));
libmvec_hidden_proto (V_NAME_F1(exp));
libmvec_hidden_proto (V_NAME_F1(expm1));
libmvec_hidden_proto (V_NAME_F1(log10));
libmvec_hidden_proto (V_NAME_F1(log1p));
libmvec_hidden_proto (V_NAME_F1(log2));
libmvec_hidden_proto (V_NAME_F1(log));
libmvec_hidden_proto (V_NAME_F1(sin));
libmvec_hidden_proto (V_NAME_F1(tan));
libmvec_hidden_proto (V_NAME_F2(atan2));

View File

@ -63,7 +63,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
The largest observed error in this region is 2.41 ulps,
_ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */
float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@ -102,3 +102,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
/* Copy sign. */
return vbslq_f32 (v_u32 (AbsMask), y, x);
}
libmvec_hidden_def (V_NAME_F1 (asin))
HALF_WIDTH_ALIAS_F1 (asin)

View File

@ -56,7 +56,7 @@ zeroinfnan (uint32x4_t i)
2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
_ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
want 0x1.967f00p-1. */
float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
{
const struct data *data_ptr = ptr_barrier (&data);
@ -114,3 +114,5 @@ float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
return ret;
}
libmvec_hidden_def (V_NAME_F2 (atan2))
HALF_WIDTH_ALIAS_F2(atan2)

View File

@ -53,7 +53,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps:
_ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */
float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@ -107,3 +107,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
return y;
}
libmvec_hidden_def (V_NAME_F1 (atan))
HALF_WIDTH_ALIAS_F1 (atan)

View File

@ -48,7 +48,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
return v_call_f32 (cosf, x, y, cmp);
}
float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, r, r2, r3, y;
@ -92,3 +92,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
return special_case (x, y, odd, cmp);
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}
libmvec_hidden_def (V_NAME_F1 (cos))
HALF_WIDTH_ALIAS_F1 (cos)

View File

@ -92,7 +92,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
Algorithm is accurate to 2.36 ULP.
_ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11
want 0x1.7e79cp+11. */
float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
#if WANT_SIMD_EXCEPT
@ -138,3 +138,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
return vfmaq_f32 (scale, poly, scale);
}
libmvec_hidden_def (V_NAME_F1 (exp10))
HALF_WIDTH_ALIAS_F1 (exp10)

View File

@ -77,7 +77,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
#endif
float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, r, r2, scale, p, q, poly;
@ -122,3 +122,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
return vfmaq_f32 (scale, poly, scale);
}
libmvec_hidden_def (V_NAME_F1 (exp2))
HALF_WIDTH_ALIAS_F1 (exp2)

View File

@ -82,7 +82,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
#endif
float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, r, r2, scale, p, q, poly, z;
@ -131,3 +131,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
return vfmaq_f32 (scale, poly, scale);
}
libmvec_hidden_def (V_NAME_F1 (exp))
HALF_WIDTH_ALIAS_F1 (exp)

View File

@ -64,7 +64,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
The maximum error is 1.51 ULP:
_ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
want 0x1.e2fb94p-2. */
float32x4_t VPCS_ATTR V_NAME_F1 (expm1) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
@ -115,3 +115,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (expm1) (float32x4_t x)
/* expm1(x) ~= p * t + (t - 1). */
return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
}
libmvec_hidden_def (V_NAME_F1 (expm1))
HALF_WIDTH_ALIAS_F1 (expm1)

View File

@ -55,7 +55,7 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
Maximum error: 3.305ulps (nearest rounding.)
_ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
want 0x1.ffe2f4p-4. */
float32x4_t VPCS_ATTR V_NAME_F1 (log10) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t u = vreinterpretq_u32_f32 (x);
@ -80,3 +80,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (log10) (float32x4_t x)
return special_case (x, y, poly, r2, special);
return vfmaq_f32 (y, poly, r2);
}
libmvec_hidden_def (V_NAME_F1 (log10))
HALF_WIDTH_ALIAS_F1 (log10)

View File

@ -126,3 +126,5 @@ VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
return special_case (special_arg, y, special_cases);
return y;
}
libmvec_hidden_def (V_NAME_F1 (log1p))
HALF_WIDTH_ALIAS_F1 (log1p)

View File

@ -53,7 +53,7 @@ special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
Maximum error: 2.48 ULPs
_ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2
want 0x1.a9be8p-2. */
float32x4_t VPCS_ATTR V_NAME_F1 (log2) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t u = vreinterpretq_u32_f32 (x);
@ -75,3 +75,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (log2) (float32x4_t x)
return special_case (x, n, p, r, special);
return vfmaq_f32 (n, p, r);
}
libmvec_hidden_def (V_NAME_F1 (log2))
HALF_WIDTH_ALIAS_F1 (log2)

View File

@ -49,7 +49,7 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
}
float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, p, q, r, r2, y;
@ -83,3 +83,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
return special_case (x, y, r2, p, cmp);
return vfmaq_f32 (p, y, r2);
}
libmvec_hidden_def (V_NAME_F1 (log))
HALF_WIDTH_ALIAS_F1 (log)

View File

@ -52,7 +52,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
return v_call_f32 (sinf, x, y, cmp);
}
float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, r, r2, y;
@ -92,3 +92,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
return special_case (x, y, odd, cmp);
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}
libmvec_hidden_def (V_NAME_F1 (sin))
HALF_WIDTH_ALIAS_F1 (sin)

View File

@ -73,7 +73,7 @@ eval_poly (float32x4_t z, const struct data *d)
Maximum error is 3.45 ULP:
__v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
want 0x1.ff9850p-1. */
float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t special_arg = x;
@ -127,3 +127,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
return special_case (special_arg, vbslq_f32 (pred_alt, inv_y, y), special);
return vbslq_f32 (pred_alt, inv_y, y);
}
libmvec_hidden_def (V_NAME_F1 (tan))
HALF_WIDTH_ALIAS_F1 (tan)

View File

@ -29,6 +29,21 @@
#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
#define V_NAME_D2(fun) _ZGVnN2vv_##fun
#include "advsimd_f32_protos.h"
/* Define the two-lane single-precision entry point _ZGVnN2v_<fun>f as a
   wrapper around the four-lane routine _ZGVnN4v_<fun>f.  The argument is
   duplicated into both halves of a full-width vector, the full-width
   routine is called once, and the low half of its result is returned.  */
#define HALF_WIDTH_ALIAS_F1(fun) \
float32x2_t VPCS_ATTR _ZGVnN2v_##fun##f (float32x2_t x) \
{ \
  float32x4_t wide = vcombine_f32 (x, x); \
  float32x4_t res = _ZGVnN4v_##fun##f (wide); \
  return vget_low_f32 (res); \
}
/* Two-argument counterpart of the half-width wrapper: define
   _ZGVnN2vv_<fun>f in terms of the four-lane routine _ZGVnN4vv_<fun>f.
   Each argument is duplicated into both halves of a full-width vector,
   the arguments are forwarded in the same order, and the low half of
   the result is returned.  */
#define HALF_WIDTH_ALIAS_F2(fun) \
float32x2_t VPCS_ATTR _ZGVnN2vv_##fun##f (float32x2_t x, float32x2_t y) \
{ \
  float32x4_t wide_x = vcombine_f32 (x, x); \
  float32x4_t wide_y = vcombine_f32 (y, y); \
  float32x4_t res = _ZGVnN4vv_##fun##f (wide_x, wide_y); \
  return vget_low_f32 (res); \
}
/* Shorthand helpers for declaring constants. */
#define V2(X) { X, X }
#define V4(X) { X, X, X, X }

View File

@ -15,16 +15,31 @@ GLIBC_2.38 _ZGVsMxv_logf F
GLIBC_2.38 _ZGVsMxv_sin F
GLIBC_2.38 _ZGVsMxv_sinf F
GLIBC_2.39 _ZGVnN2v_acos F
GLIBC_2.39 _ZGVnN2v_acosf F
GLIBC_2.39 _ZGVnN2v_asin F
GLIBC_2.39 _ZGVnN2v_asinf F
GLIBC_2.39 _ZGVnN2v_atan F
GLIBC_2.39 _ZGVnN2v_atanf F
GLIBC_2.39 _ZGVnN2v_cosf F
GLIBC_2.39 _ZGVnN2v_exp10 F
GLIBC_2.39 _ZGVnN2v_exp10f F
GLIBC_2.39 _ZGVnN2v_exp2 F
GLIBC_2.39 _ZGVnN2v_exp2f F
GLIBC_2.39 _ZGVnN2v_expf F
GLIBC_2.39 _ZGVnN2v_expm1 F
GLIBC_2.39 _ZGVnN2v_expm1f F
GLIBC_2.39 _ZGVnN2v_log10 F
GLIBC_2.39 _ZGVnN2v_log10f F
GLIBC_2.39 _ZGVnN2v_log1p F
GLIBC_2.39 _ZGVnN2v_log1pf F
GLIBC_2.39 _ZGVnN2v_log2 F
GLIBC_2.39 _ZGVnN2v_log2f F
GLIBC_2.39 _ZGVnN2v_logf F
GLIBC_2.39 _ZGVnN2v_sinf F
GLIBC_2.39 _ZGVnN2v_tan F
GLIBC_2.39 _ZGVnN2v_tanf F
GLIBC_2.39 _ZGVnN2vv_atan2 F
GLIBC_2.39 _ZGVnN2vv_atan2f F
GLIBC_2.39 _ZGVnN4v_acosf F
GLIBC_2.39 _ZGVnN4v_asinf F
GLIBC_2.39 _ZGVnN4v_atanf F