glibc/sysdeps/aarch64/fpu/v_math.h

/* Utilities for Advanced SIMD libmvec routines.
   Copyright (C) 2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#ifndef _V_MATH_H
#define _V_MATH_H

#include <arm_neon.h>
#include "vecmath_config.h"

/* Function attribute requesting the aarch64_vector_pcs calling convention.
   NOTE(review): presumably meant for the vector routine entry points; the
   users of this macro are outside this header.  */
#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))

/* Build the symbol name of a vector routine from the scalar name FUN by
   token pasting.  The F variants use the _ZGVnN4 prefix and append an `f'
   suffix, the D variants use the _ZGVnN2 prefix with no suffix; the 1 and 2
   variants insert one or two `v' markers respectively.  For example,
   V_NAME_F1 (cos) expands to _ZGVnN4v_cosf and V_NAME_D2 (pow) expands to
   _ZGVnN2vv_pow.
   NOTE(review): these look like AArch64 vector function ABI mangled names
   (4 float / 2 double lanes, one `v' per vector argument) -- confirm
   against the ABI document.  */
#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
#define V_NAME_D1(fun) _ZGVnN2v_##fun
#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
#define V_NAME_D2(fun) _ZGVnN2vv_##fun

/* Shorthand helpers for declaring constants: each expands to a braced
   initializer list that repeats X 2, 4 or 8 times.  X is substituted once
   per element, so a non-constant argument is evaluated that many times;
   do not pass expressions with side effects.  */
#define V2(X) { X, X }
#define V4(X) { X, X, X, X }
#define V8(X) { X, X, X, X, X, X, X, X }

/* Return non-zero if any bit of the 64-bit vector X is set.  The four
   16-bit lanes are viewed as a single 64-bit value, which is then compared
   against zero.  */
static inline int
v_any_u16h (uint16x4_t x)
{
  uint64x1_t whole = vreinterpret_u64_u16 (x);
  uint64_t bits = vget_lane_u64 (whole, 0);
  return bits != 0;
}

/* Return a vector with all four float lanes set to X.  */
static inline float32x4_t
v_f32 (float x)
{
  const float32x4_t splat = V4 (x);
  return splat;
}
/* Return a vector with all four unsigned 32-bit lanes set to X.  */
static inline uint32x4_t
v_u32 (uint32_t x)
{
  const uint32x4_t splat = V4 (x);
  return splat;
}
/* Return a vector with all four signed 32-bit lanes set to X.  */
static inline int32x4_t
v_s32 (int32_t x)
{
  const int32x4_t splat = V4 (x);
  return splat;
}

/* Return non-zero if any element of a vector compare result is non-zero.
   X is viewed as two 64-bit halves which are added together, and the sum
   is compared against zero.  */
static inline int
v_any_u32 (uint32x4_t x)
{
  /* Assumes every element of X is either 0 or -1u; for arbitrary element
     values the sum of the two halves could wrap around to zero.  */
  uint64x2_t halves = vreinterpretq_u64_u32 (x);
  uint64_t folded = vpaddd_u64 (halves);
  return folded != 0;
}
/* Return non-zero if any bit of the 64-bit vector X is set.  The two
   32-bit lanes are viewed as a single 64-bit value, which is then compared
   against zero.  */
static inline int
v_any_u32h (uint32x2_t x)
{
  uint64x1_t whole = vreinterpret_u64_u32 (x);
  uint64_t bits = vget_lane_u64 (whole, 0);
  return bits != 0;
}
/* Gather four floats from TAB: lane i of the result is tab[idx[i]].
   No bounds checking is done on the indices.  */
static inline float32x4_t
v_lookup_f32 (const float *tab, uint32x4_t idx)
{
  const float e0 = tab[idx[0]];
  const float e1 = tab[idx[1]];
  const float e2 = tab[idx[2]];
  const float e3 = tab[idx[3]];
  return (float32x4_t){ e0, e1, e2, e3 };
}
/* Gather four 32-bit unsigned values from TAB: lane i of the result is
   tab[idx[i]].  No bounds checking is done on the indices.  */
static inline uint32x4_t
v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
{
  const uint32_t e0 = tab[idx[0]];
  const uint32_t e1 = tab[idx[1]];
  const uint32_t e2 = tab[idx[2]];
  const uint32_t e3 = tab[idx[3]];
  return (uint32x4_t){ e0, e1, e2, e3 };
}
/* Lane-wise scalar fallback.  For each lane i the result is f (x[i]) when
   p[i] is non-zero and y[i] otherwise; F is only called for lanes whose
   predicate element is non-zero.  */
static inline float32x4_t
v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
{
  const float r0 = p[0] ? f (x[0]) : y[0];
  const float r1 = p[1] ? f (x[1]) : y[1];
  const float r2 = p[2] ? f (x[2]) : y[2];
  const float r3 = p[3] ? f (x[3]) : y[3];
  return (float32x4_t){ r0, r1, r2, r3 };
}
/* Lane-wise scalar fallback for two-argument functions.  For each lane i
   the result is f (x1[i], x2[i]) when p[i] is non-zero and y[i] otherwise;
   F is only called for lanes whose predicate element is non-zero.  */
static inline float32x4_t
v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
             float32x4_t y, uint32x4_t p)
{
  const float r0 = p[0] ? f (x1[0], x2[0]) : y[0];
  const float r1 = p[1] ? f (x1[1], x2[1]) : y[1];
  const float r2 = p[2] ? f (x1[2], x2[2]) : y[2];
  const float r3 = p[3] ? f (x1[3], x2[3]) : y[3];
  return (float32x4_t){ r0, r1, r2, r3 };
}

/* Return a vector with both double lanes set to X.  */
static inline float64x2_t
v_f64 (double x)
{
  const float64x2_t splat = V2 (x);
  return splat;
}
/* Return a vector with both unsigned 64-bit lanes set to X.  */
static inline uint64x2_t
v_u64 (uint64_t x)
{
  const uint64x2_t splat = V2 (x);
  return splat;
}
/* Return a vector with both signed 64-bit lanes set to X.  */
static inline int64x2_t
v_s64 (int64_t x)
{
  const int64x2_t splat = V2 (x);
  return splat;
}

/* Return non-zero if any element of a vector compare result is non-zero.
   The two 64-bit lanes of X are added together and the sum is compared
   against zero.  */
static inline int
v_any_u64 (uint64x2_t x)
{
  /* Assumes every element of X is either 0 or -1u; for arbitrary element
     values the sum of the two lanes could wrap around to zero.  */
  uint64_t folded = vpaddd_u64 (x);
  return folded != 0;
}
/* Return non-zero if all elements of a vector compare result are set.
   The two lanes of X are reinterpreted as signed 64-bit values and added;
   two all-ones lanes (-1 each) give a sum of -2.  */
static inline int
v_all_u64 (uint64x2_t x)
{
  /* Assumes every element of X is either 0 or -1u.  */
  int64x2_t lanes = vreinterpretq_s64_u64 (x);
  int64_t total = vpaddd_s64 (lanes);
  return total == -2;
}
/* Gather two doubles from TAB: lane i of the result is tab[idx[i]].
   No bounds checking is done on the indices.  */
static inline float64x2_t
v_lookup_f64 (const double *tab, uint64x2_t idx)
{
  const double e0 = tab[idx[0]];
  const double e1 = tab[idx[1]];
  return (float64x2_t){ e0, e1 };
}
/* Gather two 64-bit unsigned values from TAB: lane i of the result is
   tab[idx[i]].  No bounds checking is done on the indices.  */
static inline uint64x2_t
v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
{
  const uint64_t e0 = tab[idx[0]];
  const uint64_t e1 = tab[idx[1]];
  return (uint64x2_t){ e0, e1 };
}
/* Lane-wise scalar fallback.  For each lane i the result is f (x[i]) when
   p[i] is non-zero and y[i] otherwise; F is only called for lanes whose
   predicate element is non-zero.  */
static inline float64x2_t
v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
{
  const double r0 = p[0] ? f (x[0]) : y[0];
  const double r1 = p[1] ? f (x[1]) : y[1];
  return (float64x2_t){ r0, r1 };
}
/* Lane-wise scalar fallback for two-argument functions.  For each lane i
   the result is f (x1[i], x2[i]) when p[i] is non-zero and y[i] otherwise;
   F is only called for lanes whose predicate element is non-zero.  */
static inline float64x2_t
v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2,
             float64x2_t y, uint64x2_t p)
{
  const double r0 = p[0] ? f (x1[0], x2[0]) : y[0];
  const double r1 = p[1] ? f (x1[1], x2[1]) : y[1];
  return (float64x2_t){ r0, r1 };
}

#endif
aarch64: Add vector implementations of cos routines Replace the loop-over-scalar placeholder routines with optimised implementations from Arm Optimized Routines (AOR). Also add some headers containing utilities for aarch64 libmvec routines, and update libm-test-ulps. Data tables for new routines are used via a pointer with a barrier on it, in order to prevent overly aggressive constant inlining in GCC. This allows a single adrp, combined with offset loads, to be used for every constant in the table. Special-case handlers are marked NOINLINE in order to confine the save/restore overhead of switching from vector to normal calling standard. This way we only incur the extra memory access in the exceptional cases. NOINLINE definitions have been moved to math_private.h in order to reduce duplication. AOR exposes a config option, WANT_SIMD_EXCEPT, to enable selective masking (and later fixing up) of invalid lanes, in order to trigger fp exceptions correctly (AdvSIMD only). This is tested and maintained in AOR, however it is configured off at source level here for performance reasons. We keep the WANT_SIMD_EXCEPT blocks in routine sources to greatly simplify the upstreaming process from AOR to glibc. Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2023-06-28 11:19:36 +00:00			`/* Utilities for Advanced SIMD libmvec routines.`
			`Copyright (C) 2023 Free Software Foundation, Inc.`
			`This file is part of the GNU C Library.`

			`The GNU C Library is free software; you can redistribute it and/or`
			`modify it under the terms of the GNU Lesser General Public`
			`License as published by the Free Software Foundation; either`
			`version 2.1 of the License, or (at your option) any later version.`

			`The GNU C Library is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`Lesser General Public License for more details.`

			`You should have received a copy of the GNU Lesser General Public`
			`License along with the GNU C Library; if not, see`
			`<https://www.gnu.org/licenses/>. */`

			`#ifndef _V_MATH_H`
			`#define _V_MATH_H`

			`#include <arm_neon.h>`
			`#include "vecmath_config.h"`

			`#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))`

			`#define V_NAME_F1(fun) _ZGVnN4v_##fun##f`
			`#define V_NAME_D1(fun) _ZGVnN2v_##fun`
			`#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f`
			`#define V_NAME_D2(fun) _ZGVnN2vv_##fun`

			`/* Shorthand helpers for declaring constants. */`
aarch64: Optimise vecmath logs * Transpose table layout for improved memory access * Use half-vector special comparisons for AdvSIMD * Improve register use near special-case branches - Due to the presence of a function call, return value would get mov-d out of x0 in order to facilitate PCS. By moving the final computation after the branch this can be avoided Also change SVE routines to use overloaded intrinsics for readability. 2023-10-04 09:38:57 +00:00			`#define V2(X) { X, X }`
			`#define V4(X) { X, X, X, X }`
			`#define V8(X) { X, X, X, X, X, X, X, X }`
aarch64: Add vector implementations of cos routines Replace the loop-over-scalar placeholder routines with optimised implementations from Arm Optimized Routines (AOR). Also add some headers containing utilities for aarch64 libmvec routines, and update libm-test-ulps. Data tables for new routines are used via a pointer with a barrier on it, in order to prevent overly aggressive constant inlining in GCC. This allows a single adrp, combined with offset loads, to be used for every constant in the table. Special-case handlers are marked NOINLINE in order to confine the save/restore overhead of switching from vector to normal calling standard. This way we only incur the extra memory access in the exceptional cases. NOINLINE definitions have been moved to math_private.h in order to reduce duplication. AOR exposes a config option, WANT_SIMD_EXCEPT, to enable selective masking (and later fixing up) of invalid lanes, in order to trigger fp exceptions correctly (AdvSIMD only). This is tested and maintained in AOR, however it is configured off at source level here for performance reasons. We keep the WANT_SIMD_EXCEPT blocks in routine sources to greatly simplify the upstreaming process from AOR to glibc. Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2023-06-28 11:19:36 +00:00
aarch64: Optimise vecmath logs * Transpose table layout for improved memory access * Use half-vector special comparisons for AdvSIMD * Improve register use near special-case branches - Due to the presence of a function call, return value would get mov-d out of x0 in order to facilitate PCS. By moving the final computation after the branch this can be avoided Also change SVE routines to use overloaded intrinsics for readability. 2023-10-04 09:38:57 +00:00			`static inline int`
			`v_any_u16h (uint16x4_t x)`
			`{`
			`return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;`
			`}`
aarch64: Add vector implementations of cos routines Replace the loop-over-scalar placeholder routines with optimised implementations from Arm Optimized Routines (AOR). Also add some headers containing utilities for aarch64 libmvec routines, and update libm-test-ulps. Data tables for new routines are used via a pointer with a barrier on it, in order to prevent overly aggressive constant inlining in GCC. This allows a single adrp, combined with offset loads, to be used for every constant in the table. Special-case handlers are marked NOINLINE in order to confine the save/restore overhead of switching from vector to normal calling standard. This way we only incur the extra memory access in the exceptional cases. NOINLINE definitions have been moved to math_private.h in order to reduce duplication. AOR exposes a config option, WANT_SIMD_EXCEPT, to enable selective masking (and later fixing up) of invalid lanes, in order to trigger fp exceptions correctly (AdvSIMD only). This is tested and maintained in AOR, however it is configured off at source level here for performance reasons. We keep the WANT_SIMD_EXCEPT blocks in routine sources to greatly simplify the upstreaming process from AOR to glibc. Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2023-06-28 11:19:36 +00:00
			`static inline float32x4_t`
			`v_f32 (float x)`
			`{`
			`return (float32x4_t) V4 (x);`
			`}`
			`static inline uint32x4_t`
			`v_u32 (uint32_t x)`
			`{`
			`return (uint32x4_t) V4 (x);`
			`}`
			`static inline int32x4_t`
			`v_s32 (int32_t x)`
			`{`
			`return (int32x4_t) V4 (x);`
			`}`

			`/* true if any elements of a vector compare result is non-zero. */`
			`static inline int`
			`v_any_u32 (uint32x4_t x)`
			`{`
			`/* assume elements in x are either 0 or -1u. */`
			`return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;`
			`}`
aarch64: Optimise vecmath logs * Transpose table layout for improved memory access * Use half-vector special comparisons for AdvSIMD * Improve register use near special-case branches - Due to the presence of a function call, return value would get mov-d out of x0 in order to facilitate PCS. By moving the final computation after the branch this can be avoided Also change SVE routines to use overloaded intrinsics for readability. 2023-10-04 09:38:57 +00:00			`static inline int`
			`v_any_u32h (uint32x2_t x)`
			`{`
			`return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0;`
			`}`
aarch64: Add vector implementations of cos routines Replace the loop-over-scalar placeholder routines with optimised implementations from Arm Optimized Routines (AOR). Also add some headers containing utilities for aarch64 libmvec routines, and update libm-test-ulps. Data tables for new routines are used via a pointer with a barrier on it, in order to prevent overly aggressive constant inlining in GCC. This allows a single adrp, combined with offset loads, to be used for every constant in the table. Special-case handlers are marked NOINLINE in order to confine the save/restore overhead of switching from vector to normal calling standard. This way we only incur the extra memory access in the exceptional cases. NOINLINE definitions have been moved to math_private.h in order to reduce duplication. AOR exposes a config option, WANT_SIMD_EXCEPT, to enable selective masking (and later fixing up) of invalid lanes, in order to trigger fp exceptions correctly (AdvSIMD only). This is tested and maintained in AOR, however it is configured off at source level here for performance reasons. We keep the WANT_SIMD_EXCEPT blocks in routine sources to greatly simplify the upstreaming process from AOR to glibc. Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2023-06-28 11:19:36 +00:00			`static inline float32x4_t`
			`v_lookup_f32 (const float *tab, uint32x4_t idx)`
			`{`
			`return (float32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };`
			`}`
			`static inline uint32x4_t`
			`v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)`
			`{`
			`return (uint32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };`
			`}`
			`static inline float32x4_t`
			`v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)`
			`{`
			`return (float32x4_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],`
			`p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3] };`
			`}`
			`static inline float32x4_t`
			`v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,`
			`float32x4_t y, uint32x4_t p)`
			`{`
			`return (float32x4_t){ p[0] ? f (x1[0], x2[0]) : y[0],`
			`p[1] ? f (x1[1], x2[1]) : y[1],`
			`p[2] ? f (x1[2], x2[2]) : y[2],`
			`p[3] ? f (x1[3], x2[3]) : y[3] };`
			`}`

			`static inline float64x2_t`
			`v_f64 (double x)`
			`{`
			`return (float64x2_t) V2 (x);`
			`}`
			`static inline uint64x2_t`
			`v_u64 (uint64_t x)`
			`{`
			`return (uint64x2_t) V2 (x);`
			`}`
			`static inline int64x2_t`
			`v_s64 (int64_t x)`
			`{`
			`return (int64x2_t) V2 (x);`
			`}`

			`/* true if any elements of a vector compare result is non-zero. */`
			`static inline int`
			`v_any_u64 (uint64x2_t x)`
			`{`
			`/* assume elements in x are either 0 or -1u. */`
			`return vpaddd_u64 (x) != 0;`
			`}`
			`/* true if all elements of a vector compare result is 1. */`
			`static inline int`
			`v_all_u64 (uint64x2_t x)`
			`{`
			`/* assume elements in x are either 0 or -1u. */`
			`return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2;`
			`}`
			`static inline float64x2_t`
			`v_lookup_f64 (const double *tab, uint64x2_t idx)`
			`{`
			`return (float64x2_t){ tab[idx[0]], tab[idx[1]] };`
			`}`
			`static inline uint64x2_t`
			`v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)`
			`{`
			`return (uint64x2_t){ tab[idx[0]], tab[idx[1]] };`
			`}`
			`static inline float64x2_t`
			`v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)`
			`{`
			`return (float64x2_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1] };`
			`}`
			`static inline float64x2_t`
			`v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2,`
			`float64x2_t y, uint64x2_t p)`
			`{`
			`return (float64x2_t){ p[0] ? f (x1[0], x2[0]) : y[0],`
			`p[1] ? f (x1[1], x2[1]) : y[1] };`
			`}`

			`#endif`