glibc/sysdeps/aarch64/fpu/coshf_sve.c

/* Single-precision vector (SVE) cosh function

   Copyright (C) 2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include "sv_math.h"
#include "sv_expf_inline.h"

/* Constants used by the SVE coshf routine in this file.  */
struct data
{
  /* Constant table handed to the inline expf helper.  */
  struct sv_expf_data expf_consts;
  /* Threshold compared against |x| to pick out lanes that need the
     special-case path.  */
  float special_bound;
};

static const struct data data = {
  .expf_consts = SV_EXPF_DATA,
  /* expf overflows for inputs above 0x1.5a92d8p+6, so lanes at or beyond
     this magnitude cannot use the fast path.  */
  .special_bound = 0x1.5a92d8p+6f,
};

/* Out-of-line fallback for vectors containing at least one special lane.
   HALF_E and HALF_OVER_E are the two halves of the fast-path result; their
   sum is passed to sv_call_f32 together with the scalar coshf, the original
   input X and the predicate PG of special lanes.  NOTE(review): sv_call_f32
   is defined outside this file; presumably it re-evaluates the PG lanes with
   coshf and keeps the sum elsewhere - confirm against sv_math.h.  */
static svfloat32_t NOINLINE
special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e,
	      svbool_t pg)
{
  svfloat32_t fast_result = svadd_x (svptrue_b32 (), half_e, half_over_e);
  return sv_call_f32 (coshf, x, fast_result, pg);
}

/* SVE single-precision cosh built on top of the inline vector expf helper.
   The worst observed error is 2.77 ULP:
   _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2
				 want 0x1.e4594cp+2.  */
svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
{
  const struct data *consts = ptr_barrier (&data);

  /* Flag active lanes whose magnitude reaches the special-case bound.  */
  svbool_t needs_fallback = svacge (pg, x, consts->special_bound);

  /* cosh(x) is formed as exp(x) / 2 + exp(-x) / 2, with the second term
     obtained as 0.5 / exp(x).
     exp is fed x itself instead of |x|: that sidesteps a destructive unary
     ABS and so helps register usage.  The price is that the result is not
     perfectly symmetric in x, because the exp helper is slightly less
     accurate for negative arguments.  */
  svfloat32_t exp_x = expf_inline (x, pg, &consts->expf_consts);
  svfloat32_t pos_half = svmul_x (svptrue_b32 (), exp_x, 0.5);
  svfloat32_t neg_half = svdivr_x (pg, exp_x, 0.5);

  /* Take the out-of-line path only if some active lane was flagged.  */
  if (__glibc_unlikely (svptest_any (pg, needs_fallback)))
    return special_case (x, pos_half, neg_half, needs_fallback);

  return svadd_x (svptrue_b32 (), pos_half, neg_half);
}
aarch64/fpu: Add vector variants of cosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:39 +00:00			`/* Single-precision vector (SVE) cosh function`

			`Copyright (C) 2024 Free Software Foundation, Inc.`
			`This file is part of the GNU C Library.`

			`The GNU C Library is free software; you can redistribute it and/or`
			`modify it under the terms of the GNU Lesser General Public`
			`License as published by the Free Software Foundation; either`
			`version 2.1 of the License, or (at your option) any later version.`

			`The GNU C Library is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`Lesser General Public License for more details.`

			`You should have received a copy of the GNU Lesser General Public`
			`License along with the GNU C Library; if not, see`
			`<https://www.gnu.org/licenses/>. */`

			`#include "sv_math.h"`
			`#include "sv_expf_inline.h"`

			`static const struct data`
			`{`
			`struct sv_expf_data expf_consts;`
AArch64: Improve codegen in SVE expf & related routines Reduce MOV and MOVPRFX by improving special-case handling. Use inline helper to duplicate the entire computation between the special- and non-special case branches, removing the contention for z0 between x and the return value. Also rearrange some MLAs and MLSs - by making the multiplicand the destination we can avoid a MOVPRFX in several cases. Also change which constants go in the vector used for lanewise ops - the last lane is no longer wasted. Spotted that shift was incorrect in exp2f and exp10f, w.r.t. to the comment that explains it. Fixed - worst-case ULP for exp2f moves around but it doesn't change significantly for either routine. Worst-case error for coshf increases due to passing x to exp rather than abs(x) - updated the comment, but does not require regen-ulps. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:26:12 +00:00			`float special_bound;`
aarch64/fpu: Add vector variants of cosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:39 +00:00			`} data = {`
			`.expf_consts = SV_EXPF_DATA,`
			`/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */`
AArch64: Improve codegen in SVE expf & related routines Reduce MOV and MOVPRFX by improving special-case handling. Use inline helper to duplicate the entire computation between the special- and non-special case branches, removing the contention for z0 between x and the return value. Also rearrange some MLAs and MLSs - by making the multiplicand the destination we can avoid a MOVPRFX in several cases. Also change which constants go in the vector used for lanewise ops - the last lane is no longer wasted. Spotted that shift was incorrect in exp2f and exp10f, w.r.t. to the comment that explains it. Fixed - worst-case ULP for exp2f moves around but it doesn't change significantly for either routine. Worst-case error for coshf increases due to passing x to exp rather than abs(x) - updated the comment, but does not require regen-ulps. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:26:12 +00:00			`.special_bound = 0x1.5a92d8p+6,`
aarch64/fpu: Add vector variants of cosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:39 +00:00			`};`

			`static svfloat32_t NOINLINE`
AArch64: Improve codegen in SVE expf & related routines Reduce MOV and MOVPRFX by improving special-case handling. Use inline helper to duplicate the entire computation between the special- and non-special case branches, removing the contention for z0 between x and the return value. Also rearrange some MLAs and MLSs - by making the multiplicand the destination we can avoid a MOVPRFX in several cases. Also change which constants go in the vector used for lanewise ops - the last lane is no longer wasted. Spotted that shift was incorrect in exp2f and exp10f, w.r.t. to the comment that explains it. Fixed - worst-case ULP for exp2f moves around but it doesn't change significantly for either routine. Worst-case error for coshf increases due to passing x to exp rather than abs(x) - updated the comment, but does not require regen-ulps. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:26:12 +00:00			`special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e,`
			`svbool_t pg)`
aarch64/fpu: Add vector variants of cosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:39 +00:00			`{`
AArch64: Improve codegen in SVE expf & related routines Reduce MOV and MOVPRFX by improving special-case handling. Use inline helper to duplicate the entire computation between the special- and non-special case branches, removing the contention for z0 between x and the return value. Also rearrange some MLAs and MLSs - by making the multiplicand the destination we can avoid a MOVPRFX in several cases. Also change which constants go in the vector used for lanewise ops - the last lane is no longer wasted. Spotted that shift was incorrect in exp2f and exp10f, w.r.t. to the comment that explains it. Fixed - worst-case ULP for exp2f moves around but it doesn't change significantly for either routine. Worst-case error for coshf increases due to passing x to exp rather than abs(x) - updated the comment, but does not require regen-ulps. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:26:12 +00:00			`return sv_call_f32 (coshf, x, svadd_x (svptrue_b32 (), half_e, half_over_e),`
			`pg);`
aarch64/fpu: Add vector variants of cosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:39 +00:00			`}`

			`/* Single-precision vector cosh, using vector expf.`
AArch64: Improve codegen in SVE expf & related routines Reduce MOV and MOVPRFX by improving special-case handling. Use inline helper to duplicate the entire computation between the special- and non-special case branches, removing the contention for z0 between x and the return value. Also rearrange some MLAs and MLSs - by making the multiplicand the destination we can avoid a MOVPRFX in several cases. Also change which constants go in the vector used for lanewise ops - the last lane is no longer wasted. Spotted that shift was incorrect in exp2f and exp10f, w.r.t. to the comment that explains it. Fixed - worst-case ULP for exp2f moves around but it doesn't change significantly for either routine. Worst-case error for coshf increases due to passing x to exp rather than abs(x) - updated the comment, but does not require regen-ulps. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:26:12 +00:00			`Maximum error is 2.77 ULP:`
			`_ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2`
			`want 0x1.e4594cp+2. */`
aarch64/fpu: Add vector variants of cosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:39 +00:00			`svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)`
			`{`
			`const struct data *d = ptr_barrier (&data);`

AArch64: Improve codegen in SVE expf & related routines Reduce MOV and MOVPRFX by improving special-case handling. Use inline helper to duplicate the entire computation between the special- and non-special case branches, removing the contention for z0 between x and the return value. Also rearrange some MLAs and MLSs - by making the multiplicand the destination we can avoid a MOVPRFX in several cases. Also change which constants go in the vector used for lanewise ops - the last lane is no longer wasted. Spotted that shift was incorrect in exp2f and exp10f, w.r.t. to the comment that explains it. Fixed - worst-case ULP for exp2f moves around but it doesn't change significantly for either routine. Worst-case error for coshf increases due to passing x to exp rather than abs(x) - updated the comment, but does not require regen-ulps. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:26:12 +00:00			`svbool_t special = svacge (pg, x, d->special_bound);`
aarch64/fpu: Add vector variants of cosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:39 +00:00
AArch64: Improve codegen in SVE expf & related routines Reduce MOV and MOVPRFX by improving special-case handling. Use inline helper to duplicate the entire computation between the special- and non-special case branches, removing the contention for z0 between x and the return value. Also rearrange some MLAs and MLSs - by making the multiplicand the destination we can avoid a MOVPRFX in several cases. Also change which constants go in the vector used for lanewise ops - the last lane is no longer wasted. Spotted that shift was incorrect in exp2f and exp10f, w.r.t. to the comment that explains it. Fixed - worst-case ULP for exp2f moves around but it doesn't change significantly for either routine. Worst-case error for coshf increases due to passing x to exp rather than abs(x) - updated the comment, but does not require regen-ulps. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:26:12 +00:00			`/* Calculate cosh by exp(x) / 2 + exp(-x) / 2.`
			`Note that x is passed to exp here, rather than \|x\|. This is to avoid using`
			`destructive unary ABS for better register usage. However it means the`
			`routine is not exactly symmetrical, as the exp helper is slightly less`
			`accurate in the negative range. */`
			`svfloat32_t e = expf_inline (x, pg, &d->expf_consts);`
			`svfloat32_t half_e = svmul_x (svptrue_b32 (), e, 0.5);`
			`svfloat32_t half_over_e = svdivr_x (pg, e, 0.5);`
aarch64/fpu: Add vector variants of cosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:39 +00:00
			`if (__glibc_unlikely (svptest_any (pg, special)))`
AArch64: Improve codegen in SVE expf & related routines Reduce MOV and MOVPRFX by improving special-case handling. Use inline helper to duplicate the entire computation between the special- and non-special case branches, removing the contention for z0 between x and the return value. Also rearrange some MLAs and MLSs - by making the multiplicand the destination we can avoid a MOVPRFX in several cases. Also change which constants go in the vector used for lanewise ops - the last lane is no longer wasted. Spotted that shift was incorrect in exp2f and exp10f, w.r.t. to the comment that explains it. Fixed - worst-case ULP for exp2f moves around but it doesn't change significantly for either routine. Worst-case error for coshf increases due to passing x to exp rather than abs(x) - updated the comment, but does not require regen-ulps. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:26:12 +00:00			`return special_case (x, half_e, half_over_e, special);`
aarch64/fpu: Add vector variants of cosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:39 +00:00
AArch64: Improve codegen in SVE expf & related routines Reduce MOV and MOVPRFX by improving special-case handling. Use inline helper to duplicate the entire computation between the special- and non-special case branches, removing the contention for z0 between x and the return value. Also rearrange some MLAs and MLSs - by making the multiplicand the destination we can avoid a MOVPRFX in several cases. Also change which constants go in the vector used for lanewise ops - the last lane is no longer wasted. Spotted that shift was incorrect in exp2f and exp10f, w.r.t. to the comment that explains it. Fixed - worst-case ULP for exp2f moves around but it doesn't change significantly for either routine. Worst-case error for coshf increases due to passing x to exp rather than abs(x) - updated the comment, but does not require regen-ulps. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2024-09-23 14:26:12 +00:00			`return svadd_x (svptrue_b32 (), half_e, half_over_e);`
aarch64/fpu: Add vector variants of cosh Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2024-02-20 16:59:39 +00:00			`}`