aarch64/fpu: Add vector variants of erfc

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
Author: Joe Ramsay, 2024-02-20 16:59:45 +00:00, committed by Szabolcs Nagy
parent 3d3a4fb8e4
commit 87cb1dfcd6
17 changed files with 4897 additions and 1 deletion


@@ -8,6 +8,7 @@ libmvec-supported-funcs = acos \
cos \
cosh \
erf \
erfc \
exp \
exp10 \
exp2 \
@@ -39,7 +40,9 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
erff_data \
sv_erf_data \
sv_erff_data \
v_exp_tail_data
v_exp_tail_data \
erfc_data \
erfcf_data
endif
sve-cflags = -march=armv8-a+sve


@@ -104,6 +104,11 @@ libmvec {
_ZGVnN4v_erff;
_ZGVsMxv_erf;
_ZGVsMxv_erff;
_ZGVnN2v_erfc;
_ZGVnN2v_erfcf;
_ZGVnN4v_erfcf;
_ZGVsMxv_erfc;
_ZGVsMxv_erfcf;
_ZGVnN2v_sinh;
_ZGVnN2v_sinhf;
_ZGVnN4v_sinhf;


@@ -26,6 +26,7 @@ libmvec_hidden_proto (V_NAME_F1(atanh));
libmvec_hidden_proto (V_NAME_F1(cos));
libmvec_hidden_proto (V_NAME_F1(cosh));
libmvec_hidden_proto (V_NAME_F1(erf));
libmvec_hidden_proto (V_NAME_F1(erfc));
libmvec_hidden_proto (V_NAME_F1(exp10));
libmvec_hidden_proto (V_NAME_F1(exp2));
libmvec_hidden_proto (V_NAME_F1(exp));


@@ -69,6 +69,10 @@
# define __DECL_SIMD_erf __DECL_SIMD_aarch64
# undef __DECL_SIMD_erff
# define __DECL_SIMD_erff __DECL_SIMD_aarch64
# undef __DECL_SIMD_erfc
# define __DECL_SIMD_erfc __DECL_SIMD_aarch64
# undef __DECL_SIMD_erfcf
# define __DECL_SIMD_erfcf __DECL_SIMD_aarch64
# undef __DECL_SIMD_exp
# define __DECL_SIMD_exp __DECL_SIMD_aarch64
# undef __DECL_SIMD_expf
@@ -153,6 +157,7 @@ __vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
@@ -176,6 +181,7 @@ __vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t);
@@ -204,6 +210,7 @@ __sv_f32_t _ZGVsMxv_atanhf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_erff (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_erfcf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_exp2f (__sv_f32_t, __sv_bool_t);
@@ -227,6 +234,7 @@ __sv_f64_t _ZGVsMxv_atanh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_erf (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_erfc (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp2 (__sv_f64_t, __sv_bool_t);
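
The new entry points can also be called directly through these prototypes.  A
minimal usage sketch, not part of the patch: it assumes GCC or Clang support
for the aarch64_vector_pcs attribute (the calling convention behind __vpcs)
and linking against libmvec, typically with -lmvec -lm.

#include <arm_neon.h>
#include <stdio.h>

/* Same signatures as the declarations added above, with __vpcs spelt out.  */
__attribute__ ((aarch64_vector_pcs)) float64x2_t _ZGVnN2v_erfc (float64x2_t);
__attribute__ ((aarch64_vector_pcs)) float32x4_t _ZGVnN4v_erfcf (float32x4_t);

int
main (void)
{
  float64x2_t x = { 0.5, 2.0 };
  float64x2_t y = _ZGVnN2v_erfc (x); /* { erfc(0.5), erfc(2.0) }.  */
  printf ("%a %a\n", vgetq_lane_f64 (y, 0), vgetq_lane_f64 (y, 1));

  float32x4_t xf = { -1.0f, 0.0f, 1.0f, 9.0f };
  float32x4_t yf = _ZGVnN4v_erfcf (xf);
  printf ("%a %a %a %a\n", vgetq_lane_f32 (yf, 0), vgetq_lane_f32 (yf, 1),
          vgetq_lane_f32 (yf, 2), vgetq_lane_f32 (yf, 3));
  return 0;
}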


@@ -0,0 +1,201 @@
/* Double-precision vector (Advanced SIMD) erfc function
Copyright (C) 2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
#include "vecmath_config.h"
static const struct data
{
uint64x2_t offset, table_scale;
float64x2_t max, shift;
float64x2_t p20, p40, p41, p42;
float64x2_t p51, p52;
float64x2_t qr5, qr6, qr7, qr8, qr9;
#if WANT_SIMD_EXCEPT
float64x2_t uflow_bound;
#endif
} data = {
/* Set an offset so the range of the index used for lookup is 3487, and it
can be clamped using a saturated add on an offset index.
Index offset is 0xffffffffffffffff - asuint64(shift) - 3487. */
.offset = V2 (0xbd3ffffffffff260),
.table_scale = V2 (0x37f0000000000000 << 1), /* asuint64 (2^-128) << 1. */
.max = V2 (0x1.b3ep+4), /* 3487/128. */
.shift = V2 (0x1p45),
.p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */
.p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */
.p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */
.p42 = V2 (0x1.1111111111111p-3), /* 2/15. */
.p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */
.p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */
/* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
.qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 },
.qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 },
.qr7 = { 0x1.2492492492492p0, -0x1.8e38e38e38e39p-3 },
.qr8 = { 0x1.2p0, -0x1.6c16c16c16c17p-3 },
.qr9 = { 0x1.1c71c71c71c72p0, -0x1.4f2094f2094f2p-3 },
#if WANT_SIMD_EXCEPT
.uflow_bound = V2 (0x1.a8b12fc6e4892p+4),
#endif
};
#define TinyBound 0x4000000000000000 /* 0x1p-511 << 1. */
#define Off 0xfffffffffffff260 /* 0xffffffffffffffff - 3487. */
struct entry
{
float64x2_t erfc;
float64x2_t scale;
};
static inline struct entry
lookup (uint64x2_t i)
{
struct entry e;
float64x2_t e1 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[0])),
e2 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[1]));
e.erfc = vuzp1q_f64 (e1, e2);
e.scale = vuzp2q_f64 (e1, e2);
return e;
}
#if WANT_SIMD_EXCEPT
static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
{
return v_call_f64 (erfc, x, y, cmp);
}
#endif
/* Optimized double-precision vector erfc(x).
Approximation based on series expansion near x rounded to
nearest multiple of 1/128.
Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+ (2/15 r^4 - 2/5 r^2 + 1/10) d^4
- r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5
+ p6(r) d^6 + ... + p10(r) d^10
Polynomials p6(r) to p10(r) are computed using recurrence relation
2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0,
with p0 = 1, and p1(r) = -r.
Values of erfc(r) and scale are read from lookup tables. Stored values
are scaled to avoid hitting the subnormal range.
Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
Maximum measured error: 1.71 ULP
V_NAME_D1 (erfc)(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608
want 0x1.e15fcbea3e7adp-608. */
VPCS_ATTR
float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
{
const struct data *dat = ptr_barrier (&data);
#if WANT_SIMD_EXCEPT
/* |x| < 2^-511. Avoid fabs by left-shifting by 1. */
uint64x2_t ix = vreinterpretq_u64_f64 (x);
uint64x2_t cmp = vcltq_u64 (vaddq_u64 (ix, ix), v_u64 (TinyBound));
/* x >= ~26.54 (into subnormal case and uflow case). Comparison is done in
integer domain to avoid raising exceptions in presence of nans. */
uint64x2_t uflow = vcgeq_s64 (vreinterpretq_s64_f64 (x),
vreinterpretq_s64_f64 (dat->uflow_bound));
cmp = vorrq_u64 (cmp, uflow);
float64x2_t xm = x;
/* If any lanes are special, mask them with 0 and retain a copy of x to allow
special case handler to fix special lanes later. This is only necessary if
fenv exceptions are to be triggered correctly. */
if (__glibc_unlikely (v_any_u64 (cmp)))
x = v_zerofy_f64 (x, cmp);
#endif
float64x2_t a = vabsq_f64 (x);
a = vminq_f64 (a, dat->max);
/* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 1 and scale to
2/sqrt(pi), when x is reduced to r = 0. */
float64x2_t shift = dat->shift;
float64x2_t z = vaddq_f64 (a, shift);
/* Clamp index to a range of 3487. A naive approach would use a subtract and
min. Instead we offset the table address and the index, then use a
saturating add. */
uint64x2_t i = vqaddq_u64 (vreinterpretq_u64_f64 (z), dat->offset);
struct entry e = lookup (i);
/* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
float64x2_t r = vsubq_f64 (z, shift);
float64x2_t d = vsubq_f64 (a, r);
float64x2_t d2 = vmulq_f64 (d, d);
float64x2_t r2 = vmulq_f64 (r, r);
float64x2_t p1 = r;
float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20));
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20));
float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42);
p4 = vfmsq_f64 (dat->p40, r2, p4);
float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52);
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
/* Compute p_i using recurrence relation:
p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */
float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, dat->qr5, 0));
p6 = vmulq_laneq_f64 (p6, dat->qr5, 1);
float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, dat->qr6, 0));
p7 = vmulq_laneq_f64 (p7, dat->qr6, 1);
float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, dat->qr7, 0));
p8 = vmulq_laneq_f64 (p8, dat->qr7, 1);
float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, dat->qr8, 0));
p9 = vmulq_laneq_f64 (p9, dat->qr8, 1);
float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, dat->qr9, 0));
p10 = vmulq_laneq_f64 (p10, dat->qr9, 1);
/* Compute polynomial in d using pairwise Horner scheme. */
float64x2_t p90 = vfmaq_f64 (p9, d, p10);
float64x2_t p78 = vfmaq_f64 (p7, d, p8);
float64x2_t p56 = vfmaq_f64 (p5, d, p6);
float64x2_t p34 = vfmaq_f64 (p3, d, p4);
float64x2_t p12 = vfmaq_f64 (p1, d, p2);
float64x2_t y = vfmaq_f64 (p78, d2, p90);
y = vfmaq_f64 (p56, d2, y);
y = vfmaq_f64 (p34, d2, y);
y = vfmaq_f64 (p12, d2, y);
y = vfmsq_f64 (e.erfc, e.scale, vfmsq_f64 (d, d2, y));
/* Offset equals 2.0 if sign, else 0.0. */
uint64x2_t sign = vshrq_n_u64 (vreinterpretq_u64_f64 (x), 63);
float64x2_t off = vreinterpretq_f64_u64 (vshlq_n_u64 (sign, 62));
/* Copy sign and scale back in a single fma. Since the bit patterns do not
overlap, logical or and addition are equivalent here. */
float64x2_t fac = vreinterpretq_f64_u64 (
vsraq_n_u64 (vshlq_n_u64 (sign, 63), dat->table_scale, 1));
#if WANT_SIMD_EXCEPT
if (__glibc_unlikely (v_any_u64 (cmp)))
return special_case (xm, vfmaq_f64 (off, fac, y), cmp);
#endif
return vfmaq_f64 (off, fac, y);
}
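
The expansion in the function comment can be modelled in scalar code.  A rough
sketch, illustration only and not part of the patch: erfc(r) and the scale
factor come from libm rather than from the lookup table, and poly is truncated
at d^5, so the accuracy is not representative of the vector routine.

#include <math.h>
#include <stdio.h>

static double
erfc_model (double x)
{
  const double two_over_sqrt_pi = 0x1.20dd750429b6dp+0; /* 2/sqrt(pi).  */
  double r = round (x * 128.0) / 128.0;
  double d = x - r;
  double r2 = r * r;
  double scale = two_over_sqrt_pi * exp (-r2);
  /* poly (r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
                   + (2/15 r^4 - 2/5 r^2 + 1/10) d^4
                   - r (2/45 r^4 - 2/9 r^2 + 1/6) d^5.  */
  double p1 = -r;
  double p2 = 2.0 / 3 * r2 - 1.0 / 3;
  double p3 = -r * (1.0 / 3 * r2 - 0.5);
  double p4 = 2.0 / 15 * r2 * r2 - 2.0 / 5 * r2 + 1.0 / 10;
  double p5 = -r * (2.0 / 45 * r2 * r2 - 2.0 / 9 * r2 + 1.0 / 6);
  double poly = 1 + d * (p1 + d * (p2 + d * (p3 + d * (p4 + d * p5))));
  return erfc (r) - scale * d * poly;
}

int
main (void)
{
  for (double x = 0.25; x < 4.0; x += 0.625)
    printf ("x=%-6g model=%.17g libm=%.17g\n", x, erfc_model (x), erfc (x));
  return 0;
}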

File diff suppressed because it is too large.


@@ -0,0 +1,167 @@
/* Double-precision vector (SVE) erfc function
Copyright (C) 2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
#include "vecmath_config.h"
static const struct data
{
uint64_t off_idx, off_arr;
double max, shift;
double p20, p40, p41, p42;
double p51, p52;
double q5, r5;
double q6, r6;
double q7, r7;
double q8, r8;
double q9, r9;
uint64_t table_scale;
} data = {
/* Set an offset so the range of the index used for lookup is 3487, and it
can be clamped using a saturated add on an offset index.
Index offset is 0xffffffffffffffff - asuint64(shift) - 3487. */
.off_idx = 0xbd3ffffffffff260,
.off_arr = 0xfffffffffffff260, /* 0xffffffffffffffff - 3487. */
.max = 0x1.b3ep+4, /* 3487/128. */
.shift = 0x1p45,
.table_scale = 0x37f0000000000000, /* asuint64(0x1p-128). */
.p20 = 0x1.5555555555555p-2, /* 1/3, used to compute 2/3 and 1/6. */
.p40 = -0x1.999999999999ap-4, /* 1/10. */
.p41 = -0x1.999999999999ap-2, /* 2/5. */
.p42 = 0x1.1111111111111p-3, /* 2/15. */
.p51 = -0x1.c71c71c71c71cp-3, /* 2/9. */
.p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */
/* Qi = (i+1) / i, for i = 5, ..., 9. */
.q5 = 0x1.3333333333333p0,
.q6 = 0x1.2aaaaaaaaaaabp0,
.q7 = 0x1.2492492492492p0,
.q8 = 0x1.2p0,
.q9 = 0x1.1c71c71c71c72p0,
/* Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
.r5 = -0x1.e79e79e79e79ep-3,
.r6 = -0x1.b6db6db6db6dbp-3,
.r7 = -0x1.8e38e38e38e39p-3,
.r8 = -0x1.6c16c16c16c17p-3,
.r9 = -0x1.4f2094f2094f2p-3,
};
/* Optimized double-precision vector erfc(x).
Approximation based on series expansion near x rounded to
nearest multiple of 1/128.
Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+ (2/15 r^4 - 2/5 r^2 + 1/10) d^4
- r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5
+ p6(r) d^6 + ... + p10(r) d^10
Polynomials p6(r) to p10(r) are computed using recurrence relation
2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0,
with p0 = 1, and p1(r) = -r.
Values of erfc(r) and scale are read from lookup tables. Stored values
are scaled to avoid hitting the subnormal range.
Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
Maximum measured error: 1.71 ULP
_ZGVsMxv_erfc(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608
want 0x1.e15fcbea3e7adp-608. */
svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg)
{
const struct data *dat = ptr_barrier (&data);
svfloat64_t a = svabs_x (pg, x);
/* Clamp input at |x| <= 3487/128. */
a = svmin_x (pg, a, dat->max);
/* Reduce x to the nearest multiple of 1/128. */
svfloat64_t shift = sv_f64 (dat->shift);
svfloat64_t z = svadd_x (pg, a, shift);
/* Saturate index for the NaN case. */
svuint64_t i = svqadd (svreinterpret_u64 (z), dat->off_idx);
/* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
i = svadd_x (pg, i, i);
const float64_t *p = &__erfc_data.tab[0].erfc - 2 * dat->off_arr;
svfloat64_t erfcr = svld1_gather_index (pg, p, i);
svfloat64_t scale = svld1_gather_index (pg, p + 1, i);
/* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
svfloat64_t r = svsub_x (pg, z, shift);
svfloat64_t d = svsub_x (pg, a, r);
svfloat64_t d2 = svmul_x (pg, d, d);
svfloat64_t r2 = svmul_x (pg, r, r);
/* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p9(r) * d^9. */
svfloat64_t p1 = r;
svfloat64_t third = sv_f64 (dat->p20);
svfloat64_t twothird = svmul_x (pg, third, 2.0);
svfloat64_t sixth = svmul_x (pg, third, 0.5);
svfloat64_t p2 = svmls_x (pg, third, r2, twothird);
svfloat64_t p3 = svmad_x (pg, r2, third, -0.5);
p3 = svmul_x (pg, r, p3);
svfloat64_t p4 = svmla_x (pg, sv_f64 (dat->p41), r2, dat->p42);
p4 = svmls_x (pg, sv_f64 (dat->p40), r2, p4);
svfloat64_t p5 = svmla_x (pg, sv_f64 (dat->p51), r2, dat->p52);
p5 = svmla_x (pg, sixth, r2, p5);
p5 = svmul_x (pg, r, p5);
/* Compute p_i using recurrence relation:
p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */
svfloat64_t qr5 = svld1rq (svptrue_b64 (), &dat->q5);
svfloat64_t qr6 = svld1rq (svptrue_b64 (), &dat->q6);
svfloat64_t qr7 = svld1rq (svptrue_b64 (), &dat->q7);
svfloat64_t qr8 = svld1rq (svptrue_b64 (), &dat->q8);
svfloat64_t qr9 = svld1rq (svptrue_b64 (), &dat->q9);
svfloat64_t p6 = svmla_x (pg, p4, p5, svmul_lane (r, qr5, 0));
p6 = svmul_lane (p6, qr5, 1);
svfloat64_t p7 = svmla_x (pg, p5, p6, svmul_lane (r, qr6, 0));
p7 = svmul_lane (p7, qr6, 1);
svfloat64_t p8 = svmla_x (pg, p6, p7, svmul_lane (r, qr7, 0));
p8 = svmul_lane (p8, qr7, 1);
svfloat64_t p9 = svmla_x (pg, p7, p8, svmul_lane (r, qr8, 0));
p9 = svmul_lane (p9, qr8, 1);
svfloat64_t p10 = svmla_x (pg, p8, p9, svmul_lane (r, qr9, 0));
p10 = svmul_lane (p10, qr9, 1);
/* Compute polynomial in d using pairwise Horner scheme. */
svfloat64_t p90 = svmla_x (pg, p9, d, p10);
svfloat64_t p78 = svmla_x (pg, p7, d, p8);
svfloat64_t p56 = svmla_x (pg, p5, d, p6);
svfloat64_t p34 = svmla_x (pg, p3, d, p4);
svfloat64_t p12 = svmla_x (pg, p1, d, p2);
svfloat64_t y = svmla_x (pg, p78, d2, p90);
y = svmla_x (pg, p56, d2, y);
y = svmla_x (pg, p34, d2, y);
y = svmla_x (pg, p12, d2, y);
y = svmls_x (pg, erfcr, scale, svmls_x (pg, d, d2, y));
/* Offset equals 2.0 if sign, else 0.0. */
svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000);
svfloat64_t off = svreinterpret_f64 (svlsr_x (pg, sign, 1));
/* Handle sign and scale back in a single fma. */
svfloat64_t fac = svreinterpret_f64 (svorr_x (pg, sign, dat->table_scale));
return svmla_x (pg, off, fac, y);
}
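
The Qi and Ri constants above follow directly from the recurrence in the
function comment, with Qi = (i+1)/i and Ri = -2i/((i+1)(i+2)).  A small
standalone check, not part of the patch, reproduces the hex literals in the
data struct:

#include <stdio.h>

int
main (void)
{
  for (int i = 5; i <= 9; i++)
    {
      double q = (i + 1.0) / i;                        /* Matches .q5 ... .q9.  */
      double r = -2.0 * i / ((i + 1.0) * (i + 2.0));   /* Matches .r5 ... .r9.  */
      printf ("i=%d Q=%a R=%a\n", i, q, r);
    }
  return 0;
}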


@@ -0,0 +1,170 @@
/* Single-precision vector (Advanced SIMD) erfc function
Copyright (C) 2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
static const struct data
{
uint32x4_t offset, table_scale;
float32x4_t max, shift;
float32x4_t coeffs, third, two_over_five, tenth;
#if WANT_SIMD_EXCEPT
float32x4_t uflow_bound;
#endif
} data = {
/* Set an offset so the range of the index used for lookup is 644, and it can
be clamped using a saturated add. */
.offset = V4 (0xb7fffd7b), /* 0xffffffff - asuint(shift) - 644. */
.table_scale = V4 (0x28000000 << 1), /* asuint (2^-47) << 1. */
.max = V4 (10.0625f), /* 10 + 1/16 = 644/64. */
.shift = V4 (0x1p17f),
/* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and
fmas. */
.coeffs = (float32x4_t){ 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
.third = V4 (0x1.555556p-2f),
.two_over_five = V4 (-0x1.99999ap-2f),
.tenth = V4 (-0x1.99999ap-4f),
#if WANT_SIMD_EXCEPT
.uflow_bound = V4 (0x1.2639cp+3f),
#endif
};
#define TinyBound 0x41000000 /* 0x1p-62f << 1. */
#define Thres 0xbe000000 /* asuint(infinity) << 1 - TinyBound. */
#define Off 0xfffffd7b /* 0xffffffff - 644. */
struct entry
{
float32x4_t erfc;
float32x4_t scale;
};
static inline struct entry
lookup (uint32x4_t i)
{
struct entry e;
float64_t t0 = *((float64_t *) (__erfcf_data.tab - Off + i[0]));
float64_t t1 = *((float64_t *) (__erfcf_data.tab - Off + i[1]));
float64_t t2 = *((float64_t *) (__erfcf_data.tab - Off + i[2]));
float64_t t3 = *((float64_t *) (__erfcf_data.tab - Off + i[3]));
float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
e.erfc = vuzp1q_f32 (e1, e2);
e.scale = vuzp2q_f32 (e1, e2);
return e;
}
#if WANT_SIMD_EXCEPT
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
{
return v_call_f32 (erfcf, x, y, cmp);
}
#endif
/* Optimized single-precision vector erfcf(x).
Approximation based on series expansion near x rounded to
nearest multiple of 1/64.
Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+ (2/15 r^4 - 2/5 r^2 + 1/10) d^4
Values of erfc(r) and scale are read from lookup tables. Stored values
are scaled to avoid hitting the subnormal range.
Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0).
_ZGVnN4v_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120
want 0x1.f51216p-120. */
VPCS_ATTR
float32x4_t NOINLINE V_NAME_F1 (erfc) (float32x4_t x)
{
const struct data *dat = ptr_barrier (&data);
#if WANT_SIMD_EXCEPT
/* |x| < 2^-62. Avoid fabs by left-shifting by 1. */
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t cmp = vcltq_u32 (vaddq_u32 (ix, ix), v_u32 (TinyBound));
/* x >= ~9.19 (into subnormal case and uflow case). Comparison is done in
integer domain to avoid raising exceptions in presence of nans. */
uint32x4_t uflow = vcgeq_s32 (vreinterpretq_s32_f32 (x),
vreinterpretq_s32_f32 (dat->uflow_bound));
cmp = vorrq_u32 (cmp, uflow);
float32x4_t xm = x;
/* If any lanes are special, mask them with 0 and retain a copy of x to allow
special case handler to fix special lanes later. This is only necessary if
fenv exceptions are to be triggered correctly. */
if (__glibc_unlikely (v_any_u32 (cmp)))
x = v_zerofy_f32 (x, cmp);
#endif
float32x4_t a = vabsq_f32 (x);
a = vminq_f32 (a, dat->max);
/* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 1 and scale to
2/sqrt(pi), when x is reduced to r = 0. */
float32x4_t shift = dat->shift;
float32x4_t z = vaddq_f32 (a, shift);
/* Clamp index to a range of 644. A naive approach would use a subtract and
min. Instead we offset the table address and the index, then use a
saturating add. */
uint32x4_t i = vqaddq_u32 (vreinterpretq_u32_f32 (z), dat->offset);
struct entry e = lookup (i);
/* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
float32x4_t r = vsubq_f32 (z, shift);
float32x4_t d = vsubq_f32 (a, r);
float32x4_t d2 = vmulq_f32 (d, d);
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t p1 = r;
float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, dat->coeffs, 1);
float32x4_t p3
= vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, dat->coeffs, 0));
float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, dat->coeffs, 2);
p4 = vfmsq_f32 (dat->tenth, r2, p4);
float32x4_t y = vfmaq_f32 (p3, d, p4);
y = vfmaq_f32 (p2, d, y);
y = vfmaq_f32 (p1, d, y);
y = vfmsq_f32 (e.erfc, e.scale, vfmsq_f32 (d, d2, y));
/* Offset equals 2.0f if sign, else 0.0f. */
uint32x4_t sign = vshrq_n_u32 (vreinterpretq_u32_f32 (x), 31);
float32x4_t off = vreinterpretq_f32_u32 (vshlq_n_u32 (sign, 30));
/* Copy sign and scale back in a single fma. Since the bit patterns do not
overlap, logical or and addition are equivalent here. */
float32x4_t fac = vreinterpretq_f32_u32 (
vsraq_n_u32 (vshlq_n_u32 (sign, 31), dat->table_scale, 1));
#if WANT_SIMD_EXCEPT
if (__glibc_unlikely (v_any_u32 (cmp)))
return special_case (xm, vfmaq_f32 (off, fac, y), cmp);
#endif
return vfmaq_f32 (off, fac, y);
}
libmvec_hidden_def (V_NAME_F1 (erfc))
HALF_WIDTH_ALIAS_F1 (erfc)
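
The last few statements of both erfcf kernels (and, with 2^-128 in place of
2^-47, of the erfc kernels) implement erfc(x) = off + fac * y, where
y ~ 2^47 * erfc(|x|), off = 2 and fac = -2^-47 for negative x (using
erfc(-x) = 2 - erfc(x)), and off = 0, fac = 2^-47 otherwise.  A scalar sketch,
with a hypothetical finish () helper standing in for the vector code:

#include <math.h>
#include <stdio.h>

static float
finish (float x, float y_scaled) /* y_scaled ~ 2^47 * erfc (|x|).  */
{
  float off = signbit (x) ? 2.0f : 0.0f;
  float fac = signbit (x) ? -0x1p-47f : 0x1p-47f;
  return fmaf (fac, y_scaled, off);
}

int
main (void)
{
  float x = -0.75f;
  float y_scaled = erfcf (fabsf (x)) * 0x1p47f; /* Stand-in for the kernel.  */
  printf ("%a vs %a\n", finish (x, y_scaled), erfcf (x));
  return 0;
}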


@@ -0,0 +1,676 @@
/* Table for Advanced SIMD erfcf
Copyright (C) 2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include "vecmath_config.h"
/* Lookup table used in erfcf.
For each possible rounded input r (multiples of 1/64), between
r = 0.0 and r = 10.0625 (645 values):
- the first entry __erfcf_data.tab.erfc contains the values of erfc(r),
- the second entry __erfcf_data.tab.scale contains the values of
2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore
they are scaled by a large enough value 2^47 (fits in 8 bits). */
const struct erfcf_data __erfcf_data = {
.tab = { { 0x1p47, 0x1.20dd76p47 },
{ 0x1.f6f944p46, 0x1.20cb68p47 },
{ 0x1.edf3aap46, 0x1.209546p47 },
{ 0x1.e4f05p46, 0x1.203b26p47 },
{ 0x1.dbf056p46, 0x1.1fbd28p47 },
{ 0x1.d2f4dcp46, 0x1.1f1b7ap47 },
{ 0x1.c9fefep46, 0x1.1e565cp47 },
{ 0x1.c10fd4p46, 0x1.1d6e14p47 },
{ 0x1.b8287ap46, 0x1.1c62fap47 },
{ 0x1.af4ap46, 0x1.1b3572p47 },
{ 0x1.a6757ep46, 0x1.19e5eap47 },
{ 0x1.9dabfcp46, 0x1.1874dep47 },
{ 0x1.94ee88p46, 0x1.16e2d8p47 },
{ 0x1.8c3e24p46, 0x1.153068p47 },
{ 0x1.839bd6p46, 0x1.135e3p47 },
{ 0x1.7b0894p46, 0x1.116cd8p47 },
{ 0x1.728558p46, 0x1.0f5d16p47 },
{ 0x1.6a1312p46, 0x1.0d2fa6p47 },
{ 0x1.61b2acp46, 0x1.0ae55p47 },
{ 0x1.596508p46, 0x1.087ee4p47 },
{ 0x1.512b06p46, 0x1.05fd3ep47 },
{ 0x1.49057ap46, 0x1.03614p47 },
{ 0x1.40f536p46, 0x1.00abdp47 },
{ 0x1.38fbp46, 0x1.fbbbbep46 },
{ 0x1.311796p46, 0x1.f5f0cep46 },
{ 0x1.294bb4p46, 0x1.eff8c4p46 },
{ 0x1.21980ap46, 0x1.e9d5a8p46 },
{ 0x1.19fd3ep46, 0x1.e38988p46 },
{ 0x1.127bf2p46, 0x1.dd167cp46 },
{ 0x1.0b14bcp46, 0x1.d67ea2p46 },
{ 0x1.03c82ap46, 0x1.cfc41ep46 },
{ 0x1.f92d8cp45, 0x1.c8e91cp46 },
{ 0x1.eb0214p45, 0x1.c1efcap46 },
{ 0x1.dd0edap45, 0x1.bada5ap46 },
{ 0x1.cf54b4p45, 0x1.b3aafcp46 },
{ 0x1.c1d46ap45, 0x1.ac63e8p46 },
{ 0x1.b48eaep45, 0x1.a5074ep46 },
{ 0x1.a78428p45, 0x1.9d9762p46 },
{ 0x1.9ab566p45, 0x1.96165p46 },
{ 0x1.8e22eap45, 0x1.8e8646p46 },
{ 0x1.81cd24p45, 0x1.86e96ap46 },
{ 0x1.75b47p45, 0x1.7f41dcp46 },
{ 0x1.69d91ep45, 0x1.7791b8p46 },
{ 0x1.5e3b66p45, 0x1.6fdb12p46 },
{ 0x1.52db78p45, 0x1.681ff2p46 },
{ 0x1.47b96ep45, 0x1.60625cp46 },
{ 0x1.3cd554p45, 0x1.58a446p46 },
{ 0x1.322f26p45, 0x1.50e79ep46 },
{ 0x1.27c6d2p45, 0x1.492e42p46 },
{ 0x1.1d9c34p45, 0x1.417a0cp46 },
{ 0x1.13af1ep45, 0x1.39ccc2p46 },
{ 0x1.09ff5p45, 0x1.32281ep46 },
{ 0x1.008c8p45, 0x1.2a8dcep46 },
{ 0x1.eeaca8p44, 0x1.22ff72p46 },
{ 0x1.dcb8cap44, 0x1.1b7e98p46 },
{ 0x1.cb3c86p44, 0x1.140cc4p46 },
{ 0x1.ba36dap44, 0x1.0cab62p46 },
{ 0x1.a9a6bap44, 0x1.055bd6p46 },
{ 0x1.998afap44, 0x1.fc3ee6p45 },
{ 0x1.89e25ep44, 0x1.edeeeep45 },
{ 0x1.7aab98p44, 0x1.dfca26p45 },
{ 0x1.6be542p44, 0x1.d1d2dp45 },
{ 0x1.5d8decp44, 0x1.c40b08p45 },
{ 0x1.4fa40ep44, 0x1.b674c8p45 },
{ 0x1.422616p44, 0x1.a911fp45 },
{ 0x1.351262p44, 0x1.9be438p45 },
{ 0x1.28674p44, 0x1.8eed36p45 },
{ 0x1.1c22f8p44, 0x1.822e66p45 },
{ 0x1.1043c2p44, 0x1.75a91ap45 },
{ 0x1.04c7cap44, 0x1.695e8cp45 },
{ 0x1.f35a72p43, 0x1.5d4fd4p45 },
{ 0x1.dde456p43, 0x1.517de6p45 },
{ 0x1.c9296cp43, 0x1.45e99cp45 },
{ 0x1.b525d6p43, 0x1.3a93b2p45 },
{ 0x1.a1d5a6p43, 0x1.2f7cc4p45 },
{ 0x1.8f34eap43, 0x1.24a554p45 },
{ 0x1.7d3fa6p43, 0x1.1a0dc6p45 },
{ 0x1.6bf1dcp43, 0x1.0fb662p45 },
{ 0x1.5b4784p43, 0x1.059f5ap45 },
{ 0x1.4b3c98p43, 0x1.f79184p44 },
{ 0x1.3bcd14p43, 0x1.e4653p44 },
{ 0x1.2cf4eep43, 0x1.d1b982p44 },
{ 0x1.1eb024p43, 0x1.bf8e1cp44 },
{ 0x1.10fab8p43, 0x1.ade26cp44 },
{ 0x1.03d0acp43, 0x1.9cb5bep44 },
{ 0x1.ee5c18p42, 0x1.8c0732p44 },
{ 0x1.d61dd6p42, 0x1.7bd5c8p44 },
{ 0x1.bedec8p42, 0x1.6c2056p44 },
{ 0x1.a8973cp42, 0x1.5ce596p44 },
{ 0x1.933f9p42, 0x1.4e241ep44 },
{ 0x1.7ed03ap42, 0x1.3fda6cp44 },
{ 0x1.6b41ccp42, 0x1.3206dcp44 },
{ 0x1.588cf2p42, 0x1.24a7b8p44 },
{ 0x1.46aa72p42, 0x1.17bb2cp44 },
{ 0x1.359332p42, 0x1.0b3f52p44 },
{ 0x1.254038p42, 0x1.fe646p43 },
{ 0x1.15aaa8p42, 0x1.e72372p43 },
{ 0x1.06cbcap42, 0x1.d0b7ap43 },
{ 0x1.f13a04p41, 0x1.bb1c98p43 },
{ 0x1.d62fbep41, 0x1.a64de6p43 },
{ 0x1.bc6c1ep41, 0x1.92470ap43 },
{ 0x1.a3e2ccp41, 0x1.7f036cp43 },
{ 0x1.8c87b8p41, 0x1.6c7e64p43 },
{ 0x1.764f2p41, 0x1.5ab342p43 },
{ 0x1.612d8ap41, 0x1.499d48p43 },
{ 0x1.4d17cap41, 0x1.3937b2p43 },
{ 0x1.3a03p41, 0x1.297dbap43 },
{ 0x1.27e498p41, 0x1.1a6a96p43 },
{ 0x1.16b24cp41, 0x1.0bf97ep43 },
{ 0x1.066222p41, 0x1.fc4b5ep42 },
{ 0x1.edd4d2p40, 0x1.e1d4dp42 },
{ 0x1.d08382p40, 0x1.c885ep42 },
{ 0x1.b4be2p40, 0x1.b0553p42 },
{ 0x1.9a7316p40, 0x1.99397ap42 },
{ 0x1.81915cp40, 0x1.83298ep42 },
{ 0x1.6a088p40, 0x1.6e1c58p42 },
{ 0x1.53c89ep40, 0x1.5a08e8p42 },
{ 0x1.3ec25ep40, 0x1.46e66cp42 },
{ 0x1.2ae6fap40, 0x1.34ac36p42 },
{ 0x1.18282ep40, 0x1.2351c2p42 },
{ 0x1.067844p40, 0x1.12ceb4p42 },
{ 0x1.eb940ep39, 0x1.031ad6p42 },
{ 0x1.cc2186p39, 0x1.e85c44p41 },
{ 0x1.ae808cp39, 0x1.cc018p41 },
{ 0x1.9299bp39, 0x1.b1160ap41 },
{ 0x1.785674p39, 0x1.978ae8p41 },
{ 0x1.5fa14ap39, 0x1.7f5188p41 },
{ 0x1.486586p39, 0x1.685bb6p41 },
{ 0x1.328f5ep39, 0x1.529b9ep41 },
{ 0x1.1e0be6p39, 0x1.3e03d8p41 },
{ 0x1.0ac8fcp39, 0x1.2a875cp41 },
{ 0x1.f16aaep38, 0x1.181984p41 },
{ 0x1.cf80d4p38, 0x1.06ae14p41 },
{ 0x1.afb4e2p38, 0x1.ec7262p40 },
{ 0x1.91e8bep38, 0x1.cd5ecap40 },
{ 0x1.75ffb4p38, 0x1.b00b38p40 },
{ 0x1.5bde72p38, 0x1.94624ep40 },
{ 0x1.436af4p38, 0x1.7a4f6ap40 },
{ 0x1.2c8c7ap38, 0x1.61beaep40 },
{ 0x1.172b7ap38, 0x1.4a9cf6p40 },
{ 0x1.033198p38, 0x1.34d7dcp40 },
{ 0x1.e11332p37, 0x1.205dacp40 },
{ 0x1.be3ebp37, 0x1.0d1d6ap40 },
{ 0x1.9dbf72p37, 0x1.f60d8ap39 },
{ 0x1.7f714p37, 0x1.d4143ap39 },
{ 0x1.6331cap37, 0x1.b430ecp39 },
{ 0x1.48e09cp37, 0x1.9646f4p39 },
{ 0x1.305ef8p37, 0x1.7a3adep39 },
{ 0x1.198fd6p37, 0x1.5ff276p39 },
{ 0x1.0457c6p37, 0x1.4754acp39 },
{ 0x1.e139bcp36, 0x1.30499cp39 },
{ 0x1.bc8d52p36, 0x1.1aba78p39 },
{ 0x1.9a7c3p36, 0x1.06918cp39 },
{ 0x1.7adadep36, 0x1.e77448p38 },
{ 0x1.5d806ap36, 0x1.c4412cp38 },
{ 0x1.424642p36, 0x1.a36454p38 },
{ 0x1.290826p36, 0x1.84ba3p38 },
{ 0x1.11a3f8p36, 0x1.6821p38 },
{ 0x1.f7f358p35, 0x1.4d78bcp38 },
{ 0x1.cfd652p35, 0x1.34a306p38 },
{ 0x1.aab85ap35, 0x1.1d8318p38 },
{ 0x1.88647p35, 0x1.07fdb4p38 },
{ 0x1.68a8e4p35, 0x1.e7f232p37 },
{ 0x1.4b5726p35, 0x1.c2b9dp37 },
{ 0x1.30439cp35, 0x1.a02436p37 },
{ 0x1.174578p35, 0x1.8005fp37 },
{ 0x1.003692p35, 0x1.6235fcp37 },
{ 0x1.d5e678p34, 0x1.468daep37 },
{ 0x1.aeb442p34, 0x1.2ce898p37 },
{ 0x1.8a9848p34, 0x1.15246ep37 },
{ 0x1.695876p34, 0x1.fe41cep36 },
{ 0x1.4abea2p34, 0x1.d57f52p36 },
{ 0x1.2e984ep34, 0x1.afc85ep36 },
{ 0x1.14b676p34, 0x1.8ce75ep36 },
{ 0x1.f9daap33, 0x1.6caa0ep36 },
{ 0x1.ce283ap33, 0x1.4ee142p36 },
{ 0x1.a609f8p33, 0x1.3360ccp36 },
{ 0x1.81396ap33, 0x1.19ff46p36 },
{ 0x1.5f7524p33, 0x1.0295fp36 },
{ 0x1.40806ep33, 0x1.da011p35 },
{ 0x1.2422eep33, 0x1.b23a5ap35 },
{ 0x1.0a286p33, 0x1.8d986ap35 },
{ 0x1.e4c0bp32, 0x1.6be022p35 },
{ 0x1.b93bf4p32, 0x1.4cda54p35 },
{ 0x1.916f7cp32, 0x1.30539p35 },
{ 0x1.6d0e7p32, 0x1.161be4p35 },
{ 0x1.4bd1cp32, 0x1.fc0d56p34 },
{ 0x1.2d77bep32, 0x1.cfd4a6p34 },
{ 0x1.11c3bep32, 0x1.a74068p34 },
{ 0x1.f0fb86p31, 0x1.8208bcp34 },
{ 0x1.c2e43ep31, 0x1.5feadap34 },
{ 0x1.98e254p31, 0x1.40a8c2p34 },
{ 0x1.729df6p31, 0x1.2408eap34 },
{ 0x1.4fc63cp31, 0x1.09d5f8p34 },
{ 0x1.3010aap31, 0x1.e3bcf4p33 },
{ 0x1.1338b8p31, 0x1.b7e946p33 },
{ 0x1.f1fecp30, 0x1.8fdc1cp33 },
{ 0x1.c2556ap30, 0x1.6b4702p33 },
{ 0x1.970b06p30, 0x1.49e178p33 },
{ 0x1.6fbddep30, 0x1.2b6876p33 },
{ 0x1.4c144ep30, 0x1.0f9e1cp33 },
{ 0x1.2bbc1ep30, 0x1.ec929ap32 },
{ 0x1.0e69f2p30, 0x1.be6abcp32 },
{ 0x1.e7b188p29, 0x1.94637ep32 },
{ 0x1.b792bcp29, 0x1.6e2368p32 },
{ 0x1.8c03d2p29, 0x1.4b581cp32 },
{ 0x1.649b02p29, 0x1.2bb5ccp32 },
{ 0x1.40f794p29, 0x1.0ef6c4p32 },
{ 0x1.20c13p29, 0x1.e9b5e8p31 },
{ 0x1.03a72ap29, 0x1.ba4f04p31 },
{ 0x1.d2bfc6p28, 0x1.8f4cccp31 },
{ 0x1.a35068p28, 0x1.684c22p31 },
{ 0x1.7885cep28, 0x1.44f21ep31 },
{ 0x1.51f06ap28, 0x1.24eb72p31 },
{ 0x1.2f2aaap28, 0x1.07ebd2p31 },
{ 0x1.0fd816p28, 0x1.db5adp30 },
{ 0x1.e7493p27, 0x1.abe09ep30 },
{ 0x1.b48774p27, 0x1.80f43ap30 },
{ 0x1.86e006p27, 0x1.5a2aep30 },
{ 0x1.5dd4bp27, 0x1.37231p30 },
{ 0x1.38f2e8p27, 0x1.1783cep30 },
{ 0x1.17d2c6p27, 0x1.f5f7d8p29 },
{ 0x1.f42c18p26, 0x1.c282cep29 },
{ 0x1.beceb2p26, 0x1.94219cp29 },
{ 0x1.8ef2aap26, 0x1.6a5972p29 },
{ 0x1.640bf6p26, 0x1.44ba86p29 },
{ 0x1.3d9be6p26, 0x1.22df2ap29 },
{ 0x1.1b2fe4p26, 0x1.046aeap29 },
{ 0x1.f8c0c2p25, 0x1.d21398p28 },
{ 0x1.c19fa8p25, 0x1.a0df1p28 },
{ 0x1.90538cp25, 0x1.74adc8p28 },
{ 0x1.6443fep25, 0x1.4d0232p28 },
{ 0x1.3ce784p25, 0x1.296a7p28 },
{ 0x1.19c232p25, 0x1.097f62p28 },
{ 0x1.f4c8c4p24, 0x1.d9c736p27 },
{ 0x1.bcd30ep24, 0x1.a6852cp27 },
{ 0x1.8aee4cp24, 0x1.789fb8p27 },
{ 0x1.5e77b6p24, 0x1.4f8c96p27 },
{ 0x1.36dcf2p24, 0x1.2acee2p27 },
{ 0x1.139a7cp24, 0x1.09f5dp27 },
{ 0x1.e8747p23, 0x1.d9371ep26 },
{ 0x1.b0a44ap23, 0x1.a4c89ep26 },
{ 0x1.7f064ap23, 0x1.75fa8ep26 },
{ 0x1.52efep23, 0x1.4c37cp26 },
{ 0x1.2bc82ap23, 0x1.26f9ep26 },
{ 0x1.09064p23, 0x1.05c804p26 },
{ 0x1.d45f16p22, 0x1.d06ad6p25 },
{ 0x1.9dacb2p22, 0x1.9bc0ap25 },
{ 0x1.6d3126p22, 0x1.6ce1aap25 },
{ 0x1.423d14p22, 0x1.43302cp25 },
{ 0x1.1c33cep22, 0x1.1e1e86p25 },
{ 0x1.f512dep21, 0x1.fa5b5p24 },
{ 0x1.b9823cp21, 0x1.bfd756p24 },
{ 0x1.84d6fep21, 0x1.8be4f8p24 },
{ 0x1.564a92p21, 0x1.5dcd66p24 },
{ 0x1.2d2c0ap21, 0x1.34ecf8p24 },
{ 0x1.08ddd2p21, 0x1.10b148p24 },
{ 0x1.d1a75p20, 0x1.e12eep23 },
{ 0x1.99218cp20, 0x1.a854eap23 },
{ 0x1.674c6ap20, 0x1.7603bap23 },
{ 0x1.3b62b6p20, 0x1.4980ccp23 },
{ 0x1.14b54p20, 0x1.2225b2p23 },
{ 0x1.e55102p19, 0x1.febc1p22 },
{ 0x1.a964eep19, 0x1.c14b22p22 },
{ 0x1.74b17ap19, 0x1.8b0cfcp22 },
{ 0x1.465daap19, 0x1.5b2fe6p22 },
{ 0x1.1da944p19, 0x1.30f93cp22 },
{ 0x1.f3d41p18, 0x1.0bc30cp22 },
{ 0x1.b512a2p18, 0x1.d5f3a8p21 },
{ 0x1.7e03b2p18, 0x1.9c3518p21 },
{ 0x1.4dbb98p18, 0x1.6961b8p21 },
{ 0x1.236a1ap18, 0x1.3cab14p21 },
{ 0x1.fcae94p17, 0x1.155a0ap21 },
{ 0x1.bbc1ap17, 0x1.e5989p20 },
{ 0x1.82eedcp17, 0x1.a8e406p20 },
{ 0x1.5139a6p17, 0x1.7397c6p20 },
{ 0x1.25c354p17, 0x1.44d26ep20 },
{ 0x1.ff8f84p16, 0x1.1bcca4p20 },
{ 0x1.bd3474p16, 0x1.efac52p19 },
{ 0x1.834586p16, 0x1.b0a68ap19 },
{ 0x1.50b75cp16, 0x1.7974e8p19 },
{ 0x1.249ef2p16, 0x1.4924a8p19 },
{ 0x1.fc5b88p15, 0x1.1edfa4p19 },
{ 0x1.b95ceep15, 0x1.f3d218p18 },
{ 0x1.7f03bap15, 0x1.b334fap18 },
{ 0x1.4c389cp15, 0x1.7ac2d8p18 },
{ 0x1.2006aep15, 0x1.4979acp18 },
{ 0x1.f32eap14, 0x1.1e767cp18 },
{ 0x1.b05cfep14, 0x1.f1e352p17 },
{ 0x1.764f46p14, 0x1.b0778cp17 },
{ 0x1.43e56cp14, 0x1.77756ep17 },
{ 0x1.18238p14, 0x1.45ce66p17 },
{ 0x1.e45a98p13, 0x1.1a95p17 },
{ 0x1.a284ccp13, 0x1.e9f2p16 },
{ 0x1.697596p13, 0x1.a887bep16 },
{ 0x1.3807acp13, 0x1.6fab64p16 },
{ 0x1.0d3b36p13, 0x1.3e44e4p16 },
{ 0x1.d0624p12, 0x1.135f28p16 },
{ 0x1.904e0cp12, 0x1.dc479ep15 },
{ 0x1.58e72ap12, 0x1.9baed4p15 },
{ 0x1.2906ccp12, 0x1.63ac6cp15 },
{ 0x1.ff58dap11, 0x1.33225ap15 },
{ 0x1.b7f1f4p11, 0x1.0916fp15 },
{ 0x1.7a551p11, 0x1.c960cp14 },
{ 0x1.453142p11, 0x1.8a6174p14 },
{ 0x1.1761f8p11, 0x1.53e4f8p14 },
{ 0x1.dfd296p10, 0x1.24caf2p14 },
{ 0x1.9bd5fp10, 0x1.f830cp13 },
{ 0x1.61501p10, 0x1.b1e5acp13 },
{ 0x1.2ef6p10, 0x1.7538c6p13 },
{ 0x1.03a918p10, 0x1.40dfd8p13 },
{ 0x1.bce26ap9, 0x1.13bc08p13 },
{ 0x1.7cef42p9, 0x1.d9a88p12 },
{ 0x1.46056p9, 0x1.96a0b4p12 },
{ 0x1.16e3cap9, 0x1.5ce9acp12 },
{ 0x1.dcea68p8, 0x1.2b3e54p12 },
{ 0x1.97945ap8, 0x1.0085p12 },
{ 0x1.5c2828p8, 0x1.b7937ep11 },
{ 0x1.29415p8, 0x1.7872dap11 },
{ 0x1.fb58fap7, 0x1.423acp11 },
{ 0x1.b0c1a8p7, 0x1.13af5p11 },
{ 0x1.70f474p7, 0x1.d77f0cp10 },
{ 0x1.3a68a8p7, 0x1.92ff34p10 },
{ 0x1.0bcc6p7, 0x1.5847eep10 },
{ 0x1.c7fa0cp6, 0x1.25f9eep10 },
{ 0x1.8401b6p6, 0x1.f5cc78p9 },
{ 0x1.4a029ap6, 0x1.ac0f6p9 },
{ 0x1.188c46p6, 0x1.6cfa9cp9 },
{ 0x1.dcc4fap5, 0x1.370ab8p9 },
{ 0x1.94ec06p5, 0x1.08f24p9 },
{ 0x1.57bc96p5, 0x1.c324c2p8 },
{ 0x1.23a81ap5, 0x1.7fe904p8 },
{ 0x1.eeb278p4, 0x1.46897ep8 },
{ 0x1.a35794p4, 0x1.159a38p8 },
{ 0x1.634b8p4, 0x1.d7c594p7 },
{ 0x1.2ce2a4p4, 0x1.90ae4ep7 },
{ 0x1.fd5f08p3, 0x1.5422fp7 },
{ 0x1.aef3cep3, 0x1.20998p7 },
{ 0x1.6c6e62p3, 0x1.e98102p6 },
{ 0x1.3407b6p3, 0x1.9eee06p6 },
{ 0x1.043bap3, 0x1.5f8b88p6 },
{ 0x1.b77e5cp2, 0x1.29b294p6 },
{ 0x1.72f0c4p2, 0x1.f7f338p5 },
{ 0x1.38ee18p2, 0x1.aa5772p5 },
{ 0x1.07dd68p2, 0x1.68823ep5 },
{ 0x1.bcc58ep1, 0x1.30b14ep5 },
{ 0x1.76aca4p1, 0x1.01647cp5 },
{ 0x1.3b7912p1, 0x1.b2a87ep4 },
{ 0x1.097f82p1, 0x1.6ed2f2p4 },
{ 0x1.beaa3ep0, 0x1.356cd6p4 },
{ 0x1.778be2p0, 0x1.04e15ep4 },
{ 0x1.3b9984p0, 0x1.b7b04p3 },
{ 0x1.09182cp0, 0x1.725862p3 },
{ 0x1.bd20fcp-1, 0x1.37c92cp3 },
{ 0x1.75892p-1, 0x1.065b96p3 },
{ 0x1.394e7ap-1, 0x1.b950d4p2 },
{ 0x1.06a996p-1, 0x1.72fd94p2 },
{ 0x1.b8328ep-2, 0x1.37b83cp2 },
{ 0x1.70aff4p-2, 0x1.05ca5p2 },
{ 0x1.34a53cp-2, 0x1.b7807ep1 },
{ 0x1.0241dep-2, 0x1.70bebp1 },
{ 0x1.affb9p-3, 0x1.353a6cp1 },
{ 0x1.691c7cp-3, 0x1.0330fp1 },
{ 0x1.2db8cap-3, 0x1.b24a16p0 },
{ 0x1.f7f4f8p-4, 0x1.6ba91ap0 },
{ 0x1.a4ab64p-4, 0x1.305e98p0 },
{ 0x1.5efa4ep-4, 0x1.fd3de2p-1 },
{ 0x1.24b0d8p-4, 0x1.a9cc94p-1 },
{ 0x1.e7eeap-5, 0x1.63daf8p-1 },
{ 0x1.96826ep-5, 0x1.294176p-1 },
{ 0x1.5282d2p-5, 0x1.f05e82p-2 },
{ 0x1.19c05p-5, 0x1.9e39dcp-2 },
{ 0x1.d4ca9cp-6, 0x1.5982p-2 },
{ 0x1.85cfacp-6, 0x1.200c8ap-2 },
{ 0x1.43fb32p-6, 0x1.e00e92p-3 },
{ 0x1.0d2382p-6, 0x1.8fd4ep-3 },
{ 0x1.bef1b2p-7, 0x1.4cd9cp-3 },
{ 0x1.72ede4p-7, 0x1.14f48ap-3 },
{ 0x1.33b1cap-7, 0x1.ccaaeap-4 },
{ 0x1.fe3bdp-8, 0x1.7eef14p-4 },
{ 0x1.a6d7d2p-8, 0x1.3e2964p-4 },
{ 0x1.5e4062p-8, 0x1.083768p-4 },
{ 0x1.21fb7ap-8, 0x1.b69f1p-5 },
{ 0x1.dfefbep-9, 0x1.6be574p-5 },
{ 0x1.8cf816p-9, 0x1.2dc11ap-5 },
{ 0x1.482fa8p-9, 0x1.f4343cp-6 },
{ 0x1.0f30c4p-9, 0x1.9e614ep-6 },
{ 0x1.bff86ep-10, 0x1.571d34p-6 },
{ 0x1.71d0b6p-10, 0x1.1bf742p-6 },
{ 0x1.3125f6p-10, 0x1.d5cc6cp-7 },
{ 0x1.f755eap-11, 0x1.846e9ep-7 },
{ 0x1.9eebaap-11, 0x1.410048p-7 },
{ 0x1.55df18p-11, 0x1.09258p-7 },
{ 0x1.198c18p-11, 0x1.b5ceb6p-8 },
{ 0x1.cf82ep-12, 0x1.69468p-8 },
{ 0x1.7d5af6p-12, 0x1.29f9e8p-8 },
{ 0x1.399c28p-12, 0x1.eb4b9ep-9 },
{ 0x1.01c65ap-12, 0x1.94d1dep-9 },
{ 0x1.a78e82p-13, 0x1.4d6706p-9 },
{ 0x1.5bcf92p-13, 0x1.127346p-9 },
{ 0x1.1d791cp-13, 0x1.c39fap-10 },
{ 0x1.d463dcp-14, 0x1.73679cp-10 },
{ 0x1.8011fcp-14, 0x1.314916p-10 },
{ 0x1.3ac71cp-14, 0x1.f5a11ap-11 },
{ 0x1.01dcc2p-14, 0x1.9beca8p-11 },
{ 0x1.a6459cp-15, 0x1.52189ap-11 },
{ 0x1.59962ap-15, 0x1.155d48p-11 },
{ 0x1.1ab0e4p-15, 0x1.c6dc8ap-12 },
{ 0x1.ce42dep-16, 0x1.74ca88p-12 },
{ 0x1.79c43p-16, 0x1.31612ap-12 },
{ 0x1.349128p-16, 0x1.f4125ap-13 },
{ 0x1.f7d80ep-17, 0x1.993e82p-13 },
{ 0x1.9b270cp-17, 0x1.4ec006p-13 },
{ 0x1.4f59fap-17, 0x1.11aebp-13 },
{ 0x1.1164acp-17, 0x1.bf4ab2p-14 },
{ 0x1.bd8c96p-18, 0x1.6d561ep-14 },
{ 0x1.6ae172p-18, 0x1.2a406ep-14 },
{ 0x1.276874p-18, 0x1.e6bba6p-15 },
{ 0x1.e0bad2p-19, 0x1.8cf814p-15 },
{ 0x1.86f788p-19, 0x1.4399f8p-15 },
{ 0x1.3dcfaep-19, 0x1.07aa3p-15 },
{ 0x1.023828p-19, 0x1.ad7302p-16 },
{ 0x1.a3666ep-20, 0x1.5d90f4p-16 },
{ 0x1.546e38p-20, 0x1.1c674ep-16 },
{ 0x1.143264p-20, 0x1.ce8ccp-17 },
{ 0x1.bff316p-21, 0x1.77f562p-17 },
{ 0x1.6b13ecp-21, 0x1.316da8p-17 },
{ 0x1.2624f4p-21, 0x1.f0046p-18 },
{ 0x1.dc5de4p-22, 0x1.92920ap-18 },
{ 0x1.818d3ap-22, 0x1.4691b2p-18 },
{ 0x1.37e62p-22, 0x1.08c96ap-18 },
{ 0x1.f8637ep-23, 0x1.ad2d0ap-19 },
{ 0x1.97a3dcp-23, 0x1.5ba462p-19 },
{ 0x1.494a4p-23, 0x1.1975ep-19 },
{ 0x1.09dee4p-23, 0x1.c78892p-20 },
{ 0x1.ad1fap-24, 0x1.7073c4p-20 },
{ 0x1.5a245ep-24, 0x1.29df48p-20 },
{ 0x1.171278p-24, 0x1.e163bep-21 },
{ 0x1.c1c74cp-25, 0x1.84cbbp-21 },
{ 0x1.6a46f4p-25, 0x1.39dbcep-21 },
{ 0x1.23a858p-25, 0x1.fa7b92p-22 },
{ 0x1.d56196p-26, 0x1.9876ap-22 },
{ 0x1.7984b6p-26, 0x1.4940bcp-22 },
{ 0x1.2f7cc4p-26, 0x1.094608p-22 },
{ 0x1.e7b62cp-27, 0x1.ab3e8cp-23 },
{ 0x1.87b15ep-27, 0x1.57e33ep-23 },
{ 0x1.3a6dp-27, 0x1.14a8b6p-23 },
{ 0x1.f88ebap-28, 0x1.bcede6p-24 },
{ 0x1.94a282p-28, 0x1.659918p-24 },
{ 0x1.44580ap-28, 0x1.1f4498p-24 },
{ 0x1.03dbf8p-28, 0x1.cd5086p-25 },
{ 0x1.a03066p-29, 0x1.723974p-25 },
{ 0x1.4d1f2ep-29, 0x1.28f9cap-25 },
{ 0x1.0a814ap-29, 0x1.dc34b6p-26 },
{ 0x1.aa36cap-30, 0x1.7d9dbp-26 },
{ 0x1.54a6b6p-30, 0x1.31aa56p-26 },
{ 0x1.102232p-30, 0x1.e96c26p-27 },
{ 0x1.b2959ep-31, 0x1.87a218p-27 },
{ 0x1.5ad66cp-31, 0x1.393ad2p-27 },
{ 0x1.14ac7ep-31, 0x1.f4ccdap-28 },
{ 0x1.b931b8p-32, 0x1.9026a8p-28 },
{ 0x1.5f9a24p-32, 0x1.3f92eap-28 },
{ 0x1.181154p-32, 0x1.fe3208p-29 },
{ 0x1.bdf55ep-33, 0x1.970fbp-29 },
{ 0x1.62e226p-33, 0x1.449de6p-29 },
{ 0x1.1a4576p-33, 0x1.02be7p-29 },
{ 0x1.c0d0bep-34, 0x1.9c4672p-30 },
{ 0x1.64a386p-34, 0x1.484b1ep-30 },
{ 0x1.1b418cp-34, 0x1.054a9ap-30 },
{ 0x1.c1ba4ap-35, 0x1.9fb994p-31 },
{ 0x1.64d86p-35, 0x1.4a8e4ep-31 },
{ 0x1.1b0242p-35, 0x1.06b4fep-31 },
{ 0x1.c0aee6p-36, 0x1.a15d86p-32 },
{ 0x1.637ffap-36, 0x1.4b5fdep-32 },
{ 0x1.198862p-36, 0x1.06f8dap-32 },
{ 0x1.bdb204p-37, 0x1.a12cc8p-33 },
{ 0x1.609ec2p-37, 0x1.4abd0ap-33 },
{ 0x1.16d8d2p-37, 0x1.06154ap-33 },
{ 0x1.b8cd88p-38, 0x1.9f27fap-34 },
{ 0x1.5c3e42p-38, 0x1.48a7fcp-34 },
{ 0x1.12fc6cp-38, 0x1.040d4ap-34 },
{ 0x1.b2119p-39, 0x1.9b55e8p-35 },
{ 0x1.566cep-39, 0x1.4527acp-35 },
{ 0x1.0dffep-39, 0x1.00e7acp-35 },
{ 0x1.a99426p-40, 0x1.95c358p-36 },
{ 0x1.4f3d92p-40, 0x1.4047cep-36 },
{ 0x1.07f35ep-40, 0x1.f95dcep-37 },
{ 0x1.9f70cp-41, 0x1.8e82cep-37 },
{ 0x1.46c77ap-41, 0x1.3a1882p-37 },
{ 0x1.00ea48p-41, 0x1.eee1d4p-38 },
{ 0x1.93c7acp-42, 0x1.85ac18p-38 },
{ 0x1.3d256ap-42, 0x1.32ae04p-38 },
{ 0x1.f1f59p-43, 0x1.e27d88p-39 },
{ 0x1.86bd6ap-43, 0x1.7b5bdap-39 },
{ 0x1.327554p-43, 0x1.2a2036p-39 },
{ 0x1.e07ab4p-44, 0x1.d458ap-40 },
{ 0x1.7879ecp-44, 0x1.6fb2eap-40 },
{ 0x1.26d7bp-44, 0x1.208a2cp-40 },
{ 0x1.cd98a2p-45, 0x1.c49f8ap-41 },
{ 0x1.6927c2p-45, 0x1.62d5aap-41 },
{ 0x1.1a6ed6p-45, 0x1.16098ep-41 },
{ 0x1.b986acp-46, 0x1.b3828ep-42 },
{ 0x1.58f35ap-46, 0x1.54eb3ep-42 },
{ 0x1.0d5e6p-46, 0x1.0abe0ep-42 },
{ 0x1.a47db6p-47, 0x1.a134d4p-43 },
{ 0x1.480a18p-47, 0x1.461cdap-43 },
{ 0x1.ff94e4p-48, 0x1.fd9182p-44 },
{ 0x1.8eb738p-48, 0x1.8deb62p-44 },
{ 0x1.369994p-48, 0x1.3694e8p-44 },
{ 0x1.e3ae4ap-49, 0x1.e49706p-45 },
{ 0x1.786c3ep-49, 0x1.79dc28p-45 },
{ 0x1.24cec8p-49, 0x1.267e46p-45 },
{ 0x1.c74fc4p-50, 0x1.cad0bp-46 },
{ 0x1.61d46cp-50, 0x1.653d08p-46 },
{ 0x1.12d55cp-50, 0x1.16038cp-46 },
{ 0x1.aabdacp-51, 0x1.b081aap-47 },
{ 0x1.4b252ep-51, 0x1.5042e2p-47 },
{ 0x1.00d6f8p-51, 0x1.054e44p-47 },
{ 0x1.8e38ep-52, 0x1.95eb2cp-48 },
{ 0x1.3490e8p-52, 0x1.3b20c6p-48 },
{ 0x1.ddf56ap-53, 0x1.e90cb6p-49 },
{ 0x1.71fdep-53, 0x1.7b4b76p-49 },
{ 0x1.1e465ap-53, 0x1.26072ap-49 },
{ 0x1.bac92ep-54, 0x1.c7a2ecp-50 },
{ 0x1.56441cp-54, 0x1.60dcfp-50 },
{ 0x1.08700cp-54, 0x1.112346p-50 },
{ 0x1.986a66p-55, 0x1.a6a50ap-51 },
{ 0x1.3b3d56p-55, 0x1.46d572p-51 },
{ 0x1.e667dap-56, 0x1.f93d0ep-52 },
{ 0x1.7712b8p-56, 0x1.86529ep-52 },
{ 0x1.211544p-56, 0x1.2d65aep-52 },
{ 0x1.bd660ap-57, 0x1.d13c32p-53 },
{ 0x1.56f3eep-57, 0x1.66e45ap-53 },
{ 0x1.07f14ap-57, 0x1.14b8b6p-53 },
{ 0x1.96129cp-58, 0x1.aa854cp-54 },
{ 0x1.3837cp-58, 0x1.488b94p-54 },
{ 0x1.dfe0c2p-59, 0x1.f9e772p-55 },
{ 0x1.709b5ap-59, 0x1.85503p-55 },
{ 0x1.1affd2p-59, 0x1.2b7218p-55 },
{ 0x1.b2564p-60, 0x1.cc6bb6p-56 },
{ 0x1.4d23fap-60, 0x1.61cb1ap-56 },
{ 0x1.fecbdp-61, 0x1.0fba0ep-56 },
{ 0x1.8767d8p-61, 0x1.a13072p-57 },
{ 0x1.2bc67ep-61, 0x1.401abcp-57 },
{ 0x1.caf846p-62, 0x1.eafc2cp-58 },
{ 0x1.5f2e7ap-62, 0x1.785cp-58 },
{ 0x1.0c93acp-62, 0x1.205a7ep-58 },
{ 0x1.9a9b06p-63, 0x1.b9a31ap-59 },
{ 0x1.39b7fcp-63, 0x1.520968p-59 },
{ 0x1.df277ap-64, 0x1.029ce6p-59 },
{ 0x1.6dbcdp-64, 0x1.8b81d6p-60 },
{ 0x1.17080ap-64, 0x1.2e48f2p-60 },
{ 0x1.a98e26p-65, 0x1.cdd86cp-61 },
{ 0x1.445a6ap-65, 0x1.60a47ap-61 },
{ 0x1.ee324ep-66, 0x1.0d210cp-61 },
{ 0x1.784e3p-66, 0x1.9a961ep-62 },
{ 0x1.1e65fep-66, 0x1.390b74p-62 },
{ 0x1.b3bb86p-67, 0x1.dd1e52p-63 },
{ 0x1.4b4e36p-67, 0x1.6b6a7ap-63 },
{ 0x1.f790f6p-68, 0x1.14acc2p-63 },
{ 0x1.7e82cep-68, 0x1.a511aap-64 },
{ 0x1.226a7ap-68, 0x1.404114p-64 },
{ 0x1.b8c634p-69, 0x1.e6ea96p-65 },
{ 0x1.4e53acp-69, 0x1.71f97ap-65 },
{ 0x1.faed5cp-70, 0x1.18fb2ep-65 },
{ 0x1.80217ep-70, 0x1.aa947ep-66 },
{ 0x1.22f066p-70, 0x1.43a796p-66 },
{ 0x1.b87f86p-71, 0x1.eae2fp-67 },
{ 0x1.4d4ec8p-71, 0x1.7414e6p-67 },
{ 0x1.f8283ep-72, 0x1.19e474p-67 },
{ 0x1.7d1b22p-72, 0x1.aaeb7ep-68 },
{ 0x1.1ff2dp-72, 0x1.431f66p-68 },
{ 0x1.b2e9e8p-73, 0x1.e8e272p-69 },
{ 0x1.4848dep-73, 0x1.71a91ep-69 },
{ 0x1.ef5b16p-74, 0x1.176014p-69 },
{ 0x1.758b92p-74, 0x1.a6137cp-70 },
{ 0x1.198d42p-74, 0x1.3ead74p-70 },
{ 0x1.a838bp-75, 0x1.e0fbc2p-71 },
{ 0x1.3f700cp-75, 0x1.6accaep-71 },
{ 0x1.e0d68ep-76, 0x1.118578p-71 },
{ 0x1.69b7f4p-76, 0x1.9c3974p-72 },
{ 0x1.0ffa12p-76, 0x1.367afap-72 },
{ 0x1.98cd1cp-77, 0x1.d377fap-73 },
{ 0x1.33148p-77, 0x1.5fbee6p-73 },
{ 0x1.cd1dbap-78, 0x1.088a8p-73 },
{ 0x1.5a0a9cp-78, 0x1.8db7ccp-74 },
{ 0x1.038ef4p-78, 0x1.2ad2ecp-74 },
{ 0x1.85308ap-79, 0x1.c0d23ep-75 },
{ 0x1.23a3cp-79, 0x1.50e41ap-75 },
{ 0x1.b4de68p-80, 0x1.f980a8p-76 },
{ 0x1.470ce4p-80, 0x1.7b10fep-76 },
{ 0x1.e9700cp-81, 0x1.1c1d98p-76 },
{ 0x1.6e0c9p-81, 0x1.a9b08p-77 },
{ 0x1.11a25ap-81, 0x1.3ebfb4p-77 },
{ 0x1.98e73ap-82, 0x1.dd1d36p-78 },
{ 0x1.315f58p-82, 0x1.64e7fp-78 },
{ 0x1.c7e35cp-83, 0x1.0ada94p-78 },
{ 0x1.542176p-83, 0x1.8ed9e8p-79 },
{ 0x1.fb491ep-84, 0x1.29ecb2p-79 },
{ 0x1.7a1c34p-84, 0x1.bcdb34p-80 },
{ 0x1.19b0f2p-84, 0x1.4bf6cap-80 },
{ 0x1.a383cap-85, 0x1.ef3318p-81 },
{ 0x1.383bf2p-85, 0x1.712bc2p-81 },
{ 0x1.d08cdap-86, 0x1.13151p-81 },
{ 0x1.596adp-86, 0x1.99bf36p-82 },
{ 0x1.00b602p-86, 0x1.3104d6p-82 },
{ 0x1.7d62a2p-87, 0x1.c5e534p-83 },
{ 0x1.1b2abcp-87, 0x1.518db2p-83 },
{ 0x1.a4480ep-88, 0x1.f5d1c6p-84 },
{ 0x1.37be42p-88, 0x1.74d45ap-84 },
{ 0x1.ce3ee4p-89, 0x1.14dc4ap-84 },
{ 0x1.568986p-89, 0x1.9afd0ep-85 },
{ 0x1.fb69c6p-90, 0x1.30e632p-85 },
{ 0x1.77a47ep-90, 0x1.c42b48p-86 },
{ 0x1.15f4ep-90, 0x1.4f1f52p-86 },
{ 0x1.9b25dcp-91, 0x1.f08156p-87 },
{ 0x1.2feeeep-91, 0x1.6f9f62p-87 },
{ 0x1.c122bcp-92, 0x1.100ffap-87 },
{ 0x1.4bb154p-92, 0x1.927ce6p-88 },
{ 0x1.e9ae56p-93, 0x1.2992f4p-88 },
{ 0x1.6948e8p-93, 0x1.b7cccap-89 },
{ 0x1.0a6cd2p-93, 0x1.44d7c4p-89 },
{ 0x1.88c0cap-94, 0x1.dfa22p-90 },
{ 0x1.215988p-94, 0x1.61eb26p-90 },
{ 0x1.aa222ap-95, 0x1.0506e2p-90 },
{ 0x1.39a30ep-95, 0x1.80d828p-91 },
{ 0x1.cd740ep-96, 0x1.1b8f04p-91 },
{ 0x1.534d82p-96, 0x1.a1a7ecp-92 },
{ 0x1.f2bb06p-97, 0x1.336f3p-92 },
{ 0x1.6e5b34p-97, 0x1.c46172p-93 },
{ 0x1.0cfc82p-97, 0x1.4cab82p-93 },
{ 0x1.8acc82p-98, 0x1.e9094cp-94 },
{ 0x1.219686p-98, 0x1.67465p-94 },
{ 0x1.a89fa6p-99, 0x1.07d0b8p-94 },
{ 0x1.372982p-99, 0x1.833ffap-95 },
{ 0x1.c7d094p-100, 0x1.1c147ap-95 },
{ 0x1.4db1c8p-100, 0x1.a096ccp-96 },
{ 0x1.e858d8p-101, 0x1.314decp-96 },
{ 0x1.6529ep-101, 0x1.bf46cep-97 },
{ 0x1.0517bap-101, 0x1.47796ap-97 },
{ 0x1.7d8a8p-102, 0x1.df49a2p-98 },
{ 0x1.16a46p-102, 0x1.5e9198p-98 },
{ 0x1.96ca76p-103, 0x1.004b34p-98 },
{ 0x1.28cb2cp-103, 0x1.768f3ep-99 },
{ 0x1.b0de98p-104, 0x1.1190d2p-99 },
},
};


@@ -0,0 +1,113 @@
/* Single-precision vector (SVE) erfc function
Copyright (C) 2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
static const struct data
{
uint32_t off_idx, off_arr;
float max, shift;
float third, two_thirds, two_over_fifteen, two_over_five, tenth;
} data = {
/* Set an offset so the range of the index used for lookup is 644, and it can
be clamped using a saturated add. */
.off_idx = 0xb7fffd7b, /* 0xffffffff - asuint(shift) - 644. */
.off_arr = 0xfffffd7b, /* 0xffffffff - 644. */
.max = 10.0625f, /* 644/64. */
.shift = 0x1p17f,
.third = 0x1.555556p-2f,
.two_thirds = 0x1.555556p-1f,
.two_over_fifteen = 0x1.111112p-3f,
.two_over_five = -0x1.99999ap-2f,
.tenth = -0x1.99999ap-4f,
};
#define SignMask 0x80000000
#define TableScale 0x28000000 /* 0x1p-47. */
/* Optimized single-precision vector erfcf(x).
Approximation based on series expansion near x rounded to
nearest multiple of 1/64.
Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+ (2/15 r^4 - 2/5 r^2 + 1/10) d^4
Values of erfc(r) and scale are read from lookup tables. Stored values
are scaled to avoid hitting the subnormal range.
Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0).
_ZGVsMxv_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120
want 0x1.f51216p-120. */
svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
{
const struct data *dat = ptr_barrier (&data);
svfloat32_t a = svabs_x (pg, x);
/* Clamp input at |x| <= 10.0 + 4/64. */
a = svmin_x (pg, a, dat->max);
/* Reduce x to the nearest multiple of 1/64. */
svfloat32_t shift = sv_f32 (dat->shift);
svfloat32_t z = svadd_x (pg, a, shift);
/* Saturate index for the NaN case. */
svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx);
/* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
i = svmul_x (pg, i, 2);
const float32_t *p = &__erfcf_data.tab[0].erfc - 2 * dat->off_arr;
svfloat32_t erfcr = svld1_gather_index (pg, p, i);
svfloat32_t scale = svld1_gather_index (pg, p + 1, i);
/* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
svfloat32_t r = svsub_x (pg, z, shift);
svfloat32_t d = svsub_x (pg, a, r);
svfloat32_t d2 = svmul_x (pg, d, d);
svfloat32_t r2 = svmul_x (pg, r, r);
svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third);
svfloat32_t third = svdup_lane (coeffs, 0);
svfloat32_t p1 = r;
svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1);
svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2);
p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4);
svfloat32_t y = svmla_x (pg, p3, d, p4);
y = svmla_x (pg, p2, d, y);
y = svmla_x (pg, p1, d, y);
/* Handles the |x| = inf/nan case. */
y = svmls_x (pg, erfcr, scale, svmls_x (pg, d, d2, y));
/* Offset equals 2.0f if sign, else 0.0f. */
svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), SignMask);
svfloat32_t off = svreinterpret_f32 (svlsr_x (pg, sign, 1));
/* Handle sign and scale back in a single fma. */
svfloat32_t fac = svreinterpret_f32 (svorr_x (pg, sign, TableScale));
return svmla_x (pg, off, fac, y);
}
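
The off_idx/off_arr pair used by both erfcf variants implements the index
clamp without a compare: adding shift = 2^17 leaves round(a * 64) in the low
mantissa bits of z, and the offset places the last valid index at 0xffffffff,
so a saturating add clamps anything larger (including the reinterpreted bits
of a NaN) to 644.  A scalar sketch, where sat_add_u32 is a hypothetical
stand-in for vqaddq_u32/svqadd:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t
sat_add_u32 (uint32_t a, uint32_t b) /* Models vqaddq_u32 / svqadd.  */
{
  uint32_t s = a + b;
  return s < a ? UINT32_MAX : s;
}

int
main (void)
{
  const uint32_t off_idx = 0xb7fffd7b; /* 0xffffffff - asuint(0x1p17f) - 644.  */
  const uint32_t off_arr = 0xfffffd7b; /* 0xffffffff - 644.  */
  /* The real kernels first clamp a to 10.0625 with a min; 12.5 is included
     here only to show the saturation path.  */
  for (float a = 0.0f; a < 13.0f; a += 2.5f)
    {
      float z = a + 0x1p17f;
      uint32_t iz;
      memcpy (&iz, &z, sizeof iz);
      uint32_t i = sat_add_u32 (iz, off_idx);
      printf ("a=%-5g index=%u\n", a, i - off_arr); /* Clamped to <= 644.  */
    }
  return 0;
}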


@@ -33,6 +33,7 @@ VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
VPCS_VECTOR_WRAPPER (cosh_advsimd, _ZGVnN2v_cosh)
VPCS_VECTOR_WRAPPER (erf_advsimd, _ZGVnN2v_erf)
VPCS_VECTOR_WRAPPER (erfc_advsimd, _ZGVnN2v_erfc)
VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
VPCS_VECTOR_WRAPPER (exp2_advsimd, _ZGVnN2v_exp2)


@@ -52,6 +52,7 @@ SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
SVE_VECTOR_WRAPPER (cosh_sve, _ZGVsMxv_cosh)
SVE_VECTOR_WRAPPER (erf_sve, _ZGVsMxv_erf)
SVE_VECTOR_WRAPPER (erfc_sve, _ZGVsMxv_erfc)
SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
SVE_VECTOR_WRAPPER (exp2_sve, _ZGVsMxv_exp2)


@@ -33,6 +33,7 @@ VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
VPCS_VECTOR_WRAPPER (coshf_advsimd, _ZGVnN4v_coshf)
VPCS_VECTOR_WRAPPER (erff_advsimd, _ZGVnN4v_erff)
VPCS_VECTOR_WRAPPER (erfcf_advsimd, _ZGVnN4v_erfcf)
VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
VPCS_VECTOR_WRAPPER (exp2f_advsimd, _ZGVnN4v_exp2f)


@@ -52,6 +52,7 @@ SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
SVE_VECTOR_WRAPPER (coshf_sve, _ZGVsMxv_coshf)
SVE_VECTOR_WRAPPER (erff_sve, _ZGVsMxv_erff)
SVE_VECTOR_WRAPPER (erfcf_sve, _ZGVsMxv_erfcf)
SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
SVE_VECTOR_WRAPPER (exp2f_sve, _ZGVsMxv_exp2f)


@@ -114,4 +114,20 @@ extern const struct sv_erf_data
double scale[769];
} __sv_erf_data attribute_hidden;
extern const struct erfc_data
{
struct
{
double erfc, scale;
} tab[3488];
} __erfc_data attribute_hidden;
extern const struct erfcf_data
{
struct
{
float erfc, scale;
} tab[645];
} __erfcf_data attribute_hidden;
#endif
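
The array sizes declared above give a rough idea of the data footprint the
patch adds, and why one table's diff is suppressed as too large (presumably
the double-precision erfc_data table).  A standalone sketch, duplicating the
layout rather than including the internal header:

#include <stdio.h>

struct erfc_entry { double erfc, scale; };   /* Mirrors __erfc_data.tab[].  */
struct erfcf_entry { float erfc, scale; };   /* Mirrors __erfcf_data.tab[].  */

int
main (void)
{
  printf ("__erfc_data:  %zu bytes\n", 3488 * sizeof (struct erfc_entry));
  printf ("__erfcf_data: %zu bytes\n", 645 * sizeof (struct erfcf_entry));
  return 0;
}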


@@ -1017,11 +1017,19 @@ double: 2
float: 2
ldouble: 4
Function: "erfc_advsimd":
double: 1
float: 1
Function: "erfc_downward":
double: 4
float: 4
ldouble: 5
Function: "erfc_sve":
double: 1
float: 1
Function: "erfc_towardzero":
double: 3
float: 3


@@ -82,6 +82,8 @@ GLIBC_2.40 _ZGVnN2v_atanhf F
GLIBC_2.40 _ZGVnN2v_cosh F
GLIBC_2.40 _ZGVnN2v_coshf F
GLIBC_2.40 _ZGVnN2v_erf F
GLIBC_2.40 _ZGVnN2v_erfc F
GLIBC_2.40 _ZGVnN2v_erfcf F
GLIBC_2.40 _ZGVnN2v_erff F
GLIBC_2.40 _ZGVnN2v_sinh F
GLIBC_2.40 _ZGVnN2v_sinhf F
@@ -91,6 +93,7 @@ GLIBC_2.40 _ZGVnN4v_acoshf F
GLIBC_2.40 _ZGVnN4v_asinhf F
GLIBC_2.40 _ZGVnN4v_atanhf F
GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erfcf F
GLIBC_2.40 _ZGVnN4v_erff F
GLIBC_2.40 _ZGVnN4v_sinhf F
GLIBC_2.40 _ZGVnN4v_tanhf F
@@ -103,6 +106,8 @@ GLIBC_2.40 _ZGVsMxv_atanhf F
GLIBC_2.40 _ZGVsMxv_cosh F
GLIBC_2.40 _ZGVsMxv_coshf F
GLIBC_2.40 _ZGVsMxv_erf F
GLIBC_2.40 _ZGVsMxv_erfc F
GLIBC_2.40 _ZGVsMxv_erfcf F
GLIBC_2.40 _ZGVsMxv_erff F
GLIBC_2.40 _ZGVsMxv_sinh F
GLIBC_2.40 _ZGVsMxv_sinhf F