glibc/sysdeps/ieee754/dbl-64/mpa.c
Siddhesh Poyarekar 5739f705ee Use integral constants
The compiler is smart enough to convert those into double for powerpc,
but if we put them as doubles, it adds overhead by performing those
operations in floating point mode.
2013-03-26 20:24:04 +05:30

905 lines
19 KiB
C

/*
* IBM Accurate Mathematical Library
* written by International Business Machines Corp.
* Copyright (C) 2001-2013 Free Software Foundation, Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
/************************************************************************/
/* MODULE_NAME: mpa.c */
/* */
/* FUNCTIONS: */
/* mcr */
/* acr */
/* cpy */
/* norm */
/* denorm */
/* mp_dbl */
/* dbl_mp */
/* add_magnitudes */
/* sub_magnitudes */
/* add */
/* sub */
/* mul */
/* inv */
/* dvd */
/* */
/* Arithmetic functions for multiple precision numbers. */
/* Relative errors are bounded */
/************************************************************************/
#include "endian.h"
#include "mpa.h"
#include <sys/param.h>
#include <alloca.h>
#ifndef SECTION
# define SECTION
#endif
#ifndef NO__CONST
const mp_no mpone = {1, {1.0, 1.0}};
const mp_no mptwo = {1, {1.0, 2.0}};
#endif
#ifndef NO___ACR
/* Compare mantissa of two multiple precision numbers regardless of the sign
and exponent of the numbers. */
static int
mcr (const mp_no *x, const mp_no *y, int p)
{
long i;
long p2 = p;
for (i = 1; i <= p2; i++)
{
if (X[i] == Y[i])
continue;
else if (X[i] > Y[i])
return 1;
else
return -1;
}
return 0;
}
/* Compare the absolute values of two multiple precision numbers. */
int
__acr (const mp_no *x, const mp_no *y, int p)
{
long i;
if (X[0] == 0)
{
if (Y[0] == 0)
i = 0;
else
i = -1;
}
else if (Y[0] == 0)
i = 1;
else
{
if (EX > EY)
i = 1;
else if (EX < EY)
i = -1;
else
i = mcr (x, y, p);
}
return i;
}
#endif
#ifndef NO___CPY
/* Copy multiple precision number X into Y. They could be the same
number. */
void
__cpy (const mp_no *x, mp_no *y, int p)
{
long i;
EY = EX;
for (i = 0; i <= p; i++)
Y[i] = X[i];
}
#endif
#ifndef NO___MP_DBL
/* Convert a multiple precision number *X into a double precision
number *Y, normalized case (|x| >= 2**(-1022))). */
static void
norm (const mp_no *x, double *y, int p)
{
#define R RADIXI
long i;
double c;
mantissa_t a, u, v, z[5];
if (p < 5)
{
if (p == 1)
c = X[1];
else if (p == 2)
c = X[1] + R * X[2];
else if (p == 3)
c = X[1] + R * (X[2] + R * X[3]);
else if (p == 4)
c = (X[1] + R * X[2]) + R * R * (X[3] + R * X[4]);
}
else
{
for (a = 1, z[1] = X[1]; z[1] < TWO23;)
{
a *= 2;
z[1] *= 2;
}
for (i = 2; i < 5; i++)
{
mantissa_store_t d, r;
d = X[i] * (mantissa_store_t) a;
DIV_RADIX (d, r);
z[i] = r;
z[i - 1] += d;
}
u = ALIGN_DOWN_TO (z[3], TWO19);
v = z[3] - u;
if (v == TWO18)
{
if (z[4] == 0)
{
for (i = 5; i <= p; i++)
{
if (X[i] == 0)
continue;
else
{
z[3] += 1;
break;
}
}
}
else
z[3] += 1;
}
c = (z[1] + R * (z[2] + R * z[3])) / a;
}
c *= X[0];
for (i = 1; i < EX; i++)
c *= RADIX;
for (i = 1; i > EX; i--)
c *= RADIXI;
*y = c;
#undef R
}
/* Convert a multiple precision number *X into a double precision
number *Y, Denormal case (|x| < 2**(-1022))). */
static void
denorm (const mp_no *x, double *y, int p)
{
long i, k;
long p2 = p;
double c;
mantissa_t u, z[5];
#define R RADIXI
if (EX < -44 || (EX == -44 && X[1] < TWO5))
{
*y = 0;
return;
}
if (p2 == 1)
{
if (EX == -42)
{
z[1] = X[1] + TWO10;
z[2] = 0;
z[3] = 0;
k = 3;
}
else if (EX == -43)
{
z[1] = TWO10;
z[2] = X[1];
z[3] = 0;
k = 2;
}
else
{
z[1] = TWO10;
z[2] = 0;
z[3] = X[1];
k = 1;
}
}
else if (p2 == 2)
{
if (EX == -42)
{
z[1] = X[1] + TWO10;
z[2] = X[2];
z[3] = 0;
k = 3;
}
else if (EX == -43)
{
z[1] = TWO10;
z[2] = X[1];
z[3] = X[2];
k = 2;
}
else
{
z[1] = TWO10;
z[2] = 0;
z[3] = X[1];
k = 1;
}
}
else
{
if (EX == -42)
{
z[1] = X[1] + TWO10;
z[2] = X[2];
k = 3;
}
else if (EX == -43)
{
z[1] = TWO10;
z[2] = X[1];
k = 2;
}
else
{
z[1] = TWO10;
z[2] = 0;
k = 1;
}
z[3] = X[k];
}
u = ALIGN_DOWN_TO (z[3], TWO5);
if (u == z[3])
{
for (i = k + 1; i <= p2; i++)
{
if (X[i] == 0)
continue;
else
{
z[3] += 1;
break;
}
}
}
c = X[0] * ((z[1] + R * (z[2] + R * z[3])) - TWO10);
*y = c * TWOM1032;
#undef R
}
/* Convert multiple precision number *X into double precision number *Y. The
result is correctly rounded to the nearest/even. */
void
__mp_dbl (const mp_no *x, double *y, int p)
{
if (X[0] == 0)
{
*y = 0;
return;
}
if (__glibc_likely (EX > -42 || (EX == -42 && X[1] >= TWO10)))
norm (x, y, p);
else
denorm (x, y, p);
}
#endif
/* Get the multiple precision equivalent of X into *Y. If the precision is too
small, the result is truncated. */
void
SECTION
__dbl_mp (double x, mp_no *y, int p)
{
long i, n;
long p2 = p;
/* Sign. */
if (x == 0)
{
Y[0] = 0;
return;
}
else if (x > 0)
Y[0] = 1;
else
{
Y[0] = -1;
x = -x;
}
/* Exponent. */
for (EY = 1; x >= RADIX; EY += 1)
x *= RADIXI;
for (; x < 1; EY -= 1)
x *= RADIX;
/* Digits. */
n = MIN (p2, 4);
for (i = 1; i <= n; i++)
{
INTEGER_OF (x, Y[i]);
x *= RADIX;
}
for (; i <= p2; i++)
Y[i] = 0;
}
/* Add magnitudes of *X and *Y assuming that abs (*X) >= abs (*Y) > 0. The
sign of the sum *Z is not changed. X and Y may overlap but not X and Z or
Y and Z. No guard digit is used. The result equals the exact sum,
truncated. */
static void
SECTION
add_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
{
long i, j, k;
long p2 = p;
mantissa_t zk;
EZ = EX;
i = p2;
j = p2 + EY - EX;
k = p2 + 1;
if (__glibc_unlikely (j < 1))
{
__cpy (x, z, p);
return;
}
zk = 0;
for (; j > 0; i--, j--)
{
zk += X[i] + Y[j];
if (zk >= RADIX)
{
Z[k--] = zk - RADIX;
zk = 1;
}
else
{
Z[k--] = zk;
zk = 0;
}
}
for (; i > 0; i--)
{
zk += X[i];
if (zk >= RADIX)
{
Z[k--] = zk - RADIX;
zk = 1;
}
else
{
Z[k--] = zk;
zk = 0;
}
}
if (zk == 0)
{
for (i = 1; i <= p2; i++)
Z[i] = Z[i + 1];
}
else
{
Z[1] = zk;
EZ += 1;
}
}
/* Subtract the magnitudes of *X and *Y assuming that abs (*x) > abs (*y) > 0.
The sign of the difference *Z is not changed. X and Y may overlap but not X
and Z or Y and Z. One guard digit is used. The error is less than one
ULP. */
static void
SECTION
sub_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
{
long i, j, k;
long p2 = p;
mantissa_t zk;
EZ = EX;
i = p2;
j = p2 + EY - EX;
k = p2;
/* Y is too small compared to X, copy X over to the result. */
if (__glibc_unlikely (j < 1))
{
__cpy (x, z, p);
return;
}
/* The relevant least significant digit in Y is non-zero, so we factor it in
to enhance accuracy. */
if (j < p2 && Y[j + 1] > 0)
{
Z[k + 1] = RADIX - Y[j + 1];
zk = -1;
}
else
zk = Z[k + 1] = 0;
/* Subtract and borrow. */
for (; j > 0; i--, j--)
{
zk += (X[i] - Y[j]);
if (zk < 0)
{
Z[k--] = zk + RADIX;
zk = -1;
}
else
{
Z[k--] = zk;
zk = 0;
}
}
/* We're done with digits from Y, so it's just digits in X. */
for (; i > 0; i--)
{
zk += X[i];
if (zk < 0)
{
Z[k--] = zk + RADIX;
zk = -1;
}
else
{
Z[k--] = zk;
zk = 0;
}
}
/* Normalize. */
for (i = 1; Z[i] == 0; i++);
EZ = EZ - i + 1;
for (k = 1; i <= p2 + 1;)
Z[k++] = Z[i++];
for (; k <= p2;)
Z[k++] = 0;
}
/* Add *X and *Y and store the result in *Z. X and Y may overlap, but not X
and Z or Y and Z. One guard digit is used. The error is less than one
ULP. */
void
SECTION
__add (const mp_no *x, const mp_no *y, mp_no *z, int p)
{
int n;
if (X[0] == 0)
{
__cpy (y, z, p);
return;
}
else if (Y[0] == 0)
{
__cpy (x, z, p);
return;
}
if (X[0] == Y[0])
{
if (__acr (x, y, p) > 0)
{
add_magnitudes (x, y, z, p);
Z[0] = X[0];
}
else
{
add_magnitudes (y, x, z, p);
Z[0] = Y[0];
}
}
else
{
if ((n = __acr (x, y, p)) == 1)
{
sub_magnitudes (x, y, z, p);
Z[0] = X[0];
}
else if (n == -1)
{
sub_magnitudes (y, x, z, p);
Z[0] = Y[0];
}
else
Z[0] = 0;
}
}
/* Subtract *Y from *X and return the result in *Z. X and Y may overlap but
not X and Z or Y and Z. One guard digit is used. The error is less than
one ULP. */
void
SECTION
__sub (const mp_no *x, const mp_no *y, mp_no *z, int p)
{
int n;
if (X[0] == 0)
{
__cpy (y, z, p);
Z[0] = -Z[0];
return;
}
else if (Y[0] == 0)
{
__cpy (x, z, p);
return;
}
if (X[0] != Y[0])
{
if (__acr (x, y, p) > 0)
{
add_magnitudes (x, y, z, p);
Z[0] = X[0];
}
else
{
add_magnitudes (y, x, z, p);
Z[0] = -Y[0];
}
}
else
{
if ((n = __acr (x, y, p)) == 1)
{
sub_magnitudes (x, y, z, p);
Z[0] = X[0];
}
else if (n == -1)
{
sub_magnitudes (y, x, z, p);
Z[0] = -Y[0];
}
else
Z[0] = 0;
}
}
#ifndef NO__MUL
/* Multiply *X and *Y and store result in *Z. X and Y may overlap but not X
and Z or Y and Z. For P in [1, 2, 3], the exact result is truncated to P
digits. In case P > 3 the error is bounded by 1.001 ULP. */
void
SECTION
__mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
{
long i, j, k, ip, ip2;
long p2 = p;
mantissa_store_t zk;
const mp_no *a;
mantissa_store_t *diag;
/* Is z=0? */
if (__glibc_unlikely (X[0] * Y[0] == 0))
{
Z[0] = 0;
return;
}
/* We need not iterate through all X's and Y's since it's pointless to
multiply zeroes. Here, both are zero... */
for (ip2 = p2; ip2 > 0; ip2--)
if (X[ip2] != 0 || Y[ip2] != 0)
break;
a = X[ip2] != 0 ? y : x;
/* ... and here, at least one of them is still zero. */
for (ip = ip2; ip > 0; ip--)
if (a->d[ip] != 0)
break;
/* The product looks like this for p = 3 (as an example):
a1 a2 a3
x b1 b2 b3
-----------------------------
a1*b3 a2*b3 a3*b3
a1*b2 a2*b2 a3*b2
a1*b1 a2*b1 a3*b1
So our K needs to ideally be P*2, but we're limiting ourselves to P + 3
for P >= 3. We compute the above digits in two parts; the last P-1
digits and then the first P digits. The last P-1 digits are a sum of
products of the input digits from P to P-k where K is 0 for the least
significant digit and increases as we go towards the left. The product
term is of the form X[k]*X[P-k] as can be seen in the above example.
The first P digits are also a sum of products with the same product term,
except that the sum is from 1 to k. This is also evident from the above
example.
Another thing that becomes evident is that only the most significant
ip+ip2 digits of the result are non-zero, where ip and ip2 are the
'internal precision' of the input numbers, i.e. digits after ip and ip2
are all 0. */
k = (__glibc_unlikely (p2 < 3)) ? p2 + p2 : p2 + 3;
while (k > ip + ip2 + 1)
Z[k--] = 0;
zk = 0;
/* Precompute sums of diagonal elements so that we can directly use them
later. See the next comment to know we why need them. */
diag = alloca (k * sizeof (mantissa_store_t));
mantissa_store_t d = 0;
for (i = 1; i <= ip; i++)
{
d += X[i] * (mantissa_store_t) Y[i];
diag[i] = d;
}
while (i < k)
diag[i++] = d;
while (k > p2)
{
long lim = k / 2;
if (k % 2 == 0)
/* We want to add this only once, but since we subtract it in the sum
of products above, we add twice. */
zk += 2 * X[lim] * (mantissa_store_t) Y[lim];
for (i = k - p2, j = p2; i < j; i++, j--)
zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]);
zk -= diag[k - 1];
DIV_RADIX (zk, Z[k]);
k--;
}
/* The real deal. Mantissa digit Z[k] is the sum of all X[i] * Y[j] where i
goes from 1 -> k - 1 and j goes the same range in reverse. To reduce the
number of multiplications, we halve the range and if k is an even number,
add the diagonal element X[k/2]Y[k/2]. Through the half range, we compute
X[i] * Y[j] as (X[i] + X[j]) * (Y[i] + Y[j]) - X[i] * Y[i] - X[j] * Y[j].
This reduction tells us that we're summing two things, the first term
through the half range and the negative of the sum of the product of all
terms of X and Y in the full range. i.e.
SUM(X[i] * Y[i]) for k terms. This is precalculated above for each k in
a single loop so that it completes in O(n) time and can hence be directly
used in the loop below. */
while (k > 1)
{
long lim = k / 2;
if (k % 2 == 0)
/* We want to add this only once, but since we subtract it in the sum
of products above, we add twice. */
zk += 2 * X[lim] * (mantissa_store_t) Y[lim];
for (i = 1, j = k - 1; i < j; i++, j--)
zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]);
zk -= diag[k - 1];
DIV_RADIX (zk, Z[k]);
k--;
}
Z[k] = zk;
/* Get the exponent sum into an intermediate variable. This is a subtle
optimization, where given enough registers, all operations on the exponent
happen in registers and the result is written out only once into EZ. */
int e = EX + EY;
/* Is there a carry beyond the most significant digit? */
if (__glibc_unlikely (Z[1] == 0))
{
for (i = 1; i <= p2; i++)
Z[i] = Z[i + 1];
e--;
}
EZ = e;
Z[0] = X[0] * Y[0];
}
#endif
#ifndef NO__SQR
/* Square *X and store result in *Y. X and Y may not overlap. For P in
[1, 2, 3], the exact result is truncated to P digits. In case P > 3 the
error is bounded by 1.001 ULP. This is a faster special case of
multiplication. */
void
SECTION
__sqr (const mp_no *x, mp_no *y, int p)
{
long i, j, k, ip;
mantissa_store_t yk;
/* Is z=0? */
if (__glibc_unlikely (X[0] == 0))
{
Y[0] = 0;
return;
}
/* We need not iterate through all X's since it's pointless to
multiply zeroes. */
for (ip = p; ip > 0; ip--)
if (X[ip] != 0)
break;
k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
while (k > 2 * ip + 1)
Y[k--] = 0;
yk = 0;
while (k > p)
{
mantissa_store_t yk2 = 0;
long lim = k / 2;
if (k % 2 == 0)
yk += X[lim] * (mantissa_store_t) X[lim];
/* In __mul, this loop (and the one within the next while loop) run
between a range to calculate the mantissa as follows:
Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1]
+ X[n] * Y[k]
For X == Y, we can get away with summing halfway and doubling the
result. For cases where the range size is even, the mid-point needs
to be added separately (above). */
for (i = k - p, j = p; i < j; i++, j--)
yk2 += X[i] * (mantissa_store_t) X[j];
yk += 2 * yk2;
DIV_RADIX (yk, Y[k]);
k--;
}
while (k > 1)
{
mantissa_store_t yk2 = 0;
long lim = k / 2;
if (k % 2 == 0)
yk += X[lim] * (mantissa_store_t) X[lim];
/* Likewise for this loop. */
for (i = 1, j = k - 1; i < j; i++, j--)
yk2 += X[i] * (mantissa_store_t) X[j];
yk += 2 * yk2;
DIV_RADIX (yk, Y[k]);
k--;
}
Y[k] = yk;
/* Squares are always positive. */
Y[0] = 1;
/* Get the exponent sum into an intermediate variable. This is a subtle
optimization, where given enough registers, all operations on the exponent
happen in registers and the result is written out only once into EZ. */
int e = EX * 2;
/* Is there a carry beyond the most significant digit? */
if (__glibc_unlikely (Y[1] == 0))
{
for (i = 1; i <= p; i++)
Y[i] = Y[i + 1];
e--;
}
EY = e;
}
#endif
/* Invert *X and store in *Y. Relative error bound:
- For P = 2: 1.001 * R ^ (1 - P)
- For P = 3: 1.063 * R ^ (1 - P)
- For P > 3: 2.001 * R ^ (1 - P)
*X = 0 is not permissible. */
static void
SECTION
__inv (const mp_no *x, mp_no *y, int p)
{
long i;
double t;
mp_no z, w;
static const int np1[] =
{ 0, 0, 0, 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
};
__cpy (x, &z, p);
z.e = 0;
__mp_dbl (&z, &t, p);
t = 1 / t;
__dbl_mp (t, y, p);
EY -= EX;
for (i = 0; i < np1[p]; i++)
{
__cpy (y, &w, p);
__mul (x, &w, y, p);
__sub (&mptwo, y, &z, p);
__mul (&w, &z, y, p);
}
}
/* Divide *X by *Y and store result in *Z. X and Y may overlap but not X and Z
or Y and Z. Relative error bound:
- For P = 2: 2.001 * R ^ (1 - P)
- For P = 3: 2.063 * R ^ (1 - P)
- For P > 3: 3.001 * R ^ (1 - P)
*X = 0 is not permissible. */
void
SECTION
__dvd (const mp_no *x, const mp_no *y, mp_no *z, int p)
{
mp_no w;
if (X[0] == 0)
Z[0] = 0;
else
{
__inv (y, &w, p);
__mul (x, &w, z, p);
}
}