powerpc: Remove power4 mpa optimization

This patch removes the POWER4 optimized mpa optimization used currently
on all powerpc targets.  In fact for newer chips, GCC generates *worse*
code than generic implementation as below.  One possibilty would to
add ifunc variants for the mpa routines (as x86_64), but it will add
complexity only for older chips (and one would need to check if
power5, power5+, and power6 do benefict from this optimization),
and only for specific implementation (since most used one such
as sin, cos, exp, pow where optimized to avoid calling the slow
multiprecision path).

* POWER9 patched
$ ./testrun.sh benchtests/bench-atan
  "atan": {
   "": {
    "duration": 5.12565e+09,
    "iterations": 1.552e+08,
    "max": 100.552,
    "min": 7.799,
    "mean": 33.0261
   },
   "144bits": {
    "duration": 5.12745e+09,
    "iterations": 825000,
    "max": 7517.17,
    "min": 6186.3,
    "mean": 6215.09
   }
  }
$ ./testrun.sh benchtests/bench-acos
  "acos": {
   "": {
    "duration": 5.21741e+09,
    "iterations": 1.269e+08,
    "max": 191.738,
    "min": 7.931,
    "mean": 41.1144
   },
   "slow": {
    "duration": 5.25999e+09,
    "iterations": 198000,
    "max": 26681.7,
    "min": 26463.6,
    "mean": 26565.6
   }
  }

* POWER9 master
$ ./testrun.sh benchtests/bench-atan
  "atan": {
   "": {
    "duration": 5.12815e+09,
    "iterations": 1.552e+08,
    "max": 134.788,
    "min": 7.803,
    "mean": 33.0422
   },
   "144bits": {
    "duration": 5.1209e+09,
    "iterations": 447000,
    "max": 11615.8,
    "min": 11301.8,
    "mean": 11456.2
   }
  }
$ ./testrun.sh benchtests/bench-acos
  "acos": {
   "": {
    "duration": 5.22272e+09,
    "iterations": 1.269e+08,
    "max": 115.981,
    "min": 7.931,
    "mean": 41.1562
   },
   "slow": {
    "duration": 5.28723e+09,
    "iterations": 96000,
    "max": 55434.1,
    "min": 54820.6,
    "mean": 55075.3
   }
  }

* POWER8 patched
$ taskset -c 16 ./testrun.sh benchtests/bench-acos
  "acos": {
   "": {
    "duration": 5.16398e+09,
    "iterations": 9.99e+07,
    "max": 174.408,
    "min": 8.645,
    "mean": 51.6915
   },
   "slow": {
    "duration": 5.16982e+09,
    "iterations": 96000,
    "max": 54830.5,
    "min": 53703.8,
    "mean": 53852.3
   }
  }
* POWER8 master
$ taskset -c 16 ./testrun.sh benchtests/bench-acos
  "acos": {
   "": {
    "duration": 5.17019e+09,
    "iterations": 9.99e+07,
    "max": 186.127,
    "min": 8.633,
    "mean": 51.7537
   },
   "slow": {
    "duration": 5.34225e+09,
    "iterations": 90000,
    "max": 60353.2,
    "min": 59155.3,
    "mean": 59358.4
   }
  }

* POWER7 patched
$ taskset -c 16 benchtests/bench-asin
  "asin": {
   "": {
    "duration": 5.15559e+09,
    "iterations": 6.5e+07,
    "max": 193.335,
    "min": 12.227,
    "mean": 79.3168
   },
   "slow": {
    "duration": 5.20538e+09,
    "iterations": 80000,
    "max": 65705.2,
    "min": 64299.4,
    "mean": 65067.3
   }
  }
* POWER7 master
$ taskset -c 16 benchtests/bench-asin
  "asin": {
   "": {
    "duration": 5.15446e+09,
    "iterations": 6.5e+07,
    "max": 184.575,
    "min": 12.226,
    "mean": 79.2994
   },
   "slow": {
    "duration": 5.20616e+09,
    "iterations": 80000,
    "max": 65705.1,
    "min": 64336.6,
    "mean": 65076.9
   }
  }

Checked on powerpc-linux-gnu (built without --with-cpu, with
--with-cpu=power4 and with --with-cpu=power5+ and --disable-multi-arch),
powerpc64-linux-gnu (built without --with-cp and with --with-cpu=power5+
and --disable-multi-arch).

	* sysdeps/powerpc/power4/fpu/Makefile: Remove file.
	* sysdeps/powerpc/power4/fpu/mpa-arch.h: Likewise.
	* sysdeps/powerpc/power4/fpu/mpa.c: Likewise.

Reviewed-by: Gabriel F. T. Gomes <gabriel@inconstante.eti.br>
This commit is contained in:
Adhemerval Zanella 2019-03-20 15:34:10 +00:00
parent f492fc99f4
commit c4c0848bbb
4 changed files with 6 additions and 275 deletions

View File

@ -1,3 +1,9 @@
2019-04-29 Adhemerval Zanella <adhemerval.zanella@linaro.org>
* sysdeps/powerpc/power4/fpu/Makefile: Remove file.
* sysdeps/powerpc/power4/fpu/mpa-arch.h: Likewise.
* sysdeps/powerpc/power4/fpu/mpa.c: Likewise.
2019-04-29 Yann Droneaud <ydroneaud@opteya.com>
* sysdeps/pthread/semaphore.h (sem_init): Add __nonnull attribute.

View File

@ -1,5 +0,0 @@
# Makefile fragment for POWER4/5/5+ with FPU.
ifeq ($(subdir),math)
CFLAGS-mpa.c += --param max-unroll-times=4 -funroll-loops -fpeel-loops
endif

View File

@ -1,56 +0,0 @@
/* Overridable constants and operations.
Copyright (C) 2013-2019 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, see <http://www.gnu.org/licenses/>. */
typedef double mantissa_t;
typedef double mantissa_store_t;
#define TWOPOW(i) (0x1.0p##i)
#define RADIX TWOPOW (24) /* 2^24 */
#define CUTTER TWOPOW (76) /* 2^76 */
#define RADIXI 0x1.0p-24 /* 2^-24 */
#define TWO52 TWOPOW (52) /* 2^52 */
/* Divide D by RADIX and put the remainder in R. */
#define DIV_RADIX(d,r) \
({ \
double u = ((d) + CUTTER) - CUTTER; \
if (u > (d)) \
u -= RADIX; \
r = (d) - u; \
(d) = u * RADIXI; \
})
/* Put the integer component of a double X in R and retain the fraction in
X. */
#define INTEGER_OF(x, r) \
({ \
double u = ((x) + TWO52) - TWO52; \
if (u > (x)) \
u -= 1; \
(r) = u; \
(x) -= u; \
})
/* Align IN down to a multiple of F, where F is a power of two. */
#define ALIGN_DOWN_TO(in, f) \
({ \
double factor = f * TWO52; \
double u = (in + factor) - factor; \
if (u > in) \
u -= f; \
u; \
})

View File

@ -1,214 +0,0 @@
/*
* IBM Accurate Mathematical Library
* written by International Business Machines Corp.
* Copyright (C) 2001-2019 Free Software Foundation, Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
/* Define __mul and __sqr and use the rest from generic code. */
#define NO__MUL
#define NO__SQR
#include <sysdeps/ieee754/dbl-64/mpa.c>
/* Multiply *X and *Y and store result in *Z. X and Y may overlap but not X
and Z or Y and Z. For P in [1, 2, 3], the exact result is truncated to P
digits. In case P > 3 the error is bounded by 1.001 ULP. */
void
__mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
{
long i, i1, i2, j, k, k2;
long p2 = p;
double u, zk, zk2;
/* Is z=0? */
if (__glibc_unlikely (X[0] * Y[0] == 0))
{
Z[0] = 0;
return;
}
/* Multiply, add and carry */
k2 = (p2 < 3) ? p2 + p2 : p2 + 3;
zk = Z[k2] = 0;
for (k = k2; k > 1;)
{
if (k > p2)
{
i1 = k - p2;
i2 = p2 + 1;
}
else
{
i1 = 1;
i2 = k;
}
#if 1
/* Rearrange this inner loop to allow the fmadd instructions to be
independent and execute in parallel on processors that have
dual symmetrical FP pipelines. */
if (i1 < (i2 - 1))
{
/* Make sure we have at least 2 iterations. */
if (((i2 - i1) & 1L) == 1L)
{
/* Handle the odd iterations case. */
zk2 = x->d[i2 - 1] * y->d[i1];
}
else
zk2 = 0.0;
/* Do two multiply/adds per loop iteration, using independent
accumulators; zk and zk2. */
for (i = i1, j = i2 - 1; i < i2 - 1; i += 2, j -= 2)
{
zk += x->d[i] * y->d[j];
zk2 += x->d[i + 1] * y->d[j - 1];
}
zk += zk2; /* Final sum. */
}
else
{
/* Special case when iterations is 1. */
zk += x->d[i1] * y->d[i1];
}
#else
/* The original code. */
for (i = i1, j = i2 - 1; i < i2; i++, j--)
zk += X[i] * Y[j];
#endif
u = (zk + CUTTER) - CUTTER;
if (u > zk)
u -= RADIX;
Z[k] = zk - u;
zk = u * RADIXI;
--k;
}
Z[k] = zk;
int e = EX + EY;
/* Is there a carry beyond the most significant digit? */
if (Z[1] == 0)
{
for (i = 1; i <= p2; i++)
Z[i] = Z[i + 1];
e--;
}
EZ = e;
Z[0] = X[0] * Y[0];
}
/* Square *X and store result in *Y. X and Y may not overlap. For P in
[1, 2, 3], the exact result is truncated to P digits. In case P > 3 the
error is bounded by 1.001 ULP. This is a faster special case of
multiplication. */
void
__sqr (const mp_no *x, mp_no *y, int p)
{
long i, j, k, ip;
double u, yk;
/* Is z=0? */
if (__glibc_unlikely (X[0] == 0))
{
Y[0] = 0;
return;
}
/* We need not iterate through all X's since it's pointless to
multiply zeroes. */
for (ip = p; ip > 0; ip--)
if (X[ip] != 0)
break;
k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
while (k > 2 * ip + 1)
Y[k--] = 0;
yk = 0;
while (k > p)
{
double yk2 = 0.0;
long lim = k / 2;
if (k % 2 == 0)
{
yk += X[lim] * X[lim];
lim--;
}
/* In __mul, this loop (and the one within the next while loop) run
between a range to calculate the mantissa as follows:
Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1]
+ X[n] * Y[k]
For X == Y, we can get away with summing halfway and doubling the
result. For cases where the range size is even, the mid-point needs
to be added separately (above). */
for (i = k - p, j = p; i <= lim; i++, j--)
yk2 += X[i] * X[j];
yk += 2.0 * yk2;
u = (yk + CUTTER) - CUTTER;
if (u > yk)
u -= RADIX;
Y[k--] = yk - u;
yk = u * RADIXI;
}
while (k > 1)
{
double yk2 = 0.0;
long lim = k / 2;
if (k % 2 == 0)
{
yk += X[lim] * X[lim];
lim--;
}
/* Likewise for this loop. */
for (i = 1, j = k - 1; i <= lim; i++, j--)
yk2 += X[i] * X[j];
yk += 2.0 * yk2;
u = (yk + CUTTER) - CUTTER;
if (u > yk)
u -= RADIX;
Y[k--] = yk - u;
yk = u * RADIXI;
}
Y[k] = yk;
/* Squares are always positive. */
Y[0] = 1.0;
int e = EX * 2;
/* Is there a carry beyond the most significant digit? */
if (__glibc_unlikely (Y[1] == 0))
{
for (i = 1; i <= p; i++)
Y[i] = Y[i + 1];
e--;
}
EY = e;
}