Implement fma in soft-fp.

2024-11-21 20:40:05 +00:00 · 2013-07-02 14:55:32 +00:00 · 2013-07-02 14:55:32 +00:00 · 77f01ab5d1
commit 77f01ab5d1
parent 1413c693d3
21 changed files with 681 additions and 109 deletions
--- a/51
+++ b/51
@ -1,3 +1,54 @@
 2013-07-02  Joseph Myers  <joseph@codesourcery.com>
 	[BZ #13304]
 	* soft-fp/op-common.h (_FP_FMA): New macro.
 	* soft-fp/op-1.h (_FP_FRAC_HIGHBIT_DW_1): New macro.
 	(_FP_MUL_MEAT_DW_1_imm): Likewise.  Split out of ...
 	(_FP_MUL_MEAT_1_imm): ... here.
 	(_FP_MUL_MEAT_DW_1_wide): New macro.  Split out of ...
 	(_FP_MUL_MEAT_1_wide): ... here.
 	(_FP_MUL_MEAT_DW_1_hard): Likewise.  Split out of ...
 	(_FP_MUL_MEAT_1_hard): ... here.
 	* soft-fp/op-2.h (_FP_FRAC_HIGHBIT_DW_2): New macro.
 	(_FP_MUL_MEAT_DW_2_wide): Likewise.  Split out of ...
 	(_FP_MUL_MEAT_2_wide): ... here.
 	(_FP_MUL_MEAT_DW_2_wide_3mul): New macro.  Split out of ...
 	(_FP_MUL_MEAT_2_wide_3mul): ... here.
 	(_FP_MUL_MEAT_DW_2_gmp): New macro.  Split out of ...
 	(_FP_MUL_MEAT_2_gmp): ... here.
 	* soft-fp/op-4.h (_FP_FRAC_HIGHBIT_DW_4): New macro.
 	(_FP_MUL_MEAT_DW_4_wide): Likewise.  Split out of ...
 	(_FP_MUL_MEAT_4_wide): ... here.
 	(_FP_MUL_MEAT_DW_4_gmp): New macro.  Split out of ...
 	(_FP_MUL_MEAT_4_gmp): ... here.
 	* soft-fp/single.h (_FP_FRACTBITS_DW_S): New macro.
 	(_FP_WFRACBITS_DW_S): Likewise.
 	(_FP_WFRACXBITS_DW_S): Likewise.
 	(_FP_HIGHBIT_DW_S): Likewise.
 	(FP_FMA_S): Likewise.
 	(_FP_FRAC_HIGH_DW_S): Likewise.
 	* soft-fp/double.h (_FP_FRACTBITS_DW_D): New macro.
 	(_FP_WFRACBITS_DW_D): Likewise.
 	(_FP_WFRACXBITS_DW_D): Likewise.
 	(_FP_HIGHBIT_DW_D): Likewise.
 	(FP_FMA_D): Likewise.
 	(_FP_FRAC_HIGH_DW_D): Likewise.
 	* soft-fp/extended.h (_FP_FRACTBITS_DW_E): New macro.
 	(_FP_WFRACBITS_DW_E): Likewise.
 	(_FP_WFRACXBITS_DW_E): Likewise.
 	(_FP_HIGHBIT_DW_E): Likewise.
 	(FP_FMA_E): Likewise.
 	(_FP_FRAC_HIGH_DW_E): Likewise.
 	* soft-fp/quad.h (_FP_FRACTBITS_DW_Q): New macro.
 	(_FP_WFRACBITS_DW_Q): Likewise.
 	(_FP_WFRACXBITS_DW_Q): Likewise.
 	(_FP_HIGHBIT_DW_Q): Likewise.
 	(FP_FMA_Q): Likewise.
 	(_FP_FRAC_HIGH_DW_Q): Likewise.
 	* soft-fp/fmasf4.c: New file.
 	* soft-fp/fmadf4.c: Likewise.
 	* soft-fp/fmatf4.c: Likewise.
 2013-06-28  Liubov Dmitrieva  <liubov.dmitrieva@intel.com>
 	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): Set
--- a/ports/ChangeLog.mips
+++ b/ports/ChangeLog.mips
@ -1,3 +1,21 @@
 2013-07-02  Joseph Myers  <joseph@codesourcery.com>
 	[BZ #13304]
 	* sysdeps/mips/ieee754/s_fma.c: New file.
 	* sysdeps/mips/ieee754/s_fmaf.c: Likewise.
 	* sysdeps/mips/ieee754/s_fmal.c: Likewise.
 	* sysdeps/mips/mips32/Implies: Add mips/soft-fp.
 	* sysdeps/mips/mips64/n32/s_fma.c: Remove file.
 	* sysdeps/mips/mips64/n64/s_fma.c: Likewise.
 	* sysdeps/mips/mips64/soft-fp/sfp-machine.h (_FP_MUL_MEAT_DW_S):
 	New macro.
 	(_FP_MUL_MEAT_DW_D): Likewise.
 	(_FP_MUL_MEAT_DW_Q): Likewise.
 	* sysdeps/mips/soft-fp/sfp-machine.h (_FP_MUL_MEAT_DW_S): New
 	macro.
 	(_FP_MUL_MEAT_DW_D): Likewise.
 	(_FP_MUL_MEAT_DW_Q): Likewise.
 2013-06-28  Ryan S. Arnold  <rsa@linux.vnet.ibm.com>
 	* sysdeps/mips/dl-procinfo.h (_dl_procinfo): Add TYPE parameter
--- a/ports/sysdeps/mips/ieee754/s_fma.c
+++ b/ports/sysdeps/mips/ieee754/s_fma.c
@ -0,0 +1,5 @@
 #ifdef __mips_hard_float
 # include <sysdeps/ieee754/dbl-64/s_fma.c>
 #else
 # include <soft-fp/fmadf4.c>
 #endif
--- a/ports/sysdeps/mips/ieee754/s_fmaf.c
+++ b/ports/sysdeps/mips/ieee754/s_fmaf.c
@ -0,0 +1,5 @@
 #ifdef __mips_hard_float
 # include <sysdeps/ieee754/dbl-64/s_fmaf.c>
 #else
 # include <soft-fp/fmasf4.c>
 #endif
--- a/ports/sysdeps/mips/ieee754/s_fmal.c
+++ b/ports/sysdeps/mips/ieee754/s_fmal.c
@ -0,0 +1,7 @@
 #include <sgidefs.h>
 #if _MIPS_SIM == _ABIO32
 # error "long double fma being compiled for o32 ABI"
 #endif
 #include <soft-fp/fmatf4.c>
--- a/ports/sysdeps/mips/mips32/Implies
+++ b/ports/sysdeps/mips/mips32/Implies
@ -1,3 +1,4 @@
 mips/ieee754
 mips/soft-fp
 mips
 wordsize-32
--- a/ports/sysdeps/mips/mips64/n32/s_fma.c
+++ b/ports/sysdeps/mips/mips64/n32/s_fma.c
@ -1,6 +0,0 @@
 /* MIPS long double is implemented in software by fp-bit (as of GCC
   4.7) without support for exceptions or rounding modes, so the fma
   implementation in terms of long double is slow and will not produce
   correctly rounding results.  */
 #include <sysdeps/ieee754/dbl-64/s_fma.c>
--- a/ports/sysdeps/mips/mips64/n64/s_fma.c
+++ b/ports/sysdeps/mips/mips64/n64/s_fma.c
@ -1,6 +0,0 @@
 /* MIPS long double is implemented in software by fp-bit (as of GCC
   4.7) without support for exceptions or rounding modes, so the fma
   implementation in terms of long double is slow and will not produce
   correctly rounding results.  */
 #include <sysdeps/ieee754/dbl-64/s_fma.c>
--- a/ports/sysdeps/mips/mips64/soft-fp/sfp-machine.h
+++ b/ports/sysdeps/mips/mips64/soft-fp/sfp-machine.h
@ -13,6 +13,13 @@
 #define _FP_MUL_MEAT_Q(R,X,Y)					\
  _FP_MUL_MEAT_2_wide_3mul(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm)
 #define _FP_MUL_MEAT_DW_S(R,X,Y)				\
  _FP_MUL_MEAT_DW_1_imm(_FP_WFRACBITS_S,R,X,Y)
 #define _FP_MUL_MEAT_DW_D(R,X,Y)				\
  _FP_MUL_MEAT_DW_1_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm)
 #define _FP_MUL_MEAT_DW_Q(R,X,Y)				\
  _FP_MUL_MEAT_DW_2_wide_3mul(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm)
 #define _FP_DIV_MEAT_S(R,X,Y)	_FP_DIV_MEAT_1_imm(S,R,X,Y,_FP_DIV_HELP_imm)
 #define _FP_DIV_MEAT_D(R,X,Y)	_FP_DIV_MEAT_1_udiv_norm(D,R,X,Y)
 #define _FP_DIV_MEAT_Q(R,X,Y)	_FP_DIV_MEAT_2_udiv(Q,R,X,Y)
--- a/ports/sysdeps/mips/soft-fp/sfp-machine.h
+++ b/ports/sysdeps/mips/soft-fp/sfp-machine.h
@ -10,6 +10,13 @@
 #define _FP_MUL_MEAT_Q(R,X,Y)				\
  _FP_MUL_MEAT_4_wide(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm)
 #define _FP_MUL_MEAT_DW_S(R,X,Y)				\
  _FP_MUL_MEAT_DW_1_wide(_FP_WFRACBITS_S,R,X,Y,umul_ppmm)
 #define _FP_MUL_MEAT_DW_D(R,X,Y)				\
  _FP_MUL_MEAT_DW_2_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm)
 #define _FP_MUL_MEAT_DW_Q(R,X,Y)				\
  _FP_MUL_MEAT_DW_4_wide(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm)
 #define _FP_DIV_MEAT_S(R,X,Y)	_FP_DIV_MEAT_1_udiv_norm(S,R,X,Y)
 #define _FP_DIV_MEAT_D(R,X,Y)	_FP_DIV_MEAT_2_udiv(D,R,X,Y)
 #define _FP_DIV_MEAT_Q(R,X,Y)	_FP_DIV_MEAT_4_udiv(Q,R,X,Y)
--- a/soft-fp/double.h
+++ b/soft-fp/double.h
@ -36,8 +36,10 @@
 #if _FP_W_TYPE_SIZE < 64
 #define _FP_FRACTBITS_D		(2 * _FP_W_TYPE_SIZE)
 #define _FP_FRACTBITS_DW_D	(4 * _FP_W_TYPE_SIZE)
 #else
 #define _FP_FRACTBITS_D		_FP_W_TYPE_SIZE
 #define _FP_FRACTBITS_DW_D	(2 * _FP_W_TYPE_SIZE)
 #endif
 #define _FP_FRACBITS_D		53
@ -59,6 +61,11 @@
 #define _FP_OVERFLOW_D		\
 	((_FP_W_TYPE)1 << _FP_WFRACBITS_D % _FP_W_TYPE_SIZE)
 #define _FP_WFRACBITS_DW_D	(2 * _FP_WFRACBITS_D)
 #define _FP_WFRACXBITS_DW_D	(_FP_FRACTBITS_DW_D - _FP_WFRACBITS_DW_D)
 #define _FP_HIGHBIT_DW_D	\
  ((_FP_W_TYPE)1 << (_FP_WFRACBITS_DW_D - 1) % _FP_W_TYPE_SIZE)
 typedef float DFtype __attribute__((mode(DF)));
 #if _FP_W_TYPE_SIZE < 64
@ -149,6 +156,7 @@ union _FP_UNION_D
 #define FP_DIV_D(R,X,Y)			_FP_DIV(D,2,R,X,Y)
 #define FP_SQRT_D(R,X)			_FP_SQRT(D,2,R,X)
 #define _FP_SQRT_MEAT_D(R,S,T,X,Q)	_FP_SQRT_MEAT_2(R,S,T,X,Q)
 #define FP_FMA_D(R,X,Y,Z)		_FP_FMA(D,2,4,R,X,Y,Z)
 #define FP_CMP_D(r,X,Y,un)	_FP_CMP(D,2,r,X,Y,un)
 #define FP_CMP_EQ_D(r,X,Y)	_FP_CMP_EQ(D,2,r,X,Y)
@ -160,6 +168,8 @@ union _FP_UNION_D
 #define _FP_FRAC_HIGH_D(X)	_FP_FRAC_HIGH_2(X)
 #define _FP_FRAC_HIGH_RAW_D(X)	_FP_FRAC_HIGH_2(X)
 #define _FP_FRAC_HIGH_DW_D(X)	_FP_FRAC_HIGH_4(X)
 #else
 union _FP_UNION_D
@ -246,6 +256,7 @@ union _FP_UNION_D
 #define FP_DIV_D(R,X,Y)			_FP_DIV(D,1,R,X,Y)
 #define FP_SQRT_D(R,X)			_FP_SQRT(D,1,R,X)
 #define _FP_SQRT_MEAT_D(R,S,T,X,Q)	_FP_SQRT_MEAT_1(R,S,T,X,Q)
 #define FP_FMA_D(R,X,Y,Z)		_FP_FMA(D,1,2,R,X,Y,Z)
 /* The implementation of _FP_MUL_D and _FP_DIV_D should be chosen by
   the target machine.  */
@ -260,4 +271,6 @@ union _FP_UNION_D
 #define _FP_FRAC_HIGH_D(X)	_FP_FRAC_HIGH_1(X)
 #define _FP_FRAC_HIGH_RAW_D(X)	_FP_FRAC_HIGH_1(X)
 #define _FP_FRAC_HIGH_DW_D(X)	_FP_FRAC_HIGH_2(X)
 #endif /* W_TYPE_SIZE < 64 */
--- a/soft-fp/extended.h
+++ b/soft-fp/extended.h
@ -33,8 +33,10 @@
 #if _FP_W_TYPE_SIZE < 64
 #define _FP_FRACTBITS_E         (4*_FP_W_TYPE_SIZE)
 #define _FP_FRACTBITS_DW_E	(8*_FP_W_TYPE_SIZE)
 #else
 #define _FP_FRACTBITS_E		(2*_FP_W_TYPE_SIZE)
 #define _FP_FRACTBITS_DW_E	(4*_FP_W_TYPE_SIZE)
 #endif
 #define _FP_FRACBITS_E		64
@ -56,6 +58,11 @@
 #define _FP_OVERFLOW_E		\
 	((_FP_W_TYPE)1 << (_FP_WFRACBITS_E % _FP_W_TYPE_SIZE))
 #define _FP_WFRACBITS_DW_E	(2 * _FP_WFRACBITS_E)
 #define _FP_WFRACXBITS_DW_E	(_FP_FRACTBITS_DW_E - _FP_WFRACBITS_DW_E)
 #define _FP_HIGHBIT_DW_E	\
  ((_FP_W_TYPE)1 << (_FP_WFRACBITS_DW_E - 1) % _FP_W_TYPE_SIZE)
 typedef float XFtype __attribute__((mode(XF)));
 #if _FP_W_TYPE_SIZE < 64
@ -192,6 +199,7 @@ union _FP_UNION_E
 #define FP_MUL_E(R,X,Y)		_FP_MUL(E,4,R,X,Y)
 #define FP_DIV_E(R,X,Y)		_FP_DIV(E,4,R,X,Y)
 #define FP_SQRT_E(R,X)		_FP_SQRT(E,4,R,X)
 #define FP_FMA_E(R,X,Y,Z)	_FP_FMA(E,4,8,R,X,Y,Z)
 /*
 * Square root algorithms:
@ -258,6 +266,8 @@ union _FP_UNION_E
 #define _FP_FRAC_HIGH_E(X)	(X##_f[2])
 #define _FP_FRAC_HIGH_RAW_E(X)	(X##_f[1])
 #define _FP_FRAC_HIGH_DW_E(X)	(X##_f[4])
 #else   /* not _FP_W_TYPE_SIZE < 64 */
 union _FP_UNION_E
 {
@ -383,6 +393,7 @@ union _FP_UNION_E
 #define FP_MUL_E(R,X,Y)		_FP_MUL(E,2,R,X,Y)
 #define FP_DIV_E(R,X,Y)		_FP_DIV(E,2,R,X,Y)
 #define FP_SQRT_E(R,X)		_FP_SQRT(E,2,R,X)
 #define FP_FMA_E(R,X,Y,Z)	_FP_FMA(E,2,4,R,X,Y,Z)
 /*
 * Square root algorithms:
@ -427,4 +438,6 @@ union _FP_UNION_E
 #define _FP_FRAC_HIGH_E(X)	(X##_f1)
 #define _FP_FRAC_HIGH_RAW_E(X)	(X##_f0)
 #define _FP_FRAC_HIGH_DW_E(X)	(X##_f[2])
 #endif /* not _FP_W_TYPE_SIZE < 64 */
--- a/soft-fp/fmadf4.c
+++ b/soft-fp/fmadf4.c
@ -0,0 +1,56 @@
 /* Implement fma using soft-fp.
   Copyright (C) 2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   In addition to the permissions in the GNU Lesser General Public
   License, the Free Software Foundation gives you unlimited
   permission to link the compiled version of this file into
   combinations with other programs, and to distribute those
   combinations without any restriction coming from the use of this
   file.  (The Lesser General Public License restrictions do apply in
   other respects; for example, they cover modification of the file,
   and distribution when not linked into a combine executable.)
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
 #include <math.h>
 #include "soft-fp.h"
 #include "double.h"
 double
 __fma (double a, double b, double c)
 {
  FP_DECL_EX;
  FP_DECL_D(A); FP_DECL_D(B); FP_DECL_D(C); FP_DECL_D(R);
  double r;
  FP_INIT_ROUNDMODE;
  FP_UNPACK_D(A, a);
  FP_UNPACK_D(B, b);
  FP_UNPACK_D(C, c);
  FP_FMA_D(R, A, B, C);
  FP_PACK_D(r, R);
  FP_HANDLE_EXCEPTIONS;
  return r;
 }
 #ifndef __fma
 weak_alias (__fma, fma)
 #endif
 #ifdef NO_LONG_DOUBLE
 strong_alias (__fma, __fmal)
 weak_alias (__fmal, fmal)
 #endif
--- a/soft-fp/fmasf4.c
+++ b/soft-fp/fmasf4.c
@ -0,0 +1,51 @@
 /* Implement fmaf using soft-fp.
   Copyright (C) 2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   In addition to the permissions in the GNU Lesser General Public
   License, the Free Software Foundation gives you unlimited
   permission to link the compiled version of this file into
   combinations with other programs, and to distribute those
   combinations without any restriction coming from the use of this
   file.  (The Lesser General Public License restrictions do apply in
   other respects; for example, they cover modification of the file,
   and distribution when not linked into a combine executable.)
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
 #include <math.h>
 #include "soft-fp.h"
 #include "single.h"
 float
 __fmaf (float a, float b, float c)
 {
  FP_DECL_EX;
  FP_DECL_S(A); FP_DECL_S(B); FP_DECL_S(C); FP_DECL_S(R);
  float r;
  FP_INIT_ROUNDMODE;
  FP_UNPACK_S(A, a);
  FP_UNPACK_S(B, b);
  FP_UNPACK_S(C, c);
  FP_FMA_S(R, A, B, C);
  FP_PACK_S(r, R);
  FP_HANDLE_EXCEPTIONS;
  return r;
 }
 #ifndef __fmaf
 weak_alias (__fmaf, fmaf)
 #endif
--- a/soft-fp/fmatf4.c
+++ b/soft-fp/fmatf4.c
@ -0,0 +1,49 @@
 /* Implement fmal using soft-fp.
   Copyright (C) 2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   In addition to the permissions in the GNU Lesser General Public
   License, the Free Software Foundation gives you unlimited
   permission to link the compiled version of this file into
   combinations with other programs, and to distribute those
   combinations without any restriction coming from the use of this
   file.  (The Lesser General Public License restrictions do apply in
   other respects; for example, they cover modification of the file,
   and distribution when not linked into a combine executable.)
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
 #include <math.h>
 #include "soft-fp.h"
 #include "quad.h"
 long double
 __fmal (long double a, long double b, long double c)
 {
  FP_DECL_EX;
  FP_DECL_Q(A); FP_DECL_Q(B); FP_DECL_Q(C); FP_DECL_Q(R);
  long double r;
  FP_INIT_ROUNDMODE;
  FP_UNPACK_Q(A, a);
  FP_UNPACK_Q(B, b);
  FP_UNPACK_Q(C, c);
  FP_FMA_Q(R, A, B, C);
  FP_PACK_Q(r, R);
  FP_HANDLE_EXCEPTIONS;
  return r;
 }
 weak_alias (__fmal, fmal)
--- a/soft-fp/op-1.h
+++ b/soft-fp/op-1.h
@ -72,6 +72,7 @@ do {							\
 #define _FP_FRAC_ZEROP_1(X)	(X##_f == 0)
 #define _FP_FRAC_OVERP_1(fs,X)	(X##_f & _FP_OVERFLOW_##fs)
 #define _FP_FRAC_CLEAR_OVERP_1(fs,X)	(X##_f &= ~_FP_OVERFLOW_##fs)
 #define _FP_FRAC_HIGHBIT_DW_1(fs,X)	(X##_f & _FP_HIGHBIT_DW_##fs)
 #define _FP_FRAC_EQ_1(X, Y)	(X##_f == Y##_f)
 #define _FP_FRAC_GE_1(X, Y)	(X##_f >= Y##_f)
 #define _FP_FRAC_GT_1(X, Y)	(X##_f > Y##_f)
@ -137,9 +138,14 @@ do {							\
 /* Basic.  Assuming the host word size is >= 2*FRACBITS, we can do the
   multiplication immediately.  */
-#define _FP_MUL_MEAT_1_imm(wfracbits, R, X, Y)				\
+#define _FP_MUL_MEAT_DW_1_imm(wfracbits, R, X, Y)			\
  do {									\
    R##_f = X##_f * Y##_f;						\
  } while (0)
 #define _FP_MUL_MEAT_1_imm(wfracbits, R, X, Y)				\
  do {									\
    _FP_MUL_MEAT_DW_1_imm(wfracbits, R, X, Y);				\
    /* Normalize since we know where the msb of the multiplicands	\
       were (bit B), we know that the msb of the of the product is	\
       at either 2B or 2B-1.  */					\
@ -148,10 +154,15 @@ do {							\
 /* Given a 1W * 1W => 2W primitive, do the extended multiplication.  */
 #define _FP_MUL_MEAT_DW_1_wide(wfracbits, R, X, Y, doit)		\
  do {									\
    doit(R##_f1, R##_f0, X##_f, Y##_f);					\
  } while (0)
 #define _FP_MUL_MEAT_1_wide(wfracbits, R, X, Y, doit)			\
  do {									\
-    _FP_W_TYPE _Z_f0, _Z_f1;						\
+    _FP_FRAC_DECL_2(_Z);						\
-    doit(_Z_f1, _Z_f0, X##_f, Y##_f);					\
+    _FP_MUL_MEAT_DW_1_wide(wfracbits, _Z, X, Y, doit);			\
    /* Normalize since we know where the msb of the multiplicands	\
       were (bit B), we know that the msb of the of the product is	\
       at either 2B or 2B-1.  */					\
@ -161,9 +172,10 @@ do {							\
 /* Finally, a simple widening multiply algorithm.  What fun!  */
-#define _FP_MUL_MEAT_1_hard(wfracbits, R, X, Y)				\
+#define _FP_MUL_MEAT_DW_1_hard(wfracbits, R, X, Y)			\
  do {									\
-    _FP_W_TYPE _xh, _xl, _yh, _yl, _z_f0, _z_f1, _a_f0, _a_f1;		\
+    _FP_W_TYPE _xh, _xl, _yh, _yl;					\
    _FP_FRAC_DECL_2(_a);						\
 									\
    /* split the words in half */					\
    _xh = X##_f >> (_FP_W_TYPE_SIZE/2);					\
@ -172,17 +184,23 @@ do {							\
    _yl = Y##_f & (((_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE/2)) - 1);		\
 									\
    /* multiply the pieces */						\
-    _z_f0 = _xl * _yl;							\
+    R##_f0 = _xl * _yl;							\
    _a_f0 = _xh * _yl;							\
    _a_f1 = _xl * _yh;							\
-    _z_f1 = _xh * _yh;							\
+    R##_f1 = _xh * _yh;							\
 									\
    /* reassemble into two full words */				\
    if ((_a_f0 += _a_f1) < _a_f1)					\
-      _z_f1 += (_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE/2);			\
+      R##_f1 += (_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE/2);			\
    _a_f1 = _a_f0 >> (_FP_W_TYPE_SIZE/2);				\
    _a_f0 = _a_f0 << (_FP_W_TYPE_SIZE/2);				\
-    _FP_FRAC_ADD_2(_z, _z, _a);						\
+    _FP_FRAC_ADD_2(R, R, _a);						\
  } while (0)
 #define _FP_MUL_MEAT_1_hard(wfracbits, R, X, Y)				\
  do {									\
    _FP_FRAC_DECL_2(_z);						\
    _FP_MUL_MEAT_DW_1_hard(wfracbits, _z, X, Y);			\
 									\
    /* normalize */							\
    _FP_FRAC_SRS_2(_z, wfracbits - 1, 2*wfracbits);			\
--- a/soft-fp/op-2.h
+++ b/soft-fp/op-2.h
@ -134,6 +134,8 @@
 #define _FP_FRAC_ZEROP_2(X)	((X##_f1 | X##_f0) == 0)
 #define _FP_FRAC_OVERP_2(fs,X)	(_FP_FRAC_HIGH_##fs(X) & _FP_OVERFLOW_##fs)
 #define _FP_FRAC_CLEAR_OVERP_2(fs,X)	(_FP_FRAC_HIGH_##fs(X) &= ~_FP_OVERFLOW_##fs)
 #define _FP_FRAC_HIGHBIT_DW_2(fs,X)	\
  (_FP_FRAC_HIGH_DW_##fs(X) & _FP_HIGHBIT_DW_##fs)
 #define _FP_FRAC_EQ_2(X, Y)	(X##_f1 == Y##_f1 && X##_f0 == Y##_f0)
 #define _FP_FRAC_GT_2(X, Y)	\
  (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 > Y##_f0))
@ -257,23 +259,30 @@
 /* Given a 1W * 1W => 2W primitive, do the extended multiplication.  */
-#define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit)			\
+#define _FP_MUL_MEAT_DW_2_wide(wfracbits, R, X, Y, doit)		\
  do {									\
-    _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);	\
+    _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);				\
 									\
-    doit(_FP_FRAC_WORD_4(_z,1), _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0);	\
+    doit(_FP_FRAC_WORD_4(R,1), _FP_FRAC_WORD_4(R,0), X##_f0, Y##_f0);	\
    doit(_b_f1, _b_f0, X##_f0, Y##_f1);					\
    doit(_c_f1, _c_f0, X##_f1, Y##_f0);					\
-    doit(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2), X##_f1, Y##_f1);	\
+    doit(_FP_FRAC_WORD_4(R,3), _FP_FRAC_WORD_4(R,2), X##_f1, Y##_f1);	\
 									\
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
-		    _FP_FRAC_WORD_4(_z,1), 0, _b_f1, _b_f0,		\
+		    _FP_FRAC_WORD_4(R,1), 0, _b_f1, _b_f0,		\
-		    _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
+		    _FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
-		    _FP_FRAC_WORD_4(_z,1));				\
+		    _FP_FRAC_WORD_4(R,1));				\
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
-		    _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0,		\
+		    _FP_FRAC_WORD_4(R,1), 0, _c_f1, _c_f0,		\
-		    _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
+		    _FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
-		    _FP_FRAC_WORD_4(_z,1));				\
+		    _FP_FRAC_WORD_4(R,1));				\
  } while (0)
 #define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit)			\
  do {									\
    _FP_FRAC_DECL_4(_z);						\
 									\
    _FP_MUL_MEAT_DW_2_wide(wfracbits, _z, X, Y, doit);			\
 									\
    /* Normalize since we know where the msb of the multiplicands	\
       were (bit B), we know that the msb of the of the product is	\
@ -287,9 +296,9 @@
   Do only 3 multiplications instead of four. This one is for machines
   where multiplication is much more expensive than subtraction.  */
-#define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit)		\
+#define _FP_MUL_MEAT_DW_2_wide_3mul(wfracbits, R, X, Y, doit)		\
  do {									\
-    _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);	\
+    _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);				\
    _FP_W_TYPE _d;							\
    int _c1, _c2;							\
 									\
@ -297,27 +306,34 @@
    _c1 = _b_f0 < X##_f0;						\
    _b_f1 = Y##_f0 + Y##_f1;						\
    _c2 = _b_f1 < Y##_f0;						\
-    doit(_d, _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0);			\
+    doit(_d, _FP_FRAC_WORD_4(R,0), X##_f0, Y##_f0);			\
-    doit(_FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1), _b_f0, _b_f1);	\
+    doit(_FP_FRAC_WORD_4(R,2), _FP_FRAC_WORD_4(R,1), _b_f0, _b_f1);	\
    doit(_c_f1, _c_f0, X##_f1, Y##_f1);					\
 									\
    _b_f0 &= -_c2;							\
    _b_f1 &= -_c1;							\
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
-		    _FP_FRAC_WORD_4(_z,1), (_c1 & _c2), 0, _d,		\
+		    _FP_FRAC_WORD_4(R,1), (_c1 & _c2), 0, _d,		\
-		    0, _FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1));	\
+		    0, _FP_FRAC_WORD_4(R,2), _FP_FRAC_WORD_4(R,1));	\
-    __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
+    __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
 		     _b_f0);						\
-    __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
+    __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
 		     _b_f1);						\
-    __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
+    __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
-		    _FP_FRAC_WORD_4(_z,1),				\
+		    _FP_FRAC_WORD_4(R,1),				\
-		    0, _d, _FP_FRAC_WORD_4(_z,0));			\
+		    0, _d, _FP_FRAC_WORD_4(R,0));			\
-    __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
+    __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
-		    _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0);		\
+		    _FP_FRAC_WORD_4(R,1), 0, _c_f1, _c_f0);		\
-    __FP_FRAC_ADD_2(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2),	\
+    __FP_FRAC_ADD_2(_FP_FRAC_WORD_4(R,3), _FP_FRAC_WORD_4(R,2),		\
 		    _c_f1, _c_f0,					\
-		    _FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2));	\
+		    _FP_FRAC_WORD_4(R,3), _FP_FRAC_WORD_4(R,2));	\
  } while (0)
 #define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit)		\
  do {									\
    _FP_FRAC_DECL_4(_z);						\
 									\
    _FP_MUL_MEAT_DW_2_wide_3mul(wfracbits, _z, X, Y, doit);		\
 									\
    /* Normalize since we know where the msb of the multiplicands	\
       were (bit B), we know that the msb of the of the product is	\
@ -327,14 +343,20 @@
    R##_f1 = _FP_FRAC_WORD_4(_z,1);					\
  } while (0)
-#define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y)				\
+#define _FP_MUL_MEAT_DW_2_gmp(wfracbits, R, X, Y)			\
  do {									\
    _FP_FRAC_DECL_4(_z);						\
    _FP_W_TYPE _x[2], _y[2];						\
    _x[0] = X##_f0; _x[1] = X##_f1;					\
    _y[0] = Y##_f0; _y[1] = Y##_f1;					\
 									\
-    mpn_mul_n(_z_f, _x, _y, 2);						\
+    mpn_mul_n(R##_f, _x, _y, 2);					\
  } while (0)
 #define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y)				\
  do {									\
    _FP_FRAC_DECL_4(_z);						\
 									\
    _FP_MUL_MEAT_DW_2_gmp(wfracbits, _z, X, Y);				\
 									\
    /* Normalize since we know where the msb of the multiplicands	\
       were (bit B), we know that the msb of the of the product is	\
--- a/soft-fp/op-4.h
+++ b/soft-fp/op-4.h
@ -142,6 +142,8 @@
 #define _FP_FRAC_ZEROP_4(X)     ((X##_f[0] | X##_f[1] | X##_f[2] | X##_f[3]) == 0)
 #define _FP_FRAC_NEGP_4(X)      ((_FP_WS_TYPE)X##_f[3] < 0)
 #define _FP_FRAC_OVERP_4(fs,X)  (_FP_FRAC_HIGH_##fs(X) & _FP_OVERFLOW_##fs)
 #define _FP_FRAC_HIGHBIT_DW_4(fs,X)	\
  (_FP_FRAC_HIGH_DW_##fs(X) & _FP_HIGHBIT_DW_##fs)
 #define _FP_FRAC_CLEAR_OVERP_4(fs,X)  (_FP_FRAC_HIGH_##fs(X) &= ~_FP_OVERFLOW_##fs)
 #define _FP_FRAC_EQ_4(X,Y)				\
@ -246,81 +248,88 @@
 /* Given a 1W * 1W => 2W primitive, do the extended multiplication.  */
-#define _FP_MUL_MEAT_4_wide(wfracbits, R, X, Y, doit)			    \
+#define _FP_MUL_MEAT_DW_4_wide(wfracbits, R, X, Y, doit)		    \
  do {									    \
-    _FP_FRAC_DECL_8(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);	    \
+    _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);				    \
    _FP_FRAC_DECL_2(_d); _FP_FRAC_DECL_2(_e); _FP_FRAC_DECL_2(_f);	    \
 									    \
-    doit(_FP_FRAC_WORD_8(_z,1), _FP_FRAC_WORD_8(_z,0), X##_f[0], Y##_f[0]); \
+    doit(_FP_FRAC_WORD_8(R,1), _FP_FRAC_WORD_8(R,0), X##_f[0], Y##_f[0]);   \
    doit(_b_f1, _b_f0, X##_f[0], Y##_f[1]);				    \
    doit(_c_f1, _c_f0, X##_f[1], Y##_f[0]);				    \
    doit(_d_f1, _d_f0, X##_f[1], Y##_f[1]);				    \
    doit(_e_f1, _e_f0, X##_f[0], Y##_f[2]);				    \
    doit(_f_f1, _f_f0, X##_f[2], Y##_f[0]);				    \
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,3),_FP_FRAC_WORD_8(_z,2),	    \
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,3),_FP_FRAC_WORD_8(R,2),		    \
-		    _FP_FRAC_WORD_8(_z,1), 0,_b_f1,_b_f0,		    \
+		    _FP_FRAC_WORD_8(R,1), 0,_b_f1,_b_f0,		    \
-		    0,0,_FP_FRAC_WORD_8(_z,1));				    \
+		    0,0,_FP_FRAC_WORD_8(R,1));				    \
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,3),_FP_FRAC_WORD_8(_z,2),	    \
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,3),_FP_FRAC_WORD_8(R,2),		    \
-		    _FP_FRAC_WORD_8(_z,1), 0,_c_f1,_c_f0,		    \
+		    _FP_FRAC_WORD_8(R,1), 0,_c_f1,_c_f0,		    \
-		    _FP_FRAC_WORD_8(_z,3),_FP_FRAC_WORD_8(_z,2),	    \
+		    _FP_FRAC_WORD_8(R,3),_FP_FRAC_WORD_8(R,2),		    \
-		    _FP_FRAC_WORD_8(_z,1));				    \
+		    _FP_FRAC_WORD_8(R,1));				    \
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,4),_FP_FRAC_WORD_8(_z,3),	    \
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,4),_FP_FRAC_WORD_8(R,3),		    \
-		    _FP_FRAC_WORD_8(_z,2), 0,_d_f1,_d_f0,		    \
+		    _FP_FRAC_WORD_8(R,2), 0,_d_f1,_d_f0,		    \
-		    0,_FP_FRAC_WORD_8(_z,3),_FP_FRAC_WORD_8(_z,2));	    \
+		    0,_FP_FRAC_WORD_8(R,3),_FP_FRAC_WORD_8(R,2));	    \
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,4),_FP_FRAC_WORD_8(_z,3),	    \
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,4),_FP_FRAC_WORD_8(R,3),		    \
-		    _FP_FRAC_WORD_8(_z,2), 0,_e_f1,_e_f0,		    \
+		    _FP_FRAC_WORD_8(R,2), 0,_e_f1,_e_f0,		    \
-		    _FP_FRAC_WORD_8(_z,4),_FP_FRAC_WORD_8(_z,3),	    \
+		    _FP_FRAC_WORD_8(R,4),_FP_FRAC_WORD_8(R,3),		    \
-		    _FP_FRAC_WORD_8(_z,2));				    \
+		    _FP_FRAC_WORD_8(R,2));				    \
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,4),_FP_FRAC_WORD_8(_z,3),	    \
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,4),_FP_FRAC_WORD_8(R,3),		    \
-		    _FP_FRAC_WORD_8(_z,2), 0,_f_f1,_f_f0,		    \
+		    _FP_FRAC_WORD_8(R,2), 0,_f_f1,_f_f0,		    \
-		    _FP_FRAC_WORD_8(_z,4),_FP_FRAC_WORD_8(_z,3),	    \
+		    _FP_FRAC_WORD_8(R,4),_FP_FRAC_WORD_8(R,3),		    \
-		    _FP_FRAC_WORD_8(_z,2));				    \
+		    _FP_FRAC_WORD_8(R,2));				    \
    doit(_b_f1, _b_f0, X##_f[0], Y##_f[3]);				    \
    doit(_c_f1, _c_f0, X##_f[3], Y##_f[0]);				    \
    doit(_d_f1, _d_f0, X##_f[1], Y##_f[2]);				    \
    doit(_e_f1, _e_f0, X##_f[2], Y##_f[1]);				    \
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,5),_FP_FRAC_WORD_8(_z,4),	    \
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,5),_FP_FRAC_WORD_8(R,4),		    \
-		    _FP_FRAC_WORD_8(_z,3), 0,_b_f1,_b_f0,		    \
+		    _FP_FRAC_WORD_8(R,3), 0,_b_f1,_b_f0,		    \
-		    0,_FP_FRAC_WORD_8(_z,4),_FP_FRAC_WORD_8(_z,3));	    \
+		    0,_FP_FRAC_WORD_8(R,4),_FP_FRAC_WORD_8(R,3));	    \
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,5),_FP_FRAC_WORD_8(_z,4),	    \
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,5),_FP_FRAC_WORD_8(R,4),		    \
-		    _FP_FRAC_WORD_8(_z,3), 0,_c_f1,_c_f0,		    \
+		    _FP_FRAC_WORD_8(R,3), 0,_c_f1,_c_f0,		    \
-		    _FP_FRAC_WORD_8(_z,5),_FP_FRAC_WORD_8(_z,4),	    \
+		    _FP_FRAC_WORD_8(R,5),_FP_FRAC_WORD_8(R,4),		    \
-		    _FP_FRAC_WORD_8(_z,3));				    \
+		    _FP_FRAC_WORD_8(R,3));				    \
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,5),_FP_FRAC_WORD_8(_z,4),	    \
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,5),_FP_FRAC_WORD_8(R,4),		    \
-		    _FP_FRAC_WORD_8(_z,3), 0,_d_f1,_d_f0,		    \
+		    _FP_FRAC_WORD_8(R,3), 0,_d_f1,_d_f0,		    \
-		    _FP_FRAC_WORD_8(_z,5),_FP_FRAC_WORD_8(_z,4),	    \
+		    _FP_FRAC_WORD_8(R,5),_FP_FRAC_WORD_8(R,4),		    \
-		    _FP_FRAC_WORD_8(_z,3));				    \
+		    _FP_FRAC_WORD_8(R,3));				    \
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,5),_FP_FRAC_WORD_8(_z,4),	    \
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,5),_FP_FRAC_WORD_8(R,4),		    \
-		    _FP_FRAC_WORD_8(_z,3), 0,_e_f1,_e_f0,		    \
+		    _FP_FRAC_WORD_8(R,3), 0,_e_f1,_e_f0,		    \
-		    _FP_FRAC_WORD_8(_z,5),_FP_FRAC_WORD_8(_z,4),	    \
+		    _FP_FRAC_WORD_8(R,5),_FP_FRAC_WORD_8(R,4),		    \
-		    _FP_FRAC_WORD_8(_z,3));				    \
+		    _FP_FRAC_WORD_8(R,3));				    \
    doit(_b_f1, _b_f0, X##_f[2], Y##_f[2]);				    \
    doit(_c_f1, _c_f0, X##_f[1], Y##_f[3]);				    \
    doit(_d_f1, _d_f0, X##_f[3], Y##_f[1]);				    \
    doit(_e_f1, _e_f0, X##_f[2], Y##_f[3]);				    \
    doit(_f_f1, _f_f0, X##_f[3], Y##_f[2]);				    \
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,6),_FP_FRAC_WORD_8(_z,5),	    \
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,6),_FP_FRAC_WORD_8(R,5),		    \
-		    _FP_FRAC_WORD_8(_z,4), 0,_b_f1,_b_f0,		    \
+		    _FP_FRAC_WORD_8(R,4), 0,_b_f1,_b_f0,		    \
-		    0,_FP_FRAC_WORD_8(_z,5),_FP_FRAC_WORD_8(_z,4));	    \
+		    0,_FP_FRAC_WORD_8(R,5),_FP_FRAC_WORD_8(R,4));	    \
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,6),_FP_FRAC_WORD_8(_z,5),	    \
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,6),_FP_FRAC_WORD_8(R,5),		    \
-		    _FP_FRAC_WORD_8(_z,4), 0,_c_f1,_c_f0,		    \
+		    _FP_FRAC_WORD_8(R,4), 0,_c_f1,_c_f0,		    \
-		    _FP_FRAC_WORD_8(_z,6),_FP_FRAC_WORD_8(_z,5),	    \
+		    _FP_FRAC_WORD_8(R,6),_FP_FRAC_WORD_8(R,5),		    \
-		    _FP_FRAC_WORD_8(_z,4));				    \
+		    _FP_FRAC_WORD_8(R,4));				    \
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,6),_FP_FRAC_WORD_8(_z,5),	    \
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,6),_FP_FRAC_WORD_8(R,5),		    \
-		    _FP_FRAC_WORD_8(_z,4), 0,_d_f1,_d_f0,		    \
+		    _FP_FRAC_WORD_8(R,4), 0,_d_f1,_d_f0,		    \
-		    _FP_FRAC_WORD_8(_z,6),_FP_FRAC_WORD_8(_z,5),	    \
+		    _FP_FRAC_WORD_8(R,6),_FP_FRAC_WORD_8(R,5),		    \
-		    _FP_FRAC_WORD_8(_z,4));				    \
+		    _FP_FRAC_WORD_8(R,4));				    \
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,7),_FP_FRAC_WORD_8(_z,6),	    \
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,7),_FP_FRAC_WORD_8(R,6),		    \
-		    _FP_FRAC_WORD_8(_z,5), 0,_e_f1,_e_f0,		    \
+		    _FP_FRAC_WORD_8(R,5), 0,_e_f1,_e_f0,		    \
-		    0,_FP_FRAC_WORD_8(_z,6),_FP_FRAC_WORD_8(_z,5));	    \
+		    0,_FP_FRAC_WORD_8(R,6),_FP_FRAC_WORD_8(R,5));	    \
-    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,7),_FP_FRAC_WORD_8(_z,6),	    \
+    __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,7),_FP_FRAC_WORD_8(R,6),		    \
-		    _FP_FRAC_WORD_8(_z,5), 0,_f_f1,_f_f0,		    \
+		    _FP_FRAC_WORD_8(R,5), 0,_f_f1,_f_f0,		    \
-		    _FP_FRAC_WORD_8(_z,7),_FP_FRAC_WORD_8(_z,6),	    \
+		    _FP_FRAC_WORD_8(R,7),_FP_FRAC_WORD_8(R,6),		    \
-		    _FP_FRAC_WORD_8(_z,5));				    \
+		    _FP_FRAC_WORD_8(R,5));				    \
    doit(_b_f1, _b_f0, X##_f[3], Y##_f[3]);				    \
-    __FP_FRAC_ADD_2(_FP_FRAC_WORD_8(_z,7),_FP_FRAC_WORD_8(_z,6),	    \
+    __FP_FRAC_ADD_2(_FP_FRAC_WORD_8(R,7),_FP_FRAC_WORD_8(R,6),		    \
 		    _b_f1,_b_f0,					    \
-		    _FP_FRAC_WORD_8(_z,7),_FP_FRAC_WORD_8(_z,6));	    \
+		    _FP_FRAC_WORD_8(R,7),_FP_FRAC_WORD_8(R,6));		    \
  } while (0)
 #define _FP_MUL_MEAT_4_wide(wfracbits, R, X, Y, doit)			    \
  do {									    \
    _FP_FRAC_DECL_8(_z);						    \
 									    \
    _FP_MUL_MEAT_DW_4_wide(wfracbits, _z, X, Y, doit);			    \
 									    \
    /* Normalize since we know where the msb of the multiplicands	    \
       were (bit B), we know that the msb of the of the product is	    \
@ -330,11 +339,16 @@
 		    _FP_FRAC_WORD_8(_z,1), _FP_FRAC_WORD_8(_z,0));	    \
  } while (0)
 #define _FP_MUL_MEAT_DW_4_gmp(wfracbits, R, X, Y)			    \
  do {									    \
    mpn_mul_n(R##_f, _x_f, _y_f, 4);					    \
  } while (0)
 #define _FP_MUL_MEAT_4_gmp(wfracbits, R, X, Y)				    \
  do {									    \
    _FP_FRAC_DECL_8(_z);						    \
 									    \
-    mpn_mul_n(_z_f, _x_f, _y_f, 4);					    \
+    _FP_MUL_MEAT_DW_4_gmp(wfracbits, _z, X, Y);				    \
 									    \
    /* Normalize since we know where the msb of the multiplicands	    \
       were (bit B), we know that the msb of the of the product is	    \
--- a/soft-fp/op-common.h
+++ b/soft-fp/op-common.h
@ -847,6 +847,217 @@ do {							\
 } while (0)
 /* Fused multiply-add.  The input values should be cooked.  */
 #define _FP_FMA(fs, wc, dwc, R, X, Y, Z)			\
 do {								\
  FP_DECL_##fs(T);						\
  T##_s = X##_s ^ Y##_s;					\
  T##_e = X##_e + Y##_e + 1;					\
  switch (_FP_CLS_COMBINE(X##_c, Y##_c))			\
  {								\
  case _FP_CLS_COMBINE(FP_CLS_NORMAL,FP_CLS_NORMAL):		\
    switch (Z##_c)						\
      {								\
      case FP_CLS_INF:						\
      case FP_CLS_NAN:						\
 	R##_s = Z##_s;						\
 	_FP_FRAC_COPY_##wc(R, Z);				\
 	R##_c = Z##_c;						\
 	break;							\
 								\
      case FP_CLS_ZERO:						\
 	R##_c = FP_CLS_NORMAL;					\
 	R##_s = T##_s;						\
 	R##_e = T##_e;						\
 								\
 	_FP_MUL_MEAT_##fs(R, X, Y);				\
 								\
 	if (_FP_FRAC_OVERP_##wc(fs, R))				\
 	  _FP_FRAC_SRS_##wc(R, 1, _FP_WFRACBITS_##fs);		\
 	else							\
 	  R##_e--;						\
 	break;							\
 								\
      case FP_CLS_NORMAL:;					\
 	_FP_FRAC_DECL_##dwc(TD);				\
 	_FP_FRAC_DECL_##dwc(ZD);				\
 	_FP_FRAC_DECL_##dwc(RD);				\
 	_FP_MUL_MEAT_DW_##fs(TD, X, Y);				\
 	R##_e = T##_e;						\
 	int tsh = _FP_FRAC_HIGHBIT_DW_##dwc(fs, TD) == 0;	\
 	T##_e -= tsh;						\
 	int ediff = T##_e - Z##_e;				\
 	if (ediff >= 0)						\
 	  {							\
 	    int shift = _FP_WFRACBITS_##fs - tsh - ediff;	\
 	    if (shift <= -_FP_WFRACBITS_##fs)			\
 	      _FP_FRAC_SET_##dwc(ZD, _FP_MINFRAC_##dwc);	\
 	    else						\
 	      {							\
 		_FP_FRAC_COPY_##dwc##_##wc(ZD, Z);		\
 		if (shift < 0)					\
 		  _FP_FRAC_SRS_##dwc(ZD, -shift,		\
 				     _FP_WFRACBITS_DW_##fs);	\
 		else if (shift > 0)				\
 		  _FP_FRAC_SLL_##dwc(ZD, shift);		\
 	      }							\
 	    R##_s = T##_s;					\
 	    if (T##_s == Z##_s)					\
 	      _FP_FRAC_ADD_##dwc(RD, TD, ZD);			\
 	    else						\
 	      {							\
 		_FP_FRAC_SUB_##dwc(RD, TD, ZD);			\
 		if (_FP_FRAC_NEGP_##dwc(RD))			\
 		  {						\
 		    R##_s = Z##_s;				\
 		    _FP_FRAC_SUB_##dwc(RD, ZD, TD);		\
 		  }						\
 	      }							\
 	  }							\
 	else							\
 	  {							\
 	    R##_e = Z##_e;					\
 	    R##_s = Z##_s;					\
 	    _FP_FRAC_COPY_##dwc##_##wc(ZD, Z);			\
 	    _FP_FRAC_SLL_##dwc(ZD, _FP_WFRACBITS_##fs);		\
 	    int shift = -ediff - tsh;				\
 	    if (shift >= _FP_WFRACBITS_DW_##fs)			\
 	      _FP_FRAC_SET_##dwc(TD, _FP_MINFRAC_##dwc);	\
 	    else if (shift > 0)					\
 	      _FP_FRAC_SRS_##dwc(TD, shift,			\
 				 _FP_WFRACBITS_DW_##fs);	\
 	    if (Z##_s == T##_s)					\
 	      _FP_FRAC_ADD_##dwc(RD, ZD, TD);			\
 	    else						\
 	      _FP_FRAC_SUB_##dwc(RD, ZD, TD);			\
 	  }							\
 	if (_FP_FRAC_ZEROP_##dwc(RD))				\
 	  {							\
 	    if (T##_s == Z##_s)					\
 	      R##_s = Z##_s;					\
 	    else						\
 	      R##_s = (FP_ROUNDMODE == FP_RND_MINF);		\
 	    _FP_FRAC_SET_##wc(R, _FP_ZEROFRAC_##wc);		\
 	    R##_c = FP_CLS_ZERO;				\
 	  }							\
 	else							\
 	  {							\
 	    int rlz;						\
 	    _FP_FRAC_CLZ_##dwc(rlz, RD);			\
 	    rlz -= _FP_WFRACXBITS_DW_##fs;			\
 	    R##_e -= rlz;					\
 	    int shift = _FP_WFRACBITS_##fs - rlz;		\
 	    if (shift > 0)					\
 	      _FP_FRAC_SRS_##dwc(RD, shift,			\
 				 _FP_WFRACBITS_DW_##fs);	\
 	    else if (shift < 0)					\
 	      _FP_FRAC_SLL_##dwc(RD, -shift);			\
 	    _FP_FRAC_COPY_##wc##_##dwc(R, RD);			\
 	    R##_c = FP_CLS_NORMAL;				\
 	  }							\
 	break;							\
      }								\
    goto done_fma;						\
 								\
  case _FP_CLS_COMBINE(FP_CLS_NAN,FP_CLS_NAN):			\
    _FP_CHOOSENAN(fs, wc, T, X, Y, '*');			\
    break;							\
 								\
  case _FP_CLS_COMBINE(FP_CLS_NAN,FP_CLS_NORMAL):		\
  case _FP_CLS_COMBINE(FP_CLS_NAN,FP_CLS_INF):			\
  case _FP_CLS_COMBINE(FP_CLS_NAN,FP_CLS_ZERO):			\
    T##_s = X##_s;						\
 								\
  case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_INF):			\
  case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_NORMAL):		\
  case _FP_CLS_COMBINE(FP_CLS_ZERO,FP_CLS_NORMAL):		\
  case _FP_CLS_COMBINE(FP_CLS_ZERO,FP_CLS_ZERO):		\
    _FP_FRAC_COPY_##wc(T, X);					\
    T##_c = X##_c;						\
    break;							\
 								\
  case _FP_CLS_COMBINE(FP_CLS_NORMAL,FP_CLS_NAN):		\
  case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_NAN):			\
  case _FP_CLS_COMBINE(FP_CLS_ZERO,FP_CLS_NAN):			\
    T##_s = Y##_s;						\
 								\
  case _FP_CLS_COMBINE(FP_CLS_NORMAL,FP_CLS_INF):		\
  case _FP_CLS_COMBINE(FP_CLS_NORMAL,FP_CLS_ZERO):		\
    _FP_FRAC_COPY_##wc(T, Y);					\
    T##_c = Y##_c;						\
    break;							\
 								\
  case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_ZERO):			\
  case _FP_CLS_COMBINE(FP_CLS_ZERO,FP_CLS_INF):			\
    T##_s = _FP_NANSIGN_##fs;					\
    T##_c = FP_CLS_NAN;						\
    _FP_FRAC_SET_##wc(T, _FP_NANFRAC_##fs);			\
    FP_SET_EXCEPTION(FP_EX_INVALID);				\
    break;							\
 								\
  default:							\
    abort();							\
  }								\
 								\
  /* T = X * Y is zero, infinity or NaN.  */			\
  switch (_FP_CLS_COMBINE(T##_c, Z##_c))			\
  {								\
  case _FP_CLS_COMBINE(FP_CLS_NAN,FP_CLS_NAN):			\
    _FP_CHOOSENAN(fs, wc, R, T, Z, '+');			\
    break;							\
 								\
  case _FP_CLS_COMBINE(FP_CLS_NAN,FP_CLS_NORMAL):		\
  case _FP_CLS_COMBINE(FP_CLS_NAN,FP_CLS_INF):			\
  case _FP_CLS_COMBINE(FP_CLS_NAN,FP_CLS_ZERO):			\
  case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_NORMAL):		\
  case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_ZERO):			\
    R##_s = T##_s;						\
    _FP_FRAC_COPY_##wc(R, T);					\
    R##_c = T##_c;						\
    break;							\
 								\
  case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_NAN):			\
  case _FP_CLS_COMBINE(FP_CLS_ZERO,FP_CLS_NAN):			\
  case _FP_CLS_COMBINE(FP_CLS_ZERO,FP_CLS_NORMAL):		\
  case _FP_CLS_COMBINE(FP_CLS_ZERO,FP_CLS_INF):			\
    R##_s = Z##_s;						\
    _FP_FRAC_COPY_##wc(R, Z);					\
    R##_c = Z##_c;						\
    break;							\
 								\
  case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_INF):			\
    if (T##_s == Z##_s)						\
      {								\
 	R##_s = Z##_s;						\
 	_FP_FRAC_COPY_##wc(R, Z);				\
 	R##_c = Z##_c;						\
      }								\
    else							\
      {								\
 	R##_s = _FP_NANSIGN_##fs;				\
 	R##_c = FP_CLS_NAN;					\
 	_FP_FRAC_SET_##wc(R, _FP_NANFRAC_##fs);			\
 	FP_SET_EXCEPTION(FP_EX_INVALID);			\
      }								\
    break;							\
 								\
  case _FP_CLS_COMBINE(FP_CLS_ZERO,FP_CLS_ZERO):		\
    if (T##_s == Z##_s)						\
      R##_s = Z##_s;						\
    else							\
      R##_s = (FP_ROUNDMODE == FP_RND_MINF);			\
    _FP_FRAC_COPY_##wc(R, Z);					\
    R##_c = Z##_c;						\
    break;							\
 								\
  default:							\
    abort();							\
  }								\
 done_fma: ;							\
 } while (0)
 /*
 * Main division routine.  The input values should be cooked.
 */
--- a/soft-fp/quad.h
+++ b/soft-fp/quad.h
@ -36,8 +36,10 @@
 #if _FP_W_TYPE_SIZE < 64
 #define _FP_FRACTBITS_Q         (4*_FP_W_TYPE_SIZE)
 #define _FP_FRACTBITS_DW_Q	(8*_FP_W_TYPE_SIZE)
 #else
 #define _FP_FRACTBITS_Q		(2*_FP_W_TYPE_SIZE)
 #define _FP_FRACTBITS_DW_Q	(4*_FP_W_TYPE_SIZE)
 #endif
 #define _FP_FRACBITS_Q		113
@ -59,6 +61,11 @@
 #define _FP_OVERFLOW_Q		\
 	((_FP_W_TYPE)1 << (_FP_WFRACBITS_Q % _FP_W_TYPE_SIZE))
 #define _FP_WFRACBITS_DW_Q	(2 * _FP_WFRACBITS_Q)
 #define _FP_WFRACXBITS_DW_Q	(_FP_FRACTBITS_DW_Q - _FP_WFRACBITS_DW_Q)
 #define _FP_HIGHBIT_DW_Q	\
  ((_FP_W_TYPE)1 << (_FP_WFRACBITS_DW_Q - 1) % _FP_W_TYPE_SIZE)
 typedef float TFtype __attribute__((mode(TF)));
 #if _FP_W_TYPE_SIZE < 64
@ -155,6 +162,7 @@ union _FP_UNION_Q
 #define FP_DIV_Q(R,X,Y)			_FP_DIV(Q,4,R,X,Y)
 #define FP_SQRT_Q(R,X)			_FP_SQRT(Q,4,R,X)
 #define _FP_SQRT_MEAT_Q(R,S,T,X,Q)	_FP_SQRT_MEAT_4(R,S,T,X,Q)
 #define FP_FMA_Q(R,X,Y,Z)		_FP_FMA(Q,4,8,R,X,Y,Z)
 #define FP_CMP_Q(r,X,Y,un)	_FP_CMP(Q,4,r,X,Y,un)
 #define FP_CMP_EQ_Q(r,X,Y)	_FP_CMP_EQ(Q,4,r,X,Y)
@ -166,6 +174,8 @@ union _FP_UNION_Q
 #define _FP_FRAC_HIGH_Q(X)	_FP_FRAC_HIGH_4(X)
 #define _FP_FRAC_HIGH_RAW_Q(X)	_FP_FRAC_HIGH_4(X)
 #define _FP_FRAC_HIGH_DW_Q(X)	_FP_FRAC_HIGH_8(X)
 #else   /* not _FP_W_TYPE_SIZE < 64 */
 union _FP_UNION_Q
 {
@ -256,6 +266,7 @@ union _FP_UNION_Q
 #define FP_DIV_Q(R,X,Y)			_FP_DIV(Q,2,R,X,Y)
 #define FP_SQRT_Q(R,X)			_FP_SQRT(Q,2,R,X)
 #define _FP_SQRT_MEAT_Q(R,S,T,X,Q)	_FP_SQRT_MEAT_2(R,S,T,X,Q)
 #define FP_FMA_Q(R,X,Y,Z)		_FP_FMA(Q,2,4,R,X,Y,Z)
 #define FP_CMP_Q(r,X,Y,un)	_FP_CMP(Q,2,r,X,Y,un)
 #define FP_CMP_EQ_Q(r,X,Y)	_FP_CMP_EQ(Q,2,r,X,Y)
@ -267,4 +278,6 @@ union _FP_UNION_Q
 #define _FP_FRAC_HIGH_Q(X)	_FP_FRAC_HIGH_2(X)
 #define _FP_FRAC_HIGH_RAW_Q(X)	_FP_FRAC_HIGH_2(X)
 #define _FP_FRAC_HIGH_DW_Q(X)	_FP_FRAC_HIGH_4(X)
 #endif /* not _FP_W_TYPE_SIZE < 64 */
--- a/soft-fp/single.h
+++ b/soft-fp/single.h
@ -36,6 +36,12 @@
 #define _FP_FRACTBITS_S		_FP_W_TYPE_SIZE
 #if _FP_W_TYPE_SIZE < 64
 # define _FP_FRACTBITS_DW_S	(2 * _FP_W_TYPE_SIZE)
 #else
 # define _FP_FRACTBITS_DW_S	_FP_W_TYPE_SIZE
 #endif
 #define _FP_FRACBITS_S		24
 #define _FP_FRACXBITS_S		(_FP_FRACTBITS_S - _FP_FRACBITS_S)
 #define _FP_WFRACBITS_S		(_FP_WORKBITS + _FP_FRACBITS_S)
@ -49,6 +55,11 @@
 #define _FP_IMPLBIT_SH_S	((_FP_W_TYPE)1 << (_FP_FRACBITS_S-1+_FP_WORKBITS))
 #define _FP_OVERFLOW_S		((_FP_W_TYPE)1 << (_FP_WFRACBITS_S))
 #define _FP_WFRACBITS_DW_S	(2 * _FP_WFRACBITS_S)
 #define _FP_WFRACXBITS_DW_S	(_FP_FRACTBITS_DW_S - _FP_WFRACBITS_DW_S)
 #define _FP_HIGHBIT_DW_S	\
  ((_FP_W_TYPE)1 << (_FP_WFRACBITS_DW_S - 1) % _FP_W_TYPE_SIZE)
 /* The implementation of _FP_MUL_MEAT_S and _FP_DIV_MEAT_S should be
   chosen by the target machine.  */
@ -139,6 +150,12 @@ union _FP_UNION_S
 #define FP_SQRT_S(R,X)			_FP_SQRT(S,1,R,X)
 #define _FP_SQRT_MEAT_S(R,S,T,X,Q)	_FP_SQRT_MEAT_1(R,S,T,X,Q)
 #if _FP_W_TYPE_SIZE < 64
 # define FP_FMA_S(R, X, Y, Z)	_FP_FMA(S, 1, 2, R, X, Y, Z)
 #else
 # define FP_FMA_S(R, X, Y, Z)	_FP_FMA(S, 1, 1, R, X, Y, Z)
 #endif
 #define FP_CMP_S(r,X,Y,un)	_FP_CMP(S,1,r,X,Y,un)
 #define FP_CMP_EQ_S(r,X,Y)	_FP_CMP_EQ(S,1,r,X,Y)
 #define FP_CMP_UNORD_S(r,X,Y)	_FP_CMP_UNORD(S,1,r,X,Y)
@ -148,3 +165,9 @@ union _FP_UNION_S
 #define _FP_FRAC_HIGH_S(X)	_FP_FRAC_HIGH_1(X)
 #define _FP_FRAC_HIGH_RAW_S(X)	_FP_FRAC_HIGH_1(X)
 #if _FP_W_TYPE_SIZE < 64
 # define _FP_FRAC_HIGH_DW_S(X)	_FP_FRAC_HIGH_2(X)
 #else
 # define _FP_FRAC_HIGH_DW_S(X)	_FP_FRAC_HIGH_1(X)
 #endif