added libtommath-0.07

2003-02-28 16:05:52 +00:00 · 2003-02-28 16:05:52 +00:00 · 3cd7000342
commit 3cd7000342
parent 16c6ccc62c
9 changed files with 277 additions and 111 deletions
--- a/b.bat
+++ b/b.bat
@ -1,3 +1,3 @@
 nasm -f coff timer.asm
 gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo
-gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo
+rem gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo
--- a/bn.c
+++ b/bn.c
@ -849,8 +849,7 @@ static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
 */
 static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
 {
-   mp_int t;
+   int olduse, res, pa, pb, ix, iy;
   int res, pa, pb, ix, iy;
   mp_word W[512], *_W;
   mp_digit tmpx, *tmpy;
@ -859,11 +858,12 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
   VERIFY(b);
   VERIFY(c);
-   if ((res = mp_init_size(&t, digs)) != MP_OKAY) {
+   if (c->alloc < digs) {
-      DECFUNC();
+      if ((res = mp_grow(c, digs)) != MP_OKAY) {
-      return res;
+         DECFUNC();
         return res;
      }
   }
   t.used = digs;
   /* clear temp buf (the columns) */
   memset(W, 0, digs*sizeof(mp_word));
@ -893,6 +893,11 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
       }
   }
   /* setup dest */
   olduse  = c->used;
   c->used = digs;
   /* At this point W[] contains the sums of each column.  To get the
    * correct result we must take the extra bits from each column and
    * carry them down
@ -904,14 +909,17 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
    * this is slower but on most cryptographic size numbers it is faster.
    */
   for (ix = 1; ix < digs; ix++) {
-       W[ix]      = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
+       W[ix]       = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
-       t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
+       c->dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
   }
   c->dp[digs-1]   = W[digs-1] & ((mp_word)MP_MASK);
   /* clear unused */
   for (ix = c->used; ix < olduse; ix++) {
      c->dp[ix] = 0;
   }
   t.dp[digs-1]   = W[digs-1] & ((mp_word)MP_MASK);
-   mp_clamp(&t);
+   mp_clamp(c);
   mp_exch(&t, c);
   mp_clear(&t);
   DECFUNC();
   return MP_OKAY;
 }
@ -993,8 +1001,7 @@ static int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
 */
 static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
 {
-   mp_int t;
+   int oldused, newused, res, pa, pb, ix, iy;
   int res, pa, pb, ix, iy;
   mp_word W[512], *_W;
   mp_digit tmpx, *tmpy;
@ -1003,11 +1010,13 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
   VERIFY(b);
   VERIFY(c);
-   if ((res = mp_init_size(&t, a->used + b->used + 1)) != MP_OKAY) {
+   newused = a->used + b->used + 1;
-      DECFUNC();
+   if (c->alloc < newused) {
-      return res;
+      if ((res = mp_grow(c, newused)) != MP_OKAY) {
         DECFUNC();
         return res;
      }
   }
   t.used = a->used + b->used + 1;
   /* like the other comba method we compute the columns first */
   pa = a->used;
@ -1025,17 +1034,21 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
       }
   }
   /* setup dest */
   oldused = c->used;
   c->used = newused;
   /* now convert the array W downto what we need */
   for (ix = digs+1; ix < (pa+pb+1); ix++) {
-       W[ix]      = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
+       W[ix]       = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
-       t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
+       c->dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
   }
-   t.dp[(pa+pb+1)-1]   = W[(pa+pb+1)-1] & ((mp_word)MP_MASK);
+   c->dp[(pa+pb+1)-1] = W[(pa+pb+1)-1] & ((mp_word)MP_MASK);
-   
+   for (ix = c->used; ix < oldused; ix++) {
-   mp_clamp(&t);
+      c->dp[ix] = 0;
-   mp_exch(&t, c);
+   }
-   mp_clear(&t);
+   mp_clamp(c);
   DECFUNC();
   return MP_OKAY;
 }
@ -1106,8 +1119,7 @@ static int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
 */
 static int fast_s_mp_sqr(mp_int *a, mp_int *b)
 {
-   mp_int t;
+   int olduse, newused, res, ix, iy, pa;
   int res, ix, iy, pa;
   mp_word  W2[512], W[512], *_W;
   mp_digit tmpx, *tmpy;
@ -1116,11 +1128,13 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
   VERIFY(b);
   pa = a->used;
-   if ((res = mp_init_size(&t, pa + pa + 1)) != MP_OKAY) {
+   newused = pa + pa + 1;
-      DECFUNC();
+   if (b->alloc < newused) {
-      return res;
+      if ((res = mp_grow(b, newused)) != MP_OKAY) {
-   }
+         DECFUNC();
-   t.used = pa + pa + 1;
+         return res;
      }
   }   
   /* zero temp buffer (columns) */
   memset(W, 0, (pa+pa+1)*sizeof(mp_word));
@ -1144,19 +1158,29 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
   /* double first value, since the inner products are half of what they should be */
   W[0] += W[0] + W2[0];
   /* setup dest */
   olduse  = b->used;
   b->used = newused;
   /* now compute digits */
   for (ix = 1; ix < (pa+pa+1); ix++) {
       /* double/add next digit */
-       W[ix] += W[ix] + W2[ix];
+       W[ix]       += W[ix] + W2[ix];
-       
+
-       W[ix]      = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
+       W[ix]       = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
-       t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
+       b->dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
   }
-   t.dp[(pa+pa+1)-1]   = W[(pa+pa+1)-1] & ((mp_word)MP_MASK);
+   b->dp[(pa+pa+1)-1]   = W[(pa+pa+1)-1] & ((mp_word)MP_MASK);
-   mp_clamp(&t);
+   /* clear high */
-   mp_exch(&t, b);
+   for (ix = b->used; ix < olduse; ix++) {
-   mp_clear(&t);
+       b->dp[ix] = 0;
   }
   /* fix the sign (since we no longer make a fresh temp) */
   b->sign = MP_ZPOS;
   mp_clamp(b);
   DECFUNC();
   return MP_OKAY;
 }
@ -1173,13 +1197,13 @@ static int s_mp_sqr(mp_int *a, mp_int *b)
   VERIFY(a);
   VERIFY(b);
-   /* can we use the fast multiplier? */
+   /* can we use the fast multiplier? */  
   if (((a->used * 2 + 1) < 512) && a->used < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT) - 1))) {
      res = fast_s_mp_sqr(a,b);
      DECFUNC();
      return res;
   }  
-
+   
   pa = a->used;
   if ((res = mp_init_size(&t, pa + pa + 1)) != MP_OKAY) {
      DECFUNC();
@ -1385,10 +1409,9 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
   if (mp_lshd(&x1y1, B*2) != MP_OKAY) goto X1Y1;                 /* x1y1 = x1y1 << 2*B */
   if (mp_add(&x0y0, &t1, &t1) != MP_OKAY) goto X1Y1;             /* t1 = x0y0 + t1 */
-   if (mp_add(&t1, &x1y1, &t1) != MP_OKAY) goto X1Y1;             /* t1 = x0y0 + t1 + x1y1 */
+   if (mp_add(&t1, &x1y1, c) != MP_OKAY) goto X1Y1;               /* t1 = x0y0 + t1 + x1y1 */
   err = MP_OKAY;
   mp_exch(&t1, c);
 X1Y1: mp_clear(&x1y1);
 X0Y0: mp_clear(&x0y0);
@ -1426,7 +1449,7 @@ int mp_mul(mp_int *a, mp_int *b, mp_int *c)
 static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
 {
   mp_int x0, x1, t1, t2, x0x0, x1x1;
-   int B, err;
+   int B, err, x;
   REGFUNC("mp_karatsuba_sqr");
   VERIFY(a);
@ -1441,8 +1464,8 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
   B = B/2;
   /* init copy all the temps */
-   if (mp_init_copy(&x0, a) != MP_OKAY) goto ERR;
+   if (mp_init_size(&x0, B) != MP_OKAY) goto ERR;
-   if (mp_init_copy(&x1, a) != MP_OKAY) goto X0;
+   if (mp_init_size(&x1, a->used - B) != MP_OKAY) goto X0;
   /* init temps */
   if (mp_init(&t1) != MP_OKAY)         goto X1;
@ -1451,16 +1474,27 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
   if (mp_init(&x1x1) != MP_OKAY)       goto X0X0;
   /* now shift the digits */
-   mp_mod_2d(&x0, B*DIGIT_BIT, &x0);
+   for (x = 0; x < B; x++) {
-   mp_rshd(&x1, B);
+       x0.dp[x] = a->dp[x];
   }
   for (x = B; x < a->used; x++) {
       x1.dp[x-B] = a->dp[x];
   }
   x0.used = B;
   x1.used = a->used - B;
   mp_clamp(&x0);
   mp_clamp(&x1);
   /* now calc the products x0*x0 and x1*x1 */
-   if (s_mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1;                /* x0x0 = x0*x0 */
+   if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1;                /* x0x0 = x0*x0 */
-   if (s_mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1;                /* x1x1 = x1*x1 */
+   if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1;                /* x1x1 = x1*x1 */
   /* now calc x1-x0 and y1-y0 */
   if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1;               /* t1 = x1 - x0 */
-   if (s_mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1;                  /* t1 = (x1 - x0) * (y1 - y0) */
+   if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1;                  /* t1 = (x1 - x0) * (y1 - y0) */
   /* add x0y0 */
   if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1;           /* t2 = x0y0 + x1y1 */
@ -1471,10 +1505,9 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
   if (mp_lshd(&x1x1, B*2) != MP_OKAY) goto X1X1;                 /* x1y1 = x1y1 << 2*B */
   if (mp_add(&x0x0, &t1, &t1) != MP_OKAY) goto X1X1;             /* t1 = x0y0 + t1 */
-   if (mp_add(&t1, &x1x1, &t1) != MP_OKAY) goto X1X1;             /* t1 = x0y0 + t1 + x1y1 */
+   if (mp_add(&t1, &x1x1, b) != MP_OKAY) goto X1X1;               /* t1 = x0y0 + t1 + x1y1 */
   err = MP_OKAY;
   mp_exch(&t1, b);
 X1X1: mp_clear(&x1x1);
 X0X0: mp_clear(&x0x0);
@ -2784,6 +2817,102 @@ __M  :
   return err;
 }
 /* find the n'th root of an integer 
 *
 * Result found such that (c)^b <= a and (c+1)^b > a 
 */
 int mp_n_root(mp_int *a, mp_digit b, mp_int *c)
 {
   mp_int t1, t2, t3;
   int res, neg;
   /* input must be positive if b is even*/
   if ((b&1) == 0 && a->sign == MP_NEG) {
      return MP_VAL;
   }
   if ((res = mp_init(&t1)) != MP_OKAY) {
      return res;
   }
   if ((res = mp_init(&t2)) != MP_OKAY) {
      goto __T1;
   }
   if ((res = mp_init(&t3)) != MP_OKAY) {
      goto __T2;
   }
   /* if a is negative fudge the sign but keep track */
   neg     = a->sign;
   a->sign = MP_ZPOS;
   /* t2 = a */
   if ((res = mp_copy(a, &t2)) != MP_OKAY) {
      goto __T3;
   }
   do {
      /* t1 = t2 */
      if ((res = mp_copy(&t2, &t1)) != MP_OKAY) {
         goto __T3;
      }
      /* t2 = t1 - ((t1^b - a) / (b * t1^(b-1))) */
      if ((res = mp_expt_d(&t1, b-1, &t3)) != MP_OKAY) {            /* t3 = t1^(b-1) */
         goto __T3;
      }
      /* numerator */
      if ((res = mp_mul(&t3, &t1, &t2)) != MP_OKAY) {               /* t2 = t1^b */
         goto __T3;
      }
      if ((res = mp_sub(&t2, a, &t2)) != MP_OKAY) {                 /* t2 = t1^b - a */
         goto __T3;
      }
      if ((res = mp_mul_d(&t3, b, &t3)) != MP_OKAY) {               /* t3 = t1^(b-1) * b  */
         goto __T3;
      }
      if ((res = mp_div(&t2, &t3, &t3, NULL)) != MP_OKAY) {         /* t3 = (t1^b - a)/(b * t1^(b-1)) */
         goto __T3;
      }
      if ((res = mp_sub(&t1, &t3, &t2)) != MP_OKAY) {
         goto __T3;
      }
   } while (mp_cmp(&t1, &t2) != MP_EQ);
   /* result can be at most off by one so check */
   if ((res = mp_expt_d(&t1, b, &t2)) != MP_OKAY) {
      goto __T3;
   }
   if (mp_cmp(&t2, a) == MP_GT) {
      if ((res = mp_sub_d(&t1, 1, &t1)) != MP_OKAY) {
         goto __T3;
      }
   }
   /* reset the sign of a first */
   a->sign = neg;
   /* set the result */
   mp_exch(&t1, c);
   /* set the sign of the result */
   c->sign = neg;   
   res = MP_OKAY;
 __T3:  mp_clear(&t3);
 __T2:  mp_clear(&t2);
 __T1:  mp_clear(&t1);
   return res;
 }
 /* --> radix conversion <-- */
 /* reverse an array, used for radix code */
 static void reverse(unsigned char *s, int len)
--- a/bn.h
+++ b/bn.h
@ -233,6 +233,15 @@ int mp_gcd(mp_int *a, mp_int *b, mp_int *c);
 /* c = [a, b] or (a*b)/(a, b) */
 int mp_lcm(mp_int *a, mp_int *b, mp_int *c);
 /* finds one of the b'th root of a, such that |c|^b <= |a| 
 *
 * returns error if a < 0 and b is even
 */
 int mp_n_root(mp_int *a, mp_digit b, mp_int *c);
 /* shortcut for square root */
 #define mp_sqrt(a, b) mp_n_root(a, 2, b)
 /* used to setup the Barrett reduction for a given modulus b */
 int mp_reduce_setup(mp_int *a, mp_int *b);
--- a/bn.pdf
+++ b/bn.pdf
--- a/bn.tex
+++ b/bn.tex
@ -1,7 +1,7 @@
 \documentclass{article}
 \begin{document}
-\title{LibTomMath v0.06 \\ A Free Multiple Precision Integer Library}
+\title{LibTomMath v0.07 \\ A Free Multiple Precision Integer Library}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
@ -187,17 +187,6 @@ int mp_mul_2(mp_int *a, mp_int *b);
 int mp_mod_2d(mp_int *a, int b, mp_int *c);
 \end{verbatim}
 Both the \textbf{mp\_rshd} and \textbf{mp\_lshd} functions provide shifting by whole digits.  For example, 
 mp\_rshd($x$, $n$) is the same as $x \leftarrow \lfloor x / \beta^n \rfloor$ while mp\_lshd($x$, $n$) is equivalent
 to $x \leftarrow x \cdot \beta^n$.  Both functions are extremely fast as they merely copy digits within the array.  
 Similarly the \textbf{mp\_div\_2d} and \textbf{mp\_mul\_2d} functions provide shifting but allow any bit count to 
 be specified.  For example, mp\_div\_2d($x$, $n$, $y$) is the same as $y =\lfloor x / 2^n \rfloor$ while 
 mp\_mul\_2d($x$, $n$, $y$) is the same as $y = x \cdot 2^n$.  The \textbf{mp\_div\_2} and \textbf{mp\_mul\_2} 
 functions are legacy functions that merely shift right or left one bit respectively.  The \textbf{mp\_mod\_2d} function
 reduces an integer mod a power of two.  For example, mp\_mod\_2d($x$, $n$, $y$) is the same as 
 $y \equiv x \mbox{ (mod }2^n\mbox{)}$.
 \subsection{Basic Arithmetic}
 Next are the class of functions which provide basic arithmetic.
@ -234,17 +223,7 @@ int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
 int mp_mod(mp_int *a, mp_int *b, mp_int *c);
 \end{verbatim}
-The \textbf{mp\_cmp} will compare two integers.  It will return \textbf{MP\_LT} if the first parameter is less than
+\subsection{Single Digit Functions}
 the second, \textbf{MP\_GT} if it is greater or \textbf{MP\_EQ} if they are equal.  These constants are the same as from
 MPI.
 The \textbf{mp\_add}, \textbf{mp\_sub}, \textbf{mp\_mul}, \textbf{mp\_div}, \textbf{mp\_sqr} and \textbf{mp\_mod} are all
 fairly straight forward to understand.  Note that in mp\_div either $c$ (the quotient) or $d$ (the remainder) can be 
 passed as NULL to ignore it.  For example, if you only want the quotient $z = \lfloor x/y \rfloor$ then a call such as 
 mp\_div(\&x, \&y, \&z, NULL) is acceptable.
 There is a related class of ``single digit'' functions that are like the above except they use a digit as the second
 operand.
 \begin{verbatim}
 /* compare against a single digit */
@ -296,14 +275,13 @@ int mp_gcd(mp_int *a, mp_int *b, mp_int *c);
 /* c = [a, b] or (a*b)/(a, b) */
 int mp_lcm(mp_int *a, mp_int *b, mp_int *c);
 /* find the b'th root of a  */
 int mp_n_root(mp_int *a, mp_digit b, mp_int *c);
 /* d = a^b (mod c) */
 int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
 \end{verbatim}
 These are all fairly simple to understand.  The \textbf{mp\_invmod} is a modular multiplicative inverse.  That is it
 stores in the third parameter an integer such that $ac \equiv 1 \mbox{ (mod }b\mbox{)}$ provided such integer exists.  If
 there is no such integer the function returns \textbf{MP\_VAL}.
 \subsection{Radix Conversions}
 To read or store integers in other formats there are the following functions.
@ -432,7 +410,7 @@ when $b \le 0$, in theory the routine will still give a properly congruent answe
 This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time.
-\subsection{Modular Arithmetic}
+\subsection{Number Theoretic Functions}
 \subsubsection{mp\_addmod, mp\_submod, mp\_mulmod, mp\_sqrmod}
 These functions take the time of their host function plus the time it takes to perform a division.  For example, 
@ -445,6 +423,41 @@ Also note that these functions use mp\_mod which means the result are guaranteed
 This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$.  When
 $b$ is odd a ``fast'' variant is used which finds the inverse twice as fast.  
 \subsubsection{mp\_gcd(mp\_int *a, mp\_int *b, mp\_int *c)}
 Finds the greatest common divisor of both $a$ and $b$ and places the result in $c$.  Will work with either positive
 or negative inputs.  
 Functions requires no additional memory and approximately $O(N \cdot log(N))$ time.
 \subsubsection{mp\_lcm(mp\_int *a, mp\_int *b, mp\_int *c)}
 Finds the least common multiple of both $a$ and $b$ and places the result in $c$.  Will work with either positive
 or negative inputs.  This is calculated by dividing the product of $a$ and $b$ by the greatest common divisor of 
 both.  
 Functions requires no additional memory and approximately $O(4 \cdot N^2)$ time.
 \subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int c)}
 Finds the $b$'th root of $a$ and stores it in $b$.  The roots are found such that $\vert c \vert^b \le \vert a \vert$.  
 Uses the Newton approximation approach which means it converges in $O(log \beta^N)$ time to a final result.  Each iteration
 requires $b$ multiplications and one division for a total work of $O(6N^2 \cdot log \beta^N) = O(6N^3 \cdot log \beta)$.
 If the input $a$ is negative and $b$ is even the function returns an error.  Otherwise the function will return a root
 that has a sign that agrees with the sign of $a$.
 \subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
 Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm.  For an $\alpha$-bit
 exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + k$ multiplications.  The value of $k$ is
 chosen to minimize the number of multiplications required for a given value of $\alpha$.  Barrett reductions are used
 to reduce the squared or multiplied temporary results modulo $c$.  A Barrett reduction requires one division that is
 performed only and two half multipliers of $N$ digit numbers resulting in approximation $O((N^2)/2)$ work.  
 Let $\gamma = \lfloor \alpha/k \rfloor + k$ represent the total multiplications.  The total work of a exponentiation is
 therefore, $O(3 \cdot N^2 + (\alpha + \gamma) \cdot ((N^2)/2) + \alpha \cdot ((N^2 + N)/2) + \gamma \cdot N^2)$ which 
 simplies to $O(3 \cdot N^2 + \gamma N^2 + \alpha (N^2 + (N/2)))$.  The bulk of the time is spent in the Barrett 
 reductions and the squaring algorithms.  Since $\gamma < \alpha$ it makes sense to optimize first the Barrett and
 squaring routines first.  Significant improvements in the future will most likely stem from optimizing these instead
 of optimizing the multipliers.
 \section{Timing Analysis}
 \subsection{Observed Timings}
 A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification).  The
@ -467,27 +480,27 @@ Inversion & 1024 & 5,237,957  & 1,054,158   \\
 Inversion & 2048 & 17,871,944  & 3,459,683   \\
 Inversion & 4096 & 66,610,468  & 11,834,556   \\
 \hline
-Multiply & 128 & 1,426   & 828    \\
+Multiply & 128 & 1,426   & 451     \\
-Multiply & 256 & 2,551   & 1,393    \\
+Multiply & 256 & 2,551   & 958     \\
-Multiply & 512 & 7,913   & 2,926    \\
+Multiply & 512 & 7,913   & 2,476     \\
-Multiply & 1024 & 28,496   & 8,620  \\
+Multiply & 1024 & 28,496   & 7,927   \\
-Multiply & 2048 & 109,897   & 28,967    \\
+Multiply & 2048 & 109,897   & 282,24     \\
-Multiply & 4096 & 469,970   & 105,387    \\
+Multiply & 4096 & 469,970   & 104,681     \\
 \hline 
-Square & 128 & 1,319   & 869    \\
+Square & 128 & 1,319   & 511     \\
-Square & 256 & 1,776   & 1,362    \\
+Square & 256 & 1,776   & 947     \\
-Square & 512 & 5,399  & 2,571   \\
+Square & 512 & 5,399  & 2,153    \\
-Square & 1024 & 18,991  & 6,332    \\
+Square & 1024 & 18,991  & 5,733     \\
-Square & 2048 & 72,126  & 18,426   \\
+Square & 2048 & 72,126  & 17,621    \\
-Square & 4096 & 306,269  & 74,908 \\
+Square & 4096 & 306,269  & 70,168  \\
 \hline 
-Exptmod & 512 & 32,021,586  & 5,696,459  \\
+Exptmod & 512 & 32,021,586  & 4,472,406   \\
-Exptmod & 768 & 97,595,492  & 12,428,274   \\
+Exptmod & 768 & 97,595,492  & 10,427,845    \\
-Exptmod & 1024 & 223,302,532  & 22,834,316   \\
+Exptmod & 1024 & 223,302,532  & 20,561,722    \\
-Exptmod & 2048 & 1,682,223,369   & 119,888,049    \\
+Exptmod & 2048 & 1,682,223,369   & 113,978,803     \\
-Exptmod & 2560 & 3,268,615,571   & 250,901,263     \\
+Exptmod & 2560 & 3,268,615,571   & 236,650,133      \\
-Exptmod & 3072 & 5,597,240,141   & 391,716,431    \\
+Exptmod & 3072 & 5,597,240,141   & 373,449,291     \\
-Exptmod & 4096 & 13,347,270,891   & 814,429,647    
+Exptmod & 4096 & 13,347,270,891   & 787,568,457     
 \end{tabular}
 \end{center}
--- a/changes.txt
+++ b/changes.txt
@ -1,3 +1,8 @@
 Jan 1st, 2003
 v0.07  -- Removed alot of heap operations from core functions to speed them up
       -- Added a root finding function [and mp_sqrt macro like from MPI]
       -- Added more to manual 
 Dec 31st, 2002
 v0.06  -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc...
       -- Cleaned up the header a bit more
--- a/demo.c
+++ b/demo.c
@ -105,10 +105,9 @@ int main(void)
   mp_sub_d(&a, 1, &c);
   mp_exptmod(&b, &c, &a, &d);
   mp_toradix(&d, buf, 10);
-   printf("b^p-1 == %s\n", buf); 
+   printf("b^p-1 == %s\n", buf);     
 #ifdef TIMER   
      mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
      mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
      while (a.used * DIGIT_BIT < 8192) {
@ -156,9 +155,6 @@ int main(void)
      printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
      mp_copy(&b, &a);
   }
   {
      char *primes[] = {
@ -177,6 +173,7 @@ int main(void)
      for (rr = 0; rr < mp_count_bits(&a); rr++) {
         mp_mul_2d(&b, 1, &b);
         b.dp[0] |= lbit();
         b.used  += 1;
      }
      mp_sub_d(&a, 1, &c);
      mp_mod(&b, &c, &b);
@ -198,7 +195,7 @@ int main(void)
      printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)35));
   }
   }
-   
+
   mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
   mp_read_radix(&b, "234892374891378913789237289378973232333", 10);
   while (a.used * DIGIT_BIT < 8192) {
@ -223,6 +220,19 @@ int main(void)
   inv_n = expt_n = lcm_n = gcd_n = add_n = sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = 0;   
   for (;;) {
       /* randomly clear and re-init one variable, this has the affect of triming the alloc space */
       switch (abs(rand()) % 7) {
           case 0:  mp_clear(&a); mp_init(&a); break;
           case 1:  mp_clear(&b); mp_init(&b); break;
           case 2:  mp_clear(&c); mp_init(&c); break;
           case 3:  mp_clear(&d); mp_init(&d); break;
           case 4:  mp_clear(&e); mp_init(&e); break;
           case 5:  mp_clear(&f); mp_init(&f); break;
           case 6:  break; /* don't clear any */
       }
       printf("%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%5d\r", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n, _ifuncs);
       fgets(cmd, 4095, stdin);
       cmd[strlen(cmd)-1] = 0;
--- a/2
+++ b/2
@ -1,7 +1,7 @@
 CC = gcc
 CFLAGS  += -DDEBUG -Wall -W -O3 -fomit-frame-pointer -funroll-loops 
-VERSION=0.06
+VERSION=0.07
 default: test
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@ -41,7 +41,7 @@ void rand_num(mp_int *a)
   unsigned char buf[512];
 top:
-   size = 1 + ((fgetc(rng)*fgetc(rng)) % 32);
+   size = 1 + ((fgetc(rng)*fgetc(rng)) % 512);
   buf[0] = (fgetc(rng)&1)?1:0;
   fread(buf+1, 1, size, rng);
   for (n = 0; n < size; n++) {
@ -57,7 +57,7 @@ void rand_num2(mp_int *a)
   unsigned char buf[512];
 top:
-   size = 1 + ((fgetc(rng)*fgetc(rng)) % 32);
+   size = 1 + ((fgetc(rng)*fgetc(rng)) % 512);
   buf[0] = (fgetc(rng)&1)?1:0;
   fread(buf+1, 1, size, rng);
   for (n = 0; n < size; n++) {