diff --git a/b.bat b/b.bat
index 1d0b900..32dee86 100644
--- a/b.bat
+++ b/b.bat
@@ -1,3 +1,3 @@
 nasm -f coff timer.asm
 gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo
-gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo
+rem gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo
diff --git a/bn.c b/bn.c
index cf8a391..6d258d0 100644
--- a/bn.c
+++ b/bn.c
@@ -849,8 +849,7 @@ static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
  */
 static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
 {
-   mp_int t;
-   int res, pa, pb, ix, iy;
+   int olduse, res, pa, pb, ix, iy;
    mp_word W[512], *_W;
    mp_digit tmpx, *tmpy;
    
@@ -859,11 +858,12 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
    VERIFY(b);
    VERIFY(c);
    
-   if ((res = mp_init_size(&t, digs)) != MP_OKAY) {
-      DECFUNC();
-      return res;
+   if (c->alloc < digs) {
+      if ((res = mp_grow(c, digs)) != MP_OKAY) {
+         DECFUNC();
+         return res;
+      }
    }
-   t.used = digs;
    
    /* clear temp buf (the columns) */
    memset(W, 0, digs*sizeof(mp_word));
@@ -893,6 +893,11 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
        }
    }
    
+   /* setup dest */
+   olduse  = c->used;
+   c->used = digs;
+
+   
    /* At this point W[] contains the sums of each column.  To get the
     * correct result we must take the extra bits from each column and
     * carry them down
@@ -904,14 +909,17 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
     * this is slower but on most cryptographic size numbers it is faster.
     */
    for (ix = 1; ix < digs; ix++) {
-       W[ix]      = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
-       t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
+       W[ix]       = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
+       c->dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
+   }
+   c->dp[digs-1]   = W[digs-1] & ((mp_word)MP_MASK);
+   
+   /* clear unused */
+   for (ix = c->used; ix < olduse; ix++) {
+      c->dp[ix] = 0;
    }
-   t.dp[digs-1]   = W[digs-1] & ((mp_word)MP_MASK);
   
-   mp_clamp(&t);
-   mp_exch(&t, c);
-   mp_clear(&t);
+   mp_clamp(c);
    DECFUNC();
    return MP_OKAY;
 }
@@ -993,8 +1001,7 @@ static int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
  */
 static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
 {
-   mp_int t;
-   int res, pa, pb, ix, iy;
+   int oldused, newused, res, pa, pb, ix, iy;
    mp_word W[512], *_W;
    mp_digit tmpx, *tmpy;
    
@@ -1003,11 +1010,13 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
    VERIFY(b);
    VERIFY(c);
    
-   if ((res = mp_init_size(&t, a->used + b->used + 1)) != MP_OKAY) {
-      DECFUNC();
-      return res;
+   newused = a->used + b->used + 1;
+   if (c->alloc < newused) {
+      if ((res = mp_grow(c, newused)) != MP_OKAY) {
+         DECFUNC();
+         return res;
+      }
    }
-   t.used = a->used + b->used + 1;
    
    /* like the other comba method we compute the columns first */
    pa = a->used;
@@ -1025,17 +1034,21 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
        }
    }
    
+   /* setup dest */
+   oldused = c->used;
+   c->used = newused;
+   
    /* now convert the array W downto what we need */
    for (ix = digs+1; ix < (pa+pb+1); ix++) {
-       W[ix]      = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
-       t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
+       W[ix]       = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
+       c->dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
    }
-   t.dp[(pa+pb+1)-1]   = W[(pa+pb+1)-1] & ((mp_word)MP_MASK);
+   c->dp[(pa+pb+1)-1] = W[(pa+pb+1)-1] & ((mp_word)MP_MASK);
    
-   
-   mp_clamp(&t);
-   mp_exch(&t, c);
-   mp_clear(&t);
+   for (ix = c->used; ix < oldused; ix++) {
+      c->dp[ix] = 0;
+   }
+   mp_clamp(c);
    DECFUNC();
    return MP_OKAY;
 }
@@ -1106,8 +1119,7 @@ static int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
  */
 static int fast_s_mp_sqr(mp_int *a, mp_int *b)
 {
-   mp_int t;
-   int res, ix, iy, pa;
+   int olduse, newused, res, ix, iy, pa;
    mp_word  W2[512], W[512], *_W;
    mp_digit tmpx, *tmpy;
    
@@ -1116,11 +1128,13 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
    VERIFY(b);
 
    pa = a->used;
-   if ((res = mp_init_size(&t, pa + pa + 1)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   t.used = pa + pa + 1;
+   newused = pa + pa + 1;
+   if (b->alloc < newused) {
+      if ((res = mp_grow(b, newused)) != MP_OKAY) {
+         DECFUNC();
+         return res;
+      }
+   }   
    
    /* zero temp buffer (columns) */
    memset(W, 0, (pa+pa+1)*sizeof(mp_word));
@@ -1144,19 +1158,29 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
    /* double first value, since the inner products are half of what they should be */
    W[0] += W[0] + W2[0];
    
+   /* setup dest */
+   olduse  = b->used;
+   b->used = newused;
+   
    /* now compute digits */
    for (ix = 1; ix < (pa+pa+1); ix++) {
        /* double/add next digit */
-       W[ix] += W[ix] + W2[ix];
-       
-       W[ix]      = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
-       t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
+       W[ix]       += W[ix] + W2[ix];
+
+       W[ix]       = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
+       b->dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
    }
-   t.dp[(pa+pa+1)-1]   = W[(pa+pa+1)-1] & ((mp_word)MP_MASK);
+   b->dp[(pa+pa+1)-1]   = W[(pa+pa+1)-1] & ((mp_word)MP_MASK);
    
-   mp_clamp(&t);
-   mp_exch(&t, b);
-   mp_clear(&t);
+   /* clear high */
+   for (ix = b->used; ix < olduse; ix++) {
+       b->dp[ix] = 0;
+   }
+   
+   /* fix the sign (since we no longer make a fresh temp) */
+   b->sign = MP_ZPOS;
+   
+   mp_clamp(b);
    DECFUNC();
    return MP_OKAY;
 }
@@ -1173,13 +1197,13 @@ static int s_mp_sqr(mp_int *a, mp_int *b)
    VERIFY(a);
    VERIFY(b);
    
-   /* can we use the fast multiplier? */
+   /* can we use the fast multiplier? */  
    if (((a->used * 2 + 1) < 512) && a->used < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT) - 1))) {
       res = fast_s_mp_sqr(a,b);
       DECFUNC();
       return res;
    }  
-
+   
    pa = a->used;
    if ((res = mp_init_size(&t, pa + pa + 1)) != MP_OKAY) {
       DECFUNC();
@@ -1385,10 +1409,9 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
    if (mp_lshd(&x1y1, B*2) != MP_OKAY) goto X1Y1;                 /* x1y1 = x1y1 << 2*B */
 
    if (mp_add(&x0y0, &t1, &t1) != MP_OKAY) goto X1Y1;             /* t1 = x0y0 + t1 */
-   if (mp_add(&t1, &x1y1, &t1) != MP_OKAY) goto X1Y1;             /* t1 = x0y0 + t1 + x1y1 */
+   if (mp_add(&t1, &x1y1, c) != MP_OKAY) goto X1Y1;               /* t1 = x0y0 + t1 + x1y1 */
 
    err = MP_OKAY;
-   mp_exch(&t1, c);
 
 X1Y1: mp_clear(&x1y1);
 X0Y0: mp_clear(&x0y0);
@@ -1426,7 +1449,7 @@ int mp_mul(mp_int *a, mp_int *b, mp_int *c)
 static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
 {
    mp_int x0, x1, t1, t2, x0x0, x1x1;
-   int B, err;
+   int B, err, x;
    
    REGFUNC("mp_karatsuba_sqr");
    VERIFY(a);
@@ -1441,8 +1464,8 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
    B = B/2;
 
    /* init copy all the temps */
-   if (mp_init_copy(&x0, a) != MP_OKAY) goto ERR;
-   if (mp_init_copy(&x1, a) != MP_OKAY) goto X0;
+   if (mp_init_size(&x0, B) != MP_OKAY) goto ERR;
+   if (mp_init_size(&x1, a->used - B) != MP_OKAY) goto X0;
 
    /* init temps */
    if (mp_init(&t1) != MP_OKAY)         goto X1;
@@ -1451,16 +1474,27 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
    if (mp_init(&x1x1) != MP_OKAY)       goto X0X0;
 
    /* now shift the digits */
-   mp_mod_2d(&x0, B*DIGIT_BIT, &x0);
-   mp_rshd(&x1, B);
+   for (x = 0; x < B; x++) {
+       x0.dp[x] = a->dp[x];
+   }
 
+   for (x = B; x < a->used; x++) {
+       x1.dp[x-B] = a->dp[x];
+   }
+   
+   x0.used = B;
+   x1.used = a->used - B;
+   
+   mp_clamp(&x0);
+   mp_clamp(&x1);
+   
    /* now calc the products x0*x0 and x1*x1 */
-   if (s_mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1;                /* x0x0 = x0*x0 */
-   if (s_mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1;                /* x1x1 = x1*x1 */
+   if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1;                /* x0x0 = x0*x0 */
+   if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1;                /* x1x1 = x1*x1 */
 
    /* now calc x1-x0 and y1-y0 */
    if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1;               /* t1 = x1 - x0 */
-   if (s_mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1;                  /* t1 = (x1 - x0) * (y1 - y0) */
+   if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1;                  /* t1 = (x1 - x0) * (y1 - y0) */
 
    /* add x0y0 */
    if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1;           /* t2 = x0y0 + x1y1 */
@@ -1471,10 +1505,9 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
    if (mp_lshd(&x1x1, B*2) != MP_OKAY) goto X1X1;                 /* x1y1 = x1y1 << 2*B */
 
    if (mp_add(&x0x0, &t1, &t1) != MP_OKAY) goto X1X1;             /* t1 = x0y0 + t1 */
-   if (mp_add(&t1, &x1x1, &t1) != MP_OKAY) goto X1X1;             /* t1 = x0y0 + t1 + x1y1 */
+   if (mp_add(&t1, &x1x1, b) != MP_OKAY) goto X1X1;               /* t1 = x0y0 + t1 + x1y1 */
 
    err = MP_OKAY;
-   mp_exch(&t1, b);
    
 X1X1: mp_clear(&x1x1);
 X0X0: mp_clear(&x0x0);
@@ -2784,6 +2817,102 @@ __M  :
    return err;
 }
 
+/* find the n'th root of an integer 
+ *
+ * Result found such that (c)^b <= a and (c+1)^b > a 
+ */
+int mp_n_root(mp_int *a, mp_digit b, mp_int *c)
+{
+   mp_int t1, t2, t3;
+   int res, neg;
+   
+   /* input must be positive if b is even*/
+   if ((b&1) == 0 && a->sign == MP_NEG) {
+      return MP_VAL;
+   }
+   
+   if ((res = mp_init(&t1)) != MP_OKAY) {
+      return res;
+   }
+   
+   if ((res = mp_init(&t2)) != MP_OKAY) {
+      goto __T1;
+   }
+   
+   if ((res = mp_init(&t3)) != MP_OKAY) {
+      goto __T2;
+   }
+
+   /* if a is negative fudge the sign but keep track */
+   neg     = a->sign;
+   a->sign = MP_ZPOS;
+
+   /* t2 = a */
+   if ((res = mp_copy(a, &t2)) != MP_OKAY) {
+      goto __T3;
+   }
+  
+   do {
+      /* t1 = t2 */
+      if ((res = mp_copy(&t2, &t1)) != MP_OKAY) {
+         goto __T3;
+      }
+
+      /* t2 = t1 - ((t1^b - a) / (b * t1^(b-1))) */
+      if ((res = mp_expt_d(&t1, b-1, &t3)) != MP_OKAY) {            /* t3 = t1^(b-1) */
+         goto __T3;
+      }
+
+      /* numerator */
+      if ((res = mp_mul(&t3, &t1, &t2)) != MP_OKAY) {               /* t2 = t1^b */
+         goto __T3;
+      }
+      
+      if ((res = mp_sub(&t2, a, &t2)) != MP_OKAY) {                 /* t2 = t1^b - a */
+         goto __T3;
+      }
+
+      if ((res = mp_mul_d(&t3, b, &t3)) != MP_OKAY) {               /* t3 = t1^(b-1) * b  */
+         goto __T3;
+      }
+      
+      if ((res = mp_div(&t2, &t3, &t3, NULL)) != MP_OKAY) {         /* t3 = (t1^b - a)/(b * t1^(b-1)) */
+         goto __T3;
+      }
+      
+      if ((res = mp_sub(&t1, &t3, &t2)) != MP_OKAY) {
+         goto __T3;
+      }
+   } while (mp_cmp(&t1, &t2) != MP_EQ);
+   
+   /* result can be at most off by one so check */
+   if ((res = mp_expt_d(&t1, b, &t2)) != MP_OKAY) {
+      goto __T3;
+   }
+   
+   if (mp_cmp(&t2, a) == MP_GT) {
+      if ((res = mp_sub_d(&t1, 1, &t1)) != MP_OKAY) {
+         goto __T3;
+      }
+   }
+   
+   /* reset the sign of a first */
+   a->sign = neg;
+   
+   /* set the result */
+   mp_exch(&t1, c);
+   
+   /* set the sign of the result */
+   c->sign = neg;   
+   
+   res = MP_OKAY;
+   
+__T3:  mp_clear(&t3);
+__T2:  mp_clear(&t2);
+__T1:  mp_clear(&t1);
+   return res;
+}
+
 /* --> radix conversion <-- */
 /* reverse an array, used for radix code */
 static void reverse(unsigned char *s, int len)
diff --git a/bn.h b/bn.h
index 624c50d..3975727 100644
--- a/bn.h
+++ b/bn.h
@@ -233,6 +233,15 @@ int mp_gcd(mp_int *a, mp_int *b, mp_int *c);
 /* c = [a, b] or (a*b)/(a, b) */
 int mp_lcm(mp_int *a, mp_int *b, mp_int *c);
 
+/* finds one of the b'th root of a, such that |c|^b <= |a| 
+ *
+ * returns error if a < 0 and b is even
+ */
+int mp_n_root(mp_int *a, mp_digit b, mp_int *c);
+
+/* shortcut for square root */
+#define mp_sqrt(a, b) mp_n_root(a, 2, b)
+
 /* used to setup the Barrett reduction for a given modulus b */
 int mp_reduce_setup(mp_int *a, mp_int *b);
 
diff --git a/bn.pdf b/bn.pdf
index d517dc4..54bde38 100644
Binary files a/bn.pdf and b/bn.pdf differ
diff --git a/bn.tex b/bn.tex
index 069f76b..7c02e2c 100644
--- a/bn.tex
+++ b/bn.tex
@@ -1,7 +1,7 @@
 \documentclass{article}
 \begin{document}
 
-\title{LibTomMath v0.06 \\ A Free Multiple Precision Integer Library}
+\title{LibTomMath v0.07 \\ A Free Multiple Precision Integer Library}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
@@ -187,17 +187,6 @@ int mp_mul_2(mp_int *a, mp_int *b);
 int mp_mod_2d(mp_int *a, int b, mp_int *c);
 \end{verbatim}
 
-Both the \textbf{mp\_rshd} and \textbf{mp\_lshd} functions provide shifting by whole digits.  For example, 
-mp\_rshd($x$, $n$) is the same as $x \leftarrow \lfloor x / \beta^n \rfloor$ while mp\_lshd($x$, $n$) is equivalent
-to $x \leftarrow x \cdot \beta^n$.  Both functions are extremely fast as they merely copy digits within the array.  
-
-Similarly the \textbf{mp\_div\_2d} and \textbf{mp\_mul\_2d} functions provide shifting but allow any bit count to 
-be specified.  For example, mp\_div\_2d($x$, $n$, $y$) is the same as $y =\lfloor x / 2^n \rfloor$ while 
-mp\_mul\_2d($x$, $n$, $y$) is the same as $y = x \cdot 2^n$.  The \textbf{mp\_div\_2} and \textbf{mp\_mul\_2} 
-functions are legacy functions that merely shift right or left one bit respectively.  The \textbf{mp\_mod\_2d} function
-reduces an integer mod a power of two.  For example, mp\_mod\_2d($x$, $n$, $y$) is the same as 
-$y \equiv x \mbox{ (mod }2^n\mbox{)}$.
-
 \subsection{Basic Arithmetic}
 
 Next are the class of functions which provide basic arithmetic.
@@ -234,17 +223,7 @@ int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
 int mp_mod(mp_int *a, mp_int *b, mp_int *c);
 \end{verbatim}
 
-The \textbf{mp\_cmp} will compare two integers.  It will return \textbf{MP\_LT} if the first parameter is less than
-the second, \textbf{MP\_GT} if it is greater or \textbf{MP\_EQ} if they are equal.  These constants are the same as from
-MPI.
-
-The \textbf{mp\_add}, \textbf{mp\_sub}, \textbf{mp\_mul}, \textbf{mp\_div}, \textbf{mp\_sqr} and \textbf{mp\_mod} are all
-fairly straight forward to understand.  Note that in mp\_div either $c$ (the quotient) or $d$ (the remainder) can be 
-passed as NULL to ignore it.  For example, if you only want the quotient $z = \lfloor x/y \rfloor$ then a call such as 
-mp\_div(\&x, \&y, \&z, NULL) is acceptable.
-
-There is a related class of ``single digit'' functions that are like the above except they use a digit as the second
-operand.
+\subsection{Single Digit Functions}
 
 \begin{verbatim}
 /* compare against a single digit */
@@ -296,14 +275,13 @@ int mp_gcd(mp_int *a, mp_int *b, mp_int *c);
 /* c = [a, b] or (a*b)/(a, b) */
 int mp_lcm(mp_int *a, mp_int *b, mp_int *c);
 
+/* find the b'th root of a  */
+int mp_n_root(mp_int *a, mp_digit b, mp_int *c);
+
 /* d = a^b (mod c) */
 int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
 \end{verbatim}
 
-These are all fairly simple to understand.  The \textbf{mp\_invmod} is a modular multiplicative inverse.  That is it
-stores in the third parameter an integer such that $ac \equiv 1 \mbox{ (mod }b\mbox{)}$ provided such integer exists.  If
-there is no such integer the function returns \textbf{MP\_VAL}.
-
 \subsection{Radix Conversions}
 To read or store integers in other formats there are the following functions.
 
@@ -432,7 +410,7 @@ when $b \le 0$, in theory the routine will still give a properly congruent answe
 
 This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time.
 
-\subsection{Modular Arithmetic}
+\subsection{Number Theoretic Functions}
 
 \subsubsection{mp\_addmod, mp\_submod, mp\_mulmod, mp\_sqrmod}
 These functions take the time of their host function plus the time it takes to perform a division.  For example, 
@@ -445,6 +423,41 @@ Also note that these functions use mp\_mod which means the result are guaranteed
 This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$.  When
 $b$ is odd a ``fast'' variant is used which finds the inverse twice as fast.  
 
+\subsubsection{mp\_gcd(mp\_int *a, mp\_int *b, mp\_int *c)}
+Finds the greatest common divisor of both $a$ and $b$ and places the result in $c$.  Will work with either positive
+or negative inputs.  
+
+Functions requires no additional memory and approximately $O(N \cdot log(N))$ time.
+
+\subsubsection{mp\_lcm(mp\_int *a, mp\_int *b, mp\_int *c)}
+Finds the least common multiple of both $a$ and $b$ and places the result in $c$.  Will work with either positive
+or negative inputs.  This is calculated by dividing the product of $a$ and $b$ by the greatest common divisor of 
+both.  
+
+Functions requires no additional memory and approximately $O(4 \cdot N^2)$ time.
+
+\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int c)}
+Finds the $b$'th root of $a$ and stores it in $b$.  The roots are found such that $\vert c \vert^b \le \vert a \vert$.  
+Uses the Newton approximation approach which means it converges in $O(log \beta^N)$ time to a final result.  Each iteration
+requires $b$ multiplications and one division for a total work of $O(6N^2 \cdot log \beta^N) = O(6N^3 \cdot log \beta)$.
+
+If the input $a$ is negative and $b$ is even the function returns an error.  Otherwise the function will return a root
+that has a sign that agrees with the sign of $a$.
+
+\subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
+Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm.  For an $\alpha$-bit
+exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + k$ multiplications.  The value of $k$ is
+chosen to minimize the number of multiplications required for a given value of $\alpha$.  Barrett reductions are used
+to reduce the squared or multiplied temporary results modulo $c$.  A Barrett reduction requires one division that is
+performed only and two half multipliers of $N$ digit numbers resulting in approximation $O((N^2)/2)$ work.  
+
+Let $\gamma = \lfloor \alpha/k \rfloor + k$ represent the total multiplications.  The total work of a exponentiation is
+therefore, $O(3 \cdot N^2 + (\alpha + \gamma) \cdot ((N^2)/2) + \alpha \cdot ((N^2 + N)/2) + \gamma \cdot N^2)$ which 
+simplies to $O(3 \cdot N^2 + \gamma N^2 + \alpha (N^2 + (N/2)))$.  The bulk of the time is spent in the Barrett 
+reductions and the squaring algorithms.  Since $\gamma < \alpha$ it makes sense to optimize first the Barrett and
+squaring routines first.  Significant improvements in the future will most likely stem from optimizing these instead
+of optimizing the multipliers.
+
 \section{Timing Analysis}
 \subsection{Observed Timings}
 A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification).  The
@@ -467,27 +480,27 @@ Inversion & 1024 & 5,237,957  & 1,054,158   \\
 Inversion & 2048 & 17,871,944  & 3,459,683   \\
 Inversion & 4096 & 66,610,468  & 11,834,556   \\
 \hline
-Multiply & 128 & 1,426   & 828    \\
-Multiply & 256 & 2,551   & 1,393    \\
-Multiply & 512 & 7,913   & 2,926    \\
-Multiply & 1024 & 28,496   & 8,620  \\
-Multiply & 2048 & 109,897   & 28,967    \\
-Multiply & 4096 & 469,970   & 105,387    \\
+Multiply & 128 & 1,426   & 451     \\
+Multiply & 256 & 2,551   & 958     \\
+Multiply & 512 & 7,913   & 2,476     \\
+Multiply & 1024 & 28,496   & 7,927   \\
+Multiply & 2048 & 109,897   & 282,24     \\
+Multiply & 4096 & 469,970   & 104,681     \\
 \hline 
-Square & 128 & 1,319   & 869    \\
-Square & 256 & 1,776   & 1,362    \\
-Square & 512 & 5,399  & 2,571   \\
-Square & 1024 & 18,991  & 6,332    \\
-Square & 2048 & 72,126  & 18,426   \\
-Square & 4096 & 306,269  & 74,908 \\
+Square & 128 & 1,319   & 511     \\
+Square & 256 & 1,776   & 947     \\
+Square & 512 & 5,399  & 2,153    \\
+Square & 1024 & 18,991  & 5,733     \\
+Square & 2048 & 72,126  & 17,621    \\
+Square & 4096 & 306,269  & 70,168  \\
 \hline 
-Exptmod & 512 & 32,021,586  & 5,696,459  \\
-Exptmod & 768 & 97,595,492  & 12,428,274   \\
-Exptmod & 1024 & 223,302,532  & 22,834,316   \\
-Exptmod & 2048 & 1,682,223,369   & 119,888,049    \\
-Exptmod & 2560 & 3,268,615,571   & 250,901,263     \\
-Exptmod & 3072 & 5,597,240,141   & 391,716,431    \\
-Exptmod & 4096 & 13,347,270,891   & 814,429,647    
+Exptmod & 512 & 32,021,586  & 4,472,406   \\
+Exptmod & 768 & 97,595,492  & 10,427,845    \\
+Exptmod & 1024 & 223,302,532  & 20,561,722    \\
+Exptmod & 2048 & 1,682,223,369   & 113,978,803     \\
+Exptmod & 2560 & 3,268,615,571   & 236,650,133      \\
+Exptmod & 3072 & 5,597,240,141   & 373,449,291     \\
+Exptmod & 4096 & 13,347,270,891   & 787,568,457     
 
 \end{tabular}
 \end{center}
diff --git a/changes.txt b/changes.txt
index e5b6294..4737590 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,3 +1,8 @@
+Jan 1st, 2003
+v0.07  -- Removed alot of heap operations from core functions to speed them up
+       -- Added a root finding function [and mp_sqrt macro like from MPI]
+       -- Added more to manual 
+
 Dec 31st, 2002
 v0.06  -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc...
        -- Cleaned up the header a bit more
diff --git a/demo.c b/demo.c
index 0a1943f..ae35964 100644
--- a/demo.c
+++ b/demo.c
@@ -105,10 +105,9 @@ int main(void)
    mp_sub_d(&a, 1, &c);
    mp_exptmod(&b, &c, &a, &d);
    mp_toradix(&d, buf, 10);
-   printf("b^p-1 == %s\n", buf); 
+   printf("b^p-1 == %s\n", buf);     
 
 #ifdef TIMER   
-
       mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
       mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
       while (a.used * DIGIT_BIT < 8192) {
@@ -156,9 +155,6 @@ int main(void)
       printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
       mp_copy(&b, &a);
    }
-
-  
- 
    
    {
       char *primes[] = {
@@ -177,6 +173,7 @@ int main(void)
       for (rr = 0; rr < mp_count_bits(&a); rr++) {
          mp_mul_2d(&b, 1, &b);
          b.dp[0] |= lbit();
+         b.used  += 1;
       }
       mp_sub_d(&a, 1, &c);
       mp_mod(&b, &c, &b);
@@ -198,7 +195,7 @@ int main(void)
       printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)35));
    }
    }
-   
+
    mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
    mp_read_radix(&b, "234892374891378913789237289378973232333", 10);
    while (a.used * DIGIT_BIT < 8192) {
@@ -223,6 +220,19 @@ int main(void)
 
    inv_n = expt_n = lcm_n = gcd_n = add_n = sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = 0;   
    for (;;) {
+   
+       /* randomly clear and re-init one variable, this has the affect of triming the alloc space */
+       switch (abs(rand()) % 7) {
+           case 0:  mp_clear(&a); mp_init(&a); break;
+           case 1:  mp_clear(&b); mp_init(&b); break;
+           case 2:  mp_clear(&c); mp_init(&c); break;
+           case 3:  mp_clear(&d); mp_init(&d); break;
+           case 4:  mp_clear(&e); mp_init(&e); break;
+           case 5:  mp_clear(&f); mp_init(&f); break;
+           case 6:  break; /* don't clear any */
+       }
+   
+   
        printf("%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%5d\r", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n, _ifuncs);
        fgets(cmd, 4095, stdin);
        cmd[strlen(cmd)-1] = 0;
diff --git a/makefile b/makefile
index ec94b70..95c6465 100644
--- a/makefile
+++ b/makefile
@@ -1,7 +1,7 @@
 CC = gcc
 CFLAGS  += -DDEBUG -Wall -W -O3 -fomit-frame-pointer -funroll-loops 
 
-VERSION=0.06
+VERSION=0.07
 
 default: test
 
diff --git a/mtest/mtest.c b/mtest/mtest.c
index a32e0e5..576feb2 100644
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@@ -41,7 +41,7 @@ void rand_num(mp_int *a)
    unsigned char buf[512];
 
 top:
-   size = 1 + ((fgetc(rng)*fgetc(rng)) % 32);
+   size = 1 + ((fgetc(rng)*fgetc(rng)) % 512);
    buf[0] = (fgetc(rng)&1)?1:0;
    fread(buf+1, 1, size, rng);
    for (n = 0; n < size; n++) {
@@ -57,7 +57,7 @@ void rand_num2(mp_int *a)
    unsigned char buf[512];
 
 top:
-   size = 1 + ((fgetc(rng)*fgetc(rng)) % 32);
+   size = 1 + ((fgetc(rng)*fgetc(rng)) % 512);
    buf[0] = (fgetc(rng)&1)?1:0;
    fread(buf+1, 1, size, rng);
    for (n = 0; n < size; n++) {