diff --git a/b.bat b/b.bat index 1d0b900..32dee86 100644 --- a/b.bat +++ b/b.bat @@ -1,3 +1,3 @@ nasm -f coff timer.asm gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo -gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo +rem gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo diff --git a/bn.c b/bn.c index cf8a391..6d258d0 100644 --- a/bn.c +++ b/bn.c @@ -849,8 +849,7 @@ static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c) */ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs) { - mp_int t; - int res, pa, pb, ix, iy; + int olduse, res, pa, pb, ix, iy; mp_word W[512], *_W; mp_digit tmpx, *tmpy; @@ -859,11 +858,12 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs) VERIFY(b); VERIFY(c); - if ((res = mp_init_size(&t, digs)) != MP_OKAY) { - DECFUNC(); - return res; + if (c->alloc < digs) { + if ((res = mp_grow(c, digs)) != MP_OKAY) { + DECFUNC(); + return res; + } } - t.used = digs; /* clear temp buf (the columns) */ memset(W, 0, digs*sizeof(mp_word)); @@ -893,6 +893,11 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs) } } + /* setup dest */ + olduse = c->used; + c->used = digs; + + /* At this point W[] contains the sums of each column. To get the * correct result we must take the extra bits from each column and * carry them down @@ -904,14 +909,17 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs) * this is slower but on most cryptographic size numbers it is faster. */ for (ix = 1; ix < digs; ix++) { - W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT)); - t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK); + W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT)); + c->dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK); + } + c->dp[digs-1] = W[digs-1] & ((mp_word)MP_MASK); + + /* clear unused */ + for (ix = c->used; ix < olduse; ix++) { + c->dp[ix] = 0; } - t.dp[digs-1] = W[digs-1] & ((mp_word)MP_MASK); - mp_clamp(&t); - mp_exch(&t, c); - mp_clear(&t); + mp_clamp(c); DECFUNC(); return MP_OKAY; } @@ -993,8 +1001,7 @@ static int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs) */ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs) { - mp_int t; - int res, pa, pb, ix, iy; + int oldused, newused, res, pa, pb, ix, iy; mp_word W[512], *_W; mp_digit tmpx, *tmpy; @@ -1003,11 +1010,13 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs) VERIFY(b); VERIFY(c); - if ((res = mp_init_size(&t, a->used + b->used + 1)) != MP_OKAY) { - DECFUNC(); - return res; + newused = a->used + b->used + 1; + if (c->alloc < newused) { + if ((res = mp_grow(c, newused)) != MP_OKAY) { + DECFUNC(); + return res; + } } - t.used = a->used + b->used + 1; /* like the other comba method we compute the columns first */ pa = a->used; @@ -1025,17 +1034,21 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs) } } + /* setup dest */ + oldused = c->used; + c->used = newused; + /* now convert the array W downto what we need */ for (ix = digs+1; ix < (pa+pb+1); ix++) { - W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT)); - t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK); + W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT)); + c->dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK); } - t.dp[(pa+pb+1)-1] = W[(pa+pb+1)-1] & ((mp_word)MP_MASK); + c->dp[(pa+pb+1)-1] = W[(pa+pb+1)-1] & ((mp_word)MP_MASK); - - mp_clamp(&t); - mp_exch(&t, c); - mp_clear(&t); + for (ix = c->used; ix < oldused; ix++) { + c->dp[ix] = 0; + } + mp_clamp(c); DECFUNC(); return MP_OKAY; } @@ -1106,8 +1119,7 @@ static int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs) */ static int fast_s_mp_sqr(mp_int *a, mp_int *b) { - mp_int t; - int res, ix, iy, pa; + int olduse, newused, res, ix, iy, pa; mp_word W2[512], W[512], *_W; mp_digit tmpx, *tmpy; @@ -1116,11 +1128,13 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b) VERIFY(b); pa = a->used; - if ((res = mp_init_size(&t, pa + pa + 1)) != MP_OKAY) { - DECFUNC(); - return res; - } - t.used = pa + pa + 1; + newused = pa + pa + 1; + if (b->alloc < newused) { + if ((res = mp_grow(b, newused)) != MP_OKAY) { + DECFUNC(); + return res; + } + } /* zero temp buffer (columns) */ memset(W, 0, (pa+pa+1)*sizeof(mp_word)); @@ -1144,19 +1158,29 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b) /* double first value, since the inner products are half of what they should be */ W[0] += W[0] + W2[0]; + /* setup dest */ + olduse = b->used; + b->used = newused; + /* now compute digits */ for (ix = 1; ix < (pa+pa+1); ix++) { /* double/add next digit */ - W[ix] += W[ix] + W2[ix]; - - W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT)); - t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK); + W[ix] += W[ix] + W2[ix]; + + W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT)); + b->dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK); } - t.dp[(pa+pa+1)-1] = W[(pa+pa+1)-1] & ((mp_word)MP_MASK); + b->dp[(pa+pa+1)-1] = W[(pa+pa+1)-1] & ((mp_word)MP_MASK); - mp_clamp(&t); - mp_exch(&t, b); - mp_clear(&t); + /* clear high */ + for (ix = b->used; ix < olduse; ix++) { + b->dp[ix] = 0; + } + + /* fix the sign (since we no longer make a fresh temp) */ + b->sign = MP_ZPOS; + + mp_clamp(b); DECFUNC(); return MP_OKAY; } @@ -1173,13 +1197,13 @@ static int s_mp_sqr(mp_int *a, mp_int *b) VERIFY(a); VERIFY(b); - /* can we use the fast multiplier? */ + /* can we use the fast multiplier? */ if (((a->used * 2 + 1) < 512) && a->used < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT) - 1))) { res = fast_s_mp_sqr(a,b); DECFUNC(); return res; } - + pa = a->used; if ((res = mp_init_size(&t, pa + pa + 1)) != MP_OKAY) { DECFUNC(); @@ -1385,10 +1409,9 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c) if (mp_lshd(&x1y1, B*2) != MP_OKAY) goto X1Y1; /* x1y1 = x1y1 << 2*B */ if (mp_add(&x0y0, &t1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + t1 */ - if (mp_add(&t1, &x1y1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + t1 + x1y1 */ + if (mp_add(&t1, &x1y1, c) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + t1 + x1y1 */ err = MP_OKAY; - mp_exch(&t1, c); X1Y1: mp_clear(&x1y1); X0Y0: mp_clear(&x0y0); @@ -1426,7 +1449,7 @@ int mp_mul(mp_int *a, mp_int *b, mp_int *c) static int mp_karatsuba_sqr(mp_int *a, mp_int *b) { mp_int x0, x1, t1, t2, x0x0, x1x1; - int B, err; + int B, err, x; REGFUNC("mp_karatsuba_sqr"); VERIFY(a); @@ -1441,8 +1464,8 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b) B = B/2; /* init copy all the temps */ - if (mp_init_copy(&x0, a) != MP_OKAY) goto ERR; - if (mp_init_copy(&x1, a) != MP_OKAY) goto X0; + if (mp_init_size(&x0, B) != MP_OKAY) goto ERR; + if (mp_init_size(&x1, a->used - B) != MP_OKAY) goto X0; /* init temps */ if (mp_init(&t1) != MP_OKAY) goto X1; @@ -1451,16 +1474,27 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b) if (mp_init(&x1x1) != MP_OKAY) goto X0X0; /* now shift the digits */ - mp_mod_2d(&x0, B*DIGIT_BIT, &x0); - mp_rshd(&x1, B); + for (x = 0; x < B; x++) { + x0.dp[x] = a->dp[x]; + } + for (x = B; x < a->used; x++) { + x1.dp[x-B] = a->dp[x]; + } + + x0.used = B; + x1.used = a->used - B; + + mp_clamp(&x0); + mp_clamp(&x1); + /* now calc the products x0*x0 and x1*x1 */ - if (s_mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */ - if (s_mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1; /* x1x1 = x1*x1 */ + if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */ + if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1; /* x1x1 = x1*x1 */ /* now calc x1-x0 and y1-y0 */ if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1; /* t1 = x1 - x0 */ - if (s_mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1; /* t1 = (x1 - x0) * (y1 - y0) */ + if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1; /* t1 = (x1 - x0) * (y1 - y0) */ /* add x0y0 */ if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1; /* t2 = x0y0 + x1y1 */ @@ -1471,10 +1505,9 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b) if (mp_lshd(&x1x1, B*2) != MP_OKAY) goto X1X1; /* x1y1 = x1y1 << 2*B */ if (mp_add(&x0x0, &t1, &t1) != MP_OKAY) goto X1X1; /* t1 = x0y0 + t1 */ - if (mp_add(&t1, &x1x1, &t1) != MP_OKAY) goto X1X1; /* t1 = x0y0 + t1 + x1y1 */ + if (mp_add(&t1, &x1x1, b) != MP_OKAY) goto X1X1; /* t1 = x0y0 + t1 + x1y1 */ err = MP_OKAY; - mp_exch(&t1, b); X1X1: mp_clear(&x1x1); X0X0: mp_clear(&x0x0); @@ -2784,6 +2817,102 @@ __M : return err; } +/* find the n'th root of an integer + * + * Result found such that (c)^b <= a and (c+1)^b > a + */ +int mp_n_root(mp_int *a, mp_digit b, mp_int *c) +{ + mp_int t1, t2, t3; + int res, neg; + + /* input must be positive if b is even*/ + if ((b&1) == 0 && a->sign == MP_NEG) { + return MP_VAL; + } + + if ((res = mp_init(&t1)) != MP_OKAY) { + return res; + } + + if ((res = mp_init(&t2)) != MP_OKAY) { + goto __T1; + } + + if ((res = mp_init(&t3)) != MP_OKAY) { + goto __T2; + } + + /* if a is negative fudge the sign but keep track */ + neg = a->sign; + a->sign = MP_ZPOS; + + /* t2 = a */ + if ((res = mp_copy(a, &t2)) != MP_OKAY) { + goto __T3; + } + + do { + /* t1 = t2 */ + if ((res = mp_copy(&t2, &t1)) != MP_OKAY) { + goto __T3; + } + + /* t2 = t1 - ((t1^b - a) / (b * t1^(b-1))) */ + if ((res = mp_expt_d(&t1, b-1, &t3)) != MP_OKAY) { /* t3 = t1^(b-1) */ + goto __T3; + } + + /* numerator */ + if ((res = mp_mul(&t3, &t1, &t2)) != MP_OKAY) { /* t2 = t1^b */ + goto __T3; + } + + if ((res = mp_sub(&t2, a, &t2)) != MP_OKAY) { /* t2 = t1^b - a */ + goto __T3; + } + + if ((res = mp_mul_d(&t3, b, &t3)) != MP_OKAY) { /* t3 = t1^(b-1) * b */ + goto __T3; + } + + if ((res = mp_div(&t2, &t3, &t3, NULL)) != MP_OKAY) { /* t3 = (t1^b - a)/(b * t1^(b-1)) */ + goto __T3; + } + + if ((res = mp_sub(&t1, &t3, &t2)) != MP_OKAY) { + goto __T3; + } + } while (mp_cmp(&t1, &t2) != MP_EQ); + + /* result can be at most off by one so check */ + if ((res = mp_expt_d(&t1, b, &t2)) != MP_OKAY) { + goto __T3; + } + + if (mp_cmp(&t2, a) == MP_GT) { + if ((res = mp_sub_d(&t1, 1, &t1)) != MP_OKAY) { + goto __T3; + } + } + + /* reset the sign of a first */ + a->sign = neg; + + /* set the result */ + mp_exch(&t1, c); + + /* set the sign of the result */ + c->sign = neg; + + res = MP_OKAY; + +__T3: mp_clear(&t3); +__T2: mp_clear(&t2); +__T1: mp_clear(&t1); + return res; +} + /* --> radix conversion <-- */ /* reverse an array, used for radix code */ static void reverse(unsigned char *s, int len) diff --git a/bn.h b/bn.h index 624c50d..3975727 100644 --- a/bn.h +++ b/bn.h @@ -233,6 +233,15 @@ int mp_gcd(mp_int *a, mp_int *b, mp_int *c); /* c = [a, b] or (a*b)/(a, b) */ int mp_lcm(mp_int *a, mp_int *b, mp_int *c); +/* finds one of the b'th root of a, such that |c|^b <= |a| + * + * returns error if a < 0 and b is even + */ +int mp_n_root(mp_int *a, mp_digit b, mp_int *c); + +/* shortcut for square root */ +#define mp_sqrt(a, b) mp_n_root(a, 2, b) + /* used to setup the Barrett reduction for a given modulus b */ int mp_reduce_setup(mp_int *a, mp_int *b); diff --git a/bn.pdf b/bn.pdf index d517dc4..54bde38 100644 Binary files a/bn.pdf and b/bn.pdf differ diff --git a/bn.tex b/bn.tex index 069f76b..7c02e2c 100644 --- a/bn.tex +++ b/bn.tex @@ -1,7 +1,7 @@ \documentclass{article} \begin{document} -\title{LibTomMath v0.06 \\ A Free Multiple Precision Integer Library} +\title{LibTomMath v0.07 \\ A Free Multiple Precision Integer Library} \author{Tom St Denis \\ tomstdenis@iahu.ca} \maketitle \newpage @@ -187,17 +187,6 @@ int mp_mul_2(mp_int *a, mp_int *b); int mp_mod_2d(mp_int *a, int b, mp_int *c); \end{verbatim} -Both the \textbf{mp\_rshd} and \textbf{mp\_lshd} functions provide shifting by whole digits. For example, -mp\_rshd($x$, $n$) is the same as $x \leftarrow \lfloor x / \beta^n \rfloor$ while mp\_lshd($x$, $n$) is equivalent -to $x \leftarrow x \cdot \beta^n$. Both functions are extremely fast as they merely copy digits within the array. - -Similarly the \textbf{mp\_div\_2d} and \textbf{mp\_mul\_2d} functions provide shifting but allow any bit count to -be specified. For example, mp\_div\_2d($x$, $n$, $y$) is the same as $y =\lfloor x / 2^n \rfloor$ while -mp\_mul\_2d($x$, $n$, $y$) is the same as $y = x \cdot 2^n$. The \textbf{mp\_div\_2} and \textbf{mp\_mul\_2} -functions are legacy functions that merely shift right or left one bit respectively. The \textbf{mp\_mod\_2d} function -reduces an integer mod a power of two. For example, mp\_mod\_2d($x$, $n$, $y$) is the same as -$y \equiv x \mbox{ (mod }2^n\mbox{)}$. - \subsection{Basic Arithmetic} Next are the class of functions which provide basic arithmetic. @@ -234,17 +223,7 @@ int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d); int mp_mod(mp_int *a, mp_int *b, mp_int *c); \end{verbatim} -The \textbf{mp\_cmp} will compare two integers. It will return \textbf{MP\_LT} if the first parameter is less than -the second, \textbf{MP\_GT} if it is greater or \textbf{MP\_EQ} if they are equal. These constants are the same as from -MPI. - -The \textbf{mp\_add}, \textbf{mp\_sub}, \textbf{mp\_mul}, \textbf{mp\_div}, \textbf{mp\_sqr} and \textbf{mp\_mod} are all -fairly straight forward to understand. Note that in mp\_div either $c$ (the quotient) or $d$ (the remainder) can be -passed as NULL to ignore it. For example, if you only want the quotient $z = \lfloor x/y \rfloor$ then a call such as -mp\_div(\&x, \&y, \&z, NULL) is acceptable. - -There is a related class of ``single digit'' functions that are like the above except they use a digit as the second -operand. +\subsection{Single Digit Functions} \begin{verbatim} /* compare against a single digit */ @@ -296,14 +275,13 @@ int mp_gcd(mp_int *a, mp_int *b, mp_int *c); /* c = [a, b] or (a*b)/(a, b) */ int mp_lcm(mp_int *a, mp_int *b, mp_int *c); +/* find the b'th root of a */ +int mp_n_root(mp_int *a, mp_digit b, mp_int *c); + /* d = a^b (mod c) */ int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); \end{verbatim} -These are all fairly simple to understand. The \textbf{mp\_invmod} is a modular multiplicative inverse. That is it -stores in the third parameter an integer such that $ac \equiv 1 \mbox{ (mod }b\mbox{)}$ provided such integer exists. If -there is no such integer the function returns \textbf{MP\_VAL}. - \subsection{Radix Conversions} To read or store integers in other formats there are the following functions. @@ -432,7 +410,7 @@ when $b \le 0$, in theory the routine will still give a properly congruent answe This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time. -\subsection{Modular Arithmetic} +\subsection{Number Theoretic Functions} \subsubsection{mp\_addmod, mp\_submod, mp\_mulmod, mp\_sqrmod} These functions take the time of their host function plus the time it takes to perform a division. For example, @@ -445,6 +423,41 @@ Also note that these functions use mp\_mod which means the result are guaranteed This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$. When $b$ is odd a ``fast'' variant is used which finds the inverse twice as fast. +\subsubsection{mp\_gcd(mp\_int *a, mp\_int *b, mp\_int *c)} +Finds the greatest common divisor of both $a$ and $b$ and places the result in $c$. Will work with either positive +or negative inputs. + +Functions requires no additional memory and approximately $O(N \cdot log(N))$ time. + +\subsubsection{mp\_lcm(mp\_int *a, mp\_int *b, mp\_int *c)} +Finds the least common multiple of both $a$ and $b$ and places the result in $c$. Will work with either positive +or negative inputs. This is calculated by dividing the product of $a$ and $b$ by the greatest common divisor of +both. + +Functions requires no additional memory and approximately $O(4 \cdot N^2)$ time. + +\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int c)} +Finds the $b$'th root of $a$ and stores it in $b$. The roots are found such that $\vert c \vert^b \le \vert a \vert$. +Uses the Newton approximation approach which means it converges in $O(log \beta^N)$ time to a final result. Each iteration +requires $b$ multiplications and one division for a total work of $O(6N^2 \cdot log \beta^N) = O(6N^3 \cdot log \beta)$. + +If the input $a$ is negative and $b$ is even the function returns an error. Otherwise the function will return a root +that has a sign that agrees with the sign of $a$. + +\subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)} +Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm. For an $\alpha$-bit +exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + k$ multiplications. The value of $k$ is +chosen to minimize the number of multiplications required for a given value of $\alpha$. Barrett reductions are used +to reduce the squared or multiplied temporary results modulo $c$. A Barrett reduction requires one division that is +performed only and two half multipliers of $N$ digit numbers resulting in approximation $O((N^2)/2)$ work. + +Let $\gamma = \lfloor \alpha/k \rfloor + k$ represent the total multiplications. The total work of a exponentiation is +therefore, $O(3 \cdot N^2 + (\alpha + \gamma) \cdot ((N^2)/2) + \alpha \cdot ((N^2 + N)/2) + \gamma \cdot N^2)$ which +simplies to $O(3 \cdot N^2 + \gamma N^2 + \alpha (N^2 + (N/2)))$. The bulk of the time is spent in the Barrett +reductions and the squaring algorithms. Since $\gamma < \alpha$ it makes sense to optimize first the Barrett and +squaring routines first. Significant improvements in the future will most likely stem from optimizing these instead +of optimizing the multipliers. + \section{Timing Analysis} \subsection{Observed Timings} A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification). The @@ -467,27 +480,27 @@ Inversion & 1024 & 5,237,957 & 1,054,158 \\ Inversion & 2048 & 17,871,944 & 3,459,683 \\ Inversion & 4096 & 66,610,468 & 11,834,556 \\ \hline -Multiply & 128 & 1,426 & 828 \\ -Multiply & 256 & 2,551 & 1,393 \\ -Multiply & 512 & 7,913 & 2,926 \\ -Multiply & 1024 & 28,496 & 8,620 \\ -Multiply & 2048 & 109,897 & 28,967 \\ -Multiply & 4096 & 469,970 & 105,387 \\ +Multiply & 128 & 1,426 & 451 \\ +Multiply & 256 & 2,551 & 958 \\ +Multiply & 512 & 7,913 & 2,476 \\ +Multiply & 1024 & 28,496 & 7,927 \\ +Multiply & 2048 & 109,897 & 282,24 \\ +Multiply & 4096 & 469,970 & 104,681 \\ \hline -Square & 128 & 1,319 & 869 \\ -Square & 256 & 1,776 & 1,362 \\ -Square & 512 & 5,399 & 2,571 \\ -Square & 1024 & 18,991 & 6,332 \\ -Square & 2048 & 72,126 & 18,426 \\ -Square & 4096 & 306,269 & 74,908 \\ +Square & 128 & 1,319 & 511 \\ +Square & 256 & 1,776 & 947 \\ +Square & 512 & 5,399 & 2,153 \\ +Square & 1024 & 18,991 & 5,733 \\ +Square & 2048 & 72,126 & 17,621 \\ +Square & 4096 & 306,269 & 70,168 \\ \hline -Exptmod & 512 & 32,021,586 & 5,696,459 \\ -Exptmod & 768 & 97,595,492 & 12,428,274 \\ -Exptmod & 1024 & 223,302,532 & 22,834,316 \\ -Exptmod & 2048 & 1,682,223,369 & 119,888,049 \\ -Exptmod & 2560 & 3,268,615,571 & 250,901,263 \\ -Exptmod & 3072 & 5,597,240,141 & 391,716,431 \\ -Exptmod & 4096 & 13,347,270,891 & 814,429,647 +Exptmod & 512 & 32,021,586 & 4,472,406 \\ +Exptmod & 768 & 97,595,492 & 10,427,845 \\ +Exptmod & 1024 & 223,302,532 & 20,561,722 \\ +Exptmod & 2048 & 1,682,223,369 & 113,978,803 \\ +Exptmod & 2560 & 3,268,615,571 & 236,650,133 \\ +Exptmod & 3072 & 5,597,240,141 & 373,449,291 \\ +Exptmod & 4096 & 13,347,270,891 & 787,568,457 \end{tabular} \end{center} diff --git a/changes.txt b/changes.txt index e5b6294..4737590 100644 --- a/changes.txt +++ b/changes.txt @@ -1,3 +1,8 @@ +Jan 1st, 2003 +v0.07 -- Removed alot of heap operations from core functions to speed them up + -- Added a root finding function [and mp_sqrt macro like from MPI] + -- Added more to manual + Dec 31st, 2002 v0.06 -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc... -- Cleaned up the header a bit more diff --git a/demo.c b/demo.c index 0a1943f..ae35964 100644 --- a/demo.c +++ b/demo.c @@ -105,10 +105,9 @@ int main(void) mp_sub_d(&a, 1, &c); mp_exptmod(&b, &c, &a, &d); mp_toradix(&d, buf, 10); - printf("b^p-1 == %s\n", buf); + printf("b^p-1 == %s\n", buf); #ifdef TIMER - mp_read_radix(&a, "340282366920938463463374607431768211455", 10); mp_read_radix(&b, "340282366920938463463574607431768211455", 10); while (a.used * DIGIT_BIT < 8192) { @@ -156,9 +155,6 @@ int main(void) printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000)); mp_copy(&b, &a); } - - - { char *primes[] = { @@ -177,6 +173,7 @@ int main(void) for (rr = 0; rr < mp_count_bits(&a); rr++) { mp_mul_2d(&b, 1, &b); b.dp[0] |= lbit(); + b.used += 1; } mp_sub_d(&a, 1, &c); mp_mod(&b, &c, &b); @@ -198,7 +195,7 @@ int main(void) printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)35)); } } - + mp_read_radix(&a, "340282366920938463463374607431768211455", 10); mp_read_radix(&b, "234892374891378913789237289378973232333", 10); while (a.used * DIGIT_BIT < 8192) { @@ -223,6 +220,19 @@ int main(void) inv_n = expt_n = lcm_n = gcd_n = add_n = sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = 0; for (;;) { + + /* randomly clear and re-init one variable, this has the affect of triming the alloc space */ + switch (abs(rand()) % 7) { + case 0: mp_clear(&a); mp_init(&a); break; + case 1: mp_clear(&b); mp_init(&b); break; + case 2: mp_clear(&c); mp_init(&c); break; + case 3: mp_clear(&d); mp_init(&d); break; + case 4: mp_clear(&e); mp_init(&e); break; + case 5: mp_clear(&f); mp_init(&f); break; + case 6: break; /* don't clear any */ + } + + printf("%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%5d\r", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n, _ifuncs); fgets(cmd, 4095, stdin); cmd[strlen(cmd)-1] = 0; diff --git a/makefile b/makefile index ec94b70..95c6465 100644 --- a/makefile +++ b/makefile @@ -1,7 +1,7 @@ CC = gcc CFLAGS += -DDEBUG -Wall -W -O3 -fomit-frame-pointer -funroll-loops -VERSION=0.06 +VERSION=0.07 default: test diff --git a/mtest/mtest.c b/mtest/mtest.c index a32e0e5..576feb2 100644 --- a/mtest/mtest.c +++ b/mtest/mtest.c @@ -41,7 +41,7 @@ void rand_num(mp_int *a) unsigned char buf[512]; top: - size = 1 + ((fgetc(rng)*fgetc(rng)) % 32); + size = 1 + ((fgetc(rng)*fgetc(rng)) % 512); buf[0] = (fgetc(rng)&1)?1:0; fread(buf+1, 1, size, rng); for (n = 0; n < size; n++) { @@ -57,7 +57,7 @@ void rand_num2(mp_int *a) unsigned char buf[512]; top: - size = 1 + ((fgetc(rng)*fgetc(rng)) % 32); + size = 1 + ((fgetc(rng)*fgetc(rng)) % 512); buf[0] = (fgetc(rng)&1)?1:0; fread(buf+1, 1, size, rng); for (n = 0; n < size; n++) {