added libtommath-0.07

This commit is contained in:
Tom St Denis 2003-02-28 16:05:52 +00:00 committed by Steffen Jaeckel
parent 16c6ccc62c
commit 3cd7000342
9 changed files with 277 additions and 111 deletions

2
b.bat
View File

@ -1,3 +1,3 @@
nasm -f coff timer.asm nasm -f coff timer.asm
gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo
gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo rem gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo

237
bn.c
View File

@ -849,8 +849,7 @@ static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
*/ */
static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs) static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
{ {
mp_int t; int olduse, res, pa, pb, ix, iy;
int res, pa, pb, ix, iy;
mp_word W[512], *_W; mp_word W[512], *_W;
mp_digit tmpx, *tmpy; mp_digit tmpx, *tmpy;
@ -859,11 +858,12 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
VERIFY(b); VERIFY(b);
VERIFY(c); VERIFY(c);
if ((res = mp_init_size(&t, digs)) != MP_OKAY) { if (c->alloc < digs) {
DECFUNC(); if ((res = mp_grow(c, digs)) != MP_OKAY) {
return res; DECFUNC();
return res;
}
} }
t.used = digs;
/* clear temp buf (the columns) */ /* clear temp buf (the columns) */
memset(W, 0, digs*sizeof(mp_word)); memset(W, 0, digs*sizeof(mp_word));
@ -893,6 +893,11 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
} }
} }
/* setup dest */
olduse = c->used;
c->used = digs;
/* At this point W[] contains the sums of each column. To get the /* At this point W[] contains the sums of each column. To get the
* correct result we must take the extra bits from each column and * correct result we must take the extra bits from each column and
* carry them down * carry them down
@ -904,14 +909,17 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
* this is slower but on most cryptographic size numbers it is faster. * this is slower but on most cryptographic size numbers it is faster.
*/ */
for (ix = 1; ix < digs; ix++) { for (ix = 1; ix < digs; ix++) {
W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT)); W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK); c->dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
}
c->dp[digs-1] = W[digs-1] & ((mp_word)MP_MASK);
/* clear unused */
for (ix = c->used; ix < olduse; ix++) {
c->dp[ix] = 0;
} }
t.dp[digs-1] = W[digs-1] & ((mp_word)MP_MASK);
mp_clamp(&t); mp_clamp(c);
mp_exch(&t, c);
mp_clear(&t);
DECFUNC(); DECFUNC();
return MP_OKAY; return MP_OKAY;
} }
@ -993,8 +1001,7 @@ static int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
*/ */
static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs) static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
{ {
mp_int t; int oldused, newused, res, pa, pb, ix, iy;
int res, pa, pb, ix, iy;
mp_word W[512], *_W; mp_word W[512], *_W;
mp_digit tmpx, *tmpy; mp_digit tmpx, *tmpy;
@ -1003,11 +1010,13 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
VERIFY(b); VERIFY(b);
VERIFY(c); VERIFY(c);
if ((res = mp_init_size(&t, a->used + b->used + 1)) != MP_OKAY) { newused = a->used + b->used + 1;
DECFUNC(); if (c->alloc < newused) {
return res; if ((res = mp_grow(c, newused)) != MP_OKAY) {
DECFUNC();
return res;
}
} }
t.used = a->used + b->used + 1;
/* like the other comba method we compute the columns first */ /* like the other comba method we compute the columns first */
pa = a->used; pa = a->used;
@ -1025,17 +1034,21 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
} }
} }
/* setup dest */
oldused = c->used;
c->used = newused;
/* now convert the array W downto what we need */ /* now convert the array W downto what we need */
for (ix = digs+1; ix < (pa+pb+1); ix++) { for (ix = digs+1; ix < (pa+pb+1); ix++) {
W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT)); W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK); c->dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
} }
t.dp[(pa+pb+1)-1] = W[(pa+pb+1)-1] & ((mp_word)MP_MASK); c->dp[(pa+pb+1)-1] = W[(pa+pb+1)-1] & ((mp_word)MP_MASK);
for (ix = c->used; ix < oldused; ix++) {
mp_clamp(&t); c->dp[ix] = 0;
mp_exch(&t, c); }
mp_clear(&t); mp_clamp(c);
DECFUNC(); DECFUNC();
return MP_OKAY; return MP_OKAY;
} }
@ -1106,8 +1119,7 @@ static int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
*/ */
static int fast_s_mp_sqr(mp_int *a, mp_int *b) static int fast_s_mp_sqr(mp_int *a, mp_int *b)
{ {
mp_int t; int olduse, newused, res, ix, iy, pa;
int res, ix, iy, pa;
mp_word W2[512], W[512], *_W; mp_word W2[512], W[512], *_W;
mp_digit tmpx, *tmpy; mp_digit tmpx, *tmpy;
@ -1116,11 +1128,13 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
VERIFY(b); VERIFY(b);
pa = a->used; pa = a->used;
if ((res = mp_init_size(&t, pa + pa + 1)) != MP_OKAY) { newused = pa + pa + 1;
DECFUNC(); if (b->alloc < newused) {
return res; if ((res = mp_grow(b, newused)) != MP_OKAY) {
} DECFUNC();
t.used = pa + pa + 1; return res;
}
}
/* zero temp buffer (columns) */ /* zero temp buffer (columns) */
memset(W, 0, (pa+pa+1)*sizeof(mp_word)); memset(W, 0, (pa+pa+1)*sizeof(mp_word));
@ -1144,19 +1158,29 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
/* double first value, since the inner products are half of what they should be */ /* double first value, since the inner products are half of what they should be */
W[0] += W[0] + W2[0]; W[0] += W[0] + W2[0];
/* setup dest */
olduse = b->used;
b->used = newused;
/* now compute digits */ /* now compute digits */
for (ix = 1; ix < (pa+pa+1); ix++) { for (ix = 1; ix < (pa+pa+1); ix++) {
/* double/add next digit */ /* double/add next digit */
W[ix] += W[ix] + W2[ix]; W[ix] += W[ix] + W2[ix];
W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT)); W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK); b->dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
} }
t.dp[(pa+pa+1)-1] = W[(pa+pa+1)-1] & ((mp_word)MP_MASK); b->dp[(pa+pa+1)-1] = W[(pa+pa+1)-1] & ((mp_word)MP_MASK);
mp_clamp(&t); /* clear high */
mp_exch(&t, b); for (ix = b->used; ix < olduse; ix++) {
mp_clear(&t); b->dp[ix] = 0;
}
/* fix the sign (since we no longer make a fresh temp) */
b->sign = MP_ZPOS;
mp_clamp(b);
DECFUNC(); DECFUNC();
return MP_OKAY; return MP_OKAY;
} }
@ -1173,13 +1197,13 @@ static int s_mp_sqr(mp_int *a, mp_int *b)
VERIFY(a); VERIFY(a);
VERIFY(b); VERIFY(b);
/* can we use the fast multiplier? */ /* can we use the fast multiplier? */
if (((a->used * 2 + 1) < 512) && a->used < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT) - 1))) { if (((a->used * 2 + 1) < 512) && a->used < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT) - 1))) {
res = fast_s_mp_sqr(a,b); res = fast_s_mp_sqr(a,b);
DECFUNC(); DECFUNC();
return res; return res;
} }
pa = a->used; pa = a->used;
if ((res = mp_init_size(&t, pa + pa + 1)) != MP_OKAY) { if ((res = mp_init_size(&t, pa + pa + 1)) != MP_OKAY) {
DECFUNC(); DECFUNC();
@ -1385,10 +1409,9 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
if (mp_lshd(&x1y1, B*2) != MP_OKAY) goto X1Y1; /* x1y1 = x1y1 << 2*B */ if (mp_lshd(&x1y1, B*2) != MP_OKAY) goto X1Y1; /* x1y1 = x1y1 << 2*B */
if (mp_add(&x0y0, &t1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + t1 */ if (mp_add(&x0y0, &t1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + t1 */
if (mp_add(&t1, &x1y1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + t1 + x1y1 */ if (mp_add(&t1, &x1y1, c) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + t1 + x1y1 */
err = MP_OKAY; err = MP_OKAY;
mp_exch(&t1, c);
X1Y1: mp_clear(&x1y1); X1Y1: mp_clear(&x1y1);
X0Y0: mp_clear(&x0y0); X0Y0: mp_clear(&x0y0);
@ -1426,7 +1449,7 @@ int mp_mul(mp_int *a, mp_int *b, mp_int *c)
static int mp_karatsuba_sqr(mp_int *a, mp_int *b) static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
{ {
mp_int x0, x1, t1, t2, x0x0, x1x1; mp_int x0, x1, t1, t2, x0x0, x1x1;
int B, err; int B, err, x;
REGFUNC("mp_karatsuba_sqr"); REGFUNC("mp_karatsuba_sqr");
VERIFY(a); VERIFY(a);
@ -1441,8 +1464,8 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
B = B/2; B = B/2;
/* init copy all the temps */ /* init copy all the temps */
if (mp_init_copy(&x0, a) != MP_OKAY) goto ERR; if (mp_init_size(&x0, B) != MP_OKAY) goto ERR;
if (mp_init_copy(&x1, a) != MP_OKAY) goto X0; if (mp_init_size(&x1, a->used - B) != MP_OKAY) goto X0;
/* init temps */ /* init temps */
if (mp_init(&t1) != MP_OKAY) goto X1; if (mp_init(&t1) != MP_OKAY) goto X1;
@ -1451,16 +1474,27 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
if (mp_init(&x1x1) != MP_OKAY) goto X0X0; if (mp_init(&x1x1) != MP_OKAY) goto X0X0;
/* now shift the digits */ /* now shift the digits */
mp_mod_2d(&x0, B*DIGIT_BIT, &x0); for (x = 0; x < B; x++) {
mp_rshd(&x1, B); x0.dp[x] = a->dp[x];
}
for (x = B; x < a->used; x++) {
x1.dp[x-B] = a->dp[x];
}
x0.used = B;
x1.used = a->used - B;
mp_clamp(&x0);
mp_clamp(&x1);
/* now calc the products x0*x0 and x1*x1 */ /* now calc the products x0*x0 and x1*x1 */
if (s_mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */ if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */
if (s_mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1; /* x1x1 = x1*x1 */ if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1; /* x1x1 = x1*x1 */
/* now calc x1-x0 and y1-y0 */ /* now calc x1-x0 and y1-y0 */
if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1; /* t1 = x1 - x0 */ if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1; /* t1 = x1 - x0 */
if (s_mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1; /* t1 = (x1 - x0) * (y1 - y0) */ if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1; /* t1 = (x1 - x0) * (y1 - y0) */
/* add x0y0 */ /* add x0y0 */
if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1; /* t2 = x0y0 + x1y1 */ if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1; /* t2 = x0y0 + x1y1 */
@ -1471,10 +1505,9 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
if (mp_lshd(&x1x1, B*2) != MP_OKAY) goto X1X1; /* x1y1 = x1y1 << 2*B */ if (mp_lshd(&x1x1, B*2) != MP_OKAY) goto X1X1; /* x1y1 = x1y1 << 2*B */
if (mp_add(&x0x0, &t1, &t1) != MP_OKAY) goto X1X1; /* t1 = x0y0 + t1 */ if (mp_add(&x0x0, &t1, &t1) != MP_OKAY) goto X1X1; /* t1 = x0y0 + t1 */
if (mp_add(&t1, &x1x1, &t1) != MP_OKAY) goto X1X1; /* t1 = x0y0 + t1 + x1y1 */ if (mp_add(&t1, &x1x1, b) != MP_OKAY) goto X1X1; /* t1 = x0y0 + t1 + x1y1 */
err = MP_OKAY; err = MP_OKAY;
mp_exch(&t1, b);
X1X1: mp_clear(&x1x1); X1X1: mp_clear(&x1x1);
X0X0: mp_clear(&x0x0); X0X0: mp_clear(&x0x0);
@ -2784,6 +2817,102 @@ __M :
return err; return err;
} }
/* find the n'th root of an integer
*
* Result found such that (c)^b <= a and (c+1)^b > a
*/
int mp_n_root(mp_int *a, mp_digit b, mp_int *c)
{
mp_int t1, t2, t3;
int res, neg;
/* input must be positive if b is even*/
if ((b&1) == 0 && a->sign == MP_NEG) {
return MP_VAL;
}
if ((res = mp_init(&t1)) != MP_OKAY) {
return res;
}
if ((res = mp_init(&t2)) != MP_OKAY) {
goto __T1;
}
if ((res = mp_init(&t3)) != MP_OKAY) {
goto __T2;
}
/* if a is negative fudge the sign but keep track */
neg = a->sign;
a->sign = MP_ZPOS;
/* t2 = a */
if ((res = mp_copy(a, &t2)) != MP_OKAY) {
goto __T3;
}
do {
/* t1 = t2 */
if ((res = mp_copy(&t2, &t1)) != MP_OKAY) {
goto __T3;
}
/* t2 = t1 - ((t1^b - a) / (b * t1^(b-1))) */
if ((res = mp_expt_d(&t1, b-1, &t3)) != MP_OKAY) { /* t3 = t1^(b-1) */
goto __T3;
}
/* numerator */
if ((res = mp_mul(&t3, &t1, &t2)) != MP_OKAY) { /* t2 = t1^b */
goto __T3;
}
if ((res = mp_sub(&t2, a, &t2)) != MP_OKAY) { /* t2 = t1^b - a */
goto __T3;
}
if ((res = mp_mul_d(&t3, b, &t3)) != MP_OKAY) { /* t3 = t1^(b-1) * b */
goto __T3;
}
if ((res = mp_div(&t2, &t3, &t3, NULL)) != MP_OKAY) { /* t3 = (t1^b - a)/(b * t1^(b-1)) */
goto __T3;
}
if ((res = mp_sub(&t1, &t3, &t2)) != MP_OKAY) {
goto __T3;
}
} while (mp_cmp(&t1, &t2) != MP_EQ);
/* result can be at most off by one so check */
if ((res = mp_expt_d(&t1, b, &t2)) != MP_OKAY) {
goto __T3;
}
if (mp_cmp(&t2, a) == MP_GT) {
if ((res = mp_sub_d(&t1, 1, &t1)) != MP_OKAY) {
goto __T3;
}
}
/* reset the sign of a first */
a->sign = neg;
/* set the result */
mp_exch(&t1, c);
/* set the sign of the result */
c->sign = neg;
res = MP_OKAY;
__T3: mp_clear(&t3);
__T2: mp_clear(&t2);
__T1: mp_clear(&t1);
return res;
}
/* --> radix conversion <-- */ /* --> radix conversion <-- */
/* reverse an array, used for radix code */ /* reverse an array, used for radix code */
static void reverse(unsigned char *s, int len) static void reverse(unsigned char *s, int len)

9
bn.h
View File

@ -233,6 +233,15 @@ int mp_gcd(mp_int *a, mp_int *b, mp_int *c);
/* c = [a, b] or (a*b)/(a, b) */ /* c = [a, b] or (a*b)/(a, b) */
int mp_lcm(mp_int *a, mp_int *b, mp_int *c); int mp_lcm(mp_int *a, mp_int *b, mp_int *c);
/* finds one of the b'th root of a, such that |c|^b <= |a|
*
* returns error if a < 0 and b is even
*/
int mp_n_root(mp_int *a, mp_digit b, mp_int *c);
/* shortcut for square root */
#define mp_sqrt(a, b) mp_n_root(a, 2, b)
/* used to setup the Barrett reduction for a given modulus b */ /* used to setup the Barrett reduction for a given modulus b */
int mp_reduce_setup(mp_int *a, mp_int *b); int mp_reduce_setup(mp_int *a, mp_int *b);

BIN
bn.pdf

Binary file not shown.

107
bn.tex
View File

@ -1,7 +1,7 @@
\documentclass{article} \documentclass{article}
\begin{document} \begin{document}
\title{LibTomMath v0.06 \\ A Free Multiple Precision Integer Library} \title{LibTomMath v0.07 \\ A Free Multiple Precision Integer Library}
\author{Tom St Denis \\ tomstdenis@iahu.ca} \author{Tom St Denis \\ tomstdenis@iahu.ca}
\maketitle \maketitle
\newpage \newpage
@ -187,17 +187,6 @@ int mp_mul_2(mp_int *a, mp_int *b);
int mp_mod_2d(mp_int *a, int b, mp_int *c); int mp_mod_2d(mp_int *a, int b, mp_int *c);
\end{verbatim} \end{verbatim}
Both the \textbf{mp\_rshd} and \textbf{mp\_lshd} functions provide shifting by whole digits. For example,
mp\_rshd($x$, $n$) is the same as $x \leftarrow \lfloor x / \beta^n \rfloor$ while mp\_lshd($x$, $n$) is equivalent
to $x \leftarrow x \cdot \beta^n$. Both functions are extremely fast as they merely copy digits within the array.
Similarly the \textbf{mp\_div\_2d} and \textbf{mp\_mul\_2d} functions provide shifting but allow any bit count to
be specified. For example, mp\_div\_2d($x$, $n$, $y$) is the same as $y =\lfloor x / 2^n \rfloor$ while
mp\_mul\_2d($x$, $n$, $y$) is the same as $y = x \cdot 2^n$. The \textbf{mp\_div\_2} and \textbf{mp\_mul\_2}
functions are legacy functions that merely shift right or left one bit respectively. The \textbf{mp\_mod\_2d} function
reduces an integer mod a power of two. For example, mp\_mod\_2d($x$, $n$, $y$) is the same as
$y \equiv x \mbox{ (mod }2^n\mbox{)}$.
\subsection{Basic Arithmetic} \subsection{Basic Arithmetic}
Next are the class of functions which provide basic arithmetic. Next are the class of functions which provide basic arithmetic.
@ -234,17 +223,7 @@ int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
int mp_mod(mp_int *a, mp_int *b, mp_int *c); int mp_mod(mp_int *a, mp_int *b, mp_int *c);
\end{verbatim} \end{verbatim}
The \textbf{mp\_cmp} will compare two integers. It will return \textbf{MP\_LT} if the first parameter is less than \subsection{Single Digit Functions}
the second, \textbf{MP\_GT} if it is greater or \textbf{MP\_EQ} if they are equal. These constants are the same as from
MPI.
The \textbf{mp\_add}, \textbf{mp\_sub}, \textbf{mp\_mul}, \textbf{mp\_div}, \textbf{mp\_sqr} and \textbf{mp\_mod} are all
fairly straight forward to understand. Note that in mp\_div either $c$ (the quotient) or $d$ (the remainder) can be
passed as NULL to ignore it. For example, if you only want the quotient $z = \lfloor x/y \rfloor$ then a call such as
mp\_div(\&x, \&y, \&z, NULL) is acceptable.
There is a related class of ``single digit'' functions that are like the above except they use a digit as the second
operand.
\begin{verbatim} \begin{verbatim}
/* compare against a single digit */ /* compare against a single digit */
@ -296,14 +275,13 @@ int mp_gcd(mp_int *a, mp_int *b, mp_int *c);
/* c = [a, b] or (a*b)/(a, b) */ /* c = [a, b] or (a*b)/(a, b) */
int mp_lcm(mp_int *a, mp_int *b, mp_int *c); int mp_lcm(mp_int *a, mp_int *b, mp_int *c);
/* find the b'th root of a */
int mp_n_root(mp_int *a, mp_digit b, mp_int *c);
/* d = a^b (mod c) */ /* d = a^b (mod c) */
int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
\end{verbatim} \end{verbatim}
These are all fairly simple to understand. The \textbf{mp\_invmod} is a modular multiplicative inverse. That is it
stores in the third parameter an integer such that $ac \equiv 1 \mbox{ (mod }b\mbox{)}$ provided such integer exists. If
there is no such integer the function returns \textbf{MP\_VAL}.
\subsection{Radix Conversions} \subsection{Radix Conversions}
To read or store integers in other formats there are the following functions. To read or store integers in other formats there are the following functions.
@ -432,7 +410,7 @@ when $b \le 0$, in theory the routine will still give a properly congruent answe
This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time. This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time.
\subsection{Modular Arithmetic} \subsection{Number Theoretic Functions}
\subsubsection{mp\_addmod, mp\_submod, mp\_mulmod, mp\_sqrmod} \subsubsection{mp\_addmod, mp\_submod, mp\_mulmod, mp\_sqrmod}
These functions take the time of their host function plus the time it takes to perform a division. For example, These functions take the time of their host function plus the time it takes to perform a division. For example,
@ -445,6 +423,41 @@ Also note that these functions use mp\_mod which means the result are guaranteed
This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$. When This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$. When
$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast. $b$ is odd a ``fast'' variant is used which finds the inverse twice as fast.
\subsubsection{mp\_gcd(mp\_int *a, mp\_int *b, mp\_int *c)}
Finds the greatest common divisor of both $a$ and $b$ and places the result in $c$. Will work with either positive
or negative inputs.
Functions requires no additional memory and approximately $O(N \cdot log(N))$ time.
\subsubsection{mp\_lcm(mp\_int *a, mp\_int *b, mp\_int *c)}
Finds the least common multiple of both $a$ and $b$ and places the result in $c$. Will work with either positive
or negative inputs. This is calculated by dividing the product of $a$ and $b$ by the greatest common divisor of
both.
Functions requires no additional memory and approximately $O(4 \cdot N^2)$ time.
\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int c)}
Finds the $b$'th root of $a$ and stores it in $b$. The roots are found such that $\vert c \vert^b \le \vert a \vert$.
Uses the Newton approximation approach which means it converges in $O(log \beta^N)$ time to a final result. Each iteration
requires $b$ multiplications and one division for a total work of $O(6N^2 \cdot log \beta^N) = O(6N^3 \cdot log \beta)$.
If the input $a$ is negative and $b$ is even the function returns an error. Otherwise the function will return a root
that has a sign that agrees with the sign of $a$.
\subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm. For an $\alpha$-bit
exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + k$ multiplications. The value of $k$ is
chosen to minimize the number of multiplications required for a given value of $\alpha$. Barrett reductions are used
to reduce the squared or multiplied temporary results modulo $c$. A Barrett reduction requires one division that is
performed only and two half multipliers of $N$ digit numbers resulting in approximation $O((N^2)/2)$ work.
Let $\gamma = \lfloor \alpha/k \rfloor + k$ represent the total multiplications. The total work of a exponentiation is
therefore, $O(3 \cdot N^2 + (\alpha + \gamma) \cdot ((N^2)/2) + \alpha \cdot ((N^2 + N)/2) + \gamma \cdot N^2)$ which
simplies to $O(3 \cdot N^2 + \gamma N^2 + \alpha (N^2 + (N/2)))$. The bulk of the time is spent in the Barrett
reductions and the squaring algorithms. Since $\gamma < \alpha$ it makes sense to optimize first the Barrett and
squaring routines first. Significant improvements in the future will most likely stem from optimizing these instead
of optimizing the multipliers.
\section{Timing Analysis} \section{Timing Analysis}
\subsection{Observed Timings} \subsection{Observed Timings}
A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification). The A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification). The
@ -467,27 +480,27 @@ Inversion & 1024 & 5,237,957 & 1,054,158 \\
Inversion & 2048 & 17,871,944 & 3,459,683 \\ Inversion & 2048 & 17,871,944 & 3,459,683 \\
Inversion & 4096 & 66,610,468 & 11,834,556 \\ Inversion & 4096 & 66,610,468 & 11,834,556 \\
\hline \hline
Multiply & 128 & 1,426 & 828 \\ Multiply & 128 & 1,426 & 451 \\
Multiply & 256 & 2,551 & 1,393 \\ Multiply & 256 & 2,551 & 958 \\
Multiply & 512 & 7,913 & 2,926 \\ Multiply & 512 & 7,913 & 2,476 \\
Multiply & 1024 & 28,496 & 8,620 \\ Multiply & 1024 & 28,496 & 7,927 \\
Multiply & 2048 & 109,897 & 28,967 \\ Multiply & 2048 & 109,897 & 282,24 \\
Multiply & 4096 & 469,970 & 105,387 \\ Multiply & 4096 & 469,970 & 104,681 \\
\hline \hline
Square & 128 & 1,319 & 869 \\ Square & 128 & 1,319 & 511 \\
Square & 256 & 1,776 & 1,362 \\ Square & 256 & 1,776 & 947 \\
Square & 512 & 5,399 & 2,571 \\ Square & 512 & 5,399 & 2,153 \\
Square & 1024 & 18,991 & 6,332 \\ Square & 1024 & 18,991 & 5,733 \\
Square & 2048 & 72,126 & 18,426 \\ Square & 2048 & 72,126 & 17,621 \\
Square & 4096 & 306,269 & 74,908 \\ Square & 4096 & 306,269 & 70,168 \\
\hline \hline
Exptmod & 512 & 32,021,586 & 5,696,459 \\ Exptmod & 512 & 32,021,586 & 4,472,406 \\
Exptmod & 768 & 97,595,492 & 12,428,274 \\ Exptmod & 768 & 97,595,492 & 10,427,845 \\
Exptmod & 1024 & 223,302,532 & 22,834,316 \\ Exptmod & 1024 & 223,302,532 & 20,561,722 \\
Exptmod & 2048 & 1,682,223,369 & 119,888,049 \\ Exptmod & 2048 & 1,682,223,369 & 113,978,803 \\
Exptmod & 2560 & 3,268,615,571 & 250,901,263 \\ Exptmod & 2560 & 3,268,615,571 & 236,650,133 \\
Exptmod & 3072 & 5,597,240,141 & 391,716,431 \\ Exptmod & 3072 & 5,597,240,141 & 373,449,291 \\
Exptmod & 4096 & 13,347,270,891 & 814,429,647 Exptmod & 4096 & 13,347,270,891 & 787,568,457
\end{tabular} \end{tabular}
\end{center} \end{center}

View File

@ -1,3 +1,8 @@
Jan 1st, 2003
v0.07 -- Removed alot of heap operations from core functions to speed them up
-- Added a root finding function [and mp_sqrt macro like from MPI]
-- Added more to manual
Dec 31st, 2002 Dec 31st, 2002
v0.06 -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc... v0.06 -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc...
-- Cleaned up the header a bit more -- Cleaned up the header a bit more

22
demo.c
View File

@ -105,10 +105,9 @@ int main(void)
mp_sub_d(&a, 1, &c); mp_sub_d(&a, 1, &c);
mp_exptmod(&b, &c, &a, &d); mp_exptmod(&b, &c, &a, &d);
mp_toradix(&d, buf, 10); mp_toradix(&d, buf, 10);
printf("b^p-1 == %s\n", buf); printf("b^p-1 == %s\n", buf);
#ifdef TIMER #ifdef TIMER
mp_read_radix(&a, "340282366920938463463374607431768211455", 10); mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
mp_read_radix(&b, "340282366920938463463574607431768211455", 10); mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
while (a.used * DIGIT_BIT < 8192) { while (a.used * DIGIT_BIT < 8192) {
@ -156,9 +155,6 @@ int main(void)
printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000)); printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
mp_copy(&b, &a); mp_copy(&b, &a);
} }
{ {
char *primes[] = { char *primes[] = {
@ -177,6 +173,7 @@ int main(void)
for (rr = 0; rr < mp_count_bits(&a); rr++) { for (rr = 0; rr < mp_count_bits(&a); rr++) {
mp_mul_2d(&b, 1, &b); mp_mul_2d(&b, 1, &b);
b.dp[0] |= lbit(); b.dp[0] |= lbit();
b.used += 1;
} }
mp_sub_d(&a, 1, &c); mp_sub_d(&a, 1, &c);
mp_mod(&b, &c, &b); mp_mod(&b, &c, &b);
@ -198,7 +195,7 @@ int main(void)
printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)35)); printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)35));
} }
} }
mp_read_radix(&a, "340282366920938463463374607431768211455", 10); mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
mp_read_radix(&b, "234892374891378913789237289378973232333", 10); mp_read_radix(&b, "234892374891378913789237289378973232333", 10);
while (a.used * DIGIT_BIT < 8192) { while (a.used * DIGIT_BIT < 8192) {
@ -223,6 +220,19 @@ int main(void)
inv_n = expt_n = lcm_n = gcd_n = add_n = sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = 0; inv_n = expt_n = lcm_n = gcd_n = add_n = sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = 0;
for (;;) { for (;;) {
/* randomly clear and re-init one variable, this has the affect of triming the alloc space */
switch (abs(rand()) % 7) {
case 0: mp_clear(&a); mp_init(&a); break;
case 1: mp_clear(&b); mp_init(&b); break;
case 2: mp_clear(&c); mp_init(&c); break;
case 3: mp_clear(&d); mp_init(&d); break;
case 4: mp_clear(&e); mp_init(&e); break;
case 5: mp_clear(&f); mp_init(&f); break;
case 6: break; /* don't clear any */
}
printf("%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%5d\r", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n, _ifuncs); printf("%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%5d\r", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n, _ifuncs);
fgets(cmd, 4095, stdin); fgets(cmd, 4095, stdin);
cmd[strlen(cmd)-1] = 0; cmd[strlen(cmd)-1] = 0;

View File

@ -1,7 +1,7 @@
CC = gcc CC = gcc
CFLAGS += -DDEBUG -Wall -W -O3 -fomit-frame-pointer -funroll-loops CFLAGS += -DDEBUG -Wall -W -O3 -fomit-frame-pointer -funroll-loops
VERSION=0.06 VERSION=0.07
default: test default: test

View File

@ -41,7 +41,7 @@ void rand_num(mp_int *a)
unsigned char buf[512]; unsigned char buf[512];
top: top:
size = 1 + ((fgetc(rng)*fgetc(rng)) % 32); size = 1 + ((fgetc(rng)*fgetc(rng)) % 512);
buf[0] = (fgetc(rng)&1)?1:0; buf[0] = (fgetc(rng)&1)?1:0;
fread(buf+1, 1, size, rng); fread(buf+1, 1, size, rng);
for (n = 0; n < size; n++) { for (n = 0; n < size; n++) {
@ -57,7 +57,7 @@ void rand_num2(mp_int *a)
unsigned char buf[512]; unsigned char buf[512];
top: top:
size = 1 + ((fgetc(rng)*fgetc(rng)) % 32); size = 1 + ((fgetc(rng)*fgetc(rng)) % 512);
buf[0] = (fgetc(rng)&1)?1:0; buf[0] = (fgetc(rng)&1)?1:0;
fread(buf+1, 1, size, rng); fread(buf+1, 1, size, rng);
for (n = 0; n < size; n++) { for (n = 0; n < size; n++) {