added libtommath-0.04

This commit is contained in:
Tom St Denis 2003-02-28 16:04:18 +00:00 committed by Steffen Jaeckel
parent f89172c3af
commit e051ed159b
8 changed files with 889 additions and 150 deletions

745
bn.c

File diff suppressed because it is too large Load Diff

9
bn.h
View File

@ -83,6 +83,9 @@ int mp_init(mp_int *a);
/* free a bignum */
void mp_clear(mp_int *a);
/* exchange two ints */
void mp_exch(mp_int *a, mp_int *b);
/* shrink ram required for a bignum */
int mp_shrink(mp_int *a);
@ -214,7 +217,11 @@ int mp_lcm(mp_int *a, mp_int *b, mp_int *c);
/* used to setup the Barrett reduction for a given modulus b */
int mp_reduce_setup(mp_int *a, mp_int *b);
/* Barrett Reduction, computes a (mod b) with a precomputed value c */
/* Barrett Reduction, computes a (mod b) with a precomputed value c
*
* Assumes that 0 < a <= b^2, note if 0 > a > -(b^2) then you can merely
* compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code].
*/
int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
/* d = a^b (mod c) */

BIN
bn.pdf

Binary file not shown.

163
bn.tex
View File

@ -1,7 +1,7 @@
\documentclass{article}
\begin{document}
\title{LibTomMath v0.03 \\ A Free Multiple Precision Integer Library}
\title{LibTomMath v0.04 \\ A Free Multiple Precision Integer Library}
\author{Tom St Denis \\ tomstdenis@iahu.ca}
\maketitle
\newpage
@ -323,46 +323,161 @@ The integers are stored in big endian format as most libraries (and MPI) expect.
\textbf{mp\_toradix} functions read and write (respectively) null terminated ASCII strings in a given radix. Valid values
for the radix are between 2 and 64 (inclusively).
\section{Function Analysis}
Throughout the function analysis the variable $N$ will denote the average size of an input to a function as measured
by the number of digits it has. The variable $W$ will denote the number of bits per word and $c$ will denote a small
constant amount of work. The big-oh notation will be abused slightly to consider numbers that do not grow to infinity.
That is we shall consider $O(N/2) \ne O(N)$ which is an abuse of the notation.
\subsection{Digit Manipulation Functions}
The class of digit manipulation functions such as \textbf{mp\_rshd}, \textbf{mp\_lshd} and \textbf{mp\_mul\_2} are all
very simple functions to analyze.
\subsubsection{mp\_rshd(mp\_int *a, int b)}
If the shift count ``b'' is less than or equal to zero the function returns without doing any work. If the
the shift count is larger than the number of digits in ``a'' then ``a'' is simply zeroed without shifting digits.
This function requires no additional memory and $O(N)$ time.
\subsubsection{mp\_lshd(mp\_int *a, int b)}
If the shift count ``b'' is less than or equal to zero the function returns success without doing any work.
This function requires $O(b)$ additional digits of memory and $O(N)$ time.
\subsubsection{mp\_div\_2d(mp\_int *a, int b, mp\_int *c, mp\_int *d)}
If the shift count ``b'' is less than or equal to zero the function places ``a'' in ``c'' and returns success.
This function requires $O(2 \cdot N)$ additional digits of memory and $O(2 \cdot N)$ time.
\subsubsection{mp\_mul\_2d(mp\_int *a, int b, mp\_int *c)}
If the shift count ``b'' is less than or equal to zero the function places ``a'' in ``c'' and returns success.
This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time.
\subsubsection{mp\_mod\_2d(mp\_int *a, int b, mp\_int *c)}
If the shift count ``b'' is less than or equal to zero the function places ``a'' in ``c'' and returns success.
This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time.
\subsection{Basic Arithmetic}
\subsubsection{mp\_cmp(mp\_int *a, mp\_int *b)}
Performs a \textbf{signed} comparison between ``a'' and ``b'' returning
\textbf{MP\_GT} is ``a'' is larger than ``b''.
This function requires no additional memory and $O(N)$ time.
\subsubsection{mp\_cmp\_mag(mp\_int *a, mp\_int *b)}
Performs a \textbf{unsigned} comparison between ``a'' and ``b'' returning
\textbf{MP\_GT} is ``a'' is larger than ``b''. Note that this comparison is unsigned which means it will report, for
example, $-5 > 3$. By comparison mp\_cmp will report $-5 < 3$.
This function requires no additional memory and $O(N)$ time.
\subsubsection{mp\_add(mp\_int *a, mp\_int *b, mp\_int *c)}
Handles the sign of the numbers correctly which means it will subtract as required, e.g. $a + -b$ turns into $a - b$.
This function requires no additional memory and $O(N)$ time.
\subsubsection{mp\_sub(mp\_int *a, mp\_int *b, mp\_int *c)}
Handles the sign of the numbers correctly which means it will add as required, e.g. $a - -b$ turns into $a + b$.
This function requires no additional memory and $O(N)$ time.
\subsubsection{mp\_mul(mp\_int *a, mp\_int *b, mp\_int *c)}
Handles the sign of the numbers correctly which means it will correct the sign of the product as required,
e.g. $a \cdot -b$ turns into $-ab$.
For relatively small inputs, that is less than 80 digits a standard baseline or comba-baseline multiplier is used. It
requires no additional memory and $O(N^2)$ time. The comba-baseline multiplier is only used if it can safely be used
without losing carry digits. The comba method is faster than the baseline method but cannot always be used which is why
both are provided. The code will automatically determine when it can be used. If the digit count is higher
than 80 for the inputs than a Karatsuba multiplier is used which requires approximately $O(6 \cdot N)$ memory and
$O(N^{lg(3)})$ time.
\subsubsection{mp\_sqr(mp\_int *a, mp\_int *b)}
For relatively small inputs, that is less than 80 digits a modified squaring or comba-squaring algorithm is used. It
requires no additional memory and $O((N^2 + N)/2)$ time. The comba-squaring method is used only if it can be safely used
without losing carry digits. After 80 digits a Karatsuba squaring algorithm is used whcih requires approximately
$O(4 \cdot N)$ memory and $O(N^{lg(3)})$ time.
\subsubsection{mp\_div(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
The quotient is placed in ``c'' and the remainder in ``d''. Either (or both) of ``c'' and ``d'' can be set to NULL
if the value is not desired.
This function requires $O(4 \cdot N)$ memory and $O(N^2 + N)$ time.
\subsection{Modular Arithmetic}
\subsubsection{mp\_addmod, mp\_submod, mp\_mulmod, mp\_sqrmod}
These functions take the time of their host function plus the time it takes to perform a division. For example,
mp\_addmod takes $O(N + (N^2 + N))$ time. Note that if you are performing many modular operations in a row with
the same modulus you should consider Barrett reductions.
NOTE: This section will be expanded upon in future releases of the library.
\subsubsection{mp\_invmod(mp\_int *a, mp\_int *b, mp\_int *c)}
This function is technically only defined for moduli who are positive and inputs that are positive. That is it will find
$c = 1/a \mbox{ (mod }b\mbox{)}$ for any $a > 0$ and $b > 0$. The function will work for negative values of $a$ since
it merely computes $c = -1 \cdot (1/{\vert a \vert}) \mbox{ (mod }b\mbox{)}$. In general the input is only
\textbf{guaranteed} to lead to a correct output if $-b < a < b$ and $(a, b) = 1$.
NOTE: This function will be revised to accept a wider range of inputs in future releases.
\section{Timing Analysis}
\subsection{Observed Timings}
A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification). The
test was conducted on an AMD Athlon XP processor with 266Mhz DDR memory and the GCC 3.2 compiler\footnote{With build
options ``-O3 -fomit-frame-pointer -funroll-loops''}. The multiplications and squarings were repeated 10,000 times
each while the modular exponentiation (exptmod) were performed 10 times each. The RDTSC (Read Time Stamp Counter) instruction
was used to measure the time the entire iterations took and was divided by the number of iterations to get an
average. The following results were observed.
options ``-O3 -fomit-frame-pointer -funroll-loops''}. The multiplications and squarings were repeated 100,000 times
each while the modular exponentiation (exptmod) were performed 50 times each. The ``inversions'' refers to multiplicative
inversions modulo an odd number of a given size. The RDTSC (Read Time Stamp Counter) instruction was used to measure the
time the entire iterations took and was divided by the number of iterations to get an average. The following results
were observed.
\begin{small}
\begin{center}
\begin{tabular}{c|c|c|c}
\hline \textbf{Operation} & \textbf{Size (bits)} & \textbf{Time with MPI (cycles)} & \textbf{Time with LibTomMath (cycles)} \\
\hline
Multiply & 128 & 1,426 & 928 \\
Multiply & 256 & 2,551 & 1,787 \\
Multiply & 512 & 7,913 & 3,458 \\
Multiply & 1024 & 28,496 & 9,271 \\
Multiply & 2048 & 109,897 & 29,917 \\
Multiply & 4096 & 469,970 & 123,934 \\
Inversion & 128 & 264,083 & 172,381 \\
Inversion & 256 & 549,370 & 381,237 \\
Inversion & 512 & 1,675,975 & 1,212,341 \\
Inversion & 1024 & 5,237,957 & 3,114,144 \\
Inversion & 2048 & 17,871,944 & 8,137,896 \\
Inversion & 4096 & 66,610,468 & 22,469,360 \\
\hline
Multiply & 128 & 1,426 & 847 \\
Multiply & 256 & 2,551 & 1,848 \\
Multiply & 512 & 7,913 & 3,505 \\
Multiply & 1024 & 28,496 & 9,097 \\
Multiply & 2048 & 109,897 & 29,497 \\
Multiply & 4096 & 469,970 & 112,651 \\
\hline
Square & 128 & 1,319 & 1,230 \\
Square & 256 & 1,776 & 2,131 \\
Square & 512 & 5,399 & 3,694 \\
Square & 1024 & 18,991 & 9,172 \\
Square & 2048 & 72,126 & 27,352 \\
Square & 4096 & 306,269 & 110,607 \\
Square & 128 & 1,319 & 883 \\
Square & 256 & 1,776 & 1,895 \\
Square & 512 & 5,399 & 3,543 \\
Square & 1024 & 18,991 & 8,692 \\
Square & 2048 & 72,126 & 26,792 \\
Square & 4096 & 306,269 & 103,263 \\
\hline
Exptmod & 512 & 32,021,586 & 6,880,075 \\
Exptmod & 768 & 97,595,492 & 15,202,614 \\
Exptmod & 1024 & 223,302,532 & 28,081,865 \\
Exptmod & 2048 & 1,682,223,369 & 146,545,454 \\
Exptmod & 2560 & 3,268,615,571 & 310,970,112 \\
Exptmod & 3072 & 5,597,240,141 & 480,703,712 \\
Exptmod & 4096 & 13,347,270,891 & 985,918,868
Exptmod & 512 & 32,021,586 & 7,096,687 \\
Exptmod & 768 & 97,595,492 & 14,849,813 \\
Exptmod & 1024 & 223,302,532 & 27,826,489 \\
Exptmod & 2048 & 1,682,223,369 & 142,026,274 \\
Exptmod & 2560 & 3,268,615,571 & 292,597,205 \\
Exptmod & 3072 & 5,597,240,141 & 452,731,243 \\
Exptmod & 4096 & 13,347,270,891 & 941,433,401
\end{tabular}
\end{center}
\end{small}
Note that the figures do fluctuate but their magnitudes are relatively intact. The purpose of the chart is not to
get an exact timing but to compare the two libraries. For example, in all of the tests the exact time for a 512-bit
squaring operation was not the same. The observed times were all approximately 3,500 cycles, more importantly they
were always faster than the timings observed with MPI by about the same magnitude.
\subsection{Digit Size}
The first major constribution to the time savings is the fact that 28 bits are stored per digit instead of the MPI
defualt of 16. This means in many of the algorithms the savings can be considerable. Consider a baseline multiplier

View File

@ -1,3 +1,10 @@
Dec 29th, 2002
v0.04 -- Fixed a memory leak in mp_to_unsigned_bin
-- optimized invmod code
-- Fixed bug in mp_div
-- use exchange instead of copy for results
-- added a bit more to the manual
Dec 27th, 2002
v0.03 -- Sped up s_mp_mul_high_digs by not computing the carries of the lower digits
-- Fixed a bug where mp_set_int wouldn't zero the value first and set the used member.

103
demo.c
View File

@ -21,15 +21,20 @@ void reset(void) { _tt = clock(); }
unsigned long long rdtsc(void) { return clock() - _tt; }
#endif
void draw(mp_int *a)
void ndraw(mp_int *a, char *name)
{
char buf[4096];
printf("a->used == %d\na->alloc == %d\na->sign == %d\n", a->used, a->alloc, a->sign);
printf("%s: ", name);
mp_toradix(a, buf, 10);
printf("num == %s\n", buf);
printf("\n");
printf("%s\n", buf);
}
static void draw(mp_int *a)
{
ndraw(a, "");
}
unsigned long lfsr = 0xAAAAAAAAUL;
int lbit(void)
@ -43,7 +48,18 @@ int lbit(void)
}
}
#ifdef U_MPI
int mp_reduce_setup(mp_int *a, mp_int *b)
{
int res;
mp_set(a, 1);
if ((res = s_mp_lshd(a, b->used * 2)) != MP_OKAY) {
return res;
}
return mp_div(a, b, a, NULL);
}
#endif
int main(void)
{
@ -51,7 +67,6 @@ int main(void)
unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
unsigned char cmd[4096], buf[4096];
int rr;
mp_digit tom;
#ifdef TIMER
int n;
@ -63,35 +78,63 @@ int main(void)
mp_init(&c);
mp_init(&d);
mp_init(&e);
mp_init(&f);
mp_init(&f);
mp_read_radix(&a, "-2", 10);
mp_read_radix(&b, "2", 10);
mp_expt_d(&a, 3, &a);
draw(&a);
mp_read_radix(&a, "V//////////////////////////////////////////////////////////////////////////////////////", 64);
mp_reduce_setup(&b, &a);
printf("\n\n----\n\n");
mp_toradix(&b, buf, 10);
printf("b == %s\n\n\n", buf);
mp_read_radix(&b, "4982748972349724892742", 10);
mp_sub_d(&a, 1, &c);
mp_exptmod(&b, &c, &a, &d);
mp_toradix(&d, buf, 10);
printf("b^p-1 == %s\n", buf);
#ifdef TIMER
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
mp_read_radix(&b, "234892374891378913789237289378973232333", 10);
while (a.used * DIGIT_BIT < 8192) {
reset();
for (rr = 0; rr < 100000; rr++) {
for (rr = 0; rr < 1000; rr++) {
mp_invmod(&b, &a, &c);
}
tt = rdtsc();
mp_mulmod(&b, &c, &a, &d);
if (mp_cmp_d(&d, 1) != MP_EQ) {
printf("Failed to invert\n");
return 0;
}
printf("Inverting mod %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)1000));
mp_sqr(&a, &a);
mp_sqr(&b, &b);
}
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
while (a.used * DIGIT_BIT < 8192) {
reset();
for (rr = 0; rr < 1000000; rr++) {
mp_mul(&a, &a, &b);
}
tt = rdtsc();
printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)100000));
printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)1000000));
mp_copy(&b, &a);
}
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
while (a.used * DIGIT_BIT < 8192) {
reset();
for (rr = 0; rr < 100000; rr++) {
for (rr = 0; rr < 1000000; rr++) {
mp_sqr(&a, &b);
}
tt = rdtsc();
printf("Squaring %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)100000));
printf("Squaring %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)1000000));
mp_copy(&b, &a);
}
@ -117,7 +160,7 @@ int main(void)
mp_mod(&b, &c, &b);
mp_set(&c, 3);
reset();
for (rr = 0; rr < 35; rr++) {
for (rr = 0; rr < 50; rr++) {
mp_exptmod(&c, &b, &a, &d);
}
tt = rdtsc();
@ -130,7 +173,7 @@ int main(void)
draw(&d);
exit(0);
}
printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)35));
printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)50));
}
}
@ -141,7 +184,7 @@ int main(void)
printf("%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu\r", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n);
fgets(cmd, 4095, stdin);
cmd[strlen(cmd)-1] = 0;
printf("%s ]\r",cmd);
printf("%s ]\r",cmd); fflush(stdout);
if (!strcmp(cmd, "mul2d")) { ++mul2d_n;
fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10);
fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
@ -173,7 +216,8 @@ int main(void)
fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10);
fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10);
fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10);
mp_add(&a, &b, &d);
mp_copy(&a, &d);
mp_add(&d, &b, &d);
if (mp_cmp(&c, &d) != MP_EQ) {
printf("add %lu failure!\n", add_n);
draw(&a);draw(&b);draw(&c);draw(&d);
@ -204,13 +248,13 @@ draw(&a);draw(&b);draw(&c);draw(&d);
draw(&d);
return 0;
}
} else if (!strcmp(cmd, "sub")) { ++sub_n;
fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10);
fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10);
fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10);
mp_sub(&a, &b, &d);
mp_copy(&a, &d);
mp_sub(&d, &b, &d);
if (mp_cmp(&c, &d) != MP_EQ) {
printf("sub %lu failure!\n", sub_n);
draw(&a);draw(&b);draw(&c);draw(&d);
@ -220,7 +264,8 @@ draw(&a);draw(&b);draw(&c);draw(&d);
fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10);
fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10);
fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10);
mp_mul(&a, &b, &d);
mp_copy(&a, &d);
mp_mul(&d, &b, &d);
if (mp_cmp(&c, &d) != MP_EQ) {
printf("mul %lu failure!\n", mul_n);
draw(&a);draw(&b);draw(&c);draw(&d);
@ -242,7 +287,8 @@ draw(&a);draw(&b);draw(&c);draw(&d); draw(&e); draw(&f);
} else if (!strcmp(cmd, "sqr")) { ++sqr_n;
fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10);
fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10);
mp_sqr(&a, &c);
mp_copy(&a, &c);
mp_sqr(&c, &c);
if (mp_cmp(&b, &c) != MP_EQ) {
printf("sqr %lu failure!\n", sqr_n);
draw(&a);draw(&b);draw(&c);
@ -252,7 +298,8 @@ draw(&a);draw(&b);draw(&c);
fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10);
fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10);
fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10);
mp_gcd(&a, &b, &d);
mp_copy(&a, &d);
mp_gcd(&d, &b, &d);
d.sign = c.sign;
if (mp_cmp(&c, &d) != MP_EQ) {
printf("gcd %lu failure!\n", gcd_n);
@ -263,7 +310,8 @@ draw(&a);draw(&b);draw(&c);draw(&d);
fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10);
fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10);
fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10);
mp_lcm(&a, &b, &d);
mp_copy(&a, &d);
mp_lcm(&d, &b, &d);
d.sign = c.sign;
if (mp_cmp(&c, &d) != MP_EQ) {
printf("lcm %lu failure!\n", lcm_n);
@ -275,7 +323,8 @@ draw(&a);draw(&b);draw(&c);draw(&d);
fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10);
fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10);
fgets(buf, 4095, stdin); mp_read_radix(&d, buf, 10);
mp_exptmod(&a, &b, &c, &e);
mp_copy(&a, &e);
mp_exptmod(&e, &b, &c, &e);
if (mp_cmp(&d, &e) != MP_EQ) {
printf("expt %lu failure!\n", expt_n);
draw(&a);draw(&b);draw(&c);draw(&d); draw(&e);

View File

@ -1,7 +1,7 @@
CC = gcc
CFLAGS += -Wall -W -O3 -funroll-loops
CFLAGS += -DDEBUG -Wall -W -Os
VERSION=0.03
VERSION=0.04
default: test

View File

@ -41,7 +41,7 @@ void rand_num(mp_int *a)
unsigned char buf[512];
top:
size = 1 + (fgetc(rng) % 96);
size = 1 + ((fgetc(rng)*fgetc(rng)) % 32);
buf[0] = (fgetc(rng)&1)?1:0;
fread(buf+1, 1, size, rng);
for (n = 0; n < size; n++) {
@ -57,7 +57,7 @@ void rand_num2(mp_int *a)
unsigned char buf[512];
top:
size = 1 + (fgetc(rng) % 128);
size = 1 + ((fgetc(rng)*fgetc(rng)) % 32);
buf[0] = (fgetc(rng)&1)?1:0;
fread(buf+1, 1, size, rng);
for (n = 0; n < size; n++) {
@ -196,7 +196,7 @@ int main(void)
mp_todecimal(&c, buf);
printf("%s\n", buf);
} else if (n == 9) {
/* lcm test */
/* exptmod test */
rand_num2(&a);
rand_num2(&b);
rand_num2(&c);
@ -216,8 +216,10 @@ int main(void)
rand_num2(&a);
rand_num2(&b);
b.sign = MP_ZPOS;
a.sign = MP_ZPOS;
mp_gcd(&a, &b, &c);
if (mp_cmp_d(&c, 1) != 0) continue;
if (mp_cmp_d(&b, 1) == 0) continue;
mp_invmod(&a, &b, &c);
printf("invmod\n");
mp_todecimal(&a, buf);