diff --git a/bn.pdf b/bn.pdf index df69417..6b5a2d1 100644 Binary files a/bn.pdf and b/bn.pdf differ diff --git a/bn.tex b/bn.tex index b5f0227..5c40174 100644 --- a/bn.tex +++ b/bn.tex @@ -1,7 +1,7 @@ \documentclass{article} \begin{document} -\title{LibTomMath v0.12 \\ A Free Multiple Precision Integer Library} +\title{LibTomMath v0.13 \\ A Free Multiple Precision Integer Library} \author{Tom St Denis \\ tomstdenis@iahu.ca} \maketitle \newpage @@ -409,7 +409,6 @@ of $c$ is the maximum length of the two inputs. Computes $c = a \land b$, pseudo-extends with zeroes whichever of $a$ or $b$ is shorter such that the length of $c$ is the maximum length of the two inputs. - \subsection{Basic Arithmetic} \subsubsection{mp\_cmp(mp\_int *a, mp\_int *b)} @@ -440,19 +439,18 @@ This function requires no additional memory and $O(N)$ time. Computes $c = a \cdot b$ using signed arithmetic. Handles the sign of the numbers correctly which means it will correct the sign of the product as required, e.g. $a \cdot -b$ turns into $-ab$. -For relatively small inputs, that is less than 80 digits a standard baseline or comba-baseline multiplier is used. It -requires no additional memory and $O(N^2)$ time. The comba-baseline multiplier is only used if it can safely be used -without losing carry digits. The comba method is faster than the baseline method but cannot always be used which is why -both are provided. The code will automatically determine when it can be used. If the digit count is higher -than 80 for the inputs than a Karatsuba multiplier is used which requires approximately $O(6 \cdot N)$ memory and -$O(N^{lg(3)})$ time. +This function requires $O(N^2)$ time for small inputs and $O(N^{1.584})$ time for relatively large +inputs (\textit{above the }KARATSUBA\_MUL\_CUTOFF \textit{value defined in bncore.c.}). There is +considerable overhead in the Karatsuba method which only pays off when the digit count is fairly high +(\textit{typically around 80}). For small inputs the function requires $O(2N)$ memory, otherwise it +requires $O(6 \cdot \mbox{lg}(N) \cdot N)$ memory. + \subsubsection{mp\_sqr(mp\_int *a, mp\_int *b)} -Computes $b = a^2$. -For relatively small inputs, that is less than 80 digits a modified squaring or comba-squaring algorithm is used. It -requires no additional memory and $O((N^2 + N)/2)$ time. The comba-squaring method is used only if it can be safely used -without losing carry digits. After 80 digits a Karatsuba squaring algorithm is used whcih requires approximately -$O(4 \cdot N)$ memory and $O(N^{lg(3)})$ time. +Computes $b = a^2$ and fixes the sign of $b$ to be positive. + +This function has a running time and memory requirement profile very similar to that of the +mp\_mul function. It is always faster and uses less memory for the larger inputs. \subsubsection{mp\_div(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)} Computes $c = \lfloor a/b \rfloor$ and $d \equiv a \mbox{ (mod }b\mbox{)}$. The division is signed which means the sign @@ -482,7 +480,8 @@ Also note that these functions use mp\_mod which means the result are guaranteed \subsubsection{mp\_invmod(mp\_int *a, mp\_int *b, mp\_int *c)} This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$. When -$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast. +$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast. If no inverse is found (e.g. $(a, b) \ne 1$) then +the function returns \textbf{MP\_VAL} and the result in $c$ is undefined. \subsubsection{mp\_gcd(mp\_int *a, mp\_int *b, mp\_int *c)} Finds the greatest common divisor of both $a$ and $b$ and places the result in $c$. Will work with either positive @@ -497,13 +496,13 @@ both. Functions requires no additional memory and approximately $O(4 \cdot N^2)$ time. -\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int c)} +\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int *c)} Finds the $b$'th root of $a$ and stores it in $b$. The roots are found such that $\vert c \vert^b \le \vert a \vert$. Uses the Newton approximation approach which means it converges in $O(log \beta^N)$ time to a final result. Each iteration requires $b$ multiplications and one division for a total work of $O(6N^2 \cdot log \beta^N) = O(6N^3 \cdot log \beta)$. -If the input $a$ is negative and $b$ is even the function returns an error. Otherwise the function will return a root -that has a sign that agrees with the sign of $a$. +If the input $a$ is negative and $b$ is even the function returns \textbf{MP\_VAL}. Otherwise the function will +return a root that has a sign that agrees with the sign of $a$. \subsubsection{mp\_jacobi(mp\_int *a, mp\_int *n, int *c)} Computes $c = \left ( {a \over n} \right )$ or the Jacobi function of $(a, n)$ and stores the result in an integer addressed diff --git a/bn_fast_mp_invmod.c b/bn_fast_mp_invmod.c index e1dcce3..249ff43 100644 --- a/bn_fast_mp_invmod.c +++ b/bn_fast_mp_invmod.c @@ -14,13 +14,17 @@ */ #include -/* computes the modular inverse via binary extended euclidean algorithm, that is c = 1/a mod b */ +/* computes the modular inverse via binary extended euclidean algorithm, + * that is c = 1/a mod b + * + * Based on mp_invmod except this is optimized for the case where b is + * odd as per HAC Note 14.64 on pp. 610 + */ int fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c) { - mp_int x, y, u, v, B, D; - int res, neg; - + mp_int x, y, u, v, B, D; + int res, neg; if ((res = mp_init (&x)) != MP_OKAY) { goto __ERR; @@ -58,7 +62,10 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c) goto __D; } - /* 2. [modified] if x,y are both even then return an error! */ + /* 2. [modified] if x,y are both even then return an error! + * + * That is if gcd(x,y) = 2 * k then obviously there is no inverse. + */ if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) { res = MP_VAL; goto __D; @@ -135,8 +142,9 @@ top: } /* if not zero goto step 4 */ - if (mp_iszero (&u) == 0) + if (mp_iszero (&u) == 0) { goto top; + } /* now a = C, b = D, gcd == g*v */ diff --git a/bn_fast_mp_montgomery_reduce.c b/bn_fast_mp_montgomery_reduce.c index dbc3478..2e03936 100644 --- a/bn_fast_mp_montgomery_reduce.c +++ b/bn_fast_mp_montgomery_reduce.c @@ -14,12 +14,19 @@ */ #include -/* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */ +/* computes xR^-1 == x (mod N) via Montgomery Reduction + * + * This is an optimized implementation of mp_montgomery_reduce + * which uses the comba method to quickly calculate the columns of the + * reduction. + * + * Based on Algorithm 14.32 on pp.601 of HAC. +*/ int fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp) { - int ix, res, olduse; - mp_word W[512]; + int ix, res, olduse; + mp_word W[512]; /* get old used count */ olduse = a->used; @@ -31,14 +38,22 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp) } } - /* copy the digits of a */ - for (ix = 0; ix < a->used; ix++) { - W[ix] = a->dp[ix]; - } + { + register mp_word *_W; + register mp_digit *tmpa; - /* zero the high words */ - for (; ix < m->used * 2 + 1; ix++) { - W[ix] = 0; + _W = W; + tmpa = a->dp; + + /* copy the digits of a */ + for (ix = 0; ix < a->used; ix++) { + *_W++ = *tmpa++; + } + + /* zero the high words */ + for (; ix < m->used * 2 + 1; ix++) { + *_W++ = 0; + } } for (ix = 0; ix < m->used; ix++) { @@ -69,8 +84,10 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp) register mp_digit *tmpx; register mp_word *_W; - /* aliases */ + /* alias for the digits of the modulus */ tmpx = m->dp; + + /* Alias for the columns set by an offset of ix */ _W = W + ix; /* inner loop */ @@ -88,24 +105,32 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp) W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT)); } - /* copy out, A = A/b^n - * - * The result is A/b^n but instead of converting from an array of mp_word - * to mp_digit than calling mp_rshd we just copy them in the right - * order - */ - for (ix = 0; ix < m->used + 1; ix++) { - a->dp[ix] = W[ix + m->used] & ((mp_word) MP_MASK); + { + register mp_digit *tmpa; + register mp_word *_W; + + /* copy out, A = A/b^n + * + * The result is A/b^n but instead of converting from an array of mp_word + * to mp_digit than calling mp_rshd we just copy them in the right + * order + */ + tmpa = a->dp; + _W = W + m->used; + + for (ix = 0; ix < m->used + 1; ix++) { + *tmpa++ = *_W++ & ((mp_word) MP_MASK); + } + + /* zero oldused digits, if the input a was larger than + * m->used+1 we'll have to clear the digits */ + for (; ix < olduse; ix++) { + *tmpa++ = 0; + } } - /* set the max used */ + /* set the max used and clamp */ a->used = m->used + 1; - - /* zero oldused digits, if the input a was larger than - * m->used+1 we'll have to clear the digits */ - for (; ix < olduse; ix++) { - a->dp[ix] = 0; - } mp_clamp (a); /* if A >= m then A = A - m */ diff --git a/bn_fast_s_mp_mul_digs.c b/bn_fast_s_mp_mul_digs.c index c433dd7..dc0c33e 100644 --- a/bn_fast_s_mp_mul_digs.c +++ b/bn_fast_s_mp_mul_digs.c @@ -21,13 +21,20 @@ * has the effect of making the nested loops that compute the columns very * simple and schedulable on super-scalar processors. * + * This has been modified to produce a variable number of digits of output so + * if say only a half-product is required you don't have to compute the upper half + * (a feature required for fast Barrett reduction). + * + * Based on Algorithm 14.12 on pp.595 of HAC. + * */ int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) { - int olduse, res, pa, ix; - mp_word W[512]; + int olduse, res, pa, ix; + mp_word W[512]; + /* grow the destination as required */ if (c->alloc < digs) { if ((res = mp_grow (c, digs)) != MP_OKAY) { return res; @@ -43,11 +50,9 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) /* this multiplier has been modified to allow you to control how many digits * of output are produced. So at most we want to make upto "digs" digits - * of output - */ - - - /* this adds products to distinct columns (at ix+iy) of W + * of output. + * + * this adds products to distinct columns (at ix+iy) of W * note that each step through the loop is not dependent on * the previous which means the compiler can easily unroll * the loop without scheduling problems @@ -85,27 +90,30 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) olduse = c->used; c->used = digs; + { + register mp_digit *tmpc; - /* At this point W[] contains the sums of each column. To get the - * correct result we must take the extra bits from each column and - * carry them down - * - * Note that while this adds extra code to the multiplier it saves time - * since the carry propagation is removed from the above nested loop. - * This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to - * N^2 + N*c where c is the cost of the shifting. On very small numbers - * this is slower but on most cryptographic size numbers it is faster. - */ + /* At this point W[] contains the sums of each column. To get the + * correct result we must take the extra bits from each column and + * carry them down + * + * Note that while this adds extra code to the multiplier it saves time + * since the carry propagation is removed from the above nested loop. + * This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to + * N^2 + N*c where c is the cost of the shifting. On very small numbers + * this is slower but on most cryptographic size numbers it is faster. + */ + tmpc = c->dp; + for (ix = 1; ix < digs; ix++) { + W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT)); + *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); + } + *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK)); - for (ix = 1; ix < digs; ix++) { - W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT)); - c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); - } - c->dp[digs - 1] = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK)); - - /* clear unused */ - for (; ix < olduse; ix++) { - c->dp[ix] = 0; + /* clear unused */ + for (; ix < olduse; ix++) { + *tmpc++ = 0; + } } mp_clamp (c); diff --git a/bn_fast_s_mp_mul_high_digs.c b/bn_fast_s_mp_mul_high_digs.c index cc30386..3458d96 100644 --- a/bn_fast_s_mp_mul_high_digs.c +++ b/bn_fast_s_mp_mul_high_digs.c @@ -20,14 +20,16 @@ * * This is used in the Barrett reduction since for one of the multiplications * only the higher digits were needed. This essentially halves the work. + * + * Based on Algorithm 14.12 on pp.595 of HAC. */ int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs) { - int oldused, newused, res, pa, pb, ix; - mp_word W[512]; - + int oldused, newused, res, pa, pb, ix; + mp_word W[512]; + /* calculate size of product and allocate more space if required */ newused = a->used + b->used + 1; if (c->alloc < newused) { if ((res = mp_grow (c, newused)) != MP_OKAY) { @@ -38,7 +40,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs) /* like the other comba method we compute the columns first */ pa = a->used; pb = b->used; - memset (&W[digs], 0, (pa + pb + 1 - digs) * sizeof (mp_word)); + memset (W + digs, 0, (pa + pb + 1 - digs) * sizeof (mp_word)); for (ix = 0; ix < pa; ix++) { { register mp_digit tmpx, *tmpy; @@ -75,8 +77,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs) W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT)); c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); } - c->dp[(pa + pb + 1) - 1] = - (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK)); + c->dp[(pa + pb + 1) - 1] = (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK)); for (; ix < oldused; ix++) { c->dp[ix] = 0; diff --git a/bn_fast_s_mp_sqr.c b/bn_fast_s_mp_sqr.c index c4f786d..2b945ba 100644 --- a/bn_fast_s_mp_sqr.c +++ b/bn_fast_s_mp_sqr.c @@ -26,14 +26,16 @@ * "A * B * 2". The *2 part does not need to be computed until the end which is * good because 64-bit shifts are slow! * + * Based on Algorithm 14.16 on pp.597 of HAC. * */ int fast_s_mp_sqr (mp_int * a, mp_int * b) { - int olduse, newused, res, ix, pa; - mp_word W2[512], W[512]; + int olduse, newused, res, ix, pa; + mp_word W2[512], W[512]; + /* calculate size of product and allocate as required */ pa = a->used; newused = pa + pa + 1; if (b->alloc < newused) { @@ -51,15 +53,31 @@ fast_s_mp_sqr (mp_int * a, mp_int * b) * the inner product can be doubled using n doublings instead of * n^2 */ - memset (W, 0, newused * sizeof (mp_word)); + memset (W, 0, newused * sizeof (mp_word)); memset (W2, 0, newused * sizeof (mp_word)); +/* note optimization + * values in W2 are only written in even locations which means + * we can collapse the array to 256 words [and fixup the memset above] + * provided we also fix up the summations below. Ideally + * the fixup loop should be unrolled twice to handle the even/odd + * cases, and then a final step to handle odd cases [e.g. newused == odd] + * + * This will not only save ~8*256 = 2KB of stack but lower the number of + * operations required to finally fix up the columns + */ + /* This computes the inner product. To simplify the inner N^2 loop * the multiplication by two is done afterwards in the N loop. */ for (ix = 0; ix < pa; ix++) { - /* compute the outer product */ - W2[ix + ix] += ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]); + /* compute the outer product + * + * Note that every outer product is computed + * for a particular column only once which means that + * there is no need todo a double precision addition + */ + W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]); { register mp_digit tmpx, *tmpy; @@ -90,22 +108,25 @@ fast_s_mp_sqr (mp_int * a, mp_int * b) W[0] += W[0] + W2[0]; /* now compute digits */ - for (ix = 1; ix < newused; ix++) { - /* double/add next digit */ - W[ix] += W[ix] + W2[ix]; + { + register mp_digit *tmpb; - W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT)); - b->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); + tmpb = b->dp; + + for (ix = 1; ix < newused; ix++) { + /* double/add next digit */ + W[ix] += W[ix] + W2[ix]; + + W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT)); + *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); + } + *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK)); + + /* clear high */ + for (; ix < olduse; ix++) { + *tmpb++ = 0; + } } - b->dp[(newused) - 1] = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK)); - - /* clear high */ - for (; ix < olduse; ix++) { - b->dp[ix] = 0; - } - - /* fix the sign (since we no longer make a fresh temp) */ - b->sign = MP_ZPOS; mp_clamp (b); return MP_OKAY; diff --git a/bn_mp_2expt.c b/bn_mp_2expt.c index ace1b99..71d04e9 100644 --- a/bn_mp_2expt.c +++ b/bn_mp_2expt.c @@ -14,11 +14,15 @@ */ #include -/* computes a = 2^b */ +/* computes a = 2^b + * + * Simple algorithm which zeroes the int, grows it then just sets one bit + * as required. + */ int mp_2expt (mp_int * a, int b) { - int res; + int res; mp_zero (a); if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) { diff --git a/bn_mp_abs.c b/bn_mp_abs.c index 299d3b6..9e6956e 100644 --- a/bn_mp_abs.c +++ b/bn_mp_abs.c @@ -14,11 +14,14 @@ */ #include -/* b = |a| */ +/* b = |a| + * + * Simple function copies the input and fixes the sign to positive + */ int mp_abs (mp_int * a, mp_int * b) { - int res; + int res; if ((res = mp_copy (a, b)) != MP_OKAY) { return res; } diff --git a/bn_mp_add.c b/bn_mp_add.c index a2ad4fd..a8addfb 100644 --- a/bn_mp_add.c +++ b/bn_mp_add.c @@ -18,9 +18,9 @@ int mp_add (mp_int * a, mp_int * b, mp_int * c) { - int sa, sb, res; - + int sa, sb, res; + /* get sign of both inputs */ sa = a->sign; sb = b->sign; diff --git a/bn_mp_add_d.c b/bn_mp_add_d.c index 0391bc1..1b30fa4 100644 --- a/bn_mp_add_d.c +++ b/bn_mp_add_d.c @@ -18,9 +18,8 @@ int mp_add_d (mp_int * a, mp_digit b, mp_int * c) { - mp_int t; - int res; - + mp_int t; + int res; if ((res = mp_init (&t)) != MP_OKAY) { return res; diff --git a/bn_mp_addmod.c b/bn_mp_addmod.c index adce5d5..abc3719 100644 --- a/bn_mp_addmod.c +++ b/bn_mp_addmod.c @@ -18,9 +18,8 @@ int mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d) { - int res; - mp_int t; - + int res; + mp_int t; if ((res = mp_init (&t)) != MP_OKAY) { return res; diff --git a/bn_mp_and.c b/bn_mp_and.c index d153c44..6c05d68 100644 --- a/bn_mp_and.c +++ b/bn_mp_and.c @@ -18,8 +18,8 @@ int mp_and (mp_int * a, mp_int * b, mp_int * c) { - int res, ix, px; - mp_int t, *x; + int res, ix, px; + mp_int t, *x; if (a->used > b->used) { if ((res = mp_init_copy (&t, a)) != MP_OKAY) { diff --git a/bn_mp_clamp.c b/bn_mp_clamp.c index 77f3e1a..3741f62 100644 --- a/bn_mp_clamp.c +++ b/bn_mp_clamp.c @@ -14,7 +14,13 @@ */ #include -/* trim unused digits */ +/* trim unused digits + * + * This is used to ensure that leading zero digits are + * trimed and the leading "used" digit will be non-zero + * Typically very fast. Also fixes the sign if there + * are no more leading digits + */ void mp_clamp (mp_int * a) { diff --git a/bn_mp_cmp.c b/bn_mp_cmp.c index 527d35c..ca0c463 100644 --- a/bn_mp_cmp.c +++ b/bn_mp_cmp.c @@ -18,13 +18,11 @@ int mp_cmp (mp_int * a, mp_int * b) { - int res; /* compare based on sign */ if (a->sign == MP_NEG && b->sign == MP_ZPOS) { return MP_LT; } else if (a->sign == MP_ZPOS && b->sign == MP_NEG) { return MP_GT; } - res = mp_cmp_mag (a, b); - return res; + return mp_cmp_mag (a, b); } diff --git a/bn_mp_cmp_mag.c b/bn_mp_cmp_mag.c index efe5b2b..6d4a02d 100644 --- a/bn_mp_cmp_mag.c +++ b/bn_mp_cmp_mag.c @@ -18,8 +18,7 @@ int mp_cmp_mag (mp_int * a, mp_int * b) { - int n; - + int n; /* compare based on # of non-zero digits */ if (a->used > b->used) { diff --git a/bn_mp_copy.c b/bn_mp_copy.c index a502620..68705a4 100644 --- a/bn_mp_copy.c +++ b/bn_mp_copy.c @@ -18,8 +18,7 @@ int mp_copy (mp_int * a, mp_int * b) { - int res, n; - + int res, n; /* if dst == src do nothing */ if (a == b || a->dp == b->dp) { @@ -35,14 +34,21 @@ mp_copy (mp_int * a, mp_int * b) b->used = a->used; b->sign = a->sign; - /* copy all the digits */ - for (n = 0; n < a->used; n++) { - b->dp[n] = a->dp[n]; - } + { + register mp_digit *tmpa, *tmpb; - /* clear high digits */ - for (n = b->used; n < b->alloc; n++) { - b->dp[n] = 0; + tmpa = a->dp; + tmpb = b->dp; + + /* copy all the digits */ + for (n = 0; n < a->used; n++) { + *tmpb++ = *tmpa++; + } + + /* clear high digits */ + for (; n < b->alloc; n++) { + *tmpb++ = 0; + } } return MP_OKAY; } diff --git a/bn_mp_count_bits.c b/bn_mp_count_bits.c index 3f6ed28..09992d4 100644 --- a/bn_mp_count_bits.c +++ b/bn_mp_count_bits.c @@ -18,8 +18,8 @@ int mp_count_bits (mp_int * a) { - int r; - mp_digit q; + int r; + mp_digit q; if (a->used == 0) { return 0; diff --git a/bn_mp_div.c b/bn_mp_div.c index b324d2d..3954c9f 100644 --- a/bn_mp_div.c +++ b/bn_mp_div.c @@ -26,8 +26,8 @@ int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d) { - mp_int q, x, y, t1, t2; - int res, n, t, i, norm, neg; + mp_int q, x, y, t1, t2; + int res, n, t, i, norm, neg; /* is divisor zero ? */ @@ -75,13 +75,12 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d) /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */ norm = 0; - while ((y.dp[y.used - 1] & (((mp_digit) 1) << (DIGIT_BIT - 1))) == - ((mp_digit) 0)) { + while ((y.dp[y.used - 1] & (((mp_digit) 1) << (DIGIT_BIT - 1))) == ((mp_digit) 0)) { ++norm; - if ((res = mp_mul_2d (&x, 1, &x)) != MP_OKAY) { + if ((res = mp_mul_2 (&x, &x)) != MP_OKAY) { goto __Y; } - if ((res = mp_mul_2d (&y, 1, &y)) != MP_OKAY) { + if ((res = mp_mul_2 (&y, &y)) != MP_OKAY) { goto __Y; } } @@ -114,7 +113,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d) if (x.dp[i] == y.dp[t]) { q.dp[i - t - 1] = ((1UL << DIGIT_BIT) - 1UL); } else { - mp_word tmp; + mp_word tmp; tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT); tmp |= ((mp_word) x.dp[i - 1]); tmp /= ((mp_word) y.dp[t]); @@ -142,8 +141,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d) t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1]; t2.dp[2] = x.dp[i]; t2.used = 3; - } - while (mp_cmp (&t1, &t2) == MP_GT); + } while (mp_cmp (&t1, &t2) == MP_GT); /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */ if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) { diff --git a/bn_mp_div_2.c b/bn_mp_div_2.c index bc7dc28..284f330 100644 --- a/bn_mp_div_2.c +++ b/bn_mp_div_2.c @@ -18,20 +18,33 @@ int mp_div_2 (mp_int * a, mp_int * b) { - mp_digit r, rr; - int x, res; - + int x, res, oldused; /* copy */ - if ((res = mp_copy (a, b)) != MP_OKAY) { - return res; + if (b->alloc < a->used) { + if ((res = mp_grow (b, a->used)) != MP_OKAY) { + return res; + } } - r = 0; - for (x = b->used - 1; x >= 0; x--) { - rr = b->dp[x] & 1; - b->dp[x] = (b->dp[x] >> 1) | (r << (DIGIT_BIT - 1)); - r = rr; + oldused = b->used; + b->used = a->used; + { + register mp_digit r, rr, *tmpa, *tmpb; + + tmpa = a->dp + b->used - 1; + tmpb = b->dp + b->used - 1; + r = 0; + for (x = b->used - 1; x >= 0; x--) { + rr = *tmpa & 1; + *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1)); + r = rr; + } + + tmpb = b->dp + b->used; + for (x = b->used; x < oldused; x++) { + *tmpb++ = 0; + } } mp_clamp (b); return MP_OKAY; diff --git a/bn_mp_div_2d.c b/bn_mp_div_2d.c index 3bfa5aa..9d24e10 100644 --- a/bn_mp_div_2d.c +++ b/bn_mp_div_2d.c @@ -18,9 +18,9 @@ int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d) { - mp_digit D, r, rr; - int x, res; - mp_int t; + mp_digit D, r, rr; + int x, res; + mp_int t; /* if the shift count is <= 0 then we do no work */ diff --git a/bn_mp_div_d.c b/bn_mp_div_d.c index 321f030..e43802a 100644 --- a/bn_mp_div_d.c +++ b/bn_mp_div_d.c @@ -18,8 +18,8 @@ int mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d) { - mp_int t, t2; - int res; + mp_int t, t2; + int res; if ((res = mp_init (&t)) != MP_OKAY) { diff --git a/bn_mp_exch.c b/bn_mp_exch.c index 44e7087..2ccaf9e 100644 --- a/bn_mp_exch.c +++ b/bn_mp_exch.c @@ -17,7 +17,7 @@ void mp_exch (mp_int * a, mp_int * b) { - mp_int t; + mp_int t; t = *a; *a = *b; diff --git a/bn_mp_expt_d.c b/bn_mp_expt_d.c index 80f9578..e5106be 100644 --- a/bn_mp_expt_d.c +++ b/bn_mp_expt_d.c @@ -17,8 +17,8 @@ int mp_expt_d (mp_int * a, mp_digit b, mp_int * c) { - int res, x; - mp_int g; + int res, x; + mp_int g; if ((res = mp_init_copy (&g, a)) != MP_OKAY) { diff --git a/bn_mp_exptmod.c b/bn_mp_exptmod.c index ce81953..ea58a4f 100644 --- a/bn_mp_exptmod.c +++ b/bn_mp_exptmod.c @@ -14,19 +14,30 @@ */ #include +static int f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y); + +/* this is a shell function that calls either the normal or Montgomery + * exptmod functions. Originally the call to the montgomery code was + * embedded in the normal function but that wasted alot of stack space + * for nothing (since 99% of the time the Montgomery code would be called) + */ int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) { - mp_int M[256], res, mu; - mp_digit buf; - int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; - - /* if the modulus is odd use the fast method */ if (mp_isodd (P) == 1 && P->used > 4 && P->used < MONTGOMERY_EXPT_CUTOFF) { - err = mp_exptmod_fast (G, X, P, Y); - return err; + return mp_exptmod_fast (G, X, P, Y); + } else { + return f_mp_exptmod (G, X, P, Y); } +} + +static int +f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) +{ + mp_int M[256], res, mu; + mp_digit buf; + int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; /* find window size */ x = mp_count_bits (X); @@ -80,9 +91,7 @@ mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) } for (x = 0; x < (winsize - 1); x++) { - if ((err = - mp_sqr (&M[1 << (winsize - 1)], - &M[1 << (winsize - 1)])) != MP_OKAY) { + if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) { goto __MU; } if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) { diff --git a/bn_mp_exptmod_fast.c b/bn_mp_exptmod_fast.c index bd25b3e..fe0dfcb 100644 --- a/bn_mp_exptmod_fast.c +++ b/bn_mp_exptmod_fast.c @@ -24,9 +24,9 @@ int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) { - mp_int M[256], res; - mp_digit buf, mp; - int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; + mp_int M[256], res; + mp_digit buf, mp; + int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; /* find window size */ x = mp_count_bits (X); @@ -48,7 +48,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) /* init G array */ for (x = 0; x < (1 << winsize); x++) { - if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) { + if ((err = mp_init (&M[x])) != MP_OKAY) { for (y = 0; y < x; y++) { mp_clear (&M[y]); } @@ -66,44 +66,32 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) goto __RES; } - /* now we need R mod m */ - if ((err = mp_2expt (&res, P->used * DIGIT_BIT)) != MP_OKAY) { - goto __RES; - } - - /* res = R mod m (can use modified double/subtract ...) */ - if ((err = mp_mod (&res, P, &res)) != MP_OKAY) { - goto __RES; - } - /* create M table * * The M table contains powers of the input base, e.g. M[x] = G^x mod P * * The first half of the table is not computed though accept for M[0] and M[1] */ - if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) { + + /* now we need R mod m */ + if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) { goto __RES; } /* now set M[1] to G * R mod m */ - if ((err = mp_mulmod (&M[1], &res, P, &M[1])) != MP_OKAY) { + if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) { goto __RES; } - /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */ if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) { goto __RES; } for (x = 0; x < (winsize - 1); x++) { - if ((err = - mp_sqr (&M[1 << (winsize - 1)], - &M[1 << (winsize - 1)])) != MP_OKAY) { + if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) { goto __RES; } - if ((err = - mp_montgomery_reduce (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) { + if ((err = mp_montgomery_reduce (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) { goto __RES; } } diff --git a/bn_mp_gcd.c b/bn_mp_gcd.c index e2594b5..35b287e 100644 --- a/bn_mp_gcd.c +++ b/bn_mp_gcd.c @@ -19,8 +19,8 @@ int mp_gcd (mp_int * a, mp_int * b, mp_int * c) { - mp_int u, v, t; - int k, res, neg; + mp_int u, v, t; + int k, res, neg; /* either zero than gcd is the largest */ @@ -57,10 +57,10 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c) k = 0; while ((u.dp[0] & 1) == 0 && (v.dp[0] & 1) == 0) { ++k; - if ((res = mp_div_2d (&u, 1, &u, NULL)) != MP_OKAY) { + if ((res = mp_div_2 (&u, &u)) != MP_OKAY) { goto __T; } - if ((res = mp_div_2d (&v, 1, &v, NULL)) != MP_OKAY) { + if ((res = mp_div_2 (&v, &v)) != MP_OKAY) { goto __T; } } @@ -80,7 +80,7 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c) do { /* B3 (and B4). Halve t, if even */ while (t.used != 0 && (t.dp[0] & 1) == 0) { - if ((res = mp_div_2d (&t, 1, &t, NULL)) != MP_OKAY) { + if ((res = mp_div_2 (&t, &t)) != MP_OKAY) { goto __T; } } diff --git a/bn_mp_grow.c b/bn_mp_grow.c index 2a12369..91c1867 100644 --- a/bn_mp_grow.c +++ b/bn_mp_grow.c @@ -18,8 +18,7 @@ int mp_grow (mp_int * a, int size) { - int i, n; - + int i, n; /* if the alloc size is smaller alloc more ram */ if (a->alloc < size) { diff --git a/bn_mp_init_copy.c b/bn_mp_init_copy.c index b3f25ee..f79d2b1 100644 --- a/bn_mp_init_copy.c +++ b/bn_mp_init_copy.c @@ -18,11 +18,10 @@ int mp_init_copy (mp_int * a, mp_int * b) { - int res; + int res; if ((res = mp_init (a)) != MP_OKAY) { return res; } - res = mp_copy (b, a); - return res; + return mp_copy (b, a); } diff --git a/bn_mp_invmod.c b/bn_mp_invmod.c index 1051eb0..006efd2 100644 --- a/bn_mp_invmod.c +++ b/bn_mp_invmod.c @@ -17,9 +17,8 @@ int mp_invmod (mp_int * a, mp_int * b, mp_int * c) { - mp_int x, y, u, v, A, B, C, D; - int res; - + mp_int x, y, u, v, A, B, C, D; + int res; /* b cannot be negative */ if (b->sign == MP_NEG) { @@ -28,8 +27,7 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c) /* if the modulus is odd we can use a faster routine instead */ if (mp_iseven (b) == 0) { - res = fast_mp_invmod (a, b, c); - return res; + return fast_mp_invmod (a, b, c); } if ((res = mp_init (&x)) != MP_OKAY) { diff --git a/bn_mp_jacobi.c b/bn_mp_jacobi.c index b97d5f3..95aee42 100644 --- a/bn_mp_jacobi.c +++ b/bn_mp_jacobi.c @@ -20,9 +20,9 @@ int mp_jacobi (mp_int * a, mp_int * n, int *c) { - mp_int a1, n1, e; - int s, r, res; - mp_digit residue; + mp_int a1, n1, e; + int s, r, res; + mp_digit residue; /* step 1. if a == 0, return 0 */ if (mp_iszero (a) == 1) { diff --git a/bn_mp_karatsuba_mul.c b/bn_mp_karatsuba_mul.c index 0a8c34e..bee8eaa 100644 --- a/bn_mp_karatsuba_mul.c +++ b/bn_mp_karatsuba_mul.c @@ -36,8 +36,8 @@ int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c) { - mp_int x0, x1, y0, y1, t1, t2, x0y0, x1y1; - int B, err, x; + mp_int x0, x1, y0, y1, t1, t2, x0y0, x1y1; + int B, err, x; err = MP_MEM; diff --git a/bn_mp_karatsuba_sqr.c b/bn_mp_karatsuba_sqr.c index 29396ba..3078588 100644 --- a/bn_mp_karatsuba_sqr.c +++ b/bn_mp_karatsuba_sqr.c @@ -22,8 +22,8 @@ int mp_karatsuba_sqr (mp_int * a, mp_int * b) { - mp_int x0, x1, t1, t2, x0x0, x1x1; - int B, err, x; + mp_int x0, x1, t1, t2, x0x0, x1x1; + int B, err, x; err = MP_MEM; diff --git a/bn_mp_lcm.c b/bn_mp_lcm.c index 7d38135..60d5461 100644 --- a/bn_mp_lcm.c +++ b/bn_mp_lcm.c @@ -18,8 +18,8 @@ int mp_lcm (mp_int * a, mp_int * b, mp_int * c) { - int res; - mp_int t; + int res; + mp_int t; if ((res = mp_init (&t)) != MP_OKAY) { diff --git a/bn_mp_lshd.c b/bn_mp_lshd.c index ea02409..44b0588 100644 --- a/bn_mp_lshd.c +++ b/bn_mp_lshd.c @@ -18,7 +18,7 @@ int mp_lshd (mp_int * a, int b) { - int x, res; + int x, res; /* if its less than zero return */ diff --git a/bn_mp_mod.c b/bn_mp_mod.c index 5b208a6..c4a7374 100644 --- a/bn_mp_mod.c +++ b/bn_mp_mod.c @@ -18,8 +18,8 @@ int mp_mod (mp_int * a, mp_int * b, mp_int * c) { - mp_int t; - int res; + mp_int t; + int res; if ((res = mp_init (&t)) != MP_OKAY) { diff --git a/bn_mp_mod_2d.c b/bn_mp_mod_2d.c index df73612..4c6f1f1 100644 --- a/bn_mp_mod_2d.c +++ b/bn_mp_mod_2d.c @@ -18,7 +18,7 @@ int mp_mod_2d (mp_int * a, int b, mp_int * c) { - int x, res; + int x, res; /* if b is <= 0 then zero the int */ @@ -44,8 +44,7 @@ mp_mod_2d (mp_int * a, int b, mp_int * c) } /* clear the digit that is not completely outside/inside the modulus */ c->dp[b / DIGIT_BIT] &= - (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - - ((mp_digit) 1)); + (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1)); mp_clamp (c); return MP_OKAY; } diff --git a/bn_mp_mod_d.c b/bn_mp_mod_d.c index 7b08d23..b557381 100644 --- a/bn_mp_mod_d.c +++ b/bn_mp_mod_d.c @@ -17,8 +17,8 @@ int mp_mod_d (mp_int * a, mp_digit b, mp_digit * c) { - mp_int t, t2; - int res; + mp_int t, t2; + int res; if ((res = mp_init (&t)) != MP_OKAY) { diff --git a/bn_mp_montgomery_calc_normalization.c b/bn_mp_montgomery_calc_normalization.c new file mode 100644 index 0000000..06252de --- /dev/null +++ b/bn_mp_montgomery_calc_normalization.c @@ -0,0 +1,53 @@ +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca + */ +#include + +/* calculates a = B^n mod b for Montgomery reduction + * Where B is the base [e.g. 2^DIGIT_BIT]. + * B^n mod b is computed by first computing + * A = B^(n-1) which doesn't require a reduction but a simple OR. + * then C = A * B = B^n is computed by performing upto DIGIT_BIT + * shifts with subtractions when the result is greater than b. + * + * The method is slightly modified to shift B unconditionally upto just under + * the leading bit of b. This saves alot of multiple precision shifting. + */ +int +mp_montgomery_calc_normalization (mp_int * a, mp_int * b) +{ + int x, bits, res; + + /* how many bits of last digit does b use */ + bits = mp_count_bits (b) % DIGIT_BIT; + + /* compute A = B^(n-1) * 2^(bits-1) */ + if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) { + return res; + } + + /* now compute C = A * B mod b */ + for (x = bits - 1; x < DIGIT_BIT; x++) { + if ((res = mp_mul_2 (a, a)) != MP_OKAY) { + return res; + } + if (mp_cmp_mag (a, b) != MP_LT) { + if ((res = s_mp_sub (a, b, a)) != MP_OKAY) { + return res; + } + } + } + + return MP_OKAY; +} diff --git a/bn_mp_montgomery_reduce.c b/bn_mp_montgomery_reduce.c index aeb2cde..586142a 100644 --- a/bn_mp_montgomery_reduce.c +++ b/bn_mp_montgomery_reduce.c @@ -18,14 +18,13 @@ int mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp) { - int ix, res, digs; - mp_digit ui; + int ix, res, digs; + mp_digit ui; digs = m->used * 2 + 1; if ((digs < 512) && digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { - res = fast_mp_montgomery_reduce (a, m, mp); - return res; + return fast_mp_montgomery_reduce (a, m, mp); } if (a->alloc < m->used * 2 + 1) { @@ -51,9 +50,7 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp) mu = 0; for (iy = 0; iy < m->used; iy++) { - r = - ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + - ((mp_word) * tmpy); + r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy); mu = (r >> ((mp_word) DIGIT_BIT)); *tmpy++ = (r & ((mp_word) MP_MASK)); } @@ -71,9 +68,7 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp) /* if A >= m then A = A - m */ if (mp_cmp_mag (a, m) != MP_LT) { - if ((res = s_mp_sub (a, m, a)) != MP_OKAY) { - return res; - } + return s_mp_sub (a, m, a); } return MP_OKAY; diff --git a/bn_mp_montgomery_setup.c b/bn_mp_montgomery_setup.c index 601e74c..c739895 100644 --- a/bn_mp_montgomery_setup.c +++ b/bn_mp_montgomery_setup.c @@ -18,8 +18,8 @@ int mp_montgomery_setup (mp_int * a, mp_digit * mp) { - mp_int t, tt; - int res; + mp_int t, tt; + int res; if ((res = mp_init (&t)) != MP_OKAY) { return res; diff --git a/bn_mp_mul.c b/bn_mp_mul.c index cfa1467..064fc85 100644 --- a/bn_mp_mul.c +++ b/bn_mp_mul.c @@ -18,12 +18,26 @@ int mp_mul (mp_int * a, mp_int * b, mp_int * c) { - int res, neg; + int res, neg; neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG; if (MIN (a->used, b->used) > KARATSUBA_MUL_CUTOFF) { res = mp_karatsuba_mul (a, b, c); } else { - res = s_mp_mul (a, b, c); + + /* can we use the fast multiplier? + * + * The fast multiplier can be used if the output will have less than + * 512 digits and the number of digits won't affect carry propagation + */ + int digs = a->used + b->used + 1; + + if ((digs < 512) + && digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { + res = fast_s_mp_mul_digs (a, b, c, digs); + } else { + res = s_mp_mul (a, b, c); + } + } c->sign = neg; return res; diff --git a/bn_mp_mul_2.c b/bn_mp_mul_2.c index dd5ecca..8174d0a 100644 --- a/bn_mp_mul_2.c +++ b/bn_mp_mul_2.c @@ -18,27 +18,48 @@ int mp_mul_2 (mp_int * a, mp_int * b) { - mp_digit r, rr; - int x, res; + int x, res, oldused; + /* Optimization: should copy and shift at the same time */ - /* copy */ - if ((res = mp_copy (a, b)) != MP_OKAY) { - return res; + if (b->alloc < a->used) { + if ((res = mp_grow (b, a->used)) != MP_OKAY) { + return res; + } } - if ((res = mp_grow (b, b->used + 1)) != MP_OKAY) { - return res; - } - ++b->used; + oldused = b->used; + b->used = a->used; /* shift any bit count < DIGIT_BIT */ - r = 0; - for (x = 0; x < b->used; x++) { - rr = (b->dp[x] >> (DIGIT_BIT - 1)) & 1; - b->dp[x] = ((b->dp[x] << 1) | r) & MP_MASK; - r = rr; + { + register mp_digit r, rr, *tmpa, *tmpb; + + r = 0; + tmpa = a->dp; + tmpb = b->dp; + for (x = 0; x < b->used; x++) { + rr = *tmpa >> (DIGIT_BIT - 1); + *tmpb++ = ((*tmpa++ << 1) | r) & MP_MASK; + r = rr; + } + + /* new leading digit? */ + if (r != 0) { + if (b->alloc == b->used) { + if ((res = mp_grow (b, b->used + 1)) != MP_OKAY) { + return res; + } + } + /* add a MSB of 1 */ + *tmpb = 1; + ++b->used; + } + + tmpb = b->dp + b->used; + for (x = b->used; x < oldused; x++) { + *tmpb++ = 0; + } } - mp_clamp (b); return MP_OKAY; } diff --git a/bn_mp_mul_2d.c b/bn_mp_mul_2d.c index 7823eb9..97ee26c 100644 --- a/bn_mp_mul_2d.c +++ b/bn_mp_mul_2d.c @@ -18,8 +18,8 @@ int mp_mul_2d (mp_int * a, int b, mp_int * c) { - mp_digit d, r, rr; - int x, res; + mp_digit d, r, rr; + int x, res; /* copy */ diff --git a/bn_mp_mul_d.c b/bn_mp_mul_d.c index bced9b7..164bcac 100644 --- a/bn_mp_mul_d.c +++ b/bn_mp_mul_d.c @@ -18,29 +18,40 @@ int mp_mul_d (mp_int * a, mp_digit b, mp_int * c) { - int res, pa, ix; - mp_word r; - mp_digit u; - mp_int t; - + int res, pa, olduse; pa = a->used; - if ((res = mp_init_size (&t, pa + 2)) != MP_OKAY) { - return res; + if (c->alloc < pa + 1) { + if ((res = mp_grow (c, pa + 1)) != MP_OKAY) { + return res; + } } - t.used = pa + 2; - u = 0; - for (ix = 0; ix < pa; ix++) { - r = ((mp_word) u) + ((mp_word) a->dp[ix]) * ((mp_word) b); - t.dp[ix] = (mp_digit) (r & ((mp_word) MP_MASK)); - u = (mp_digit) (r >> ((mp_word) DIGIT_BIT)); + olduse = c->used; + c->used = pa + 1; + + { + register mp_digit u, *tmpa, *tmpc; + register mp_word r; + register int ix; + + tmpc = c->dp + c->used; + for (ix = c->used; ix < olduse; ix++) { + *tmpc++ = 0; + } + + tmpa = a->dp; + tmpc = c->dp; + + u = 0; + for (ix = 0; ix < pa; ix++) { + r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b); + *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK)); + u = (mp_digit) (r >> ((mp_word) DIGIT_BIT)); + } + *tmpc = u; } - t.dp[ix] = u; - t.sign = a->sign; - mp_clamp (&t); - mp_exch (&t, c); - mp_clear (&t); + mp_clamp (c); return MP_OKAY; } diff --git a/bn_mp_mulmod.c b/bn_mp_mulmod.c index 2cdbdda..abdf77b 100644 --- a/bn_mp_mulmod.c +++ b/bn_mp_mulmod.c @@ -18,8 +18,8 @@ int mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d) { - int res; - mp_int t; + int res; + mp_int t; if ((res = mp_init (&t)) != MP_OKAY) { diff --git a/bn_mp_n_root.c b/bn_mp_n_root.c index 72b3a8c..eb49b3f 100644 --- a/bn_mp_n_root.c +++ b/bn_mp_n_root.c @@ -17,12 +17,16 @@ /* find the n'th root of an integer * * Result found such that (c)^b <= a and (c+1)^b > a + * + * This algorithm uses Newton's approximation x[i+1] = x[i] - f(x[i])/f'(x[i]) + * which will find the root in log(N) time where each step involves a fair bit. This + * is not meant to find huge roots [square and cube at most]. */ int mp_n_root (mp_int * a, mp_digit b, mp_int * c) { - mp_int t1, t2, t3; - int res, neg; + mp_int t1, t2, t3; + int res, neg; /* input must be positive if b is even */ if ((b & 1) == 0 && a->sign == MP_NEG) { diff --git a/bn_mp_neg.c b/bn_mp_neg.c index a2bb7c4..fd2e497 100644 --- a/bn_mp_neg.c +++ b/bn_mp_neg.c @@ -18,7 +18,7 @@ int mp_neg (mp_int * a, mp_int * b) { - int res; + int res; if ((res = mp_copy (a, b)) != MP_OKAY) { return res; } diff --git a/bn_mp_or.c b/bn_mp_or.c index a3843d9..e821bac 100644 --- a/bn_mp_or.c +++ b/bn_mp_or.c @@ -18,8 +18,8 @@ int mp_or (mp_int * a, mp_int * b, mp_int * c) { - int res, ix, px; - mp_int t, *x; + int res, ix, px; + mp_int t, *x; if (a->used > b->used) { if ((res = mp_init_copy (&t, a)) != MP_OKAY) { diff --git a/bn_mp_rand.c b/bn_mp_rand.c index c72dec9..dc13534 100644 --- a/bn_mp_rand.c +++ b/bn_mp_rand.c @@ -18,8 +18,8 @@ int mp_rand (mp_int * a, int digits) { - int res; - mp_digit d; + int res; + mp_digit d; mp_zero (a); if (digits <= 0) { @@ -27,19 +27,20 @@ mp_rand (mp_int * a, int digits) } /* first place a random non-zero digit */ - d = ((mp_digit) abs (rand ())); - d = d == 0 ? 1 : d; + do { + d = ((mp_digit) abs (rand ())); + } while (d == 0); if ((res = mp_add_d (a, d, a)) != MP_OKAY) { return res; } - while (digits-- > 0) { - if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) { + if ((res = mp_lshd (a, 1)) != MP_OKAY) { return res; } - if ((res = mp_lshd (a, 1)) != MP_OKAY) { + + if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) { return res; } } diff --git a/bn_mp_read_signed_bin.c b/bn_mp_read_signed_bin.c index b5af4c0..8a9df88 100644 --- a/bn_mp_read_signed_bin.c +++ b/bn_mp_read_signed_bin.c @@ -18,7 +18,7 @@ int mp_read_signed_bin (mp_int * a, unsigned char *b, int c) { - int res; + int res; if ((res = mp_read_unsigned_bin (a, b + 1, c - 1)) != MP_OKAY) { return res; diff --git a/bn_mp_read_unsigned_bin.c b/bn_mp_read_unsigned_bin.c index 726b574..16e2f29 100644 --- a/bn_mp_read_unsigned_bin.c +++ b/bn_mp_read_unsigned_bin.c @@ -18,7 +18,7 @@ int mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c) { - int res; + int res; mp_zero (a); while (c-- > 0) { if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) { diff --git a/bn_mp_reduce.c b/bn_mp_reduce.c index be5d18e..8f15458 100644 --- a/bn_mp_reduce.c +++ b/bn_mp_reduce.c @@ -20,7 +20,7 @@ int mp_reduce_setup (mp_int * a, mp_int * b) { - int res; + int res; if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) { @@ -36,8 +36,8 @@ mp_reduce_setup (mp_int * a, mp_int * b) int mp_reduce (mp_int * x, mp_int * m, mp_int * mu) { - mp_int q; - int res, um = m->used; + mp_int q; + int res, um = m->used; if ((res = mp_init_copy (&q, x)) != MP_OKAY) { diff --git a/bn_mp_rshd.c b/bn_mp_rshd.c index 8b37d09..39e631e 100644 --- a/bn_mp_rshd.c +++ b/bn_mp_rshd.c @@ -18,7 +18,7 @@ void mp_rshd (mp_int * a, int b) { - int x; + int x; /* if b <= 0 then ignore it */ diff --git a/bn_mp_set_int.c b/bn_mp_set_int.c index a690bb4..f22ab69 100644 --- a/bn_mp_set_int.c +++ b/bn_mp_set_int.c @@ -18,7 +18,7 @@ int mp_set_int (mp_int * a, unsigned long b) { - int x, res; + int x, res; mp_zero (a); diff --git a/bn_mp_sqr.c b/bn_mp_sqr.c index 2ba877c..c8b5cb7 100644 --- a/bn_mp_sqr.c +++ b/bn_mp_sqr.c @@ -18,11 +18,18 @@ int mp_sqr (mp_int * a, mp_int * b) { - int res; + int res; if (a->used > KARATSUBA_SQR_CUTOFF) { res = mp_karatsuba_sqr (a, b); } else { - res = s_mp_sqr (a, b); + + /* can we use the fast multiplier? */ + if (((a->used * 2 + 1) < 512) + && a->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT) - 1))) { + res = fast_s_mp_sqr (a, b); + } else { + res = s_mp_sqr (a, b); + } } b->sign = MP_ZPOS; return res; diff --git a/bn_mp_sqrmod.c b/bn_mp_sqrmod.c index 66135d4..44f608f 100644 --- a/bn_mp_sqrmod.c +++ b/bn_mp_sqrmod.c @@ -18,8 +18,8 @@ int mp_sqrmod (mp_int * a, mp_int * b, mp_int * c) { - int res; - mp_int t; + int res; + mp_int t; if ((res = mp_init (&t)) != MP_OKAY) { diff --git a/bn_mp_sub.c b/bn_mp_sub.c index 045dee5..1366c55 100644 --- a/bn_mp_sub.c +++ b/bn_mp_sub.c @@ -18,7 +18,7 @@ int mp_sub (mp_int * a, mp_int * b, mp_int * c) { - int sa, sb, res; + int sa, sb, res; sa = a->sign; diff --git a/bn_mp_sub_d.c b/bn_mp_sub_d.c index 9839d5e..aebc414 100644 --- a/bn_mp_sub_d.c +++ b/bn_mp_sub_d.c @@ -18,8 +18,8 @@ int mp_sub_d (mp_int * a, mp_digit b, mp_int * c) { - mp_int t; - int res; + mp_int t; + int res; if ((res = mp_init (&t)) != MP_OKAY) { diff --git a/bn_mp_submod.c b/bn_mp_submod.c index b56d921..16fee71 100644 --- a/bn_mp_submod.c +++ b/bn_mp_submod.c @@ -18,8 +18,8 @@ int mp_submod (mp_int * a, mp_int * b, mp_int * c, mp_int * d) { - int res; - mp_int t; + int res; + mp_int t; if ((res = mp_init (&t)) != MP_OKAY) { diff --git a/bn_mp_to_signed_bin.c b/bn_mp_to_signed_bin.c index b00cf8f..41abac1 100644 --- a/bn_mp_to_signed_bin.c +++ b/bn_mp_to_signed_bin.c @@ -18,7 +18,7 @@ int mp_to_signed_bin (mp_int * a, unsigned char *b) { - int res; + int res; if ((res = mp_to_unsigned_bin (a, b + 1)) != MP_OKAY) { return res; diff --git a/bn_mp_to_unsigned_bin.c b/bn_mp_to_unsigned_bin.c index b122555..eec9f75 100644 --- a/bn_mp_to_unsigned_bin.c +++ b/bn_mp_to_unsigned_bin.c @@ -18,8 +18,8 @@ int mp_to_unsigned_bin (mp_int * a, unsigned char *b) { - int x, res; - mp_int t; + int x, res; + mp_int t; if ((res = mp_init_copy (&t, a)) != MP_OKAY) { return res; diff --git a/bn_mp_unsigned_bin_size.c b/bn_mp_unsigned_bin_size.c index 1c92b5c..bee88e6 100644 --- a/bn_mp_unsigned_bin_size.c +++ b/bn_mp_unsigned_bin_size.c @@ -18,6 +18,6 @@ int mp_unsigned_bin_size (mp_int * a) { - int size = mp_count_bits (a); + int size = mp_count_bits (a); return (size / 8 + ((size & 7) != 0 ? 1 : 0)); } diff --git a/bn_mp_xor.c b/bn_mp_xor.c index c8e9f43..4a2ff9b 100644 --- a/bn_mp_xor.c +++ b/bn_mp_xor.c @@ -18,8 +18,8 @@ int mp_xor (mp_int * a, mp_int * b, mp_int * c) { - int res, ix, px; - mp_int t, *x; + int res, ix, px; + mp_int t, *x; if (a->used > b->used) { if ((res = mp_init_copy (&t, a)) != MP_OKAY) { diff --git a/bn_radix.c b/bn_radix.c index 1fc4b35..205c148 100644 --- a/bn_radix.c +++ b/bn_radix.c @@ -15,16 +15,15 @@ #include /* chars used in radix conversions */ -static const char *s_rmap = - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/"; +static const char *s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/"; /* read a string [ASCII] in a given radix */ int mp_read_radix (mp_int * a, char *str, int radix) { - int y, res, neg; - char ch; + int y, res, neg; + char ch; if (radix < 2 || radix > 64) { return MP_VAL; @@ -66,10 +65,10 @@ mp_read_radix (mp_int * a, char *str, int radix) int mp_toradix (mp_int * a, char *str, int radix) { - int res, digs; - mp_int t; - mp_digit d; - char *_s = str; + int res, digs; + mp_int t; + mp_digit d; + char *_s = str; if (radix < 2 || radix > 64) { return MP_VAL; @@ -104,9 +103,9 @@ mp_toradix (mp_int * a, char *str, int radix) int mp_radix_size (mp_int * a, int radix) { - int res, digs; - mp_int t; - mp_digit d; + int res, digs; + mp_int t; + mp_digit d; /* special case for binary */ if (radix == 2) { diff --git a/bn_reverse.c b/bn_reverse.c index 10c2375..50109d7 100644 --- a/bn_reverse.c +++ b/bn_reverse.c @@ -18,7 +18,7 @@ void bn_reverse (unsigned char *s, int len) { - int ix, iy; + int ix, iy; unsigned char t; ix = 0; diff --git a/bn_s_mp_add.c b/bn_s_mp_add.c index 369a0e1..328ec06 100644 --- a/bn_s_mp_add.c +++ b/bn_s_mp_add.c @@ -18,10 +18,8 @@ int s_mp_add (mp_int * a, mp_int * b, mp_int * c) { - mp_int *x; - int olduse, res, min, max, i; - mp_digit u; - + mp_int *x; + int olduse, res, min, max; /* find sizes, we let |a| <= |b| which means we have to sort * them. "x" will point to the input with the most digits @@ -52,38 +50,48 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c) /* add digits from lower part */ /* set the carry to zero */ - u = 0; - for (i = 0; i < min; i++) { - /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */ - c->dp[i] = a->dp[i] + b->dp[i] + u; + { + register mp_digit u, *tmpa, *tmpb, *tmpc; + register int i; - /* U = carry bit of T[i] */ - u = (c->dp[i] >> DIGIT_BIT) & 1; + /* alias for digit pointers */ + tmpa = a->dp; + tmpb = b->dp; + tmpc = c->dp; - /* take away carry bit from T[i] */ - c->dp[i] &= MP_MASK; - } - - /* now copy higher words if any, that is in A+B if A or B has more digits add those in */ - if (min != max) { - for (; i < max; i++) { - /* T[i] = X[i] + U */ - c->dp[i] = x->dp[i] + u; + u = 0; + for (i = 0; i < min; i++) { + /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */ + *tmpc = *tmpa++ + *tmpb++ + u; /* U = carry bit of T[i] */ - u = (c->dp[i] >> DIGIT_BIT) & 1; + u = *tmpc >> DIGIT_BIT; /* take away carry bit from T[i] */ - c->dp[i] &= MP_MASK; + *tmpc++ &= MP_MASK; } - } - /* add carry */ - c->dp[i] = u; + /* now copy higher words if any, that is in A+B if A or B has more digits add those in */ + if (min != max) { + for (; i < max; i++) { + /* T[i] = X[i] + U */ + *tmpc = x->dp[i] + u; - /* clear digits above used (since we may not have grown result above) */ - for (i = c->used; i < olduse; i++) { - c->dp[i] = 0; + /* U = carry bit of T[i] */ + u = *tmpc >> DIGIT_BIT; + + /* take away carry bit from T[i] */ + *tmpc++ &= MP_MASK; + } + } + + /* add carry */ + *tmpc++ = u; + + /* clear digits above used (since we may not have grown result above) */ + for (i = c->used; i < olduse; i++) { + *tmpc++ = 0; + } } mp_clamp (c); diff --git a/bn_s_mp_mul_digs.c b/bn_s_mp_mul_digs.c index 55522a1..f2b0d13 100644 --- a/bn_s_mp_mul_digs.c +++ b/bn_s_mp_mul_digs.c @@ -21,23 +21,11 @@ int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) { - mp_int t; - int res, pa, pb, ix, iy; - mp_digit u; - mp_word r; - mp_digit tmpx, *tmpt, *tmpy; - - - /* can we use the fast multiplier? - * - * The fast multiplier can be used if the output will have less than - * 512 digits and the number of digits won't affect carry propagation - */ - if ((digs < 512) - && digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { - res = fast_s_mp_mul_digs (a, b, c, digs); - return res; - } + mp_int t; + int res, pa, pb, ix, iy; + mp_digit u; + mp_word r; + mp_digit tmpx, *tmpt, *tmpy; if ((res = mp_init_size (&t, digs)) != MP_OKAY) { return res; @@ -61,9 +49,7 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) /* compute the columns of the output and propagate the carry */ for (iy = 0; iy < pb; iy++) { /* compute the column as a mp_word */ - r = - ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + - ((mp_word) u); + r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u); /* the new column is the lower part of the result */ *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); diff --git a/bn_s_mp_mul_high_digs.c b/bn_s_mp_mul_high_digs.c index ff2530f..a43a593 100644 --- a/bn_s_mp_mul_high_digs.c +++ b/bn_s_mp_mul_high_digs.c @@ -20,20 +20,17 @@ int s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs) { - mp_int t; - int res, pa, pb, ix, iy; - mp_digit u; - mp_word r; - mp_digit tmpx, *tmpt, *tmpy; + mp_int t; + int res, pa, pb, ix, iy; + mp_digit u; + mp_word r; + mp_digit tmpx, *tmpt, *tmpy; /* can we use the fast multiplier? */ if (((a->used + b->used + 1) < 512) - && MAX (a->used, - b->used) < - (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { - res = fast_s_mp_mul_high_digs (a, b, c, digs); - return res; + && MAX (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { + return fast_s_mp_mul_high_digs (a, b, c, digs); } if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) { @@ -58,9 +55,7 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs) for (iy = digs - ix; iy < pb; iy++) { /* calculate the double precision result */ - r = - ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + - ((mp_word) u); + r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u); /* get the lower part */ *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); diff --git a/bn_s_mp_sqr.c b/bn_s_mp_sqr.c index 94449a3..a0ec38b 100644 --- a/bn_s_mp_sqr.c +++ b/bn_s_mp_sqr.c @@ -18,18 +18,10 @@ int s_mp_sqr (mp_int * a, mp_int * b) { - mp_int t; - int res, ix, iy, pa; - mp_word r, u; - mp_digit tmpx, *tmpt; - - /* can we use the fast multiplier? */ - if (((a->used * 2 + 1) < 512) - && a->used < - (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT) - 1))) { - res = fast_s_mp_sqr (a, b); - return res; - } + mp_int t; + int res, ix, iy, pa; + mp_word r, u; + mp_digit tmpx, *tmpt; pa = a->used; if ((res = mp_init_size (&t, pa + pa + 1)) != MP_OKAY) { @@ -40,9 +32,7 @@ s_mp_sqr (mp_int * a, mp_int * b) for (ix = 0; ix < pa; ix++) { /* first calculate the digit at 2*ix */ /* calculate double precision result */ - r = - ((mp_word) t.dp[ix + ix]) + - ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]); + r = ((mp_word) t.dp[ix + ix]) + ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]); /* store lower part in result */ t.dp[ix + ix] = (mp_digit) (r & ((mp_word) MP_MASK)); diff --git a/bn_s_mp_sub.c b/bn_s_mp_sub.c index f6da162..fe15d23 100644 --- a/bn_s_mp_sub.c +++ b/bn_s_mp_sub.c @@ -18,9 +18,7 @@ int s_mp_sub (mp_int * a, mp_int * b, mp_int * c) { - int olduse, res, min, max, i; - mp_digit u; - + int olduse, res, min, max; /* find sizes */ min = b->used; @@ -37,36 +35,48 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c) /* sub digits from lower part */ - /* set carry to zero */ - u = 0; - for (i = 0; i < min; i++) { - /* T[i] = A[i] - B[i] - U */ - c->dp[i] = a->dp[i] - (b->dp[i] + u); + { + register mp_digit u, *tmpa, *tmpb, *tmpc; + register int i; - /* U = carry bit of T[i] */ - u = (c->dp[i] >> DIGIT_BIT) & 1; + /* alias for digit pointers */ + tmpa = a->dp; + tmpb = b->dp; + tmpc = c->dp; - /* Clear carry from T[i] */ - c->dp[i] &= MP_MASK; - } + /* set carry to zero */ + u = 0; + for (i = 0; i < min; i++) { + /* T[i] = A[i] - B[i] - U */ + *tmpc = *tmpa++ - *tmpb++ - u; - /* now copy higher words if any, e.g. if A has more digits than B */ - if (min != max) { - for (; i < max; i++) { - /* T[i] = A[i] - U */ - c->dp[i] = a->dp[i] - u; - - /* U = carry bit of T[i] */ - u = (c->dp[i] >> DIGIT_BIT) & 1; + /* U = carry bit of T[i] + * Note this saves performing an AND operation since + * if a carry does occur it will propagate all the way to the + * MSB. As a result a single shift is required to get the carry + */ + u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1); /* Clear carry from T[i] */ - c->dp[i] &= MP_MASK; + *tmpc++ &= MP_MASK; } - } - /* clear digits above used (since we may not have grown result above) */ - for (i = c->used; i < olduse; i++) { - c->dp[i] = 0; + /* now copy higher words if any, e.g. if A has more digits than B */ + for (; i < max; i++) { + /* T[i] = A[i] - U */ + *tmpc = *tmpa++ - u; + + /* U = carry bit of T[i] */ + u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1); + + /* Clear carry from T[i] */ + *tmpc++ &= MP_MASK; + } + + /* clear digits above used (since we may not have grown result above) */ + for (i = c->used; i < olduse; i++) { + *tmpc++ = 0; + } } mp_clamp (c); diff --git a/bncore.c b/bncore.c index 5c7d098..8863935 100644 --- a/bncore.c +++ b/bncore.c @@ -14,6 +14,6 @@ */ #include -int KARATSUBA_MUL_CUTOFF = 80, /* Min. number of digits before Karatsuba multiplication is used. */ - KARATSUBA_SQR_CUTOFF = 80, /* Min. number of digits before Karatsuba squaring is used. */ - MONTGOMERY_EXPT_CUTOFF = 40; /* max. number of digits that montgomery reductions will help for */ +int KARATSUBA_MUL_CUTOFF = 80, /* Min. number of digits before Karatsuba multiplication is used. */ + KARATSUBA_SQR_CUTOFF = 80, /* Min. number of digits before Karatsuba squaring is used. */ + MONTGOMERY_EXPT_CUTOFF = 74; /* max. number of digits that montgomery reductions will help for */ diff --git a/changes.txt b/changes.txt index e2c9903..c31526e 100644 --- a/changes.txt +++ b/changes.txt @@ -1,3 +1,9 @@ +Feb 13th, 2003 +v0.13 -- tons of minor speed-ups in low level add, sub, mul_2 and div_2 which propagate + to other functions like mp_invmod, mp_div, etc... + -- Sped up mp_exptmod_fast by using new code to find R mod m [e.g. B^n mod m] + -- minor fixes + Jan 17th, 2003 v0.12 -- re-wrote the majority of the makefile so its more portable and will install via "make install" on most *nix platforms diff --git a/demo/demo.c b/demo/demo.c index 3c3ef07..8bf9acd 100644 --- a/demo/demo.c +++ b/demo/demo.c @@ -76,7 +76,7 @@ int main(void) { mp_int a, b, c, d, e, f; unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n; - int rr; + unsigned rr; #ifdef TIMER int n; @@ -90,42 +90,20 @@ int main(void) mp_init(&e); mp_init(&f); -#ifdef DEBUG - mp_read_radix(&a, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319", 10); - mp_read_radix(&b, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136318", 10); - mp_set(&c, 1); - reset_timings(); - mp_exptmod(&c, &b, &a, &d); - mp_exptmod(&c, &b, &a, &d); - mp_exptmod(&c, &b, &a, &d); - mp_exptmod(&c, &b, &a, &d); - mp_exptmod(&c, &b, &a, &d); - mp_exptmod(&c, &b, &a, &d); - mp_exptmod(&c, &b, &a, &d); - mp_exptmod(&c, &b, &a, &d); - mp_exptmod(&c, &b, &a, &d); - mp_exptmod(&c, &b, &a, &d); - mp_exptmod(&c, &b, &a, &d); - mp_exptmod(&c, &b, &a, &d); - mp_exptmod(&c, &b, &a, &d); - mp_exptmod(&c, &b, &a, &d); - mp_exptmod(&c, &b, &a, &d); - mp_exptmod(&c, &b, &a, &d); - dump_timings(); - return 0; -#endif #ifdef TIMER +goto multtime; + printf("CLOCKS_PER_SEC == %lu\n", CLOCKS_PER_SEC); mp_read_radix(&a, "340282366920938463463374607431768211455", 10); mp_read_radix(&b, "340282366920938463463574607431768211455", 10); while (a.used * DIGIT_BIT < 8192) { reset(); - for (rr = 0; rr < 1000000; rr++) { + for (rr = 0; rr < 10000000; rr++) { mp_add(&a, &b, &c); } tt = rdtsc(); - printf("Adding %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000)); + printf("Adding\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt); mp_sqr(&a, &a); mp_sqr(&b, &b); } @@ -134,37 +112,40 @@ int main(void) mp_read_radix(&b, "340282366920938463463574607431768211455", 10); while (a.used * DIGIT_BIT < 8192) { reset(); - for (rr = 0; rr < 1000000; rr++) { + for (rr = 0; rr < 10000000; rr++) { mp_sub(&a, &b, &c); } tt = rdtsc(); - printf("Subtracting %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000)); + printf("Subtracting\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt); mp_sqr(&a, &a); mp_sqr(&b, &b); } + +multtime: mp_read_radix(&a, "340282366920938463463374607431768211455", 10); while (a.used * DIGIT_BIT < 8192) { reset(); - for (rr = 0; rr < 1000000; rr++) { + for (rr = 0; rr < 250000; rr++) { mp_sqr(&a, &b); } tt = rdtsc(); - printf("Squaring %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000)); + printf("Squaring\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt); mp_copy(&b, &a); } mp_read_radix(&a, "340282366920938463463374607431768211455", 10); while (a.used * DIGIT_BIT < 8192) { reset(); - for (rr = 0; rr < 1000000; rr++) { + for (rr = 0; rr < 250000; rr++) { mp_mul(&a, &a, &b); } tt = rdtsc(); - printf("Multiplying %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000)); + printf("Multiplying\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt); mp_copy(&b, &a); } - + +expttime: { char *primes[] = { "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203", @@ -180,7 +161,7 @@ int main(void) mp_read_radix(&a, primes[n], 10); mp_zero(&b); for (rr = 0; rr < mp_count_bits(&a); rr++) { - mp_mul_2d(&b, 1, &b); + mp_mul_2(&b, &b); b.dp[0] |= lbit(); b.used += 1; } @@ -188,7 +169,7 @@ int main(void) mp_mod(&b, &c, &b); mp_set(&c, 3); reset(); - for (rr = 0; rr < 100; rr++) { + for (rr = 0; rr < 50; rr++) { mp_exptmod(&c, &b, &a, &d); } tt = rdtsc(); @@ -201,16 +182,15 @@ int main(void) draw(&d); exit(0); } - printf("Exponentiating %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)100)); + printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt); } } - mp_read_radix(&a, "340282366920938463463374607431768211455", 10); mp_read_radix(&b, "234892374891378913789237289378973232333", 10); while (a.used * DIGIT_BIT < 8192) { reset(); - for (rr = 0; rr < 100; rr++) { + for (rr = 0; rr < 10000; rr++) { mp_invmod(&b, &a, &c); } tt = rdtsc(); @@ -219,7 +199,7 @@ int main(void) printf("Failed to invert\n"); return 0; } - printf("Inverting mod %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)100)); + printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt); mp_sqr(&a, &a); mp_sqr(&b, &b); } diff --git a/etc/makefile b/etc/makefile index 7a98e09..bf4befb 100644 --- a/etc/makefile +++ b/etc/makefile @@ -1,15 +1,20 @@ -CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops +CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops -I../ + + +# default lib name (requires install with root) +# LIBNAME=-ltommath + +# libname when you can't install the lib with install +LIBNAME=../libtommath.a pprime: pprime.o - $(CC) pprime.o -ltommath -o pprime + $(CC) pprime.o $(LIBNAME) -o pprime tune: tune.o - $(CC) tune.o -ltommath -o tune + $(CC) tune.o $(LIBNAME) -o tune mersenne: mersenne.o - $(CC) mersenne.o -ltommath -o mersenne + $(CC) mersenne.o $(LIBNAME) -o mersenne clean: - rm -f *.o pprime tune mersenne - - + rm -f *.o *.exe pprime tune mersenne \ No newline at end of file diff --git a/etc/pprime.c b/etc/pprime.c index c136901..6fea3da 100644 --- a/etc/pprime.c +++ b/etc/pprime.c @@ -5,7 +5,7 @@ * Tom St Denis, tomstdenis@iahu.ca, http://tom.iahu.ca */ #include -#include "bn.h" +#include "tommath.h" /* fast square root */ static mp_digit diff --git a/etc/tune.c b/etc/tune.c index 0d03e77..73a44b7 100644 --- a/etc/tune.c +++ b/etc/tune.c @@ -18,9 +18,9 @@ time_mult (void) t1 = clock (); for (x = 8; x <= 128; x += 8) { - mp_rand (&a, x); - mp_rand (&b, x); - for (y = 0; y < 10000; y++) { + for (y = 0; y < 1000; y++) { + mp_rand (&a, x); + mp_rand (&b, x); mp_mul (&a, &b, &c); } } @@ -42,8 +42,8 @@ time_sqr (void) t1 = clock (); for (x = 8; x <= 128; x += 8) { - mp_rand (&a, x); - for (y = 0; y < 10000; y++) { + for (y = 0; y < 1000; y++) { + mp_rand (&a, x); mp_sqr (&a, &b); } } diff --git a/gen.pl b/gen.pl new file mode 100644 index 0000000..fcfd57d --- /dev/null +++ b/gen.pl @@ -0,0 +1,27 @@ +#!/usr/bin/perl +# +#Generates a "single file" you can use to quickly add the whole source +#without any makefile troubles +# + +opendir(DIR,"."); +@files = readdir(DIR); +closedir(DIR); + +open(OUT,">mpi.c"); +print OUT "/* File Generated Automatically by gen.pl */\n\n"; +for (@files) { + if ($_ =~ /\.c/ && !($_ =~ /mpi\.c/)) { + $fname = $_; + open(SRC,"<$fname"); + print OUT "/* Start: $fname */\n"; + while () { + print OUT $_; + } + close(SRC); + print OUT "\n/* End: $fname */\n\n"; + } +} +print OUT "\n/* EOF */\n"; +close(OUT); + \ No newline at end of file diff --git a/ltmtest.exe b/ltmtest.exe deleted file mode 100644 index cc5c1f5..0000000 Binary files a/ltmtest.exe and /dev/null differ diff --git a/makefile b/makefile index 5f6bcc6..9e6127d 100644 --- a/makefile +++ b/makefile @@ -1,6 +1,6 @@ CFLAGS += -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops -VERSION=0.12 +VERSION=0.13 default: libtommath.a @@ -30,7 +30,7 @@ bn_mp_reduce.o bn_mp_montgomery_setup.o bn_fast_mp_montgomery_reduce.o bn_mp_mon bn_mp_exptmod_fast.o bn_mp_exptmod.o bn_mp_2expt.o bn_mp_n_root.o bn_mp_jacobi.o bn_reverse.o \ bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_unsigned_bin.o \ bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o bn_radix.o \ -bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o +bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o libtommath.a: $(OBJECTS) $(AR) $(ARFLAGS) libtommath.a $(OBJECTS) @@ -60,8 +60,8 @@ docs: docdvi rm -f bn.log bn.aux bn.dvi clean: - rm -f *.pdf *.o *.a etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest \ - bn.log bn.aux bn.dvi *.log *.s + rm -f *.pdf *.o *.a *.exe etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \ + bn.log bn.aux bn.dvi *.log *.s mpi.c cd etc ; make clean zipup: clean docs diff --git a/mpitest.exe b/mpitest.exe deleted file mode 100644 index d32553a..0000000 Binary files a/mpitest.exe and /dev/null differ diff --git a/mtest/mtest.c b/mtest/mtest.c index 17aef8d..3759d15 100644 --- a/mtest/mtest.c +++ b/mtest/mtest.c @@ -89,7 +89,7 @@ int main(void) } for (;;) { - n = fgetc(rng) % 11; + n = 4; // fgetc(rng) % 11; if (n == 0) { /* add tests */ diff --git a/timings.txt b/timings.txt index 117b9c5..128649e 100644 --- a/timings.txt +++ b/timings.txt @@ -1,39 +1,36 @@ -CLOCKS_PER_SEC == 1000000 -Adding 128-bit took 0.070000 ticks -Adding 256-bit took 0.100000 ticks -Adding 512-bit took 0.140000 ticks -Adding 1024-bit took 0.210000 ticks -Adding 2048-bit took 0.360000 ticks -Adding 4096-bit took 0.670000 ticks -Subtracting 128-bit took 0.090000 ticks -Subtracting 256-bit took 0.120000 ticks -Subtracting 512-bit took 0.140000 ticks -Subtracting 1024-bit took 0.210000 ticks -Subtracting 2048-bit took 0.330000 ticks -Subtracting 4096-bit took 0.580000 ticks -Squaring 128-bit took 0.320000 ticks -Squaring 256-bit took 0.620000 ticks -Squaring 512-bit took 1.410000 ticks -Squaring 1024-bit took 3.730000 ticks -Squaring 2048-bit took 11.580000 ticks -Squaring 4096-bit took 44.540000 ticks -Multiplying 128-bit took 0.270000 ticks -Multiplying 256-bit took 0.650000 ticks -Multiplying 512-bit took 1.630000 ticks -Multiplying 1024-bit took 5.180000 ticks -Multiplying 2048-bit took 19.210000 ticks -Multiplying 4096-bit took 67.500000 ticks -Exponentiating 513-bit took 2000.000000 ticks -Exponentiating 769-bit took 5200.000000 ticks -Exponentiating 1025-bit took 11400.000000 ticks -Exponentiating 2049-bit took 75100.000000 ticks -Exponentiating 2561-bit took 150000.000000 ticks -Exponentiating 3073-bit took 237800.000000 ticks -Exponentiating 4097-bit took 510600.000000 ticks -Inverting mod 128-bit took 0.000000 ticks -Inverting mod 256-bit took 200.000000 ticks -Inverting mod 512-bit took 300.000000 ticks -Inverting mod 1024-bit took 800.000000 ticks -Inverting mod 2048-bit took 2500.000000 ticks -Inverting mod 4096-bit took 8400.000000 ticks - +CLOCKS_PER_SEC == 1000 +Adding 128-bit => 14534883/sec, 688 ticks +Adding 256-bit => 11037527/sec, 906 ticks +Adding 512-bit => 8650519/sec, 1156 ticks +Adding 1024-bit => 5871990/sec, 1703 ticks +Adding 2048-bit => 3575259/sec, 2797 ticks +Adding 4096-bit => 2018978/sec, 4953 ticks +Subtracting 128-bit => 11025358/sec, 907 ticks +Subtracting 256-bit => 9149130/sec, 1093 ticks +Subtracting 512-bit => 7440476/sec, 1344 ticks +Subtracting 1024-bit => 5078720/sec, 1969 ticks +Subtracting 2048-bit => 3168567/sec, 3156 ticks +Subtracting 4096-bit => 1833852/sec, 5453 ticks +Squaring 128-bit => 3205128/sec, 78 ticks +Squaring 256-bit => 1592356/sec, 157 ticks +Squaring 512-bit => 696378/sec, 359 ticks +Squaring 1024-bit => 266808/sec, 937 ticks +Squaring 2048-bit => 85999/sec, 2907 ticks +Squaring 4096-bit => 21949/sec, 11390 ticks +Multiplying 128-bit => 3205128/sec, 78 ticks +Multiplying 256-bit => 1592356/sec, 157 ticks +Multiplying 512-bit => 615763/sec, 406 ticks +Multiplying 1024-bit => 192752/sec, 1297 ticks +Multiplying 2048-bit => 53510/sec, 4672 ticks +Multiplying 4096-bit => 14801/sec, 16890 ticks +Exponentiating 513-bit => 531/sec, 47 ticks +Exponentiating 769-bit => 177/sec, 141 ticks +Exponentiating 1025-bit => 88/sec, 282 ticks +Exponentiating 2049-bit => 13/sec, 1890 ticks +Exponentiating 2561-bit => 6/sec, 3812 ticks +Exponentiating 3073-bit => 4/sec, 6031 ticks +Exponentiating 4097-bit => 1/sec, 12843 ticks +Inverting mod 128-bit => 19160/sec, 5219 ticks +Inverting mod 256-bit => 8290/sec, 12062 ticks +Inverting mod 512-bit => 3565/sec, 28047 ticks +Inverting mod 1024-bit => 1305/sec, 76594 ticks \ No newline at end of file diff --git a/timings2.txt b/timings2.txt new file mode 100644 index 0000000..0b87e21 --- /dev/null +++ b/timings2.txt @@ -0,0 +1,36 @@ +CLOCKS_PER_SEC == 1000 +Adding 128-bit => 15600624/sec, 641 ticks +Adding 256-bit => 12804097/sec, 781 ticks +Adding 512-bit => 10000000/sec, 1000 ticks +Adding 1024-bit => 7032348/sec, 1422 ticks +Adding 2048-bit => 4076640/sec, 2453 ticks +Adding 4096-bit => 2424242/sec, 4125 ticks +Subtracting 128-bit => 10845986/sec, 922 ticks +Subtracting 256-bit => 9416195/sec, 1062 ticks +Subtracting 512-bit => 7710100/sec, 1297 ticks +Subtracting 1024-bit => 5159958/sec, 1938 ticks +Subtracting 2048-bit => 3299241/sec, 3031 ticks +Subtracting 4096-bit => 1987676/sec, 5031 ticks +Squaring 128-bit => 3205128/sec, 78 ticks +Squaring 256-bit => 1592356/sec, 157 ticks +Squaring 512-bit => 696378/sec, 359 ticks +Squaring 1024-bit => 266524/sec, 938 ticks +Squaring 2048-bit => 86505/sec, 2890 ticks +Squaring 4096-bit => 22471/sec, 11125 ticks +Multiplying 128-bit => 3205128/sec, 78 ticks +Multiplying 256-bit => 1592356/sec, 157 ticks +Multiplying 512-bit => 615763/sec, 406 ticks +Multiplying 1024-bit => 190548/sec, 1312 ticks +Multiplying 2048-bit => 54418/sec, 4594 ticks +Multiplying 4096-bit => 14897/sec, 16781 ticks +Exponentiating 513-bit => 531/sec, 47 ticks +Exponentiating 769-bit => 177/sec, 141 ticks +Exponentiating 1025-bit => 84/sec, 297 ticks +Exponentiating 2049-bit => 13/sec, 1875 ticks +Exponentiating 2561-bit => 6/sec, 3766 ticks +Exponentiating 3073-bit => 4/sec, 6000 ticks +Exponentiating 4097-bit => 1/sec, 12750 ticks +Inverting mod 128-bit => 17301/sec, 578 ticks +Inverting mod 256-bit => 8103/sec, 1234 ticks +Inverting mod 512-bit => 3422/sec, 2922 ticks +Inverting mod 1024-bit => 1330/sec, 7516 ticks \ No newline at end of file diff --git a/timings3.txt b/timings3.txt new file mode 100644 index 0000000..f269c2b --- /dev/null +++ b/timings3.txt @@ -0,0 +1,5 @@ +Exponentiating 513-bit => 531/sec, 94 ticks +Exponentiating 769-bit => 187/sec, 266 ticks +Exponentiating 1025-bit => 88/sec, 562 ticks +Exponentiating 2049-bit => 13/sec, 3719 ticks + diff --git a/tommath.h b/tommath.h index 4ac6173..9db1781 100644 --- a/tommath.h +++ b/tommath.h @@ -289,6 +289,11 @@ int mp_reduce(mp_int *a, mp_int *b, mp_int *c); /* setups the montgomery reduction */ int mp_montgomery_setup(mp_int *a, mp_digit *mp); +/* computes a = B^n mod b without division or multiplication useful for + * normalizing numbers in a Montgomery system. + */ +int mp_montgomery_calc_normalization(mp_int *a, mp_int *b); + /* computes xR^-1 == x (mod N) via Montgomery Reduction */ int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp); @@ -343,6 +348,5 @@ void bn_reverse(unsigned char *s, int len); } #endif - #endif