Merge pull request #434 from libtom/simplifications

Simplifications
2019-11-05 17:55:04 +01:00 · 2019-11-05 17:55:04 +01:00 · 3035e22fd3
commit 3035e22fd3
parent 2d3262af26 80176de372
86 changed files with 1255 additions and 1524 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -144,6 +144,7 @@ matrix:
    # clang for x86-64 architecture (64-bit longs and 64-bit pointers)
    - env: SANITIZER=1 CONV_WARNINGS=relaxed BUILDOPTIONS='--with-cc=clang-7 --with-m64 --with-travis-valgrind'
    - env: SANITIZER=1 CONV_WARNINGS=strict BUILDOPTIONS='--with-cc=clang-7 --with-m64 --with-travis-valgrind'
+    - env: SANITIZER=1 CONV_WARNINGS=strict BUILDOPTIONS='--with-cc=clang-7 --cflags=-DMP_USE_MEMOPS --with-m64 --with-travis-valgrind'
    - env: SANITIZER=1 CONV_WARNINGS=strict BUILDOPTIONS='--with-cc=clang-7 --c89 --with-m64 --with-travis-valgrind'
    - env: SANITIZER=1 BUILDOPTIONS='--with-cc=clang-7 --with-m64 --with-travis-valgrind --cflags=-DMP_PREC=MP_MIN_PREC'
    - env: SANITIZER=1 BUILDOPTIONS='--with-cc=clang-6.0 --with-m64 --with-travis-valgrind'
--- a/demo/mtest_opponent.c
+++ b/demo/mtest_opponent.c
@@ -37,8 +37,8 @@ static int mtest_opponent(void)

 #ifndef MP_FIXED_CUTOFFS
   /* force KARA and TOOM to enable despite cutoffs */
-   MP_KARATSUBA_SQR_CUTOFF = MP_KARATSUBA_MUL_CUTOFF = 8;
-   MP_TOOM_SQR_CUTOFF = MP_TOOM_MUL_CUTOFF = 16;
+   MP_SQR_KARATSUBA_CUTOFF = MP_MUL_KARATSUBA_CUTOFF = 8;
+   MP_SQR_TOOM_CUTOFF = MP_MUL_TOOM_CUTOFF = 16;
 #endif

   for (;;) {
--- a/demo/test.c
+++ b/demo/test.c
@@ -1866,7 +1866,7 @@ LBL_ERR:
   return EXIT_FAILURE;
 }

-static int test_s_mp_balance_mul(void)
+static int test_s_mp_mul_balance(void)
 {
   mp_int a, b, c;

@@ -1881,7 +1881,7 @@ static int test_s_mp_balance_mul(void)
   DO(mp_read_radix(&a, na, 64));
   DO(mp_read_radix(&b, nb, 64));

-   DO(s_mp_balance_mul(&a, &b, &c));
+   DO(s_mp_mul_balance(&a, &b, &c));

   DO(mp_read_radix(&b, nc, 64));

@@ -1896,18 +1896,18 @@ LBL_ERR:
   return EXIT_FAILURE;
 }

-#define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
-static int test_s_mp_karatsuba_mul(void)
+#define s_mp_mul_full(a, b, c) s_mp_mul(a, b, c, (a)->used + (b)->used + 1)
+static int test_s_mp_mul_karatsuba(void)
 {
   mp_int a, b, c, d;
   int size;

   DOR(mp_init_multi(&a, &b, &c, &d, NULL));
-   for (size = MP_KARATSUBA_MUL_CUTOFF; size < MP_KARATSUBA_MUL_CUTOFF + 20; size++) {
+   for (size = MP_MUL_KARATSUBA_CUTOFF; size < MP_MUL_KARATSUBA_CUTOFF + 20; size++) {
      DO(mp_rand(&a, size));
      DO(mp_rand(&b, size));
-      DO(s_mp_karatsuba_mul(&a, &b, &c));
-      DO(s_mp_mul(&a,&b,&d));
+      DO(s_mp_mul_karatsuba(&a, &b, &c));
+      DO(s_mp_mul_full(&a,&b,&d));
      if (mp_cmp(&c, &d) != MP_EQ) {
         fprintf(stderr, "Karatsuba multiplication failed at size %d\n", size);
         goto LBL_ERR;
@@ -1921,15 +1921,15 @@ LBL_ERR:
   return EXIT_FAILURE;
 }

-static int test_s_mp_karatsuba_sqr(void)
+static int test_s_mp_sqr_karatsuba(void)
 {
   mp_int a, b, c;
   int size;

   DOR(mp_init_multi(&a, &b, &c, NULL));
-   for (size = MP_KARATSUBA_SQR_CUTOFF; size < MP_KARATSUBA_SQR_CUTOFF + 20; size++) {
+   for (size = MP_SQR_KARATSUBA_CUTOFF; size < MP_SQR_KARATSUBA_CUTOFF + 20; size++) {
      DO(mp_rand(&a, size));
-      DO(s_mp_karatsuba_sqr(&a, &b));
+      DO(s_mp_sqr_karatsuba(&a, &b));
      DO(s_mp_sqr(&a, &c));
      if (mp_cmp(&b, &c) != MP_EQ) {
         fprintf(stderr, "Karatsuba squaring failed at size %d\n", size);
@@ -1944,7 +1944,7 @@ LBL_ERR:
   return EXIT_FAILURE;
 }

-static int test_s_mp_toom_mul(void)
+static int test_s_mp_mul_toom(void)
 {
   mp_int a, b, c, d;
   int size;
@@ -1965,10 +1965,10 @@ static int test_s_mp_toom_mul(void)
   DO(mp_2expt(&c, 99000 - 1000));
   DO(mp_add(&b, &c, &b));

-   tc_cutoff = MP_TOOM_MUL_CUTOFF;
-   MP_TOOM_MUL_CUTOFF = INT_MAX;
+   tc_cutoff = MP_MUL_TOOM_CUTOFF;
+   MP_MUL_TOOM_CUTOFF = INT_MAX;
   DO(mp_mul(&a, &b, &c));
-   MP_TOOM_MUL_CUTOFF = tc_cutoff;
+   MP_MUL_TOOM_CUTOFF = tc_cutoff;
   DO(mp_mul(&a, &b, &d));
   if (mp_cmp(&c, &d) != MP_EQ) {
      fprintf(stderr, "Toom-Cook 3-way multiplication failed for edgecase f1 * f2\n");
@@ -1976,11 +1976,11 @@ static int test_s_mp_toom_mul(void)
   }
 #endif

-   for (size = MP_TOOM_MUL_CUTOFF; size < MP_TOOM_MUL_CUTOFF + 20; size++) {
+   for (size = MP_MUL_TOOM_CUTOFF; size < MP_MUL_TOOM_CUTOFF + 20; size++) {
      DO(mp_rand(&a, size));
      DO(mp_rand(&b, size));
-      DO(s_mp_toom_mul(&a, &b, &c));
-      DO(s_mp_mul(&a,&b,&d));
+      DO(s_mp_mul_toom(&a, &b, &c));
+      DO(s_mp_mul_full(&a,&b,&d));
      if (mp_cmp(&c, &d) != MP_EQ) {
         fprintf(stderr, "Toom-Cook 3-way multiplication failed at size %d\n", size);
         goto LBL_ERR;
@@ -1994,15 +1994,15 @@ LBL_ERR:
   return EXIT_FAILURE;
 }

-static int test_s_mp_toom_sqr(void)
+static int test_s_mp_sqr_toom(void)
 {
   mp_int a, b, c;
   int size;

   DOR(mp_init_multi(&a, &b, &c, NULL));
-   for (size = MP_TOOM_SQR_CUTOFF; size < MP_TOOM_SQR_CUTOFF + 20; size++) {
+   for (size = MP_SQR_TOOM_CUTOFF; size < MP_SQR_TOOM_CUTOFF + 20; size++) {
      DO(mp_rand(&a, size));
-      DO(s_mp_toom_sqr(&a, &b));
+      DO(s_mp_sqr_toom(&a, &b));
      DO(s_mp_sqr(&a, &c));
      if (mp_cmp(&b, &c) != MP_EQ) {
         fprintf(stderr, "Toom-Cook 3-way squaring failed at size %d\n", size);
@@ -2075,7 +2075,7 @@ static int test_s_mp_div_recursive(void)

   DOR(mp_init_multi(&a, &b, &c_q, &c_r, &d_q, &d_r, NULL));

-   for (size = MP_KARATSUBA_MUL_CUTOFF; size < 3 * MP_KARATSUBA_MUL_CUTOFF; size += 10) {
+   for (size = MP_MUL_KARATSUBA_CUTOFF; size < 3 * MP_MUL_KARATSUBA_CUTOFF; size += 10) {
      printf("\rsizes = %d / %d", 10 * size, size);
      /* Relation 10:1 */
      DO(mp_rand(&a, 10 * size));
@@ -2139,7 +2139,7 @@ static int test_s_mp_div_small(void)
   int size;

   DOR(mp_init_multi(&a, &b, &c_q, &c_r, &d_q, &d_r, NULL));
-   for (size = 1; size < MP_KARATSUBA_MUL_CUTOFF; size += 10) {
+   for (size = 1; size < MP_MUL_KARATSUBA_CUTOFF; size += 10) {
      printf("\rsizes = %d / %d", 2 * size, size);
      /* Relation 10:1 */
      DO(mp_rand(&a, 2 * size));
@@ -2332,11 +2332,11 @@ static int unit_tests(int argc, char **argv)
      T1(mp_xor, MP_XOR),
      T2(s_mp_div_recursive, S_MP_DIV_RECURSIVE, S_MP_DIV_SCHOOL),
      T2(s_mp_div_small, S_MP_DIV_SMALL, S_MP_DIV_SCHOOL),
-      T1(s_mp_balance_mul, S_MP_BALANCE_MUL),
-      T1(s_mp_karatsuba_mul, S_MP_KARATSUBA_MUL),
-      T1(s_mp_karatsuba_sqr, S_MP_KARATSUBA_SQR),
-      T1(s_mp_toom_mul, S_MP_TOOM_MUL),
-      T1(s_mp_toom_sqr, S_MP_TOOM_SQR)
+      T1(s_mp_mul_balance, S_MP_MUL_BALANCE),
+      T1(s_mp_mul_karatsuba, S_MP_MUL_KARATSUBA),
+      T1(s_mp_sqr_karatsuba, S_MP_SQR_KARATSUBA),
+      T1(s_mp_mul_toom, S_MP_MUL_TOOM),
+      T1(s_mp_sqr_toom, S_MP_SQR_TOOM)
 #undef T2
 #undef T1
   };
--- a/demo/timing.c
+++ b/demo/timing.c
@@ -247,18 +247,18 @@ int main(int argc, char **argv)

   if (should_test("mulsqr", argc, argv) != 0) {
      /* do mult/square twice, first without karatsuba and second with */
-      old_kara_m = MP_KARATSUBA_MUL_CUTOFF;
-      old_kara_s = MP_KARATSUBA_SQR_CUTOFF;
+      old_kara_m = MP_MUL_KARATSUBA_CUTOFF;
+      old_kara_s = MP_SQR_KARATSUBA_CUTOFF;
      /* currently toom-cook cut-off is too high to kick in, so we just use the karatsuba values */
      old_toom_m = old_kara_m;
      old_toom_s = old_kara_s;
      for (ix = 0; ix < 3; ix++) {
         printf("With%s Karatsuba, With%s Toom\n", (ix == 1) ? "" : "out", (ix == 2) ? "" : "out");

-         MP_KARATSUBA_MUL_CUTOFF = (ix == 1) ? old_kara_m : 9999;
-         MP_KARATSUBA_SQR_CUTOFF = (ix == 1) ? old_kara_s : 9999;
-         MP_TOOM_MUL_CUTOFF = (ix == 2) ? old_toom_m : 9999;
-         MP_TOOM_SQR_CUTOFF = (ix == 2) ? old_toom_s : 9999;
+         MP_MUL_KARATSUBA_CUTOFF = (ix == 1) ? old_kara_m : 9999;
+         MP_SQR_KARATSUBA_CUTOFF = (ix == 1) ? old_kara_s : 9999;
+         MP_MUL_TOOM_CUTOFF = (ix == 2) ? old_toom_m : 9999;
+         MP_SQR_TOOM_CUTOFF = (ix == 2) ? old_toom_s : 9999;

         log = FOPEN((ix == 0) ? "logs/mult" MP_TIMING_VERSION ".log" : (ix == 1) ? "logs/mult_kara" MP_TIMING_VERSION ".log" :
                     "logs/mult_toom" MP_TIMING_VERSION ".log", "w");
--- a/etc/tune.c
+++ b/etc/tune.c
@@ -58,7 +58,7 @@ static int s_number_of_test_loops;
 static int s_stabilization_extra;
 static int s_offset = 1;

-#define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
+#define s_mp_mul_full(a, b, c) s_mp_mul(a, b, c, (a)->used + (b)->used + 1)
 static uint64_t s_time_mul(int size)
 {
   int x;
@@ -87,7 +87,7 @@ static uint64_t s_time_mul(int size)
         goto LBL_ERR;
      }
      if (s_check_result == 1) {
-         if ((e = s_mp_mul(&a,&b,&d)) != MP_OKAY) {
+         if ((e = s_mp_mul_full(&a,&b,&d)) != MP_OKAY) {
            t1 = UINT64_MAX;
            goto LBL_ERR;
         }
@@ -247,8 +247,8 @@ static void s_usage(char *s)
 }

 struct cutoffs {
-   int KARATSUBA_MUL, KARATSUBA_SQR;
-   int TOOM_MUL, TOOM_SQR;
+   int MUL_KARATSUBA, SQR_KARATSUBA;
+   int MUL_TOOM, SQR_TOOM;
 };

 const struct cutoffs max_cutoffs =
@@ -256,18 +256,18 @@ const struct cutoffs max_cutoffs =

 static void set_cutoffs(const struct cutoffs *c)
 {
-   MP_KARATSUBA_MUL_CUTOFF = c->KARATSUBA_MUL;
-   MP_KARATSUBA_SQR_CUTOFF = c->KARATSUBA_SQR;
-   MP_TOOM_MUL_CUTOFF = c->TOOM_MUL;
-   MP_TOOM_SQR_CUTOFF = c->TOOM_SQR;
+   MP_MUL_KARATSUBA_CUTOFF = c->MUL_KARATSUBA;
+   MP_SQR_KARATSUBA_CUTOFF = c->SQR_KARATSUBA;
+   MP_MUL_TOOM_CUTOFF = c->MUL_TOOM;
+   MP_SQR_TOOM_CUTOFF = c->SQR_TOOM;
 }

 static void get_cutoffs(struct cutoffs *c)
 {
-   c->KARATSUBA_MUL  = MP_KARATSUBA_MUL_CUTOFF;
-   c->KARATSUBA_SQR  = MP_KARATSUBA_SQR_CUTOFF;
-   c->TOOM_MUL = MP_TOOM_MUL_CUTOFF;
-   c->TOOM_SQR = MP_TOOM_SQR_CUTOFF;
+   c->MUL_KARATSUBA  = MP_MUL_KARATSUBA_CUTOFF;
+   c->SQR_KARATSUBA  = MP_SQR_KARATSUBA_CUTOFF;
+   c->MUL_TOOM = MP_MUL_TOOM_CUTOFF;
+   c->SQR_TOOM = MP_SQR_TOOM_CUTOFF;

 }

@@ -292,7 +292,7 @@ int main(int argc, char **argv)
   s_number_of_test_loops = 64;
   s_stabilization_extra = 3;

-   MP_ZERO_BUFFER(&args, sizeof(args));
+   s_mp_zero_buf(&args, sizeof(args));

   args.testmode = 0;
   args.verbose = 0;
@@ -414,13 +414,13 @@ int main(int argc, char **argv)
               s_usage(argv[0]);
            }
            str = argv[opt];
-            MP_KARATSUBA_MUL_CUTOFF = (int)s_strtol(str, &endptr, "[1/4] No value for MP_KARATSUBA_MUL_CUTOFF given");
+            MP_MUL_KARATSUBA_CUTOFF = (int)s_strtol(str, &endptr, "[1/4] No value for MP_MUL_KARATSUBA_CUTOFF given");
            str = endptr + 1;
-            MP_KARATSUBA_SQR_CUTOFF = (int)s_strtol(str, &endptr, "[2/4] No value for MP_KARATSUBA_SQR_CUTOFF given");
+            MP_SQR_KARATSUBA_CUTOFF = (int)s_strtol(str, &endptr, "[2/4] No value for MP_SQR_KARATSUBA_CUTOFF given");
            str = endptr + 1;
-            MP_TOOM_MUL_CUTOFF = (int)s_strtol(str, &endptr, "[3/4] No value for MP_TOOM_MUL_CUTOFF given");
+            MP_MUL_TOOM_CUTOFF = (int)s_strtol(str, &endptr, "[3/4] No value for MP_MUL_TOOM_CUTOFF given");
            str = endptr + 1;
-            MP_TOOM_SQR_CUTOFF = (int)s_strtol(str, &endptr, "[4/4] No value for MP_TOOM_SQR_CUTOFF given");
+            MP_SQR_TOOM_CUTOFF = (int)s_strtol(str, &endptr, "[4/4] No value for MP_SQR_TOOM_CUTOFF given");
            break;
         case 'h':
            s_exit_code = EXIT_SUCCESS;
@@ -455,10 +455,10 @@ int main(int argc, char **argv)
            of the macro MP_WPARRAY in tommath.h which needs to
            be changed manually (to 0 (zero)).
          */
-         T_MUL_SQR("Karatsuba multiplication", KARATSUBA_MUL, s_time_mul),
-         T_MUL_SQR("Karatsuba squaring", KARATSUBA_SQR, s_time_sqr),
-         T_MUL_SQR("Toom-Cook 3-way multiplying", TOOM_MUL, s_time_mul),
-         T_MUL_SQR("Toom-Cook 3-way squaring", TOOM_SQR, s_time_sqr),
+         T_MUL_SQR("Karatsuba multiplication", MUL_KARATSUBA, s_time_mul),
+         T_MUL_SQR("Karatsuba squaring", SQR_KARATSUBA, s_time_sqr),
+         T_MUL_SQR("Toom-Cook 3-way multiplying", MUL_TOOM, s_time_mul),
+         T_MUL_SQR("Toom-Cook 3-way squaring", SQR_TOOM, s_time_sqr),
 #undef T_MUL_SQR
      };
      /* Turn all limits from bncore.c to the max */
@@ -473,15 +473,15 @@ int main(int argc, char **argv)
   }
   if (args.terse == 1) {
      printf("%d %d %d %d\n",
-             updated.KARATSUBA_MUL,
-             updated.KARATSUBA_SQR,
-             updated.TOOM_MUL,
-             updated.TOOM_SQR);
+             updated.MUL_KARATSUBA,
+             updated.SQR_KARATSUBA,
+             updated.MUL_TOOM,
+             updated.SQR_TOOM);
   } else {
-      printf("KARATSUBA_MUL_CUTOFF = %d\n", updated.KARATSUBA_MUL);
-      printf("KARATSUBA_SQR_CUTOFF = %d\n", updated.KARATSUBA_SQR);
-      printf("TOOM_MUL_CUTOFF = %d\n", updated.TOOM_MUL);
-      printf("TOOM_SQR_CUTOFF = %d\n", updated.TOOM_SQR);
+      printf("MUL_KARATSUBA_CUTOFF = %d\n", updated.MUL_KARATSUBA);
+      printf("SQR_KARATSUBA_CUTOFF = %d\n", updated.SQR_KARATSUBA);
+      printf("MUL_TOOM_CUTOFF = %d\n", updated.MUL_TOOM);
+      printf("SQR_TOOM_CUTOFF = %d\n", updated.SQR_TOOM);
   }

   if (args.print == 1) {
@@ -526,15 +526,15 @@ int main(int argc, char **argv)
         set_cutoffs(&orig);
         if (args.terse == 1) {
            printf("%d %d %d %d\n",
-                   MP_KARATSUBA_MUL_CUTOFF,
-                   MP_KARATSUBA_SQR_CUTOFF,
-                   MP_TOOM_MUL_CUTOFF,
-                   MP_TOOM_SQR_CUTOFF);
+                   MP_MUL_KARATSUBA_CUTOFF,
+                   MP_SQR_KARATSUBA_CUTOFF,
+                   MP_MUL_TOOM_CUTOFF,
+                   MP_SQR_TOOM_CUTOFF);
         } else {
-            printf("KARATSUBA_MUL_CUTOFF = %d\n", MP_KARATSUBA_MUL_CUTOFF);
-            printf("KARATSUBA_SQR_CUTOFF = %d\n", MP_KARATSUBA_SQR_CUTOFF);
-            printf("TOOM_MUL_CUTOFF = %d\n", MP_TOOM_MUL_CUTOFF);
-            printf("TOOM_SQR_CUTOFF = %d\n", MP_TOOM_SQR_CUTOFF);
+            printf("MUL_KARATSUBA_CUTOFF = %d\n", MP_MUL_KARATSUBA_CUTOFF);
+            printf("SQR_KARATSUBA_CUTOFF = %d\n", MP_SQR_KARATSUBA_CUTOFF);
+            printf("MUL_TOOM_CUTOFF = %d\n", MP_MUL_TOOM_CUTOFF);
+            printf("SQR_TOOM_CUTOFF = %d\n", MP_SQR_TOOM_CUTOFF);
         }
      }
   }
--- a/etc/tune_it.sh
+++ b/etc/tune_it.sh
@@ -93,15 +93,14 @@ i=$(tail -n +2 $FILE_NAME | wc -l)
 # our median point will be at $i entries
 i=$(( (i / 2) + 1 ))
 TMP=$(median $FILE_NAME 1 $i)
-echo "#define MP_DEFAULT_KARATSUBA_MUL_CUTOFF $TMP"
-echo "#define MP_DEFAULT_KARATSUBA_MUL_CUTOFF $TMP" >> $TOMMATH_CUTOFFS_H || die "(km) Appending to $TOMMATH_CUTOFFS_H" $?
+echo "#define MP_DEFAULT_MUL_KARATSUBA_CUTOFF $TMP"
+echo "#define MP_DEFAULT_MUL_KARATSUBA_CUTOFF $TMP" >> $TOMMATH_CUTOFFS_H || die "(km) Appending to $TOMMATH_CUTOFFS_H" $?
 TMP=$(median $FILE_NAME 2 $i)
-echo "#define MP_DEFAULT_KARATSUBA_SQR_CUTOFF $TMP"
-echo "#define MP_DEFAULT_KARATSUBA_SQR_CUTOFF $TMP" >> $TOMMATH_CUTOFFS_H || die "(ks) Appending to $TOMMATH_CUTOFFS_H" $?
+echo "#define MP_DEFAULT_SQR_KARATSUBA_CUTOFF $TMP"
+echo "#define MP_DEFAULT_SQR_KARATSUBA_CUTOFF $TMP" >> $TOMMATH_CUTOFFS_H || die "(ks) Appending to $TOMMATH_CUTOFFS_H" $?
 TMP=$(median $FILE_NAME 3 $i)
-echo "#define MP_DEFAULT_TOOM_MUL_CUTOFF      $TMP"
-echo "#define MP_DEFAULT_TOOM_MUL_CUTOFF      $TMP" >> $TOMMATH_CUTOFFS_H || die "(tc3m) Appending to $TOMMATH_CUTOFFS_H" $?
+echo "#define MP_DEFAULT_MUL_TOOM_CUTOFF      $TMP"
+echo "#define MP_DEFAULT_MUL_TOOM_CUTOFF      $TMP" >> $TOMMATH_CUTOFFS_H || die "(tc3m) Appending to $TOMMATH_CUTOFFS_H" $?
 TMP=$(median $FILE_NAME 4 $i)
-echo "#define MP_DEFAULT_TOOM_SQR_CUTOFF      $TMP"
-echo "#define MP_DEFAULT_TOOM_SQR_CUTOFF      $TMP" >> $TOMMATH_CUTOFFS_H || die "(tc3s) Appending to $TOMMATH_CUTOFFS_H" $?
-
+echo "#define MP_DEFAULT_SQR_TOOM_CUTOFF      $TMP"
+echo "#define MP_DEFAULT_SQR_TOOM_CUTOFF      $TMP" >> $TOMMATH_CUTOFFS_H || die "(tc3s) Appending to $TOMMATH_CUTOFFS_H" $?
--- a/helper.pl
+++ b/helper.pl
@@ -57,9 +57,8 @@ sub check_source {
      push @{$troubles->{unwanted_calloc}},    $lineno if $file =~ /^[^\/]+\.c$/ && $l =~ /\bcalloc\s*\(/;
      push @{$troubles->{unwanted_free}},      $lineno if $file =~ /^[^\/]+\.c$/ && $l =~ /\bfree\s*\(/;
      # and we probably want to also avoid the following
-      push @{$troubles->{unwanted_memcpy}},    $lineno if $file =~ /^[^\/]+\.c$/ && $l =~ /\bmemcpy\s*\(/;
-      push @{$troubles->{unwanted_memset}},    $lineno if $file =~ /^[^\/]+\.c$/ && $l =~ /\bmemset\s*\(/;
-      push @{$troubles->{unwanted_memcpy}},    $lineno if $file =~ /^[^\/]+\.c$/ && $l =~ /\bmemcpy\s*\(/;
+      push @{$troubles->{unwanted_memcpy}},    $lineno if $file =~ /^[^\/]+\.c$/ && $l =~ /\bmemcpy\s*\(/ && $file !~ /s_mp_copy_digs.c/;
+      push @{$troubles->{unwanted_memset}},    $lineno if $file =~ /^[^\/]+\.c$/ && $l =~ /\bmemset\s*\(/ && $file !~ /s_mp_zero_buf.c/ && $file !~ /s_mp_zero_digs.c/;
      push @{$troubles->{unwanted_memmove}},   $lineno if $file =~ /^[^\/]+\.c$/ && $l =~ /\bmemmove\s*\(/;
      push @{$troubles->{unwanted_memcmp}},    $lineno if $file =~ /^[^\/]+\.c$/ && $l =~ /\bmemcmp\s*\(/;
      push @{$troubles->{unwanted_strcmp}},    $lineno if $file =~ /^[^\/]+\.c$/ && $l =~ /\bstrcmp\s*\(/;
--- a/libtommath_VS2008.vcproj
+++ b/libtommath_VS2008.vcproj
@@ -576,10 +576,6 @@
 			RelativePath="mp_mod_2d.c"
 			>
 		</File>
-		<File
-			RelativePath="mp_mod_d.c"
-			>
-		</File>
 		<File
 			RelativePath="mp_montgomery_calc_normalization.c"
 			>
@@ -833,7 +829,7 @@
 			>
 		</File>
 		<File
-			RelativePath="s_mp_balance_mul.c"
+			RelativePath="s_mp_copy_digs.c"
 			>
 		</File>
 		<File
@@ -861,19 +857,11 @@
 			>
 		</File>
 		<File
-			RelativePath="s_mp_invmod_fast.c"
+			RelativePath="s_mp_invmod.c"
 			>
 		</File>
 		<File
-			RelativePath="s_mp_invmod_slow.c"
-			>
-		</File>
-		<File
-			RelativePath="s_mp_karatsuba_mul.c"
-			>
-		</File>
-		<File
-			RelativePath="s_mp_karatsuba_sqr.c"
+			RelativePath="s_mp_invmod_odd.c"
 			>
 		</File>
 		<File
@@ -889,23 +877,35 @@
 			>
 		</File>
 		<File
-			RelativePath="s_mp_montgomery_reduce_fast.c"
+			RelativePath="s_mp_montgomery_reduce_comba.c"
 			>
 		</File>
 		<File
-			RelativePath="s_mp_mul_digs.c"
+			RelativePath="s_mp_mul.c"
 			>
 		</File>
 		<File
-			RelativePath="s_mp_mul_digs_fast.c"
+			RelativePath="s_mp_mul_balance.c"
 			>
 		</File>
 		<File
-			RelativePath="s_mp_mul_high_digs.c"
+			RelativePath="s_mp_mul_comba.c"
 			>
 		</File>
 		<File
-			RelativePath="s_mp_mul_high_digs_fast.c"
+			RelativePath="s_mp_mul_high.c"
+			>
+		</File>
+		<File
+			RelativePath="s_mp_mul_high_comba.c"
+			>
+		</File>
+		<File
+			RelativePath="s_mp_mul_karatsuba.c"
+			>
+		</File>
+		<File
+			RelativePath="s_mp_mul_toom.c"
 			>
 		</File>
 		<File
@@ -925,7 +925,15 @@
 			>
 		</File>
 		<File
-			RelativePath="s_mp_sqr_fast.c"
+			RelativePath="s_mp_sqr_comba.c"
+			>
+		</File>
+		<File
+			RelativePath="s_mp_sqr_karatsuba.c"
+			>
+		</File>
+		<File
+			RelativePath="s_mp_sqr_toom.c"
 			>
 		</File>
 		<File
@@ -933,11 +941,11 @@
 			>
 		</File>
 		<File
-			RelativePath="s_mp_toom_mul.c"
+			RelativePath="s_mp_zero_buf.c"
 			>
 		</File>
 		<File
-			RelativePath="s_mp_toom_sqr.c"
+			RelativePath="s_mp_zero_digs.c"
 			>
 		</File>
 		<File
--- a/makefile
+++ b/makefile
@@ -34,22 +34,22 @@ mp_from_ubin.o mp_fwrite.o mp_gcd.o mp_get_double.o mp_get_i32.o mp_get_i64.o mp
 mp_get_mag_u32.o mp_get_mag_u64.o mp_get_mag_ul.o mp_get_mag_ull.o mp_grow.o mp_init.o mp_init_copy.o \
 mp_init_i32.o mp_init_i64.o mp_init_l.o mp_init_ll.o mp_init_multi.o mp_init_set.o mp_init_size.o \
 mp_init_u32.o mp_init_u64.o mp_init_ul.o mp_init_ull.o mp_invmod.o mp_is_square.o mp_kronecker.o mp_lcm.o \
-mp_log_u32.o mp_lshd.o mp_mod.o mp_mod_2d.o mp_mod_d.o mp_montgomery_calc_normalization.o \
-mp_montgomery_reduce.o mp_montgomery_setup.o mp_mul.o mp_mul_2.o mp_mul_2d.o mp_mul_d.o mp_mulmod.o \
-mp_neg.o mp_or.o mp_pack.o mp_pack_count.o mp_prime_fermat.o mp_prime_frobenius_underwood.o \
-mp_prime_is_prime.o mp_prime_miller_rabin.o mp_prime_next_prime.o mp_prime_rabin_miller_trials.o \
-mp_prime_rand.o mp_prime_strong_lucas_selfridge.o mp_prime_tab.o mp_radix_size.o mp_radix_smap.o \
-mp_rand.o mp_read_radix.o mp_reduce.o mp_reduce_2k.o mp_reduce_2k_l.o mp_reduce_2k_setup.o \
-mp_reduce_2k_setup_l.o mp_reduce_is_2k.o mp_reduce_is_2k_l.o mp_reduce_setup.o mp_root_u32.o mp_rshd.o \
-mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o mp_set_l.o mp_set_ll.o mp_set_u32.o \
-mp_set_u64.o mp_set_ul.o mp_set_ull.o mp_shrink.o mp_signed_rsh.o mp_sqr.o mp_sqrmod.o mp_sqrt.o \
-mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
-mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_balance_mul.o s_mp_div_recursive.o s_mp_div_school.o \
-s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o s_mp_invmod_fast.o s_mp_invmod_slow.o \
-s_mp_karatsuba_mul.o s_mp_karatsuba_sqr.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o \
-s_mp_montgomery_reduce_fast.o s_mp_mul_digs.o s_mp_mul_digs_fast.o s_mp_mul_high_digs.o \
-s_mp_mul_high_digs_fast.o s_mp_prime_is_divisible.o s_mp_rand_jenkins.o s_mp_rand_platform.o s_mp_sqr.o \
-s_mp_sqr_fast.o s_mp_sub.o s_mp_toom_mul.o s_mp_toom_sqr.o
+mp_log_u32.o mp_lshd.o mp_mod.o mp_mod_2d.o mp_montgomery_calc_normalization.o mp_montgomery_reduce.o \
+mp_montgomery_setup.o mp_mul.o mp_mul_2.o mp_mul_2d.o mp_mul_d.o mp_mulmod.o mp_neg.o mp_or.o mp_pack.o \
+mp_pack_count.o mp_prime_fermat.o mp_prime_frobenius_underwood.o mp_prime_is_prime.o \
+mp_prime_miller_rabin.o mp_prime_next_prime.o mp_prime_rabin_miller_trials.o mp_prime_rand.o \
+mp_prime_strong_lucas_selfridge.o mp_prime_tab.o mp_radix_size.o mp_radix_smap.o mp_rand.o \
+mp_read_radix.o mp_reduce.o mp_reduce_2k.o mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o \
+mp_reduce_is_2k.o mp_reduce_is_2k_l.o mp_reduce_setup.o mp_root_u32.o mp_rshd.o mp_sbin_size.o mp_set.o \
+mp_set_double.o mp_set_i32.o mp_set_i64.o mp_set_l.o mp_set_ll.o mp_set_u32.o mp_set_u64.o mp_set_ul.o \
+mp_set_ull.o mp_shrink.o mp_signed_rsh.o mp_sqr.o mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o \
+mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o \
+s_mp_add.o s_mp_copy_digs.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \
+s_mp_exptmod_fast.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o \
+s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o \
+s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_rand_jenkins.o \
+s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o \
+s_mp_zero_buf.o s_mp_zero_digs.o

 #END_INS

--- a/makefile.mingw
+++ b/makefile.mingw
@@ -37,22 +37,22 @@ mp_from_ubin.o mp_fwrite.o mp_gcd.o mp_get_double.o mp_get_i32.o mp_get_i64.o mp
 mp_get_mag_u32.o mp_get_mag_u64.o mp_get_mag_ul.o mp_get_mag_ull.o mp_grow.o mp_init.o mp_init_copy.o \
 mp_init_i32.o mp_init_i64.o mp_init_l.o mp_init_ll.o mp_init_multi.o mp_init_set.o mp_init_size.o \
 mp_init_u32.o mp_init_u64.o mp_init_ul.o mp_init_ull.o mp_invmod.o mp_is_square.o mp_kronecker.o mp_lcm.o \
-mp_log_u32.o mp_lshd.o mp_mod.o mp_mod_2d.o mp_mod_d.o mp_montgomery_calc_normalization.o \
-mp_montgomery_reduce.o mp_montgomery_setup.o mp_mul.o mp_mul_2.o mp_mul_2d.o mp_mul_d.o mp_mulmod.o \
-mp_neg.o mp_or.o mp_pack.o mp_pack_count.o mp_prime_fermat.o mp_prime_frobenius_underwood.o \
-mp_prime_is_prime.o mp_prime_miller_rabin.o mp_prime_next_prime.o mp_prime_rabin_miller_trials.o \
-mp_prime_rand.o mp_prime_strong_lucas_selfridge.o mp_prime_tab.o mp_radix_size.o mp_radix_smap.o \
-mp_rand.o mp_read_radix.o mp_reduce.o mp_reduce_2k.o mp_reduce_2k_l.o mp_reduce_2k_setup.o \
-mp_reduce_2k_setup_l.o mp_reduce_is_2k.o mp_reduce_is_2k_l.o mp_reduce_setup.o mp_root_u32.o mp_rshd.o \
-mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o mp_set_l.o mp_set_ll.o mp_set_u32.o \
-mp_set_u64.o mp_set_ul.o mp_set_ull.o mp_shrink.o mp_signed_rsh.o mp_sqr.o mp_sqrmod.o mp_sqrt.o \
-mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
-mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_balance_mul.o s_mp_div_recursive.o s_mp_div_school.o \
-s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o s_mp_invmod_fast.o s_mp_invmod_slow.o \
-s_mp_karatsuba_mul.o s_mp_karatsuba_sqr.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o \
-s_mp_montgomery_reduce_fast.o s_mp_mul_digs.o s_mp_mul_digs_fast.o s_mp_mul_high_digs.o \
-s_mp_mul_high_digs_fast.o s_mp_prime_is_divisible.o s_mp_rand_jenkins.o s_mp_rand_platform.o s_mp_sqr.o \
-s_mp_sqr_fast.o s_mp_sub.o s_mp_toom_mul.o s_mp_toom_sqr.o
+mp_log_u32.o mp_lshd.o mp_mod.o mp_mod_2d.o mp_montgomery_calc_normalization.o mp_montgomery_reduce.o \
+mp_montgomery_setup.o mp_mul.o mp_mul_2.o mp_mul_2d.o mp_mul_d.o mp_mulmod.o mp_neg.o mp_or.o mp_pack.o \
+mp_pack_count.o mp_prime_fermat.o mp_prime_frobenius_underwood.o mp_prime_is_prime.o \
+mp_prime_miller_rabin.o mp_prime_next_prime.o mp_prime_rabin_miller_trials.o mp_prime_rand.o \
+mp_prime_strong_lucas_selfridge.o mp_prime_tab.o mp_radix_size.o mp_radix_smap.o mp_rand.o \
+mp_read_radix.o mp_reduce.o mp_reduce_2k.o mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o \
+mp_reduce_is_2k.o mp_reduce_is_2k_l.o mp_reduce_setup.o mp_root_u32.o mp_rshd.o mp_sbin_size.o mp_set.o \
+mp_set_double.o mp_set_i32.o mp_set_i64.o mp_set_l.o mp_set_ll.o mp_set_u32.o mp_set_u64.o mp_set_ul.o \
+mp_set_ull.o mp_shrink.o mp_signed_rsh.o mp_sqr.o mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o \
+mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o \
+s_mp_add.o s_mp_copy_digs.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \
+s_mp_exptmod_fast.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o \
+s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o \
+s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_rand_jenkins.o \
+s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o \
+s_mp_zero_buf.o s_mp_zero_digs.o

 HEADERS_PUB=tommath.h
 HEADERS=tommath_private.h tommath_class.h tommath_superclass.h tommath_cutoffs.h $(HEADERS_PUB)
--- a/makefile.msvc
+++ b/makefile.msvc
@@ -29,22 +29,22 @@ mp_from_ubin.obj mp_fwrite.obj mp_gcd.obj mp_get_double.obj mp_get_i32.obj mp_ge
 mp_get_mag_u32.obj mp_get_mag_u64.obj mp_get_mag_ul.obj mp_get_mag_ull.obj mp_grow.obj mp_init.obj mp_init_copy.obj \
 mp_init_i32.obj mp_init_i64.obj mp_init_l.obj mp_init_ll.obj mp_init_multi.obj mp_init_set.obj mp_init_size.obj \
 mp_init_u32.obj mp_init_u64.obj mp_init_ul.obj mp_init_ull.obj mp_invmod.obj mp_is_square.obj mp_kronecker.obj mp_lcm.obj \
-mp_log_u32.obj mp_lshd.obj mp_mod.obj mp_mod_2d.obj mp_mod_d.obj mp_montgomery_calc_normalization.obj \
-mp_montgomery_reduce.obj mp_montgomery_setup.obj mp_mul.obj mp_mul_2.obj mp_mul_2d.obj mp_mul_d.obj mp_mulmod.obj \
-mp_neg.obj mp_or.obj mp_pack.obj mp_pack_count.obj mp_prime_fermat.obj mp_prime_frobenius_underwood.obj \
-mp_prime_is_prime.obj mp_prime_miller_rabin.obj mp_prime_next_prime.obj mp_prime_rabin_miller_trials.obj \
-mp_prime_rand.obj mp_prime_strong_lucas_selfridge.obj mp_prime_tab.obj mp_radix_size.obj mp_radix_smap.obj \
-mp_rand.obj mp_read_radix.obj mp_reduce.obj mp_reduce_2k.obj mp_reduce_2k_l.obj mp_reduce_2k_setup.obj \
-mp_reduce_2k_setup_l.obj mp_reduce_is_2k.obj mp_reduce_is_2k_l.obj mp_reduce_setup.obj mp_root_u32.obj mp_rshd.obj \
-mp_sbin_size.obj mp_set.obj mp_set_double.obj mp_set_i32.obj mp_set_i64.obj mp_set_l.obj mp_set_ll.obj mp_set_u32.obj \
-mp_set_u64.obj mp_set_ul.obj mp_set_ull.obj mp_shrink.obj mp_signed_rsh.obj mp_sqr.obj mp_sqrmod.obj mp_sqrt.obj \
-mp_sqrtmod_prime.obj mp_sub.obj mp_sub_d.obj mp_submod.obj mp_to_radix.obj mp_to_sbin.obj mp_to_ubin.obj mp_ubin_size.obj \
-mp_unpack.obj mp_xor.obj mp_zero.obj s_mp_add.obj s_mp_balance_mul.obj s_mp_div_recursive.obj s_mp_div_school.obj \
-s_mp_div_small.obj s_mp_exptmod.obj s_mp_exptmod_fast.obj s_mp_get_bit.obj s_mp_invmod_fast.obj s_mp_invmod_slow.obj \
-s_mp_karatsuba_mul.obj s_mp_karatsuba_sqr.obj s_mp_log.obj s_mp_log_d.obj s_mp_log_pow2.obj \
-s_mp_montgomery_reduce_fast.obj s_mp_mul_digs.obj s_mp_mul_digs_fast.obj s_mp_mul_high_digs.obj \
-s_mp_mul_high_digs_fast.obj s_mp_prime_is_divisible.obj s_mp_rand_jenkins.obj s_mp_rand_platform.obj s_mp_sqr.obj \
-s_mp_sqr_fast.obj s_mp_sub.obj s_mp_toom_mul.obj s_mp_toom_sqr.obj
+mp_log_u32.obj mp_lshd.obj mp_mod.obj mp_mod_2d.obj mp_montgomery_calc_normalization.obj mp_montgomery_reduce.obj \
+mp_montgomery_setup.obj mp_mul.obj mp_mul_2.obj mp_mul_2d.obj mp_mul_d.obj mp_mulmod.obj mp_neg.obj mp_or.obj mp_pack.obj \
+mp_pack_count.obj mp_prime_fermat.obj mp_prime_frobenius_underwood.obj mp_prime_is_prime.obj \
+mp_prime_miller_rabin.obj mp_prime_next_prime.obj mp_prime_rabin_miller_trials.obj mp_prime_rand.obj \
+mp_prime_strong_lucas_selfridge.obj mp_prime_tab.obj mp_radix_size.obj mp_radix_smap.obj mp_rand.obj \
+mp_read_radix.obj mp_reduce.obj mp_reduce_2k.obj mp_reduce_2k_l.obj mp_reduce_2k_setup.obj mp_reduce_2k_setup_l.obj \
+mp_reduce_is_2k.obj mp_reduce_is_2k_l.obj mp_reduce_setup.obj mp_root_u32.obj mp_rshd.obj mp_sbin_size.obj mp_set.obj \
+mp_set_double.obj mp_set_i32.obj mp_set_i64.obj mp_set_l.obj mp_set_ll.obj mp_set_u32.obj mp_set_u64.obj mp_set_ul.obj \
+mp_set_ull.obj mp_shrink.obj mp_signed_rsh.obj mp_sqr.obj mp_sqrmod.obj mp_sqrt.obj mp_sqrtmod_prime.obj mp_sub.obj \
+mp_sub_d.obj mp_submod.obj mp_to_radix.obj mp_to_sbin.obj mp_to_ubin.obj mp_ubin_size.obj mp_unpack.obj mp_xor.obj mp_zero.obj \
+s_mp_add.obj s_mp_copy_digs.obj s_mp_div_recursive.obj s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj \
+s_mp_exptmod_fast.obj s_mp_get_bit.obj s_mp_invmod.obj s_mp_invmod_odd.obj s_mp_log.obj s_mp_log_d.obj s_mp_log_pow2.obj \
+s_mp_montgomery_reduce_comba.obj s_mp_mul.obj s_mp_mul_balance.obj s_mp_mul_comba.obj s_mp_mul_high.obj \
+s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj s_mp_mul_toom.obj s_mp_prime_is_divisible.obj s_mp_rand_jenkins.obj \
+s_mp_rand_platform.obj s_mp_sqr.obj s_mp_sqr_comba.obj s_mp_sqr_karatsuba.obj s_mp_sqr_toom.obj s_mp_sub.obj \
+s_mp_zero_buf.obj s_mp_zero_digs.obj

 HEADERS_PUB=tommath.h
 HEADERS=tommath_private.h tommath_class.h tommath_superclass.h tommath_cutoffs.h $(HEADERS_PUB)
--- a/makefile.shared
+++ b/makefile.shared
@ -31,22 +31,22 @@ mp_from_ubin.o mp_fwrite.o mp_gcd.o mp_get_double.o mp_get_i32.o mp_get_i64.o mp
 mp_get_mag_u32.o mp_get_mag_u64.o mp_get_mag_ul.o mp_get_mag_ull.o mp_grow.o mp_init.o mp_init_copy.o \
 mp_init_i32.o mp_init_i64.o mp_init_l.o mp_init_ll.o mp_init_multi.o mp_init_set.o mp_init_size.o \
 mp_init_u32.o mp_init_u64.o mp_init_ul.o mp_init_ull.o mp_invmod.o mp_is_square.o mp_kronecker.o mp_lcm.o \
-mp_log_u32.o mp_lshd.o mp_mod.o mp_mod_2d.o mp_mod_d.o mp_montgomery_calc_normalization.o \
-mp_montgomery_reduce.o mp_montgomery_setup.o mp_mul.o mp_mul_2.o mp_mul_2d.o mp_mul_d.o mp_mulmod.o \
-mp_neg.o mp_or.o mp_pack.o mp_pack_count.o mp_prime_fermat.o mp_prime_frobenius_underwood.o \
-mp_prime_is_prime.o mp_prime_miller_rabin.o mp_prime_next_prime.o mp_prime_rabin_miller_trials.o \
-mp_prime_rand.o mp_prime_strong_lucas_selfridge.o mp_prime_tab.o mp_radix_size.o mp_radix_smap.o \
-mp_rand.o mp_read_radix.o mp_reduce.o mp_reduce_2k.o mp_reduce_2k_l.o mp_reduce_2k_setup.o \
-mp_reduce_2k_setup_l.o mp_reduce_is_2k.o mp_reduce_is_2k_l.o mp_reduce_setup.o mp_root_u32.o mp_rshd.o \
-mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o mp_set_l.o mp_set_ll.o mp_set_u32.o \
-mp_set_u64.o mp_set_ul.o mp_set_ull.o mp_shrink.o mp_signed_rsh.o mp_sqr.o mp_sqrmod.o mp_sqrt.o \
-mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
-mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_balance_mul.o s_mp_div_recursive.o s_mp_div_school.o \
-s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o s_mp_invmod_fast.o s_mp_invmod_slow.o \
-s_mp_karatsuba_mul.o s_mp_karatsuba_sqr.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o \
-s_mp_montgomery_reduce_fast.o s_mp_mul_digs.o s_mp_mul_digs_fast.o s_mp_mul_high_digs.o \
-s_mp_mul_high_digs_fast.o s_mp_prime_is_divisible.o s_mp_rand_jenkins.o s_mp_rand_platform.o s_mp_sqr.o \
-s_mp_sqr_fast.o s_mp_sub.o s_mp_toom_mul.o s_mp_toom_sqr.o
+mp_log_u32.o mp_lshd.o mp_mod.o mp_mod_2d.o mp_montgomery_calc_normalization.o mp_montgomery_reduce.o \
+mp_montgomery_setup.o mp_mul.o mp_mul_2.o mp_mul_2d.o mp_mul_d.o mp_mulmod.o mp_neg.o mp_or.o mp_pack.o \
+mp_pack_count.o mp_prime_fermat.o mp_prime_frobenius_underwood.o mp_prime_is_prime.o \
+mp_prime_miller_rabin.o mp_prime_next_prime.o mp_prime_rabin_miller_trials.o mp_prime_rand.o \
+mp_prime_strong_lucas_selfridge.o mp_prime_tab.o mp_radix_size.o mp_radix_smap.o mp_rand.o \
+mp_read_radix.o mp_reduce.o mp_reduce_2k.o mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o \
+mp_reduce_is_2k.o mp_reduce_is_2k_l.o mp_reduce_setup.o mp_root_u32.o mp_rshd.o mp_sbin_size.o mp_set.o \
+mp_set_double.o mp_set_i32.o mp_set_i64.o mp_set_l.o mp_set_ll.o mp_set_u32.o mp_set_u64.o mp_set_ul.o \
+mp_set_ull.o mp_shrink.o mp_signed_rsh.o mp_sqr.o mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o \
+mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o \
+s_mp_add.o s_mp_copy_digs.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \
+s_mp_exptmod_fast.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o \
+s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o \
+s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_rand_jenkins.o \
+s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o \
+s_mp_zero_buf.o s_mp_zero_digs.o

 #END_INS

--- a/makefile.unix
+++ b/makefile.unix
@ -38,22 +38,22 @@ mp_from_ubin.o mp_fwrite.o mp_gcd.o mp_get_double.o mp_get_i32.o mp_get_i64.o mp
 mp_get_mag_u32.o mp_get_mag_u64.o mp_get_mag_ul.o mp_get_mag_ull.o mp_grow.o mp_init.o mp_init_copy.o \
 mp_init_i32.o mp_init_i64.o mp_init_l.o mp_init_ll.o mp_init_multi.o mp_init_set.o mp_init_size.o \
 mp_init_u32.o mp_init_u64.o mp_init_ul.o mp_init_ull.o mp_invmod.o mp_is_square.o mp_kronecker.o mp_lcm.o \
-mp_log_u32.o mp_lshd.o mp_mod.o mp_mod_2d.o mp_mod_d.o mp_montgomery_calc_normalization.o \
-mp_montgomery_reduce.o mp_montgomery_setup.o mp_mul.o mp_mul_2.o mp_mul_2d.o mp_mul_d.o mp_mulmod.o \
-mp_neg.o mp_or.o mp_pack.o mp_pack_count.o mp_prime_fermat.o mp_prime_frobenius_underwood.o \
-mp_prime_is_prime.o mp_prime_miller_rabin.o mp_prime_next_prime.o mp_prime_rabin_miller_trials.o \
-mp_prime_rand.o mp_prime_strong_lucas_selfridge.o mp_prime_tab.o mp_radix_size.o mp_radix_smap.o \
-mp_rand.o mp_read_radix.o mp_reduce.o mp_reduce_2k.o mp_reduce_2k_l.o mp_reduce_2k_setup.o \
-mp_reduce_2k_setup_l.o mp_reduce_is_2k.o mp_reduce_is_2k_l.o mp_reduce_setup.o mp_root_u32.o mp_rshd.o \
-mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o mp_set_l.o mp_set_ll.o mp_set_u32.o \
-mp_set_u64.o mp_set_ul.o mp_set_ull.o mp_shrink.o mp_signed_rsh.o mp_sqr.o mp_sqrmod.o mp_sqrt.o \
-mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
-mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_balance_mul.o s_mp_div_recursive.o s_mp_div_school.o \
-s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o s_mp_invmod_fast.o s_mp_invmod_slow.o \
-s_mp_karatsuba_mul.o s_mp_karatsuba_sqr.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o \
-s_mp_montgomery_reduce_fast.o s_mp_mul_digs.o s_mp_mul_digs_fast.o s_mp_mul_high_digs.o \
-s_mp_mul_high_digs_fast.o s_mp_prime_is_divisible.o s_mp_rand_jenkins.o s_mp_rand_platform.o s_mp_sqr.o \
-s_mp_sqr_fast.o s_mp_sub.o s_mp_toom_mul.o s_mp_toom_sqr.o
+mp_log_u32.o mp_lshd.o mp_mod.o mp_mod_2d.o mp_montgomery_calc_normalization.o mp_montgomery_reduce.o \
+mp_montgomery_setup.o mp_mul.o mp_mul_2.o mp_mul_2d.o mp_mul_d.o mp_mulmod.o mp_neg.o mp_or.o mp_pack.o \
+mp_pack_count.o mp_prime_fermat.o mp_prime_frobenius_underwood.o mp_prime_is_prime.o \
+mp_prime_miller_rabin.o mp_prime_next_prime.o mp_prime_rabin_miller_trials.o mp_prime_rand.o \
+mp_prime_strong_lucas_selfridge.o mp_prime_tab.o mp_radix_size.o mp_radix_smap.o mp_rand.o \
+mp_read_radix.o mp_reduce.o mp_reduce_2k.o mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o \
+mp_reduce_is_2k.o mp_reduce_is_2k_l.o mp_reduce_setup.o mp_root_u32.o mp_rshd.o mp_sbin_size.o mp_set.o \
+mp_set_double.o mp_set_i32.o mp_set_i64.o mp_set_l.o mp_set_ll.o mp_set_u32.o mp_set_u64.o mp_set_ul.o \
+mp_set_ull.o mp_shrink.o mp_signed_rsh.o mp_sqr.o mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o \
+mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o \
+s_mp_add.o s_mp_copy_digs.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \
+s_mp_exptmod_fast.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o \
+s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o \
+s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_rand_jenkins.o \
+s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o \
+s_mp_zero_buf.o s_mp_zero_digs.o

 HEADERS_PUB=tommath.h
 HEADERS=tommath_private.h tommath_class.h tommath_superclass.h tommath_cutoffs.h $(HEADERS_PUB)
--- a/mp_abs.c
+++ b/mp_abs.c
@ -9,12 +9,11 @@
 */
 mp_err mp_abs(const mp_int *a, mp_int *b)
 {
+   mp_err err;
+
   /* copy a to b */
-   if (a != b) {
-      mp_err err;
-      if ((err = mp_copy(a, b)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_copy(a, b)) != MP_OKAY) {
+      return err;
   }

   /* force the sign of b to positive */
--- a/mp_add_d.c
+++ b/mp_add_d.c
@ -6,9 +6,8 @@
 /* single digit addition */
 mp_err mp_add_d(const mp_int *a, mp_digit b, mp_int *c)
 {
-   mp_err     err;
-   int ix, oldused;
-   mp_digit *tmpa, *tmpc;
+   mp_err err;
+   int oldused;

   /* fast path for a == c */
   if (a == c) {
@ -26,10 +25,8 @@ mp_err mp_add_d(const mp_int *a, mp_digit b, mp_int *c)
   }

   /* grow c as required */
-   if (c->alloc < (a->used + 1)) {
-      if ((err = mp_grow(c, a->used + 1)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(c, a->used + 1)) != MP_OKAY) {
+      return err;
   }

   /* if a is negative and |a| >= b, call c = |a| - b */
@ -53,49 +50,34 @@ mp_err mp_add_d(const mp_int *a, mp_digit b, mp_int *c)
   /* old number of used digits in c */
   oldused = c->used;

-   /* source alias */
-   tmpa    = a->dp;
-
-   /* destination alias */
-   tmpc    = c->dp;
-
   /* if a is positive */
   if (a->sign == MP_ZPOS) {
      /* add digits, mu is carry */
+      int i;
      mp_digit mu = b;
-      for (ix = 0; ix < a->used; ix++) {
-         *tmpc   = *tmpa++ + mu;
-         mu      = *tmpc >> MP_DIGIT_BIT;
-         *tmpc++ &= MP_MASK;
+      for (i = 0; i < a->used; i++) {
+         c->dp[i] = a->dp[i] + mu;
+         mu = c->dp[i] >> MP_DIGIT_BIT;
+         c->dp[i] &= MP_MASK;
      }
      /* set final carry */
-      ix++;
-      *tmpc++  = mu;
+      c->dp[i] = mu;

      /* setup size */
      c->used = a->used + 1;
   } else {
      /* a was negative and |a| < b */
-      c->used  = 1;
+      c->used = 1;

      /* the result is a single digit */
-      if (a->used == 1) {
-         *tmpc++  =  b - a->dp[0];
-      } else {
-         *tmpc++  =  b;
-      }
-
-      /* setup count so the clearing of oldused
-       * can fall through correctly
-       */
-      ix       = 1;
+      c->dp[0] = (a->used == 1) ? b - a->dp[0] : b;
   }

   /* sign always positive */
   c->sign = MP_ZPOS;

   /* now zero to oldused */
-   MP_ZERO_DIGITS(tmpc, oldused - ix);
+   s_mp_zero_digs(c->dp + c->used, oldused - c->used);
   mp_clamp(c);

   return MP_OKAY;
--- a/mp_and.c
+++ b/mp_and.c
@ -11,10 +11,8 @@ mp_err mp_and(const mp_int *a, const mp_int *b, mp_int *c)
   mp_digit ac = 1, bc = 1, cc = 1;
   mp_sign csign = ((a->sign == MP_NEG) && (b->sign == MP_NEG)) ? MP_NEG : MP_ZPOS;

-   if (c->alloc < used) {
-      if ((err = mp_grow(c, used)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(c, used)) != MP_OKAY) {
+      return err;
   }

   for (i = 0; i < used; i++) {
--- a/mp_clear.c
+++ b/mp_clear.c
@ -9,7 +9,7 @@ void mp_clear(mp_int *a)
   /* only do anything if a hasn't been freed previously */
   if (a->dp != NULL) {
      /* free ram */
-      MP_FREE_DIGITS(a->dp, a->alloc);
+      MP_FREE_DIGS(a->dp, a->alloc);

      /* reset members to make debugging easier */
      a->dp    = NULL;
--- a/mp_complement.c
+++ b/mp_complement.c
@ -6,7 +6,8 @@
 /* b = ~a */
 mp_err mp_complement(const mp_int *a, mp_int *b)
 {
-   mp_err err = mp_neg(a, b);
-   return (err == MP_OKAY) ? mp_sub_d(b, 1uL, b) : err;
+   mp_int a_ = *a;
+   a_.sign = ((a_.sign == MP_ZPOS) && !mp_iszero(a)) ? MP_NEG : MP_ZPOS;
+   return mp_sub_d(&a_, 1uL, b);
 }
 #endif
--- a/mp_copy.c
+++ b/mp_copy.c
@ -6,7 +6,7 @@
 /* copy, b = a */
 mp_err mp_copy(const mp_int *a, mp_int *b)
 {
-   int n;
+   mp_err err;

   /* if dst == src do nothing */
   if (a == b) {
@ -14,26 +14,16 @@ mp_err mp_copy(const mp_int *a, mp_int *b)
   }

   /* grow dest */
-   if (b->alloc < a->used) {
-      mp_err err;
-      if ((err = mp_grow(b, a->used)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(b, a->used)) != MP_OKAY) {
+      return err;
   }

-   /* zero b and copy the parameters over */
-
-   /* copy all the digits */
-   for (n = 0; n < a->used; n++) {
-      b->dp[n] = a->dp[n];
-   }
-
-   /* clear high digits */
-   MP_ZERO_DIGITS(b->dp + a->used, b->used - a->used);
-
-   /* copy used count and sign */
+   /* copy everything over and zero high digits */
+   s_mp_copy_digs(b->dp, a->dp, a->used);
+   s_mp_zero_digs(b->dp + a->used, b->used - a->used);
   b->used = a->used;
   b->sign = a->sign;
+
   return MP_OKAY;
 }
 #endif
--- a/mp_cutoffs.c
+++ b/mp_cutoffs.c
@ -5,10 +5,10 @@

 #ifndef MP_FIXED_CUTOFFS
 #include "tommath_cutoffs.h"
-int MP_KARATSUBA_MUL_CUTOFF = MP_DEFAULT_KARATSUBA_MUL_CUTOFF,
-    MP_KARATSUBA_SQR_CUTOFF = MP_DEFAULT_KARATSUBA_SQR_CUTOFF,
-    MP_TOOM_MUL_CUTOFF = MP_DEFAULT_TOOM_MUL_CUTOFF,
-    MP_TOOM_SQR_CUTOFF = MP_DEFAULT_TOOM_SQR_CUTOFF;
+int MP_MUL_KARATSUBA_CUTOFF = MP_DEFAULT_MUL_KARATSUBA_CUTOFF,
+    MP_SQR_KARATSUBA_CUTOFF = MP_DEFAULT_SQR_KARATSUBA_CUTOFF,
+    MP_MUL_TOOM_CUTOFF = MP_DEFAULT_MUL_TOOM_CUTOFF,
+    MP_SQR_TOOM_CUTOFF = MP_DEFAULT_SQR_TOOM_CUTOFF;
 #endif

 #endif
--- a/mp_div.c
+++ b/mp_div.c
@ -26,7 +26,7 @@ mp_err mp_div(const mp_int *a, const mp_int *b, mp_int *c, mp_int *d)
   }

   if (MP_HAS(S_MP_DIV_RECURSIVE)
-       && (b->used > MP_KARATSUBA_MUL_CUTOFF)
+       && (b->used > MP_MUL_KARATSUBA_CUTOFF)
       && (b->used <= ((a->used/3)*2))) {
      err = s_mp_div_recursive(a, b, c, d);
   } else if (MP_HAS(S_MP_DIV_SCHOOL)) {
--- a/mp_div_2.c
+++ b/mp_div_2.c
@ -6,41 +6,32 @@
 /* b = a/2 */
 mp_err mp_div_2(const mp_int *a, mp_int *b)
 {
-   int     x, oldused;
-   mp_digit r, rr, *tmpa, *tmpb;
   mp_err err;
+   int x, oldused;
+   mp_digit r;

-   /* copy */
-   if (b->alloc < a->used) {
-      if ((err = mp_grow(b, a->used)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(b, a->used)) != MP_OKAY) {
+      return err;
   }

   oldused = b->used;
   b->used = a->used;

-   /* source alias */
-   tmpa = a->dp + b->used - 1;
-
-   /* dest alias */
-   tmpb = b->dp + b->used - 1;
-
   /* carry */
   r = 0;
-   for (x = b->used - 1; x >= 0; x--) {
+   for (x = b->used; x --> 0;) {
      /* get the carry for the next iteration */
-      rr = *tmpa & 1u;
+      mp_digit rr = a->dp[x] & 1u;

      /* shift the current digit, add in carry and store */
-      *tmpb-- = (*tmpa-- >> 1) | (r << (MP_DIGIT_BIT - 1));
+      b->dp[x] = (a->dp[x] >> 1) | (r << (MP_DIGIT_BIT - 1));

      /* forward carry to next iteration */
      r = rr;
   }

   /* zero excess digits */
-   MP_ZERO_DIGITS(b->dp + b->used, oldused - b->used);
+   s_mp_zero_digs(b->dp + b->used, oldused - b->used);

   b->sign = a->sign;
   mp_clamp(b);
--- a/mp_div_2d.c
+++ b/mp_div_2d.c
@ -6,23 +6,16 @@
 /* shift right by a certain bit count (store quotient in c, optional remainder in d) */
 mp_err mp_div_2d(const mp_int *a, int b, mp_int *c, mp_int *d)
 {
-   mp_digit D, r, rr;
-   int     x;
   mp_err err;

-   /* if the shift count is <= 0 then we do no work */
-   if (b <= 0) {
-      err = mp_copy(a, c);
-      if (d != NULL) {
-         mp_zero(d);
-      }
-      return err;
+   if (b < 0) {
+      return MP_VAL;
   }

-   /* copy */
   if ((err = mp_copy(a, c)) != MP_OKAY) {
      return err;
   }
+
   /* 'a' should not be used after here - it might be the same as d */

   /* get the remainder */
@ -38,28 +31,25 @@ mp_err mp_div_2d(const mp_int *a, int b, mp_int *c, mp_int *d)
   }

   /* shift any bit count < MP_DIGIT_BIT */
-   D = (mp_digit)(b % MP_DIGIT_BIT);
-   if (D != 0u) {
-      mp_digit *tmpc, mask, shift;
+   b %= MP_DIGIT_BIT;
+   if (b != 0u) {
+      int x;
+      mp_digit r, mask, shift;

      /* mask */
-      mask = ((mp_digit)1 << D) - 1uL;
+      mask = ((mp_digit)1 << b) - 1uL;

      /* shift for lsb */
-      shift = (mp_digit)MP_DIGIT_BIT - D;
-
-      /* alias */
-      tmpc = c->dp + (c->used - 1);
+      shift = (mp_digit)(MP_DIGIT_BIT - b);

      /* carry */
      r = 0;
-      for (x = c->used - 1; x >= 0; x--) {
+      for (x = c->used; x --> 0;) {
         /* get the lower  bits of this word in a temp */
-         rr = *tmpc & mask;
+         mp_digit rr = c->dp[x] & mask;

         /* shift the current word and mix in the carry bits from the previous word */
-         *tmpc = (*tmpc >> D) | (r << shift);
-         --tmpc;
+         c->dp[x] = (c->dp[x] >> b) | (r << shift);

         /* set the carry to the carry bits of the current word found above */
         r = rr;
--- a/mp_div_d.c
+++ b/mp_div_d.c
@ -8,7 +8,6 @@ mp_err mp_div_d(const mp_int *a, mp_digit b, mp_int *c, mp_digit *d)
 {
   mp_int  q;
   mp_word w;
-   mp_digit t;
   mp_err err;
   int ix;

@ -56,14 +55,12 @@ mp_err mp_div_d(const mp_int *a, mp_digit b, mp_int *c, mp_digit *d)
   q.used = a->used;
   q.sign = a->sign;
   w = 0;
-   for (ix = a->used - 1; ix >= 0; ix--) {
+   for (ix = a->used; ix --> 0;) {
+      mp_digit t = 0;
      w = (w << (mp_word)MP_DIGIT_BIT) | (mp_word)a->dp[ix];
-
      if (w >= b) {
         t = (mp_digit)(w / b);
         w -= (mp_word)t * (mp_word)b;
-      } else {
-         t = 0;
      }
      q.dp[ix] = t;
   }
@ -78,7 +75,7 @@ mp_err mp_div_d(const mp_int *a, mp_digit b, mp_int *c, mp_digit *d)
   }
   mp_clear(&q);

-   return err;
+   return MP_OKAY;
 }

 #endif
--- a/mp_dr_reduce.c
+++ b/mp_dr_reduce.c
@ -19,59 +19,49 @@
 */
 mp_err mp_dr_reduce(mp_int *x, const mp_int *n, mp_digit k)
 {
-   mp_err      err;
-   int i, m;
-   mp_word  r;
-   mp_digit mu, *tmpx1, *tmpx2;
+   mp_err err;

   /* m = digits in modulus */
-   m = n->used;
+   int m = n->used;

   /* ensure that "x" has at least 2m digits */
-   if (x->alloc < (m + m)) {
-      if ((err = mp_grow(x, m + m)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(x, m + m)) != MP_OKAY) {
+      return err;
   }

   /* top of loop, this is where the code resumes if
    * another reduction pass is required.
    */
-top:
-   /* aliases for digits */
-   /* alias for lower half of x */
-   tmpx1 = x->dp;
+   for (;;) {
+      int i;
+      mp_digit mu = 0;

-   /* alias for upper half of x, or x/B**m */
-   tmpx2 = x->dp + m;
+      /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
+      for (i = 0; i < m; i++) {
+         mp_word r         = ((mp_word)x->dp[i + m] * (mp_word)k) + x->dp[i] + mu;
+         x->dp[i]  = (mp_digit)(r & MP_MASK);
+         mu        = (mp_digit)(r >> ((mp_word)MP_DIGIT_BIT));
+      }

-   /* set carry to zero */
-   mu = 0;
+      /* set final carry */
+      x->dp[i] = mu;

-   /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
-   for (i = 0; i < m; i++) {
-      r         = ((mp_word)*tmpx2++ * (mp_word)k) + *tmpx1 + mu;
-      *tmpx1++  = (mp_digit)(r & MP_MASK);
-      mu        = (mp_digit)(r >> ((mp_word)MP_DIGIT_BIT));
-   }
+      /* zero words above m */
+      s_mp_zero_digs(x->dp + m + 1, (x->used - m) - 1);

-   /* set final carry */
-   *tmpx1++ = mu;
+      /* clamp, sub and return */
+      mp_clamp(x);

-   /* zero words above m */
-   MP_ZERO_DIGITS(tmpx1, (x->used - m) - 1);
+      /* if x >= n then subtract and reduce again
+       * Each successive "recursion" makes the input smaller and smaller.
+       */
+      if (mp_cmp_mag(x, n) == MP_LT) {
+         break;
+      }

-   /* clamp, sub and return */
-   mp_clamp(x);
-
-   /* if x >= n then subtract and reduce again
-    * Each successive "recursion" makes the input smaller and smaller.
-    */
-   if (mp_cmp_mag(x, n) != MP_LT) {
      if ((err = s_mp_sub(x, n, x)) != MP_OKAY) {
         return err;
      }
-      goto top;
   }
   return MP_OKAY;
 }
--- a/mp_from_ubin.c
+++ b/mp_from_ubin.c
@ -9,10 +9,8 @@ mp_err mp_from_ubin(mp_int *a, const uint8_t *buf, size_t size)
   mp_err err;

   /* make sure there are at least two digits */
-   if (a->alloc < 2) {
-      if ((err = mp_grow(a, 2)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(a, 2)) != MP_OKAY) {
+      return err;
   }

   /* zero the int */
--- a/mp_fwrite.c
+++ b/mp_fwrite.c
@ -25,7 +25,7 @@ mp_err mp_fwrite(const mp_int *a, int radix, FILE *stream)
      }
   }

-   MP_FREE_BUFFER(buf, size);
+   MP_FREE_BUF(buf, size);
   return err;
 }
 #endif
--- a/mp_grow.c
+++ b/mp_grow.c
@ -26,7 +26,7 @@ mp_err mp_grow(mp_int *a, int size)
      a->dp = dp;

      /* zero excess digits */
-      MP_ZERO_DIGITS(a->dp + a->alloc, size - a->alloc);
+      s_mp_zero_digs(a->dp + a->alloc, size - a->alloc);
      a->alloc = size;
   }
   return MP_OKAY;
--- a/mp_invmod.c
+++ b/mp_invmod.c
@ -12,12 +12,12 @@ mp_err mp_invmod(const mp_int *a, const mp_int *b, mp_int *c)
   }

   /* if the modulus is odd we can use a faster routine instead */
-   if (MP_HAS(S_MP_INVMOD_FAST) && mp_isodd(b)) {
-      return s_mp_invmod_fast(a, b, c);
+   if (MP_HAS(S_MP_INVMOD_ODD) && mp_isodd(b)) {
+      return s_mp_invmod_odd(a, b, c);
   }

-   return MP_HAS(S_MP_INVMOD_SLOW)
-          ? s_mp_invmod_slow(a, b, c)
+   return MP_HAS(S_MP_INVMOD)
+          ? s_mp_invmod(a, b, c)
          : MP_VAL;
 }
 #endif
--- a/mp_lshd.c
+++ b/mp_lshd.c
@ -6,6 +6,7 @@
 /* shift left a certain amount of digits */
 mp_err mp_lshd(mp_int *a, int b)
 {
+   mp_err err;
   int x;

   /* if its less than zero return */
@ -18,11 +19,8 @@ mp_err mp_lshd(mp_int *a, int b)
   }

   /* grow to fit the new digits */
-   if (a->alloc < (a->used + b)) {
-      mp_err err;
-      if ((err = mp_grow(a, a->used + b)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(a, a->used + b)) != MP_OKAY) {
+      return err;
   }

   /* increment the used by the shift amount then copy upwards */
@ -37,7 +35,7 @@ mp_err mp_lshd(mp_int *a, int b)
   }

   /* zero the lower digits */
-   MP_ZERO_DIGITS(a->dp, b);
+   s_mp_zero_digs(a->dp, b);

   return MP_OKAY;
 }
--- a/mp_mod_2d.c
+++ b/mp_mod_2d.c
@ -29,7 +29,7 @@ mp_err mp_mod_2d(const mp_int *a, int b, mp_int *c)

   /* zero digits above the last digit of the modulus */
   x = (b / MP_DIGIT_BIT) + (((b % MP_DIGIT_BIT) == 0) ? 0 : 1);
-   MP_ZERO_DIGITS(c->dp + x, c->used - x);
+   s_mp_zero_digs(c->dp + x, c->used - x);

   /* clear the digit that is not completely outside/inside the modulus */
   c->dp[b / MP_DIGIT_BIT] &=
--- a/mp_mod_d.c
+++ b/mp_mod_d.c
@ -1,10 +0,0 @@
-#include "tommath_private.h"
-#ifdef MP_MOD_D_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis */
-/* SPDX-License-Identifier: Unlicense */
-
-mp_err mp_mod_d(const mp_int *a, mp_digit b, mp_digit *c)
-{
-   return mp_div_d(a, b, NULL, c);
-}
-#endif
--- a/mp_montgomery_reduce.c
+++ b/mp_montgomery_reduce.c
@ -6,9 +6,8 @@
 /* computes xR**-1 == x (mod N) via Montgomery Reduction */
 mp_err mp_montgomery_reduce(mp_int *x, const mp_int *n, mp_digit rho)
 {
-   int      ix, digs;
-   mp_err   err;
-   mp_digit mu;
+   mp_err err;
+   int ix, digs;

   /* can the fast reduction [comba] method be used?
    *
@ -20,18 +19,19 @@ mp_err mp_montgomery_reduce(mp_int *x, const mp_int *n, mp_digit rho)
   if ((digs < MP_WARRAY) &&
       (x->used <= MP_WARRAY) &&
       (n->used < MP_MAXFAST)) {
-      return s_mp_montgomery_reduce_fast(x, n, rho);
+      return s_mp_montgomery_reduce_comba(x, n, rho);
   }

   /* grow the input as required */
-   if (x->alloc < digs) {
-      if ((err = mp_grow(x, digs)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(x, digs)) != MP_OKAY) {
+      return err;
   }
   x->used = digs;

   for (ix = 0; ix < n->used; ix++) {
+      int iy;
+      mp_digit u, mu;
+
      /* mu = ai * rho mod b
       *
       * The value of rho must be precalculated via
@ -43,41 +43,28 @@ mp_err mp_montgomery_reduce(mp_int *x, const mp_int *n, mp_digit rho)
      mu = (mp_digit)(((mp_word)x->dp[ix] * (mp_word)rho) & MP_MASK);

      /* a = a + mu * m * b**i */
-      {
-         int iy;
-         mp_digit *tmpn, *tmpx, u;
-         mp_word r;

-         /* alias for digits of the modulus */
-         tmpn = n->dp;
+      /* Multiply and add in place */
+      u = 0;
+      for (iy = 0; iy < n->used; iy++) {
+         /* compute product and sum */
+         mp_word r = ((mp_word)mu * (mp_word)n->dp[iy]) +
+                     (mp_word)u + (mp_word)x->dp[ix + iy];

-         /* alias for the digits of x [the input] */
-         tmpx = x->dp + ix;
+         /* get carry */
+         u       = (mp_digit)(r >> (mp_word)MP_DIGIT_BIT);

-         /* set the carry to zero */
-         u = 0;
+         /* fix digit */
+         x->dp[ix + iy] = (mp_digit)(r & (mp_word)MP_MASK);
+      }
+      /* At this point the ix'th digit of x should be zero */

-         /* Multiply and add in place */
-         for (iy = 0; iy < n->used; iy++) {
-            /* compute product and sum */
-            r       = ((mp_word)mu * (mp_word)*tmpn++) +
-                      (mp_word)u + (mp_word)*tmpx;
-
-            /* get carry */
-            u       = (mp_digit)(r >> (mp_word)MP_DIGIT_BIT);
-
-            /* fix digit */
-            *tmpx++ = (mp_digit)(r & (mp_word)MP_MASK);
-         }
-         /* At this point the ix'th digit of x should be zero */
-
-
-         /* propagate carries upwards as required*/
-         while (u != 0u) {
-            *tmpx   += u;
-            u        = *tmpx >> MP_DIGIT_BIT;
-            *tmpx++ &= MP_MASK;
-         }
+      /* propagate carries upwards as required*/
+      while (u != 0u) {
+         x->dp[ix + iy]   += u;
+         u        = x->dp[ix + iy] >> MP_DIGIT_BIT;
+         x->dp[ix + iy] &= MP_MASK;
+         ++iy;
      }
   }

--- a/mp_mul.c
+++ b/mp_mul.c
@ -7,31 +7,31 @@
 mp_err mp_mul(const mp_int *a, const mp_int *b, mp_int *c)
 {
   mp_err err;
-   int min_len = MP_MIN(a->used, b->used),
-       max_len = MP_MAX(a->used, b->used),
+   int min = MP_MIN(a->used, b->used),
+       max = MP_MAX(a->used, b->used),
       digs = a->used + b->used + 1;
   mp_sign neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;

-   if (MP_HAS(S_MP_BALANCE_MUL) &&
+   if (MP_HAS(S_MP_MUL_BALANCE) &&
       /* Check sizes. The smaller one needs to be larger than the Karatsuba cut-off.
-        * The bigger one needs to be at least about one MP_KARATSUBA_MUL_CUTOFF bigger
+        * The bigger one needs to be at least about one MP_MUL_KARATSUBA_CUTOFF bigger
        * to make some sense, but it depends on architecture, OS, position of the
        * stars... so YMMV.
-        * Using it to cut the input into slices small enough for s_mp_mul_digs_fast
+        * Using it to cut the input into slices small enough for s_mp_mul_comba
        * was actually slower on the author's machine, but YMMV.
        */
-       (min_len >= MP_KARATSUBA_MUL_CUTOFF) &&
-       ((max_len / 2) >= MP_KARATSUBA_MUL_CUTOFF) &&
+       (min >= MP_MUL_KARATSUBA_CUTOFF) &&
+       ((max / 2) >= MP_MUL_KARATSUBA_CUTOFF) &&
       /* Not much effect was observed below a ratio of 1:2, but again: YMMV. */
-       (max_len >= (2 * min_len))) {
-      err = s_mp_balance_mul(a,b,c);
-   } else if (MP_HAS(S_MP_TOOM_MUL) &&
-              (min_len >= MP_TOOM_MUL_CUTOFF)) {
-      err = s_mp_toom_mul(a, b, c);
-   } else if (MP_HAS(S_MP_KARATSUBA_MUL) &&
-              (min_len >= MP_KARATSUBA_MUL_CUTOFF)) {
-      err = s_mp_karatsuba_mul(a, b, c);
-   } else if (MP_HAS(S_MP_MUL_DIGS_FAST) &&
+       (max >= (2 * min))) {
+      err = s_mp_mul_balance(a,b,c);
+   } else if (MP_HAS(S_MP_MUL_TOOM) &&
+              (min >= MP_MUL_TOOM_CUTOFF)) {
+      err = s_mp_mul_toom(a, b, c);
+   } else if (MP_HAS(S_MP_MUL_KARATSUBA) &&
+              (min >= MP_MUL_KARATSUBA_CUTOFF)) {
+      err = s_mp_mul_karatsuba(a, b, c);
+   } else if (MP_HAS(S_MP_MUL_COMBA) &&
              /* can we use the fast multiplier?
               *
               * The fast multiplier can be used if the output will
@ -39,10 +39,10 @@ mp_err mp_mul(const mp_int *a, const mp_int *b, mp_int *c)
               * digits won't affect carry propagation
               */
              (digs < MP_WARRAY) &&
-              (min_len <= MP_MAXFAST)) {
-      err = s_mp_mul_digs_fast(a, b, c, digs);
-   } else if (MP_HAS(S_MP_MUL_DIGS)) {
-      err = s_mp_mul_digs(a, b, c, digs);
+              (min <= MP_MAXFAST)) {
+      err = s_mp_mul_comba(a, b, c, digs);
+   } else if (MP_HAS(S_MP_MUL)) {
+      err = s_mp_mul(a, b, c, digs);
   } else {
      err = MP_VAL;
   }
--- a/mp_mul_2.c
+++ b/mp_mul_2.c
@ -6,58 +6,47 @@
 /* b = a*2 */
 mp_err mp_mul_2(const mp_int *a, mp_int *b)
 {
-   int     x, oldused;
   mp_err err;
+   int x, oldused;
+   mp_digit r;

   /* grow to accomodate result */
-   if (b->alloc < (a->used + 1)) {
-      if ((err = mp_grow(b, a->used + 1)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(b, a->used + 1)) != MP_OKAY) {
+      return err;
   }

   oldused = b->used;
   b->used = a->used;

-   {
-      mp_digit r, rr, *tmpa, *tmpb;
+   /* carry */
+   r = 0;
+   for (x = 0; x < a->used; x++) {

-      /* alias for source */
-      tmpa = a->dp;
-
-      /* alias for dest */
-      tmpb = b->dp;
-
-      /* carry */
-      r = 0;
-      for (x = 0; x < a->used; x++) {
-
-         /* get what will be the *next* carry bit from the
-          * MSB of the current digit
-          */
-         rr = *tmpa >> (mp_digit)(MP_DIGIT_BIT - 1);
-
-         /* now shift up this digit, add in the carry [from the previous] */
-         *tmpb++ = ((*tmpa++ << 1uL) | r) & MP_MASK;
-
-         /* copy the carry that would be from the source
-          * digit into the next iteration
-          */
-         r = rr;
-      }
-
-      /* new leading digit? */
-      if (r != 0u) {
-         /* add a MSB which is always 1 at this point */
-         *tmpb = 1;
-         ++(b->used);
-      }
-
-      /* now zero any excess digits on the destination
-       * that we didn't write to
+      /* get what will be the *next* carry bit from the
+       * MSB of the current digit
       */
-      MP_ZERO_DIGITS(b->dp + b->used, oldused - b->used);
+      mp_digit rr = a->dp[x] >> (mp_digit)(MP_DIGIT_BIT - 1);
+
+      /* now shift up this digit, add in the carry [from the previous] */
+      b->dp[x] = ((a->dp[x] << 1uL) | r) & MP_MASK;
+
+      /* copy the carry that would be from the source
+       * digit into the next iteration
+       */
+      r = rr;
   }
+
+   /* new leading digit? */
+   if (r != 0u) {
+      /* add a MSB which is always 1 at this point */
+      b->dp[b->used++] = 1;
+   }
+
+   /* now zero any excess digits on the destination
+    * that we didn't write to
+    */
+   s_mp_zero_digs(b->dp + b->used, oldused - b->used);
+
   b->sign = a->sign;
   return MP_OKAY;
 }
--- a/mp_mul_2d.c
+++ b/mp_mul_2d.c
@ -6,20 +6,18 @@
 /* shift left by a certain bit count */
 mp_err mp_mul_2d(const mp_int *a, int b, mp_int *c)
 {
-   mp_digit d;
-   mp_err   err;
+   mp_err err;

-   /* copy */
-   if (a != c) {
-      if ((err = mp_copy(a, c)) != MP_OKAY) {
-         return err;
-      }
+   if (b < 0) {
+      return MP_VAL;
   }

-   if (c->alloc < (c->used + (b / MP_DIGIT_BIT) + 1)) {
-      if ((err = mp_grow(c, c->used + (b / MP_DIGIT_BIT) + 1)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_copy(a, c)) != MP_OKAY) {
+      return err;
+   }
+
+   if ((err = mp_grow(c, c->used + (b / MP_DIGIT_BIT) + 1)) != MP_OKAY) {
+      return err;
   }

   /* shift by as many digits in the bit count */
@ -30,29 +28,25 @@ mp_err mp_mul_2d(const mp_int *a, int b, mp_int *c)
   }

   /* shift any bit count < MP_DIGIT_BIT */
-   d = (mp_digit)(b % MP_DIGIT_BIT);
-   if (d != 0u) {
-      mp_digit *tmpc, shift, mask, r, rr;
+   b %= MP_DIGIT_BIT;
+   if (b != 0u) {
+      mp_digit shift, mask, r;
      int x;

      /* bitmask for carries */
-      mask = ((mp_digit)1 << d) - (mp_digit)1;
+      mask = ((mp_digit)1 << b) - (mp_digit)1;

      /* shift for msbs */
-      shift = (mp_digit)MP_DIGIT_BIT - d;
-
-      /* alias */
-      tmpc = c->dp;
+      shift = (mp_digit)(MP_DIGIT_BIT - b);

      /* carry */
      r    = 0;
      for (x = 0; x < c->used; x++) {
         /* get the higher bits of the current word */
-         rr = (*tmpc >> shift) & mask;
+         mp_digit rr = (c->dp[x] >> shift) & mask;

         /* shift the current word and OR in the carry */
-         *tmpc = ((*tmpc << d) | r) & MP_MASK;
-         ++tmpc;
+         c->dp[x] = ((c->dp[x] << b) | r) & MP_MASK;

         /* set the carry to the carry bits of the current word */
         r = rr;
--- a/mp_mul_d.c
+++ b/mp_mul_d.c
@ -6,54 +6,45 @@
 /* multiply by a digit */
 mp_err mp_mul_d(const mp_int *a, mp_digit b, mp_int *c)
 {
-   mp_digit u, *tmpa, *tmpc;
-   mp_word  r;
+   mp_digit u;
   mp_err   err;
-   int      ix, olduse;
+   int   ix, oldused;

   /* make sure c is big enough to hold a*b */
-   if (c->alloc < (a->used + 1)) {
-      if ((err = mp_grow(c, a->used + 1)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(c, a->used + 1)) != MP_OKAY) {
+      return err;
   }

   /* get the original destinations used count */
-   olduse = c->used;
+   oldused = c->used;

   /* set the sign */
   c->sign = a->sign;

-   /* alias for a->dp [source] */
-   tmpa = a->dp;
-
-   /* alias for c->dp [dest] */
-   tmpc = c->dp;
-
   /* zero carry */
   u = 0;

   /* compute columns */
   for (ix = 0; ix < a->used; ix++) {
      /* compute product and carry sum for this term */
-      r       = (mp_word)u + ((mp_word)*tmpa++ * (mp_word)b);
+      mp_word r       = (mp_word)u + ((mp_word)a->dp[ix] * (mp_word)b);

      /* mask off higher bits to get a single digit */
-      *tmpc++ = (mp_digit)(r & (mp_word)MP_MASK);
+      c->dp[ix] = (mp_digit)(r & (mp_word)MP_MASK);

      /* send carry into next iteration */
      u       = (mp_digit)(r >> (mp_word)MP_DIGIT_BIT);
   }

   /* store final carry [if any] and increment ix offset  */
-   *tmpc++ = u;
-   ++ix;
-
-   /* now zero digits above the top */
-   MP_ZERO_DIGITS(tmpc, olduse - ix);
+   c->dp[ix] = u;

   /* set used count */
   c->used = a->used + 1;
+
+   /* now zero digits above the top */
+   s_mp_zero_digs(c->dp + c->used, oldused - c->used);
+
   mp_clamp(c);

   return MP_OKAY;
--- a/mp_neg.c
+++ b/mp_neg.c
@ -6,11 +6,9 @@
 /* b = -a */
 mp_err mp_neg(const mp_int *a, mp_int *b)
 {
-   if (a != b) {
-      mp_err err;
-      if ((err = mp_copy(a, b)) != MP_OKAY) {
-         return err;
-      }
+   mp_err err;
+   if ((err = mp_copy(a, b)) != MP_OKAY) {
+      return err;
   }

   b->sign = mp_iszero(b) || b->sign == MP_NEG ? MP_ZPOS : MP_NEG;
--- a/mp_or.c
+++ b/mp_or.c
@ -11,10 +11,8 @@ mp_err mp_or(const mp_int *a, const mp_int *b, mp_int *c)
   mp_digit ac = 1, bc = 1, cc = 1;
   mp_sign csign = ((a->sign == MP_NEG) || (b->sign == MP_NEG)) ? MP_NEG : MP_ZPOS;

-   if (c->alloc < used) {
-      if ((err = mp_grow(c, used)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(c, used)) != MP_OKAY) {
+      return err;
   }

   for (i = 0; i < used; i++) {
--- a/mp_prime_fermat.c
+++ b/mp_prime_fermat.c
@ -16,9 +16,6 @@ mp_err mp_prime_fermat(const mp_int *a, const mp_int *b, bool *result)
   mp_int  t;
   mp_err  err;

-   /* default to composite  */
-   *result = false;
-
   /* ensure b > 1 */
   if (mp_cmp_d(b, 1uL) != MP_GT) {
      return MP_VAL;
@ -31,16 +28,13 @@ mp_err mp_prime_fermat(const mp_int *a, const mp_int *b, bool *result)

   /* compute t = b**a mod a */
   if ((err = mp_exptmod(b, a, a, &t)) != MP_OKAY) {
-      goto LBL_T;
+      goto LBL_ERR;
   }

   /* is it equal to b? */
-   if (mp_cmp(&t, b) == MP_EQ) {
-      *result = true;
-   }
+   *result = mp_cmp(&t, b) == MP_EQ;

-   err = MP_OKAY;
-LBL_T:
+LBL_ERR:
   mp_clear(&t);
   return err;
 }
--- a/mp_prime_frobenius_underwood.c
+++ b/mp_prime_frobenius_underwood.c
@ -23,17 +23,16 @@
 mp_err mp_prime_frobenius_underwood(const mp_int *N, bool *result)
 {
   mp_int T1z, T2z, Np1z, sz, tz;
-
-   int a, ap2, length, i, j;
+   int a, ap2, i;
   mp_err err;

-   *result = false;
-
   if ((err = mp_init_multi(&T1z, &T2z, &Np1z, &sz, &tz, NULL)) != MP_OKAY) {
      return err;
   }

   for (a = 0; a < LTM_FROBENIUS_UNDERWOOD_A; a++) {
+      int j;
+
      /* TODO: That's ugly! No, really, it is! */
      if ((a==2) || (a==4) || (a==7) || (a==8) || (a==10) ||
          (a==14) || (a==18) || (a==23) || (a==26) || (a==28)) {
@ -42,7 +41,7 @@ mp_err mp_prime_frobenius_underwood(const mp_int *N, bool *result)

      mp_set_i32(&T1z, (int32_t)((a * a) - 4));

-      if ((err = mp_kronecker(&T1z, N, &j)) != MP_OKAY)           goto LBL_FU_ERR;
+      if ((err = mp_kronecker(&T1z, N, &j)) != MP_OKAY)           goto LBL_END;

      if (j == -1) {
         break;
@ -50,73 +49,76 @@ mp_err mp_prime_frobenius_underwood(const mp_int *N, bool *result)

      if (j == 0) {
         /* composite */
-         goto LBL_FU_ERR;
+         *result = false;
+         goto LBL_END;
      }
   }
   /* Tell it a composite and set return value accordingly */
   if (a >= LTM_FROBENIUS_UNDERWOOD_A) {
      err = MP_ITER;
-      goto LBL_FU_ERR;
+      goto LBL_END;
   }
   /* Composite if N and (a+4)*(2*a+5) are not coprime */
   mp_set_u32(&T1z, (uint32_t)((a+4)*((2*a)+5)));

-   if ((err = mp_gcd(N, &T1z, &T1z)) != MP_OKAY)                  goto LBL_FU_ERR;
+   if ((err = mp_gcd(N, &T1z, &T1z)) != MP_OKAY)                  goto LBL_END;

-   if (!((T1z.used == 1) && (T1z.dp[0] == 1u)))                   goto LBL_FU_ERR;
+   if (!((T1z.used == 1) && (T1z.dp[0] == 1u))) {
+      /* composite */
+      *result = false;
+      goto LBL_END;
+   }

   ap2 = a + 2;
-   if ((err = mp_add_d(N, 1uL, &Np1z)) != MP_OKAY)                goto LBL_FU_ERR;
+   if ((err = mp_add_d(N, 1uL, &Np1z)) != MP_OKAY)                goto LBL_END;

   mp_set(&sz, 1uL);
   mp_set(&tz, 2uL);
-   length = mp_count_bits(&Np1z);

-   for (i = length - 2; i >= 0; i--) {
+   for (i = mp_count_bits(&Np1z) - 2; i >= 0; i--) {
      /*
       * temp = (sz*(a*sz+2*tz))%N;
       * tz   = ((tz-sz)*(tz+sz))%N;
       * sz   = temp;
       */
-      if ((err = mp_mul_2(&tz, &T2z)) != MP_OKAY)                 goto LBL_FU_ERR;
+      if ((err = mp_mul_2(&tz, &T2z)) != MP_OKAY)                 goto LBL_END;

      /* a = 0 at about 50% of the cases (non-square and odd input) */
      if (a != 0) {
-         if ((err = mp_mul_d(&sz, (mp_digit)a, &T1z)) != MP_OKAY) goto LBL_FU_ERR;
-         if ((err = mp_add(&T1z, &T2z, &T2z)) != MP_OKAY)         goto LBL_FU_ERR;
+         if ((err = mp_mul_d(&sz, (mp_digit)a, &T1z)) != MP_OKAY) goto LBL_END;
+         if ((err = mp_add(&T1z, &T2z, &T2z)) != MP_OKAY)         goto LBL_END;
      }

-      if ((err = mp_mul(&T2z, &sz, &T1z)) != MP_OKAY)             goto LBL_FU_ERR;
-      if ((err = mp_sub(&tz, &sz, &T2z)) != MP_OKAY)              goto LBL_FU_ERR;
-      if ((err = mp_add(&sz, &tz, &sz)) != MP_OKAY)               goto LBL_FU_ERR;
-      if ((err = mp_mul(&sz, &T2z, &tz)) != MP_OKAY)              goto LBL_FU_ERR;
-      if ((err = mp_mod(&tz, N, &tz)) != MP_OKAY)                 goto LBL_FU_ERR;
-      if ((err = mp_mod(&T1z, N, &sz)) != MP_OKAY)                goto LBL_FU_ERR;
-      if (s_mp_get_bit(&Np1z, (unsigned int)i)) {
+      if ((err = mp_mul(&T2z, &sz, &T1z)) != MP_OKAY)             goto LBL_END;
+      if ((err = mp_sub(&tz, &sz, &T2z)) != MP_OKAY)              goto LBL_END;
+      if ((err = mp_add(&sz, &tz, &sz)) != MP_OKAY)               goto LBL_END;
+      if ((err = mp_mul(&sz, &T2z, &tz)) != MP_OKAY)              goto LBL_END;
+      if ((err = mp_mod(&tz, N, &tz)) != MP_OKAY)                 goto LBL_END;
+      if ((err = mp_mod(&T1z, N, &sz)) != MP_OKAY)                goto LBL_END;
+      if (s_mp_get_bit(&Np1z, i)) {
         /*
          *  temp = (a+2) * sz + tz
          *  tz   = 2 * tz - sz
          *  sz   = temp
          */
         if (a == 0) {
-            if ((err = mp_mul_2(&sz, &T1z)) != MP_OKAY)           goto LBL_FU_ERR;
+            if ((err = mp_mul_2(&sz, &T1z)) != MP_OKAY)           goto LBL_END;
         } else {
-            if ((err = mp_mul_d(&sz, (mp_digit)ap2, &T1z)) != MP_OKAY) goto LBL_FU_ERR;
+            if ((err = mp_mul_d(&sz, (mp_digit)ap2, &T1z)) != MP_OKAY) goto LBL_END;
         }
-         if ((err = mp_add(&T1z, &tz, &T1z)) != MP_OKAY)          goto LBL_FU_ERR;
-         if ((err = mp_mul_2(&tz, &T2z)) != MP_OKAY)              goto LBL_FU_ERR;
-         if ((err = mp_sub(&T2z, &sz, &tz)) != MP_OKAY)           goto LBL_FU_ERR;
+         if ((err = mp_add(&T1z, &tz, &T1z)) != MP_OKAY)          goto LBL_END;
+         if ((err = mp_mul_2(&tz, &T2z)) != MP_OKAY)              goto LBL_END;
+         if ((err = mp_sub(&T2z, &sz, &tz)) != MP_OKAY)           goto LBL_END;
         mp_exch(&sz, &T1z);
      }
   }

   mp_set_u32(&T1z, (uint32_t)((2 * a) + 5));
-   if ((err = mp_mod(&T1z, N, &T1z)) != MP_OKAY)                  goto LBL_FU_ERR;
-   if (mp_iszero(&sz) && (mp_cmp(&tz, &T1z) == MP_EQ)) {
-      *result = true;
-   }
+   if ((err = mp_mod(&T1z, N, &T1z)) != MP_OKAY)                  goto LBL_END;

-LBL_FU_ERR:
+   *result = mp_iszero(&sz) && (mp_cmp(&tz, &T1z) == MP_EQ);
+
+LBL_END:
   mp_clear_multi(&tz, &sz, &Np1z, &T2z, &T1z, NULL);
   return err;
 }
--- a/mp_prime_is_prime.c
+++ b/mp_prime_is_prime.c
@ -13,14 +13,12 @@ static unsigned int s_floor_ilog2(int value)
   return r;
 }

-
 mp_err mp_prime_is_prime(const mp_int *a, int t, bool *result)
 {
   mp_int  b;
-   int     ix, p_max = 0, size_a, len;
-   bool res;
+   int     ix;
+   bool    res;
   mp_err  err;
-   unsigned int fips_rand, mask;

   /* default to no */
   *result = false;
@ -133,6 +131,8 @@ mp_err mp_prime_is_prime(const mp_int *a, int t, bool *result)
      TODO: can be made a bit finer grained but comparing is not free.
   */
   if (t < 0) {
+      int p_max = 0;
+
      /*
          Sorenson, Jonathan; Webster, Jonathan (2015).
           "Strong Pseudoprimes to Twelve Prime Bases".
@ -174,6 +174,9 @@ mp_err mp_prime_is_prime(const mp_int *a, int t, bool *result)
       See Fips 186.4 p. 126ff
   */
   else if (t > 0) {
+      unsigned int mask;
+      int size_a;
+
      /*
       * The mp_digit's have a defined bit-size but the size of the
       * array a.dp is a simple 'int' and this library can not assume full
@ -219,6 +222,9 @@ mp_err mp_prime_is_prime(const mp_int *a, int t, bool *result)
        need to be prime.
      */
      for (ix = 0; ix < t; ix++) {
+         unsigned int fips_rand;
+         int len;
+
         /* mp_rand() guarantees the first digit to be non-zero */
         if ((err = mp_rand(&b, 1)) != MP_OKAY) {
            goto LBL_B;
--- a/mp_prime_miller_rabin.c
+++ b/mp_prime_miller_rabin.c
@ -16,9 +16,6 @@ mp_err mp_prime_miller_rabin(const mp_int *a, const mp_int *b, bool *result)
   mp_err  err;
   int     s, j;

-   /* default */
-   *result = false;
-
   /* ensure b > 1 */
   if (mp_cmp_d(b, 1uL) != MP_GT) {
      return MP_VAL;
@ -29,12 +26,12 @@ mp_err mp_prime_miller_rabin(const mp_int *a, const mp_int *b, bool *result)
      return err;
   }
   if ((err = mp_sub_d(&n1, 1uL, &n1)) != MP_OKAY) {
-      goto LBL_N1;
+      goto LBL_ERR1;
   }

   /* set 2**s * r = n1 */
   if ((err = mp_init_copy(&r, &n1)) != MP_OKAY) {
-      goto LBL_N1;
+      goto LBL_ERR1;
   }

   /* count the number of least significant bits
@ -44,15 +41,15 @@ mp_err mp_prime_miller_rabin(const mp_int *a, const mp_int *b, bool *result)

   /* now divide n - 1 by 2**s */
   if ((err = mp_div_2d(&r, s, &r, NULL)) != MP_OKAY) {
-      goto LBL_R;
+      goto LBL_ERR2;
   }

   /* compute y = b**r mod a */
   if ((err = mp_init(&y)) != MP_OKAY) {
-      goto LBL_R;
+      goto LBL_ERR2;
   }
   if ((err = mp_exptmod(b, &r, a, &y)) != MP_OKAY) {
-      goto LBL_Y;
+      goto LBL_END;
   }

   /* if y != 1 and y != n1 do */
@ -61,12 +58,13 @@ mp_err mp_prime_miller_rabin(const mp_int *a, const mp_int *b, bool *result)
      /* while j <= s-1 and y != n1 */
      while ((j <= (s - 1)) && (mp_cmp(&y, &n1) != MP_EQ)) {
         if ((err = mp_sqrmod(&y, a, &y)) != MP_OKAY) {
-            goto LBL_Y;
+            goto LBL_END;
         }

         /* if y == 1 then composite */
         if (mp_cmp_d(&y, 1uL) == MP_EQ) {
-            goto LBL_Y;
+            *result = false;
+            goto LBL_END;
         }

         ++j;
@ -74,17 +72,19 @@ mp_err mp_prime_miller_rabin(const mp_int *a, const mp_int *b, bool *result)

      /* if y != n1 then composite */
      if (mp_cmp(&y, &n1) != MP_EQ) {
-         goto LBL_Y;
+         *result = false;
+         goto LBL_END;
      }
   }

   /* probably prime now */
   *result = true;
-LBL_Y:
+
+LBL_END:
   mp_clear(&y);
-LBL_R:
+LBL_ERR2:
   mp_clear(&r);
-LBL_N1:
+LBL_ERR1:
   mp_clear(&n1);
   return err;
 }
--- a/mp_prime_next_prime.c
+++ b/mp_prime_next_prime.c
@ -10,11 +10,10 @@
 */
 mp_err mp_prime_next_prime(mp_int *a, int t, bool bbs_style)
 {
-   int      x, y;
-   mp_ord   cmp;
+   int      x;
   mp_err   err;
   bool  res = false;
-   mp_digit res_tab[MP_PRIME_TAB_SIZE], step, kstep;
+   mp_digit res_tab[MP_PRIME_TAB_SIZE], kstep;
   mp_int   b;

   /* force positive */
@ -24,7 +23,7 @@ mp_err mp_prime_next_prime(mp_int *a, int t, bool bbs_style)
   if (mp_cmp_d(a, s_mp_prime_tab[MP_PRIME_TAB_SIZE-1]) == MP_LT) {
      /* find which prime it is bigger than "a" */
      for (x = 0; x < MP_PRIME_TAB_SIZE; x++) {
-         cmp = mp_cmp_d(a, s_mp_prime_tab[x]);
+         mp_ord cmp = mp_cmp_d(a, s_mp_prime_tab[x]);
         if (cmp == MP_EQ) {
            continue;
         }
@ -42,11 +41,7 @@ mp_err mp_prime_next_prime(mp_int *a, int t, bool bbs_style)
   }

   /* generate a prime congruent to 3 mod 4 or 1/3 mod 4? */
-   if (bbs_style) {
-      kstep   = 4;
-   } else {
-      kstep   = 2;
-   }
+   kstep = bbs_style ? 4 : 2;

   /* at this point we will use a combination of a sieve and Miller-Rabin */

@ -79,11 +74,12 @@ mp_err mp_prime_next_prime(mp_int *a, int t, bool bbs_style)
   }

   for (;;) {
+      mp_digit step = 0;
+      bool y;
      /* skip to the next non-trivially divisible candidate */
-      step = 0;
      do {
-         /* y == 1 if any residue was zero [e.g. cannot be prime] */
-         y     =  0;
+         /* y == true if any residue was zero [i.e. cannot be prime] */
+         y     = false;

         /* increase step to next candidate */
         step += kstep;
@ -100,10 +96,10 @@ mp_err mp_prime_next_prime(mp_int *a, int t, bool bbs_style)

            /* set flag if zero */
            if (res_tab[x] == 0u) {
-               y = 1;
+               y = true;
            }
         }
-      } while ((y == 1) && (step < (((mp_digit)1 << MP_DIGIT_BIT) - kstep)));
+      } while (y && (step < (((mp_digit)1 << MP_DIGIT_BIT) - kstep)));

      /* add the step */
      if ((err = mp_add_d(a, step, a)) != MP_OKAY) {
@ -111,7 +107,7 @@ mp_err mp_prime_next_prime(mp_int *a, int t, bool bbs_style)
      }

      /* if didn't pass sieve and step == MP_MAX then skip test */
-      if ((y == 1) && (step >= (((mp_digit)1 << MP_DIGIT_BIT) - kstep))) {
+      if (y && (step >= (((mp_digit)1 << MP_DIGIT_BIT) - kstep))) {
         continue;
      }

@ -123,7 +119,6 @@ mp_err mp_prime_next_prime(mp_int *a, int t, bool bbs_style)
      }
   }

-   err = MP_OKAY;
 LBL_ERR:
   mp_clear(&b);
   return err;
--- a/mp_prime_rand.c
+++ b/mp_prime_rand.c
@ -116,7 +116,7 @@ mp_err mp_prime_rand(mp_int *a, int t, int size, int flags)

   err = MP_OKAY;
 LBL_ERR:
-   MP_FREE_BUFFER(tmp, (size_t)bsize);
+   MP_FREE_BUF(tmp, (size_t)bsize);
   return err;
 }

--- a/mp_prime_strong_lucas_selfridge.c
+++ b/mp_prime_strong_lucas_selfridge.c
@ -192,7 +192,7 @@ mp_err mp_prime_strong_lucas_selfridge(const mp_int *a, bool *result)
      if ((err = mp_mod(&Qmz, a, &Qmz)) != MP_OKAY)               goto LBL_LS_ERR;
      if ((err = mp_mul_2(&Qmz, &Q2mz)) != MP_OKAY)               goto LBL_LS_ERR;

-      if (s_mp_get_bit(&Dz, (unsigned int)u)) {
+      if (s_mp_get_bit(&Dz, u)) {
         /* Formulas for addition of indices (carried out mod N);
          *
          * U_(m+n) = (U_m*V_n + U_n*V_m)/2
--- a/mp_reduce.c
+++ b/mp_reduce.c
@ -26,12 +26,12 @@ mp_err mp_reduce(mp_int *x, const mp_int *m, const mp_int *mu)
      if ((err = mp_mul(&q, mu, &q)) != MP_OKAY) {
         goto LBL_ERR;
      }
-   } else if (MP_HAS(S_MP_MUL_HIGH_DIGS)) {
-      if ((err = s_mp_mul_high_digs(&q, mu, &q, um)) != MP_OKAY) {
+   } else if (MP_HAS(S_MP_MUL_HIGH)) {
+      if ((err = s_mp_mul_high(&q, mu, &q, um)) != MP_OKAY) {
         goto LBL_ERR;
      }
-   } else if (MP_HAS(S_MP_MUL_HIGH_DIGS_FAST)) {
-      if ((err = s_mp_mul_high_digs_fast(&q, mu, &q, um)) != MP_OKAY) {
+   } else if (MP_HAS(S_MP_MUL_HIGH_COMBA)) {
+      if ((err = s_mp_mul_high_comba(&q, mu, &q, um)) != MP_OKAY) {
         goto LBL_ERR;
      }
   } else {
@ -48,7 +48,7 @@ mp_err mp_reduce(mp_int *x, const mp_int *m, const mp_int *mu)
   }

   /* q = q * m mod b**(k+1), quick (no division) */
-   if ((err = s_mp_mul_digs(&q, m, &q, um + 1)) != MP_OKAY) {
+   if ((err = s_mp_mul(&q, m, &q, um + 1)) != MP_OKAY) {
      goto LBL_ERR;
   }

--- a/mp_reduce_2k.c
+++ b/mp_reduce_2k.c
@ -8,36 +8,37 @@ mp_err mp_reduce_2k(mp_int *a, const mp_int *n, mp_digit d)
 {
   mp_int q;
   mp_err err;
-   int    p;
+   int p;

   if ((err = mp_init(&q)) != MP_OKAY) {
      return err;
   }

   p = mp_count_bits(n);
-top:
-   /* q = a/2**p, a = a mod 2**p */
-   if ((err = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
-      goto LBL_ERR;
-   }
-
-   if (d != 1u) {
-      /* q = q * d */
-      if ((err = mp_mul_d(&q, d, &q)) != MP_OKAY) {
+   for (;;) {
+      /* q = a/2**p, a = a mod 2**p */
+      if ((err = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
         goto LBL_ERR;
      }
-   }

-   /* a = a + q */
-   if ((err = s_mp_add(a, &q, a)) != MP_OKAY) {
-      goto LBL_ERR;
-   }
+      if (d != 1u) {
+         /* q = q * d */
+         if ((err = mp_mul_d(&q, d, &q)) != MP_OKAY) {
+            goto LBL_ERR;
+         }
+      }

-   if (mp_cmp_mag(a, n) != MP_LT) {
+      /* a = a + q */
+      if ((err = s_mp_add(a, &q, a)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+
+      if (mp_cmp_mag(a, n) == MP_LT) {
+         break;
+      }
      if ((err = s_mp_sub(a, n, a)) != MP_OKAY) {
         goto LBL_ERR;
      }
-      goto top;
   }

 LBL_ERR:
--- a/mp_reduce_2k_l.c
+++ b/mp_reduce_2k_l.c
@ -18,27 +18,30 @@ mp_err mp_reduce_2k_l(mp_int *a, const mp_int *n, const mp_int *d)
   }

   p = mp_count_bits(n);
-top:
-   /* q = a/2**p, a = a mod 2**p */
-   if ((err = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
-      goto LBL_ERR;
-   }

-   /* q = q * d */
-   if ((err = mp_mul(&q, d, &q)) != MP_OKAY) {
-      goto LBL_ERR;
-   }
+   for (;;) {
+      /* q = a/2**p, a = a mod 2**p */
+      if ((err = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
+         goto LBL_ERR;
+      }

-   /* a = a + q */
-   if ((err = s_mp_add(a, &q, a)) != MP_OKAY) {
-      goto LBL_ERR;
-   }
+      /* q = q * d */
+      if ((err = mp_mul(&q, d, &q)) != MP_OKAY) {
+         goto LBL_ERR;
+      }

-   if (mp_cmp_mag(a, n) != MP_LT) {
+      /* a = a + q */
+      if ((err = s_mp_add(a, &q, a)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+
+      if (mp_cmp_mag(a, n) == MP_LT) {
+         break;
+      }
      if ((err = s_mp_sub(a, n, a)) != MP_OKAY) {
         goto LBL_ERR;
      }
-      goto top;
+
   }

 LBL_ERR:
--- a/mp_reduce_2k_setup.c
+++ b/mp_reduce_2k_setup.c
@ -8,25 +8,23 @@ mp_err mp_reduce_2k_setup(const mp_int *a, mp_digit *d)
 {
   mp_err err;
   mp_int tmp;
-   int    p;

   if ((err = mp_init(&tmp)) != MP_OKAY) {
      return err;
   }

-   p = mp_count_bits(a);
-   if ((err = mp_2expt(&tmp, p)) != MP_OKAY) {
-      mp_clear(&tmp);
-      return err;
+   if ((err = mp_2expt(&tmp, mp_count_bits(a))) != MP_OKAY) {
+      goto LBL_ERR;
   }

   if ((err = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
-      mp_clear(&tmp);
-      return err;
+      goto LBL_ERR;
   }

   *d = tmp.dp[0];
+
+LBL_ERR:
   mp_clear(&tmp);
-   return MP_OKAY;
+   return err;
 }
 #endif
--- a/mp_rshd.c
+++ b/mp_rshd.c
@ -35,7 +35,7 @@ void mp_rshd(mp_int *a, int b)
   }

   /* zero the top digits */
-   MP_ZERO_DIGITS(a->dp + a->used - b, b);
+   s_mp_zero_digs(a->dp + a->used - b, b);

   /* remove excess digits */
   a->used -= b;
--- a/mp_set.c
+++ b/mp_set.c
@ -10,6 +10,6 @@ void mp_set(mp_int *a, mp_digit b)
   a->dp[0] = b & MP_MASK;
   a->sign  = MP_ZPOS;
   a->used  = (a->dp[0] != 0u) ? 1 : 0;
-   MP_ZERO_DIGITS(a->dp + a->used, oldused - a->used);
+   s_mp_zero_digs(a->dp + a->used, oldused - a->used);
 }
 #endif
--- a/mp_sqr.c
+++ b/mp_sqr.c
@ -7,16 +7,16 @@
 mp_err mp_sqr(const mp_int *a, mp_int *b)
 {
   mp_err err;
-   if (MP_HAS(S_MP_TOOM_SQR) && /* use Toom-Cook? */
-       (a->used >= MP_TOOM_SQR_CUTOFF)) {
-      err = s_mp_toom_sqr(a, b);
-   } else if (MP_HAS(S_MP_KARATSUBA_SQR) &&  /* Karatsuba? */
-              (a->used >= MP_KARATSUBA_SQR_CUTOFF)) {
-      err = s_mp_karatsuba_sqr(a, b);
-   } else if (MP_HAS(S_MP_SQR_FAST) && /* can we use the fast comba multiplier? */
+   if (MP_HAS(S_MP_SQR_TOOM) && /* use Toom-Cook? */
+       (a->used >= MP_SQR_TOOM_CUTOFF)) {
+      err = s_mp_sqr_toom(a, b);
+   } else if (MP_HAS(S_MP_SQR_KARATSUBA) &&  /* Karatsuba? */
+              (a->used >= MP_SQR_KARATSUBA_CUTOFF)) {
+      err = s_mp_sqr_karatsuba(a, b);
+   } else if (MP_HAS(S_MP_SQR_COMBA) && /* can we use the fast comba multiplier? */
              (((a->used * 2) + 1) < MP_WARRAY) &&
              (a->used < (MP_MAXFAST / 2))) {
-      err = s_mp_sqr_fast(a, b);
+      err = s_mp_sqr_comba(a, b);
   } else if (MP_HAS(S_MP_SQR)) {
      err = s_mp_sqr(a, b);
   } else {
--- a/mp_sub_d.c
+++ b/mp_sub_d.c
@ -6,9 +6,8 @@
 /* single digit subtraction */
 mp_err mp_sub_d(const mp_int *a, mp_digit b, mp_int *c)
 {
-   mp_digit *tmpa, *tmpc;
-   mp_err    err;
-   int       ix, oldused;
+   mp_err err;
+   int oldused;

   /* fast path for a == c */
   if (a == c) {
@ -25,10 +24,8 @@ mp_err mp_sub_d(const mp_int *a, mp_digit b, mp_int *c)
   }

   /* grow c as required */
-   if (c->alloc < (a->used + 1)) {
-      if ((err = mp_grow(c, a->used + 1)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(c, a->used + 1)) != MP_OKAY) {
+      return err;
   }

   /* if a is negative just do an unsigned
@ -46,24 +43,17 @@ mp_err mp_sub_d(const mp_int *a, mp_digit b, mp_int *c)
      return err;
   }

-   /* setup regs */
   oldused = c->used;
-   tmpa    = a->dp;
-   tmpc    = c->dp;

   /* if a <= b simply fix the single digit */
   if (((a->used == 1) && (a->dp[0] <= b)) || (a->used == 0)) {
-      if (a->used == 1) {
-         *tmpc++ = b - *tmpa;
-      } else {
-         *tmpc++ = b;
-      }
-      ix      = 1;
+      c->dp[0] = (a->used == 1) ? b - a->dp[0] : b;

      /* negative/1digit */
      c->sign = MP_NEG;
      c->used = 1;
   } else {
+      int i;
      mp_digit mu = b;

      /* positive/size */
@ -71,15 +61,15 @@ mp_err mp_sub_d(const mp_int *a, mp_digit b, mp_int *c)
      c->used = a->used;

      /* subtract digits, mu is carry */
-      for (ix = 0; ix < a->used; ix++) {
-         *tmpc    = *tmpa++ - mu;
-         mu       = *tmpc >> (MP_SIZEOF_BITS(mp_digit) - 1u);
-         *tmpc++ &= MP_MASK;
+      for (i = 0; i < a->used; i++) {
+         c->dp[i] = a->dp[i] - mu;
+         mu = c->dp[i] >> (MP_SIZEOF_BITS(mp_digit) - 1u);
+         c->dp[i] &= MP_MASK;
      }
   }

   /* zero excess digits */
-   MP_ZERO_DIGITS(tmpc, oldused - ix);
+   s_mp_zero_digs(c->dp + c->used, oldused - c->used);

   mp_clamp(c);
   return MP_OKAY;
--- a/mp_xor.c
+++ b/mp_xor.c
@ -11,10 +11,8 @@ mp_err mp_xor(const mp_int *a, const mp_int *b, mp_int *c)
   mp_digit ac = 1, bc = 1, cc = 1;
   mp_sign csign = (a->sign != b->sign) ? MP_NEG : MP_ZPOS;

-   if (c->alloc < used) {
-      if ((err = mp_grow(c, used)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(c, used)) != MP_OKAY) {
+      return err;
   }

   for (i = 0; i < used; i++) {
--- a/mp_zero.c
+++ b/mp_zero.c
@ -7,7 +7,7 @@
 void mp_zero(mp_int *a)
 {
   a->sign = MP_ZPOS;
-   MP_ZERO_DIGITS(a->dp, a->used);
+   s_mp_zero_digs(a->dp, a->used);
   a->used = 0;
 }
 #endif
--- a/s_mp_add.c
+++ b/s_mp_add.c
@ -6,85 +6,64 @@
 /* low level addition, based on HAC pp.594, Algorithm 14.7 */
 mp_err s_mp_add(const mp_int *a, const mp_int *b, mp_int *c)
 {
-   const mp_int *x;
+   int oldused, min, max, i;
+   mp_digit u;
   mp_err err;
-   int     olduse, min, max;

   /* find sizes, we let |a| <= |b| which means we have to sort
    * them.  "x" will point to the input with the most digits
    */
-   if (a->used > b->used) {
-      min = b->used;
-      max = a->used;
-      x = a;
-   } else {
-      min = a->used;
-      max = b->used;
-      x = b;
+   if (a->used < b->used) {
+      MP_EXCH(const mp_int *, a, b);
   }

+   min = b->used;
+   max = a->used;
+
   /* init result */
-   if (c->alloc < (max + 1)) {
-      if ((err = mp_grow(c, max + 1)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(c, max + 1)) != MP_OKAY) {
+      return err;
   }

   /* get old used digit count and set new one */
-   olduse = c->used;
+   oldused = c->used;
   c->used = max + 1;

-   {
-      mp_digit u, *tmpa, *tmpb, *tmpc;
-      int i;
+   /* zero the carry */
+   u = 0;
+   for (i = 0; i < min; i++) {
+      /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
+      c->dp[i] = a->dp[i] + b->dp[i] + u;

-      /* alias for digit pointers */
+      /* U = carry bit of T[i] */
+      u = c->dp[i] >> (mp_digit)MP_DIGIT_BIT;

-      /* first input */
-      tmpa = a->dp;
+      /* take away carry bit from T[i] */
+      c->dp[i] &= MP_MASK;
+   }

-      /* second input */
-      tmpb = b->dp;
-
-      /* destination */
-      tmpc = c->dp;
-
-      /* zero the carry */
-      u = 0;
-      for (i = 0; i < min; i++) {
-         /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
-         *tmpc = *tmpa++ + *tmpb++ + u;
+   /* now copy higher words if any, that is in A+B
+    * if A or B has more digits add those in
+    */
+   if (min != max) {
+      for (; i < max; i++) {
+         /* T[i] = A[i] + U */
+         c->dp[i] = a->dp[i] + u;

         /* U = carry bit of T[i] */
-         u = *tmpc >> (mp_digit)MP_DIGIT_BIT;
+         u = c->dp[i] >> (mp_digit)MP_DIGIT_BIT;

         /* take away carry bit from T[i] */
-         *tmpc++ &= MP_MASK;
+         c->dp[i] &= MP_MASK;
      }
-
-      /* now copy higher words if any, that is in A+B
-       * if A or B has more digits add those in
-       */
-      if (min != max) {
-         for (; i < max; i++) {
-            /* T[i] = X[i] + U */
-            *tmpc = x->dp[i] + u;
-
-            /* U = carry bit of T[i] */
-            u = *tmpc >> (mp_digit)MP_DIGIT_BIT;
-
-            /* take away carry bit from T[i] */
-            *tmpc++ &= MP_MASK;
-         }
-      }
-
-      /* add carry */
-      *tmpc++ = u;
-
-      /* clear digits above oldused */
-      MP_ZERO_DIGITS(tmpc, olduse - c->used);
   }

+   /* add carry */
+   c->dp[i] = u;
+
+   /* clear stale digits from the new used count up to oldused */
+   s_mp_zero_digs(c->dp + c->used, oldused - c->used);
+
   mp_clamp(c);
   return MP_OKAY;
 }
--- a/s_mp_copy_digs.c
+++ b/s_mp_copy_digs.c
@ -0,0 +1,23 @@
+#include "tommath_private.h"
+#ifdef S_MP_COPY_DIGS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+#ifdef MP_USE_MEMOPS
+#  include <string.h>
+#endif
+
+void s_mp_copy_digs(mp_digit *d, const mp_digit *s, int digits)
+{
+#ifdef MP_USE_MEMOPS
+   if (digits > 0) {
+      memcpy(d, s, (size_t)digits * sizeof(mp_digit));
+   }
+#else
+   while (digits-- > 0) {
+      *d++ = *s++;
+   }
+#endif
+}
+
+#endif
--- a/s_mp_div_recursive.c
+++ b/s_mp_div_recursive.c
@ -20,7 +20,7 @@ static mp_err s_mp_recursion(const mp_int *a, const mp_int *b, mp_int *q, mp_int
   mp_int A1, A2, B1, B0, Q1, Q0, R1, R0, t;
   int m = a->used - b->used, k = m/2;

-   if (m < MP_KARATSUBA_MUL_CUTOFF) {
+   if (m < MP_MUL_KARATSUBA_CUTOFF) {
      return s_mp_div_school(a, b, q, r);
   }

@ -104,7 +104,7 @@ mp_err s_mp_div_recursive(const mp_int *a, const mp_int *b, mp_int *q, mp_int *r

       Vid. section 2.3.
    */
-   m = MP_KARATSUBA_MUL_CUTOFF;
+   m = MP_MUL_KARATSUBA_CUTOFF;
   while (m <= b->used) {
      m <<= 1;
   }
--- a/s_mp_exptmod_fast.c
+++ b/s_mp_exptmod_fast.c
@ -80,10 +80,10 @@ mp_err s_mp_exptmod_fast(const mp_int *G, const mp_int *X, const mp_int *P, mp_i
      }

      /* automatically pick the comba one if available (saves quite a few calls/ifs) */
-      if (MP_HAS(S_MP_MONTGOMERY_REDUCE_FAST) &&
+      if (MP_HAS(S_MP_MONTGOMERY_REDUCE_COMBA) &&
          (((P->used * 2) + 1) < MP_WARRAY) &&
          (P->used < MP_MAXFAST)) {
-         redux = s_mp_montgomery_reduce_fast;
+         redux = s_mp_montgomery_reduce_comba;
      } else if (MP_HAS(MP_MONTGOMERY_REDUCE)) {
         /* use slower baseline Montgomery method */
         redux = mp_montgomery_reduce;
--- a/s_mp_get_bit.c
+++ b/s_mp_get_bit.c
@ -5,12 +5,12 @@
 /* SPDX-License-Identifier: Unlicense */

 /* Get bit at position b and return true if the bit is 1, false if it is 0 */
-bool s_mp_get_bit(const mp_int *a, unsigned int b)
+bool s_mp_get_bit(const mp_int *a, int b)
 {
   mp_digit bit;
-   int limb = (int)(b / MP_DIGIT_BIT);
+   int limb = b / MP_DIGIT_BIT;

-   if (limb >= a->used) {
+   if (limb < 0 || limb >= a->used) {
      return false;
   }

--- a/s_mp_invmod.c
+++ b/s_mp_invmod.c
@ -0,0 +1,117 @@
+#include "tommath_private.h"
+#ifdef S_MP_INVMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+/* hac 14.61, pp608 */
+mp_err s_mp_invmod(const mp_int *a, const mp_int *b, mp_int *c)
+{
+   mp_int  x, y, u, v, A, B, C, D;
+   mp_err  err;
+
+   /* b cannot be negative or zero */
+   if ((b->sign == MP_NEG) || mp_iszero(b)) {
+      return MP_VAL;
+   }
+
+   /* init temps */
+   if ((err = mp_init_multi(&x, &y, &u, &v,
+                            &A, &B, &C, &D, NULL)) != MP_OKAY) {
+      return err;
+   }
+
+   /* x = a, y = b */
+   if ((err = mp_mod(a, b, &x)) != MP_OKAY)                       goto LBL_ERR;
+   if ((err = mp_copy(b, &y)) != MP_OKAY)                         goto LBL_ERR;
+
+   /* 2. [modified] if x,y are both even then return an error! */
+   if (mp_iseven(&x) && mp_iseven(&y)) {
+      err = MP_VAL;
+      goto LBL_ERR;
+   }
+
+   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+   if ((err = mp_copy(&x, &u)) != MP_OKAY)                        goto LBL_ERR;
+   if ((err = mp_copy(&y, &v)) != MP_OKAY)                        goto LBL_ERR;
+   mp_set(&A, 1uL);
+   mp_set(&D, 1uL);
+
+   do {
+      /* 4.  while u is even do */
+      while (mp_iseven(&u)) {
+         /* 4.1 u = u/2 */
+         if ((err = mp_div_2(&u, &u)) != MP_OKAY)                    goto LBL_ERR;
+
+         /* 4.2 if A or B is odd then */
+         if (mp_isodd(&A) || mp_isodd(&B)) {
+            /* A = (A+y)/2, B = (B-x)/2 */
+            if ((err = mp_add(&A, &y, &A)) != MP_OKAY)               goto LBL_ERR;
+            if ((err = mp_sub(&B, &x, &B)) != MP_OKAY)               goto LBL_ERR;
+         }
+         /* A = A/2, B = B/2 */
+         if ((err = mp_div_2(&A, &A)) != MP_OKAY)                    goto LBL_ERR;
+         if ((err = mp_div_2(&B, &B)) != MP_OKAY)                    goto LBL_ERR;
+      }
+
+      /* 5.  while v is even do */
+      while (mp_iseven(&v)) {
+         /* 5.1 v = v/2 */
+         if ((err = mp_div_2(&v, &v)) != MP_OKAY)                    goto LBL_ERR;
+
+         /* 5.2 if C or D is odd then */
+         if (mp_isodd(&C) || mp_isodd(&D)) {
+            /* C = (C+y)/2, D = (D-x)/2 */
+            if ((err = mp_add(&C, &y, &C)) != MP_OKAY)               goto LBL_ERR;
+            if ((err = mp_sub(&D, &x, &D)) != MP_OKAY)               goto LBL_ERR;
+         }
+         /* C = C/2, D = D/2 */
+         if ((err = mp_div_2(&C, &C)) != MP_OKAY)                    goto LBL_ERR;
+         if ((err = mp_div_2(&D, &D)) != MP_OKAY)                    goto LBL_ERR;
+      }
+
+      /* 6.  if u >= v then */
+      if (mp_cmp(&u, &v) != MP_LT) {
+         /* u = u - v, A = A - C, B = B - D */
+         if ((err = mp_sub(&u, &v, &u)) != MP_OKAY)                  goto LBL_ERR;
+
+         if ((err = mp_sub(&A, &C, &A)) != MP_OKAY)                  goto LBL_ERR;
+
+         if ((err = mp_sub(&B, &D, &B)) != MP_OKAY)                  goto LBL_ERR;
+      } else {
+         /* v = v - u, C = C - A, D = D - B */
+         if ((err = mp_sub(&v, &u, &v)) != MP_OKAY)                  goto LBL_ERR;
+
+         if ((err = mp_sub(&C, &A, &C)) != MP_OKAY)                  goto LBL_ERR;
+
+         if ((err = mp_sub(&D, &B, &D)) != MP_OKAY)                  goto LBL_ERR;
+      }
+
+      /* if not zero goto step 4 */
+   } while (!mp_iszero(&u));
+
+   /* now a = C, b = D, gcd == g*v */
+
+   /* if v != 1 then there is no inverse */
+   if (mp_cmp_d(&v, 1uL) != MP_EQ) {
+      err = MP_VAL;
+      goto LBL_ERR;
+   }
+
+   /* if C is negative, add b until it is no longer negative */
+   while (mp_cmp_d(&C, 0uL) == MP_LT) {
+      if ((err = mp_add(&C, b, &C)) != MP_OKAY)                   goto LBL_ERR;
+   }
+
+   /* too big */
+   while (mp_cmp_mag(&C, b) != MP_LT) {
+      if ((err = mp_sub(&C, b, &C)) != MP_OKAY)                   goto LBL_ERR;
+   }
+
+   /* C is now the inverse */
+   mp_exch(&C, c);
+
+LBL_ERR:
+   mp_clear_multi(&x, &y, &u, &v, &A, &B, &C, &D, NULL);
+   return err;
+}
+#endif
--- a/s_mp_invmod_fast.c
+++ b/s_mp_invmod_fast.c
@ -1,5 +1,5 @@
 #include "tommath_private.h"
-#ifdef S_MP_INVMOD_FAST_C
+#ifdef S_MP_INVMOD_ODD_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */

@ -9,7 +9,7 @@
 * Based on slow invmod except this is optimized for the case where b is
 * odd as per HAC Note 14.64 on pp. 610
 */
-mp_err s_mp_invmod_fast(const mp_int *a, const mp_int *b, mp_int *c)
+mp_err s_mp_invmod_odd(const mp_int *a, const mp_int *b, mp_int *c)
 {
   mp_int  x, y, u, v, B, D;
   mp_sign neg;
@ -42,51 +42,49 @@ mp_err s_mp_invmod_fast(const mp_int *a, const mp_int *b, mp_int *c)
   if ((err = mp_copy(&y, &v)) != MP_OKAY)                        goto LBL_ERR;
   mp_set(&D, 1uL);

-top:
-   /* 4.  while u is even do */
-   while (mp_iseven(&u)) {
-      /* 4.1 u = u/2 */
-      if ((err = mp_div_2(&u, &u)) != MP_OKAY)                    goto LBL_ERR;
+   do {
+      /* 4.  while u is even do */
+      while (mp_iseven(&u)) {
+         /* 4.1 u = u/2 */
+         if ((err = mp_div_2(&u, &u)) != MP_OKAY)                    goto LBL_ERR;

-      /* 4.2 if B is odd then */
-      if (mp_isodd(&B)) {
-         if ((err = mp_sub(&B, &x, &B)) != MP_OKAY)               goto LBL_ERR;
+         /* 4.2 if B is odd then */
+         if (mp_isodd(&B)) {
+            if ((err = mp_sub(&B, &x, &B)) != MP_OKAY)               goto LBL_ERR;
+         }
+         /* B = B/2 */
+         if ((err = mp_div_2(&B, &B)) != MP_OKAY)                    goto LBL_ERR;
      }
-      /* B = B/2 */
-      if ((err = mp_div_2(&B, &B)) != MP_OKAY)                    goto LBL_ERR;
-   }

-   /* 5.  while v is even do */
-   while (mp_iseven(&v)) {
-      /* 5.1 v = v/2 */
-      if ((err = mp_div_2(&v, &v)) != MP_OKAY)                    goto LBL_ERR;
+      /* 5.  while v is even do */
+      while (mp_iseven(&v)) {
+         /* 5.1 v = v/2 */
+         if ((err = mp_div_2(&v, &v)) != MP_OKAY)                    goto LBL_ERR;

-      /* 5.2 if D is odd then */
-      if (mp_isodd(&D)) {
-         /* D = (D-x)/2 */
-         if ((err = mp_sub(&D, &x, &D)) != MP_OKAY)               goto LBL_ERR;
+         /* 5.2 if D is odd then */
+         if (mp_isodd(&D)) {
+            /* D = (D-x)/2 */
+            if ((err = mp_sub(&D, &x, &D)) != MP_OKAY)               goto LBL_ERR;
+         }
+         /* D = D/2 */
+         if ((err = mp_div_2(&D, &D)) != MP_OKAY)                    goto LBL_ERR;
      }
-      /* D = D/2 */
-      if ((err = mp_div_2(&D, &D)) != MP_OKAY)                    goto LBL_ERR;
-   }

-   /* 6.  if u >= v then */
-   if (mp_cmp(&u, &v) != MP_LT) {
-      /* u = u - v, B = B - D */
-      if ((err = mp_sub(&u, &v, &u)) != MP_OKAY)                  goto LBL_ERR;
+      /* 6.  if u >= v then */
+      if (mp_cmp(&u, &v) != MP_LT) {
+         /* u = u - v, B = B - D */
+         if ((err = mp_sub(&u, &v, &u)) != MP_OKAY)                  goto LBL_ERR;

-      if ((err = mp_sub(&B, &D, &B)) != MP_OKAY)                  goto LBL_ERR;
-   } else {
-      /* v - v - u, D = D - B */
-      if ((err = mp_sub(&v, &u, &v)) != MP_OKAY)                  goto LBL_ERR;
+         if ((err = mp_sub(&B, &D, &B)) != MP_OKAY)                  goto LBL_ERR;
+      } else {
+         /* v = v - u, D = D - B */
+         if ((err = mp_sub(&v, &u, &v)) != MP_OKAY)                  goto LBL_ERR;

-      if ((err = mp_sub(&D, &B, &D)) != MP_OKAY)                  goto LBL_ERR;
-   }
+         if ((err = mp_sub(&D, &B, &D)) != MP_OKAY)                  goto LBL_ERR;
+      }

-   /* if not zero goto step 4 */
-   if (!mp_iszero(&u)) {
-      goto top;
-   }
+      /* if not zero goto step 4 */
+   } while (!mp_iszero(&u));

   /* now a = C, b = D, gcd == g*v */

--- a/s_mp_invmod_slow.c
+++ b/s_mp_invmod_slow.c
@ -1,119 +0,0 @@
-#include "tommath_private.h"
-#ifdef S_MP_INVMOD_SLOW_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis */
-/* SPDX-License-Identifier: Unlicense */
-
-/* hac 14.61, pp608 */
-mp_err s_mp_invmod_slow(const mp_int *a, const mp_int *b, mp_int *c)
-{
-   mp_int  x, y, u, v, A, B, C, D;
-   mp_err  err;
-
-   /* b cannot be negative */
-   if ((b->sign == MP_NEG) || mp_iszero(b)) {
-      return MP_VAL;
-   }
-
-   /* init temps */
-   if ((err = mp_init_multi(&x, &y, &u, &v,
-                            &A, &B, &C, &D, NULL)) != MP_OKAY) {
-      return err;
-   }
-
-   /* x = a, y = b */
-   if ((err = mp_mod(a, b, &x)) != MP_OKAY)                       goto LBL_ERR;
-   if ((err = mp_copy(b, &y)) != MP_OKAY)                         goto LBL_ERR;
-
-   /* 2. [modified] if x,y are both even then return an error! */
-   if (mp_iseven(&x) && mp_iseven(&y)) {
-      err = MP_VAL;
-      goto LBL_ERR;
-   }
-
-   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
-   if ((err = mp_copy(&x, &u)) != MP_OKAY)                        goto LBL_ERR;
-   if ((err = mp_copy(&y, &v)) != MP_OKAY)                        goto LBL_ERR;
-   mp_set(&A, 1uL);
-   mp_set(&D, 1uL);
-
-top:
-   /* 4.  while u is even do */
-   while (mp_iseven(&u)) {
-      /* 4.1 u = u/2 */
-      if ((err = mp_div_2(&u, &u)) != MP_OKAY)                    goto LBL_ERR;
-
-      /* 4.2 if A or B is odd then */
-      if (mp_isodd(&A) || mp_isodd(&B)) {
-         /* A = (A+y)/2, B = (B-x)/2 */
-         if ((err = mp_add(&A, &y, &A)) != MP_OKAY)               goto LBL_ERR;
-         if ((err = mp_sub(&B, &x, &B)) != MP_OKAY)               goto LBL_ERR;
-      }
-      /* A = A/2, B = B/2 */
-      if ((err = mp_div_2(&A, &A)) != MP_OKAY)                    goto LBL_ERR;
-      if ((err = mp_div_2(&B, &B)) != MP_OKAY)                    goto LBL_ERR;
-   }
-
-   /* 5.  while v is even do */
-   while (mp_iseven(&v)) {
-      /* 5.1 v = v/2 */
-      if ((err = mp_div_2(&v, &v)) != MP_OKAY)                    goto LBL_ERR;
-
-      /* 5.2 if C or D is odd then */
-      if (mp_isodd(&C) || mp_isodd(&D)) {
-         /* C = (C+y)/2, D = (D-x)/2 */
-         if ((err = mp_add(&C, &y, &C)) != MP_OKAY)               goto LBL_ERR;
-         if ((err = mp_sub(&D, &x, &D)) != MP_OKAY)               goto LBL_ERR;
-      }
-      /* C = C/2, D = D/2 */
-      if ((err = mp_div_2(&C, &C)) != MP_OKAY)                    goto LBL_ERR;
-      if ((err = mp_div_2(&D, &D)) != MP_OKAY)                    goto LBL_ERR;
-   }
-
-   /* 6.  if u >= v then */
-   if (mp_cmp(&u, &v) != MP_LT) {
-      /* u = u - v, A = A - C, B = B - D */
-      if ((err = mp_sub(&u, &v, &u)) != MP_OKAY)                  goto LBL_ERR;
-
-      if ((err = mp_sub(&A, &C, &A)) != MP_OKAY)                  goto LBL_ERR;
-
-      if ((err = mp_sub(&B, &D, &B)) != MP_OKAY)                  goto LBL_ERR;
-   } else {
-      /* v - v - u, C = C - A, D = D - B */
-      if ((err = mp_sub(&v, &u, &v)) != MP_OKAY)                  goto LBL_ERR;
-
-      if ((err = mp_sub(&C, &A, &C)) != MP_OKAY)                  goto LBL_ERR;
-
-      if ((err = mp_sub(&D, &B, &D)) != MP_OKAY)                  goto LBL_ERR;
-   }
-
-   /* if not zero goto step 4 */
-   if (!mp_iszero(&u)) {
-      goto top;
-   }
-
-   /* now a = C, b = D, gcd == g*v */
-
-   /* if v != 1 then there is no inverse */
-   if (mp_cmp_d(&v, 1uL) != MP_EQ) {
-      err = MP_VAL;
-      goto LBL_ERR;
-   }
-
-   /* if its too low */
-   while (mp_cmp_d(&C, 0uL) == MP_LT) {
-      if ((err = mp_add(&C, b, &C)) != MP_OKAY)                   goto LBL_ERR;
-   }
-
-   /* too big */
-   while (mp_cmp_mag(&C, b) != MP_LT) {
-      if ((err = mp_sub(&C, b, &C)) != MP_OKAY)                   goto LBL_ERR;
-   }
-
-   /* C is now the inverse */
-   mp_exch(&C, c);
-   err = MP_OKAY;
-LBL_ERR:
-   mp_clear_multi(&x, &y, &u, &v, &A, &B, &C, &D, NULL);
-   return err;
-}
-#endif
--- a/s_mp_montgomery_reduce_comba.c
+++ b/s_mp_montgomery_reduce_comba.c
@ -1,5 +1,5 @@
 #include "tommath_private.h"
-#ifdef S_MP_MONTGOMERY_REDUCE_FAST_C
+#ifdef S_MP_MONTGOMERY_REDUCE_COMBA_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */

@ -11,9 +11,9 @@
 *
 * Based on Algorithm 14.32 on pp.601 of HAC.
 */
-mp_err s_mp_montgomery_reduce_fast(mp_int *x, const mp_int *n, mp_digit rho)
+mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho)
 {
-   int     ix, olduse;
+   int     ix, oldused;
   mp_err  err;
   mp_word W[MP_WARRAY];

@ -22,50 +22,40 @@ mp_err s_mp_montgomery_reduce_fast(mp_int *x, const mp_int *n, mp_digit rho)
   }

   /* get old used count */
-   olduse = x->used;
+   oldused = x->used;

   /* grow a as required */
-   if (x->alloc < (n->used + 1)) {
-      if ((err = mp_grow(x, n->used + 1)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(x, n->used + 1)) != MP_OKAY) {
+      return err;
   }

   /* first we have to get the digits of the input into
    * an array of double precision words W[...]
    */
-   {
-      mp_word *_W;
-      mp_digit *tmpx;

-      /* alias for the W[] array */
-      _W   = W;
+   /* copy the digits of x into W[0..x->used-1] */
+   for (ix = 0; ix < x->used; ix++) {
+      W[ix] = x->dp[ix];
+   }

-      /* alias for the digits of  x*/
-      tmpx = x->dp;
-
-      /* copy the digits of a into W[0..a->used-1] */
-      for (ix = 0; ix < x->used; ix++) {
-         *_W++ = *tmpx++;
-      }
-
-      /* zero the high words of W[a->used..m->used*2] */
-      if (ix < ((n->used * 2) + 1)) {
-         MP_ZERO_BUFFER(_W, sizeof(mp_word) * (size_t)(((n->used * 2) + 1) - ix));
-      }
+   /* zero the high words of W[x->used..n->used*2] */
+   if (ix < ((n->used * 2) + 1)) {
+      s_mp_zero_buf(W + x->used, sizeof(mp_word) * (size_t)(((n->used * 2) + 1) - ix));
   }

   /* now we proceed to zero successive digits
    * from the least significant upwards
    */
   for (ix = 0; ix < n->used; ix++) {
+      int iy;
+      mp_digit mu;
+
      /* mu = ai * m' mod b
       *
       * We avoid a double precision multiplication (which isn't required)
       * by casting the value down to a mp_digit.  Note this requires
       * that W[ix-1] have  the carry cleared (see after the inner loop)
       */
-      mp_digit mu;
      mu = ((W[ix] & MP_MASK) * rho) & MP_MASK;

      /* a = a + mu * m * b**i
@ -82,21 +72,8 @@ mp_err s_mp_montgomery_reduce_fast(mp_int *x, const mp_int *n, mp_digit rho)
       * carry fixups are done in order so after these loops the
       * first m->used words of W[] have the carries fixed
       */
-      {
-         int iy;
-         mp_digit *tmpn;
-         mp_word *_W;
-
-         /* alias for the digits of the modulus */
-         tmpn = n->dp;
-
-         /* Alias for the columns set by an offset of ix */
-         _W = W + ix;
-
-         /* inner loop */
-         for (iy = 0; iy < n->used; iy++) {
-            *_W++ += (mp_word)mu * (mp_word)*tmpn++;
-         }
+      for (iy = 0; iy < n->used; iy++) {
+         W[ix + iy] += (mp_word)mu * (mp_word)n->dp[iy];
      }

      /* now fix carry for next digit, W[ix+1] */
@ -107,47 +84,30 @@ mp_err s_mp_montgomery_reduce_fast(mp_int *x, const mp_int *n, mp_digit rho)
    * shift the words downward [all those least
    * significant digits we zeroed].
    */
-   {
-      mp_digit *tmpx;
-      mp_word *_W, *_W1;

-      /* nox fix rest of carries */
-
-      /* alias for current word */
-      _W1 = W + ix;
-
-      /* alias for next word, where the carry goes */
-      _W = W + ++ix;
-
-      for (; ix < ((n->used * 2) + 1); ix++) {
-         *_W++ += *_W1++ >> (mp_word)MP_DIGIT_BIT;
-      }
-
-      /* copy out, A = A/b**n
-       *
-       * The result is A/b**n but instead of converting from an
-       * array of mp_word to mp_digit than calling mp_rshd
-       * we just copy them in the right order
-       */
-
-      /* alias for destination word */
-      tmpx = x->dp;
-
-      /* alias for shifted double precision result */
-      _W = W + n->used;
-
-      for (ix = 0; ix < (n->used + 1); ix++) {
-         *tmpx++ = *_W++ & (mp_word)MP_MASK;
-      }
-
-      /* zero oldused digits, if the input a was larger than
-       * m->used+1 we'll have to clear the digits
-       */
-      MP_ZERO_DIGITS(tmpx, olduse - ix);
+   for (; ix < (n->used * 2); ix++) {
+      W[ix + 1] += W[ix] >> (mp_word)MP_DIGIT_BIT;
   }

-   /* set the max used and clamp */
+   /* copy out, A = A/b**n
+    *
+    * The result is A/b**n but instead of converting from an
+    * array of mp_word to mp_digit then calling mp_rshd
+    * we just copy them in the right order
+    */
+
+   for (ix = 0; ix < (n->used + 1); ix++) {
+      x->dp[ix] = W[n->used + ix] & (mp_word)MP_MASK;
+   }
+
+   /* set the max used */
   x->used = n->used + 1;
+
+   /* zero the leftover old digits: if the input x had more than
+    * n->used+1 digits, the excess ones must be cleared
+    */
+   s_mp_zero_digs(x->dp + x->used, oldused - x->used);
+
   mp_clamp(x);

   /* if A >= m then A = A - m */
--- a/s_mp_mul_digs.c
+++ b/s_mp_mul_digs.c
@ -1,5 +1,5 @@
 #include "tommath_private.h"
-#ifdef S_MP_MUL_DIGS_C
+#ifdef S_MP_MUL_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */

@ -7,19 +7,16 @@
 * HAC pp. 595, Algorithm 14.12  Modified so you can control how
 * many digits of output are created.
 */
-mp_err s_mp_mul_digs(const mp_int *a, const mp_int *b, mp_int *c, int digs)
+mp_err s_mp_mul(const mp_int *a, const mp_int *b, mp_int *c, int digs)
 {
   mp_int  t;
   mp_err  err;
-   int     pa, pb, ix, iy;
-   mp_digit u;
-   mp_word r;
-   mp_digit tmpx, *tmpt, *tmpy;
+   int     pa, ix;

   /* can we use the fast multiplier? */
   if ((digs < MP_WARRAY) &&
       (MP_MIN(a->used, b->used) < MP_MAXFAST)) {
-      return s_mp_mul_digs_fast(a, b, c, digs);
+      return s_mp_mul_comba(a, b, c, digs);
   }

   if ((err = mp_init_size(&t, digs)) != MP_OKAY) {
@ -30,38 +27,28 @@ mp_err s_mp_mul_digs(const mp_int *a, const mp_int *b, mp_int *c, int digs)
   /* compute the digits of the product directly */
   pa = a->used;
   for (ix = 0; ix < pa; ix++) {
-      /* set the carry to zero */
-      u = 0;
+      int iy, pb;
+      mp_digit u = 0;

      /* limit ourselves to making digs digits of output */
      pb = MP_MIN(b->used, digs - ix);

-      /* setup some aliases */
-      /* copy of the digit from a used within the nested loop */
-      tmpx = a->dp[ix];
-
-      /* an alias for the destination shifted ix places */
-      tmpt = t.dp + ix;
-
-      /* an alias for the digits of b */
-      tmpy = b->dp;
-
      /* compute the columns of the output and propagate the carry */
      for (iy = 0; iy < pb; iy++) {
         /* compute the column as a mp_word */
-         r       = (mp_word)*tmpt +
-                   ((mp_word)tmpx * (mp_word)*tmpy++) +
-                   (mp_word)u;
+         mp_word r = (mp_word)t.dp[ix + iy] +
+                     ((mp_word)a->dp[ix] * (mp_word)b->dp[iy]) +
+                     (mp_word)u;

         /* the new column is the lower part of the result */
-         *tmpt++ = (mp_digit)(r & (mp_word)MP_MASK);
+         t.dp[ix + iy] = (mp_digit)(r & (mp_word)MP_MASK);

         /* get the carry word from the result */
         u       = (mp_digit)(r >> (mp_word)MP_DIGIT_BIT);
      }
      /* set carry if it is placed below digs */
      if ((ix + iy) < digs) {
-         *tmpt = u;
+         t.dp[ix + pb] = u;
      }
   }

--- a/s_mp_mul_balance.c
+++ b/s_mp_mul_balance.c
@ -1,20 +1,16 @@
 #include "tommath_private.h"
-#ifdef S_MP_BALANCE_MUL_C
+#ifdef S_MP_MUL_BALANCE_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */

 /* single-digit multiplication with the smaller number as the single-digit */
-mp_err s_mp_balance_mul(const mp_int *a, const mp_int *b, mp_int *c)
+mp_err s_mp_mul_balance(const mp_int *a, const mp_int *b, mp_int *c)
 {
-   int count, len_a, len_b, nblocks, i, j, bsize;
-   mp_int a0, tmp, A, B, r;
+   mp_int a0, tmp, r;
   mp_err err;
-
-   len_a = a->used;
-   len_b = b->used;
-
-   nblocks = MP_MAX(a->used, b->used) / MP_MIN(a->used, b->used);
-   bsize = MP_MIN(a->used, b->used) ;
+   int i, j,
+       nblocks = MP_MAX(a->used, b->used) / MP_MIN(a->used, b->used),
+       bsize = MP_MIN(a->used, b->used);

   if ((err = mp_init_size(&a0, bsize + 2)) != MP_OKAY) {
      return err;
@ -25,24 +21,19 @@ mp_err s_mp_balance_mul(const mp_int *a, const mp_int *b, mp_int *c)
   }

   /* Make sure that A is the larger one*/
-   if (len_a < len_b) {
-      B = *a;
-      A = *b;
-   } else {
-      A = *a;
-      B = *b;
+   if (a->used < b->used) {
+      MP_EXCH(const mp_int *, a, b);
   }

   for (i = 0, j=0; i < nblocks; i++) {
      /* Cut a slice off of a */
-      a0.used = 0;
-      for (count = 0; count < bsize; count++) {
-         a0.dp[count] = A.dp[ j++ ];
-         a0.used++;
-      }
+      a0.used = bsize;
+      s_mp_copy_digs(a0.dp, a->dp + j, a0.used);
+      j += a0.used;
      mp_clamp(&a0);
+
      /* Multiply with b */
-      if ((err = mp_mul(&a0, &B, &tmp)) != MP_OKAY) {
+      if ((err = mp_mul(&a0, b, &tmp)) != MP_OKAY) {
         goto LBL_ERR;
      }
      /* Shift tmp to the correct position */
@ -55,14 +46,13 @@ mp_err s_mp_balance_mul(const mp_int *a, const mp_int *b, mp_int *c)
      }
   }
   /* The left-overs; there are always left-overs */
-   if (j < A.used) {
-      a0.used = 0;
-      for (count = 0; j < A.used; count++) {
-         a0.dp[count] = A.dp[ j++ ];
-         a0.used++;
-      }
+   if (j < a->used) {
+      a0.used = a->used - j;
+      s_mp_copy_digs(a0.dp, a->dp + j, a0.used);
+      j += a0.used;
      mp_clamp(&a0);
-      if ((err = mp_mul(&a0, &B, &tmp)) != MP_OKAY) {
+
+      if ((err = mp_mul(&a0, b, &tmp)) != MP_OKAY) {
         goto LBL_ERR;
      }
      if ((err = mp_lshd(&tmp, bsize * i)) != MP_OKAY) {
--- a/s_mp_mul_digs_fast.c
+++ b/s_mp_mul_digs_fast.c
@ -1,5 +1,5 @@
 #include "tommath_private.h"
-#ifdef S_MP_MUL_DIGS_FAST_C
+#ifdef S_MP_MUL_COMBA_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */

@ -19,18 +19,16 @@
 * Based on Algorithm 14.12 on pp.595 of HAC.
 *
 */
-mp_err s_mp_mul_digs_fast(const mp_int *a, const mp_int *b, mp_int *c, int digs)
+mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
 {
-   int      olduse, pa, ix, iz;
+   int      oldused, pa, ix;
   mp_err   err;
   mp_digit W[MP_WARRAY];
   mp_word  _W;

   /* grow the destination as required */
-   if (c->alloc < digs) {
-      if ((err = mp_grow(c, digs)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(c, digs)) != MP_OKAY) {
+      return err;
   }

   /* number of output digits to produce */
@ -39,18 +37,12 @@ mp_err s_mp_mul_digs_fast(const mp_int *a, const mp_int *b, mp_int *c, int digs)
   /* clear the carry */
   _W = 0;
   for (ix = 0; ix < pa; ix++) {
-      int      tx, ty;
-      int      iy;
-      mp_digit *tmpx, *tmpy;
+      int tx, ty, iy, iz;

      /* get offsets into the two bignums */
      ty = MP_MIN(b->used-1, ix);
      tx = ix - ty;

-      /* setup temp aliases */
-      tmpx = a->dp + tx;
-      tmpy = b->dp + ty;
-
      /* this is the number of times the loop will iterrate, essentially
         while (tx++ < a->used && ty-- >= 0) { ... }
       */
@ -58,8 +50,7 @@ mp_err s_mp_mul_digs_fast(const mp_int *a, const mp_int *b, mp_int *c, int digs)

      /* execute loop */
      for (iz = 0; iz < iy; ++iz) {
-         _W += (mp_word)*tmpx++ * (mp_word)*tmpy--;
-
+         _W += (mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz];
      }

      /* store term */
@ -70,20 +61,17 @@ mp_err s_mp_mul_digs_fast(const mp_int *a, const mp_int *b, mp_int *c, int digs)
   }

   /* setup dest */
-   olduse  = c->used;
+   oldused  = c->used;
   c->used = pa;

-   {
-      mp_digit *tmpc;
-      tmpc = c->dp;
-      for (ix = 0; ix < pa; ix++) {
-         /* now extract the previous digit [below the carry] */
-         *tmpc++ = W[ix];
-      }
-
-      /* clear unused digits [that existed in the old copy of c] */
-      MP_ZERO_DIGITS(tmpc, olduse - ix);
+   for (ix = 0; ix < pa; ix++) {
+      /* now extract the previous digit [below the carry] */
+      c->dp[ix] = W[ix];
   }
+
+   /* clear unused digits [that existed in the old copy of c] */
+   s_mp_zero_digs(c->dp + c->used, oldused - c->used);
+
   mp_clamp(c);
   return MP_OKAY;
 }
--- a/s_mp_mul_high_digs.c
+++ b/s_mp_mul_high_digs.c
@ -1,25 +1,22 @@
 #include "tommath_private.h"
-#ifdef S_MP_MUL_HIGH_DIGS_C
+#ifdef S_MP_MUL_HIGH_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */

 /* multiplies |a| * |b| and does not compute the lower digs digits
 * [meant to get the higher part of the product]
 */
-mp_err s_mp_mul_high_digs(const mp_int *a, const mp_int *b, mp_int *c, int digs)
+mp_err s_mp_mul_high(const mp_int *a, const mp_int *b, mp_int *c, int digs)
 {
   mp_int   t;
-   int      pa, pb, ix, iy;
+   int      pa, pb, ix;
   mp_err   err;
-   mp_digit u;
-   mp_word  r;
-   mp_digit tmpx, *tmpt, *tmpy;

   /* can we use the fast multiplier? */
-   if (MP_HAS(S_MP_MUL_HIGH_DIGS_FAST)
+   if (MP_HAS(S_MP_MUL_HIGH_COMBA)
       && ((a->used + b->used + 1) < MP_WARRAY)
       && (MP_MIN(a->used, b->used) < MP_MAXFAST)) {
-      return s_mp_mul_high_digs_fast(a, b, c, digs);
+      return s_mp_mul_high_comba(a, b, c, digs);
   }

   if ((err = mp_init_size(&t, a->used + b->used + 1)) != MP_OKAY) {
@ -30,31 +27,22 @@ mp_err s_mp_mul_high_digs(const mp_int *a, const mp_int *b, mp_int *c, int digs)
   pa = a->used;
   pb = b->used;
   for (ix = 0; ix < pa; ix++) {
-      /* clear the carry */
-      u = 0;
-
-      /* left hand side of A[ix] * B[iy] */
-      tmpx = a->dp[ix];
-
-      /* alias to the address of where the digits will be stored */
-      tmpt = &(t.dp[digs]);
-
-      /* alias for where to read the right hand side from */
-      tmpy = b->dp + (digs - ix);
+      int iy;
+      mp_digit u = 0;

      for (iy = digs - ix; iy < pb; iy++) {
         /* calculate the double precision result */
-         r       = (mp_word)*tmpt +
-                   ((mp_word)tmpx * (mp_word)*tmpy++) +
-                   (mp_word)u;
+         mp_word r = (mp_word)t.dp[ix + iy] +
+                     ((mp_word)a->dp[ix] * (mp_word)b->dp[iy]) +
+                     (mp_word)u;

         /* get the lower part */
-         *tmpt++ = (mp_digit)(r & (mp_word)MP_MASK);
+         t.dp[ix + iy] = (mp_digit)(r & (mp_word)MP_MASK);

         /* carry the carry */
         u       = (mp_digit)(r >> (mp_word)MP_DIGIT_BIT);
      }
-      *tmpt = u;
+      t.dp[ix + pb] = u;
   }
   mp_clamp(&t);
   mp_exch(&t, c);
--- a/s_mp_mul_high_digs_fast.c
+++ b/s_mp_mul_high_digs_fast.c
@ -1,10 +1,10 @@
 #include "tommath_private.h"
-#ifdef S_MP_MUL_HIGH_DIGS_FAST_C
+#ifdef S_MP_MUL_HIGH_COMBA_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */

-/* this is a modified version of s_mp_mul_digs_fast that only produces
- * output digits *above* digs.  See the comments for s_mp_mul_digs_fast
+/* this is a modified version of s_mp_mul_comba that only produces
+ * output digits *above* digs.  See the comments for s_mp_mul_comba
 * to see how it works.
 *
 * This is used in the Barrett reduction since for one of the multiplications
@ -12,36 +12,29 @@
 *
 * Based on Algorithm 14.12 on pp.595 of HAC.
 */
-mp_err s_mp_mul_high_digs_fast(const mp_int *a, const mp_int *b, mp_int *c, int digs)
+mp_err s_mp_mul_high_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
 {
-   int     olduse, pa, ix, iz;
+   int     oldused, pa, ix;
   mp_err   err;
   mp_digit W[MP_WARRAY];
   mp_word  _W;

   /* grow the destination as required */
   pa = a->used + b->used;
-   if (c->alloc < pa) {
-      if ((err = mp_grow(c, pa)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(c, pa)) != MP_OKAY) {
+      return err;
   }

   /* number of output digits to produce */
   pa = a->used + b->used;
   _W = 0;
   for (ix = digs; ix < pa; ix++) {
-      int      tx, ty, iy;
-      mp_digit *tmpx, *tmpy;
+      int      tx, ty, iy, iz;

      /* get offsets into the two bignums */
      ty = MP_MIN(b->used-1, ix);
      tx = ix - ty;

-      /* setup temp aliases */
-      tmpx = a->dp + tx;
-      tmpy = b->dp + ty;
-
      /* this is the number of times the loop will iterrate, essentially its
         while (tx++ < a->used && ty-- >= 0) { ... }
       */
@ -49,7 +42,7 @@ mp_err s_mp_mul_high_digs_fast(const mp_int *a, const mp_int *b, mp_int *c, int

      /* execute loop */
      for (iz = 0; iz < iy; iz++) {
-         _W += (mp_word)*tmpx++ * (mp_word)*tmpy--;
+         _W += (mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz];
      }

      /* store term */
@ -60,21 +53,17 @@ mp_err s_mp_mul_high_digs_fast(const mp_int *a, const mp_int *b, mp_int *c, int
   }

   /* setup dest */
-   olduse  = c->used;
+   oldused  = c->used;
   c->used = pa;

-   {
-      mp_digit *tmpc;
-
-      tmpc = c->dp + digs;
-      for (ix = digs; ix < pa; ix++) {
-         /* now extract the previous digit [below the carry] */
-         *tmpc++ = W[ix];
-      }
-
-      /* clear unused digits [that existed in the old copy of c] */
-      MP_ZERO_DIGITS(tmpc, olduse - ix);
+   for (ix = digs; ix < pa; ix++) {
+      /* now extract the previous digit [below the carry] */
+      c->dp[ix] = W[ix];
   }
+
+   /* clear unused digits [that existed in the old copy of c] */
+   s_mp_zero_digs(c->dp + c->used, oldused - c->used);
+
   mp_clamp(c);
   return MP_OKAY;
 }
--- a/s_mp_mul_karatsuba.c
+++ b/s_mp_mul_karatsuba.c
@ -1,5 +1,5 @@
 #include "tommath_private.h"
-#ifdef S_MP_KARATSUBA_MUL_C
+#ifdef S_MP_MUL_KARATSUBA_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */

@ -32,11 +32,11 @@
 * Generally though the overhead of this method doesn't pay off
 * until a certain size (N ~ 80) is reached.
 */
-mp_err s_mp_karatsuba_mul(const mp_int *a, const mp_int *b, mp_int *c)
+mp_err s_mp_mul_karatsuba(const mp_int *a, const mp_int *b, mp_int *c)
 {
   mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
-   int     B;
-   mp_err  err = MP_MEM; /* default the return code to an error */
+   int  B;
+   mp_err  err;

   /* min # of digits */
   B = MP_MIN(a->used, b->used);
@ -45,27 +45,27 @@ mp_err s_mp_karatsuba_mul(const mp_int *a, const mp_int *b, mp_int *c)
   B = B >> 1;

   /* init copy all the temps */
-   if (mp_init_size(&x0, B) != MP_OKAY) {
+   if ((err = mp_init_size(&x0, B)) != MP_OKAY) {
      goto LBL_ERR;
   }
-   if (mp_init_size(&x1, a->used - B) != MP_OKAY) {
+   if ((err = mp_init_size(&x1, a->used - B)) != MP_OKAY) {
      goto X0;
   }
-   if (mp_init_size(&y0, B) != MP_OKAY) {
+   if ((err = mp_init_size(&y0, B)) != MP_OKAY) {
      goto X1;
   }
-   if (mp_init_size(&y1, b->used - B) != MP_OKAY) {
+   if ((err = mp_init_size(&y1, b->used - B)) != MP_OKAY) {
      goto Y0;
   }

   /* init temps */
-   if (mp_init_size(&t1, B * 2) != MP_OKAY) {
+   if ((err = mp_init_size(&t1, B * 2)) != MP_OKAY) {
      goto Y1;
   }
-   if (mp_init_size(&x0y0, B * 2) != MP_OKAY) {
+   if ((err = mp_init_size(&x0y0, B * 2)) != MP_OKAY) {
      goto T1;
   }
-   if (mp_init_size(&x1y1, B * 2) != MP_OKAY) {
+   if ((err = mp_init_size(&x1y1, B * 2)) != MP_OKAY) {
      goto X0Y0;
   }

@ -74,33 +74,13 @@ mp_err s_mp_karatsuba_mul(const mp_int *a, const mp_int *b, mp_int *c)
   x1.used = a->used - B;
   y1.used = b->used - B;

-   {
-      int x;
-      mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
-
-      /* we copy the digits directly instead of using higher level functions
-       * since we also need to shift the digits
-       */
-      tmpa = a->dp;
-      tmpb = b->dp;
-
-      tmpx = x0.dp;
-      tmpy = y0.dp;
-      for (x = 0; x < B; x++) {
-         *tmpx++ = *tmpa++;
-         *tmpy++ = *tmpb++;
-      }
-
-      tmpx = x1.dp;
-      for (x = B; x < a->used; x++) {
-         *tmpx++ = *tmpa++;
-      }
-
-      tmpy = y1.dp;
-      for (x = B; x < b->used; x++) {
-         *tmpy++ = *tmpb++;
-      }
-   }
+   /* we copy the digits directly instead of using higher level functions
+    * since we also need to shift the digits
+    */
+   s_mp_copy_digs(x0.dp, a->dp, x0.used);
+   s_mp_copy_digs(y0.dp, b->dp, y0.used);
+   s_mp_copy_digs(x1.dp, a->dp + B, x1.used);
+   s_mp_copy_digs(y1.dp, b->dp + B, y1.used);

   /* only need to clamp the lower words since by definition the
    * upper words x1/y1 must have a known number of digits
@ -110,50 +90,47 @@ mp_err s_mp_karatsuba_mul(const mp_int *a, const mp_int *b, mp_int *c)

   /* now calc the products x0y0 and x1y1 */
   /* after this x0 is no longer required, free temp [x0==t2]! */
-   if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) {
+   if ((err = mp_mul(&x0, &y0, &x0y0)) != MP_OKAY) {
      goto X1Y1;          /* x0y0 = x0*y0 */
   }
-   if (mp_mul(&x1, &y1, &x1y1) != MP_OKAY) {
+   if ((err = mp_mul(&x1, &y1, &x1y1)) != MP_OKAY) {
      goto X1Y1;          /* x1y1 = x1*y1 */
   }

   /* now calc x1+x0 and y1+y0 */
-   if (s_mp_add(&x1, &x0, &t1) != MP_OKAY) {
+   if ((err = s_mp_add(&x1, &x0, &t1)) != MP_OKAY) {
      goto X1Y1;          /* t1 = x1 - x0 */
   }
-   if (s_mp_add(&y1, &y0, &x0) != MP_OKAY) {
+   if ((err = s_mp_add(&y1, &y0, &x0)) != MP_OKAY) {
      goto X1Y1;          /* t2 = y1 - y0 */
   }
-   if (mp_mul(&t1, &x0, &t1) != MP_OKAY) {
+   if ((err = mp_mul(&t1, &x0, &t1)) != MP_OKAY) {
      goto X1Y1;          /* t1 = (x1 + x0) * (y1 + y0) */
   }

   /* add x0y0 */
-   if (mp_add(&x0y0, &x1y1, &x0) != MP_OKAY) {
+   if ((err = mp_add(&x0y0, &x1y1, &x0)) != MP_OKAY) {
      goto X1Y1;          /* t2 = x0y0 + x1y1 */
   }
-   if (s_mp_sub(&t1, &x0, &t1) != MP_OKAY) {
+   if ((err = s_mp_sub(&t1, &x0, &t1)) != MP_OKAY) {
      goto X1Y1;          /* t1 = (x1+x0)*(y1+y0) - (x1y1 + x0y0) */
   }

   /* shift by B */
-   if (mp_lshd(&t1, B) != MP_OKAY) {
+   if ((err = mp_lshd(&t1, B)) != MP_OKAY) {
      goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
   }
-   if (mp_lshd(&x1y1, B * 2) != MP_OKAY) {
+   if ((err = mp_lshd(&x1y1, B * 2)) != MP_OKAY) {
      goto X1Y1;          /* x1y1 = x1y1 << 2*B */
   }

-   if (mp_add(&x0y0, &t1, &t1) != MP_OKAY) {
+   if ((err = mp_add(&x0y0, &t1, &t1)) != MP_OKAY) {
      goto X1Y1;          /* t1 = x0y0 + t1 */
   }
-   if (mp_add(&t1, &x1y1, c) != MP_OKAY) {
+   if ((err = mp_add(&t1, &x1y1, c)) != MP_OKAY) {
      goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */
   }

-   /* Algorithm succeeded set the return code to MP_OKAY */
-   err = MP_OKAY;
-
 X1Y1:
   mp_clear(&x1y1);
 X0Y0:
--- a/s_mp_mul_toom.c
+++ b/s_mp_mul_toom.c
@ -1,5 +1,5 @@
 #include "tommath_private.h"
-#ifdef S_MP_TOOM_MUL_C
+#ifdef S_MP_MUL_TOOM_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */

@ -29,10 +29,10 @@
     Centro Vito Volterra Universita di Roma Tor Vergata (2006)
 */

-mp_err s_mp_toom_mul(const mp_int *a, const mp_int *b, mp_int *c)
+mp_err s_mp_mul_toom(const mp_int *a, const mp_int *b, mp_int *c)
 {
   mp_int S1, S2, T1, a0, a1, a2, b0, b1, b2;
-   int B, count;
+   int B;
   mp_err err;

   /* init temps */
@ -45,43 +45,30 @@ mp_err s_mp_toom_mul(const mp_int *a, const mp_int *b, mp_int *c)

   /** a = a2 * x^2 + a1 * x + a0; */
   if ((err = mp_init_size(&a0, B)) != MP_OKAY)                   goto LBL_ERRa0;
-
-   for (count = 0; count < B; count++) {
-      a0.dp[count] = a->dp[count];
-      a0.used++;
-   }
-   mp_clamp(&a0);
   if ((err = mp_init_size(&a1, B)) != MP_OKAY)                   goto LBL_ERRa1;
-   for (; count < (2 * B); count++) {
-      a1.dp[count - B] = a->dp[count];
-      a1.used++;
-   }
+   if ((err = mp_init_size(&a2, a->used - 2 * B)) != MP_OKAY)     goto LBL_ERRa2;
+
+   a0.used = a1.used = B;
+   a2.used = a->used - 2 * B;
+   s_mp_copy_digs(a0.dp, a->dp, a0.used);
+   s_mp_copy_digs(a1.dp, a->dp + B, a1.used);
+   s_mp_copy_digs(a2.dp, a->dp + 2 * B, a2.used);
+   mp_clamp(&a0);
   mp_clamp(&a1);
-   if ((err = mp_init_size(&a2, B + (a->used - (3 * B)))) != MP_OKAY) goto LBL_ERRa2;
-   for (; count < a->used; count++) {
-      a2.dp[count - (2 * B)] = a->dp[count];
-      a2.used++;
-   }
   mp_clamp(&a2);

   /** b = b2 * x^2 + b1 * x + b0; */
   if ((err = mp_init_size(&b0, B)) != MP_OKAY)                   goto LBL_ERRb0;
-   for (count = 0; count < B; count++) {
-      b0.dp[count] = b->dp[count];
-      b0.used++;
-   }
-   mp_clamp(&b0);
   if ((err = mp_init_size(&b1, B)) != MP_OKAY)                   goto LBL_ERRb1;
-   for (; count < (2 * B); count++) {
-      b1.dp[count - B] = b->dp[count];
-      b1.used++;
-   }
+   if ((err = mp_init_size(&b2, b->used - 2 * B)) != MP_OKAY)     goto LBL_ERRb2;
+
+   b0.used = b1.used = B;
+   b2.used = b->used - 2 * B;
+   s_mp_copy_digs(b0.dp, b->dp, b0.used);
+   s_mp_copy_digs(b1.dp, b->dp + B, b1.used);
+   s_mp_copy_digs(b2.dp, b->dp + 2 * B, b2.used);
+   mp_clamp(&b0);
   mp_clamp(&b1);
-   if ((err = mp_init_size(&b2, B + (b->used - (3 * B)))) != MP_OKAY) goto LBL_ERRb2;
-   for (; count < b->used; count++) {
-      b2.dp[count - (2 * B)] = b->dp[count];
-      b2.used++;
-   }
   mp_clamp(&b2);

   /** \\ S1 = (a2+a1+a0) * (b2+b1+b0); */
--- a/s_mp_prime_is_divisible.c
+++ b/s_mp_prime_is_divisible.c
@ -10,16 +10,12 @@
 */
 mp_err s_mp_prime_is_divisible(const mp_int *a, bool *result)
 {
-   int      ix;
-   mp_err   err;
-   mp_digit res;
-
-   /* default to not */
-   *result = false;
-
-   for (ix = 0; ix < MP_PRIME_TAB_SIZE; ix++) {
-      /* what is a mod LBL_prime_tab[ix] */
-      if ((err = mp_mod_d(a, s_mp_prime_tab[ix], &res)) != MP_OKAY) {
+   int i;
+   for (i = 0; i < MP_PRIME_TAB_SIZE; i++) {
+      /* what is a mod s_mp_prime_tab[i] */
+      mp_err err;
+      mp_digit res;
+      if ((err = mp_mod_d(a, s_mp_prime_tab[i], &res)) != MP_OKAY) {
         return err;
      }

@ -30,6 +26,8 @@ mp_err s_mp_prime_is_divisible(const mp_int *a, bool *result)
      }
   }

+   /* default: not divisible */
+   *result = false;
   return MP_OKAY;
 }
 #endif
--- a/s_mp_sqr.c
+++ b/s_mp_sqr.c
@ -7,10 +7,8 @@
 mp_err s_mp_sqr(const mp_int *a, mp_int *b)
 {
   mp_int   t;
-   int      ix, iy, pa;
+   int      ix, pa;
   mp_err   err;
-   mp_word  r;
-   mp_digit u, tmpx, *tmpt;

   pa = a->used;
   if ((err = mp_init_size(&t, (2 * pa) + 1)) != MP_OKAY) {
@ -21,10 +19,13 @@ mp_err s_mp_sqr(const mp_int *a, mp_int *b)
   t.used = (2 * pa) + 1;

   for (ix = 0; ix < pa; ix++) {
+      mp_digit u;
+      int iy;
+
      /* first calculate the digit at 2*ix */
      /* calculate double precision result */
-      r = (mp_word)t.dp[2*ix] +
-          ((mp_word)a->dp[ix] * (mp_word)a->dp[ix]);
+      mp_word r = (mp_word)t.dp[2*ix] +
+                  ((mp_word)a->dp[ix] * (mp_word)a->dp[ix]);

      /* store lower part in result */
      t.dp[ix+ix] = (mp_digit)(r & (mp_word)MP_MASK);
@ -32,32 +33,27 @@ mp_err s_mp_sqr(const mp_int *a, mp_int *b)
      /* get the carry */
      u           = (mp_digit)(r >> (mp_word)MP_DIGIT_BIT);

-      /* left hand side of A[ix] * A[iy] */
-      tmpx        = a->dp[ix];
-
-      /* alias for where to store the results */
-      tmpt        = t.dp + ((2 * ix) + 1);
-
      for (iy = ix + 1; iy < pa; iy++) {
         /* first calculate the product */
-         r       = (mp_word)tmpx * (mp_word)a->dp[iy];
+         r       = (mp_word)a->dp[ix] * (mp_word)a->dp[iy];

         /* now calculate the double precision result, note we use
          * addition instead of *2 since it's easier to optimize
          */
-         r       = (mp_word)*tmpt + r + r + (mp_word)u;
+         r       = (mp_word)t.dp[ix + iy] + r + r + (mp_word)u;

         /* store lower part */
-         *tmpt++ = (mp_digit)(r & (mp_word)MP_MASK);
+         t.dp[ix + iy] = (mp_digit)(r & (mp_word)MP_MASK);

         /* get carry */
         u       = (mp_digit)(r >> (mp_word)MP_DIGIT_BIT);
      }
      /* propagate upwards */
      while (u != 0uL) {
-         r       = (mp_word)*tmpt + (mp_word)u;
-         *tmpt++ = (mp_digit)(r & (mp_word)MP_MASK);
+         r       = (mp_word)t.dp[ix + iy] + (mp_word)u;
+         t.dp[ix + iy] = (mp_digit)(r & (mp_word)MP_MASK);
         u       = (mp_digit)(r >> (mp_word)MP_DIGIT_BIT);
+         ++iy;
      }
   }

--- a/s_mp_sqr_comba.c
+++ b/s_mp_sqr_comba.c
@ -1,5 +1,5 @@
 #include "tommath_private.h"
-#ifdef S_MP_SQR_FAST_C
+#ifdef S_MP_SQR_COMBA_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */

@ -13,27 +13,24 @@
 After that loop you do the squares and add them in.
 */

-mp_err s_mp_sqr_fast(const mp_int *a, mp_int *b)
+mp_err s_mp_sqr_comba(const mp_int *a, mp_int *b)
 {
-   int       olduse, pa, ix, iz;
-   mp_digit  W[MP_WARRAY], *tmpx;
+   int       oldused, pa, ix;
+   mp_digit  W[MP_WARRAY];
   mp_word   W1;
-   mp_err    err;
+   mp_err err;

   /* grow the destination as required */
   pa = a->used + a->used;
-   if (b->alloc < pa) {
-      if ((err = mp_grow(b, pa)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(b, pa)) != MP_OKAY) {
+      return err;
   }

   /* number of output digits to produce */
   W1 = 0;
   for (ix = 0; ix < pa; ix++) {
-      int      tx, ty, iy;
+      int      tx, ty, iy, iz;
      mp_word  _W;
-      mp_digit *tmpy;

      /* clear counter */
      _W = 0;
@ -42,10 +39,6 @@ mp_err s_mp_sqr_fast(const mp_int *a, mp_int *b)
      ty = MP_MIN(a->used-1, ix);
      tx = ix - ty;

-      /* setup temp aliases */
-      tmpx = a->dp + tx;
-      tmpy = a->dp + ty;
-
      /* this is the number of times the loop will iterrate, essentially
         while (tx++ < a->used && ty-- >= 0) { ... }
       */
@ -59,7 +52,7 @@ mp_err s_mp_sqr_fast(const mp_int *a, mp_int *b)

      /* execute loop */
      for (iz = 0; iz < iy; iz++) {
-         _W += (mp_word)*tmpx++ * (mp_word)*tmpy--;
+         _W += (mp_word)a->dp[tx + iz] * (mp_word)a->dp[ty - iz];
      }

      /* double the inner product and add carry */
@ -78,19 +71,16 @@ mp_err s_mp_sqr_fast(const mp_int *a, mp_int *b)
   }

   /* setup dest */
-   olduse  = b->used;
+   oldused  = b->used;
   b->used = a->used+a->used;

-   {
-      mp_digit *tmpb;
-      tmpb = b->dp;
-      for (ix = 0; ix < pa; ix++) {
-         *tmpb++ = W[ix] & MP_MASK;
-      }
-
-      /* clear unused digits [that existed in the old copy of c] */
-      MP_ZERO_DIGITS(tmpb, olduse - ix);
+   for (ix = 0; ix < pa; ix++) {
+      b->dp[ix] = W[ix] & MP_MASK;
   }
+
+   /* clear unused digits [that existed in the old copy of b] */
+   s_mp_zero_digs(b->dp + b->used, oldused - b->used);
+
   mp_clamp(b);
   return MP_OKAY;
 }
--- a/s_mp_sqr_karatsuba.c
+++ b/s_mp_sqr_karatsuba.c
@ -1,20 +1,20 @@
 #include "tommath_private.h"
-#ifdef S_MP_KARATSUBA_SQR_C
+#ifdef S_MP_SQR_KARATSUBA_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */

 /* Karatsuba squaring, computes b = a*a using three
 * half size squarings
 *
- * See comments of karatsuba_mul for details.  It
+ * See comments of mul_karatsuba for details.  It
 * is essentially the same algorithm but merely
 * tuned to perform recursive squarings.
 */
-mp_err s_mp_karatsuba_sqr(const mp_int *a, mp_int *b)
+mp_err s_mp_sqr_karatsuba(const mp_int *a, mp_int *b)
 {
   mp_int  x0, x1, t1, t2, x0x0, x1x1;
-   int     B;
-   mp_err  err = MP_MEM;
+   int B;
+   mp_err  err;

   /* min # of digits */
   B = a->used;
@ -23,75 +23,57 @@ mp_err s_mp_karatsuba_sqr(const mp_int *a, mp_int *b)
   B = B >> 1;

   /* init copy all the temps */
-   if (mp_init_size(&x0, B) != MP_OKAY)
+   if ((err = mp_init_size(&x0, B)) != MP_OKAY)
      goto LBL_ERR;
-   if (mp_init_size(&x1, a->used - B) != MP_OKAY)
+   if ((err = mp_init_size(&x1, a->used - B)) != MP_OKAY)
      goto X0;

   /* init temps */
-   if (mp_init_size(&t1, a->used * 2) != MP_OKAY)
+   if ((err = mp_init_size(&t1, a->used * 2)) != MP_OKAY)
      goto X1;
-   if (mp_init_size(&t2, a->used * 2) != MP_OKAY)
+   if ((err = mp_init_size(&t2, a->used * 2)) != MP_OKAY)
      goto T1;
-   if (mp_init_size(&x0x0, B * 2) != MP_OKAY)
+   if ((err = mp_init_size(&x0x0, B * 2)) != MP_OKAY)
      goto T2;
-   if (mp_init_size(&x1x1, (a->used - B) * 2) != MP_OKAY)
+   if ((err = mp_init_size(&x1x1, (a->used - B) * 2)) != MP_OKAY)
      goto X0X0;

-   {
-      int x;
-      mp_digit *dst, *src;
-
-      src = a->dp;
-
-      /* now shift the digits */
-      dst = x0.dp;
-      for (x = 0; x < B; x++) {
-         *dst++ = *src++;
-      }
-
-      dst = x1.dp;
-      for (x = B; x < a->used; x++) {
-         *dst++ = *src++;
-      }
-   }
-
+   /* copy the low B digits of a into x0 and the remaining digits into x1 */
   x0.used = B;
   x1.used = a->used - B;
-
+   s_mp_copy_digs(x0.dp, a->dp, x0.used);
+   s_mp_copy_digs(x1.dp, a->dp + B, x1.used);
   mp_clamp(&x0);

   /* now calc the products x0*x0 and x1*x1 */
-   if (mp_sqr(&x0, &x0x0) != MP_OKAY)
+   if ((err = mp_sqr(&x0, &x0x0)) != MP_OKAY)
      goto X1X1;           /* x0x0 = x0*x0 */
-   if (mp_sqr(&x1, &x1x1) != MP_OKAY)
+   if ((err = mp_sqr(&x1, &x1x1)) != MP_OKAY)
      goto X1X1;           /* x1x1 = x1*x1 */

   /* now calc (x1+x0)**2 */
-   if (s_mp_add(&x1, &x0, &t1) != MP_OKAY)
+   if ((err = s_mp_add(&x1, &x0, &t1)) != MP_OKAY)
      goto X1X1;           /* t1 = x1 - x0 */
-   if (mp_sqr(&t1, &t1) != MP_OKAY)
+   if ((err = mp_sqr(&t1, &t1)) != MP_OKAY)
      goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */

   /* add x0y0 */
-   if (s_mp_add(&x0x0, &x1x1, &t2) != MP_OKAY)
+   if ((err = s_mp_add(&x0x0, &x1x1, &t2)) != MP_OKAY)
      goto X1X1;           /* t2 = x0x0 + x1x1 */
-   if (s_mp_sub(&t1, &t2, &t1) != MP_OKAY)
+   if ((err = s_mp_sub(&t1, &t2, &t1)) != MP_OKAY)
      goto X1X1;           /* t1 = (x1+x0)**2 - (x0x0 + x1x1) */

   /* shift by B */
-   if (mp_lshd(&t1, B) != MP_OKAY)
+   if ((err = mp_lshd(&t1, B)) != MP_OKAY)
      goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
-   if (mp_lshd(&x1x1, B * 2) != MP_OKAY)
+   if ((err = mp_lshd(&x1x1, B * 2)) != MP_OKAY)
      goto X1X1;           /* x1x1 = x1x1 << 2*B */

-   if (mp_add(&x0x0, &t1, &t1) != MP_OKAY)
+   if ((err = mp_add(&x0x0, &t1, &t1)) != MP_OKAY)
      goto X1X1;           /* t1 = x0x0 + t1 */
-   if (mp_add(&t1, &x1x1, b) != MP_OKAY)
+   if ((err = mp_add(&t1, &x1x1, b)) != MP_OKAY)
      goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */

-   err = MP_OKAY;
-
 X1X1:
   mp_clear(&x1x1);
 X0X0:
--- a/s_mp_sqr_toom.c
+++ b/s_mp_sqr_toom.c
@ -1,5 +1,5 @@
 #include "tommath_private.h"
-#ifdef S_MP_TOOM_SQR_C
+#ifdef S_MP_SQR_TOOM_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */

@ -18,14 +18,12 @@
     18th IEEE Symposium on Computer Arithmetic (ARITH'07). IEEE, 2007.

 */
-mp_err s_mp_toom_sqr(const mp_int *a, mp_int *b)
+mp_err s_mp_sqr_toom(const mp_int *a, mp_int *b)
 {
   mp_int S0, a0, a1, a2;
-   mp_digit *tmpa, *tmpc;
-   int B, count;
+   int B;
   mp_err err;

-
   /* init temps */
   if ((err = mp_init(&S0)) != MP_OKAY) {
      return err;
@ -36,26 +34,14 @@ mp_err s_mp_toom_sqr(const mp_int *a, mp_int *b)

   /** a = a2 * x^2 + a1 * x + a0; */
   if ((err = mp_init_size(&a0, B)) != MP_OKAY)                   goto LBL_ERRa0;
-
-   a0.used = B;
   if ((err = mp_init_size(&a1, B)) != MP_OKAY)                   goto LBL_ERRa1;
-   a1.used = B;
-   if ((err = mp_init_size(&a2, B + (a->used - (3 * B)))) != MP_OKAY) goto LBL_ERRa2;
+   if ((err = mp_init_size(&a2, a->used - (2 * B))) != MP_OKAY)   goto LBL_ERRa2;

-   tmpa = a->dp;
-   tmpc = a0.dp;
-   for (count = 0; count < B; count++) {
-      *tmpc++ = *tmpa++;
-   }
-   tmpc = a1.dp;
-   for (; count < (2 * B); count++) {
-      *tmpc++ = *tmpa++;
-   }
-   tmpc = a2.dp;
-   for (; count < a->used; count++) {
-      *tmpc++ = *tmpa++;
-      a2.used++;
-   }
+   a0.used = a1.used = B;
+   a2.used = a->used - 2 * B;
+   s_mp_copy_digs(a0.dp, a->dp, a0.used);
+   s_mp_copy_digs(a1.dp, a->dp + B, a1.used);
+   s_mp_copy_digs(a2.dp, a->dp + 2 * B, a2.used);
   mp_clamp(&a0);
   mp_clamp(&a1);
   mp_clamp(&a2);
--- a/s_mp_sub.c
+++ b/s_mp_sub.c
@ -6,64 +6,49 @@
 /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
 mp_err s_mp_sub(const mp_int *a, const mp_int *b, mp_int *c)
 {
-   int    olduse, min, max;
+   int oldused = c->used, min = b->used, max = a->used, i;
+   mp_digit u;
   mp_err err;

-   /* find sizes */
-   min = b->used;
-   max = a->used;
-
   /* init result */
-   if (c->alloc < max) {
-      if ((err = mp_grow(c, max)) != MP_OKAY) {
-         return err;
-      }
+   if ((err = mp_grow(c, max)) != MP_OKAY) {
+      return err;
   }
-   olduse = c->used;
+
   c->used = max;

-   {
-      mp_digit u, *tmpa, *tmpb, *tmpc;
-      int i;
+   /* set carry to zero */
+   u = 0;
+   for (i = 0; i < min; i++) {
+      /* T[i] = A[i] - B[i] - U */
+      c->dp[i] = (a->dp[i] - b->dp[i]) - u;

-      /* alias for digit pointers */
-      tmpa = a->dp;
-      tmpb = b->dp;
-      tmpc = c->dp;
+      /* U = carry bit of T[i]
+       * Note this saves performing an AND operation since
+       * if a carry does occur it will propagate all the way to the
+       * MSB.  As a result a single shift is enough to get the carry
+       */
+      u = c->dp[i] >> (MP_SIZEOF_BITS(mp_digit) - 1u);

-      /* set carry to zero */
-      u = 0;
-      for (i = 0; i < min; i++) {
-         /* T[i] = A[i] - B[i] - U */
-         *tmpc = (*tmpa++ - *tmpb++) - u;
-
-         /* U = carry bit of T[i]
-          * Note this saves performing an AND operation since
-          * if a carry does occur it will propagate all the way to the
-          * MSB.  As a result a single shift is enough to get the carry
-          */
-         u = *tmpc >> (MP_SIZEOF_BITS(mp_digit) - 1u);
-
-         /* Clear carry from T[i] */
-         *tmpc++ &= MP_MASK;
-      }
-
-      /* now copy higher words if any, e.g. if A has more digits than B  */
-      for (; i < max; i++) {
-         /* T[i] = A[i] - U */
-         *tmpc = *tmpa++ - u;
-
-         /* U = carry bit of T[i] */
-         u = *tmpc >> (MP_SIZEOF_BITS(mp_digit) - 1u);
-
-         /* Clear carry from T[i] */
-         *tmpc++ &= MP_MASK;
-      }
-
-      /* clear digits above used (since we may not have grown result above) */
-      MP_ZERO_DIGITS(tmpc, olduse - c->used);
+      /* Clear carry from T[i] */
+      c->dp[i] &= MP_MASK;
   }

+   /* now copy higher words if any, e.g. if A has more digits than B  */
+   for (; i < max; i++) {
+      /* T[i] = A[i] - U */
+      c->dp[i] = a->dp[i] - u;
+
+      /* U = carry bit of T[i] */
+      u = c->dp[i] >> (MP_SIZEOF_BITS(mp_digit) - 1u);
+
+      /* Clear carry from T[i] */
+      c->dp[i] &= MP_MASK;
+   }
+
+   /* clear digits above used (since we may not have grown result above) */
+   s_mp_zero_digs(c->dp + c->used, oldused - c->used);
+
   mp_clamp(c);
   return MP_OKAY;
 }
--- a/s_mp_zero_buf.c
+++ b/s_mp_zero_buf.c
@ -0,0 +1,22 @@
+#include "tommath_private.h"
+#ifdef S_MP_ZERO_BUF_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+#ifdef MP_USE_MEMOPS
+#  include <string.h>
+#endif
+/* Overwrite the `size` bytes starting at `mem` with zero (memset if MP_USE_MEMOPS is defined, else a byte loop). */
+void s_mp_zero_buf(void *mem, size_t size)
+{
+#ifdef MP_USE_MEMOPS
+   memset(mem, 0, size);       /* NOTE(review): no size/NULL guard here, unlike s_mp_zero_digs; presumably callers pass a valid buffer - confirm */
+#else
+   char *m = (char *)mem;      /* byte-wise view of the buffer */
+   while (size-- > 0u) {       /* post-decrement test: the body runs exactly `size` times */
+      *m++ = '\0';
+   }
+#endif
+}
+
+#endif
--- a/s_mp_zero_digs.c
+++ b/s_mp_zero_digs.c
@ -0,0 +1,23 @@
+#include "tommath_private.h"
+#ifdef S_MP_ZERO_DIGS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+#ifdef MP_USE_MEMOPS
+#  include <string.h>
+#endif
+/* Set `digits` mp_digit entries starting at `d` to zero; a zero or negative `digits` writes nothing in either variant. */
+void s_mp_zero_digs(mp_digit *d, int digits)
+{
+#ifdef MP_USE_MEMOPS
+   if (digits > 0) {           /* callers pass differences such as `oldused - used`; keep negative counts away from the size_t cast */
+      memset(d, 0, (size_t)digits * sizeof(mp_digit));
+   }
+#else
+   while (digits-- > 0) {      /* the signed comparison makes a non-positive count skip the loop */
+      *d++ = 0;
+   }
+#endif
+}
+
+#endif
--- a/tommath.def
+++ b/tommath.def
@ -72,7 +72,6 @@ EXPORTS
    mp_lshd
    mp_mod
    mp_mod_2d
-    mp_mod_d
    mp_montgomery_calc_normalization
    mp_montgomery_reduce
    mp_montgomery_setup
--- a/tommath.h
+++ b/tommath.h
@ -63,7 +63,7 @@ typedef uint32_t             mp_digit;
 #   ifdef MP_31BIT
 /*
 * This is an extension that uses 31-bit digits.
- * Please be aware that not all functions support this size, especially s_mp_mul_digs_fast
+ * Please be aware that not all functions support this size, especially s_mp_mul_comba
 * will be reduced to work on small numbers only:
 * Up to 8 limbs, 248 bits instead of up to 512 limbs, 15872 bits with MP_28BIT.
 */
@ -117,10 +117,10 @@ typedef enum {
 /* tunable cutoffs */
 #ifndef MP_FIXED_CUTOFFS
 extern int
-MP_KARATSUBA_MUL_CUTOFF,
-MP_KARATSUBA_SQR_CUTOFF,
-MP_TOOM_MUL_CUTOFF,
-MP_TOOM_SQR_CUTOFF;
+MP_MUL_KARATSUBA_CUTOFF,
+MP_SQR_KARATSUBA_CUTOFF,
+MP_MUL_TOOM_CUTOFF,
+MP_SQR_TOOM_CUTOFF;
 #endif

 /* define this to use lower memory usage routines (exptmods mostly) */
@ -398,7 +398,7 @@ mp_err mp_mul_d(const mp_int *a, mp_digit b, mp_int *c) MP_WUR;
 mp_err mp_div_d(const mp_int *a, mp_digit b, mp_int *c, mp_digit *d) MP_WUR;

 /* c = a mod b, 0 <= c < b  */
-mp_err mp_mod_d(const mp_int *a, mp_digit b, mp_digit *c) MP_WUR;
+#define mp_mod_d(a, b, c) mp_div_d((a), (b), NULL, (c))

 /* ---> number theory <--- */

--- a/tommath_class.h
+++ b/tommath_class.h
@ -77,7 +77,6 @@
 #   define MP_LSHD_C
 #   define MP_MOD_C
 #   define MP_MOD_2D_C
-#   define MP_MOD_D_C
 #   define MP_MONTGOMERY_CALC_NORMALIZATION_C
 #   define MP_MONTGOMERY_REDUCE_C
 #   define MP_MONTGOMERY_SETUP_C
@ -141,33 +140,36 @@
 #   define MP_XOR_C
 #   define MP_ZERO_C
 #   define S_MP_ADD_C
-#   define S_MP_BALANCE_MUL_C
+#   define S_MP_COPY_DIGS_C
 #   define S_MP_DIV_RECURSIVE_C
 #   define S_MP_DIV_SCHOOL_C
 #   define S_MP_DIV_SMALL_C
 #   define S_MP_EXPTMOD_C
 #   define S_MP_EXPTMOD_FAST_C
 #   define S_MP_GET_BIT_C
-#   define S_MP_INVMOD_FAST_C
-#   define S_MP_INVMOD_SLOW_C
-#   define S_MP_KARATSUBA_MUL_C
-#   define S_MP_KARATSUBA_SQR_C
+#   define S_MP_INVMOD_C
+#   define S_MP_INVMOD_ODD_C
 #   define S_MP_LOG_C
 #   define S_MP_LOG_D_C
 #   define S_MP_LOG_POW2_C
-#   define S_MP_MONTGOMERY_REDUCE_FAST_C
-#   define S_MP_MUL_DIGS_C
-#   define S_MP_MUL_DIGS_FAST_C
-#   define S_MP_MUL_HIGH_DIGS_C
-#   define S_MP_MUL_HIGH_DIGS_FAST_C
+#   define S_MP_MONTGOMERY_REDUCE_COMBA_C
+#   define S_MP_MUL_C
+#   define S_MP_MUL_BALANCE_C
+#   define S_MP_MUL_COMBA_C
+#   define S_MP_MUL_HIGH_C
+#   define S_MP_MUL_HIGH_COMBA_C
+#   define S_MP_MUL_KARATSUBA_C
+#   define S_MP_MUL_TOOM_C
 #   define S_MP_PRIME_IS_DIVISIBLE_C
 #   define S_MP_RAND_JENKINS_C
 #   define S_MP_RAND_PLATFORM_C
 #   define S_MP_SQR_C
-#   define S_MP_SQR_FAST_C
+#   define S_MP_SQR_COMBA_C
+#   define S_MP_SQR_KARATSUBA_C
+#   define S_MP_SQR_TOOM_C
 #   define S_MP_SUB_C
-#   define S_MP_TOOM_MUL_C
-#   define S_MP_TOOM_SQR_C
+#   define S_MP_ZERO_BUF_C
+#   define S_MP_ZERO_DIGS_C
 #endif
 #endif
 #if defined(MP_2EXPT_C)
@ -189,6 +191,7 @@
 #   define MP_CLAMP_C
 #   define MP_GROW_C
 #   define MP_SUB_D_C
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_ADDMOD_C)
@ -207,6 +210,7 @@
 #endif

 #if defined(MP_CLEAR_C)
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_CLEAR_MULTI_C)
@ -227,12 +231,13 @@
 #endif

 #if defined(MP_COMPLEMENT_C)
-#   define MP_NEG_C
 #   define MP_SUB_D_C
 #endif

 #if defined(MP_COPY_C)
 #   define MP_GROW_C
+#   define S_MP_COPY_DIGS_C
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_COUNT_BITS_C)
@ -253,6 +258,7 @@
 #if defined(MP_DIV_2_C)
 #   define MP_CLAMP_C
 #   define MP_GROW_C
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_DIV_2D_C)
@ -260,7 +266,6 @@
 #   define MP_COPY_C
 #   define MP_MOD_2D_C
 #   define MP_RSHD_C
-#   define MP_ZERO_C
 #endif

 #if defined(MP_DIV_3_C)
@ -288,6 +293,7 @@
 #   define MP_CMP_MAG_C
 #   define MP_GROW_C
 #   define S_MP_SUB_C
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_DR_SETUP_C)
@ -351,6 +357,7 @@
 #if defined(MP_FWRITE_C)
 #   define MP_RADIX_SIZE_C
 #   define MP_TO_RADIX_C
+#   define S_MP_ZERO_BUF_C
 #endif

 #if defined(MP_GCD_C)
@ -397,6 +404,7 @@
 #endif

 #if defined(MP_GROW_C)
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_INIT_C)
@ -463,17 +471,17 @@

 #if defined(MP_INVMOD_C)
 #   define MP_CMP_D_C
-#   define S_MP_INVMOD_FAST_C
-#   define S_MP_INVMOD_SLOW_C
+#   define S_MP_INVMOD_C
+#   define S_MP_INVMOD_ODD_C
 #endif

 #if defined(MP_IS_SQUARE_C)
 #   define MP_CLEAR_C
 #   define MP_CMP_MAG_C
+#   define MP_DIV_D_C
 #   define MP_GET_I32_C
 #   define MP_INIT_U32_C
 #   define MP_MOD_C
-#   define MP_MOD_D_C
 #   define MP_SQRT_C
 #   define MP_SQR_C
 #endif
@ -506,6 +514,7 @@

 #if defined(MP_LSHD_C)
 #   define MP_GROW_C
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_MOD_C)
@ -520,10 +529,7 @@
 #   define MP_CLAMP_C
 #   define MP_COPY_C
 #   define MP_ZERO_C
-#endif
-
-#if defined(MP_MOD_D_C)
-#   define MP_DIV_D_C
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_MONTGOMERY_CALC_NORMALIZATION_C)
@ -540,7 +546,7 @@
 #   define MP_CMP_MAG_C
 #   define MP_GROW_C
 #   define MP_RSHD_C
-#   define S_MP_MONTGOMERY_REDUCE_FAST_C
+#   define S_MP_MONTGOMERY_REDUCE_COMBA_C
 #   define S_MP_SUB_C
 #endif

@ -548,15 +554,16 @@
 #endif

 #if defined(MP_MUL_C)
-#   define S_MP_BALANCE_MUL_C
-#   define S_MP_KARATSUBA_MUL_C
-#   define S_MP_MUL_DIGS_C
-#   define S_MP_MUL_DIGS_FAST_C
-#   define S_MP_TOOM_MUL_C
+#   define S_MP_MUL_BALANCE_C
+#   define S_MP_MUL_C
+#   define S_MP_MUL_COMBA_C
+#   define S_MP_MUL_KARATSUBA_C
+#   define S_MP_MUL_TOOM_C
 #endif

 #if defined(MP_MUL_2_C)
 #   define MP_GROW_C
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_MUL_2D_C)
@ -569,6 +576,7 @@
 #if defined(MP_MUL_D_C)
 #   define MP_CLAMP_C
 #   define MP_GROW_C
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_MULMOD_C)
@ -660,8 +668,8 @@
 #   define MP_ADD_D_C
 #   define MP_CLEAR_C
 #   define MP_CMP_D_C
+#   define MP_DIV_D_C
 #   define MP_INIT_C
-#   define MP_MOD_D_C
 #   define MP_PRIME_IS_PRIME_C
 #   define MP_SET_C
 #   define MP_SUB_D_C
@ -678,6 +686,7 @@
 #   define MP_PRIME_IS_PRIME_C
 #   define MP_SUB_D_C
 #   define S_MP_RAND_SOURCE_C
+#   define S_MP_ZERO_BUF_C
 #endif

 #if defined(MP_PRIME_STRONG_LUCAS_SELFRIDGE_C)
@ -744,9 +753,9 @@
 #   define MP_RSHD_C
 #   define MP_SET_C
 #   define MP_SUB_C
-#   define S_MP_MUL_DIGS_C
-#   define S_MP_MUL_HIGH_DIGS_C
-#   define S_MP_MUL_HIGH_DIGS_FAST_C
+#   define S_MP_MUL_C
+#   define S_MP_MUL_HIGH_C
+#   define S_MP_MUL_HIGH_COMBA_C
 #   define S_MP_SUB_C
 #endif

@ -820,6 +829,7 @@

 #if defined(MP_RSHD_C)
 #   define MP_ZERO_C
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_SBIN_SIZE_C)
@ -827,6 +837,7 @@
 #endif

 #if defined(MP_SET_C)
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_SET_DOUBLE_C)
@ -852,15 +863,19 @@
 #endif

 #if defined(MP_SET_U32_C)
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_SET_U64_C)
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_SET_UL_C)
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_SET_ULL_C)
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_SHRINK_C)
@ -873,10 +888,10 @@
 #endif

 #if defined(MP_SQR_C)
-#   define S_MP_KARATSUBA_SQR_C
 #   define S_MP_SQR_C
-#   define S_MP_SQR_FAST_C
-#   define S_MP_TOOM_SQR_C
+#   define S_MP_SQR_COMBA_C
+#   define S_MP_SQR_KARATSUBA_C
+#   define S_MP_SQR_TOOM_C
 #endif

 #if defined(MP_SQRMOD_C)
@ -905,10 +920,10 @@
 #   define MP_CMP_D_C
 #   define MP_COPY_C
 #   define MP_DIV_2_C
+#   define MP_DIV_D_C
 #   define MP_EXPTMOD_C
 #   define MP_INIT_MULTI_C
 #   define MP_KRONECKER_C
-#   define MP_MOD_D_C
 #   define MP_MULMOD_C
 #   define MP_SET_C
 #   define MP_SQRMOD_C
@ -926,6 +941,7 @@
 #   define MP_ADD_D_C
 #   define MP_CLAMP_C
 #   define MP_GROW_C
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(MP_SUBMOD_C)
@ -969,23 +985,16 @@
 #endif

 #if defined(MP_ZERO_C)
+#   define S_MP_ZERO_DIGS_C
 #endif

 #if defined(S_MP_ADD_C)
 #   define MP_CLAMP_C
 #   define MP_GROW_C
+#   define S_MP_ZERO_DIGS_C
 #endif

-#if defined(S_MP_BALANCE_MUL_C)
-#   define MP_ADD_C
-#   define MP_CLAMP_C
-#   define MP_CLEAR_C
-#   define MP_CLEAR_MULTI_C
-#   define MP_EXCH_C
-#   define MP_INIT_MULTI_C
-#   define MP_INIT_SIZE_C
-#   define MP_LSHD_C
-#   define MP_MUL_C
+#if defined(S_MP_COPY_DIGS_C)
 #endif

 #if defined(S_MP_DIV_RECURSIVE_C)
@ -1075,13 +1084,13 @@
 #   define MP_REDUCE_2K_SETUP_C
 #   define MP_SET_C
 #   define MP_SQR_C
-#   define S_MP_MONTGOMERY_REDUCE_FAST_C
+#   define S_MP_MONTGOMERY_REDUCE_COMBA_C
 #endif

 #if defined(S_MP_GET_BIT_C)
 #endif

-#if defined(S_MP_INVMOD_FAST_C)
+#if defined(S_MP_INVMOD_C)
 #   define MP_ADD_C
 #   define MP_CLEAR_MULTI_C
 #   define MP_CMP_C
@ -1096,7 +1105,7 @@
 #   define MP_SUB_C
 #endif

-#if defined(S_MP_INVMOD_SLOW_C)
+#if defined(S_MP_INVMOD_ODD_C)
 #   define MP_ADD_C
 #   define MP_CLEAR_MULTI_C
 #   define MP_CMP_C
@ -1111,28 +1120,6 @@
 #   define MP_SUB_C
 #endif

-#if defined(S_MP_KARATSUBA_MUL_C)
-#   define MP_ADD_C
-#   define MP_CLAMP_C
-#   define MP_CLEAR_C
-#   define MP_INIT_SIZE_C
-#   define MP_LSHD_C
-#   define MP_MUL_C
-#   define S_MP_ADD_C
-#   define S_MP_SUB_C
-#endif
-
-#if defined(S_MP_KARATSUBA_SQR_C)
-#   define MP_ADD_C
-#   define MP_CLAMP_C
-#   define MP_CLEAR_C
-#   define MP_INIT_SIZE_C
-#   define MP_LSHD_C
-#   define MP_SQR_C
-#   define S_MP_ADD_C
-#   define S_MP_SUB_C
-#endif
-
 #if defined(S_MP_LOG_C)
 #   define MP_CLEAR_MULTI_C
 #   define MP_CMP_C
@ -1153,41 +1140,86 @@
 #   define MP_COUNT_BITS_C
 #endif

-#if defined(S_MP_MONTGOMERY_REDUCE_FAST_C)
+#if defined(S_MP_MONTGOMERY_REDUCE_COMBA_C)
 #   define MP_CLAMP_C
 #   define MP_CMP_MAG_C
 #   define MP_GROW_C
 #   define S_MP_SUB_C
+#   define S_MP_ZERO_BUF_C
+#   define S_MP_ZERO_DIGS_C
 #endif

-#if defined(S_MP_MUL_DIGS_C)
+#if defined(S_MP_MUL_C)
 #   define MP_CLAMP_C
 #   define MP_CLEAR_C
 #   define MP_EXCH_C
 #   define MP_INIT_SIZE_C
-#   define S_MP_MUL_DIGS_FAST_C
+#   define S_MP_MUL_COMBA_C
 #endif

-#if defined(S_MP_MUL_DIGS_FAST_C)
+#if defined(S_MP_MUL_BALANCE_C)
+#   define MP_ADD_C
+#   define MP_CLAMP_C
+#   define MP_CLEAR_C
+#   define MP_CLEAR_MULTI_C
+#   define MP_EXCH_C
+#   define MP_INIT_MULTI_C
+#   define MP_INIT_SIZE_C
+#   define MP_LSHD_C
+#   define MP_MUL_C
+#   define S_MP_COPY_DIGS_C
+#endif
+
+#if defined(S_MP_MUL_COMBA_C)
 #   define MP_CLAMP_C
 #   define MP_GROW_C
+#   define S_MP_ZERO_DIGS_C
 #endif

-#if defined(S_MP_MUL_HIGH_DIGS_C)
+#if defined(S_MP_MUL_HIGH_C)
 #   define MP_CLAMP_C
 #   define MP_CLEAR_C
 #   define MP_EXCH_C
 #   define MP_INIT_SIZE_C
-#   define S_MP_MUL_HIGH_DIGS_FAST_C
+#   define S_MP_MUL_HIGH_COMBA_C
 #endif

-#if defined(S_MP_MUL_HIGH_DIGS_FAST_C)
+#if defined(S_MP_MUL_HIGH_COMBA_C)
 #   define MP_CLAMP_C
 #   define MP_GROW_C
+#   define S_MP_ZERO_DIGS_C
+#endif
+
+#if defined(S_MP_MUL_KARATSUBA_C)
+#   define MP_ADD_C
+#   define MP_CLAMP_C
+#   define MP_CLEAR_C
+#   define MP_INIT_SIZE_C
+#   define MP_LSHD_C
+#   define MP_MUL_C
+#   define S_MP_ADD_C
+#   define S_MP_COPY_DIGS_C
+#   define S_MP_SUB_C
+#endif
+
+#if defined(S_MP_MUL_TOOM_C)
+#   define MP_ADD_C
+#   define MP_CLAMP_C
+#   define MP_CLEAR_C
+#   define MP_CLEAR_MULTI_C
+#   define MP_DIV_2_C
+#   define MP_DIV_3_C
+#   define MP_INIT_MULTI_C
+#   define MP_INIT_SIZE_C
+#   define MP_LSHD_C
+#   define MP_MUL_2_C
+#   define MP_MUL_C
+#   define MP_SUB_C
+#   define S_MP_COPY_DIGS_C
 #endif

 #if defined(S_MP_PRIME_IS_DIVISIBLE_C)
-#   define MP_MOD_D_C
+#   define MP_DIV_D_C
 #endif

 #if defined(S_MP_RAND_JENKINS_C)
@ -1204,32 +1236,25 @@
 #   define MP_INIT_SIZE_C
 #endif

-#if defined(S_MP_SQR_FAST_C)
+#if defined(S_MP_SQR_COMBA_C)
 #   define MP_CLAMP_C
 #   define MP_GROW_C
+#   define S_MP_ZERO_DIGS_C
 #endif

-#if defined(S_MP_SUB_C)
-#   define MP_CLAMP_C
-#   define MP_GROW_C
-#endif
-
-#if defined(S_MP_TOOM_MUL_C)
+#if defined(S_MP_SQR_KARATSUBA_C)
 #   define MP_ADD_C
 #   define MP_CLAMP_C
 #   define MP_CLEAR_C
-#   define MP_CLEAR_MULTI_C
-#   define MP_DIV_2_C
-#   define MP_DIV_3_C
-#   define MP_INIT_MULTI_C
 #   define MP_INIT_SIZE_C
 #   define MP_LSHD_C
-#   define MP_MUL_2_C
-#   define MP_MUL_C
-#   define MP_SUB_C
+#   define MP_SQR_C
+#   define S_MP_ADD_C
+#   define S_MP_COPY_DIGS_C
+#   define S_MP_SUB_C
 #endif

-#if defined(S_MP_TOOM_SQR_C)
+#if defined(S_MP_SQR_TOOM_C)
 #   define MP_ADD_C
 #   define MP_CLAMP_C
 #   define MP_CLEAR_C
@ -1241,6 +1266,19 @@
 #   define MP_MUL_C
 #   define MP_SQR_C
 #   define MP_SUB_C
+#   define S_MP_COPY_DIGS_C
+#endif
+
+#if defined(S_MP_SUB_C)
+#   define MP_CLAMP_C
+#   define MP_GROW_C
+#   define S_MP_ZERO_DIGS_C
+#endif
+
+#if defined(S_MP_ZERO_BUF_C)
+#endif
+
+#if defined(S_MP_ZERO_DIGS_C)
 #endif

 #ifdef LTM_INSIDE
--- a/tommath_cutoffs.h
+++ b/tommath_cutoffs.h
@ -7,7 +7,7 @@
   on the aforementioned machine for example.
 */

-#define MP_DEFAULT_KARATSUBA_MUL_CUTOFF 80
-#define MP_DEFAULT_KARATSUBA_SQR_CUTOFF 120
-#define MP_DEFAULT_TOOM_MUL_CUTOFF      350
-#define MP_DEFAULT_TOOM_SQR_CUTOFF      400
+#define MP_DEFAULT_MUL_KARATSUBA_CUTOFF 80
+#define MP_DEFAULT_SQR_KARATSUBA_CUTOFF 120
+#define MP_DEFAULT_MUL_TOOM_CUTOFF      350
+#define MP_DEFAULT_SQR_TOOM_CUTOFF      400
--- a/tommath_private.h
+++ b/tommath_private.h
@ -42,55 +42,25 @@
 * define MP_NO_ZERO_ON_FREE during compilation.
 */
 #ifdef MP_NO_ZERO_ON_FREE
-#  define MP_FREE_BUFFER(mem, size)   MP_FREE((mem), (size))
-#  define MP_FREE_DIGITS(mem, digits) MP_FREE((mem), sizeof (mp_digit) * (size_t)(digits))
+#  define MP_FREE_BUF(mem, size)   MP_FREE((mem), (size))
+#  define MP_FREE_DIGS(mem, digits) MP_FREE((mem), sizeof (mp_digit) * (size_t)(digits))
 #else
-#  define MP_FREE_BUFFER(mem, size)                     \
+#  define MP_FREE_BUF(mem, size)                        \
 do {                                                    \
   size_t fs_ = (size);                                 \
   void* fm_ = (mem);                                   \
   if (fm_ != NULL) {                                   \
-      MP_ZERO_BUFFER(fm_, fs_);                         \
+      s_mp_zero_buf(fm_, fs_);                          \
      MP_FREE(fm_, fs_);                                \
   }                                                    \
 } while (0)
-#  define MP_FREE_DIGITS(mem, digits)                   \
+#  define MP_FREE_DIGS(mem, digits)                     \
 do {                                                    \
   int fd_ = (digits);                                  \
-   void* fm_ = (mem);                                   \
+   mp_digit* fm_ = (mem);                               \
   if (fm_ != NULL) {                                   \
-      size_t fs_ = sizeof (mp_digit) * (size_t)fd_;     \
-      MP_ZERO_BUFFER(fm_, fs_);                         \
-      MP_FREE(fm_, fs_);                                \
-   }                                                    \
-} while (0)
-#endif
-
-#ifdef MP_USE_MEMSET
-#  include <string.h>
-#  define MP_ZERO_BUFFER(mem, size)   memset((mem), 0, (size))
-#  define MP_ZERO_DIGITS(mem, digits)                   \
-do {                                                    \
-   int zd_ = (digits);                                  \
-   if (zd_ > 0) {                                       \
-      memset((mem), 0, sizeof(mp_digit) * (size_t)zd_); \
-   }                                                    \
-} while (0)
-#else
-#  define MP_ZERO_BUFFER(mem, size)                     \
-do {                                                    \
-   size_t zs_ = (size);                                 \
-   char* zm_ = (char*)(mem);                            \
-   while (zs_-- > 0u) {                                 \
-      *zm_++ = '\0';                                    \
-   }                                                    \
-} while (0)
-#  define MP_ZERO_DIGITS(mem, digits)                   \
-do {                                                    \
-   int zd_ = (digits);                                  \
-   mp_digit* zm_ = (mem);                               \
-   while (zd_-- > 0) {                                  \
-      *zm_++ = 0;                                       \
+      s_mp_zero_digs(fm_, fd_);                         \
+      MP_FREE(fm_, sizeof (mp_digit) * (size_t)fd_);    \
   }                                                    \
 } while (0)
 #endif
@ -112,10 +82,10 @@ do {                                                    \

 #ifdef MP_FIXED_CUTOFFS
 #  include "tommath_cutoffs.h"
-#  define MP_KARATSUBA_MUL_CUTOFF MP_DEFAULT_KARATSUBA_MUL_CUTOFF
-#  define MP_KARATSUBA_SQR_CUTOFF MP_DEFAULT_KARATSUBA_SQR_CUTOFF
-#  define MP_TOOM_MUL_CUTOFF      MP_DEFAULT_TOOM_MUL_CUTOFF
-#  define MP_TOOM_SQR_CUTOFF      MP_DEFAULT_TOOM_SQR_CUTOFF
+#  define MP_MUL_KARATSUBA_CUTOFF MP_DEFAULT_MUL_KARATSUBA_CUTOFF
+#  define MP_SQR_KARATSUBA_CUTOFF MP_DEFAULT_SQR_KARATSUBA_CUTOFF
+#  define MP_MUL_TOOM_CUTOFF      MP_DEFAULT_MUL_TOOM_CUTOFF
+#  define MP_SQR_TOOM_CUTOFF      MP_DEFAULT_SQR_TOOM_CUTOFF
 #endif

 /* define heap macros */
@ -188,23 +158,23 @@ MP_STATIC_ASSERT(prec_geq_min_prec, MP_PREC >= MP_MIN_PREC)
 extern MP_PRIVATE mp_err(*s_mp_rand_source)(void *out, size_t size);

 /* lowlevel functions, do not call! */
-MP_PRIVATE bool s_mp_get_bit(const mp_int *a, unsigned int b);
+MP_PRIVATE bool s_mp_get_bit(const mp_int *a, int b);
 MP_PRIVATE mp_err s_mp_add(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
 MP_PRIVATE mp_err s_mp_sub(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
-MP_PRIVATE mp_err s_mp_mul_digs_fast(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
-MP_PRIVATE mp_err s_mp_mul_digs(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
-MP_PRIVATE mp_err s_mp_mul_high_digs_fast(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
-MP_PRIVATE mp_err s_mp_mul_high_digs(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
-MP_PRIVATE mp_err s_mp_sqr_fast(const mp_int *a, mp_int *b) MP_WUR;
+MP_PRIVATE mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
+MP_PRIVATE mp_err s_mp_mul(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
+MP_PRIVATE mp_err s_mp_mul_high_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
+MP_PRIVATE mp_err s_mp_mul_high(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
+MP_PRIVATE mp_err s_mp_sqr_comba(const mp_int *a, mp_int *b) MP_WUR;
 MP_PRIVATE mp_err s_mp_sqr(const mp_int *a, mp_int *b) MP_WUR;
-MP_PRIVATE mp_err s_mp_balance_mul(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
-MP_PRIVATE mp_err s_mp_karatsuba_mul(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
-MP_PRIVATE mp_err s_mp_toom_mul(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
-MP_PRIVATE mp_err s_mp_karatsuba_sqr(const mp_int *a, mp_int *b) MP_WUR;
-MP_PRIVATE mp_err s_mp_toom_sqr(const mp_int *a, mp_int *b) MP_WUR;
-MP_PRIVATE mp_err s_mp_invmod_fast(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
-MP_PRIVATE mp_err s_mp_invmod_slow(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
-MP_PRIVATE mp_err s_mp_montgomery_reduce_fast(mp_int *x, const mp_int *n, mp_digit rho) MP_WUR;
+MP_PRIVATE mp_err s_mp_mul_balance(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
+MP_PRIVATE mp_err s_mp_mul_karatsuba(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
+MP_PRIVATE mp_err s_mp_mul_toom(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
+MP_PRIVATE mp_err s_mp_sqr_karatsuba(const mp_int *a, mp_int *b) MP_WUR;
+MP_PRIVATE mp_err s_mp_sqr_toom(const mp_int *a, mp_int *b) MP_WUR;
+MP_PRIVATE mp_err s_mp_invmod_odd(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
+MP_PRIVATE mp_err s_mp_invmod(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
+MP_PRIVATE mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho) MP_WUR;
 MP_PRIVATE mp_err s_mp_exptmod_fast(const mp_int *G, const mp_int *X, const mp_int *P, mp_int *Y, int redmode) MP_WUR;
 MP_PRIVATE mp_err s_mp_exptmod(const mp_int *G, const mp_int *X, const mp_int *P, mp_int *Y, int redmode) MP_WUR;
 MP_PRIVATE mp_err s_mp_rand_platform(void *p, size_t n) MP_WUR;
@ -215,6 +185,9 @@ MP_PRIVATE uint32_t s_mp_log_pow2(const mp_int *a, uint32_t base);
 MP_PRIVATE mp_err s_mp_div_recursive(const mp_int *a, const mp_int *b, mp_int *q, mp_int *r);
 MP_PRIVATE mp_err s_mp_div_school(const mp_int *a, const mp_int *b, mp_int *c, mp_int *d);
 MP_PRIVATE mp_err s_mp_div_small(const mp_int *a, const mp_int *b, mp_int *c, mp_int *d);
+MP_PRIVATE void s_mp_zero_buf(void *mem, size_t size);
+MP_PRIVATE void s_mp_zero_digs(mp_digit *d, int digits);
+MP_PRIVATE void s_mp_copy_digs(mp_digit *d, const mp_digit *s, int digits);

 /* TODO: jenkins prng is not thread safe as of now */
 MP_PRIVATE mp_err s_mp_rand_jenkins(void *p, size_t n) MP_WUR;
@ -247,7 +220,7 @@ extern MP_PRIVATE const mp_digit s_mp_prime_tab[];
        }                                                                              \
        a->used = i;                                                                   \
        a->sign = MP_ZPOS;                                                             \
-        MP_ZERO_DIGITS(a->dp + a->used, a->alloc - a->used);                           \
+        s_mp_zero_digs(a->dp + a->used, a->alloc - a->used);                         \
    }

 #define MP_SET_SIGNED(name, uname, type, utype)          \
--- a/tommath_superclass.h
+++ b/tommath_superclass.h
@ -76,23 +76,23 @@
 * like removing support for even moduli, etc...
 */
 #   ifdef LTM_LAST
-#      undef MP_DR_IS_MODULUS_C
-#      undef MP_DR_SETUP_C
-#      undef MP_DR_REDUCE_C
 #      undef MP_DIV_3_C
-#      undef MP_REDUCE_2K_SETUP_C
+#      undef MP_DR_IS_MODULUS_C
+#      undef MP_DR_REDUCE_C
+#      undef MP_DR_SETUP_C
 #      undef MP_REDUCE_2K_C
+#      undef MP_REDUCE_2K_SETUP_C
 #      undef MP_REDUCE_IS_2K_C
 #      undef MP_REDUCE_SETUP_C
-#      undef S_MP_BALANCE_MUL_C
 #      undef S_MP_EXPTMOD_C
-#      undef S_MP_INVMOD_FAST_C
-#      undef S_MP_KARATSUBA_MUL_C
-#      undef S_MP_KARATSUBA_SQR_C
-#      undef S_MP_MUL_HIGH_DIGS_C
-#      undef S_MP_MUL_HIGH_DIGS_FAST_C
-#      undef S_MP_TOOM_MUL_C
-#      undef S_MP_TOOM_SQR_C
+#      undef S_MP_INVMOD_ODD_C
+#      undef S_MP_MUL_BALANCE_C
+#      undef S_MP_MUL_HIGH_C
+#      undef S_MP_MUL_HIGH_COMBA_C
+#      undef S_MP_MUL_KARATSUBA_C
+#      undef S_MP_MUL_TOOM_C
+#      undef S_MP_SQR_KARATSUBA_C
+#      undef S_MP_SQR_TOOM_C

 #      ifndef SC_RSA_1_WITH_TESTS
 #         undef MP_REDUCE_C
@ -104,7 +104,7 @@
 * trouble.
 */
 #      undef MP_MONTGOMERY_REDUCE_C
-#      undef S_MP_MUL_DIGS_C
+#      undef S_MP_MUL_C
 #      undef S_MP_SQR_C
 #   endif