diff --git a/demo/test.c b/demo/test.c
index f2f5800..3e432cf 100644
--- a/demo/test.c
+++ b/demo/test.c
@@ -1241,14 +1241,14 @@ LBL_ERR:
    return EXIT_FAILURE;
 }
 
-static int test_mp_div_3(void)
+static int test_s_mp_div_3(void)
 {
    int cnt;
 
    mp_int a, b, c, d, e;
    DOR(mp_init_multi(&a, &b, &c, &d, &e, NULL));
 
-   /* test mp_div_3  */
+   /* test s_mp_div_3 against mp_div with divisor 3 */
    mp_set(&d, 3u);
    for (cnt = 0; cnt < 10000;) {
       mp_digit r2;
@@ -1259,10 +1259,10 @@ static int test_mp_div_3(void)
       }
       DO(mp_rand(&a, (abs(rand_int()) % 128) + 1));
       DO(mp_div(&a, &d, &b, &e));
-      DO(mp_div_3(&a, &c, &r2));
+      DO(s_mp_div_3(&a, &c, &r2));
 
       if (mp_cmp(&b, &c) || mp_cmp_d(&e, r2)) {
-         printf("\nmp_div_3 => Failure\n");
+         printf("\ns_mp_div_3 => Failure\n");
          goto LBL_ERR;
       }
    }
@@ -2297,7 +2297,7 @@ static int unit_tests(int argc, char **argv)
       T1(mp_cnt_lsb, MP_CNT_LSB),
       T1(mp_complement, MP_COMPLEMENT),
       T1(mp_decr, MP_SUB_D),
-      T1(mp_div_3, MP_DIV_3),
+      T1(s_mp_div_3, S_MP_DIV_3),
       T1(mp_dr_reduce, MP_DR_REDUCE),
       T2(mp_pack_unpack,MP_PACK, MP_UNPACK),
       T2(mp_fread_fwrite, MP_FREAD, MP_FWRITE),
diff --git a/doc/bn.tex b/doc/bn.tex
index b8a6404..6204414 100644
--- a/doc/bn.tex
+++ b/doc/bn.tex
@@ -2605,14 +2605,6 @@ mp_err mp_incr(mp_int *a);
 mp_err mp_decr(mp_int *a);
 \end{alltt}
 
-The division by three can be made faster by replacing the division with a multiplication by the
-multiplicative inverse of three.
-
-\index{mp\_div\_3}
-\begin{alltt}
-mp_err mp_div_3(const mp_int *a, mp_int *c, mp_digit *d);
-\end{alltt}
-
 \chapter{Little Helpers}
 It is never wrong to have some useful little shortcuts at hand.
 \section{Function Macros}
diff --git a/libtommath_VS2008.vcproj b/libtommath_VS2008.vcproj
index 215ab4a..7e16199 100644
--- a/libtommath_VS2008.vcproj
+++ b/libtommath_VS2008.vcproj
@@ -392,10 +392,6 @@
 			RelativePath="mp_div_2d.c"
 			>
 		</File>
-		<File
-			RelativePath="mp_div_3.c"
-			>
-		</File>
 		<File
 			RelativePath="mp_div_d.c"
 			>
@@ -760,10 +756,6 @@
 			RelativePath="mp_signed_rsh.c"
 			>
 		</File>
-		<File
-			RelativePath="mp_sqr.c"
-			>
-		</File>
 		<File
 			RelativePath="mp_sqrmod.c"
 			>
@@ -824,6 +816,10 @@
 			RelativePath="s_mp_copy_digs.c"
 			>
 		</File>
+		<File
+			RelativePath="s_mp_div_3.c"
+			>
+		</File>
 		<File
 			RelativePath="s_mp_div_recursive.c"
 			>
diff --git a/makefile b/makefile
index a9633f8..88eff79 100644
--- a/makefile
+++ b/makefile
@@ -28,7 +28,7 @@ LCOV_ARGS=--directory .
 #START_INS
 OBJECTS=mp_2expt.o mp_abs.o mp_add.o mp_add_d.o mp_addmod.o mp_and.o mp_clamp.o mp_clear.o mp_clear_multi.o \
 mp_cmp.o mp_cmp_d.o mp_cmp_mag.o mp_cnt_lsb.o mp_complement.o mp_copy.o mp_count_bits.o mp_cutoffs.o \
-mp_div.o mp_div_2.o mp_div_2d.o mp_div_3.o mp_div_d.o mp_dr_is_modulus.o mp_dr_reduce.o mp_dr_setup.o \
+mp_div.o mp_div_2.o mp_div_2d.o mp_div_d.o mp_dr_is_modulus.o mp_dr_reduce.o mp_dr_setup.o \
 mp_error_to_string.o mp_exch.o mp_expt_u32.o mp_exptmod.o mp_exteuclid.o mp_fread.o mp_from_sbin.o \
 mp_from_ubin.o mp_fwrite.o mp_gcd.o mp_get_double.o mp_get_i32.o mp_get_i64.o mp_get_l.o mp_get_ll.o \
 mp_get_mag_u32.o mp_get_mag_u64.o mp_get_mag_ul.o mp_get_mag_ull.o mp_grow.o mp_init.o mp_init_copy.o \
@@ -42,11 +42,11 @@ mp_prime_strong_lucas_selfridge.o mp_radix_size.o mp_rand.o mp_read_radix.o mp_r
 mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o mp_reduce_is_2k_l.o \
 mp_reduce_setup.o mp_root_u32.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \
 mp_set_l.o mp_set_ll.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_set_ull.o mp_shrink.o mp_signed_rsh.o \
-mp_sqr.o mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o \
-mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_recursive.o \
-s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o s_mp_invmod.o \
-s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \
-s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
+mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o \
+mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
+s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o \
+s_mp_invmod.o s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o s_mp_montgomery_reduce_comba.o \
+s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
 s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o s_mp_rand_jenkins.o \
 s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o \
 s_mp_zero_buf.o s_mp_zero_digs.o
diff --git a/makefile.mingw b/makefile.mingw
index 55da599..3a3bc63 100644
--- a/makefile.mingw
+++ b/makefile.mingw
@@ -30,7 +30,7 @@ LIBMAIN_D =libtommath.dll
 #List of objects to compile (all goes to libtommath.a)
 OBJECTS=mp_2expt.o mp_abs.o mp_add.o mp_add_d.o mp_addmod.o mp_and.o mp_clamp.o mp_clear.o mp_clear_multi.o \
 mp_cmp.o mp_cmp_d.o mp_cmp_mag.o mp_cnt_lsb.o mp_complement.o mp_copy.o mp_count_bits.o mp_cutoffs.o \
-mp_div.o mp_div_2.o mp_div_2d.o mp_div_3.o mp_div_d.o mp_dr_is_modulus.o mp_dr_reduce.o mp_dr_setup.o \
+mp_div.o mp_div_2.o mp_div_2d.o mp_div_d.o mp_dr_is_modulus.o mp_dr_reduce.o mp_dr_setup.o \
 mp_error_to_string.o mp_exch.o mp_expt_u32.o mp_exptmod.o mp_exteuclid.o mp_fread.o mp_from_sbin.o \
 mp_from_ubin.o mp_fwrite.o mp_gcd.o mp_get_double.o mp_get_i32.o mp_get_i64.o mp_get_l.o mp_get_ll.o \
 mp_get_mag_u32.o mp_get_mag_u64.o mp_get_mag_ul.o mp_get_mag_ull.o mp_grow.o mp_init.o mp_init_copy.o \
@@ -44,11 +44,11 @@ mp_prime_strong_lucas_selfridge.o mp_radix_size.o mp_rand.o mp_read_radix.o mp_r
 mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o mp_reduce_is_2k_l.o \
 mp_reduce_setup.o mp_root_u32.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \
 mp_set_l.o mp_set_ll.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_set_ull.o mp_shrink.o mp_signed_rsh.o \
-mp_sqr.o mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o \
-mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_recursive.o \
-s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o s_mp_invmod.o \
-s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \
-s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
+mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o \
+mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
+s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o \
+s_mp_invmod.o s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o s_mp_montgomery_reduce_comba.o \
+s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
 s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o s_mp_rand_jenkins.o \
 s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o \
 s_mp_zero_buf.o s_mp_zero_digs.o
diff --git a/makefile.msvc b/makefile.msvc
index 7681252..a22267c 100644
--- a/makefile.msvc
+++ b/makefile.msvc
@@ -23,7 +23,7 @@ LIBMAIN_S =tommath.lib
 #List of objects to compile (all goes to tommath.lib)
 OBJECTS=mp_2expt.obj mp_abs.obj mp_add.obj mp_add_d.obj mp_addmod.obj mp_and.obj mp_clamp.obj mp_clear.obj mp_clear_multi.obj \
 mp_cmp.obj mp_cmp_d.obj mp_cmp_mag.obj mp_cnt_lsb.obj mp_complement.obj mp_copy.obj mp_count_bits.obj mp_cutoffs.obj \
-mp_div.obj mp_div_2.obj mp_div_2d.obj mp_div_3.obj mp_div_d.obj mp_dr_is_modulus.obj mp_dr_reduce.obj mp_dr_setup.obj \
+mp_div.obj mp_div_2.obj mp_div_2d.obj mp_div_d.obj mp_dr_is_modulus.obj mp_dr_reduce.obj mp_dr_setup.obj \
 mp_error_to_string.obj mp_exch.obj mp_expt_u32.obj mp_exptmod.obj mp_exteuclid.obj mp_fread.obj mp_from_sbin.obj \
 mp_from_ubin.obj mp_fwrite.obj mp_gcd.obj mp_get_double.obj mp_get_i32.obj mp_get_i64.obj mp_get_l.obj mp_get_ll.obj \
 mp_get_mag_u32.obj mp_get_mag_u64.obj mp_get_mag_ul.obj mp_get_mag_ull.obj mp_grow.obj mp_init.obj mp_init_copy.obj \
@@ -37,11 +37,11 @@ mp_prime_strong_lucas_selfridge.obj mp_radix_size.obj mp_rand.obj mp_read_radix.
 mp_reduce_2k_l.obj mp_reduce_2k_setup.obj mp_reduce_2k_setup_l.obj mp_reduce_is_2k.obj mp_reduce_is_2k_l.obj \
 mp_reduce_setup.obj mp_root_u32.obj mp_rshd.obj mp_sbin_size.obj mp_set.obj mp_set_double.obj mp_set_i32.obj mp_set_i64.obj \
 mp_set_l.obj mp_set_ll.obj mp_set_u32.obj mp_set_u64.obj mp_set_ul.obj mp_set_ull.obj mp_shrink.obj mp_signed_rsh.obj \
-mp_sqr.obj mp_sqrmod.obj mp_sqrt.obj mp_sqrtmod_prime.obj mp_sub.obj mp_sub_d.obj mp_submod.obj mp_to_radix.obj mp_to_sbin.obj \
-mp_to_ubin.obj mp_ubin_size.obj mp_unpack.obj mp_xor.obj mp_zero.obj s_mp_add.obj s_mp_copy_digs.obj s_mp_div_recursive.obj \
-s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj s_mp_exptmod_fast.obj s_mp_get_bit.obj s_mp_invmod.obj \
-s_mp_invmod_odd.obj s_mp_log.obj s_mp_log_d.obj s_mp_log_pow2.obj s_mp_montgomery_reduce_comba.obj s_mp_mul.obj \
-s_mp_mul_balance.obj s_mp_mul_comba.obj s_mp_mul_high.obj s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj \
+mp_sqrmod.obj mp_sqrt.obj mp_sqrtmod_prime.obj mp_sub.obj mp_sub_d.obj mp_submod.obj mp_to_radix.obj mp_to_sbin.obj \
+mp_to_ubin.obj mp_ubin_size.obj mp_unpack.obj mp_xor.obj mp_zero.obj s_mp_add.obj s_mp_copy_digs.obj s_mp_div_3.obj \
+s_mp_div_recursive.obj s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj s_mp_exptmod_fast.obj s_mp_get_bit.obj \
+s_mp_invmod.obj s_mp_invmod_odd.obj s_mp_log.obj s_mp_log_d.obj s_mp_log_pow2.obj s_mp_montgomery_reduce_comba.obj \
+s_mp_mul.obj s_mp_mul_balance.obj s_mp_mul_comba.obj s_mp_mul_high.obj s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj \
 s_mp_mul_toom.obj s_mp_prime_is_divisible.obj s_mp_prime_tab.obj s_mp_radix_map.obj s_mp_rand_jenkins.obj \
 s_mp_rand_platform.obj s_mp_sqr.obj s_mp_sqr_comba.obj s_mp_sqr_karatsuba.obj s_mp_sqr_toom.obj s_mp_sub.obj \
 s_mp_zero_buf.obj s_mp_zero_digs.obj
diff --git a/makefile.shared b/makefile.shared
index 62a9343..ad58e61 100644
--- a/makefile.shared
+++ b/makefile.shared
@@ -25,7 +25,7 @@ LCOV_ARGS=--directory .libs --directory .
 #START_INS
 OBJECTS=mp_2expt.o mp_abs.o mp_add.o mp_add_d.o mp_addmod.o mp_and.o mp_clamp.o mp_clear.o mp_clear_multi.o \
 mp_cmp.o mp_cmp_d.o mp_cmp_mag.o mp_cnt_lsb.o mp_complement.o mp_copy.o mp_count_bits.o mp_cutoffs.o \
-mp_div.o mp_div_2.o mp_div_2d.o mp_div_3.o mp_div_d.o mp_dr_is_modulus.o mp_dr_reduce.o mp_dr_setup.o \
+mp_div.o mp_div_2.o mp_div_2d.o mp_div_d.o mp_dr_is_modulus.o mp_dr_reduce.o mp_dr_setup.o \
 mp_error_to_string.o mp_exch.o mp_expt_u32.o mp_exptmod.o mp_exteuclid.o mp_fread.o mp_from_sbin.o \
 mp_from_ubin.o mp_fwrite.o mp_gcd.o mp_get_double.o mp_get_i32.o mp_get_i64.o mp_get_l.o mp_get_ll.o \
 mp_get_mag_u32.o mp_get_mag_u64.o mp_get_mag_ul.o mp_get_mag_ull.o mp_grow.o mp_init.o mp_init_copy.o \
@@ -39,11 +39,11 @@ mp_prime_strong_lucas_selfridge.o mp_radix_size.o mp_rand.o mp_read_radix.o mp_r
 mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o mp_reduce_is_2k_l.o \
 mp_reduce_setup.o mp_root_u32.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \
 mp_set_l.o mp_set_ll.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_set_ull.o mp_shrink.o mp_signed_rsh.o \
-mp_sqr.o mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o \
-mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_recursive.o \
-s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o s_mp_invmod.o \
-s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \
-s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
+mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o \
+mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
+s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o \
+s_mp_invmod.o s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o s_mp_montgomery_reduce_comba.o \
+s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
 s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o s_mp_rand_jenkins.o \
 s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o \
 s_mp_zero_buf.o s_mp_zero_digs.o
diff --git a/makefile.unix b/makefile.unix
index 859eed4..1e0da73 100644
--- a/makefile.unix
+++ b/makefile.unix
@@ -31,7 +31,7 @@ LIBMAIN_S = libtommath.a
 
 OBJECTS=mp_2expt.o mp_abs.o mp_add.o mp_add_d.o mp_addmod.o mp_and.o mp_clamp.o mp_clear.o mp_clear_multi.o \
 mp_cmp.o mp_cmp_d.o mp_cmp_mag.o mp_cnt_lsb.o mp_complement.o mp_copy.o mp_count_bits.o mp_cutoffs.o \
-mp_div.o mp_div_2.o mp_div_2d.o mp_div_3.o mp_div_d.o mp_dr_is_modulus.o mp_dr_reduce.o mp_dr_setup.o \
+mp_div.o mp_div_2.o mp_div_2d.o mp_div_d.o mp_dr_is_modulus.o mp_dr_reduce.o mp_dr_setup.o \
 mp_error_to_string.o mp_exch.o mp_expt_u32.o mp_exptmod.o mp_exteuclid.o mp_fread.o mp_from_sbin.o \
 mp_from_ubin.o mp_fwrite.o mp_gcd.o mp_get_double.o mp_get_i32.o mp_get_i64.o mp_get_l.o mp_get_ll.o \
 mp_get_mag_u32.o mp_get_mag_u64.o mp_get_mag_ul.o mp_get_mag_ull.o mp_grow.o mp_init.o mp_init_copy.o \
@@ -45,11 +45,11 @@ mp_prime_strong_lucas_selfridge.o mp_radix_size.o mp_rand.o mp_read_radix.o mp_r
 mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o mp_reduce_is_2k_l.o \
 mp_reduce_setup.o mp_root_u32.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \
 mp_set_l.o mp_set_ll.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_set_ull.o mp_shrink.o mp_signed_rsh.o \
-mp_sqr.o mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o \
-mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_recursive.o \
-s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o s_mp_invmod.o \
-s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \
-s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
+mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o \
+mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
+s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o \
+s_mp_invmod.o s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o s_mp_montgomery_reduce_comba.o \
+s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
 s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o s_mp_rand_jenkins.o \
 s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o \
 s_mp_zero_buf.o s_mp_zero_digs.o
diff --git a/mp_div_d.c b/mp_div_d.c
index 472ab27..5697e54 100644
--- a/mp_div_d.c
+++ b/mp_div_d.c
@@ -28,7 +28,13 @@ mp_err mp_div_d(const mp_int *a, mp_digit b, mp_int *c, mp_digit *d)
    }
 
    /* power of two ? */
-   if ((b & (b - 1u)) == 0u) {
+   if (MP_HAS(MP_DIV_2) && (b == 2u)) {
+      if (d != NULL) {
+         *d = mp_isodd(a) ? 1u : 0u;
+      }
+      return (c == NULL) ? MP_OKAY : mp_div_2(a, c);
+   }
+   if (MP_HAS(MP_DIV_2D) && MP_IS_2EXPT(b)) {
       ix = 1;
       while ((ix < MP_DIGIT_BIT) && (b != (((mp_digit)1)<<ix))) {
          ix++;
@@ -36,15 +42,12 @@ mp_err mp_div_d(const mp_int *a, mp_digit b, mp_int *c, mp_digit *d)
       if (d != NULL) {
          *d = a->dp[0] & (((mp_digit)1<<(mp_digit)ix) - 1uL);
       }
-      if (c != NULL) {
-         return mp_div_2d(a, ix, c, NULL);
-      }
-      return MP_OKAY;
+      return (c == NULL) ? MP_OKAY : mp_div_2d(a, ix, c, NULL);
    }
 
    /* three? */
-   if (MP_HAS(MP_DIV_3) && (b == 3u)) {
-      return mp_div_3(a, c, d);
+   if (MP_HAS(S_MP_DIV_3) && (b == 3u)) {
+      return s_mp_div_3(a, c, d);
    }
 
    /* no easy answer [c'est la vie].  Just division */
diff --git a/mp_log_u32.c b/mp_log_u32.c
index 31d9662..949ef87 100644
--- a/mp_log_u32.c
+++ b/mp_log_u32.c
@@ -17,7 +17,7 @@ mp_err mp_log_u32(const mp_int *a, uint32_t base, uint32_t *c)
       return MP_VAL;
    }
 
-   if (MP_HAS(S_MP_LOG_POW2) && ((base & (base - 1u)) == 0u)) {
+   if (MP_HAS(S_MP_LOG_POW2) && MP_IS_2EXPT(base)) {
       *c = s_mp_log_pow2(a, base);
       return MP_OKAY;
    }
diff --git a/mp_mul.c b/mp_mul.c
index 9a83687..b2dbf7d 100644
--- a/mp_mul.c
+++ b/mp_mul.c
@@ -12,18 +12,34 @@ mp_err mp_mul(const mp_int *a, const mp_int *b, mp_int *c)
        digs = a->used + b->used + 1;
    mp_sign neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
 
-   if (MP_HAS(S_MP_MUL_BALANCE) &&
-       /* Check sizes. The smaller one needs to be larger than the Karatsuba cut-off.
-        * The bigger one needs to be at least about one MP_MUL_KARATSUBA_CUTOFF bigger
-        * to make some sense, but it depends on architecture, OS, position of the
-        * stars... so YMMV.
-        * Using it to cut the input into slices small enough for s_mp_mul_comba
-        * was actually slower on the author's machine, but YMMV.
-        */
-       (min >= MP_MUL_KARATSUBA_CUTOFF) &&
-       ((max / 2) >= MP_MUL_KARATSUBA_CUTOFF) &&
-       /* Not much effect was observed below a ratio of 1:2, but again: YMMV. */
-       (max >= (2 * min))) {
+   if ((a == b) &&
+       MP_HAS(S_MP_SQR_TOOM) && /* use Toom-Cook? */
+       (a->used >= MP_SQR_TOOM_CUTOFF)) {
+      err = s_mp_sqr_toom(a, c);
+   } else if ((a == b) &&
+              MP_HAS(S_MP_SQR_KARATSUBA) &&  /* Karatsuba? */
+              (a->used >= MP_SQR_KARATSUBA_CUTOFF)) {
+      err = s_mp_sqr_karatsuba(a, c);
+   } else if ((a == b) &&
+              MP_HAS(S_MP_SQR_COMBA) && /* can we use the fast comba multiplier? */
+              (((a->used * 2) + 1) < MP_WARRAY) &&
+              (a->used < (MP_MAX_COMBA / 2))) {
+      err = s_mp_sqr_comba(a, c);
+   } else if ((a == b) &&
+              MP_HAS(S_MP_SQR)) {
+      err = s_mp_sqr(a, c);
+   } else if (MP_HAS(S_MP_MUL_BALANCE) &&
+              /* Check sizes. The smaller one needs to be larger than the Karatsuba cut-off.
+               * The bigger one needs to be at least about one MP_MUL_KARATSUBA_CUTOFF bigger
+               * to make some sense, but it depends on architecture, OS, position of the
+               * stars... so YMMV.
+               * Using it to cut the input into slices small enough for s_mp_mul_comba
+               * was actually slower on the author's machine, but YMMV.
+               */
+              (min >= MP_MUL_KARATSUBA_CUTOFF) &&
+              ((max / 2) >= MP_MUL_KARATSUBA_CUTOFF) &&
+              /* Not much effect was observed below a ratio of 1:2, but again: YMMV. */
+              (max >= (2 * min))) {
       err = s_mp_mul_balance(a,b,c);
    } else if (MP_HAS(S_MP_MUL_TOOM) &&
               (min >= MP_MUL_TOOM_CUTOFF)) {
diff --git a/mp_mul_d.c b/mp_mul_d.c
index 30d6c93..2585055 100644
--- a/mp_mul_d.c
+++ b/mp_mul_d.c
@@ -10,6 +10,22 @@ mp_err mp_mul_d(const mp_int *a, mp_digit b, mp_int *c)
    mp_err   err;
    int   ix, oldused;
 
+   if (b == 1u) {
+      return mp_copy(a, c);
+   }
+
+   /* power of two? delegate to mp_mul_2 / mp_mul_2d */
+   if (MP_HAS(MP_MUL_2) && (b == 2u)) {
+      return mp_mul_2(a, c);
+   }
+   if (MP_HAS(MP_MUL_2D) && MP_IS_2EXPT(b)) {
+      ix = 1;
+      while ((ix < MP_DIGIT_BIT) && (b != (((mp_digit)1)<<ix))) {
+         ix++;
+      }
+      return mp_mul_2d(a, ix, c);
+   }
+
    /* make sure c is big enough to hold a*b */
    if ((err = mp_grow(c, a->used + 1)) != MP_OKAY) {
       return err;
diff --git a/mp_sqr.c b/mp_sqr.c
deleted file mode 100644
index 67a8224..0000000
--- a/mp_sqr.c
+++ /dev/null
@@ -1,28 +0,0 @@
-#include "tommath_private.h"
-#ifdef MP_SQR_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis */
-/* SPDX-License-Identifier: Unlicense */
-
-/* computes b = a*a */
-mp_err mp_sqr(const mp_int *a, mp_int *b)
-{
-   mp_err err;
-   if (MP_HAS(S_MP_SQR_TOOM) && /* use Toom-Cook? */
-       (a->used >= MP_SQR_TOOM_CUTOFF)) {
-      err = s_mp_sqr_toom(a, b);
-   } else if (MP_HAS(S_MP_SQR_KARATSUBA) &&  /* Karatsuba? */
-              (a->used >= MP_SQR_KARATSUBA_CUTOFF)) {
-      err = s_mp_sqr_karatsuba(a, b);
-   } else if (MP_HAS(S_MP_SQR_COMBA) && /* can we use the fast comba multiplier? */
-              (((a->used * 2) + 1) < MP_WARRAY) &&
-              (a->used < (MP_MAX_COMBA / 2))) {
-      err = s_mp_sqr_comba(a, b);
-   } else if (MP_HAS(S_MP_SQR)) {
-      err = s_mp_sqr(a, b);
-   } else {
-      err = MP_VAL;
-   }
-   b->sign = MP_ZPOS;
-   return err;
-}
-#endif
diff --git a/mp_div_3.c b/s_mp_div_3.c
similarity index 94%
rename from mp_div_3.c
rename to s_mp_div_3.c
index c26692c..1cc6d3d 100644
--- a/mp_div_3.c
+++ b/s_mp_div_3.c
@@ -1,10 +1,10 @@
 #include "tommath_private.h"
-#ifdef MP_DIV_3_C
+#ifdef S_MP_DIV_3_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */
 
 /* divide by three (based on routine from MPI and the GMP manual) */
-mp_err mp_div_3(const mp_int *a, mp_int *c, mp_digit *d)
+mp_err s_mp_div_3(const mp_int *a, mp_int *c, mp_digit *d)
 {
    mp_int   q;
    mp_word  w;
diff --git a/s_mp_mul_toom.c b/s_mp_mul_toom.c
index 1ed03c8..f6c2b10 100644
--- a/s_mp_mul_toom.c
+++ b/s_mp_mul_toom.c
@@ -133,7 +133,7 @@ mp_err s_mp_mul_toom(const mp_int *a, const mp_int *b, mp_int *c)
    if ((err = mp_sub(&S2, &a1, &S2)) != MP_OKAY)                  goto LBL_ERR;
 
    /** S2 = S2 / 3; \\ this is an exact division  */
-   if ((err = mp_div_3(&S2, &S2, NULL)) != MP_OKAY)               goto LBL_ERR;
+   if ((err = s_mp_div_3(&S2, &S2, NULL)) != MP_OKAY)             goto LBL_ERR;
 
    /** a1 = S1 - a1; */
    if ((err = mp_sub(&S1, &a1, &a1)) != MP_OKAY)                  goto LBL_ERR;
diff --git a/tommath.def b/tommath.def
index d2509e1..8bc6eac 100644
--- a/tommath.def
+++ b/tommath.def
@@ -25,7 +25,6 @@ EXPORTS
     mp_div
     mp_div_2
     mp_div_2d
-    mp_div_3
     mp_div_d
     mp_dr_is_modulus
     mp_dr_reduce
@@ -117,7 +116,6 @@ EXPORTS
     mp_set_ull
     mp_shrink
     mp_signed_rsh
-    mp_sqr
     mp_sqrmod
     mp_sqrt
     mp_sqrtmod_prime
diff --git a/tommath.h b/tommath.h
index 68a1592..5e75c98 100644
--- a/tommath.h
+++ b/tommath.h
@@ -300,9 +300,6 @@ mp_err mp_div_2d(const mp_int *a, int b, mp_int *c, mp_int *d) MP_WUR;
 /* b = a/2 */
 mp_err mp_div_2(const mp_int *a, mp_int *b) MP_WUR;
 
-/* a/3 => 3c + d == a */
-mp_err mp_div_3(const mp_int *a, mp_int *c, mp_digit *d) MP_WUR;
-
 /* c = a * 2**b, implemented as c = a << b */
 mp_err mp_mul_2d(const mp_int *a, int b, mp_int *c) MP_WUR;
 
@@ -366,7 +363,7 @@ mp_err mp_sub(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
 mp_err mp_mul(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
 
 /* b = a*a  */
-mp_err mp_sqr(const mp_int *a, mp_int *b) MP_WUR;
+#define mp_sqr(a, b) mp_mul((a), (a), (b))
 
 /* a/b => cb + d == a */
 mp_err mp_div(const mp_int *a, const mp_int *b, mp_int *c, mp_int *d) MP_WUR;
diff --git a/tommath_class.h b/tommath_class.h
index b11c574..f5f9907 100644
--- a/tommath_class.h
+++ b/tommath_class.h
@@ -31,7 +31,6 @@
 #   define MP_DIV_C
 #   define MP_DIV_2_C
 #   define MP_DIV_2D_C
-#   define MP_DIV_3_C
 #   define MP_DIV_D_C
 #   define MP_DR_IS_MODULUS_C
 #   define MP_DR_REDUCE_C
@@ -123,7 +122,6 @@
 #   define MP_SET_ULL_C
 #   define MP_SHRINK_C
 #   define MP_SIGNED_RSH_C
-#   define MP_SQR_C
 #   define MP_SQRMOD_C
 #   define MP_SQRT_C
 #   define MP_SQRTMOD_PRIME_C
@@ -139,6 +137,7 @@
 #   define MP_ZERO_C
 #   define S_MP_ADD_C
 #   define S_MP_COPY_DIGS_C
+#   define S_MP_DIV_3_C
 #   define S_MP_DIV_RECURSIVE_C
 #   define S_MP_DIV_SCHOOL_C
 #   define S_MP_DIV_SMALL_C
@@ -266,21 +265,15 @@
 #   define MP_RSHD_C
 #endif
 
-#if defined(MP_DIV_3_C)
-#   define MP_CLAMP_C
-#   define MP_CLEAR_C
-#   define MP_EXCH_C
-#   define MP_INIT_SIZE_C
-#endif
-
 #if defined(MP_DIV_D_C)
 #   define MP_CLAMP_C
 #   define MP_CLEAR_C
 #   define MP_COPY_C
 #   define MP_DIV_2D_C
-#   define MP_DIV_3_C
+#   define MP_DIV_2_C
 #   define MP_EXCH_C
 #   define MP_INIT_SIZE_C
+#   define S_MP_DIV_3_C
 #endif
 
 #if defined(MP_DR_IS_MODULUS_C)
@@ -308,7 +301,6 @@
 #   define MP_INIT_COPY_C
 #   define MP_MUL_C
 #   define MP_SET_C
-#   define MP_SQR_C
 #endif
 
 #if defined(MP_EXPTMOD_C)
@@ -480,8 +472,8 @@
 #   define MP_GET_I32_C
 #   define MP_INIT_U32_C
 #   define MP_MOD_C
+#   define MP_MUL_C
 #   define MP_SQRT_C
-#   define MP_SQR_C
 #endif
 
 #if defined(MP_KRONECKER_C)
@@ -554,6 +546,10 @@
 #   define S_MP_MUL_COMBA_C
 #   define S_MP_MUL_KARATSUBA_C
 #   define S_MP_MUL_TOOM_C
+#   define S_MP_SQR_C
+#   define S_MP_SQR_COMBA_C
+#   define S_MP_SQR_KARATSUBA_C
+#   define S_MP_SQR_TOOM_C
 #endif
 
 #if defined(MP_MUL_2_C)
@@ -570,7 +566,10 @@
 
 #if defined(MP_MUL_D_C)
 #   define MP_CLAMP_C
+#   define MP_COPY_C
 #   define MP_GROW_C
+#   define MP_MUL_2D_C
+#   define MP_MUL_2_C
 #   define S_MP_ZERO_DIGS_C
 #endif
 
@@ -703,7 +702,6 @@
 #   define MP_SET_C
 #   define MP_SET_I32_C
 #   define MP_SET_U32_C
-#   define MP_SQR_C
 #   define MP_SUB_C
 #   define MP_SUB_D_C
 #   define S_MP_GET_BIT_C
@@ -873,16 +871,9 @@
 #   define MP_SUB_D_C
 #endif
 
-#if defined(MP_SQR_C)
-#   define S_MP_SQR_C
-#   define S_MP_SQR_COMBA_C
-#   define S_MP_SQR_KARATSUBA_C
-#   define S_MP_SQR_TOOM_C
-#endif
-
 #if defined(MP_SQRMOD_C)
 #   define MP_MOD_C
-#   define MP_SQR_C
+#   define MP_MUL_C
 #endif
 
 #if defined(MP_SQRT_C)
@@ -978,6 +969,13 @@
 #if defined(S_MP_COPY_DIGS_C)
 #endif
 
+#if defined(S_MP_DIV_3_C)
+#   define MP_CLAMP_C
+#   define MP_CLEAR_C
+#   define MP_EXCH_C
+#   define MP_INIT_SIZE_C
+#endif
+
 #if defined(S_MP_DIV_RECURSIVE_C)
 #   define MP_ADD_C
 #   define MP_CLEAR_MULTI_C
@@ -1043,7 +1041,6 @@
 #   define MP_REDUCE_C
 #   define MP_REDUCE_SETUP_C
 #   define MP_SET_C
-#   define MP_SQR_C
 #endif
 
 #if defined(S_MP_EXPTMOD_FAST_C)
@@ -1063,7 +1060,6 @@
 #   define MP_REDUCE_2K_C
 #   define MP_REDUCE_2K_SETUP_C
 #   define MP_SET_C
-#   define MP_SQR_C
 #   define S_MP_MONTGOMERY_REDUCE_COMBA_C
 #endif
 
@@ -1110,7 +1106,6 @@
 #   define MP_INIT_MULTI_C
 #   define MP_MUL_C
 #   define MP_SET_C
-#   define MP_SQR_C
 #endif
 
 #if defined(S_MP_LOG_D_C)
@@ -1188,7 +1183,6 @@
 #   define MP_CLEAR_C
 #   define MP_CLEAR_MULTI_C
 #   define MP_DIV_2_C
-#   define MP_DIV_3_C
 #   define MP_INIT_MULTI_C
 #   define MP_INIT_SIZE_C
 #   define MP_LSHD_C
@@ -1196,6 +1190,7 @@
 #   define MP_MUL_C
 #   define MP_SUB_C
 #   define S_MP_COPY_DIGS_C
+#   define S_MP_DIV_3_C
 #endif
 
 #if defined(S_MP_PRIME_IS_DIVISIBLE_C)
@@ -1234,7 +1229,7 @@
 #   define MP_CLEAR_C
 #   define MP_INIT_SIZE_C
 #   define MP_LSHD_C
-#   define MP_SQR_C
+#   define MP_MUL_C
 #   define S_MP_ADD_C
 #   define S_MP_COPY_DIGS_C
 #   define S_MP_SUB_C
@@ -1250,7 +1245,6 @@
 #   define MP_LSHD_C
 #   define MP_MUL_2_C
 #   define MP_MUL_C
-#   define MP_SQR_C
 #   define MP_SUB_C
 #   define S_MP_COPY_DIGS_C
 #endif
diff --git a/tommath_private.h b/tommath_private.h
index 17bbb73..f629502 100644
--- a/tommath_private.h
+++ b/tommath_private.h
@@ -120,6 +120,8 @@ extern void MP_FREE(void *mem, size_t size);
 
 #define MP_EXCH(t, a, b) do { t _c = a; a = b; b = _c; } while (0)
 
+#define MP_IS_2EXPT(x) (((x) != 0u) && (((x) & ((x) - 1u)) == 0u))
+
 /* Static assertion */
 #define MP_STATIC_ASSERT(msg, cond) typedef char mp_static_assert_##msg[(cond) ? 1 : -1];
 
@@ -158,36 +160,37 @@ MP_STATIC_ASSERT(prec_geq_min_prec, MP_PREC >= MP_MIN_PREC)
 extern MP_PRIVATE mp_err(*s_mp_rand_source)(void *out, size_t size);
 
 /* lowlevel functions, do not call! */
-MP_PRIVATE bool s_mp_get_bit(const mp_int *a, int b);
+MP_PRIVATE bool s_mp_get_bit(const mp_int *a, int b) MP_WUR;
+MP_PRIVATE mp_digit s_mp_log_d(mp_digit base, mp_digit n) MP_WUR;
 MP_PRIVATE mp_err s_mp_add(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
-MP_PRIVATE mp_err s_mp_sub(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
-MP_PRIVATE mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
+MP_PRIVATE mp_err s_mp_div_3(const mp_int *a, mp_int *c, mp_digit *d) MP_WUR;
+MP_PRIVATE mp_err s_mp_div_recursive(const mp_int *a, const mp_int *b, mp_int *q, mp_int *r) MP_WUR;
+MP_PRIVATE mp_err s_mp_div_school(const mp_int *a, const mp_int *b, mp_int *c, mp_int *d) MP_WUR;
+MP_PRIVATE mp_err s_mp_div_small(const mp_int *a, const mp_int *b, mp_int *c, mp_int *d) MP_WUR;
+MP_PRIVATE mp_err s_mp_exptmod(const mp_int *G, const mp_int *X, const mp_int *P, mp_int *Y, int redmode) MP_WUR;
+MP_PRIVATE mp_err s_mp_exptmod_fast(const mp_int *G, const mp_int *X, const mp_int *P, mp_int *Y, int redmode) MP_WUR;
+MP_PRIVATE mp_err s_mp_invmod(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
+MP_PRIVATE mp_err s_mp_invmod_odd(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
+MP_PRIVATE mp_err s_mp_log(const mp_int *a, uint32_t base, uint32_t *c) MP_WUR;
+MP_PRIVATE mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho) MP_WUR;
 MP_PRIVATE mp_err s_mp_mul(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
-MP_PRIVATE mp_err s_mp_mul_high_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
-MP_PRIVATE mp_err s_mp_mul_high(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
-MP_PRIVATE mp_err s_mp_sqr_comba(const mp_int *a, mp_int *b) MP_WUR;
-MP_PRIVATE mp_err s_mp_sqr(const mp_int *a, mp_int *b) MP_WUR;
 MP_PRIVATE mp_err s_mp_mul_balance(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
+MP_PRIVATE mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
+MP_PRIVATE mp_err s_mp_mul_high(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
+MP_PRIVATE mp_err s_mp_mul_high_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
 MP_PRIVATE mp_err s_mp_mul_karatsuba(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
 MP_PRIVATE mp_err s_mp_mul_toom(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
+MP_PRIVATE mp_err s_mp_prime_is_divisible(const mp_int *a, bool *result) MP_WUR;
+MP_PRIVATE mp_err s_mp_rand_platform(void *p, size_t n) MP_WUR;
+MP_PRIVATE mp_err s_mp_sqr(const mp_int *a, mp_int *b) MP_WUR;
+MP_PRIVATE mp_err s_mp_sqr_comba(const mp_int *a, mp_int *b) MP_WUR;
 MP_PRIVATE mp_err s_mp_sqr_karatsuba(const mp_int *a, mp_int *b) MP_WUR;
 MP_PRIVATE mp_err s_mp_sqr_toom(const mp_int *a, mp_int *b) MP_WUR;
-MP_PRIVATE mp_err s_mp_invmod_odd(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
-MP_PRIVATE mp_err s_mp_invmod(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
-MP_PRIVATE mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho) MP_WUR;
-MP_PRIVATE mp_err s_mp_exptmod_fast(const mp_int *G, const mp_int *X, const mp_int *P, mp_int *Y, int redmode) MP_WUR;
-MP_PRIVATE mp_err s_mp_exptmod(const mp_int *G, const mp_int *X, const mp_int *P, mp_int *Y, int redmode) MP_WUR;
-MP_PRIVATE mp_err s_mp_rand_platform(void *p, size_t n) MP_WUR;
-MP_PRIVATE mp_err s_mp_prime_is_divisible(const mp_int *a, bool *result);
-MP_PRIVATE mp_digit s_mp_log_d(mp_digit base, mp_digit n);
-MP_PRIVATE mp_err s_mp_log(const mp_int *a, uint32_t base, uint32_t *c);
-MP_PRIVATE uint32_t s_mp_log_pow2(const mp_int *a, uint32_t base);
-MP_PRIVATE mp_err s_mp_div_recursive(const mp_int *a, const mp_int *b, mp_int *q, mp_int *r);
-MP_PRIVATE mp_err s_mp_div_school(const mp_int *a, const mp_int *b, mp_int *c, mp_int *d);
-MP_PRIVATE mp_err s_mp_div_small(const mp_int *a, const mp_int *b, mp_int *c, mp_int *d);
+MP_PRIVATE mp_err s_mp_sub(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
+MP_PRIVATE uint32_t s_mp_log_pow2(const mp_int *a, uint32_t base) MP_WUR;
+MP_PRIVATE void s_mp_copy_digs(mp_digit *d, const mp_digit *s, int digits);
 MP_PRIVATE void s_mp_zero_buf(void *mem, size_t size);
 MP_PRIVATE void s_mp_zero_digs(mp_digit *d, int digits);
-MP_PRIVATE void s_mp_copy_digs(mp_digit *d, const mp_digit *s, int digits);
 
 /* TODO: jenkins prng is not thread safe as of now */
 MP_PRIVATE mp_err s_mp_rand_jenkins(void *p, size_t n) MP_WUR;
diff --git a/tommath_superclass.h b/tommath_superclass.h
index 1d1e000..dd83cad 100644
--- a/tommath_superclass.h
+++ b/tommath_superclass.h
@@ -75,7 +75,6 @@
  * like removing support for even moduli, etc...
  */
 #   ifdef LTM_LAST
-#      undef MP_DIV_3_C
 #      undef MP_DR_IS_MODULUS_C
 #      undef MP_DR_REDUCE_C
 #      undef MP_DR_SETUP_C
@@ -83,6 +82,7 @@
 #      undef MP_REDUCE_2K_SETUP_C
 #      undef MP_REDUCE_IS_2K_C
 #      undef MP_REDUCE_SETUP_C
+#      undef S_MP_DIV_3_C
 #      undef S_MP_EXPTMOD_C
 #      undef S_MP_INVMOD_ODD_C
 #      undef S_MP_MUL_BALANCE_C