added autotuning functionality for the Toom-Cook cut-offs
This commit is contained in:
parent
9e1a75cfdc
commit
db76bed220
11
.travis.yml
11
.travis.yml
@ -141,6 +141,16 @@ matrix:
|
||||
- env: BUILDOPTIONS='--with-cc=clang-7 --cflags=-DMP_16BIT --with-valgrind'
|
||||
- env: BUILDOPTIONS='--with-cc=clang-7 --cflags=-DMP_32BIT --with-valgrind'
|
||||
|
||||
# Test "autotuning", the automatic evaluation and setting of the Toom-Cook cut-offs.
|
||||
- env: BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_8BIT --with-valgrind --make-option=tune'
|
||||
- env: BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_16BIT --with-valgrind --make-option=tune'
|
||||
- env: BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_32BIT --with-valgrind --make-option=tune'
|
||||
- env: BUILDOPTIONS='--with-cc=gcc-5 --with-valgrind --make-option=tune'
|
||||
- env: BUILDOPTIONS='--with-cc=clang-7 --cflags=-DMP_8BIT --with-valgrind --make-option=tune'
|
||||
- env: BUILDOPTIONS='--with-cc=clang-7 --cflags=-DMP_16BIT --with-valgrind --make-option=tune'
|
||||
- env: BUILDOPTIONS='--with-cc=clang-7 --cflags=-DMP_32BIT --with-valgrind --make-option=tune'
|
||||
- env: BUILDOPTIONS='--with-cc=clang-7 --with-valgrind --make-option=tune'
|
||||
|
||||
# GCC for the x86-64 architecture testing against a different Bigint-implementation
|
||||
# with 333333 different inputs.
|
||||
- env: BUILDOPTIONS='--with-cc=gcc-5 --test-vs-mtest=333333 --with-valgrind'
|
||||
@ -151,6 +161,7 @@ matrix:
|
||||
- env: BUILDOPTIONS='--with-cc=gcc-5 --test-vs-mtest=333333 --mtest-real-rand --with-valgrind'
|
||||
- env: BUILDOPTIONS='--with-cc=clang-7 --test-vs-mtest=333333 --mtest-real-rand --with-valgrind'
|
||||
|
||||
|
||||
# Notifications go to
|
||||
# An email address is also possible.
|
||||
notifications:
|
||||
|
48
doc/bn.tex
48
doc/bn.tex
@ -110,7 +110,7 @@ ICC does not know all options available for GCC and LibTomMath uses two diagnost
|
||||
icc: command line warning #10148: option '-Wbad-function-cast' not supported
|
||||
icc: command line warning #10148: option '-Wcast-align' not supported
|
||||
\end{alltt}
|
||||
It is possible to mute this ICC warning with the compiler flag \texttt{-diag-disable=10006}\footnote{It is not recommended to suppress warnings without a very good reason but there is no harm in doing so in this very special case.}.
|
||||
It is possible to mute this ICC warning with the compiler flag \texttt{-diag-disable=10148}\footnote{It is not recommended to suppress warnings without a very good reason but there is no harm in doing so in this very special case.}.
|
||||
|
||||
\subsection{Static Libraries}
|
||||
To build as a static library for GCC issue the following
|
||||
@ -127,6 +127,12 @@ nmake -f makefile.msvc
|
||||
This will build the library and archive the object files in ``tommath.lib''. This has been tested with MSVC
|
||||
version 6.00 with service pack 5.
|
||||
|
||||
To run a program to adapt the Toom-Cook cut-off values to your architecture, type
|
||||
\begin{alltt}
|
||||
make tune
|
||||
\end{alltt}
|
||||
This will take some time.
|
||||
|
||||
\subsection{Shared Libraries}
|
||||
\subsubsection{GNU based Operating Systems}
|
||||
To build as a shared library for GCC issue the following
|
||||
@ -137,6 +143,13 @@ This requires the ``libtool'' package (common on most Linux/BSD systems). It wi
|
||||
and static then install (by default) into /usr/lib as well as install the header files in /usr/include. The shared
|
||||
library (resource) will be called ``libtommath.la'' while the static library called ``libtommath.a''. Generally
|
||||
you use libtool to link your application against the shared object.
|
||||
|
||||
To run a program to adapt the Toom-Cook cut-off values to your architecture, type
|
||||
\begin{alltt}
|
||||
make -f makefile.shared tune
|
||||
\end{alltt}
|
||||
This will take some time.
|
||||
|
||||
\subsubsection{Microsoft Windows based Operating Systems}
|
||||
There is limited support for making a ``DLL'' in windows via the ``makefile.cygwin\_dll'' makefile. It requires
|
||||
Cygwin to work with since it requires the auto-export/import functionality. The resulting DLL and import library
|
||||
@ -1366,7 +1379,7 @@ should only be used with very large inputs. This is followed by the Karatsuba m
|
||||
sized inputs. Then followed by the Comba and baseline multipliers.
|
||||
|
||||
Fortunately for the developer you don't really need to know this unless you really want to fine tune the system. mp\_mul()
|
||||
will determine on its own\footnote{Some tweaking may be required.} what routine to use automatically when it is called.
|
||||
will determine on its own\footnote{Some tweaking may be required, but \texttt{make tune} will put some reasonable values in \texttt{bncore.c}.} what routine to use automatically when it is called.
|
||||
|
||||
\begin{alltt}
|
||||
int main(void)
|
||||
@ -1448,34 +1461,17 @@ GCC 3.3.1 and an Athlon XP processor the cutoff point is roughly 110 digits (abo
|
||||
Toom-Cook has incredible overhead and is probably only useful for very large inputs. So far no known cutoff points
|
||||
exist and for the most part I just set the cutoff points very high to make sure they're not called.
|
||||
|
||||
A demo program in the ``etc/'' directory of the project called ``tune.c'' can be used to find the cutoff points. This
|
||||
can be built with GCC as follows
|
||||
To get reasonable values for the cut-off points for your architecture, type
|
||||
|
||||
\begin{alltt}
|
||||
make XXX
|
||||
make tune
|
||||
\end{alltt}
|
||||
Where ``XXX'' is one of the following entries from the table \ref{fig:tuning}.
|
||||
|
||||
\begin{figure}[h]
|
||||
\begin{center}
|
||||
\begin{small}
|
||||
\begin{tabular}{|l|l|}
|
||||
\hline \textbf{Value of XXX} & \textbf{Meaning} \\
|
||||
\hline tune & Builds portable tuning application \\
|
||||
\hline tune86 & Builds x86 (pentium and up) program for COFF \\
|
||||
\hline tune86c & Builds x86 program for Cygwin \\
|
||||
\hline tune86l & Builds x86 program for Linux (ELF format) \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
\end{small}
|
||||
\end{center}
|
||||
\caption{Build Names for Tuning Programs}
|
||||
\label{fig:tuning}
|
||||
\end{figure}
|
||||
This will run a benchmark, compute the medians, rewrite \texttt{bncore.c}, recompile it, and relink the library.
|
||||
|
||||
When the program is running it will output a series of measurements for different cutoff points. It will first find
|
||||
good Karatsuba squaring and multiplication points. Then it proceeds to find Toom-Cook points. Note that the Toom-Cook
|
||||
tuning takes a very long time as the cutoff points are likely to be very high.
|
||||
The benchmark itself can be fine-tuned in the file \texttt{etc/tune\_it.sh}.
|
||||
|
||||
The program \texttt{etc/tune} can also print a list of values for plotting curves with, e.g., \texttt{gnuplot}. Type \texttt{./etc/tune -h} to get a list of all available options.
|
||||
|
||||
\chapter{Modular Reduction}
|
||||
|
||||
@ -1846,7 +1842,7 @@ int mp_sqrt (mp_int * a, mp_digit b, mp_int * c)
|
||||
|
||||
\chapter{Logarithm}
|
||||
\section{Integer Logarithm}
|
||||
A logarithm function for positive integer input \texttt{a, base} computing $\floor{\log_bx}$ such that $(\ilog_bx)^b \le x$.
|
||||
A logarithm function for positive integer input \texttt{a, base} computing $\floor{\log_b x}$, i.e. the largest integer $c$ such that $b^c \le x$ (with $x = \texttt{a}$ and $b = \texttt{base}$).
|
||||
\index{mp\_ilogb}
|
||||
\begin{alltt}
|
||||
int mp_ilogb(mp_int *a, mp_digit base, mp_int *c)
|
||||
|
43
etc/makefile
43
etc/makefile
@ -1,4 +1,4 @@
|
||||
CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops -I../
|
||||
CFLAGS += -Wall -W -Wextra -Wshadow -O3 -I../
|
||||
|
||||
# default lib name (requires install with root)
|
||||
# LIBNAME=-ltommath
|
||||
@ -8,43 +8,36 @@ LIBNAME=../libtommath.a
|
||||
|
||||
#provable primes
|
||||
pprime: pprime.o
|
||||
$(CC) pprime.o $(LIBNAME) -o pprime
|
||||
$(CC) $(CFLAGS) pprime.o $(LIBNAME) -o pprime
|
||||
|
||||
# portable [well requires clock()] tuning app
|
||||
tune: tune.o
|
||||
$(CC) tune.o $(LIBNAME) -o tune
|
||||
|
||||
# same app but using RDTSC for higher precision [requires 80586+], coff based gcc installs [e.g. ming, cygwin, djgpp]
|
||||
tune86: tune.c
|
||||
nasm -f coff timer.asm
|
||||
$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86
|
||||
|
||||
# for cygwin
|
||||
tune86c: tune.c
|
||||
nasm -f gnuwin32 timer.asm
|
||||
$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86
|
||||
# The actual benchmark program
|
||||
$(CC) $(CFLAGS) tune.o $(LIBNAME) -o tune
|
||||
# a small script to run it
|
||||
/bin/sh tune_it.sh
|
||||
|
||||
#make tune86 for linux or any ELF format
|
||||
tune86l: tune.c
|
||||
nasm -f elf -DUSE_ELF timer.asm
|
||||
$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86l
|
||||
test_standalone: tune.o
|
||||
# The benchmark program works as a testtool, too
|
||||
$(CC) $(CFLAGS) tune.o $(LIBNAME) -o test
|
||||
|
||||
# spits out mersenne primes
|
||||
mersenne: mersenne.o
|
||||
$(CC) mersenne.o $(LIBNAME) -o mersenne
|
||||
$(CC) $(CFLAGS) mersenne.o $(LIBNAME) -o mersenne
|
||||
|
||||
# fines DR safe primes for the given config
|
||||
# finds DR safe primes for the given config
|
||||
drprime: drprime.o
|
||||
$(CC) drprime.o $(LIBNAME) -o drprime
|
||||
$(CC) $(CFLAGS) drprime.o $(LIBNAME) -o drprime
|
||||
|
||||
# fines 2k safe primes for the given config
|
||||
# finds 2k safe primes for the given config
|
||||
2kprime: 2kprime.o
|
||||
$(CC) 2kprime.o $(LIBNAME) -o 2kprime
|
||||
$(CC) $(CFLAGS) 2kprime.o $(LIBNAME) -o 2kprime
|
||||
|
||||
mont: mont.o
|
||||
$(CC) mont.o $(LIBNAME) -o mont
|
||||
$(CC) $(CFLAGS) mont.o $(LIBNAME) -o mont
|
||||
|
||||
|
||||
clean:
|
||||
rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat \
|
||||
*.da *.dyn *.dpi *~
|
||||
rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime mont 2kprime pprime.dat \
|
||||
tuning_list multiplying squaring test *.da *.dyn *.dpi *~
|
||||
|
||||
|
@ -28,9 +28,11 @@ LIBNAME=../libtommath.a
|
||||
pprime: pprime.o
|
||||
$(CC) pprime.o $(LIBNAME) -o pprime
|
||||
|
||||
# portable [well requires clock()] tuning app
|
||||
tune: tune.o
|
||||
$(CC) tune.o $(LIBNAME) -o tune
|
||||
# The actual benchmark program
|
||||
$(CC) $(CFLAGS) tune.o $(LIBNAME) -o tune
|
||||
# a small script to run it
|
||||
/bin/sh tune_it.sh
|
||||
|
||||
# same app but using RDTSC for higher precision [requires 80586+], coff based gcc installs [e.g. ming, cygwin, djgpp]
|
||||
tune86: tune.c
|
||||
@ -64,4 +66,4 @@ mont: mont.o
|
||||
|
||||
|
||||
clean:
|
||||
rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat *.il
|
||||
rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat *.il tuning_list
|
||||
|
@ -9,10 +9,11 @@ pprime: pprime.obj
|
||||
|
||||
mersenne: mersenne.obj
|
||||
cl mersenne.obj ../tommath.lib
|
||||
|
||||
|
||||
tune: tune.obj
|
||||
cl tune.obj ../tommath.lib
|
||||
|
||||
|
||||
mont: mont.obj
|
||||
cl mont.obj ../tommath.lib
|
||||
|
||||
|
821
etc/tune.c
821
etc/tune.c
@ -2,141 +2,748 @@
|
||||
*
|
||||
* Tom St Denis, tstdenis82@gmail.com
|
||||
*/
|
||||
#include <tommath.h>
|
||||
#include "../tommath.h"
|
||||
#include "../tommath_private.h"
|
||||
#include <stdint.h>
|
||||
#include <time.h>
|
||||
#include <inttypes.h>
|
||||
#include <limits.h>
|
||||
#include <errno.h>
|
||||
|
||||
/* how many times todo each size mult. Depends on your computer. For slow computers
|
||||
* this can be low like 5 or 10. For fast [re: Athlon] should be 25 - 50 or so
|
||||
*/
|
||||
#define TIMES (1UL<<14UL)
|
||||
static uint64_t s_ranval(void);
|
||||
static void s_raninit(uint64_t seed);
|
||||
static int s_mp_random(mp_int *a, int limbs);
|
||||
static uint64_t s_timer_function(void);
|
||||
static void s_timer_start(void);
|
||||
static uint64_t s_timer_stop(void);
|
||||
static uint64_t s_time_mul(int size);
|
||||
static uint64_t s_time_sqr(int size);
|
||||
static void s_usage(char *s);
|
||||
|
||||
#ifndef X86_TIMER
|
||||
/*
|
||||
Please keep in mind that both multiplicands are of the same size. The balancing
|
||||
mechanism in mp_balance works well but has some overhead itself. You can test
|
||||
the behaviour of it with the option "-o" followed by a (small) positive number 'x'
|
||||
to generate ratios of the form 1:x.
|
||||
*/
|
||||
|
||||
/* RDTSC from Scott Duplichan */
|
||||
static uint64_t TIMFUNC(void)
|
||||
/* Bob Jenkins' http://burtleburtle.net/bob/rand/smallprng.html */
|
||||
/* Chosen for speed and a good "mix" */
|
||||
typedef struct ranctx {
|
||||
uint64_t a;
|
||||
uint64_t b;
|
||||
uint64_t c;
|
||||
uint64_t d;
|
||||
} ranctx;
|
||||
|
||||
static ranctx burtle_x;
|
||||
|
||||
# define rot(x,k) (((x)<<(k))|((x)>>(64-(k))))
|
||||
static uint64_t s_ranval(void)
|
||||
{
|
||||
# if defined __GNUC__
|
||||
# if defined(__i386__) || defined(__x86_64__)
|
||||
/* version from http://www.mcs.anl.gov/~kazutomo/rdtsc.html
|
||||
* the old code always got a warning issued by gcc, clang did not complain...
|
||||
*/
|
||||
unsigned hi, lo;
|
||||
__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
|
||||
return ((uint64_t)lo)|(((uint64_t)hi)<<32);
|
||||
# else /* gcc-IA64 version */
|
||||
unsigned long result;
|
||||
__asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
|
||||
while (__builtin_expect((int) result == -1, 0))
|
||||
__asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
|
||||
return result;
|
||||
# endif
|
||||
uint64_t e = burtle_x.a - rot(burtle_x.b, 7);
|
||||
burtle_x.a = burtle_x.b ^ rot(burtle_x.c, 13);
|
||||
burtle_x.b = burtle_x.c + rot(burtle_x.d, 37);
|
||||
burtle_x.c = burtle_x.d + e;
|
||||
burtle_x.d = e + burtle_x.a;
|
||||
return burtle_x.d;
|
||||
}
|
||||
|
||||
/* Microsoft and Intel Windows compilers */
|
||||
# elif defined _M_IX86
|
||||
__asm rdtsc
|
||||
# elif defined _M_AMD64
|
||||
return __rdtsc();
|
||||
# elif defined _M_IA64
|
||||
# if defined __INTEL_COMPILER
|
||||
# include <ia64intrin.h>
|
||||
# endif
|
||||
return __getReg(3116);
|
||||
# else
|
||||
# error need rdtsc function for this build
|
||||
# endif
|
||||
static void s_raninit(uint64_t seed)
|
||||
{
|
||||
uint64_t i;
|
||||
burtle_x.a = 0xf1ea5eed;
|
||||
burtle_x.b = burtle_x.c = burtle_x.d = seed;
|
||||
for (i = 0; i < 20; ++i) {
|
||||
(void) s_ranval();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
The original used LTM's mp_rand which uses the cryptographically secure
|
||||
source of the OS for its purpose. That is too expensive, too slow and
|
||||
most important for a benchmark: it is not repeatable.
|
||||
*/
|
||||
static int s_mp_random(mp_int *a, int limbs)
|
||||
{
|
||||
int e = MP_OKAY;
|
||||
if ((e = mp_grow(a, limbs + 1)) != MP_OKAY) {
|
||||
goto LTM_ERR;
|
||||
}
|
||||
a->used = limbs--;
|
||||
do {
|
||||
a->dp[limbs] = (mp_digit)(s_ranval() & MP_MASK);
|
||||
} while (limbs--);
|
||||
mp_clamp(a);
|
||||
LTM_ERR:
|
||||
return e;
|
||||
}
|
||||
|
||||
static uint64_t s_timer_function(void)
|
||||
{
|
||||
#if _POSIX_C_SOURCE >= 199309L
|
||||
#define LTM_BILLION 1000000000
|
||||
struct timespec ts;
|
||||
|
||||
/* TODO: Sets errno in case of error. Use? */
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return (((uint64_t)ts.tv_sec) * LTM_BILLION + (uint64_t)ts.tv_nsec);
|
||||
#else
|
||||
clock_t t;
|
||||
t = clock();
|
||||
if (t < (clock_t)(0)) {
|
||||
return (uint64_t)(0);
|
||||
}
|
||||
return (uint64_t)(t);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* generic ISO C timer */
|
||||
static uint64_t s_timer_tmp;
|
||||
static void s_timer_start(void)
|
||||
{
|
||||
s_timer_tmp = s_timer_function();
|
||||
}
|
||||
static uint64_t s_timer_stop(void)
|
||||
{
|
||||
return s_timer_function() - s_timer_tmp;
|
||||
}
|
||||
|
||||
|
||||
/* *INDENT-OFF* */
|
||||
/* generic ISO C timer */
|
||||
static uint64_t LBL_T;
|
||||
static void t_start(void) { LBL_T = TIMFUNC(); }
|
||||
static uint64_t t_read(void) { return TIMFUNC() - LBL_T; }
|
||||
/* *INDENT-ON* */
|
||||
static int s_check_result;
|
||||
static int s_number_of_test_loops;
|
||||
static int s_stabilization_extra;
|
||||
static int s_offset = 1;
|
||||
|
||||
#else
|
||||
extern void t_start(void);
|
||||
extern uint64_t t_read(void);
|
||||
#endif
|
||||
|
||||
static uint64_t time_mult(int size, int s)
|
||||
#define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
|
||||
static uint64_t s_time_mul(int size)
|
||||
{
|
||||
unsigned long x;
|
||||
int x, e;
|
||||
mp_int a, b, c, d;
|
||||
uint64_t t1;
|
||||
|
||||
if ((e = mp_init_multi(&a, &b, &c, &d, NULL)) != MP_OKAY) {
|
||||
t1 = UINT64_MAX;
|
||||
goto LTM_ERR;
|
||||
}
|
||||
|
||||
if ((e = s_mp_random(&a, size * s_offset)) != MP_OKAY) {
|
||||
t1 = UINT64_MAX;
|
||||
goto LTM_ERR;
|
||||
}
|
||||
if ((e = s_mp_random(&b, size)) != MP_OKAY) {
|
||||
t1 = UINT64_MAX;
|
||||
goto LTM_ERR;
|
||||
}
|
||||
|
||||
s_timer_start();
|
||||
for (x = 0; x < s_number_of_test_loops; x++) {
|
||||
if ((e = mp_mul(&a,&b,&c)) != MP_OKAY) {
|
||||
t1 = UINT64_MAX;
|
||||
goto LTM_ERR;
|
||||
}
|
||||
if (s_check_result == 1) {
|
||||
if ((e = s_mp_mul(&a,&b,&d)) != MP_OKAY) {
|
||||
t1 = UINT64_MAX;
|
||||
goto LTM_ERR;
|
||||
}
|
||||
if (mp_cmp(&c, &d) != MP_EQ) {
|
||||
/* Time of 0 cannot happen (famous last words?) */
|
||||
t1 = 0uLL;
|
||||
goto LTM_ERR;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t1 = s_timer_stop();
|
||||
LTM_ERR:
|
||||
mp_clear_multi(&a, &b, &c, &d, NULL);
|
||||
return t1;
|
||||
}
|
||||
|
||||
static uint64_t s_time_sqr(int size)
|
||||
{
|
||||
int x, e;
|
||||
mp_int a, b, c;
|
||||
uint64_t t1;
|
||||
|
||||
mp_init(&a);
|
||||
mp_init(&b);
|
||||
mp_init(&c);
|
||||
|
||||
mp_rand(&a, size);
|
||||
mp_rand(&b, size);
|
||||
|
||||
if (s == 1) {
|
||||
KARATSUBA_MUL_CUTOFF = size;
|
||||
} else {
|
||||
KARATSUBA_MUL_CUTOFF = 100000;
|
||||
if ((e = mp_init_multi(&a, &b, &c, NULL)) != MP_OKAY) {
|
||||
t1 = UINT64_MAX;
|
||||
goto LTM_ERR;
|
||||
}
|
||||
|
||||
t_start();
|
||||
for (x = 0; x < TIMES; x++) {
|
||||
mp_mul(&a,&b,&c);
|
||||
if ((e = s_mp_random(&a, size)) != MP_OKAY) {
|
||||
t1 = UINT64_MAX;
|
||||
goto LTM_ERR;
|
||||
}
|
||||
t1 = t_read();
|
||||
mp_clear(&a);
|
||||
mp_clear(&b);
|
||||
mp_clear(&c);
|
||||
|
||||
s_timer_start();
|
||||
for (x = 0; x < s_number_of_test_loops; x++) {
|
||||
if ((e = mp_sqr(&a,&b)) != MP_OKAY) {
|
||||
t1 = UINT64_MAX;
|
||||
goto LTM_ERR;
|
||||
}
|
||||
if (s_check_result == 1) {
|
||||
if ((e = s_mp_sqr(&a,&c)) != MP_OKAY) {
|
||||
t1 = UINT64_MAX;
|
||||
goto LTM_ERR;
|
||||
}
|
||||
if (mp_cmp(&c, &b) != MP_EQ) {
|
||||
t1 = 0uLL;
|
||||
goto LTM_ERR;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t1 = s_timer_stop();
|
||||
LTM_ERR:
|
||||
mp_clear_multi(&a, &b, &c, NULL);
|
||||
return t1;
|
||||
}
|
||||
|
||||
static uint64_t time_sqr(int size, int s)
|
||||
static void s_usage(char *s)
|
||||
{
|
||||
unsigned long x;
|
||||
mp_int a, b;
|
||||
uint64_t t1;
|
||||
|
||||
mp_init(&a);
|
||||
mp_init(&b);
|
||||
|
||||
mp_rand(&a, size);
|
||||
|
||||
if (s == 1) {
|
||||
KARATSUBA_SQR_CUTOFF = size;
|
||||
} else {
|
||||
KARATSUBA_SQR_CUTOFF = 100000;
|
||||
}
|
||||
|
||||
t_start();
|
||||
for (x = 0; x < TIMES; x++) {
|
||||
mp_sqr(&a,&b);
|
||||
}
|
||||
t1 = t_read();
|
||||
mp_clear(&a);
|
||||
mp_clear(&b);
|
||||
return t1;
|
||||
fprintf(stderr,"Usage: %s [TvcpGbtrSLFfMmosh]\n",s);
|
||||
fprintf(stderr," -T testmode, for use with testme.sh\n");
|
||||
fprintf(stderr," -v verbose, print all timings\n");
|
||||
fprintf(stderr," -c check results\n");
|
||||
fprintf(stderr," -p print benchmark of final cutoffs in files \"multiplying\"\n");
|
||||
fprintf(stderr," and \"squaring\"\n");
|
||||
fprintf(stderr," -G [string] suffix for the filenames listed above\n");
|
||||
fprintf(stderr," Implies '-p'\n");
|
||||
fprintf(stderr," -b print benchmark of bncore.c\n");
|
||||
fprintf(stderr," -t prints comma separated results\n");
|
||||
fprintf(stderr," -r [64] number of rounds\n");
|
||||
fprintf(stderr," -S [0xdeadbeef] seed for PRNG\n");
|
||||
fprintf(stderr," -L [3] number of negative values accumulated until the result is accepted\n");
|
||||
fprintf(stderr," -M [3000] upper limit of T-C tests/prints\n");
|
||||
fprintf(stderr," -m [1] increment of T-C tests/prints\n");
|
||||
fprintf(stderr," -o [1] multiplier for the second multiplicand\n");
|
||||
fprintf(stderr," (Not for computing the cut-offs!)\n");
|
||||
fprintf(stderr," -s 'preset' use values in 'preset' for printing.\n");
|
||||
fprintf(stderr," 'preset' is a comma separated string with cut-offs for\n");
|
||||
fprintf(stderr," ksm, kss, tc3m, tc3s in that order\n");
|
||||
fprintf(stderr," ksm = karatsuba multiplication\n");
|
||||
fprintf(stderr," kss = karatsuba squaring\n");
|
||||
fprintf(stderr," tc3m = Toom-Cook 3-way multiplication\n");
|
||||
fprintf(stderr," tc3s = Toom-Cook 3-way squaring\n");
|
||||
fprintf(stderr," Implies '-p'\n");
|
||||
fprintf(stderr," -h this message\n");
|
||||
}
|
||||
|
||||
int main(void)
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
uint64_t t1, t2;
|
||||
int x, y;
|
||||
int x, i, j;
|
||||
int count = 0;
|
||||
|
||||
for (x = 8; ; x += 2) {
|
||||
t1 = time_mult(x, 0);
|
||||
t2 = time_mult(x, 1);
|
||||
printf("%d: %9llu %9llu, %9llu\n", x, t1, t2, t2 - t1);
|
||||
if (t2 < t1) break;
|
||||
int testmode = 0;
|
||||
int verbose = 0;
|
||||
int print = 0;
|
||||
int bncore = 0;
|
||||
int terse = 0;
|
||||
|
||||
int upper_limit_print = 3000;
|
||||
int increment_print = 1;
|
||||
|
||||
int printpreset = 0;
|
||||
/*int preset[8];*/
|
||||
int base = 10;
|
||||
char *endptr, *str;
|
||||
long val;
|
||||
|
||||
uint64_t seed = 0xdeadbeef;
|
||||
|
||||
int opt;
|
||||
int ksm, kss, tc3m, tc3s;
|
||||
|
||||
FILE *squaring, *multiplying;
|
||||
char mullog[256] = "multiplying";
|
||||
char sqrlog[256] = "squaring";
|
||||
s_number_of_test_loops = 64;
|
||||
s_stabilization_extra = 3;
|
||||
|
||||
/* Very simple option parser, please treat it nicely. */
|
||||
if (argc != 1) {
|
||||
for (opt = 1; (opt < argc) && (argv[opt][0] == '-'); opt++) {
|
||||
switch (argv[opt][1]) {
|
||||
case 'T':
|
||||
testmode = 1;
|
||||
s_check_result = 1;
|
||||
upper_limit_print = 1000;
|
||||
increment_print = 11;
|
||||
s_number_of_test_loops = 1;
|
||||
s_stabilization_extra = 1;
|
||||
s_offset = 1;
|
||||
break;
|
||||
case 'v':
|
||||
verbose = 1;
|
||||
break;
|
||||
case 'c':
|
||||
s_check_result = 1;
|
||||
break;
|
||||
case 'p':
|
||||
print = 1;
|
||||
break;
|
||||
case 'G':
|
||||
print = 1;
|
||||
opt++;
|
||||
if (opt >= argc) {
|
||||
s_usage(argv[0]);
|
||||
}
|
||||
/* manual strcat() */
|
||||
for (i = 0; i < 255; i++) {
|
||||
if (mullog[i] == '\0') {
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (j = 0; i < 255; j++, i++) {
|
||||
mullog[i] = argv[opt][j];
|
||||
if (argv[opt][j] == '\0') {
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (i = 0; i < 255; i++) {
|
||||
if (sqrlog[i] == '\0') {
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (j = 0; i < 255; j++, i++) {
|
||||
sqrlog[i] = argv[opt][j];
|
||||
if (argv[opt][j] == '\0') {
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 'b':
|
||||
bncore = 1;
|
||||
break;
|
||||
case 't':
|
||||
terse = 1;
|
||||
break;
|
||||
case 'S':
|
||||
opt++;
|
||||
if (opt >= argc) {
|
||||
s_usage(argv[0]);
|
||||
}
|
||||
str = argv[opt];
|
||||
errno = 0;
|
||||
val = strtol(str, &endptr, base);
|
||||
if ((errno == ERANGE && (val == LONG_MAX || val == LONG_MIN))
|
||||
|| (errno != 0 && val == 0)) {
|
||||
fprintf(stderr,"Seed %s not usable\n", argv[opt]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (endptr == str) {
|
||||
fprintf(stderr, "No seed given?\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
seed = (uint64_t)val;
|
||||
break;
|
||||
case 'L':
|
||||
opt++;
|
||||
if (opt >= argc) {
|
||||
s_usage(argv[0]);
|
||||
}
|
||||
str = argv[opt];
|
||||
errno = 0;
|
||||
val = strtol(str, &endptr, base);
|
||||
if ((val > INT_MAX || val < 0) || (errno != 0)) {
|
||||
fprintf(stderr,"Value %s not usable\n", argv[opt]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (endptr == str) {
|
||||
fprintf(stderr, "No value for option \"-L\"given\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
s_stabilization_extra = (int)val;
|
||||
break;
|
||||
case 'o':
|
||||
opt++;
|
||||
if (opt >= argc) {
|
||||
s_usage(argv[0]);
|
||||
}
|
||||
str = argv[opt];
|
||||
errno = 0;
|
||||
val = strtol(str, &endptr, base);
|
||||
if ((val > INT_MAX || val < 0) || (errno != 0)) {
|
||||
fprintf(stderr,"Value %s not usable as an offset\n", argv[opt]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (endptr == str) {
|
||||
fprintf(stderr, "No value for the offset given\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
s_offset = (int)val;
|
||||
break;
|
||||
case 'r':
|
||||
opt++;
|
||||
if (opt >= argc) {
|
||||
s_usage(argv[0]);
|
||||
}
|
||||
str = argv[opt];
|
||||
errno = 0;
|
||||
val = strtol(str, &endptr, base);
|
||||
if ((val > INT_MAX || val < 0) || (errno != 0)) {
|
||||
fprintf(stderr,"Value %s not usable as the number of rounds for \"-r\"\n", argv[opt]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (endptr == str) {
|
||||
fprintf(stderr, "No value for the number of rounds given\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
s_number_of_test_loops = (int)val;
|
||||
break;
|
||||
|
||||
case 'M':
|
||||
opt++;
|
||||
if (opt >= argc) {
|
||||
s_usage(argv[0]);
|
||||
}
|
||||
str = argv[opt];
|
||||
errno = 0;
|
||||
val = strtol(str, &endptr, base);
|
||||
if ((val > INT_MAX || val < 0) || (errno != 0)) {
|
||||
fprintf(stderr,"Value %s not usable as the upper limit of T-C tests (\"-M\")\n", argv[opt]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (endptr == str) {
|
||||
fprintf(stderr, "No value for the upper limit of T-C tests given\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
upper_limit_print = (int)val;
|
||||
break;
|
||||
case 'm':
|
||||
opt++;
|
||||
if (opt >= argc) {
|
||||
s_usage(argv[0]);
|
||||
}
|
||||
str = argv[opt];
|
||||
errno = 0;
|
||||
val = strtol(str, &endptr, base);
|
||||
if ((val > INT_MAX || val < 0) || (errno != 0)) {
|
||||
fprintf(stderr,"Value %s not usable as the increment for the T-C tests (\"-m\")\n", argv[opt]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (endptr == str) {
|
||||
fprintf(stderr, "No value for the increment for the T-C tests given\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
increment_print = (int)val;
|
||||
break;
|
||||
case 's':
|
||||
printpreset = 1;
|
||||
print = 1;
|
||||
opt++;
|
||||
if (opt >= argc) {
|
||||
s_usage(argv[0]);
|
||||
}
|
||||
str = argv[opt];
|
||||
i = 0;
|
||||
/* Only the most basic checks */
|
||||
errno = 0;
|
||||
val = strtol(str, &endptr, base);
|
||||
if ((val > INT_MAX || val < 0) || (errno != 0)) {
|
||||
fprintf(stderr,"input #%d wrong\n", i+1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (endptr == str) {
|
||||
fprintf(stderr, "No input for #%d?\n", i+1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
i++;
|
||||
str = endptr + 1;
|
||||
KARATSUBA_MUL_CUTOFF = (int)val;
|
||||
errno = 0;
|
||||
val = strtol(str, &endptr, base);
|
||||
if ((val > INT_MAX || val < 0) || (errno != 0)) {
|
||||
fprintf(stderr,"input #%d wrong\n", i+1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (endptr == str) {
|
||||
fprintf(stderr, "No input for #%d?\n", i+1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
i++;
|
||||
str = endptr + 1;
|
||||
KARATSUBA_SQR_CUTOFF = (int)val;
|
||||
errno = 0;
|
||||
val = strtol(str, &endptr, base);
|
||||
if ((val > INT_MAX || val < 0) || (errno != 0)) {
|
||||
fprintf(stderr,"input #%d wrong\n", i+1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (endptr == str) {
|
||||
fprintf(stderr, "No input for #%d?\n", i+1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
i++;
|
||||
str = endptr + 1;
|
||||
TOOM_MUL_CUTOFF = (int)val;
|
||||
errno = 0;
|
||||
val = strtol(str, &endptr, base);
|
||||
if ((val > INT_MAX || val < 0) || (errno != 0)) {
|
||||
fprintf(stderr,"input #%d wrong\n", i+1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (endptr == str) {
|
||||
fprintf(stderr, "No input for #%d?\n", i+1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
i++;
|
||||
str = endptr + 1;
|
||||
TOOM_SQR_CUTOFF = (int)val;
|
||||
case 'h':
|
||||
default:
|
||||
s_usage(argv[0]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
}
|
||||
y = x;
|
||||
|
||||
for (x = 8; ; x += 2) {
|
||||
t1 = time_sqr(x, 0);
|
||||
t2 = time_sqr(x, 1);
|
||||
printf("%d: %9llu %9llu, %9llu\n", x, t1, t2, t2 - t1);
|
||||
if (t2 < t1) break;
|
||||
s_raninit(seed);
|
||||
|
||||
ksm = KARATSUBA_MUL_CUTOFF;
|
||||
kss = KARATSUBA_SQR_CUTOFF;
|
||||
tc3m = TOOM_MUL_CUTOFF;
|
||||
tc3s = TOOM_SQR_CUTOFF;
|
||||
|
||||
if ((bncore == 0) && (printpreset == 0)) {
|
||||
/* Turn all limits from bncore.c to the max */
|
||||
KARATSUBA_MUL_CUTOFF = INT_MAX;
|
||||
KARATSUBA_SQR_CUTOFF = INT_MAX;
|
||||
TOOM_MUL_CUTOFF = INT_MAX;
|
||||
TOOM_SQR_CUTOFF = INT_MAX;
|
||||
#ifdef BN_S_MP_KARATSUBA_MUL_C
|
||||
/*
|
||||
The influence of the Comba multiplication cannot be
|
||||
eradicated programmatically. It depends on the size
|
||||
of the macro MP_WPARRAY in tommath.h which needs to
|
||||
be changed manually (to 0 (zero)).
|
||||
*/
|
||||
if ((verbose == 1) || (testmode == 1)) {
|
||||
puts("# Karatsuba multiplication.");
|
||||
}
|
||||
for (x = 8; x < upper_limit_print; x += increment_print) {
|
||||
KARATSUBA_MUL_CUTOFF = INT_MAX;
|
||||
t1 = s_time_mul(x);
|
||||
if ((t1 == 0uLL) || (t1 == UINT64_MAX)) {
|
||||
fprintf(stderr,"Karatsuba multiplication failed at x = INT_MAX (%s)\n",
|
||||
(t1 == 0uLL)?"wrong result":"internal error");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
KARATSUBA_MUL_CUTOFF = x;
|
||||
t2 = s_time_mul(x);
|
||||
if ((t2 == 0uLL) || (t2 == UINT64_MAX)) {
|
||||
fprintf(stderr,"Karatsuba multiplication failed (%s)\n",
|
||||
(t2 == 0uLL)?"wrong result":"internal error");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (verbose == 1) {
|
||||
printf("%d: %9"PRIu64" %9"PRIu64", %9"PRIi64"\n", x, t1, t2, (int64_t)t2 - (int64_t)t1);
|
||||
}
|
||||
if (t2 < t1) {
|
||||
if (count == s_stabilization_extra) {
|
||||
count = 0;
|
||||
break;
|
||||
} else if (count < s_stabilization_extra) {
|
||||
count++;
|
||||
}
|
||||
} else if (count > 0) {
|
||||
count--;
|
||||
}
|
||||
}
|
||||
KARATSUBA_MUL_CUTOFF = x - s_stabilization_extra * increment_print;
|
||||
#endif
|
||||
#ifdef BN_S_MP_KARATSUBA_SQR_C
|
||||
if ((verbose == 1) || (testmode == 1)) {
|
||||
puts("# Karatsuba squaring.");
|
||||
}
|
||||
for (x = 8; x < upper_limit_print; x += increment_print) {
|
||||
KARATSUBA_SQR_CUTOFF = INT_MAX;
|
||||
t1 = s_time_sqr(x);
|
||||
if ((t1 == 0uLL) || (t1 == UINT64_MAX)) {
|
||||
fprintf(stderr,"Karatsuba squaring failed at x = INT_MAX (%s)\n",
|
||||
(t1 == 0uLL)?"wrong result":"internal error");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
KARATSUBA_SQR_CUTOFF = x;
|
||||
t2 = s_time_sqr(x);
|
||||
if ((t2 == 0uLL) || (t2 == UINT64_MAX)) {
|
||||
fprintf(stderr,"Karatsuba squaring failed (%s)\n",
|
||||
(t2 == 0uLL)?"wrong result":"internal error");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (verbose == 1) {
|
||||
printf("%d: %9"PRIu64" %9"PRIu64", %9"PRIi64"\n", x, t1, t2, (int64_t)t2 - (int64_t)t1);
|
||||
}
|
||||
if (t2 < t1) {
|
||||
if (count == s_stabilization_extra) {
|
||||
count = 0;
|
||||
break;
|
||||
} else if (count < s_stabilization_extra) {
|
||||
count++;
|
||||
}
|
||||
} else if (count > 0) {
|
||||
count--;
|
||||
}
|
||||
}
|
||||
KARATSUBA_SQR_CUTOFF = x - s_stabilization_extra * increment_print;
|
||||
#endif
|
||||
#ifdef BN_S_MP_TOOM_MUL_C
|
||||
if ((verbose == 1) || (testmode == 1)) {
|
||||
puts("# Toom-Cook 3-way multiplying.");
|
||||
}
|
||||
for (x = 8; x < upper_limit_print; x += increment_print) {
|
||||
TOOM_MUL_CUTOFF = INT_MAX;
|
||||
t1 = s_time_mul(x);
|
||||
if ((t1 == 0uLL) || (t1 == UINT64_MAX)) {
|
||||
fprintf(stderr,"Toom-Cook 3-way multiplying failed at x = INT_MAX (%s)\n",
|
||||
(t1 == 0uLL)?"wrong result":"internal error");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
TOOM_MUL_CUTOFF = x;
|
||||
t2 = s_time_mul(x);
|
||||
if ((t2 == 0uLL) || (t2 == UINT64_MAX)) {
|
||||
fprintf(stderr,"Toom-Cook 3-way multiplication failed (%s)\n",
|
||||
(t2 == 0uLL)?"wrong result":"internal error");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (verbose == 1) {
|
||||
printf("%d: %9"PRIu64" %9"PRIu64", %9"PRIi64"\n", x, t1, t2, (int64_t)t2 - (int64_t)t1);
|
||||
}
|
||||
if (t2 < t1) {
|
||||
if (count == s_stabilization_extra) {
|
||||
count = 0;
|
||||
break;
|
||||
} else if (count < s_stabilization_extra) {
|
||||
count++;
|
||||
}
|
||||
} else if (count > 0) {
|
||||
count--;
|
||||
}
|
||||
}
|
||||
TOOM_MUL_CUTOFF = x - s_stabilization_extra * increment_print;
|
||||
#endif
|
||||
#ifdef BN_S_MP_TOOM_SQR_C
|
||||
if ((verbose == 1) || (testmode == 1)) {
|
||||
puts("# Toom-Cook 3-way squaring.");
|
||||
}
|
||||
for (x = 8; x < upper_limit_print; x += increment_print) {
|
||||
TOOM_SQR_CUTOFF = INT_MAX;
|
||||
t1 = s_time_sqr(x);
|
||||
if ((t1 == 0uLL) || (t1 == UINT64_MAX)) {
|
||||
fprintf(stderr,"Toom-Cook 3-way squaring failed at x = INT_MAX (%s)\n",
|
||||
(t1 == 0uLL)?"wrong result":"internal error");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
TOOM_SQR_CUTOFF = x;
|
||||
t2 = s_time_sqr(x);
|
||||
if ((t2 == 0uLL) || (t2 == UINT64_MAX)) {
|
||||
fprintf(stderr,"Toom-Cook 3-way squaring failed (%s)\n",
|
||||
(t2 == 0uLL)?"wrong result":"internal error");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (verbose == 1) {
|
||||
printf("%d: %9"PRIu64" %9"PRIu64", %9"PRIi64"\n", x, t1, t2, (int64_t)t2 - (int64_t)t1);
|
||||
}
|
||||
if (t2 < t1) {
|
||||
if (count == s_stabilization_extra) {
|
||||
count = 0;
|
||||
break;
|
||||
} else if (count < s_stabilization_extra) {
|
||||
count++;
|
||||
}
|
||||
} else if (count > 0) {
|
||||
count--;
|
||||
}
|
||||
}
|
||||
TOOM_SQR_CUTOFF = x - s_stabilization_extra * increment_print;
|
||||
#endif
|
||||
}
|
||||
if (terse == 1) {
|
||||
printf("%d %d %d %d\n",
|
||||
KARATSUBA_MUL_CUTOFF,
|
||||
KARATSUBA_SQR_CUTOFF,
|
||||
TOOM_MUL_CUTOFF,
|
||||
TOOM_SQR_CUTOFF);
|
||||
} else {
|
||||
printf("KARATSUBA_MUL_CUTOFF = %d\n", KARATSUBA_MUL_CUTOFF);
|
||||
printf("KARATSUBA_SQR_CUTOFF = %d\n", KARATSUBA_SQR_CUTOFF);
|
||||
printf("TOOM_MUL_CUTOFF = %d\n", TOOM_MUL_CUTOFF);
|
||||
printf("TOOM_SQR_CUTOFF = %d\n", TOOM_SQR_CUTOFF);
|
||||
}
|
||||
printf("KARATSUBA_MUL_CUTOFF = %d\n", y);
|
||||
printf("KARATSUBA_SQR_CUTOFF = %d\n", x);
|
||||
|
||||
return 0;
|
||||
if (print == 1) {
|
||||
printf("Printing data for graphing to \"%s\" and \"%s\"\n",mullog, sqrlog);
|
||||
|
||||
multiplying = fopen(mullog, "w+");
|
||||
if (multiplying == NULL) {
|
||||
fprintf(stderr, "Opening file \"%s\" failed\n", mullog);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
squaring = fopen(sqrlog, "w+");
|
||||
if (squaring == NULL) {
|
||||
fprintf(stderr, "Opening file \"%s\" failed\n",sqrlog);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
for (x = 8; x < upper_limit_print; x += increment_print) {
|
||||
KARATSUBA_MUL_CUTOFF = INT_MAX;
|
||||
KARATSUBA_SQR_CUTOFF = INT_MAX;
|
||||
TOOM_MUL_CUTOFF = INT_MAX;
|
||||
TOOM_SQR_CUTOFF = INT_MAX;
|
||||
t1 = s_time_mul(x);
|
||||
KARATSUBA_MUL_CUTOFF = kss;
|
||||
KARATSUBA_SQR_CUTOFF = ksm;
|
||||
TOOM_MUL_CUTOFF = tc3m;
|
||||
TOOM_SQR_CUTOFF = tc3s;
|
||||
t2 = s_time_mul(x);
|
||||
fprintf(multiplying, "%d: %9"PRIu64" %9"PRIu64", %9"PRIi64"\n", x, t1, t2, (int64_t)t2 - (int64_t)t1);
|
||||
fflush(multiplying);
|
||||
if (verbose == 1) {
|
||||
printf("MUL %d: %9"PRIu64" %9"PRIu64", %9"PRIi64"\n", x, t1, t2, (int64_t)t2 - (int64_t)t1);
|
||||
fflush(stdout);
|
||||
}
|
||||
KARATSUBA_MUL_CUTOFF = INT_MAX;
|
||||
KARATSUBA_SQR_CUTOFF = INT_MAX;
|
||||
TOOM_MUL_CUTOFF = INT_MAX;
|
||||
TOOM_SQR_CUTOFF = INT_MAX;
|
||||
t1 = s_time_sqr(x);
|
||||
KARATSUBA_MUL_CUTOFF = kss;
|
||||
KARATSUBA_SQR_CUTOFF = ksm;
|
||||
TOOM_MUL_CUTOFF = tc3m;
|
||||
TOOM_SQR_CUTOFF = tc3s;
|
||||
t2 = s_time_sqr(x);
|
||||
fprintf(squaring,"%d: %9"PRIu64" %9"PRIu64", %9"PRIi64"\n", x, t1, t2, (int64_t)t2 - (int64_t)t1);
|
||||
fflush(squaring);
|
||||
if (verbose == 1) {
|
||||
printf("SQR %d: %9"PRIu64" %9"PRIu64", %9"PRIi64"\n", x, t1, t2, (int64_t)t2 - (int64_t)t1);
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
printf("Finished. Data for graphing in \"%s\" and \"%s\"\n",mullog, sqrlog);
|
||||
if (verbose == 1) {
|
||||
KARATSUBA_MUL_CUTOFF = kss;
|
||||
KARATSUBA_SQR_CUTOFF = ksm;
|
||||
TOOM_MUL_CUTOFF = tc3m;
|
||||
TOOM_SQR_CUTOFF = tc3s;
|
||||
if (terse == 1) {
|
||||
printf("%d %d %d %d\n",
|
||||
KARATSUBA_MUL_CUTOFF,
|
||||
KARATSUBA_SQR_CUTOFF,
|
||||
TOOM_MUL_CUTOFF,
|
||||
TOOM_SQR_CUTOFF);
|
||||
} else {
|
||||
printf("KARATSUBA_MUL_CUTOFF = %d\n", KARATSUBA_MUL_CUTOFF);
|
||||
printf("KARATSUBA_SQR_CUTOFF = %d\n", KARATSUBA_SQR_CUTOFF);
|
||||
printf("TOOM_MUL_CUTOFF = %d\n", TOOM_MUL_CUTOFF);
|
||||
printf("TOOM_SQR_CUTOFF = %d\n", TOOM_SQR_CUTOFF);
|
||||
}
|
||||
}
|
||||
}
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
129
etc/tune_it.sh
Executable file
129
etc/tune_it.sh
Executable file
@ -0,0 +1,129 @@
|
||||
#!/bin/sh
|
||||
|
||||
# Report a failed step and terminate the script.
#
# Arguments: $1 - description of the step that failed
#            $2 - exit status to terminate with (optional, defaults to 1)
# Outputs:   two diagnostic lines on stderr
die() {
  # Diagnostics belong on stderr so they cannot end up in redirected data.
  printf '%s failed\n' "$1" >&2
  printf 'Exiting\n' >&2
  # Without the default a missing $2 would make "exit" reuse the status of
  # the preceding (successful) command and report success.
  exit "${2:-1}"
}
|
||||
# A linear congruential generator is sufficient for the purpose.
SEED=3735928559

# Advance the generator by one step and print the new state.
#
# Globals:  SEED (read and written)
# Outputs:  the new value of SEED on stdout
#
# Note: inside a command substitution such as $(LCG) the assignment to SEED
# happens in a subshell and is lost. A caller that wants the sequence to
# advance has to store the printed value back, e.g. SEED=$(LCG).
LCG() {
  SEED=$(((1103515245 * SEED + 12345) % 2147483648))
  printf '%s\n' "$SEED"
}
|
||||
|
||||
# Cheap streaming estimate of the median of a list of integers.
#
# Arguments: $1    - column header, ignored
#            $2    - first sample, used as the starting estimate
#            $3... - further samples; each one moves the estimate by one
#                    towards itself
# Outputs:   the estimate on stdout (0 if there are no samples)
#
# This is an approximation, not an exact median: the result can move away
# from the first sample by at most one per additional sample.
median() {
  median=0
  if [ "$#" -lt 2 ]; then
    # Nothing but (at most) the header was given.
    printf '%s\n' "$median"
    return
  fi
  shift        # drop the header
  median=$1    # the first sample seeds the estimate
  shift
  for val in "$@"; do
    if [ "$median" -lt "$val" ]; then
      median=$((median + 1))
    fi
    if [ "$median" -gt "$val" ]; then
      median=$((median - 1))
    fi
  done
  printf '%s\n' "$median"
}
|
||||
|
||||
# Locate the "tune" binary and "bncore.c" relative to this script instead of
# relative to the caller's working directory, so the script also works when
# it is started as e.g. "./etc/tune_it.sh" from the top-level directory.
SCRIPT_DIR=$(dirname "$0")
MPWD=$(cd "$SCRIPT_DIR" && pwd) || die "Locating the directory of $0" $?
# The list of samples is written to the current working directory.
FILE_NAME="tuning_list"
BNCORE_C="$SCRIPT_DIR/../bncore.c"
BACKUP_SUFFIX=".orig"
RNUM=0
#############################################################################
# It would be a good idea to isolate these processes (e.g. with cpuset).    #
#                                                                           #
# It is not a good idea to e.g. watch high resolution videos while these    #
# tests are running if you do not have enough memory to avoid page faults.  #
#############################################################################

# Number of rounds overall.
LIMIT=100
# Number of loops for each input.
RLOOPS=10
# Offset ( > 0 ). Runs tests with asymmetric input of the form 1:OFFSET
# Please use another destination for BNCORE_C if you change OFFSET, because the numbers
# with an offset different from 1 (one) are not usable as the general cut-off values
# in "bncore.c".
OFFSET=1
# Number ( >= 3 ) of positive results (TC-is-faster) accumulated until it is accepted.
# Due to the algorithm used to compute the median in this Posix compliant shell script
# the value needs to be 3 (three), not less, to keep the variation small.
LAG=3
# Keep the temporary file $FILE_NAME. Set to 0 (zero) to remove it at the end.
# The file is in a format fit to feed into R directly. If you do it and find the median
# of this program to be off by more than a couple: please contact the authors and report
# the numbers from this program and R and the standard deviation. This program is known
# to get larger errors if the standard deviation is larger than ~50.
KEEP_TEMP=1
|
||||
|
||||
echo "You might like to watch the numbers go up to $LIMIT but it will take a long time!"

# Collect $LIMIT samples, one line of four cut-off values per round.
# Might not have sufficient rights or disc full.
echo "km ks tc3m tc3s" > "$FILE_NAME" || die "Writing header to $FILE_NAME" $?
i=1
while [ "$i" -le "$LIMIT" ]; do
  # LCG runs in a subshell here, so its own assignment to SEED is lost.
  # Store the printed value back, otherwise every round would be started
  # with the very same seed.
  SEED=$(LCG) || die "LCG" $?
  RNUM=$SEED
  echo "$i"
  "$MPWD"/tune -t -r "$RLOOPS" -L "$LAG" -S "$RNUM" -o "$OFFSET" >> "$FILE_NAME" || die "tune" $?
  i=$((i + 1))
done
|
||||
|
||||
echo "Writing cut-off values to \"bncore.c\"."
echo "In case of failure: a copy of \"bncore.c\" is in \"bncore.c.orig\""

cp -v "$BNCORE_C" "$BNCORE_C$BACKUP_SUFFIX" || die "Making backup copy of bncore.c" $?

cat << END_OF_INPUT > "$BNCORE_C" || die "Writing header to bncore.c" $?
#include "tommath_private.h"
#ifdef BNCORE_C
/* LibTomMath, multiple-precision integer library -- Tom St Denis */
/* SPDX-License-Identifier: Unlicense */
/*
Current values evaluated on an AMD A8-6600K (64-bit).
Type "make tune" to optimize them for your machine but
be aware that it may take a long time. It took 2:30 minutes
on the aforementioned machine for example.
*/
END_OF_INPUT

# The Posix shell does not offer an array data type, so each entry pairs the
# short column tag used in the sample file with the C variable it tunes.
# The position in the list is the column number in $FILE_NAME.
i=1
for entry in km:KARATSUBA_MUL_CUTOFF ks:KARATSUBA_SQR_CUTOFF \
             tc3m:TOOM_MUL_CUTOFF tc3s:TOOM_SQR_CUTOFF; do
  tag=${entry%%:*}
  name=${entry#*:}
  TMP=$(cut -d' ' -f"$i" "$FILE_NAME") || die "($tag) Reading $FILE_NAME" $?
  # Word splitting is intended: one argument per line of the column.
  # shellcheck disable=SC2086
  TMP=$(median $TMP)
  echo "int $name = $TMP;"
  echo "int $name = $TMP;" >> "$BNCORE_C" || die "($tag) Appending to bncore.c" $?
  i=$((i + 1))
done

echo "#endif" >> "$BNCORE_C" || die "(end) Appending to bncore.c" $?

# Remove the samples only now that they are no longer needed; doing it
# before the medians are computed would leave nothing to read.
if [ "$KEEP_TEMP" -eq 0 ]; then
  rm -v "$FILE_NAME" || die "Removing $FILE_NAME" $?
fi
|
||||
|
4
makefile
4
makefile
@ -108,6 +108,10 @@ mtest:
|
||||
timing: $(LIBNAME) demo/timing.c
|
||||
$(CC) $(CFLAGS) -DTIMER demo/timing.c $(LIBNAME) $(LFLAGS) -o timing
|
||||
|
||||
tune: $(LIBNAME)
|
||||
$(MAKE) -C etc tune
|
||||
$(MAKE)
|
||||
|
||||
# You have to create a file .coveralls.yml with the content "repo_token: <the token>"
|
||||
# in the base folder to be able to submit to coveralls
|
||||
coveralls: lcov
|
||||
|
@ -89,6 +89,10 @@ test_standalone: test.exe
|
||||
|
||||
all: $(LIBMAIN_S) test_standalone
|
||||
|
||||
tune: $(LIBNAME_S)
|
||||
$(MAKE) -C etc tune
|
||||
$(MAKE)
|
||||
|
||||
clean:
|
||||
@-cmd /c del /Q /S *.o *.a *.exe *.dll 2>nul
|
||||
|
||||
|
@ -75,6 +75,10 @@ test_standalone: test.exe
|
||||
|
||||
all: $(LIBMAIN_S) test_standalone
|
||||
|
||||
tune: $(LIBMAIN_S)
|
||||
$(MAKE) -C etc tune
|
||||
$(MAKE)
|
||||
|
||||
clean:
|
||||
@-cmd /c del /Q /S *.OBJ *.LIB *.EXE *.DLL 2>nul
|
||||
|
||||
|
@ -95,3 +95,17 @@ mtest:
|
||||
|
||||
timing: $(LIBNAME) demo/timing.c
|
||||
$(LTLINK) $(CFLAGS) $(LDFLAGS) -DTIMER demo/timing.c $(LIBNAME) -o timing
|
||||
|
||||
# Build the tuning helpers, run the autotuner and rebuild the library with
# the cut-off values it wrote.
tune: $(LIBNAME)
	$(LTCOMPILE) $(CFLAGS) -c etc/tune.c -o etc/tune.o
	$(LTLINK) $(LDFLAGS) -o etc/tune etc/tune.o $(LIBNAME)
	$(LTCOMPILE) $(CFLAGS) -c etc/statistic_summary_single_column.c -o etc/statistic_summary_single_column.o
	$(LTLINK) $(LDFLAGS) -o etc/statistic_summary_single_column etc/statistic_summary_single_column.o
	# Every recipe line runs in its own shell, so the directory change and
	# the script have to share one line.
	cd etc/ && /bin/sh tune_it.sh
	$(MAKE) -f makefile.shared
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -87,6 +87,10 @@ test_standalone: test
|
||||
|
||||
all: $(LIBMAIN_S) test_standalone
|
||||
|
||||
tune: $(LIBMAIN_S)
|
||||
$(MAKE) -C etc tune
|
||||
$(MAKE)
|
||||
|
||||
#NOTE: this makefile works also on cygwin, thus we need to delete *.exe
|
||||
clean:
|
||||
-@rm -f $(OBJECTS) $(LIBMAIN_S)
|
||||
@ -98,3 +102,4 @@ install: $(LIBMAIN_S)
|
||||
@cp $(LIBMAIN_S) $(DESTDIR)$(LIBPATH)/
|
||||
@cp $(HEADERS_PUB) $(DESTDIR)$(INCPATH)/
|
||||
@sed -e 's,^prefix=.*,prefix=$(PREFIX),' -e 's,^Version:.*,Version: $(VERSION),' libtommath.pc.in > $(DESTDIR)$(LIBPATH)/pkgconfig/libtommath.pc
|
||||
|
||||
|
@ -147,7 +147,7 @@ cleancov-clean:
|
||||
cleancov: cleancov-clean clean
|
||||
|
||||
clean:
|
||||
rm -f *.gcda *.gcno *.gcov *.bat *.o *.a *.obj *.lib *.exe *.dll etclib/*.o demo/test.o demo/main.o demo/opponent.o test timing mpitest mtest/mtest mtest/mtest.exe \
|
||||
rm -f *.gcda *.gcno *.gcov *.bat *.o *.a *.obj *.lib *.exe *.dll etclib/*.o demo/test.o demo/main.o demo/opponent.o test timing mpitest mtest/mtest mtest/mtest.exe tuning_list\
|
||||
*.idx *.toc *.log *.aux *.dvi *.lof *.ind *.ilg *.ps *.log *.s mpi.c *.da *.dyn *.dpi tommath.tex `find . -type f | grep [~] | xargs` *.lo *.la
|
||||
rm -rf .libs/
|
||||
${MAKE} -C etc/ clean MAKE=${MAKE}
|
||||
|
32
testme.sh
32
testme.sh
@ -118,11 +118,20 @@ _make()
|
||||
_runtest()
|
||||
{
|
||||
make clean > /dev/null
|
||||
_make "$1" "$2" "test_standalone"
|
||||
local _timeout=""
|
||||
which timeout >/dev/null && _timeout="timeout --foreground 90"
|
||||
echo -e "\rRun test $1 $2"
|
||||
$_timeout ./test > test_${suffix}.log || _die "running tests" $?
|
||||
if [[ "$MAKE_OPTIONS" =~ "tune" ]]
|
||||
then
|
||||
# "make tune" will run "tune_it.sh" automatically, hence "autotune", but it cannot
|
||||
# get switched off without some effort, so we just let it run twice for testing purposes
|
||||
_make "$1" "$2" ""
|
||||
echo -e "\rRun autotune $1 $2"
|
||||
$_timeout ./etc/tune_it.sh > test_${suffix}.log || _die "running autotune" $?
|
||||
else
|
||||
_make "$1" "$2" "test_standalone"
|
||||
echo -e "\rRun test $1 $2"
|
||||
$_timeout ./test > test_${suffix}.log || _die "running tests" $?
|
||||
fi
|
||||
}
|
||||
|
||||
# This is not much more than a C&P of _runtest with a different timeout
|
||||
@ -131,13 +140,24 @@ _runtest()
|
||||
_runvalgrind()
|
||||
{
|
||||
make clean > /dev/null
|
||||
_make "$1" "$2" "test_standalone"
|
||||
local _timeout=""
|
||||
# 30 minutes? Yes. Had it at 20 minutes and the Valgrind run needed over 25 minutes.
|
||||
# A bit too close for comfort.
|
||||
which timeout >/dev/null && _timeout="timeout --foreground 1800"
|
||||
echo -e "\rRun test $1 $2 inside valgrind"
|
||||
$_timeout $VALGRIND_BIN $VALGRIND_OPTS ./test > test_${suffix}.log || _die "running tests" $?
|
||||
echo "MAKE_OPTIONS = \"$MAKE_OPTIONS\""
|
||||
if [[ "$MAKE_OPTIONS" =~ "tune" ]]
|
||||
then
|
||||
echo "autotune branch"
|
||||
_make "$1" "$2" ""
|
||||
# The shell used for /bin/sh is DASH 0.5.7-4ubuntu1 on the author's machine which fails valgrind, so
|
||||
# we just run one instance of etc/tune with the same options as in etc/tune_it.sh
|
||||
echo -e "\rRun etc/tune $1 $2 once inside valgrind"
|
||||
$_timeout $VALGRIND_BIN $VALGRIND_OPTS ./etc/tune -t -r 10 -L 3 > test_${suffix}.log || _die "running etc/tune" $?
|
||||
else
|
||||
_make "$1" "$2" "test_standalone"
|
||||
echo -e "\rRun test $1 $2 inside valgrind"
|
||||
$_timeout $VALGRIND_BIN $VALGRIND_OPTS ./test > test_${suffix}.log || _die "running tests" $?
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user