Add math benchmark latency test

This patch further improves math function benchmarking by adding a latency
test in addition to the existing throughput test, which enables more accurate
comparisons between math functions. The latency test works by creating a
dependency on the result of the previous iteration:
func_res = F (func_res * zero + input[i]). Multiplying the previous result by
zero leaves the input value unchanged while still forcing each call to wait
for the one before it.
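
To illustrate the difference, here is a minimal sketch of the two measurement
loops (the names are illustrative; the real loops are generated via the
BENCH_FUNC and BENCH_FUNC_LAT macros in the diff below):

   /* Throughput: calls are independent, so an out-of-order CPU can
      overlap several calls in flight.  */
   for (i = 0; i < n; i++)
     res = F (input[i]);

   /* Latency: func_res * zero is always 0.0, so the argument value is
      unchanged, but the dependency on the previous result forces the
      calls to execute one after another.  */
   for (i = 0; i < n; i++)
     func_res = F (func_res * zero + input[i]);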

It reports reciprocal throughput and latency in nanoseconds (the exact unit
depends on the timing header used) and max/min throughput in iterations per
second:

   "workload-spec2006.wrf": {
    "reciprocal-throughput": 100,
    "latency": 200,
    "max-throughput": 1.0e+07,
    "min-throughput": 5.0e+06
   }
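
The fields are mutually consistent: max-throughput is the inverse of
reciprocal-throughput (1 / 100 ns = 1.0e+07 calls/s), and min-throughput is
the inverse of latency (1 / 200 ns = 5.0e+06 calls/s).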

	* benchtests/bench-skeleton.c (main): Add support for
	latency benchmarking.
	* benchtests/scripts/bench.py: Add support for latency benchmarking.
Author: Wilco Dijkstra
Date:   2017-08-17 16:27:20 +01:00
commit d4505b895f
parent 34d6a3cbf2
3 changed files with 43 additions and 10 deletions

--- a/ChangeLog
+++ b/ChangeLog

@@ -1,3 +1,9 @@
+2017-08-17  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* benchtests/bench-skeleton.c (main): Add support for
+	latency benchmarking.
+	* benchtests/scripts/bench.py: Add support for latency benchmarking.
+
 2017-08-17  H.J. Lu  <hongjiu.lu@intel.com>
 
 	* Makeconfig (+link-pie-before-libc): Add CRT-* hook to override

--- a/benchtests/bench-skeleton.c
+++ b/benchtests/bench-skeleton.c

@@ -71,8 +71,10 @@ main (int argc, char **argv)
       bool is_bench = strncmp (VARIANT (v), "workload-", 9) == 0;
       double d_total_i = 0;
       timing_t total = 0, max = 0, min = 0x7fffffffffffffff;
+      timing_t throughput = 0, latency = 0;
       int64_t c = 0;
       uint64_t cur;
+      BENCH_VARS;
       while (1)
 	{
 	  if (is_bench)
@@ -86,7 +88,16 @@ main (int argc, char **argv)
 		  BENCH_FUNC (v, i);
 	      TIMING_NOW (end);
 	      TIMING_DIFF (cur, start, end);
-	      TIMING_ACCUM (total, cur);
+	      TIMING_ACCUM (throughput, cur);
+
+	      TIMING_NOW (start);
+	      for (k = 0; k < iters; k++)
+		for (i = 0; i < NUM_SAMPLES (v); i++)
+		  BENCH_FUNC_LAT (v, i);
+	      TIMING_NOW (end);
+	      TIMING_DIFF (cur, start, end);
+	      TIMING_ACCUM (latency, cur);
+
 	      d_total_i += iters * NUM_SAMPLES (v);
 	    }
 	  else
@@ -131,12 +142,20 @@ main (int argc, char **argv)
       /* Begin variant.  */
       json_attr_object_begin (&json_ctx, VARIANT (v));
 
-      json_attr_double (&json_ctx, "duration", d_total_s);
-      json_attr_double (&json_ctx, "iterations", d_total_i);
       if (is_bench)
-	json_attr_double (&json_ctx, "throughput", d_total_s / d_total_i);
+	{
+	  json_attr_double (&json_ctx, "reciprocal-throughput",
+			    throughput / d_total_i);
+	  json_attr_double (&json_ctx, "latency", latency / d_total_i);
+	  json_attr_double (&json_ctx, "max-throughput",
+			    d_total_i / throughput * 1000000000.0);
+	  json_attr_double (&json_ctx, "min-throughput",
+			    d_total_i / latency * 1000000000.0);
+	}
       else
 	{
+	  json_attr_double (&json_ctx, "duration", d_total_s);
+	  json_attr_double (&json_ctx, "iterations", d_total_i);
 	  json_attr_double (&json_ctx, "max", max / d_iters);
 	  json_attr_double (&json_ctx, "min", min / d_iters);
 	  json_attr_double (&json_ctx, "mean", d_total_s / d_total_i);

--- a/benchtests/scripts/bench.py
+++ b/benchtests/scripts/bench.py

@@ -45,7 +45,7 @@ DEFINES_TEMPLATE = '''
 # variant is represented by the _VARIANT structure.  The ARGS structure
 # represents a single set of arguments.
 STRUCT_TEMPLATE = '''
-#define CALL_BENCH_FUNC(v, i) %(func)s (%(func_args)s)
+#define CALL_BENCH_FUNC(v, i, x) %(func)s (x %(func_args)s)
 
 struct args
 {
@@ -84,7 +84,9 @@ EPILOGUE = '''
 #define RESULT(__v, __i) (variants[(__v)].in[(__i)].timing)
 #define RESULT_ACCUM(r, v, i, old, new) \\
 	((RESULT ((v), (i))) = (RESULT ((v), (i)) * (old) + (r)) / ((new) + 1))
-#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j);})
+#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j, );})
+#define BENCH_FUNC_LAT(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j, %(latarg)s);})
+#define BENCH_VARS %(defvar)s
 #define FUNCNAME "%(func)s"
 #include "bench-skeleton.c"'''
@@ -122,17 +124,23 @@ def gen_source(func, directives, all_vals):
     # If we have a return value from the function, make sure it is
     # assigned to prevent the compiler from optimizing out the
     # call.
+    getret = ''
+    latarg = ''
+    defvar = ''
+
     if directives['ret']:
         print('static %s volatile ret;' % directives['ret'])
-        getret = 'ret = '
-    else:
-        getret = ''
+        print('static %s zero __attribute__((used)) = 0;' % directives['ret'])
+        getret = 'ret = func_res = '
+        # Note this may not work if argument and result type are incompatible.
+        latarg = 'func_res * zero +'
+        defvar = '%s func_res = 0;' % directives['ret']
 
     # Test initialization.
     if directives['init']:
         print('#define BENCH_INIT %s' % directives['init'])
 
-    print(EPILOGUE % {'getret': getret, 'func': func})
+    print(EPILOGUE % {'getret': getret, 'func': func, 'latarg': latarg, 'defvar': defvar })
 
 def _print_arg_data(func, directives, all_vals):
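
For a concrete picture of what the generated macros expand to, consider a
hypothetical one-argument double function such as sin (the argument field
name below is made up for the example; the real one comes from the generated
args struct):

   /* From STRUCT_TEMPLATE, with func = sin (illustrative):  */
   #define CALL_BENCH_FUNC(v, i, x) sin (x variants[v].in[i].arg0)

   /* BENCH_FUNC passes an empty third argument, so the call is plain:  */
   ret = func_res = sin (variants[v].in[i].arg0);

   /* BENCH_FUNC_LAT inserts latarg ("func_res * zero +"), chaining each
      call to the previous result without changing the argument value:  */
   ret = func_res = sin (func_res * zero + variants[v].in[i].arg0);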