Add math benchmark latency test

This patch further improves math function benchmarking by adding a latency
test in addition to the existing throughput test, which enables more accurate
comparisons between math functions. The latency test works by creating a
dependency on the result of the previous iteration:
func_res = F (func_res * zero + input[i]). Multiplying the previous result by
zero leaves the input value unchanged while still forcing each call to wait
for the one before it.
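
To illustrate the difference, here is a minimal sketch of the two measurement
loops (the names are illustrative; the real loops are generated via the
BENCH_FUNC and BENCH_FUNC_LAT macros in the diff below):

   /* Throughput: calls are independent, so an out-of-order CPU can
      overlap several calls in flight.  */
   for (i = 0; i < n; i++)
     res = F (input[i]);

   /* Latency: func_res * zero is always 0.0, so the argument value is
      unchanged, but the dependency on the previous result forces the
      calls to execute one after another.  */
   for (i = 0; i < n; i++)
     func_res = F (func_res * zero + input[i]);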

It reports reciprocal throughput and latency in nanoseconds (the exact unit
depends on the timing header used) and max/min throughput in iterations per
second:

   "workload-spec2006.wrf": {
    "reciprocal-throughput": 100,
    "latency": 200,
    "max-throughput": 1.0e+07,
    "min-throughput": 5.0e+06
   }
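
The fields are mutually consistent: max-throughput is the inverse of
reciprocal-throughput (1 / 100 ns = 1.0e+07 calls/s), and min-throughput is
the inverse of latency (1 / 200 ns = 5.0e+06 calls/s).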

	* benchtests/bench-skeleton.c (main): Add support for
	latency benchmarking.
	* benchtests/scripts/bench.py: Add support for latency benchmarking.
Author: Wilco Dijkstra
Date:   2017-08-17 16:27:20 +01:00
commit d4505b895f
parent 34d6a3cbf2
3 changed files with 43 additions and 10 deletions

--- a/ChangeLog
+++ b/ChangeLog

@@ -1,3 +1,9 @@
+2017-08-17  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* benchtests/bench-skeleton.c (main): Add support for
+	latency benchmarking.
+	* benchtests/scripts/bench.py: Add support for latency benchmarking.
+
 2017-08-17  H.J. Lu  <hongjiu.lu@intel.com>
 
 	* Makeconfig (+link-pie-before-libc): Add CRT-* hook to override

--- a/benchtests/bench-skeleton.c
+++ b/benchtests/bench-skeleton.c

@@ -71,8 +71,10 @@ main (int argc, char **argv)
       bool is_bench = strncmp (VARIANT (v), "workload-", 9) == 0;
       double d_total_i = 0;
       timing_t total = 0, max = 0, min = 0x7fffffffffffffff;
+      timing_t throughput = 0, latency = 0;
       int64_t c = 0;
       uint64_t cur;
+      BENCH_VARS;
       while (1)
 	{
 	  if (is_bench)
@@ -86,7 +88,16 @@ main (int argc, char **argv)
 		  BENCH_FUNC (v, i);
 	      TIMING_NOW (end);
 	      TIMING_DIFF (cur, start, end);
-	      TIMING_ACCUM (total, cur);
+	      TIMING_ACCUM (throughput, cur);
+
+	      TIMING_NOW (start);
+	      for (k = 0; k < iters; k++)
+		for (i = 0; i < NUM_SAMPLES (v); i++)
+		  BENCH_FUNC_LAT (v, i);
+	      TIMING_NOW (end);
+	      TIMING_DIFF (cur, start, end);
+	      TIMING_ACCUM (latency, cur);
+
 	      d_total_i += iters * NUM_SAMPLES (v);
 	    }
 	  else
@@ -131,12 +142,20 @@ main (int argc, char **argv)
       /* Begin variant.  */
       json_attr_object_begin (&json_ctx, VARIANT (v));
 
-      json_attr_double (&json_ctx, "duration", d_total_s);
-      json_attr_double (&json_ctx, "iterations", d_total_i);
       if (is_bench)
-	json_attr_double (&json_ctx, "throughput", d_total_s / d_total_i);
+	{
+	  json_attr_double (&json_ctx, "reciprocal-throughput",
+			    throughput / d_total_i);
+	  json_attr_double (&json_ctx, "latency", latency / d_total_i);
+	  json_attr_double (&json_ctx, "max-throughput",
+			    d_total_i / throughput * 1000000000.0);
+	  json_attr_double (&json_ctx, "min-throughput",
+			    d_total_i / latency * 1000000000.0);
+	}
       else
 	{
+	  json_attr_double (&json_ctx, "duration", d_total_s);
+	  json_attr_double (&json_ctx, "iterations", d_total_i);
 	  json_attr_double (&json_ctx, "max", max / d_iters);
 	  json_attr_double (&json_ctx, "min", min / d_iters);
 	  json_attr_double (&json_ctx, "mean", d_total_s / d_total_i);

--- a/benchtests/scripts/bench.py
+++ b/benchtests/scripts/bench.py

@@ -45,7 +45,7 @@ DEFINES_TEMPLATE = '''
 # variant is represented by the _VARIANT structure.  The ARGS structure
 # represents a single set of arguments.
 STRUCT_TEMPLATE = '''
-#define CALL_BENCH_FUNC(v, i) %(func)s (%(func_args)s)
+#define CALL_BENCH_FUNC(v, i, x) %(func)s (x %(func_args)s)
 
 struct args
 {
@@ -84,7 +84,9 @@ EPILOGUE = '''
 #define RESULT(__v, __i) (variants[(__v)].in[(__i)].timing)
 #define RESULT_ACCUM(r, v, i, old, new) \\
 	((RESULT ((v), (i))) = (RESULT ((v), (i)) * (old) + (r)) / ((new) + 1))
-#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j);})
+#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j, );})
+#define BENCH_FUNC_LAT(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j, %(latarg)s);})
+#define BENCH_VARS %(defvar)s
 #define FUNCNAME "%(func)s"
 #include "bench-skeleton.c"'''
@@ -122,17 +124,23 @@ def gen_source(func, directives, all_vals):
     # If we have a return value from the function, make sure it is
     # assigned to prevent the compiler from optimizing out the
     # call.
+    getret = ''
+    latarg = ''
+    defvar = ''
+
     if directives['ret']:
         print('static %s volatile ret;' % directives['ret'])
-        getret = 'ret = '
-    else:
-        getret = ''
+        print('static %s zero __attribute__((used)) = 0;' % directives['ret'])
+        getret = 'ret = func_res = '
+        # Note this may not work if argument and result type are incompatible.
+        latarg = 'func_res * zero +'
+        defvar = '%s func_res = 0;' % directives['ret']
 
     # Test initialization.
     if directives['init']:
         print('#define BENCH_INIT %s' % directives['init'])
 
-    print(EPILOGUE % {'getret': getret, 'func': func})
+    print(EPILOGUE % {'getret': getret, 'func': func, 'latarg': latarg, 'defvar': defvar })
 
 def _print_arg_data(func, directives, all_vals):
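
For a concrete picture of what the generated macros expand to, consider a
hypothetical one-argument double function such as sin (the argument field
name below is made up for the example; the real one comes from the generated
args struct):

   /* From STRUCT_TEMPLATE, with func = sin (illustrative):  */
   #define CALL_BENCH_FUNC(v, i, x) sin (x variants[v].in[i].arg0)

   /* BENCH_FUNC passes an empty third argument, so the call is plain:  */
   ret = func_res = sin (variants[v].in[i].arg0);

   /* BENCH_FUNC_LAT inserts latarg ("func_res * zero +"), chaining each
      call to the previous result without changing the argument value:  */
   ret = func_res = sin (func_res * zero + variants[v].in[i].arg0);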