[tools] Add support for --confidence-level flag to tools/try_perf.py

See example v8_linux64_perf_try(_triggered) builds on this CL triggered using
the following command lines:

  python tools/try_perf.py --linux64 --confidence-level 1 compile
  python tools/try_perf.py --linux64 --confidence-level 1 arewefastyet
  python tools/try_perf.py --linux64 --confidence-level 3 arewefastyet

This also fixes running tools/run_perf.py --help and adds logging for the
current confidence level, which allows users to monitor progress. Example runs:

  https://chrome-swarming.appspot.com/task?id=456e4d6e743cc510 (Compile)
  https://chrome-swarming.appspot.com/task?id=456e5145615aa510 (JetStream)
  https://chrome-swarming.appspot.com/task?id=456e53eeb9104410 (JSBench)
  https://chrome-swarming.appspot.com/task?id=456e541e0e13bc10 (AreWeFastYet)

Finally, this adds support for fractional confidence levels. Example runs:

  https://chrome-swarming.appspot.com/task?id=456e5970e6f24410 (AreWeFastYet)
  https://chrome-swarming.appspot.com/task?id=456e5a8f3f407c10 (Compile)

R=tmrts@chromium.org, machenbach@chromium.org

Bug: chromium:880724
Change-Id: I725a83060c0bdd3ef08a7f0e4df843611c712d37
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1660471
Reviewed-by: Tamer Tas <tmrts@chromium.org>
Commit-Queue: Sergiy Belozorov <sergiyb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#62176}
2019-06-14 12:32:00 +02:00 · 2019-06-14 12:32:00 +02:00 · 1de4631ef4
commit 1de4631ef4
parent 4fb050565a
2 changed files with 22 additions and 9 deletions
--- a/tools/run_perf.py
+++ b/tools/run_perf.py
@@ -266,6 +266,7 @@ class ResultTracker(object):
    mean = numpy.mean(results)
    mean_stderr = numpy.std(results) / numpy.sqrt(len(results))
    logging.debug('  Mean: %.2f, mean_stderr: %.2f', mean, mean_stderr)
+    logging.info('>>> Confidence level is %.2f', mean / (1000.0 * mean_stderr))
    return confidence_level * mean_stderr < mean / 1000.0

  def __str__(self):  # pragma: no cover
@@ -928,16 +929,16 @@ def Main(argv):
                      '--filter=JSTests/TypedArrays/ will run only TypedArray '
                      'benchmarks from the JSTests suite.',
                      default='')
-  parser.add_argument('--confidence-level', type=int,
+  parser.add_argument('--confidence-level', type=float,
                      help='Repeatedly runs each benchmark until specified '
                      'confidence level is reached. The value is interpreted '
                      'as the number of standard deviations from the mean that '
                      'all values must lie within. Typical values are 1, 2 and '
-                      '3 and correspond to 68%, 95% and 99.7% probability that '
-                      'the measured value is within 0.1% of the true value. '
-                      'Larger values result in more retries and thus longer '
-                      'runtime, but also provide more reliable results. Also '
-                      'see --max-total-duration flag.')
+                      '3 and correspond to 68%%, 95%% and 99.7%% probability '
+                      'that the measured value is within 0.1%% of the true '
+                      'value. Larger values result in more retries and thus '
+                      'longer runtime, but also provide more reliable results. '
+                      'Also see --max-total-duration flag.')
  parser.add_argument('--max-total-duration', type=int, default=7140,  # 1h 59m
                      help='Max total duration in seconds allowed for retries '
                      'across all tests. This is especially useful in '
--- a/tools/try_perf.py
+++ b/tools/try_perf.py
@@ -66,6 +66,15 @@ def main():
                           'try server; see its waterfall for more info')
  parser.add_argument('-v', '--verbose', action='store_true',
                      help='Print debug information')
+  parser.add_argument('-c', '--confidence-level', type=float,
+                      help='Repeatedly runs each benchmark until specified '
+                      'confidence level is reached. The value is interpreted '
+                      'as the number of standard deviations from the mean that '
+                      'all values must lie within. Typical values are 1, 2 and '
+                      '3 and correspond to 68%%, 95%% and 99.7%% probability '
+                      'that the measured value is within 0.1%% of the true '
+                      'value. Larger values result in more retries and thus '
+                      'longer runtime, but also provide more reliable results.')
  for option in sorted(BOTS):
    parser.add_argument(
        option, dest='bots', action='append_const', const=BOTS[option],
@@ -98,11 +107,14 @@ def main():

  cmd = ['git cl try', '-B', 'luci.v8-internal.try']
  cmd += ['-b %s' % bot for bot in options.bots]
-  if options.revision: cmd += ['-r %s' % options.revision]
+  if options.revision:
+    cmd.append('-r %s' % options.revision)
  benchmarks = ['"%s"' % benchmark for benchmark in options.benchmarks]
-  cmd += ['-p \'testfilter=[%s]\'' % ','.join(benchmarks)]
+  cmd.append('-p \'testfilter=[%s]\'' % ','.join(benchmarks))
  if options.extra_flags:
-    cmd += ['-p \'extra_flags="%s"\'' % options.extra_flags]
+    cmd.append('-p \'extra_flags="%s"\'' % options.extra_flags)
+  if options.confidence_level:
+    cmd.append('-p confidence_level=%f' % options.confidence_level)
  if options.verbose:
    cmd.append('-vv')
    print('Running %s' % ' '.join(cmd))