[tools] Add support for --confidence-level flag to tools/try_perf.py

See the example v8_linux64_perf_try(_triggered) builds on this CL, which were
triggered using the following command lines:

  python tools/try_perf.py --linux64 --confidence-level 1 compile
  python tools/try_perf.py --linux64 --confidence-level 1 arewefastyet
  python tools/try_perf.py --linux64 --confidence-level 3 arewefastyet

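This also fixes running tools/run_perf.py --help (the literal % signs in the
--confidence-level help text are now escaped as %%, as argparse requires) and
adds logging for the current confidence level, which allows users to monitor
progress. Example runs: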

  https://chrome-swarming.appspot.com/task?id=456e4d6e743cc510 (Compile)
  https://chrome-swarming.appspot.com/task?id=456e5145615aa510 (JetStream)
  https://chrome-swarming.appspot.com/task?id=456e53eeb9104410 (JSBench)
  https://chrome-swarming.appspot.com/task?id=456e541e0e13bc10 (AreWeFastYet)

Finally, this adds support for fractional confidence levels (the flag is now
parsed as a float instead of an int). Example runs:

  https://chrome-swarming.appspot.com/task?id=456e5970e6f24410 (AreWeFastYet)
  https://chrome-swarming.appspot.com/task?id=456e5a8f3f407c10 (Compile)

R=tmrts@chromium.org, machenbach@chromium.org

Bug: chromium:880724
Change-Id: I725a83060c0bdd3ef08a7f0e4df843611c712d37
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1660471
Reviewed-by: Tamer Tas <tmrts@chromium.org>
Commit-Queue: Sergiy Belozorov <sergiyb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#62176}
This commit is contained in:
Sergiy Belozorov 2019-06-14 12:32:00 +02:00 committed by Commit Bot
parent 4fb050565a
commit 1de4631ef4
2 changed files with 22 additions and 9 deletions

View File

@ -266,6 +266,7 @@ class ResultTracker(object):
mean = numpy.mean(results) mean = numpy.mean(results)
mean_stderr = numpy.std(results) / numpy.sqrt(len(results)) mean_stderr = numpy.std(results) / numpy.sqrt(len(results))
logging.debug(' Mean: %.2f, mean_stderr: %.2f', mean, mean_stderr) logging.debug(' Mean: %.2f, mean_stderr: %.2f', mean, mean_stderr)
logging.info('>>> Confidence level is %.2f', mean / (1000.0 * mean_stderr))
return confidence_level * mean_stderr < mean / 1000.0 return confidence_level * mean_stderr < mean / 1000.0
def __str__(self): # pragma: no cover def __str__(self): # pragma: no cover
@ -928,16 +929,16 @@ def Main(argv):
'--filter=JSTests/TypedArrays/ will run only TypedArray ' '--filter=JSTests/TypedArrays/ will run only TypedArray '
'benchmarks from the JSTests suite.', 'benchmarks from the JSTests suite.',
default='') default='')
parser.add_argument('--confidence-level', type=int, parser.add_argument('--confidence-level', type=float,
help='Repeatedly runs each benchmark until specified ' help='Repeatedly runs each benchmark until specified '
'confidence level is reached. The value is interpreted ' 'confidence level is reached. The value is interpreted '
'as the number of standard deviations from the mean that ' 'as the number of standard deviations from the mean that '
'all values must lie within. Typical values are 1, 2 and ' 'all values must lie within. Typical values are 1, 2 and '
'3 and correspond to 68%, 95% and 99.7% probability that ' '3 and correspond to 68%%, 95%% and 99.7%% probability '
'the measured value is within 0.1% of the true value. ' 'that the measured value is within 0.1%% of the true '
'Larger values result in more retries and thus longer ' 'value. Larger values result in more retries and thus '
'runtime, but also provide more reliable results. Also ' 'longer runtime, but also provide more reliable results. '
'see --max-total-duration flag.') 'Also see --max-total-duration flag.')
parser.add_argument('--max-total-duration', type=int, default=7140, # 1h 59m parser.add_argument('--max-total-duration', type=int, default=7140, # 1h 59m
help='Max total duration in seconds allowed for retries ' help='Max total duration in seconds allowed for retries '
'across all tests. This is especially useful in ' 'across all tests. This is especially useful in '

View File

@ -66,6 +66,15 @@ def main():
'try server; see its waterfall for more info') 'try server; see its waterfall for more info')
parser.add_argument('-v', '--verbose', action='store_true', parser.add_argument('-v', '--verbose', action='store_true',
help='Print debug information') help='Print debug information')
parser.add_argument('-c', '--confidence-level', type=float,
help='Repeatedly runs each benchmark until specified '
'confidence level is reached. The value is interpreted '
'as the number of standard deviations from the mean that '
'all values must lie within. Typical values are 1, 2 and '
'3 and correspond to 68%%, 95%% and 99.7%% probability '
'that the measured value is within 0.1%% of the true '
'value. Larger values result in more retries and thus '
'longer runtime, but also provide more reliable results.')
for option in sorted(BOTS): for option in sorted(BOTS):
parser.add_argument( parser.add_argument(
option, dest='bots', action='append_const', const=BOTS[option], option, dest='bots', action='append_const', const=BOTS[option],
@ -98,11 +107,14 @@ def main():
cmd = ['git cl try', '-B', 'luci.v8-internal.try'] cmd = ['git cl try', '-B', 'luci.v8-internal.try']
cmd += ['-b %s' % bot for bot in options.bots] cmd += ['-b %s' % bot for bot in options.bots]
if options.revision: cmd += ['-r %s' % options.revision] if options.revision:
cmd.append('-r %s' % options.revision)
benchmarks = ['"%s"' % benchmark for benchmark in options.benchmarks] benchmarks = ['"%s"' % benchmark for benchmark in options.benchmarks]
cmd += ['-p \'testfilter=[%s]\'' % ','.join(benchmarks)] cmd.append('-p \'testfilter=[%s]\'' % ','.join(benchmarks))
if options.extra_flags: if options.extra_flags:
cmd += ['-p \'extra_flags="%s"\'' % options.extra_flags] cmd.append('-p \'extra_flags="%s"\'' % options.extra_flags)
if options.confidence_level:
cmd.append('-p confidence_level=%f' % options.confidence_level)
if options.verbose: if options.verbose:
cmd.append('-vv') cmd.append('-vv')
print('Running %s' % ' '.join(cmd)) print('Running %s' % ' '.join(cmd))