[tools] Implement confidence-based number of runs

R=machenbach@chromium.org, tmrts@chromium.org

Bug: chromium:880724
Change-Id: I2b8ede244fa09868eef384b967223a3788ddd2a1
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1581180
Commit-Queue: Sergiy Belozorov <sergiyb@chromium.org>
Reviewed-by: Michael Achenbach <machenbach@chromium.org>
Cr-Commit-Position: refs/heads/master@{#61370}
Sergiy Belozorov, 2019-05-09 11:05:37 +02:00, committed by Commit Bot
parent 615d61bfc7
commit 80451b07bc
6 changed files with 228 additions and 79 deletions

@ -66,3 +66,11 @@ wheel: <
name: "infra/python/wheels/mock-py2_py3"
version: "version:2.0.0"
>
# Used by:
# tools/run_perf.py
# tools/unittests/run_perf_test.py
wheel: <
name: "infra/python/wheels/numpy/${vpython_platform}"
version: "version:1.11.3"
>

@ -3875,6 +3875,12 @@ group("gn_all") {
}
}
group("v8_python_base") {
data = [
".vpython",
]
}
group("v8_clusterfuzz") {
testonly = true

@ -44,6 +44,7 @@ group("v8_perf") {
testonly = true
data_deps = [
"..:v8_python_base",
"cctest:cctest",
"..:d8",
"../tools:v8_android_test_runner_deps",

@ -43,6 +43,7 @@ group("v8_testrunner") {
testonly = true
data_deps = [
"..:v8_python_base",
"..:v8_dump_build_config",
":v8_android_test_runner_deps",
]

@ -1,4 +1,3 @@
#!/usr/bin/env python
# Copyright 2014 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
@ -120,6 +119,8 @@ import sys
import time
import traceback
import numpy
from testrunner.local import android
from testrunner.local import command
from testrunner.local import utils
@ -142,6 +143,7 @@ RESULT_STDDEV_RE = re.compile(r'^\{([^\}]+)\}$')
RESULT_LIST_RE = re.compile(r'^\[([^\]]+)\]$')
TOOLS_BASE = os.path.abspath(os.path.dirname(__file__))
INFRA_FAILURE_RETCODE = 87
MIN_RUNS_FOR_CONFIDENCE = 10
def GeometricMean(values):
@ -150,7 +152,7 @@ def GeometricMean(values):
The mean is calculated using log to avoid overflow.
"""
values = map(float, values)
return str(math.exp(sum(map(math.log, values)) / len(values)))
return math.exp(sum(map(math.log, values)) / len(values))
class ResultTracker(object):
@ -241,6 +243,42 @@ class ResultTracker(object):
with open(file_name, 'w') as f:
f.write(json.dumps(self.ToDict()))
def HasEnoughRuns(self, graph_config, confidence_level):
"""Checks if the mean of the results for a given trace config is within
0.1% of the true value with the specified confidence level.
This assumes a Gaussian distribution of the noise and is based on
https://en.wikipedia.org/wiki/68%E2%80%9395%E2%80%9399.7_rule.
Args:
graph_config: An instance of GraphConfig.
confidence_level: Number of standard deviations from the mean that all
values must lie within. Typical values are 1, 2 and 3 and correspond
to 68%, 95% and 99.7% probability that the measured value is within
0.1% of the true value.
Returns:
True if the specified confidence level has been achieved.
"""
if not isinstance(graph_config, TraceConfig):
return all(self.HasEnoughRuns(child, confidence_level)
for child in graph_config.children)
trace = self.traces.get(graph_config.name, {})
results = trace.get('results', [])
logging.debug('HasEnoughRuns for %s', graph_config.name)
if len(results) < MIN_RUNS_FOR_CONFIDENCE:
logging.debug(' Ran %d times, need at least %d',
len(results), MIN_RUNS_FOR_CONFIDENCE)
return False
logging.debug(' Results: %d entries', len(results))
mean = numpy.mean(results)
mean_stderr = numpy.std(results) / numpy.sqrt(len(results))
logging.debug(' Mean: %.2f, mean_stderr: %.2f', mean, mean_stderr)
return confidence_level * mean_stderr < mean / 1000.0
def __str__(self): # pragma: no cover
return json.dumps(self.ToDict(), indent=2, separators=(',', ': '))
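
A consequence of the check above that is worth spelling out: solving confidence_level * std / sqrt(n) < mean / 1000 for n gives n > (1000 * confidence_level * std / mean)^2, so the required number of runs grows with the square of the benchmark's relative noise. The helper below is a hypothetical, standalone illustration of that bound; it is not part of the change and assumes the sample standard deviation is already representative and the mean is positive:

import numpy

def runs_needed(sample_results, confidence_level, min_runs=10):
  # Smallest n satisfying confidence_level * std / sqrt(n) < mean / 1000,
  # assuming std and mean stay roughly constant as more runs are added.
  rel_std = numpy.std(sample_results) / numpy.mean(sample_results)
  return max(min_runs, int((1000.0 * confidence_level * rel_std) ** 2) + 1)

For example, a benchmark with 1% relative noise needs on the order of 100 runs at confidence level 1, while one with 0.1% noise stays at the 10-run minimum (MIN_RUNS_FOR_CONFIDENCE).
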
@ -383,8 +421,8 @@ class TraceConfig(GraphConfig):
stddev = None
try:
result = str(float(
re.search(self.results_regexp, output.stdout, re.M).group(1)))
result = float(
re.search(self.results_regexp, output.stdout, re.M).group(1))
except ValueError:
result_tracker.AddError(
'Regexp "%s" returned a non-numeric for test %s.' %
@ -740,6 +778,7 @@ class AndroidPlatform(Platform): # pragma: no cover
output.duration = time.time() - start
return output
class CustomMachineConfiguration:
def __init__(self, disable_aslr = False, governor = None):
self.aslr_backup = None
@ -844,6 +883,12 @@ class CustomMachineConfiguration:
raise Exception('Could not set CPU governor. Present value is %s'
% cur_value )
class MaxTotalDurationReachedError(Exception):
"""Exception used to stop running tests when max total duration is reached."""
pass
def Main(argv):
parser = argparse.ArgumentParser()
parser.add_argument('--arch',
@ -900,12 +945,28 @@ def Main(argv):
'--filter=JSTests/TypedArrays/ will run only TypedArray '
'benchmarks from the JSTests suite.',
default='')
parser.add_argument('--confidence-level', type=int,
help='Repeatedly runs each benchmark until specified '
'confidence level is reached. The value is interpreted '
'as the number of standard deviations from the mean that '
'all values must lie within. Typical values are 1, 2 and '
'3 and correspond to 68%, 95% and 99.7% probability that '
'the measured value is within 0.1% of the true value. '
'Larger values result in more retries and thus longer '
'runtime, but also provide more reliable results. Also '
'see --max-total-duration flag.')
parser.add_argument('--max-total-duration', type=int, default=7140, # 1h 59m
help='Max total duration in seconds allowed for retries '
'across all tests. This is especially useful in '
'combination with the --confidence-level flag.')
parser.add_argument('--dump-logcats-to',
help='Writes logcat output from each test into specified '
'directory. Only supported for android targets.')
parser.add_argument("--run-count", type=int, default=0,
help="Override the run count specified by the test "
"suite. The default 0 uses the suite's config.")
parser.add_argument('--run-count', type=int, default=0,
help='Override the run count specified by the test '
'suite. The default 0 uses the suite\'s config.')
parser.add_argument('-v', '--verbose', default=False, action='store_true',
help='Be verbose and print debug output.')
parser.add_argument('suite', nargs='+', help='Path to the suite config file.')
try:
@ -914,7 +975,8 @@ def Main(argv):
return INFRA_FAILURE_RETCODE
logging.basicConfig(
level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s')
level=logging.DEBUG if args.verbose else logging.INFO,
format='%(asctime)s %(levelname)-8s %(message)s')
if args.arch == 'auto': # pragma: no cover
args.arch = utils.DefaultArch()
@ -973,8 +1035,7 @@ def Main(argv):
result_tracker = ResultTracker()
result_tracker_secondary = ResultTracker()
# We use list here to allow modification in nested function below.
have_failed_tests = [False]
have_failed_tests = False
with CustomMachineConfiguration(governor = args.cpu_governor,
disable_aslr = args.noaslr) as conf:
for path in args.suite:
@ -1000,39 +1061,61 @@ def Main(argv):
platform.PreTests(node, path)
# Traverse graph/trace tree and iterate over all runnables.
for runnable in FlattenRunnables(root, NodeCB):
runnable_name = '/'.join(runnable.graphs)
if (not runnable_name.startswith(args.filter) and
runnable_name + '/' != args.filter):
continue
logging.info('>>> Running suite: %s', runnable_name)
start = time.time()
try:
for runnable in FlattenRunnables(root, NodeCB):
runnable_name = '/'.join(runnable.graphs)
if (not runnable_name.startswith(args.filter) and
runnable_name + '/' != args.filter):
continue
logging.info('>>> Running suite: %s', runnable_name)
for i in range(0, max(1, args.run_count or runnable.run_count)):
attempts_left = runnable.retry_count + 1
while attempts_left:
output, output_secondary = platform.Run(
runnable, i, secondary=args.shell_dir_secondary)
result_tracker.AddRunnableDuration(runnable, output.duration)
result_tracker_secondary.AddRunnableDuration(
runnable, output_secondary.duration)
if output.IsSuccess() and output_secondary.IsSuccess():
runnable.ProcessOutput(output, result_tracker, i)
if output_secondary is not NULL_OUTPUT:
runnable.ProcessOutput(
output_secondary, result_tracker_secondary, i)
break
attempts_left -= 1
if not attempts_left: # ignore failures until last attempt
have_failed_tests[0] = True
def RunGenerator(runnable):
if args.confidence_level:
counter = 0
while not result_tracker.HasEnoughRuns(
runnable, args.confidence_level):
yield counter
counter += 1
else:
logging.info('>>> Retrying suite: %s', runnable_name)
for i in range(0, max(1, args.run_count or runnable.run_count)):
yield i
if runnable.has_timeouts:
result_tracker.timeouts.append(runnable_name)
if runnable.has_near_timeouts:
result_tracker.near_timeouts.append(runnable_name)
for i in RunGenerator(runnable):
attempts_left = runnable.retry_count + 1
while attempts_left:
total_duration = time.time() - start
if total_duration > args.max_total_duration:
logging.info(
'>>> Stopping now since running for too long (%ds > %ds)',
total_duration, args.max_total_duration)
raise MaxTotalDurationReachedError()
output, output_secondary = platform.Run(
runnable, i, secondary=args.shell_dir_secondary)
result_tracker.AddRunnableDuration(runnable, output.duration)
result_tracker_secondary.AddRunnableDuration(
runnable, output_secondary.duration)
if output.IsSuccess() and output_secondary.IsSuccess():
runnable.ProcessOutput(output, result_tracker, i)
if output_secondary is not NULL_OUTPUT:
runnable.ProcessOutput(
output_secondary, result_tracker_secondary, i)
break
attempts_left -= 1
if not attempts_left: # ignore failures until last attempt
have_failed_tests = True
else:
logging.info('>>> Retrying suite: %s', runnable_name)
if runnable.has_timeouts:
result_tracker.timeouts.append(runnable_name)
if runnable.has_near_timeouts:
result_tracker.near_timeouts.append(runnable_name)
except MaxTotalDurationReachedError:
have_failed_tests = True
platform.PostExecution()
@ -1048,7 +1131,7 @@ def Main(argv):
print('Secondary results:', result_tracker_secondary)
if (result_tracker.errors or result_tracker_secondary.errors or
have_failed_tests[0]):
have_failed_tests):
return 1
return 0
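
Putting the run_perf.py changes together: when --confidence-level is passed, RunGenerator keeps yielding run indices until ResultTracker.HasEnoughRuns() holds for every trace of the runnable, while --max-total-duration (default 7140 s) caps the total wall time; hitting the cap raises MaxTotalDurationReachedError, which is treated like a test failure and makes the script exit with status 1. A typical invocation might look like tools/run_perf.py --confidence-level 2 --max-total-duration 3600 path/to/suite.json (the suite path and values are illustrative, and on bots the script presumably runs under vpython so that the numpy wheel declared in .vpython above resolves). The self-contained sketch below mimics that behaviour with a synthetic, normally distributed score; every name in it is illustrative and independent of run_perf.py:

import time

import numpy

def has_enough_runs(results, confidence_level, min_runs=10):
  # Same criterion as ResultTracker.HasEnoughRuns above.
  if len(results) < min_runs:
    return False
  mean = numpy.mean(results)
  mean_stderr = numpy.std(results) / numpy.sqrt(len(results))
  return confidence_level * mean_stderr < mean / 1000.0

def run_until_confident(measure, confidence_level, max_total_duration):
  # Keep measuring until the confidence check passes or the wall-time
  # budget is exhausted, analogous to the new main loop above.
  start, results = time.time(), []
  while not has_enough_runs(results, confidence_level):
    if time.time() - start > max_total_duration:
      break  # run_perf.py reports this as a failure (exit code 1)
    results.append(measure())
  return results

if __name__ == '__main__':
  rng = numpy.random.RandomState(0)

  def noisy_score():
    # Synthetic benchmark score with roughly 0.5% relative noise.
    return rng.normal(loc=100.0, scale=0.5)

  runs = run_until_confident(noisy_score, confidence_level=2,
                             max_total_duration=60)
  # Expect on the order of (1000 * 2 * 0.5 / 100)^2 = 100 runs.
  print('runs needed:', len(runs))
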

@ -7,9 +7,7 @@
from __future__ import print_function
from collections import namedtuple
import coverage
import json
import mock
import os
import platform
import shutil
@ -18,6 +16,9 @@ import sys
import tempfile
import unittest
import coverage
import mock
# Requires python-coverage and python-mock. Native python coverage
# version >= 3.7.1 should be installed to get the best speed.
@ -208,8 +209,8 @@ class PerfTest(unittest.TestCase):
self._MockCommand(['.'], ['x\nRichards: 1.234\nDeltaBlue: 10657567\ny\n'])
self.assertEqual(0, self._CallMain())
self._VerifyResults('test', 'score', [
{'name': 'Richards', 'results': ['1.234'], 'stddev': ''},
{'name': 'DeltaBlue', 'results': ['10657567.0'], 'stddev': ''},
{'name': 'Richards', 'results': [1.234], 'stddev': ''},
{'name': 'DeltaBlue', 'results': [10657567.0], 'stddev': ''},
])
self._VerifyRunnableDurations(1, 60)
self._VerifyErrors([])
@ -223,8 +224,8 @@ class PerfTest(unittest.TestCase):
self._MockCommand(['.'], ['Richards: 1.234\nDeltaBlue: 10657567'])
self.assertEqual(0, self._CallMain())
self._VerifyResults('test', 'score', [
{'name': 'Richards', 'results': ['1.234'], 'stddev': ''},
{'name': 'DeltaBlue', 'results': ['10657567.0'], 'stddev': ''},
{'name': 'Richards', 'results': [1.234], 'stddev': ''},
{'name': 'DeltaBlue', 'results': [10657567.0], 'stddev': ''},
])
self._VerifyErrors([])
self._VerifyMock(os.path.join(
@ -241,8 +242,8 @@ class PerfTest(unittest.TestCase):
'Richards: 50\nDeltaBlue: 300\n'])
self.assertEqual(0, self._CallMain())
self._VerifyResults('v8', 'ms', [
{'name': 'Richards', 'results': ['50.0', '100.0'], 'stddev': ''},
{'name': 'DeltaBlue', 'results': ['300.0', '200.0'], 'stddev': ''},
{'name': 'Richards', 'results': [50.0, 100.0], 'stddev': ''},
{'name': 'DeltaBlue', 'results': [300.0, 200.0], 'stddev': ''},
])
self._VerifyErrors([])
self._VerifyMock(os.path.join(
@ -260,8 +261,57 @@ class PerfTest(unittest.TestCase):
'Richards: 50\nDeltaBlue: 300\n'])
self.assertEqual(0, self._CallMain())
self._VerifyResults('test', 'score', [
{'name': 'Richards', 'results': ['50.0', '100.0'], 'stddev': ''},
{'name': 'DeltaBlue', 'results': ['300.0', '200.0'], 'stddev': ''},
{'name': 'Richards', 'results': [50.0, 100.0], 'stddev': ''},
{'name': 'DeltaBlue', 'results': [300.0, 200.0], 'stddev': ''},
])
self._VerifyErrors([])
self._VerifyMock(os.path.join(
'out', 'x64.release', 'd7'), '--flag', 'run.js')
def testPerfectConfidenceRuns(self):
self._WriteTestInput(V8_JSON)
self._MockCommand(
['.'], ['x\nRichards: 1.234\nDeltaBlue: 10657567\ny\n'] * 10)
self.assertEqual(0, self._CallMain('--confidence-level', '1'))
self._VerifyResults('test', 'score', [
{'name': 'Richards', 'results': [1.234] * 10, 'stddev': ''},
{'name': 'DeltaBlue', 'results': [10657567.0] * 10, 'stddev': ''},
])
self._VerifyErrors([])
self._VerifyMock(os.path.join(
'out', 'x64.release', 'd7'), '--flag', 'run.js')
def testNoisyConfidenceRuns(self):
self._WriteTestInput(V8_JSON)
self._MockCommand(
['.'],
reversed([
# First 10 runs are mandatory. DeltaBlue is slightly noisy.
'x\nRichards: 1.234\nDeltaBlue: 10757567\ny\n',
'x\nRichards: 1.234\nDeltaBlue: 10557567\ny\n',
'x\nRichards: 1.234\nDeltaBlue: 10657567\ny\n',
'x\nRichards: 1.234\nDeltaBlue: 10657567\ny\n',
'x\nRichards: 1.234\nDeltaBlue: 10657567\ny\n',
'x\nRichards: 1.234\nDeltaBlue: 10657567\ny\n',
'x\nRichards: 1.234\nDeltaBlue: 10657567\ny\n',
'x\nRichards: 1.234\nDeltaBlue: 10657567\ny\n',
'x\nRichards: 1.234\nDeltaBlue: 10657567\ny\n',
'x\nRichards: 1.234\nDeltaBlue: 10657567\ny\n',
# Need 4 more runs for confidence in DeltaBlue results.
'x\nRichards: 1.234\nDeltaBlue: 10657567\ny\n',
'x\nRichards: 1.234\nDeltaBlue: 10657567\ny\n',
'x\nRichards: 1.234\nDeltaBlue: 10657567\ny\n',
'x\nRichards: 1.234\nDeltaBlue: 10657567\ny\n',
]),
)
self.assertEqual(0, self._CallMain('--confidence-level', '1'))
self._VerifyResults('test', 'score', [
{'name': 'Richards', 'results': [1.234] * 14, 'stddev': ''},
{
'name': 'DeltaBlue',
'results': [10757567.0, 10557567.0] + [10657567.0] * 12,
'stddev': '',
},
])
self._VerifyErrors([])
self._VerifyMock(os.path.join(
@ -280,15 +330,15 @@ class PerfTest(unittest.TestCase):
self.assertListEqual(sorted([
{'units': 'score',
'graphs': ['test', 'Richards'],
'results': ['50.0', '100.0'],
'results': [50.0, 100.0],
'stddev': ''},
{'units': 'ms',
'graphs': ['test', 'Sub', 'Leaf'],
'results': ['3.0', '2.0', '1.0'],
'results': [3.0, 2.0, 1.0],
'stddev': ''},
{'units': 'score',
'graphs': ['test', 'DeltaBlue'],
'results': ['200.0'],
'results': [200.0],
'stddev': ''},
]), sorted(self._LoadResults()['traces']))
self._VerifyErrors([])
@ -309,8 +359,8 @@ class PerfTest(unittest.TestCase):
'DeltaBlue: 10657567\nDeltaBlue-stddev: 106\n'])
self.assertEqual(0, self._CallMain())
self._VerifyResults('test', 'score', [
{'name': 'Richards', 'results': ['1.234'], 'stddev': '0.23'},
{'name': 'DeltaBlue', 'results': ['10657567.0'], 'stddev': '106'},
{'name': 'Richards', 'results': [1.234], 'stddev': '0.23'},
{'name': 'DeltaBlue', 'results': [10657567.0], 'stddev': '106'},
])
self._VerifyErrors([])
self._VerifyMock(
@ -327,8 +377,8 @@ class PerfTest(unittest.TestCase):
'DeltaBlue: 5\nDeltaBlue-stddev: 0.8\n'])
self.assertEqual(1, self._CallMain())
self._VerifyResults('test', 'score', [
{'name': 'Richards', 'results': ['2.0', '3.0'], 'stddev': '0.7'},
{'name': 'DeltaBlue', 'results': ['5.0', '6.0'], 'stddev': '0.8'},
{'name': 'Richards', 'results': [2.0, 3.0], 'stddev': '0.7'},
{'name': 'DeltaBlue', 'results': [5.0, 6.0], 'stddev': '0.8'},
])
self._VerifyErrors(
['Test test/Richards should only run once since a stddev is provided '
@ -348,8 +398,8 @@ class PerfTest(unittest.TestCase):
mock.MagicMock(return_value={'is_android': False})).start()
self.assertEqual(0, self._CallMain('--buildbot'))
self._VerifyResults('test', 'score', [
{'name': 'Richards', 'results': ['1.234'], 'stddev': ''},
{'name': 'DeltaBlue', 'results': ['10657567.0'], 'stddev': ''},
{'name': 'Richards', 'results': [1.234], 'stddev': ''},
{'name': 'DeltaBlue', 'results': [10657567.0], 'stddev': ''},
])
self._VerifyErrors([])
self._VerifyMock(os.path.join('out', 'Release', 'd7'), '--flag', 'run.js')
@ -364,9 +414,9 @@ class PerfTest(unittest.TestCase):
mock.MagicMock(return_value={'is_android': False})).start()
self.assertEqual(0, self._CallMain('--buildbot'))
self._VerifyResults('test', 'score', [
{'name': 'Richards', 'results': ['1.234'], 'stddev': ''},
{'name': 'DeltaBlue', 'results': ['10657567.0'], 'stddev': ''},
{'name': 'Total', 'results': ['3626.49109719'], 'stddev': ''},
{'name': 'Richards', 'results': [1.234], 'stddev': ''},
{'name': 'DeltaBlue', 'results': [10657567.0], 'stddev': ''},
{'name': 'Total', 'results': [3626.491097190233], 'stddev': ''},
])
self._VerifyErrors([])
self._VerifyMock(os.path.join('out', 'Release', 'd7'), '--flag', 'run.js')
@ -381,7 +431,7 @@ class PerfTest(unittest.TestCase):
mock.MagicMock(return_value={'is_android': False})).start()
self.assertEqual(1, self._CallMain('--buildbot'))
self._VerifyResults('test', 'score', [
{'name': 'DeltaBlue', 'results': ['10657567.0'], 'stddev': ''},
{'name': 'DeltaBlue', 'results': [10657567.0], 'stddev': ''},
])
self._VerifyErrors(
['Regexp "^Richards: (.+)$" '
@ -395,7 +445,7 @@ class PerfTest(unittest.TestCase):
self._MockCommand(['.'], ['x\nRichaards: 1.234\nDeltaBlue: 10657567\ny\n'])
self.assertEqual(1, self._CallMain())
self._VerifyResults('test', 'score', [
{'name': 'DeltaBlue', 'results': ['10657567.0'], 'stddev': ''},
{'name': 'DeltaBlue', 'results': [10657567.0], 'stddev': ''},
])
self._VerifyErrors(
['Regexp "^Richards: (.+)$" did not match for test test/Richards.'])
@ -442,8 +492,8 @@ class PerfTest(unittest.TestCase):
return_value={'is_android': True}).start()
self.assertEqual(0, self._CallMain('--arch', 'arm'))
self._VerifyResults('test', 'score', [
{'name': 'Richards', 'results': ['1.234'], 'stddev': ''},
{'name': 'DeltaBlue', 'results': ['10657567.0'], 'stddev': ''},
{'name': 'Richards', 'results': [1.234], 'stddev': ''},
{'name': 'DeltaBlue', 'results': [10657567.0], 'stddev': ''},
])
def testTwoRuns_Trybot(self):
@ -462,12 +512,12 @@ class PerfTest(unittest.TestCase):
'--json-test-results-secondary', test_output_secondary,
))
self._VerifyResults('test', 'score', [
{'name': 'Richards', 'results': ['100.0', '200.0'], 'stddev': ''},
{'name': 'DeltaBlue', 'results': ['20.0', '20.0'], 'stddev': ''},
{'name': 'Richards', 'results': [100.0, 200.0], 'stddev': ''},
{'name': 'DeltaBlue', 'results': [20.0, 20.0], 'stddev': ''},
])
self._VerifyResults('test', 'score', [
{'name': 'Richards', 'results': ['50.0', '100.0'], 'stddev': ''},
{'name': 'DeltaBlue', 'results': ['200.0', '200.0'], 'stddev': ''},
{'name': 'Richards', 'results': [50.0, 100.0], 'stddev': ''},
{'name': 'DeltaBlue', 'results': [200.0, 200.0], 'stddev': ''},
], test_output_secondary)
self._VerifyRunnableDurations(2, 60, test_output_secondary)
self._VerifyErrors([])
@ -486,8 +536,8 @@ class PerfTest(unittest.TestCase):
self._MockCommand(['.'], ['x\nRichards: 1.234\nDeltaBlue: 10657567\ny\n'])
self.assertEqual(0, self._CallMain('--extra-flags=--prof'))
self._VerifyResults('test', 'score', [
{'name': 'Richards', 'results': ['1.234'], 'stddev': ''},
{'name': 'DeltaBlue', 'results': ['10657567.0'], 'stddev': ''},
{'name': 'Richards', 'results': [1.234], 'stddev': ''},
{'name': 'DeltaBlue', 'results': [10657567.0], 'stddev': ''},
])
self._VerifyErrors([])
self._VerifyMock(os.path.join('out', 'x64.release', 'd7'),
@ -514,13 +564,13 @@ class PerfTest(unittest.TestCase):
{
'units': 'score',
'graphs': ['test1', 'Richards'],
'results': [u'1.2', u'1.2'],
'results': [1.2, 1.2],
'stddev': '',
},
{
'units': 'score',
'graphs': ['test1', 'DeltaBlue'],
'results': [u'2.1', u'2.1'],
'results': [2.1, 2.1],
'stddev': '',
},
]), sorted(results['traces']))
@ -532,13 +582,13 @@ class PerfTest(unittest.TestCase):
{
'units': 'score',
'graphs': ['test2', 'Richards'],
'results': [u'1.2', u'1.2'],
'results': [1.2, 1.2],
'stddev': '',
},
{
'units': 'score',
'graphs': ['test2', 'DeltaBlue'],
'results': [u'2.1', u'2.1'],
'results': [2.1, 2.1],
'stddev': '',
},
], results['traces'])
@ -550,13 +600,13 @@ class PerfTest(unittest.TestCase):
{
'units': 'score',
'graphs': ['test3', 'Octane', 'Richards'],
'results': [u'1.2'],
'results': [1.2],
'stddev': '',
},
{
'units': 'score',
'graphs': ['test3', 'Octane', 'DeltaBlue'],
'results': [u'2.1'],
'results': [2.1],
'stddev': '',
},
], results['traces'])
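
Why testNoisyConfidenceRuns expects exactly four extra runs: the two noisy DeltaBlue samples are symmetric around 10657567 (one at +100000, one at -100000), so the mean stays at 10657567 and the sum of squared deviations stays at 2e10 no matter how many nominal runs follow. The standard error of the mean is therefore sqrt(2e10 / n) / sqrt(n) = sqrt(2e10) / n, roughly 141421 / n, and with --confidence-level 1 the check 141421 / n < 10657567 / 1000 first holds at n = 14 (141421 / 13 is about 10879, still above 10657.6). Richards is constant, so its standard error is zero and it is satisfied as soon as the 10-run minimum is reached. A small standalone check of that arithmetic (illustrative only, not part of the test suite):

import numpy

# Mirrors the DeltaBlue samples mocked in testNoisyConfidenceRuns above.
delta_blue = [10757567.0, 10557567.0] + [10657567.0] * 12

for n in range(10, 15):
  results = delta_blue[:n]
  mean_stderr = numpy.std(results) / numpy.sqrt(n)
  print(n, round(mean_stderr, 1), mean_stderr < numpy.mean(results) / 1000.0)
# Prints False for n = 10..13 and True at n = 14, which matches the
# "Need 4 more runs" comment in the mocked output.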