2015-06-01 17:44:11 +00:00
|
|
|
#!/usr/bin/python
|
2017-01-01 00:14:16 +00:00
|
|
|
# Copyright (C) 2015-2017 Free Software Foundation, Inc.
|
2015-06-01 17:44:11 +00:00
|
|
|
# This file is part of the GNU C Library.
|
|
|
|
#
|
|
|
|
# The GNU C Library is free software; you can redistribute it and/or
|
|
|
|
# modify it under the terms of the GNU Lesser General Public
|
|
|
|
# License as published by the Free Software Foundation; either
|
|
|
|
# version 2.1 of the License, or (at your option) any later version.
|
|
|
|
#
|
|
|
|
# The GNU C Library is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
# Lesser General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU Lesser General Public
|
|
|
|
# License along with the GNU C Library; if not, see
|
|
|
|
# <http://www.gnu.org/licenses/>.
|
|
|
|
"""Compare two benchmark results
|
|
|
|
|
|
|
|
Given two benchmark result files and a threshold, this script compares the
|
|
|
|
benchmark results and flags differences in performance beyond a given
|
|
|
|
threshold.
|
|
|
|
"""
|
|
|
|
import sys
|
|
|
|
import os
|
|
|
|
import pylab
|
|
|
|
import import_bench as bench
|
|
|
|
|
|
|
|
def do_compare(func, var, tl1, tl2, par, threshold):
    """Compare one of the aggregate measurements

    Helper function to compare one of the aggregate measurements of a function
    variant.  A line is printed when the relative difference between the two
    values, expressed as a percentage of the first value, exceeds the
    threshold.  The line is prefixed with '+++' when the first value is the
    larger of the two and with '---' otherwise.

    Args:
        func: Function name
        var: Function variant name
        tl1: The first timings list
        tl2: The second timings list
        par: The aggregate to measure
        threshold: The threshold for differences, beyond which the script should
        print a warning.
    """
    try:
        before = tl1[par]
        after = tl2[par]
    except KeyError:
        # One of the runs does not carry this aggregate; report it and carry
        # on so that the remaining aggregates and variants still get compared.
        sys.stderr.write('%s(%s)[%s]: stat does not exist\n' % (func, var, par))
        return

    try:
        d = abs(after - before) * 100 / before
    except ZeroDivisionError:
        # A percentage change relative to a zero baseline is undefined, so
        # there is nothing meaningful to report.
        return

    if d <= threshold:
        return

    ind = '+++' if before > after else '---'
    print('%s %s(%s)[%s]: (%.2lf%%) from %g to %g' %
          (ind, func, var, par, d, before, after))
|
|
|
|
|
|
|
|
|
|
|
|
def compare_runs(pts1, pts2, threshold):
    """Compare two benchmark runs

    For every function variant in the first run, compare the 'min' and 'mean'
    aggregates against the second run and then, when both runs carry detailed
    timings of equal length, compare those timings pairwise.

    Args:
        pts1: Timing data from first machine
        pts2: Timing data from second machine
        threshold: The percentage difference beyond which a change is
        reported.
    """

    # XXX We assume that the two benchmarks have identical functions and
    # variants.  We cannot compare two benchmarks that may have different
    # functions or variants.  Maybe that is something for the future.
    for func, variants in pts1['functions'].items():
        for var, tl1 in variants.items():
            tl2 = pts2['functions'][func][var]

            # Compare the consolidated numbers.  The comparison of the 'max'
            # aggregate is currently disabled.
            do_compare(func, var, tl1, tl2, 'min', threshold)
            do_compare(func, var, tl1, tl2, 'mean', threshold)

            # Skip over to the next variant or function if there is no detailed
            # timing info for the function variant.  This must not leave the
            # function, otherwise the remaining variants are never compared.
            if 'timings' not in tl1 or 'timings' not in tl2:
                continue

            # If two lists do not have the same length then it is likely that
            # the performance characteristics of the function have changed.
            # XXX: It is also likely that there was some measurement that
            # strayed outside the usual range.  Such outliers should not
            # happen on an idle machine with identical hardware and
            # configuration, but ideal environments are hard to come by.
            if len(tl1['timings']) != len(tl2['timings']):
                print('* %s(%s): Timing characteristics changed' %
                      (func, var))
                print('\tBefore: [%s]' %
                      ', '.join([str(x) for x in tl1['timings']]))
                print('\tAfter: [%s]' %
                      ', '.join([str(x) for x in tl2['timings']]))
                continue

            # Report the individual timings whose difference crosses the
            # threshold we have set.
            for t1, t2 in zip(tl1['timings'], tl2['timings']):
                if t1 == 0:
                    # A percentage change relative to a zero baseline is
                    # undefined; skip the pair instead of dividing by zero.
                    continue

                d = abs(t2 - t1) * 100 / t1
                if d <= threshold:
                    continue

                ind = '-' if t2 > t1 else '+'
                print("%s %s(%s): (%.2lf%%) from %g to %g" %
                      (ind, func, var, d, t1, t2))
|
|
|
|
|
|
|
|
|
|
|
|
def plot_graphs(bench1, bench2):
    """Plot graphs for functions

    Make scatter plots for the functions and their variants.  Points from the
    first benchmark are drawn in red and points from the second in green.  One
    PNG file is written per function variant, named '<func>-<var>.png', or
    '<func>.png' when the variant name is empty.

    A variant is skipped, with a message, when either benchmark has no
    detailed timings for it (including an empty timings list), since there is
    nothing to plot against.

    Args:
        bench1: Set of points from the first machine
        bench2: Set of points from the second machine.
    """
    for func, variants in bench1['functions'].items():
        for var, data1 in variants.items():
            # The second benchmark may lack this function or variant
            # altogether; treat that the same as having no timings.
            data2 = bench2['functions'].get(func, {}).get(var, {})
            timings1 = data1.get('timings')
            timings2 = data2.get('timings')

            # No point trying to print a graph if there are no detailed
            # timings.  An empty list would also make the marker size
            # computation below divide by zero.
            if timings1 is None or timings2 is None or \
                    len(timings1) == 0 or len(timings2) == 0:
                print('Skipping graph for %s(%s)' % (func, var))
                continue

            pylab.clf()
            pylab.ylabel('Time (cycles)')

            # First set of points in red, second set in green.
            for timings, colour in ((timings1, 'r'), (timings2, 'g')):
                length = len(timings)
                X = [float(x) for x in range(length)]
                # The marker size shrinks as the number of points grows.
                lines = pylab.scatter(X, timings, 1.5 + 100 / length)
                pylab.setp(lines, 'color', colour)

            if var:
                filename = "%s-%s.png" % (func, var)
            else:
                filename = "%s.png" % func
            print('Writing out %s' % filename)
            pylab.savefig(filename)
|
|
|
|
|
|
|
|
|
|
|
|
def main(args):
    """Program Entry Point

    Parse the two benchmark output files named on the command line against
    the given schema, plot their timings and report the differences that
    exceed the threshold.

    Args:
        args: Command line arguments without the program name, in the order
        <schema> <file1> <file2> [threshold in %].  The threshold is 10.0
        when it is not given.
    """
    argc = len(args)
    if not 3 <= argc <= 4:
        print('Usage: %s <schema> <file1> <file2> [threshold in %%]' % sys.argv[0])
        sys.exit(os.EX_USAGE)

    schema = args[0]
    bench1 = bench.parse_bench(args[1], schema)
    bench2 = bench.parse_bench(args[2], schema)
    threshold = float(args[3]) if argc == 4 else 10.0

    # Timings of different types cannot be compared with one another.
    if bench1['timing_type'] != bench2['timing_type']:
        print('Cannot compare benchmark outputs: timing types are different')
        return

    # The graphs are drawn before the timings are compressed.
    plot_graphs(bench1, bench2)

    for results in (bench1, bench2):
        bench.compress_timings(results)

    compare_runs(bench1, bench2, threshold)
|
|
|
|
|
|
|
|
|
|
|
|
# Run the comparison only when executed as a script; the program name is
# dropped from the argument list before it is handed to main.
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    main(cli_args)
|