Refine bench_record and bench_playback:

- use high-precision wall timer only
  - warm caches once before measuring
  - measure independent samples, calculating statistics
  - add --verbose to control how much data we output

Also removed some unloved features from bench_record.

BUG=skia:
R=jcgregorio@google.com, mtklein@google.com

Author: mtklein@chromium.org

Review URL: https://codereview.chromium.org/338203002
This commit is contained in:
mtklein 2014-06-16 14:04:32 -07:00 committed by Commit bot
parent ad2ab61b4e
commit 90c471e73f
5 changed files with 197 additions and 65 deletions

View File

@ -76,3 +76,19 @@ void BenchTimer::truncatedEnd() {
fTruncatedCpu = fTruncatedSysTimer->endCpu() * fDurationScale;
fTruncatedWall = fTruncatedSysTimer->endWall() * fDurationScale;
}
WallTimer::WallTimer() : fWall(-1.0), fSysTimer(new BenchSysTimer) {}
WallTimer::~WallTimer() {
delete fSysTimer;
}
void WallTimer::start(double durationScale) {
fDurationScale = durationScale;
fSysTimer->startWall();
}
void WallTimer::end() {
fWall = fSysTimer->endWall() * fDurationScale;
}

View File

@ -39,12 +39,29 @@ public:
double fGpu;
private:
BenchSysTimer *fSysTimer;
BenchSysTimer *fTruncatedSysTimer;
BenchSysTimer* fSysTimer;
BenchSysTimer* fTruncatedSysTimer;
#if SK_SUPPORT_GPU
BenchGpuTimer *fGpuTimer;
BenchGpuTimer* fGpuTimer;
#endif
double fDurationScale; // for this start/end session
};
// Same as BenchTimer above, supporting only fWall but with much lower overhead.
// (Typically, ~30ns instead of BenchTimer's ~1us.)
class WallTimer {
public:
WallTimer();
~WallTimer();
void start(double durationScale = 1);
void end();
double fWall;
private:
BenchSysTimer* fSysTimer;
double fDurationScale;
};
#endif

32
tools/Stats.h Normal file
View File

@ -0,0 +1,32 @@
#ifndef Stats_DEFINED
#define Stats_DEFINED
struct Stats {
Stats(const double samples[], int n) {
min = samples[0];
max = samples[0];
for (int i = 0; i < n; i++) {
if (samples[i] < min) { min = samples[i]; }
if (samples[i] > max) { max = samples[i]; }
}
double sum = 0.0;
for (int i = 0 ; i < n; i++) {
sum += samples[i];
}
mean = sum / n;
double err = 0.0;
for (int i = 0 ; i < n; i++) {
err += (samples[i] - mean) * (samples[i] - mean);
}
var = err / (n-1);
}
double min;
double max;
double mean; // Estimate of population mean.
double var; // Estimate of population variance.
};
#endif//Stats_DEFINED

View File

@ -5,7 +5,6 @@
* found in the LICENSE file.
*/
#include "BenchTimer.h"
#include "SkCommandLineFlags.h"
#include "SkForceLinking.h"
#include "SkGraphics.h"
@ -16,19 +15,27 @@
#include "SkStream.h"
#include "SkString.h"
#include "BenchTimer.h"
#include "Stats.h"
typedef WallTimer Timer;
__SK_FORCE_IMAGE_DECODER_LINKING;
DEFINE_string2(skps, r, "skps", "Directory containing SKPs to read and re-record.");
DEFINE_int32(loops, 10, "Number of times to play back each SKP.");
DEFINE_string2(skps, r, "skps", "Directory containing SKPs to playback.");
DEFINE_int32(samples, 10, "Gather this many samples of each picture playback.");
DEFINE_bool(skr, false, "Play via SkRecord instead of SkPicture.");
DEFINE_int32(tile, 1000000000, "Simulated tile size.");
DEFINE_string(match, "", "The usual filters on file names of SKPs to bench.");
DEFINE_string(timescale, "ms", "Print times in ms, us, or ns");
DEFINE_int32(verbose, 0, "0: print min sample; "
"1: print min, mean, max and noise indication "
"2: print all samples");
static double scale_time(double ms) {
if (FLAGS_timescale.contains("us")) ms *= 1000;
if (FLAGS_timescale.contains("ns")) ms *= 1000000;
return ms;
static double timescale() {
if (FLAGS_timescale.contains("us")) return 1000;
if (FLAGS_timescale.contains("ns")) return 1000000;
return 1;
}
static SkPicture* rerecord_with_tilegrid(SkPicture& src) {
@ -49,6 +56,14 @@ static EXPERIMENTAL::SkPlayback* rerecord_with_skr(SkPicture& src) {
return recording.releasePlayback();
}
static void draw(const EXPERIMENTAL::SkPlayback& skr, const SkPicture& skp, SkCanvas* canvas) {
if (FLAGS_skr) {
skr.draw(canvas);
} else {
skp.draw(canvas);
}
}
static void bench(SkPMColor* scratch, SkPicture& src, const char* name) {
SkAutoTUnref<SkPicture> picture(rerecord_with_tilegrid(src));
SkAutoTDelete<EXPERIMENTAL::SkPlayback> record(rerecord_with_skr(src));
@ -59,19 +74,34 @@ static void bench(SkPMColor* scratch, SkPicture& src, const char* name) {
src.width() * sizeof(SkPMColor)));
canvas->clipRect(SkRect::MakeWH(SkIntToScalar(FLAGS_tile), SkIntToScalar(FLAGS_tile)));
BenchTimer timer;
timer.start();
for (int i = 0; i < FLAGS_loops; i++) {
if (FLAGS_skr) {
record->draw(canvas.get());
} else {
picture->draw(canvas.get());
}
}
timer.end();
// Draw once to warm any caches. The first sample otherwise can be very noisy.
draw(*record, *picture, canvas.get());
const double msPerLoop = timer.fCpu / (double)FLAGS_loops;
printf("%f\t%s\n", scale_time(msPerLoop), name);
Timer timer;
SkAutoTMalloc<double> samples(FLAGS_samples);
for (int i = 0; i < FLAGS_samples; i++) {
// We assume timer overhead (typically, ~30ns) is insignificant
// compared to draw runtime (at least ~100us, usually several ms).
timer.start(timescale());
draw(*record, *picture, canvas.get());
timer.end();
samples[i] = timer.fWall;
}
Stats stats(samples.get(), FLAGS_samples);
if (FLAGS_verbose == 0) {
printf("%g\t%s\n", stats.min, name);
} else if (FLAGS_verbose == 1) {
// Get a rough idea of how noisy the measurements were.
const double noisePercent = 100 * sqrt(stats.var) / stats.mean;
printf("%g\t%g\t%g\t±%.0f%%\t%s\n", stats.min, stats.mean, stats.max, noisePercent, name);
} else if (FLAGS_verbose == 2) {
printf("%s", name);
for (int i = 0; i < FLAGS_samples; i++) {
printf("\t%g", samples[i]);
}
printf("\n");
}
}
int tool_main(int argc, char** argv);

View File

@ -5,7 +5,6 @@
* found in the LICENSE file.
*/
#include "BenchTimer.h"
#include "SkCommandLineFlags.h"
#include "SkForceLinking.h"
#include "SkGraphics.h"
@ -15,28 +14,32 @@
#include "SkRecording.h"
#include "SkStream.h"
#include "SkString.h"
#include "BenchTimer.h"
#include "LazyDecodeBitmap.h"
#include "Stats.h"
typedef WallTimer Timer;
__SK_FORCE_IMAGE_DECODER_LINKING;
// Just reading all the SKPs takes about 2 seconds for me, which is the same as about 100 loops of
// rerecording all the SKPs. So we default to --loops=900, which makes ~90% of our time spent in
// recording, and this should take ~20 seconds to run.
DEFINE_string2(skps, r, "skps", "Directory containing SKPs to read and re-record.");
DEFINE_int32(loops, 900, "Number of times to re-record each SKP.");
DEFINE_bool(endRecording, true, "If false, don't time SkPicture::endRecording()");
DEFINE_int32(nullSize, 1000, "Pretend dimension of null source picture.");
DEFINE_int32(samples, 10, "Number of times to re-record each SKP.");
DEFINE_int32(tileGridSize, 512, "Set the tile grid size. Has no effect if bbh is not set to tilegrid.");
DEFINE_string(bbh, "", "Turn on the bbh and select the type, one of rtree, tilegrid, quadtree");
DEFINE_bool(skr, false, "Record SKR instead of SKP.");
DEFINE_string(match, "", "The usual filters on file names of SKPs to bench.");
DEFINE_string(timescale, "us", "Print times in ms, us, or ns");
DEFINE_double(overheadGoal, 0.0001,
"Try to make timer overhead at most this fraction of our sample measurements.");
DEFINE_int32(verbose, 0, "0: print min sample; "
"1: print min, mean, max and noise indication "
"2: print all samples");
static double scale_time(double ms) {
if (FLAGS_timescale.contains("us")) ms *= 1000;
if (FLAGS_timescale.contains("ns")) ms *= 1000000;
return ms;
static double timescale() {
if (FLAGS_timescale.contains("us")) return 1000;
if (FLAGS_timescale.contains("ns")) return 1000000;
return 1;
}
static SkBBHFactory* parse_FLAGS_bbh() {
@ -61,35 +64,62 @@ static SkBBHFactory* parse_FLAGS_bbh() {
return NULL;
}
static void bench_record(SkPicture* src, const char* name, SkBBHFactory* bbhFactory) {
BenchTimer timer;
timer.start();
const int width = src ? src->width() : FLAGS_nullSize;
const int height = src ? src->height() : FLAGS_nullSize;
for (int i = 0; i < FLAGS_loops; i++) {
if (FLAGS_skr) {
EXPERIMENTAL::SkRecording recording(width, height);
if (NULL != src) {
src->draw(recording.canvas());
}
// Release and delete the SkPlayback so that recording optimizes its SkRecord.
SkDELETE(recording.releasePlayback());
} else {
SkPictureRecorder recorder;
SkCanvas* canvas = recorder.beginRecording(width, height, bbhFactory);
if (NULL != src) {
src->draw(canvas);
}
if (FLAGS_endRecording) {
SkAutoTUnref<SkPicture> dst(recorder.endRecording());
}
}
static void rerecord(const SkPicture& src, SkBBHFactory* bbhFactory) {
if (FLAGS_skr) {
EXPERIMENTAL::SkRecording recording(src.width(), src.height());
src.draw(recording.canvas());
// Release and delete the SkPlayback so that recording optimizes its SkRecord.
SkDELETE(recording.releasePlayback());
} else {
SkPictureRecorder recorder;
src.draw(recorder.beginRecording(src.width(), src.height(), bbhFactory));
SkAutoTUnref<SkPicture> dst(recorder.endRecording());
}
timer.end();
}
const double msPerLoop = timer.fCpu / (double)FLAGS_loops;
printf("%f\t%s\n", scale_time(msPerLoop), name);
static void bench_record(const SkPicture& src,
const double timerOverhead,
const char* name,
SkBBHFactory* bbhFactory) {
// Rerecord once to warm up any caches. Otherwise the first sample can be very noisy.
rerecord(src, bbhFactory);
// Rerecord once to see how many times we should loop to make timer overhead insignificant.
Timer timer;
do {
timer.start(timescale());
rerecord(src, bbhFactory);
timer.end();
} while (timer.fWall < timerOverhead); // Loop just in case something bizarre happens.
// We want (timer overhead / measurement) to be less than FLAGS_overheadGoal.
// So in each sample, we'll loop enough times to have made that true for our first measurement.
const int loops = (int)ceil(timerOverhead / timer.fWall / FLAGS_overheadGoal);
SkAutoTMalloc<double> samples(FLAGS_samples);
for (int i = 0; i < FLAGS_samples; i++) {
timer.start(timescale());
for (int j = 0; j < loops; j++) {
rerecord(src, bbhFactory);
}
timer.end();
samples[i] = timer.fWall / loops;
}
Stats stats(samples.get(), FLAGS_samples);
if (FLAGS_verbose == 0) {
printf("%g\t%s\n", stats.min, name);
} else if (FLAGS_verbose == 1) {
// Get a rough idea of how noisy the measurements were.
const double noisePercent = 100 * sqrt(stats.var) / stats.mean;
printf("%g\t%g\t%g\t±%.0f%%\t%s\n", stats.min, stats.mean, stats.max, noisePercent, name);
} else if (FLAGS_verbose == 2) {
printf("%s", name);
for (int i = 0; i < FLAGS_samples; i++) {
printf("\t%g", samples[i]);
}
printf("\n");
}
}
int tool_main(int argc, char** argv);
@ -103,10 +133,17 @@ int tool_main(int argc, char** argv) {
}
SkAutoTDelete<SkBBHFactory> bbhFactory(parse_FLAGS_bbh());
bench_record(NULL, "NULL", bbhFactory.get());
if (FLAGS_skps.isEmpty()) {
return 0;
// Each run will use this timer overhead estimate to guess how many times it should run.
static const int kOverheadLoops = 10000000;
Timer timer;
double overheadEstimate = 0.0;
for (int i = 0; i < kOverheadLoops; i++) {
timer.start(timescale());
timer.end();
overheadEstimate += timer.fWall;
}
overheadEstimate /= kOverheadLoops;
SkOSFile::Iter it(FLAGS_skps[0], ".skp");
SkString filename;
@ -131,7 +168,7 @@ int tool_main(int argc, char** argv) {
failed = true;
continue;
}
bench_record(src, filename.c_str(), bbhFactory.get());
bench_record(*src, overheadEstimate, filename.c_str(), bbhFactory.get());
}
return failed ? 1 : 0;
}