skia2/tools/ok_srcs.cpp

215 lines
5.7 KiB
C++
Raw Normal View History

/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
ok, add a bench source This new source acts like other sources (GMs, SKPs) for benchmarks. It times multiple samples (controlled by samples=N, default 20), and each of those samples uses the same strategy as monobench, growing loops exponentially until it runs for at least 10ms. When done it prints the fastest and the two slowest samples. In practice the 100th percentile sample is very different from the next slowest due to caching, and the fastest is always interesting. Because these benchmarks run in whatever execution engine ok has selected, on non-Windows platforms you have some real control over the interaction between benchmarks. In its default "fork" mode each benchmark runs independently in its own process, so the 100th percentiles really stand out. The other modes "thread" and "serial" work as you'd expect too. Here's an example where you can see how the different interactions work: out/ok bench:samples=100 8888 filter:search=text_16_AA fork [text_16_AA_WT] 2.32µs @0 6.23µs @99 24.3ms @100 [text_16_AA_FF] 2.41µs @0 5.7µs @99 23.3ms @100 [text_16_AA_88] 2.55µs @0 5.6µs @99 24.8ms @100 [text_16_AA_BK] 1.97µs @0 5.44µs @99 23.2ms @100 out/ok bench:samples=100 8888 filter:search=text_16_AA thread [text_16_AA_FF] 2.45µs @0 23.5µs @99 24.8ms @100 [text_16_AA_WT] 2.52µs @0 17.8µs @99 24.7ms @100 [text_16_AA_88] 2.55µs @0 19.7µs @99 25.1ms @100 [text_16_AA_BK] 1.8µs @0 14.7µs @99 25.1ms @100 out/ok bench:samples=100 8888 filter:search=text_16_AA serial [text_16_AA_88] 2.35µs @0 3.53µs @99 16.7ms @100 [text_16_AA_FF] 2.09µs @0 2.73µs @99 2.91µs @100 [text_16_AA_BK] 1.75µs @0 2.46µs @99 2.65µs @100 [text_16_AA_WT] 2.1µs @0 3.16µs @99 3.17µs @100 In the first "fork" case all runs are independent and have roughly the same profile. "thread" looks similar except you can see them contending at the 99th percentile. In "serial", the first bench warms up the rest, so their 100th percentiles are all much faster. 
Change-Id: I01a9f8c54b540221a9f232b271bb8ef3fda2569c Reviewed-on: https://skia-review.googlesource.com/33585 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@chromium.org>
2017-08-11 14:37:35 +00:00
#include "Benchmark.h"
2017-05-03 19:16:58 +00:00
#include "SkData.h"
#include "SkOSFile.h"
#include "SkPicture.h"
ok, add a bench source This new source acts like other sources (GMs, SKPs) for benchmarks. It times multiple samples (controlled by samples=N, default 20), and each of those samples uses the same strategy as monobench, growing loops exponentially until it runs for at least 10ms. When done it prints the fastest and the two slowest samples. In practice the 100th percentile sample is very different from the next slowest due to caching, and the fastest is always interesting. Because these benchmarks run in whatever execution engine ok has selected, on non-Windows platforms you have some real control over the interaction between benchmarks. In its default "fork" mode each benchmark runs independently in its own process, so the 100th percentiles really stand out. The other modes "thread" and "serial" work as you'd expect too. Here's an example where you can see how the different interactions work: out/ok bench:samples=100 8888 filter:search=text_16_AA fork [text_16_AA_WT] 2.32µs @0 6.23µs @99 24.3ms @100 [text_16_AA_FF] 2.41µs @0 5.7µs @99 23.3ms @100 [text_16_AA_88] 2.55µs @0 5.6µs @99 24.8ms @100 [text_16_AA_BK] 1.97µs @0 5.44µs @99 23.2ms @100 out/ok bench:samples=100 8888 filter:search=text_16_AA thread [text_16_AA_FF] 2.45µs @0 23.5µs @99 24.8ms @100 [text_16_AA_WT] 2.52µs @0 17.8µs @99 24.7ms @100 [text_16_AA_88] 2.55µs @0 19.7µs @99 25.1ms @100 [text_16_AA_BK] 1.8µs @0 14.7µs @99 25.1ms @100 out/ok bench:samples=100 8888 filter:search=text_16_AA serial [text_16_AA_88] 2.35µs @0 3.53µs @99 16.7ms @100 [text_16_AA_FF] 2.09µs @0 2.73µs @99 2.91µs @100 [text_16_AA_BK] 1.75µs @0 2.46µs @99 2.65µs @100 [text_16_AA_WT] 2.1µs @0 3.16µs @99 3.17µs @100 In the first "fork" case all runs are independent and have roughly the same profile. "thread" looks similar except you can see them contending at the 99th percentile. In "serial", the first bench warms up the rest, so their 100th percentiles are all much faster. 
Change-Id: I01a9f8c54b540221a9f232b271bb8ef3fda2569c Reviewed-on: https://skia-review.googlesource.com/33585 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@chromium.org>
2017-08-11 14:37:35 +00:00
#include "Timer.h"
#include "gm.h"
#include "ok.h"
#include <algorithm>
#include <chrono>
#include <limits>
#include <stdlib.h>
#include <vector>
// Stream of GM sources: walks the skiagm::GMRegistry list and yields one Src
// per registry node.
struct GMStream : Stream {
    // Registry node for the next GM to hand out; null once the list is exhausted.
    const skiagm::GMRegistry* registry = skiagm::GMRegistry::Head();

    // Builds a stream positioned at the registry head.  The Options argument is unused.
    static std::unique_ptr<Stream> Create(Options) {
        GMStream gms;
        return move_unique(gms);
    }

    // A single GM, constructed lazily from its factory the first time it is needed.
    struct GMSrc : Src {
        skiagm::GM* (*factory)(void*);
        std::unique_ptr<skiagm::GM> gm;

        // Instantiates the GM on first use; later calls are no-ops.
        void init() {
            if (!gm) {
                gm.reset(factory(nullptr));
            }
        }

        // Name reported by the GM itself.
        std::string name() override {
            this->init();
            return gm->getName();
        }

        // Dimensions reported by the GM itself.
        SkISize size() override {
            this->init();
            return gm->getISize();
        }

        // Clears the canvas to 0xffffffff, then lets the GM draw into it.
        Status draw(SkCanvas* canvas) override {
            this->init();
            canvas->clear(0xffffffff);
            gm->draw(canvas);
            return Status::OK;
        }
    };

    // Wraps the current registry node's factory in a GMSrc and advances to the
    // following node.  Returns nullptr when no nodes remain.
    std::unique_ptr<Src> next() override {
        if (registry == nullptr) {
            return nullptr;
        }

        GMSrc gmSrc;
        gmSrc.factory = registry->factory();
        registry = registry->next();
        return move_unique(gmSrc);
    }
};
// File-local Register object associating the name "gm" and its description
// with GMStream::Create.  Register is declared outside this file (presumably
// in ok.h -- confirm).
static Register gm{"gm", "draw GMs linked into this binary", GMStream::Create};
struct SKPStream : Stream {
std::string dir;
std::vector<std::string> skps;
static std::unique_ptr<Stream> Create(Options options) {
SKPStream stream;
stream.dir = options("dir", "skps");
SkOSFile::Iter it{stream.dir.c_str(), ".skp"};
for (SkString path; it.next(&path); ) {
stream.skps.push_back(path.c_str());
}
return move_unique(stream);
}
struct SKPSrc : Src {
std::string dir, path;
sk_sp<SkPicture> pic;
void init() {
if (pic) { return; }
auto skp = SkData::MakeFromFileName((dir+"/"+path).c_str());
pic = SkPicture::MakeFromData(skp.get());
}
std::string name() override {
return path;
}
SkISize size() override {
this->init();
return pic->cullRect().roundOut().size();
}
Status draw(SkCanvas* canvas) override {
this->init();
canvas->clear(0xffffffff);
pic->playback(canvas);
return Status::OK;
}
};
std::unique_ptr<Src> next() override {
if (skps.empty()) {
return nullptr;
}
SKPSrc src;
src.dir = dir;
src.path = skps.back();
skps.pop_back();
return move_unique(src);
}
};
// File-local Register object associating the name "skp" and its description
// with SKPStream::Create.  Register is declared outside this file (presumably
// in ok.h -- confirm).
static Register skp{"skp", "draw SKPs from dir=skps", SKPStream::Create};
ok, add a bench source This new source acts like other sources (GMs, SKPs) for benchmarks. It times multiple samples (controlled by samples=N, default 20), and each of those samples uses the same strategy as monobench, growing loops exponentially until it runs for at least 10ms. When done it prints the fastest and the two slowest samples. In practice the 100th percentile sample is very different from the next slowest due to caching, and the fastest is always interesting. Because these benchmarks run in whatever execution engine ok has selected, on non-Windows platforms you have some real control over the interaction between benchmarks. In its default "fork" mode each benchmark runs independently in its own process, so the 100th percentiles really stand out. The other modes "thread" and "serial" work as you'd expect too. Here's an example where you can see how the different interactions work: out/ok bench:samples=100 8888 filter:search=text_16_AA fork [text_16_AA_WT] 2.32µs @0 6.23µs @99 24.3ms @100 [text_16_AA_FF] 2.41µs @0 5.7µs @99 23.3ms @100 [text_16_AA_88] 2.55µs @0 5.6µs @99 24.8ms @100 [text_16_AA_BK] 1.97µs @0 5.44µs @99 23.2ms @100 out/ok bench:samples=100 8888 filter:search=text_16_AA thread [text_16_AA_FF] 2.45µs @0 23.5µs @99 24.8ms @100 [text_16_AA_WT] 2.52µs @0 17.8µs @99 24.7ms @100 [text_16_AA_88] 2.55µs @0 19.7µs @99 25.1ms @100 [text_16_AA_BK] 1.8µs @0 14.7µs @99 25.1ms @100 out/ok bench:samples=100 8888 filter:search=text_16_AA serial [text_16_AA_88] 2.35µs @0 3.53µs @99 16.7ms @100 [text_16_AA_FF] 2.09µs @0 2.73µs @99 2.91µs @100 [text_16_AA_BK] 1.75µs @0 2.46µs @99 2.65µs @100 [text_16_AA_WT] 2.1µs @0 3.16µs @99 3.17µs @100 In the first "fork" case all runs are independent and have roughly the same profile. "thread" looks similar except you can see them contending at the 99th percentile. In "serial", the first bench warms up the rest, so their 100th percentiles are all much faster. 
Change-Id: I01a9f8c54b540221a9f232b271bb8ef3fda2569c Reviewed-on: https://skia-review.googlesource.com/33585 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@chromium.org>
2017-08-11 14:37:35 +00:00
// Stream of benchmark sources: walks the BenchRegistry list and yields one Src
// per registry node, each configured to take `samples` timing samples.
struct BenchStream : Stream {
    // Registry node for the next benchmark to hand out; null once exhausted.
    const BenchRegistry* registry = BenchRegistry::Head();
    // Number of timing samples per benchmark; Create() clamps it to at least 1.
    int samples;

    // Reads the "samples" option (falling back to "20"), parses it with atoi,
    // and clamps the result to a minimum of 1.
    static std::unique_ptr<Stream> Create(Options options) {
        BenchStream result;
        const auto requested = options("samples", "20");
        const int parsed = atoi(requested.c_str());
        result.samples = std::max(1, parsed);
        return move_unique(result);
    }

    // A single benchmark, constructed lazily from its factory.
    struct BenchSrc : Src {
        Benchmark* (*factory)(void*);
        std::unique_ptr<Benchmark> bench;
        int samples;

        // Instantiates the benchmark on first use; later calls are no-ops.
        void init() {
            if (!bench) {
                bench.reset(factory(nullptr));
            }
        }

        // Name reported by the benchmark itself.
        std::string name() override {
            this->init();
            return bench->getName();
        }

        // Builds an SkISize from the x and y of the benchmark's getSize().
        SkISize size() override {
            this->init();
            const auto width  = bench->getSize().x();
            const auto height = bench->getSize().y();
            return { width, height };
        }

        // Times the benchmark `samples` times and logs the fastest sample, the
        // second slowest (when samples > 2) and the slowest (when samples > 1).
        Status draw(SkCanvas* canvas) override {
            this->init();

            using Millis = std::chrono::duration<double, std::milli>;
            using Clock  = std::chrono::high_resolution_clock;

            std::vector<Millis> timings(samples);

            bench->delayedSetup();
            if (canvas) {
                bench->perCanvasPreDraw(canvas);
            }

            for (Millis& slot : timings) {
                // Double the loop count until one timed run takes at least 10ms,
                // then record the per-loop time.  If no run gets there before the
                // loop count passes 1000000000, the slot keeps its initial zero.
                for (int loops = 1; loops < 1000000000; loops *= 2) {
                    bench->preDraw(canvas);
                    const auto begin = Clock::now();
                    bench->draw(loops, canvas);
                    const Millis elapsed = Clock::now() - begin;
                    bench->postDraw(canvas);

                    if (elapsed.count() >= 10) {
                        slot = elapsed / loops;
                        break;
                    }
                }
            }

            if (canvas) {
                bench->perCanvasPostDraw(canvas);
            }

            std::sort(timings.begin(), timings.end());

            // Fastest sample is always reported, labelled "@0".
            const auto fastest = HumanizeMs(timings[0].count());
            SkString msg = SkStringPrintf("%s\t@0", fastest.c_str());

            // Second-slowest sample, labelled with 100*(samples-1)/samples.
            if (samples > 2) {
                const auto nextSlowest = HumanizeMs(timings[samples-2].count());
                const double percentile = 100.0*(samples-1) / samples;
                msg.appendf("\t%s\t@%g", nextSlowest.c_str(), percentile);
            }

            // Slowest sample, labelled "@100".
            if (samples > 1) {
                const auto slowest = HumanizeMs(timings[samples-1].count());
                msg.appendf("\t%s\t@100", slowest.c_str());
            }

            ok_log(msg.c_str());
            return Status::OK;
        }
    };

    // Wraps the current registry node's factory in a BenchSrc carrying this
    // stream's sample count, then advances to the following node.
    // Returns nullptr when no nodes remain.
    std::unique_ptr<Src> next() override {
        if (registry == nullptr) {
            return nullptr;
        }

        BenchSrc benchSrc;
        benchSrc.samples = samples;
        benchSrc.factory = registry->factory();
        registry = registry->next();
        return move_unique(benchSrc);
    }
};
// File-local Register object associating the name "bench" and its description
// with BenchStream::Create.  Register is declared outside this file
// (presumably in ok.h -- confirm).
static Register bench{
"bench",
"time benchmarks linked into this binary samples=20 times each",
BenchStream::Create,
};