/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "GrCaps.h"
#include "GrContextFactory.h"
#include "Benchmark.h"
#include "ResultsWriter.h"
#include "SkCommandLineFlags.h"
#include "SkOSFile.h"
#include "SkStream.h"
#include "SkSurface.h"
#include "SkTime.h"
#include "SkTLList.h"
#include "SkThreadUtils.h"
#include "Stats.h"
#include "Timer.h"
#include "VisualSKPBench.h"
#include "gl/GrGLDefines.h"
#include "../private/SkMutex.h"
#include "../private/SkSemaphore.h"
#include "../private/SkGpuFenceSync.h"

// posix only for now
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

// ceil / sqrt, snprintf / sscanf, strlen
#include <math.h>
#include <stdio.h>
#include <string.h>

/*
 * This is an experimental GPU only benchmarking program.  The initial implementation will only
 * support SKPs.
 */

// Passing this value for --loops asks each bench to be auto-tuned.
static const int kAutoTuneLoops = 0;

static const int kDefaultLoops =
#ifdef SK_DEBUG
    1;
#else
    kAutoTuneLoops;
#endif

// Builds the help text for --loops so that it can mention the auto-tune sentinel value.
static SkString loops_help_txt() {
    SkString help;
    help.printf("Number of times to run each bench. Set this to %d to auto-"
                "tune for each bench. Timings are only reported when auto-tuning.",
                kAutoTuneLoops);
    return help;
}

DEFINE_string(skps, "skps", "Directory to read skps from.");
DEFINE_string2(match, m, nullptr,
               "[~][^]substring[$] [...] of GM name to run.\n"
               "Multiple matches may be separated by spaces.\n"
               "~ causes a matching bench to always be skipped\n"
               "^ requires the start of the bench to match\n"
               "$ requires the end of the bench to match\n"
               "^ and $ requires an exact match\n"
               "If a bench does not match any list entry,\n"
               "it is skipped unless some list entry starts with ~");
DEFINE_int32(gpuFrameLag, 5, "If unknown, estimated maximum number of frames GPU allows to lag.");
DEFINE_int32(samples, 10, "Number of samples to measure for each bench.");
DEFINE_int32(maxLoops, 1000000, "Never run a bench more times than this.");
DEFINE_int32(loops, kDefaultLoops, loops_help_txt().c_str());
DEFINE_double(gpuMs, 5, "Target bench time in millseconds for GPU.");
DEFINE_string2(writePath, w, "", "If set, write bitmaps here as .pngs.");
DEFINE_bool(useBackgroundThread, true, "If false, kilobench will time cpu / gpu work together");
DEFINE_bool(useMultiProcess, true, "If false, kilobench will run all tests in one process");

// Formats a duration given in milliseconds for display.
static SkString humanize(double ms) {
    return HumanizeMs(ms);
}
#define HUMANIZE(ms) humanize(ms).c_str()

namespace kilobench {

/**
 * Enumerates the SKP files named by --skps and hands them out one at a time as benchmarks.
 * Each --skps entry is either a path ending in ".skp" or a directory that is scanned for
 * ".skp" files.
 */
class BenchmarkStream {
public:
    BenchmarkStream() : fCurrentSKP(0) {
        for (int i = 0; i < FLAGS_skps.count(); i++) {
            if (SkStrEndsWith(FLAGS_skps[i], ".skp")) {
                fSKPs.push_back() = FLAGS_skps[i];
            } else {
                SkOSFile::Iter it(FLAGS_skps[i], ".skp");
                SkString path;
                while (it.next(&path)) {
                    // Join with the directory currently being scanned. The original joined
                    // with FLAGS_skps[0], which yields wrong paths for every directory but
                    // the first.
                    fSKPs.push_back() = SkOSPath::Join(FLAGS_skps[i], path.c_str());
                }
            }
        }
    }

    /**
     * Returns the next benchmark that passes the --match filter and is suitable for the GPU
     * backend, or nullptr when the stream is exhausted.  The caller owns the returned bench.
     */
    Benchmark* next() {
        Benchmark* bench = nullptr;
        // skips non matching benches
        while ((bench = this->innerNext()) &&
               (SkCommandLineFlags::ShouldSkip(FLAGS_match, bench->getUniqueName()) ||
                !bench->isSuitableFor(Benchmark::kGPU_Backend))) {
            delete bench;
        }
        return bench;
    }

private:
    /**
     * Loads the SKP at 'path' into 'pic'.  Returns false (after logging, except for filtered
     * paths) if the path is filtered out by --match, cannot be opened, or cannot be decoded.
     */
    static bool ReadPicture(const char* path, SkAutoTUnref<SkPicture>* pic) {
        // Not strictly necessary, as it will be checked again later,
        // but helps to avoid a lot of pointless work if we're going to skip it.
        if (SkCommandLineFlags::ShouldSkip(FLAGS_match, path)) {
            return false;
        }

        SkAutoTDelete<SkStream> stream(SkStream::NewFromFile(path));
        if (stream.get() == nullptr) {
            SkDebugf("Could not read %s.\n", path);
            return false;
        }

        pic->reset(SkPicture::CreateFromStream(stream.get()));
        if (pic->get() == nullptr) {
            SkDebugf("Could not read %s as an SkPicture.\n", path);
            return false;
        }
        return true;
    }

    // Returns a bench for the next SKP that can be loaded, or nullptr when none are left.
    Benchmark* innerNext() {
        // Render skps
        while (fCurrentSKP < fSKPs.count()) {
            const SkString& path = fSKPs[fCurrentSKP++];
            SkAutoTUnref<SkPicture> pic;
            if (!ReadPicture(path.c_str(), &pic)) {
                continue;
            }

            SkString name = SkOSPath::Basename(path.c_str());
            return new VisualSKPBench(name.c_str(), pic.get());
        }

        return nullptr;
    }

    SkTArray<SkString> fSKPs;
    int fCurrentSKP;
};

/**
 * Bundles the GL context and the render-target surface a bench draws into, together with the
 * flush / finish / swap operations used around timed frames.
 */
struct GPUTarget {
    // fGL starts out null so that the null test in endTiming() is meaningful before init().
    GPUTarget() : fGL(nullptr) {}

    void setup() {
        fGL->makeCurrent();
        // Make sure we're done with whatever came before.
        SK_GL(*fGL, Finish());
    }

    // Currently a pass-through: returns the canvas it was given.
    SkCanvas* beginTiming(SkCanvas* canvas) { return canvas; }

    // Flushes GL, then either swaps buffers or calls waitOnSyncOrSwap() on the context.
    void endTiming(bool usePlatformSwapBuffers) {
        if (fGL) {
            SK_GL(*fGL, Flush());
            if (usePlatformSwapBuffers) {
                fGL->swapBuffers();
            } else {
                fGL->waitOnSyncOrSwap();
            }
        }
    }

    void finish() {
        SK_GL(*fGL, Finish());
    }

    // Always returns true.  Writes the context's max frame lag to *maxFrameLag, falling back
    // to --gpuFrameLag when the context cannot report one.
    bool needsFrameTiming(int* maxFrameLag) const {
        if (!fGL->getMaxGpuFrameLag(maxFrameLag)) {
            // Frame lag is unknown.
            *maxFrameLag = FLAGS_gpuFrameLag;
        }
        return true;
    }

    /**
     * Creates the render-target surface for 'bench', clamped to the context's maximum render
     * target size, and records the GL context.  Returns false if no GrContext or surface
     * could be created.
     */
    bool init(Benchmark* bench, GrContextFactory* factory, bool useDfText,
              GrContextFactory::GLContextType ctxType,
              GrContextFactory::GLContextOptions ctxOptions, int numSamples) {
        GrContext* context = factory->get(ctxType, ctxOptions);
        if (!context) {
            // Report failure instead of dereferencing a null context below.
            return false;
        }
        int maxRTSize = context->caps()->maxRenderTargetSize();
        SkImageInfo info = SkImageInfo::Make(SkTMin(bench->getSize().fX, maxRTSize),
                                             SkTMin(bench->getSize().fY, maxRTSize),
                                             kN32_SkColorType, kPremul_SkAlphaType);
        uint32_t flags = useDfText ? SkSurfaceProps::kUseDeviceIndependentFonts_Flag : 0;
        SkSurfaceProps props(flags, SkSurfaceProps::kLegacyFontHost_InitType);
        fSurface.reset(SkSurface::NewRenderTarget(context, SkBudgeted::kNo, info,
                                                  numSamples, &props));
        fGL = factory->getContextInfo(ctxType, ctxOptions).fGLContext;
        if (!fSurface.get()) {
            return false;
        }

        // Kilobench should only be used on platforms with fence sync support
        SkASSERT(fGL->fenceSyncSupport());
        return true;
    }

    // Returns the surface's canvas, or nullptr if there is no surface.
    SkCanvas* getCanvas() const {
        if (!fSurface.get()) {
            return nullptr;
        }
        return fSurface->getCanvas();
    }

    // Reads the canvas back into 'bmp'.  Returns false if there is no canvas or the read fails.
    bool capturePixels(SkBitmap* bmp) {
        SkCanvas* canvas = this->getCanvas();
        if (!canvas) {
            return false;
        }
        bmp->setInfo(canvas->imageInfo());
        if (!canvas->readPixels(bmp, 0, 0)) {
            SkDebugf("Can't read canvas pixels.\n");
            return false;
        }
        return true;
    }

    SkGLContext* gl() { return fGL; }

private:
    SkGLContext* fGL;
    SkAutoTDelete<SkSurface> fSurface;
};

// Encodes the target's current canvas contents as a PNG at 'filename', creating the parent
// directory first.  Returns false (after logging where applicable) on any failure.
static bool write_canvas_png(GPUTarget* target, const SkString& filename) {
    if (filename.isEmpty()) {
        return false;
    }
    if (target->getCanvas() &&
        kUnknown_SkColorType == target->getCanvas()->imageInfo().colorType()) {
        return false;
    }

    SkBitmap bmp;
    if (!target->capturePixels(&bmp)) {
        return false;
    }

    SkString dir = SkOSPath::Dirname(filename.c_str());
    if (!sk_mkdir(dir.c_str())) {
        SkDebugf("Can't make dir %s.\n", dir.c_str());
        return false;
    }
    SkFILEWStream stream(filename.c_str());
    if (!stream.isValid()) {
        SkDebugf("Can't write %s.\n", filename.c_str());
        return false;
    }
    if (!SkImageEncoder::EncodeStream(&stream, bmp, SkImageEncoder::kPNG_Type, 100)) {
        SkDebugf("Can't encode a PNG.\n");
        return false;
    }
    return true;
}

// Maps a negative loop count (the "run forever" sentinel) to SK_MaxS32.
static int detect_forever_loops(int loops) {
    // look for a magic run-forever value
    if (loops < 0) {
        loops = SK_MaxS32;
    }
    return loops;
}

// Clamps 'loops' into [1, FLAGS_maxLoops], logging whenever clamping is needed.
static int clamp_loops(int loops) {
    if (loops < 1) {
        SkDebugf("ERROR: clamping loops from %d to 1. "
                 "There's probably something wrong with the bench.\n", loops);
        return 1;
    }
    if (loops > FLAGS_maxLoops) {
        SkDebugf("WARNING: clamping loops from %d to FLAGS_maxLoops, %d.\n", loops, FLAGS_maxLoops);
        return FLAGS_maxLoops;
    }
    return loops;
}

// Current time in milliseconds.
static double now_ms() { return SkTime::GetNSecs() * 1e-6; }

/**
 * Background timer for GPU frames.  The main thread inserts fences at the start and end of
 * each frame (pushStartSync / pushEndSync); the timing thread waits on those fences and
 * records the time between them.
 */
struct TimingThread {
    TimingThread(SkGLContext* mainContext)
        : fFenceSync(mainContext->fenceSync())
        , fMainContext(mainContext)
        , fDone(false) {}

    // Thread entry point; 'data' must be the TimingThread to run.
    static void Loop(void* data) {
        TimingThread* timingThread = reinterpret_cast<TimingThread*>(data);
        timingThread->timingLoop();
    }

    // To ensure waiting for the sync actually does something, we check to make sure that we
    // exceed some small value
    const double kMinElapsed = 1e-6;
    bool sanity(double start) const {
        double elapsed = now_ms() - start;
        return elapsed > kMinElapsed;
    }

    void waitFence(SkPlatformGpuFence sync) {
        SkDEBUGCODE(double start = now_ms());
        fFenceSync->waitFence(sync, false);
        SkASSERT(sanity(start));
    }

    void timingLoop() {
        // Create a context which shares display lists with the main thread
        SkAutoTDelete<SkGLContext> glContext(SkCreatePlatformGLContext(kNone_GrGLStandard,
                                                                       fMainContext));
        glContext->makeCurrent();

        // Basic timing methodology is:
        // 1) Wait on semaphore until main thread indicates it's time to start timing the frame
        // 2) Wait on frame start sync, record time.  This is start of the frame.
        // 3) Wait on semaphore until main thread indicates it's time to finish timing the frame
        // 4) Wait on frame end sync, record time.  FrameEndTime - FrameStartTime = frame time
        // 5) Wait on semaphore until main thread indicates we should time the next frame or quit
        while (true) {
            fSemaphore.wait();

            // get start sync
            SkPlatformGpuFence startSync = this->popStartSync();

            // wait on sync
            this->waitFence(startSync);
            double start = kilobench::now_ms();

            // do we want to sleep here?
            // wait for end sync
            fSemaphore.wait();

            // get end sync
            SkPlatformGpuFence endSync = this->popEndSync();

            // wait on sync
            this->waitFence(endSync);
            double elapsed = kilobench::now_ms() - start;

            // No mutex needed, client won't touch timings until we're done
            fTimings.push_back(elapsed);

            // clean up fences
            fFenceSync->deleteFence(startSync);
            fFenceSync->deleteFence(endSync);

            fSemaphore.wait();
            if (this->isDone()) {
                break;
            }
        }
    }

    void pushStartSync() { this->pushSync(&fFrameStartSyncs, &fFrameStartSyncsMutex); }

    SkPlatformGpuFence popStartSync() {
        return this->popSync(&fFrameStartSyncs, &fFrameStartSyncsMutex);
    }

    void pushEndSync() { this->pushSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); }

    SkPlatformGpuFence popEndSync() {
        return this->popSync(&fFrameEndSyncs, &fFrameEndSyncsMutex);
    }

    // Marks the thread as done and signals the semaphore so timingLoop() can re-check isDone().
    void setDone() {
        SkAutoMutexAcquire done(fDoneMutex);
        fDone = true;
        fSemaphore.signal();
    }

    // NOTE(review): the template arguments were reconstructed from usage (the queue stores
    // SkPlatformGpuFence values); confirm the allocation-count argument against SkTLList.h.
    typedef SkTLList<SkPlatformGpuFence, 1> SyncQueue;

    // Under 'mutex', inserts a new fence at the head of 'queue' and signals the semaphore.
    void pushSync(SyncQueue* queue, SkMutex* mutex) {
        SkAutoMutexAcquire am(mutex);
        *queue->addToHead() = fFenceSync->insertFence();
        fSemaphore.signal();
    }

    // Under 'mutex', removes and returns the fence at the head of 'queue'.
    SkPlatformGpuFence popSync(SyncQueue* queue, SkMutex* mutex) {
        SkAutoMutexAcquire am(mutex);
        SkPlatformGpuFence sync = *queue->head();
        queue->popHead();
        return sync;
    }

    // True once setDone() has been called and no frame-start fences remain queued.
    bool isDone() {
        SkAutoMutexAcquire am1(fFrameStartSyncsMutex);
        SkAutoMutexAcquire done(fDoneMutex);
        return fDone && fFrameStartSyncs.isEmpty();
    }

    // Per-frame elapsed times (ms) recorded by the timing thread.
    const SkTArray<double>& timings() const { SkASSERT(fDone); return fTimings; }

private:
    SkGpuFenceSync* fFenceSync;
    SkSemaphore fSemaphore;
    SkMutex fFrameStartSyncsMutex;
    SyncQueue fFrameStartSyncs;
    SkMutex fFrameEndSyncsMutex;
    SyncQueue fFrameEndSyncs;
    SkTArray<double> fTimings;
    SkMutex fDoneMutex;
    SkGLContext* fMainContext;
    bool fDone;
};

/**
 * Draws 'loops' iterations of 'bench' into the target's canvas and returns the wall-clock
 * milliseconds measured on this thread around the draw, flush and endTiming() calls.  When
 * 'timingThread' is non-null, fences are pushed before and after the frame so the background
 * thread can time it as well.
 */
static double time(int loops, Benchmark* bench, GPUTarget* target, TimingThread* timingThread) {
    SkCanvas* canvas = target->getCanvas();
    canvas->clear(SK_ColorWHITE);
    bench->preDraw(canvas);

    if (timingThread) {
        timingThread->pushStartSync();
    }
    double start = now_ms();
    canvas = target->beginTiming(canvas);
    bench->draw(loops, canvas);
    canvas->flush();
    target->endTiming(timingThread != nullptr);

    double elapsed = now_ms() - start;
    if (timingThread) {
        timingThread->pushEndSync();
        // NOTE(review): setDone() runs after every frame, not only the last one, so the
        // timing thread may exit as soon as it catches up with the main thread - confirm
        // that this is intended.
        timingThread->setDone();
    }
    bench->postDraw(canvas);
    return elapsed;
}

// TODO For now we don't use the background timing thread to tune loops
// Picks the loop count for 'bench' (auto-tuning towards --gpuMs when requested) and runs
// warmup frames.  Returns the loop count to use for the timed samples.
static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameLag) {
    // First, figure out how many loops it'll take to get a frame up to FLAGS_gpuMs.
    int loops = bench->calculateLoops(FLAGS_loops);
    if (kAutoTuneLoops == loops) {
        loops = 1;
        double elapsed = 0;
        do {
            if (1<<30 == loops) {
                // We're about to wrap.  Something's wrong with the bench.
                loops = 0;
                break;
            }
            loops *= 2;
            // If the GPU lets frames lag at all, we need to make sure we're timing
            // _this_ round, not still timing last round.
            for (int i = 0; i < maxGpuFrameLag; i++) {
                elapsed = time(loops, bench, target, nullptr);
            }
        } while (elapsed < FLAGS_gpuMs);

        // We've overshot at least a little.  Scale back linearly.
        loops = (int)ceil(loops * FLAGS_gpuMs / elapsed);
        loops = clamp_loops(loops);

        // Make sure we're not still timing our calibration.
        target->finish();
    } else {
        loops = detect_forever_loops(loops);
    }

    // Pretty much the same deal as the calibration: do some warmup to make
    // sure we're timing steady-state pipelined frames.
    for (int i = 0; i < maxGpuFrameLag - 1; i++) {
        time(loops, bench, target, nullptr);
    }

    return loops;
}

/**
 * Creates a fresh GrContextFactory and GPUTarget for one bench and runs the bench's
 * per-canvas pre-draw hook.  teardownBench() must be called by the user to run the matching
 * post-draw hook.
 */
struct AutoSetupContextBenchAndTarget {
    AutoSetupContextBenchAndTarget(Benchmark* bench) : fBenchmark(bench) {
        GrContextOptions grContextOpts;
        fCtxFactory.reset(new GrContextFactory(grContextOpts));

        SkAssertResult(fTarget.init(bench, fCtxFactory, false,
                                    GrContextFactory::kNative_GLContextType,
                                    GrContextFactory::kNone_GLContextOptions, 0));

        fCanvas = fTarget.getCanvas();
        fTarget.setup();

        bench->perCanvasPreDraw(fCanvas);
        fTarget.needsFrameTiming(&fMaxFrameLag);
    }

    int getLoops() {
        return setup_gpu_bench(&fTarget, fBenchmark, fMaxFrameLag);
    }

    // Runs fMaxFrameLag frames, then returns the CPU-side time of one more frame divided
    // by 'loops'.
    double timeSample(int loops, TimingThread* timingThread) {
        for (int i = 0; i < fMaxFrameLag; i++) {
            time(loops, fBenchmark, &fTarget, timingThread);
        }

        return time(loops, fBenchmark, &fTarget, timingThread) / loops;
    }

    void teardownBench() { fBenchmark->perCanvasPostDraw(fCanvas); }

    SkAutoTDelete<GrContextFactory> fCtxFactory;
    GPUTarget fTarget;
    SkCanvas* fCanvas;
    Benchmark* fBenchmark;
    int fMaxFrameLag;
};

// Determines the loop count for 'bench' and, if --writePath is set, writes the rendered
// canvas to <writePath>/gpu/<bench name>.png.
int setup_loops(Benchmark* bench) {
    AutoSetupContextBenchAndTarget ascbt(bench);
    int loops = ascbt.getLoops();
    ascbt.teardownBench();

    if (!FLAGS_writePath.isEmpty() && FLAGS_writePath[0]) {
        SkString pngFilename = SkOSPath::Join(FLAGS_writePath[0], "gpu");
        pngFilename = SkOSPath::Join(pngFilename.c_str(), bench->getUniqueName());
        pngFilename.append(".png");
        write_canvas_png(&ascbt.fTarget, pngFilename);
    }
    return loops;
}

// One timing sample.  Both fields start at zero so that fGpu holds a defined value even
// when the background timing thread is disabled and nothing assigns it.
struct Sample {
    double fCpu = 0;
    double fGpu = 0;
};

/**
 * Times one sample of 'bench' at the given loop count.  fCpu is the value returned by
 * timeSample(); fGpu is the minimum frame time recorded by the background timing thread, and
 * is left at zero when --useBackgroundThread is false.
 */
Sample time_sample(Benchmark* bench, int loops) {
    AutoSetupContextBenchAndTarget ascbt(bench);

    Sample sample;
    if (FLAGS_useBackgroundThread) {
        TimingThread timingThread(ascbt.fTarget.gl());
        SkAutoTDelete<SkThread> nativeThread(new SkThread(TimingThread::Loop, &timingThread));
        nativeThread->start();
        sample.fCpu = ascbt.timeSample(loops, &timingThread);
        nativeThread->join();

        // return the min
        double min = SK_ScalarMax;
        for (int i = 0; i < timingThread.timings().count(); i++) {
            min = SkTMin(min, timingThread.timings()[i]);
        }
        sample.fGpu = min;
    } else {
        sample.fCpu = ascbt.timeSample(loops, nullptr);
    }

    ascbt.teardownBench();

    return sample;
}

} // namespace kilobench

// Size of the buffer used to pass results from the child process to the parent.
static const int kOutResultSize = 1024;

// Prints one result row for 'samples'; the bench is reported as "<name>_<mod>".
void printResult(const SkTArray<double>& samples, int loops, const char* name, const char* mod) {
    SkString newName(name);
    newName.appendf("_%s", mod);
    Stats stats(samples);
    const double stddev_percent = 100 * sqrt(stats.var) / stats.mean;
    SkDebugf("%d\t%s\t%s\t%s\t%s\t%.0f%%\t%s\t%s\t%s\n"
        , loops
        , HUMANIZE(stats.min)
        , HUMANIZE(stats.median)
        , HUMANIZE(stats.mean)
        , HUMANIZE(stats.max)
        , stddev_percent
        , stats.plot.c_str()
        , "gpu"
        , newName.c_str()
    );
}

/**
 * Runs every matching bench.  For each bench, iteration 0 determines the loop count and the
 * following FLAGS_samples iterations each collect one sample.  With --useMultiProcess every
 * iteration runs in a forked child that reports its result to the parent as text over a pipe.
 */
int kilobench_main() {
    kilobench::BenchmarkStream benchStream;

    SkDebugf("loops\tmin\tmedian\tmean\tmax\tstddev\t%-*s\tconfig\tbench\n",
             FLAGS_samples, "samples");

    int descriptors[2];
    if (pipe(descriptors) != 0) {
        SkFAIL("Failed to open a pipe\n");
    }

    while (Benchmark* b = benchStream.next()) {
        SkAutoTDelete<Benchmark> bench(b);

        int loops = 1;
        SkTArray<double> cpuSamples;
        SkTArray<double> gpuSamples;
        for (int i = 0; i < FLAGS_samples + 1; i++) {
            // We fork off a new process to setup the grcontext and run the test while we wait
            if (FLAGS_useMultiProcess) {
                int childPid = fork();
                if (childPid > 0) {
                    char result[kOutResultSize];
                    // Leave room for a terminator: sscanf below needs a NUL-terminated string
                    // and read() does not guarantee one.
                    ssize_t bytesRead = read(descriptors[0], result, kOutResultSize - 1);
                    if (bytesRead < 0) {
                        SkFAIL("Failed to read from pipe\n");
                    } else {
                        result[bytesRead] = '\0';
                    }

                    // if samples == 0 then parse # of loops
                    // else parse float
                    if (i == 0) {
                        if (sscanf(result, "%d", &loops) != 1) {
                            SkFAIL("Failed to parse loops from pipe\n");
                        }
                    } else {
                        // Parse into locals first so a malformed message never leaves
                        // uninitialized entries in the sample arrays.
                        double cpu = 0;
                        double gpu = 0;
                        if (sscanf(result, "%lf %lf", &cpu, &gpu) != 2) {
                            SkFAIL("Failed to parse sample from pipe\n");
                        }
                        cpuSamples.push_back(cpu);
                        gpuSamples.push_back(gpu);
                    }

                    // wait until exit
                    int status;
                    waitpid(childPid, &status, 0);
                } else if (0 == childPid) {
                    char result[kOutResultSize];
                    if (i == 0) {
                        snprintf(result, kOutResultSize, "%d", kilobench::setup_loops(bench));
                    } else {
                        kilobench::Sample sample = kilobench::time_sample(bench, loops);
                        snprintf(result, kOutResultSize, "%lf %lf", sample.fCpu, sample.fGpu);
                    }

                    // Make sure to write the null terminator
                    if (write(descriptors[1], result, strlen(result) + 1) < 0) {
                        SkFAIL("Failed to write to pipe\n");
                    }
                    return 0;
                } else {
                    SkFAIL("Fork failed\n");
                }
            } else {
                if (i == 0) {
                    loops = kilobench::setup_loops(bench);
                } else {
                    kilobench::Sample sample = kilobench::time_sample(bench, loops);
                    cpuSamples.push_back(sample.fCpu);
                    gpuSamples.push_back(sample.fGpu);
                }
            }
        }

        printResult(cpuSamples, loops, bench->getUniqueName(), "cpu");
        if (FLAGS_useBackgroundThread) {
            printResult(gpuSamples, loops, bench->getUniqueName(), "gpu");
        }
    }
    return 0;
}

#if !defined SK_BUILD_FOR_IOS
int main(int argc, char** argv) {
    SkCommandLineFlags::Parse(argc, argv);
    return kilobench_main();
}
#endif