Add a background timing thread to kilobench

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1612513002

Review URL: https://codereview.chromium.org/1612513002
2016-01-28 06:26:35 -08:00 · 2016-01-28 06:26:35 -08:00 · b35c82dc94
commit b35c82dc94
parent 21ab1209f5
2 changed files with 278 additions and 82 deletions
--- a/include/gpu/gl/SkGLContext.h
+++ b/include/gpu/gl/SkGLContext.h
@ -85,6 +85,11 @@ public:

    class GLFenceSync;  // SkGpuFenceSync implementation that uses the OpenGL functionality.

+    /*
+     * returns the fencesync object owned by this SkGLContext
+     */
+    SkGpuFenceSync* fenceSync() { return fFenceSync.get(); }
+
 protected:
    SkGLContext();

--- a/tools/kilobench/kilobench.cpp
+++ b/tools/kilobench/kilobench.cpp
@ -14,10 +14,15 @@
 #include "SkStream.h"
 #include "SkSurface.h"
 #include "SkTime.h"
+#include "SkTLList.h"
+#include "SkThreadUtils.h"
 #include "Stats.h"
 #include "Timer.h"
 #include "VisualSKPBench.h"
 #include "gl/GrGLDefines.h"
+#include "../private/SkMutex.h"
+#include "../private/SkSemaphore.h"
+#include "../private/SkGpuFenceSync.h"

 // posix only for now
 #include <unistd.h>
@ -34,7 +39,6 @@
 #include "SkImageDecoder.h"
 __SK_FORCE_IMAGE_DECODER_LINKING;

-
 static const int kAutoTuneLoops = 0;

 static const int kDefaultLoops =
@ -68,6 +72,8 @@ DEFINE_int32(maxLoops, 1000000, "Never run a bench more times than this.");
 DEFINE_int32(loops, kDefaultLoops, loops_help_txt().c_str());
 DEFINE_double(gpuMs, 5, "Target bench time in millseconds for GPU.");
 DEFINE_string2(writePath, w, "", "If set, write bitmaps here as .pngs.");
+DEFINE_bool(useBackgroundThread, true, "If false, kilobench will time cpu / gpu work together");
+DEFINE_bool(useMultiProcess, true, "If false, kilobench will run all tests in one process");

 static SkString humanize(double ms) {
    return HumanizeMs(ms);
@ -146,25 +152,29 @@ private:

 struct GPUTarget {
    void setup() {
-        this->gl->makeCurrent();
+        fGL->makeCurrent();
        // Make sure we're done with whatever came before.
-        SK_GL(*this->gl, Finish());
+        SK_GL(*fGL, Finish());
    }

    SkCanvas* beginTiming(SkCanvas* canvas) { return canvas; }

-    void endTiming() {
-        if (this->gl) {
-            SK_GL(*this->gl, Flush());
-            this->gl->swapBuffers();
+    void endTiming(bool usePlatformSwapBuffers) {
+        if (fGL) {
+            SK_GL(*fGL, Flush());
+            if (usePlatformSwapBuffers) {
+                fGL->swapBuffers();
+            } else {
+                fGL->waitOnSyncOrSwap();
+            }
        }
    }
-    void fence() {
-        SK_GL(*this->gl, Finish());
+    void finish() {
+        SK_GL(*fGL, Finish());
    }

    bool needsFrameTiming(int* maxFrameLag) const {
-        if (!this->gl->getMaxGpuFrameLag(maxFrameLag)) {
+        if (!fGL->getMaxGpuFrameLag(maxFrameLag)) {
            // Frame lag is unknown.
            *maxFrameLag = FLAGS_gpuFrameLag;
        }
@ -182,24 +192,24 @@ struct GPUTarget {
        uint32_t flags = useDfText ? SkSurfaceProps::kUseDeviceIndependentFonts_Flag :
                                                  0;
        SkSurfaceProps props(flags, SkSurfaceProps::kLegacyFontHost_InitType);
-        this->surface.reset(SkSurface::NewRenderTarget(context,
-                                                       SkSurface::kNo_Budgeted, info,
-                                                       numSamples, &props));
-        this->gl = factory->getContextInfo(ctxType, ctxOptions).fGLContext;
-        if (!this->surface.get()) {
+        fSurface.reset(SkSurface::NewRenderTarget(context,
+                                                  SkSurface::kNo_Budgeted, info,
+                                                  numSamples, &props));
+        fGL = factory->getContextInfo(ctxType, ctxOptions).fGLContext;
+        if (!fSurface.get()) {
            return false;
        }

        // Kilobench should only be used on platforms with fence sync support
-        SkASSERT(this->gl->fenceSyncSupport());
+        SkASSERT(fGL->fenceSyncSupport());
        return true;
    }

    SkCanvas* getCanvas() const {
-        if (!surface.get()) {
+        if (!fSurface.get()) {
            return nullptr;
        }
-        return surface->getCanvas();
+        return fSurface->getCanvas();
    }

    bool capturePixels(SkBitmap* bmp) {
@ -215,10 +225,11 @@ struct GPUTarget {
        return true;
    }

+    SkGLContext* gl() { return fGL; }
+
 private:
-    //const Config config;
-    SkGLContext* gl;
-    SkAutoTDelete<SkSurface> surface;
+    SkGLContext* fGL;
+    SkAutoTDelete<SkSurface> fSurface;
 };

 static bool write_canvas_png(GPUTarget* target, const SkString& filename) {
@ -276,24 +287,159 @@ static int clamp_loops(int loops) {
 }

 static double now_ms() { return SkTime::GetNSecs() * 1e-6; }
-static double time(int loops, Benchmark* bench, GPUTarget* target) {
-    SkCanvas* canvas = target->getCanvas();
-    if (canvas) {
-        canvas->clear(SK_ColorWHITE);
+
+struct TimingThread {
+    TimingThread(SkGLContext* mainContext)
+        : fFenceSync(mainContext->fenceSync())
+        ,  fMainContext(mainContext)
+        ,  fDone(false) {}
+
+    static void Loop(void* data) {
+        TimingThread* timingThread = reinterpret_cast<TimingThread*>(data);
+        timingThread->timingLoop();
    }
+
+    // To ensure waiting for the sync actually does something, we check that the elapsed time
+    // exceeds some small value
+    const double kMinElapsed = 1e-6;
+    bool sanity(double start) const {
+        double elapsed = now_ms() - start;
+        return elapsed > kMinElapsed;
+    }
+
+    void waitFence(SkPlatformGpuFence sync) {
+        SkDEBUGCODE(double start = now_ms());
+        fFenceSync->waitFence(sync, false);
+        SkASSERT(sanity(start));
+    }
+
+    void timingLoop() {
+        // Create a context which shares display lists with the main thread
+        SkAutoTDelete<SkGLContext> glContext(SkCreatePlatformGLContext(kNone_GrGLStandard,
+                                                                       fMainContext));
+        glContext->makeCurrent();
+
+        // Basic timing methodology is:
+        // 1) Wait on semaphore until main thread indicates it's time to start timing the frame
+        // 2) Wait on frame start sync, record time.  This is start of the frame.
+        // 3) Wait on semaphore until main thread indicates it's time to finish timing the frame
+        // 4) Wait on frame end sync, record time.  FrameEndTime - FrameStartTime = frame time
+        // 5) Wait on semaphore until main thread indicates we should time the next frame or quit
+        while (true) {
+            fSemaphore.wait();
+
+            // get start sync
+            SkPlatformGpuFence startSync = this->popStartSync();
+
+            // wait on sync
+            this->waitFence(startSync);
+            double start = kilobench::now_ms();
+
+            // do we want to sleep here?
+            // wait for end sync
+            fSemaphore.wait();
+
+            // get end sync
+            SkPlatformGpuFence endSync = this->popEndSync();
+
+            // wait on sync
+            this->waitFence(endSync);
+            double elapsed = kilobench::now_ms() - start;
+
+            // No mutex needed, client won't touch timings until we're done
+            fTimings.push_back(elapsed);
+
+            // clean up fences
+            fFenceSync->deleteFence(startSync);
+            fFenceSync->deleteFence(endSync);
+
+            fSemaphore.wait();
+            if (this->isDone()) {
+                break;
+            }
+        }
+    }
+
+    void pushStartSync() { this->pushSync(&fFrameStartSyncs, &fFrameStartSyncsMutex); }
+
+    SkPlatformGpuFence popStartSync() {
+        return this->popSync(&fFrameStartSyncs, &fFrameStartSyncsMutex);
+    }
+
+    void pushEndSync() { this->pushSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); }
+
+    SkPlatformGpuFence popEndSync() { return this->popSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); }
+
+    void setDone() {
+        SkAutoMutexAcquire done(fDoneMutex);
+        fDone = true;
+        fSemaphore.signal();
+    }
+
+    typedef SkTLList<SkPlatformGpuFence, 1> SyncQueue;
+
+    void pushSync(SyncQueue* queue, SkMutex* mutex) {
+        SkAutoMutexAcquire am(mutex);
+        *queue->addToHead() = fFenceSync->insertFence();
+        fSemaphore.signal();
+    }
+
+    SkPlatformGpuFence popSync(SyncQueue* queue, SkMutex* mutex) {
+        SkAutoMutexAcquire am(mutex);
+        SkPlatformGpuFence sync = *queue->head();
+        queue->popHead();
+        return sync;
+    }
+
+    bool isDone() {
+        SkAutoMutexAcquire am1(fFrameStartSyncsMutex);
+        SkAutoMutexAcquire done(fDoneMutex);
+        if (fDone && fFrameStartSyncs.isEmpty()) {
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    const SkTArray<double>& timings() const { SkASSERT(fDone); return fTimings; }
+
+private:
+    SkGpuFenceSync* fFenceSync;
+    SkSemaphore fSemaphore;
+    SkMutex fFrameStartSyncsMutex;
+    SyncQueue fFrameStartSyncs;
+    SkMutex fFrameEndSyncsMutex;
+    SyncQueue fFrameEndSyncs;
+    SkTArray<double> fTimings;
+    SkMutex fDoneMutex;
+    SkGLContext* fMainContext;
+    bool fDone;
+};
+
+static double time(int loops, Benchmark* bench, GPUTarget* target, TimingThread* timingThread) {
+    SkCanvas* canvas = target->getCanvas();
+    canvas->clear(SK_ColorWHITE);
    bench->preDraw(canvas);
+
+    if (timingThread) {
+        timingThread->pushStartSync();
+    }
    double start = now_ms();
    canvas = target->beginTiming(canvas);
    bench->draw(loops, canvas);
-    if (canvas) {
-        canvas->flush();
-    }
-    target->endTiming();
+    canvas->flush();
+    target->endTiming(timingThread ? true : false);
+
    double elapsed = now_ms() - start;
+    if (timingThread) {
+        timingThread->pushEndSync();
+        timingThread->setDone();
+    }
    bench->postDraw(canvas);
    return elapsed;
 }

+// TODO For now we don't use the background timing thread to tune loops
 static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameLag) {
    // First, figure out how many loops it'll take to get a frame up to FLAGS_gpuMs.
    int loops = bench->calculateLoops(FLAGS_loops);
@ -310,7 +456,7 @@ static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameL
            // If the GPU lets frames lag at all, we need to make sure we're timing
            // _this_ round, not still timing last round.
            for (int i = 0; i < maxGpuFrameLag; i++) {
-                elapsed = time(loops, bench, target);
+                elapsed = time(loops, bench, target, nullptr);
            }
        } while (elapsed < FLAGS_gpuMs);

@ -319,7 +465,7 @@ static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameL
        loops = clamp_loops(loops);

        // Make sure we're not still timing our calibration.
-        target->fence();
+        target->finish();
    } else {
        loops = detect_forever_loops(loops);
    }
@ -327,7 +473,7 @@ static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameL
    // Pretty much the same deal as the calibration: do some warmup to make
    // sure we're timing steady-state pipelined frames.
    for (int i = 0; i < maxGpuFrameLag - 1; i++) {
-        time(loops, bench, target);
+        time(loops, bench, target, nullptr);
    }

    return loops;
@ -351,13 +497,14 @@ struct AutoSetupContextBenchAndTarget {

    int getLoops() { return setup_gpu_bench(&fTarget, fBenchmark, fMaxFrameLag); }

-    double timeSample(int loops) {
+    double timeSample(int loops, TimingThread* timingThread) {
        for (int i = 0; i < fMaxFrameLag; i++) {
-            time(loops, fBenchmark, &fTarget);
+            time(loops, fBenchmark, &fTarget, timingThread);
        }

-        return time(loops, fBenchmark, &fTarget) / loops;
+        return time(loops, fBenchmark, &fTarget, timingThread) / loops;
    }
+
    void teardownBench() { fBenchmark->perCanvasPostDraw(fCanvas); }

    SkAutoTDelete<GrContextFactory> fCtxFactory;
@ -381,9 +528,32 @@ int setup_loops(Benchmark* bench) {
    return loops;
 }

-double time_sample(Benchmark* bench, int loops) {
+struct Sample {
+    double fCpu;
+    double fGpu;
+};
+
+Sample time_sample(Benchmark* bench, int loops) {
    AutoSetupContextBenchAndTarget ascbt(bench);
-    double sample = ascbt.timeSample(loops);
+
+    Sample sample;
+    if (FLAGS_useBackgroundThread) {
+        TimingThread timingThread(ascbt.fTarget.gl());
+        SkAutoTDelete<SkThread> nativeThread(new SkThread(TimingThread::Loop, &timingThread));
+        nativeThread->start();
+        sample.fCpu = ascbt.timeSample(loops, &timingThread);
+        nativeThread->join();
+
+        // return the min
+        double min = SK_ScalarMax;
+        for (int i = 0; i < timingThread.timings().count(); i++) {
+            min = SkTMin(min, timingThread.timings()[i]);
+        }
+        sample.fGpu = min;
+    } else {
+        sample.fCpu = ascbt.timeSample(loops, nullptr);
+    }
+
    ascbt.teardownBench();

    return sample;
@ -393,6 +563,24 @@ double time_sample(Benchmark* bench, int loops) {

 static const int kOutResultSize = 1024;

+void printResult(const SkTArray<double>& samples, int loops, const char* name, const char* mod) {
+    SkString newName(name);
+    newName.appendf("_%s", mod);
+    Stats stats(samples);
+    const double stddev_percent = 100 * sqrt(stats.var) / stats.mean;
+    SkDebugf("%d\t%s\t%s\t%s\t%s\t%.0f%%\t%s\t%s\t%s\n"
+        , loops
+        , HUMANIZE(stats.min)
+        , HUMANIZE(stats.median)
+        , HUMANIZE(stats.mean)
+        , HUMANIZE(stats.max)
+        , stddev_percent
+        , stats.plot.c_str()
+        , "gpu"
+        , newName.c_str()
+    );
+}
+
 int kilobench_main() {
    kilobench::BenchmarkStream benchStream;

@ -407,60 +595,63 @@ int kilobench_main() {
    while (Benchmark* b = benchStream.next()) {
        SkAutoTDelete<Benchmark> bench(b);

-        int loops;
-        SkTArray<double> samples;
+        int loops = 1;
+        SkTArray<double> cpuSamples;
+        SkTArray<double> gpuSamples;
        for (int i = 0; i < FLAGS_samples + 1; i++) {
            // We fork off a new process to setup the grcontext and run the test while we wait
-            int childPid = fork();
-            if (childPid > 0) {
-                char result[kOutResultSize];
-                if (read(descriptors[0], result, kOutResultSize) < 0) {
-                     SkFAIL("Failed to read from pipe\n");
-                }
+            if (FLAGS_useMultiProcess) {
+                int childPid = fork();
+                if (childPid > 0) {
+                    char result[kOutResultSize];
+                    if (read(descriptors[0], result, kOutResultSize) < 0) {
+                         SkFAIL("Failed to read from pipe\n");
+                    }

-                // if samples == 0 then parse # of loops
-                // else parse float
-                if (i == 0) {
-                    sscanf(result, "%d", &loops);
+                    // on the first iteration (i == 0) parse # of loops
+                    // else parse the cpu and gpu sample times
+                    if (i == 0) {
+                        sscanf(result, "%d", &loops);
+                    } else {
+                        sscanf(result, "%lf %lf", &cpuSamples.push_back(),
+                                                  &gpuSamples.push_back());
+                    }
+
+                    // wait until exit
+                    int status;
+                    waitpid(childPid, &status, 0);
+                } else if (0 == childPid) {
+                    char result[kOutResultSize];
+                    if (i == 0) {
+                        sprintf(result, "%d", kilobench::setup_loops(bench));
+                    } else {
+                        kilobench::Sample sample = kilobench::time_sample(bench, loops);
+                        sprintf(result, "%lf %lf", sample.fCpu, sample.fGpu);
+                    }
+
+                    // Make sure to write the null terminator
+                    if (write(descriptors[1], result, strlen(result) + 1) < 0) {
+                        SkFAIL("Failed to write to pipe\n");
+                    }
+                    return 0;
                } else {
-                    sscanf(result, "%lf", &samples.push_back());
+                    SkFAIL("Fork failed\n");
                }
-
-                // wait until exit
-                int status;
-                waitpid(childPid, &status, 0);
-            } else if (0 == childPid) {
-                char result[kOutResultSize];
-                if (i == 0) {
-                    sprintf(result, "%d", kilobench::setup_loops(bench));
-                } else {
-                    sprintf(result, "%lf", kilobench::time_sample(bench, loops));
-                }
-
-                // Make sure to write the null terminator
-                if (write(descriptors[1], result, strlen(result) + 1) < 0) {
-                    SkFAIL("Failed to write to pipe\n");
-                }
-                return 0;
            } else {
-                SkFAIL("Fork failed\n");
+                if (i == 0) {
+                    loops = kilobench::setup_loops(bench);
+                } else {
+                    kilobench::Sample sample = kilobench::time_sample(bench, loops);
+                    cpuSamples.push_back(sample.fCpu);
+                    gpuSamples.push_back(sample.fGpu);
+                }
            }
        }

-        Stats stats(samples);
-        const double stddev_percent = 100 * sqrt(stats.var) / stats.mean;
-        SkDebugf("%d\t%s\t%s\t%s\t%s\t%.0f%%\t%s\t%s\t%s\n"
-                , loops
-                , HUMANIZE(stats.min)
-                , HUMANIZE(stats.median)
-                , HUMANIZE(stats.mean)
-                , HUMANIZE(stats.max)
-                , stddev_percent
-                , stats.plot.c_str()
-                , "gpu"
-                , bench->getUniqueName()
-                );
-
+        printResult(cpuSamples, loops, bench->getUniqueName(), "cpu");
+        if (FLAGS_useBackgroundThread) {
+            printResult(gpuSamples, loops, bench->getUniqueName(), "gpu");
+        }
    }
    return 0;
 }