skia2/tools/kilobench/kilobench.cpp

/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "GrCaps.h"
#include "GrContextFactory.h"
#include "Benchmark.h"
#include "ResultsWriter.h"
#include "SkCommandLineFlags.h"
#include "SkOSFile.h"
#include "SkStream.h"
#include "SkSurface.h"
#include "SkTime.h"
#include "SkTLList.h"
#include "SkThreadUtils.h"
#include "Stats.h"
#include "Timer.h"
#include "VisualSKPBench.h"
#include "gl/GrGLDefines.h"
#include "../private/SkMutex.h"
#include "../private/SkSemaphore.h"
#include "../private/SkGpuFenceSync.h"

// posix only for now
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

/*
 * This is an experimental GPU only benchmarking program.  The initial implementation will only
 * support SKPs.
 */

// To get image decoders linked in we have to do the below magic
#include "SkForceLinking.h"
#include "SkImageDecoder.h"
__SK_FORCE_IMAGE_DECODER_LINKING;

// Sentinel --loops value that asks kilobench to auto-tune the loop count per bench.
static const int kAutoTuneLoops = 0;

// Default for --loops: a single loop in debug builds, auto-tuning otherwise.
#ifdef SK_DEBUG
static const int kDefaultLoops = 1;
#else
static const int kDefaultLoops = kAutoTuneLoops;
#endif

// Builds the help text for the --loops flag, embedding the auto-tune sentinel value.
static SkString loops_help_txt() {
    static const char kFormat[] =
            "Number of times to run each bench. Set this to %d to auto-"
            "tune for each bench. Timings are only reported when auto-tuning.";

    SkString text;
    text.printf(kFormat, kAutoTuneLoops);
    return text;
}

// Command-line flags. Each DEFINE_* declares a FLAGS_<name> variable that the rest of this file
// reads (e.g. FLAGS_skps, FLAGS_match, FLAGS_samples).
DEFINE_string(skps, "skps", "Directory to read skps from.");
DEFINE_string2(match, m, nullptr,
               "[~][^]substring[$] [...] of bench name to run.\n"
               "Multiple matches may be separated by spaces.\n"
               "~ causes a matching bench to always be skipped\n"
               "^ requires the start of the bench to match\n"
               "$ requires the end of the bench to match\n"
               "^ and $ requires an exact match\n"
               "If a bench does not match any list entry,\n"
               "it is skipped unless some list entry starts with ~");
DEFINE_int32(gpuFrameLag, 5, "If unknown, estimated maximum number of frames GPU allows to lag.");
DEFINE_int32(samples, 10, "Number of samples to measure for each bench.");
DEFINE_int32(maxLoops, 1000000, "Never run a bench more times than this.");
DEFINE_int32(loops, kDefaultLoops, loops_help_txt().c_str());
DEFINE_double(gpuMs, 5, "Target bench time in milliseconds for GPU.");
DEFINE_string2(writePath, w, "", "If set, write bitmaps here as .pngs.");
DEFINE_bool(useBackgroundThread, true, "If false, kilobench will time cpu / gpu work together");
DEFINE_bool(useMultiProcess, true, "If false, kilobench will run all tests in one process");

// Formats a duration given in milliseconds via HumanizeMs.
static SkString humanize(double ms) {
    SkString pretty = HumanizeMs(ms);
    return pretty;
}
// Convenience for printf-style calls; the temporary lives until the end of the full expression.
#define HUMANIZE(ms) humanize(ms).c_str()

namespace kilobench {
// Produces the sequence of SKP benchmarks to run, built from the --skps flag.
//
// Each --skps entry is either a path ending in ".skp" (used as-is) or a directory that is
// scanned for ".skp" files.
class BenchmarkStream {
public:
    BenchmarkStream() : fCurrentSKP(0) {
        for (int i = 0; i < FLAGS_skps.count(); i++) {
            if (SkStrEndsWith(FLAGS_skps[i], ".skp")) {
                fSKPs.push_back() = FLAGS_skps[i];
            } else {
                SkOSFile::Iter it(FLAGS_skps[i], ".skp");
                SkString path;
                while (it.next(&path)) {
                    // Join with the directory being iterated (entry i), not the first entry;
                    // otherwise files found in later directories get a wrong path.
                    fSKPs.push_back() = SkOSPath::Join(FLAGS_skps[i], path.c_str());
                }
            }
        }
    }

    // Returns the next benchmark that matches --match and is suitable for the GPU backend, or
    // nullptr when the stream is exhausted. The caller takes ownership of the returned bench.
    Benchmark* next() {
        Benchmark* bench = nullptr;
        // skips non matching benches
        while ((bench = this->innerNext()) &&
               (SkCommandLineFlags::ShouldSkip(FLAGS_match, bench->getUniqueName()) ||
                !bench->isSuitableFor(Benchmark::kGPU_Backend))) {
            delete bench;
        }
        return bench;
    }

private:
    // Loads the SKP at 'path' into 'pic'. Returns false (after logging, where applicable) if the
    // path is filtered out by --match, cannot be opened, or cannot be parsed as an SkPicture.
    static bool ReadPicture(const char* path, SkAutoTUnref<SkPicture>* pic) {
        // Not strictly necessary, as it will be checked again later,
        // but helps to avoid a lot of pointless work if we're going to skip it.
        if (SkCommandLineFlags::ShouldSkip(FLAGS_match, path)) {
            return false;
        }

        SkAutoTDelete<SkStream> stream(SkStream::NewFromFile(path));
        if (stream.get() == nullptr) {
            SkDebugf("Could not read %s.\n", path);
            return false;
        }

        pic->reset(SkPicture::CreateFromStream(stream.get()));
        if (pic->get() == nullptr) {
            SkDebugf("Could not read %s as an SkPicture.\n", path);
            return false;
        }
        return true;
    }

    // Returns a new bench for the next readable SKP, skipping unreadable ones, or nullptr when
    // no SKPs remain.
    Benchmark* innerNext() {
        // Render skps
        while (fCurrentSKP < fSKPs.count()) {
            const SkString& path = fSKPs[fCurrentSKP++];
            SkAutoTUnref<SkPicture> pic;
            if (!ReadPicture(path.c_str(), &pic)) {
                continue;
            }

            SkString name = SkOSPath::Basename(path.c_str());
            return new VisualSKPBench(name.c_str(), pic.get());
        }

        return nullptr;
    }

    SkTArray<SkString> fSKPs;   // Paths of every candidate SKP file.
    int fCurrentSKP;            // Index into fSKPs of the next file to try.
};

// Holds the GPU render-target surface a benchmark draws into, plus the GL context used to
// flush, swap and finish GPU work around each timed frame.
//
// fGL is null until init() succeeds in obtaining a context; every method that touches it
// tolerates the null state.
struct GPUTarget {
    // Makes the GL context current and waits for any previously submitted GPU work.
    void setup() {
        if (fGL) {
            fGL->makeCurrent();
            // Make sure we're done with whatever came before.
            SK_GL(*fGL, Finish());
        }
    }

    // Hook called at the start of the timed region; currently returns the canvas unchanged.
    SkCanvas* beginTiming(SkCanvas* canvas) { return canvas; }

    // Flushes GL and then either swaps buffers (usePlatformSwapBuffers == true) or lets the
    // context wait on its sync / swap.
    void endTiming(bool usePlatformSwapBuffers) {
        if (fGL) {
            SK_GL(*fGL, Flush());
            if (usePlatformSwapBuffers) {
                fGL->swapBuffers();
            } else {
                fGL->waitOnSyncOrSwap();
            }
        }
    }

    // Blocks until all submitted GL work has completed.
    void finish() {
        if (fGL) {
            SK_GL(*fGL, Finish());
        }
    }

    // Writes the maximum GPU frame lag to *maxFrameLag, falling back to --gpuFrameLag when the
    // context cannot report it (or there is no context). Always returns true.
    bool needsFrameTiming(int* maxFrameLag) const {
        if (!fGL || !fGL->getMaxGpuFrameLag(maxFrameLag)) {
            // Frame lag is unknown.
            *maxFrameLag = FLAGS_gpuFrameLag;
        }
        return true;
    }

    // Creates the render-target surface for 'bench' (clamped to the context's maximum render
    // target size) and records the GL context. Returns false if no GrContext could be obtained
    // or the surface could not be created.
    bool init(Benchmark* bench, GrContextFactory* factory, bool useDfText,
              GrContextFactory::GLContextType ctxType,
              GrContextFactory::GLContextOptions ctxOptions, int numSamples) {
        GrContext* context = factory->get(ctxType, ctxOptions);
        if (!context) {
            // Without this check the caps() call below would dereference a null pointer.
            SkDebugf("Could not create a GrContext.\n");
            return false;
        }
        int maxRTSize = context->caps()->maxRenderTargetSize();
        SkImageInfo info = SkImageInfo::Make(SkTMin(bench->getSize().fX, maxRTSize),
                                             SkTMin(bench->getSize().fY, maxRTSize),
                                             kN32_SkColorType, kPremul_SkAlphaType);
        uint32_t flags = useDfText ? SkSurfaceProps::kUseDeviceIndependentFonts_Flag : 0;
        SkSurfaceProps props(flags, SkSurfaceProps::kLegacyFontHost_InitType);
        fSurface.reset(SkSurface::NewRenderTarget(context,
                                                  SkBudgeted::kNo, info,
                                                  numSamples, &props));
        fGL = factory->getContextInfo(ctxType, ctxOptions).fGLContext;
        if (!fSurface.get()) {
            return false;
        }

        // Kilobench should only be used on platforms with fence sync support
        SkASSERT(fGL->fenceSyncSupport());
        return true;
    }

    // Returns the surface's canvas, or nullptr if no surface has been created.
    SkCanvas* getCanvas() const {
        if (!fSurface.get()) {
            return nullptr;
        }
        return fSurface->getCanvas();
    }

    // Reads the canvas contents back into 'bmp'. Returns false if there is no canvas or the
    // read-back fails.
    bool capturePixels(SkBitmap* bmp) {
        SkCanvas* canvas = this->getCanvas();
        if (!canvas) {
            return false;
        }
        bmp->setInfo(canvas->imageInfo());
        if (!canvas->readPixels(bmp, 0, 0)) {
            SkDebugf("Can't read canvas pixels.\n");
            return false;
        }
        return true;
    }

    SkGLContext* gl() { return fGL; }

private:
    SkGLContext* fGL = nullptr;          // Set by init(); null until then.
    SkAutoTDelete<SkSurface> fSurface;
};

// Reads back the target's canvas and writes it to 'filename' as a PNG, creating the parent
// directory first. Returns true only if every step succeeds.
static bool write_canvas_png(GPUTarget* target, const SkString& filename) {
    if (filename.isEmpty()) {
        return false;
    }

    // A canvas with an unknown color type cannot be encoded.
    SkCanvas* canvas = target->getCanvas();
    if (canvas && kUnknown_SkColorType == canvas->imageInfo().colorType()) {
        return false;
    }

    SkBitmap pixels;
    if (!target->capturePixels(&pixels)) {
        return false;
    }

    const SkString parentDir = SkOSPath::Dirname(filename.c_str());
    if (!sk_mkdir(parentDir.c_str())) {
        SkDebugf("Can't make dir %s.\n", parentDir.c_str());
        return false;
    }

    SkFILEWStream out(filename.c_str());
    if (!out.isValid()) {
        SkDebugf("Can't write %s.\n", filename.c_str());
        return false;
    }

    const bool encoded =
            SkImageEncoder::EncodeStream(&out, pixels, SkImageEncoder::kPNG_Type, 100);
    if (!encoded) {
        SkDebugf("Can't encode a PNG.\n");
    }
    return encoded;
}

// Maps the magic "run forever" request (any negative loop count) to SK_MaxS32; other values
// pass through unchanged.
static int detect_forever_loops(int loops) {
    return (loops < 0) ? SK_MaxS32 : loops;
}

// Restricts 'loops' to the range [1, FLAGS_maxLoops], logging whenever a clamp happens.
static int clamp_loops(int loops) {
    const bool tooFew = loops < 1;
    if (!tooFew && loops <= FLAGS_maxLoops) {
        return loops;
    }

    if (tooFew) {
        SkDebugf("ERROR: clamping loops from %d to 1. "
                 "There's probably something wrong with the bench.\n", loops);
        return 1;
    }

    SkDebugf("WARNING: clamping loops from %d to FLAGS_maxLoops, %d.\n", loops, FLAGS_maxLoops);
    return FLAGS_maxLoops;
}

// Current time in milliseconds, derived from SkTime's nanosecond clock.
static double now_ms() {
    const double nanos = SkTime::GetNSecs();
    return nanos * 1e-6;
}

// Helper run on a background thread to measure elapsed time between pairs of GPU fences.
//
// The main thread inserts a "start" fence and an "end" fence around each frame (pushStartSync /
// pushEndSync) and then calls setDone(); each of those three calls signals fSemaphore once.
// timingLoop() consumes those three signals per iteration, waits on both fences, and records the
// time between them in fTimings.
struct TimingThread {
    // 'mainContext' supplies the fence-sync object and is passed to SkCreatePlatformGLContext
    // when the timing thread creates its own context.
    TimingThread(SkGLContext* mainContext)
        : fFenceSync(mainContext->fenceSync())
        ,  fMainContext(mainContext)
        ,  fDone(false) {}

    // Thread entry point; 'data' must point to a TimingThread.
    static void Loop(void* data) {
        TimingThread* timingThread = reinterpret_cast<TimingThread*>(data);
        timingThread->timingLoop();
    }

    // To ensure waiting for the sync actually does something, we check to make sure that we
    // exceed some small value. The comparison is against now_ms(), so the unit is milliseconds.
    const double kMinElapsed = 1e-6;
    bool sanity(double start) const {
        double elapsed = now_ms() - start;
        return elapsed > kMinElapsed;
    }

    // Waits on 'sync' through the fence-sync object. The start time is only captured inside
    // SkDEBUGCODE, and the elapsed-time check is an SkASSERT.
    void waitFence(SkPlatformGpuFence sync) {
        SkDEBUGCODE(double start = now_ms());
        fFenceSync->waitFence(sync, false);
        SkASSERT(sanity(start));
    }

    // Body of the timing thread: records one entry in fTimings per start/end fence pair until
    // isDone() reports true.
    void timingLoop() {
        // Create a context which shares display lists with the main thread
        SkAutoTDelete<SkGLContext> glContext(SkCreatePlatformGLContext(kNone_GrGLStandard,
                                                                       fMainContext));
        glContext->makeCurrent();

        // Basic timing methodology is:
        // 1) Wait on semaphore until main thread indicates it's time to start timing the frame
        // 2) Wait on frame start sync, record time.  This is start of the frame.
        // 3) Wait on semaphore until main thread indicates it's time to finish timing the frame
        // 4) Wait on frame end sync, record time.  FrameEndTime - FrameStartTime = frame time
        // 5) Wait on semaphore until main thread indicates we should time the next frame or quit
        while (true) {
            fSemaphore.wait();

            // get start sync
            SkPlatformGpuFence startSync = this->popStartSync();

            // wait on sync
            this->waitFence(startSync);
            double start = kilobench::now_ms();

            // do we want to sleep here?
            // wait for end sync
            fSemaphore.wait();

            // get end sync
            SkPlatformGpuFence endSync = this->popEndSync();

            // wait on sync
            this->waitFence(endSync);
            double elapsed = kilobench::now_ms() - start;

            // No mutex needed, client won't touch timings until we're done
            fTimings.push_back(elapsed);

            // clean up fences
            fFenceSync->deleteFence(startSync);
            fFenceSync->deleteFence(endSync);

            // Step 5: this wait is paired with the signal issued by setDone().
            fSemaphore.wait();
            if (this->isDone()) {
                break;
            }
        }
    }

    // Inserts a fence marking the start of a frame and signals the semaphore.
    void pushStartSync() { this->pushSync(&fFrameStartSyncs, &fFrameStartSyncsMutex); }

    // Removes and returns a queued frame-start fence.
    SkPlatformGpuFence popStartSync() {
        return this->popSync(&fFrameStartSyncs, &fFrameStartSyncsMutex);
    }

    // Inserts a fence marking the end of a frame and signals the semaphore.
    void pushEndSync() { this->pushSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); }

    // Removes and returns a queued frame-end fence.
    SkPlatformGpuFence popEndSync() { return this->popSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); }

    // Sets fDone under fDoneMutex and signals the semaphore once.
    void setDone() {
        SkAutoMutexAcquire done(fDoneMutex);
        fDone = true;
        fSemaphore.signal();
    }

    typedef SkTLList<SkPlatformGpuFence, 1> SyncQueue;

    // Inserts a new fence, stores it in 'queue' while holding 'mutex', then signals the
    // semaphore.
    // NOTE(review): entries are added with addToHead() and removed with head()/popHead(), which
    // looks like last-in-first-out ordering if more than one fence is ever queued - confirm this
    // is intended.
    void pushSync(SyncQueue* queue, SkMutex* mutex) {
        SkAutoMutexAcquire am(mutex);
        *queue->addToHead() = fFenceSync->insertFence();
        fSemaphore.signal();
    }

    // Removes and returns the head entry of 'queue' while holding 'mutex'. There is no emptiness
    // check here; callers rely on a preceding semaphore wait.
    SkPlatformGpuFence popSync(SyncQueue* queue, SkMutex* mutex) {
        SkAutoMutexAcquire am(mutex);
        SkPlatformGpuFence sync = *queue->head();
        queue->popHead();
        return sync;
    }

    // True once setDone() has been called and no frame-start fences remain queued. Takes
    // fFrameStartSyncsMutex first, then fDoneMutex.
    bool isDone() {
        SkAutoMutexAcquire am1(fFrameStartSyncsMutex);
        SkAutoMutexAcquire done(fDoneMutex);
        if (fDone && fFrameStartSyncs.isEmpty()) {
            return true;
        } else {
            return false;
        }
    }

    // Recorded frame times in milliseconds. Asserts fDone; note that fDone is read here without
    // holding fDoneMutex.
    const SkTArray<double>& timings() const { SkASSERT(fDone); return fTimings; }

private:
    SkGpuFenceSync* fFenceSync;       // Obtained from the main context in the constructor.
    SkSemaphore fSemaphore;           // Signaled by pushSync() and setDone(); waited on in timingLoop().
    SkMutex fFrameStartSyncsMutex;    // Guards fFrameStartSyncs.
    SyncQueue fFrameStartSyncs;
    SkMutex fFrameEndSyncsMutex;      // Guards fFrameEndSyncs.
    SyncQueue fFrameEndSyncs;
    SkTArray<double> fTimings;        // One entry per timed frame; written only by timingLoop().
    SkMutex fDoneMutex;               // Guards fDone in setDone() and isDone().
    SkGLContext* fMainContext;
    bool fDone;
};

// Draws 'bench' for 'loops' iterations into the target's canvas and returns the elapsed time in
// milliseconds measured on this thread. When 'timingThread' is non-null, start/end fences are
// also queued for it around the frame and it is signaled once the frame has been submitted.
static double time(int loops, Benchmark* bench, GPUTarget* target, TimingThread* timingThread) {
    const bool useTimingThread = (nullptr != timingThread);

    SkCanvas* canvas = target->getCanvas();
    canvas->clear(SK_ColorWHITE);
    bench->preDraw(canvas);

    if (useTimingThread) {
        timingThread->pushStartSync();
    }

    // Timed region: draw, flush, then swap (or wait on sync when there is no timing thread).
    const double begin = now_ms();
    canvas = target->beginTiming(canvas);
    bench->draw(loops, canvas);
    canvas->flush();
    target->endTiming(useTimingThread);
    const double cpuMs = now_ms() - begin;

    if (useTimingThread) {
        timingThread->pushEndSync();
        timingThread->setDone();
    }

    bench->postDraw(canvas);
    return cpuMs;
}

// Determines how many loops of 'bench' to run per frame and performs warmup frames.
//
// If the bench (via --loops) requests auto-tuning, the loop count is doubled until one frame
// takes at least FLAGS_gpuMs, then scaled back linearly and clamped. Otherwise the requested
// count is used, with negative values meaning "run forever". In both cases
// (maxGpuFrameLag - 1) warmup frames are drawn before returning.
//
// TODO For now we don't use the background timing thread to tune loops
static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameLag) {
    // First, figure out how many loops it'll take to get a frame up to FLAGS_gpuMs.
    int loops = bench->calculateLoops(FLAGS_loops);
    if (kAutoTuneLoops == loops) {
        loops = 1;
        double elapsed = 0;
        do {
            if (1<<30 == loops) {
                // We're about to wrap.  Something's wrong with the bench.
                loops = 0;
                break;
            }
            loops *= 2;
            // If the GPU lets frames lag at all, we need to make sure we're timing
            // _this_ round, not still timing last round.
            for (int i = 0; i < maxGpuFrameLag; i++) {
                elapsed = time(loops, bench, target, nullptr);
            }
        } while (elapsed < FLAGS_gpuMs);

        // We've overshot at least a little.  Scale back linearly.
        if (elapsed > 0) {
            loops = static_cast<int>(ceil(loops * FLAGS_gpuMs / elapsed));
        } else {
            // Nothing was timed (e.g. maxGpuFrameLag <= 0). Dividing by zero would produce
            // NaN/inf, and converting that to int is undefined; let clamp_loops() report it.
            loops = 0;
        }
        loops = clamp_loops(loops);

        // Make sure we're not still timing our calibration.
        target->finish();
    } else {
        loops = detect_forever_loops(loops);
    }

    // Pretty much the same deal as the calibration: do some warmup to make
    // sure we're timing steady-state pipelined frames.
    for (int i = 0; i < maxGpuFrameLag - 1; i++) {
        time(loops, bench, target, nullptr);
    }

    return loops;
}

// Bundles everything needed to run one benchmark: a fresh GrContextFactory, a GPUTarget
// initialized for the bench, and the bench's per-canvas setup. Callers must invoke
// teardownBench() themselves once they are finished.
struct AutoSetupContextBenchAndTarget {
    AutoSetupContextBenchAndTarget(Benchmark* bench) : fBenchmark(bench) {
        GrContextOptions defaultOptions;
        fCtxFactory.reset(new GrContextFactory(defaultOptions));

        const bool useDfText = false;
        const int numSamples = 0;
        SkAssertResult(fTarget.init(bench, fCtxFactory, useDfText,
                                    GrContextFactory::kNative_GLContextType,
                                    GrContextFactory::kNone_GLContextOptions, numSamples));

        fCanvas = fTarget.getCanvas();
        fTarget.setup();

        bench->perCanvasPreDraw(fCanvas);
        fTarget.needsFrameTiming(&fMaxFrameLag);
    }

    // Tunes (or looks up) the loop count for this bench and runs the warmup frames.
    int getLoops() {
        return setup_gpu_bench(&fTarget, fBenchmark, fMaxFrameLag);
    }

    // Draws fMaxFrameLag frames first, then returns the per-loop time of one further frame.
    double timeSample(int loops, TimingThread* timingThread) {
        int framesBeforeMeasured = fMaxFrameLag;
        while (framesBeforeMeasured-- > 0) {
            time(loops, fBenchmark, &fTarget, timingThread);
        }

        const double frameMs = time(loops, fBenchmark, &fTarget, timingThread);
        return frameMs / loops;
    }

    void teardownBench() {
        fBenchmark->perCanvasPostDraw(fCanvas);
    }

    // Declaration order is kept as-is: members are destroyed in reverse order, so fTarget goes
    // before fCtxFactory.
    SkAutoTDelete<GrContextFactory> fCtxFactory;
    GPUTarget fTarget;
    SkCanvas* fCanvas;
    Benchmark* fBenchmark;
    int fMaxFrameLag;
};

// Sets up a context and target for 'bench', determines its loop count, and, when --writePath is
// set, saves the rendered canvas to <writePath>/gpu/<bench name>.png. Returns the loop count.
int setup_loops(Benchmark* bench) {
    AutoSetupContextBenchAndTarget ascbt(bench);
    const int tunedLoops = ascbt.getLoops();
    ascbt.teardownBench();

    const bool wantPng = !FLAGS_writePath.isEmpty() && FLAGS_writePath[0];
    if (wantPng) {
        const SkString gpuDir = SkOSPath::Join(FLAGS_writePath[0], "gpu");
        SkString pngFilename = SkOSPath::Join(gpuDir.c_str(), bench->getUniqueName());
        pngFilename.append(".png");
        write_canvas_png(&ascbt.fTarget, pngFilename);
    }

    return tunedLoops;
}

// One timing sample for a benchmark, in milliseconds.
// Both fields default to zero so a Sample is never read uninitialized (fGpu is only filled in
// when the background timing thread is used).
struct Sample {
    double fCpu = 0;  // Time measured on the thread that issues the draw calls.
    double fGpu = 0;  // Minimum time between frame fences recorded by the timing thread.
};

// Runs one timing sample of 'bench' with the given loop count in a freshly created context.
//
// fCpu is the per-loop time measured on this thread. fGpu is the minimum time recorded by the
// background timing thread, or 0 when --useBackgroundThread is false.
Sample time_sample(Benchmark* bench, int loops) {
    AutoSetupContextBenchAndTarget ascbt(bench);

    // Assign both fields up front: callers serialize and store fGpu even when the background
    // thread is disabled, so it must never be left uninitialized.
    Sample sample;
    sample.fCpu = 0;
    sample.fGpu = 0;
    if (FLAGS_useBackgroundThread) {
        TimingThread timingThread(ascbt.fTarget.gl());
        SkAutoTDelete<SkThread> nativeThread(new SkThread(TimingThread::Loop, &timingThread));
        nativeThread->start();
        sample.fCpu = ascbt.timeSample(loops, &timingThread);
        nativeThread->join();

        // return the min
        double min = SK_ScalarMax;
        for (int i = 0; i < timingThread.timings().count(); i++) {
            min = SkTMin(min, timingThread.timings()[i]);
        }
        sample.fGpu = min;
    } else {
        sample.fCpu = ascbt.timeSample(loops, nullptr);
    }

    ascbt.teardownBench();

    return sample;
}

} // namespace kilobench

// Maximum size, in bytes, of a result message passed over the pipe from a child process to the
// parent in kilobench_main().
static const int kOutResultSize = 1024;

void printResult(const SkTArray<double>& samples, int loops, const char* name, const char* mod) {
    SkString newName(name);
    newName.appendf("_%s", mod);
    Stats stats(samples);
    const double stddev_percent = 100 * sqrt(stats.var) / stats.mean;
    SkDebugf("%d\t%s\t%s\t%s\t%s\t%.0f%%\t%s\t%s\t%s\n"
        , loops
        , HUMANIZE(stats.min)
        , HUMANIZE(stats.median)
        , HUMANIZE(stats.mean)
        , HUMANIZE(stats.max)
        , stddev_percent
        , stats.plot.c_str()
        , "gpu"
        , newName.c_str()
    );
}

// Runs every matching benchmark and prints a CPU row (and, with --useBackgroundThread, a GPU
// row) per bench.
//
// Each bench runs FLAGS_samples + 1 iterations: iteration 0 determines the loop count, and the
// rest each collect one timing sample. With --useMultiProcess every iteration runs in a forked
// child that reports its result to the parent as a NUL-terminated text message over a pipe.
int kilobench_main() {
    kilobench::BenchmarkStream benchStream;

    SkDebugf("loops\tmin\tmedian\tmean\tmax\tstddev\t%-*s\tconfig\tbench\n",
             FLAGS_samples, "samples");

    int descriptors[2];
    if (pipe(descriptors) != 0) {
        SkFAIL("Failed to open a pipe\n");
    }

    while (Benchmark* b = benchStream.next()) {
        SkAutoTDelete<Benchmark> bench(b);

        int loops = 1;
        SkTArray<double> cpuSamples;
        SkTArray<double> gpuSamples;
        for (int i = 0; i < FLAGS_samples + 1; i++) {
            // We fork off a new process to setup the grcontext and run the test while we wait
            if (FLAGS_useMultiProcess) {
                int childPid = fork();
                if (childPid > 0) {
                    // One spare byte so the message can always be NUL-terminated, even if the
                    // terminator never arrived.
                    char result[kOutResultSize + 1];
                    ssize_t bytesRead = read(descriptors[0], result, kOutResultSize);
                    if (bytesRead < 0) {
                        SkFAIL("Failed to read from pipe\n");
                    }
                    result[bytesRead] = '\0';

                    // if samples == 0 then parse # of loops
                    // else parse float
                    if (i == 0) {
                        if (sscanf(result, "%d", &loops) != 1) {
                            SkFAIL("Failed to parse loop count from child process\n");
                        }
                    } else {
                        // Parse into locals so a malformed message can never leave
                        // uninitialized entries in the sample arrays.
                        double cpu = 0;
                        double gpu = 0;
                        if (sscanf(result, "%lf %lf", &cpu, &gpu) != 2) {
                            SkFAIL("Failed to parse sample from child process\n");
                        }
                        cpuSamples.push_back(cpu);
                        gpuSamples.push_back(gpu);
                    }

                    // wait until exit
                    int status;
                    waitpid(childPid, &status, 0);
                } else if (0 == childPid) {
                    char result[kOutResultSize];
                    if (i == 0) {
                        snprintf(result, kOutResultSize, "%d", kilobench::setup_loops(bench));
                    } else {
                        kilobench::Sample sample = kilobench::time_sample(bench, loops);
                        snprintf(result, kOutResultSize, "%lf %lf", sample.fCpu, sample.fGpu);
                    }

                    // Make sure to write the null terminator
                    if (write(descriptors[1], result, strlen(result) + 1) < 0) {
                        SkFAIL("Failed to write to pipe\n");
                    }
                    // The child's work is done; returning ends the child process via main().
                    return 0;
                } else {
                    SkFAIL("Fork failed\n");
                }
            } else {
                if (i == 0) {
                    loops = kilobench::setup_loops(bench);
                } else {
                    kilobench::Sample sample = kilobench::time_sample(bench, loops);
                    cpuSamples.push_back(sample.fCpu);
                    gpuSamples.push_back(sample.fGpu);
                }
            }
        }

        printResult(cpuSamples, loops, bench->getUniqueName(), "cpu");
        if (FLAGS_useBackgroundThread) {
            printResult(gpuSamples, loops, bench->getUniqueName(), "gpu");
        }
    }

    // Release both ends of the pipe now that no more children will be forked.
    close(descriptors[0]);
    close(descriptors[1]);
    return 0;
}

#if !defined SK_BUILD_FOR_IOS
// Program entry point (compiled out when SK_BUILD_FOR_IOS is defined): parses the command-line
// flags, then runs the benchmarks.
int main(int argc, char** argv) {
    SkCommandLineFlags::Parse(argc, argv);
    const int exitCode = kilobench_main();
    return exitCode;
}
#endif