skia2/experimental/ddlbench/ddlbench.cpp
Robert Phillips 98f85d0e45 Add some tracking stats to ddlbench
Change-Id: I62d8cdec7dfe126513e616c9966e2a8eb994d95f
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/373880
Reviewed-by: Adlai Holler <adlai@google.com>
Commit-Queue: Robert Phillips <robertphillips@google.com>
2021-02-23 17:10:48 +00:00

390 lines
13 KiB
C++

// Copyright 2021 Google LLC.
// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
#include "include/core/SkCanvas.h"
#include "include/core/SkGraphics.h"
#include "include/gpu/GrDirectContext.h"
#include "tools/DDLPromiseImageHelper.h"
#include "tools/DDLTileHelper.h"
#include "tools/flags/CommandLineFlags.h"
#include "tools/gpu/GrContextFactory.h"
#include "tools/gpu/TestContext.h"
#include "src/sksl/SkSLDefines.h"
#include <chrono>
#include <deque>
#include <thread>
using hires_clock = std::chrono::high_resolution_clock;
using duration = std::chrono::nanoseconds;
using sk_gpu_test::ContextInfo;
using sk_gpu_test::GrContextFactory;
using sk_gpu_test::TestContext;
static DEFINE_int(ddlNumRecordingThreads, 1, "number of DDL recording threads");
static DEFINE_string(src, "", "input .skp file");
// Print a printf-style error message to stderr and terminate the process with
// a failure exit code. Marked [[noreturn]] so the compiler knows control never
// returns to the caller (e.g. after the error checks in create_shared_skp).
[[noreturn]] static void exitf(const char* format, ...) {
    va_list args;
    va_start(args, format);
    vfprintf(stderr, format, args);
    va_end(args);
    exit(1);
}
// Each thread holds this chunk of data thread_locally
struct ThreadInfo {
ThreadInfo() = default;
ThreadInfo(const SkString& name, GrDirectContext* directContext, TestContext* testContext)
: fName(name)
, fDirectContext(directContext)
, fTestContext(testContext) {
}
double elapsedWorkSeconds() const {
return std::chrono::duration<double>(fWorkElapsed).count();
}
void dump() const {
duration totalThreadTime = fThreadStop - fThreadStart;
double totalThreadTimeSeconds = std::chrono::duration<double>(totalThreadTime).count();
printf("%s: num work units %d work: %.2gs total: %.2gs utilization %.2g%%\n",
fName.c_str(),
fWorkUnit,
this->elapsedWorkSeconds(),
totalThreadTimeSeconds,
100.0f * this->elapsedWorkSeconds() / totalThreadTimeSeconds);
}
SkString fName;
// These two can be null on recording/utility threads
GrDirectContext* fDirectContext = nullptr;
TestContext* fTestContext = nullptr;
int fWorkUnit = 0;
duration fWorkElapsed {0};
hires_clock::time_point fThreadStart;
hires_clock::time_point fThreadStop;
};
#if SKSL_USE_THREAD_LOCAL

// Each worker thread stores a pointer to its own ThreadInfo here so that tasks
// running on the thread can retrieve their context and stats block.
static thread_local ThreadInfo* gThreadInfo;

static ThreadInfo* get_thread_local_info() {
    return gThreadInfo;
}

static void set_thread_local_info(ThreadInfo* threadInfo) {
    gThreadInfo = threadInfo;
}

#else

// Fallback for toolchains without usable C++ thread_local support (the
// SKSL_USE_THREAD_LOCAL switch comes in via src/sksl/SkSLDefines.h): emulate
// thread-local storage with a process-wide pthread key.
#include <pthread.h>

// Create the pthread key once (thread-safely, via C++ function-local static
// initialization) and return it. No destructor is registered: the ThreadInfo
// objects are owned elsewhere and must not be freed by TLS teardown.
static pthread_key_t get_pthread_key() {
    static pthread_key_t sKey = []{
        pthread_key_t key;
        int result = pthread_key_create(&key, /*destructor=*/nullptr);
        if (result != 0) {
            SK_ABORT("pthread_key_create failure: %d", result);
        }
        return key;
    }();
    return sKey;
}

static ThreadInfo* get_thread_local_info() {
    return static_cast<ThreadInfo*>(pthread_getspecific(get_pthread_key()));
}

// NOTE(review): the return value of pthread_setspecific is ignored; it can
// fail (e.g. ENOMEM) — consider checking it.
static void set_thread_local_info(ThreadInfo* threadInfo) {
    pthread_setspecific(get_pthread_key(), threadInfo);
}

#endif
// Thread start-up hook: make the GPU context current on this thread (only the
// GPU thread owns one), stamp the thread's start time, and publish the
// ThreadInfo through thread-local storage so tasks can find it later.
static void set_up_context_on_thread(ThreadInfo* threadInfo) {
    GrDirectContext* dContext = threadInfo->fDirectContext;
    if (dContext) {
        // Recording/utility threads have no direct context and skip this.
        threadInfo->fTestContext->makeCurrent();
    }

    threadInfo->fThreadStart = hires_clock::now();
    set_thread_local_info(threadInfo);
}
// TODO: upstream this back into SkThreadPool - the only difference is adding some initialization
// at the start of each thread and some thread_local data to hold the utility context/
//
// A FIFO thread pool whose worker threads run set_up_context_on_thread() on
// start-up and record per-thread timing stats into their ThreadInfo.
class GrThreadPool {
public:
    static std::unique_ptr<GrThreadPool> MakeFIFOThreadPool(SkSpan<ThreadInfo> threadInfo,
                                                            bool allowBorrowing = true) {
        return std::make_unique<GrThreadPool>(threadInfo, allowBorrowing);
    }

    // Spawns one worker thread per ThreadInfo entry.
    explicit GrThreadPool(SkSpan<ThreadInfo> threadInfo, bool allowBorrowing)
            : fAllowBorrowing(allowBorrowing) {
        for (size_t i = 0; i < threadInfo.size(); i++) {
            fThreads.emplace_back(&Loop, this, &threadInfo[i]);
        }
    }

    ~GrThreadPool() {
        // Signal each thread that it's time to shut down.
        for (int i = 0; i < fThreads.count(); i++) {
            this->add(nullptr);
        }
        // Wait for each thread to shut down.
        for (int i = 0; i < fThreads.count(); i++) {
            fThreads[i].join();
        }
    }

    void add(std::function<void(void)> work) {
        // Add some work to our pile of work to do.
        {
            SkAutoMutexExclusive lock(fWorkLock);
            fWork.emplace_back(std::move(work));
        }
        // Tell the Loop() threads to pick it up.
        fWorkAvailable.signal(1);
    }

    void borrow() {
        // If there is work waiting and we're allowed to borrow work, do it.
        if (fAllowBorrowing && fWorkAvailable.try_wait()) {
            SkAssertResult(this->do_work(nullptr));
        }
    }

private:
    // This method should be called only when fWorkAvailable indicates there's work to do.
    // 'threadInfo' is null when called via borrow() (i.e., from a thread that is not one of
    // this pool's workers), so every stats access must be guarded.
    bool do_work(ThreadInfo* threadInfo) {
        std::function<void(void)> work;
        {
            SkAutoMutexExclusive lock(fWorkLock);
            SkASSERT(!fWork.empty());  // TODO: if (fWork.empty()) { return true; } ?
            work = std::move(fWork.front());
            fWork.pop_front();
        }

        if (!work) {
            // Null-guard: a borrowing thread that dequeues the shutdown sentinel has no
            // ThreadInfo to stamp (previously this dereferenced null).
            if (threadInfo) {
                threadInfo->fThreadStop = hires_clock::now();
            }
            return false;  // This is Loop()'s signal to shut down.
        }

        hires_clock::time_point start = hires_clock::now();
        work();
        if (threadInfo) {
            // Accumulate (don't overwrite) so the stat covers all work units executed on
            // this thread, not just the most recent one.
            threadInfo->fWorkElapsed += hires_clock::now() - start;
            threadInfo->fWorkUnit++;
        }

        return true;
    }

    // Per-worker-thread loop: initialize, then pull work until the shutdown sentinel.
    static void Loop(void* ctx, ThreadInfo* threadInfo) {
        set_up_context_on_thread(threadInfo);

        auto pool = (GrThreadPool*)ctx;
        do {
            pool->fWorkAvailable.wait();
        } while (pool->do_work(threadInfo));
    }

    using WorkList = std::deque<std::function<void(void)>>;

    SkTArray<std::thread> fThreads;
    WorkList              fWork;
    SkMutex               fWorkLock;       // guards fWork
    SkSemaphore           fWorkAvailable;  // counts queued entries (incl. sentinels)
    bool                  fAllowBorrowing;
};
// A group of tasks scheduled on a shared GrThreadPool; analogous to SkTaskGroup.
// Tracks how many of its tasks are still outstanding so callers can wait for
// just this group's work to complete.
class GrTaskGroup : SkNoncopyable {
public:
    explicit GrTaskGroup(GrThreadPool& executor) : fPending(0), fExecutor(executor) {}

    ~GrTaskGroup() { this->wait(); }

    // Schedule 'fn' on the executor. The pending count is bumped before the
    // task is enqueued so done() cannot report true while it is in flight.
    void add(std::function<void(void)> fn) {
        fPending.fetch_add(+1, std::memory_order_relaxed);
        fExecutor.add([this, fn{std::move(fn)}] {
            fn();
            fPending.fetch_add(-1, std::memory_order_release);
        });
    }

    // True once every task previously add()ed to this group has run. The group
    // is safe to reuse after done() returns true.
    bool done() const {
        return 0 == fPending.load(std::memory_order_acquire);
    }

    // Spin until done(), lending a hand to the executor instead of blocking.
    // This lets task groups nest arbitrarily deep on a single executor: no
    // thread ever blocks waiting for others to do its work. (Work picked up
    // this way may belong to a different group — that's fine.)
    void wait() {
        for (;;) {
            if (this->done()) {
                break;
            }
            fExecutor.borrow();
        }
    }

private:
    std::atomic<int32_t> fPending;   // tasks added but not yet finished
    GrThreadPool&        fExecutor;  // pool the tasks run on (not owned)
};
// Create the primary (GPU-thread) context plus one context per utility thread,
// all in a share group with the primary. Failure to create the primary context
// is fatal. Utility-context creation is allowed to fail, but it must fail (or
// succeed) uniformly; returns false on a mixed outcome.
static bool create_contexts(GrContextFactory* factory,
                            GrContextFactory::ContextType contextType,
                            const GrContextFactory::ContextOverrides& overrides,
                            ThreadInfo* gpuThread,
                            SkSpan<ThreadInfo> utilityThreads) {
    ContextInfo primary = factory->getContextInfo(contextType, overrides);
    if (!primary.directContext()) {
        exitf("Could not create primary direct context.");
    }

    *gpuThread = { SkString("g0"), primary.directContext(), primary.testContext() };

    bool anySucceeded = false, anyFailed = false;
    for (size_t i = 0; i < utilityThreads.size(); ++i) {
        ContextInfo shared = factory->getSharedContextInfo(primary.directContext(), i);
        utilityThreads[i] = { SkStringPrintf("r%zu", i),
                              shared.directContext(),
                              shared.testContext() };
        if (shared.directContext()) {
            anySucceeded = true;
        } else {
            anyFailed = true;
        }
    }

    // Coherent iff we didn't get a mixture of successes and failures.
    return !(anySucceeded && anyFailed);
}
// Load and parse the .skp file at 'src' and run it through the promise-image
// helper's deflation step. Any failure terminates the process via exitf().
static sk_sp<SkPicture> create_shared_skp(const char* src,
                                          GrDirectContext* dContext,
                                          DDLPromiseImageHelper* promiseImageHelper) {
    SkString path(src);

    std::unique_ptr<SkStream> stream = SkStream::MakeFromFile(path.c_str());
    if (!stream) {
        exitf("failed to open file %s", path.c_str());
    }

    sk_sp<SkPicture> skp = SkPicture::MakeFromStream(stream.get());
    if (!skp) {
        exitf("failed to parse file %s", path.c_str());
    }

    // A null result from deflateSKP is fatal; the data itself isn't needed here.
    if (!promiseImageHelper->deflateSKP(skp.get())) {
        exitf("skp deflation failed %s", path.c_str());
    }

    // TODO: use the new shared promise images to just create one skp here
    return skp;
}
int main(int argc, char** argv) {
CommandLineFlags::Parse(argc, argv);
if (FLAGS_src.count() != 1) {
exitf("Missing input file");
}
// TODO: get these from the command line flags
const GrContextFactory::ContextType kContextType = GrContextFactory::kGL_ContextType;
const GrContextOptions kContextOptions;
const GrContextFactory::ContextOverrides kOverrides = GrContextFactory::ContextOverrides::kNone;
SkGraphics::Init();
GrContextFactory factory(kContextOptions);
std::unique_ptr<ThreadInfo> mainContext(new ThreadInfo);
std::unique_ptr<ThreadInfo[]> utilityContexts(new ThreadInfo[FLAGS_ddlNumRecordingThreads]);
if (!create_contexts(&factory,
kContextType,
kOverrides,
mainContext.get(),
SkSpan<ThreadInfo>(utilityContexts.get(), FLAGS_ddlNumRecordingThreads))) {
return 1;
}
mainContext->fTestContext->makeCurrent();
SkYUVAPixmapInfo::SupportedDataTypes supportedYUVADTypes(*mainContext->fDirectContext);
DDLPromiseImageHelper promiseImageHelper(supportedYUVADTypes);
sk_sp<SkPicture> skp = create_shared_skp(FLAGS_src[0],
mainContext->fDirectContext,
&promiseImageHelper);
promiseImageHelper.createCallbackContexts(mainContext->fDirectContext);
// TODO: do this later on a utility thread!
promiseImageHelper.uploadAllToGPU(nullptr, mainContext->fDirectContext);
mainContext->fTestContext->makeNotCurrent();
{
std::unique_ptr<GrThreadPool> fGPUExecutor(GrThreadPool::MakeFIFOThreadPool(
SkSpan<ThreadInfo>(mainContext.get(), 1),
false));
std::unique_ptr<GrThreadPool> fRecordingExecutor(GrThreadPool::MakeFIFOThreadPool(
SkSpan<ThreadInfo>(utilityContexts.get(),
FLAGS_ddlNumRecordingThreads),
false));
GrTaskGroup gpuTaskGroup(*fGPUExecutor);
GrTaskGroup recordingTaskGroup(*fRecordingExecutor);
for (int i = 0; i < FLAGS_ddlNumRecordingThreads; ++i) {
recordingTaskGroup.add([] {
ThreadInfo* threadLocal = get_thread_local_info();
printf("%s: dContext %p\n", threadLocal->fName.c_str(),
threadLocal->fDirectContext);
std::this_thread::sleep_for(std::chrono::seconds(1));
});
}
gpuTaskGroup.add([] {
ThreadInfo* threadLocal = get_thread_local_info();
printf("%s: dContext %p\n", threadLocal->fName.c_str(),
threadLocal->fDirectContext);
});
gpuTaskGroup.add([] {
ThreadInfo* threadLocal = get_thread_local_info();
threadLocal->fTestContext->makeNotCurrent();
});
recordingTaskGroup.wait();
gpuTaskGroup.wait();
}
mainContext->fTestContext->makeCurrent();
mainContext->dump();
for (int i = 0; i < FLAGS_ddlNumRecordingThreads; ++i) {
utilityContexts[i].dump();
}
return 0;
}