[graphite] Use finished callbacks in nanobench to manage frames.

In nanobench we want to try to simulate a GPU's swapbuffering and not get too far ahead on the CPU. Thus we use finished callbacks to know if we get more than 3 frames ahead of the GPU. This CL adds support for Graphite to do this. Bug: skia:12974 Change-Id: I8be505c5769399dcc0f5954f9f999f4448633647 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/525186 Reviewed-by: Michael Ludwig <michaelludwig@google.com> Commit-Queue: Greg Daniel <egdaniel@google.com> Reviewed-by: Jim Van Verth <jvanverth@google.com>
2022-03-28 15:27:44 -04:00 · 2022-03-28 15:27:44 -04:00 · 5d67b1797a
commit 5d67b1797a
parent b2af4f4e8d
8 changed files with 105 additions and 29 deletions
--- a/bench/nanobench.cpp
+++ b/bench/nanobench.cpp
@ -256,7 +256,7 @@ struct GPUTarget : public Target {
            this->contextInfo.testContext()->flushAndWaitOnSync(contextInfo.directContext());
        }
    }
-    void fence() override { this->contextInfo.testContext()->finish(); }
+    void syncCPU() override { this->contextInfo.testContext()->finish(); }

    bool needsFrameTiming(int* maxFrameLag) const override {
        if (!this->contextInfo.testContext()->getMaxGpuFrameLag(maxFrameLag)) {
@ -297,20 +297,16 @@ struct GPUTarget : public Target {
 #ifdef SK_GRAPHITE_ENABLED
 struct GraphiteTarget : public Target {
    explicit GraphiteTarget(const Config& c) : Target(c) {}
-    using ContextInfo = skiatest::graphite::ContextFactory::ContextInfo;
+    using TestContext = skiatest::graphite::GraphiteTestContext;
    using ContextFactory = skiatest::graphite::ContextFactory;

    std::unique_ptr<ContextFactory> factory;

+    TestContext* testContext;
    skgpu::Context* context;
    std::unique_ptr<skgpu::Recorder> recorder;

-    ~GraphiteTarget() override {
-        // TODO: We need to get the ref counting correct for MtlPipeline and MTLDepthStencilState
-        // since right now they live on the Recorder. Until then make sure the Context has finished
-        // all its work.
-        this->fence();
-    }
+    ~GraphiteTarget() override {}

    void setup() override {}

@ -318,17 +314,15 @@ struct GraphiteTarget : public Target {
        if (context && recorder) {
            std::unique_ptr<skgpu::Recording> recording = this->recorder->snap();
            if (recording) {
-                skgpu::InsertRecordingInfo info;
-                info.fRecording = recording.get();
-                this->context->insertRecording(info);
+                this->testContext->submitRecordingAndWaitOnSync(this->context, recording.get());
            }
-            context->submit(skgpu::SyncToCpu::kNo);
        }
    }
-    void fence() override {
+    void syncCPU() override {
        if (context && recorder) {
            // TODO: have a way to sync work with out submitting a Recording which is currently
-            // required.
+            // required. Probably need to get to the point where the backend command buffers are
+            // stored on the Context and not Recordings before this is feasible.
            std::unique_ptr<skgpu::Recording> recording = this->recorder->snap();
            if (recording) {
                skgpu::InsertRecordingInfo info;
@ -340,14 +334,7 @@ struct GraphiteTarget : public Target {
    }

    bool needsFrameTiming(int* maxFrameLag) const override {
-        // TODO
-#if 0
-        if (!this->contextInfo.testContext()->getMaxGpuFrameLag(maxFrameLag)) {
-            // Frame lag is unknown.
-            *maxFrameLag = FLAGS_gpuFrameLag;
-        }
-#endif
-        *maxFrameLag = FLAGS_gpuFrameLag;
+        SkAssertResult(this->testContext->getMaxGpuFrameLag(maxFrameLag));
        return true;
    }
    bool init(SkImageInfo info, Benchmark* bench) override {
@ -357,11 +344,12 @@ struct GraphiteTarget : public Target {
        // context options when we make the factory here.
        this->factory = std::make_unique<ContextFactory>();

-        auto [testContext, ctx] = this->factory->getContextInfo(this->config.graphiteCtxType);
+        auto [testCtx, ctx] = this->factory->getContextInfo(this->config.graphiteCtxType);
        if (!ctx) {
            return false;
        }
-        context = ctx;
+        this->testContext = testCtx;
+        this->context = ctx;

        this->recorder = this->context->makeRecorder();
        if (!this->recorder) {
@ -539,7 +527,7 @@ static int setup_gpu_bench(Target* target, Benchmark* bench, int maxGpuFrameLag)
        loops = clamp_loops(loops);

        // Make sure we're not still timing our calibration.
-        target->fence();
+        target->syncCPU();
    } else {
        loops = detect_forever_loops(loops);
    }
--- a/bench/nanobench.h
+++ b/bench/nanobench.h
@ -54,7 +54,7 @@ struct Target {
    /** Called between benchmarks (or between calibration and measured
        runs) to make sure all pending work in drivers / threads is
        complete. */
-    virtual void fence() { }
+    virtual void syncCPU() { }

    /** CPU-like targets can just be timed, but GPU-like
        targets need to pay attention to frame boundaries
--- a/experimental/graphite/include/Context.h
+++ b/experimental/graphite/include/Context.h
@ -74,6 +74,11 @@ public:
    void insertRecording(const InsertRecordingInfo&);
    void submit(SyncToCpu = SyncToCpu::kNo);

+    /**
+     * Checks whether any asynchronous work is complete and if so calls related callbacks.
+     */
+    void checkAsyncWorkCompletion();
+
    void preCompile(const PaintCombo&);

    /**
--- a/experimental/graphite/src/Context.cpp
+++ b/experimental/graphite/src/Context.cpp
@ -80,6 +80,10 @@ void Context::submit(SyncToCpu syncToCpu) {
    fGpu->checkForFinishedWork(syncToCpu);
 }

+void Context::checkAsyncWorkCompletion() {
+    fGpu->checkForFinishedWork(SyncToCpu::kNo);
+}
+
 void Context::preCompile(const PaintCombo& paintCombo) {
    static const Renderer* kRenderers[] = {
            &Renderer::StencilTessellatedCurvesAndTris(SkPathFillType::kWinding),
--- a/tools/gpu/FlushFinishTracker.cpp
+++ b/tools/gpu/FlushFinishTracker.cpp
@ -10,6 +10,10 @@
 #include "include/gpu/GrDirectContext.h"
 #include "src/core/SkTraceEvent.h"

+#ifdef SK_GRAPHITE_ENABLED
+#include "experimental/graphite/include/Context.h"
+#endif
+
 #include <chrono>

 namespace sk_gpu_test {
@ -19,7 +23,16 @@ void FlushFinishTracker::waitTillFinished() {
    auto begin = std::chrono::steady_clock::now();
    auto end = begin;
    while (!fIsFinished && (end - begin) < std::chrono::seconds(2)) {
-        fContext->checkAsyncWorkCompletion();
+        if (fContext) {
+            fContext->checkAsyncWorkCompletion();
+        } else {
+#ifdef SK_GRAPHITE_ENABLED
+            SkASSERT(fGraphiteContext);
+            fGraphiteContext->checkAsyncWorkCompletion();
+#else
+            SkDEBUGFAIL("No valid context");
+#endif
+        }
        end = std::chrono::steady_clock::now();
    }
    if (!fIsFinished) {
--- a/tools/gpu/FlushFinishTracker.h
+++ b/tools/gpu/FlushFinishTracker.h
@ -12,6 +12,10 @@

 class GrDirectContext;

+#ifdef SK_GRAPHITE_ENABLED
+namespace skgpu { class Context; }
+#endif
+
 namespace sk_gpu_test {

 class FlushFinishTracker : public SkRefCnt {
@ -23,13 +27,19 @@ public:
    }

    FlushFinishTracker(GrDirectContext* context) : fContext(context) {}
+#ifdef SK_GRAPHITE_ENABLED
+    FlushFinishTracker(skgpu::Context* context) : fGraphiteContext(context) {}
+#endif

    void setFinished() { fIsFinished = true; }

    void waitTillFinished();

 private:
-    GrDirectContext* fContext;
+    GrDirectContext* fContext = nullptr;
+#ifdef SK_GRAPHITE_ENABLED
+    skgpu::Context*  fGraphiteContext = nullptr;
+#endif

    // Currently we don't have the this bool be atomic cause all current uses of this class happen
    // on a single thread. In other words we call flush, checkAsyncWorkCompletion, and
--- a/tools/graphite/GraphiteTestContext.cpp
+++ b/tools/graphite/GraphiteTestContext.cpp
@ -7,10 +7,43 @@

 #include "tools/graphite/GraphiteTestContext.h"

+#include "experimental/graphite/include/Context.h"
+#include "experimental/graphite/include/GraphiteTypes.h"
+#include "experimental/graphite/include/Recording.h"
+#include "src/core/SkTraceEvent.h"
+#include "tools/gpu/FlushFinishTracker.h"
+
 namespace skiatest::graphite {

 GraphiteTestContext::GraphiteTestContext() {}

 GraphiteTestContext::~GraphiteTestContext() {}

+void GraphiteTestContext::submitRecordingAndWaitOnSync(skgpu::Context* context,
+                                                       skgpu::Recording* recording) {
+    TRACE_EVENT0("skia.gpu", TRACE_FUNC);
+    SkASSERT(context);
+    SkASSERT(recording);
+
+    if (fFinishTrackers[fCurrentFlushIdx]) {
+        fFinishTrackers[fCurrentFlushIdx]->waitTillFinished();
+    }
+
+    fFinishTrackers[fCurrentFlushIdx].reset(new sk_gpu_test::FlushFinishTracker(context));
+
+    // We add an additional ref to the current flush tracker here. This ref is owned by the
+    // finished callback passed to insertRecording, which will unref the tracker when called.
+    fFinishTrackers[fCurrentFlushIdx]->ref();
+
+    skgpu::InsertRecordingInfo info;
+    info.fRecording = recording;
+    info.fFinishedContext = fFinishTrackers[fCurrentFlushIdx].get();
+    info.fFinishedProc = sk_gpu_test::FlushFinishTracker::FlushFinished;
+    context->insertRecording(info);
+
+    context->submit(skgpu::SyncToCpu::kNo);
+
+    fCurrentFlushIdx = (fCurrentFlushIdx + 1) % SK_ARRAY_COUNT(fFinishTrackers);
+}
+
 }  // namespace skiatest::graphite
--- a/tools/graphite/GraphiteTestContext.h
+++ b/tools/graphite/GraphiteTestContext.h
@ -11,7 +11,12 @@
 #include "experimental/graphite/include/GraphiteTypes.h"
 #include "include/core/SkRefCnt.h"

-namespace skgpu { class Context; }
+namespace skgpu {
+class Context;
+class Recording;
+}
+
+namespace sk_gpu_test { class FlushFinishTracker; }

 namespace skiatest::graphite {

@ -30,7 +35,25 @@ public:

    virtual std::unique_ptr<skgpu::Context> makeContext() = 0;

+    bool getMaxGpuFrameLag(int *maxFrameLag) const {
+        *maxFrameLag = kMaxFrameLag;
+        return true;
+    }
+
+    /**
+     * This will insert a Recording and submit work to the GPU. Additionally, we will add a finished
+     * callback to our insert recording call. We allow ourselves to have kMaxFrameLag - 1 unfinished
+     * submissions active on the GPU at a time. If that many are already outstanding then we will
+     * wait on the CPU until the oldest one has finished.
+     */
+    void submitRecordingAndWaitOnSync(skgpu::Context*, skgpu::Recording*);
+
 protected:
+    static constexpr int kMaxFrameLag = 3;
+
+    sk_sp<sk_gpu_test::FlushFinishTracker> fFinishTrackers[kMaxFrameLag - 1];
+    int fCurrentFlushIdx = 0;
+
    GraphiteTestContext();
 };