[ganesh][dawn] Support blocking on async WebGPU events in WASM

The `wgpu::Device::Tick()` function is not available when the Dawn backend is compiled using emsdk to target WebAssembly. As an alternative, this CL introduces code that takes advantage of emscripten's Asyncify feature to yield execution to the browser's event loop, allowing it to execute async tasks from within blocking Skia code. * Introduced the GrDawnAsyncWait class, which abstracts over Asyncify vs wgpu::Device::Tick depending on the platform and implements common busy-wait boilerplate. * Refactored the fence management in GrDawnGpu to make use of GrDawnAsyncWait. The GPUQueue.onSubmittedWorkDone event is now handled by a single callback on GrDawnGpu instead of per-fence callbacks, since the latter cannot easily prevent a use-after-free if a fence is destroyed before its callback runs. Bug: skia:12512 Change-Id: I255e92ec87c799dc7a50bd034a815c0aaca0ef5f Reviewed-on: https://skia-review.googlesource.com/c/skia/+/530736 Reviewed-by: Kevin Lubick <kjlubick@google.com> Commit-Queue: Arman Uguray <armansito@google.com>
2022-04-15 15:58:16 -07:00 · 2022-04-15 15:58:16 -07:00 · 0c3dda058c
commit 0c3dda058c
parent 1a62f30069
7 changed files with 192 additions and 49 deletions
--- a/gn/gpu.gni
+++ b/gn/gpu.gni
@ -708,6 +708,8 @@ skia_direct3d_sources = [
 skia_dawn_sources = [
  "$_include/gpu/dawn/GrDawnTypes.h",
  "$_include/private/gpu/ganesh/GrDawnTypesPriv.h",
+  "$_src/gpu/ganesh/dawn/GrDawnAsyncWait.cpp",
+  "$_src/gpu/ganesh/dawn/GrDawnAsyncWait.h",
  "$_src/gpu/ganesh/dawn/GrDawnAttachment.cpp",
  "$_src/gpu/ganesh/dawn/GrDawnAttachment.h",
  "$_src/gpu/ganesh/dawn/GrDawnBuffer.cpp",
--- a/src/gpu/ganesh/BUILD.bazel
+++ b/src/gpu/ganesh/BUILD.bazel
@ -205,6 +205,7 @@ cc_library(
 cc_library(
    name = "dawn_srcs",
    deps = [
+        "//src/gpu/ganesh/dawn:GrDawnAsyncWait_src",
        "//src/gpu/ganesh/dawn:GrDawnAttachment_src",
        "//src/gpu/ganesh/dawn:GrDawnBuffer_src",
        "//src/gpu/ganesh/dawn:GrDawnCaps_src",
--- a/src/gpu/ganesh/dawn/BUILD.bazel
+++ b/src/gpu/ganesh/dawn/BUILD.bazel
@ -75,6 +75,7 @@ generated_cc_atom(
    visibility = ["//:__subpackages__"],
    deps = [
        ":GrDawnRingBuffer_hdr",
+        "//include/private:SkTHash_hdr",
        "//src/core:SkLRUCache_hdr",
        "//src/gpu/ganesh:GrFinishCallbacks_hdr",
        "//src/gpu/ganesh:GrGpu_hdr",
@ -90,6 +91,7 @@ generated_cc_atom(
    srcs = ["GrDawnGpu.cpp"],
    visibility = ["//:__subpackages__"],
    deps = [
+        ":GrDawnAsyncWait_hdr",
        ":GrDawnAttachment_hdr",
        ":GrDawnBuffer_hdr",
        ":GrDawnCaps_hdr",
@ -315,3 +317,20 @@ generated_cc_atom(
    visibility = ["//:__subpackages__"],
    deps = [":GrDawnUtil_hdr"],
 )
+
+generated_cc_atom(
+    name = "GrDawnAsyncWait_hdr",
+    hdrs = ["GrDawnAsyncWait.h"],
+    visibility = ["//:__subpackages__"],
+    deps = ["//third_party:dawn"],
+)
+
+generated_cc_atom(
+    name = "GrDawnAsyncWait_src",
+    srcs = ["GrDawnAsyncWait.cpp"],
+    visibility = ["//:__subpackages__"],
+    deps = [
+        ":GrDawnAsyncWait_hdr",
+        "//include/core:SkTypes_hdr",
+    ],
+)
--- a/src/gpu/ganesh/dawn/GrDawnAsyncWait.cpp
+++ b/src/gpu/ganesh/dawn/GrDawnAsyncWait.cpp
@ -0,0 +1,52 @@
+/*
+ * Copyright 2022 Google LLC.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "src/gpu/ganesh/dawn/GrDawnAsyncWait.h"
+
+#include "include/core/SkTypes.h"
+
+#ifdef __EMSCRIPTEN__
+#include <emscripten.h>
+#endif  // __EMSCRIPTEN__
+
+namespace {
+
+#ifdef __EMSCRIPTEN__
+
+// When we use Dawn/WebGPU in a WebAssembly environment, we do not have access to
+// `wgpu::Device::Tick()`, which is only available to dawn_native. Here we emulate the same
+// behavior by scheduling and awaiting on a single async task, which will yield to the browser's
+// underlying event loop.
+//
+// This requires that Emscripten is configured with `-s ASYNCIFY` to work as expected.
+EM_ASYNC_JS(void, asyncSleep, (), {
+    await new Promise((resolve, _) => {
+        setTimeout(resolve, 0);
+    })
+});
+
+#endif  // __EMSCRIPTEN__
+
+}  // namespace
+
+GrDawnAsyncWait::GrDawnAsyncWait(const wgpu::Device& device) : fDevice(device), fSignaled(false) {}
+
+bool GrDawnAsyncWait::yieldAndCheck() const {
+    if (fSignaled.load()) {
+        return true;
+    }
+#ifdef __EMSCRIPTEN__
+    asyncSleep();
+#else
+    fDevice.Tick();
+#endif  // __EMSCRIPTEN__
+    return fSignaled.load();
+}
+
+void GrDawnAsyncWait::busyWait() const {
+    while (!this->yieldAndCheck()) {}
+}
--- a/src/gpu/ganesh/dawn/GrDawnAsyncWait.h
+++ b/src/gpu/ganesh/dawn/GrDawnAsyncWait.h
@ -0,0 +1,39 @@
+/*
+ * Copyright 2022 Google LLC.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef GrDawnAsyncWait_DEFINED
+#define GrDawnAsyncWait_DEFINED
+
+#include "webgpu/webgpu_cpp.h"
+
+#include <atomic>
+#include <functional>
+
+// Utility for monitoring the execution of an asynchronous Dawn-API event.
+class GrDawnAsyncWait final {
+public:
+    explicit GrDawnAsyncWait(const wgpu::Device& device);
+
+    // Returns true if the wait has been signaled and false otherwise. This function yields
+    // execution to the event loop where Dawn's asynchronous tasks get scheduled and returns
+    // as soon as the loop yields the execution back to the caller.
+    bool yieldAndCheck() const;
+
+    // Busy-waits until this wait has been signaled.
+    // TODO(armansito): This could benefit from a timeout in the case the wait never gets signaled.
+    void busyWait() const;
+
+    // Marks this wait as resolved. Once called, `yieldAndCheck` returns true immediately and
+    // `busyWait` returns without waiting.
+    void signal() { fSignaled.store(true); }
+
+private:
+    wgpu::Device fDevice;
+    std::atomic_bool fSignaled;
+};
+
+#endif  // GrDawnAsyncWait_DEFINED
--- a/src/gpu/ganesh/dawn/GrDawnGpu.cpp
+++ b/src/gpu/ganesh/dawn/GrDawnGpu.cpp
@ -23,6 +23,7 @@
 #include "src/gpu/ganesh/GrStencilSettings.h"
 #include "src/gpu/ganesh/GrTexture.h"
 #include "src/gpu/ganesh/GrThreadSafePipelineBuilder.h"
+#include "src/gpu/ganesh/dawn/GrDawnAsyncWait.h"
 #include "src/gpu/ganesh/dawn/GrDawnAttachment.h"
 #include "src/gpu/ganesh/dawn/GrDawnBuffer.h"
 #include "src/gpu/ganesh/dawn/GrDawnCaps.h"
@ -42,32 +43,6 @@

 static const int kMaxRenderPipelineEntries = 1024;

-namespace {
-
-class Fence {
-public:
-    Fence(const wgpu::Device& device)
-      : fDevice(device), fCalled(false) {
-        device.GetQueue().OnSubmittedWorkDone(0, callback, this);
-    }
-
-    static void callback(WGPUQueueWorkDoneStatus status, void* userData) {
-        Fence* fence = static_cast<Fence*>(userData);
-        fence->fCalled = true;
-    }
-
-    bool check() {
-        fDevice.Tick();
-        return fCalled;
-    }
-
-private:
-    wgpu::Device            fDevice;
-    bool                    fCalled;
-};
-
-}
-
 static wgpu::FilterMode to_dawn_filter_mode(GrSamplerState::Filter filter) {
    switch (filter) {
        case GrSamplerState::Filter::kNearest:
@ -538,8 +513,9 @@ void GrDawnGpu::checkForCompletedStagingBuffers() {
 }

 void GrDawnGpu::waitOnAllBusyStagingBuffers() {
+    GrDawnAsyncWait wait(fDevice);
    while (!fBusyStagingBuffers.empty()) {
-        fDevice.Tick();
+        wait.yieldAndCheck();
        this->checkForCompletedStagingBuffers();
    }
 }
@ -548,24 +524,33 @@ void GrDawnGpu::takeOwnershipOfBuffer(sk_sp<GrGpuBuffer> buffer) {
    fSubmittedStagingBuffers.push_back(std::move(buffer));
 }

-
-static void callback(WGPUQueueWorkDoneStatus status, void* userData) {
-    *static_cast<bool*>(userData) = true;
-}
-
 bool GrDawnGpu::onSubmitToGpu(bool syncCpu) {
    this->flushCopyEncoder();
+
    if (!fCommandBuffers.empty()) {
        fQueue.Submit(fCommandBuffers.size(), &fCommandBuffers.front());
        fCommandBuffers.clear();
    }

+    // Schedule the queue done callback if it hasn't been scheduled already and if we just submitted
+    // a new batch of recorded commands. If a callback was already registered in a prior call to
+    // onSubmitToGpu then it will include the commands we just submitted.
+    if (!fSubmittedWorkDoneCallbackPending) {
+        auto callback = [](WGPUQueueWorkDoneStatus status, void* userData) {
+            static_cast<GrDawnGpu*>(userData)->onSubmittedWorkDone(status);
+        };
+        fDevice.GetQueue().OnSubmittedWorkDone(0u, callback, this);
+        fSubmittedWorkDoneCallbackPending = true;
+    }
+
    this->moveStagingBuffersToBusyAndMapAsync();
    if (syncCpu) {
-        bool called = false;
-        fDevice.GetQueue().OnSubmittedWorkDone(0, callback, &called);
-        while (!called) {
-            fDevice.Tick();
+        // If no callback was scheduled then there is no pending work and we don't need to spin on a
+        // fence.
+        if (fSubmittedWorkDoneCallbackPending) {
+            GrDawnAsyncWait* fence = this->createFence();
+            fence->busyWait();
+            this->destroyFence(fence);
        }
        fFinishCallbacks.callAll(true);
    }
@ -575,6 +560,24 @@ bool GrDawnGpu::onSubmitToGpu(bool syncCpu) {
    return true;
 }

+void GrDawnGpu::onSubmittedWorkDone(WGPUQueueWorkDoneStatus status) {
+    fSubmittedWorkDoneCallbackPending = false;
+    fQueueFences.foreach([](GrDawnAsyncWait* fence) {
+        fence->signal();
+    });
+}
+
+GrDawnAsyncWait* GrDawnGpu::createFence() {
+    auto* fence = new GrDawnAsyncWait(fDevice);
+    fQueueFences.add(fence);
+    return fence;
+}
+
+void GrDawnGpu::destroyFence(GrDawnAsyncWait* fence) {
+    fQueueFences.remove(fence);
+    delete fence;
+}
+
 static wgpu::Texture get_dawn_texture_from_surface(GrSurface* src) {
    if (auto t = static_cast<GrDawnTexture*>(src->asTexture())) {
        return t->texture();
@ -606,10 +609,6 @@ bool GrDawnGpu::onCopySurface(GrSurface* dst,
    return true;
 }

-static void callback(WGPUBufferMapAsyncStatus status, void* userdata) {
-    *static_cast<bool*>(userdata) = true;
-}
-
 bool GrDawnGpu::onReadPixels(GrSurface* surface,
                             SkIRect rect,
                             GrColorType surfaceColorType,
@ -646,11 +645,17 @@ bool GrDawnGpu::onReadPixels(GrSurface* surface,
    this->getCopyEncoder().CopyTextureToBuffer(&srcTexture, &dstBuffer, &copySize);
    this->submitToGpu(true);

-    bool mapped = false;
-    buf.MapAsync(wgpu::MapMode::Read, 0, 0, callback, &mapped);
-    while (!mapped) {
-        device().Tick();
-    }
+    GrDawnAsyncWait wait(fDevice);
+    buf.MapAsync(
+            wgpu::MapMode::Read, 0, 0,
+            [](WGPUBufferMapAsyncStatus, void* userData) {
+                // TODO(armansito): Check and take action on the operation status instead of
+                // assuming success.
+                static_cast<GrDawnAsyncWait*>(userData)->signal();
+            },
+            &wait);
+    wait.busyWait();
+
    const void* readPixelsPtr = buf.GetConstMappedRange();

    if (rowBytes == origRowBytes) {
@ -807,15 +812,15 @@ void GrDawnGpu::submit(GrOpsRenderPass* renderPass) {
 }

 GrFence SK_WARN_UNUSED_RESULT GrDawnGpu::insertFence() {
-    return reinterpret_cast<GrFence>(new Fence(fDevice));
+    return reinterpret_cast<GrFence>(this->createFence());
 }

 bool GrDawnGpu::waitFence(GrFence fence) {
-    return reinterpret_cast<Fence*>(fence)->check();
+    return reinterpret_cast<const GrDawnAsyncWait*>(fence)->yieldAndCheck();
 }

 void GrDawnGpu::deleteFence(GrFence fence) {
-    delete reinterpret_cast<Fence*>(fence);
+    this->destroyFence(reinterpret_cast<GrDawnAsyncWait*>(fence));
 }

 std::unique_ptr<GrSemaphore> SK_WARN_UNUSED_RESULT GrDawnGpu::makeSemaphore(bool isOwned) {
@ -844,6 +849,14 @@ void GrDawnGpu::checkFinishProcs() {

 void GrDawnGpu::finishOutstandingGpuWork() {
    this->waitOnAllBusyStagingBuffers();
+
+    // If a callback is pending then any fence added here is guaranteed to get signaled when the
+    // callback eventually runs.
+    if (fSubmittedWorkDoneCallbackPending) {
+        GrDawnAsyncWait* fence = this->createFence();
+        fence->busyWait();
+        this->destroyFence(fence);
+    }
 }

 std::unique_ptr<GrSemaphore> GrDawnGpu::prepareTextureForCrossContextUsage(GrTexture* texture) {
--- a/src/gpu/ganesh/dawn/GrDawnGpu.h
+++ b/src/gpu/ganesh/dawn/GrDawnGpu.h
@ -10,16 +10,18 @@

 #include "src/gpu/ganesh/GrGpu.h"

-#include "webgpu/webgpu_cpp.h"
+#include "include/private/SkTHash.h"
 #include "src/core/SkLRUCache.h"
 #include "src/gpu/ganesh/GrFinishCallbacks.h"
 #include "src/gpu/ganesh/GrProgramDesc.h"
 #include "src/gpu/ganesh/GrStagingBufferManager.h"
 #include "src/gpu/ganesh/dawn/GrDawnRingBuffer.h"
 #include "src/sksl/ir/SkSLProgram.h"
+#include "webgpu/webgpu_cpp.h"

 #include <unordered_map>

+class GrDawnAsyncWait;
 class GrDawnOpsRenderPass;
 class GrDawnStagingBuffer;
 class GrDirectContext;
@ -213,6 +215,10 @@ private:
                                        GrXferBarrierFlags renderPassXferBarriers) override;

    bool onSubmitToGpu(bool syncCpu) override;
+    void onSubmittedWorkDone(WGPUQueueWorkDoneStatus status);
+
+    GrDawnAsyncWait* createFence();
+    void destroyFence(GrDawnAsyncWait* fence);

    void uploadTextureData(GrColorType srcColorType, const GrMipLevel texels[], int mipLevelCount,
                           const SkIRect& rect, wgpu::Texture texture);
@ -226,6 +232,7 @@ private:
    GrDawnRingBuffer                                fUniformRingBuffer;
    wgpu::CommandEncoder                            fCopyEncoder;
    std::vector<wgpu::CommandBuffer>                fCommandBuffers;
+
    GrStagingBufferManager                          fStagingBufferManager;
    std::list<sk_sp<GrGpuBuffer>>                   fBusyStagingBuffers;
    // Temporary array of staging buffers to hold refs on the staging buffers between detaching
@ -233,6 +240,16 @@ private:
    // submission.
    std::vector<sk_sp<GrGpuBuffer>>                 fSubmittedStagingBuffers;

+    // Every time command buffers are submitted to the queue (in onSubmitToGpu) we register a single
+    // OnSubmittedWorkDone callback which is responsible for signaling all fences added via
+    // `insertFence`.
+    //
+    // NOTE: We use this approach instead of registering an individual callback for each
+    // fence because Dawn currently does not support unregistering a callback to prevent a potential
+    // use-after-free.
+    bool fSubmittedWorkDoneCallbackPending = false;
+    SkTHashSet<GrDawnAsyncWait*> fQueueFences;
+
    struct ProgramDescHash {
        uint32_t operator()(const GrProgramDesc& desc) const {
            return SkOpts::hash_fn(desc.asKey(), desc.keyLength(), 0);