Metal: Hold refs for input buffers from bindBuffer calls.

Mirrors what we have in Vulkan and Direct3D. Also adds command buffer tracking, again like Vulkan and Direct3D. Change-Id: I2280d92274d81830aec7950afc64a0147e38c317 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/305396 Reviewed-by: Greg Daniel <egdaniel@google.com> Commit-Queue: Jim Van Verth <jvanverth@google.com>
2020-07-23 11:39:35 -04:00 · 2020-07-23 11:39:35 -04:00 · a75dc33a67
commit a75dc33a67
parent 90787fefef
7 changed files with 106 additions and 52 deletions
--- a/src/gpu/mtl/GrMtlBuffer.h
+++ b/src/gpu/mtl/GrMtlBuffer.h
@ -25,7 +25,6 @@ public:

    id<MTLBuffer> mtlBuffer() const { return fMtlBuffer; }
    size_t offset() const { return fOffset; }
-    void bind(); // for initial binding of XferGpuToCpu buffers

 protected:
    GrMtlBuffer(GrMtlGpu*, size_t size, GrGpuBufferType intendedType, GrAccessPattern);
--- a/src/gpu/mtl/GrMtlBuffer.mm
+++ b/src/gpu/mtl/GrMtlBuffer.mm
@ -35,21 +35,26 @@ GrMtlBuffer::GrMtlBuffer(GrMtlGpu* gpu, size_t size, GrGpuBufferType intendedTyp
        : INHERITED(gpu, size, intendedType, accessPattern)
        , fIsDynamic(accessPattern != kStatic_GrAccessPattern)
        , fOffset(0) {
-    // In most cases, we'll allocate dynamic buffers when we map them, below.
-    if (!fIsDynamic) {
-        NSUInteger options = 0;
-        if (@available(macOS 10.11, iOS 9.0, *)) {
+    NSUInteger options = 0;
+    if (@available(macOS 10.11, iOS 9.0, *)) {
+        if (fIsDynamic) {
+#ifdef SK_BUILD_FOR_MAC
+            options |= MTLResourceStorageModeManaged;
+#else
+            options |= MTLResourceStorageModeShared;
+#endif
+        } else {
            options |= MTLResourceStorageModePrivate;
        }
-#ifdef SK_BUILD_FOR_MAC
-        // Mac requires 4-byte alignment for copies so we need
-        // to ensure we have space for the extra data
-        size = SkAlign4(size);
-#endif
-        fMtlBuffer = size == 0 ? nil :
-                [gpu->device() newBufferWithLength: size
-                                           options: options];
    }
+#ifdef SK_BUILD_FOR_MAC
+    // Mac requires 4-byte alignment for copies so we need
+    // to ensure we have space for the extra data
+    size = SkAlign4(size);
+#endif
+    fMtlBuffer = size == 0 ? nil :
+            [gpu->device() newBufferWithLength: size
+                                       options: options];
    this->registerWithCache(SkBudgeted::kYes);
    VALIDATE();
 }
@ -60,11 +65,6 @@ GrMtlBuffer::~GrMtlBuffer() {
    SkASSERT(fMapPtr == nullptr);
 }

-void GrMtlBuffer::bind() {
-    SkASSERT(fIsDynamic && GrGpuBufferType::kXferGpuToCpu == this->intendedType());
-    fMtlBuffer = this->mtlGpu()->resourceProvider().getDynamicBuffer(this->size(), &fOffset);
-}
-
 bool GrMtlBuffer::onUpdateData(const void* src, size_t srcInBytes) {
    if (!fIsDynamic) {
        if (fMtlBuffer == nil) {
@ -122,9 +122,6 @@ void GrMtlBuffer::internalMap(size_t sizeInBytes) {
    VALIDATE();
    SkASSERT(!this->isMapped());
    if (fIsDynamic) {
-        if (GrGpuBufferType::kXferGpuToCpu != this->intendedType()) {
-            fMtlBuffer = this->mtlGpu()->resourceProvider().getDynamicBuffer(sizeInBytes, &fOffset);
-        }
        fMappedBuffer = fMtlBuffer;
        fMapPtr = static_cast<char*>(fMtlBuffer.contents) + fOffset;
    } else {
--- a/src/gpu/mtl/GrMtlCommandBuffer.h
+++ b/src/gpu/mtl/GrMtlCommandBuffer.h
@ -11,15 +11,16 @@
 #import <Metal/Metal.h>

 #include "include/core/SkRefCnt.h"
+#include "src/gpu/GrBuffer.h"
 #include "src/gpu/mtl/GrMtlUtil.h"

 class GrMtlGpu;
 class GrMtlPipelineState;
 class GrMtlOpsRenderPass;

-class GrMtlCommandBuffer {
+class GrMtlCommandBuffer : public SkRefCnt {
 public:
-    static GrMtlCommandBuffer* Create(id<MTLCommandQueue> queue);
+    static sk_sp<GrMtlCommandBuffer> Make(id<MTLCommandQueue> queue);
    ~GrMtlCommandBuffer();

    void commit(bool waitUntilCompleted);
@ -33,10 +34,16 @@ public:
        [fCmdBuffer addCompletedHandler:block];
    }

+    void addGrBuffer(sk_sp<const GrBuffer> buffer) {  // Stores a ref to |buffer| on this object.
+        fTrackedGrBuffers.push_back(std::move(buffer));  // Emptied again in the destructor.
+    }
+
    void encodeSignalEvent(id<MTLEvent>, uint64_t value) SK_API_AVAILABLE(macos(10.14), ios(12.0));
    void encodeWaitForEvent(id<MTLEvent>, uint64_t value) SK_API_AVAILABLE(macos(10.14), ios(12.0));

 private:
+    static const int kInitialTrackedResourcesCount = 32;
+
    GrMtlCommandBuffer(id<MTLCommandBuffer> cmdBuffer)
        : fCmdBuffer(cmdBuffer)
        , fPreviousRenderPassDescriptor(nil) {}
@ -47,6 +54,8 @@ private:
    id<MTLBlitCommandEncoder>   fActiveBlitCommandEncoder;
    id<MTLRenderCommandEncoder> fActiveRenderCommandEncoder;
    MTLRenderPassDescriptor*    fPreviousRenderPassDescriptor;
+
+    SkSTArray<kInitialTrackedResourcesCount, sk_sp<const GrBuffer>> fTrackedGrBuffers;
 };

 #endif
--- a/src/gpu/mtl/GrMtlCommandBuffer.mm
+++ b/src/gpu/mtl/GrMtlCommandBuffer.mm
@ -15,7 +15,7 @@
 #error This file must be compiled with Arc. Use -fobjc-arc flag
 #endif

-GrMtlCommandBuffer* GrMtlCommandBuffer::Create(id<MTLCommandQueue> queue) {
+sk_sp<GrMtlCommandBuffer> GrMtlCommandBuffer::Make(id<MTLCommandQueue> queue) {
    id<MTLCommandBuffer> mtlCommandBuffer;
    mtlCommandBuffer = [queue commandBuffer];
    if (nil == mtlCommandBuffer) {
@ -24,11 +24,12 @@ GrMtlCommandBuffer* GrMtlCommandBuffer::Create(id<MTLCommandQueue> queue) {

    mtlCommandBuffer.label = @"GrMtlCommandBuffer::Create";

-    return new GrMtlCommandBuffer(mtlCommandBuffer);
+    return sk_sp<GrMtlCommandBuffer>(new GrMtlCommandBuffer(mtlCommandBuffer));
 }

 GrMtlCommandBuffer::~GrMtlCommandBuffer() {
    this->endAllEncoding();
+    fTrackedGrBuffers.reset();  // Empties the array of buffer refs added via addGrBuffer().
    fCmdBuffer = nil;
 }

--- a/src/gpu/mtl/GrMtlGpu.h
+++ b/src/gpu/mtl/GrMtlGpu.h
@ -8,6 +8,7 @@
 #ifndef GrMtlGpu_DEFINED
 #define GrMtlGpu_DEFINED

+#include "include/private/SkDeque.h"
 #include "src/gpu/GrFinishCallbacks.h"
 #include "src/gpu/GrGpu.h"
 #include "src/gpu/GrRenderTarget.h"
@ -15,6 +16,7 @@
 #include "src/gpu/GrTexture.h"

 #include "src/gpu/mtl/GrMtlCaps.h"
+#include "src/gpu/mtl/GrMtlCommandBuffer.h"
 #include "src/gpu/mtl/GrMtlResourceProvider.h"
 #include "src/gpu/mtl/GrMtlStencilAttachment.h"
 #include "src/gpu/mtl/GrMtlUtil.h"
@ -52,11 +54,6 @@ public:
        kSkip_SyncQueue
    };

-    // Commits the current command buffer to the queue and then creates a new command buffer. If
-    // sync is set to kForce_SyncQueue, the function will wait for all work in the committed
-    // command buffer to finish before returning.
-    void submitCommandBuffer(SyncQueue sync);
-
    void deleteBackendTexture(const GrBackendTexture&) override;

    bool compile(const GrProgramDesc&, const GrProgramInfo&) override;
@ -214,6 +211,13 @@ private:

    bool onSubmitToGpu(bool syncCpu) override;

+    // Commits the current command buffer to the queue and then creates a new command buffer. If
+    // sync is set to kForce_SyncQueue, the function will wait for all work in the committed
+    // command buffer to finish before returning.
+    void submitCommandBuffer(SyncQueue sync);
+
+    void checkForFinishedCommandBuffers();
+
    // Function that uploads data onto textures with private storage mode (GPU access only).
    bool uploadToTexture(GrMtlTexture* tex, int left, int top, int width, int height,
                         GrColorType dataColorType, const GrMipLevel texels[], int mipLevels);
@ -247,7 +251,16 @@ private:
    id<MTLDevice> fDevice;
    id<MTLCommandQueue> fQueue;

-    GrMtlCommandBuffer* fCmdBuffer;
+    sk_sp<GrMtlCommandBuffer> fCurrentCmdBuffer;
+
+    struct OutstandingCommandBuffer {  // A submitted command buffer paired with its fence.
+        OutstandingCommandBuffer(sk_sp<GrMtlCommandBuffer> commandBuffer, GrFence fence)
+            : fCommandBuffer(std::move(commandBuffer))
+            , fFence(fence) {}
+        sk_sp<GrMtlCommandBuffer> fCommandBuffer;  // Ref held for as long as this entry exists.
+        GrFence fFence;  // Passed to waitFence()/deleteFence() when checking for completion.
+    };
+    SkDeque fOutstandingCommandBuffers;

    std::unique_ptr<SkSL::Compiler> fCompiler;

--- a/src/gpu/mtl/GrMtlGpu.mm
+++ b/src/gpu/mtl/GrMtlGpu.mm
@ -112,12 +112,18 @@ sk_sp<GrGpu> GrMtlGpu::Make(GrDirectContext* direct, const GrContextOptions& opt
    return sk_sp<GrGpu>(new GrMtlGpu(direct, options, device, queue, featureSet));
 }

+// This constant determines how many OutstandingCommandBuffers are allocated together as a block in
+// the deque. As such it needs to balance allocating too much memory vs. incurring
+// allocation/deallocation thrashing. It should roughly correspond to the max number of outstanding
+// command buffers we expect to see.
+static const int kDefaultOutstandingAllocCnt = 8;
+
 GrMtlGpu::GrMtlGpu(GrDirectContext* direct, const GrContextOptions& options,
                   id<MTLDevice> device, id<MTLCommandQueue> queue, MTLFeatureSet featureSet)
        : INHERITED(direct)
        , fDevice(device)
        , fQueue(queue)
-        , fCmdBuffer(nullptr)
+        , fOutstandingCommandBuffers(sizeof(OutstandingCommandBuffer), kDefaultOutstandingAllocCnt)
        , fCompiler(new SkSL::Compiler())
        , fResourceProvider(this)
        , fDisconnected(false)
@ -135,24 +141,25 @@ GrMtlGpu::~GrMtlGpu() {
 void GrMtlGpu::disconnect(DisconnectType type) {
    INHERITED::disconnect(type);

-    if (DisconnectType::kCleanup == type) {
+    if (!fDisconnected) {
        this->destroyResources();
-    } else {
-        delete fCmdBuffer;
-        fCmdBuffer = nullptr;
-
-        fResourceProvider.destroyResources();
-
-        fQueue = nil;
-        fDevice = nil;
-
        fDisconnected = true;
    }
 }

 void GrMtlGpu::destroyResources() {
-    // Will implicitly delete the command buffer
    this->submitCommandBuffer(SyncQueue::kForce_SyncQueue);
+
+    // We used a placement new for each object in fOutstandingCommandBuffers, so we're responsible
+    // for calling the destructor on each of them as well.
+    while (!fOutstandingCommandBuffers.empty()) {
+        OutstandingCommandBuffer* buffer =
+                (OutstandingCommandBuffer*)fOutstandingCommandBuffers.front();
+        this->deleteFence(buffer->fFence);
+        buffer->~OutstandingCommandBuffer();
+        fOutstandingCommandBuffers.pop_front();
+    }
+
    fResourceProvider.destroyResources();

    fQueue = nil;
@ -175,18 +182,44 @@ void GrMtlGpu::submit(GrOpsRenderPass* renderPass) {
 }

 GrMtlCommandBuffer* GrMtlGpu::commandBuffer() {
-    if (!fCmdBuffer) {
-        fCmdBuffer = GrMtlCommandBuffer::Create(fQueue);
+    if (!fCurrentCmdBuffer) {
+        fCurrentCmdBuffer = GrMtlCommandBuffer::Make(fQueue);
+
+        // This should be done after we have a new command buffer in case the freeing of any
+        // resources held by a finished command buffer causes us to send a new command to the gpu
+        // (like changing the resource state).
+        this->checkForFinishedCommandBuffers();
    }
-    return fCmdBuffer;
+    return fCurrentCmdBuffer.get();
 }

 void GrMtlGpu::submitCommandBuffer(SyncQueue sync) {
-    if (fCmdBuffer) {
-        fResourceProvider.addBufferCompletionHandler(fCmdBuffer);
-        fCmdBuffer->commit(SyncQueue::kForce_SyncQueue == sync);
-        delete fCmdBuffer;
-        fCmdBuffer = nullptr;
+    // TODO: handle sync with empty command buffer
+    if (fCurrentCmdBuffer) {
+        fResourceProvider.addBufferCompletionHandler(fCurrentCmdBuffer.get());
+
+        GrFence fence = this->insertFence();
+        new (fOutstandingCommandBuffers.push_back()) OutstandingCommandBuffer(
+                fCurrentCmdBuffer, fence);
+
+        fCurrentCmdBuffer->commit(SyncQueue::kForce_SyncQueue == sync);
+        fCurrentCmdBuffer.reset();
+    }
+}
+
+void GrMtlGpu::checkForFinishedCommandBuffers() {
+    // Releases every outstanding command buffer whose fence has signaled. Entries are pushed in
+    // submission order (oldest at the front), so we test the front entry's fence, and if it has
+    // signaled we destroy that entry, pop it, and move on to the next one.
+    // We stop at the first command buffer that has not finished yet; everything queued after it
+    // was submitted later and is treated as unfinished too.
+    OutstandingCommandBuffer* front = (OutstandingCommandBuffer*)fOutstandingCommandBuffers.front();
+    while (front && this->waitFence(front->fFence)) {
+        // Entries were built with placement new, so we must run the destructor by hand.
+        this->deleteFence(front->fFence);
+        front->~OutstandingCommandBuffer();
+        fOutstandingCommandBuffers.pop_front();
+        front = (OutstandingCommandBuffer*)fOutstandingCommandBuffers.front();
     }
 }

@ -1254,7 +1287,6 @@ bool GrMtlGpu::onTransferPixelsFrom(GrSurface* surface, int left, int top, int w
    }

    GrMtlBuffer* grMtlBuffer = static_cast<GrMtlBuffer*>(transferBuffer);
-    grMtlBuffer->bind();

    size_t transBufferRowBytes = bpp * width;
    size_t transBufferImageBytes = transBufferRowBytes * height;
--- a/src/gpu/mtl/GrMtlOpsRenderPass.mm
+++ b/src/gpu/mtl/GrMtlOpsRenderPass.mm
@ -264,6 +264,7 @@ void GrMtlOpsRenderPass::onBindBuffers(sk_sp<const GrBuffer> indexBuffer,
        SkASSERT(!vertexBuffer->isCpuBuffer());
        SkASSERT(!static_cast<const GrGpuBuffer*>(vertexBuffer.get())->isMapped());
        fActiveVertexBuffer = std::move(vertexBuffer);
+        fGpu->commandBuffer()->addGrBuffer(fActiveVertexBuffer);
        ++inputBufferIndex;
    }
    if (instanceBuffer) {
@ -271,11 +272,13 @@ void GrMtlOpsRenderPass::onBindBuffers(sk_sp<const GrBuffer> indexBuffer,
        SkASSERT(!static_cast<const GrGpuBuffer*>(instanceBuffer.get())->isMapped());
        this->setVertexBuffer(fActiveRenderCmdEncoder, instanceBuffer.get(), 0, inputBufferIndex++);
        fActiveInstanceBuffer = std::move(instanceBuffer);
+        fGpu->commandBuffer()->addGrBuffer(fActiveInstanceBuffer);
    }
    if (indexBuffer) {
        SkASSERT(!indexBuffer->isCpuBuffer());
        SkASSERT(!static_cast<const GrGpuBuffer*>(indexBuffer.get())->isMapped());
        fActiveIndexBuffer = std::move(indexBuffer);
+        fGpu->commandBuffer()->addGrBuffer(fActiveIndexBuffer);
    }
 }