From 6986c6539e8814fdc56e42a604fe8740238a0ee1 Mon Sep 17 00:00:00 2001
From: Brian Salomon <bsalomon@google.com>
Date: Thu, 12 Dec 2019 10:58:47 -0500
Subject: [PATCH] Make Gr[Op]MemoryPool allocate itself into its initial block.

Saves one heap allocation per DDL recorded.

Change-Id: I9393aedc3b48031cd2ea5f0160b107915077099a
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/259419
Commit-Queue: Brian Salomon <bsalomon@google.com>
Reviewed-by: Michael Ludwig <michaelludwig@google.com>
---
 bench/GrMemoryPoolBench.cpp    | 36 +++++++++++------
 src/gpu/GrMemoryPool.cpp       | 71 +++++++++++++++++++++++++---------
 src/gpu/GrMemoryPool.h         | 55 +++++++++++++-------------
 src/gpu/GrProcessor.cpp        |  2 +-
 src/gpu/GrRecordingContext.cpp |  2 +-
 tests/GrMemoryPoolTest.cpp     | 70 ++++++++++++++-------------------
 6 files changed, 137 insertions(+), 99 deletions(-)
diff --git a/bench/GrMemoryPoolBench.cpp b/bench/GrMemoryPoolBench.cpp
index 4d52662abd..f2486a5843 100644
--- a/bench/GrMemoryPoolBench.cpp
+++ b/bench/GrMemoryPoolBench.cpp
@@ -21,12 +21,16 @@
 struct A {
     int gStuff[10];
 #if OVERRIDE_NEW
-    void* operator new (size_t size) { return gBenchPool.allocate(size); }
-    void operator delete (void* mem) { if (mem) { return gBenchPool.release(mem); } }
+    void* operator new(size_t size) { return gBenchPool->allocate(size); }
+    void operator delete(void* mem) {
+        if (mem) {
+            return gBenchPool->release(mem);
+        }
+    }
 #endif
-    static GrMemoryPool gBenchPool;
+    static std::unique_ptr<GrMemoryPool> gBenchPool;
 };
-GrMemoryPool A::gBenchPool(10 * (1 << 10), 10 * (1 << 10));
+std::unique_ptr<GrMemoryPool> A::gBenchPool = GrMemoryPool::Make(10 * (1 << 10), 10 * (1 << 10));
 
 /**
  * This benchmark creates and deletes objects in stack order
@@ -83,12 +87,16 @@ private:
 struct B {
     int gStuff[10];
 #if OVERRIDE_NEW
-    void* operator new (size_t size) { return gBenchPool.allocate(size); }
-    void operator delete (void* mem) { if (mem) { return gBenchPool.release(mem); } }
+    void* operator new(size_t size) { return gBenchPool->allocate(size); }
+    void operator delete(void* mem) {
+        if (mem) {
+            return gBenchPool->release(mem);
+        }
+    }
 #endif
-    static GrMemoryPool gBenchPool;
+    static std::unique_ptr<GrMemoryPool> gBenchPool;
 };
-GrMemoryPool B::gBenchPool(10 * (1 << 10), 10 * (1 << 10));
+std::unique_ptr<GrMemoryPool> B::gBenchPool = GrMemoryPool::Make(10 * (1 << 10), 10 * (1 << 10));
 
 /**
  * This benchmark creates objects and deletes them in random order
@@ -128,12 +136,16 @@ private:
 struct C {
     int gStuff[10];
 #if OVERRIDE_NEW
-    void* operator new (size_t size) { return gBenchPool.allocate(size); }
-    void operator delete (void* mem) { if (mem) { return gBenchPool.release(mem); } }
+    void* operator new(size_t size) { return gBenchPool->allocate(size); }
+    void operator delete(void* mem) {
+        if (mem) {
+            return gBenchPool->release(mem);
+        }
+    }
 #endif
-    static GrMemoryPool gBenchPool;
+    static std::unique_ptr<GrMemoryPool> gBenchPool;
 };
-GrMemoryPool C::gBenchPool(10 * (1 << 10), 10 * (1 << 10));
+std::unique_ptr<GrMemoryPool> C::gBenchPool = GrMemoryPool::Make(10 * (1 << 10), 10 * (1 << 10));
 
 /**
  * This benchmark creates objects and deletes them in queue order
diff --git a/src/gpu/GrMemoryPool.cpp b/src/gpu/GrMemoryPool.cpp
index 8a92a5e384..a474469e21 100644
--- a/src/gpu/GrMemoryPool.cpp
+++ b/src/gpu/GrMemoryPool.cpp
@@ -18,26 +18,26 @@
     #define VALIDATE
 #endif
 
-void GrOpMemoryPool::release(std::unique_ptr<GrOp> op) {
-    GrOp* tmp = op.release();
-    SkASSERT(tmp);
-    tmp->~GrOp();
-    fMemoryPool.release(tmp);
+std::unique_ptr<GrMemoryPool> GrMemoryPool::Make(size_t preallocSize, size_t minAllocSize) {
+    preallocSize = std::max(preallocSize, kMinAllocationSize);
+    static constexpr size_t kPoolSize = GrSizeAlignUp(sizeof(GrMemoryPool), kAlignment);
+    size_t size = kPoolSize + preallocSize;
+    void* mem = operator new(size);
+    void* preallocStart = static_cast<char*>(mem) + kPoolSize;
+    return std::unique_ptr<GrMemoryPool>(
+            new (mem) GrMemoryPool(preallocStart, preallocSize, minAllocSize));
 }
 
-constexpr size_t GrMemoryPool::kSmallestMinAllocSize;
-
-GrMemoryPool::GrMemoryPool(size_t preallocSize, size_t minAllocSize) {
+GrMemoryPool::GrMemoryPool(void* preallocStart, size_t preallocSize, size_t minAllocSize) {
     SkDEBUGCODE(fAllocationCnt = 0);
     SkDEBUGCODE(fAllocBlockCnt = 0);
 
-    minAllocSize = SkTMax<size_t>(GrSizeAlignUp(minAllocSize, kAlignment), kSmallestMinAllocSize);
-    preallocSize = SkTMax<size_t>(GrSizeAlignUp(preallocSize, kAlignment), minAllocSize);
+    minAllocSize = std::max(minAllocSize, kMinAllocationSize);
 
     fMinAllocSize = minAllocSize;
     fSize = 0;
 
-    fHead = CreateBlock(preallocSize);
+    fHead = InitBlock(preallocStart, preallocSize);
     fTail = fHead;
     fHead->fNext = nullptr;
     fHead->fPrev = nullptr;
@@ -62,7 +62,7 @@ GrMemoryPool::~GrMemoryPool() {
     SkASSERT(0 == fAllocationCnt);
     SkASSERT(fHead == fTail);
     SkASSERT(0 == fHead->fLiveCount);
-    DeleteBlock(fHead);
+    SkASSERT(kAssignedMarker == fHead->fBlockSentinal);
 };
 
 void* GrMemoryPool::allocate(size_t size) {
@@ -71,7 +71,7 @@ void* GrMemoryPool::allocate(size_t size) {
     size = GrSizeAlignUp(size, kAlignment);
     if (fTail->fFreeSize < size) {
         size_t blockSize = size + kHeaderSize;
-        blockSize = SkTMax<size_t>(blockSize, fMinAllocSize);
+        blockSize = std::max(blockSize, fMinAllocSize);
         BlockHeader* block = CreateBlock(blockSize);
 
         block->fPrev = fTail;
@@ -149,11 +149,13 @@ void GrMemoryPool::release(void* p) {
 }
 
 GrMemoryPool::BlockHeader* GrMemoryPool::CreateBlock(size_t blockSize) {
-    blockSize = SkTMax<size_t>(blockSize, kHeaderSize);
-    BlockHeader* block =
-        reinterpret_cast<BlockHeader*>(sk_malloc_throw(blockSize));
-    // we assume malloc gives us aligned memory
-    SkASSERT(!(reinterpret_cast<intptr_t>(block) % kAlignment));
+    blockSize = std::max(blockSize, kHeaderSize);
+    return InitBlock(sk_malloc_throw(blockSize), blockSize);
+}
+
+auto GrMemoryPool::InitBlock(void* mem, size_t blockSize) -> BlockHeader* {
+    SkASSERT(!(reinterpret_cast<intptr_t>(mem) % kAlignment));
+    auto block = reinterpret_cast<BlockHeader*>(mem);
     SkDEBUGCODE(block->fBlockSentinal = kAssignedMarker);
     block->fLiveCount = 0;
     block->fFreeSize = blockSize - kHeaderSize;
@@ -215,3 +217,36 @@ void GrMemoryPool::validate() {
     SkASSERT(fAllocBlockCnt != 0 || fSize == 0);
 #endif
 }
+
+////////////////////////////////////////////////////////////////////////////////////////
+
+static constexpr size_t kOpPoolSize =
+        GrSizeAlignUp(sizeof(GrOpMemoryPool), GrMemoryPool::kAlignment);
+
+GrOpMemoryPool::~GrOpMemoryPool() { this->pool()->~GrMemoryPool(); }
+
+std::unique_ptr<GrOpMemoryPool> GrOpMemoryPool::Make(size_t preallocSize, size_t minAllocSize) {
+    preallocSize = std::max(preallocSize, GrMemoryPool::kMinAllocationSize);
+    static constexpr size_t kOpPoolSize =
+            GrSizeAlignUp(sizeof(GrOpMemoryPool), GrMemoryPool::kAlignment);
+    static constexpr size_t kPoolSize =
+            GrSizeAlignUp(sizeof(GrMemoryPool), GrMemoryPool::kAlignment);
+    size_t size = kOpPoolSize + kPoolSize + preallocSize;
+    void* mem = operator new(size);
+    void* memPoolPtr = static_cast<char*>(mem) + kOpPoolSize;
+    void* preallocStart = static_cast<char*>(mem) + kOpPoolSize + kPoolSize;
+    new (memPoolPtr) GrMemoryPool(preallocStart, preallocSize, minAllocSize);
+    return std::unique_ptr<GrOpMemoryPool>(new (mem) GrOpMemoryPool());
+}
+
+void GrOpMemoryPool::release(std::unique_ptr<GrOp> op) {
+    GrOp* tmp = op.release();
+    SkASSERT(tmp);
+    tmp->~GrOp();
+    this->pool()->release(tmp);
+}
+
+GrMemoryPool* GrOpMemoryPool::pool() const {
+    auto addr = reinterpret_cast<const char*>(this) + kOpPoolSize;
+    return reinterpret_cast<GrMemoryPool*>(const_cast<char*>(addr));
+}
diff --git a/src/gpu/GrMemoryPool.h b/src/gpu/GrMemoryPool.h
index 535ad9bb75..364f58e32e 100644
--- a/src/gpu/GrMemoryPool.h
+++ b/src/gpu/GrMemoryPool.h
@@ -21,23 +21,27 @@
  * requests. It is optimized for allocate / release speed over memory
  * efficiency. The interface is designed to be used to implement operator new
  * and delete overrides. All allocations are expected to be released before the
- * pool's destructor is called. Allocations will be 8-byte aligned.
+ * pool's destructor is called. Allocations will be aligned to
+ * alignof(std::max_align_t).
  */
 class GrMemoryPool {
 public:
+    // Guaranteed alignment of pointer returned by allocate().
+    static constexpr size_t kAlignment = alignof(std::max_align_t);
+    // Minimum size this class will allocate at once.
+    static constexpr size_t kMinAllocationSize = 1 << 10;
+
     /**
      * Prealloc size is the amount of space to allocate at pool creation
      * time and keep around until pool destruction. The min alloc size is
      * the smallest allowed size of additional allocations. Both sizes are
-     * adjusted to ensure that:
-     *   1. they are are 8-byte aligned
-     *   2. minAllocSize >= kSmallestMinAllocSize
-     *   3. preallocSize >= minAllocSize
+     * adjusted to ensure that they are at least as large as kMinAllocationSize.
      *
-     * Both sizes is what the pool will end up allocating from the system, and
+     * Both sizes are what the pool will end up allocating from the system, and
      * portions of the allocated memory is used for internal bookkeeping.
      */
-    GrMemoryPool(size_t preallocSize, size_t minAllocSize);
+    static std::unique_ptr<GrMemoryPool> Make(size_t preallocSize, size_t minAllocSize);
+    void operator delete(void* p) { ::operator delete(p); }
 
     ~GrMemoryPool();
 
@@ -66,15 +70,14 @@ public:
      */
     size_t preallocSize() const { return fHead->fSize; }
 
-    /**
-     * Minimum value of minAllocSize constructor argument.
-     */
-    constexpr static size_t kSmallestMinAllocSize = 1 << 10;
 
 private:
+    GrMemoryPool(void* preallocStart, size_t preallocSize, size_t minAllocSize);
+
     struct BlockHeader;
 
     static BlockHeader* CreateBlock(size_t size);
+    static BlockHeader* InitBlock(void* mem, size_t blockSize);
 
     static void DeleteBlock(BlockHeader* block);
 
@@ -115,39 +118,37 @@ private:
     SkTHashSet<int32_t>               fAllocatedIDs;
 #endif
 
-protected:
-    enum {
-        // We assume this alignment is good enough for everybody.
-        kAlignment    = 8,
-        kHeaderSize   = GrSizeAlignUp(sizeof(BlockHeader), kAlignment),
-        kPerAllocPad  = GrSizeAlignUp(sizeof(AllocHeader), kAlignment),
-    };
+    friend class GrOpMemoryPool;
+
+    static constexpr size_t kHeaderSize  = GrSizeAlignUp(sizeof(BlockHeader), kAlignment);
+    static constexpr size_t kPerAllocPad = GrSizeAlignUp(sizeof(AllocHeader), kAlignment);
 };
 
 class GrOp;
 
 class GrOpMemoryPool {
 public:
-    GrOpMemoryPool(size_t preallocSize, size_t minAllocSize)
-            : fMemoryPool(preallocSize, minAllocSize) {
-    }
+    static std::unique_ptr<GrOpMemoryPool> Make(size_t preallocSize, size_t minAllocSize);
+    void operator delete(void* p) { ::operator delete(p); }
+
+    ~GrOpMemoryPool();
 
     template <typename Op, typename... OpArgs>
     std::unique_ptr<Op> allocate(OpArgs&&... opArgs) {
-        char* mem = (char*) fMemoryPool.allocate(sizeof(Op));
+        auto mem = this->pool()->allocate(sizeof(Op));
         return std::unique_ptr<Op>(new (mem) Op(std::forward<OpArgs>(opArgs)...));
     }
 
-    void* allocate(size_t size) {
-        return fMemoryPool.allocate(size);
-    }
+    void* allocate(size_t size) { return this->pool()->allocate(size); }
 
     void release(std::unique_ptr<GrOp> op);
 
-    bool isEmpty() const { return fMemoryPool.isEmpty(); }
+    bool isEmpty() const { return this->pool()->isEmpty(); }
 
 private:
-    GrMemoryPool fMemoryPool;
+    GrMemoryPool* pool() const;
+
+    GrOpMemoryPool() = default;
 };
 
 #endif
diff --git a/src/gpu/GrProcessor.cpp b/src/gpu/GrProcessor.cpp
index aae1029750..4b7b2b1ec9 100644
--- a/src/gpu/GrProcessor.cpp
+++ b/src/gpu/GrProcessor.cpp
@@ -116,7 +116,7 @@ public:
 #endif
 
     GrMemoryPool* pool() const {
-        static GrMemoryPool* gPool = new GrMemoryPool(4096, 4096);
+        static GrMemoryPool* gPool = GrMemoryPool::Make(4096, 4096).release();
         return gPool;
     }
 };
diff --git a/src/gpu/GrRecordingContext.cpp b/src/gpu/GrRecordingContext.cpp
index aa7d1a1a6b..cf38a21669 100644
--- a/src/gpu/GrRecordingContext.cpp
+++ b/src/gpu/GrRecordingContext.cpp
@@ -122,7 +122,7 @@ GrOpMemoryPool* GrRecordingContext::opMemoryPool() {
         // DDL TODO: should the size of the memory pool be decreased in DDL mode? CPU-side memory
         // consumed in DDL mode vs. normal mode for a single skp might be a good metric of wasted
         // memory.
-        fOpMemoryPool = std::make_unique<GrOpMemoryPool>(16384, 16384);
+        fOpMemoryPool = GrOpMemoryPool::Make(16384, 16384);
     }
 
     return fOpMemoryPool.get();
diff --git a/tests/GrMemoryPoolTest.cpp b/tests/GrMemoryPoolTest.cpp
index 3eb10ce12c..f5b341eb08 100644
--- a/tests/GrMemoryPoolTest.cpp
+++ b/tests/GrMemoryPoolTest.cpp
@@ -27,7 +27,7 @@ public:
     virtual ~A() {}
 
     void* operator new(size_t size) {
-        if (!gPool.get()) {
+        if (!gPool) {
             return ::operator new(size);
         } else {
             return gPool->allocate(size);
@@ -35,7 +35,7 @@ public:
     }
 
     void operator delete(void* p) {
-        if (!gPool.get()) {
+        if (!gPool) {
             ::operator delete(p);
         } else {
             return gPool->release(p);
@@ -45,13 +45,10 @@ public:
     static A* Create(SkRandom* r);
 
     static void SetAllocator(size_t preallocSize, size_t minAllocSize) {
-        GrMemoryPool* pool = new GrMemoryPool(preallocSize, minAllocSize);
-        gPool.reset(pool);
+        gPool = GrMemoryPool::Make(preallocSize, minAllocSize);
     }
 
-    static void ResetAllocator() {
-        gPool.reset(nullptr);
-    }
+    static void ResetAllocator() { gPool.reset(); }
 
 private:
     static std::unique_ptr<GrMemoryPool> gPool;
@@ -246,9 +243,9 @@ private:
 };
 
 DEF_TEST(GrMemoryPoolAPI, reporter) {
-    constexpr size_t kSmallestMinAllocSize = GrMemoryPool::kSmallestMinAllocSize;
+    constexpr size_t kSmallestMinAllocSize = GrMemoryPool::kMinAllocationSize;
 
-    // Allocates memory until pool adds a new block (pool.size() changes).
+    // Allocates memory until the pool adds a new block (i.e. pool.size() changes).
     auto allocateMemory = [](GrMemoryPool& pool, AutoPoolReleaser& r) {
         size_t origPoolSize = pool.size();
         while (pool.size() == origPoolSize) {
@@ -256,65 +253,58 @@ DEF_TEST(GrMemoryPoolAPI, reporter) {
         }
     };
 
-    // Effective prealloc space capacity is >= kSmallestMinAllocSize.
+    // Effective prealloc space capacity is >= kMinAllocationSize.
     {
-        GrMemoryPool pool(0, 0);
-        REPORTER_ASSERT(reporter, pool.preallocSize() == kSmallestMinAllocSize);
+        auto pool = GrMemoryPool::Make(0, 0);
+        REPORTER_ASSERT(reporter, pool->preallocSize() == kSmallestMinAllocSize);
     }
 
-    // Effective prealloc space capacity is >= minAllocSize.
+    // Effective block size capacity >= kMinAllocationSize.
     {
-        constexpr size_t kMinAllocSize = kSmallestMinAllocSize * 2;
-        GrMemoryPool pool(kSmallestMinAllocSize, kMinAllocSize);
-        REPORTER_ASSERT(reporter, pool.preallocSize() == kMinAllocSize);
-    }
+        auto pool = GrMemoryPool::Make(kSmallestMinAllocSize, kSmallestMinAllocSize / 2);
+        AutoPoolReleaser r(*pool);
 
-    // Effective block size capacity >= kSmallestMinAllocSize.
-    {
-        GrMemoryPool pool(kSmallestMinAllocSize, kSmallestMinAllocSize / 2);
-        AutoPoolReleaser r(pool);
-
-        allocateMemory(pool, r);
-        REPORTER_ASSERT(reporter, pool.size() == kSmallestMinAllocSize);
+        allocateMemory(*pool, r);
+        REPORTER_ASSERT(reporter, pool->size() == kSmallestMinAllocSize);
     }
 
     // Pool allocates exactly preallocSize on creation.
     {
         constexpr size_t kPreallocSize = kSmallestMinAllocSize * 5;
-        GrMemoryPool pool(kPreallocSize, 0);
-        REPORTER_ASSERT(reporter, pool.preallocSize() == kPreallocSize);
+        auto pool = GrMemoryPool::Make(kPreallocSize, 0);
+        REPORTER_ASSERT(reporter, pool->preallocSize() == kPreallocSize);
     }
 
     // Pool allocates exactly minAllocSize when it expands.
     {
         constexpr size_t kMinAllocSize = kSmallestMinAllocSize * 7;
-        GrMemoryPool pool(0, kMinAllocSize);
-        AutoPoolReleaser r(pool);
+        auto pool = GrMemoryPool::Make(0, kMinAllocSize);
+        AutoPoolReleaser r(*pool);
 
-        allocateMemory(pool, r);
-        REPORTER_ASSERT(reporter, pool.size() == kMinAllocSize);
+        allocateMemory(*pool, r);
+        REPORTER_ASSERT(reporter, pool->size() == kMinAllocSize);
 
-        allocateMemory(pool, r);
-        REPORTER_ASSERT(reporter, pool.size() == 2 * kMinAllocSize);
+        allocateMemory(*pool, r);
+        REPORTER_ASSERT(reporter, pool->size() == 2 * kMinAllocSize);
     }
 
     // When asked to allocate amount > minAllocSize, pool allocates larger block
     // to accommodate all internal structures.
     {
         constexpr size_t kMinAllocSize = kSmallestMinAllocSize * 2;
-        GrMemoryPool pool(kSmallestMinAllocSize, kMinAllocSize);
-        AutoPoolReleaser r(pool);
+        auto pool = GrMemoryPool::Make(kSmallestMinAllocSize, kMinAllocSize);
+        AutoPoolReleaser r(*pool);
 
-        REPORTER_ASSERT(reporter, pool.size() == 0);
+        REPORTER_ASSERT(reporter, pool->size() == 0);
 
         constexpr size_t hugeSize = 10 * kMinAllocSize;
-        r.add(pool.allocate(hugeSize));
-        REPORTER_ASSERT(reporter, pool.size() > hugeSize);
+        r.add(pool->allocate(hugeSize));
+        REPORTER_ASSERT(reporter, pool->size() > hugeSize);
 
         // Block size allocated to accommodate huge request doesn't include any extra
         // space, so next allocation request allocates a new block.
-        size_t hugeBlockSize = pool.size();
-        r.add(pool.allocate(0));
-        REPORTER_ASSERT(reporter, pool.size() == hugeBlockSize + kMinAllocSize);
+        size_t hugeBlockSize = pool->size();
+        r.add(pool->allocate(0));
+        REPORTER_ASSERT(reporter, pool->size() == hugeBlockSize + kMinAllocSize);
     }
 }