diff --git a/dm/DMSrcSink.cpp b/dm/DMSrcSink.cpp
index 24dd6c08fb..2d6832e738 100644
--- a/dm/DMSrcSink.cpp
+++ b/dm/DMSrcSink.cpp
@@ -1813,13 +1813,11 @@ Result GPUDDLSink::ddlDraw(const Src& src,
 
     SkYUVAPixmapInfo::SupportedDataTypes supportedYUVADataTypes(*dContext);
     DDLPromiseImageHelper promiseImageHelper(supportedYUVADataTypes);
-    sk_sp<SkData> compressedPictureData = promiseImageHelper.deflateSKP(inputPicture.get());
-    if (!compressedPictureData) {
-        return Result::Fatal("GPUDDLSink: Couldn't deflate SkPicture");
+    sk_sp<SkPicture> newSKP = promiseImageHelper.recreateSKP(dContext, inputPicture.get());
+    if (!newSKP) {
+        return Result::Fatal("GPUDDLSink: Couldn't recreate the SKP");
     }
 
-    promiseImageHelper.createCallbackContexts(dContext);
-
     // 'gpuTestCtx/gpuThreadCtx' is being shifted to the gpuThread. Leave the main (this)
     // thread w/o a context.
     gpuTestCtx->makeNotCurrent();
@@ -1840,10 +1838,7 @@ Result GPUDDLSink::ddlDraw(const Src& src,
 
     tiles.createBackendTextures(gpuTaskGroup, dContext);
 
-    // Reinflate the compressed picture.
-    tiles.createSKP(dContext->threadSafeProxy(), compressedPictureData.get(), promiseImageHelper);
-
-    tiles.kickOffThreadedWork(recordingTaskGroup, gpuTaskGroup, dContext);
+    tiles.kickOffThreadedWork(recordingTaskGroup, gpuTaskGroup, dContext, newSKP.get());
 
     // We have to wait for the recording threads to schedule all their work on the gpu thread
     // before we can schedule the composition draw and the flush. Note that the gpu thread
diff --git a/experimental/ddlbench/ddlbench.cpp b/experimental/ddlbench/ddlbench.cpp
index 586beb000d..9bb09d10cb 100644
--- a/experimental/ddlbench/ddlbench.cpp
+++ b/experimental/ddlbench/ddlbench.cpp
@@ -285,14 +285,14 @@ static sk_sp<SkPicture> create_shared_skp(const char* src,
         exitf("failed to parse file %s", srcfile.c_str());
     }
 
-    sk_sp<SkData> compressedPictureData = promiseImageHelper->deflateSKP(skp.get());
-    if (!compressedPictureData) {
-        exitf("skp deflation failed %s", srcfile.c_str());
-    }
-
-    // TODO: use the new shared promise images to just create one skp here
-
-    return skp;
+    // Rebuild the picture with promise images in place of live images. 'recreateSKP' can
+    // return nullptr, so fail loudly here rather than handing a null picture to the caller.
+    sk_sp<SkPicture> newSKP = promiseImageHelper->recreateSKP(dContext, skp.get());
+    if (!newSKP) {
+        exitf("skp recreation failed %s", srcfile.c_str());
+    }
+
+    return newSKP;
 }
 
 static void check_params(GrDirectContext* dContext,
@@ -383,8 +383,6 @@ int main(int argc, char** argv) {
 
     check_params(mainContext->fDirectContext, width, height, ct, at, FLAGS_numSamples);
 
-    promiseImageHelper.createCallbackContexts(mainContext->fDirectContext);
-
     // TODO: do this later on a utility thread!
     promiseImageHelper.uploadAllToGPU(nullptr, mainContext->fDirectContext);
 
diff --git a/tools/DDLPromiseImageHelper.cpp b/tools/DDLPromiseImageHelper.cpp
index 1a4119cfbb..9514321b42 100644
--- a/tools/DDLPromiseImageHelper.cpp
+++ b/tools/DDLPromiseImageHelper.cpp
@@ -94,7 +94,8 @@ void PromiseImageCallbackContext::destroyBackendTexture() {
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-sk_sp<SkData> DDLPromiseImageHelper::deflateSKP(const SkPicture* inputPicture) {
+sk_sp<SkPicture> DDLPromiseImageHelper::recreateSKP(GrDirectContext* dContext,
+                                                    SkPicture* inputPicture) {
     SkSerialProcs procs;
 
     procs.fImageCtx = this;
@@ -107,7 +108,14 @@ sk_sp<SkData> DDLPromiseImageHelper::deflateSKP(const SkPicture* inputPicture) {
         return SkData::MakeWithCopy(&id, sizeof(id));
     };
 
-    return inputPicture->serialize(&procs);
+    sk_sp<SkData> compressedPictureData = inputPicture->serialize(&procs);
+    if (!compressedPictureData) {
+        return nullptr;
+    }
+
+    this->createCallbackContexts(dContext);
+
+    return this->reinflateSKP(dContext->threadSafeProxy(), compressedPictureData.get());
 }
 
 static GrBackendTexture create_yuva_texture(GrDirectContext* direct,
@@ -279,9 +287,8 @@ void DDLPromiseImageHelper::deleteAllFromGPU(SkTaskGroup* taskGroup, GrDirectCon
 
 sk_sp<SkPicture> DDLPromiseImageHelper::reinflateSKP(
                                                    sk_sp<GrContextThreadSafeProxy> threadSafeProxy,
-                                                   SkData* compressedPictureData,
-                                                   SkTArray<sk_sp<SkImage>>* promiseImages) const {
-    DeserialImageProcContext procContext { std::move(threadSafeProxy), this, promiseImages };
+                                                   SkData* compressedPictureData) {
+    DeserialImageProcContext procContext { std::move(threadSafeProxy), this };
 
     SkDeserialProcs procs;
     procs.fImageCtx = (void*) &procContext;
@@ -290,13 +297,12 @@ sk_sp<SkPicture> DDLPromiseImageHelper::reinflateSKP(
     return SkPicture::MakeFromData(compressedPictureData, &procs);
 }
 
-// This generates promise images to replace the indices in the compressed picture. This
-// reconstitution is performed separately in each thread so we end up with multiple
-// promise images referring to the same GrBackendTexture.
+// This generates promise images to replace the indices in the compressed picture.
 sk_sp<SkImage> DDLPromiseImageHelper::CreatePromiseImages(const void* rawData,
-                                                          size_t length, void* ctxIn) {
+                                                          size_t length,
+                                                          void* ctxIn) {
     DeserialImageProcContext* procContext = static_cast<DeserialImageProcContext*>(ctxIn);
-    const DDLPromiseImageHelper* helper = procContext->fHelper;
+    DDLPromiseImageHelper* helper = procContext->fHelper;
 
     SkASSERT(length == sizeof(int));
 
@@ -308,7 +314,7 @@ sk_sp<SkImage> DDLPromiseImageHelper::CreatePromiseImages(const void* rawData,
     const DDLPromiseImageHelper::PromiseImageInfo& curImage = helper->getInfo(*indexPtr);
 
     // If there is no callback context that means 'createCallbackContexts' determined the
-    // texture wouldn't fit on the GPU. Create a separate bitmap-backed image for each thread.
+    // texture wouldn't fit on the GPU. Create a bitmap-backed image.
     if (!curImage.isYUV() && !curImage.callbackContext(0)) {
         SkASSERT(curImage.baseLevel().isImmutable());
         return curImage.baseLevel().asImage();
@@ -361,7 +367,7 @@ sk_sp<SkImage> DDLPromiseImageHelper::CreatePromiseImages(const void* rawData,
                                             (void*)curImage.refCallbackContext(0).release());
         curImage.callbackContext(0)->wasAddedToImage();
     }
-    procContext->fPromiseImages->push_back(image);
+    helper->fPromiseImages.push_back(image);
     SkASSERT(image);
     return image;
 }
diff --git a/tools/DDLPromiseImageHelper.h b/tools/DDLPromiseImageHelper.h
index bdeaa75ae8..63ea0ec220 100644
--- a/tools/DDLPromiseImageHelper.h
+++ b/tools/DDLPromiseImageHelper.h
@@ -87,14 +87,13 @@ private:
 //
 // The way this works is:
 //    the original skp is converted to SkData and all its image info is extracted into this
-//       class and only indices into this class are left in the SkData (via deflateSKP)
+//       class and only indices into this class are left in the SkData
+//    the PromiseImageCallbackContexts are created for each image that will fit on the GPU
+//    the SkData is then reinflated into an SkPicture with promise images replacing all the indices
+//       (all in recreateSKP)
 //
-//    Prior to replaying in threads, all the images stored in this class are uploaded to the
-//       gpu and PromiseImageCallbackContexts are created for them (via uploadAllToGPU)
-//
-//    Each thread reinflates the SkData into an SkPicture replacing all the indices w/
-//       promise images (all using the same GrBackendTexture and getting a ref to the
-//       appropriate PromiseImageCallbackContext) (via reinflateSKP).
+//    Prior to replaying in threads, all the images are uploaded to the gpu
+//       (in uploadAllToGPU)
 //
 //    This class is then reset - dropping all of its refs on the PromiseImageCallbackContexts
 //
@@ -110,23 +109,24 @@ public:
             : fSupportedYUVADataTypes(supportedYUVADataTypes) {}
     ~DDLPromiseImageHelper() = default;
 
-    // Convert the SkPicture into SkData replacing all the SkImages with an index.
-    sk_sp<SkData> deflateSKP(const SkPicture* inputPicture);
-
-    void createCallbackContexts(GrDirectContext*);
+    // Convert the input SkPicture into a new one which has promise images rather than live
+    // images. Returns nullptr if the input picture cannot be serialized.
+    sk_sp<SkPicture> recreateSKP(GrDirectContext*, SkPicture*);
 
     void uploadAllToGPU(SkTaskGroup*, GrDirectContext*);
     void deleteAllFromGPU(SkTaskGroup*, GrDirectContext*);
 
-    // reinflate a deflated SKP, replacing all the indices with promise images.
-    sk_sp<SkPicture> reinflateSKP(sk_sp<GrContextThreadSafeProxy>,
-                                  SkData* compressedPicture,
-                                  SkTArray<sk_sp<SkImage>>* promiseImages) const;
-
-    // Remove this class' refs on the PromiseImageCallbackContexts
-    void reset() { fImageInfo.reset(); }
+    // Remove this class' refs on the promise images and the PromiseImageCallbackContexts
+    void reset() {
+        fImageInfo.reset();      // per-image info, incl. its callback context refs
+        fPromiseImages.reset();  // promise images recorded by CreatePromiseImages
+    }
 
 private:
+    void createCallbackContexts(GrDirectContext*);
+    // Reinflate a deflated SKP, replacing all the image indices with promise images.
+    sk_sp<SkPicture> reinflateSKP(sk_sp<GrContextThreadSafeProxy>, SkData* deflatedSKP);
+
     // This is the information extracted into this class from the parsing of the skp file.
     // Once it has all been uploaded to the GPU and distributed to the promise images, it
     // is all dropped via "reset".
@@ -213,8 +213,7 @@ private:
 
     struct DeserialImageProcContext {
         sk_sp<GrContextThreadSafeProxy> fThreadSafeProxy;
-        const DDLPromiseImageHelper*    fHelper;
-        SkTArray<sk_sp<SkImage>>*       fPromiseImages;
+        DDLPromiseImageHelper*          fHelper;  // non-const: fPromiseImages is appended to
     };
 
     static void CreateBETexturesForPromiseImage(GrDirectContext*, PromiseImageInfo*);
@@ -236,7 +235,11 @@ private:
     int findOrDefineImage(SkImage* image);
 
     SkYUVAPixmapInfo::SupportedDataTypes fSupportedYUVADataTypes;
-    SkTArray<PromiseImageInfo> fImageInfo;
+    SkTArray<PromiseImageInfo>           fImageInfo;
+
+    // TODO: review the use of 'fPromiseImages' - it doesn't seem useful/necessary
+    SkTArray<sk_sp<SkImage>>             fPromiseImages;    // All the promise images in the
+                                                            // reconstituted picture
 };
 
 #endif
diff --git a/tools/DDLTileHelper.cpp b/tools/DDLTileHelper.cpp
index ae83ca3a0a..f9a242aa31 100644
--- a/tools/DDLTileHelper.cpp
+++ b/tools/DDLTileHelper.cpp
@@ -259,26 +259,17 @@ DDLTileHelper::DDLTileHelper(GrDirectContext* direct,
     }
 }
 
-void DDLTileHelper::createSKP(sk_sp<GrContextThreadSafeProxy> threadSafeProxy,
-                              SkData* compressedPictureData,
-                              const DDLPromiseImageHelper& helper) {
-    SkASSERT(!fReconstitutedPicture);
-
-    fReconstitutedPicture = helper.reinflateSKP(std::move(threadSafeProxy), compressedPictureData,
-                                                &fPromiseImages);
-}
-
-void DDLTileHelper::createDDLsInParallel() {
+void DDLTileHelper::createDDLsInParallel(SkPicture* picture) {
 #if 1
     SkTaskGroup().batch(this->numTiles(), [&](int i) {
-        fTiles[i].createDDL(fReconstitutedPicture.get());
+        fTiles[i].createDDL(picture);
     });
     SkTaskGroup().add([this]{ this->createComposeDDL(); });
     SkTaskGroup().wait();
 #else
     // Use this code path to debug w/o threads
     for (int i = 0; i < this->numTiles(); ++i) {
-        fTiles[i].createDDL(fReconstitutedPicture.get());
+        fTiles[i].createDDL(picture);
     }
     this->createComposeDDL();
 #endif
@@ -301,7 +292,8 @@ static void do_gpu_stuff(GrDirectContext* direct, DDLTileHelper::TileData* tile)
 // We expect to have more than one recording thread but just one gpu thread
 void DDLTileHelper::kickOffThreadedWork(SkTaskGroup* recordingTaskGroup,
                                         SkTaskGroup* gpuTaskGroup,
-                                        GrDirectContext* dContext) {
+                                        GrDirectContext* dContext,
+                                        SkPicture* picture) {
     SkASSERT(recordingTaskGroup && gpuTaskGroup && dContext);
 
     for (int i = 0; i < this->numTiles(); ++i) {
@@ -315,8 +307,8 @@ void DDLTileHelper::kickOffThreadedWork(SkTaskGroup* recordingTaskGroup,
         //    schedule gpu-thread processing of the DDL
         // Note: a finer grained approach would be add a scheduling task which would evaluate
         //       which DDLs were ready to be rendered based on their prerequisites
-        recordingTaskGroup->add([this, tile, gpuTaskGroup, dContext]() {
-                                    tile->createDDL(fReconstitutedPicture.get());
+        recordingTaskGroup->add([tile, gpuTaskGroup, dContext, picture]() {
+                                    tile->createDDL(picture);
 
                                     gpuTaskGroup->add([dContext, tile]() {
                                         do_gpu_stuff(dContext, tile);
@@ -328,17 +320,17 @@ void DDLTileHelper::kickOffThreadedWork(SkTaskGroup* recordingTaskGroup,
 }
 
 // Only called from skpbench
-void DDLTileHelper::interleaveDDLCreationAndDraw(GrDirectContext* direct) {
+void DDLTileHelper::interleaveDDLCreationAndDraw(GrDirectContext* dContext, SkPicture* picture) {
     for (int i = 0; i < this->numTiles(); ++i) {
-        fTiles[i].createDDL(fReconstitutedPicture.get());
-        fTiles[i].draw(direct);
+        fTiles[i].createDDL(picture);  // create tile i's DDL from 'picture' ...
+        fTiles[i].draw(dContext);      // ... then draw it before starting the next tile
     }
 }
 
 // Only called from skpbench
-void DDLTileHelper::drawAllTilesDirectly(GrDirectContext* context) {
+void DDLTileHelper::drawAllTilesDirectly(GrDirectContext* dContext, SkPicture* picture) {
     for (int i = 0; i < this->numTiles(); ++i) {
-        fTiles[i].drawSKPDirectly(context, fReconstitutedPicture.get());
+        fTiles[i].drawSKPDirectly(dContext, picture);  // same 'picture' for every tile
     }
 }
 
diff --git a/tools/DDLTileHelper.h b/tools/DDLTileHelper.h
index 7b77785d01..3df3150a7e 100644
--- a/tools/DDLTileHelper.h
+++ b/tools/DDLTileHelper.h
@@ -104,17 +104,12 @@ public:
                   int numXDivisions, int numYDivisions,
                   bool addRandomPaddingToDst);
 
-    // TODO: Move this to PromiseImageHelper and have one method that does all the work and
-    // returns the shared SkP.
-    void createSKP(sk_sp<GrContextThreadSafeProxy>,
-                   SkData* compressedPictureData,
-                   const DDLPromiseImageHelper&);
-
     void kickOffThreadedWork(SkTaskGroup* recordingTaskGroup,
                              SkTaskGroup* gpuTaskGroup,
-                             GrDirectContext*);
+                             GrDirectContext*,
+                             SkPicture*);
 
-    void createDDLsInParallel();
+    void createDDLsInParallel(SkPicture*);
 
     // Create the DDL that will compose all the tile images into a final result.
     void createComposeDDL();
@@ -125,11 +120,11 @@ public:
     // DDL creations and draws are interleaved to prevent starvation of the GPU.
     // Note: this is somewhat of a misuse/pessimistic-use of DDLs since they are supposed to
     // be created on a separate thread.
-    void interleaveDDLCreationAndDraw(GrDirectContext*);
+    void interleaveDDLCreationAndDraw(GrDirectContext*, SkPicture*);
 
     // This draws all the per-tile SKPs directly into all of the tiles w/o converting them to
     // DDLs first - all on a single thread.
-    void drawAllTilesDirectly(GrDirectContext*);
+    void drawAllTilesDirectly(GrDirectContext*, SkPicture*);
 
     void dropCallbackContexts();
     void resetAllTiles();
@@ -147,9 +142,6 @@ private:
     sk_sp<SkDeferredDisplayList>           fComposeDDL;
 
     const SkSurfaceCharacterization        fDstCharacterization;
-    sk_sp<SkPicture>                       fReconstitutedPicture;
-    SkTArray<sk_sp<SkImage>>               fPromiseImages; // All the promise images in the
-                                                           // reconstituted picture
 };
 
 #endif
diff --git a/tools/skpbench/skpbench.cpp b/tools/skpbench/skpbench.cpp
index 131e3f3fcd..033bfd94c8 100644
--- a/tools/skpbench/skpbench.cpp
+++ b/tools/skpbench/skpbench.cpp
@@ -207,9 +207,10 @@ private:
     std::vector<SkDocumentPage> fFrames;
 };
 
-static void ddl_sample(GrDirectContext* context, DDLTileHelper* tiles, GpuSync& gpuSync,
+static void ddl_sample(GrDirectContext* dContext, DDLTileHelper* tiles, GpuSync& gpuSync,
                        Sample* sample, SkTaskGroup* recordingTaskGroup, SkTaskGroup* gpuTaskGroup,
-                       std::chrono::high_resolution_clock::time_point* startStopTime) {
+                       std::chrono::high_resolution_clock::time_point* startStopTime,
+                       SkPicture* picture) {
     using clock = std::chrono::high_resolution_clock;
 
     clock::time_point start = *startStopTime;
@@ -221,23 +222,23 @@ static void ddl_sample(GrDirectContext* context, DDLTileHelper* tiles, GpuSync&
         // thread. The interleaving is so that we don't starve the GPU.
         // One unfortunate side effect of this is that we can't delete the DDLs until after
         // the GPU work is flushed.
-        tiles->interleaveDDLCreationAndDraw(context);
+        tiles->interleaveDDLCreationAndDraw(dContext, picture);
     } else if (FLAGS_comparableSKP) {
         // In this mode simply draw the re-inflated per-tile SKPs directly to the GPU w/o going
         // through a DDL.
-        tiles->drawAllTilesDirectly(context);
+        tiles->drawAllTilesDirectly(dContext, picture);
     } else {
-        tiles->kickOffThreadedWork(recordingTaskGroup, gpuTaskGroup, context);
+        tiles->kickOffThreadedWork(recordingTaskGroup, gpuTaskGroup, dContext, picture);
         recordingTaskGroup->wait();
     }
 
     if (gpuTaskGroup) {
         gpuTaskGroup->add([&]{
-            flush_with_sync(context, gpuSync);
+            flush_with_sync(dContext, gpuSync);
         });
         gpuTaskGroup->wait();
     } else {
-        flush_with_sync(context, gpuSync);
+        flush_with_sync(dContext, gpuSync);
     }
 
     *startStopTime = clock::now();
@@ -248,7 +249,7 @@ static void ddl_sample(GrDirectContext* context, DDLTileHelper* tiles, GpuSync&
     }
 }
 
-static void run_ddl_benchmark(sk_gpu_test::TestContext* testContext, GrDirectContext *context,
+static void run_ddl_benchmark(sk_gpu_test::TestContext* testContext, GrDirectContext *dContext,
                               sk_sp<SkSurface> dstSurface, SkPicture* inputPicture,
                               std::vector<Sample>* samples) {
     using clock = std::chrono::high_resolution_clock;
@@ -260,24 +261,20 @@ static void run_ddl_benchmark(sk_gpu_test::TestContext* testContext, GrDirectCon
 
     SkIRect viewport = dstSurface->imageInfo().bounds();
 
-    SkYUVAPixmapInfo::SupportedDataTypes supportedYUVADataTypes(*context);
+    SkYUVAPixmapInfo::SupportedDataTypes supportedYUVADataTypes(*dContext);
     DDLPromiseImageHelper promiseImageHelper(supportedYUVADataTypes);
-    sk_sp<SkData> compressedPictureData = promiseImageHelper.deflateSKP(inputPicture);
-    if (!compressedPictureData) {
+    sk_sp<SkPicture> newSKP = promiseImageHelper.recreateSKP(dContext, inputPicture);
+    if (!newSKP) {
         exitf(ExitErr::kUnavailable, "DDL: conversion of skp failed");
     }
 
-    promiseImageHelper.createCallbackContexts(context);
+    promiseImageHelper.uploadAllToGPU(nullptr, dContext);
 
-    promiseImageHelper.uploadAllToGPU(nullptr, context);
-
-    DDLTileHelper tiles(context, dstCharacterization, viewport,
+    DDLTileHelper tiles(dContext, dstCharacterization, viewport,
                         FLAGS_ddlTilingWidthHeight, FLAGS_ddlTilingWidthHeight,
                         /* addRandomPaddingToDst */ false);
 
-    tiles.createBackendTextures(nullptr, context);
-
-    tiles.createSKP(context->threadSafeProxy(), compressedPictureData.get(), promiseImageHelper);
+    tiles.createBackendTextures(nullptr, dContext);
 
     // In comparable modes, there is no GPU thread. The following pointers are all null.
     // Otherwise, we transfer testContext onto the GPU thread until after the bench.
@@ -297,8 +294,8 @@ static void run_ddl_benchmark(sk_gpu_test::TestContext* testContext, GrDirectCon
     clock::time_point startStopTime = clock::now();
 
     GpuSync gpuSync;
-    ddl_sample(context, &tiles, gpuSync, nullptr, recordingTaskGroup.get(),
-               gpuTaskGroup.get(), &startStopTime);
+    ddl_sample(dContext, &tiles, gpuSync, nullptr, recordingTaskGroup.get(),
+               gpuTaskGroup.get(), &startStopTime, newSKP.get());
 
     clock::duration cumulativeDuration = std::chrono::milliseconds(0);
 
@@ -308,8 +305,8 @@ static void run_ddl_benchmark(sk_gpu_test::TestContext* testContext, GrDirectCon
 
         do {
             tiles.resetAllTiles();
-            ddl_sample(context, &tiles, gpuSync, &sample, recordingTaskGroup.get(),
-                       gpuTaskGroup.get(), &startStopTime);
+            ddl_sample(dContext, &tiles, gpuSync, &sample, recordingTaskGroup.get(),
+                       gpuTaskGroup.get(), &startStopTime, newSKP.get());
         } while (sample.fDuration < sampleDuration);
 
         cumulativeDuration += sample.fDuration;
@@ -334,12 +331,12 @@ static void run_ddl_benchmark(sk_gpu_test::TestContext* testContext, GrDirectCon
 
     // Make sure the gpu has finished all its work before we exit this function and delete the
     // fence.
-    context->flush();
-    context->submit(true);
+    dContext->flush();
+    dContext->submit(true);
 
-    promiseImageHelper.deleteAllFromGPU(nullptr, context);
+    promiseImageHelper.deleteAllFromGPU(nullptr, dContext);
 
-    tiles.deleteBackendTextures(nullptr, context);
+    tiles.deleteBackendTextures(nullptr, dContext);
 
 }