From 6f0124ad687bcd9c2b23b4ab71849555d3d32881 Mon Sep 17 00:00:00 2001
From: Robert Phillips <robertphillips@google.com>
Date: Fri, 6 Mar 2020 12:31:28 -0500
Subject: [PATCH] Update skpbench's DDL timing

Chrome is seeing some extra overhead when using DDLs for rasterization. This CL updates skpbench to try to replicate their usage of DDLs (or, at least, better illustrate the overhead of using DDLs).

Bug: skia:9455
Change-Id: I2abc7cf2d597c97d1d7a47425064c621a7ef0eb3
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/275496
Reviewed-by: Greg Daniel <egdaniel@google.com>
Commit-Queue: Robert Phillips <robertphillips@google.com>
---
 tools/DDLTileHelper.cpp     | 47 ++++++++++++++++++++++++++++++-------
 tools/DDLTileHelper.h       | 16 +++++++++++++
 tools/skpbench/skpbench.cpp | 28 +++++++++++++++++-----
 3 files changed, 76 insertions(+), 15 deletions(-)
diff --git a/tools/DDLTileHelper.cpp b/tools/DDLTileHelper.cpp
index 7063bc71e2..d5d0045323 100644
--- a/tools/DDLTileHelper.cpp
+++ b/tools/DDLTileHelper.cpp
@@ -52,19 +52,19 @@ void DDLTileHelper::TileData::createTileSpecificSKP(SkData* compressedPictureDat
 }
 
 void DDLTileHelper::TileData::createDDL() {
-    SkASSERT(!fDisplayList);
+    SkASSERT(!fDisplayList && fReconstitutedPicture);
 
     SkDeferredDisplayListRecorder recorder(fCharacterization);
 
     // DDL TODO: the DDLRecorder's GrContext isn't initialized until getCanvas is called.
     // Maybe set it up in the ctor?
-    SkCanvas* subCanvas = recorder.getCanvas();
+    SkCanvas* recordingCanvas = recorder.getCanvas();
 
     // Because we cheated in createTileSpecificSKP and used the wrong DDLRecorder, the GrContext's
     // stored in fReconstitutedPicture's promise images are incorrect. Patch them with the correct
     // one now.
     for (int i = 0; i < fPromiseImages.count(); ++i) {
-        GrContext* newContext = subCanvas->getGrContext();
+        GrContext* newContext = recordingCanvas->getGrContext();
 
         if (fPromiseImages[i]->isTextureBacked()) {
             SkImage_GpuBase* gpuImage = (SkImage_GpuBase*) fPromiseImages[i].get();
@@ -72,14 +72,12 @@ void DDLTileHelper::TileData::createDDL() {
         }
     }
 
-    subCanvas->clipRect(SkRect::MakeWH(fClip.width(), fClip.height()));
-    subCanvas->translate(-fClip.fLeft, -fClip.fTop);
+    recordingCanvas->clipRect(SkRect::MakeWH(fClip.width(), fClip.height()));
+    recordingCanvas->translate(-fClip.fLeft, -fClip.fTop);
 
     // Note: in this use case we only render a picture to the deferred canvas
     // but, more generally, clients will use arbitrary draw calls.
-    if (fReconstitutedPicture) {
-        subCanvas->drawPicture(fReconstitutedPicture);
-    }
+    recordingCanvas->drawPicture(fReconstitutedPicture);
 
     fDisplayList = recorder.detach();
 }
@@ -93,6 +91,23 @@ void DDLTileHelper::TileData::precompile(GrContext* context) {
     }
 }
 
+void DDLTileHelper::TileData::drawSKPDirectly(GrContext* context) {
+    SkASSERT(!fDisplayList && !fImage && fReconstitutedPicture);
+
+    sk_sp<SkSurface> tileSurface = SkSurface::MakeRenderTarget(context, fCharacterization,
+                                                               SkBudgeted::kYes);
+    if (tileSurface) {
+        SkCanvas* tileCanvas = tileSurface->getCanvas();
+
+        tileCanvas->clipRect(SkRect::MakeWH(fClip.width(), fClip.height()));
+        tileCanvas->translate(-fClip.fLeft, -fClip.fTop);
+
+        tileCanvas->drawPicture(fReconstitutedPicture);
+
+        fImage = tileSurface->makeImageSnapshot();
+    }
+}
+
 void DDLTileHelper::TileData::draw(GrContext* context) {
     SkASSERT(fDisplayList && !fImage);
 
@@ -120,6 +135,7 @@ void DDLTileHelper::TileData::compose() {
 void DDLTileHelper::TileData::reset() {
     // TODO: when DDLs are re-renderable we don't need to do this
     fDisplayList = nullptr;
+    fImage = nullptr;
 }
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -164,7 +180,7 @@ void DDLTileHelper::createDDLsInParallel() {
     SkTaskGroup().wait();
 #else
     // Use this code path to debug w/o threads
-    for (int i = 0; i < fTiles.count(); ++i) {
+    for (int i = 0; i < this->numTiles(); ++i) {
         fTiles[i].createDDL();
     }
 #endif
@@ -217,6 +233,19 @@ void DDLTileHelper::precompileAndDrawAllTiles(GrContext* context) {
     }
 }
 
+void DDLTileHelper::interleaveDDLCreationAndDraw(GrContext* context) {
+    for (int i = 0; i < this->numTiles(); ++i) {
+        fTiles[i].createDDL();
+        fTiles[i].draw(context);
+    }
+}
+
+void DDLTileHelper::drawAllTilesDirectly(GrContext* context) {
+    for (int i = 0; i < this->numTiles(); ++i) {
+        fTiles[i].drawSKPDirectly(context);
+    }
+}
+
 void DDLTileHelper::composeAllTiles() {
     for (int i = 0; i < this->numTiles(); ++i) {
         fTiles[i].compose();
diff --git a/tools/DDLTileHelper.h b/tools/DDLTileHelper.h
index f565db810a..1f54490f72 100644
--- a/tools/DDLTileHelper.h
+++ b/tools/DDLTileHelper.h
@@ -45,6 +45,11 @@ public:
         // Precompile all the programs required to draw this tile's DDL
         void precompile(GrContext*);
 
+        // Just draw the re-inflated per-tile SKP directly into this tile w/o going through a DDL
+        // first. This is used for determining the overhead of using DDLs (i.e., it replaces
+        // a 'createDDL' and 'draw' pair.
+        void drawSKPDirectly(GrContext*);
+
         // Replay the recorded DDL into the tile surface - creating 'fImage'.
         void draw(GrContext*);
 
@@ -89,6 +94,17 @@ public:
 
     void precompileAndDrawAllTiles(GrContext*);
 
+    // For each tile, create its DDL and then draw it - all on a single thread. This is to allow
+    // comparison w/ just drawing the SKP directly (i.e., drawAllTilesDirectly). The
+    // DDL creations and draws are interleaved to prevent starvation of the GPU.
+    // Note: this is somewhat of a misuse/pessimistic-use of DDLs since they are supposed to
+    // be created on a separate thread.
+    void interleaveDDLCreationAndDraw(GrContext*);
+
+    // This draws all the per-tile SKPs directly into all of the tiles w/o converting them to
+    // DDLs first - all on a single thread.
+    void drawAllTilesDirectly(GrContext*);
+
     void composeAllTiles();
 
     void resetAllTiles();
diff --git a/tools/skpbench/skpbench.cpp b/tools/skpbench/skpbench.cpp
index 41909db25d..f86ddf6db5 100644
--- a/tools/skpbench/skpbench.cpp
+++ b/tools/skpbench/skpbench.cpp
@@ -63,7 +63,9 @@ static DEFINE_bool(ddl, false, "record the skp into DDLs before rendering");
 static DEFINE_int(ddlNumAdditionalThreads, 0,
                     "number of DDL recording threads in addition to main one");
 static DEFINE_int(ddlTilingWidthHeight, 0, "number of tiles along one edge when in DDL mode");
-static DEFINE_bool(ddlRecordTime, false, "report just the cpu time spent recording DDLs");
+
+static DEFINE_bool(comparableDDL, false, "render in a way that is comparable to 'comparableSKP'");
+static DEFINE_bool(comparableSKP, false, "report in a way that is comparable to 'comparableDDL'");
 
 static DEFINE_int(duration, 5000, "number of milliseconds to run the benchmark");
 static DEFINE_int(sampleMs, 50, "minimum duration of a sample");
@@ -205,17 +207,28 @@ static void ddl_sample(GrContext* context, DDLTileHelper* tiles, GpuSync& gpuSyn
 
     clock::time_point start = *startStopTime;
 
-    tiles->createDDLsInParallel();
+    if (FLAGS_comparableDDL) {
+        SkASSERT(!FLAGS_comparableSKP);
 
-    if (!FLAGS_ddlRecordTime) {
+        // In this mode we simply alternate between creating a DDL and drawing it - all on one
+        // thread. The interleaving is so that we don't starve the GPU.
+        // One unfortunate side effect of this is that we can't delete the DDLs until after
+        // the GPU work is flushed.
+        tiles->interleaveDDLCreationAndDraw(context);
+    } else if (FLAGS_comparableSKP) {
+        // In this mode simply draw the re-inflated per-tile SKPs directly to the GPU w/o going
+        // through a DDL.
+        tiles->drawAllTilesDirectly(context);
+    } else {
+        // TODO: Here we create all the DDLs, wait, and then draw them all. This should be updated
+        // to use the GPUDDLSink method of having a separate GPU thread.
+        tiles->createDDLsInParallel();
         tiles->precompileAndDrawAllTiles(context);
-        flush_with_sync(context, gpuSync);
     }
+    flush_with_sync(context, gpuSync);
 
     *startStopTime = clock::now();
 
-    tiles->resetAllTiles();
-
     if (sample) {
         sample->fDuration += *startStopTime - start;
         sample->fFrames++;
@@ -261,6 +274,7 @@ static void run_ddl_benchmark(GrContext* context, sk_sp<SkSurface> surface,
         Sample& sample = samples->back();
 
         do {
+            tiles.resetAllTiles();
             ddl_sample(context, &tiles, gpuSync, &sample, &startStopTime);
         } while (sample.fDuration < sampleDuration);
 
@@ -272,6 +286,8 @@ static void run_ddl_benchmark(GrContext* context, sk_sp<SkSurface> surface,
         tiles.composeAllTiles();
     }
 
+    tiles.resetAllTiles();
+
     // Make sure the gpu has finished all its work before we exit this function and delete the
     // fence.
     GrFlushInfo flushInfo;