Update skpbench's DDL timing

Chrome is seeing some extra overhead when using DDLs for rasterization. This CL updates skpbench to try to replicate their usage of DDLs (or, at least, better illustrate the overhead of using DDLs).

Bug: skia:9455
Change-Id: I2abc7cf2d597c97d1d7a47425064c621a7ef0eb3
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/275496
Reviewed-by: Greg Daniel <egdaniel@google.com>
Commit-Queue: Robert Phillips <robertphillips@google.com>
This commit is contained in:
Robert Phillips 2020-03-06 12:31:28 -05:00 committed by Skia Commit-Bot
parent beaaf4700f
commit 6f0124ad68
3 changed files with 76 additions and 15 deletions

View File

@ -52,19 +52,19 @@ void DDLTileHelper::TileData::createTileSpecificSKP(SkData* compressedPictureDat
}
void DDLTileHelper::TileData::createDDL() {
SkASSERT(!fDisplayList);
SkASSERT(!fDisplayList && fReconstitutedPicture);
SkDeferredDisplayListRecorder recorder(fCharacterization);
// DDL TODO: the DDLRecorder's GrContext isn't initialized until getCanvas is called.
// Maybe set it up in the ctor?
SkCanvas* subCanvas = recorder.getCanvas();
SkCanvas* recordingCanvas = recorder.getCanvas();
// Because we cheated in createTileSpecificSKP and used the wrong DDLRecorder, the GrContext's
// stored in fReconstitutedPicture's promise images are incorrect. Patch them with the correct
// one now.
for (int i = 0; i < fPromiseImages.count(); ++i) {
GrContext* newContext = subCanvas->getGrContext();
GrContext* newContext = recordingCanvas->getGrContext();
if (fPromiseImages[i]->isTextureBacked()) {
SkImage_GpuBase* gpuImage = (SkImage_GpuBase*) fPromiseImages[i].get();
@ -72,14 +72,12 @@ void DDLTileHelper::TileData::createDDL() {
}
}
subCanvas->clipRect(SkRect::MakeWH(fClip.width(), fClip.height()));
subCanvas->translate(-fClip.fLeft, -fClip.fTop);
recordingCanvas->clipRect(SkRect::MakeWH(fClip.width(), fClip.height()));
recordingCanvas->translate(-fClip.fLeft, -fClip.fTop);
// Note: in this use case we only render a picture to the deferred canvas
// but, more generally, clients will use arbitrary draw calls.
if (fReconstitutedPicture) {
subCanvas->drawPicture(fReconstitutedPicture);
}
recordingCanvas->drawPicture(fReconstitutedPicture);
fDisplayList = recorder.detach();
}
@ -93,6 +91,23 @@ void DDLTileHelper::TileData::precompile(GrContext* context) {
}
}
void DDLTileHelper::TileData::drawSKPDirectly(GrContext* context) {
SkASSERT(!fDisplayList && !fImage && fReconstitutedPicture);
sk_sp<SkSurface> tileSurface = SkSurface::MakeRenderTarget(context, fCharacterization,
SkBudgeted::kYes);
if (tileSurface) {
SkCanvas* tileCanvas = tileSurface->getCanvas();
tileCanvas->clipRect(SkRect::MakeWH(fClip.width(), fClip.height()));
tileCanvas->translate(-fClip.fLeft, -fClip.fTop);
tileCanvas->drawPicture(fReconstitutedPicture);
fImage = tileSurface->makeImageSnapshot();
}
}
void DDLTileHelper::TileData::draw(GrContext* context) {
SkASSERT(fDisplayList && !fImage);
@ -120,6 +135,7 @@ void DDLTileHelper::TileData::compose() {
void DDLTileHelper::TileData::reset() {
// TODO: when DDLs are re-renderable we don't need to do this
fDisplayList = nullptr;
fImage = nullptr;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
@ -164,7 +180,7 @@ void DDLTileHelper::createDDLsInParallel() {
SkTaskGroup().wait();
#else
// Use this code path to debug w/o threads
for (int i = 0; i < fTiles.count(); ++i) {
for (int i = 0; i < this->numTiles(); ++i) {
fTiles[i].createDDL();
}
#endif
@ -217,6 +233,19 @@ void DDLTileHelper::precompileAndDrawAllTiles(GrContext* context) {
}
}
void DDLTileHelper::interleaveDDLCreationAndDraw(GrContext* context) {
for (int i = 0; i < this->numTiles(); ++i) {
fTiles[i].createDDL();
fTiles[i].draw(context);
}
}
void DDLTileHelper::drawAllTilesDirectly(GrContext* context) {
for (int i = 0; i < this->numTiles(); ++i) {
fTiles[i].drawSKPDirectly(context);
}
}
void DDLTileHelper::composeAllTiles() {
for (int i = 0; i < this->numTiles(); ++i) {
fTiles[i].compose();

View File

@ -45,6 +45,11 @@ public:
// Precompile all the programs required to draw this tile's DDL
void precompile(GrContext*);
// Just draw the re-inflated per-tile SKP directly into this tile w/o going through a DDL
// first. This is used for determining the overhead of using DDLs (i.e., it replaces
// a 'createDDL' and 'draw' pair.
void drawSKPDirectly(GrContext*);
// Replay the recorded DDL into the tile surface - creating 'fImage'.
void draw(GrContext*);
@ -89,6 +94,17 @@ public:
void precompileAndDrawAllTiles(GrContext*);
// For each tile, create its DDL and then draw it - all on a single thread. This is to allow
// comparison w/ just drawing the SKP directly (i.e., drawAllTilesDirectly). The
// DDL creations and draws are interleaved to prevent starvation of the GPU.
// Note: this is somewhat of a misuse/pessimistic-use of DDLs since they are supposed to
// be created on a separate thread.
void interleaveDDLCreationAndDraw(GrContext*);
// This draws all the per-tile SKPs directly into all of the tiles w/o converting them to
// DDLs first - all on a single thread.
void drawAllTilesDirectly(GrContext*);
void composeAllTiles();
void resetAllTiles();

View File

@ -63,7 +63,9 @@ static DEFINE_bool(ddl, false, "record the skp into DDLs before rendering");
static DEFINE_int(ddlNumAdditionalThreads, 0,
"number of DDL recording threads in addition to main one");
static DEFINE_int(ddlTilingWidthHeight, 0, "number of tiles along one edge when in DDL mode");
static DEFINE_bool(ddlRecordTime, false, "report just the cpu time spent recording DDLs");
static DEFINE_bool(comparableDDL, false, "render in a way that is comparable to 'comparableSKP'");
static DEFINE_bool(comparableSKP, false, "report in a way that is comparable to 'comparableDDL'");
static DEFINE_int(duration, 5000, "number of milliseconds to run the benchmark");
static DEFINE_int(sampleMs, 50, "minimum duration of a sample");
@ -205,17 +207,28 @@ static void ddl_sample(GrContext* context, DDLTileHelper* tiles, GpuSync& gpuSyn
clock::time_point start = *startStopTime;
tiles->createDDLsInParallel();
if (FLAGS_comparableDDL) {
SkASSERT(!FLAGS_comparableSKP);
if (!FLAGS_ddlRecordTime) {
// In this mode we simply alternate between creating a DDL and drawing it - all on one
// thread. The interleaving is so that we don't starve the GPU.
// One unfortunate side effect of this is that we can't delete the DDLs until after
// the GPU work is flushed.
tiles->interleaveDDLCreationAndDraw(context);
} else if (FLAGS_comparableSKP) {
// In this mode simply draw the re-inflated per-tile SKPs directly to the GPU w/o going
// through a DDL.
tiles->drawAllTilesDirectly(context);
} else {
// TODO: Here we create all the DDLs, wait, and then draw them all. This should be updated
// to use the GPUDDLSink method of having a separate GPU thread.
tiles->createDDLsInParallel();
tiles->precompileAndDrawAllTiles(context);
flush_with_sync(context, gpuSync);
}
flush_with_sync(context, gpuSync);
*startStopTime = clock::now();
tiles->resetAllTiles();
if (sample) {
sample->fDuration += *startStopTime - start;
sample->fFrames++;
@ -261,6 +274,7 @@ static void run_ddl_benchmark(GrContext* context, sk_sp<SkSurface> surface,
Sample& sample = samples->back();
do {
tiles.resetAllTiles();
ddl_sample(context, &tiles, gpuSync, &sample, &startStopTime);
} while (sample.fDuration < sampleDuration);
@ -272,6 +286,8 @@ static void run_ddl_benchmark(GrContext* context, sk_sp<SkSurface> surface,
tiles.composeAllTiles();
}
tiles.resetAllTiles();
// Make sure the gpu has finished all its work before we exit this function and delete the
// fence.
GrFlushInfo flushInfo;