DM: make GPU tasks multithreaded again. Big refactor.

The main meat of things is in SkThreadPool. We can now give SkThreadPool a type for each thread to create and destroy on its local stack. In effect it's thread-local storage (TLS) without going through SkTLS.

I've split the DM tasks into CpuTasks, which run on threads with no TLS, and GpuTasks, which run on threads with a thread-local GrContextFactory. The old CpuTask and GpuTask have been renamed to CpuGMTask and GpuGMTask.

Upshot: default run of out/Debug/dm goes from ~45 seconds to ~20 seconds.

BUG=skia:
R=bsalomon@google.com, mtklein@google.com, reed@google.com
Author: mtklein@chromium.org
Review URL: https://codereview.chromium.org/179233005
git-svn-id: http://skia.googlecode.com/svn/trunk@13632 2bbb7eff-a529-9590-31e7-b0007b416f81
2014-02-28 20:31:31 +00:00 · 2014-02-28 20:31:31 +00:00 · ef57b7e653
commit ef57b7e653
parent d1a7e2ec6f
30 changed files with 388 additions and 359 deletions
--- a/dm/DM.cpp
+++ b/dm/DM.cpp
@ -12,8 +12,8 @@
 #include "gm.h"

 #include "DMBenchTask.h"
-#include "DMCpuTask.h"
-#include "DMGpuTask.h"
+#include "DMCpuGMTask.h"
+#include "DMGpuGMTask.h"
 #include "DMReporter.h"
 #include "DMTask.h"
 #include "DMTaskRunner.h"
@ -28,6 +28,7 @@ using skiatest::Test;
 using skiatest::TestRegistry;

 DEFINE_int32(threads, -1, "Threads for CPU work. Default NUM_CPUS.");
+DEFINE_int32(gpuThreads, 1, "Threads for GPU work.");
 DEFINE_string2(expectations, r, "",
               "If a directory, compare generated images against images under this path. "
               "If a file, compare generated images against JSON expectations at this path.");
@ -86,15 +87,15 @@ static void kick_off_gms(const SkTDArray<GMRegistry::Factory>& gms,
    }
    for (int i = 0; i < gms.count(); i++) {
        for (int j = 0; j < configs.count(); j++) {
-            START("565",      CpuTask, kRGB_565_SkColorType);
-            START("8888",     CpuTask, kPMColor_SkColorType);
-            START("gpu",      GpuTask, native, 0);
-            START("msaa4",    GpuTask, native, 4);
-            START("msaa16",   GpuTask, native, 16);
-            START("gpunull",  GpuTask, null,   0);
-            START("gpudebug", GpuTask, debug,  0);
-            START("angle",    GpuTask, angle,  0);
-            START("mesa",     GpuTask, mesa,   0);
+            START("565",      CpuGMTask, kRGB_565_SkColorType);
+            START("8888",     CpuGMTask, kPMColor_SkColorType);
+            START("gpu",      GpuGMTask, native, 0);
+            START("msaa4",    GpuGMTask, native, 4);
+            START("msaa16",   GpuGMTask, native, 16);
+            START("gpunull",  GpuGMTask, null,   0);
+            START("gpudebug", GpuGMTask, debug,  0);
+            START("angle",    GpuGMTask, angle,  0);
+            START("mesa",     GpuGMTask, mesa,   0);
        }
    }
 #undef START
@ -129,7 +130,12 @@ static void kick_off_tests(const SkTDArray<TestRegistry::Factory>& tests,
                           DM::Reporter* reporter,
                           DM::TaskRunner* tasks) {
    for (int i = 0; i < tests.count(); i++) {
-        tasks->add(SkNEW_ARGS(DM::TestTask, (reporter, tasks, tests[i])));
+        SkAutoTDelete<Test> test(tests[i](NULL));
+        if (test->isGPUTest()) {
+            tasks->add(SkNEW_ARGS(DM::GpuTestTask, (reporter, tasks, tests[i])));
+        } else {
+            tasks->add(SkNEW_ARGS(DM::CpuTestTask, (reporter, tasks, tests[i])));
+        }
    }
 }

@ -201,7 +207,7 @@ int tool_main(int argc, char** argv) {
    SkDebugf("(%d GMs, %d benches) x %d configs, %d tests\n",
             gms.count(), benches.count(), configs.count(), tests.count());
    DM::Reporter reporter;
-    DM::TaskRunner tasks(FLAGS_threads);
+    DM::TaskRunner tasks(FLAGS_threads, FLAGS_gpuThreads);
    kick_off_gms(gms, configs, *expectations, &reporter, &tasks);
    kick_off_benches(benches, configs, &reporter, &tasks);
    kick_off_tests(tests, &reporter, &tasks);
--- a/dm/DMBenchTask.cpp
+++ b/dm/DMBenchTask.cpp
@ -14,7 +14,7 @@ NonRenderingBenchTask::NonRenderingBenchTask(const char* config,
                                             Reporter* reporter,
                                             TaskRunner* tasks,
                                             BenchRegistry::Factory factory)
-    : Task(reporter, tasks)
+    : CpuTask(reporter, tasks)
    , fBench(factory(NULL))
    , fName(bench_name(fBench->getName(), config)) {}

@ -23,7 +23,7 @@ CpuBenchTask::CpuBenchTask(const char* config,
                           TaskRunner* tasks,
                           BenchRegistry::Factory factory,
                           SkColorType colorType)
-    : Task(reporter, tasks)
+    : CpuTask(reporter, tasks)
    , fBench(factory(NULL))
    , fName(bench_name(fBench->getName(), config))
    , fColorType(colorType) {}
@ -34,7 +34,7 @@ GpuBenchTask::GpuBenchTask(const char* config,
                           BenchRegistry::Factory factory,
                           GrContextFactory::GLContextType contextType,
                           int sampleCount)
-    : Task(reporter, tasks)
+    : GpuTask(reporter, tasks)
    , fBench(factory(NULL))
    , fName(bench_name(fBench->getName(), config))
    , fContextType(contextType)
@ -70,13 +70,13 @@ void CpuBenchTask::draw() {
    draw_raster(fBench.get(), fColorType);
 }

-void GpuBenchTask::draw() {
+void GpuBenchTask::draw(GrContextFactory* grFactory) {
    SkImageInfo info = SkImageInfo::Make(fBench->getSize().x(),
                                         fBench->getSize().y(),
                                         kPMColor_SkColorType,
                                         kPremul_SkAlphaType);
    SkAutoTUnref<SkSurface> surface(SkSurface::NewRenderTarget(
-            this->getGrContextFactory()->get(fContextType), info, fSampleCount));
+            grFactory->get(fContextType), info, fSampleCount));

    fBench->preDraw();
    fBench->draw(1, surface->getCanvas());
--- a/dm/DMBenchTask.h
+++ b/dm/DMBenchTask.h
@ -12,12 +12,11 @@

 namespace DM {

-class NonRenderingBenchTask : public Task {
+class NonRenderingBenchTask : public CpuTask {
 public:
    NonRenderingBenchTask(const char* config, Reporter*, TaskRunner*, BenchRegistry::Factory);

    virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
    virtual bool shouldSkip() const SK_OVERRIDE;
    virtual SkString name() const SK_OVERRIDE { return fName; }

@ -26,12 +25,11 @@ private:
    const SkString fName;
 };

-class CpuBenchTask : public Task {
+class CpuBenchTask : public CpuTask {
 public:
    CpuBenchTask(const char* config, Reporter*, TaskRunner*, BenchRegistry::Factory, SkColorType);

    virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
    virtual bool shouldSkip() const SK_OVERRIDE;
    virtual SkString name() const SK_OVERRIDE { return fName; }

@ -41,7 +39,7 @@ private:
    const SkColorType fColorType;
 };

-class GpuBenchTask : public Task {
+class GpuBenchTask : public GpuTask {
 public:
    GpuBenchTask(const char* config,
                 Reporter*,
@ -50,8 +48,7 @@ public:
                 GrContextFactory::GLContextType,
                 int sampleCount);

-    virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return true; }
+    virtual void draw(GrContextFactory*) SK_OVERRIDE;
    virtual bool shouldSkip() const SK_OVERRIDE;
    virtual SkString name() const SK_OVERRIDE { return fName; }

--- a/dm/DMCpuGMTask.cpp
+++ b/dm/DMCpuGMTask.cpp
@ -1,4 +1,4 @@
-#include "DMCpuTask.h"
+#include "DMCpuGMTask.h"
 #include "DMExpectationsTask.h"
 #include "DMPipeTask.h"
 #include "DMReplayTask.h"
@ -9,13 +9,13 @@

 namespace DM {

-CpuTask::CpuTask(const char* config,
-                 Reporter* reporter,
-                 TaskRunner* taskRunner,
-                 const Expectations& expectations,
-                 skiagm::GMRegistry::Factory gmFactory,
-                 SkColorType colorType)
-    : Task(reporter, taskRunner)
+CpuGMTask::CpuGMTask(const char* config,
+                     Reporter* reporter,
+                     TaskRunner* taskRunner,
+                     const Expectations& expectations,
+                     skiagm::GMRegistry::Factory gmFactory,
+                     SkColorType colorType)
+    : CpuTask(reporter, taskRunner)
    , fGMFactory(gmFactory)
    , fGM(fGMFactory(NULL))
    , fName(UnderJoin(fGM->getName(), config))
@ -23,7 +23,7 @@ CpuTask::CpuTask(const char* config,
    , fColorType(colorType)
    {}

-void CpuTask::draw() {
+void CpuGMTask::draw() {
    SkBitmap bitmap;
    SetupBitmap(fColorType, fGM.get(), &bitmap);

@ -47,7 +47,7 @@ void CpuTask::draw() {
 #undef SPAWN
 }

-bool CpuTask::shouldSkip() const {
+bool CpuGMTask::shouldSkip() const {
    if (kRGB_565_SkColorType == fColorType && (fGM->getFlags() & skiagm::GM::kSkip565_Flag)) {
        return true;
    }
--- a/dm/DMCpuGMTask.h
+++ b/dm/DMCpuGMTask.h
@ -1,5 +1,5 @@
-#ifndef DMCpuTask_DEFINED
-#define DMCpuTask_DEFINED
+#ifndef DMCpuGMTask_DEFINED
+#define DMCpuGMTask_DEFINED

 #include "DMExpectations.h"
 #include "DMReporter.h"
@ -15,17 +15,16 @@

 namespace DM {

-class CpuTask : public Task {
+class CpuGMTask : public CpuTask {
 public:
-    CpuTask(const char* config,
-            Reporter*,
-            TaskRunner*,
-            const Expectations&,
-            skiagm::GMRegistry::Factory,
-            SkColorType);
+    CpuGMTask(const char* config,
+              Reporter*,
+              TaskRunner*,
+              const Expectations&,
+              skiagm::GMRegistry::Factory,
+              SkColorType);

    virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
    virtual bool shouldSkip() const SK_OVERRIDE;
    virtual SkString name() const SK_OVERRIDE { return fName; }

@ -39,4 +38,4 @@ private:

 }  // namespace DM

-#endif // DMCpuTask_DEFINED
+#endif // DMCpuGMTask_DEFINED
--- a/dm/DMExpectationsTask.cpp
+++ b/dm/DMExpectationsTask.cpp
@ -6,7 +6,7 @@ namespace DM {
 ExpectationsTask::ExpectationsTask(const Task& parent,
                                   const Expectations& expectations,
                                   SkBitmap bitmap)
-    : Task(parent)
+    : CpuTask(parent)
    , fName(parent.name())  // Masquerade as parent so failures are attributed to it.
    , fExpectations(expectations)
    , fBitmap(bitmap)
--- a/dm/DMExpectationsTask.h
+++ b/dm/DMExpectationsTask.h
@ -10,12 +10,11 @@ namespace DM {

 // ExpectationsTask compares an SkBitmap against some Expectations.
 // Moving this off the GPU threadpool is a nice (~30%) runtime win.
-class ExpectationsTask : public Task {
+class ExpectationsTask : public CpuTask {
 public:
    ExpectationsTask(const Task& parent, const Expectations&, SkBitmap);

    virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
    virtual bool shouldSkip() const SK_OVERRIDE { return false; }
    virtual SkString name() const SK_OVERRIDE { return fName; }

--- a/dm/DMGpuGMTask.cpp
+++ b/dm/DMGpuGMTask.cpp
@ -1,4 +1,4 @@
-#include "DMGpuTask.h"
+#include "DMGpuGMTask.h"

 #include "DMExpectationsTask.h"
 #include "DMUtil.h"
@ -9,14 +9,14 @@

 namespace DM {

-GpuTask::GpuTask(const char* config,
-                 Reporter* reporter,
-                 TaskRunner* taskRunner,
-                 const Expectations& expectations,
-                 skiagm::GMRegistry::Factory gmFactory,
-                 GrContextFactory::GLContextType contextType,
-                 int sampleCount)
-    : Task(reporter, taskRunner)
+GpuGMTask::GpuGMTask(const char* config,
+                     Reporter* reporter,
+                     TaskRunner* taskRunner,
+                     const Expectations& expectations,
+                     skiagm::GMRegistry::Factory gmFactory,
+                     GrContextFactory::GLContextType contextType,
+                     int sampleCount)
+    : GpuTask(reporter, taskRunner)
    , fGM(gmFactory(NULL))
    , fName(UnderJoin(fGM->getName(), config))
    , fExpectations(expectations)
@ -24,13 +24,13 @@ GpuTask::GpuTask(const char* config,
    , fSampleCount(sampleCount)
    {}

-void GpuTask::draw() {
+void GpuGMTask::draw(GrContextFactory* grFactory) {
    SkImageInfo info = SkImageInfo::Make(SkScalarCeilToInt(fGM->width()),
                                         SkScalarCeilToInt(fGM->height()),
                                         kPMColor_SkColorType,
                                         kPremul_SkAlphaType);
    SkAutoTUnref<SkSurface> surface(SkSurface::NewRenderTarget(
-            this->getGrContextFactory()->get(fContextType), info, fSampleCount));
+            grFactory->get(fContextType), info, fSampleCount));
    SkCanvas* canvas = surface->getCanvas();

    canvas->concat(fGM->getInitialTransform());
@ -49,7 +49,7 @@ void GpuTask::draw() {
    this->spawnChild(SkNEW_ARGS(WriteTask, (*this, bitmap)));
 }

-bool GpuTask::shouldSkip() const {
+bool GpuGMTask::shouldSkip() const {
    return SkToBool(fGM->getFlags() & skiagm::GM::kSkipGPU_Flag);
 }

--- a/dm/DMGpuGMTask.h
+++ b/dm/DMGpuGMTask.h
@ -1,5 +1,5 @@
-#ifndef DMGpuTask_DEFINED
-#define DMGpuTask_DEFINED
+#ifndef DMGpuGMTask_DEFINED
+#define DMGpuGMTask_DEFINED

 #include "DMExpectations.h"
 #include "DMReporter.h"
@ -15,18 +15,17 @@

 namespace DM {

-class GpuTask : public Task {
+class GpuGMTask : public GpuTask {
 public:
-    GpuTask(const char* config,
-            Reporter*,
-            TaskRunner*,
-            const Expectations&,
-            skiagm::GMRegistry::Factory,
-            GrContextFactory::GLContextType,
-            int sampleCount);
+    GpuGMTask(const char* config,
+              Reporter*,
+              TaskRunner*,
+              const Expectations&,
+              skiagm::GMRegistry::Factory,
+              GrContextFactory::GLContextType,
+              int sampleCount);

-    virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return true; }
+    virtual void draw(GrContextFactory*) SK_OVERRIDE;
    virtual bool shouldSkip() const SK_OVERRIDE;
    virtual SkString name() const SK_OVERRIDE { return fName; }

@ -40,4 +39,4 @@ private:

 }  // namespace DM

-#endif  // DMGpuTask_DEFINED
+#endif  // DMGpuGMTask_DEFINED
--- a/dm/DMPipeTask.cpp
+++ b/dm/DMPipeTask.cpp
@ -38,7 +38,7 @@ PipeTask::PipeTask(const Task& parent,
                   SkBitmap reference,
                   bool crossProcess,
                   bool sharedAddressSpace)
-    : Task(parent)
+    : CpuTask(parent)
    , fFlags(get_flags(crossProcess, sharedAddressSpace))
    , fName(UnderJoin(parent.name().c_str(), get_name(fFlags)))
    , fGM(gm)
--- a/dm/DMPipeTask.h
+++ b/dm/DMPipeTask.h
@ -11,7 +11,7 @@

 namespace DM {

-class PipeTask : public Task {
+class PipeTask : public CpuTask {

 public:
    PipeTask(const Task& parent,        // PipeTask must be a child task.  Pass its parent here.
@ -21,7 +21,6 @@ public:
             bool sharedAddressSpace);  // If cross process, should it assume shared address space?

    virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
    virtual bool shouldSkip() const SK_OVERRIDE;
    virtual SkString name() const SK_OVERRIDE { return fName; }

--- a/dm/DMReplayTask.cpp
+++ b/dm/DMReplayTask.cpp
@ -14,7 +14,7 @@ ReplayTask::ReplayTask(const Task& parent,
                       skiagm::GM* gm,
                       SkBitmap reference,
                       bool useRTree)
-    : Task(parent)
+    : CpuTask(parent)
    , fName(UnderJoin(parent.name().c_str(), useRTree ? "rtree" : "replay"))
    , fGM(gm)
    , fReference(reference)
--- a/dm/DMReplayTask.h
+++ b/dm/DMReplayTask.h
@ -11,7 +11,7 @@

 namespace DM {

-class ReplayTask : public Task {
+class ReplayTask : public CpuTask {

 public:
    ReplayTask(const Task& parent,  // ReplayTask must be a child task.  Pass its parent here.
@ -20,7 +20,6 @@ public:
               bool useRTree);      // Record with an RTree?

    virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
    virtual bool shouldSkip() const SK_OVERRIDE;
    virtual SkString name() const SK_OVERRIDE { return fName; }

--- a/dm/DMSerializeTask.cpp
+++ b/dm/DMSerializeTask.cpp
@ -13,7 +13,7 @@ namespace DM {
 SerializeTask::SerializeTask(const Task& parent,
                             skiagm::GM* gm,
                             SkBitmap reference)
-    : Task(parent)
+    : CpuTask(parent)
    , fName(UnderJoin(parent.name().c_str(), "serialize"))
    , fGM(gm)
    , fReference(reference)
--- a/dm/DMSerializeTask.h
+++ b/dm/DMSerializeTask.h
@ -11,7 +11,7 @@

 namespace DM {

-class SerializeTask : public Task {
+class SerializeTask : public CpuTask {

 public:
    SerializeTask(const Task& parent,
@ -19,7 +19,6 @@ public:
                  SkBitmap reference);

    virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
    virtual bool shouldSkip() const SK_OVERRIDE;
    virtual SkString name() const SK_OVERRIDE { return fName; }

--- a/dm/DMTask.cpp
+++ b/dm/DMTask.cpp
@ -1,43 +1,22 @@
 #include "DMTask.h"
-
 #include "DMTaskRunner.h"
-#include "DMUtil.h"
-#include "SkBitmap.h"
-#include "SkCommandLineFlags.h"

 namespace DM {

 Task::Task(Reporter* reporter, TaskRunner* taskRunner)
-    : fReporter(reporter), fTaskRunner(taskRunner), fDepth(0) {
+    : fReporter(reporter)
+    , fTaskRunner(taskRunner)
+    , fDepth(0) {
    fReporter->start();
 }

 Task::Task(const Task& parent)
-    : INHERITED(parent)
-    , fReporter(parent.fReporter)
+    : fReporter(parent.fReporter)
    , fTaskRunner(parent.fTaskRunner)
-    , fDepth(parent.depth()+1) {
+    , fDepth(parent.depth() + 1) {
    fReporter->start();
 }

-Task::~Task() {}
-
-void Task::run() {
-    if (!this->shouldSkip()) {
-        this->draw();
-    }
-    fReporter->finish(this->name());
-    delete this;
-}
-
-void Task::spawnChild(Task* task) {
-    if (!task->usesGpu()) {
-        fTaskRunner->add(task);
-    } else {
-        SkDEBUGFAIL("Sorry, we can't spawn GPU tasks. :(  See comment in TaskRunner::wait().");
-    }
-}
-
 void Task::fail(const char* msg) {
    SkString failure(this->name());
    if (msg) {
@ -46,8 +25,35 @@ void Task::fail(const char* msg) {
    fReporter->fail(failure);
 }

-GrContextFactory* Task::getGrContextFactory() const {
-    return fTaskRunner->getGrContextFactory();
+void Task::finish() {
+    fReporter->finish(this->name());
 }

+void Task::spawnChild(CpuTask* task) {
+    fTaskRunner->add(task);
+}
+
+CpuTask::CpuTask(Reporter* reporter, TaskRunner* taskRunner) : Task(reporter, taskRunner) {}
+CpuTask::CpuTask(const Task& parent) : Task(parent) {}
+
+void CpuTask::run() {
+    if (!this->shouldSkip()) {
+        this->draw();
+    }
+    this->finish();
+    SkDELETE(this);
+}
+
+GpuTask::GpuTask(Reporter* reporter, TaskRunner* taskRunner) : Task(reporter, taskRunner) {}
+
+void GpuTask::run(GrContextFactory& factory) {
+    if (!this->shouldSkip()) {
+        this->draw(&factory);
+    }
+    this->finish();
+    SkDELETE(this);
+}
+
+
+
 }  // namespace DM
--- a/dm/DMTask.h
+++ b/dm/DMTask.h
@ -4,28 +4,21 @@
 #include "DMReporter.h"
 #include "GrContextFactory.h"
 #include "SkRunnable.h"
-#include "SkThreadPool.h"

-// DM will run() these tasks on one of two threadpools, depending on the result
-// of usesGpu().  The subclasses can call fail() to mark this task as failed,
-// or make any number of spawnChild() calls to kick off dependent tasks.
+// DM will run() these tasks on one of two threadpools.
+// Subclasses can call fail() to mark this task as failed, or make any number of spawnChild() calls
+// to kick off dependent tasks.
 //
-// Task deletes itself when run.
+// Tasks delete themselves when run.

 namespace DM {

 class TaskRunner;

-class Task : public SkRunnable {
+class CpuTask;
+
+class Task {
 public:
-    Task(Reporter* reporter, TaskRunner* taskRunner);
-    Task(const Task& parent);
-    virtual ~Task();
-
-    void run() SK_OVERRIDE;
-
-    virtual void draw() = 0;
-    virtual bool usesGpu() const = 0;
    virtual bool shouldSkip() const = 0;
    virtual SkString name() const = 0;

@ -34,19 +27,37 @@ public:
    int depth() const { return fDepth; }

 protected:
-    void spawnChild(Task* task);
-    void fail(const char* msg = NULL);
+    Task(Reporter* reporter, TaskRunner* taskRunner);
+    Task(const Task& parent);
+    virtual ~Task() {}

-    // This can only be safely called from a GPU task's draw() method.
-    GrContextFactory* getGrContextFactory() const;
+    void fail(const char* msg = NULL);
+    void finish();
+    void spawnChild(CpuTask* task);  // For now we don't allow GPU child tasks.

 private:
-    // Both unowned.
-    Reporter* fReporter;
-    TaskRunner* fTaskRunner;
+    Reporter* fReporter;      // Unowned.
+    TaskRunner* fTaskRunner;  // Unowned.
    int fDepth;
+};

-    typedef SkRunnable INHERITED;
+class CpuTask : public Task, public SkRunnable {
+public:
+    CpuTask(Reporter* reporter, TaskRunner* taskRunner);
+    CpuTask(const Task& parent);
+    virtual ~CpuTask() {}
+
+    void run() SK_OVERRIDE;
+    virtual void draw() = 0;
+};
+
+class GpuTask : public Task, public SkTRunnable<GrContextFactory> {
+ public:
+    GpuTask(Reporter* reporter, TaskRunner* taskRunner);
+    virtual ~GpuTask() {}
+
+    void run(GrContextFactory&) SK_OVERRIDE;
+    virtual void draw(GrContextFactory*) = 0;
 };

 }  // namespace DM
--- a/dm/DMTaskRunner.cpp
+++ b/dm/DMTaskRunner.cpp
@ -3,48 +3,19 @@

 namespace DM {

+TaskRunner::TaskRunner(int cpuThreads, int gpuThreads) : fCpu(cpuThreads), fGpu(gpuThreads) {}

-TaskRunner::TaskRunner(int cputhreads)
-    : fMain(cputhreads)
-    , fGpu(1) {
-    // Enqueue a task on the GPU thread to create a GrContextFactory.
-    struct Create : public SkRunnable {
-        Create(GrContextFactory** ptr) : fPtr(ptr) {}
-        void run() SK_OVERRIDE {
-            *fPtr = SkNEW(GrContextFactory);
-            delete this;
-        }
-        GrContextFactory** fPtr;
-    };
-    fGpu.add(SkNEW_ARGS(Create, (&fGrContextFactory)));
-}
+void TaskRunner::add(CpuTask* task) { fCpu.add(task); }

-void TaskRunner::add(Task* task) {
-    if (task->usesGpu()) {
-        fGpu.add(task);
-    } else {
-        fMain.add(task);
-    }
-}
+void TaskRunner::add(GpuTask* task) { fGpu.add(task); }

 void TaskRunner::wait() {
-    // Enqueue a task on the GPU thread to destroy the GrContextFactory.
-    struct Delete : public SkRunnable {
-        Delete(GrContextFactory* ptr) : fPtr(ptr) {}
-        void run() SK_OVERRIDE {
-            delete fPtr;
-            delete this;
-        }
-        GrContextFactory* fPtr;
-    };
-    fGpu.add(SkNEW_ARGS(Delete, (fGrContextFactory)));
-
-    // These wait calls block until the threadpool is done.  We don't allow
-    // children to spawn new GPU tasks so we can wait for that first knowing
-    // we'll never try to add to it later.  Same can't be said of fMain: fGpu
-    // and fMain can both add tasks to fMain, so we have to wait for that last.
+    // These wait calls block until each threadpool is done.  We don't allow
+    // spawning new child GPU tasks, so we can wait for that first knowing
+    // we'll never try to add to it later.  Same can't be said of the CPU pool:
+    // both CPU and GPU tasks can spawn off new CPU work, so we wait for that last.
    fGpu.wait();
-    fMain.wait();
+    fCpu.wait();
 }

 }  // namespace DM
--- a/dm/DMTaskRunner.h
+++ b/dm/DMTaskRunner.h
@ -5,26 +5,25 @@
 #include "SkThreadPool.h"
 #include "SkTypes.h"

-// TaskRunner runs Tasks on one of two threadpools depending on the Task's usesGpu() method.  This
-// lets us drive the GPU from a single thread while parallelizing CPU-bound work.
+// TaskRunner runs Tasks on one of two threadpools depending on the need for a GrContextFactory.
+// It's typically a good idea to run fewer GPU threads than CPU threads (go nuts with those).

 namespace DM {

-class Task;
+class CpuTask;
+class GpuTask;

 class TaskRunner : SkNoncopyable {
 public:
-    explicit TaskRunner(int cputhreads);
+    explicit TaskRunner(int cpuThreads, int gpuThreads);

-    void add(Task* task);
+    void add(CpuTask* task);
+    void add(GpuTask* task);
    void wait();

-    // This can only be safely called from a GPU task's draw() method.
-    GrContextFactory* getGrContextFactory() const { return fGrContextFactory; }
-
 private:
-    SkThreadPool fMain, fGpu;
-    GrContextFactory* fGrContextFactory;  // Created and destroyed on fGpu threadpool.
+    SkTThreadPool<void> fCpu;
+    SkTThreadPool<GrContextFactory> fGpu;
 };

 }  // namespace DM
--- a/dm/DMTestTask.cpp
+++ b/dm/DMTestTask.cpp
@ -8,23 +8,32 @@ DEFINE_bool2(pathOpsVerbose,      V, false, "Tell pathOps tests to be verbose.")

 namespace DM {

+bool TestReporter::allowExtendedTest() const { return FLAGS_pathOpsExtended; }
+bool TestReporter::allowThreaded()     const { return !FLAGS_pathOpsSingleThread; }
+bool TestReporter::verbose()           const { return FLAGS_pathOpsVerbose; }
+
 static SkString test_name(const char* name) {
    SkString result("test ");
    result.append(name);
    return result;
 }

-TestTask::TestTask(Reporter* reporter,
-                   TaskRunner* taskRunner,
-                   skiatest::TestRegistry::Factory factory)
-    : Task(reporter, taskRunner)
+CpuTestTask::CpuTestTask(Reporter* reporter,
+                         TaskRunner* taskRunner,
+                         skiatest::TestRegistry::Factory factory)
+    : CpuTask(reporter, taskRunner)
    , fTest(factory(NULL))
    , fName(test_name(fTest->getName())) {}

-void TestTask::draw() {
-    if (this->usesGpu()) {
-        fTest->setGrContextFactory(this->getGrContextFactory());
-    }
+GpuTestTask::GpuTestTask(Reporter* reporter,
+                         TaskRunner* taskRunner,
+                         skiatest::TestRegistry::Factory factory)
+    : GpuTask(reporter, taskRunner)
+    , fTest(factory(NULL))
+    , fName(test_name(fTest->getName())) {}
+
+
+void CpuTestTask::draw() {
    fTest->setReporter(&fTestReporter);
    fTest->run();
    if (!fTest->passed()) {
@ -32,8 +41,13 @@ void TestTask::draw() {
    }
 }

-bool TestTask::TestReporter::allowExtendedTest() const { return FLAGS_pathOpsExtended; }
-bool TestTask::TestReporter::allowThreaded()     const { return !FLAGS_pathOpsSingleThread; }
-bool TestTask::TestReporter::verbose()           const { return FLAGS_pathOpsVerbose; }
+void GpuTestTask::draw(GrContextFactory* grFactory) {
+    fTest->setGrContextFactory(grFactory);
+    fTest->setReporter(&fTestReporter);
+    fTest->run();
+    if (!fTest->passed()) {
+        this->fail(fTestReporter.failure());
+    }
+}

 }  // namespace DM
--- a/dm/DMTestTask.h
+++ b/dm/DMTestTask.h
@ -11,34 +11,47 @@
 // Runs a unit test.
 namespace DM {

-class TestTask : public Task {
+class TestReporter : public skiatest::Reporter {
 public:
-    TestTask(Reporter*, TaskRunner*, skiatest::TestRegistry::Factory);
+  TestReporter() {}
+
+  const char* failure() const { return fFailure.c_str(); }
+
+private:
+  virtual bool allowExtendedTest() const SK_OVERRIDE;
+  virtual bool allowThreaded()     const SK_OVERRIDE;
+  virtual bool verbose()           const SK_OVERRIDE;
+
+  virtual void onReportFailed(const SkString& desc) SK_OVERRIDE {
+      fFailure = desc;
+  }
+
+  SkString fFailure;
+};
+
+class CpuTestTask : public CpuTask {
+public:
+    CpuTestTask(Reporter*, TaskRunner*, skiatest::TestRegistry::Factory);

    virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return fTest->isGPUTest(); }
    virtual bool shouldSkip() const SK_OVERRIDE { return false; }
    virtual SkString name() const SK_OVERRIDE { return fName; }

 private:
-    class TestReporter : public skiatest::Reporter {
-    public:
-      TestReporter() {}
+    TestReporter fTestReporter;
+    SkAutoTDelete<skiatest::Test> fTest;
+    const SkString fName;
+};

-      const char* failure() const { return fFailure.c_str(); }
+class GpuTestTask : public GpuTask {
+public:
+    GpuTestTask(Reporter*, TaskRunner*, skiatest::TestRegistry::Factory);

-    private:
-      virtual bool allowExtendedTest() const SK_OVERRIDE;
-      virtual bool allowThreaded()     const SK_OVERRIDE;
-      virtual bool verbose()           const SK_OVERRIDE;
-
-      virtual void onReportFailed(const SkString& desc) SK_OVERRIDE {
-          fFailure = desc;
-      }
-
-      SkString fFailure;
-    };
+    virtual void draw(GrContextFactory*) SK_OVERRIDE;
+    virtual bool shouldSkip() const SK_OVERRIDE { return false; }
+    virtual SkString name() const SK_OVERRIDE { return fName; }

+private:
    TestReporter fTestReporter;
    SkAutoTDelete<skiatest::Test> fTest;
    const SkString fName;
--- a/dm/DMTileGridTask.cpp
+++ b/dm/DMTileGridTask.cpp
@ -12,7 +12,7 @@ DEFINE_bool(tileGrid, false, "If true, run picture replay tests with a tile grid
 namespace DM {

 TileGridTask::TileGridTask(const Task& parent, skiagm::GM* gm, SkBitmap reference, SkISize tileSize)
-    : Task(parent)
+    : CpuTask(parent)
    , fName(UnderJoin(parent.name().c_str(), "tilegrid"))
    , fGM(gm)
    , fReference(reference)
--- a/dm/DMTileGridTask.h
+++ b/dm/DMTileGridTask.h
@ -11,7 +11,7 @@

 namespace DM {

-class TileGridTask : public Task {
+class TileGridTask : public CpuTask {

 public:
    TileGridTask(const Task& parent,  // TileGridTask must be a child task.  Pass its parent here.
@ -20,7 +20,6 @@ public:
                 SkISize tileSize);   // Tile size to use.

    virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
    virtual bool shouldSkip() const SK_OVERRIDE;
    virtual SkString name() const SK_OVERRIDE { return fName; }

--- a/dm/DMWriteTask.cpp
+++ b/dm/DMWriteTask.cpp
@ -26,7 +26,7 @@ static int split_suffixes(int N, const char* name, SkTArray<SkString>* out) {
    return consumed;
 }

-WriteTask::WriteTask(const Task& parent, SkBitmap bitmap) : Task(parent), fBitmap(bitmap) {
+WriteTask::WriteTask(const Task& parent, SkBitmap bitmap) : CpuTask(parent), fBitmap(bitmap) {
    const int suffixes = parent.depth() + 1;
    const SkString& name = parent.name();
    const int totalSuffixLength = split_suffixes(suffixes, name.c_str(), &fSuffixes);
--- a/dm/DMWriteTask.h
+++ b/dm/DMWriteTask.h
@ -12,14 +12,13 @@

 namespace DM {

-class WriteTask : public Task {
+class WriteTask : public CpuTask {

 public:
    WriteTask(const Task& parent,  // WriteTask must be a child Task.  Pass its parent here.
              SkBitmap bitmap);    // Bitmap to write.

    virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
    virtual bool shouldSkip() const SK_OVERRIDE;
    virtual SkString name() const SK_OVERRIDE;

--- a/gyp/dm.gyp
+++ b/gyp/dm.gyp
@ -29,9 +29,9 @@
        'sources': [
            '../dm/DM.cpp',
            '../dm/DMBenchTask.cpp',
-            '../dm/DMCpuTask.cpp',
+            '../dm/DMCpuGMTask.cpp',
            '../dm/DMExpectationsTask.cpp',
-            '../dm/DMGpuTask.cpp',
+            '../dm/DMGpuGMTask.cpp',
            '../dm/DMPipeTask.cpp',
            '../dm/DMReplayTask.cpp',
            '../dm/DMReporter.cpp',
--- a/gyp/utils.gyp
+++ b/gyp/utils.gyp
@ -30,7 +30,6 @@
        '../include/utils/SkThreadPool.h',
        '../src/utils/SkCondVar.cpp',
        '../src/utils/SkCountdown.cpp',
-        '../src/utils/SkThreadPool.cpp',

        '../include/utils/SkBoundaryPatch.h',
        '../include/utils/SkFrontBufferedStream.h',
@ -227,6 +226,7 @@
      'direct_dependent_settings': {
        'include_dirs': [
          '../include/utils',
+          '../src/utils',
        ],
      },
    },
--- a/include/utils/SkRunnable.h
+++ b/include/utils/SkRunnable.h
@ -8,10 +8,18 @@
 #ifndef SkRunnable_DEFINED
 #define SkRunnable_DEFINED

-class SkRunnable {
-public:
-    virtual ~SkRunnable() {};
+// A unit of work that is handed a mutable reference to a T each time it is run.
+// Subclasses implement run(T&); the destructor is virtual so implementations can be
+// destroyed through a pointer to this interface.
+template <class T>
+struct SkTRunnable {
+    virtual ~SkTRunnable() {}
+    virtual void run(T& state) = 0;
+};
+
+// Specialization for work that takes no argument: run() has an empty parameter list
+// instead of receiving a T&.
+template <>
+struct SkTRunnable<void> {
+    virtual ~SkTRunnable() {};
     virtual void run() = 0;
 };

+// SkRunnable is the argument-less flavor of SkTRunnable (the <void> specialization).
+typedef SkTRunnable<void> SkRunnable;
+
 #endif
--- a/include/utils/SkThreadPool.h
+++ b/include/utils/SkThreadPool.h
@ -12,24 +12,42 @@
 #include "SkRunnable.h"
 #include "SkTDArray.h"
 #include "SkTInternalLList.h"
+#include "SkThreadUtils.h"
+#include "SkTypes.h"

-class SkThread;
+#if defined(SK_BUILD_FOR_UNIX) || defined(SK_BUILD_FOR_MAC) || defined(SK_BUILD_FOR_ANDROID)
+#    include <unistd.h>
+#endif

-class SkThreadPool {
+// Returns the number of cores on this machine, never less than 1.
+//
+// sysconf() returns a long and reports failure as -1, and the Win32 field is an unsigned
+// DWORD, so the platform value is converted explicitly and any non-positive answer is
+// treated as a single core.  That way a "thread per core" pool always gets at least one
+// worker instead of silently ending up with none.
+static inline int num_cores() {
+#if defined(SK_BUILD_FOR_WIN32)
+    SYSTEM_INFO sysinfo;
+    GetSystemInfo(&sysinfo);
+    const int cores = static_cast<int>(sysinfo.dwNumberOfProcessors);
+#elif defined(SK_BUILD_FOR_UNIX) || defined(SK_BUILD_FOR_MAC) || defined(SK_BUILD_FOR_ANDROID)
+    const int cores = static_cast<int>(sysconf(_SC_NPROCESSORS_ONLN));
+#else
+    const int cores = 1;
+#endif
+    return cores > 0 ? cores : 1;
+}

+template <typename T>
+class SkTThreadPool {
 public:
    /**
     * Create a threadpool with count threads, or one thread per core if kThreadPerCore.
     */
    static const int kThreadPerCore = -1;
-    explicit SkThreadPool(int count);
-    ~SkThreadPool();
+    explicit SkTThreadPool(int count);
+    ~SkTThreadPool();

    /**
-     * Queues up an SkRunnable to run when a thread is available, or immediately if
-     * count is 0.  NULL is a safe no-op.  Does not take ownership.
+     * Queues up an SkRunnable to run when a thread is available, or synchronously if count is 0.
+     * Does not take ownership. NULL is a safe no-op.  If T is not void, the runnable will be passed
+     * a reference to a T on the thread's local stack.
     */
-    void add(SkRunnable*);
+    void add(SkTRunnable<T>*);

    /**
     * Block until all added SkRunnables have completed.  Once called, calling add() is undefined.
@ -38,10 +56,7 @@ public:

 private:
    struct LinkedRunnable {
-        // Unowned pointer.
-        SkRunnable* fRunnable;
-
-    private:
+        SkTRunnable<T>* fRunnable;  // Unowned.
        SK_DECLARE_INTERNAL_LLIST_INTERFACE(LinkedRunnable);
    };

@ -60,4 +75,128 @@ public:
    static void Loop(void*);  // Static because we pass in this.
 };

+/**
+ *  Spins up the pool's worker threads.  A negative count (such as kThreadPerCore) asks for
+ *  as many workers as num_cores() reports; a count of 0 creates no workers at all.
+ */
+template <typename T>
+SkTThreadPool<T>::SkTThreadPool(int count) : fState(kRunning_State), fBusyThreads(0) {
+    const int workers = (count < 0) ? num_cores() : count;
+
+    // Every worker runs SkTThreadPool::Loop with this pool as its argument.  Each thread is
+    // recorded in fThreads before it is started.
+    for (int remaining = workers; remaining > 0; remaining--) {
+        SkThread* worker = SkNEW_ARGS(SkThread, (&SkTThreadPool::Loop, this));
+        *fThreads.append() = worker;
+        worker->start();
+    }
+}
+
+// Tears down the pool.  If the pool is still in the running state (wait() has not been
+// called), call wait() now; otherwise there is nothing left to do here.
+template <typename T>
+SkTThreadPool<T>::~SkTThreadPool() {
+    if (fState != kRunning_State) {
+        return;
+    }
+    this->wait();
+}
+
+namespace SkThreadPoolPrivate {
+
+// Holds one T and hands a reference to it to every runnable passed to run().
+template <typename T>
+struct ThreadLocal {
+    void run(SkTRunnable<T>* runnable) {
+        runnable->run(data);
+    }
+
+    T data;
+};
+
+// With T = void there is nothing to hold: the runnable is simply invoked.
+template <>
+struct ThreadLocal<void> {
+    void run(SkTRunnable<void>* runnable) {
+        runnable->run();
+    }
+};
+
+}  // namespace SkThreadPoolPrivate
+
+/**
+ *  Hands a runnable to the pool.  NULL is ignored.  When the pool has no worker threads the
+ *  runnable is executed right here on the calling thread against a temporary ThreadLocal<T>;
+ *  otherwise it is wrapped in a LinkedRunnable, pushed on the head of fQueue, and fReady is
+ *  signalled.
+ */
+template <typename T>
+void SkTThreadPool<T>::add(SkTRunnable<T>* r) {
+    if (NULL == r) {
+        return;
+    }
+
+    // No workers: run synchronously with a ThreadLocal<T> that lives only for this call.
+    if (fThreads.isEmpty()) {
+        SkThreadPoolPrivate::ThreadLocal<T> local;
+        local.run(r);
+        return;
+    }
+
+    // Build the queue node before taking the lock.
+    LinkedRunnable* node = SkNEW(LinkedRunnable);
+    node->fRunnable = r;
+
+    fReady.lock();
+    SkASSERT(kHalting_State != fState);  // Shouldn't be able to add work when we're halting.
+    fQueue.addToHead(node);
+    fReady.signal();
+    fReady.unlock();
+}
+
+
+/**
+ *  Moves the pool into the waiting state, wakes the workers, then joins and deletes every
+ *  worker thread.  On return the queue is expected to be empty.
+ *
+ *  Calling wait() again once the pool has halted is a no-op.  The SkThread objects in
+ *  fThreads were already deleted by the first call, so joining and deleting them a second
+ *  time would touch freed memory.
+ */
+template <typename T>
+void SkTThreadPool<T>::wait() {
+    // Workers only ever write kHalting_State, and only after this function has set
+    // kWaiting_State, so seeing kHalting_State here means a previous wait() already ran.
+    if (kHalting_State == fState) {
+        return;
+    }
+
+    fReady.lock();
+    fState = kWaiting_State;
+    fReady.broadcast();
+    fReady.unlock();
+
+    // Wait for all threads to stop.
+    for (int i = 0; i < fThreads.count(); i++) {
+        fThreads[i]->join();
+        SkDELETE(fThreads[i]);
+    }
+    SkASSERT(fQueue.isEmpty());
+}
+
+/**
+ *  Body of every worker thread.  Repeatedly claims one LinkedRunnable from the pool's queue
+ *  and runs it, sleeping on fReady while the queue is empty.  Returns once the pool has
+ *  reached kHalting_State.
+ *
+ *  @param arg  The owning SkTThreadPool<T>, passed as void* by the constructor.
+ */
+template <typename T>
+/*static*/ void SkTThreadPool<T>::Loop(void* arg) {
+    // The SkTThreadPool passes itself as arg to each thread as they're created.
+    SkTThreadPool<T>* pool = static_cast<SkTThreadPool<T>*>(arg);
+    // One ThreadLocal<T> per call to Loop, declared on this function's stack and reused for
+    // every runnable this worker executes.
+    SkThreadPoolPrivate::ThreadLocal<T> threadLocal;
+
+    while (true) {
+        // We have to be holding the lock to read the queue and to call wait.
+        pool->fReady.lock();
+        while(pool->fQueue.isEmpty()) {
+            // Does the client want to stop and are all the threads ready to stop?
+            // If so, we move into the halting state, and whack all the threads so they notice.
+            if (kWaiting_State == pool->fState && pool->fBusyThreads == 0) {
+                pool->fState = kHalting_State;
+                pool->fReady.broadcast();
+            }
+            // Any time we find ourselves in the halting state, it's quitting time.
+            if (kHalting_State == pool->fState) {
+                pool->fReady.unlock();
+                return;
+            }
+            // wait yields the lock while waiting, but will have it again when awoken.
+            pool->fReady.wait();
+        }
+        // We've got the lock back here, no matter if we ran wait or not.
+
+        // The queue is not empty, so we have something to run.  Claim it.
+        // add() pushes onto the head, so taking the tail picks the oldest queued entry.
+        LinkedRunnable* r = pool->fQueue.tail();
+
+        pool->fQueue.remove(r);
+
+        // Having claimed our SkRunnable, we now give up the lock while we run it.
+        // Otherwise, we'd only ever do work on one thread at a time, which rather
+        // defeats the point of this code.
+        pool->fBusyThreads++;
+        pool->fReady.unlock();
+
+        // OK, now really do the work.
+        // The runnable itself is unowned; only the LinkedRunnable wrapper is deleted here.
+        threadLocal.run(r->fRunnable);
+        SkDELETE(r);
+
+        // Let everyone know we're not busy.
+        // No signal is sent here: this thread loops back to the top and re-checks the
+        // waiting/halting condition itself under the lock.
+        pool->fReady.lock();
+        pool->fBusyThreads--;
+        pool->fReady.unlock();
+    }
+
+    SkASSERT(false); // Unreachable.  The only exit happens when pool->fState is kHalting_State.
+}
+
+// SkThreadPool is the SkTThreadPool<void> instantiation; its add() takes SkTRunnable<void>*.
+typedef SkTThreadPool<void> SkThreadPool;
+
 #endif
--- a/src/utils/SkThreadPool.cpp
+++ b/src/utils/SkThreadPool.cpp
@ -1,127 +0,0 @@
-/*
- * Copyright 2012 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "SkRunnable.h"
-#include "SkThreadPool.h"
-#include "SkThreadUtils.h"
-#include "SkTypes.h"
-
-#if defined(SK_BUILD_FOR_UNIX) || defined(SK_BUILD_FOR_MAC) || defined(SK_BUILD_FOR_ANDROID)
-#include <unistd.h>
-#endif
-
-// Returns the number of cores on this machine.
-static int num_cores() {
-#if defined(SK_BUILD_FOR_WIN32)
-    SYSTEM_INFO sysinfo;
-    GetSystemInfo(&sysinfo);
-    return sysinfo.dwNumberOfProcessors;
-#elif defined(SK_BUILD_FOR_UNIX) || defined(SK_BUILD_FOR_MAC) || defined(SK_BUILD_FOR_ANDROID)
-    return sysconf(_SC_NPROCESSORS_ONLN);
-#else
-    return 1;
-#endif
-}
-
-SkThreadPool::SkThreadPool(int count)
-: fState(kRunning_State), fBusyThreads(0) {
-    if (count < 0) count = num_cores();
-    // Create count threads, all running SkThreadPool::Loop.
-    for (int i = 0; i < count; i++) {
-        SkThread* thread = SkNEW_ARGS(SkThread, (&SkThreadPool::Loop, this));
-        *fThreads.append() = thread;
-        thread->start();
-    }
-}
-
-SkThreadPool::~SkThreadPool() {
-    if (kRunning_State == fState) {
-        this->wait();
-    }
-}
-
-void SkThreadPool::wait() {
-    fReady.lock();
-    fState = kWaiting_State;
-    fReady.broadcast();
-    fReady.unlock();
-
-    // Wait for all threads to stop.
-    for (int i = 0; i < fThreads.count(); i++) {
-        fThreads[i]->join();
-        SkDELETE(fThreads[i]);
-    }
-    SkASSERT(fQueue.isEmpty());
-}
-
-/*static*/ void SkThreadPool::Loop(void* arg) {
-    // The SkThreadPool passes itself as arg to each thread as they're created.
-    SkThreadPool* pool = static_cast<SkThreadPool*>(arg);
-
-    while (true) {
-        // We have to be holding the lock to read the queue and to call wait.
-        pool->fReady.lock();
-        while(pool->fQueue.isEmpty()) {
-            // Does the client want to stop and are all the threads ready to stop?
-            // If so, we move into the halting state, and whack all the threads so they notice.
-            if (kWaiting_State == pool->fState && pool->fBusyThreads == 0) {
-                pool->fState = kHalting_State;
-                pool->fReady.broadcast();
-            }
-            // Any time we find ourselves in the halting state, it's quitting time.
-            if (kHalting_State == pool->fState) {
-                pool->fReady.unlock();
-                return;
-            }
-            // wait yields the lock while waiting, but will have it again when awoken.
-            pool->fReady.wait();
-        }
-        // We've got the lock back here, no matter if we ran wait or not.
-
-        // The queue is not empty, so we have something to run.  Claim it.
-        LinkedRunnable* r = pool->fQueue.tail();
-
-        pool->fQueue.remove(r);
-
-        // Having claimed our SkRunnable, we now give up the lock while we run it.
-        // Otherwise, we'd only ever do work on one thread at a time, which rather
-        // defeats the point of this code.
-        pool->fBusyThreads++;
-        pool->fReady.unlock();
-
-        // OK, now really do the work.
-        r->fRunnable->run();
-        SkDELETE(r);
-
-        // Let everyone know we're not busy.
-        pool->fReady.lock();
-        pool->fBusyThreads--;
-        pool->fReady.unlock();
-    }
-
-    SkASSERT(false); // Unreachable.  The only exit happens when pool->fState is kHalting_State.
-}
-
-void SkThreadPool::add(SkRunnable* r) {
-    if (NULL == r) {
-        return;
-    }
-
-    // If we don't have any threads, obligingly just run the thing now.
-    if (fThreads.isEmpty()) {
-        return r->run();
-    }
-
-    // We have some threads.  Queue it up!
-    fReady.lock();
-    SkASSERT(fState != kHalting_State);  // Shouldn't be able to add work when we're halting.
-    LinkedRunnable* linkedRunnable = SkNEW(LinkedRunnable);
-    linkedRunnable->fRunnable = r;
-    fQueue.addToHead(linkedRunnable);
-    fReady.signal();
-    fReady.unlock();
-}