diff --git a/include/core/SkMultiPictureDraw.h b/include/core/SkMultiPictureDraw.h
index 809e2dfe29..8d24e615e8 100644
--- a/include/core/SkMultiPictureDraw.h
+++ b/include/core/SkMultiPictureDraw.h
@@ -66,8 +66,6 @@ private:
         void draw();
 
         static void Reset(SkTDArray<DrawData>&);
-
-        static void Draw(DrawData* d) { d->draw(); }
     };
 
     SkTDArray<DrawData> fThreadSafeDrawData;
diff --git a/samplecode/SamplePathFuzz.cpp b/samplecode/SamplePathFuzz.cpp
index f2595c59ea..55f7f4129d 100644
--- a/samplecode/SamplePathFuzz.cpp
+++ b/samplecode/SamplePathFuzz.cpp
@@ -617,42 +617,31 @@ static bool contains_only_moveTo(const SkPath& path) {
 #include "SkTaskGroup.h"
 #include "SkTDArray.h"
 
-struct ThreadState {
-    int fSeed;
-    const SkBitmap* fBitmap;
-};
-
-static void test_fuzz(ThreadState* data) {
-    FuzzPath fuzzPath;
-    fuzzPath.setStrokeOnly();
-    fuzzPath.setSeed(data->fSeed);
-    fuzzPath.randomize();
-    const SkPath& path = fuzzPath.getPath();
-    const SkPaint& paint = fuzzPath.getPaint();
-    const SkImageInfo& info = data->fBitmap->info();
-    SkCanvas* canvas(SkCanvas::NewRasterDirect(info, data->fBitmap->getPixels(),
-            data->fBitmap->rowBytes()));
-    int w = info.width() / 4;
-    int h = info.height() / 4;
-    int x = data->fSeed / 4 % 4;
-    int y = data->fSeed % 4;
-    SkRect clipBounds = SkRect::MakeXYWH(SkIntToScalar(x) * w, SkIntToScalar(y) * h, 
-        SkIntToScalar(w), SkIntToScalar(h));
-    canvas->save();
-        canvas->clipRect(clipBounds);
-        canvas->translate(SkIntToScalar(x) * w, SkIntToScalar(y) * h);
-        canvas->drawPath(path, paint);
-    canvas->restore();
-}
-
 static void path_fuzz_stroker(SkBitmap* bitmap, int seed) {
-    ThreadState states[100];
-    for (size_t i = 0; i < SK_ARRAY_COUNT(states); i++) {
-        states[i].fSeed   = seed + (int) i;
-        states[i].fBitmap = bitmap;
-    }
-    SkTaskGroup tg;
-    tg.batch(test_fuzz, states, SK_ARRAY_COUNT(states));
+    sk_parallel_for(100, [&](int i) {
+        int localSeed = seed + i;
+
+        FuzzPath fuzzPath;
+        fuzzPath.setStrokeOnly();
+        fuzzPath.setSeed(localSeed);
+        fuzzPath.randomize();
+        const SkPath& path = fuzzPath.getPath();
+        const SkPaint& paint = fuzzPath.getPaint();
+        const SkImageInfo& info = bitmap->info();
+        SkCanvas* canvas(
+            SkCanvas::NewRasterDirect(info, bitmap->getPixels(), bitmap->rowBytes()));
+        int w = info.width() / 4;
+        int h = info.height() / 4;
+        int x = localSeed / 4 % 4;
+        int y = localSeed % 4;
+        SkRect clipBounds = SkRect::MakeXYWH(SkIntToScalar(x) * w, SkIntToScalar(y) * h,
+            SkIntToScalar(w), SkIntToScalar(h));
+        canvas->save();
+            canvas->clipRect(clipBounds);
+            canvas->translate(SkIntToScalar(x) * w, SkIntToScalar(y) * h);
+            canvas->drawPath(path, paint);
+        canvas->restore();
+    });
 }
 
 class PathFuzzView : public SampleView {
@@ -673,7 +662,7 @@ protected:
 
     void onOnceBeforeDraw() override {
         fIndex = 0;
-        SkImageInfo info(SkImageInfo::MakeN32Premul(SkScalarRoundToInt(width()), 
+        SkImageInfo info(SkImageInfo::MakeN32Premul(SkScalarRoundToInt(width()),
                 SkScalarRoundToInt(height())));
         offscreen.allocPixels(info);
         path_fuzz_stroker(&offscreen, fIndex);
diff --git a/src/core/SkMultiPictureDraw.cpp b/src/core/SkMultiPictureDraw.cpp
index 22fd05fb08..286d3b6489 100644
--- a/src/core/SkMultiPictureDraw.cpp
+++ b/src/core/SkMultiPictureDraw.cpp
@@ -91,17 +91,16 @@ void SkMultiPictureDraw::draw(bool flush) {
 
 #ifdef FORCE_SINGLE_THREAD_DRAWING_FOR_TESTING
     for (int i = 0; i < fThreadSafeDrawData.count(); ++i) {
-        DrawData* dd = &fThreadSafeDrawData.begin()[i];
-        dd->fCanvas->drawPicture(dd->fPicture, &dd->fMatrix, dd->fPaint);
+        fThreadSafeDrawData[i].draw();
     }
 #else
-    // we place the taskgroup after the MPDReset, to ensure that we don't delete the DrawData
-    // objects until after we're finished the tasks (which have pointers to the data).
-    SkTaskGroup group;
-    group.batch(DrawData::Draw, fThreadSafeDrawData.begin(), fThreadSafeDrawData.count());
+    sk_parallel_for(fThreadSafeDrawData.count(), [&](int i) {
+        fThreadSafeDrawData[i].draw();
+    });
 #endif
-    // we deliberately don't call wait() here, since the destructor will do that, this allows us
-    // to continue processing gpu-data without having to wait on the cpu tasks.
+
+    // N.B. we could get going on any GPU work from this main thread while the CPU work runs.
+    // But in practice, we've either got GPU work or CPU work, not both.
 
     const int count = fGPUDrawData.count();
     if (0 == count) {
diff --git a/src/core/SkTaskGroup.cpp b/src/core/SkTaskGroup.cpp
index 59319c148d..4504defb1b 100644
--- a/src/core/SkTaskGroup.cpp
+++ b/src/core/SkTaskGroup.cpp
@@ -5,6 +5,7 @@
  * found in the LICENSE file.
  */
 
+#include "SkOnce.h"
 #include "SkRunnable.h"
 #include "SkSemaphore.h"
 #include "SkSpinlock.h"
@@ -13,18 +14,27 @@
 #include "SkThreadUtils.h"
 
 #if defined(SK_BUILD_FOR_WIN32)
-    static inline int num_cores() {
+    static void query_num_cores(int* num_cores) {
         SYSTEM_INFO sysinfo;
         GetSystemInfo(&sysinfo);
-        return sysinfo.dwNumberOfProcessors;
+        *num_cores = sysinfo.dwNumberOfProcessors;
     }
 #else
     #include <unistd.h>
-    static inline int num_cores() {
-        return (int) sysconf(_SC_NPROCESSORS_ONLN);
+    static void query_num_cores(int* num_cores) {
+        *num_cores = (int)sysconf(_SC_NPROCESSORS_ONLN);
     }
 #endif
 
+// We cache sk_num_cores() so we only query the OS once.
+SK_DECLARE_STATIC_ONCE(g_query_num_cores_once);
+int sk_num_cores() {
+    static int num_cores = 0;
+    SkOnce(&g_query_num_cores_once, query_num_cores, &num_cores);
+    SkASSERT(num_cores > 0);
+    return num_cores;
+}
+
 namespace {
 
 class ThreadPool : SkNoncopyable {
@@ -98,7 +108,7 @@ private:
 
     explicit ThreadPool(int threads) {
         if (threads == -1) {
-            threads = num_cores();
+            threads = sk_num_cores();
         }
         for (int i = 0; i < threads; i++) {
             fThreads.push(SkNEW_ARGS(SkThread, (&ThreadPool::Loop, this)));
diff --git a/src/core/SkTaskGroup.h b/src/core/SkTaskGroup.h
index 8c7369da25..3af64d7753 100644
--- a/src/core/SkTaskGroup.h
+++ b/src/core/SkTaskGroup.h
@@ -10,6 +10,7 @@
 
 #include "SkTypes.h"
 #include "SkAtomics.h"
+#include "SkTemplates.h"
 
 struct SkRunnable;
 
@@ -49,4 +50,42 @@ private:
     SkAtomic<int32_t> fPending;
 };
 
+// Returns best estimate of number of CPU cores available to use.
+int sk_num_cores();
+
+// Call f(i) for i in [0, end).
+template <typename Func>
+void sk_parallel_for(int end, const Func& f) {
+    if (end <= 0) { return; }
+
+    struct Chunk {
+        const Func* f;
+        int start, end;
+    };
+
+    // TODO(mtklein): this chunking strategy could probably use some tuning.
+    int max_chunks  = sk_num_cores() * 2,
+        stride      = (end + max_chunks - 1 ) / max_chunks,
+        nchunks     = (end + stride - 1 ) / stride;
+    SkASSERT(nchunks <= max_chunks);
+
+    // With the chunking strategy above this won't malloc until we have a machine with >512 cores.
+    SkAutoSTMalloc<1024, Chunk> chunks(nchunks);
+
+    for (int i = 0; i < nchunks; i++) {
+        Chunk& c = chunks[i];
+        c.f     = &f;
+        c.start = i * stride;
+        c.end   = SkTMin(c.start + stride, end);
+        SkASSERT(c.start < c.end);  // Nothing will break if start >= end, but it's a wasted chunk.
+    }
+
+    void(*run_chunk)(Chunk*) = [](Chunk* c) {
+        for (int i = c->start; i < c->end; i++) {
+            (*c->f)(i);
+        }
+    };
+    SkTaskGroup().batch(run_chunk, chunks.get(), nchunks);
+}
+
 #endif//SkTaskGroup_DEFINED
diff --git a/tests/LazyPtrTest.cpp b/tests/LazyPtrTest.cpp
index 1b845bc502..89443f9a80 100644
--- a/tests/LazyPtrTest.cpp
+++ b/tests/LazyPtrTest.cpp
@@ -1,3 +1,10 @@
+/*
+ * Copyright 2014 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
 #include "Test.h"
 #include "SkLazyPtr.h"
 #include "SkRunnable.h"
@@ -44,37 +51,20 @@ DEF_TEST(LazyPtr, r) {
     SkDELETE(ptr);
 }
 
-namespace {
-
-struct Racer : public SkRunnable {
-    Racer() : fLazy(NULL), fSeen(NULL) {}
-
-    void run() override { fSeen = fLazy->get(); }
-
-    SkLazyPtr<int>* fLazy;
-    int* fSeen;
-};
-
-} // namespace
-
 DEF_TEST(LazyPtr_Threaded, r) {
     static const int kRacers = 321;
 
+    // Race to intialize the pointer by calling .get().
     SkLazyPtr<int> lazy;
+    int* seen[kRacers];
 
-    Racer racers[kRacers];
-    for (int i = 0; i < kRacers; i++) {
-        racers[i].fLazy = &lazy;
-    }
-
-    SkTaskGroup tg;
-    for (int i = 0; i < kRacers; i++) {
-        tg.add(racers + i);
-    }
-    tg.wait();
+    sk_parallel_for(kRacers, [&](int i) {
+        seen[i] = lazy.get();
+    });
 
+    // lazy.get() should return the same pointer to all threads.
     for (int i = 1; i < kRacers; i++) {
-        REPORTER_ASSERT(r, racers[i].fSeen);
-        REPORTER_ASSERT(r, racers[i].fSeen == racers[0].fSeen);
+        REPORTER_ASSERT(r, seen[i] != nullptr);
+        REPORTER_ASSERT(r, seen[i] == seen[0]);
     }
 }
diff --git a/tests/OnceTest.cpp b/tests/OnceTest.cpp
index 034c5d912f..35c2015166 100644
--- a/tests/OnceTest.cpp
+++ b/tests/OnceTest.cpp
@@ -28,42 +28,14 @@ DEF_TEST(SkOnce_Singlethreaded, r) {
     REPORTER_ASSERT(r, 5 == x);
 }
 
-static void add_six(int* x) {
-    *x += 6;
-}
-
-namespace {
-
-class Racer : public SkRunnable {
-public:
-    SkOnceFlag* once;
-    int* ptr;
-
-    void run() override {
-        SkOnce(once, add_six, ptr);
-    }
-};
-
-}  // namespace
-
 SK_DECLARE_STATIC_ONCE(mt_once);
 DEF_TEST(SkOnce_Multithreaded, r) {
-    const int kTasks = 16;
-
-    // Make a bunch of tasks that will race to be the first to add six to x.
-    Racer racers[kTasks];
     int x = 0;
-    for (int i = 0; i < kTasks; i++) {
-        racers[i].once = &mt_once;
-        racers[i].ptr = &x;
-    }
-
-    // Let them race.
-    SkTaskGroup tg;
-    for (int i = 0; i < kTasks; i++) {
-        tg.add(&racers[i]);
-    }
-    tg.wait();
+    // Run a bunch of tasks to be the first to add six to x.
+    sk_parallel_for(1021, [&](int) {
+        void(*add_six)(int*) = [](int* p) { *p += 6; };
+        SkOnce(&mt_once, add_six, &x);
+    });
 
     // Only one should have done the +=.
     REPORTER_ASSERT(r, 6 == x);
diff --git a/tests/PathOpsExtendedTest.cpp b/tests/PathOpsExtendedTest.cpp
index a8079cf0a9..df80f4673e 100644
--- a/tests/PathOpsExtendedTest.cpp
+++ b/tests/PathOpsExtendedTest.cpp
@@ -14,7 +14,6 @@
 #include "SkPaint.h"
 #include "SkRTConf.h"
 #include "SkStream.h"
-#include "SkTaskGroup.h"
 #include "SkThread.h"
 
 #ifdef SK_BUILD_FOR_MAC
diff --git a/tests/PathOpsSkpClipTest.cpp b/tests/PathOpsSkpClipTest.cpp
index 1f69ff05e1..7ea3184442 100755
--- a/tests/PathOpsSkpClipTest.cpp
+++ b/tests/PathOpsSkpClipTest.cpp
@@ -303,10 +303,11 @@ TestRunner::~TestRunner() {
 }
 
 void TestRunner::render() {
-    SkTaskGroup tg;
-    for (int index = 0; index < fRunnables.count(); ++ index) {
-        tg.add(fRunnables[index]);
-    }
+    // TODO: this doesn't really need to use SkRunnables any more.
+    // We can just write the code to run in the for-loop directly.
+    sk_parallel_for(fRunnables.count(), [&](int i) {
+        fRunnables[i]->run();
+    });
 }
 
 ////////////////////////////////////////////////
diff --git a/tests/PathOpsThreadedCommon.cpp b/tests/PathOpsThreadedCommon.cpp
index 0adde915e0..10501d139c 100644
--- a/tests/PathOpsThreadedCommon.cpp
+++ b/tests/PathOpsThreadedCommon.cpp
@@ -16,8 +16,7 @@ PathOpsThreadedTestRunner::~PathOpsThreadedTestRunner() {
 }
 
 void PathOpsThreadedTestRunner::render() {
-    SkTaskGroup tg;
-    for (int index = 0; index < fRunnables.count(); ++ index) {
-        tg.add(fRunnables[index]);
-    }
+    sk_parallel_for(fRunnables.count(), [&](int i) {
+        fRunnables[i]->run();
+    });
 }
diff --git a/tests/SkpSkGrTest.cpp b/tests/SkpSkGrTest.cpp
index 2c3c5b7fbf..212b0f6abd 100644
--- a/tests/SkpSkGrTest.cpp
+++ b/tests/SkpSkGrTest.cpp
@@ -1,6 +1,9 @@
-#if !SK_SUPPORT_GPU
-#error "GPU support required"
-#endif
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
 
 #include "GrContext.h"
 #include "GrContextFactory.h"
@@ -27,6 +30,10 @@
 #include "SkTime.h"
 #include "Test.h"
 
+#if !SK_SUPPORT_GPU
+#error "GPU support required"
+#endif
+
 #ifdef SK_BUILD_FOR_WIN
     #define PATH_SLASH "\\"
     #define IN_DIR "D:\\9-30-13\\"
@@ -162,10 +169,11 @@ SkpSkGrThreadedTestRunner::~SkpSkGrThreadedTestRunner() {
 }
 
 void SkpSkGrThreadedTestRunner::render() {
-    SkTaskGroup tg;
-    for (int index = 0; index < fRunnables.count(); ++ index) {
-        tg.add(fRunnables[index]);
-    }
+    // TODO: we don't really need to be using SkRunnables here anymore.
+    // We can just write the code we'd run right in the for loop.
+    sk_parallel_for(fRunnables.count(), [&](int i) {
+        fRunnables[i]->run();
+    });
 }
 
 ////////////////////////////////////////////////
diff --git a/tools/skpdiff/SkDiffContext.cpp b/tools/skpdiff/SkDiffContext.cpp
index c422636f6e..d14546046c 100644
--- a/tools/skpdiff/SkDiffContext.cpp
+++ b/tools/skpdiff/SkDiffContext.cpp
@@ -209,26 +209,6 @@ void SkDiffContext::addDiff(const char* baselinePath, const char* testPath) {
     }
 }
 
-class SkThreadedDiff : public SkRunnable {
-public:
-    SkThreadedDiff() : fDiffContext(NULL) { }
-
-    void setup(SkDiffContext* diffContext, const SkString& baselinePath, const SkString& testPath) {
-        fDiffContext = diffContext;
-        fBaselinePath = baselinePath;
-        fTestPath = testPath;
-    }
-
-    void run() override {
-        fDiffContext->addDiff(fBaselinePath.c_str(), fTestPath.c_str());
-    }
-
-private:
-    SkDiffContext* fDiffContext;
-    SkString fBaselinePath;
-    SkString fTestPath;
-};
-
 void SkDiffContext::diffDirectories(const char baselinePath[], const char testPath[]) {
     // Get the files in the baseline, we will then look for those inside the test path
     SkTArray<SkString> baselineEntries;
@@ -237,12 +217,8 @@ void SkDiffContext::diffDirectories(const char baselinePath[], const char testPa
         return;
     }
 
-    SkTaskGroup tg;
-    SkTArray<SkThreadedDiff> runnableDiffs;
-    runnableDiffs.reset(baselineEntries.count());
-
-    for (int x = 0; x < baselineEntries.count(); x++) {
-        const char* baseFilename = baselineEntries[x].c_str();
+    sk_parallel_for(baselineEntries.count(), [&](int i) {
+        const char* baseFilename = baselineEntries[i].c_str();
 
         // Find the real location of each file to compare
         SkString baselineFile = SkOSPath::Join(baselinePath, baseFilename);
@@ -250,13 +226,11 @@ void SkDiffContext::diffDirectories(const char baselinePath[], const char testPa
 
         // Check that the test file exists and is a file
         if (sk_exists(testFile.c_str()) && !sk_isdir(testFile.c_str())) {
-            // Queue up the comparison with the differ
-            runnableDiffs[x].setup(this, baselineFile, testFile);
-            tg.add(&runnableDiffs[x]);
+            this->addDiff(baselineFile.c_str(), testFile.c_str());
         } else {
             SkDebugf("Baseline file \"%s\" has no corresponding test file\n", baselineFile.c_str());
         }
-    }
+    });
 }
 
 
@@ -281,15 +255,9 @@ void SkDiffContext::diffPatterns(const char baselinePattern[], const char testPa
         return;
     }
 
-    SkTaskGroup tg;
-    SkTArray<SkThreadedDiff> runnableDiffs;
-    runnableDiffs.reset(baselineEntries.count());
-
-    for (int x = 0; x < baselineEntries.count(); x++) {
-        runnableDiffs[x].setup(this, baselineEntries[x], testEntries[x]);
-        tg.add(&runnableDiffs[x]);
-    }
-    tg.wait();
+    sk_parallel_for(baselineEntries.count(), [&](int i) {
+        this->addDiff(baselineEntries[i].c_str(), testEntries[i].c_str());
+    });
 }
 
 void SkDiffContext::outputRecords(SkWStream& stream, bool useJSONP) {