convert over to 2d-mode

[√] convert all stages to use SkJumper_MemoryCtx / be 2d-compatible
 [√] convert compile to 2d also, remove 1d run/compile
 [√] convert all call sites
 [√] no diffs

Change-Id: I3b806eb8fe0c3ec043359616409f7cd1211a1e43
Reviewed-on: https://skia-review.googlesource.com/24263
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Florin Malita <fmalita@chromium.org>
This commit is contained in:
Mike Klein 2017-07-18 18:15:13 -04:00 committed by Skia Commit-Bot
parent 3f4671871f
commit 45c16fa82c
25 changed files with 15945 additions and 15716 deletions

View File

@ -8,6 +8,7 @@
#include "Benchmark.h"
#include "SkOpts.h"
#include "SkRasterPipeline.h"
#include "../src/jumper/SkJumper.h"
static const int N = 15;
@ -35,9 +36,9 @@ public:
}
void onDraw(int loops, SkCanvas*) override {
void* mask_ctx = mask;
void* src_ctx = src;
void* dst_ctx = dst;
SkJumper_MemoryCtx mask_ctx = {mask, 0},
src_ctx = {src, 0},
dst_ctx = {dst, 0};
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::load_8888, &src_ctx);
@ -59,7 +60,7 @@ public:
}
while (loops --> 0) {
p.run(0,0,N);
p.run(0,0,N,1);
}
}
};
@ -76,8 +77,8 @@ public:
}
void onDraw(int loops, SkCanvas*) override {
void* src_ctx = src;
void* dst_ctx = dst;
SkJumper_MemoryCtx src_ctx = {src, 0},
dst_ctx = {dst, 0};
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::load_8888, &dst_ctx);
@ -89,11 +90,11 @@ public:
if (fCompile) {
auto fn = p.compile();
while (loops --> 0) {
fn(0,0,N);
fn(0,0,N,1);
}
} else {
while (loops --> 0) {
p.run(0,0,N);
p.run(0,0,N,1);
}
}
}
@ -132,7 +133,7 @@ public:
p.append(SkRasterPipeline::parametric_b, & to_2dot2);
while (loops --> 0) {
p.run(0,0,N);
p.run(0,0,N,1);
}
}
};
@ -150,7 +151,7 @@ public:
p.append(SkRasterPipeline::to_srgb);
while (loops --> 0) {
p.run(0,0,N);
p.run(0,0,N,1);
}
}
};

View File

@ -44,6 +44,7 @@
#include "SkTLogic.h"
#include <cmath>
#include <functional>
#include "../src/jumper/SkJumper.h"
#if defined(SK_BUILD_FOR_WIN)
#include "SkAutoCoInitialize.h"
@ -324,15 +325,14 @@ static void premultiply_if_necessary(SkBitmap& bitmap) {
}
switch (bitmap.colorType()) {
case kRGBA_F16_SkColorType:
for (int y = 0; y < bitmap.height(); y++) {
void* row = bitmap.getAddr(0, y);
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::load_f16, &row);
p.append(SkRasterPipeline::premul);
p.append(SkRasterPipeline::store_f16, &row);
p.run(0,y, bitmap.width());
}
case kRGBA_F16_SkColorType: {
SkJumper_MemoryCtx ctx = { bitmap.getAddr(0,0), bitmap.rowBytesAsPixels() };
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::load_f16, &ctx);
p.append(SkRasterPipeline::premul);
p.append(SkRasterPipeline::store_f16, &ctx);
p.run(0,0, bitmap.width(), bitmap.height());
}
break;
case kN32_SkColorType:
for (int y = 0; y < bitmap.height(); y++) {
@ -1020,7 +1020,8 @@ void clamp_if_necessary(const SkBitmap& bitmap, SkColorType dstCT) {
return;
}
void* ptr = bitmap.getAddr(0, 0);
SkJumper_MemoryCtx ptr = { bitmap.getAddr(0,0), bitmap.rowBytesAsPixels() };
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::load_f16, &ptr);
p.append(SkRasterPipeline::clamp_0);
@ -1031,11 +1032,7 @@ void clamp_if_necessary(const SkBitmap& bitmap, SkColorType dstCT) {
}
p.append(SkRasterPipeline::store_f16, &ptr);
auto run = p.compile();
for (int y = 0; y < bitmap.height(); y++) {
run(0, y, bitmap.width());
ptr = SkTAddOffset<void>(ptr, bitmap.rowBytes());
}
p.run(0,0, bitmap.width(), bitmap.height());
}
Error ColorCodecSrc::draw(SkCanvas* canvas) const {

View File

@ -38,6 +38,7 @@
#include "SkRasterPipeline.h"
#include "SkStream.h"
#include "SkSwizzler.h"
#include "../jumper/SkJumper.h"
#include <algorithm>
@ -520,16 +521,19 @@ bool SkGifCodec::haveDecodedRow(int frameIndex, const unsigned char* rowBegin,
SkRasterPipeline::StockStage storeDst;
void* src = SkTAddOffset<void>(fTmpBuffer.get(), offsetBytes);
void* dst = SkTAddOffset<void>(dstLine, offsetBytes);
SkJumper_MemoryCtx src_ctx = { src, 0 },
dst_ctx = { dst, 0 };
switch (dstInfo.colorType()) {
case kBGRA_8888_SkColorType:
case kRGBA_8888_SkColorType:
p.append(SkRasterPipeline::load_8888_dst, &dst);
p.append(SkRasterPipeline::load_8888, &src);
p.append(SkRasterPipeline::load_8888_dst, &dst_ctx);
p.append(SkRasterPipeline::load_8888, &src_ctx);
storeDst = SkRasterPipeline::store_8888;
break;
case kRGBA_F16_SkColorType:
p.append(SkRasterPipeline::load_f16_dst, &dst);
p.append(SkRasterPipeline::load_f16, &src);
p.append(SkRasterPipeline::load_f16_dst, &dst_ctx);
p.append(SkRasterPipeline::load_f16, &src_ctx);
storeDst = SkRasterPipeline::store_f16;
break;
default:
@ -539,7 +543,7 @@ bool SkGifCodec::haveDecodedRow(int frameIndex, const unsigned char* rowBegin,
}
p.append(SkRasterPipeline::srcover);
p.append(storeDst, &dst);
p.run(0, 0, fSwizzler->swizzleWidth());
p.run(0,0, fSwizzler->swizzleWidth(),1);
}
// Tell the frame to copy the row data if need be.

View File

@ -16,6 +16,7 @@
#include "SkStreamPriv.h"
#include "SkTemplates.h"
#include "SkWebpCodec.h"
#include "../jumper/SkJumper.h"
// A WebP decoder on top of (subset of) libwebp
// For more information on WebP image format, and libwebp library, see:
@ -351,7 +352,7 @@ static void pick_memory_stages(SkColorType ct, SkRasterPipeline::StockStage* loa
}
static void blend_line(SkColorType dstCT, void* dst,
SkColorType srcCT, void* src,
SkColorType srcCT, const void* src,
bool needsSrgbToLinear, SkAlphaType at,
int width) {
// Setup conversion from the source and dest, which will be the same.
@ -364,19 +365,22 @@ static void blend_line(SkColorType dstCT, void* dst,
convert_to_linear_premul.append(SkRasterPipeline::premul);
}
SkJumper_MemoryCtx dst_ctx = { (void*)dst, 0 },
src_ctx = { (void*)src, 0 };
SkRasterPipeline_<256> p;
SkRasterPipeline::StockStage load_dst, store_dst;
pick_memory_stages(dstCT, &load_dst, &store_dst);
// Load the final dst.
p.append(load_dst, dst);
p.append(load_dst, &dst_ctx);
p.extend(convert_to_linear_premul);
p.append(SkRasterPipeline::move_src_dst);
// Load the src.
SkRasterPipeline::StockStage load_src;
pick_memory_stages(srcCT, &load_src, nullptr);
p.append(load_src, src);
p.append(load_src, &src_ctx);
p.extend(convert_to_linear_premul);
p.append(SkRasterPipeline::srcover);
@ -388,9 +392,9 @@ static void blend_line(SkColorType dstCT, void* dst,
if (needsSrgbToLinear) {
p.append(SkRasterPipeline::to_srgb);
}
p.append(store_dst, dst);
p.append(store_dst, &dst_ctx);
p.run(0,0, width);
p.run(0,0, width,1);
}
SkCodec::Result SkWebpCodec::onGetPixels(const SkImageInfo& dstInfo, void* dst, size_t rowBytes,
@ -601,7 +605,7 @@ SkCodec::Result SkWebpCodec::onGetPixels(const SkImageInfo& dstInfo, void* dst,
for (int y = 0; y < rowsDecoded; y++) {
this->applyColorXform(xformDst, xformSrc, scaledWidth, xformAlphaType);
if (blendWithPrevFrame) {
blend_line(dstCT, &dst, dstCT, &xformDst, needsSrgbToLinear, xformAlphaType,
blend_line(dstCT, dst, dstCT, xformDst, needsSrgbToLinear, xformAlphaType,
scaledWidth);
dst = SkTAddOffset<void>(dst, rowBytes);
} else {
@ -613,7 +617,7 @@ SkCodec::Result SkWebpCodec::onGetPixels(const SkImageInfo& dstInfo, void* dst,
const uint8_t* src = config.output.u.RGBA.rgba;
for (int y = 0; y < rowsDecoded; y++) {
blend_line(dstCT, &dst, webpDst.colorType(), &src, needsSrgbToLinear,
blend_line(dstCT, dst, webpDst.colorType(), src, needsSrgbToLinear,
xformAlphaType, scaledWidth);
src = SkTAddOffset<const uint8_t>(src, srcRowBytes);
dst = SkTAddOffset<void>(dst, rowBytes);

View File

@ -7,6 +7,7 @@
#include "SkBlendModePriv.h"
#include "SkRasterPipeline.h"
#include "../jumper/SkJumper.h"
bool SkBlendMode_SupportsCoverageAsAlpha(SkBlendMode mode) {
switch (mode) {
@ -122,16 +123,16 @@ SkPM4f SkBlendMode_Apply(SkBlendMode mode, const SkPM4f& src, const SkPM4f& dst)
SkRasterPipeline_<256> p;
SkPM4f src_storage = src,
dst_storage = dst,
result_storage,
*src_ctx = &src_storage,
*dst_ctx = &dst_storage,
*res_ctx = &result_storage;
res_storage;
SkJumper_MemoryCtx src_ctx = { &src_storage, 0 },
dst_ctx = { &dst_storage, 0 },
res_ctx = { &res_storage, 0 };
p.append(SkRasterPipeline::load_f32, &dst_ctx);
p.append(SkRasterPipeline::move_src_dst);
p.append(SkRasterPipeline::load_f32, &src_ctx);
SkBlendMode_AppendStages(mode, &p);
p.append(SkRasterPipeline::store_f32, &res_ctx);
p.run(0, 0, 1);
return result_storage;
p.run(0,0, 1,1);
return res_storage;
}

View File

@ -12,6 +12,7 @@
#include "SkPM4fPriv.h"
#include "SkRasterPipeline.h"
#include "SkSpriteBlitter.h"
#include "../jumper/SkJumper.h"
SkSpriteBlitter::SkSpriteBlitter(const SkPixmap& source)
: fSource(source) {}
@ -101,7 +102,7 @@ public:
: INHERITED(src)
, fAlloc(alloc)
, fBlitter(nullptr)
, fSrcPtr(nullptr)
, fSrcPtr{nullptr, 0}
{}
void setup(const SkPixmap& dst, int left, int top, const SkPaint& paint) override {
@ -141,23 +142,20 @@ public:
}
void blitRect(int x, int y, int width, int height) override {
fSrcPtr = (const char*)fSource.addr(x-fLeft,y-fTop);
int bpp = fSource.info().bytesPerPixel();
// Our pipeline will load from fSrcPtr+x, fSrcPtr+x+1, etc.,
// so we back up an extra x pixels to start at 0.
fSrcPtr -= fSource.info().bytesPerPixel() * x;
fSrcPtr.stride = fSource.rowBytesAsPixels();
fSrcPtr.pixels = (char*)fSource.addr(x-fLeft, y-fTop) - bpp * x
- bpp * y * fSrcPtr.stride;
while (height --> 0) {
fBlitter->blitH(x,y++, width);
fSrcPtr += fSource.rowBytes();
}
fBlitter->blitRect(x,y,width,height);
}
private:
SkArenaAlloc* fAlloc;
SkBlitter* fBlitter;
const char* fSrcPtr;
SkColor4f fPaintColor;
SkArenaAlloc* fAlloc;
SkBlitter* fBlitter;
SkJumper_MemoryCtx fSrcPtr;
SkColor4f fPaintColor;
typedef SkSpriteBlitter INHERITED;
};

View File

@ -71,9 +71,9 @@ SkColor4f SkColorFilter::filterColor4f(const SkColor4f& c) const {
pipeline.append_uniform_color(&alloc, src);
this->onAppendStages(&pipeline, nullptr, &alloc, c.fA == 1);
SkPM4f* dstPtr = &dst;
SkJumper_MemoryCtx dstPtr = { &dst, 0 };
pipeline.append(SkRasterPipeline::store_f32, &dstPtr);
pipeline.run(0,0, 1);
pipeline.run(0,0, 1,1);
return dst.unpremul();
}

View File

@ -18,6 +18,7 @@
#include "SkPM4fPriv.h"
#include "SkRasterPipeline.h"
#include "SkSRGB.h"
#include "../jumper/SkJumper.h"
static constexpr float sk_linear_from_2dot2[256] = {
0.000000000000000000f, 0.000005077051900662f, 0.000023328004666099f, 0.000056921765712193f,
@ -1162,11 +1163,14 @@ bool SkColorSpaceXform_XYZ<kCSM>
const void* src, int len, SkAlphaType alphaType) const {
SkRasterPipeline_<256> pipeline;
SkJumper_MemoryCtx src_ctx = { (void*)src, 0 },
dst_ctx = { (void*)dst, 0 };
LoadTablesContext loadTables;
switch (srcColorFormat) {
case kRGBA_8888_ColorFormat:
if (kLinear_SrcGamma == fSrcGamma) {
pipeline.append(SkRasterPipeline::load_8888, &src);
pipeline.append(SkRasterPipeline::load_8888, &src_ctx);
} else {
loadTables.fSrc = src;
loadTables.fR = fSrcGammaTables[0];
@ -1178,7 +1182,7 @@ bool SkColorSpaceXform_XYZ<kCSM>
break;
case kBGRA_8888_ColorFormat:
if (kLinear_SrcGamma == fSrcGamma) {
pipeline.append(SkRasterPipeline::load_bgra, &src);
pipeline.append(SkRasterPipeline::load_bgra, &src_ctx);
} else {
loadTables.fSrc = src;
loadTables.fR = fSrcGammaTables[2];
@ -1193,21 +1197,21 @@ bool SkColorSpaceXform_XYZ<kCSM>
if (kLinear_SrcGamma != fSrcGamma) {
return false;
}
pipeline.append(SkRasterPipeline::load_f16, &src);
pipeline.append(SkRasterPipeline::load_f16, &src_ctx);
break;
case kRGBA_F32_ColorFormat:
if (kLinear_SrcGamma != fSrcGamma) {
return false;
}
pipeline.append(SkRasterPipeline::load_f32, &src);
pipeline.append(SkRasterPipeline::load_f32, &src_ctx);
break;
case kRGBA_U16_BE_ColorFormat:
switch (fSrcGamma) {
case kLinear_SrcGamma:
pipeline.append(SkRasterPipeline::load_u16_be, &src);
pipeline.append(SkRasterPipeline::load_u16_be, &src_ctx);
break;
case kSRGB_SrcGamma:
pipeline.append(SkRasterPipeline::load_u16_be, &src);
pipeline.append(SkRasterPipeline::load_u16_be, &src_ctx);
pipeline.append_from_srgb(kUnpremul_SkAlphaType);
break;
case kTable_SrcGamma:
@ -1222,10 +1226,10 @@ bool SkColorSpaceXform_XYZ<kCSM>
case kRGB_U16_BE_ColorFormat:
switch (fSrcGamma) {
case kLinear_SrcGamma:
pipeline.append(SkRasterPipeline::load_rgb_u16_be, &src);
pipeline.append(SkRasterPipeline::load_rgb_u16_be, &src_ctx);
break;
case kSRGB_SrcGamma:
pipeline.append(SkRasterPipeline::load_rgb_u16_be, &src);
pipeline.append(SkRasterPipeline::load_rgb_u16_be, &src_ctx);
pipeline.append_from_srgb(kUnpremul_SkAlphaType);
break;
case kTable_SrcGamma:
@ -1290,34 +1294,34 @@ bool SkColorSpaceXform_XYZ<kCSM>
switch (dstColorFormat) {
case kRGBA_8888_ColorFormat:
pipeline.append(SkRasterPipeline::store_8888, &dst);
pipeline.append(SkRasterPipeline::store_8888, &dst_ctx);
break;
case kBGRA_8888_ColorFormat:
pipeline.append(SkRasterPipeline::store_bgra, &dst);
pipeline.append(SkRasterPipeline::store_bgra, &dst_ctx);
break;
case kRGBA_F16_ColorFormat:
if (kLinear_DstGamma != fDstGamma) {
return false;
}
pipeline.append(SkRasterPipeline::store_f16, &dst);
pipeline.append(SkRasterPipeline::store_f16, &dst_ctx);
break;
case kRGBA_F32_ColorFormat:
if (kLinear_DstGamma != fDstGamma) {
return false;
}
pipeline.append(SkRasterPipeline::store_f32, &dst);
pipeline.append(SkRasterPipeline::store_f32, &dst_ctx);
break;
case kBGR_565_ColorFormat:
if (kOpaque_SkAlphaType != alphaType) {
return false;
}
pipeline.append(SkRasterPipeline::store_565, &dst);
pipeline.append(SkRasterPipeline::store_565, &dst_ctx);
break;
default:
return false;
}
pipeline.run(0,0, len);
pipeline.run(0,0, len,1);
return true;
}

View File

@ -21,18 +21,22 @@
bool SkColorSpaceXform_A2B::onApply(ColorFormat dstFormat, void* dst, ColorFormat srcFormat,
const void* src, int count, SkAlphaType alphaType) const {
SkRasterPipeline_<256> pipeline;
SkJumper_MemoryCtx src_ctx = { (void*)src, 0 },
dst_ctx = { (void*)dst, 0 };
switch (srcFormat) {
case kBGRA_8888_ColorFormat:
pipeline.append(SkRasterPipeline::load_bgra, &src);
pipeline.append(SkRasterPipeline::load_bgra, &src_ctx);
break;
case kRGBA_8888_ColorFormat:
pipeline.append(SkRasterPipeline::load_8888, &src);
pipeline.append(SkRasterPipeline::load_8888, &src_ctx);
break;
case kRGBA_U16_BE_ColorFormat:
pipeline.append(SkRasterPipeline::load_u16_be, &src);
pipeline.append(SkRasterPipeline::load_u16_be, &src_ctx);
break;
case kRGB_U16_BE_ColorFormat:
pipeline.append(SkRasterPipeline::load_rgb_u16_be, &src);
pipeline.append(SkRasterPipeline::load_rgb_u16_be, &src_ctx);
break;
default:
SkCSXformPrintf("F16/F32 sources must be linear.\n");
@ -47,33 +51,33 @@ bool SkColorSpaceXform_A2B::onApply(ColorFormat dstFormat, void* dst, ColorForma
switch (dstFormat) {
case kBGRA_8888_ColorFormat:
pipeline.append(SkRasterPipeline::store_bgra, &dst);
pipeline.append(SkRasterPipeline::store_bgra, &dst_ctx);
break;
case kRGBA_8888_ColorFormat:
pipeline.append(SkRasterPipeline::store_8888, &dst);
pipeline.append(SkRasterPipeline::store_8888, &dst_ctx);
break;
case kRGBA_F16_ColorFormat:
if (!fLinearDstGamma) {
return false;
}
pipeline.append(SkRasterPipeline::store_f16, &dst);
pipeline.append(SkRasterPipeline::store_f16, &dst_ctx);
break;
case kRGBA_F32_ColorFormat:
if (!fLinearDstGamma) {
return false;
}
pipeline.append(SkRasterPipeline::store_f32, &dst);
pipeline.append(SkRasterPipeline::store_f32, &dst_ctx);
break;
case kBGR_565_ColorFormat:
if (kOpaque_SkAlphaType != alphaType) {
return false;
}
pipeline.append(SkRasterPipeline::store_565, &dst);
pipeline.append(SkRasterPipeline::store_565, &dst_ctx);
break;
default:
return false;
}
pipeline.run(0,0, count);
pipeline.run(0,0, count,1);
return true;
}

View File

@ -264,25 +264,29 @@ static void convert_to_alpha8(uint8_t* dst, size_t dstRB, const SkImageInfo& src
static void convert_with_pipeline(const SkImageInfo& dstInfo, void* dstRow, size_t dstRB,
const SkImageInfo& srcInfo, const void* srcRow, size_t srcRB,
bool isColorAware, SkTransferFunctionBehavior behavior) {
SkJumper_MemoryCtx src = { (void*)srcRow, (int)(srcRB / srcInfo.bytesPerPixel()) },
dst = { (void*)dstRow, (int)(dstRB / dstInfo.bytesPerPixel()) };
SkRasterPipeline_<256> pipeline;
switch (srcInfo.colorType()) {
case kRGBA_8888_SkColorType:
pipeline.append(SkRasterPipeline::load_8888, &srcRow);
pipeline.append(SkRasterPipeline::load_8888, &src);
break;
case kBGRA_8888_SkColorType:
pipeline.append(SkRasterPipeline::load_bgra, &srcRow);
pipeline.append(SkRasterPipeline::load_bgra, &src);
break;
case kRGB_565_SkColorType:
pipeline.append(SkRasterPipeline::load_565, &srcRow);
pipeline.append(SkRasterPipeline::load_565, &src);
break;
case kRGBA_F16_SkColorType:
pipeline.append(SkRasterPipeline::load_f16, &srcRow);
pipeline.append(SkRasterPipeline::load_f16, &src);
break;
case kGray_8_SkColorType:
pipeline.append(SkRasterPipeline::load_g8, &srcRow);
pipeline.append(SkRasterPipeline::load_g8, &src);
break;
case kARGB_4444_SkColorType:
pipeline.append(SkRasterPipeline::load_4444, &srcRow);
pipeline.append(SkRasterPipeline::load_4444, &src);
break;
default:
SkASSERT(false);
@ -359,33 +363,26 @@ static void convert_with_pipeline(const SkImageInfo& dstInfo, void* dstRow, size
switch (dstInfo.colorType()) {
case kRGBA_8888_SkColorType:
pipeline.append(SkRasterPipeline::store_8888, &dstRow);
pipeline.append(SkRasterPipeline::store_8888, &dst);
break;
case kBGRA_8888_SkColorType:
pipeline.append(SkRasterPipeline::store_bgra, &dstRow);
pipeline.append(SkRasterPipeline::store_bgra, &dst);
break;
case kRGB_565_SkColorType:
pipeline.append(SkRasterPipeline::store_565, &dstRow);
pipeline.append(SkRasterPipeline::store_565, &dst);
break;
case kRGBA_F16_SkColorType:
pipeline.append(SkRasterPipeline::store_f16, &dstRow);
pipeline.append(SkRasterPipeline::store_f16, &dst);
break;
case kARGB_4444_SkColorType:
pipeline.append(SkRasterPipeline::store_4444, &dstRow);
pipeline.append(SkRasterPipeline::store_4444, &dst);
break;
default:
SkASSERT(false);
break;
}
auto run = pipeline.compile();
for (int y = 0; y < srcInfo.height(); ++y) {
run(0,y, srcInfo.width());
// The pipeline has pointers to srcRow and dstRow, so we just need to update them in the
// loop to move between rows of src/dst.
dstRow = SkTAddOffset<void>(dstRow, dstRB);
srcRow = SkTAddOffset<const void>(srcRow, srcRB);
}
pipeline.run(0,0, srcInfo.width(), srcInfo.height());
}
void SkConvertPixels(const SkImageInfo& dstInfo, void* dstPixels, size_t dstRB,

View File

@ -15,6 +15,7 @@
#include "SkPM4f.h"
#include "SkRasterPipeline.h"
#include "SkSRGB.h"
#include "../jumper/SkJumper.h"
static inline Sk4f set_alpha(const Sk4f& px, float alpha) {
return { px[0], px[1], px[2], alpha };
@ -148,16 +149,16 @@ static inline void append_gamut_transform(SkRasterPipeline* p, SkArenaAlloc* scr
static inline SkColor4f to_colorspace(const SkColor4f& c, SkColorSpace* src, SkColorSpace* dst) {
SkColor4f color4f = c;
if (src && dst) {
void* color4f_ptr = &color4f;
SkJumper_MemoryCtx color4f_ptr = { &color4f, 0 };
float scratch_matrix_3x4[12];
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::uniform_color, color4f_ptr);
p.append(SkRasterPipeline::uniform_color, &color4f);
append_gamut_transform(&p, scratch_matrix_3x4, src, dst, kUnpremul_SkAlphaType);
p.append(SkRasterPipeline::store_f32, &color4f_ptr);
p.run(0,0,1);
p.run(0,0,1,1);
}
return color4f;
}

View File

@ -91,8 +91,7 @@ struct SkPM4f;
M(xy_to_2pt_conical_linear) \
M(mask_2pt_conical_degenerates) M(apply_vector_mask) \
M(byte_tables) M(byte_tables_rgb) \
M(rgb_to_hsl) M(hsl_to_rgb) \
M(store_8888_2d)
M(rgb_to_hsl) M(hsl_to_rgb)
class SkRasterPipeline {
public:
@ -117,14 +116,11 @@ public:
// Append all stages to this pipeline.
void extend(const SkRasterPipeline&);
// Runs the pipeline walking x through [x,x+n).
void run(size_t x, size_t y, size_t n) const;
// Runs the pipeline in 2d from (x,y) inclusive to (x+w,y+h) exclusive.
void run_2d(size_t x, size_t y, size_t w, size_t h) const;
void run(size_t x, size_t y, size_t w, size_t h) const;
// Allocates a thunk which amortizes run() setup cost in alloc.
std::function<void(size_t, size_t, size_t)> compile() const;
std::function<void(size_t, size_t, size_t, size_t)> compile() const;
void dump() const;

View File

@ -55,7 +55,7 @@ private:
void append_store (SkRasterPipeline*) const;
// If we have an burst context, use it to fill our shader buffer.
void maybe_shade(int x, int y, int w);
void burst_shade(int x, int y, int w);
SkPixmap fDst;
SkBlendMode fBlend;
@ -63,23 +63,24 @@ private:
SkShaderBase::Context* fBurstCtx;
SkRasterPipeline fColorPipeline;
SkJumper_MemoryCtx fShaderOutput = {nullptr,0}, // Possibly updated each call to burst_shade().
fDstPtr = {nullptr,0}, // Always points to the top-left of fDst.
fMaskPtr = {nullptr,0}; // Updated each call to blitMask().
// We may be able to specialize blitH() into a memset.
bool fCanMemsetInBlitH = false;
uint64_t fMemsetColor = 0; // Big enough for largest dst format, F16.
// Built lazily on first use.
std::function<void(size_t, size_t, size_t)> fBlitH,
fBlitAntiH,
fBlitMaskA8,
fBlitMaskLCD16;
std::function<void(size_t, size_t, size_t, size_t)> fBlitH,
fBlitAntiH,
fBlitMaskA8,
fBlitMaskLCD16;
// These values are pointed to by the blit pipelines above,
// which allows us to adjust them from call to call.
void* fShaderOutput = nullptr;
void* fDstPtr = nullptr;
const void* fMaskPtr = nullptr;
float fCurrentCoverage = 0.0f;
float fDitherRate = 0.0f;
float fCurrentCoverage = 0.0f;
float fDitherRate = 0.0f;
std::vector<SkPM4f> fShaderBuffer;
@ -191,14 +192,14 @@ SkBlitter* SkRasterPipelineBlitter::Create(const SkPixmap& dst,
// A pipeline that's still constant here can collapse back into a constant color.
if (is_constant) {
SkPM4f storage;
SkPM4f* constantColor = &storage;
colorPipeline->append(SkRasterPipeline::store_f32, &constantColor);
colorPipeline->run(0,0,1);
SkPM4f constantColor;
SkJumper_MemoryCtx constantColorPtr = { &constantColor, 0 };
colorPipeline->append(SkRasterPipeline::store_f32, &constantColorPtr);
colorPipeline->run(0,0,1,1);
colorPipeline->reset();
colorPipeline->append_uniform_color(alloc, *constantColor);
colorPipeline->append_uniform_color(alloc, constantColor);
is_opaque = constantColor->a() == 1.0f;
is_opaque = constantColor.a() == 1.0f;
}
// We can strength-reduce SrcOver into Src when opaque.
@ -213,13 +214,18 @@ SkBlitter* SkRasterPipelineBlitter::Create(const SkPixmap& dst,
// Not all blits can memset, so we need to keep colorPipeline too.
SkRasterPipeline_<256> p;
p.extend(*colorPipeline);
blitter->fDstPtr = &blitter->fMemsetColor;
blitter->fDstPtr = SkJumper_MemoryCtx{&blitter->fMemsetColor, 0};
blitter->append_store(&p);
p.run(0,0,1);
p.run(0,0,1,1);
blitter->fCanMemsetInBlitH = true;
}
blitter->fDstPtr = SkJumper_MemoryCtx{
blitter->fDst.writable_addr(),
blitter->fDst.rowBytesAsPixels(),
};
return blitter;
}
@ -275,26 +281,23 @@ void SkRasterPipelineBlitter::maybe_clamp(SkRasterPipeline* p) const {
SkBlendMode_AppendClampIfNeeded(fBlend, p);
}
void SkRasterPipelineBlitter::maybe_shade(int x, int y, int w) {
if (fBurstCtx) {
if (w > SkToInt(fShaderBuffer.size())) {
fShaderBuffer.resize(w);
}
fBurstCtx->shadeSpan4f(x,y, fShaderBuffer.data(), w);
// We'll be reading from fShaderOutput + x.
fShaderOutput = fShaderBuffer.data() - x;
void SkRasterPipelineBlitter::burst_shade(int x, int y, int w) {
SkASSERT(fBurstCtx);
if (w > SkToInt(fShaderBuffer.size())) {
fShaderBuffer.resize(w);
}
fBurstCtx->shadeSpan4f(x,y, fShaderBuffer.data(), w);
// We'll be reading from fShaderOutput.pixels + x, so back up by x.
fShaderOutput = SkJumper_MemoryCtx{ fShaderBuffer.data() - x, 0 };
}
void SkRasterPipelineBlitter::blitH(int x, int y, int w) {
fDstPtr = fDst.writable_addr(0,y);
if (fCanMemsetInBlitH) {
switch (fDst.shiftPerPixel()) {
case 0: memset ((uint8_t *)fDstPtr + x, fMemsetColor, w); return;
case 1: sk_memset16((uint16_t*)fDstPtr + x, fMemsetColor, w); return;
case 2: sk_memset32((uint32_t*)fDstPtr + x, fMemsetColor, w); return;
case 3: sk_memset64((uint64_t*)fDstPtr + x, fMemsetColor, w); return;
case 0: memset (fDst.writable_addr8 (x,y), fMemsetColor, w); return;
case 1: sk_memset16(fDst.writable_addr16(x,y), fMemsetColor, w); return;
case 2: sk_memset32(fDst.writable_addr32(x,y), fMemsetColor, w); return;
case 3: sk_memset64(fDst.writable_addr64(x,y), fMemsetColor, w); return;
default: break;
}
}
@ -318,8 +321,10 @@ void SkRasterPipelineBlitter::blitH(int x, int y, int w) {
}
fBlitH = p.compile();
}
this->maybe_shade(x,y,w);
fBlitH(x,y,w);
if (fBurstCtx) {
this->burst_shade(x,y,w);
}
fBlitH(x,y,w,1);
}
void SkRasterPipelineBlitter::blitAntiH(int x, int y, const SkAlpha aa[], const int16_t runs[]) {
@ -340,15 +345,16 @@ void SkRasterPipelineBlitter::blitAntiH(int x, int y, const SkAlpha aa[], const
fBlitAntiH = p.compile();
}
fDstPtr = fDst.writable_addr(0,y);
for (int16_t run = *runs; run > 0; run = *runs) {
switch (*aa) {
case 0x00: break;
case 0xff: this->blitH(x,y,run); break;
default:
this->maybe_shade(x,y,run);
fCurrentCoverage = *aa * (1/255.0f);
fBlitAntiH(x,y,run);
if (fBurstCtx) {
this->burst_shade(x,y,run);
}
fBlitAntiH(x,y,run,1);
}
x += run;
runs += run;
@ -375,6 +381,7 @@ void SkRasterPipelineBlitter::blitMask(const SkMask& mask, const SkIRect& clip)
return INHERITED::blitMask(mask, clip);
}
// Lazily build whichever pipeline we need, specialized for each mask format.
if (mask.fFormat == SkMask::kA8_Format && !fBlitMaskA8) {
SkRasterPipeline p(fAlloc);
p.extend(fColorPipeline);
@ -391,7 +398,6 @@ void SkRasterPipelineBlitter::blitMask(const SkMask& mask, const SkIRect& clip)
this->append_store(&p);
fBlitMaskA8 = p.compile();
}
if (mask.fFormat == SkMask::kLCD16_Format && !fBlitMaskLCD16) {
SkRasterPipeline p(fAlloc);
p.extend(fColorPipeline);
@ -403,23 +409,35 @@ void SkRasterPipelineBlitter::blitMask(const SkMask& mask, const SkIRect& clip)
fBlitMaskLCD16 = p.compile();
}
int x = clip.left();
for (int y = clip.top(); y < clip.bottom(); y++) {
fDstPtr = fDst.writable_addr(0,y);
std::function<void(size_t,size_t,size_t,size_t)>* blitter = nullptr;
// Update fMaskPtr to point "into" this current mask, but lined up with fDstPtr at (0,0).
switch (mask.fFormat) {
case SkMask::kA8_Format:
fMaskPtr.stride = mask.fRowBytes;
fMaskPtr.pixels = (uint8_t*)mask.fImage - mask.fBounds.left()
- mask.fBounds.top() * fMaskPtr.stride;
blitter = &fBlitMaskA8;
break;
case SkMask::kLCD16_Format:
fMaskPtr.stride = mask.fRowBytes / 2;
fMaskPtr.pixels = (uint16_t*)mask.fImage - mask.fBounds.left()
- mask.fBounds.top() * fMaskPtr.stride;
blitter = &fBlitMaskLCD16;
break;
default:
return;
}
this->maybe_shade(x,y,clip.width());
switch (mask.fFormat) {
case SkMask::kA8_Format:
fMaskPtr = mask.getAddr8(x,y)-x;
fBlitMaskA8(x,y,clip.width());
break;
case SkMask::kLCD16_Format:
fMaskPtr = mask.getAddrLCD16(x,y)-x;
fBlitMaskLCD16(x,y,clip.width());
break;
default:
// TODO
break;
SkASSERT(blitter);
if (fBurstCtx) {
// We can only burst shade one row at a time.
int x = clip.left();
for (int y = clip.top(); y < clip.bottom(); y++) {
this->burst_shade(x,y,clip.width());
(*blitter)(x,y, clip.width(),1);
}
} else {
// If not bursting we can blit the entire mask at once.
(*blitter)(clip.left(),clip.top(), clip.width(),clip.height());
}
}

View File

@ -16,6 +16,7 @@
#include "SkString.h"
#include "SkWriteBuffer.h"
#include "SkXfermodePriv.h"
#include "../jumper/SkJumper.h"
#if SK_SUPPORT_GPU
#include "GrFragmentProcessor.h"
@ -36,26 +37,30 @@ public:
SkRasterPipeline_<256> p;
SkJumper_MemoryCtx dst_ctx = { (void*)dst, 0 },
src_ctx = { (void*)src, 0 },
aa_ctx = { (void*)aa, 0 };
if (kN32_SkColorType == kBGRA_8888_SkColorType) {
p.append(SkRasterPipeline::load_bgra_dst, &dst);
p.append(SkRasterPipeline::load_bgra , &src);
p.append(SkRasterPipeline::load_bgra_dst, &dst_ctx);
p.append(SkRasterPipeline::load_bgra , &src_ctx);
} else {
p.append(SkRasterPipeline::load_8888_dst, &dst);
p.append(SkRasterPipeline::load_8888, &src);
p.append(SkRasterPipeline::load_8888_dst, &dst_ctx);
p.append(SkRasterPipeline::load_8888, &src_ctx);
}
SkBlendMode_AppendStagesNoClamp(fMode, &p);
if (aa) {
p.append(SkRasterPipeline::lerp_u8, &aa);
p.append(SkRasterPipeline::lerp_u8, &aa_ctx);
}
SkBlendMode_AppendClampIfNeeded(fMode, &p);
if (kN32_SkColorType == kBGRA_8888_SkColorType) {
p.append(SkRasterPipeline::store_bgra, &dst);
p.append(SkRasterPipeline::store_bgra, &dst_ctx);
} else {
p.append(SkRasterPipeline::store_8888, &dst);
p.append(SkRasterPipeline::store_8888, &dst_ctx);
}
p.run(0, 0, count);
p.run(0, 0, count,1);
}
private:

View File

@ -22,6 +22,7 @@
#include "SkRasterPipeline.h"
#include "SkUnPreMultiply.h"
#include "SkUnPreMultiplyPriv.h"
#include "../jumper/SkJumper.h"
/**
* Function template for transforming scanlines.
@ -153,18 +154,20 @@ static inline void transform_scanline_bgrA(char* SK_RESTRICT dst, const char* SK
template <bool kIsRGBA>
static inline void transform_scanline_unpremultiply_sRGB(void* dst, const void* src, int width) {
SkJumper_MemoryCtx src_ctx = { (void*)src, 0 },
dst_ctx = { (void*)dst, 0 };
SkRasterPipeline_<256> p;
if (kIsRGBA) {
p.append(SkRasterPipeline::load_8888, &src);
p.append(SkRasterPipeline::load_8888, &src_ctx);
} else {
p.append(SkRasterPipeline::load_bgra, &src);
p.append(SkRasterPipeline::load_bgra, &src_ctx);
}
p.append_from_srgb(kPremul_SkAlphaType);
p.append(SkRasterPipeline::unpremul);
p.append(SkRasterPipeline::to_srgb);
p.append(SkRasterPipeline::store_8888, &dst);
p.run(0,0, width);
p.append(SkRasterPipeline::store_8888, &dst_ctx);
p.run(0,0, width,1);
}
/**
@ -182,13 +185,15 @@ static inline void transform_scanline_to_premul_legacy(char* SK_RESTRICT dst,
static inline void transform_scanline_to_premul_linear(char* SK_RESTRICT dst,
const char* SK_RESTRICT src,
int width, int, const SkPMColor*) {
SkJumper_MemoryCtx src_ctx = { (void*)src, 0 },
dst_ctx = { (void*)dst, 0 };
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::load_8888, (const void**) &src);
p.append(SkRasterPipeline::load_8888, &src_ctx);
p.append_from_srgb(kUnpremul_SkAlphaType);
p.append(SkRasterPipeline::premul);
p.append(SkRasterPipeline::to_srgb);
p.append(SkRasterPipeline::store_8888, (void**) &dst);
p.run(0,0, width);
p.append(SkRasterPipeline::store_8888, &dst_ctx);
p.run(0,0, width,1);
}
/**
@ -256,11 +261,13 @@ static inline void transform_scanline_4444(char* SK_RESTRICT dst, const char* SK
*/
static inline void transform_scanline_F16(char* SK_RESTRICT dst, const char* SK_RESTRICT src,
int width, int, const SkPMColor*) {
SkJumper_MemoryCtx src_ctx = { (void*)src, 0 },
dst_ctx = { (void*)dst, 0 };
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::load_f16, (const void**) &src);
p.append(SkRasterPipeline::load_f16, &src_ctx);
p.append(SkRasterPipeline::to_srgb);
p.append(SkRasterPipeline::store_u16_be, (void**) &dst);
p.run(0,0, width);
p.append(SkRasterPipeline::store_u16_be, &dst_ctx);
p.run(0,0, width,1);
}
/**
@ -268,12 +275,14 @@ static inline void transform_scanline_F16(char* SK_RESTRICT dst, const char* SK_
*/
static inline void transform_scanline_F16_premul(char* SK_RESTRICT dst, const char* SK_RESTRICT src,
int width, int, const SkPMColor*) {
SkJumper_MemoryCtx src_ctx = { (void*)src, 0 },
dst_ctx = { (void*)dst, 0 };
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::load_f16, (const void**) &src);
p.append(SkRasterPipeline::load_f16, &src_ctx);
p.append(SkRasterPipeline::unpremul);
p.append(SkRasterPipeline::to_srgb);
p.append(SkRasterPipeline::store_u16_be, (void**) &dst);
p.run(0,0, width);
p.append(SkRasterPipeline::store_u16_be, &dst_ctx);
p.run(0,0, width,1);
}
/**
@ -282,11 +291,13 @@ static inline void transform_scanline_F16_premul(char* SK_RESTRICT dst, const ch
static inline void transform_scanline_F16_to_8888(char* SK_RESTRICT dst,
const char* SK_RESTRICT src, int width, int,
const SkPMColor*) {
SkJumper_MemoryCtx src_ctx = { (void*)src, 0 },
dst_ctx = { (void*)dst, 0 };
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::load_f16, (const void**) &src);
p.append(SkRasterPipeline::load_f16, &src_ctx);
p.append(SkRasterPipeline::to_srgb);
p.append(SkRasterPipeline::store_8888, (void**) &dst);
p.run(0,0, width);
p.append(SkRasterPipeline::store_8888, &dst_ctx);
p.run(0,0, width,1);
}
/**
@ -295,12 +306,14 @@ static inline void transform_scanline_F16_to_8888(char* SK_RESTRICT dst,
static inline void transform_scanline_F16_premul_to_8888(char* SK_RESTRICT dst,
                                                         const char* SK_RESTRICT src, int width,
                                                         int, const SkPMColor*) {
    // 2d-compatible memory contexts; stride 0 because we run a single scanline.
    SkJumper_MemoryCtx src_ctx = { (void*)src, 0 },
                       dst_ctx = { (void*)dst, 0 };
    SkRasterPipeline_<256> p;
    p.append(SkRasterPipeline::load_f16,   &src_ctx);
    p.append(SkRasterPipeline::unpremul);                 // source is premultiplied; undo before encoding
    p.append(SkRasterPipeline::to_srgb);
    p.append(SkRasterPipeline::store_8888, &dst_ctx);
    p.run(0,0, width,1);
}
/**
@ -308,12 +321,14 @@ static inline void transform_scanline_F16_premul_to_8888(char* SK_RESTRICT dst,
*/
static inline void transform_scanline_F16_to_premul_8888(char* SK_RESTRICT dst,
        const char* SK_RESTRICT src, int width, int, const SkPMColor*) {
    // 2d-compatible memory contexts; stride 0 because we run a single scanline.
    SkJumper_MemoryCtx src_ctx = { (void*)src, 0 },
                       dst_ctx = { (void*)dst, 0 };
    SkRasterPipeline_<256> p;
    p.append(SkRasterPipeline::load_f16,   &src_ctx);
    p.append(SkRasterPipeline::premul);                   // unpremultiplied source -> premultiplied output
    p.append(SkRasterPipeline::to_srgb);
    p.append(SkRasterPipeline::store_8888, &dst_ctx);
    p.run(0,0, width,1);
}
static inline sk_sp<SkData> icc_from_color_space(const SkImageInfo& info) {

View File

@ -58,10 +58,9 @@ static const int kNumStages = SK_RASTER_PIPELINE_STAGES(M);
#endif
// We can't express the real types of most stage functions portably, so we use a stand-in.
// We'll only ever call start_pipeline() or start_pipeline_2d(), which then chain into the rest.
using StageFn = void(void);
using StartPipelineFn = void(size_t,size_t,size_t, void**,K*);
using StartPipeline2dFn = void(size_t,size_t,size_t,size_t, void**,K*);
// We'll only ever call start_pipeline(), which then chains into the rest.
using StageFn = void(void);
using StartPipelineFn = void(size_t,size_t,size_t,size_t, void**,K*);
// Some platforms expect C "name" maps to asm "_name", others to "name".
#if defined(__APPLE__)
@ -107,16 +106,14 @@ extern "C" {
// We'll just run portable code.
#elif defined(__aarch64__)
StartPipelineFn ASM(start_pipeline ,aarch64);
StartPipeline2dFn ASM(start_pipeline_2d,aarch64);
StartPipelineFn ASM(start_pipeline,aarch64);
StageFn ASM(just_return,aarch64);
#define M(st) StageFn ASM(st,aarch64);
SK_RASTER_PIPELINE_STAGES(M)
#undef M
#elif defined(__arm__)
StartPipelineFn ASM(start_pipeline ,vfp4);
StartPipeline2dFn ASM(start_pipeline_2d,vfp4);
StartPipelineFn ASM(start_pipeline,vfp4);
StageFn ASM(just_return,vfp4);
#define M(st) StageFn ASM(st,vfp4);
SK_RASTER_PIPELINE_STAGES(M)
@ -130,13 +127,6 @@ extern "C" {
ASM(start_pipeline,hsw_lowp ),
ASM(start_pipeline,ssse3_lowp);
StartPipeline2dFn ASM(start_pipeline_2d,hsw ),
ASM(start_pipeline_2d,avx ),
ASM(start_pipeline_2d,sse41 ),
ASM(start_pipeline_2d,sse2 ),
ASM(start_pipeline_2d,hsw_lowp ),
ASM(start_pipeline_2d,ssse3_lowp);
StageFn ASM(just_return,hsw),
ASM(just_return,avx),
ASM(just_return,sse41),
@ -166,8 +156,7 @@ extern "C" {
#elif (defined(__i386__) || defined(_M_IX86)) && \
!(defined(_MSC_VER) && defined(SK_SUPPORT_LEGACY_WIN32_JUMPER))
StartPipelineFn ASM(start_pipeline ,sse2);
StartPipeline2dFn ASM(start_pipeline_2d,sse2);
StartPipelineFn ASM(start_pipeline,sse2);
StageFn ASM(just_return,sse2);
#define M(st) StageFn ASM(st,sse2);
SK_RASTER_PIPELINE_STAGES(M)
@ -176,8 +165,7 @@ extern "C" {
#endif
// Portable, single-pixel stages.
StartPipelineFn sk_start_pipeline;
StartPipeline2dFn sk_start_pipeline_2d;
StartPipelineFn sk_start_pipeline;
StageFn sk_just_return;
#define M(st) StageFn sk_##st;
SK_RASTER_PIPELINE_STAGES(M)
@ -204,10 +192,9 @@ extern "C" {
// Engines comprise everything we need to run SkRasterPipelines.
struct SkJumper_Engine {
StageFn* stages[kNumStages];
StartPipelineFn* start_pipeline;
StartPipeline2dFn* start_pipeline_2d;
StageFn* just_return;
StageFn* stages[kNumStages];
StartPipelineFn* start_pipeline;
StageFn* just_return;
};
// We'll default to this portable engine, but try to choose a better one at runtime.
@ -216,7 +203,6 @@ static const SkJumper_Engine kPortable = {
{ SK_RASTER_PIPELINE_STAGES(M) },
#undef M
sk_start_pipeline,
sk_start_pipeline_2d,
sk_just_return,
};
static SkJumper_Engine gEngine = kPortable;
@ -231,7 +217,6 @@ static SkJumper_Engine choose_engine() {
#define M(stage) ASM(stage, aarch64),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline)
M(start_pipeline_2d)
M(just_return)
#undef M
};
@ -242,7 +227,6 @@ static SkJumper_Engine choose_engine() {
#define M(stage) ASM(stage, vfp4),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline)
M(start_pipeline_2d)
M(just_return)
#undef M
};
@ -254,7 +238,6 @@ static SkJumper_Engine choose_engine() {
#define M(stage) ASM(stage, hsw),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline)
M(start_pipeline_2d)
M(just_return)
#undef M
};
@ -264,7 +247,6 @@ static SkJumper_Engine choose_engine() {
#define M(stage) ASM(stage, avx),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline)
M(start_pipeline_2d)
M(just_return)
#undef M
};
@ -274,7 +256,6 @@ static SkJumper_Engine choose_engine() {
#define M(stage) ASM(stage, sse41),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline)
M(start_pipeline_2d)
M(just_return)
#undef M
};
@ -284,7 +265,6 @@ static SkJumper_Engine choose_engine() {
#define M(stage) ASM(stage, sse2),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline)
M(start_pipeline_2d)
M(just_return)
#undef M
};
@ -297,7 +277,6 @@ static SkJumper_Engine choose_engine() {
#define M(stage) ASM(stage, sse2),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline)
M(start_pipeline_2d)
M(just_return)
#undef M
};
@ -314,7 +293,6 @@ static SkJumper_Engine choose_engine() {
#undef M
nullptr,
nullptr,
nullptr,
};
static SkJumper_Engine gLowp = kNone;
static SkOnce gChooseLowpOnce;
@ -325,9 +303,8 @@ static SkJumper_Engine choose_engine() {
return {
#define M(st) hsw_lowp<SkRasterPipeline::st>(),
{ SK_RASTER_PIPELINE_STAGES(M) },
ASM(start_pipeline ,hsw_lowp),
ASM(start_pipeline_2d,hsw_lowp),
ASM(just_return ,hsw_lowp)
ASM(start_pipeline,hsw_lowp),
ASM(just_return ,hsw_lowp)
#undef M
};
}
@ -335,9 +312,8 @@ static SkJumper_Engine choose_engine() {
return {
#define M(st) ssse3_lowp<SkRasterPipeline::st>(),
{ SK_RASTER_PIPELINE_STAGES(M) },
ASM(start_pipeline ,ssse3_lowp),
ASM(start_pipeline_2d,ssse3_lowp),
ASM(just_return ,ssse3_lowp)
ASM(start_pipeline,ssse3_lowp),
ASM(just_return ,ssse3_lowp)
#undef M
};
}
@ -387,7 +363,7 @@ const SkJumper_Engine& SkRasterPipeline::build_pipeline(void** ip) const {
return gEngine;
}
void SkRasterPipeline::run(size_t x, size_t y, size_t n) const {
void SkRasterPipeline::run(size_t x, size_t y, size_t w, size_t h) const {
if (this->empty()) {
return;
}
@ -396,31 +372,19 @@ void SkRasterPipeline::run(size_t x, size_t y, size_t n) const {
SkAutoSTMalloc<64, void*> program(fSlotsNeeded);
const SkJumper_Engine& engine = this->build_pipeline(program.get() + fSlotsNeeded);
engine.start_pipeline(x,y,x+n, program.get(), &kConstants);
engine.start_pipeline(x,y,x+w,y+h, program.get(), &kConstants);
}
std::function<void(size_t, size_t, size_t)> SkRasterPipeline::compile() const {
std::function<void(size_t, size_t, size_t, size_t)> SkRasterPipeline::compile() const {
if (this->empty()) {
return [](size_t, size_t, size_t) {};
return [](size_t, size_t, size_t, size_t) {};
}
void** program = fAlloc->makeArray<void*>(fSlotsNeeded);
const SkJumper_Engine& engine = this->build_pipeline(program + fSlotsNeeded);
auto start_pipeline = engine.start_pipeline;
return [=](size_t x, size_t y, size_t n) {
start_pipeline(x,y,x+n, program, &kConstants);
return [=](size_t x, size_t y, size_t w, size_t h) {
start_pipeline(x,y,x+w,y+h, program, &kConstants);
};
}
// Run the pipeline over the 2d region [x,x+w) x [y,y+h).
// NOTE(review): run() now takes (x,y,w,h) itself, making this 1d/2d split
// redundant — this entry point looks slated for removal; confirm no callers remain.
void SkRasterPipeline::run_2d(size_t x, size_t y, size_t w, size_t h) const {
if (this->empty()) {
return;
}
// Like in run(), it's best to not use fAlloc here... we can't bound how often we'll be called.
SkAutoSTMalloc<64, void*> program(fSlotsNeeded);
const SkJumper_Engine& engine = this->build_pipeline(program.get() + fSlotsNeeded);
engine.start_pipeline_2d(x,y,x+w,y+h, program.get(), &kConstants);
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -52,35 +52,24 @@ using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,
__attribute__((disable_tail_calls))
#endif
MAYBE_MSABI
// Single entry point for the whole pipeline: iterates rows [y,ylimit) and,
// within each row, columns [x,xlimit) — kStride pixels at a time, then one
// masked "tail" call for any remainder narrower than kStride.
extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit,
                                     void** program, K* k) {
#if defined(JUMPER)
    F v;            // Stages overwrite the lanes they use; leave uninitialized.
#else
    F v{};          // Portable build value-initializes the lanes.
#endif
    auto start = (Stage*)load_and_inc(program);
    size_t x0 = x;  // Remember the left edge so each row restarts there.
    for (; y < ylimit; y++) {
        x = x0;
        while (x + kStride <= xlimit) {
            start(k,program,x,y,0,    v,v,v,v, v,v,v,v);   // full kStride-wide chunk (tail == 0)
            x += kStride;
        }
        if (size_t tail = xlimit - x) {
            start(k,program,x,y,tail, v,v,v,v, v,v,v,v);   // partial chunk of `tail` pixels
        }
    }
}
@ -206,6 +195,13 @@ SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) {
*a = cast((_8888 >> 24) ) * (1/255.0f);
}
// Used by load_ and store_ stages to get to the right (x,y) starting point of contiguous memory.
// `stride` is measured in elements of T (pointer arithmetic is done on T*), not bytes.
template <typename T>
SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int x, int y) {
return (T*)ctx->pixels + y*ctx->stride + x;
}
// Used by gather_ stages to calculate the base pointer and a vector of indices to load.
template <typename T>
SI U32 ix_and_ptr(T** ptr, const SkJumper_MemoryCtx* ctx, F x, F y) {
*ptr = (const T*)ctx->pixels;
@ -483,7 +479,7 @@ STAGE(luminosity) {
}
STAGE(srcover_rgba_8888) {
auto ptr = *(uint32_t**)ctx + x;
auto ptr = ptr_at_xy<uint32_t>(ctx, x,y);
U32 dst = load<U32>(ptr, tail);
dr = cast((dst ) & 0xff);
@ -668,7 +664,7 @@ STAGE(scale_1_float) {
a = a * c;
}
STAGE(scale_u8) {
auto ptr = *(const uint8_t**)ctx + x;
auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
auto scales = load<U8>(ptr, tail);
auto c = from_byte(scales);
@ -692,7 +688,7 @@ STAGE(lerp_1_float) {
a = lerp(da, a, c);
}
STAGE(lerp_u8) {
auto ptr = *(const uint8_t**)ctx + x;
auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
auto scales = load<U8>(ptr, tail);
auto c = from_byte(scales);
@ -703,7 +699,7 @@ STAGE(lerp_u8) {
a = lerp(da, a, c);
}
STAGE(lerp_565) {
auto ptr = *(const uint16_t**)ctx + x;
auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y);
F cr,cg,cb;
from_565(load<U16>(ptr, tail), &cr, &cg, &cb);
@ -808,13 +804,13 @@ STAGE(lab_to_xyz) {
}
STAGE(load_a8) {
auto ptr = *(const uint8_t**)ctx + x;
auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
r = g = b = 0.0f;
a = from_byte(load<U8>(ptr, tail));
}
STAGE(load_a8_dst) {
auto ptr = *(const uint8_t**)ctx + x;
auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
dr = dg = db = 0.0f;
da = from_byte(load<U8>(ptr, tail));
@ -826,20 +822,20 @@ STAGE(gather_a8) {
a = from_byte(gather(ptr, ix));
}
STAGE(store_a8) {
auto ptr = *(uint8_t**)ctx + x;
auto ptr = ptr_at_xy<uint8_t>(ctx, x,y);
U8 packed = pack(pack(round(a, 255.0f)));
store(ptr, packed, tail);
}
STAGE(load_g8) {
auto ptr = *(const uint8_t**)ctx + x;
auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
r = g = b = from_byte(load<U8>(ptr, tail));
a = 1.0f;
}
STAGE(load_g8_dst) {
auto ptr = *(const uint8_t**)ctx + x;
auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
dr = dg = db = from_byte(load<U8>(ptr, tail));
da = 1.0f;
@ -852,13 +848,13 @@ STAGE(gather_g8) {
}
STAGE(load_565) {
auto ptr = *(const uint16_t**)ctx + x;
auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y);
from_565(load<U16>(ptr, tail), &r,&g,&b);
a = 1.0f;
}
STAGE(load_565_dst) {
auto ptr = *(const uint16_t**)ctx + x;
auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y);
from_565(load<U16>(ptr, tail), &dr,&dg,&db);
da = 1.0f;
@ -870,7 +866,7 @@ STAGE(gather_565) {
a = 1.0f;
}
STAGE(store_565) {
auto ptr = *(uint16_t**)ctx + x;
auto ptr = ptr_at_xy<uint16_t>(ctx, x,y);
U16 px = pack( round(r, 31.0f) << 11
| round(g, 63.0f) << 5
@ -879,11 +875,11 @@ STAGE(store_565) {
}
STAGE(load_4444) {
auto ptr = *(const uint16_t**)ctx + x;
auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y);
from_4444(load<U16>(ptr, tail), &r,&g,&b,&a);
}
STAGE(load_4444_dst) {
auto ptr = *(const uint16_t**)ctx + x;
auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y);
from_4444(load<U16>(ptr, tail), &dr,&dg,&db,&da);
}
STAGE(gather_4444) {
@ -892,7 +888,7 @@ STAGE(gather_4444) {
from_4444(gather(ptr, ix), &r,&g,&b,&a);
}
STAGE(store_4444) {
auto ptr = *(uint16_t**)ctx + x;
auto ptr = ptr_at_xy<uint16_t>(ctx, x,y);
U16 px = pack( round(r, 15.0f) << 12
| round(g, 15.0f) << 8
| round(b, 15.0f) << 4
@ -901,11 +897,11 @@ STAGE(store_4444) {
}
STAGE(load_8888) {
auto ptr = *(const uint32_t**)ctx + x;
auto ptr = ptr_at_xy<const uint32_t>(ctx, x,y);
from_8888(load<U32>(ptr, tail), &r,&g,&b,&a);
}
STAGE(load_8888_dst) {
auto ptr = *(const uint32_t**)ctx + x;
auto ptr = ptr_at_xy<const uint32_t>(ctx, x,y);
from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da);
}
STAGE(gather_8888) {
@ -914,18 +910,7 @@ STAGE(gather_8888) {
from_8888(gather(ptr, ix), &r,&g,&b,&a);
}
STAGE(store_8888) {
auto ptr = *(uint32_t**)ctx + x;
U32 px = round(r, 255.0f)
| round(g, 255.0f) << 8
| round(b, 255.0f) << 16
| round(a, 255.0f) << 24;
store(ptr, px, tail);
}
STAGE(store_8888_2d) {
auto c = (const SkJumper_MemoryCtx*)ctx;
auto ptr = (uint32_t*)c->pixels + y*c->stride + x;
auto ptr = ptr_at_xy<uint32_t>(ctx, x,y);
U32 px = round(r, 255.0f)
| round(g, 255.0f) << 8
@ -935,11 +920,11 @@ STAGE(store_8888_2d) {
}
STAGE(load_bgra) {
auto ptr = *(const uint32_t**)ctx + x;
auto ptr = ptr_at_xy<const uint32_t>(ctx, x,y);
from_8888(load<U32>(ptr, tail), &b,&g,&r,&a);
}
STAGE(load_bgra_dst) {
auto ptr = *(const uint32_t**)ctx + x;
auto ptr = ptr_at_xy<const uint32_t>(ctx, x,y);
from_8888(load<U32>(ptr, tail), &db,&dg,&dr,&da);
}
STAGE(gather_bgra) {
@ -948,7 +933,7 @@ STAGE(gather_bgra) {
from_8888(gather(ptr, ix), &b,&g,&r,&a);
}
STAGE(store_bgra) {
auto ptr = *(uint32_t**)ctx + x;
auto ptr = ptr_at_xy<uint32_t>(ctx, x,y);
U32 px = round(b, 255.0f)
| round(g, 255.0f) << 8
@ -958,7 +943,7 @@ STAGE(store_bgra) {
}
STAGE(load_f16) {
auto ptr = *(const uint64_t**)ctx + x;
auto ptr = ptr_at_xy<const uint64_t>(ctx, x,y);
U16 R,G,B,A;
load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
@ -968,7 +953,7 @@ STAGE(load_f16) {
a = from_half(A);
}
STAGE(load_f16_dst) {
auto ptr = *(const uint64_t**)ctx + x;
auto ptr = ptr_at_xy<const uint64_t>(ctx, x,y);
U16 R,G,B,A;
load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
@ -990,7 +975,7 @@ STAGE(gather_f16) {
a = from_half(A);
}
STAGE(store_f16) {
auto ptr = *(uint64_t**)ctx + x;
auto ptr = ptr_at_xy<uint64_t>(ctx, x,y);
store4((uint16_t*)ptr,tail, to_half(r)
, to_half(g)
, to_half(b)
@ -998,7 +983,7 @@ STAGE(store_f16) {
}
STAGE(load_u16_be) {
auto ptr = *(const uint16_t**)ctx + 4*x;
auto ptr = ptr_at_xy<const uint16_t>(ctx, 4*x,y);
U16 R,G,B,A;
load4(ptr,tail, &R,&G,&B,&A);
@ -1009,7 +994,7 @@ STAGE(load_u16_be) {
a = (1/65535.0f) * cast(expand(bswap(A)));
}
STAGE(load_rgb_u16_be) {
auto ptr = *(const uint16_t**)ctx + 3*x;
auto ptr = ptr_at_xy<const uint16_t>(ctx, 3*x,y);
U16 R,G,B;
load3(ptr,tail, &R,&G,&B);
@ -1020,7 +1005,7 @@ STAGE(load_rgb_u16_be) {
a = 1.0f;
}
STAGE(store_u16_be) {
auto ptr = *(uint16_t**)ctx + 4*x;
auto ptr = ptr_at_xy<uint16_t>(ctx, 4*x,y);
U16 R = bswap(pack(round(r, 65535.0f))),
G = bswap(pack(round(g, 65535.0f))),
@ -1031,15 +1016,15 @@ STAGE(store_u16_be) {
}
STAGE(load_f32) {
auto ptr = *(const float**)ctx + 4*x;
auto ptr = ptr_at_xy<const float>(ctx, 4*x,y);
load4(ptr,tail, &r,&g,&b,&a);
}
STAGE(load_f32_dst) {
auto ptr = *(const float**)ctx + 4*x;
auto ptr = ptr_at_xy<const float>(ctx, 4*x,y);
load4(ptr,tail, &dr,&dg,&db,&da);
}
STAGE(store_f32) {
auto ptr = *(float**)ctx + 4*x;
auto ptr = ptr_at_xy<float>(ctx, 4*x,y);
store4(ptr,tail, r,g,b,a);
}

View File

@ -70,29 +70,20 @@ using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,
__attribute__((disable_tail_calls))
#endif
MAYBE_MSABI
// Lowp entry point for the whole pipeline: iterates rows [y,ylimit) and,
// within each row, columns [x,xlimit) — kStride pixels per call, then one
// masked "tail" call for any remainder narrower than kStride.
extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit,
                                     void** program, K* k) {
    F v;
    auto start = (Stage*)load_and_inc(program);
    size_t x0 = x;  // Left edge; each row restarts here.
    for (; y < ylimit; y++) {
        x = x0;
        while (x + kStride <= xlimit) {
            start(k,program,x,y,0,    v,v,v,v, v,v,v,v);   // full chunk (tail == 0)
            x += kStride;
        }
        if (size_t tail = xlimit - x) {
            start(k,program,x,y,tail, v,v,v,v, v,v,v,v);   // partial chunk of `tail` pixels
        }
    }
}
@ -219,6 +210,11 @@ SI U32 to_8888(F r, F g, F b, F a) {
| __builtin_convertvector(to_wide_byte(a), U32) << 24;
}
// Advance to the (x,y) pixel of a SkJumper_MemoryCtx image.
// `stride` is measured in elements of T (pointer arithmetic is done on T*), not bytes.
template <typename T>
SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int x, int y) {
return (T*)ctx->pixels + y*ctx->stride + x;
}
// Stages!
STAGE(uniform_color) {
@ -256,60 +252,60 @@ STAGE(premul) {
}
STAGE(load_8888) {
auto ptr = *(const uint32_t**)ctx + x;
auto ptr = ptr_at_xy<const uint32_t>(ctx, x,y);
from_8888(load<U32>(ptr, tail), &r,&g,&b,&a);
}
STAGE(load_8888_dst) {
auto ptr = *(const uint32_t**)ctx + x;
auto ptr = ptr_at_xy<const uint32_t>(ctx, x,y);
from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da);
}
STAGE(store_8888) {
auto ptr = *(uint32_t**)ctx + x;
auto ptr = ptr_at_xy<uint32_t>(ctx, x,y);
store(ptr, to_8888(r,g,b,a), tail);
}
STAGE(load_bgra) {
auto ptr = *(const uint32_t**)ctx + x;
auto ptr = ptr_at_xy<const uint32_t>(ctx, x,y);
from_8888(load<U32>(ptr, tail), &b,&g,&r,&a);
}
STAGE(load_bgra_dst) {
auto ptr = *(const uint32_t**)ctx + x;
auto ptr = ptr_at_xy<const uint32_t>(ctx, x,y);
from_8888(load<U32>(ptr, tail), &db,&dg,&dr,&da);
}
STAGE(store_bgra) {
auto ptr = *(uint32_t**)ctx + x;
auto ptr = ptr_at_xy<uint32_t>(ctx, x,y);
store(ptr, to_8888(b,g,r,a), tail);
}
STAGE(load_a8) {
auto ptr = *(const uint8_t**)ctx + x;
auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
r = g = b = 0.0f;
a = from_byte(load<U8>(ptr, tail));
}
STAGE(load_a8_dst) {
auto ptr = *(const uint8_t**)ctx + x;
auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
dr = dg = db = 0.0f;
da = from_byte(load<U8>(ptr, tail));
}
STAGE(store_a8) {
auto ptr = *(uint8_t**)ctx + x;
auto ptr = ptr_at_xy<uint8_t>(ctx, x,y);
store(ptr, to_byte(a), tail);
}
STAGE(load_g8) {
auto ptr = *(const uint8_t**)ctx + x;
auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
r = g = b = from_byte(load<U8>(ptr, tail));
a = 1.0f;
}
STAGE(load_g8_dst) {
auto ptr = *(const uint8_t**)ctx + x;
auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
dr = dg = db = from_byte(load<U8>(ptr, tail));
da = 1.0f;
}
STAGE(srcover_rgba_8888) {
auto ptr = *(uint32_t**)ctx + x;
auto ptr = ptr_at_xy<uint32_t>(ctx, x,y);
from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da);
@ -330,7 +326,7 @@ STAGE(scale_1_float) {
a = a * c;
}
STAGE(scale_u8) {
auto ptr = *(const uint8_t**)ctx + x;
auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
U8 scales = load<U8>(ptr, tail);
F c = from_byte(scales);
@ -350,7 +346,7 @@ STAGE(lerp_1_float) {
a = lerp(da, a, c);
}
STAGE(lerp_u8) {
auto ptr = *(const uint8_t**)ctx + x;
auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
U8 scales = load<U8>(ptr, tail);
F c = from_byte(scales);

View File

@ -7,6 +7,7 @@
#include "SkRasterPipeline.h"
#include "Test.h"
#include "../src/jumper/SkJumper.h"
DEF_TEST(F16Stages, r) {
// Make sure SkRasterPipeline::load_f16 and store_f16 can handle a range of
@ -18,36 +19,36 @@ DEF_TEST(F16Stages, r) {
};
uint16_t halfs[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
float* f32 = floats;
uint16_t* f16 = halfs;
SkJumper_MemoryCtx f32 = { floats, 0 },
f16 = { halfs, 0 };
{
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline:: load_f32, &f32);
p.append(SkRasterPipeline::store_f16, &f16);
p.run(0,0,16/4);
p.run(0,0,16/4,1);
}
REPORTER_ASSERT(r, f16[0] == 0x0000);
REPORTER_ASSERT(r, f16[1] == 0x3400);
REPORTER_ASSERT(r, f16[2] == 0x3800);
REPORTER_ASSERT(r, f16[3] == 0x3c00);
REPORTER_ASSERT(r, f16[4] == 0xbd00);
REPORTER_ASSERT(r, f16[5] == 0xb800);
REPORTER_ASSERT(r, f16[6] == 0x3d00);
REPORTER_ASSERT(r, f16[7] == 0x4000);
REPORTER_ASSERT(r, halfs[0] == 0x0000);
REPORTER_ASSERT(r, halfs[1] == 0x3400);
REPORTER_ASSERT(r, halfs[2] == 0x3800);
REPORTER_ASSERT(r, halfs[3] == 0x3c00);
REPORTER_ASSERT(r, halfs[4] == 0xbd00);
REPORTER_ASSERT(r, halfs[5] == 0xb800);
REPORTER_ASSERT(r, halfs[6] == 0x3d00);
REPORTER_ASSERT(r, halfs[7] == 0x4000);
{
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline:: load_f16, &f16);
p.append(SkRasterPipeline::store_f32, &f32);
p.run(0,0,16/4);
p.run(0,0,16/4,1);
}
REPORTER_ASSERT(r, f32[0] == 0.00f);
REPORTER_ASSERT(r, f32[1] == 0.25f);
REPORTER_ASSERT(r, f32[2] == 0.50f);
REPORTER_ASSERT(r, f32[3] == 1.00f);
REPORTER_ASSERT(r, f32[4] == -1.25f);
REPORTER_ASSERT(r, f32[5] == -0.50f);
REPORTER_ASSERT(r, f32[6] == 1.25f);
REPORTER_ASSERT(r, f32[7] == 2.00f);
REPORTER_ASSERT(r, floats[0] == 0.00f);
REPORTER_ASSERT(r, floats[1] == 0.25f);
REPORTER_ASSERT(r, floats[2] == 0.50f);
REPORTER_ASSERT(r, floats[3] == 1.00f);
REPORTER_ASSERT(r, floats[4] == -1.25f);
REPORTER_ASSERT(r, floats[5] == -0.50f);
REPORTER_ASSERT(r, floats[6] == 1.25f);
REPORTER_ASSERT(r, floats[7] == 2.00f);
}

View File

@ -8,6 +8,7 @@
#include "SkColorSpace.h"
#include "SkRasterPipeline.h"
#include "Test.h"
#include "../src/jumper/SkJumper.h"
static void check_error(skiatest::Reporter* r, float limit, SkColorSpaceTransferFn fn) {
float in[256], out[256];
@ -16,8 +17,8 @@ static void check_error(skiatest::Reporter* r, float limit, SkColorSpaceTransfer
out[i] = 0.0f; // Not likely important. Just being tidy.
}
const float* ip = in;
float* op = out;
SkJumper_MemoryCtx ip = { in, 0},
op = {out, 0};
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::load_f32, &ip);
@ -27,7 +28,7 @@ static void check_error(skiatest::Reporter* r, float limit, SkColorSpaceTransfer
p.append(SkRasterPipeline::parametric_a, &fn);
p.append(SkRasterPipeline::store_f32, &op);
p.run(0,0, 256/4);
p.run(0,0, 256/4,1);
for (int i = 0; i < 256; i++) {

View File

@ -11,6 +11,7 @@
#include "SkTypes.h"
#include "Test.h"
#include <math.h>
#include "../src/jumper/SkJumper.h"
static uint8_t linear_to_srgb(float l) {
return (uint8_t)sk_linear_to_srgb(Sk4f{l})[0];
@ -46,7 +47,7 @@ DEF_TEST(sk_pipeline_srgb_roundtrip, r) {
reds[i] = i;
}
auto ptr = (void*)reds;
SkJumper_MemoryCtx ptr = { reds, 0 };
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::load_8888, &ptr);
@ -54,7 +55,7 @@ DEF_TEST(sk_pipeline_srgb_roundtrip, r) {
p.append(SkRasterPipeline::to_srgb);
p.append(SkRasterPipeline::store_8888, &ptr);
p.run(0,0,256);
p.run(0,0,256,1);
for (int i = 0; i < 256; i++) {
if (reds[i] != (uint32_t)i) {
@ -67,13 +68,14 @@ DEF_TEST(sk_pipeline_srgb_edge_cases, r) {
// We need to run at least 4 pixels to make sure we hit all specializations.
SkPM4f colors[4] = { {{0,1,1,1}}, {{0,0,0,0}}, {{0,0,0,0}}, {{0,0,0,0}} };
auto& color = colors[0];
void* dst = &color;
SkJumper_MemoryCtx dst = { &color, 0 };
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::uniform_color, &color);
p.append(SkRasterPipeline::to_srgb);
p.append(SkRasterPipeline::store_f32, &dst);
p.run(0,0,4);
p.run(0,0,4,1);
if (color.r() != 0.0f) {
ERRORF(r, "expected to_srgb() to map 0.0f to 0.0f, got %f", color.r());

View File

@ -17,16 +17,16 @@ DEF_TEST(SkRasterPipeline, r) {
blue = 0x3800380000000000ull,
result;
void* load_s_ctx = &blue;
void* load_d_ctx = &red;
void* store_ctx = &result;
SkJumper_MemoryCtx load_s_ctx = { &blue, 0 },
load_d_ctx = { &red, 0 },
store_ctx = { &result, 0 };
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::load_f16, &load_s_ctx);
p.append(SkRasterPipeline::load_f16_dst, &load_d_ctx);
p.append(SkRasterPipeline::srcover);
p.append(SkRasterPipeline::store_f16, &store_ctx);
p.run(0,0,1);
p.run(0,0,1,1);
// We should see half-intensity magenta.
REPORTER_ASSERT(r, ((result >> 0) & 0xffff) == 0x3800);
@ -38,7 +38,7 @@ DEF_TEST(SkRasterPipeline, r) {
DEF_TEST(SkRasterPipeline_empty, r) {
// No asserts... just a test that this is safe to run.
SkRasterPipeline_<256> p;
p.run(0,0,20);
p.run(0,0,20,1);
}
DEF_TEST(SkRasterPipeline_nonsense, r) {
@ -46,7 +46,7 @@ DEF_TEST(SkRasterPipeline_nonsense, r) {
// srcover() calls st->next(); this makes sure we've always got something there to call.
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::srcover);
p.run(0,0,20);
p.run(0,0,20,1);
}
DEF_TEST(SkRasterPipeline_JIT, r) {
@ -61,20 +61,20 @@ DEF_TEST(SkRasterPipeline_JIT, r) {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
const uint32_t* src = buf + 0;
uint32_t* dst = buf + 36;
SkJumper_MemoryCtx src = { buf + 0, 0 },
dst = { buf + 36, 0 };
// Copy buf[x] to buf[x+36] for x in [15,35).
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline:: load_8888, &src);
p.append(SkRasterPipeline::store_8888, &dst);
p.run(15,0, 20);
p.run(15,0, 20,1);
for (int i = 0; i < 36; i++) {
if (i < 15 || i == 35) {
REPORTER_ASSERT(r, dst[i] == 0);
REPORTER_ASSERT(r, buf[i+36] == 0);
} else {
REPORTER_ASSERT(r, dst[i] == (uint32_t)(i - 11));
REPORTER_ASSERT(r, buf[i+36] == (uint32_t)(i - 11));
}
}
}
@ -111,15 +111,16 @@ DEF_TEST(SkRasterPipeline_tail, r) {
};
float buffer[4][4];
float* src = &data[0][0];
float* dst = &buffer[0][0];
SkJumper_MemoryCtx src = { &data[0][0], 0 },
dst = { &buffer[0][0], 0 };
for (unsigned i = 1; i <= 4; i++) {
memset(buffer, 0xff, sizeof(buffer));
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::load_f32, &src);
p.append(SkRasterPipeline::store_f32, &dst);
p.run(0,0, i);
p.run(0,0, i,1);
for (unsigned j = 0; j < i; j++) {
for (unsigned k = 0; k < 4; k++) {
if (buffer[j][k] != data[j][k]) {
@ -143,15 +144,15 @@ DEF_TEST(SkRasterPipeline_tail, r) {
{h(30), h(31), h(32), h(33)},
};
uint16_t buffer[4][4];
uint16_t* src = &data[0][0];
uint16_t* dst = &buffer[0][0];
SkJumper_MemoryCtx src = { &data[0][0], 0 },
dst = { &buffer[0][0], 0 };
for (unsigned i = 1; i <= 4; i++) {
memset(buffer, 0xff, sizeof(buffer));
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::load_f16, &src);
p.append(SkRasterPipeline::store_f16, &dst);
p.run(0,0, i);
p.run(0,0, i,1);
for (unsigned j = 0; j < i; j++) {
REPORTER_ASSERT(r,
!memcmp(&data[j][0], &buffer[j][0], sizeof(buffer[j])));
@ -180,15 +181,15 @@ DEF_TEST(SkRasterPipeline_tail, r) {
};
float buffer[4][4];
uint16_t* src = &data[0][0];
float* dst = &buffer[0][0];
SkJumper_MemoryCtx src = { &data[0][0], 0 },
dst = { &buffer[0][0], 0 };
for (unsigned i = 1; i <= 4; i++) {
memset(buffer, 0xff, sizeof(buffer));
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::load_rgb_u16_be, &src);
p.append(SkRasterPipeline::store_f32, &dst);
p.run(0,0, i);
p.run(0,0, i,1);
for (unsigned j = 0; j < i; j++) {
for (unsigned k = 0; k < 4; k++) {
if (buffer[j][k] != answer[j][k]) {
@ -214,12 +215,12 @@ DEF_TEST(SkRasterPipeline_lowp, r) {
| (4*i+3) << 24;
}
void* ptr = rgba;
SkJumper_MemoryCtx ptr = { rgba, 0 };
SkRasterPipeline_<256> p;
p.append(SkRasterPipeline::load_bgra, &ptr);
p.append(SkRasterPipeline::store_8888, &ptr);
p.run(0,0,64);
p.run(0,0,64,1);
for (int i = 0; i < 64; i++) {
uint32_t want = (4*i+0) << 16
@ -247,9 +248,9 @@ DEF_TEST(SkRasterPipeline_2d, r) {
// Write out to rgba, with row stride = 2 pixels.
SkJumper_MemoryCtx ctx = { rgba, 2 };
p.append(SkRasterPipeline::store_8888_2d, &ctx);
p.append(SkRasterPipeline::store_8888, &ctx);
p.run_2d(0,0, 2,2);
p.run(0,0, 2,2);
REPORTER_ASSERT(r, ((rgba[0] >> 0) & 0xff) == 0);
REPORTER_ASSERT(r, ((rgba[1] >> 0) & 0xff) == 128);

View File

@ -83,8 +83,8 @@ namespace sk_tools {
SkAutoTMalloc<uint32_t> rgba(w*h);
const void* src = bitmap.getPixels();
uint32_t* dst = rgba.get();
SkJumper_MemoryCtx src = { bitmap.getPixels(), bitmap.rowBytesAsPixels() },
dst = { rgba.get(), w };
SkRasterPipeline_<256> p;
switch (bitmap.colorType()) {
@ -106,12 +106,7 @@ namespace sk_tools {
}
p.append(SkRasterPipeline::store_8888, &dst);
auto run = p.compile();
for (int y = 0; y < h; y++) {
run(0,y, w);
src = SkTAddOffset<const void>(src, bitmap.rowBytes());
dst += w;
}
p.run(0,0, w,h);
return SkData::MakeFromMalloc(rgba.release(), w*h*sizeof(uint32_t));
}