Revert "Reland "make SkJumper stages normal Skia code""
This reverts commit 78cb579f33.

Reason for revert: lowp should be controlled by defined(JUMPER_IS_SCALAR), not defined(__clang__).  So close.

Original change's description:
> Reland "make SkJumper stages normal Skia code"
>
> This is a reland of 22e536e3a1
>
> Now with fixed #include paths in SkRasterPipeline_opts.h,
> and -ffp-contract=fast for the :hsw target to minimize
> diffs on non-Windows Clang AVX2/AVX-512 bots.
>
> Original change's description:
> > make SkJumper stages normal Skia code
> >
> > Enough clients are using Clang now that we can say, use Clang to build
> > if you want these software pipeline stages to go fast.
> >
> > This lets us drop the offline build aspect of SkJumper stages, instead
> > building as part of Skia using the SkOpts framework.
> >
> > I think everything should work, except I've (temporarily) removed
> > AVX-512 support.  I will put this back in a follow up.
> >
> > I have had to drop Windows down to __vectorcall and our narrower
> > stage calling convention that keeps the d-registers on the stack.
> > I tried forcing sysv_abi, but that crashed Clang. :/
> >
> > Added a TODO to use the same narrower stage calling convention
> > for lowp stages... we just *don't* today, for no good reason.
> >
> > Change-Id: Iaaa792ffe4deab3508d2dc5d0008c163c24b3383
> > Reviewed-on: https://skia-review.googlesource.com/110641
> > Commit-Queue: Mike Klein <mtklein@chromium.org>
> > Reviewed-by: Herb Derby <herb@google.com>
> > Reviewed-by: Florin Malita <fmalita@chromium.org>
>
> Change-Id: I44f2c03d33958e3807747e40904b6351957dd448
> Reviewed-on: https://skia-review.googlesource.com/112742
> Reviewed-by: Mike Klein <mtklein@chromium.org>

TBR=mtklein@chromium.org,herb@google.com,fmalita@chromium.org

Change-Id: Ie64da98f5187d44e03c0ce05d7cb189d4a6e6663
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Reviewed-on: https://skia-review.googlesource.com/112743
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>

This commit is contained in:
  parent 61d56b92a4
  commit 5cc94cc393
52  BUILD.gn
@@ -48,6 +48,10 @@ declare_args() {
  skia_compile_processors = false
  skia_lex = false

  skia_jumper_clang = ""
  skia_jumper_objdump = ""
  skia_jumper_ccache = ""

  skia_skqp_enable_driver_correctness_workarounds = false
  skia_skqp_global_error_tolerance = 0
}

@@ -310,28 +314,6 @@ opts("avx") {
  }
}

opts("hsw") {
  enabled = is_x86
  sources = skia_opts.hsw_sources
  if (!is_clang && is_win) {
    cflags = [ "/arch:AVX2" ]
  } else {
    cflags = [
      "-mavx2",
      "-mf16c",
      "-mfma",
    ]
  }

  # Oddly, clang-cl doesn't recognize this as a valid flag.
  # If it ever does, it'd nice to move this up with -mavx2 and co.
  if (is_clang && !is_win) {
    # This flag lets Clang generate FMAs when it sees a mul-then-add.  It's optional,
    # but nice to have, generating slightly better code for paths without explicit FMAs.
    cflags += [ "-ffp-contract=fast" ]
  }
}

# Any feature of Skia that requires third-party code should be optional and use this template.
template("optional") {
  if (invoker.enabled) {

@@ -793,7 +775,6 @@ component("skia") {
    ":fontmgr_fuchsia",
    ":gpu",
    ":heif",
    ":hsw",
    ":jpeg",
    ":none",
    ":pdf",

@@ -2120,3 +2101,28 @@ if (skia_enable_tools) {
    }
  }
}

if (skia_jumper_clang != "") {
  action("regen_jumper") {
    script = "src/jumper/build_stages.py"

    inputs = [
      "src/jumper/SkJumper_stages.cpp",
      "src/jumper/SkJumper_stages_lowp.cpp",
    ]

    # GN insists its outputs should go somewhere underneath target_out_dir, so we trick it.
    outputs = [
      "$target_out_dir/" +
          rebase_path("src/jumper/SkJumper_generated.S", target_out_dir),
      "$target_out_dir/" +
          rebase_path("src/jumper/SkJumper_generated_win.S", target_out_dir),
    ]

    args = [
             skia_jumper_clang,
             skia_jumper_objdump,
             skia_jumper_ccache,
           ] + rebase_path(inputs) + rebase_path(outputs)
  }
}
20  gn/core.gni
@@ -440,8 +440,6 @@ skia_core_sources = [
  "$_include/core/SkUnPreMultiply.h",
  "$_include/core/SkVertices.h",

  "$_src/jumper/SkJumper.cpp",

  # private
  "$_include/private/SkAtomics.h",
  "$_include/private/SkChecksum.h",

@@ -529,4 +527,20 @@ skia_core_sources = [
  "$_src/pathops/SkReduceOrder.h",
]

skia_core_defines = [] # Used to be used by Chromium, but no longer.
skia_core_sources += [
  "$_src/jumper/SkJumper.cpp",
  "$_src/jumper/SkJumper_stages.cpp",
  "$_src/jumper/SkJumper_stages_lowp.cpp",
]
skia_core_defines = []
if (is_win) {
  if (host_os == "win") {
    skia_core_sources += [ "$_src/jumper/SkJumper_generated_win.S" ]
  } else {
    # TODO(thakis): Enable jumper in linux->win cross builds once the
    # assembler situation is figured out, https://crbug.com/762167
    skia_core_defines += [ "SK_JUMPER_USE_ASSEMBLY=0" ]
  }
} else if (target_cpu != "wasm") {
  skia_core_sources += [ "$_src/jumper/SkJumper_generated.S" ]
}
@@ -289,8 +289,7 @@ with open('Android.bp', 'w') as f:
        defs['ssse3'] +
        defs['sse41'] +
        defs['sse42'] +
        defs['avx' ] +
        defs['hsw' ]),
        defs['avx' ]),

    'dm_includes' : bpfmt(8, dm_includes),
    'dm_srcs'     : bpfmt(8, dm_srcs),

@@ -51,4 +51,3 @@ ssse3 = [
sse41 = [ "$_src/opts/SkOpts_sse41.cpp" ]
sse42 = [ "$_src/opts/SkOpts_sse42.cpp" ]
avx   = [ "$_src/opts/SkOpts_avx.cpp" ]
hsw   = [ "$_src/opts/SkOpts_hsw.cpp" ]

@@ -24,7 +24,7 @@ skia_opts = {
  sse41_sources = sse41
  sse42_sources = sse42
  avx_sources = avx
  hsw_sources = hsw
  hsw_sources = [] # remove after we update Chrome
}

# Skia Chromium defines. These flags will be defined in chromium If these
103  site/dev/contrib/jumper.md  (new file)
@@ -0,0 +1,103 @@
Contributing to SkJumper
========================

SkJumper is the execution engine of SkRasterPipeline, a system we've been using
to accelerate CPU-bound work inside Skia, most notably color-space conversions
and color-correct drawing.

(This is where I'd put my link to the design document if I had one...)

SkJumper is more annoying to contribute to than most Skia code because of its
offline compilation step.  You'll need particular tools installed on your
machine and to tell GN about them.  This document is designed to guide you
through this process and ease some of that annoyance.

One-time Setup
--------------

To generate stage code you need Clang 5.0, objdump, and ccache.  It's best that
Clang is exactly the same version we typically use (as of writing, 5.0.0), and
you'll need objdump to be compiled with support for x86-64, ARMv7, and ARMv8.

The easiest way to satisfy these constraints is to get your hands on a Mac and
install Xcode, the Xcode command line tools, and [Homebrew](https://brew.sh).  Once
you have `brew` installed, run these commands to get the tools you need:

<!--?prettify lang=sh?-->

    ls -d /usr/include >/dev/null || xcode-select --install
    brew install llvm binutils ccache

Setting up GN
-------------

With your tools installed, tell GN about them:

    skia_jumper_clang   = path/to/clang-5.0
    skia_jumper_objdump = path/to/gobjdump
    skia_jumper_ccache  = path/to/ccache

then regenerate and build as normal.
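As a concrete sketch (the tool paths are illustrative, not canonical; point the
args at wherever Homebrew actually installed the tools on your machine):

<!--?prettify lang=sh?-->

    bin/gn gen out/Debug --args='
        skia_jumper_clang   = "/usr/local/opt/llvm/bin/clang"
        skia_jumper_objdump = "/usr/local/opt/binutils/bin/gobjdump"
        skia_jumper_ccache  = "/usr/local/bin/ccache"
    '
    ninja -C out/Debug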

If you look in your GN out directory, you should now see a bunch of `.o` files,
and `git status` should show no changes to `src/jumper/SkJumper_generated*.S`.
That's good.  Those object files are the intermediates we parse to produce
the assembly files.  We just leave them around in case you want to look at
them yourself.

Make A Change
-------------

Let's use the `from_srgb` stage as a little playground to make a real change.
Linearizing sRGB-encoded bytes is slow, so let's pretend we've decided to trade
quality for speed, approximating the existing implementation with a simple square.

Open up `SkJumper_stages.cpp` and find the `from_srgb` stage.  It'll look like this:

<!--?prettify lang=cc?-->

    STAGE(from_srgb) {
        r = from_srgb(r);
        g = from_srgb(g);
        b = from_srgb(b);
    }

Let's replace whatever's there with our fast approximation:

<!--?prettify lang=cc?-->

    STAGE(from_srgb) {
        r *= r;
        g *= g;
        b *= b;
    }

When you save and re-Ninja, you should now see changes to
`src/jumper/SkJumper_generated.S` and `src/jumper/SkJumper_generated_win.S`.
If you can't read assembly, no big deal.  If you can, run `git diff`.  You
should see the various `sk_from_srgb_*` functions get dramatically simpler,
something like three multiplies and a couple of other bookkeeping instructions.

It's not unusual for isolated changes in one stage to cause seemingly unrelated
changes in another.  When adding or removing any code you'll usually see all
the comments in branch instructions change a little bit, but the actual
instruction on the left won't change.  When adding or removing uses of
constants, you'll often see both the comment and instruction on the left change
for other loads of constants from memory, especially on x86-64.  You'll also
see some code that looks like garbage change; those are the constants.  If
any of this worries you, please do go running to someone who knows more for
help, but odds are everything is fine.

At this point things should just be business as usual.  Any time you change
`SkJumper_stages.cpp`, Ninja ought to notice and regenerate the assembly files.

Adding a new Stage
------------------

Adding a new stage is a lot like changing an existing stage.  Edit
`SkJumper_stages.cpp`, build Skia, test, repeat until correct.

You'll just need to also edit `SkRasterPipeline.h` to add your new stage to the
macro listing all the stages.  The stage name is the handle normal Skia code
uses to refer to the stage abstractly, and the wiring between
`SkRasterPipeline::foo` and `STAGE(foo) { ... }` should work automatically.
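As a sketch of the shape (the stage name here is hypothetical, purely for
illustration), the two edits together look roughly like:

<!--?prettify lang=cc?-->

    // In SkRasterPipeline.h, add the stage to the stage-listing macro:
    //     M(my_new_stage)
    // Then in SkJumper_stages.cpp, define its body; as with from_srgb above,
    // r,g,b,a (and dr,dg,db,da) are in scope as vectors of color channels.
    STAGE(my_new_stage) {
        r = 1.0f - r;
        g = 1.0f - g;
        b = 1.0f - b;
    }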

@@ -33,18 +33,6 @@ link Skia against the headers and libraries found on the system paths.
use `extra_cflags` and `extra_ldflags` to add include or library paths if
needed.

A note on software backend performance
--------------------------------------

A number of routines in Skia's software backend have been written to run
fastest when compiled by Clang.  If you depend on software rasterization, image
decoding, or color space conversion and compile Skia with GCC, MSVC or another
compiler, you will see dramatically worse performance than if you use Clang.

This choice was only a matter of prioritization; there is nothing fundamentally
wrong with non-Clang compilers.  So if this is a serious issue for you, please
let us know on the mailing list.

Quickstart
----------

@@ -40,7 +40,6 @@
 #include "SkBlitRow_opts.h"
 #include "SkChecksum_opts.h"
 #include "SkMorphologyImageFilter_opts.h"
 #include "SkRasterPipeline_opts.h"
 #include "SkSwizzler_opts.h"
 #include "SkUtils_opts.h"
 #include "SkXfermode_opts.h"

@@ -82,26 +81,11 @@ namespace SkOpts {

    #undef DEFINE_DEFAULT

    #define M(st) (StageFn)SK_OPTS_NS::st,
        StageFn stages_highp[] = { SK_RASTER_PIPELINE_STAGES(M) };
        StageFn just_return_highp = (StageFn)SK_OPTS_NS::just_return;
        void (*start_pipeline_highp)(size_t,size_t,size_t,size_t,void**)
            = SK_OPTS_NS::start_pipeline;
    #undef M

    #define M(st) (StageFn)SK_OPTS_NS::lowp::st,
        StageFn stages_lowp[] = { SK_RASTER_PIPELINE_STAGES(M) };
        StageFn just_return_lowp = (StageFn)SK_OPTS_NS::lowp::just_return;
        void (*start_pipeline_lowp)(size_t,size_t,size_t,size_t,void**)
            = SK_OPTS_NS::lowp::start_pipeline;
    #undef M

    // Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
    void Init_ssse3();
    void Init_sse41();
    void Init_sse42();
    void Init_avx();
    void Init_hsw();
    void Init_crc32();

    static void init() {

@@ -120,8 +104,7 @@ namespace SkOpts {
    #endif

    #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_AVX
        if (SkCpu::Supports(SkCpu::AVX)) { Init_avx(); }
        if (SkCpu::Supports(SkCpu::HSW)) { Init_hsw(); }
        if (SkCpu::Supports(SkCpu::AVX )) { Init_avx(); }
    #endif

#elif defined(SK_CPU_ARM64)

@@ -54,17 +54,6 @@ namespace SkOpts {
    static inline uint32_t hash(const void* data, size_t bytes, uint32_t seed=0) {
        return hash_fn(data, bytes, seed);
    }

    #define M(st) +1
    // We can't necessarily express the type of SkJumper stage functions here,
    // so we just use this void(*)(void) as a stand-in.
    using StageFn = void(*)(void);
    extern StageFn stages_highp[SK_RASTER_PIPELINE_STAGES(M)], just_return_highp;
    extern StageFn stages_lowp [SK_RASTER_PIPELINE_STAGES(M)], just_return_lowp;

    extern void (*start_pipeline_highp)(size_t,size_t,size_t,size_t, void**);
    extern void (*start_pipeline_lowp )(size_t,size_t,size_t,size_t, void**);
    #undef M
}

#endif//SkOpts_DEFINED

@@ -17,6 +17,8 @@
 #include <functional>
 #include <vector>

struct SkJumper_Engine;

/**
 * SkRasterPipeline provides a cheap way to chain together a pixel processing pipeline.
 *

@@ -161,9 +163,7 @@ private:
        void* ctx;
    };

    using StartPipelineFn = void(*)(size_t,size_t,size_t,size_t, void** program);
    StartPipelineFn build_pipeline(void**) const;

    const SkJumper_Engine& build_pipeline(void**) const;
    void unchecked_append(StockStage, void*);

    SkArenaAlloc* fAlloc;

@@ -5,46 +5,468 @@
 * found in the LICENSE file.
 */

#include "SkColorData.h"
#include "SkCpu.h"
#include "SkJumper.h"
#include "SkOpts.h"
#include "SkOnce.h"
#include "SkRasterPipeline.h"
#include "SkTemplates.h"

SkRasterPipeline::StartPipelineFn SkRasterPipeline::build_pipeline(void** ip) const {
#ifndef SK_JUMPER_DISABLE_8BIT
    // We'll try to build a lowp pipeline, but if that fails fallback to a highp float pipeline.
    void** reset_point = ip;
#if !defined(SK_JUMPER_USE_ASSEMBLY)
    // We'll use __has_feature(memory_sanitizer) to detect MSAN.
    // SkJumper_generated.S is not compiled with MSAN, so MSAN would yell really loud.
    #if !defined(__has_feature)
        #define __has_feature(x) 0
    #endif

    // Stages are stored backwards in fStages, so we reverse here, back to front.
    *--ip = (void*)SkOpts::just_return_lowp;
    #if 0 || __has_feature(memory_sanitizer)
        #define SK_JUMPER_USE_ASSEMBLY 0
    #else
        #define SK_JUMPER_USE_ASSEMBLY 1
    #endif
#endif

#define M(st) +1
    static const int kNumStages = SK_RASTER_PIPELINE_STAGES(M);
#undef M

#ifndef SK_JUMPER_DISABLE_8BIT
    // Intentionally commented out; optional logging for local debugging.
    #if 0 && SK_JUMPER_USE_ASSEMBLY && (defined(__x86_64__) || defined(_M_X64))
        #include <atomic>

        #define M(st) #st,
            static const char* kStageNames[] = { SK_RASTER_PIPELINE_STAGES(M) };
        #undef M

        static std::atomic<int> gMissingStageCounters[kNumStages];

        static void log_missing(SkRasterPipeline::StockStage st) {
            static SkOnce once;
            once([] { atexit([] {
                int total = 0;
                for (int i = 0; i < kNumStages; i++) {
                    if (int count = gMissingStageCounters[i].load()) {
                        SkDebugf("%7d\t%s\n", count, kStageNames[i]);
                        total += count;
                    }
                }
                SkDebugf("%7d total\n", total);
            }); });

            gMissingStageCounters[st]++;
        }
    #else
        static void log_missing(SkRasterPipeline::StockStage) {}
    #endif
#endif

// We can't express the real types of most stage functions portably, so we use a stand-in.
// We'll only ever call start_pipeline(), which then chains into the rest.
using StageFn         = void(void);
using StartPipelineFn = void(size_t,size_t,size_t,size_t, void**);

// Some platforms expect C "name" maps to asm "_name", others to "name".
#if defined(__APPLE__)
    #define ASM(name, suffix)  sk_##name##_##suffix
#else
    #define ASM(name, suffix) _sk_##name##_##suffix
#endif

extern "C" {

#if !SK_JUMPER_USE_ASSEMBLY
    // We'll just run baseline code.

#elif defined(__x86_64__) || defined(_M_X64)
    StartPipelineFn ASM(start_pipeline,       skx),
                    ASM(start_pipeline,       hsw),
                    ASM(start_pipeline,       avx),
                    ASM(start_pipeline,     sse41),
                    ASM(start_pipeline,      sse2),
                    ASM(start_pipeline,  hsw_lowp),
                    ASM(start_pipeline,sse41_lowp),
                    ASM(start_pipeline, sse2_lowp);

    StageFn ASM(just_return,       skx),
            ASM(just_return,       hsw),
            ASM(just_return,       avx),
            ASM(just_return,     sse41),
            ASM(just_return,      sse2),
            ASM(just_return,  hsw_lowp),
            ASM(just_return,sse41_lowp),
            ASM(just_return, sse2_lowp);

    #define M(st) StageFn ASM(st,       skx), \
                          ASM(st,       hsw), \
                          ASM(st,       avx), \
                          ASM(st,     sse41), \
                          ASM(st,      sse2), \
                          ASM(st,  hsw_lowp), \
                          ASM(st,sse41_lowp), \
                          ASM(st, sse2_lowp);
        SK_RASTER_PIPELINE_STAGES(M)
    #undef M

#elif defined(__i386__) || defined(_M_IX86)
    StartPipelineFn ASM(start_pipeline,sse2),
                    ASM(start_pipeline,sse2_lowp);
    StageFn ASM(just_return,sse2),
            ASM(just_return,sse2_lowp);
    #define M(st) StageFn ASM(st,sse2), \
                          ASM(st,sse2_lowp);
        SK_RASTER_PIPELINE_STAGES(M)
    #undef M

#endif

    // Baseline code compiled as a normal part of Skia.
    StartPipelineFn sk_start_pipeline;
    StageFn sk_just_return;
    #define M(st) StageFn sk_##st;
        SK_RASTER_PIPELINE_STAGES(M)
    #undef M

#if defined(JUMPER_HAS_NEON_LOWP)
    StartPipelineFn sk_start_pipeline_lowp;
    StageFn sk_just_return_lowp;
    #define M(st) StageFn sk_##st##_lowp;
        SK_RASTER_PIPELINE_STAGES(M)
    #undef M
#endif

}

#if SK_JUMPER_USE_ASSEMBLY
    #if defined(__x86_64__) || defined(_M_X64)
        template <SkRasterPipeline::StockStage st>
        static constexpr StageFn* hsw_lowp();

        template <SkRasterPipeline::StockStage st>
        static constexpr StageFn* sse41_lowp();

        template <SkRasterPipeline::StockStage st>
        static constexpr StageFn* sse2_lowp();

        #define LOWP(st) \
            template <> constexpr StageFn* hsw_lowp<SkRasterPipeline::st>() {   \
                return ASM(st,hsw_lowp);                                        \
            }                                                                   \
            template <> constexpr StageFn* sse41_lowp<SkRasterPipeline::st>() { \
                return ASM(st,sse41_lowp);                                      \
            }                                                                   \
            template <> constexpr StageFn* sse2_lowp<SkRasterPipeline::st>() {  \
                return ASM(st,sse2_lowp);                                       \
            }
        #define NOPE(st) \
            template <> constexpr StageFn* hsw_lowp<SkRasterPipeline::st>() {   \
                return nullptr;                                                 \
            }                                                                   \
            template <> constexpr StageFn* sse41_lowp<SkRasterPipeline::st>() { \
                return nullptr;                                                 \
            }                                                                   \
            template <> constexpr StageFn* sse2_lowp<SkRasterPipeline::st>() {  \
                return nullptr;                                                 \
            }

    #elif defined(__i386__) || defined(_M_IX86)
        template <SkRasterPipeline::StockStage st>
        static constexpr StageFn* sse2_lowp();

        #define LOWP(st) \
            template <> constexpr StageFn* sse2_lowp<SkRasterPipeline::st>() { \
                return ASM(st,sse2_lowp);                                      \
            }
        #define NOPE(st) \
            template <> constexpr StageFn* sse2_lowp<SkRasterPipeline::st>() { \
                return nullptr;                                                \
            }

    #elif defined(JUMPER_HAS_NEON_LOWP)
        template <SkRasterPipeline::StockStage st>
        static constexpr StageFn* neon_lowp();

        #define LOWP(st) \
            template <> constexpr StageFn* neon_lowp<SkRasterPipeline::st>() { \
                return sk_##st##_lowp;                                         \
            }
        #define NOPE(st) \
            template <> constexpr StageFn* neon_lowp<SkRasterPipeline::st>() { \
                return nullptr;                                                \
            }

    #else
        #define LOWP(st)
        #define NOPE(st)

    #endif

    #define TODO(st) NOPE(st)  // stages that should be implemented in lowp, but aren't.

    NOPE(callback)
    LOWP(move_src_dst) LOWP(move_dst_src)
    NOPE(clamp_0) NOPE(clamp_1) LOWP(clamp_a) LOWP(clamp_a_dst)
    NOPE(unpremul) LOWP(premul) LOWP(premul_dst)
    LOWP(force_opaque) LOWP(force_opaque_dst)
    LOWP(set_rgb) LOWP(swap_rb) LOWP(invert)
    NOPE(from_srgb) NOPE(from_srgb_dst) NOPE(to_srgb)
    LOWP(black_color) LOWP(white_color) LOWP(uniform_color)
    LOWP(seed_shader) NOPE(dither)
    LOWP(load_a8)   LOWP(load_a8_dst)   LOWP(store_a8)   LOWP(gather_a8)
    LOWP(load_g8)   LOWP(load_g8_dst)   LOWP(gather_g8)
    LOWP(load_565)  LOWP(load_565_dst)  LOWP(store_565)  LOWP(gather_565)
    LOWP(load_4444) LOWP(load_4444_dst) LOWP(store_4444) LOWP(gather_4444)
    NOPE(load_f16)  NOPE(load_f16_dst)  NOPE(store_f16)  NOPE(gather_f16)
    NOPE(load_f32)  NOPE(load_f32_dst)  NOPE(store_f32)
    LOWP(load_8888) LOWP(load_8888_dst) LOWP(store_8888) LOWP(gather_8888)
    LOWP(load_bgra) LOWP(load_bgra_dst) LOWP(store_bgra) LOWP(gather_bgra)
    NOPE(load_1010102) NOPE(load_1010102_dst) NOPE(store_1010102) NOPE(gather_1010102)
    TODO(bilerp_clamp_8888)
    TODO(load_u16_be) TODO(load_rgb_u16_be) TODO(store_u16_be)
    NOPE(load_tables_u16_be) NOPE(load_tables_rgb_u16_be) NOPE(load_tables)
    NOPE(load_rgba) NOPE(store_rgba)
    LOWP(scale_u8) LOWP(scale_565) LOWP(scale_1_float)
    LOWP( lerp_u8) LOWP( lerp_565) LOWP( lerp_1_float)
    LOWP(dstatop) LOWP(dstin) LOWP(dstout) LOWP(dstover)
    LOWP(srcatop) LOWP(srcin) LOWP(srcout) LOWP(srcover)
    LOWP(clear) LOWP(modulate) LOWP(multiply) LOWP(plus_) LOWP(screen) LOWP(xor_)
    NOPE(colorburn) NOPE(colordodge) LOWP(darken) LOWP(difference)
    LOWP(exclusion) LOWP(hardlight) LOWP(lighten) LOWP(overlay) NOPE(softlight)
    NOPE(hue) NOPE(saturation) NOPE(color) NOPE(luminosity)
    LOWP(srcover_rgba_8888) LOWP(srcover_bgra_8888)
    LOWP(luminance_to_alpha)
    LOWP(matrix_translate) LOWP(matrix_scale_translate)
    LOWP(matrix_2x3) NOPE(matrix_3x4) TODO(matrix_4x5) TODO(matrix_4x3)
    LOWP(matrix_perspective)
    NOPE(parametric_r) NOPE(parametric_g) NOPE(parametric_b)
    NOPE(parametric_a) NOPE(gamma) NOPE(gamma_dst)
    NOPE(table_r) NOPE(table_g) NOPE(table_b) NOPE(table_a)
    NOPE(lab_to_xyz)
    TODO(mirror_x) TODO(repeat_x)
    TODO(mirror_y) TODO(repeat_y)
    LOWP(decal_x) LOWP(decal_y) LOWP(decal_x_and_y)
    LOWP(check_decal_mask)
    TODO(bilinear_nx) TODO(bilinear_px) TODO(bilinear_ny) TODO(bilinear_py)
    TODO(bicubic_n3x) TODO(bicubic_n1x) TODO(bicubic_p1x) TODO(bicubic_p3x)
    TODO(bicubic_n3y) TODO(bicubic_n1y) TODO(bicubic_p1y) TODO(bicubic_p3y)
    TODO(save_xy) TODO(accumulate)
    LOWP(clamp_x_1) LOWP(mirror_x_1) LOWP(repeat_x_1)
    LOWP(evenly_spaced_gradient)
    LOWP(gradient)
    LOWP(evenly_spaced_2_stop_gradient)
    LOWP(xy_to_unit_angle)
    LOWP(xy_to_radius)
    TODO(negate_x)
    TODO(xy_to_2pt_conical_strip)
    TODO(xy_to_2pt_conical_focal_on_circle)
    TODO(xy_to_2pt_conical_well_behaved)
    TODO(xy_to_2pt_conical_greater)
    TODO(xy_to_2pt_conical_smaller)
    TODO(alter_2pt_conical_compensate_focal)
    TODO(alter_2pt_conical_unswap)
    TODO(mask_2pt_conical_nan) TODO(mask_2pt_conical_degenerates) TODO(apply_vector_mask)
    TODO(byte_tables) TODO(byte_tables_rgb)
    NOPE(rgb_to_hsl) NOPE(hsl_to_rgb)
    NOPE(clut_3D) NOPE(clut_4D)
    NOPE(gauss_a_to_rgba)

    #undef LOWP
    #undef TODO
    #undef NOPE
#endif

// Engines comprise everything we need to run SkRasterPipelines.
struct SkJumper_Engine {
    StageFn*         stages[kNumStages];
    StartPipelineFn* start_pipeline;
    StageFn*         just_return;
};

// We'll default to this baseline engine, but try to choose a better one at runtime.
static const SkJumper_Engine kBaseline = {
#define M(stage) sk_##stage,
    { SK_RASTER_PIPELINE_STAGES(M) },
#undef M
    sk_start_pipeline,
    sk_just_return,
};
static SkJumper_Engine gEngine = kBaseline;
static SkOnce gChooseEngineOnce;

static SkJumper_Engine choose_engine() {
#if !SK_JUMPER_USE_ASSEMBLY
    // We'll just run baseline code.

#elif defined(__x86_64__) || defined(_M_X64)
    #if !defined(_MSC_VER)  // No _skx stages for Windows yet.
        if (1 && SkCpu::Supports(SkCpu::SKX)) {
            return {
            #define M(stage) ASM(stage, skx),
                { SK_RASTER_PIPELINE_STAGES(M) },
                M(start_pipeline)
                M(just_return)
            #undef M
            };
        }
    #endif
    if (1 && SkCpu::Supports(SkCpu::HSW)) {
        return {
        #define M(stage) ASM(stage, hsw),
            { SK_RASTER_PIPELINE_STAGES(M) },
            M(start_pipeline)
            M(just_return)
        #undef M
        };
    }
    if (1 && SkCpu::Supports(SkCpu::AVX)) {
        return {
        #define M(stage) ASM(stage, avx),
            { SK_RASTER_PIPELINE_STAGES(M) },
            M(start_pipeline)
            M(just_return)
        #undef M
        };
    }
    if (1 && SkCpu::Supports(SkCpu::SSE41)) {
        return {
        #define M(stage) ASM(stage, sse41),
            { SK_RASTER_PIPELINE_STAGES(M) },
            M(start_pipeline)
            M(just_return)
        #undef M
        };
    }
    if (1 && SkCpu::Supports(SkCpu::SSE2)) {
        return {
        #define M(stage) ASM(stage, sse2),
            { SK_RASTER_PIPELINE_STAGES(M) },
            M(start_pipeline)
            M(just_return)
        #undef M
        };
    }

#elif defined(__i386__) || defined(_M_IX86)
    if (1 && SkCpu::Supports(SkCpu::SSE2)) {
        return {
        #define M(stage) ASM(stage, sse2),
            { SK_RASTER_PIPELINE_STAGES(M) },
            M(start_pipeline)
            M(just_return)
        #undef M
        };
    }

#endif
    return kBaseline;
}

#ifndef SK_JUMPER_DISABLE_8BIT
    static const SkJumper_Engine kNone = {
    #define M(stage) nullptr,
        { SK_RASTER_PIPELINE_STAGES(M) },
    #undef M
        nullptr,
        nullptr,
    };
    static SkJumper_Engine gLowp = kNone;
    static SkOnce gChooseLowpOnce;

    static SkJumper_Engine choose_lowp() {
    #if SK_JUMPER_USE_ASSEMBLY
        #if defined(__x86_64__) || defined(_M_X64)
            if (1 && SkCpu::Supports(SkCpu::HSW)) {
                return {
                #define M(st) hsw_lowp<SkRasterPipeline::st>(),
                    { SK_RASTER_PIPELINE_STAGES(M) },
                    ASM(start_pipeline,hsw_lowp),
                    ASM(just_return   ,hsw_lowp),
                #undef M
                };
            }
            if (1 && SkCpu::Supports(SkCpu::SSE41)) {
                return {
                #define M(st) sse41_lowp<SkRasterPipeline::st>(),
                    { SK_RASTER_PIPELINE_STAGES(M) },
                    ASM(start_pipeline,sse41_lowp),
                    ASM(just_return   ,sse41_lowp),
                #undef M
                };
            }
            if (1 && SkCpu::Supports(SkCpu::SSE2)) {
                return {
                #define M(st) sse2_lowp<SkRasterPipeline::st>(),
                    { SK_RASTER_PIPELINE_STAGES(M) },
                    ASM(start_pipeline,sse2_lowp),
                    ASM(just_return   ,sse2_lowp),
                #undef M
                };
            }
        #elif defined(__i386__) || defined(_M_IX86)
            if (1 && SkCpu::Supports(SkCpu::SSE2)) {
                return {
                #define M(st) sse2_lowp<SkRasterPipeline::st>(),
                    { SK_RASTER_PIPELINE_STAGES(M) },
                    ASM(start_pipeline,sse2_lowp),
                    ASM(just_return   ,sse2_lowp),
                #undef M
                };
            }

        #elif defined(JUMPER_HAS_NEON_LOWP)
            return {
            #define M(st) neon_lowp<SkRasterPipeline::st>(),
                { SK_RASTER_PIPELINE_STAGES(M) },
                sk_start_pipeline_lowp,
                sk_just_return_lowp,
            #undef M
            };
        #endif
    #endif
        return kNone;
    }
#endif

const SkJumper_Engine& SkRasterPipeline::build_pipeline(void** ip) const {
#ifndef SK_JUMPER_DISABLE_8BIT
    gChooseLowpOnce([]{ gLowp = choose_lowp(); });

    // First try to build a lowp pipeline.  If that fails, fall back to normal float gEngine.
    void** reset_point = ip;
    *--ip = (void*)gLowp.just_return;
    for (const StageList* st = fStages; st; st = st->prev) {
        if (st->stage == SkRasterPipeline::clamp_0 ||
            st->stage == SkRasterPipeline::clamp_1) {
            continue;  // No-ops in lowp.
        }
        if (auto fn = SkOpts::stages_lowp[st->stage]) {
        if (StageFn* fn = gLowp.stages[st->stage]) {
            if (st->ctx) {
                *--ip = st->ctx;
            }
            *--ip = (void*)fn;
        } else {
            log_missing(st->stage);
            ip = reset_point;
            break;
        }
    }
    if (ip != reset_point) {
        return SkOpts::start_pipeline_lowp;
        return gLowp;
    }
#endif

    *--ip = (void*)SkOpts::just_return_highp;
    gChooseEngineOnce([]{ gEngine = choose_engine(); });
    // We're building the pipeline backwards, so we start with the final stage just_return.
    *--ip = (void*)gEngine.just_return;

    // Still going backwards, each stage's context pointer then its StageFn.
    for (const StageList* st = fStages; st; st = st->prev) {
        if (st->ctx) {
            *--ip = st->ctx;
        }
        *--ip = (void*)SkOpts::stages_highp[st->stage];
        *--ip = (void*)gEngine.stages[st->stage];
    }
    return SkOpts::start_pipeline_highp;
    return gEngine;
}
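// Editor's sketch (not part of the original diff): for a pipeline appended as
// load_8888 -> swap_rb -> store_8888, the slots written above read forward as
//   { &load_8888, load_ctx, &swap_rb, &store_8888, store_ctx, &just_return },
// i.e. each stage's function pointer immediately followed by its context pointer
// when it has one, with just_return as the terminator the last stage chains into.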

void SkRasterPipeline::run(size_t x, size_t y, size_t w, size_t h) const {

@@ -55,8 +477,8 @@ void SkRasterPipeline::run(size_t x, size_t y, size_t w, size_t h) const {
    // Best to not use fAlloc here... we can't bound how often run() will be called.
    SkAutoSTMalloc<64, void*> program(fSlotsNeeded);

    auto start_pipeline = this->build_pipeline(program.get() + fSlotsNeeded);
    start_pipeline(x,y,x+w,y+h, program.get());
    const SkJumper_Engine& engine = this->build_pipeline(program.get() + fSlotsNeeded);
    engine.start_pipeline(x,y,x+w,y+h, program.get());
}

std::function<void(size_t, size_t, size_t, size_t)> SkRasterPipeline::compile() const {

@@ -65,8 +487,9 @@ std::function<void(size_t, size_t, size_t, size_t)> SkRasterPipeline::compile()
    }

    void** program = fAlloc->makeArray<void*>(fSlotsNeeded);
    const SkJumper_Engine& engine = this->build_pipeline(program + fSlotsNeeded);

    auto start_pipeline = this->build_pipeline(program + fSlotsNeeded);
    auto start_pipeline = engine.start_pipeline;
    return [=](size_t x, size_t y, size_t w, size_t h) {
        start_pipeline(x,y,x+w,y+h, program);
    };
@@ -11,11 +11,34 @@
 #include <stddef.h>
 #include <stdint.h>

// This file contains definitions shared by SkJumper.cpp/SkJumper_stages.cpp
// and the rest of Skia.  It is important to keep the interface to SkJumper
// limited and simple to avoid serious ODR violation pitfalls, especially when
// using Microsoft's <math.h> and similar headers with inline-but-not-static
// function definitions.
// This file contains definitions shared by SkJumper.cpp (compiled normally as part of Skia)
// and SkJumper_stages.cpp (compiled into Skia _and_ offline into SkJumper_generated.h).
// Keep it simple!

// Externally facing functions (start_pipeline) are called a little specially on Windows.
#if defined(JUMPER_IS_OFFLINE) && defined(WIN) && defined(__x86_64__)
    #define MAYBE_MSABI __attribute__((ms_abi))                   // Use MS' ABI, not System V.
#elif defined(JUMPER_IS_OFFLINE) && defined(WIN) && defined(__i386__)
    #define MAYBE_MSABI __attribute__((force_align_arg_pointer))  // Re-align stack 4 -> 16 bytes.
#else
    #define MAYBE_MSABI
#endif

// Any custom ABI to use for all non-externally-facing stage functions.
#if defined(__ARM_NEON) && defined(__arm__)
    // This lets us pass vectors more efficiently on 32-bit ARM.
    #define ABI __attribute__((pcs("aapcs-vfp")))
#else
    #define ABI
#endif

// On ARM we expect that you're using Clang if you want SkJumper to be fast.
// If you are, the baseline float stages will use NEON, and lowp stages will
// also be available.  (If somehow you're building for ARM not using Clang,
// you'll get scalar baseline stages and no lowp support.)
#if defined(__clang__) && defined(__ARM_NEON)
    #define JUMPER_HAS_NEON_LOWP
#endif

static const int SkJumper_kMaxStride = 16;

@@ -53,7 +76,7 @@ struct SkJumper_DecalTileCtx {
};

struct SkJumper_CallbackCtx {
    void (*fn)(SkJumper_CallbackCtx* self, int active_pixels/*<= SkJumper_kMaxStride*/);
    MAYBE_MSABI void (*fn)(SkJumper_CallbackCtx* self, int active_pixels/*<= SkJumper_kMaxStride*/);

    // When called, fn() will have our active pixels available in rgba.
    // When fn() returns, the pipeline will read back those active pixels from read_from.
83545  src/jumper/SkJumper_generated.S  (new file; diff suppressed because it is too large)
73478  src/jumper/SkJumper_generated_win.S  (new file; diff suppressed because it is too large)
File diff suppressed because it is too large
961  src/jumper/SkJumper_stages_lowp.cpp  (new file)
@@ -0,0 +1,961 @@
/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// This restricted SkJumper backend works on 8-bit per channel pixels stored in
// 16-bit channels.  This is a last attempt to write a performant low-precision
// backend with stage definitions that can be shared by x86 and ARM.

#include "SkJumper.h"
#include "SkJumper_misc.h"

#if defined(__clang__)  // This file is empty when not compiled by Clang.

#if defined(__ARM_NEON)
    #include <arm_neon.h>
#elif defined(__SSE2__)
    #include <immintrin.h>
#else
    #include <math.h>
#endif

#if !defined(JUMPER_IS_OFFLINE)
    #define WRAP(name) sk_##name##_lowp
#elif defined(__AVX2__)
    #define WRAP(name) sk_##name##_hsw_lowp
#elif defined(__SSE4_1__)
    #define WRAP(name) sk_##name##_sse41_lowp
#elif defined(__SSE2__)
    #define WRAP(name) sk_##name##_sse2_lowp
#endif

#if defined(__AVX2__)
    using U8  = uint8_t  __attribute__((ext_vector_type(16)));
    using U16 = uint16_t __attribute__((ext_vector_type(16)));
    using I16 = int16_t  __attribute__((ext_vector_type(16)));
    using I32 = int32_t  __attribute__((ext_vector_type(16)));
    using U32 = uint32_t __attribute__((ext_vector_type(16)));
    using F   = float    __attribute__((ext_vector_type(16)));
#else
    using U8  = uint8_t  __attribute__((ext_vector_type(8)));
    using U16 = uint16_t __attribute__((ext_vector_type(8)));
    using I16 = int16_t  __attribute__((ext_vector_type(8)));
    using I32 = int32_t  __attribute__((ext_vector_type(8)));
    using U32 = uint32_t __attribute__((ext_vector_type(8)));
    using F   = float    __attribute__((ext_vector_type(8)));
#endif

static const size_t N = sizeof(U16) / sizeof(uint16_t);

// We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
                          U16  r, U16  g, U16  b, U16  a,
                          U16 dr, U16 dg, U16 db, U16 da);

extern "C" MAYBE_MSABI void WRAP(start_pipeline)(const size_t x0,
                                                 const size_t y0,
                                                 const size_t xlimit,
                                                 const size_t ylimit,
                                                 void** program) {
    auto start = (Stage)load_and_inc(program);
    for (size_t dy = y0; dy < ylimit; dy++) {
        size_t dx = x0;
        for (; dx + N <= xlimit; dx += N) {
            start(   0,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
        if (size_t tail = xlimit - dx) {
            start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
    }
}

extern "C" ABI void WRAP(just_return)(size_t,void**,size_t,size_t,
                                      U16,U16,U16,U16, U16,U16,U16,U16) {}

// All stages use the same function call ABI to chain into each other, but there are three types:
//   GG: geometry in, geometry out  -- think, a matrix
//   GP: geometry in, pixels out.   -- think, a memory gather
//   PP: pixels in, pixels out.     -- think, a blend mode
//
// (Some stages ignore their inputs or produce no logical output.  That's perfectly fine.)
//
// These three STAGE_ macros let you define each type of stage,
// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.

#define STAGE_GG(name, ...) \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
    extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
                                   U16  r, U16  g, U16  b, U16  a, \
                                   U16 dr, U16 dg, U16 db, U16 da) { \
        auto x = join<F>(r,g), \
             y = join<F>(b,a); \
        name##_k(Ctx{program}, dx,dy,tail, x,y); \
        split(x, &r,&g); \
        split(y, &b,&a); \
        auto next = (Stage)load_and_inc(program); \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
    } \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)

#define STAGE_GP(name, ...) \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
                     U16&  r, U16&  g, U16&  b, U16&  a, \
                     U16& dr, U16& dg, U16& db, U16& da); \
    extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
                                   U16  r, U16  g, U16  b, U16  a, \
                                   U16 dr, U16 dg, U16 db, U16 da) { \
        auto x = join<F>(r,g), \
             y = join<F>(b,a); \
        name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \
        auto next = (Stage)load_and_inc(program); \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
    } \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
                     U16&  r, U16&  g, U16&  b, U16&  a, \
                     U16& dr, U16& dg, U16& db, U16& da)

#define STAGE_PP(name, ...) \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
                     U16&  r, U16&  g, U16&  b, U16&  a, \
                     U16& dr, U16& dg, U16& db, U16& da); \
    extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
                                   U16  r, U16  g, U16  b, U16  a, \
                                   U16 dr, U16 dg, U16 db, U16 da) { \
        name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \
        auto next = (Stage)load_and_inc(program); \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
    } \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
                     U16&  r, U16&  g, U16&  b, U16&  a, \
                     U16& dr, U16& dg, U16& db, U16& da)

// ~~~~~~ Commonly used helper functions ~~~~~~ //

SI U16 div255(U16 v) {
#if 0
    return (v+127)/255;  // The ideal rounding divide by 255.
#else
    return (v+255)/256;  // A good approximation of (v+127)/255.
#endif
}
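// Editor's note (worked check, not from the original source): for v = 128*255 = 32640,
// the exact form gives (32640+127)/255 = 128 and the approximation (32640+255)/256 = 128;
// over the whole 16-bit product range the two differ by at most 1 (e.g. v = 65024 rounds
// to 255 exactly but 254 approximately), a tradeoff taken because dividing by 256 is a shift.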
|
||||
|
||||
SI U16 inv(U16 v) { return 255-v; }
|
||||
|
||||
SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); }
|
||||
SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); }
|
||||
|
||||
SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); }
|
||||
SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); }
|
||||
SI U16 max(U16 x, U16 y, U16 z) { return max(x, max(y, z)); }
|
||||
SI U16 min(U16 x, U16 y, U16 z) { return min(x, min(y, z)); }
|
||||
|
||||
SI U16 from_float(float f) { return f * 255.0f + 0.5f; }
|
||||
|
||||
SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); }
|
||||
|
||||
template <typename D, typename S>
|
||||
SI D cast(S src) {
|
||||
return __builtin_convertvector(src, D);
|
||||
}
|
||||
|
||||
template <typename D, typename S>
|
||||
SI void split(S v, D* lo, D* hi) {
|
||||
static_assert(2*sizeof(D) == sizeof(S), "");
|
||||
memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
|
||||
memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
|
||||
}
|
||||
template <typename D, typename S>
|
||||
SI D join(S lo, S hi) {
|
||||
static_assert(sizeof(D) == 2*sizeof(S), "");
|
||||
D v;
|
||||
memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
|
||||
memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
|
||||
return v;
|
||||
}
|
||||
template <typename V, typename H>
|
||||
SI V map(V v, H (*fn)(H)) {
|
||||
H lo,hi;
|
||||
split(v, &lo,&hi);
|
||||
lo = fn(lo);
|
||||
hi = fn(hi);
|
||||
return join<V>(lo,hi);
|
||||
}
|
||||
|
||||
// TODO: do we need platform-specific intrinsics for any of these?
|
||||
SI F if_then_else(I32 c, F t, F e) {
|
||||
return bit_cast<F>( (bit_cast<I32>(t) & c) | (bit_cast<I32>(e) & ~c) );
|
||||
}
|
||||
SI F max(F x, F y) { return if_then_else(x < y, y, x); }
|
||||
SI F min(F x, F y) { return if_then_else(x < y, x, y); }
|
||||
|
||||
SI F mad(F f, F m, F a) { return f*m+a; }
|
||||
SI U32 trunc_(F x) { return (U32)cast<I32>(x); }
|
||||
|
||||
SI F rcp(F x) {
|
||||
#if defined(__AVX2__)
|
||||
return map(x, _mm256_rcp_ps);
|
||||
#elif defined(__SSE__)
|
||||
return map(x, _mm_rcp_ps);
|
||||
#elif defined(__ARM_NEON)
|
||||
return map(x, +[](float32x4_t v) {
|
||||
auto est = vrecpeq_f32(v);
|
||||
return vrecpsq_f32(v,est)*est;
|
||||
});
|
||||
#else
|
||||
return 1.0f / x;
|
||||
#endif
|
||||
}
|
||||
SI F sqrt_(F x) {
|
||||
#if defined(__AVX2__)
|
||||
return map(x, _mm256_sqrt_ps);
|
||||
#elif defined(__SSE__)
|
||||
return map(x, _mm_sqrt_ps);
|
||||
#elif defined(__aarch64__)
|
||||
return map(x, vsqrtq_f32);
|
||||
#elif defined(__ARM_NEON)
|
||||
return map(x, +[](float32x4_t v) {
|
||||
auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v).
|
||||
est *= vrsqrtsq_f32(v,est*est);
|
||||
est *= vrsqrtsq_f32(v,est*est);
|
||||
return v*est; // sqrt(v) == v*rsqrt(v).
|
||||
});
|
||||
#else
|
||||
return F{
|
||||
sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]),
|
||||
sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]),
|
||||
};
|
||||
#endif
|
||||
}
|
||||
|
||||
SI F floor_(F x) {
|
||||
#if defined(__aarch64__)
|
||||
return map(x, vrndmq_f32);
|
||||
#elif defined(__AVX2__)
|
||||
return map(x, +[](__m256 v){ return _mm256_floor_ps(v); }); // _mm256_floor_ps is a macro...
|
||||
#elif defined(__SSE4_1__)
|
||||
return map(x, +[](__m128 v){ return _mm_floor_ps(v); }); // _mm_floor_ps() is a macro too.
|
||||
#else
|
||||
F roundtrip = cast<F>(cast<I32>(x));
|
||||
return roundtrip - if_then_else(roundtrip > x, F(1), F(0));
|
||||
#endif
|
||||
}
|
||||
SI F abs_(F x) { return bit_cast<F>( bit_cast<I32>(x) & 0x7fffffff ); }
|
||||
|
||||
// ~~~~~~ Basic / misc. stages ~~~~~~ //
|
||||
|
||||
STAGE_GG(seed_shader, const float* iota) {
|
||||
x = cast<F>(I32(dx)) + unaligned_load<F>(iota);
|
||||
y = cast<F>(I32(dy)) + 0.5f;
|
||||
}
|
||||
|
||||
STAGE_GG(matrix_translate, const float* m) {
|
||||
x += m[0];
|
||||
y += m[1];
|
||||
}
|
||||
STAGE_GG(matrix_scale_translate, const float* m) {
|
||||
x = mad(x,m[0], m[2]);
|
||||
y = mad(y,m[1], m[3]);
|
||||
}
|
||||
STAGE_GG(matrix_2x3, const float* m) {
|
||||
auto X = mad(x,m[0], mad(y,m[2], m[4])),
|
||||
Y = mad(x,m[1], mad(y,m[3], m[5]));
|
||||
x = X;
|
||||
y = Y;
|
||||
}
|
||||
STAGE_GG(matrix_perspective, const float* m) {
|
||||
// N.B. Unlike the other matrix_ stages, this matrix is row-major.
|
||||
auto X = mad(x,m[0], mad(y,m[1], m[2])),
|
||||
Y = mad(x,m[3], mad(y,m[4], m[5])),
|
||||
Z = mad(x,m[6], mad(y,m[7], m[8]));
|
||||
x = X * rcp(Z);
|
||||
y = Y * rcp(Z);
|
||||
}
|
||||
|
||||
STAGE_PP(uniform_color, const SkJumper_UniformColorCtx* c) {
|
||||
r = c->rgba[0];
|
||||
g = c->rgba[1];
|
||||
b = c->rgba[2];
|
||||
a = c->rgba[3];
|
||||
}
|
||||
STAGE_PP(black_color, Ctx::None) { r = g = b = 0; a = 255; }
|
||||
STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; }
|
||||
|
||||
STAGE_PP(set_rgb, const float rgb[3]) {
|
||||
r = from_float(rgb[0]);
|
||||
g = from_float(rgb[1]);
|
||||
b = from_float(rgb[2]);
|
||||
}
|
||||
|
||||
STAGE_PP(clamp_a, Ctx::None) {
|
||||
r = min(r, a);
|
||||
g = min(g, a);
|
||||
b = min(b, a);
|
||||
}
|
||||
STAGE_PP(clamp_a_dst, Ctx::None) {
|
||||
dr = min(dr, da);
|
||||
dg = min(dg, da);
|
||||
db = min(db, da);
|
||||
}
|
||||
|
||||
STAGE_PP(premul, Ctx::None) {
|
||||
r = div255(r * a);
|
||||
g = div255(g * a);
|
||||
b = div255(b * a);
|
||||
}
|
||||
STAGE_PP(premul_dst, Ctx::None) {
|
||||
dr = div255(dr * da);
|
||||
dg = div255(dg * da);
|
||||
db = div255(db * da);
|
||||
}
|
||||
|
||||
STAGE_PP(force_opaque , Ctx::None) { a = 255; }
|
||||
STAGE_PP(force_opaque_dst, Ctx::None) { da = 255; }
|
||||
|
||||
STAGE_PP(swap_rb, Ctx::None) {
|
||||
auto tmp = r;
|
||||
r = b;
|
||||
b = tmp;
|
||||
}
|
||||
|
||||
STAGE_PP(move_src_dst, Ctx::None) {
|
||||
dr = r;
|
||||
dg = g;
|
||||
db = b;
|
||||
da = a;
|
||||
}
|
||||
|
||||
STAGE_PP(move_dst_src, Ctx::None) {
|
||||
r = dr;
|
||||
g = dg;
|
||||
b = db;
|
||||
a = da;
|
||||
}
|
||||
|
||||
STAGE_PP(invert, Ctx::None) {
|
||||
r = inv(r);
|
||||
g = inv(g);
|
||||
b = inv(b);
|
||||
a = inv(a);
|
||||
}
|
||||
|
||||
// ~~~~~~ Blend modes ~~~~~~ //
|
||||
|
||||
// The same logic applied to all 4 channels.
|
||||
#define BLEND_MODE(name) \
|
||||
SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
|
||||
STAGE_PP(name, Ctx::None) { \
|
||||
r = name##_channel(r,dr,a,da); \
|
||||
g = name##_channel(g,dg,a,da); \
|
||||
b = name##_channel(b,db,a,da); \
|
||||
a = name##_channel(a,da,a,da); \
|
||||
} \
|
||||
SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
|
||||
|
||||
BLEND_MODE(clear) { return 0; }
|
||||
BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); }
|
||||
BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); }
|
||||
BLEND_MODE(srcin) { return div255( s*da ); }
|
||||
BLEND_MODE(dstin) { return div255( d*sa ); }
|
||||
BLEND_MODE(srcout) { return div255( s*inv(da) ); }
|
||||
BLEND_MODE(dstout) { return div255( d*inv(sa) ); }
|
||||
BLEND_MODE(srcover) { return s + div255( d*inv(sa) ); }
|
||||
BLEND_MODE(dstover) { return d + div255( s*inv(da) ); }
|
||||
BLEND_MODE(modulate) { return div255( s*d ); }
|
||||
BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
|
||||
BLEND_MODE(plus_) { return min(s+d, 255); }
|
||||
BLEND_MODE(screen) { return s + d - div255( s*d ); }
|
||||
BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); }
|
||||
#undef BLEND_MODE
|
||||
|
||||
// The same logic applied to color, and srcover for alpha.
|
||||
#define BLEND_MODE(name) \
|
||||
SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
|
||||
STAGE_PP(name, Ctx::None) { \
|
||||
r = name##_channel(r,dr,a,da); \
|
||||
g = name##_channel(g,dg,a,da); \
|
||||
b = name##_channel(b,db,a,da); \
|
||||
a = a + div255( da*inv(a) ); \
|
||||
} \
|
||||
SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
|
||||
|
||||
BLEND_MODE(darken) { return s + d - div255( max(s*da, d*sa) ); }
|
||||
BLEND_MODE(lighten) { return s + d - div255( min(s*da, d*sa) ); }
|
||||
BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); }
|
||||
BLEND_MODE(exclusion) { return s + d - 2*div255( s*d ); }
|
||||
|
||||
BLEND_MODE(hardlight) {
|
||||
return div255( s*inv(da) + d*inv(sa) +
|
||||
if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
|
||||
}
|
||||
BLEND_MODE(overlay) {
|
||||
return div255( s*inv(da) + d*inv(sa) +
|
||||
if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
|
||||
}
|
||||
#undef BLEND_MODE
|
||||
|
||||
// ~~~~~~ Helpers for interacting with memory ~~~~~~ //
|
||||
|
||||
template <typename T>
|
||||
SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, size_t dx, size_t dy) {
|
||||
return (T*)ctx->pixels + dy*ctx->stride + dx;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
|
||||
auto clamp = [](F v, F limit) {
|
||||
limit = bit_cast<F>( bit_cast<U32>(limit) - 1 ); // Exclusive -> inclusive.
|
||||
return min(max(0, v), limit);
|
||||
};
|
||||
x = clamp(x, ctx->width);
|
||||
y = clamp(y, ctx->height);
|
||||
|
||||
*ptr = (const T*)ctx->pixels;
|
||||
return trunc_(y)*ctx->stride + trunc_(x);
|
||||
}
|
||||
|
||||
template <typename V, typename T>
|
||||
SI V load(const T* ptr, size_t tail) {
|
||||
V v = 0;
|
||||
switch (tail & (N-1)) {
|
||||
case 0: memcpy(&v, ptr, sizeof(v)); break;
|
||||
#if defined(__AVX2__)
|
||||
case 15: v[14] = ptr[14];
|
||||
case 14: v[13] = ptr[13];
|
||||
case 13: v[12] = ptr[12];
|
||||
case 12: memcpy(&v, ptr, 12*sizeof(T)); break;
|
||||
case 11: v[10] = ptr[10];
|
||||
case 10: v[ 9] = ptr[ 9];
|
||||
case 9: v[ 8] = ptr[ 8];
|
||||
case 8: memcpy(&v, ptr, 8*sizeof(T)); break;
|
||||
#endif
|
||||
case 7: v[ 6] = ptr[ 6];
|
||||
case 6: v[ 5] = ptr[ 5];
|
||||
case 5: v[ 4] = ptr[ 4];
|
||||
case 4: memcpy(&v, ptr, 4*sizeof(T)); break;
|
||||
case 3: v[ 2] = ptr[ 2];
|
||||
case 2: memcpy(&v, ptr, 2*sizeof(T)); break;
|
||||
case 1: v[ 0] = ptr[ 0];
|
||||
}
|
||||
return v;
|
||||
}
|
||||
template <typename V, typename T>
|
||||
SI void store(T* ptr, size_t tail, V v) {
|
||||
switch (tail & (N-1)) {
|
||||
case 0: memcpy(ptr, &v, sizeof(v)); break;
|
||||
#if defined(__AVX2__)
|
||||
case 15: ptr[14] = v[14];
|
||||
case 14: ptr[13] = v[13];
|
||||
case 13: ptr[12] = v[12];
|
||||
case 12: memcpy(ptr, &v, 12*sizeof(T)); break;
|
||||
case 11: ptr[10] = v[10];
|
||||
case 10: ptr[ 9] = v[ 9];
|
||||
case 9: ptr[ 8] = v[ 8];
|
||||
case 8: memcpy(ptr, &v, 8*sizeof(T)); break;
|
||||
#endif
|
||||
case 7: ptr[ 6] = v[ 6];
|
||||
case 6: ptr[ 5] = v[ 5];
|
||||
case 5: ptr[ 4] = v[ 4];
|
||||
case 4: memcpy(ptr, &v, 4*sizeof(T)); break;
|
||||
case 3: ptr[ 2] = v[ 2];
|
||||
case 2: memcpy(ptr, &v, 2*sizeof(T)); break;
|
||||
case 1: ptr[ 0] = v[ 0];
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
template <typename V, typename T>
|
||||
SI V gather(const T* ptr, U32 ix) {
|
||||
return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
|
||||
ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
|
||||
ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
|
||||
ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
|
||||
}
|
||||
|
||||
template<>
|
||||
F gather(const float* p, U32 ix) {
|
||||
__m256i lo, hi;
|
||||
split(ix, &lo, &hi);
|
||||
|
||||
return join<F>(_mm256_i32gather_ps(p, lo, 4),
|
||||
_mm256_i32gather_ps(p, hi, 4));
|
||||
}
|
||||
|
||||
template<>
|
||||
U32 gather(const uint32_t* p, U32 ix) {
|
||||
__m256i lo, hi;
|
||||
split(ix, &lo, &hi);
|
||||
|
||||
return join<U32>(_mm256_i32gather_epi32(p, lo, 4),
|
||||
_mm256_i32gather_epi32(p, hi, 4));
|
||||
}
|
||||
#else
|
||||
template <typename V, typename T>
|
||||
SI V gather(const T* ptr, U32 ix) {
|
||||
return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
|
||||
ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //
|
||||
|
||||
SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
|
||||
#if 1 && defined(__AVX2__)
|
||||
// Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
|
||||
__m256i _01,_23;
|
||||
split(rgba, &_01, &_23);
|
||||
__m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
|
||||
_13 = _mm256_permute2x128_si256(_01,_23, 0x31);
|
||||
rgba = join<U32>(_02, _13);
|
||||
|
||||
auto cast_U16 = [](U32 v) -> U16 {
|
||||
__m256i _02,_13;
|
||||
split(v, &_02,&_13);
|
||||
return _mm256_packus_epi32(_02,_13);
|
||||
};
|
||||
#else
|
||||
auto cast_U16 = [](U32 v) -> U16 {
|
||||
return cast<U16>(v);
|
||||
};
|
||||
#endif
|
||||
*r = cast_U16(rgba & 65535) & 255;
|
||||
*g = cast_U16(rgba & 65535) >> 8;
|
||||
*b = cast_U16(rgba >> 16) & 255;
|
||||
*a = cast_U16(rgba >> 16) >> 8;
|
||||
}
|
||||
|
||||
SI void load_8888(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
|
||||
#if 1 && defined(__ARM_NEON)
|
||||
uint8x8x4_t rgba;
|
||||
switch (tail & (N-1)) {
|
||||
case 0: rgba = vld4_u8 ((const uint8_t*)(ptr+0) ); break;
|
||||
case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6);
|
||||
case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5);
|
||||
case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4);
|
||||
case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3);
|
||||
case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2);
|
||||
case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1);
|
||||
case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0);
|
||||
}
|
||||
*r = cast<U16>(rgba.val[0]);
|
||||
*g = cast<U16>(rgba.val[1]);
|
||||
*b = cast<U16>(rgba.val[2]);
|
||||
*a = cast<U16>(rgba.val[3]);
|
||||
#else
|
||||
from_8888(load<U32>(ptr, tail), r,g,b,a);
|
||||
#endif
|
||||
}
SI void store_8888(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
#if 1 && defined(__ARM_NEON)
    uint8x8x4_t rgba = {{
        cast<U8>(r),
        cast<U8>(g),
        cast<U8>(b),
        cast<U8>(a),
    }};
    switch (tail & (N-1)) {
        case 0: vst4_u8     ((uint8_t*)(ptr+0), rgba   ); break;
        case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6);
        case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5);
        case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4);
        case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3);
        case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2);
        case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1);
        case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0);
    }
#else
    store(ptr, tail, cast<U32>(r | (g<<8)) <<  0
                   | cast<U32>(b | (a<<8)) << 16);
#endif
}

STAGE_PP(load_8888, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_8888_dst, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_8888, const SkJumper_MemoryCtx* ctx) {
    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
}

STAGE_PP(load_bgra, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &b,&g,&r,&a);
}
STAGE_PP(load_bgra_dst, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &db,&dg,&dr,&da);
}
STAGE_PP(store_bgra, const SkJumper_MemoryCtx* ctx) {
    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, b,g,r,a);
}

STAGE_GP(gather_8888, const SkJumper_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
}
STAGE_GP(gather_bgra, const SkJumper_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &b, &g, &r, &a);
}

// ~~~~~~ 16-bit memory loads and stores ~~~~~~ //

SI void from_565(U16 rgb, U16* r, U16* g, U16* b) {
    // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0
    U16 R = (rgb >> 11) & 31,
        G = (rgb >>  5) & 63,
        B = (rgb >>  0) & 31;

    // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit.
    *r = (R << 3) | (R >> 2);
    *g = (G << 2) | (G >> 4);
    *b = (B << 3) | (B >> 2);
}
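// Worked example of the bit replication above: a 5-bit red of 16 (0b10000)
// becomes (16 << 3) | (16 >> 2) == 128 | 4 == 132, while exact scaling gives
// 16 * 255/31 ~= 131.6 -- a cheap, close approximation that still maps
// 0 -> 0 and 31 -> 255 exactly.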
SI void load_565(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
    from_565(load<U16>(ptr, tail), r,g,b);
}
SI void store_565(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
    // Select the top 5,6,5 bits.
    U16 R = r >> 3,
        G = g >> 2,
        B = b >> 3;
    // Pack them back into 15|rrrrr gggggg bbbbb|0.
    store(ptr, tail, R << 11
                   | G <<  5
                   | B <<  0);
}

STAGE_PP(load_565, const SkJumper_MemoryCtx* ctx) {
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
    a = 255;
}
STAGE_PP(load_565_dst, const SkJumper_MemoryCtx* ctx) {
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
    da = 255;
}
STAGE_PP(store_565, const SkJumper_MemoryCtx* ctx) {
    store_565(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
}
STAGE_GP(gather_565, const SkJumper_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_565(gather<U16>(ptr, ix), &r, &g, &b);
    a = 255;
}

SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) {
    // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0.
    U16 R = (rgba >> 12) & 15,
        G = (rgba >>  8) & 15,
        B = (rgba >>  4) & 15,
        A = (rgba >>  0) & 15;

    // Scale [0,15] to [0,255].
    *r = (R << 4) | R;
    *g = (G << 4) | G;
    *b = (B << 4) | B;
    *a = (A << 4) | A;
}
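// Worked example: (A << 4) | A is exact 4-bit to 8-bit scaling, i.e.
// A * 255/15 == A * 17: 0 -> 0, 6 -> 102, 15 -> 255.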
SI void load_4444(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
    from_4444(load<U16>(ptr, tail), r,g,b,a);
}
SI void store_4444(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
    // Select the top 4 bits of each.
    U16 R = r >> 4,
        G = g >> 4,
        B = b >> 4,
        A = a >> 4;
    // Pack them back into 15|rrrr gggg bbbb aaaa|0.
    store(ptr, tail, R << 12
                   | G <<  8
                   | B <<  4
                   | A <<  0);
}

STAGE_PP(load_4444, const SkJumper_MemoryCtx* ctx) {
    load_4444(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_4444_dst, const SkJumper_MemoryCtx* ctx) {
    load_4444(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_4444, const SkJumper_MemoryCtx* ctx) {
    store_4444(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a);
}
STAGE_GP(gather_4444, const SkJumper_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
}

// ~~~~~~ 8-bit memory loads and stores ~~~~~~ //

SI U16 load_8(const uint8_t* ptr, size_t tail) {
    return cast<U16>(load<U8>(ptr, tail));
}
SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
    store(ptr, tail, cast<U8>(v));
}

STAGE_PP(load_a8, const SkJumper_MemoryCtx* ctx) {
    r = g = b = 0;
    a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(load_a8_dst, const SkJumper_MemoryCtx* ctx) {
    dr = dg = db = 0;
    da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(store_a8, const SkJumper_MemoryCtx* ctx) {
    store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
}
STAGE_GP(gather_a8, const SkJumper_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = 0;
    a = cast<U16>(gather<U8>(ptr, ix));
}

STAGE_PP(load_g8, const SkJumper_MemoryCtx* ctx) {
    r = g = b = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    a = 255;
}
STAGE_PP(load_g8_dst, const SkJumper_MemoryCtx* ctx) {
    dr = dg = db = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    da = 255;
}
STAGE_PP(luminance_to_alpha, Ctx::None) {
    a = (r*54 + g*183 + b*19)/256;  // 0.2126, 0.7152, 0.0722 with 256 denominator.
    r = g = b = 0;
}
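// Sanity check on the weights above: 54/256 ~= 0.211, 183/256 ~= 0.715,
// 19/256 ~= 0.074, and 54 + 183 + 19 == 256, so full white (255,255,255)
// maps to exactly 255.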
STAGE_GP(gather_g8, const SkJumper_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = cast<U16>(gather<U8>(ptr, ix));
    a = 255;
}

// ~~~~~~ Coverage scales / lerps ~~~~~~ //

STAGE_PP(scale_1_float, const float* f) {
    U16 c = from_float(*f);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_1_float, const float* f) {
    U16 c = from_float(*f);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

STAGE_PP(scale_u8, const SkJumper_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_u8, const SkJumper_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
    return if_then_else(a < da, min(cr,cg,cb)
                              , max(cr,cg,cb));
}
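// Worked example: with a = 100, da = 200 and rgb coverage (cr,cg,cb) = (10,20,30),
// a < da picks the minimum, so ca = 10; with a = 200, da = 100 it would pick the
// maximum, ca = 30.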
STAGE_PP(scale_565, const SkJumper_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = div255( r * cr );
    g = div255( g * cg );
    b = div255( b * cb );
    a = div255( a * ca );
}
STAGE_PP(lerp_565, const SkJumper_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = lerp(dr, r, cr);
    g = lerp(dg, g, cg);
    b = lerp(db, b, cb);
    a = lerp(da, a, ca);
}

// ~~~~~~ Gradient stages ~~~~~~ //

// Clamp x to [0,1], both sides inclusive (think, gradients).
// Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
SI F clamp_01(F v) { return min(max(0, v), 1); }

STAGE_GG(clamp_x_1 , Ctx::None) { x = clamp_01(x); }
STAGE_GG(repeat_x_1, Ctx::None) { x = clamp_01(x - floor_(x)); }
STAGE_GG(mirror_x_1, Ctx::None) {
    auto two = [](F x){ return x+x; };
    x = clamp_01(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f ));
}
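// Worked example: mirroring has period 2. For x = 1.5:
// (1.5-1) - 2*floor(0.5*0.5) - 1 == 0.5 - 0 - 1 == -0.5, and abs_() gives 0.5,
// the mirror of 1.5 about 1. For x = 2.5 the same steps also yield 0.5.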

SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); }

STAGE_GG(decal_x, SkJumper_DecalTileCtx* ctx) {
    auto w = ctx->limit_x;
    unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w)));
}
STAGE_GG(decal_y, SkJumper_DecalTileCtx* ctx) {
    auto h = ctx->limit_y;
    unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h)));
}
STAGE_GG(decal_x_and_y, SkJumper_DecalTileCtx* ctx) {
    auto w = ctx->limit_x;
    auto h = ctx->limit_y;
    unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h)));
}
STAGE_PP(check_decal_mask, SkJumper_DecalTileCtx* ctx) {
    auto mask = unaligned_load<U16>(ctx->mask);
    r = r & mask;
    g = g & mask;
    b = b & mask;
    a = a & mask;
}
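// A note on the masks above: each vector comparison produces lanes of all ones
// (true) or all zeroes (false), so cond_to_mask_16() only has to narrow them to
// 16 bits, and check_decal_mask zeroes out-of-bounds pixels with a plain AND.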


SI U16 round_F_to_U16(F x) { return cast<U16>(x * 255.0f + 0.5f); }

SI void gradient_lookup(const SkJumper_GradientCtx* c, U32 idx, F t,
                        U16* r, U16* g, U16* b, U16* a) {

    F fr, fg, fb, fa, br, bg, bb, ba;
#if defined(__AVX2__)
    if (c->stopCount <= 8) {
        __m256i lo, hi;
        split(idx, &lo, &hi);

        fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi));
        br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi));
        fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi));
        bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi));
        fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi));
        bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi));
        fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi));
        ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi));
    } else
#endif
    {
        fr = gather<F>(c->fs[0], idx);
        fg = gather<F>(c->fs[1], idx);
        fb = gather<F>(c->fs[2], idx);
        fa = gather<F>(c->fs[3], idx);
        br = gather<F>(c->bs[0], idx);
        bg = gather<F>(c->bs[1], idx);
        bb = gather<F>(c->bs[2], idx);
        ba = gather<F>(c->bs[3], idx);
    }
    *r = round_F_to_U16(mad(t, fr, br));
    *g = round_F_to_U16(mad(t, fg, bg));
    *b = round_F_to_U16(mad(t, fb, bb));
    *a = round_F_to_U16(mad(t, fa, ba));
}

STAGE_GP(gradient, const SkJumper_GradientCtx* c) {
    auto t = x;
    U32 idx = 0;

    // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
    for (size_t i = 1; i < c->stopCount; i++) {
        idx += if_then_else(t >= c->ts[i], U32(1), U32(0));
    }

    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}
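// Worked example: with stops ts = {0, 0.5, 1} and t = 0.75, the loop adds 1
// for ts[1] (0.75 >= 0.5) and nothing for ts[2] (0.75 < 1), so idx == 1 selects
// the second interval; gradient_lookup() then evaluates that interval's stored
// slope/bias line, channel = f[idx]*t + b[idx].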

STAGE_GP(evenly_spaced_gradient, const SkJumper_GradientCtx* c) {
    auto t = x;
    auto idx = trunc_(t * (c->stopCount-1));
    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}

STAGE_GP(evenly_spaced_2_stop_gradient, const void* ctx) {
    // TODO: Rename Ctx SkJumper_EvenlySpaced2StopGradientCtx.
    struct Ctx { float f[4], b[4]; };
    auto c = (const Ctx*)ctx;

    auto t = x;
    r = round_F_to_U16(mad(t, c->f[0], c->b[0]));
    g = round_F_to_U16(mad(t, c->f[1], c->b[1]));
    b = round_F_to_U16(mad(t, c->f[2], c->b[2]));
    a = round_F_to_U16(mad(t, c->f[3], c->b[3]));
}

STAGE_GG(xy_to_unit_angle, Ctx::None) {
    F xabs = abs_(x),
      yabs = abs_(y);

    F slope = min(xabs, yabs)/max(xabs, yabs);
    F s = slope * slope;

    // Use a 7th degree polynomial to approximate atan.
    // This was generated using sollya.gforge.inria.fr.
    // A float optimized polynomial was generated using the following command.
    // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
    F phi = slope
             * (0.15912117063999176025390625f     + s
             * (-5.185396969318389892578125e-2f   + s
             * (2.476101927459239959716796875e-2f + s
             * (-7.0547382347285747528076171875e-3f))));

    phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
    phi = if_then_else(x < 0.0f   , 1.0f/2.0f - phi, phi);
    phi = if_then_else(y < 0.0f   , 1.0f - phi     , phi);
    phi = if_then_else(phi != phi , 0              , phi);  // Check for NaN.
    x = phi;
}
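// Sanity check on the polynomial: at (x,y) = (1,1), slope == 1 and s == 1, so it
// sums to roughly 0.159121 - 0.051854 + 0.024761 - 0.007055 ~= 0.124973, i.e.
// about 1/8 of a turn -- 45 degrees, as expected. The if_then_else chain then
// reflects this first-octant result into the other seven octants.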
STAGE_GG(xy_to_radius, Ctx::None) {
    x = sqrt_(x*x + y*y);
}

// ~~~~~~ Compound stages ~~~~~~ //

STAGE_PP(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    load_8888(ptr, tail, &dr,&dg,&db,&da);
    r = r + div255( dr*inv(a) );
    g = g + div255( dg*inv(a) );
    b = b + div255( db*inv(a) );
    a = a + div255( da*inv(a) );
    store_8888(ptr, tail, r,g,b,a);
}
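// A note on the math above, assuming inv() is the usual 255 - v helper: this is
// standard src-over, out = src + dst*(1 - srcAlpha), in 8-bit fixed point, with
// div255() rescaling the 16-bit product back to [0,255].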
STAGE_PP(srcover_bgra_8888, const SkJumper_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    load_8888(ptr, tail, &db,&dg,&dr,&da);
    r = r + div255( dr*inv(a) );
    g = g + div255( dg*inv(a) );
    b = b + div255( db*inv(a) );
    a = a + div255( da*inv(a) );
    store_8888(ptr, tail, b,g,r,a);
}

#endif//defined(__clang__)
261
src/jumper/build_stages.py
Executable file
@@ -0,0 +1,261 @@
#!/usr/bin/env python2.7
#
# Copyright 2017 Google Inc.
#
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import re
import subprocess
import sys

clang         = 'clang-5.0'
objdump       = 'gobjdump'
ccache        = 'ccache'
stages        = 'src/jumper/SkJumper_stages.cpp'
stages_lowp   = 'src/jumper/SkJumper_stages_lowp.cpp'
generated     = 'src/jumper/SkJumper_generated.S'
generated_win = 'src/jumper/SkJumper_generated_win.S'

clang         = sys.argv[1] if len(sys.argv) > 1 else clang
objdump       = sys.argv[2] if len(sys.argv) > 2 else objdump
ccache        = sys.argv[3] if len(sys.argv) > 3 else ccache
stages        = sys.argv[4] if len(sys.argv) > 4 else stages
stages_lowp   = sys.argv[5] if len(sys.argv) > 5 else stages_lowp
generated     = sys.argv[6] if len(sys.argv) > 6 else generated
generated_win = sys.argv[7] if len(sys.argv) > 7 else generated_win

clang = [ccache, clang, '-x', 'c++']


cflags = ['-std=c++11', '-Os', '-DJUMPER_IS_OFFLINE',
          '-momit-leaf-frame-pointer', '-ffp-contract=fast',
          '-fno-exceptions', '-fno-rtti', '-fno-unwind-tables']

x86  = [ '-m32' ]
win  = ['-DWIN', '-mno-red-zone']
sse2 = ['-msse2', '-mno-sse3', '-mno-ssse3', '-mno-sse4.1']
subprocess.check_call(clang + cflags + sse2 +
                      ['-c', stages] +
                      ['-o', 'sse2.o'])
subprocess.check_call(clang + cflags + sse2 + win +
                      ['-c', stages] +
                      ['-o', 'win_sse2.o'])
subprocess.check_call(clang + cflags + sse2 + x86 +
                      ['-c', stages] +
                      ['-o', 'x86_sse2.o'])
subprocess.check_call(clang + cflags + sse2 + win + x86 +
                      ['-c', stages] +
                      ['-o', 'win_x86_sse2.o'])

subprocess.check_call(clang + cflags + sse2 +
                      ['-c', stages_lowp] +
                      ['-o', 'lowp_sse2.o'])
subprocess.check_call(clang + cflags + sse2 + win +
                      ['-c', stages_lowp] +
                      ['-o', 'win_lowp_sse2.o'])
subprocess.check_call(clang + cflags + sse2 + x86 +
                      ['-c', stages_lowp] +
                      ['-o', 'x86_lowp_sse2.o'])
subprocess.check_call(clang + cflags + sse2 + win + x86 +
                      ['-c', stages_lowp] +
                      ['-o', 'win_x86_lowp_sse2.o'])

sse41 = ['-msse4.1']
subprocess.check_call(clang + cflags + sse41 +
                      ['-c', stages] +
                      ['-o', 'sse41.o'])
subprocess.check_call(clang + cflags + sse41 + win +
                      ['-c', stages] +
                      ['-o', 'win_sse41.o'])

subprocess.check_call(clang + cflags + sse41 +
                      ['-c', stages_lowp] +
                      ['-o', 'lowp_sse41.o'])
subprocess.check_call(clang + cflags + sse41 + win +
                      ['-c', stages_lowp] +
                      ['-o', 'win_lowp_sse41.o'])

avx = ['-mavx']
subprocess.check_call(clang + cflags + avx +
                      ['-c', stages] +
                      ['-o', 'avx.o'])
subprocess.check_call(clang + cflags + avx + win +
                      ['-c', stages] +
                      ['-o', 'win_avx.o'])

hsw = ['-mavx2', '-mfma', '-mf16c']
subprocess.check_call(clang + cflags + hsw +
                      ['-c', stages] +
                      ['-o', 'hsw.o'])
subprocess.check_call(clang + cflags + hsw + win +
                      ['-c', stages] +
                      ['-o', 'win_hsw.o'])

subprocess.check_call(clang + cflags + hsw +
                      ['-c', stages_lowp] +
                      ['-o', 'lowp_hsw.o'])
subprocess.check_call(clang + cflags + hsw + win +
                      ['-c', stages_lowp] +
                      ['-o', 'win_lowp_hsw.o'])

skx = ['-march=skylake-avx512']
subprocess.check_call(clang + cflags + skx +
                      ['-c', stages] +
                      ['-o', 'skx.o'])

# Merge x86-64 object files to deduplicate constants.
# (No other platform has more than one specialization.)
subprocess.check_call(['ld', '-r', '-o', 'merged.o',
                       'skx.o', 'hsw.o', 'avx.o', 'sse41.o', 'sse2.o',
                       'lowp_hsw.o', 'lowp_sse41.o', 'lowp_sse2.o'])
subprocess.check_call(['ld', '-r', '-o', 'win_merged.o',
                       'win_hsw.o', 'win_avx.o', 'win_sse41.o', 'win_sse2.o',
                       'win_lowp_hsw.o', 'win_lowp_sse41.o', 'win_lowp_sse2.o'])

subprocess.check_call(['ld', '-r', '-o', 'x86_merged.o',
                       'x86_sse2.o',
                       'x86_lowp_sse2.o'])
subprocess.check_call(['ld', '-r', '-o', 'win_x86_merged.o',
                       'win_x86_sse2.o',
                       'win_x86_lowp_sse2.o'])
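# A note on 'ld -r': it performs a partial (relocatable) link, combining the
# per-ISA objects into one .o without resolving it into a final binary; as the
# comment above says, that lets constants shared across the x86-64 flavors be
# deduplicated rather than emitted once per object.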

def parse_object_file(dot_o, directive, target=None):
    globl, hidden, label, comment, align = \
            '.globl', 'HIDDEN', ':', '// ', 'BALIGN'
    if 'win' in dot_o:
        globl, hidden, label, comment, align = \
                'PUBLIC', '', ' LABEL PROC', '; ', 'ALIGN '

    cmd = [objdump]
    if target:
        cmd += ['--target', target]

    # Look for sections we know we can't handle.
    section_headers = subprocess.check_output(cmd + ['-h', dot_o])
    for snippet in ['.rodata']:
        if snippet in section_headers:
            print >>sys.stderr, 'Found %s in section headers.' % snippet
            assert snippet not in section_headers

    if directive == '.long':
        disassemble = ['-d', dot_o]
        dehex = lambda h: '0x'+h
    else:
        # x86-64... as long as we're using %rip-relative addressing,
        # literal sections should be fine to just dump in with .text.
        disassemble = ['-d',               # DO NOT USE -D.
                       '-z',               # Print zero bytes instead of ...
                       '--insn-width=11',
                       '-j', '.text',
                       '-j', '.literal4',
                       '-j', '.literal8',
                       '-j', '.literal16',
                       '-j', '.const',
                       dot_o]
        dehex = lambda h: str(int(h,16))
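    # dehex converts one hex token from objdump's output into the form the
    # chosen directive expects: e.g. 'c3' -> '195' for the '.byte'/'DB' calls
    # below, or a '0x'-prefixed word for the '.long' path.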

    # Ok.  Let's disassemble.
    for line in subprocess.check_output(cmd + disassemble).split('\n'):
        line = line.strip()

        if not line or line.startswith(dot_o) or line.startswith('Disassembly'):
            continue

        # E.g. 00000000000003a4 <_load_f16>:
        m = re.match('''[0-9a-f]+ <_?(.*)>:''', line)
        if m:
            print
            sym = m.group(1)
            if sym.startswith('.literal'):    # .literal4, .literal16, etc
                print sym.replace('.literal', align)
            elif sym.startswith('.const'):    # 32-byte constants
                print align + '32'
            elif not sym.startswith('sk_'):
                print >>sys.stderr, "build_stages.py can't handle '%s' (yet?)." % sym
                assert sym.startswith('sk_')
            else:  # a stage function
                if hidden:
                    print hidden + ' _' + sym
                print globl + ' _' + sym
                if 'win' not in dot_o:
                    print 'FUNCTION(_' + sym + ')'
                print '_' + sym + label
            continue

        columns = line.split('\t')
        #print >>sys.stderr, columns
        code = columns[1]
        if len(columns) >= 4:
            inst = columns[2]
            args = columns[3]
        else:
            inst, args = columns[2], ''
            if ' ' in columns[2]:
                inst, args = columns[2].split(' ', 1)
        code, inst, args = code.strip(), inst.strip(), args.strip()

        hexed = ','.join(dehex(x) for x in code.split(' '))
        print '  ' + directive + '  ' + hexed + ' '*(36-len(hexed)) + \
              comment + inst + (' '*(14-len(inst)) + args if args else '')

sys.stdout = open(generated, 'w')

print '''# Copyright 2017 Google Inc.
#
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# This file is generated semi-automatically with this command:
#   $ src/jumper/build_stages.py
'''
print '#if defined(__MACH__)'
print '    #define HIDDEN .private_extern'
print '    #define FUNCTION(name)'
print '    #define BALIGN4  .align 2'
print '    #define BALIGN8  .align 3'
print '    #define BALIGN16 .align 4'
print '    #define BALIGN32 .align 5'
print '#else'
print '    .section .note.GNU-stack,"",%progbits'
print '    #define HIDDEN .hidden'
print '    #define FUNCTION(name) .type name,%function'
print '    #define BALIGN4  .balign 4'
print '    #define BALIGN8  .balign 8'
print '    #define BALIGN16 .balign 16'
print '    #define BALIGN32 .balign 32'
print '#endif'

print '.text'
print '#if defined(__x86_64__)'
print 'BALIGN32'
parse_object_file('merged.o', '.byte')

print '#elif defined(__i386__)'
print 'BALIGN32'
parse_object_file('x86_merged.o', '.byte')

print '#endif'

sys.stdout = open(generated_win, 'w')
print '''; Copyright 2017 Google Inc.
;
; Use of this source code is governed by a BSD-style license that can be
; found in the LICENSE file.

; This file is generated semi-automatically with this command:
;   $ src/jumper/build_stages.py
'''
print 'IFDEF RAX'
print "_text32 SEGMENT ALIGN(32) 'CODE'"
print 'ALIGN 32'
parse_object_file('win_merged.o', 'DB')

print 'ELSE'
print '.MODEL FLAT,C'
print "_text32 SEGMENT ALIGN(32) 'CODE'"
print 'ALIGN 32'
parse_object_file('win_x86_merged.o', 'DB')

print 'ENDIF'
print 'END'
@@ -19,11 +19,11 @@

namespace SK_OPTS_NS {

template <typename T, typename P>
static inline T unaligned_load(const P* p) {
    T v;
    memcpy(&v, p, sizeof(v));
    return v;
template <typename T>
static inline T unaligned_load(const uint8_t* src) {
    T val;
    memcpy(&val, src, sizeof(val));
    return val;
}

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
@@ -5,10 +5,14 @@
 * found in the LICENSE file.
 */

#include "SkSafe_math.h"   // Keep this first.
#include "SkOpts.h"

#if defined(_INC_MATH) && !defined(INC_MATH_IS_SAFE_NOW)
    #error We have included ucrt\math.h without protecting it against ODR violation.
#endif

#define SK_OPTS_NS avx
#include "SkRasterPipeline_opts.h"
#include "SkUtils_opts.h"

namespace SkOpts {
@@ -16,17 +20,5 @@ namespace SkOpts {
    memset16 = SK_OPTS_NS::memset16;
    memset32 = SK_OPTS_NS::memset32;
    memset64 = SK_OPTS_NS::memset64;

#define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
    SK_RASTER_PIPELINE_STAGES(M)
    just_return_highp    = (StageFn)SK_OPTS_NS::just_return;
    start_pipeline_highp = SK_OPTS_NS::start_pipeline;
#undef M

#define M(st) stages_lowp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::lowp::st;
    SK_RASTER_PIPELINE_STAGES(M)
    just_return_lowp    = (StageFn)SK_OPTS_NS::lowp::just_return;
    start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
#undef M
}
}

@@ -1,28 +0,0 @@
/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkOpts.h"

#define SK_OPTS_NS hsw
#include "SkRasterPipeline_opts.h"
#include "SkUtils_opts.h"

namespace SkOpts {
    void Init_hsw() {
#define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
        SK_RASTER_PIPELINE_STAGES(M)
        just_return_highp    = (StageFn)SK_OPTS_NS::just_return;
        start_pipeline_highp = SK_OPTS_NS::start_pipeline;
#undef M

#define M(st) stages_lowp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::lowp::st;
        SK_RASTER_PIPELINE_STAGES(M)
        just_return_lowp    = (StageFn)SK_OPTS_NS::lowp::just_return;
        start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
#undef M
    }
}
@@ -8,23 +8,10 @@
#include "SkOpts.h"

#define SK_OPTS_NS sse41
#include "SkRasterPipeline_opts.h"
#include "SkBlitRow_opts.h"

namespace SkOpts {
    void Init_sse41() {
        blit_row_s32a_opaque = sse41::blit_row_s32a_opaque;

#define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
        SK_RASTER_PIPELINE_STAGES(M)
        just_return_highp    = (StageFn)SK_OPTS_NS::just_return;
        start_pipeline_highp = SK_OPTS_NS::start_pipeline;
#undef M

#define M(st) stages_lowp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::lowp::st;
        SK_RASTER_PIPELINE_STAGES(M)
        just_return_lowp    = (StageFn)SK_OPTS_NS::lowp::just_return;
        start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
#undef M
    }
}