From 78594b26ddff7f8773e957e4107ef57c8d7a1bd8 Mon Sep 17 00:00:00 2001
From: Mike Klein <mtklein@chromium.org>
Date: Thu, 16 Feb 2017 10:17:00 -0500
Subject: [PATCH] SkJumper: everyone gets a start_pipeline().

Windows needs this as a shim to switch to the System V ABI and back.

Other platforms need it too, if only to make UBSAN happy about calling
functions through the correct function pointers.

One day maybe we can move the looping logic inside start_pipeline?

Change-Id: I47d9ef48752becc6c43fc052b12a540c157bcaaa
Reviewed-on: https://skia-review.googlesource.com/8542
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
---
 src/jumper/SkJumper.cpp         |  71 +++++++-----
 src/jumper/SkJumper_generated.h | 189 +++++++++++++++++++++++++++++++-
 src/jumper/SkJumper_stages.cpp  |  21 +++-
 3 files changed, 246 insertions(+), 35 deletions(-)

diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 3b27ffe81e..2ed95a1e0e 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -21,11 +21,6 @@ static const SkJumper_constants kConstants = {
     0x77800000, 0x07800000, 0x04000400,                // fp16 <-> fp32
 };
 
-using JumperStage = void(size_t, void**, const SkJumper_constants*);
-// Jumper stages actually pass around 8 floating point vectors too.
-// They're designed to work when those vectors start unintialized,
-// so we don't need to mention them here.
-
 #define STAGES(M)     \
     M(seed_shader)    \
     M(constant_color) \
@@ -57,8 +52,12 @@ using JumperStage = void(size_t, void**, const SkJumper_constants*);
 
 // Declare the portable, single pixel stages that are linked into Skia from SkJumper_stages.o.
 extern "C" {
-    JumperStage sk_just_return;
-#define M(st) JumperStage sk_##st;
+    void sk_start_pipeline(size_t, void**, const SkJumper_constants*);
+
+    // We use void() as a convenient stand-in for the real stage function type.
+    // We never call these directly, so we don't really need to know their real types.
+    void sk_just_return(void);
+#define M(st) void sk_##st(void);
     STAGES(M)
 #undef M
 }
@@ -123,33 +122,51 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
     // We'll look for the best vector instruction set and stride we can use.
     size_t stride                                 = 0;
     void* (*lookup)(SkRasterPipeline::StockStage) = nullptr;
+    void* start_pipeline                          = nullptr;
     void* just_return                             = nullptr;
 
 #if defined(__aarch64__)
-    stride      = 4;
-    lookup      = aarch64_lookup;
-    just_return = (void*)aarch64_sk_just_return;
+    stride         = 4;
+    lookup         = aarch64_lookup;
+    start_pipeline = (void*)aarch64_sk_start_pipeline;
+    just_return    = (void*)aarch64_sk_just_return;
 
 #elif defined(__ARM_NEON__)
     if (SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
-        stride      = 2;
-        lookup      = armv7_lookup;
-        just_return = (void*)armv7_sk_just_return;
+        stride         = 2;
+        lookup         = armv7_lookup;
+        start_pipeline = (void*)armv7_sk_start_pipeline;
+        just_return    = (void*)armv7_sk_just_return;
     }
 
 #elif defined(__x86_64__) || defined(_M_X64)
-    stride      = 4;
-    lookup      = sse2_lookup;
-    just_return = (void*)sse2_sk_just_return;
+    stride         = 4;
+    lookup         = sse2_lookup;
+    start_pipeline = (void*)sse2_sk_start_pipeline;
+    just_return    = (void*)sse2_sk_just_return;
     if (SkCpu::Supports(SkCpu::SSE41)) {
-        stride      = 4;
-        lookup      = sse41_lookup;
-        just_return = (void*)sse41_sk_just_return;
+        stride         = 4;
+        lookup         = sse41_lookup;
+        start_pipeline = (void*)sse41_sk_start_pipeline;
+        just_return    = (void*)sse41_sk_just_return;
     }
     if (SkCpu::Supports(SkCpu::HSW)) {
-        stride      = 8;
-        lookup      = hsw_lookup;
-        just_return = (void*)hsw_sk_just_return;
+        stride         = 8;
+        lookup         = hsw_lookup;
+        start_pipeline = (void*)hsw_sk_start_pipeline;
+        just_return    = (void*)hsw_sk_just_return;
+    }
+#endif
+
+#if defined(_MSC_VER)
+    if (start_pipeline == (void*)sse2_sk_start_pipeline) {
+        start_pipeline =  (void*)sse2_sk_start_pipeline_ms;
+    }
+    if (start_pipeline == (void*)sse41_sk_start_pipeline) {
+        start_pipeline =  (void*)sse41_sk_start_pipeline_ms;
+    }
+    if (start_pipeline == (void*)hsw_sk_start_pipeline) {
+        start_pipeline =  (void*)hsw_sk_start_pipeline_ms;
     }
 #endif
 
@@ -170,10 +187,9 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
         }
         *ip = (void*)just_return;
 
-        ip = program.get();
-        auto start = (JumperStage*)*ip++;
+        auto start = (decltype(&sk_start_pipeline))start_pipeline;
         while (x + stride <= limit) {
-            start(x, ip, &kConstants);
+            start(x, program.get(), &kConstants);
             x += stride;
         }
     }
@@ -193,10 +209,9 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
         }
         *ip = (void*)sk_just_return;
 
-        ip = program.get();
-        auto start = (JumperStage*)*ip++;
+        auto start = sk_start_pipeline;
         while (x + stride <= limit) {
-            start(x, ip, &kConstants);
+            start(x, program.get(), &kConstants);
             x += stride;
         }
     }
diff --git a/src/jumper/SkJumper_generated.h b/src/jumper/SkJumper_generated.h
index ac0dd4855e..cf25d6740d 100644
--- a/src/jumper/SkJumper_generated.h
+++ b/src/jumper/SkJumper_generated.h
@@ -11,6 +11,18 @@
 // This file is generated semi-automatically with this command:
 //   $ src/jumper/build_stages.py
 
+static const unsigned int aarch64_sk_start_pipeline[] = {
+    0xf8408423,                                     //  ldr           x3, [x1],#8
+    0x6f00e400,                                     //  movi          v0.2d, #0x0
+    0x6f00e401,                                     //  movi          v1.2d, #0x0
+    0x6f00e402,                                     //  movi          v2.2d, #0x0
+    0x6f00e403,                                     //  movi          v3.2d, #0x0
+    0x6f00e404,                                     //  movi          v4.2d, #0x0
+    0x6f00e405,                                     //  movi          v5.2d, #0x0
+    0x6f00e406,                                     //  movi          v6.2d, #0x0
+    0x6f00e407,                                     //  movi          v7.2d, #0x0
+    0xd61f0060,                                     //  br            x3
+};
 static const unsigned int aarch64_sk_just_return[] = {
     0xd65f03c0,                                     //  ret
 };
@@ -507,6 +519,18 @@ static const unsigned int aarch64_sk_linear_gradient_2stops[] = {
     0x4eb01e00,                                     //  mov           v0.16b, v16.16b
     0xd61f0060,                                     //  br            x3
 };
+static const unsigned int armv7_sk_start_pipeline[] = {
+    0xe4913004,                                     //  ldr           r3, [r1], #4
+    0xf2800010,                                     //  vmov.i32      d0, #0
+    0xf2801010,                                     //  vmov.i32      d1, #0
+    0xf2802010,                                     //  vmov.i32      d2, #0
+    0xf2803010,                                     //  vmov.i32      d3, #0
+    0xf2804010,                                     //  vmov.i32      d4, #0
+    0xf2805010,                                     //  vmov.i32      d5, #0
+    0xf2806010,                                     //  vmov.i32      d6, #0
+    0xf2807010,                                     //  vmov.i32      d7, #0
+    0xe12fff13,                                     //  bx            r3
+};
 static const unsigned int armv7_sk_just_return[] = {
     0xe12fff1e,                                     //  bx            lr
 };
@@ -1066,9 +1090,62 @@ static const unsigned int armv7_sk_linear_gradient_2stops[] = {
     0xf22001b0,                                     //  vorr          d0, d16, d16
     0xe12fff1c,                                     //  bx            ip
 };
+static const unsigned char hsw_sk_start_pipeline[] = {
+    0x48,0xad,                                      //  lods          %ds:(%rsi),%rax
+    0xc5,0xfc,0x57,0xc0,                            //  vxorps        %ymm0,%ymm0,%ymm0
+    0xc5,0xf4,0x57,0xc9,                            //  vxorps        %ymm1,%ymm1,%ymm1
+    0xc5,0xec,0x57,0xd2,                            //  vxorps        %ymm2,%ymm2,%ymm2
+    0xc5,0xe4,0x57,0xdb,                            //  vxorps        %ymm3,%ymm3,%ymm3
+    0xc5,0xdc,0x57,0xe4,                            //  vxorps        %ymm4,%ymm4,%ymm4
+    0xc5,0xd4,0x57,0xed,                            //  vxorps        %ymm5,%ymm5,%ymm5
+    0xc5,0xcc,0x57,0xf6,                            //  vxorps        %ymm6,%ymm6,%ymm6
+    0xc5,0xc4,0x57,0xff,                            //  vxorps        %ymm7,%ymm7,%ymm7
+    0xff,0xe0,                                      //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_start_pipeline_ms[] = {
+    0x56,                                           //  push          %rsi
+    0x57,                                           //  push          %rdi
+    0x48,0x81,0xec,0xa8,0x00,0x00,0x00,             //  sub           $0xa8,%rsp
+    0xc5,0x78,0x29,0xbc,0x24,0x90,0x00,0x00,0x00,   //  vmovaps       %xmm15,0x90(%rsp)
+    0xc5,0x78,0x29,0xb4,0x24,0x80,0x00,0x00,0x00,   //  vmovaps       %xmm14,0x80(%rsp)
+    0xc5,0x78,0x29,0x6c,0x24,0x70,                  //  vmovaps       %xmm13,0x70(%rsp)
+    0xc5,0x78,0x29,0x64,0x24,0x60,                  //  vmovaps       %xmm12,0x60(%rsp)
+    0xc5,0x78,0x29,0x5c,0x24,0x50,                  //  vmovaps       %xmm11,0x50(%rsp)
+    0xc5,0x78,0x29,0x54,0x24,0x40,                  //  vmovaps       %xmm10,0x40(%rsp)
+    0xc5,0x78,0x29,0x4c,0x24,0x30,                  //  vmovaps       %xmm9,0x30(%rsp)
+    0xc5,0x78,0x29,0x44,0x24,0x20,                  //  vmovaps       %xmm8,0x20(%rsp)
+    0xc5,0xf8,0x29,0x7c,0x24,0x10,                  //  vmovaps       %xmm7,0x10(%rsp)
+    0xc5,0xf8,0x29,0x34,0x24,                       //  vmovaps       %xmm6,(%rsp)
+    0x48,0x89,0xd6,                                 //  mov           %rdx,%rsi
+    0x48,0xad,                                      //  lods          %ds:(%rsi),%rax
+    0xc5,0xfc,0x57,0xc0,                            //  vxorps        %ymm0,%ymm0,%ymm0
+    0xc5,0xf4,0x57,0xc9,                            //  vxorps        %ymm1,%ymm1,%ymm1
+    0xc5,0xec,0x57,0xd2,                            //  vxorps        %ymm2,%ymm2,%ymm2
+    0xc5,0xe4,0x57,0xdb,                            //  vxorps        %ymm3,%ymm3,%ymm3
+    0xc5,0xdc,0x57,0xe4,                            //  vxorps        %ymm4,%ymm4,%ymm4
+    0xc5,0xd4,0x57,0xed,                            //  vxorps        %ymm5,%ymm5,%ymm5
+    0xc5,0xcc,0x57,0xf6,                            //  vxorps        %ymm6,%ymm6,%ymm6
+    0xc5,0xc4,0x57,0xff,                            //  vxorps        %ymm7,%ymm7,%ymm7
+    0x48,0x89,0xcf,                                 //  mov           %rcx,%rdi
+    0x4c,0x89,0xc2,                                 //  mov           %r8,%rdx
+    0xff,0xd0,                                      //  callq         *%rax
+    0xc5,0xf8,0x28,0x34,0x24,                       //  vmovaps       (%rsp),%xmm6
+    0xc5,0xf8,0x28,0x7c,0x24,0x10,                  //  vmovaps       0x10(%rsp),%xmm7
+    0xc5,0x78,0x28,0x44,0x24,0x20,                  //  vmovaps       0x20(%rsp),%xmm8
+    0xc5,0x78,0x28,0x4c,0x24,0x30,                  //  vmovaps       0x30(%rsp),%xmm9
+    0xc5,0x78,0x28,0x54,0x24,0x40,                  //  vmovaps       0x40(%rsp),%xmm10
+    0xc5,0x78,0x28,0x5c,0x24,0x50,                  //  vmovaps       0x50(%rsp),%xmm11
+    0xc5,0x78,0x28,0x64,0x24,0x60,                  //  vmovaps       0x60(%rsp),%xmm12
+    0xc5,0x78,0x28,0x6c,0x24,0x70,                  //  vmovaps       0x70(%rsp),%xmm13
+    0xc5,0x78,0x28,0xb4,0x24,0x80,0x00,0x00,0x00,   //  vmovaps       0x80(%rsp),%xmm14
+    0xc5,0x78,0x28,0xbc,0x24,0x90,0x00,0x00,0x00,   //  vmovaps       0x90(%rsp),%xmm15
+    0x48,0x81,0xc4,0xa8,0x00,0x00,0x00,             //  add           $0xa8,%rsp
+    0x5f,                                           //  pop           %rdi
+    0x5e,                                           //  pop           %rsi
+    0xc5,0xf8,0x77,                                 //  vzeroupper
+    0xc3,                                           //  retq
+};
 static const unsigned char hsw_sk_just_return[] = {
-    0xc5,0xf8,0x77,                                 //  vzeroupper
-    0xc5,0xf8,0x77,                                 //  vzeroupper
     0xc3,                                           //  retq
 };
 static const unsigned char hsw_sk_seed_shader[] = {
@@ -1514,6 +1591,60 @@ static const unsigned char hsw_sk_linear_gradient_2stops[] = {
     0xc5,0x7c,0x29,0xc0,                            //  vmovaps       %ymm8,%ymm0
     0xff,0xe0,                                      //  jmpq          *%rax
 };
+static const unsigned char sse41_sk_start_pipeline[] = {
+    0x48,0xad,                                      //  lods          %ds:(%rsi),%rax
+    0x0f,0x57,0xc0,                                 //  xorps         %xmm0,%xmm0
+    0x0f,0x57,0xc9,                                 //  xorps         %xmm1,%xmm1
+    0x0f,0x57,0xd2,                                 //  xorps         %xmm2,%xmm2
+    0x0f,0x57,0xdb,                                 //  xorps         %xmm3,%xmm3
+    0x0f,0x57,0xe4,                                 //  xorps         %xmm4,%xmm4
+    0x0f,0x57,0xed,                                 //  xorps         %xmm5,%xmm5
+    0x0f,0x57,0xf6,                                 //  xorps         %xmm6,%xmm6
+    0x0f,0x57,0xff,                                 //  xorps         %xmm7,%xmm7
+    0xff,0xe0,                                      //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_start_pipeline_ms[] = {
+    0x56,                                           //  push          %rsi
+    0x57,                                           //  push          %rdi
+    0x48,0x81,0xec,0xa8,0x00,0x00,0x00,             //  sub           $0xa8,%rsp
+    0x44,0x0f,0x29,0xbc,0x24,0x90,0x00,0x00,0x00,   //  movaps        %xmm15,0x90(%rsp)
+    0x44,0x0f,0x29,0xb4,0x24,0x80,0x00,0x00,0x00,   //  movaps        %xmm14,0x80(%rsp)
+    0x44,0x0f,0x29,0x6c,0x24,0x70,                  //  movaps        %xmm13,0x70(%rsp)
+    0x44,0x0f,0x29,0x64,0x24,0x60,                  //  movaps        %xmm12,0x60(%rsp)
+    0x44,0x0f,0x29,0x5c,0x24,0x50,                  //  movaps        %xmm11,0x50(%rsp)
+    0x44,0x0f,0x29,0x54,0x24,0x40,                  //  movaps        %xmm10,0x40(%rsp)
+    0x44,0x0f,0x29,0x4c,0x24,0x30,                  //  movaps        %xmm9,0x30(%rsp)
+    0x44,0x0f,0x29,0x44,0x24,0x20,                  //  movaps        %xmm8,0x20(%rsp)
+    0x0f,0x29,0x7c,0x24,0x10,                       //  movaps        %xmm7,0x10(%rsp)
+    0x0f,0x29,0x34,0x24,                            //  movaps        %xmm6,(%rsp)
+    0x48,0x89,0xd6,                                 //  mov           %rdx,%rsi
+    0x48,0xad,                                      //  lods          %ds:(%rsi),%rax
+    0x0f,0x57,0xc0,                                 //  xorps         %xmm0,%xmm0
+    0x0f,0x57,0xc9,                                 //  xorps         %xmm1,%xmm1
+    0x0f,0x57,0xd2,                                 //  xorps         %xmm2,%xmm2
+    0x0f,0x57,0xdb,                                 //  xorps         %xmm3,%xmm3
+    0x0f,0x57,0xe4,                                 //  xorps         %xmm4,%xmm4
+    0x0f,0x57,0xed,                                 //  xorps         %xmm5,%xmm5
+    0x0f,0x57,0xf6,                                 //  xorps         %xmm6,%xmm6
+    0x0f,0x57,0xff,                                 //  xorps         %xmm7,%xmm7
+    0x48,0x89,0xcf,                                 //  mov           %rcx,%rdi
+    0x4c,0x89,0xc2,                                 //  mov           %r8,%rdx
+    0xff,0xd0,                                      //  callq         *%rax
+    0x0f,0x28,0x34,0x24,                            //  movaps        (%rsp),%xmm6
+    0x0f,0x28,0x7c,0x24,0x10,                       //  movaps        0x10(%rsp),%xmm7
+    0x44,0x0f,0x28,0x44,0x24,0x20,                  //  movaps        0x20(%rsp),%xmm8
+    0x44,0x0f,0x28,0x4c,0x24,0x30,                  //  movaps        0x30(%rsp),%xmm9
+    0x44,0x0f,0x28,0x54,0x24,0x40,                  //  movaps        0x40(%rsp),%xmm10
+    0x44,0x0f,0x28,0x5c,0x24,0x50,                  //  movaps        0x50(%rsp),%xmm11
+    0x44,0x0f,0x28,0x64,0x24,0x60,                  //  movaps        0x60(%rsp),%xmm12
+    0x44,0x0f,0x28,0x6c,0x24,0x70,                  //  movaps        0x70(%rsp),%xmm13
+    0x44,0x0f,0x28,0xb4,0x24,0x80,0x00,0x00,0x00,   //  movaps        0x80(%rsp),%xmm14
+    0x44,0x0f,0x28,0xbc,0x24,0x90,0x00,0x00,0x00,   //  movaps        0x90(%rsp),%xmm15
+    0x48,0x81,0xc4,0xa8,0x00,0x00,0x00,             //  add           $0xa8,%rsp
+    0x5f,                                           //  pop           %rdi
+    0x5e,                                           //  pop           %rsi
+    0xc3,                                           //  retq
+};
 static const unsigned char sse41_sk_just_return[] = {
     0xc3,                                           //  retq
 };
@@ -2145,6 +2276,60 @@ static const unsigned char sse41_sk_linear_gradient_2stops[] = {
     0x41,0x0f,0x28,0xc0,                            //  movaps        %xmm8,%xmm0
     0xff,0xe0,                                      //  jmpq          *%rax
 };
+static const unsigned char sse2_sk_start_pipeline[] = {
+    0x48,0xad,                                      //  lods          %ds:(%rsi),%rax
+    0x0f,0x57,0xc0,                                 //  xorps         %xmm0,%xmm0
+    0x0f,0x57,0xc9,                                 //  xorps         %xmm1,%xmm1
+    0x0f,0x57,0xd2,                                 //  xorps         %xmm2,%xmm2
+    0x0f,0x57,0xdb,                                 //  xorps         %xmm3,%xmm3
+    0x0f,0x57,0xe4,                                 //  xorps         %xmm4,%xmm4
+    0x0f,0x57,0xed,                                 //  xorps         %xmm5,%xmm5
+    0x0f,0x57,0xf6,                                 //  xorps         %xmm6,%xmm6
+    0x0f,0x57,0xff,                                 //  xorps         %xmm7,%xmm7
+    0xff,0xe0,                                      //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_start_pipeline_ms[] = {
+    0x56,                                           //  push          %rsi
+    0x57,                                           //  push          %rdi
+    0x48,0x81,0xec,0xa8,0x00,0x00,0x00,             //  sub           $0xa8,%rsp
+    0x44,0x0f,0x29,0xbc,0x24,0x90,0x00,0x00,0x00,   //  movaps        %xmm15,0x90(%rsp)
+    0x44,0x0f,0x29,0xb4,0x24,0x80,0x00,0x00,0x00,   //  movaps        %xmm14,0x80(%rsp)
+    0x44,0x0f,0x29,0x6c,0x24,0x70,                  //  movaps        %xmm13,0x70(%rsp)
+    0x44,0x0f,0x29,0x64,0x24,0x60,                  //  movaps        %xmm12,0x60(%rsp)
+    0x44,0x0f,0x29,0x5c,0x24,0x50,                  //  movaps        %xmm11,0x50(%rsp)
+    0x44,0x0f,0x29,0x54,0x24,0x40,                  //  movaps        %xmm10,0x40(%rsp)
+    0x44,0x0f,0x29,0x4c,0x24,0x30,                  //  movaps        %xmm9,0x30(%rsp)
+    0x44,0x0f,0x29,0x44,0x24,0x20,                  //  movaps        %xmm8,0x20(%rsp)
+    0x0f,0x29,0x7c,0x24,0x10,                       //  movaps        %xmm7,0x10(%rsp)
+    0x0f,0x29,0x34,0x24,                            //  movaps        %xmm6,(%rsp)
+    0x48,0x89,0xd6,                                 //  mov           %rdx,%rsi
+    0x48,0xad,                                      //  lods          %ds:(%rsi),%rax
+    0x0f,0x57,0xc0,                                 //  xorps         %xmm0,%xmm0
+    0x0f,0x57,0xc9,                                 //  xorps         %xmm1,%xmm1
+    0x0f,0x57,0xd2,                                 //  xorps         %xmm2,%xmm2
+    0x0f,0x57,0xdb,                                 //  xorps         %xmm3,%xmm3
+    0x0f,0x57,0xe4,                                 //  xorps         %xmm4,%xmm4
+    0x0f,0x57,0xed,                                 //  xorps         %xmm5,%xmm5
+    0x0f,0x57,0xf6,                                 //  xorps         %xmm6,%xmm6
+    0x0f,0x57,0xff,                                 //  xorps         %xmm7,%xmm7
+    0x48,0x89,0xcf,                                 //  mov           %rcx,%rdi
+    0x4c,0x89,0xc2,                                 //  mov           %r8,%rdx
+    0xff,0xd0,                                      //  callq         *%rax
+    0x0f,0x28,0x34,0x24,                            //  movaps        (%rsp),%xmm6
+    0x0f,0x28,0x7c,0x24,0x10,                       //  movaps        0x10(%rsp),%xmm7
+    0x44,0x0f,0x28,0x44,0x24,0x20,                  //  movaps        0x20(%rsp),%xmm8
+    0x44,0x0f,0x28,0x4c,0x24,0x30,                  //  movaps        0x30(%rsp),%xmm9
+    0x44,0x0f,0x28,0x54,0x24,0x40,                  //  movaps        0x40(%rsp),%xmm10
+    0x44,0x0f,0x28,0x5c,0x24,0x50,                  //  movaps        0x50(%rsp),%xmm11
+    0x44,0x0f,0x28,0x64,0x24,0x60,                  //  movaps        0x60(%rsp),%xmm12
+    0x44,0x0f,0x28,0x6c,0x24,0x70,                  //  movaps        0x70(%rsp),%xmm13
+    0x44,0x0f,0x28,0xb4,0x24,0x80,0x00,0x00,0x00,   //  movaps        0x80(%rsp),%xmm14
+    0x44,0x0f,0x28,0xbc,0x24,0x90,0x00,0x00,0x00,   //  movaps        0x90(%rsp),%xmm15
+    0x48,0x81,0xc4,0xa8,0x00,0x00,0x00,             //  add           $0xa8,%rsp
+    0x5f,                                           //  pop           %rdi
+    0x5e,                                           //  pop           %rsi
+    0xc3,                                           //  retq
+};
 static const unsigned char sse2_sk_just_return[] = {
     0xc3,                                           //  retq
 };
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index fa771c03e6..0c310515e3 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -200,13 +200,24 @@ static void* load_and_inc(void**& program) {
     static void name##_k(size_t& x, void* ctx, K* k,                          \
                          F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
 
-// A glue Stage to end the tail call chain, finally returning to the caller.
-extern "C" void sk_just_return(size_t, void**, K*, F,F,F,F, F,F,F,F) {
-#if defined(JUMPER) && defined(__AVX2__)
-    _mm256_zeroupper();
-#endif
+// Some glue stages that don't fit the normal pattern of stages.
+
+extern "C" void sk_start_pipeline(size_t x, void** program, K* k) {
+    auto next = (Stage*)load_and_inc(program);
+    F v{};   // TODO: faster uninitialized?
+    next(x,program,k, v,v,v,v, v,v,v,v);
 }
 
+#if defined(JUMPER) && defined(__x86_64__)
+    __attribute__((ms_abi))
+    extern "C" void sk_start_pipeline_ms(size_t x, void** program, K* k) {
+        sk_start_pipeline(x,program,k);
+    }
+#endif
+
+// Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
+extern "C" void sk_just_return(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
+
 // We can now define Stages!
 
 // Some things to keep in mind while writing Stages: