SkSplicer: start on arm64

Seems to be working. The jump to loop_start might be a little off, but not by much. Correctness is really still a big TODO. $ adb shell 'cd /data/local/tmp; ./monobench SkRasterPipeline 200' SkRasterPipeline_… 200 …f16_compile 1x …f16_run 1.42x …srgb_compile 2.21x …srgb_run 2.59x⏎ Change-Id: I0e1acc6404cf3ce8084d9ef8011cbe0b5f1fd6e3 Reviewed-on: https://skia-review.googlesource.com/6811 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@chromium.org>
2017-01-09 17:21:32 -05:00 · 2017-01-09 17:21:32 -05:00 · 8e619a2b4e
commit 8e619a2b4e
parent 1e74cad9b4
4 changed files with 482 additions and 87 deletions
--- a/src/splicer/SkSplicer.cpp
+++ b/src/splicer/SkSplicer.cpp
@ -29,38 +29,86 @@ namespace {
        12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f,   //   to_srgb
    };

-    // Short x86-64 instruction sequences that we'll use as glue to splice together Stages.
-    static const uint8_t   vzeroupper[] = { 0xc5, 0xf8, 0x77 };        // clear top half of all ymm
-    static const uint8_t          ret[] = { 0xc3 };                    // return
-    static const uint8_t  movabsq_rcx[] = { 0x48, 0xb9 };              // move next 8 bytes into rcx
-    static const uint8_t  movabsq_rdx[] = { 0x48, 0xba };              // move next 8 bytes into rdx
-    static const uint8_t   addq_8_rdi[] = { 0x48, 0x83, 0xc7, 0x08 };  // rdi += 8
-    static const uint8_t cmpq_rsi_rdi[] = { 0x48, 0x39, 0xf7 };        // rdi cmp? rsi
-    static const uint8_t      jb_near[] = { 0x0f, 0x8c };              // jump relative next 4 bytes
-                                                                       //  if cmp set unsigned < bit
-
    // We do this a lot, so it's nice to infer the correct size.  Works fine with arrays.
    template <typename T>
-    void splice(SkWStream* stream, const T& val) {
-        stream->write(&val, sizeof(val));
+    static void splice(SkWStream* buf, const T& val) {
+        buf->write(&val, sizeof(val));
    }

+#if defined(__aarch64__)
+    static constexpr int kStride = 4;
+    static void set_k(SkWStream* buf, const SkSplicer_constants* k) {
+        uint16_t parts[4];
+        memcpy(parts, &k, 8);
+        splice(buf, 0xd2f00000 | (parts[3] << 5) | 0x3);  // move  16-bit intermediate << 48 into x3
+        splice(buf, 0xf2c00000 | (parts[2] << 5) | 0x3);  // merge 16-bit intermediate << 32 into x3
+        splice(buf, 0xf2a00000 | (parts[1] << 5) | 0x3);  // merge 16-bit intermediate << 16 into x3
+        splice(buf, 0xf2800000 | (parts[0] << 5) | 0x3);  // merge 16-bit intermediate <<  0 into x3
+    }
+    static void set_ctx(SkWStream* buf, void* ctx) {
+        uint16_t parts[4];
+        memcpy(parts, &ctx, 8);
+        splice(buf, 0xd2f00000 | (parts[3] << 5) | 0x2);  // move  16-bit intermediate << 48 into x2
+        splice(buf, 0xf2c00000 | (parts[2] << 5) | 0x2);  // merge 16-bit intermediate << 32 into x2
+        splice(buf, 0xf2a00000 | (parts[1] << 5) | 0x2);  // merge 16-bit intermediate << 16 into x2
+        splice(buf, 0xf2800000 | (parts[0] << 5) | 0x2);  // merge 16-bit intermediate <<  0 into x2
+    }
+    static void loop(SkWStream* buf, int loop_start) {
+        splice(buf, 0x91001000);        // add x0, x0, #4
+        splice(buf, 0xeb01001f);        // cmp x0, x1
+        int off = loop_start - (int)(buf->bytesWritten() + 4);  // TODO: check that this is right
+        off /= 4;   // bytes -> instructions, still signed
+        off = (off & 0x7ffff) << 5;  // 19 bit maximum range (+- 256K instructions)
+        splice(buf, 0x54000003 | off); // b.cc loop_start  (cc == "carry clear", unsigned less than)
+    }
+    static void ret(SkWStream* buf) {
+        splice(buf, 0xd65f03c0);  // ret
+    }
+#else
+    static constexpr int kStride = 8;
+    static void set_k(SkWStream* buf, const SkSplicer_constants* k) {
+        static const uint8_t movabsq_rcx[] = { 0x48, 0xb9 };
+        splice(buf, movabsq_rcx);  // movabsq <next 8 bytes>, %rcx
+        splice(buf, k);
+    }
+    static void set_ctx(SkWStream* buf, void* ctx) {
+        static const uint8_t movabsq_rdx[] = { 0x48, 0xba };
+        splice(buf, movabsq_rdx);  // movabsq <next 8 bytes>, %rdx
+        splice(buf, ctx);
+    }
+    static void loop(SkWStream* buf, int loop_start) {
+        static const uint8_t  addq_8_rdi[] = { 0x48, 0x83, 0xc7, 0x08 };
+        static const uint8_t cmp_rsi_rdi[] = { 0x48, 0x39, 0xf7 };
+        static const uint8_t     jb_near[] = { 0x0f, 0x8c };
+        splice(buf, addq_8_rdi);   // addq $8, %rdi
+        splice(buf, cmp_rsi_rdi);  // cmp %rsi, %rdi
+        splice(buf, jb_near);      // jb <next 4 bytes>  (b == "before", unsigned less than)
+        splice(buf, loop_start - (int)(buf->bytesWritten() + 4));
+    }
+    static void ret(SkWStream* buf) {
+        static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 };
+        static const uint8_t        ret[] = { 0xc3 };
+        splice(buf, vzeroupper);
+        splice(buf, ret);
+    }
+#endif
+
 #ifdef IACA_DUMP
    static const uint8_t      ud2[] = { 0x0f, 0x0b };         // undefined... crashes when run
    static const uint8_t     nop3[] = { 0x64, 0x67, 0x90 };   // 3 byte no-op
    static const uint8_t movl_ebx[] = { 0xbb };               // move next 4 bytes into ebx

-    static void iaca_start(SkWStream* stream) {
-        splice(stream, ud2);
-        splice(stream, movl_ebx);
-        splice(stream, 111);
-        splice(stream, nop3);
+    static void iaca_start(SkWStream* buf) {
+        splice(buf, ud2);
+        splice(buf, movl_ebx);
+        splice(buf, 111);
+        splice(buf, nop3);
    }
-    static void iaca_end(SkWStream* stream) {
-        splice(stream, movl_ebx);
-        splice(stream, 222);
-        splice(stream, nop3);
-        splice(stream, ud2);
+    static void iaca_end(SkWStream* buf) {
+        splice(buf, movl_ebx);
+        splice(buf, 222);
+        splice(buf, nop3);
+        splice(buf, ud2);
    }
 #else
    static void iaca_start(SkWStream*) {}
@ -87,32 +135,32 @@ namespace {
        Spliced(const SkRasterPipeline::Stage* stages, int nstages) {
            // We always create a backup interpreter pipeline,
            //   - to handle any program we can't, and
-            //   - to handle the n < 8 tails.
+            //   - to handle the n < kStride tails.
            fBackup     = SkOpts::compile_pipeline(stages, nstages);
            fSplicedLen = 0;
            fSpliced    = nullptr;
            // If we return early anywhere in here, !fSpliced means we'll use fBackup instead.

+        #if !defined(__aarch64__)
            // To keep things simple, only one target supported: Haswell+ x86-64.
            if (!SkCpu::Supports(SkCpu::HSW) || sizeof(void*) != 8) {
                return;
            }
+        #endif

            SkDynamicMemoryWStream buf;

-            // Put the address of kConstants in rcx, Stage argument 4 "k".
-            splice(&buf, movabsq_rcx);
-            splice(&buf, &kConstants);
+            // Put the address of kConstants in rcx/x3, Stage argument 4 "k".
+            set_k(&buf, &kConstants);

-            // We'll loop back to here as long as x<n after x+=8.
+            // We'll loop back to here as long as x<n after x += kStride.
            iaca_start(&buf);
            auto loop_start = buf.bytesWritten();  // Think of this like a label, loop_start:

            for (int i = 0; i < nstages; i++) {
-                // If a stage has a context pointer, load it into rdx, Stage argument 3 "ctx".
+                // If a stage has a context pointer, load it into rdx/x2, Stage argument 3 "ctx".
                if (stages[i].ctx) {
-                    splice(&buf, movabsq_rdx);
-                    splice(&buf, stages[i].ctx);
+                    set_ctx(&buf, stages[i].ctx);
                }

                // Splice in the code for the Stages, generated offline into SkSplicer_generated.h.
@ -144,16 +192,9 @@ namespace {
                }
            }

-            // See if we should loop back to handle more pixels.
-            splice(&buf, addq_8_rdi);    // x += 8
-            splice(&buf, cmpq_rsi_rdi);  // if (x < n)
-            splice(&buf, jb_near);       //     goto loop_start;
-            splice(&buf, (int)loop_start - (int)(buf.bytesWritten() + 4));
+            loop(&buf, loop_start);  // Loop back to handle more pixels if not done.
            iaca_end(&buf);
-
-            // Nope!  We're done.
-            splice(&buf, vzeroupper);
-            splice(&buf, ret);
+            ret(&buf);  // We're done.

            auto data = buf.detachAsData();
            fSplicedLen = data->size();
@ -175,14 +216,15 @@ namespace {

        // Here's where we call fSpliced if we created it, fBackup if not.
        void operator()(size_t x, size_t y, size_t n) const {
-            // TODO: The looping logic is probably not correct for handling n<8 tails.
-            if (fSpliced) {
+            // TODO: The looping logic is probably not correct for n < kStride tails or x != 0.
+
+            size_t body = n/kStride*kStride;   // Largest multiple of kStride (4 or 8) <= n.
+            if (fSpliced && body) {            // Can we run fSpliced for at least one kStride?
                // TODO: At some point we will want to pass in y...
                using Fn = void(size_t x, size_t n);
-                ((Fn*)fSpliced)(x,n);
+                ((Fn*)fSpliced)(x,body);

-                // Fall through to fBackup for any n<8 last pixels.
-                size_t body = n/8*8;
+                // Fall through to fBackup for any n<kStride last pixels.
                x += body;
                n -= body;
            }
--- a/src/splicer/SkSplicer_generated.h
+++ b/src/splicer/SkSplicer_generated.h
@ -11,6 +11,279 @@
 // This file is generated semi-automatically with this command:
 //   $ src/splicer/build_stages.py > src/splicer/SkSplicer_generated.h

+#if defined(__aarch64__)
+
+static const unsigned int kSplice_clear[] = {
+    0x6f00e400,    // movi v0.2d, #0x0
+    0x6f00e401,    // movi v1.2d, #0x0
+    0x6f00e402,    // movi v2.2d, #0x0
+    0x6f00e403,    // movi v3.2d, #0x0
+};
+static const unsigned int kSplice_plus[] = {
+    0x4e24d400,    // fadd v0.4s, v0.4s, v4.4s
+    0x4e25d421,    // fadd v1.4s, v1.4s, v5.4s
+    0x4e26d442,    // fadd v2.4s, v2.4s, v6.4s
+    0x4e27d463,    // fadd v3.4s, v3.4s, v7.4s
+};
+static const unsigned int kSplice_srcover[] = {
+    0x91001068,    // add x8, x3, #0x4
+    0x4d40c910,    // ld1r {v16.4s}, [x8]
+    0x4ea3d610,    // fsub v16.4s, v16.4s, v3.4s
+    0x4e24ce00,    // fmla v0.4s, v16.4s, v4.4s
+    0x4e25ce01,    // fmla v1.4s, v16.4s, v5.4s
+    0x4e26ce02,    // fmla v2.4s, v16.4s, v6.4s
+    0x4e26ce03,    // fmla v3.4s, v16.4s, v6.4s
+};
+static const unsigned int kSplice_dstover[] = {
+    0x91001068,    // add x8, x3, #0x4
+    0x4d40c910,    // ld1r {v16.4s}, [x8]
+    0x4ea7d610,    // fsub v16.4s, v16.4s, v7.4s
+    0x4e20ce04,    // fmla v4.4s, v16.4s, v0.4s
+    0x4e21ce05,    // fmla v5.4s, v16.4s, v1.4s
+    0x4e22ce06,    // fmla v6.4s, v16.4s, v2.4s
+    0x4e22ce07,    // fmla v7.4s, v16.4s, v2.4s
+};
+static const unsigned int kSplice_clamp_0[] = {
+    0x6f00e410,    // movi v16.2d, #0x0
+    0x4e30f400,    // fmax v0.4s, v0.4s, v16.4s
+    0x4e30f421,    // fmax v1.4s, v1.4s, v16.4s
+    0x4e30f442,    // fmax v2.4s, v2.4s, v16.4s
+    0x4e30f463,    // fmax v3.4s, v3.4s, v16.4s
+};
+static const unsigned int kSplice_clamp_1[] = {
+    0x91001068,    // add x8, x3, #0x4
+    0x4d40c910,    // ld1r {v16.4s}, [x8]
+    0x4eb0f400,    // fmin v0.4s, v0.4s, v16.4s
+    0x4eb0f421,    // fmin v1.4s, v1.4s, v16.4s
+    0x4eb0f442,    // fmin v2.4s, v2.4s, v16.4s
+    0x4eb0f463,    // fmin v3.4s, v3.4s, v16.4s
+};
+static const unsigned int kSplice_clamp_a[] = {
+    0x91001068,    // add x8, x3, #0x4
+    0x4d40c910,    // ld1r {v16.4s}, [x8]
+    0x4eb0f463,    // fmin v3.4s, v3.4s, v16.4s
+    0x4ea3f400,    // fmin v0.4s, v0.4s, v3.4s
+    0x4ea3f421,    // fmin v1.4s, v1.4s, v3.4s
+    0x4ea3f442,    // fmin v2.4s, v2.4s, v3.4s
+};
+static const unsigned int kSplice_swap[] = {
+    0x4ea31c70,    // mov v16.16b, v3.16b
+    0x4ea21c51,    // mov v17.16b, v2.16b
+    0x4ea11c32,    // mov v18.16b, v1.16b
+    0x4ea01c13,    // mov v19.16b, v0.16b
+    0x4ea41c80,    // mov v0.16b, v4.16b
+    0x4ea51ca1,    // mov v1.16b, v5.16b
+    0x4ea61cc2,    // mov v2.16b, v6.16b
+    0x4ea71ce3,    // mov v3.16b, v7.16b
+    0x4eb31e64,    // mov v4.16b, v19.16b
+    0x4eb21e45,    // mov v5.16b, v18.16b
+    0x4eb11e26,    // mov v6.16b, v17.16b
+    0x4eb01e07,    // mov v7.16b, v16.16b
+};
+static const unsigned int kSplice_move_src_dst[] = {
+    0x4ea01c04,    // mov v4.16b, v0.16b
+    0x4ea11c25,    // mov v5.16b, v1.16b
+    0x4ea21c46,    // mov v6.16b, v2.16b
+    0x4ea31c67,    // mov v7.16b, v3.16b
+};
+static const unsigned int kSplice_move_dst_src[] = {
+    0x4ea41c80,    // mov v0.16b, v4.16b
+    0x4ea51ca1,    // mov v1.16b, v5.16b
+    0x4ea61cc2,    // mov v2.16b, v6.16b
+    0x4ea71ce3,    // mov v3.16b, v7.16b
+};
+static const unsigned int kSplice_premul[] = {
+    0x6e23dc00,    // fmul v0.4s, v0.4s, v3.4s
+    0x6e23dc21,    // fmul v1.4s, v1.4s, v3.4s
+    0x6e23dc42,    // fmul v2.4s, v2.4s, v3.4s
+};
+static const unsigned int kSplice_unpremul[] = {
+    0x91001068,    // add x8, x3, #0x4
+    0x4d40c910,    // ld1r {v16.4s}, [x8]
+    0x4ea0d871,    // fcmeq v17.4s, v3.4s, #0.0
+    0x6e23fe10,    // fdiv v16.4s, v16.4s, v3.4s
+    0x4e711e10,    // bic v16.16b, v16.16b, v17.16b
+    0x6e20de00,    // fmul v0.4s, v16.4s, v0.4s
+    0x6e21de01,    // fmul v1.4s, v16.4s, v1.4s
+    0x6e22de02,    // fmul v2.4s, v16.4s, v2.4s
+};
+static const unsigned int kSplice_from_srgb[] = {
+    0x91005068,    // add x8, x3, #0x14
+    0x4d40c910,    // ld1r {v16.4s}, [x8]
+    0x91004068,    // add x8, x3, #0x10
+    0x4d40c911,    // ld1r {v17.4s}, [x8]
+    0x2d434c72,    // ldp s18, s19, [x3,#24]
+    0x6e22dc54,    // fmul v20.4s, v2.4s, v2.4s
+    0x4eb01e15,    // mov v21.16b, v16.16b
+    0x4eb01e17,    // mov v23.16b, v16.16b
+    0x4f921050,    // fmla v16.4s, v2.4s, v18.s[0]
+    0x4eb11e36,    // mov v22.16b, v17.16b
+    0x4eb11e38,    // mov v24.16b, v17.16b
+    0x4e34ce11,    // fmla v17.4s, v16.4s, v20.4s
+    0x6e20dc10,    // fmul v16.4s, v0.4s, v0.4s
+    0x91008068,    // add x8, x3, #0x20
+    0x4f921015,    // fmla v21.4s, v0.4s, v18.s[0]
+    0x4e30ceb6,    // fmla v22.4s, v21.4s, v16.4s
+    0x4d40c910,    // ld1r {v16.4s}, [x8]
+    0x6e21dc34,    // fmul v20.4s, v1.4s, v1.4s
+    0x4f921037,    // fmla v23.4s, v1.4s, v18.s[0]
+    0x4f939015,    // fmul v21.4s, v0.4s, v19.s[0]
+    0x4f939032,    // fmul v18.4s, v1.4s, v19.s[0]
+    0x4f939053,    // fmul v19.4s, v2.4s, v19.s[0]
+    0x6ea0e600,    // fcmgt v0.4s, v16.4s, v0.4s
+    0x6ea1e601,    // fcmgt v1.4s, v16.4s, v1.4s
+    0x6ea2e602,    // fcmgt v2.4s, v16.4s, v2.4s
+    0x4e34cef8,    // fmla v24.4s, v23.4s, v20.4s
+    0x6e761ea0,    // bsl v0.16b, v21.16b, v22.16b
+    0x6e781e41,    // bsl v1.16b, v18.16b, v24.16b
+    0x6e711e62,    // bsl v2.16b, v19.16b, v17.16b
+};
+static const unsigned int kSplice_to_srgb[] = {
+    0x6ea1d810,    // frsqrte v16.4s, v0.4s
+    0x6ea1d835,    // frsqrte v21.4s, v1.4s
+    0x6e30de17,    // fmul v23.4s, v16.4s, v16.4s
+    0x6ea1d856,    // frsqrte v22.4s, v2.4s
+    0x6e35deb9,    // fmul v25.4s, v21.4s, v21.4s
+    0x4eb7fc17,    // frsqrts v23.4s, v0.4s, v23.4s
+    0x9100c068,    // add x8, x3, #0x30
+    0x6e36deda,    // fmul v26.4s, v22.4s, v22.4s
+    0x4eb9fc39,    // frsqrts v25.4s, v1.4s, v25.4s
+    0x6e37de10,    // fmul v16.4s, v16.4s, v23.4s
+    0x2d44c871,    // ldp s17, s18, [x3,#36]
+    0x4d40c914,    // ld1r {v20.4s}, [x8]
+    0x4ebafc5a,    // frsqrts v26.4s, v2.4s, v26.4s
+    0x6e39deb5,    // fmul v21.4s, v21.4s, v25.4s
+    0x4ea1da17,    // frecpe v23.4s, v16.4s
+    0xbd402c73,    // ldr s19, [x3,#44]
+    0x9100d068,    // add x8, x3, #0x34
+    0x6e3aded6,    // fmul v22.4s, v22.4s, v26.4s
+    0x4ea1dabb,    // frecpe v27.4s, v21.4s
+    0x4e37fe1d,    // frecps v29.4s, v16.4s, v23.4s
+    0x4d40c918,    // ld1r {v24.4s}, [x8]
+    0x4ea1dadc,    // frecpe v28.4s, v22.4s
+    0x6e3ddef7,    // fmul v23.4s, v23.4s, v29.4s
+    0x4e3bfebd,    // frecps v29.4s, v21.4s, v27.4s
+    0x6e3ddf7b,    // fmul v27.4s, v27.4s, v29.4s
+    0x4e3cfedd,    // frecps v29.4s, v22.4s, v28.4s
+    0x6e3ddf9c,    // fmul v28.4s, v28.4s, v29.4s
+    0x4eb41e9d,    // mov v29.16b, v20.16b
+    0x6ea1da19,    // frsqrte v25.4s, v16.4s
+    0x4f9312fd,    // fmla v29.4s, v23.4s, v19.s[0]
+    0x4eb41e97,    // mov v23.16b, v20.16b
+    0x4f91901a,    // fmul v26.4s, v0.4s, v17.s[0]
+    0x4f931377,    // fmla v23.4s, v27.4s, v19.s[0]
+    0x6ea1dabb,    // frsqrte v27.4s, v21.4s
+    0x4f931394,    // fmla v20.4s, v28.4s, v19.s[0]
+    0x4f919033,    // fmul v19.4s, v1.4s, v17.s[0]
+    0x4f919051,    // fmul v17.4s, v2.4s, v17.s[0]
+    0x6ea0e700,    // fcmgt v0.4s, v24.4s, v0.4s
+    0x6ea1e701,    // fcmgt v1.4s, v24.4s, v1.4s
+    0x6ea2e702,    // fcmgt v2.4s, v24.4s, v2.4s
+    0x6e39df38,    // fmul v24.4s, v25.4s, v25.4s
+    0x6ea1dadc,    // frsqrte v28.4s, v22.4s
+    0x4eb8fe10,    // frsqrts v16.4s, v16.4s, v24.4s
+    0x6e3bdf78,    // fmul v24.4s, v27.4s, v27.4s
+    0x4eb8feb5,    // frsqrts v21.4s, v21.4s, v24.4s
+    0x6e3cdf98,    // fmul v24.4s, v28.4s, v28.4s
+    0x91001068,    // add x8, x3, #0x4
+    0x4eb8fed6,    // frsqrts v22.4s, v22.4s, v24.4s
+    0x4d40c918,    // ld1r {v24.4s}, [x8]
+    0x6e30df30,    // fmul v16.4s, v25.4s, v16.4s
+    0x6e35df75,    // fmul v21.4s, v27.4s, v21.4s
+    0x6e36df96,    // fmul v22.4s, v28.4s, v22.4s
+    0x4f92121d,    // fmla v29.4s, v16.4s, v18.s[0]
+    0x4f9212b7,    // fmla v23.4s, v21.4s, v18.s[0]
+    0x4f9212d4,    // fmla v20.4s, v22.4s, v18.s[0]
+    0x4ebdf710,    // fmin v16.4s, v24.4s, v29.4s
+    0x4eb7f712,    // fmin v18.4s, v24.4s, v23.4s
+    0x4eb4f714,    // fmin v20.4s, v24.4s, v20.4s
+    0x6e701f40,    // bsl v0.16b, v26.16b, v16.16b
+    0x6e721e61,    // bsl v1.16b, v19.16b, v18.16b
+    0x6e741e22,    // bsl v2.16b, v17.16b, v20.16b
+};
+static const unsigned int kSplice_scale_u8[] = {
+    0xf9400048,    // ldr x8, [x2]
+    0xbd400c71,    // ldr s17, [x3,#12]
+    0x8b000108,    // add x8, x8, x0
+    0x39400109,    // ldrb w9, [x8]
+    0x3940050a,    // ldrb w10, [x8,#1]
+    0x4e021d30,    // mov v16.h[0], w9
+    0x39400909,    // ldrb w9, [x8,#2]
+    0x39400d08,    // ldrb w8, [x8,#3]
+    0x4e061d50,    // mov v16.h[1], w10
+    0x4e0a1d30,    // mov v16.h[2], w9
+    0x4e0e1d10,    // mov v16.h[3], w8
+    0x2f07b7f0,    // bic v16.4h, #0xff, lsl #8
+    0x2f10a610,    // uxtl v16.4s, v16.4h
+    0x6e21da10,    // ucvtf v16.4s, v16.4s
+    0x4f919210,    // fmul v16.4s, v16.4s, v17.s[0]
+    0x6e20de00,    // fmul v0.4s, v16.4s, v0.4s
+    0x6e21de01,    // fmul v1.4s, v16.4s, v1.4s
+    0x6e22de02,    // fmul v2.4s, v16.4s, v2.4s
+    0x6e23de03,    // fmul v3.4s, v16.4s, v3.4s
+};
+static const unsigned int kSplice_load_8888[] = {
+    0xf9400048,    // ldr x8, [x2]
+    0xd37ef409,    // lsl x9, x0, #2
+    0x4d40c860,    // ld1r {v0.4s}, [x3]
+    0xbd400c63,    // ldr s3, [x3,#12]
+    0x3ce96901,    // ldr q1, [x8,x9]
+    0x4e211c02,    // and v2.16b, v0.16b, v1.16b
+    0x6f380430,    // ushr v16.4s, v1.4s, #8
+    0x6f300431,    // ushr v17.4s, v1.4s, #16
+    0x6f280421,    // ushr v1.4s, v1.4s, #24
+    0x4e21d842,    // scvtf v2.4s, v2.4s
+    0x4e301c10,    // and v16.16b, v0.16b, v16.16b
+    0x4e311c11,    // and v17.16b, v0.16b, v17.16b
+    0x4e21d832,    // scvtf v18.4s, v1.4s
+    0x4f839040,    // fmul v0.4s, v2.4s, v3.s[0]
+    0x4e21da01,    // scvtf v1.4s, v16.4s
+    0x4e21da22,    // scvtf v2.4s, v17.4s
+    0x4f839021,    // fmul v1.4s, v1.4s, v3.s[0]
+    0x4f839042,    // fmul v2.4s, v2.4s, v3.s[0]
+    0x4f839243,    // fmul v3.4s, v18.4s, v3.s[0]
+};
+static const unsigned int kSplice_store_8888[] = {
+    0xbd400870,    // ldr s16, [x3,#8]
+    0xf9400048,    // ldr x8, [x2]
+    0xd37ef409,    // lsl x9, x0, #2
+    0x4f909032,    // fmul v18.4s, v1.4s, v16.s[0]
+    0x4f909011,    // fmul v17.4s, v0.4s, v16.s[0]
+    0x6e21aa52,    // fcvtnu v18.4s, v18.4s
+    0x6e21aa31,    // fcvtnu v17.4s, v17.4s
+    0x4f285652,    // shl v18.4s, v18.4s, #8
+    0x4eb11e51,    // orr v17.16b, v18.16b, v17.16b
+    0x4f909052,    // fmul v18.4s, v2.4s, v16.s[0]
+    0x4f909070,    // fmul v16.4s, v3.4s, v16.s[0]
+    0x6e21aa52,    // fcvtnu v18.4s, v18.4s
+    0x6e21aa10,    // fcvtnu v16.4s, v16.4s
+    0x4f305652,    // shl v18.4s, v18.4s, #16
+    0x4eb21e31,    // orr v17.16b, v17.16b, v18.16b
+    0x4f385610,    // shl v16.4s, v16.4s, #24
+    0x4eb01e30,    // orr v16.16b, v17.16b, v16.16b
+    0x3ca96910,    // str q16, [x8,x9]
+};
+static const unsigned int kSplice_load_f16[] = {
+    0xf9400048,    // ldr x8, [x2]
+    0x8b000d08,    // add x8, x8, x0, lsl #3
+    0x0c400510,    // ld4 {v16.4h-v19.4h}, [x8]
+    0x0e217a00,    // fcvtl v0.4s, v16.4h
+    0x0e217a21,    // fcvtl v1.4s, v17.4h
+    0x0e217a42,    // fcvtl v2.4s, v18.4h
+    0x0e217a63,    // fcvtl v3.4s, v19.4h
+};
+static const unsigned int kSplice_store_f16[] = {
+    0xf9400048,    // ldr x8, [x2]
+    0x0e216810,    // fcvtn v16.4h, v0.4s
+    0x0e216831,    // fcvtn v17.4h, v1.4s
+    0x0e216852,    // fcvtn v18.4h, v2.4s
+    0x8b000d08,    // add x8, x8, x0, lsl #3
+    0x0e216873,    // fcvtn v19.4h, v3.4s
+    0x0c000510,    // st4 {v16.4h-v19.4h}, [x8]
+};
+
+#else
+
 static const unsigned char kSplice_clear[] = {
    0xc5,0xfc,0x57,0xc0,                        // vxorps       %ymm0, %ymm0, %ymm0
    0xc5,0xf4,0x57,0xc9,                        // vxorps       %ymm1, %ymm1, %ymm1
@ -255,4 +528,7 @@ static const unsigned char kSplice_store_f16[] = {
    0xc4,0x41,0x39,0x6a,0xc2,                   // vpunpckhdq   %xmm10, %xmm8, %xmm8
    0xc5,0x7a,0x7f,0x44,0xf8,0x30,              // vmovdqu      %xmm8, 0x30(%rax,%rdi,8)
 };
+
+#endif
+
 #endif//SkSplicer_generated_DEFINED
--- a/src/splicer/SkSplicer_stages.cpp
+++ b/src/splicer/SkSplicer_stages.cpp
@ -6,52 +6,71 @@
 */

 #include "SkSplicer_shared.h"
-#include <immintrin.h>
 #include <string.h>

-#if !defined(__clang__) || !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
-    #error This file is not like the rest of Skia.
-    #error It must be compiled with clang and with -mavx2 -mfma -mf16c -fomit-frame-pointer.
+#if !defined(__clang__)
+    #error This file is not like the rest of Skia.  It must be compiled with clang.
 #endif

 // We have very specific inlining requirements.  It helps to just take total control.
 #define AI __attribute__((always_inline)) inline

+#if defined(__aarch64__)
+    #include <arm_neon.h>
+
+    // Since we know we're using Clang, we can use its vector extensions.
+    using F   = float    __attribute__((ext_vector_type(4)));
+    using I32 =  int32_t __attribute__((ext_vector_type(4)));
+    using U32 = uint32_t __attribute__((ext_vector_type(4)));
+    using U8  = uint8_t  __attribute__((ext_vector_type(4)));
+
+    // We polyfill a few routines that Clang doesn't build into ext_vector_types.
+    AI static U32 round(F v)                           { return vcvtnq_u32_f32(v);       }
+    AI static F   min(F a, F b)                        { return vminq_f32(a,b);          }
+    AI static F   max(F a, F b)                        { return vmaxq_f32(a,b);          }
+    AI static F   fma(F f, F m, F a)                   { return vfmaq_f32(a,f,m);        }
+    AI static F   rcp  (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e  ) * e; }
+    AI static F   rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
+    AI static F   if_then_else(I32 c, F t, F e)        { return vbslq_f32((U32)c,t,e);   }
+#else
+    #if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
+        #error On x86, compile with -mavx2 -mfma -mf16c.
+    #endif
+    #include <immintrin.h>
+
+    // These are __m256 and __m256i, but friendlier and strongly-typed.
+    using F   = float    __attribute__((ext_vector_type(8)));
+    using I32 =  int32_t __attribute__((ext_vector_type(8)));
+    using U32 = uint32_t __attribute__((ext_vector_type(8)));
+    using U8  = uint8_t  __attribute__((ext_vector_type(8)));
+
+    AI static U32 round(F v)                    { return _mm256_cvtps_epi32(v); }
+    AI static F   min(F a, F b)                 { return _mm256_min_ps  (a,b);  }
+    AI static F   max(F a, F b)                 { return _mm256_max_ps  (a,b);  }
+    AI static F   fma(F f, F m, F a)            { return _mm256_fmadd_ps(f,m,a);}
+    AI static F   rcp  (F v)                    { return _mm256_rcp_ps     (v); }
+    AI static F   rsqrt(F v)                    { return _mm256_rsqrt_ps   (v); }
+    AI static F   if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
+#endif
+
+AI static F   cast  (U32 v) { return __builtin_convertvector((I32)v, F);   }
+AI static U32 expand(U8  v) { return __builtin_convertvector(     v, U32); }
+
 // We'll be compiling this file to an object file, then extracting parts of it into
 // SkSplicer_generated.h.  It's easier to do if the function names are not C++ mangled.
 #define C extern "C"

-// Since we know we're using Clang, we can use its vector extensions.
-// These are __m256 and __m256i, but friendlier and strongly-typed.
-using F   = float    __attribute__((ext_vector_type(8)));
-using I32 =  int32_t __attribute__((ext_vector_type(8)));
-using U32 = uint32_t __attribute__((ext_vector_type(8)));
-using U8  = uint8_t  __attribute__((ext_vector_type(8)));
-
-// We polyfill a few routines that Clang doesn't build into ext_vector_types.
-AI static F   cast  (U32 v) { return __builtin_convertvector((I32)v, F); }
-AI static U32 round (F   v) { return _mm256_cvtps_epi32(v); }
-AI static U32 expand(U8  v) { return __builtin_convertvector(v, U32); }
-
-AI static F rcp  (F v)           { return _mm256_rcp_ps  (v); }
-AI static F rsqrt(F v)           { return _mm256_rsqrt_ps(v); }
-AI static F min  (F a, F b)      { return _mm256_min_ps  (a,b); }
-AI static F max  (F a, F b)      { return _mm256_max_ps  (a,b); }
-AI static F fma  (F f, F m, F a) { return _mm256_fmadd_ps(f,m,a); }
-
-AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
-
 // Stages all fit a common interface that allows SkSplicer to splice them together.
 using K = const SkSplicer_constants;
 using Stage = void(size_t x, size_t n, void* ctx, K* constants, F,F,F,F, F,F,F,F);

 // Stage's arguments act as the working set of registers within the final spliced function.
-// Here's a little primer on the ABI:
-//   x:         rdi         x and n work to drive the loop, like for (; x < n; x += 8)
-//   n:         rsi
-//   ctx:       rdx         Look for movabsq_rdx in SkSplicer.cpp to see how this works.
-//   constants: rcx         Look for movabsq_rcx in SkSplicer.cpp to see how this works.
-//   vectors:   ymm0-ymm7
+// Here's a little primer on the x86-64/aarch64 ABIs:
+//   x:         rdi/x0          x and n work to drive the loop, like for (; x < n; x += 4 or 8)
+//   n:         rsi/x1
+//   ctx:       rdx/x2          Look for movabsq_rdx in SkSplicer.cpp to see how this works.
+//   constants: rcx/x3          Look for movabsq_rcx in SkSplicer.cpp to see how this works.
+//   vectors:   ymm0-ymm7/v0-v7


 // done() is the key to this entire splicing strategy.
@ -231,6 +250,13 @@ STAGE(store_8888) {
 STAGE(load_f16) {
    auto ptr = *(const uint64_t**)ctx + x;

+#if defined(__aarch64__)
+    auto halfs = vld4_f16((const float16_t*)ptr);
+    r = vcvt_f32_f16(halfs.val[0]);
+    g = vcvt_f32_f16(halfs.val[1]);
+    b = vcvt_f32_f16(halfs.val[2]);
+    a = vcvt_f32_f16(halfs.val[3]);
+#else
    auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
         _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
         _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
@ -250,11 +276,21 @@ STAGE(load_f16) {
    g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567));
    b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
    a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
+#endif
 }

 STAGE(store_f16) {
    auto ptr = *(uint64_t**)ctx + x;

+#if defined(__aarch64__)
+    float16x4x4_t halfs = {{
+        vcvt_f16_f32(r),
+        vcvt_f16_f32(g),
+        vcvt_f16_f32(b),
+        vcvt_f16_f32(a),
+    }};
+    vst4_f16((float16_t*)ptr, halfs);
+#else
    auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
         G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
         B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
@ -269,4 +305,5 @@ STAGE(store_f16) {
    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
    _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
    _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
+#endif
 }
--- a/src/splicer/build_stages.py
+++ b/src/splicer/build_stages.py
@ -9,11 +9,21 @@ import re
 import subprocess
 import sys

-cflags = '-std=c++11 -Os -fomit-frame-pointer -mavx2 -mfma -mf16c'
+cflags = '-std=c++11 -Os -fomit-frame-pointer'.split()

-subprocess.check_call(['clang++'] + cflags.split() +
+hsw = '-mavx2 -mfma -mf16c'.split()
+subprocess.check_call(['clang++'] + cflags + hsw +
                      ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
-                      ['-o', 'stages.o'])
+                      ['-o', 'hsw.o'])
+
+aarch64 = [
+    '--target=aarch64-linux-android',
+    '--sysroot=' +
+    '/Users/mtklein/brew/opt/android-ndk/platforms/android-21/arch-arm64',
+]
+subprocess.check_call(['clang++'] + cflags + aarch64 +
+                      ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
+                      ['-o', 'aarch64.o'])

 print '''/*
 * Copyright 2017 Google Inc.
@ -27,11 +37,37 @@ print '''/*

 // This file is generated semi-automatically with this command:
 //   $ src/splicer/build_stages.py > src/splicer/SkSplicer_generated.h
+
+#if defined(__aarch64__)
+'''
+for line in subprocess.check_output(['gobjdump', '-d',
+                                     'aarch64.o']).split('\n'):
+  line = line.strip()
+  if not line or line.startswith('aarch64.o') or line.startswith('Disassembly'):
+    continue
+
+  m = re.match('''................ <(.*)>:''', line)
+  if m:
+    print 'static const unsigned int kSplice_' + m.group(1) + '[] = {'
+    continue
+
+  _, code, inst, args = line.split('\t')
+  code = code.strip()
+
+  # b done, where done has not yet been filled in by the linker.
+  if code == '14000000':
+    print '};'
+    continue
+  print '   ', '0x'+code+',' + '    // ' + inst + ' ' + args
+
+print '''
+#else
 '''

-for line in subprocess.check_output(['otool', '-tvj', 'stages.o']).split('\n'):
+# TODO: port this to gobjdump too
+for line in subprocess.check_output(['otool', '-tvj', 'hsw.o']).split('\n'):
  line = line.strip()
-  if line == '' or line == 'stages.o:' or line == '(__TEXT,__text) section':
+  if line == '' or line == 'hsw.o:' or line == '(__TEXT,__text) section':
    continue

  m = re.match('_(.*):', line)
@ -41,33 +77,37 @@ for line in subprocess.check_output(['otool', '-tvj', 'stages.o']).split('\n'):
    continue

  # Skip the leading 16 byte address and a tab,
-  # leaving the hex and mnemonics of each instruction.
+  # leaving the code, instruction mnemonic, and its arguments.
  line = line[17:]
  columns = line.split('\t')
-  _hex  = columns[0].strip()
-  instr = columns[1]
-  args  = columns[2:]
+  code = columns[0].strip()
+  inst = columns[1]
+  args = columns[2:]

  # We can't splice code that uses rip relative addressing.
  for arg in args:
    assert 'rip' not in arg

  # jmp done, the end of each stage (the address of done is not yet filled in)
-  if _hex == 'e9 00 00 00 00':
+  if code == 'e9 00 00 00 00':
    print '};'
    continue

  sys.stdout.write('    ')
-  _bytes = _hex.split(' ')
+  _bytes = code.split(' ')
  # This is the meat of things: copy the code to a C unsigned char array.
  for byte in _bytes:
    sys.stdout.write('0x' + byte + ',')
  # From here on we're just making the generated file readable and pretty.
  sys.stdout.write(' ' * (44 - 5*len(_bytes)))
-  sys.stdout.write('// ' + instr)
+  sys.stdout.write('// ' + inst)
  if args:
-    sys.stdout.write(' ' * (13 - len(instr)))
+    sys.stdout.write(' ' * (13 - len(inst)))
    sys.stdout.write(' '.join(args))
  sys.stdout.write('\n')

+print '''
+#endif
+'''
+
 print '''#endif//SkSplicer_generated_DEFINED'''