handle any N in aarch64 JIT code

Add a tail loop to handle elements one at a time. Just like in the interpreter, the only instructions that need to be changed are the loads and stores, 16 byte -> 4 byte and 4 byte -> 1 byte. With this we can mark the interpreter as SkUNREACHABLE, and it even completely compiles away, saving a few KB. Example profile for the SkVMTool float-squaring program running N=15 over and over: Samples│ │ skvm-jit-3663518994(): 42 │40: cmp x0, #0x4 │44: ↓ b.lt 60 51 │48: ldr q0, [x1] 197 │4c: mul v0.4s, v0.4s, v0.4s 135 │50: str q0, [x1] │54: add x1, x1, #0x10 43 │58: sub x0, x0, #0x4 │5c: b.al 40 150 │60: ↓ cbz x0, 7c 67 │64: ldr s0, [x1] 130 │68: mul v0.4s, v0.4s, v0.4s 135 │6c: str s0, [x1] 18 │70: add x1, x1, #0x4 17 │74: sub x0, x0, #0x1 20 │78: b.al 60 124 │7c: ← ret Change-Id: I153d7bc247942366a686e30a9cad60c935f754ed Reviewed-on: https://skia-review.googlesource.com/c/skia/+/227138 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Herb Derby <herb@google.com>
2019-07-12 10:09:14 -05:00 · 2019-07-12 10:09:14 -05:00 · 57b0b09dfd
commit 57b0b09dfd
parent aa04068be4
1 changed files with 82 additions and 16 deletions
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@ -983,12 +983,35 @@ namespace skvm {
        *code = a.size();
-        A::Label loop_label;
+        // Our program runs a 4-at-a-time body loop, then a 1-at-at-time tail loop to
-        for (int i = 0; i < (int)instructions.size(); i++) {
+        // handle all N values, with an overall layout looking like
-            if (i == loop) {
+        //
-                loop_label = a.here();
+        // buf:   ...
-            }
+        //        data for splats and tbl
-            const Program::Instruction& inst = instructions[i];
+        //        ...
        //
        // code:  ...
        //        hoisted instructions
        //        ...
        //
        // body:  cmp N,4       # if (n < 4)
        //        b.lt tail     #    goto tail
        //        ...
        //        instructions handling 4 at a time
        //        ...
        //        sub N,4
        //        b body
        //
        // tail:  cbz N,done    # if (n == 0) goto done
        //        ...
        //        instructions handling 1 at a time
        //        ...
        //        sub N,1
        //        b tail
        //
        // done:  ret
        auto emit = [&](const Program::Instruction& inst, bool scalar) {
            Op  op = inst.op;
            Reg   d = inst.d,
@ -999,15 +1022,24 @@ namespace skvm {
            switch (op) {
                case Op::store8: a.xtns2h(r(tmp), r(x));
                                 a.xtnh2b(r(tmp), r(tmp));
-                                 a.strs  (r(tmp), arg[imm]);
+                   if (scalar) { a.strb  (r(tmp), arg[imm]); }
                   else        { a.strs  (r(tmp), arg[imm]); }
                                 break;
                case Op::store32:
                   if (scalar) { a.strs(r(x), arg[imm]); }
                   else        { a.strq(r(x), arg[imm]); }
                                 break;
                case Op::store32: a.strq(r(x), arg[imm]); break;
-                case Op::load8: a.ldrs   (r(tmp), arg[imm]);
+                case Op::load8:
-                                a.uxtlb2h(r(tmp), r(tmp));
+                   if (scalar) { a.ldrb   (r(tmp), arg[imm]); }
-                                a.uxtlh2s(r(d)  , r(tmp));
+                   else        { a.ldrs   (r(tmp), arg[imm]); }
-                                break;
+                                 a.uxtlb2h(r(tmp), r(tmp));
-                case Op::load32: a.ldrq(r(d), arg[imm]); break;
+                                 a.uxtlh2s(r(d)  , r(tmp));
                                 break;
                case Op::load32:
                   if (scalar) { a.ldrs(r(d), arg[imm]); }
                   else        { a.ldrq(r(d), arg[imm]); }
                                 break;
                case Op::splat: a.ldrq(r(d), splats.find(imm)); break;
@ -1060,16 +1092,47 @@ namespace skvm {
                                a.tbl (r(d), r(x), r(tmp));
                                break;
            }
        };
        A::Label body,
                 tail,
                 done;
        // Hoisted instructions.
        for (int i = 0; i < loop; i++) {
            emit(instructions[i], /*scalar=*/false);
        }
        // Body 4-at-a-time loop.
    a.label(&body);
        a.cmp(N, K);
        a.blt(&tail);
        for (int i = loop; i < (int)instructions.size(); i++) {
            emit(instructions[i], /*scalar=*/false);
        }
        for (int i = 0; i < nargs; i++) {
            a.add(arg[i], arg[i], K*(int)strides[i]);
        }
-        a.subs(N, N, K);
+        a.sub(N, N, K);
-        a.bne(&loop_label);
+        a.b(&body);
        // Tail 1-at-a-time loop.
    a.label(&tail);
        a.cbz(N, &done);
        for (int i = loop; i < (int)instructions.size(); i++) {
            emit(instructions[i], /*scalar=*/true);
        }
        for (int i = 0; i < nargs; i++) {
            a.add(arg[i], arg[i], 1*(int)strides[i]);
        }
        a.sub(N, N, 1);
        a.b(&tail);
    a.label(&done);
        a.ret(A::x30);
-        return ~(K-1);
+        // We can handle any N.
        return ~0;
    }
    #else  // not x86-64 or aarch64
@ -1281,6 +1344,9 @@ namespace skvm {
        }
        if (n) {
    #if defined(__aarch64__) && defined(SKVM_JIT)
            SkUNREACHABLE;
    #endif
            // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
            constexpr int K = 16;
            using I32 = skvx::Vec<K, int>;