handle any N in aarch64 JIT code

Add a tail loop to handle elements one at a time.

Just like in the interpreter, the only instructions that need to be changed are the loads and stores, 16 byte -> 4 byte and 4 byte -> 1 byte.

With this we can mark the interpreter as SkUNREACHABLE, and it even completely compiles away, saving a few KB.

Example profile for the SkVMTool float-squaring program running N=15 over and over:

    Samples│
           │    skvm-jit-3663518994():
        42 │40:   cmp    x0, #0x4
           │44: ↓ b.lt   60
        51 │48:   ldr    q0, [x1]
       197 │4c:   mul    v0.4s, v0.4s, v0.4s
       135 │50:   str    q0, [x1]
           │54:   add    x1, x1, #0x10
        43 │58:   sub    x0, x0, #0x4
           │5c:   b.al   40
       150 │60: ↓ cbz    x0, 7c
        67 │64:   ldr    s0, [x1]
       130 │68:   mul    v0.4s, v0.4s, v0.4s
       135 │6c:   str    s0, [x1]
        18 │70:   add    x1, x1, #0x4
        17 │74:   sub    x0, x0, #0x1
        20 │78:   b.al   60
       124 │7c: ← ret

Change-Id: I153d7bc247942366a686e30a9cad60c935f754ed
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/227138
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
2019-07-12 10:09:14 -05:00 · 2019-07-12 10:09:14 -05:00 · 57b0b09dfd
commit 57b0b09dfd
parent aa04068be4
1 changed files with 82 additions and 16 deletions
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@ -983,12 +983,35 @@ namespace skvm {

        *code = a.size();

-        A::Label loop_label;
-        for (int i = 0; i < (int)instructions.size(); i++) {
-            if (i == loop) {
-                loop_label = a.here();
-            }
-            const Program::Instruction& inst = instructions[i];
+        // Our program runs a 4-at-a-time body loop, then a 1-at-a-time tail loop to
+        // handle all N values, with an overall layout looking like
+        //
+        // buf:   ...
+        //        data for splats and tbl
+        //        ...
+        //
+        // code:  ...
+        //        hoisted instructions
+        //        ...
+        //
+        // body:  cmp N,4       # if (n < 4)
+        //        b.lt tail     #    goto tail
+        //        ...
+        //        instructions handling 4 at a time
+        //        ...
+        //        sub N,4
+        //        b body
+        //
+        // tail:  cbz N,done    # if (n == 0) goto done
+        //        ...
+        //        instructions handling 1 at a time
+        //        ...
+        //        sub N,1
+        //        b tail
+        //
+        // done:  ret
+
+        auto emit = [&](const Program::Instruction& inst, bool scalar) {
            Op  op = inst.op;

            Reg   d = inst.d,
@ -999,15 +1022,24 @@ namespace skvm {
            switch (op) {
                case Op::store8: a.xtns2h(r(tmp), r(x));
                                 a.xtnh2b(r(tmp), r(tmp));
-                                 a.strs  (r(tmp), arg[imm]);
+                   if (scalar) { a.strb  (r(tmp), arg[imm]); }
+                   else        { a.strs  (r(tmp), arg[imm]); }
+                                 break;
+                case Op::store32:
+                   if (scalar) { a.strs(r(x), arg[imm]); }
+                   else        { a.strq(r(x), arg[imm]); }
                                 break;
-                case Op::store32: a.strq(r(x), arg[imm]); break;

-                case Op::load8: a.ldrs   (r(tmp), arg[imm]);
-                                a.uxtlb2h(r(tmp), r(tmp));
-                                a.uxtlh2s(r(d)  , r(tmp));
-                                break;
-                case Op::load32: a.ldrq(r(d), arg[imm]); break;
+                case Op::load8:
+                   if (scalar) { a.ldrb   (r(tmp), arg[imm]); }
+                   else        { a.ldrs   (r(tmp), arg[imm]); }
+                                 a.uxtlb2h(r(tmp), r(tmp));
+                                 a.uxtlh2s(r(d)  , r(tmp));
+                                 break;
+                case Op::load32:
+                   if (scalar) { a.ldrs(r(d), arg[imm]); }
+                   else        { a.ldrq(r(d), arg[imm]); }
+                                 break;

                case Op::splat: a.ldrq(r(d), splats.find(imm)); break;

@ -1060,16 +1092,47 @@ namespace skvm {
                                a.tbl (r(d), r(x), r(tmp));
                                break;
            }
+        };
+
+        A::Label body,
+                 tail,
+                 done;
+
+        // Hoisted instructions.
+        for (int i = 0; i < loop; i++) {
+            emit(instructions[i], /*scalar=*/false);
        }

+        // Body 4-at-a-time loop.
+    a.label(&body);
+        a.cmp(N, K);
+        a.blt(&tail);
+        for (int i = loop; i < (int)instructions.size(); i++) {
+            emit(instructions[i], /*scalar=*/false);
+        }
        for (int i = 0; i < nargs; i++) {
            a.add(arg[i], arg[i], K*(int)strides[i]);
        }
-        a.subs(N, N, K);
-        a.bne(&loop_label);
+        a.sub(N, N, K);
+        a.b(&body);
+
+        // Tail 1-at-a-time loop.
+    a.label(&tail);
+        a.cbz(N, &done);
+        for (int i = loop; i < (int)instructions.size(); i++) {
+            emit(instructions[i], /*scalar=*/true);
+        }
+        for (int i = 0; i < nargs; i++) {
+            a.add(arg[i], arg[i], 1*(int)strides[i]);
+        }
+        a.sub(N, N, 1);
+        a.b(&tail);
+
+    a.label(&done);
        a.ret(A::x30);

-        return ~(K-1);
+        // We can handle any N.
+        return ~0;
    }

    #else  // not x86-64 or aarch64
@ -1281,6 +1344,9 @@ namespace skvm {
        }

        if (n) {
+    #if defined(__aarch64__) && defined(SKVM_JIT)
+            SkUNREACHABLE;
+    #endif
            // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
            constexpr int K = 16;
            using I32 = skvx::Vec<K, int>;