From 57b0b09dfdb6f01c43b76cd56e8477ab30230ab0 Mon Sep 17 00:00:00 2001 From: Mike Klein Date: Fri, 12 Jul 2019 10:09:14 -0500 Subject: [PATCH] handle any N in aarch64 JIT code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a tail loop to handle elements one at a time. Just like in the interpreter, the only instructions that need to be changed are the loads and stores, 16 byte -> 4 byte and 4 byte -> 1 byte. With this we can mark the interpreter as SkUNREACHABLE, and it even completely compiles away, saving a few KB. Example profile for the SkVMTool float-squaring program running N=15 over and over: Samples│ │ skvm-jit-3663518994(): 42 │40: cmp x0, #0x4 │44: ↓ b.lt 60 51 │48: ldr q0, [x1] 197 │4c: mul v0.4s, v0.4s, v0.4s 135 │50: str q0, [x1] │54: add x1, x1, #0x10 43 │58: sub x0, x0, #0x4 │5c: b.al 40 150 │60: ↓ cbz x0, 7c 67 │64: ldr s0, [x1] 130 │68: mul v0.4s, v0.4s, v0.4s 135 │6c: str s0, [x1] 18 │70: add x1, x1, #0x4 17 │74: sub x0, x0, #0x1 20 │78: b.al 60 124 │7c: ← ret Change-Id: I153d7bc247942366a686e30a9cad60c935f754ed Reviewed-on: https://skia-review.googlesource.com/c/skia/+/227138 Commit-Queue: Mike Klein Reviewed-by: Herb Derby --- src/core/SkVM.cpp | 98 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 16 deletions(-) diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp index 70eb981ea3..d3f5bcef31 100644 --- a/src/core/SkVM.cpp +++ b/src/core/SkVM.cpp @@ -983,12 +983,35 @@ namespace skvm { *code = a.size(); - A::Label loop_label; - for (int i = 0; i < (int)instructions.size(); i++) { - if (i == loop) { - loop_label = a.here(); - } - const Program::Instruction& inst = instructions[i]; + // Our program runs a 4-at-a-time body loop, then a 1-at-a-time tail loop to + // handle all N values, with an overall layout looking like + // + // buf: ... + // data for splats and tbl + // ... + // + // code: ... + // hoisted instructions + // ...
+ // + // body: cmp N,4 # if (n < 4) + // b.lt tail # goto tail + // ... + // instructions handling 4 at a time + // ... + // sub N,4 + // b body + // + // tail: cbz N,done # if (n == 0) goto done + // ... + // instructions handling 1 at a time + // ... + // sub N,1 + // b tail + // + // done: ret + + auto emit = [&](const Program::Instruction& inst, bool scalar) { Op op = inst.op; Reg d = inst.d, @@ -999,15 +1022,24 @@ namespace skvm { switch (op) { case Op::store8: a.xtns2h(r(tmp), r(x)); a.xtnh2b(r(tmp), r(tmp)); - a.strs (r(tmp), arg[imm]); + if (scalar) { a.strb (r(tmp), arg[imm]); } + else { a.strs (r(tmp), arg[imm]); } + break; + case Op::store32: + if (scalar) { a.strs(r(x), arg[imm]); } + else { a.strq(r(x), arg[imm]); } break; - case Op::store32: a.strq(r(x), arg[imm]); break; - case Op::load8: a.ldrs (r(tmp), arg[imm]); - a.uxtlb2h(r(tmp), r(tmp)); - a.uxtlh2s(r(d) , r(tmp)); - break; - case Op::load32: a.ldrq(r(d), arg[imm]); break; + case Op::load8: + if (scalar) { a.ldrb (r(tmp), arg[imm]); } + else { a.ldrs (r(tmp), arg[imm]); } + a.uxtlb2h(r(tmp), r(tmp)); + a.uxtlh2s(r(d) , r(tmp)); + break; + case Op::load32: + if (scalar) { a.ldrs(r(d), arg[imm]); } + else { a.ldrq(r(d), arg[imm]); } + break; case Op::splat: a.ldrq(r(d), splats.find(imm)); break; @@ -1060,16 +1092,47 @@ namespace skvm { a.tbl (r(d), r(x), r(tmp)); break; } + }; + + A::Label body, + tail, + done; + + // Hoisted instructions. + for (int i = 0; i < loop; i++) { + emit(instructions[i], /*scalar=*/false); } + // Body 4-at-a-time loop. + a.label(&body); + a.cmp(N, K); + a.blt(&tail); + for (int i = loop; i < (int)instructions.size(); i++) { + emit(instructions[i], /*scalar=*/false); + } for (int i = 0; i < nargs; i++) { a.add(arg[i], arg[i], K*(int)strides[i]); } - a.subs(N, N, K); - a.bne(&loop_label); + a.sub(N, N, K); + a.b(&body); + + // Tail 1-at-a-time loop. 
+ a.label(&tail); + a.cbz(N, &done); + for (int i = loop; i < (int)instructions.size(); i++) { + emit(instructions[i], /*scalar=*/true); + } + for (int i = 0; i < nargs; i++) { + a.add(arg[i], arg[i], 1*(int)strides[i]); + } + a.sub(N, N, 1); + a.b(&tail); + + a.label(&done); a.ret(A::x30); - return ~(K-1); + // We can handle any N. + return ~0; } #else // not x86-64 or aarch64 @@ -1281,6 +1344,9 @@ namespace skvm { } if (n) { + #if defined(__aarch64__) && defined(SKVM_JIT) + SkUNREACHABLE; + #endif // We'll operate in SIMT style, knocking off K-size chunks from n while possible. constexpr int K = 16; using I32 = skvx::Vec;