handle any N in aarch64 JIT code

Add a tail loop to handle elements one at a time.

Just like in the interpreter, the only instructions
that need to be changed are the loads and stores,
16 byte -> 4 byte and 4 byte -> 1 byte.

With this we can mark the interpreter as SkUNREACHABLE on aarch64 JIT builds,
and there it compiles away completely, saving a few KB.

Example profile for the SkVMTool float-squaring program
running N=15 over and over:

    Samples│
           │      skvm-jit-3663518994():
        42 │40:   cmp    x0, #0x4
           │44: ↓ b.lt   60
        51 │48:   ldr    q0, [x1]
       197 │4c:   mul    v0.4s, v0.4s, v0.4s
       135 │50:   str    q0, [x1]
           │54:   add    x1, x1, #0x10
        43 │58:   sub    x0, x0, #0x4
           │5c:   b.al   40
       150 │60: ↓ cbz    x0, 7c
        67 │64:   ldr    s0, [x1]
       130 │68:   mul    v0.4s, v0.4s, v0.4s
       135 │6c:   str    s0, [x1]
        18 │70:   add    x1, x1, #0x4
        17 │74:   sub    x0, x0, #0x1
        20 │78:   b.al   60
       124 │7c: ← ret

Change-Id: I153d7bc247942366a686e30a9cad60c935f754ed
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/227138
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
This commit is contained in:
Mike Klein 2019-07-12 10:09:14 -05:00 committed by Skia Commit-Bot
parent aa04068be4
commit 57b0b09dfd

View File

@ -983,12 +983,35 @@ namespace skvm {
*code = a.size();
A::Label loop_label;
for (int i = 0; i < (int)instructions.size(); i++) {
if (i == loop) {
loop_label = a.here();
}
const Program::Instruction& inst = instructions[i];
// Our program runs a 4-at-a-time body loop, then a 1-at-a-time tail loop to
// handle all N values, with an overall layout looking like
//
// buf: ...
// data for splats and tbl
// ...
//
// code: ...
// hoisted instructions
// ...
//
// body: cmp N,4 # if (n < 4)
// b.lt tail # goto tail
// ...
// instructions handling 4 at a time
// ...
// sub N,4
// b body
//
// tail: cbz N,done # if (n == 0) goto done
// ...
// instructions handling 1 at a time
// ...
// sub N,1
// b tail
//
// done: ret
auto emit = [&](const Program::Instruction& inst, bool scalar) {
Op op = inst.op;
Reg d = inst.d,
@ -999,15 +1022,24 @@ namespace skvm {
switch (op) {
case Op::store8: a.xtns2h(r(tmp), r(x));
a.xtnh2b(r(tmp), r(tmp));
a.strs (r(tmp), arg[imm]);
if (scalar) { a.strb (r(tmp), arg[imm]); }
else { a.strs (r(tmp), arg[imm]); }
break;
case Op::store32:
if (scalar) { a.strs(r(x), arg[imm]); }
else { a.strq(r(x), arg[imm]); }
break;
case Op::store32: a.strq(r(x), arg[imm]); break;
case Op::load8: a.ldrs (r(tmp), arg[imm]);
a.uxtlb2h(r(tmp), r(tmp));
a.uxtlh2s(r(d) , r(tmp));
break;
case Op::load32: a.ldrq(r(d), arg[imm]); break;
case Op::load8:
if (scalar) { a.ldrb (r(tmp), arg[imm]); }
else { a.ldrs (r(tmp), arg[imm]); }
a.uxtlb2h(r(tmp), r(tmp));
a.uxtlh2s(r(d) , r(tmp));
break;
case Op::load32:
if (scalar) { a.ldrs(r(d), arg[imm]); }
else { a.ldrq(r(d), arg[imm]); }
break;
case Op::splat: a.ldrq(r(d), splats.find(imm)); break;
@ -1060,16 +1092,47 @@ namespace skvm {
a.tbl (r(d), r(x), r(tmp));
break;
}
};
A::Label body,
tail,
done;
// Hoisted instructions.
for (int i = 0; i < loop; i++) {
emit(instructions[i], /*scalar=*/false);
}
// Body 4-at-a-time loop.
a.label(&body);
a.cmp(N, K);
a.blt(&tail);
for (int i = loop; i < (int)instructions.size(); i++) {
emit(instructions[i], /*scalar=*/false);
}
for (int i = 0; i < nargs; i++) {
a.add(arg[i], arg[i], K*(int)strides[i]);
}
a.subs(N, N, K);
a.bne(&loop_label);
a.sub(N, N, K);
a.b(&body);
// Tail 1-at-a-time loop.
a.label(&tail);
a.cbz(N, &done);
for (int i = loop; i < (int)instructions.size(); i++) {
emit(instructions[i], /*scalar=*/true);
}
for (int i = 0; i < nargs; i++) {
a.add(arg[i], arg[i], 1*(int)strides[i]);
}
a.sub(N, N, 1);
a.b(&tail);
a.label(&done);
a.ret(A::x30);
return ~(K-1);
// We can handle any N.
return ~0;
}
#else // not x86-64 or aarch64
@ -1281,6 +1344,9 @@ namespace skvm {
}
if (n) {
#if defined(__aarch64__) && defined(SKVM_JIT)
SkUNREACHABLE;
#endif
// We'll operate in SIMT style, knocking off K-size chunks from n while possible.
constexpr int K = 16;
using I32 = skvx::Vec<K, int>;