handle any N in aarch64 JIT code
Add a tail loop to handle elements one at a time.

Just like in the interpreter, the only instructions that need to be changed are the loads and stores, 16 byte -> 4 byte and 4 byte -> 1 byte.

With this we can mark the interpreter as SkUNREACHABLE, and it even completely compiles away, saving a few KB.

Example profile for the SkVMTool float-squaring program running N=15 over and over:

 Samples│
        │    skvm-jit-3663518994():
     42 │40:   cmp    x0, #0x4
        │44: ↓ b.lt   60
     51 │48:   ldr    q0, [x1]
    197 │4c:   mul    v0.4s, v0.4s, v0.4s
    135 │50:   str    q0, [x1]
        │54:   add    x1, x1, #0x10
     43 │58:   sub    x0, x0, #0x4
        │5c:   b.al   40
    150 │60: ↓ cbz    x0, 7c
     67 │64:   ldr    s0, [x1]
    130 │68:   mul    v0.4s, v0.4s, v0.4s
    135 │6c:   str    s0, [x1]
     18 │70:   add    x1, x1, #0x4
     17 │74:   sub    x0, x0, #0x1
     20 │78:   b.al   60
    124 │7c: ← ret

Change-Id: I153d7bc247942366a686e30a9cad60c935f754ed
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/227138
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
This commit is contained in:
parent
aa04068be4
commit
57b0b09dfd
@ -983,12 +983,35 @@ namespace skvm {
|
|||||||
|
|
||||||
*code = a.size();
|
*code = a.size();
|
||||||
|
|
||||||
A::Label loop_label;
|
// Our program runs a 4-at-a-time body loop, then a 1-at-a-time tail loop to
|
||||||
for (int i = 0; i < (int)instructions.size(); i++) {
|
// handle all N values, with an overall layout looking like
|
||||||
if (i == loop) {
|
//
|
||||||
loop_label = a.here();
|
// buf: ...
|
||||||
}
|
// data for splats and tbl
|
||||||
const Program::Instruction& inst = instructions[i];
|
// ...
|
||||||
|
//
|
||||||
|
// code: ...
|
||||||
|
// hoisted instructions
|
||||||
|
// ...
|
||||||
|
//
|
||||||
|
// body: cmp N,4 # if (n < 4)
|
||||||
|
// b.lt tail # goto tail
|
||||||
|
// ...
|
||||||
|
// instructions handling 4 at a time
|
||||||
|
// ...
|
||||||
|
// sub N,4
|
||||||
|
// b body
|
||||||
|
//
|
||||||
|
// tail: cbz N,done # if (n == 0) goto done
|
||||||
|
// ...
|
||||||
|
// instructions handling 1 at a time
|
||||||
|
// ...
|
||||||
|
// sub N,1
|
||||||
|
// b tail
|
||||||
|
//
|
||||||
|
// done: ret
|
||||||
|
|
||||||
|
auto emit = [&](const Program::Instruction& inst, bool scalar) {
|
||||||
Op op = inst.op;
|
Op op = inst.op;
|
||||||
|
|
||||||
Reg d = inst.d,
|
Reg d = inst.d,
|
||||||
@ -999,15 +1022,24 @@ namespace skvm {
|
|||||||
switch (op) {
|
switch (op) {
|
||||||
case Op::store8: a.xtns2h(r(tmp), r(x));
|
case Op::store8: a.xtns2h(r(tmp), r(x));
|
||||||
a.xtnh2b(r(tmp), r(tmp));
|
a.xtnh2b(r(tmp), r(tmp));
|
||||||
a.strs (r(tmp), arg[imm]);
|
if (scalar) { a.strb (r(tmp), arg[imm]); }
|
||||||
|
else { a.strs (r(tmp), arg[imm]); }
|
||||||
|
break;
|
||||||
|
case Op::store32:
|
||||||
|
if (scalar) { a.strs(r(x), arg[imm]); }
|
||||||
|
else { a.strq(r(x), arg[imm]); }
|
||||||
break;
|
break;
|
||||||
case Op::store32: a.strq(r(x), arg[imm]); break;
|
|
||||||
|
|
||||||
case Op::load8: a.ldrs (r(tmp), arg[imm]);
|
case Op::load8:
|
||||||
a.uxtlb2h(r(tmp), r(tmp));
|
if (scalar) { a.ldrb (r(tmp), arg[imm]); }
|
||||||
a.uxtlh2s(r(d) , r(tmp));
|
else { a.ldrs (r(tmp), arg[imm]); }
|
||||||
break;
|
a.uxtlb2h(r(tmp), r(tmp));
|
||||||
case Op::load32: a.ldrq(r(d), arg[imm]); break;
|
a.uxtlh2s(r(d) , r(tmp));
|
||||||
|
break;
|
||||||
|
case Op::load32:
|
||||||
|
if (scalar) { a.ldrs(r(d), arg[imm]); }
|
||||||
|
else { a.ldrq(r(d), arg[imm]); }
|
||||||
|
break;
|
||||||
|
|
||||||
case Op::splat: a.ldrq(r(d), splats.find(imm)); break;
|
case Op::splat: a.ldrq(r(d), splats.find(imm)); break;
|
||||||
|
|
||||||
@ -1060,16 +1092,47 @@ namespace skvm {
|
|||||||
a.tbl (r(d), r(x), r(tmp));
|
a.tbl (r(d), r(x), r(tmp));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
A::Label body,
|
||||||
|
tail,
|
||||||
|
done;
|
||||||
|
|
||||||
|
// Hoisted instructions.
|
||||||
|
for (int i = 0; i < loop; i++) {
|
||||||
|
emit(instructions[i], /*scalar=*/false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Body 4-at-a-time loop.
|
||||||
|
a.label(&body);
|
||||||
|
a.cmp(N, K);
|
||||||
|
a.blt(&tail);
|
||||||
|
for (int i = loop; i < (int)instructions.size(); i++) {
|
||||||
|
emit(instructions[i], /*scalar=*/false);
|
||||||
|
}
|
||||||
for (int i = 0; i < nargs; i++) {
|
for (int i = 0; i < nargs; i++) {
|
||||||
a.add(arg[i], arg[i], K*(int)strides[i]);
|
a.add(arg[i], arg[i], K*(int)strides[i]);
|
||||||
}
|
}
|
||||||
a.subs(N, N, K);
|
a.sub(N, N, K);
|
||||||
a.bne(&loop_label);
|
a.b(&body);
|
||||||
|
|
||||||
|
// Tail 1-at-a-time loop.
|
||||||
|
a.label(&tail);
|
||||||
|
a.cbz(N, &done);
|
||||||
|
for (int i = loop; i < (int)instructions.size(); i++) {
|
||||||
|
emit(instructions[i], /*scalar=*/true);
|
||||||
|
}
|
||||||
|
for (int i = 0; i < nargs; i++) {
|
||||||
|
a.add(arg[i], arg[i], 1*(int)strides[i]);
|
||||||
|
}
|
||||||
|
a.sub(N, N, 1);
|
||||||
|
a.b(&tail);
|
||||||
|
|
||||||
|
a.label(&done);
|
||||||
a.ret(A::x30);
|
a.ret(A::x30);
|
||||||
|
|
||||||
return ~(K-1);
|
// We can handle any N.
|
||||||
|
return ~0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#else // not x86-64 or aarch64
|
#else // not x86-64 or aarch64
|
||||||
@ -1281,6 +1344,9 @@ namespace skvm {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (n) {
|
if (n) {
|
||||||
|
#if defined(__aarch64__) && defined(SKVM_JIT)
|
||||||
|
SkUNREACHABLE;
|
||||||
|
#endif
|
||||||
// We'll operate in SIMT style, knocking off K-size chunks from n while possible.
|
// We'll operate in SIMT style, knocking off K-size chunks from n while possible.
|
||||||
constexpr int K = 16;
|
constexpr int K = 16;
|
||||||
using I32 = skvx::Vec<K, int>;
|
using I32 = skvx::Vec<K, int>;
|
||||||
|
Loading…
Reference in New Issue
Block a user