hoist tbl masks
This adds a warmup phase to let each instruction do any setup it needs:
adding lookup entries for splat and bytes, and, on aarch64, hoisting the
tbl mask to a register when we can.

Oddly, this measures as a ~3x slowdown on the phone I'm testing, an
international Galaxy S9 with a Samsung Mongoose 3 processor. I've got to
imagine this somehow makes the processor think there's a loop-carried
dependency when there is not? Anyway, we already know that that's a pretty
crazy CPU (it reports FP16 compute but can't actually do it), and this does
deliver a speedup on the Pixel 2's Kryo 280 / Cortex-A73, so I think maybe
I'll just swap back to testing with the Pixel 2 and forget about that S9.

Here's a before/after code listing with a hoisted tbl mask. In the before
case the mask is loaded inside the loop with `ldr q3, #152`; in the after
case it becomes `ldr q0, #168`, loaded once outside the loop. llvm-mca says
this should cut one cycle per loop, and with optimal out-of-order execution
the loop cost would drop from ~8.7 cycles to ~8.3. In practice, it looks
like about a 15% speedup.

before:

    ldr     q0, #188
    ldr     q1, #200
    cmp     x0, #4                  // =4
    b.lt    #76
    ldr     q2, [x1]
    ldr     q3, #152
    tbl     v3.16b, { v2.16b }, v3.16b
    sub     v3.8h, v0.8h, v3.8h
    ldr     q4, [x2]
    and     v5.16b, v4.16b, v1.16b
    ushr    v4.8h, v4.8h, #8
    mul     v5.8h, v5.8h, v3.8h
    ushr    v5.8h, v5.8h, #8
    mul     v3.8h, v4.8h, v3.8h
    bic     v3.16b, v3.16b, v1.16b
    orr     v3.16b, v5.16b, v3.16b
    add     v2.4s, v2.4s, v3.4s
    str     q2, [x2]
    add     x1, x1, #16             // =16
    add     x2, x2, #16             // =16
    sub     x0, x0, #4              // =4
    b.al    #-76
    cmp     x0, #1                  // =1
    b.lt    #76
    ldr     s2, [x1]
    ldr     q3, #72
    tbl     v3.16b, { v2.16b }, v3.16b
    sub     v3.8h, v0.8h, v3.8h
    ldr     s4, [x2]
    and     v5.16b, v4.16b, v1.16b
    ushr    v4.8h, v4.8h, #8
    mul     v5.8h, v5.8h, v3.8h
    ushr    v5.8h, v5.8h, #8
    mul     v3.8h, v4.8h, v3.8h
    bic     v3.16b, v3.16b, v1.16b
    orr     v3.16b, v5.16b, v3.16b
    add     v2.4s, v2.4s, v3.4s
    str     s2, [x2]
    add     x1, x1, #4              // =4
    add     x2, x2, #4              // =4
    sub     x0, x0, #1              // =1
    b.al    #-76
    ret

after:

    ldr     q0, #168
    ldr     q1, #180
    ldr     q2, #192
    cmp     x0, #4                  // =4
    b.lt    #72
    ldr     q3, [x1]
    tbl     v4.16b, { v3.16b }, v0.16b
    sub     v4.8h, v1.8h, v4.8h
    ldr     q5, [x2]
    and     v6.16b, v5.16b, v2.16b
    ushr    v5.8h, v5.8h, #8
    mul     v6.8h, v6.8h, v4.8h
    ushr    v6.8h, v6.8h, #8
    mul     v4.8h, v5.8h, v4.8h
    bic     v4.16b, v4.16b, v2.16b
    orr     v4.16b, v6.16b, v4.16b
    add     v3.4s, v3.4s, v4.4s
    str     q3, [x2]
    add     x1, x1, #16             // =16
    add     x2, x2, #16             // =16
    sub     x0, x0, #4              // =4
    b.al    #-72
    cmp     x0, #1                  // =1
    b.lt    #72
    ldr     s3, [x1]
    tbl     v4.16b, { v3.16b }, v0.16b
    sub     v4.8h, v1.8h, v4.8h
    ldr     s5, [x2]
    and     v6.16b, v5.16b, v2.16b
    ushr    v5.8h, v5.8h, #8
    mul     v6.8h, v6.8h, v4.8h
    ushr    v6.8h, v6.8h, #8
    mul     v4.8h, v5.8h, v4.8h
    bic     v4.16b, v4.16b, v2.16b
    orr     v4.16b, v6.16b, v4.16b
    add     v3.4s, v3.4s, v4.4s
    str     s3, [x2]
    add     x1, x1, #4              // =4
    add     x2, x2, #4              // =4
    sub     x0, x0, #1              // =1
    b.al    #-72
    ret

Change-Id: I352a98d3ac2ad84c338330ef4cfae0292a0b32da
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/229064
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
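As a source-level analogy to the before/after listings above (this is not the JIT itself, just a sketch of the same transform), hoisting the tbl mask amounts to lifting a loop-invariant vector load out of the loop. A minimal sketch with NEON intrinsics, assuming a plain byte-shuffle kernel; the function and parameter names are made up for illustration:

    #include <arm_neon.h>
    #include <stddef.h>
    #include <stdint.h>

    // Before: the shuffle mask is reloaded on every iteration,
    // analogous to the in-loop `ldr q3, #152` above.
    void shuffle_before(const uint8_t mask_bytes[16],
                        uint8_t* dst, const uint8_t* src, size_t n16) {
        for (size_t i = 0; i < n16; i++) {
            uint8x16_t mask = vld1q_u8(mask_bytes);     // loop-invariant load, inside the loop
            uint8x16_t v    = vld1q_u8(src + 16*i);
            vst1q_u8(dst + 16*i, vqtbl1q_u8(v, mask));  // tbl
        }
    }

    // After: the mask lives in a register for the whole loop,
    // analogous to the hoisted `ldr q0, #168`.
    void shuffle_after(const uint8_t mask_bytes[16],
                       uint8_t* dst, const uint8_t* src, size_t n16) {
        uint8x16_t mask = vld1q_u8(mask_bytes);         // hoisted out of the loop
        for (size_t i = 0; i < n16; i++) {
            uint8x16_t v = vld1q_u8(src + 16*i);
            vst1q_u8(dst + 16*i, vqtbl1q_u8(v, mask));
        }
    }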
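The interesting bit of the warmup phase in the diff below is how it claims a register for the hoisted mask: scan an availability bitmask with __builtin_ffs, take the lowest free register, and give up (returning false so the caller can fall back to non-hoisted code) when none is free. A minimal standalone sketch of that idiom; apart from `avail`, `Reg`, and the bit-scan, everything here is hypothetical:

    #include <cstdint>
    #include <cstdio>

    using Reg = int;  // stand-in for the assembler's register enum

    // Claim the lowest-numbered free register from a 32-bit availability mask,
    // or return -1 if every register is already taken.
    static Reg try_alloc_reg(uint32_t& avail) {
        int found = __builtin_ffs((int)avail);  // 1-based index of the lowest set bit, 0 if none
        if (found == 0) {
            return -1;                           // nothing free; caller must fall back
        }
        Reg r = (Reg)(found - 1);
        avail ^= 1u << r;                        // mark the register as in use
        return r;
    }

    int main() {
        uint32_t avail = 0xF0;                   // pretend only v4..v7 are free
        Reg mask_reg = try_alloc_reg(avail);
        if (mask_reg >= 0) {
            // the JIT would now do something like: a->ldrq(mask_reg, &entry->label);
            printf("hoisted tbl mask into v%d, avail is now %#x\n", mask_reg, (unsigned)avail);
        } else {
            printf("no free register; keep loading the mask inside the loop\n");
        }
        return 0;
    }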
@@ -1062,8 +1062,48 @@ namespace skvm {
         auto hoisted = [&](Val id) { return hoist && instructions[id].hoist; };

         std::vector<Reg> r(instructions.size());
-        SkTHashMap<int, A::Label> bytes_masks,
-                                  splats;
+        struct LabelAndReg {
+            A::Label label;
+            Reg      reg;
+        };
+        SkTHashMap<int, LabelAndReg> splats,
+                                     bytes_masks;
+
+        auto warmup = [&](Val id) {
+            const Builder::Instruction& inst = instructions[id];
+            if (inst.death == 0) {
+                return true;
+            }
+
+            Op  op  = inst.op;
+            int imm = inst.imm;
+
+            switch (op) {
+                default: break;
+
+                case Op::splat: if (!splats.find(imm)) { splats.set(imm, {}); }
+                                break;
+
+                case Op::bytes: if (!bytes_masks.find(imm)) { bytes_masks.set(imm, {}); }
+                                if (hoist) {
+                                    // vpshufb can always work with the mask from memory,
+                                    // but it helps to hoist the mask to a register for tbl.
+                                #if defined(__aarch64__)
+                                    LabelAndReg* entry = bytes_masks.find(imm);
+                                    if (int found = __builtin_ffs(avail)) {
+                                        entry->reg = (Reg)(found-1);
+                                        avail ^= 1 << entry->reg;
+                                        a->ldrq(entry->reg, &entry->label);
+                                    } else {
+                                        return false;
+                                    }
+                                #endif
+                                }
+                                break;
+            }
+            return true;
+        };

         auto emit = [&](Val id, bool scalar) {
             const Builder::Instruction& inst = instructions[id];
@@ -1182,15 +1222,14 @@ namespace skvm {
                     else        { a->vmovups( dst(), arg[imm]); }
                                   break;

-                case Op::splat: if (!splats.find(imm)) { splats.set(imm, {}); }
-                                a->vbroadcastss(dst(), splats.find(imm));
-                                break;
-                                // TODO: many of these instructions have variants that
-                                // can read one of their arugments from 32-byte memory
-                                // instead of a register. Find a way to avoid needing
-                                // to splat most* constants out at all?
-                                // (*Might work for x - 255 but not 255 - x, so will
-                                //  always need to be able to splat to a register.)
+                case Op::splat: a->vbroadcastss(dst(), &splats.find(imm)->label);
+                                break;
+                                // TODO: many of these instructions have variants that
+                                // can read one of their arugments from 32-byte memory
+                                // instead of a register. Find a way to avoid needing
+                                // to splat most* constants out at all?
+                                // (*Might work for x - 255 but not 255 - x, so will
+                                //  always need to be able to splat to a register.)

                 case Op::add_f32: a->vaddps(dst(), r[x], r[y]); break;
                 case Op::sub_f32: a->vsubps(dst(), r[x], r[y]); break;
@@ -1236,9 +1275,9 @@ namespace skvm {
                 case Op::to_f32: a->vcvtdq2ps (dst(), r[x]); break;
                 case Op::to_i32: a->vcvttps2dq(dst(), r[x]); break;

-                case Op::bytes: if (!bytes_masks.find(imm)) { bytes_masks.set(imm, {}); }
-                                a->vpshufb(dst(), r[x], bytes_masks.find(imm));
-                                break;
+                case Op::bytes: a->vpshufb(dst(), r[x], &bytes_masks.find(imm)->label);
+                                break;

             #elif defined(__aarch64__)
                 case Op::store8: a->xtns2h(tmp(), r[x]);
                                  a->xtnh2b(tmp(), tmp());
@@ -1261,12 +1300,11 @@ namespace skvm {
                     else        { a->ldrq(dst(), arg[imm]); }
                                   break;

-                case Op::splat: if (!splats.find(imm)) { splats.set(imm, {}); }
-                                a->ldrq(dst(), splats.find(imm));
-                                break;
-                                // TODO: If we hoist these, pack 4 values in each register
-                                // and use vector/lane operations, cutting the register
-                                // pressure cost of hoisting by 4?
+                case Op::splat: a->ldrq(dst(), &splats.find(imm)->label);
+                                break;
+                                // TODO: If we hoist these, pack 4 values in each register
+                                // and use vector/lane operations, cutting the register
+                                // pressure cost of hoisting by 4?

                 case Op::add_f32: a->fadd4s(dst(), r[x], r[y]); break;
                 case Op::sub_f32: a->fsub4s(dst(), r[x], r[y]); break;
@@ -1312,10 +1350,10 @@ namespace skvm {
                 case Op::to_f32: a->scvtf4s (dst(), r[x]); break;
                 case Op::to_i32: a->fcvtzs4s(dst(), r[x]); break;

-                case Op::bytes: if (!bytes_masks.find(imm)) { bytes_masks.set(imm, {}); }
-                                a->ldrq(tmp(), bytes_masks.find(imm));  // TODO: hoist these
-                                a->tbl (dst(), r[x], tmp());
-                                break;
+                case Op::bytes: if (hoist) { a->tbl (dst(), r[x], bytes_masks.find(imm)->reg); }
+                                else       { a->ldrq(tmp(), &bytes_masks.find(imm)->label);
+                                             a->tbl (dst(), r[x], tmp()); }
+                                break;
             #endif
             }

@@ -1349,6 +1387,9 @@ namespace skvm {
             done;

         for (Val id = 0; id < (Val)instructions.size(); id++) {
+            if (!warmup(id)) {
+                return false;
+            }
             if (hoisted(id) && !emit(id, /*scalar=*/false)) {
                 return false;
             }
@@ -1391,7 +1432,7 @@ namespace skvm {
             exit();
         }

-        bytes_masks.foreach([&](int imm, A::Label* l) {
+        bytes_masks.foreach([&](int imm, LabelAndReg* entry) {
            // One 16-byte pattern for ARM tbl, that same pattern twice for x86-64 vpshufb.
        #if defined(__x86_64__)
            a->align(32);
@@ -1399,7 +1440,7 @@ namespace skvm {
            a->align(4);
        #endif

-           a->label(l);
+           a->label(&entry->label);
            int mask[4];
            bytes_control(imm, mask);
            a->bytes(mask, sizeof(mask));
@@ -1408,10 +1449,10 @@ namespace skvm {
        #endif
        });

-        splats.foreach([&](int imm, A::Label* l) {
+        splats.foreach([&](int imm, LabelAndReg* entry) {
            // vbroadcastss 4 bytes on x86-64, or simply load 16-bytes on aarch64.
            a->align(4);
-           a->label(l);
+           a->label(&entry->label);
            a->word(imm);
        #if defined(__aarch64__)
            a->word(imm);