hoist tbl masks

This adds a warmup phase that lets each instruction do any setup it
needs: adding lookup entries for splat and bytes immediates, and, on
aarch64, hoisting the tbl mask into a register when we can.

Oddly, this measures as a ~3x slowdown on the phone I'm testing with, an
international Galaxy S9 with a Samsung Mongoose 3 processor.  I've got to
imagine this somehow makes the processor think there's a loop-carried
dependency when there isn't one?  Anyway, we already know that's a pretty
crazy CPU (it reports FP16 compute support but can't actually do it), and
this does deliver a speedup on the Pixel 2's Kryo 280 / Cortex-A73, so I
think maybe I'll just swap back to testing with the Pixel 2 and forget
about that S9.

Here's a before/after code listing with a hoisted tbl mask.  In the
before case the mask is loaded inside the loop with `ldr q3, #152`; in
the after case it's loaded once outside the loop with `ldr q0, #168`.
llvm-mca says this should cut one cycle per loop iteration, and with
optimal out-of-order execution the loop cost would drop from ~8.7 cycles
to ~8.3.  In practice, it looks like about a 15% speedup.
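
To reproduce the static estimate, feeding just the inner-loop
instructions to llvm-mca along these lines should land in the same
ballpark (the file name and -mcpu choice are assumptions, not something
recorded in this CL):

	llvm-mca -mtriple=aarch64-linux-gnu -mcpu=cortex-a73 loop_body.s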

before:
	ldr	q0, #188
	ldr	q1, #200
	cmp	x0, #4                  // =4
	b.lt	#76
	ldr	q2, [x1]
	ldr	q3, #152
	tbl	v3.16b, { v2.16b }, v3.16b
	sub	v3.8h, v0.8h, v3.8h
	ldr	q4, [x2]
	and	v5.16b, v4.16b, v1.16b
	ushr	v4.8h, v4.8h, #8
	mul	v5.8h, v5.8h, v3.8h
	ushr	v5.8h, v5.8h, #8
	mul	v3.8h, v4.8h, v3.8h
	bic	v3.16b, v3.16b, v1.16b
	orr	v3.16b, v5.16b, v3.16b
	add	v2.4s, v2.4s, v3.4s
	str	q2, [x2]
	add	x1, x1, #16             // =16
	add	x2, x2, #16             // =16
	sub	x0, x0, #4              // =4
	b.al	#-76
	cmp	x0, #1                  // =1
	b.lt	#76
	ldr	s2, [x1]
	ldr	q3, #72
	tbl	v3.16b, { v2.16b }, v3.16b
	sub	v3.8h, v0.8h, v3.8h
	ldr	s4, [x2]
	and	v5.16b, v4.16b, v1.16b
	ushr	v4.8h, v4.8h, #8
	mul	v5.8h, v5.8h, v3.8h
	ushr	v5.8h, v5.8h, #8
	mul	v3.8h, v4.8h, v3.8h
	bic	v3.16b, v3.16b, v1.16b
	orr	v3.16b, v5.16b, v3.16b
	add	v2.4s, v2.4s, v3.4s
	str	s2, [x2]
	add	x1, x1, #4              // =4
	add	x2, x2, #4              // =4
	sub	x0, x0, #1              // =1
	b.al	#-76
	ret

after:
	ldr	q0, #168
	ldr	q1, #180
	ldr	q2, #192
	cmp	x0, #4                  // =4
	b.lt	#72
	ldr	q3, [x1]
	tbl	v4.16b, { v3.16b }, v0.16b
	sub	v4.8h, v1.8h, v4.8h
	ldr	q5, [x2]
	and	v6.16b, v5.16b, v2.16b
	ushr	v5.8h, v5.8h, #8
	mul	v6.8h, v6.8h, v4.8h
	ushr	v6.8h, v6.8h, #8
	mul	v4.8h, v5.8h, v4.8h
	bic	v4.16b, v4.16b, v2.16b
	orr	v4.16b, v6.16b, v4.16b
	add	v3.4s, v3.4s, v4.4s
	str	q3, [x2]
	add	x1, x1, #16             // =16
	add	x2, x2, #16             // =16
	sub	x0, x0, #4              // =4
	b.al	#-72
	cmp	x0, #1                  // =1
	b.lt	#72
	ldr	s3, [x1]
	tbl	v4.16b, { v3.16b }, v0.16b
	sub	v4.8h, v1.8h, v4.8h
	ldr	s5, [x2]
	and	v6.16b, v5.16b, v2.16b
	ushr	v5.8h, v5.8h, #8
	mul	v6.8h, v6.8h, v4.8h
	ushr	v6.8h, v6.8h, #8
	mul	v4.8h, v5.8h, v4.8h
	bic	v4.16b, v4.16b, v2.16b
	orr	v4.16b, v6.16b, v4.16b
	add	v3.4s, v3.4s, v4.4s
	str	s3, [x2]
	add	x1, x1, #4              // =4
	add	x2, x2, #4              // =4
	sub	x0, x0, #1              // =1
	b.al	#-72
	ret
Change-Id: I352a98d3ac2ad84c338330ef4cfae0292a0b32da
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/229064
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
Mike Klein 2019-07-22 15:31:01 -05:00 committed by Skia Commit-Bot
parent 00c680d2bb
commit fae63e477c

@@ -1062,8 +1062,48 @@ namespace skvm {
         auto hoisted = [&](Val id) { return hoist && instructions[id].hoist; };

         std::vector<Reg> r(instructions.size());
-        SkTHashMap<int, A::Label> bytes_masks,
-                                  splats;
+        struct LabelAndReg {
+            A::Label label;
+            Reg reg;
+        };
+        SkTHashMap<int, LabelAndReg> splats,
+                                     bytes_masks;
+
+        auto warmup = [&](Val id) {
+            const Builder::Instruction& inst = instructions[id];
+            if (inst.death == 0) {
+                return true;
+            }
+
+            Op op = inst.op;
+            int imm = inst.imm;
+
+            switch (op) {
+                default: break;
+
+                case Op::splat: if (!splats.find(imm)) { splats.set(imm, {}); }
+                                break;
+
+                case Op::bytes: if (!bytes_masks.find(imm)) { bytes_masks.set(imm, {}); }
+                                if (hoist) {
+                                    // vpshufb can always work with the mask from memory,
+                                    // but it helps to hoist the mask to a register for tbl.
+                                #if defined(__aarch64__)
+                                    LabelAndReg* entry = bytes_masks.find(imm);
+                                    if (int found = __builtin_ffs(avail)) {
+                                        entry->reg = (Reg)(found-1);
+                                        avail ^= 1 << entry->reg;
+                                        a->ldrq(entry->reg, &entry->label);
+                                    } else {
+                                        return false;
+                                    }
+                                #endif
+                                }
+                                break;
+            }
+            return true;
+        };

         auto emit = [&](Val id, bool scalar) {
             const Builder::Instruction& inst = instructions[id];
@@ -1182,15 +1222,14 @@ namespace skvm {
                                  else { a->vmovups( dst(), arg[imm]); }
                                  break;

-                case Op::splat: if (!splats.find(imm)) { splats.set(imm, {}); }
-                                a->vbroadcastss(dst(), splats.find(imm));
-                                break;
-                                // TODO: many of these instructions have variants that
-                                // can read one of their arugments from 32-byte memory
-                                // instead of a register. Find a way to avoid needing
-                                // to splat most* constants out at all?
-                                // (*Might work for x - 255 but not 255 - x, so will
-                                // always need to be able to splat to a register.)
+                case Op::splat: a->vbroadcastss(dst(), &splats.find(imm)->label);
+                                break;
+                                // TODO: many of these instructions have variants that
+                                // can read one of their arugments from 32-byte memory
+                                // instead of a register. Find a way to avoid needing
+                                // to splat most* constants out at all?
+                                // (*Might work for x - 255 but not 255 - x, so will
+                                // always need to be able to splat to a register.)

                 case Op::add_f32: a->vaddps(dst(), r[x], r[y]); break;
                 case Op::sub_f32: a->vsubps(dst(), r[x], r[y]); break;
@@ -1236,9 +1275,9 @@ namespace skvm {
                 case Op::to_f32: a->vcvtdq2ps (dst(), r[x]); break;
                 case Op::to_i32: a->vcvttps2dq(dst(), r[x]); break;

-                case Op::bytes: if (!bytes_masks.find(imm)) { bytes_masks.set(imm, {}); }
-                                a->vpshufb(dst(), r[x], bytes_masks.find(imm));
-                                break;
+                case Op::bytes: a->vpshufb(dst(), r[x], &bytes_masks.find(imm)->label);
+                                break;
             #elif defined(__aarch64__)
                 case Op::store8: a->xtns2h(tmp(), r[x]);
                                  a->xtnh2b(tmp(), tmp());
@@ -1261,12 +1300,11 @@ namespace skvm {
                                  else { a->ldrq(dst(), arg[imm]); }
                                  break;

-                case Op::splat: if (!splats.find(imm)) { splats.set(imm, {}); }
-                                a->ldrq(dst(), splats.find(imm));
-                                break;
-                                // TODO: If we hoist these, pack 4 values in each register
-                                // and use vector/lane operations, cutting the register
-                                // pressure cost of hoisting by 4?
+                case Op::splat: a->ldrq(dst(), &splats.find(imm)->label);
+                                break;
+                                // TODO: If we hoist these, pack 4 values in each register
+                                // and use vector/lane operations, cutting the register
+                                // pressure cost of hoisting by 4?

                 case Op::add_f32: a->fadd4s(dst(), r[x], r[y]); break;
                 case Op::sub_f32: a->fsub4s(dst(), r[x], r[y]); break;
@@ -1312,10 +1350,10 @@ namespace skvm {
                 case Op::to_f32: a->scvtf4s (dst(), r[x]); break;
                 case Op::to_i32: a->fcvtzs4s(dst(), r[x]); break;

-                case Op::bytes: if (!bytes_masks.find(imm)) { bytes_masks.set(imm, {}); }
-                                a->ldrq(tmp(), bytes_masks.find(imm)); // TODO: hoist these
-                                a->tbl (dst(), r[x], tmp());
-                                break;
+                case Op::bytes: if (hoist) { a->tbl (dst(), r[x], bytes_masks.find(imm)->reg); }
+                                else       { a->ldrq(tmp(), &bytes_masks.find(imm)->label);
+                                             a->tbl (dst(), r[x], tmp()); }
+                                break;
             #endif
             }
@@ -1349,6 +1387,9 @@ namespace skvm {
                  done;

         for (Val id = 0; id < (Val)instructions.size(); id++) {
+            if (!warmup(id)) {
+                return false;
+            }
             if (hoisted(id) && !emit(id, /*scalar=*/false)) {
                 return false;
             }
@@ -1391,7 +1432,7 @@ namespace skvm {
             exit();
         }

-        bytes_masks.foreach([&](int imm, A::Label* l) {
+        bytes_masks.foreach([&](int imm, LabelAndReg* entry) {
             // One 16-byte pattern for ARM tbl, that same pattern twice for x86-64 vpshufb.
         #if defined(__x86_64__)
             a->align(32);
@@ -1399,7 +1440,7 @@ namespace skvm {
             a->align(4);
         #endif
-            a->label(l);
+            a->label(&entry->label);
             int mask[4];
             bytes_control(imm, mask);
             a->bytes(mask, sizeof(mask));
@@ -1408,10 +1449,10 @@ namespace skvm {
         #endif
         });

-        splats.foreach([&](int imm, A::Label* l) {
+        splats.foreach([&](int imm, LabelAndReg* entry) {
             // vbroadcastss 4 bytes on x86-64, or simply load 16-bytes on aarch64.
             a->align(4);
-            a->label(l);
+            a->label(&entry->label);
             a->word(imm);
         #if defined(__aarch64__)
             a->word(imm);