add used_in_loop bit to skvm::Builder::Instruction

Most hoisted values are used in the loop body (and that's really the
whole point of hoisting) but some are just temporaries to help produce
other hoisted values.  This used_in_loop bit helps us distinguish the
two, and lets us recycle registers holding temporary hoisted values not
used in the loop.

The can-we-recycle logic now becomes:
   - is this a real value?
   - is it time for it to die?
   - is it either not hoisted or a hoisted temporary?

The set-death-to-infinity approach for hoisted values is now gone.  That
worked great for hoisted values used inside the loop, but was too
conservative for hoisted temporaries.  This lifetime extension was
preventing us from recycling those registers, pinning enough registers
that we run out and fail to JIT.

Small amounts of refactoring to make this clearer:
   - move the Instruction hash function definition near its operator==
   - rename the two "hoist" variables to "can_hoist" for Instructions
     and "try_hoisting" for the JIT approach
   - add ↟ to mark hoisted temporaries, _really_ hoisted values.

There's some redundancy here between tracking the can_hoist bit, the
used_in_loop bit, and lifetime tracking.  I think it should be true, for
instance, that !can_hoist && !used_in_loop implies an instruction is
dead code.  I plan to continue refactoring lifetime analysis (in
particular reordering instructions to decrease register pressure) so
hopefully by the time I'm done that metadata will shake out a little
crisper.

Change-Id: I6460ca96d1cbec0315bed3c9a0774cd88ab5be26
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/248986
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
This commit is contained in:
Mike Klein 2019-10-16 10:46:01 -05:00 committed by Skia Commit-Bot
parent 04b06774be
commit 0f61c12737
4 changed files with 93 additions and 79 deletions

View File

@ -210,7 +210,7 @@ G8 over G8
↑ v1 = splat 3B808081 (0.0039215689)
v2 = to_f32 v0
v3 = mul_f32 v1 v2
v4 = splat 3F800000 (1)
v4 = splat 3F800000 (1)
v5 = load8 arg(1)
v6 = to_f32 v5
v7 = mul_f32 v1 v6
@ -730,8 +730,8 @@ r3 = add_i32 r2 r3
store32 arg(1) r3
6 values:
v0 = splat 1 (1.4012985e-45)
v1 = splat 2 (2.8025969e-45)
v0 = splat 1 (1.4012985e-45)
v1 = splat 2 (2.8025969e-45)
↑ v2 = add_i32 v0 v1
v3 = load32 arg(0)
v4 = mul_i32 v3 v2

View File

@ -96,8 +96,10 @@ namespace skvm {
y = inst.y,
z = inst.z;
int imm = inst.imm;
write(o, inst.death == 0 ? "☠️ " :
inst.hoist ? "" : " ");
write(o, inst.death == 0 ? "☠️ " :
!inst.can_hoist ? " " :
inst.used_in_loop ? "" :
"");
switch (op) {
case Op::store8: write(o, "store8" , Arg{imm}, V{x}); break;
case Op::store16: write(o, "store16", Arg{imm}, V{x}); break;
@ -317,50 +319,60 @@ namespace skvm {
// Varying loads (and gathers) and stores cannot be hoisted out of the loop.
if (inst.op <= Op::gather32) {
inst.hoist = false;
inst.can_hoist = false;
}
// If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
if (inst.hoist) {
if (inst.x != NA) { inst.hoist &= fProgram[inst.x].hoist; }
if (inst.y != NA) { inst.hoist &= fProgram[inst.y].hoist; }
if (inst.z != NA) { inst.hoist &= fProgram[inst.z].hoist; }
if (inst.can_hoist) {
if (inst.x != NA) { inst.can_hoist &= fProgram[inst.x].can_hoist; }
if (inst.y != NA) { inst.can_hoist &= fProgram[inst.y].can_hoist; }
if (inst.z != NA) { inst.can_hoist &= fProgram[inst.z].can_hoist; }
}
// Any hoisted values used inside the loop need to live forever.
// TODO: this extends the lifetime of _hoistable_ values used inside the loop.
// Ultimately the JIT may or may not decide to actually hoist them, and if it
// doesn't, this is extending the lifetime of these values for no good reason,
// increasing register pressure, defeating the purpose of that no-hoist fallback.
if (!inst.hoist) {
auto make_immortal = [&](Val arg) {
if (fProgram[arg].death != 0) {
fProgram[arg].death = (Val)fProgram.size();
}
};
if (inst.x != NA && fProgram[inst.x].hoist) { make_immortal(inst.x); }
if (inst.y != NA && fProgram[inst.y].hoist) { make_immortal(inst.y); }
if (inst.z != NA && fProgram[inst.z].hoist) { make_immortal(inst.z); }
// We'll want to know if hoisted values are used in the loop;
// if not, we can recycle their registers like we do loop values.
if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used_in_loop*/) {
if (inst.x != NA) { fProgram[inst.x].used_in_loop = true; }
if (inst.y != NA) { fProgram[inst.y].used_in_loop = true; }
if (inst.z != NA) { fProgram[inst.z].used_in_loop = true; }
}
}
return {fProgram, fStrides, debug_name};
}
// TODO: it's probably not important that we include post-Builder::done() fields like
// death, can_hoist, and used_in_loop in operator==() and InstructionHash::operator().
// They'll always have the same, initial values as set in Builder::push().
static bool operator==(const Builder::Instruction& a, const Builder::Instruction& b) {
return a.op == b.op
&& a.x == b.x
&& a.y == b.y
&& a.z == b.z
&& a.imm == b.imm
&& a.death == b.death
&& a.hoist == b.hoist;
return a.op == b.op
&& a.x == b.x
&& a.y == b.y
&& a.z == b.z
&& a.imm == b.imm
&& a.death == b.death
&& a.can_hoist == b.can_hoist
&& a.used_in_loop == b.used_in_loop;
}
// TODO: replace with SkOpts::hash()?
size_t Builder::InstructionHash::operator()(const Instruction& inst) const {
return Hash((uint8_t)inst.op)
^ Hash(inst.x)
^ Hash(inst.y)
^ Hash(inst.z)
^ Hash(inst.imm)
^ Hash(inst.death)
^ Hash(inst.can_hoist)
^ Hash(inst.used_in_loop);
}
// Most instructions produce a value and return it by ID,
// the value-producing instruction's own index in the program vector.
Val Builder::push(Op op, Val x, Val y, Val z, int imm) {
Instruction inst{op, x, y, z, imm, /*death=*/0, /*hoist=*/true};
Instruction inst{op, x, y, z, imm,
/*death=*/0, /*can_hoist=*/true, /*used_in_loop=*/false};
// Basic common subexpression elimination:
// if we've already seen this exact Instruction, use it instead of creating a new one.
@ -1409,6 +1421,10 @@ namespace skvm {
//
// But recycling registers is fairly cheap, and good practice for the
// JITs where minimizing register pressure really is important.
//
// Since we have effectively infinite registers, we hoist any value we can.
// (The JIT may choose a more complex policy to reduce register pressure.)
auto hoisted = [&](Val id) { return instructions[id].can_hoist; };
fRegs = 0;
int live_instructions = 0;
@ -1422,7 +1438,9 @@ namespace skvm {
// If this is a real input and it's lifetime ends at this instruction,
// we can recycle the register it's occupying.
auto maybe_recycle_register = [&](Val input) {
if (input != NA && instructions[input].death == id) {
if (input != NA
&& instructions[input].death == id
&& !(hoisted(input) && instructions[input].used_in_loop)) {
avail.push_back(reg[input]);
}
};
@ -1443,16 +1461,14 @@ namespace skvm {
// Assign a register to each live hoisted instruction.
for (Val id = 0; id < (Val)instructions.size(); id++) {
const Builder::Instruction& inst = instructions[id];
if (inst.death != 0 && inst.hoist) {
if (instructions[id].death != 0 && hoisted(id)) {
assign_register(id);
}
}
// Assign registers to each live loop instruction.
for (Val id = 0; id < (Val)instructions.size(); id++) {
const Builder::Instruction& inst = instructions[id];
if (inst.death != 0 && !inst.hoist) {
if (instructions[id].death != 0 && !hoisted(id)) {
assign_register(id);
}
@ -1486,14 +1502,14 @@ namespace skvm {
for (Val id = 0; id < (Val)instructions.size(); id++) {
const Builder::Instruction& inst = instructions[id];
if (inst.death != 0 && inst.hoist) {
if (inst.death != 0 && hoisted(id)) {
push_instruction(id, inst);
fLoop++;
}
}
for (Val id = 0; id < (Val)instructions.size(); id++) {
const Builder::Instruction& inst = instructions[id];
if (inst.death != 0 && !inst.hoist) {
if (inst.death != 0 && !hoisted(id)) {
push_instruction(id, inst);
}
}
@ -1535,7 +1551,7 @@ namespace skvm {
}
bool Program::jit(const std::vector<Builder::Instruction>& instructions,
const bool hoist,
const bool try_hoisting,
Assembler* a) const {
using A = Assembler;
@ -1565,7 +1581,7 @@ namespace skvm {
A::X N = A::x0,
arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };
// We can use v0-v7 and v16-v31 freely; we'd need to preseve v8-v15.
// We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15.
using Reg = A::V;
uint32_t avail = 0xffff00ff;
#endif
@ -1574,7 +1590,7 @@ namespace skvm {
return false;
}
auto hoisted = [&](Val id) { return hoist && instructions[id].hoist; };
auto hoisted = [&](Val id) { return try_hoisting && instructions[id].can_hoist; };
std::vector<Reg> r(instructions.size());
@ -1602,7 +1618,7 @@ namespace skvm {
case Op::bytes: if (!bytes_masks.find(imm)) {
bytes_masks.set(imm, {});
if (hoist) {
if (try_hoisting) {
// vpshufb can always work with the mask from memory,
// but it helps to hoist the mask to a register for tbl.
#if defined(__aarch64__)
@ -1676,9 +1692,16 @@ namespace skvm {
// Now make available any registers that are consumed by this instruction.
// (The register pool we can pick dst from is >= the pool for tmp, adding any of these.)
if (x != NA && instructions[x].death == id) { avail |= 1 << r[x]; }
if (y != NA && instructions[y].death == id) { avail |= 1 << r[y]; }
if (z != NA && instructions[z].death == id) { avail |= 1 << r[z]; }
auto maybe_recycle_register = [&](Val input) {
if (input != NA
&& instructions[input].death == id
&& !(hoisted(input) && instructions[input].used_in_loop)) {
avail |= 1 << r[input];
}
};
maybe_recycle_register(x);
maybe_recycle_register(y);
maybe_recycle_register(z);
// set_dst() and dst() will work read/write with this perhaps-just-updated avail.
// Some ops may decide dst on their own to best fit the instruction (see Op::mad_f32).
@ -1901,10 +1924,11 @@ namespace skvm {
case Op::to_f32: a->scvtf4s (dst(), r[x]); break;
case Op::to_i32: a->fcvtzs4s(dst(), r[x]); break;
case Op::bytes: if (hoist) { a->tbl (dst(), r[x], bytes_masks.find(imm)->reg); }
else { a->ldrq(tmp(), &bytes_masks.find(imm)->label);
a->tbl (dst(), r[x], tmp()); }
break;
case Op::bytes:
if (try_hoisting) { a->tbl (dst(), r[x], bytes_masks.find(imm)->reg); }
else { a->ldrq(tmp(), &bytes_masks.find(imm)->label);
a->tbl (dst(), r[x], tmp()); }
break;
#endif
}
@ -2026,10 +2050,10 @@ namespace skvm {
// First try allowing code hoisting (faster code)
// then again without if that fails (lower register pressure).
bool hoist = true;
if (!this->jit(instructions, hoist, &a)) {
hoist = false;
if (!this->jit(instructions, hoist, &a)) {
bool try_hoisting = true;
if (!this->jit(instructions, try_hoisting, &a)) {
try_hoisting = false;
if (!this->jit(instructions, try_hoisting, &a)) {
return;
}
}
@ -2041,7 +2065,7 @@ namespace skvm {
// Assemble the program for real.
a = Assembler{fJITBuf};
SkAssertResult(this->jit(instructions, hoist, &a));
SkAssertResult(this->jit(instructions, try_hoisting, &a));
SkASSERT(a.size() <= fJITSize);
// Remap as executable, and flush caches on platforms that need that.

View File

@ -289,8 +289,9 @@ namespace skvm {
int imm; // Immediate bit pattern, shift count, argument index, etc.
// Not populated until done() has been called.
int death; // Index of last live instruction taking this input; live if != 0.
bool hoist; // Value independent of all loop variables?
int death; // Index of last live instruction taking this input; live if != 0.
bool can_hoist; // Value independent of all loop variables?
bool used_in_loop; // Is the value used in the loop (or only by hoisted values)?
};
Program done(const char* debug_name = nullptr);
@ -435,16 +436,7 @@ namespace skvm {
static size_t Hash(T val) {
return std::hash<T>{}(val);
}
// TODO: replace with SkOpts::hash()?
size_t operator()(const Instruction& inst) const {
return Hash((uint8_t)inst.op)
^ Hash(inst.x)
^ Hash(inst.y)
^ Hash(inst.z)
^ Hash(inst.imm)
^ Hash(inst.death)
^ Hash(inst.hoist);
}
size_t operator()(const Instruction& inst) const;
};
Val push(Op, Val x, Val y=NA, Val z=NA, int imm=0);
@ -501,7 +493,7 @@ namespace skvm {
void setupJIT (const std::vector<Builder::Instruction>&, const char* debug_name);
bool jit(const std::vector<Builder::Instruction>&,
bool hoist,
bool try_hoisting,
Assembler*) const;
// Dump jit-*.dump files for perf inject.

View File

@ -100,7 +100,7 @@ DEF_TEST(SkVM, r) {
skvm::Arg arg = b.varying<int>();
// x and y can both be hoisted,
// and x can die at y, while y lives forever.
// and x can die at y, while y must live for the loop.
skvm::I32 x = b.splat(1),
y = b.add(x, b.splat(2));
b.store32(arg, b.mul(b.load32(arg), y));
@ -110,12 +110,12 @@ DEF_TEST(SkVM, r) {
std::vector<skvm::Builder::Instruction> insts = b.program();
REPORTER_ASSERT(r, insts.size() == 6);
REPORTER_ASSERT(r, insts[0].hoist && insts[0].death == 2);
REPORTER_ASSERT(r, insts[1].hoist && insts[1].death == 2);
REPORTER_ASSERT(r, insts[2].hoist && insts[2].death == 6);
REPORTER_ASSERT(r, !insts[3].hoist);
REPORTER_ASSERT(r, !insts[4].hoist);
REPORTER_ASSERT(r, !insts[5].hoist);
REPORTER_ASSERT(r, insts[0].can_hoist && insts[0].death == 2 && !insts[0].used_in_loop);
REPORTER_ASSERT(r, insts[1].can_hoist && insts[1].death == 2 && !insts[1].used_in_loop);
REPORTER_ASSERT(r, insts[2].can_hoist && insts[2].death == 4 && insts[2].used_in_loop);
REPORTER_ASSERT(r, !insts[3].can_hoist);
REPORTER_ASSERT(r, !insts[4].can_hoist);
REPORTER_ASSERT(r, !insts[5].can_hoist);
dump(b, &buf);
@ -245,7 +245,7 @@ DEF_TEST(SkVM_Pointless, r) {
});
for (const skvm::Builder::Instruction& inst : b.program()) {
REPORTER_ASSERT(r, inst.death == 0 && inst.hoist == true);
REPORTER_ASSERT(r, inst.death == 0 && inst.can_hoist == true);
}
}
@ -579,9 +579,7 @@ DEF_TEST(SkVM_hoist, r) {
b.store32(arg, x);
}
// TODO: this really should JIT... a bug slipped in making it fail to.
// See https://skia-review.googlesource.com/c/skia/+/242591.
test_interpreter_only(r, b.done(), [&](const skvm::Program& program) {
test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& program) {
int x = 4;
program.eval(1, &x);
// x += 0 + 1 + 2 + 3 + ... + 30 + 31