always fma in mad_f32()
We can always move data around so that an FMA is possible using no more registers than we would otherwise, and on x86, evne using no more instructions. The basic idea here is that if we can't reuse one of the inputs to destructively host the FMA instruction, the next best thing is to copy one of the arguments into tmp() and accumulate the FMA there. Once the FMA has happened, we just need to copy that result to dst(). We can of course skip that copy if dst() == tmp(). On x86 we never need that copy; dst() and tmp() are picked using the same logic except that dst may alias one of its inputs, and we only fall into this case after we've already found it doesn't. So we can just assert dst() == tmp() rather than check it like we do on ARM. It's subtle, but I think sound. I'm using logical-or to copy registers around. This is a little lazy, but maybe not as lazy as it looks: on ARM that is _the_ way to copy registers. There's a vmovdqa instruction I could use on x86, TBD. All paths through this new code were being exercised on ARM, but we didn't have anything hitting the tmp case on x86, so I've added a new unit test that hits the corner cases of both implementations. Change-Id: I5422414fc50c64d491b4933b4b580b784596f291 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/228630 Reviewed-by: Mike Klein <mtklein@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
parent
99b2a33c37
commit
4a13119a60
@ -1144,6 +1144,11 @@ namespace skvm {
|
||||
return r[id];
|
||||
};
|
||||
|
||||
// Because we use the same logic to pick an arbitrary dst and to pick tmp,
|
||||
// and we know that tmp will never overlap any of the inputs, `dst() == tmp()`
|
||||
// is a simple idiom to check that the destination does not overlap any of the inputs.
|
||||
// Sometimes we can use this knowledge to do better instruction selection.
|
||||
|
||||
// Ok! Keep in mind that we haven't assigned tmp or dst yet,
|
||||
// just laid out hooks for how to do so if we need them, depending on the instruction.
|
||||
//
|
||||
@ -1182,12 +1187,14 @@ namespace skvm {
|
||||
case Op::div_f32: a->vdivps(dst(), r[x], r[y]); break;
|
||||
|
||||
case Op::mad_f32:
|
||||
if (avail & (1<<r[x])) { set_dst(r[x]); a->vfmadd132ps(r[x], r[z], r[y]); } else
|
||||
if (avail & (1<<r[y])) { set_dst(r[y]); a->vfmadd213ps(r[y], r[x], r[z]); } else
|
||||
if (avail & (1<<r[z])) { set_dst(r[z]); a->vfmadd231ps(r[z], r[x], r[y]); } else
|
||||
{ a->vmulps (tmp(), r[x], r[y]);
|
||||
a->vaddps (dst(), tmp(), r[z]); }
|
||||
break;
|
||||
if (avail & (1<<r[x])) { set_dst(r[x]); a->vfmadd132ps(r[x], r[z], r[y]); }
|
||||
else if (avail & (1<<r[y])) { set_dst(r[y]); a->vfmadd213ps(r[y], r[x], r[z]); }
|
||||
else if (avail & (1<<r[z])) { set_dst(r[z]); a->vfmadd231ps(r[z], r[x], r[y]); }
|
||||
else { SkASSERT(dst() == tmp());
|
||||
// TODO: vpor -> vmovdqa here?
|
||||
a->vpor (dst(),r[x], r[x]);
|
||||
a->vfmadd132ps(dst(),r[z], r[y]); }
|
||||
break;
|
||||
|
||||
case Op::add_i32: a->vpaddd (dst(), r[x], r[y]); break;
|
||||
case Op::sub_i32: a->vpsubd (dst(), r[x], r[y]); break;
|
||||
@ -1252,9 +1259,10 @@ namespace skvm {
|
||||
case Op::div_f32: a->fdiv4s(dst(), r[x], r[y]); break;
|
||||
|
||||
case Op::mad_f32:
|
||||
if (avail & (1<<r[z])) { set_dst(r[z]); a->fmla4s( r[z], r[x], r[y]); }
|
||||
else { a->fmul4s(tmp(), r[x], r[y]);
|
||||
a->fadd4s(dst(), tmp(), r[z]); }
|
||||
if (avail & (1<<r[z])) { set_dst(r[z]); a->fmla4s( r[z], r[x], r[y]); }
|
||||
else { a->orr16b(tmp(), r[z], r[z]);
|
||||
a->fmla4s(tmp(), r[x], r[y]);
|
||||
if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } }
|
||||
break;
|
||||
|
||||
|
||||
|
@ -370,6 +370,34 @@ DEF_TEST(SkVM_LoopCounts, r) {
|
||||
});
|
||||
}
|
||||
|
||||
DEF_TEST(SkVM_mad, r) {
|
||||
// This program is designed to exercise the tricky corners of instruction
|
||||
// and register selection for Op::mad_f32.
|
||||
|
||||
skvm::Builder b;
|
||||
{
|
||||
skvm::Arg arg = b.arg<int>();
|
||||
|
||||
skvm::F32 x = b.to_f32(b.load32(arg)),
|
||||
y = b.mad(x,x,x), // x is needed in the future, so r[x] != r[y].
|
||||
z = b.mad(y,y,x), // y is needed in the future, but r[z] = r[x] is ok.
|
||||
w = b.mad(z,z,y), // w can alias z but not y.
|
||||
v = b.mad(w,y,w); // Got to stop somewhere.
|
||||
b.store32(arg, b.to_i32(v));
|
||||
}
|
||||
|
||||
test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
|
||||
int x = 2;
|
||||
program.eval(1, &x);
|
||||
// x = 2
|
||||
// y = 2*2 + 2 = 6
|
||||
// z = 6*6 + 2 = 38
|
||||
// w = 38*38 + 6 = 1450
|
||||
// v = 1450*6 + 1450 = 10150
|
||||
REPORTER_ASSERT(r, x == 10150);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
template <typename Fn>
|
||||
static void test_asm(skiatest::Reporter* r, Fn&& fn, std::initializer_list<uint8_t> expected) {
|
||||
|
Loading…
Reference in New Issue
Block a user