eliminate the need for a tmp ymm register
Like any other Instruction, the store* ops are assigned a destination register d, which doesn't really make sense, but works perfectly as a temporary register. This means store8 doesn't need to reserve xmm/ymm15 as a temporary... it already has one naturally.

As you might expect, the examples we have so far assign the consumed input x register as the d register, so things that used to look like

    vpackusdw %ymm6 ,%ymm6 ,%ymm15
    vpermq    $0xd8 ,%ymm15,%ymm15
    vpackuswb %ymm15,%ymm15,%ymm15
    vmovq     %xmm15,(%rdx)

now look more like

    vpackusdw %ymm6,%ymm6,%ymm6
    vpermq    $0xd8,%ymm6,%ymm6
    vpackuswb %ymm6,%ymm6,%ymm6
    vmovq     %xmm6,(%rdx)

Should be no perf difference, just simplified register bookkeeping.

This may suggest splitting load8/store8 into finer instructions, two to do the physical loads and stores, and two for the 8->32 and 32->8 widen and narrow? On the other hand load8 really is just one vpmovzxbd instruction, so it'd be a shame to split it. I suspect this will become more clear as I add 16-bit support.

Change-Id: I7c2b4d6b1689d40b50382f65fc00c01c54529c8a
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/220543
Reviewed-by: Brian Osman <brianosman@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
parent
072e6fc374
commit
d3cc16c8bb
@ -419,9 +419,8 @@ namespace skvm {
|
||||
// All 16 ymm registers are available as scratch.
|
||||
Xbyak::Ymm r[] = {
|
||||
ymm0, ymm1, ymm2 , ymm3 , ymm4 , ymm5 , ymm6 , ymm7 ,
|
||||
ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14,
|
||||
}, tmp = ymm15;
|
||||
Xbyak::Xmm tmplo = xmm15;
|
||||
ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15,
|
||||
};
|
||||
#endif
|
||||
|
||||
// Label / 4-byte values we need to write after ret.
|
||||
@ -441,10 +440,13 @@ namespace skvm {
|
||||
z = inst.z;
|
||||
switch (op) {
|
||||
case Op::store8:
|
||||
vpackusdw(tmp, r[x], r[x]); // pack 32-bit -> 16-bit
|
||||
vpermq (tmp, tmp, 0xd8); // u64 tmp[0,1,2,3] = tmp[0,2,1,3]
|
||||
vpackuswb(tmp, tmp, tmp); // pack 16-bit -> 8-bit
|
||||
vmovq(ptr[arg[y.imm]], tmplo); // store low 8 bytes
|
||||
// Like any other instruction, store8 has been assigned
|
||||
// a "destination" register we can use as a temporary scratch.
|
||||
vpackusdw(r[d], r[x], r[x]); // pack 32-bit -> 16-bit
|
||||
vpermq (r[d], r[d], 0xd8); // u64 tmp[0,1,2,3] = tmp[0,2,1,3]
|
||||
vpackuswb(r[d], r[d], r[d]); // pack 16-bit -> 8-bit
|
||||
vmovq(ptr[arg[y.imm]], // store low 8 bytes
|
||||
Xbyak::Xmm{r[d].getIdx()}); // (arg must be an xmm register)
|
||||
break;
|
||||
|
||||
case Op::store32: vmovups(ptr[arg[y.imm]], r[x]); break;
|
||||
|
Loading…
Reference in New Issue
Block a user