eliminate the need for a tmp ymm register

Like any other Instruction, the store*s are assigned a destination register d, which doesn't really make sense, but works perfectly as a temporary register. This means store8 doesn't need to reserve xmm/ymm15 as a temporary... it already has one naturally.

As you might expect, the examples we have so far assign the consumed input x register as the d register, so things that used to look like

    vpackusdw %ymm6 ,%ymm6 ,%ymm15
    vpermq    $0xd8 ,%ymm15,%ymm15
    vpackuswb %ymm15,%ymm15,%ymm15
    vmovq     %xmm15,(%rdx)

now look more like

    vpackusdw %ymm6,%ymm6,%ymm6
    vpermq    $0xd8,%ymm6,%ymm6
    vpackuswb %ymm6,%ymm6,%ymm6
    vmovq     %xmm6,(%rdx)

Should be no perf difference, just simplified register bookkeeping.

This may suggest splitting load8/store8 into finer instructions, two to do the physical loads and stores, and two for the 8->32 and 32->8 widen and narrow? On the other hand load8 really is just one vpmovzxbd instruction, so it'd be a shame to split it. I suspect this will become clearer as I add 16-bit support.

Change-Id: I7c2b4d6b1689d40b50382f65fc00c01c54529c8a
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/220543
Reviewed-by: Brian Osman <brianosman@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
2019-06-12 10:04:37 -05:00 · 2019-06-12 10:04:37 -05:00 · d3cc16c8bb
commit d3cc16c8bb
parent 072e6fc374
1 changed files with 9 additions and 7 deletions
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@ -419,9 +419,8 @@ namespace skvm {
                // All 16 ymm registers are available as scratch.
                Xbyak::Ymm r[] = {
                    ymm0, ymm1, ymm2 , ymm3 , ymm4 , ymm5 , ymm6 , ymm7 ,
-                    ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14,
-                }, tmp = ymm15;
-                Xbyak::Xmm tmplo = xmm15;
+                    ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15,
+                };
             #endif

                // Label / 4-byte values we need to write after ret.
@ -441,10 +440,13 @@ namespace skvm {
                         z = inst.z;
                    switch (op) {
                        case Op::store8:
-                            vpackusdw(tmp, r[x], r[x]);    // pack 32-bit -> 16-bit
-                            vpermq   (tmp, tmp, 0xd8);     // u64 tmp[0,1,2,3] = tmp[0,2,1,3]
-                            vpackuswb(tmp, tmp, tmp);      // pack 16-bit -> 8-bit
-                            vmovq(ptr[arg[y.imm]], tmplo); // store low 8 bytes
+                            // Like any other instruction, store8 has been assigned
+                            // a "destination" register we can use as a temporary scratch.
+                            vpackusdw(r[d], r[x], r[x]);       // pack 32-bit -> 16-bit
+                            vpermq   (r[d], r[d], 0xd8);       // u64 tmp[0,1,2,3] = tmp[0,2,1,3]
+                            vpackuswb(r[d], r[d], r[d]);       // pack 16-bit -> 8-bit
+                            vmovq(ptr[arg[y.imm]],             // store low 8 bytes
+                                  Xbyak::Xmm{r[d].getIdx()});  // (arg must be an xmm register)
                            break;

                        case Op::store32: vmovups(ptr[arg[y.imm]], r[x]); break;