hoist tbl masks

This adds a warmup phase that lets each instruction do any setup it
needs: adding lookup entries for splat and bytes immediates, and, on
aarch64, hoisting the tbl mask into a register when we can.

Oddly, this measures as a ~3x slowdown on the phone I'm testing with, an
international Galaxy S9 with a Samsung Mongoose 3 processor.  I've got to
imagine this somehow makes the processor think there's a loop-carried
dependency when there isn't one?  Anyway, we already know that's a pretty
crazy CPU (it reports FP16 compute support but can't actually do it), and
this does deliver a speedup on the Pixel 2's Kryo 280 / Cortex-A73, so I
think maybe I'll just swap back to testing with the Pixel 2 and forget
about that S9.

Here's a before/after code listing with a hoisted tbl mask.  In the
before case the mask is loaded inside the loop with `ldr q3, #152`; in
the after case it's loaded once outside the loop with `ldr q0, #168`.
llvm-mca says this should cut one cycle per loop iteration, and with
optimal out-of-order execution the loop cost would drop from ~8.7 cycles
to ~8.3.  In practice, it looks like about a 15% speedup.
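
To reproduce the static estimate, feeding just the inner-loop
instructions to llvm-mca along these lines should land in the same
ballpark (the file name and -mcpu choice are assumptions, not something
recorded in this CL):

	llvm-mca -mtriple=aarch64-linux-gnu -mcpu=cortex-a73 loop_body.s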

before:
	ldr	q0, #188
	ldr	q1, #200
	cmp	x0, #4                  // =4
	b.lt	#76
	ldr	q2, [x1]
	ldr	q3, #152
	tbl	v3.16b, { v2.16b }, v3.16b
	sub	v3.8h, v0.8h, v3.8h
	ldr	q4, [x2]
	and	v5.16b, v4.16b, v1.16b
	ushr	v4.8h, v4.8h, #8
	mul	v5.8h, v5.8h, v3.8h
	ushr	v5.8h, v5.8h, #8
	mul	v3.8h, v4.8h, v3.8h
	bic	v3.16b, v3.16b, v1.16b
	orr	v3.16b, v5.16b, v3.16b
	add	v2.4s, v2.4s, v3.4s
	str	q2, [x2]
	add	x1, x1, #16             // =16
	add	x2, x2, #16             // =16
	sub	x0, x0, #4              // =4
	b.al	#-76
	cmp	x0, #1                  // =1
	b.lt	#76
	ldr	s2, [x1]
	ldr	q3, #72
	tbl	v3.16b, { v2.16b }, v3.16b
	sub	v3.8h, v0.8h, v3.8h
	ldr	s4, [x2]
	and	v5.16b, v4.16b, v1.16b
	ushr	v4.8h, v4.8h, #8
	mul	v5.8h, v5.8h, v3.8h
	ushr	v5.8h, v5.8h, #8
	mul	v3.8h, v4.8h, v3.8h
	bic	v3.16b, v3.16b, v1.16b
	orr	v3.16b, v5.16b, v3.16b
	add	v2.4s, v2.4s, v3.4s
	str	s2, [x2]
	add	x1, x1, #4              // =4
	add	x2, x2, #4              // =4
	sub	x0, x0, #1              // =1
	b.al	#-76
	ret

after:
	ldr	q0, #168
	ldr	q1, #180
	ldr	q2, #192
	cmp	x0, #4                  // =4
	b.lt	#72
	ldr	q3, [x1]
	tbl	v4.16b, { v3.16b }, v0.16b
	sub	v4.8h, v1.8h, v4.8h
	ldr	q5, [x2]
	and	v6.16b, v5.16b, v2.16b
	ushr	v5.8h, v5.8h, #8
	mul	v6.8h, v6.8h, v4.8h
	ushr	v6.8h, v6.8h, #8
	mul	v4.8h, v5.8h, v4.8h
	bic	v4.16b, v4.16b, v2.16b
	orr	v4.16b, v6.16b, v4.16b
	add	v3.4s, v3.4s, v4.4s
	str	q3, [x2]
	add	x1, x1, #16             // =16
	add	x2, x2, #16             // =16
	sub	x0, x0, #4              // =4
	b.al	#-72
	cmp	x0, #1                  // =1
	b.lt	#72
	ldr	s3, [x1]
	tbl	v4.16b, { v3.16b }, v0.16b
	sub	v4.8h, v1.8h, v4.8h
	ldr	s5, [x2]
	and	v6.16b, v5.16b, v2.16b
	ushr	v5.8h, v5.8h, #8
	mul	v6.8h, v6.8h, v4.8h
	ushr	v6.8h, v6.8h, #8
	mul	v4.8h, v5.8h, v4.8h
	bic	v4.16b, v4.16b, v2.16b
	orr	v4.16b, v6.16b, v4.16b
	add	v3.4s, v3.4s, v4.4s
	str	s3, [x2]
	add	x1, x1, #4              // =4
	add	x2, x2, #4              // =4
	sub	x0, x0, #1              // =1
	b.al	#-72
	ret
Change-Id: I352a98d3ac2ad84c338330ef4cfae0292a0b32da
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/229064
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
Mike Klein 2019-07-22 15:31:01 -05:00 committed by Skia Commit-Bot
parent 00c680d2bb
commit fae63e477c

@@ -1062,8 +1062,48 @@ namespace skvm {
         auto hoisted = [&](Val id) { return hoist && instructions[id].hoist; };

         std::vector<Reg> r(instructions.size());
-        SkTHashMap<int, A::Label> bytes_masks,
-                                  splats;
+        struct LabelAndReg {
+            A::Label label;
+            Reg reg;
+        };
+        SkTHashMap<int, LabelAndReg> splats,
+                                     bytes_masks;
+
+        auto warmup = [&](Val id) {
+            const Builder::Instruction& inst = instructions[id];
+            if (inst.death == 0) {
+                return true;
+            }
+
+            Op op = inst.op;
+            int imm = inst.imm;
+
+            switch (op) {
+                default: break;
+
+                case Op::splat: if (!splats.find(imm)) { splats.set(imm, {}); }
+                                break;
+
+                case Op::bytes: if (!bytes_masks.find(imm)) { bytes_masks.set(imm, {}); }
+                                if (hoist) {
+                                    // vpshufb can always work with the mask from memory,
+                                    // but it helps to hoist the mask to a register for tbl.
+                                #if defined(__aarch64__)
+                                    LabelAndReg* entry = bytes_masks.find(imm);
+                                    if (int found = __builtin_ffs(avail)) {
+                                        entry->reg = (Reg)(found-1);
+                                        avail ^= 1 << entry->reg;
+                                        a->ldrq(entry->reg, &entry->label);
+                                    } else {
+                                        return false;
+                                    }
+                                #endif
+                                }
+                                break;
+            }
+            return true;
+        };

         auto emit = [&](Val id, bool scalar) {
             const Builder::Instruction& inst = instructions[id];
@@ -1182,15 +1222,14 @@ namespace skvm {
                                  else { a->vmovups( dst(), arg[imm]); }
                                  break;

-                case Op::splat: if (!splats.find(imm)) { splats.set(imm, {}); }
-                                a->vbroadcastss(dst(), splats.find(imm));
-                                break;
-                                // TODO: many of these instructions have variants that
-                                // can read one of their arugments from 32-byte memory
-                                // instead of a register. Find a way to avoid needing
-                                // to splat most* constants out at all?
-                                // (*Might work for x - 255 but not 255 - x, so will
-                                // always need to be able to splat to a register.)
+                case Op::splat: a->vbroadcastss(dst(), &splats.find(imm)->label);
+                                break;
+                                // TODO: many of these instructions have variants that
+                                // can read one of their arugments from 32-byte memory
+                                // instead of a register. Find a way to avoid needing
+                                // to splat most* constants out at all?
+                                // (*Might work for x - 255 but not 255 - x, so will
+                                // always need to be able to splat to a register.)

                 case Op::add_f32: a->vaddps(dst(), r[x], r[y]); break;
                 case Op::sub_f32: a->vsubps(dst(), r[x], r[y]); break;
@@ -1236,9 +1275,9 @@ namespace skvm {
                 case Op::to_f32: a->vcvtdq2ps (dst(), r[x]); break;
                 case Op::to_i32: a->vcvttps2dq(dst(), r[x]); break;

-                case Op::bytes: if (!bytes_masks.find(imm)) { bytes_masks.set(imm, {}); }
-                                a->vpshufb(dst(), r[x], bytes_masks.find(imm));
-                                break;
+                case Op::bytes: a->vpshufb(dst(), r[x], &bytes_masks.find(imm)->label);
+                                break;
             #elif defined(__aarch64__)
                 case Op::store8: a->xtns2h(tmp(), r[x]);
                                  a->xtnh2b(tmp(), tmp());
@@ -1261,12 +1300,11 @@ namespace skvm {
                                  else { a->ldrq(dst(), arg[imm]); }
                                  break;

-                case Op::splat: if (!splats.find(imm)) { splats.set(imm, {}); }
-                                a->ldrq(dst(), splats.find(imm));
-                                break;
-                                // TODO: If we hoist these, pack 4 values in each register
-                                // and use vector/lane operations, cutting the register
-                                // pressure cost of hoisting by 4?
+                case Op::splat: a->ldrq(dst(), &splats.find(imm)->label);
+                                break;
+                                // TODO: If we hoist these, pack 4 values in each register
+                                // and use vector/lane operations, cutting the register
+                                // pressure cost of hoisting by 4?

                 case Op::add_f32: a->fadd4s(dst(), r[x], r[y]); break;
                 case Op::sub_f32: a->fsub4s(dst(), r[x], r[y]); break;
@@ -1312,10 +1350,10 @@ namespace skvm {
                 case Op::to_f32: a->scvtf4s (dst(), r[x]); break;
                 case Op::to_i32: a->fcvtzs4s(dst(), r[x]); break;

-                case Op::bytes: if (!bytes_masks.find(imm)) { bytes_masks.set(imm, {}); }
-                                a->ldrq(tmp(), bytes_masks.find(imm)); // TODO: hoist these
-                                a->tbl (dst(), r[x], tmp());
-                                break;
+                case Op::bytes: if (hoist) { a->tbl (dst(), r[x], bytes_masks.find(imm)->reg); }
+                                else       { a->ldrq(tmp(), &bytes_masks.find(imm)->label);
+                                             a->tbl (dst(), r[x], tmp()); }
+                                break;
             #endif
             }
@@ -1349,6 +1387,9 @@ namespace skvm {
                  done;

         for (Val id = 0; id < (Val)instructions.size(); id++) {
+            if (!warmup(id)) {
+                return false;
+            }
             if (hoisted(id) && !emit(id, /*scalar=*/false)) {
                 return false;
             }
@@ -1391,7 +1432,7 @@ namespace skvm {
             exit();
         }

-        bytes_masks.foreach([&](int imm, A::Label* l) {
+        bytes_masks.foreach([&](int imm, LabelAndReg* entry) {
             // One 16-byte pattern for ARM tbl, that same pattern twice for x86-64 vpshufb.
         #if defined(__x86_64__)
             a->align(32);
@@ -1399,7 +1440,7 @@ namespace skvm {
             a->align(4);
         #endif
-            a->label(l);
+            a->label(&entry->label);
             int mask[4];
             bytes_control(imm, mask);
             a->bytes(mask, sizeof(mask));
@@ -1408,10 +1449,10 @@ namespace skvm {
         #endif
         });

-        splats.foreach([&](int imm, A::Label* l) {
+        splats.foreach([&](int imm, LabelAndReg* entry) {
             // vbroadcastss 4 bytes on x86-64, or simply load 16-bytes on aarch64.
             a->align(4);
-            a->label(l);
+            a->label(&entry->label);
             a->word(imm);
         #if defined(__aarch64__)
             a->word(imm);