proposed: add bytes() op

I'm staring at this assembly,

	vmovups	(%rsi), %ymm3
	vpsrld	$24, %ymm3, %ymm4
	vpslld	$16, %ymm4, %ymm15
	vorps	%ymm4, %ymm15, %ymm4
	vpsubw	%ymm4, %ymm0, %ymm4

Just knowing that could be

	vmovups	(%rsi), %ymm3
	vpshufb	 0x??(%rip), %ymm3, %ymm4
	vpsubw	%ymm4, %ymm0, %ymm4

That is, instead of shifting, shifting, and bit-oring
to create the 0a0a scale factor from ymm3, we could just
byte shuffle directly using some pre-baked control pattern
(stored at the end of the program like other constants)

pshufb lets you arbitrarily remix bytes from its argument and
inject zero bytes, and NEON has a similar family of vtbl instructions,
even including that same feature of injecting zeroes.

I think I've got this working, and the speedup is great,
from 0.19 to 0.16 ns/px for I32_SWAR, and
from 0.43 to 0.38 ns/px for I32.

Change-Id: Iab850275e826b4187f0efc9495a4b9eab4402c38
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/220871
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
This commit is contained in:
Mike Klein 2019-06-13 16:43:18 -05:00 committed by Skia Commit-Bot
parent e5c1f97de1
commit 342b1b2753
5 changed files with 124 additions and 22 deletions

View File

@@ -571,14 +571,14 @@ r0 = splat FF (3.5733111e-43)
r1 = splat 100 (3.5873241e-43)
loop:
r2 = load32 arg(0)
r3 = extract r2 0 r0
r4 = extract r2 8 r0
r5 = extract r2 16 r0
r3 = bit_and r2 r0
r4 = bytes r2 2
r5 = bytes r2 3
r2 = shr r2 24
r6 = load32 arg(1)
r7 = extract r6 0 r0
r8 = extract r6 8 r0
r9 = extract r6 16 r0
r7 = bit_and r6 r0
r8 = bytes r6 2
r9 = bytes r6 3
r6 = shr r6 24
r10 = sub_i32 r1 r2
r7 = mul_i16x2 r7 r10
@@ -599,14 +599,13 @@ r10 = pack r8 r10 16
store32 arg(1) r10
I32 (SWAR) 8888 over 8888
7 registers, 17 instructions:
7 registers, 16 instructions:
r0 = splat 1000100 (2.3510604e-38)
r1 = splat FF00FF (2.3418409e-38)
r2 = splat FF00FF00 (-1.7146522e+38)
loop:
r3 = load32 arg(0)
r4 = shr r3 24
r4 = pack r4 r4 16
r4 = bytes r3 404
r4 = sub_i16x2 r0 r4
r5 = load32 arg(1)
r6 = bit_and r5 r1

View File

@@ -239,6 +239,10 @@ namespace skvm {
return {this->push(Op::pack, x.id,y.id,NA, 0,bits)};
}
// Emit an Op::bytes instruction: shuffle the bytes of x according to the
// packed control nibbles (0 selects a zero byte, n selects x's (n-1)'th byte).
I32 Builder::bytes(I32 x, int control) {
    // The control pattern rides along in the immy slot; y and z are unused.
    auto id = this->push(Op::bytes, x.id, NA, NA, control);
    return {id};
}
F32 Builder::to_f32(I32 x) { return {this->push(Op::to_f32, x.id)}; }
I32 Builder::to_i32(F32 x) { return {this->push(Op::to_i32, x.id)}; }
@@ -248,6 +252,7 @@ namespace skvm {
struct R { ID id; };
struct Shift { int bits; };
struct Splat { int bits; };
struct Hex { int bits; };
static void write(SkWStream* o, const char* s) {
o->writeText(s);
@@ -277,6 +282,9 @@ namespace skvm {
o->writeScalarAsText(f);
write(o, ")");
}
// Dump a Hex-wrapped immediate (e.g. a bytes() control pattern) as hex text.
static void write(SkWStream* o, Hex hex) {
    o->writeHexAsText(hex.bits);
}
template <typename T, typename... Ts>
static void write(SkWStream* o, T first, Ts... rest) {
@@ -332,6 +340,8 @@ namespace skvm {
case Op::extract: write(o, V{id}, "= extract", V{x}, Shift{immy}, V{z}); break;
case Op::pack: write(o, V{id}, "= pack", V{x}, V{y}, Shift{immz}); break;
case Op::bytes: write(o, V{id}, "= bytes", V{x}, Hex{immy}); break;
case Op::to_f32: write(o, V{id}, "= to_f32", V{x}); break;
case Op::to_i32: write(o, V{id}, "= to_i32", V{x}); break;
}
@@ -389,6 +399,8 @@ namespace skvm {
case Op::extract: write(o, R{d}, "= extract", R{x}, Shift{y.imm}, R{z.id}); break;
case Op::pack: write(o, R{d}, "= pack", R{x}, R{y.id}, Shift{z.imm}); break;
case Op::bytes: write(o, R{d}, "= bytes", R{x}, Hex{y.imm}); break;
case Op::to_f32: write(o, R{d}, "= to_f32", R{x}); break;
case Op::to_i32: write(o, R{d}, "= to_i32", R{x}); break;
}
@@ -430,8 +442,15 @@ namespace skvm {
#endif
// Label / 4-byte values we need to write after ret.
std::vector<std::pair<Xbyak::Label, int>> splats;
// Label / N-byte values we need to write after ret.
struct Data4 { Xbyak::Label label; int bits ; };
struct Data32 { Xbyak::Label label; int bits[8]; };
std::vector<Data4 > data4;
std::vector<Data32> data32;
// Map from our bytes() control y.imm to index in data32;
// no need to splat out duplicate bytes for the same control.
std::unordered_map<int, int> vpshufb_masks;
for (int i = 0; i < (int)instructions.size(); i++) {
if (i == loop) {
@@ -468,8 +487,8 @@
case Op::load8: vpmovzxbd(r[d], ptr[arg[y.imm]]); break;
case Op::load32: vmovups (r[d], ptr[arg[y.imm]]); break;
case Op::splat: splats.emplace_back(Xbyak::Label(), y.imm);
vbroadcastss(r[d], ptr[rip + splats.back().first]);
case Op::splat: data4.push_back(Data4{Xbyak::Label(), y.imm});
vbroadcastss(r[d], ptr[rip + data4.back().label]);
break;
case Op::add_f32: vaddps(r[d], r[x], r[y.id]); break;
@@ -514,6 +533,47 @@
case Op::to_f32: vcvtdq2ps (r[d], r[x]); break;
case Op::to_i32: vcvttps2dq(r[d], r[x]); break;
case Op::bytes: {
if (vpshufb_masks.end() == vpshufb_masks.find(y.imm)) {
// Translate bytes()'s control nibbles to vpshufb's control bytes.
auto nibble_to_vpshufb = [](unsigned n) -> uint8_t {
return n == 0 ? 0xff // Fill with zero.
: n-1; // Select n'th 1-indexed byte.
};
uint8_t control[] = {
nibble_to_vpshufb( (y.imm >> 0) & 0xf ),
nibble_to_vpshufb( (y.imm >> 4) & 0xf ),
nibble_to_vpshufb( (y.imm >> 8) & 0xf ),
nibble_to_vpshufb( (y.imm >> 12) & 0xf ),
};
// Now, vpshufb is one of those weird AVX instructions
// that does everything in 2 128-bit chunks, so we'll
// only really need 4 distinct values to write in our pattern:
int p[4];
for (int i = 0; i < 4; i++) {
p[i] = (int)control[0] << 0
| (int)control[1] << 8
| (int)control[2] << 16
| (int)control[3] << 24;
// Update each byte that refers to a byte index by 4 to
// point into the next 32-bit lane, but leave any 0xff
// that fills with zero alone.
control[0] += control[0] == 0xff ? 0 : 4;
control[1] += control[1] == 0xff ? 0 : 4;
control[2] += control[2] == 0xff ? 0 : 4;
control[3] += control[3] == 0xff ? 0 : 4;
}
// Notice, same patterns for top 4 32-bit lanes as bottom.
data32.push_back(Data32{Xbyak::Label(), {p[0], p[1], p[2], p[3],
p[0], p[1], p[2], p[3]}});
vpshufb_masks[y.imm] = data32.size() - 1;
}
vpshufb(r[d], r[x], ptr[rip + data32[vpshufb_masks[y.imm]].label]);
} break;
}
}
@@ -526,10 +586,17 @@ namespace skvm {
vzeroupper();
ret();
for (auto splat : splats) {
for (auto data : data4) {
align(4);
L(splat.first);
dd(splat.second);
L(data.label);
dd(data.bits);
}
for (auto data : data32) {
align(32);
L(data.label);
for (int i = 0; i < 8; i++) {
dd(data.bits[i]);
}
}
}
};

View File

@@ -26,6 +26,7 @@ namespace skvm {
shl, shr, sra,
extract,
pack,
bytes,
to_f32, to_i32,
};
@@ -115,6 +116,28 @@ namespace skvm {
I32 extract(I32 x, int bits, I32 z); // (x >> bits) & z
I32 pack (I32 x, I32 y, int bits); // x | (y << bits)
// Shuffle the bytes in x according to each nibble of control, as if
//
// uint8_t bytes[] = {
// 0,
// ((uint32_t)x ) & 0xff,
// ((uint32_t)x >> 8) & 0xff,
// ((uint32_t)x >> 16) & 0xff,
// ((uint32_t)x >> 24) & 0xff,
// };
// return (uint32_t)bytes[(control >> 0) & 0xf] << 0
// | (uint32_t)bytes[(control >> 4) & 0xf] << 8
// | (uint32_t)bytes[(control >> 8) & 0xf] << 16
// | (uint32_t)bytes[(control >> 12) & 0xf] << 24;
//
// So, e.g.,
// - bytes(x, 0x1111) splats the low byte of x to all four bytes
// - bytes(x, 0x4321) is x, an identity
// - bytes(x, 0x0000) is 0
// - bytes(x, 0x0404) transforms an RGBA pixel into an A0A0 bit pattern.
//
I32 bytes(I32 x, int control);
F32 to_f32(I32 x);
I32 to_i32(F32 x);

View File

@@ -148,6 +148,20 @@ namespace SK_OPTS_NS {
CASE(Op::extract): r(d).u32 = (r(x).u32 >> y.imm) & r(z.id).u32; break;
CASE(Op::pack): r(d).u32 = r(x).u32 | (r(y.id).u32 << z.imm); break;
CASE(Op::bytes): {
const U32 table[] = {
0,
(r(x).u32 ) & 0xff,
(r(x).u32 >> 8) & 0xff,
(r(x).u32 >> 16) & 0xff,
(r(x).u32 >> 24) & 0xff,
};
r(d).u32 = table[(y.imm >> 0) & 0xf] << 0
| table[(y.imm >> 4) & 0xf] << 8
| table[(y.imm >> 8) & 0xf] << 16
| table[(y.imm >> 12) & 0xf] << 24;
} break;
CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
CASE(Op::to_i32): r(d).i32 = skvx::cast<int> (r(x).f32); break;
#undef CASE

View File

@@ -97,9 +97,9 @@ SrcoverBuilder_I32::SrcoverBuilder_I32() {
auto load = [&](skvm::Arg ptr,
skvm::I32* r, skvm::I32* g, skvm::I32* b, skvm::I32* a) {
skvm::I32 rgba = load32(ptr);
*r = extract(rgba, 0, splat(0xff));
*g = extract(rgba, 8, splat(0xff));
*b = extract(rgba, 16, splat(0xff));
*r = bit_and(rgba, splat(0xff));
*g = bytes (rgba, 0x0002);
*b = bytes (rgba, 0x0003);
*a = shr (rgba, 24);
};
@@ -141,12 +141,11 @@ SrcoverBuilder_I32_SWAR::SrcoverBuilder_I32_SWAR() {
// The s += d*invA adds won't overflow,
// so we don't have to unpack s beyond grabbing the alpha channel.
skvm::I32 s = load32(src),
a = shr(s, 24);
ax2 = bytes(s, 0x0404); // rgba -> a0a0
// We'll use the same approximation math as above, this time making sure to
// use both i16 multiplies to our benefit, one for r/g, the other for b/a.
skvm::I32 ax2 = pack(a,a,16),
invAx2 = sub_16x2(splat(0x01000100), ax2);
skvm::I32 invAx2 = sub_16x2(splat(0x01000100), ax2);
skvm::I32 d = load32(dst),
rb = bit_and (d, splat(0x00ff00ff)),