finish up arm64 ops

Some small refactoring to common up redundant opcode building.

Oddly, I think I've got better codegen than what Clang would do here. Clang doesn't generate uxtl-based code to unpack 8-bit to 32-bit, instead preferring to load each byte one at a time and insert them one at a time.

Me:
    ldr  s0, [x0]
    uxtl v0.8h, v0.8b
    uxtl v0.4s, v0.8h

Clang:
    ldrb w8, [x0]
    ldrb w9, [x0, #1]
    ldrb w10, [x0, #2]
    ldrb w11, [x0, #3]
    fmov s0, w8
    mov  v0.s[1], w9
    mov  v0.s[2], w10
    mov  v0.s[3], w11

Change-Id: I0fdf5c6cdcde6a4eb9290936284fd3ffcb2159f6
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/224821
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
2019-07-01 11:18:08 -05:00 · 2019-07-01 11:18:08 -05:00 · 1fa149a713
commit 1fa149a713
parent e0b2dafeb6
3 changed files with 61 additions and 37 deletions
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@ -741,36 +741,34 @@ namespace skvm {

    void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }

-    void Assembler::shift(uint32_t op, int imm, V n, V d) {
-        this->word( (op & 22_mask) << 10
-                  | imm            << 16   // imm is embedded inside op, bit size depends on op
-                  | (n &   5_mask) <<  5
-                  | (d &   5_mask) <<  0);
+    void Assembler::op(uint32_t op22, int imm, V n, V d) {  // builds one word from opcode, immediate and two register fields
+        this->word( (op22 & 22_mask) << 10   // opcode bits (presumably the low 22 of op22) placed from bit 10 up
+                  | imm              << 16   // imm is embedded inside op, bit size depends on op
+                  | (n    &  5_mask) <<  5   // n register field, starting at bit 5
+                  | (d    &  5_mask) <<  0); // d register field, starting at bit 0
    }

    void Assembler::shl4s(V d, V n, int imm) {
-        this->shift(0b0'1'0'011110'0100'000'01010'1,    ( imm&31), n, d);
+        this->op(0b0'1'0'011110'0100'000'01010'1,    ( imm&31), n, d);  // shift amount is passed masked to 5 bits
    }
    void Assembler::sshr4s(V d, V n, int imm) {
-        this->shift(0b0'1'0'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
+        this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);  // negated shift amount, masked to 5 bits; NOTE(review): presumably matches how right shifts are encoded, confirm against the ARM reference
    }
    void Assembler::ushr4s(V d, V n, int imm) {
-        this->shift(0b0'1'1'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
+        this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);  // negated shift amount, masked to 5 bits (same immediate form as sshr4s)
    }
    void Assembler::ushr8h(V d, V n, int imm) {
-        this->shift(0b0'1'1'011110'0010'000'00'0'0'0'1, (-imm&15), n, d);
+        this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, (-imm&15), n, d);  // negated shift amount, masked to 4 bits here rather than 5
    }

-    void Assembler::scvtf4s(V d, V n) {
-        this->word(0b0'1'0'01110'0'0'10000'11101'10 << 10
-                  | (n & 5_mask) << 5
-                  | (d & 5_mask) << 0);
-    }
-    void Assembler::fcvtzs4s(V d, V n) {
-        this->word(0b0'1'0'01110'1'0'10000'1101'1'10 << 10
-                  | (n & 5_mask) << 5
-                  | (d & 5_mask) << 0);
-    }
+    void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }  // int -> float
+    void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }  // truncate float -> int
+
+    void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }  // u32 -> u16
+    void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }  // u16 -> u8
+
+    void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }  // u8 -> u16
+    void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }  // u16 -> u32

    void Assembler::ret(X n) {
        this->word(0b1101011'0'0'10'11111'0000'0'0 << 10
@ -784,7 +782,7 @@ namespace skvm {
                  | (d     &  5_mask) <<  0);
    }
    void Assembler::subs(X d, X n, int imm12) {
-        this->word( 0b1'1'1'10001'00     << 22
+        this->word( 0b1'1'1'10001'00  << 22
                  | (imm12 & 12_mask) << 10
                  | (n     &  5_mask) <<  5
                  | (d     &  5_mask) <<  0);
@ -798,17 +796,11 @@ namespace skvm {
                  | 0b0'0001          <<  0);
    }

-    void Assembler::ldrq(V dst, X src) {
-        this->word( 0b00'111'1'01'11'000000000000 << 10
-                  | (src & 5_mask) << 5
-                  | (dst & 5_mask) << 0);
-    }
+    void Assembler::ldrq(V dst, X src) { this->op(0b00'111'1'01'11'000000000000, src, dst); }  // 128-bit dst = *src
+    void Assembler::ldrs(V dst, X src) { this->op(0b10'111'1'01'01'000000000000, src, dst); }  //  32-bit dst[0] = *src

-    void Assembler::strq(V src, X dst) {
-        this->word( 0b00'111'1'01'10'000000000000 << 10
-                  | (dst & 5_mask) << 5
-                  | (src & 5_mask) << 0);
-    }
+    void Assembler::strq(V src, X dst) { this->op(0b00'111'1'01'10'000000000000, dst, src); }  // 128-bit *dst = src
+    void Assembler::strs(V src, X dst) { this->op(0b10'111'1'01'00'000000000000, dst, src); }  //  32-bit *dst = src[0]

    void Assembler::ldrq(V dst, Label l) {
        const int imm19 = (l.offset - here().offset) / 4;
@ -1095,11 +1087,16 @@ namespace skvm {
                  z = inst.z;
            int imm = inst.imm;
            switch (op) {
-            #define TODO if (0) SkDebugf("op %d\n", op); return 0
-                case Op::store8: TODO;
+                case Op::store8: a.xtns2h(r(tmp), r(x));
+                                 a.xtnh2b(r(tmp), r(tmp));
+                                 a.strs  (r(tmp), arg[imm]);
+                                 break;
                case Op::store32: a.strq(r(x), arg[imm]); break;

-                case Op::load8: TODO;
+                case Op::load8: a.ldrs   (r(tmp), arg[imm]);
+                                a.uxtlb2h(r(tmp), r(tmp));
+                                a.uxtlh2s(r(d)  , r(tmp));
+                                break;
                case Op::load32: a.ldrq(r(d), arg[imm]); break;

                case Op::splat: a.ldrq(r(d), *splats.find(imm)); break;
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@ -118,7 +118,12 @@ namespace skvm {

        // d = op(n)
        using DOpN = void(V d, V n);
-        DOpN scvtf4s, fcvtzs4s;
+        DOpN scvtf4s,   // int -> float
+             fcvtzs4s,  // truncate float -> int
+             xtns2h,    // u32 -> u16
+             xtnh2b,    // u16 -> u8
+             uxtlb2h,   // u8 -> u16
+             uxtlh2s;   // u16 -> u32

        // TODO: both these platforms support rounding float->int (vcvtps2dq, fcvtns.4s)... use?

@ -129,7 +134,9 @@ namespace skvm {

        void ldrq(V dst, Label);  // 128-bit PC-relative load
        void ldrq(V dst, X src);  // 128-bit dst = *src
+        void ldrs(V dst, X src);  //  32-bit dst[0] = *src
        void strq(V src, X dst);  // 128-bit *dst = src
+        void strs(V src, X dst);  //  32-bit *dst = src[0]

    private:
        // dst = op(dst, imm)
@ -155,12 +162,14 @@ namespace skvm {
        // *ptr = ymm or ymm = *ptr, depending on opcode.
        void load_store(int prefix, int map, int opcode, Ymm ymm, GP64 ptr);

-        // General layout top to bottom is:
+        // Opcode for 3-argument ops is split between hi and lo:
        //    [11 bits hi] [5 bits m] [6 bits lo] [5 bits n] [5 bits d]
-        // where the opcode is split between hi and lo.
        void op(uint32_t hi, V m, uint32_t lo, V n, V d);

-        void shift(uint32_t op, int imm, V n, V d);
+        // 2-argument ops, with or without an immediate.
+        void op(uint32_t op22, int imm, V n, V d);
+        void op(uint32_t op22, V n, V d) { this->op(op22,0,n,d); }  // no immediate: forwards with imm = 0
+        void op(uint32_t op22, X x, V v) { this->op(op22,0,(V)x,v); }  // X register cast to V to fill the n slot; imm = 0

        uint8_t* fCode;
        size_t   fSize;
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@ -492,4 +492,22 @@ DEF_TEST(SkVM_Assembler, r) {
        0x00, 0x01, 0xc0, 0x3d,
        0x00, 0x01, 0x80, 0x3d,
    });
+
+    test_asm(r, [&](A& a) {
+        a.xtns2h(A::v0, A::v0);
+        a.xtnh2b(A::v0, A::v0);
+        a.strs  (A::v0, A::x0);
+
+        a.ldrs   (A::v0, A::x0);
+        a.uxtlb2h(A::v0, A::v0);
+        a.uxtlh2s(A::v0, A::v0);
+    },{
+        0x00,0x28,0x61,0x0e,
+        0x00,0x28,0x21,0x0e,
+        0x00,0x00,0x00,0xbd,
+
+        0x00,0x00,0x40,0xbd,
+        0x00,0xa4,0x08,0x2f,
+        0x00,0xa4,0x10,0x2f,
+    });
 }