Sk4px: alphas() and Load[24]Alphas()

alphas() extracts the 4 alphas from an existing Sk4px as another Sk4px.
LoadNAlphas() constructs an Sk4px from N packed alphas.

In both cases, we end up with 4x repeated alphas aligned with their pixels.

alphas()
  A0 R0 G0 B0  A1 R1 G1 B1  A2 R2 G2 B2  A3 R3 G3 B3
  ->
  A0 A0 A0 A0  A1 A1 A1 A1  A2 A2 A2 A2  A3 A3 A3 A3

Load4Alphas()
  A0 A1 A2 A3
  ->
  A0 A0 A0 A0  A1 A1 A1 A1  A2 A2 A2 A2  A3 A3 A3 A3

Load2Alphas()
  A0 A1
  ->
  A0 A0 A0 A0  A1 A1 A1 A1  0 0 0 0  0 0 0 0

This is a 5-10% speedup for AA on Intel, and a wash on ARM.
AA is still mostly dominated by the final lerp.
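
For reference, the "final lerp" is, per channel, roughly the sketch below; the
exact divide-by-255 rounding in the vectorized code differs, so treat this as
illustrative math rather than Skia's implementation:

  #include <cstdint>

  // dst' = src*aa + dst*(255 - aa), scaled back down by 255.
  static uint8_t lerp255(uint8_t src, uint8_t dst, uint8_t aa) {
      unsigned sum = src * aa + dst * (255 - aa);
      return (uint8_t)((sum + 127) / 255);   // round-to-nearest divide by 255
  }

  int main() {
      // aa == 255 selects src; aa == 0 keeps dst.
      return (lerp255(200, 50, 255) == 200 && lerp255(200, 50, 0) == 50) ? 0 : 1;
  }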

alphas() isn't used yet, but it's similar enough to Load[24]Alphas()
that it was easier to write all at once.

BUG=skia:

Review URL: https://codereview.chromium.org/1138333003
Author: mtklein, 2015-05-13 12:19:42 -07:00 (committed by Commit bot)
Parent: 5ae1312c9f
Commit: 8a90edc2a5
4 changed files with 106 additions and 11 deletions


@@ -14,14 +14,22 @@
// 1, 2 or 4 SkPMColors, generally vectorized.
class Sk4px : public Sk16b {
public:
- Sk4px(SkPMColor); // Duplicate 4x.
- Sk4px(const Sk16b& v) : Sk16b(v) {}
+ Sk4px(SkAlpha a) : INHERITED(a) {} // Duplicate 16x.
+ Sk4px(SkPMColor); // Duplicate 4x.
+ Sk4px(const Sk16b& v) : INHERITED(v) {}
+ // ARGB argb XYZW xyzw -> AAAA aaaa XXXX xxxx
+ Sk4px alphas() const;
// When loading or storing fewer than 4 SkPMColors, we use the low lanes.
static Sk4px Load4(const SkPMColor[4]);
static Sk4px Load2(const SkPMColor[2]);
static Sk4px Load1(const SkPMColor[1]);
+ // Ditto for Alphas... Load2Alphas fills the low two lanes of Sk4px.
+ static Sk4px Load4Alphas(const SkAlpha[4]); // AaXx -> AAAA aaaa XXXX xxxx
+ static Sk4px Load2Alphas(const SkAlpha[2]); // Aa -> AAAA aaaa 0000 0000
void store4(SkPMColor[4]) const;
void store2(SkPMColor[2]) const;
void store1(SkPMColor[1]) const;
@@ -111,13 +119,10 @@ public:
template <typename Fn>
static void MapDstSrcAlpha(
int count, SkPMColor* dst, const SkPMColor* src, const SkAlpha* a, Fn fn) {
- // TODO: find a terser / faster way to construct Sk16b alphas.
while (count > 0) {
if (count >= 8) {
- Sk16b alpha0(a[0],a[0],a[0],a[0], a[1],a[1],a[1],a[1],
-              a[2],a[2],a[2],a[2], a[3],a[3],a[3],a[3]),
-       alpha4(a[4],a[4],a[4],a[4], a[5],a[5],a[5],a[5],
-              a[6],a[6],a[6],a[6], a[7],a[7],a[7],a[7]);
+ Sk4px alpha0 = Load4Alphas(a+0),
+       alpha4 = Load4Alphas(a+4);
Sk4px dst0 = fn(Load4(dst+0), Load4(src+0), alpha0),
dst4 = fn(Load4(dst+4), Load4(src+4), alpha4);
dst0.store4(dst+0);
@@ -127,18 +132,17 @@ public:
}
SkASSERT(count <= 7);
if (count >= 4) {
- Sk16b alpha(a[0],a[0],a[0],a[0], a[1],a[1],a[1],a[1],
-             a[2],a[2],a[2],a[2], a[3],a[3],a[3],a[3]);
+ Sk4px alpha = Load4Alphas(a);
fn(Load4(dst), Load4(src), alpha).store4(dst);
dst += 4; src += 4; a += 4; count -= 4;
}
if (count >= 2) {
- Sk16b alpha(a[0],a[0],a[0],a[0], a[1],a[1],a[1],a[1], 0,0,0,0, 0,0,0,0);
+ Sk4px alpha = Load2Alphas(a);
fn(Load2(dst), Load2(src), alpha).store2(dst);
dst += 2; src += 2; a += 2; count -= 2;
}
if (count >= 1) {
- Sk16b alpha(a[0],a[0],a[0],a[0], 0,0,0,0, 0,0,0,0, 0,0,0,0);
+ Sk4px alpha(*a);
fn(Load1(dst), Load1(src), alpha).store1(dst);
}
break;
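
(Aside: the 8/4/2/1 structure of MapDstSrcAlpha above can be sketched without
any SIMD. The code below only mirrors the control flow, with a plain per-pixel
callback standing in for fn; the names are made up for illustration.)

  #include <cstdint>

  // Handle 8 pixels at a time while at least 8 remain, then mop up a tail of
  // at most 7 pixels with groups of 4, 2, and 1.
  template <typename Fn>
  static void map_dst_src_alpha_model(int count, uint32_t* dst, const uint32_t* src,
                                      const uint8_t* a, Fn fn) {
      auto group = [&](int n) {
          for (int i = 0; i < n; i++) { dst[i] = fn(dst[i], src[i], a[i]); }
          dst += n; src += n; a += n; count -= n;
      };
      while (count >= 8) { group(8); }
      if (count >= 4) { group(4); }
      if (count >= 2) { group(2); }
      if (count >= 1) { group(1); }
  }

  int main() {
      uint32_t dst[11] = {}, src[11];
      uint8_t  aa[11];
      for (int i = 0; i < 11; i++) { src[i] = 0x01010101u * (uint32_t)i; aa[i] = 255; }
      // Trivial fn: with full coverage, just take src.
      map_dst_src_alpha_model(11, dst, src, aa,
                              [](uint32_t, uint32_t s, uint8_t) { return s; });
      return dst[10] == src[10] ? 0 : 1;
  }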


@@ -48,3 +48,33 @@ inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
return Sk16b(vcombine_u8(vaddhn_u16(this->fLo.fVec, o.fLo.fVec),
vaddhn_u16(this->fHi.fVec, o.fHi.fVec)));
}
inline Sk4px Sk4px::alphas() const {
static_assert(SK_A32_SHIFT == 24, "This method assumes little-endian.");
auto as = vshrq_n_u32((uint32x4_t)this->fVec, 24); // ___3 ___2 ___1 ___0
as = vorrq_u32(as, vshlq_n_u32(as, 8)); // __33 __22 __11 __00
as = vorrq_u32(as, vshlq_n_u32(as, 16)); // 3333 2222 1111 0000
return Sk16b((uint8x16_t)as);
}
inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
uint8x16_t a8 = vdupq_n_u8(0); // ____ ____ ____ ____
a8 = vld1q_lane_u8(a+0, a8, 0); // ____ ____ ____ ___0
a8 = vld1q_lane_u8(a+1, a8, 4); // ____ ____ ___1 ___0
a8 = vld1q_lane_u8(a+2, a8, 8); // ____ ___2 ___1 ___0
a8 = vld1q_lane_u8(a+3, a8, 12); // ___3 ___2 ___1 ___0
auto a32 = (uint32x4_t)a8; //
a32 = vorrq_u32(a32, vshlq_n_u32(a32, 8)); // __33 __22 __11 __00
a32 = vorrq_u32(a32, vshlq_n_u32(a32, 16)); // 3333 2222 1111 0000
return Sk16b((uint8x16_t)a32);
}
inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
uint8x16_t a8 = vdupq_n_u8(0); // ____ ____ ____ ____
a8 = vld1q_lane_u8(a+0, a8, 0); // ____ ____ ____ ___0
a8 = vld1q_lane_u8(a+1, a8, 4); // ____ ____ ___1 ___0
auto a32 = (uint32x4_t)a8; //
a32 = vorrq_u32(a32, vshlq_n_u32(a32, 8)); // ____ ____ __11 __00
a32 = vorrq_u32(a32, vshlq_n_u32(a32, 16)); // ____ ____ 1111 0000
return Sk16b((uint8x16_t)a32);
}
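
(Aside: the two vshlq/vorrq steps above are just a byte broadcast within each
32-bit lane. A scalar sketch of the same trick, assuming the upper three bytes
of the lane start out zero as they do above:)

  #include <cassert>
  #include <cstdint>

  // ___A -> __AA -> AAAA, the same shift-and-or sequence the NEON code uses.
  static uint32_t broadcast_low_byte(uint32_t lane) {
      lane |= lane << 8;
      lane |= lane << 16;
      return lane;
  }

  int main() {
      assert(broadcast_low_byte(0x000000C3u) == 0xC3C3C3C3u);
      return 0;
  }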


@@ -37,3 +37,42 @@ inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
Sk4px::Wide r = (*this + other) >> 8;
return Sk4px(_mm_packus_epi16(r.fLo.fVec, r.fHi.fVec));
}
// Load4Alphas and Load2Alphas use possibly-unaligned loads (SkAlpha[] -> uint16_t or uint32_t).
// These are safe on x86, often with no speed penalty.
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
inline Sk4px Sk4px::alphas() const {
static_assert(SK_A32_SHIFT == 24, "Intel's always little-endian.");
__m128i splat = _mm_set_epi8(15,15,15,15, 11,11,11,11, 7,7,7,7, 3,3,3,3);
return Sk16b(_mm_shuffle_epi8(this->fVec, splat));
}
inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
uint32_t as = *(const uint32_t*)a;
__m128i splat = _mm_set_epi8(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
return Sk16b(_mm_shuffle_epi8(_mm_cvtsi32_si128(as), splat));
}
#else
inline Sk4px Sk4px::alphas() const {
static_assert(SK_A32_SHIFT == 24, "Intel's always little-endian.");
__m128i as = _mm_srli_epi32(this->fVec, 24); // ___3 ___2 ___1 ___0
as = _mm_or_si128(as, _mm_slli_si128(as, 1)); // __33 __22 __11 __00
as = _mm_or_si128(as, _mm_slli_si128(as, 2)); // 3333 2222 1111 0000
return Sk16b(as);
}
inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
__m128i as = _mm_cvtsi32_si128(*(const uint32_t*)a); // ____ ____ ____ 3210
as = _mm_unpacklo_epi8 (as, _mm_setzero_si128()); // ____ ____ _3_2 _1_0
as = _mm_unpacklo_epi16(as, _mm_setzero_si128()); // ___3 ___2 ___1 ___0
as = _mm_or_si128(as, _mm_slli_si128(as, 1)); // __33 __22 __11 __00
as = _mm_or_si128(as, _mm_slli_si128(as, 2)); // 3333 2222 1111 0000
return Sk16b(as);
}
#endif
inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
uint32_t as = *(const uint16_t*)a; // Aa -> Aa00
return Load4Alphas((const SkAlpha*)&as);
}
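
(Aside: the comment above relies on x86 tolerating the unaligned uint16_t/uint32_t
loads. A strictly portable way to express the same loads, if that were ever
needed, is std::memcpy, which compilers typically lower to the same single load.
This is a sketch, not part of the CL.)

  #include <cstdint>
  #include <cstring>

  // Portable equivalent of *(const uint32_t*)a: no alignment or strict-aliasing
  // assumptions, still one 32-bit load after optimization on x86.
  static uint32_t load_u32(const uint8_t* a) {
      uint32_t v;
      std::memcpy(&v, a, sizeof(v));
      return v;
  }

  // Same idea for the two-alpha load used by Load2Alphas.
  static uint16_t load_u16(const uint8_t* a) {
      uint16_t v;
      std::memcpy(&v, a, sizeof(v));
      return v;
  }

  int main() {
      const uint8_t bytes[] = {1, 2, 3, 4, 5};
      return (load_u32(bytes + 1) != 0 && load_u16(bytes + 1) != 0) ? 0 : 1;
  }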


@@ -55,3 +55,25 @@ inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
r.kth< 8>(), r.kth< 9>(), r.kth<10>(), r.kth<11>(),
r.kth<12>(), r.kth<13>(), r.kth<14>(), r.kth<15>());
}
inline Sk4px Sk4px::alphas() const {
static_assert(SK_A32_SHIFT == 24, "This method assumes little-endian.");
return Sk16b(this->kth< 3>(), this->kth< 3>(), this->kth< 3>(), this->kth< 3>(),
this->kth< 7>(), this->kth< 7>(), this->kth< 7>(), this->kth< 7>(),
this->kth<11>(), this->kth<11>(), this->kth<11>(), this->kth<11>(),
this->kth<15>(), this->kth<15>(), this->kth<15>(), this->kth<15>());
}
inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
return Sk16b(a[0], a[0], a[0], a[0],
a[1], a[1], a[1], a[1],
a[2], a[2], a[2], a[2],
a[3], a[3], a[3], a[3]);
}
inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
return Sk16b(a[0], a[0], a[0], a[0],
a[1], a[1], a[1], a[1],
0,0,0,0,
0,0,0,0);
}