refactor SkBitmapProcState_opts.h a bit

Shouldn't be any major change in here, and the only codegen change should be the easy-to-predict branch on alpha < 256. This is mostly about makings sure I understand and can read the code. Cq-Include-Trybots: master.tryserver.blink:linux_trusty_blink_rel Change-Id: I3e6260be76595275ba177551cbb8f4a84e4970ec Reviewed-on: https://skia-review.googlesource.com/c/171585 Auto-Submit: Mike Klein <mtklein@google.com> Commit-Queue: Herb Derby <herb@google.com> Reviewed-by: Herb Derby <herb@google.com>
2018-11-16 16:44:10 -05:00 · 2018-11-16 16:44:10 -05:00 · 2c8e2bc682
commit 2c8e2bc682
parent 0dd430242b
1 changed files with 71 additions and 134 deletions
--- a/src/opts/SkBitmapProcState_opts.h
+++ b/src/opts/SkBitmapProcState_opts.h
@ -27,7 +27,6 @@
 namespace SK_OPTS_NS {
 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
 // This same basic packing scheme is used throughout the file.
 static void decode_packed_coordinates_and_weight(uint32_t packed, int* v0, int* v1, int* w) {
    // The top 14 bits are the integer coordinate x0 or y0.
@ -40,6 +39,8 @@ namespace SK_OPTS_NS {
    *w = (packed >> 14) & 0xf;
 }
 #if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    // As above, 4x.
    static void decode_packed_coordinates_and_weight(__m128i packed,
                                                     int v0[4], int v1[4], __m128i* w) {
@ -180,7 +181,7 @@ namespace SK_OPTS_NS {
    }
-#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+#elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    // TODO(mtklein): clean up this code, use decode_packed_coordinates_and_weight(), etc.
@ -192,112 +193,63 @@ namespace SK_OPTS_NS {
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);
-        const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
+        int y0, y1, wy;
-        size_t rb = s.fPixmap.rowBytes();
+        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
        uint32_t XY = *xy++;
        unsigned y0 = XY >> 14;
        const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
        const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
        unsigned subY = y0 & 0xF;
-        // ( 0,  0,  0,  0,  0,  0,  0, 16)
+        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
-        __m128i sixteen = _mm_cvtsi32_si128(16);
+             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
-        // ( 0,  0,  0,  0, 16, 16, 16, 16)
+        // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
-        sixteen = _mm_shufflelo_epi16(sixteen, 0);
+        // and another in the upper 4 16-bit lanes to line up with 16 - wy.
        const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),
                                                _mm_set1_epi16(16-wy));
-        // ( 0,  0,  0,  0,  0,  0,  0,  y)
+        while (count --> 0) {
-        __m128i allY = _mm_cvtsi32_si128(subY);
+            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
-        // ( 0,  0,  0,  0,  y,  y,  y,  y)
+            // Load the 4 pixels we're interpolating.
-        allY = _mm_shufflelo_epi16(allY, 0);
+            const __m128i a00 = _mm_cvtsi32_si128(row0[x0]),
                          a01 = _mm_cvtsi32_si128(row0[x1]),
                          a10 = _mm_cvtsi32_si128(row1[x0]),
                          a11 = _mm_cvtsi32_si128(row1[x1]);
-        // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
+            // Line up low-x pixels a00 and a10 with allY.
-        __m128i negY = _mm_sub_epi16(sixteen, allY);
+            __m128i a00a10 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(a10, a00),
                                               _mm_setzero_si128());
-        // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
+            // Scale by allY and 16-wx.
        allY = _mm_unpacklo_epi64(allY, negY);
        // (16, 16, 16, 16, 16, 16, 16, 16 )
        sixteen = _mm_shuffle_epi32(sixteen, 0);
        // ( 0,  0,  0,  0,  0,  0,  0,  0)
        __m128i zero = _mm_setzero_si128();
        // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
        __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
        do {
            uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
            unsigned x0 = XX >> 18;
            unsigned x1 = XX & 0x3FFF;
            // (0, 0, 0, 0, 0, 0, 0, x)
            __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
            // (0, 0, 0, 0, x, x, x, x)
            allX = _mm_shufflelo_epi16(allX, 0);
            // (x, x, x, x, x, x, x, x)
            allX = _mm_shuffle_epi32(allX, 0);
            // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
            __m128i negX = _mm_sub_epi16(sixteen, allX);
            // Load 4 samples (pixels).
            __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
            __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
            __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
            __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
            // (0, 0, a00, a10)
            __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
            // Expand to 16 bits per component.
            a00a10 = _mm_unpacklo_epi8(a00a10, zero);
            // ((a00 * (16-y)), (a10 * y)).
            a00a10 = _mm_mullo_epi16(a00a10, allY);
            a00a10 = _mm_mullo_epi16(a00a10, _mm_set1_epi16(16-wx));
            // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
            a00a10 = _mm_mullo_epi16(a00a10, negX);
-            // (0, 0, a01, a10)
+            // Line up high-x pixels a01 and a11 with allY.
-            __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
+            __m128i a01a11 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(a11, a01),
                                               _mm_setzero_si128());
-            // Expand to 16 bits per component.
+            // Scale by allY and wx.
            a01a11 = _mm_unpacklo_epi8(a01a11, zero);
            // (a01 * (16-y)), (a11 * y)
            a01a11 = _mm_mullo_epi16(a01a11, allY);
            a01a11 = _mm_mullo_epi16(a01a11, _mm_set1_epi16(wx));
            // (a01 * (16-y) * x), (a11 * y * x)
            a01a11 = _mm_mullo_epi16(a01a11, allX);
-            // (a00*w00 + a01*w01, a10*w10 + a11*w11)
+            // Add the two intermediates, summing across in one direction.
-            __m128i sum = _mm_add_epi16(a00a10, a01a11);
+            __m128i halves = _mm_add_epi16(a00a10, a01a11);
-            // (DC, a00*w00 + a01*w01)
+            // Add the two halves to each other to sum in the other direction.
-            __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
+            __m128i sum = _mm_add_epi16(halves, _mm_srli_si128(halves, 8));
-            // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
+            // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
            sum = _mm_add_epi16(sum, shifted);
            // Divide each 16 bit component by 256.
            sum = _mm_srli_epi16(sum, 8);
-            // Multiply by alpha.
+            if (s.fAlphaScale < 256) {
-            sum = _mm_mullo_epi16(sum, alpha);
+                // Scale by alpha, which is in [0,256].
-
+                sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
            // Divide each 16 bit component by 256.
                sum = _mm_srli_epi16(sum, 8);
            }
-            // Pack lower 4 16 bit values of sum into lower 4 bytes.
+            // Pack back into 8-bit values and store.
-            sum = _mm_packus_epi16(sum, zero);
+            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
-
+        }
            // Extract low int and store.
            *colors++ = _mm_cvtsi128_si32(sum);
        } while (--count > 0);
    }
 #else
@ -337,9 +289,11 @@ namespace SK_OPTS_NS {
            tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
            tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)
            if (scale < 256) {
                vscale = vdup_n_u16(scale);        // duplicate scale
                tmp = vshr_n_u16(tmp, 8);          // shift down result by 8
                tmp = vmul_u16(tmp, vscale);       // multiply result by scale
            }
            vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8
            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
@ -372,17 +326,16 @@ namespace SK_OPTS_NS {
            lo += (a11 & mask) * xy;
            hi += ((a11 >> 8) & mask) * xy;
-            // TODO: if (alphaScale < 256) ...
+            if (alphaScale < 256) {
                lo = ((lo >> 8) & mask) * alphaScale;
                hi = ((hi >> 8) & mask) * alphaScale;
            }
            *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
        }
    #endif
    // TODO(mtklein): clean up this code, use decode_packed_coordinates_and_weight(), etc.
    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, SkPMColor* colors) {
@ -391,38 +344,22 @@ namespace SK_OPTS_NS {
        SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
        SkASSERT(s.fAlphaScale <= 256);
-        unsigned alphaScale = s.fAlphaScale;
+        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
-        const char* srcAddr = (const char*)s.fPixmap.addr();
+        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
-        size_t rb = s.fPixmap.rowBytes();
+             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
        unsigned subY;
        const SkPMColor* row0;
        const SkPMColor* row1;
-        // setup row ptrs and update proc_table
+        while (count --> 0) {
-        {
+            int x0, x1, wx;
-            uint32_t XY = *xy++;
+            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
            unsigned y0 = XY >> 14;
            row0 = (const SkPMColor*)(srcAddr + (y0 >> 4) * rb);
            row1 = (const SkPMColor*)(srcAddr + (XY & 0x3FFF) * rb);
            subY = y0 & 0xF;
        }
-        do {
+            filter_and_scale_by_alpha(wx, wy,
            uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
            unsigned x0 = XX >> 14;
            unsigned x1 = XX & 0x3FFF;
            unsigned subX = x0 & 0xF;
            x0 >>= 4;
            filter_and_scale_by_alpha(subX, subY,
                                      row0[x0], row0[x1],
                                      row1[x0], row1[x1],
-                                      colors,
+                                      colors++,
-                                      alphaScale);
+                                      s.fAlphaScale);
-            colors += 1;
+        }
        } while (--count != 0);
    }
 #endif