hsw::S32_alpha_D32_filter_DX

This is a first draft that writes the bilinear interpolation (bilerp) naively, using mostly skvx and a few AVX2 intrinsics. It looks correct, and the speed looks fine: better than SSSE3, but nothing mind-blowing. Change-Id: I260467e577ea9b30a6aba8b5d7b3a3a3b6383ff3 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/243814 Reviewed-by: Mike Reed <reed@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2019-09-25 11:15:26 -05:00 · 2019-09-25 11:15:26 -05:00 · be47871249
commit be47871249
parent f9eb073b6a
2 changed files with 90 additions and 10 deletions
--- a/src/opts/SkBitmapProcState_opts.h
+++ b/src/opts/SkBitmapProcState_opts.h
@@ -8,6 +8,7 @@
 #ifndef SkBitmapProcState_opts_DEFINED
 #define SkBitmapProcState_opts_DEFINED

+#include "include/private/SkVx.h"
 #include "src/core/SkBitmapProcState.h"

 // SkBitmapProcState optimized Shader, Sample, or Matrix procs.
@@ -28,18 +29,94 @@
 namespace SK_OPTS_NS {

 // This same basic packing scheme is used throughout the file.
-static void decode_packed_coordinates_and_weight(uint32_t packed, int* v0, int* v1, int* w) {
-    // The top 14 bits are the integer coordinate x0 or y0.
-    *v0 = packed >> 18;
-
-    // The bottom 14 bits are the integer coordinate x1 or y1.
-    *v1 = packed & 0x3fff;
-
-    // The middle 4 bits are the interpolating factor between the two, i.e. the weight for v1.
-    *w = (packed >> 14) & 0xf;
+template <typename U32, typename Out>  // Used below with both scalar and skvx::Vec arguments.
+static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
+    *v0 = (packed >> 18);       // Top 14 bits: integer coordinate x0 or y0.
+    *v1 = (packed & 0x3fff);    // Bottom 14 bits: integer coordinate x1 or y1.
+    *w  = (packed >> 14) & 0xf; // Middle 4 bits: lerp weight for v1; weight for v0 is 16-w.
 }

-#if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+#if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
+    /*not static*/ inline  // AVX2 build: bilinear-filter 32-bit pixels, 8 outputs at a time.
+    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
+                                 const uint32_t* xy, int count, uint32_t* colors) {  // xy[0] packs y; then count packed x's.
+        SkASSERT(count > 0 && colors != nullptr);
+        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
+        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
+        SkASSERT(s.fAlphaScale <= 256);
+
+        // In a _DX variant only X varies; all samples share y0/y1 coordinates and wy weight.
+        int y0, y1, wy;
+        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
+
+        auto row0 = (const int*)s.fPixmap.addr32(0,y0),  // Gather base for row y0 (int* to suit the intrinsic).
+             row1 = (const int*)s.fPixmap.addr32(0,y1);  // Gather base for row y1.
+
+        auto bilerp = [&](skvx::Vec<8,uint32_t> packed_x_coordinates) -> skvx::Vec<8,uint32_t> {
+            // Decode up to 8 output pixels' x-coordinates (x0,x1) and weights (wx), one per 32-bit lane.
+            skvx::Vec<8,uint32_t> x0,x1,wx;
+            decode_packed_coordinates_and_weight(packed_x_coordinates, &x0, &x1, &wx);
+
+            // Splat the 4-bit wx into all four bytes of its lane, one copy per 8-bit color channel.
+            wx = (wx <<  0)
+               | (wx <<  8)
+               | (wx << 16)
+               | (wx << 24);
+
+            // Gather the 32 32-bit pixels (4 corners x 8 lanes) that we'll bilerp into our 8 output pixels.
+            // We need to drop into explicit AVX2 intrinsics for a moment to gather; scale 4 is bytes per pixel.
+            __m256i tl = _mm256_i32gather_epi32(row0, skvx::bit_pun<__m256i>(x0), 4),  // top-left:     row y0, column x0
+                    tr = _mm256_i32gather_epi32(row0, skvx::bit_pun<__m256i>(x1), 4),  // top-right:    row y0, column x1
+                    bl = _mm256_i32gather_epi32(row1, skvx::bit_pun<__m256i>(x0), 4),  // bottom-left:  row y1, column x0
+                    br = _mm256_i32gather_epi32(row1, skvx::bit_pun<__m256i>(x1), 4);  // bottom-right: row y1, column x1
+
+            // Treat 32-bit pixels as 4 8-bit values, and expand to 16-bit for room to multiply.
+            auto to_16x4 = [](auto v) -> skvx::Vec<32, uint16_t> {
+                return skvx::cast<uint16_t>(skvx::bit_pun<skvx::Vec<32, uint8_t>>(v));
+            };
+
+            // Sum up weighted sample pixels.  The naive, redundant math would be,
+            //
+            //   sum = tl * (16-wy) * (16-wx)
+            //       + bl * (   wy) * (16-wx)
+            //       + tr * (16-wy) * (   wx)
+            //       + br * (   wy) * (   wx)
+            //
+            // But we refactor to eliminate a bunch of those common factors:
+            // lerp vertically by wy within each column, then horizontally by wx between the two results.
+            auto lerp = [](auto lo, auto hi, auto w) {
+                return 16*lo + (hi-lo)*w;  // == (16-w)*lo + w*hi; any uint16_t wraparound in (hi-lo) cancels out.
+            };
+            skvx::Vec<32, uint16_t> sum = lerp(lerp(to_16x4(tl), to_16x4(bl), wy),
+                                               lerp(to_16x4(tr), to_16x4(br), wy), to_16x4(wx));
+
+            // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
+            sum >>= 8;
+
+            // Scale by [0,256] alpha (s.fAlphaScale, asserted <= 256 above), then divide by 256 again.
+            sum *= s.fAlphaScale;
+            sum >>= 8;
+
+            // Pack back to 8-bit channels, undoing to_16x4().
+            return skvx::bit_pun<skvx::Vec<8,uint32_t>>(skvx::cast<uint8_t>(sum));
+        };
+
+        while (count >= 8) {  // Full batches of 8 output pixels.
+            bilerp(skvx::Vec<8,uint32_t>::Load(xy)).store(colors);
+            xy     += 8;
+            colors += 8;
+            count  -= 8;
+        }
+        if (count > 0) {  // Tail of 1-7 pixels: lane i is active iff i < count.
+            __m256i active = skvx::bit_pun<__m256i>( count > skvx::Vec<8,int>{0,1,2,3, 4,5,6,7} ),
+                    coords = _mm256_maskload_epi32((const int*)xy, active),  // Inactive lanes load as 0.
+                    pixels;
+
+            bilerp(skvx::bit_pun<skvx::Vec<8,uint32_t>>(coords)).store(&pixels);
+            _mm256_maskstore_epi32((int*)colors, active, pixels);  // Writes only the active lanes.
+        }
+    }
+
+#elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
--- a/src/opts/SkOpts_hsw.cpp
+++ b/src/opts/SkOpts_hsw.cpp
@@ -9,6 +9,7 @@

 #define SK_OPTS_NS hsw
 #include "src/core/SkCubicSolver.h"
+#include "src/opts/SkBitmapProcState_opts.h"
 #include "src/opts/SkBlitRow_opts.h"
 #include "src/opts/SkRasterPipeline_opts.h"
 #include "src/opts/SkUtils_opts.h"
@@ -18,6 +19,8 @@ namespace SkOpts {
        blit_row_color32     = hsw::blit_row_color32;
        blit_row_s32a_opaque = hsw::blit_row_s32a_opaque;

+        S32_alpha_D32_filter_DX  = hsw::S32_alpha_D32_filter_DX;
+
        cubic_solver = SK_OPTS_NS::cubic_solver;

    #define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;