Add tail handling for SSE* to SkJumper.

Change-Id: Icb9d385333082de2f99b7a25cfd7251717e3f663
Reviewed-on: https://skia-review.googlesource.com/17580
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Herb Derby <herb@google.com>
Author: Herb Derby, 2017-05-25 16:00:10 -04:00 (committed by Skia Commit-Bot)
parent fabe0b26d0
commit e7ba8b05d2
7 changed files with 6497 additions and 4743 deletions
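
The diff below boils down to one idea: the SSE2/SSE4.1 pipelines work on 4 pixels per step, and until now a row whose width was not a multiple of 4 had to fall back to a narrower pipeline for its last 1-3 pixels. With tail handling, each stage also accepts a tail count and simply touches fewer pixels on the final step. A minimal sketch of that stride-plus-tail driver pattern follows; run_row, run_stride, and kStride are illustrative names, not SkJumper's real symbols.

    #include <cstddef>

    constexpr size_t kStride = 4;   // the SSE2/SSE4.1 stages handle 4 pixels per step

    // Illustrative driver: run full strides, then one short step for the jagged end.
    void run_row(size_t n, void (*run_stride)(size_t x, size_t tail)) {
        size_t x = 0;
        while (n >= kStride) {      // full strides: tail == 0
            run_stride(x, 0);
            x += kStride;
            n -= kStride;
        }
        if (n > 0) {                // 1..kStride-1 leftover pixels
            run_stride(x, n);       // tail != 0: touch only the first n pixels
        }
    }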


@@ -19,7 +19,6 @@ tests_sources = [
"$_tests/BitSetTest.cpp",
"$_tests/BlendTest.cpp",
"$_tests/BlitMaskClip.cpp",
"$_tests/BlitRowTest.cpp",
"$_tests/BlurTest.cpp",
"$_tests/CachedDataTest.cpp",
"$_tests/CachedDecodingPixelRefTest.cpp",


@@ -157,7 +157,7 @@ static SkJumper_Engine choose_engine() {
return {
#define M(stage) ASM(stage, sse41),
{ SK_RASTER_PIPELINE_STAGES(M) },
4, M(start_pipeline) M(just_return)
1, M(start_pipeline) M(just_return)
#undef M
};
}
@@ -165,7 +165,7 @@ static SkJumper_Engine choose_engine() {
return {
#define M(stage) ASM(stage, sse2),
{ SK_RASTER_PIPELINE_STAGES(M) },
4, M(start_pipeline) M(just_return)
1, M(start_pipeline) M(just_return)
#undef M
};
}
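
The 4 → 1 edits above are what turn the new behavior on for the SSE4.1 and SSE2 engines. Read in context, the literal appears to be the smallest run of pixels the engine will accept; here is a sketch under that assumption (the struct and field names below are made up for illustration).

    #include <cstddef>

    // Assumption: the edited literal is the engine's minimum run length.
    struct EngineSketch {
        int min_stride;   // smallest pixel count the engine's stages can handle
    };

    inline bool engine_can_run(const EngineSketch& e, size_t n_pixels) {
        // With 4, a 1-3 pixel run had to be finished by another engine;
        // with 1, the SSE2/SSE4.1 engines cover rows of any width.
        return n_pixels >= static_cast<size_t>(e.min_stride);
    }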

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -76,10 +76,8 @@ struct LazyCtx {
// We're finally going to get to what a Stage function looks like!
// It's best to jump down to the #else case first, then to come back up here for AVX.
#if defined(JUMPER) && defined(__AVX__)
// There's a big cost to switch between SSE and AVX, so we do a little
// extra work to handle even the jagged <kStride tail in AVX mode.
// Compared to normal stages, we maintain an extra tail register:
#if defined(JUMPER) && defined(__SSE2__)
// Process the tail on all x86 processors with SSE2 or better instructions.
// tail == 0 ~~> work on a full kStride pixels
// tail != 0 ~~> work on only the first tail pixels
// tail is always < kStride.
@@ -113,8 +111,7 @@ struct LazyCtx {
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#else
// Other instruction sets (SSE, NEON, portable) can fall back on narrower
// pipelines cheaply, which frees us to always assume tail==0.
// Other instruction sets (NEON, portable) currently always assume tail == 0.
// Stages tail call between each other by following program as described above.
// x is our induction variable, stepping forward kStride at a time.
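
The comment block above is the whole contract a stage has to honor: tail == 0 means work on a full kStride pixels, otherwise only the first tail pixels (always < kStride) may be read or written. A scalar sketch of a store-style stage obeying that contract; this is not one of the real SkJumper stages, which use the vector loads and stores shown later in the diff.

    #include <cstddef>
    #include <cstdint>

    constexpr size_t kStride = 4;

    // Write up to kStride RGBA8888 pixels starting at dst + x.
    void store_8888_sketch(uint32_t* dst, size_t x, size_t tail,
                           const uint32_t (&pixels)[kStride]) {
        size_t n = tail ? tail : kStride;   // tail == 0 means "a full stride"
        for (size_t i = 0; i < n; i++) {    // never touch dst[x + tail ...) when tail != 0
            dst[x + i] = pixels[i];
        }
    }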


@@ -488,13 +488,27 @@
}
SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
// Load slightly weirdly to make sure we don't load past the end of 4x48 bits.
auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ,
_23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4);
__m128i _0, _1, _2, _3;
if (__builtin_expect(tail,0)) {
_1 = _2 = _3 = _mm_setzero_si128();
auto load_rgb = [](const uint16_t* src) {
auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
return _mm_insert_epi16(v, src[2], 2);
};
if ( true ) { _0 = load_rgb(ptr + 0); }
if (tail > 1) { _1 = load_rgb(ptr + 3); }
if (tail > 2) { _2 = load_rgb(ptr + 6); }
} else {
// Load slightly weirdly to make sure we don't load past the end of 4x48 bits.
auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ,
_23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4);
// Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored).
auto _0 = _01, _1 = _mm_srli_si128(_01, 6),
_2 = _23, _3 = _mm_srli_si128(_23, 6);
// Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored).
_0 = _01;
_1 = _mm_srli_si128(_01, 6);
_2 = _23;
_3 = _mm_srli_si128(_23, 6);
}
// De-interlace to R,G,B.
auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx
@@ -508,9 +522,19 @@
*g = unaligned_load<U16>(&G);
*b = unaligned_load<U16>(&B);
}
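
The interesting part of the tail path above is the load_rgb lambda: one 3x16-bit pixel is only 6 bytes, so a full 16-byte _mm_loadu_si128 on the last pixel of a row could read past the end of the buffer. Reading a 32-bit word plus one 16-bit insert stays exactly inside the pixel. A standalone sketch of the same trick (hypothetical helper; memcpy replaces the diff's pointer cast to sidestep aliasing questions):

    #include <emmintrin.h>   // SSE2
    #include <cstdint>
    #include <cstring>

    // Load one 3x16-bit pixel (exactly 6 bytes) into the low lanes of an XMM
    // register; lanes 3..7 stay zero.
    static inline __m128i load_rgb48(const uint16_t* src) {
        uint32_t rg;
        std::memcpy(&rg, src, sizeof(rg));         // bytes 0..3: R and G
        __m128i v = _mm_cvtsi32_si128((int)rg);
        return _mm_insert_epi16(v, src[2], 2);     // bytes 4..5: B into lane 2
    }
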
SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
__m128i _01, _23;
if (__builtin_expect(tail,0)) {
_01 = _23 = _mm_setzero_si128();
auto src = (const double*)ptr;
if ( true ) { _01 = _mm_loadl_pd(_01, src + 0); } // r0 g0 b0 a0 00 00 00 00
if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1
if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00
} else {
_01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3
}
auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
_13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3
@@ -523,30 +547,54 @@
*b = unaligned_load<U16>((uint16_t*)&ba + 0);
*a = unaligned_load<U16>((uint16_t*)&ba + 4);
}
SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));
_mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
_mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
if (__builtin_expect(tail, 0)) {
auto dst = (double*)ptr;
if ( true ) { _mm_storel_pd(dst + 0, _mm_unpacklo_epi32(rg, ba)); }
if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); }
if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); }
} else {
_mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
_mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
}
}
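
Both 64-bit tail paths above lean on the fact that one RGBA16 pixel is exactly 64 bits, i.e. one double lane, so _mm_loadl_pd/_mm_loadh_pd and _mm_storel_pd/_mm_storeh_pd can move a single pixel through half of an XMM register without touching its neighbor. A standalone sketch with hypothetical helpers; the diff assigns __m128d results straight to __m128i variables, which the compilers Skia targets here accept as an implicit vector conversion, while the sketch uses explicit _mm_cast* intrinsics instead.

    #include <emmintrin.h>   // SSE2
    #include <cstddef>
    #include <cstdint>

    // Load 1 or 2 RGBA16 pixels (64 bits each) without reading past them.
    // Handles tail in {1,2}; the real code pairs two registers to reach tail == 3.
    static inline __m128i load_tail_rgba16(const uint16_t* ptr, size_t tail) {
        const double* src = (const double*)ptr;        // 1 pixel == 64 bits == 1 double lane
        __m128d v = _mm_setzero_pd();
        v = _mm_loadl_pd(v, src + 0);                  // pixel 0 -> low half
        if (tail > 1) { v = _mm_loadh_pd(v, src + 1); }  // pixel 1 -> high half
        return _mm_castpd_si128(v);
    }

    // Store the low 1 or 2 pixels of px01 without writing past them.
    static inline void store_tail_rgba16(uint16_t* ptr, size_t tail, __m128i px01) {
        double* dst = (double*)ptr;
        __m128d v = _mm_castsi128_pd(px01);
        _mm_storel_pd(dst + 0, v);                     // pixel 0 always
        if (tail > 1) { _mm_storeh_pd(dst + 1, v); }   // pixel 1 only if present
    }
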
SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
auto _0 = _mm_loadu_ps(ptr+ 0),
_1 = _mm_loadu_ps(ptr+ 4),
_2 = _mm_loadu_ps(ptr+ 8),
_3 = _mm_loadu_ps(ptr+12);
F _0, _1, _2, _3;
if (__builtin_expect(tail, 0)) {
_1 = _2 = _3 = _mm_setzero_si128();
if ( true ) { _0 = _mm_loadu_ps(ptr + 0); }
if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); }
if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); }
} else {
_0 = _mm_loadu_ps(ptr + 0);
_1 = _mm_loadu_ps(ptr + 4);
_2 = _mm_loadu_ps(ptr + 8);
_3 = _mm_loadu_ps(ptr +12);
}
_MM_TRANSPOSE4_PS(_0,_1,_2,_3);
*r = _0;
*g = _1;
*b = _2;
*a = _3;
}
SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
_MM_TRANSPOSE4_PS(r,g,b,a);
_mm_storeu_ps(ptr+ 0, r);
_mm_storeu_ps(ptr+ 4, g);
_mm_storeu_ps(ptr+ 8, b);
_mm_storeu_ps(ptr+12, a);
if (__builtin_expect(tail, 0)) {
if ( true ) { _mm_storeu_ps(ptr + 0, r); }
if (tail > 1) { _mm_storeu_ps(ptr + 4, g); }
if (tail > 2) { _mm_storeu_ps(ptr + 8, b); }
} else {
_mm_storeu_ps(ptr + 0, r);
_mm_storeu_ps(ptr + 4, g);
_mm_storeu_ps(ptr + 8, b);
_mm_storeu_ps(ptr +12, a);
}
}
#endif
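
For the float load4/store4 pair above the contract is easy to state: with tail == 2, load4 reads only the first 8 floats and zero-fills pixels 2 and 3, and store4 writes only the first 8 floats back. A tiny self-contained check of that behavior (hypothetical test using the same intrinsics, not part of the commit):

    #include <xmmintrin.h>   // SSE
    #include <cassert>

    void check_float_tail_contract() {
        float src[16] = { 1,2,3,4,  5,6,7,8,  -1,-1,-1,-1,  -1,-1,-1,-1 };
        float dst[16] = { 9,9,9,9,  9,9,9,9,   9,9,9,9,      9,9,9,9 };

        // Stand-in for load4(src, /*tail=*/2, ...): read pixels 0 and 1 only,
        // zero-fill pixels 2 and 3.
        __m128 _0 = _mm_loadu_ps(src + 0);
        __m128 _1 = _mm_loadu_ps(src + 4);
        __m128 _2 = _mm_setzero_ps();
        __m128 _3 = _mm_setzero_ps();
        _MM_TRANSPOSE4_PS(_0, _1, _2, _3);   // planar: _0=r, _1=g, _2=b, _3=a

        // Stand-in for store4(dst, /*tail=*/2, ...): write pixels 0 and 1 only.
        _MM_TRANSPOSE4_PS(_0, _1, _2, _3);   // back to pixel-major order
        _mm_storeu_ps(dst + 0, _0);
        _mm_storeu_ps(dst + 4, _1);

        assert(dst[0] == 1 && dst[7] == 8);    // the two tail pixels round-tripped
        assert(dst[8] == 9 && dst[15] == 9);   // nothing past the tail was touched
    }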


@@ -1,290 +0,0 @@
/*
* Copyright 2011 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "SkBitmap.h"
#include "SkCanvas.h"
#include "SkColorPriv.h"
#include "SkGradientShader.h"
#include "SkRect.h"
#include "SkVertices.h"
#include "Test.h"
#include "sk_tool_utils.h"
// these are in the same order as the SkColorType enum
static const char* gColorTypeName[] = {
"None", "A8", "565", "4444", "RGBA", "BGRA", "Index8"
};
/** Returns -1 on success, else the x coord of the first bad pixel, return its
value in bad
*/
typedef int (*Proc)(const void*, int width, uint32_t expected, uint32_t* bad);
static int proc_32(const void* ptr, int w, uint32_t expected, uint32_t* bad) {
const SkPMColor* addr = static_cast<const SkPMColor*>(ptr);
for (int x = 0; x < w; x++) {
if (addr[x] != expected) {
*bad = addr[x];
return x;
}
}
return -1;
}
static int proc_16(const void* ptr, int w, uint32_t expected, uint32_t* bad) {
const uint16_t* addr = static_cast<const uint16_t*>(ptr);
for (int x = 0; x < w; x++) {
if (addr[x] != expected) {
*bad = addr[x];
return x;
}
}
return -1;
}
static int proc_8(const void* ptr, int w, uint32_t expected, uint32_t* bad) {
const SkPMColor* addr = static_cast<const SkPMColor*>(ptr);
for (int x = 0; x < w; x++) {
if (SkGetPackedA32(addr[x]) != expected) {
*bad = SkGetPackedA32(addr[x]);
return x;
}
}
return -1;
}
static int proc_bad(const void*, int, uint32_t, uint32_t* bad) {
*bad = 0;
return 0;
}
static Proc find_proc(const SkBitmap& bm, SkPMColor expect32, uint16_t expect16,
uint8_t expect8, uint32_t* expect) {
switch (bm.colorType()) {
case kN32_SkColorType:
*expect = expect32;
return proc_32;
case kARGB_4444_SkColorType:
case kRGB_565_SkColorType:
*expect = expect16;
return proc_16;
case kAlpha_8_SkColorType:
*expect = expect8;
return proc_8;
default:
*expect = 0;
return proc_bad;
}
}
static bool check_color(const SkBitmap& bm, SkPMColor expect32,
uint16_t expect16, uint8_t expect8,
skiatest::Reporter* reporter) {
uint32_t expect;
Proc proc = find_proc(bm, expect32, expect16, expect8, &expect);
for (int y = 0; y < bm.height(); y++) {
uint32_t bad;
int x = proc(bm.getAddr(0, y), bm.width(), expect, &bad);
if (x >= 0) {
ERRORF(reporter, "BlitRow colortype=%s [%d %d] expected %x got %x",
gColorTypeName[bm.colorType()], x, y, expect, bad);
return false;
}
}
return true;
}
// Make sure our blits always map src==0 to a noop, and src==FF to full opaque
static void test_00_FF(skiatest::Reporter* reporter) {
static const int W = 256;
static const SkColorType gDstColorType[] = {
kN32_SkColorType,
kRGB_565_SkColorType,
};
static const struct {
SkColor fSrc;
SkColor fDst;
SkPMColor fResult32;
uint16_t fResult16;
uint8_t fResult8;
} gSrcRec[] = {
{ 0, 0, 0, 0, 0 },
{ 0, 0xFFFFFFFF, SkPackARGB32(0xFF, 0xFF, 0xFF, 0xFF), 0xFFFF, 0xFF },
{ 0xFFFFFFFF, 0, SkPackARGB32(0xFF, 0xFF, 0xFF, 0xFF), 0xFFFF, 0xFF },
{ 0xFFFFFFFF, 0xFFFFFFFF, SkPackARGB32(0xFF, 0xFF, 0xFF, 0xFF), 0xFFFF, 0xFF },
};
SkPaint paint;
SkBitmap srcBM;
srcBM.allocN32Pixels(W, 1);
for (size_t i = 0; i < SK_ARRAY_COUNT(gDstColorType); i++) {
SkImageInfo info = SkImageInfo::Make(W, 1, gDstColorType[i],
kPremul_SkAlphaType);
SkBitmap dstBM;
dstBM.allocPixels(info);
SkCanvas canvas(dstBM);
for (size_t j = 0; j < SK_ARRAY_COUNT(gSrcRec); j++) {
srcBM.eraseColor(gSrcRec[j].fSrc);
dstBM.eraseColor(gSrcRec[j].fDst);
for (int k = 0; k < 4; k++) {
bool dither = (k & 1) != 0;
bool blend = (k & 2) != 0;
if (gSrcRec[j].fSrc != 0 && blend) {
// can't make a numerical promise about blending anything
// but 0
// continue;
}
paint.setDither(dither);
paint.setAlpha(blend ? 0x80 : 0xFF);
canvas.drawBitmap(srcBM, 0, 0, &paint);
if (!check_color(dstBM, gSrcRec[j].fResult32, gSrcRec[j].fResult16,
gSrcRec[j].fResult8, reporter)) {
SkDebugf("--- src index %d dither %d blend %d\n", j, dither, blend);
}
}
}
}
}
///////////////////////////////////////////////////////////////////////////////
struct Mesh {
SkPoint fPts[4];
Mesh(const SkBitmap& bm, SkPaint* paint) {
const SkScalar w = SkIntToScalar(bm.width());
const SkScalar h = SkIntToScalar(bm.height());
fPts[0].set(0, 0);
fPts[1].set(w, 0);
fPts[2].set(w, h);
fPts[3].set(0, h);
paint->setShader(SkShader::MakeBitmapShader(bm, SkShader::kClamp_TileMode,
SkShader::kClamp_TileMode));
}
void draw(SkCanvas* canvas, SkPaint* paint) {
canvas->drawVertices(SkVertices::MakeCopy(SkVertices::kTriangleFan_VertexMode, 4, fPts,
fPts, nullptr),
SkBlendMode::kModulate, *paint);
}
};
#include "SkImageEncoder.h"
static void save_bm(const SkBitmap& bm, const char name[]) {
sk_tool_utils::EncodeImageToFile(name, bm, SkEncodedImageFormat::kPNG, 100);
}
static int max_diff(uint32_t u, uint32_t v) {
int d0 = SkAbs32(int((u >> 24) & 0xFF) - int((v >> 24) & 0xFF));
int d1 = SkAbs32(int((u >> 16) & 0xFF) - int((v >> 16) & 0xFF));
int d2 = SkAbs32(int((u >> 8) & 0xFF) - int((v >> 8) & 0xFF));
int d3 = SkAbs32(int((u >> 0) & 0xFF) - int((v >> 0) & 0xFF));
return SkMax32(d0, SkMax32(d1, SkMax32(d2, d3)));
}
static bool nearly_eq(const SkBitmap& a, const SkBitmap& b) {
switch (a.colorType()) {
case kN32_SkColorType: {
for (int y = 0; y < a.width(); ++y) {
const SkPMColor* ap = a.getAddr32(0, y);
const SkPMColor* bp = b.getAddr32(0, y);
for (int x = 0; x < a.width(); ++x) {
int diff = max_diff(ap[x], bp[x]);
if (diff > 1) {
return false;
}
}
}
return true;
} break;
default:
break;
}
return !memcmp(a.getPixels(), b.getPixels(), a.getSize());
}
static bool gOnce;
// Make sure our blits are invariant with the width of the blit (i.e. that
// special case for 8 at a time have the same results as narrower blits)
static void test_diagonal(skiatest::Reporter* reporter) {
static const int W = 64;
static const int H = W;
static const SkColorType gDstColorType[] = {
kN32_SkColorType,
kRGB_565_SkColorType,
};
static const SkColor gDstBG[] = { 0, 0xFFFFFFFF };
const SkRect srcR = SkRect::MakeIWH(W, H);
SkBitmap srcBM;
srcBM.allocN32Pixels(W, H);
SkImageInfo info = SkImageInfo::Make(W, H, kUnknown_SkColorType, kPremul_SkAlphaType);
for (size_t i = 0; i < SK_ARRAY_COUNT(gDstColorType); i++) {
info = info.makeColorType(gDstColorType[i]);
SkBitmap dstBM0, dstBM1;
dstBM0.allocPixels(info);
dstBM1.allocPixels(info);
SkCanvas canvas0(dstBM0);
SkCanvas canvas1(dstBM1);
SkColor bgColor;
for (size_t j = 0; j < SK_ARRAY_COUNT(gDstBG); j++) {
bgColor = gDstBG[j];
for (int c = 0; c <= 0xFF; c++) {
// cons up a mesh to draw the bitmap with
SkPaint paint;
srcBM.eraseARGB(0xFF, c, c, c);
Mesh mesh(srcBM, &paint);
for (int k = 0; k < 4; k++) {
bool dither = (k & 1) != 0;
uint8_t alpha = (k & 2) ? 0x80 : 0xFF;
paint.setDither(dither);
paint.setAlpha(alpha);
dstBM0.eraseColor(bgColor);
dstBM1.eraseColor(bgColor);
canvas0.drawRect(srcR, paint);
mesh.draw(&canvas1, &paint);
if (!gOnce && false) {
save_bm(dstBM0, "drawBitmap.png");
save_bm(dstBM1, "drawMesh.png");
gOnce = true;
}
if (!nearly_eq(dstBM0, dstBM1)) {
ERRORF(reporter, "Diagonal colortype=%s bg=0x%x dither=%d"
" alpha=0x%x src=0x%x",
gColorTypeName[gDstColorType[i]], bgColor, dither,
alpha, c);
}
}
}
}
}
}
DEF_TEST(BlitRow, reporter) {
test_00_FF(reporter);
test_diagonal(reporter);
}