add _hsw lowp backend

CQ_INCLUDE_TRYBOTS=skia.primary:Build-Ubuntu-Clang-x86_64-Debug-MSAN

Change-Id: Id53279c17589b3434629bb644358ee238af8649f
Reviewed-on: https://skia-review.googlesource.com/20269
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
Reviewed-by: Mike Reed <reed@google.com>
Author: Mike Klein <mtklein@chromium.org>
Date:   2017-06-19 14:37:10 -07:00
Committed-by: Skia Commit-Bot
Parent: 7f7b902d51
Commit: 8c3d5156c7

5 changed files with 3643 additions and 117 deletions

src/jumper/SkJumper.cpp

@@ -31,8 +31,7 @@ static const int kNumStages = SK_RASTER_PIPELINE_STAGES(M);
 #undef M
 
 #ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
-    #if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
-    #if 0
+    #if 0 && !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
         #include <atomic>
 
         #define M(st) #st,
@@ -57,7 +56,6 @@ static const int kNumStages = SK_RASTER_PIPELINE_STAGES(M);
         static void log_missing(SkRasterPipeline::StockStage) {}
     #endif
-    #endif
 #endif
 
 // We can't express the real types of most stage functions portably, so we use a stand-in.
 // We'll only ever call start_pipeline(), which then chains into the rest for us.
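The stand-in is just one uniform function-pointer type, so stages from every backend can share a table. A minimal sketch of the idea (signatures assumed from this file's usage, not verbatim Skia code):

    // Any single signature will do for storage; stages are only ever entered by
    // chaining from start_pipeline(), never called directly through this type.
    using StageFn = void(void);
    // start_pipeline() is the one signature actually called (cf. the lowp file below):
    using StartPipelineFn = size_t(size_t x, size_t y, size_t limit,
                                   void** program, const void* constants);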
@@ -128,12 +126,14 @@ extern "C" {
                     ASM(start_pipeline,avx  ),
                     ASM(start_pipeline,sse41),
                     ASM(start_pipeline,sse2 ),
+                    ASM(start_pipeline,hsw_lowp  ),
                     ASM(start_pipeline,ssse3_lowp);
 
     StageFn ASM(just_return,hsw),
             ASM(just_return,avx),
             ASM(just_return,sse41),
             ASM(just_return,sse2),
+            ASM(just_return,hsw_lowp  ),
             ASM(just_return,ssse3_lowp);
 
     #define M(st) StageFn ASM(st,hsw);
@@ -149,6 +149,9 @@ extern "C" {
         SK_RASTER_PIPELINE_STAGES(M)
     #undef M
+    #define M(st) StageFn ASM(st,hsw_lowp);
+        LOWP_STAGES(M)
+    #undef M
     #define M(st) StageFn ASM(st,ssse3_lowp);
         LOWP_STAGES(M)
     #undef M
@@ -162,6 +165,24 @@ extern "C" {
     #undef M
 }
 
+#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
+    template <SkRasterPipeline::StockStage st>
+    static constexpr StageFn* hsw_lowp() { return nullptr; }
+
+    template <SkRasterPipeline::StockStage st>
+    static constexpr StageFn* ssse3_lowp() { return nullptr; }
+
+    #define M(st)                                                              \
+        template <> constexpr StageFn* hsw_lowp<SkRasterPipeline::st>() {      \
+            return ASM(st,hsw_lowp);                                           \
+        }                                                                      \
+        template <> constexpr StageFn* ssse3_lowp<SkRasterPipeline::st>() {    \
+            return ASM(st,ssse3_lowp);                                         \
+        }
+    LOWP_STAGES(M)
+    #undef M
+#endif
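With those specializations in place, every table slot can be filled by one uniform expression that resolves at compile time to either the assembled lowp stage or nullptr. A hypothetical use (stage name purely illustrative):

    // Stages listed in LOWP_STAGES pick up the specialization and yield the
    // assembled symbol; everything else hits the primary template's nullptr,
    // which later tells build_pipeline() "no lowp version, fall back to float".
    StageFn* fn = hsw_lowp<SkRasterPipeline::load_8888>();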
 
 // Engines comprise everything we need to run SkRasterPipelines.
 struct SkJumper_Engine {
     StageFn* stages[kNumStages];
@@ -239,41 +260,70 @@ static SkJumper_Engine choose_engine() {
     return kPortable;
 }
 
+#ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
+    static const SkJumper_Engine kNone = {
+    #define M(stage) nullptr,
+        { SK_RASTER_PIPELINE_STAGES(M) },
+    #undef M
+        nullptr,
+        nullptr,
+    };
+    static SkJumper_Engine gLowp = kNone;
+    static SkOnce gChooseLowpOnce;
+
+    static SkJumper_Engine choose_lowp() {
+    #if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
+        if (1 && SkCpu::Supports(SkCpu::HSW)) {
+            return {
+            #define M(st) hsw_lowp<SkRasterPipeline::st>(),
+                { SK_RASTER_PIPELINE_STAGES(M) },
+                ASM(start_pipeline,hsw_lowp),
+                ASM(just_return,hsw_lowp)
+            #undef M
+            };
+        }
+        if (1 && SkCpu::Supports(SkCpu::SSSE3)) {
+            return {
+            #define M(st) ssse3_lowp<SkRasterPipeline::st>(),
+                { SK_RASTER_PIPELINE_STAGES(M) },
+                ASM(start_pipeline,ssse3_lowp),
+                ASM(just_return,ssse3_lowp)
+            #undef M
+            };
+        }
+    #endif
+        return kNone;
+    }
+#endif
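If neither HSW nor SSSE3 is available, choose_lowp() hands back kNone, whose all-nullptr stage table makes the lowp attempt in build_pipeline() below fail on its first stage and fall through to the float engine. The choice runs once and is cached in gLowp; a shape-equivalent sketch using std::once_flag in place of SkOnce (illustrative only, relying on the declarations above):

    #include <mutex>
    static std::once_flag gLowpOnceFlag;   // stands in for SkOnce
    static void ensure_lowp_chosen() {
        std::call_once(gLowpOnceFlag, []{ gLowp = choose_lowp(); });
    }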
 StartPipelineFn* SkRasterPipeline::build_pipeline(void** ip) const {
 #ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
-#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
-    if (SkCpu::Supports(SkCpu::SSSE3)) {
-        void** reset_point = ip;
-        *--ip = (void*)ASM(just_return,ssse3_lowp);
-        for (const StageList* st = fStages; st; st = st->prev) {
-            StageFn* fn = nullptr;
-            switch (st->stage) {
-            #define M(st) case SkRasterPipeline::st: fn = ASM(st, ssse3_lowp); break;
-                LOWP_STAGES(M)
-            #undef M
-                case SkRasterPipeline::clamp_0: continue;  // clamp_0 is a no-op in lowp.
-                default:
-                    log_missing(st->stage);
-                    ip = reset_point;
-            }
-            if (ip == reset_point) {
-                break;
-            }
+    gChooseLowpOnce([]{ gLowp = choose_lowp(); });
+
+    // First try to build a lowp pipeline.  If that fails, fall back to normal float gEngine.
+    void** reset_point = ip;
+    *--ip = (void*)gLowp.just_return;
+    for (const StageList* st = fStages; st; st = st->prev) {
+        if (st->stage == SkRasterPipeline::clamp_0) {
+            continue;  // No-op in lowp.
+        }
+        if (StageFn* fn = gLowp.stages[st->stage]) {
             if (st->ctx) {
                 *--ip = st->ctx;
             }
             *--ip = (void*)fn;
-        }
-        if (ip != reset_point) {
-            return ASM(start_pipeline,ssse3_lowp);
+        } else {
+            log_missing(st->stage);
+            ip = reset_point;
+            break;
         }
     }
+    if (ip != reset_point) {
+        return gLowp.start_pipeline;
+    }
-#endif
 #endif
 
     gChooseEngineOnce([]{ gEngine = choose_engine(); });
 
     // We're building the pipeline backwards, so we start with the final stage just_return.
     *--ip = (void*)gEngine.just_return;
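A self-contained model of that backwards build, with made-up stage names: writing through *--ip starting from just_return leaves the program in forward execution order in memory, each stage function followed by its context pointer.

    #include <cstdio>
    int main() {
        const void* slots[5];
        const void** ip = slots + 5;
        *--ip = "just_return";               // the final stage is pushed first
        *--ip = "ctx_b"; *--ip = "stage_b";  // stages are walked last-to-first,
        *--ip = "ctx_a"; *--ip = "stage_a";  // ctx pushed before its stage fn
        for (const void** p = ip; p < slots + 5; p++) {
            printf("%s\n", (const char*)*p); // stage_a ctx_a stage_b ctx_b just_return
        }
        return 0;
    }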

src/jumper/SkJumper_generated.S (diff suppressed: too large to display)

src/jumper/SkJumper_generated_win.S (diff suppressed: too large to display)

src/jumper/SkJumper_stages_lowp.cpp

@@ -9,16 +9,22 @@
 #include "SkJumper_misc.h"
 #include <immintrin.h>
 
-#if !defined(__SSSE3__) || !defined(__clang__) || !defined(__x86_64__)
-    #error "We're starting with just SSSE3 x86-64 for now, and will always require Clang."
+#if !defined(__clang__) || !defined(__x86_64__)
+    #error "We're starting with just x86-64 for now, and will always require Clang."
 #endif
 
-#define WRAP(name) sk_##name##_ssse3_lowp
-
 using K = const SkJumper_constants;
-static const size_t kStride = 8;
-
-template <typename T> using V = T __attribute__((ext_vector_type(8)));
+
+#if defined(__AVX2__)
+    #define WRAP(name) sk_##name##_hsw_lowp
+    template <typename T> using V = T __attribute__((ext_vector_type(16)));
+    static const size_t kStride = 16;
+#else
+    #define WRAP(name) sk_##name##_ssse3_lowp
+    template <typename T> using V = T __attribute__((ext_vector_type(8)));
+    static const size_t kStride = 8;
+#endif
 
 using U8  = V<uint8_t>;
 using U16 = V<uint16_t>;
 using U32 = V<uint32_t>;
@@ -40,7 +46,14 @@ struct F {
 SI F operator+(F x, F y) { return x.vec + y.vec; }
 SI F operator-(F x, F y) { return x.vec - y.vec; }
-SI F operator*(F x, F y) { return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec)); }
+SI F operator*(F x, F y) {
+#if defined(__AVX2__)
+    return _mm256_abs_epi16(_mm256_mulhrs_epi16(x.vec, y.vec));
+#else
+    return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec));
+#endif
+}
 SI F mad(F f, F m, F a) { return f*m+a; }
 SI F inv(F v) { return 1.0f - v; }
 SI F two(F v) { return v + v; }
@@ -51,6 +64,11 @@ SI F operator>>(F x, int bits) { return x.vec >> bits; }
 using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);
 
+#if defined(__AVX__)
+    // We really want to make sure all paths go through this function's (implicit) vzeroupper.
+    // If they don't, we'll experience severe slowdowns when we first use SSE instructions again.
+    __attribute__((disable_tail_calls))
+#endif
 MAYBE_MSABI
 extern "C" size_t WRAP(start_pipeline)(size_t x, size_t y, size_t limit, void** program, K* k) {
     F v{};
@@ -88,13 +106,21 @@ SI V load(const T* src, size_t tail) {
     if (__builtin_expect(tail, 0)) {
         V v{};  // Any inactive lanes are zeroed.
         switch (tail) {
-            case 7: v[6] = src[6];
-            case 6: v[5] = src[5];
-            case 5: v[4] = src[4];
-            case 4: memcpy(&v, src, 4*sizeof(T)); break;
-            case 3: v[2] = src[2];
-            case 2: memcpy(&v, src, 2*sizeof(T)); break;
-            case 1: memcpy(&v, src, 1*sizeof(T)); break;
+            case 15: v[14] = src[14];
+            case 14: v[13] = src[13];
+            case 13: v[12] = src[12];
+            case 12: memcpy(&v, src, 12*sizeof(T)); break;
+            case 11: v[10] = src[10];
+            case 10: v[ 9] = src[ 9];
+            case  9: v[ 8] = src[ 8];
+            case  8: memcpy(&v, src,  8*sizeof(T)); break;
+            case  7: v[ 6] = src[ 6];
+            case  6: v[ 5] = src[ 5];
+            case  5: v[ 4] = src[ 4];
+            case  4: memcpy(&v, src,  4*sizeof(T)); break;
+            case  3: v[ 2] = src[ 2];
+            case  2: memcpy(&v, src,  2*sizeof(T)); break;
+            case  1: memcpy(&v, src,  1*sizeof(T)); break;
         }
         return v;
     }
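A worked trace of the fall-through, assuming kStride == 8 and tail == 7: control enters at case 7 and falls through cases 6 and 5, copying lanes 6, 5, and 4 one element at a time, then case 4's memcpy grabs lanes 0..3 in one shot and breaks. store() below uses the same pattern in the other direction. In scalar form:

    // Hypothetical flattening of load(src, 7) for an 8-wide vector:
    //   v[6] = src[6];                   // case 7
    //   v[5] = src[5];                   // case 6 (fell through)
    //   v[4] = src[4];                   // case 5 (fell through)
    //   memcpy(&v, src, 4*sizeof(T));    // case 4: lanes 0..3, then break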
@@ -106,25 +132,39 @@ SI void store(T* dst, V v, size_t tail) {
     __builtin_assume(tail < kStride);
     if (__builtin_expect(tail, 0)) {
         switch (tail) {
-            case 7: dst[6] = v[6];
-            case 6: dst[5] = v[5];
-            case 5: dst[4] = v[4];
-            case 4: memcpy(dst, &v, 4*sizeof(T)); break;
-            case 3: dst[2] = v[2];
-            case 2: memcpy(dst, &v, 2*sizeof(T)); break;
-            case 1: memcpy(dst, &v, 1*sizeof(T)); break;
+            case 15: dst[14] = v[14];
+            case 14: dst[13] = v[13];
+            case 13: dst[12] = v[12];
+            case 12: memcpy(dst, &v, 12*sizeof(T)); break;
+            case 11: dst[10] = v[10];
+            case 10: dst[ 9] = v[ 9];
+            case  9: dst[ 8] = v[ 8];
+            case  8: memcpy(dst, &v,  8*sizeof(T)); break;
+            case  7: dst[ 6] = v[ 6];
+            case  6: dst[ 5] = v[ 5];
+            case  5: dst[ 4] = v[ 4];
+            case  4: memcpy(dst, &v,  4*sizeof(T)); break;
+            case  3: dst[ 2] = v[ 2];
+            case  2: memcpy(dst, &v,  2*sizeof(T)); break;
+            case  1: memcpy(dst, &v,  1*sizeof(T)); break;
         }
         return;
     }
     unaligned_store(dst, v);
 }
 
+// TODO: mask loads and stores with AVX2
+
 // Scale from [0,255] up to [0,32768].
 SI F from_wide_byte(U16 bytes) {
     // Ideally we'd scale by 32768/255 = 128.50196, but instead we'll approximate
     // that a little more cheaply as 256*32897/65536 = 128.50391.
     // 0 and 255 map to 0 and 32768 correctly, and nothing else is off by more than 1 bit.
-    return _mm_mulhi_epu16(bytes << 8, U16(32897));
+#if defined(__AVX2__)
+    return _mm256_mulhi_epu16(bytes << 8, U16(32897));
+#else
+    return _mm_mulhi_epu16(bytes << 8, U16(32897));
+#endif
 }
 
 SI F from_byte(U8 bytes) {
     return from_wide_byte(__builtin_convertvector(bytes, U16));
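A quick scalar check of that approximation claim, runnable on its own: scale_byte() models the mulhi trick (the high 16 bits of the 32-bit product), and the loop confirms every byte lands within 1 of the ideal 32768/255 scale.

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    static uint16_t scale_byte(uint8_t b) {
        return uint16_t(((uint32_t(b) << 8) * 32897u) >> 16);  // mulhi(b<<8, 32897)
    }
    int main() {
        assert(scale_byte(0)   == 0);      // the endpoints are exact...
        assert(scale_byte(255) == 32768);
        for (int b = 0; b < 256; b++) {    // ...and everything else is close
            double ideal = b * 32768.0 / 255.0;
            assert(std::fabs(scale_byte(uint8_t(b)) - ideal) < 1.0);
        }
        return 0;
    }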
@@ -133,13 +173,22 @@ SI F from_byte(U8 bytes) {
 
 // Pack from [0,32768] down to [0,255].
 SI U16 to_wide_byte(F v) {
     // The simplest thing works great: divide by 128 and saturate.
-    return _mm_min_epi16(v>>7, U16(255));
+#if defined(__AVX2__)
+    return _mm256_min_epi16(v >> 7, U16(255));
+#else
+    return _mm_min_epi16(v >> 7, U16(255));
+#endif
 }
 
 SI U8 to_byte(F v) {
     // Like to_wide_byte(), but we'll bake the saturation into the 16->8 bit pack.
+#if defined(__AVX2__)
+    return _mm_packus_epi16(_mm256_extracti128_si256(v >> 7, 0),
+                            _mm256_extracti128_si256(v >> 7, 1));
+#else
     // Only the bottom 8 bytes are of interest... it doesn't matter what we pack on top.
-    __m128i packed = _mm_packus_epi16(v>>7, v>>7);
+    __m128i packed = _mm_packus_epi16(v >> 7, v >> 7);
     return unaligned_load<U8>(&packed);
+#endif
 }
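Why the AVX2 branch extracts both 128-bit halves rather than using a single _mm256_packus_epi16: the 256-bit pack operates within each 128-bit lane, so its byte output comes out interleaved; packing the two extracted halves with the SSE-width _mm_packus_epi16 keeps all 16 bytes contiguous and in order. (This is a reading of the intrinsics' semantics, not something stated in the commit.)

    // Illustration: _mm256_packus_epi16(v, v) on 16 u16 lanes orders bytes as
    //   0..7, 0..7, 8..15, 8..15   (per-lane),
    // while _mm_packus_epi16(lo_half, hi_half) yields the desired 0..15.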
 SI void from_8888(U32 rgba, F* r, F* g, F* b, F* a) {

src/jumper/build_stages.py

@@ -60,6 +60,12 @@ subprocess.check_call(clang + cflags + hsw +
 subprocess.check_call(clang + cflags + hsw + win +
                       ['-c', 'src/jumper/SkJumper_stages.cpp'] +
                       ['-o', 'win_hsw.o'])
+subprocess.check_call(clang + cflags + hsw +
+                      ['-c', 'src/jumper/SkJumper_stages_lowp.cpp'] +
+                      ['-o', 'lowp_hsw.o'])
+subprocess.check_call(clang + cflags + hsw + win +
+                      ['-c', 'src/jumper/SkJumper_stages_lowp.cpp'] +
+                      ['-o', 'win_lowp_hsw.o'])
 
 aarch64 = [ '--target=aarch64' ]
 subprocess.check_call(clang + cflags + aarch64 +
subprocess.check_call(clang + cflags + aarch64 +
@@ -196,6 +202,8 @@ parse_object_file('sse41.o', '.byte')
 print 'BALIGN32'
 parse_object_file('sse2.o', '.byte')
 print 'BALIGN32'
+parse_object_file('lowp_hsw.o', '.byte')
+print 'BALIGN32'
 parse_object_file('lowp_ssse3.o', '.byte')
 print '#endif'
@@ -221,6 +229,8 @@ parse_object_file('win_sse41.o', 'DB')
 print 'ALIGN 32'
 parse_object_file('win_sse2.o', 'DB')
 print 'ALIGN 32'
+parse_object_file('win_lowp_hsw.o', 'DB')
+print 'ALIGN 32'
 parse_object_file('win_lowp_ssse3.o', 'DB')
 print 'ENDIF'
 print 'END'