Add AVX512 implementation for blit_row_s32a_opaque

blit_row_s32a_opaque is ~20-30% faster on an Ice Lake CPU.
nanobench results:
                  before     after
SkVM_4096_Opts    0.141ns    0.108ns
SkVM_1024_Opts    0.161ns    0.110ns
SkVM_256_Opts     0.155ns    0.109ns

Change-Id: If46b3fbeb4a7b68b152aca2c0bc3e1417578d4b2
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284528
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
Author: Lingyun Cai, 2020-04-26 14:44:32 +08:00 (committed by Skia Commit-Bot)
parent 81ef385c1f
commit 56f23a1d17
3 changed files with 88 additions and 2 deletions


@@ -84,6 +84,9 @@ inline bool SkCpu::Supports(uint32_t mask) {
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
        features |= AVX2;
    #endif
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
        features |= (AVX512F | AVX512DQ | AVX512CD | AVX512BW | AVX512VL);
    #endif
        // FMA doesn't fit neatly into this total ordering.
        // It's available on Haswell+ just like AVX2, but it's technically a different bit.
        // TODO: circle back on this if we find ourselves limited by lack of compile-time FMA
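
These compile-time bits have a runtime counterpart: Skia only routes SkOpts through Init_skx() when SkCpu's CPUID probing reports the same five features on the running machine. A minimal standalone sketch of an equivalent runtime check, using the GCC/Clang builtin rather than Skia's own CPUID code in SkCpu.cpp:

    #include <cstdio>

    int main() {
        // The same five CPUID leaf-7 features that SkCpu groups together as "SKX".
        bool skx = __builtin_cpu_supports("avx512f")  &&
                   __builtin_cpu_supports("avx512dq") &&
                   __builtin_cpu_supports("avx512cd") &&
                   __builtin_cpu_supports("avx512bw") &&
                   __builtin_cpu_supports("avx512vl");
        std::printf("SKX feature set: %s\n", skx ? "available" : "missing");
        return 0;
    }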


@@ -44,11 +44,14 @@
    #endif

    #define SK_OPTS_NS skx
    #include "src/opts/SkBlitRow_opts.h"
    #include "src/opts/SkRasterPipeline_opts.h"
    #include "src/opts/SkVM_opts.h"

namespace SkOpts {
    void Init_skx() {
        blit_row_s32a_opaque = SK_OPTS_NS::blit_row_s32a_opaque;

    #define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
        SK_RASTER_PIPELINE_STAGES(M)
        just_return_highp = (StageFn)SK_OPTS_NS::just_return;
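
For context, the SkOpts pattern that Init_skx() participates in: each hot routine is a global function pointer that starts out aimed at portable code, and the per-ISA Init functions repoint it once CPU support is confirmed. A simplified sketch (names are illustrative, not Skia's actual declarations):

    #include <cstdint>

    namespace portable { inline void blit(uint32_t*, const uint32_t*, int) { /* baseline C++ */ } }
    namespace skx      { inline void blit(uint32_t*, const uint32_t*, int) { /* AVX-512 body */ } }

    // Defaults to the portable implementation at static-init time...
    void (*blit_ptr)(uint32_t*, const uint32_t*, int) = portable::blit;

    // ...and is repointed at startup if the running CPU qualifies.
    void Init_skx_sketch() { blit_ptr = skx::blit; }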


@@ -11,7 +11,37 @@
#include "include/private/SkColorData.h"
#include "include/private/SkVx.h"
#include "src/core/SkMSAN.h"
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
    #include <immintrin.h>

    static inline __m512i SkPMSrcOver_SKX(const __m512i& src, const __m512i& dst) {
        // Detailed explanations in SkPMSrcOver_AVX2
        // b = s + (d*(256-srcA)) >> 8

        // Shuffle each pixel's srcA to the low byte of each 16-bit half of the pixel.
        const uint8_t _ = -1;   // fills a literal 0 byte.
        const uint8_t mask[64] = {  3,_, 3,_,  7,_, 7,_, 11,_,11,_, 15,_,15,_,
                                   19,_,19,_, 23,_,23,_, 27,_,27,_, 31,_,31,_,
                                   35,_,35,_, 39,_,39,_, 43,_,43,_, 47,_,47,_,
                                   51,_,51,_, 55,_,55,_, 59,_,59,_, 63,_,63,_ };
        __m512i srcA_x2 = _mm512_shuffle_epi8(src, _mm512_loadu_si512(mask));
        __m512i scale_x2 = _mm512_sub_epi16(_mm512_set1_epi16(256),
                                            srcA_x2);

        // Scale red and blue, leaving results in the low byte of each 16-bit lane.
        __m512i rb = _mm512_and_si512(_mm512_set1_epi32(0x00ff00ff), dst);
        rb = _mm512_mullo_epi16(rb, scale_x2);
        rb = _mm512_srli_epi16(rb, 8);

        // Scale green and alpha, leaving results in the high byte, masking off the low bits.
        __m512i ga = _mm512_srli_epi16(dst, 8);
        ga = _mm512_mullo_epi16(ga, scale_x2);
        ga = _mm512_andnot_si512(_mm512_set1_epi32(0x00ff00ff), ga);

        return _mm512_add_epi32(src, _mm512_or_si512(rb, ga));
    }
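
The vector math is easier to audit against a scalar version of the same per-pixel blend. A reference sketch mirroring the lanes above (srcover_scalar is an illustrative helper, not part of this change; it inherits the same 256-vs-255 approximation flagged by the TODO in the loop below):

    #include <cstdint>

    // dst' = src + (dst * (256 - srcA)) >> 8, per channel, on premultiplied 8888 pixels.
    static uint32_t srcover_scalar(uint32_t src, uint32_t dst) {
        uint32_t scale = 256 - (src >> 24);                  // 256 - srcA
        uint32_t rb = ((dst & 0x00ff00ff) * scale) >> 8;     // red/blue scaled into low bytes
        uint32_t ga = ((dst >> 8) & 0x00ff00ff) * scale;     // green/alpha scaled into high bytes
        return src + ((rb & 0x00ff00ff) | (ga & 0xff00ff00));
    }

Sanity checks on this scalar form: an opaque src (srcA = 255 gives scale = 1, so every scaled dst channel rounds to 0) returns src unchanged, and a fully transparent src (scale = 256) returns dst exactly, which is what the two fast paths in the loop below exploit.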
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    #include <immintrin.h>

    static inline __m256i SkPMSrcOver_AVX2(const __m256i& src, const __m256i& dst) {
@@ -163,8 +193,58 @@ static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
    SkASSERT(alpha == 0xFF);
    sk_msan_assert_initialized(src, src+len);

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
    while (len >= 64) {
        // Load 64 source pixels.
        auto s0 = _mm512_loadu_si512((const __m512i*)(src) + 0),
             s1 = _mm512_loadu_si512((const __m512i*)(src) + 1),
             s2 = _mm512_loadu_si512((const __m512i*)(src) + 2),
             s3 = _mm512_loadu_si512((const __m512i*)(src) + 3);

        const auto alphaMask = _mm512_set1_epi32(0xFF000000);

        auto ORed = _mm512_or_si512(s3, _mm512_or_si512(s2, _mm512_or_si512(s1, s0)));
        if (0 == _cvtmask64_u64(_mm512_cmpneq_epi8_mask(_mm512_and_si512(ORed, alphaMask),
                                                        _mm512_setzero_si512()))) {
            // All 64 source pixels are transparent. Nothing to do.
            src += 64;
            dst += 64;
            len -= 64;
            continue;
        }

        auto d0 = (__m512i*)(dst) + 0,
             d1 = (__m512i*)(dst) + 1,
             d2 = (__m512i*)(dst) + 2,
             d3 = (__m512i*)(dst) + 3;

        auto ANDed = _mm512_and_si512(s3, _mm512_and_si512(s2, _mm512_and_si512(s1, s0)));
        if (0 == _cvtmask64_u64(_mm512_cmpneq_epi8_mask(_mm512_and_si512(ANDed, alphaMask),
                                                        alphaMask))) {
            // All 64 source pixels are opaque. SrcOver becomes Src.
            _mm512_storeu_si512(d0, s0);
            _mm512_storeu_si512(d1, s1);
            _mm512_storeu_si512(d2, s2);
            _mm512_storeu_si512(d3, s3);
            src += 64;
            dst += 64;
            len -= 64;
            continue;
        }

        // TODO: This math is wrong.
        // Do SrcOver.
        _mm512_storeu_si512(d0, SkPMSrcOver_SKX(s0, _mm512_loadu_si512(d0)));
        _mm512_storeu_si512(d1, SkPMSrcOver_SKX(s1, _mm512_loadu_si512(d1)));
        _mm512_storeu_si512(d2, SkPMSrcOver_SKX(s2, _mm512_loadu_si512(d2)));
        _mm512_storeu_si512(d3, SkPMSrcOver_SKX(s3, _mm512_loadu_si512(d3)));
        src += 64;
        dst += 64;
        len -= 64;
    }
    // Require AVX2 because of AVX2 integer calculation intrinsics in SrcOver
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    while (len >= 32) {
        // Load 32 source pixels.
        auto s0 = _mm256_loadu_si256((const __m256i*)(src) + 0),