apply SkOpts_skx approach to SkOpts_hsw

Very slightly different build flags, switching from -march=haswell to -mavx2 -mfma -mf16c, but there are no diffs. Left some TODOs for the next ones, but _hsw being so common I figured I'd do this one stand alone and make sure it shakes down before doing the rest. clang-cl (but not clang) barfs when we've got a lambda using an AVX intrinsic inside a templated static helper function. Luckily they're all non-type template parameters, so we can just pass them as normal arguments, and it'll optimize the same as the templated code anyway. This was weird, but since we're covering almost all the SkFoo_opts.h headers with HSW here, I don't anticipate this being a problem in the future. (And I'm sure I'll never look back on this statement as naive.) Change-Id: I2f84db356cafa5e158bcc3724fb1c3f58aca7f1e Reviewed-on: https://skia-review.googlesource.com/c/skia/+/293599 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-06-02 11:02:45 -05:00 · 2020-06-02 11:02:45 -05:00 · 08a39c2b5f
commit 08a39c2b5f
parent f80a78602e
4 changed files with 138 additions and 90 deletions
--- a/gn/core.gni
+++ b/gn/core.gni
@ -272,7 +272,15 @@ skia_core_sources = [
  "$_src/core/SkOSFile.h",
  "$_src/core/SkOpts.cpp",
  "$_src/core/SkOpts.h",
+
+  #"$_src/core/SkOpts_avx.cpp",
+  #"$_src/core/SkOpts_crc32.cpp",
+  "$_src/core/SkOpts_hsw.cpp",
  "$_src/core/SkOpts_skx.cpp",
+
+  #"$_src/core/SkOpts_sse41.cpp",
+  #"$_src/core/SkOpts_sse42.cpp",
+  #"$_src/core/SkOpts_ssse3.cpp",
  "$_src/core/SkOrderedReadBuffer.h",
  "$_src/core/SkOverdrawCanvas.cpp",
  "$_src/core/SkPaint.cpp",
--- a/src/core/SkOpts_hsw.cpp
+++ b/src/core/SkOpts_hsw.cpp
@ -0,0 +1,92 @@
+/*
+ * Copyright 2020 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "src/core/SkOpts.h"
+
+#if defined(SK_CPU_X86)
+
+    // Turn on HSW feature set.
+    #if defined(__clang__)
+        #pragma clang attribute push(__attribute__((target("avx2,f16c,fma"))), apply_to=function)
+    #elif defined(__GNUC__)
+        #pragma GCC push_options
+        #pragma GCC target("avx2,f16c,fma")
+    #endif
+
+    // Let our code in *_opts.h know we want HSW features.
+    #undef  SK_CPU_SSE_LEVEL
+    #define SK_CPU_SSE_LEVEL SK_CPU_SSE_LEVEL_AVX2
+
+    #if defined(__clang__) && defined(_MSC_VER)
+        // clang-cl's immintrin.h is bizarrely annoying, not including the
+        // various foointrin.h unless the __FOO__ flag is also defined (i.e.
+        // you used command-line flags to set the features instead of attributes).
+        // MSVC itself doesn't work this way, nor does non-_MSC_VER clang.  :/
+        #define __SSE__ 1
+        #define __SSE2__ 1
+        #define __SSE3__ 1
+        #define __SSSE3__ 1
+        #define __SSE4_1__ 1
+        #define __SSE4_2__ 1
+        #define __AVX__ 1
+        #define __F16C__ 1
+        #define __AVX2__ 1
+        #define __FMA__ 1
+    #endif
+
+    #define SK_OPTS_NS hsw
+    #include "src/core/SkCubicSolver.h"
+    #include "src/opts/SkBitmapProcState_opts.h"
+    #include "src/opts/SkBlitRow_opts.h"
+    #include "src/opts/SkRasterPipeline_opts.h"
+    #include "src/opts/SkSwizzler_opts.h"
+    #include "src/opts/SkUtils_opts.h"
+    #include "src/opts/SkVM_opts.h"
+
+    namespace SkOpts {
+        void Init_hsw() {
+            blit_row_color32     = SK_OPTS_NS::blit_row_color32;
+            blit_row_s32a_opaque = SK_OPTS_NS::blit_row_s32a_opaque;
+
+            S32_alpha_D32_filter_DX  = SK_OPTS_NS::S32_alpha_D32_filter_DX;
+
+            cubic_solver = SK_OPTS_NS::cubic_solver;
+
+            RGBA_to_BGRA          = SK_OPTS_NS::RGBA_to_BGRA;
+            RGBA_to_rgbA          = SK_OPTS_NS::RGBA_to_rgbA;
+            RGBA_to_bgrA          = SK_OPTS_NS::RGBA_to_bgrA;
+            RGB_to_RGB1           = SK_OPTS_NS::RGB_to_RGB1;
+            RGB_to_BGR1           = SK_OPTS_NS::RGB_to_BGR1;
+            gray_to_RGB1          = SK_OPTS_NS::gray_to_RGB1;
+            grayA_to_RGBA         = SK_OPTS_NS::grayA_to_RGBA;
+            grayA_to_rgbA         = SK_OPTS_NS::grayA_to_rgbA;
+            inverted_CMYK_to_RGB1 = SK_OPTS_NS::inverted_CMYK_to_RGB1;
+            inverted_CMYK_to_BGR1 = SK_OPTS_NS::inverted_CMYK_to_BGR1;
+
+        #define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
+            SK_RASTER_PIPELINE_STAGES(M)
+            just_return_highp = (StageFn)SK_OPTS_NS::just_return;
+            start_pipeline_highp = SK_OPTS_NS::start_pipeline;
+        #undef M
+
+        #define M(st) stages_lowp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::lowp::st;
+            SK_RASTER_PIPELINE_STAGES(M)
+            just_return_lowp = (StageFn)SK_OPTS_NS::lowp::just_return;
+            start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
+        #undef M
+
+            interpret_skvm = SK_OPTS_NS::interpret_skvm;
+        }
+    }
+
+    #if defined(__clang__)
+        #pragma clang attribute pop
+    #elif defined(__GNUC__)
+        #pragma GCC pop_options
+    #endif
+
+#endif//defined(SK_CPU_X86)
--- a/src/opts/SkOpts_hsw.cpp
+++ b/src/opts/SkOpts_hsw.cpp
@ -5,49 +5,4 @@
 * found in the LICENSE file.
 */

-#include "src/core/SkOpts.h"
-
-#define SK_OPTS_NS hsw
-#include "src/core/SkCubicSolver.h"
-#include "src/opts/SkBitmapProcState_opts.h"
-#include "src/opts/SkBlitRow_opts.h"
-#include "src/opts/SkRasterPipeline_opts.h"
-#include "src/opts/SkSwizzler_opts.h"
-#include "src/opts/SkUtils_opts.h"
-#include "src/opts/SkVM_opts.h"
-
-namespace SkOpts {
-    void Init_hsw() {
-        blit_row_color32     = hsw::blit_row_color32;
-        blit_row_s32a_opaque = hsw::blit_row_s32a_opaque;
-
-        S32_alpha_D32_filter_DX  = hsw::S32_alpha_D32_filter_DX;
-
-        cubic_solver = SK_OPTS_NS::cubic_solver;
-
-        RGBA_to_BGRA          = SK_OPTS_NS::RGBA_to_BGRA;
-        RGBA_to_rgbA          = SK_OPTS_NS::RGBA_to_rgbA;
-        RGBA_to_bgrA          = SK_OPTS_NS::RGBA_to_bgrA;
-        RGB_to_RGB1           = SK_OPTS_NS::RGB_to_RGB1;
-        RGB_to_BGR1           = SK_OPTS_NS::RGB_to_BGR1;
-        gray_to_RGB1          = SK_OPTS_NS::gray_to_RGB1;
-        grayA_to_RGBA         = SK_OPTS_NS::grayA_to_RGBA;
-        grayA_to_rgbA         = SK_OPTS_NS::grayA_to_rgbA;
-        inverted_CMYK_to_RGB1 = SK_OPTS_NS::inverted_CMYK_to_RGB1;
-        inverted_CMYK_to_BGR1 = SK_OPTS_NS::inverted_CMYK_to_BGR1;
-
-    #define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
-        SK_RASTER_PIPELINE_STAGES(M)
-        just_return_highp = (StageFn)SK_OPTS_NS::just_return;
-        start_pipeline_highp = SK_OPTS_NS::start_pipeline;
-    #undef M
-
-    #define M(st) stages_lowp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::lowp::st;
-        SK_RASTER_PIPELINE_STAGES(M)
-        just_return_lowp = (StageFn)SK_OPTS_NS::lowp::just_return;
-        start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
-    #undef M
-
-        interpret_skvm = SK_OPTS_NS::interpret_skvm;
-    }
-}
+// Intentionally empty, to be cleaned up.
--- a/src/opts/SkSwizzler_opts.h
+++ b/src/opts/SkSwizzler_opts.h
@ -189,8 +189,7 @@ static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
 }

-template <bool kSwapRB>
-static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
+static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
@ -227,11 +226,11 @@ static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count)
 }

 /*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
-    premul_should_swapRB<false>(dst, src, count);
+    premul_should_swapRB(false, dst, src, count);
 }

 /*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
-    premul_should_swapRB<true>(dst, src, count);
+    premul_should_swapRB(true, dst, src, count);
 }

 /*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
@ -267,8 +266,8 @@ static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count)
    RGBA_to_BGRA_portable(dst, src, count);
 }

-template <bool kSwapRB>
-static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
+static void insert_alpha_should_swaprb(bool kSwapRB,
+                                       uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x3_t rgb = vld3q_u8(src);
@ -321,11 +320,11 @@ static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int c
 }

 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
-    insert_alpha_should_swaprb<false>(dst, src, count);
+    insert_alpha_should_swaprb(false, dst, src, count);
 }

 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
-    insert_alpha_should_swaprb<true>(dst, src, count);
+    insert_alpha_should_swaprb(true, dst, src, count);
 }

 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
@ -368,8 +367,7 @@ static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int c
    gray_to_RGB1_portable(dst, src, count);
 }

-template <bool kPremul>
-static void expand_grayA(uint32_t dst[], const uint8_t* src, int count) {
+static void expand_grayA(bool kPremul, uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x2_t ga = vld2q_u8(src);
@ -423,16 +421,15 @@ static void expand_grayA(uint32_t dst[], const uint8_t* src, int count) {
 }

 /*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
-    expand_grayA<false>(dst, src, count);
+    expand_grayA(false, dst, src, count);
 }

 /*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
-    expand_grayA<true>(dst, src, count);
+    expand_grayA(true, dst, src, count);
 }

 enum Format { kRGB1, kBGR1 };
-template <Format format>
-static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
+static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 cmyk pixels.
        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);
@ -470,11 +467,11 @@ static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
 }

 /*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
-    inverted_cmyk_to<kRGB1>(dst, src, count);
+    inverted_cmyk_to(kRGB1, dst, src, count);
 }

 /*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
-    inverted_cmyk_to<kBGR1>(dst, src, count);
+    inverted_cmyk_to(kBGR1, dst, src, count);
 }

 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
@ -489,10 +486,9 @@ static __m256i scale(__m256i x, __m256i y) {
    return _mm256_mulhi_epu16(_mm256_add_epi16(_mm256_mullo_epi16(x, y), _128), _257);
 }

-template <bool kSwapRB>
-static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
+static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {

-    auto premul8 = [](__m256i* lo, __m256i* hi) {
+    auto premul8 = [=](__m256i* lo, __m256i* hi) {
        const __m256i zeros = _mm256_setzero_si256();
        __m256i planar;
        if (kSwapRB) {
@ -560,11 +556,11 @@ static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count)
 }

 /*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
-    premul_should_swapRB<false>(dst, src, count);
+    premul_should_swapRB(false, dst, src, count);
 }

 /*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
-    premul_should_swapRB<true>(dst, src, count);
+    premul_should_swapRB(true, dst, src, count);
 }

 /*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
@ -585,8 +581,8 @@ static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count)
 }

 // Use SSSE3 impl as AVX2 impl regresses performance on some platforms.
-template <bool kSwapRB>
-static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
+static void insert_alpha_should_swaprb(bool kSwapRB,
+                                       uint32_t dst[], const uint8_t* src, int count) {
    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    __m128i expand;
    const uint8_t X = 0xFF; // Used a placeholder.  The value of X is irrelevant.
@ -620,11 +616,11 @@ static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int c
 }

 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
-    insert_alpha_should_swaprb<false>(dst, src, count);
+    insert_alpha_should_swaprb(false, dst, src, count);
 }

 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
-    insert_alpha_should_swaprb<true>(dst, src, count);
+    insert_alpha_should_swaprb(true, dst, src, count);
 }

 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
@ -737,9 +733,8 @@ static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int c
 }

 enum Format { kRGB1, kBGR1 };
-template <Format format>
-static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
-    auto convert8 = [](__m256i* lo, __m256i* hi) {
+static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
+    auto convert8 = [=](__m256i* lo, __m256i* hi) {
        const __m256i zeros = _mm256_setzero_si256();
        __m256i planar;
        if (kBGR1 == format) {
@ -808,11 +803,11 @@ static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
 }

 /*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
-    inverted_cmyk_to<kRGB1>(dst, src, count);
+    inverted_cmyk_to(kRGB1, dst, src, count);
 }

 /*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
-    inverted_cmyk_to<kBGR1>(dst, src, count);
+    inverted_cmyk_to(kBGR1, dst, src, count);
 }

 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
@ -827,10 +822,9 @@ static __m128i scale(__m128i x, __m128i y) {
    return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
 }

-template <bool kSwapRB>
-static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
+static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {

-    auto premul8 = [](__m128i* lo, __m128i* hi) {
+    auto premul8 = [=](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kSwapRB) {
@ -896,11 +890,11 @@ static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count)
 }

 /*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
-    premul_should_swapRB<false>(dst, src, count);
+    premul_should_swapRB(false, dst, src, count);
 }

 /*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
-    premul_should_swapRB<true>(dst, src, count);
+    premul_should_swapRB(true, dst, src, count);
 }

 /*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
@ -919,8 +913,8 @@ static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count)
    RGBA_to_BGRA_portable(dst, src, count);
 }

-template <bool kSwapRB>
-static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
+static void insert_alpha_should_swaprb(bool kSwapRB,
+                                       uint32_t dst[], const uint8_t* src, int count) {
    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    __m128i expand;
    const uint8_t X = 0xFF; // Used a placeholder.  The value of X is irrelevant.
@ -953,11 +947,11 @@ static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int c
 }

 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
-    insert_alpha_should_swaprb<false>(dst, src, count);
+    insert_alpha_should_swaprb(false, dst, src, count);
 }

 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
-    insert_alpha_should_swaprb<true>(dst, src, count);
+    insert_alpha_should_swaprb(true, dst, src, count);
 }

 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
@ -1038,9 +1032,8 @@ static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int c
 }

 enum Format { kRGB1, kBGR1 };
-template <Format format>
-static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
-    auto convert8 = [](__m128i* lo, __m128i* hi) {
+static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
+    auto convert8 = [=](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kBGR1 == format) {
@ -1105,11 +1098,11 @@ static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
 }

 /*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
-    inverted_cmyk_to<kRGB1>(dst, src, count);
+    inverted_cmyk_to(kRGB1, dst, src, count);
 }

 /*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
-    inverted_cmyk_to<kBGR1>(dst, src, count);
+    inverted_cmyk_to(kBGR1, dst, src, count);
 }

 #else