Clean up hyper-local SkCpu feature test experiment.

This removes the code paths where we make SkCpu::Supports() calls
from within a tight loop.  It keeps code paths using SkCpu::Supports()
to choose entire routines from src/opts/.
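
For illustration, the routine-level approach looks roughly like this.
A minimal sketch with hypothetical names (convert_portable, convert_f16c,
and the convert() wrapper are made up; Skia's real selection lives in
src/opts/):

    // A minimal sketch, assuming SkCpu::CacheRuntimeFeatures() has already
    // run.  The feature test happens once, to pick a whole routine; hot
    // loops call through the pointer and never re-test.
    #include <cstdint>
    #include "SkCpu.h"

    void convert_portable(const uint64_t* src, float* dst, int n);  // baseline build
    void convert_f16c    (const uint64_t* src, float* dst, int n);  // built with -mf16c

    using ConvertProc = void (*)(const uint64_t*, float*, int);

    void convert(const uint64_t* src, float* dst, int n) {
        static const ConvertProc proc =
            SkCpu::Supports(SkCpu::F16C) ? convert_f16c : convert_portable;
        proc(src, dst, n);
    }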

We can't rely on these hyper-local checks being hoisted out of the loop
reliably enough.  This worked pretty well on the first couple of platforms
we tried (e.g. Clang on Linux/Mac), but we can't guarantee it works
everywhere.
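
For contrast, the hyper-local pattern being removed puts the feature test
inside the loop body.  A hedged sketch (to_floats and the two load helpers
are hypothetical stand-ins for the real kernels):

    // If the compiler hoists the Supports() branch out of the loop it's
    // free; when it doesn't, every iteration pays for the feature test.
    void load_f16c    (const uint64_t* h, float f[4]);  // hypothetical
    void load_portable(const uint64_t* h, float f[4]);  // hypothetical

    void to_floats(const uint64_t src[], float dst[], int count) {
        for (int i = 0; i < count; i++) {
            if (SkCpu::Supports(SkCpu::F16C)) {   // hoisted... maybe
                load_f16c(src + i, dst + 4*i);
            } else {
                load_portable(src + i, dst + 4*i);
            }
        }
    }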

Further, I'm not able to do anything fancy with these tests outside of
x86... I've not found a way to get, say, NEON+F16 conversion code
embedded into ordinary NEON code short of writing the entire function
in external assembly.

This whole idea becomes less important now that we've got a way to chain
separate function calls together efficiently.  We can now, e.g., use an
AVX+F16C method to load some pixels, then chain that into an ordinary AVX
method to color filter them.
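
A sketch of that chaining (all stage names here are hypothetical
stand-ins, not real Skia functions):

    // Dispatch each stage once, up front; the loop then runs the chosen
    // routines back to back with no per-pixel feature tests.
    Sk4f load_f16_f16c(const uint64_t*);         // hypothetical, uses F16C
    Sk4f load_f16_avx (const uint64_t*);         // hypothetical, plain AVX
    Sk4f color_filter_avx(const Sk4f&);          // hypothetical
    void store_f16_avx(uint64_t*, const Sk4f&);  // hypothetical

    void run(const uint64_t src[], uint64_t dst[], int count) {
        auto load = SkCpu::Supports(SkCpu::F16C) ? load_f16_f16c   // AVX+F16C
                                                 : load_f16_avx;   // plain AVX
        for (int i = 0; i < count; i++) {
            Sk4f px = load(src + i);       // F16C stays inside this call
            store_f16_avx(dst + i, color_filter_avx(px));  // ordinary AVX
        }
    }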

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2138073002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2138073002
mtklein, 2016-07-11 09:59:21 -07:00 (committed by Commit bot)
parent d6767aa2d5
commit 5608e2ed22
5 changed files with 15 additions and 64 deletions


@@ -5,7 +5,6 @@
  * found in the LICENSE file.
  */
 
-#define SkCpu_IMPL
 #include "SkCpu.h"
 #include "SkOnce.h"
@@ -79,17 +78,9 @@
 #endif
 
-#if defined(_MSC_VER)
-    const uint32_t SkCpu::gCachedFeatures = read_cpu_features();
-
-    void SkCpu::CacheRuntimeFeatures() {}
-#else
-    uint32_t SkCpu::gCachedFeatures = 0;
-
-    void SkCpu::CacheRuntimeFeatures() {
-        static SkOnce once;
-        once([] { gCachedFeatures = read_cpu_features(); });
-    }
-#endif
+uint32_t SkCpu::gCachedFeatures = 0;
+
+void SkCpu::CacheRuntimeFeatures() {
+    static SkOnce once;
+    once([] { gCachedFeatures = read_cpu_features(); });
+}


@@ -32,11 +32,7 @@ struct SkCpu {
     static void CacheRuntimeFeatures();
     static bool Supports(uint32_t);
 private:
-#if defined(_MSC_VER) || !defined(SkCpu_IMPL)
-    static const uint32_t gCachedFeatures;
-#else
-    static uint32_t gCachedFeatures;
-#endif
+    static uint32_t gCachedFeatures;
 };
 
 inline bool SkCpu::Supports(uint32_t mask) {


@@ -8,7 +8,6 @@
 #ifndef SkHalf_DEFINED
 #define SkHalf_DEFINED
 
-#include "SkCpu.h"
 #include "SkNx.h"
 #include "SkTypes.h"
@@ -123,32 +122,3 @@ static inline uint64_t SkFloatToHalf_01(const Sk4f& fs) {
 }
 #endif
-
-static inline Sk4f SkHalfToFloat_01(const uint64_t* hs) {
-#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
-    if (SkCpu::Supports(SkCpu::F16C)) {
-        __m128 fs;
-    #if defined(__GNUC__) || defined(__clang__)
-        asm("vcvtph2ps %[hs], %[fs]" : [fs]"=x"(fs) : [hs]"m"(*hs));
-    #else
-        fs = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)hs));
-    #endif
-        return fs;
-    }
-#endif
-    return SkHalfToFloat_01(*hs);
-}
-
-static inline void SkFloatToHalf_01(const Sk4f& fs, uint64_t* hs) {
-#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
-    if (SkCpu::Supports(SkCpu::F16C)) {
-    #if defined(__GNUC__) || defined(__clang__)
-        asm("vcvtps2ph $0, %[fs], %[hs]" : [hs]"=m"(*hs) : [fs]"x"(fs.fVec));
-    #else
-        _mm_storel_epi64((__m128i*)hs, _mm_cvtps_ph(fs.fVec, 0));
-    #endif
-        return;
-    }
-#endif
-    *hs = SkFloatToHalf_01(fs);
-}


@@ -135,12 +135,12 @@ static void srcover_n(const SkXfermode*, uint64_t dst[], const SkPM4f src[], int count,
                       const SkAlpha aa[]) {
     for (int i = 0; i < count; ++i) {
         Sk4f s = Sk4f::Load(src+i),
-             d = SkHalfToFloat_01(dst+i),
+             d = SkHalfToFloat_01(dst[i]),
              r = s + d*(1.0f - SkNx_shuffle<3,3,3,3>(s));
         if (aa) {
             r = lerp_by_coverage(r, d, aa[i]);
         }
-        SkFloatToHalf_01(r, dst+i);
+        dst[i] = SkFloatToHalf_01(r);
     }
 }


@@ -8,7 +8,6 @@
 #ifndef SkNx_sse_DEFINED
 #define SkNx_sse_DEFINED
 
-#include "SkCpu.h"
 #include <immintrin.h>
 
 // This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent.
@@ -91,15 +90,9 @@ public:
     SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); }
 
     SkNx floor() const {
-        if (SkCpu::Supports(SkCpu::SSE41)) {
-            __m128 r;
-        #if defined(__GNUC__) || defined(__clang__)
-            asm("roundps $0x1, %[fVec], %[r]" : [r]"=x"(r) : [fVec]"x"(fVec));
-        #else
-            r = _mm_floor_ps(fVec);
-        #endif
-            return r;
-        }
+    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+        return _mm_floor_ps(fVec);
+    #else
         // Emulate _mm_floor_ps() with SSE2:
         //  - roundtrip through integers via truncation
         //  - subtract 1 if that's too big (possible for negative values).
@@ -108,6 +101,7 @@ public:
         __m128 roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(fVec));
         __m128 too_big   = _mm_cmpgt_ps(roundtrip, fVec);
         return _mm_sub_ps(roundtrip, _mm_and_ps(too_big, _mm_set1_ps(1.0f)));
+    #endif
     }
 
     SkNx sqrt() const { return _mm_sqrt_ps (fVec); }
@@ -124,12 +118,12 @@ public:
     bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(fVec)); }
 
     SkNx thenElse(const SkNx& t, const SkNx& e) const {
-        #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
         return _mm_blendv_ps(e.fVec, t.fVec, fVec);
-        #else
+    #else
         return _mm_or_ps(_mm_and_ps   (fVec, t.fVec),
                          _mm_andnot_ps(fVec, e.fVec));
-        #endif
+    #endif
     }
 
     __m128 fVec;