Clean up hyper-local SkCpu feature test experiment.

This removes the code paths where we make SkCpu::Supports() calls
from within a tight loop.  It keeps code paths using SkCpu::Supports()
to choose entire routines from src/opts/.
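
For illustration, the routine-level approach looks roughly like this.
A minimal sketch with hypothetical names (convert_portable, convert_f16c,
and the convert() wrapper are made up; Skia's real selection lives in
src/opts/):

    // A minimal sketch, assuming SkCpu::CacheRuntimeFeatures() has already
    // run.  The feature test happens once, to pick a whole routine; hot
    // loops call through the pointer and never re-test.
    #include <cstdint>
    #include "SkCpu.h"

    void convert_portable(const uint64_t* src, float* dst, int n);  // baseline build
    void convert_f16c    (const uint64_t* src, float* dst, int n);  // built with -mf16c

    using ConvertProc = void (*)(const uint64_t*, float*, int);

    void convert(const uint64_t* src, float* dst, int n) {
        static const ConvertProc proc =
            SkCpu::Supports(SkCpu::F16C) ? convert_f16c : convert_portable;
        proc(src, dst, n);
    }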

We can't rely on these hyper-local checks being hoisted out of the loop
reliably enough.  This worked pretty well on the first couple of platforms
we tried (e.g. Clang on Linux/Mac), but we can't guarantee it works
everywhere.
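
For contrast, the hyper-local pattern being removed puts the feature test
inside the loop body.  A hedged sketch (to_floats and the two load helpers
are hypothetical stand-ins for the real kernels):

    // If the compiler hoists the Supports() branch out of the loop it's
    // free; when it doesn't, every iteration pays for the feature test.
    void load_f16c    (const uint64_t* h, float f[4]);  // hypothetical
    void load_portable(const uint64_t* h, float f[4]);  // hypothetical

    void to_floats(const uint64_t src[], float dst[], int count) {
        for (int i = 0; i < count; i++) {
            if (SkCpu::Supports(SkCpu::F16C)) {   // hoisted... maybe
                load_f16c(src + i, dst + 4*i);
            } else {
                load_portable(src + i, dst + 4*i);
            }
        }
    }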

Further, I'm not able to do anything fancy with these tests outside of
x86... I've not found a way to get, say, NEON+F16 conversion code
embedded into ordinary NEON code short of writing the entire function
in external assembly.

This whole idea becomes less important now that we've got a way to chain
separate function calls together efficiently.  We can now, e.g., use an
AVX+F16C method to load some pixels, then chain that into an ordinary AVX
method to color filter them.
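
A sketch of that chaining (all stage names here are hypothetical
stand-ins, not real Skia functions):

    // Dispatch each stage once, up front; the loop then runs the chosen
    // routines back to back with no per-pixel feature tests.
    Sk4f load_f16_f16c(const uint64_t*);         // hypothetical, uses F16C
    Sk4f load_f16_avx (const uint64_t*);         // hypothetical, plain AVX
    Sk4f color_filter_avx(const Sk4f&);          // hypothetical
    void store_f16_avx(uint64_t*, const Sk4f&);  // hypothetical

    void run(const uint64_t src[], uint64_t dst[], int count) {
        auto load = SkCpu::Supports(SkCpu::F16C) ? load_f16_f16c   // AVX+F16C
                                                 : load_f16_avx;   // plain AVX
        for (int i = 0; i < count; i++) {
            Sk4f px = load(src + i);       // F16C stays inside this call
            store_f16_avx(dst + i, color_filter_avx(px));  // ordinary AVX
        }
    }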

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2138073002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2138073002
mtklein, 2016-07-11 09:59:21 -07:00 (committed by Commit bot)
parent d6767aa2d5
commit 5608e2ed22
5 changed files with 15 additions and 64 deletions


@@ -5,7 +5,6 @@
  * found in the LICENSE file.
  */
 
-#define SkCpu_IMPL
 #include "SkCpu.h"
 #include "SkOnce.h"
@@ -79,17 +78,9 @@
 #endif
 
-#if defined(_MSC_VER)
-    const uint32_t SkCpu::gCachedFeatures = read_cpu_features();
-
-    void SkCpu::CacheRuntimeFeatures() {}
-#else
-    uint32_t SkCpu::gCachedFeatures = 0;
-
-    void SkCpu::CacheRuntimeFeatures() {
-        static SkOnce once;
-        once([] { gCachedFeatures = read_cpu_features(); });
-    }
-#endif
+uint32_t SkCpu::gCachedFeatures = 0;
+
+void SkCpu::CacheRuntimeFeatures() {
+    static SkOnce once;
+    once([] { gCachedFeatures = read_cpu_features(); });
+}


@@ -32,11 +32,7 @@ struct SkCpu {
     static void CacheRuntimeFeatures();
     static bool Supports(uint32_t);
 private:
-#if defined(_MSC_VER) || !defined(SkCpu_IMPL)
-    static const uint32_t gCachedFeatures;
-#else
-    static uint32_t gCachedFeatures;
-#endif
+    static uint32_t gCachedFeatures;
 };
 
 inline bool SkCpu::Supports(uint32_t mask) {


@@ -8,7 +8,6 @@
 #ifndef SkHalf_DEFINED
 #define SkHalf_DEFINED
 
-#include "SkCpu.h"
 #include "SkNx.h"
 #include "SkTypes.h"
@@ -123,32 +122,3 @@ static inline uint64_t SkFloatToHalf_01(const Sk4f& fs) {
 }
 #endif
-
-static inline Sk4f SkHalfToFloat_01(const uint64_t* hs) {
-#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
-    if (SkCpu::Supports(SkCpu::F16C)) {
-        __m128 fs;
-    #if defined(__GNUC__) || defined(__clang__)
-        asm("vcvtph2ps %[hs], %[fs]" : [fs]"=x"(fs) : [hs]"m"(*hs));
-    #else
-        fs = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)hs));
-    #endif
-        return fs;
-    }
-#endif
-    return SkHalfToFloat_01(*hs);
-}
-
-static inline void SkFloatToHalf_01(const Sk4f& fs, uint64_t* hs) {
-#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
-    if (SkCpu::Supports(SkCpu::F16C)) {
-    #if defined(__GNUC__) || defined(__clang__)
-        asm("vcvtps2ph $0, %[fs], %[hs]" : [hs]"=m"(*hs) : [fs]"x"(fs.fVec));
-    #else
-        _mm_storel_epi64((__m128i*)hs, _mm_cvtps_ph(fs.fVec, 0));
-    #endif
-        return;
-    }
-#endif
-    *hs = SkFloatToHalf_01(fs);
-}


@@ -135,12 +135,12 @@ static void srcover_n(const SkXfermode*, uint64_t dst[], const SkPM4f src[], int count,
                       const SkAlpha aa[]) {
     for (int i = 0; i < count; ++i) {
         Sk4f s = Sk4f::Load(src+i),
-             d = SkHalfToFloat_01(dst+i),
+             d = SkHalfToFloat_01(dst[i]),
              r = s + d*(1.0f - SkNx_shuffle<3,3,3,3>(s));
         if (aa) {
             r = lerp_by_coverage(r, d, aa[i]);
         }
-        SkFloatToHalf_01(r, dst+i);
+        dst[i] = SkFloatToHalf_01(r);
     }
 }


@@ -8,7 +8,6 @@
 #ifndef SkNx_sse_DEFINED
 #define SkNx_sse_DEFINED
 
-#include "SkCpu.h"
 #include <immintrin.h>
 
 // This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent.
@@ -91,15 +90,9 @@ public:
     SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); }
 
     SkNx floor() const {
-        if (SkCpu::Supports(SkCpu::SSE41)) {
-            __m128 r;
-        #if defined(__GNUC__) || defined(__clang__)
-            asm("roundps $0x1, %[fVec], %[r]" : [r]"=x"(r) : [fVec]"x"(fVec));
-        #else
-            r = _mm_floor_ps(fVec);
-        #endif
-            return r;
-        }
+    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+        return _mm_floor_ps(fVec);
+    #else
         // Emulate _mm_floor_ps() with SSE2:
         //  - roundtrip through integers via truncation
         //  - subtract 1 if that's too big (possible for negative values).
@@ -108,6 +101,7 @@ public:
         __m128 roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(fVec));
         __m128 too_big   = _mm_cmpgt_ps(roundtrip, fVec);
         return _mm_sub_ps(roundtrip, _mm_and_ps(too_big, _mm_set1_ps(1.0f)));
+    #endif
     }
 
     SkNx sqrt() const { return _mm_sqrt_ps (fVec); }
@@ -124,12 +118,12 @@ public:
     bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(fVec)); }
 
     SkNx thenElse(const SkNx& t, const SkNx& e) const {
-        #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
         return _mm_blendv_ps(e.fVec, t.fVec, fVec);
-        #else
+    #else
         return _mm_or_ps(_mm_and_ps   (fVec, t.fVec),
                          _mm_andnot_ps(fVec, e.fVec));
-        #endif
+    #endif
     }
 
     __m128 fVec;