Clean up hyper-local SkCpu feature test experiment.
This removes the code paths where we make SkCpu::Supports() calls from within a tight loop. It keeps code paths using SkCpu::Supports() to choose entire routines from src/opts/. We can't rely on these hyper-local checks to be hoisted up reliably enough. It worked pretty well with the first couple platforms we tried (e.g. Clang on Linux/Mac) but we can't guarantee it works everywhere. Further, I'm not able to actually do anything fancy with those tests outside of x86... I've not found a way to get, say, NEON+F16 conversion code embedded into ordinary NEON code outside writing the entire function in external assembly. This whole idea becomes less important now that we've got a way to chain separate function calls together efficiently. We can now, e.g., use an AVX+F16C method to load some pixels, then chain that into an ordinary AVX method to color filter them. BUG=skia: GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2138073002 CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot Review-Url: https://codereview.chromium.org/2138073002
This commit is contained in:
parent
d6767aa2d5
commit
5608e2ed22
@ -5,7 +5,6 @@
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#define SkCpu_IMPL
|
||||
#include "SkCpu.h"
|
||||
#include "SkOnce.h"
|
||||
|
||||
@ -79,17 +78,9 @@
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
const uint32_t SkCpu::gCachedFeatures = read_cpu_features();
|
||||
uint32_t SkCpu::gCachedFeatures = 0;
|
||||
|
||||
void SkCpu::CacheRuntimeFeatures() {}
|
||||
|
||||
#else
|
||||
uint32_t SkCpu::gCachedFeatures = 0;
|
||||
|
||||
void SkCpu::CacheRuntimeFeatures() {
|
||||
static SkOnce once;
|
||||
once([] { gCachedFeatures = read_cpu_features(); });
|
||||
}
|
||||
|
||||
#endif
|
||||
void SkCpu::CacheRuntimeFeatures() {
|
||||
static SkOnce once;
|
||||
once([] { gCachedFeatures = read_cpu_features(); });
|
||||
}
|
||||
|
@ -32,11 +32,7 @@ struct SkCpu {
|
||||
static void CacheRuntimeFeatures();
|
||||
static bool Supports(uint32_t);
|
||||
private:
|
||||
#if defined(_MSC_VER) || !defined(SkCpu_IMPL)
|
||||
static const uint32_t gCachedFeatures;
|
||||
#else
|
||||
static uint32_t gCachedFeatures;
|
||||
#endif
|
||||
static uint32_t gCachedFeatures;
|
||||
};
|
||||
|
||||
inline bool SkCpu::Supports(uint32_t mask) {
|
||||
|
@ -8,7 +8,6 @@
|
||||
#ifndef SkHalf_DEFINED
|
||||
#define SkHalf_DEFINED
|
||||
|
||||
#include "SkCpu.h"
|
||||
#include "SkNx.h"
|
||||
#include "SkTypes.h"
|
||||
|
||||
@ -123,32 +122,3 @@ static inline uint64_t SkFloatToHalf_01(const Sk4f& fs) {
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static inline Sk4f SkHalfToFloat_01(const uint64_t* hs) {
|
||||
#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
|
||||
if (SkCpu::Supports(SkCpu::F16C)) {
|
||||
__m128 fs;
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
asm("vcvtph2ps %[hs], %[fs]" : [fs]"=x"(fs) : [hs]"m"(*hs));
|
||||
#else
|
||||
fs = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)hs));
|
||||
#endif
|
||||
return fs;
|
||||
}
|
||||
#endif
|
||||
return SkHalfToFloat_01(*hs);
|
||||
}
|
||||
|
||||
static inline void SkFloatToHalf_01(const Sk4f& fs, uint64_t* hs) {
|
||||
#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
|
||||
if (SkCpu::Supports(SkCpu::F16C)) {
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
asm("vcvtps2ph $0, %[fs], %[hs]" : [hs]"=m"(*hs) : [fs]"x"(fs.fVec));
|
||||
#else
|
||||
_mm_storel_epi64((__m128i*)hs, _mm_cvtps_ph(fs.fVec, 0));
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
*hs = SkFloatToHalf_01(fs);
|
||||
}
|
||||
|
@ -135,12 +135,12 @@ static void srcover_n(const SkXfermode*, uint64_t dst[], const SkPM4f src[], int
|
||||
const SkAlpha aa[]) {
|
||||
for (int i = 0; i < count; ++i) {
|
||||
Sk4f s = Sk4f::Load(src+i),
|
||||
d = SkHalfToFloat_01(dst+i),
|
||||
d = SkHalfToFloat_01(dst[i]),
|
||||
r = s + d*(1.0f - SkNx_shuffle<3,3,3,3>(s));
|
||||
if (aa) {
|
||||
r = lerp_by_coverage(r, d, aa[i]);
|
||||
}
|
||||
SkFloatToHalf_01(r, dst+i);
|
||||
dst[i] = SkFloatToHalf_01(r);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -8,7 +8,6 @@
|
||||
#ifndef SkNx_sse_DEFINED
|
||||
#define SkNx_sse_DEFINED
|
||||
|
||||
#include "SkCpu.h"
|
||||
#include <immintrin.h>
|
||||
|
||||
// This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent.
|
||||
@ -91,15 +90,9 @@ public:
|
||||
|
||||
SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); }
|
||||
SkNx floor() const {
|
||||
if (SkCpu::Supports(SkCpu::SSE41)) {
|
||||
__m128 r;
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
asm("roundps $0x1, %[fVec], %[r]" : [r]"=x"(r) : [fVec]"x"(fVec));
|
||||
#else
|
||||
r = _mm_floor_ps(fVec);
|
||||
#endif
|
||||
return r;
|
||||
}
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
|
||||
return _mm_floor_ps(fVec);
|
||||
#else
|
||||
// Emulate _mm_floor_ps() with SSE2:
|
||||
// - roundtrip through integers via truncation
|
||||
// - subtract 1 if that's too big (possible for negative values).
|
||||
@ -108,6 +101,7 @@ public:
|
||||
__m128 roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(fVec));
|
||||
__m128 too_big = _mm_cmpgt_ps(roundtrip, fVec);
|
||||
return _mm_sub_ps(roundtrip, _mm_and_ps(too_big, _mm_set1_ps(1.0f)));
|
||||
#endif
|
||||
}
|
||||
|
||||
SkNx sqrt() const { return _mm_sqrt_ps (fVec); }
|
||||
@ -124,12 +118,12 @@ public:
|
||||
bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(fVec)); }
|
||||
|
||||
SkNx thenElse(const SkNx& t, const SkNx& e) const {
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
|
||||
return _mm_blendv_ps(e.fVec, t.fVec, fVec);
|
||||
#else
|
||||
#else
|
||||
return _mm_or_ps(_mm_and_ps (fVec, t.fVec),
|
||||
_mm_andnot_ps(fVec, e.fVec));
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
__m128 fVec;
|
||||
|
Loading…
Reference in New Issue
Block a user