Port SkUtils opts to SkOpts.

With this new arrangement, the benefits of inlining sk_memset16/32 have changed.

On x86, they're not significantly different, except for small N<=10 where the inlined code is significantly slower.
On ARMv7 with NEON, our custom code is still significantly faster for N>10 (up to 2x faster).  For small N<=10 inlining is still significantly faster.
On ARMv7 without NEON, our custom code is still ridiculously faster (up to 10x) than inlining for N>10, though for small N<=10 inlining is still a little faster.

We were not using the NEON memset16 and memset32 procs on ARMv8.  At first blush, that seems to be an oversight, but if so it's an extremely lucky one.  The ARMv8 code generation for our memset16/32 procs is total garbage, leaving those methods ~8x slower than just inlining the memset, using the compiler's autovectorization.

So, no need to inline any more on x86, and still inline for N<=10 on ARMv7.  Always inline for ARMv8.

BUG=skia:4117

Review URL: https://codereview.chromium.org/1270573002
This commit is contained in:
mtklein 2015-07-31 10:46:50 -07:00 committed by Commit bot
parent 5119ac069e
commit 7eb0945af2
14 changed files with 142 additions and 397 deletions

View File

@ -10,7 +10,6 @@
'<(skia_src_path)/opts/SkBlurImage_opts_none.cpp',
'<(skia_src_path)/opts/SkMorphology_opts_none.cpp',
'<(skia_src_path)/opts/SkTextureCompression_opts_none.cpp',
'<(skia_src_path)/opts/SkUtils_opts_none.cpp',
'<(skia_src_path)/opts/SkXfermode_opts_none.cpp',
],
@ -21,7 +20,6 @@
'<(skia_src_path)/opts/SkBlurImage_opts_arm.cpp',
'<(skia_src_path)/opts/SkMorphology_opts_arm.cpp',
'<(skia_src_path)/opts/SkTextureCompression_opts_arm.cpp',
'<(skia_src_path)/opts/SkUtils_opts_arm.cpp',
'<(skia_src_path)/opts/SkXfermode_opts_arm.cpp',
],
'neon_sources': [
@ -32,7 +30,6 @@
'<(skia_src_path)/opts/SkBlurImage_opts_neon.cpp',
'<(skia_src_path)/opts/SkMorphology_opts_neon.cpp',
'<(skia_src_path)/opts/SkTextureCompression_opts_neon.cpp',
'<(skia_src_path)/opts/SkUtils_opts_arm_neon.cpp',
'<(skia_src_path)/opts/SkXfermode_opts_arm_neon.cpp',
'<(skia_src_path)/opts/SkOpts_neon.cpp',
],
@ -49,7 +46,6 @@
'<(skia_src_path)/opts/SkMorphology_opts_arm.cpp',
'<(skia_src_path)/opts/SkMorphology_opts_neon.cpp',
'<(skia_src_path)/opts/SkTextureCompression_opts_none.cpp',
'<(skia_src_path)/opts/SkUtils_opts_none.cpp',
'<(skia_src_path)/opts/SkXfermode_opts_arm.cpp',
'<(skia_src_path)/opts/SkXfermode_opts_arm_neon.cpp',
'<(skia_src_path)/opts/SkOpts_neon.cpp',
@ -62,7 +58,6 @@
'<(skia_src_path)/opts/SkBlurImage_opts_none.cpp',
'<(skia_src_path)/opts/SkMorphology_opts_none.cpp',
'<(skia_src_path)/opts/SkTextureCompression_opts_none.cpp',
'<(skia_src_path)/opts/SkUtils_opts_none.cpp',
'<(skia_src_path)/opts/SkXfermode_opts_none.cpp',
],
@ -73,7 +68,6 @@
'<(skia_src_path)/opts/SkBlurImage_opts_SSE2.cpp',
'<(skia_src_path)/opts/SkMorphology_opts_SSE2.cpp',
'<(skia_src_path)/opts/SkTextureCompression_opts_none.cpp',
'<(skia_src_path)/opts/SkUtils_opts_SSE2.cpp',
'<(skia_src_path)/opts/SkXfermode_opts_none.cpp',
'<(skia_src_path)/opts/opts_check_x86.cpp',
'<(skia_src_path)/opts/SkOpts_sse2.cpp',

View File

@ -11,6 +11,7 @@
#define SkFloatingPoint_DEFINED
#include "SkTypes.h"
#include "../private/SkOpts.h"
#include <math.h>
#include <float.h>
@ -127,8 +128,6 @@ extern const uint32_t gIEEENegativeInfinity;
#define SK_FloatInfinity (*SkTCast<const float*>(&gIEEEInfinity))
#define SK_FloatNegativeInfinity (*SkTCast<const float*>(&gIEEENegativeInfinity))
namespace SkOpts { extern float (*rsqrt)(float); }
// Fast, approximate inverse square root.
// Compare to name-brand "1.0f / sk_float_sqrt(x)". Should be around 10x faster on SSE, 2x on NEON.
static inline float sk_float_rsqrt(const float x) {

View File

@ -9,57 +9,46 @@
#define SkUtils_DEFINED
#include "SkTypes.h"
#include "../private/SkOpts.h"
///////////////////////////////////////////////////////////////////////////////
// Determined empirically using bench/MemsetBench.cpp on a Nexus 7, Nexus 9, and desktop.
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 || defined(SK_ARM_HAS_NEON)
// Platforms where we can assume an autovectorizer will give us a good inline memset.
#define SK_SMALL_MEMSET 1000
#else
// Platforms like Chrome on ARMv7 that don't typically compile with NEON globally.
#define SK_SMALL_MEMSET 10
#endif
// The inlining heuristics below were determined using bench/MemsetBench.cpp
// on a x86 desktop, a Nexus 7 with and without NEON, and a Nexus 9:
// - on x86, inlining was never faster,
// - on ARMv7, inlining was faster for N<=10. Putting this check inside the NEON
// code was not helpful; it's got to be here outside.
// - NEON code generation for ARMv8 with GCC 4.9 is terrible,
// making the NEON code ~8x slower that just a serial loop.
/** Similar to memset(), but it assigns a 16bit value into the buffer.
@param buffer The memory to have value copied into it
@param value The 16bit value to be copied into buffer
@param count The number of times value should be copied into the buffer.
*/
void sk_memset16_large(uint16_t dst[], uint16_t value, int count);
inline void sk_memset16(uint16_t dst[], uint16_t value, int count) {
if (count <= SK_SMALL_MEMSET) {
for (int i = 0; i < count; i++) {
dst[i] = value;
static inline void sk_memset16(uint16_t buffer[], uint16_t value, int count) {
#if defined(SK_CPU_ARM64)
while (count --> 0) { *buffer++ = value; } return;
#elif defined(SK_CPU_ARM32)
if (count <= 10) { while (count --> 0) { *buffer++ = value; } return; }
#endif
SkOpts::memset16(buffer, value, count);
}
} else {
sk_memset16_large(dst, value, count);
}
}
typedef void (*SkMemset16Proc)(uint16_t dst[], uint16_t value, int count);
SkMemset16Proc SkMemset16GetPlatformProc();
/** Similar to memset(), but it assigns a 32bit value into the buffer.
@param buffer The memory to have value copied into it
@param value The 32bit value to be copied into buffer
@param count The number of times value should be copied into the buffer.
*/
void sk_memset32_large(uint32_t dst[], uint32_t value, int count);
inline void sk_memset32(uint32_t dst[], uint32_t value, int count) {
if (count <= SK_SMALL_MEMSET) {
for (int i = 0; i < count; i++) {
dst[i] = value;
}
} else {
sk_memset32_large(dst, value, count);
}
static inline void sk_memset32(uint32_t buffer[], uint32_t value, int count) {
#if defined(SK_CPU_ARM64)
while (count --> 0) { *buffer++ = value; } return;
#elif defined(SK_CPU_ARM32)
if (count <= 10) { while (count --> 0) { *buffer++ = value; } return; }
#endif
SkOpts::memset32(buffer, value, count);
}
typedef void (*SkMemset32Proc)(uint32_t dst[], uint32_t value, int count);
SkMemset32Proc SkMemset32GetPlatformProc();
#undef SK_SMALL_MEMSET
///////////////////////////////////////////////////////////////////////////////

View File

@ -20,6 +20,10 @@ namespace SkOpts {
// Returns a fast approximation of 1.0f/sqrtf(x).
extern float (*rsqrt)(float);
// See SkUtils.h
extern void (*memset16)(uint16_t[], uint16_t, int);
extern void (*memset32)(uint32_t[], uint32_t, int);
}
#endif//SkOpts_DEFINED

View File

@ -20,7 +20,9 @@
#include <cpu-features.h>
#endif
static float rsqrt_portable(float x) {
namespace portable { // This helps identify methods from this file when debugging / profiling.
static float rsqrt(float x) {
// Get initial estimate.
int i = *SkTCast<int*>(&x);
i = 0x5F1FFFF9 - (i>>1);
@ -32,9 +34,17 @@ static float rsqrt_portable(float x) {
return estimate;
}
template <typename T>
static void memsetT(T dst[], T val, int n) { while (n --> 0) { *dst++ = val; } }
} // namespace portable
namespace SkOpts {
// Define default function pointer values here...
decltype(rsqrt) rsqrt = rsqrt_portable;
decltype(rsqrt) rsqrt = portable::rsqrt;
decltype(memset16) memset16 = portable::memsetT<uint16_t>;
decltype(memset32) memset32 = portable::memsetT<uint32_t>;
// Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
void Init_sse2();

View File

@ -8,134 +8,6 @@
#include "SkUtils.h"
#include "SkLazyFnPtr.h"
#if 0
#define assign_16_longs(dst, value) \
do { \
(dst)[0] = value; (dst)[1] = value; \
(dst)[2] = value; (dst)[3] = value; \
(dst)[4] = value; (dst)[5] = value; \
(dst)[6] = value; (dst)[7] = value; \
(dst)[8] = value; (dst)[9] = value; \
(dst)[10] = value; (dst)[11] = value; \
(dst)[12] = value; (dst)[13] = value; \
(dst)[14] = value; (dst)[15] = value; \
} while (0)
#else
#define assign_16_longs(dst, value) \
do { \
*(dst)++ = value; *(dst)++ = value; \
*(dst)++ = value; *(dst)++ = value; \
*(dst)++ = value; *(dst)++ = value; \
*(dst)++ = value; *(dst)++ = value; \
*(dst)++ = value; *(dst)++ = value; \
*(dst)++ = value; *(dst)++ = value; \
*(dst)++ = value; *(dst)++ = value; \
*(dst)++ = value; *(dst)++ = value; \
} while (0)
#endif
///////////////////////////////////////////////////////////////////////////////
static void sk_memset16_portable(uint16_t dst[], uint16_t value, int count) {
SkASSERT(dst != NULL && count >= 0);
if (count <= 0) {
return;
}
// not sure if this helps to short-circuit on small values of count
if (count < 8) {
do {
*dst++ = (uint16_t)value;
} while (--count != 0);
return;
}
// ensure we're on a long boundary
if ((size_t)dst & 2) {
*dst++ = (uint16_t)value;
count -= 1;
}
uint32_t value32 = ((uint32_t)value << 16) | value;
// handle the bulk with our unrolled macro
{
int sixteenlongs = count >> 5;
if (sixteenlongs) {
uint32_t* dst32 = (uint32_t*)dst;
do {
assign_16_longs(dst32, value32);
} while (--sixteenlongs != 0);
dst = (uint16_t*)dst32;
count &= 31;
}
}
// handle (most) of the rest
{
int longs = count >> 1;
if (longs) {
do {
*(uint32_t*)dst = value32;
dst += 2;
} while (--longs != 0);
}
}
// cleanup a possible trailing short
if (count & 1) {
*dst = (uint16_t)value;
}
}
static void sk_memset32_portable(uint32_t dst[], uint32_t value, int count) {
SkASSERT(dst != NULL && count >= 0);
int sixteenlongs = count >> 4;
if (sixteenlongs) {
do {
assign_16_longs(dst, value);
} while (--sixteenlongs != 0);
count &= 15;
}
if (count) {
do {
*dst++ = value;
} while (--count != 0);
}
}
namespace {
// These three methods technically need external linkage to be passed as template parameters.
// Since they can't be static, we hide them in an anonymous namespace instead.
SkMemset16Proc choose_memset16() {
SkMemset16Proc proc = SkMemset16GetPlatformProc();
return proc ? proc : sk_memset16_portable;
}
SkMemset32Proc choose_memset32() {
SkMemset32Proc proc = SkMemset32GetPlatformProc();
return proc ? proc : sk_memset32_portable;
}
} // namespace
void sk_memset16_large(uint16_t dst[], uint16_t value, int count) {
SK_DECLARE_STATIC_LAZY_FN_PTR(SkMemset16Proc, proc, choose_memset16);
proc.get()(dst, value, count);
}
void sk_memset32_large(uint32_t dst[], uint32_t value, int count) {
SK_DECLARE_STATIC_LAZY_FN_PTR(SkMemset32Proc, proc, choose_memset32);
proc.get()(dst, value, count);
}
///////////////////////////////////////////////////////////////////////////////
/* 0xxxxxxx 1 total
10xxxxxx // never a leading byte

View File

@ -8,9 +8,66 @@
#include "SkOpts.h"
#include "SkFloatingPoint.h"
namespace neon { // This helps identify methods from this file when debugging / profiling.
static float rsqrt(float x) {
return sk_float_rsqrt(x); // This sk_float_rsqrt copy will take the NEON compile-time path.
}
static void memset16(uint16_t* dst, uint16_t value, int n) {
uint16x8_t v8 = vdupq_n_u16(value);
uint16x8x4_t v32 = {{ v8, v8, v8, v8 }};
while (n >= 32) {
vst4q_u16(dst, v32); // This swizzles, but we don't care: all lanes are the same, value.
dst += 32;
n -= 32;
}
switch (n / 8) {
case 3: vst1q_u16(dst, v8); dst += 8;
case 2: vst1q_u16(dst, v8); dst += 8;
case 1: vst1q_u16(dst, v8); dst += 8;
}
if (n & 4) {
vst1_u16(dst, vget_low_u16(v8));
dst += 4;
}
switch (n & 3) {
case 3: *dst++ = value;
case 2: *dst++ = value;
case 1: *dst = value;
}
}
static void memset32(uint32_t* dst, uint32_t value, int n) {
uint32x4_t v4 = vdupq_n_u32(value);
uint32x4x4_t v16 = {{ v4, v4, v4, v4 }};
while (n >= 16) {
vst4q_u32(dst, v16); // This swizzles, but we don't care: all lanes are the same, value.
dst += 16;
n -= 16;
}
switch (n / 4) {
case 3: vst1q_u32(dst, v4); dst += 4;
case 2: vst1q_u32(dst, v4); dst += 4;
case 1: vst1q_u32(dst, v4); dst += 4;
}
if (n & 2) {
vst1_u32(dst, vget_low_u32(v4));
dst += 2;
}
if (n & 1) {
*dst = value;
}
}
} // namespace neon
namespace SkOpts {
void Init_neon() {
rsqrt = sk_float_rsqrt; // This copy of sk_float_rsqrt will take the NEON path.
rsqrt = neon::rsqrt;
memset16 = neon::memset16;
memset32 = neon::memset32;
}
}

View File

@ -7,8 +7,49 @@
#include "SkOpts.h"
namespace sse2 { // This helps identify methods from this file when debugging / profiling.
static void memset16(uint16_t* dst, uint16_t val, int n) {
auto dst8 = (__m128i*)dst;
auto val8 = _mm_set1_epi16(val);
for ( ; n >= 8; n -= 8) {
_mm_storeu_si128(dst8++, val8);
}
dst = (uint16_t*)dst8;
if (n & 4) {
_mm_storel_epi64((__m128i*)dst, val8);
dst += 4;
}
if (n & 2) {
*(uint32_t*)dst = _mm_cvtsi128_si32(val8);
dst += 2;
}
if (n & 1) {
*dst = val;
}
}
static void memset32(uint32_t* dst, uint32_t val, int n) {
auto dst4 = (__m128i*)dst;
auto val4 = _mm_set1_epi32(val);
for ( ; n >= 4; n -= 4) {
_mm_storeu_si128(dst4++, val4);
}
dst = (uint32_t*)dst4;
if (n & 2) {
_mm_storel_epi64((__m128i*)dst, val4);
dst += 2;
}
if (n & 1) {
*dst = val;
}
}
} // namespace sse2
namespace SkOpts {
void Init_sse2() {
memset16 = sse2::memset16;
memset32 = sse2::memset32;
}
}

View File

@ -1,69 +0,0 @@
/*
* Copyright 2009 The Android Open Source Project
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include <emmintrin.h>
#include "SkUtils_opts_SSE2.h"
void sk_memset16_SSE2(uint16_t *dst, uint16_t value, int count)
{
SkASSERT(dst != NULL && count >= 0);
// dst must be 2-byte aligned.
SkASSERT((((size_t) dst) & 0x01) == 0);
if (count >= 32) {
while (((size_t)dst) & 0x0F) {
*dst++ = value;
--count;
}
__m128i *d = reinterpret_cast<__m128i*>(dst);
__m128i value_wide = _mm_set1_epi16(value);
while (count >= 32) {
_mm_store_si128(d , value_wide);
_mm_store_si128(d + 1, value_wide);
_mm_store_si128(d + 2, value_wide);
_mm_store_si128(d + 3, value_wide);
d += 4;
count -= 32;
}
dst = reinterpret_cast<uint16_t*>(d);
}
while (count > 0) {
*dst++ = value;
--count;
}
}
void sk_memset32_SSE2(uint32_t *dst, uint32_t value, int count)
{
SkASSERT(dst != NULL && count >= 0);
// dst must be 4-byte aligned.
SkASSERT((((size_t) dst) & 0x03) == 0);
if (count >= 16) {
while (((size_t)dst) & 0x0F) {
*dst++ = value;
--count;
}
__m128i *d = reinterpret_cast<__m128i*>(dst);
__m128i value_wide = _mm_set1_epi32(value);
while (count >= 16) {
_mm_store_si128(d , value_wide);
_mm_store_si128(d + 1, value_wide);
_mm_store_si128(d + 2, value_wide);
_mm_store_si128(d + 3, value_wide);
d += 4;
count -= 16;
}
dst = reinterpret_cast<uint32_t*>(d);
}
while (count > 0) {
*dst++ = value;
--count;
}
}

View File

@ -1,16 +0,0 @@
/*
* Copyright 2009 The Android Open Source Project
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkUtils_opts_SSE2_DEFINED
#define SkUtils_opts_SSE2_DEFINED
#include "SkTypes.h"
void sk_memset16_SSE2(uint16_t *dst, uint16_t value, int count);
void sk_memset32_SSE2(uint32_t *dst, uint32_t value, int count);
#endif

View File

@ -1,32 +0,0 @@
/*
* Copyright 2014 ARM Ltd.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "SkUtils.h"
#include "SkUtilsArm.h"
void sk_memset16_neon(uint16_t dst[], uint16_t value, int count);
void sk_memset32_neon(uint32_t dst[], uint32_t value, int count);
SkMemset16Proc SkMemset16GetPlatformProc() {
#if SK_ARM_NEON_IS_ALWAYS
return sk_memset16_neon;
#elif SK_ARM_NEON_IS_DYNAMIC
return sk_cpu_arm_has_neon() ? sk_memset16_neon : nullptr;
#else
return nullptr;
#endif
}
SkMemset32Proc SkMemset32GetPlatformProc() {
#if SK_ARM_NEON_IS_ALWAYS
return sk_memset32_neon;
#elif SK_ARM_NEON_IS_DYNAMIC
return sk_cpu_arm_has_neon() ? sk_memset32_neon : nullptr;
#else
return nullptr;
#endif
}

View File

@ -1,66 +0,0 @@
/*
* Copyright 2015 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "SkTypes.h"
#include <arm_neon.h>
void sk_memset32_neon(uint32_t dst[], uint32_t value, int count) {
uint32x4_t v4 = vdupq_n_u32(value);
uint32x4x4_t v16 = {{ v4, v4, v4, v4 }};
while (count >= 16) {
vst4q_u32(dst, v16); // This swizzles, but we don't care: all lanes are the same, value.
dst += 16;
count -= 16;
}
SkASSERT(count < 16);
switch (count / 4) {
case 3: vst1q_u32(dst, v4); dst += 4; count -= 4;
case 2: vst1q_u32(dst, v4); dst += 4; count -= 4;
case 1: vst1q_u32(dst, v4); dst += 4; count -= 4;
}
SkASSERT(count < 4);
if (count >= 2) {
vst1_u32(dst, vget_low_u32(v4));
dst += 2;
count -= 2;
}
SkASSERT(count < 2);
if (count > 0) {
*dst = value;
}
}
void sk_memset16_neon(uint16_t dst[], uint16_t value, int count) {
uint16x8_t v8 = vdupq_n_u16(value);
uint16x8x4_t v32 = {{ v8, v8, v8, v8 }};
while (count >= 32) {
vst4q_u16(dst, v32); // This swizzles, but we don't care: all lanes are the same, value.
dst += 32;
count -= 32;
}
SkASSERT(count < 32);
switch (count / 8) {
case 3: vst1q_u16(dst, v8); dst += 8; count -= 8;
case 2: vst1q_u16(dst, v8); dst += 8; count -= 8;
case 1: vst1q_u16(dst, v8); dst += 8; count -= 8;
}
SkASSERT(count < 8);
if (count >= 4) {
vst1_u16(dst, vget_low_u16(v8));
dst += 4;
count -= 4;
}
SkASSERT(count < 4);
switch (count) {
case 3: *dst++ = value;
case 2: *dst++ = value;
case 1: *dst = value;
}
}

View File

@ -1,18 +0,0 @@
/*
* Copyright 2009 The Android Open Source Project
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "SkUtils.h"
SkMemset16Proc SkMemset16GetPlatformProc() {
return NULL;
}
SkMemset32Proc SkMemset32GetPlatformProc() {
return NULL;
}

View File

@ -19,8 +19,6 @@
#include "SkMorphology_opts.h"
#include "SkMorphology_opts_SSE2.h"
#include "SkRTConf.h"
#include "SkUtils.h"
#include "SkUtils_opts_SSE2.h"
#if defined(_MSC_VER) && defined(_WIN64)
#include <intrin.h>
@ -301,24 +299,6 @@ SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType, SkMask::Format, Ro
////////////////////////////////////////////////////////////////////////////////
SkMemset16Proc SkMemset16GetPlatformProc() {
if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
return sk_memset16_SSE2;
} else {
return NULL;
}
}
SkMemset32Proc SkMemset32GetPlatformProc() {
if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
return sk_memset32_SSE2;
} else {
return NULL;
}
}
////////////////////////////////////////////////////////////////////////////////
SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type) {
if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
return NULL;