SSE2 implementation of memcpy32
With the SSE2 version of memcpy32, S32_Opaque_BlitRow32() in SkBlitRow_D32.cpp shows about a 30% performance improvement. Here is the data from a desktop i7-3770.
before:
bitmap_scale_filter_90_90 8888: cmsecs = 2.01
bitmaprect_FF_filter_trans 8888: cmsecs = 3.61
bitmaprect_FF_nofilter_trans 8888: cmsecs = 3.57
bitmaprect_FF_filter_identity 8888: cmsecs = 3.53
bitmaprect_FF_nofilter_identity 8888: cmsecs = 3.53
bitmap_4444_update 8888: cmsecs = 4.84
bitmap_4444_update_volatile 8888: cmsecs = 4.81
bitmap_4444 8888: cmsecs = 4.81
after:
bitmap_scale_filter_90_90 8888: cmsecs = 1.83
bitmaprect_FF_filter_trans 8888: cmsecs = 2.36
bitmaprect_FF_nofilter_trans 8888: cmsecs = 2.36
bitmaprect_FF_filter_identity 8888: cmsecs = 2.60
bitmaprect_FF_nofilter_identity 8888: cmsecs = 2.63
bitmap_4444_update 8888: cmsecs = 3.30
bitmap_4444_update_volatile 8888: cmsecs = 3.30
bitmap_4444 8888: cmsecs = 3.29
BUG=skia:
R=mtklein@google.com, reed@google.com, bsalomon@google.com
Author: qiankun.miao@intel.com
Review URL: https://codereview.chromium.org/285313002
git-svn-id: http://skia.googlecode.com/svn/trunk@14822 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
parent
74ff1badf4
commit
f0ea77a363
@ -30,6 +30,15 @@ void sk_memset32(uint32_t dst[], uint32_t value, int count);
|
||||
typedef void (*SkMemset32Proc)(uint32_t dst[], uint32_t value, int count);
|
||||
SkMemset32Proc SkMemset32GetPlatformProc();
|
||||
|
||||
/** Similar to memcpy(), but it copies count 32-bit values from src to dst.
|
||||
@param dst The memory to copy the values into
|
||||
@param src The memory to copy the values from
|
||||
@param count The number of 32-bit values to copy.
|
||||
*/
|
||||
void sk_memcpy32(uint32_t dst[], const uint32_t src[], int count);
|
||||
typedef void (*SkMemcpy32Proc)(uint32_t dst[], const uint32_t src[], int count);
|
||||
SkMemcpy32Proc SkMemcpy32GetPlatformProc();
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define kMaxBytesInUTF8Sequence 4
|
||||
|
@ -18,7 +18,7 @@ static void S32_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src,
|
||||
int count, U8CPU alpha) {
|
||||
SkASSERT(255 == alpha);
|
||||
memcpy(dst, src, count * sizeof(SkPMColor));
|
||||
sk_memcpy32(dst, src, count);
|
||||
}
|
||||
|
||||
static void S32_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
|
||||
|
@ -34,6 +34,18 @@
|
||||
*(dst)++ = value; *(dst)++ = value; \
|
||||
*(dst)++ = value; *(dst)++ = value; \
|
||||
} while (0)
|
||||
|
||||
// Copies 16 consecutive elements from src to dst as 16 explicit assignments,
// post-incrementing both pointers so each ends up advanced by 16 elements.
// Both arguments are expanded many times, so each must be a plain modifiable
// pointer lvalue with no side effects of its own.
#define copy_16_longs(dst, src) \
|
||||
do { \
|
||||
*(dst)++ = *(src)++; *(dst)++ = *(src)++; \
|
||||
*(dst)++ = *(src)++; *(dst)++ = *(src)++; \
|
||||
*(dst)++ = *(src)++; *(dst)++ = *(src)++; \
|
||||
*(dst)++ = *(src)++; *(dst)++ = *(src)++; \
|
||||
*(dst)++ = *(src)++; *(dst)++ = *(src)++; \
|
||||
*(dst)++ = *(src)++; *(dst)++ = *(src)++; \
|
||||
*(dst)++ = *(src)++; *(dst)++ = *(src)++; \
|
||||
*(dst)++ = *(src)++; *(dst)++ = *(src)++; \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
@ -109,6 +121,24 @@ static void sk_memset32_portable(uint32_t dst[], uint32_t value, int count) {
|
||||
}
|
||||
}
|
||||
|
||||
static void sk_memcpy32_portable(uint32_t dst[], const uint32_t src[], int count) {
|
||||
SkASSERT(dst != NULL && count >= 0);
|
||||
|
||||
int sixteenlongs = count >> 4;
|
||||
if (sixteenlongs) {
|
||||
do {
|
||||
copy_16_longs(dst, src);
|
||||
} while (--sixteenlongs != 0);
|
||||
count &= 15;
|
||||
}
|
||||
|
||||
if (count) {
|
||||
do {
|
||||
*dst++ = *src++;
|
||||
} while (--count != 0);
|
||||
}
|
||||
}
|
||||
|
||||
static void choose_memset16(SkMemset16Proc* proc) {
|
||||
*proc = SkMemset16GetPlatformProc();
|
||||
if (NULL == *proc) {
|
||||
@ -141,6 +171,22 @@ void sk_memset32(uint32_t dst[], uint32_t value, int count) {
|
||||
return proc(dst, value, count);
|
||||
}
|
||||
|
||||
/**
 *  Stores the memcpy32 implementation to use into *proc: the platform proc
 *  from SkMemcpy32GetPlatformProc() when it is non-NULL, otherwise
 *  sk_memcpy32_portable.
 */
static void choose_memcpy32(SkMemcpy32Proc* proc) {
    SkMemcpy32Proc platformProc = SkMemcpy32GetPlatformProc();
    if (NULL != platformProc) {
        *proc = platformProc;
    } else {
        *proc = &sk_memcpy32_portable;
    }
}
|
||||
|
||||
void sk_memcpy32(uint32_t dst[], const uint32_t src[], int count) {
|
||||
SK_DECLARE_STATIC_ONCE(once);
|
||||
static SkMemcpy32Proc proc = NULL;
|
||||
SkOnce(&once, choose_memcpy32, &proc);
|
||||
SkASSERT(proc != NULL);
|
||||
|
||||
return proc(dst, src, count);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/* 0xxxxxxx 1 total
|
||||
|
@ -67,3 +67,33 @@ void sk_memset32_SSE2(uint32_t *dst, uint32_t value, int count)
|
||||
--count;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 *  SSE2 implementation of sk_memcpy32(): copies count 32-bit values from src
 *  to dst. A count <= 0 copies nothing.
 *
 *  When at least 16 values are requested, dst is first advanced one value at
 *  a time until it is 16-byte aligned so the main loop can use aligned
 *  stores. src is always read with unaligned loads, since it need not share
 *  dst's alignment.
 */
void sk_memcpy32_SSE2(uint32_t *dst, const uint32_t *src, int count)
{
    if (count >= 16) {
        // Bound the prologue by count: a dst that is not 4-byte aligned can
        // never reach a 16-byte boundary in 4-byte steps, and the loop must
        // not run past the end of the buffers in that case.
        while (count > 0 && (((size_t)dst) & 0x0F) != 0) {
            *dst++ = *src++;
            --count;
        }

        __m128i *dst128 = reinterpret_cast<__m128i*>(dst);
        const __m128i *src128 = reinterpret_cast<const __m128i*>(src);
        // 16 values (64 bytes) per iteration: four loads, then four stores.
        while (count >= 16) {
            __m128i a = _mm_loadu_si128(src128++);
            __m128i b = _mm_loadu_si128(src128++);
            __m128i c = _mm_loadu_si128(src128++);
            __m128i d = _mm_loadu_si128(src128++);

            _mm_store_si128(dst128++, a);
            _mm_store_si128(dst128++, b);
            _mm_store_si128(dst128++, c);
            _mm_store_si128(dst128++, d);
            count -= 16;
        }
        dst = reinterpret_cast<uint32_t*>(dst128);
        src = reinterpret_cast<const uint32_t*>(src128);
    }

    // Tail: fewer than 16 values left (or the whole copy for small counts).
    while (count > 0) {
        *dst++ = *src++;
        --count;
    }
}
|
||||
|
@ -12,5 +12,6 @@
|
||||
|
||||
void sk_memset16_SSE2(uint16_t *dst, uint16_t value, int count);
|
||||
void sk_memset32_SSE2(uint32_t *dst, uint32_t value, int count);
|
||||
void sk_memcpy32_SSE2(uint32_t *dst, const uint32_t *src, int count);
|
||||
|
||||
#endif
|
||||
|
@ -51,3 +51,7 @@ SkMemset32Proc SkMemset32GetPlatformProc() {
|
||||
return arm_memset32;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Always returns NULL: this variant supplies no platform-specific memcpy32
// proc. A caller that receives NULL is expected to pick its own fallback.
SkMemcpy32Proc SkMemcpy32GetPlatformProc() {
|
||||
return NULL;
|
||||
}
|
||||
|
@ -16,3 +16,7 @@ SkMemset16Proc SkMemset16GetPlatformProc() {
|
||||
SkMemset32Proc SkMemset32GetPlatformProc() {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Always returns NULL: this variant supplies no platform-specific memcpy32
// proc. A caller that receives NULL is expected to pick its own fallback.
SkMemcpy32Proc SkMemcpy32GetPlatformProc() {
|
||||
return NULL;
|
||||
}
|
||||
|
@ -305,6 +305,14 @@ SkMemset32Proc SkMemset32GetPlatformProc() {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 *  Returns sk_memcpy32_SSE2 when supports_simd() reports SSE2 support, and
 *  NULL otherwise.
 */
SkMemcpy32Proc SkMemcpy32GetPlatformProc() {
    if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
        return NULL;
    }
    return sk_memcpy32_SSE2;
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type) {
|
||||
|
Loading…
Reference in New Issue
Block a user