SSE2 implementation of memcpy32

With the SSE2 version of memcpy32, S32_Opaque_BlitRow32() in SkBlitRow_D32.cpp
is about 30% faster. Here is the data measured on a desktop
i7-3770.
before:
       bitmap_scale_filter_90_90   8888:  cmsecs =      2.01
      bitmaprect_FF_filter_trans   8888:  cmsecs =      3.61
    bitmaprect_FF_nofilter_trans   8888:  cmsecs =      3.57
   bitmaprect_FF_filter_identity   8888:  cmsecs =      3.53
 bitmaprect_FF_nofilter_identity   8888:  cmsecs =      3.53
              bitmap_4444_update   8888:  cmsecs =      4.84
     bitmap_4444_update_volatile   8888:  cmsecs =      4.81
                     bitmap_4444   8888:  cmsecs =      4.81
after:
       bitmap_scale_filter_90_90   8888:  cmsecs =      1.83
      bitmaprect_FF_filter_trans   8888:  cmsecs =      2.36
    bitmaprect_FF_nofilter_trans   8888:  cmsecs =      2.36
   bitmaprect_FF_filter_identity   8888:  cmsecs =      2.60
 bitmaprect_FF_nofilter_identity   8888:  cmsecs =      2.63
              bitmap_4444_update   8888:  cmsecs =      3.30
     bitmap_4444_update_volatile   8888:  cmsecs =      3.30
                     bitmap_4444   8888:  cmsecs =      3.29

BUG=skia:
R=mtklein@google.com, reed@google.com, bsalomon@google.com

Author: qiankun.miao@intel.com

Review URL: https://codereview.chromium.org/285313002

git-svn-id: http://skia.googlecode.com/svn/trunk@14822 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
commit-bot@chromium.org 2014-05-21 12:43:07 +00:00
parent 74ff1badf4
commit f0ea77a363
8 changed files with 103 additions and 1 deletions

View File

@ -30,6 +30,15 @@ void sk_memset32(uint32_t dst[], uint32_t value, int count);
typedef void (*SkMemset32Proc)(uint32_t dst[], uint32_t value, int count);
SkMemset32Proc SkMemset32GetPlatformProc();
/** Similar to memcpy(), but copies count 32-bit values from src to dst.
@param dst The memory the values are copied into
@param src The memory the values are copied from
@param count The number of 32-bit values to copy.
*/
void sk_memcpy32(uint32_t dst[], const uint32_t src[], int count);
typedef void (*SkMemcpy32Proc)(uint32_t dst[], const uint32_t src[], int count);
SkMemcpy32Proc SkMemcpy32GetPlatformProc();
///////////////////////////////////////////////////////////////////////////////
#define kMaxBytesInUTF8Sequence 4

View File

@ -18,7 +18,7 @@ static void S32_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha) {
SkASSERT(255 == alpha);
memcpy(dst, src, count * sizeof(SkPMColor));
sk_memcpy32(dst, src, count);
}
static void S32_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,

View File

@ -34,6 +34,18 @@
*(dst)++ = value; *(dst)++ = value; \
*(dst)++ = value; *(dst)++ = value; \
} while (0)
/* Copies 16 consecutive elements from src to dst. Both arguments are expanded
   16 times and post-incremented, so they must be modifiable pointer lvalues;
   after the macro runs each one points 16 elements past where it started. */
#define copy_16_longs(dst, src)                 \
    do {                                        \
        *(dst)++ = *(src)++;                    \
        *(dst)++ = *(src)++;                    \
        *(dst)++ = *(src)++;                    \
        *(dst)++ = *(src)++;                    \
        *(dst)++ = *(src)++;                    \
        *(dst)++ = *(src)++;                    \
        *(dst)++ = *(src)++;                    \
        *(dst)++ = *(src)++;                    \
        *(dst)++ = *(src)++;                    \
        *(dst)++ = *(src)++;                    \
        *(dst)++ = *(src)++;                    \
        *(dst)++ = *(src)++;                    \
        *(dst)++ = *(src)++;                    \
        *(dst)++ = *(src)++;                    \
        *(dst)++ = *(src)++;                    \
        *(dst)++ = *(src)++;                    \
    } while (0)
#endif
///////////////////////////////////////////////////////////////////////////////
@ -109,6 +121,24 @@ static void sk_memset32_portable(uint32_t dst[], uint32_t value, int count) {
}
}
// Portable fallback used by sk_memcpy32(): copies count elements from src to
// dst, first in groups of 16 via copy_16_longs(), then the remaining
// (count & 15) elements one at a time.
static void sk_memcpy32_portable(uint32_t dst[], const uint32_t src[], int count) {
    SkASSERT(dst != NULL && count >= 0);

    // Whole groups of 16 elements; the macro advances dst and src itself.
    for (int groups = count >> 4; groups != 0; --groups) {
        copy_16_longs(dst, src);
    }

    // Leftover elements that did not fill a complete group of 16.
    for (int tail = count & 15; tail != 0; --tail) {
        *dst++ = *src++;
    }
}
static void choose_memset16(SkMemset16Proc* proc) {
*proc = SkMemset16GetPlatformProc();
if (NULL == *proc) {
@ -141,6 +171,22 @@ void sk_memset32(uint32_t dst[], uint32_t value, int count) {
return proc(dst, value, count);
}
// Stores the copy routine to use into *proc: the platform-provided one when
// SkMemcpy32GetPlatformProc() returns non-NULL, otherwise the portable fallback.
static void choose_memcpy32(SkMemcpy32Proc* proc) {
    SkMemcpy32Proc selected = SkMemcpy32GetPlatformProc();
    if (NULL == selected) {
        selected = &sk_memcpy32_portable;
    }
    *proc = selected;
}
void sk_memcpy32(uint32_t dst[], const uint32_t src[], int count) {
SK_DECLARE_STATIC_ONCE(once);
static SkMemcpy32Proc proc = NULL;
SkOnce(&once, choose_memcpy32, &proc);
SkASSERT(proc != NULL);
return proc(dst, src, count);
}
///////////////////////////////////////////////////////////////////////////////
/* 0xxxxxxx 1 total

View File

@ -67,3 +67,33 @@ void sk_memset32_SSE2(uint32_t *dst, uint32_t value, int count)
--count;
}
}
// SSE2 copy of count 32-bit values from src to dst.
//
// For count >= 16, scalar copies first advance dst to a 16-byte boundary, then
// 16 values (four 128-bit vectors) are moved per iteration using unaligned
// loads from src and aligned stores to dst. Whatever is left, or the whole
// buffer when count < 16, is copied one value at a time.
void sk_memcpy32_SSE2(uint32_t *dst, const uint32_t *src, int count)
{
    if (count >= 16) {
        // Head: scalar copies until dst sits on a 16-byte boundary.
        for (; (((size_t)dst) & 0x0F) != 0; --count) {
            *dst++ = *src++;
        }

        __m128i *out = reinterpret_cast<__m128i*>(dst);
        const __m128i *in = reinterpret_cast<const __m128i*>(src);

        // Body: all four loads are issued before the four stores.
        for (; count >= 16; count -= 16, in += 4, out += 4) {
            const __m128i v0 = _mm_loadu_si128(in + 0);
            const __m128i v1 = _mm_loadu_si128(in + 1);
            const __m128i v2 = _mm_loadu_si128(in + 2);
            const __m128i v3 = _mm_loadu_si128(in + 3);
            _mm_store_si128(out + 0, v0);
            _mm_store_si128(out + 1, v1);
            _mm_store_si128(out + 2, v2);
            _mm_store_si128(out + 3, v3);
        }

        dst = reinterpret_cast<uint32_t*>(out);
        src = reinterpret_cast<const uint32_t*>(in);
    }

    // Tail: remaining values, fewer than 16 after the vector loop.
    for (; count > 0; --count) {
        *dst++ = *src++;
    }
}

View File

@ -12,5 +12,6 @@
void sk_memset16_SSE2(uint16_t *dst, uint16_t value, int count);
void sk_memset32_SSE2(uint32_t *dst, uint32_t value, int count);
void sk_memcpy32_SSE2(uint32_t *dst, const uint32_t *src, int count);
#endif

View File

@ -51,3 +51,7 @@ SkMemset32Proc SkMemset32GetPlatformProc() {
return arm_memset32;
#endif
}
// Platform hook for the 32-bit copy routine. This implementation always
// returns NULL, i.e. it supplies no specialized routine.
SkMemcpy32Proc SkMemcpy32GetPlatformProc() {
return NULL;
}

View File

@ -16,3 +16,7 @@ SkMemset16Proc SkMemset16GetPlatformProc() {
SkMemset32Proc SkMemset32GetPlatformProc() {
return NULL;
}
// Platform hook for the 32-bit copy routine. This implementation always
// returns NULL, i.e. it supplies no specialized routine.
SkMemcpy32Proc SkMemcpy32GetPlatformProc() {
return NULL;
}

View File

@ -305,6 +305,14 @@ SkMemset32Proc SkMemset32GetPlatformProc() {
}
}
// Platform hook for the 32-bit copy routine: returns sk_memcpy32_SSE2 when
// supports_simd() reports SSE2 support, and NULL otherwise.
SkMemcpy32Proc SkMemcpy32GetPlatformProc() {
    if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
        return NULL;
    }
    return sk_memcpy32_SSE2;
}
////////////////////////////////////////////////////////////////////////////////
SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type) {