SSE2 implementation of S32_D565_Opaque_Dither

Benchmarks were run with the command-line option "--forceDither true". The
results show that every benchmark that exercises S32_D565_Opaque_Dither
benefits from this SSE2 optimization. Here is the data from an i7-3770:
                                                  before    after  reduction
constXTile_MM_filter                              900.93   217.75  75.83%
constXTile_CC_filter_scale                        907.59   225.65  75.14%
constXTile_RR_filter                              903.33   219.41  75.71%
constXTile_MM_scale                               902.45   221.46  75.46%
constXTile_CC                                     898.55   218.37  75.70%
constXTile_RR_scale                               902.69   222.35  75.37%
repeatTile_4444_X                                 938.53   240.49  74.38%
gradient_radial2_mirror                         16999.49 11540.39  32.11%
gradient_radial2_clamp_hicolor                  17943.38 12501.71  30.33%
gradient_radial2_clamp                          17816.36 12492.04  29.88%
bitmaprect_FF_filter_trans                         47.81    10.98  77.03%
bitmaprect_FF_nofilter_trans                       47.79    10.91  77.18%
bitmaprect_FF_filter_identity                      47.74    10.89  77.18%
bitmaprect_FF_nofilter_identity                    47.83    10.89  77.24%
bitmap_4444_update_scale_rotate_bilerp            100.45    76.84  23.50%
bitmap_4444_update_volatile_scale_rotate_bilerp   100.80    76.70  23.91%
bitmap_4444_scale_rotate_bilerp                   100.43    77.18  23.15%
bitmap_4444_update_scale_bilerp                    79.00    49.03  37.93%
bitmap_4444_update_volatile_scale_bilerp           78.90    48.87  38.06%
bitmap_4444_scale_bilerp                           78.92    48.81  38.16%
bitmap_4444_update                                 42.19    11.53  72.68%
bitmap_4444_update_volatile                        42.28    11.49  72.82%
bitmap_a8                                          60.37    29.75  50.72%
bitmap_4444                                        42.19    11.52  72.69%

BUG=
R=mtklein@google.com

Author: qiankun.miao@intel.com

Review URL: https://codereview.chromium.org/181293002

git-svn-id: http://skia.googlecode.com/svn/trunk@13698 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
commit-bot@chromium.org 2014-03-07 03:25:32 +00:00
parent 4cd9e2169e
commit 275804782f
3 changed files with 119 additions and 1 deletions

View File

@ -10,6 +10,7 @@
#include "SkBitmapProcState_opts_SSE2.h"
#include "SkColorPriv.h"
#include "SkColor_opts_SSE2.h"
#include "SkDither.h"
#include "SkUtils.h"
#include <emmintrin.h>
@ -1051,3 +1052,117 @@ void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
} while (--count != 0);
}
}
/**
 * Converts a row of `count` 32-bit SkPMColor pixels in `src` to dithered
 * 16-bit pixels in `dst`, handling eight pixels per iteration with SSE2 when
 * at least eight pixels are available.
 *
 * @param dst    destination row of 16-bit pixels
 * @param src    source row of 32-bit premultiplied colors
 * @param count  number of pixels to convert; nothing is done when <= 0
 * @param alpha  must be 255 (checked with SkASSERT only)
 * @param x      column of the first pixel; only (x & 3) selects a dither value
 * @param y      row being drawn; only (y & 3) selects the dither row
 *
 * NOTE(review): DITHER_565_SCAN, DITHER_VALUE and DITHER_INC_X are macros
 * defined outside this view (presumably SkDither.h). They appear to introduce
 * and read a local named `dither_scan`, which is also declared by name below -
 * confirm before moving any of these invocations to a different scope.
 */
void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha, int x, int y) {
SkASSERT(255 == alpha);
if (count <= 0) {
return;
}
if (count >= 8) {
// Convert pixels one at a time until dst is 16-byte aligned, because the
// vector loop below writes with the aligned store _mm_store_si128.
// Assuming dst is at least 2-byte aligned this takes at most 7 iterations,
// so count (>= 8 on entry) stays positive.
while (((size_t)dst & 0x0F) != 0) {
DITHER_565_SCAN(y);
SkPMColor c = *src++;
SkPMColorAssert(c);
unsigned dither = DITHER_VALUE(x);
*dst++ = SkDitherRGB32To565(c, dither);
DITHER_INC_X(x);
count--;
}
// Build one vector of eight 16-bit dither values for columns x .. x+7.
// The value depends only on (column & 3), so lanes 4..7 repeat lanes 0..3.
unsigned short dither_value[8];
__m128i dither;
#ifdef ENABLE_DITHER_MATRIX_4X4
// Dither row is an array of bytes indexed by (column & 3).
const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
#else
// Dither row is a 16-bit word holding four 4-bit entries; entry i is
// obtained by shifting right by 4 * i and masking with 0xF.
const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
dither_value[0] = dither_value[4] = (dither_scan
>> (((x) & 3) << 2)) & 0xF;
dither_value[1] = dither_value[5] = (dither_scan
>> (((x + 1) & 3) << 2)) & 0xF;
dither_value[2] = dither_value[6] = (dither_scan
>> (((x + 2) & 3) << 2)) & 0xF;
dither_value[3] = dither_value[7] = (dither_scan
>> (((x + 3) & 3) << 2)) & 0xF;
#endif
dither = _mm_loadu_si128((__m128i*) dither_value);
const __m128i* s = reinterpret_cast<const __m128i*>(src);
__m128i* d = reinterpret_cast<__m128i*>(dst);
// The alignment loop may have reduced count below 8, in which case this
// loop is skipped and the scalar tail handles everything that is left.
while (count >= 8) {
// Load 8 pixels of src.
__m128i src_pixel1 = _mm_loadu_si128(s++);
__m128i src_pixel2 = _mm_loadu_si128(s++);
// Extract R from src.
// Shift the channel's byte to the top of each 32-bit lane, then shift it
// back down to bits 0..7 so every other bit of the lane is zero.
__m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
sr1 = _mm_srli_epi32(sr1, 24);
__m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
sr2 = _mm_srli_epi32(sr2, 24);
// Narrow to eight 16-bit lanes; values are 0..255, so the signed
// saturation of the pack never alters them.
__m128i sr = _mm_packs_epi32(sr1, sr2);
// Red dither step: (r + dither - (r >> 5)) >> (SK_R32_BITS - SK_R16_BITS).
// Presumably the vector form of SkDITHER_R32To565(sr, dither) - confirm.
__m128i sr_offset = _mm_srli_epi16(sr, 5);
sr = _mm_add_epi16(sr, dither);
sr = _mm_sub_epi16(sr, sr_offset);
sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
// Extract G from src.
__m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
sg1 = _mm_srli_epi32(sg1, 24);
__m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
sg2 = _mm_srli_epi32(sg2, 24);
__m128i sg = _mm_packs_epi32(sg1, sg2);
// Green dither step: (g + (dither >> 1) - (g >> 6)) >> (SK_G32_BITS - SK_G16_BITS).
// Unlike red and blue, green uses half the dither value and a 6-bit offset
// shift. Presumably the vector form of SkDITHER_G32To565(sg, dither) - confirm.
__m128i sg_offset = _mm_srli_epi16(sg, 6);
sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
sg = _mm_sub_epi16(sg, sg_offset);
sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
// Extract B from src.
__m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
sb1 = _mm_srli_epi32(sb1, 24);
__m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
sb2 = _mm_srli_epi32(sb2, 24);
__m128i sb = _mm_packs_epi32(sb1, sb2);
// Blue dither step: (b + dither - (b >> 5)) >> (SK_B32_BITS - SK_B16_BITS).
// Presumably the vector form of SkDITHER_B32To565(sb, dither) - confirm.
__m128i sb_offset = _mm_srli_epi16(sb, 5);
sb = _mm_add_epi16(sb, dither);
sb = _mm_sub_epi16(sb, sb_offset);
sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
// Pack and store 16-bit dst pixel.
// Aligned store: relies on the alignment loop above.
__m128i d_pixel = SkPackRGB16_SSE(sr, sg, sb);
_mm_store_si128(d++, d_pixel);
count -= 8;
// Advancing by 8 leaves (x & 3) unchanged, so the dither vector built
// before the loop stays valid for every iteration.
x += 8;
}
// Hand the advanced positions back to the scalar pointers for the tail.
src = reinterpret_cast<const SkPMColor*>(s);
dst = reinterpret_cast<uint16_t*>(d);
}
// Scalar path: the leftover pixels after the vector loop, or the whole row
// when it had fewer than 8 pixels to begin with.
if (count > 0) {
DITHER_565_SCAN(y);
do {
SkPMColor c = *src++;
SkPMColorAssert(c);
unsigned dither = DITHER_VALUE(x);
*dst++ = SkDitherRGB32To565(c, dither);
DITHER_INC_X(x);
} while (--count != 0);
}
}

View File

@ -35,3 +35,6 @@ void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
// SSE2 row proc declaration. Takes the same parameter list as the other row
// procs declared in this header; the x and y parameters are left unnamed here.
void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha, int /*x*/, int /*y*/);
// SSE2 row proc that converts `count` 32-bit SkPMColor pixels from src into
// dithered 16-bit pixels in dst. The definition asserts alpha == 255 and uses
// x and y (only their low two bits) to pick the dither value for each pixel.
void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha, int x, int y);

View File

@ -170,7 +170,7 @@ static SkBlitRow::Proc platform_16_procs[] = {
NULL, // S32_D565_Blend
S32A_D565_Opaque_SSE2, // S32A_D565_Opaque
NULL, // S32A_D565_Blend
NULL, // S32_D565_Opaque_Dither
S32_D565_Opaque_Dither_SSE2, // S32_D565_Opaque_Dither
NULL, // S32_D565_Blend_Dither
NULL, // S32A_D565_Opaque_Dither
NULL, // S32A_D565_Blend_Dither