SSE2 implementation of S32_D565_Opaque_Dither
Run benchmarks with the command line option "--forceDither true". The results show that all benchmarks that exercise S32_D565_Opaque_Dither benefit from this SSE2 optimization. Here is the data on an i7-3770:

                                                  before     after
constXTile_MM_filter                              900.93    217.75   75.83%
constXTile_CC_filter_scale                        907.59    225.65   75.14%
constXTile_RR_filter                              903.33    219.41   75.71%
constXTile_MM_scale                               902.45    221.46   75.46%
constXTile_CC                                     898.55    218.37   75.70%
constXTile_RR_scale                               902.69    222.35   75.37%
repeatTile_4444_X                                 938.53    240.49   74.38%
gradient_radial2_mirror                         16999.49  11540.39   32.11%
gradient_radial2_clamp_hicolor                  17943.38  12501.71   30.33%
gradient_radial2_clamp                          17816.36  12492.04   29.88%
bitmaprect_FF_filter_trans                         47.81     10.98   77.03%
bitmaprect_FF_nofilter_trans                       47.79     10.91   77.18%
bitmaprect_FF_filter_identity                      47.74     10.89   77.18%
bitmaprect_FF_nofilter_identity                    47.83     10.89   77.24%
bitmap_4444_update_scale_rotate_bilerp            100.45     76.84   23.50%
bitmap_4444_update_volatile_scale_rotate_bilerp   100.80     76.70   23.91%
bitmap_4444_scale_rotate_bilerp                   100.43     77.18   23.15%
bitmap_4444_update_scale_bilerp                    79.00     49.03   37.93%
bitmap_4444_update_volatile_scale_bilerp           78.90     48.87   38.06%
bitmap_4444_scale_bilerp                           78.92     48.81   38.16%
bitmap_4444_update                                 42.19     11.53   72.68%
bitmap_4444_update_volatile                        42.28     11.49   72.82%
bitmap_a8                                          60.37     29.75   50.72%
bitmap_4444                                        42.19     11.52   72.69%

BUG=
R=mtklein@google.com
Author: qiankun.miao@intel.com
Review URL: https://codereview.chromium.org/181293002
git-svn-id: http://skia.googlecode.com/svn/trunk@13698 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
parent
4cd9e2169e
commit
275804782f
@ -10,6 +10,7 @@
|
|||||||
#include "SkBitmapProcState_opts_SSE2.h"
|
#include "SkBitmapProcState_opts_SSE2.h"
|
||||||
#include "SkColorPriv.h"
|
#include "SkColorPriv.h"
|
||||||
#include "SkColor_opts_SSE2.h"
|
#include "SkColor_opts_SSE2.h"
|
||||||
|
#include "SkDither.h"
|
||||||
#include "SkUtils.h"
|
#include "SkUtils.h"
|
||||||
|
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
@ -1051,3 +1052,117 @@ void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
|
|||||||
} while (--count != 0);
|
} while (--count != 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
|
||||||
|
const SkPMColor* SK_RESTRICT src,
|
||||||
|
int count, U8CPU alpha, int x, int y) {
|
||||||
|
SkASSERT(255 == alpha);
|
||||||
|
|
||||||
|
if (count <= 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (count >= 8) {
|
||||||
|
while (((size_t)dst & 0x0F) != 0) {
|
||||||
|
DITHER_565_SCAN(y);
|
||||||
|
SkPMColor c = *src++;
|
||||||
|
SkPMColorAssert(c);
|
||||||
|
|
||||||
|
unsigned dither = DITHER_VALUE(x);
|
||||||
|
*dst++ = SkDitherRGB32To565(c, dither);
|
||||||
|
DITHER_INC_X(x);
|
||||||
|
count--;
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned short dither_value[8];
|
||||||
|
__m128i dither;
|
||||||
|
#ifdef ENABLE_DITHER_MATRIX_4X4
|
||||||
|
const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
|
||||||
|
dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
|
||||||
|
dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
|
||||||
|
dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
|
||||||
|
dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
|
||||||
|
#else
|
||||||
|
const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
|
||||||
|
dither_value[0] = dither_value[4] = (dither_scan
|
||||||
|
>> (((x) & 3) << 2)) & 0xF;
|
||||||
|
dither_value[1] = dither_value[5] = (dither_scan
|
||||||
|
>> (((x + 1) & 3) << 2)) & 0xF;
|
||||||
|
dither_value[2] = dither_value[6] = (dither_scan
|
||||||
|
>> (((x + 2) & 3) << 2)) & 0xF;
|
||||||
|
dither_value[3] = dither_value[7] = (dither_scan
|
||||||
|
>> (((x + 3) & 3) << 2)) & 0xF;
|
||||||
|
#endif
|
||||||
|
dither = _mm_loadu_si128((__m128i*) dither_value);
|
||||||
|
|
||||||
|
const __m128i* s = reinterpret_cast<const __m128i*>(src);
|
||||||
|
__m128i* d = reinterpret_cast<__m128i*>(dst);
|
||||||
|
|
||||||
|
while (count >= 8) {
|
||||||
|
// Load 8 pixels of src.
|
||||||
|
__m128i src_pixel1 = _mm_loadu_si128(s++);
|
||||||
|
__m128i src_pixel2 = _mm_loadu_si128(s++);
|
||||||
|
|
||||||
|
// Extract R from src.
|
||||||
|
__m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
|
||||||
|
sr1 = _mm_srli_epi32(sr1, 24);
|
||||||
|
__m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
|
||||||
|
sr2 = _mm_srli_epi32(sr2, 24);
|
||||||
|
__m128i sr = _mm_packs_epi32(sr1, sr2);
|
||||||
|
|
||||||
|
// SkDITHER_R32To565(sr, dither)
|
||||||
|
__m128i sr_offset = _mm_srli_epi16(sr, 5);
|
||||||
|
sr = _mm_add_epi16(sr, dither);
|
||||||
|
sr = _mm_sub_epi16(sr, sr_offset);
|
||||||
|
sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
|
||||||
|
|
||||||
|
// Extract G from src.
|
||||||
|
__m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
|
||||||
|
sg1 = _mm_srli_epi32(sg1, 24);
|
||||||
|
__m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
|
||||||
|
sg2 = _mm_srli_epi32(sg2, 24);
|
||||||
|
__m128i sg = _mm_packs_epi32(sg1, sg2);
|
||||||
|
|
||||||
|
// SkDITHER_R32To565(sg, dither)
|
||||||
|
__m128i sg_offset = _mm_srli_epi16(sg, 6);
|
||||||
|
sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
|
||||||
|
sg = _mm_sub_epi16(sg, sg_offset);
|
||||||
|
sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
|
||||||
|
|
||||||
|
// Extract B from src.
|
||||||
|
__m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
|
||||||
|
sb1 = _mm_srli_epi32(sb1, 24);
|
||||||
|
__m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
|
||||||
|
sb2 = _mm_srli_epi32(sb2, 24);
|
||||||
|
__m128i sb = _mm_packs_epi32(sb1, sb2);
|
||||||
|
|
||||||
|
// SkDITHER_R32To565(sb, dither)
|
||||||
|
__m128i sb_offset = _mm_srli_epi16(sb, 5);
|
||||||
|
sb = _mm_add_epi16(sb, dither);
|
||||||
|
sb = _mm_sub_epi16(sb, sb_offset);
|
||||||
|
sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
|
||||||
|
|
||||||
|
// Pack and store 16-bit dst pixel.
|
||||||
|
__m128i d_pixel = SkPackRGB16_SSE(sr, sg, sb);
|
||||||
|
_mm_store_si128(d++, d_pixel);
|
||||||
|
|
||||||
|
count -= 8;
|
||||||
|
x += 8;
|
||||||
|
}
|
||||||
|
|
||||||
|
src = reinterpret_cast<const SkPMColor*>(s);
|
||||||
|
dst = reinterpret_cast<uint16_t*>(d);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (count > 0) {
|
||||||
|
DITHER_565_SCAN(y);
|
||||||
|
do {
|
||||||
|
SkPMColor c = *src++;
|
||||||
|
SkPMColorAssert(c);
|
||||||
|
|
||||||
|
unsigned dither = DITHER_VALUE(x);
|
||||||
|
*dst++ = SkDitherRGB32To565(c, dither);
|
||||||
|
DITHER_INC_X(x);
|
||||||
|
} while (--count != 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -35,3 +35,6 @@ void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
|
|||||||
void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
|
void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
|
||||||
const SkPMColor* SK_RESTRICT src,
|
const SkPMColor* SK_RESTRICT src,
|
||||||
int count, U8CPU alpha, int /*x*/, int /*y*/);
|
int count, U8CPU alpha, int /*x*/, int /*y*/);
|
||||||
|
// SSE2 implementation of the S32_D565_Opaque_Dither blit-row proc: converts
// `count` 32-bit src pixels to 16-bit 565 dst pixels with dithering. `alpha`
// is expected to be 255; `x` and `y` select the dither matrix entry.
void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
|
||||||
|
const SkPMColor* SK_RESTRICT src,
|
||||||
|
int count, U8CPU alpha, int x, int y);
|
||||||
|
@ -170,7 +170,7 @@ static SkBlitRow::Proc platform_16_procs[] = {
|
|||||||
NULL, // S32_D565_Blend
|
NULL, // S32_D565_Blend
|
||||||
S32A_D565_Opaque_SSE2, // S32A_D565_Opaque
|
S32A_D565_Opaque_SSE2, // S32A_D565_Opaque
|
||||||
NULL, // S32A_D565_Blend
|
NULL, // S32A_D565_Blend
|
||||||
NULL, // S32_D565_Opaque_Dither
|
S32_D565_Opaque_Dither_SSE2, // S32_D565_Opaque_Dither
|
||||||
NULL, // S32_D565_Blend_Dither
|
NULL, // S32_D565_Blend_Dither
|
||||||
NULL, // S32A_D565_Opaque_Dither
|
NULL, // S32A_D565_Opaque_Dither
|
||||||
NULL, // S32A_D565_Blend_Dither
|
NULL, // S32A_D565_Blend_Dither
|
||||||
|
Loading…
Reference in New Issue
Block a user