Add SSE optimization of Color32A_D565
Adds an SSE4.1 version of the Color32A_D565 function. Performance improvement in the following benchmarks: Xfermode_SrcOver ~100%, luma_colorfilter_large ~150%, luma_colorfilter_small ~60%, tablebench ~10%, chart_bw ~10% (measured on an Atom Silvermont core). Signed-off-by: Henrik Smiding <henrik.smiding@intel.com> Review URL: https://codereview.chromium.org/892623002
This commit is contained in:
parent
46b8083339
commit
4e65473069
@ -7,10 +7,13 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_REST
|
||||
sk_throw();
|
||||
}
|
||||
|
||||
// Stub definition of Color32A_D565_SSE4: it ignores every argument and
// unconditionally calls sk_throw(). It sits before the `#else`, i.e. in the
// first preprocessor branch.
// NOTE(review): the `#if` condition is outside this view -- presumably the
// "built without SSE4.1 support" case; confirm.
void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {
|
||||
sk_throw();
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#include <emmintrin.h> // SSE2: Most _mm_foo() in this file.
|
||||
#include <smmintrin.h> // SSE4.1: _mm_testz_si128 and _mm_testc_si128.
|
||||
#include <smmintrin.h> // SSE4.1 intrinsics
|
||||
|
||||
#include "SkColorPriv.h"
|
||||
#include "SkColor_opts_SSE2.h"
|
||||
@ -63,4 +66,76 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
|
||||
}
|
||||
}
|
||||
|
||||
static inline uint16_t Color32A_D565_1x(uint16_t dst, unsigned scale, uint32_t src_expand) {
|
||||
uint32_t dst_expand = SkExpand_rgb_16(dst) * scale;
|
||||
return SkCompact_rgb_16((src_expand + dst_expand) >> 5);
|
||||
}
|
||||
|
||||
void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {
|
||||
SkASSERT(count > 0);
|
||||
|
||||
uint32_t src_expand = (SkGetPackedG32(src) << 24) |
|
||||
(SkGetPackedR32(src) << 13) |
|
||||
(SkGetPackedB32(src) << 2);
|
||||
unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
|
||||
|
||||
// Check if we have enough pixels to run SIMD
|
||||
if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
|
||||
__m128i* dst_wide;
|
||||
const __m128i src_expand_wide = _mm_set1_epi32(src_expand);
|
||||
const __m128i scale_wide = _mm_set1_epi32(scale);
|
||||
const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE |
|
||||
SK_B16_MASK_IN_PLACE |
|
||||
(SK_G16_MASK_IN_PLACE << 16));
|
||||
|
||||
// Align dst to an even 16 byte address (0-7 pixels)
|
||||
while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
|
||||
*dst = Color32A_D565_1x(*dst, scale, src_expand);
|
||||
dst += 1;
|
||||
count--;
|
||||
}
|
||||
|
||||
dst_wide = reinterpret_cast<__m128i*>(dst);
|
||||
do {
|
||||
// Load 8 RGB565 pixels
|
||||
__m128i pixels = _mm_load_si128(dst_wide);
|
||||
|
||||
// Duplicate and mask
|
||||
__m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels);
|
||||
pixels_high = _mm_and_si128(mask_green, pixels_high);
|
||||
pixels = _mm_unpacklo_epi16(pixels, pixels);
|
||||
pixels = _mm_and_si128(mask_green, pixels);
|
||||
|
||||
// Scale with alpha
|
||||
pixels_high = _mm_mullo_epi32(pixels_high, scale_wide);
|
||||
pixels = _mm_mullo_epi32(pixels, scale_wide);
|
||||
|
||||
// Add src_expand_wide and shift down again
|
||||
pixels_high = _mm_add_epi32(pixels_high, src_expand_wide);
|
||||
pixels_high = _mm_srli_epi32(pixels_high, 5);
|
||||
pixels = _mm_add_epi32(pixels, src_expand_wide);
|
||||
pixels = _mm_srli_epi32(pixels, 5);
|
||||
|
||||
// Mask
|
||||
pixels_high = _mm_and_si128(mask_green, pixels_high);
|
||||
pixels = _mm_and_si128(mask_green, pixels);
|
||||
|
||||
// Combine into RGB565 and store
|
||||
pixels = _mm_hadd_epi16(pixels, pixels_high);
|
||||
_mm_store_si128(dst_wide, pixels);
|
||||
count -= 8;
|
||||
dst_wide++;
|
||||
} while (count >= 8);
|
||||
|
||||
dst = reinterpret_cast<uint16_t*>(dst_wide);
|
||||
}
|
||||
|
||||
// Small loop to handle remaining pixels.
|
||||
while (count > 0) {
|
||||
*dst = Color32A_D565_1x(*dst, scale, src_expand);
|
||||
dst += 1;
|
||||
count--;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -14,5 +14,8 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT,
|
||||
const SkPMColor* SK_RESTRICT,
|
||||
int count,
|
||||
U8CPU alpha);
|
||||
|
||||
void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y);
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -197,7 +197,7 @@ void SkBitmapProcState::platformProcs() {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static SkBlitRow::Proc16 platform_16_procs[] = {
|
||||
static const SkBlitRow::Proc16 platform_16_procs[] = {
|
||||
S32_D565_Opaque_SSE2, // S32_D565_Opaque
|
||||
NULL, // S32_D565_Blend
|
||||
S32A_D565_Opaque_SSE2, // S32A_D565_Opaque
|
||||
@ -216,18 +216,27 @@ SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) {
|
||||
}
|
||||
}
|
||||
|
||||
// Table of SSE4.1 ColorProc16 implementations. PlatformColorFactory565 indexes
// it directly with its `flags` argument. A NULL slot means this table supplies
// no SSE4.1 routine for that variant.
static const SkBlitRow::ColorProc16 platform_565_colorprocs_SSE4[] = {
|
||||
Color32A_D565_SSE4, // Color32A_D565,
|
||||
NULL, // Color32A_D565_Dither
|
||||
};
|
||||
|
||||
// Returns the platform-optimized ColorProc16 for `flags`, or NULL when this
// platform layer has nothing to offer (no SSE4.1 at runtime, or the table slot
// for `flags` is NULL).
//
// The previous body -- an unconditional `return NULL;` as the first statement --
// is removed: left in place it made the SSE4.1 dispatch below unreachable.
SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) {
    if (!supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
        return NULL;
    }
    // NOTE(review): `flags` indexes the two-entry table without a range check;
    // this assumes callers only pass 0 or 1 -- confirm against the call site.
    return platform_565_colorprocs_SSE4[flags];
}
|
||||
|
||||
// SSE2 implementations of the 32-bit row blitters, one slot per variant named
// in the trailing comments. A NULL slot means this table supplies no SSE2
// routine for that variant.
// Only the `static const` declarator is kept; the older non-const declarator
// that preceded it duplicated the opening of the same array.
// NOTE(review): the code that indexes this table is outside this view --
// presumably it is indexed by blit flags in the listed order; confirm.
static const SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
    NULL,                               // S32_Opaque,
    S32_Blend_BlitRow32_SSE2,           // S32_Blend,
    S32A_Opaque_BlitRow32_SSE2,         // S32A_Opaque
    S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
};
|
||||
|
||||
static SkBlitRow::Proc32 platform_32_procs_SSE4[] = {
|
||||
static const SkBlitRow::Proc32 platform_32_procs_SSE4[] = {
|
||||
NULL, // S32_Opaque,
|
||||
S32_Blend_BlitRow32_SSE2, // S32_Blend,
|
||||
S32A_Opaque_BlitRow32_SSE4, // S32A_Opaque
|
||||
|
Loading…
Reference in New Issue
Block a user