Remove SSE2 ColorRect32 code/files

Removes the disabled SSE2 optimization of ColorRect32 and deletes
the two files containing the code.
Measured on both Core Haswell and Atom Silvermont, and only got
some miniscule improvement compared to the default implementation.

Also tried to write a new, ultimate, version of this optimization,
but only got ~5% improvement on ColorRect32-heavy tests.

Signed-off-by: Henrik Smiding <henrik.smiding@intel.com>

Review URL: https://codereview.chromium.org/957433002
This commit is contained in:
henrik.smiding 2015-02-25 07:37:14 -08:00 committed by Commit bot
parent 83787d0ff0
commit 3704df347a
4 changed files with 0 additions and 162 deletions

View File

@ -65,7 +65,6 @@
'sse2_sources': [
'<(skia_src_path)/opts/SkBitmapFilter_opts_SSE2.cpp',
'<(skia_src_path)/opts/SkBitmapProcState_opts_SSE2.cpp',
'<(skia_src_path)/opts/SkBlitRect_opts_SSE2.cpp',
'<(skia_src_path)/opts/SkBlitRow_opts_SSE2.cpp',
'<(skia_src_path)/opts/SkBlurImage_opts_SSE2.cpp',
'<(skia_src_path)/opts/SkMorphology_opts_SSE2.cpp',

View File

@ -1,132 +0,0 @@
/*
* Copyright 2011 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include <emmintrin.h>
#include "SkBlitRect_opts_SSE2.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
/* Simple blitting of opaque rectangles less than 31 pixels wide:
* inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
*/
static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
int width, int height,
size_t rowBytes, uint32_t color) {
SkASSERT(255 == SkGetPackedA32(color));
SkASSERT(width > 0);
SkASSERT(width < 31);
while (--height >= 0) {
SkPMColor* dst = destination;
int count = width;
while (count > 4) {
*dst++ = color;
*dst++ = color;
*dst++ = color;
*dst++ = color;
count -= 4;
}
while (count > 0) {
*dst++ = color;
--count;
}
destination = (uint32_t*)((char*)destination + rowBytes);
}
}
/*
* Fast blitting of opaque rectangles at least 31 pixels wide:
* inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
* A 31 pixel rectangle is guaranteed to have at least one
* 16-pixel aligned span that can take advantage of mm_store.
*/
static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
int width, int height,
size_t rowBytes, uint32_t color) {
SkASSERT(255 == SkGetPackedA32(color));
SkASSERT(width >= 31);
__m128i color_wide = _mm_set1_epi32(color);
while (--height >= 0) {
// Prefetching one row ahead to L1 cache can equal hardware
// performance for large/tall rects, but never *beats*
// hardware performance.
SkPMColor* dst = destination;
int count = width;
while (((size_t)dst) & 0x0F) {
*dst++ = color;
--count;
}
__m128i *d = reinterpret_cast<__m128i*>(dst);
// Googling suggests _mm_stream is only going to beat _mm_store
// for things that wouldn't fit in L2 cache anyway, typically
// >500kB, and precisely fill cache lines. For us, with
// arrays > 100k elements _mm_stream is still 100%+ slower than
// mm_store.
// Unrolling to count >= 64 is a break-even for most
// input patterns; we seem to be saturating the bus and having
// low enough overhead at 32.
while (count >= 32) {
_mm_store_si128(d++, color_wide);
_mm_store_si128(d++, color_wide);
_mm_store_si128(d++, color_wide);
_mm_store_si128(d++, color_wide);
_mm_store_si128(d++, color_wide);
_mm_store_si128(d++, color_wide);
_mm_store_si128(d++, color_wide);
_mm_store_si128(d++, color_wide);
count -= 32;
}
if (count >= 16) {
_mm_store_si128(d++, color_wide);
_mm_store_si128(d++, color_wide);
_mm_store_si128(d++, color_wide);
_mm_store_si128(d++, color_wide);
count -= 16;
}
dst = reinterpret_cast<uint32_t*>(d);
// Unrolling the loop in the Narrow code is a significant performance
// gain, but unrolling this loop appears to make no difference in
// benchmarks with either mm_store_si128 or individual sets.
while (count > 0) {
*dst++ = color;
--count;
}
destination = (uint32_t*)((char*)destination + rowBytes);
}
}
void ColorRect32_SSE2(SkPMColor* destination,
int width, int height,
size_t rowBytes, uint32_t color) {
if (0 == height || 0 == width || 0 == color) {
return;
}
unsigned colorA = SkGetPackedA32(color);
colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423).
if (255 == colorA) {
if (width < 31) {
BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
rowBytes, color);
} else {
BlitRect32_OpaqueWide_SSE2(destination, width, height,
rowBytes, color);
}
} else {
SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
}
}

View File

@ -1,21 +0,0 @@
/*
* Copyright 2011 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkBlitRect_opts_SSE2_DEFINED
#define SkBlitRect_opts_SSE2_DEFINED
#include "SkColor.h"
/* These functions' implementations copy sections of both
* SkBlitRow_opts_SSE2 and SkUtils_opts_SSE2.
*/
void ColorRect32_SSE2(SkPMColor* SK_RESTRICT dst,
int width, int height,
size_t rowBytes, uint32_t color);
#endif

View File

@ -10,7 +10,6 @@
#include "SkBitmapProcState_opts_SSSE3.h"
#include "SkBitmapScaler.h"
#include "SkBlitMask.h"
#include "SkBlitRect_opts_SSE2.h"
#include "SkBlitRow.h"
#include "SkBlitRow_opts_SSE2.h"
#include "SkBlitRow_opts_SSE4.h"
@ -265,13 +264,6 @@ SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
SkBlitRow::ColorRectProc PlatformColorRectProcFactory(); // suppress warning
SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
/* Return NULL for now, since the optimized path in ColorRect32_SSE2 is disabled.
if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
return ColorRect32_SSE2;
} else {
return NULL;
}
*/
return NULL;
}