Sk4px blit mask.

Local SKP nanobenching ranges SSE between 1.05x and 0.87x, much more heavily weighted toward <1.0x ratios (speedups).
I profiled the top five regressions (1.05x-1.01x) and they look like noise.  Will follow up after broad bot results.

NEON looks similar but less extreme than SSE changes, ranging between 1.02x and 0.95x, again mostly speedups in 0.99x-0.97x range.

The old code trifurcated into black, opaque-but-not-black, and general versions as a function of the constant src color.  I did not see a significant difference between general and opaque-but-not-black, and I don't think a black version would be faster using SIMD.  So we have here just one version of the code, the general version.

Somewhat fantastically, I see no pixel diffs on GMs or SKPs.

I will be following up with more CLs for the other procs called by SkBlitMask.
BUG=skia:

Review URL: https://codereview.chromium.org/1278253003
This commit is contained in:
mtklein 2015-08-10 12:58:17 -07:00 committed by Commit bot
parent c699873ac7
commit 4977983510
15 changed files with 91 additions and 348 deletions

View File

@ -165,6 +165,34 @@ public:
}
}
// As above, but with dst4' = fn(dst4, alpha4).
template <typename Fn, typename Dst>
static void MapDstAlpha(int n, Dst* dst, const SkAlpha* a, const Fn& fn) {
while (n > 0) {
if (n >= 8) {
Sk4px dst0 = fn(Load4(dst+0), Load4Alphas(a+0)),
dst4 = fn(Load4(dst+4), Load4Alphas(a+4));
dst0.store4(dst+0);
dst4.store4(dst+4);
dst += 8; a += 8; n -= 8;
continue; // Keep our stride at 8 pixels as long as possible.
}
SkASSERT(n <= 7);
if (n >= 4) {
fn(Load4(dst), Load4Alphas(a)).store4(dst);
dst += 4; a += 4; n -= 4;
}
if (n >= 2) {
fn(Load2(dst), Load2Alphas(a)).store2(dst);
dst += 2; a += 2; n -= 2;
}
if (n >= 1) {
fn(Load1(dst), DupAlpha(*a)).store1(dst);
}
break;
}
}
// As above, but with dst4' = fn(dst4, src4, alpha4).
template <typename Fn, typename Dst>
static void MapDstSrcAlpha(int n, Dst* dst, const SkPMColor* src, const SkAlpha* a,

View File

@ -47,18 +47,6 @@ public:
typedef void (*RowProc)(void* dst, const void* mask,
const SkPMColor* src, int width);
/**
* Public entry-point to return a blitmask ColorProc.
* May return NULL if config or format are not supported.
*/
static ColorProc ColorFactory(SkColorType, SkMask::Format, SkColor);
/**
* Return either platform specific optimized blitmask ColorProc,
* or NULL if no optimized routine is available.
*/
static ColorProc PlatformColorProcs(SkColorType, SkMask::Format, SkColor);
/**
* Public entry-point to return a blitcolor BlitLCD16RowProc.
*/

View File

@ -8,68 +8,7 @@
#include "SkBlitMask.h"
#include "SkColor.h"
#include "SkColorPriv.h"
static void D32_A8_Color(void* SK_RESTRICT dst, size_t dstRB,
const void* SK_RESTRICT maskPtr, size_t maskRB,
SkColor color, int width, int height) {
SkPMColor pmc = SkPreMultiplyColor(color);
size_t dstOffset = dstRB - (width << 2);
size_t maskOffset = maskRB - width;
SkPMColor* SK_RESTRICT device = (SkPMColor *)dst;
const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
do {
int w = width;
do {
unsigned aa = *mask++;
*device = SkBlendARGB32(pmc, *device, aa);
device += 1;
} while (--w != 0);
device = (uint32_t*)((char*)device + dstOffset);
mask += maskOffset;
} while (--height != 0);
}
static void D32_A8_Opaque(void* SK_RESTRICT dst, size_t dstRB,
const void* SK_RESTRICT maskPtr, size_t maskRB,
SkColor color, int width, int height) {
SkPMColor pmc = SkPreMultiplyColor(color);
SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
maskRB -= width;
dstRB -= (width << 2);
do {
int w = width;
do {
unsigned aa = *mask++;
*device = SkAlphaMulQ(pmc, SkAlpha255To256(aa)) + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
device += 1;
} while (--w != 0);
device = (uint32_t*)((char*)device + dstRB);
mask += maskRB;
} while (--height != 0);
}
static void D32_A8_Black(void* SK_RESTRICT dst, size_t dstRB,
const void* SK_RESTRICT maskPtr, size_t maskRB,
SkColor, int width, int height) {
SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
maskRB -= width;
dstRB -= (width << 2);
do {
int w = width;
do {
unsigned aa = *mask++;
*device = (aa << SK_A32_SHIFT) + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
device += 1;
} while (--w != 0);
device = (uint32_t*)((char*)device + dstRB);
mask += maskRB;
} while (--height != 0);
}
#include "SkOpts.h"
SkBlitMask::BlitLCD16RowProc SkBlitMask::BlitLCD16RowFactory(bool isOpaque) {
BlitLCD16RowProc proc = PlatformBlitRowProcs16(isOpaque);
@ -112,51 +51,25 @@ static void D32_LCD16_Proc(void* SK_RESTRICT dst, size_t dstRB,
///////////////////////////////////////////////////////////////////////////////
static SkBlitMask::ColorProc D32_A8_Factory(SkColor color) {
if (SK_ColorBLACK == color) {
return D32_A8_Black;
} else if (0xFF == SkColorGetA(color)) {
return D32_A8_Opaque;
} else {
return D32_A8_Color;
}
}
SkBlitMask::ColorProc SkBlitMask::ColorFactory(SkColorType ct,
SkMask::Format format,
SkColor color) {
ColorProc proc = PlatformColorProcs(ct, format, color);
if (proc) {
return proc;
}
switch (ct) {
case kN32_SkColorType:
switch (format) {
case SkMask::kA8_Format:
return D32_A8_Factory(color);
case SkMask::kLCD16_Format:
return D32_LCD16_Proc;
default:
break;
}
break;
default:
break;
}
return NULL;
}
bool SkBlitMask::BlitColor(const SkPixmap& device, const SkMask& mask,
const SkIRect& clip, SkColor color) {
ColorProc proc = ColorFactory(device.colorType(), mask.fFormat, color);
if (proc) {
int x = clip.fLeft;
int y = clip.fTop;
proc(device.writable_addr32(x, y), device.rowBytes(), mask.getAddr(x, y),
mask.fRowBytes, color, clip.width(), clip.height());
int x = clip.fLeft, y = clip.fTop;
if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kA8_Format) {
SkOpts::blit_mask_d32_a8(device.writable_addr32(x,y), device.rowBytes(),
(const SkAlpha*)mask.getAddr(x,y), mask.fRowBytes,
color, clip.width(), clip.height());
return true;
}
if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kLCD16_Format) {
// TODO: Is this reachable code? Seems like no.
D32_LCD16_Proc(device.writable_addr32(x,y), device.rowBytes(),
mask.getAddr(x,y), mask.fRowBytes,
color, clip.width(), clip.height());
return true;
}
return false;
}

View File

@ -9,6 +9,7 @@
#include "SkOpts.h"
#define SK_OPTS_NS portable
#include "SkBlitMask_opts.h"
#include "SkBlurImageFilter_opts.h"
#include "SkFloatingPoint_opts.h"
#include "SkMorphologyImageFilter_opts.h"
@ -50,6 +51,8 @@ namespace SkOpts {
decltype(texture_compressor) texture_compressor = portable::texture_compressor;
decltype(fill_block_dimensions) fill_block_dimensions = portable::fill_block_dimensions;
decltype(blit_mask_d32_a8) blit_mask_d32_a8 = portable::blit_mask_d32_a8;
// Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
void Init_sse2();
void Init_ssse3();

View File

@ -43,6 +43,7 @@ namespace SkOpts {
extern TextureCompressor (*texture_compressor)(SkColorType, SkTextureCompressor::Format);
extern bool (*fill_block_dimensions)(SkTextureCompressor::Format, int* x, int* y);
extern void (*blit_mask_d32_a8)(SkPMColor*, size_t, const SkAlpha*, size_t, SkColor, int, int);
}
#endif//SkOpts_DEFINED

View File

@ -0,0 +1,37 @@
/*
* Copyright 2015 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkBlitMask_opts_DEFINED
#define SkBlitMask_opts_DEFINED
#include "Sk4px.h"
namespace SK_OPTS_NS {
static void blit_mask_d32_a8(SkPMColor* dst, size_t dstRB,
const SkAlpha* mask, size_t maskRB,
SkColor color, int w, int h) {
auto s = Sk4px::DupPMColor(SkPreMultiplyColor(color));
auto fn = [&](const Sk4px& d, const Sk4px& aa) {
// = (s + d(1-sa))aa + d(1-aa)
// = s*aa + d(1-sa*aa)
auto left = s.approxMulDiv255(aa),
right = d.approxMulDiv255(left.alphas().inv());
return left + right; // This does not overflow (exhaustively checked).
};
while (h --> 0) {
Sk4px::MapDstAlpha(w, dst, mask, fn);
dst += dstRB / sizeof(*dst);
mask += maskRB / sizeof(*mask);
}
}
} // SK_OPTS_NS
#endif//SkBlitMask_opts_DEFINED

View File

@ -11,32 +11,6 @@
#include "SkUtilsArm.h"
#include "SkBlitMask_opts_arm_neon.h"
SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT,
SkMask::Format maskFormat,
SkColor color) {
#if SK_ARM_NEON_IS_NONE
return NULL;
#else
/* ** This has been disabled until we can diagnose and fix the SIGILL generated
** in the NEON code. See http://skbug.com/2067 for details.
#if SK_ARM_NEON_IS_DYNAMIC
if (!sk_cpu_arm_has_neon()) {
return NULL;
}
#endif
if ((kN32_SkColorType == dstCT) &&
(SkMask::kA8_Format == maskFormat)) {
return D32_A8_Factory_neon(color);
}
*/
#endif
// We don't need to handle the SkMask::kLCD16_Format case as the default
// LCD16 will call us through SkBlitMask::PlatformBlitRowProcs16()
return NULL;
}
SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
if (isOpaque) {
return SK_ARM_NEON_WRAP(SkBlitLCD16OpaqueRow);

View File

@ -8,129 +8,6 @@
#include "SkBlitMask.h"
#include "SkColor_opts_neon.h"
static void D32_A8_Black_neon(void* SK_RESTRICT dst, size_t dstRB,
const void* SK_RESTRICT maskPtr, size_t maskRB,
SkColor, int width, int height) {
SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
maskRB -= width;
dstRB -= (width << 2);
do {
int w = width;
while (w >= 8) {
uint8x8_t vmask = vld1_u8(mask);
uint16x8_t vscale = vsubw_u8(vdupq_n_u16(256), vmask);
uint8x8x4_t vdevice = vld4_u8((uint8_t*)device);
vdevice = SkAlphaMulQ_neon8(vdevice, vscale);
vdevice.val[NEON_A] += vmask;
vst4_u8((uint8_t*)device, vdevice);
mask += 8;
device += 8;
w -= 8;
}
while (w-- > 0) {
unsigned aa = *mask++;
*device = (aa << SK_A32_SHIFT)
+ SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
device += 1;
};
device = (uint32_t*)((char*)device + dstRB);
mask += maskRB;
} while (--height != 0);
}
template <bool isColor>
static void D32_A8_Opaque_Color_neon(void* SK_RESTRICT dst, size_t dstRB,
const void* SK_RESTRICT maskPtr, size_t maskRB,
SkColor color, int width, int height) {
SkPMColor pmc = SkPreMultiplyColor(color);
SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
uint8x8x4_t vpmc;
maskRB -= width;
dstRB -= (width << 2);
if (width >= 8) {
vpmc.val[NEON_A] = vdup_n_u8(SkGetPackedA32(pmc));
vpmc.val[NEON_R] = vdup_n_u8(SkGetPackedR32(pmc));
vpmc.val[NEON_G] = vdup_n_u8(SkGetPackedG32(pmc));
vpmc.val[NEON_B] = vdup_n_u8(SkGetPackedB32(pmc));
}
do {
int w = width;
while (w >= 8) {
uint8x8_t vmask = vld1_u8(mask);
uint16x8_t vscale, vmask256 = SkAlpha255To256_neon8(vmask);
if (isColor) {
vscale = vsubw_u8(vdupq_n_u16(256),
SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256));
} else {
vscale = vsubw_u8(vdupq_n_u16(256), vmask);
}
uint8x8x4_t vdev = vld4_u8((uint8_t*)device);
vdev.val[NEON_A] = SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256)
+ SkAlphaMul_neon8(vdev.val[NEON_A], vscale);
vdev.val[NEON_R] = SkAlphaMul_neon8(vpmc.val[NEON_R], vmask256)
+ SkAlphaMul_neon8(vdev.val[NEON_R], vscale);
vdev.val[NEON_G] = SkAlphaMul_neon8(vpmc.val[NEON_G], vmask256)
+ SkAlphaMul_neon8(vdev.val[NEON_G], vscale);
vdev.val[NEON_B] = SkAlphaMul_neon8(vpmc.val[NEON_B], vmask256)
+ SkAlphaMul_neon8(vdev.val[NEON_B], vscale);
vst4_u8((uint8_t*)device, vdev);
mask += 8;
device += 8;
w -= 8;
}
while (w--) {
unsigned aa = *mask++;
if (isColor) {
*device = SkBlendARGB32(pmc, *device, aa);
} else {
*device = SkAlphaMulQ(pmc, SkAlpha255To256(aa))
+ SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
}
device += 1;
};
device = (uint32_t*)((char*)device + dstRB);
mask += maskRB;
} while (--height != 0);
}
static void D32_A8_Opaque_neon(void* SK_RESTRICT dst, size_t dstRB,
const void* SK_RESTRICT maskPtr, size_t maskRB,
SkColor color, int width, int height) {
D32_A8_Opaque_Color_neon<false>(dst, dstRB, maskPtr, maskRB, color, width, height);
}
static void D32_A8_Color_neon(void* SK_RESTRICT dst, size_t dstRB,
const void* SK_RESTRICT maskPtr, size_t maskRB,
SkColor color, int width, int height) {
D32_A8_Opaque_Color_neon<true>(dst, dstRB, maskPtr, maskRB, color, width, height);
}
SkBlitMask::ColorProc D32_A8_Factory_neon(SkColor color) {
if (SK_ColorBLACK == color) {
return D32_A8_Black_neon;
} else if (0xFF == SkColorGetA(color)) {
return D32_A8_Opaque_neon;
} else {
return D32_A8_Color_neon;
}
}
////////////////////////////////////////////////////////////////////////////////
void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
SkColor color, int width,
SkPMColor opaqueDst) {

View File

@ -11,8 +11,6 @@
#include "SkColor.h"
#include "SkBlitMask.h"
extern SkBlitMask::ColorProc D32_A8_Factory_neon(SkColor color);
extern void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
SkColor color, int width,
SkPMColor opaqueDst);

View File

@ -7,12 +7,6 @@
#include "SkBlitMask.h"
SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT,
SkMask::Format maskFormat,
SkColor color) {
return NULL;
}
SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
return NULL;
}

View File

@ -301,54 +301,6 @@ void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y)
}
}
void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
size_t maskRB, SkColor origColor,
int width, int height) {
SkPMColor color = SkPreMultiplyColor(origColor);
size_t dstOffset = dstRB - (width << 2);
size_t maskOffset = maskRB - width;
SkPMColor* dst = (SkPMColor *)device;
const uint8_t* mask = (const uint8_t*)maskPtr;
do {
int count = width;
if (count >= 4) {
while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
*dst = SkBlendARGB32(color, *dst, *mask);
mask++;
dst++;
count--;
}
__m128i *d = reinterpret_cast<__m128i*>(dst);
__m128i src_pixel = _mm_set1_epi32(color);
while (count >= 4) {
// Load 4 dst pixels
__m128i dst_pixel = _mm_load_si128(d);
// Set the alpha value
__m128i alpha_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask));
alpha_wide = _mm_unpacklo_epi8(alpha_wide, _mm_setzero_si128());
alpha_wide = _mm_unpacklo_epi16(alpha_wide, _mm_setzero_si128());
__m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha_wide);
_mm_store_si128(d, result);
// Load the next 4 dst pixels and alphas
mask = mask + 4;
d++;
count -= 4;
}
dst = reinterpret_cast<SkPMColor*>(d);
}
while (count > 0) {
*dst= SkBlendARGB32(color, *dst, *mask);
dst += 1;
mask++;
count --;
}
dst = (SkPMColor *)((char*)dst + dstOffset);
mask += maskOffset;
} while (--height != 0);
}
// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.

View File

@ -25,10 +25,6 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x,
int y);
void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* mask,
size_t maskRB, SkColor color,
int width, int height);
void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
SkColor color, int width, SkPMColor);
void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],

View File

@ -8,6 +8,7 @@
#include "SkOpts.h"
#define SK_OPTS_NS neon
#include "SkBlitMask_opts.h"
#include "SkBlurImageFilter_opts.h"
#include "SkFloatingPoint_opts.h"
#include "SkMorphologyImageFilter_opts.h"
@ -33,5 +34,7 @@ namespace SkOpts {
texture_compressor = neon::texture_compressor;
fill_block_dimensions = neon::fill_block_dimensions;
blit_mask_d32_a8 = neon::blit_mask_d32_a8;
}
}

View File

@ -8,6 +8,7 @@
#include "SkOpts.h"
#define SK_OPTS_NS sse2
#include "SkBlitMask_opts.h"
#include "SkBlurImageFilter_opts.h"
#include "SkMorphologyImageFilter_opts.h"
#include "SkUtils_opts.h"
@ -27,5 +28,7 @@ namespace SkOpts {
dilate_y = sse2::dilate_y;
erode_x = sse2::erode_x;
erode_y = sse2::erode_y;
blit_mask_d32_a8 = sse2::blit_mask_d32_a8;
}
}

View File

@ -252,30 +252,6 @@ SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
////////////////////////////////////////////////////////////////////////////////
SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT,
SkMask::Format maskFormat,
SkColor color) {
if (SkMask::kA8_Format != maskFormat) {
return NULL;
}
ColorProc proc = NULL;
if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
switch (dstCT) {
case kN32_SkColorType:
// The SSE2 version is not (yet) faster for black, so we check
// for that.
if (SK_ColorBLACK != color) {
proc = SkARGB32_A8_BlitMask_SSE2;
}
break;
default:
break;
}
}
return proc;
}
SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
if (isOpaque) {