Port S32A_opaque blit row to SkOpts.

This should be a pixel-for-pixel (i.e. bug-for-bug) port.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1820313002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review URL: https://codereview.chromium.org/1820313002
This commit is contained in:
mtklein 2016-03-23 06:29:12 -07:00 committed by Commit bot
parent cc77c12293
commit b4a7dc99b1
13 changed files with 189 additions and 492 deletions

View File

@ -49,7 +49,6 @@
'<(skia_src_path)/opts/SkOpts_ssse3.cpp',
],
'sse41_sources': [
'<(skia_src_path)/opts/SkBlitRow_opts_SSE4.cpp',
'<(skia_src_path)/opts/SkOpts_sse41.cpp',
],
# These targets are empty, but XCode doesn't like that, so add an empty file to each.

View File

@ -52,35 +52,6 @@ static void S32_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
}
}
// Portable S32A SrcOver row blit for the "opaque" (alpha == 255) case.
// Blends count src pixels over dst in place via SkPMSrcOver.
static void S32A_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha) {
    SkASSERT(255 == alpha);
    if (count <= 0) {
        return;
    }
#ifdef UNROLL
    // Peel one pixel if count is odd so the main loop can go two at a time.
    if (count & 1) {
        *dst = SkPMSrcOver(*src++, *dst);
        dst++;
        count--;
    }
    // Blend the remaining (even) number of pixels, two per iteration.
    for (const SkPMColor* SK_RESTRICT stop = src + count; src != stop;) {
        *dst = SkPMSrcOver(*src++, *dst);
        dst++;
        *dst = SkPMSrcOver(*src++, *dst);
        dst++;
    }
#else
    // Straightforward one-pixel-at-a-time blend.
    for (int i = 0; i < count; i++) {
        dst[i] = SkPMSrcOver(src[i], dst[i]);
    }
#endif
}
static void S32A_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha) {
@ -115,7 +86,7 @@ static void S32A_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
static const SkBlitRow::Proc32 gDefault_Procs32[] = {
S32_Opaque_BlitRow32,
S32_Blend_BlitRow32,
S32A_Opaque_BlitRow32,
nullptr,
S32A_Blend_BlitRow32
};
@ -124,6 +95,11 @@ SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
// just so we don't crash
flags &= kFlags32_Mask;
if (flags == 2) {
// S32A_Opaque_BlitRow32 has been ported to SkOpts, but not the others yet.
return SkOpts::blit_row_s32a_opaque;
}
SkBlitRow::Proc32 proc = PlatformProcs32(flags);
if (nullptr == proc) {
proc = gDefault_Procs32[flags];

View File

@ -63,6 +63,7 @@ namespace SK_OPTS_NS {
#endif
namespace SkOpts {
// Define default function pointer values here...
// If our global compile options are set high enough, these defaults might even be
// CPU-specialized, e.g. a typical x86-64 machine might start with SSE2 defaults.
@ -84,7 +85,8 @@ namespace SkOpts {
decltype(blit_mask_d32_a8) blit_mask_d32_a8 = sk_default::blit_mask_d32_a8;
decltype(blit_row_color32) blit_row_color32 = sk_default::blit_row_color32;
decltype(blit_row_color32) blit_row_color32 = sk_default::blit_row_color32;
decltype(blit_row_s32a_opaque) blit_row_s32a_opaque = sk_default::blit_row_s32a_opaque;
decltype(matrix_translate) matrix_translate = sk_default::matrix_translate;
decltype(matrix_scale_translate) matrix_scale_translate = sk_default::matrix_scale_translate;

View File

@ -39,6 +39,7 @@ namespace SkOpts {
extern void (*blit_mask_d32_a8)(SkPMColor*, size_t, const SkAlpha*, size_t, SkColor, int, int);
extern void (*blit_row_color32)(SkPMColor*, const SkPMColor*, int, SkPMColor);
extern void (*blit_row_s32a_opaque)(SkPMColor*, const SkPMColor*, int, U8CPU);
// This function is an optimized version of SkColorCubeFilter::filterSpan
extern void (*color_cube_filter_span)(const SkPMColor[],

View File

@ -9,6 +9,12 @@
#define SkBlitRow_opts_DEFINED
#include "Sk4px.h"
#include "SkColorPriv.h"
#include "SkMSAN.h"
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
#include "SkColor_opts_SSE2.h"
#endif
namespace SK_OPTS_NS {
@ -17,7 +23,8 @@ namespace SK_OPTS_NS {
// and it's quite a bit faster than blend_perfect.
//
// blend_256_round_alt is our currently blessed algorithm. Please use it or an analogous one.
static void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
static inline
void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
unsigned invA = 255 - SkGetPackedA32(color);
invA += invA >> 7;
SkASSERT(invA < 256); // We've should have already handled alpha == 0 externally.
@ -30,6 +37,167 @@ static void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, Sk
});
}
// Blit len premultiplied 32-bit src pixels over dst with SrcOver, where the
// per-blit alpha is required to be fully opaque (0xFF).  The SIMD paths peek
// at source alphas in groups: an all-transparent group is skipped entirely,
// an all-opaque group is stored straight through, and only mixed groups pay
// for a real blend.  Ported pixel-for-pixel (bug-for-bug) from the old
// S32A_Opaque_BlitRow32 platform procs.
static inline
void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
    SkASSERT(alpha == 0xFF);                   // Callers route non-opaque blits elsewhere.
    sk_msan_assert_initialized(src, src+len);  // MSAN: reading uninitialized src is a bug.

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    while (len >= 16) {
        // Load 16 source pixels.
        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
             s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
             s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
             s3 = _mm_loadu_si128((const __m128i*)(src) + 3);

        // NOTE(review): hard-coded alpha-in-top-byte mask; presumably assumes
        // SK_A32_SHIFT == 24 — confirm for non-default pixel orders.
        const auto alphaMask = _mm_set1_epi32(0xFF000000);

        auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
        if (_mm_testz_si128(ORed, alphaMask)) {
            // All 16 source pixels are transparent.  Nothing to do.
            src += 16;
            dst += 16;
            len -= 16;
            continue;
        }

        auto d0 = (__m128i*)(dst) + 0,
             d1 = (__m128i*)(dst) + 1,
             d2 = (__m128i*)(dst) + 2,
             d3 = (__m128i*)(dst) + 3;

        auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
        if (_mm_testc_si128(ANDed, alphaMask)) {
            // All 16 source pixels are opaque.  SrcOver becomes Src.
            _mm_storeu_si128(d0, s0);
            _mm_storeu_si128(d1, s1);
            _mm_storeu_si128(d2, s2);
            _mm_storeu_si128(d3, s3);
            src += 16;
            dst += 16;
            len -= 16;
            continue;
        }

        // TODO: This math is wrong.
        // (Left as-is to stay pixel-for-pixel with the old procs.)
        // Do SrcOver.
        _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
        _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
        _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
        _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
        src += 16;
        dst += 16;
        len -= 16;
    }

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    // Same structure as the SSE4.1 path, but the alpha tests use
    // cmpeq+movemask instead of ptest (which is SSE4.1-only).
    while (len >= 16) {
        // Load 16 source pixels.
        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
             s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
             s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
             s3 = _mm_loadu_si128((const __m128i*)(src) + 3);

        const auto alphaMask = _mm_set1_epi32(0xFF000000);

        auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
        if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask),
                                                       _mm_setzero_si128()))) {
            // All 16 source pixels are transparent.  Nothing to do.
            src += 16;
            dst += 16;
            len -= 16;
            continue;
        }

        auto d0 = (__m128i*)(dst) + 0,
             d1 = (__m128i*)(dst) + 1,
             d2 = (__m128i*)(dst) + 2,
             d3 = (__m128i*)(dst) + 3;

        auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
        if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask),
                                                       alphaMask))) {
            // All 16 source pixels are opaque.  SrcOver becomes Src.
            _mm_storeu_si128(d0, s0);
            _mm_storeu_si128(d1, s1);
            _mm_storeu_si128(d2, s2);
            _mm_storeu_si128(d3, s3);
            src += 16;
            dst += 16;
            len -= 16;
            continue;
        }

        // TODO: This math is wrong.
        // (Left as-is to stay pixel-for-pixel with the old procs.)
        // Do SrcOver.
        _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
        _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
        _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
        _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
        src += 16;
        dst += 16;
        len -= 16;
    }

#elif defined(SK_ARM_HAS_NEON)
    // NEON path works 4 pixels at a time (two 64-bit lanes of 2 pixels each).
    while (len >= 4) {
        if ((src[0] | src[1] | src[2] | src[3]) == 0x00000000) {
            // All 4 source pixels are transparent.  Nothing to do.
            src += 4;
            dst += 4;
            len -= 4;
            continue;
        }

        if ((src[0] & src[1] & src[2] & src[3]) >= 0xFF000000) {
            // All 4 source pixels are opaque.  SrcOver becomes Src.
            dst[0] = src[0];
            dst[1] = src[1];
            dst[2] = src[2];
            dst[3] = src[3];
            src += 4;
            dst += 4;
            len -= 4;
            continue;
        }

        // Load 4 source and destination pixels.
        auto src0 = vreinterpret_u8_u32(vld1_u32(src+0)),
             src2 = vreinterpret_u8_u32(vld1_u32(src+2)),
             dst0 = vreinterpret_u8_u32(vld1_u32(dst+0)),
             dst2 = vreinterpret_u8_u32(vld1_u32(dst+2));

        // TODO: This math is wrong.
        // (Left as-is to stay pixel-for-pixel with the old procs.)

        // Table that replicates each pixel's alpha byte across its 4 lanes
        // (bytes 3 and 7 of each 2-pixel group).
        const uint8x8_t alphas = vcreate_u8(0x0707070703030303);
        // 256 - srcAlpha, widened to 16 bits per channel.
        auto invSA0_w = vsubw_u8(vdupq_n_u16(256), vtbl1_u8(src0, alphas)),
             invSA2_w = vsubw_u8(vdupq_n_u16(256), vtbl1_u8(src2, alphas));

        // dst * (256 - srcAlpha), then >>8 and add src (overflow ignored).
        auto dstInvSA0 = vmulq_u16(invSA0_w, vmovl_u8(dst0)),
             dstInvSA2 = vmulq_u16(invSA2_w, vmovl_u8(dst2));

        dst0 = vadd_u8(src0, vshrn_n_u16(dstInvSA0, 8));
        dst2 = vadd_u8(src2, vshrn_n_u16(dstInvSA2, 8));

        vst1_u32(dst+0, vreinterpret_u32_u8(dst0));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst2));

        src += 4;
        dst += 4;
        len -= 4;
    }
#endif

    // Scalar tail (and the whole job when no SIMD path is compiled in).
    while (len-- > 0) {
        if (*src) {
            // Fully-opaque src can be copied; anything else needs a real blend.
            *dst = (*src >= 0xFF000000) ? *src : SkPMSrcOver(*src, *dst);
        }
        src++;
        dst++;
    }
}
} // SK_OPTS_NS
#endif//SkBlitRow_opts_DEFINED

View File

@ -67,61 +67,6 @@ void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
}
}
// SSE2 S32A SrcOver row blit for the "opaque" (alpha == 255) case.
// Processes 16 pixels per iteration, skipping all-transparent groups and
// store-only for all-opaque groups; blends everything else, then finishes
// the remaining <=15 pixels with a scalar loop.
void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    sk_msan_assert_initialized(src, src+count);  // MSAN: src must be fully initialized.
    SkASSERT(alpha == 255);
    if (count <= 0) {
        return;
    }

    int count16 = count / 16;    // Number of complete 16-pixel groups.
    __m128i* dst4 = (__m128i*)dst;
    const __m128i* src4 = (const __m128i*)src;

    for (int i = 0; i < count16 * 4; i += 4) {
        // Load 16 source pixels.
        __m128i s0 = _mm_loadu_si128(src4+i+0),
                s1 = _mm_loadu_si128(src4+i+1),
                s2 = _mm_loadu_si128(src4+i+2),
                s3 = _mm_loadu_si128(src4+i+3);

        const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);

        // OR all alphas together: zero iff every pixel is fully transparent.
        const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
        __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
        if (0xffff == _mm_movemask_epi8(cmp)) {
            // All 16 source pixels are fully transparent.  There's nothing to do!
            continue;
        }

        // AND all alphas together: 0xFF iff every pixel is fully opaque.
        const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
        cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
        if (0xffff == _mm_movemask_epi8(cmp)) {
            // All 16 source pixels are fully opaque.  There's no need to read dst or blend it.
            _mm_storeu_si128(dst4+i+0, s0);
            _mm_storeu_si128(dst4+i+1, s1);
            _mm_storeu_si128(dst4+i+2, s2);
            _mm_storeu_si128(dst4+i+3, s3);
            continue;
        }

        // The general slow case: do the blend for all 16 pixels.
        _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
        _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
        _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
        _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
    }

    // Wrap up the last <= 15 pixels.
    SkASSERT(count - (count16*16) <= 15);
    for (int i = count16*16; i < count; i++) {
        // This check is not really necessary, but it prevents pointless autovectorization.
        if (src[i] & 0xFF000000) {
            dst[i] = SkPMSrcOver(src[i], dst[i]);
        }
    }
}
void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha) {

View File

@ -14,10 +14,6 @@ void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha);
void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha);
void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha);

View File

@ -1,74 +0,0 @@
/*
* Copyright 2015 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "SkBlitRow_opts_SSE4.h"
// Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods.
// The stubs should never be called, so we make them crash just to confirm that.
#if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41
// Stub used when the compiler can't build SSE4.1 intrinsics.  It should
// never be reached at runtime; crash via sk_throw() to confirm that.
void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) {
    sk_throw();
}
#else
#include <smmintrin.h> // SSE4.1 intrinsics
#include "SkColorPriv.h"
#include "SkColor_opts_SSE2.h"
#include "SkMSAN.h"
// SSE4.1 S32A SrcOver row blit for the "opaque" (alpha == 255) case.
// Identical in structure to the SSE2 version, but uses ptest
// (_mm_testz_si128/_mm_testc_si128) for the all-transparent/all-opaque
// group checks instead of cmpeq+movemask.
void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count,
                                U8CPU alpha) {
    sk_msan_assert_initialized(src, src+count);  // MSAN: src must be fully initialized.
    SkASSERT(alpha == 255);

    // As long as we can, we'll work on 16 pixel pairs at once.
    int count16 = count / 16;
    __m128i* dst4 = (__m128i*)dst;
    const __m128i* src4 = (const __m128i*)src;

    for (int i = 0; i < count16 * 4; i += 4) {
        // Load 16 source pixels.
        __m128i s0 = _mm_loadu_si128(src4+i+0),
                s1 = _mm_loadu_si128(src4+i+1),
                s2 = _mm_loadu_si128(src4+i+2),
                s3 = _mm_loadu_si128(src4+i+3);

        const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);

        // ptest: ZF set iff (ORed & alphaMask) == 0, i.e. every alpha is zero.
        const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
        if (_mm_testz_si128(ORed, alphaMask)) {
            // All 16 source pixels are fully transparent.  There's nothing to do!
            continue;
        }

        // ptest: CF set iff (~ANDed & alphaMask) == 0, i.e. every alpha is 0xFF.
        const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
        if (_mm_testc_si128(ANDed, alphaMask)) {
            // All 16 source pixels are fully opaque.  There's no need to read dst or blend it.
            _mm_storeu_si128(dst4+i+0, s0);
            _mm_storeu_si128(dst4+i+1, s1);
            _mm_storeu_si128(dst4+i+2, s2);
            _mm_storeu_si128(dst4+i+3, s3);
            continue;
        }

        // The general slow case: do the blend for all 16 pixels.
        _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
        _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
        _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
        _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
    }

    // Wrap up the last <= 15 pixels.
    for (int i = count16*16; i < count; i++) {
        // This check is not really necessary, but it prevents pointless autovectorization.
        if (src[i] & 0xFF000000) {
            dst[i] = SkPMSrcOver(src[i], dst[i]);
        }
    }
}
#endif

View File

@ -1,18 +0,0 @@
/*
* Copyright 2014 The Android Open Source Project
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkBlitRow_opts_SSE4_DEFINED
#define SkBlitRow_opts_SSE4_DEFINED
#include "SkBlitRow.h"
void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT,
const SkPMColor* SK_RESTRICT,
int count,
U8CPU alpha);
#endif

View File

@ -871,282 +871,6 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
}
}
// NEON S32A SrcOver row blit for the "opaque" (alpha == 255) case.
// Blends 4 pixels per iteration (two 64-bit registers of 2 pixels each),
// then finishes the remainder with a scalar loop.
void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count > 0) {
        uint8x8_t alpha_mask;

        // Table indices that replicate each pixel's alpha byte
        // (byte 3 of pixel 0, byte 7 of pixel 1) across all 4 lanes.
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        /* do the NEON unrolled code */
#define UNROLL 4
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

            /* The two prefetches below may make the code slightly
             * slower for small values of count but are worth having
             * in the general case.
             */
            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            /* get the source */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if UNROLL > 2
            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

            /* get and hold the dst too */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if UNROLL > 2
            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

            /* 1st and 2nd bits of the unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                /* get the alphas spread out properly */
                alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
                // 256 - srcAlpha, widened to 16 bits per channel.
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final = vadd_u8(src_raw, dst_cooked);
            }

#if UNROLL > 2
            /* the 3rd and 4th bits of our unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw_2);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
            }
#endif

            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if UNROLL > 2
            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}
// NEON S32A SrcOver row blit that inspects source alpha and switches between
// three specialized states, implemented as a goto-based state machine:
//   ALPHA_0        — runs of fully transparent src: just advance pointers.
//   ALPHA_255      — runs of fully opaque src: plain copies, no blending.
//   ALPHA_1_TO_254 — general case: NEON blend, 4 pixels per iteration.
// A scalar TAIL handles the last few pixels.  This wins when alpha runs are
// long; frequent state transitions can make it slower than the plain proc.
// NOTE(review): the 0xFF000000/0x00FFFFFF tests presumably assume
// SK_A32_SHIFT == 24 — the proc table below only selects this proc then.
void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                          const SkPMColor* SK_RESTRICT src,
                                          int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;

#define UNROLL 4
    // Stop early enough that each unrolled step can read UNROLL pixels;
    // TAIL adds (UNROLL + 1) back to reach the true end.
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    // Table indices that replicate each pixel's alpha byte across its lanes.
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    // General case: SrcOver blend of 4 pixels per iteration.
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
        it might make sense to go to the optimized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /*fall-thru*/

ALPHA_0:
    /*In this state, we know the current alpha is 0 and
     we optimize for the next alpha also being zero. */
    src_temp = src;  //so we don't have to increment dst every time
    do {
        // Unrolled x4: bail out of the run at the first non-transparent pixel.
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    // Copy whole groups of 4 while every pixel stays fully opaque.
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    //Handle remainder.
    // Copy up to 3 more opaque pixels one at a time.
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  //goto the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef UNROLL
    return;
}
/* Neon version of S32_Blend_BlitRow32()
* portable version is in src/core/SkBlitRow_D32.cpp
*/
@ -1561,21 +1285,7 @@ const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
nullptr, // S32_Opaque,
S32_Blend_BlitRow32_neon, // S32_Blend,
/*
* We have two choices for S32A_Opaque procs. The one reads the src alpha
* value and attempts to optimize accordingly. The optimization is
* sensitive to the source content and is not a win in all cases. For
* example, if there are a lot of transitions between the alpha states,
* the performance will almost certainly be worse. However, for many
* common cases the performance is equivalent or better than the standard
* case where we do not inspect the src alpha.
*/
#if SK_A32_SHIFT == 24
// This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
#else
S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
#endif
nullptr, // Ported to SkOpts
#ifdef SK_CPU_ARM32
S32A_Blend_BlitRow32_neon // S32A_Blend
#else

View File

@ -36,7 +36,8 @@ namespace SkOpts {
blit_mask_d32_a8 = sk_neon::blit_mask_d32_a8;
blit_row_color32 = sk_neon::blit_row_color32;
blit_row_color32 = sk_neon::blit_row_color32;
blit_row_s32a_opaque = sk_neon::blit_row_s32a_opaque;
color_cube_filter_span = sk_neon::color_cube_filter_span;

View File

@ -9,10 +9,11 @@
#define SK_OPTS_NS sk_sse41
#include "SkBlurImageFilter_opts.h"
#include "SkBlitRow_opts.h"
#ifndef SK_SUPPORT_LEGACY_X86_BLITS
namespace sk_sse41 {
namespace sk_sse41_new {
// An SSE register holding at most 64 bits of useful data in the low lanes.
struct m64i {
@ -211,7 +212,7 @@ static void blit_mask_d32_a8(SkPMColor* dst, size_t dstRB,
}
}
} // namespace sk_sse41
} // namespace sk_sse41_new
#endif
@ -222,8 +223,9 @@ namespace SkOpts {
box_blur_yx = sk_sse41::box_blur_yx;
#ifndef SK_SUPPORT_LEGACY_X86_BLITS
blit_row_color32 = sk_sse41::blit_row_color32;
blit_mask_d32_a8 = sk_sse41::blit_mask_d32_a8;
blit_row_color32 = sk_sse41_new::blit_row_color32;
blit_mask_d32_a8 = sk_sse41_new::blit_mask_d32_a8;
#endif
blit_row_s32a_opaque = sk_sse41::blit_row_s32a_opaque;
}
}

View File

@ -12,7 +12,6 @@
#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkBlitRow_opts_SSE2.h"
#include "SkBlitRow_opts_SSE4.h"
#include "SkOncePtr.h"
#include "SkRTConf.h"
@ -215,21 +214,11 @@ SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) {
static const SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
nullptr, // S32_Opaque,
S32_Blend_BlitRow32_SSE2, // S32_Blend,
S32A_Opaque_BlitRow32_SSE2, // S32A_Opaque
S32A_Blend_BlitRow32_SSE2, // S32A_Blend,
};
static const SkBlitRow::Proc32 platform_32_procs_SSE4[] = {
nullptr, // S32_Opaque,
S32_Blend_BlitRow32_SSE2, // S32_Blend,
S32A_Opaque_BlitRow32_SSE4, // S32A_Opaque
nullptr, // Ported to SkOpts
S32A_Blend_BlitRow32_SSE2, // S32A_Blend,
};
SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
return platform_32_procs_SSE4[flags];
} else
if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
return platform_32_procs_SSE2[flags];
} else {