Port S32A_opaque blit row to SkOpts.
This should be a pixel-for-pixel (i.e. bug-for-bug) port.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1820313002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review URL: https://codereview.chromium.org/1820313002
commit b4a7dc99b1
parent cc77c12293
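Before the diff, the operation at stake: every path below must reproduce Skia's scalar SkPMSrcOver blend exactly, truncation and all, because this is a pixel-for-pixel port. Here is a standalone reconstruction of that premultiplied math (my own sketch, not the actual SkColorPriv.h code):

    #include <cstdint>

    // Premultiplied SrcOver on 8888 pixels, reconstructed to match the scalar
    // behavior the SIMD paths below must preserve. The (256 - srcA) scale and
    // the >>8 truncation are deliberate: matching them bit-for-bit is what the
    // commit message means by "bug-for-bug", and presumably why the diff keeps
    // its "TODO: This math is wrong." comments instead of fixing the rounding.
    static inline uint32_t pm_src_over(uint32_t src, uint32_t dst) {
        uint32_t scale = 256 - (src >> 24);                 // SkAlpha255To256(255 - srcA)
        uint32_t mask  = 0x00FF00FF;
        uint32_t rb = (((dst >> 0) & mask) * scale) >> 8;   // two even-byte channels
        uint32_t ag = (((dst >> 8) & mask) * scale) >> 8;   // two odd-byte channels
        return src + ((rb & mask) | ((ag & mask) << 8));
    }

For srcA == 0xFF the scale is 1 and the blended dst term truncates to zero, which is why every unrolled loop below can treat all-opaque groups as a plain copy.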
@@ -49,7 +49,6 @@
       '<(skia_src_path)/opts/SkOpts_ssse3.cpp',
     ],
     'sse41_sources': [
-      '<(skia_src_path)/opts/SkBlitRow_opts_SSE4.cpp',
       '<(skia_src_path)/opts/SkOpts_sse41.cpp',
     ],
     # These targets are empty, but XCode doesn't like that, so add an empty file to each.
@@ -52,35 +52,6 @@ static void S32_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
     }
 }
 
-static void S32A_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
-                                  const SkPMColor* SK_RESTRICT src,
-                                  int count, U8CPU alpha) {
-    SkASSERT(255 == alpha);
-    if (count > 0) {
-#ifdef UNROLL
-        if (count & 1) {
-            *dst = SkPMSrcOver(*(src++), *dst);
-            dst += 1;
-            count -= 1;
-        }
-
-        const SkPMColor* SK_RESTRICT srcEnd = src + count;
-        while (src != srcEnd) {
-            *dst = SkPMSrcOver(*(src++), *dst);
-            dst += 1;
-            *dst = SkPMSrcOver(*(src++), *dst);
-            dst += 1;
-        }
-#else
-        do {
-            *dst = SkPMSrcOver(*src, *dst);
-            src += 1;
-            dst += 1;
-        } while (--count > 0);
-#endif
-    }
-}
-
 static void S32A_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha) {
@@ -115,7 +86,7 @@ static void S32A_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
 static const SkBlitRow::Proc32 gDefault_Procs32[] = {
     S32_Opaque_BlitRow32,
     S32_Blend_BlitRow32,
-    S32A_Opaque_BlitRow32,
+    nullptr,
     S32A_Blend_BlitRow32
 };
 
@@ -124,6 +95,11 @@ SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
     // just so we don't crash
     flags &= kFlags32_Mask;
 
+    if (flags == 2) {
+        // S32A_Opaque_BlitRow32 has been ported to SkOpts, but not the others yet.
+        return SkOpts::blit_row_s32a_opaque;
+    }
+
     SkBlitRow::Proc32 proc = PlatformProcs32(flags);
     if (nullptr == proc) {
         proc = gDefault_Procs32[flags];
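A note on the magic `flags == 2` above: in the SkBlitRow.h of this era the Proc32 flag bits were, to the best of my recollection, laid out as below. Treat this enum as an assumption for illustration, not a quote from the header:

    // Assumed flag layout from SkBlitRow.h (verify against the actual header):
    enum Flags32 {
        kGlobalAlpha_Flag32   = 1 << 0,  // caller supplies an extra global alpha
        kSrcPixelAlpha_Flag32 = 1 << 1,  // source pixels carry per-pixel alpha
    };

    // flags == 2 is kSrcPixelAlpha_Flag32 alone: per-pixel alpha, no global
    // alpha. That is exactly the S32A_Opaque slot, the one gDefault_Procs32
    // now fills with nullptr and Factory32 short-circuits to
    // SkOpts::blit_row_s32a_opaque.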
@@ -63,6 +63,7 @@ namespace SK_OPTS_NS {
 #endif
 
 namespace SkOpts {
 
     // Define default function pointer values here...
     // If our global compile options are set high enough, these defaults might even be
     // CPU-specialized, e.g. a typical x86-64 machine might start with SSE2 defaults.
@@ -84,7 +85,8 @@ namespace SkOpts {
 
     decltype(blit_mask_d32_a8) blit_mask_d32_a8 = sk_default::blit_mask_d32_a8;
 
-    decltype(blit_row_color32) blit_row_color32 = sk_default::blit_row_color32;
+    decltype(blit_row_color32)     blit_row_color32     = sk_default::blit_row_color32;
+    decltype(blit_row_s32a_opaque) blit_row_s32a_opaque = sk_default::blit_row_s32a_opaque;
 
     decltype(matrix_translate) matrix_translate = sk_default::matrix_translate;
     decltype(matrix_scale_translate) matrix_scale_translate = sk_default::matrix_scale_translate;
@@ -39,6 +39,7 @@ namespace SkOpts {
 
     extern void (*blit_mask_d32_a8)(SkPMColor*, size_t, const SkAlpha*, size_t, SkColor, int, int);
     extern void (*blit_row_color32)(SkPMColor*, const SkPMColor*, int, SkPMColor);
+    extern void (*blit_row_s32a_opaque)(SkPMColor*, const SkPMColor*, int, U8CPU);
 
     // This function is an optimized version of SkColorCubeFilter::filterSpan
     extern void (*color_cube_filter_span)(const SkPMColor[],
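These declarations are the whole surface area of SkOpts: each blit is a mutable function pointer, bound to a portable default at static-init time and overwritten once runtime CPU detection has run. A compressed sketch of the mechanism, with simplified signatures — only the SkOpts/sk_default/sk_sse41 names come from the diff, the rest is illustrative:

    #include <cstdint>

    using SkPMColor = uint32_t;

    // The same header (SkBlitRow_opts.h) is compiled once per instruction set,
    // each time with SK_OPTS_NS set to a different namespace.
    namespace sk_default { void blit_row_s32a_opaque(SkPMColor*, const SkPMColor*, int, unsigned) { /* portable loop */ } }
    namespace sk_sse41   { void blit_row_s32a_opaque(SkPMColor*, const SkPMColor*, int, unsigned) { /* SSE4.1 body  */ } }

    namespace SkOpts {
        // Default binding; may already be CPU-specialized by global compile flags.
        void (*blit_row_s32a_opaque)(SkPMColor*, const SkPMColor*, int, unsigned)
            = sk_default::blit_row_s32a_opaque;

        // Called once at startup after CPU feature detection (name assumed to
        // match the Init_* convention of the SkOpts_*.cpp files below).
        void Init_sse41() { blit_row_s32a_opaque = sk_sse41::blit_row_s32a_opaque; }
    }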
@@ -9,6 +9,12 @@
 #define SkBlitRow_opts_DEFINED
 
 #include "Sk4px.h"
+#include "SkColorPriv.h"
+#include "SkMSAN.h"
 
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+    #include "SkColor_opts_SSE2.h"
+#endif
+
 namespace SK_OPTS_NS {
 
@@ -17,7 +23,8 @@ namespace SK_OPTS_NS {
 // and it's quite a bit faster than blend_perfect.
 //
 // blend_256_round_alt is our currently blessed algorithm.  Please use it or an analogous one.
-static void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
+static inline
+void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
     unsigned invA = 255 - SkGetPackedA32(color);
     invA += invA >> 7;
     SkASSERT(invA < 256);  // We should have already handled alpha == 0 externally.
@@ -30,6 +37,167 @@ static void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
     });
 }
 
+static inline
+void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
+    SkASSERT(alpha == 0xFF);
+    sk_msan_assert_initialized(src, src+len);
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+    while (len >= 16) {
+        // Load 16 source pixels.
+        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
+             s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
+             s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
+             s3 = _mm_loadu_si128((const __m128i*)(src) + 3);
+
+        const auto alphaMask = _mm_set1_epi32(0xFF000000);
+
+        auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
+        if (_mm_testz_si128(ORed, alphaMask)) {
+            // All 16 source pixels are transparent.  Nothing to do.
+            src += 16;
+            dst += 16;
+            len -= 16;
+            continue;
+        }
+
+        auto d0 = (__m128i*)(dst) + 0,
+             d1 = (__m128i*)(dst) + 1,
+             d2 = (__m128i*)(dst) + 2,
+             d3 = (__m128i*)(dst) + 3;
+
+        auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
+        if (_mm_testc_si128(ANDed, alphaMask)) {
+            // All 16 source pixels are opaque.  SrcOver becomes Src.
+            _mm_storeu_si128(d0, s0);
+            _mm_storeu_si128(d1, s1);
+            _mm_storeu_si128(d2, s2);
+            _mm_storeu_si128(d3, s3);
+            src += 16;
+            dst += 16;
+            len -= 16;
+            continue;
+        }
+
+        // TODO: This math is wrong.
+        // Do SrcOver.
+        _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
+        _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
+        _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
+        _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
+        src += 16;
+        dst += 16;
+        len -= 16;
+    }
+
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+    while (len >= 16) {
+        // Load 16 source pixels.
+        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
+             s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
+             s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
+             s3 = _mm_loadu_si128((const __m128i*)(src) + 3);
+
+        const auto alphaMask = _mm_set1_epi32(0xFF000000);
+
+        auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
+        if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask),
+                                                       _mm_setzero_si128()))) {
+            // All 16 source pixels are transparent.  Nothing to do.
+            src += 16;
+            dst += 16;
+            len -= 16;
+            continue;
+        }
+
+        auto d0 = (__m128i*)(dst) + 0,
+             d1 = (__m128i*)(dst) + 1,
+             d2 = (__m128i*)(dst) + 2,
+             d3 = (__m128i*)(dst) + 3;
+
+        auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
+        if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask),
+                                                       alphaMask))) {
+            // All 16 source pixels are opaque.  SrcOver becomes Src.
+            _mm_storeu_si128(d0, s0);
+            _mm_storeu_si128(d1, s1);
+            _mm_storeu_si128(d2, s2);
+            _mm_storeu_si128(d3, s3);
+            src += 16;
+            dst += 16;
+            len -= 16;
+            continue;
+        }
+
+        // TODO: This math is wrong.
+        // Do SrcOver.
+        _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
+        _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
+        _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
+        _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
+
+        src += 16;
+        dst += 16;
+        len -= 16;
+    }
+
+#elif defined(SK_ARM_HAS_NEON)
+    while (len >= 4) {
+        if ((src[0] | src[1] | src[2] | src[3]) == 0x00000000) {
+            // All 4 source pixels are transparent.  Nothing to do.
+            src += 4;
+            dst += 4;
+            len -= 4;
+            continue;
+        }
+
+        if ((src[0] & src[1] & src[2] & src[3]) >= 0xFF000000) {
+            // All 4 source pixels are opaque.  SrcOver becomes Src.
+            dst[0] = src[0];
+            dst[1] = src[1];
+            dst[2] = src[2];
+            dst[3] = src[3];
+            src += 4;
+            dst += 4;
+            len -= 4;
+            continue;
+        }
+
+        // Load 4 source and destination pixels.
+        auto src0 = vreinterpret_u8_u32(vld1_u32(src+0)),
+             src2 = vreinterpret_u8_u32(vld1_u32(src+2)),
+             dst0 = vreinterpret_u8_u32(vld1_u32(dst+0)),
+             dst2 = vreinterpret_u8_u32(vld1_u32(dst+2));
+
+        // TODO: This math is wrong.
+        const uint8x8_t alphas = vcreate_u8(0x0707070703030303);
+        auto invSA0_w = vsubw_u8(vdupq_n_u16(256), vtbl1_u8(src0, alphas)),
+             invSA2_w = vsubw_u8(vdupq_n_u16(256), vtbl1_u8(src2, alphas));
+
+        auto dstInvSA0 = vmulq_u16(invSA0_w, vmovl_u8(dst0)),
+             dstInvSA2 = vmulq_u16(invSA2_w, vmovl_u8(dst2));
+
+        dst0 = vadd_u8(src0, vshrn_n_u16(dstInvSA0, 8));
+        dst2 = vadd_u8(src2, vshrn_n_u16(dstInvSA2, 8));
+
+        vst1_u32(dst+0, vreinterpret_u32_u8(dst0));
+        vst1_u32(dst+2, vreinterpret_u32_u8(dst2));
+
+        src += 4;
+        dst += 4;
+        len -= 4;
+    }
+#endif
+
+    while (len-- > 0) {
+        if (*src) {
+            *dst = (*src >= 0xFF000000) ? *src : SkPMSrcOver(*src, *dst);
+        }
+        src++;
+        dst++;
+    }
+}
+
 } // SK_OPTS_NS
 
 #endif//SkBlitRow_opts_DEFINED
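A note on the two SSE4.1 predicates the fast paths above hinge on, since they read a little cryptically: _mm_testz_si128(a, b) is true when (a & b) == 0, and _mm_testc_si128(a, b) is true when (~a & b) == 0. A standalone sketch of the same all-transparent / all-opaque screening on one group of four pixels — illustrative only, not part of the commit:

    #include <smmintrin.h>  // SSE4.1
    #include <cstdint>

    // Classify four 8888 pixels by alpha, mirroring the ORed/ANDed trick above:
    // 0 = all transparent (skip the group), 1 = all opaque (SrcOver becomes Src),
    // 2 = mixed (take the general blend path).
    static int classify4(const uint32_t px[4]) {
        const __m128i v         = _mm_loadu_si128((const __m128i*)px);
        const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
        if (_mm_testz_si128(v, alphaMask)) { return 0; }  // (v & mask) == 0: every alpha is 0x00
        if (_mm_testc_si128(v, alphaMask)) { return 1; }  // (~v & mask) == 0: every alpha is 0xFF
        return 2;
    }

The SSE2 path has no such predicates, which is why it spells the same two tests out with _mm_cmpeq_epi8 plus _mm_movemask_epi8 against 0xffff.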
@@ -67,61 +67,6 @@ void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
     }
 }
 
-void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
-                                const SkPMColor* SK_RESTRICT src,
-                                int count, U8CPU alpha) {
-    sk_msan_assert_initialized(src, src+count);
-
-    SkASSERT(alpha == 255);
-    if (count <= 0) {
-        return;
-    }
-
-    int count16 = count / 16;
-    __m128i* dst4 = (__m128i*)dst;
-    const __m128i* src4 = (const __m128i*)src;
-
-    for (int i = 0; i < count16 * 4; i += 4) {
-        // Load 16 source pixels.
-        __m128i s0 = _mm_loadu_si128(src4+i+0),
-                s1 = _mm_loadu_si128(src4+i+1),
-                s2 = _mm_loadu_si128(src4+i+2),
-                s3 = _mm_loadu_si128(src4+i+3);
-
-        const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
-        const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
-        __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
-        if (0xffff == _mm_movemask_epi8(cmp)) {
-            // All 16 source pixels are fully transparent.  There's nothing to do!
-            continue;
-        }
-        const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
-        cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
-        if (0xffff == _mm_movemask_epi8(cmp)) {
-            // All 16 source pixels are fully opaque.  There's no need to read dst or blend it.
-            _mm_storeu_si128(dst4+i+0, s0);
-            _mm_storeu_si128(dst4+i+1, s1);
-            _mm_storeu_si128(dst4+i+2, s2);
-            _mm_storeu_si128(dst4+i+3, s3);
-            continue;
-        }
-        // The general slow case: do the blend for all 16 pixels.
-        _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
-        _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
-        _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
-        _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
-    }
-
-    // Wrap up the last <= 15 pixels.
-    SkASSERT(count - (count16*16) <= 15);
-    for (int i = count16*16; i < count; i++) {
-        // This check is not really necessary, but it prevents pointless autovectorization.
-        if (src[i] & 0xFF000000) {
-            dst[i] = SkPMSrcOver(src[i], dst[i]);
-        }
-    }
-}
-
 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
@@ -14,10 +14,6 @@ void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha);
 
-void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
-                                const SkPMColor* SK_RESTRICT src,
-                                int count, U8CPU alpha);
-
 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha);
@@ -1,74 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "SkBlitRow_opts_SSE4.h"
-
-// Some compilers can't compile SSSE3 or SSE4 intrinsics.  We give them stub methods.
-// The stubs should never be called, so we make them crash just to confirm that.
-#if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41
-void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) {
-    sk_throw();
-}
-
-#else
-
-#include <smmintrin.h>      // SSE4.1 intrinsics
-#include "SkColorPriv.h"
-#include "SkColor_opts_SSE2.h"
-#include "SkMSAN.h"
-
-void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
-                                const SkPMColor* SK_RESTRICT src,
-                                int count,
-                                U8CPU alpha) {
-    sk_msan_assert_initialized(src, src+count);
-
-    SkASSERT(alpha == 255);
-    // As long as we can, we'll work on 16 pixel pairs at once.
-    int count16 = count / 16;
-    __m128i* dst4 = (__m128i*)dst;
-    const __m128i* src4 = (const __m128i*)src;
-
-    for (int i = 0; i < count16 * 4; i += 4) {
-        // Load 16 source pixels.
-        __m128i s0 = _mm_loadu_si128(src4+i+0),
-                s1 = _mm_loadu_si128(src4+i+1),
-                s2 = _mm_loadu_si128(src4+i+2),
-                s3 = _mm_loadu_si128(src4+i+3);
-
-        const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
-        const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
-        if (_mm_testz_si128(ORed, alphaMask)) {
-            // All 16 source pixels are fully transparent.  There's nothing to do!
-            continue;
-        }
-        const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
-        if (_mm_testc_si128(ANDed, alphaMask)) {
-            // All 16 source pixels are fully opaque.  There's no need to read dst or blend it.
-            _mm_storeu_si128(dst4+i+0, s0);
-            _mm_storeu_si128(dst4+i+1, s1);
-            _mm_storeu_si128(dst4+i+2, s2);
-            _mm_storeu_si128(dst4+i+3, s3);
-            continue;
-        }
-        // The general slow case: do the blend for all 16 pixels.
-        _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
-        _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
-        _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
-        _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
-    }
-
-    // Wrap up the last <= 15 pixels.
-    for (int i = count16*16; i < count; i++) {
-        // This check is not really necessary, but it prevents pointless autovectorization.
-        if (src[i] & 0xFF000000) {
-            dst[i] = SkPMSrcOver(src[i], dst[i]);
-        }
-    }
-}
-
-#endif
@@ -1,18 +0,0 @@
-/*
- * Copyright 2014 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef SkBlitRow_opts_SSE4_DEFINED
-#define SkBlitRow_opts_SSE4_DEFINED
-
-#include "SkBlitRow.h"
-
-void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT,
-                                const SkPMColor* SK_RESTRICT,
-                                int count,
-                                U8CPU alpha);
-
-#endif
@@ -871,282 +871,6 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
     }
 }
 
-void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
-                                const SkPMColor* SK_RESTRICT src,
-                                int count, U8CPU alpha) {
-
-    SkASSERT(255 == alpha);
-    if (count > 0) {
-
-        uint8x8_t alpha_mask;
-
-        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
-        alpha_mask = vld1_u8(alpha_mask_setup);
-
-        /* do the NEON unrolled code */
-#define UNROLL 4
-        while (count >= UNROLL) {
-            uint8x8_t src_raw, dst_raw, dst_final;
-            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
-
-            /* The two prefetches below may make the code slightly
-             * slower for small values of count but are worth having
-             * in the general case.
-             */
-            __builtin_prefetch(src+32);
-            __builtin_prefetch(dst+32);
-
-            /* get the source */
-            src_raw = vreinterpret_u8_u32(vld1_u32(src));
-#if UNROLL > 2
-            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
-#endif
-
-            /* get and hold the dst too */
-            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
-#if UNROLL > 2
-            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
-#endif
-
-            /* 1st and 2nd bits of the unrolling */
-            {
-                uint8x8_t dst_cooked;
-                uint16x8_t dst_wide;
-                uint8x8_t alpha_narrow;
-                uint16x8_t alpha_wide;
-
-                /* get the alphas spread out properly */
-                alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
-                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-                /* spread the dest */
-                dst_wide = vmovl_u8(dst_raw);
-
-                /* alpha mul the dest */
-                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-                dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-                /* sum -- ignoring any byte lane overflows */
-                dst_final = vadd_u8(src_raw, dst_cooked);
-            }
-
-#if UNROLL > 2
-            /* the 3rd and 4th bits of our unrolling */
-            {
-                uint8x8_t dst_cooked;
-                uint16x8_t dst_wide;
-                uint8x8_t alpha_narrow;
-                uint16x8_t alpha_wide;
-
-                alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
-                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-                /* spread the dest */
-                dst_wide = vmovl_u8(dst_raw_2);
-
-                /* alpha mul the dest */
-                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-                dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-                /* sum -- ignoring any byte lane overflows */
-                dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
-            }
-#endif
-
-            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
-#if UNROLL > 2
-            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
-#endif
-
-            src += UNROLL;
-            dst += UNROLL;
-            count -= UNROLL;
-        }
-#undef UNROLL
-
-        /* do any residual iterations */
-        while (--count >= 0) {
-            *dst = SkPMSrcOver(*src, *dst);
-            src += 1;
-            dst += 1;
-        }
-    }
-}
-
-void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
-                                          const SkPMColor* SK_RESTRICT src,
-                                          int count, U8CPU alpha) {
-    SkASSERT(255 == alpha);
-
-    if (count <= 0)
-        return;
-
-    /* Use these to check if src is transparent or opaque */
-    const unsigned int ALPHA_OPAQ  = 0xFF000000;
-    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
-
-#define UNROLL 4
-    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
-    const SkPMColor* SK_RESTRICT src_temp = src;
-
-    /* set up the NEON variables */
-    uint8x8_t alpha_mask;
-    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
-    alpha_mask = vld1_u8(alpha_mask_setup);
-
-    uint8x8_t src_raw, dst_raw, dst_final;
-    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
-    uint8x8_t dst_cooked;
-    uint16x8_t dst_wide;
-    uint8x8_t alpha_narrow;
-    uint16x8_t alpha_wide;
-
-    /* choose the first processing type */
-    if( src >= src_end)
-        goto TAIL;
-    if(*src <= ALPHA_TRANS)
-        goto ALPHA_0;
-    if(*src >= ALPHA_OPAQ)
-        goto ALPHA_255;
-    /* fall-thru */
-
-ALPHA_1_TO_254:
-    do {
-
-        /* get the source */
-        src_raw = vreinterpret_u8_u32(vld1_u32(src));
-        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
-
-        /* get and hold the dst too */
-        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
-        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
-
-
-        /* get the alphas spread out properly */
-        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
-        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
-        /* we collapsed (255-a)+1 ... */
-        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-        /* spread the dest */
-        dst_wide = vmovl_u8(dst_raw);
-
-        /* alpha mul the dest */
-        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-        dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-        /* sum -- ignoring any byte lane overflows */
-        dst_final = vadd_u8(src_raw, dst_cooked);
-
-        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
-        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
-        /* we collapsed (255-a)+1 ... */
-        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-        /* spread the dest */
-        dst_wide = vmovl_u8(dst_raw_2);
-
-        /* alpha mul the dest */
-        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-        dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-        /* sum -- ignoring any byte lane overflows */
-        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
-
-        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
-        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
-
-        src += UNROLL;
-        dst += UNROLL;
-
-        /* if 2 of the next pixels aren't between 1 and 254
-           it might make sense to go to the optimized loops */
-        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
-            break;
-
-    } while(src < src_end);
-
-    if (src >= src_end)
-        goto TAIL;
-
-    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
-        goto ALPHA_255;
-
-    /*fall-thru*/
-
-ALPHA_0:
-
-    /*In this state, we know the current alpha is 0 and
-      we optimize for the next alpha also being zero. */
-    src_temp = src;  //so we don't have to increment dst every time
-    do {
-        if(*(++src) > ALPHA_TRANS)
-            break;
-        if(*(++src) > ALPHA_TRANS)
-            break;
-        if(*(++src) > ALPHA_TRANS)
-            break;
-        if(*(++src) > ALPHA_TRANS)
-            break;
-    } while(src < src_end);
-
-    dst += (src - src_temp);
-
-    /* no longer alpha 0, so determine where to go next. */
-    if( src >= src_end)
-        goto TAIL;
-    if(*src >= ALPHA_OPAQ)
-        goto ALPHA_255;
-    else
-        goto ALPHA_1_TO_254;
-
-ALPHA_255:
-    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
-        dst[0]=src[0];
-        dst[1]=src[1];
-        dst[2]=src[2];
-        dst[3]=src[3];
-        src+=UNROLL;
-        dst+=UNROLL;
-        if(src >= src_end)
-            goto TAIL;
-    }
-
-    //Handle remainder.
-    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
-        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
-            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
-        }
-    }
-
-    if( src >= src_end)
-        goto TAIL;
-    if(*src <= ALPHA_TRANS)
-        goto ALPHA_0;
-    else
-        goto ALPHA_1_TO_254;
-
-TAIL:
-    /* do any residual iterations */
-    src_end += UNROLL + 1;  //goto the real end
-    while(src != src_end) {
-        if( *src != 0 ) {
-            if( *src >= ALPHA_OPAQ ) {
-                *dst = *src;
-            }
-            else {
-                *dst = SkPMSrcOver(*src, *dst);
-            }
-        }
-        src++;
-        dst++;
-    }
-
-#undef UNROLL
-    return;
-}
-
 /* Neon version of S32_Blend_BlitRow32()
  * portable version is in src/core/SkBlitRow_D32.cpp
  */
@@ -1561,21 +1285,7 @@ const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
     nullptr,   // S32_Opaque,
     S32_Blend_BlitRow32_neon,        // S32_Blend,
-    /*
-     * We have two choices for S32A_Opaque procs. The one reads the src alpha
-     * value and attempts to optimize accordingly.  The optimization is
-     * sensitive to the source content and is not a win in all cases. For
-     * example, if there are a lot of transitions between the alpha states,
-     * the performance will almost certainly be worse.  However, for many
-     * common cases the performance is equivalent or better than the standard
-     * case where we do not inspect the src alpha.
-     */
-#if SK_A32_SHIFT == 24
-    // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
-    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
-#else
-    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
-#endif
+    nullptr,  // Ported to SkOpts
 #ifdef SK_CPU_ARM32
     S32A_Blend_BlitRow32_neon        // S32A_Blend
 #else
@@ -36,7 +36,8 @@ namespace SkOpts {
 
         blit_mask_d32_a8 = sk_neon::blit_mask_d32_a8;
 
-        blit_row_color32 = sk_neon::blit_row_color32;
+        blit_row_color32     = sk_neon::blit_row_color32;
+        blit_row_s32a_opaque = sk_neon::blit_row_s32a_opaque;
 
         color_cube_filter_span = sk_neon::color_cube_filter_span;
 
@@ -9,10 +9,11 @@
 
 #define SK_OPTS_NS sk_sse41
 #include "SkBlurImageFilter_opts.h"
+#include "SkBlitRow_opts.h"
 
 #ifndef SK_SUPPORT_LEGACY_X86_BLITS
 
-namespace sk_sse41 {
+namespace sk_sse41_new {
 
 // An SSE register holding at most 64 bits of useful data in the low lanes.
 struct m64i {
@@ -211,7 +212,7 @@ static void blit_mask_d32_a8(SkPMColor* dst, size_t dstRB,
     }
 }
 
-}  // namespace sk_sse41
+}  // namespace sk_sse41_new
 
 #endif
 
@@ -222,8 +223,9 @@ namespace SkOpts {
     box_blur_yx = sk_sse41::box_blur_yx;
 
 #ifndef SK_SUPPORT_LEGACY_X86_BLITS
-    blit_row_color32 = sk_sse41::blit_row_color32;
-    blit_mask_d32_a8 = sk_sse41::blit_mask_d32_a8;
+    blit_row_color32 = sk_sse41_new::blit_row_color32;
+    blit_mask_d32_a8 = sk_sse41_new::blit_mask_d32_a8;
 #endif
+    blit_row_s32a_opaque = sk_sse41::blit_row_s32a_opaque;
 }
 }
@@ -12,7 +12,6 @@
 #include "SkBlitMask.h"
 #include "SkBlitRow.h"
 #include "SkBlitRow_opts_SSE2.h"
-#include "SkBlitRow_opts_SSE4.h"
 #include "SkOncePtr.h"
 #include "SkRTConf.h"
 
@@ -215,21 +214,11 @@ SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) {
 static const SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
     nullptr,                               // S32_Opaque,
     S32_Blend_BlitRow32_SSE2,              // S32_Blend,
-    S32A_Opaque_BlitRow32_SSE2,            // S32A_Opaque
-    S32A_Blend_BlitRow32_SSE2,             // S32A_Blend,
-};
-
-static const SkBlitRow::Proc32 platform_32_procs_SSE4[] = {
-    nullptr,                               // S32_Opaque,
-    S32_Blend_BlitRow32_SSE2,              // S32_Blend,
-    S32A_Opaque_BlitRow32_SSE4,            // S32A_Opaque
+    nullptr,                               // Ported to SkOpts
     S32A_Blend_BlitRow32_SSE2,             // S32A_Blend,
 };
 
 SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
-    if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
-        return platform_32_procs_SSE4[flags];
-    } else
     if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
         return platform_32_procs_SSE2[flags];
     } else {