Revert "Optimize SkBlend by using NEON intrinsics"

This reverts commit 7adde145d3.

Reason for revert: may be breaking our Android One test bots.

Original change's description:
> Optimize SkBlend by using NEON intrinsics
> 
> Use NEON intrinsics to check the alpha channel of the pixels.
> 
> In some cases, it's about 14 times faster than the original implementation.
> 
> $ ./bin/droid out/arm64_release/nanobench  --samples 300 --nompd --match LinearSrcOver -v > neon_opt.log
> $ ./bin/compare neon_opt.log clean.log
>      LinearSrcOver_yellow_rose.pngVSkOptsDefault         1.8ms -> 24.9ms        13.8x
>        LinearSrcOver_iconstrip.pngVSkOptsDefault        5.71ms -> 69.8ms        12.2x
>            LinearSrcOver_plane.pngVSkOptsDefault        1.45ms ->   11ms        7.62x
>         LinearSrcOver_baby_tux.pngVSkOptsDefault        1.88ms -> 9.96ms        5.29x
>     LinearSrcOver_mandrill_512.pngVSkOptsDefault        1.41ms -> 4.62ms        3.29x
>      LinearSrcOver_yellow_rose.pngVSkOptsTrivial        24.9ms -> 24.9ms        1x
>  LinearSrcOver_yellow_rose.pngVSkOptsNonSimdCore        2.17ms -> 2.18ms        1x
>            LinearSrcOver_plane.pngVSkOptsTrivial        11.1ms -> 11.1ms        1x
>        LinearSrcOver_plane.pngVSkOptsNonSimdCore         1.5ms ->  1.5ms        1x
> LinearSrcOver_mandrill_512.pngVSkOptsNonSimdCore        2.39ms -> 2.39ms        1x
>    LinearSrcOver_iconstrip.pngVSkOptsNonSimdCore        6.43ms -> 6.43ms        1x
>      LinearSrcOver_baby_tux.pngVSkOptsBruteForce        22.3ms -> 22.3ms        1x
>   LinearSrcOver_yellow_rose.pngVSkOptsBruteForce        45.5ms -> 45.5ms        1x
>     LinearSrcOver_baby_tux.pngVSkOptsNonSimdCore        2.02ms -> 2.02ms        1x
>        LinearSrcOver_iconstrip.pngVSkOptsTrivial        69.7ms -> 69.7ms        1x
>         LinearSrcOver_baby_tux.pngVSkOptsTrivial        9.96ms -> 9.95ms        1x
>  LinearSrcOver_mandrill_512.pngVSkOptsBruteForce        99.3ms -> 99.2ms        1x
> 
> BUG=skia:
> 
> CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD
> 
> Change-Id: Ia576365578d65b771440da65fdf41f090ccf0541
> Reviewed-on: https://skia-review.googlesource.com/6860
> Reviewed-by: Mike Klein <mtklein@chromium.org>
> Commit-Queue: Mike Klein <mtklein@chromium.org>
> 

TBR=mtklein@chromium.org,bsalomon@google.com,joel.liang@arm.com,reviews@skia.org
BUG=skia:
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true

Change-Id: Ie40eb5a7c27807aaf396429a82a1a2dd328c2b5b
Reviewed-on: https://skia-review.googlesource.com/7036
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Mike Klein <mtklein@chromium.org>
This commit is contained in:
Mike Klein 2017-01-13 20:10:09 +00:00 committed by Skia Commit-Bot
parent 43475ad9dc
commit 1d8e198d75

View File

@ -17,8 +17,6 @@ ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; an
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
#include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
#include <arm_neon.h>
#endif
namespace SK_OPTS_NS {
@ -27,8 +25,6 @@ static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) {
if (src >= 0xFF000000) {
*dst = src;
return;
} else if (src <= 0x00FFFFFF) {
return;
}
auto d = Sk4f_fromS32(*dst),
s = Sk4f_fromS32( src);
@ -170,87 +166,6 @@ static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) {
}
}
#endif
#elif defined(SK_ARM_HAS_NEON)
// Loads four consecutive 32-bit values starting at p into one NEON vector.
static inline uint32x4_t load(const uint32_t* p) {
    const uint32x4_t lanes = vld1q_u32(p);
    return lanes;
}
// Stores the four 32-bit lanes of v to consecutive memory starting at p.
static inline void store(uint32_t* p, uint32x4_t v) {
vst1q_u32(p, v);
}
// Returns true when every one of the four lanes is >= 0xFF000000,
// i.e. the top byte of each pixel is 0xFF.
static inline bool check_opaque_alphas(uint32x4_t pixels) {
    // Lanes strictly below 0xFF000000 compare to all-ones, the rest to zero.
    const uint32x4_t belowOpaque = vcltq_u32(pixels, vdupq_n_u32(0xFF000000));
    // Narrow each 32-bit lane to 16 bits so all four results fit in one
    // 64-bit scalar; a lane is all-ones or zero, so no information is lost.
    const uint16x4_t narrowed = vmovn_u32(belowOpaque);
    const uint64_t anyBelow = vget_lane_u64(vreinterpret_u64_u16(narrowed), 0);
    return anyBelow == 0;
}
// Returns true when every one of the four lanes is <= 0x00FFFFFF,
// i.e. the top byte of each pixel is 0x00.
static inline bool check_transparent_alphas(uint32x4_t pixels) {
    // Lanes strictly above 0x00FFFFFF compare to all-ones, the rest to zero.
    const uint32x4_t aboveTransparent = vcgtq_u32(pixels, vdupq_n_u32(0x00FFFFFF));
    // Narrow each 32-bit lane to 16 bits so all four results fit in one
    // 64-bit scalar; a lane is all-ones or zero, so no information is lost.
    const uint16x4_t narrowed = vmovn_u32(aboveTransparent);
    const uint64_t anyAbove = vget_lane_u64(vreinterpret_u64_u16(narrowed), 0);
    return anyAbove == 0;
}
// Returns true when every one of the four lanes lies strictly between
// 0x00FFFFFF and 0xFF000000, i.e. no pixel's top byte is 0x00 or 0xFF.
static inline bool check_partial_alphas(uint32x4_t pixels) {
    // All-ones where the lane is NOT fully opaque (value < 0xFF000000).
    const uint32x4_t belowOpaque = vcltq_u32(pixels, vdupq_n_u32(0xFF000000));
    // All-ones where the lane is NOT fully transparent (value > 0x00FFFFFF).
    const uint32x4_t aboveTransparent = vcgtq_u32(pixels, vdupq_n_u32(0x00FFFFFF));
    // A lane can never fail both comparisons, so the XOR is zero exactly
    // when both hold, which is the partially transparent case.
    const uint32x4_t notPartial = veorq_u32(belowOpaque, aboveTransparent);
    const uint16x4_t narrowed = vmovn_u32(notPartial);
    const uint64_t anyNotPartial = vget_lane_u64(vreinterpret_u64_u16(narrowed), 0);
    return anyNotPartial == 0;
}
// Blends src over dst for ndst destination pixels.
//
// The source run holds nsrc pixels and is restarted from srcStart for each
// chunk of min(ndst, nsrc) destination pixels, so it repeats until ndst
// pixels have been processed. Within a chunk, groups of four pixels are
// classified with the check_*_alphas helpers:
//   - all opaque:      the source pixels are copied straight to dst;
//   - all transparent: dst is left untouched;
//   - anything else:   srcover_srgb_srgb_4 blends the group.
// The remaining (count & 3) pixels go through srcover_srgb_srgb_1.
//
// NOTE(review): if nsrc <= 0, count is <= 0 and ndst never decreases, so the
// outer loop makes no progress; callers presumably pass nsrc > 0 -- confirm.
static void srcover_srgb_srgb(
    uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
    while (ndst > 0) {
        int count = SkTMin(ndst, nsrc);
        ndst -= count;
        const uint32_t* src = srcStart;
        // End of the part of this chunk that can be handled four pixels at a time.
        const uint32_t* end = dst + (count & ~3);
        // Offset from a dst position to the matching src position, so the
        // inner loops only need to advance dst.
        const ptrdiff_t delta = src - dst;

        // Only enter the vector path when at least one full group of four
        // pixels exists. Without this guard a chunk of fewer than four pixels
        // would still load four source pixels and run the do/while body once,
        // reading and possibly writing past the end of the run.
        if (dst < end) {
            uint32x4_t pixels = load(src);
            do {
                if (check_opaque_alphas(pixels)) {
                    uint32_t* start = dst;
                    do {
                        store(dst, pixels);
                        dst += 4;
                    } while (dst < end && check_opaque_alphas((pixels = load(dst + delta))));
                    src += dst - start;
                } else if (check_transparent_alphas(pixels)) {
                    const uint32_t* start = dst;
                    do {
                        dst += 4;
                    } while (dst < end && check_transparent_alphas(pixels = load(dst + delta)));
                    src += dst - start;
                } else {
                    const uint32_t* start = dst;
                    do {
                        srcover_srgb_srgb_4(dst, dst + delta);
                        dst += 4;
                    } while (dst < end && check_partial_alphas(pixels = load(dst + delta)));
                    src += dst - start;
                }
            } while (dst < end);
        }

        // Scalar tail: the 0-3 pixels left over after the groups of four.
        count = count & 3;
        while (count-- > 0) {
            srcover_srgb_srgb_1(dst++, *src++);
        }
    }
}
#else
static void srcover_srgb_srgb(