Revert "Optimize SkBlend by using NEON intrinsics"
This reverts commit 7adde145d3.
Reason for revert: may be breaking our Android One test bots.
Original change's description:
> Optimize SkBlend by using NEON intrinsics
>
> Use NEON intrinsics to check the alpha channel of the pixels.
>
> In some cases, it's about 14 times faster than the original implementation.
>
> $ ./bin/droid out/arm64_release/nanobench --samples 300 --nompd --match LinearSrcOver -v > neon_opt.log
> $ ./bin/compare neon_opt.log clean.log
> LinearSrcOver_yellow_rose.pngVSkOptsDefault 1.8ms -> 24.9ms 13.8x
> LinearSrcOver_iconstrip.pngVSkOptsDefault 5.71ms -> 69.8ms 12.2x
> LinearSrcOver_plane.pngVSkOptsDefault 1.45ms -> 11ms 7.62x
> LinearSrcOver_baby_tux.pngVSkOptsDefault 1.88ms -> 9.96ms 5.29x
> LinearSrcOver_mandrill_512.pngVSkOptsDefault 1.41ms -> 4.62ms 3.29x
> LinearSrcOver_yellow_rose.pngVSkOptsTrivial 24.9ms -> 24.9ms 1x
> LinearSrcOver_yellow_rose.pngVSkOptsNonSimdCore 2.17ms -> 2.18ms 1x
> LinearSrcOver_plane.pngVSkOptsTrivial 11.1ms -> 11.1ms 1x
> LinearSrcOver_plane.pngVSkOptsNonSimdCore 1.5ms -> 1.5ms 1x
> LinearSrcOver_mandrill_512.pngVSkOptsNonSimdCore 2.39ms -> 2.39ms 1x
> LinearSrcOver_iconstrip.pngVSkOptsNonSimdCore 6.43ms -> 6.43ms 1x
> LinearSrcOver_baby_tux.pngVSkOptsBruteForce 22.3ms -> 22.3ms 1x
> LinearSrcOver_yellow_rose.pngVSkOptsBruteForce 45.5ms -> 45.5ms 1x
> LinearSrcOver_baby_tux.pngVSkOptsNonSimdCore 2.02ms -> 2.02ms 1x
> LinearSrcOver_iconstrip.pngVSkOptsTrivial 69.7ms -> 69.7ms 1x
> LinearSrcOver_baby_tux.pngVSkOptsTrivial 9.96ms -> 9.95ms 1x
> LinearSrcOver_mandrill_512.pngVSkOptsBruteForce 99.3ms -> 99.2ms 1x
>
> BUG=skia:
>
> CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD
>
> Change-Id: Ia576365578d65b771440da65fdf41f090ccf0541
> Reviewed-on: https://skia-review.googlesource.com/6860
> Reviewed-by: Mike Klein <mtklein@chromium.org>
> Commit-Queue: Mike Klein <mtklein@chromium.org>
>
TBR=mtklein@chromium.org,bsalomon@google.com,joel.liang@arm.com,reviews@skia.org
BUG=skia:
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true
Change-Id: Ie40eb5a7c27807aaf396429a82a1a2dd328c2b5b
Reviewed-on: https://skia-review.googlesource.com/7036
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Mike Klein <mtklein@chromium.org>
This commit is contained in:
parent
43475ad9dc
commit
1d8e198d75
@ -17,8 +17,6 @@ ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; an
|
||||
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
|
||||
#include <immintrin.h>
|
||||
#elif defined(SK_ARM_HAS_NEON)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
namespace SK_OPTS_NS {
|
||||
@ -27,8 +25,6 @@ static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) {
|
||||
if (src >= 0xFF000000) {
|
||||
*dst = src;
|
||||
return;
|
||||
} else if (src <= 0x00FFFFFF) {
|
||||
return;
|
||||
}
|
||||
auto d = Sk4f_fromS32(*dst),
|
||||
s = Sk4f_fromS32( src);
|
||||
@ -170,87 +166,6 @@ static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) {
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#elif defined(SK_ARM_HAS_NEON)
|
||||
// Reads four consecutive 32-bit pixels starting at `p` into one NEON vector.
static inline uint32x4_t load(const uint32_t* p) {
    const uint32x4_t quad = vld1q_u32(p);
    return quad;
}
|
||||
|
||||
// Writes the four 32-bit lanes of `v` to the four consecutive pixels at `p`.
static inline void store(uint32_t* p, uint32x4_t v)
{
    vst1q_u32(p, v);
}
|
||||
|
||||
// Returns true when all four pixels are >= 0xFF000000 as 32-bit values,
// i.e. when the top byte of every pixel is 0xFF.
static inline bool check_opaque_alphas(uint32x4_t pixels) {
    const uint32x4_t threshold = vdupq_n_u32(0xFF000000);
    // Lanes become all-ones where the pixel is below the threshold.
    const uint32x4_t below = vcltq_u32(pixels, threshold);
    // Narrow each 32-bit lane to 16 bits so the four results fit in one 64-bit scalar.
    const uint16x4_t narrowed = vmovn_u32(below);
    const uint64_t bits = vget_lane_u64(vreinterpret_u64_u16(narrowed), 0);
    return bits == 0;
}
|
||||
|
||||
// Returns true when all four pixels are <= 0x00FFFFFF as 32-bit values,
// i.e. when the top byte of every pixel is 0x00.
static inline bool check_transparent_alphas(uint32x4_t pixels) {
    const uint32x4_t threshold = vdupq_n_u32(0x00FFFFFF);
    // Lanes become all-ones where the pixel is above the threshold.
    const uint32x4_t above = vcgtq_u32(pixels, threshold);
    // Narrow each 32-bit lane to 16 bits so the four results fit in one 64-bit scalar.
    const uint16x4_t narrowed = vmovn_u32(above);
    const uint64_t bits = vget_lane_u64(vreinterpret_u64_u16(narrowed), 0);
    return bits == 0;
}
|
||||
|
||||
// Returns true when every one of the four pixels lies strictly between
// 0x00FFFFFF and 0xFF000000, i.e. its top byte is neither 0x00 nor 0xFF.
static inline bool check_partial_alphas(uint32x4_t pixels) {
    // All-ones in lanes whose pixel is below 0xFF000000 (top byte != 0xFF).
    const uint32x4_t not_opaque = vcltq_u32(pixels, vdupq_n_u32(0xFF000000));
    // All-ones in lanes whose pixel is above 0x00FFFFFF (top byte != 0x00).
    const uint32x4_t not_transparent = vcgtq_u32(pixels, vdupq_n_u32(0x00FFFFFF));
    // A lane XORs to zero only when both masks are set; a pixel cannot clear both,
    // so any non-zero lane marks a pixel with a 0x00 or 0xFF top byte.
    const uint32x4_t differs = veorq_u32(not_opaque, not_transparent);
    const uint16x4_t narrowed = vmovn_u32(differs);
    const uint64_t bits = vget_lane_u64(vreinterpret_u64_u16(narrowed), 0);
    return bits == 0;
}
|
||||
|
||||
static void srcover_srgb_srgb(
|
||||
uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
|
||||
while (ndst > 0) {
|
||||
int count = SkTMin(ndst, nsrc);
|
||||
ndst -= count;
|
||||
const uint32_t* src = srcStart;
|
||||
const uint32_t* end = dst + (count & ~3);
|
||||
const ptrdiff_t delta = src - dst;
|
||||
|
||||
uint32x4_t pixels = load(src);
|
||||
do {
|
||||
if (check_opaque_alphas(pixels)) {
|
||||
uint32_t* start = dst;
|
||||
do {
|
||||
store(dst, pixels);
|
||||
dst += 4;
|
||||
} while (dst < end && check_opaque_alphas((pixels = load(dst + delta))));
|
||||
src += dst - start;
|
||||
} else if (check_transparent_alphas(pixels)) {
|
||||
const uint32_t* start = dst;
|
||||
do {
|
||||
dst += 4;
|
||||
} while (dst < end && check_transparent_alphas(pixels = load(dst + delta)));
|
||||
src += dst - start;
|
||||
} else {
|
||||
const uint32_t* start = dst;
|
||||
do {
|
||||
srcover_srgb_srgb_4(dst, dst + delta);
|
||||
dst += 4;
|
||||
} while (dst < end && check_partial_alphas(pixels = load(dst + delta)));
|
||||
src += dst - start;
|
||||
}
|
||||
} while (dst < end);
|
||||
|
||||
count = count & 3;
|
||||
while (count-- > 0) {
|
||||
srcover_srgb_srgb_1(dst++, *src++);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
||||
static void srcover_srgb_srgb(
|
||||
|
Loading…
Reference in New Issue
Block a user