clean up SkBlitRow_opts
SSE2 and NEON are common baseline instruction sets now, so there's no need to runtime detect support for these routines. I simplified the SSE and portable implementations while moving them. Cq-Include-Trybots: master.tryserver.blink:linux_trusty_blink_rel Change-Id: I34e96851735c8d7ad90198f3ac4bf86ff508f17c Reviewed-on: https://skia-review.googlesource.com/c/170220 Reviewed-by: Mike Klein <mtklein@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
parent
7600cb3566
commit
6a2c42f893
@ -9,12 +9,10 @@ _src = get_path_info("../src", "abspath")
|
||||
none = [
|
||||
"$_src/opts/Sk4px_none.h",
|
||||
"$_src/opts/SkBitmapProcState_opts_none.cpp",
|
||||
"$_src/opts/SkBlitRow_opts_none.cpp",
|
||||
]
|
||||
|
||||
armv7 = [
|
||||
"$_src/opts/SkBitmapProcState_opts_none.cpp",
|
||||
"$_src/opts/SkBlitRow_opts_arm.cpp",
|
||||
]
|
||||
|
||||
neon = [
|
||||
@ -23,8 +21,6 @@ neon = [
|
||||
"$_src/opts/SkBitmapProcState_filter_neon.h",
|
||||
"$_src/opts/SkBitmapProcState_matrixProcs_neon.cpp",
|
||||
"$_src/opts/SkBitmapProcState_matrix_neon.h",
|
||||
"$_src/opts/SkBlitRow_opts_arm_neon.h",
|
||||
"$_src/opts/SkBlitRow_opts_arm_neon.cpp",
|
||||
"$_src/opts/SkColor_opts_neon.h",
|
||||
]
|
||||
|
||||
@ -35,9 +31,6 @@ arm64 = [
|
||||
"$_src/opts/SkBitmapProcState_matrixProcs_neon.cpp",
|
||||
"$_src/opts/SkBitmapProcState_matrix_neon.h",
|
||||
"$_src/opts/SkBitmapProcState_opts_none.cpp",
|
||||
"$_src/opts/SkBlitRow_opts_arm.cpp",
|
||||
"$_src/opts/SkBlitRow_opts_arm_neon.h",
|
||||
"$_src/opts/SkBlitRow_opts_arm_neon.cpp",
|
||||
"$_src/opts/SkColor_opts_neon.h",
|
||||
]
|
||||
|
||||
@ -47,7 +40,6 @@ sse2 = [
|
||||
"$_src/opts/Sk4px_SSE2.h",
|
||||
"$_src/opts/SkBitmapProcState_opts_SSE2.h",
|
||||
"$_src/opts/SkBitmapProcState_opts_SSE2.cpp",
|
||||
"$_src/opts/SkBlitRow_opts_SSE2.cpp",
|
||||
"$_src/opts/SkColor_opts_SSE2.h",
|
||||
"$_src/opts/opts_check_x86.cpp",
|
||||
]
|
||||
|
@ -33,19 +33,6 @@ public:
|
||||
if they are not, they may not overlap.
|
||||
*/
|
||||
static void Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color);
|
||||
|
||||
/** These static functions are called by the Factory and Factory32
|
||||
functions, and should return either NULL, or a
|
||||
platform-specific function-ptr to be used in place of the
|
||||
system default.
|
||||
*/
|
||||
|
||||
static Proc32 PlatformProcs32(unsigned flags);
|
||||
|
||||
private:
|
||||
enum {
|
||||
kFlags32_Mask = 3
|
||||
};
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -11,103 +11,233 @@
|
||||
#include "SkOpts.h"
|
||||
#include "SkUtils.h"
|
||||
|
||||
#define UNROLL
|
||||
|
||||
static void S32_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src,
|
||||
int count, U8CPU alpha) {
|
||||
// Everyone agrees memcpy() is the best way to do this.
|
||||
static void blit_row_s32_opaque(SkPMColor* dst,
|
||||
const SkPMColor* src,
|
||||
int count,
|
||||
U8CPU alpha) {
|
||||
SkASSERT(255 == alpha);
|
||||
memcpy(dst, src, count * 4);
|
||||
memcpy(dst, src, count * sizeof(SkPMColor));
|
||||
}
|
||||
|
||||
static void S32_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src,
|
||||
int count, U8CPU alpha) {
|
||||
SkASSERT(alpha <= 255);
|
||||
if (count > 0) {
|
||||
unsigned src_scale = SkAlpha255To256(alpha);
|
||||
// We have SSE2, NEON, and portable implementations of
|
||||
// blit_row_s32_blend() and blit_row_s32a_blend().
|
||||
|
||||
#ifdef UNROLL
|
||||
if (count & 1) {
|
||||
*dst = SkPMLerp(*src, *dst, src_scale);
|
||||
src += 1;
|
||||
dst += 1;
|
||||
count -= 1;
|
||||
// TODO(mtklein): can we do better in NEON than 2 pixels at a time?
|
||||
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
|
||||
#include <emmintrin.h>
|
||||
#include "SkColor_opts_SSE2.h"
|
||||
|
||||
static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
|
||||
SkASSERT(alpha <= 255);
|
||||
|
||||
auto src4 = (const __m128i*)src;
|
||||
auto dst4 = ( __m128i*)dst;
|
||||
|
||||
while (count >= 4) {
|
||||
_mm_storeu_si128(dst4, SkPMLerp_SSE2(_mm_loadu_si128(src4),
|
||||
_mm_loadu_si128(dst4),
|
||||
SkAlpha255To256(alpha)));
|
||||
src4++;
|
||||
dst4++;
|
||||
count -= 4;
|
||||
}
|
||||
|
||||
const SkPMColor* SK_RESTRICT srcEnd = src + count;
|
||||
while (src != srcEnd) {
|
||||
*dst = SkPMLerp(*src, *dst, src_scale);
|
||||
src += 1;
|
||||
dst += 1;
|
||||
*dst = SkPMLerp(*src, *dst, src_scale);
|
||||
src += 1;
|
||||
dst += 1;
|
||||
src = (const SkPMColor*)src4;
|
||||
dst = ( SkPMColor*)dst4;
|
||||
|
||||
while (count --> 0) {
|
||||
*dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
|
||||
src++;
|
||||
dst++;
|
||||
}
|
||||
#else
|
||||
do {
|
||||
*dst = SkPMLerp(*src, *dst, src_scale);
|
||||
src += 1;
|
||||
dst += 1;
|
||||
} while (--count > 0);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void S32A_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src,
|
||||
int count, U8CPU alpha) {
|
||||
SkASSERT(alpha <= 255);
|
||||
if (count > 0) {
|
||||
#ifdef UNROLL
|
||||
if (count & 1) {
|
||||
*dst = SkBlendARGB32(*(src++), *dst, alpha);
|
||||
dst += 1;
|
||||
count -= 1;
|
||||
static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
|
||||
SkASSERT(alpha <= 255);
|
||||
|
||||
auto src4 = (const __m128i*)src;
|
||||
auto dst4 = ( __m128i*)dst;
|
||||
|
||||
while (count >= 4) {
|
||||
_mm_storeu_si128(dst4, SkBlendARGB32_SSE2(_mm_loadu_si128(src4),
|
||||
_mm_loadu_si128(dst4),
|
||||
alpha));
|
||||
src4++;
|
||||
dst4++;
|
||||
count -= 4;
|
||||
}
|
||||
|
||||
const SkPMColor* SK_RESTRICT srcEnd = src + count;
|
||||
while (src != srcEnd) {
|
||||
*dst = SkBlendARGB32(*(src++), *dst, alpha);
|
||||
dst += 1;
|
||||
*dst = SkBlendARGB32(*(src++), *dst, alpha);
|
||||
dst += 1;
|
||||
}
|
||||
#else
|
||||
do {
|
||||
src = (const SkPMColor*)src4;
|
||||
dst = ( SkPMColor*)dst4;
|
||||
|
||||
while (count --> 0) {
|
||||
*dst = SkBlendARGB32(*src, *dst, alpha);
|
||||
src += 1;
|
||||
dst += 1;
|
||||
} while (--count > 0);
|
||||
#endif
|
||||
src++;
|
||||
dst++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
#elif defined(SK_ARM_HAS_NEON)
|
||||
|
||||
static const SkBlitRow::Proc32 gDefault_Procs32[] = {
|
||||
S32_Opaque_BlitRow32,
|
||||
S32_Blend_BlitRow32,
|
||||
nullptr,
|
||||
S32A_Blend_BlitRow32
|
||||
};
|
||||
#include "SkColor_opts_neon.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
|
||||
SkASSERT(alpha <= 255);
|
||||
|
||||
uint16_t src_scale = SkAlpha255To256(alpha);
|
||||
uint16_t dst_scale = 256 - src_scale;
|
||||
|
||||
while (count >= 2) {
|
||||
uint8x8_t vsrc, vdst, vres;
|
||||
uint16x8_t vsrc_wide, vdst_wide;
|
||||
|
||||
vsrc = vreinterpret_u8_u32(vld1_u32(src));
|
||||
vdst = vreinterpret_u8_u32(vld1_u32(dst));
|
||||
|
||||
vsrc_wide = vmovl_u8(vsrc);
|
||||
vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
|
||||
|
||||
vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
|
||||
|
||||
vdst_wide += vsrc_wide;
|
||||
vres = vshrn_n_u16(vdst_wide, 8);
|
||||
|
||||
vst1_u32(dst, vreinterpret_u32_u8(vres));
|
||||
|
||||
src += 2;
|
||||
dst += 2;
|
||||
count -= 2;
|
||||
}
|
||||
|
||||
if (count == 1) {
|
||||
uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
|
||||
uint16x8_t vsrc_wide, vdst_wide;
|
||||
|
||||
vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
|
||||
vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
|
||||
|
||||
vsrc_wide = vmovl_u8(vsrc);
|
||||
vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
|
||||
vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
|
||||
vdst_wide += vsrc_wide;
|
||||
vres = vshrn_n_u16(vdst_wide, 8);
|
||||
|
||||
vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
|
||||
}
|
||||
}
|
||||
|
||||
static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
|
||||
SkASSERT(alpha < 255);
|
||||
|
||||
unsigned alpha256 = SkAlpha255To256(alpha);
|
||||
|
||||
if (count & 1) {
|
||||
uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
|
||||
uint16x8_t vdst_wide, vsrc_wide;
|
||||
unsigned dst_scale;
|
||||
|
||||
vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
|
||||
vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
|
||||
|
||||
dst_scale = vget_lane_u8(vsrc, 3);
|
||||
dst_scale = SkAlphaMulInv256(dst_scale, alpha256);
|
||||
|
||||
vsrc_wide = vmovl_u8(vsrc);
|
||||
vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
|
||||
|
||||
vdst_wide = vmovl_u8(vdst);
|
||||
vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
|
||||
|
||||
vdst_wide += vsrc_wide;
|
||||
vres = vshrn_n_u16(vdst_wide, 8);
|
||||
|
||||
vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
|
||||
dst++;
|
||||
src++;
|
||||
count--;
|
||||
}
|
||||
|
||||
uint8x8_t alpha_mask;
|
||||
static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
|
||||
alpha_mask = vld1_u8(alpha_mask_setup);
|
||||
|
||||
while (count) {
|
||||
|
||||
uint8x8_t vsrc, vdst, vres, vsrc_alphas;
|
||||
uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
|
||||
|
||||
__builtin_prefetch(src+32);
|
||||
__builtin_prefetch(dst+32);
|
||||
|
||||
vsrc = vreinterpret_u8_u32(vld1_u32(src));
|
||||
vdst = vreinterpret_u8_u32(vld1_u32(dst));
|
||||
|
||||
vsrc_scale = vdupq_n_u16(alpha256);
|
||||
|
||||
vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
|
||||
vdst_scale = vmovl_u8(vsrc_alphas);
|
||||
// Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
|
||||
// A 16-bit lane would overflow if we used 0xFFFF here,
|
||||
// so use an approximation with 0xFF00 that is off by 1,
|
||||
// and add back 1 after to get the correct value.
|
||||
// This is valid if alpha256 <= 255.
|
||||
vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
|
||||
vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
|
||||
vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
|
||||
|
||||
vsrc_wide = vmovl_u8(vsrc);
|
||||
vsrc_wide *= vsrc_scale;
|
||||
|
||||
vdst_wide = vmovl_u8(vdst);
|
||||
vdst_wide *= vdst_scale;
|
||||
|
||||
vdst_wide += vsrc_wide;
|
||||
vres = vshrn_n_u16(vdst_wide, 8);
|
||||
|
||||
vst1_u32(dst, vreinterpret_u32_u8(vres));
|
||||
|
||||
src += 2;
|
||||
dst += 2;
|
||||
count -= 2;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
|
||||
SkASSERT(alpha <= 255);
|
||||
while (count --> 0) {
|
||||
*dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
|
||||
src++;
|
||||
dst++;
|
||||
}
|
||||
}
|
||||
|
||||
static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
|
||||
SkASSERT(alpha <= 255);
|
||||
while (count --> 0) {
|
||||
*dst = SkBlendARGB32(*src, *dst, alpha);
|
||||
src++;
|
||||
dst++;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
|
||||
SkASSERT(flags < SK_ARRAY_COUNT(gDefault_Procs32));
|
||||
// just so we don't crash
|
||||
flags &= kFlags32_Mask;
|
||||
static const SkBlitRow::Proc32 kProcs[] = {
|
||||
blit_row_s32_opaque,
|
||||
blit_row_s32_blend,
|
||||
nullptr, // blit_row_s32a_opaque is in SkOpts
|
||||
blit_row_s32a_blend
|
||||
};
|
||||
|
||||
if (flags == 2) {
|
||||
// S32A_Opaque_BlitRow32 has been ported to SkOpts, but not the others yet.
|
||||
return SkOpts::blit_row_s32a_opaque;
|
||||
}
|
||||
SkASSERT(flags < SK_ARRAY_COUNT(kProcs));
|
||||
flags &= SK_ARRAY_COUNT(kProcs) - 1; // just to be safe
|
||||
|
||||
SkBlitRow::Proc32 proc = PlatformProcs32(flags);
|
||||
if (nullptr == proc) {
|
||||
proc = gDefault_Procs32[flags];
|
||||
}
|
||||
SkASSERT(proc);
|
||||
return proc;
|
||||
return flags == 2 ? SkOpts::blit_row_s32a_opaque
|
||||
: kProcs[flags];
|
||||
}
|
||||
|
||||
void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) {
|
||||
|
@ -1,103 +0,0 @@
|
||||
/*
|
||||
* Copyright 2012 The Android Open Source Project
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include "SkBitmapProcState_opts_SSE2.h"
|
||||
#include "SkBlitRow_opts_SSE2.h"
|
||||
#include "SkColorData.h"
|
||||
#include "SkColor_opts_SSE2.h"
|
||||
#include "SkMSAN.h"
|
||||
#include "SkUTF.h"
|
||||
|
||||
/* SSE2 version of S32_Blend_BlitRow32()
|
||||
* portable version is in core/SkBlitRow_D32.cpp
|
||||
*/
|
||||
void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src,
|
||||
int count, U8CPU alpha) {
|
||||
SkASSERT(alpha <= 255);
|
||||
if (count <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t src_scale = SkAlpha255To256(alpha);
|
||||
|
||||
if (count >= 4) {
|
||||
SkASSERT(((size_t)dst & 0x03) == 0);
|
||||
while (((size_t)dst & 0x0F) != 0) {
|
||||
*dst = SkPMLerp(*src, *dst, src_scale);
|
||||
src++;
|
||||
dst++;
|
||||
count--;
|
||||
}
|
||||
|
||||
const __m128i *s = reinterpret_cast<const __m128i*>(src);
|
||||
__m128i *d = reinterpret_cast<__m128i*>(dst);
|
||||
|
||||
while (count >= 4) {
|
||||
// Load 4 pixels each of src and dest.
|
||||
__m128i src_pixel = _mm_loadu_si128(s);
|
||||
__m128i dst_pixel = _mm_load_si128(d);
|
||||
|
||||
__m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale);
|
||||
_mm_store_si128(d, result);
|
||||
s++;
|
||||
d++;
|
||||
count -= 4;
|
||||
}
|
||||
src = reinterpret_cast<const SkPMColor*>(s);
|
||||
dst = reinterpret_cast<SkPMColor*>(d);
|
||||
}
|
||||
|
||||
while (count > 0) {
|
||||
*dst = SkPMLerp(*src, *dst, src_scale);
|
||||
src++;
|
||||
dst++;
|
||||
count--;
|
||||
}
|
||||
}
|
||||
|
||||
void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src,
|
||||
int count, U8CPU alpha) {
|
||||
SkASSERT(alpha <= 255);
|
||||
if (count <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (count >= 4) {
|
||||
while (((size_t)dst & 0x0F) != 0) {
|
||||
*dst = SkBlendARGB32(*src, *dst, alpha);
|
||||
src++;
|
||||
dst++;
|
||||
count--;
|
||||
}
|
||||
|
||||
const __m128i *s = reinterpret_cast<const __m128i*>(src);
|
||||
__m128i *d = reinterpret_cast<__m128i*>(dst);
|
||||
while (count >= 4) {
|
||||
// Load 4 pixels each of src and dest.
|
||||
__m128i src_pixel = _mm_loadu_si128(s);
|
||||
__m128i dst_pixel = _mm_load_si128(d);
|
||||
|
||||
__m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
|
||||
_mm_store_si128(d, result);
|
||||
s++;
|
||||
d++;
|
||||
count -= 4;
|
||||
}
|
||||
src = reinterpret_cast<const SkPMColor*>(s);
|
||||
dst = reinterpret_cast<SkPMColor*>(d);
|
||||
}
|
||||
|
||||
while (count > 0) {
|
||||
*dst = SkBlendARGB32(*src, *dst, alpha);
|
||||
src++;
|
||||
dst++;
|
||||
count--;
|
||||
}
|
||||
}
|
@ -1,21 +0,0 @@
|
||||
/*
|
||||
* Copyright 2009 The Android Open Source Project
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#ifndef SkBlitRow_opts_SSE2_DEFINED
|
||||
#define SkBlitRow_opts_SSE2_DEFINED
|
||||
|
||||
#include "SkBlitRow.h"
|
||||
|
||||
void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src,
|
||||
int count, U8CPU alpha);
|
||||
|
||||
void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src,
|
||||
int count, U8CPU alpha);
|
||||
|
||||
#endif
|
@ -1,19 +0,0 @@
|
||||
/*
|
||||
* Copyright 2012 The Android Open Source Project
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#include "SkBlitRow.h"
|
||||
#include "SkUtilsArm.h"
|
||||
|
||||
#include "SkBlitRow_opts_arm_neon.h"
|
||||
|
||||
extern const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = {
|
||||
nullptr, nullptr, nullptr, nullptr,
|
||||
};
|
||||
|
||||
SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
|
||||
return SK_ARM_NEON_WRAP(sk_blitrow_platform_32_procs_arm)[flags];
|
||||
}
|
@ -1,200 +0,0 @@
|
||||
/*
|
||||
* Copyright 2012 The Android Open Source Project
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#include "SkBlitRow_opts_arm_neon.h"
|
||||
|
||||
#include "SkBlitRow.h"
|
||||
#include "SkColorData.h"
|
||||
#include "SkMathPriv.h"
|
||||
#include "SkUTF.h"
|
||||
|
||||
#include "SkColor_opts_neon.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
/* Neon version of S32_Blend_BlitRow32()
|
||||
* portable version is in src/core/SkBlitRow_D32.cpp
|
||||
*/
|
||||
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src,
|
||||
int count, U8CPU alpha) {
|
||||
SkASSERT(alpha <= 255);
|
||||
|
||||
if (count <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint16_t src_scale = SkAlpha255To256(alpha);
|
||||
uint16_t dst_scale = 256 - src_scale;
|
||||
|
||||
while (count >= 2) {
|
||||
uint8x8_t vsrc, vdst, vres;
|
||||
uint16x8_t vsrc_wide, vdst_wide;
|
||||
|
||||
/* These commented prefetches are a big win for count
|
||||
* values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
|
||||
* They also hurt a little (<5%) on an A15
|
||||
*/
|
||||
//__builtin_prefetch(src+32);
|
||||
//__builtin_prefetch(dst+32);
|
||||
|
||||
// Load
|
||||
vsrc = vreinterpret_u8_u32(vld1_u32(src));
|
||||
vdst = vreinterpret_u8_u32(vld1_u32(dst));
|
||||
|
||||
// Process src
|
||||
vsrc_wide = vmovl_u8(vsrc);
|
||||
vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
|
||||
|
||||
// Process dst
|
||||
vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
|
||||
|
||||
// Combine
|
||||
vdst_wide += vsrc_wide;
|
||||
vres = vshrn_n_u16(vdst_wide, 8);
|
||||
|
||||
// Store
|
||||
vst1_u32(dst, vreinterpret_u32_u8(vres));
|
||||
|
||||
src += 2;
|
||||
dst += 2;
|
||||
count -= 2;
|
||||
}
|
||||
|
||||
if (count == 1) {
|
||||
uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
|
||||
uint16x8_t vsrc_wide, vdst_wide;
|
||||
|
||||
// Load
|
||||
vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
|
||||
vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
|
||||
|
||||
// Process
|
||||
vsrc_wide = vmovl_u8(vsrc);
|
||||
vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
|
||||
vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
|
||||
vdst_wide += vsrc_wide;
|
||||
vres = vshrn_n_u16(vdst_wide, 8);
|
||||
|
||||
// Store
|
||||
vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef SK_CPU_ARM32
|
||||
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src,
|
||||
int count, U8CPU alpha) {
|
||||
|
||||
SkASSERT(255 > alpha);
|
||||
|
||||
if (count <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
unsigned alpha256 = SkAlpha255To256(alpha);
|
||||
|
||||
// First deal with odd counts
|
||||
if (count & 1) {
|
||||
uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
|
||||
uint16x8_t vdst_wide, vsrc_wide;
|
||||
unsigned dst_scale;
|
||||
|
||||
// Load
|
||||
vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
|
||||
vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
|
||||
|
||||
// Calc dst_scale
|
||||
dst_scale = vget_lane_u8(vsrc, 3);
|
||||
dst_scale = SkAlphaMulInv256(dst_scale, alpha256);
|
||||
|
||||
// Process src
|
||||
vsrc_wide = vmovl_u8(vsrc);
|
||||
vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
|
||||
|
||||
// Process dst
|
||||
vdst_wide = vmovl_u8(vdst);
|
||||
vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
|
||||
|
||||
// Combine
|
||||
vdst_wide += vsrc_wide;
|
||||
vres = vshrn_n_u16(vdst_wide, 8);
|
||||
|
||||
vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
|
||||
dst++;
|
||||
src++;
|
||||
count--;
|
||||
}
|
||||
|
||||
if (count) {
|
||||
uint8x8_t alpha_mask;
|
||||
static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
|
||||
alpha_mask = vld1_u8(alpha_mask_setup);
|
||||
|
||||
do {
|
||||
|
||||
uint8x8_t vsrc, vdst, vres, vsrc_alphas;
|
||||
uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
|
||||
|
||||
__builtin_prefetch(src+32);
|
||||
__builtin_prefetch(dst+32);
|
||||
|
||||
// Load
|
||||
vsrc = vreinterpret_u8_u32(vld1_u32(src));
|
||||
vdst = vreinterpret_u8_u32(vld1_u32(dst));
|
||||
|
||||
// Prepare src_scale
|
||||
vsrc_scale = vdupq_n_u16(alpha256);
|
||||
|
||||
// Calc dst_scale
|
||||
vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
|
||||
vdst_scale = vmovl_u8(vsrc_alphas);
|
||||
// Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
|
||||
// A 16-bit lane would overflow if we used 0xFFFF here,
|
||||
// so use an approximation with 0xFF00 that is off by 1,
|
||||
// and add back 1 after to get the correct value.
|
||||
// This is valid if alpha256 <= 255.
|
||||
vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
|
||||
vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
|
||||
vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
|
||||
|
||||
// Process src
|
||||
vsrc_wide = vmovl_u8(vsrc);
|
||||
vsrc_wide *= vsrc_scale;
|
||||
|
||||
// Process dst
|
||||
vdst_wide = vmovl_u8(vdst);
|
||||
vdst_wide *= vdst_scale;
|
||||
|
||||
// Combine
|
||||
vdst_wide += vsrc_wide;
|
||||
vres = vshrn_n_u16(vdst_wide, 8);
|
||||
|
||||
vst1_u32(dst, vreinterpret_u32_u8(vres));
|
||||
|
||||
src += 2;
|
||||
dst += 2;
|
||||
count -= 2;
|
||||
} while(count);
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#endif // #ifdef SK_CPU_ARM32
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
|
||||
nullptr, // S32_Opaque,
|
||||
S32_Blend_BlitRow32_neon, // S32_Blend,
|
||||
nullptr, // Ported to SkOpts
|
||||
#ifdef SK_CPU_ARM32
|
||||
S32A_Blend_BlitRow32_neon // S32A_Blend
|
||||
#else
|
||||
nullptr
|
||||
#endif
|
||||
};
|
@ -1,14 +0,0 @@
|
||||
/*
|
||||
* Copyright 2012 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
#ifndef SkBlitRow_opts_arm_neon_DEFINED
|
||||
#define SkBlitRow_opts_arm_neon_DEFINED
|
||||
|
||||
#include "SkBlitRow.h"
|
||||
|
||||
extern const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[];
|
||||
|
||||
#endif
|
@ -1,14 +0,0 @@
|
||||
/*
|
||||
* Copyright 2011 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#include "SkBlitRow.h"
|
||||
|
||||
// Platform impl of Platform_procs with no overrides
|
||||
|
||||
SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
|
||||
return nullptr;
|
||||
}
|
@ -7,8 +7,6 @@
|
||||
|
||||
#include "SkBitmapProcState_opts_SSE2.h"
|
||||
#include "SkBitmapProcState_opts_SSSE3.h"
|
||||
#include "SkBlitRow.h"
|
||||
#include "SkBlitRow_opts_SSE2.h"
|
||||
#include "SkCpu.h"
|
||||
|
||||
|
||||
@ -61,20 +59,3 @@ void SkBitmapProcState::platformProcs() {
|
||||
fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static const SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
|
||||
nullptr, // S32_Opaque,
|
||||
S32_Blend_BlitRow32_SSE2, // S32_Blend,
|
||||
nullptr, // Ported to SkOpts
|
||||
S32A_Blend_BlitRow32_SSE2, // S32A_Blend,
|
||||
};
|
||||
|
||||
SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
|
||||
if (SkCpu::Supports(SkCpu::SSE2)) {
|
||||
return platform_32_procs_SSE2[flags];
|
||||
} else {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user