Revert of ARM Skia NEON patches - 35 - First AArch64 support (https://codereview.chromium.org/143423004/)
Reason for revert: GYP's failing on most (all?) bots. Original issue's description: > ARM Skia NEON patches - 35 - First AArch64 support > > Aarch64 support > > This change contains the necessary modifications to have Skia build and > run properly on an ARMv8 processor in aarch64 execution state. > > Here's a list of the changes: > > - add an arm64 target to the build system + SK_CPU_ARM64 flag > > - MatrixTest was failing when built in Release mode. Fused MAC > instructions were generated which made some intermediate results > more accurate. As the test relies on result comparison, the more > precise results when compared to others led to a gap bigger than > what was tolerated. As I don't know if some actual skia code relies > on results being comparable, I've disabled fused MAC instruction > with -ffp-contract=off for arm64. > > - Modify include/core/SkOnce.h to have barriers work. > > - SK_CPU_ARM64 implies SK_ARM_NEON_MODE_ALWAYS. > > - use existing Xfermode optimisations with modifications that can be > removed in the future when toolchains are ready. Also save a few > instructions is two Xfermodes (will apply to ARM too). > > - use existing SkBoxBlur and SkMorphology optimisations. > > - use existing SkBlitMask optimisations > > - use existing BitmapProcState and Convolution optimisations. > > Future changes will include: > > - Blitters (only partialy merged upstream) > > - SkUtils (there's little value in sending asm optimisations without > having them benchmarked on real hardware). > > Signed-off-by: Kevin PETIT <kevin.petit@arm.com> > > BUG=skia: > > Committed: http://code.google.com/p/skia/source/detail?r=13980 R=djsollen@google.com, reed@google.com, halcanary@google.com, kevin.petit@arm.com TBR=djsollen@google.com, halcanary@google.com, kevin.petit@arm.com, reed@google.com NOTREECHECKS=true NOTRY=true BUG=skia: Author: mtklein@google.com Review URL: https://codereview.chromium.org/216113005 git-svn-id: http://skia.googlecode.com/svn/trunk@13983 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
parent
972f9cd7a0
commit
d643a90ee2
@ -8,12 +8,6 @@
|
|||||||
'SK_FORCE_DISTANCEFIELD_FONTS=<(skia_force_distancefield_fonts)',
|
'SK_FORCE_DISTANCEFIELD_FONTS=<(skia_force_distancefield_fonts)',
|
||||||
],
|
],
|
||||||
'conditions' : [
|
'conditions' : [
|
||||||
[ 'skia_arch_type == "arm64"', {
|
|
||||||
'cflags': [
|
|
||||||
'-ffp-contract=off',
|
|
||||||
],
|
|
||||||
}],
|
|
||||||
|
|
||||||
[ 'skia_os == "win"',
|
[ 'skia_os == "win"',
|
||||||
{
|
{
|
||||||
'defines': [
|
'defines': [
|
||||||
|
@ -72,7 +72,7 @@
|
|||||||
}, {
|
}, {
|
||||||
'skia_poppler_enabled%': 0,
|
'skia_poppler_enabled%': 0,
|
||||||
}],
|
}],
|
||||||
[ 'skia_os in ["linux", "freebsd", "openbsd", "solaris", "mac"] or skia_arch_type == "arm64"', {
|
[ 'skia_os in ["linux", "freebsd", "openbsd", "solaris", "mac"]', {
|
||||||
'skia_arch_width%': 64,
|
'skia_arch_width%': 64,
|
||||||
}, {
|
}, {
|
||||||
'skia_arch_width%': 32,
|
'skia_arch_width%': 32,
|
||||||
|
17
gyp/opts.gyp
17
gyp/opts.gyp
@ -122,23 +122,6 @@
|
|||||||
'-mno-apcs-frame',
|
'-mno-apcs-frame',
|
||||||
]
|
]
|
||||||
}],
|
}],
|
||||||
[ 'skia_arch_type == "arm64"', {
|
|
||||||
'sources': [
|
|
||||||
'../src/opts/SkBitmapProcState_arm_neon.cpp',
|
|
||||||
'../src/opts/SkBitmapProcState_matrixProcs_neon.cpp',
|
|
||||||
'../src/opts/SkBitmapProcState_opts_arm.cpp',
|
|
||||||
'../src/opts/SkBlitMask_opts_arm.cpp',
|
|
||||||
'../src/opts/SkBlitMask_opts_arm_neon.cpp',
|
|
||||||
'../src/opts/SkBlitRow_opts_none.cpp',
|
|
||||||
'../src/opts/SkBlurImage_opts_arm.cpp',
|
|
||||||
'../src/opts/SkBlurImage_opts_neon.cpp',
|
|
||||||
'../src/opts/SkMorphology_opts_arm.cpp',
|
|
||||||
'../src/opts/SkMorphology_opts_neon.cpp',
|
|
||||||
'../src/opts/SkUtils_opts_none.cpp',
|
|
||||||
'../src/opts/SkXfermode_opts_arm.cpp',
|
|
||||||
'../src/opts/SkXfermode_opts_arm_neon.cpp',
|
|
||||||
],
|
|
||||||
}],
|
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
# For the same lame reasons as what is done for skia_opts, we have to
|
# For the same lame reasons as what is done for skia_opts, we have to
|
||||||
|
@ -85,10 +85,12 @@ inline static void compiler_barrier() {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
inline static void full_barrier_on_arm() {
|
inline static void full_barrier_on_arm() {
|
||||||
#if (defined(SK_CPU_ARM) && SK_ARM_ARCH >= 7) || defined(SK_CPU_ARM64)
|
#ifdef SK_CPU_ARM
|
||||||
asm volatile("dmb ish" : : : "memory");
|
# if SK_ARM_ARCH >= 7
|
||||||
#elif defined(SK_CPU_ARM)
|
asm volatile("dmb" : : : "memory");
|
||||||
|
# else
|
||||||
asm volatile("mcr p15, 0, %0, c7, c10, 5" : : "r" (0) : "memory");
|
asm volatile("mcr p15, 0, %0, c7, c10, 5" : : "r" (0) : "memory");
|
||||||
|
# endif
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -174,10 +174,6 @@
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__aarch64__)
|
|
||||||
#define SK_CPU_ARM64
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
#if !defined(SKIA_IMPLEMENTATION)
|
#if !defined(SKIA_IMPLEMENTATION)
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
cat >&2 <<EOF
|
cat >&2 <<EOF
|
||||||
arm64_make - this script builds a AArch64 version of skia that
|
arm64_make - this script builds a ARMv7 Aarch64 version of skia that
|
||||||
does not depend on external libraries, perfect for putting in an
|
does not depend on external libraries, perfect for putting in an
|
||||||
embedded system running Linux.
|
embedded system running Linux.
|
||||||
|
|
||||||
@ -45,7 +45,12 @@ done
|
|||||||
|
|
||||||
export GYP_DEFINES="${GYP_DEFINES} \
|
export GYP_DEFINES="${GYP_DEFINES} \
|
||||||
skia_gpu=0 \
|
skia_gpu=0 \
|
||||||
skia_arch_type=arm64 \
|
skia_arch_type=arm \
|
||||||
|
skia_arch_width=64 \
|
||||||
|
armv7=1 \
|
||||||
|
armv8=1 \
|
||||||
|
arm_neon=0 \
|
||||||
|
arm_thumb=0 \
|
||||||
"
|
"
|
||||||
|
|
||||||
"$(dirname "$0")/barelinux_make" -t "$BUILD_TYPE"
|
"$(dirname "$0")/barelinux_make" -t "$BUILD_TYPE"
|
||||||
|
@ -23,7 +23,7 @@
|
|||||||
|
|
||||||
#if defined(SK_CPU_ARM) && defined(__ARM_HAVE_OPTIONAL_NEON_SUPPORT)
|
#if defined(SK_CPU_ARM) && defined(__ARM_HAVE_OPTIONAL_NEON_SUPPORT)
|
||||||
# define SK_ARM_NEON_MODE SK_ARM_NEON_MODE_DYNAMIC
|
# define SK_ARM_NEON_MODE SK_ARM_NEON_MODE_DYNAMIC
|
||||||
#elif defined(SK_CPU_ARM) && defined(__ARM_HAVE_NEON) || defined(SK_CPU_ARM64)
|
#elif defined(SK_CPU_ARM) && defined(__ARM_HAVE_NEON)
|
||||||
# define SK_ARM_NEON_MODE SK_ARM_NEON_MODE_ALWAYS
|
# define SK_ARM_NEON_MODE SK_ARM_NEON_MODE_ALWAYS
|
||||||
#else
|
#else
|
||||||
# define SK_ARM_NEON_MODE SK_ARM_NEON_MODE_NONE
|
# define SK_ARM_NEON_MODE SK_ARM_NEON_MODE_NONE
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
|
|
||||||
#include "SkConvolver.h"
|
#include "SkConvolver.h"
|
||||||
|
|
||||||
#if !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
|
#if SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
|
||||||
void SI8_D16_nofilter_DX_arm(
|
void SI8_D16_nofilter_DX_arm(
|
||||||
const SkBitmapProcState& s,
|
const SkBitmapProcState& s,
|
||||||
const uint32_t* SK_RESTRICT xy,
|
const uint32_t* SK_RESTRICT xy,
|
||||||
@ -186,7 +186,7 @@ void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s,
|
|||||||
|
|
||||||
s.fBitmap->getColorTable()->unlockColors();
|
s.fBitmap->getColorTable()->unlockColors();
|
||||||
}
|
}
|
||||||
#endif // !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
|
#endif // SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
@ -194,7 +194,6 @@ void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s,
|
|||||||
otherwise the shader won't even look at the matrix/sampler
|
otherwise the shader won't even look at the matrix/sampler
|
||||||
*/
|
*/
|
||||||
void SkBitmapProcState::platformProcs() {
|
void SkBitmapProcState::platformProcs() {
|
||||||
#if !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
|
|
||||||
bool isOpaque = 256 == fAlphaScale;
|
bool isOpaque = 256 == fAlphaScale;
|
||||||
bool justDx = false;
|
bool justDx = false;
|
||||||
|
|
||||||
@ -204,6 +203,7 @@ void SkBitmapProcState::platformProcs() {
|
|||||||
|
|
||||||
switch (fBitmap->config()) {
|
switch (fBitmap->config()) {
|
||||||
case SkBitmap::kIndex8_Config:
|
case SkBitmap::kIndex8_Config:
|
||||||
|
#if SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
|
||||||
if (justDx && SkPaint::kNone_FilterLevel == fFilterLevel) {
|
if (justDx && SkPaint::kNone_FilterLevel == fFilterLevel) {
|
||||||
#if 0 /* crashing on android device */
|
#if 0 /* crashing on android device */
|
||||||
fSampleProc16 = SI8_D16_nofilter_DX_arm;
|
fSampleProc16 = SI8_D16_nofilter_DX_arm;
|
||||||
@ -215,11 +215,11 @@ void SkBitmapProcState::platformProcs() {
|
|||||||
fShaderProc32 = NULL;
|
fShaderProc32 = NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -41,13 +41,8 @@ static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alp
|
|||||||
static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) {
|
static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) {
|
||||||
uint16x8_t tmp;
|
uint16x8_t tmp;
|
||||||
|
|
||||||
#ifdef SK_CPU_ARM64
|
|
||||||
tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)),
|
|
||||||
vreinterpretq_u32_s32(p2));
|
|
||||||
#else
|
|
||||||
tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)),
|
tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)),
|
||||||
vmovn_u32(vreinterpretq_u32_s32(p2)));
|
vmovn_u32(vreinterpretq_u32_s32(p2)));
|
||||||
#endif
|
|
||||||
|
|
||||||
tmp += vdupq_n_u16(128);
|
tmp += vdupq_n_u16(128);
|
||||||
tmp += vshrq_n_u16(tmp, 8);
|
tmp += vshrq_n_u16(tmp, 8);
|
||||||
@ -71,11 +66,7 @@ static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val
|
|||||||
// Test if <= 0
|
// Test if <= 0
|
||||||
cmp1 = vcleq_s32(val1, vdupq_n_s32(0));
|
cmp1 = vcleq_s32(val1, vdupq_n_s32(0));
|
||||||
cmp2 = vcleq_s32(val2, vdupq_n_s32(0));
|
cmp2 = vcleq_s32(val2, vdupq_n_s32(0));
|
||||||
#ifdef SK_CPU_ARM64
|
|
||||||
cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2);
|
|
||||||
#else
|
|
||||||
cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
|
cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
|
||||||
#endif
|
|
||||||
cmp8_1 = vmovn_u16(cmp16);
|
cmp8_1 = vmovn_u16(cmp16);
|
||||||
|
|
||||||
// Init to zero
|
// Init to zero
|
||||||
@ -84,11 +75,7 @@ static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val
|
|||||||
// Test if >= 255*255
|
// Test if >= 255*255
|
||||||
cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255));
|
cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255));
|
||||||
cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255));
|
cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255));
|
||||||
#ifdef SK_CPU_ARM64
|
|
||||||
cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2);
|
|
||||||
#else
|
|
||||||
cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
|
cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
|
||||||
#endif
|
|
||||||
cmp8 = vmovn_u16(cmp16);
|
cmp8 = vmovn_u16(cmp16);
|
||||||
|
|
||||||
// Insert 255 where true
|
// Insert 255 where true
|
||||||
@ -422,19 +409,11 @@ static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
|
|||||||
if (overlay) {
|
if (overlay) {
|
||||||
dc2 = vshll_n_u8(dc, 1);
|
dc2 = vshll_n_u8(dc, 1);
|
||||||
scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc)));
|
scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc)));
|
||||||
#ifdef SK_CPU_ARM64
|
|
||||||
scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc));
|
|
||||||
#else
|
|
||||||
scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc)));
|
scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc)));
|
||||||
#endif
|
|
||||||
} else {
|
} else {
|
||||||
sc2 = vshll_n_u8(sc, 1);
|
sc2 = vshll_n_u8(sc, 1);
|
||||||
scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc)));
|
scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc)));
|
||||||
#ifdef SK_CPU_ARM64
|
|
||||||
scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc));
|
|
||||||
#else
|
|
||||||
scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc)));
|
scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc)));
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calc COM
|
// Calc COM
|
||||||
@ -442,20 +421,12 @@ static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
|
|||||||
com1 = vreinterpretq_s32_u32(
|
com1 = vreinterpretq_s32_u32(
|
||||||
vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
|
vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
|
||||||
com2 = vreinterpretq_s32_u32(
|
com2 = vreinterpretq_s32_u32(
|
||||||
#ifdef SK_CPU_ARM64
|
|
||||||
vmull_high_u16(const255, sc_plus_dc));
|
|
||||||
#else
|
|
||||||
vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));
|
vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));
|
||||||
#endif
|
|
||||||
|
|
||||||
// Calc SUB
|
// Calc SUB
|
||||||
int32x4_t sub1, sub2;
|
int32x4_t sub1, sub2;
|
||||||
sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa)));
|
sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa)));
|
||||||
#ifdef SK_CPU_ARM64
|
|
||||||
sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa));
|
|
||||||
#else
|
|
||||||
sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa)));
|
sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa)));
|
||||||
#endif
|
|
||||||
sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1));
|
sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1));
|
||||||
sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2));
|
sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2));
|
||||||
|
|
||||||
@ -473,14 +444,10 @@ static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
|
|||||||
int32x4_t val2_1, val2_2;
|
int32x4_t val2_1, val2_2;
|
||||||
uint32x4_t cmp1, cmp2;
|
uint32x4_t cmp1, cmp2;
|
||||||
|
|
||||||
// Doing a signed lengthening allows to save a few instructions
|
cmp1 = vmovl_u16(vget_low_u16(cmp));
|
||||||
// thanks to sign extension.
|
cmp1 |= vshlq_n_u32(cmp1, 16);
|
||||||
cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp))));
|
cmp2 = vmovl_u16(vget_high_u16(cmp));
|
||||||
#ifdef SK_CPU_ARM64
|
cmp2 |= vshlq_n_u32(cmp2, 16);
|
||||||
cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp)));
|
|
||||||
#else
|
|
||||||
cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cmp))));
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Calc COM - SUB
|
// Calc COM - SUB
|
||||||
val1_1 = com1 - sub1;
|
val1_1 = com1 - sub1;
|
||||||
@ -491,11 +458,7 @@ static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
|
|||||||
val2_2 = com2 + sub2;
|
val2_2 = com2 + sub2;
|
||||||
|
|
||||||
val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada))));
|
val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada))));
|
||||||
#ifdef SK_CPU_ARM64
|
|
||||||
val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada)));
|
|
||||||
#else
|
|
||||||
val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada))));
|
val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada))));
|
||||||
#endif
|
|
||||||
|
|
||||||
// Insert where needed
|
// Insert where needed
|
||||||
val1_1 = vbslq_s32(cmp1, val1_1, val2_1);
|
val1_1 = vbslq_s32(cmp1, val1_1, val2_1);
|
||||||
@ -665,19 +628,11 @@ static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc,
|
|||||||
term1_1 = vreinterpretq_s32_u32(
|
term1_1 = vreinterpretq_s32_u32(
|
||||||
vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
|
vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
|
||||||
term1_2 = vreinterpretq_s32_u32(
|
term1_2 = vreinterpretq_s32_u32(
|
||||||
#ifdef SK_CPU_ARM64
|
|
||||||
vmull_high_u16(const255, sc_plus_dc));
|
|
||||||
#else
|
|
||||||
vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));
|
vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));
|
||||||
#endif
|
|
||||||
|
|
||||||
/* Calc the second term */
|
/* Calc the second term */
|
||||||
term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1));
|
term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1));
|
||||||
#ifdef SK_CPU_ARM64
|
|
||||||
term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1));
|
|
||||||
#else
|
|
||||||
term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1));
|
term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1));
|
||||||
#endif
|
|
||||||
|
|
||||||
return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2);
|
return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2);
|
||||||
}
|
}
|
||||||
@ -706,18 +661,10 @@ static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc,
|
|||||||
scdc = vmull_u8(sc, dc);
|
scdc = vmull_u8(sc, dc);
|
||||||
|
|
||||||
val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2));
|
val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2));
|
||||||
#ifdef SK_CPU_ARM64
|
|
||||||
val2 = vaddl_high_u16(t1, t2);
|
|
||||||
#else
|
|
||||||
val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2));
|
val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2));
|
||||||
#endif
|
|
||||||
|
|
||||||
val1 = vaddw_u16(val1, vget_low_u16(scdc));
|
val1 = vaddw_u16(val1, vget_low_u16(scdc));
|
||||||
#ifdef SK_CPU_ARM64
|
|
||||||
val2 = vaddw_high_u16(val2, scdc);
|
|
||||||
#else
|
|
||||||
val2 = vaddw_u16(val2, vget_high_u16(scdc));
|
val2 = vaddw_u16(val2, vget_high_u16(scdc));
|
||||||
#endif
|
|
||||||
|
|
||||||
return clamp_div255round_simd8_32(
|
return clamp_div255round_simd8_32(
|
||||||
vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2));
|
vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2));
|
||||||
@ -761,10 +708,6 @@ void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
|
|||||||
while (count >= 8) {
|
while (count >= 8) {
|
||||||
uint8x8x4_t vsrc, vdst, vres;
|
uint8x8x4_t vsrc, vdst, vres;
|
||||||
|
|
||||||
#ifdef SK_CPU_ARM64
|
|
||||||
vsrc = vld4_u8((uint8_t*)src);
|
|
||||||
vdst = vld4_u8((uint8_t*)dst);
|
|
||||||
#else
|
|
||||||
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
|
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
|
||||||
asm volatile (
|
asm volatile (
|
||||||
"vld4.u8 %h[vsrc], [%[src]]! \t\n"
|
"vld4.u8 %h[vsrc], [%[src]]! \t\n"
|
||||||
@ -797,7 +740,6 @@ void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
|
|||||||
vsrc.val[2] = d2; vdst.val[2] = d6;
|
vsrc.val[2] = d2; vdst.val[2] = d6;
|
||||||
vsrc.val[3] = d3; vdst.val[3] = d7;
|
vsrc.val[3] = d3; vdst.val[3] = d7;
|
||||||
#endif
|
#endif
|
||||||
#endif // #ifdef SK_CPU_ARM64
|
|
||||||
|
|
||||||
vres = procSIMD(vsrc, vdst);
|
vres = procSIMD(vsrc, vdst);
|
||||||
|
|
||||||
@ -805,9 +747,6 @@ void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
|
|||||||
|
|
||||||
count -= 8;
|
count -= 8;
|
||||||
dst += 8;
|
dst += 8;
|
||||||
#ifdef SK_CPU_ARM64
|
|
||||||
src += 8;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
// Leftovers
|
// Leftovers
|
||||||
for (int i = 0; i < count; i++) {
|
for (int i = 0; i < count; i++) {
|
||||||
@ -844,9 +783,6 @@ void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
|
|||||||
|
|
||||||
vdst = vld1q_u16(dst);
|
vdst = vld1q_u16(dst);
|
||||||
|
|
||||||
#ifdef SK_CPU_ARM64
|
|
||||||
vsrc = vld4_u8((uint8_t*)src);
|
|
||||||
#else
|
|
||||||
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
|
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
|
||||||
asm volatile (
|
asm volatile (
|
||||||
"vld4.u8 %h[vsrc], [%[src]]! \t\n"
|
"vld4.u8 %h[vsrc], [%[src]]! \t\n"
|
||||||
@ -870,7 +806,6 @@ void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
|
|||||||
vsrc.val[2] = d2;
|
vsrc.val[2] = d2;
|
||||||
vsrc.val[3] = d3;
|
vsrc.val[3] = d3;
|
||||||
#endif
|
#endif
|
||||||
#endif // #ifdef SK_CPU_ARM64
|
|
||||||
|
|
||||||
vdst32 = SkPixel16ToPixel32_neon8(vdst);
|
vdst32 = SkPixel16ToPixel32_neon8(vdst);
|
||||||
vres = procSIMD(vsrc, vdst32);
|
vres = procSIMD(vsrc, vdst32);
|
||||||
@ -880,9 +815,6 @@ void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
|
|||||||
|
|
||||||
count -= 8;
|
count -= 8;
|
||||||
dst += 8;
|
dst += 8;
|
||||||
#ifdef SK_CPU_ARM64
|
|
||||||
src += 8;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
for (int i = 0; i < count; i++) {
|
for (int i = 0; i < count; i++) {
|
||||||
SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
|
SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
|
||||||
|
Loading…
Reference in New Issue
Block a user