From d611864e679a58865b111e74fe7ac919cba42163 Mon Sep 17 00:00:00 2001 From: "commit-bot@chromium.org" Date: Fri, 6 Dec 2013 11:32:27 +0000 Subject: [PATCH] ARM Skia NEON patches - 32 - Xfermode: 1-pixel NEON modeprocs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases, it's easy to provide a NEON version of the 1-pixel modeprocs. Combined with https://codereview.chromium.org/23724013/ (merged) it allows up to 35% speed improvement on Xfermodes when aa is non-NULL. Signed-off-by: Kévin PETIT BUG= R=djsollen@google.com, reed@google.com, mtklein@google.com, luisjoseromeroesclusa@hotmail.com Author: kevin.petit.arm@gmail.com Review URL: https://codereview.chromium.org/104883004 git-svn-id: http://skia.googlecode.com/svn/trunk@12525 2bbb7eff-a529-9590-31e7-b0007b416f81 --- src/core/SkXfermode.cpp | 9 +- src/opts/SkXfermode_opts_arm.cpp | 10 ++ src/opts/SkXfermode_opts_arm_neon.cpp | 170 ++++++++++++++++++++++++++ src/opts/SkXfermode_opts_arm_neon.h | 6 + src/opts/SkXfermode_opts_none.cpp | 8 +- 5 files changed, 201 insertions(+), 2 deletions(-) diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp index 313e2aef8c..8cb79c2dbd 100644 --- a/src/core/SkXfermode.cpp +++ b/src/core/SkXfermode.cpp @@ -1669,6 +1669,7 @@ void SkXfermode::Term() { extern SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec, SkXfermode::Mode mode); +extern SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode); SkXfermode* SkXfermode::Create(Mode mode) { SkASSERT(SK_ARRAY_COUNT(gProcCoeffs) == kModeCount); @@ -1690,7 +1691,13 @@ SkXfermode* SkXfermode::Create(Mode mode) { SkXfermode* xfer = gCachedXfermodes[mode]; if (NULL == xfer) { - const ProcCoeff& rec = gProcCoeffs[mode]; + ProcCoeff rec = gProcCoeffs[mode]; + + SkXfermodeProc pp = SkPlatformXfermodeProcFactory(mode); + + if (pp != NULL) { + rec.fProc = pp; + } // check if we have a platform optim for that SkProcCoeffXfermode* xfm = SkPlatformXfermodeFactory(rec, mode); diff --git a/src/opts/SkXfermode_opts_arm.cpp b/src/opts/SkXfermode_opts_arm.cpp index eb3b3016e3..2a796d6f6a 100644 --- a/src/opts/SkXfermode_opts_arm.cpp +++ b/src/opts/SkXfermode_opts_arm.cpp @@ -5,12 +5,22 @@ extern SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& rec, SkXfermode::Mode mode); +extern SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode); + SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl(const ProcCoeff& rec, SkXfermode::Mode mode) { return NULL; } +SkXfermodeProc SkPlatformXfermodeProcFactory_impl(SkXfermode::Mode mode) { + return NULL; +} + SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec, SkXfermode::Mode mode) { return SK_ARM_NEON_WRAP(SkPlatformXfermodeFactory_impl)(rec, mode); } + +SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode) { + return SK_ARM_NEON_WRAP(SkPlatformXfermodeProcFactory_impl)(mode); +} diff --git a/src/opts/SkXfermode_opts_arm_neon.cpp b/src/opts/SkXfermode_opts_arm_neon.cpp index 7435dd44de..6a79b73726 100644 --- a/src/opts/SkXfermode_opts_arm_neon.cpp +++ b/src/opts/SkXfermode_opts_arm_neon.cpp @@ -92,6 +92,133 @@ static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val return ret; } +//////////////////////////////////////////////////////////////////////////////// +// 1 pixel modeprocs +//////////////////////////////////////////////////////////////////////////////// + +// kSrcATop_Mode, //!< [Da, Sc * Da + (1 - Sa) * Dc] +SkPMColor srcatop_modeproc_neon(SkPMColor src, SkPMColor dst) { + unsigned sa = SkGetPackedA32(src); + unsigned da = SkGetPackedA32(dst); + unsigned isa = 255 - sa; + + uint8x8_t vda, visa, vsrc, vdst; + + vda = vdup_n_u8(da); + visa = vdup_n_u8(isa); + + uint16x8_t vsrc_wide, vdst_wide; + vsrc_wide = vmull_u8(vda, vreinterpret_u8_u32(vdup_n_u32(src))); + vdst_wide = vmull_u8(visa, vreinterpret_u8_u32(vdup_n_u32(dst))); + + vsrc_wide += vdupq_n_u16(128); + vsrc_wide += vshrq_n_u16(vsrc_wide, 8); + + vdst_wide += vdupq_n_u16(128); + vdst_wide += vshrq_n_u16(vdst_wide, 8); + + vsrc = vshrn_n_u16(vsrc_wide, 8); + vdst = vshrn_n_u16(vdst_wide, 8); + + vsrc += vdst; + vsrc = vset_lane_u8(da, vsrc, 3); + + return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); +} + +// kDstATop_Mode, //!< [Sa, Sa * Dc + Sc * (1 - Da)] +SkPMColor dstatop_modeproc_neon(SkPMColor src, SkPMColor dst) { + unsigned sa = SkGetPackedA32(src); + unsigned da = SkGetPackedA32(dst); + unsigned ida = 255 - da; + + uint8x8_t vsa, vida, vsrc, vdst; + + vsa = vdup_n_u8(sa); + vida = vdup_n_u8(ida); + + uint16x8_t vsrc_wide, vdst_wide; + vsrc_wide = vmull_u8(vida, vreinterpret_u8_u32(vdup_n_u32(src))); + vdst_wide = vmull_u8(vsa, vreinterpret_u8_u32(vdup_n_u32(dst))); + + vsrc_wide += vdupq_n_u16(128); + vsrc_wide += vshrq_n_u16(vsrc_wide, 8); + + vdst_wide += vdupq_n_u16(128); + vdst_wide += vshrq_n_u16(vdst_wide, 8); + + vsrc = vshrn_n_u16(vsrc_wide, 8); + vdst = vshrn_n_u16(vdst_wide, 8); + + vsrc += vdst; + vsrc = vset_lane_u8(sa, vsrc, 3); + + return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); +} + +// kXor_Mode [Sa + Da - 2 * Sa * Da, Sc * (1 - Da) + (1 - Sa) * Dc] +SkPMColor xor_modeproc_neon(SkPMColor src, SkPMColor dst) { + unsigned sa = SkGetPackedA32(src); + unsigned da = SkGetPackedA32(dst); + unsigned ret_alpha = sa + da - (SkAlphaMulAlpha(sa, da) << 1); + unsigned isa = 255 - sa; + unsigned ida = 255 - da; + + uint8x8_t vsrc, vdst, visa, vida; + uint16x8_t vsrc_wide, vdst_wide; + + visa = vdup_n_u8(isa); + vida = vdup_n_u8(ida); + vsrc = vreinterpret_u8_u32(vdup_n_u32(src)); + vdst = vreinterpret_u8_u32(vdup_n_u32(dst)); + + vsrc_wide = vmull_u8(vsrc, vida); + vdst_wide = vmull_u8(vdst, visa); + + vsrc_wide += vdupq_n_u16(128); + vsrc_wide += vshrq_n_u16(vsrc_wide, 8); + + vdst_wide += vdupq_n_u16(128); + vdst_wide += vshrq_n_u16(vdst_wide, 8); + + vsrc = vshrn_n_u16(vsrc_wide, 8); + vdst = vshrn_n_u16(vdst_wide, 8); + + vsrc += vdst; + + vsrc = vset_lane_u8(ret_alpha, vsrc, 3); + + return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); +} + +// kPlus_Mode +SkPMColor plus_modeproc_neon(SkPMColor src, SkPMColor dst) { + uint8x8_t vsrc, vdst; + vsrc = vreinterpret_u8_u32(vdup_n_u32(src)); + vdst = vreinterpret_u8_u32(vdup_n_u32(dst)); + vsrc = vqadd_u8(vsrc, vdst); + + return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); +} + +// kModulate_Mode +SkPMColor modulate_modeproc_neon(SkPMColor src, SkPMColor dst) { + uint8x8_t vsrc, vdst, vres; + uint16x8_t vres_wide; + + vsrc = vreinterpret_u8_u32(vdup_n_u32(src)); + vdst = vreinterpret_u8_u32(vdup_n_u32(dst)); + + vres_wide = vmull_u8(vsrc, vdst); + + vres_wide += vdupq_n_u16(128); + vres_wide += vshrq_n_u16(vres_wide, 8); + + vres = vshrn_n_u16(vres_wide, 8); + + return vget_lane_u32(vreinterpret_u32_u8(vres), 0); +} + //////////////////////////////////////////////////////////////////////////////// // 8 pixels modeprocs //////////////////////////////////////////////////////////////////////////////// @@ -755,6 +882,45 @@ SK_COMPILE_ASSERT( mode_count_arm ); +SkXfermodeProc gNEONXfermodeProcs1[] = { + NULL, // kClear_Mode + NULL, // kSrc_Mode + NULL, // kDst_Mode + NULL, // kSrcOver_Mode + NULL, // kDstOver_Mode + NULL, // kSrcIn_Mode + NULL, // kDstIn_Mode + NULL, // kSrcOut_Mode + NULL, // kDstOut_Mode + srcatop_modeproc_neon, + dstatop_modeproc_neon, + xor_modeproc_neon, + plus_modeproc_neon, + modulate_modeproc_neon, + NULL, // kScreen_Mode + + NULL, // kOverlay_Mode + NULL, // kDarken_Mode + NULL, // kLighten_Mode + NULL, // kColorDodge_Mode + NULL, // kColorBurn_Mode + NULL, // kHardLight_Mode + NULL, // kSoftLight_Mode + NULL, // kDifference_Mode + NULL, // kExclusion_Mode + NULL, // kMultiply_Mode + + NULL, // kHue_Mode + NULL, // kSaturation_Mode + NULL, // kColor_Mode + NULL, // kLuminosity_Mode +}; + +SK_COMPILE_ASSERT( + SK_ARRAY_COUNT(gNEONXfermodeProcs1) == SkXfermode::kLastMode + 1, + mode1_count_arm +); + SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& rec, SkXfermode::Mode mode) { @@ -765,3 +931,7 @@ SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& rec, } return NULL; } + +SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { + return gNEONXfermodeProcs1[mode]; +} diff --git a/src/opts/SkXfermode_opts_arm_neon.h b/src/opts/SkXfermode_opts_arm_neon.h index 4c88fc7a63..a8d438195e 100644 --- a/src/opts/SkXfermode_opts_arm_neon.h +++ b/src/opts/SkXfermode_opts_arm_neon.h @@ -26,4 +26,10 @@ private: typedef SkProcCoeffXfermode INHERITED; }; +extern SkPMColor srcatop_modeproc_neon(SkPMColor src, SkPMColor dst); +extern SkPMColor dstatop_modeproc_neon(SkPMColor src, SkPMColor dst); +extern SkPMColor xor_modeproc_neon(SkPMColor src, SkPMColor dst); +extern SkPMColor plus_modeproc_neon(SkPMColor src, SkPMColor dst); +extern SkPMColor modulate_modeproc_neon(SkPMColor src, SkPMColor dst); + #endif //#ifdef SkXfermode_opts_arm_neon_DEFINED diff --git a/src/opts/SkXfermode_opts_none.cpp b/src/opts/SkXfermode_opts_none.cpp index ca53fa0dd0..7c46fdd93c 100644 --- a/src/opts/SkXfermode_opts_none.cpp +++ b/src/opts/SkXfermode_opts_none.cpp @@ -1,11 +1,17 @@ #include "SkXfermode.h" #include "SkXfermode_proccoeff.h" -// The prototype below is for Clang +// The prototypes below are for Clang extern SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec, SkXfermode::Mode mode); +extern SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode); + SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec, SkXfermode::Mode mode) { return NULL; } + +SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode) { + return NULL; +}