diff --git a/gyp/core.gyp b/gyp/core.gyp index 3034264570..2639ff45c5 100644 --- a/gyp/core.gyp +++ b/gyp/core.gyp @@ -22,7 +22,6 @@ '../include/utils', '../include/xml', '../src/core', - '../src/opts', '../src/image', ], 'sources': [ diff --git a/gyp/opts.gyp b/gyp/opts.gyp index d9cd6f29ba..04966ba67b 100644 --- a/gyp/opts.gyp +++ b/gyp/opts.gyp @@ -173,7 +173,6 @@ '../src/opts/SkBitmapProcState_matrix_clamp_neon.h', '../src/opts/SkBitmapProcState_matrix_repeat_neon.h', '../src/opts/SkBlitRow_opts_arm_neon.cpp', - '../src/opts/SkXfermode_opts_arm_neon.cpp', ], }, ], diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp index 6cdd97bb4c..993c754711 100644 --- a/src/core/SkXfermode.cpp +++ b/src/core/SkXfermode.cpp @@ -13,11 +13,6 @@ #include "SkFlattenableBuffers.h" #include "SkMathPriv.h" #include "SkString.h" -#include "SkUtilsArm.h" - -#if !SK_ARM_NEON_IS_NONE -#include "SkXfermode_opts_arm_neon.h" -#endif SK_DEFINE_INST_COUNT(SkXfermode) @@ -1955,7 +1950,4 @@ SK_DEFINE_FLATTENABLE_REGISTRAR_GROUP_START(SkXfermode) SK_DEFINE_FLATTENABLE_REGISTRAR_ENTRY(SkSrcXfermode) SK_DEFINE_FLATTENABLE_REGISTRAR_ENTRY(SkDstInXfermode) SK_DEFINE_FLATTENABLE_REGISTRAR_ENTRY(SkDstOutXfermode) -#if !SK_ARM_NEON_IS_NONE - SK_DEFINE_FLATTENABLE_REGISTRAR_ENTRY(SkNEONProcCoeffXfermode) -#endif SK_DEFINE_FLATTENABLE_REGISTRAR_GROUP_END diff --git a/src/core/SkXfermode_proccoeff.h b/src/core/SkXfermode_proccoeff.h index 23a83f2c0d..60ebe3ff4c 100644 --- a/src/core/SkXfermode_proccoeff.h +++ b/src/core/SkXfermode_proccoeff.h @@ -53,10 +53,6 @@ protected: virtual void flatten(SkFlattenableWriteBuffer& buffer) const SK_OVERRIDE; - Mode getMode() const { - return fMode; - } - private: Mode fMode; Coeff fSrcCoeff, fDstCoeff; diff --git a/src/opts/SkColor_opts_neon.h b/src/opts/SkColor_opts_neon.h index cd9e8133e2..7e3057d14c 100644 --- a/src/opts/SkColor_opts_neon.h +++ b/src/opts/SkColor_opts_neon.h @@ -3,30 +3,9 @@ #include "SkTypes.h" -#include - #define NEON_A (SK_A32_SHIFT / 8) #define NEON_R (SK_R32_SHIFT / 8) #define NEON_G (SK_G32_SHIFT / 8) #define NEON_B (SK_B32_SHIFT / 8) -static inline uint16x8_t SkAlpha255To256_neon8(uint8x8_t alpha) { - return vaddw_u8(vdupq_n_u16(1), alpha); -} - -static inline uint8x8_t SkAlphaMul_neon8(uint8x8_t color, uint16x8_t scale) { - return vshrn_n_u16(vmovl_u8(color) * scale, 8); -} - -static inline uint8x8x4_t SkAlphaMulQ_neon8(uint8x8x4_t color, uint16x8_t scale) { - uint8x8x4_t ret; - - ret.val[NEON_A] = SkAlphaMul_neon8(color.val[NEON_A], scale); - ret.val[NEON_R] = SkAlphaMul_neon8(color.val[NEON_R], scale); - ret.val[NEON_G] = SkAlphaMul_neon8(color.val[NEON_G], scale); - ret.val[NEON_B] = SkAlphaMul_neon8(color.val[NEON_B], scale); - - return ret; -} - #endif /* #ifndef SkColor_opts_neon_DEFINED */ diff --git a/src/opts/SkXfermode_opts_arm.cpp b/src/opts/SkXfermode_opts_arm.cpp index eb3b3016e3..db5d5317e3 100644 --- a/src/opts/SkXfermode_opts_arm.cpp +++ b/src/opts/SkXfermode_opts_arm.cpp @@ -1,16 +1,158 @@ #include "SkXfermode.h" #include "SkXfermode_proccoeff.h" +#include "SkColorPriv.h" #include "SkUtilsArm.h" -extern SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& rec, - SkXfermode::Mode mode); +#if !SK_ARM_NEON_IS_NONE -SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl(const ProcCoeff& rec, - SkXfermode::Mode mode) { - return NULL; +#include + +//////////////////////////////////////////////////////////////////////////////// + +typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst); + +class SkNEONProcCoeffXfermode : public SkProcCoeffXfermode { +public: + SkNEONProcCoeffXfermode(const ProcCoeff& rec, SkXfermode::Mode mode, + SkXfermodeProcSIMD procSIMD) + : INHERITED(rec, mode), fProcSIMD(procSIMD) {} + + virtual void xfer32(SkPMColor dst[], const SkPMColor src[], int count, + const SkAlpha aa[]) const SK_OVERRIDE; + + SK_DEVELOPER_TO_STRING() + SK_DECLARE_PUBLIC_FLATTENABLE_DESERIALIZATION_PROCS(SkNEONProcCoeffXfermode) + +private: + SkNEONProcCoeffXfermode(SkFlattenableReadBuffer& buffer) + : INHERITED(buffer) { + + fProcSIMD = NULL; + if (!buffer.isCrossProcess()) { + fProcSIMD = (SkXfermodeProcSIMD)buffer.readFunctionPtr(); + } + } + + virtual void flatten(SkFlattenableWriteBuffer& buffer) const SK_OVERRIDE; + + SkXfermodeProcSIMD fProcSIMD; + typedef SkProcCoeffXfermode INHERITED; +}; + + +void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[], + int count, const SkAlpha aa[]) const { + SkASSERT(dst && src && count >= 0); + + SkXfermodeProc proc = this->getProc(); + SkXfermodeProcSIMD procSIMD = fProcSIMD; + + if (NULL == aa) { + // Unrolled NEON code + while (count >= 8) { + uint8x8x4_t vsrc, vdst, vres; + + asm volatile ( + "vld4.u8 %h[vsrc], [%[src]]! \t\n" + "vld4.u8 %h[vdst], [%[dst]] \t\n" + : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst) + : [src] "r" (src), [dst] "r" (dst) + : + ); + + vres = procSIMD(vsrc, vdst); + + vst4_u8((uint8_t*)dst, vres); + + count -= 8; + dst += 8; + } + // Leftovers + for (int i = 0; i < count; i++) { + dst[i] = proc(src[i], dst[i]); + } + } else { + for (int i = count - 1; i >= 0; --i) { + unsigned a = aa[i]; + if (0 != a) { + SkPMColor dstC = dst[i]; + SkPMColor C = proc(src[i], dstC); + if (a != 0xFF) { + C = SkFourByteInterp(C, dstC, a); + } + dst[i] = C; + } + } + } } +#ifdef SK_DEVELOPER +void SkNEONProcCoeffXfermode::toString(SkString* str) const { + this->INHERITED::toString(str); +} +#endif + +void SkNEONProcCoeffXfermode::flatten(SkFlattenableWriteBuffer& buffer) const { + this->INHERITED::flatten(buffer); + if (!buffer.isCrossProcess()) { + buffer.writeFunctionPtr((void*)fProcSIMD); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +SkXfermodeProcSIMD gNEONXfermodeProcs[] = { + [SkXfermode::kClear_Mode] = NULL, + [SkXfermode::kSrc_Mode] = NULL, + [SkXfermode::kDst_Mode] = NULL, + [SkXfermode::kSrcOver_Mode] = NULL, + [SkXfermode::kDstOver_Mode] = NULL, + [SkXfermode::kSrcIn_Mode] = NULL, + [SkXfermode::kDstIn_Mode] = NULL, + [SkXfermode::kSrcOut_Mode] = NULL, + [SkXfermode::kDstOut_Mode] = NULL, + [SkXfermode::kSrcATop_Mode] = NULL, + [SkXfermode::kDstATop_Mode] = NULL, + [SkXfermode::kXor_Mode] = NULL, + [SkXfermode::kPlus_Mode] = NULL, + [SkXfermode::kModulate_Mode]= NULL, + [SkXfermode::kScreen_Mode] = NULL, + + [SkXfermode::kOverlay_Mode] = NULL, + [SkXfermode::kDarken_Mode] = NULL, + [SkXfermode::kLighten_Mode] = NULL, + [SkXfermode::kColorDodge_Mode] = NULL, + [SkXfermode::kColorBurn_Mode] = NULL, + [SkXfermode::kHardLight_Mode] = NULL, + [SkXfermode::kSoftLight_Mode] = NULL, + [SkXfermode::kDifference_Mode] = NULL, + [SkXfermode::kExclusion_Mode] = NULL, + [SkXfermode::kMultiply_Mode] = NULL, + + [SkXfermode::kHue_Mode] = NULL, + [SkXfermode::kSaturation_Mode] = NULL, + [SkXfermode::kColor_Mode] = NULL, + [SkXfermode::kLuminosity_Mode] = NULL, +}; + +SK_COMPILE_ASSERT( + SK_ARRAY_COUNT(gNEONXfermodeProcs) == SkXfermode::kLastMode + 1, + mode_count_arm +); + +#endif + SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec, SkXfermode::Mode mode) { - return SK_ARM_NEON_WRAP(SkPlatformXfermodeFactory_impl)(rec, mode); +#if !SK_ARM_NEON_IS_NONE + #if SK_ARM_NEON_IS_DYNAMIC + if ((sk_cpu_arm_has_neon()) && (gNEONXfermodeProcs[mode] != NULL)) { + #elif SK_ARM_NEON_IS_ALWAYS + if (gNEONXfermodeProcs[mode] != NULL) { + #endif + return SkNEW_ARGS(SkNEONProcCoeffXfermode, + (rec, mode, gNEONXfermodeProcs[mode])); + } +#endif + return NULL; } diff --git a/src/opts/SkXfermode_opts_arm_neon.cpp b/src/opts/SkXfermode_opts_arm_neon.cpp deleted file mode 100644 index 32490f7ff4..0000000000 --- a/src/opts/SkXfermode_opts_arm_neon.cpp +++ /dev/null @@ -1,673 +0,0 @@ -#include "SkXfermode.h" -#include "SkXfermode_proccoeff.h" -#include "SkColorPriv.h" - -#include -#include "SkColor_opts_neon.h" -#include "SkXfermode_opts_arm_neon.h" - -#define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) - - -//////////////////////////////////////////////////////////////////////////////// -// NEONized skia functions -//////////////////////////////////////////////////////////////////////////////// - -static inline uint8x8_t SkAlphaMulAlpha_neon8(uint8x8_t color, uint8x8_t alpha) { - uint16x8_t tmp; - uint8x8_t ret; - - tmp = vmull_u8(color, alpha); - tmp = vaddq_u16(tmp, vdupq_n_u16(128)); - tmp = vaddq_u16(tmp, vshrq_n_u16(tmp, 8)); - - ret = vshrn_n_u16(tmp, 8); - - return ret; -} - -static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alpha) { - uint16x8_t ret; - - ret = vmull_u8(color, alpha); - ret = vaddq_u16(ret, vdupq_n_u16(128)); - ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); - - ret = vshrq_n_u16(ret, 8); - - return ret; -} - -static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { - uint16x8_t tmp; - - tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), - vmovn_u32(vreinterpretq_u32_s32(p2))); - - tmp += vdupq_n_u16(128); - tmp += vshrq_n_u16(tmp, 8); - - return vshrn_n_u16(tmp, 8); -} - -static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { - prod += vdupq_n_u16(128); - prod += vshrq_n_u16(prod, 8); - - return vshrq_n_u16(prod, 8); -} - -static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) { - uint8x8_t ret; - uint32x4_t cmp1, cmp2; - uint16x8_t cmp16; - uint8x8_t cmp8, cmp8_1; - - // Test if <= 0 - cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); - cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); - cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); - cmp8_1 = vmovn_u16(cmp16); - - // Init to zero - ret = vdup_n_u8(0); - - // Test if >= 255*255 - cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); - cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); - cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); - cmp8 = vmovn_u16(cmp16); - - // Insert 255 where true - ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); - - // Calc SkDiv255Round - uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); - - // Insert where false and previous test false - cmp8 = cmp8 | cmp8_1; - ret = vbsl_u8(cmp8, ret, div); - - // Return the final combination - return ret; -} - -//////////////////////////////////////////////////////////////////////////////// -// 8 pixels modeprocs -//////////////////////////////////////////////////////////////////////////////// - -uint8x8x4_t dstover_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - uint16x8_t src_scale; - - src_scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]); - - ret.val[NEON_A] = dst.val[NEON_A] + SkAlphaMul_neon8(src.val[NEON_A], src_scale); - ret.val[NEON_R] = dst.val[NEON_R] + SkAlphaMul_neon8(src.val[NEON_R], src_scale); - ret.val[NEON_G] = dst.val[NEON_G] + SkAlphaMul_neon8(src.val[NEON_G], src_scale); - ret.val[NEON_B] = dst.val[NEON_B] + SkAlphaMul_neon8(src.val[NEON_B], src_scale); - - return ret; -} - -uint8x8x4_t srcin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - uint16x8_t scale; - - scale = SkAlpha255To256_neon8(dst.val[NEON_A]); - - ret.val[NEON_A] = SkAlphaMul_neon8(src.val[NEON_A], scale); - ret.val[NEON_R] = SkAlphaMul_neon8(src.val[NEON_R], scale); - ret.val[NEON_G] = SkAlphaMul_neon8(src.val[NEON_G], scale); - ret.val[NEON_B] = SkAlphaMul_neon8(src.val[NEON_B], scale); - - return ret; -} - -uint8x8x4_t dstin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - uint16x8_t scale; - - scale = SkAlpha255To256_neon8(src.val[NEON_A]); - - ret = SkAlphaMulQ_neon8(dst, scale); - - return ret; -} - -uint8x8x4_t srcout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]); - - ret = SkAlphaMulQ_neon8(src, scale); - - return ret; -} - -uint8x8x4_t dstout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), src.val[NEON_A]); - - ret = SkAlphaMulQ_neon8(dst, scale); - - return ret; -} - -uint8x8x4_t srcatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - uint8x8_t isa; - - isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]); - - ret.val[NEON_A] = dst.val[NEON_A]; - ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_A]) - + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa); - ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_A]) - + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa); - ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_A]) - + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa); - - return ret; -} - -uint8x8x4_t dstatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - uint8x8_t ida; - - ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]); - - ret.val[NEON_A] = src.val[NEON_A]; - ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida) - + SkAlphaMulAlpha_neon8(dst.val[NEON_R], src.val[NEON_A]); - ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida) - + SkAlphaMulAlpha_neon8(dst.val[NEON_G], src.val[NEON_A]); - ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida) - + SkAlphaMulAlpha_neon8(dst.val[NEON_B], src.val[NEON_A]); - - return ret; -} - -uint8x8x4_t xor_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - uint8x8_t isa, ida; - uint16x8_t tmp_wide, tmp_wide2; - - isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]); - ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]); - - // First calc alpha - tmp_wide = vmovl_u8(src.val[NEON_A]); - tmp_wide = vaddw_u8(tmp_wide, dst.val[NEON_A]); - tmp_wide2 = vshll_n_u8(SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]), 1); - tmp_wide = vsubq_u16(tmp_wide, tmp_wide2); - ret.val[NEON_A] = vmovn_u16(tmp_wide); - - // Then colors - ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida) - + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa); - ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida) - + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa); - ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida) - + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa); - - return ret; -} - -uint8x8x4_t plus_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - - ret.val[NEON_A] = vqadd_u8(src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_R] = vqadd_u8(src.val[NEON_R], dst.val[NEON_R]); - ret.val[NEON_G] = vqadd_u8(src.val[NEON_G], dst.val[NEON_G]); - ret.val[NEON_B] = vqadd_u8(src.val[NEON_B], dst.val[NEON_B]); - - return ret; -} - -uint8x8x4_t modulate_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - - ret.val[NEON_A] = SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_R]); - ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_G]); - ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_B]); - - return ret; -} - -static inline uint8x8_t srcover_color(uint8x8_t a, uint8x8_t b) { - uint16x8_t tmp; - - tmp = vaddl_u8(a, b); - tmp -= SkAlphaMulAlpha_neon8_16(a, b); - - return vmovn_u16(tmp); -} - -uint8x8x4_t screen_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - - ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_R] = srcover_color(src.val[NEON_R], dst.val[NEON_R]); - ret.val[NEON_G] = srcover_color(src.val[NEON_G], dst.val[NEON_G]); - ret.val[NEON_B] = srcover_color(src.val[NEON_B], dst.val[NEON_B]); - - return ret; -} - -template -static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc, - uint8x8_t sa, uint8x8_t da) { - /* - * In the end we're gonna use (rc + tmp) with a different rc - * coming from an alternative. - * The whole value (rc + tmp) can always be expressed as - * VAL = COM - SUB in the if case - * VAL = COM + SUB - sa*da in the else case - * - * with COM = 255 * (sc + dc) - * and SUB = sc*da + dc*sa - 2*dc*sc - */ - - // Prepare common subexpressions - uint16x8_t const255 = vdupq_n_u16(255); - uint16x8_t sc_plus_dc = vaddl_u8(sc, dc); - uint16x8_t scda = vmull_u8(sc, da); - uint16x8_t dcsa = vmull_u8(dc, sa); - uint16x8_t sada = vmull_u8(sa, da); - - // Prepare non common subexpressions - uint16x8_t dc2, sc2; - uint32x4_t scdc2_1, scdc2_2; - if (overlay) { - dc2 = vshll_n_u8(dc, 1); - scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); - scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); - } else { - sc2 = vshll_n_u8(sc, 1); - scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); - scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); - } - - // Calc COM - int32x4_t com1, com2; - com1 = vreinterpretq_s32_u32( - vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); - com2 = vreinterpretq_s32_u32( - vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); - - // Calc SUB - int32x4_t sub1, sub2; - sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa))); - sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa))); - sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); - sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); - - // Compare 2*dc <= da - uint16x8_t cmp; - - if (overlay) { - cmp = vcleq_u16(dc2, vmovl_u8(da)); - } else { - cmp = vcleq_u16(sc2, vmovl_u8(sa)); - } - - // Prepare variables - int32x4_t val1_1, val1_2; - int32x4_t val2_1, val2_2; - uint32x4_t cmp1, cmp2; - - cmp1 = vmovl_u16(vget_low_u16(cmp)); - cmp1 |= vshlq_n_u32(cmp1, 16); - cmp2 = vmovl_u16(vget_high_u16(cmp)); - cmp2 |= vshlq_n_u32(cmp2, 16); - - // Calc COM - SUB - val1_1 = com1 - sub1; - val1_2 = com2 - sub2; - - // Calc COM + SUB - sa*da - val2_1 = com1 + sub1; - val2_2 = com2 + sub2; - - val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada)))); - val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada)))); - - // Insert where needed - val1_1 = vbslq_s32(cmp1, val1_1, val2_1); - val1_2 = vbslq_s32(cmp2, val1_2, val2_2); - - // Call the clamp_div255round function - return clamp_div255round_simd8_32(val1_1, val1_2); -} - -static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, - uint8x8_t sa, uint8x8_t da) { - return overlay_hardlight_color(sc, dc, sa, da); -} - -uint8x8x4_t overlay_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - - ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_R] = overlay_color(src.val[NEON_R], dst.val[NEON_R], - src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_G] = overlay_color(src.val[NEON_G], dst.val[NEON_G], - src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_B] = overlay_color(src.val[NEON_B], dst.val[NEON_B], - src.val[NEON_A], dst.val[NEON_A]); - - return ret; -} - -template -static inline uint8x8_t lighten_darken_color(uint8x8_t sc, uint8x8_t dc, - uint8x8_t sa, uint8x8_t da) { - uint16x8_t sd, ds, cmp, tmp, tmp2; - - // Prepare - sd = vmull_u8(sc, da); - ds = vmull_u8(dc, sa); - - // Do test - if (lighten) { - cmp = vcgtq_u16(sd, ds); - } else { - cmp = vcltq_u16(sd, ds); - } - - // Assign if - tmp = vaddl_u8(sc, dc); - tmp2 = tmp; - tmp -= SkDiv255Round_neon8_16_16(ds); - - // Calc else - tmp2 -= SkDiv255Round_neon8_16_16(sd); - - // Insert where needed - tmp = vbslq_u16(cmp, tmp, tmp2); - - return vmovn_u16(tmp); -} - -static inline uint8x8_t darken_color(uint8x8_t sc, uint8x8_t dc, - uint8x8_t sa, uint8x8_t da) { - return lighten_darken_color(sc, dc, sa, da); -} - -uint8x8x4_t darken_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - - ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_R] = darken_color(src.val[NEON_R], dst.val[NEON_R], - src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_G] = darken_color(src.val[NEON_G], dst.val[NEON_G], - src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_B] = darken_color(src.val[NEON_B], dst.val[NEON_B], - src.val[NEON_A], dst.val[NEON_A]); - - return ret; -} - -static inline uint8x8_t lighten_color(uint8x8_t sc, uint8x8_t dc, - uint8x8_t sa, uint8x8_t da) { - return lighten_darken_color(sc, dc, sa, da); -} - -uint8x8x4_t lighten_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - - ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_R] = lighten_color(src.val[NEON_R], dst.val[NEON_R], - src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_G] = lighten_color(src.val[NEON_G], dst.val[NEON_G], - src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_B] = lighten_color(src.val[NEON_B], dst.val[NEON_B], - src.val[NEON_A], dst.val[NEON_A]); - - return ret; -} - -static inline uint8x8_t hardlight_color(uint8x8_t sc, uint8x8_t dc, - uint8x8_t sa, uint8x8_t da) { - return overlay_hardlight_color(sc, dc, sa, da); -} - -uint8x8x4_t hardlight_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - - ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_R] = hardlight_color(src.val[NEON_R], dst.val[NEON_R], - src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_G] = hardlight_color(src.val[NEON_G], dst.val[NEON_G], - src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_B] = hardlight_color(src.val[NEON_B], dst.val[NEON_B], - src.val[NEON_A], dst.val[NEON_A]); - - return ret; -} - -static inline uint8x8_t difference_color(uint8x8_t sc, uint8x8_t dc, - uint8x8_t sa, uint8x8_t da) { - uint16x8_t sd, ds, tmp; - int16x8_t val; - - sd = vmull_u8(sc, da); - ds = vmull_u8(dc, sa); - - tmp = vminq_u16(sd, ds); - tmp = SkDiv255Round_neon8_16_16(tmp); - tmp = vshlq_n_u16(tmp, 1); - - val = vreinterpretq_s16_u16(vaddl_u8(sc, dc)); - - val -= vreinterpretq_s16_u16(tmp); - - val = vmaxq_s16(val, vdupq_n_s16(0)); - val = vminq_s16(val, vdupq_n_s16(255)); - - return vmovn_u16(vreinterpretq_u16_s16(val)); -} - -uint8x8x4_t difference_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - - ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_R] = difference_color(src.val[NEON_R], dst.val[NEON_R], - src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_G] = difference_color(src.val[NEON_G], dst.val[NEON_G], - src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_B] = difference_color(src.val[NEON_B], dst.val[NEON_B], - src.val[NEON_A], dst.val[NEON_A]); - - return ret; -} - -static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc, - uint8x8_t sa, uint8x8_t da) { - /* The equation can be simplified to 255(sc + dc) - 2 * sc * dc */ - - uint16x8_t sc_plus_dc, scdc, const255; - int32x4_t term1_1, term1_2, term2_1, term2_2; - - /* Calc (sc + dc) and (sc * dc) */ - sc_plus_dc = vaddl_u8(sc, dc); - scdc = vmull_u8(sc, dc); - - /* Prepare constants */ - const255 = vdupq_n_u16(255); - - /* Calc the first term */ - term1_1 = vreinterpretq_s32_u32( - vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); - term1_2 = vreinterpretq_s32_u32( - vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); - - /* Calc the second term */ - term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); - term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); - - return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); -} - -uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - - ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], - src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], - src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], - src.val[NEON_A], dst.val[NEON_A]); - - return ret; -} - -static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, - uint8x8_t sa, uint8x8_t da) { - uint32x4_t val1, val2; - uint16x8_t scdc, t1, t2; - - t1 = vmull_u8(sc, vdup_n_u8(255) - da); - t2 = vmull_u8(dc, vdup_n_u8(255) - sa); - scdc = vmull_u8(sc, dc); - - val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); - val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); - - val1 = vaddw_u16(val1, vget_low_u16(scdc)); - val2 = vaddw_u16(val2, vget_high_u16(scdc)); - - return clamp_div255round_simd8_32( - vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); -} - -uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { - uint8x8x4_t ret; - - ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], - src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_G] = blendfunc_multiply_color(src.val[NEON_G], dst.val[NEON_G], - src.val[NEON_A], dst.val[NEON_A]); - ret.val[NEON_B] = blendfunc_multiply_color(src.val[NEON_B], dst.val[NEON_B], - src.val[NEON_A], dst.val[NEON_A]); - - return ret; -} - -//////////////////////////////////////////////////////////////////////////////// - -typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst); - -extern SkXfermodeProcSIMD gNEONXfermodeProcs[]; - -SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkFlattenableReadBuffer& buffer) - : INHERITED(buffer) { - fProcSIMD = reinterpret_cast(gNEONXfermodeProcs[this->getMode()]); -} - -void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[], - int count, const SkAlpha aa[]) const { - SkASSERT(dst && src && count >= 0); - - SkXfermodeProc proc = this->getProc(); - SkXfermodeProcSIMD procSIMD = reinterpret_cast(fProcSIMD); - - if (NULL == aa) { - // Unrolled NEON code - while (count >= 8) { - uint8x8x4_t vsrc, vdst, vres; - - asm volatile ( - "vld4.u8 %h[vsrc], [%[src]]! \t\n" - "vld4.u8 %h[vdst], [%[dst]] \t\n" - : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+r" (src) - : [dst] "r" (dst) - : - ); - - vres = procSIMD(vsrc, vdst); - - vst4_u8((uint8_t*)dst, vres); - - count -= 8; - dst += 8; - } - // Leftovers - for (int i = 0; i < count; i++) { - dst[i] = proc(src[i], dst[i]); - } - } else { - for (int i = count - 1; i >= 0; --i) { - unsigned a = aa[i]; - if (0 != a) { - SkPMColor dstC = dst[i]; - SkPMColor C = proc(src[i], dstC); - if (a != 0xFF) { - C = SkFourByteInterp(C, dstC, a); - } - dst[i] = C; - } - } - } -} - -#ifdef SK_DEVELOPER -void SkNEONProcCoeffXfermode::toString(SkString* str) const { - this->INHERITED::toString(str); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// - -SkXfermodeProcSIMD gNEONXfermodeProcs[] = { - [SkXfermode::kClear_Mode] = NULL, - [SkXfermode::kSrc_Mode] = NULL, - [SkXfermode::kDst_Mode] = NULL, - [SkXfermode::kSrcOver_Mode] = NULL, - [SkXfermode::kDstOver_Mode] = dstover_modeproc_neon8, - [SkXfermode::kSrcIn_Mode] = srcin_modeproc_neon8, - [SkXfermode::kDstIn_Mode] = dstin_modeproc_neon8, - [SkXfermode::kSrcOut_Mode] = srcout_modeproc_neon8, - [SkXfermode::kDstOut_Mode] = dstout_modeproc_neon8, - [SkXfermode::kSrcATop_Mode] = srcatop_modeproc_neon8, - [SkXfermode::kDstATop_Mode] = dstatop_modeproc_neon8, - [SkXfermode::kXor_Mode] = xor_modeproc_neon8, - [SkXfermode::kPlus_Mode] = plus_modeproc_neon8, - [SkXfermode::kModulate_Mode]= modulate_modeproc_neon8, - [SkXfermode::kScreen_Mode] = screen_modeproc_neon8, - - [SkXfermode::kOverlay_Mode] = overlay_modeproc_neon8, - [SkXfermode::kDarken_Mode] = darken_modeproc_neon8, - [SkXfermode::kLighten_Mode] = lighten_modeproc_neon8, - [SkXfermode::kColorDodge_Mode] = NULL, - [SkXfermode::kColorBurn_Mode] = NULL, - [SkXfermode::kHardLight_Mode] = hardlight_modeproc_neon8, - [SkXfermode::kSoftLight_Mode] = NULL, - [SkXfermode::kDifference_Mode] = difference_modeproc_neon8, - [SkXfermode::kExclusion_Mode] = exclusion_modeproc_neon8, - [SkXfermode::kMultiply_Mode] = multiply_modeproc_neon8, - - [SkXfermode::kHue_Mode] = NULL, - [SkXfermode::kSaturation_Mode] = NULL, - [SkXfermode::kColor_Mode] = NULL, - [SkXfermode::kLuminosity_Mode] = NULL, -}; - -SK_COMPILE_ASSERT( - SK_ARRAY_COUNT(gNEONXfermodeProcs) == SkXfermode::kLastMode + 1, - mode_count_arm -); - -SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& rec, - SkXfermode::Mode mode) { - - void* procSIMD = reinterpret_cast(gNEONXfermodeProcs[mode]); - - if (procSIMD != NULL) { - return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); - } - return NULL; -} diff --git a/src/opts/SkXfermode_opts_arm_neon.h b/src/opts/SkXfermode_opts_arm_neon.h deleted file mode 100644 index 702b2160a7..0000000000 --- a/src/opts/SkXfermode_opts_arm_neon.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef SkXfermode_opts_arm_neon_DEFINED -#define SkXfermode_opts_arm_neon_DEFINED - -#include "SkXfermode_proccoeff.h" - -class SkNEONProcCoeffXfermode : public SkProcCoeffXfermode { -public: - SkNEONProcCoeffXfermode(const ProcCoeff& rec, SkXfermode::Mode mode, - void* procSIMD) - : INHERITED(rec, mode), fProcSIMD(procSIMD) {} - - virtual void xfer32(SkPMColor dst[], const SkPMColor src[], int count, - const SkAlpha aa[]) const SK_OVERRIDE; - - SK_DEVELOPER_TO_STRING() - SK_DECLARE_PUBLIC_FLATTENABLE_DESERIALIZATION_PROCS(SkNEONProcCoeffXfermode) - -private: - SkNEONProcCoeffXfermode(SkFlattenableReadBuffer& buffer); - - // void* is used to avoid pulling arm_neon.h in the core and having to build - // it with -mfpu=neon. - void* fProcSIMD; - typedef SkProcCoeffXfermode INHERITED; -}; - -#endif //#ifdef SkXfermode_opts_arm_neon_DEFINED