diff --git a/bench/SkBenchmark.cpp b/bench/SkBenchmark.cpp index 8dd66f06b9..36280ac716 100644 --- a/bench/SkBenchmark.cpp +++ b/bench/SkBenchmark.cpp @@ -7,6 +7,7 @@ SkBenchmark::SkBenchmark(void* defineDict) { fDict = reinterpret_cast*>(defineDict); fForceAlpha = 0xFF; fForceAA = true; + fDither = SkTriState::kDefault; } const char* SkBenchmark::getName() { @@ -25,6 +26,10 @@ void SkBenchmark::setupPaint(SkPaint* paint) { paint->setAlpha(fForceAlpha); paint->setAntiAlias(fForceAA); paint->setFilterBitmap(fForceFilter); + + if (SkTriState::kDefault != fDither) { + paint->setDither(SkTriState::kTrue == fDither); + } } const char* SkBenchmark::findDefine(const char* key) const { diff --git a/bench/SkBenchmark.h b/bench/SkBenchmark.h index 5ecff3b84c..a9c4085eb8 100644 --- a/bench/SkBenchmark.h +++ b/bench/SkBenchmark.h @@ -9,6 +9,15 @@ class SkCanvas; class SkPaint; +class SkTriState { +public: + enum State { + kDefault, + kTrue, + kFalse + }; +}; + class SkBenchmark : public SkRefCnt { public: SkBenchmark(void* defineDict); @@ -28,6 +37,10 @@ public: void setForceFilter(bool filter) { fForceFilter = filter; } + + void setDither(SkTriState::State state) { + fDither = state; + } const char* findDefine(const char* key) const; @@ -44,6 +57,7 @@ private: int fForceAlpha; bool fForceAA; bool fForceFilter; + SkTriState::State fDither; }; static inline SkIPoint SkMakeIPoint(int x, int y) { diff --git a/bench/benchmain.cpp b/bench/benchmain.cpp index 8ccb373eb9..23a0c8598a 100644 --- a/bench/benchmain.cpp +++ b/bench/benchmain.cpp @@ -193,6 +193,7 @@ int main (int argc, char * const argv[]) { int forceAlpha = 0xFF; bool forceAA = true; bool forceFilter = false; + SkTriState::State forceDither = SkTriState::kDefault; bool doScale = false; bool doRotate = false; bool doClip = false; @@ -243,6 +244,13 @@ int main (int argc, char * const argv[]) { log_error("missing arg for -forceFilter\n"); return -1; } + } else if (strcmp(*argv, "-forceDither") == 0) { + bool tmp; + if (!parse_bool_arg(++argv, stop, &tmp)) { + log_error("missing arg for -forceDither\n"); + return -1; + } + forceDither = tmp ? SkTriState::kTrue : SkTriState::kFalse; } else if (strcmp(*argv, "-forceBlend") == 0) { bool wantAlpha = false; if (!parse_bool_arg(++argv, stop, &wantAlpha)) { @@ -303,6 +311,7 @@ int main (int argc, char * const argv[]) { bench->setForceAlpha(forceAlpha); bench->setForceAA(forceAA); bench->setForceFilter(forceFilter); + bench->setDither(forceDither); // only run benchmarks if their name contains matchStr if (matchStr && strstr(bench->getName(), matchStr) == NULL) { diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp index 0c38113adf..017f3e0875 100644 --- a/src/opts/SkBlitRow_opts_arm.cpp +++ b/src/opts/SkBlitRow_opts_arm.cpp @@ -20,6 +20,10 @@ #include "SkColorPriv.h" #include "SkDither.h" +#if defined(__ARM_HAVE_NEON) +#include +#endif + #if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN) static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, @@ -415,6 +419,297 @@ static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, /////////////////////////////////////////////////////////////////////////////// +#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN) + +static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, U8CPU alpha) { + + SkASSERT(255 == alpha); + if (count > 0) { + + /* do the NEON unrolled code */ +#define UNROLL 4 + while (count >= UNROLL) { + uint8x8_t src_raw, dst_raw, dst_final; + uint8x8_t src_raw_2, dst_raw_2, dst_final_2; + uint8x8_t alpha_mask; + + /* use vtbl, with src_raw as the table */ + /* expect gcc to hoist alpha_mask setup above loop */ + alpha_mask = vdup_n_u8(3); + alpha_mask = vset_lane_u8(7, alpha_mask, 4); + alpha_mask = vset_lane_u8(7, alpha_mask, 5); + alpha_mask = vset_lane_u8(7, alpha_mask, 6); + alpha_mask = vset_lane_u8(7, alpha_mask, 7); + + /* get the source */ + src_raw = vreinterpret_u8_u32(vld1_u32(src)); +#if UNROLL > 2 + src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); +#endif + + /* get and hold the dst too */ + dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); +#if UNROLL > 2 + dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); +#endif + +#if 1 + /* 1st and 2nd bits of the unrolling */ + { + uint8x8_t dst_cooked; + uint16x8_t dst_wide; + uint8x8_t alpha_narrow; + uint16x8_t alpha_wide; + + /* get the alphas spread out properly */ + alpha_narrow = vtbl1_u8(src_raw, alpha_mask); + alpha_narrow = vsub_u8(vdup_n_u8(255), alpha_narrow); + alpha_wide = vmovl_u8(alpha_narrow); + alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); + + /* get the dest, spread it */ + dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); + dst_wide = vmovl_u8(dst_raw); + + /* alpha mul the dest */ + dst_wide = vmulq_u16 (dst_wide, alpha_wide); + dst_cooked = vshrn_n_u16(dst_wide, 8); + + /* sum -- ignoring any byte lane overflows */ + dst_final = vadd_u8(src_raw, dst_cooked); + } +#endif + +#if UNROLL > 2 + /* the 3rd and 4th bits of our unrolling */ + { + uint8x8_t dst_cooked; + uint16x8_t dst_wide; + uint8x8_t alpha_narrow; + uint16x8_t alpha_wide; + + alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); + alpha_narrow = vsub_u8(vdup_n_u8(255), alpha_narrow); + alpha_wide = vmovl_u8(alpha_narrow); + alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); + + /* get the dest, spread it */ + dst_wide = vmovl_u8(dst_raw_2); + + /* alpha mul the dest */ + dst_wide = vmulq_u16 (dst_wide, alpha_wide); + dst_cooked = vshrn_n_u16(dst_wide, 8); + + /* sum -- ignoring any byte lane overflows */ + dst_final_2 = vadd_u8(src_raw_2, dst_cooked); + } +#endif + + vst1_u32(dst, vreinterpret_u32_u8(dst_final)); +#if UNROLL > 2 + vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); +#endif + + src += UNROLL; + dst += UNROLL; + count -= UNROLL; + } +#undef UNROLL + + /* do any residual iterations */ + while (--count >= 0) { +#ifdef TEST_SRC_ALPHA + SkPMColor sc = *src; + if (sc) { + unsigned srcA = SkGetPackedA32(sc); + SkPMColor result = sc; + if (srcA != 255) { + result = SkPMSrcOver(sc, *dst); + } + *dst = result; + } +#else + *dst = SkPMSrcOver(*src, *dst); +#endif + src += 1; + dst += 1; + } + } +} + +#define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_neon +#else +#define S32A_Opaque_BlitRow32_PROC NULL +#endif + +/* Neon version of S32_Blend_BlitRow32() + * portable version is in core/SkBlitRow_D32.cpp + */ +#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN) +static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, U8CPU alpha) { + SkASSERT(alpha <= 255); + if (count > 0) { + uint16_t src_scale = SkAlpha255To256(alpha); + uint16_t dst_scale = 256 - src_scale; + + /* run them N at a time through the NEON unit */ + /* note that each 1 is 4 bytes, each treated exactly the same, + * so we can work under that guise. We *do* know that the src&dst + * will be 32-bit aligned quantities, so we can specify that on + * the load/store ops and do a neon 'reinterpret' to get us to + * byte-sized (pun intended) pieces that we widen/multiply/shift + * we're limited at 128 bits in the wide ops, which is 8x16bits + * or a pair of 32 bit src/dsts. + */ + /* we *could* manually unroll this loop so that we load 128 bits + * (as a pair of 64s) from each of src and dst, processing them + * in pieces. This might give us a little better management of + * the memory latency, but my initial attempts here did not + * produce an instruction stream that looked all that nice. + */ +#define UNROLL 2 + while (count >= UNROLL) { + uint8x8_t src_raw, dst_raw, dst_final; + uint16x8_t src_wide, dst_wide; + + /* get 64 bits of src, widen it, multiply by src_scale */ + src_raw = vreinterpret_u8_u32(vld1_u32(src)); + src_wide = vmovl_u8(src_raw); + /* gcc hoists vdupq_n_u16(), better code than vmulq_n_u16() */ + src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale)); + + /* ditto with dst */ + dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); + dst_wide = vmovl_u8(dst_raw); + dst_wide = vmulq_u16 (dst_wide, vdupq_n_u16(dst_scale)); + + /* sum (knowing it won't overflow 16 bits) and take high bits */ + dst_wide = vaddq_u16(dst_wide, src_wide); + dst_final = vshrn_n_u16(dst_wide, 8); + + vst1_u32(dst, vreinterpret_u32_u8(dst_final)); + + src += UNROLL; + dst += UNROLL; + count -= UNROLL; + } + /* RBE: well, i don't like how gcc manages src/dst across the above + * loop it's constantly calculating src+bias, dst+bias and it only + * adjusts the real ones when we leave the loop. Not sure why + * it's "hoisting down" (hoisting implies above in my lexicon ;)) + * the adjustments to src/dst/count, but it does... + * (might be SSA-style internal logic... + */ + +#if UNROLL == 2 + if (count == 1) { + *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); + } +#else + if (count > 0) { + do { + *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); + src += 1; + dst += 1; + } while (--count > 0); + } +#endif + +#undef UNROLL + } +} + +#define S32_Blend_BlitRow32_PROC S32_Blend_BlitRow32_neon +#else +#define S32_Blend_BlitRow32_PROC NULL +#endif + +/////////////////////////////////////////////////////////////////////////////// + +#if 0 && defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN) +/* RBE: working on this 2009/10/8 */ +static void S32A_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, U8CPU alpha, int x, int y) { + SkASSERT(255 == alpha); + + if (count > 0) { + DITHER_565_SCAN(y); + do { + SkPMColor c = *src++; + SkPMColorAssert(c); + /* RBE: make sure we don't generate wrong output if c==0 */ + if (c) { + + /* let's do a vld4 to get 64 bits (8 bytes) of each Argb */ + /* so we'll have 8 a's, 8 r's, etc */ + /* little endian: ABGR is the ordering (R at lsb) */ + unsigned a = SkGetPackedA32(c); + + // RBE: could load a table and do vtbl for these things + // DITHER_VALUE() masks x to 3 bits [0..7] before lookup, so can + // so 8x unrolling gets us perfectly aligned. + // and we could even avoid the vtbl at that point + /* d is 0..7 according to skia/core/SkDither.h asserts */ + int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); + + unsigned sr = SkGetPackedR32(c); + unsigned sg = SkGetPackedG32(c); + unsigned sb = SkGetPackedB32(c); + + /* R and B handled identically; G is a little different */ + + /* sr - (sr>>5) means that +d can NOT overflow */ + /* do (sr-(sr>>5)), followed by adding d -- stay in 8 bits */ + /* sr = sr+d - (sr>>5) */ + sr = SkDITHER_R32_FOR_565(sr, d); + /* calculate sr+(sr>>5) here, then add d */ + + /* sg = sg + (d>>1) - (sg>>6) */ + sg = SkDITHER_G32_FOR_565(sg, d); + /* sg>>6 could be '3' and d>>1 is <= 3, so we're ok */ + /* calculate sg-(sg>>6), then add "d>>1" */ + + + /* sb = sb+d - (sb>>5) */ + sb = SkDITHER_B32_FOR_565(sb, d); + /* calculate sb+(sb>>5) here, then add d */ + + + /* been dealing in 8x8 through here; gonna have to go to 8x16 */ + + /* need to pick up 8 dst's -- at 16 bits each, 256 bits */ + /* extract dst into 8x16's */ + /* blend */ + /* shift */ + /* reassemble */ + + uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); + uint32_t dst_expanded = SkExpand_rgb_16(*dst); + + // would be shifted by 8, but the >>3 makes it be just 5 + dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); + // now src and dst expanded are in g:11 r:10 x:1 b:10 + *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); + } + dst += 1; + /* RBE: a NOP with wide enough unrolling; wide_enough == 8 */ + DITHER_INC_X(x); + } while (--count != 0); + } +} + +#define S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon +#else +#define S32A_D565_Opaque_Dither_PROC NULL +#endif + +/////////////////////////////////////////////////////////////////////////////// + const SkBlitRow::Proc SkBlitRow::gPlatform_565_Procs[] = { // no dither S32_D565_Opaque_PROC, @@ -425,7 +720,7 @@ const SkBlitRow::Proc SkBlitRow::gPlatform_565_Procs[] = { // dither NULL, // S32_D565_Opaque_Dither, S32_D565_Blend_Dither_PROC, - NULL, // S32A_D565_Opaque_Dither, + S32A_D565_Opaque_Dither_PROC, NULL, // S32A_D565_Blend_Dither }; @@ -445,8 +740,8 @@ const SkBlitRow::Proc SkBlitRow::gPlatform_4444_Procs[] = { const SkBlitRow::Proc32 SkBlitRow::gPlatform_Procs32[] = { NULL, // S32_Opaque, - NULL, // S32_Blend, - NULL, // S32A_Opaque, + S32_Blend_BlitRow32_PROC, // S32_Blend, + S32A_Opaque_BlitRow32_PROC, // S32A_Opaque, NULL, // S32A_Blend, };