From 9db43ac4ee1a83a4f7b332fe6c00f592b6237349 Mon Sep 17 00:00:00 2001
From: mtklein
Date: Tue, 1 Dec 2015 07:10:21 -0800
Subject: [PATCH] Add Sk4f::ToBytes(uint8_t[16], Sk4f, Sk4f, Sk4f, Sk4f)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a big speedup for float -> byte. E.g. gradient_linear_clamp_3color:

    x86-64 147µs -> 103µs (Broadwell MBP)
    arm64 2.03ms -> 648µs (Galaxy S6)
    armv7 1.12ms -> 489µs (Galaxy S6, same device!)

BUG=skia:
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot;client.skia.android:Test-Android-GCC-Nexus9-CPU-Denver-Arm64-Debug-Trybot

Review URL: https://codereview.chromium.org/1483953002
---
 bench/Sk4fBench.cpp                        |  5 +----
 src/core/SkNx.h                            |  9 +++++++++
 src/effects/gradients/SkLinearGradient.cpp | 15 ++++++++++-----
 src/opts/SkNx_neon.h                       |  8 ++++++++
 src/opts/SkNx_sse.h                        |  9 +++++++++
 5 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/bench/Sk4fBench.cpp b/bench/Sk4fBench.cpp
index 02ac14591d..b2f2b4a58e 100644
--- a/bench/Sk4fBench.cpp
+++ b/bench/Sk4fBench.cpp
@@ -62,10 +62,7 @@ struct Sk4fGradientBench : public Benchmark {
              c = b + dcdx,
              d = c + dcdx;
         for (size_t i = 0; i < SK_ARRAY_COUNT(fDevice); i += 4) {
-            a.toBytes((uint8_t*)(fDevice+i+0));
-            b.toBytes((uint8_t*)(fDevice+i+1));
-            c.toBytes((uint8_t*)(fDevice+i+2));
-            d.toBytes((uint8_t*)(fDevice+i+3));
+            Sk4f::ToBytes((uint8_t*)(fDevice+i), a, b, c, d);
             a = a + dcdx4;
             b = b + dcdx4;
             c = c + dcdx4;
diff --git a/src/core/SkNx.h b/src/core/SkNx.h
index b9b67704d4..36a645cac1 100644
--- a/src/core/SkNx.h
+++ b/src/core/SkNx.h
@@ -113,6 +113,15 @@ public:
         fHi.toBytes(bytes+N/2);
     }
 
+    // Some implementations can do this faster.
+    static void ToBytes(uint8_t bytes[4*N],
+                        const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
+        a.toBytes(bytes+0*N);
+        b.toBytes(bytes+1*N);
+        c.toBytes(bytes+2*N);
+        d.toBytes(bytes+3*N);
+    }
+
     SkNx operator + (const SkNx& o) const { return SkNx(fLo + o.fLo, fHi + o.fHi); }
     SkNx operator - (const SkNx& o) const { return SkNx(fLo - o.fLo, fHi - o.fHi); }
     SkNx operator * (const SkNx& o) const { return SkNx(fLo * o.fLo, fHi * o.fHi); }
diff --git a/src/effects/gradients/SkLinearGradient.cpp b/src/effects/gradients/SkLinearGradient.cpp
index f47b6ab30e..b224474536 100644
--- a/src/effects/gradients/SkLinearGradient.cpp
+++ b/src/effects/gradients/SkLinearGradient.cpp
@@ -748,10 +748,15 @@ template <bool apply_alpha> void ramp(SkPMColor dstC[], int n, const Sk4f& c, co
     Sk4f cd2 = cd0 + dc2;
     Sk4f cd3 = cd1 + dc2;
     while (n >= 4) {
-        *dstC++ = trunc_from_255(cd0);
-        *dstC++ = trunc_from_255(cd1);
-        *dstC++ = trunc_from_255(cd2);
-        *dstC++ = trunc_from_255(cd3);
+        if (!apply_alpha) {
+            Sk4f::ToBytes((uint8_t*)dstC, cd0, cd1, cd2, cd3);
+            dstC += 4;
+        } else {
+            *dstC++ = trunc_from_255(cd0);
+            *dstC++ = trunc_from_255(cd1);
+            *dstC++ = trunc_from_255(cd2);
+            *dstC++ = trunc_from_255(cd3);
+        }
         cd0 = cd0 + dc4;
         cd1 = cd1 + dc4;
         cd2 = cd2 + dc4;
@@ -861,7 +866,7 @@ void SkLinearGradient::LinearGradientContext::shade4_dx_clamp(SkPMColor dstC[],
         ramp<apply_alpha>(dstC, n, c, dc, dither0, dither1);
         dstC += n;
         SkASSERT(dstC <= endDstC);
-        
+
         if (n & 1) {
             SkTSwap(dither0, dither1);
         }
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index a03f0be674..6fe6137e5f 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -165,6 +165,14 @@ public:
         vst1_lane_u32((uint32_t*)bytes, (uint32x2_t)fix8, 0);
     }
 
+    static void ToBytes(uint8_t bytes[16],
+                        const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
+        vst1q_u8(bytes, vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec),
+                                          (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0],
+                                 vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec),
+                                          (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0]).val[0]);
+    }
+
     SkNx approxInvert() const {
         float32x4_t est0 = vrecpeq_f32(fVec),
                     est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0);
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index f0ccd3f7b5..a4f8656536 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -132,6 +132,15 @@ public:
         *(int*)bytes = _mm_cvtsi128_si32(fix8);
     }
 
+    static void ToBytes(uint8_t bytes[16],
+                        const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
+        _mm_storeu_si128((__m128i*)bytes,
+                         _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec),
+                                                           _mm_cvttps_epi32(b.fVec)),
+                                          _mm_packus_epi16(_mm_cvttps_epi32(c.fVec),
+                                                           _mm_cvttps_epi32(d.fVec))));
+    }
+
     SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
     SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
     SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }