diff --git a/include/private/SkVx.h b/include/private/SkVx.h index 2a5c2445dc..7ada49337d 100644 --- a/include/private/SkVx.h +++ b/include/private/SkVx.h @@ -798,7 +798,7 @@ private: SI Vec<8,uint16_t> mull(const Vec<8,uint8_t>& x, const Vec<8,uint8_t>& y) { return to_vec<8,uint16_t>(vmull_u8(to_vext(x), - to_vext(y))); + to_vext(y))); } SIN std::enable_if_t<(N < 8), Vec> mull(const Vec& x, @@ -815,13 +815,37 @@ SIN std::enable_if_t<(N > 8), Vec> mull(const Vec& x, mull(x.hi, y.hi)); } +// Or do four u16*u16 -> u32 in one instruction, vmull_u16 +SI Vec<4,uint32_t> mull(const Vec<4,uint16_t>& x, + const Vec<4,uint16_t>& y) { + return to_vec<4,uint32_t>(vmull_u16(to_vext(x), + to_vext(y))); +} + +SIN std::enable_if_t<(N < 4), Vec> mull(const Vec& x, + const Vec& y) { + // N < 4 --> double up data until N == 4, returning the part we need. + return mull(join(x,x), + join(y,y)).lo; +} + +SIN std::enable_if_t<(N > 4), Vec> mull(const Vec& x, + const Vec& y) { + // N > 4 --> usual join(lo,hi) strategy to recurse down to N == 4. + return join(mull(x.lo, y.lo), + mull(x.hi, y.hi)); +} + #else -// Nothing special when we don't have NEON... just cast up to 16-bit and multiply. +// Nothing special when we don't have NEON... just cast up and multiply. SIN Vec mull(const Vec& x, - const Vec& y) { - return cast(x) - * cast(y); + const Vec& y) { + return cast(x) * cast(y); +} +SIN Vec mull(const Vec& x, + const Vec& y) { + return cast(x) * cast(y); } #endif diff --git a/src/core/BUILD.bazel b/src/core/BUILD.bazel index 7c8b9acd00..6dad199261 100644 --- a/src/core/BUILD.bazel +++ b/src/core/BUILD.bazel @@ -2647,10 +2647,10 @@ generated_cc_atom( ":SkMaskBlurFilter_hdr", "//include/core:SkColorPriv_hdr", "//include/private:SkMalloc_hdr", - "//include/private:SkNx_hdr", "//include/private:SkTPin_hdr", "//include/private:SkTemplates_hdr", "//include/private:SkTo_hdr", + "//include/private:SkVx_hdr", ], ) diff --git a/src/core/SkMaskBlurFilter.cpp b/src/core/SkMaskBlurFilter.cpp index ed79f702cc..67058e4dfb 100644 --- a/src/core/SkMaskBlurFilter.cpp +++ b/src/core/SkMaskBlurFilter.cpp @@ -9,10 +9,10 @@ #include "include/core/SkColorPriv.h" #include "include/private/SkMalloc.h" -#include "include/private/SkNx.h" #include "include/private/SkTPin.h" #include "include/private/SkTemplates.h" #include "include/private/SkTo.h" +#include "include/private/SkVx.h" #include "src/core/SkArenaAlloc.h" #include "src/core/SkGaussFilter.h" @@ -294,7 +294,15 @@ static void argb32_to_a8(uint8_t* a8, const uint8_t* from, int width) { } using ToA8 = decltype(bw_to_a8); -static Sk8h load(const uint8_t* from, int width, ToA8* toA8) { +using fp88 = skvx::Vec<8, uint16_t>; // 8-wide fixed point 8.8 + +static fp88 mulhi(const fp88& a, const fp88& b) { + // On NEON, this is optimal; with SSE, clang appears to detect the pattern and convert it to the + // optimal single instruction, _mm_mulhi_epu16. + return skvx::cast(mull(a, b) >> 16); +} + +static fp88 load(const uint8_t* from, int width, ToA8* toA8) { // Our fast path is a full 8-byte load of A8. // So we'll conditionally handle the two slow paths using tmp: // - if we have a function to convert another mask to A8, use it; @@ -311,11 +319,11 @@ static Sk8h load(const uint8_t* from, int width, ToA8* toA8) { } // Load A8 and convert to 8.8 fixed-point. - return SkNx_cast(Sk8b::Load(from)) << 8; + return skvx::cast(skvx::byte8::Load(from)) << 8; } -static void store(uint8_t* to, const Sk8h& v, int width) { - Sk8b b = SkNx_cast(v >> 8); +static void store(uint8_t* to, const fp88& v, int width) { + skvx::byte8 b = skvx::cast(v >> 8); if (width == 8) { b.store(to); } else { @@ -410,135 +418,131 @@ static constexpr uint16_t kHalf = 0x80u; // Where we rely on the compiler to generate efficient code for the {____, n, ....} notation. static void blur_x_radius_1( - const Sk8h& s0, - const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&, - Sk8h* d0, Sk8h* d8) { + const fp88& s0, + const fp88& g0, const fp88& g1, const fp88&, const fp88&, const fp88&, + fp88* d0, fp88* d8) { - auto v1 = s0.mulHi(g1); - auto v0 = s0.mulHi(g0); + auto v1 = mulhi(s0, g1); + auto v0 = mulhi(s0, g0); // D[n..n+7] += S[n..n+7] * G[1] *d0 += v1; //D[n..n+8] += {0, S[n..n+7] * G[0]} - *d0 += Sk8h{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]}; - *d8 += Sk8h{v0[7], _____, _____, _____, _____, _____, _____, _____}; + *d0 += fp88{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]}; + *d8 += fp88{v0[7], _____, _____, _____, _____, _____, _____, _____}; // D[n..n+9] += {0, 0, S[n..n+7] * G[1]} - *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}; - *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____}; + *d0 += fp88{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}; + *d8 += fp88{v1[6], v1[7], _____, _____, _____, _____, _____, _____}; } static void blur_x_radius_2( - const Sk8h& s0, - const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&, - Sk8h* d0, Sk8h* d8) { - auto v0 = s0.mulHi(g0); - auto v1 = s0.mulHi(g1); - auto v2 = s0.mulHi(g2); + const fp88& s0, + const fp88& g0, const fp88& g1, const fp88& g2, const fp88&, const fp88&, + fp88* d0, fp88* d8) { + auto v0 = mulhi(s0, g0); + auto v1 = mulhi(s0, g1); + auto v2 = mulhi(s0, g2); // D[n..n+7] += S[n..n+7] * G[2] *d0 += v2; // D[n..n+8] += {0, S[n..n+7] * G[1]} - *d0 += Sk8h{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]}; - *d8 += Sk8h{v1[7], _____, _____, _____, _____, _____, _____, _____}; + *d0 += fp88{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]}; + *d8 += fp88{v1[7], _____, _____, _____, _____, _____, _____, _____}; // D[n..n+9] += {0, 0, S[n..n+7] * G[0]} - *d0 += Sk8h{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]}; - *d8 += Sk8h{v0[6], v0[7], _____, _____, _____, _____, _____, _____}; + *d0 += fp88{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]}; + *d8 += fp88{v0[6], v0[7], _____, _____, _____, _____, _____, _____}; // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]} - *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]}; - *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____}; + *d0 += fp88{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]}; + *d8 += fp88{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____}; // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[2]} - *d0 += Sk8h{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]}; - *d8 += Sk8h{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____}; + *d0 += fp88{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]}; + *d8 += fp88{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____}; } static void blur_x_radius_3( - const Sk8h& s0, - const Sk8h& gauss0, const Sk8h& gauss1, const Sk8h& gauss2, const Sk8h& gauss3, const Sk8h&, - Sk8h* d0, Sk8h* d8) { - auto v0 = s0.mulHi(gauss0); - auto v1 = s0.mulHi(gauss1); - auto v2 = s0.mulHi(gauss2); - auto v3 = s0.mulHi(gauss3); + const fp88& s0, + const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88&, + fp88* d0, fp88* d8) { + auto v0 = mulhi(s0, g0); + auto v1 = mulhi(s0, g1); + auto v2 = mulhi(s0, g2); + auto v3 = mulhi(s0, g3); // D[n..n+7] += S[n..n+7] * G[3] *d0 += v3; // D[n..n+8] += {0, S[n..n+7] * G[2]} - *d0 += Sk8h{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]}; - *d8 += Sk8h{v2[7], _____, _____, _____, _____, _____, _____, _____}; + *d0 += fp88{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]}; + *d8 += fp88{v2[7], _____, _____, _____, _____, _____, _____, _____}; // D[n..n+9] += {0, 0, S[n..n+7] * G[1]} - *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}; - *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____}; + *d0 += fp88{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}; + *d8 += fp88{v1[6], v1[7], _____, _____, _____, _____, _____, _____}; // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[0]} - *d0 += Sk8h{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]}; - *d8 += Sk8h{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____}; + *d0 += fp88{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]}; + *d8 += fp88{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____}; // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[1]} - *d0 += Sk8h{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]}; - *d8 += Sk8h{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____}; + *d0 += fp88{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]}; + *d8 += fp88{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____}; // D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[2]} - *d0 += Sk8h{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]}; - *d8 += Sk8h{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____}; + *d0 += fp88{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]}; + *d8 += fp88{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____}; // D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]} - *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v3[0], v3[1]}; - *d8 += Sk8h{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____}; + *d0 += fp88{_____, _____, _____, _____, _____, _____, v3[0], v3[1]}; + *d8 += fp88{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____}; } static void blur_x_radius_4( - const Sk8h& s0, - const Sk8h& gauss0, - const Sk8h& gauss1, - const Sk8h& gauss2, - const Sk8h& gauss3, - const Sk8h& gauss4, - Sk8h* d0, Sk8h* d8) { - auto v0 = s0.mulHi(gauss0); - auto v1 = s0.mulHi(gauss1); - auto v2 = s0.mulHi(gauss2); - auto v3 = s0.mulHi(gauss3); - auto v4 = s0.mulHi(gauss4); + const fp88& s0, + const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4, + fp88* d0, fp88* d8) { + auto v0 = mulhi(s0, g0); + auto v1 = mulhi(s0, g1); + auto v2 = mulhi(s0, g2); + auto v3 = mulhi(s0, g3); + auto v4 = mulhi(s0, g4); // D[n..n+7] += S[n..n+7] * G[4] *d0 += v4; // D[n..n+8] += {0, S[n..n+7] * G[3]} - *d0 += Sk8h{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]}; - *d8 += Sk8h{v3[7], _____, _____, _____, _____, _____, _____, _____}; + *d0 += fp88{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]}; + *d8 += fp88{v3[7], _____, _____, _____, _____, _____, _____, _____}; // D[n..n+9] += {0, 0, S[n..n+7] * G[2]} - *d0 += Sk8h{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]}; - *d8 += Sk8h{v2[6], v2[7], _____, _____, _____, _____, _____, _____}; + *d0 += fp88{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]}; + *d8 += fp88{v2[6], v2[7], _____, _____, _____, _____, _____, _____}; // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]} - *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]}; - *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____}; + *d0 += fp88{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]}; + *d8 += fp88{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____}; // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[0]} - *d0 += Sk8h{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]}; - *d8 += Sk8h{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____}; + *d0 += fp88{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]}; + *d8 += fp88{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____}; // D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[1]} - *d0 += Sk8h{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]}; - *d8 += Sk8h{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____}; + *d0 += fp88{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]}; + *d8 += fp88{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____}; // D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[2]} - *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v2[0], v2[1]}; - *d8 += Sk8h{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____}; + *d0 += fp88{_____, _____, _____, _____, _____, _____, v2[0], v2[1]}; + *d8 += fp88{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____}; // D[n..n+14] += {0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]} - *d0 += Sk8h{_____, _____, _____, _____, _____, _____, _____, v3[0]}; - *d8 += Sk8h{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____}; + *d0 += fp88{_____, _____, _____, _____, _____, _____, _____, v3[0]}; + *d8 += fp88{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____}; // D[n..n+15] += {0, 0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[4]} *d8 += v4; @@ -549,11 +553,11 @@ using BlurX = decltype(blur_x_radius_1); // BlurX will only be one of the functions blur_x_radius_(1|2|3|4). static void blur_row( BlurX blur, - const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4, + const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4, const uint8_t* src, int srcW, uint8_t* dst, int dstW) { // Clear the buffer to handle summing wider than source. - Sk8h d0{kHalf}, d8{kHalf}; + fp88 d0(kHalf), d8(kHalf); // Go by multiples of 8 in src. int x = 0; @@ -563,7 +567,7 @@ static void blur_row( store(dst, d0, 8); d0 = d8; - d8 = Sk8h{kHalf}; + d8 = fp88(kHalf); src += 8; dst += 8; @@ -596,11 +600,11 @@ static void blur_x_rect(BlurX blur, const uint8_t* src, size_t srcStride, int srcW, uint8_t* dst, size_t dstStride, int dstW, int dstH) { - Sk8h g0{gauss[0]}, - g1{gauss[1]}, - g2{gauss[2]}, - g3{gauss[3]}, - g4{gauss[4]}; + fp88 g0(gauss[0]), + g1(gauss[1]), + g2(gauss[2]), + g3(gauss[3]), + g4(gauss[4]); // Blur *ALL* the rows. for (int y = 0; y < dstH; y++) { @@ -686,29 +690,29 @@ static void direct_blur_x(int radius, uint16_t* gauss, // d01[0..7] = d12[0..7] + S[n+0r..n+0r+7]*G[0] // d12[0..7] = S[n+0r..n+0r+7]*G[1] // return answer[0..7] -static Sk8h blur_y_radius_1( - const Sk8h& s0, - const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&, - Sk8h* d01, Sk8h* d12, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*) { - auto v0 = s0.mulHi(g0); - auto v1 = s0.mulHi(g1); +static fp88 blur_y_radius_1( + const fp88& s0, + const fp88& g0, const fp88& g1, const fp88&, const fp88&, const fp88&, + fp88* d01, fp88* d12, fp88*, fp88*, fp88*, fp88*, fp88*, fp88*) { + auto v0 = mulhi(s0, g0); + auto v1 = mulhi(s0, g1); - Sk8h answer = *d01 + v1; + fp88 answer = *d01 + v1; *d01 = *d12 + v0; *d12 = v1 + kHalf; return answer; } -static Sk8h blur_y_radius_2( - const Sk8h& s0, - const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&, - Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h*, Sk8h*, Sk8h*, Sk8h*) { - auto v0 = s0.mulHi(g0); - auto v1 = s0.mulHi(g1); - auto v2 = s0.mulHi(g2); +static fp88 blur_y_radius_2( + const fp88& s0, + const fp88& g0, const fp88& g1, const fp88& g2, const fp88&, const fp88&, + fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88*, fp88*, fp88*, fp88*) { + auto v0 = mulhi(s0, g0); + auto v1 = mulhi(s0, g1); + auto v2 = mulhi(s0, g2); - Sk8h answer = *d01 + v2; + fp88 answer = *d01 + v2; *d01 = *d12 + v1; *d12 = *d23 + v0; *d23 = *d34 + v1; @@ -717,16 +721,16 @@ static Sk8h blur_y_radius_2( return answer; } -static Sk8h blur_y_radius_3( - const Sk8h& s0, - const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h&, - Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h*, Sk8h*) { - auto v0 = s0.mulHi(g0); - auto v1 = s0.mulHi(g1); - auto v2 = s0.mulHi(g2); - auto v3 = s0.mulHi(g3); +static fp88 blur_y_radius_3( + const fp88& s0, + const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88&, + fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88* d45, fp88* d56, fp88*, fp88*) { + auto v0 = mulhi(s0, g0); + auto v1 = mulhi(s0, g1); + auto v2 = mulhi(s0, g2); + auto v3 = mulhi(s0, g3); - Sk8h answer = *d01 + v3; + fp88 answer = *d01 + v3; *d01 = *d12 + v2; *d12 = *d23 + v1; *d23 = *d34 + v0; @@ -737,17 +741,17 @@ static Sk8h blur_y_radius_3( return answer; } -static Sk8h blur_y_radius_4( - const Sk8h& s0, - const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4, - Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h* d67, Sk8h* d78) { - auto v0 = s0.mulHi(g0); - auto v1 = s0.mulHi(g1); - auto v2 = s0.mulHi(g2); - auto v3 = s0.mulHi(g3); - auto v4 = s0.mulHi(g4); +static fp88 blur_y_radius_4( + const fp88& s0, + const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4, + fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88* d45, fp88* d56, fp88* d67, fp88* d78) { + auto v0 = mulhi(s0, g0); + auto v1 = mulhi(s0, g1); + auto v2 = mulhi(s0, g2); + auto v3 = mulhi(s0, g3); + auto v4 = mulhi(s0, g4); - Sk8h answer = *d01 + v4; + fp88 answer = *d01 + v4; *d01 = *d12 + v3; *d12 = *d23 + v2; *d23 = *d34 + v1; @@ -766,13 +770,13 @@ using BlurY = decltype(blur_y_radius_1); static void blur_column( ToA8 toA8, BlurY blur, int radius, int width, - const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4, + const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4, const uint8_t* src, size_t srcRB, int srcH, uint8_t* dst, size_t dstRB) { - Sk8h d01{kHalf}, d12{kHalf}, d23{kHalf}, d34{kHalf}, - d45{kHalf}, d56{kHalf}, d67{kHalf}, d78{kHalf}; + fp88 d01(kHalf), d12(kHalf), d23(kHalf), d34(kHalf), + d45(kHalf), d56(kHalf), d67(kHalf), d78(kHalf); - auto flush = [&](uint8_t* to, const Sk8h& v0, const Sk8h& v1) { + auto flush = [&](uint8_t* to, const fp88& v0, const fp88& v1) { store(to, v0, width); to += dstRB; store(to, v1, width); @@ -809,11 +813,11 @@ static void blur_y_rect(ToA8 toA8, const int strideOf8, const uint8_t *src, size_t srcRB, int srcW, int srcH, uint8_t *dst, size_t dstRB) { - Sk8h g0{gauss[0]}, - g1{gauss[1]}, - g2{gauss[2]}, - g3{gauss[3]}, - g4{gauss[4]}; + fp88 g0(gauss[0]), + g1(gauss[1]), + g2(gauss[2]), + g3(gauss[3]), + g4(gauss[4]); int x = 0; for (; x <= srcW - 8; x += 8) { diff --git a/src/effects/imagefilters/BUILD.bazel b/src/effects/imagefilters/BUILD.bazel index 78c67e7d21..8607ea1f95 100644 --- a/src/effects/imagefilters/BUILD.bazel +++ b/src/effects/imagefilters/BUILD.bazel @@ -38,7 +38,7 @@ generated_cc_atom( "//include/core:SkCanvas_hdr", "//include/effects:SkImageFilters_hdr", "//include/gpu:GrRecordingContext_hdr", - "//include/private:SkNx_hdr", + "//include/private:SkVx_hdr", "//src/core:SkImageFilter_Base_hdr", "//src/core:SkReadBuffer_hdr", "//src/core:SkRuntimeEffectPriv_hdr", diff --git a/src/effects/imagefilters/SkArithmeticImageFilter.cpp b/src/effects/imagefilters/SkArithmeticImageFilter.cpp index d718a0677d..88c890e450 100644 --- a/src/effects/imagefilters/SkArithmeticImageFilter.cpp +++ b/src/effects/imagefilters/SkArithmeticImageFilter.cpp @@ -8,7 +8,7 @@ #include "include/core/SkBitmap.h" #include "include/core/SkCanvas.h" #include "include/effects/SkImageFilters.h" -#include "include/private/SkNx.h" +#include "include/private/SkVx.h" #include "src/core/SkImageFilter_Base.h" #include "src/core/SkReadBuffer.h" #include "src/core/SkSpecialImage.h" @@ -130,42 +130,38 @@ void SkArithmeticImageFilter::flatten(SkWriteBuffer& buffer) const { /////////////////////////////////////////////////////////////////////////////////////////////////// -static Sk4f pin(float min, const Sk4f& val, float max) { - return Sk4f::Max(min, Sk4f::Min(val, max)); -} - template void arith_span(const SkV4& k, SkPMColor dst[], const SkPMColor src[], int count) { - const Sk4f k1 = k[0] * (1/255.0f), - k2 = k[1], - k3 = k[2], - k4 = k[3] * 255.0f + 0.5f; + const skvx::float4 k1 = k[0] * (1/255.0f), + k2 = k[1], + k3 = k[2], + k4 = k[3] * 255.0f + 0.5f; for (int i = 0; i < count; i++) { - Sk4f s = SkNx_cast(Sk4b::Load(src+i)), - d = SkNx_cast(Sk4b::Load(dst+i)), - r = pin(0, k1*s*d + k2*s + k3*d + k4, 255); + skvx::float4 s = skvx::cast(skvx::byte4::Load(src+i)), + d = skvx::cast(skvx::byte4::Load(dst+i)), + r = pin(k1*s*d + k2*s + k3*d + k4, skvx::float4(0.f), skvx::float4(255.f)); if (EnforcePMColor) { - Sk4f a = SkNx_shuffle<3,3,3,3>(r); - r = Sk4f::Min(a, r); + auto a = skvx::shuffle<3,3,3,3>(r); + r = min(a, r); } - SkNx_cast(r).store(dst+i); + skvx::cast(r).store(dst+i); } } // apply mode to src==transparent (0) template void arith_transparent(const SkV4& k, SkPMColor dst[], int count) { - const Sk4f k3 = k[2], - k4 = k[3] * 255.0f + 0.5f; + const skvx::float4 k3 = k[2], + k4 = k[3] * 255.0f + 0.5f; for (int i = 0; i < count; i++) { - Sk4f d = SkNx_cast(Sk4b::Load(dst+i)), - r = pin(0, k3*d + k4, 255); + skvx::float4 d = skvx::cast(skvx::byte4::Load(dst+i)), + r = pin(k3*d + k4, skvx::float4(0.f), skvx::float4(255.f)); if (EnforcePMColor) { - Sk4f a = SkNx_shuffle<3,3,3,3>(r); - r = Sk4f::Min(a, r); + auto a = skvx::shuffle<3,3,3,3>(r); + r = min(a, r); } - SkNx_cast(r).store(dst+i); + skvx::cast(r).store(dst+i); } }