Update filters to use skvx instead of SkNx

Change-Id: I1a5490f546a3cb046c64b114a30be991d2d9f2cc
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/541064
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Michael Ludwig <michaelludwig@google.com>
This commit is contained in:
Michael Ludwig 2022-05-20 17:02:33 -04:00 committed by SkCQ
parent f2d000328f
commit 8e870728db
5 changed files with 177 additions and 153 deletions

View File

@ -798,7 +798,7 @@ private:
SI Vec<8,uint16_t> mull(const Vec<8,uint8_t>& x,
const Vec<8,uint8_t>& y) {
return to_vec<8,uint16_t>(vmull_u8(to_vext(x),
to_vext(y)));
to_vext(y)));
}
SIN std::enable_if_t<(N < 8), Vec<N,uint16_t>> mull(const Vec<N,uint8_t>& x,
@ -815,13 +815,37 @@ SIN std::enable_if_t<(N > 8), Vec<N,uint16_t>> mull(const Vec<N,uint8_t>& x,
mull(x.hi, y.hi));
}
// Widening multiply of four u16 lanes into four u32 products, done with a
// single vmull_u16 instruction.
SI Vec<4,uint32_t> mull(const Vec<4,uint16_t>& x,
                        const Vec<4,uint16_t>& y) {
    const auto widened = vmull_u16(to_vext(x), to_vext(y));
    return to_vec<4,uint32_t>(widened);
}
SIN std::enable_if_t<(N < 4), Vec<N,uint32_t>> mull(const Vec<N,uint16_t>& x,
                                                    const Vec<N,uint16_t>& y) {
    // Narrower than the 4-lane base case: duplicate each operand so the lane
    // count doubles, multiply at the wider width, and keep only the low half
    // of the result (the part the caller actually asked for).
    const auto doubledX = join(x, x);
    const auto doubledY = join(y, y);
    return mull(doubledX, doubledY).lo;
}
SIN std::enable_if_t<(N > 4), Vec<N,uint32_t>> mull(const Vec<N,uint16_t>& x,
                                                    const Vec<N,uint16_t>& y) {
    // Wider than the 4-lane base case: multiply the low and high halves
    // separately (recursing until N == 4) and stitch the products back together.
    const auto lowProducts  = mull(x.lo, y.lo);
    const auto highProducts = mull(x.hi, y.hi);
    return join(lowProducts, highProducts);
}
#else
// Nothing special when we don't have NEON... just cast up to 16-bit and multiply.
// Nothing special when we don't have NEON... just cast up and multiply.
SIN Vec<N,uint16_t> mull(const Vec<N,uint8_t>& x,
const Vec<N,uint8_t>& y) {
return cast<uint16_t>(x)
* cast<uint16_t>(y);
const Vec<N,uint8_t>& y) {
return cast<uint16_t>(x) * cast<uint16_t>(y);
}
// Portable u16 * u16 -> u32 widening multiply: widen both operands to 32-bit
// lanes first, then multiply lane-wise.
SIN Vec<N,uint32_t> mull(const Vec<N,uint16_t>& x,
                         const Vec<N,uint16_t>& y) {
    const auto wideX = cast<uint32_t>(x);
    const auto wideY = cast<uint32_t>(y);
    return wideX * wideY;
}
#endif

View File

@ -2647,10 +2647,10 @@ generated_cc_atom(
":SkMaskBlurFilter_hdr",
"//include/core:SkColorPriv_hdr",
"//include/private:SkMalloc_hdr",
"//include/private:SkNx_hdr",
"//include/private:SkTPin_hdr",
"//include/private:SkTemplates_hdr",
"//include/private:SkTo_hdr",
"//include/private:SkVx_hdr",
],
)

View File

@ -9,10 +9,10 @@
#include "include/core/SkColorPriv.h"
#include "include/private/SkMalloc.h"
#include "include/private/SkNx.h"
#include "include/private/SkTPin.h"
#include "include/private/SkTemplates.h"
#include "include/private/SkTo.h"
#include "include/private/SkVx.h"
#include "src/core/SkArenaAlloc.h"
#include "src/core/SkGaussFilter.h"
@ -294,7 +294,15 @@ static void argb32_to_a8(uint8_t* a8, const uint8_t* from, int width) {
}
using ToA8 = decltype(bw_to_a8);
static Sk8h load(const uint8_t* from, int width, ToA8* toA8) {
using fp88 = skvx::Vec<8, uint16_t>; // 8-wide fixed point 8.8
// Lane-wise multiply that keeps only the upper 16 bits of each widened product.
static fp88 mulhi(const fp88& a, const fp88& b) {
    // On NEON, this is optimal; with SSE, clang appears to detect the pattern and convert it to the
    // optimal single instruction, _mm_mulhi_epu16.
    const auto fullProduct = mull(a, b);
    const auto upperHalf   = fullProduct >> 16;
    return skvx::cast<uint16_t>(upperHalf);
}
static fp88 load(const uint8_t* from, int width, ToA8* toA8) {
// Our fast path is a full 8-byte load of A8.
// So we'll conditionally handle the two slow paths using tmp:
// - if we have a function to convert another mask to A8, use it;
@ -311,11 +319,11 @@ static Sk8h load(const uint8_t* from, int width, ToA8* toA8) {
}
// Load A8 and convert to 8.8 fixed-point.
return SkNx_cast<uint16_t>(Sk8b::Load(from)) << 8;
return skvx::cast<uint16_t>(skvx::byte8::Load(from)) << 8;
}
static void store(uint8_t* to, const Sk8h& v, int width) {
Sk8b b = SkNx_cast<uint8_t>(v >> 8);
static void store(uint8_t* to, const fp88& v, int width) {
skvx::byte8 b = skvx::cast<uint8_t>(v >> 8);
if (width == 8) {
b.store(to);
} else {
@ -410,135 +418,131 @@ static constexpr uint16_t kHalf = 0x80u;
// Where we rely on the compiler to generate efficient code for the {____, n, ....} notation.
static void blur_x_radius_1(
const Sk8h& s0,
const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&,
Sk8h* d0, Sk8h* d8) {
const fp88& s0,
const fp88& g0, const fp88& g1, const fp88&, const fp88&, const fp88&,
fp88* d0, fp88* d8) {
auto v1 = s0.mulHi(g1);
auto v0 = s0.mulHi(g0);
auto v1 = mulhi(s0, g1);
auto v0 = mulhi(s0, g0);
// D[n..n+7] += S[n..n+7] * G[1]
*d0 += v1;
// D[n..n+8] += {0, S[n..n+7] * G[0]}
*d0 += Sk8h{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]};
*d8 += Sk8h{v0[7], _____, _____, _____, _____, _____, _____, _____};
*d0 += fp88{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]};
*d8 += fp88{v0[7], _____, _____, _____, _____, _____, _____, _____};
// D[n..n+9] += {0, 0, S[n..n+7] * G[1]}
*d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
*d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
*d0 += fp88{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
*d8 += fp88{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
}
static void blur_x_radius_2(
const Sk8h& s0,
const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
Sk8h* d0, Sk8h* d8) {
auto v0 = s0.mulHi(g0);
auto v1 = s0.mulHi(g1);
auto v2 = s0.mulHi(g2);
const fp88& s0,
const fp88& g0, const fp88& g1, const fp88& g2, const fp88&, const fp88&,
fp88* d0, fp88* d8) {
auto v0 = mulhi(s0, g0);
auto v1 = mulhi(s0, g1);
auto v2 = mulhi(s0, g2);
// D[n..n+7] += S[n..n+7] * G[2]
*d0 += v2;
// D[n..n+8] += {0, S[n..n+7] * G[1]}
*d0 += Sk8h{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]};
*d8 += Sk8h{v1[7], _____, _____, _____, _____, _____, _____, _____};
*d0 += fp88{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]};
*d8 += fp88{v1[7], _____, _____, _____, _____, _____, _____, _____};
// D[n..n+9] += {0, 0, S[n..n+7] * G[0]}
*d0 += Sk8h{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]};
*d8 += Sk8h{v0[6], v0[7], _____, _____, _____, _____, _____, _____};
*d0 += fp88{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]};
*d8 += fp88{v0[6], v0[7], _____, _____, _____, _____, _____, _____};
// D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]}
*d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
*d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
*d0 += fp88{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
*d8 += fp88{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
// D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[2]}
*d0 += Sk8h{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]};
*d8 += Sk8h{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____};
*d0 += fp88{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]};
*d8 += fp88{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____};
}
static void blur_x_radius_3(
const Sk8h& s0,
const Sk8h& gauss0, const Sk8h& gauss1, const Sk8h& gauss2, const Sk8h& gauss3, const Sk8h&,
Sk8h* d0, Sk8h* d8) {
auto v0 = s0.mulHi(gauss0);
auto v1 = s0.mulHi(gauss1);
auto v2 = s0.mulHi(gauss2);
auto v3 = s0.mulHi(gauss3);
const fp88& s0,
const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88&,
fp88* d0, fp88* d8) {
auto v0 = mulhi(s0, g0);
auto v1 = mulhi(s0, g1);
auto v2 = mulhi(s0, g2);
auto v3 = mulhi(s0, g3);
// D[n..n+7] += S[n..n+7] * G[3]
*d0 += v3;
// D[n..n+8] += {0, S[n..n+7] * G[2]}
*d0 += Sk8h{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]};
*d8 += Sk8h{v2[7], _____, _____, _____, _____, _____, _____, _____};
*d0 += fp88{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]};
*d8 += fp88{v2[7], _____, _____, _____, _____, _____, _____, _____};
// D[n..n+9] += {0, 0, S[n..n+7] * G[1]}
*d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
*d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
*d0 += fp88{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
*d8 += fp88{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
// D[n..n+10] += {0, 0, 0, S[n..n+7] * G[0]}
*d0 += Sk8h{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]};
*d8 += Sk8h{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____};
*d0 += fp88{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]};
*d8 += fp88{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____};
// D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[1]}
*d0 += Sk8h{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]};
*d8 += Sk8h{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____};
*d0 += fp88{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]};
*d8 += fp88{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____};
// D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[2]}
*d0 += Sk8h{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]};
*d8 += Sk8h{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____};
*d0 += fp88{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]};
*d8 += fp88{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____};
// D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
*d0 += Sk8h{_____, _____, _____, _____, _____, _____, v3[0], v3[1]};
*d8 += Sk8h{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____};
*d0 += fp88{_____, _____, _____, _____, _____, _____, v3[0], v3[1]};
*d8 += fp88{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____};
}
static void blur_x_radius_4(
const Sk8h& s0,
const Sk8h& gauss0,
const Sk8h& gauss1,
const Sk8h& gauss2,
const Sk8h& gauss3,
const Sk8h& gauss4,
Sk8h* d0, Sk8h* d8) {
auto v0 = s0.mulHi(gauss0);
auto v1 = s0.mulHi(gauss1);
auto v2 = s0.mulHi(gauss2);
auto v3 = s0.mulHi(gauss3);
auto v4 = s0.mulHi(gauss4);
const fp88& s0,
const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4,
fp88* d0, fp88* d8) {
auto v0 = mulhi(s0, g0);
auto v1 = mulhi(s0, g1);
auto v2 = mulhi(s0, g2);
auto v3 = mulhi(s0, g3);
auto v4 = mulhi(s0, g4);
// D[n..n+7] += S[n..n+7] * G[4]
*d0 += v4;
// D[n..n+8] += {0, S[n..n+7] * G[3]}
*d0 += Sk8h{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]};
*d8 += Sk8h{v3[7], _____, _____, _____, _____, _____, _____, _____};
*d0 += fp88{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]};
*d8 += fp88{v3[7], _____, _____, _____, _____, _____, _____, _____};
// D[n..n+9] += {0, 0, S[n..n+7] * G[2]}
*d0 += Sk8h{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]};
*d8 += Sk8h{v2[6], v2[7], _____, _____, _____, _____, _____, _____};
*d0 += fp88{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]};
*d8 += fp88{v2[6], v2[7], _____, _____, _____, _____, _____, _____};
// D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]}
*d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
*d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
*d0 += fp88{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
*d8 += fp88{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
// D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[0]}
*d0 += Sk8h{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]};
*d8 += Sk8h{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____};
*d0 += fp88{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]};
*d8 += fp88{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____};
// D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[1]}
*d0 += Sk8h{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]};
*d8 += Sk8h{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____};
*d0 += fp88{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]};
*d8 += fp88{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____};
// D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[2]}
*d0 += Sk8h{_____, _____, _____, _____, _____, _____, v2[0], v2[1]};
*d8 += Sk8h{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____};
*d0 += fp88{_____, _____, _____, _____, _____, _____, v2[0], v2[1]};
*d8 += fp88{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____};
// D[n..n+14] += {0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
*d0 += Sk8h{_____, _____, _____, _____, _____, _____, _____, v3[0]};
*d8 += Sk8h{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____};
*d0 += fp88{_____, _____, _____, _____, _____, _____, _____, v3[0]};
*d8 += fp88{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____};
// D[n..n+15] += {0, 0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[4]}
*d8 += v4;
@ -549,11 +553,11 @@ using BlurX = decltype(blur_x_radius_1);
// BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
static void blur_row(
BlurX blur,
const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4,
const uint8_t* src, int srcW,
uint8_t* dst, int dstW) {
// Clear the buffer to handle summing wider than source.
Sk8h d0{kHalf}, d8{kHalf};
fp88 d0(kHalf), d8(kHalf);
// Go by multiples of 8 in src.
int x = 0;
@ -563,7 +567,7 @@ static void blur_row(
store(dst, d0, 8);
d0 = d8;
d8 = Sk8h{kHalf};
d8 = fp88(kHalf);
src += 8;
dst += 8;
@ -596,11 +600,11 @@ static void blur_x_rect(BlurX blur,
const uint8_t* src, size_t srcStride, int srcW,
uint8_t* dst, size_t dstStride, int dstW, int dstH) {
Sk8h g0{gauss[0]},
g1{gauss[1]},
g2{gauss[2]},
g3{gauss[3]},
g4{gauss[4]};
fp88 g0(gauss[0]),
g1(gauss[1]),
g2(gauss[2]),
g3(gauss[3]),
g4(gauss[4]);
// Blur *ALL* the rows.
for (int y = 0; y < dstH; y++) {
@ -686,29 +690,29 @@ static void direct_blur_x(int radius, uint16_t* gauss,
// d01[0..7] = d12[0..7] + S[n+0r..n+0r+7]*G[0]
// d12[0..7] = S[n+0r..n+0r+7]*G[1]
// return answer[0..7]
static Sk8h blur_y_radius_1(
const Sk8h& s0,
const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&,
Sk8h* d01, Sk8h* d12, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*) {
auto v0 = s0.mulHi(g0);
auto v1 = s0.mulHi(g1);
static fp88 blur_y_radius_1(
const fp88& s0,
const fp88& g0, const fp88& g1, const fp88&, const fp88&, const fp88&,
fp88* d01, fp88* d12, fp88*, fp88*, fp88*, fp88*, fp88*, fp88*) {
auto v0 = mulhi(s0, g0);
auto v1 = mulhi(s0, g1);
Sk8h answer = *d01 + v1;
fp88 answer = *d01 + v1;
*d01 = *d12 + v0;
*d12 = v1 + kHalf;
return answer;
}
static Sk8h blur_y_radius_2(
const Sk8h& s0,
const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h*, Sk8h*, Sk8h*, Sk8h*) {
auto v0 = s0.mulHi(g0);
auto v1 = s0.mulHi(g1);
auto v2 = s0.mulHi(g2);
static fp88 blur_y_radius_2(
const fp88& s0,
const fp88& g0, const fp88& g1, const fp88& g2, const fp88&, const fp88&,
fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88*, fp88*, fp88*, fp88*) {
auto v0 = mulhi(s0, g0);
auto v1 = mulhi(s0, g1);
auto v2 = mulhi(s0, g2);
Sk8h answer = *d01 + v2;
fp88 answer = *d01 + v2;
*d01 = *d12 + v1;
*d12 = *d23 + v0;
*d23 = *d34 + v1;
@ -717,16 +721,16 @@ static Sk8h blur_y_radius_2(
return answer;
}
static Sk8h blur_y_radius_3(
const Sk8h& s0,
const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h&,
Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h*, Sk8h*) {
auto v0 = s0.mulHi(g0);
auto v1 = s0.mulHi(g1);
auto v2 = s0.mulHi(g2);
auto v3 = s0.mulHi(g3);
static fp88 blur_y_radius_3(
const fp88& s0,
const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88&,
fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88* d45, fp88* d56, fp88*, fp88*) {
auto v0 = mulhi(s0, g0);
auto v1 = mulhi(s0, g1);
auto v2 = mulhi(s0, g2);
auto v3 = mulhi(s0, g3);
Sk8h answer = *d01 + v3;
fp88 answer = *d01 + v3;
*d01 = *d12 + v2;
*d12 = *d23 + v1;
*d23 = *d34 + v0;
@ -737,17 +741,17 @@ static Sk8h blur_y_radius_3(
return answer;
}
static Sk8h blur_y_radius_4(
const Sk8h& s0,
const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h* d67, Sk8h* d78) {
auto v0 = s0.mulHi(g0);
auto v1 = s0.mulHi(g1);
auto v2 = s0.mulHi(g2);
auto v3 = s0.mulHi(g3);
auto v4 = s0.mulHi(g4);
static fp88 blur_y_radius_4(
const fp88& s0,
const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4,
fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88* d45, fp88* d56, fp88* d67, fp88* d78) {
auto v0 = mulhi(s0, g0);
auto v1 = mulhi(s0, g1);
auto v2 = mulhi(s0, g2);
auto v3 = mulhi(s0, g3);
auto v4 = mulhi(s0, g4);
Sk8h answer = *d01 + v4;
fp88 answer = *d01 + v4;
*d01 = *d12 + v3;
*d12 = *d23 + v2;
*d23 = *d34 + v1;
@ -766,13 +770,13 @@ using BlurY = decltype(blur_y_radius_1);
static void blur_column(
ToA8 toA8,
BlurY blur, int radius, int width,
const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4,
const uint8_t* src, size_t srcRB, int srcH,
uint8_t* dst, size_t dstRB) {
Sk8h d01{kHalf}, d12{kHalf}, d23{kHalf}, d34{kHalf},
d45{kHalf}, d56{kHalf}, d67{kHalf}, d78{kHalf};
fp88 d01(kHalf), d12(kHalf), d23(kHalf), d34(kHalf),
d45(kHalf), d56(kHalf), d67(kHalf), d78(kHalf);
auto flush = [&](uint8_t* to, const Sk8h& v0, const Sk8h& v1) {
auto flush = [&](uint8_t* to, const fp88& v0, const fp88& v1) {
store(to, v0, width);
to += dstRB;
store(to, v1, width);
@ -809,11 +813,11 @@ static void blur_y_rect(ToA8 toA8, const int strideOf8,
const uint8_t *src, size_t srcRB, int srcW, int srcH,
uint8_t *dst, size_t dstRB) {
Sk8h g0{gauss[0]},
g1{gauss[1]},
g2{gauss[2]},
g3{gauss[3]},
g4{gauss[4]};
fp88 g0(gauss[0]),
g1(gauss[1]),
g2(gauss[2]),
g3(gauss[3]),
g4(gauss[4]);
int x = 0;
for (; x <= srcW - 8; x += 8) {

View File

@ -38,7 +38,7 @@ generated_cc_atom(
"//include/core:SkCanvas_hdr",
"//include/effects:SkImageFilters_hdr",
"//include/gpu:GrRecordingContext_hdr",
"//include/private:SkNx_hdr",
"//include/private:SkVx_hdr",
"//src/core:SkImageFilter_Base_hdr",
"//src/core:SkReadBuffer_hdr",
"//src/core:SkRuntimeEffectPriv_hdr",

View File

@ -8,7 +8,7 @@
#include "include/core/SkBitmap.h"
#include "include/core/SkCanvas.h"
#include "include/effects/SkImageFilters.h"
#include "include/private/SkNx.h"
#include "include/private/SkVx.h"
#include "src/core/SkImageFilter_Base.h"
#include "src/core/SkReadBuffer.h"
#include "src/core/SkSpecialImage.h"
@ -130,42 +130,38 @@ void SkArithmeticImageFilter::flatten(SkWriteBuffer& buffer) const {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Clamps every lane of `val` into [min, max]: apply the upper bound first,
// then the lower bound.
static Sk4f pin(float min, const Sk4f& val, float max) {
    const Sk4f capped = Sk4f::Min(val, max);
    return Sk4f::Max(min, capped);
}
template <bool EnforcePMColor>
void arith_span(const SkV4& k, SkPMColor dst[], const SkPMColor src[], int count) {
const Sk4f k1 = k[0] * (1/255.0f),
k2 = k[1],
k3 = k[2],
k4 = k[3] * 255.0f + 0.5f;
const skvx::float4 k1 = k[0] * (1/255.0f),
k2 = k[1],
k3 = k[2],
k4 = k[3] * 255.0f + 0.5f;
for (int i = 0; i < count; i++) {
Sk4f s = SkNx_cast<float>(Sk4b::Load(src+i)),
d = SkNx_cast<float>(Sk4b::Load(dst+i)),
r = pin(0, k1*s*d + k2*s + k3*d + k4, 255);
skvx::float4 s = skvx::cast<float>(skvx::byte4::Load(src+i)),
d = skvx::cast<float>(skvx::byte4::Load(dst+i)),
r = pin(k1*s*d + k2*s + k3*d + k4, skvx::float4(0.f), skvx::float4(255.f));
if (EnforcePMColor) {
Sk4f a = SkNx_shuffle<3,3,3,3>(r);
r = Sk4f::Min(a, r);
auto a = skvx::shuffle<3,3,3,3>(r);
r = min(a, r);
}
SkNx_cast<uint8_t>(r).store(dst+i);
skvx::cast<uint8_t>(r).store(dst+i);
}
}
// apply mode to src==transparent (0)
template<bool EnforcePMColor> void arith_transparent(const SkV4& k, SkPMColor dst[], int count) {
const Sk4f k3 = k[2],
k4 = k[3] * 255.0f + 0.5f;
const skvx::float4 k3 = k[2],
k4 = k[3] * 255.0f + 0.5f;
for (int i = 0; i < count; i++) {
Sk4f d = SkNx_cast<float>(Sk4b::Load(dst+i)),
r = pin(0, k3*d + k4, 255);
skvx::float4 d = skvx::cast<float>(skvx::byte4::Load(dst+i)),
r = pin(k3*d + k4, skvx::float4(0.f), skvx::float4(255.f));
if (EnforcePMColor) {
Sk4f a = SkNx_shuffle<3,3,3,3>(r);
r = Sk4f::Min(a, r);
auto a = skvx::shuffle<3,3,3,3>(r);
r = min(a, r);
}
SkNx_cast<uint8_t>(r).store(dst+i);
skvx::cast<uint8_t>(r).store(dst+i);
}
}