Update filters to use skvx instead of SkNx
Change-Id: I1a5490f546a3cb046c64b114a30be991d2d9f2cc Reviewed-on: https://skia-review.googlesource.com/c/skia/+/541064 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Michael Ludwig <michaelludwig@google.com>
This commit is contained in:
parent
f2d000328f
commit
8e870728db
@ -798,7 +798,7 @@ private:
|
||||
SI Vec<8,uint16_t> mull(const Vec<8,uint8_t>& x,
|
||||
const Vec<8,uint8_t>& y) {
|
||||
return to_vec<8,uint16_t>(vmull_u8(to_vext(x),
|
||||
to_vext(y)));
|
||||
to_vext(y)));
|
||||
}
|
||||
|
||||
SIN std::enable_if_t<(N < 8), Vec<N,uint16_t>> mull(const Vec<N,uint8_t>& x,
|
||||
@ -815,13 +815,37 @@ SIN std::enable_if_t<(N > 8), Vec<N,uint16_t>> mull(const Vec<N,uint8_t>& x,
|
||||
mull(x.hi, y.hi));
|
||||
}
|
||||
|
||||
// Or do four u16*u16 -> u32 in one instruction, vmull_u16
// Widening multiply, base case for 16-bit lanes: lane i of the result is the
// 32-bit product of x[i] and y[i], so no bits of the product are dropped.
//   x, y    four-lane uint16_t vectors.
//   returns four-lane uint32_t vector of per-lane products.
// NOTE(review): to_vext / to_vec are defined outside this view; presumably they
// convert between Vec and the compiler's native vector type -- confirm.
|
||||
SI Vec<4,uint32_t> mull(const Vec<4,uint16_t>& x,
|
||||
const Vec<4,uint16_t>& y) {
|
||||
return to_vec<4,uint32_t>(vmull_u16(to_vext(x),
|
||||
to_vext(y)));
|
||||
}
|
||||
|
||||
// Widening u16*u16 -> u32 multiply for vectors narrower than four lanes.
// The enable_if restricts this overload to N < 4.
//   x, y    N-lane uint16_t vectors.
//   returns N-lane uint32_t vector of per-lane products.
SIN std::enable_if_t<(N < 4), Vec<N,uint32_t>> mull(const Vec<N,uint16_t>& x,
|
||||
const Vec<N,uint16_t>& y) {
|
||||
// N < 4 --> double up data until N == 4, returning the part we need.
// join(x,x) has 2N lanes; the products of the duplicated upper half are
// computed and then discarded by taking .lo.
|
||||
return mull(join(x,x),
|
||||
join(y,y)).lo;
|
||||
}
|
||||
|
||||
// Widening u16*u16 -> u32 multiply for vectors wider than four lanes.
// The enable_if restricts this overload to N > 4.
//   x, y    N-lane uint16_t vectors.
//   returns N-lane uint32_t vector of per-lane products.
SIN std::enable_if_t<(N > 4), Vec<N,uint32_t>> mull(const Vec<N,uint16_t>& x,
|
||||
const Vec<N,uint16_t>& y) {
|
||||
// N > 4 --> usual join(lo,hi) strategy to recurse down to N == 4.
// Each half is multiplied independently and the two results are
// concatenated back in lo, hi order.
|
||||
return join(mull(x.lo, y.lo),
|
||||
mull(x.hi, y.hi));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// Nothing special when we don't have NEON... just cast up to 16-bit and multiply.
|
||||
// Nothing special when we don't have NEON... just cast up and multiply.
|
||||
SIN Vec<N,uint16_t> mull(const Vec<N,uint8_t>& x,
|
||||
const Vec<N,uint8_t>& y) {
|
||||
return cast<uint16_t>(x)
|
||||
* cast<uint16_t>(y);
|
||||
const Vec<N,uint8_t>& y) {
|
||||
return cast<uint16_t>(x) * cast<uint16_t>(y);
|
||||
}
|
||||
// Portable (the #else, non-NEON branch) widening u16*u16 -> u32 multiply.
//   x, y    N-lane uint16_t vectors.
//   returns N-lane uint32_t vector of per-lane products.
SIN Vec<N,uint32_t> mull(const Vec<N,uint16_t>& x,
|
||||
const Vec<N,uint16_t>& y) {
|
||||
// Widen both operands before multiplying so the product is formed at
// 32 bits; the largest case, 65535 * 65535, still fits in a uint32_t.
return cast<uint32_t>(x) * cast<uint32_t>(y);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -2647,10 +2647,10 @@ generated_cc_atom(
|
||||
":SkMaskBlurFilter_hdr",
|
||||
"//include/core:SkColorPriv_hdr",
|
||||
"//include/private:SkMalloc_hdr",
|
||||
"//include/private:SkNx_hdr",
|
||||
"//include/private:SkTPin_hdr",
|
||||
"//include/private:SkTemplates_hdr",
|
||||
"//include/private:SkTo_hdr",
|
||||
"//include/private:SkVx_hdr",
|
||||
],
|
||||
)
|
||||
|
||||
|
@ -9,10 +9,10 @@
|
||||
|
||||
#include "include/core/SkColorPriv.h"
|
||||
#include "include/private/SkMalloc.h"
|
||||
#include "include/private/SkNx.h"
|
||||
#include "include/private/SkTPin.h"
|
||||
#include "include/private/SkTemplates.h"
|
||||
#include "include/private/SkTo.h"
|
||||
#include "include/private/SkVx.h"
|
||||
#include "src/core/SkArenaAlloc.h"
|
||||
#include "src/core/SkGaussFilter.h"
|
||||
|
||||
@ -294,7 +294,15 @@ static void argb32_to_a8(uint8_t* a8, const uint8_t* from, int width) {
|
||||
}
|
||||
using ToA8 = decltype(bw_to_a8);
|
||||
|
||||
static Sk8h load(const uint8_t* from, int width, ToA8* toA8) {
|
||||
using fp88 = skvx::Vec<8, uint16_t>; // 8-wide fixed point 8.8
|
||||
|
||||
// Per-lane "multiply high": returns the upper 16 bits of each lane's product
// a[i] * b[i].
//   a, b    eight-lane uint16_t vectors (fp88).
//   returns eight-lane uint16_t vector holding (a[i] * b[i]) >> 16.
// NOTE(review): mull is not declared in this hunk; the >> 16 followed by a
// narrowing cast implies it yields 32-bit lanes -- presumably the skvx
// widening multiply, found through argument-dependent lookup. Confirm.
static fp88 mulhi(const fp88& a, const fp88& b) {
|
||||
// On NEON, this is optimal; with SSE, clang appears to detect the pattern and convert it to the
|
||||
// optimal single instruction, _mm_mulhi_epu16.
|
||||
// Widen to 32-bit products, keep the top half, then narrow back to 16 bits.
return skvx::cast<uint16_t>(mull(a, b) >> 16);
|
||||
}
|
||||
|
||||
static fp88 load(const uint8_t* from, int width, ToA8* toA8) {
|
||||
// Our fast path is a full 8-byte load of A8.
|
||||
// So we'll conditionally handle the two slow paths using tmp:
|
||||
// - if we have a function to convert another mask to A8, use it;
|
||||
@ -311,11 +319,11 @@ static Sk8h load(const uint8_t* from, int width, ToA8* toA8) {
|
||||
}
|
||||
|
||||
// Load A8 and convert to 8.8 fixed-point.
|
||||
return SkNx_cast<uint16_t>(Sk8b::Load(from)) << 8;
|
||||
return skvx::cast<uint16_t>(skvx::byte8::Load(from)) << 8;
|
||||
}
|
||||
|
||||
static void store(uint8_t* to, const Sk8h& v, int width) {
|
||||
Sk8b b = SkNx_cast<uint8_t>(v >> 8);
|
||||
static void store(uint8_t* to, const fp88& v, int width) {
|
||||
skvx::byte8 b = skvx::cast<uint8_t>(v >> 8);
|
||||
if (width == 8) {
|
||||
b.store(to);
|
||||
} else {
|
||||
@ -410,135 +418,131 @@ static constexpr uint16_t kHalf = 0x80u;
|
||||
// Where we rely on the compiler to generate efficient code for the {____, n, ....} notation.
|
||||
|
||||
static void blur_x_radius_1(
|
||||
const Sk8h& s0,
|
||||
const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&,
|
||||
Sk8h* d0, Sk8h* d8) {
|
||||
const fp88& s0,
|
||||
const fp88& g0, const fp88& g1, const fp88&, const fp88&, const fp88&,
|
||||
fp88* d0, fp88* d8) {
|
||||
|
||||
auto v1 = s0.mulHi(g1);
|
||||
auto v0 = s0.mulHi(g0);
|
||||
auto v1 = mulhi(s0, g1);
|
||||
auto v0 = mulhi(s0, g0);
|
||||
|
||||
// D[n..n+7] += S[n..n+7] * G[1]
|
||||
*d0 += v1;
|
||||
|
||||
//D[n..n+8] += {0, S[n..n+7] * G[0]}
|
||||
*d0 += Sk8h{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]};
|
||||
*d8 += Sk8h{v0[7], _____, _____, _____, _____, _____, _____, _____};
|
||||
*d0 += fp88{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]};
|
||||
*d8 += fp88{v0[7], _____, _____, _____, _____, _____, _____, _____};
|
||||
|
||||
// D[n..n+9] += {0, 0, S[n..n+7] * G[1]}
|
||||
*d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
|
||||
*d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
|
||||
*d0 += fp88{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
|
||||
*d8 += fp88{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
|
||||
|
||||
}
|
||||
|
||||
static void blur_x_radius_2(
|
||||
const Sk8h& s0,
|
||||
const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
|
||||
Sk8h* d0, Sk8h* d8) {
|
||||
auto v0 = s0.mulHi(g0);
|
||||
auto v1 = s0.mulHi(g1);
|
||||
auto v2 = s0.mulHi(g2);
|
||||
const fp88& s0,
|
||||
const fp88& g0, const fp88& g1, const fp88& g2, const fp88&, const fp88&,
|
||||
fp88* d0, fp88* d8) {
|
||||
auto v0 = mulhi(s0, g0);
|
||||
auto v1 = mulhi(s0, g1);
|
||||
auto v2 = mulhi(s0, g2);
|
||||
|
||||
// D[n..n+7] += S[n..n+7] * G[2]
|
||||
*d0 += v2;
|
||||
|
||||
// D[n..n+8] += {0, S[n..n+7] * G[1]}
|
||||
*d0 += Sk8h{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]};
|
||||
*d8 += Sk8h{v1[7], _____, _____, _____, _____, _____, _____, _____};
|
||||
*d0 += fp88{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]};
|
||||
*d8 += fp88{v1[7], _____, _____, _____, _____, _____, _____, _____};
|
||||
|
||||
// D[n..n+9] += {0, 0, S[n..n+7] * G[0]}
|
||||
*d0 += Sk8h{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]};
|
||||
*d8 += Sk8h{v0[6], v0[7], _____, _____, _____, _____, _____, _____};
|
||||
*d0 += fp88{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]};
|
||||
*d8 += fp88{v0[6], v0[7], _____, _____, _____, _____, _____, _____};
|
||||
|
||||
// D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]}
|
||||
*d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
|
||||
*d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
|
||||
*d0 += fp88{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
|
||||
*d8 += fp88{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
|
||||
|
||||
// D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[2]}
|
||||
*d0 += Sk8h{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]};
|
||||
*d8 += Sk8h{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____};
|
||||
*d0 += fp88{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]};
|
||||
*d8 += fp88{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____};
|
||||
}
|
||||
|
||||
static void blur_x_radius_3(
|
||||
const Sk8h& s0,
|
||||
const Sk8h& gauss0, const Sk8h& gauss1, const Sk8h& gauss2, const Sk8h& gauss3, const Sk8h&,
|
||||
Sk8h* d0, Sk8h* d8) {
|
||||
auto v0 = s0.mulHi(gauss0);
|
||||
auto v1 = s0.mulHi(gauss1);
|
||||
auto v2 = s0.mulHi(gauss2);
|
||||
auto v3 = s0.mulHi(gauss3);
|
||||
const fp88& s0,
|
||||
const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88&,
|
||||
fp88* d0, fp88* d8) {
|
||||
auto v0 = mulhi(s0, g0);
|
||||
auto v1 = mulhi(s0, g1);
|
||||
auto v2 = mulhi(s0, g2);
|
||||
auto v3 = mulhi(s0, g3);
|
||||
|
||||
// D[n..n+7] += S[n..n+7] * G[3]
|
||||
*d0 += v3;
|
||||
|
||||
// D[n..n+8] += {0, S[n..n+7] * G[2]}
|
||||
*d0 += Sk8h{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]};
|
||||
*d8 += Sk8h{v2[7], _____, _____, _____, _____, _____, _____, _____};
|
||||
*d0 += fp88{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]};
|
||||
*d8 += fp88{v2[7], _____, _____, _____, _____, _____, _____, _____};
|
||||
|
||||
// D[n..n+9] += {0, 0, S[n..n+7] * G[1]}
|
||||
*d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
|
||||
*d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
|
||||
*d0 += fp88{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
|
||||
*d8 += fp88{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
|
||||
|
||||
// D[n..n+10] += {0, 0, 0, S[n..n+7] * G[0]}
|
||||
*d0 += Sk8h{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]};
|
||||
*d8 += Sk8h{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____};
|
||||
*d0 += fp88{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]};
|
||||
*d8 += fp88{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____};
|
||||
|
||||
// D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[1]}
|
||||
*d0 += Sk8h{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]};
|
||||
*d8 += Sk8h{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____};
|
||||
*d0 += fp88{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]};
|
||||
*d8 += fp88{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____};
|
||||
|
||||
// D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[2]}
|
||||
*d0 += Sk8h{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]};
|
||||
*d8 += Sk8h{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____};
|
||||
*d0 += fp88{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]};
|
||||
*d8 += fp88{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____};
|
||||
|
||||
// D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
|
||||
*d0 += Sk8h{_____, _____, _____, _____, _____, _____, v3[0], v3[1]};
|
||||
*d8 += Sk8h{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____};
|
||||
*d0 += fp88{_____, _____, _____, _____, _____, _____, v3[0], v3[1]};
|
||||
*d8 += fp88{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____};
|
||||
}
|
||||
|
||||
static void blur_x_radius_4(
|
||||
const Sk8h& s0,
|
||||
const Sk8h& gauss0,
|
||||
const Sk8h& gauss1,
|
||||
const Sk8h& gauss2,
|
||||
const Sk8h& gauss3,
|
||||
const Sk8h& gauss4,
|
||||
Sk8h* d0, Sk8h* d8) {
|
||||
auto v0 = s0.mulHi(gauss0);
|
||||
auto v1 = s0.mulHi(gauss1);
|
||||
auto v2 = s0.mulHi(gauss2);
|
||||
auto v3 = s0.mulHi(gauss3);
|
||||
auto v4 = s0.mulHi(gauss4);
|
||||
const fp88& s0,
|
||||
const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4,
|
||||
fp88* d0, fp88* d8) {
|
||||
auto v0 = mulhi(s0, g0);
|
||||
auto v1 = mulhi(s0, g1);
|
||||
auto v2 = mulhi(s0, g2);
|
||||
auto v3 = mulhi(s0, g3);
|
||||
auto v4 = mulhi(s0, g4);
|
||||
|
||||
// D[n..n+7] += S[n..n+7] * G[4]
|
||||
*d0 += v4;
|
||||
|
||||
// D[n..n+8] += {0, S[n..n+7] * G[3]}
|
||||
*d0 += Sk8h{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]};
|
||||
*d8 += Sk8h{v3[7], _____, _____, _____, _____, _____, _____, _____};
|
||||
*d0 += fp88{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]};
|
||||
*d8 += fp88{v3[7], _____, _____, _____, _____, _____, _____, _____};
|
||||
|
||||
// D[n..n+9] += {0, 0, S[n..n+7] * G[2]}
|
||||
*d0 += Sk8h{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]};
|
||||
*d8 += Sk8h{v2[6], v2[7], _____, _____, _____, _____, _____, _____};
|
||||
*d0 += fp88{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]};
|
||||
*d8 += fp88{v2[6], v2[7], _____, _____, _____, _____, _____, _____};
|
||||
|
||||
// D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]}
|
||||
*d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
|
||||
*d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
|
||||
*d0 += fp88{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
|
||||
*d8 += fp88{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
|
||||
|
||||
// D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[0]}
|
||||
*d0 += Sk8h{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]};
|
||||
*d8 += Sk8h{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____};
|
||||
*d0 += fp88{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]};
|
||||
*d8 += fp88{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____};
|
||||
|
||||
// D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[1]}
|
||||
*d0 += Sk8h{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]};
|
||||
*d8 += Sk8h{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____};
|
||||
*d0 += fp88{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]};
|
||||
*d8 += fp88{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____};
|
||||
|
||||
// D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[2]}
|
||||
*d0 += Sk8h{_____, _____, _____, _____, _____, _____, v2[0], v2[1]};
|
||||
*d8 += Sk8h{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____};
|
||||
*d0 += fp88{_____, _____, _____, _____, _____, _____, v2[0], v2[1]};
|
||||
*d8 += fp88{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____};
|
||||
|
||||
// D[n..n+14] += {0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
|
||||
*d0 += Sk8h{_____, _____, _____, _____, _____, _____, _____, v3[0]};
|
||||
*d8 += Sk8h{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____};
|
||||
*d0 += fp88{_____, _____, _____, _____, _____, _____, _____, v3[0]};
|
||||
*d8 += fp88{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____};
|
||||
|
||||
// D[n..n+15] += {0, 0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[4]}
|
||||
*d8 += v4;
|
||||
@ -549,11 +553,11 @@ using BlurX = decltype(blur_x_radius_1);
|
||||
// BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
|
||||
static void blur_row(
|
||||
BlurX blur,
|
||||
const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
|
||||
const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4,
|
||||
const uint8_t* src, int srcW,
|
||||
uint8_t* dst, int dstW) {
|
||||
// Clear the buffer to handle summing wider than source.
|
||||
Sk8h d0{kHalf}, d8{kHalf};
|
||||
fp88 d0(kHalf), d8(kHalf);
|
||||
|
||||
// Go by multiples of 8 in src.
|
||||
int x = 0;
|
||||
@ -563,7 +567,7 @@ static void blur_row(
|
||||
store(dst, d0, 8);
|
||||
|
||||
d0 = d8;
|
||||
d8 = Sk8h{kHalf};
|
||||
d8 = fp88(kHalf);
|
||||
|
||||
src += 8;
|
||||
dst += 8;
|
||||
@ -596,11 +600,11 @@ static void blur_x_rect(BlurX blur,
|
||||
const uint8_t* src, size_t srcStride, int srcW,
|
||||
uint8_t* dst, size_t dstStride, int dstW, int dstH) {
|
||||
|
||||
Sk8h g0{gauss[0]},
|
||||
g1{gauss[1]},
|
||||
g2{gauss[2]},
|
||||
g3{gauss[3]},
|
||||
g4{gauss[4]};
|
||||
fp88 g0(gauss[0]),
|
||||
g1(gauss[1]),
|
||||
g2(gauss[2]),
|
||||
g3(gauss[3]),
|
||||
g4(gauss[4]);
|
||||
|
||||
// Blur *ALL* the rows.
|
||||
for (int y = 0; y < dstH; y++) {
|
||||
@ -686,29 +690,29 @@ static void direct_blur_x(int radius, uint16_t* gauss,
|
||||
// d01[0..7] = d12[0..7] + S[n+0r..n+0r+7]*G[0]
|
||||
// d12[0..7] = S[n+0r..n+0r+7]*G[1]
|
||||
// return answer[0..7]
|
||||
static Sk8h blur_y_radius_1(
|
||||
const Sk8h& s0,
|
||||
const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&,
|
||||
Sk8h* d01, Sk8h* d12, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*) {
|
||||
auto v0 = s0.mulHi(g0);
|
||||
auto v1 = s0.mulHi(g1);
|
||||
static fp88 blur_y_radius_1(
|
||||
const fp88& s0,
|
||||
const fp88& g0, const fp88& g1, const fp88&, const fp88&, const fp88&,
|
||||
fp88* d01, fp88* d12, fp88*, fp88*, fp88*, fp88*, fp88*, fp88*) {
|
||||
auto v0 = mulhi(s0, g0);
|
||||
auto v1 = mulhi(s0, g1);
|
||||
|
||||
Sk8h answer = *d01 + v1;
|
||||
fp88 answer = *d01 + v1;
|
||||
*d01 = *d12 + v0;
|
||||
*d12 = v1 + kHalf;
|
||||
|
||||
return answer;
|
||||
}
|
||||
|
||||
static Sk8h blur_y_radius_2(
|
||||
const Sk8h& s0,
|
||||
const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
|
||||
Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h*, Sk8h*, Sk8h*, Sk8h*) {
|
||||
auto v0 = s0.mulHi(g0);
|
||||
auto v1 = s0.mulHi(g1);
|
||||
auto v2 = s0.mulHi(g2);
|
||||
static fp88 blur_y_radius_2(
|
||||
const fp88& s0,
|
||||
const fp88& g0, const fp88& g1, const fp88& g2, const fp88&, const fp88&,
|
||||
fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88*, fp88*, fp88*, fp88*) {
|
||||
auto v0 = mulhi(s0, g0);
|
||||
auto v1 = mulhi(s0, g1);
|
||||
auto v2 = mulhi(s0, g2);
|
||||
|
||||
Sk8h answer = *d01 + v2;
|
||||
fp88 answer = *d01 + v2;
|
||||
*d01 = *d12 + v1;
|
||||
*d12 = *d23 + v0;
|
||||
*d23 = *d34 + v1;
|
||||
@ -717,16 +721,16 @@ static Sk8h blur_y_radius_2(
|
||||
return answer;
|
||||
}
|
||||
|
||||
static Sk8h blur_y_radius_3(
|
||||
const Sk8h& s0,
|
||||
const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h&,
|
||||
Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h*, Sk8h*) {
|
||||
auto v0 = s0.mulHi(g0);
|
||||
auto v1 = s0.mulHi(g1);
|
||||
auto v2 = s0.mulHi(g2);
|
||||
auto v3 = s0.mulHi(g3);
|
||||
static fp88 blur_y_radius_3(
|
||||
const fp88& s0,
|
||||
const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88&,
|
||||
fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88* d45, fp88* d56, fp88*, fp88*) {
|
||||
auto v0 = mulhi(s0, g0);
|
||||
auto v1 = mulhi(s0, g1);
|
||||
auto v2 = mulhi(s0, g2);
|
||||
auto v3 = mulhi(s0, g3);
|
||||
|
||||
Sk8h answer = *d01 + v3;
|
||||
fp88 answer = *d01 + v3;
|
||||
*d01 = *d12 + v2;
|
||||
*d12 = *d23 + v1;
|
||||
*d23 = *d34 + v0;
|
||||
@ -737,17 +741,17 @@ static Sk8h blur_y_radius_3(
|
||||
return answer;
|
||||
}
|
||||
|
||||
static Sk8h blur_y_radius_4(
|
||||
const Sk8h& s0,
|
||||
const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
|
||||
Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h* d67, Sk8h* d78) {
|
||||
auto v0 = s0.mulHi(g0);
|
||||
auto v1 = s0.mulHi(g1);
|
||||
auto v2 = s0.mulHi(g2);
|
||||
auto v3 = s0.mulHi(g3);
|
||||
auto v4 = s0.mulHi(g4);
|
||||
static fp88 blur_y_radius_4(
|
||||
const fp88& s0,
|
||||
const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4,
|
||||
fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88* d45, fp88* d56, fp88* d67, fp88* d78) {
|
||||
auto v0 = mulhi(s0, g0);
|
||||
auto v1 = mulhi(s0, g1);
|
||||
auto v2 = mulhi(s0, g2);
|
||||
auto v3 = mulhi(s0, g3);
|
||||
auto v4 = mulhi(s0, g4);
|
||||
|
||||
Sk8h answer = *d01 + v4;
|
||||
fp88 answer = *d01 + v4;
|
||||
*d01 = *d12 + v3;
|
||||
*d12 = *d23 + v2;
|
||||
*d23 = *d34 + v1;
|
||||
@ -766,13 +770,13 @@ using BlurY = decltype(blur_y_radius_1);
|
||||
static void blur_column(
|
||||
ToA8 toA8,
|
||||
BlurY blur, int radius, int width,
|
||||
const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
|
||||
const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4,
|
||||
const uint8_t* src, size_t srcRB, int srcH,
|
||||
uint8_t* dst, size_t dstRB) {
|
||||
Sk8h d01{kHalf}, d12{kHalf}, d23{kHalf}, d34{kHalf},
|
||||
d45{kHalf}, d56{kHalf}, d67{kHalf}, d78{kHalf};
|
||||
fp88 d01(kHalf), d12(kHalf), d23(kHalf), d34(kHalf),
|
||||
d45(kHalf), d56(kHalf), d67(kHalf), d78(kHalf);
|
||||
|
||||
auto flush = [&](uint8_t* to, const Sk8h& v0, const Sk8h& v1) {
|
||||
auto flush = [&](uint8_t* to, const fp88& v0, const fp88& v1) {
|
||||
store(to, v0, width);
|
||||
to += dstRB;
|
||||
store(to, v1, width);
|
||||
@ -809,11 +813,11 @@ static void blur_y_rect(ToA8 toA8, const int strideOf8,
|
||||
const uint8_t *src, size_t srcRB, int srcW, int srcH,
|
||||
uint8_t *dst, size_t dstRB) {
|
||||
|
||||
Sk8h g0{gauss[0]},
|
||||
g1{gauss[1]},
|
||||
g2{gauss[2]},
|
||||
g3{gauss[3]},
|
||||
g4{gauss[4]};
|
||||
fp88 g0(gauss[0]),
|
||||
g1(gauss[1]),
|
||||
g2(gauss[2]),
|
||||
g3(gauss[3]),
|
||||
g4(gauss[4]);
|
||||
|
||||
int x = 0;
|
||||
for (; x <= srcW - 8; x += 8) {
|
||||
|
@ -38,7 +38,7 @@ generated_cc_atom(
|
||||
"//include/core:SkCanvas_hdr",
|
||||
"//include/effects:SkImageFilters_hdr",
|
||||
"//include/gpu:GrRecordingContext_hdr",
|
||||
"//include/private:SkNx_hdr",
|
||||
"//include/private:SkVx_hdr",
|
||||
"//src/core:SkImageFilter_Base_hdr",
|
||||
"//src/core:SkReadBuffer_hdr",
|
||||
"//src/core:SkRuntimeEffectPriv_hdr",
|
||||
|
@ -8,7 +8,7 @@
|
||||
#include "include/core/SkBitmap.h"
|
||||
#include "include/core/SkCanvas.h"
|
||||
#include "include/effects/SkImageFilters.h"
|
||||
#include "include/private/SkNx.h"
|
||||
#include "include/private/SkVx.h"
|
||||
#include "src/core/SkImageFilter_Base.h"
|
||||
#include "src/core/SkReadBuffer.h"
|
||||
#include "src/core/SkSpecialImage.h"
|
||||
@ -130,42 +130,38 @@ void SkArithmeticImageFilter::flatten(SkWriteBuffer& buffer) const {
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static Sk4f pin(float min, const Sk4f& val, float max) {
|
||||
return Sk4f::Max(min, Sk4f::Min(val, max));
|
||||
}
|
||||
|
||||
template <bool EnforcePMColor>
|
||||
void arith_span(const SkV4& k, SkPMColor dst[], const SkPMColor src[], int count) {
|
||||
const Sk4f k1 = k[0] * (1/255.0f),
|
||||
k2 = k[1],
|
||||
k3 = k[2],
|
||||
k4 = k[3] * 255.0f + 0.5f;
|
||||
const skvx::float4 k1 = k[0] * (1/255.0f),
|
||||
k2 = k[1],
|
||||
k3 = k[2],
|
||||
k4 = k[3] * 255.0f + 0.5f;
|
||||
|
||||
for (int i = 0; i < count; i++) {
|
||||
Sk4f s = SkNx_cast<float>(Sk4b::Load(src+i)),
|
||||
d = SkNx_cast<float>(Sk4b::Load(dst+i)),
|
||||
r = pin(0, k1*s*d + k2*s + k3*d + k4, 255);
|
||||
skvx::float4 s = skvx::cast<float>(skvx::byte4::Load(src+i)),
|
||||
d = skvx::cast<float>(skvx::byte4::Load(dst+i)),
|
||||
r = pin(k1*s*d + k2*s + k3*d + k4, skvx::float4(0.f), skvx::float4(255.f));
|
||||
if (EnforcePMColor) {
|
||||
Sk4f a = SkNx_shuffle<3,3,3,3>(r);
|
||||
r = Sk4f::Min(a, r);
|
||||
auto a = skvx::shuffle<3,3,3,3>(r);
|
||||
r = min(a, r);
|
||||
}
|
||||
SkNx_cast<uint8_t>(r).store(dst+i);
|
||||
skvx::cast<uint8_t>(r).store(dst+i);
|
||||
}
|
||||
}
|
||||
|
||||
// apply mode to src==transparent (0)
|
||||
template<bool EnforcePMColor> void arith_transparent(const SkV4& k, SkPMColor dst[], int count) {
|
||||
const Sk4f k3 = k[2],
|
||||
k4 = k[3] * 255.0f + 0.5f;
|
||||
const skvx::float4 k3 = k[2],
|
||||
k4 = k[3] * 255.0f + 0.5f;
|
||||
|
||||
for (int i = 0; i < count; i++) {
|
||||
Sk4f d = SkNx_cast<float>(Sk4b::Load(dst+i)),
|
||||
r = pin(0, k3*d + k4, 255);
|
||||
skvx::float4 d = skvx::cast<float>(skvx::byte4::Load(dst+i)),
|
||||
r = pin(k3*d + k4, skvx::float4(0.f), skvx::float4(255.f));
|
||||
if (EnforcePMColor) {
|
||||
Sk4f a = SkNx_shuffle<3,3,3,3>(r);
|
||||
r = Sk4f::Min(a, r);
|
||||
auto a = skvx::shuffle<3,3,3,3>(r);
|
||||
r = min(a, r);
|
||||
}
|
||||
SkNx_cast<uint8_t>(r).store(dst+i);
|
||||
skvx::cast<uint8_t>(r).store(dst+i);
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user