Update Sk4px to use skvx instead of SkNx
Adds a saturated_add function that was on SkNx and used in SkXfermode_opts, but hadn't been ported to skvx yet. Removes the Sk4px_opts variants and simplifies some of Sk4px's functions; many were already defined in skvx. The largest change is that Sk4px no longer extends a vector class: it used to extend Sk16b, but it does not extend skvx::byte16. Instead it just holds a vector as a data member. This was necessary so that we could define operators typed for Sk4px and Wide without conflicting with the free operators defined for the base skvx types.

Change-Id: I8c667ba86f662ccf07ad85aa32e78abfc0a8c7ae
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/542645
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Michael Ludwig <michaelludwig@google.com>
parent 3149a7b283
commit 767586b330
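To illustrate the operator-conflict problem the message describes, here is a minimal standalone sketch. Vec, Pixel16Inherit, and Pixel16Compose are hypothetical stand-in names for illustration, not Skia's types:

    #include <cstdint>

    // Stand-in for skvx::Vec, illustration only.
    template <int N, typename T>
    struct Vec { T vals[N]; };

    // skvx defines free operators like this for every Vec instantiation.
    template <int N, typename T>
    Vec<N,T> operator+(const Vec<N,T>& a, const Vec<N,T>& /*b*/) { return a; }

    // Inheritance: any expression that views the pixel type through its base
    // picks up the free Vec operator and yields a plain Vec, silently
    // dropping the pixel type (or colliding with operators we add ourselves).
    struct Pixel16Inherit : Vec<16, uint8_t> {};

    // Composition: the vector is just a member, so the free Vec operators
    // never apply to the pixel type directly, and we can define exactly the
    // typed operators we want.
    struct Pixel16Compose {
        Pixel16Compose operator+(const Pixel16Compose& o) const {
            Pixel16Compose r;
            r.fV = fV + o.fV;   // uses the free Vec operator internally
            return r;
        }
        Vec<16, uint8_t> fV;
    };

    int main() {
        Pixel16Inherit a{}, b{};
        auto c = a + b;   // deduces the free operator: c is Vec<16,uint8_t>
        (void)c;
        Pixel16Compose d{}, e{};
        auto f = d + e;   // f stays Pixel16Compose
        (void)f;
    }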
include/private/SkVx.h

@@ -749,6 +749,31 @@ SIN Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>& y
     return cast<uint8_t>( (X*Y+X)/256 );
 }
 
+// saturated_add(x,y) sums values and clamps to the maximum value instead of overflowing.
+SINT std::enable_if_t<std::is_unsigned_v<T>, Vec<N,T>> saturated_add(const Vec<N,T>& x,
+                                                                     const Vec<N,T>& y) {
+#if SKVX_USE_SIMD && (defined(__SSE__) || defined(__ARM_NEON))
+    // Both SSE and ARM have 16-lane saturated adds, so use intrinsics for those and recurse down
+    // or join up to take advantage.
+    if constexpr (N == 16 && sizeof(T) == 1) {
+        #if defined(__SSE__)
+            return unchecked_bit_pun<Vec<N,T>>(_mm_adds_epu8(unchecked_bit_pun<__m128i>(x),
+                                                             unchecked_bit_pun<__m128i>(y)));
+        #else // __ARM_NEON
+            return unchecked_bit_pun<Vec<N,T>>(vqaddq_u8(unchecked_bit_pun<uint8x16_t>(x),
+                                                         unchecked_bit_pun<uint8x16_t>(y)));
+        #endif
+    } else if constexpr (N < 16 && sizeof(T) == 1) {
+        return saturated_add(join(x,x), join(y,y)).lo;
+    } else if constexpr (sizeof(T) == 1) {
+        return join(saturated_add(x.lo, y.lo), saturated_add(x.hi, y.hi));
+    }
+#endif
+    // Otherwise saturate manually
+    auto sum = x + y;
+    return if_then_else(sum < x, Vec<N,T>(std::numeric_limits<T>::max()), sum);
+}
+
 // The ScaledDividerU32 takes a divisor > 1, and creates a function divide(numerator) that
 // calculates a numerator / denominator. For this to be rounded properly, numerator should have
 // half added in:
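The manual fallback above works because unsigned addition wraps: if x + y overflows, the wrapped sum is smaller than either operand, so sum < x flags exactly the overflowed lanes. A minimal scalar sketch of the same idea (standalone C++, not the skvx implementation):

    #include <cassert>
    #include <cstdint>
    #include <limits>

    // Scalar version of the skvx fallback: detect unsigned wraparound via sum < x.
    uint8_t saturated_add_u8(uint8_t x, uint8_t y) {
        uint8_t sum = uint8_t(x + y);
        return sum < x ? std::numeric_limits<uint8_t>::max() : sum;
    }

    int main() {
        assert(saturated_add_u8(100, 100) == 200);  // no overflow: plain sum
        assert(saturated_add_u8(200, 100) == 255);  // 300 wraps to 44, clamps to 255
        assert(saturated_add_u8(255, 255) == 255);
    }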
src/core/BUILD.bazel

@@ -240,10 +240,7 @@ generated_cc_atom(
     deps = [
         "//include/core:SkColor_hdr",
         "//include/private:SkColorData_hdr",
-        "//include/private:SkNx_hdr",
-        "//src/opts:Sk4px_NEON_hdr",
-        "//src/opts:Sk4px_SSE2_hdr",
-        "//src/opts:Sk4px_none_hdr",
+        "//include/private:SkVx_hdr",
     ],
 )
 
src/core/Sk4px.h
@@ -10,30 +10,27 @@
 
 #include "include/core/SkColor.h"
 #include "include/private/SkColorData.h"
-#include "include/private/SkNx.h"
 
 // This file may be included multiple times by .cpp files with different flags, leading
 // to different definitions. Usually that doesn't matter because it's all inlined, but
 // in Debug modes the compilers may not inline everything. So wrap everything in an
 // anonymous namespace to give each includer their own silo of this code (or the linker
 // will probably pick one randomly for us, which is rarely correct).
 namespace { // NOLINT(google-build-namespaces)
+#include "include/private/SkVx.h"
 
 // 1, 2 or 4 SkPMColors, generally vectorized.
-class Sk4px : public Sk16b {
+class Sk4px {
 public:
-    Sk4px(const Sk16b& v) : INHERITED(v) {}
+    Sk4px(const skvx::byte16& v) : fV(v) {}
 
     static Sk4px DupPMColor(SkPMColor c) {
-        Sk4u splat(c);
+        skvx::uint4 splat(c);
 
         Sk4px v;
         memcpy((void*)&v, &splat, 16);
         return v;
     }
 
-    Sk4px alphas() const;  // ARGB argb XYZW xyzw -> AAAA aaaa XXXX xxxx
-    Sk4px inv() const { return Sk16b(255) - *this; }
+    // RGBA rgba XYZW xyzw -> AAAA aaaa WWWW wwww
+    Sk4px alphas() const {
+        static_assert(SK_A32_SHIFT == 24, "This method assumes little-endian.");
+        return Sk4px(skvx::shuffle<3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15>(fV));
+    }
+    Sk4px inv() const { return Sk4px(skvx::byte16(255) - fV); }
 
     // When loading or storing fewer than 4 SkPMColors, we use the low lanes.
     static Sk4px Load4(const SkPMColor px[4]) {
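The new alphas() (and Load4Alphas below) lean on skvx::shuffle, whose template arguments name the source lane for each output lane; <3,3,3,3, 7,7,7,7, ...> therefore splats each pixel's alpha byte across its four lanes. A small 4-lane sketch of that semantics, using stand-in types rather than skvx:

    #include <cassert>
    #include <cstdint>

    // Minimal stand-in for skvx::shuffle on a fixed 4-byte vector:
    // each template argument is the source lane for that output lane.
    struct Byte4 { uint8_t v[4]; };

    template <int I0, int I1, int I2, int I3>
    Byte4 shuffle(const Byte4& x) {
        return Byte4{{x.v[I0], x.v[I1], x.v[I2], x.v[I3]}};
    }

    int main() {
        Byte4 px{{'r', 'g', 'b', 'a'}};        // one RGBA pixel
        Byte4 alphas = shuffle<3,3,3,3>(px);   // splat the alpha lane
        assert(alphas.v[0] == 'a' && alphas.v[3] == 'a');
    }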
@@ -53,8 +50,16 @@ public:
     }
 
     // Ditto for Alphas... Load2Alphas fills the low two lanes of Sk4px.
-    static Sk4px Load4Alphas(const SkAlpha[4]);  // AaXx -> AAAA aaaa XXXX xxxx
-    static Sk4px Load2Alphas(const SkAlpha[2]);  // Aa   -> AAAA aaaa ???? ????
+    // AaXx -> AAAA aaaa XXXX xxxx
+    static Sk4px Load4Alphas(const SkAlpha alphas[4]) {
+        skvx::byte4 a = skvx::byte4::Load(alphas);
+        return Sk4px(skvx::shuffle<0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3>(a));
+    }
+    // Aa -> AAAA aaaa ???? ????
+    static Sk4px Load2Alphas(const SkAlpha alphas[2]) {
+        skvx::byte2 a = skvx::byte2::Load(alphas);
+        return Sk4px(join(skvx::shuffle<0,0,0,0, 1,1,1,1>(a), skvx::byte8()));
+    }
 
     void store4(SkPMColor px[4]) const { memcpy(px, this, 16); }
     void store2(SkPMColor px[2]) const { memcpy(px, this,  8); }
@@ -62,45 +67,47 @@ public:
 
     // 1, 2, or 4 SkPMColors with 16-bit components.
    // This is most useful as the result of a multiply, e.g. from mulWiden().
-    class Wide : public Sk16h {
+    class Wide {
     public:
-        Wide(const Sk16h& v) : Sk16h(v) {}
-
-        // Add, then pack the top byte of each component back down into 4 SkPMColors.
-        Sk4px addNarrowHi(const Sk16h&) const;
+        Wide(const skvx::Vec<16, uint16_t>& v) : fV(v) {}
 
         // Rounds, i.e. (x+127) / 255.
-        Sk4px div255() const;
+        Sk4px div255() const { return Sk4px(skvx::div255(fV)); }
 
         // These just keep the types as Wide so the user doesn't have to keep casting.
-        Wide operator * (const Wide& o) const { return INHERITED::operator*(o); }
-        Wide operator + (const Wide& o) const { return INHERITED::operator+(o); }
-        Wide operator - (const Wide& o) const { return INHERITED::operator-(o); }
-        Wide operator >> (int bits) const { return INHERITED::operator>>(bits); }
-        Wide operator << (int bits) const { return INHERITED::operator<<(bits); }
+        Wide operator * (const Wide& o) const { return Wide(fV * o.fV); }
+        Wide operator + (const Wide& o) const { return Wide(fV + o.fV); }
+        Wide operator - (const Wide& o) const { return Wide(fV - o.fV); }
+        Wide operator >> (int bits) const { return Wide(fV >> bits); }
+        Wide operator << (int bits) const { return Wide(fV << bits); }
 
     private:
-        using INHERITED = Sk16h;
+        skvx::Vec<16, uint16_t> fV;
     };
 
-    Wide widen() const;                 // Widen 8-bit values to low 8-bits of 16-bit lanes.
-    Wide mulWiden(const Sk16b&) const;  // 8-bit x 8-bit -> 16-bit components.
+    // Widen 8-bit values to low 8-bits of 16-bit lanes.
+    Wide widen() const { return Wide(skvx::cast<uint16_t>(fV)); }
+    // 8-bit x 8-bit -> 16-bit components.
+    Wide mulWiden(const skvx::byte16& o) const { return Wide(mull(fV, o)); }
 
     // The only 8-bit multiply we use is 8-bit x 8-bit -> 16-bit.  Might as well make it pithy.
-    Wide operator * (const Sk4px& o) const { return this->mulWiden(o); }
+    Wide operator * (const Sk4px& o) const { return this->mulWiden(o.fV); }
 
     // These just keep the types as Sk4px so the user doesn't have to keep casting.
-    Sk4px operator + (const Sk4px& o) const { return INHERITED::operator+(o); }
-    Sk4px operator - (const Sk4px& o) const { return INHERITED::operator-(o); }
-    Sk4px operator < (const Sk4px& o) const { return INHERITED::operator<(o); }
-    Sk4px thenElse(const Sk4px& t, const Sk4px& e) const { return INHERITED::thenElse(t,e); }
+    Sk4px operator + (const Sk4px& o) const { return Sk4px(fV + o.fV); }
+    Sk4px operator - (const Sk4px& o) const { return Sk4px(fV - o.fV); }
+    Sk4px operator < (const Sk4px& o) const { return Sk4px(fV < o.fV); }
+    Sk4px operator & (const Sk4px& o) const { return Sk4px(fV & o.fV); }
+    Sk4px thenElse(const Sk4px& t, const Sk4px& e) const {
+        return Sk4px(if_then_else(fV, t.fV, e.fV));
+    }
 
     // Generally faster than (*this * o).div255().
     // May be incorrect by +-1, but is always exactly correct when *this or o is 0 or 255.
-    Sk4px approxMulDiv255(const Sk16b& o) const {
-        // (x*y + x) / 256 meets these criteria.  (As of course does (x*y + y) / 256 by symmetry.)
-        // FYI: (x*y + 255) / 256 also meets these criteria.  In my brief testing, it was slower.
-        return this->widen().addNarrowHi(*this * o);
+    Sk4px approxMulDiv255(const Sk4px& o) const {
+        return Sk4px(approx_scale(fV, o.fV));
     }
 
+    Sk4px saturatedAdd(const Sk4px& o) const {
+        return Sk4px(saturated_add(fV, o.fV));
+    }
+
     // A generic driver that maps fn over a src array into a dst array.
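div255() rounds exactly ((x+127)/255), while approxMulDiv255() accepts up to +-1 of error from (x*y + x)/256 in exchange for speed, and is exact whenever either operand is 0 or 255. A standalone brute-force check of both claims over all byte pairs (plain C++, independent of skvx; div255 here uses the identity (v+127)/255 == ((v+128)*257)>>16 noted in the old SSE2 code below):

    #include <cassert>
    #include <cstdint>

    // Exact rounding divide by 255, valid for v in [0, 255*255].
    static uint8_t div255(uint32_t v) { return uint8_t(((v + 128) * 257) >> 16); }

    // The cheaper approximation used by approx_scale(): (x*y + x) / 256.
    static uint8_t approx_mul_div255(uint32_t x, uint32_t y) { return uint8_t((x*y + x) >> 8); }

    int main() {
        for (uint32_t x = 0; x < 256; x++)
        for (uint32_t y = 0; y < 256; y++) {
            int exact = (int(x*y) + 127) / 255;
            assert(div255(x*y) == exact);                         // exact rounding
            int approx = approx_mul_div255(x, y);
            assert(approx - exact >= -1 && approx - exact <= 1);  // off by at most 1
            if (x == 0 || x == 255 || y == 0 || y == 255) {
                assert(approx == exact);                          // exact at the ends
            }
        }
    }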
@@ -192,7 +199,7 @@ public:
                 dst += 2; a += 2; n -= 2;
             }
             if (n >= 1) {
-                fn(Load1(dst), Sk16b(*a)).store1(dst);
+                fn(Load1(dst), skvx::byte16(*a)).store1(dst);
             }
             break;
         }
@@ -224,7 +231,7 @@ public:
                 dst += 2; src += 2; a += 2; n -= 2;
             }
             if (n >= 1) {
-                fn(Load1(dst), Load1(src), Sk16b(*a)).store1(dst);
+                fn(Load1(dst), Load1(src), skvx::byte16(*a)).store1(dst);
             }
             break;
         }
@@ -233,24 +240,10 @@ public:
 private:
     Sk4px() = default;
 
-    using INHERITED = Sk16b;
+    skvx::byte16 fV;
 };
 
-static_assert(sizeof(Sk4px) == sizeof(Sk16b));
+static_assert(sizeof(Sk4px) == 16);
+static_assert(sizeof(Sk4px) == sizeof(skvx::byte16));
+static_assert(alignof(Sk4px) == alignof(skvx::byte16));
 
 }  // namespace
 
-#ifdef SKNX_NO_SIMD
-    #include "src/opts/Sk4px_none.h"
-#else
-    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
-        #include "src/opts/Sk4px_SSE2.h"
-    #elif defined(SK_ARM_HAS_NEON)
-        #include "src/opts/Sk4px_NEON.h"
-    #else
-        #include "src/opts/Sk4px_none.h"
-    #endif
-#endif
-
-#endif//Sk4px_DEFINED
+#endif  // Sk4px_DEFINED
src/opts/BUILD.bazel

@@ -20,25 +20,6 @@ cc_library(
     ],
 )
 
-generated_cc_atom(
-    name = "Sk4px_NEON_hdr",
-    hdrs = ["Sk4px_NEON.h"],
-    visibility = ["//:__subpackages__"],
-)
-
-generated_cc_atom(
-    name = "Sk4px_SSE2_hdr",
-    hdrs = ["Sk4px_SSE2.h"],
-    visibility = ["//:__subpackages__"],
-)
-
-generated_cc_atom(
-    name = "Sk4px_none_hdr",
-    hdrs = ["Sk4px_none.h"],
-    visibility = ["//:__subpackages__"],
-    deps = ["//src/core:SkUtils_hdr"],
-)
-
 generated_cc_atom(
     name = "SkBitmapProcState_opts_hdr",
     hdrs = ["SkBitmapProcState_opts.h"],
@@ -198,7 +179,6 @@ generated_cc_atom(
     hdrs = ["SkXfermode_opts.h"],
     visibility = ["//:__subpackages__"],
     deps = [
-        "//include/private:SkNx_hdr",
         "//src/core:Sk4px_hdr",
         "//src/core:SkMSAN_hdr",
         "//src/core:SkXfermodePriv_hdr",
src/opts/Sk4px_NEON.h (deleted)

@@ -1,56 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-namespace { // NOLINT(google-build-namespaces)
-
-inline Sk4px::Wide Sk4px::widen() const {
-    return Sk16h(vmovl_u8(vget_low_u8 (this->fVec)),
-                 vmovl_u8(vget_high_u8(this->fVec)));
-}
-
-inline Sk4px::Wide Sk4px::mulWiden(const Sk16b& other) const {
-    return Sk16h(vmull_u8(vget_low_u8 (this->fVec), vget_low_u8 (other.fVec)),
-                 vmull_u8(vget_high_u8(this->fVec), vget_high_u8(other.fVec)));
-}
-
-inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
-    const Sk4px::Wide o(other);  // Should be no code, but allows us to access fLo, fHi.
-    return Sk16b(vcombine_u8(vaddhn_u16(this->fLo.fVec, o.fLo.fVec),
-                             vaddhn_u16(this->fHi.fVec, o.fHi.fVec)));
-}
-
-inline Sk4px Sk4px::Wide::div255() const {
-    // Calculated as (x + (x+128)>>8 +128) >> 8.  The 'r' in each instruction provides each +128.
-    return Sk16b(vcombine_u8(vraddhn_u16(this->fLo.fVec, vrshrq_n_u16(this->fLo.fVec, 8)),
-                             vraddhn_u16(this->fHi.fVec, vrshrq_n_u16(this->fHi.fVec, 8))));
-}
-
-inline Sk4px Sk4px::alphas() const {
-    auto as = vshrq_n_u32((uint32x4_t)fVec, SK_A32_SHIFT);  // ___3 ___2 ___1 ___0
-    return Sk16b((uint8x16_t)vmulq_n_u32(as, 0x01010101));  // 3333 2222 1111 0000
-}
-
-inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
-    uint8x16_t a8 = vdupq_n_u8(0);                           // ____ ____ ____ ____
-    a8 = vld1q_lane_u8(a+0, a8,  0);                         // ____ ____ ____ ___0
-    a8 = vld1q_lane_u8(a+1, a8,  4);                         // ____ ____ ___1 ___0
-    a8 = vld1q_lane_u8(a+2, a8,  8);                         // ____ ___2 ___1 ___0
-    a8 = vld1q_lane_u8(a+3, a8, 12);                         // ___3 ___2 ___1 ___0
-    auto a32 = (uint32x4_t)a8;                               //
-    return Sk16b((uint8x16_t)vmulq_n_u32(a32, 0x01010101));  // 3333 2222 1111 0000
-}
-
-inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
-    uint8x16_t a8 = vdupq_n_u8(0);                           // ____ ____ ____ ____
-    a8 = vld1q_lane_u8(a+0, a8,  0);                         // ____ ____ ____ ___0
-    a8 = vld1q_lane_u8(a+1, a8,  4);                         // ____ ____ ___1 ___0
-    auto a32 = (uint32x4_t)a8;                               //
-    return Sk16b((uint8x16_t)vmulq_n_u32(a32, 0x01010101));  // ____ ____ 1111 0000
-}
-
-} // namespace
src/opts/Sk4px_SSE2.h (deleted)

@@ -1,76 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-namespace { // NOLINT(google-build-namespaces)
-
-inline Sk4px::Wide Sk4px::widen() const {
-    return Sk16h(_mm_unpacklo_epi8(this->fVec, _mm_setzero_si128()),
-                 _mm_unpackhi_epi8(this->fVec, _mm_setzero_si128()));
-}
-
-inline Sk4px::Wide Sk4px::mulWiden(const Sk16b& other) const {
-    return this->widen() * Sk4px(other).widen();
-}
-
-inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
-    Sk4px::Wide r = (*this + other) >> 8;
-    return Sk4px(_mm_packus_epi16(r.fLo.fVec, r.fHi.fVec));
-}
-
-inline Sk4px Sk4px::Wide::div255() const {
-    // (x + 127) / 255 == ((x+128) * 257)>>16,
-    // and _mm_mulhi_epu16 makes the (_ * 257)>>16 part very convenient.
-    const __m128i _128 = _mm_set1_epi16(128),
-                  _257 = _mm_set1_epi16(257);
-    return Sk4px(_mm_packus_epi16(_mm_mulhi_epu16(_mm_add_epi16(fLo.fVec, _128), _257),
-                                  _mm_mulhi_epu16(_mm_add_epi16(fHi.fVec, _128), _257)));
-}
-
-// Load4Alphas and Load2Alphas use possibly-unaligned loads (SkAlpha[] -> uint16_t or uint32_t).
-// These are safe on x86, often with no speed penalty.
-
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
-    inline Sk4px Sk4px::alphas() const {
-        static_assert(SK_A32_SHIFT == 24, "Intel's always little-endian.");
-        __m128i splat = _mm_set_epi8(15,15,15,15, 11,11,11,11, 7,7,7,7, 3,3,3,3);
-        return Sk16b(_mm_shuffle_epi8(this->fVec, splat));
-    }
-
-    inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
-        uint32_t as;
-        memcpy(&as, a, 4);
-        __m128i splat = _mm_set_epi8(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
-        return Sk16b(_mm_shuffle_epi8(_mm_cvtsi32_si128(as), splat));
-    }
-#else
-    inline Sk4px Sk4px::alphas() const {
-        static_assert(SK_A32_SHIFT == 24, "Intel's always little-endian.");
-        // We exploit that A >= rgb for any premul pixel.
-        __m128i as = fVec;                             // 3xxx 2xxx 1xxx 0xxx
-        as = _mm_max_epu8(as, _mm_srli_epi32(as,  8)); // 33xx 22xx 11xx 00xx
-        as = _mm_max_epu8(as, _mm_srli_epi32(as, 16)); // 3333 2222 1111 0000
-        return Sk16b(as);
-    }
-
-    inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
-        __m128i as;
-        memcpy(&as, a, 4);                // ____ ____ ____ 3210
-        as = _mm_unpacklo_epi8 (as, as);  // ____ ____ 3322 1100
-        as = _mm_unpacklo_epi16(as, as);  // 3333 2222 1111 0000
-        return Sk16b(as);
-    }
-#endif
-
-inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
-    uint16_t alphas;
-    memcpy(&alphas, a, 2);
-    uint32_t alphas_and_two_zeros = alphas;  // Aa -> Aa00
-
-    return Load4Alphas((const SkAlpha*)&alphas_and_two_zeros);
-}
-
-} // namespace
src/opts/Sk4px_none.h (deleted)

@@ -1,59 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "src/core/SkUtils.h"
-
-namespace { // NOLINT(google-build-namespaces)
-
-inline Sk4px::Wide Sk4px::widen() const {
-    return Sk16h((*this)[ 0], (*this)[ 1], (*this)[ 2], (*this)[ 3],
-                 (*this)[ 4], (*this)[ 5], (*this)[ 6], (*this)[ 7],
-                 (*this)[ 8], (*this)[ 9], (*this)[10], (*this)[11],
-                 (*this)[12], (*this)[13], (*this)[14], (*this)[15]);
-}
-
-inline Sk4px::Wide Sk4px::mulWiden(const Sk16b& other) const {
-    return this->widen() * Sk4px(other).widen();
-}
-
-inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
-    Sk4px::Wide r = (*this + other) >> 8;
-    return Sk16b(r[ 0], r[ 1], r[ 2], r[ 3],
-                 r[ 4], r[ 5], r[ 6], r[ 7],
-                 r[ 8], r[ 9], r[10], r[11],
-                 r[12], r[13], r[14], r[15]);
-}
-
-inline Sk4px Sk4px::Wide::div255() const {
-    // Calculated as ((x+128) + ((x+128)>>8)) >> 8.
-    auto v = *this + Sk16h(128);
-    return v.addNarrowHi(v>>8);
-}
-
-inline Sk4px Sk4px::alphas() const {
-    static_assert(SK_A32_SHIFT == 24, "This method assumes little-endian.");
-    return Sk16b((*this)[ 3], (*this)[ 3], (*this)[ 3], (*this)[ 3],
-                 (*this)[ 7], (*this)[ 7], (*this)[ 7], (*this)[ 7],
-                 (*this)[11], (*this)[11], (*this)[11], (*this)[11],
-                 (*this)[15], (*this)[15], (*this)[15], (*this)[15]);
-}
-
-inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
-    return Sk16b(a[0], a[0], a[0], a[0],
-                 a[1], a[1], a[1], a[1],
-                 a[2], a[2], a[2], a[2],
-                 a[3], a[3], a[3], a[3]);
-}
-
-inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
-    return Sk16b(a[0], a[0], a[0], a[0],
-                 a[1], a[1], a[1], a[1],
-                 0,0,0,0,
-                 0,0,0,0);
-}
-
-} // namespace
src/opts/SkXfermode_opts.h

@@ -205,7 +205,7 @@ namespace SK_OPTS_NS {
         // ~~~>
         // a = 1*aa + d(1-1*aa) = aa + d(1-aa)
        // c = 0*aa + d(1-1*aa) = d(1-aa)
-        return Sk4px(Sk16b(aa) & Sk16b(0,0,0,255, 0,0,0,255, 0,0,0,255, 0,0,0,255))
+        return (aa & Sk4px(skvx::byte16{0,0,0,255, 0,0,0,255, 0,0,0,255, 0,0,0,255}))
             + d.approxMulDiv255(aa.inv());
     };
     while (h --> 0) {
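The byte mask {0,0,0,255, ...} zeroes the three color lanes of each 32-bit pixel and keeps only alpha, which is what reduces a = aa + d(1-aa), c = d(1-aa) to one AND plus one approxMulDiv255. A scalar sketch of the masking step for a single pixel (hypothetical standalone code; assumes SK_A32_SHIFT == 24, i.e. little-endian RGBA, as in the diff):

    #include <cassert>
    #include <cstdint>

    // Keep only the alpha byte (lane 3) of one little-endian RGBA pixel;
    // the vector code above does this for four pixels at once with a 16-byte mask.
    uint32_t alpha_only(uint32_t px) {
        return px & 0xFF000000u;  // bytes {0,0,0,255} per pixel, as in the mask
    }

    int main() {
        uint32_t px = 0x80402010u;  // A=0x80, B=0x40, G=0x20, R=0x10
        assert(alpha_only(px) == 0x80000000u);
    }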
@@ -8,7 +8,6 @@
 #ifndef Sk4pxXfermode_DEFINED
 #define Sk4pxXfermode_DEFINED
 
-#include "include/private/SkNx.h"
 #include "src/core/Sk4px.h"
 #include "src/core/SkMSAN.h"
 #include "src/core/SkXfermodePriv.h"
tests/BUILD.bazel

@@ -5562,7 +5562,6 @@ generated_cc_atom(
         ":Test_hdr",
-        "//include/private:SkNx_hdr",
         "//include/utils:SkRandom_hdr",
         "//src/core:Sk4px_hdr",
     ],
 )
 
tests/SkNxTest.cpp

@@ -7,7 +7,6 @@
 
-#include "include/private/SkNx.h"
 #include "include/utils/SkRandom.h"
 #include "src/core/Sk4px.h"
 #include "tests/Test.h"
 
 template <int N>
@@ -185,29 +184,6 @@ DEF_TEST(SkNi_mulHi, r) {
     REPORTER_ASSERT(r, c[3] == q[3]);
 }
 
-DEF_TEST(Sk4px_muldiv255round, r) {
-    for (int a = 0; a < (1<<8); a++) {
-    for (int b = 0; b < (1<<8); b++) {
-        int exact = (a*b+127)/255;
-
-        // Duplicate a and b 16x each.
-        Sk4px av = Sk16b(a),
-              bv = Sk16b(b);
-
-        // This way should always be exactly correct.
-        int correct = (av * bv).div255()[0];
-        REPORTER_ASSERT(r, correct == exact);
-
-        // We're a bit more flexible on this method: correct for 0 or 255, otherwise off by <=1.
-        int fast = av.approxMulDiv255(bv)[0];
-        REPORTER_ASSERT(r, fast-exact >= -1 && fast-exact <= 1);
-        if (a == 0 || a == 255 || b == 0 || b == 255) {
-            REPORTER_ASSERT(r, fast == exact);
-        }
-    }
-    }
-}
-
 DEF_TEST(SkNx_abs, r) {
     auto fs = Sk4f(0.0f, -0.0f, 2.0f, -4.0f).abs();
     REPORTER_ASSERT(r, fs[0] == 0.0f);
tests/SkVxTest.cpp

@@ -304,7 +304,7 @@ DEF_TEST(SkVx_strided_loads, r) {
     check_strided_loads<float>(r);
 }
 
-DEF_TEST(SkVM_ScaledDividerU32, r) {
+DEF_TEST(SkVx_ScaledDividerU32, r) {
     static constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
 
     auto errorBounds = [&](uint32_t actual, uint32_t expected) {
@@ -342,4 +342,16 @@ DEF_TEST(SkVM_ScaledDividerU32, r) {
     test(512'927'377);
 }
 
+DEF_TEST(SkVx_saturated_add, r) {
+    for (int a = 0; a < (1<<8); a++) {
+    for (int b = 0; b < (1<<8); b++) {
+        int exact = a+b;
+        if (exact > 255) { exact = 255; }
+        if (exact < 0)   { exact = 0; }
+
+        REPORTER_ASSERT(r, saturated_add(skvx::byte16(a), skvx::byte16(b))[0] == exact);
+    }
+    }
+}
+
 }  // namespace skvx
@@ -48,7 +48,6 @@ ignore = re.compile('|'.join([
     r'src/opts/.*_SSSE3\.h',
     r'src/opts/.*_neon\.h',
    r'src/opts/.*_sse\.h',
-    r'src/opts/Sk4px_.*\.h',
     r'src/ports/.*',
     r'src/utils/.*_win\.h',
     r'src/utils/win/.*',
@@ -113,4 +112,3 @@ def main(argv):
 
 if __name__ == '__main__':
     main(sys.argv)
-