Update Sk4px to use skvx instead of SkNx
Adds a saturated_add function that was on SkNx and used in SkXfermode_opts, but hadn't been ported to skvx yet. Removes the Sk4px_opts variants and simplifies some of Sk4px's functions; many were already defined in skvx. The largest change is that Sk4px no longer extends a vector class: it used to extend Sk16b, but it does not extend skvx::byte16. Instead it just holds a vector as a data member. This was necessary so that we could define operators typed for Sk4px and Wide without conflicting with the free operators defined for the base skvx types.

Change-Id: I8c667ba86f662ccf07ad85aa32e78abfc0a8c7ae
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/542645
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Michael Ludwig <michaelludwig@google.com>
parent 3149a7b283
commit 767586b330
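To illustrate the operator-conflict problem the message describes, here is a minimal standalone sketch. Vec, Pixel16Inherit, and Pixel16Compose are hypothetical stand-in names for illustration, not Skia's types:

    #include <cstdint>

    // Stand-in for skvx::Vec, illustration only.
    template <int N, typename T>
    struct Vec { T vals[N]; };

    // skvx defines free operators like this for every Vec instantiation.
    template <int N, typename T>
    Vec<N,T> operator+(const Vec<N,T>& a, const Vec<N,T>& /*b*/) { return a; }

    // Inheritance: any expression that views the pixel type through its base
    // picks up the free Vec operator and yields a plain Vec, silently
    // dropping the pixel type (or colliding with operators we add ourselves).
    struct Pixel16Inherit : Vec<16, uint8_t> {};

    // Composition: the vector is just a member, so the free Vec operators
    // never apply to the pixel type directly, and we can define exactly the
    // typed operators we want.
    struct Pixel16Compose {
        Pixel16Compose operator+(const Pixel16Compose& o) const {
            Pixel16Compose r;
            r.fV = fV + o.fV;   // uses the free Vec operator internally
            return r;
        }
        Vec<16, uint8_t> fV;
    };

    int main() {
        Pixel16Inherit a{}, b{};
        auto c = a + b;   // deduces the free operator: c is Vec<16,uint8_t>
        (void)c;
        Pixel16Compose d{}, e{};
        auto f = d + e;   // f stays Pixel16Compose
        (void)f;
    }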
include/private/SkVx.h

@@ -749,6 +749,31 @@ SIN Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>& y
     return cast<uint8_t>( (X*Y+X)/256 );
 }
 
+// saturated_add(x,y) sums values and clamps to the maximum value instead of overflowing.
+SINT std::enable_if_t<std::is_unsigned_v<T>, Vec<N,T>> saturated_add(const Vec<N,T>& x,
+                                                                     const Vec<N,T>& y) {
+#if SKVX_USE_SIMD && (defined(__SSE__) || defined(__ARM_NEON))
+    // Both SSE and ARM have 16-lane saturated adds, so use intrinsics for those and recurse down
+    // or join up to take advantage.
+    if constexpr (N == 16 && sizeof(T) == 1) {
+        #if defined(__SSE__)
+            return unchecked_bit_pun<Vec<N,T>>(_mm_adds_epu8(unchecked_bit_pun<__m128i>(x),
+                                                             unchecked_bit_pun<__m128i>(y)));
+        #else // __ARM_NEON
+            return unchecked_bit_pun<Vec<N,T>>(vqaddq_u8(unchecked_bit_pun<uint8x16_t>(x),
+                                                         unchecked_bit_pun<uint8x16_t>(y)));
+        #endif
+    } else if constexpr (N < 16 && sizeof(T) == 1) {
+        return saturated_add(join(x,x), join(y,y)).lo;
+    } else if constexpr (sizeof(T) == 1) {
+        return join(saturated_add(x.lo, y.lo), saturated_add(x.hi, y.hi));
+    }
+#endif
+    // Otherwise saturate manually
+    auto sum = x + y;
+    return if_then_else(sum < x, Vec<N,T>(std::numeric_limits<T>::max()), sum);
+}
+
 // The ScaledDividerU32 takes a divisor > 1, and creates a function divide(numerator) that
 // calculates a numerator / denominator. For this to be rounded properly, numerator should have
 // half added in:
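The manual fallback above works because unsigned addition wraps: if x + y overflows, the wrapped sum is smaller than either operand, so sum < x flags exactly the overflowed lanes. A minimal scalar sketch of the same idea (standalone C++, not the skvx implementation):

    #include <cassert>
    #include <cstdint>
    #include <limits>

    // Scalar version of the skvx fallback: detect unsigned wraparound via sum < x.
    uint8_t saturated_add_u8(uint8_t x, uint8_t y) {
        uint8_t sum = uint8_t(x + y);
        return sum < x ? std::numeric_limits<uint8_t>::max() : sum;
    }

    int main() {
        assert(saturated_add_u8(100, 100) == 200);  // no overflow: plain sum
        assert(saturated_add_u8(200, 100) == 255);  // 300 wraps to 44, clamps to 255
        assert(saturated_add_u8(255, 255) == 255);
    }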
src/core/BUILD.bazel

@@ -240,10 +240,7 @@ generated_cc_atom(
     deps = [
         "//include/core:SkColor_hdr",
         "//include/private:SkColorData_hdr",
-        "//include/private:SkNx_hdr",
-        "//src/opts:Sk4px_NEON_hdr",
-        "//src/opts:Sk4px_SSE2_hdr",
-        "//src/opts:Sk4px_none_hdr",
+        "//include/private:SkVx_hdr",
     ],
 )
 
src/core/Sk4px.h
@@ -10,30 +10,27 @@
 
 #include "include/core/SkColor.h"
 #include "include/private/SkColorData.h"
-#include "include/private/SkNx.h"
 
 // This file may be included multiple times by .cpp files with different flags, leading
 // to different definitions. Usually that doesn't matter because it's all inlined, but
 // in Debug modes the compilers may not inline everything. So wrap everything in an
 // anonymous namespace to give each includer their own silo of this code (or the linker
 // will probably pick one randomly for us, which is rarely correct).
 namespace { // NOLINT(google-build-namespaces)
+#include "include/private/SkVx.h"
 
 // 1, 2 or 4 SkPMColors, generally vectorized.
-class Sk4px : public Sk16b {
+class Sk4px {
 public:
-    Sk4px(const Sk16b& v) : INHERITED(v) {}
+    Sk4px(const skvx::byte16& v) : fV(v) {}
 
     static Sk4px DupPMColor(SkPMColor c) {
-        Sk4u splat(c);
+        skvx::uint4 splat(c);
 
         Sk4px v;
         memcpy((void*)&v, &splat, 16);
         return v;
     }
 
-    Sk4px alphas() const;  // ARGB argb XYZW xyzw -> AAAA aaaa XXXX xxxx
-    Sk4px inv() const { return Sk16b(255) - *this; }
+    // RGBA rgba XYZW xyzw -> AAAA aaaa WWWW wwww
+    Sk4px alphas() const {
+        static_assert(SK_A32_SHIFT == 24, "This method assumes little-endian.");
+        return Sk4px(skvx::shuffle<3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15>(fV));
+    }
+    Sk4px inv() const { return Sk4px(skvx::byte16(255) - fV); }
 
     // When loading or storing fewer than 4 SkPMColors, we use the low lanes.
     static Sk4px Load4(const SkPMColor px[4]) {
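The new alphas() (and Load4Alphas below) lean on skvx::shuffle, whose template arguments name the source lane for each output lane; <3,3,3,3, 7,7,7,7, ...> therefore splats each pixel's alpha byte across its four lanes. A small 4-lane sketch of that semantics, using stand-in types rather than skvx:

    #include <cassert>
    #include <cstdint>

    // Minimal stand-in for skvx::shuffle on a fixed 4-byte vector:
    // each template argument is the source lane for that output lane.
    struct Byte4 { uint8_t v[4]; };

    template <int I0, int I1, int I2, int I3>
    Byte4 shuffle(const Byte4& x) {
        return Byte4{{x.v[I0], x.v[I1], x.v[I2], x.v[I3]}};
    }

    int main() {
        Byte4 px{{'r', 'g', 'b', 'a'}};        // one RGBA pixel
        Byte4 alphas = shuffle<3,3,3,3>(px);   // splat the alpha lane
        assert(alphas.v[0] == 'a' && alphas.v[3] == 'a');
    }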
@@ -53,8 +50,16 @@ public:
     }
 
     // Ditto for Alphas... Load2Alphas fills the low two lanes of Sk4px.
-    static Sk4px Load4Alphas(const SkAlpha[4]);  // AaXx -> AAAA aaaa XXXX xxxx
-    static Sk4px Load2Alphas(const SkAlpha[2]);  // Aa   -> AAAA aaaa ???? ????
+    // AaXx -> AAAA aaaa XXXX xxxx
+    static Sk4px Load4Alphas(const SkAlpha alphas[4]) {
+        skvx::byte4 a = skvx::byte4::Load(alphas);
+        return Sk4px(skvx::shuffle<0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3>(a));
+    }
+    // Aa -> AAAA aaaa ???? ????
+    static Sk4px Load2Alphas(const SkAlpha alphas[2]) {
+        skvx::byte2 a = skvx::byte2::Load(alphas);
+        return Sk4px(join(skvx::shuffle<0,0,0,0, 1,1,1,1>(a), skvx::byte8()));
+    }
 
     void store4(SkPMColor px[4]) const { memcpy(px, this, 16); }
     void store2(SkPMColor px[2]) const { memcpy(px, this,  8); }
@@ -62,45 +67,47 @@ public:
 
     // 1, 2, or 4 SkPMColors with 16-bit components.
    // This is most useful as the result of a multiply, e.g. from mulWiden().
-    class Wide : public Sk16h {
+    class Wide {
     public:
-        Wide(const Sk16h& v) : Sk16h(v) {}
-
-        // Add, then pack the top byte of each component back down into 4 SkPMColors.
-        Sk4px addNarrowHi(const Sk16h&) const;
+        Wide(const skvx::Vec<16, uint16_t>& v) : fV(v) {}
 
         // Rounds, i.e. (x+127) / 255.
-        Sk4px div255() const;
+        Sk4px div255() const { return Sk4px(skvx::div255(fV)); }
 
         // These just keep the types as Wide so the user doesn't have to keep casting.
-        Wide operator * (const Wide& o) const { return INHERITED::operator*(o); }
-        Wide operator + (const Wide& o) const { return INHERITED::operator+(o); }
-        Wide operator - (const Wide& o) const { return INHERITED::operator-(o); }
-        Wide operator >> (int bits) const { return INHERITED::operator>>(bits); }
-        Wide operator << (int bits) const { return INHERITED::operator<<(bits); }
+        Wide operator * (const Wide& o) const { return Wide(fV * o.fV); }
+        Wide operator + (const Wide& o) const { return Wide(fV + o.fV); }
+        Wide operator - (const Wide& o) const { return Wide(fV - o.fV); }
+        Wide operator >> (int bits) const { return Wide(fV >> bits); }
+        Wide operator << (int bits) const { return Wide(fV << bits); }
 
     private:
-        using INHERITED = Sk16h;
+        skvx::Vec<16, uint16_t> fV;
     };
 
-    Wide widen() const;                 // Widen 8-bit values to low 8-bits of 16-bit lanes.
-    Wide mulWiden(const Sk16b&) const;  // 8-bit x 8-bit -> 16-bit components.
+    // Widen 8-bit values to low 8-bits of 16-bit lanes.
+    Wide widen() const { return Wide(skvx::cast<uint16_t>(fV)); }
+    // 8-bit x 8-bit -> 16-bit components.
+    Wide mulWiden(const skvx::byte16& o) const { return Wide(mull(fV, o)); }
 
     // The only 8-bit multiply we use is 8-bit x 8-bit -> 16-bit.  Might as well make it pithy.
-    Wide operator * (const Sk4px& o) const { return this->mulWiden(o); }
+    Wide operator * (const Sk4px& o) const { return this->mulWiden(o.fV); }
 
     // These just keep the types as Sk4px so the user doesn't have to keep casting.
-    Sk4px operator + (const Sk4px& o) const { return INHERITED::operator+(o); }
-    Sk4px operator - (const Sk4px& o) const { return INHERITED::operator-(o); }
-    Sk4px operator < (const Sk4px& o) const { return INHERITED::operator<(o); }
-    Sk4px thenElse(const Sk4px& t, const Sk4px& e) const { return INHERITED::thenElse(t,e); }
+    Sk4px operator + (const Sk4px& o) const { return Sk4px(fV + o.fV); }
+    Sk4px operator - (const Sk4px& o) const { return Sk4px(fV - o.fV); }
+    Sk4px operator < (const Sk4px& o) const { return Sk4px(fV < o.fV); }
+    Sk4px operator & (const Sk4px& o) const { return Sk4px(fV & o.fV); }
+    Sk4px thenElse(const Sk4px& t, const Sk4px& e) const {
+        return Sk4px(if_then_else(fV, t.fV, e.fV));
+    }
 
     // Generally faster than (*this * o).div255().
     // May be incorrect by +-1, but is always exactly correct when *this or o is 0 or 255.
-    Sk4px approxMulDiv255(const Sk16b& o) const {
-        // (x*y + x) / 256 meets these criteria.  (As of course does (x*y + y) / 256 by symmetry.)
-        // FYI: (x*y + 255) / 256 also meets these criteria.  In my brief testing, it was slower.
-        return this->widen().addNarrowHi(*this * o);
+    Sk4px approxMulDiv255(const Sk4px& o) const {
+        return Sk4px(approx_scale(fV, o.fV));
     }
 
+    Sk4px saturatedAdd(const Sk4px& o) const {
+        return Sk4px(saturated_add(fV, o.fV));
+    }
+
     // A generic driver that maps fn over a src array into a dst array.
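div255() rounds exactly ((x+127)/255), while approxMulDiv255() accepts up to +-1 of error from (x*y + x)/256 in exchange for speed, and is exact whenever either operand is 0 or 255. A standalone brute-force check of both claims over all byte pairs (plain C++, independent of skvx; div255 here uses the identity (v+127)/255 == ((v+128)*257)>>16 noted in the old SSE2 code below):

    #include <cassert>
    #include <cstdint>

    // Exact rounding divide by 255, valid for v in [0, 255*255].
    static uint8_t div255(uint32_t v) { return uint8_t(((v + 128) * 257) >> 16); }

    // The cheaper approximation used by approx_scale(): (x*y + x) / 256.
    static uint8_t approx_mul_div255(uint32_t x, uint32_t y) { return uint8_t((x*y + x) >> 8); }

    int main() {
        for (uint32_t x = 0; x < 256; x++)
        for (uint32_t y = 0; y < 256; y++) {
            int exact = (int(x*y) + 127) / 255;
            assert(div255(x*y) == exact);                         // exact rounding
            int approx = approx_mul_div255(x, y);
            assert(approx - exact >= -1 && approx - exact <= 1);  // off by at most 1
            if (x == 0 || x == 255 || y == 0 || y == 255) {
                assert(approx == exact);                          // exact at the ends
            }
        }
    }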
@@ -192,7 +199,7 @@ public:
                 dst += 2; a += 2; n -= 2;
             }
             if (n >= 1) {
-                fn(Load1(dst), Sk16b(*a)).store1(dst);
+                fn(Load1(dst), skvx::byte16(*a)).store1(dst);
             }
             break;
         }
@@ -224,7 +231,7 @@ public:
                 dst += 2; src += 2; a += 2; n -= 2;
             }
             if (n >= 1) {
-                fn(Load1(dst), Load1(src), Sk16b(*a)).store1(dst);
+                fn(Load1(dst), Load1(src), skvx::byte16(*a)).store1(dst);
             }
             break;
         }
@@ -233,24 +240,10 @@ public:
 private:
     Sk4px() = default;
 
-    using INHERITED = Sk16b;
+    skvx::byte16 fV;
 };
 
-static_assert(sizeof(Sk4px) == sizeof(Sk16b));
+static_assert(sizeof(Sk4px) == 16);
+static_assert(sizeof(Sk4px) == sizeof(skvx::byte16));
+static_assert(alignof(Sk4px) == alignof(skvx::byte16));
 
 }  // namespace
 
-#ifdef SKNX_NO_SIMD
-    #include "src/opts/Sk4px_none.h"
-#else
-    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
-        #include "src/opts/Sk4px_SSE2.h"
-    #elif defined(SK_ARM_HAS_NEON)
-        #include "src/opts/Sk4px_NEON.h"
-    #else
-        #include "src/opts/Sk4px_none.h"
-    #endif
-#endif
-
-#endif//Sk4px_DEFINED
+#endif  // Sk4px_DEFINED
src/opts/BUILD.bazel

@@ -20,25 +20,6 @@ cc_library(
     ],
 )
 
-generated_cc_atom(
-    name = "Sk4px_NEON_hdr",
-    hdrs = ["Sk4px_NEON.h"],
-    visibility = ["//:__subpackages__"],
-)
-
-generated_cc_atom(
-    name = "Sk4px_SSE2_hdr",
-    hdrs = ["Sk4px_SSE2.h"],
-    visibility = ["//:__subpackages__"],
-)
-
-generated_cc_atom(
-    name = "Sk4px_none_hdr",
-    hdrs = ["Sk4px_none.h"],
-    visibility = ["//:__subpackages__"],
-    deps = ["//src/core:SkUtils_hdr"],
-)
-
 generated_cc_atom(
     name = "SkBitmapProcState_opts_hdr",
     hdrs = ["SkBitmapProcState_opts.h"],
@@ -198,7 +179,6 @@ generated_cc_atom(
     hdrs = ["SkXfermode_opts.h"],
     visibility = ["//:__subpackages__"],
     deps = [
-        "//include/private:SkNx_hdr",
         "//src/core:Sk4px_hdr",
         "//src/core:SkMSAN_hdr",
         "//src/core:SkXfermodePriv_hdr",
src/opts/Sk4px_NEON.h (deleted)

@@ -1,56 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-namespace { // NOLINT(google-build-namespaces)
-
-inline Sk4px::Wide Sk4px::widen() const {
-    return Sk16h(vmovl_u8(vget_low_u8 (this->fVec)),
-                 vmovl_u8(vget_high_u8(this->fVec)));
-}
-
-inline Sk4px::Wide Sk4px::mulWiden(const Sk16b& other) const {
-    return Sk16h(vmull_u8(vget_low_u8 (this->fVec), vget_low_u8 (other.fVec)),
-                 vmull_u8(vget_high_u8(this->fVec), vget_high_u8(other.fVec)));
-}
-
-inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
-    const Sk4px::Wide o(other);  // Should be no code, but allows us to access fLo, fHi.
-    return Sk16b(vcombine_u8(vaddhn_u16(this->fLo.fVec, o.fLo.fVec),
-                             vaddhn_u16(this->fHi.fVec, o.fHi.fVec)));
-}
-
-inline Sk4px Sk4px::Wide::div255() const {
-    // Calculated as (x + (x+128)>>8 +128) >> 8.  The 'r' in each instruction provides each +128.
-    return Sk16b(vcombine_u8(vraddhn_u16(this->fLo.fVec, vrshrq_n_u16(this->fLo.fVec, 8)),
-                             vraddhn_u16(this->fHi.fVec, vrshrq_n_u16(this->fHi.fVec, 8))));
-}
-
-inline Sk4px Sk4px::alphas() const {
-    auto as = vshrq_n_u32((uint32x4_t)fVec, SK_A32_SHIFT);  // ___3 ___2 ___1 ___0
-    return Sk16b((uint8x16_t)vmulq_n_u32(as, 0x01010101));  // 3333 2222 1111 0000
-}
-
-inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
-    uint8x16_t a8 = vdupq_n_u8(0);                           // ____ ____ ____ ____
-    a8 = vld1q_lane_u8(a+0, a8,  0);                         // ____ ____ ____ ___0
-    a8 = vld1q_lane_u8(a+1, a8,  4);                         // ____ ____ ___1 ___0
-    a8 = vld1q_lane_u8(a+2, a8,  8);                         // ____ ___2 ___1 ___0
-    a8 = vld1q_lane_u8(a+3, a8, 12);                         // ___3 ___2 ___1 ___0
-    auto a32 = (uint32x4_t)a8;                               //
-    return Sk16b((uint8x16_t)vmulq_n_u32(a32, 0x01010101));  // 3333 2222 1111 0000
-}
-
-inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
-    uint8x16_t a8 = vdupq_n_u8(0);                           // ____ ____ ____ ____
-    a8 = vld1q_lane_u8(a+0, a8,  0);                         // ____ ____ ____ ___0
-    a8 = vld1q_lane_u8(a+1, a8,  4);                         // ____ ____ ___1 ___0
-    auto a32 = (uint32x4_t)a8;                               //
-    return Sk16b((uint8x16_t)vmulq_n_u32(a32, 0x01010101));  // ____ ____ 1111 0000
-}
-
-} // namespace
src/opts/Sk4px_SSE2.h (deleted)

@@ -1,76 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-namespace { // NOLINT(google-build-namespaces)
-
-inline Sk4px::Wide Sk4px::widen() const {
-    return Sk16h(_mm_unpacklo_epi8(this->fVec, _mm_setzero_si128()),
-                 _mm_unpackhi_epi8(this->fVec, _mm_setzero_si128()));
-}
-
-inline Sk4px::Wide Sk4px::mulWiden(const Sk16b& other) const {
-    return this->widen() * Sk4px(other).widen();
-}
-
-inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
-    Sk4px::Wide r = (*this + other) >> 8;
-    return Sk4px(_mm_packus_epi16(r.fLo.fVec, r.fHi.fVec));
-}
-
-inline Sk4px Sk4px::Wide::div255() const {
-    // (x + 127) / 255 == ((x+128) * 257)>>16,
-    // and _mm_mulhi_epu16 makes the (_ * 257)>>16 part very convenient.
-    const __m128i _128 = _mm_set1_epi16(128),
-                  _257 = _mm_set1_epi16(257);
-    return Sk4px(_mm_packus_epi16(_mm_mulhi_epu16(_mm_add_epi16(fLo.fVec, _128), _257),
-                                  _mm_mulhi_epu16(_mm_add_epi16(fHi.fVec, _128), _257)));
-}
-
-// Load4Alphas and Load2Alphas use possibly-unaligned loads (SkAlpha[] -> uint16_t or uint32_t).
-// These are safe on x86, often with no speed penalty.
-
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
-    inline Sk4px Sk4px::alphas() const {
-        static_assert(SK_A32_SHIFT == 24, "Intel's always little-endian.");
-        __m128i splat = _mm_set_epi8(15,15,15,15, 11,11,11,11, 7,7,7,7, 3,3,3,3);
-        return Sk16b(_mm_shuffle_epi8(this->fVec, splat));
-    }
-
-    inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
-        uint32_t as;
-        memcpy(&as, a, 4);
-        __m128i splat = _mm_set_epi8(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
-        return Sk16b(_mm_shuffle_epi8(_mm_cvtsi32_si128(as), splat));
-    }
-#else
-    inline Sk4px Sk4px::alphas() const {
-        static_assert(SK_A32_SHIFT == 24, "Intel's always little-endian.");
-        // We exploit that A >= rgb for any premul pixel.
-        __m128i as = fVec;                             // 3xxx 2xxx 1xxx 0xxx
-        as = _mm_max_epu8(as, _mm_srli_epi32(as,  8)); // 33xx 22xx 11xx 00xx
-        as = _mm_max_epu8(as, _mm_srli_epi32(as, 16)); // 3333 2222 1111 0000
-        return Sk16b(as);
-    }
-
-    inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
-        __m128i as;
-        memcpy(&as, a, 4);                // ____ ____ ____ 3210
-        as = _mm_unpacklo_epi8 (as, as);  // ____ ____ 3322 1100
-        as = _mm_unpacklo_epi16(as, as);  // 3333 2222 1111 0000
-        return Sk16b(as);
-    }
-#endif
-
-inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
-    uint16_t alphas;
-    memcpy(&alphas, a, 2);
-    uint32_t alphas_and_two_zeros = alphas;  // Aa -> Aa00
-
-    return Load4Alphas((const SkAlpha*)&alphas_and_two_zeros);
-}
-
-} // namespace
src/opts/Sk4px_none.h (deleted)

@@ -1,59 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "src/core/SkUtils.h"
-
-namespace { // NOLINT(google-build-namespaces)
-
-inline Sk4px::Wide Sk4px::widen() const {
-    return Sk16h((*this)[ 0], (*this)[ 1], (*this)[ 2], (*this)[ 3],
-                 (*this)[ 4], (*this)[ 5], (*this)[ 6], (*this)[ 7],
-                 (*this)[ 8], (*this)[ 9], (*this)[10], (*this)[11],
-                 (*this)[12], (*this)[13], (*this)[14], (*this)[15]);
-}
-
-inline Sk4px::Wide Sk4px::mulWiden(const Sk16b& other) const {
-    return this->widen() * Sk4px(other).widen();
-}
-
-inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
-    Sk4px::Wide r = (*this + other) >> 8;
-    return Sk16b(r[ 0], r[ 1], r[ 2], r[ 3],
-                 r[ 4], r[ 5], r[ 6], r[ 7],
-                 r[ 8], r[ 9], r[10], r[11],
-                 r[12], r[13], r[14], r[15]);
-}
-
-inline Sk4px Sk4px::Wide::div255() const {
-    // Calculated as ((x+128) + ((x+128)>>8)) >> 8.
-    auto v = *this + Sk16h(128);
-    return v.addNarrowHi(v>>8);
-}
-
-inline Sk4px Sk4px::alphas() const {
-    static_assert(SK_A32_SHIFT == 24, "This method assumes little-endian.");
-    return Sk16b((*this)[ 3], (*this)[ 3], (*this)[ 3], (*this)[ 3],
-                 (*this)[ 7], (*this)[ 7], (*this)[ 7], (*this)[ 7],
-                 (*this)[11], (*this)[11], (*this)[11], (*this)[11],
-                 (*this)[15], (*this)[15], (*this)[15], (*this)[15]);
-}
-
-inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
-    return Sk16b(a[0], a[0], a[0], a[0],
-                 a[1], a[1], a[1], a[1],
-                 a[2], a[2], a[2], a[2],
-                 a[3], a[3], a[3], a[3]);
-}
-
-inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
-    return Sk16b(a[0], a[0], a[0], a[0],
-                 a[1], a[1], a[1], a[1],
-                 0,0,0,0,
-                 0,0,0,0);
-}
-
-} // namespace
src/opts/SkXfermode_opts.h

@@ -205,7 +205,7 @@ namespace SK_OPTS_NS {
         // ~~~>
         // a = 1*aa + d(1-1*aa) = aa + d(1-aa)
        // c = 0*aa + d(1-1*aa) = d(1-aa)
-        return Sk4px(Sk16b(aa) & Sk16b(0,0,0,255, 0,0,0,255, 0,0,0,255, 0,0,0,255))
+        return (aa & Sk4px(skvx::byte16{0,0,0,255, 0,0,0,255, 0,0,0,255, 0,0,0,255}))
             + d.approxMulDiv255(aa.inv());
     };
     while (h --> 0) {
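The byte mask {0,0,0,255, ...} zeroes the three color lanes of each 32-bit pixel and keeps only alpha, which is what reduces a = aa + d(1-aa), c = d(1-aa) to one AND plus one approxMulDiv255. A scalar sketch of the masking step for a single pixel (hypothetical standalone code; assumes SK_A32_SHIFT == 24, i.e. little-endian RGBA, as in the diff):

    #include <cassert>
    #include <cstdint>

    // Keep only the alpha byte (lane 3) of one little-endian RGBA pixel;
    // the vector code above does this for four pixels at once with a 16-byte mask.
    uint32_t alpha_only(uint32_t px) {
        return px & 0xFF000000u;  // bytes {0,0,0,255} per pixel, as in the mask
    }

    int main() {
        uint32_t px = 0x80402010u;  // A=0x80, B=0x40, G=0x20, R=0x10
        assert(alpha_only(px) == 0x80000000u);
    }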
@@ -8,7 +8,6 @@
 #ifndef Sk4pxXfermode_DEFINED
 #define Sk4pxXfermode_DEFINED
 
-#include "include/private/SkNx.h"
 #include "src/core/Sk4px.h"
 #include "src/core/SkMSAN.h"
 #include "src/core/SkXfermodePriv.h"
tests/BUILD.bazel

@@ -5562,7 +5562,6 @@ generated_cc_atom(
         ":Test_hdr",
-        "//include/private:SkNx_hdr",
         "//include/utils:SkRandom_hdr",
         "//src/core:Sk4px_hdr",
     ],
 )
 
tests/SkNxTest.cpp

@@ -7,7 +7,6 @@
 
-#include "include/private/SkNx.h"
 #include "include/utils/SkRandom.h"
 #include "src/core/Sk4px.h"
 #include "tests/Test.h"
 
 template <int N>
@@ -185,29 +184,6 @@ DEF_TEST(SkNi_mulHi, r) {
     REPORTER_ASSERT(r, c[3] == q[3]);
 }
 
-DEF_TEST(Sk4px_muldiv255round, r) {
-    for (int a = 0; a < (1<<8); a++) {
-    for (int b = 0; b < (1<<8); b++) {
-        int exact = (a*b+127)/255;
-
-        // Duplicate a and b 16x each.
-        Sk4px av = Sk16b(a),
-              bv = Sk16b(b);
-
-        // This way should always be exactly correct.
-        int correct = (av * bv).div255()[0];
-        REPORTER_ASSERT(r, correct == exact);
-
-        // We're a bit more flexible on this method: correct for 0 or 255, otherwise off by <=1.
-        int fast = av.approxMulDiv255(bv)[0];
-        REPORTER_ASSERT(r, fast-exact >= -1 && fast-exact <= 1);
-        if (a == 0 || a == 255 || b == 0 || b == 255) {
-            REPORTER_ASSERT(r, fast == exact);
-        }
-    }
-    }
-}
-
 DEF_TEST(SkNx_abs, r) {
     auto fs = Sk4f(0.0f, -0.0f, 2.0f, -4.0f).abs();
     REPORTER_ASSERT(r, fs[0] == 0.0f);
tests/SkVxTest.cpp

@@ -304,7 +304,7 @@ DEF_TEST(SkVx_strided_loads, r) {
     check_strided_loads<float>(r);
 }
 
-DEF_TEST(SkVM_ScaledDividerU32, r) {
+DEF_TEST(SkVx_ScaledDividerU32, r) {
     static constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
 
     auto errorBounds = [&](uint32_t actual, uint32_t expected) {
@@ -342,4 +342,16 @@ DEF_TEST(SkVM_ScaledDividerU32, r) {
     test(512'927'377);
 }
 
+DEF_TEST(SkVx_saturated_add, r) {
+    for (int a = 0; a < (1<<8); a++) {
+    for (int b = 0; b < (1<<8); b++) {
+        int exact = a+b;
+        if (exact > 255) { exact = 255; }
+        if (exact < 0)   { exact = 0; }
+
+        REPORTER_ASSERT(r, saturated_add(skvx::byte16(a), skvx::byte16(b))[0] == exact);
+    }
+    }
+}
+
 }  // namespace skvx
@@ -48,7 +48,6 @@ ignore = re.compile('|'.join([
     r'src/opts/.*_SSSE3\.h',
     r'src/opts/.*_neon\.h',
    r'src/opts/.*_sse\.h',
-    r'src/opts/Sk4px_.*\.h',
     r'src/ports/.*',
     r'src/utils/.*_win\.h',
     r'src/utils/win/.*',
@@ -113,4 +112,3 @@ def main(argv):
 
 if __name__ == '__main__':
     main(sys.argv)
-