Convert color data to skvx::float4 from Sk4f

Change-Id: I511f6105537b24953de1533ad7b73d1186afd4fc
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/541060
Commit-Queue: Michael Ludwig <michaelludwig@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
Michael Ludwig 2022-05-17 16:26:03 -04:00 committed by SkCQ
parent 7f99451720
commit 9b59fe655c
21 changed files with 154 additions and 220 deletions
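For readers following the conversion, the pattern is mechanical throughout: Sk4f becomes skvx::float4, Sk4b becomes skvx::byte4, Sk4i becomes skvx::int4 (Sk4h maps to skvx::half4 or skvx::Vec<4, uint16_t> depending on use), SkNx_cast/SkNx_shuffle become skvx::cast/skvx::shuffle, and member reductions like (v < 0).anyTrue() become the free functions any()/all(). A minimal before/after sketch of a typical call site, assuming the Skia source tree and the skvx API as it appears in this diff (the helper name is hypothetical, not part of the change):

#include "include/private/SkVx.h"
#include <cstdint>

// Clamp four [0,1] floats and pack them into an 8888 pixel; the old Sk4f
// spellings are shown in comments above each line.
static uint32_t pack_to_8888(const float rgba[4]) {
    // Before: Sk4f c = Sk4f::Load(rgba);
    skvx::float4 c = skvx::float4::Load(rgba);
    // Before: c = Sk4f::Max(0, Sk4f::Min(c, 1));
    c = skvx::pin(c, skvx::float4(0.0f), skvx::float4(1.0f));
    uint32_t px;
    // Before: SkNx_cast<uint8_t>(Sk4f_round(c * 255.0f)).store(&px);
    skvx::cast<uint8_t>(skvx::lrint(c * 255.0f)).store(&px);
    return px;
}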

View File

@ -232,11 +232,7 @@ private:
};
SkASSERT(sizeof(V) == vertexStride);
uint64_t color;
Sk4h halfColor = SkFloatToHalf_finite_ftz(Sk4f::Load(&fColor4f));
color = (uint64_t)halfColor[0] << 48 |
(uint64_t)halfColor[1] << 32 |
(uint64_t)halfColor[2] << 16 |
(uint64_t)halfColor[3] << 0;
SkFloatToHalf_finite_ftz(skvx::float4::Load(&fColor4f)).store(&color);
V* v = (V*)verts;
for (int i = 0; i < kVertexCount; i += 2) {
v[i + 0].fPos.set(dx * i, 0.0f);

View File

@ -1256,9 +1256,9 @@ struct Task {
bool unclamped = false;
for (int y = 0; y < pm.height() && !unclamped; ++y)
for (int x = 0; x < pm.width() && !unclamped; ++x) {
Sk4f rgba = SkHalfToFloat_finite_ftz(*pm.addr64(x, y));
skvx::float4 rgba = SkHalfToFloat_finite_ftz(*pm.addr64(x, y));
float a = rgba[3];
if (a > 1.0f || (rgba < 0.0f).anyTrue() || (rgba > a).anyTrue()) {
if (a > 1.0f || any(rgba < 0.0f) || any(rgba > a)) {
SkDebugf("[%s] F16Norm pixel [%d, %d] unclamped: (%g, %g, %g, %g)\n",
name.c_str(), x, y, rgba[0], rgba[1], rgba[2], rgba[3]);
unclamped = true;
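The F16Norm check above is representative of the reduction changes: Sk4f's member predicates ((rgba < 0.0f).anyTrue()) become the skvx free functions any()/all() applied to a lane-wise comparison. A hedged sketch of the same predicate as a standalone helper (the function name is hypothetical; assumes the Skia tree for SkVx.h):

#include "include/private/SkVx.h"

// True when a premultiplied, normalized-F16 pixel is properly clamped:
// alpha in [0, 1] and every channel in [0, alpha].
static bool f16norm_pixel_is_clamped(const skvx::float4& rgba) {
    float a = rgba[3];
    return !(a > 1.0f || any(rgba < 0.0f) || any(rgba > a));
}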

View File

@ -28,8 +28,8 @@ generated_cc_atom(
hdrs = ["SkColorData.h"],
visibility = ["//:__subpackages__"],
deps = [
":SkNx_hdr",
":SkTo_hdr",
":SkVx_hdr",
"//include/core:SkColorPriv_hdr",
"//include/core:SkColor_hdr",
],
@ -93,7 +93,7 @@ generated_cc_atom(
hdrs = ["SkHalf.h"],
visibility = ["//:__subpackages__"],
deps = [
":SkNx_hdr",
":SkVx_hdr",
"//include/core:SkTypes_hdr",
],
)

View File

@ -10,8 +10,8 @@
#include "include/core/SkColor.h"
#include "include/core/SkColorPriv.h"
#include "include/private/SkNx.h"
#include "include/private/SkTo.h"
#include "include/private/SkVx.h"
////////////////////////////////////////////////////////////////////////////////////////////
// Convert a 16bit pixel to a 32bit pixel
@ -395,11 +395,11 @@ static inline SkPMColor SkPixel4444ToPixel32(U16CPU c) {
return d | (d << 4);
}
static inline Sk4f swizzle_rb(const Sk4f& x) {
return SkNx_shuffle<2, 1, 0, 3>(x);
static inline skvx::float4 swizzle_rb(const skvx::float4& x) {
return skvx::shuffle<2, 1, 0, 3>(x);
}
static inline Sk4f swizzle_rb_if_bgra(const Sk4f& x) {
static inline skvx::float4 swizzle_rb_if_bgra(const skvx::float4& x) {
#ifdef SK_PMCOLOR_IS_BGRA
return swizzle_rb(x);
#else
@ -407,24 +407,13 @@ static inline Sk4f swizzle_rb_if_bgra(const Sk4f& x) {
#endif
}
static inline Sk4f Sk4f_fromL32(uint32_t px) {
return SkNx_cast<float>(Sk4b::Load(&px)) * (1 / 255.0f);
static inline skvx::float4 Sk4f_fromL32(uint32_t px) {
return skvx::cast<float>(skvx::byte4::Load(&px)) * (1 / 255.0f);
}
static inline uint32_t Sk4f_toL32(const Sk4f& px) {
Sk4f v = px;
#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
// SkNx_cast<uint8_t, int32_t>() pins, and we don't anticipate giant floats
#elif !defined(SKNX_NO_SIMD) && defined(SK_ARM_HAS_NEON)
// SkNx_cast<uint8_t, int32_t>() pins, and so does Sk4f_round().
#else
// No guarantee of a pin.
v = Sk4f::Max(0, Sk4f::Min(v, 1));
#endif
static inline uint32_t Sk4f_toL32(const skvx::float4& px) {
uint32_t l32;
SkNx_cast<uint8_t>(Sk4f_round(v * 255.0f)).store(&l32);
skvx::cast<uint8_t>(pin(lrint(px * 255.f), skvx::int4(0), skvx::int4(255))).store(&l32);
return l32;
}
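With this rewrite, Sk4f_toL32 clamps uniformly via pin() + lrint() instead of the per-backend #if ladder that previously documented when pinning happened; the helpers keep their Sk4f_* names even though they now take and return skvx::float4. A usage sketch under that assumption (helper name hypothetical):

#include "include/private/SkColorData.h"
#include <cstdint>

// Scale the alpha of an 8888 pixel in [0,1] float space and repack;
// Sk4f_toL32 takes care of clamping and rounding.
static uint32_t scale_alpha_l32(uint32_t px, float alphaScale) {
    skvx::float4 c = Sk4f_fromL32(px);                   // bytes -> [0,1] floats
    c = c * skvx::float4(1.0f, 1.0f, 1.0f, alphaScale);
    return Sk4f_toL32(c);                                // clamp, round, repack
}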

View File

@ -9,7 +9,7 @@
#define SkHalf_DEFINED
#include "include/core/SkTypes.h"
#include "include/private/SkNx.h"
#include "include/private/SkVx.h"
// 16-bit floating point value
// format is 1 bit sign, 5 bits exponent, 10 bits mantissa
@ -28,58 +28,11 @@ SkHalf SkFloatToHalf(float f);
// Convert between half and single precision floating point,
// assuming inputs and outputs are both finite, and may
// flush values which would be denormal half floats to zero.
static inline Sk4f SkHalfToFloat_finite_ftz(uint64_t);
static inline Sk4h SkFloatToHalf_finite_ftz(const Sk4f&);
// ~~~~~~~~~~~ impl ~~~~~~~~~~~~~~ //
// Like the serial versions in SkHalf.cpp, these are based on
// https://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
// GCC 4.9 lacks the intrinsics to use ARMv8 f16<->f32 instructions, so we use inline assembly.
static inline Sk4f SkHalfToFloat_finite_ftz(uint64_t rgba) {
Sk4h hs = Sk4h::Load(&rgba);
#if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)
float32x4_t fs;
asm ("fcvtl %[fs].4s, %[hs].4h \n" // vcvt_f32_f16(...)
: [fs] "=w" (fs) // =w: write-only NEON register
: [hs] "w" (hs.fVec)); // w: read-only NEON register
return fs;
#else
Sk4i bits = SkNx_cast<int>(hs), // Expand to 32 bit.
sign = bits & 0x00008000, // Save the sign bit for later...
positive = bits ^ sign, // ...but strip it off for now.
is_norm = 0x03ff < positive; // Exponent > 0?
// For normal half floats, extend the mantissa by 13 zero bits,
// then adjust the exponent from 15 bias to 127 bias.
Sk4i norm = (positive << 13) + ((127 - 15) << 23);
Sk4i merged = (sign << 16) | (norm & is_norm);
return Sk4f::Load(&merged);
#endif
static inline skvx::float4 SkHalfToFloat_finite_ftz(uint64_t rgba) {
return skvx::from_half(skvx::half4::Load(&rgba));
}
static inline Sk4h SkFloatToHalf_finite_ftz(const Sk4f& fs) {
#if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)
float32x4_t vec = fs.fVec;
asm ("fcvtn %[vec].4h, %[vec].4s \n" // vcvt_f16_f32(vec)
: [vec] "+w" (vec)); // +w: read-write NEON register
return vreinterpret_u16_f32(vget_low_f32(vec));
#else
Sk4i bits = Sk4i::Load(&fs),
sign = bits & 0x80000000, // Save the sign bit for later...
positive = bits ^ sign, // ...but strip it off for now.
will_be_norm = 0x387fdfff < positive; // greater than largest denorm half?
// For normal half floats, adjust the exponent from 127 bias to 15 bias,
// then drop the bottom 13 mantissa bits.
Sk4i norm = (positive - ((127 - 15) << 23)) >> 13;
Sk4i merged = (sign >> 16) | (will_be_norm & norm);
return SkNx_cast<uint16_t>(merged);
#endif
static inline skvx::half4 SkFloatToHalf_finite_ftz(const skvx::float4& c) {
return skvx::to_half(c);
}
#endif
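SkHalfToFloat_finite_ftz/SkFloatToHalf_finite_ftz are now thin wrappers over skvx::from_half/skvx::to_half, which pick hardware conversions (F16C, AArch64) when available and otherwise fall back to the bit manipulation shown in the SkVx.h hunk further down. A usage sketch, assuming the Skia tree (helper name hypothetical):

#include "include/private/SkHalf.h"
#include <cstdint>

// Premultiply an RGBA color and pack it as four 16-bit halfs in one uint64_t,
// the same shape as SkPMColor4f_toFP16 elsewhere in this change.
static uint64_t premul_to_f16(const float rgba[4]) {
    skvx::float4 c = skvx::float4::Load(rgba);
    c = c * skvx::float4(c[3], c[3], c[3], 1.0f);   // scale RGB by alpha
    uint64_t packed;
    SkFloatToHalf_finite_ftz(c).store(&packed);     // skvx::half4 -> 8 bytes
    return packed;
}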

View File

@ -463,28 +463,28 @@ SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec
// Specializations inline here so they can generalize what types the apply to.
// (This header is used in C++14 contexts, so we have to kind of fake constexpr if.)
#if SKVX_USE_SIMD && defined(__AVX2__)
if /*constexpr*/ (N*sizeof(T) == 32) {
if constexpr (N*sizeof(T) == 32) {
return unchecked_bit_pun<Vec<N,T>>(_mm256_blendv_epi8(unchecked_bit_pun<__m256i>(e),
unchecked_bit_pun<__m256i>(t),
unchecked_bit_pun<__m256i>(cond)));
}
#endif
#if SKVX_USE_SIMD && defined(__SSE4_1__)
if /*constexpr*/ (N*sizeof(T) == 16) {
if constexpr (N*sizeof(T) == 16) {
return unchecked_bit_pun<Vec<N,T>>(_mm_blendv_epi8(unchecked_bit_pun<__m128i>(e),
unchecked_bit_pun<__m128i>(t),
unchecked_bit_pun<__m128i>(cond)));
}
#endif
#if SKVX_USE_SIMD && defined(__ARM_NEON)
if /*constexpr*/ (N*sizeof(T) == 16) {
if constexpr (N*sizeof(T) == 16) {
return unchecked_bit_pun<Vec<N,T>>(vbslq_u8(unchecked_bit_pun<uint8x16_t>(cond),
unchecked_bit_pun<uint8x16_t>(t),
unchecked_bit_pun<uint8x16_t>(e)));
}
#endif
// Recurse for large vectors to try to hit the specializations above.
if /*constexpr*/ (N*sizeof(T) > 16) {
if constexpr (N*sizeof(T) > 16) {
return join(if_then_else(cond.lo, t.lo, e.lo),
if_then_else(cond.hi, t.hi, e.hi));
}
@ -506,19 +506,19 @@ SINT bool any(const Vec<N,T>& x) {
SIT bool all(const Vec<1,T>& x) { return x.val != 0; }
SINT bool all(const Vec<N,T>& x) {
#if SKVX_USE_SIMD && defined(__AVX2__)
if /*constexpr*/ (N*sizeof(T) == 32) {
if constexpr (N*sizeof(T) == 32) {
return _mm256_testc_si256(unchecked_bit_pun<__m256i>(x),
_mm256_set1_epi32(-1));
}
#endif
#if SKVX_USE_SIMD && defined(__SSE4_1__)
if /*constexpr*/ (N*sizeof(T) == 16) {
if constexpr (N*sizeof(T) == 16) {
return _mm_testc_si128(unchecked_bit_pun<__m128i>(x),
_mm_set1_epi32(-1));
}
#endif
#if SKVX_USE_SIMD && defined(__wasm_simd128__)
if /*constexpr*/ (N == 4 && sizeof(T) == 4) {
if constexpr (N == 4 && sizeof(T) == 4) {
return wasm_i32x4_all_true(unchecked_bit_pun<VExt<4,int>>(x));
}
#endif
@ -622,12 +622,12 @@ SI Vec<1,int> lrint(const Vec<1,float>& x) {
}
SIN Vec<N,int> lrint(const Vec<N,float>& x) {
#if SKVX_USE_SIMD && defined(__AVX__)
if /*constexpr*/ (N == 8) {
if constexpr (N == 8) {
return unchecked_bit_pun<Vec<N,int>>(_mm256_cvtps_epi32(unchecked_bit_pun<__m256>(x)));
}
#endif
#if SKVX_USE_SIMD && defined(__SSE__)
if /*constexpr*/ (N == 4) {
if constexpr (N == 4) {
return unchecked_bit_pun<Vec<N,int>>(_mm_cvtps_epi32(unchecked_bit_pun<__m128>(x)));
}
#endif
@ -637,8 +637,7 @@ SIN Vec<N,int> lrint(const Vec<N,float>& x) {
SIN Vec<N,float> fract(const Vec<N,float>& x) { return x - floor(x); }
// The default logic for to_half/from_half is borrowed from skcms,
// and assumes inputs are finite and treat/flush denorm half floats as/to zero.
// Assumes inputs are finite and treat/flush denorm half floats as/to zero.
// Key constants to watch for:
// - a float is 32-bit, 1-8-23 sign-exponent-mantissa, with 127 exponent bias;
// - a half is 16-bit, 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
@ -646,17 +645,17 @@ SIN Vec<N,uint16_t> to_half_finite_ftz(const Vec<N,float>& x) {
Vec<N,uint32_t> sem = bit_pun<Vec<N,uint32_t>>(x),
s = sem & 0x8000'0000,
em = sem ^ s,
is_denorm = em < 0x3880'0000;
return cast<uint16_t>(if_then_else(is_denorm, Vec<N,uint32_t>(0)
, (s>>16) + (em>>13) - ((127-15)<<10)));
is_norm = em > 0x387f'd000, // halfway between largest f16 denorm and smallest norm
norm = (em>>13) - ((127-15)<<10);
return cast<uint16_t>((s>>16) | (is_norm & norm));
}
SIN Vec<N,float> from_half_finite_ftz(const Vec<N,uint16_t>& x) {
Vec<N,uint32_t> wide = cast<uint32_t>(x),
s = wide & 0x8000,
em = wide ^ s;
auto is_denorm = bit_pun<Vec<N,int32_t>>(em < 0x0400);
return if_then_else(is_denorm, Vec<N,float>(0)
, bit_pun<Vec<N,float>>( (s<<16) + (em<<13) + ((127-15)<<23) ));
em = wide ^ s,
is_norm = em > 0x3ff,
norm = (em<<13) + ((127-15)<<23);
return bit_pun<Vec<N,float>>((s<<16) | (is_norm & norm));
}
// Like if_then_else(), these N=1 base cases won't actually be used unless explicitly called.
@ -665,18 +664,18 @@ SI Vec<1,float> from_half(const Vec<1,uint16_t>& x) { return from_half_finite_f
SIN Vec<N,uint16_t> to_half(const Vec<N,float>& x) {
#if SKVX_USE_SIMD && defined(__F16C__)
if /*constexpr*/ (N == 8) {
if constexpr (N == 8) {
return unchecked_bit_pun<Vec<N,uint16_t>>(_mm256_cvtps_ph(unchecked_bit_pun<__m256>(x),
_MM_FROUND_CUR_DIRECTION));
}
#endif
#if SKVX_USE_SIMD && defined(__aarch64__)
if /*constexpr*/ (N == 4) {
if constexpr (N == 4) {
return unchecked_bit_pun<Vec<N,uint16_t>>(vcvt_f16_f32(unchecked_bit_pun<float32x4_t>(x)));
}
#endif
if /*constexpr*/ (N > 4) {
if constexpr (N > 4) {
return join(to_half(x.lo),
to_half(x.hi));
}
@ -685,16 +684,16 @@ SIN Vec<N,uint16_t> to_half(const Vec<N,float>& x) {
SIN Vec<N,float> from_half(const Vec<N,uint16_t>& x) {
#if SKVX_USE_SIMD && defined(__F16C__)
if /*constexpr*/ (N == 8) {
if constexpr (N == 8) {
return unchecked_bit_pun<Vec<N,float>>(_mm256_cvtph_ps(unchecked_bit_pun<__m128i>(x)));
}
#endif
#if SKVX_USE_SIMD && defined(__aarch64__)
if /*constexpr*/ (N == 4) {
if constexpr (N == 4) {
return unchecked_bit_pun<Vec<N,float>>(vcvt_f32_f16(unchecked_bit_pun<float16x4_t>(x)));
}
#endif
if /*constexpr*/ (N > 4) {
if constexpr (N > 4) {
return join(from_half(x.lo),
from_half(x.hi));
}
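The reworked to_half_finite_ftz/from_half_finite_ftz keep the same recipe the deleted SkHalf.h fallback used, just phrased with an is_norm mask instead of if_then_else: keep the sign, shift the mantissa by 13 bits, rebias the exponent between 127 and 15, and flush anything that would be a denormal half to zero. A scalar sketch of the float-to-half direction, runnable with only the standard library (the 0x387fd000 threshold is copied from the hunk above, where it is described as the boundary between the largest half denorm and the smallest norm; the function name is hypothetical):

#include <cstdint>
#include <cstring>

// Scalar version of the vectorized to_half_finite_ftz above. Assumes a finite
// input whose magnitude fits in a half; truncates rather than rounds the mantissa.
static uint16_t to_half_finite_ftz_scalar(float f) {
    uint32_t sem;
    std::memcpy(&sem, &f, sizeof(sem));                  // 1-8-23 sign/exponent/mantissa
    uint32_t s       = sem & 0x80000000u,                // sign bit
             em      = sem ^ s,                          // exponent + mantissa
             is_norm = em > 0x387fd000u ? 0xFFFFFFFFu : 0u,
             norm    = (em >> 13) - ((127 - 15) << 10);  // drop 13 bits, rebias exponent
    return (uint16_t)((s >> 16) | (is_norm & norm));
}
// to_half_finite_ftz_scalar(1.0f) == 0x3C00, to_half_finite_ftz_scalar(-2.0f) == 0xC000,
// and anything smaller than the smallest normal half flushes to +/-0.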

View File

@ -8,6 +8,7 @@
#include "include/core/SkCanvas.h"
#include "include/core/SkPaint.h"
#include "include/core/SkPath.h"
#include "include/private/SkNx.h"
#include "include/utils/SkRandom.h"
#include "samplecode/Sample.h"
#include "src/core/SkPathPriv.h"

View File

@ -681,6 +681,7 @@ generated_cc_atom(
deps = [
":SkBlendModePriv_hdr",
":SkRasterPipeline_hdr",
"//include/private:SkVx_hdr",
],
)
@ -2993,7 +2994,6 @@ generated_cc_atom(
"//include/private:SkColorData_hdr",
"//include/private:SkHalf_hdr",
"//include/private:SkImageInfoPriv_hdr",
"//include/private:SkNx_hdr",
"//include/private:SkTo_hdr",
"//include/private:SkVx_hdr",
],
@ -3613,10 +3613,10 @@ generated_cc_atom(
"//include/private:SkColorData_hdr",
"//include/private:SkHalf_hdr",
"//include/private:SkImageInfoPriv_hdr",
"//include/private:SkNx_hdr",
"//include/private:SkTPin_hdr",
"//include/private:SkTemplates_hdr",
"//include/private:SkTo_hdr",
"//include/private:SkVx_hdr",
"//src/image:SkReadPixelsRec_hdr",
"//src/shaders:SkImageShader_hdr",
],

View File

@ -6,6 +6,8 @@
*/
#include "src/core/SkBlendModePriv.h"
#include "include/private/SkVx.h"
#include "src/core/SkRasterPipeline.h"
bool SkBlendMode_ShouldPreScaleCoverage(SkBlendMode mode, bool rgb_coverage) {
@ -129,8 +131,9 @@ SkPMColor4f SkBlendMode_Apply(SkBlendMode mode, const SkPMColor4f& src, const Sk
case SkBlendMode::kSrc: return src;
case SkBlendMode::kDst: return dst;
case SkBlendMode::kSrcOver: {
Sk4f r = Sk4f::Load(src.vec()) + Sk4f::Load(dst.vec()) * Sk4f(1 - src.fA);
return { r[0], r[1], r[2], r[3] };
SkPMColor4f r;
(skvx::float4::Load(src.vec()) + skvx::float4::Load(dst.vec()) * (1-src.fA)).store(&r);
return r;
}
default:
break;
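The kSrcOver case is the usual premultiplied source-over equation, r = src + dst * (1 - srcAlpha), now evaluated across all four lanes at once and stored straight into the returned SkPMColor4f. A scalar sketch of the formula for reference (names are illustrative, not Skia API):

// Premultiplied src-over, one channel at a time.
struct Color4 { float r, g, b, a; };

static Color4 src_over(const Color4& src, const Color4& dst) {
    float inv = 1.0f - src.a;
    return { src.r + dst.r * inv,
             src.g + dst.g * inv,
             src.b + dst.b * inv,
             src.a + dst.a * inv };
}
// Example: src = {0.5, 0, 0, 0.5}, dst = {0, 0, 1, 1} gives {0.5, 0, 0.5, 1}.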

View File

@ -115,12 +115,12 @@ SkColor4f SkColor4f::FromColor(SkColor bgra) {
template <>
SkColor SkColor4f::toSkColor() const {
return Sk4f_toL32(swizzle_rb(Sk4f::Load(this->vec())));
return Sk4f_toL32(swizzle_rb(skvx::float4::Load(this->vec())));
}
template <>
uint32_t SkColor4f::toBytes_RGBA() const {
return Sk4f_toL32(Sk4f::Load(this->vec()));
return Sk4f_toL32(skvx::float4::Load(this->vec()));
}
template <>
@ -139,7 +139,7 @@ SkPMColor4f SkPMColor4f::FromPMColor(SkPMColor c) {
template <>
uint32_t SkPMColor4f::toBytes_RGBA() const {
return Sk4f_toL32(Sk4f::Load(this->vec()));
return Sk4f_toL32(skvx::float4::Load(this->vec()));
}
template <>

View File

@ -10,7 +10,6 @@
#include "include/private/SkColorData.h"
#include "include/private/SkHalf.h"
#include "include/private/SkImageInfoPriv.h"
#include "include/private/SkNx.h"
#include "include/private/SkTo.h"
#include "include/private/SkVx.h"
#include "src/core/SkMathPriv.h"
@ -27,12 +26,12 @@
struct ColorTypeFilter_8888 {
typedef uint32_t Type;
static Sk4h Expand(uint32_t x) {
return SkNx_cast<uint16_t>(Sk4b::Load(&x));
static skvx::Vec<4, uint16_t> Expand(uint32_t x) {
return skvx::cast<uint16_t>(skvx::byte4::Load(&x));
}
static uint32_t Compact(const Sk4h& x) {
static uint32_t Compact(const skvx::Vec<4, uint16_t>& x) {
uint32_t r;
SkNx_cast<uint8_t>(x).store(&r);
skvx::cast<uint8_t>(x).store(&r);
return r;
}
};
@ -69,11 +68,11 @@ struct ColorTypeFilter_8 {
struct ColorTypeFilter_Alpha_F16 {
typedef uint16_t Type;
static Sk4f Expand(uint16_t x) {
static skvx::float4 Expand(uint16_t x) {
return SkHalfToFloat_finite_ftz((uint64_t) x); // expand out to four lanes
}
static uint16_t Compact(const Sk4f& x) {
static uint16_t Compact(const skvx::float4& x) {
uint64_t r;
SkFloatToHalf_finite_ftz(x).store(&r);
return r & 0xFFFF; // but ignore the extra 3 here
@ -82,10 +81,10 @@ struct ColorTypeFilter_Alpha_F16 {
struct ColorTypeFilter_RGBA_F16 {
typedef uint64_t Type; // SkHalf x4
static Sk4f Expand(uint64_t x) {
static skvx::float4 Expand(uint64_t x) {
return SkHalfToFloat_finite_ftz(x);
}
static uint64_t Compact(const Sk4f& x) {
static uint64_t Compact(const skvx::float4& x) {
uint64_t r;
SkFloatToHalf_finite_ftz(x).store(&r);
return r;
@ -114,10 +113,10 @@ struct ColorTypeFilter_1616 {
struct ColorTypeFilter_F16F16 {
typedef uint32_t Type;
static Sk4f Expand(uint32_t x) {
static skvx::float4 Expand(uint32_t x) {
return SkHalfToFloat_finite_ftz((uint64_t) x); // expand out to four lanes
}
static uint32_t Compact(const Sk4f& x) {
static uint32_t Compact(const skvx::float4& x) {
uint64_t r;
SkFloatToHalf_finite_ftz(x).store(&r);
return (uint32_t) (r & 0xFFFFFFFF); // but ignore the extra 2 here
@ -170,7 +169,7 @@ template <typename T> T shift_right(const T& x, int bits) {
return x >> bits;
}
Sk4f shift_right(const Sk4f& x, int bits) {
skvx::float4 shift_right(const skvx::float4& x, int bits) {
return x * (1.0f / (1 << bits));
}
@ -178,7 +177,7 @@ template <typename T> T shift_left(const T& x, int bits) {
return x << bits;
}
Sk4f shift_left(const Sk4f& x, int bits) {
skvx::float4 shift_left(const skvx::float4& x, int bits) {
return x * (1 << bits);
}
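The ColorTypeFilter Expand/Compact pairs above exist so the mipmap downsampler can average neighboring pixels in wider lanes without overflow: 8888 expands to 16-bit lanes, the F16 formats expand to floats, and shift_right becomes a multiply in the float case. A hedged sketch of how the 8888 pair is used for a 2x2 box average (helper name hypothetical; assumes the Skia tree for SkVx.h):

#include "include/private/SkVx.h"
#include <cstdint>

// Average four 8888 pixels channel-by-channel: widen to 16 bits, sum,
// divide by four with a shift, narrow back to bytes.
static uint32_t average_2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
    auto expand = [](uint32_t px) { return skvx::cast<uint16_t>(skvx::byte4::Load(&px)); };
    skvx::Vec<4, uint16_t> sum = expand(a) + expand(b) + expand(c) + expand(d);
    uint32_t out;
    skvx::cast<uint8_t>(sum >> 2).store(&out);
    return out;
}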

View File

@ -15,10 +15,10 @@
#include "include/private/SkColorData.h"
#include "include/private/SkHalf.h"
#include "include/private/SkImageInfoPriv.h"
#include "include/private/SkNx.h"
#include "include/private/SkTPin.h"
#include "include/private/SkTemplates.h"
#include "include/private/SkTo.h"
#include "include/private/SkVx.h"
#include "src/core/SkConvertPixels.h"
#include "src/core/SkDraw.h"
#include "src/core/SkMask.h"
@ -419,29 +419,25 @@ SkColor SkPixmap::getColor(int x, int y) const {
case kRGBA_F16_SkColorType: {
const uint64_t* addr =
(const uint64_t*)fPixels + y * (fRowBytes >> 3) + x;
Sk4f p4 = SkHalfToFloat_finite_ftz(*addr);
skvx::float4 p4 = SkHalfToFloat_finite_ftz(*addr);
if (p4[3] && needsUnpremul) {
float inva = 1 / p4[3];
p4 = p4 * Sk4f(inva, inva, inva, 1);
p4 = p4 * skvx::float4(inva, inva, inva, 1);
}
SkColor c;
SkNx_cast<uint8_t>(p4 * Sk4f(255) + Sk4f(0.5f)).store(&c);
// p4 is RGBA, but we want BGRA, so we need to swap next
return SkSwizzle_RB(c);
return Sk4f_toL32(swizzle_rb(p4));
}
case kRGBA_F32_SkColorType: {
const float* rgba =
(const float*)fPixels + 4*y*(fRowBytes >> 4) + 4*x;
Sk4f p4 = Sk4f::Load(rgba);
skvx::float4 p4 = skvx::float4::Load(rgba);
// From here on, just like F16:
if (p4[3] && needsUnpremul) {
float inva = 1 / p4[3];
p4 = p4 * Sk4f(inva, inva, inva, 1);
p4 = p4 * skvx::float4(inva, inva, inva, 1);
}
SkColor c;
SkNx_cast<uint8_t>(p4 * Sk4f(255) + Sk4f(0.5f)).store(&c);
// p4 is RGBA, but we want BGRA, so we need to swap next
return SkSwizzle_RB(c);
return Sk4f_toL32(swizzle_rb(p4));
}
case kUnknown_SkColorType:
break;
@ -639,4 +635,3 @@ bool SkPixmapPriv::Orient(const SkPixmap& dst, const SkPixmap& src, SkEncodedOri
SkImageInfo SkPixmapPriv::SwapWidthHeight(const SkImageInfo& info) {
return info.makeWH(info.height(), info.width());
}
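Both floating-point paths of SkPixmap::getColor now share the same tail: optionally unpremultiply (scale RGB by 1/alpha, leave alpha), then swap R and B and let Sk4f_toL32 do the clamping, rounding, and byte packing that the removed SkNx_cast/SkSwizzle_RB sequence did by hand. A sketch of that tail as a standalone helper (name hypothetical; assumes the Skia tree):

#include "include/core/SkColor.h"
#include "include/private/SkColorData.h"

// Convert an unpacked RGBA color to an SkColor (BGRA byte order), the shared
// tail of the F16/F32 getColor() cases above.
static SkColor rgba_float4_to_skcolor(skvx::float4 p4, bool needsUnpremul) {
    if (p4[3] && needsUnpremul) {
        float inva = 1 / p4[3];
        p4 = p4 * skvx::float4(inva, inva, inva, 1);   // unpremultiply RGB, keep A
    }
    return Sk4f_toL32(swizzle_rb(p4));                 // swap R/B, clamp, round, pack
}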

View File

@ -81,7 +81,7 @@ static inline bool SkPMColor4fFitsInBytes(const SkPMColor4f& color) {
static inline uint64_t SkPMColor4f_toFP16(const SkPMColor4f& color) {
uint64_t halfColor;
SkFloatToHalf_finite_ftz(Sk4f::Load(color.vec())).store(&halfColor);
SkFloatToHalf_finite_ftz(skvx::float4::Load(color.vec())).store(&halfColor);
return halfColor;
}

View File

@ -129,16 +129,13 @@ void GrGradientBitmapCache::fillGradient(const SkPMColor4f* colors, const SkScal
SkHalf* pixelsF16 = reinterpret_cast<SkHalf*>(bitmap->getPixels());
uint32_t* pixels32 = reinterpret_cast<uint32_t*>(bitmap->getPixels());
typedef std::function<void(const Sk4f&, int)> pixelWriteFn_t;
typedef std::function<void(const skvx::float4&, int)> pixelWriteFn_t;
pixelWriteFn_t writeF16Pixel = [&](const Sk4f& x, int index) {
Sk4h c = SkFloatToHalf_finite_ftz(x);
pixelsF16[4*index+0] = c[0];
pixelsF16[4*index+1] = c[1];
pixelsF16[4*index+2] = c[2];
pixelsF16[4*index+3] = c[3];
pixelWriteFn_t writeF16Pixel = [&](const skvx::float4& x, int index) {
skvx::half4 c = SkFloatToHalf_finite_ftz(x);
c.store(pixelsF16 + (4 * index));
};
pixelWriteFn_t write8888Pixel = [&](const Sk4f& c, int index) {
pixelWriteFn_t write8888Pixel = [&](const skvx::float4& c, int index) {
pixels32[index] = Sk4f_toL32(c);
};
@ -154,11 +151,11 @@ void GrGradientBitmapCache::fillGradient(const SkPMColor4f* colors, const SkScal
SkIntToScalar(fResolution - 1));
if (nextIndex > prevIndex) {
Sk4f c0 = Sk4f::Load(colors[i - 1].vec()),
c1 = Sk4f::Load(colors[i ].vec());
auto c0 = skvx::float4::Load(colors[i - 1].vec()),
c1 = skvx::float4::Load(colors[i ].vec());
Sk4f step = Sk4f(1.0f / static_cast<float>(nextIndex - prevIndex));
Sk4f delta = (c1 - c0) * step;
auto step = skvx::float4(1.0f / static_cast<float>(nextIndex - prevIndex));
auto delta = (c1 - c0) * step;
for (int curIndex = prevIndex; curIndex <= nextIndex; ++curIndex) {
writePixel(c0, curIndex);
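GrGradientBitmapCache::fillGradient walks each pair of adjacent color stops and writes a linear ramp between them: one divide to get the per-texel step, then an add per texel. The increment that follows writePixel() falls outside the hunk, so the c0 += delta step below is an assumption based on the visible setup; sketched here per channel with plain scalars (name hypothetical):

// Fill dst[prevIndex..nextIndex] with a linear ramp from c0 to c1, one channel.
static void fill_ramp(float dst[], float c0, float c1, int prevIndex, int nextIndex) {
    float step  = 1.0f / static_cast<float>(nextIndex - prevIndex);
    float delta = (c1 - c0) * step;
    for (int curIndex = prevIndex; curIndex <= nextIndex; ++curIndex) {
        dst[curIndex] = c0;     // writePixel(c0, curIndex) in the real code
        c0 += delta;
    }
}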

View File

@ -12,8 +12,8 @@ generated_cc_atom(
":Sk4fGradientPriv_hdr",
":SkGradientShaderPriv_hdr",
"//include/core:SkColor_hdr",
"//include/private:SkNx_hdr",
"//include/private:SkTArray_hdr",
"//include/private:SkVx_hdr",
"//src/core:SkMatrixPriv_hdr",
"//src/shaders:SkShaderBase_hdr",
],
@ -38,7 +38,7 @@ generated_cc_atom(
"//include/core:SkImageInfo_hdr",
"//include/private:SkColorData_hdr",
"//include/private:SkHalf_hdr",
"//include/private:SkNx_hdr",
"//include/private:SkVx_hdr",
"//src/core:SkOpts_hdr",
],
)

View File

@ -11,16 +11,15 @@
namespace {
Sk4f pack_color(const SkColor4f& c4f, bool premul, const Sk4f& component_scale) {
Sk4f pm4f = premul
? Sk4f::Load(c4f.premul().vec())
: Sk4f::Load(c4f.vec());
skvx::float4 pack_color(const SkColor4f& c4f, bool premul, const skvx::float4& component_scale) {
auto pm4f = premul ? skvx::float4::Load(c4f.premul().vec())
: skvx::float4::Load(c4f.vec());
if (premul) {
// If the stops are premul, we clamp them to gamut now.
// If the stops are unpremul, the colors will eventually go through Sk4f_toL32(),
// which ends up clamping to gamut then.
pm4f = Sk4f::Max(0, Sk4f::Min(pm4f, pm4f[3]));
pm4f = max(0, min(pm4f, pm4f[3]));
}
return pm4f * component_scale;
@ -97,7 +96,7 @@ private:
void addMirrorIntervals(const SkGradientShaderBase& shader,
const SkColor4f* colors,
const Sk4f& componentScale,
const skvx::float4& componentScale,
bool premulColors, bool reverse,
Sk4fGradientIntervalBuffer::BufferType* buffer) {
const IntervalIterator iter(shader, reverse);
@ -117,10 +116,10 @@ void addMirrorIntervals(const SkGradientShaderBase& shader,
} // anonymous namespace
Sk4fGradientInterval::Sk4fGradientInterval(const Sk4f& c0, SkScalar t0,
const Sk4f& c1, SkScalar t1)
: fT0(t0)
, fT1(t1) {
Sk4fGradientInterval::Sk4fGradientInterval(const skvx::float4& c0, SkScalar t0,
const skvx::float4& c1, SkScalar t1)
: fT0(t0)
, fT1(t1) {
SkASSERT(t0 != t1);
// Either p0 or p1 can be (-)inf for synthetic clamp edge intervals.
SkASSERT(SkScalarIsFinite(t0) || SkScalarIsFinite(t1));
@ -128,10 +127,10 @@ Sk4fGradientInterval::Sk4fGradientInterval(const Sk4f& c0, SkScalar t0,
const auto dt = t1 - t0;
// Clamp edge intervals are always zero-ramp.
SkASSERT(SkScalarIsFinite(dt) || (c0 == c1).allTrue());
SkASSERT(SkScalarIsFinite(t0) || (c0 == c1).allTrue());
const Sk4f dc = SkScalarIsFinite(dt) ? (c1 - c0) / dt : 0;
const Sk4f bias = c0 - (SkScalarIsFinite(t0) ? t0 * dc : 0);
SkASSERT(SkScalarIsFinite(dt) || all(c0 == c1));
SkASSERT(SkScalarIsFinite(t0) || all(c0 == c1));
const auto dc = SkScalarIsFinite(dt) ? (c1 - c0) / dt : 0;
const auto bias = c0 - (SkScalarIsFinite(t0) ? t0 * dc : 0);
bias.store(fCb.vec());
dc.store(fCg.vec());
@ -187,9 +186,8 @@ void Sk4fGradientIntervalBuffer::init(const SkGradientShaderBase& shader, SkColo
fIntervals.reset();
const Sk4f componentScale = premulColors
? Sk4f(alpha)
: Sk4f(1.0f, 1.0f, 1.0f, alpha);
const skvx::float4 componentScale = premulColors ? skvx::float4(alpha)
: skvx::float4(1.0f, 1.0f, 1.0f, alpha);
const int first_index = reverse ? count - 1 : 0;
const int last_index = count - 1 - first_index;
const SkScalar first_pos = reverse ? SK_Scalar1 : 0;
@ -200,7 +198,7 @@ void Sk4fGradientIntervalBuffer::init(const SkGradientShaderBase& shader, SkColo
if (tileMode == SkTileMode::kClamp) {
// synthetic edge interval: -/+inf .. P0
const Sk4f clamp_color = pack_color(xformedColors.fColors[first_index],
const auto clamp_color = pack_color(xformedColors.fColors[first_index],
premulColors, componentScale);
const SkScalar clamp_pos = reverse ? SK_ScalarInfinity : SK_ScalarNegativeInfinity;
fIntervals.emplace_back(clamp_color, clamp_pos,
@ -222,7 +220,7 @@ void Sk4fGradientIntervalBuffer::init(const SkGradientShaderBase& shader, SkColo
if (tileMode == SkTileMode::kClamp) {
// synthetic edge interval: Pn .. +/-inf
const Sk4f clamp_color = pack_color(xformedColors.fColors[last_index],
const auto clamp_color = pack_color(xformedColors.fColors[last_index],
premulColors, componentScale);
const SkScalar clamp_pos = reverse ? SK_ScalarNegativeInfinity : SK_ScalarInfinity;
fIntervals.emplace_back(clamp_color, last_pos,
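Each Sk4fGradientInterval stores its colors in slope-intercept form so evaluation is a single multiply-add per lane: fCg holds the gradient dc = (c1 - c0) / (t1 - t0) and fCb holds the bias c0 - t0 * dc, with the clamp-edge intervals (infinite t) falling back to a zero gradient. A one-channel scalar sketch of that setup with a worked check (names hypothetical):

// Parameterize a gradient interval as color(t) = bias + t * dc.
struct Interval1 { float bias, dc; };

static Interval1 make_interval(float c0, float t0, float c1, float t1) {
    float dc   = (c1 - c0) / (t1 - t0);
    float bias = c0 - t0 * dc;
    return { bias, dc };
}
// make_interval(0.f, 0.25f, 1.f, 0.75f) gives dc = 2, bias = -0.5,
// so color(0.25) == 0 and color(0.75) == 1, as expected.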

View File

@ -9,16 +9,16 @@
#define Sk4fGradientBase_DEFINED
#include "include/core/SkColor.h"
#include "include/private/SkNx.h"
#include "include/private/SkTArray.h"
#include "include/private/SkVx.h"
#include "src/core/SkMatrixPriv.h"
#include "src/shaders/SkShaderBase.h"
#include "src/shaders/gradients/Sk4fGradientPriv.h"
#include "src/shaders/gradients/SkGradientShaderPriv.h"
struct Sk4fGradientInterval {
Sk4fGradientInterval(const Sk4f& c0, SkScalar t0,
const Sk4f& c1, SkScalar t1);
Sk4fGradientInterval(const skvx::float4& c0, SkScalar t0,
const skvx::float4& c1, SkScalar t1);
bool contains(SkScalar t) const {
// True if t is in [p0,p1]. Note: this helper assumes a
@ -73,7 +73,8 @@ private:
using INHERITED = Context;
void addMirrorIntervals(const SkGradientShaderBase&,
const Sk4f& componentScale, bool reverse);
const skvx::float4& componentScale,
bool reverse);
};
#endif // Sk4fGradientBase_DEFINED

View File

@ -12,7 +12,7 @@
#include "include/core/SkImageInfo.h"
#include "include/private/SkColorData.h"
#include "include/private/SkHalf.h"
#include "include/private/SkNx.h"
#include "include/private/SkVx.h"
#include "src/core/SkOpts.h"
// Templates shared by various 4f gradient flavors.
@ -26,29 +26,29 @@ struct PremulTraits;
template <>
struct PremulTraits<ApplyPremul::False> {
static Sk4f apply(const Sk4f& c) { return c; }
static skvx::float4 apply(const skvx::float4& c) { return c; }
};
template <>
struct PremulTraits<ApplyPremul::True> {
static Sk4f apply(const Sk4f& c) {
static skvx::float4 apply(const skvx::float4& c) {
const float alpha = c[3];
// FIXME: portable swizzle?
return c * Sk4f(alpha, alpha, alpha, 1);
return c * skvx::float4(alpha, alpha, alpha, 1);
}
};
// Struct encapsulating various dest-dependent ops:
//
// - load() Load a SkPMColor4f value into Sk4f. Normally called once per interval
// - load() Load a SkPMColor4f value into skvx::float4. Normally called once per interval
// advance. Also applies a scale and swizzle suitable for DstType.
//
// - store() Store one Sk4f to dest. Optionally handles premul, color space
// - store() Store one skvx::float4 to dest. Optionally handles premul, color space
// conversion, etc.
//
// - store(count) Store the Sk4f value repeatedly to dest, count times.
// - store(count) Store the skvx::float4 value repeatedly to dest, count times.
//
// - store4x() Store 4 Sk4f values to dest (opportunistic optimization).
// - store4x() Store 4 skvx::float4 values to dest (opportunistic optimization).
//
template <ApplyPremul premul>
@ -56,36 +56,39 @@ struct DstTraits {
using PM = PremulTraits<premul>;
// For L32, prescaling by 255 saves a per-pixel multiplication when premul is not needed.
static Sk4f load(const SkPMColor4f& c) {
Sk4f c4f = swizzle_rb_if_bgra(Sk4f::Load(c.vec()));
static skvx::float4 load(const SkPMColor4f& c) {
skvx::float4 c4f = swizzle_rb_if_bgra(skvx::float4::Load(c.vec()));
return premul == ApplyPremul::False
? c4f * Sk4f(255)
? c4f * skvx::float4(255)
: c4f;
}
static void store(const Sk4f& c, SkPMColor* dst, const Sk4f& bias) {
static void store(const skvx::float4& c, SkPMColor* dst, const skvx::float4& bias) {
if (premul == ApplyPremul::False) {
// c is pre-scaled by 255 and pre-biased, just store.
SkNx_cast<uint8_t>(c).store(dst);
skvx::cast<uint8_t>(c).store(dst);
} else {
*dst = Sk4f_toL32(PM::apply(c) + bias);
}
}
static void store(const Sk4f& c, SkPMColor* dst, int n) {
static void store(const skvx::float4& c, SkPMColor* dst, int n) {
SkPMColor pmc;
store(c, &pmc, Sk4f(0));
store(c, &pmc, skvx::float4(0));
sk_memset32(dst, pmc, n);
}
static void store4x(const Sk4f& c0, const Sk4f& c1,
const Sk4f& c2, const Sk4f& c3,
static void store4x(const skvx::float4& c0, const skvx::float4& c1,
const skvx::float4& c2, const skvx::float4& c3,
SkPMColor* dst,
const Sk4f& bias0,
const Sk4f& bias1) {
const skvx::float4& bias0,
const skvx::float4& bias1) {
if (premul == ApplyPremul::False) {
// colors are pre-scaled and pre-biased.
Sk4f_ToBytes((uint8_t*)dst, c0, c1, c2, c3);
skvx::cast<uint8_t>(c0).store(dst + 0);
skvx::cast<uint8_t>(c1).store(dst + 1);
skvx::cast<uint8_t>(c2).store(dst + 2);
skvx::cast<uint8_t>(c3).store(dst + 3);
} else {
store(c0, dst + 0, bias0);
store(c1, dst + 1, bias1);
@ -94,7 +97,7 @@ struct DstTraits {
}
}
static Sk4f pre_lerp_bias(const Sk4f& bias) {
static skvx::float4 pre_lerp_bias(const skvx::float4& bias) {
// We can apply the bias before interpolation when the colors are premultiplied.
return premul == ApplyPremul::False ? bias : 0;
}

View File

@ -15,14 +15,14 @@
namespace {
template<ApplyPremul premul>
void ramp(const Sk4f& c, const Sk4f& dc, SkPMColor dst[], int n,
const Sk4f& bias0, const Sk4f& bias1) {
void ramp(const skvx::float4& c, const skvx::float4& dc, SkPMColor dst[], int n,
const skvx::float4& bias0, const skvx::float4& bias1) {
SkASSERT(n > 0);
const Sk4f dc2 = dc + dc,
const auto dc2 = dc + dc,
dc4 = dc2 + dc2;
Sk4f c0 = c + DstTraits<premul>::pre_lerp_bias(bias0),
auto c0 = c + DstTraits<premul>::pre_lerp_bias(bias0),
c1 = c + dc + DstTraits<premul>::pre_lerp_bias(bias1),
c2 = c0 + dc2,
c3 = c1 + dc2;
@ -222,8 +222,8 @@ LinearGradient4fContext::shadeSpanInternal(int x, int y, SkPMColor dst[], int co
fx,
dx,
SkScalarNearlyZero(dx * count));
Sk4f bias4f0(bias0),
bias4f1(bias1);
skvx::float4 bias4f0(bias0),
bias4f1(bias1);
while (count > 0) {
// What we really want here is SkTPin(advance, 1, count)
@ -300,8 +300,8 @@ public:
}
bool currentRampIsZero() const { return fZeroRamp; }
const Sk4f& currentColor() const { return fCc; }
const Sk4f& currentColorGrad() const { return fDcDx; }
const skvx::float4& currentColor() const { return fCc; }
const skvx::float4& currentColorGrad() const { return fDcDx; }
void advance(SkScalar advX) {
SkASSERT(advX > 0);
@ -312,7 +312,7 @@ public:
}
SkASSERT(advX < fAdvX);
fCc = fCc + fDcDx * Sk4f(advX);
fCc = fCc + fDcDx * advX;
fAdvX -= advX;
}
@ -320,17 +320,17 @@ private:
void compute_interval_props(SkScalar t) {
SkASSERT(in_range(t, fInterval->fT0, fInterval->fT1));
const Sk4f dc = DstTraits<premul>::load(fInterval->fCg);
fCc = DstTraits<premul>::load(fInterval->fCb) + dc * Sk4f(t);
const auto dc = DstTraits<premul>::load(fInterval->fCg);
fCc = DstTraits<premul>::load(fInterval->fCb) + dc * t;
fDcDx = dc * fDx;
fZeroRamp = fIsVertical || (dc == 0).allTrue();
fZeroRamp = fIsVertical || all(dc == 0);
}
void init_average_props() {
fAdvX = SK_ScalarInfinity;
fZeroRamp = true;
fDcDx = 0;
fCc = Sk4f(0);
fCc = 0;
// TODO: precompute the average at interval setup time?
for (const auto* i = fFirstInterval; i <= fLastInterval; ++i) {
@ -376,10 +376,10 @@ private:
}
// Current interval properties.
Sk4f fDcDx; // dst color gradient (dc/dx)
Sk4f fCc; // current color, interpolated in dst
SkScalar fAdvX; // remaining interval advance in dst
bool fZeroRamp; // current interval color grad is 0
skvx::float4 fDcDx; // dst color gradient (dc/dx)
skvx::float4 fCc; // current color, interpolated in dst
SkScalar fAdvX; // remaining interval advance in dst
bool fZeroRamp; // current interval color grad is 0
const Sk4fGradientInterval* fFirstInterval;
const Sk4fGradientInterval* fLastInterval;
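The ramp() template above emits four pixels per loop iteration: it keeps four running colors spaced dc apart (c0..c3, built from dc2 = dc + dc and dc4 = dc2 + dc2), so the inner loop is pure adds. The loop body itself is outside the hunk, so the dc4 advance below is an assumption based on the visible setup; a scalar, one-channel sketch of the unrolling scheme (the real code also folds in the pre-lerp bias and works on skvx::float4; name hypothetical):

// Write n samples of a linear ramp starting at c with per-sample step dc,
// four at a time.
static void ramp4(float dst[], float c, float dc, int n) {
    float dc2 = dc + dc,
          dc4 = dc2 + dc2;
    float c0 = c, c1 = c + dc, c2 = c0 + dc2, c3 = c1 + dc2;
    int i = 0;
    for (; i + 4 <= n; i += 4) {
        dst[i + 0] = c0; dst[i + 1] = c1; dst[i + 2] = c2; dst[i + 3] = c3;
        c0 += dc4; c1 += dc4; c2 += dc4; c3 += dc4;
    }
    for (; i < n; ++i) {          // remaining 0-3 samples
        dst[i] = c0;
        c0 += dc;
    }
}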

View File

@ -339,8 +339,8 @@ bool SkGradientShaderBase::onAppendStages(const SkStageRec& rec) const {
// See F and B below.
auto ctx = alloc->make<SkRasterPipeline_EvenlySpaced2StopGradientCtx>();
(Sk4f::Load(c_r.vec()) - Sk4f::Load(c_l.vec())).store(ctx->f);
( Sk4f::Load(c_l.vec())).store(ctx->b);
(skvx::float4::Load(c_r.vec()) - skvx::float4::Load(c_l.vec())).store(ctx->f);
( skvx::float4::Load(c_l.vec())).store(ctx->b);
ctx->interpolatedInPremul = premulGrad;
p->append(SkRasterPipeline::evenly_spaced_2_stop_gradient, ctx);
@ -705,11 +705,11 @@ static SkColor4f average_gradient_color(const SkColor4f colors[], const SkScalar
// the integral between the two endpoints is 0.5 * (ci + cj) * (pj - pi), which provides that
// intervals average color. The overall average color is thus the sum of each piece. The thing
// to keep in mind is that the provided gradient definition may implicitly use p=0 and p=1.
Sk4f blend(0.0f);
skvx::float4 blend(0.0f);
for (int i = 0; i < colorCount - 1; ++i) {
// Calculate the average color for the interval between pos(i) and pos(i+1)
Sk4f c0 = Sk4f::Load(&colors[i]);
Sk4f c1 = Sk4f::Load(&colors[i + 1]);
auto c0 = skvx::float4::Load(&colors[i]);
auto c1 = skvx::float4::Load(&colors[i + 1]);
// when pos == null, there are colorCount uniformly distributed stops, going from 0 to 1,
// so pos[i + 1] - pos[i] = 1/(colorCount-1)
@ -726,7 +726,7 @@ static SkColor4f average_gradient_color(const SkColor4f colors[], const SkScalar
if (p0 > 0.0f) {
// The first color is fixed between p = 0 to pos[0], so 0.5*(ci + cj)*(pj - pi)
// becomes 0.5*(c + c)*(pj - 0) = c * pj
Sk4f c = Sk4f::Load(&colors[0]);
auto c = skvx::float4::Load(&colors[0]);
blend += p0 * c;
}
}
@ -734,7 +734,7 @@ static SkColor4f average_gradient_color(const SkColor4f colors[], const SkScalar
if (p1 < 1.f) {
// The last color is fixed between pos[n-1] to p = 1, so 0.5*(ci + cj)*(pj - pi)
// becomes 0.5*(c + c)*(1 - pi) = c * (1 - pi)
Sk4f c = Sk4f::Load(&colors[colorCount - 1]);
auto c = skvx::float4::Load(&colors[colorCount - 1]);
blend += (1.f - p1) * c;
}
}
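average_gradient_color treats the gradient as piecewise linear in position: each interval contributes the trapezoid 0.5 * (c[i] + c[i+1]) * (p[i+1] - p[i]), and stops that do not reach p = 0 or p = 1 contribute flat, constant-color pieces; the weights sum to 1, so no final division is needed. A single-channel scalar sketch with a worked check (the real code runs the same sums on skvx::float4 and also handles the pos == nullptr uniform-spacing case; name hypothetical):

// Average of a piecewise-linear gradient channel over p in [0, 1].
// Assumes p[] is sorted ascending within [0, 1].
static float average_channel(const float c[], const float p[], int count) {
    float blend = 0.0f;
    for (int i = 0; i < count - 1; ++i) {
        blend += 0.5f * (c[i] + c[i + 1]) * (p[i + 1] - p[i]);  // trapezoid per interval
    }
    blend += c[0] * p[0];                            // flat piece before the first stop
    blend += c[count - 1] * (1.0f - p[count - 1]);   // flat piece after the last stop
    return blend;
}
// Two stops c = {0, 1} at p = {0.5, 1}: 0*0.5 + 0.5*(0+1)*0.5 + 1*0 = 0.25.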

View File

@ -64,7 +64,7 @@ DEF_TEST(SkFloatToHalf_finite_ftz, r) {
alternate = std::signbit(f) ? 0x8000 : 0x0000;
}
uint16_t actual = SkFloatToHalf_finite_ftz(Sk4f{f})[0];
uint16_t actual = SkFloatToHalf_finite_ftz(skvx::float4{f})[0];
// _finite_ftz() may truncate instead of rounding, so it may be one too small.
REPORTER_ASSERT(r, actual == expected || actual == expected - 1 ||
actual == alternate || actual == alternate - 1);