clean up SkBlitRow_opts

SSE2 and NEON are common baseline instruction sets now, so there's no need to runtime detect support for these routines. I simplified the SSE and portable implementations while moving them. Cq-Include-Trybots: master.tryserver.blink:linux_trusty_blink_rel Change-Id: I34e96851735c8d7ad90198f3ac4bf86ff508f17c Reviewed-on: https://skia-review.googlesource.com/c/170220 Reviewed-by: Mike Klein <mtklein@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2018-11-09 12:09:36 -05:00 · 2018-11-09 12:09:36 -05:00 · 6a2c42f893
commit 6a2c42f893
parent 7600cb3566
10 changed files with 208 additions and 489 deletions
--- a/gn/opts.gni
+++ b/gn/opts.gni
@ -9,12 +9,10 @@ _src = get_path_info("../src", "abspath")
 none = [
  "$_src/opts/Sk4px_none.h",
  "$_src/opts/SkBitmapProcState_opts_none.cpp",
-  "$_src/opts/SkBlitRow_opts_none.cpp",
 ]

 armv7 = [
  "$_src/opts/SkBitmapProcState_opts_none.cpp",
-  "$_src/opts/SkBlitRow_opts_arm.cpp",
 ]

 neon = [
@ -23,8 +21,6 @@ neon = [
  "$_src/opts/SkBitmapProcState_filter_neon.h",
  "$_src/opts/SkBitmapProcState_matrixProcs_neon.cpp",
  "$_src/opts/SkBitmapProcState_matrix_neon.h",
-  "$_src/opts/SkBlitRow_opts_arm_neon.h",
-  "$_src/opts/SkBlitRow_opts_arm_neon.cpp",
  "$_src/opts/SkColor_opts_neon.h",
 ]

@ -35,9 +31,6 @@ arm64 = [
  "$_src/opts/SkBitmapProcState_matrixProcs_neon.cpp",
  "$_src/opts/SkBitmapProcState_matrix_neon.h",
  "$_src/opts/SkBitmapProcState_opts_none.cpp",
-  "$_src/opts/SkBlitRow_opts_arm.cpp",
-  "$_src/opts/SkBlitRow_opts_arm_neon.h",
-  "$_src/opts/SkBlitRow_opts_arm_neon.cpp",
  "$_src/opts/SkColor_opts_neon.h",
 ]

@ -47,7 +40,6 @@ sse2 = [
  "$_src/opts/Sk4px_SSE2.h",
  "$_src/opts/SkBitmapProcState_opts_SSE2.h",
  "$_src/opts/SkBitmapProcState_opts_SSE2.cpp",
-  "$_src/opts/SkBlitRow_opts_SSE2.cpp",
  "$_src/opts/SkColor_opts_SSE2.h",
  "$_src/opts/opts_check_x86.cpp",
 ]
--- a/src/core/SkBlitRow.h
+++ b/src/core/SkBlitRow.h
@ -33,19 +33,6 @@ public:
        if they are not, they may not overlap.
     */
    static void Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color);
-
-    /** These static functions are called by the Factory and Factory32
-        functions, and should return either NULL, or a
-        platform-specific function-ptr to be used in place of the
-        system default.
-     */
-
-    static Proc32 PlatformProcs32(unsigned flags);
-
-private:
-    enum {
-        kFlags32_Mask = 3
-    };
 };

 #endif
--- a/src/core/SkBlitRow_D32.cpp
+++ b/src/core/SkBlitRow_D32.cpp
@ -11,103 +11,233 @@
 #include "SkOpts.h"
 #include "SkUtils.h"

-#define UNROLL
-
-static void S32_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
-                                 const SkPMColor* SK_RESTRICT src,
-                                 int count, U8CPU alpha) {
+// Everyone agrees memcpy() is the best way to do this.
+static void blit_row_s32_opaque(SkPMColor* dst,
+                                const SkPMColor* src,
+                                int count,
+                                U8CPU alpha) {
    SkASSERT(255 == alpha);
-    memcpy(dst, src, count * 4);
+    memcpy(dst, src, count * sizeof(SkPMColor));
 }

-static void S32_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
-                                const SkPMColor* SK_RESTRICT src,
-                                int count, U8CPU alpha) {
+// We have SSE2, NEON, and portable implementations of
+// blit_row_s32_blend() and blit_row_s32a_blend().
+
+// TODO(mtklein): can we do better in NEON than 2 pixels at a time?
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+    #include <emmintrin.h>
+    #include "SkColor_opts_SSE2.h"
+
+    static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);
-    if (count > 0) {
-        unsigned src_scale = SkAlpha255To256(alpha);

-#ifdef UNROLL
-        if (count & 1) {
-            *dst = SkPMLerp(*src, *dst, src_scale);
-            src += 1;
-            dst += 1;
-            count -= 1;
+        auto src4 = (const __m128i*)src;
+        auto dst4 = (      __m128i*)dst;
+
+        while (count >= 4) {
+            _mm_storeu_si128(dst4, SkPMLerp_SSE2(_mm_loadu_si128(src4),
+                                                 _mm_loadu_si128(dst4),
+                                                 SkAlpha255To256(alpha)));
+            src4++;
+            dst4++;
+            count -= 4;
        }

-        const SkPMColor* SK_RESTRICT srcEnd = src + count;
-        while (src != srcEnd) {
-            *dst = SkPMLerp(*src, *dst, src_scale);
-            src += 1;
-            dst += 1;
-            *dst = SkPMLerp(*src, *dst, src_scale);
-            src += 1;
-            dst += 1;
-        }
-#else
-        do {
-            *dst = SkPMLerp(*src, *dst, src_scale);
-            src += 1;
-            dst += 1;
-        } while (--count > 0);
-#endif
+        src = (const SkPMColor*)src4;
+        dst = (      SkPMColor*)dst4;
+
+        while (count --> 0) {
+            *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
+            src++;
+            dst++;
        }
    }

-static void S32A_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
-                                 const SkPMColor* SK_RESTRICT src,
-                                 int count, U8CPU alpha) {
+    static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);
-    if (count > 0) {
-#ifdef UNROLL
-        if (count & 1) {
-            *dst = SkBlendARGB32(*(src++), *dst, alpha);
-            dst += 1;
-            count -= 1;
+
+        auto src4 = (const __m128i*)src;
+        auto dst4 = (      __m128i*)dst;
+
+        while (count >= 4) {
+            _mm_storeu_si128(dst4, SkBlendARGB32_SSE2(_mm_loadu_si128(src4),
+                                                      _mm_loadu_si128(dst4),
+                                                      alpha));
+            src4++;
+            dst4++;
+            count -= 4;
        }

-        const SkPMColor* SK_RESTRICT srcEnd = src + count;
-        while (src != srcEnd) {
-            *dst = SkBlendARGB32(*(src++), *dst, alpha);
-            dst += 1;
-            *dst = SkBlendARGB32(*(src++), *dst, alpha);
-            dst += 1;
-        }
-#else
-        do {
+        src = (const SkPMColor*)src4;
+        dst = (      SkPMColor*)dst4;
+
+        while (count --> 0) {
            *dst = SkBlendARGB32(*src, *dst, alpha);
-            src += 1;
-            dst += 1;
-        } while (--count > 0);
+            src++;
+            dst++;
+        }
+    }
+
+#elif defined(SK_ARM_HAS_NEON)
+
+    #include "SkColor_opts_neon.h"
+    #include <arm_neon.h>
+
+    static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
+        SkASSERT(alpha <= 255);
+
+        uint16_t src_scale = SkAlpha255To256(alpha);
+        uint16_t dst_scale = 256 - src_scale;
+
+        while (count >= 2) {
+            uint8x8_t vsrc, vdst, vres;
+            uint16x8_t vsrc_wide, vdst_wide;
+
+            vsrc = vreinterpret_u8_u32(vld1_u32(src));
+            vdst = vreinterpret_u8_u32(vld1_u32(dst));
+
+            vsrc_wide = vmovl_u8(vsrc);
+            vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
+
+            vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
+
+            vdst_wide += vsrc_wide;
+            vres = vshrn_n_u16(vdst_wide, 8);
+
+            vst1_u32(dst, vreinterpret_u32_u8(vres));
+
+            src += 2;
+            dst += 2;
+            count -= 2;
+        }
+
+        if (count == 1) {
+            uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
+            uint16x8_t vsrc_wide, vdst_wide;
+
+            vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
+            vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
+
+            vsrc_wide = vmovl_u8(vsrc);
+            vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
+            vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
+            vdst_wide += vsrc_wide;
+            vres = vshrn_n_u16(vdst_wide, 8);
+
+            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
+        }
+    }
+
+    static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
+        SkASSERT(alpha < 255);
+
+        unsigned alpha256 = SkAlpha255To256(alpha);
+
+        if (count & 1) {
+            uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
+            uint16x8_t vdst_wide, vsrc_wide;
+            unsigned dst_scale;
+
+            vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
+            vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
+
+            dst_scale = vget_lane_u8(vsrc, 3);
+            dst_scale = SkAlphaMulInv256(dst_scale, alpha256);
+
+            vsrc_wide = vmovl_u8(vsrc);
+            vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
+
+            vdst_wide = vmovl_u8(vdst);
+            vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
+
+            vdst_wide += vsrc_wide;
+            vres = vshrn_n_u16(vdst_wide, 8);
+
+            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
+            dst++;
+            src++;
+            count--;
+        }
+
+        uint8x8_t alpha_mask;
+        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
+        alpha_mask = vld1_u8(alpha_mask_setup);
+
+        while (count) {
+
+            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
+            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
+
+            __builtin_prefetch(src+32);
+            __builtin_prefetch(dst+32);
+
+            vsrc = vreinterpret_u8_u32(vld1_u32(src));
+            vdst = vreinterpret_u8_u32(vld1_u32(dst));
+
+            vsrc_scale = vdupq_n_u16(alpha256);
+
+            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
+            vdst_scale = vmovl_u8(vsrc_alphas);
+            // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
+            // A 16-bit lane would overflow if we used 0xFFFF here,
+            // so use an approximation with 0xFF00 that is off by 1,
+            // and add back 1 after to get the correct value.
+            // This is valid if alpha256 <= 255.
+            vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
+            vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
+            vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
+
+            vsrc_wide = vmovl_u8(vsrc);
+            vsrc_wide *= vsrc_scale;
+
+            vdst_wide = vmovl_u8(vdst);
+            vdst_wide *= vdst_scale;
+
+            vdst_wide += vsrc_wide;
+            vres = vshrn_n_u16(vdst_wide, 8);
+
+            vst1_u32(dst, vreinterpret_u32_u8(vres));
+
+            src += 2;
+            dst += 2;
+            count -= 2;
+        }
+    }
+
+#else
+    static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
+        SkASSERT(alpha <= 255);
+        while (count --> 0) {
+            *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
+            src++;
+            dst++;
+        }
+    }
+
+    static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
+        SkASSERT(alpha <= 255);
+        while (count --> 0) {
+            *dst = SkBlendARGB32(*src, *dst, alpha);
+            src++;
+            dst++;
+        }
+    }
 #endif
-    }
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-static const SkBlitRow::Proc32 gDefault_Procs32[] = {
-    S32_Opaque_BlitRow32,
-    S32_Blend_BlitRow32,
-    nullptr,
-    S32A_Blend_BlitRow32
-};

 SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
-    SkASSERT(flags < SK_ARRAY_COUNT(gDefault_Procs32));
-    // just so we don't crash
-    flags &= kFlags32_Mask;
+    static const SkBlitRow::Proc32 kProcs[] = {
+        blit_row_s32_opaque,
+        blit_row_s32_blend,
+        nullptr,  // blit_row_s32a_opaque is in SkOpts
+        blit_row_s32a_blend
+    };

-    if (flags == 2) {
-        // S32A_Opaque_BlitRow32 has been ported to SkOpts, but not the others yet.
-        return SkOpts::blit_row_s32a_opaque;
-    }
+    SkASSERT(flags < SK_ARRAY_COUNT(kProcs));
+    flags &= SK_ARRAY_COUNT(kProcs) - 1;  // just to be safe

-    SkBlitRow::Proc32 proc = PlatformProcs32(flags);
-    if (nullptr == proc) {
-        proc = gDefault_Procs32[flags];
-    }
-    SkASSERT(proc);
-    return proc;
+    return flags == 2 ? SkOpts::blit_row_s32a_opaque
+                      : kProcs[flags];
 }

 void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) {
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@ -1,103 +0,0 @@
-/*
- * Copyright 2012 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include <emmintrin.h>
-#include "SkBitmapProcState_opts_SSE2.h"
-#include "SkBlitRow_opts_SSE2.h"
-#include "SkColorData.h"
-#include "SkColor_opts_SSE2.h"
-#include "SkMSAN.h"
-#include "SkUTF.h"
-
-/* SSE2 version of S32_Blend_BlitRow32()
- * portable version is in core/SkBlitRow_D32.cpp
- */
-void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
-                              const SkPMColor* SK_RESTRICT src,
-                              int count, U8CPU alpha) {
-    SkASSERT(alpha <= 255);
-    if (count <= 0) {
-        return;
-    }
-
-    uint32_t src_scale = SkAlpha255To256(alpha);
-
-    if (count >= 4) {
-        SkASSERT(((size_t)dst & 0x03) == 0);
-        while (((size_t)dst & 0x0F) != 0) {
-            *dst = SkPMLerp(*src, *dst, src_scale);
-            src++;
-            dst++;
-            count--;
-        }
-
-        const __m128i *s = reinterpret_cast<const __m128i*>(src);
-        __m128i *d = reinterpret_cast<__m128i*>(dst);
-
-        while (count >= 4) {
-            // Load 4 pixels each of src and dest.
-            __m128i src_pixel = _mm_loadu_si128(s);
-            __m128i dst_pixel = _mm_load_si128(d);
-
-            __m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale);
-            _mm_store_si128(d, result);
-            s++;
-            d++;
-            count -= 4;
-        }
-        src = reinterpret_cast<const SkPMColor*>(s);
-        dst = reinterpret_cast<SkPMColor*>(d);
-    }
-
-    while (count > 0) {
-        *dst = SkPMLerp(*src, *dst, src_scale);
-        src++;
-        dst++;
-        count--;
-    }
-}
-
-void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
-                               const SkPMColor* SK_RESTRICT src,
-                               int count, U8CPU alpha) {
-    SkASSERT(alpha <= 255);
-    if (count <= 0) {
-        return;
-    }
-
-    if (count >= 4) {
-        while (((size_t)dst & 0x0F) != 0) {
-            *dst = SkBlendARGB32(*src, *dst, alpha);
-            src++;
-            dst++;
-            count--;
-        }
-
-        const __m128i *s = reinterpret_cast<const __m128i*>(src);
-        __m128i *d = reinterpret_cast<__m128i*>(dst);
-        while (count >= 4) {
-            // Load 4 pixels each of src and dest.
-            __m128i src_pixel = _mm_loadu_si128(s);
-            __m128i dst_pixel = _mm_load_si128(d);
-
-            __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
-            _mm_store_si128(d, result);
-            s++;
-            d++;
-            count -= 4;
-        }
-        src = reinterpret_cast<const SkPMColor*>(s);
-        dst = reinterpret_cast<SkPMColor*>(d);
-    }
-
-    while (count > 0) {
-        *dst = SkBlendARGB32(*src, *dst, alpha);
-        src++;
-        dst++;
-        count--;
-    }
-}
--- a/src/opts/SkBlitRow_opts_SSE2.h
+++ b/src/opts/SkBlitRow_opts_SSE2.h
@ -1,21 +0,0 @@
-/*
- * Copyright 2009 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef SkBlitRow_opts_SSE2_DEFINED
-#define SkBlitRow_opts_SSE2_DEFINED
-
-#include "SkBlitRow.h"
-
-void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
-                              const SkPMColor* SK_RESTRICT src,
-                              int count, U8CPU alpha);
-
-void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
-                               const SkPMColor* SK_RESTRICT src,
-                               int count, U8CPU alpha);
-
-#endif
--- a/src/opts/SkBlitRow_opts_arm.cpp
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@ -1,19 +0,0 @@
-/*
- * Copyright 2012 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "SkBlitRow.h"
-#include "SkUtilsArm.h"
-
-#include "SkBlitRow_opts_arm_neon.h"
-
-extern const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = {
-    nullptr, nullptr, nullptr, nullptr,
-};
-
-SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
-    return SK_ARM_NEON_WRAP(sk_blitrow_platform_32_procs_arm)[flags];
-}
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@ -1,200 +0,0 @@
-/*
- * Copyright 2012 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "SkBlitRow_opts_arm_neon.h"
-
-#include "SkBlitRow.h"
-#include "SkColorData.h"
-#include "SkMathPriv.h"
-#include "SkUTF.h"
-
-#include "SkColor_opts_neon.h"
-#include <arm_neon.h>
-
-/* Neon version of S32_Blend_BlitRow32()
- * portable version is in src/core/SkBlitRow_D32.cpp
- */
-void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
-                              const SkPMColor* SK_RESTRICT src,
-                              int count, U8CPU alpha) {
-    SkASSERT(alpha <= 255);
-
-    if (count <= 0) {
-        return;
-    }
-
-    uint16_t src_scale = SkAlpha255To256(alpha);
-    uint16_t dst_scale = 256 - src_scale;
-
-    while (count >= 2) {
-        uint8x8_t vsrc, vdst, vres;
-        uint16x8_t vsrc_wide, vdst_wide;
-
-        /* These commented prefetches are a big win for count
-         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
-         * They also hurt a little (<5%) on an A15
-         */
-        //__builtin_prefetch(src+32);
-        //__builtin_prefetch(dst+32);
-
-        // Load
-        vsrc = vreinterpret_u8_u32(vld1_u32(src));
-        vdst = vreinterpret_u8_u32(vld1_u32(dst));
-
-        // Process src
-        vsrc_wide = vmovl_u8(vsrc);
-        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
-
-        // Process dst
-        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
-
-        // Combine
-        vdst_wide += vsrc_wide;
-        vres = vshrn_n_u16(vdst_wide, 8);
-
-        // Store
-        vst1_u32(dst, vreinterpret_u32_u8(vres));
-
-        src += 2;
-        dst += 2;
-        count -= 2;
-    }
-
-    if (count == 1) {
-        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
-        uint16x8_t vsrc_wide, vdst_wide;
-
-        // Load
-        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
-        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
-
-        // Process
-        vsrc_wide = vmovl_u8(vsrc);
-        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
-        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
-        vdst_wide += vsrc_wide;
-        vres = vshrn_n_u16(vdst_wide, 8);
-
-        // Store
-        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
-    }
-}
-
-#ifdef SK_CPU_ARM32
-void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
-                         const SkPMColor* SK_RESTRICT src,
-                         int count, U8CPU alpha) {
-
-    SkASSERT(255 > alpha);
-
-    if (count <= 0) {
-        return;
-    }
-
-    unsigned alpha256 = SkAlpha255To256(alpha);
-
-    // First deal with odd counts
-    if (count & 1) {
-        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
-        uint16x8_t vdst_wide, vsrc_wide;
-        unsigned dst_scale;
-
-        // Load
-        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
-        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
-
-        // Calc dst_scale
-        dst_scale = vget_lane_u8(vsrc, 3);
-        dst_scale = SkAlphaMulInv256(dst_scale, alpha256);
-
-        // Process src
-        vsrc_wide = vmovl_u8(vsrc);
-        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
-
-        // Process dst
-        vdst_wide = vmovl_u8(vdst);
-        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
-
-        // Combine
-        vdst_wide += vsrc_wide;
-        vres = vshrn_n_u16(vdst_wide, 8);
-
-        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
-        dst++;
-        src++;
-        count--;
-    }
-
-    if (count) {
-        uint8x8_t alpha_mask;
-        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
-        alpha_mask = vld1_u8(alpha_mask_setup);
-
-        do {
-
-            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
-            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
-
-            __builtin_prefetch(src+32);
-            __builtin_prefetch(dst+32);
-
-            // Load
-            vsrc = vreinterpret_u8_u32(vld1_u32(src));
-            vdst = vreinterpret_u8_u32(vld1_u32(dst));
-
-            // Prepare src_scale
-            vsrc_scale = vdupq_n_u16(alpha256);
-
-            // Calc dst_scale
-            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
-            vdst_scale = vmovl_u8(vsrc_alphas);
-            // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
-            // A 16-bit lane would overflow if we used 0xFFFF here,
-            // so use an approximation with 0xFF00 that is off by 1,
-            // and add back 1 after to get the correct value.
-            // This is valid if alpha256 <= 255.
-            vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
-            vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
-            vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
-
-            // Process src
-            vsrc_wide = vmovl_u8(vsrc);
-            vsrc_wide *= vsrc_scale;
-
-            // Process dst
-            vdst_wide = vmovl_u8(vdst);
-            vdst_wide *= vdst_scale;
-
-            // Combine
-            vdst_wide += vsrc_wide;
-            vres = vshrn_n_u16(vdst_wide, 8);
-
-            vst1_u32(dst, vreinterpret_u32_u8(vres));
-
-            src += 2;
-            dst += 2;
-            count -= 2;
-        } while(count);
-    }
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-#endif // #ifdef SK_CPU_ARM32
-
-///////////////////////////////////////////////////////////////////////////////
-
-const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
-    nullptr,   // S32_Opaque,
-    S32_Blend_BlitRow32_neon,        // S32_Blend,
-    nullptr,  // Ported to SkOpts
-#ifdef SK_CPU_ARM32
-    S32A_Blend_BlitRow32_neon        // S32A_Blend
-#else
-    nullptr
-#endif
-};
--- a/src/opts/SkBlitRow_opts_arm_neon.h
+++ b/src/opts/SkBlitRow_opts_arm_neon.h
@ -1,14 +0,0 @@
-/*
- * Copyright 2012 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-#ifndef SkBlitRow_opts_arm_neon_DEFINED
-#define SkBlitRow_opts_arm_neon_DEFINED
-
-#include "SkBlitRow.h"
-
-extern const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[];
-
-#endif
--- a/src/opts/SkBlitRow_opts_none.cpp
+++ b/src/opts/SkBlitRow_opts_none.cpp
@ -1,14 +0,0 @@
-/*
- * Copyright 2011 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "SkBlitRow.h"
-
-// Platform impl of Platform_procs with no overrides
-
-SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
-    return nullptr;
-}
--- a/src/opts/opts_check_x86.cpp
+++ b/src/opts/opts_check_x86.cpp
@ -7,8 +7,6 @@

 #include "SkBitmapProcState_opts_SSE2.h"
 #include "SkBitmapProcState_opts_SSSE3.h"
-#include "SkBlitRow.h"
-#include "SkBlitRow_opts_SSE2.h"
 #include "SkCpu.h"


@ -61,20 +59,3 @@ void SkBitmapProcState::platformProcs() {
        fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
    }
 }
-
-////////////////////////////////////////////////////////////////////////////////
-
-static const SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
-    nullptr,                               // S32_Opaque,
-    S32_Blend_BlitRow32_SSE2,           // S32_Blend,
-    nullptr,                            // Ported to SkOpts
-    S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
-};
-
-SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
-    if (SkCpu::Supports(SkCpu::SSE2)) {
-        return platform_32_procs_SSE2[flags];
-    } else {
-        return nullptr;
-    }
-}