Purge non-NEON ARM code.

As I begin to wade in here, it's nice to remove as much code as possible.

BUG=skia:4117

Review URL: https://codereview.chromium.org/1277953002
mtklein 2015-08-06 11:18:50 -07:00 committed by Commit bot
parent 765d6ad975
commit e683e810a3
3 changed files with 6 additions and 710 deletions

@@ -16,207 +16,7 @@
#include "SkConvolver.h"
#if !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
void SI8_D16_nofilter_DX_arm(
const SkBitmapProcState& s,
const uint32_t* SK_RESTRICT xy,
int count,
uint16_t* SK_RESTRICT colors) SK_ATTRIBUTE_OPTIMIZE_O1;
void SI8_D16_nofilter_DX_arm(const SkBitmapProcState& s,
const uint32_t* SK_RESTRICT xy,
int count, uint16_t* SK_RESTRICT colors) {
SkASSERT(count > 0 && colors != NULL);
SkASSERT(s.fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask));
SkASSERT(kNone_SkFilterQuality == s.fFilterLevel);
const uint16_t* SK_RESTRICT table = s.fPixmap.ctable()->read16BitCache();
const uint8_t* SK_RESTRICT srcAddr = (const uint8_t*)s.fPixmap.addr();
// buffer is y32, x16, x16, x16, x16, x16
// bump srcAddr to the proper row, since we're told Y never changes
SkASSERT((unsigned)xy[0] < (unsigned)s.fPixmap.height());
srcAddr = (const uint8_t*)((const char*)srcAddr + xy[0] * s.fPixmap.rowBytes());
uint8_t src;
if (1 == s.fPixmap.width()) {
src = srcAddr[0];
uint16_t dstValue = table[src];
sk_memset16(colors, dstValue, count);
} else {
int i;
int count8 = count >> 3;
const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy + 1);
asm volatile (
"cmp %[count8], #0 \n\t" // compare loop counter with 0
"beq 2f \n\t" // if loop counter == 0, exit
"1: \n\t"
"ldmia %[xx]!, {r5, r7, r9, r11} \n\t" // load ptrs to pixels 0-7
"subs %[count8], %[count8], #1 \n\t" // decrement loop counter
"uxth r4, r5 \n\t" // extract ptr 0
"mov r5, r5, lsr #16 \n\t" // extract ptr 1
"uxth r6, r7 \n\t" // extract ptr 2
"mov r7, r7, lsr #16 \n\t" // extract ptr 3
"ldrb r4, [%[srcAddr], r4] \n\t" // load pixel 0 from image
"uxth r8, r9 \n\t" // extract ptr 4
"ldrb r5, [%[srcAddr], r5] \n\t" // load pixel 1 from image
"mov r9, r9, lsr #16 \n\t" // extract ptr 5
"ldrb r6, [%[srcAddr], r6] \n\t" // load pixel 2 from image
"uxth r10, r11 \n\t" // extract ptr 6
"ldrb r7, [%[srcAddr], r7] \n\t" // load pixel 3 from image
"mov r11, r11, lsr #16 \n\t" // extract ptr 7
"ldrb r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image
"add r4, r4, r4 \n\t" // double pixel 0 for RGB565 lookup
"ldrb r9, [%[srcAddr], r9] \n\t" // load pixel 5 from image
"add r5, r5, r5 \n\t" // double pixel 1 for RGB565 lookup
"ldrb r10, [%[srcAddr], r10] \n\t" // load pixel 6 from image
"add r6, r6, r6 \n\t" // double pixel 2 for RGB565 lookup
"ldrb r11, [%[srcAddr], r11] \n\t" // load pixel 7 from image
"add r7, r7, r7 \n\t" // double pixel 3 for RGB565 lookup
"ldrh r4, [%[table], r4] \n\t" // load pixel 0 RGB565 from colmap
"add r8, r8, r8 \n\t" // double pixel 4 for RGB565 lookup
"ldrh r5, [%[table], r5] \n\t" // load pixel 1 RGB565 from colmap
"add r9, r9, r9 \n\t" // double pixel 5 for RGB565 lookup
"ldrh r6, [%[table], r6] \n\t" // load pixel 2 RGB565 from colmap
"add r10, r10, r10 \n\t" // double pixel 6 for RGB565 lookup
"ldrh r7, [%[table], r7] \n\t" // load pixel 3 RGB565 from colmap
"add r11, r11, r11 \n\t" // double pixel 7 for RGB565 lookup
"ldrh r8, [%[table], r8] \n\t" // load pixel 4 RGB565 from colmap
"ldrh r9, [%[table], r9] \n\t" // load pixel 5 RGB565 from colmap
"ldrh r10, [%[table], r10] \n\t" // load pixel 6 RGB565 from colmap
"ldrh r11, [%[table], r11] \n\t" // load pixel 7 RGB565 from colmap
"pkhbt r5, r4, r5, lsl #16 \n\t" // pack pixels 0 and 1
"pkhbt r6, r6, r7, lsl #16 \n\t" // pack pixels 2 and 3
"pkhbt r8, r8, r9, lsl #16 \n\t" // pack pixels 4 and 5
"pkhbt r10, r10, r11, lsl #16 \n\t" // pack pixels 6 and 7
"stmia %[colors]!, {r5, r6, r8, r10} \n\t" // store last 8 pixels
"bgt 1b \n\t" // loop if counter > 0
"2: \n\t"
: [xx] "+r" (xx), [count8] "+r" (count8), [colors] "+r" (colors)
: [table] "r" (table), [srcAddr] "r" (srcAddr)
: "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
);
for (i = (count & 7); i > 0; --i) {
src = srcAddr[*xx++]; *colors++ = table[src];
}
}
}
void SI8_opaque_D32_nofilter_DX_arm(
const SkBitmapProcState& s,
const uint32_t* SK_RESTRICT xy,
int count,
SkPMColor* SK_RESTRICT colors) SK_ATTRIBUTE_OPTIMIZE_O1;
void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s,
const uint32_t* SK_RESTRICT xy,
int count, SkPMColor* SK_RESTRICT colors) {
SkASSERT(count > 0 && colors != NULL);
SkASSERT(s.fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask));
SkASSERT(kNone_SkFilterQuality == s.fFilterLevel);
const SkPMColor* SK_RESTRICT table = s.fPixmap.ctable()->readColors();
const uint8_t* SK_RESTRICT srcAddr = (const uint8_t*)s.fPixmap.addr();
// buffer is y32, x16, x16, x16, x16, x16
// bump srcAddr to the proper row, since we're told Y never changes
SkASSERT((unsigned)xy[0] < (unsigned)s.fPixmap.height());
srcAddr = (const uint8_t*)((const char*)srcAddr + xy[0] * s.fPixmap.rowBytes());
if (1 == s.fPixmap.width()) {
uint8_t src = srcAddr[0];
SkPMColor dstValue = table[src];
sk_memset32(colors, dstValue, count);
} else {
const uint16_t* xx = (const uint16_t*)(xy + 1);
asm volatile (
"subs %[count], %[count], #8 \n\t" // decrement count by 8, set flags
"blt 2f \n\t" // if count < 0, branch to singles
"1: \n\t" // eights loop
"ldmia %[xx]!, {r5, r7, r9, r11} \n\t" // load ptrs to pixels 0-7
"uxth r4, r5 \n\t" // extract ptr 0
"mov r5, r5, lsr #16 \n\t" // extract ptr 1
"uxth r6, r7 \n\t" // extract ptr 2
"mov r7, r7, lsr #16 \n\t" // extract ptr 3
"ldrb r4, [%[srcAddr], r4] \n\t" // load pixel 0 from image
"uxth r8, r9 \n\t" // extract ptr 4
"ldrb r5, [%[srcAddr], r5] \n\t" // load pixel 1 from image
"mov r9, r9, lsr #16 \n\t" // extract ptr 5
"ldrb r6, [%[srcAddr], r6] \n\t" // load pixel 2 from image
"uxth r10, r11 \n\t" // extract ptr 6
"ldrb r7, [%[srcAddr], r7] \n\t" // load pixel 3 from image
"mov r11, r11, lsr #16 \n\t" // extract ptr 7
"ldrb r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image
"ldrb r9, [%[srcAddr], r9] \n\t" // load pixel 5 from image
"ldrb r10, [%[srcAddr], r10] \n\t" // load pixel 6 from image
"ldrb r11, [%[srcAddr], r11] \n\t" // load pixel 7 from image
"ldr r4, [%[table], r4, lsl #2] \n\t" // load pixel 0 SkPMColor from colmap
"ldr r5, [%[table], r5, lsl #2] \n\t" // load pixel 1 SkPMColor from colmap
"ldr r6, [%[table], r6, lsl #2] \n\t" // load pixel 2 SkPMColor from colmap
"ldr r7, [%[table], r7, lsl #2] \n\t" // load pixel 3 SkPMColor from colmap
"ldr r8, [%[table], r8, lsl #2] \n\t" // load pixel 4 SkPMColor from colmap
"ldr r9, [%[table], r9, lsl #2] \n\t" // load pixel 5 SkPMColor from colmap
"ldr r10, [%[table], r10, lsl #2] \n\t" // load pixel 6 SkPMColor from colmap
"ldr r11, [%[table], r11, lsl #2] \n\t" // load pixel 7 SkPMColor from colmap
"subs %[count], %[count], #8 \n\t" // decrement loop counter
"stmia %[colors]!, {r4-r11} \n\t" // store 8 pixels
"bge 1b \n\t" // loop if counter >= 0
"2: \n\t"
"adds %[count], %[count], #8 \n\t" // fix up counter, set flags
"beq 4f \n\t" // if count == 0, branch to exit
"3: \n\t" // singles loop
"ldrh r4, [%[xx]], #2 \n\t" // load pixel ptr
"subs %[count], %[count], #1 \n\t" // decrement loop counter
"ldrb r5, [%[srcAddr], r4] \n\t" // load pixel from image
"ldr r6, [%[table], r5, lsl #2] \n\t" // load SkPMColor from colmap
"str r6, [%[colors]], #4 \n\t" // store pixel, update ptr
"bne 3b \n\t" // loop if counter != 0
"4: \n\t" // exit
: [xx] "+r" (xx), [count] "+r" (count), [colors] "+r" (colors)
: [table] "r" (table), [srcAddr] "r" (srcAddr)
: "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
);
}
}
#endif // !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
///////////////////////////////////////////////////////////////////////////////
/* If we replace a sampleproc, then we null-out the associated shaderproc,
otherwise the shader won't even look at the matrix/sampler
*/
void SkBitmapProcState::platformProcs() {
#if !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
bool isOpaque = 256 == fAlphaScale;
bool justDx = false;
if (fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask)) {
justDx = true;
}
switch (fPixmap.colorType()) {
case kIndex_8_SkColorType:
if (justDx && kNone_SkFilterQuality == fFilterLevel) {
#if 0 /* crashing on android device */
fSampleProc16 = SI8_D16_nofilter_DX_arm;
fShaderProc16 = NULL;
#endif
if (isOpaque) {
// this one is only very slightly faster than the C version
fSampleProc32 = SI8_opaque_D32_nofilter_DX_arm;
fShaderProc32 = NULL;
}
}
break;
default:
break;
}
#endif
}
void SkBitmapProcState::platformProcs() { }
///////////////////////////////////////////////////////////////////////////////

@@ -6,378 +6,22 @@
*/
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"
#include "SkUtilsArm.h"
// Define USE_NEON_CODE to indicate that we need to build NEON routines
#define USE_NEON_CODE (!SK_ARM_NEON_IS_NONE)
// Define USE_ARM_CODE to indicate that we need to build ARM routines
#define USE_ARM_CODE (!SK_ARM_NEON_IS_ALWAYS)
#if USE_NEON_CODE
#include "SkBlitRow_opts_arm_neon.h"
#endif
#if USE_ARM_CODE
static void S32A_D565_Opaque(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src, int count,
U8CPU alpha, int /*x*/, int /*y*/) {
SkASSERT(255 == alpha);
asm volatile (
"1: \n\t"
"ldr r3, [%[src]], #4 \n\t"
"cmp r3, #0xff000000 \n\t"
"blo 2f \n\t"
"and r4, r3, #0x0000f8 \n\t"
"and r5, r3, #0x00fc00 \n\t"
"and r6, r3, #0xf80000 \n\t"
#ifdef SK_ARM_HAS_EDSP
"pld [r1, #32] \n\t"
#endif
"lsl r3, r4, #8 \n\t"
"orr r3, r3, r5, lsr #5 \n\t"
"orr r3, r3, r6, lsr #19 \n\t"
"subs %[count], %[count], #1 \n\t"
"strh r3, [%[dst]], #2 \n\t"
"bne 1b \n\t"
"b 4f \n\t"
"2: \n\t"
"lsrs r7, r3, #24 \n\t"
"beq 3f \n\t"
"ldrh r4, [%[dst]] \n\t"
"rsb r7, r7, #255 \n\t"
"and r6, r4, #0x001f \n\t"
#if SK_ARM_ARCH <= 6
"lsl r5, r4, #21 \n\t"
"lsr r5, r5, #26 \n\t"
#else
"ubfx r5, r4, #5, #6 \n\t"
#endif
#ifdef SK_ARM_HAS_EDSP
"pld [r0, #16] \n\t"
#endif
"lsr r4, r4, #11 \n\t"
#ifdef SK_ARM_HAS_EDSP
"smulbb r6, r6, r7 \n\t"
"smulbb r5, r5, r7 \n\t"
"smulbb r4, r4, r7 \n\t"
#else
"mul r6, r6, r7 \n\t"
"mul r5, r5, r7 \n\t"
"mul r4, r4, r7 \n\t"
#endif
#if SK_ARM_ARCH >= 6
"uxtb r7, r3, ROR #16 \n\t"
"uxtb ip, r3, ROR #8 \n\t"
#else
"mov ip, #0xff \n\t"
"and r7, ip, r3, ROR #16 \n\t"
"and ip, ip, r3, ROR #8 \n\t"
#endif
"and r3, r3, #0xff \n\t"
"add r6, r6, #16 \n\t"
"add r5, r5, #32 \n\t"
"add r4, r4, #16 \n\t"
"add r6, r6, r6, lsr #5 \n\t"
"add r5, r5, r5, lsr #6 \n\t"
"add r4, r4, r4, lsr #5 \n\t"
"add r6, r7, r6, lsr #5 \n\t"
"add r5, ip, r5, lsr #6 \n\t"
"add r4, r3, r4, lsr #5 \n\t"
"lsr r6, r6, #3 \n\t"
"and r5, r5, #0xfc \n\t"
"and r4, r4, #0xf8 \n\t"
"orr r6, r6, r5, lsl #3 \n\t"
"orr r4, r6, r4, lsl #8 \n\t"
"strh r4, [%[dst]], #2 \n\t"
#ifdef SK_ARM_HAS_EDSP
"pld [r1, #32] \n\t"
#endif
"subs %[count], %[count], #1 \n\t"
"bne 1b \n\t"
"b 4f \n\t"
"3: \n\t"
"subs %[count], %[count], #1 \n\t"
"add %[dst], %[dst], #2 \n\t"
"bne 1b \n\t"
"4: \n\t"
: [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
:
: "memory", "cc", "r3", "r4", "r5", "r6", "r7", "ip"
);
}
static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha) {
SkASSERT(255 == alpha);
asm volatile (
"cmp %[count], #0 \n\t" /* comparing count with 0 */
"beq 3f \n\t" /* if zero exit */
"mov ip, #0xff \n\t" /* load the 0xff mask in ip */
"orr ip, ip, ip, lsl #16 \n\t" /* convert it to 0xff00ff in ip */
"cmp %[count], #2 \n\t" /* compare count with 2 */
"blt 2f \n\t" /* if less than 2 -> single loop */
/* Double Loop */
"1: \n\t" /* <double loop> */
"ldm %[src]!, {r5,r6} \n\t" /* load the src(s) at r5-r6 */
"ldm %[dst], {r7,r8} \n\t" /* loading dst(s) into r7-r8 */
"lsr r4, r5, #24 \n\t" /* extracting the alpha from source and storing it to r4 */
/* ----------- */
"and r9, ip, r7 \n\t" /* r9 = br masked by ip */
"rsb r4, r4, #256 \n\t" /* subtracting the alpha from 256 -> r4=scale */
"and r10, ip, r7, lsr #8 \n\t" /* r10 = ag masked by ip */
"mul r9, r9, r4 \n\t" /* br = br * scale */
"mul r10, r10, r4 \n\t" /* ag = ag * scale */
"and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */
"and r10, r10, ip, lsl #8 \n\t" /* mask ag with reverse mask */
"lsr r4, r6, #24 \n\t" /* extracting the alpha from source and storing it to r4 */
"orr r7, r9, r10 \n\t" /* br | ag*/
"add r7, r5, r7 \n\t" /* dst = src + calc dest(r7) */
"rsb r4, r4, #256 \n\t" /* subtracting the alpha from 255 -> r4=scale */
/* ----------- */
"and r9, ip, r8 \n\t" /* r9 = br masked by ip */
"and r10, ip, r8, lsr #8 \n\t" /* r10 = ag masked by ip */
"mul r9, r9, r4 \n\t" /* br = br * scale */
"sub %[count], %[count], #2 \n\t"
"mul r10, r10, r4 \n\t" /* ag = ag * scale */
"and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */
"and r10, r10, ip, lsl #8 \n\t" /* mask ag with reverse mask */
"cmp %[count], #1 \n\t" /* comparing count with 1 */
"orr r8, r9, r10 \n\t" /* br | ag */
"add r8, r6, r8 \n\t" /* dst = src + calc dest(r8) */
/* ----------------- */
"stm %[dst]!, {r7,r8} \n\t" /* *dst = r7, increment dst by two (each times 4) */
/* ----------------- */
"bgt 1b \n\t" /* if greater than 1 -> reloop */
"blt 3f \n\t" /* if less than 1 -> exit */
/* Single Loop */
"2: \n\t" /* <single loop> */
"ldr r5, [%[src]], #4 \n\t" /* load the src pointer into r5 r5=src */
"ldr r7, [%[dst]] \n\t" /* loading dst into r7 */
"lsr r4, r5, #24 \n\t" /* extracting the alpha from source and storing it to r4 */
/* ----------- */
"and r9, ip, r7 \n\t" /* r9 = br masked by ip */
"rsb r4, r4, #256 \n\t" /* subtracting the alpha from 256 -> r4=scale */
"and r10, ip, r7, lsr #8 \n\t" /* r10 = ag masked by ip */
"mul r9, r9, r4 \n\t" /* br = br * scale */
"mul r10, r10, r4 \n\t" /* ag = ag * scale */
"and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */
"and r10, r10, ip, lsl #8 \n\t" /* mask ag */
"orr r7, r9, r10 \n\t" /* br | ag */
"add r7, r5, r7 \n\t" /* *dst = src + calc dest(r7) */
/* ----------------- */
"str r7, [%[dst]], #4 \n\t" /* *dst = r7, increment dst by one (times 4) */
/* ----------------- */
"3: \n\t" /* <exit> */
: [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
:
: "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory"
);
}
/*
* ARM asm version of S32A_Blend_BlitRow32
*/
void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha) {
asm volatile (
"cmp %[count], #0 \n\t" /* comparing count with 0 */
"beq 3f \n\t" /* if zero exit */
"mov r12, #0xff \n\t" /* load the 0xff mask in r12 */
"orr r12, r12, r12, lsl #16 \n\t" /* convert it to 0xff00ff in r12 */
/* src1,2_scale */
"add %[alpha], %[alpha], #1 \n\t" /* loading %[alpha]=src_scale=alpha+1 */
"cmp %[count], #2 \n\t" /* comparing count with 2 */
"blt 2f \n\t" /* if less than 2 -> single loop */
/* Double Loop */
"1: \n\t" /* <double loop> */
"ldm %[src]!, {r5, r6} \n\t" /* loading src pointers into r5 and r6 */
"ldm %[dst], {r7, r8} \n\t" /* loading dst pointers into r7 and r8 */
/* dst1_scale and dst2_scale*/
"lsr r9, r5, #24 \n\t" /* src >> 24 */
"lsr r10, r6, #24 \n\t" /* src >> 24 */
#ifdef SK_ARM_HAS_EDSP
"smulbb r9, r9, %[alpha] \n\t" /* r9 = SkMulS16 r9 with src_scale */
"smulbb r10, r10, %[alpha] \n\t" /* r10 = SkMulS16 r10 with src_scale */
#else
"mul r9, r9, %[alpha] \n\t" /* r9 = SkMulS16 r9 with src_scale */
"mul r10, r10, %[alpha] \n\t" /* r10 = SkMulS16 r10 with src_scale */
#endif
"lsr r9, r9, #8 \n\t" /* r9 >> 8 */
"lsr r10, r10, #8 \n\t" /* r10 >> 8 */
"rsb r9, r9, #256 \n\t" /* dst1_scale = r9 = 255 - r9 + 1 */
"rsb r10, r10, #256 \n\t" /* dst2_scale = r10 = 255 - r10 + 1 */
/* ---------------------- */
/* src1, src1_scale */
"and r11, r12, r5, lsr #8 \n\t" /* ag = r11 = r5 masked by r12 lsr by #8 */
"and r4, r12, r5 \n\t" /* rb = r4 = r5 masked by r12 */
"mul r11, r11, %[alpha] \n\t" /* ag = r11 times src_scale */
"mul r4, r4, %[alpha] \n\t" /* rb = r4 times src_scale */
"and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
"and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */
"orr r5, r11, r4 \n\t" /* r5 = (src1, src_scale) */
/* dst1, dst1_scale */
"and r11, r12, r7, lsr #8 \n\t" /* ag = r11 = r7 masked by r12 lsr by #8 */
"and r4, r12, r7 \n\t" /* rb = r4 = r7 masked by r12 */
"mul r11, r11, r9 \n\t" /* ag = r11 times dst_scale (r9) */
"mul r4, r4, r9 \n\t" /* rb = r4 times dst_scale (r9) */
"and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
"and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */
"orr r9, r11, r4 \n\t" /* r9 = (dst1, dst_scale) */
/* ---------------------- */
"add r9, r5, r9 \n\t" /* *dst = src plus dst both scaled */
/* ---------------------- */
/* ====================== */
/* src2, src2_scale */
"and r11, r12, r6, lsr #8 \n\t" /* ag = r11 = r6 masked by r12 lsr by #8 */
"and r4, r12, r6 \n\t" /* rb = r4 = r6 masked by r12 */
"mul r11, r11, %[alpha] \n\t" /* ag = r11 times src_scale */
"mul r4, r4, %[alpha] \n\t" /* rb = r4 times src_scale */
"and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
"and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */
"orr r6, r11, r4 \n\t" /* r6 = (src2, src_scale) */
/* dst2, dst2_scale */
"and r11, r12, r8, lsr #8 \n\t" /* ag = r11 = r8 masked by r12 lsr by #8 */
"and r4, r12, r8 \n\t" /* rb = r4 = r8 masked by r12 */
"mul r11, r11, r10 \n\t" /* ag = r11 times dst_scale (r10) */
"mul r4, r4, r10 \n\t" /* rb = r4 times dst_scale (r6) */
"and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
"and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */
"orr r10, r11, r4 \n\t" /* r10 = (dst2, dst_scale) */
"sub %[count], %[count], #2 \n\t" /* decrease count by 2 */
/* ---------------------- */
"add r10, r6, r10 \n\t" /* *dst = src plus dst both scaled */
/* ---------------------- */
"cmp %[count], #1 \n\t" /* compare count with 1 */
/* ----------------- */
"stm %[dst]!, {r9, r10} \n\t" /* copy r9 and r10 to r7 and r8 respectively */
/* ----------------- */
"bgt 1b \n\t" /* if %[count] greater than 1 reloop */
"blt 3f \n\t" /* if %[count] less than 1 exit */
/* else get into the single loop */
/* Single Loop */
"2: \n\t" /* <single loop> */
"ldr r5, [%[src]], #4 \n\t" /* loading src pointer into r5: r5=src */
"ldr r7, [%[dst]] \n\t" /* loading dst pointer into r7: r7=dst */
"lsr r6, r5, #24 \n\t" /* src >> 24 */
"and r8, r12, r5, lsr #8 \n\t" /* ag = r8 = r5 masked by r12 lsr by #8 */
#ifdef SK_ARM_HAS_EDSP
"smulbb r6, r6, %[alpha] \n\t" /* r6 = SkMulS16 with src_scale */
#else
"mul r6, r6, %[alpha] \n\t" /* r6 = SkMulS16 with src_scale */
#endif
"and r9, r12, r5 \n\t" /* rb = r9 = r5 masked by r12 */
"lsr r6, r6, #8 \n\t" /* r6 >> 8 */
"mul r8, r8, %[alpha] \n\t" /* ag = r8 times scale */
"rsb r6, r6, #256 \n\t" /* r6 = 255 - r6 + 1 */
/* src, src_scale */
"mul r9, r9, %[alpha] \n\t" /* rb = r9 times scale */
"and r8, r8, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
"and r9, r12, r9, lsr #8 \n\t" /* rb masked by mask (r12) */
"orr r10, r8, r9 \n\t" /* r10 = (scr, src_scale) */
/* dst, dst_scale */
"and r8, r12, r7, lsr #8 \n\t" /* ag = r8 = r7 masked by r12 lsr by #8 */
"and r9, r12, r7 \n\t" /* rb = r9 = r7 masked by r12 */
"mul r8, r8, r6 \n\t" /* ag = r8 times scale (r6) */
"mul r9, r9, r6 \n\t" /* rb = r9 times scale (r6) */
"and r8, r8, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
"and r9, r12, r9, lsr #8 \n\t" /* rb masked by mask (r12) */
"orr r7, r8, r9 \n\t" /* r7 = (dst, dst_scale) */
"add r10, r7, r10 \n\t" /* *dst = src plus dst both scaled */
/* ----------------- */
"str r10, [%[dst]], #4 \n\t" /* *dst = r10, postincrement dst by one (times 4) */
/* ----------------- */
"3: \n\t" /* <exit> */
: [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count), [alpha] "+r" (alpha)
:
: "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory"
);
}
///////////////////////////////////////////////////////////////////////////////
#include "SkBlitRow_opts_arm_neon.h"
static const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm[] = {
// no dither
// NOTE: For the functions below, we don't have a special version
// that assumes that each source pixel is opaque. But our S32A is
// still faster than the default, so use it.
S32A_D565_Opaque, // S32_D565_Opaque
NULL, // S32_D565_Blend
S32A_D565_Opaque, // S32A_D565_Opaque
NULL, // S32A_D565_Blend
// dither
NULL, // S32_D565_Opaque_Dither
NULL, // S32_D565_Blend_Dither
NULL, // S32A_D565_Opaque_Dither
NULL, // S32A_D565_Blend_Dither
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
};
static const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm[] = {
NULL, // Color32A_D565,
NULL, // Color32A_D565_Dither
NULL, NULL,
};
static const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = {
NULL, // S32_Opaque,
NULL, // S32_Blend,
S32A_Opaque_BlitRow32_arm, // S32A_Opaque,
S32A_Blend_BlitRow32_arm // S32A_Blend
NULL, NULL, NULL, NULL,
};
#endif // USE_ARM_CODE
SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) {
return SK_ARM_NEON_WRAP(sk_blitrow_platform_565_procs_arm)[flags];
}

@@ -635,8 +635,7 @@ void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
vdst = vld1q_u16(dst);
#ifdef SK_CPU_ARM64
vsrc = sk_vld4_u8_arm64_4(src);
#else
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
#elif (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
asm (
"vld4.u8 %h[vsrc], [%[src]]!"
: [vsrc] "=w" (vsrc), [src] "+&r" (src)
@@ -659,7 +658,6 @@ void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
vsrc.val[2] = d2;
vsrc.val[3] = d3;
#endif
#endif // #ifdef SK_CPU_ARM64
// deinterleave dst
@@ -1311,37 +1309,6 @@ void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
///////////////////////////////////////////////////////////////////////////////
#undef DEBUG_OPAQUE_DITHER
#if defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
static char buf[256];
char tbuf[32];
int i;
char *pc = (char*) p;
sprintf(buf,"%8s:", str);
for(i=0;i<len;i++) {
sprintf(tbuf, " %02x", pc[i]);
strcat(buf, tbuf);
}
SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
static char buf[256];
char tbuf[32];
int i;
uint16_t *pc = (uint16_t*) p;
sprintf(buf,"%8s:", str);
len = (len / sizeof(uint16_t)); /* passed as bytes */
for(i=0;i<len;i++) {
sprintf(tbuf, " %04x", pc[i]);
strcat(buf, tbuf);
}
SkDebugf("%s\n", buf);
}
#endif
#endif // #ifdef SK_CPU_ARM32
void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
@@ -1353,17 +1320,6 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
if (count >= UNROLL) {
#if defined(DEBUG_OPAQUE_DITHER)
uint16_t tmpbuf[UNROLL];
int td[UNROLL];
int tdv[UNROLL];
int ta[UNROLL];
int tap[UNROLL];
uint16_t in_dst[UNROLL];
int offset = 0;
int noisy = 0;
#endif
uint8x8_t dbase;
const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
dbase = vld1_u8(dstart);
@@ -1374,52 +1330,6 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
uint16x8_t dst8, scale8, alpha8;
uint16x8_t dst_r, dst_g, dst_b;
#if defined(DEBUG_OPAQUE_DITHER)
// calculate 8 elements worth into a temp buffer
{
int my_y = y;
int my_x = x;
SkPMColor* my_src = (SkPMColor*)src;
uint16_t* my_dst = dst;
int i;
DITHER_565_SCAN(my_y);
for(i = 0; i < UNROLL; i++) {
SkPMColor c = *my_src++;
SkPMColorAssert(c);
if (c) {
unsigned a = SkGetPackedA32(c);
int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
tdv[i] = DITHER_VALUE(my_x);
ta[i] = a;
tap[i] = SkAlpha255To256(a);
td[i] = d;
unsigned sr = SkGetPackedR32(c);
unsigned sg = SkGetPackedG32(c);
unsigned sb = SkGetPackedB32(c);
sr = SkDITHER_R32_FOR_565(sr, d);
sg = SkDITHER_G32_FOR_565(sg, d);
sb = SkDITHER_B32_FOR_565(sb, d);
uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
// now src and dst expanded are in g:11 r:10 x:1 b:10
tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
td[i] = d;
} else {
tmpbuf[i] = *my_dst;
ta[i] = tdv[i] = td[i] = 0xbeef;
}
in_dst[i] = *my_dst;
my_dst += 1;
DITHER_INC_X(my_x);
}
}
#endif
#ifdef SK_CPU_ARM64
vsrc = sk_vld4_u8_arm64_4(src);
#else
@@ -1489,43 +1399,6 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
vst1q_u16(dst, dst8);
#if defined(DEBUG_OPAQUE_DITHER)
// verify my 8 elements match the temp buffer
{
int i, bad=0;
static int invocation;
for (i = 0; i < UNROLL; i++) {
if (tmpbuf[i] != dst[i]) {
bad=1;
}
}
if (bad) {
SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
invocation, offset);
SkDebugf(" alpha 0x%x\n", alpha);
for (i = 0; i < UNROLL; i++)
SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
showme16("alpha8", &alpha8, sizeof(alpha8));
showme16("scale8", &scale8, sizeof(scale8));
showme8("d", &d, sizeof(d));
showme16("dst8", &dst8, sizeof(dst8));
showme16("dst_b", &dst_b, sizeof(dst_b));
showme16("dst_g", &dst_g, sizeof(dst_g));
showme16("dst_r", &dst_r, sizeof(dst_r));
showme8("sb", &sb, sizeof(sb));
showme8("sg", &sg, sizeof(sg));
showme8("sr", &sr, sizeof(sr));
return;
}
offset += UNROLL;
invocation++;
}
#endif
dst += UNROLL;
count -= UNROLL;
// skip x += UNROLL, since it's unchanged mod-4
@@ -1569,8 +1442,6 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
///////////////////////////////////////////////////////////////////////////////
#undef DEBUG_S32_OPAQUE_DITHER
void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha, int x, int y) {
@@ -1637,25 +1508,6 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
// store it
vst1q_u16(dst, dst8);
#if defined(DEBUG_S32_OPAQUE_DITHER)
// always good to know if we generated good results
{
int i, myx = x, myy = y;
DITHER_565_SCAN(myy);
for (i=0;i<UNROLL;i++) {
// the '!' in the asm block above post-incremented src by the 8 pixels it reads.
SkPMColor c = src[i-8];
unsigned dither = DITHER_VALUE(myx);
uint16_t val = SkDitherRGB32To565(c, dither);
if (val != dst[i]) {
SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
c, dither, val, dst[i], dstart[i]);
}
DITHER_INC_X(myx);
}
}
#endif
dst += UNROLL;
// we don't need to increment src as the asm above has already done it
count -= UNROLL;