Reverting r5364 (Update ARM and NEON optimizations for S32A_Opaque_BlitRow32)

git-svn-id: http://skia.googlecode.com/svn/trunk@5378 2bbb7eff-a529-9590-31e7-b0007b416f81
2012-09-04 12:48:01 +00:00 · 2012-09-04 12:48:01 +00:00 · b78765e63b
commit b78765e63b
parent 4f55d39a17
3 changed files with 48 additions and 657 deletions
--- a/bench/BitmapBench.cpp
+++ b/bench/BitmapBench.cpp
@ -21,6 +21,25 @@ static const char* gConfigName[] = {
    "ERROR", "a1", "a8", "index8", "565", "4444", "8888"
 };

+static void drawIntoBitmap(const SkBitmap& bm) {  // Paints the benchmark's source image through a canvas built on bm.
+    const int w = bm.width();
+    const int h = bm.height();
+
+    SkCanvas canvas(bm);  // every draw call below goes through this canvas
+    SkPaint p;
+    p.setAntiAlias(true);  // stays enabled for both the circle and the rectangle
+    p.setColor(SK_ColorRED);
+    canvas.drawCircle(SkIntToScalar(w)/2, SkIntToScalar(h)/2,  // red circle centred at (w/2, h/2)
+                      SkIntToScalar(SkMin32(w, h))*3/8, p);  // radius is 3/8 of the shorter dimension
+
+    SkRect r;
+    r.set(0, 0, SkIntToScalar(w), SkIntToScalar(h));  // rectangle covering the full w x h extent
+    p.setStyle(SkPaint::kStroke_Style);  // the same paint is reused, now with stroke style
+    p.setStrokeWidth(SkIntToScalar(4));
+    p.setColor(SK_ColorBLUE);
+    canvas.drawRect(r, p);  // blue outline, stroke width 4, along the bitmap's bounds
+}
+
 static int conv6ToByte(int x) {
    return x * 0xFF / 5;
 }
@ -83,23 +102,38 @@ class BitmapBench : public SkBenchmark {
    bool        fIsOpaque;
    bool        fForceUpdate; //bitmap marked as dirty before each draw. forces bitmap to be updated on device cache
    int         fTileX, fTileY; // -1 means don't use shader
-    bool        fIsVolatile;
-    SkBitmap::Config fConfig;
    SkString    fName;
    enum { N = SkBENCHLOOP(300) };
-    enum { W = 128 };
-    enum { H = 128 };
 public:
    BitmapBench(void* param, bool isOpaque, SkBitmap::Config c,
                bool forceUpdate = false, bool bitmapVolatile = false,
                int tx = -1, int ty = -1)
-        : INHERITED(param)
-        , fIsOpaque(isOpaque)
-        , fForceUpdate(forceUpdate)
-        , fIsVolatile(bitmapVolatile)
-        , fTileX(tx)
-        , fTileY(ty)
-        , fConfig(c) {
+        : INHERITED(param), fIsOpaque(isOpaque), fForceUpdate(forceUpdate), fTileX(tx), fTileY(ty) {  // c and bitmapVolatile are not stored; they are applied to fBitmap below
+        const int w = 128;  // the benchmark bitmap is always 128 x 128
+        const int h = 128;
+        SkBitmap bm;  // scratch bitmap that is drawn into before fBitmap is filled
+
+        if (SkBitmap::kIndex8_Config == c) {  // index8 request: draw into an 8888 bitmap first, convert afterwards
+            bm.setConfig(SkBitmap::kARGB_8888_Config, w, h);
+        } else {
+            bm.setConfig(c, w, h);
+        }
+        bm.allocPixels();
+        bm.eraseColor(isOpaque ? SK_ColorBLACK : 0);  // background: black when opaque, colour value 0 otherwise
+
+        drawIntoBitmap(bm);  // circle-and-border test pattern
+
+        if (SkBitmap::kIndex8_Config == c) {
+            convertToIndex666(bm, &fBitmap);  // helper defined outside this hunk; presumably writes the index8 version into fBitmap
+        } else {
+            fBitmap = bm;  // every other config uses the drawn bitmap as-is
+        }
+
+        if (fBitmap.getColorTable()) {  // only taken when fBitmap has a color table
+            fBitmap.getColorTable()->setIsOpaque(isOpaque);
+        }
+        fBitmap.setIsOpaque(isOpaque);
+        fBitmap.setIsVolatile(bitmapVolatile);  // onGetName later reads this flag back via fBitmap.isVolatile()
    }

 protected:
@ -111,43 +145,16 @@ protected:
                fName.appendf("_%s", gTileName[fTileY]);
            }
        }
-        fName.appendf("_%s%s", gConfigName[fConfig],
+        fName.appendf("_%s%s", gConfigName[fBitmap.config()],
                      fIsOpaque ? "" : "_A");
        if (fForceUpdate)
            fName.append("_update");
-        if (fIsVolatile)
+        if (fBitmap.isVolatile())
            fName.append("_volatile");

        return fName.c_str();
    }

-    virtual void onPreDraw() {
-        SkBitmap bm;
-
-        if (SkBitmap::kIndex8_Config == fConfig) {
-            bm.setConfig(SkBitmap::kARGB_8888_Config, W, H);
-        } else {
-            bm.setConfig(fConfig, W, H);
-        }
-
-        bm.allocPixels();
-        bm.eraseColor(fIsOpaque ? SK_ColorBLACK : 0);
-
-        onDrawIntoBitmap(bm);
-
-        if (SkBitmap::kIndex8_Config == fConfig) {
-            convertToIndex666(bm, &fBitmap);
-        } else {
-            fBitmap = bm;
-        }
-
-        if (fBitmap.getColorTable()) {
-            fBitmap.getColorTable()->setIsOpaque(fIsOpaque);
-        }
-        fBitmap.setIsOpaque(fIsOpaque);
-        fBitmap.setIsVolatile(fIsVolatile);
-    }
-
    virtual void onDraw(SkCanvas* canvas) {
        SkIPoint dim = this->getSize();
        SkRandom rand;
@ -170,25 +177,6 @@ protected:
        }
    }

-    virtual void onDrawIntoBitmap(const SkBitmap& bm) {
-        const int w = bm.width();
-        const int h = bm.height();
-
-        SkCanvas canvas(bm);
-        SkPaint p;
-        p.setAntiAlias(true);
-        p.setColor(SK_ColorRED);
-        canvas.drawCircle(SkIntToScalar(w)/2, SkIntToScalar(h)/2,
-                          SkIntToScalar(SkMin32(w, h))*3/8, p);
-
-        SkRect r;
-        r.set(0, 0, SkIntToScalar(w), SkIntToScalar(h));
-        p.setStyle(SkPaint::kStroke_Style);
-        p.setStrokeWidth(SkIntToScalar(4));
-        p.setColor(SK_ColorBLUE);
-        canvas.drawRect(r, p);
-    }
-
 private:
    typedef SkBenchmark INHERITED;
 };
@ -253,95 +241,6 @@ private:
    typedef BitmapBench INHERITED;
 };

-/** Verify optimizations that test source alpha values. */
-
-class SourceAlphaBitmapBench : public BitmapBench {
-public:
-    enum SourceAlpha { kOpaque_SourceAlpha, kTransparent_SourceAlpha,
-                       kTwoStripes_SourceAlpha, kThreeStripes_SourceAlpha};
-private:
-    SkString    fFullName;
-    SourceAlpha fSourceAlpha;
-public:
-    SourceAlphaBitmapBench(void* param, SourceAlpha alpha, SkBitmap::Config c,
-                bool forceUpdate = false, bool bitmapVolatile = false,
-                int tx = -1, int ty = -1)
-        : INHERITED(param, false, c, forceUpdate, bitmapVolatile, tx, ty)
-        , fSourceAlpha(alpha) {
-    }
-
-protected:
-    virtual const char* onGetName() {
-        fFullName.set(INHERITED::onGetName());
-
-        if (fSourceAlpha == kOpaque_SourceAlpha) {
-                fFullName.append("_source_opaque");
-        } else if (fSourceAlpha == kTransparent_SourceAlpha) {
-                fFullName.append("_source_transparent");
-        } else if (fSourceAlpha == kTwoStripes_SourceAlpha) {
-                fFullName.append("_source_stripes_two");
-        } else if (fSourceAlpha == kThreeStripes_SourceAlpha) {
-                fFullName.append("_source_stripes_three");
-        }
-
-        return fFullName.c_str();
-    }
-
-    virtual void onDrawIntoBitmap(const SkBitmap& bm) SK_OVERRIDE {
-        const int w = bm.width();
-        const int h = bm.height();
-
-        if (kOpaque_SourceAlpha == fSourceAlpha) {
-            bm.eraseColor(SK_ColorBLACK);
-        } else if (kTransparent_SourceAlpha == fSourceAlpha) {
-            bm.eraseColor(0);
-        } else if (kTwoStripes_SourceAlpha == fSourceAlpha) {
-            bm.eraseColor(0);
-
-            SkCanvas canvas(bm);
-            SkPaint p;
-            p.setAntiAlias(false);
-            p.setStyle(SkPaint::kFill_Style);
-            p.setColor(SK_ColorRED);
-
-            // Draw red vertical stripes on transparent background
-            SkRect r;
-            for (int x = 0; x < w; x+=2)
-            {
-                r.set(SkIntToScalar(x), 0, SkIntToScalar(x+1), SkIntToScalar(h));
-                canvas.drawRect(r, p);
-            }
-
-        } else if (kThreeStripes_SourceAlpha == fSourceAlpha) {
-            bm.eraseColor(0);
-
-            SkCanvas canvas(bm);
-            SkPaint p;
-            p.setAntiAlias(false);
-            p.setStyle(SkPaint::kFill_Style);
-
-            // Draw vertical stripes on transparent background with a pattern
-            // where the first pixel is fully transparent, the next is semi-transparent
-            // and the third is fully opaque.
-            SkRect r;
-            for (int x = 0; x < w; x++)
-            {
-                if (x % 3 == 0) {
-                    continue; // Keep transparent
-                } else if (x % 3 == 1) {
-                    p.setColor(SkColorSetARGB(127, 127, 127, 127)); // Semi-transparent
-                } else if (x % 3 == 2) {
-                    p.setColor(SK_ColorRED); // Opaque
-                }
-                r.set(SkIntToScalar(x), 0, SkIntToScalar(x+1), SkIntToScalar(h));
-                canvas.drawRect(r, p);
-            }
-        }
-    }
-
-private:
-    typedef BitmapBench INHERITED;
-};
 static SkBenchmark* Fact0(void* p) { return new BitmapBench(p, false, SkBitmap::kARGB_8888_Config); }
 static SkBenchmark* Fact1(void* p) { return new BitmapBench(p, true, SkBitmap::kARGB_8888_Config); }
 static SkBenchmark* Fact2(void* p) { return new BitmapBench(p, true, SkBitmap::kRGB_565_Config); }
@ -364,12 +263,6 @@ static SkBenchmark* Fact14(void* p) { return new FilterBitmapBench(p, true, SkBi
 static SkBenchmark* Fact15(void* p) { return new FilterBitmapBench(p, true, SkBitmap::kARGB_8888_Config, true, true, -1, -1, true, true, true); }
 static SkBenchmark* Fact16(void* p) { return new FilterBitmapBench(p, true, SkBitmap::kARGB_8888_Config, true, false, -1, -1, true, true, true); }

-// source alpha tests -> S32A_Opaque_BlitRow32_{arm,neon}
-static SkBenchmark* Fact17(void* p) { return new SourceAlphaBitmapBench(p, SourceAlphaBitmapBench::kOpaque_SourceAlpha, SkBitmap::kARGB_8888_Config); }
-static SkBenchmark* Fact18(void* p) { return new SourceAlphaBitmapBench(p, SourceAlphaBitmapBench::kTransparent_SourceAlpha, SkBitmap::kARGB_8888_Config); }
-static SkBenchmark* Fact19(void* p) { return new SourceAlphaBitmapBench(p, SourceAlphaBitmapBench::kTwoStripes_SourceAlpha, SkBitmap::kARGB_8888_Config); }
-static SkBenchmark* Fact20(void* p) { return new SourceAlphaBitmapBench(p, SourceAlphaBitmapBench::kThreeStripes_SourceAlpha, SkBitmap::kARGB_8888_Config); }
-
 static BenchRegistry gReg0(Fact0);
 static BenchRegistry gReg1(Fact1);
 static BenchRegistry gReg2(Fact2);
@ -390,7 +283,3 @@ static BenchRegistry gReg14(Fact14);
 static BenchRegistry gReg15(Fact15);
 static BenchRegistry gReg16(Fact16);

-static BenchRegistry gReg17(Fact17);
-static BenchRegistry gReg18(Fact18);
-static BenchRegistry gReg19(Fact19);
-static BenchRegistry gReg20(Fact20);
--- a/src/opts/SkBlitRow_opts_arm.cpp
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@ -185,306 +185,6 @@ static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
                  : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory"
                  );
 }
-
-static void __attribute__((naked)) S32A_Opaque_BlitRow32_arm_src_alpha
-                                        (SkPMColor* SK_RESTRICT dst,
-                                         const SkPMColor* SK_RESTRICT src,
-                                         int count, U8CPU alpha) {
-
-/* Optimizes for alpha == 0, alpha == 255, and 1 < alpha < 255 cases individually */
-/* Predicts that the next pixel will have the same alpha type as the current pixel */
-
-asm volatile (
-
-    "\tSTMDB  r13!, {r4-r12, r14}        \n" /* saving r4-r12, lr on the stack */
-                                             /* we should not save r0-r3 according to ABI */
-
-    "\tCMP    r2, #0                     \n" /* if (count == 0) */
-    "\tBEQ    9f                         \n" /* go to EXIT */
-
-    "\tMOV    r12, #0xff                 \n" /* load the 0xff mask in r12 */
-    "\tORR    r12, r12, r12, LSL #16     \n" /* convert it to 0xff00ff in r12 */
-
-    "\tMOV    r14, #255                  \n" /* r14 = 255 */
-                                             /* will be used later for left-side comparison */
-
-    "\tADD    r2, %[src], r2, LSL #2     \n" /* r2 points to last array element which can be used */
-    "\tSUB    r2, r2, #16                \n" /* as a base for 4-way processing algorithm */
-
-    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer is bigger than */
-    "\tBGT    8f                         \n" /* calculated marker for 4-way -> */
-                                             /* use simple one-by-one processing */
-
-    /* START OF DISPATCHING BLOCK */
-
-    "\t0:                                \n"
-
-    "\tLDM    %[src]!, {r3, r4, r5, r6}  \n" /* 4-way loading of source values to r3-r6 */
-
-    "\tLSR    r7, r3, #24                \n" /* if not all src alphas of 4-way block are equal -> */
-    "\tCMP    r7, r4, LSR #24            \n"
-    "\tCMPEQ  r7, r5, LSR #24            \n"
-    "\tCMPEQ  r7, r6, LSR #24            \n"
-    "\tBNE    1f                         \n" /* -> go to general 4-way processing routine */
-
-    "\tCMP    r14, r7                    \n" /* if all src alphas are equal to 255 */
-    "\tBEQ    3f                         \n" /* go to alpha == 255 optimized routine */
-
-    "\tCMP    r7,  #0                    \n" /* if all src alphas are equal to 0 */
-    "\tBEQ    6f                         \n" /* go to alpha == 0 optimized routine */
-
-    /* END OF DISPATCHING BLOCK */
-
-    /* START OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */
-
-    "\t1:                                \n"
-                                             /* we do not have enough registers to make */
-                                             /* 4-way [dst] loading -> we are using 2 * 2-way */
-
-    "\tLDM    %[dst], {r7, r8}           \n" /* 1st 2-way loading of dst values to r7-r8 */
-
-    /* PROCESSING BLOCK 1 */
-    /* r3 = src, r7 = dst */
-
-    "\tLSR    r11, r3,  #24              \n" /* extracting alpha from source and storing to r11 */
-    "\tAND    r9,  r12, r7               \n" /* r9 = br masked by r12 (0xff00ff) */
-    "\tRSB    r11, r11, #256             \n" /* subtracting the alpha from 255 -> r11 = scale */
-    "\tAND    r10, r12, r7, LSR #8       \n" /* r10 = ag masked by r12 (0xff00ff) */
-    "\tMUL    r9,  r9,  r11              \n" /* br = br * scale */
-    "\tAND    r9,  r12, r9, LSR #8       \n" /* lsr br by 8 and mask it */
-    "\tMUL    r10, r10, r11              \n" /* ag = ag * scale */
-    "\tAND    r10, r10, r12, LSL #8      \n" /* mask ag with reverse mask */
-    "\tORR    r7,  r9,  r10              \n" /* br | ag */
-    "\tADD    r7,  r3,  r7               \n" /* dst = src + calc dest(r8) */
-
-    /* PROCESSING BLOCK 2 */
-    /* r4 = src, r8 = dst */
-
-    "\tLSR    r11, r4,  #24              \n" /* see PROCESSING BLOCK 1 */
-    "\tAND    r9,  r12, r8               \n"
-    "\tRSB    r11, r11, #256             \n"
-    "\tAND    r10, r12, r8, LSR #8       \n"
-    "\tMUL    r9,  r9,  r11              \n"
-    "\tAND    r9,  r12, r9, LSR #8       \n"
-    "\tMUL    r10, r10, r11              \n"
-    "\tAND    r10, r10, r12, LSL #8      \n"
-    "\tORR    r8,  r9,  r10              \n"
-    "\tADD    r8,  r4,  r8               \n"
-
-    "\tSTM    %[dst]!, {r7, r8}          \n" /* 1st 2-way storing of processed dst values */
-
-    "\tLDM    %[dst], {r9, r10}          \n" /* 2nd 2-way loading of dst values to r9-r10 */
-
-    /* PROCESSING BLOCK 3 */
-    /* r5 = src, r9 = dst */
-
-    "\tLSR    r11, r5,  #24              \n" /* see PROCESSING BLOCK 1 */
-    "\tAND    r7,  r12, r9               \n"
-    "\tRSB    r11, r11, #256             \n"
-    "\tAND    r8,  r12, r9, LSR #8       \n"
-    "\tMUL    r7,  r7,  r11              \n"
-    "\tAND    r7,  r12, r7, LSR #8       \n"
-    "\tMUL    r8,  r8,  r11              \n"
-    "\tAND    r8,  r8,  r12, LSL #8      \n"
-    "\tORR    r9,  r7,  r8               \n"
-    "\tADD    r9,  r5,  r9               \n"
-
-    /* PROCESSING BLOCK 4 */
-    /* r6 = src, r10 = dst */
-
-    "\tLSR    r11, r6,  #24              \n" /* see PROCESSING BLOCK 1 */
-    "\tAND    r7,  r12, r10              \n"
-    "\tRSB    r11, r11, #256             \n"
-    "\tAND    r8,  r12, r10, LSR #8      \n"
-    "\tMUL    r7,  r7,  r11              \n"
-    "\tAND    r7,  r12, r7, LSR #8       \n"
-    "\tMUL    r8,  r8,  r11              \n"
-    "\tAND    r8,  r8,  r12, LSL #8      \n"
-    "\tORR    r10, r7,  r8               \n"
-    "\tADD    r10, r6,  r10              \n"
-
-    "\tSTM    %[dst]!, {r9, r10}         \n" /* 2nd 2-way storing of processed dst values */
-
-    "\tCMP    %[src], r2                 \n" /* if our current [src] pointer <= calculated marker */
-    "\tBLE    0b                         \n" /* we could run 4-way processing -> go to dispatcher */
-    "\tBGT    8f                         \n" /* else -> use simple one-by-one processing */
-
-    /* END OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */
-
-    /* START OF BLOCK OPTIMIZED FOR ALPHA == 255 */
-
-    "\t2:                                \n" /* ENTRY 1: LOADING [src] to registers */
-
-    "\tLDM    %[src]!, {r3, r4, r5, r6}  \n" /* 4-way loading of source values to r3-r6 */
-
-    "\tAND    r7, r3, r4                 \n" /* if not all alphas == 255 -> */
-    "\tAND    r8, r5, r6                 \n"
-    "\tAND    r9, r7, r8                 \n"
-    "\tCMP    r14, r9, LSR #24           \n"
-    "\tBNE    4f                         \n" /* -> go to alpha == 0 check */
-
-    "\t3:                                \n" /* ENTRY 2: [src] already loaded by DISPATCHER */
-
-    "\tSTM    %[dst]!, {r3, r4, r5, r6}  \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */
-
-    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
-    "\tBLE    2b                         \n" /* we could run 4-way processing */
-                                             /* because now we're in ALPHA == 255 state */
-                                             /* run next cycle with priority alpha == 255 checks */
-
-    "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
-                                             /* use simple one-by-one processing */
-
-    "\t4:                                \n"
-
-    "\tORR    r7, r3, r4                 \n" /* if not all alphas == 0 -> */
-    "\tORR    r8, r5, r6                 \n"
-    "\tORR    r9, r7, r8                 \n"
-    "\tLSRS   r9, #24                    \n"
-    "\tBNE    1b                         \n" /* -> go to general processing mode */
-                                             /* (we already checked for alpha == 255) */
-
-    "\tADD    %[dst], %[dst], #16        \n" /* all src alphas == 0 -> do not change dst values */
-
-    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
-    "\tBLE    5f                         \n" /* we could run 4-way processing one more time */
-                                             /* because now we're in ALPHA == 0 state */
-                                             /* run next cycle with priority alpha == 0 checks */
-
-    "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
-                                             /* use simple one-by-one processing */
-
-    /* END OF BLOCK OPTIMIZED FOR ALPHA == 255 */
-
-    /* START OF BLOCK OPTIMIZED FOR ALPHA == 0 */
-
-    "\t5:                                \n" /* ENTRY 1: LOADING [src] to registers */
-
-    "\tLDM    %[src]!, {r3, r4, r5, r6}  \n" /* 4-way loading of source values to r3-r6 */
-
-    "\tORR    r7, r3, r4                 \n" /* if not all alphas == 0 -> */
-    "\tORR    r8, r5, r6                 \n"
-    "\tORR    r9, r7, r8                 \n"
-    "\tLSRS   r9, #24                    \n"
-    "\tBNE    7f                         \n" /* -> go to alpha == 255 check */
-
-    "\t6:                                \n" /* ENTRY 2: [src] already loaded by DISPATCHER */
-
-    "\tADD    %[dst], %[dst], #16        \n" /* all src alphas == 0 -> do not change dst values */
-
-    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
-    "\tBLE    5b                         \n" /* we could run 4-way processing one more time */
-                                             /* because now we're in ALPHA == 0 state */
-                                             /* run next cycle with priority alpha == 0 checks */
-
-    "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
-                                             /* use simple one-by-one processing */
-    "\t7:                                \n"
-
-    "\tAND    r7, r3, r4                 \n" /* if not all alphas == 255 -> */
-    "\tAND    r8, r5, r6                 \n"
-    "\tAND    r9, r7, r8                 \n"
-    "\tCMP    r14, r9, LSR #24           \n"
-    "\tBNE    1b                         \n" /* -> go to general processing mode */
-                                             /* (we already checked for alpha == 0) */
-
-    "\tSTM    %[dst]!, {r3, r4, r5, r6}  \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */
-
-    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
-    "\tBLE    2b                         \n" /* we could run 4-way processing one more time */
-                                             /* because now we're in ALPHA == 255 state */
-                                             /* run next cycle with priority alpha == 255 checks */
-
-    "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
-                                             /* use simple one-by-one processing */
-
-    /* END OF BLOCK OPTIMIZED FOR ALPHA == 0 */
-
-    /* START OF TAIL BLOCK */
-    /* (used when array is too small to be processed with 4-way algorithm)*/
-
-    "\t8:                                \n"
-
-    "\tADD    r2, r2, #16                \n" /* now r2 points to the element just after array */
-                                             /* we've done r2 = r2 - 16 at procedure start */
-
-    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer > final marker */
-    "\tBEQ    9f                         \n" /* goto EXIT */
-
-    /* TAIL PROCESSING BLOCK 1 */
-
-    "\tLDR    r3, [%[src]], #4           \n" /* r3 = *src, src++ */
-    "\tLDR    r7, [%[dst]]               \n" /* r7 = *dst */
-
-    "\tLSR    r11, r3,  #24              \n" /* extracting alpha from source */
-    "\tAND    r9,  r12, r7               \n" /* r9 = br masked by r12 (0xff00ff) */
-    "\tRSB    r11, r11, #256             \n" /* subtracting the alpha from 255 -> r11 = scale */
-    "\tAND    r10, r12, r7, LSR #8       \n" /* r10 = ag masked by r12 (0xff00ff) */
-    "\tMUL    r9,  r9,  r11              \n" /* br = br * scale */
-    "\tAND    r9,  r12, r9, LSR #8       \n" /* lsr br by 8 and mask it */
-    "\tMUL    r10, r10, r11              \n" /* ag = ag * scale */
-    "\tAND    r10, r10, r12, LSL #8      \n" /* mask ag with reverse mask */
-    "\tORR    r7,  r9,  r10              \n" /* br | ag */
-    "\tADD    r7,  r3,  r7               \n" /* dst = src + calc dest(r8) */
-
-    "\tSTR    r7, [%[dst]], #4           \n" /* *dst = r7; dst++ */
-
-    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer > final marker */
-    "\tBEQ    9f                         \n" /* goto EXIT */
-
-    /* TAIL PROCESSING BLOCK 2 */
-
-    "\tLDR    r3, [%[src]], #4           \n" /* see TAIL PROCESSING BLOCK 1 */
-    "\tLDR    r7, [%[dst]]               \n"
-
-    "\tLSR    r11, r3,  #24              \n"
-    "\tAND    r9,  r12, r7               \n"
-    "\tRSB    r11, r11, #256             \n"
-    "\tAND    r10, r12, r7, LSR #8       \n"
-    "\tMUL    r9,  r9,  r11              \n"
-    "\tAND    r9,  r12, r9, LSR #8       \n"
-    "\tMUL    r10, r10, r11              \n"
-    "\tAND    r10, r10, r12, LSL #8      \n"
-    "\tORR    r7,  r9,  r10              \n"
-    "\tADD    r7,  r3,  r7               \n"
-
-    "\tSTR    r7, [%[dst]], #4           \n"
-
-    "\tCMP    %[src], r2                 \n"
-    "\tBEQ    9f                         \n"
-
-    /* TAIL PROCESSING BLOCK 3 */
-
-    "\tLDR    r3, [%[src]], #4           \n" /* see TAIL PROCESSING BLOCK 1 */
-    "\tLDR    r7, [%[dst]]               \n"
-
-    "\tLSR    r11, r3,  #24              \n"
-    "\tAND    r9,  r12, r7               \n"
-    "\tRSB    r11, r11, #256             \n"
-    "\tAND    r10, r12, r7, LSR #8       \n"
-    "\tMUL    r9,  r9,  r11              \n"
-    "\tAND    r9,  r12, r9, LSR #8       \n"
-    "\tMUL    r10, r10, r11              \n"
-    "\tAND    r10, r10, r12, LSL #8      \n"
-    "\tORR    r7,  r9,  r10              \n"
-    "\tADD    r7,  r3,  r7               \n"
-
-    "\tSTR    r7, [%[dst]], #4           \n"
-
-    /* END OF TAIL BLOCK */
-
-    "\t9:                                \n" /* EXIT */
-
-    "\tLDMIA  r13!, {r4-r12, r14}        \n" /* restoring r4-r12, lr from stack */
-    "\tBX     lr                         \n" /* return */
-
-    : [dst] "+r" (dst), [src] "+r" (src)
-    :
-    : "cc", "r2", "r3", "memory"
-
-    );
-
-}
 #endif // USE_ARM_CODE

 /*
@ -666,21 +366,7 @@ const SkBlitRow::Proc sk_blitrow_platform_4444_procs_arm[] = {
 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = {
    NULL,   // S32_Opaque,
    NULL,   // S32_Blend,
-    /*
-     * We have two choices for S32A_Opaque procs. The one reads the src alpha
-     * value and attempts to optimize accordingly.  The optimization is
-     * sensitive to the source content and is not a win in all cases. For
-     * example, if there are a lot of transitions between the alpha states,
-     * the performance will almost certainly be worse.  However, for many
-     * common cases the performance is equivalent or better than the standard
-     * case where we do not inspect the src alpha.
-     */
-#if SK_A32_SHIFT == 24
-    // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
-    S32A_Opaque_BlitRow32_arm_src_alpha,   // S32A_Opaque,
-#else
    S32A_Opaque_BlitRow32_arm,   // S32A_Opaque,
-#endif
    S32A_Blend_BlitRow32_arm     // S32A_Blend
 };
 #endif
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@ -517,176 +517,6 @@ void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
    }
 }

-void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
-                                const SkPMColor* SK_RESTRICT src,
-                                int count, U8CPU alpha) {
-    SkASSERT(255 == alpha);
-
-    if (count <= 0)
-    return;
-
-    /* Use these to check if src is transparent or opaque */
-    const unsigned int ALPHA_OPAQ  = 0xFF000000;
-    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
-
-#define UNROLL  4
-    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
-    const SkPMColor* SK_RESTRICT src_temp = src;
-
-    /* set up the NEON variables */
-    uint8x8_t alpha_mask;
-    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
-    alpha_mask = vld1_u8(alpha_mask_setup);
-
-    uint8x8_t src_raw, dst_raw, dst_final;
-    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
-    uint8x8_t dst_cooked;
-    uint16x8_t dst_wide;
-    uint8x8_t alpha_narrow;
-    uint16x8_t alpha_wide;
-
-    /* choose the first processing type */
-    if( src >= src_end)
-        goto TAIL;
-    if(*src <= ALPHA_TRANS)
-        goto ALPHA_0;
-    if(*src >= ALPHA_OPAQ)
-        goto ALPHA_255;
-    /* fall-thru */
-
-ALPHA_1_TO_254:
-    do {
-
-        /* get the source */
-        src_raw = vreinterpret_u8_u32(vld1_u32(src));
-        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
-
-        /* get and hold the dst too */
-        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
-        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
-
-
-        /* get the alphas spread out properly */
-        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
-        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
-        /* we collapsed (255-a)+1 ... */
-        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-        /* spread the dest */
-        dst_wide = vmovl_u8(dst_raw);
-
-        /* alpha mul the dest */
-        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-        dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-        /* sum -- ignoring any byte lane overflows */
-        dst_final = vadd_u8(src_raw, dst_cooked);
-
-        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
-        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
-        /* we collapsed (255-a)+1 ... */
-        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-        /* spread the dest */
-        dst_wide = vmovl_u8(dst_raw_2);
-
-        /* alpha mul the dest */
-        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-        dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-        /* sum -- ignoring any byte lane overflows */
-        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
-
-        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
-        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
-
-        src += UNROLL;
-        dst += UNROLL;
-
-        /* if 2 of the next pixels aren't between 1 and 254
-        it might make sense to go to the optimized loops */
-        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
-            break;
-
-    } while(src < src_end);
-
-    if (src >= src_end)
-        goto TAIL;
-
-    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
-        goto ALPHA_255;
-
-    /*fall-thru*/
-
-ALPHA_0:
-
-    /*In this state, we know the current alpha is 0 and
-     we optimize for the next alpha also being zero. */
-    src_temp = src;  //so we don't have to increment dst every time
-    do {
-        if(*(++src) > ALPHA_TRANS)
-            break;
-        if(*(++src) > ALPHA_TRANS)
-            break;
-        if(*(++src) > ALPHA_TRANS)
-            break;
-        if(*(++src) > ALPHA_TRANS)
-            break;
-    } while(src < src_end);
-
-    dst += (src - src_temp);
-
-    /* no longer alpha 0, so determine where to go next. */
-    if( src >= src_end)
-        goto TAIL;
-    if(*src >= ALPHA_OPAQ)
-        goto ALPHA_255;
-    else
-        goto ALPHA_1_TO_254;
-
-ALPHA_255:
-    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
-        dst[0]=src[0];
-        dst[1]=src[1];
-        dst[2]=src[2];
-        dst[3]=src[3];
-        src+=UNROLL;
-        dst+=UNROLL;
-        if(src >= src_end)
-            goto TAIL;
-    }
-
-    //Handle remainder.
-    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
-        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
-            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
-        }
-    }
-
-    if( src >= src_end)
-        goto TAIL;
-    if(*src <= ALPHA_TRANS)
-        goto ALPHA_0;
-    else
-        goto ALPHA_1_TO_254;
-
-TAIL:
-    /* do any residual iterations */
-    src_end += UNROLL + 1;  //goto the real end
-    while(src != src_end) {
-        if( *src != 0 ) {
-            if( *src >= ALPHA_OPAQ ) {
-                *dst = *src;
-            }
-            else {
-                *dst = SkPMSrcOver(*src, *dst);
-            }
-        }
-        src++;
-        dst++;
-    }
-    return;
-}

 /* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
@ -1277,20 +1107,6 @@ const SkBlitRow::Proc sk_blitrow_platform_4444_procs_arm_neon[] = {
 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
-    /*
-     * We have two choices for S32A_Opaque procs. The one reads the src alpha
-     * value and attempts to optimize accordingly.  The optimization is
-     * sensitive to the source content and is not a win in all cases. For
-     * example, if there are a lot of transitions between the alpha states,
-     * the performance will almost certainly be worse.  However, for many
-     * common cases the performance is equivalent or better than the standard
-     * case where we do not inspect the src alpha.
-     */
-#if SK_A32_SHIFT == 24
-    // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
-    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
-#else
-    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
-#endif
+    S32A_Opaque_BlitRow32_neon,        // S32A_Opaque,
    S32A_Blend_BlitRow32_arm        // S32A_Blend
 };