From 3848427d884b72114854c8eef9662691f23fae7b Mon Sep 17 00:00:00 2001
From: mtklein <mtklein@chromium.org>
Date: Fri, 7 Aug 2015 08:48:12 -0700
Subject: [PATCH] The compiler can generate smulbb perfectly well nowadays.

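Remove SkMulS16 and the SK_ARM_HAS_EDSP inline-asm path behind it; every
caller now uses a plain multiply. This also drops the debug-build int16_t
range asserts that SkMulS16 performed on its operands.

BUG=skia:4117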

Review URL: https://codereview.chromium.org/1273203002
---
 include/core/SkColorPriv.h           |  4 ++--
 include/core/SkMath.h                | 32 ++--------------------------
 include/core/SkPreConfig.h           |  6 ------
 src/core/SkMathPriv.h                |  4 ++--
 src/core/SkScan_Antihair.cpp         | 22 +++++++++----------
 src/opts/SkBlitRow_opts_arm_neon.cpp |  6 +++---
 src/opts/SkBlitRow_opts_mips_dsp.cpp |  9 +++-----
 7 files changed, 23 insertions(+), 60 deletions(-)
diff --git a/include/core/SkColorPriv.h b/include/core/SkColorPriv.h
index f9c5d928a0..3dec49b73e 100644
--- a/include/core/SkColorPriv.h
+++ b/include/core/SkColorPriv.h
@@ -193,7 +193,7 @@ static inline unsigned Sk255To256(U8CPU value) {
 /** Multiplify value by 0..256, and shift the result down 8
     (i.e. return (value * alpha256) >> 8)
  */
-#define SkAlphaMul(value, alpha256)     (SkMulS16(value, alpha256) >> 8)
+#define SkAlphaMul(value, alpha256)     (((value) * (alpha256)) >> 8)
 
 //  The caller may want negative values, so keep all params signed (int)
 //  so we don't accidentally slip into unsigned math and lose the sign
@@ -213,7 +213,7 @@ static inline int SkAlphaBlend255(S16CPU src, S16CPU dst, U8CPU alpha) {
     SkASSERT((int16_t)dst == dst);
     SkASSERT((uint8_t)alpha == alpha);
 
-    int prod = SkMulS16(src - dst, alpha) + 128;
+    int prod = (src - dst) * alpha + 128;
     prod = (prod + (prod >> 8)) >> 8;
     return dst + prod;
 }
diff --git a/include/core/SkMath.h b/include/core/SkMath.h
index d1d0e360d4..e5069592d0 100644
--- a/include/core/SkMath.h
+++ b/include/core/SkMath.h
@@ -156,34 +156,6 @@ template <typename T> inline bool SkIsPow2(T value) {
 
 ///////////////////////////////////////////////////////////////////////////////
 
-/**
- *  SkMulS16(a, b) multiplies a * b, but requires that a and b are both int16_t.
- *  With this requirement, we can generate faster instructions on some
- *  architectures.
- */
-#ifdef SK_ARM_HAS_EDSP
-    static inline int32_t SkMulS16(S16CPU x, S16CPU y) {
-        SkASSERT((int16_t)x == x);
-        SkASSERT((int16_t)y == y);
-        int32_t product;
-        asm("smulbb %0, %1, %2 \n"
-            : "=r"(product)
-            : "r"(x), "r"(y)
-            );
-        return product;
-    }
-#else
-    #ifdef SK_DEBUG
-        static inline int32_t SkMulS16(S16CPU x, S16CPU y) {
-            SkASSERT((int16_t)x == x);
-            SkASSERT((int16_t)y == y);
-            return x * y;
-        }
-    #else
-        #define SkMulS16(x, y)  ((x) * (y))
-    #endif
-#endif
-
 /**
  *  Return a*b/((1 << shift) - 1), rounding any fractional bits.
  *  Only valid if a and b are unsigned and <= 32767 and shift is > 0 and <= 8
@@ -192,7 +164,7 @@ static inline unsigned SkMul16ShiftRound(U16CPU a, U16CPU b, int shift) {
     SkASSERT(a <= 32767);
     SkASSERT(b <= 32767);
     SkASSERT(shift > 0 && shift <= 8);
-    unsigned prod = SkMulS16(a, b) + (1 << (shift - 1));
+    unsigned prod = a*b + (1 << (shift - 1));
     return (prod + (prod >> shift)) >> shift;
 }
 
@@ -203,7 +175,7 @@ static inline unsigned SkMul16ShiftRound(U16CPU a, U16CPU b, int shift) {
 static inline U8CPU SkMulDiv255Round(U16CPU a, U16CPU b) {
     SkASSERT(a <= 32767);
     SkASSERT(b <= 32767);
-    unsigned prod = SkMulS16(a, b) + 128;
+    unsigned prod = a*b + 128;
     return (prod + (prod >> 8)) >> 8;
 }
 
diff --git a/include/core/SkPreConfig.h b/include/core/SkPreConfig.h
index 19363427a0..7a849f566e 100644
--- a/include/core/SkPreConfig.h
+++ b/include/core/SkPreConfig.h
@@ -172,12 +172,6 @@
         #else
             #define SK_ARM_ARCH 3
         #endif
-
-        #if defined(__thumb2__) && (SK_ARM_ARCH >= 6) \
-                || !defined(__thumb__) && ((SK_ARM_ARCH > 5) || defined(__ARM_ARCH_5E__) \
-                || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__))
-            #define SK_ARM_HAS_EDSP
-        #endif
     #endif
 #endif
 
diff --git a/src/core/SkMathPriv.h b/src/core/SkMathPriv.h
index 345815354c..b9184a0726 100644
--- a/src/core/SkMathPriv.h
+++ b/src/core/SkMathPriv.h
@@ -57,7 +57,7 @@ static inline unsigned SkClampUMax(unsigned value, unsigned max) {
 static inline U8CPU SkMulDiv255Trunc(U8CPU a, U8CPU b) {
     SkASSERT((uint8_t)a == a);
     SkASSERT((uint8_t)b == b);
-    unsigned prod = SkMulS16(a, b) + 1;
+    unsigned prod = a*b + 1;
     return (prod + (prod >> 8)) >> 8;
 }
 
@@ -67,7 +67,7 @@ static inline U8CPU SkMulDiv255Trunc(U8CPU a, U8CPU b) {
 static inline U8CPU SkMulDiv255Ceiling(U8CPU a, U8CPU b) {
     SkASSERT((uint8_t)a == a);
     SkASSERT((uint8_t)b == b);
-    unsigned prod = SkMulS16(a, b) + 255;
+    unsigned prod = a*b + 255;
     return (prod + (prod >> 8)) >> 8;
 }
 
diff --git a/src/core/SkScan_Antihair.cpp b/src/core/SkScan_Antihair.cpp
index 546ced0072..2bcb4c64c1 100644
--- a/src/core/SkScan_Antihair.cpp
+++ b/src/core/SkScan_Antihair.cpp
@@ -34,7 +34,7 @@
 static inline int SmallDot6Scale(int value, int dot6) {
     SkASSERT((int16_t)value == value);
     SkASSERT((unsigned)dot6 <= 64);
-    return SkMulS16(value, dot6) >> 6;
+    return (value * dot6) >> 6;
 }
 
 //#define TEST_GAMMA
@@ -155,19 +155,19 @@ class Horish_SkAntiHairBlitter : public SkAntiHairBlitter {
 public:
     SkFixed drawCap(int x, SkFixed fy, SkFixed dy, int mod64) override {
         fy += SK_Fixed1/2;
-        
+
         int lower_y = fy >> 16;
         uint8_t  a = (uint8_t)(fy >> 8);
         unsigned a0 = SmallDot6Scale(255 - a, mod64);
         unsigned a1 = SmallDot6Scale(a, mod64);
         this->getBlitter()->blitAntiV2(x, lower_y - 1, a0, a1);
-        
+
         return fy + dy - SK_Fixed1/2;
     }
-    
+
     SkFixed drawLine(int x, int stopx, SkFixed fy, SkFixed dy) override {
         SkASSERT(x < stopx);
-        
+
         fy += SK_Fixed1/2;
         SkBlitter* blitter = this->getBlitter();
         do {
@@ -176,7 +176,7 @@ public:
             blitter->blitAntiV2(x, lower_y - 1, 255 - a, a);
             fy += dy;
         } while (++x < stopx);
-        
+
         return fy - SK_Fixed1/2;
     }
 };
@@ -226,15 +226,15 @@ class Vertish_SkAntiHairBlitter : public SkAntiHairBlitter {
 public:
     SkFixed drawCap(int y, SkFixed fx, SkFixed dx, int mod64) override {
         fx += SK_Fixed1/2;
-        
+
         int x = fx >> 16;
         uint8_t a = (uint8_t)(fx >> 8);
         this->getBlitter()->blitAntiH2(x - 1, y,
                                        SmallDot6Scale(255 - a, mod64), SmallDot6Scale(a, mod64));
-        
+
         return fx + dx - SK_Fixed1/2;
     }
-    
+
     SkFixed drawLine(int y, int stopy, SkFixed fx, SkFixed dx) override {
         SkASSERT(y < stopy);
         fx += SK_Fixed1/2;
@@ -244,7 +244,7 @@ public:
             this->getBlitter()->blitAntiH2(x - 1, y, 255 - a, a);
             fx += dx;
         } while (++y < stopy);
-        
+
         return fx - SK_Fixed1/2;
     }
 };
@@ -540,7 +540,7 @@ void SkScan::AntiHairLineRgn(const SkPoint array[], int arrayCount, const SkRegi
         clipBounds.set(clip->getBounds());
         /*  We perform integral clipping later on, but we do a scalar clip first
          to ensure that our coordinates are expressible in fixed/integers.
-         
+
          antialiased hairlines can draw up to 1/2 of a pixel outside of
          their bounds, so we need to outset the clip before calling the
          clipper. To make the numerics safer, we outset by a whole pixel,
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index ca67469808..4db82a5070 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -714,9 +714,9 @@ void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
         if (sc) {
             uint16_t dc = *dst;
             unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
-            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
-            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
-            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
+            unsigned dr = (SkPacked32ToR16(sc) * alpha) + (SkGetPackedR16(dc) * dst_scale);
+            unsigned dg = (SkPacked32ToG16(sc) * alpha) + (SkGetPackedG16(dc) * dst_scale);
+            unsigned db = (SkPacked32ToB16(sc) * alpha) + (SkGetPackedB16(dc) * dst_scale);
             *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
         }
         dst += 1;
diff --git a/src/opts/SkBlitRow_opts_mips_dsp.cpp b/src/opts/SkBlitRow_opts_mips_dsp.cpp
index 869a04a4af..c6747f0960 100644
--- a/src/opts/SkBlitRow_opts_mips_dsp.cpp
+++ b/src/opts/SkBlitRow_opts_mips_dsp.cpp
@@ -753,12 +753,9 @@ static void S32A_D565_Blend_mips_dsp(uint16_t* SK_RESTRICT dst,
         if (sc) {
             uint16_t dc = *dst;
             unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
-            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) +
-                          SkMulS16(SkGetPackedR16(dc), dst_scale);
-            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) +
-                          SkMulS16(SkGetPackedG16(dc), dst_scale);
-            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) +
-                          SkMulS16(SkGetPackedB16(dc), dst_scale);
+            unsigned dr = (SkPacked32ToR16(sc) * alpha) + (SkGetPackedR16(dc) * dst_scale);
+            unsigned dg = (SkPacked32ToG16(sc) * alpha) + (SkGetPackedG16(dc) * dst_scale);
+            unsigned db = (SkPacked32ToB16(sc) * alpha) + (SkGetPackedB16(dc) * dst_scale);
             *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
         }
         dst += 1;