ARM Skia NEON patches - 11 - Blitter_RGB16

Blitter_RGB16: fixes and improvements - fix alpha calculation: it was still using the old version of SkAlpha255To256. 11 more tests pass in gm. - clean a lot the code: the existing code was "a bit" messy with a lot of duplicated hardcoded constants, got rid of all this. - improve speed a little: part of it as a side-effect of the change in the way alpha is calculated but also by grouping loads and stores. One "issue" was present and still remains: the NEON code doesn't give the same result as the black blitter on black. It accounts for dozens of mismatches in gm. Is this considered "not too bad"? Would you be interested in a NEON version of the black blitter? The current comments seem to indicate that the black blitter is here only to give a performance boost when NEON is not presents so I didn't write a NEON version. BUG= R=djsollen@google.com, tomhudson@google.com, reed@google.com Author: kevin.petit.arm@gmail.com Review URL: https://chromiumcodereview.appspot.com/18666005 git-svn-id: http://skia.googlecode.com/svn/trunk@10635 2bbb7eff-a529-9590-31e7-b0007b416f81
2013-08-08 10:51:45 +00:00 · 2013-08-08 10:51:45 +00:00 · 641a249196
commit 641a249196
parent d55e357a8e
1 changed files with 37 additions and 47 deletions
--- a/src/core/SkBlitter_RGB16.cpp
+++ b/src/core/SkBlitter_RGB16.cpp
@ -390,63 +390,53 @@ void SkRGB16_Opaque_Blitter::blitMask(const SkMask& mask,
    do {
        int w = width;
        if (w >= UNROLL) {
-            uint32x4_t color;        /* can use same one */
-            uint32x4_t dev_lo, dev_hi;
-            uint32x4_t t1;
-            uint32x4_t wn1, wn2;
-            uint16x4_t odev_lo, odev_hi;
-            uint16x4_t alpha_lo, alpha_hi;
-            uint16x8_t  alpha_full;
+            uint32x4_t color, dev_lo, dev_hi;
+            uint32x4_t wn1, wn2, tmp;
+            uint32x4_t vmask_g16, vmask_ng16;
+            uint16x8_t valpha, vdev;
+            uint16x4_t odev_lo, odev_hi, valpha_lo, valpha_hi;

+            // prepare constants
+            vmask_g16 = vdupq_n_u32(SK_G16_MASK_IN_PLACE);
+            vmask_ng16 = vdupq_n_u32(~SK_G16_MASK_IN_PLACE);
            color = vdupq_n_u32(expanded32);

            do {
-                /* alpha is 8x8, widen and split to get pair of 16x4's */
-                alpha_full = vmovl_u8(vld1_u8(alpha));
-                alpha_full = vaddq_u16(alpha_full, vshrq_n_u16(alpha_full,7));
-                alpha_full = vshrq_n_u16(alpha_full, 3);
-                alpha_lo = vget_low_u16(alpha_full);
-                alpha_hi = vget_high_u16(alpha_full);
+                // alpha is 8x8, widen and split to get a pair of 16x4
+                valpha = vaddw_u8(vdupq_n_u16(1), vld1_u8(alpha));
+                valpha = vshrq_n_u16(valpha, 3);
+                valpha_lo = vget_low_u16(valpha);
+                valpha_hi = vget_high_u16(valpha);

-                dev_lo = vmovl_u16(vld1_u16(device));
-                dev_hi = vmovl_u16(vld1_u16(device+4));
+                // load pixels
+                vdev = vld1q_u16(device);
+                dev_lo = vmovl_u16(vget_low_u16(vdev));
+                dev_hi = vmovl_u16(vget_high_u16(vdev));

-                /* unpack in 32 bits */
-                dev_lo = vorrq_u32(
-                                   vandq_u32(dev_lo, vdupq_n_u32(0x0000F81F)),
-                                   vshlq_n_u32(vandq_u32(dev_lo,
-                                                         vdupq_n_u32(0x000007E0)),
-                                               16)
-                                   );
-                dev_hi = vorrq_u32(
-                                   vandq_u32(dev_hi, vdupq_n_u32(0x0000F81F)),
-                                   vshlq_n_u32(vandq_u32(dev_hi,
-                                                         vdupq_n_u32(0x000007E0)),
-                                               16)
-                                   );
+                // unpack them in 32 bits
+                dev_lo = (dev_lo & vmask_ng16) | vshlq_n_u32(dev_lo & vmask_g16, 16);
+                dev_hi = (dev_hi & vmask_ng16) | vshlq_n_u32(dev_hi & vmask_g16, 16);

-                /* blend the two */
-                t1 = vmulq_u32(vsubq_u32(color, dev_lo), vmovl_u16(alpha_lo));
-                t1 = vshrq_n_u32(t1, 5);
-                dev_lo = vaddq_u32(dev_lo, t1);
+                // blend with color
+                tmp = (color - dev_lo) * vmovl_u16(valpha_lo);
+                tmp = vshrq_n_u32(tmp, 5);
+                dev_lo += tmp;

-                t1 = vmulq_u32(vsubq_u32(color, dev_hi), vmovl_u16(alpha_hi));
-                t1 = vshrq_n_u32(t1, 5);
-                dev_hi = vaddq_u32(dev_hi, t1);
+                tmp = vmulq_u32(color - dev_hi, vmovl_u16(valpha_hi));
+                tmp = vshrq_n_u32(tmp, 5);
+                dev_hi += tmp;

-                /* re-compact and store */
-                wn1 = vandq_u32(dev_lo, vdupq_n_u32(0x0000F81F)),
-                wn2 = vshrq_n_u32(dev_lo, 16);
-                wn2 = vandq_u32(wn2, vdupq_n_u32(0x000007E0));
-                odev_lo = vmovn_u32(vorrq_u32(wn1, wn2));
+                // re-compact
+                wn1 = dev_lo & vmask_ng16;
+                wn2 = vshrq_n_u32(dev_lo, 16) & vmask_g16;
+                odev_lo = vmovn_u32(wn1 | wn2);

-                wn1 = vandq_u32(dev_hi, vdupq_n_u32(0x0000F81F)),
-                wn2 = vshrq_n_u32(dev_hi, 16);
-                wn2 = vandq_u32(wn2, vdupq_n_u32(0x000007E0));
-                odev_hi = vmovn_u32(vorrq_u32(wn1, wn2));
+                wn1 = dev_hi & vmask_ng16;
+                wn2 = vshrq_n_u32(dev_hi, 16) & vmask_g16;
+                odev_hi = vmovn_u32(wn1 | wn2);

-                vst1_u16(device, odev_lo);
-                vst1_u16(device+4, odev_hi);
+                // store
+                vst1q_u16(device, vcombine_u16(odev_lo, odev_hi));

                device += UNROLL;
                alpha += UNROLL;
@ -454,7 +444,7 @@ void SkRGB16_Opaque_Blitter::blitMask(const SkMask& mask,
            } while (w >= UNROLL);
        }

-        /* residuals (which is everything if we have no neon) */
+        // residuals
        while (w > 0) {
            *device = blend_compact(expanded32, SkExpand_rgb_16(*device),
                                    SkAlpha255To256(*alpha++) >> 3);