ARM Skia NEON patches - 11 - Blitter_RGB16

Blitter_RGB16: fixes and improvements

- fix alpha calculation: it was still using the old version of
  SkAlpha255To256. 11 more tests pass in gm.

- clean a lot the code: the existing code was "a bit" messy with
  a lot of duplicated hardcoded constants, got rid of all this.

- improve speed a little: part of it as a side-effect of the change
  in the way alpha is calculated but also by grouping loads and stores.

One "issue" was present and still remains: the NEON code doesn't give
the same result as the black blitter on black. It accounts for dozens
of mismatches in gm. Is this considered "not too bad"? Would you be
interested in a NEON version of the black blitter? The current
comments seem to indicate that the black blitter is here only to give
a performance boost when NEON is not presents so I didn't write a NEON
version.

BUG=
R=djsollen@google.com, tomhudson@google.com, reed@google.com

Author: kevin.petit.arm@gmail.com

Review URL: https://chromiumcodereview.appspot.com/18666005

git-svn-id: http://skia.googlecode.com/svn/trunk@10635 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
commit-bot@chromium.org 2013-08-08 10:51:45 +00:00
parent d55e357a8e
commit 641a249196

View File

@ -390,63 +390,53 @@ void SkRGB16_Opaque_Blitter::blitMask(const SkMask& mask,
do {
int w = width;
if (w >= UNROLL) {
uint32x4_t color; /* can use same one */
uint32x4_t dev_lo, dev_hi;
uint32x4_t t1;
uint32x4_t wn1, wn2;
uint16x4_t odev_lo, odev_hi;
uint16x4_t alpha_lo, alpha_hi;
uint16x8_t alpha_full;
uint32x4_t color, dev_lo, dev_hi;
uint32x4_t wn1, wn2, tmp;
uint32x4_t vmask_g16, vmask_ng16;
uint16x8_t valpha, vdev;
uint16x4_t odev_lo, odev_hi, valpha_lo, valpha_hi;
// prepare constants
vmask_g16 = vdupq_n_u32(SK_G16_MASK_IN_PLACE);
vmask_ng16 = vdupq_n_u32(~SK_G16_MASK_IN_PLACE);
color = vdupq_n_u32(expanded32);
do {
/* alpha is 8x8, widen and split to get pair of 16x4's */
alpha_full = vmovl_u8(vld1_u8(alpha));
alpha_full = vaddq_u16(alpha_full, vshrq_n_u16(alpha_full,7));
alpha_full = vshrq_n_u16(alpha_full, 3);
alpha_lo = vget_low_u16(alpha_full);
alpha_hi = vget_high_u16(alpha_full);
// alpha is 8x8, widen and split to get a pair of 16x4
valpha = vaddw_u8(vdupq_n_u16(1), vld1_u8(alpha));
valpha = vshrq_n_u16(valpha, 3);
valpha_lo = vget_low_u16(valpha);
valpha_hi = vget_high_u16(valpha);
dev_lo = vmovl_u16(vld1_u16(device));
dev_hi = vmovl_u16(vld1_u16(device+4));
// load pixels
vdev = vld1q_u16(device);
dev_lo = vmovl_u16(vget_low_u16(vdev));
dev_hi = vmovl_u16(vget_high_u16(vdev));
/* unpack in 32 bits */
dev_lo = vorrq_u32(
vandq_u32(dev_lo, vdupq_n_u32(0x0000F81F)),
vshlq_n_u32(vandq_u32(dev_lo,
vdupq_n_u32(0x000007E0)),
16)
);
dev_hi = vorrq_u32(
vandq_u32(dev_hi, vdupq_n_u32(0x0000F81F)),
vshlq_n_u32(vandq_u32(dev_hi,
vdupq_n_u32(0x000007E0)),
16)
);
// unpack them in 32 bits
dev_lo = (dev_lo & vmask_ng16) | vshlq_n_u32(dev_lo & vmask_g16, 16);
dev_hi = (dev_hi & vmask_ng16) | vshlq_n_u32(dev_hi & vmask_g16, 16);
/* blend the two */
t1 = vmulq_u32(vsubq_u32(color, dev_lo), vmovl_u16(alpha_lo));
t1 = vshrq_n_u32(t1, 5);
dev_lo = vaddq_u32(dev_lo, t1);
// blend with color
tmp = (color - dev_lo) * vmovl_u16(valpha_lo);
tmp = vshrq_n_u32(tmp, 5);
dev_lo += tmp;
t1 = vmulq_u32(vsubq_u32(color, dev_hi), vmovl_u16(alpha_hi));
t1 = vshrq_n_u32(t1, 5);
dev_hi = vaddq_u32(dev_hi, t1);
tmp = vmulq_u32(color - dev_hi, vmovl_u16(valpha_hi));
tmp = vshrq_n_u32(tmp, 5);
dev_hi += tmp;
/* re-compact and store */
wn1 = vandq_u32(dev_lo, vdupq_n_u32(0x0000F81F)),
wn2 = vshrq_n_u32(dev_lo, 16);
wn2 = vandq_u32(wn2, vdupq_n_u32(0x000007E0));
odev_lo = vmovn_u32(vorrq_u32(wn1, wn2));
// re-compact
wn1 = dev_lo & vmask_ng16;
wn2 = vshrq_n_u32(dev_lo, 16) & vmask_g16;
odev_lo = vmovn_u32(wn1 | wn2);
wn1 = vandq_u32(dev_hi, vdupq_n_u32(0x0000F81F)),
wn2 = vshrq_n_u32(dev_hi, 16);
wn2 = vandq_u32(wn2, vdupq_n_u32(0x000007E0));
odev_hi = vmovn_u32(vorrq_u32(wn1, wn2));
wn1 = dev_hi & vmask_ng16;
wn2 = vshrq_n_u32(dev_hi, 16) & vmask_g16;
odev_hi = vmovn_u32(wn1 | wn2);
vst1_u16(device, odev_lo);
vst1_u16(device+4, odev_hi);
// store
vst1q_u16(device, vcombine_u16(odev_lo, odev_hi));
device += UNROLL;
alpha += UNROLL;
@ -454,7 +444,7 @@ void SkRGB16_Opaque_Blitter::blitMask(const SkMask& mask,
} while (w >= UNROLL);
}
/* residuals (which is everything if we have no neon) */
// residuals
while (w > 0) {
*device = blend_compact(expanded32, SkExpand_rgb_16(*device),
SkAlpha255To256(*alpha++) >> 3);