ARM Skia NEON patches - 23 - S32_D565_Opaque_Dither cleanup/bugfix/speed
BlitRow565: S32_D565_Opaque_Dither: cleaning / bugfix This patch brings a little code cleaning (spaces/comments) and a little speed improvement (by using post-incrementation in the asm) but more importantly it fixes a bug on Linux. The new code now supports ARGB as well as ABGR. I removed the comment as I have confirmed with benchmarks that this code bring a *massive* (3x-7x) speedup compared to the C code. Signed-off-by: Kévin PETIT <kevin.petit@arm.com> BUG= R=djsollen@google.com, mtklein@google.com Author: kevin.petit.arm@gmail.com Review URL: https://chromiumcodereview.appspot.com/22269003 git-svn-id: http://skia.googlecode.com/svn/trunk@11339 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
parent
8c960bf965
commit
688d362b45
@ -1125,13 +1125,6 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/* 2009/10/27: RBE says "a work in progress"; debugging says ok;
|
||||
* speedup untested, but ARM version is 26 insns/iteration and
|
||||
* this NEON version is 21 insns/iteration-of-8 (2.62insns/element)
|
||||
* which is 10x the native version; that's pure instruction counts,
|
||||
* not accounting for any instruction or memory latencies.
|
||||
*/
|
||||
|
||||
#undef DEBUG_S32_OPAQUE_DITHER
|
||||
|
||||
void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
|
||||
@ -1150,18 +1143,23 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
|
||||
uint16x8_t dr, dg, db;
|
||||
uint16x8_t dst8;
|
||||
|
||||
/* source is in ABGR ordering (R == lsb) */
|
||||
{
|
||||
register uint8x8_t d0 asm("d0");
|
||||
register uint8x8_t d1 asm("d1");
|
||||
register uint8x8_t d2 asm("d2");
|
||||
register uint8x8_t d3 asm("d3");
|
||||
|
||||
asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */"
|
||||
: "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
|
||||
: "r" (src)
|
||||
);
|
||||
sr = d0; sg = d1; sb = d2;
|
||||
asm (
|
||||
"vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
|
||||
: "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
|
||||
:
|
||||
);
|
||||
sg = d1;
|
||||
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
|
||||
sr = d2; sb = d0;
|
||||
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
|
||||
sr = d0; sb = d2;
|
||||
#endif
|
||||
}
|
||||
/* XXX: if we want to prefetch, hide it in the above asm()
|
||||
* using the gcc __builtin_prefetch(), the prefetch will
|
||||
@ -1169,34 +1167,34 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
|
||||
* at the top of the loop, just after the vld4.
|
||||
*/
|
||||
|
||||
/* sr = sr - (sr>>5) + d */
|
||||
// sr = sr - (sr>>5) + d
|
||||
sr = vsub_u8(sr, vshr_n_u8(sr, 5));
|
||||
dr = vaddl_u8(sr, d);
|
||||
|
||||
/* sb = sb - (sb>>5) + d */
|
||||
// sb = sb - (sb>>5) + d
|
||||
sb = vsub_u8(sb, vshr_n_u8(sb, 5));
|
||||
db = vaddl_u8(sb, d);
|
||||
|
||||
/* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
|
||||
// sg = sg - (sg>>6) + d>>1; similar logic for overflows
|
||||
sg = vsub_u8(sg, vshr_n_u8(sg, 6));
|
||||
dg = vaddl_u8(sg, vshr_n_u8(d,1));
|
||||
/* XXX: check that the "d>>1" here is hoisted */
|
||||
dg = vaddl_u8(sg, vshr_n_u8(d, 1));
|
||||
|
||||
/* pack high bits of each into 565 format (rgb, b is lsb) */
|
||||
// pack high bits of each into 565 format (rgb, b is lsb)
|
||||
dst8 = vshrq_n_u16(db, 3);
|
||||
dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
|
||||
dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);
|
||||
dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
|
||||
|
||||
/* store it */
|
||||
// store it
|
||||
vst1q_u16(dst, dst8);
|
||||
|
||||
#if defined(DEBUG_S32_OPAQUE_DITHER)
|
||||
/* always good to know if we generated good results */
|
||||
// always good to know if we generated good results
|
||||
{
|
||||
int i, myx = x, myy = y;
|
||||
DITHER_565_SCAN(myy);
|
||||
for (i=0;i<UNROLL;i++) {
|
||||
SkPMColor c = src[i];
|
||||
// the '!' in the asm block above post-incremented src by the 8 pixels it reads.
|
||||
SkPMColor c = src[i-8];
|
||||
unsigned dither = DITHER_VALUE(myx);
|
||||
uint16_t val = SkDitherRGB32To565(c, dither);
|
||||
if (val != dst[i]) {
|
||||
@ -1209,14 +1207,14 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
|
||||
#endif
|
||||
|
||||
dst += UNROLL;
|
||||
src += UNROLL;
|
||||
// we don't need to increment src as the asm above has already done it
|
||||
count -= UNROLL;
|
||||
x += UNROLL; /* probably superfluous */
|
||||
x += UNROLL; // probably superfluous
|
||||
}
|
||||
}
|
||||
#undef UNROLL
|
||||
|
||||
/* residuals */
|
||||
// residuals
|
||||
if (count > 0) {
|
||||
DITHER_565_SCAN(y);
|
||||
do {
|
||||
|
Loading…
Reference in New Issue
Block a user