Update ARM and NEON optimizations for S32A_Opaque_BlitRow32.
These patches replace those written by ARM with ones provided by NVIDIA. Review URL: https://codereview.appspot.com/6465075 git-svn-id: http://skia.googlecode.com/svn/trunk@5364 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
parent
ed01f12d13
commit
dc1a3badc7
@ -21,25 +21,6 @@ static const char* gConfigName[] = {
|
||||
"ERROR", "a1", "a8", "index8", "565", "4444", "8888"
|
||||
};
|
||||
|
||||
static void drawIntoBitmap(const SkBitmap& bm) {
|
||||
const int w = bm.width();
|
||||
const int h = bm.height();
|
||||
|
||||
SkCanvas canvas(bm);
|
||||
SkPaint p;
|
||||
p.setAntiAlias(true);
|
||||
p.setColor(SK_ColorRED);
|
||||
canvas.drawCircle(SkIntToScalar(w)/2, SkIntToScalar(h)/2,
|
||||
SkIntToScalar(SkMin32(w, h))*3/8, p);
|
||||
|
||||
SkRect r;
|
||||
r.set(0, 0, SkIntToScalar(w), SkIntToScalar(h));
|
||||
p.setStyle(SkPaint::kStroke_Style);
|
||||
p.setStrokeWidth(SkIntToScalar(4));
|
||||
p.setColor(SK_ColorBLUE);
|
||||
canvas.drawRect(r, p);
|
||||
}
|
||||
|
||||
/**
 *  Scales x by 255/5 using integer arithmetic, so 0 maps to 0 and 5 maps to
 *  255 (the multiplication happens before the truncating division).
 */
static int conv6ToByte(int x) {
    const int scaled = x * 0xFF;
    return scaled / 5;
}
|
||||
@ -102,38 +83,23 @@ class BitmapBench : public SkBenchmark {
|
||||
bool fIsOpaque;
|
||||
bool fForceUpdate; //bitmap marked as dirty before each draw. forces bitmap to be updated on device cache
|
||||
int fTileX, fTileY; // -1 means don't use shader
|
||||
bool fIsVolatile;
|
||||
SkBitmap::Config fConfig;
|
||||
SkString fName;
|
||||
enum { N = SkBENCHLOOP(300) };
|
||||
enum { W = 128 };
|
||||
enum { H = 128 };
|
||||
public:
|
||||
BitmapBench(void* param, bool isOpaque, SkBitmap::Config c,
|
||||
bool forceUpdate = false, bool bitmapVolatile = false,
|
||||
int tx = -1, int ty = -1)
|
||||
: INHERITED(param), fIsOpaque(isOpaque), fForceUpdate(forceUpdate), fTileX(tx), fTileY(ty) {
|
||||
const int w = 128;
|
||||
const int h = 128;
|
||||
SkBitmap bm;
|
||||
|
||||
if (SkBitmap::kIndex8_Config == c) {
|
||||
bm.setConfig(SkBitmap::kARGB_8888_Config, w, h);
|
||||
} else {
|
||||
bm.setConfig(c, w, h);
|
||||
}
|
||||
bm.allocPixels();
|
||||
bm.eraseColor(isOpaque ? SK_ColorBLACK : 0);
|
||||
|
||||
drawIntoBitmap(bm);
|
||||
|
||||
if (SkBitmap::kIndex8_Config == c) {
|
||||
convertToIndex666(bm, &fBitmap);
|
||||
} else {
|
||||
fBitmap = bm;
|
||||
}
|
||||
|
||||
if (fBitmap.getColorTable()) {
|
||||
fBitmap.getColorTable()->setIsOpaque(isOpaque);
|
||||
}
|
||||
fBitmap.setIsOpaque(isOpaque);
|
||||
fBitmap.setIsVolatile(bitmapVolatile);
|
||||
: INHERITED(param)
|
||||
, fIsOpaque(isOpaque)
|
||||
, fForceUpdate(forceUpdate)
|
||||
, fIsVolatile(bitmapVolatile)
|
||||
, fTileX(tx)
|
||||
, fTileY(ty)
|
||||
, fConfig(c) {
|
||||
}
|
||||
|
||||
protected:
|
||||
@ -145,16 +111,43 @@ protected:
|
||||
fName.appendf("_%s", gTileName[fTileY]);
|
||||
}
|
||||
}
|
||||
fName.appendf("_%s%s", gConfigName[fBitmap.config()],
|
||||
fName.appendf("_%s%s", gConfigName[fConfig],
|
||||
fIsOpaque ? "" : "_A");
|
||||
if (fForceUpdate)
|
||||
fName.append("_update");
|
||||
if (fBitmap.isVolatile())
|
||||
if (fIsVolatile)
|
||||
fName.append("_volatile");
|
||||
|
||||
return fName.c_str();
|
||||
}
|
||||
|
||||
virtual void onPreDraw() {
|
||||
SkBitmap bm;
|
||||
|
||||
if (SkBitmap::kIndex8_Config == fConfig) {
|
||||
bm.setConfig(SkBitmap::kARGB_8888_Config, W, H);
|
||||
} else {
|
||||
bm.setConfig(fConfig, W, H);
|
||||
}
|
||||
|
||||
bm.allocPixels();
|
||||
bm.eraseColor(fIsOpaque ? SK_ColorBLACK : 0);
|
||||
|
||||
onDrawIntoBitmap(bm);
|
||||
|
||||
if (SkBitmap::kIndex8_Config == fConfig) {
|
||||
convertToIndex666(bm, &fBitmap);
|
||||
} else {
|
||||
fBitmap = bm;
|
||||
}
|
||||
|
||||
if (fBitmap.getColorTable()) {
|
||||
fBitmap.getColorTable()->setIsOpaque(fIsOpaque);
|
||||
}
|
||||
fBitmap.setIsOpaque(fIsOpaque);
|
||||
fBitmap.setIsVolatile(fIsVolatile);
|
||||
}
|
||||
|
||||
virtual void onDraw(SkCanvas* canvas) {
|
||||
SkIPoint dim = this->getSize();
|
||||
SkRandom rand;
|
||||
@ -177,6 +170,25 @@ protected:
|
||||
}
|
||||
}
|
||||
|
||||
virtual void onDrawIntoBitmap(const SkBitmap& bm) {
|
||||
const int w = bm.width();
|
||||
const int h = bm.height();
|
||||
|
||||
SkCanvas canvas(bm);
|
||||
SkPaint p;
|
||||
p.setAntiAlias(true);
|
||||
p.setColor(SK_ColorRED);
|
||||
canvas.drawCircle(SkIntToScalar(w)/2, SkIntToScalar(h)/2,
|
||||
SkIntToScalar(SkMin32(w, h))*3/8, p);
|
||||
|
||||
SkRect r;
|
||||
r.set(0, 0, SkIntToScalar(w), SkIntToScalar(h));
|
||||
p.setStyle(SkPaint::kStroke_Style);
|
||||
p.setStrokeWidth(SkIntToScalar(4));
|
||||
p.setColor(SK_ColorBLUE);
|
||||
canvas.drawRect(r, p);
|
||||
}
|
||||
|
||||
private:
|
||||
typedef SkBenchmark INHERITED;
|
||||
};
|
||||
@ -241,6 +253,95 @@ private:
|
||||
typedef BitmapBench INHERITED;
|
||||
};
|
||||
|
||||
/** Verify optimizations that test source alpha values. */
|
||||
|
||||
class SourceAlphaBitmapBench : public BitmapBench {
|
||||
public:
|
||||
enum SourceAlpha { kOpaque_SourceAlpha, kTransparent_SourceAlpha,
|
||||
kTwoStripes_SourceAlpha, kThreeStripes_SourceAlpha};
|
||||
private:
|
||||
SkString fFullName;
|
||||
SourceAlpha fSourceAlpha;
|
||||
public:
|
||||
SourceAlphaBitmapBench(void* param, SourceAlpha alpha, SkBitmap::Config c,
|
||||
bool forceUpdate = false, bool bitmapVolatile = false,
|
||||
int tx = -1, int ty = -1)
|
||||
: INHERITED(param, false, c, forceUpdate, bitmapVolatile, tx, ty)
|
||||
, fSourceAlpha(alpha) {
|
||||
}
|
||||
|
||||
protected:
|
||||
virtual const char* onGetName() {
|
||||
fFullName.set(INHERITED::onGetName());
|
||||
|
||||
if (fSourceAlpha == kOpaque_SourceAlpha) {
|
||||
fFullName.append("_source_opaque");
|
||||
} else if (fSourceAlpha == kTransparent_SourceAlpha) {
|
||||
fFullName.append("_source_transparent");
|
||||
} else if (fSourceAlpha == kTwoStripes_SourceAlpha) {
|
||||
fFullName.append("_source_stripes_two");
|
||||
} else if (fSourceAlpha == kThreeStripes_SourceAlpha) {
|
||||
fFullName.append("_source_stripes_three");
|
||||
}
|
||||
|
||||
return fFullName.c_str();
|
||||
}
|
||||
|
||||
virtual void onDrawIntoBitmap(const SkBitmap& bm) SK_OVERRIDE {
|
||||
const int w = bm.width();
|
||||
const int h = bm.height();
|
||||
|
||||
if (kOpaque_SourceAlpha == fSourceAlpha) {
|
||||
bm.eraseColor(SK_ColorBLACK);
|
||||
} else if (kTransparent_SourceAlpha == fSourceAlpha) {
|
||||
bm.eraseColor(0);
|
||||
} else if (kTwoStripes_SourceAlpha == fSourceAlpha) {
|
||||
bm.eraseColor(0);
|
||||
|
||||
SkCanvas canvas(bm);
|
||||
SkPaint p;
|
||||
p.setAntiAlias(false);
|
||||
p.setStyle(SkPaint::kFill_Style);
|
||||
p.setColor(SK_ColorRED);
|
||||
|
||||
// Draw red vertical stripes on transparent background
|
||||
SkRect r;
|
||||
for (int x = 0; x < w; x+=2)
|
||||
{
|
||||
r.set(SkIntToScalar(x), 0, SkIntToScalar(x+1), SkIntToScalar(h));
|
||||
canvas.drawRect(r, p);
|
||||
}
|
||||
|
||||
} else if (kThreeStripes_SourceAlpha == fSourceAlpha) {
|
||||
bm.eraseColor(0);
|
||||
|
||||
SkCanvas canvas(bm);
|
||||
SkPaint p;
|
||||
p.setAntiAlias(false);
|
||||
p.setStyle(SkPaint::kFill_Style);
|
||||
|
||||
// Draw vertical stripes on transparent background with a pattern
|
||||
// where the first pixel is fully transparent, the next is semi-transparent
|
||||
// and the third is fully opaque.
|
||||
SkRect r;
|
||||
for (int x = 0; x < w; x++)
|
||||
{
|
||||
if (x % 3 == 0) {
|
||||
continue; // Keep transparent
|
||||
} else if (x % 3 == 1) {
|
||||
p.setColor(SkColorSetARGB(127, 127, 127, 127)); // Semi-transparent
|
||||
} else if (x % 3 == 2) {
|
||||
p.setColor(SK_ColorRED); // Opaque
|
||||
}
|
||||
r.set(SkIntToScalar(x), 0, SkIntToScalar(x+1), SkIntToScalar(h));
|
||||
canvas.drawRect(r, p);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
typedef BitmapBench INHERITED;
|
||||
};
|
||||
// Factories for plain BitmapBench instances; the arguments after p are
// (isOpaque, config).
static SkBenchmark* Fact0(void* p) {
    return new BitmapBench(p, false, SkBitmap::kARGB_8888_Config);
}
static SkBenchmark* Fact1(void* p) {
    return new BitmapBench(p, true, SkBitmap::kARGB_8888_Config);
}
static SkBenchmark* Fact2(void* p) {
    return new BitmapBench(p, true, SkBitmap::kRGB_565_Config);
}
|
||||
@ -263,6 +364,12 @@ static SkBenchmark* Fact14(void* p) { return new FilterBitmapBench(p, true, SkBi
|
||||
// Factories for two FilterBitmapBench variants that differ only in the
// fifth constructor argument.
static SkBenchmark* Fact15(void* p) {
    return new FilterBitmapBench(p, true, SkBitmap::kARGB_8888_Config,
                                 true, true, -1, -1, true, true, true);
}
static SkBenchmark* Fact16(void* p) {
    return new FilterBitmapBench(p, true, SkBitmap::kARGB_8888_Config,
                                 true, false, -1, -1, true, true, true);
}

// source alpha tests -> S32A_Opaque_BlitRow32_{arm,neon}
// One factory per SourceAlphaBitmapBench::SourceAlpha pattern, all using
// 8888 bitmaps.
static SkBenchmark* Fact17(void* p) {
    return new SourceAlphaBitmapBench(p, SourceAlphaBitmapBench::kOpaque_SourceAlpha,
                                      SkBitmap::kARGB_8888_Config);
}
static SkBenchmark* Fact18(void* p) {
    return new SourceAlphaBitmapBench(p, SourceAlphaBitmapBench::kTransparent_SourceAlpha,
                                      SkBitmap::kARGB_8888_Config);
}
static SkBenchmark* Fact19(void* p) {
    return new SourceAlphaBitmapBench(p, SourceAlphaBitmapBench::kTwoStripes_SourceAlpha,
                                      SkBitmap::kARGB_8888_Config);
}
static SkBenchmark* Fact20(void* p) {
    return new SourceAlphaBitmapBench(p, SourceAlphaBitmapBench::kThreeStripes_SourceAlpha,
                                      SkBitmap::kARGB_8888_Config);
}
|
||||
|
||||
// File-scope BenchRegistry objects, each constructed with one of the factory
// functions above. Presumably this is how the bench runner discovers the
// benchmarks -- confirm against BenchRegistry.
static BenchRegistry gReg0(Fact0);
|
||||
static BenchRegistry gReg1(Fact1);
|
||||
static BenchRegistry gReg2(Fact2);
|
||||
@ -283,3 +390,7 @@ static BenchRegistry gReg14(Fact14);
|
||||
// File-scope BenchRegistry objects for the FilterBitmapBench factories
// (Fact15, Fact16).
static BenchRegistry gReg15(Fact15);
|
||||
static BenchRegistry gReg16(Fact16);
|
||||
|
||||
// Registry objects for the source-alpha factories (Fact17..Fact20), one per
// SourceAlphaBitmapBench pattern.
static BenchRegistry gReg17(Fact17);
|
||||
static BenchRegistry gReg18(Fact18);
|
||||
static BenchRegistry gReg19(Fact19);
|
||||
static BenchRegistry gReg20(Fact20);
|
||||
|
@ -185,6 +185,306 @@ static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
|
||||
: "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
static void __attribute__((naked)) S32A_Opaque_BlitRow32_arm_src_alpha
|
||||
(SkPMColor* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src,
|
||||
int count, U8CPU alpha) {
|
||||
|
||||
/* Optimizes for alpha == 0, alpha == 255, and 1 < alpha < 255 cases individually */
|
||||
/* Predicts that the next pixel will have the same alpha type as the current pixel */
|
||||
|
||||
asm volatile (
|
||||
|
||||
"\tSTMDB r13!, {r4-r12, r14} \n" /* saving r4-r12, lr on the stack */
|
||||
/* we should not save r0-r3 according to ABI */
|
||||
|
||||
"\tCMP r2, #0 \n" /* if (count == 0) */
|
||||
"\tBEQ 9f \n" /* go to EXIT */
|
||||
|
||||
"\tMOV r12, #0xff \n" /* load the 0xff mask in r12 */
|
||||
"\tORR r12, r12, r12, LSL #16 \n" /* convert it to 0xff00ff in r12 */
|
||||
|
||||
"\tMOV r14, #255 \n" /* r14 = 255 */
|
||||
/* will be used later for left-side comparison */
|
||||
|
||||
"\tADD r2, %[src], r2, LSL #2 \n" /* r2 points to last array element which can be used */
|
||||
"\tSUB r2, r2, #16 \n" /* as a base for 4-way processing algorithm */
|
||||
|
||||
"\tCMP %[src], r2 \n" /* if our current [src] array pointer is bigger than */
|
||||
"\tBGT 8f \n" /* calculated marker for 4-way -> */
|
||||
/* use simple one-by-one processing */
|
||||
|
||||
/* START OF DISPATCHING BLOCK */
|
||||
|
||||
"\t0: \n"
|
||||
|
||||
"\tLDM %[src]!, {r3, r4, r5, r6} \n" /* 4-way loading of source values to r3-r6 */
|
||||
|
||||
"\tLSR r7, r3, #24 \n" /* if not all src alphas of 4-way block are equal -> */
|
||||
"\tCMP r7, r4, LSR #24 \n"
|
||||
"\tCMPEQ r7, r5, LSR #24 \n"
|
||||
"\tCMPEQ r7, r6, LSR #24 \n"
|
||||
"\tBNE 1f \n" /* -> go to general 4-way processing routine */
|
||||
|
||||
"\tCMP r14, r7 \n" /* if all src alphas are equal to 255 */
|
||||
"\tBEQ 3f \n" /* go to alpha == 255 optimized routine */
|
||||
|
||||
"\tCMP r7, #0 \n" /* if all src alphas are equal to 0 */
|
||||
"\tBEQ 6f \n" /* go to alpha == 0 optimized routine */
|
||||
|
||||
/* END OF DISPATCHING BLOCK */
|
||||
|
||||
/* START OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */
|
||||
|
||||
"\t1: \n"
|
||||
/* we do not have enough registers to make */
|
||||
/* 4-way [dst] loading -> we are using 2 * 2-way */
|
||||
|
||||
"\tLDM %[dst], {r7, r8} \n" /* 1st 2-way loading of dst values to r7-r8 */
|
||||
|
||||
/* PROCESSING BLOCK 1 */
|
||||
/* r3 = src, r7 = dst */
|
||||
|
||||
"\tLSR r11, r3, #24 \n" /* extracting alpha from source and storing to r11 */
|
||||
"\tAND r9, r12, r7 \n" /* r9 = br masked by r12 (0xff00ff) */
|
||||
"\tRSB r11, r11, #256 \n" /* subtracting the alpha from 255 -> r11 = scale */
|
||||
"\tAND r10, r12, r7, LSR #8 \n" /* r10 = ag masked by r12 (0xff00ff) */
|
||||
"\tMUL r9, r9, r11 \n" /* br = br * scale */
|
||||
"\tAND r9, r12, r9, LSR #8 \n" /* lsr br by 8 and mask it */
|
||||
"\tMUL r10, r10, r11 \n" /* ag = ag * scale */
|
||||
"\tAND r10, r10, r12, LSL #8 \n" /* mask ag with reverse mask */
|
||||
"\tORR r7, r9, r10 \n" /* br | ag */
|
||||
"\tADD r7, r3, r7 \n" /* dst = src + calc dest(r8) */
|
||||
|
||||
/* PROCESSING BLOCK 2 */
|
||||
/* r4 = src, r8 = dst */
|
||||
|
||||
"\tLSR r11, r4, #24 \n" /* see PROCESSING BLOCK 1 */
|
||||
"\tAND r9, r12, r8 \n"
|
||||
"\tRSB r11, r11, #256 \n"
|
||||
"\tAND r10, r12, r8, LSR #8 \n"
|
||||
"\tMUL r9, r9, r11 \n"
|
||||
"\tAND r9, r12, r9, LSR #8 \n"
|
||||
"\tMUL r10, r10, r11 \n"
|
||||
"\tAND r10, r10, r12, LSL #8 \n"
|
||||
"\tORR r8, r9, r10 \n"
|
||||
"\tADD r8, r4, r8 \n"
|
||||
|
||||
"\tSTM %[dst]!, {r7, r8} \n" /* 1st 2-way storing of processed dst values */
|
||||
|
||||
"\tLDM %[dst], {r9, r10} \n" /* 2nd 2-way loading of dst values to r9-r10 */
|
||||
|
||||
/* PROCESSING BLOCK 3 */
|
||||
/* r5 = src, r9 = dst */
|
||||
|
||||
"\tLSR r11, r5, #24 \n" /* see PROCESSING BLOCK 1 */
|
||||
"\tAND r7, r12, r9 \n"
|
||||
"\tRSB r11, r11, #256 \n"
|
||||
"\tAND r8, r12, r9, LSR #8 \n"
|
||||
"\tMUL r7, r7, r11 \n"
|
||||
"\tAND r7, r12, r7, LSR #8 \n"
|
||||
"\tMUL r8, r8, r11 \n"
|
||||
"\tAND r8, r8, r12, LSL #8 \n"
|
||||
"\tORR r9, r7, r8 \n"
|
||||
"\tADD r9, r5, r9 \n"
|
||||
|
||||
/* PROCESSING BLOCK 4 */
|
||||
/* r6 = src, r10 = dst */
|
||||
|
||||
"\tLSR r11, r6, #24 \n" /* see PROCESSING BLOCK 1 */
|
||||
"\tAND r7, r12, r10 \n"
|
||||
"\tRSB r11, r11, #256 \n"
|
||||
"\tAND r8, r12, r10, LSR #8 \n"
|
||||
"\tMUL r7, r7, r11 \n"
|
||||
"\tAND r7, r12, r7, LSR #8 \n"
|
||||
"\tMUL r8, r8, r11 \n"
|
||||
"\tAND r8, r8, r12, LSL #8 \n"
|
||||
"\tORR r10, r7, r8 \n"
|
||||
"\tADD r10, r6, r10 \n"
|
||||
|
||||
"\tSTM %[dst]!, {r9, r10} \n" /* 2nd 2-way storing of processed dst values */
|
||||
|
||||
"\tCMP %[src], r2 \n" /* if our current [src] pointer <= calculated marker */
|
||||
"\tBLE 0b \n" /* we could run 4-way processing -> go to dispatcher */
|
||||
"\tBGT 8f \n" /* else -> use simple one-by-one processing */
|
||||
|
||||
/* END OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */
|
||||
|
||||
/* START OF BLOCK OPTIMIZED FOR ALPHA == 255 */
|
||||
|
||||
"\t2: \n" /* ENTRY 1: LOADING [src] to registers */
|
||||
|
||||
"\tLDM %[src]!, {r3, r4, r5, r6} \n" /* 4-way loading of source values to r3-r6 */
|
||||
|
||||
"\tAND r7, r3, r4 \n" /* if not all alphas == 255 -> */
|
||||
"\tAND r8, r5, r6 \n"
|
||||
"\tAND r9, r7, r8 \n"
|
||||
"\tCMP r14, r9, LSR #24 \n"
|
||||
"\tBNE 4f \n" /* -> go to alpha == 0 check */
|
||||
|
||||
"\t3: \n" /* ENTRY 2: [src] already loaded by DISPATCHER */
|
||||
|
||||
"\tSTM %[dst]!, {r3, r4, r5, r6} \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */
|
||||
|
||||
"\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */
|
||||
"\tBLE 2b \n" /* we could run 4-way processing */
|
||||
/* because now we're in ALPHA == 255 state */
|
||||
/* run next cycle with priority alpha == 255 checks */
|
||||
|
||||
"\tBGT 8f \n" /* if our current [src] array pointer > marker */
|
||||
/* use simple one-by-one processing */
|
||||
|
||||
"\t4: \n"
|
||||
|
||||
"\tORR r7, r3, r4 \n" /* if not all alphas == 0 -> */
|
||||
"\tORR r8, r5, r6 \n"
|
||||
"\tORR r9, r7, r8 \n"
|
||||
"\tLSRS r9, #24 \n"
|
||||
"\tBNE 1b \n" /* -> go to general processing mode */
|
||||
/* (we already checked for alpha == 255) */
|
||||
|
||||
"\tADD %[dst], %[dst], #16 \n" /* all src alphas == 0 -> do not change dst values */
|
||||
|
||||
"\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */
|
||||
"\tBLE 5f \n" /* we could run 4-way processing one more time */
|
||||
/* because now we're in ALPHA == 0 state */
|
||||
/* run next cycle with priority alpha == 0 checks */
|
||||
|
||||
"\tBGT 8f \n" /* if our current [src] array pointer > marker */
|
||||
/* use simple one-by-one processing */
|
||||
|
||||
/* END OF BLOCK OPTIMIZED FOR ALPHA == 255 */
|
||||
|
||||
/* START OF BLOCK OPTIMIZED FOR ALPHA == 0 */
|
||||
|
||||
"\t5: \n" /* ENTRY 1: LOADING [src] to registers */
|
||||
|
||||
"\tLDM %[src]!, {r3, r4, r5, r6} \n" /* 4-way loading of source values to r3-r6 */
|
||||
|
||||
"\tORR r7, r3, r4 \n" /* if not all alphas == 0 -> */
|
||||
"\tORR r8, r5, r6 \n"
|
||||
"\tORR r9, r7, r8 \n"
|
||||
"\tLSRS r9, #24 \n"
|
||||
"\tBNE 7f \n" /* -> go to alpha == 255 check */
|
||||
|
||||
"\t6: \n" /* ENTRY 2: [src] already loaded by DISPATCHER */
|
||||
|
||||
"\tADD %[dst], %[dst], #16 \n" /* all src alphas == 0 -> do not change dst values */
|
||||
|
||||
"\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */
|
||||
"\tBLE 5b \n" /* we could run 4-way processing one more time */
|
||||
/* because now we're in ALPHA == 0 state */
|
||||
/* run next cycle with priority alpha == 0 checks */
|
||||
|
||||
"\tBGT 8f \n" /* if our current [src] array pointer > marker */
|
||||
/* use simple one-by-one processing */
|
||||
"\t7: \n"
|
||||
|
||||
"\tAND r7, r3, r4 \n" /* if not all alphas == 255 -> */
|
||||
"\tAND r8, r5, r6 \n"
|
||||
"\tAND r9, r7, r8 \n"
|
||||
"\tCMP r14, r9, LSR #24 \n"
|
||||
"\tBNE 1b \n" /* -> go to general processing mode */
|
||||
/* (we already checked for alpha == 0) */
|
||||
|
||||
"\tSTM %[dst]!, {r3, r4, r5, r6} \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */
|
||||
|
||||
"\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */
|
||||
"\tBLE 2b \n" /* we could run 4-way processing one more time */
|
||||
/* because now we're in ALPHA == 255 state */
|
||||
/* run next cycle with priority alpha == 255 checks */
|
||||
|
||||
"\tBGT 8f \n" /* if our current [src] array pointer > marker */
|
||||
/* use simple one-by-one processing */
|
||||
|
||||
/* END OF BLOCK OPTIMIZED FOR ALPHA == 0 */
|
||||
|
||||
/* START OF TAIL BLOCK */
|
||||
/* (used when array is too small to be processed with 4-way algorithm)*/
|
||||
|
||||
"\t8: \n"
|
||||
|
||||
"\tADD r2, r2, #16 \n" /* now r2 points to the element just after array */
|
||||
/* we've done r2 = r2 - 16 at procedure start */
|
||||
|
||||
"\tCMP %[src], r2 \n" /* if our current [src] array pointer > final marker */
|
||||
"\tBEQ 9f \n" /* goto EXIT */
|
||||
|
||||
/* TAIL PROCESSING BLOCK 1 */
|
||||
|
||||
"\tLDR r3, [%[src]], #4 \n" /* r3 = *src, src++ */
|
||||
"\tLDR r7, [%[dst]] \n" /* r7 = *dst */
|
||||
|
||||
"\tLSR r11, r3, #24 \n" /* extracting alpha from source */
|
||||
"\tAND r9, r12, r7 \n" /* r9 = br masked by r12 (0xff00ff) */
|
||||
"\tRSB r11, r11, #256 \n" /* subtracting the alpha from 255 -> r11 = scale */
|
||||
"\tAND r10, r12, r7, LSR #8 \n" /* r10 = ag masked by r12 (0xff00ff) */
|
||||
"\tMUL r9, r9, r11 \n" /* br = br * scale */
|
||||
"\tAND r9, r12, r9, LSR #8 \n" /* lsr br by 8 and mask it */
|
||||
"\tMUL r10, r10, r11 \n" /* ag = ag * scale */
|
||||
"\tAND r10, r10, r12, LSL #8 \n" /* mask ag with reverse mask */
|
||||
"\tORR r7, r9, r10 \n" /* br | ag */
|
||||
"\tADD r7, r3, r7 \n" /* dst = src + calc dest(r8) */
|
||||
|
||||
"\tSTR r7, [%[dst]], #4 \n" /* *dst = r7; dst++ */
|
||||
|
||||
"\tCMP %[src], r2 \n" /* if our current [src] array pointer > final marker */
|
||||
"\tBEQ 9f \n" /* goto EXIT */
|
||||
|
||||
/* TAIL PROCESSING BLOCK 2 */
|
||||
|
||||
"\tLDR r3, [%[src]], #4 \n" /* see TAIL PROCESSING BLOCK 1 */
|
||||
"\tLDR r7, [%[dst]] \n"
|
||||
|
||||
"\tLSR r11, r3, #24 \n"
|
||||
"\tAND r9, r12, r7 \n"
|
||||
"\tRSB r11, r11, #256 \n"
|
||||
"\tAND r10, r12, r7, LSR #8 \n"
|
||||
"\tMUL r9, r9, r11 \n"
|
||||
"\tAND r9, r12, r9, LSR #8 \n"
|
||||
"\tMUL r10, r10, r11 \n"
|
||||
"\tAND r10, r10, r12, LSL #8 \n"
|
||||
"\tORR r7, r9, r10 \n"
|
||||
"\tADD r7, r3, r7 \n"
|
||||
|
||||
"\tSTR r7, [%[dst]], #4 \n"
|
||||
|
||||
"\tCMP %[src], r2 \n"
|
||||
"\tBEQ 9f \n"
|
||||
|
||||
/* TAIL PROCESSING BLOCK 3 */
|
||||
|
||||
"\tLDR r3, [%[src]], #4 \n" /* see TAIL PROCESSING BLOCK 1 */
|
||||
"\tLDR r7, [%[dst]] \n"
|
||||
|
||||
"\tLSR r11, r3, #24 \n"
|
||||
"\tAND r9, r12, r7 \n"
|
||||
"\tRSB r11, r11, #256 \n"
|
||||
"\tAND r10, r12, r7, LSR #8 \n"
|
||||
"\tMUL r9, r9, r11 \n"
|
||||
"\tAND r9, r12, r9, LSR #8 \n"
|
||||
"\tMUL r10, r10, r11 \n"
|
||||
"\tAND r10, r10, r12, LSL #8 \n"
|
||||
"\tORR r7, r9, r10 \n"
|
||||
"\tADD r7, r3, r7 \n"
|
||||
|
||||
"\tSTR r7, [%[dst]], #4 \n"
|
||||
|
||||
/* END OF TAIL BLOCK */
|
||||
|
||||
"\t9: \n" /* EXIT */
|
||||
|
||||
"\tLDMIA r13!, {r4-r12, r14} \n" /* restoring r4-r12, lr from stack */
|
||||
"\tBX lr \n" /* return */
|
||||
|
||||
: [dst] "+r" (dst), [src] "+r" (src)
|
||||
:
|
||||
: "cc", "r2", "r3", "memory"
|
||||
|
||||
);
|
||||
|
||||
}
|
||||
#endif // USE_ARM_CODE
|
||||
|
||||
/*
|
||||
@ -366,7 +666,21 @@ const SkBlitRow::Proc sk_blitrow_platform_4444_procs_arm[] = {
|
||||
// Table of ARM 32-bit blit-row procs, one slot per mode in the order named by
// the trailing comments (S32_Opaque, S32_Blend, S32A_Opaque, S32A_Blend).
// NOTE(review): a NULL slot presumably means "no ARM-specific proc, use the
// portable one" -- confirm against the code that consumes this table.
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = {
|
||||
NULL, // S32_Opaque,
|
||||
NULL, // S32_Blend,
|
||||
/*
|
||||
* We have two choices for S32A_Opaque procs. One reads the src alpha
|
||||
* value and attempts to optimize accordingly. The optimization is
|
||||
* sensitive to the source content and is not a win in all cases. For
|
||||
* example, if there are a lot of transitions between the alpha states,
|
||||
* the performance will almost certainly be worse. However, for many
|
||||
* common cases the performance is equivalent or better than the standard
|
||||
* case where we do not inspect the src alpha.
|
||||
*/
|
||||
#if SK_A32_SHIFT == 24
|
||||
// This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
|
||||
S32A_Opaque_BlitRow32_arm_src_alpha, // S32A_Opaque,
|
||||
#else
|
||||
S32A_Opaque_BlitRow32_arm, // S32A_Opaque,
|
||||
#endif
|
||||
S32A_Blend_BlitRow32_arm // S32A_Blend
|
||||
};
|
||||
#endif
|
||||
|
@ -517,6 +517,176 @@ void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
|
||||
}
|
||||
}
|
||||
|
||||
void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src,
|
||||
int count, U8CPU alpha) {
|
||||
SkASSERT(255 == alpha);
|
||||
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
/* Use these to check if src is transparent or opaque */
|
||||
const unsigned int ALPHA_OPAQ = 0xFF000000;
|
||||
const unsigned int ALPHA_TRANS = 0x00FFFFFF;
|
||||
|
||||
#define UNROLL 4
|
||||
const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
|
||||
const SkPMColor* SK_RESTRICT src_temp = src;
|
||||
|
||||
/* set up the NEON variables */
|
||||
uint8x8_t alpha_mask;
|
||||
static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
|
||||
alpha_mask = vld1_u8(alpha_mask_setup);
|
||||
|
||||
uint8x8_t src_raw, dst_raw, dst_final;
|
||||
uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
|
||||
uint8x8_t dst_cooked;
|
||||
uint16x8_t dst_wide;
|
||||
uint8x8_t alpha_narrow;
|
||||
uint16x8_t alpha_wide;
|
||||
|
||||
/* choose the first processing type */
|
||||
if( src >= src_end)
|
||||
goto TAIL;
|
||||
if(*src <= ALPHA_TRANS)
|
||||
goto ALPHA_0;
|
||||
if(*src >= ALPHA_OPAQ)
|
||||
goto ALPHA_255;
|
||||
/* fall-thru */
|
||||
|
||||
ALPHA_1_TO_254:
|
||||
do {
|
||||
|
||||
/* get the source */
|
||||
src_raw = vreinterpret_u8_u32(vld1_u32(src));
|
||||
src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
|
||||
|
||||
/* get and hold the dst too */
|
||||
dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
|
||||
dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
|
||||
|
||||
|
||||
/* get the alphas spread out properly */
|
||||
alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
|
||||
/* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
|
||||
/* we collapsed (255-a)+1 ... */
|
||||
alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
|
||||
|
||||
/* spread the dest */
|
||||
dst_wide = vmovl_u8(dst_raw);
|
||||
|
||||
/* alpha mul the dest */
|
||||
dst_wide = vmulq_u16 (dst_wide, alpha_wide);
|
||||
dst_cooked = vshrn_n_u16(dst_wide, 8);
|
||||
|
||||
/* sum -- ignoring any byte lane overflows */
|
||||
dst_final = vadd_u8(src_raw, dst_cooked);
|
||||
|
||||
alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
|
||||
/* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
|
||||
/* we collapsed (255-a)+1 ... */
|
||||
alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
|
||||
|
||||
/* spread the dest */
|
||||
dst_wide = vmovl_u8(dst_raw_2);
|
||||
|
||||
/* alpha mul the dest */
|
||||
dst_wide = vmulq_u16 (dst_wide, alpha_wide);
|
||||
dst_cooked = vshrn_n_u16(dst_wide, 8);
|
||||
|
||||
/* sum -- ignoring any byte lane overflows */
|
||||
dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
|
||||
|
||||
vst1_u32(dst, vreinterpret_u32_u8(dst_final));
|
||||
vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
|
||||
|
||||
src += UNROLL;
|
||||
dst += UNROLL;
|
||||
|
||||
/* if 2 of the next pixels aren't between 1 and 254
|
||||
it might make sense to go to the optimized loops */
|
||||
if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
|
||||
break;
|
||||
|
||||
} while(src < src_end);
|
||||
|
||||
if (src >= src_end)
|
||||
goto TAIL;
|
||||
|
||||
if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
|
||||
goto ALPHA_255;
|
||||
|
||||
/*fall-thru*/
|
||||
|
||||
ALPHA_0:
|
||||
|
||||
/*In this state, we know the current alpha is 0 and
|
||||
we optimize for the next alpha also being zero. */
|
||||
src_temp = src; //so we don't have to increment dst every time
|
||||
do {
|
||||
if(*(++src) > ALPHA_TRANS)
|
||||
break;
|
||||
if(*(++src) > ALPHA_TRANS)
|
||||
break;
|
||||
if(*(++src) > ALPHA_TRANS)
|
||||
break;
|
||||
if(*(++src) > ALPHA_TRANS)
|
||||
break;
|
||||
} while(src < src_end);
|
||||
|
||||
dst += (src - src_temp);
|
||||
|
||||
/* no longer alpha 0, so determine where to go next. */
|
||||
if( src >= src_end)
|
||||
goto TAIL;
|
||||
if(*src >= ALPHA_OPAQ)
|
||||
goto ALPHA_255;
|
||||
else
|
||||
goto ALPHA_1_TO_254;
|
||||
|
||||
ALPHA_255:
|
||||
while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
|
||||
dst[0]=src[0];
|
||||
dst[1]=src[1];
|
||||
dst[2]=src[2];
|
||||
dst[3]=src[3];
|
||||
src+=UNROLL;
|
||||
dst+=UNROLL;
|
||||
if(src >= src_end)
|
||||
goto TAIL;
|
||||
}
|
||||
|
||||
//Handle remainder.
|
||||
if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
|
||||
if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
|
||||
if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
|
||||
}
|
||||
}
|
||||
|
||||
if( src >= src_end)
|
||||
goto TAIL;
|
||||
if(*src <= ALPHA_TRANS)
|
||||
goto ALPHA_0;
|
||||
else
|
||||
goto ALPHA_1_TO_254;
|
||||
|
||||
TAIL:
|
||||
/* do any residual iterations */
|
||||
src_end += UNROLL + 1; //goto the real end
|
||||
while(src != src_end) {
|
||||
if( *src != 0 ) {
|
||||
if( *src >= ALPHA_OPAQ ) {
|
||||
*dst = *src;
|
||||
}
|
||||
else {
|
||||
*dst = SkPMSrcOver(*src, *dst);
|
||||
}
|
||||
}
|
||||
src++;
|
||||
dst++;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* Neon version of S32_Blend_BlitRow32()
|
||||
* portable version is in src/core/SkBlitRow_D32.cpp
|
||||
@ -1107,6 +1277,20 @@ const SkBlitRow::Proc sk_blitrow_platform_4444_procs_arm_neon[] = {
|
||||
// Table of NEON 32-bit blit-row procs, one slot per mode in the order named
// by the trailing comments (S32_Opaque, S32_Blend, S32A_Opaque, S32A_Blend).
// NOTE(review): a NULL slot presumably means "no NEON-specific proc, use the
// portable one" -- confirm against the code that consumes this table.
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
|
||||
NULL, // S32_Opaque,
|
||||
S32_Blend_BlitRow32_neon, // S32_Blend,
|
||||
// NOTE(review): in this diff view the next S32A_Opaque entry appears to be
// the pre-change line that the #if block below replaces -- confirm that only
// one S32A_Opaque entry exists in the real file.
S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
|
||||
/*
|
||||
* We have two choices for S32A_Opaque procs. One reads the src alpha
|
||||
* value and attempts to optimize accordingly. The optimization is
|
||||
* sensitive to the source content and is not a win in all cases. For
|
||||
* example, if there are a lot of transitions between the alpha states,
|
||||
* the performance will almost certainly be worse. However, for many
|
||||
* common cases the performance is equivalent or better than the standard
|
||||
* case where we do not inspect the src alpha.
|
||||
*/
|
||||
#if SK_A32_SHIFT == 24
|
||||
// This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
|
||||
S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
|
||||
#else
|
||||
S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
|
||||
#endif
|
||||
// The blend case uses the non-NEON ARM proc.
S32A_Blend_BlitRow32_arm // S32A_Blend
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user