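Optimize highQualityFilter

Fetch the bitmap filter pointer once per call instead of calling
s.getBitmapFilter() on every use, and skip the color accumulation for
source pixels whose packed value is zero (their weight is still added to
the running total, so the result is unchanged).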

portable version:
before:
  10M   1       806µs   807µs   810µs   821µs   1%      █▂▁▁▃▁▁▁█▁ 8888    bitmap_BGRA_8888_A_scale_rotate_bicubic
after:
  10M   1       566µs   568µs   569µs   579µs   1%      ▄▂▂█▂▁▁▁▃▁ 8888    bitmap_BGRA_8888_A_scale_rotate_bicubic

SSE version:
before:
  10M   1       485µs   486µs   487µs   494µs   1%      ▇▂▁▁▁▁█▂▁▁ 8888    bitmap_BGRA_8888_A_scale_rotate_bicubic
after:
  10M   1       419µs   420µs   421µs   430µs   1%      ▅▃▂▁▁█▂▁▁▁ 8888    bitmap_BGRA_8888_A_scale_rotate_bicubic

BUG=skia:

Review URL: https://codereview.chromium.org/759603002
This commit is contained in:
qiankun.miao 2014-11-25 07:12:27 -08:00 committed by Commit bot
parent 2253aa9393
commit 6ff4acedb5
2 changed files with 23 additions and 15 deletions

View File

@ -29,6 +29,7 @@ void highQualityFilter(ColorPacker pack, const SkBitmapProcState& s, int x, int
const int maxX = s.fBitmap->width(); const int maxX = s.fBitmap->width();
const int maxY = s.fBitmap->height(); const int maxY = s.fBitmap->height();
SkAutoTMalloc<SkScalar> xWeights(maxX); SkAutoTMalloc<SkScalar> xWeights(maxX);
const SkBitmapFilter* filter = s.getBitmapFilter();
while (count-- > 0) { while (count-- > 0) {
SkPoint srcPt; SkPoint srcPt;
@ -40,30 +41,33 @@ void highQualityFilter(ColorPacker pack, const SkBitmapProcState& s, int x, int
SkScalar weight = 0; SkScalar weight = 0;
SkScalar fr = 0, fg = 0, fb = 0, fa = 0; SkScalar fr = 0, fg = 0, fb = 0, fa = 0;
int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->width()), maxY); int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY - filter->width()), maxY);
int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->width()+1), maxY); int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY + filter->width() + 1), maxY);
int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->width()), maxX); int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX - filter->width()), maxX);
int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->width())+1, maxX); int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX + filter->width()) + 1, maxX);
for (int srcX = x0; srcX < x1 ; srcX++) { for (int srcX = x0; srcX < x1 ; srcX++) {
// Looking these up once instead of each loop is a ~15% speedup. // Looking these up once instead of each loop is a ~15% speedup.
xWeights[srcX - x0] = s.getBitmapFilter()->lookupScalar((srcPt.fX - srcX)); xWeights[srcX - x0] = filter->lookupScalar((srcPt.fX - srcX));
} }
for (int srcY = y0; srcY < y1; srcY++) { for (int srcY = y0; srcY < y1; srcY++) {
SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - srcY)); SkScalar yWeight = filter->lookupScalar((srcPt.fY - srcY));
for (int srcX = x0; srcX < x1 ; srcX++) { for (int srcX = x0; srcX < x1 ; srcX++) {
SkScalar xWeight = xWeights[srcX - x0]; SkScalar xWeight = xWeights[srcX - x0];
SkScalar combined_weight = SkScalarMul(xWeight, yWeight); SkScalar combined_weight = SkScalarMul(xWeight, yWeight);
weight += combined_weight;
SkPMColor c = *s.fBitmap->getAddr32(srcX, srcY); SkPMColor c = *s.fBitmap->getAddr32(srcX, srcY);
if (!c) {
continue;
}
fr += combined_weight * SkGetPackedR32(c); fr += combined_weight * SkGetPackedR32(c);
fg += combined_weight * SkGetPackedG32(c); fg += combined_weight * SkGetPackedG32(c);
fb += combined_weight * SkGetPackedB32(c); fb += combined_weight * SkGetPackedB32(c);
fa += combined_weight * SkGetPackedA32(c); fa += combined_weight * SkGetPackedA32(c);
weight += combined_weight;
} }
} }

View File

@ -49,6 +49,7 @@ void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y,
const int maxX = s.fBitmap->width(); const int maxX = s.fBitmap->width();
const int maxY = s.fBitmap->height(); const int maxY = s.fBitmap->height();
SkAutoTMalloc<SkScalar> xWeights(maxX); SkAutoTMalloc<SkScalar> xWeights(maxX);
const SkBitmapFilter* filter = s.getBitmapFilter();
while (count-- > 0) { while (count-- > 0) {
SkPoint srcPt; SkPoint srcPt;
@ -59,34 +60,37 @@ void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y,
__m128 weight = _mm_setzero_ps(); __m128 weight = _mm_setzero_ps();
__m128 accum = _mm_setzero_ps(); __m128 accum = _mm_setzero_ps();
int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->width()), maxY); int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY - filter->width()), maxY);
int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->width()+1), maxY); int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY + filter->width() + 1), maxY);
int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->width()), maxX); int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX - filter->width()), maxX);
int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->width())+1, maxX); int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX + filter->width()) + 1, maxX);
for (int srcX = x0; srcX < x1 ; srcX++) { for (int srcX = x0; srcX < x1 ; srcX++) {
// Looking these up once instead of each loop is a ~15% speedup. // Looking these up once instead of each loop is a ~15% speedup.
xWeights[srcX - x0] = s.getBitmapFilter()->lookupScalar((srcPt.fX - srcX)); xWeights[srcX - x0] = filter->lookupScalar((srcPt.fX - srcX));
} }
for (int srcY = y0; srcY < y1; srcY++) { for (int srcY = y0; srcY < y1; srcY++) {
SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - srcY)); SkScalar yWeight = filter->lookupScalar((srcPt.fY - srcY));
for (int srcX = x0; srcX < x1 ; srcX++) { for (int srcX = x0; srcX < x1 ; srcX++) {
SkScalar xWeight = xWeights[srcX - x0]; SkScalar xWeight = xWeights[srcX - x0];
SkScalar combined_weight = SkScalarMul(xWeight, yWeight); SkScalar combined_weight = SkScalarMul(xWeight, yWeight);
__m128 weightVector = _mm_set1_ps(combined_weight);
weight = _mm_add_ps( weight, weightVector );
SkPMColor color = *s.fBitmap->getAddr32(srcX, srcY); SkPMColor color = *s.fBitmap->getAddr32(srcX, srcY);
if (!color) {
continue;
}
__m128i c = _mm_cvtsi32_si128(color); __m128i c = _mm_cvtsi32_si128(color);
c = _mm_unpacklo_epi8(c, _mm_setzero_si128()); c = _mm_unpacklo_epi8(c, _mm_setzero_si128());
c = _mm_unpacklo_epi16(c, _mm_setzero_si128()); c = _mm_unpacklo_epi16(c, _mm_setzero_si128());
__m128 cfloat = _mm_cvtepi32_ps(c); __m128 cfloat = _mm_cvtepi32_ps(c);
__m128 weightVector = _mm_set1_ps(combined_weight);
accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector)); accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector));
weight = _mm_add_ps( weight, weightVector );
} }
} }