Optimize highQualityFilter
portable version:
  before: 10M 1 806µs 807µs 810µs 821µs 1% █▂▁▁▃▁▁▁█▁ 8888 bitmap_BGRA_8888_A_scale_rotate_bicubic
  after:  10M 1 566µs 568µs 569µs 579µs 1% ▄▂▂█▂▁▁▁▃▁ 8888 bitmap_BGRA_8888_A_scale_rotate_bicubic

SSE version:
  before: 10M 1 485µs 486µs 487µs 494µs 1% ▇▂▁▁▁▁█▂▁▁ 8888 bitmap_BGRA_8888_A_scale_rotate_bicubic
  after:  10M 1 419µs 420µs 421µs 430µs 1% ▅▃▂▁▁█▂▁▁▁ 8888 bitmap_BGRA_8888_A_scale_rotate_bicubic

BUG=skia:

Review URL: https://codereview.chromium.org/759603002
This commit is contained in:
parent
2253aa9393
commit
6ff4acedb5
@ -29,6 +29,7 @@ void highQualityFilter(ColorPacker pack, const SkBitmapProcState& s, int x, int
|
|||||||
const int maxX = s.fBitmap->width();
|
const int maxX = s.fBitmap->width();
|
||||||
const int maxY = s.fBitmap->height();
|
const int maxY = s.fBitmap->height();
|
||||||
SkAutoTMalloc<SkScalar> xWeights(maxX);
|
SkAutoTMalloc<SkScalar> xWeights(maxX);
|
||||||
|
const SkBitmapFilter* filter = s.getBitmapFilter();
|
||||||
|
|
||||||
while (count-- > 0) {
|
while (count-- > 0) {
|
||||||
SkPoint srcPt;
|
SkPoint srcPt;
|
||||||
@ -40,30 +41,33 @@ void highQualityFilter(ColorPacker pack, const SkBitmapProcState& s, int x, int
|
|||||||
SkScalar weight = 0;
|
SkScalar weight = 0;
|
||||||
SkScalar fr = 0, fg = 0, fb = 0, fa = 0;
|
SkScalar fr = 0, fg = 0, fb = 0, fa = 0;
|
||||||
|
|
||||||
int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->width()), maxY);
|
int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY - filter->width()), maxY);
|
||||||
int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->width()+1), maxY);
|
int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY + filter->width() + 1), maxY);
|
||||||
int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->width()), maxX);
|
int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX - filter->width()), maxX);
|
||||||
int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->width())+1, maxX);
|
int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX + filter->width()) + 1, maxX);
|
||||||
|
|
||||||
for (int srcX = x0; srcX < x1 ; srcX++) {
|
for (int srcX = x0; srcX < x1 ; srcX++) {
|
||||||
// Looking these up once instead of each loop is a ~15% speedup.
|
// Looking these up once instead of each loop is a ~15% speedup.
|
||||||
xWeights[srcX - x0] = s.getBitmapFilter()->lookupScalar((srcPt.fX - srcX));
|
xWeights[srcX - x0] = filter->lookupScalar((srcPt.fX - srcX));
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int srcY = y0; srcY < y1; srcY++) {
|
for (int srcY = y0; srcY < y1; srcY++) {
|
||||||
SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - srcY));
|
SkScalar yWeight = filter->lookupScalar((srcPt.fY - srcY));
|
||||||
|
|
||||||
for (int srcX = x0; srcX < x1 ; srcX++) {
|
for (int srcX = x0; srcX < x1 ; srcX++) {
|
||||||
SkScalar xWeight = xWeights[srcX - x0];
|
SkScalar xWeight = xWeights[srcX - x0];
|
||||||
|
|
||||||
SkScalar combined_weight = SkScalarMul(xWeight, yWeight);
|
SkScalar combined_weight = SkScalarMul(xWeight, yWeight);
|
||||||
|
weight += combined_weight;
|
||||||
|
|
||||||
SkPMColor c = *s.fBitmap->getAddr32(srcX, srcY);
|
SkPMColor c = *s.fBitmap->getAddr32(srcX, srcY);
|
||||||
|
if (!c) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
fr += combined_weight * SkGetPackedR32(c);
|
fr += combined_weight * SkGetPackedR32(c);
|
||||||
fg += combined_weight * SkGetPackedG32(c);
|
fg += combined_weight * SkGetPackedG32(c);
|
||||||
fb += combined_weight * SkGetPackedB32(c);
|
fb += combined_weight * SkGetPackedB32(c);
|
||||||
fa += combined_weight * SkGetPackedA32(c);
|
fa += combined_weight * SkGetPackedA32(c);
|
||||||
weight += combined_weight;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -49,6 +49,7 @@ void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y,
|
|||||||
const int maxX = s.fBitmap->width();
|
const int maxX = s.fBitmap->width();
|
||||||
const int maxY = s.fBitmap->height();
|
const int maxY = s.fBitmap->height();
|
||||||
SkAutoTMalloc<SkScalar> xWeights(maxX);
|
SkAutoTMalloc<SkScalar> xWeights(maxX);
|
||||||
|
const SkBitmapFilter* filter = s.getBitmapFilter();
|
||||||
|
|
||||||
while (count-- > 0) {
|
while (count-- > 0) {
|
||||||
SkPoint srcPt;
|
SkPoint srcPt;
|
||||||
@ -59,34 +60,37 @@ void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y,
|
|||||||
__m128 weight = _mm_setzero_ps();
|
__m128 weight = _mm_setzero_ps();
|
||||||
__m128 accum = _mm_setzero_ps();
|
__m128 accum = _mm_setzero_ps();
|
||||||
|
|
||||||
int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->width()), maxY);
|
int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY - filter->width()), maxY);
|
||||||
int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->width()+1), maxY);
|
int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY + filter->width() + 1), maxY);
|
||||||
int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->width()), maxX);
|
int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX - filter->width()), maxX);
|
||||||
int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->width())+1, maxX);
|
int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX + filter->width()) + 1, maxX);
|
||||||
|
|
||||||
for (int srcX = x0; srcX < x1 ; srcX++) {
|
for (int srcX = x0; srcX < x1 ; srcX++) {
|
||||||
// Looking these up once instead of each loop is a ~15% speedup.
|
// Looking these up once instead of each loop is a ~15% speedup.
|
||||||
xWeights[srcX - x0] = s.getBitmapFilter()->lookupScalar((srcPt.fX - srcX));
|
xWeights[srcX - x0] = filter->lookupScalar((srcPt.fX - srcX));
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int srcY = y0; srcY < y1; srcY++) {
|
for (int srcY = y0; srcY < y1; srcY++) {
|
||||||
SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - srcY));
|
SkScalar yWeight = filter->lookupScalar((srcPt.fY - srcY));
|
||||||
|
|
||||||
for (int srcX = x0; srcX < x1 ; srcX++) {
|
for (int srcX = x0; srcX < x1 ; srcX++) {
|
||||||
SkScalar xWeight = xWeights[srcX - x0];
|
SkScalar xWeight = xWeights[srcX - x0];
|
||||||
|
|
||||||
SkScalar combined_weight = SkScalarMul(xWeight, yWeight);
|
SkScalar combined_weight = SkScalarMul(xWeight, yWeight);
|
||||||
|
__m128 weightVector = _mm_set1_ps(combined_weight);
|
||||||
|
weight = _mm_add_ps( weight, weightVector );
|
||||||
|
|
||||||
SkPMColor color = *s.fBitmap->getAddr32(srcX, srcY);
|
SkPMColor color = *s.fBitmap->getAddr32(srcX, srcY);
|
||||||
|
if (!color) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
__m128i c = _mm_cvtsi32_si128(color);
|
__m128i c = _mm_cvtsi32_si128(color);
|
||||||
c = _mm_unpacklo_epi8(c, _mm_setzero_si128());
|
c = _mm_unpacklo_epi8(c, _mm_setzero_si128());
|
||||||
c = _mm_unpacklo_epi16(c, _mm_setzero_si128());
|
c = _mm_unpacklo_epi16(c, _mm_setzero_si128());
|
||||||
__m128 cfloat = _mm_cvtepi32_ps(c);
|
__m128 cfloat = _mm_cvtepi32_ps(c);
|
||||||
|
|
||||||
__m128 weightVector = _mm_set1_ps(combined_weight);
|
|
||||||
accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector));
|
accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector));
|
||||||
weight = _mm_add_ps( weight, weightVector );
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user