The image resampling code has been transplanted from Chrome; it's incredibly fast.

We've tested this CL plumbed into Chrome and done benchmarking with excellent results.

This CL can land independent of any Chrome changes; it's completely internal to Skia.

BUG=
R=reed@google.com

Review URL: https://codereview.chromium.org/19335002

git-svn-id: http://skia.googlecode.com/svn/trunk@10206 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
humper@google.com 2013-07-19 20:20:04 +00:00
parent d322cf4939
commit 138ebc3e40
15 changed files with 1689 additions and 265 deletions

View File

@ -75,7 +75,7 @@ protected:
curWidth = (int) (fBM.width() * curScale + 2); curWidth = (int) (fBM.width() * curScale + 2);
curX += curWidth; curX += curWidth;
curScale *= 0.75f; curScale *= 0.75f;
} while (curX < 4*fBM.width()); } while (curWidth >= 2 && curX < 4*fBM.width());
} }
private: private:

View File

@ -32,6 +32,8 @@
'<(skia_src_path)/core/SkBitmapProcState_matrix.h', '<(skia_src_path)/core/SkBitmapProcState_matrix.h',
'<(skia_src_path)/core/SkBitmapProcState_matrixProcs.cpp', '<(skia_src_path)/core/SkBitmapProcState_matrixProcs.cpp',
'<(skia_src_path)/core/SkBitmapProcState_sample.h', '<(skia_src_path)/core/SkBitmapProcState_sample.h',
'<(skia_src_path)/core/SkBitmapScaler.h',
'<(skia_src_path)/core/SkBitmapScaler.cpp',
'<(skia_src_path)/core/SkBitmapShader16BilerpTemplate.h', '<(skia_src_path)/core/SkBitmapShader16BilerpTemplate.h',
'<(skia_src_path)/core/SkBitmapShaderTemplate.h', '<(skia_src_path)/core/SkBitmapShaderTemplate.h',
'<(skia_src_path)/core/SkBitmap_scroll.cpp', '<(skia_src_path)/core/SkBitmap_scroll.cpp',
@ -56,6 +58,8 @@
'<(skia_src_path)/core/SkComposeShader.cpp', '<(skia_src_path)/core/SkComposeShader.cpp',
'<(skia_src_path)/core/SkConfig8888.cpp', '<(skia_src_path)/core/SkConfig8888.cpp',
'<(skia_src_path)/core/SkConfig8888.h', '<(skia_src_path)/core/SkConfig8888.h',
'<(skia_src_path)/core/SkConvolver.cpp',
'<(skia_src_path)/core/SkConvolver.h',
'<(skia_src_path)/core/SkCordic.cpp', '<(skia_src_path)/core/SkCordic.cpp',
'<(skia_src_path)/core/SkCordic.h', '<(skia_src_path)/core/SkCordic.h',
'<(skia_src_path)/core/SkCoreBlitters.h', '<(skia_src_path)/core/SkCoreBlitters.h',

View File

@ -702,19 +702,7 @@ private:
int extractMipLevel(SkBitmap* dst, SkFixed sx, SkFixed sy); int extractMipLevel(SkBitmap* dst, SkFixed sx, SkFixed sy);
bool hasMipMap() const; bool hasMipMap() const;
void freeMipMap(); void freeMipMap();
/** Make a scaled copy of this bitmap into the provided destination.
* The caller is responsible for having set the width and height of the
* provided destination bitmap, and also having allocated its pixel
* memory.
*
* This function is temporary and for testing purposes only; it will
* likely move once it has been properly plumbed into the bitmap
* shader infrastructure.
*/
void scale(SkBitmap *dst) const;
friend struct SkBitmapProcState; friend struct SkBitmapProcState;
}; };

View File

@ -5,15 +5,23 @@
* found in the LICENSE file. * found in the LICENSE file.
*/ */
#include "SkErrorInternals.h"
#include "SkConvolver.h"
#include "SkBitmapProcState.h" #include "SkBitmapProcState.h"
#include "SkBitmap.h" #include "SkBitmap.h"
#include "SkColor.h" #include "SkColor.h"
#include "SkColorPriv.h" #include "SkColorPriv.h"
#include "SkConvolver.h"
#include "SkUnPreMultiply.h" #include "SkUnPreMultiply.h"
#include "SkShader.h" #include "SkShader.h"
#include "SkRTConf.h" #include "SkRTConf.h"
#include "SkMath.h" #include "SkMath.h"
// These are the per-scanline callbacks that are used when we must resort to
// resampling an image as it is blitted. Typically these are used only when
// the image is rotated or has some other complex transformation applied.
// Scaled images will usually be rescaled directly before rasterization.
void highQualityFilter(const SkBitmapProcState& s, int x, int y, void highQualityFilter(const SkBitmapProcState& s, int x, int y,
SkPMColor* SK_RESTRICT colors, int count) { SkPMColor* SK_RESTRICT colors, int count) {
@ -68,71 +76,15 @@ void highQualityFilter(const SkBitmapProcState& s, int x, int y,
} }
} }
void highQualityFilter_ScaleOnly(const SkBitmapProcState &s, int x, int y, SK_CONF_DECLARE(const char *, c_bitmapFilter, "bitmap.filter", "mitchell", "Which scanline bitmap filter to use [mitchell, lanczos, hamming, gaussian, triangle, box]");
SkPMColor *SK_RESTRICT colors, int count) {
const int maxX = s.fBitmap->width() - 1;
const int maxY = s.fBitmap->height() - 1;
SkPoint srcPt; SkBitmapFilter *SkBitmapFilter::Allocate() {
s.fInvProc(s.fInvMatrix, SkFloatToScalar(x + 0.5f),
SkFloatToScalar(y + 0.5f), &srcPt);
srcPt.fY -= SK_ScalarHalf;
int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->width()), maxY);
int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->width()), maxY);
while (count-- > 0) {
s.fInvProc(s.fInvMatrix, SkFloatToScalar(x + 0.5f),
SkFloatToScalar(y + 0.5f), &srcPt);
srcPt.fX -= SK_ScalarHalf;
srcPt.fY -= SK_ScalarHalf;
SkScalar weight = 0;
SkScalar fr = 0, fg = 0, fb = 0, fa = 0;
int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->width()), maxX);
int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->width()), maxX);
for (int srcY = y0; srcY <= y1; srcY++) {
SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - srcY));
for (int srcX = x0; srcX <= x1 ; srcX++) {
SkScalar xWeight = s.getBitmapFilter()->lookupScalar((srcPt.fX - srcX));
SkScalar combined_weight = SkScalarMul(xWeight, yWeight);
SkPMColor c = *s.fBitmap->getAddr32(srcX, srcY);
fr += combined_weight * SkGetPackedR32(c);
fg += combined_weight * SkGetPackedG32(c);
fb += combined_weight * SkGetPackedB32(c);
fa += combined_weight * SkGetPackedA32(c);
weight += combined_weight;
}
}
fr = SkScalarDiv(fr, weight);
fg = SkScalarDiv(fg, weight);
fb = SkScalarDiv(fb, weight);
fa = SkScalarDiv(fa, weight);
int a = SkClampMax(SkScalarRoundToInt(fa), 255);
int r = SkClampMax(SkScalarRoundToInt(fr), a);
int g = SkClampMax(SkScalarRoundToInt(fg), a);
int b = SkClampMax(SkScalarRoundToInt(fb), a);
*colors++ = SkPackARGB32(a, r, g, b);
x++;
}
}
SK_CONF_DECLARE(const char *, c_bitmapFilter, "bitmap.filter", "mitchell", "Which bitmap filter to use [mitchell, sinc, gaussian, triangle, box]");
static SkBitmapFilter *allocateBitmapFilter() {
if (!strcmp(c_bitmapFilter, "mitchell")) { if (!strcmp(c_bitmapFilter, "mitchell")) {
return SkNEW_ARGS(SkMitchellFilter,(1.f/3.f,1.f/3.f)); return SkNEW_ARGS(SkMitchellFilter,(1.f/3.f,1.f/3.f));
} else if (!strcmp(c_bitmapFilter, "sinc")) { } else if (!strcmp(c_bitmapFilter, "lanczos")) {
return SkNEW_ARGS(SkSincFilter,(3)); return SkNEW(SkLanczosFilter);
} else if (!strcmp(c_bitmapFilter, "hamming")) {
return SkNEW(SkHammingFilter);
} else if (!strcmp(c_bitmapFilter, "gaussian")) { } else if (!strcmp(c_bitmapFilter, "gaussian")) {
return SkNEW_ARGS(SkGaussianFilter,(2)); return SkNEW_ARGS(SkGaussianFilter,(2));
} else if (!strcmp(c_bitmapFilter, "triangle")) { } else if (!strcmp(c_bitmapFilter, "triangle")) {
@ -168,159 +120,12 @@ SkBitmapProcState::chooseBitmapFilterProc() {
} }
if (fInvType & (SkMatrix::kAffine_Mask | SkMatrix::kScale_Mask)) { if (fInvType & (SkMatrix::kAffine_Mask | SkMatrix::kScale_Mask)) {
fBitmapFilter = allocateBitmapFilter(); fBitmapFilter = SkBitmapFilter::Allocate();
} }
if (fInvType & SkMatrix::kAffine_Mask) { if (fInvType & SkMatrix::kScale_Mask) {
return highQualityFilter; return highQualityFilter;
} else if (fInvType & SkMatrix::kScale_Mask) {
return highQualityFilter_ScaleOnly;
} else { } else {
return NULL; return NULL;
} }
} }
static void divideByWeights(SkScalar *sums, SkScalar *weights, SkBitmap *dst) {
for (int y = 0 ; y < dst->height() ; y++) {
for (int x = 0 ; x < dst->width() ; x++) {
SkScalar fr = SkScalarDiv(sums[4*(y*dst->width() + x) + 0], weights[y*dst->width() + x]);
SkScalar fg = SkScalarDiv(sums[4*(y*dst->width() + x) + 1], weights[y*dst->width() + x]);
SkScalar fb = SkScalarDiv(sums[4*(y*dst->width() + x) + 2], weights[y*dst->width() + x]);
SkScalar fa = SkScalarDiv(sums[4*(y*dst->width() + x) + 3], weights[y*dst->width() + x]);
int a = SkClampMax(SkScalarRoundToInt(fa), 255);
int r = SkClampMax(SkScalarRoundToInt(fr), a);
int g = SkClampMax(SkScalarRoundToInt(fg), a);
int b = SkClampMax(SkScalarRoundToInt(fb), a);
*dst->getAddr32(x,y) = SkPackARGB32(a, r, g, b);
}
}
}
static void upScaleHorizTranspose(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) {
for (int y = 0 ; y < dst->height() ; y++) {
for (int x = 0 ; x < dst->width() ; x++) {
float sx = (y + 0.5f) / scale - 0.5f;
int x0 = SkClampMax(sk_float_ceil2int(sx-filter->width()), src->width()-1);
int x1 = SkClampMax(sk_float_floor2int(sx+filter->width()), src->width()-1);
SkScalar totalWeight = 0;
SkScalar fr = 0, fg = 0, fb = 0, fa = 0;
for (int srcX = x0 ; srcX <= x1 ; srcX++) {
SkScalar weight = filter->lookupScalar(sx - srcX);
SkPMColor c = *src->getAddr32(srcX, x);
fr += SkScalarMul(weight,SkGetPackedR32(c));
fg += SkScalarMul(weight,SkGetPackedG32(c));
fb += SkScalarMul(weight,SkGetPackedB32(c));
fa += SkScalarMul(weight,SkGetPackedA32(c));
totalWeight += weight;
}
fr = SkScalarDiv(fr,totalWeight);
fg = SkScalarDiv(fg,totalWeight);
fb = SkScalarDiv(fb,totalWeight);
fa = SkScalarDiv(fa,totalWeight);
int a = SkClampMax(SkScalarRoundToInt(fa), 255);
int r = SkClampMax(SkScalarRoundToInt(fr), a);
int g = SkClampMax(SkScalarRoundToInt(fg), a);
int b = SkClampMax(SkScalarRoundToInt(fb), a);
*dst->getAddr32(x,y) = SkPackARGB32(a, r, g, b);
}
}
}
static void downScaleHoriz(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) {
SkScalar *sums = SkNEW_ARRAY(SkScalar, dst->width() * dst->height() * 4);
SkScalar *weights = SkNEW_ARRAY(SkScalar, dst->width() * dst->height());
SkAutoTDeleteArray<SkScalar> ada1(sums);
SkAutoTDeleteArray<SkScalar> ada2(weights);
memset(sums, 0, dst->width() * dst->height() * sizeof(SkScalar) * 4);
memset(weights, 0, dst->width() * dst->height() * sizeof(SkScalar));
for (int y = 0 ; y < src->height() ; y++) {
for (int x = 0 ; x < src->width() ; x++) {
// splat each source pixel into the destination image
float dx = (x + 0.5f) * scale - 0.5f;
int x0 = SkClampMax(sk_float_ceil2int(dx-filter->width()), dst->width()-1);
int x1 = SkClampMax(sk_float_floor2int(dx+filter->width()), dst->width()-1);
SkPMColor c = *src->getAddr32(x,y);
for (int dst_x = x0 ; dst_x <= x1 ; dst_x++) {
SkScalar weight = filter->lookup(dx - dst_x);
sums[4*(y*dst->width() + dst_x) + 0] += weight*SkGetPackedR32(c);
sums[4*(y*dst->width() + dst_x) + 1] += weight*SkGetPackedG32(c);
sums[4*(y*dst->width() + dst_x) + 2] += weight*SkGetPackedB32(c);
sums[4*(y*dst->width() + dst_x) + 3] += weight*SkGetPackedA32(c);
weights[y*dst->width() + dst_x] += weight;
}
}
}
divideByWeights(sums, weights, dst);
}
static void downScaleVert(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) {
SkScalar *sums = SkNEW_ARRAY(SkScalar, dst->width() * dst->height() * 4);
SkScalar *weights = SkNEW_ARRAY(SkScalar, dst->width() * dst->height());
SkAutoTDeleteArray<SkScalar> ada1(sums);
SkAutoTDeleteArray<SkScalar> ada2(weights);
memset(sums, 0, dst->width() * dst->height() * sizeof(SkScalar) * 4);
memset(weights, 0, dst->width() * dst->height() * sizeof(SkScalar));
for (int y = 0 ; y < src->height() ; y++) {
for (int x = 0 ; x < src->width() ; x++) {
// splat each source pixel into the destination image
float dy = (y + 0.5f) * scale - 0.5f;
int y0 = SkClampMax(sk_float_ceil2int(dy-filter->width()), dst->height()-1);
int y1 = SkClampMax(sk_float_ceil2int(dy+filter->width()), dst->height()-1);
SkPMColor c = *src->getAddr32(x,y);
for (int dst_y = y0 ; dst_y <= y1 ; dst_y++) {
SkScalar weight = filter->lookupScalar(dy - dst_y);
sums[4*(dst_y*dst->width() + x) + 0] += weight*SkGetPackedR32(c);
sums[4*(dst_y*dst->width() + x) + 1] += weight*SkGetPackedG32(c);
sums[4*(dst_y*dst->width() + x) + 2] += weight*SkGetPackedB32(c);
sums[4*(dst_y*dst->width() + x) + 3] += weight*SkGetPackedA32(c);
weights[dst_y*dst->width() + x] += weight;
}
}
}
divideByWeights(sums, weights, dst);
}
void SkBitmap::scale(SkBitmap *dst) const {
SkBitmap horizTemp;
horizTemp.setConfig(SkBitmap::kARGB_8888_Config, height(), dst->width());
horizTemp.allocPixels();
SkBitmapFilter *filter = allocateBitmapFilter();
float horizScale = float(dst->width()) / width();
if (horizScale >= 1) {
upScaleHorizTranspose(this, &horizTemp, horizScale, filter);
} else if (horizScale < 1) {
downScaleHoriz(this, &horizTemp, horizScale, filter);
}
float vertScale = float(dst->height()) / height();
if (vertScale >= 1) {
upScaleHorizTranspose(&horizTemp, dst, vertScale, filter);
} else if (vertScale < 1) {
downScaleVert(&horizTemp, dst, vertScale, filter);
}
SkDELETE(filter);
}

View File

@ -26,28 +26,30 @@ class SkBitmapFilter {
fLookupMultiplier = this->invWidth() * (SKBITMAP_FILTER_TABLE_SIZE-1); fLookupMultiplier = this->invWidth() * (SKBITMAP_FILTER_TABLE_SIZE-1);
} }
SkFixed lookup( float x ) const { SkFixed lookup(float x) const {
if (!fPrecomputed) { if (!fPrecomputed) {
precomputeTable(); precomputeTable();
} }
int filter_idx = int(sk_float_abs(x * fLookupMultiplier)); int filter_idx = int(sk_float_abs(x * fLookupMultiplier));
SkASSERT(filter_idx < SKBITMAP_FILTER_TABLE_SIZE); SkASSERT(filter_idx < SKBITMAP_FILTER_TABLE_SIZE);
return fFilterTable[ filter_idx ]; return fFilterTable[filter_idx];
} }
SkScalar lookupScalar( float x ) const { SkScalar lookupScalar(float x) const {
if (!fPrecomputed) { if (!fPrecomputed) {
precomputeTable(); precomputeTable();
} }
int filter_idx = int(sk_float_abs(x * fLookupMultiplier)); int filter_idx = int(sk_float_abs(x * fLookupMultiplier));
SkASSERT(filter_idx < SKBITMAP_FILTER_TABLE_SIZE); SkASSERT(filter_idx < SKBITMAP_FILTER_TABLE_SIZE);
return fFilterTableScalar[ filter_idx ]; return fFilterTableScalar[filter_idx];
} }
float width() const { return fWidth; } float width() const { return fWidth; }
float invWidth() const { return fInvWidth; } float invWidth() const { return fInvWidth; }
virtual float evaluate(float x) const = 0; virtual float evaluate(float x) const = 0;
virtual ~SkBitmapFilter() {} virtual ~SkBitmapFilter() {}
static SkBitmapFilter* Allocate();
protected: protected:
float fWidth; float fWidth;
float fInvWidth; float fInvWidth;
@ -126,29 +128,47 @@ class SkBoxFilter: public SkBitmapFilter {
} }
virtual float evaluate(float x) const SK_OVERRIDE { virtual float evaluate(float x) const SK_OVERRIDE {
return 1; return (x >= -fWidth && x < fWidth) ? 1.0f : 0.0f;
} }
protected: protected:
}; };
class SkHammingFilter: public SkBitmapFilter {
public:
SkHammingFilter(float width=1.f)
: SkBitmapFilter(width) {
}
virtual float evaluate(float x) const SK_OVERRIDE {
if (x <= -fWidth || x >= fWidth) {
return 0.0f; // Outside of the window.
}
if (x > -FLT_EPSILON && x < FLT_EPSILON) {
return 1.0f; // Special case the sinc discontinuity at the origin.
}
const float xpi = x * static_cast<float>(M_PI);
class SkSincFilter: public SkBitmapFilter { return ((sk_float_sin(xpi) / xpi) * // sinc(x)
(0.54f + 0.46f * sk_float_cos(xpi / fWidth))); // hamming(x)
}
};
class SkLanczosFilter: public SkBitmapFilter {
public: public:
SkSincFilter(float t, float width=3.f) SkLanczosFilter(float width=3.f)
: SkBitmapFilter(width), tau(t) { : SkBitmapFilter(width) {
} }
virtual float evaluate(float x) const SK_OVERRIDE { virtual float evaluate(float x) const SK_OVERRIDE {
x = sk_float_abs(x * fInvWidth); if (x <= -fWidth || x >= fWidth) {
if (x < 1e-5f) return 1.f; return 0.0f; // Outside of the window.
if (x > 1.f) return 0.f; }
x *= SK_ScalarPI; if (x > -FLT_EPSILON && x < FLT_EPSILON) {
float sinc = sk_float_sin(x) / x; return 1.0f; // Special case the discontinuity at the origin.
float lanczos = sk_float_sin(x * tau) / (x * tau); }
return sinc * lanczos; float xpi = x * static_cast<float>(M_PI);
} return (sk_float_sin(xpi) / xpi) * // sinc(x)
protected: sk_float_sin(xpi / fWidth) / (xpi / fWidth); // sinc(x/fWidth)
float tau; }
}; };

View File

@ -11,6 +11,7 @@
#include "SkPaint.h" #include "SkPaint.h"
#include "SkShader.h" // for tilemodes #include "SkShader.h" // for tilemodes
#include "SkUtilsArm.h" #include "SkUtilsArm.h"
#include "SkBitmapScaler.h"
#if !SK_ARM_NEON_IS_NONE #if !SK_ARM_NEON_IS_NONE
// These are defined in src/opts/SkBitmapProcState_arm_neon.cpp // These are defined in src/opts/SkBitmapProcState_arm_neon.cpp
@ -99,23 +100,45 @@ void SkBitmapProcState::possiblyScaleImage() {
if (fFilterQuality != kHQ_BitmapFilter) { if (fFilterQuality != kHQ_BitmapFilter) {
return; return;
} }
// see if our platform has any specialized convolution code.
// Set up a pointer to a local (instead of storing the structure in the
// proc state) to avoid introducing a header dependency; this makes
// recompiles a lot less painful.
SkConvolutionProcs simd;
fConvolutionProcs = &simd;
fConvolutionProcs->fExtraHorizontalReads = 0;
fConvolutionProcs->fConvolveVertically = NULL;
fConvolutionProcs->fConvolve4RowsHorizontally = NULL;
fConvolutionProcs->fConvolveHorizontally = NULL;
fConvolutionProcs->fApplySIMDPadding = NULL;
this->platformConvolutionProcs();
// STEP 1: UPSAMPLE? // STEP 1: Highest quality direct scale?
// Check to see if the transformation matrix is scaling up, and if // Check to see if the transformation matrix is simple, and if we're
// the matrix is simple, and if we're doing high quality scaling. // doing high quality scaling. If so, do the bitmap scale here and
// If so, do the bitmap scale here and remove the scaling component from the matrix. // remove the scaling component from the matrix.
if (fInvMatrix.getType() <= (SkMatrix::kScale_Mask | SkMatrix::kTranslate_Mask) && if (fFilterQuality == kHQ_BitmapFilter &&
(fInvMatrix.getScaleX() < 1 || fInvMatrix.getScaleY() < 1) && fInvMatrix.getType() <= (SkMatrix::kScale_Mask | SkMatrix::kTranslate_Mask) &&
fOrigBitmap.config() == SkBitmap::kARGB_8888_Config) { fOrigBitmap.config() == SkBitmap::kARGB_8888_Config) {
int dest_width = SkScalarCeilToInt(fOrigBitmap.width() / fInvMatrix.getScaleX());
int dest_height = SkScalarCeilToInt(fOrigBitmap.height() / fInvMatrix.getScaleY());
// All the criteria are met; let's make a new bitmap. // All the criteria are met; let's make a new bitmap.
fScaledBitmap.setConfig(SkBitmap::kARGB_8888_Config,
(int)(fOrigBitmap.width() / fInvMatrix.getScaleX()), fScaledBitmap = SkBitmapScaler::Resize( fOrigBitmap, SkBitmapScaler::RESIZE_BEST,
(int)(fOrigBitmap.height() / fInvMatrix.getScaleY())); dest_width, dest_height, fConvolutionProcs );
fScaledBitmap.allocPixels();
fOrigBitmap.scale(&fScaledBitmap); fScaledBitmap.lockPixels();
fBitmap = &fScaledBitmap; fBitmap = &fScaledBitmap;
// set the inv matrix type to translate-only; // set the inv matrix type to translate-only;
@ -130,9 +153,9 @@ void SkBitmapProcState::possiblyScaleImage() {
return; return;
} }
if (!fOrigBitmap.hasMipMap()) { if (!fOrigBitmap.hasMipMap() && fFilterQuality != kNone_BitmapFilter) {
// STEP 2: DOWNSAMPLE // STEP 2: MIPMAP DOWNSAMPLE?
// Check to see if the transformation matrix is scaling *down*. // Check to see if the transformation matrix is scaling *down*.
// If so, automatically build mipmaps. // If so, automatically build mipmaps.

View File

@ -31,6 +31,7 @@
#endif #endif
class SkPaint; class SkPaint;
class SkConvolutionProcs;
struct SkBitmapProcState { struct SkBitmapProcState {
@ -59,7 +60,7 @@ struct SkBitmapProcState {
const uint32_t[], const uint32_t[],
int count, int count,
uint16_t colors[]); uint16_t colors[]);
typedef U16CPU (*FixedTileProc)(SkFixed); // returns 0..0xFFFF typedef U16CPU (*FixedTileProc)(SkFixed); // returns 0..0xFFFF
typedef U16CPU (*FixedTileLowBitsProc)(SkFixed, int); // returns 0..0xF typedef U16CPU (*FixedTileLowBitsProc)(SkFixed, int); // returns 0..0xF
typedef U16CPU (*IntTileProc)(int value, int count); // returns 0..count-1 typedef U16CPU (*IntTileProc)(int value, int count); // returns 0..count-1
@ -78,6 +79,8 @@ struct SkBitmapProcState {
IntTileProc fIntTileProcY; // chooseProcs IntTileProc fIntTileProcY; // chooseProcs
SkFixed fFilterOneX; SkFixed fFilterOneX;
SkFixed fFilterOneY; SkFixed fFilterOneY;
SkConvolutionProcs* fConvolutionProcs; // possiblyScaleImage
SkPMColor fPaintPMColor; // chooseProcs - A8 config SkPMColor fPaintPMColor; // chooseProcs - A8 config
SkFixed fInvSx; // chooseProcs SkFixed fInvSx; // chooseProcs
@ -113,7 +116,12 @@ struct SkBitmapProcState {
implementation can do nothing (see SkBitmapProcState_opts_none.cpp) implementation can do nothing (see SkBitmapProcState_opts_none.cpp)
*/ */
void platformProcs(); void platformProcs();
/** Platforms can also optionally overwrite the convolution functions
if we have SIMD versions of them.
*/
void platformConvolutionProcs();
/** Given the byte size of the index buffer to be passed to the matrix proc, /** Given the byte size of the index buffer to be passed to the matrix proc,
return the maximum number of resulting pixels that can be computed return the maximum number of resulting pixels that can be computed
@ -160,7 +168,7 @@ private:
void possiblyScaleImage(); void possiblyScaleImage();
SkBitmapFilter *fBitmapFilter; SkBitmapFilter* fBitmapFilter;
ShaderProc32 chooseBitmapFilterProc(); ShaderProc32 chooseBitmapFilterProc();
@ -218,8 +226,6 @@ void ClampX_ClampY_nofilter_affine(const SkBitmapProcState& s,
void S32_D16_filter_DX(const SkBitmapProcState& s, void S32_D16_filter_DX(const SkBitmapProcState& s,
const uint32_t* xy, int count, uint16_t* colors); const uint32_t* xy, int count, uint16_t* colors);
void highQualityFilter_ScaleOnly(const SkBitmapProcState &s, int x, int y,
SkPMColor *SK_RESTRICT colors, int count);
void highQualityFilter(const SkBitmapProcState &s, int x, int y, void highQualityFilter(const SkBitmapProcState &s, int x, int y,
SkPMColor *SK_RESTRICT colors, int count); SkPMColor *SK_RESTRICT colors, int count);

315
src/core/SkBitmapScaler.cpp Normal file
View File

@ -0,0 +1,315 @@
#include "SkBitmapScaler.h"
#include "SkBitmapFilter.h"
#include "SkRect.h"
#include "SkTArray.h"
#include "SkErrorInternals.h"
#include "SkConvolver.h"
// SkResizeFilter ----------------------------------------------------------------
// Encapsulates computation and storage of the filters required for one complete
// resize operation.
class SkResizeFilter {
public:
    // Builds both 1-D convolution filters (horizontal and vertical) needed to
    // resize a srcFullWidth x srcFullHeight source to destWidth x destHeight,
    // restricted to the destSubset rectangle of the destination.
    // convolveProcs supplies optional platform SIMD hooks; only
    // fApplySIMDPadding is consulted while building the filters.
    SkResizeFilter(SkBitmapScaler::ResizeMethod method,
                   int srcFullWidth, int srcFullHeight,
                   int destWidth, int destHeight,
                   const SkIRect& destSubset,
                   SkConvolutionProcs* convolveProcs);
    // Frees the filter-function object chosen in the constructor.
    ~SkResizeFilter() {
        SkDELETE( fBitmapFilter );
    }

    // Returns the filled filter values.
    const SkConvolutionFilter1D& xFilter() { return fXFilter; }
    const SkConvolutionFilter1D& yFilter() { return fYFilter; }

private:
    // Owned; allocated in the constructor based on |method|.
    SkBitmapFilter* fBitmapFilter;

    // Computes one set of filters either horizontally or vertically. The caller
    // will specify the "min" and "max" rather than the bottom/top and
    // right/bottom so that the same code can be re-used in each dimension.
    //
    // |srcDependLo| and |srcDependSize| gives the range for the source
    // depend rectangle (horizontally or vertically at the caller's discretion
    // -- see above for what this means).
    //
    // Likewise, the range of destination values to compute and the scale factor
    // for the transform is also specified.
    void computeFilters(int srcSize,
                        int destSubsetLo, int destSubsetSize,
                        float scale,
                        SkConvolutionFilter1D* output,
                        SkConvolutionProcs* convolveProcs);

    // Subset of scaled destination bitmap to compute.
    SkIRect fOutBounds;

    SkConvolutionFilter1D fXFilter;
    SkConvolutionFilter1D fYFilter;
};
// Chooses the filter function for |method| and precomputes the horizontal and
// vertical convolution filters for the requested source -> destination scale.
SkResizeFilter::SkResizeFilter(SkBitmapScaler::ResizeMethod method,
                               int srcFullWidth, int srcFullHeight,
                               int destWidth, int destHeight,
                               const SkIRect& destSubset,
                               SkConvolutionProcs* convolveProcs)
    : fOutBounds(destSubset) {

    // Only "algorithm methods" may reach this constructor; quality methods
    // are mapped to algorithm methods before a filter is built.
    SkASSERT((SkBitmapScaler::RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
             (method <= SkBitmapScaler::RESIZE_LAST_ALGORITHM_METHOD));

    // Select the filter-function object for the requested algorithm.
    if (SkBitmapScaler::RESIZE_BOX == method) {
        fBitmapFilter = SkNEW(SkBoxFilter);
    } else if (SkBitmapScaler::RESIZE_TRIANGLE == method) {
        fBitmapFilter = SkNEW(SkTriangleFilter);
    } else if (SkBitmapScaler::RESIZE_HAMMING == method) {
        fBitmapFilter = SkNEW(SkHammingFilter);
    } else if (SkBitmapScaler::RESIZE_LANCZOS3 == method) {
        fBitmapFilter = SkNEW(SkLanczosFilter);
    } else {
        // RESIZE_MITCHELL, plus the NOTREACHED default for any other value.
        fBitmapFilter = SkNEW_ARGS(SkMitchellFilter, (1.f/3.f, 1.f/3.f));
    }

    // Scale factors from source space into destination space.
    float xScale = static_cast<float>(destWidth) /
                   static_cast<float>(srcFullWidth);
    float yScale = static_cast<float>(destHeight) /
                   static_cast<float>(srcFullHeight);

    this->computeFilters(srcFullWidth, destSubset.fLeft, destSubset.width(),
                         xScale, &fXFilter, convolveProcs);
    this->computeFilters(srcFullHeight, destSubset.fTop, destSubset.height(),
                         yScale, &fYFilter, convolveProcs);
}
// TODO(egouriou): Take advantage of periods in the convolution.
// Practical resizing filters are periodic outside of the border area.
// For Lanczos, a scaling by a (reduced) factor of p/q (q pixels in the
// source become p pixels in the destination) will have a period of p.
// A nice consequence is a period of 1 when downscaling by an integral
// factor. Downscaling from typical display resolutions is also bound
// to produce interesting periods as those are chosen to have multiple
// small factors.
// Small periods reduce computational load and improve cache usage if
// the coefficients can be shared. For periods of 1 we can consider
// loading the factors only once outside the borders.
// Builds the normalized fixed-point convolution filter for one dimension.
//   srcSize        - extent of the source in this dimension, in pixels.
//   destSubsetLo   - first destination coordinate to compute.
//   destSubsetSize - number of destination coordinates to compute.
//   scale          - dest/src scale factor for this dimension.
//   output         - receives one filter (offset + weights) per dest pixel.
//   convolveProcs  - optional SIMD hooks; fApplySIMDPadding is invoked at the
//                    end if provided.
void SkResizeFilter::computeFilters(int srcSize,
                                    int destSubsetLo, int destSubsetSize,
                                    float scale,
                                    SkConvolutionFilter1D* output,
                                    SkConvolutionProcs* convolveProcs) {
  int destSubsetHi = destSubsetLo + destSubsetSize;  // [lo, hi)

  // When we're doing a magnification, the scale will be larger than one. This
  // means the destination pixels are much smaller than the source pixels, and
  // that the range covered by the filter won't necessarily cover any source
  // pixel boundaries. Therefore, we use these clamped values (max of 1) for
  // some computations.
  float clampedScale = SkTMin(1.0f, scale);

  // This is how many source pixels from the center we need to count
  // to support the filtering function.
  float srcSupport = fBitmapFilter->width() / clampedScale;

  // Speed up the divisions below by turning them into multiplies.
  float invScale = 1.0f / scale;

  SkTArray<float> filterValues(64);
  SkTArray<short> fixedFilterValues(64);

  // Loop over all pixels in the output range. We will generate one set of
  // filter values for each one. Those values will tell us how to blend the
  // source pixels to compute the destination pixel.
  for (int destSubsetI = destSubsetLo; destSubsetI < destSubsetHi;
       destSubsetI++) {
    // Reset the arrays. We don't declare them inside so they can re-use the
    // same malloc-ed buffer.
    filterValues.reset();
    fixedFilterValues.reset();

    // This is the pixel in the source directly under the pixel in the dest.
    // Note that we base computations on the "center" of the pixels. To see
    // why, observe that the destination pixel at coordinates (0, 0) in a 5.0x
    // downscale should "cover" the pixels around the pixel with *its center*
    // at coordinates (2.5, 2.5) in the source, not those around (0, 0).
    // Hence we need to scale coordinates (0.5, 0.5), not (0, 0).
    float srcPixel = (static_cast<float>(destSubsetI) + 0.5f) * invScale;

    // Compute the (inclusive) range of source pixels the filter covers.
    int srcBegin = SkTMax(0, SkScalarFloorToInt(srcPixel - srcSupport));
    int srcEnd = SkTMin(srcSize - 1, SkScalarCeilToInt(srcPixel + srcSupport));

    // Compute the unnormalized filter value at each location of the source
    // it covers.
    float filterSum = 0.0f;  // Sum of the filter values for normalizing.
    for (int curFilterPixel = srcBegin; curFilterPixel <= srcEnd;
         curFilterPixel++) {
      // Distance from the center of the filter, this is the filter coordinate
      // in source space. We also need to consider the center of the pixel
      // when comparing distance against 'srcPixel'. In the 5x downscale
      // example used above the distance from the center of the filter to
      // the pixel with coordinates (2, 2) should be 0, because its center
      // is at (2.5, 2.5).
      float srcFilterDist =
          ((static_cast<float>(curFilterPixel) + 0.5f) - srcPixel);

      // Since the filter really exists in dest space, map it there.
      float destFilterDist = srcFilterDist * clampedScale;

      // Compute the filter value at that location.
      float filterValue = fBitmapFilter->evaluate(destFilterDist);
      filterValues.push_back(filterValue);

      filterSum += filterValue;
    }
    SkASSERT(!filterValues.empty());

    // The filter must be normalized so that we don't affect the brightness of
    // the image. Convert to normalized fixed point.
    short fixedSum = 0;
    for (int i = 0; i < filterValues.count(); i++) {
      short curFixed = output->FloatToFixed(filterValues[i] / filterSum);
      fixedSum += curFixed;
      fixedFilterValues.push_back(curFixed);
    }

    // The conversion to fixed point will leave some rounding errors, which
    // we add back in to avoid affecting the brightness of the image. We
    // arbitrarily add this to the center of the filter array (this won't always
    // be the center of the filter function since it could get clipped on the
    // edges, but it doesn't matter enough to worry about that case).
    short leftovers = output->FloatToFixed(1.0f) - fixedSum;
    fixedFilterValues[fixedFilterValues.count() / 2] += leftovers;

    // Now it's ready to go.
    output->AddFilter(srcBegin, &fixedFilterValues[0],
                      static_cast<int>(fixedFilterValues.count()));
  }

  if (convolveProcs->fApplySIMDPadding) {
    convolveProcs->fApplySIMDPadding( output );
  }
}
// Maps a "quality method" (GOOD/BETTER/BEST) onto a concrete "algorithm
// method" (a specific software filter). Algorithm methods pass through
// unchanged.
static SkBitmapScaler::ResizeMethod ResizeMethodToAlgorithmMethod(
    SkBitmapScaler::ResizeMethod method) {
  // Already an algorithm method? Nothing to convert.
  if (method >= SkBitmapScaler::RESIZE_FIRST_ALGORITHM_METHOD &&
      method <= SkBitmapScaler::RESIZE_LAST_ALGORITHM_METHOD) {
    return method;
  }

  // Pick the appropriate software filter for each resize quality.
  //
  // Users of RESIZE_GOOD are willing to trade a lot of quality to
  // get speed, so any of our "good" software filters will be
  // acceptable; we use a triangle.
  if (SkBitmapScaler::RESIZE_GOOD == method) {
    return SkBitmapScaler::RESIZE_TRIANGLE;
  }

  // Users of RESIZE_BETTER are willing to trade some quality in order
  // to improve performance, but are guaranteed not to devolve to a linear
  // resampling. In visual tests we see that Hamming-1 is not as good as
  // Lanczos-2, however it is about 40% faster and Lanczos-2 itself is
  // about 30% faster than Lanczos-3. The use of Hamming-1 has been deemed
  // an acceptable trade-off between quality and speed.
  if (SkBitmapScaler::RESIZE_BETTER == method) {
    return SkBitmapScaler::RESIZE_HAMMING;
  }

  // RESIZE_BEST (and anything unexpected): highest-quality software filter.
  return SkBitmapScaler::RESIZE_MITCHELL;
}
// static
SkBitmap SkBitmapScaler::Resize(const SkBitmap& source,
ResizeMethod method,
int destWidth, int destHeight,
const SkIRect& destSubset,
SkConvolutionProcs* convolveProcs,
SkBitmap::Allocator* allocator) {
// Ensure that the ResizeMethod enumeration is sound.
SkASSERT(((RESIZE_FIRST_QUALITY_METHOD <= method) &&
(method <= RESIZE_LAST_QUALITY_METHOD)) ||
((RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
(method <= RESIZE_LAST_ALGORITHM_METHOD)));
SkIRect dest = { 0, 0, destWidth, destHeight };
if (!dest.contains(destSubset)) {
SkErrorInternals::SetError( kInvalidArgument_SkError,
"Sorry, you passed me a bitmap resize "
" method I have never heard of: %d",
method );
}
// If the size of source or destination is 0, i.e. 0x0, 0xN or Nx0, just
// return empty.
if (source.width() < 1 || source.height() < 1 ||
destWidth < 1 || destHeight < 1) {
return SkBitmap();
}
method = ResizeMethodToAlgorithmMethod(method);
// Check that we deal with an "algorithm methods" from this point onward.
SkASSERT((SkBitmapScaler::RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
(method <= SkBitmapScaler::RESIZE_LAST_ALGORITHM_METHOD));
SkAutoLockPixels locker(source);
if (!source.readyToDraw() || source.config() != SkBitmap::kARGB_8888_Config)
return SkBitmap();
SkResizeFilter filter(method, source.width(), source.height(),
destWidth, destHeight, destSubset, convolveProcs);
// Get a source bitmap encompassing this touched area. We construct the
// offsets and row strides such that it looks like a new bitmap, while
// referring to the old data.
const unsigned char* sourceSubset =
reinterpret_cast<const unsigned char*>(source.getPixels());
// Convolve into the result.
SkBitmap result;
result.setConfig(SkBitmap::kARGB_8888_Config,
destSubset.width(), destSubset.height());
result.allocPixels(allocator, NULL);
if (!result.readyToDraw())
return SkBitmap();
BGRAConvolve2D(sourceSubset, static_cast<int>(source.rowBytes()),
!source.isOpaque(), filter.xFilter(), filter.yFilter(),
static_cast<int>(result.rowBytes()),
static_cast<unsigned char*>(result.getPixels()),
convolveProcs, true);
// Preserve the "opaque" flag for use as an optimization later.
result.setIsOpaque(source.isOpaque());
return result;
}
// static
SkBitmap SkBitmapScaler::Resize(const SkBitmap& source,
                                ResizeMethod method,
                                int destWidth, int destHeight,
                                SkConvolutionProcs* convolveProcs,
                                SkBitmap::Allocator* allocator) {
    // Convenience overload: resample the entire image by delegating to the
    // subset-based version with a subset that covers the whole destination.
    const SkIRect wholeImage = { 0, 0, destWidth, destHeight };
    return Resize(source, method, destWidth, destHeight, wholeImage,
                  convolveProcs, allocator);
}

106
src/core/SkBitmapScaler.h Normal file
View File

@ -0,0 +1,106 @@
/*
* Copyright 2013 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkBitmapScaler_DEFINED
#define SkBitmapScaler_DEFINED
#include "SkBitmap.h"
#include "SkConvolver.h"
/** \class SkBitmapScaler

    Provides the interface for high quality image resampling.
 */
class SK_API SkBitmapScaler {
public:
    enum ResizeMethod {
        // Quality Methods
        //
        // Those enumeration values express a desired quality/speed tradeoff.
        // They are translated into an algorithm-specific method that depends
        // on the capabilities (CPU, GPU) of the underlying platform.
        // It is possible for all three methods to be mapped to the same
        // algorithm on a given platform.

        // Good quality resizing. Fastest resizing with acceptable visual quality.
        // This is typically intended for use during interactive layouts
        // where slower platforms may want to trade image quality for large
        // increase in resizing performance.
        //
        // For example the resizing implementation may devolve to linear
        // filtering if this enables GPU acceleration to be used.
        //
        // Note that the underlying resizing method may be determined
        // on the fly based on the parameters for a given resize call.
        // For example an implementation using a GPU-based linear filter
        // in the common case may still use a higher-quality software-based
        // filter in cases where using the GPU would actually be slower - due
        // to too much latency - or impossible - due to image format or size
        // constraints.
        RESIZE_GOOD,

        // Medium quality resizing. Close to high quality resizing (better
        // than linear interpolation) with potentially some quality being
        // traded-off for additional speed compared to RESIZE_BEST.
        //
        // This is intended, for example, for generation of large thumbnails
        // (hundreds of pixels in each dimension) from large sources, where
        // a linear filter would produce too many artifacts but where
        // RESIZE_BEST might be too costly time-wise.
        RESIZE_BETTER,

        // High quality resizing. The algorithm is picked to favor image quality.
        RESIZE_BEST,

        //
        // Algorithm-specific enumerations
        //

        // Box filter. This is a weighted average of all of the pixels touching
        // the destination pixel. For enlargement, this is nearest neighbor.
        //
        // You probably don't want this, it is here for testing since it is easy to
        // compute. Use RESIZE_LANCZOS3 instead.
        RESIZE_BOX,
        RESIZE_TRIANGLE,
        RESIZE_LANCZOS3,
        RESIZE_HAMMING,
        RESIZE_MITCHELL,

        // enum aliases for first and last methods by algorithm or by quality.
        RESIZE_FIRST_QUALITY_METHOD = RESIZE_GOOD,
        RESIZE_LAST_QUALITY_METHOD = RESIZE_BEST,
        RESIZE_FIRST_ALGORITHM_METHOD = RESIZE_BOX,
        RESIZE_LAST_ALGORITHM_METHOD = RESIZE_MITCHELL,
    };

    // Resizes the given source bitmap using the specified resize method, so that
    // the entire image is (dest_width x dest_height) big. The dest_subset is the
    // rectangle in this destination image that should actually be returned.
    //
    // The output image will be (dest_subset.width(), dest_subset.height()). This
    // will save work if you do not need the entire bitmap.
    //
    // The destination subset must be smaller than the destination image.
    static SkBitmap Resize(const SkBitmap& source,
                           ResizeMethod method,
                           int dest_width, int dest_height,
                           const SkIRect& dest_subset,
                           SkConvolutionProcs *convolveProcs = NULL,
                           SkBitmap::Allocator* allocator = NULL);

    // Alternate version for resizing and returning the entire bitmap rather than
    // a subset.
    static SkBitmap Resize(const SkBitmap& source,
                           ResizeMethod method,
                           int dest_width, int dest_height,
                           SkConvolutionProcs *convolveProcs = NULL,
                           SkBitmap::Allocator* allocator = NULL);
};
#endif

473
src/core/SkConvolver.cpp Normal file
View File

@ -0,0 +1,473 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "SkConvolver.h"
#include "SkSize.h"
#include "SkTypes.h"
namespace {
// Clamps the argument into [0, 255] and returns it as an 8-bit unsigned
// value.
inline unsigned char ClampTo8(int a) {
    if (a < 0) {
        return 0;
    }
    if (a > 255) {
        return 255;
    }
    return static_cast<unsigned char>(a);
}
// Takes the value produced by accumulating the element-wise product of an
// image with a kernel and brings it back into byte range. The filter scaling
// factors are fixed point with kShiftBits bits of fractional part; this
// removes the fraction and clamps. |takeAbsolute| folds negative results
// (possible with filters that have negative lobes) into positive range first.
inline unsigned char BringBackTo8(int a, bool takeAbsolute) {
    const int scaled = a >> SkConvolutionFilter1D::kShiftBits;
    return ClampTo8(takeAbsolute ? abs(scaled) : scaled);
}
// Stores a list of rows in a circular buffer. The usage is you write into it
// by calling advanceRow(). It will keep track of which row in the buffer it
// should use next, and the total number of rows added.
class CircularRowBuffer {
public:
    // The number of pixels in each row is given in |destRowPixelWidth|
    // (each pixel is 4 bytes wide). The maximum number of rows needed in
    // the buffer is |maxYFilterSize| (we only need to store enough rows for
    // the biggest filter).
    //
    // We use the |firstInputRow| to compute the coordinates of all of the
    // following rows returned by advanceRow().
    CircularRowBuffer(int destRowPixelWidth, int maxYFilterSize,
                      int firstInputRow)
        : fRowByteWidth(destRowPixelWidth * 4),
          fNumRows(maxYFilterSize),
          fNextRow(0),
          fNextRowCoordinate(firstInputRow) {
        fBuffer.reset(fRowByteWidth * maxYFilterSize);
        fRowAddresses.reset(fNumRows);
    }

    // Moves to the next row in the buffer, returning a pointer to the beginning
    // of it. The caller is expected to fill the returned row with pixel data.
    unsigned char* advanceRow() {
        unsigned char* row = &fBuffer[fNextRow * fRowByteWidth];
        fNextRowCoordinate++;

        // Set the pointer to the next row to use, wrapping around if necessary.
        fNextRow++;
        if (fNextRow == fNumRows) {
            fNextRow = 0;
        }
        return row;
    }

    // Returns a pointer to an "unrolled" array of rows. These rows will start
    // at the y coordinate placed into |*firstRowIndex| and will continue in
    // order for the maximum number of rows in this circular buffer.
    //
    // The |firstRowIndex_| may be negative. This means the circular buffer
    // starts before the top of the image (it hasn't been filled yet).
    unsigned char* const* GetRowAddresses(int* firstRowIndex) {
        // Example for a 4-element circular buffer holding coords 6-9.
        //   Row 0   Coord 8
        //   Row 1   Coord 9
        //   Row 2   Coord 6  <- fNextRow = 2, fNextRowCoordinate = 10.
        //   Row 3   Coord 7
        //
        // The "next" row is also the first (lowest) coordinate. This computation
        // may yield a negative value, but that's OK, the math will work out
        // since the user of this buffer will compute the offset relative
        // to the firstRowIndex and the negative rows will never be used.
        *firstRowIndex = fNextRowCoordinate - fNumRows;

        int curRow = fNextRow;
        for (int i = 0; i < fNumRows; i++) {
            fRowAddresses[i] = &fBuffer[curRow * fRowByteWidth];

            // Advance to the next row, wrapping if necessary.
            curRow++;
            if (curRow == fNumRows) {
                curRow = 0;
            }
        }
        return &fRowAddresses[0];
    }

private:
    // The buffer storing the rows. They are packed, each one fRowByteWidth.
    SkTArray<unsigned char> fBuffer;

    // Number of bytes per row in the |fBuffer|.
    int fRowByteWidth;

    // The number of rows available in the buffer.
    int fNumRows;

    // The next row index we should write into. This wraps around as the
    // circular buffer is used.
    int fNextRow;

    // The y coordinate of the |fNextRow|. This is incremented each time a
    // new row is appended and does not wrap.
    int fNextRowCoordinate;

    // Buffer used by GetRowAddresses().
    SkTArray<unsigned char*> fRowAddresses;
};
// Convolves horizontally along a single row. The row data is given in
// |srcData| and continues for the numValues() of the filter.
template<bool hasAlpha>
void ConvolveHorizontally(const unsigned char* srcData,
const SkConvolutionFilter1D& filter,
unsigned char* outRow) {
// Loop over each pixel on this row in the output image.
int numValues = filter.numValues();
for (int outX = 0; outX < numValues; outX++) {
// Get the filter that determines the current output pixel.
int filterOffset, filterLength;
const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
filter.FilterForValue(outX, &filterOffset, &filterLength);
// Compute the first pixel in this row that the filter affects. It will
// touch |filterLength| pixels (4 bytes each) after this.
const unsigned char* rowToFilter = &srcData[filterOffset * 4];
// Apply the filter to the row to get the destination pixel in |accum|.
int accum[4] = {0};
for (int filterX = 0; filterX < filterLength; filterX++) {
SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterX];
accum[0] += curFilter * rowToFilter[filterX * 4 + 0];
accum[1] += curFilter * rowToFilter[filterX * 4 + 1];
accum[2] += curFilter * rowToFilter[filterX * 4 + 2];
if (hasAlpha) {
accum[3] += curFilter * rowToFilter[filterX * 4 + 3];
}
}
// Bring this value back in range. All of the filter scaling factors
// are in fixed point with kShiftBits bits of fractional part.
accum[0] >>= SkConvolutionFilter1D::kShiftBits;
accum[1] >>= SkConvolutionFilter1D::kShiftBits;
accum[2] >>= SkConvolutionFilter1D::kShiftBits;
if (hasAlpha) {
accum[3] >>= SkConvolutionFilter1D::kShiftBits;
}
// Store the new pixel.
outRow[outX * 4 + 0] = ClampTo8(accum[0]);
outRow[outX * 4 + 1] = ClampTo8(accum[1]);
outRow[outX * 4 + 2] = ClampTo8(accum[2]);
if (hasAlpha) {
outRow[outX * 4 + 3] = ClampTo8(accum[3]);
}
}
}
// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |sourceDataRows| array, with each row
// being |pixelWidth| wide.
//
// The output must have room for |pixelWidth * 4| bytes.
//
// NOTE: the alpha clamp at the bottom reads back the R/G/B bytes that were
// just written into |outRow|, so the store order (colors first, alpha last)
// is load-bearing.
template<bool hasAlpha>
void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
                        int filterLength,
                        unsigned char* const* sourceDataRows,
                        int pixelWidth,
                        unsigned char* outRow) {
    // We go through each column in the output and do a vertical convolution,
    // generating one output pixel each time.
    for (int outX = 0; outX < pixelWidth; outX++) {
        // Compute the number of bytes over in each row that the current column
        // we're convolving starts at. The pixel will cover the next 4 bytes.
        int byteOffset = outX * 4;

        // Apply the filter to one column of pixels, accumulating in fixed
        // point.
        int accum[4] = {0};
        for (int filterY = 0; filterY < filterLength; filterY++) {
            SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterY];
            accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0];
            accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1];
            accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2];
            if (hasAlpha) {
                accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3];
            }
        }

        // Bring this value back in range. All of the filter scaling factors
        // are in fixed point with kShiftBits bits of precision.
        accum[0] >>= SkConvolutionFilter1D::kShiftBits;
        accum[1] >>= SkConvolutionFilter1D::kShiftBits;
        accum[2] >>= SkConvolutionFilter1D::kShiftBits;
        if (hasAlpha) {
            accum[3] >>= SkConvolutionFilter1D::kShiftBits;
        }

        // Store the new pixel.
        outRow[byteOffset + 0] = ClampTo8(accum[0]);
        outRow[byteOffset + 1] = ClampTo8(accum[1]);
        outRow[byteOffset + 2] = ClampTo8(accum[2]);
        if (hasAlpha) {
            unsigned char alpha = ClampTo8(accum[3]);

            // Make sure the alpha channel doesn't come out smaller than any of the
            // color channels. We use premultipled alpha channels, so this should
            // never happen, but rounding errors will cause this from time to time.
            // These "impossible" colors will cause overflows (and hence random pixel
            // values) when the resulting bitmap is drawn to the screen.
            //
            // We only need to do this when generating the final output row (here).
            int maxColorChannel = SkTMax(outRow[byteOffset + 0],
                                         SkTMax(outRow[byteOffset + 1],
                                                outRow[byteOffset + 2]));
            if (alpha < maxColorChannel) {
                outRow[byteOffset + 3] = maxColorChannel;
            } else {
                outRow[byteOffset + 3] = alpha;
            }
        } else {
            // No alpha channel, the image is opaque.
            outRow[byteOffset + 3] = 0xff;
        }
    }
}
// Non-template entry point: selects the ConvolveVertically instantiation
// matching the source's alpha mode, so opaque images skip all alpha work in
// the per-pixel loop.
void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
                        int filterLength,
                        unsigned char* const* sourceDataRows,
                        int pixelWidth,
                        unsigned char* outRow,
                        bool sourceHasAlpha) {
    if (sourceHasAlpha) {
        ConvolveVertically<true>(filterValues, filterLength, sourceDataRows,
                                 pixelWidth, outRow);
        return;
    }
    ConvolveVertically<false>(filterValues, filterLength, sourceDataRows,
                              pixelWidth, outRow);
}
} // namespace
// SkConvolutionFilter1D ---------------------------------------------------------

// Starts out empty; fMaxFilter tracks the widest (trimmed) filter added so
// far and is updated by AddFilter().
SkConvolutionFilter1D::SkConvolutionFilter1D()
: fMaxFilter(0) {
}

SkConvolutionFilter1D::~SkConvolutionFilter1D() {
}
// Converts the floating point filter values to fixed point and forwards to
// the ConvolutionFixed overload of AddFilter().
void SkConvolutionFilter1D::AddFilter(int filterOffset,
                                      const float* filterValues,
                                      int filterLength) {
    SkASSERT(filterLength > 0);

    SkTArray<ConvolutionFixed> fixedValues;
    // NOTE(review): reset(filterLength) followed by push_back looks
    // suspicious. If SkTArray::reset(n) *creates* n elements (rather than
    // merely reserving capacity), the converted values end up appended after
    // n default elements and &fixedValues[0] below would pass the wrong
    // data. The Chromium original used std::vector::reserve() here -- confirm
    // SkTArray::reset() semantics match that intent.
    fixedValues.reset(filterLength);

    for (int i = 0; i < filterLength; ++i) {
        fixedValues.push_back(FloatToFixed(filterValues[i]));
    }

    AddFilter(filterOffset, &fixedValues[0], filterLength);
}
// Appends one filter (the weights for one output value) to the flat
// fFilterValues storage and records its bookkeeping in fFilters.
void SkConvolutionFilter1D::AddFilter(int filterOffset,
                                      const ConvolutionFixed* filterValues,
                                      int filterLength) {
    // It is common for leading/trailing filter values to be zeros. In such
    // cases it is beneficial to only store the central factors.
    // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on
    // a 1080p image this optimization gives a ~10% speed improvement.
    int filterSize = filterLength;
    int firstNonZero = 0;
    while (firstNonZero < filterLength && filterValues[firstNonZero] == 0) {
        firstNonZero++;
    }

    if (firstNonZero < filterLength) {
        // Here we have at least one non-zero factor.
        int lastNonZero = filterLength - 1;
        while (lastNonZero >= 0 && filterValues[lastNonZero] == 0) {
            lastNonZero--;
        }

        // Shift the offset past the trimmed leading zeros and store only the
        // non-zero central span.
        filterOffset += firstNonZero;
        filterLength = lastNonZero + 1 - firstNonZero;
        SkASSERT(filterLength > 0);

        for (int i = firstNonZero; i <= lastNonZero; i++) {
            fFilterValues.push_back(filterValues[i]);
        }
    } else {
        // Here all the factors were zeroes.
        filterLength = 0;
    }

    FilterInstance instance;

    // We pushed filterLength elements onto fFilterValues, so the data for
    // this instance starts that many entries back from the current count.
    // (This must be computed *after* the pushes above.)
    instance.fDataLocation = (static_cast<int>(fFilterValues.count()) -
                              filterLength);
    instance.fOffset = filterOffset;
    instance.fTrimmedLength = filterLength;
    instance.fLength = filterSize;
    fFilters.push_back(instance);

    fMaxFilter = SkTMax(fMaxFilter, filterLength);
}
// Returns the trimmed values of filter 0 (presumed to be the only one),
// reporting its offset, trimmed length, and originally specified length via
// the out-parameters. Returns NULL when the filter was trimmed down to
// nothing (every tap was zero).
const SkConvolutionFilter1D::ConvolutionFixed* SkConvolutionFilter1D::GetSingleFilter(
    int* specifiedFilterlength,
    int* filterOffset,
    int* filterLength) const {
    const FilterInstance& instance = fFilters[0];
    *filterOffset = instance.fOffset;
    *filterLength = instance.fTrimmedLength;
    *specifiedFilterlength = instance.fLength;

    return (instance.fTrimmedLength == 0)
               ? NULL
               : &fFilterValues[instance.fDataLocation];
}
// Performs the 2D convolution: each input row is convolved horizontally into
// a circular buffer of intermediate rows, and vertical convolution is run as
// soon as enough intermediate rows are available. This avoids materializing
// the whole intermediate image and helps cache coherency.
//
// NOTE(review): |convolveProcs| is dereferenced unconditionally below, so it
// must be non-NULL (its function-pointer members may be NULL to select the
// portable C++ paths). |useSimdIfPossible| is not referenced in this body;
// SIMD usage is governed entirely by the convolveProcs pointers -- confirm
// whether the parameter is intentionally vestigial.
void BGRAConvolve2D(const unsigned char* sourceData,
                    int sourceByteRowStride,
                    bool sourceHasAlpha,
                    const SkConvolutionFilter1D& filterX,
                    const SkConvolutionFilter1D& filterY,
                    int outputByteRowStride,
                    unsigned char* output,
                    SkConvolutionProcs* convolveProcs,
                    bool useSimdIfPossible) {

    int maxYFilterSize = filterY.maxFilter();

    // The next row in the input that we will generate a horizontally
    // convolved row for. If the filter doesn't start at the beginning of the
    // image (this is the case when we are only resizing a subset), then we
    // don't want to generate any output rows before that. Compute the starting
    // row for convolution as the first pixel for the first vertical filter.
    int filterOffset, filterLength;
    const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
        filterY.FilterForValue(0, &filterOffset, &filterLength);
    int nextXRow = filterOffset;

    // We loop over each row in the input doing a horizontal convolution. This
    // will result in a horizontally convolved image. We write the results into
    // a circular buffer of convolved rows and do vertical convolution as rows
    // are available. This prevents us from having to store the entire
    // intermediate image and helps cache coherency.
    // We will need four extra rows to allow horizontal convolution could be done
    // simultaneously. We also pad each row in row buffer to be aligned-up to
    // 16 bytes.
    // TODO(jiesun): We do not use aligned load from row buffer in vertical
    // convolution pass yet. Somehow Windows does not like it.
    int rowBufferWidth = (filterX.numValues() + 15) & ~0xF;
    int rowBufferHeight = maxYFilterSize +
        (convolveProcs->fConvolve4RowsHorizontally ? 4 : 0);
    CircularRowBuffer rowBuffer(rowBufferWidth,
                                rowBufferHeight,
                                filterOffset);

    // Loop over every possible output row, processing just enough horizontal
    // convolutions to run each subsequent vertical convolution.
    SkASSERT(outputByteRowStride >= filterX.numValues() * 4);
    int numOutputRows = filterY.numValues();

    // We need to check which is the last line to convolve before we advance 4
    // lines in one iteration.
    int lastFilterOffset, lastFilterLength;

    // SSE2 can access up to 3 extra pixels past the end of the
    // buffer. At the bottom of the image, we have to be careful
    // not to access data past the end of the buffer. Normally
    // we fall back to the C++ implementation for the last row.
    // If the last row is less than 3 pixels wide, we may have to fall
    // back to the C++ version for more rows. Compute how many
    // rows we need to avoid the SSE implementation for here.
    filterX.FilterForValue(filterX.numValues() - 1, &lastFilterOffset,
                           &lastFilterLength);
    // NOTE(review): assumes lastFilterOffset + lastFilterLength > 0; if the
    // last X filter were trimmed to zero taps at offset 0 this divides by
    // zero -- TODO confirm AddFilter guarantees this cannot happen here.
    int avoidSimdRows = 1 + convolveProcs->fExtraHorizontalReads /
        (lastFilterOffset + lastFilterLength);

    filterY.FilterForValue(numOutputRows - 1, &lastFilterOffset,
                           &lastFilterLength);

    for (int outY = 0; outY < numOutputRows; outY++) {
        filterValues = filterY.FilterForValue(outY,
                                              &filterOffset, &filterLength);

        // Generate output rows until we have enough to run the current filter.
        while (nextXRow < filterOffset + filterLength) {
            if (convolveProcs->fConvolve4RowsHorizontally &&
                nextXRow + 3 < lastFilterOffset + lastFilterLength -
                avoidSimdRows) {
                // SIMD fast path: convolve four input rows at once.
                const unsigned char* src[4];
                unsigned char* outRow[4];
                for (int i = 0; i < 4; ++i) {
                    src[i] = &sourceData[(nextXRow + i) * sourceByteRowStride];
                    outRow[i] = rowBuffer.advanceRow();
                }
                convolveProcs->fConvolve4RowsHorizontally(src, filterX, outRow);
                nextXRow += 4;
            } else {
                // Check if we need to avoid SSE2 for this row.
                if (convolveProcs->fConvolveHorizontally &&
                    nextXRow < lastFilterOffset + lastFilterLength -
                    avoidSimdRows) {
                    convolveProcs->fConvolveHorizontally(
                        &sourceData[nextXRow * sourceByteRowStride],
                        filterX, rowBuffer.advanceRow(), sourceHasAlpha);
                } else {
                    // Portable fallback (also used near the image edge where
                    // SIMD would over-read).
                    if (sourceHasAlpha) {
                        ConvolveHorizontally<true>(
                            &sourceData[nextXRow * sourceByteRowStride],
                            filterX, rowBuffer.advanceRow());
                    } else {
                        ConvolveHorizontally<false>(
                            &sourceData[nextXRow * sourceByteRowStride],
                            filterX, rowBuffer.advanceRow());
                    }
                }
                nextXRow++;
            }
        }

        // Compute where in the output image this row of final data will go.
        unsigned char* curOutputRow = &output[outY * outputByteRowStride];

        // Get the list of rows that the circular buffer has, in order.
        int firstRowInCircularBuffer;
        unsigned char* const* rowsToConvolve =
            rowBuffer.GetRowAddresses(&firstRowInCircularBuffer);

        // Now compute the start of the subset of those rows that the filter
        // needs.
        unsigned char* const* firstRowForFilter =
            &rowsToConvolve[filterOffset - firstRowInCircularBuffer];

        if (convolveProcs->fConvolveVertically) {
            convolveProcs->fConvolveVertically(filterValues, filterLength,
                                               firstRowForFilter,
                                               filterX.numValues(), curOutputRow,
                                               sourceHasAlpha);
        } else {
            ConvolveVertically(filterValues, filterLength,
                               firstRowForFilter,
                               filterX.numValues(), curOutputRow,
                               sourceHasAlpha);
        }
    }
}

203
src/core/SkConvolver.h Normal file
View File

@ -0,0 +1,203 @@
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SK_CONVOLVER_H
#define SK_CONVOLVER_H
#include "SkSize.h"
#include "SkTypes.h"
#include "SkTArray.h"
// avoid confusion with Mac OS X's math library (Carbon)
#if defined(__APPLE__)
#undef FloatToConvolutionFixed
#undef ConvolutionFixedToFloat
#endif
// Represents a filter in one dimension. Each output pixel has one entry in this
// object for the filter values contributing to it. You build up the filter
// list by calling AddFilter for each output pixel (in order).
//
// We do 2-dimensional convolution by first convolving each row by one
// SkConvolutionFilter1D, then convolving each column by another one.
//
// Entries are stored in ConvolutionFixed point, shifted left by kShiftBits.
class SkConvolutionFilter1D {
public:
    typedef short ConvolutionFixed;

    // The number of bits that ConvolutionFixed point values are shifted by.
    enum { kShiftBits = 14 };

    SK_API SkConvolutionFilter1D();
    SK_API ~SkConvolutionFilter1D();

    // Convert between floating point and our ConvolutionFixed point representation.
    static ConvolutionFixed FloatToFixed(float f) {
        return static_cast<ConvolutionFixed>(f * (1 << kShiftBits));
    }
    static unsigned char FixedToChar(ConvolutionFixed x) {
        return static_cast<unsigned char>(x >> kShiftBits);
    }
    static float FixedToFloat(ConvolutionFixed x) {
        // The cast relies on ConvolutionFixed being a short, implying that on
        // the platforms we care about all (16) bits will fit into
        // the mantissa of a (32-bit) float.
        SK_COMPILE_ASSERT(sizeof(ConvolutionFixed) == 2, ConvolutionFixed_type_should_fit_in_float_mantissa);
        float raw = static_cast<float>(x);
        return ldexpf(raw, -kShiftBits);
    }

    // Returns the maximum pixel span of a filter.
    int maxFilter() const { return fMaxFilter; }

    // Returns the number of filters in this filter. This is the dimension of the
    // output image.
    int numValues() const { return static_cast<int>(fFilters.count()); }

    // Appends the given list of scaling values for generating a given output
    // pixel. |filterOffset| is the distance from the edge of the image to where
    // the scaling factors start. The scaling factors apply to the source pixels
    // starting from this position, and going for the next |filterLength| pixels.
    //
    // You will probably want to make sure your input is normalized (that is,
    // all entries in |filterValues| sum to one) to prevent affecting the overall
    // brightness of the image.
    //
    // The filterLength must be > 0.
    //
    // This version will automatically convert your input to ConvolutionFixed point.
    SK_API void AddFilter(int filterOffset,
                          const float* filterValues,
                          int filterLength);

    // Same as the above version, but the input is already ConvolutionFixed point.
    void AddFilter(int filterOffset,
                   const ConvolutionFixed* filterValues,
                   int filterLength);

    // Retrieves a filter for the given |valueOffset|, a position in the output
    // image in the direction we're convolving. The offset and length of the
    // filter values are put into the corresponding out arguments (see AddFilter
    // above for what these mean), and a pointer to the first scaling factor is
    // returned. There will be |filterLength| values in this array.
    inline const ConvolutionFixed* FilterForValue(int valueOffset,
                                                  int* filterOffset,
                                                  int* filterLength) const {
        const FilterInstance& filter = fFilters[valueOffset];
        *filterOffset = filter.fOffset;
        *filterLength = filter.fTrimmedLength;
        if (filter.fTrimmedLength == 0) {
            return NULL;
        }
        return &fFilterValues[filter.fDataLocation];
    }

    // Retrieves the filter for the offset 0, presumed to be the one and only.
    // The offset and length of the filter values are put into the corresponding
    // out arguments (see AddFilter). Note that |filterLength| and
    // |specifiedFilterLength| may be different if leading/trailing zeros of the
    // original floating point form were clipped.
    // There will be |filterLength| values in the return array.
    // Returns NULL if the filter is 0-length (for instance when all floating
    // point values passed to AddFilter were clipped to 0).
    SK_API const ConvolutionFixed* GetSingleFilter(int* specifiedFilterLength,
                                                   int* filterOffset,
                                                   int* filterLength) const;

    // Add another value to the fFilterValues array -- useful for
    // SIMD padding which happens outside of this class.
    void addFilterValue( ConvolutionFixed val ) {
        fFilterValues.push_back( val );
    }

private:
    struct FilterInstance {
        // Offset within filterValues for this instance of the filter.
        int fDataLocation;

        // Distance from the left of the filter to the center. IN PIXELS
        int fOffset;

        // Number of values in this filter instance.
        int fTrimmedLength;

        // Filter length as specified. Note that this may be different from
        // 'fTrimmedLength' if leading/trailing zeros of the original floating
        // point form were clipped differently on each tail.
        int fLength;
    };

    // Stores the information for each filter added to this class.
    SkTArray<FilterInstance> fFilters;

    // We store all the filter values in this flat list, indexed by
    // |FilterInstance.fDataLocation| to avoid the mallocs required for storing
    // each one separately.
    SkTArray<ConvolutionFixed> fFilterValues;

    // The maximum size of any filter we've added.
    int fMaxFilter;
};
typedef void (*SkConvolveVertically_pointer)(
const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
int filterLength,
unsigned char* const* sourceDataRows,
int pixelWidth,
unsigned char* outRow,
bool hasAlpha);
typedef void (*SkConvolve4RowsHorizontally_pointer)(
const unsigned char* srcData[4],
const SkConvolutionFilter1D& filter,
unsigned char* outRow[4]);
typedef void (*SkConvolveHorizontally_pointer)(
const unsigned char* srcData,
const SkConvolutionFilter1D& filter,
unsigned char* outRow,
bool hasAlpha);
typedef void (*SkConvolveFilterPadding_pointer)(
SkConvolutionFilter1D* filter);
// Bundle of optional platform-specific (e.g. SIMD) replacements for the
// portable convolution routines. A NULL function pointer selects the
// portable C++ implementation for that stage.
struct SkConvolutionProcs {
    // This is how many extra pixels may be read by the
    // convolve*horizontally functions.
    int fExtraHorizontalReads;
    // Optional accelerated vertical convolution for one output row.
    SkConvolveVertically_pointer fConvolveVertically;
    // Optional accelerated horizontal convolution of four rows at once.
    SkConvolve4RowsHorizontally_pointer fConvolve4RowsHorizontally;
    // Optional accelerated horizontal convolution of a single row.
    SkConvolveHorizontally_pointer fConvolveHorizontally;
    // Optional hook to pad filter data for SIMD-width loads.
    SkConvolveFilterPadding_pointer fApplySIMDPadding;
};
// Does a two-dimensional convolution on the given source image.
//
// It is assumed the source pixel offsets referenced in the input filters
// reference only valid pixels, so the source image size is not required. Each
// row of the source image starts |sourceByteRowStride| after the previous
// one (this allows you to have rows with some padding at the end).
//
// The result will be put into the given output buffer. The destination image
// size will be xfilter.numValues() * yfilter.numValues() pixels. It will be
// in rows of exactly xfilter.numValues() * 4 bytes.
//
// |sourceHasAlpha| is a hint that allows us to avoid doing computations on
// the alpha channel if the image is opaque. If you don't know, set this to
// true and it will work properly, but setting this to false will be a few
// percent faster if you know the image is opaque.
//
// The layout in memory is assumed to be 4-bytes per pixel in B-G-R-A order
// (this is ARGB when loaded into 32-bit words on a little-endian machine).
SK_API void BGRAConvolve2D(const unsigned char* sourceData,
int sourceByteRowStride,
bool sourceHasAlpha,
const SkConvolutionFilter1D& xfilter,
const SkConvolutionFilter1D& yfilter,
int outputByteRowStride,
unsigned char* output,
SkConvolutionProcs* convolveProcs,
bool useSimdIfPossible);
#endif // SK_CONVOLVER_H

View File

@ -11,6 +11,7 @@
#include "SkColorPriv.h" #include "SkColorPriv.h"
#include "SkUnPreMultiply.h" #include "SkUnPreMultiply.h"
#include "SkShader.h" #include "SkShader.h"
#include "SkConvolver.h"
#include "SkBitmapFilter_opts_SSE2.h" #include "SkBitmapFilter_opts_SSE2.h"
@ -180,3 +181,456 @@ void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y,
} }
} }
// Convolves horizontally along a single row. The row data is given in
// |src_data| and continues for the num_values() of the filter.
void convolveHorizontally_SSE2(const unsigned char* src_data,
const SkConvolutionFilter1D& filter,
unsigned char* out_row,
bool /*has_alpha*/) {
int num_values = filter.numValues();
int filter_offset, filter_length;
__m128i zero = _mm_setzero_si128();
__m128i mask[4];
// |mask| will be used to decimate all extra filter coefficients that are
// loaded by SIMD when |filter_length| is not divisible by 4.
// mask[0] is not used in following algorithm.
mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
// Output one pixel each iteration, calculating all channels (RGBA) together.
for (int out_x = 0; out_x < num_values; out_x++) {
const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
filter.FilterForValue(out_x, &filter_offset, &filter_length);
__m128i accum = _mm_setzero_si128();
// Compute the first pixel in this row that the filter affects. It will
// touch |filter_length| pixels (4 bytes each) after this.
const __m128i* row_to_filter =
reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
// We will load and accumulate with four coefficients per iteration.
for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
// Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
__m128i coeff, coeff16;
// [16] xx xx xx xx c3 c2 c1 c0
coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
// [16] xx xx xx xx c1 c1 c0 c0
coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
// [16] c1 c1 c1 c1 c0 c0 c0 c0
coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
// Load four pixels => unpack the first two pixels to 16 bits =>
// multiply with coefficients => accumulate the convolution result.
// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
__m128i src8 = _mm_loadu_si128(row_to_filter);
// [16] a1 b1 g1 r1 a0 b0 g0 r0
__m128i src16 = _mm_unpacklo_epi8(src8, zero);
__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
__m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
// [32] a0*c0 b0*c0 g0*c0 r0*c0
__m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
accum = _mm_add_epi32(accum, t);
// [32] a1*c1 b1*c1 g1*c1 r1*c1
t = _mm_unpackhi_epi16(mul_lo, mul_hi);
accum = _mm_add_epi32(accum, t);
// Duplicate 3rd and 4th coefficients for all channels =>
// unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
// => accumulate the convolution results.
// [16] xx xx xx xx c3 c3 c2 c2
coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
// [16] c3 c3 c3 c3 c2 c2 c2 c2
coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
// [16] a3 g3 b3 r3 a2 g2 b2 r2
src16 = _mm_unpackhi_epi8(src8, zero);
mul_hi = _mm_mulhi_epi16(src16, coeff16);
mul_lo = _mm_mullo_epi16(src16, coeff16);
// [32] a2*c2 b2*c2 g2*c2 r2*c2
t = _mm_unpacklo_epi16(mul_lo, mul_hi);
accum = _mm_add_epi32(accum, t);
// [32] a3*c3 b3*c3 g3*c3 r3*c3
t = _mm_unpackhi_epi16(mul_lo, mul_hi);
accum = _mm_add_epi32(accum, t);
// Advance the pixel and coefficients pointers.
row_to_filter += 1;
filter_values += 4;
}
// When |filter_length| is not divisible by 4, we need to decimate some of
// the filter coefficient that was loaded incorrectly to zero; Other than
// that the algorithm is same with above, exceot that the 4th pixel will be
// always absent.
int r = filter_length&3;
if (r) {
// Note: filter_values must be padded to align_up(filter_offset, 8).
__m128i coeff, coeff16;
coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
// Mask out extra filter taps.
coeff = _mm_and_si128(coeff, mask[r]);
coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
// Note: line buffer must be padded to align_up(filter_offset, 16).
// We resolve this by use C-version for the last horizontal line.
__m128i src8 = _mm_loadu_si128(row_to_filter);
__m128i src16 = _mm_unpacklo_epi8(src8, zero);
__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
__m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
__m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
accum = _mm_add_epi32(accum, t);
t = _mm_unpackhi_epi16(mul_lo, mul_hi);
accum = _mm_add_epi32(accum, t);
src16 = _mm_unpackhi_epi8(src8, zero);
coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
mul_hi = _mm_mulhi_epi16(src16, coeff16);
mul_lo = _mm_mullo_epi16(src16, coeff16);
t = _mm_unpacklo_epi16(mul_lo, mul_hi);
accum = _mm_add_epi32(accum, t);
}
// Shift right for fixed point implementation.
accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
// Packing 32 bits |accum| to 16 bits per channel (signed saturation).
accum = _mm_packs_epi32(accum, zero);
// Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
accum = _mm_packus_epi16(accum, zero);
// Store the pixel value of 32 bits.
*(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
out_row += 4;
}
}
// Convolves horizontally along four rows at once. The row data is given in
// |src_data| and continues for the numValues() of the filter; one output
// pixel (4 bytes) per input position is written to each of the four
// |out_row| buffers, which are advanced in place.
// The algorithm is almost the same as |convolveHorizontally_SSE2|; please
// refer to that function for detailed comments.
void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
                                    const SkConvolutionFilter1D& filter,
                                    unsigned char* out_row[4]) {
    int num_values = filter.numValues();

    int filter_offset, filter_length;
    __m128i zero = _mm_setzero_si128();
    __m128i mask[4];
    // |mask| will be used to decimate all extra filter coefficients that are
    // loaded by SIMD when |filter_length| is not divisible by 4.
    // mask[0] is not used in the following algorithm.
    mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
    mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
    mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

    // Output one pixel each iteration, calculating all channels (RGBA) together.
    for (int out_x = 0; out_x < num_values; out_x++) {
        const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
            filter.FilterForValue(out_x, &filter_offset, &filter_length);

        // Four pixels in a column per iteration; 32 bits per RGBA channel.
        __m128i accum0 = _mm_setzero_si128();
        __m128i accum1 = _mm_setzero_si128();
        __m128i accum2 = _mm_setzero_si128();
        __m128i accum3 = _mm_setzero_si128();
        // Byte offset of the first source pixel this filter touches.
        int start = (filter_offset << 2);
        // We will load and accumulate with four coefficients per iteration.
        for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
            __m128i coeff, coeff16lo, coeff16hi;
            // [16] xx xx xx xx c3 c2 c1 c0
            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
            // [16] xx xx xx xx c1 c1 c0 c0
            coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
            // [16] c1 c1 c1 c1 c0 c0 c0 c0
            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
            // [16] xx xx xx xx c3 c3 c2 c2
            coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
            // [16] c3 c3 c3 c3 c2 c2 c2 c2
            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

            __m128i src8, src16, mul_hi, mul_lo, t;

// Loads four source pixels from |src|, multiplies the low/high pixel pairs
// by the duplicated low/high coefficients, and accumulates the 32-bit
// per-channel products into |accum|.
#define ITERATION(src, accum)                                      \
    src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \
    src16 = _mm_unpacklo_epi8(src8, zero);                         \
    mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                    \
    mul_lo = _mm_mullo_epi16(src16, coeff16lo);                    \
    t = _mm_unpacklo_epi16(mul_lo, mul_hi);                        \
    accum = _mm_add_epi32(accum, t);                               \
    t = _mm_unpackhi_epi16(mul_lo, mul_hi);                        \
    accum = _mm_add_epi32(accum, t);                               \
    src16 = _mm_unpackhi_epi8(src8, zero);                         \
    mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                    \
    mul_lo = _mm_mullo_epi16(src16, coeff16hi);                    \
    t = _mm_unpacklo_epi16(mul_lo, mul_hi);                        \
    accum = _mm_add_epi32(accum, t);                               \
    t = _mm_unpackhi_epi16(mul_lo, mul_hi);                        \
    accum = _mm_add_epi32(accum, t)

            ITERATION(src_data[0] + start, accum0);
            ITERATION(src_data[1] + start, accum1);
            ITERATION(src_data[2] + start, accum2);
            ITERATION(src_data[3] + start, accum3);

            start += 16;
            filter_values += 4;
        }

        // Handle the 1-3 leftover taps when |filter_length| is not a
        // multiple of 4, masking the over-read coefficients to zero.
        int r = filter_length & 3;
        if (r) {
            // Note: filter_values must be padded to align_up(filter_offset, 8);
            __m128i coeff;
            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
            // Mask out extra filter taps.
            coeff = _mm_and_si128(coeff, mask[r]);

            __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
            /* c1 c1 c1 c1 c0 c0 c0 c0 */
            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
            __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

            __m128i src8, src16, mul_hi, mul_lo, t;

            ITERATION(src_data[0] + start, accum0);
            ITERATION(src_data[1] + start, accum1);
            ITERATION(src_data[2] + start, accum2);
            ITERATION(src_data[3] + start, accum3);
        }
// Keep the helper macro local to this function so it cannot collide with
// other code in the translation unit.
#undef ITERATION

        // Shift right for the fixed-point representation, then saturate
        // 32 -> 16 bits (signed) and 16 -> 8 bits (unsigned) per channel.
        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
        accum0 = _mm_packs_epi32(accum0, zero);
        accum0 = _mm_packus_epi16(accum0, zero);
        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
        accum1 = _mm_packs_epi32(accum1, zero);
        accum1 = _mm_packus_epi16(accum1, zero);
        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
        accum2 = _mm_packs_epi32(accum2, zero);
        accum2 = _mm_packus_epi16(accum2, zero);
        accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
        accum3 = _mm_packs_epi32(accum3, zero);
        accum3 = _mm_packus_epi16(accum3, zero);

        // Store one 32-bit pixel per row and advance the output pointers.
        *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
        *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
        *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
        *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);

        out_row[0] += 4;
        out_row[1] += 4;
        out_row[2] += 4;
        out_row[3] += 4;
    }
}
// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |source_data_rows| array, with each row
// being |pixel_width| wide.
//
// The output must have room for |pixel_width * 4| bytes.
//
// When |has_alpha| is true the alpha channel is clamped to be at least the
// maximum of the color channels (keeping the pixel valid premultiplied
// color); when false, alpha is forced to 0xFF.
template<bool has_alpha>
void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
int filter_length,
unsigned char* const* source_data_rows,
int pixel_width,
unsigned char* out_row) {
// |width| is the largest multiple of 4 pixels that fits in |pixel_width|;
// the main loop below handles these, the tail loop handles the remainder.
int width = pixel_width & ~3;

__m128i zero = _mm_setzero_si128();
__m128i accum0, accum1, accum2, accum3, coeff16;
const __m128i* src;
// Output four pixels per iteration (16 bytes).
for (int out_x = 0; out_x < width; out_x += 4) {

// Accumulated result for each pixel. 32 bits per RGBA channel.
accum0 = _mm_setzero_si128();
accum1 = _mm_setzero_si128();
accum2 = _mm_setzero_si128();
accum3 = _mm_setzero_si128();

// Convolve with one filter coefficient per iteration.
for (int filter_y = 0; filter_y < filter_length; filter_y++) {

// Duplicate the filter coefficient 8 times.
// [16] cj cj cj cj cj cj cj cj
coeff16 = _mm_set1_epi16(filter_values[filter_y]);

// Load four pixels (16 bytes) together.
// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
src = reinterpret_cast<const __m128i*>(
&source_data_rows[filter_y][out_x << 2]);
__m128i src8 = _mm_loadu_si128(src);

// Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channel =>
// multiply with current coefficient => accumulate the result.
// [16] a1 b1 g1 r1 a0 b0 g0 r0
__m128i src16 = _mm_unpacklo_epi8(src8, zero);
// mulhi/mullo pairs reconstruct the full 32-bit product of the 16-bit
// fixed-point multiply when interleaved below.
__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
__m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
// [32] a0 b0 g0 r0
__m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
accum0 = _mm_add_epi32(accum0, t);
// [32] a1 b1 g1 r1
t = _mm_unpackhi_epi16(mul_lo, mul_hi);
accum1 = _mm_add_epi32(accum1, t);

// Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channel =>
// multiply with current coefficient => accumulate the result.
// [16] a3 b3 g3 r3 a2 b2 g2 r2
src16 = _mm_unpackhi_epi8(src8, zero);
mul_hi = _mm_mulhi_epi16(src16, coeff16);
mul_lo = _mm_mullo_epi16(src16, coeff16);
// [32] a2 b2 g2 r2
t = _mm_unpacklo_epi16(mul_lo, mul_hi);
accum2 = _mm_add_epi32(accum2, t);
// [32] a3 b3 g3 r3
t = _mm_unpackhi_epi16(mul_lo, mul_hi);
accum3 = _mm_add_epi32(accum3, t);
}

// Shift right for fixed point implementation.
accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);

// Packing 32 bits |accum| to 16 bits per channel (signed saturation).
// [16] a1 b1 g1 r1 a0 b0 g0 r0
accum0 = _mm_packs_epi32(accum0, accum1);
// [16] a3 b3 g3 r3 a2 b2 g2 r2
accum2 = _mm_packs_epi32(accum2, accum3);

// Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
accum0 = _mm_packus_epi16(accum0, accum2);

if (has_alpha) {
// Compute the max(ri, gi, bi) for each pixel.
// [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
__m128i a = _mm_srli_epi32(accum0, 8);
// [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
__m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
// [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
a = _mm_srli_epi32(accum0, 16);
// [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
b = _mm_max_epu8(a, b);  // Max of r and g and b.
// [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
b = _mm_slli_epi32(b, 24);

// Make sure the value of alpha channel is always larger than maximum
// value of color channels.
accum0 = _mm_max_epu8(b, accum0);
} else {
// Set value of alpha channels to 0xFF.
__m128i mask = _mm_set1_epi32(0xff000000);
accum0 = _mm_or_si128(accum0, mask);
}

// Store the convolution result (16 bytes) and advance the pixel pointers.
_mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
out_row += 16;
}

// When the width of the output is not divisible by 4, We need to save one
// pixel (4 bytes) each time. And also the fourth pixel is always absent.
// NOTE(review): the load below still reads a full 16 bytes even though
// fewer than four pixels remain -- this presumably relies on the source
// row buffers being padded past |pixel_width|; confirm against the caller.
if (pixel_width & 3) {
accum0 = _mm_setzero_si128();
accum1 = _mm_setzero_si128();
accum2 = _mm_setzero_si128();
for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
coeff16 = _mm_set1_epi16(filter_values[filter_y]);
// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
src = reinterpret_cast<const __m128i*>(
&source_data_rows[filter_y][width<<2]);
__m128i src8 = _mm_loadu_si128(src);
// [16] a1 b1 g1 r1 a0 b0 g0 r0
__m128i src16 = _mm_unpacklo_epi8(src8, zero);
__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
__m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
// [32] a0 b0 g0 r0
__m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
accum0 = _mm_add_epi32(accum0, t);
// [32] a1 b1 g1 r1
t = _mm_unpackhi_epi16(mul_lo, mul_hi);
accum1 = _mm_add_epi32(accum1, t);
// [16] a3 b3 g3 r3 a2 b2 g2 r2
src16 = _mm_unpackhi_epi8(src8, zero);
mul_hi = _mm_mulhi_epi16(src16, coeff16);
mul_lo = _mm_mullo_epi16(src16, coeff16);
// [32] a2 b2 g2 r2
t = _mm_unpacklo_epi16(mul_lo, mul_hi);
accum2 = _mm_add_epi32(accum2, t);
}

// Shift right for fixed point implementation.
accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
// [16] a1 b1 g1 r1 a0 b0 g0 r0
accum0 = _mm_packs_epi32(accum0, accum1);
// [16] a3 b3 g3 r3 a2 b2 g2 r2
accum2 = _mm_packs_epi32(accum2, zero);
// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
accum0 = _mm_packus_epi16(accum0, accum2);
if (has_alpha) {
// Same alpha clamp as in the main loop above.
// [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
__m128i a = _mm_srli_epi32(accum0, 8);
// [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
__m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
// [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
a = _mm_srli_epi32(accum0, 16);
// [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
b = _mm_max_epu8(a, b);  // Max of r and g and b.
// [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
b = _mm_slli_epi32(b, 24);
accum0 = _mm_max_epu8(b, accum0);
} else {
__m128i mask = _mm_set1_epi32(0xff000000);
accum0 = _mm_or_si128(accum0, mask);
}

// Emit the remaining 1-3 pixels one at a time, shifting the packed
// result down 4 bytes after each store.
for (int out_x = width; out_x < pixel_width; out_x++) {
*(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
accum0 = _mm_srli_si128(accum0, 4);
out_row += 4;
}
}
}
// Non-template entry point: selects the appropriate instantiation of the
// templated vertical convolver based on |has_alpha| and forwards all other
// arguments to it unchanged.
void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
                             int filter_length,
                             unsigned char* const* source_data_rows,
                             int pixel_width,
                             unsigned char* out_row,
                             bool has_alpha) {
    // Both instantiations share the same signature, so the choice can be
    // expressed as a function pointer selected once up front.
    void (*convolve_proc)(const SkConvolutionFilter1D::ConvolutionFixed*,
                          int,
                          unsigned char* const*,
                          int,
                          unsigned char*) =
        has_alpha ? &convolveVertically_SSE2<true>
                  : &convolveVertically_SSE2<false>;
    convolve_proc(filter_values, filter_length, source_data_rows,
                  pixel_width, out_row);
}
// Appends dummy zero coefficients after the last filter so that SIMD code,
// which loads 8 or 16 bytes at a time, never reads past the end of the
// coefficient storage. We pad rather than align because the underlying
// <vector> storage's alignment is opaque. Must be called only after all
// |AddFilter| calls are complete.
void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {
    const SkConvolutionFilter1D::ConvolutionFixed kZeroTap = 0;
    int remaining = 8;
    while (remaining-- > 0) {
        filter->addFilterValue(kZeroTap);
    }
}

View File

@ -11,10 +11,27 @@
#define SkBitmapFilter_opts_sse2_DEFINED #define SkBitmapFilter_opts_sse2_DEFINED
#include "SkBitmapProcState.h" #include "SkBitmapProcState.h"
#include "SkConvolver.h"
void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y, void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y,
SkPMColor *SK_RESTRICT colors, int count); SkPMColor *SK_RESTRICT colors, int count);
void highQualityFilter_SSE2(const SkBitmapProcState &s, int x, int y, void highQualityFilter_SSE2(const SkBitmapProcState &s, int x, int y,
SkPMColor *SK_RESTRICT colors, int count); SkPMColor *SK_RESTRICT colors, int count);
void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
int filter_length,
unsigned char* const* source_data_rows,
int pixel_width,
unsigned char* out_row,
bool has_alpha);
void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
const SkConvolutionFilter1D& filter,
unsigned char* out_row[4]);
void convolveHorizontally_SSE2(const unsigned char* src_data,
const SkConvolutionFilter1D& filter,
unsigned char* out_row,
bool has_alpha);
void applySIMDPadding_SSE2(SkConvolutionFilter1D* filter);
#endif #endif

View File

@ -21,3 +21,6 @@
// empty implementation just uses default supplied function pointers // empty implementation just uses default supplied function pointers
void SkBitmapProcState::platformProcs() {} void SkBitmapProcState::platformProcs() {}
// empty implementation just uses default supplied function pointers
void SkBitmapProcState::platformScaleProc() {}

View File

@ -107,6 +107,16 @@ static bool cachedHasSSSE3() {
SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", false, "Use SSE optimized version of high quality image filters"); SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", false, "Use SSE optimized version of high quality image filters");
void SkBitmapProcState::platformConvolutionProcs() {
if (cachedHasSSE2()) {
fConvolutionProcs->fExtraHorizontalReads = 3;
fConvolutionProcs->fConvolveVertically = &convolveVertically_SSE2;
fConvolutionProcs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2;
fConvolutionProcs->fConvolveHorizontally = &convolveHorizontally_SSE2;
fConvolutionProcs->fApplySIMDPadding = &applySIMDPadding_SSE2;
}
}
void SkBitmapProcState::platformProcs() { void SkBitmapProcState::platformProcs() {
if (cachedHasSSSE3()) { if (cachedHasSSSE3()) {
#if !defined(SK_BUILD_FOR_ANDROID) #if !defined(SK_BUILD_FOR_ANDROID)
@ -151,9 +161,6 @@ void SkBitmapProcState::platformProcs() {
if (fShaderProc32 == highQualityFilter) { if (fShaderProc32 == highQualityFilter) {
fShaderProc32 = highQualityFilter_SSE2; fShaderProc32 = highQualityFilter_SSE2;
} }
if (fShaderProc32 == highQualityFilter_ScaleOnly) {
fShaderProc32 = highQualityFilter_ScaleOnly_SSE2;
}
} }
} }
} }