The image resampling code has been transplanted from Chrome; it's incredibly fast.

We've tested this CL plumbed into Chrome and done benchmarking with excellent results.

This CL can land independently of any Chrome changes; it's completely internal to Skia.

BUG=
R=reed@google.com

Review URL: https://codereview.chromium.org/19335002

git-svn-id: http://skia.googlecode.com/svn/trunk@10206 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
humper@google.com 2013-07-19 20:20:04 +00:00
parent d322cf4939
commit 138ebc3e40
15 changed files with 1689 additions and 265 deletions

View File

@ -75,7 +75,7 @@ protected:
curWidth = (int) (fBM.width() * curScale + 2);
curX += curWidth;
curScale *= 0.75f;
} while (curX < 4*fBM.width());
} while (curWidth >= 2 && curX < 4*fBM.width());
}
private:

View File

@ -32,6 +32,8 @@
'<(skia_src_path)/core/SkBitmapProcState_matrix.h',
'<(skia_src_path)/core/SkBitmapProcState_matrixProcs.cpp',
'<(skia_src_path)/core/SkBitmapProcState_sample.h',
'<(skia_src_path)/core/SkBitmapScaler.h',
'<(skia_src_path)/core/SkBitmapScaler.cpp',
'<(skia_src_path)/core/SkBitmapShader16BilerpTemplate.h',
'<(skia_src_path)/core/SkBitmapShaderTemplate.h',
'<(skia_src_path)/core/SkBitmap_scroll.cpp',
@ -56,6 +58,8 @@
'<(skia_src_path)/core/SkComposeShader.cpp',
'<(skia_src_path)/core/SkConfig8888.cpp',
'<(skia_src_path)/core/SkConfig8888.h',
'<(skia_src_path)/core/SkConvolver.cpp',
'<(skia_src_path)/core/SkConvolver.h',
'<(skia_src_path)/core/SkCordic.cpp',
'<(skia_src_path)/core/SkCordic.h',
'<(skia_src_path)/core/SkCoreBlitters.h',

View File

@ -702,19 +702,7 @@ private:
int extractMipLevel(SkBitmap* dst, SkFixed sx, SkFixed sy);
bool hasMipMap() const;
void freeMipMap();
/** Make a scaled copy of this bitmap into the provided destination.
* The caller is responsible for having set the width and height of the
* provided destination bitmap, and also having allocated its pixel
* memory.
*
* This function is temporary and for testing purposes only; it will
* likely move once it has been properly plumbed into the bitmap
* shader infrastructure.
*/
void scale(SkBitmap *dst) const;
friend struct SkBitmapProcState;
};

View File

@ -5,15 +5,23 @@
* found in the LICENSE file.
*/
#include "SkErrorInternals.h"
#include "SkConvolver.h"
#include "SkBitmapProcState.h"
#include "SkBitmap.h"
#include "SkColor.h"
#include "SkColorPriv.h"
#include "SkConvolver.h"
#include "SkUnPreMultiply.h"
#include "SkShader.h"
#include "SkRTConf.h"
#include "SkMath.h"
// These are the per-scanline callbacks that are used when we must resort to
// resampling an image as it is blitted. Typically these are used only when
// the image is rotated or has some other complex transformation applied.
// Scaled images will usually be rescaled directly before rasterization.
void highQualityFilter(const SkBitmapProcState& s, int x, int y,
SkPMColor* SK_RESTRICT colors, int count) {
@ -68,71 +76,15 @@ void highQualityFilter(const SkBitmapProcState& s, int x, int y,
}
}
void highQualityFilter_ScaleOnly(const SkBitmapProcState &s, int x, int y,
SkPMColor *SK_RESTRICT colors, int count) {
const int maxX = s.fBitmap->width() - 1;
const int maxY = s.fBitmap->height() - 1;
SK_CONF_DECLARE(const char *, c_bitmapFilter, "bitmap.filter", "mitchell", "Which scanline bitmap filter to use [mitchell, lanczos, hamming, gaussian, triangle, box]");
SkPoint srcPt;
s.fInvProc(s.fInvMatrix, SkFloatToScalar(x + 0.5f),
SkFloatToScalar(y + 0.5f), &srcPt);
srcPt.fY -= SK_ScalarHalf;
int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->width()), maxY);
int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->width()), maxY);
while (count-- > 0) {
s.fInvProc(s.fInvMatrix, SkFloatToScalar(x + 0.5f),
SkFloatToScalar(y + 0.5f), &srcPt);
srcPt.fX -= SK_ScalarHalf;
srcPt.fY -= SK_ScalarHalf;
SkScalar weight = 0;
SkScalar fr = 0, fg = 0, fb = 0, fa = 0;
int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->width()), maxX);
int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->width()), maxX);
for (int srcY = y0; srcY <= y1; srcY++) {
SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - srcY));
for (int srcX = x0; srcX <= x1 ; srcX++) {
SkScalar xWeight = s.getBitmapFilter()->lookupScalar((srcPt.fX - srcX));
SkScalar combined_weight = SkScalarMul(xWeight, yWeight);
SkPMColor c = *s.fBitmap->getAddr32(srcX, srcY);
fr += combined_weight * SkGetPackedR32(c);
fg += combined_weight * SkGetPackedG32(c);
fb += combined_weight * SkGetPackedB32(c);
fa += combined_weight * SkGetPackedA32(c);
weight += combined_weight;
}
}
fr = SkScalarDiv(fr, weight);
fg = SkScalarDiv(fg, weight);
fb = SkScalarDiv(fb, weight);
fa = SkScalarDiv(fa, weight);
int a = SkClampMax(SkScalarRoundToInt(fa), 255);
int r = SkClampMax(SkScalarRoundToInt(fr), a);
int g = SkClampMax(SkScalarRoundToInt(fg), a);
int b = SkClampMax(SkScalarRoundToInt(fb), a);
*colors++ = SkPackARGB32(a, r, g, b);
x++;
}
}
SK_CONF_DECLARE(const char *, c_bitmapFilter, "bitmap.filter", "mitchell", "Which bitmap filter to use [mitchell, sinc, gaussian, triangle, box]");
static SkBitmapFilter *allocateBitmapFilter() {
SkBitmapFilter *SkBitmapFilter::Allocate() {
if (!strcmp(c_bitmapFilter, "mitchell")) {
return SkNEW_ARGS(SkMitchellFilter,(1.f/3.f,1.f/3.f));
} else if (!strcmp(c_bitmapFilter, "sinc")) {
return SkNEW_ARGS(SkSincFilter,(3));
} else if (!strcmp(c_bitmapFilter, "lanczos")) {
return SkNEW(SkLanczosFilter);
} else if (!strcmp(c_bitmapFilter, "hamming")) {
return SkNEW(SkHammingFilter);
} else if (!strcmp(c_bitmapFilter, "gaussian")) {
return SkNEW_ARGS(SkGaussianFilter,(2));
} else if (!strcmp(c_bitmapFilter, "triangle")) {
@ -168,159 +120,12 @@ SkBitmapProcState::chooseBitmapFilterProc() {
}
if (fInvType & (SkMatrix::kAffine_Mask | SkMatrix::kScale_Mask)) {
fBitmapFilter = allocateBitmapFilter();
fBitmapFilter = SkBitmapFilter::Allocate();
}
if (fInvType & SkMatrix::kAffine_Mask) {
if (fInvType & SkMatrix::kScale_Mask) {
return highQualityFilter;
} else if (fInvType & SkMatrix::kScale_Mask) {
return highQualityFilter_ScaleOnly;
} else {
return NULL;
}
}
static void divideByWeights(SkScalar *sums, SkScalar *weights, SkBitmap *dst) {
for (int y = 0 ; y < dst->height() ; y++) {
for (int x = 0 ; x < dst->width() ; x++) {
SkScalar fr = SkScalarDiv(sums[4*(y*dst->width() + x) + 0], weights[y*dst->width() + x]);
SkScalar fg = SkScalarDiv(sums[4*(y*dst->width() + x) + 1], weights[y*dst->width() + x]);
SkScalar fb = SkScalarDiv(sums[4*(y*dst->width() + x) + 2], weights[y*dst->width() + x]);
SkScalar fa = SkScalarDiv(sums[4*(y*dst->width() + x) + 3], weights[y*dst->width() + x]);
int a = SkClampMax(SkScalarRoundToInt(fa), 255);
int r = SkClampMax(SkScalarRoundToInt(fr), a);
int g = SkClampMax(SkScalarRoundToInt(fg), a);
int b = SkClampMax(SkScalarRoundToInt(fb), a);
*dst->getAddr32(x,y) = SkPackARGB32(a, r, g, b);
}
}
}
static void upScaleHorizTranspose(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) {
for (int y = 0 ; y < dst->height() ; y++) {
for (int x = 0 ; x < dst->width() ; x++) {
float sx = (y + 0.5f) / scale - 0.5f;
int x0 = SkClampMax(sk_float_ceil2int(sx-filter->width()), src->width()-1);
int x1 = SkClampMax(sk_float_floor2int(sx+filter->width()), src->width()-1);
SkScalar totalWeight = 0;
SkScalar fr = 0, fg = 0, fb = 0, fa = 0;
for (int srcX = x0 ; srcX <= x1 ; srcX++) {
SkScalar weight = filter->lookupScalar(sx - srcX);
SkPMColor c = *src->getAddr32(srcX, x);
fr += SkScalarMul(weight,SkGetPackedR32(c));
fg += SkScalarMul(weight,SkGetPackedG32(c));
fb += SkScalarMul(weight,SkGetPackedB32(c));
fa += SkScalarMul(weight,SkGetPackedA32(c));
totalWeight += weight;
}
fr = SkScalarDiv(fr,totalWeight);
fg = SkScalarDiv(fg,totalWeight);
fb = SkScalarDiv(fb,totalWeight);
fa = SkScalarDiv(fa,totalWeight);
int a = SkClampMax(SkScalarRoundToInt(fa), 255);
int r = SkClampMax(SkScalarRoundToInt(fr), a);
int g = SkClampMax(SkScalarRoundToInt(fg), a);
int b = SkClampMax(SkScalarRoundToInt(fb), a);
*dst->getAddr32(x,y) = SkPackARGB32(a, r, g, b);
}
}
}
static void downScaleHoriz(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) {
SkScalar *sums = SkNEW_ARRAY(SkScalar, dst->width() * dst->height() * 4);
SkScalar *weights = SkNEW_ARRAY(SkScalar, dst->width() * dst->height());
SkAutoTDeleteArray<SkScalar> ada1(sums);
SkAutoTDeleteArray<SkScalar> ada2(weights);
memset(sums, 0, dst->width() * dst->height() * sizeof(SkScalar) * 4);
memset(weights, 0, dst->width() * dst->height() * sizeof(SkScalar));
for (int y = 0 ; y < src->height() ; y++) {
for (int x = 0 ; x < src->width() ; x++) {
// splat each source pixel into the destination image
float dx = (x + 0.5f) * scale - 0.5f;
int x0 = SkClampMax(sk_float_ceil2int(dx-filter->width()), dst->width()-1);
int x1 = SkClampMax(sk_float_floor2int(dx+filter->width()), dst->width()-1);
SkPMColor c = *src->getAddr32(x,y);
for (int dst_x = x0 ; dst_x <= x1 ; dst_x++) {
SkScalar weight = filter->lookup(dx - dst_x);
sums[4*(y*dst->width() + dst_x) + 0] += weight*SkGetPackedR32(c);
sums[4*(y*dst->width() + dst_x) + 1] += weight*SkGetPackedG32(c);
sums[4*(y*dst->width() + dst_x) + 2] += weight*SkGetPackedB32(c);
sums[4*(y*dst->width() + dst_x) + 3] += weight*SkGetPackedA32(c);
weights[y*dst->width() + dst_x] += weight;
}
}
}
divideByWeights(sums, weights, dst);
}
static void downScaleVert(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) {
SkScalar *sums = SkNEW_ARRAY(SkScalar, dst->width() * dst->height() * 4);
SkScalar *weights = SkNEW_ARRAY(SkScalar, dst->width() * dst->height());
SkAutoTDeleteArray<SkScalar> ada1(sums);
SkAutoTDeleteArray<SkScalar> ada2(weights);
memset(sums, 0, dst->width() * dst->height() * sizeof(SkScalar) * 4);
memset(weights, 0, dst->width() * dst->height() * sizeof(SkScalar));
for (int y = 0 ; y < src->height() ; y++) {
for (int x = 0 ; x < src->width() ; x++) {
// splat each source pixel into the destination image
float dy = (y + 0.5f) * scale - 0.5f;
int y0 = SkClampMax(sk_float_ceil2int(dy-filter->width()), dst->height()-1);
int y1 = SkClampMax(sk_float_ceil2int(dy+filter->width()), dst->height()-1);
SkPMColor c = *src->getAddr32(x,y);
for (int dst_y = y0 ; dst_y <= y1 ; dst_y++) {
SkScalar weight = filter->lookupScalar(dy - dst_y);
sums[4*(dst_y*dst->width() + x) + 0] += weight*SkGetPackedR32(c);
sums[4*(dst_y*dst->width() + x) + 1] += weight*SkGetPackedG32(c);
sums[4*(dst_y*dst->width() + x) + 2] += weight*SkGetPackedB32(c);
sums[4*(dst_y*dst->width() + x) + 3] += weight*SkGetPackedA32(c);
weights[dst_y*dst->width() + x] += weight;
}
}
}
divideByWeights(sums, weights, dst);
}
void SkBitmap::scale(SkBitmap *dst) const {
SkBitmap horizTemp;
horizTemp.setConfig(SkBitmap::kARGB_8888_Config, height(), dst->width());
horizTemp.allocPixels();
SkBitmapFilter *filter = allocateBitmapFilter();
float horizScale = float(dst->width()) / width();
if (horizScale >= 1) {
upScaleHorizTranspose(this, &horizTemp, horizScale, filter);
} else if (horizScale < 1) {
downScaleHoriz(this, &horizTemp, horizScale, filter);
}
float vertScale = float(dst->height()) / height();
if (vertScale >= 1) {
upScaleHorizTranspose(&horizTemp, dst, vertScale, filter);
} else if (vertScale < 1) {
downScaleVert(&horizTemp, dst, vertScale, filter);
}
SkDELETE(filter);
}

View File

@ -26,28 +26,30 @@ class SkBitmapFilter {
fLookupMultiplier = this->invWidth() * (SKBITMAP_FILTER_TABLE_SIZE-1);
}
SkFixed lookup( float x ) const {
SkFixed lookup(float x) const {
if (!fPrecomputed) {
precomputeTable();
}
int filter_idx = int(sk_float_abs(x * fLookupMultiplier));
SkASSERT(filter_idx < SKBITMAP_FILTER_TABLE_SIZE);
return fFilterTable[ filter_idx ];
return fFilterTable[filter_idx];
}
SkScalar lookupScalar( float x ) const {
SkScalar lookupScalar(float x) const {
if (!fPrecomputed) {
precomputeTable();
}
int filter_idx = int(sk_float_abs(x * fLookupMultiplier));
SkASSERT(filter_idx < SKBITMAP_FILTER_TABLE_SIZE);
return fFilterTableScalar[ filter_idx ];
return fFilterTableScalar[filter_idx];
}
float width() const { return fWidth; }
float invWidth() const { return fInvWidth; }
virtual float evaluate(float x) const = 0;
virtual ~SkBitmapFilter() {}
static SkBitmapFilter* Allocate();
protected:
float fWidth;
float fInvWidth;
@ -126,29 +128,47 @@ class SkBoxFilter: public SkBitmapFilter {
}
virtual float evaluate(float x) const SK_OVERRIDE {
return 1;
return (x >= -fWidth && x < fWidth) ? 1.0f : 0.0f;
}
protected:
};
class SkHammingFilter: public SkBitmapFilter {
public:
SkHammingFilter(float width=1.f)
: SkBitmapFilter(width) {
}
virtual float evaluate(float x) const SK_OVERRIDE {
if (x <= -fWidth || x >= fWidth) {
return 0.0f; // Outside of the window.
}
if (x > -FLT_EPSILON && x < FLT_EPSILON) {
return 1.0f; // Special case the sinc discontinuity at the origin.
}
const float xpi = x * static_cast<float>(M_PI);
class SkSincFilter: public SkBitmapFilter {
return ((sk_float_sin(xpi) / xpi) * // sinc(x)
(0.54f + 0.46f * sk_float_cos(xpi / fWidth))); // hamming(x)
}
};
class SkLanczosFilter: public SkBitmapFilter {
public:
SkSincFilter(float t, float width=3.f)
: SkBitmapFilter(width), tau(t) {
SkLanczosFilter(float width=3.f)
: SkBitmapFilter(width) {
}
virtual float evaluate(float x) const SK_OVERRIDE {
x = sk_float_abs(x * fInvWidth);
if (x < 1e-5f) return 1.f;
if (x > 1.f) return 0.f;
x *= SK_ScalarPI;
float sinc = sk_float_sin(x) / x;
float lanczos = sk_float_sin(x * tau) / (x * tau);
return sinc * lanczos;
}
protected:
float tau;
if (x <= -fWidth || x >= fWidth) {
return 0.0f; // Outside of the window.
}
if (x > -FLT_EPSILON && x < FLT_EPSILON) {
return 1.0f; // Special case the discontinuity at the origin.
}
float xpi = x * static_cast<float>(M_PI);
return (sk_float_sin(xpi) / xpi) * // sinc(x)
sk_float_sin(xpi / fWidth) / (xpi / fWidth); // sinc(x/fWidth)
}
};

View File

@ -11,6 +11,7 @@
#include "SkPaint.h"
#include "SkShader.h" // for tilemodes
#include "SkUtilsArm.h"
#include "SkBitmapScaler.h"
#if !SK_ARM_NEON_IS_NONE
// These are defined in src/opts/SkBitmapProcState_arm_neon.cpp
@ -99,23 +100,45 @@ void SkBitmapProcState::possiblyScaleImage() {
if (fFilterQuality != kHQ_BitmapFilter) {
return;
}
// see if our platform has any specialized convolution code.
// Set up a pointer to a local (instead of storing the structure in the
// proc state) to avoid introducing a header dependency; this makes
// recompiles a lot less painful.
SkConvolutionProcs simd;
fConvolutionProcs = &simd;
fConvolutionProcs->fExtraHorizontalReads = 0;
fConvolutionProcs->fConvolveVertically = NULL;
fConvolutionProcs->fConvolve4RowsHorizontally = NULL;
fConvolutionProcs->fConvolveHorizontally = NULL;
fConvolutionProcs->fApplySIMDPadding = NULL;
this->platformConvolutionProcs();
// STEP 1: UPSAMPLE?
// STEP 1: Highest quality direct scale?
// Check to see if the transformation matrix is scaling up, and if
// the matrix is simple, and if we're doing high quality scaling.
// If so, do the bitmap scale here and remove the scaling component from the matrix.
// Check to see if the transformation matrix is simple, and if we're
// doing high quality scaling. If so, do the bitmap scale here and
// remove the scaling component from the matrix.
if (fInvMatrix.getType() <= (SkMatrix::kScale_Mask | SkMatrix::kTranslate_Mask) &&
(fInvMatrix.getScaleX() < 1 || fInvMatrix.getScaleY() < 1) &&
if (fFilterQuality == kHQ_BitmapFilter &&
fInvMatrix.getType() <= (SkMatrix::kScale_Mask | SkMatrix::kTranslate_Mask) &&
fOrigBitmap.config() == SkBitmap::kARGB_8888_Config) {
int dest_width = SkScalarCeilToInt(fOrigBitmap.width() / fInvMatrix.getScaleX());
int dest_height = SkScalarCeilToInt(fOrigBitmap.height() / fInvMatrix.getScaleY());
// All the criteria are met; let's make a new bitmap.
fScaledBitmap.setConfig(SkBitmap::kARGB_8888_Config,
(int)(fOrigBitmap.width() / fInvMatrix.getScaleX()),
(int)(fOrigBitmap.height() / fInvMatrix.getScaleY()));
fScaledBitmap.allocPixels();
fOrigBitmap.scale(&fScaledBitmap);
fScaledBitmap = SkBitmapScaler::Resize( fOrigBitmap, SkBitmapScaler::RESIZE_BEST,
dest_width, dest_height, fConvolutionProcs );
fScaledBitmap.lockPixels();
fBitmap = &fScaledBitmap;
// set the inv matrix type to translate-only;
@ -130,9 +153,9 @@ void SkBitmapProcState::possiblyScaleImage() {
return;
}
if (!fOrigBitmap.hasMipMap()) {
if (!fOrigBitmap.hasMipMap() && fFilterQuality != kNone_BitmapFilter) {
// STEP 2: DOWNSAMPLE
// STEP 2: MIPMAP DOWNSAMPLE?
// Check to see if the transformation matrix is scaling *down*.
// If so, automatically build mipmaps.

View File

@ -31,6 +31,7 @@
#endif
class SkPaint;
class SkConvolutionProcs;
struct SkBitmapProcState {
@ -59,7 +60,7 @@ struct SkBitmapProcState {
const uint32_t[],
int count,
uint16_t colors[]);
typedef U16CPU (*FixedTileProc)(SkFixed); // returns 0..0xFFFF
typedef U16CPU (*FixedTileLowBitsProc)(SkFixed, int); // returns 0..0xF
typedef U16CPU (*IntTileProc)(int value, int count); // returns 0..count-1
@ -78,6 +79,8 @@ struct SkBitmapProcState {
IntTileProc fIntTileProcY; // chooseProcs
SkFixed fFilterOneX;
SkFixed fFilterOneY;
SkConvolutionProcs* fConvolutionProcs; // possiblyScaleImage
SkPMColor fPaintPMColor; // chooseProcs - A8 config
SkFixed fInvSx; // chooseProcs
@ -113,7 +116,12 @@ struct SkBitmapProcState {
implementation can do nothing (see SkBitmapProcState_opts_none.cpp)
*/
void platformProcs();
/** Platforms can also optionally overwrite the convolution functions
if we have SIMD versions of them.
*/
void platformConvolutionProcs();
/** Given the byte size of the index buffer to be passed to the matrix proc,
return the maximum number of resulting pixels that can be computed
@ -160,7 +168,7 @@ private:
void possiblyScaleImage();
SkBitmapFilter *fBitmapFilter;
SkBitmapFilter* fBitmapFilter;
ShaderProc32 chooseBitmapFilterProc();
@ -218,8 +226,6 @@ void ClampX_ClampY_nofilter_affine(const SkBitmapProcState& s,
void S32_D16_filter_DX(const SkBitmapProcState& s,
const uint32_t* xy, int count, uint16_t* colors);
void highQualityFilter_ScaleOnly(const SkBitmapProcState &s, int x, int y,
SkPMColor *SK_RESTRICT colors, int count);
void highQualityFilter(const SkBitmapProcState &s, int x, int y,
SkPMColor *SK_RESTRICT colors, int count);

315
src/core/SkBitmapScaler.cpp Normal file
View File

@ -0,0 +1,315 @@
#include "SkBitmapScaler.h"
#include "SkBitmapFilter.h"
#include "SkRect.h"
#include "SkTArray.h"
#include "SkErrorInternals.h"
#include "SkConvolver.h"
// SkResizeFilter ----------------------------------------------------------------
// Encapsulates computation and storage of the filters required for one complete
// resize operation.
class SkResizeFilter {
public:
    // Builds the horizontal and vertical convolution filters needed to scale
    // an image of (srcFullWidth x srcFullHeight) to (destWidth x destHeight),
    // restricted to the |destSubset| rectangle of the destination.
    //
    // |method| must be one of the algorithm-level resize methods.
    // |convolveProcs| is forwarded to computeFilters() for optional SIMD
    // padding of the generated filters.
    SkResizeFilter(SkBitmapScaler::ResizeMethod method,
                   int srcFullWidth, int srcFullHeight,
                   int destWidth, int destHeight,
                   const SkIRect& destSubset,
                   SkConvolutionProcs* convolveProcs);

    // Releases the filter kernel allocated by the constructor.
    ~SkResizeFilter() {
        SkDELETE( fBitmapFilter );
    }

    // Returns the filled filter values.
    const SkConvolutionFilter1D& xFilter() const { return fXFilter; }
    const SkConvolutionFilter1D& yFilter() const { return fYFilter; }

private:
    // This object owns fBitmapFilter through a raw pointer, so a memberwise
    // copy would delete the same kernel twice. Copying is therefore disabled
    // (declared but intentionally never defined).
    SkResizeFilter(const SkResizeFilter&);
    SkResizeFilter& operator=(const SkResizeFilter&);

    // The filter kernel (box, triangle, Mitchell, ...) evaluated when
    // building the per-pixel filter rows. Owned; deleted in the destructor.
    SkBitmapFilter* fBitmapFilter;

    // Computes one set of filters either horizontally or vertically. The
    // caller specifies a generic "lo"/"size" range rather than left/width or
    // top/height so that the same code can be re-used in each dimension.
    //
    // |srcSize| is the extent of the source image along this dimension.
    //
    // |destSubsetLo| and |destSubsetSize| give the half-open range
    // [lo, lo + size) of destination pixels to generate filters for.
    //
    // |scale| is the destination/source size ratio along this dimension.
    //
    // One filter row per destination pixel is appended to |output|.
    void computeFilters(int srcSize,
                        int destSubsetLo, int destSubsetSize,
                        float scale,
                        SkConvolutionFilter1D* output,
                        SkConvolutionProcs* convolveProcs);

    // Subset of scaled destination bitmap to compute.
    SkIRect fOutBounds;

    SkConvolutionFilter1D fXFilter;
    SkConvolutionFilter1D fYFilter;
};
SkResizeFilter::SkResizeFilter(SkBitmapScaler::ResizeMethod method,
int srcFullWidth, int srcFullHeight,
int destWidth, int destHeight,
const SkIRect& destSubset,
SkConvolutionProcs* convolveProcs)
: fOutBounds(destSubset) {
// method will only ever refer to an "algorithm method".
SkASSERT((SkBitmapScaler::RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
(method <= SkBitmapScaler::RESIZE_LAST_ALGORITHM_METHOD));
switch(method) {
case SkBitmapScaler::RESIZE_BOX:
fBitmapFilter = SkNEW(SkBoxFilter);
break;
case SkBitmapScaler::RESIZE_TRIANGLE:
fBitmapFilter = SkNEW(SkTriangleFilter);
break;
case SkBitmapScaler::RESIZE_MITCHELL:
fBitmapFilter = SkNEW_ARGS(SkMitchellFilter, (1.f/3.f, 1.f/3.f));
break;
case SkBitmapScaler::RESIZE_HAMMING:
fBitmapFilter = SkNEW(SkHammingFilter);
break;
case SkBitmapScaler::RESIZE_LANCZOS3:
fBitmapFilter = SkNEW(SkLanczosFilter);
break;
default:
// NOTREACHED:
fBitmapFilter = SkNEW_ARGS(SkMitchellFilter, (1.f/3.f, 1.f/3.f));
break;
}
float scaleX = static_cast<float>(destWidth) /
static_cast<float>(srcFullWidth);
float scaleY = static_cast<float>(destHeight) /
static_cast<float>(srcFullHeight);
this->computeFilters(srcFullWidth, destSubset.fLeft, destSubset.width(),
scaleX, &fXFilter, convolveProcs);
this->computeFilters(srcFullHeight, destSubset.fTop, destSubset.height(),
scaleY, &fYFilter, convolveProcs);
}
// TODO(egouriou): Take advantage of periods in the convolution.
// Practical resizing filters are periodic outside of the border area.
// For Lanczos, a scaling by a (reduced) factor of p/q (q pixels in the
// source become p pixels in the destination) will have a period of p.
// A nice consequence is a period of 1 when downscaling by an integral
// factor. Downscaling from typical display resolutions is also bound
// to produce interesting periods as those are chosen to have multiple
// small factors.
// Small periods reduce computational load and improve cache usage if
// the coefficients can be shared. For periods of 1 we can consider
// loading the factors only once outside the borders.
void SkResizeFilter::computeFilters(int srcSize,
int destSubsetLo, int destSubsetSize,
float scale,
SkConvolutionFilter1D* output,
SkConvolutionProcs* convolveProcs) {
int destSubsetHi = destSubsetLo + destSubsetSize; // [lo, hi)
// When we're doing a magnification, the scale will be larger than one. This
// means the destination pixels are much smaller than the source pixels, and
// that the range covered by the filter won't necessarily cover any source
// pixel boundaries. Therefore, we use these clamped values (max of 1) for
// some computations.
float clampedScale = SkTMin(1.0f, scale);
// This is how many source pixels from the center we need to count
// to support the filtering function.
float srcSupport = fBitmapFilter->width() / clampedScale;
// Speed up the divisions below by turning them into multiplies.
float invScale = 1.0f / scale;
SkTArray<float> filterValues(64);
SkTArray<short> fixedFilterValues(64);
// Loop over all pixels in the output range. We will generate one set of
// filter values for each one. Those values will tell us how to blend the
// source pixels to compute the destination pixel.
for (int destSubsetI = destSubsetLo; destSubsetI < destSubsetHi;
destSubsetI++) {
// Reset the arrays. We don't declare them inside so they can re-use the
// same malloc-ed buffer.
filterValues.reset();
fixedFilterValues.reset();
// This is the pixel in the source directly under the pixel in the dest.
// Note that we base computations on the "center" of the pixels. To see
// why, observe that the destination pixel at coordinates (0, 0) in a 5.0x
// downscale should "cover" the pixels around the pixel with *its center*
// at coordinates (2.5, 2.5) in the source, not those around (0, 0).
// Hence we need to scale coordinates (0.5, 0.5), not (0, 0).
float srcPixel = (static_cast<float>(destSubsetI) + 0.5f) * invScale;
// Compute the (inclusive) range of source pixels the filter covers.
int srcBegin = SkTMax(0, SkScalarFloorToInt(srcPixel - srcSupport));
int srcEnd = SkTMin(srcSize - 1, SkScalarCeilToInt(srcPixel + srcSupport));
// Compute the unnormalized filter value at each location of the source
// it covers.
float filterSum = 0.0f; // Sub of the filter values for normalizing.
for (int curFilterPixel = srcBegin; curFilterPixel <= srcEnd;
curFilterPixel++) {
// Distance from the center of the filter, this is the filter coordinate
// in source space. We also need to consider the center of the pixel
// when comparing distance against 'srcPixel'. In the 5x downscale
// example used above the distance from the center of the filter to
// the pixel with coordinates (2, 2) should be 0, because its center
// is at (2.5, 2.5).
float srcFilterDist =
((static_cast<float>(curFilterPixel) + 0.5f) - srcPixel);
// Since the filter really exists in dest space, map it there.
float destFilterDist = srcFilterDist * clampedScale;
// Compute the filter value at that location.
float filterValue = fBitmapFilter->evaluate(destFilterDist);
filterValues.push_back(filterValue);
filterSum += filterValue;
}
SkASSERT(!filterValues.empty());
// The filter must be normalized so that we don't affect the brightness of
// the image. Convert to normalized fixed point.
short fixedSum = 0;
for (int i = 0; i < filterValues.count(); i++) {
short curFixed = output->FloatToFixed(filterValues[i] / filterSum);
fixedSum += curFixed;
fixedFilterValues.push_back(curFixed);
}
// The conversion to fixed point will leave some rounding errors, which
// we add back in to avoid affecting the brightness of the image. We
// arbitrarily add this to the center of the filter array (this won't always
// be the center of the filter function since it could get clipped on the
// edges, but it doesn't matter enough to worry about that case).
short leftovers = output->FloatToFixed(1.0f) - fixedSum;
fixedFilterValues[fixedFilterValues.count() / 2] += leftovers;
// Now it's ready to go.
output->AddFilter(srcBegin, &fixedFilterValues[0],
static_cast<int>(fixedFilterValues.count()));
}
if (convolveProcs->fApplySIMDPadding) {
convolveProcs->fApplySIMDPadding( output );
}
}
// Translates a "quality" resize method (GOOD / BETTER / BEST) into the
// concrete software algorithm used to implement it. Algorithm-level methods
// are passed through unchanged.
static SkBitmapScaler::ResizeMethod ResizeMethodToAlgorithmMethod(
        SkBitmapScaler::ResizeMethod method) {
    const bool alreadyAlgorithm =
        method >= SkBitmapScaler::RESIZE_FIRST_ALGORITHM_METHOD &&
        method <= SkBitmapScaler::RESIZE_LAST_ALGORITHM_METHOD;
    if (alreadyAlgorithm) {
        return method;
    }

    // RESIZE_GOOD callers accept a large quality loss in exchange for speed
    // (linear resampling is acceptable to them), so a triangle filter is
    // sufficient.
    if (SkBitmapScaler::RESIZE_GOOD == method) {
        return SkBitmapScaler::RESIZE_TRIANGLE;
    }

    // RESIZE_BETTER callers trade some quality for performance but must not
    // fall back to linear resampling. Hamming was chosen by the original
    // authors as an acceptable quality/speed trade-off relative to the
    // Lanczos variants.
    if (SkBitmapScaler::RESIZE_BETTER == method) {
        return SkBitmapScaler::RESIZE_HAMMING;
    }

    // Everything else, including RESIZE_BEST, uses the Mitchell filter.
    return SkBitmapScaler::RESIZE_MITCHELL;
}
// static
// Scales |source| so that the full image would be (destWidth x destHeight)
// and returns only the |destSubset| portion of that scaled image.
//
// Returns an empty SkBitmap when:
//   - |destSubset| is not contained in the (destWidth x destHeight) image
//     (an kInvalidArgument_SkError is also recorded),
//   - the source or destination has a zero dimension,
//   - the source pixels are unavailable or not ARGB_8888, or
//   - the result pixels could not be allocated.
SkBitmap SkBitmapScaler::Resize(const SkBitmap& source,
                                ResizeMethod method,
                                int destWidth, int destHeight,
                                const SkIRect& destSubset,
                                SkConvolutionProcs* convolveProcs,
                                SkBitmap::Allocator* allocator) {
    // Ensure that the ResizeMethod enumeration is sound.
    SkASSERT(((RESIZE_FIRST_QUALITY_METHOD <= method) &&
              (method <= RESIZE_LAST_QUALITY_METHOD)) ||
             ((RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
              (method <= RESIZE_LAST_ALGORITHM_METHOD)));

    // The requested subset must lie inside the scaled image. Continuing with
    // an out-of-range subset would ask the filter builder for destination
    // pixels that map outside the source, so report the problem and bail out.
    SkIRect dest = { 0, 0, destWidth, destHeight };
    if (!dest.contains(destSubset)) {
        SkErrorInternals::SetError( kInvalidArgument_SkError,
                                    "Sorry, the destination bitmap scale subset "
                                    "falls outside the full destination bitmap." );
        return SkBitmap();
    }

    // If the size of source or destination is 0, i.e. 0x0, 0xN or Nx0, just
    // return empty.
    if (source.width() < 1 || source.height() < 1 ||
        destWidth < 1 || destHeight < 1) {
        return SkBitmap();
    }

    method = ResizeMethodToAlgorithmMethod(method);

    // Check that we deal with an "algorithm method" from this point onward.
    SkASSERT((SkBitmapScaler::RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
             (method <= SkBitmapScaler::RESIZE_LAST_ALGORITHM_METHOD));

    SkAutoLockPixels locker(source);
    if (!source.readyToDraw() || source.config() != SkBitmap::kARGB_8888_Config) {
        return SkBitmap();
    }

    SkResizeFilter filter(method, source.width(), source.height(),
                          destWidth, destHeight, destSubset, convolveProcs);

    // The convolution reads directly from the source pixel memory, using the
    // source's own row stride.
    const unsigned char* sourceSubset =
        reinterpret_cast<const unsigned char*>(source.getPixels());

    // Convolve into the result.
    SkBitmap result;
    result.setConfig(SkBitmap::kARGB_8888_Config,
                     destSubset.width(), destSubset.height());
    result.allocPixels(allocator, NULL);
    if (!result.readyToDraw()) {
        return SkBitmap();
    }

    BGRAConvolve2D(sourceSubset, static_cast<int>(source.rowBytes()),
                   !source.isOpaque(), filter.xFilter(), filter.yFilter(),
                   static_cast<int>(result.rowBytes()),
                   static_cast<unsigned char*>(result.getPixels()),
                   convolveProcs, true);

    // Preserve the "opaque" flag for use as an optimization later.
    result.setIsOpaque(source.isOpaque());

    return result;
}
// static
// Convenience overload: scales |source| to (destWidth x destHeight) and
// returns the whole scaled image by requesting the full destination rectangle
// as the subset.
SkBitmap SkBitmapScaler::Resize(const SkBitmap& source,
                                ResizeMethod method,
                                int destWidth, int destHeight,
                                SkConvolutionProcs* convolveProcs,
                                SkBitmap::Allocator* allocator) {
    SkIRect wholeDest = { 0, 0, destWidth, destHeight };
    return Resize(source, method,
                  destWidth, destHeight,
                  wholeDest,
                  convolveProcs, allocator);
}

106
src/core/SkBitmapScaler.h Normal file
View File

@ -0,0 +1,106 @@
/*
* Copyright 2013 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkBitmapScaler_DEFINED
#define SkBitmapScaler_DEFINED
#include "SkBitmap.h"
#include "SkConvolver.h"
/** \class SkBitmapScaler
Provides the interface for high quality image resampling.
*/
class SK_API SkBitmapScaler {
public:
enum ResizeMethod {
// Quality Methods
//
// These enumeration values express a desired quality/speed tradeoff.
// They are translated into an algorithm-specific method that depends
// on the capabilities (CPU, GPU) of the underlying platform.
// It is possible for all three methods to be mapped to the same
// algorithm on a given platform.
// Good quality resizing. Fastest resizing with acceptable visual quality.
// This is typically intended for use during interactive layouts
// where slower platforms may want to trade image quality for large
// increase in resizing performance.
//
// For example the resizing implementation may devolve to linear
// filtering if this enables GPU acceleration to be used.
//
// Note that the underlying resizing method may be determined
// on the fly based on the parameters for a given resize call.
// For example an implementation using a GPU-based linear filter
// in the common case may still use a higher-quality software-based
// filter in cases where using the GPU would actually be slower - due
// to too much latency - or impossible - due to image format or size
// constraints.
RESIZE_GOOD,
// Medium quality resizing. Close to high quality resizing (better
// than linear interpolation) with potentially some quality being
// traded-off for additional speed compared to RESIZE_BEST.
//
// This is intended, for example, for generation of large thumbnails
// (hundreds of pixels in each dimension) from large sources, where
// a linear filter would produce too many artifacts but where
// RESIZE_BEST might be too costly time-wise.
RESIZE_BETTER,
// High quality resizing. The algorithm is picked to favor image quality.
RESIZE_BEST,
//
// Algorithm-specific enumerations
//
// Box filter. This is a weighted average of all of the pixels touching
// the destination pixel. For enlargement, this is nearest neighbor.
//
// You probably don't want this, it is here for testing since it is easy to
// compute. Use RESIZE_LANCZOS3 instead.
RESIZE_BOX,
// Triangle filter.
RESIZE_TRIANGLE,
// Lanczos filter.
RESIZE_LANCZOS3,
// Hamming filter.
RESIZE_HAMMING,
// Mitchell filter.
RESIZE_MITCHELL,
// Enum aliases for the first and last methods in each group (quality and
// algorithm). They are used for range checks, so the enumerators above must
// stay contiguous within their group.
RESIZE_FIRST_QUALITY_METHOD = RESIZE_GOOD,
RESIZE_LAST_QUALITY_METHOD = RESIZE_BEST,
RESIZE_FIRST_ALGORITHM_METHOD = RESIZE_BOX,
RESIZE_LAST_ALGORITHM_METHOD = RESIZE_MITCHELL,
};
// Resizes the given source bitmap using the specified resize method, so that
// the entire image is (dest_width x dest_height) big. The dest_subset is the
// rectangle in this destination image that should actually be returned.
//
// The output image will be (dest_subset.width(), dest_subset.height()). This
// will save work if you do not need the entire bitmap.
//
// The destination subset must be contained within the destination image.
//
// An empty bitmap is returned if the source or destination has a zero
// dimension, or if the source is not a drawable ARGB_8888 bitmap.
//
// NOTE(review): convolveProcs defaults to NULL; confirm the implementation
// tolerates a null pointer before relying on the default.
static SkBitmap Resize(const SkBitmap& source,
ResizeMethod method,
int dest_width, int dest_height,
const SkIRect& dest_subset,
SkConvolutionProcs *convolveProcs = NULL,
SkBitmap::Allocator* allocator = NULL);
// Alternate version for resizing and returning the entire bitmap rather than
// a subset.
static SkBitmap Resize(const SkBitmap& source,
ResizeMethod method,
int dest_width, int dest_height,
SkConvolutionProcs *convolveProcs = NULL,
SkBitmap::Allocator* allocator = NULL);
};
#endif

473
src/core/SkConvolver.cpp Normal file
View File

@ -0,0 +1,473 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "SkConvolver.h"
#include "SkSize.h"
#include "SkTypes.h"
namespace {
// Saturates |a| to the range of an unsigned 8-bit channel: negative inputs
// become 0, inputs above 255 become 255, everything else passes through.
inline unsigned char ClampTo8(int a) {
    // Any bit set above the low eight (including the sign bit) means the value
    // lies outside 0-255; in-range values take a single test.
    if ((a & ~0xFF) != 0) {
        return a < 0 ? 0 : 255;
    }
    return static_cast<unsigned char>(a);
}
// Converts an accumulated fixed-point sum (image values multiplied by filter
// weights carrying kShiftBits fractional bits) back to an 8-bit channel.
// When |takeAbsolute| is set the magnitude of the shifted value is used
// before clamping.
inline unsigned char BringBackTo8(int a, bool takeAbsolute) {
    const int shifted = a >> SkConvolutionFilter1D::kShiftBits;
    return ClampTo8(takeAbsolute ? abs(shifted) : shifted);
}
// A ring of equally sized pixel rows. Rows are written by calling
// advanceRow(), which hands out the next slot and wraps around once every
// slot has been used; the buffer also tracks the y coordinate that the next
// written row corresponds to.
class CircularRowBuffer {
public:
    // Each row holds |destRowPixelWidth| pixels of 4 bytes. |maxYFilterSize|
    // is the number of row slots (enough for the tallest filter that will be
    // applied). |firstInputRow| is the y coordinate of the first row that
    // advanceRow() will hand out; later rows count up from it.
    CircularRowBuffer(int destRowPixelWidth, int maxYFilterSize,
                      int firstInputRow)
        : fRowByteWidth(destRowPixelWidth * 4),
          fNumRows(maxYFilterSize),
          fNextRow(0),
          fNextRowCoordinate(firstInputRow) {
        fBuffer.reset(fRowByteWidth * maxYFilterSize);
        fRowAddresses.reset(fNumRows);
    }

    // Claims the next slot and returns a pointer to its first byte.
    unsigned char* advanceRow() {
        unsigned char* const slotStart = &fBuffer[fNextRow * fRowByteWidth];
        ++fNextRowCoordinate;
        // Step to the following slot, going back to slot 0 after the last one.
        if (++fNextRow == fNumRows) {
            fNextRow = 0;
        }
        return slotStart;
    }

    // Returns the slots as a flat array ordered by ascending y coordinate and
    // stores the coordinate of the first entry in |*firstRowIndex|. The array
    // always has one entry per slot.
    //
    // |*firstRowIndex| may come out negative while fewer rows than slots have
    // been written; callers index relative to it and never touch those
    // not-yet-written entries.
    unsigned char* const* GetRowAddresses(int* firstRowIndex) {
        // Example with 4 slots holding coordinates 6-9:
        //   slot 0 -> 8, slot 1 -> 9, slot 2 -> 6, slot 3 -> 7
        // with fNextRow == 2 and fNextRowCoordinate == 10. The slot about to be
        // overwritten is therefore also the one with the lowest coordinate.
        *firstRowIndex = fNextRowCoordinate - fNumRows;

        int slot = fNextRow;
        for (int i = 0; i < fNumRows; ++i) {
            fRowAddresses[i] = &fBuffer[slot * fRowByteWidth];
            // Move on to the next slot, wrapping at the end of the buffer.
            slot = (slot + 1 == fNumRows) ? 0 : slot + 1;
        }
        return &fRowAddresses[0];
    }

private:
    // Backing storage: the row slots packed back to back, fRowByteWidth each.
    SkTArray<unsigned char> fBuffer;

    // Size of one row slot in bytes.
    int fRowByteWidth;

    // Number of row slots in the buffer.
    int fNumRows;

    // Slot that the next advanceRow() call will hand out; wraps around.
    int fNextRow;

    // The y coordinate belonging to |fNextRow|. Grows by one per written row
    // and never wraps.
    int fNextRowCoordinate;

    // Scratch array filled and returned by GetRowAddresses().
    SkTArray<unsigned char*> fRowAddresses;
};
// Convolves horizontally along a single row. The row data is given in
// |srcData| and continues for the numValues() of the filter.
template<bool hasAlpha>
void ConvolveHorizontally(const unsigned char* srcData,
const SkConvolutionFilter1D& filter,
unsigned char* outRow) {
// Loop over each pixel on this row in the output image.
int numValues = filter.numValues();
for (int outX = 0; outX < numValues; outX++) {
// Get the filter that determines the current output pixel.
int filterOffset, filterLength;
const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
filter.FilterForValue(outX, &filterOffset, &filterLength);
// Compute the first pixel in this row that the filter affects. It will
// touch |filterLength| pixels (4 bytes each) after this.
const unsigned char* rowToFilter = &srcData[filterOffset * 4];
// Apply the filter to the row to get the destination pixel in |accum|.
int accum[4] = {0};
for (int filterX = 0; filterX < filterLength; filterX++) {
SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterX];
accum[0] += curFilter * rowToFilter[filterX * 4 + 0];
accum[1] += curFilter * rowToFilter[filterX * 4 + 1];
accum[2] += curFilter * rowToFilter[filterX * 4 + 2];
if (hasAlpha) {
accum[3] += curFilter * rowToFilter[filterX * 4 + 3];
}
}
// Bring this value back in range. All of the filter scaling factors
// are in fixed point with kShiftBits bits of fractional part.
accum[0] >>= SkConvolutionFilter1D::kShiftBits;
accum[1] >>= SkConvolutionFilter1D::kShiftBits;
accum[2] >>= SkConvolutionFilter1D::kShiftBits;
if (hasAlpha) {
accum[3] >>= SkConvolutionFilter1D::kShiftBits;
}
// Store the new pixel.
outRow[outX * 4 + 0] = ClampTo8(accum[0]);
outRow[outX * 4 + 1] = ClampTo8(accum[1]);
outRow[outX * 4 + 2] = ClampTo8(accum[2]);
if (hasAlpha) {
outRow[outX * 4 + 3] = ClampTo8(accum[3]);
}
}
}
// Vertical pass for one output row. |filterValues| holds |filterLength|
// weights; weight i is applied to row i of |sourceDataRows|, and every row is
// |pixelWidth| pixels wide.
//
// |outRow| must have room for |pixelWidth * 4| bytes. With |hasAlpha| false
// the alpha byte of every output pixel is written as 0xff.
template<bool hasAlpha>
void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
                        int filterLength,
                        unsigned char* const* sourceDataRows,
                        int pixelWidth,
                        unsigned char* outRow) {
    // Number of channels that take part in the weighted sum.
    const int channelCount = hasAlpha ? 4 : 3;

    // One output pixel per column.
    for (int column = 0; column < pixelWidth; ++column) {
        // Byte position of this column inside every row (4 bytes per pixel).
        const int byteOffset = column * 4;

        // Weighted sum down the column, per channel.
        int sums[4] = {0};
        for (int rowIdx = 0; rowIdx < filterLength; ++rowIdx) {
            const int weight = filterValues[rowIdx];
            const unsigned char* srcPixel = &sourceDataRows[rowIdx][byteOffset];
            for (int c = 0; c < channelCount; ++c) {
                sums[c] += weight * srcPixel[c];
            }
        }

        // The weights carry kShiftBits fractional bits; shift the sums back
        // into range and saturate the colour channels to 8 bits.
        unsigned char* dstPixel = &outRow[byteOffset];
        for (int c = 0; c < 3; ++c) {
            dstPixel[c] = ClampTo8(sums[c] >> SkConvolutionFilter1D::kShiftBits);
        }

        if (!hasAlpha) {
            // Opaque source: alpha is simply full.
            dstPixel[3] = 0xff;
            continue;
        }

        // Pixels are premultiplied, so alpha must never be below a colour
        // channel. Rounding can occasionally violate that, and such colours
        // overflow when the bitmap is later drawn, so raise alpha to the
        // largest colour channel when needed. This is only required here,
        // where the final output row is produced.
        const unsigned char alpha =
            ClampTo8(sums[3] >> SkConvolutionFilter1D::kShiftBits);
        const int largestColor = SkTMax(dstPixel[0],
                                        SkTMax(dstPixel[1], dstPixel[2]));
        dstPixel[3] = (alpha < largestColor) ? largestColor : alpha;
    }
}
// Runtime front end for the vertical pass: forwards to the template
// instantiation that matches |sourceHasAlpha|.
void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
                        int filterLength,
                        unsigned char* const* sourceDataRows,
                        int pixelWidth,
                        unsigned char* outRow,
                        bool sourceHasAlpha) {
    if (!sourceHasAlpha) {
        ConvolveVertically<false>(filterValues, filterLength, sourceDataRows,
                                  pixelWidth, outRow);
        return;
    }
    ConvolveVertically<true>(filterValues, filterLength, sourceDataRows,
                             pixelWidth, outRow);
}
} // namespace
// SkConvolutionFilter1D ---------------------------------------------------------
// Creates a filter list with no filters in it. fMaxFilter starts at zero
// because no filter length has been recorded yet; AddFilter() raises it.
SkConvolutionFilter1D::SkConvolutionFilter1D()
: fMaxFilter(0) {
}
// Intentionally empty: the destructor body performs no work of its own.
SkConvolutionFilter1D::~SkConvolutionFilter1D() {
}
// Floating point convenience overload: converts the |filterLength| weights in
// |filterValues| to ConvolutionFixed with FloatToFixed() and forwards them,
// together with |filterOffset|, to the fixed-point AddFilter() overload.
// |filterLength| must be > 0.
void SkConvolutionFilter1D::AddFilter(int filterOffset,
                                      const float* filterValues,
                                      int filterLength) {
    SkASSERT(filterLength > 0);

    // reset(n) leaves the array holding |filterLength| elements, so the
    // converted weights are stored into those elements by index. Appending
    // them with push_back() instead would place them after the existing
    // elements, and the call below - which passes the first |filterLength|
    // entries - would forward values that were never assigned.
    SkTArray<ConvolutionFixed> fixedValues;
    fixedValues.reset(filterLength);
    for (int i = 0; i < filterLength; ++i) {
        fixedValues[i] = FloatToFixed(filterValues[i]);
    }

    AddFilter(filterOffset, &fixedValues[0], filterLength);
}
// Appends the filter for the next output pixel. |filterValues| holds
// |filterLength| fixed-point weights that apply to the source pixels starting
// at |filterOffset|.
//
// Leading and trailing zero weights are common, so only the span between the
// first and last non-zero weight is stored and the recorded offset is moved
// forward accordingly. (For a scaling to 1/4th in each dimension using a
// Lanczos-2 filter on a 1080p image this gives a ~10% speed improvement.)
void SkConvolutionFilter1D::AddFilter(int filterOffset,
                                      const ConvolutionFixed* filterValues,
                                      int filterLength) {
    const int specifiedLength = filterLength;

    // Skip over the leading zero weights.
    int begin = 0;
    while (begin < specifiedLength && 0 == filterValues[begin]) {
        ++begin;
    }

    // Stays zero when every weight was zero.
    int trimmedLength = 0;
    if (begin < specifiedLength) {
        // At least one weight is non-zero; walk back from the end to find the
        // position just past the last non-zero weight.
        int end = specifiedLength;
        while (end > begin && 0 == filterValues[end - 1]) {
            --end;
        }

        trimmedLength = end - begin;
        SkASSERT(trimmedLength > 0);
        filterOffset += begin;
        for (int i = begin; i < end; ++i) {
            fFilterValues.push_back(filterValues[i]);
        }
    }

    FilterInstance instance;
    // The |trimmedLength| weights just appended sit at the tail of
    // fFilterValues.
    instance.fDataLocation =
        static_cast<int>(fFilterValues.count()) - trimmedLength;
    instance.fOffset = filterOffset;
    instance.fTrimmedLength = trimmedLength;
    instance.fLength = specifiedLength;
    fFilters.push_back(instance);

    fMaxFilter = SkTMax(fMaxFilter, trimmedLength);
}
// Reports the first filter in the list. The offset, the trimmed length and
// the length originally passed to AddFilter() are written to the three out
// arguments. Returns a pointer to the stored weights, or NULL when the filter
// was trimmed down to nothing.
const SkConvolutionFilter1D::ConvolutionFixed* SkConvolutionFilter1D::GetSingleFilter(
        int* specifiedFilterlength,
        int* filterOffset,
        int* filterLength) const {
    const FilterInstance& first = fFilters[0];

    *filterOffset = first.fOffset;
    *filterLength = first.fTrimmedLength;
    *specifiedFilterlength = first.fLength;

    if (0 != first.fTrimmedLength) {
        return &fFilterValues[first.fDataLocation];
    }
    return NULL;
}
// Resamples a 4-byte-per-pixel image with a pair of separable filters. Each
// needed source row is convolved horizontally with |filterX| into a circular
// row buffer; each output row is then produced by convolving the buffered
// rows vertically with |filterY| and written to |output|.
//
// |convolveProcs| is dereferenced unconditionally and must not be NULL. Each
// of its function pointers may be NULL, in which case the portable routines
// defined in this file are used instead. |useSimdIfPossible| is not read by
// this function.
//
// NOTE(review): both filters are queried at index 0 / numValues() - 1 without
// checking that they contain any entries - confirm callers never pass an
// empty filter.
void BGRAConvolve2D(const unsigned char* sourceData,
int sourceByteRowStride,
bool sourceHasAlpha,
const SkConvolutionFilter1D& filterX,
const SkConvolutionFilter1D& filterY,
int outputByteRowStride,
unsigned char* output,
SkConvolutionProcs* convolveProcs,
bool useSimdIfPossible) {
int maxYFilterSize = filterY.maxFilter();
// The next row in the input that we will generate a horizontally
// convolved row for. If the filter doesn't start at the beginning of the
// image (this is the case when we are only resizing a subset), then we
// don't want to generate any output rows before that. Compute the starting
// row for convolution as the first pixel for the first vertical filter.
int filterOffset, filterLength;
const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
filterY.FilterForValue(0, &filterOffset, &filterLength);
int nextXRow = filterOffset;
// We loop over each row in the input doing a horizontal convolution. This
// will result in a horizontally convolved image. We write the results into
// a circular buffer of convolved rows and do vertical convolution as rows
// are available. This prevents us from having to store the entire
// intermediate image and helps cache coherency.
// We will need four extra rows to allow horizontal convolution could be done
// simultaneously. The row width is rounded up to a multiple of 16 pixels
// here (CircularRowBuffer multiplies it by 4 bytes per pixel).
// TODO(jiesun): We do not use aligned load from row buffer in vertical
// convolution pass yet. Somehow Windows does not like it.
int rowBufferWidth = (filterX.numValues() + 15) & ~0xF;
int rowBufferHeight = maxYFilterSize +
(convolveProcs->fConvolve4RowsHorizontally ? 4 : 0);
CircularRowBuffer rowBuffer(rowBufferWidth,
rowBufferHeight,
filterOffset);
// Loop over every possible output row, processing just enough horizontal
// convolutions to run each subsequent vertical convolution.
SkASSERT(outputByteRowStride >= filterX.numValues() * 4);
int numOutputRows = filterY.numValues();
// We need to check which is the last line to convolve before we advance 4
// lines in one iteration.
int lastFilterOffset, lastFilterLength;
// SSE2 can access up to 3 extra pixels past the end of the
// buffer. At the bottom of the image, we have to be careful
// not to access data past the end of the buffer. Normally
// we fall back to the C++ implementation for the last row.
// If the last row is less than 3 pixels wide, we may have to fall
// back to the C++ version for more rows. Compute how many
// rows we need to avoid the SSE implementation for here.
filterX.FilterForValue(filterX.numValues() - 1, &lastFilterOffset,
&lastFilterLength);
// NOTE(review): the divisor is lastFilterOffset + lastFilterLength of the
// final horizontal filter; it would be zero if that filter was trimmed to
// nothing at offset 0 - confirm callers cannot produce that.
int avoidSimdRows = 1 + convolveProcs->fExtraHorizontalReads /
(lastFilterOffset + lastFilterLength);
// From here on lastFilterOffset/lastFilterLength describe the final
// vertical filter, i.e. the last source rows that will be needed.
filterY.FilterForValue(numOutputRows - 1, &lastFilterOffset,
&lastFilterLength);
for (int outY = 0; outY < numOutputRows; outY++) {
filterValues = filterY.FilterForValue(outY,
&filterOffset, &filterLength);
// Generate output rows until we have enough to run the current filter.
while (nextXRow < filterOffset + filterLength) {
// Take the four-rows-at-once routine only when it is provided and all four
// rows lie before the trailing rows reserved for the portable code.
if (convolveProcs->fConvolve4RowsHorizontally &&
nextXRow + 3 < lastFilterOffset + lastFilterLength -
avoidSimdRows) {
const unsigned char* src[4];
unsigned char* outRow[4];
for (int i = 0; i < 4; ++i) {
src[i] = &sourceData[(nextXRow + i) * sourceByteRowStride];
outRow[i] = rowBuffer.advanceRow();
}
convolveProcs->fConvolve4RowsHorizontally(src, filterX, outRow);
nextXRow += 4;
} else {
// Check if we need to avoid SSE2 for this row.
if (convolveProcs->fConvolveHorizontally &&
nextXRow < lastFilterOffset + lastFilterLength -
avoidSimdRows) {
convolveProcs->fConvolveHorizontally(
&sourceData[nextXRow * sourceByteRowStride],
filterX, rowBuffer.advanceRow(), sourceHasAlpha);
} else {
// Portable single-row path defined earlier in this file.
if (sourceHasAlpha) {
ConvolveHorizontally<true>(
&sourceData[nextXRow * sourceByteRowStride],
filterX, rowBuffer.advanceRow());
} else {
ConvolveHorizontally<false>(
&sourceData[nextXRow * sourceByteRowStride],
filterX, rowBuffer.advanceRow());
}
}
nextXRow++;
}
}
// Compute where in the output image this row of final data will go.
unsigned char* curOutputRow = &output[outY * outputByteRowStride];
// Get the list of rows that the circular buffer has, in order.
int firstRowInCircularBuffer;
unsigned char* const* rowsToConvolve =
rowBuffer.GetRowAddresses(&firstRowInCircularBuffer);
// Now compute the start of the subset of those rows that the filter
// needs.
unsigned char* const* firstRowForFilter =
&rowsToConvolve[filterOffset - firstRowInCircularBuffer];
// Use the supplied vertical routine when there is one, otherwise the
// portable ConvolveVertically() from this file.
if (convolveProcs->fConvolveVertically) {
convolveProcs->fConvolveVertically(filterValues, filterLength,
firstRowForFilter,
filterX.numValues(), curOutputRow,
sourceHasAlpha);
} else {
ConvolveVertically(filterValues, filterLength,
firstRowForFilter,
filterX.numValues(), curOutputRow,
sourceHasAlpha);
}
}
}

203
src/core/SkConvolver.h Normal file
View File

@ -0,0 +1,203 @@
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SK_CONVOLVER_H
#define SK_CONVOLVER_H
#include "SkSize.h"
#include "SkTypes.h"
#include "SkTArray.h"
// Avoid confusion with Mac OS X's math library (Carbon), whose fixed-point
// header provides macros called FloatToFixed and FixedToFloat. Those are the
// names SkConvolutionFilter1D uses for its static conversion helpers below,
// so they are the ones that have to be undefined for the guard to have any
// effect. The ConvolutionFixed-flavoured names are kept for compatibility
// with the previous revision of this block.
#if defined(__APPLE__)
#undef FloatToFixed
#undef FixedToFloat
#undef FloatToConvolutionFixed
#undef ConvolutionFixedToFloat
#endif
// Represents a filter in one dimension. Each output pixel has one entry in this
// object for the filter values contributing to it. You build up the filter
// list by calling AddFilter for each output pixel (in order).
//
// We do 2-dimensional convolution by first convolving each row by one
// SkConvolutionFilter1D, then convolving each column by another one.
//
// Entries are stored as ConvolutionFixed values, i.e. fixed point numbers
// with kShiftBits fractional bits.
class SkConvolutionFilter1D {
public:
// Storage type of a single filter weight.
typedef short ConvolutionFixed;
// The number of bits that ConvolutionFixed point values are shifted by.
enum { kShiftBits = 14 };
SK_API SkConvolutionFilter1D();
SK_API ~SkConvolutionFilter1D();
// Convert between floating point and our ConvolutionFixed point representation.
// The float is scaled by 2^kShiftBits and then cast, so the fractional part
// of the scaled value is discarded.
// NOTE(review): no range check is performed; the scaled value has to fit in
// ConvolutionFixed - confirm callers only pass suitably small weights.
static ConvolutionFixed FloatToFixed(float f) {
return static_cast<ConvolutionFixed>(f * (1 << kShiftBits));
}
// Drops the kShiftBits fractional bits of |x| and casts what is left to an
// unsigned char.
static unsigned char FixedToChar(ConvolutionFixed x) {
return static_cast<unsigned char>(x >> kShiftBits);
}
// Inverse of FloatToFixed(): scales |x| by 2^-kShiftBits.
static float FixedToFloat(ConvolutionFixed x) {
// The cast relies on ConvolutionFixed being a short, implying that on
// the platforms we care about all (16) bits will fit into
// the mantissa of a (32-bit) float.
SK_COMPILE_ASSERT(sizeof(ConvolutionFixed) == 2, ConvolutionFixed_type_should_fit_in_float_mantissa);
float raw = static_cast<float>(x);
return ldexpf(raw, -kShiftBits);
}
// Returns the largest number of stored (zero-trimmed) values of any filter
// added so far.
int maxFilter() const { return fMaxFilter; }
// Returns the number of filters in this filter. This is the dimension of the
// output image.
int numValues() const { return static_cast<int>(fFilters.count()); }
// Appends the given list of scaling values for generating a given output
// pixel. |filterOffset| is the distance from the edge of the image to where
// the scaling factors start. The scaling factors apply to the source pixels
// starting from this position, and going for the next |filterLength| pixels.
//
// You will probably want to make sure your input is normalized (that is,
// all entries in |filterValues| sum to one) to prevent affecting the overall
// brightness of the image.
//
// The filterLength must be > 0.
//
// This version will automatically convert your input to ConvolutionFixed point.
SK_API void AddFilter(int filterOffset,
const float* filterValues,
int filterLength);
// Same as the above version, but the input is already ConvolutionFixed point.
void AddFilter(int filterOffset,
const ConvolutionFixed* filterValues,
int filterLength);
// Retrieves a filter for the given |valueOffset|, a position in the output
// image in the direction we're convolving. The offset and length of the
// filter values are put into the corresponding out arguments (see AddFilter
// above for what these mean), and a pointer to the first scaling factor is
// returned. There will be |filterLength| values in this array.
// Returns NULL when the stored length of that filter is zero.
inline const ConvolutionFixed* FilterForValue(int valueOffset,
int* filterOffset,
int* filterLength) const {
const FilterInstance& filter = fFilters[valueOffset];
*filterOffset = filter.fOffset;
*filterLength = filter.fTrimmedLength;
if (filter.fTrimmedLength == 0) {
return NULL;
}
return &fFilterValues[filter.fDataLocation];
}
// Retrieves the filter for the offset 0, presumed to be the one and only.
// The offset and length of the filter values are put into the corresponding
// out arguments (see AddFilter). Note that |filterLength| and
// |specifiedFilterLength| may be different if leading/trailing zeros of the
// original floating point form were clipped.
// There will be |filterLength| values in the return array.
// Returns NULL if the filter is 0-length (for instance when all floating
// point values passed to AddFilter were clipped to 0).
SK_API const ConvolutionFixed* GetSingleFilter(int* specifiedFilterLength,
int* filterOffset,
int* filterLength) const;
// Add another value to the fFilterValues array -- useful for
// SIMD padding which happens outside of this class.
// The value is appended to the flat storage only; no filter entry is created
// or changed.
void addFilterValue( ConvolutionFixed val ) {
fFilterValues.push_back( val );
}
private:
// Bookkeeping for one filter added through AddFilter().
struct FilterInstance {
// Offset within fFilterValues for this instance of the filter.
int fDataLocation;
// Position, IN PIXELS, of the first source pixel that the stored values
// apply to. This is the offset given to AddFilter() advanced by the number
// of leading zero values that were dropped.
int fOffset;
// Number of values stored for this filter instance.
int fTrimmedLength;
// Filter length as specified. Note that this may be different from
// fTrimmedLength if leading/trailing zeros of the original floating
// point form were clipped differently on each tail.
int fLength;
};
// Stores the information for each filter added to this class.
SkTArray<FilterInstance> fFilters;
// We store all the filter values in this flat list, indexed by
// |FilterInstance.fDataLocation| to avoid the mallocs required for storing
// each one separately.
SkTArray<ConvolutionFixed> fFilterValues;
// The maximum size of any filter we've added.
int fMaxFilter;
};
// Signatures of the replaceable convolution routines collected in
// SkConvolutionProcs below.

// Vertical pass: applies |filterLength| weights from |filterValues| to the
// rows in |sourceDataRows| and writes |pixelWidth| pixels to |outRow|.
typedef void (*SkConvolveVertically_pointer)(
const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
int filterLength,
unsigned char* const* sourceDataRows,
int pixelWidth,
unsigned char* outRow,
bool hasAlpha);
// Horizontal pass over four source rows at once, one output row per source
// row. Note that this variant takes no alpha flag.
typedef void (*SkConvolve4RowsHorizontally_pointer)(
const unsigned char* srcData[4],
const SkConvolutionFilter1D& filter,
unsigned char* outRow[4]);
// Horizontal pass over a single source row.
typedef void (*SkConvolveHorizontally_pointer)(
const unsigned char* srcData,
const SkConvolutionFilter1D& filter,
unsigned char* outRow,
bool hasAlpha);
// Receives a filter it is allowed to modify.
// NOTE(review): presumably appends padding values through
// SkConvolutionFilter1D::addFilterValue() so SIMD loads may read past the
// last weight - confirm against the implementations.
typedef void (*SkConvolveFilterPadding_pointer)(
SkConvolutionFilter1D* filter);
// Bundle of replaceable convolution routines handed to BGRAConvolve2D
// (declared below).
// NOTE(review): the definition of BGRAConvolve2D is expected to treat a NULL
// function pointer as "use the portable routine" - confirm in SkConvolver.cpp.
struct SkConvolutionProcs {
// This is how many extra pixels may be read by the
// convolve*horizontally functions.
int fExtraHorizontalReads;
// Vertical pass producing one output row.
SkConvolveVertically_pointer fConvolveVertically;
// Horizontal pass handling four source rows per call.
SkConvolve4RowsHorizontally_pointer fConvolve4RowsHorizontally;
// Horizontal pass handling one source row per call.
SkConvolveHorizontally_pointer fConvolveHorizontally;
// Hook that receives a filter to pad for SIMD use.
SkConvolveFilterPadding_pointer fApplySIMDPadding;
};
// Does a two-dimensional convolution on the given source image.
//
// It is assumed the source pixel offsets referenced in the input filters
// reference only valid pixels, so the source image size is not required. Each
// row of the source image starts |sourceByteRowStride| after the previous
// one (this allows you to have rows with some padding at the end).
//
// The result will be put into the given output buffer. The destination image
// size will be xfilter.numValues() * yfilter.numValues() pixels. Each output
// row holds xfilter.numValues() * 4 bytes of pixel data, and consecutive
// output rows start |outputByteRowStride| bytes apart.
// NOTE(review): |outputByteRowStride| is presumably required to be at least
// xfilter.numValues() * 4 - confirm against the definition.
//
// |sourceHasAlpha| is a hint that allows us to avoid doing computations on
// the alpha channel if the image is opaque. If you don't know, set this to
// true and it will work properly, but setting this to false will be a few
// percent faster if you know the image is opaque.
//
// |convolveProcs| supplies optional replacement routines for the individual
// convolution passes (see SkConvolutionProcs above).
// NOTE(review): whether |convolveProcs| may be NULL and how
// |useSimdIfPossible| is honoured are decided by the definition - confirm
// there before relying on either.
//
// The layout in memory is assumed to be 4-bytes per pixel in B-G-R-A order
// (this is ARGB when loaded into 32-bit words on a little-endian machine).
SK_API void BGRAConvolve2D(const unsigned char* sourceData,
int sourceByteRowStride,
bool sourceHasAlpha,
const SkConvolutionFilter1D& xfilter,
const SkConvolutionFilter1D& yfilter,
int outputByteRowStride,
unsigned char* output,
SkConvolutionProcs* convolveProcs,
bool useSimdIfPossible);
#endif // SK_CONVOLVER_H

View File

@ -11,6 +11,7 @@
#include "SkColorPriv.h"
#include "SkUnPreMultiply.h"
#include "SkShader.h"
#include "SkConvolver.h"
#include "SkBitmapFilter_opts_SSE2.h"
@ -180,3 +181,456 @@ void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y,
}
}
// Convolves horizontally along a single row. The row data is given in
// |src_data| and continues for the num_values() of the filter.
//
// One 4-byte output pixel is written to |out_row| per filter entry. The
// alpha flag parameter is unnamed and ignored: all four channels are always
// computed.
void convolveHorizontally_SSE2(const unsigned char* src_data,
const SkConvolutionFilter1D& filter,
unsigned char* out_row,
bool /*has_alpha*/) {
int num_values = filter.numValues();
int filter_offset, filter_length;
__m128i zero = _mm_setzero_si128();
__m128i mask[4];
// |mask| will be used to decimate all extra filter coefficients that are
// loaded by SIMD when |filter_length| is not divisible by 4.
// mask[r] keeps the lowest r 16-bit lanes and clears the rest.
// mask[0] is not used in following algorithm.
mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
// Output one pixel each iteration, calculating all channels (RGBA) together.
for (int out_x = 0; out_x < num_values; out_x++) {
const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
filter.FilterForValue(out_x, &filter_offset, &filter_length);
__m128i accum = _mm_setzero_si128();
// Compute the first pixel in this row that the filter affects. It will
// touch |filter_length| pixels (4 bytes each) after this.
const __m128i* row_to_filter =
reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
// We will load and accumulate with four coefficients per iteration.
for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
// Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
__m128i coeff, coeff16;
// [16] xx xx xx xx c3 c2 c1 c0
coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
// [16] xx xx xx xx c1 c1 c0 c0
coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
// [16] c1 c1 c1 c1 c0 c0 c0 c0
coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
// Load four pixels => unpack the first two pixels to 16 bits =>
// multiply with coefficients => accumulate the convolution result.
// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
__m128i src8 = _mm_loadu_si128(row_to_filter);
// [16] a1 b1 g1 r1 a0 b0 g0 r0
__m128i src16 = _mm_unpacklo_epi8(src8, zero);
// The high and low halves of each 16x16 product are computed separately
// and interleaved below to form full 32-bit products.
__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
__m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
// [32] a0*c0 b0*c0 g0*c0 r0*c0
__m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
accum = _mm_add_epi32(accum, t);
// [32] a1*c1 b1*c1 g1*c1 r1*c1
t = _mm_unpackhi_epi16(mul_lo, mul_hi);
accum = _mm_add_epi32(accum, t);
// Duplicate 3rd and 4th coefficients for all channels =>
// unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
// => accumulate the convolution results.
// [16] xx xx xx xx c3 c3 c2 c2
coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
// [16] c3 c3 c3 c3 c2 c2 c2 c2
coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
// [16] a3 b3 g3 r3 a2 b2 g2 r2
src16 = _mm_unpackhi_epi8(src8, zero);
mul_hi = _mm_mulhi_epi16(src16, coeff16);
mul_lo = _mm_mullo_epi16(src16, coeff16);
// [32] a2*c2 b2*c2 g2*c2 r2*c2
t = _mm_unpacklo_epi16(mul_lo, mul_hi);
accum = _mm_add_epi32(accum, t);
// [32] a3*c3 b3*c3 g3*c3 r3*c3
t = _mm_unpackhi_epi16(mul_lo, mul_hi);
accum = _mm_add_epi32(accum, t);
// Advance the pixel and coefficients pointers.
row_to_filter += 1;
filter_values += 4;
}
// When |filter_length| is not divisible by 4, we need to decimate some of
// the filter coefficient that was loaded incorrectly to zero; Other than
// that the algorithm is same with above, except that the 4th pixel will be
// always absent.
int r = filter_length&3;
if (r) {
// Note: filter_values must be padded to align_up(filter_offset, 8).
// The 64-bit load below always fetches four coefficients even though only
// |r| of them belong to this filter.
__m128i coeff, coeff16;
coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
// Mask out extra filter taps.
coeff = _mm_and_si128(coeff, mask[r]);
coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
// Note: line buffer must be padded to align_up(filter_offset, 16).
// We resolve this by use C-version for the last horizontal line.
// The 128-bit load below always fetches four pixels even though only |r|
// of them are covered by the filter.
__m128i src8 = _mm_loadu_si128(row_to_filter);
__m128i src16 = _mm_unpacklo_epi8(src8, zero);
__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
__m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
__m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
accum = _mm_add_epi32(accum, t);
t = _mm_unpackhi_epi16(mul_lo, mul_hi);
accum = _mm_add_epi32(accum, t);
// Only the third pixel is accumulated from the upper half; a fourth pixel
// can never be part of the remainder.
src16 = _mm_unpackhi_epi8(src8, zero);
coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
mul_hi = _mm_mulhi_epi16(src16, coeff16);
mul_lo = _mm_mullo_epi16(src16, coeff16);
t = _mm_unpacklo_epi16(mul_lo, mul_hi);
accum = _mm_add_epi32(accum, t);
}
// Shift right for fixed point implementation.
accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
// Packing 32 bits |accum| to 16 bits per channel (signed saturation).
accum = _mm_packs_epi32(accum, zero);
// Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
accum = _mm_packus_epi16(accum, zero);
// Store the pixel value of 32 bits.
*(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
out_row += 4;
}
}
// Accumulates up to four filter taps of one source row into |*accum|.
//
// Loads 16 bytes (four RGBA pixels) from |src| with an unaligned load, widens
// every channel to 16 bits, multiplies pixels 0 and 1 by |coeff16lo|
// ([16] c1 c1 c1 c1 c0 c0 c0 c0) and pixels 2 and 3 by |coeff16hi|
// ([16] c3 c3 c3 c3 c2 c2 c2 c2), and adds the resulting 32-bit per-channel
// products to |*accum|.
//
// This is a function rather than a function-local macro so that nothing leaks
// into the rest of the translation unit and every input is explicit. The SIMD
// operands are passed by reference/pointer rather than by value so the
// signature does not depend on how a given ABI passes 16-byte-aligned
// arguments.
static inline void accumulate4Taps_SSE2(const unsigned char* src,
                                        const __m128i& coeff16lo,
                                        const __m128i& coeff16hi,
                                        __m128i* accum) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));

    // Pixels 0 and 1, one 16-bit lane per channel.
    __m128i src16 = _mm_unpacklo_epi8(src8, zero);
    __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16lo);
    __m128i mul_lo = _mm_mullo_epi16(src16, coeff16lo);
    // Interleaving the low and high product halves yields the full signed
    // 32-bit products: first for pixel 0, then for pixel 1.
    __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    *accum = _mm_add_epi32(*accum, t);
    t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    *accum = _mm_add_epi32(*accum, t);

    // Pixels 2 and 3.
    src16 = _mm_unpackhi_epi8(src8, zero);
    mul_hi = _mm_mulhi_epi16(src16, coeff16hi);
    mul_lo = _mm_mullo_epi16(src16, coeff16hi);
    t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    *accum = _mm_add_epi32(*accum, t);
    t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    *accum = _mm_add_epi32(*accum, t);
}

// Convolves horizontally along four rows. The row data is given in
// |src_data| and continues for the num_values() of the filter.
// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
// refer to that function for detailed comments.
//
// For every output column, one 4-byte pixel is written to each of the four
// |out_row| pointers, and those pointers are advanced by 4 bytes.
//
// Source pixels are always fetched 16 bytes (four pixels) at a time, even when
// only one to three filter taps remain, so each source row must stay readable
// for up to three pixels past the last pixel the filter actually uses.
void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
                                    const SkConvolutionFilter1D& filter,
                                    unsigned char* out_row[4]) {
    const int num_values = filter.numValues();
    int filter_offset, filter_length;
    const __m128i zero = _mm_setzero_si128();

    // |mask| will be used to decimate all extra filter coefficients that are
    // loaded by SIMD when |filter_length| is not divisible by 4.
    // mask[0] is not used in following algorithm.
    __m128i mask[4];
    mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
    mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
    mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

    // Output one pixel each iteration, calculating all channels (RGBA) together.
    for (int out_x = 0; out_x < num_values; out_x++) {
        const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
            filter.FilterForValue(out_x, &filter_offset, &filter_length);

        // One accumulator per source row, 32 bits per channel.
        __m128i accum0 = _mm_setzero_si128();
        __m128i accum1 = _mm_setzero_si128();
        __m128i accum2 = _mm_setzero_si128();
        __m128i accum3 = _mm_setzero_si128();

        // Byte offset of the first source pixel touched by this filter.
        int start = (filter_offset << 2);

        // We will load and accumulate with four coefficients per iteration.
        for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
            // [16] xx xx xx xx c3 c2 c1 c0
            const __m128i coeff =
                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
            // [16] xx xx xx xx c1 c1 c0 c0
            __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
            // [16] c1 c1 c1 c1 c0 c0 c0 c0
            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
            // [16] xx xx xx xx c3 c3 c2 c2
            __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
            // [16] c3 c3 c3 c3 c2 c2 c2 c2
            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

            accumulate4Taps_SSE2(src_data[0] + start, coeff16lo, coeff16hi, &accum0);
            accumulate4Taps_SSE2(src_data[1] + start, coeff16lo, coeff16hi, &accum1);
            accumulate4Taps_SSE2(src_data[2] + start, coeff16lo, coeff16hi, &accum2);
            accumulate4Taps_SSE2(src_data[3] + start, coeff16lo, coeff16hi, &accum3);

            start += 16;
            filter_values += 4;
        }

        // Handle the one to three taps left over when |filter_length| is not
        // a multiple of four.
        const int r = filter_length & 3;
        if (r) {
            // Note: filter_values must be padded to align_up(filter_offset, 8);
            __m128i coeff =
                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
            // Mask out extra filter taps.
            coeff = _mm_and_si128(coeff, mask[r]);
            __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
            /* c1 c1 c1 c1 c0 c0 c0 c0 */
            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
            __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

            // The masked-out coefficients are zero, so the extra pixels read
            // here contribute nothing to the accumulators.
            accumulate4Taps_SSE2(src_data[0] + start, coeff16lo, coeff16hi, &accum0);
            accumulate4Taps_SSE2(src_data[1] + start, coeff16lo, coeff16hi, &accum1);
            accumulate4Taps_SSE2(src_data[2] + start, coeff16lo, coeff16hi, &accum2);
            accumulate4Taps_SSE2(src_data[3] + start, coeff16lo, coeff16hi, &accum3);
        }

        // For each row: shift right for the fixed point implementation, pack
        // 32 -> 16 bits (signed saturation), then 16 -> 8 bits (unsigned
        // saturation).
        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
        accum0 = _mm_packs_epi32(accum0, zero);
        accum0 = _mm_packus_epi16(accum0, zero);
        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
        accum1 = _mm_packs_epi32(accum1, zero);
        accum1 = _mm_packus_epi16(accum1, zero);
        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
        accum2 = _mm_packs_epi32(accum2, zero);
        accum2 = _mm_packus_epi16(accum2, zero);
        accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
        accum3 = _mm_packs_epi32(accum3, zero);
        accum3 = _mm_packus_epi16(accum3, zero);

        // Store the 32-bit pixel of every row and advance the output cursors.
        *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
        *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
        *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
        *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
        out_row[0] += 4;
        out_row[1] += 4;
        out_row[2] += 4;
        out_row[3] += 4;
    }
}
// Post-processes the alpha channel of four packed 8-bit RGBA pixels
// ([8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0).
//
// With |has_alpha| the alpha of each pixel is raised to at least
// max(r, g, b) of that pixel; without it the alpha of each pixel is forced to
// 0xFF. Every pixel is handled independently of its neighbours.
template<bool has_alpha>
static inline __m128i fixupAlpha_SSE2(const __m128i& pixels) {
    if (has_alpha) {
        // Compute the max(ri, gi, bi) for each pixel.
        // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
        __m128i a = _mm_srli_epi32(pixels, 8);
        // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
        __m128i b = _mm_max_epu8(a, pixels);  // Max of r and g.
        // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
        a = _mm_srli_epi32(pixels, 16);
        // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
        b = _mm_max_epu8(a, b);  // Max of r and g and b.
        // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
        b = _mm_slli_epi32(b, 24);
        // Make sure the value of alpha channel is always larger than maximum
        // value of color channels.
        return _mm_max_epu8(b, pixels);
    } else {
        // Set value of alpha channels to 0xFF. The cast spells out the
        // unsigned -> int conversion that _mm_set1_epi32 requires.
        const __m128i mask = _mm_set1_epi32(static_cast<int>(0xff000000));
        return _mm_or_si128(pixels, mask);
    }
}

// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |source_data_rows| array, with each row
// being |pixel_width| wide.
//
// The output must have room for |pixel_width * 4| bytes.
//
// When |pixel_width| is not a multiple of four, the trailing one to three
// pixels are still fetched with a single 16-byte load per source row, so each
// source row is read up to 12 bytes past |pixel_width * 4|.
// NOTE(review): presumably callers pad the row buffers for this -- confirm.
template<bool has_alpha>
void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
                             int filter_length,
                             unsigned char* const* source_data_rows,
                             int pixel_width,
                             unsigned char* out_row) {
    // Number of pixels that can be handled in complete groups of four.
    const int width = pixel_width & ~3;

    const __m128i zero = _mm_setzero_si128();
    __m128i accum0, accum1, accum2, accum3, coeff16;
    const __m128i* src;

    // Output four pixels per iteration (16 bytes).
    for (int out_x = 0; out_x < width; out_x += 4) {
        // Accumulated result for each pixel. 32 bits per RGBA channel.
        accum0 = _mm_setzero_si128();
        accum1 = _mm_setzero_si128();
        accum2 = _mm_setzero_si128();
        accum3 = _mm_setzero_si128();

        // Convolve with one filter coefficient per iteration.
        for (int filter_y = 0; filter_y < filter_length; filter_y++) {
            // Duplicate the filter coefficient 8 times.
            // [16] cj cj cj cj cj cj cj cj
            coeff16 = _mm_set1_epi16(filter_values[filter_y]);

            // Load four pixels (16 bytes) together.
            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            src = reinterpret_cast<const __m128i*>(
                &source_data_rows[filter_y][out_x << 2]);
            __m128i src8 = _mm_loadu_si128(src);

            // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
            // multiply with current coefficient => accumulate the result.
            // [16] a1 b1 g1 r1 a0 b0 g0 r0
            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a0 b0 g0 r0
            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum0 = _mm_add_epi32(accum0, t);
            // [32] a1 b1 g1 r1
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum1 = _mm_add_epi32(accum1, t);

            // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
            // multiply with current coefficient => accumulate the result.
            // [16] a3 b3 g3 r3 a2 b2 g2 r2
            src16 = _mm_unpackhi_epi8(src8, zero);
            mul_hi = _mm_mulhi_epi16(src16, coeff16);
            mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a2 b2 g2 r2
            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum2 = _mm_add_epi32(accum2, t);
            // [32] a3 b3 g3 r3
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum3 = _mm_add_epi32(accum3, t);
        }

        // Shift right for fixed point implementation.
        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
        accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);

        // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
        // [16] a1 b1 g1 r1 a0 b0 g0 r0
        accum0 = _mm_packs_epi32(accum0, accum1);
        // [16] a3 b3 g3 r3 a2 b2 g2 r2
        accum2 = _mm_packs_epi32(accum2, accum3);

        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
        accum0 = _mm_packus_epi16(accum0, accum2);

        accum0 = fixupAlpha_SSE2<has_alpha>(accum0);

        // Store the convolution result (16 bytes) and advance the pixel pointers.
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
        out_row += 16;
    }

    // When the width of the output is not divisible by 4, We need to save one
    // pixel (4 bytes) each time. And also the fourth pixel is always absent.
    if (pixel_width & 3) {
        accum0 = _mm_setzero_si128();
        accum1 = _mm_setzero_si128();
        accum2 = _mm_setzero_si128();

        for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
            coeff16 = _mm_set1_epi16(filter_values[filter_y]);

            // Still a full 16-byte load; only the first one to three pixels
            // of it end up in the output.
            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            src = reinterpret_cast<const __m128i*>(
                &source_data_rows[filter_y][width << 2]);
            __m128i src8 = _mm_loadu_si128(src);

            // [16] a1 b1 g1 r1 a0 b0 g0 r0
            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a0 b0 g0 r0
            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum0 = _mm_add_epi32(accum0, t);
            // [32] a1 b1 g1 r1
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum1 = _mm_add_epi32(accum1, t);

            // [16] a3 b3 g3 r3 a2 b2 g2 r2
            src16 = _mm_unpackhi_epi8(src8, zero);
            mul_hi = _mm_mulhi_epi16(src16, coeff16);
            mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a2 b2 g2 r2
            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum2 = _mm_add_epi32(accum2, t);
        }

        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);

        // [16] a1 b1 g1 r1 a0 b0 g0 r0
        accum0 = _mm_packs_epi32(accum0, accum1);
        // [16] 00 00 00 00 a2 b2 g2 r2  (the fourth pixel is never computed)
        accum2 = _mm_packs_epi32(accum2, zero);
        // [8] 00 00 00 00 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
        accum0 = _mm_packus_epi16(accum0, accum2);

        accum0 = fixupAlpha_SSE2<has_alpha>(accum0);

        // Emit the remaining pixels one at a time, shifting the next pixel
        // down into the low 32 bits after each store.
        for (int out_x = width; out_x < pixel_width; out_x++) {
            *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
            accum0 = _mm_srli_si128(accum0, 4);
            out_row += 4;
        }
    }
}
// Runtime entry point for the vertical convolution: forwards all arguments to
// the convolveVertically_SSE2<> instantiation selected by |has_alpha|.
void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
                             int filter_length,
                             unsigned char* const* source_data_rows,
                             int pixel_width,
                             unsigned char* out_row,
                             bool has_alpha) {
    if (!has_alpha) {
        convolveVertically_SSE2<false>(filter_values, filter_length, source_data_rows,
                                       pixel_width, out_row);
        return;
    }
    convolveVertically_SSE2<true>(filter_values, filter_length, source_data_rows,
                                  pixel_width, out_row);
}
// Appends |paddingCount| zero-valued dummy coefficients to |filter|.
//
// The padding goes after the coefficients of the last filter to prevent SIMD
// instructions which load 8 or 16 bytes together from accessing invalid memory
// areas. No attempt is made to align the coefficients, due to the opaqueness
// of the <vector> implementation. This has to be done after all |AddFilter|
// calls.
void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {
    const int paddingCount = 8;
    const SkConvolutionFilter1D::ConvolutionFixed zeroCoefficient =
        static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0);

    for (int remaining = paddingCount; remaining > 0; --remaining) {
        filter->addFilterValue(zeroCoefficient);
    }
}

View File

@ -11,10 +11,27 @@
#define SkBitmapFilter_opts_sse2_DEFINED
#include "SkBitmapProcState.h"
#include "SkConvolver.h"
// SSE2 variants of the high quality bitmap filter procs.
// NOTE(review): their definitions are not part of this header; confirm the
// exact contract against the corresponding .cpp file.
void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y,
SkPMColor *SK_RESTRICT colors, int count);
void highQualityFilter_SSE2(const SkBitmapProcState &s, int x, int y,
SkPMColor *SK_RESTRICT colors, int count);
// Vertical convolution producing one output row of |pixel_width| RGBA pixels:
// filter_values[i] is applied to source_data_rows[i] for i in
// [0, filter_length). |out_row| must have room for |pixel_width * 4| bytes.
// When |has_alpha| is false the output alpha is forced to 0xFF; otherwise the
// alpha of each pixel is raised to at least the maximum of its color channels.
void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
int filter_length,
unsigned char* const* source_data_rows,
int pixel_width,
unsigned char* out_row,
bool has_alpha);
// Horizontal convolution of four source rows at once. Writes
// filter.numValues() 4-byte pixels through each of the four |out_row|
// pointers and advances those pointers as it goes.
void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
const SkConvolutionFilter1D& filter,
unsigned char* out_row[4]);
// Horizontal convolution of a single source row into |out_row|.
// NOTE(review): how |has_alpha| affects the result is not visible from this
// header -- confirm against the definition.
void convolveHorizontally_SSE2(const unsigned char* src_data,
const SkConvolutionFilter1D& filter,
unsigned char* out_row,
bool has_alpha);
// Appends zero-valued padding coefficients to |filter| so that SIMD loads of
// 8 or 16 bytes past the last real coefficient stay inside valid memory.
// Must be called after all filters have been added.
void applySIMDPadding_SSE2(SkConvolutionFilter1D* filter);
#endif

View File

@ -21,3 +21,6 @@
// Intentionally a no-op: the default supplied function pointers are kept
// exactly as they are.
void SkBitmapProcState::platformProcs() {
}
// Empty implementation: just uses the default supplied convolution function
// pointers.
//
// The hook must be named platformConvolutionProcs so that it matches the
// member that the SIMD-enabled platform file defines; a differently named stub
// would leave platformConvolutionProcs() undefined on platforms that build
// this file.
void SkBitmapProcState::platformConvolutionProcs() {}

View File

@ -107,6 +107,16 @@ static bool cachedHasSSSE3() {
SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", false, "Use SSE optimized version of high quality image filters");
// Installs the SSE2 convolution routines into |fConvolutionProcs| when the CPU
// reports SSE2 support; otherwise leaves |fConvolutionProcs| untouched.
void SkBitmapProcState::platformConvolutionProcs() {
    if (!cachedHasSSE2()) {
        return;
    }

    fConvolutionProcs->fConvolveHorizontally = &convolveHorizontally_SSE2;
    fConvolutionProcs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2;
    fConvolutionProcs->fConvolveVertically = &convolveVertically_SSE2;
    fConvolutionProcs->fApplySIMDPadding = &applySIMDPadding_SSE2;
    // NOTE(review): presumably 3 because the horizontal SSE2 kernels fetch
    // four pixels per load even when only one is needed -- confirm against the
    // code that consumes fExtraHorizontalReads.
    fConvolutionProcs->fExtraHorizontalReads = 3;
}
void SkBitmapProcState::platformProcs() {
if (cachedHasSSSE3()) {
#if !defined(SK_BUILD_FOR_ANDROID)
@ -151,9 +161,6 @@ void SkBitmapProcState::platformProcs() {
if (fShaderProc32 == highQualityFilter) {
fShaderProc32 = highQualityFilter_SSE2;
}
if (fShaderProc32 == highQualityFilter_ScaleOnly) {
fShaderProc32 = highQualityFilter_ScaleOnly_SSE2;
}
}
}
}