restore _DXDY image shader on ARM

This is mostly the patch we've been looking at, rebased,
with some of my comments from the review folded in.

The perf speedup is qualitatively the same as I saw on the other patch.
On that same Snapdragon 835, draw_bitmap_aa_rotate runs about 30%
faster (543.39 vs. 712.71us) and draw_bitmap_noaa_rotate about 15% faster
(481.93 vs. 572.13us).

The main thing I have omitted is the NEON specialization of matrix
procs.  It looks like both nofilter_affine() and filter_affine() are
autovectorized well, and we seem to perform fine enough without manual
specialization here.  I'm even tempted to remove [no]filter_scale_neon()
as a follow-up.

Image diffs look mostly fine.  This unexpectedly fixes rotated lighting
shaders in GMs.  Clearly that lighting shader must get a lot of use...

Change-Id: I67ee0b3ab92d6e56584ece05feb6e66d6fb7c660
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/249860
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Mike Reed <reed@google.com>
This commit is contained in:
Mike Klein 2019-10-21 13:10:07 -05:00 committed by Skia Commit-Bot
parent 9136bd7686
commit 37bc8f9652
7 changed files with 187 additions and 33 deletions

View File

@ -118,6 +118,26 @@ static void S32_alpha_D32_nofilter_DX(const SkBitmapProcState& s,
}
}
static void S32_alpha_D32_nofilter_DXDY(const SkBitmapProcState& s,
const uint32_t* xy, int count, SkPMColor* colors) {
SkASSERT(count > 0 && colors != nullptr);
SkASSERT(kNone_SkFilterQuality == s.fFilterQuality);
SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
SkASSERT(s.fAlphaScale <= 256);
auto src = (const char*)s.fPixmap.addr();
size_t rb = s.fPixmap.rowBytes();
while (count --> 0) {
uint32_t XY = *xy++,
x = XY & 0xffff,
y = XY >> 16;
SkASSERT(x < (unsigned)s.fPixmap.width ());
SkASSERT(y < (unsigned)s.fPixmap.height());
*colors++ = ((const SkPMColor*)(src + y*rb))[x];
}
}
SkBitmapProcInfo::SkBitmapProcInfo(const SkImage_Base* image, SkTileMode tmx, SkTileMode tmy)
: fImage(image)
, fTileModeX(tmx)
@ -165,7 +185,8 @@ static bool valid_for_filtering(unsigned dimension) {
}
bool SkBitmapProcInfo::init(const SkMatrix& inv, const SkPaint& paint) {
SkASSERT(inv.isScaleTranslate());
SkASSERT(!inv.hasPerspective());
SkASSERT(SkOpts::S32_alpha_D32_filter_DXDY || inv.isScaleTranslate());
fPixmap.reset();
fInvMatrix = inv;
@ -236,7 +257,8 @@ bool SkBitmapProcInfo::init(const SkMatrix& inv, const SkPaint& paint) {
* and may be removed.
*/
bool SkBitmapProcState::chooseProcs() {
SkASSERT(fInvMatrix.isScaleTranslate());
SkASSERT(!fInvMatrix.hasPerspective());
SkASSERT(SkOpts::S32_alpha_D32_filter_DXDY || fInvMatrix.isScaleTranslate());
SkASSERT(fPixmap.colorType() == kN32_SkColorType);
SkASSERT(fPixmap.alphaType() == kPremul_SkAlphaType ||
fPixmap.alphaType() == kOpaque_SkAlphaType);
@ -246,6 +268,7 @@ bool SkBitmapProcState::chooseProcs() {
fInvProc = SkMatrixPriv::GetMapXYProc(fInvMatrix);
fInvSxFractionalInt = SkScalarToFractionalInt(fInvMatrix.getScaleX());
fInvKyFractionalInt = SkScalarToFractionalInt(fInvMatrix.getSkewY ());
fAlphaScale = SkAlpha255To256(SkColorGetA(fPaintColor));
@ -253,17 +276,20 @@ bool SkBitmapProcState::chooseProcs() {
fMatrixProc = this->chooseMatrixProc(translate_only);
SkASSERT(fMatrixProc);
if (fFilterQuality > kNone_SkFilterQuality) {
fSampleProc32 = SkOpts::S32_alpha_D32_filter_DX;
const bool filter = fFilterQuality > kNone_SkFilterQuality;
if (fInvMatrix.isScaleTranslate()) {
fSampleProc32 = filter ? SkOpts::S32_alpha_D32_filter_DX : S32_alpha_D32_nofilter_DX ;
} else {
fSampleProc32 = S32_alpha_D32_nofilter_DX;
fSampleProc32 = filter ? SkOpts::S32_alpha_D32_filter_DXDY : S32_alpha_D32_nofilter_DXDY;
}
SkASSERT(fSampleProc32);
// our special-case shaderprocs
// TODO: move this one into chooseShaderProc32() or pull all that in here.
if (fAlphaScale == 256
&& fFilterQuality == kNone_SkFilterQuality
&& SkTileMode::kClamp == fTileModeX) {
&& SkTileMode::kClamp == fTileModeX
&& fInvMatrix.isScaleTranslate()) {
fShaderProc32 = Clamp_S32_opaque_D32_nofilter_DX_shaderproc;
} else {
fShaderProc32 = this->chooseShaderProc32();
@ -591,6 +617,32 @@ static void check_scale_filter(uint32_t bitmapXY[], int count,
}
}
// Debug check of the unfiltered affine coordinate format: each of the `count`
// entries holds x in its low 16 bits and y in its high 16 bits.  Asserts that
// every x is below mx and every y is below my.
static void check_affine_nofilter(uint32_t bitmapXY[], int count, unsigned mx, unsigned my) {
    for (const uint32_t* entry = bitmapXY; count > 0; ++entry, --count) {
        const unsigned x = *entry & 0xFFFF;
        const unsigned y = *entry >> 16;
        SkASSERT(x < mx);
        SkASSERT(y < my);
    }
}
// Debug check of the filtered affine coordinate format.  Every pixel occupies two
// consecutive 32-bit words, the y word first and the x word second.  In each word
// one coordinate sits in the bits above bit 17 and the other in the low 14 bits;
// the four bits in between are not inspected here.  Asserts that both y values
// are below my and both x values are below mx.
static void check_affine_filter(uint32_t bitmapXY[], int count, unsigned mx, unsigned my) {
    const uint32_t* cursor = bitmapXY;
    for (int remaining = count; remaining > 0; --remaining) {
        const uint32_t packedY = *cursor++;
        const unsigned yHi = packedY >> 18;
        const unsigned yLo = packedY & 0x3FFF;
        SkASSERT(yHi < my);
        SkASSERT(yLo < my);

        const uint32_t packedX = *cursor++;
        const unsigned xHi = packedX >> 18;
        const unsigned xLo = packedX & 0x3FFF;
        SkASSERT(xHi < mx);
        SkASSERT(xLo < mx);
    }
}
void SkBitmapProcState::DebugMatrixProc(const SkBitmapProcState& state,
uint32_t bitmapXY[], int count,
int x, int y) {
@ -601,11 +653,13 @@ void SkBitmapProcState::DebugMatrixProc(const SkBitmapProcState& state,
void (*proc)(uint32_t bitmapXY[], int count, unsigned mx, unsigned my);
// There are two formats possible:
// filter -vs- nofilter
SkASSERT(state.fInvMatrix.isScaleTranslate());
proc = state.fFilterQuality != kNone_SkFilterQuality ?
check_scale_filter : check_scale_nofilter;
const bool filter = state.fFilterQuality > kNone_SkFilterQuality;
if (state.fInvMatrix.isScaleTranslate()) {
proc = filter ? check_scale_filter : check_scale_nofilter;
} else {
proc = filter ? check_affine_filter : check_affine_nofilter;
}
proc(bitmapXY, count, state.fPixmap.width(), state.fPixmap.height());
}

View File

@ -72,6 +72,7 @@ struct SkBitmapProcState : public SkBitmapProcInfo {
SkMatrixPriv::MapXYProc fInvProc; // chooseProcs
SkFractionalInt fInvSxFractionalInt;
SkFractionalInt fInvKyFractionalInt;
SkFixed fFilterOneX;
SkFixed fFilterOneY;

View File

@ -124,6 +124,28 @@ static void nofilter_scale(const SkBitmapProcState& s,
}
}
template <unsigned (*tile)(SkFixed, int)>
static void nofilter_affine(const SkBitmapProcState& s,
uint32_t xy[], int count, int x, int y) {
SkASSERT(!s.fInvMatrix.hasPerspective());
const SkBitmapProcStateAutoMapper mapper(s, x, y);
SkFractionalInt fx = mapper.fractionalIntX(),
fy = mapper.fractionalIntY(),
dx = s.fInvSxFractionalInt,
dy = s.fInvKyFractionalInt;
int maxX = s.fPixmap.width () - 1,
maxY = s.fPixmap.height() - 1;
while (count --> 0) {
*xy++ = (tile(SkFractionalIntToFixed(fy), maxY) << 16)
| (tile(SkFractionalIntToFixed(fx), maxX) );
fx += dx;
fy += dy;
}
}
// Extract the high four fractional bits from fx, the lerp parameter when filtering.
static unsigned extract_low_bits_clamp(SkFixed fx, int /*max*/) {
// If we're already scaled up to by max like clamp/decal,
@ -136,26 +158,27 @@ static unsigned extract_low_bits_repeat_mirror(SkFixed fx, int max) {
return extract_low_bits_clamp((fx & 0xffff) * (max+1), max);
}
// Packs the two neighboring sample coordinates and the lerp weight for one axis
// into a single 32-bit word, laid out from high bits to low bits as:
//   tile(f, max)  |  4 bits from extract_low_bits(f, max)  |  14 bits for tile(f + one, max)
// The lerp weight is the fractional part of the coordinate itself.
template <unsigned (*tile)(SkFixed, int), unsigned (*extract_low_bits)(SkFixed, int)>
static uint32_t pack(SkFixed f, unsigned max, SkFixed one) {
    const uint32_t lowCoord  = tile(f, max);              // ends up in the high bits
    const uint32_t weight    = extract_low_bits(f, max);
    const uint32_t highCoord = tile((f + one), max);      // ends up in the low bits
    return (((lowCoord << 4) | weight) << 14) | highCoord;
}
template <unsigned (*tile)(SkFixed, int), unsigned (*extract_low_bits)(SkFixed, int), bool tryDecal>
static void filter_scale(const SkBitmapProcState& s,
uint32_t xy[], int count, int x, int y) {
SkASSERT(s.fInvMatrix.isScaleTranslate());
auto pack = [](SkFixed f, unsigned max, SkFixed one) {
unsigned i = tile(f, max);
i = (i << 4) | extract_low_bits(f, max);
return (i << 14) | (tile((f + one), max));
};
const unsigned maxX = s.fPixmap.width() - 1;
const SkFractionalInt dx = s.fInvSxFractionalInt;
SkFractionalInt fx;
{
const SkBitmapProcStateAutoMapper mapper(s, x, y);
const SkFixed fy = mapper.fixedY();
const unsigned maxY = s.fPixmap.height() - 1;
// compute our two Y values up front
*xy++ = pack(fy, maxY, s.fFilterOneY);
*xy++ = pack<tile, extract_low_bits>(mapper.fixedY(), maxY, s.fFilterOneY);
// now initialize fx
fx = mapper.fractionalIntX();
}
@ -175,8 +198,32 @@ static void filter_scale(const SkBitmapProcState& s,
}
while (count --> 0) {
SkFixed fixedFx = SkFractionalIntToFixed(fx);
*xy++ = pack(fixedFx, maxX, s.fFilterOneX);
*xy++ = pack<tile, extract_low_bits>(SkFractionalIntToFixed(fx), maxX, s.fFilterOneX);
fx += dx;
}
}
template <unsigned (*tile)(SkFixed, int), unsigned (*extract_low_bits)(SkFixed, int)>
static void filter_affine(const SkBitmapProcState& s,
uint32_t xy[], int count, int x, int y) {
SkASSERT(!s.fInvMatrix.hasPerspective());
const SkBitmapProcStateAutoMapper mapper(s, x, y);
SkFixed oneX = s.fFilterOneX,
oneY = s.fFilterOneY;
SkFractionalInt fx = mapper.fractionalIntX(),
fy = mapper.fractionalIntY(),
dx = s.fInvSxFractionalInt,
dy = s.fInvKyFractionalInt;
unsigned maxX = s.fPixmap.width () - 1,
maxY = s.fPixmap.height() - 1;
while (count --> 0) {
*xy++ = pack<tile, extract_low_bits>(SkFractionalIntToFixed(fy), maxY, oneY);
*xy++ = pack<tile, extract_low_bits>(SkFractionalIntToFixed(fx), maxX, oneX);
fy += dy;
fx += dx;
}
}
@ -205,8 +252,8 @@ static unsigned mirror(SkFixed fx, int max) {
// Mirror/Mirror's always just portable code.
static const SkBitmapProcState::MatrixProc MirrorX_MirrorY_Procs[] = {
nofilter_scale<mirror, false>,
filter_scale<mirror, extract_low_bits_repeat_mirror, false>,
nofilter_scale <mirror, false>, filter_scale <mirror, extract_low_bits_repeat_mirror, false>,
nofilter_affine<mirror>, filter_affine<mirror, extract_low_bits_repeat_mirror>,
};
// Clamp/Clamp and Repeat/Repeat have NEON or portable implementations.
@ -473,6 +520,8 @@ static const SkBitmapProcState::MatrixProc MirrorX_MirrorY_Procs[] = {
}
}
// TODO: nofilter_affine_neon
template <unsigned (*tile )(SkFixed, int),
int32x4_t (*tile4)(int32x4_t, unsigned),
unsigned (*extract_low_bits )(SkFixed, int),
@ -553,6 +602,8 @@ static const SkBitmapProcState::MatrixProc MirrorX_MirrorY_Procs[] = {
}
}
// TODO: filter_affine_neon
static const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs[] = {
nofilter_scale_neon<clamp, clamp8, true>,
filter_scale_neon<clamp,
@ -560,6 +611,8 @@ static const SkBitmapProcState::MatrixProc MirrorX_MirrorY_Procs[] = {
extract_low_bits_clamp,
extract_low_bits_clamp4,
true>,
nofilter_affine<clamp>, filter_affine<clamp, extract_low_bits_clamp>,
};
static const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs[] = {
@ -569,17 +622,19 @@ static const SkBitmapProcState::MatrixProc MirrorX_MirrorY_Procs[] = {
extract_low_bits_repeat_mirror,
extract_low_bits_repeat_mirror4,
false>,
nofilter_affine<repeat>, filter_affine<repeat, extract_low_bits_repeat_mirror>,
};
#else
static const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs[] = {
nofilter_scale<clamp, true>,
filter_scale<clamp, extract_low_bits_clamp, true>,
nofilter_scale <clamp, true>, filter_scale <clamp, extract_low_bits_clamp, true>,
nofilter_affine<clamp>, filter_affine<clamp, extract_low_bits_clamp>,
};
static const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs[] = {
nofilter_scale<repeat, false>,
filter_scale<repeat, extract_low_bits_repeat_mirror, false>,
nofilter_scale <repeat, false>, filter_scale <repeat, extract_low_bits_repeat_mirror,false>,
nofilter_affine<repeat>, filter_affine<repeat, extract_low_bits_repeat_mirror>,
};
#endif
@ -788,7 +843,7 @@ static void mirrorx_nofilter_trans(const SkBitmapProcState& s,
// The main entry point to the file, choosing between everything above.
SkBitmapProcState::MatrixProc SkBitmapProcState::chooseMatrixProc(bool translate_only_matrix) {
SkASSERT(fInvMatrix.isScaleTranslate());
SkASSERT(!fInvMatrix.hasPerspective());
SkASSERT(fTileModeX == fTileModeY);
SkASSERT(fTileModeX != SkTileMode::kDecal);
@ -804,6 +859,9 @@ SkBitmapProcState::MatrixProc SkBitmapProcState::chooseMatrixProc(bool translate
// The arrays are all [ nofilter_scale, filter_scale, nofilter_affine, filter_affine ].
int index = fFilterQuality > kNone_SkFilterQuality ? 1 : 0;
if (!fInvMatrix.isScaleTranslate()) {
index |= 2;
}
if (fTileModeX == SkTileMode::kClamp) {
// clamp gets special version of filterOne, working in non-normalized space (allowing decal)

View File

@ -84,6 +84,7 @@ namespace SkOpts {
DEFINE_DEFAULT(hash_fn);
DEFINE_DEFAULT(S32_alpha_D32_filter_DX);
DEFINE_DEFAULT(S32_alpha_D32_filter_DXDY);
#undef DEFINE_DEFAULT
#define M(st) (StageFn)SK_OPTS_NS::st,

View File

@ -61,9 +61,10 @@ namespace SkOpts {
}
// SkBitmapProcState optimized Shader, Sample, or Matrix procs.
// This is the only one that can use anything past SSE2/NEON.
extern void (*S32_alpha_D32_filter_DX)(const SkBitmapProcState&,
const uint32_t* xy, int count, SkPMColor*);
extern void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,
const uint32_t* xy, int count, SkPMColor*);
#define M(st) +1
// We can't necessarily express the type of SkJumper stage functions here,

View File

@ -497,6 +497,40 @@ static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, O
#endif
#if defined(SK_ARM_HAS_NEON)
// Sample proc for filtered sampling when both x and y vary from pixel to pixel.
//
// `xy` holds two words per output pixel: the first is decoded into two row
// indices and a y weight, the second into two column indices and an x weight.
// The four pixels at those row/column combinations are handed to
// filter_and_scale_by_alpha() together with the weights and s.fAlphaScale, and
// the result is written to the next slot of `colors`.
/*not static*/ inline
void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s,
                               const uint32_t* xy, int count, SkPMColor* colors) {
    SkASSERT(count > 0 && colors != nullptr);
    SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
    SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
    SkASSERT(s.fAlphaScale <= 256);

    const char* const base     = (const char*)s.fPixmap.addr();
    const size_t      rowBytes = s.fPixmap.rowBytes();

    const uint32_t* in = xy;
    for (int remaining = count; remaining > 0; --remaining) {
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*in++, &y0, &y1, &wy);

        int x0, x1, wx;
        decode_packed_coordinates_and_weight(*in++, &x0, &x1, &wx);

        // Rows are addressed in bytes (rowBytes), pixels within a row by index.
        const uint32_t* upperRow = (const uint32_t*)(base + y0*rowBytes);
        const uint32_t* lowerRow = (const uint32_t*)(base + y1*rowBytes);

        filter_and_scale_by_alpha(wx, wy,
                                  upperRow[x0], upperRow[x1],
                                  lowerRow[x0], lowerRow[x1],
                                  colors++,
                                  s.fAlphaScale);
    }
}
#else
// It's not yet clear whether it's worthwhile specializing for SSE2/SSSE3/AVX2.
// When SK_ARM_HAS_NEON is not defined there is no implementation of this proc,
// so the function pointer is a compile-time null.
// NOTE(review): code that selects this proc presumably has to null-check it first -- confirm.
constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,
const uint32_t*, int, SkPMColor*) = nullptr;
#endif
} // namespace SK_OPTS_NS
#endif

View File

@ -9,6 +9,7 @@
#include "src/core/SkBitmapController.h"
#include "src/core/SkColorSpacePriv.h"
#include "src/core/SkColorSpaceXformSteps.h"
#include "src/core/SkOpts.h"
#include "src/core/SkRasterPipeline.h"
#include "src/core/SkReadBuffer.h"
#include "src/core/SkWriteBuffer.h"
@ -72,20 +73,24 @@ bool SkImageShader::isOpaque() const {
#ifdef SK_ENABLE_LEGACY_SHADERCONTEXT
static bool legacy_shader_can_handle(const SkMatrix& inv) {
if (!inv.isScaleTranslate()) {
if (inv.hasPerspective()) {
return false;
}
// Scale+translate methods are always present, but affine might not be.
if (!SkOpts::S32_alpha_D32_filter_DXDY && !inv.isScaleTranslate()) {
return false;
}
// legacy code uses SkFixed 32.32, so ensure the inverse doesn't map device coordinates
// out of range.
const SkScalar max_dev_coord = 32767.0f;
SkRect src;
SkAssertResult(inv.mapRect(&src, SkRect::MakeWH(max_dev_coord, max_dev_coord)));
const SkRect src = inv.mapRect(SkRect::MakeWH(max_dev_coord, max_dev_coord));
// take 1/4 of max signed 32bits so we have room to subtract local values
const SkScalar max_fixed32dot32 = SK_MaxS32 * 0.25f;
if (!SkRect::MakeLTRB(-max_fixed32dot32, -max_fixed32dot32,
max_fixed32dot32, max_fixed32dot32).contains(src)) {
+max_fixed32dot32, +max_fixed32dot32).contains(src)) {
return false;
}