attempt 2: add experimental bilerp_clamp_8888 stage

It looks like we can specialize hot image shaders into their
own single stages for a good speedup on both x86 and ARM.

I've started here with bilerp_clamp_8888, and will
follow up with bgra and 565, and lowp versions of those,
and probably also the same for nearest neighbors.

All pixels are identical in GMs.
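
For reference, here is a minimal scalar sketch of the sampling such a fused
stage performs: one clamp/clamp bilinear lookup from an RGBA_8888 image, with
the four taps, the edge clamping, and the area weights all inlined. This is
hypothetical illustration code written for this description, not code from the
CL; the SIMD stage below computes the same thing across a vector of pixels.

    // Hypothetical scalar reference (not Skia code) for fused bilinear
    // sampling with clamp/clamp tiling from an RGBA_8888 image.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <initializer_list>

    struct Img { const uint32_t* pixels; int stride, width, height; };

    // Sample at pixel-space center (cx,cy); writes per-channel values in [0,1].
    static void bilerp_clamp_8888_ref(const Img& img, float cx, float cy, float rgba[4]) {
        float fx = cx + 0.5f - std::floor(cx + 0.5f),   // fract(cx + 0.5f)
              fy = cy + 0.5f - std::floor(cy + 0.5f);
        rgba[0] = rgba[1] = rgba[2] = rgba[3] = 0.0f;
        for (float dy : {-0.5f, +0.5f})
        for (float dx : {-0.5f, +0.5f}) {
            // Clamp each tap to the image bounds, like ix_and_ptr() does.
            int x = std::clamp((int)std::floor(cx + dx), 0, img.width  - 1),
                y = std::clamp((int)std::floor(cy + dy), 0, img.height - 1);
            uint32_t px   = img.pixels[y*img.stride + x];
            float    area = (dx > 0 ? fx : 1 - fx) * (dy > 0 ? fy : 1 - fy);
            for (int c = 0; c < 4; c++) {
                rgba[c] += ((px >> (8*c)) & 0xff) * (1/255.0f) * area;
            }
        }
    }

    int main() {
        const uint32_t px[] = { 0xff000000, 0xff0000ff,   // black, red
                                0xffff0000, 0xff00ff00 }; // blue,  green
        Img img = {px, 2, 2, 2};
        float rgba[4];
        bilerp_clamp_8888_ref(img, 1.0f, 1.0f, rgba);     // midway between all 4 centers
        std::printf("r=%.3f g=%.3f b=%.3f a=%.3f\n",      // 0.250 0.250 0.250 1.000
                    rgba[0], rgba[1], rgba[2], rgba[3]);
        return 0;
    }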

Change-Id: Ib5ed6e528efd9e3eed96ba67d02fbec2e8133a81
Reviewed-on: https://skia-review.googlesource.com/86860
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Author: Mike Klein <mtklein@chromium.org>, 2017-12-11 09:59:47 -05:00
Committed by: Skia Commit-Bot
commit 8a64e52a98, parent f226e66d75
7 changed files with 11151 additions and 9242 deletions


@@ -53,6 +53,7 @@ struct SkJumper_Engine;
     M(load_f32) M(load_f32_dst) M(store_f32) \
     M(load_8888) M(load_8888_dst) M(store_8888) M(gather_8888) \
     M(load_bgra) M(load_bgra_dst) M(store_bgra) M(gather_bgra) \
+    M(bilerp_clamp_8888) \
     M(load_u16_be) M(load_rgb_u16_be) M(store_u16_be) \
     M(load_tables_u16_be) M(load_tables_rgb_u16_be) M(load_tables) \
     M(load_rgba) M(store_rgba) \


@@ -219,6 +219,7 @@ extern "C" {
     NOPE(load_f32) NOPE(load_f32_dst) NOPE(store_f32)
     LOWP(load_8888) LOWP(load_8888_dst) LOWP(store_8888) LOWP(gather_8888)
     LOWP(load_bgra) LOWP(load_bgra_dst) LOWP(store_bgra) LOWP(gather_bgra)
+    TODO(bilerp_clamp_8888)
     TODO(load_u16_be) TODO(load_rgb_u16_be) TODO(store_u16_be)
     NOPE(load_tables_u16_be) NOPE(load_tables_rgb_u16_be) NOPE(load_tables)
     NOPE(load_rgba) NOPE(store_rgba)


@@ -48,10 +48,10 @@ struct SkJumper_MemoryCtx {
 };
 
 struct SkJumper_GatherCtx {
-    void* pixels;
-    int   stride;
-    float width,
-          height;
+    const void* pixels;
+    int         stride;
+    float       width;
+    float       height;
 };
 
 // State shared by save_xy, accumulate, and bilinear_* / bicubic_*.

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -194,13 +194,15 @@ SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int dx, int dy) {
     return (T*)ctx->pixels + dy*ctx->stride + dx;
 }
 
+// clamp v to [0,limit).
+SI F clamp(F v, F limit) {
+    F inclusive = bit_cast<F>( bit_cast<U32>(limit) - 1 );  // Exclusive -> inclusive.
+    return min(max(0, v), inclusive);
+}
+
 // Used by gather_ stages to calculate the base pointer and a vector of indices to load.
 template <typename T>
 SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
-    auto clamp = [](F v, F limit) {
-        limit = bit_cast<F>( bit_cast<U32>(limit) - 1 );  // Exclusive -> inclusive.
-        return min(max(0, v), limit);
-    };
     x = clamp(x, ctx->width);
     y = clamp(y, ctx->height);
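
The bit_cast<U32>(limit) - 1 line is the subtle part of the hoisted clamp():
for positive finite floats, IEEE-754 bit patterns order the same way as the
values they encode, so subtracting 1 from the bit pattern of limit yields the
largest representable float strictly below it. Clamping to that value and then
truncating can never produce an index equal to limit, with no extra compare.
A standalone demonstration of the trick (not Skia code; bit_cast replaced with
memcpy):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        float limit = 8.0f;   // e.g. an image width, stored as float
        uint32_t bits;
        std::memcpy(&bits, &limit, sizeof(bits));
        bits -= 1;            // one ULP below 8.0f
        float inclusive;
        std::memcpy(&inclusive, &bits, sizeof(inclusive));
        // Prints "7.99999952 truncates to 7": even an input of exactly 8.0
        // clamps to a value that truncates to the last valid index.
        std::printf("%.9g truncates to %d\n", inclusive, (int)inclusive);
        return 0;
    }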
@@ -1521,3 +1523,47 @@ STAGE(gauss_a_to_rgba, Ctx::None) {
     g = a;
     b = a;
 }
+
+// A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
+STAGE(bilerp_clamp_8888, SkJumper_GatherCtx* ctx) {
+    // (cx,cy) are the center of our sample.
+    F cx = r,
+      cy = g;
+
+    // All sample points are at the same fractional offset (fx,fy).
+    // They're the 4 corners of a logical 1x1 pixel surrounding (cx,cy) at (0.5,0.5) offsets.
+    F fx = fract(cx + 0.5f),
+      fy = fract(cy + 0.5f);
+
+    // We'll accumulate the color of all four samples into {r,g,b,a} directly.
+    r = g = b = a = 0;
+
+    float offsets[] = {-0.5f, +0.5f};
+    for (float dy : offsets)
+    for (float dx : offsets) {
+        // (x,y) are the coordinates of this sample point.
+        F x = cx + dx,
+          y = cy + dy;
+
+        // ix_and_ptr() will clamp to the image's bounds for us.
+        const uint32_t* ptr;
+        U32 ix = ix_and_ptr(&ptr, ctx, x,y);
+
+        F sr,sg,sb,sa;
+        from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa);
+
+        // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
+        // are combined in direct proportion to their area overlapping that logical query pixel.
+        //
+        // At positive offsets, the x-axis contribution to that rectangle is fx;
+        // at negative offsets it's (1-fx).  Same deal for y.
+        F sx = (dx > 0) ? fx : 1.0f - fx,
+          sy = (dy > 0) ? fy : 1.0f - fy,
+          area = sx * sy;
+
+        r += sr * area;
+        g += sg * area;
+        b += sb * area;
+        a += sa * area;
+    }
+}
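
To make the area weighting concrete: with, say, fx = 0.25 and fy = 0.75, the
four taps get weights 0.75*0.25, 0.25*0.25, 0.75*0.75, and 0.25*0.75, and the
weights always sum to (fx + (1-fx)) * (fy + (1-fy)) = 1, so a constant image
stays constant. A quick standalone check (not Skia code):

    #include <cstdio>
    #include <initializer_list>

    int main() {
        float fx = 0.25f, fy = 0.75f;   // arbitrary fractional offsets
        float sum = 0.0f;
        for (float dy : {-0.5f, +0.5f})
        for (float dx : {-0.5f, +0.5f}) {
            float sx = (dx > 0) ? fx : 1.0f - fx,
                  sy = (dy > 0) ? fy : 1.0f - fy;
            std::printf("dx=%+.1f dy=%+.1f weight=%.4f\n", dx, dy, sx*sy);
            sum += sx*sy;
        }
        std::printf("sum=%.4f\n", sum);  // always 1.0000
        return 0;
    }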


@@ -313,7 +313,7 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
     p->append_matrix(alloc, matrix);
 
     auto gather = alloc->make<SkJumper_GatherCtx>();
-    gather->pixels = pm.writable_addr();  // Don't worry, we won't write to it.
+    gather->pixels = pm.addr();
     gather->stride = pm.rowBytesAsPixels();
     gather->width  = pm.width();
     gather->height = pm.height();
@@ -325,6 +325,8 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
     limit_y->scale    = pm.height();
     limit_y->invScale = 1.0f / pm.height();
 
+    bool is_srgb = rec.fDstCS && (!info.colorSpace() || info.gammaCloseToSRGB());
+
     auto append_tiling_and_gather = [&] {
         switch (fTileModeX) {
             case kClamp_TileMode: /* The gather_xxx stage will clamp for us. */ break;
@@ -346,11 +348,38 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
             case kRGBA_F16_SkColorType: p->append(SkRasterPipeline::gather_f16,  gather); break;
             default: SkASSERT(false);
         }
-        if (rec.fDstCS && (!info.colorSpace() || info.gammaCloseToSRGB())) {
+        if (is_srgb) {
             p->append(SkRasterPipeline::from_srgb);
         }
     };
 
+    auto append_misc = [&] {
+        if (info.colorType() == kAlpha_8_SkColorType) {
+            p->append(SkRasterPipeline::set_rgb, &misc->paint_color);
+        }
+        if (info.colorType() == kAlpha_8_SkColorType ||
+            info.alphaType() == kUnpremul_SkAlphaType) {
+            p->append(SkRasterPipeline::premul);
+        }
+        if (quality > kLow_SkFilterQuality) {
+            // Bicubic filtering naturally produces out of range values on both sides.
+            p->append(SkRasterPipeline::clamp_0);
+            p->append(SkRasterPipeline::clamp_a);
+        }
+        append_gamut_transform(p, alloc, info.colorSpace(), rec.fDstCS, kPremul_SkAlphaType);
+        return true;
+    };
+
+    if (quality == kLow_SkFilterQuality            &&
+        info.colorType() == kRGBA_8888_SkColorType &&
+        fTileModeX == SkShader::kClamp_TileMode    &&
+        fTileModeY == SkShader::kClamp_TileMode    &&
+        !is_srgb) {
+
+        p->append(SkRasterPipeline::bilerp_clamp_8888, gather);
+        return append_misc();
+    }
+
     SkJumper_SamplerCtx* sampler = nullptr;
     if (quality != kNone_SkFilterQuality) {
         sampler = alloc->make<SkJumper_SamplerCtx>();
@@ -366,6 +395,7 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
     if (quality == kNone_SkFilterQuality) {
         append_tiling_and_gather();
+
     } else if (quality == kLow_SkFilterQuality) {
         p->append(SkRasterPipeline::save_xy, sampler);
@@ -375,6 +405,7 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
         sample(SkRasterPipeline::bilinear_px, SkRasterPipeline::bilinear_py);
 
         p->append(SkRasterPipeline::move_dst_src);
+
     } else {
         p->append(SkRasterPipeline::save_xy, sampler);
@@ -401,17 +432,5 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
         p->append(SkRasterPipeline::move_dst_src);
     }
 
-    if (info.colorType() == kAlpha_8_SkColorType) {
-        p->append(SkRasterPipeline::set_rgb, &misc->paint_color);
-    }
-    if (info.colorType() == kAlpha_8_SkColorType || info.alphaType() == kUnpremul_SkAlphaType) {
-        p->append(SkRasterPipeline::premul);
-    }
-    if (quality > kLow_SkFilterQuality) {
-        // Bicubic filtering naturally produces out of range values on both sides.
-        p->append(SkRasterPipeline::clamp_0);
-        p->append(SkRasterPipeline::clamp_a);
-    }
-    append_gamut_transform(p, alloc, info.colorSpace(), rec.fDstCS, kPremul_SkAlphaType);
-    return true;
+    return append_misc();
 }
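
For a sense of why the fused stage wins: before this CL, low-quality bilinear
sampling ran as a chain of small stages, save_xy, then for each of the four
taps an x/y offset stage pair, a gather, and an accumulate, and finally
move_dst_src, while the fast path above appends the single bilerp_clamp_8888
stage instead. A rough illustration (not Skia code; the per-tap expansion of
sample() is inferred from the stage names in this diff, and tiling stages for
repeat/mirror modes are omitted since the fast path requires clamp/clamp):

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        std::vector<std::string> generic = {"save_xy"};
        const char* taps[][2] = {{"bilinear_nx","bilinear_ny"}, {"bilinear_px","bilinear_ny"},
                                 {"bilinear_nx","bilinear_py"}, {"bilinear_px","bilinear_py"}};
        for (auto& t : taps) {
            generic.push_back(t[0]);          // x offset for this tap
            generic.push_back(t[1]);          // y offset for this tap
            generic.push_back("gather_8888"); // load the tap's pixels
            generic.push_back("accumulate");  // weight and add into the accumulators
        }
        generic.push_back("move_dst_src");

        std::printf("generic bilinear path: %zu stages\n", generic.size());  // 18
        std::printf("fused fast path:       1 stage (bilerp_clamp_8888)\n");
        return 0;
    }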