attempt 2: add experimental bilerp_clamp_8888 stage
It looks like we can specialize hot image shaders into their own single stages for a good speedup on both x86 and ARM. I've started here with bilerp_clamp_8888, and will follow up with bgra and 565, and lowp versions of those, and probably also the same for nearest neighbors. All pixels are identical in GMs. Change-Id: Ib5ed6e528efd9e3eed96ba67d02fbec2e8133a81 Reviewed-on: https://skia-review.googlesource.com/86860 Reviewed-by: Mike Klein <mtklein@chromium.org> Commit-Queue: Mike Klein <mtklein@chromium.org>
This commit is contained in:
parent
f226e66d75
commit
8a64e52a98
@ -53,6 +53,7 @@ struct SkJumper_Engine;
|
||||
M(load_f32) M(load_f32_dst) M(store_f32) \
|
||||
M(load_8888) M(load_8888_dst) M(store_8888) M(gather_8888) \
|
||||
M(load_bgra) M(load_bgra_dst) M(store_bgra) M(gather_bgra) \
|
||||
M(bilerp_clamp_8888) \
|
||||
M(load_u16_be) M(load_rgb_u16_be) M(store_u16_be) \
|
||||
M(load_tables_u16_be) M(load_tables_rgb_u16_be) M(load_tables) \
|
||||
M(load_rgba) M(store_rgba) \
|
||||
|
@ -219,6 +219,7 @@ extern "C" {
|
||||
NOPE(load_f32) NOPE(load_f32_dst) NOPE(store_f32)
|
||||
LOWP(load_8888) LOWP(load_8888_dst) LOWP(store_8888) LOWP(gather_8888)
|
||||
LOWP(load_bgra) LOWP(load_bgra_dst) LOWP(store_bgra) LOWP(gather_bgra)
|
||||
TODO(bilerp_clamp_8888)
|
||||
TODO(load_u16_be) TODO(load_rgb_u16_be) TODO(store_u16_be)
|
||||
NOPE(load_tables_u16_be) NOPE(load_tables_rgb_u16_be) NOPE(load_tables)
|
||||
NOPE(load_rgba) NOPE(store_rgba)
|
||||
|
@ -48,10 +48,10 @@ struct SkJumper_MemoryCtx {
|
||||
};
|
||||
|
||||
// Context describing the source image that gather-style stages sample from.
// SkImageShader::onAppendStages fills in every field from the source pixmap, and
// ix_and_ptr() clamps sample coordinates against width and height before indexing.
// NOTE(review): this diff view appears to list both the removed members (void* pixels;
// float width, height;) and their replacements (const void* pixels; one member per line);
// presumably only the second set exists in the header after this change -- confirm.
struct SkJumper_GatherCtx {
|
||||
void* pixels;
|
||||
int stride;
|
||||
float width,
|
||||
height;
|
||||
// Base address of the image's pixels; const because gathering only reads them.
const void* pixels;
|
||||
// Row pitch, set from the pixmap's rowBytesAsPixels() (so counted in pixels, not bytes).
int stride;
|
||||
// Image width, stored as a float; used as the exclusive upper limit when clamping x.
float width;
|
||||
// Image height, stored as a float; used as the exclusive upper limit when clamping y.
float height;
|
||||
};
|
||||
|
||||
// State shared by save_xy, accumulate, and bilinear_* / bicubic_*.
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -194,13 +194,15 @@ SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int dx, int dy) {
|
||||
return (T*)ctx->pixels + dy*ctx->stride + dx;
|
||||
}
|
||||
|
||||
// Clamp v to the half-open range [0,limit) and return the clamped value.
// The exclusive upper bound is turned into an inclusive one by subtracting 1 from limit's
// bit pattern, which (assuming limit is a positive, finite float) yields the largest
// representable value strictly below limit.
|
||||
SI F clamp(F v, F limit) {
|
||||
F inclusive = bit_cast<F>( bit_cast<U32>(limit) - 1 ); // Exclusive -> inclusive.
|
||||
// Apply the lower bound of 0 first, then cap at the inclusive upper bound.
return min(max(0, v), inclusive);
|
||||
}
|
||||
|
||||
// Used by gather_ stages to calculate the base pointer and a vector of indices to load.
|
||||
template <typename T>
|
||||
SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
|
||||
auto clamp = [](F v, F limit) {
|
||||
limit = bit_cast<F>( bit_cast<U32>(limit) - 1 ); // Exclusive -> inclusive.
|
||||
return min(max(0, v), limit);
|
||||
};
|
||||
x = clamp(x, ctx->width);
|
||||
y = clamp(y, ctx->height);
|
||||
|
||||
@ -1521,3 +1523,47 @@ STAGE(gauss_a_to_rgba, Ctx::None) {
|
||||
g = a;
|
||||
b = a;
|
||||
}
|
||||
|
||||
// A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
// It reads the sample center from r,g, gathers the four neighboring 8888 pixels through
// ctx, and leaves their bilinearly weighted sum in r,g,b,a.
|
||||
STAGE(bilerp_clamp_8888, SkJumper_GatherCtx* ctx) {
|
||||
// (cx,cy) are the center of our sample.
|
||||
F cx = r,
|
||||
cy = g;
|
||||
|
||||
// All sample points are at the same fractional offset (fx,fy).
|
||||
// They're the 4 corners of a logical 1x1 pixel surrounding (cx,cy) at (0.5,0.5) offsets.
|
||||
F fx = fract(cx + 0.5f),
|
||||
fy = fract(cy + 0.5f);
|
||||
|
||||
// We'll accumulate the color of all four samples into {r,g,b,a} directly.
|
||||
// (cx,cy) were copied out above, so overwriting r and g here is safe.
r = g = b = a = 0;
|
||||
|
||||
float offsets[] = {-0.5f,+0.5f};
|
||||
|
||||
// The outer loop has no braces: its entire body is the inner loop, so together they
// visit the four (dx,dy) combinations.
for (float dy : offsets)
|
||||
for (float dx : offsets) {
|
||||
// (x,y) are the coordinates of this sample point.
|
||||
F x = cx + dx,
|
||||
y = cy + dy;
|
||||
|
||||
// ix_and_ptr() will clamp to the image's bounds for us.
|
||||
const uint32_t* ptr;
|
||||
U32 ix = ix_and_ptr(&ptr, ctx, x,y);
|
||||
|
||||
// Load the sampled pixels and split them into per-channel values
// (presumably scaled to floats by from_8888 -- confirm against its definition).
F sr,sg,sb,sa;
|
||||
from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa);
|
||||
|
||||
// In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
|
||||
// are combined in direct proportion to their area overlapping that logical query pixel.
|
||||
// At positive offsets, the x-axis contribution to that rectangle is fx,
|
||||
// or (1-fx) at negative x. Same deal for y.
|
||||
// Across the four samples these areas sum to 1, so the result is a weighted average.
F sx = (dx > 0) ? fx : 1.0f - fx,
|
||||
sy = (dy > 0) ? fy : 1.0f - fy,
|
||||
area = sx * sy;
|
||||
|
||||
r += sr * area;
|
||||
g += sg * area;
|
||||
b += sb * area;
|
||||
a += sa * area;
|
||||
}
|
||||
}
|
||||
|
@ -313,7 +313,7 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
|
||||
p->append_matrix(alloc, matrix);
|
||||
|
||||
auto gather = alloc->make<SkJumper_GatherCtx>();
|
||||
gather->pixels = pm.writable_addr(); // Don't worry, we won't write to it.
|
||||
gather->pixels = pm.addr();
|
||||
gather->stride = pm.rowBytesAsPixels();
|
||||
gather->width = pm.width();
|
||||
gather->height = pm.height();
|
||||
@ -325,6 +325,8 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
|
||||
limit_y->scale = pm.height();
|
||||
limit_y->invScale = 1.0f / pm.height();
|
||||
|
||||
bool is_srgb = rec.fDstCS && (!info.colorSpace() || info.gammaCloseToSRGB());
|
||||
|
||||
auto append_tiling_and_gather = [&] {
|
||||
switch (fTileModeX) {
|
||||
case kClamp_TileMode: /* The gather_xxx stage will clamp for us. */ break;
|
||||
@ -346,11 +348,38 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
|
||||
case kRGBA_F16_SkColorType: p->append(SkRasterPipeline::gather_f16, gather); break;
|
||||
default: SkASSERT(false);
|
||||
}
|
||||
if (rec.fDstCS && (!info.colorSpace() || info.gammaCloseToSRGB())) {
|
||||
if (is_srgb) {
|
||||
p->append(SkRasterPipeline::from_srgb);
|
||||
}
|
||||
};
|
||||
|
||||
auto append_misc = [&] {
|
||||
if (info.colorType() == kAlpha_8_SkColorType) {
|
||||
p->append(SkRasterPipeline::set_rgb, &misc->paint_color);
|
||||
}
|
||||
if (info.colorType() == kAlpha_8_SkColorType ||
|
||||
info.alphaType() == kUnpremul_SkAlphaType) {
|
||||
p->append(SkRasterPipeline::premul);
|
||||
}
|
||||
if (quality > kLow_SkFilterQuality) {
|
||||
// Bicubic filtering naturally produces out of range values on both sides.
|
||||
p->append(SkRasterPipeline::clamp_0);
|
||||
p->append(SkRasterPipeline::clamp_a);
|
||||
}
|
||||
append_gamut_transform(p, alloc, info.colorSpace(), rec.fDstCS, kPremul_SkAlphaType);
|
||||
return true;
|
||||
};
|
||||
|
||||
if (quality == kLow_SkFilterQuality &&
|
||||
info.colorType() == kRGBA_8888_SkColorType &&
|
||||
fTileModeX == SkShader::kClamp_TileMode &&
|
||||
fTileModeY == SkShader::kClamp_TileMode &&
|
||||
!is_srgb) {
|
||||
|
||||
p->append(SkRasterPipeline::bilerp_clamp_8888, gather);
|
||||
return append_misc();
|
||||
}
|
||||
|
||||
SkJumper_SamplerCtx* sampler = nullptr;
|
||||
if (quality != kNone_SkFilterQuality) {
|
||||
sampler = alloc->make<SkJumper_SamplerCtx>();
|
||||
@ -366,6 +395,7 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
|
||||
|
||||
if (quality == kNone_SkFilterQuality) {
|
||||
append_tiling_and_gather();
|
||||
|
||||
} else if (quality == kLow_SkFilterQuality) {
|
||||
p->append(SkRasterPipeline::save_xy, sampler);
|
||||
|
||||
@ -375,6 +405,7 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
|
||||
sample(SkRasterPipeline::bilinear_px, SkRasterPipeline::bilinear_py);
|
||||
|
||||
p->append(SkRasterPipeline::move_dst_src);
|
||||
|
||||
} else {
|
||||
p->append(SkRasterPipeline::save_xy, sampler);
|
||||
|
||||
@ -401,17 +432,5 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
|
||||
p->append(SkRasterPipeline::move_dst_src);
|
||||
}
|
||||
|
||||
if (info.colorType() == kAlpha_8_SkColorType) {
|
||||
p->append(SkRasterPipeline::set_rgb, &misc->paint_color);
|
||||
}
|
||||
if (info.colorType() == kAlpha_8_SkColorType || info.alphaType() == kUnpremul_SkAlphaType) {
|
||||
p->append(SkRasterPipeline::premul);
|
||||
}
|
||||
if (quality > kLow_SkFilterQuality) {
|
||||
// Bicubic filtering naturally produces out of range values on both sides.
|
||||
p->append(SkRasterPipeline::clamp_0);
|
||||
p->append(SkRasterPipeline::clamp_a);
|
||||
}
|
||||
append_gamut_transform(p, alloc, info.colorSpace(), rec.fDstCS, kPremul_SkAlphaType);
|
||||
return true;
|
||||
return append_misc();
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user