Start moving SkRasterPipeline stages to SkOpts.

This lets them pick up runtime CPU specializations.  Here I've plugged in SSE4.1.  This is still one of the N prelude CLs to full 8-at-a-time AVX.
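
For context, SkOpts is essentially a table of function pointers that start out pointing at portable code and get overwritten at startup when the CPU supports something better.  Here is a minimal, self-contained sketch of that dispatch pattern; the names are illustrative stand-ins, not Skia's real ones.

    // Hypothetical sketch of SkOpts-style runtime dispatch (not Skia's actual code).
    #include <cstdio>

    namespace portable { inline void store_srgb() { std::printf("portable store_srgb\n"); } }
    namespace sse41    { inline void store_srgb() { std::printf("SSE4.1 store_srgb\n");   } }

    namespace opts {
        // Defaults bind the portable implementation at static-init time...
        void (*store_srgb)() = portable::store_srgb;

        void Init() {
            bool cpu_has_sse41 = true;              // stand-in for a real CPUID check
            if (cpu_has_sse41) {
                store_srgb = sse41::store_srgb;     // ...and Init() swaps in the best fit.
            }
        }
    }

    int main() {
        opts::Init();
        opts::store_srgb();   // callers always go through the pointer, so they pick up the swap
        return 0;
    }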

I've moved the union of the stages used by SkRasterPipelineBench and SkRasterPipelineBlitter to SkOpts... they'll all be used by the blitter eventually.  Picking up SSE4.1 specialization here (even while still just 4 pixels at a time) is a significant speedup, especially for store_srgb(), so much so that it's no longer really interesting to compare against the fused-but-default-instruction-set version in the bench.  So that version is gone now.
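
The stages being moved come in _body/_tail pairs: the body variant assumes a full group of four pixels, while the tail variant handles the one to three pixels left over at the end of a run.  A rough sketch of how a 4-at-a-time pipeline walks N pixels with that split follows; this is a simplified assumption of the structure, not Skia's actual run() code.

    // Simplified sketch of body/tail dispatch over N pixels, 4 at a time.
    #include <cstddef>
    #include <cstdio>

    static void body(size_t x)           { std::printf("body at x=%zu (4 pixels)\n", x); }
    static void tail(size_t x, size_t n) { std::printf("tail at x=%zu (%zu pixels)\n", x, n); }

    static void run(size_t n) {
        size_t x = 0;
        while (n >= 4) { body(x); x += 4; n -= 4; }   // full groups: the *_body chain
        if (n > 0)     { tail(x, n); }                // 1-3 stragglers: the *_tail chain
    }

    int main() { run(10); return 0; }   // body at 0, body at 4, tail at 8 covering 2 pixels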

That left the SkRasterPipeline unit test as the only other user of the EasyFn simplified interface to SkRasterPipeline.  So I converted that back down to the bare-metal interface, and EasyFn and its friends became SkRasterPipeline_opts.h-exclusive abbreviations (now called Kernel_Sk4f).  This isn't really unexpected: SkXfermode also wanted to build up its own little abstractions, and once you build your own abstraction, the marginal value of an additional EasyFn-like layer drops to zero or below.
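
For reference, these are the two interfaces in question, with Skia's vector and stage types stubbed out so the sketch stands alone; the real declarations appear in the diffs below.

    // Stand-in types so this sketch compiles outside of Skia.
    #include <cstddef>
    struct Sk4f  {};                 // Skia's 4-wide float vector
    struct Stage { void* fCtx; };    // roughly SkRasterPipeline::Stage

    // Bare-metal interface the unit test now uses directly: explicit Stage*,
    // and each stage is responsible for calling the next stage itself.
    using Fn = void (*)(Stage*, size_t x, size_t tail,
                        Sk4f, Sk4f, Sk4f, Sk4f,    // r,g,b,a
                        Sk4f, Sk4f, Sk4f, Sk4f);   // dr,dg,db,da

    // The abbreviation formerly called EasyFn, now private to SkRasterPipeline_opts.h
    // as Kernel_Sk4f: the context pointer comes first, the registers are references,
    // and the body<>/tail<> templates wrap it into an Fn and call next() for it.
    using Kernel_Sk4f = void(void*, size_t x, size_t tail,
                             Sk4f&, Sk4f&, Sk4f&, Sk4f&,
                             Sk4f&, Sk4f&, Sk4f&, Sk4f&);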

For simplicity I've left the SkXfermode stages alone, except srcover(), which was always part of the blitter.  No particular reason except keeping the churn down while I hack.  These _can_ be in SkOpts, but don't have to be until we go 8-at-a-time.

BUG=skia:

GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2752
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Change-Id: I3b476b18232a1598d8977e425be2150059ab71dc
Reviewed-on: https://skia-review.googlesource.com/2752
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Author: Mike Klein, 2016-09-29 09:04:15 -04:00 (committed by Skia Commit-Bot)
Parent: 8e6791fb9a
Commit: baaf8ad952
8 changed files with 458 additions and 453 deletions

@ -6,8 +6,8 @@
*/
#include "Benchmark.h"
#include "SkOpts.h"
#include "SkRasterPipeline.h"
#include "SkSRGB.h"
static const int N = 1023;
@ -22,186 +22,21 @@ static uint8_t mask[N];
// - src = srcover(dst, src)
// - store src back as srgb
SK_RASTER_STAGE(load_s_srgb) {
auto ptr = (const uint32_t*)ctx + x;
if (tail) {
float rs[] = {0,0,0,0},
gs[] = {0,0,0,0},
bs[] = {0,0,0,0},
as[] = {0,0,0,0};
for (size_t i = 0; i < (tail&3); i++) {
rs[i] = sk_linear_from_srgb[(ptr[i] >> 0) & 0xff];
gs[i] = sk_linear_from_srgb[(ptr[i] >> 8) & 0xff];
bs[i] = sk_linear_from_srgb[(ptr[i] >> 16) & 0xff];
as[i] = (ptr[i] >> 24) * (1/255.0f);
}
r = Sk4f::Load(rs);
g = Sk4f::Load(gs);
b = Sk4f::Load(bs);
a = Sk4f::Load(as);
return;
}
r = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 0) & 0xff],
sk_linear_from_srgb[(ptr[1] >> 0) & 0xff],
sk_linear_from_srgb[(ptr[2] >> 0) & 0xff],
sk_linear_from_srgb[(ptr[3] >> 0) & 0xff] };
g = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 8) & 0xff],
sk_linear_from_srgb[(ptr[1] >> 8) & 0xff],
sk_linear_from_srgb[(ptr[2] >> 8) & 0xff],
sk_linear_from_srgb[(ptr[3] >> 8) & 0xff] };
b = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 16) & 0xff],
sk_linear_from_srgb[(ptr[1] >> 16) & 0xff],
sk_linear_from_srgb[(ptr[2] >> 16) & 0xff],
sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };
a = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
}
SK_RASTER_STAGE(load_d_srgb) {
auto ptr = (const uint32_t*)ctx + x;
if (tail) {
float rs[] = {0,0,0,0},
gs[] = {0,0,0,0},
bs[] = {0,0,0,0},
as[] = {0,0,0,0};
for (size_t i = 0; i < (tail&3); i++) {
rs[i] = sk_linear_from_srgb[(ptr[i] >> 0) & 0xff];
gs[i] = sk_linear_from_srgb[(ptr[i] >> 8) & 0xff];
bs[i] = sk_linear_from_srgb[(ptr[i] >> 16) & 0xff];
as[i] = (ptr[i] >> 24) * (1/255.0f);
}
dr = Sk4f::Load(rs);
dg = Sk4f::Load(gs);
db = Sk4f::Load(bs);
da = Sk4f::Load(as);
return;
}
dr = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 0) & 0xff],
sk_linear_from_srgb[(ptr[1] >> 0) & 0xff],
sk_linear_from_srgb[(ptr[2] >> 0) & 0xff],
sk_linear_from_srgb[(ptr[3] >> 0) & 0xff] };
dg = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 8) & 0xff],
sk_linear_from_srgb[(ptr[1] >> 8) & 0xff],
sk_linear_from_srgb[(ptr[2] >> 8) & 0xff],
sk_linear_from_srgb[(ptr[3] >> 8) & 0xff] };
db = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 16) & 0xff],
sk_linear_from_srgb[(ptr[1] >> 16) & 0xff],
sk_linear_from_srgb[(ptr[2] >> 16) & 0xff],
sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };
da = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
}
SK_RASTER_STAGE(scale_u8) {
auto ptr = (const uint8_t*)ctx + x;
Sk4b cov;
if (tail) {
uint8_t cs[] = {0,0,0,0};
switch (tail&3) {
case 3: cs[2] = ptr[2];
case 2: cs[1] = ptr[1];
case 1: cs[0] = ptr[0];
}
cov = Sk4b::Load(cs);
} else {
cov = Sk4b::Load(ptr);
}
auto c = SkNx_cast<float>(cov) * (1/255.0f);
r *= c;
g *= c;
b *= c;
a *= c;
}
SK_RASTER_STAGE(srcover) {
auto A = 1.0f - a;
r += dr * A;
g += dg * A;
b += db * A;
a += da * A;
}
SK_RASTER_STAGE(store_srgb) {
auto ptr = (uint32_t*)ctx + x;
uint32_t* dst = nullptr;
uint32_t stack[4];
if (tail) {
dst = ptr;
ptr = stack;
}
( sk_linear_to_srgb(r)
| sk_linear_to_srgb(g) << 8
| sk_linear_to_srgb(b) << 16
| Sk4f_round(255.0f*a) << 24).store(ptr);
switch (tail&3) {
case 3: dst[2] = ptr[2];
case 2: dst[1] = ptr[1];
case 1: dst[0] = ptr[0];
}
}
class SkRasterPipelineBench : public Benchmark {
public:
SkRasterPipelineBench(bool fused) : fFused(fused) {}
bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
const char* onGetName() override { return fFused ? "SkRasterPipelineBench_fused"
: "SkRasterPipelineBench_pipeline"; }
const char* onGetName() override { return "SkRasterPipeline"; }
void onDraw(int loops, SkCanvas*) override {
while (loops --> 0) {
fFused ? this->runFused() : this->runPipeline();
SkRasterPipeline p;
p.append(SkOpts::load_s_srgb_body, SkOpts::load_s_srgb_tail, src);
p.append(SkOpts::scale_u8_body, SkOpts::scale_u8_tail, mask);
p.append(SkOpts::load_d_srgb_body, SkOpts::load_d_srgb_tail, dst);
p.append(SkOpts::srcover);
p.append(SkOpts::store_srgb_body, SkOpts::store_srgb_tail, dst);
p.run(N);
}
}
void runFused() {
Sk4f r,g,b,a, dr,dg,db,da;
size_t x = 0, n = N;
while (n >= 4) {
load_s_srgb(src , x,0, r,g,b,a, dr,dg,db,da);
scale_u8 (mask , x,0, r,g,b,a, dr,dg,db,da);
load_d_srgb(dst , x,0, r,g,b,a, dr,dg,db,da);
srcover (nullptr, x,0, r,g,b,a, dr,dg,db,da);
store_srgb (dst , x,0, r,g,b,a, dr,dg,db,da);
x += 4;
n -= 4;
}
if (n > 0) {
load_s_srgb(src , x,n, r,g,b,a, dr,dg,db,da);
scale_u8 (mask , x,n, r,g,b,a, dr,dg,db,da);
load_d_srgb(dst , x,n, r,g,b,a, dr,dg,db,da);
srcover (nullptr, x,n, r,g,b,a, dr,dg,db,da);
store_srgb (dst , x,n, r,g,b,a, dr,dg,db,da);
}
}
void runPipeline() {
SkRasterPipeline p;
p.append<load_s_srgb>(src);
p.append< scale_u8>(mask);
p.append<load_d_srgb>(dst);
p.append< srcover>();
p.last < store_srgb>(dst);
p.run(N);
}
bool fFused;
};
DEF_BENCH( return new SkRasterPipelineBench(true); )
DEF_BENCH( return new SkRasterPipelineBench(false); )
DEF_BENCH( return new SkRasterPipelineBench; )

@ -43,6 +43,7 @@
#include "SkChecksum_opts.h"
#include "SkColorCubeFilter_opts.h"
#include "SkMorphologyImageFilter_opts.h"
#include "SkRasterPipeline_opts.h"
#include "SkSwizzler_opts.h"
#include "SkTextureCompressor_opts.h"
#include "SkXfermode_opts.h"
@ -89,6 +90,38 @@ namespace SkOpts {
DEFINE_DEFAULT(hash_fn);
#undef DEFINE_DEFAULT
// Stages that are not sensitive to the tail parameter can be represented by one function.
#define DEFINE_DEFAULT(stage, kCallNext) \
decltype(stage) stage = body<SK_OPTS_NS::stage, kCallNext>
DEFINE_DEFAULT(srcover, true);
DEFINE_DEFAULT(constant_color, true);
DEFINE_DEFAULT(lerp_constant_float, true);
#undef DEFINE_DEFAULT
// Stages that are sensitive to the tail parameter need two versions, _body and _tail.
#define DEFINE_DEFAULT(stage, kCallNext) \
decltype(stage##_body) stage##_body = body<SK_OPTS_NS::stage, kCallNext>; \
decltype(stage##_tail) stage##_tail = tail<SK_OPTS_NS::stage, kCallNext>
DEFINE_DEFAULT(load_d_srgb, true);
DEFINE_DEFAULT(load_s_srgb, true);
DEFINE_DEFAULT( store_srgb, false);
DEFINE_DEFAULT(load_d_f16, true);
DEFINE_DEFAULT(load_s_f16, true);
DEFINE_DEFAULT( store_f16, false);
DEFINE_DEFAULT(load_d_565, true);
DEFINE_DEFAULT(load_s_565, true);
DEFINE_DEFAULT( store_565, false);
DEFINE_DEFAULT(scale_u8, true);
DEFINE_DEFAULT(lerp_u8, true);
DEFINE_DEFAULT(lerp_565, true);
#undef DEFINE_DEFAULT
// Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
void Init_ssse3();
void Init_sse41();

@ -8,6 +8,7 @@
#ifndef SkOpts_DEFINED
#define SkOpts_DEFINED
#include "SkRasterPipeline.h"
#include "SkTextureCompressor.h"
#include "SkTypes.h"
#include "SkXfermode.h"
@ -71,6 +72,25 @@ namespace SkOpts {
static inline uint32_t hash(const void* data, size_t bytes, uint32_t seed=0) {
return hash_fn(data, bytes, seed);
}
// Each of the SkRasterPipeline::Fn's lists its context pointer type in the comments.
extern SkRasterPipeline::Fn srcover, // (none)
constant_color, // const SkPM4f*
lerp_constant_float; // const float*, in [0,1]
extern SkRasterPipeline::Fn load_d_srgb_body, load_d_srgb_tail, // const uint32_t*
load_s_srgb_body, load_s_srgb_tail, // const uint32_t*
store_srgb_body, store_srgb_tail, // uint32_t*
load_d_f16_body, load_d_f16_tail, // const uint64_t*
load_s_f16_body, load_s_f16_tail, // const uint64_t*
store_f16_body, store_f16_tail, // uint64_t*
load_d_565_body, load_d_565_tail, // const uint16_t*
load_s_565_body, load_s_565_tail, // const uint16_t*
store_565_body, store_565_tail, // uint16_t*
scale_u8_body, scale_u8_tail, // const uint8_t*
lerp_u8_body, lerp_u8_tail, // const uint8_t*
lerp_565_body, lerp_565_tail; // const uint16_t*
}
#endif//SkOpts_DEFINED

@ -48,14 +48,6 @@
*
* Some stages that typically return are those that write a color to a destination pointer,
* but any stage can short-circuit the rest of the pipeline by returning instead of calling next().
*
* Most simple pipeline stages can use the SK_RASTER_STAGE macro to define a static EasyFn,
* which simplifies the user interface a bit:
* - the context pointer is available directly as the first parameter;
* - instead of manually calling a next() function, just modify registers in place.
*
* To add an EasyFn stage to the pipeline, call append<fn>() instead of append(&fn).
* It's a slight performance benefit to call last<fn>() for the last stage of a pipeline.
*/
// TODO: There may be a better place to stuff tail, e.g. in the bottom alignment bits of
@ -66,9 +58,6 @@ public:
struct Stage;
using Fn = void(SK_VECTORCALL *)(Stage*, size_t, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
Sk4f,Sk4f,Sk4f,Sk4f);
using EasyFn = void(void*, size_t, size_t, Sk4f&, Sk4f&, Sk4f&, Sk4f&,
Sk4f&, Sk4f&, Sk4f&, Sk4f&);
struct Stage {
template <typename T>
T ctx() { return static_cast<T>(fCtx); }
@ -99,17 +88,6 @@ public:
void append(Fn body, Fn tail, const void* ctx = nullptr);
void append(Fn fn, const void* ctx = nullptr) { this->append(fn, fn, ctx); }
// Version of append that can be used with static EasyFn (see SK_RASTER_STAGE).
template <EasyFn fn>
void append(const void* ctx = nullptr) {
this->append(Body<fn,true>, Tail<fn,true>, ctx);
}
// If this is the last stage of the pipeline, last() is a bit faster than append().
template <EasyFn fn>
void last(const void* ctx = nullptr) {
this->append(Body<fn,false>, Tail<fn,false>, ctx);
}
// Append all stages to this pipeline.
void extend(const SkRasterPipeline&);
@ -122,42 +100,10 @@ private:
// buggy pipeline can't walk off its own end.
static void SK_VECTORCALL JustReturn(Stage*, size_t, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
Sk4f,Sk4f,Sk4f,Sk4f);
template <EasyFn kernel, bool kCallNext>
static void SK_VECTORCALL Body(SkRasterPipeline::Stage* st, size_t x, size_t tail,
Sk4f r, Sk4f g, Sk4f b, Sk4f a,
Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
// Passing 0 lets the optimizer completely drop any "if (tail) {...}" code in kernel.
kernel(st->ctx<void*>(), x,0, r,g,b,a, dr,dg,db,da);
if (kCallNext) {
st->next(x,tail, r,g,b,a, dr,dg,db,da); // It's faster to pass tail here than 0.
}
}
template <EasyFn kernel, bool kCallNext>
static void SK_VECTORCALL Tail(SkRasterPipeline::Stage* st, size_t x, size_t tail,
Sk4f r, Sk4f g, Sk4f b, Sk4f a,
Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
#if defined(__clang__)
__builtin_assume(tail > 0); // This flourish lets Clang compile away any tail==0 code.
#endif
kernel(st->ctx<void*>(), x,tail, r,g,b,a, dr,dg,db,da);
if (kCallNext) {
st->next(x,tail, r,g,b,a, dr,dg,db,da);
}
}
Stages fBody,
fTail;
Fn fBodyStart = &JustReturn,
fTailStart = &JustReturn;
};
// These are always static, and we _really_ want them to inline.
// If you find yourself wanting a non-inline stage, write a SkRasterPipeline::Fn directly.
#define SK_RASTER_STAGE(name) \
static SK_ALWAYS_INLINE void name(void* ctx, size_t x, size_t tail, \
Sk4f& r, Sk4f& g, Sk4f& b, Sk4f& a, \
Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da)
#endif//SkRasterPipeline_DEFINED

@ -8,11 +8,10 @@
#include "SkBlitter.h"
#include "SkColor.h"
#include "SkColorFilter.h"
#include "SkHalf.h"
#include "SkOpts.h"
#include "SkPM4f.h"
#include "SkRasterPipeline.h"
#include "SkShader.h"
#include "SkSRGB.h"
#include "SkXfermode.h"
@ -57,200 +56,6 @@ SkBlitter* SkCreateRasterPipelineBlitter(const SkPixmap& dst,
return SkRasterPipelineBlitter::Create(dst, paint, alloc);
}
// Clamp colors into [0,1] premul (e.g. just before storing back to memory).
SK_RASTER_STAGE(clamp_01_premul) {
a = Sk4f::Max(a, 0.0f);
r = Sk4f::Max(r, 0.0f);
g = Sk4f::Max(g, 0.0f);
b = Sk4f::Max(b, 0.0f);
a = Sk4f::Min(a, 1.0f);
r = Sk4f::Min(r, a);
g = Sk4f::Min(g, a);
b = Sk4f::Min(b, a);
}
// The default shader produces a constant color (from the SkPaint).
SK_RASTER_STAGE(constant_color) {
auto color = (const SkPM4f*)ctx;
r = color->r();
g = color->g();
b = color->b();
a = color->a();
}
// The default transfer mode is srcover, s' = s + d*(1-sa).
SK_RASTER_STAGE(srcover) {
r += dr*(1.0f - a);
g += dg*(1.0f - a);
b += db*(1.0f - a);
a += da*(1.0f - a);
}
static Sk4f lerp(const Sk4f& from, const Sk4f& to, const Sk4f& cov) {
return from + (to-from)*cov;
}
// s' = d(1-c) + sc, for a constant c.
SK_RASTER_STAGE(lerp_constant_float) {
Sk4f c = *(const float*)ctx;
r = lerp(dr, r, c);
g = lerp(dg, g, c);
b = lerp(db, b, c);
a = lerp(da, a, c);
}
template <typename T>
static SkNx<4,T> load_tail(size_t tail, const T* src) {
if (tail) {
return SkNx<4,T>(src[0], (tail>1 ? src[1] : 0), (tail>2 ? src[2] : 0), 0);
}
return SkNx<4,T>::Load(src);
}
template <typename T>
static void store_tail(size_t tail, const SkNx<4,T>& v, T* dst) {
switch(tail) {
case 0: return v.store(dst);
case 3: dst[2] = v[2];
case 2: dst[1] = v[1];
case 1: dst[0] = v[0];
}
}
// s' = d(1-c) + sc for 8-bit c.
SK_RASTER_STAGE(lerp_a8) {
auto ptr = (const uint8_t*)ctx + x;
Sk4f c = SkNx_cast<float>(load_tail(tail, ptr)) * (1/255.0f);
r = lerp(dr, r, c);
g = lerp(dg, g, c);
b = lerp(db, b, c);
a = lerp(da, a, c);
}
static void from_565(const Sk4h& _565, Sk4f* r, Sk4f* g, Sk4f* b) {
Sk4i _32_bit = SkNx_cast<int>(_565);
*r = SkNx_cast<float>(_32_bit & SK_R16_MASK_IN_PLACE) * (1.0f / SK_R16_MASK_IN_PLACE);
*g = SkNx_cast<float>(_32_bit & SK_G16_MASK_IN_PLACE) * (1.0f / SK_G16_MASK_IN_PLACE);
*b = SkNx_cast<float>(_32_bit & SK_B16_MASK_IN_PLACE) * (1.0f / SK_B16_MASK_IN_PLACE);
}
static Sk4h to_565(const Sk4f& r, const Sk4f& g, const Sk4f& b) {
return SkNx_cast<uint16_t>( Sk4f_round(r * SK_R16_MASK) << SK_R16_SHIFT
| Sk4f_round(g * SK_G16_MASK) << SK_G16_SHIFT
| Sk4f_round(b * SK_B16_MASK) << SK_B16_SHIFT);
}
// s' = d(1-c) + sc for 565 c.
SK_RASTER_STAGE(lerp_lcd16) {
auto ptr = (const uint16_t*)ctx + x;
Sk4f cr, cg, cb;
from_565(load_tail(tail, ptr), &cr, &cg, &cb);
r = lerp(dr, r, cr);
g = lerp(dg, g, cg);
b = lerp(db, b, cb);
a = 1.0f;
}
SK_RASTER_STAGE(load_d_565) {
auto ptr = (const uint16_t*)ctx + x;
from_565(load_tail(tail, ptr), &dr,&dg,&db);
da = 1.0f;
}
SK_RASTER_STAGE(store_565) {
auto ptr = (uint16_t*)ctx + x;
store_tail(tail, to_565(r,g,b), ptr);
}
SK_RASTER_STAGE(load_d_f16) {
auto ptr = (const uint64_t*)ctx + x;
if (tail) {
auto p0 = SkHalfToFloat_finite_ftz(ptr[0]) ,
p1 = tail>1 ? SkHalfToFloat_finite_ftz(ptr[1]) : Sk4f{0},
p2 = tail>2 ? SkHalfToFloat_finite_ftz(ptr[2]) : Sk4f{0};
dr = { p0[0],p1[0],p2[0],0 };
dg = { p0[1],p1[1],p2[1],0 };
db = { p0[2],p1[2],p2[2],0 };
da = { p0[3],p1[3],p2[3],0 };
return;
}
Sk4h rh, gh, bh, ah;
Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
dr = SkHalfToFloat_finite_ftz(rh);
dg = SkHalfToFloat_finite_ftz(gh);
db = SkHalfToFloat_finite_ftz(bh);
da = SkHalfToFloat_finite_ftz(ah);
}
SK_RASTER_STAGE(store_f16) {
auto ptr = (uint64_t*)ctx + x;
switch (tail) {
case 0: return Sk4h_store4(ptr, SkFloatToHalf_finite_ftz(r), SkFloatToHalf_finite_ftz(g),
SkFloatToHalf_finite_ftz(b), SkFloatToHalf_finite_ftz(a));
case 3: SkFloatToHalf_finite_ftz({r[2], g[2], b[2], a[2]}).store(ptr+2);
case 2: SkFloatToHalf_finite_ftz({r[1], g[1], b[1], a[1]}).store(ptr+1);
case 1: SkFloatToHalf_finite_ftz({r[0], g[0], b[0], a[0]}).store(ptr+0);
}
}
// Load 8-bit SkPMColor-order sRGB.
SK_RASTER_STAGE(load_d_srgb) {
auto ptr = (const uint32_t*)ctx + x;
if (tail) {
float rs[] = {0,0,0,0},
gs[] = {0,0,0,0},
bs[] = {0,0,0,0},
as[] = {0,0,0,0};
for (size_t i = 0; i < tail; i++) {
rs[i] = sk_linear_from_srgb[(ptr[i] >> SK_R32_SHIFT) & 0xff];
gs[i] = sk_linear_from_srgb[(ptr[i] >> SK_G32_SHIFT) & 0xff];
bs[i] = sk_linear_from_srgb[(ptr[i] >> SK_B32_SHIFT) & 0xff];
as[i] = (1/255.0f) * (ptr[i] >> SK_A32_SHIFT) ;
}
dr = Sk4f::Load(rs);
dg = Sk4f::Load(gs);
db = Sk4f::Load(bs);
da = Sk4f::Load(as);
return;
}
dr = { sk_linear_from_srgb[(ptr[0] >> SK_R32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[1] >> SK_R32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[2] >> SK_R32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[3] >> SK_R32_SHIFT) & 0xff] };
dg = { sk_linear_from_srgb[(ptr[0] >> SK_G32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[1] >> SK_G32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[2] >> SK_G32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[3] >> SK_G32_SHIFT) & 0xff] };
db = { sk_linear_from_srgb[(ptr[0] >> SK_B32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[1] >> SK_B32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[2] >> SK_B32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[3] >> SK_B32_SHIFT) & 0xff] };
da = SkNx_cast<float>(Sk4u::Load(ptr) >> SK_A32_SHIFT) * (1/255.0f);
}
// Store 8-bit SkPMColor-order sRGB.
SK_RASTER_STAGE(store_srgb) {
auto ptr = (uint32_t*)ctx + x;
store_tail(tail, ( sk_linear_to_srgb_noclamp(r) << SK_R32_SHIFT
| sk_linear_to_srgb_noclamp(g) << SK_G32_SHIFT
| sk_linear_to_srgb_noclamp(b) << SK_B32_SHIFT
| Sk4f_round(255.0f * a) << SK_A32_SHIFT), (int*)ptr);
}
static bool supported(const SkImageInfo& info) {
switch (info.colorType()) {
case kN32_SkColorType: return info.gammaCloseToSRGB();
@ -297,10 +102,10 @@ SkBlitter* SkRasterPipelineBlitter::Create(const SkPixmap& dst,
color.premul());
if (!paint.getShader()) {
blitter->fShader.append<constant_color>(&blitter->fPaintColor);
blitter->fShader.append(SkOpts::constant_color, &blitter->fPaintColor);
}
if (!paint.getXfermode()) {
blitter->fXfermode.append<srcover>();
blitter->fXfermode.append(SkOpts::srcover);
}
return blitter;
@ -312,41 +117,33 @@ void SkRasterPipelineBlitter::append_load_d(SkRasterPipeline* p, const void* dst
switch (fDst.info().colorType()) {
case kN32_SkColorType:
if (fDst.info().gammaCloseToSRGB()) {
p->append<load_d_srgb>(dst);
p->append(SkOpts::load_d_srgb_body, SkOpts::load_d_srgb_tail, dst);
}
break;
case kRGBA_F16_SkColorType:
p->append<load_d_f16>(dst);
p->append(SkOpts::load_d_f16_body, SkOpts::load_d_f16_tail, dst);
break;
case kRGB_565_SkColorType:
p->append<load_d_565>(dst);
p->append(SkOpts::load_d_565_body, SkOpts::load_d_565_tail, dst);
break;
default: break;
}
}
template <SkRasterPipeline::EasyFn fn>
static void clamp_01_premul_then(void* ctx, size_t x, size_t tail,
Sk4f& r, Sk4f& g, Sk4f& b, Sk4f& a,
Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da) {
clamp_01_premul(nullptr, x,tail, r,g,b,a, dr,dg,db,da);
fn( ctx, x,tail, r,g,b,a, dr,dg,db,da);
}
void SkRasterPipelineBlitter::append_store(SkRasterPipeline* p, void* dst) const {
SkASSERT(supported(fDst.info()));
switch (fDst.info().colorType()) {
case kN32_SkColorType:
if (fDst.info().gammaCloseToSRGB()) {
p->last<clamp_01_premul_then<store_srgb>>(dst);
p->append(SkOpts::store_srgb_body, SkOpts::store_srgb_tail, dst);
}
break;
case kRGBA_F16_SkColorType:
p->last<clamp_01_premul_then<store_f16>>(dst);
p->append(SkOpts::store_f16_body, SkOpts::store_f16_tail, dst);
break;
case kRGB_565_SkColorType:
p->last<clamp_01_premul_then<store_565>>(dst);
p->append(SkOpts::store_565_body, SkOpts::store_565_tail, dst);
break;
default: break;
}
@ -374,7 +171,7 @@ void SkRasterPipelineBlitter::blitAntiH(int x, int y, const SkAlpha aa[], const
p.extend(fColorFilter);
this->append_load_d(&p, dst);
p.extend(fXfermode);
p.append<lerp_constant_float>(&coverage);
p.append(SkOpts::lerp_constant_float, &coverage);
this->append_store(&p, dst);
for (int16_t run = *runs; run > 0; run = *runs) {
@ -404,10 +201,10 @@ void SkRasterPipelineBlitter::blitMask(const SkMask& mask, const SkIRect& clip)
p.extend(fXfermode);
switch (mask.fFormat) {
case SkMask::kA8_Format:
p.append<lerp_a8>(mask.getAddr8(x,y)-x);
p.append(SkOpts::lerp_u8_body, SkOpts::lerp_u8_tail, mask.getAddr8(x,y)-x);
break;
case SkMask::kLCD16_Format:
p.append<lerp_lcd16>(mask.getAddrLCD16(x,y)-x);
p.append(SkOpts::lerp_565_body, SkOpts::lerp_565_tail, mask.getAddrLCD16(x,y)-x);
break;
default: break;
}

@ -11,6 +11,7 @@
#include "SkBlurImageFilter_opts.h"
#include "SkBlitRow_opts.h"
#include "SkBlend_opts.h"
#include "SkRasterPipeline_opts.h"
namespace SkOpts {
void Init_sse41() {
@ -19,5 +20,35 @@ namespace SkOpts {
box_blur_yx = sse41::box_blur_yx;
srcover_srgb_srgb = sse41::srcover_srgb_srgb;
blit_row_s32a_opaque = sse41::blit_row_s32a_opaque;
#define STAGE(stage, kCallNext) \
stage = body<SK_OPTS_NS::stage, kCallNext>
STAGE(srcover, true);
STAGE(constant_color, true);
STAGE(lerp_constant_float, true);
#undef STAGE
#define STAGE(stage, kCallNext) \
stage##_body = body<SK_OPTS_NS::stage, kCallNext>; \
stage##_tail = tail<SK_OPTS_NS::stage, kCallNext>
STAGE(load_d_srgb, true);
STAGE(load_s_srgb, true);
STAGE( store_srgb, false);
STAGE(load_d_f16, true);
STAGE(load_s_f16, true);
STAGE( store_f16, false);
STAGE(load_d_565, true);
STAGE(load_s_565, true);
STAGE( store_565, false);
STAGE(scale_u8, true);
STAGE(lerp_u8, true);
STAGE(lerp_565, true);
#undef STAGE
}
}

@ -0,0 +1,333 @@
/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkRasterPipeline_opts_DEFINED
#define SkRasterPipeline_opts_DEFINED
#include "SkHalf.h"
#include "SkPM4f.h"
#include "SkRasterPipeline.h"
#include "SkSRGB.h"
using Kernel_Sk4f = void(void*, size_t, size_t, Sk4f&, Sk4f&, Sk4f&, Sk4f&,
Sk4f&, Sk4f&, Sk4f&, Sk4f&);
// These are always static, and we _really_ want them to inline.
// If you find yourself wanting a non-inline stage, write a SkRasterPipeline::Fn directly.
#define KERNEL_Sk4f(name) \
static SK_ALWAYS_INLINE void name(void* ctx, size_t x, size_t tail, \
Sk4f& r, Sk4f& g, Sk4f& b, Sk4f& a, \
Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da)
template <Kernel_Sk4f kernel, bool kCallNext>
static inline void SK_VECTORCALL body(SkRasterPipeline::Stage* st, size_t x, size_t t,
Sk4f r, Sk4f g, Sk4f b, Sk4f a,
Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
// Passing 0 lets the optimizer completely drop any "if (tail) {...}" code in kernel.
kernel(st->ctx<void*>(), x,0, r,g,b,a, dr,dg,db,da);
if (kCallNext) {
st->next(x,t, r,g,b,a, dr,dg,db,da); // It's faster to pass t here than 0.
}
}
template <Kernel_Sk4f kernel, bool kCallNext>
static inline void SK_VECTORCALL tail(SkRasterPipeline::Stage* st, size_t x, size_t t,
Sk4f r, Sk4f g, Sk4f b, Sk4f a,
Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
#if defined(__clang__)
__builtin_assume(t > 0); // This flourish lets Clang compile away any tail==0 code.
#endif
kernel(st->ctx<void*>(), x,t, r,g,b,a, dr,dg,db,da);
if (kCallNext) {
st->next(x,t, r,g,b,a, dr,dg,db,da);
}
}
namespace SK_OPTS_NS {
// Clamp colors into [0,1] premul (e.g. just before storing back to memory).
static void clamp_01_premul(Sk4f& r, Sk4f& g, Sk4f& b, Sk4f& a) {
a = Sk4f::Max(a, 0.0f);
r = Sk4f::Max(r, 0.0f);
g = Sk4f::Max(g, 0.0f);
b = Sk4f::Max(b, 0.0f);
a = Sk4f::Min(a, 1.0f);
r = Sk4f::Min(r, a);
g = Sk4f::Min(g, a);
b = Sk4f::Min(b, a);
}
static Sk4f lerp(const Sk4f& from, const Sk4f& to, const Sk4f& cov) {
return from + (to-from)*cov;
}
template <typename T>
static SkNx<4,T> load_tail(size_t tail, const T* src) {
if (tail) {
return SkNx<4,T>(src[0], (tail>1 ? src[1] : 0), (tail>2 ? src[2] : 0), 0);
}
return SkNx<4,T>::Load(src);
}
template <typename T>
static void store_tail(size_t tail, const SkNx<4,T>& v, T* dst) {
switch(tail) {
case 0: return v.store(dst);
case 3: dst[2] = v[2];
case 2: dst[1] = v[1];
case 1: dst[0] = v[0];
}
}
static void from_565(const Sk4h& _565, Sk4f* r, Sk4f* g, Sk4f* b) {
Sk4i _32_bit = SkNx_cast<int>(_565);
*r = SkNx_cast<float>(_32_bit & SK_R16_MASK_IN_PLACE) * (1.0f / SK_R16_MASK_IN_PLACE);
*g = SkNx_cast<float>(_32_bit & SK_G16_MASK_IN_PLACE) * (1.0f / SK_G16_MASK_IN_PLACE);
*b = SkNx_cast<float>(_32_bit & SK_B16_MASK_IN_PLACE) * (1.0f / SK_B16_MASK_IN_PLACE);
}
static Sk4h to_565(const Sk4f& r, const Sk4f& g, const Sk4f& b) {
return SkNx_cast<uint16_t>( Sk4f_round(r * SK_R16_MASK) << SK_R16_SHIFT
| Sk4f_round(g * SK_G16_MASK) << SK_G16_SHIFT
| Sk4f_round(b * SK_B16_MASK) << SK_B16_SHIFT);
}
// The default shader produces a constant color (from the SkPaint).
KERNEL_Sk4f(constant_color) {
auto color = (const SkPM4f*)ctx;
r = color->r();
g = color->g();
b = color->b();
a = color->a();
}
// The default transfer mode is srcover, s' = s + d*(1-sa).
KERNEL_Sk4f(srcover) {
r += dr*(1.0f - a);
g += dg*(1.0f - a);
b += db*(1.0f - a);
a += da*(1.0f - a);
}
// s' = d(1-c) + sc, for a constant c.
KERNEL_Sk4f(lerp_constant_float) {
Sk4f c = *(const float*)ctx;
r = lerp(dr, r, c);
g = lerp(dg, g, c);
b = lerp(db, b, c);
a = lerp(da, a, c);
}
// s' = sc for 8-bit c.
KERNEL_Sk4f(scale_u8) {
auto ptr = (const uint8_t*)ctx + x;
Sk4f c = SkNx_cast<float>(load_tail(tail, ptr)) * (1/255.0f);
r = r*c;
g = g*c;
b = b*c;
a = a*c;
}
// s' = d(1-c) + sc for 8-bit c.
KERNEL_Sk4f(lerp_u8) {
auto ptr = (const uint8_t*)ctx + x;
Sk4f c = SkNx_cast<float>(load_tail(tail, ptr)) * (1/255.0f);
r = lerp(dr, r, c);
g = lerp(dg, g, c);
b = lerp(db, b, c);
a = lerp(da, a, c);
}
// s' = d(1-c) + sc for 565 c.
KERNEL_Sk4f(lerp_565) {
auto ptr = (const uint16_t*)ctx + x;
Sk4f cr, cg, cb;
from_565(load_tail(tail, ptr), &cr, &cg, &cb);
r = lerp(dr, r, cr);
g = lerp(dg, g, cg);
b = lerp(db, b, cb);
a = 1.0f;
}
KERNEL_Sk4f(load_d_565) {
auto ptr = (const uint16_t*)ctx + x;
from_565(load_tail(tail, ptr), &dr,&dg,&db);
da = 1.0f;
}
KERNEL_Sk4f(load_s_565) {
auto ptr = (const uint16_t*)ctx + x;
from_565(load_tail(tail, ptr), &r,&g,&b);
a = 1.0f;
}
KERNEL_Sk4f(store_565) {
clamp_01_premul(r,g,b,a);
auto ptr = (uint16_t*)ctx + x;
store_tail(tail, to_565(r,g,b), ptr);
}
KERNEL_Sk4f(load_d_f16) {
auto ptr = (const uint64_t*)ctx + x;
if (tail) {
auto p0 = SkHalfToFloat_finite_ftz(ptr[0]) ,
p1 = tail>1 ? SkHalfToFloat_finite_ftz(ptr[1]) : Sk4f{0},
p2 = tail>2 ? SkHalfToFloat_finite_ftz(ptr[2]) : Sk4f{0};
dr = { p0[0],p1[0],p2[0],0 };
dg = { p0[1],p1[1],p2[1],0 };
db = { p0[2],p1[2],p2[2],0 };
da = { p0[3],p1[3],p2[3],0 };
return;
}
Sk4h rh, gh, bh, ah;
Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
dr = SkHalfToFloat_finite_ftz(rh);
dg = SkHalfToFloat_finite_ftz(gh);
db = SkHalfToFloat_finite_ftz(bh);
da = SkHalfToFloat_finite_ftz(ah);
}
KERNEL_Sk4f(load_s_f16) {
auto ptr = (const uint64_t*)ctx + x;
if (tail) {
auto p0 = SkHalfToFloat_finite_ftz(ptr[0]) ,
p1 = tail>1 ? SkHalfToFloat_finite_ftz(ptr[1]) : Sk4f{0},
p2 = tail>2 ? SkHalfToFloat_finite_ftz(ptr[2]) : Sk4f{0};
r = { p0[0],p1[0],p2[0],0 };
g = { p0[1],p1[1],p2[1],0 };
b = { p0[2],p1[2],p2[2],0 };
a = { p0[3],p1[3],p2[3],0 };
return;
}
Sk4h rh, gh, bh, ah;
Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
r = SkHalfToFloat_finite_ftz(rh);
g = SkHalfToFloat_finite_ftz(gh);
b = SkHalfToFloat_finite_ftz(bh);
a = SkHalfToFloat_finite_ftz(ah);
}
KERNEL_Sk4f(store_f16) {
clamp_01_premul(r,g,b,a);
auto ptr = (uint64_t*)ctx + x;
switch (tail) {
case 0: return Sk4h_store4(ptr, SkFloatToHalf_finite_ftz(r),
SkFloatToHalf_finite_ftz(g),
SkFloatToHalf_finite_ftz(b),
SkFloatToHalf_finite_ftz(a));
case 3: SkFloatToHalf_finite_ftz({r[2], g[2], b[2], a[2]}).store(ptr+2);
case 2: SkFloatToHalf_finite_ftz({r[1], g[1], b[1], a[1]}).store(ptr+1);
case 1: SkFloatToHalf_finite_ftz({r[0], g[0], b[0], a[0]}).store(ptr+0);
}
}
// Load 8-bit SkPMColor-order sRGB.
KERNEL_Sk4f(load_d_srgb) {
auto ptr = (const uint32_t*)ctx + x;
if (tail) {
float rs[] = {0,0,0,0},
gs[] = {0,0,0,0},
bs[] = {0,0,0,0},
as[] = {0,0,0,0};
for (size_t i = 0; i < tail; i++) {
rs[i] = sk_linear_from_srgb[(ptr[i] >> SK_R32_SHIFT) & 0xff];
gs[i] = sk_linear_from_srgb[(ptr[i] >> SK_G32_SHIFT) & 0xff];
bs[i] = sk_linear_from_srgb[(ptr[i] >> SK_B32_SHIFT) & 0xff];
as[i] = (1/255.0f) * (ptr[i] >> SK_A32_SHIFT) ;
}
dr = Sk4f::Load(rs);
dg = Sk4f::Load(gs);
db = Sk4f::Load(bs);
da = Sk4f::Load(as);
return;
}
dr = { sk_linear_from_srgb[(ptr[0] >> SK_R32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[1] >> SK_R32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[2] >> SK_R32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[3] >> SK_R32_SHIFT) & 0xff] };
dg = { sk_linear_from_srgb[(ptr[0] >> SK_G32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[1] >> SK_G32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[2] >> SK_G32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[3] >> SK_G32_SHIFT) & 0xff] };
db = { sk_linear_from_srgb[(ptr[0] >> SK_B32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[1] >> SK_B32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[2] >> SK_B32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[3] >> SK_B32_SHIFT) & 0xff] };
da = SkNx_cast<float>(Sk4u::Load(ptr) >> SK_A32_SHIFT) * (1/255.0f);
}
KERNEL_Sk4f(load_s_srgb) {
auto ptr = (const uint32_t*)ctx + x;
if (tail) {
float rs[] = {0,0,0,0},
gs[] = {0,0,0,0},
bs[] = {0,0,0,0},
as[] = {0,0,0,0};
for (size_t i = 0; i < tail; i++) {
rs[i] = sk_linear_from_srgb[(ptr[i] >> SK_R32_SHIFT) & 0xff];
gs[i] = sk_linear_from_srgb[(ptr[i] >> SK_G32_SHIFT) & 0xff];
bs[i] = sk_linear_from_srgb[(ptr[i] >> SK_B32_SHIFT) & 0xff];
as[i] = (1/255.0f) * (ptr[i] >> SK_A32_SHIFT) ;
}
r = Sk4f::Load(rs);
g = Sk4f::Load(gs);
b = Sk4f::Load(bs);
a = Sk4f::Load(as);
return;
}
r = { sk_linear_from_srgb[(ptr[0] >> SK_R32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[1] >> SK_R32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[2] >> SK_R32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[3] >> SK_R32_SHIFT) & 0xff] };
g = { sk_linear_from_srgb[(ptr[0] >> SK_G32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[1] >> SK_G32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[2] >> SK_G32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[3] >> SK_G32_SHIFT) & 0xff] };
b = { sk_linear_from_srgb[(ptr[0] >> SK_B32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[1] >> SK_B32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[2] >> SK_B32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[3] >> SK_B32_SHIFT) & 0xff] };
a = SkNx_cast<float>(Sk4u::Load(ptr) >> SK_A32_SHIFT) * (1/255.0f);
}
KERNEL_Sk4f(store_srgb) {
clamp_01_premul(r,g,b,a);
auto ptr = (uint32_t*)ctx + x;
store_tail(tail, ( sk_linear_to_srgb_noclamp(r) << SK_R32_SHIFT
| sk_linear_to_srgb_noclamp(g) << SK_G32_SHIFT
| sk_linear_to_srgb_noclamp(b) << SK_B32_SHIFT
| Sk4f_round(255.0f * a) << SK_A32_SHIFT), (int*)ptr);
}
}
#endif//SkRasterPipeline_opts_DEFINED

@ -8,25 +8,33 @@
#include "Test.h"
#include "SkRasterPipeline.h"
SK_RASTER_STAGE(load) {
auto ptr = (const float*)ctx + x;
static void SK_VECTORCALL load(SkRasterPipeline::Stage* st, size_t x, size_t tail,
Sk4f r, Sk4f g, Sk4f b, Sk4f a,
Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
auto ptr = st->ctx<const float*>() + x;
switch(tail&3) {
case 0: a = Sk4f{ptr[3]};
case 3: b = Sk4f{ptr[2]};
case 2: g = Sk4f{ptr[1]};
case 1: r = Sk4f{ptr[0]};
}
st->next(x,tail, r,g,b,a, dr,dg,db,da);
}
SK_RASTER_STAGE(square) {
static void SK_VECTORCALL square(SkRasterPipeline::Stage* st, size_t x, size_t tail,
Sk4f r, Sk4f g, Sk4f b, Sk4f a,
Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
r *= r;
g *= g;
b *= b;
a *= a;
st->next(x,tail, r,g,b,a, dr,dg,db,da);
}
SK_RASTER_STAGE(store) {
auto ptr = (float*)ctx + x;
static void SK_VECTORCALL store(SkRasterPipeline::Stage* st, size_t x, size_t tail,
Sk4f r, Sk4f g, Sk4f b, Sk4f a,
Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
auto ptr = st->ctx<float*>() + x;
switch (tail&3) {
case 0: ptr[3] = a[0];
case 3: ptr[2] = b[0];
@ -41,6 +49,8 @@ DEF_TEST(SkRasterPipeline, r) {
// - context pointers (load,store)
// - stages sensitive to the number of pixels (load,store)
// - stages insensitive to the number of pixels (square)
// - stages that chain to the next stage (load,square)
// - stages that terminate the pipeline (store)
//
// This pipeline loads up some values, squares them, then writes them back to memory.
@ -48,9 +58,9 @@ DEF_TEST(SkRasterPipeline, r) {
float dst_vals[] = { 0,0,0,0,0 };
SkRasterPipeline p;
p.append<load>(src_vals);
p.append<square>();
p.append<store>(dst_vals);
p.append(load, src_vals);
p.append(square);
p.append(store, dst_vals);
p.run(5);
@ -71,6 +81,6 @@ DEF_TEST(SkRasterPipeline_nonsense, r) {
// No asserts... just a test that this is safe to run and terminates.
// square() always calls st->next(); this makes sure we've always got something there to call.
SkRasterPipeline p;
p.append<square>();
p.append(square);
p.run(20);
}