diff --git a/bench/SkRasterPipelineBench.cpp b/bench/SkRasterPipelineBench.cpp
index aa3e355e45..17a93da549 100644
--- a/bench/SkRasterPipelineBench.cpp
+++ b/bench/SkRasterPipelineBench.cpp
@@ -6,8 +6,8 @@
  */
 
 #include "Benchmark.h"
+#include "SkOpts.h"
 #include "SkRasterPipeline.h"
-#include "SkSRGB.h"
 
 static const int N = 1023;
 
@@ -22,186 +22,21 @@ static uint8_t mask[N];
 //   - src = srcover(dst, src)
 //   - store src back as srgb
 
-SK_RASTER_STAGE(load_s_srgb) {
-    auto ptr = (const uint32_t*)ctx + x;
-
-    if (tail) {
-        float rs[] = {0,0,0,0},
-              gs[] = {0,0,0,0},
-              bs[] = {0,0,0,0},
-              as[] = {0,0,0,0};
-        for (size_t i = 0; i < (tail&3); i++) {
-            rs[i] = sk_linear_from_srgb[(ptr[i] >>  0) & 0xff];
-            gs[i] = sk_linear_from_srgb[(ptr[i] >>  8) & 0xff];
-            bs[i] = sk_linear_from_srgb[(ptr[i] >> 16) & 0xff];
-            as[i] = (ptr[i] >> 24) * (1/255.0f);
-        }
-        r = Sk4f::Load(rs);
-        g = Sk4f::Load(gs);
-        b = Sk4f::Load(bs);
-        a = Sk4f::Load(as);
-        return;
-    }
-
-    r = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  0) & 0xff],
-              sk_linear_from_srgb[(ptr[1] >>  0) & 0xff],
-              sk_linear_from_srgb[(ptr[2] >>  0) & 0xff],
-              sk_linear_from_srgb[(ptr[3] >>  0) & 0xff] };
-
-    g = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  8) & 0xff],
-              sk_linear_from_srgb[(ptr[1] >>  8) & 0xff],
-              sk_linear_from_srgb[(ptr[2] >>  8) & 0xff],
-              sk_linear_from_srgb[(ptr[3] >>  8) & 0xff] };
-
-    b = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 16) & 0xff],
-              sk_linear_from_srgb[(ptr[1] >> 16) & 0xff],
-              sk_linear_from_srgb[(ptr[2] >> 16) & 0xff],
-              sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };
-    a = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
-}
-
-SK_RASTER_STAGE(load_d_srgb) {
-    auto ptr = (const uint32_t*)ctx + x;
-
-    if (tail) {
-        float rs[] = {0,0,0,0},
-              gs[] = {0,0,0,0},
-              bs[] = {0,0,0,0},
-              as[] = {0,0,0,0};
-        for (size_t i = 0; i < (tail&3); i++) {
-            rs[i] = sk_linear_from_srgb[(ptr[i] >>  0) & 0xff];
-            gs[i] = sk_linear_from_srgb[(ptr[i] >>  8) & 0xff];
-            bs[i] = sk_linear_from_srgb[(ptr[i] >> 16) & 0xff];
-            as[i] = (ptr[i] >> 24) * (1/255.0f);
-        }
-        dr = Sk4f::Load(rs);
-        dg = Sk4f::Load(gs);
-        db = Sk4f::Load(bs);
-        da = Sk4f::Load(as);
-        return;
-    }
-
-    dr = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  0) & 0xff],
-               sk_linear_from_srgb[(ptr[1] >>  0) & 0xff],
-               sk_linear_from_srgb[(ptr[2] >>  0) & 0xff],
-               sk_linear_from_srgb[(ptr[3] >>  0) & 0xff] };
-
-    dg = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  8) & 0xff],
-               sk_linear_from_srgb[(ptr[1] >>  8) & 0xff],
-               sk_linear_from_srgb[(ptr[2] >>  8) & 0xff],
-               sk_linear_from_srgb[(ptr[3] >>  8) & 0xff] };
-
-    db = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 16) & 0xff],
-               sk_linear_from_srgb[(ptr[1] >> 16) & 0xff],
-               sk_linear_from_srgb[(ptr[2] >> 16) & 0xff],
-               sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };
-
-    da = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
-}
-
-SK_RASTER_STAGE(scale_u8) {
-    auto ptr = (const uint8_t*)ctx + x;
-
-    Sk4b cov;
-
-    if (tail) {
-        uint8_t cs[] = {0,0,0,0};
-        switch (tail&3) {
-            case 3: cs[2] = ptr[2];
-            case 2: cs[1] = ptr[1];
-            case 1: cs[0] = ptr[0];
-        }
-        cov = Sk4b::Load(cs);
-    } else {
-        cov = Sk4b::Load(ptr);
-    }
-
-    auto c = SkNx_cast<float>(cov) * (1/255.0f);
-    r *= c;
-    g *= c;
-    b *= c;
-    a *= c;
-}
-
-SK_RASTER_STAGE(srcover) {
-    auto A = 1.0f - a;
-    r += dr * A;
-    g += dg * A;
-    b += db * A;
-    a += da * A;
-}
-
-SK_RASTER_STAGE(store_srgb) {
-    auto ptr = (uint32_t*)ctx + x;
-
-    uint32_t* dst = nullptr;
-    uint32_t stack[4];
-
-    if (tail) {
-        dst = ptr;
-        ptr = stack;
-    }
-
-    ( sk_linear_to_srgb(r)
-    | sk_linear_to_srgb(g) <<  8
-    | sk_linear_to_srgb(b) << 16
-    | Sk4f_round(255.0f*a) << 24).store(ptr);
-
-    switch (tail&3) {
-        case 3: dst[2] = ptr[2];
-        case 2: dst[1] = ptr[1];
-        case 1: dst[0] = ptr[0];
-    }
-}
-
 class SkRasterPipelineBench : public Benchmark {
 public:
-    SkRasterPipelineBench(bool fused) : fFused(fused) {}
-
     bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
-    const char* onGetName() override { return fFused ? "SkRasterPipelineBench_fused"
-                                                     : "SkRasterPipelineBench_pipeline"; }
+    const char* onGetName() override { return "SkRasterPipeline"; }
 
     void onDraw(int loops, SkCanvas*) override {
        while (loops --> 0) {
-            fFused ? this->runFused() : this->runPipeline();
+            SkRasterPipeline p;
+            p.append(SkOpts::load_s_srgb_body, SkOpts::load_s_srgb_tail, src);
+            p.append(SkOpts::scale_u8_body,    SkOpts::scale_u8_tail,    mask);
+            p.append(SkOpts::load_d_srgb_body, SkOpts::load_d_srgb_tail, dst);
+            p.append(SkOpts::srcover);
+            p.append(SkOpts::store_srgb_body,  SkOpts::store_srgb_tail,  dst);
+            p.run(N);
         }
     }
-
-    void runFused() {
-        Sk4f r,g,b,a, dr,dg,db,da;
-        size_t x = 0, n = N;
-        while (n >= 4) {
-            load_s_srgb(src    , x,0, r,g,b,a, dr,dg,db,da);
-            scale_u8   (mask   , x,0, r,g,b,a, dr,dg,da,da);
-            load_d_srgb(dst    , x,0, r,g,b,a, dr,dg,da,da);
-            srcover    (nullptr, x,0, r,g,b,a, dr,dg,da,da);
-            store_srgb (dst    , x,0, r,g,b,a, dr,dg,da,da);
-
-            x += 4;
-            n -= 4;
-        }
-        if (n > 0) {
-            load_s_srgb(src    , x,n, r,g,b,a, dr,dg,db,da);
-            scale_u8   (mask   , x,n, r,g,b,a, dr,dg,da,da);
-            load_d_srgb(dst    , x,n, r,g,b,a, dr,dg,da,da);
-            srcover    (nullptr, x,n, r,g,b,a, dr,dg,da,da);
-            store_srgb (dst    , x,n, r,g,b,a, dr,dg,da,da);
-        }
-    }
-
-    void runPipeline() {
-        SkRasterPipeline p;
-        p.append<load_s_srgb>(src);
-        p.append<   scale_u8>(mask);
-        p.append<load_d_srgb>(dst);
-        p.append<    srcover>();
-        p.last  < store_srgb>(dst);
-        p.run(N);
-    }
-
-    bool fFused;
 };
-
-DEF_BENCH( return new SkRasterPipelineBench(true); )
-DEF_BENCH( return new SkRasterPipelineBench(false); )
+DEF_BENCH( return new SkRasterPipelineBench; )
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 9ecad15416..1f686ff56b 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -43,6 +43,7 @@
 #include "SkChecksum_opts.h"
 #include "SkColorCubeFilter_opts.h"
 #include "SkMorphologyImageFilter_opts.h"
+#include "SkRasterPipeline_opts.h"
 #include "SkSwizzler_opts.h"
 #include "SkTextureCompressor_opts.h"
 #include "SkXfermode_opts.h"
@@ -89,6 +90,38 @@ namespace SkOpts {
     DEFINE_DEFAULT(hash_fn);
 #undef DEFINE_DEFAULT
 
+// Stages that are not sensitive to the tail parameter can be represented by one function.
+#define DEFINE_DEFAULT(stage, kCallNext) \
+    decltype(stage) stage = body<SK_OPTS_NS::stage, kCallNext>
+
+    DEFINE_DEFAULT(srcover,             true);
+    DEFINE_DEFAULT(constant_color,      true);
+    DEFINE_DEFAULT(lerp_constant_float, true);
+#undef DEFINE_DEFAULT
+
+// Stages that are sensitive to the tail parameter need two versions, _body and _tail.
+#define DEFINE_DEFAULT(stage, kCallNext)                                      \
+    decltype(stage##_body) stage##_body = body<SK_OPTS_NS::stage, kCallNext>; \
+    decltype(stage##_tail) stage##_tail = tail<SK_OPTS_NS::stage, kCallNext>
+
+    DEFINE_DEFAULT(load_d_srgb, true);
+    DEFINE_DEFAULT(load_s_srgb, true);
+    DEFINE_DEFAULT( store_srgb, false);
+
+    DEFINE_DEFAULT(load_d_f16, true);
+    DEFINE_DEFAULT(load_s_f16, true);
+    DEFINE_DEFAULT( store_f16, false);
+
+    DEFINE_DEFAULT(load_d_565, true);
+    DEFINE_DEFAULT(load_s_565, true);
+    DEFINE_DEFAULT( store_565, false);
+
+    DEFINE_DEFAULT(scale_u8, true);
+
+    DEFINE_DEFAULT(lerp_u8,  true);
+    DEFINE_DEFAULT(lerp_565, true);
+#undef DEFINE_DEFAULT
+
     // Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
     void Init_ssse3();
     void Init_sse41();
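[Note — a worked expansion for reference, not part of the CL. These DEFINE_DEFAULT macros bind the portable kernels from SkRasterPipeline_opts.h (below) through the body<kernel, kCallNext> / tail<kernel, kCallNext> trampolines, so e.g. DEFINE_DEFAULT(scale_u8, true) expands to roughly:

    decltype(scale_u8_body) scale_u8_body = body<SK_OPTS_NS::scale_u8, true>;
    decltype(scale_u8_tail) scale_u8_tail = tail<SK_OPTS_NS::scale_u8, true>;

These defaults are what pipelines use until an Init_*() below rebinds the same pointers to CPU-specific builds of the same kernels.]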
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index 44e337d950..c310a79aa0 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -8,6 +8,7 @@
 #ifndef SkOpts_DEFINED
 #define SkOpts_DEFINED
 
+#include "SkRasterPipeline.h"
 #include "SkTextureCompressor.h"
 #include "SkTypes.h"
 #include "SkXfermode.h"
@@ -71,6 +72,25 @@ namespace SkOpts {
     static inline uint32_t hash(const void* data, size_t bytes, uint32_t seed=0) {
         return hash_fn(data, bytes, seed);
     }
+
+    // Each of the SkRasterPipeline::Fn's lists its context pointer type in the comments.
+
+    extern SkRasterPipeline::Fn srcover,              // (none)
+                                constant_color,       // const SkPM4f*
+                                lerp_constant_float;  // const float*, in [0,1]
+
+    extern SkRasterPipeline::Fn load_d_srgb_body, load_d_srgb_tail,  // const uint32_t*
+                                load_s_srgb_body, load_s_srgb_tail,  // const uint32_t*
+                                store_srgb_body,  store_srgb_tail,   // uint32_t*
+                                load_d_f16_body,  load_d_f16_tail,   // const uint64_t*
+                                load_s_f16_body,  load_s_f16_tail,   // const uint64_t*
+                                store_f16_body,   store_f16_tail,    // uint64_t*
+                                load_d_565_body,  load_d_565_tail,   // const uint16_t*
+                                load_s_565_body,  load_s_565_tail,   // const uint16_t*
+                                store_565_body,   store_565_tail,    // uint16_t*
+                                scale_u8_body,    scale_u8_tail,     // const uint8_t*
+                                lerp_u8_body,     lerp_u8_tail,      // const uint8_t*
+                                lerp_565_body,    lerp_565_tail;     // const uint16_t*
 }
 
 #endif//SkOpts_DEFINED
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 7e934f1731..9f4dcb34da 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -48,14 +48,6 @@
  *
  * Some stages that typically return are those that write a color to a destination pointer,
  * but any stage can short-circuit the rest of the pipeline by returning instead of calling next().
- *
- * Most simple pipeline stages can use the SK_RASTER_STAGE macro to define a static EasyFn,
- * which simplifies the user interface a bit:
- *   - the context pointer is available directly as the first parameter;
- *   - instead of manually calling a next() function, just modify registers in place.
- *
- * To add an EasyFn stage to the pipeline, call append<fn>() instead of append(&fn).
- * It's a slight performance benefit to call last<fn>() for the last stage of a pipeline.
  */
 
 // TODO: There may be a better place to stuff tail, e.g. in the bottom alignment bits of
@@ -66,9 +58,6 @@ public:
     struct Stage;
     using Fn = void(SK_VECTORCALL *)(Stage*, size_t, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
                                                              Sk4f,Sk4f,Sk4f,Sk4f);
-    using EasyFn = void(void*, size_t, size_t, Sk4f&, Sk4f&, Sk4f&, Sk4f&,
-                                               Sk4f&, Sk4f&, Sk4f&, Sk4f&);
-
     struct Stage {
         template <typename T>
         T ctx() { return static_cast<T>(fCtx); }
@@ -99,17 +88,6 @@ public:
     void append(Fn body, Fn tail, const void* ctx = nullptr);
     void append(Fn fn, const void* ctx = nullptr) { this->append(fn, fn, ctx); }
 
-    // Version of append that can be used with static EasyFn (see SK_RASTER_STAGE).
-    template <EasyFn fn>
-    void append(const void* ctx = nullptr) {
-        this->append(Body<fn,true>, Tail<fn,true>, ctx);
-    }
-
-    // If this is the last stage of the pipeline, last() is a bit faster than append().
-    template <EasyFn fn>
-    void last(const void* ctx = nullptr) {
-        this->append(Body<fn,false>, Tail<fn,false>, ctx);
-    }
 
     // Append all stages to this pipeline.
     void extend(const SkRasterPipeline&);
@@ -122,42 +100,10 @@ private:
     // buggy pipeline can't walk off its own end.
     static void SK_VECTORCALL JustReturn(Stage*, size_t, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
                                                                  Sk4f,Sk4f,Sk4f,Sk4f);
-
-    template <EasyFn kernel, bool kCallNext>
-    static void SK_VECTORCALL Body(SkRasterPipeline::Stage* st, size_t x, size_t tail,
-                                   Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
-                                   Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-        // Passing 0 lets the optimizer completely drop any "if (tail) {...}" code in kernel.
-        kernel(st->ctx<void*>(), x,0, r,g,b,a, dr,dg,db,da);
-        if (kCallNext) {
-            st->next(x,tail, r,g,b,a, dr,dg,db,da);  // It's faster to pass tail here than 0.
-        }
-    }
-
-    template <EasyFn kernel, bool kCallNext>
-    static void SK_VECTORCALL Tail(SkRasterPipeline::Stage* st, size_t x, size_t tail,
-                                   Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
-                                   Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    #if defined(__clang__)
-        __builtin_assume(tail > 0);  // This flourish lets Clang compile away any tail==0 code.
-    #endif
-        kernel(st->ctx<void*>(), x,tail, r,g,b,a, dr,dg,db,da);
-        if (kCallNext) {
-            st->next(x,tail, r,g,b,a, dr,dg,db,da);
-        }
-    }
-
     Stages fBody,
            fTail;
     Fn fBodyStart = &JustReturn,
        fTailStart = &JustReturn;
 };
 
-// These are always static, and we _really_ want them to inline.
-// If you find yourself wanting a non-inline stage, write a SkRasterPipeline::Fn directly.
-#define SK_RASTER_STAGE(name)                                                 \
-    static SK_ALWAYS_INLINE void name(void* ctx, size_t x, size_t tail,       \
-                                      Sk4f&  r, Sk4f&  g, Sk4f&  b, Sk4f&  a, \
-                                      Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da)
-
 #endif//SkRasterPipeline_DEFINED
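[Note — a sketch for context, not part of the CL: run(n) is what makes the _body/_tail split matter. It drives fBodyStart over full 4-pixel strides and fTailStart over the 1-3 pixel remainder, roughly like this:

    void SkRasterPipeline::run(size_t n) {
        Sk4f v;  // Starting register values don't matter; loads will overwrite them.
        size_t x = 0;
        while (n >= 4) {
            fBodyStart(fBody.begin(), x,0, v,v,v,v, v,v,v,v);  // tail == 0: full stride.
            x += 4;
            n -= 4;
        }
        if (n > 0) {
            fTailStart(fTail.begin(), x,n, v,v,v,v, v,v,v,v);  // tail in [1,3].
        }
    }

That is why tail-sensitive stages (loads and stores) come in distinct _body and _tail variants, while tail-insensitive stages like srcover can use one Fn for both.]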
diff --git a/src/core/SkRasterPipelineBlitter.cpp b/src/core/SkRasterPipelineBlitter.cpp
index 46820d3379..2ada336cd8 100644
--- a/src/core/SkRasterPipelineBlitter.cpp
+++ b/src/core/SkRasterPipelineBlitter.cpp
@@ -8,11 +8,10 @@
 #include "SkBlitter.h"
 #include "SkColor.h"
 #include "SkColorFilter.h"
-#include "SkHalf.h"
+#include "SkOpts.h"
 #include "SkPM4f.h"
 #include "SkRasterPipeline.h"
 #include "SkShader.h"
-#include "SkSRGB.h"
 #include "SkXfermode.h"
 
@@ -57,200 +56,6 @@ SkBlitter* SkCreateRasterPipelineBlitter(const SkPixmap& dst,
     return SkRasterPipelineBlitter::Create(dst, paint, alloc);
 }
 
-// Clamp colors into [0,1] premul (e.g. just before storing back to memory).
-SK_RASTER_STAGE(clamp_01_premul) {
-    a = Sk4f::Max(a, 0.0f);
-    r = Sk4f::Max(r, 0.0f);
-    g = Sk4f::Max(g, 0.0f);
-    b = Sk4f::Max(b, 0.0f);
-
-    a = Sk4f::Min(a, 1.0f);
-    r = Sk4f::Min(r, a);
-    g = Sk4f::Min(g, a);
-    b = Sk4f::Min(b, a);
-}
-
-// The default shader produces a constant color (from the SkPaint).
-SK_RASTER_STAGE(constant_color) {
-    auto color = (const SkPM4f*)ctx;
-    r = color->r();
-    g = color->g();
-    b = color->b();
-    a = color->a();
-}
-
-// The default transfer mode is srcover, s' = s + d*(1-sa).
-SK_RASTER_STAGE(srcover) {
-    r += dr*(1.0f - a);
-    g += dg*(1.0f - a);
-    b += db*(1.0f - a);
-    a += da*(1.0f - a);
-}
-
-static Sk4f lerp(const Sk4f& from, const Sk4f& to, const Sk4f& cov) {
-    return from + (to-from)*cov;
-}
-
-// s' = d(1-c) + sc, for a constant c.
-SK_RASTER_STAGE(lerp_constant_float) {
-    Sk4f c = *(const float*)ctx;
-
-    r = lerp(dr, r, c);
-    g = lerp(dg, g, c);
-    b = lerp(db, b, c);
-    a = lerp(da, a, c);
-}
-
-template <typename T>
-static SkNx<4,T> load_tail(size_t tail, const T* src) {
-    if (tail) {
-        return SkNx<4,T>(src[0], (tail>1 ? src[1] : 0), (tail>2 ? src[2] : 0), 0);
-    }
-    return SkNx<4,T>::Load(src);
-}
-
-template <typename T>
-static void store_tail(size_t tail, const SkNx<4,T>& v, T* dst) {
-    switch(tail) {
-        case 0: return v.store(dst);
-        case 3: dst[2] = v[2];
-        case 2: dst[1] = v[1];
-        case 1: dst[0] = v[0];
-    }
-}
-
-// s' = d(1-c) + sc for 8-bit c.
-SK_RASTER_STAGE(lerp_a8) {
-    auto ptr = (const uint8_t*)ctx + x;
-
-    Sk4f c = SkNx_cast<float>(load_tail(tail, ptr)) * (1/255.0f);
-    r = lerp(dr, r, c);
-    g = lerp(dg, g, c);
-    b = lerp(db, b, c);
-    a = lerp(da, a, c);
-}
-
-static void from_565(const Sk4h& _565, Sk4f* r, Sk4f* g, Sk4f* b) {
-    Sk4i _32_bit = SkNx_cast<int>(_565);
-
-    *r = SkNx_cast<float>(_32_bit & SK_R16_MASK_IN_PLACE) * (1.0f / SK_R16_MASK_IN_PLACE);
-    *g = SkNx_cast<float>(_32_bit & SK_G16_MASK_IN_PLACE) * (1.0f / SK_G16_MASK_IN_PLACE);
-    *b = SkNx_cast<float>(_32_bit & SK_B16_MASK_IN_PLACE) * (1.0f / SK_B16_MASK_IN_PLACE);
-}
-
-static Sk4h to_565(const Sk4f& r, const Sk4f& g, const Sk4f& b) {
-    return SkNx_cast<uint16_t>( Sk4f_round(r * SK_R16_MASK) << SK_R16_SHIFT
-                              | Sk4f_round(g * SK_G16_MASK) << SK_G16_SHIFT
-                              | Sk4f_round(b * SK_B16_MASK) << SK_B16_SHIFT);
-}
-
-// s' = d(1-c) + sc for 565 c.
-SK_RASTER_STAGE(lerp_lcd16) {
-    auto ptr = (const uint16_t*)ctx + x;
-    Sk4f cr, cg, cb;
-    from_565(load_tail(tail, ptr), &cr, &cg, &cb);
-
-    r = lerp(dr, r, cr);
-    g = lerp(dg, g, cg);
-    b = lerp(db, b, cb);
-    a = 1.0f;
-}
-
-SK_RASTER_STAGE(load_d_565) {
-    auto ptr = (const uint16_t*)ctx + x;
-    from_565(load_tail(tail, ptr), &dr,&dg,&db);
-    da = 1.0f;
-}
-
-SK_RASTER_STAGE(store_565) {
-    auto ptr = (uint16_t*)ctx + x;
-    store_tail(tail, to_565(r,g,b), ptr);
-}
-
-SK_RASTER_STAGE(load_d_f16) {
-    auto ptr = (const uint64_t*)ctx + x;
-
-    if (tail) {
-        auto p0 =          SkHalfToFloat_finite_ftz(ptr[0])          ,
-             p1 = tail>1 ? SkHalfToFloat_finite_ftz(ptr[1]) : Sk4f{0},
-             p2 = tail>2 ? SkHalfToFloat_finite_ftz(ptr[2]) : Sk4f{0};
-        dr = { p0[0],p1[0],p2[0],0 };
-        dg = { p0[1],p1[1],p2[1],0 };
-        db = { p0[2],p1[2],p2[2],0 };
-        da = { p0[3],p1[3],p2[3],0 };
-        return;
-    }
-
-    Sk4h rh, gh, bh, ah;
-    Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
-    dr = SkHalfToFloat_finite_ftz(rh);
-    dg = SkHalfToFloat_finite_ftz(gh);
-    db = SkHalfToFloat_finite_ftz(bh);
-    da = SkHalfToFloat_finite_ftz(ah);
-}
-
-SK_RASTER_STAGE(store_f16) {
-    auto ptr = (uint64_t*)ctx + x;
-
-    switch (tail) {
-        case 0: return Sk4h_store4(ptr, SkFloatToHalf_finite_ftz(r), SkFloatToHalf_finite_ftz(g),
-                                        SkFloatToHalf_finite_ftz(b), SkFloatToHalf_finite_ftz(a));
-
-        case 3: SkFloatToHalf_finite_ftz({r[2], g[2], b[2], a[2]}).store(ptr+2);
-        case 2: SkFloatToHalf_finite_ftz({r[1], g[1], b[1], a[1]}).store(ptr+1);
-        case 1: SkFloatToHalf_finite_ftz({r[0], g[0], b[0], a[0]}).store(ptr+0);
-    }
-}
-
-// Load 8-bit SkPMColor-order sRGB.
-SK_RASTER_STAGE(load_d_srgb) {
-    auto ptr = (const uint32_t*)ctx + x;
-
-    if (tail) {
-        float rs[] = {0,0,0,0},
-              gs[] = {0,0,0,0},
-              bs[] = {0,0,0,0},
-              as[] = {0,0,0,0};
-        for (size_t i = 0; i < tail; i++) {
-            rs[i] = sk_linear_from_srgb[(ptr[i] >> SK_R32_SHIFT) & 0xff];
-            gs[i] = sk_linear_from_srgb[(ptr[i] >> SK_G32_SHIFT) & 0xff];
-            bs[i] = sk_linear_from_srgb[(ptr[i] >> SK_B32_SHIFT) & 0xff];
-            as[i] = (1/255.0f) * (ptr[i] >> SK_A32_SHIFT);
-        }
-        dr = Sk4f::Load(rs);
-        dg = Sk4f::Load(gs);
-        db = Sk4f::Load(bs);
-        da = Sk4f::Load(as);
-        return;
-    }
-
-    dr = { sk_linear_from_srgb[(ptr[0] >> SK_R32_SHIFT) & 0xff],
-           sk_linear_from_srgb[(ptr[1] >> SK_R32_SHIFT) & 0xff],
-           sk_linear_from_srgb[(ptr[2] >> SK_R32_SHIFT) & 0xff],
-           sk_linear_from_srgb[(ptr[3] >> SK_R32_SHIFT) & 0xff] };
-
-    dg = { sk_linear_from_srgb[(ptr[0] >> SK_G32_SHIFT) & 0xff],
-           sk_linear_from_srgb[(ptr[1] >> SK_G32_SHIFT) & 0xff],
-           sk_linear_from_srgb[(ptr[2] >> SK_G32_SHIFT) & 0xff],
-           sk_linear_from_srgb[(ptr[3] >> SK_G32_SHIFT) & 0xff] };
-
-    db = { sk_linear_from_srgb[(ptr[0] >> SK_B32_SHIFT) & 0xff],
-           sk_linear_from_srgb[(ptr[1] >> SK_B32_SHIFT) & 0xff],
-           sk_linear_from_srgb[(ptr[2] >> SK_B32_SHIFT) & 0xff],
-           sk_linear_from_srgb[(ptr[3] >> SK_B32_SHIFT) & 0xff] };
-
-    da = SkNx_cast<float>(Sk4u::Load(ptr) >> SK_A32_SHIFT) * (1/255.0f);
-}
-
-// Store 8-bit SkPMColor-order sRGB.
-SK_RASTER_STAGE(store_srgb) {
-    auto ptr = (uint32_t*)ctx + x;
-    store_tail(tail, ( sk_linear_to_srgb_noclamp(r) << SK_R32_SHIFT
-                     | sk_linear_to_srgb_noclamp(g) << SK_G32_SHIFT
-                     | sk_linear_to_srgb_noclamp(b) << SK_B32_SHIFT
-                     | Sk4f_round(255.0f * a)       << SK_A32_SHIFT), (int*)ptr);
-}
-
 static bool supported(const SkImageInfo& info) {
     switch (info.colorType()) {
         case kN32_SkColorType:      return info.gammaCloseToSRGB();
@@ -297,10 +102,10 @@ SkBlitter* SkRasterPipelineBlitter::Create(const SkPixmap& dst,
                                                 color.premul());
     if (!paint.getShader()) {
-        blitter->fShader.append<constant_color>(&blitter->fPaintColor);
+        blitter->fShader.append(SkOpts::constant_color, &blitter->fPaintColor);
     }
     if (!paint.getXfermode()) {
-        blitter->fXfermode.append<srcover>();
+        blitter->fXfermode.append(SkOpts::srcover);
     }
 
     return blitter;
@@ -312,41 +117,33 @@ void SkRasterPipelineBlitter::append_load_d(SkRasterPipeline* p, const void* dst
     switch (fDst.info().colorType()) {
         case kN32_SkColorType:
             if (fDst.info().gammaCloseToSRGB()) {
-                p->append<load_d_srgb>(dst);
+                p->append(SkOpts::load_d_srgb_body, SkOpts::load_d_srgb_tail, dst);
             }
             break;
         case kRGBA_F16_SkColorType:
-            p->append<load_d_f16>(dst);
+            p->append(SkOpts::load_d_f16_body, SkOpts::load_d_f16_tail, dst);
             break;
         case kRGB_565_SkColorType:
-            p->append<load_d_565>(dst);
+            p->append(SkOpts::load_d_565_body, SkOpts::load_d_565_tail, dst);
             break;
         default: break;
     }
 }
 
-template <SkRasterPipeline::EasyFn fn>
-static void clamp_01_premul_then(void* ctx, size_t x, size_t tail,
-                                 Sk4f&  r, Sk4f&  g, Sk4f&  b, Sk4f&  a,
-                                 Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da) {
-    clamp_01_premul(nullptr, x,tail, r,g,b,a, dr,dg,db,da);
-    fn             (    ctx, x,tail, r,g,b,a, dr,dg,db,da);
-}
-
 void SkRasterPipelineBlitter::append_store(SkRasterPipeline* p, void* dst) const {
     SkASSERT(supported(fDst.info()));
 
     switch (fDst.info().colorType()) {
         case kN32_SkColorType:
             if (fDst.info().gammaCloseToSRGB()) {
-                p->last<clamp_01_premul_then<store_srgb>>(dst);
+                p->append(SkOpts::store_srgb_body, SkOpts::store_srgb_tail, dst);
             }
             break;
         case kRGBA_F16_SkColorType:
-            p->last<clamp_01_premul_then<store_f16>>(dst);
+            p->append(SkOpts::store_f16_body, SkOpts::store_f16_tail, dst);
             break;
         case kRGB_565_SkColorType:
-            p->last<clamp_01_premul_then<store_565>>(dst);
+            p->append(SkOpts::store_565_body, SkOpts::store_565_tail, dst);
            break;
        default:
            break;
    }
 }
 
@@ -374,7 +171,7 @@ void SkRasterPipelineBlitter::blitAntiH(int x, int y, const SkAlpha aa[], const
     p.extend(fColorFilter);
     this->append_load_d(&p, dst);
     p.extend(fXfermode);
-    p.append<lerp_constant_float>(&coverage);
+    p.append(SkOpts::lerp_constant_float, &coverage);
     this->append_store(&p, dst);
 
     for (int16_t run = *runs; run > 0; run = *runs) {
@@ -404,10 +201,10 @@ void SkRasterPipelineBlitter::blitMask(const SkMask& mask, const SkIRect& clip)
     p.extend(fXfermode);
     switch (mask.fFormat) {
         case SkMask::kA8_Format:
-            p.append<lerp_a8>(mask.getAddr8(x,y)-x);
+            p.append(SkOpts::lerp_u8_body, SkOpts::lerp_u8_tail, mask.getAddr8(x,y)-x);
             break;
         case SkMask::kLCD16_Format:
-            p.append<lerp_lcd16>(mask.getAddrLCD16(x,y)-x);
+            p.append(SkOpts::lerp_565_body, SkOpts::lerp_565_tail, mask.getAddrLCD16(x,y)-x);
             break;
         default: break;
     }
diff --git a/src/opts/SkOpts_sse41.cpp b/src/opts/SkOpts_sse41.cpp
index 17ce0668ff..e4e3246780 100644
--- a/src/opts/SkOpts_sse41.cpp
+++ b/src/opts/SkOpts_sse41.cpp
@@ -11,6 +11,7 @@
 #include "SkBlurImageFilter_opts.h"
 #include "SkBlitRow_opts.h"
 #include "SkBlend_opts.h"
+#include "SkRasterPipeline_opts.h"
 
 namespace SkOpts {
     void Init_sse41() {
@@ -19,5 +20,35 @@ namespace SkOpts {
         box_blur_yx          = sse41::box_blur_yx;
         srcover_srgb_srgb    = sse41::srcover_srgb_srgb;
         blit_row_s32a_opaque = sse41::blit_row_s32a_opaque;
+
+    #define STAGE(stage, kCallNext) \
+        stage = body<SK_OPTS_NS::stage, kCallNext>
+
+        STAGE(srcover,             true);
+        STAGE(constant_color,      true);
+        STAGE(lerp_constant_float, true);
+    #undef STAGE
+
+    #define STAGE(stage, kCallNext)                            \
+        stage##_body = body<SK_OPTS_NS::stage, kCallNext>;     \
+        stage##_tail = tail<SK_OPTS_NS::stage, kCallNext>
+
+        STAGE(load_d_srgb, true);
+        STAGE(load_s_srgb, true);
+        STAGE( store_srgb, false);
+
+        STAGE(load_d_f16, true);
+        STAGE(load_s_f16, true);
+        STAGE( store_f16, false);
+
+        STAGE(load_d_565, true);
+        STAGE(load_s_565, true);
+        STAGE( store_565, false);
+
+        STAGE(scale_u8, true);
+
+        STAGE(lerp_u8,  true);
+        STAGE(lerp_565, true);
+    #undef STAGE
     }
 }
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
new file mode 100644
index 0000000000..70c4d0c225
--- /dev/null
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -0,0 +1,333 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkRasterPipeline_opts_DEFINED
+#define SkRasterPipeline_opts_DEFINED
+
+#include "SkHalf.h"
+#include "SkPM4f.h"
+#include "SkRasterPipeline.h"
+#include "SkSRGB.h"
+
+using Kernel_Sk4f = void(void*, size_t, size_t, Sk4f&, Sk4f&, Sk4f&, Sk4f&,
+                                                Sk4f&, Sk4f&, Sk4f&, Sk4f&);
+
+// These are always static, and we _really_ want them to inline.
+// If you find yourself wanting a non-inline stage, write a SkRasterPipeline::Fn directly.
+#define KERNEL_Sk4f(name)                                                     \
+    static SK_ALWAYS_INLINE void name(void* ctx, size_t x, size_t tail,       \
+                                      Sk4f&  r, Sk4f&  g, Sk4f&  b, Sk4f&  a, \
+                                      Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da)
+
+
+template <Kernel_Sk4f kernel, bool kCallNext>
+static inline void SK_VECTORCALL body(SkRasterPipeline::Stage* st, size_t x, size_t t,
+                                      Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
+                                      Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+    // Passing 0 lets the optimizer completely drop any "if (tail) {...}" code in kernel.
+    kernel(st->ctx<void*>(), x,0, r,g,b,a, dr,dg,db,da);
+    if (kCallNext) {
+        st->next(x,t, r,g,b,a, dr,dg,db,da);  // It's faster to pass t here than 0.
+    }
+}
+
+template <Kernel_Sk4f kernel, bool kCallNext>
+static inline void SK_VECTORCALL tail(SkRasterPipeline::Stage* st, size_t x, size_t t,
+                                      Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
+                                      Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+#if defined(__clang__)
+    __builtin_assume(t > 0);  // This flourish lets Clang compile away any tail==0 code.
+#endif
+    kernel(st->ctx<void*>(), x,t, r,g,b,a, dr,dg,db,da);
+    if (kCallNext) {
+        st->next(x,t, r,g,b,a, dr,dg,db,da);
+    }
+}
+
+namespace SK_OPTS_NS {
+
+    // Clamp colors into [0,1] premul (e.g. just before storing back to memory).
+    static void clamp_01_premul(Sk4f& r, Sk4f& g, Sk4f& b, Sk4f& a) {
+        a = Sk4f::Max(a, 0.0f);
+        r = Sk4f::Max(r, 0.0f);
+        g = Sk4f::Max(g, 0.0f);
+        b = Sk4f::Max(b, 0.0f);
+
+        a = Sk4f::Min(a, 1.0f);
+        r = Sk4f::Min(r, a);
+        g = Sk4f::Min(g, a);
+        b = Sk4f::Min(b, a);
+    }
+
+    static Sk4f lerp(const Sk4f& from, const Sk4f& to, const Sk4f& cov) {
+        return from + (to-from)*cov;
+    }
+
+    template <typename T>
+    static SkNx<4,T> load_tail(size_t tail, const T* src) {
+        if (tail) {
+            return SkNx<4,T>(src[0], (tail>1 ? src[1] : 0), (tail>2 ? src[2] : 0), 0);
+        }
+        return SkNx<4,T>::Load(src);
+    }
+
+    template <typename T>
+    static void store_tail(size_t tail, const SkNx<4,T>& v, T* dst) {
+        switch(tail) {
+            case 0: return v.store(dst);
+            case 3: dst[2] = v[2];
+            case 2: dst[1] = v[1];
+            case 1: dst[0] = v[0];
+        }
+    }
+
+    static void from_565(const Sk4h& _565, Sk4f* r, Sk4f* g, Sk4f* b) {
+        Sk4i _32_bit = SkNx_cast<int>(_565);
+
+        *r = SkNx_cast<float>(_32_bit & SK_R16_MASK_IN_PLACE) * (1.0f / SK_R16_MASK_IN_PLACE);
+        *g = SkNx_cast<float>(_32_bit & SK_G16_MASK_IN_PLACE) * (1.0f / SK_G16_MASK_IN_PLACE);
+        *b = SkNx_cast<float>(_32_bit & SK_B16_MASK_IN_PLACE) * (1.0f / SK_B16_MASK_IN_PLACE);
+    }
+
+    static Sk4h to_565(const Sk4f& r, const Sk4f& g, const Sk4f& b) {
+        return SkNx_cast<uint16_t>( Sk4f_round(r * SK_R16_MASK) << SK_R16_SHIFT
+                                  | Sk4f_round(g * SK_G16_MASK) << SK_G16_SHIFT
+                                  | Sk4f_round(b * SK_B16_MASK) << SK_B16_SHIFT);
+    }
+
+
+    // The default shader produces a constant color (from the SkPaint).
+    KERNEL_Sk4f(constant_color) {
+        auto color = (const SkPM4f*)ctx;
+        r = color->r();
+        g = color->g();
+        b = color->b();
+        a = color->a();
+    }
+
+    // The default transfer mode is srcover, s' = s + d*(1-sa).
+    KERNEL_Sk4f(srcover) {
+        r += dr*(1.0f - a);
+        g += dg*(1.0f - a);
+        b += db*(1.0f - a);
+        a += da*(1.0f - a);
+    }
+
+    // s' = d(1-c) + sc, for a constant c.
+    KERNEL_Sk4f(lerp_constant_float) {
+        Sk4f c = *(const float*)ctx;
+
+        r = lerp(dr, r, c);
+        g = lerp(dg, g, c);
+        b = lerp(db, b, c);
+        a = lerp(da, a, c);
+    }
+
+    // s' = sc for 8-bit c.
+    KERNEL_Sk4f(scale_u8) {
+        auto ptr = (const uint8_t*)ctx + x;
+
+        Sk4f c = SkNx_cast<float>(load_tail(tail, ptr)) * (1/255.0f);
+        r = r*c;
+        g = g*c;
+        b = b*c;
+        a = a*c;
+    }
+
+    // s' = d(1-c) + sc for 8-bit c.
+    KERNEL_Sk4f(lerp_u8) {
+        auto ptr = (const uint8_t*)ctx + x;
+
+        Sk4f c = SkNx_cast<float>(load_tail(tail, ptr)) * (1/255.0f);
+        r = lerp(dr, r, c);
+        g = lerp(dg, g, c);
+        b = lerp(db, b, c);
+        a = lerp(da, a, c);
+    }
+
+    // s' = d(1-c) + sc for 565 c.
+    KERNEL_Sk4f(lerp_565) {
+        auto ptr = (const uint16_t*)ctx + x;
+        Sk4f cr, cg, cb;
+        from_565(load_tail(tail, ptr), &cr, &cg, &cb);
+
+        r = lerp(dr, r, cr);
+        g = lerp(dg, g, cg);
+        b = lerp(db, b, cb);
+        a = 1.0f;
+    }
+
+    KERNEL_Sk4f(load_d_565) {
+        auto ptr = (const uint16_t*)ctx + x;
+        from_565(load_tail(tail, ptr), &dr,&dg,&db);
+        da = 1.0f;
+    }
+
+    KERNEL_Sk4f(load_s_565) {
+        auto ptr = (const uint16_t*)ctx + x;
+        from_565(load_tail(tail, ptr), &r,&g,&b);
+        a = 1.0f;
+    }
+
+    KERNEL_Sk4f(store_565) {
+        clamp_01_premul(r,g,b,a);
+        auto ptr = (uint16_t*)ctx + x;
+        store_tail(tail, to_565(r,g,b), ptr);
+    }
+
+    KERNEL_Sk4f(load_d_f16) {
+        auto ptr = (const uint64_t*)ctx + x;
+
+        if (tail) {
+            auto p0 =          SkHalfToFloat_finite_ftz(ptr[0])          ,
+                 p1 = tail>1 ? SkHalfToFloat_finite_ftz(ptr[1]) : Sk4f{0},
+                 p2 = tail>2 ? SkHalfToFloat_finite_ftz(ptr[2]) : Sk4f{0};
+            dr = { p0[0],p1[0],p2[0],0 };
+            dg = { p0[1],p1[1],p2[1],0 };
+            db = { p0[2],p1[2],p2[2],0 };
+            da = { p0[3],p1[3],p2[3],0 };
+            return;
+        }
+
+        Sk4h rh, gh, bh, ah;
+        Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
+        dr = SkHalfToFloat_finite_ftz(rh);
+        dg = SkHalfToFloat_finite_ftz(gh);
+        db = SkHalfToFloat_finite_ftz(bh);
+        da = SkHalfToFloat_finite_ftz(ah);
+    }
+
+    KERNEL_Sk4f(load_s_f16) {
+        auto ptr = (const uint64_t*)ctx + x;
+
+        if (tail) {
+            auto p0 =          SkHalfToFloat_finite_ftz(ptr[0])          ,
+                 p1 = tail>1 ? SkHalfToFloat_finite_ftz(ptr[1]) : Sk4f{0},
+                 p2 = tail>2 ? SkHalfToFloat_finite_ftz(ptr[2]) : Sk4f{0};
+            r = { p0[0],p1[0],p2[0],0 };
+            g = { p0[1],p1[1],p2[1],0 };
+            b = { p0[2],p1[2],p2[2],0 };
+            a = { p0[3],p1[3],p2[3],0 };
+            return;
+        }
+
+        Sk4h rh, gh, bh, ah;
+        Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
+        r = SkHalfToFloat_finite_ftz(rh);
+        g = SkHalfToFloat_finite_ftz(gh);
+        b = SkHalfToFloat_finite_ftz(bh);
+        a = SkHalfToFloat_finite_ftz(ah);
+    }
+
+    KERNEL_Sk4f(store_f16) {
+        clamp_01_premul(r,g,b,a);
+        auto ptr = (uint64_t*)ctx + x;
+
+        switch (tail) {
+            case 0: return Sk4h_store4(ptr, SkFloatToHalf_finite_ftz(r),
+                                            SkFloatToHalf_finite_ftz(g),
+                                            SkFloatToHalf_finite_ftz(b),
+                                            SkFloatToHalf_finite_ftz(a));
+
+            case 3: SkFloatToHalf_finite_ftz({r[2], g[2], b[2], a[2]}).store(ptr+2);
+            case 2: SkFloatToHalf_finite_ftz({r[1], g[1], b[1], a[1]}).store(ptr+1);
+            case 1: SkFloatToHalf_finite_ftz({r[0], g[0], b[0], a[0]}).store(ptr+0);
+        }
+    }
+
+
+    // Load 8-bit SkPMColor-order sRGB.
+    KERNEL_Sk4f(load_d_srgb) {
+        auto ptr = (const uint32_t*)ctx + x;
+
+        if (tail) {
+            float rs[] = {0,0,0,0},
+                  gs[] = {0,0,0,0},
+                  bs[] = {0,0,0,0},
+                  as[] = {0,0,0,0};
+            for (size_t i = 0; i < tail; i++) {
+                rs[i] = sk_linear_from_srgb[(ptr[i] >> SK_R32_SHIFT) & 0xff];
+                gs[i] = sk_linear_from_srgb[(ptr[i] >> SK_G32_SHIFT) & 0xff];
+                bs[i] = sk_linear_from_srgb[(ptr[i] >> SK_B32_SHIFT) & 0xff];
+                as[i] = (1/255.0f) * (ptr[i] >> SK_A32_SHIFT);
+            }
+            dr = Sk4f::Load(rs);
+            dg = Sk4f::Load(gs);
+            db = Sk4f::Load(bs);
+            da = Sk4f::Load(as);
+            return;
+        }
+
+        dr = { sk_linear_from_srgb[(ptr[0] >> SK_R32_SHIFT) & 0xff],
+               sk_linear_from_srgb[(ptr[1] >> SK_R32_SHIFT) & 0xff],
+               sk_linear_from_srgb[(ptr[2] >> SK_R32_SHIFT) & 0xff],
+               sk_linear_from_srgb[(ptr[3] >> SK_R32_SHIFT) & 0xff] };
+
+        dg = { sk_linear_from_srgb[(ptr[0] >> SK_G32_SHIFT) & 0xff],
+               sk_linear_from_srgb[(ptr[1] >> SK_G32_SHIFT) & 0xff],
+               sk_linear_from_srgb[(ptr[2] >> SK_G32_SHIFT) & 0xff],
+               sk_linear_from_srgb[(ptr[3] >> SK_G32_SHIFT) & 0xff] };
+
+        db = { sk_linear_from_srgb[(ptr[0] >> SK_B32_SHIFT) & 0xff],
+               sk_linear_from_srgb[(ptr[1] >> SK_B32_SHIFT) & 0xff],
+               sk_linear_from_srgb[(ptr[2] >> SK_B32_SHIFT) & 0xff],
+               sk_linear_from_srgb[(ptr[3] >> SK_B32_SHIFT) & 0xff] };
+
+        da = SkNx_cast<float>(Sk4u::Load(ptr) >> SK_A32_SHIFT) * (1/255.0f);
+    }
+
+    KERNEL_Sk4f(load_s_srgb) {
+        auto ptr = (const uint32_t*)ctx + x;
+
+        if (tail) {
+            float rs[] = {0,0,0,0},
+                  gs[] = {0,0,0,0},
+                  bs[] = {0,0,0,0},
+                  as[] = {0,0,0,0};
+            for (size_t i = 0; i < tail; i++) {
+                rs[i] = sk_linear_from_srgb[(ptr[i] >> SK_R32_SHIFT) & 0xff];
+                gs[i] = sk_linear_from_srgb[(ptr[i] >> SK_G32_SHIFT) & 0xff];
+                bs[i] = sk_linear_from_srgb[(ptr[i] >> SK_B32_SHIFT) & 0xff];
+                as[i] = (1/255.0f) * (ptr[i] >> SK_A32_SHIFT);
+            }
+            r = Sk4f::Load(rs);
+            g = Sk4f::Load(gs);
+            b = Sk4f::Load(bs);
+            a = Sk4f::Load(as);
+            return;
+        }
+
+        r = { sk_linear_from_srgb[(ptr[0] >> SK_R32_SHIFT) & 0xff],
+              sk_linear_from_srgb[(ptr[1] >> SK_R32_SHIFT) & 0xff],
+              sk_linear_from_srgb[(ptr[2] >> SK_R32_SHIFT) & 0xff],
+              sk_linear_from_srgb[(ptr[3] >> SK_R32_SHIFT) & 0xff] };
+
+        g = { sk_linear_from_srgb[(ptr[0] >> SK_G32_SHIFT) & 0xff],
+              sk_linear_from_srgb[(ptr[1] >> SK_G32_SHIFT) & 0xff],
+              sk_linear_from_srgb[(ptr[2] >> SK_G32_SHIFT) & 0xff],
+              sk_linear_from_srgb[(ptr[3] >> SK_G32_SHIFT) & 0xff] };
+
+        b = { sk_linear_from_srgb[(ptr[0] >> SK_B32_SHIFT) & 0xff],
+              sk_linear_from_srgb[(ptr[1] >> SK_B32_SHIFT) & 0xff],
+              sk_linear_from_srgb[(ptr[2] >> SK_B32_SHIFT) & 0xff],
+              sk_linear_from_srgb[(ptr[3] >> SK_B32_SHIFT) & 0xff] };
+
+        a = SkNx_cast<float>(Sk4u::Load(ptr) >> SK_A32_SHIFT) * (1/255.0f);
+    }
+
+    KERNEL_Sk4f(store_srgb) {
+        clamp_01_premul(r,g,b,a);
+        auto ptr = (uint32_t*)ctx + x;
+        store_tail(tail, ( sk_linear_to_srgb_noclamp(r) << SK_R32_SHIFT
+                         | sk_linear_to_srgb_noclamp(g) << SK_G32_SHIFT
+                         | sk_linear_to_srgb_noclamp(b) << SK_B32_SHIFT
+                         | Sk4f_round(255.0f * a)       << SK_A32_SHIFT), (int*)ptr);
+    }
+
+}
+
+#endif//SkRasterPipeline_opts_DEFINED
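[Note — a hypothetical illustration, not part of the CL: with this header in place, adding a stage means writing one KERNEL_Sk4f kernel and binding it in SkOpts. The stage name "halve" below is invented for the example:

    // In src/opts/SkRasterPipeline_opts.h, inside namespace SK_OPTS_NS:
    // s' = s * 0.5.  Ignores its context pointer and is insensitive to tail.
    KERNEL_Sk4f(halve) {
        r *= 0.5f;
        g *= 0.5f;
        b *= 0.5f;
        a *= 0.5f;
    }

    // In src/core/SkOpts.h:    extern SkRasterPipeline::Fn halve;  // (none)
    // In src/core/SkOpts.cpp, in the tail-insensitive list
    // (and likewise STAGE(halve, true); in each SkOpts_*.cpp Init_*()):
    //     DEFINE_DEFAULT(halve, true);
    //
    // At a call site:
    //     SkRasterPipeline p;
    //     p.append(SkOpts::halve);
    //     p.run(n);
]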
diff --git a/tests/SkRasterPipelineTest.cpp b/tests/SkRasterPipelineTest.cpp
index 867baf7918..ccc728e64a 100644
--- a/tests/SkRasterPipelineTest.cpp
+++ b/tests/SkRasterPipelineTest.cpp
@@ -8,25 +8,33 @@
 #include "Test.h"
 #include "SkRasterPipeline.h"
 
-SK_RASTER_STAGE(load) {
-    auto ptr = (const float*)ctx + x;
+static void SK_VECTORCALL load(SkRasterPipeline::Stage* st, size_t x, size_t tail,
+                               Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+                               Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+    auto ptr = st->ctx<const float*>() + x;
     switch(tail&3) {
         case 0: a = Sk4f{ptr[3]};
         case 3: b = Sk4f{ptr[2]};
         case 2: g = Sk4f{ptr[1]};
         case 1: r = Sk4f{ptr[0]};
     }
+    st->next(x,tail, r,g,b,a, dr,dg,db,da);
 }
 
-SK_RASTER_STAGE(square) {
+static void SK_VECTORCALL square(SkRasterPipeline::Stage* st, size_t x, size_t tail,
+                                 Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+                                 Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
     r *= r;
     g *= g;
     b *= b;
     a *= a;
+    st->next(x,tail, r,g,b,a, dr,dg,db,da);
 }
 
-SK_RASTER_STAGE(store) {
-    auto ptr = (float*)ctx + x;
+static void SK_VECTORCALL store(SkRasterPipeline::Stage* st, size_t x, size_t tail,
+                                Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+                                Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+    auto ptr = st->ctx<float*>() + x;
     switch (tail&3) {
         case 0: ptr[3] = a[0];
         case 3: ptr[2] = b[0];
@@ -41,6 +49,8 @@ DEF_TEST(SkRasterPipeline, r) {
     //   - context pointers (load,store)
     //   - stages sensitive to the number of pixels (load,store)
     //   - stages insensitive to the number of pixels (square)
+    //   - stages that chain to the next stage (load,square)
+    //   - stages that terminate the pipeline (store)
     //
     // This pipeline loads up some values, squares them, then writes them back to memory.
 
@@ -48,9 +58,9 @@ DEF_TEST(SkRasterPipeline, r) {
     float dst_vals[] = { 0,0,0,0,0 };
 
     SkRasterPipeline p;
-    p.append<load>(src_vals);
-    p.append<square>();
-    p.append<store>(dst_vals);
+    p.append(load, src_vals);
+    p.append(square);
+    p.append(store, dst_vals);
 
     p.run(5);
 
@@ -71,6 +81,6 @@ DEF_TEST(SkRasterPipeline_nonsense, r) {
     // No asserts... just a test that this is safe to run and terminates.
     // square() always calls st->next(); this makes sure we've always got something there to call.
     SkRasterPipeline p;
-    p.append<square>();
+    p.append(square);
     p.run(20);
 }