SkRasterPipeline refactor

- Give body and tail functions separate types.  Body functions no longer take the tail argument, which frees a register in body functions; this is especially important on Windows.

  - Fill out default, SSE4.1, and HSW versions of all functions.  This means we no longer have to mess around with SkNf_abi: all functions come from the same compilation unit, where SkNf is a single consistent type.

  - Move Stage::next() into SkRasterPipeline_opts.h as a static inline function.

  - Remove Stage::ctx() entirely; fCtx is literally the same thing.

This is a step along the way toward building the entire pipeline in src/opts, removing the need for all the stages to be functions living in SkOpts.

BUG=skia:

GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=3680
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot,Test-Ubuntu-Clang-GCE-CPU-AVX2-x86_64-Debug-ASAN-Trybot

Change-Id: I7de78ffebc15b9bad4eda187c9f50369cd7e5e42
Reviewed-on: https://skia-review.googlesource.com/3680
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
This commit is contained in:
Mike Klein 2016-10-19 21:05:17 -04:00 committed by Skia Commit-Bot
parent d2fe3bce07
commit 2878e76247
5 changed files with 112 additions and 110 deletions

View File

@ -144,8 +144,8 @@ namespace SkOpts {
static_assert(SK_ARRAY_COUNT(body) == SkRasterPipeline::kNumStockStages, "");
SkOpts::VoidFn tail[] = {
(SkOpts::VoidFn)SK_OPTS_NS::just_return,
(SkOpts::VoidFn)SK_OPTS_NS::swap_src_dst,
(SkOpts::VoidFn)SK_OPTS_NS::just_return_tail,
(SkOpts::VoidFn)SK_OPTS_NS::swap_src_dst_tail,
(SkOpts::VoidFn)SK_OPTS_NS::store_565_tail,
(SkOpts::VoidFn)SK_OPTS_NS::store_srgb_tail,
@ -163,34 +163,34 @@ namespace SkOpts {
(SkOpts::VoidFn)SK_OPTS_NS::lerp_u8_tail,
(SkOpts::VoidFn)SK_OPTS_NS::lerp_565_tail,
(SkOpts::VoidFn)SK_OPTS_NS::lerp_constant_float,
(SkOpts::VoidFn)SK_OPTS_NS::lerp_constant_float_tail,
(SkOpts::VoidFn)SK_OPTS_NS::constant_color,
(SkOpts::VoidFn)SK_OPTS_NS::constant_color_tail,
(SkOpts::VoidFn)SK_OPTS_NS::dst,
(SkOpts::VoidFn)SK_OPTS_NS::dstatop,
(SkOpts::VoidFn)SK_OPTS_NS::dstin,
(SkOpts::VoidFn)SK_OPTS_NS::dstout,
(SkOpts::VoidFn)SK_OPTS_NS::dstover,
(SkOpts::VoidFn)SK_OPTS_NS::srcatop,
(SkOpts::VoidFn)SK_OPTS_NS::srcin,
(SkOpts::VoidFn)SK_OPTS_NS::srcout,
(SkOpts::VoidFn)SK_OPTS_NS::srcover,
(SkOpts::VoidFn)SK_OPTS_NS::clear,
(SkOpts::VoidFn)SK_OPTS_NS::modulate,
(SkOpts::VoidFn)SK_OPTS_NS::multiply,
(SkOpts::VoidFn)SK_OPTS_NS::plus_,
(SkOpts::VoidFn)SK_OPTS_NS::screen,
(SkOpts::VoidFn)SK_OPTS_NS::xor_,
(SkOpts::VoidFn)SK_OPTS_NS::colorburn,
(SkOpts::VoidFn)SK_OPTS_NS::colordodge,
(SkOpts::VoidFn)SK_OPTS_NS::darken,
(SkOpts::VoidFn)SK_OPTS_NS::difference,
(SkOpts::VoidFn)SK_OPTS_NS::exclusion,
(SkOpts::VoidFn)SK_OPTS_NS::hardlight,
(SkOpts::VoidFn)SK_OPTS_NS::lighten,
(SkOpts::VoidFn)SK_OPTS_NS::overlay,
(SkOpts::VoidFn)SK_OPTS_NS::softlight,
(SkOpts::VoidFn)SK_OPTS_NS::dst_tail,
(SkOpts::VoidFn)SK_OPTS_NS::dstatop_tail,
(SkOpts::VoidFn)SK_OPTS_NS::dstin_tail,
(SkOpts::VoidFn)SK_OPTS_NS::dstout_tail,
(SkOpts::VoidFn)SK_OPTS_NS::dstover_tail,
(SkOpts::VoidFn)SK_OPTS_NS::srcatop_tail,
(SkOpts::VoidFn)SK_OPTS_NS::srcin_tail,
(SkOpts::VoidFn)SK_OPTS_NS::srcout_tail,
(SkOpts::VoidFn)SK_OPTS_NS::srcover_tail,
(SkOpts::VoidFn)SK_OPTS_NS::clear_tail,
(SkOpts::VoidFn)SK_OPTS_NS::modulate_tail,
(SkOpts::VoidFn)SK_OPTS_NS::multiply_tail,
(SkOpts::VoidFn)SK_OPTS_NS::plus__tail,
(SkOpts::VoidFn)SK_OPTS_NS::screen_tail,
(SkOpts::VoidFn)SK_OPTS_NS::xor__tail,
(SkOpts::VoidFn)SK_OPTS_NS::colorburn_tail,
(SkOpts::VoidFn)SK_OPTS_NS::colordodge_tail,
(SkOpts::VoidFn)SK_OPTS_NS::darken_tail,
(SkOpts::VoidFn)SK_OPTS_NS::difference_tail,
(SkOpts::VoidFn)SK_OPTS_NS::exclusion_tail,
(SkOpts::VoidFn)SK_OPTS_NS::hardlight_tail,
(SkOpts::VoidFn)SK_OPTS_NS::lighten_tail,
(SkOpts::VoidFn)SK_OPTS_NS::overlay_tail,
(SkOpts::VoidFn)SK_OPTS_NS::softlight_tail,
};
static_assert(SK_ARRAY_COUNT(tail) == SkRasterPipeline::kNumStockStages, "");

View File

@ -55,33 +55,13 @@
class SkRasterPipeline {
public:
struct Stage;
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
using V = SkNx_abi<8,float>;
#else
using V = SkNx_abi<4,float>;
#endif
using Fn = void(SK_VECTORCALL *)(Stage*, size_t, size_t, V,V,V,V,
V,V,V,V);
struct Stage {
template <typename T>
T ctx() { return static_cast<T>(fCtx); }
void SK_VECTORCALL next(size_t x, size_t tail, V v0, V v1, V v2, V v3,
V v4, V v5, V v6, V v7) {
// Stages are logically a pipeline, and physically are contiguous in an array.
// To get to the next stage, we just increment our pointer to the next array element.
((Fn)fNext)(this+1, x,tail, v0,v1,v2,v3, v4,v5,v6,v7);
}
// It makes next() a good bit cheaper if we hold the next function to call here,
// rather than logically simpler choice of the function implementing this stage.
void (*fNext)();
void* fCtx;
};
SkRasterPipeline();
// Run the pipeline constructed with append(), walking x through [x,x+n),

View File

@ -35,11 +35,6 @@ namespace SkOpts {
STAGE(lerp_u8);
STAGE(lerp_565);
#undef STAGE
#define STAGE(stage) \
body[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage; \
tail[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage
STAGE(just_return);
STAGE(swap_src_dst);

View File

@ -21,7 +21,7 @@ namespace SkOpts {
srcover_srgb_srgb = sse41::srcover_srgb_srgb;
blit_row_s32a_opaque = sse41::blit_row_s32a_opaque;
#define STAGE(stage) \
#define STAGE(stage) \
body[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage; \
tail[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage##_tail
@ -41,40 +41,34 @@ namespace SkOpts {
STAGE(lerp_u8);
STAGE(lerp_565);
#undef STAGE
#define STAGE(stage) \
body[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage; \
tail[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage
STAGE(just_return);
STAGE(swap_src_dst);
STAGE(lerp_constant_float);
STAGE(constant_color);
// The commented-out stages don't actually benefit from SSE 4.1.
// To cut down on code bloat we skip them here, using the identical SSE2 defaults.
//STAGE(lerp_constant_float);
//STAGE(constant_color);
//STAGE(dst);
//STAGE(dstatop);
//STAGE(dstin);
//STAGE(dstout);
//STAGE(dstover);
//STAGE(srcatop);
//STAGE(srcin);
//STAGE(srcout);
//STAGE(srcover);
//STAGE(clear);
//STAGE(modulate);
//STAGE(multiply);
//STAGE(plus_);
//STAGE(screen);
//STAGE(xor_);
STAGE(dst);
STAGE(dstatop);
STAGE(dstin);
STAGE(dstout);
STAGE(dstover);
STAGE(srcatop);
STAGE(srcin);
STAGE(srcout);
STAGE(srcover);
STAGE(clear);
STAGE(modulate);
STAGE(multiply);
STAGE(plus_);
STAGE(screen);
STAGE(xor_);
STAGE(colorburn);
STAGE(colordodge);
//STAGE(darken);
//STAGE(difference);
//STAGE(exclusion);
STAGE(darken);
STAGE(difference);
STAGE(exclusion);
STAGE(hardlight);
//STAGE(lighten);
STAGE(lighten);
STAGE(overlay);
STAGE(softlight);
#undef STAGE

View File

@ -12,36 +12,53 @@
#include "SkPM4f.h"
#include "SkRasterPipeline.h"
#include "SkSRGB.h"
#include <utility>
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
static constexpr int N = 8;
#else
static constexpr int N = 4;
#endif
using SkNf_abi = SkRasterPipeline::V;
static constexpr auto N = sizeof(SkNf_abi) / sizeof(float);
using SkNf = SkNx<N, float>;
using SkNi = SkNx<N, int>;
using SkNh = SkNx<N, uint16_t>;
// Function-pointer type for a body stage: the stage pointer, one size_t (named x in the
// stage definitions), then eight SkNf values (r,g,b,a, dr,dg,db,da in the stage definitions).
using Body = void(SK_VECTORCALL *)(SkRasterPipeline::Stage*, size_t,
SkNf,SkNf,SkNf,SkNf,
SkNf,SkNf,SkNf,SkNf);
// Function-pointer type for a tail stage: the same parameters as Body, plus a second
// size_t (named tail in the stage definitions) between x and the eight SkNf values.
using Tail = void(SK_VECTORCALL *)(SkRasterPipeline::Stage*, size_t, size_t,
SkNf,SkNf,SkNf,SkNf,
SkNf,SkNf,SkNf,SkNf);
#define SI static inline
// Hand control to the stage that follows st, forwarding args unchanged.
// Stages sit back to back in one array, so the following stage is simply st+1.
// st->fNext holds the function to call; it is cast to the caller-chosen type Fn
// before being invoked.
template <typename Fn, typename... Args>
SI void next(SkRasterPipeline::Stage* st, Args&&... args) {
    auto fn = reinterpret_cast<Fn>(st->fNext);
    fn(st + 1, std::forward<Args>(args)...);
}
#define STAGE(name, kCallNext) \
template <bool kIsTail> \
static SK_ALWAYS_INLINE void name##_kernel(void* ctx, size_t x, size_t tail, \
SkNf& r, SkNf& g, SkNf& b, SkNf& a, \
SkNf& dr, SkNf& dg, SkNf& db, SkNf& da); \
SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail, \
SkNf_abi R, SkNf_abi G, SkNf_abi B, SkNf_abi A, \
SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) { \
SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA; \
name##_kernel<false>(st->ctx<void*>(), x,0, r,g,b,a, dr,dg,db,da); \
SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, \
SkNf r, SkNf g, SkNf b, SkNf a, \
SkNf dr, SkNf dg, SkNf db, SkNf da) { \
name##_kernel<false>(st->fCtx, x,0, r,g,b,a, dr,dg,db,da); \
if (kCallNext) { \
st->next(x,tail, r,g,b,a, dr,dg,db,da); \
next<Body>(st, x, r,g,b,a, dr,dg,db,da); \
} \
} \
SI void SK_VECTORCALL name##_tail(SkRasterPipeline::Stage* st, size_t x, size_t tail, \
SkNf_abi R, SkNf_abi G, SkNf_abi B, SkNf_abi A, \
SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) { \
SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA; \
name##_kernel<true>(st->ctx<void*>(), x,tail, r,g,b,a, dr,dg,db,da); \
SkNf r, SkNf g, SkNf b, SkNf a, \
SkNf dr, SkNf dg, SkNf db, SkNf da) { \
name##_kernel<true>(st->fCtx, x,tail, r,g,b,a, dr,dg,db,da); \
if (kCallNext) { \
st->next(x,tail, r,g,b,a, dr,dg,db,da); \
next<Tail>(st, x,tail, r,g,b,a, dr,dg,db,da); \
} \
} \
template <bool kIsTail> \
@ -54,15 +71,23 @@ using SkNh = SkNx<N, uint16_t>;
#define RGBA_XFERMODE(name) \
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \
const SkNf& d, const SkNf& da); \
SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail, \
SkNf_abi R, SkNf_abi G, SkNf_abi B, SkNf_abi A, \
SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) { \
SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA; \
SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, \
SkNf r, SkNf g, SkNf b, SkNf a, \
SkNf dr, SkNf dg, SkNf db, SkNf da) { \
r = name##_kernel(r,a,dr,da); \
g = name##_kernel(g,a,dg,da); \
b = name##_kernel(b,a,db,da); \
a = name##_kernel(a,a,da,da); \
st->next(x,tail, r,g,b,a, dr,dg,db,da); \
next<Body>(st, x, r,g,b,a, dr,dg,db,da); \
} \
SI void SK_VECTORCALL name##_tail(SkRasterPipeline::Stage* st, size_t x, size_t tail, \
SkNf r, SkNf g, SkNf b, SkNf a, \
SkNf dr, SkNf dg, SkNf db, SkNf da) { \
r = name##_kernel(r,a,dr,da); \
g = name##_kernel(g,a,dg,da); \
b = name##_kernel(b,a,db,da); \
a = name##_kernel(a,a,da,da); \
next<Tail>(st, x,tail, r,g,b,a, dr,dg,db,da); \
} \
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \
const SkNf& d, const SkNf& da)
@ -71,15 +96,23 @@ using SkNh = SkNx<N, uint16_t>;
#define RGB_XFERMODE(name) \
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \
const SkNf& d, const SkNf& da); \
SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail, \
SkNf_abi R, SkNf_abi G, SkNf_abi B, SkNf_abi A, \
SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) { \
SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA; \
SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, \
SkNf r, SkNf g, SkNf b, SkNf a, \
SkNf dr, SkNf dg, SkNf db, SkNf da) { \
r = name##_kernel(r,a,dr,da); \
g = name##_kernel(g,a,dg,da); \
b = name##_kernel(b,a,db,da); \
a = a + (da * (1.0f-a)); \
st->next(x,tail, r,g,b,a, dr,dg,db,da); \
next<Body>(st, x, r,g,b,a, dr,dg,db,da); \
} \
SI void SK_VECTORCALL name##_tail(SkRasterPipeline::Stage* st, size_t x, size_t tail, \
SkNf r, SkNf g, SkNf b, SkNf a, \
SkNf dr, SkNf dg, SkNf db, SkNf da) { \
r = name##_kernel(r,a,dr,da); \
g = name##_kernel(g,a,dg,da); \
b = name##_kernel(b,a,db,da); \
a = a + (da * (1.0f-a)); \
next<Tail>(st, x,tail, r,g,b,a, dr,dg,db,da); \
} \
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \
const SkNf& d, const SkNf& da)
@ -90,11 +123,11 @@ namespace SK_OPTS_NS {
SI void run_pipeline(size_t x, size_t n,
void (*vBodyStart)(), SkRasterPipeline::Stage* body,
void (*vTailStart)(), SkRasterPipeline::Stage* tail) {
auto bodyStart = (SkRasterPipeline::Fn)vBodyStart,
tailStart = (SkRasterPipeline::Fn)vTailStart;
SkNf v{0}; // TODO: uninitialized would be a bit faster, but some compilers are whiny.
auto bodyStart = (Body)vBodyStart;
auto tailStart = (Tail)vTailStart;
SkNf v; // Fastest to start uninitialized.
while (n >= N) {
bodyStart(body, x,0, v,v,v,v, v,v,v,v);
bodyStart(body, x, v,v,v,v, v,v,v,v);
x += N;
n -= N;
}