SkRasterPipeline preliminaries

Re-uploading to see if I can get a CL number < 2^31.
    patch from issue 2147533002 at patchset 240001 (http://crrev.com/2147533002#ps240001)

Already reviewed at the other crrev link.
TBR=

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2147533002
CQ_INCLUDE_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2144573004
mtklein 2016-07-12 15:01:26 -07:00 committed by Commit bot
parent 7438bfc080
commit 281b33fdd9
7 changed files with 454 additions and 0 deletions


@@ -0,0 +1,195 @@
/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "Benchmark.h"
#include "SkRasterPipeline.h"
#include "SkSRGB.h"
static const int N = 1023;
static uint32_t dst[N],
src[N];
static uint8_t mask[N];
// We'll build up a somewhat realistic useful pipeline:
// - load srgb src
// - scale src by 8-bit mask
// - load srgb dst
// - src = srcover(dst, src)
// - store src back as srgb
// Every stage except for srcover interacts with memory, and so will need _tail variants.
static void SK_VECTORCALL load_s_srgb(SkRasterPipeline::Stage* st, size_t x,
                                      Sk4f r, Sk4f g, Sk4f b, Sk4f a,
                                      Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
    auto ptr = st->ctx<const uint32_t*>() + x;

    r = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  0) & 0xff],
              sk_linear_from_srgb[(ptr[1] >>  0) & 0xff],
              sk_linear_from_srgb[(ptr[2] >>  0) & 0xff],
              sk_linear_from_srgb[(ptr[3] >>  0) & 0xff] };

    g = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  8) & 0xff],
              sk_linear_from_srgb[(ptr[1] >>  8) & 0xff],
              sk_linear_from_srgb[(ptr[2] >>  8) & 0xff],
              sk_linear_from_srgb[(ptr[3] >>  8) & 0xff] };

    b = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 16) & 0xff],
              sk_linear_from_srgb[(ptr[1] >> 16) & 0xff],
              sk_linear_from_srgb[(ptr[2] >> 16) & 0xff],
              sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };

    a = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);

    st->next(x, r,g,b,a, dr,dg,db,da);
}

static void SK_VECTORCALL load_s_srgb_tail(SkRasterPipeline::Stage* st, size_t x,
                                           Sk4f r, Sk4f g, Sk4f b, Sk4f a,
                                           Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
    auto ptr = st->ctx<const uint32_t*>() + x;

    r = Sk4f{ sk_linear_from_srgb[(*ptr >>  0) & 0xff], 0,0,0 };
    g = Sk4f{ sk_linear_from_srgb[(*ptr >>  8) & 0xff], 0,0,0 };
    b = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 };
    a = Sk4f{                     (*ptr >> 24) * (1/255.0f), 0,0,0 };

    st->next(x, r,g,b,a, dr,dg,db,da);
}

static void SK_VECTORCALL load_d_srgb(SkRasterPipeline::Stage* st, size_t x,
                                      Sk4f r, Sk4f g, Sk4f b, Sk4f a,
                                      Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
    auto ptr = st->ctx<const uint32_t*>() + x;

    dr = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  0) & 0xff],
               sk_linear_from_srgb[(ptr[1] >>  0) & 0xff],
               sk_linear_from_srgb[(ptr[2] >>  0) & 0xff],
               sk_linear_from_srgb[(ptr[3] >>  0) & 0xff] };

    dg = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  8) & 0xff],
               sk_linear_from_srgb[(ptr[1] >>  8) & 0xff],
               sk_linear_from_srgb[(ptr[2] >>  8) & 0xff],
               sk_linear_from_srgb[(ptr[3] >>  8) & 0xff] };

    db = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 16) & 0xff],
               sk_linear_from_srgb[(ptr[1] >> 16) & 0xff],
               sk_linear_from_srgb[(ptr[2] >> 16) & 0xff],
               sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };

    da = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);

    st->next(x, r,g,b,a, dr,dg,db,da);
}

static void SK_VECTORCALL load_d_srgb_tail(SkRasterPipeline::Stage* st, size_t x,
                                           Sk4f r, Sk4f g, Sk4f b, Sk4f a,
                                           Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
    auto ptr = st->ctx<const uint32_t*>() + x;

    dr = Sk4f{ sk_linear_from_srgb[(*ptr >>  0) & 0xff], 0,0,0 };
    dg = Sk4f{ sk_linear_from_srgb[(*ptr >>  8) & 0xff], 0,0,0 };
    db = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 };
    da = Sk4f{                     (*ptr >> 24) * (1/255.0f), 0,0,0 };

    st->next(x, r,g,b,a, dr,dg,db,da);
}
static void SK_VECTORCALL scale_u8(SkRasterPipeline::Stage* st, size_t x,
                                   Sk4f r, Sk4f g, Sk4f b, Sk4f a,
                                   Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
    auto ptr = st->ctx<const uint8_t*>() + x;

    auto c = SkNx_cast<float>(Sk4b::Load(ptr)) * (1/255.0f);
    r *= c;
    g *= c;
    b *= c;
    a *= c;

    st->next(x, r,g,b,a, dr,dg,db,da);
}

static void SK_VECTORCALL scale_u8_tail(SkRasterPipeline::Stage* st, size_t x,
                                        Sk4f r, Sk4f g, Sk4f b, Sk4f a,
                                        Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
    auto ptr = st->ctx<const uint8_t*>() + x;

    auto c = *ptr * (1/255.0f);
    r *= c;
    g *= c;
    b *= c;
    a *= c;

    st->next(x, r,g,b,a, dr,dg,db,da);
}

static void SK_VECTORCALL srcover(SkRasterPipeline::Stage* st, size_t x,
                                  Sk4f r, Sk4f g, Sk4f b, Sk4f a,
                                  Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
    auto A = 1.0f - a;
    r += dr * A;
    g += dg * A;
    b += db * A;
    a += da * A;

    st->next(x, r,g,b,a, dr,dg,db,da);
}

static Sk4f clamp(const Sk4f& x) {
    return Sk4f::Min(Sk4f::Max(x, 0.0f), 255.0f);
}
static void SK_VECTORCALL store_srgb(SkRasterPipeline::Stage* st, size_t x,
                                     Sk4f r, Sk4f g, Sk4f b, Sk4f a,
                                     Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
    auto ptr = st->ctx<uint32_t*>() + x;

    r = clamp(sk_linear_to_srgb(r));
    g = clamp(sk_linear_to_srgb(g));
    b = clamp(sk_linear_to_srgb(b));
    a = clamp(          255.0f * a );

    ( SkNx_cast<int>(r)
    | SkNx_cast<int>(g) <<  8
    | SkNx_cast<int>(b) << 16
    | SkNx_cast<int>(a) << 24 ).store(ptr);
}

static void SK_VECTORCALL store_srgb_tail(SkRasterPipeline::Stage* st, size_t x,
                                          Sk4f r, Sk4f g, Sk4f b, Sk4f a,
                                          Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
    auto ptr = st->ctx<uint32_t*>() + x;

    auto rgba = sk_linear_to_srgb({r[0], g[0], b[0], 0});
    rgba = {rgba[0], rgba[1], rgba[2], 255.0f*a[0]};
    rgba = clamp(rgba);

    SkNx_cast<uint8_t>(rgba).store(ptr);
}

class SkRasterPipelineBench : public Benchmark {
public:
    SkRasterPipelineBench() {}

    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
    const char* onGetName() override { return "SkRasterPipelineBench"; }

    void onDraw(int loops, SkCanvas*) override {
        SkRasterPipeline p;
        p.append(load_s_srgb, load_s_srgb_tail, src);
        p.append(   scale_u8,    scale_u8_tail, mask);
        p.append(load_d_srgb, load_d_srgb_tail, dst);
        p.append(srcover);
        p.append( store_srgb,  store_srgb_tail, dst);

        while (loops --> 0) {
            p.run(N);
        }
    }
};
DEF_BENCH( return new SkRasterPipelineBench; )
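
For reference, here is a scalar sketch of the per-pixel math this pipeline performs. The helpers below are hypothetical stand-ins using the exact sRGB transfer curve; the pipeline itself uses the sk_linear_from_srgb table and the sk_linear_to_srgb approximation, so results may differ in the low bits:

// Scalar sketch (hypothetical helpers, not this CL's code) of what the
// benchmark pipeline computes for one premultiplied 8888 pixel.
#include <cmath>
#include <cstdint>

static float exact_srgb_to_linear(uint8_t v8) {
    float v = v8 * (1/255.0f);
    return v <= 0.04045f ? v * (1/12.92f)
                         : std::pow((v + 0.055f) * (1/1.055f), 2.4f);
}

static uint32_t exact_linear_to_srgb(float v) {   // returns a clamped [0,255] byte
    v = v <= 0.0031308f ? v * 12.92f
                        : 1.055f * std::pow(v, 1/2.4f) - 0.055f;
    float s = v * 255.0f;
    return (uint32_t)(s < 0 ? 0 : s > 255 ? 255 : s);
}

static uint32_t blend_one(uint32_t dst, uint32_t src, uint8_t mask) {
    float c = mask * (1/255.0f);                             // scale_u8
    float r = exact_srgb_to_linear((src >>  0) & 0xff) * c,  // load_s_srgb
          g = exact_srgb_to_linear((src >>  8) & 0xff) * c,
          b = exact_srgb_to_linear((src >> 16) & 0xff) * c,
          a =                    ( (src >> 24) * (1/255.0f) ) * c;

    float A = 1.0f - a;                                      // srcover, with
    r += exact_srgb_to_linear((dst >>  0) & 0xff) * A;       // load_d_srgb inlined
    g += exact_srgb_to_linear((dst >>  8) & 0xff) * A;
    b += exact_srgb_to_linear((dst >> 16) & 0xff) * A;
    a +=                    ( (dst >> 24) * (1/255.0f) ) * A;

    float a255 = a * 255.0f;                                 // store_srgb
    uint32_t a8 = (uint32_t)(a255 < 0 ? 0 : a255 > 255 ? 255 : a255);
    return exact_linear_to_srgb(r) <<  0
         | exact_linear_to_srgb(g) <<  8
         | exact_linear_to_srgb(b) << 16
         | a8                      << 24;
}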


@@ -231,6 +231,7 @@
    '<(skia_src_path)/core/SkQuadClipper.cpp',
    '<(skia_src_path)/core/SkQuadClipper.h',
    '<(skia_src_path)/core/SkRasterClip.cpp',
    '<(skia_src_path)/core/SkRasterPipeline.cpp',
    '<(skia_src_path)/core/SkRasterizer.cpp',
    '<(skia_src_path)/core/SkReadBuffer.h',
    '<(skia_src_path)/core/SkReadBuffer.cpp',


@@ -0,0 +1,65 @@
/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "SkRasterPipeline.h"
SkRasterPipeline::SkRasterPipeline() {}
void SkRasterPipeline::append(SkRasterPipeline::Fn body, const void* body_ctx,
SkRasterPipeline::Fn tail, const void* tail_ctx) {
// We can't add more stages after being rewired to run().
SkASSERT(!fReadyToRun);
// For now, just stash the stage's function in its own fNext slot.
// We'll rewire our stages before running the pipeline so fNext makes sense.
fBody.push_back({ body, const_cast<void*>(body_ctx) });
fTail.push_back({ tail, const_cast<void*>(tail_ctx) });
}
void SkRasterPipeline::run(size_t n) {
if (fBody.empty() || fTail.empty()) {
return;
}
if (!fReadyToRun) {
auto rewire = [](Stages* stages) {
SkASSERT(!stages->empty());
// Rotate the fNext pointers so they point to the next function to
// call, not function we're currently calling as set by append().
auto start = stages->front().fNext;
for (int i = 0; i < stages->count() - 1; i++) {
(*stages)[i].fNext = (*stages)[i+1].fNext;
}
stages->back().fNext = start; // This is a pretty handy place to stash this.
};
rewire(&fBody);
rewire(&fTail);
fReadyToRun = true;
}
// It's fastest to start uninitialized if the compilers all let us. If not, next fastest is 0.
Sk4f v;
auto start_body = fBody.back().fNext, // See rewire().
start_tail = fTail.back().fNext;
auto body = fBody.begin(),
tail = fTail.begin();
size_t x = 0;
while (n >= 4) {
start_body(body, x, v,v,v,v, v,v,v,v);
x += 4;
n -= 4;
}
while (n > 0) {
start_tail(tail, x, v,v,v,v, v,v,v,v);
x += 1;
n -= 1;
}
}
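
The rewiring is the subtle part of run(): append() leaves each stage's own function in its fNext slot, and rewire() rotates those pointers one step so each fNext names the function of the *following* stage, stashing the entry point in the back slot. A standalone sketch (simplified scalar types and hypothetical names, not part of this CL) of the same rotation:

// Sketch of the fNext rotation run() performs, with an int "pipeline"
// instead of eight Sk4f registers.
#include <cstdio>
#include <vector>

struct Stage;
using Fn = void (*)(Stage*, int);

struct Stage {
    Fn    fNext;   // after rewiring: the *next* stage's function
    void* fCtx;
    void next(int v) { fNext(this + 1, v); }  // stages are contiguous in memory
};

static void add_one(Stage* st, int v) { st->next(v + 1); }
static void print  (Stage*,    int v) { std::printf("%d\n", v); }  // returns: ends pipeline

int main() {
    // append() would stash each stage's own function in its fNext slot...
    std::vector<Stage> stages = { {add_one, nullptr}, {print, nullptr} };

    // ...and run() rotates them so each fNext points at the next function.
    Fn start = stages.front().fNext;                 // add_one
    for (size_t i = 0; i + 1 < stages.size(); i++) {
        stages[i].fNext = stages[i + 1].fNext;
    }
    stages.back().fNext = start;  // handy place to stash the entry point

    start(stages.data(), 41);     // add_one runs, chains to print: prints 42
}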

src/core/SkRasterPipeline.h

@@ -0,0 +1,104 @@
/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkRasterPipeline_DEFINED
#define SkRasterPipeline_DEFINED
#include "SkNx.h"
#include "SkTArray.h"
#include "SkTypes.h"
/**
 * SkRasterPipeline provides a cheap way to chain together a pixel processing pipeline.
 *
 * It's particularly designed for situations where the potential pipeline is extremely
 * combinatoric: {N dst formats} x {M source formats} x {K mask formats} x {C transfer modes} ...
 * No one wants to write specialized routines for all those combinations, and if we did, we'd
 * end up bloating our code size dramatically.  SkRasterPipeline stages can be chained together
 * at runtime, so we can scale this problem linearly rather than combinatorically.
 *
 * Each stage is represented by a function conforming to a common interface, SkRasterPipeline::Fn,
 * and by an arbitrary context pointer.  Fn's arguments, and sometimes custom calling convention,
 * are designed to maximize the amount of data we can pass along the pipeline cheaply.
 * On many machines all arguments stay in registers the entire time.
 *
 * The meanings of Fn's arguments are sometimes fixed...
 *    - The Stage* always represents the current stage, mainly providing access to ctx().
 *    - The size_t is always the destination x coordinate.  If you need y, put it in your context.
 *    - By the time the shader's done, the first four vectors should hold source red,
 *      green, blue, and alpha, up to 4 pixels' worth each.
 *
 * ...and sometimes flexible:
 *    - In the shader, the first four vectors can be used for anything, e.g. sample coordinates.
 *    - The last four vectors are scratch registers that can be used to communicate between
 *      stages; transfer modes use these to hold the original destination pixel components.
 *
 * On some platforms the last four vectors are slower to work with than the other arguments.
 *
 * When done mutating its arguments and/or context, a stage can either:
 *    1) call st->next() with its mutated arguments, chaining to the next stage of the pipeline; or
 *    2) return, indicating the pipeline is complete for these pixels.
 *
 * Some obvious stages that typically return are those that write a color to a destination pointer,
 * but any stage can short-circuit the rest of the pipeline by returning instead of calling next().
 */
class SkRasterPipeline {
public:
    struct Stage;
    using Fn = void(SK_VECTORCALL *)(Stage*, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
                                                     Sk4f,Sk4f,Sk4f,Sk4f);
    struct Stage {
        template <typename T>
        T ctx() { return static_cast<T>(fCtx); }

        void SK_VECTORCALL next(size_t x, Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
                                          Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
            // Stages are logically a pipeline, and physically are contiguous in an array.
            // To get to the next stage, we just increment our pointer to the next array element.
            fNext(this+1, x, v0,v1,v2,v3, v4,v5,v6,v7);
        }

        // It makes next() a good bit cheaper if we hold the next function to call here,
        // rather than the logically simpler choice of the function implementing this stage.
        Fn    fNext;
        void* fCtx;
    };

    SkRasterPipeline();

    // Run the pipeline constructed with append(), walking x through [0,n),
    // generally in 4-pixel steps, but sometimes 1 pixel at a time.
    void run(size_t n);

    // Use this append() if your stage is sensitive to the number of pixels you're working with:
    //    - body will always be called for a full 4 pixels
    //    - tail will always be called for a single pixel
    // Typically this is only an essential distinction for stages that read or write memory.
    void append(Fn body, const void* body_ctx,
                Fn tail, const void* tail_ctx);

    // Most stages don't actually care if they're working on 4 or 1 pixel.
    void append(Fn fn, const void* ctx = nullptr) {
        this->append(fn, ctx, fn, ctx);
    }

    // Most 4-pixel and 1-pixel variants share the same context pointer.
    void append(Fn body, Fn tail, const void* ctx = nullptr) {
        this->append(body, ctx, tail, ctx);
    }

private:
    using Stages = SkSTArray<10, Stage, /*MEM_COPY=*/true>;

    Stages fBody,
           fTail;
    bool   fReadyToRun = false;
};

#endif  // SkRasterPipeline_DEFINED
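
To make run()'s 4-then-1 stepping concrete (this just restates the two loops in SkRasterPipeline.cpp, no new API): with the benchmark's N = 1023, body runs 255 times covering x = 0..1019 in groups of four, and tail finishes x = 1020, 1021, 1022 one pixel at a time. A tiny sketch that counts the calls:

#include <cstddef>
#include <cstdio>

// Trace how run(n) splits work between the body and tail stages.
int main() {
    size_t n = 1023, x = 0;
    int body_calls = 0, tail_calls = 0;
    while (n >= 4) { body_calls++; x += 4; n -= 4; }
    while (n >  0) { tail_calls++; x += 1; n -= 1; }
    std::printf("%d body calls, %d tail calls\n", body_calls, tail_calls);  // 255, 3
}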


@@ -386,6 +386,7 @@ public:
    SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); }
    SkNx operator & (const SkNx& o) const { return vandq_s32(fVec, o.fVec); }
    SkNx operator | (const SkNx& o) const { return vorrq_s32(fVec, o.fVec); }
    SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); }


@@ -150,6 +150,7 @@ public:
                            _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
    }
    SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
    SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
    SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
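
Context for the two one-line SkNx changes above: Sk4i gains operator| (vorrq_s32 on NEON, _mm_or_si128 on SSE) because store_srgb packs the four color channels back into 8888 pixels with shifts and ORs. A scalar sketch of that packing:

#include <cstdint>

// Scalar equivalent of the shift-and-OR packing store_srgb does four
// pixels at a time with Sk4i's << and the new operator|.
static uint32_t pack_8888(uint32_t r, uint32_t g, uint32_t b, uint32_t a) {
    return r << 0 | g << 8 | b << 16 | a << 24;  // each channel already clamped to [0,255]
}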


@@ -0,0 +1,87 @@
/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "Test.h"
#include "SkRasterPipeline.h"
// load needs two variants, one to load 4 values...
static void SK_VECTORCALL load(SkRasterPipeline::Stage* st, size_t x,
Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
auto ptr = st->ctx<const float*>();
v0 = Sk4f{ptr[x+0]};
v1 = Sk4f{ptr[x+1]};
v2 = Sk4f{ptr[x+2]};
v3 = Sk4f{ptr[x+3]};
st->next(x, v0,v1,v2,v3, v4,v5,v6,v7);
}
// ...and one to load a single value.
static void SK_VECTORCALL load_tail(SkRasterPipeline::Stage* st, size_t x,
Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
auto ptr = st->ctx<const float*>();
v0 = Sk4f{ptr[x]};
st->next(x, v0,v1,v2,v3, v4,v5,v6,v7);
}
// square doesn't really care how many of its inputs are active, nor does it need a context.
static void SK_VECTORCALL square(SkRasterPipeline::Stage* st, size_t x,
Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
v0 *= v0;
v1 *= v1;
v2 *= v2;
v3 *= v3;
st->next(x, v0,v1,v2,v3, v4,v5,v6,v7);
}
// Like load, store has a _tail variant. It ends the pipeline by returning.
static void SK_VECTORCALL store(SkRasterPipeline::Stage* st, size_t x,
Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
auto ptr = st->ctx<float*>();
ptr[x+0] = v0[0];
ptr[x+1] = v1[0];
ptr[x+2] = v2[0];
ptr[x+3] = v3[0];
}
static void SK_VECTORCALL store_tail(SkRasterPipeline::Stage* st, size_t x,
Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
auto ptr = st->ctx<float*>();
ptr[x+0] = v0[0];
}
DEF_TEST(SkRasterPipeline, r) {
    // We'll build up and run a simple pipeline that exercises the salient
    // mechanics of SkRasterPipeline:
    //    - context pointers
    //    - stages sensitive to the number of pixels
    //    - stages insensitive to the number of pixels
    //
    // This pipeline loads up some values, squares them, then writes them back to memory.

    const float src_vals[] = { 1,2,3,4,5 };
    float       dst_vals[] = { 0,0,0,0,0 };

    SkRasterPipeline p;
    p.append(load, load_tail, src_vals);
    p.append(square);
    p.append(store, store_tail, dst_vals);
    p.run(5);

    REPORTER_ASSERT(r, dst_vals[0] == 1);
    REPORTER_ASSERT(r, dst_vals[1] == 4);
    REPORTER_ASSERT(r, dst_vals[2] == 9);
    REPORTER_ASSERT(r, dst_vals[3] == 16);
    REPORTER_ASSERT(r, dst_vals[4] == 25);
}