Interpreter: Support striped inputs for less overhead

Change-Id: I8c7bd5ed3fb6aebbfb1c5c224acfd73862252621
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/220778
Commit-Queue: Brian Osman <brianosman@google.com>
Reviewed-by: Mike Klein <mtklein@google.com>
Reviewed-by: Mike Reed <reed@google.com>
This commit is contained in:
Brian Osman 2019-06-18 11:03:10 -04:00 committed by Skia Commit-Bot
parent fb32ddf622
commit 5d89b66ff6
6 changed files with 121 additions and 23 deletions

View File

@ -12,10 +12,11 @@
// Benchmarks the interpreter with a function that has a color-filter style signature
class SkSLInterpreterCFBench : public Benchmark {
public:
SkSLInterpreterCFBench(SkSL::String name, int pixels, const char* src)
: fName(SkStringPrintf("sksl_interp_cf_%d_%s", pixels, name.c_str()))
SkSLInterpreterCFBench(SkSL::String name, int pixels, bool striped, const char* src)
: fName(SkStringPrintf("sksl_interp_cf_%d_%d_%s", pixels, striped ? 1 : 0, name.c_str()))
, fSrc(src)
, fCount(pixels) {}
, fCount(pixels)
, fStriped(striped) {}
protected:
const char* onGetName() override {
@ -44,7 +45,18 @@ protected:
void onDraw(int loops, SkCanvas*) override {
for (int i = 0; i < loops; i++) {
fByteCode->run(fMain, fPixels.data(), nullptr, fCount, nullptr, 0);
if (fStriped) {
float* args[] = {
fPixels.data() + 0 * fCount,
fPixels.data() + 1 * fCount,
fPixels.data() + 2 * fCount,
fPixels.data() + 3 * fCount,
};
fByteCode->runStriped(fMain, args, 4, fCount, nullptr, 0);
} else {
fByteCode->run(fMain, fPixels.data(), nullptr, fCount, nullptr, 0);
}
}
}
@ -55,6 +67,7 @@ private:
const SkSL::ByteCodeFunction* fMain;
int fCount;
bool fStriped;
std::vector<float> fPixels;
typedef Benchmark INHERITED;
@ -62,16 +75,16 @@ private:
///////////////////////////////////////////////////////////////////////////////
DEF_BENCH(return new SkSLInterpreterCFBench("lumaToAlpha", 256, R"(
const char* kLumaToAlphaSrc = R"(
void main(inout float4 color) {
color.a = color.r*0.3 + color.g*0.6 + color.b*0.1;
color.r = 0;
color.g = 0;
color.b = 0;
}
)"));
)";
DEF_BENCH(return new SkSLInterpreterCFBench("hcf", 256, R"(
const char* kHighContrastFilterSrc = R"(
half ucontrast_Stage2;
half hue2rgb_Stage2(half p, half q, half t) {
if (t < 0) t += 1;
@ -129,7 +142,13 @@ DEF_BENCH(return new SkSLInterpreterCFBench("hcf", 256, R"(
color.rgb = sqrt(color.rgb);
color.rgb *= color.a;
}
)"));
)";
// Register every shader twice: once with striped == false and once with striped == true.
// The striped flag is formatted into the benchmark name as 0 or 1, so the two variants of a
// shader get distinct names.
DEF_BENCH(return new SkSLInterpreterCFBench("lumaToAlpha", 256, false, kLumaToAlphaSrc));
DEF_BENCH(return new SkSLInterpreterCFBench("lumaToAlpha", 256, true, kLumaToAlphaSrc));
DEF_BENCH(return new SkSLInterpreterCFBench("hcf", 256, false, kHighContrastFilterSrc));
DEF_BENCH(return new SkSLInterpreterCFBench("hcf", 256, true, kHighContrastFilterSrc));
class SkSLInterpreterSortBench : public Benchmark {
public:

View File

@ -404,13 +404,7 @@ public:
};
rec.fPipeline->append(SkRasterPipeline::callback, ctx);
} else {
struct InterpreterCtx : public SkRasterPipeline_CallbackCtx {
SkSL::ByteCode* byteCode;
SkSL::ByteCodeFunction* main;
const void* inputs;
int ninputs;
};
auto ctx = rec.fAlloc->make<InterpreterCtx>();
auto ctx = rec.fAlloc->make<SkRasterPipeline_InterpreterCtx>();
ctx->inputs = fInputs->data();
ctx->ninputs = fInputs->size() / 4;
@ -427,13 +421,8 @@ public:
fByteCode = c.toByteCode(*prog);
}
ctx->byteCode = fByteCode.get();
ctx->main = ctx->byteCode->fFunctions[0].get();
ctx->fn = [](SkRasterPipeline_CallbackCtx* arg, int active_pixels) {
auto ctx = (InterpreterCtx*)arg;
ctx->byteCode->run(ctx->main, ctx->rgba, nullptr, active_pixels,
(float*)ctx->inputs, ctx->ninputs);
};
rec.fPipeline->append(SkRasterPipeline::callback, ctx);
ctx->fn = ctx->byteCode->fFunctions[0].get();
rec.fPipeline->append(SkRasterPipeline::interpreter, ctx);
}
return true;
}

View File

@ -33,7 +33,7 @@
*/
#define SK_RASTER_PIPELINE_STAGES(M) \
M(callback) \
M(callback) M(interpreter) \
M(move_src_dst) M(move_dst_src) \
M(clamp_0) M(clamp_1) M(clamp_a) M(clamp_gamut) \
M(unpremul) M(premul) M(premul_dst) \
@ -150,6 +150,19 @@ struct SkRasterPipeline_CallbackCtx {
float* read_from = rgba;
};
// Forward declarations of the SkSL types that SkRasterPipeline_InterpreterCtx holds by pointer.
namespace SkSL {
struct ByteCode;
struct ByteCodeFunction;
}
// Context for the `interpreter` raster pipeline stage. The stage hands its r,g,b,a registers
// to byteCode->runStriped(fn, ...) and reads the results back into those registers.
struct SkRasterPipeline_InterpreterCtx {
// Byte code program on which the stage calls runStriped().
SkSL::ByteCode* byteCode;
// Function within byteCode that the stage runs.
SkSL::ByteCodeFunction* fn;
// Uniform data; the stage casts this to `const float*` and passes it as `uniforms`.
const void* inputs;
// Passed as `uniformCount`. NOTE(review): presumably a count of floats, not bytes -- confirm
// against the code that fills in this context.
int ninputs;
};
struct SkRasterPipeline_GradientCtx {
size_t stopCount;
float* fs[4];

View File

@ -10,6 +10,7 @@
#include "include/core/SkTypes.h"
#include "src/core/SkUtils.h" // unaligned_{load,store}
#include "src/sksl/SkSLByteCode.h"
// Every function in this file should be marked static and inline using SI.
#if defined(__clang__)
@ -2552,6 +2553,27 @@ STAGE(callback, SkRasterPipeline_CallbackCtx* c) {
load4(c->read_from,0, &r,&g,&b,&a);
}
STAGE(interpreter, SkRasterPipeline_InterpreterCtx* c) {
    // Spill each color register into its own float array so that the interpreter can be
    // handed one pointer per channel.
    float red[N];
    float green[N];
    float blue[N];
    float alpha[N];
    sk_unaligned_store(red,   r);
    sk_unaligned_store(green, g);
    sk_unaligned_store(blue,  b);
    sk_unaligned_store(alpha, a);

    float* channels[] = { red, green, blue, alpha };

    // A non-zero tail is used as the number of values to process; otherwise all N are.
    const auto count = tail ? tail : N;
    c->byteCode->runStriped(c->fn, channels, SK_ARRAY_COUNT(channels), count,
                            (const float*)c->inputs, c->ninputs);

    // Reload the registers from the arrays the interpreter was given.
    r = sk_unaligned_load<F>(red);
    g = sk_unaligned_load<F>(green);
    b = sk_unaligned_load<F>(blue);
    a = sk_unaligned_load<F>(alpha);
}
STAGE(gauss_a_to_rgba, Ctx::None) {
// x = 1 - x;
// exp(-x * x * 4) - 0.018f;
@ -3830,6 +3852,7 @@ STAGE_PP(swizzle, void* ctx) {
// If a pipeline uses these stages, it'll boot it out of lowp into highp.
#define NOT_IMPLEMENTED(st) static void (*st)(void) = nullptr;
NOT_IMPLEMENTED(callback)
NOT_IMPLEMENTED(interpreter)
NOT_IMPLEMENTED(unbounded_set_rgb)
NOT_IMPLEMENTED(unbounded_uniform_color)
NOT_IMPLEMENTED(unpremul)

View File

@ -1041,6 +1041,57 @@ void ByteCode::run(const ByteCodeFunction* f, float* args, float* outReturn, int
}
}
/**
 * Runs `f` over N values whose arguments are supplied "striped": one array per slot rather
 * than interleaved.
 *
 * @param f             Function to run. It must not return a value (fReturnCount == 0).
 * @param args          Array of `nargs` pointers; args[i] addresses the N floats for slot i.
 *                      Slots belonging to out-parameters are written back in place.
 *                      NOTE: the pointers stored in this array are advanced as values are
 *                      consumed, so on return each args[i] points N floats past where it
 *                      started. Callers must rebuild the array before reusing it.
 * @param nargs         Number of entries in `args`; must equal f->fParameterCount.
 * @param N             Number of values to process. Zero or a negative count does nothing.
 * @param uniforms      Values copied, in order, into the global slots listed in fInputSlots.
 * @param uniformCount  Number of floats in `uniforms`; must equal fInputSlots.size().
 */
void ByteCode::runStriped(const ByteCodeFunction* f, float* args[], int nargs, int N,
                          const float* uniforms, int uniformCount) const {
#ifdef TRACE
    disassemble(f);
#endif
    Interpreter::VValue stack[128];

    // Needs to be the first N non-negative integers, at least as large as VecWidth
    static const Interpreter::I32 gLanes = {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    };

    SkASSERT(f->fReturnCount == 0);
    SkASSERT(nargs == f->fParameterCount);
    // Every argument slot is copied into its own stack entry below, so they all have to fit.
    SkASSERT(nargs <= (int)SK_ARRAY_COUNT(stack));
    SkASSERT(uniformCount == (int)fInputSlots.size());

    Interpreter::VValue globals[32];
    SkASSERT((int)SK_ARRAY_COUNT(globals) >= fGlobalCount);
    for (uint8_t slot : fInputSlots) {
        globals[slot].fFloat = *uniforms++;
    }

    // Loop on `N > 0` rather than `N`: with a negative N, `w` would be negative and
    // `w * sizeof(float)` would wrap to an enormous memcpy length.
    while (N > 0) {
        // Number of values handled by this pass: a full vector, or whatever remains.
        int w = std::min(N, Interpreter::VecWidth);

        // Copy args into stack
        for (int i = 0; i < nargs; ++i) {
            memcpy(stack + i, args[i], w * sizeof(float));
        }

        // Lane mask handed to innerRun: set for lanes whose index is less than w.
        auto mask = w > gLanes;
        innerRun(this, f, stack, nullptr, mask, globals);

        // Copy out parameters back
        int slot = 0;
        for (const auto& p : f->fParameters) {
            if (p.fIsOutParameter) {
                for (int i = slot; i < slot + p.fSlotCount; ++i) {
                    memcpy(args[i], stack + i, w * sizeof(float));
                }
            }
            slot += p.fSlotCount;
        }

        // Step each argument pointer ahead
        for (int i = 0; i < nargs; ++i) {
            args[i] += w;
        }
        N -= w;
    }
}
} // namespace SkSL
#endif

View File

@ -195,6 +195,9 @@ struct SK_API ByteCode {
*/
void run(const ByteCodeFunction*, float* args, float* outReturn, int N,
const float* uniforms, int uniformCount) const;
/**
 * Runs the function over N values whose arguments are striped: args holds nargs pointers,
 * one per argument slot, each addressing N floats. The function must not return a value;
 * slots belonging to out-parameters are written back through the same pointers.
 * The pointers stored in args are advanced as values are consumed, so the array must be
 * rebuilt before it is reused. uniforms supplies uniformCount floats for the program's
 * input slots.
 */
void runStriped(const ByteCodeFunction*, float* args[], int nargs, int N,
const float* uniforms, int uniformCount) const;
};
}