Interpreter: Support striped inputs for less overhead
Change-Id: I8c7bd5ed3fb6aebbfb1c5c224acfd73862252621
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/220778
Commit-Queue: Brian Osman <brianosman@google.com>
Reviewed-by: Mike Klein <mtklein@google.com>
Reviewed-by: Mike Reed <reed@google.com>
This commit is contained in:
parent
fb32ddf622
commit
5d89b66ff6
@ -12,10 +12,11 @@
|
||||
// Benchmarks the interpreter with a function that has a color-filter style signature
|
||||
class SkSLInterpreterCFBench : public Benchmark {
|
||||
public:
|
||||
SkSLInterpreterCFBench(SkSL::String name, int pixels, const char* src)
|
||||
: fName(SkStringPrintf("sksl_interp_cf_%d_%s", pixels, name.c_str()))
|
||||
SkSLInterpreterCFBench(SkSL::String name, int pixels, bool striped, const char* src)
|
||||
: fName(SkStringPrintf("sksl_interp_cf_%d_%d_%s", pixels, striped ? 1 : 0, name.c_str()))
|
||||
, fSrc(src)
|
||||
, fCount(pixels) {}
|
||||
, fCount(pixels)
|
||||
, fStriped(striped) {}
|
||||
|
||||
protected:
|
||||
const char* onGetName() override {
|
||||
@ -44,7 +45,18 @@ protected:
|
||||
|
||||
void onDraw(int loops, SkCanvas*) override {
|
||||
for (int i = 0; i < loops; i++) {
|
||||
fByteCode->run(fMain, fPixels.data(), nullptr, fCount, nullptr, 0);
|
||||
if (fStriped) {
|
||||
float* args[] = {
|
||||
fPixels.data() + 0 * fCount,
|
||||
fPixels.data() + 1 * fCount,
|
||||
fPixels.data() + 2 * fCount,
|
||||
fPixels.data() + 3 * fCount,
|
||||
};
|
||||
|
||||
fByteCode->runStriped(fMain, args, 4, fCount, nullptr, 0);
|
||||
} else {
|
||||
fByteCode->run(fMain, fPixels.data(), nullptr, fCount, nullptr, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -55,6 +67,7 @@ private:
|
||||
const SkSL::ByteCodeFunction* fMain;
|
||||
|
||||
int fCount;
|
||||
bool fStriped;
|
||||
std::vector<float> fPixels;
|
||||
|
||||
typedef Benchmark INHERITED;
|
||||
@ -62,16 +75,16 @@ private:
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
DEF_BENCH(return new SkSLInterpreterCFBench("lumaToAlpha", 256, R"(
|
||||
const char* kLumaToAlphaSrc = R"(
|
||||
void main(inout float4 color) {
|
||||
color.a = color.r*0.3 + color.g*0.6 + color.b*0.1;
|
||||
color.r = 0;
|
||||
color.g = 0;
|
||||
color.b = 0;
|
||||
}
|
||||
)"));
|
||||
)";
|
||||
|
||||
DEF_BENCH(return new SkSLInterpreterCFBench("hcf", 256, R"(
|
||||
const char* kHighContrastFilterSrc = R"(
|
||||
half ucontrast_Stage2;
|
||||
half hue2rgb_Stage2(half p, half q, half t) {
|
||||
if (t < 0) t += 1;
|
||||
@ -129,7 +142,13 @@ DEF_BENCH(return new SkSLInterpreterCFBench("hcf", 256, R"(
|
||||
color.rgb = sqrt(color.rgb);
|
||||
color.rgb *= color.a;
|
||||
}
|
||||
)"));
|
||||
)";
|
||||
|
||||
DEF_BENCH(return new SkSLInterpreterCFBench("lumaToAlpha", 256, false, kLumaToAlphaSrc));
|
||||
DEF_BENCH(return new SkSLInterpreterCFBench("lumaToAlpha", 256, true, kLumaToAlphaSrc));
|
||||
|
||||
DEF_BENCH(return new SkSLInterpreterCFBench("hcf", 256, false, kHighContrastFilterSrc));
|
||||
DEF_BENCH(return new SkSLInterpreterCFBench("hcf", 256, true, kHighContrastFilterSrc));
|
||||
|
||||
class SkSLInterpreterSortBench : public Benchmark {
|
||||
public:
|
||||
|
@ -404,13 +404,7 @@ public:
|
||||
};
|
||||
rec.fPipeline->append(SkRasterPipeline::callback, ctx);
|
||||
} else {
|
||||
struct InterpreterCtx : public SkRasterPipeline_CallbackCtx {
|
||||
SkSL::ByteCode* byteCode;
|
||||
SkSL::ByteCodeFunction* main;
|
||||
const void* inputs;
|
||||
int ninputs;
|
||||
};
|
||||
auto ctx = rec.fAlloc->make<InterpreterCtx>();
|
||||
auto ctx = rec.fAlloc->make<SkRasterPipeline_InterpreterCtx>();
|
||||
ctx->inputs = fInputs->data();
|
||||
ctx->ninputs = fInputs->size() / 4;
|
||||
|
||||
@ -427,13 +421,8 @@ public:
|
||||
fByteCode = c.toByteCode(*prog);
|
||||
}
|
||||
ctx->byteCode = fByteCode.get();
|
||||
ctx->main = ctx->byteCode->fFunctions[0].get();
|
||||
ctx->fn = [](SkRasterPipeline_CallbackCtx* arg, int active_pixels) {
|
||||
auto ctx = (InterpreterCtx*)arg;
|
||||
ctx->byteCode->run(ctx->main, ctx->rgba, nullptr, active_pixels,
|
||||
(float*)ctx->inputs, ctx->ninputs);
|
||||
};
|
||||
rec.fPipeline->append(SkRasterPipeline::callback, ctx);
|
||||
ctx->fn = ctx->byteCode->fFunctions[0].get();
|
||||
rec.fPipeline->append(SkRasterPipeline::interpreter, ctx);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -33,7 +33,7 @@
|
||||
*/
|
||||
|
||||
#define SK_RASTER_PIPELINE_STAGES(M) \
|
||||
M(callback) \
|
||||
M(callback) M(interpreter) \
|
||||
M(move_src_dst) M(move_dst_src) \
|
||||
M(clamp_0) M(clamp_1) M(clamp_a) M(clamp_gamut) \
|
||||
M(unpremul) M(premul) M(premul_dst) \
|
||||
@ -150,6 +150,19 @@ struct SkRasterPipeline_CallbackCtx {
|
||||
float* read_from = rgba;
|
||||
};
|
||||
|
||||
// Forward declarations only: this context stores pointers, so the full SkSL
// byte code definitions are not needed here.
namespace SkSL { struct ByteCode; struct ByteCodeFunction; }  // namespace SkSL

// Context consumed by the `interpreter` raster pipeline stage.
struct SkRasterPipeline_InterpreterCtx {
    // Program whose runStriped() the stage invokes.
    SkSL::ByteCode*         byteCode;
    // Function within `byteCode` that the stage runs.
    SkSL::ByteCodeFunction* fn;

    // Uniform data; the stage casts this to `const float*` before handing it
    // to the interpreter.
    const void*             inputs;
    // Passed to the interpreter as the uniform count.
    int                     ninputs;
};
|
||||
|
||||
struct SkRasterPipeline_GradientCtx {
|
||||
size_t stopCount;
|
||||
float* fs[4];
|
||||
|
@ -10,6 +10,7 @@
|
||||
|
||||
#include "include/core/SkTypes.h"
|
||||
#include "src/core/SkUtils.h" // unaligned_{load,store}
|
||||
#include "src/sksl/SkSLByteCode.h"
|
||||
|
||||
// Every function in this file should be marked static and inline using SI.
|
||||
#if defined(__clang__)
|
||||
@ -2552,6 +2553,27 @@ STAGE(callback, SkRasterPipeline_CallbackCtx* c) {
|
||||
load4(c->read_from,0, &r,&g,&b,&a);
|
||||
}
|
||||
|
||||
// Runs the SkSL byte code function in `c` over the pixels currently held in
// r, g, b and a. The interpreter takes one plain float array per channel, so
// the registers are spilled to scratch memory, processed in place, and then
// reloaded.
STAGE(interpreter, SkRasterPipeline_InterpreterCtx* c) {
    // One scratch row per channel, in r, g, b, a order.
    float channels[4][N];
    sk_unaligned_store(channels[0], r);
    sk_unaligned_store(channels[1], g);
    sk_unaligned_store(channels[2], b);
    sk_unaligned_store(channels[3], a);

    float* args[] = { channels[0], channels[1], channels[2], channels[3] };

    // A non-zero tail means only that many pixels are live; otherwise all N are.
    c->byteCode->runStriped(c->fn, args, SK_ARRAY_COUNT(args), tail ? tail : N,
                            static_cast<const float*>(c->inputs), c->ninputs);

    r = sk_unaligned_load<F>(channels[0]);
    g = sk_unaligned_load<F>(channels[1]);
    b = sk_unaligned_load<F>(channels[2]);
    a = sk_unaligned_load<F>(channels[3]);
}
|
||||
|
||||
STAGE(gauss_a_to_rgba, Ctx::None) {
|
||||
// x = 1 - x;
|
||||
// exp(-x * x * 4) - 0.018f;
|
||||
@ -3830,6 +3852,7 @@ STAGE_PP(swizzle, void* ctx) {
|
||||
// If a pipeline uses these stages, it'll boot it out of lowp into highp.
|
||||
#define NOT_IMPLEMENTED(st) static void (*st)(void) = nullptr;
|
||||
NOT_IMPLEMENTED(callback)
|
||||
NOT_IMPLEMENTED(interpreter)
|
||||
NOT_IMPLEMENTED(unbounded_set_rgb)
|
||||
NOT_IMPLEMENTED(unbounded_uniform_color)
|
||||
NOT_IMPLEMENTED(unpremul)
|
||||
|
@ -1041,6 +1041,57 @@ void ByteCode::run(const ByteCodeFunction* f, float* args, float* outReturn, int
|
||||
}
|
||||
}
|
||||
|
||||
void ByteCode::runStriped(const ByteCodeFunction* f, float* args[], int nargs, int N,
|
||||
const float* uniforms, int uniformCount) const {
|
||||
#ifdef TRACE
|
||||
disassemble(f);
|
||||
#endif
|
||||
Interpreter::VValue stack[128];
|
||||
|
||||
// Needs to be the first N non-negative integers, at least as large as VecWidth
|
||||
static const Interpreter::I32 gLanes = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
};
|
||||
|
||||
SkASSERT(f->fReturnCount == 0);
|
||||
SkASSERT(nargs == f->fParameterCount);
|
||||
SkASSERT(uniformCount == (int)fInputSlots.size());
|
||||
Interpreter::VValue globals[32];
|
||||
SkASSERT((int)SK_ARRAY_COUNT(globals) >= fGlobalCount);
|
||||
for (uint8_t slot : fInputSlots) {
|
||||
globals[slot].fFloat = *uniforms++;
|
||||
}
|
||||
|
||||
while (N) {
|
||||
int w = std::min(N, Interpreter::VecWidth);
|
||||
|
||||
// Copy args into stack
|
||||
for (int i = 0; i < nargs; ++i) {
|
||||
memcpy(stack + i, args[i], w * sizeof(float));
|
||||
}
|
||||
|
||||
auto mask = w > gLanes;
|
||||
innerRun(this, f, stack, nullptr, mask, globals);
|
||||
|
||||
// Copy out parameters back
|
||||
int slot = 0;
|
||||
for (const auto& p : f->fParameters) {
|
||||
if (p.fIsOutParameter) {
|
||||
for (int i = slot; i < slot + p.fSlotCount; ++i) {
|
||||
memcpy(args[i], stack + i, w * sizeof(float));
|
||||
}
|
||||
}
|
||||
slot += p.fSlotCount;
|
||||
}
|
||||
|
||||
// Step each argument pointer ahead
|
||||
for (int i = 0; i < nargs; ++i) {
|
||||
args[i] += w;
|
||||
}
|
||||
N -= w;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace SkSL
|
||||
|
||||
#endif
|
||||
|
@ -195,6 +195,9 @@ struct SK_API ByteCode {
|
||||
*/
|
||||
void run(const ByteCodeFunction*, float* args, float* outReturn, int N,
|
||||
const float* uniforms, int uniformCount) const;
|
||||
|
||||
/**
 * Like run(), but the arguments are striped: args[i] points at N floats
 * holding parameter slot i for each of the N invocations. `nargs` must equal
 * the function's parameter count and the function must not return a value.
 * Out-parameter slots are written back into the arrays args[i] points at.
 * The pointers stored in `args` are advanced during the call.
 */
void runStriped(const ByteCodeFunction*, float* args[], int nargs, int N,
|
||||
const float* uniforms, int uniformCount) const;
|
||||
};
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user