ByteCode: Better signature for innerRun, avoid memory stomping

This centralizes the initial-lane-mask logic and makes the return-value
copying much more straightforward by just passing in the width. It also lets
us shrink the arrays in the interpreter pipeline stage to the correct size.

Also normalize some formatting and structure.

Change-Id: I446598dcdd550d88ff1db1afe7507f31fa96d1d7
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/222510
Commit-Queue: Brian Osman <brianosman@google.com>
Reviewed-by: Mike Klein <mtklein@google.com>
Author: Brian Osman, 2019-06-21 09:50:29 -04:00 (committed by Skia Commit-Bot)
parent 7924d9a4ae
commit 4b202a3ba9
2 changed files with 22 additions and 41 deletions
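
For orientation, here is a minimal, self-contained sketch (plain scalar C++, not the skvx-based interpreter types; kVecWidth and the function names are illustrative stand-ins) of the two ideas in this change: innerRun now derives its initial execution mask from the active width N by comparing a lane-index vector against it (the `gLanes < N` in the diff below), and that same N bounds how many floats get copied back per return slot, so callers can size their arrays exactly.

#include <cstdint>
#include <cstring>

constexpr int kVecWidth = 16;   // stand-in for Interpreter::VecWidth

// Lane i is active iff i < N: the scalar equivalent of building (gLanes < N)
// from the constant {0, 1, ..., 15} lane-index vector. The result is a
// left-justified mask of exactly N active lanes.
static void initialMask(int32_t mask[kVecWidth], int N) {
    for (int i = 0; i < kVecWidth; ++i) {
        mask[i] = (i < N) ? ~0 : 0;
    }
}

// With the width passed in, only N floats are written back per return slot,
// so the destination only needs room for N floats.
static void copyReturnSlot(float* dst, const float lanes[kVecWidth], int N) {
    std::memcpy(dst, lanes, N * sizeof(float));
}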


@@ -2554,18 +2554,12 @@ STAGE(callback, SkRasterPipeline_CallbackCtx* c) {
 }
 STAGE(interpreter, SkRasterPipeline_InterpreterCtx* c) {
-    // Logically we should just size these arrays by N, the width of our registers,
-    // but the interpreter has its own idea of 'width' in kVecWidth, so we make sure
-    // that we allocate enough space for it to memcpy its results back into our arrays.
-    //
-    // If N < kVecWidth, then we are doing more work than necessary in the interpreter.
-    // This is a known issue, and will be addressed at some point.
-    constexpr int COUNT = SkTMax<int>(N, SkSL::ByteCode::kVecWidth);
-    float rr[COUNT];
-    float gg[COUNT];
-    float bb[COUNT];
-    float aa[COUNT];
+    // If N is less than the interpreter's VecWidth, then we are doing more work than necessary in
+    // the interpreter. This is a known issue, and will be addressed at some point.
+    float rr[N];
+    float gg[N];
+    float bb[N];
+    float aa[N];
     size_t in_count, out_count;
     float* args[] = { rr, gg, bb, aa };
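
To make the sizing change above concrete, here is a small illustrative sketch (not Skia code; kVecWidth and writeBack are made up for the example) of the stomping hazard the old SkTMax-sized arrays guarded against: when the callee always copies a full vector's worth of floats, a destination holding only N < kVecWidth floats is overrun, so the stage had to over-allocate. Once the copy length is N, arrays sized exactly N are safe.

#include <cstring>

constexpr int kVecWidth = 16;   // illustrative SIMD width

// copyFullWidth models the old behaviour: the destination must hold at least
// kVecWidth floats or the copy stomps whatever follows it in memory.
// Passing false models the new behaviour: exactly n floats are written.
static void writeBack(float* dst, const float (&results)[kVecWidth], int n, bool copyFullWidth) {
    std::memcpy(dst, results, (copyFullWidth ? kVecWidth : n) * sizeof(float));
}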


@@ -348,8 +348,13 @@ static T vec_mod(T a, T b) {
     return a - skvx::trunc(a / b) * b;
 }
-void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, VValue* stack,
-              float* outReturn[], I32 initMask, VValue globals[], bool stripedOutput) {
+static void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, VValue* stack,
+                     float* outReturn[], VValue globals[], bool stripedOutput, int N) {
+    // Needs to be the first N non-negative integers, at least as large as VecWidth
+    static const Interpreter::I32 gLanes = {
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    };
     VValue* sp = stack + f->fParameterCount + f->fLocalCount - 1;
     auto POP = [&] { SkASSERT(sp >= stack); return *(sp--); };
@@ -363,7 +368,7 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, VValue* stack
     I32 maskStack[16]; // Combined masks (eg maskStack[0] & maskStack[1] & ...)
     I32 contStack[16]; // Continue flags for loops
     I32 loopStack[16]; // Loop execution masks
-    condStack[0] = maskStack[0] = initMask;
+    condStack[0] = maskStack[0] = (gLanes < N);
     contStack[0] = I32( 0);
     loopStack[0] = I32(~0);
     I32* condPtr = condStack;
@@ -722,22 +727,18 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, VValue* stack
                 int count = READ8();
                 if (frames.empty()) {
                     if (outReturn) {
-                        // TODO: This can be smarter, knowing that mask is left-justified
-                        I32 m = mask();
                         VValue* src = sp - count + 1;
                         if (stripedOutput) {
                             for (int i = 0; i < count; ++i) {
-                                memcpy(outReturn[i], &src->fFloat, VecWidth * sizeof(float));
-                                src += 1;
+                                memcpy(outReturn[i], &src->fFloat, N * sizeof(float));
+                                ++src;
                             }
                         } else {
                             float* outPtr = outReturn[0];
                             for (int i = 0; i < count; ++i) {
-                                for (int j = 0; j < VecWidth; ++j) {
-                                    if (m[j]) {
+                                for (int j = 0; j < N; ++j) {
                                     outPtr[count * j] = src->fFloat[j];
                                 }
-                                }
                                 ++outPtr;
                                 ++src;
                             }
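
As a side note on the two branches in the Return handling above, the following standalone sketch (simplified types; Lane stands in for Interpreter::VValue and kMaxLanes for VecWidth, so none of these names are the real API) shows the two output layouts that are now filled with exactly N values per slot: striped output gets one contiguous array per return slot, while the non-striped path interleaves the slots so that all of invocation j's values sit together.

#include <cstring>

constexpr int kMaxLanes = 16;               // illustrative, plays the role of VecWidth
struct Lane { float fFloat[kMaxLanes]; };   // simplified stand-in for Interpreter::VValue

static void copyReturn(const Lane* src, int count, int N,
                       float** outStriped, float* outInterleaved) {
    if (outStriped) {
        // Striped: outStriped[i] receives N contiguous floats for return slot i.
        for (int i = 0; i < count; ++i) {
            std::memcpy(outStriped[i], src[i].fFloat, N * sizeof(float));
        }
    } else {
        // Interleaved: slot i of invocation j lands at outInterleaved[j * count + i].
        for (int i = 0; i < count; ++i) {
            for (int j = 0; j < N; ++j) {
                outInterleaved[j * count + i] = src[i].fFloat[j];
            }
        }
    }
}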
@@ -998,12 +999,7 @@ void ByteCode::run(const ByteCodeFunction* f, float* args, float* outReturn, int
 #ifdef TRACE
     f->disassemble();
 #endif
-    Interpreter::VValue smallStack[128];
-    // Needs to be the first N non-negative integers, at least as large as VecWidth
-    static const Interpreter::I32 gLanes = {
-        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-    };
+    Interpreter::VValue stack[128];
     SkASSERT(uniformCount == (int)fInputSlots.size());
     Interpreter::VValue globals[32];
@@ -1013,10 +1009,7 @@ void ByteCode::run(const ByteCodeFunction* f, float* args, float* outReturn, int
     }
     while (N) {
-        Interpreter::VValue* stack = smallStack;
         int w = std::min(N, Interpreter::VecWidth);
-        N -= w;
         // Transpose args into stack
         {
@@ -1032,8 +1025,7 @@ void ByteCode::run(const ByteCodeFunction* f, float* args, float* outReturn, int
         bool stripedOutput = false;
         float** outArray = outReturn ? &outReturn : nullptr;
-        auto mask = w > gLanes;
-        innerRun(this, f, stack, outArray, mask, globals, stripedOutput);
+        innerRun(this, f, stack, outArray, globals, stripedOutput, w);
         // Transpose out parameters back
         {
@@ -1058,6 +1050,7 @@ void ByteCode::run(const ByteCodeFunction* f, float* args, float* outReturn, int
         if (outReturn) {
             outReturn += f->fReturnCount * w;
         }
+        N -= w;
     }
 }
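
For context, the surrounding loop in ByteCode::run now looks roughly like the toy model below (heavily simplified; runChunks, returnCount, and the elided transpose steps are illustrative, not the real API): N invocations are consumed in groups of at most the vector width, the output pointer advances by returnCount * w per group, and N is only decremented at the end of the iteration, after w has been used for the copies.

#include <algorithm>

constexpr int kVecWidth = 16;   // illustrative, plays the role of Interpreter::VecWidth

static void runChunks(int N, int returnCount, float* outReturn) {
    while (N) {
        int w = std::min(N, kVecWidth);     // lanes actually used this pass
        // ... transpose w invocations of the arguments onto the stack ...
        // ... innerRun executes w lanes, writing w values per return slot ...
        if (outReturn) {
            outReturn += returnCount * w;   // step past exactly what was written
        }
        N -= w;                             // consume w invocations last
    }
}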
@@ -1069,11 +1062,6 @@ void ByteCode::runStriped(const ByteCodeFunction* f, float* args[], int nargs, i
 #endif
     Interpreter::VValue stack[128];
-    // Needs to be the first N non-negative integers, at least as large as VecWidth
-    static const Interpreter::I32 gLanes = {
-        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-    };
     // innerRun just takes outArgs, so clear it if the count is zero
     if (outCount == 0) {
         outArgs = nullptr;
@@ -1097,8 +1085,7 @@ void ByteCode::runStriped(const ByteCodeFunction* f, float* args[], int nargs, i
     }
     bool stripedOutput = true;
-    auto mask = w > gLanes;
-    innerRun(this, f, stack, outArgs, mask, globals, stripedOutput);
+    innerRun(this, f, stack, outArgs, globals, stripedOutput, w);
     // Copy out parameters back
     int slot = 0;