Interpreter: Vectorized interpreter

This converts the SkSL interpreter to operate in SIMT fashion. It handles
all the same features as the previous scalar implementation, but operates
on N lanes at a time (currently 16; see VecWidth in the interpreter).

It's modeled after GPUs and other parallel architectures, using execution
masks to handle control flow, including control flow that diverges between lanes.

Change-Id: Ieb38ffe2f55a10f72bdab844c297126fe9bedb6c
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/217122
Commit-Queue: Brian Osman <brianosman@google.com>
Reviewed-by: Mike Klein <mtklein@google.com>
This commit is contained in:
Brian Osman 2019-06-13 11:23:57 -04:00 committed by Skia Commit-Bot
parent d608e224b2
commit 569f12f0e5
9 changed files with 792 additions and 238 deletions

View File

@ -0,0 +1,223 @@
/*
* Copyright 2019 Google LLC
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "bench/Benchmark.h"
#include "include/utils/SkRandom.h"
#include "src/sksl/SkSLCompiler.h"
#include "src/sksl/SkSLInterpreter.h"
// Benchmarks the interpreter with a function that has a color-filter style signature
class SkSLInterpreterCFBench : public Benchmark {
public:
SkSLInterpreterCFBench(SkSL::String name, int pixels, const char* src)
: fName(SkStringPrintf("sksl_interp_cf_%d_%s", pixels, name.c_str()))
, fSrc(src)
, fCount(pixels) {}
protected:
const char* onGetName() override {
return fName.c_str();
}
bool isSuitableFor(Backend backend) override {
return backend == kNonRendering_Backend;
}
void onDelayedSetup() override {
SkSL::Compiler compiler;
SkSL::Program::Settings settings;
auto program = compiler.convertProgram(SkSL::Program::kGeneric_Kind, fSrc, settings);
SkASSERT(compiler.errorCount() == 0);
fByteCode = compiler.toByteCode(*program);
SkASSERT(compiler.errorCount() == 0);
fMain = fByteCode->getFunction("main");
SkRandom rnd;
fPixels.resize(fCount);
for (int i = 0; i < fCount; ++i) {
fPixels[i] = SkColor4f::FromColor(rnd.nextU());
}
}
void onDraw(int loops, SkCanvas*) override {
for (int i = 0; i < loops; i++) {
SkSL::Interpreter::VecRun(fByteCode.get(), fMain,
(SkSL::Interpreter::Value*)fPixels.data(), nullptr, fCount,
nullptr, 0);
}
}
private:
SkString fName;
SkSL::String fSrc;
std::unique_ptr<SkSL::ByteCode> fByteCode;
const SkSL::ByteCodeFunction* fMain;
int fCount;
std::vector<SkColor4f> fPixels;
typedef Benchmark INHERITED;
};
///////////////////////////////////////////////////////////////////////////////
// "lumaToAlpha", 256 pixels: straight-line SkSL with no branches. It stores a weighted sum of
// r, g and b (weights 0.3, 0.6, 0.1) in alpha and then zeroes the three color channels.
DEF_BENCH(return new SkSLInterpreterCFBench("lumaToAlpha", 256, R"(
void main(inout float4 color) {
color.a = color.r*0.3 + color.g*0.6 + color.b*0.1;
color.r = 0;
color.g = 0;
color.b = 0;
}
)"));
// "hcf", 256 pixels: the SkSL body is labelled "HighContrastFilter" by its own comment. Unlike
// lumaToAlpha it contains helper-function calls, a global (ucontrast_Stage2), nested if/else
// chains, early returns and ternaries, so it exercises the interpreter's control-flow handling.
// Nothing may be added inside the raw string: it is the program text fed to the compiler.
DEF_BENCH(return new SkSLInterpreterCFBench("hcf", 256, R"(
half ucontrast_Stage2;
half hue2rgb_Stage2(half p, half q, half t) {
if (t < 0) t += 1;
if (t > 1) t -= 1;
if (t < 1 / 6.) return p + (q - p) * 6 * t;
if (t < 1 / 2.) return q;
if (t < 2 / 3.) return p + (q - p) * (2 / 3. - t) * 6;
return p;
}
half max(half a, half b) { return a > b ? a : b; }
half min(half a, half b) { return a < b ? a : b; }
void main(inout half4 color) {
ucontrast_Stage2 = 0.2;
// HighContrastFilter
half nonZeroAlpha = max(color.a, 0.0001);
color = half4(color.rgb / nonZeroAlpha, nonZeroAlpha);
color.rgb = color.rgb * color.rgb;
half fmax = max(color.r, max(color.g, color.b));
half fmin = min(color.r, min(color.g, color.b));
half l = (fmax + fmin) / 2;
half h;
half s;
if (fmax == fmin) {
h = 0;
s = 0;
} else {
half d = fmax - fmin;
s = l > 0.5 ? d / (2 - fmax - fmin) : d / (fmax + fmin);
if (color.r >= color.g && color.r >= color.b) {
h = (color.g - color.b) / d + (color.g < color.b ? 6 : 0);
} else if (color.g >= color.b) {
h = (color.b - color.r) / d + 2;
} else {
h = (color.r - color.g) / d + 4;
}
}
h /= 6;
l = 1.0 - l;
if (s == 0) {
color = half4(l, l, l, 0);
} else {
half q = l < 0.5 ? l * (1 + s) : l + s - l * s;
half p = 2 * l - q;
color.r = hue2rgb_Stage2(p, q, h + 1 / 3.);
color.g = hue2rgb_Stage2(p, q, h);
color.b = hue2rgb_Stage2(p, q, h - 1 / 3.);
}
if (ucontrast_Stage2 != 0) {
half m = (1 + ucontrast_Stage2) / (1 - ucontrast_Stage2);
half off = (-0.5 * m + 0.5);
color = m * color + off;
}
// color = saturate(color);
color.rgb = sqrt(color.rgb);
color.rgb *= color.a;
}
)"));
class SkSLInterpreterSortBench : public Benchmark {
public:
SkSLInterpreterSortBench(int groups, int values, const char* src)
: fName(SkStringPrintf("sksl_interp_sort_%dx%d", groups, values))
, fCode(src)
, fGroups(groups)
, fValues(values) {
}
protected:
const char* onGetName() override {
return fName.c_str();
}
bool isSuitableFor(Backend backend) override {
return backend == kNonRendering_Backend;
}
void onDelayedSetup() override {
SkSL::Compiler compiler;
SkSL::Program::Settings settings;
auto program = compiler.convertProgram(SkSL::Program::kGeneric_Kind, fCode, settings);
SkASSERT(compiler.errorCount() == 0);
fByteCode = compiler.toByteCode(*program);
SkASSERT(compiler.errorCount() == 0);
fMain = fByteCode->getFunction("main");
fSrc.resize(fGroups * fValues);
fDst.resize(fGroups * fValues);
SkRandom rnd;
for (auto& x : fSrc) {
x = rnd.nextS();
}
// Trigger one run now to check correctness
SkSL::Interpreter::VecRun(fByteCode.get(), fMain,
(SkSL::Interpreter::Value*)fSrc.data(),
(SkSL::Interpreter::Value*)fDst.data(),
fGroups,
nullptr, 0);
for (int i = 0; i < fGroups; ++i) {
for (int j = 1; j < fValues; ++j) {
SkASSERT(fDst[i * fValues + j] >= fDst[i * fValues + j - 1]);
}
}
}
void onDraw(int loops, SkCanvas*) override {
for (int i = 0; i < loops; i++) {
SkSL::Interpreter::VecRun(fByteCode.get(), fMain,
(SkSL::Interpreter::Value*)fSrc.data(),
(SkSL::Interpreter::Value*)fDst.data(),
fGroups,
nullptr, 0);
}
}
private:
SkString fName;
SkSL::String fCode;
std::unique_ptr<SkSL::ByteCode> fByteCode;
const SkSL::ByteCodeFunction* fMain;
int fGroups;
int fValues;
std::vector<int32_t> fSrc;
std::vector<int32_t> fDst;
typedef Benchmark INHERITED;
};
// Insertion sort of 32 ints per group, over 1024 groups. Compiled out for now:
// currently, this exceeds the interpreter's stack. Consider it a test case for some eventual
// bounds checking.
#if 0
DEF_BENCH(return new SkSLInterpreterSortBench(1024, 32, R"(
int[32] main(int v[32]) {
for (int i = 1; i < 32; ++i) {
for (int j = i; j > 0 && v[j-1] > v[j]; --j) {
int t = v[j];
v[j] = v[j-1];
v[j-1] = t;
}
}
return v;
}
)"));
#endif

View File

@ -111,6 +111,7 @@ bench_sources = [
"$_bench/SkVMBench.cpp",
"$_bench/SKPBench.cpp",
"$_bench/SkSLBench.cpp",
"$_bench/SkSLInterpreterBench.cpp",
"$_bench/StreamBench.cpp",
"$_bench/SortBench.cpp",
"$_bench/StrokeBench.cpp",

View File

@ -425,12 +425,10 @@ public:
ctx->main = ctx->byteCode->fFunctions[0].get();
ctx->fn = [](SkRasterPipeline_CallbackCtx* arg, int active_pixels) {
auto ctx = (InterpreterCtx*)arg;
for (int i = 0; i < active_pixels; i++) {
SkSL::Interpreter::Run(ctx->byteCode.get(), ctx->main,
(SkSL::Interpreter::Value*) (ctx->rgba + i * 4),
nullptr, (SkSL::Interpreter::Value*)ctx->inputs,
ctx->ninputs);
}
SkSL::Interpreter::VecRun(ctx->byteCode.get(), ctx->main,
(SkSL::Interpreter::Value*)ctx->rgba,
nullptr, active_pixels,
(SkSL::Interpreter::Value*)ctx->inputs, ctx->ninputs);
};
rec.fPipeline->append(SkRasterPipeline::callback, ctx);
}

View File

@ -49,8 +49,6 @@ enum class ByteCodeInstruction : uint16_t {
VECTOR(kCompareUGTEQ),
VECTOR(kCompareULT),
VECTOR(kCompareULTEQ),
// Followed by a 16 bit address
kConditionalBranch,
VECTOR(kConvertFtoI),
VECTOR(kConvertStoF),
VECTOR(kConvertUtoF),
@ -83,7 +81,7 @@ enum class ByteCodeInstruction : uint16_t {
VECTOR(kMix),
VECTOR_MATRIX(kMultiplyF),
VECTOR(kMultiplyI),
kNot,
kNotB,
kOrB,
VECTOR_MATRIX(kPop),
// Followed by a 32 bit value containing the value to push
@ -124,6 +122,22 @@ enum class ByteCodeInstruction : uint16_t {
VECTOR(kTan),
// Followed by a byte indicating external value to write
VECTOR(kWriteExternal),
kXorB,
kMaskPush,
kMaskPop,
kMaskNegate,
// Followed by count byte
kMaskBlend,
// Followed by address
kBranchIfAllFalse,
kLoopBegin,
kLoopNext,
kLoopMask,
kLoopEnd,
kLoopBreak,
kLoopContinue,
};
#undef VECTOR

View File

@ -459,6 +459,24 @@ bool ByteCodeGenerator::writeBinaryExpression(const BinaryExpression& b, bool di
ByteCodeInstruction::kMultiplyF,
count);
break;
case Token::Kind::LOGICALAND:
SkASSERT(type_category(lType) == SkSL::TypeCategory::kBool && count == 1);
this->write(ByteCodeInstruction::kAndB);
break;
case Token::Kind::LOGICALNOT:
SkASSERT(type_category(lType) == SkSL::TypeCategory::kBool && count == 1);
this->write(ByteCodeInstruction::kNotB);
break;
case Token::Kind::LOGICALOR:
SkASSERT(type_category(lType) == SkSL::TypeCategory::kBool && count == 1);
this->write(ByteCodeInstruction::kOrB);
break;
case Token::Kind::LOGICALXOR:
SkASSERT(type_category(lType) == SkSL::TypeCategory::kBool && count == 1);
this->write(ByteCodeInstruction::kXorB);
break;
default:
SkASSERT(false);
}
@ -472,7 +490,7 @@ bool ByteCodeGenerator::writeBinaryExpression(const BinaryExpression& b, bool di
void ByteCodeGenerator::writeBoolLiteral(const BoolLiteral& b) {
this->write(ByteCodeInstruction::kPushImmediate);
this->write32(b.fValue ? 1 : 0);
this->write32(b.fValue ? ~0 : 0);
}
void ByteCodeGenerator::writeConstructor(const Constructor& c) {
@ -744,14 +762,12 @@ void ByteCodeGenerator::writeSwizzle(const Swizzle& s) {
void ByteCodeGenerator::writeTernaryExpression(const TernaryExpression& t) {
this->writeExpression(*t.fTest);
this->write(ByteCodeInstruction::kConditionalBranch);
DeferredLocation trueLocation(this);
this->writeExpression(*t.fIfFalse);
this->write(ByteCodeInstruction::kBranch);
DeferredLocation endLocation(this);
trueLocation.set();
this->write(ByteCodeInstruction::kMaskPush);
this->writeExpression(*t.fIfTrue);
endLocation.set();
this->write(ByteCodeInstruction::kMaskNegate);
this->writeExpression(*t.fIfFalse);
this->write(ByteCodeInstruction::kMaskBlend);
this->write8(SlotCount(t.fType));
}
void ByteCodeGenerator::writeExpression(const Expression& e, bool discard) {
@ -976,25 +992,29 @@ void ByteCodeGenerator::setContinueTargets() {
}
void ByteCodeGenerator::writeBreakStatement(const BreakStatement& b) {
this->write(ByteCodeInstruction::kBranch);
fBreakTargets.top().emplace_back(this);
// TODO: Include BranchIfAllFalse to top-most LoopNext
this->write(ByteCodeInstruction::kLoopBreak);
}
void ByteCodeGenerator::writeContinueStatement(const ContinueStatement& c) {
this->write(ByteCodeInstruction::kBranch);
fContinueTargets.top().emplace_back(this);
// TODO: Include BranchIfAllFalse to top-most LoopNext
this->write(ByteCodeInstruction::kLoopContinue);
}
void ByteCodeGenerator::writeDoStatement(const DoStatement& d) {
fContinueTargets.emplace();
fBreakTargets.emplace();
this->write(ByteCodeInstruction::kLoopBegin);
size_t start = fCode->size();
this->writeStatement(*d.fStatement);
this->setContinueTargets();
this->write(ByteCodeInstruction::kLoopNext);
this->writeExpression(*d.fTest);
this->write(ByteCodeInstruction::kConditionalBranch);
this->write(ByteCodeInstruction::kLoopMask);
// TODO: Could shorten this with kBranchIfAnyTrue
this->write(ByteCodeInstruction::kBranchIfAllFalse);
DeferredLocation endLocation(this);
this->write(ByteCodeInstruction::kBranch);
this->write16(start);
this->setBreakTargets();
endLocation.set();
this->write(ByteCodeInstruction::kLoopEnd);
}
void ByteCodeGenerator::writeForStatement(const ForStatement& f) {
@ -1003,53 +1023,40 @@ void ByteCodeGenerator::writeForStatement(const ForStatement& f) {
if (f.fInitializer) {
this->writeStatement(*f.fInitializer);
}
this->write(ByteCodeInstruction::kLoopBegin);
size_t start = fCode->size();
if (f.fTest) {
this->writeExpression(*f.fTest);
this->write(ByteCodeInstruction::kNot);
this->write(ByteCodeInstruction::kConditionalBranch);
DeferredLocation endLocation(this);
this->writeStatement(*f.fStatement);
this->setContinueTargets();
if (f.fNext) {
this->writeExpression(*f.fNext, true);
}
this->write(ByteCodeInstruction::kBranch);
this->write16(start);
endLocation.set();
} else {
this->writeStatement(*f.fStatement);
this->setContinueTargets();
if (f.fNext) {
this->writeExpression(*f.fNext, true);
}
this->write(ByteCodeInstruction::kBranch);
this->write16(start);
this->write(ByteCodeInstruction::kLoopMask);
}
this->setBreakTargets();
this->write(ByteCodeInstruction::kBranchIfAllFalse);
DeferredLocation endLocation(this);
this->writeStatement(*f.fStatement);
this->write(ByteCodeInstruction::kLoopNext);
if (f.fNext) {
this->writeExpression(*f.fNext, true);
}
this->write(ByteCodeInstruction::kBranch);
this->write16(start);
endLocation.set();
this->write(ByteCodeInstruction::kLoopEnd);
}
void ByteCodeGenerator::writeIfStatement(const IfStatement& i) {
this->writeExpression(*i.fTest);
this->write(ByteCodeInstruction::kMaskPush);
this->write(ByteCodeInstruction::kBranchIfAllFalse);
DeferredLocation falseLocation(this);
this->writeStatement(*i.fIfTrue);
falseLocation.set();
if (i.fIfFalse) {
// if (test) { ..ifTrue.. } else { .. ifFalse .. }
this->writeExpression(*i.fTest);
this->write(ByteCodeInstruction::kConditionalBranch);
DeferredLocation trueLocation(this);
this->write(ByteCodeInstruction::kMaskNegate);
this->write(ByteCodeInstruction::kBranchIfAllFalse);
DeferredLocation endLocation(this);
this->writeStatement(*i.fIfFalse);
this->write(ByteCodeInstruction::kBranch);
DeferredLocation endLocation(this);
trueLocation.set();
this->writeStatement(*i.fIfTrue);
endLocation.set();
} else {
// if (test) { ..ifTrue.. }
this->writeExpression(*i.fTest);
this->write(ByteCodeInstruction::kNot);
this->write(ByteCodeInstruction::kConditionalBranch);
DeferredLocation endLocation(this);
this->writeStatement(*i.fIfTrue);
endLocation.set();
}
this->write(ByteCodeInstruction::kMaskPop);
}
void ByteCodeGenerator::writeReturnStatement(const ReturnStatement& r) {
@ -1086,19 +1093,18 @@ void ByteCodeGenerator::writeVarDeclarations(const VarDeclarations& v) {
}
void ByteCodeGenerator::writeWhileStatement(const WhileStatement& w) {
fContinueTargets.emplace();
fBreakTargets.emplace();
size_t start = fCode->size();
this->write(ByteCodeInstruction::kLoopBegin);
size_t cond = fCode->size();
this->writeExpression(*w.fTest);
this->write(ByteCodeInstruction::kNot);
this->write(ByteCodeInstruction::kConditionalBranch);
this->write(ByteCodeInstruction::kLoopMask);
this->write(ByteCodeInstruction::kBranchIfAllFalse);
DeferredLocation endLocation(this);
this->writeStatement(*w.fStatement);
this->setContinueTargets();
this->write(ByteCodeInstruction::kLoopNext);
this->write(ByteCodeInstruction::kBranch);
this->write16(start);
this->write16(cond);
endLocation.set();
this->setBreakTargets();
this->write(ByteCodeInstruction::kLoopEnd);
}
void ByteCodeGenerator::writeStatement(const Statement& s) {

View File

@ -8,6 +8,7 @@
#ifndef SKSL_STANDALONE
#include "include/core/SkPoint3.h"
#include "include/private/SkVx.h"
#include "src/sksl/SkSLByteCode.h"
#include "src/sksl/SkSLByteCodeGenerator.h"
#include "src/sksl/SkSLExternalValue.h"
@ -18,6 +19,16 @@
namespace SkSL {
namespace Interpreter {
constexpr int VecWidth = 16;
using F32 = skvx::Vec<VecWidth, float>;
using I32 = skvx::Vec<VecWidth, int32_t>;
using U32 = skvx::Vec<VecWidth, uint32_t>;
// Must hold the first N non-negative integers (0, 1, 2, ...), where N is at least VecWidth
static const I32 gLanes = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
template <typename T>
static T unaligned_load(const void* ptr) {
T val;
@ -72,9 +83,6 @@ static const uint8_t* disassemble_instruction(const uint8_t* ip) {
VECTOR_DISASSEMBLE(kCompareUGTEQ, "compareugteq")
VECTOR_DISASSEMBLE(kCompareULT, "compareult")
VECTOR_DISASSEMBLE(kCompareULTEQ, "compareulteq")
case ByteCodeInstruction::kConditionalBranch:
printf("conditionalbranch %d", READ16());
break;
VECTOR_DISASSEMBLE(kConvertFtoI, "convertftoi")
VECTOR_DISASSEMBLE(kConvertStoF, "convertstof")
VECTOR_DISASSEMBLE(kConvertUtoF, "convertutof")
@ -132,7 +140,7 @@ static const uint8_t* disassemble_instruction(const uint8_t* ip) {
VECTOR_DISASSEMBLE(kMultiplyI, "multiplyi")
VECTOR_MATRIX_DISASSEMBLE(kNegateF, "negatef")
VECTOR_DISASSEMBLE(kNegateI, "negatei")
case ByteCodeInstruction::kNot: printf("not"); break;
case ByteCodeInstruction::kNotB: printf("notb"); break;
case ByteCodeInstruction::kOrB: printf("orb"); break;
VECTOR_MATRIX_DISASSEMBLE(kPop, "pop")
case ByteCodeInstruction::kPushImmediate: {
@ -218,6 +226,20 @@ static const uint8_t* disassemble_instruction(const uint8_t* ip) {
case ByteCodeInstruction::kWriteExternal2: printf("writeexternal2 %d", READ8()); break;
case ByteCodeInstruction::kWriteExternal3: printf("writeexternal3 %d", READ8()); break;
case ByteCodeInstruction::kWriteExternal4: printf("writeexternal4 %d", READ8()); break;
case ByteCodeInstruction::kXorB: printf("xorb"); break;
case ByteCodeInstruction::kMaskPush: printf("maskpush"); break;
case ByteCodeInstruction::kMaskPop: printf("maskpop"); break;
case ByteCodeInstruction::kMaskNegate: printf("masknegate"); break;
case ByteCodeInstruction::kMaskBlend: printf("maskblend %d", READ8()); break;
case ByteCodeInstruction::kBranchIfAllFalse:
printf("branchifallfalse %d", READ16());
break;
case ByteCodeInstruction::kLoopBegin: printf("loopbegin"); break;
case ByteCodeInstruction::kLoopNext: printf("loopnext"); break;
case ByteCodeInstruction::kLoopMask: printf("loopmask"); break;
case ByteCodeInstruction::kLoopEnd: printf("loopend"); break;
case ByteCodeInstruction::kLoopContinue: printf("loopcontinue"); break;
case ByteCodeInstruction::kLoopBreak: printf("loopbreak"); break;
default: printf("unknown(%d)\n", *(ip - 1)); SkASSERT(false);
}
return ip;
@ -294,27 +316,80 @@ void Disassemble(const ByteCodeFunction* f) {
case ByteCodeInstruction::base: sp[ 0] = fn(sp[ 0].field); \
break;
// Expands to the switch cases for the 'base', 'base2', 'base3' and 'base4' forms of a unary
// float instruction and applies the scalar function 'fn' (e.g. cosf, sinf) to every lane of the
// top 'count' stack slots. Each slot holds VecWidth floats, so the slots are walked as one flat
// run of VecWidth * count floats.
//
// Expects 'inst' (the current instruction), 'sp' (pointer to the top stack slot) and VecWidth
// to be in scope at the expansion site.
//
// The slot arithmetic has to happen on 'sp' *before* the cast: a cast binds tighter than binary
// minus, so '(float*)sp - count + 1' would step back by single floats instead of whole slots,
// starting in the middle of a slot and running past the top of the stack whenever count > 1.
#define VECTOR_UNARY_FN_VEC(base, fn)                                    \
    case ByteCodeInstruction::base ## 4:                                 \
    case ByteCodeInstruction::base ## 3:                                 \
    case ByteCodeInstruction::base ## 2:                                 \
    case ByteCodeInstruction::base : {                                   \
        int count = (int)inst - (int)ByteCodeInstruction::base + 1;      \
        float* v = reinterpret_cast<float*>(sp - count + 1);             \
        for (int i = VecWidth * count; i > 0; --i, ++v) {                \
            *v = fn(*v);                                                 \
        }                                                                \
        break;                                                           \
    }
// One interpreter stack slot: a full vector of lanes, viewable as floats, signed ints or
// unsigned ints. The converting constructors are intentionally implicit so that the result of a
// vector expression can be assigned straight into a slot.
union VValue {
    F32 fFloat;
    I32 fSigned;
    U32 fUnsigned;

    // Leaves the slot's contents unset.
    VValue() {}

    VValue(F32 f) : fFloat(f) {}
    VValue(I32 s) : fSigned(s) {}
    VValue(U32 u) : fUnsigned(u) {}
};
struct StackFrame {
const uint8_t* fCode;
const uint8_t* fIP;
Interpreter::Value* fStack;
VValue* fStack;
};
static float mix(float start, float end, float t) {
static F32 mix(F32 start, F32 end, F32 t) {
return start * (1 - t) + end * t;
}
void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack, Value* outReturn,
Value globals[], int globalCount) {
Value* sp = stack + f->fParameterCount + f->fLocalCount - 1;
// Lane-wise remainder: a minus b times the truncated quotient a / b.
// TODO: trunc on integers?
template <typename T>
static T vec_mod(T a, T b) {
    const T quotient = skvx::trunc(a / b);
    return a - quotient * b;
}
auto POP = [&] { SkASSERT(sp >= stack); return *(sp--); };
auto PUSH = [&](Value v) { SkASSERT(sp + 1 >= stack); *(++sp) = v; };
void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, VValue* stack, Value* outReturn,
I32 initMask, VValue globals[], int globalCount) {
VValue* sp = stack + f->fParameterCount + f->fLocalCount - 1;
auto POP = [&] { SkASSERT(sp >= stack); return *(sp--); };
auto PUSH = [&](VValue v) { SkASSERT(sp + 1 >= stack); *(++sp) = v; };
const uint8_t* code = f->fCode.data();
const uint8_t* ip = code;
std::vector<StackFrame> frames;
I32 condStack[16]; // Independent condition masks
I32 maskStack[16]; // Combined masks (eg maskStack[0] & maskStack[1] & ...)
I32 contStack[16]; // Continue flags for loops
I32 loopStack[16]; // Loop execution masks
condStack[0] = maskStack[0] = initMask;
contStack[0] = I32( 0);
loopStack[0] = I32(~0);
I32* condPtr = condStack;
I32* maskPtr = maskStack;
I32* contPtr = contStack;
I32* loopPtr = loopStack;
auto mask = [&]() { return *maskPtr & *loopPtr; };
for (;;) {
#ifdef TRACE
printf("at %3d ", (int) (ip - code));
@ -325,8 +400,21 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
switch (inst) {
VECTOR_BINARY_OP(kAddI, fSigned, +)
VECTOR_MATRIX_BINARY_OP(kAddF, fFloat, +)
// Booleans are integer masks: 0/~0 for false/true. So bitwise ops do what we want:
case ByteCodeInstruction::kAndB:
sp[-1] = sp[-1].fBool && sp[0].fBool;
sp[-1] = sp[-1].fSigned & sp[0].fSigned;
POP();
break;
case ByteCodeInstruction::kNotB:
sp[0] = ~sp[0].fSigned;
break;
case ByteCodeInstruction::kOrB:
sp[-1] = sp[-1].fSigned | sp[0].fSigned;
POP();
break;
case ByteCodeInstruction::kXorB:
sp[-1] = sp[-1].fSigned ^ sp[0].fSigned;
POP();
break;
@ -340,10 +428,15 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
// (plus space for locals).
int target = READ8();
const ByteCodeFunction* fun = byteCode->fFunctions[target].get();
frames.push_back({ code, ip, stack });
ip = code = fun->fCode.data();
stack = sp - fun->fParameterCount + 1;
sp = stack + fun->fParameterCount + fun->fLocalCount - 1;
if (skvx::any(mask())) {
frames.push_back({ code, ip, stack });
ip = code = fun->fCode.data();
stack = sp - fun->fParameterCount + 1;
sp = stack + fun->fParameterCount + fun->fLocalCount - 1;
} else {
sp -= fun->fParameterCount;
sp += fun->fReturnCount;
}
break;
}
@ -354,10 +447,23 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
ExternalValue* v = byteCode->fExternalValues[target];
sp -= argumentCount - 1;
Value tmp[4];
SkASSERT(returnCount <= (int)SK_ARRAY_COUNT(tmp));
v->call(sp, tmp);
memcpy(sp, tmp, returnCount * sizeof(Value));
Value tmpArgs[4];
Value tmpReturn[4];
SkASSERT(argumentCount <= (int)SK_ARRAY_COUNT(tmpArgs));
SkASSERT(returnCount <= (int)SK_ARRAY_COUNT(tmpReturn));
I32 m = mask();
for (int i = 0; i < VecWidth; ++i) {
if (m[i]) {
for (int j = 0; j < argumentCount; ++j) {
tmpArgs[j].fSigned = sp[j].fSigned[i];
}
v->call(tmpArgs, tmpReturn);
for (int j = 0; j < returnCount; ++j) {
sp[j].fSigned[i] = tmpReturn[j].fSigned;
}
}
}
sp += returnCount - 1;
break;
}
@ -379,45 +485,36 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
VECTOR_BINARY_OP(kCompareULTEQ, fUnsigned, <=)
VECTOR_BINARY_OP(kCompareFLTEQ, fFloat, <=)
case ByteCodeInstruction::kConditionalBranch: {
int target = READ16();
if (POP().fBool) {
ip = code + target;
}
break;
}
case ByteCodeInstruction::kConvertFtoI4: sp[-3].fSigned = (int)sp[-3].fFloat;
case ByteCodeInstruction::kConvertFtoI3: sp[-2].fSigned = (int)sp[-2].fFloat;
case ByteCodeInstruction::kConvertFtoI2: sp[-1].fSigned = (int)sp[-1].fFloat;
case ByteCodeInstruction::kConvertFtoI: sp[ 0].fSigned = (int)sp[ 0].fFloat;
case ByteCodeInstruction::kConvertFtoI4: sp[-3] = skvx::cast<int>(sp[-3].fFloat);
case ByteCodeInstruction::kConvertFtoI3: sp[-2] = skvx::cast<int>(sp[-2].fFloat);
case ByteCodeInstruction::kConvertFtoI2: sp[-1] = skvx::cast<int>(sp[-1].fFloat);
case ByteCodeInstruction::kConvertFtoI: sp[ 0] = skvx::cast<int>(sp[ 0].fFloat);
break;
case ByteCodeInstruction::kConvertStoF4: sp[-3].fFloat = sp[-3].fSigned;
case ByteCodeInstruction::kConvertStoF3: sp[-2].fFloat = sp[-2].fSigned;
case ByteCodeInstruction::kConvertStoF2: sp[-1].fFloat = sp[-1].fSigned;
case ByteCodeInstruction::kConvertStoF : sp[ 0].fFloat = sp[ 0].fSigned;
case ByteCodeInstruction::kConvertStoF4: sp[-3] = skvx::cast<float>(sp[-3].fSigned);
case ByteCodeInstruction::kConvertStoF3: sp[-2] = skvx::cast<float>(sp[-2].fSigned);
case ByteCodeInstruction::kConvertStoF2: sp[-1] = skvx::cast<float>(sp[-1].fSigned);
case ByteCodeInstruction::kConvertStoF : sp[ 0] = skvx::cast<float>(sp[ 0].fSigned);
break;
case ByteCodeInstruction::kConvertUtoF4: sp[-3].fFloat = sp[-3].fUnsigned;
case ByteCodeInstruction::kConvertUtoF3: sp[-2].fFloat = sp[-2].fUnsigned;
case ByteCodeInstruction::kConvertUtoF2: sp[-1].fFloat = sp[-1].fUnsigned;
case ByteCodeInstruction::kConvertUtoF : sp[ 0].fFloat = sp[ 0].fUnsigned;
case ByteCodeInstruction::kConvertUtoF4: sp[-3] = skvx::cast<float>(sp[-3].fUnsigned);
case ByteCodeInstruction::kConvertUtoF3: sp[-2] = skvx::cast<float>(sp[-2].fUnsigned);
case ByteCodeInstruction::kConvertUtoF2: sp[-1] = skvx::cast<float>(sp[-1].fUnsigned);
case ByteCodeInstruction::kConvertUtoF : sp[ 0] = skvx::cast<float>(sp[ 0].fUnsigned);
break;
VECTOR_UNARY_FN(kCos, cosf, fFloat)
VECTOR_UNARY_FN_VEC(kCos, cosf)
case ByteCodeInstruction::kCross: {
SkPoint3 cross = SkPoint3::CrossProduct(SkPoint3::Make(sp[-5].fFloat,
sp[-4].fFloat,
sp[-3].fFloat),
SkPoint3::Make(sp[-2].fFloat,
sp[-1].fFloat,
sp[ 0].fFloat));
F32 ax = sp[-5].fFloat, ay = sp[-4].fFloat, az = sp[-3].fFloat,
bx = sp[-2].fFloat, by = sp[-1].fFloat, bz = sp[ 0].fFloat;
F32 cx = ay*bz - az*by,
cy = az*bx - ax*bz,
cz = ax*by - ay*bx;
sp -= 3;
sp[-2] = cross.fX;
sp[-1] = cross.fY;
sp[ 0] = cross.fZ;
sp[-2] = cx;
sp[-1] = cy;
sp[ 0] = cz;
break;
}
@ -433,7 +530,7 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
case ByteCodeInstruction::kDupN: {
int count = READ8();
memcpy(sp + 1, sp - count + 1, count * sizeof(Value));
memcpy(sp + 1, sp - count + 1, count * sizeof(VValue));
sp += count;
break;
}
@ -457,17 +554,30 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
case ByteCodeInstruction::kLoadExtended: {
int count = READ8();
int src = POP().fSigned;
memcpy(sp + 1, &stack[src], count * sizeof(Value));
I32 src = POP().fSigned;
I32 m = mask();
for (int i = 0; i < count; ++i) {
for (int j = 0; j < VecWidth; ++j) {
if (m[j]) {
sp[i + 1].fSigned[j] = stack[src[j] + i].fSigned[j];
}
}
}
sp += count;
break;
}
case ByteCodeInstruction::kLoadExtendedGlobal: {
int count = READ8();
int src = POP().fSigned;
SkASSERT(src + count <= globalCount);
memcpy(sp + 1, &globals[src], count * sizeof(Value));
I32 src = POP().fSigned;
I32 m = mask();
for (int i = 0; i < count; ++i) {
for (int j = 0; j < VecWidth; ++j) {
if (m[j]) {
sp[i + 1].fSigned[j] = globals[src[j] + i].fSigned[j];
}
}
}
sp += count;
break;
}
@ -502,15 +612,17 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
SkASSERT(srcRows >= 2 && srcRows <= 4);
SkASSERT(dstCols >= 2 && dstCols <= 4);
SkASSERT(dstRows >= 2 && dstRows <= 4);
SkMatrix44 m;
F32 tmp[16];
memset(tmp, 0, sizeof(tmp));
tmp[0] = tmp[5] = tmp[10] = tmp[15] = F32(1.0f);
for (int c = srcCols - 1; c >= 0; --c) {
for (int r = srcRows - 1; r >= 0; --r) {
m.set(r, c, POP().fFloat);
tmp[c*4 + r] = POP().fFloat;
}
}
for (int c = 0; c < dstCols; ++c) {
for (int r = 0; r < dstRows; ++r) {
PUSH(m.get(r, c));
PUSH(tmp[c*4 + r]);
}
}
break;
@ -521,9 +633,9 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
int lRows = READ8();
int rCols = READ8();
int rRows = lCols;
float tmp[16] = { 0.0f };
float* B = &(sp - (rCols * rRows) + 1)->fFloat;
float* A = B - (lCols * lRows);
F32 tmp[16] = { 0.0f };
F32* B = &(sp - (rCols * rRows) + 1)->fFloat;
F32* A = B - (lCols * lRows);
for (int c = 0; c < rCols; ++c) {
for (int r = 0; r < lRows; ++r) {
for (int j = 0; j < lCols; ++j) {
@ -532,7 +644,7 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
}
}
sp -= (lCols * lRows) + (rCols * rRows);
memcpy(sp + 1, tmp, rCols * lRows * sizeof(Value));
memcpy(sp + 1, tmp, rCols * lRows * sizeof(VValue));
sp += (rCols * lRows);
break;
}
@ -564,10 +676,6 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
VECTOR_BINARY_OP(kMultiplyI, fSigned, *)
VECTOR_MATRIX_BINARY_OP(kMultiplyF, fFloat, *)
case ByteCodeInstruction::kNot:
sp[0].fBool = !sp[0].fBool;
break;
case ByteCodeInstruction::kNegateF4: sp[-3] = -sp[-3].fFloat;
case ByteCodeInstruction::kNegateF3: sp[-2] = -sp[-2].fFloat;
case ByteCodeInstruction::kNegateF2: sp[-1] = -sp[-1].fFloat;
@ -585,14 +693,9 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
case ByteCodeInstruction::kNegateI4: sp[-3] = -sp[-3].fSigned;
case ByteCodeInstruction::kNegateI3: sp[-2] = -sp[-2].fSigned;
case ByteCodeInstruction::kNegateI2: sp[-1] = -sp[-1].fSigned;
case ByteCodeInstruction::kNegateI : sp[ 0] = -sp [0].fSigned;
case ByteCodeInstruction::kNegateI : sp[ 0] = -sp[ 0].fSigned;
break;
case ByteCodeInstruction::kOrB:
sp[-1] = sp[-1].fBool || sp[0].fBool;
POP();
break;
case ByteCodeInstruction::kPop4: POP();
case ByteCodeInstruction::kPop3: POP();
case ByteCodeInstruction::kPop2: POP();
@ -604,34 +707,56 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
break;
case ByteCodeInstruction::kPushImmediate:
PUSH(READ32());
PUSH(U32(READ32()));
break;
case ByteCodeInstruction::kReadExternal: // fall through
case ByteCodeInstruction::kReadExternal2: // fall through
case ByteCodeInstruction::kReadExternal3: // fall through
case ByteCodeInstruction::kReadExternal:
case ByteCodeInstruction::kReadExternal2:
case ByteCodeInstruction::kReadExternal3:
case ByteCodeInstruction::kReadExternal4: {
// TODO: Support striped external values, or passing lane index? This model is odd.
int count = (int)inst - (int)ByteCodeInstruction::kReadExternal + 1;
int src = READ8();
byteCode->fExternalValues[src]->read(sp + 1);
sp += (int) inst - (int) ByteCodeInstruction::kReadExternal + 1;
int32_t tmp[4];
I32 m = mask();
for (int i = 0; i < VecWidth; ++i) {
if (m[i]) {
byteCode->fExternalValues[src]->read(tmp);
for (int j = 0; j < count; ++j) {
sp[j + 1].fSigned[i] = tmp[j];
}
}
}
sp += count;
break;
}
VECTOR_BINARY_FN(kRemainderF, fFloat, fmodf)
VECTOR_BINARY_OP(kRemainderS, fSigned, %)
VECTOR_BINARY_OP(kRemainderU, fUnsigned, %)
VECTOR_BINARY_FN(kRemainderF, fFloat, vec_mod<F32>)
VECTOR_BINARY_FN(kRemainderS, fSigned, vec_mod<I32>)
VECTOR_BINARY_FN(kRemainderU, fUnsigned, vec_mod<U32>)
case ByteCodeInstruction::kReturn: {
int count = READ8();
if (frames.empty()) {
if (outReturn) {
memcpy(outReturn, sp - count + 1, count * sizeof(Value));
// TODO: This can be smarter, knowing that mask is left-justified
I32 m = mask();
VValue* src = sp - count + 1;
for (int i = 0; i < count; ++i) {
for (int j = 0; j < VecWidth; ++j) {
if (m[j]) {
outReturn[count * j].fSigned = src->fSigned[j];
}
}
++outReturn;
++src;
}
}
return;
} else {
// When we were called, 'stack' was positioned at the old top-of-stack (where
// our parameters were placed). So copy our return values to that same spot.
memmove(stack, sp - count + 1, count * sizeof(Value));
memmove(stack, sp - count + 1, count * sizeof(VValue));
// Now move the stack pointer to the end of the just-pushed return values,
// and restore everything else.
@ -648,44 +773,67 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
case ByteCodeInstruction::kScalarToMatrix: {
int cols = READ8();
int rows = READ8();
Value v = POP();
VValue v = POP();
for (int c = 0; c < cols; ++c) {
for (int r = 0; r < rows; ++r) {
PUSH(c == r ? v : 0.0f);
PUSH(c == r ? v : F32(0.0f));
}
}
break;
}
VECTOR_UNARY_FN(kSin, sinf, fFloat)
VECTOR_UNARY_FN(kSqrt, sqrtf, fFloat)
VECTOR_UNARY_FN_VEC(kSin, sinf)
VECTOR_UNARY_FN(kSqrt, skvx::sqrt, fFloat)
case ByteCodeInstruction::kStore4: stack[*ip + 3] = POP();
case ByteCodeInstruction::kStore3: stack[*ip + 2] = POP();
case ByteCodeInstruction::kStore2: stack[*ip + 1] = POP();
case ByteCodeInstruction::kStore : stack[*ip + 0] = POP();
++ip;
break;
case ByteCodeInstruction::kStore4:
stack[*ip+3] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+3].fFloat);
case ByteCodeInstruction::kStore3:
stack[*ip+2] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+2].fFloat);
case ByteCodeInstruction::kStore2:
stack[*ip+1] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+1].fFloat);
case ByteCodeInstruction::kStore :
stack[*ip+0] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+0].fFloat);
++ip;
break;
case ByteCodeInstruction::kStoreGlobal4: globals[*ip + 3] = POP();
case ByteCodeInstruction::kStoreGlobal3: globals[*ip + 2] = POP();
case ByteCodeInstruction::kStoreGlobal2: globals[*ip + 1] = POP();
case ByteCodeInstruction::kStoreGlobal : globals[*ip + 0] = POP();
++ip;
break;
case ByteCodeInstruction::kStoreGlobal4:
globals[*ip+3] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+3].fFloat);
case ByteCodeInstruction::kStoreGlobal3:
globals[*ip+2] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+2].fFloat);
case ByteCodeInstruction::kStoreGlobal2:
globals[*ip+1] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+1].fFloat);
case ByteCodeInstruction::kStoreGlobal :
globals[*ip+0] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+0].fFloat);
++ip;
break;
case ByteCodeInstruction::kStoreExtended: {
int count = READ8();
int target = POP().fSigned;
memcpy(&stack[target], sp - count + 1, count * sizeof(Value));
I32 target = POP().fSigned;
VValue* src = sp - count + 1;
I32 m = mask();
for (int i = 0; i < count; ++i) {
for (int j = 0; j < VecWidth; ++j) {
if (m[j]) {
stack[target[j] + i].fSigned[j] = src[i].fSigned[j];
}
}
}
sp -= count;
break;
}
case ByteCodeInstruction::kStoreExtendedGlobal: {
int count = READ8();
int target = POP().fSigned;
SkASSERT(target + count <= globalCount);
memcpy(&globals[target], sp - count + 1, count * sizeof(Value));
I32 target = POP().fSigned;
VValue* src = sp - count + 1;
I32 m = mask();
for (int i = 0; i < count; ++i) {
for (int j = 0; j < VecWidth; ++j) {
if (m[j]) {
globals[target[j] + i].fSigned[j] = src[i].fSigned[j];
}
}
}
sp -= count;
break;
}
@ -694,7 +842,8 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
int target = READ8();
int count = READ8();
for (int i = count - 1; i >= 0; --i) {
stack[target + *(ip + i)] = POP();
stack[target + *(ip + i)] = skvx::if_then_else(
mask(), POP().fFloat, stack[target + *(ip + i)].fFloat);
}
ip += count;
break;
@ -704,25 +853,40 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
int target = READ8();
int count = READ8();
for (int i = count - 1; i >= 0; --i) {
globals[target + *(ip + i)] = POP();
globals[target + *(ip + i)] = skvx::if_then_else(
mask(), POP().fFloat, globals[target + *(ip + i)].fFloat);
}
ip += count;
break;
}
case ByteCodeInstruction::kStoreSwizzleIndirect: {
int target = POP().fSigned;
int count = READ8();
I32 target = POP().fSigned;
I32 m = mask();
for (int i = count - 1; i >= 0; --i) {
stack[target + *(ip + i)] = POP();
I32 v = POP().fSigned;
for (int j = 0; j < VecWidth; ++j) {
if (m[j]) {
stack[target[j] + *(ip + i)].fSigned[j] = v[j];
}
}
}
ip += count;
break;
}
case ByteCodeInstruction::kStoreSwizzleIndirectGlobal: {
int target = POP().fSigned;
int count = READ8();
I32 target = POP().fSigned;
I32 m = mask();
for (int i = count - 1; i >= 0; --i) {
globals[target + *(ip + i)] = POP();
I32 v = POP().fSigned;
for (int j = 0; j < VecWidth; ++j) {
if (m[j]) {
globals[target[j] + *(ip + i)].fSigned[j] = v[j];
}
}
}
ip += count;
break;
@ -732,7 +896,7 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
VECTOR_MATRIX_BINARY_OP(kSubtractF, fFloat, -)
case ByteCodeInstruction::kSwizzle: {
Value tmp[4];
VValue tmp[4];
for (int i = READ8() - 1; i >= 0; --i) {
tmp[i] = POP();
}
@ -742,70 +906,150 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
break;
}
VECTOR_UNARY_FN(kTan, tanf, fFloat)
VECTOR_UNARY_FN_VEC(kTan, tanf)
case ByteCodeInstruction::kWriteExternal: // fall through
case ByteCodeInstruction::kWriteExternal2: // fall through
case ByteCodeInstruction::kWriteExternal3: // fall through
case ByteCodeInstruction::kWriteExternal:
case ByteCodeInstruction::kWriteExternal2:
case ByteCodeInstruction::kWriteExternal3:
case ByteCodeInstruction::kWriteExternal4: {
int count = (int) inst - (int) ByteCodeInstruction::kWriteExternal + 1;
int count = (int)inst - (int)ByteCodeInstruction::kWriteExternal + 1;
int target = READ8();
byteCode->fExternalValues[target]->write(sp - count + 1);
int32_t tmp[4];
I32 m = mask();
sp -= count;
for (int i = 0; i < VecWidth; ++i) {
if (m[i]) {
for (int j = 0; j < count; ++j) {
tmp[j] = sp[j + 1].fSigned[i];
}
byteCode->fExternalValues[target]->write(tmp);
}
}
break;
}
case ByteCodeInstruction::kMaskPush:
condPtr[1] = POP().fSigned;
maskPtr[1] = maskPtr[0] & condPtr[1];
++condPtr; ++maskPtr;
break;
case ByteCodeInstruction::kMaskPop:
--condPtr; --maskPtr;
break;
case ByteCodeInstruction::kMaskNegate:
maskPtr[0] = maskPtr[-1] & ~condPtr[0];
break;
case ByteCodeInstruction::kMaskBlend: {
int count = READ8();
I32 m = condPtr[0];
--condPtr; --maskPtr;
for (int i = 0; i < count; ++i) {
sp[-count] = skvx::if_then_else(m, sp[-count].fFloat, sp[0].fFloat);
--sp;
}
break;
}
case ByteCodeInstruction::kBranchIfAllFalse: {
int target = READ16();
if (!skvx::any(mask())) {
ip = code + target;
}
break;
}
case ByteCodeInstruction::kLoopBegin:
*(++contPtr) = 0;
*(++loopPtr) = ~0;
break;
case ByteCodeInstruction::kLoopNext:
*loopPtr |= *contPtr;
*contPtr = 0;
break;
case ByteCodeInstruction::kLoopMask:
*loopPtr &= POP().fSigned;
break;
case ByteCodeInstruction::kLoopEnd:
--contPtr; --loopPtr;
break;
case ByteCodeInstruction::kLoopBreak:
*loopPtr &= ~mask();
break;
case ByteCodeInstruction::kLoopContinue: {
I32 m = mask();
*contPtr |= m;
*loopPtr &= ~m;
break;
}
default:
SkDEBUGFAILF("unsupported instruction %d\n", (int) inst);
}
}
}
void VecRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value args[], Value* outReturn,
int N, Value uniforms[], int uniformCount) {
#ifdef TRACE
int stackSize = (int) (sp - stack + 1);
printf("STACK(%d):", stackSize);
for (int i = 0; i < stackSize; ++i) {
printf(" %d(%g)", stack[i].fSigned, stack[i].fFloat);
}
printf("\n");
disassemble(f);
#endif
VValue smallStack[128];
SkASSERT(uniformCount == (int)byteCode->fInputSlots.size());
VValue smallGlobals[32];
VValue* globals = smallGlobals;
SkASSERT((int)SK_ARRAY_COUNT(smallGlobals) >= byteCode->fGlobalCount);
for (uint8_t slot : byteCode->fInputSlots) {
globals[slot].fUnsigned = (uniforms++)->fUnsigned;
}
while (N) {
VValue* stack = smallStack;
int w = std::min(N, VecWidth);
N -= w;
// Transpose args into stack
{
uint32_t* src = (uint32_t*)args;
for (int i = 0; i < w; ++i) {
uint32_t* dst = (uint32_t*)stack + i;
for (int j = f->fParameterCount; j > 0; --j) {
*dst = *src++;
dst += VecWidth;
}
}
}
auto mask = w > gLanes;
innerRun(byteCode, f, stack, outReturn, mask, globals, byteCode->fGlobalCount);
// Transpose out parameters back
{
uint32_t* dst = (uint32_t*)args;
for (int i = 0; i < w; ++i) {
uint32_t* src = (uint32_t*)stack + i;
for (const auto& p : f->fParameters) {
if (p.fIsOutParameter) {
for (int j = p.fSlotCount; j > 0; --j) {
*dst++ = *src;
src += VecWidth;
}
} else {
dst += p.fSlotCount;
src += p.fSlotCount * VecWidth;
}
}
}
}
args += f->fParameterCount * w;
outReturn += f->fReturnCount * w;
}
}
/**
 * Runs `f` for a single invocation.
 *
 * This is the N == 1 case of VecRun: `args` holds one group of parameter values (out-parameters
 * are written back in place) and `uniforms` supplies one value per entry in
 * byteCode->fInputSlots. Stack and global storage are handled entirely by VecRun, so nothing is
 * set up or executed here beyond the delegation.
 */
void Run(const ByteCode* byteCode, const ByteCodeFunction* f, Value args[], Value* outReturn,
         Value uniforms[], int uniformCount) {
    VecRun(byteCode, f, args, outReturn, 1, uniforms, uniformCount);
}
} // namespace Interpreter

View File

@ -46,6 +46,9 @@ namespace Interpreter {
void Run(const ByteCode*, const ByteCodeFunction*, Value args[], Value* outReturn,
Value uniforms[], int uniformCount);
/**
 * Runs the given function for N invocations in one call. 'args' holds N consecutive groups of
 * parameter values (one group per invocation); out-parameters are written back into the same
 * positions. 'uniforms' supplies one value per input slot of the ByteCode and is shared by all
 * invocations. Run() is the N == 1 form of this call.
 * NOTE(review): 'outReturn' presumably receives the return values of each invocation
 * back-to-back - confirm against the interpreter's return handling.
 */
void VecRun(const ByteCode*, const ByteCodeFunction*, Value args[], Value* outReturn,
int N, Value uniforms[], int uniformCount);
/**
* Print bytecode disassembly to stdout.
*/

View File

@ -1,9 +1,13 @@
// SkSL function prototypes wrapped in STRINGIFY.
// NOTE(review): cos/sin/sqrt/tan are listed both as scalar `float f(float)` prototypes and as
// $genType/$genHType prototypes; the scalar forms look superseded by the generic ones - confirm
// which set is meant to remain.
STRINGIFY(
float cos(float y);
$genType cos($genType y);
$genHType cos($genHType y);
float3 cross(float3 x, float3 y);
float dot($genType x, $genType y);
$genType mix($genType x, $genType y, float t);
float sin(float x);
float sqrt(float x);
float tan(float x);
$genType sin($genType x);
$genHType sin($genHType x);
$genType sqrt($genType x);
$genHType sqrt($genHType x);
$genType tan($genType x);
$genHType tan($genHType x);
)

View File

@ -57,6 +57,61 @@ void test(skiatest::Reporter* r, const char* src, SkSL::Interpreter::Value* in,
}
}
/**
 * Cross-checks the vectorized interpreter entry point against the single-invocation one.
 *
 * Compiles `src` once, runs its main() on four 4-float inputs one at a time via Run() to get the
 * expected output, then runs all four in a single VecRun() call and reports a failure (with a
 * dump of inputs, actual and expected values, and the disassembly) if the results differ
 * bitwise.
 */
void vec_test(skiatest::Reporter* r, const char* src) {
    // Test on four different vectors (with varying orderings to get divergent control flow)
    const float input[16] = { 1, 2, 3, 4,
                              4, 3, 2, 1,
                              7, 5, 8, 6,
                              6, 8, 5, 7 };

    // Both runs operate in place, so each gets its own copy of the inputs.
    float out_s[16], out_v[16];
    memcpy(out_s, input, sizeof(out_s));
    memcpy(out_v, input, sizeof(out_v));

    // The program is identical for both runs, so compile it a single time.
    SkSL::Compiler compiler;
    std::unique_ptr<SkSL::Program> program = compiler.convertProgram(
            SkSL::Program::kGeneric_Kind, SkSL::String(src), SkSL::Program::Settings());
    if (!program) {
        REPORT_FAILURE(r, "!program", SkString(compiler.errorText().c_str()));
        return;
    }

    std::unique_ptr<SkSL::ByteCode> byteCode = compiler.toByteCode(*program);
    if (!byteCode || compiler.errorCount() > 0) {
        REPORT_FAILURE(r, "!toByteCode", SkString(compiler.errorText().c_str()));
        return;
    }

    const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
    if (!main) {
        // Guard against handing a null function to the interpreter.
        REPORT_FAILURE(r, "!main", SkString("program has no main()"));
        return;
    }

    // Run one vector at a time to determine the expected output
    for (int i = 0; i < 4; ++i) {
        SkSL::Interpreter::Run(byteCode.get(), main,
                               (SkSL::Interpreter::Value*)(out_s + i * 4), nullptr,
                               nullptr, 0);
    }

    // Run all four vectors through the vectorized interpreter and compare results
    SkSL::Interpreter::VecRun(byteCode.get(), main,
                              (SkSL::Interpreter::Value*)out_v, nullptr, 4,
                              nullptr, 0);
    if (memcmp(out_s, out_v, sizeof(out_s)) != 0) {
        printf("for program: %s\n", src);
        for (int i = 0; i < 4; ++i) {
            printf("(%g %g %g %g) -> (%g %g %g %g), expected (%g %g %g %g)\n",
                   input[4*i + 0], input[4*i + 1], input[4*i + 2], input[4*i + 3],
                   out_v[4*i + 0], out_v[4*i + 1], out_v[4*i + 2], out_v[4*i + 3],
                   out_s[4*i + 0], out_s[4*i + 1], out_s[4*i + 2], out_s[4*i + 3]);
        }
        SkSL::Interpreter::Disassemble(main);
        REPORT_FAILURE(r, "VecInterpreter mismatch", SkString());
    }
}
void test(skiatest::Reporter* r, const char* src, float inR, float inG, float inB, float inA,
float expectedR, float expectedG, float expectedB, float expectedA) {
SkSL::Compiler compiler;
@ -73,7 +128,7 @@ void test(skiatest::Reporter* r, const char* src, float inR, float inG, float in
printf("%s\n%s", src, compiler.errorText().c_str());
return;
}
SkSL::ByteCodeFunction* main = byteCode->fFunctions[0].get();
const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
float inoutColor[4] = { inR, inG, inB, inA };
SkSL::Interpreter::Run(byteCode.get(), main, (SkSL::Interpreter::Value*) inoutColor,
nullptr, nullptr, 0);
@ -92,6 +147,9 @@ void test(skiatest::Reporter* r, const char* src, float inR, float inG, float in
} else {
printf("%s\n%s", src, compiler.errorText().c_str());
}
// Do additional testing of 4x1 vs 1x4 to stress divergent control flow, etc.
vec_test(r, src);
}
DEF_TEST(SkSLInterpreterAdd, r) {
@ -250,6 +308,9 @@ DEF_TEST(SkSLInterpreterTernary, r) {
0, 1, 2, 0, 2, 1, 2, 0);
test(r, "void main(inout half4 color) { color.r = color.g > color.b ? color.g : color.b; }",
0, 3, 2, 0, 3, 3, 2, 0);
test(r, "int fib(int i) { return (i < 2) ? 1 : fib(i - 1) + fib(i - 2); }"
"void main(inout half4 color) { color.r = half(fib(int(color.r))); }",
3, 0, 0, 0, 3, 0, 0, 0);
}
DEF_TEST(SkSLInterpreterCast, r) {
@ -336,10 +397,10 @@ DEF_TEST(SkSLInterpreterIfVector, r) {
// Exercises `while` loops: a plain counting loop, a loop whose body never runs for the given
// input, and loops that exit via `break` / `continue`.
// The second case decrements inside the loop and the third breaks at 5: presumably so the
// loops still terminate, and diverge, on vec_test's extra inputs - confirm.
DEF_TEST(SkSLInterpreterWhile, r) {
    test(r, "void main(inout half4 color) { while (color.r < 1) color.r += 0.25; }", 0, 0, 0, 0, 1,
         0, 0, 0);
    test(r, "void main(inout half4 color) { while (color.r > 1) color.r -= 0.25; }", 0, 0, 0, 0, 0,
         0, 0, 0);
    test(r, "void main(inout half4 color) { while (true) { color.r += 0.5; "
         "if (color.r > 5) break; } }", 0, 0, 0, 0, 5.5, 0, 0, 0);
    test(r, "void main(inout half4 color) { while (color.r < 10) { color.r += 0.5; "
         "if (color.r < 5) continue; break; } }", 0, 0, 0, 0, 5, 0, 0, 0);
}
@ -347,8 +408,8 @@ DEF_TEST(SkSLInterpreterWhile, r) {
DEF_TEST(SkSLInterpreterDo, r) {
test(r, "void main(inout half4 color) { do color.r += 0.25; while (color.r < 1); }", 0, 0, 0, 0,
1, 0, 0, 0);
test(r, "void main(inout half4 color) { do color.r += 0.25; while (color.r > 1); }", 0, 0, 0, 0,
0.25, 0, 0, 0);
test(r, "void main(inout half4 color) { do color.r -= 0.25; while (color.r > 1); }", 0, 0, 0, 0,
-0.25, 0, 0, 0);
test(r, "void main(inout half4 color) { do { color.r += 0.5; if (color.r > 1) break; } while "
"(true); }", 0, 0, 0, 0, 1.5, 0, 0, 0);
test(r, "void main(inout half4 color) {do { color.r += 0.5; if (color.r < 5) "