Interpreter: Vectorized interpreter

This converts the SkSL interpreter to operate in SIMT fashion. It handles all the same features as the previous scalar implementation, but operates on N lanes at a time. (Currently 8). It's modeled after GPU and other parallel architectures, using execution masks to handle control flow, including divergent control-flow. Change-Id: Ieb38ffe2f55a10f72bdab844c297126fe9bedb6c Reviewed-on: https://skia-review.googlesource.com/c/skia/+/217122 Commit-Queue: Brian Osman <brianosman@google.com> Reviewed-by: Mike Klein <mtklein@google.com>
2019-06-13 11:23:57 -04:00 · 2019-06-13 11:23:57 -04:00 · 569f12f0e5
commit 569f12f0e5
parent d608e224b2
9 changed files with 792 additions and 238 deletions
--- a/bench/SkSLInterpreterBench.cpp
+++ b/bench/SkSLInterpreterBench.cpp
@ -0,0 +1,223 @@
+/*
+ * Copyright 2019 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+#include "bench/Benchmark.h"
+#include "include/utils/SkRandom.h"
+#include "src/sksl/SkSLCompiler.h"
+#include "src/sksl/SkSLInterpreter.h"
+
+// Benchmarks the interpreter with a function that has a color-filter style signature
+// (the programs registered for this class below take one `inout` four-component color).
+// The SkSL source is compiled to byte code once, in onDelayedSetup(); onDraw() then times
+// SkSL::Interpreter::VecRun over a buffer of random colors.
+class SkSLInterpreterCFBench : public Benchmark {
+public:
+    // name:   suffix of the reported benchmark name ("sksl_interp_cf_<pixels>_<name>")
+    // pixels: number of colors in the buffer handed to each VecRun call
+    // src:    SkSL program text; its function named "main" is the one that gets run
+    SkSLInterpreterCFBench(SkSL::String name, int pixels, const char* src)
+        : fName(SkStringPrintf("sksl_interp_cf_%d_%s", pixels, name.c_str()))
+        , fSrc(src)
+        , fCount(pixels) {}
+
+protected:
+    const char* onGetName() override {
+        return fName.c_str();
+    }
+
+    // Only the non-rendering backend is accepted; onDraw() never touches its canvas.
+    bool isSuitableFor(Backend backend) override {
+        return backend == kNonRendering_Backend;
+    }
+
+    // Compiles fSrc to byte code, looks up "main", and fills fPixels with fCount colors
+    // converted from random 32-bit values. Compile errors are only caught by SkASSERT.
+    void onDelayedSetup() override {
+        SkSL::Compiler compiler;
+        SkSL::Program::Settings settings;
+        auto program = compiler.convertProgram(SkSL::Program::kGeneric_Kind, fSrc, settings);
+        SkASSERT(compiler.errorCount() == 0);
+        fByteCode = compiler.toByteCode(*program);
+        SkASSERT(compiler.errorCount() == 0);
+        fMain = fByteCode->getFunction("main");
+
+        SkRandom rnd;
+        fPixels.resize(fCount);
+        for (int i = 0; i < fCount; ++i) {
+            fPixels[i] = SkColor4f::FromColor(rnd.nextU());
+        }
+    }
+
+    // Runs "main" over all fCount pixels once per loop. fPixels is reinterpreted as an array of
+    // interpreter Values and passed as the argument buffer; the same buffer is reused by every
+    // loop iteration.
+    // NOTE(review): the first nullptr sits where the sort benchmark passes its output buffer,
+    // and the trailing (nullptr, 0) pair sits where SkColorFilter passes its inputs/ninputs --
+    // presumably "no return buffer" and "no uniform inputs". Confirm against VecRun's declaration.
+    void onDraw(int loops, SkCanvas*) override {
+        for (int i = 0; i < loops; i++) {
+            SkSL::Interpreter::VecRun(fByteCode.get(), fMain,
+                                      (SkSL::Interpreter::Value*)fPixels.data(), nullptr, fCount,
+                                      nullptr, 0);
+        }
+    }
+
+private:
+    SkString fName;                             // reported benchmark name
+    SkSL::String fSrc;                          // SkSL source text given to the constructor
+    std::unique_ptr<SkSL::ByteCode> fByteCode;  // set in onDelayedSetup()
+    const SkSL::ByteCodeFunction* fMain;        // "main" inside fByteCode; set in onDelayedSetup()
+
+    int fCount;                                 // number of pixels per VecRun call
+    std::vector<SkColor4f> fPixels;             // fCount colors, used as the argument buffer
+
+    typedef Benchmark INHERITED;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// 256-pixel, branch-free program: stores a weighted sum of r, g and b in alpha and zeroes
+// the three color channels.
+DEF_BENCH(return new SkSLInterpreterCFBench("lumaToAlpha", 256, R"(
+    void main(inout float4 color) {
+        color.a = color.r*0.3 + color.g*0.6 + color.b*0.1;
+        color.r = 0;
+        color.g = 0;
+        color.b = 0;
+    }
+)"));
+
+// 256-pixel program labelled "HighContrastFilter" in its own source. Unlike lumaToAlpha it
+// exercises if/else chains, ternaries, early returns, a global variable and calls to helper
+// functions (hue2rgb_Stage2, max, min). The final saturate() is left commented out in the SkSL.
+DEF_BENCH(return new SkSLInterpreterCFBench("hcf", 256, R"(
+    half ucontrast_Stage2;
+    half hue2rgb_Stage2(half p, half q, half t) {
+        if (t < 0)  t += 1;
+        if (t > 1)  t -= 1;
+        if (t < 1 / 6.)  return p + (q - p) * 6 * t;
+        if (t < 1 / 2.)  return q;
+        if (t < 2 / 3.)  return p + (q - p) * (2 / 3. - t) * 6;
+        return p;
+    }
+    half max(half a, half b) { return a > b ? a : b; }
+    half min(half a, half b) { return a < b ? a : b; }
+    void main(inout half4 color) {
+        ucontrast_Stage2 = 0.2;
+
+        // HighContrastFilter
+        half nonZeroAlpha = max(color.a, 0.0001);
+        color = half4(color.rgb / nonZeroAlpha, nonZeroAlpha);
+        color.rgb = color.rgb * color.rgb;
+        half fmax = max(color.r, max(color.g, color.b));
+        half fmin = min(color.r, min(color.g, color.b));
+        half l = (fmax + fmin) / 2;
+        half h;
+        half s;
+        if (fmax == fmin) {
+            h = 0;
+            s = 0;
+        } else {
+            half d = fmax - fmin;
+            s = l > 0.5 ? d / (2 - fmax - fmin) : d / (fmax + fmin);
+            if (color.r >= color.g && color.r >= color.b) {
+                h = (color.g - color.b) / d + (color.g < color.b ? 6 : 0);
+            } else if (color.g >= color.b) {
+                h = (color.b - color.r) / d + 2;
+            } else {
+                h = (color.r - color.g) / d + 4;
+            }
+        }
+        h /= 6;
+        l = 1.0 - l;
+        if (s == 0) {
+            color = half4(l, l, l, 0);
+        } else {
+            half q = l < 0.5 ? l * (1 + s) : l + s - l * s;
+            half p = 2 * l - q;
+            color.r = hue2rgb_Stage2(p, q, h + 1 / 3.);
+            color.g = hue2rgb_Stage2(p, q, h);
+            color.b = hue2rgb_Stage2(p, q, h - 1 / 3.);
+        }
+        if (ucontrast_Stage2 != 0) {
+            half m = (1 + ucontrast_Stage2) / (1 - ucontrast_Stage2);
+            half off = (-0.5 * m + 0.5);
+            color = m * color + off;
+        }
+        // color = saturate(color);
+        color.rgb = sqrt(color.rgb);
+        color.rgb *= color.a;
+    }
+)"));
+
+// Benchmarks the interpreter on a program that is expected to sort: `groups` independent runs
+// of `values` ints each are handed to one VecRun call, and onDelayedSetup() asserts that every
+// run comes back in non-decreasing order.
+class SkSLInterpreterSortBench : public Benchmark {
+public:
+    // groups: number of independent invocations per VecRun call
+    // values: number of ints consumed and produced by each invocation
+    // src:    SkSL program text; its function named "main" is the one that gets run
+    SkSLInterpreterSortBench(int groups, int values, const char* src)
+        : fName(SkStringPrintf("sksl_interp_sort_%dx%d", groups, values))
+        , fCode(src)
+        , fGroups(groups)
+        , fValues(values) {
+    }
+
+protected:
+    const char* onGetName() override {
+        return fName.c_str();
+    }
+
+    // Only the non-rendering backend is accepted; onDraw() never touches its canvas.
+    bool isSuitableFor(Backend backend) override {
+        return backend == kNonRendering_Backend;
+    }
+
+    // Compiles fCode to byte code, looks up "main", fills fSrc with random signed ints, then
+    // performs one untimed run whose output is validated with SkASSERT.
+    void onDelayedSetup() override {
+        SkSL::Compiler compiler;
+        SkSL::Program::Settings settings;
+        auto program = compiler.convertProgram(SkSL::Program::kGeneric_Kind, fCode, settings);
+        SkASSERT(compiler.errorCount() == 0);
+        fByteCode = compiler.toByteCode(*program);
+        SkASSERT(compiler.errorCount() == 0);
+        fMain = fByteCode->getFunction("main");
+
+        fSrc.resize(fGroups * fValues);
+        fDst.resize(fGroups * fValues);
+
+        SkRandom rnd;
+        for (auto& x : fSrc) {
+            x = rnd.nextS();
+        }
+
+        // Trigger one run now to check correctness
+        SkSL::Interpreter::VecRun(fByteCode.get(), fMain,
+                                  (SkSL::Interpreter::Value*)fSrc.data(),
+                                  (SkSL::Interpreter::Value*)fDst.data(),
+                                  fGroups,
+                                  nullptr, 0);
+        // Group i occupies fDst[i * fValues .. (i + 1) * fValues - 1]; each adjacent pair within
+        // a group must be ordered.
+        for (int i = 0; i < fGroups; ++i) {
+            for (int j = 1; j < fValues; ++j) {
+                SkASSERT(fDst[i * fValues + j] >= fDst[i * fValues + j - 1]);
+            }
+        }
+    }
+
+    // Runs "main" over all fGroups groups once per loop, reading from fSrc and writing the
+    // returned values to fDst.
+    void onDraw(int loops, SkCanvas*) override {
+        for (int i = 0; i < loops; i++) {
+            SkSL::Interpreter::VecRun(fByteCode.get(), fMain,
+                                      (SkSL::Interpreter::Value*)fSrc.data(),
+                                      (SkSL::Interpreter::Value*)fDst.data(),
+                                      fGroups,
+                                      nullptr, 0);
+        }
+    }
+
+private:
+    SkString fName;                             // reported benchmark name
+    SkSL::String fCode;                         // SkSL source text given to the constructor
+    std::unique_ptr<SkSL::ByteCode> fByteCode;  // set in onDelayedSetup()
+    const SkSL::ByteCodeFunction* fMain;        // "main" inside fByteCode; set in onDelayedSetup()
+
+    int fGroups;                                // invocations per VecRun call
+    int fValues;                                // ints per invocation
+    std::vector<int32_t> fSrc;                  // fGroups * fValues random inputs
+    std::vector<int32_t> fDst;                  // fGroups * fValues outputs
+
+    typedef Benchmark INHERITED;
+};
+
+// Insertion sort of a 32-int array, run for 1024 groups per call.
+// Currently, this exceeds the interpreter's stack. Consider it a test case for some eventual
+// bounds checking.
+#if 0
+DEF_BENCH(return new SkSLInterpreterSortBench(1024, 32, R"(
+    int[32] main(int v[32]) {
+        for (int i = 1; i < 32; ++i) {
+            for (int j = i; j > 0 && v[j-1] > v[j]; --j) {
+                int t = v[j];
+                v[j] = v[j-1];
+                v[j-1] = t;
+            }
+        }
+        return v;
+    }
+)"));
+#endif
--- a/gn/bench.gni
+++ b/gn/bench.gni
@ -111,6 +111,7 @@ bench_sources = [
  "$_bench/SkVMBench.cpp",
  "$_bench/SKPBench.cpp",
  "$_bench/SkSLBench.cpp",
+  "$_bench/SkSLInterpreterBench.cpp",
  "$_bench/StreamBench.cpp",
  "$_bench/SortBench.cpp",
  "$_bench/StrokeBench.cpp",
--- a/src/core/SkColorFilter.cpp
+++ b/src/core/SkColorFilter.cpp
@ -425,12 +425,10 @@ public:
            ctx->main = ctx->byteCode->fFunctions[0].get();
            ctx->fn = [](SkRasterPipeline_CallbackCtx* arg, int active_pixels) {
                auto ctx = (InterpreterCtx*)arg;
-                for (int i = 0; i < active_pixels; i++) {
-                    SkSL::Interpreter::Run(ctx->byteCode.get(), ctx->main,
-                                           (SkSL::Interpreter::Value*) (ctx->rgba + i * 4),
-                                           nullptr, (SkSL::Interpreter::Value*)ctx->inputs,
-                                           ctx->ninputs);
-                }
+                SkSL::Interpreter::VecRun(ctx->byteCode.get(), ctx->main,
+                                            (SkSL::Interpreter::Value*)ctx->rgba,
+                                            nullptr, active_pixels,
+                                            (SkSL::Interpreter::Value*)ctx->inputs, ctx->ninputs);
            };
            rec.fPipeline->append(SkRasterPipeline::callback, ctx);
        }
--- a/src/sksl/SkSLByteCode.h
+++ b/src/sksl/SkSLByteCode.h
@ -49,8 +49,6 @@ enum class ByteCodeInstruction : uint16_t {
    VECTOR(kCompareUGTEQ),
    VECTOR(kCompareULT),
    VECTOR(kCompareULTEQ),
-    // Followed by a 16 bit address
-    kConditionalBranch,
    VECTOR(kConvertFtoI),
    VECTOR(kConvertStoF),
    VECTOR(kConvertUtoF),
@ -83,7 +81,7 @@ enum class ByteCodeInstruction : uint16_t {
    VECTOR(kMix),
    VECTOR_MATRIX(kMultiplyF),
    VECTOR(kMultiplyI),
-    kNot,
+    kNotB,
    kOrB,
    VECTOR_MATRIX(kPop),
    // Followed by a 32 bit value containing the value to push
@ -124,6 +122,22 @@ enum class ByteCodeInstruction : uint16_t {
    VECTOR(kTan),
    // Followed by a byte indicating external value to write
    VECTOR(kWriteExternal),
+    kXorB,
+
+    kMaskPush,
+    kMaskPop,
+    kMaskNegate,
+    // Followed by count byte
+    kMaskBlend,
+    // Followed by address
+    kBranchIfAllFalse,
+
+    kLoopBegin,
+    kLoopNext,
+    kLoopMask,
+    kLoopEnd,
+    kLoopBreak,
+    kLoopContinue,
 };
 #undef VECTOR

--- a/src/sksl/SkSLByteCodeGenerator.cpp
+++ b/src/sksl/SkSLByteCodeGenerator.cpp
@ -459,6 +459,24 @@ bool ByteCodeGenerator::writeBinaryExpression(const BinaryExpression& b, bool di
                                            ByteCodeInstruction::kMultiplyF,
                                            count);
                break;
+
+            case Token::Kind::LOGICALAND:
+                SkASSERT(type_category(lType) == SkSL::TypeCategory::kBool && count == 1);
+                this->write(ByteCodeInstruction::kAndB);
+                break;
+            case Token::Kind::LOGICALNOT:
+                SkASSERT(type_category(lType) == SkSL::TypeCategory::kBool && count == 1);
+                this->write(ByteCodeInstruction::kNotB);
+                break;
+            case Token::Kind::LOGICALOR:
+                SkASSERT(type_category(lType) == SkSL::TypeCategory::kBool && count == 1);
+                this->write(ByteCodeInstruction::kOrB);
+                break;
+            case Token::Kind::LOGICALXOR:
+                SkASSERT(type_category(lType) == SkSL::TypeCategory::kBool && count == 1);
+                this->write(ByteCodeInstruction::kXorB);
+                break;
+
            default:
                SkASSERT(false);
        }
@ -472,7 +490,7 @@ bool ByteCodeGenerator::writeBinaryExpression(const BinaryExpression& b, bool di

 void ByteCodeGenerator::writeBoolLiteral(const BoolLiteral& b) {
     this->write(ByteCodeInstruction::kPushImmediate);
-    this->write32(b.fValue ? 1 : 0);
+    // Booleans are pushed as full-width integer masks: ~0 (all bits set) for true, 0 for
+    // false, so the interpreter's bitwise kAndB/kOrB/kXorB/kNotB can act on them directly.
+    this->write32(b.fValue ? ~0 : 0);
 }

 void ByteCodeGenerator::writeConstructor(const Constructor& c) {
@ -744,14 +762,12 @@ void ByteCodeGenerator::writeSwizzle(const Swizzle& s) {

 void ByteCodeGenerator::writeTernaryExpression(const TernaryExpression& t) {
     this->writeExpression(*t.fTest);
-    this->write(ByteCodeInstruction::kConditionalBranch);
-    DeferredLocation trueLocation(this);
-    this->writeExpression(*t.fIfFalse);
-    this->write(ByteCodeInstruction::kBranch);
-    DeferredLocation endLocation(this);
-    trueLocation.set();
+    // No branches are emitted: code for both operands always follows the test. Sequence:
+    //   <test>, kMaskPush, <ifTrue>, kMaskNegate, <ifFalse>, kMaskBlend <slot count of result>
+    // NOTE(review): kMaskBlend presumably selects between the two results per lane using the
+    // pushed mask; its implementation is not part of this hunk -- confirm in the interpreter.
+    this->write(ByteCodeInstruction::kMaskPush);
     this->writeExpression(*t.fIfTrue);
-    endLocation.set();
+    this->write(ByteCodeInstruction::kMaskNegate);
+    this->writeExpression(*t.fIfFalse);
+    this->write(ByteCodeInstruction::kMaskBlend);
+    this->write8(SlotCount(t.fType));
 }

 void ByteCodeGenerator::writeExpression(const Expression& e, bool discard) {
@ -976,25 +992,29 @@ void ByteCodeGenerator::setContinueTargets() {
 }

 void ByteCodeGenerator::writeBreakStatement(const BreakStatement& b) {
-    this->write(ByteCodeInstruction::kBranch);
-    fBreakTargets.top().emplace_back(this);
+    // Emits a single kLoopBreak instruction; unlike the removed kBranch version, no branch or
+    // deferred jump target is written.
+    // TODO: Include BranchIfAllFalse to top-most LoopNext
+    this->write(ByteCodeInstruction::kLoopBreak);
 }

 void ByteCodeGenerator::writeContinueStatement(const ContinueStatement& c) {
-    this->write(ByteCodeInstruction::kBranch);
-    fContinueTargets.top().emplace_back(this);
+    // Emits a single kLoopContinue instruction; unlike the removed kBranch version, no branch or
+    // deferred jump target is written.
+    // TODO: Include BranchIfAllFalse to top-most LoopNext
+    this->write(ByteCodeInstruction::kLoopContinue);
 }

 void ByteCodeGenerator::writeDoStatement(const DoStatement& d) {
-    fContinueTargets.emplace();
-    fBreakTargets.emplace();
+    // Emitted layout:
+    //          kLoopBegin
+    //   start: <body>
+    //          kLoopNext
+    //          <test>
+    //          kLoopMask
+    //          kBranchIfAllFalse -> end
+    //          kBranch -> start
+    //   end:   kLoopEnd
+    // endLocation presumably reserves the kBranchIfAllFalse operand and set() patches it with
+    // the current code offset (DeferredLocation is defined outside this hunk).
+    this->write(ByteCodeInstruction::kLoopBegin);
     size_t start = fCode->size();
     this->writeStatement(*d.fStatement);
-    this->setContinueTargets();
+    this->write(ByteCodeInstruction::kLoopNext);
     this->writeExpression(*d.fTest);
-    this->write(ByteCodeInstruction::kConditionalBranch);
+    this->write(ByteCodeInstruction::kLoopMask);
+    // TODO: Could shorten this with kBranchIfAnyTrue
+    this->write(ByteCodeInstruction::kBranchIfAllFalse);
+    DeferredLocation endLocation(this);
+    this->write(ByteCodeInstruction::kBranch);
     this->write16(start);
-    this->setBreakTargets();
+    endLocation.set();
+    this->write(ByteCodeInstruction::kLoopEnd);
 }

 void ByteCodeGenerator::writeForStatement(const ForStatement& f) {
@ -1003,53 +1023,40 @@ void ByteCodeGenerator::writeForStatement(const ForStatement& f) {
    if (f.fInitializer) {
        this->writeStatement(*f.fInitializer);
    }
+    this->write(ByteCodeInstruction::kLoopBegin);
    size_t start = fCode->size();
    if (f.fTest) {
        this->writeExpression(*f.fTest);
-        this->write(ByteCodeInstruction::kNot);
-        this->write(ByteCodeInstruction::kConditionalBranch);
-        DeferredLocation endLocation(this);
-        this->writeStatement(*f.fStatement);
-        this->setContinueTargets();
-        if (f.fNext) {
-            this->writeExpression(*f.fNext, true);
-        }
-        this->write(ByteCodeInstruction::kBranch);
-        this->write16(start);
-        endLocation.set();
-    } else {
-        this->writeStatement(*f.fStatement);
-        this->setContinueTargets();
-        if (f.fNext) {
-            this->writeExpression(*f.fNext, true);
-        }
-        this->write(ByteCodeInstruction::kBranch);
-        this->write16(start);
+        this->write(ByteCodeInstruction::kLoopMask);
    }
-    this->setBreakTargets();
+    this->write(ByteCodeInstruction::kBranchIfAllFalse);
+    DeferredLocation endLocation(this);
+    this->writeStatement(*f.fStatement);
+    this->write(ByteCodeInstruction::kLoopNext);
+    if (f.fNext) {
+        this->writeExpression(*f.fNext, true);
+    }
+    this->write(ByteCodeInstruction::kBranch);
+    this->write16(start);
+    endLocation.set();
+    this->write(ByteCodeInstruction::kLoopEnd);
 }

 void ByteCodeGenerator::writeIfStatement(const IfStatement& i) {
+    // Emitted layout (the bracketed part only when there is an else clause):
+    //          <test>
+    //          kMaskPush
+    //          kBranchIfAllFalse -> false
+    //          <ifTrue>
+    //   false: [ kMaskNegate
+    //            kBranchIfAllFalse -> end
+    //            <ifFalse> ]
+    //   end:   kMaskPop
+    // The true block is emitted first and the else block second, the reverse of the removed
+    // code. NOTE(review): each kBranchIfAllFalse presumably skips its block only when no lane
+    // is active under the current mask; confirm in the interpreter.
+    this->writeExpression(*i.fTest);
+    this->write(ByteCodeInstruction::kMaskPush);
+    this->write(ByteCodeInstruction::kBranchIfAllFalse);
+    DeferredLocation falseLocation(this);
+    this->writeStatement(*i.fIfTrue);
+    falseLocation.set();
     if (i.fIfFalse) {
-        // if (test) { ..ifTrue.. } else { .. ifFalse .. }
-        this->writeExpression(*i.fTest);
-        this->write(ByteCodeInstruction::kConditionalBranch);
-        DeferredLocation trueLocation(this);
+        this->write(ByteCodeInstruction::kMaskNegate);
+        this->write(ByteCodeInstruction::kBranchIfAllFalse);
+        DeferredLocation endLocation(this);
         this->writeStatement(*i.fIfFalse);
-        this->write(ByteCodeInstruction::kBranch);
-        DeferredLocation endLocation(this);
-        trueLocation.set();
-        this->writeStatement(*i.fIfTrue);
-        endLocation.set();
-    } else {
-        // if (test) { ..ifTrue.. }
-        this->writeExpression(*i.fTest);
-        this->write(ByteCodeInstruction::kNot);
-        this->write(ByteCodeInstruction::kConditionalBranch);
-        DeferredLocation endLocation(this);
-        this->writeStatement(*i.fIfTrue);
         endLocation.set();
     }
+    this->write(ByteCodeInstruction::kMaskPop);
 }

 void ByteCodeGenerator::writeReturnStatement(const ReturnStatement& r) {
@ -1086,19 +1093,18 @@ void ByteCodeGenerator::writeVarDeclarations(const VarDeclarations& v) {
 }

 void ByteCodeGenerator::writeWhileStatement(const WhileStatement& w) {
-    fContinueTargets.emplace();
-    fBreakTargets.emplace();
-    size_t start = fCode->size();
+    // Emitted layout:
+    //          kLoopBegin
+    //   cond:  <test>
+    //          kLoopMask
+    //          kBranchIfAllFalse -> end
+    //          <body>
+    //          kLoopNext
+    //          kBranch -> cond
+    //   end:   kLoopEnd
+    // The test is no longer inverted with kNot: kBranchIfAllFalse takes the place of the old
+    // "not + conditional branch" pair.
+    this->write(ByteCodeInstruction::kLoopBegin);
+    size_t cond = fCode->size();
     this->writeExpression(*w.fTest);
-    this->write(ByteCodeInstruction::kNot);
-    this->write(ByteCodeInstruction::kConditionalBranch);
+    this->write(ByteCodeInstruction::kLoopMask);
+    this->write(ByteCodeInstruction::kBranchIfAllFalse);
     DeferredLocation endLocation(this);
     this->writeStatement(*w.fStatement);
-    this->setContinueTargets();
+    this->write(ByteCodeInstruction::kLoopNext);
     this->write(ByteCodeInstruction::kBranch);
-    this->write16(start);
+    this->write16(cond);
     endLocation.set();
-    this->setBreakTargets();
+    this->write(ByteCodeInstruction::kLoopEnd);
 }

 void ByteCodeGenerator::writeStatement(const Statement& s) {
--- a/src/sksl/SkSLInterpreter.cpp
+++ b/src/sksl/SkSLInterpreter.cpp
@ -8,6 +8,7 @@
 #ifndef SKSL_STANDALONE

 #include "include/core/SkPoint3.h"
+#include "include/private/SkVx.h"
 #include "src/sksl/SkSLByteCode.h"
 #include "src/sksl/SkSLByteCodeGenerator.h"
 #include "src/sksl/SkSLExternalValue.h"
@ -18,6 +19,16 @@
 namespace SkSL {
 namespace Interpreter {

+// Number of lanes the interpreter operates on at a time.
+constexpr int VecWidth = 16;
+
+// Lane vectors: VecWidth floats, signed ints, or unsigned ints.
+using F32 = skvx::Vec<VecWidth, float>;
+using I32 = skvx::Vec<VecWidth, int32_t>;
+using U32 = skvx::Vec<VecWidth, uint32_t>;
+
+// Needs to be the first N non-negative integers, at least as large as VecWidth
+// NOTE(review): 32 initializers are listed for a 16-lane vector; this presumably relies on
+// skvx::Vec's initializer-list constructor ignoring the surplus entries -- confirm in SkVx.h.
+static const I32 gLanes = {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+                            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
+
+
 template <typename T>
 static T unaligned_load(const void* ptr) {
    T val;
@ -72,9 +83,6 @@ static const uint8_t* disassemble_instruction(const uint8_t* ip) {
        VECTOR_DISASSEMBLE(kCompareUGTEQ, "compareugteq")
        VECTOR_DISASSEMBLE(kCompareULT, "compareult")
        VECTOR_DISASSEMBLE(kCompareULTEQ, "compareulteq")
-        case ByteCodeInstruction::kConditionalBranch:
-            printf("conditionalbranch %d", READ16());
-            break;
        VECTOR_DISASSEMBLE(kConvertFtoI, "convertftoi")
        VECTOR_DISASSEMBLE(kConvertStoF, "convertstof")
        VECTOR_DISASSEMBLE(kConvertUtoF, "convertutof")
@ -132,7 +140,7 @@ static const uint8_t* disassemble_instruction(const uint8_t* ip) {
        VECTOR_DISASSEMBLE(kMultiplyI, "multiplyi")
        VECTOR_MATRIX_DISASSEMBLE(kNegateF, "negatef")
        VECTOR_DISASSEMBLE(kNegateI, "negatei")
-        case ByteCodeInstruction::kNot: printf("not"); break;
+        case ByteCodeInstruction::kNotB: printf("notb"); break;
        case ByteCodeInstruction::kOrB: printf("orb"); break;
        VECTOR_MATRIX_DISASSEMBLE(kPop, "pop")
        case ByteCodeInstruction::kPushImmediate: {
@ -218,6 +226,20 @@ static const uint8_t* disassemble_instruction(const uint8_t* ip) {
        case ByteCodeInstruction::kWriteExternal2: printf("writeexternal2 %d", READ8()); break;
        case ByteCodeInstruction::kWriteExternal3: printf("writeexternal3 %d", READ8()); break;
        case ByteCodeInstruction::kWriteExternal4: printf("writeexternal4 %d", READ8()); break;
+        case ByteCodeInstruction::kXorB: printf("xorb"); break;
+        case ByteCodeInstruction::kMaskPush: printf("maskpush"); break;
+        case ByteCodeInstruction::kMaskPop: printf("maskpop"); break;
+        case ByteCodeInstruction::kMaskNegate: printf("masknegate"); break;
+        case ByteCodeInstruction::kMaskBlend: printf("maskblend %d", READ8()); break;
+        case ByteCodeInstruction::kBranchIfAllFalse:
+            printf("branchifallfalse %d", READ16());
+            break;
+        case ByteCodeInstruction::kLoopBegin: printf("loopbegin"); break;
+        case ByteCodeInstruction::kLoopNext: printf("loopnext"); break;
+        case ByteCodeInstruction::kLoopMask: printf("loopmask"); break;
+        case ByteCodeInstruction::kLoopEnd: printf("loopend"); break;
+        case ByteCodeInstruction::kLoopContinue: printf("loopcontinue"); break;
+        case ByteCodeInstruction::kLoopBreak: printf("loopbreak"); break;
        default: printf("unknown(%d)\n", *(ip - 1)); SkASSERT(false);
    }
    return ip;
@ -294,27 +316,80 @@ void Disassemble(const ByteCodeFunction* f) {
    case ByteCodeInstruction::base:      sp[ 0] = fn(sp[ 0].field); \
                                         break;

+// Expands to the case labels for `base` and its 2/3/4-wide variants. Applies the scalar
+// function `fn`, in place, to every lane of the top `count` stack slots, where count is 1 for
+// `base` and 2..4 for `base##2`..`base##4` (this assumes the four enumerators are consecutive).
+// Relies on `inst`, `sp` and `VecWidth` being in scope at the expansion site, and on each
+// stack slot being VecWidth contiguous floats.
+#define VECTOR_UNARY_FN_VEC(base, fn)                               \
+    case ByteCodeInstruction::base ## 4:                            \
+    case ByteCodeInstruction::base ## 3:                            \
+    case ByteCodeInstruction::base ## 2:                            \
+    case ByteCodeInstruction::base     : {                          \
+        int count = (int)inst - (int)ByteCodeInstruction::base + 1; \
+        /* Step back in whole stack slots first, then view the   */ \
+        /* slots as floats. Casting before the subtraction would */ \
+        /* move by (count - 1) floats instead of slots.          */ \
+        float* v = (float*)(sp - count + 1);                        \
+        for (int i = VecWidth * count; i > 0; --i, ++v) {           \
+            *v = fn(*v);                                            \
+        }                                                           \
+        break;                                                      \
+    }
+
+// One stack slot of the vectorized interpreter: a lane vector that can be viewed as floats,
+// signed ints, or unsigned ints. The converting constructors let an F32/I32/U32 result be
+// stored into a slot directly (e.g. `sp[0] = ~sp[0].fSigned`).
+union VValue {
+    // Initializes no member.
+    VValue() {}
+
+    VValue(F32 f)
+        : fFloat(f) {
+    }
+
+    VValue(I32 s)
+        : fSigned(s) {
+    }
+
+    VValue(U32 u)
+        : fUnsigned(u) {
+    }
+
+    F32 fFloat;
+    I32 fSigned;
+    U32 fUnsigned;
+};
+
 struct StackFrame {
+    // Caller state saved by kCall before it switches to the callee: the caller's code start,
+    // its instruction pointer, and the base of its (now vector-valued) stack.
     const uint8_t* fCode;
     const uint8_t* fIP;
-    Interpreter::Value* fStack;
+    VValue* fStack;
 };

-static float mix(float start, float end, float t) {
+// Linear interpolation, now on lane vectors: yields start where t == 0 and end where t == 1.
+static F32 mix(F32 start, F32 end, F32 t) {
     return start * (1 - t) + end * t;
 }

-void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack, Value* outReturn,
-              Value globals[], int globalCount) {
-    Value* sp = stack + f->fParameterCount + f->fLocalCount - 1;
+// Remainder of a / b, computed as a - trunc(a / b) * b using skvx::trunc.
+// TODO: trunc on integers?
+template <typename T>
+static T vec_mod(T a, T b) {
+    return a - skvx::trunc(a / b) * b;
+}

-    auto POP =  [&]          { SkASSERT(sp     >= stack); return *(sp--); };
-    auto PUSH = [&](Value v) { SkASSERT(sp + 1 >= stack); *(++sp) = v;    };
+void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, VValue* stack, Value* outReturn,
+              I32 initMask, VValue globals[], int globalCount) {
+    VValue* sp = stack + f->fParameterCount + f->fLocalCount - 1;
+
+    auto POP =  [&]           { SkASSERT(sp     >= stack); return *(sp--); };
+    auto PUSH = [&](VValue v) { SkASSERT(sp + 1 >= stack); *(++sp) = v;    };

    const uint8_t* code = f->fCode.data();
    const uint8_t* ip = code;
    std::vector<StackFrame> frames;

+    I32 condStack[16];  // Independent condition masks
+    I32 maskStack[16];  // Combined masks (eg maskStack[0] & maskStack[1] & ...)
+    I32 contStack[16];  // Continue flags for loops
+    I32 loopStack[16];  // Loop execution masks
+    condStack[0] = maskStack[0] = initMask;
+    contStack[0] = I32( 0);
+    loopStack[0] = I32(~0);
+    I32* condPtr = condStack;
+    I32* maskPtr = maskStack;
+    I32* contPtr = contStack;
+    I32* loopPtr = loopStack;
+
+    auto mask = [&]() { return *maskPtr & *loopPtr; };
+
    for (;;) {
 #ifdef TRACE
        printf("at %3d  ", (int) (ip - code));
@ -325,8 +400,21 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
        switch (inst) {
            VECTOR_BINARY_OP(kAddI, fSigned, +)
            VECTOR_MATRIX_BINARY_OP(kAddF, fFloat, +)
+
+            // Booleans are integer masks: 0/~0 for false/true. So bitwise ops do what we want:
            case ByteCodeInstruction::kAndB:
-                sp[-1] = sp[-1].fBool && sp[0].fBool;
+                sp[-1] = sp[-1].fSigned & sp[0].fSigned;
+                POP();
+                break;
+            case ByteCodeInstruction::kNotB:
+                sp[0] = ~sp[0].fSigned;
+                break;
+            case ByteCodeInstruction::kOrB:
+                sp[-1] = sp[-1].fSigned | sp[0].fSigned;
+                POP();
+                break;
+            case ByteCodeInstruction::kXorB:
+                sp[-1] = sp[-1].fSigned ^ sp[0].fSigned;
                POP();
                break;

@ -340,10 +428,15 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
                // (plus space for locals).
                int target = READ8();
                const ByteCodeFunction* fun = byteCode->fFunctions[target].get();
-                frames.push_back({ code, ip, stack });
-                ip = code = fun->fCode.data();
-                stack = sp - fun->fParameterCount + 1;
-                sp = stack + fun->fParameterCount + fun->fLocalCount - 1;
+                if (skvx::any(mask())) {
+                    frames.push_back({ code, ip, stack });
+                    ip = code = fun->fCode.data();
+                    stack = sp - fun->fParameterCount + 1;
+                    sp = stack + fun->fParameterCount + fun->fLocalCount - 1;
+                } else {
+                    sp -= fun->fParameterCount;
+                    sp += fun->fReturnCount;
+                }
                break;
            }

@ -354,10 +447,23 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
                ExternalValue* v = byteCode->fExternalValues[target];
                sp -= argumentCount - 1;

-                Value tmp[4];
-                SkASSERT(returnCount <= (int)SK_ARRAY_COUNT(tmp));
-                v->call(sp, tmp);
-                memcpy(sp, tmp, returnCount * sizeof(Value));
+                Value tmpArgs[4];
+                Value tmpReturn[4];
+                SkASSERT(argumentCount <= (int)SK_ARRAY_COUNT(tmpArgs));
+                SkASSERT(returnCount <= (int)SK_ARRAY_COUNT(tmpReturn));
+
+                I32 m = mask();
+                for (int i = 0; i < VecWidth; ++i) {
+                    if (m[i]) {
+                        for (int j = 0; j < argumentCount; ++j) {
+                            tmpArgs[j].fSigned = sp[j].fSigned[i];
+                        }
+                        v->call(tmpArgs, tmpReturn);
+                        for (int j = 0; j < returnCount; ++j) {
+                            sp[j].fSigned[i] = tmpReturn[j].fSigned;
+                        }
+                    }
+                }
                sp += returnCount - 1;
                break;
            }
@ -379,45 +485,36 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
            VECTOR_BINARY_OP(kCompareULTEQ, fUnsigned, <=)
            VECTOR_BINARY_OP(kCompareFLTEQ, fFloat, <=)

-            case ByteCodeInstruction::kConditionalBranch: {
-                int target = READ16();
-                if (POP().fBool) {
-                    ip = code + target;
-                }
-                break;
-            }
-
-            case ByteCodeInstruction::kConvertFtoI4: sp[-3].fSigned = (int)sp[-3].fFloat;
-            case ByteCodeInstruction::kConvertFtoI3: sp[-2].fSigned = (int)sp[-2].fFloat;
-            case ByteCodeInstruction::kConvertFtoI2: sp[-1].fSigned = (int)sp[-1].fFloat;
-            case ByteCodeInstruction::kConvertFtoI:  sp[ 0].fSigned = (int)sp[ 0].fFloat;
+            case ByteCodeInstruction::kConvertFtoI4: sp[-3] = skvx::cast<int>(sp[-3].fFloat);
+            case ByteCodeInstruction::kConvertFtoI3: sp[-2] = skvx::cast<int>(sp[-2].fFloat);
+            case ByteCodeInstruction::kConvertFtoI2: sp[-1] = skvx::cast<int>(sp[-1].fFloat);
+            case ByteCodeInstruction::kConvertFtoI:  sp[ 0] = skvx::cast<int>(sp[ 0].fFloat);
                                                     break;

-            case ByteCodeInstruction::kConvertStoF4: sp[-3].fFloat = sp[-3].fSigned;
-            case ByteCodeInstruction::kConvertStoF3: sp[-2].fFloat = sp[-2].fSigned;
-            case ByteCodeInstruction::kConvertStoF2: sp[-1].fFloat = sp[-1].fSigned;
-            case ByteCodeInstruction::kConvertStoF : sp[ 0].fFloat = sp[ 0].fSigned;
+            case ByteCodeInstruction::kConvertStoF4: sp[-3] = skvx::cast<float>(sp[-3].fSigned);
+            case ByteCodeInstruction::kConvertStoF3: sp[-2] = skvx::cast<float>(sp[-2].fSigned);
+            case ByteCodeInstruction::kConvertStoF2: sp[-1] = skvx::cast<float>(sp[-1].fSigned);
+            case ByteCodeInstruction::kConvertStoF : sp[ 0] = skvx::cast<float>(sp[ 0].fSigned);
                                                     break;

-            case ByteCodeInstruction::kConvertUtoF4: sp[-3].fFloat = sp[-3].fUnsigned;
-            case ByteCodeInstruction::kConvertUtoF3: sp[-2].fFloat = sp[-2].fUnsigned;
-            case ByteCodeInstruction::kConvertUtoF2: sp[-1].fFloat = sp[-1].fUnsigned;
-            case ByteCodeInstruction::kConvertUtoF : sp[ 0].fFloat = sp[ 0].fUnsigned;
+            case ByteCodeInstruction::kConvertUtoF4: sp[-3] = skvx::cast<float>(sp[-3].fUnsigned);
+            case ByteCodeInstruction::kConvertUtoF3: sp[-2] = skvx::cast<float>(sp[-2].fUnsigned);
+            case ByteCodeInstruction::kConvertUtoF2: sp[-1] = skvx::cast<float>(sp[-1].fUnsigned);
+            case ByteCodeInstruction::kConvertUtoF : sp[ 0] = skvx::cast<float>(sp[ 0].fUnsigned);
                                                     break;

-            VECTOR_UNARY_FN(kCos, cosf, fFloat)
+            VECTOR_UNARY_FN_VEC(kCos, cosf)

            case ByteCodeInstruction::kCross: {
-                SkPoint3 cross = SkPoint3::CrossProduct(SkPoint3::Make(sp[-5].fFloat,
-                                                                       sp[-4].fFloat,
-                                                                       sp[-3].fFloat),
-                                                        SkPoint3::Make(sp[-2].fFloat,
-                                                                       sp[-1].fFloat,
-                                                                       sp[ 0].fFloat));
+                F32 ax = sp[-5].fFloat, ay = sp[-4].fFloat, az = sp[-3].fFloat,
+                    bx = sp[-2].fFloat, by = sp[-1].fFloat, bz = sp[ 0].fFloat;
+                F32 cx = ay*bz - az*by,
+                    cy = az*bx - ax*bz,
+                    cz = ax*by - ay*bx;
                sp -= 3;
-                sp[-2] = cross.fX;
-                sp[-1] = cross.fY;
-                sp[ 0] = cross.fZ;
+                sp[-2] = cx;
+                sp[-1] = cy;
+                sp[ 0] = cz;
                break;
            }

@ -433,7 +530,7 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,

            case ByteCodeInstruction::kDupN: {
                int count = READ8();
-                memcpy(sp + 1, sp - count + 1, count * sizeof(Value));
+                memcpy(sp + 1, sp - count + 1, count * sizeof(VValue));
                sp += count;
                break;
            }
@ -457,17 +554,30 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,

            case ByteCodeInstruction::kLoadExtended: {
                int count = READ8();
-                int src = POP().fSigned;
-                memcpy(sp + 1, &stack[src], count * sizeof(Value));
+                I32 src = POP().fSigned;
+                I32 m = mask();
+                for (int i = 0; i < count; ++i) {
+                    for (int j = 0; j < VecWidth; ++j) {
+                        if (m[j]) {
+                            sp[i + 1].fSigned[j] = stack[src[j] + i].fSigned[j];
+                        }
+                    }
+                }
                sp += count;
                break;
            }

            case ByteCodeInstruction::kLoadExtendedGlobal: {
                int count = READ8();
-                int src = POP().fSigned;
-                SkASSERT(src + count <= globalCount);
-                memcpy(sp + 1, &globals[src], count * sizeof(Value));
+                I32 src = POP().fSigned;
+                I32 m = mask();
+                for (int i = 0; i < count; ++i) {
+                    for (int j = 0; j < VecWidth; ++j) {
+                        if (m[j]) {
+                            sp[i + 1].fSigned[j] = globals[src[j] + i].fSigned[j];
+                        }
+                    }
+                }
                sp += count;
                break;
            }
@ -502,15 +612,17 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
                SkASSERT(srcRows >= 2 && srcRows <= 4);
                SkASSERT(dstCols >= 2 && dstCols <= 4);
                SkASSERT(dstRows >= 2 && dstRows <= 4);
-                SkMatrix44 m;
+                F32 tmp[16];
+                memset(tmp, 0, sizeof(tmp));
+                tmp[0] = tmp[5] = tmp[10] = tmp[15] = F32(1.0f);
                for (int c = srcCols - 1; c >= 0; --c) {
                    for (int r = srcRows - 1; r >= 0; --r) {
-                        m.set(r, c, POP().fFloat);
+                        tmp[c*4 + r] = POP().fFloat;
                    }
                }
                for (int c = 0; c < dstCols; ++c) {
                    for (int r = 0; r < dstRows; ++r) {
-                        PUSH(m.get(r, c));
+                        PUSH(tmp[c*4 + r]);
                    }
                }
                break;
@ -521,9 +633,9 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
                int lRows = READ8();
                int rCols = READ8();
                int rRows = lCols;
-                float tmp[16] = { 0.0f };
-                float* B = &(sp - (rCols * rRows) + 1)->fFloat;
-                float* A = B - (lCols * lRows);
+                F32 tmp[16] = { 0.0f };
+                F32* B = &(sp - (rCols * rRows) + 1)->fFloat;
+                F32* A = B - (lCols * lRows);
                for (int c = 0; c < rCols; ++c) {
                    for (int r = 0; r < lRows; ++r) {
                        for (int j = 0; j < lCols; ++j) {
@ -532,7 +644,7 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
                    }
                }
                sp -= (lCols * lRows) + (rCols * rRows);
-                memcpy(sp + 1, tmp, rCols * lRows * sizeof(Value));
+                memcpy(sp + 1, tmp, rCols * lRows * sizeof(VValue));
                sp += (rCols * lRows);
                break;
            }
@ -564,10 +676,6 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
            VECTOR_BINARY_OP(kMultiplyI, fSigned, *)
            VECTOR_MATRIX_BINARY_OP(kMultiplyF, fFloat, *)

-            case ByteCodeInstruction::kNot:
-                sp[0].fBool = !sp[0].fBool;
-                break;
-
            case ByteCodeInstruction::kNegateF4: sp[-3] = -sp[-3].fFloat;
            case ByteCodeInstruction::kNegateF3: sp[-2] = -sp[-2].fFloat;
            case ByteCodeInstruction::kNegateF2: sp[-1] = -sp[-1].fFloat;
@ -585,14 +693,9 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
            case ByteCodeInstruction::kNegateI4: sp[-3] = -sp[-3].fSigned;
            case ByteCodeInstruction::kNegateI3: sp[-2] = -sp[-2].fSigned;
            case ByteCodeInstruction::kNegateI2: sp[-1] = -sp[-1].fSigned;
-            case ByteCodeInstruction::kNegateI : sp[ 0] = -sp [0].fSigned;
+            case ByteCodeInstruction::kNegateI : sp[ 0] = -sp[ 0].fSigned;
                                                 break;

-            case ByteCodeInstruction::kOrB:
-                sp[-1] = sp[-1].fBool || sp[0].fBool;
-                POP();
-                break;
-
            case ByteCodeInstruction::kPop4: POP();
            case ByteCodeInstruction::kPop3: POP();
            case ByteCodeInstruction::kPop2: POP();
@ -604,34 +707,56 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
                break;

            case ByteCodeInstruction::kPushImmediate:
-                PUSH(READ32());
+                PUSH(U32(READ32()));
                break;

-            case ByteCodeInstruction::kReadExternal:  // fall through
-            case ByteCodeInstruction::kReadExternal2: // fall through
-            case ByteCodeInstruction::kReadExternal3: // fall through
+            case ByteCodeInstruction::kReadExternal:
+            case ByteCodeInstruction::kReadExternal2:
+            case ByteCodeInstruction::kReadExternal3:
            case ByteCodeInstruction::kReadExternal4: {
+                // TODO: Support striped external values, or passing lane index? This model is odd.
+                int count = (int)inst - (int)ByteCodeInstruction::kReadExternal + 1;
                int src = READ8();
-                byteCode->fExternalValues[src]->read(sp + 1);
-                sp += (int) inst - (int) ByteCodeInstruction::kReadExternal + 1;
+                int32_t tmp[4];
+                I32 m = mask();
+                for (int i = 0; i < VecWidth; ++i) {
+                    if (m[i]) {
+                        byteCode->fExternalValues[src]->read(tmp);
+                        for (int j = 0; j < count; ++j) {
+                            sp[j + 1].fSigned[i] = tmp[j];
+                        }
+                    }
+                }
+                sp += count;
                break;
            }

-            VECTOR_BINARY_FN(kRemainderF, fFloat, fmodf)
-            VECTOR_BINARY_OP(kRemainderS, fSigned, %)
-            VECTOR_BINARY_OP(kRemainderU,  fUnsigned, %)
+            VECTOR_BINARY_FN(kRemainderF, fFloat, vec_mod<F32>)
+            VECTOR_BINARY_FN(kRemainderS, fSigned, vec_mod<I32>)
+            VECTOR_BINARY_FN(kRemainderU, fUnsigned, vec_mod<U32>)

            case ByteCodeInstruction::kReturn: {
                int count = READ8();
                if (frames.empty()) {
                    if (outReturn) {
-                        memcpy(outReturn, sp - count + 1, count * sizeof(Value));
+                        // TODO: This can be smarter, knowing that mask is left-justified
+                        I32 m = mask();
+                        VValue* src = sp - count + 1;
+                        for (int i = 0; i < count; ++i) {
+                            for (int j = 0; j < VecWidth; ++j) {
+                                if (m[j]) {
+                                    outReturn[count * j].fSigned = src->fSigned[j];
+                                }
+                            }
+                            ++outReturn;
+                            ++src;
+                        }
                    }
                    return;
                } else {
                    // When we were called, 'stack' was positioned at the old top-of-stack (where
                    // our parameters were placed). So copy our return values to that same spot.
-                    memmove(stack, sp - count + 1, count * sizeof(Value));
+                    memmove(stack, sp - count + 1, count * sizeof(VValue));

                    // Now move the stack pointer to the end of the just-pushed return values,
                    // and restore everything else.
@ -648,44 +773,67 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
            case ByteCodeInstruction::kScalarToMatrix: {
                int cols = READ8();
                int rows = READ8();
-                Value v = POP();
+                VValue v = POP();
                for (int c = 0; c < cols; ++c) {
                    for (int r = 0; r < rows; ++r) {
-                        PUSH(c == r ? v : 0.0f);
+                        PUSH(c == r ? v : F32(0.0f));
                    }
                }
                break;
            }

-            VECTOR_UNARY_FN(kSin, sinf, fFloat)
-            VECTOR_UNARY_FN(kSqrt, sqrtf, fFloat)
+            VECTOR_UNARY_FN_VEC(kSin, sinf)
+            VECTOR_UNARY_FN(kSqrt, skvx::sqrt, fFloat)

-            case ByteCodeInstruction::kStore4: stack[*ip + 3] = POP();
-            case ByteCodeInstruction::kStore3: stack[*ip + 2] = POP();
-            case ByteCodeInstruction::kStore2: stack[*ip + 1] = POP();
-            case ByteCodeInstruction::kStore : stack[*ip + 0] = POP();
-                                               ++ip;
-                                               break;
+            case ByteCodeInstruction::kStore4:
+                stack[*ip+3] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+3].fFloat);
+            case ByteCodeInstruction::kStore3:
+                stack[*ip+2] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+2].fFloat);
+            case ByteCodeInstruction::kStore2:
+                stack[*ip+1] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+1].fFloat);
+            case ByteCodeInstruction::kStore :
+                stack[*ip+0] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+0].fFloat);
+                ++ip;
+                break;

-            case ByteCodeInstruction::kStoreGlobal4: globals[*ip + 3] = POP();
-            case ByteCodeInstruction::kStoreGlobal3: globals[*ip + 2] = POP();
-            case ByteCodeInstruction::kStoreGlobal2: globals[*ip + 1] = POP();
-            case ByteCodeInstruction::kStoreGlobal : globals[*ip + 0] = POP();
-                                                     ++ip;
-                                                     break;
+            case ByteCodeInstruction::kStoreGlobal4:
+                globals[*ip+3] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+3].fFloat);
+            case ByteCodeInstruction::kStoreGlobal3:
+                globals[*ip+2] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+2].fFloat);
+            case ByteCodeInstruction::kStoreGlobal2:
+                globals[*ip+1] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+1].fFloat);
+            case ByteCodeInstruction::kStoreGlobal :
+                globals[*ip+0] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+0].fFloat);
+                ++ip;
+                break;

            case ByteCodeInstruction::kStoreExtended: {
                int count = READ8();
-                int target = POP().fSigned;
-                memcpy(&stack[target], sp - count + 1, count * sizeof(Value));
+                I32 target = POP().fSigned;
+                VValue* src = sp - count + 1;
+                I32 m = mask();
+                for (int i = 0; i < count; ++i) {
+                    for (int j = 0; j < VecWidth; ++j) {
+                        if (m[j]) {
+                            stack[target[j] + i].fSigned[j] = src[i].fSigned[j];
+                        }
+                    }
+                }
                sp -= count;
                break;
            }
            case ByteCodeInstruction::kStoreExtendedGlobal: {
                int count = READ8();
-                int target = POP().fSigned;
-                SkASSERT(target + count <= globalCount);
-                memcpy(&globals[target], sp - count + 1, count * sizeof(Value));
+                I32 target = POP().fSigned;
+                VValue* src = sp - count + 1;
+                I32 m = mask();
+                for (int i = 0; i < count; ++i) {
+                    for (int j = 0; j < VecWidth; ++j) {
+                        if (m[j]) {
+                            globals[target[j] + i].fSigned[j] = src[i].fSigned[j];
+                        }
+                    }
+                }
                sp -= count;
                break;
            }
@ -694,7 +842,8 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
                int target = READ8();
                int count = READ8();
                for (int i = count - 1; i >= 0; --i) {
-                    stack[target + *(ip + i)] = POP();
+                    stack[target + *(ip + i)] = skvx::if_then_else(
+                            mask(), POP().fFloat, stack[target + *(ip + i)].fFloat);
                }
                ip += count;
                break;
@ -704,25 +853,40 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
                int target = READ8();
                int count = READ8();
                for (int i = count - 1; i >= 0; --i) {
-                    globals[target + *(ip + i)] = POP();
+                    globals[target + *(ip + i)] = skvx::if_then_else(
+                            mask(), POP().fFloat, globals[target + *(ip + i)].fFloat);
                }
                ip += count;
                break;
            }
+
            case ByteCodeInstruction::kStoreSwizzleIndirect: {
-                int target = POP().fSigned;
                int count = READ8();
+                I32 target = POP().fSigned;
+                I32 m = mask();
                for (int i = count - 1; i >= 0; --i) {
-                    stack[target + *(ip + i)] = POP();
+                    I32 v = POP().fSigned;
+                    for (int j = 0; j < VecWidth; ++j) {
+                        if (m[j]) {
+                            stack[target[j] + *(ip + i)].fSigned[j] = v[j];
+                        }
+                    }
                }
                ip += count;
                break;
            }
+
            case ByteCodeInstruction::kStoreSwizzleIndirectGlobal: {
-                int target = POP().fSigned;
                int count = READ8();
+                I32 target = POP().fSigned;
+                I32 m = mask();
                for (int i = count - 1; i >= 0; --i) {
-                    globals[target + *(ip + i)] = POP();
+                    I32 v = POP().fSigned;
+                    for (int j = 0; j < VecWidth; ++j) {
+                        if (m[j]) {
+                            globals[target[j] + *(ip + i)].fSigned[j] = v[j];
+                        }
+                    }
                }
                ip += count;
                break;
@ -732,7 +896,7 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
            VECTOR_MATRIX_BINARY_OP(kSubtractF, fFloat, -)

            case ByteCodeInstruction::kSwizzle: {
-                Value tmp[4];
+                VValue tmp[4];
                for (int i = READ8() - 1; i >= 0; --i) {
                    tmp[i] = POP();
                }
@ -742,70 +906,150 @@ void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack,
                break;
            }

-            VECTOR_UNARY_FN(kTan, tanf, fFloat)
+            VECTOR_UNARY_FN_VEC(kTan, tanf)

-            case ByteCodeInstruction::kWriteExternal:  // fall through
-            case ByteCodeInstruction::kWriteExternal2: // fall through
-            case ByteCodeInstruction::kWriteExternal3: // fall through
+            case ByteCodeInstruction::kWriteExternal:
+            case ByteCodeInstruction::kWriteExternal2:
+            case ByteCodeInstruction::kWriteExternal3:
            case ByteCodeInstruction::kWriteExternal4: {
-                int count = (int) inst - (int) ByteCodeInstruction::kWriteExternal + 1;
+                int count = (int)inst - (int)ByteCodeInstruction::kWriteExternal + 1;
                int target = READ8();
-                byteCode->fExternalValues[target]->write(sp - count + 1);
+                int32_t tmp[4];
+                I32 m = mask();
                sp -= count;
+                for (int i = 0; i < VecWidth; ++i) {
+                    if (m[i]) {
+                        for (int j = 0; j < count; ++j) {
+                            tmp[j] = sp[j + 1].fSigned[i];
+                        }
+                        byteCode->fExternalValues[target]->write(tmp);
+                    }
+                }
+                break;
+            }
+
+            case ByteCodeInstruction::kMaskPush:
+                condPtr[1] = POP().fSigned;
+                maskPtr[1] = maskPtr[0] & condPtr[1];
+                ++condPtr; ++maskPtr;
+                break;
+            case ByteCodeInstruction::kMaskPop:
+                --condPtr; --maskPtr;
+                break;
+            case ByteCodeInstruction::kMaskNegate:
+                maskPtr[0] = maskPtr[-1] & ~condPtr[0];
+                break;
+            case ByteCodeInstruction::kMaskBlend: {
+                int count = READ8();
+                I32 m = condPtr[0];
+                --condPtr; --maskPtr;
+                for (int i = 0; i < count; ++i) {
+                    sp[-count] = skvx::if_then_else(m, sp[-count].fFloat, sp[0].fFloat);
+                    --sp;
+                }
+                break;
+            }
+            case ByteCodeInstruction::kBranchIfAllFalse: {
+                int target = READ16();
+                if (!skvx::any(mask())) {
+                    ip = code + target;
+                }
+                break;
+            }
+
+            case ByteCodeInstruction::kLoopBegin:
+                *(++contPtr) =  0;
+                *(++loopPtr) = ~0;
+                break;
+            case ByteCodeInstruction::kLoopNext:
+                *loopPtr |= *contPtr;
+                *contPtr = 0;
+                break;
+            case ByteCodeInstruction::kLoopMask:
+                *loopPtr &= POP().fSigned;
+                break;
+            case ByteCodeInstruction::kLoopEnd:
+                --contPtr; --loopPtr;
+                break;
+            case ByteCodeInstruction::kLoopBreak:
+                *loopPtr &= ~mask();
+                break;
+            case ByteCodeInstruction::kLoopContinue: {
+                I32 m = mask();
+                *contPtr |=  m;
+                *loopPtr &= ~m;
                break;
            }

            default:
                SkDEBUGFAILF("unsupported instruction %d\n", (int) inst);
        }
+    }
+}
+
+// Executes 'f' for N independent invocations, processing at most VecWidth of them per pass.
+//
+//   args:      N consecutive groups of f->fParameterCount values, one group per invocation.
+//              Slots that belong to out-parameters are overwritten with their final values.
+//   outReturn: when non-null, receives f->fReturnCount values per invocation, in invocation
+//              order. Pass nullptr to discard the return values.
+//   uniforms:  uniformCount values, stored into the global slots listed in
+//              byteCode->fInputSlots before the first pass.
+void VecRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value args[], Value* outReturn,
+            int N, Value uniforms[], int uniformCount) {
 #ifdef TRACE
-        int stackSize = (int) (sp - stack + 1);
-        printf("STACK(%d):", stackSize);
-        for (int i = 0; i < stackSize; ++i) {
-            printf(" %d(%g)", stack[i].fSigned, stack[i].fFloat);
-        }
-        printf("\n");
+    disassemble(f);
 #endif
+    // NOTE(review): fixed-size stack; nothing here checks it against what 'f' needs - confirm.
+    VValue smallStack[128];
+
+    SkASSERT(uniformCount == (int)byteCode->fInputSlots.size());
+    VValue smallGlobals[32];
+    VValue* globals = smallGlobals;
+    SkASSERT((int)SK_ARRAY_COUNT(smallGlobals) >= byteCode->fGlobalCount);
+    // Uniforms are shared by every invocation, so they are loaded once, outside the batch loop.
+    for (uint8_t slot : byteCode->fInputSlots) {
+        globals[slot].fUnsigned = (uniforms++)->fUnsigned;
+    }
+
+    while (N) {
+        VValue* stack = smallStack;
+
+        // Number of invocations handled by this pass.
+        int w = std::min(N, VecWidth);
+        N -= w;
+
+        // Transpose args into stack
+        // (caller layout is one group per invocation; the stack wants one VecWidth-wide row per
+        // parameter slot, so invocation i lands in element i of each row).
+        {
+            uint32_t* src = (uint32_t*)args;
+            for (int i = 0; i < w; ++i) {
+                uint32_t* dst = (uint32_t*)stack + i;
+                for (int j = f->fParameterCount; j > 0; --j) {
+                    *dst = *src++;
+                    dst += VecWidth;
+                }
+            }
+        }
+
+        // NOTE(review): gLanes is defined outside this view; presumably it holds the lane
+        // indices, so this enables exactly the first w lanes - confirm.
+        auto mask = w > gLanes;
+        innerRun(byteCode, f, stack, outReturn, mask, globals, byteCode->fGlobalCount);
+
+        // Transpose out parameters back
+        {
+            uint32_t* dst = (uint32_t*)args;
+            for (int i = 0; i < w; ++i) {
+                uint32_t* src = (uint32_t*)stack + i;
+                for (const auto& p : f->fParameters) {
+                    if (p.fIsOutParameter) {
+                        for (int j = p.fSlotCount; j > 0; --j) {
+                            *dst++ = *src;
+                            src += VecWidth;
+                        }
+                    } else {
+                        // Input-only parameter: leave the caller's values untouched.
+                        dst += p.fSlotCount;
+                        src += p.fSlotCount * VecWidth;
+                    }
+                }
+            }
+        }
+
+        args += f->fParameterCount * w;
+        // Only advance a real output pointer. Offsetting a null outReturn would turn it into a
+        // non-null garbage address, so innerRun's 'if (outReturn)' test would pass on the next
+        // batch and the return values would be written through it.
+        if (outReturn) {
+            outReturn += f->fReturnCount * w;
+        }
     }
 }

 // Scalar entry point: runs 'f' exactly once. All of the work (stack and global setup, copying
 // out-parameters back into 'args') is delegated to VecRun with an invocation count of 1.
 void Run(const ByteCode* byteCode, const ByteCodeFunction* f, Value args[], Value* outReturn,
          Value uniforms[], int uniformCount) {
-#ifdef TRACE
-    disassemble(f);
-#endif
-    Value smallStack[128];
-    std::unique_ptr<Value[]> largeStack;
-    Value* stack = smallStack;
-    if ((int)SK_ARRAY_COUNT(smallStack) < f->fStackCount) {
-        largeStack.reset(new Value[f->fStackCount]);
-        stack = largeStack.get();
-    }
-
-    if (f->fParameterCount) {
-        memcpy(stack, args, f->fParameterCount * sizeof(Value));
-    }
-
-    SkASSERT(uniformCount == (int)byteCode->fInputSlots.size());
-    Value smallGlobals[32];
-    std::unique_ptr<Value[]> largeGlobals;
-    Value* globals = smallGlobals;
-    if ((int)SK_ARRAY_COUNT(smallGlobals) < byteCode->fGlobalCount) {
-        largeGlobals.reset(new Value[byteCode->fGlobalCount]);
-        globals = largeGlobals.get();
-    }
-    for (uint8_t slot : byteCode->fInputSlots) {
-        globals[slot] = *uniforms++;
-    }
-    innerRun(byteCode, f, stack, outReturn, globals, byteCode->fGlobalCount);
-
-    for (const auto& p : f->fParameters) {
-        if (p.fIsOutParameter) {
-            memcpy(args, stack, p.fSlotCount * sizeof(Value));
-        }
-        args += p.fSlotCount;
-        stack += p.fSlotCount;
-    }
+    VecRun(byteCode, f, args, outReturn, 1, uniforms, uniformCount);
 }

 } // namespace Interpreter
--- a/src/sksl/SkSLInterpreter.h
+++ b/src/sksl/SkSLInterpreter.h
@ -46,6 +46,9 @@ namespace Interpreter {
    void Run(const ByteCode*, const ByteCodeFunction*, Value args[], Value* outReturn,
             Value uniforms[], int uniformCount);

    /**
     * Runs the function for N invocations. 'args' holds N consecutive groups of parameter
     * values, one group per invocation; slots of out-parameters are overwritten in place.
     * When 'outReturn' is non-null, each invocation's return values are stored to it in
     * invocation order. 'uniforms' are shared by all invocations.
     */
+    void VecRun(const ByteCode*, const ByteCodeFunction*, Value args[], Value* outReturn,
+                int N, Value uniforms[], int uniformCount);
+
    /**
     * Print bytecode disassembly to stdout.
     */
--- a/src/sksl/sksl_interp.inc
+++ b/src/sksl/sksl_interp.inc
@ -1,9 +1,13 @@
 STRINGIFY(
-    float cos(float y);
+    $genType cos($genType y);
+    $genHType cos($genHType y);
    float3 cross(float3 x, float3 y);
    float dot($genType x, $genType y);
    $genType mix($genType x, $genType y, float t);
-    float sin(float x);
-    float sqrt(float x);
-    float tan(float x);
+    $genType sin($genType x);
+    $genHType sin($genHType x);
+    $genType sqrt($genType x);
+    $genHType sqrt($genHType x);
+    $genType tan($genType x);
+    $genHType tan($genHType x);
 )
--- a/tests/SkSLInterpreterTest.cpp
+++ b/tests/SkSLInterpreterTest.cpp
@ -57,6 +57,61 @@ void test(skiatest::Reporter* r, const char* src, SkSL::Interpreter::Value* in,
    }
 }

+void vec_test(skiatest::Reporter* r, const char* src) {
    // Cross-checks VecRun against Run for the program in 'src'. The program is compiled once
    // per pass: pass 0 runs main() on each 4-float row separately through Run, pass 1 pushes
    // all four rows through a single VecRun call, and the two result buffers must then be
    // bitwise identical.
+    // Test on four different vectors (with varying orderings to get divergent control flow)
+    const float input[16] = { 1, 2, 3, 4,
+                              4, 3, 2, 1,
+                              7, 5, 8, 6,
+                              6, 8, 5, 7 };
+
    // out_s collects the scalar results, out_v the vectorized ones. Both start as copies of
    // the input because each buffer is handed to the interpreter as 'args' and read back
    // afterwards.
+    float out_s[16], out_v[16];
+    memcpy(out_s, input, sizeof(out_s));
+    memcpy(out_v, input, sizeof(out_v));
+
+    for (int pass = 0; pass < 2; ++pass) {
+        SkSL::Compiler compiler;
+        std::unique_ptr<SkSL::Program> program = compiler.convertProgram(
+                SkSL::Program::kGeneric_Kind, SkSL::String(src), SkSL::Program::Settings());
+        if (!program) {
+            REPORT_FAILURE(r, "!program", SkString(compiler.errorText().c_str()));
+            return;
+        }
+
+        std::unique_ptr<SkSL::ByteCode> byteCode = compiler.toByteCode(*program);
+        if (compiler.errorCount() > 0) {
+            REPORT_FAILURE(r, "!toByteCode", SkString(compiler.errorText().c_str()));
+            return;
+        }
+
        // NOTE(review): the result is used without a null check - assumes every 'src' passed
        // here defines main(); confirm against callers.
+        const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
+
+        if (pass == 0) {
+            // First pass - run in scalar mode to determine the expected output
+            for (int i = 0; i < 4; ++i) {
+                SkSL::Interpreter::Run(byteCode.get(), main,
+                                       (SkSL::Interpreter::Value*)(out_s + i * 4), nullptr,
+                                       nullptr, 0);
+            }
+        } else {
+            // Second pass - run the vectorized interpreter and compare results
+            SkSL::Interpreter::VecRun(byteCode.get(), main,
+                                      (SkSL::Interpreter::Value*)out_v, nullptr, 4,
+                                      nullptr, 0);
+            if (memcmp(out_s, out_v, sizeof(out_s)) != 0) {
                // On mismatch, dump input, actual and expected rows plus the disassembly.
+                printf("for program: %s\n", src);
+                for (int i = 0; i < 4; ++i) {
+                    printf("(%g %g %g %g) -> (%g %g %g %g), expected (%g %g %g %g)\n",
+                           input[4*i + 0], input[4*i + 1], input[4*i + 2], input[4*i + 3],
+                           out_v[4*i + 0], out_v[4*i + 1], out_v[4*i + 2], out_v[4*i + 3],
+                           out_s[4*i + 0], out_s[4*i + 1], out_s[4*i + 2], out_s[4*i + 3]);
+                }
+                SkSL::Interpreter::Disassemble(main);
+                REPORT_FAILURE(r, "VecInterpreter mismatch", SkString());
+            }
+        }
+    }
+}
+
 void test(skiatest::Reporter* r, const char* src, float inR, float inG, float inB, float inA,
        float expectedR, float expectedG, float expectedB, float expectedA) {
    SkSL::Compiler compiler;
@ -73,7 +128,7 @@ void test(skiatest::Reporter* r, const char* src, float inR, float inG, float in
            printf("%s\n%s", src, compiler.errorText().c_str());
            return;
        }
-        SkSL::ByteCodeFunction* main = byteCode->fFunctions[0].get();
+        const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
        float inoutColor[4] = { inR, inG, inB, inA };
        SkSL::Interpreter::Run(byteCode.get(), main, (SkSL::Interpreter::Value*) inoutColor,
                               nullptr, nullptr, 0);
@ -92,6 +147,9 @@ void test(skiatest::Reporter* r, const char* src, float inR, float inG, float in
    } else {
        printf("%s\n%s", src, compiler.errorText().c_str());
    }
+
+    // Do additional testing of 4x1 vs 1x4 to stress divergent control flow, etc.
+    vec_test(r, src);
 }

 DEF_TEST(SkSLInterpreterAdd, r) {
@ -250,6 +308,9 @@ DEF_TEST(SkSLInterpreterTernary, r) {
         0, 1, 2, 0, 2, 1, 2, 0);
    test(r, "void main(inout half4 color) { color.r = color.g > color.b ? color.g : color.b; }",
         0, 3, 2, 0, 3, 3, 2, 0);
+    test(r, "int fib(int i) { return (i < 2) ? 1 : fib(i - 1) + fib(i - 2); }"
+            "void main(inout half4 color) { color.r = half(fib(int(color.r))); }",
+         3, 0, 0, 0, 3, 0, 0, 0);
 }

 DEF_TEST(SkSLInterpreterCast, r) {
@ -336,10 +397,10 @@ DEF_TEST(SkSLInterpreterIfVector, r) {
 // Exercises 'while' loops. Each test() call passes the program, four input color components,
 // then the four expected output components.
 DEF_TEST(SkSLInterpreterWhile, r) {
    // Loop runs until color.r reaches 1.
    test(r, "void main(inout half4 color) { while (color.r < 1) color.r += 0.25; }", 0, 0, 0, 0, 1,
         0, 0, 0);
    // Condition is false on entry for an input of 0, so the body never runs.
    // NOTE(review): the body now decrements; presumably so the loop also terminates for the
    // larger inputs that vec_test feeds in - confirm.
-    test(r, "void main(inout half4 color) { while (color.r > 1) color.r += 0.25; }", 0, 0, 0, 0, 0,
+    test(r, "void main(inout half4 color) { while (color.r > 1) color.r -= 0.25; }", 0, 0, 0, 0, 0,
         0, 0, 0);
    // 'break' is the only exit from an otherwise unconditional loop.
    test(r, "void main(inout half4 color) { while (true) { color.r += 0.5; "
-         "if (color.r > 1) break; } }", 0, 0, 0, 0, 1.5, 0, 0, 0);
+         "if (color.r > 5) break; } }", 0, 0, 0, 0, 5.5, 0, 0, 0);
    // 'continue' skips the trailing 'break' until color.r reaches 5.
    test(r, "void main(inout half4 color) { while (color.r < 10) { color.r += 0.5; "
            "if (color.r < 5) continue; break; } }", 0, 0, 0, 0, 5, 0, 0, 0);
 }
@ -347,8 +408,8 @@ DEF_TEST(SkSLInterpreterWhile, r) {
 DEF_TEST(SkSLInterpreterDo, r) {
    test(r, "void main(inout half4 color) { do color.r += 0.25; while (color.r < 1); }", 0, 0, 0, 0,
         1, 0, 0, 0);
-    test(r, "void main(inout half4 color) { do color.r += 0.25; while (color.r > 1); }", 0, 0, 0, 0,
-         0.25, 0, 0, 0);
+    test(r, "void main(inout half4 color) { do color.r -= 0.25; while (color.r > 1); }", 0, 0, 0, 0,
+         -0.25, 0, 0, 0);
    test(r, "void main(inout half4 color) { do { color.r += 0.5; if (color.r > 1) break; } while "
            "(true); }", 0, 0, 0, 0, 1.5, 0, 0, 0);
    test(r, "void main(inout half4 color) {do { color.r += 0.5; if (color.r < 5) "