fix SKVM_ benches

Things were running suspiciously well...

  _I32 had a typo that cut out 3/4 of its multiplies...

  _I32_SWAR was missing a mask operation needed to drop the junk
  low byte of the high half after the multiply.

The bench times now make a bit more sense and are in line with how
much work we're actually doing: F32's the slowest, I32 a little faster,
and I32_SWAR fastest:

    curr/maxrss  loops  min      median   mean     max      stddev  samples     config        bench
    35/36 MB     58     2.03ns   2.04ns   2.04ns   2.04ns   0%      ▂▂▂▂▁▁█▁▂▁  nonrendering  SkVM_4096_I32_SWAR
    35/36 MB     42     3.44ns   3.48ns   3.49ns   3.59ns   1%      ▂▆▅█▃▃▁▂▂▄  nonrendering  SkVM_4096_I32
    35/36 MB     30     4.9ns    5.21ns   5.11ns   5.33ns   3%      ▆▇█▆▆▁▂▁▁▅  nonrendering  SkVM_4096_F32
    35/36 MB     203    0.696ns  0.697ns  0.705ns  0.758ns  3%      █▂▂▁▁▁▁▁▁▂  nonrendering  SkVM_4096_RP
    35/36 MB     942    0.188ns  0.188ns  0.188ns  0.189ns  0%      ▂▁▂▁▃█▂▁▁▁  nonrendering  SkVM_4096_Opts

Change-Id: I2850dc3f9df1828f03499eb278b8231f48eaae63
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/217982
Commit-Queue: Mike Klein <mtklein@google.com>
Commit-Queue: Brian Osman <brianosman@google.com>
Auto-Submit: Mike Klein <mtklein@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
2019-06-03 14:53:15 -05:00 · 2019-06-03 14:53:15 -05:00 · 03ce675b5f
commit 03ce675b5f
parent ef032cd9bf
1 changed file with 13 additions and 7 deletions
--- a/bench/SkVMBench.cpp
+++ b/bench/SkVMBench.cpp
@@ -9,9 +9,6 @@
 #include "src/core/SkOpts.h"
 #include "src/core/SkVM.h"

-// N.B. I have not tested that the math performed by these benchmarks is correct.
-// They're really more meant to be representative load.  (Wouldn't hurt to be correct though.)
-
 namespace {

    enum Mode {Opts, RP, F32, I32, I32_SWAR};
@@ -85,9 +82,9 @@ namespace {

            skvm::I32 invA = sub(splat(0xff), a);
            r = add(r, mul_unorm8(dr, invA));
-            g = add(g, mul_unorm8(dr, invA));
-            b = add(b, mul_unorm8(dr, invA));
-            a = add(a, mul_unorm8(dr, invA));
+            g = add(g, mul_unorm8(dg, invA));
+            b = add(b, mul_unorm8(db, invA));
+            a = add(a, mul_unorm8(da, invA));

            store32(dst, bit_or(    r     ,
                         bit_or(shl(g,  8),
@@ -110,7 +107,10 @@ namespace {

            auto mul_unorm8 = [&](skvm::I32 x, skvm::I32 y) {
                // As above, assuming x is two SWAR bytes in lanes 0 and 2, and y is a byte.
-                return shr(add(mul(x, y), splat(0x00ff00ff)), 8);
+                return bit_and(shr(add(mul(x, y),
+                                       splat(0x00ff00ff)),
+                                   8),
+                               splat(0x00ff00ff));
            };

            skvm::I32 rb, ga;
@@ -157,6 +157,12 @@ private:
            fPipeline.append(SkRasterPipeline::srcover);
            fPipeline.append(SkRasterPipeline::store_8888, &fDstCtx);
        }
+
+        // Trigger one run now so we can do a quick correctness check.
+        this->draw(1,nullptr);
+        for (int i = 0; i < fPixels; i++) {
+            SkASSERT(fDst[i] == 0xff5e6f80);
+        }
    }

    void onDraw(int loops, SkCanvas*) override {