composeshader stages

needed to add two helper stages for composeshader

load_rgba, store_rgba

These just read/write the r,g,b,a registers to context memory, making no promise as to how the
memory is formatted (e.g. interleaved -vs- planar).

Note that we have similar existing stages, but they did not seem to suit:

constant_color

This guy loads 4 floats from memory, and splats them into registers. I need to load 4 entire
registers.

load_f32, store_f32

These offset where they read/write based on the 'x' register, plus they guarantee that the memory
will be interleaved à la SkPM4f.

Bug: skia:
Change-Id: Iaa81f950660b837bdb34416ab3e342d56a92239b
Reviewed-on: https://skia-review.googlesource.com/16716
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Reed <reed@google.com>
This commit is contained in:
Mike Reed 2017-05-15 09:34:22 -04:00 committed by Skia Commit-Bot
parent 787a16dd9e
commit 9959f723c3
6 changed files with 3596 additions and 3309 deletions

View File

@ -6,13 +6,16 @@
*/
#include "SkArenaAlloc.h"
#include "SkBlendModePriv.h"
#include "SkComposeShader.h"
#include "SkColorFilter.h"
#include "SkColorPriv.h"
#include "SkColorShader.h"
#include "SkRasterPipeline.h"
#include "SkReadBuffer.h"
#include "SkWriteBuffer.h"
#include "SkString.h"
#include "../jumper/SkJumper.h"
sk_sp<SkShader> SkShader::MakeComposeShader(sk_sp<SkShader> dst, sk_sp<SkShader> src,
SkBlendMode mode) {
@ -117,6 +120,45 @@ bool SkComposeShader::asACompose(ComposeRec* rec) const {
return true;
}
// Builds the raster pipeline for the composed shader: run fShaderB (src),
// stash its output, run fShaderA (dst), then blend the two with fMode.
// Returns false if either child shader cannot append its stages.
bool SkComposeShader::onAppendStages(SkRasterPipeline* pipeline, SkColorSpace* dstCS,
                                     SkArenaAlloc* alloc, const SkMatrix& ctm,
                                     const SkPaint& paint, const SkMatrix* localM) const {
    // Scratch buffers, arena-allocated so they live as long as the pipeline.
    struct Storage {
        float fXY[4 * SkJumper_kMaxStride];
        float fRGBA[4 * SkJumper_kMaxStride];
        float fAlpha;
    };
    Storage* scratch = alloc->make<Storage>();

    // The device x,y (this shader's inputs) are about to be clobbered by the
    // first child shader, so park all four registers now; store_rgba saves
    // r,g,b,a even though only r,g (x,y) are actually needed later.
    pipeline->append(SkRasterPipeline::store_rgba, scratch->fXY);

    if (!fShaderB->appendStages(pipeline, dstCS, alloc, ctm, paint, localM)) {  // SRC
        return false;
    }
    // fShaderB's output (the src color) must survive fShaderA overwriting
    // r,g,b,a, so park it until blend time.
    pipeline->append(SkRasterPipeline::store_rgba, scratch->fRGBA);

    // Bring back device x,y for the second shader.
    pipeline->append(SkRasterPipeline::load_rgba, scratch->fXY);

    if (!fShaderA->appendStages(pipeline, dstCS, alloc, ctm, paint, localM)) {  // DST
        return false;
    }

    // The logical 'dst' now sits in r,g,b,a, but the blend stages expect it in
    // dr,dg,db,da — shuttle it over, then reload the saved src color. (A stage
    // that could load directly into dst would let us swap the two shader
    // invocations and skip this move.)
    pipeline->append(SkRasterPipeline::move_src_dst);
    pipeline->append(SkRasterPipeline::load_rgba, scratch->fRGBA);

    // Idea: should time this, and see if it helps to have custom versions of
    // the overflow modes that do their own clamping, avoiding the overhead of
    // an extra stage.
    SkBlendMode_AppendStages(fMode, pipeline);
    if (SkBlendMode_CanOverflow(fMode)) {
        pipeline->append(SkRasterPipeline::clamp_a);
    }
    return true;
}
// larger is better (fewer times we have to loop), but we shouldn't
// take up too much stack-space (each element is 4 bytes)

View File

@ -72,6 +72,8 @@ protected:
void flatten(SkWriteBuffer&) const override;
Context* onMakeContext(const ContextRec&, SkArenaAlloc*) const override;
sk_sp<SkShader> onMakeColorSpace(SkColorSpaceXformer* xformer) const override;
bool onAppendStages(SkRasterPipeline*, SkColorSpace* dstCS, SkArenaAlloc*,
const SkMatrix&, const SkPaint&, const SkMatrix* localM) const override;
private:
sk_sp<SkShader> fShaderA;

View File

@ -72,7 +72,7 @@
M(load_8888) M(store_8888) \
M(load_u16_be) M(load_rgb_u16_be) M(store_u16_be) \
M(load_tables_u16_be) M(load_tables_rgb_u16_be) \
M(load_tables) \
M(load_tables) M(load_rgba) M(store_rgba) \
M(scale_u8) M(scale_1_float) \
M(lerp_u8) M(lerp_565) M(lerp_1_float) \
M(dstatop) M(dstin) M(dstout) M(dstover) \

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -322,6 +322,7 @@ STAGE(dither) {
b += c->rate*dither;
}
// load 4 floats from memory, and splat them into r,g,b,a
STAGE(constant_color) {
auto rgba = (const float*)ctx;
r = rgba[0];
@ -330,6 +331,24 @@ STAGE(constant_color) {
a = rgba[3];
}
// Restore registers r,g,b,a from context memory written by store_rgba:
// four consecutive blocks of kStride floats, one block per channel.
STAGE(load_rgba) {
    auto src = (const float*)ctx;
    r = unaligned_load<F>(src + 0*kStride);
    g = unaligned_load<F>(src + 1*kStride);
    b = unaligned_load<F>(src + 2*kStride);
    a = unaligned_load<F>(src + 3*kStride);
}
// Spill registers r,g,b,a into context memory (read back by load_rgba):
// four consecutive blocks of kStride floats, one block per channel.
STAGE(store_rgba) {
    auto dst = (float*)ctx;
    memcpy(dst + 0*kStride, &r, sizeof(F));
    memcpy(dst + 1*kStride, &g, sizeof(F));
    memcpy(dst + 2*kStride, &b, sizeof(F));
    memcpy(dst + 3*kStride, &a, sizeof(F));
}
// Most blend modes apply the same logic to each channel.
#define BLEND_MODE(name) \
SI F name##_channel(F s, F d, F sa, F da); \