Reland "Respect full precision for RGB16 PNGs" (part 2)

This lands all the new xform hooks but no change to src/codec. So the new decode features are turned off. I'm relanding this in pieces to try to bisect a strange MSAN error. Original CL: https://skia-review.googlesource.com/c/7085/ BUG=skia: CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-Clang-GCE-CPU-AVX2-x86_64-Debug-MSAN,Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD,Build-Ubuntu-Clang-x86_64-Release-Fast Change-Id: I451a2a29c73ca475e9e7a5ded58d4948d6b8be19 Reviewed-on: https://skia-review.googlesource.com/7277 Reviewed-by: Matt Sarett <msarett@google.com> Commit-Queue: Matt Sarett <msarett@google.com>
2017-01-19 12:04:32 -05:00 · 2017-01-19 12:04:32 -05:00 · 5bee0b6de6
commit 5bee0b6de6
parent f720098671
8 changed files with 129 additions and 4 deletions
--- a/include/core/SkColorSpaceXform.h
+++ b/include/core/SkColorSpaceXform.h
@ -29,6 +29,7 @@ public:
        kBGRA_8888_ColorFormat,

        // Unsigned, big-endian, 16-bit integer
+        kRGB_U16_BE_ColorFormat,   // Src only
        kRGBA_U16_BE_ColorFormat,  // Src only

        kRGBA_F16_ColorFormat,     // Dst only
--- a/src/core/SkColorSpaceXform.cpp
+++ b/src/core/SkColorSpaceXform.cpp
@ -1104,7 +1104,10 @@ bool SkColorSpaceXform_XYZ<kCSM>
        }
    }

-    if (kRGBA_F32_ColorFormat == dstColorFormat || kRGBA_U16_BE_ColorFormat == srcColorFormat) {
+    if (kRGBA_F32_ColorFormat == dstColorFormat ||
+        kRGBA_U16_BE_ColorFormat == srcColorFormat ||
+        kRGB_U16_BE_ColorFormat == srcColorFormat)
+    {
        return this->applyPipeline(dstColorFormat, dst, srcColorFormat, src, len, alphaType);
    }

@ -1221,6 +1224,24 @@ bool SkColorSpaceXform_XYZ<kCSM>
                    break;
            }
            break;
+        case kRGB_U16_BE_ColorFormat:
+            switch (fSrcGamma) {
+                case kLinear_SrcGamma:
+                    pipeline.append(SkRasterPipeline::load_rgb_u16_be, &src);
+                    break;
+                case kSRGB_SrcGamma:
+                    pipeline.append(SkRasterPipeline::load_rgb_u16_be, &src);
+                    pipeline.append_from_srgb(kUnpremul_SkAlphaType);
+                    break;
+                case kTable_SrcGamma:
+                    loadTables.fSrc = src;
+                    loadTables.fR = fSrcGammaTables[0];
+                    loadTables.fG = fSrcGammaTables[1];
+                    loadTables.fB = fSrcGammaTables[2];
+                    pipeline.append(SkRasterPipeline::load_tables_rgb_u16_be, &loadTables);
+                    break;
+            }
+            break;
        default:
            return false;
    }
--- a/src/core/SkColorSpaceXform_A2B.cpp
+++ b/src/core/SkColorSpaceXform_A2B.cpp
@ -31,6 +31,9 @@ bool SkColorSpaceXform_A2B::onApply(ColorFormat dstFormat, void* dst, ColorForma
        case kRGBA_U16_BE_ColorFormat:
            pipeline.append(SkRasterPipeline::load_u16_be, &src);
            break;
+        case kRGB_U16_BE_ColorFormat:
+            pipeline.append(SkRasterPipeline::load_rgb_u16_be, &src);
+            break;
        default:
            SkCSXformPrintf("F16/F32 source color format not supported\n");
            return false;
--- a/src/core/SkNx.h
+++ b/src/core/SkNx.h
@ -68,6 +68,16 @@ struct SkNx {
        *c = SkNx{cl, ch};
        *d = SkNx{dl, dh};
    }
+    AI static void Load3(const void* vptr, SkNx* a, SkNx* b, SkNx* c) {
+        auto ptr = (const char*)vptr;
+        Half al, bl, cl,
+             ah, bh, ch;
+        Half::Load3(ptr                  , &al, &bl, &cl);
+        Half::Load3(ptr + 3*N/2*sizeof(T), &ah, &bh, &ch);
+        *a = SkNx{al, ah};
+        *b = SkNx{bl, bh};
+        *c = SkNx{cl, ch};
+    }
    AI static void Store4(void* vptr, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
        auto ptr = (char*)vptr;
        Half::Store4(ptr,                   a.fLo, b.fLo, c.fLo, d.fLo);
@ -149,6 +159,12 @@ struct SkNx<1,T> {
        *c = Load(ptr + 2*sizeof(T));
        *d = Load(ptr + 3*sizeof(T));
    }
+    AI static void Load3(const void* vptr, SkNx* a, SkNx* b, SkNx* c) {
+        auto ptr = (const char*)vptr;
+        *a = Load(ptr + 0*sizeof(T));
+        *b = Load(ptr + 1*sizeof(T));
+        *c = Load(ptr + 2*sizeof(T));
+    }
    AI static void Store4(void* vptr, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
        auto ptr = (char*)vptr;
        a.store(ptr + 0*sizeof(T));
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@ -69,8 +69,9 @@
    M(load_565)  M(store_565)                                    \
    M(load_f16)  M(store_f16)                                    \
    M(load_8888) M(store_8888)                                   \
-    M(load_u16_be)                                               \
-    M(load_tables) M(load_tables_u16_be) M(store_tables)         \
+    M(load_u16_be) M(load_rgb_u16_be)                            \
+    M(load_tables_u16_be) M(load_tables_rgb_u16_be)              \
+    M(load_tables) M(store_tables)                               \
    M(scale_u8) M(scale_1_float)                                 \
    M(lerp_u8) M(lerp_565) M(lerp_1_float)                       \
    M(dstatop) M(dstin) M(dstout) M(dstover)                     \
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@ -241,7 +241,12 @@ public:
        *b = rgba.val[2];
        *a = rgba.val[3];
    }
-
+    AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) {
+        uint16x4x3_t rgba = vld3_u16((const uint16_t*)ptr);
+        *r = rgba.val[0];
+        *g = rgba.val[1];
+        *b = rgba.val[2];
+    }
    AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
        uint16x4x4_t rgba = {{
            r.fVec,
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@ -271,6 +271,22 @@ public:
        *b = ba;
        *a = _mm_srli_si128(ba, 8);
    }
+    AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) {
+        // The idea here is to get 4 vectors that are R G B _ _ _ _ _.
+        // The second load is at a funny location to make sure we don't read past
+        // the bounds of memory.  This is fine, we just need to shift it a little bit.
+        const uint8_t* ptr8 = (const uint8_t*) ptr;
+        __m128i rgb0 = _mm_loadu_si128((const __m128i*) (ptr8 + 0));
+        __m128i rgb1 = _mm_srli_si128(rgb0, 3*2);
+        __m128i rgb2 = _mm_srli_si128(_mm_loadu_si128((const __m128i*) (ptr8 + 4*2)), 2*2);
+        __m128i rgb3 = _mm_srli_si128(rgb2, 3*2);
+
+        __m128i rrggbb01 = _mm_unpacklo_epi16(rgb0, rgb1);
+        __m128i rrggbb23 = _mm_unpacklo_epi16(rgb2, rgb3);
+        *r = _mm_unpacklo_epi32(rrggbb01, rrggbb23);
+        *g = _mm_srli_si128(r->fVec, 4*2);
+        *b = _mm_unpackhi_epi32(rrggbb01, rrggbb23);
+    }
    AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
        __m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec);
        __m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec);
@ -334,6 +350,32 @@ public:
        *b = _mm_unpacklo_epi64(ba0123, ba4567);
        *a = _mm_unpackhi_epi64(ba0123, ba4567);
    }
+    AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) {
+        // TODO: AVX2 version
+        const uint8_t* ptr8 = (const uint8_t*) ptr;
+        __m128i rgb0 = _mm_loadu_si128((const __m128i*) (ptr8 +  0*2));
+        __m128i rgb1 = _mm_srli_si128(rgb0, 3*2);
+        __m128i rgb2 = _mm_loadu_si128((const __m128i*) (ptr8 +  6*2));
+        __m128i rgb3 = _mm_srli_si128(rgb2, 3*2);
+        __m128i rgb4 = _mm_loadu_si128((const __m128i*) (ptr8 + 12*2));
+        __m128i rgb5 = _mm_srli_si128(rgb4, 3*2);
+        __m128i rgb6 = _mm_srli_si128(_mm_loadu_si128((const __m128i*) (ptr8 + 16*2)), 2*2);
+        __m128i rgb7 = _mm_srli_si128(rgb6, 3*2);
+
+        __m128i rgb01 = _mm_unpacklo_epi16(rgb0, rgb1);
+        __m128i rgb23 = _mm_unpacklo_epi16(rgb2, rgb3);
+        __m128i rgb45 = _mm_unpacklo_epi16(rgb4, rgb5);
+        __m128i rgb67 = _mm_unpacklo_epi16(rgb6, rgb7);
+
+        __m128i rg03 = _mm_unpacklo_epi32(rgb01, rgb23);
+        __m128i bx03 = _mm_unpackhi_epi32(rgb01, rgb23);
+        __m128i rg47 = _mm_unpacklo_epi32(rgb45, rgb67);
+        __m128i bx47 = _mm_unpackhi_epi32(rgb45, rgb67);
+
+        *r = _mm_unpacklo_epi64(rg03, rg47);
+        *g = _mm_unpackhi_epi64(rg03, rg47);
+        *b = _mm_unpacklo_epi64(bx03, bx47);
+    }
    AI static void Store4(void* ptr, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
        // TODO: AVX2 version
        __m128i rg0123 = _mm_unpacklo_epi16(r.fVec, g.fVec),  // r0 g0 r1 g1 r2 g2 r3 g3
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@ -591,6 +591,23 @@ STAGE_CTX(load_u16_be, const uint64_t**) {
    a = (1.0f / 65535.0f) * SkNx_cast<float>((ah << 8) | (ah >> 8));
 }

+STAGE_CTX(load_rgb_u16_be, const uint16_t**) {
+    auto ptr = *ctx + 3*x;
+    const void* src = ptr;
+    uint16_t buf[N*3] = {0};
+    if (tail) {
+        memcpy(buf, src, tail*3*sizeof(uint16_t));
+        src = buf;
+    }
+
+    SkNh rh, gh, bh;
+    SkNh::Load3(src, &rh, &gh, &bh);
+    r = (1.0f / 65535.0f) * SkNx_cast<float>((rh << 8) | (rh >> 8));
+    g = (1.0f / 65535.0f) * SkNx_cast<float>((gh << 8) | (gh >> 8));
+    b = (1.0f / 65535.0f) * SkNx_cast<float>((bh << 8) | (bh >> 8));
+    a = 1.0f;
+}
+
 STAGE_CTX(load_tables, const LoadTablesContext*) {
    auto ptr = (const uint32_t*)ctx->fSrc + x;

@ -621,6 +638,25 @@ STAGE_CTX(load_tables_u16_be, const LoadTablesContext*) {
    a = (1.0f / 65535.0f) * SkNx_cast<float>((ah << 8) | (ah >> 8));
 }

+STAGE_CTX(load_tables_rgb_u16_be, const LoadTablesContext*) {
+    auto ptr = (const uint16_t*)ctx->fSrc + 3*x;
+    const void* src = ptr;
+    uint16_t buf[N*3] = {0};
+    if (tail) {
+        memcpy(buf, src, tail*3*sizeof(uint16_t));
+        src = buf;
+    }
+
+    SkNh rh, gh, bh;
+    SkNh::Load3(src, &rh, &gh, &bh);
+
+    // ctx->fSrc is big-endian, so "& 0xff" grabs the 8 most significant bits of each component.
+    r = gather(tail, ctx->fR, SkNx_cast<int>(rh & 0xff));
+    g = gather(tail, ctx->fG, SkNx_cast<int>(gh & 0xff));
+    b = gather(tail, ctx->fB, SkNx_cast<int>(bh & 0xff));
+    a = 1.0f;
+}
+
 STAGE_CTX(store_tables, const StoreTablesContext*) {
    auto ptr = ctx->fDst + x;