From 5bee0b6de6b3ad1166d067e6b5046b48b8240a29 Mon Sep 17 00:00:00 2001
From: Matt Sarett <msarett@google.com>
Date: Thu, 19 Jan 2017 12:04:32 -0500
Subject: [PATCH] Reland "Respect full precision for RGB16 PNGs" (part 2)

This lands all the new xform hooks but no change to src/codec.
So the new decode features are turned off.

I'm relanding this in pieces to try to bisect a
strange MSAN error.

Original CL:
https://skia-review.googlesource.com/c/7085/

BUG=skia:

CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-Clang-GCE-CPU-AVX2-x86_64-Debug-MSAN,Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD,Build-Ubuntu-Clang-x86_64-Release-Fast


Change-Id: I451a2a29c73ca475e9e7a5ded58d4948d6b8be19
Reviewed-on: https://skia-review.googlesource.com/7277
Reviewed-by: Matt Sarett <msarett@google.com>
Commit-Queue: Matt Sarett <msarett@google.com>
---
 include/core/SkColorSpaceXform.h   |  1 +
 src/core/SkColorSpaceXform.cpp     | 23 +++++++++++++++-
 src/core/SkColorSpaceXform_A2B.cpp |  3 +++
 src/core/SkNx.h                    | 16 ++++++++++++
 src/core/SkRasterPipeline.h        |  5 ++--
 src/opts/SkNx_neon.h               |  7 ++++-
 src/opts/SkNx_sse.h                | 42 ++++++++++++++++++++++++++++++
 src/opts/SkRasterPipeline_opts.h   | 36 +++++++++++++++++++++++++
 8 files changed, 129 insertions(+), 4 deletions(-)
diff --git a/include/core/SkColorSpaceXform.h b/include/core/SkColorSpaceXform.h
index 299fe26a3a..0c4fd7aa33 100644
--- a/include/core/SkColorSpaceXform.h
+++ b/include/core/SkColorSpaceXform.h
@@ -29,6 +29,7 @@ public:
         kBGRA_8888_ColorFormat,
 
         // Unsigned, big-endian, 16-bit integer
+        kRGB_U16_BE_ColorFormat,   // Src only
         kRGBA_U16_BE_ColorFormat,  // Src only
 
         kRGBA_F16_ColorFormat,     // Dst only
diff --git a/src/core/SkColorSpaceXform.cpp b/src/core/SkColorSpaceXform.cpp
index bdb3784681..dccb92ee9a 100644
--- a/src/core/SkColorSpaceXform.cpp
+++ b/src/core/SkColorSpaceXform.cpp
@@ -1104,7 +1104,10 @@ bool SkColorSpaceXform_XYZ<kCSM>
         }
     }
 
-    if (kRGBA_F32_ColorFormat == dstColorFormat || kRGBA_U16_BE_ColorFormat == srcColorFormat) {
+    if (kRGBA_F32_ColorFormat == dstColorFormat ||
+        kRGBA_U16_BE_ColorFormat == srcColorFormat ||
+        kRGB_U16_BE_ColorFormat == srcColorFormat)
+    {
         return this->applyPipeline(dstColorFormat, dst, srcColorFormat, src, len, alphaType);
     }
 
@@ -1221,6 +1224,24 @@ bool SkColorSpaceXform_XYZ<kCSM>
                     break;
             }
             break;
+        case kRGB_U16_BE_ColorFormat:
+            switch (fSrcGamma) {
+                case kLinear_SrcGamma:
+                    pipeline.append(SkRasterPipeline::load_rgb_u16_be, &src);
+                    break;
+                case kSRGB_SrcGamma:
+                    pipeline.append(SkRasterPipeline::load_rgb_u16_be, &src);
+                    pipeline.append_from_srgb(kUnpremul_SkAlphaType);
+                    break;
+                case kTable_SrcGamma:
+                    loadTables.fSrc = src;
+                    loadTables.fR = fSrcGammaTables[0];
+                    loadTables.fG = fSrcGammaTables[1];
+                    loadTables.fB = fSrcGammaTables[2];
+                    pipeline.append(SkRasterPipeline::load_tables_rgb_u16_be, &loadTables);
+                    break;
+            }
+            break;
         default:
             return false;
     }
diff --git a/src/core/SkColorSpaceXform_A2B.cpp b/src/core/SkColorSpaceXform_A2B.cpp
index 6e49e84aee..27c9faa971 100644
--- a/src/core/SkColorSpaceXform_A2B.cpp
+++ b/src/core/SkColorSpaceXform_A2B.cpp
@@ -31,6 +31,9 @@ bool SkColorSpaceXform_A2B::onApply(ColorFormat dstFormat, void* dst, ColorForma
         case kRGBA_U16_BE_ColorFormat:
             pipeline.append(SkRasterPipeline::load_u16_be, &src);
             break;
+        case kRGB_U16_BE_ColorFormat:
+            pipeline.append(SkRasterPipeline::load_rgb_u16_be, &src);
+            break;
         default:
             SkCSXformPrintf("F16/F32 source color format not supported\n");
             return false;
diff --git a/src/core/SkNx.h b/src/core/SkNx.h
index a75495cd2a..5df575342c 100644
--- a/src/core/SkNx.h
+++ b/src/core/SkNx.h
@@ -68,6 +68,16 @@ struct SkNx {
         *c = SkNx{cl, ch};
         *d = SkNx{dl, dh};
     }
+    AI static void Load3(const void* vptr, SkNx* a, SkNx* b, SkNx* c) {
+        auto ptr = (const char*)vptr;
+        Half al, bl, cl,
+             ah, bh, ch;
+        Half::Load3(ptr                  , &al, &bl, &cl);
+        Half::Load3(ptr + 3*N/2*sizeof(T), &ah, &bh, &ch);
+        *a = SkNx{al, ah};
+        *b = SkNx{bl, bh};
+        *c = SkNx{cl, ch};
+    }
     AI static void Store4(void* vptr, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
         auto ptr = (char*)vptr;
         Half::Store4(ptr,                   a.fLo, b.fLo, c.fLo, d.fLo);
@@ -149,6 +159,12 @@ struct SkNx<1,T> {
         *c = Load(ptr + 2*sizeof(T));
         *d = Load(ptr + 3*sizeof(T));
     }
+    AI static void Load3(const void* vptr, SkNx* a, SkNx* b, SkNx* c) {
+        auto ptr = (const char*)vptr;
+        *a = Load(ptr + 0*sizeof(T));
+        *b = Load(ptr + 1*sizeof(T));
+        *c = Load(ptr + 2*sizeof(T));
+    }
     AI static void Store4(void* vptr, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
         auto ptr = (char*)vptr;
         a.store(ptr + 0*sizeof(T));
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 949146cc2d..3d1bb88011 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -69,8 +69,9 @@
     M(load_565)  M(store_565)                                    \
     M(load_f16)  M(store_f16)                                    \
     M(load_8888) M(store_8888)                                   \
-    M(load_u16_be)                                               \
-    M(load_tables) M(load_tables_u16_be) M(store_tables)         \
+    M(load_u16_be) M(load_rgb_u16_be)                            \
+    M(load_tables_u16_be) M(load_tables_rgb_u16_be)              \
+    M(load_tables) M(store_tables)                               \
     M(scale_u8) M(scale_1_float)                                 \
     M(lerp_u8) M(lerp_565) M(lerp_1_float)                       \
     M(dstatop) M(dstin) M(dstout) M(dstover)                     \
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index f39ef118b1..5671f71315 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -241,7 +241,12 @@ public:
         *b = rgba.val[2];
         *a = rgba.val[3];
     }
-
+    AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) {
+        uint16x4x3_t rgba = vld3_u16((const uint16_t*)ptr);
+        *r = rgba.val[0];
+        *g = rgba.val[1];
+        *b = rgba.val[2];
+    }
     AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
         uint16x4x4_t rgba = {{
             r.fVec,
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 5ef5eda3a9..0dfb43a756 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -271,6 +271,22 @@ public:
         *b = ba;
         *a = _mm_srli_si128(ba, 8);
     }
+    AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) {
+        // The idea here is to get 4 vectors that are R G B _ _ _ _ _.
+        // The second load is at a funny location to make sure we don't read past
+        // the bounds of memory.  This is fine, we just need to shift it a little bit.
+        const uint8_t* ptr8 = (const uint8_t*) ptr;
+        __m128i rgb0 = _mm_loadu_si128((const __m128i*) (ptr8 + 0));
+        __m128i rgb1 = _mm_srli_si128(rgb0, 3*2);
+        __m128i rgb2 = _mm_srli_si128(_mm_loadu_si128((const __m128i*) (ptr8 + 4*2)), 2*2);
+        __m128i rgb3 = _mm_srli_si128(rgb2, 3*2);
+
+        __m128i rrggbb01 = _mm_unpacklo_epi16(rgb0, rgb1);
+        __m128i rrggbb23 = _mm_unpacklo_epi16(rgb2, rgb3);
+        *r = _mm_unpacklo_epi32(rrggbb01, rrggbb23);
+        *g = _mm_srli_si128(r->fVec, 4*2);
+        *b = _mm_unpackhi_epi32(rrggbb01, rrggbb23);
+    }
     AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
         __m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec);
         __m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec);
@@ -334,6 +350,32 @@ public:
         *b = _mm_unpacklo_epi64(ba0123, ba4567);
         *a = _mm_unpackhi_epi64(ba0123, ba4567);
     }
+    AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) {
+        // TODO: AVX2 version
+        const uint8_t* ptr8 = (const uint8_t*) ptr;
+        __m128i rgb0 = _mm_loadu_si128((const __m128i*) (ptr8 +  0*2));
+        __m128i rgb1 = _mm_srli_si128(rgb0, 3*2);
+        __m128i rgb2 = _mm_loadu_si128((const __m128i*) (ptr8 +  6*2));
+        __m128i rgb3 = _mm_srli_si128(rgb2, 3*2);
+        __m128i rgb4 = _mm_loadu_si128((const __m128i*) (ptr8 + 12*2));
+        __m128i rgb5 = _mm_srli_si128(rgb4, 3*2);
+        __m128i rgb6 = _mm_srli_si128(_mm_loadu_si128((const __m128i*) (ptr8 + 16*2)), 2*2);
+        __m128i rgb7 = _mm_srli_si128(rgb6, 3*2);
+
+        __m128i rgb01 = _mm_unpacklo_epi16(rgb0, rgb1);
+        __m128i rgb23 = _mm_unpacklo_epi16(rgb2, rgb3);
+        __m128i rgb45 = _mm_unpacklo_epi16(rgb4, rgb5);
+        __m128i rgb67 = _mm_unpacklo_epi16(rgb6, rgb7);
+
+        __m128i rg03 = _mm_unpacklo_epi32(rgb01, rgb23);
+        __m128i bx03 = _mm_unpackhi_epi32(rgb01, rgb23);
+        __m128i rg47 = _mm_unpacklo_epi32(rgb45, rgb67);
+        __m128i bx47 = _mm_unpackhi_epi32(rgb45, rgb67);
+
+        *r = _mm_unpacklo_epi64(rg03, rg47);
+        *g = _mm_unpackhi_epi64(rg03, rg47);
+        *b = _mm_unpacklo_epi64(bx03, bx47);
+    }
     AI static void Store4(void* ptr, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
         // TODO: AVX2 version
         __m128i rg0123 = _mm_unpacklo_epi16(r.fVec, g.fVec),  // r0 g0 r1 g1 r2 g2 r3 g3
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 71a15c68a8..f496bd531c 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -591,6 +591,23 @@ STAGE_CTX(load_u16_be, const uint64_t**) {
     a = (1.0f / 65535.0f) * SkNx_cast<float>((ah << 8) | (ah >> 8));
 }
 
+STAGE_CTX(load_rgb_u16_be, const uint16_t**) {
+    auto ptr = *ctx + 3*x;
+    const void* src = ptr;
+    uint16_t buf[N*3] = {0};
+    if (tail) {
+        memcpy(buf, src, tail*3*sizeof(uint16_t));
+        src = buf;
+    }
+
+    SkNh rh, gh, bh;
+    SkNh::Load3(src, &rh, &gh, &bh);
+    r = (1.0f / 65535.0f) * SkNx_cast<float>((rh << 8) | (rh >> 8));
+    g = (1.0f / 65535.0f) * SkNx_cast<float>((gh << 8) | (gh >> 8));
+    b = (1.0f / 65535.0f) * SkNx_cast<float>((bh << 8) | (bh >> 8));
+    a = 1.0f;
+}
+
 STAGE_CTX(load_tables, const LoadTablesContext*) {
     auto ptr = (const uint32_t*)ctx->fSrc + x;
 
@@ -621,6 +638,25 @@ STAGE_CTX(load_tables_u16_be, const LoadTablesContext*) {
     a = (1.0f / 65535.0f) * SkNx_cast<float>((ah << 8) | (ah >> 8));
 }
 
+STAGE_CTX(load_tables_rgb_u16_be, const LoadTablesContext*) {
+    auto ptr = (const uint16_t*)ctx->fSrc + 3*x;
+    const void* src = ptr;
+    uint16_t buf[N*3] = {0};
+    if (tail) {
+        memcpy(buf, src, tail*3*sizeof(uint16_t));
+        src = buf;
+    }
+
+    SkNh rh, gh, bh;
+    SkNh::Load3(src, &rh, &gh, &bh);
+
+    // ctx->fSrc is big-endian, so "& 0xff" grabs the 8 most significant bits of each component.
+    r = gather(tail, ctx->fR, SkNx_cast<int>(rh & 0xff));
+    g = gather(tail, ctx->fG, SkNx_cast<int>(gh & 0xff));
+    b = gather(tail, ctx->fB, SkNx_cast<int>(bh & 0xff));
+    a = 1.0f;
+}
+
 STAGE_CTX(store_tables, const StoreTablesContext*) {
     auto ptr = ctx->fDst + x;