From 5bee0b6de6b3ad1166d067e6b5046b48b8240a29 Mon Sep 17 00:00:00 2001 From: Matt Sarett Date: Thu, 19 Jan 2017 12:04:32 -0500 Subject: [PATCH] Reland "Respect full precision for RGB16 PNGs" (part 2) This lands all the new xform hooks but no change to src/codec. So the new decode features are turned off. I'm relanding this in pieces to try to bisect a strange MSAN error. Original CL: https://skia-review.googlesource.com/c/7085/ BUG=skia: CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-Clang-GCE-CPU-AVX2-x86_64-Debug-MSAN,Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD,Build-Ubuntu-Clang-x86_64-Release-Fast Change-Id: I451a2a29c73ca475e9e7a5ded58d4948d6b8be19 Reviewed-on: https://skia-review.googlesource.com/7277 Reviewed-by: Matt Sarett Commit-Queue: Matt Sarett --- include/core/SkColorSpaceXform.h | 1 + src/core/SkColorSpaceXform.cpp | 23 +++++++++++++++- src/core/SkColorSpaceXform_A2B.cpp | 3 +++ src/core/SkNx.h | 16 ++++++++++++ src/core/SkRasterPipeline.h | 5 ++-- src/opts/SkNx_neon.h | 7 ++++- src/opts/SkNx_sse.h | 42 ++++++++++++++++++++++++++++++ src/opts/SkRasterPipeline_opts.h | 36 +++++++++++++++++++++++++ 8 files changed, 129 insertions(+), 4 deletions(-) diff --git a/include/core/SkColorSpaceXform.h b/include/core/SkColorSpaceXform.h index 299fe26a3a..0c4fd7aa33 100644 --- a/include/core/SkColorSpaceXform.h +++ b/include/core/SkColorSpaceXform.h @@ -29,6 +29,7 @@ public: kBGRA_8888_ColorFormat, // Unsigned, big-endian, 16-bit integer + kRGB_U16_BE_ColorFormat, // Src only kRGBA_U16_BE_ColorFormat, // Src only kRGBA_F16_ColorFormat, // Dst only diff --git a/src/core/SkColorSpaceXform.cpp b/src/core/SkColorSpaceXform.cpp index bdb3784681..dccb92ee9a 100644 --- a/src/core/SkColorSpaceXform.cpp +++ b/src/core/SkColorSpaceXform.cpp @@ -1104,7 +1104,10 @@ bool SkColorSpaceXform_XYZ } } - if (kRGBA_F32_ColorFormat == dstColorFormat || kRGBA_U16_BE_ColorFormat == srcColorFormat) { + if (kRGBA_F32_ColorFormat == dstColorFormat || + kRGBA_U16_BE_ColorFormat == srcColorFormat || + kRGB_U16_BE_ColorFormat == srcColorFormat) + { return this->applyPipeline(dstColorFormat, dst, srcColorFormat, src, len, alphaType); } @@ -1221,6 +1224,24 @@ bool SkColorSpaceXform_XYZ break; } break; + case kRGB_U16_BE_ColorFormat: + switch (fSrcGamma) { + case kLinear_SrcGamma: + pipeline.append(SkRasterPipeline::load_rgb_u16_be, &src); + break; + case kSRGB_SrcGamma: + pipeline.append(SkRasterPipeline::load_rgb_u16_be, &src); + pipeline.append_from_srgb(kUnpremul_SkAlphaType); + break; + case kTable_SrcGamma: + loadTables.fSrc = src; + loadTables.fR = fSrcGammaTables[0]; + loadTables.fG = fSrcGammaTables[1]; + loadTables.fB = fSrcGammaTables[2]; + pipeline.append(SkRasterPipeline::load_tables_rgb_u16_be, &loadTables); + break; + } + break; default: return false; } diff --git a/src/core/SkColorSpaceXform_A2B.cpp b/src/core/SkColorSpaceXform_A2B.cpp index 6e49e84aee..27c9faa971 100644 --- a/src/core/SkColorSpaceXform_A2B.cpp +++ b/src/core/SkColorSpaceXform_A2B.cpp @@ -31,6 +31,9 @@ bool SkColorSpaceXform_A2B::onApply(ColorFormat dstFormat, void* dst, ColorForma case kRGBA_U16_BE_ColorFormat: pipeline.append(SkRasterPipeline::load_u16_be, &src); break; + case kRGB_U16_BE_ColorFormat: + pipeline.append(SkRasterPipeline::load_rgb_u16_be, &src); + break; default: SkCSXformPrintf("F16/F32 source color format not supported\n"); return false; diff --git a/src/core/SkNx.h b/src/core/SkNx.h index a75495cd2a..5df575342c 100644 --- a/src/core/SkNx.h +++ b/src/core/SkNx.h @@ -68,6 +68,16 @@ struct SkNx { *c = SkNx{cl, ch}; *d = SkNx{dl, dh}; } + AI static void Load3(const void* vptr, SkNx* a, SkNx* b, SkNx* c) { + auto ptr = (const char*)vptr; + Half al, bl, cl, + ah, bh, ch; + Half::Load3(ptr , &al, &bl, &cl); + Half::Load3(ptr + 3*N/2*sizeof(T), &ah, &bh, &ch); + *a = SkNx{al, ah}; + *b = SkNx{bl, bh}; + *c = SkNx{cl, ch}; + } AI static void Store4(void* vptr, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) { auto ptr = (char*)vptr; Half::Store4(ptr, a.fLo, b.fLo, c.fLo, d.fLo); @@ -149,6 +159,12 @@ struct SkNx<1,T> { *c = Load(ptr + 2*sizeof(T)); *d = Load(ptr + 3*sizeof(T)); } + AI static void Load3(const void* vptr, SkNx* a, SkNx* b, SkNx* c) { + auto ptr = (const char*)vptr; + *a = Load(ptr + 0*sizeof(T)); + *b = Load(ptr + 1*sizeof(T)); + *c = Load(ptr + 2*sizeof(T)); + } AI static void Store4(void* vptr, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) { auto ptr = (char*)vptr; a.store(ptr + 0*sizeof(T)); diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h index 949146cc2d..3d1bb88011 100644 --- a/src/core/SkRasterPipeline.h +++ b/src/core/SkRasterPipeline.h @@ -69,8 +69,9 @@ M(load_565) M(store_565) \ M(load_f16) M(store_f16) \ M(load_8888) M(store_8888) \ - M(load_u16_be) \ - M(load_tables) M(load_tables_u16_be) M(store_tables) \ + M(load_u16_be) M(load_rgb_u16_be) \ + M(load_tables_u16_be) M(load_tables_rgb_u16_be) \ + M(load_tables) M(store_tables) \ M(scale_u8) M(scale_1_float) \ M(lerp_u8) M(lerp_565) M(lerp_1_float) \ M(dstatop) M(dstin) M(dstout) M(dstover) \ diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h index f39ef118b1..5671f71315 100644 --- a/src/opts/SkNx_neon.h +++ b/src/opts/SkNx_neon.h @@ -241,7 +241,12 @@ public: *b = rgba.val[2]; *a = rgba.val[3]; } - + AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) { + uint16x4x3_t rgba = vld3_u16((const uint16_t*)ptr); + *r = rgba.val[0]; + *g = rgba.val[1]; + *b = rgba.val[2]; + } AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) { uint16x4x4_t rgba = {{ r.fVec, diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h index 5ef5eda3a9..0dfb43a756 100644 --- a/src/opts/SkNx_sse.h +++ b/src/opts/SkNx_sse.h @@ -271,6 +271,22 @@ public: *b = ba; *a = _mm_srli_si128(ba, 8); } + AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) { + // The idea here is to get 4 vectors that are R G B _ _ _ _ _. + // The second load is at a funny location to make sure we don't read past + // the bounds of memory. This is fine, we just need to shift it a little bit. + const uint8_t* ptr8 = (const uint8_t*) ptr; + __m128i rgb0 = _mm_loadu_si128((const __m128i*) (ptr8 + 0)); + __m128i rgb1 = _mm_srli_si128(rgb0, 3*2); + __m128i rgb2 = _mm_srli_si128(_mm_loadu_si128((const __m128i*) (ptr8 + 4*2)), 2*2); + __m128i rgb3 = _mm_srli_si128(rgb2, 3*2); + + __m128i rrggbb01 = _mm_unpacklo_epi16(rgb0, rgb1); + __m128i rrggbb23 = _mm_unpacklo_epi16(rgb2, rgb3); + *r = _mm_unpacklo_epi32(rrggbb01, rrggbb23); + *g = _mm_srli_si128(r->fVec, 4*2); + *b = _mm_unpackhi_epi32(rrggbb01, rrggbb23); + } AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) { __m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec); __m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec); @@ -334,6 +350,32 @@ public: *b = _mm_unpacklo_epi64(ba0123, ba4567); *a = _mm_unpackhi_epi64(ba0123, ba4567); } + AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) { + // TODO: AVX2 version + const uint8_t* ptr8 = (const uint8_t*) ptr; + __m128i rgb0 = _mm_loadu_si128((const __m128i*) (ptr8 + 0*2)); + __m128i rgb1 = _mm_srli_si128(rgb0, 3*2); + __m128i rgb2 = _mm_loadu_si128((const __m128i*) (ptr8 + 6*2)); + __m128i rgb3 = _mm_srli_si128(rgb2, 3*2); + __m128i rgb4 = _mm_loadu_si128((const __m128i*) (ptr8 + 12*2)); + __m128i rgb5 = _mm_srli_si128(rgb4, 3*2); + __m128i rgb6 = _mm_srli_si128(_mm_loadu_si128((const __m128i*) (ptr8 + 16*2)), 2*2); + __m128i rgb7 = _mm_srli_si128(rgb6, 3*2); + + __m128i rgb01 = _mm_unpacklo_epi16(rgb0, rgb1); + __m128i rgb23 = _mm_unpacklo_epi16(rgb2, rgb3); + __m128i rgb45 = _mm_unpacklo_epi16(rgb4, rgb5); + __m128i rgb67 = _mm_unpacklo_epi16(rgb6, rgb7); + + __m128i rg03 = _mm_unpacklo_epi32(rgb01, rgb23); + __m128i bx03 = _mm_unpackhi_epi32(rgb01, rgb23); + __m128i rg47 = _mm_unpacklo_epi32(rgb45, rgb67); + __m128i bx47 = _mm_unpackhi_epi32(rgb45, rgb67); + + *r = _mm_unpacklo_epi64(rg03, rg47); + *g = _mm_unpackhi_epi64(rg03, rg47); + *b = _mm_unpacklo_epi64(bx03, bx47); + } AI static void Store4(void* ptr, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) { // TODO: AVX2 version __m128i rg0123 = _mm_unpacklo_epi16(r.fVec, g.fVec), // r0 g0 r1 g1 r2 g2 r3 g3 diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h index 71a15c68a8..f496bd531c 100644 --- a/src/opts/SkRasterPipeline_opts.h +++ b/src/opts/SkRasterPipeline_opts.h @@ -591,6 +591,23 @@ STAGE_CTX(load_u16_be, const uint64_t**) { a = (1.0f / 65535.0f) * SkNx_cast((ah << 8) | (ah >> 8)); } +STAGE_CTX(load_rgb_u16_be, const uint16_t**) { + auto ptr = *ctx + 3*x; + const void* src = ptr; + uint16_t buf[N*3] = {0}; + if (tail) { + memcpy(buf, src, tail*3*sizeof(uint16_t)); + src = buf; + } + + SkNh rh, gh, bh; + SkNh::Load3(src, &rh, &gh, &bh); + r = (1.0f / 65535.0f) * SkNx_cast((rh << 8) | (rh >> 8)); + g = (1.0f / 65535.0f) * SkNx_cast((gh << 8) | (gh >> 8)); + b = (1.0f / 65535.0f) * SkNx_cast((bh << 8) | (bh >> 8)); + a = 1.0f; +} + STAGE_CTX(load_tables, const LoadTablesContext*) { auto ptr = (const uint32_t*)ctx->fSrc + x; @@ -621,6 +638,25 @@ STAGE_CTX(load_tables_u16_be, const LoadTablesContext*) { a = (1.0f / 65535.0f) * SkNx_cast((ah << 8) | (ah >> 8)); } +STAGE_CTX(load_tables_rgb_u16_be, const LoadTablesContext*) { + auto ptr = (const uint16_t*)ctx->fSrc + 3*x; + const void* src = ptr; + uint16_t buf[N*3] = {0}; + if (tail) { + memcpy(buf, src, tail*3*sizeof(uint16_t)); + src = buf; + } + + SkNh rh, gh, bh; + SkNh::Load3(src, &rh, &gh, &bh); + + // ctx->fSrc is big-endian, so "& 0xff" grabs the 8 most significant bits of each component. + r = gather(tail, ctx->fR, SkNx_cast(rh & 0xff)); + g = gather(tail, ctx->fG, SkNx_cast(gh & 0xff)); + b = gather(tail, ctx->fB, SkNx_cast(bh & 0xff)); + a = 1.0f; +} + STAGE_CTX(store_tables, const StoreTablesContext*) { auto ptr = ctx->fDst + x;