Roll skia/third_party/skcms dfd5b3a4a61f..7362d8835a07 (1 commits)

https://skia.googlesource.com/skcms.git/+log/dfd5b3a4a61f..7362d8835a07

2018-10-12 mtklein@google.com relax CLUT inlining a bit

The AutoRoll server is located here: https://autoroll.skia.org/r/skcms-skia-autoroll

Documentation for the AutoRoller is here:
https://skia.googlesource.com/buildbot/+/master/autoroll/README.md

If the roll is causing failures, please contact the current sheriff, who should
be CC'd on the roll, and stop the roller if necessary.

CQ_INCLUDE_TRYBOTS=master.tryserver.blink:linux_trusty_blink_rel
TBR=stani@google.com

Change-Id: I07c9643d3a8df43042fb517536a040a14b4b59a4
Reviewed-on: https://skia-review.googlesource.com/c/161917
Reviewed-by: skia-autoroll <skia-autoroll@skia-public.iam.gserviceaccount.com>
Commit-Queue: skia-autoroll <skia-autoroll@skia-public.iam.gserviceaccount.com>
2018-10-12 20:36:16 +00:00 · 2018-10-12 20:36:16 +00:00 · 07d747af9d
commit 07d747af9d
parent 3655e4059e
2 changed files with 52 additions and 71 deletions
--- a/third_party/skcms/src/Transform_inl.h
+++ b/third_party/skcms/src/Transform_inl.h
@@ -509,81 +509,62 @@ SI F table_16(const skcms_Curve* curve, F v) {
    return l + (h-l)*t;
 }

-// Color lookup tables, by input dimension and bit depth.
-SI void clut_0_8(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
+template <int>
+static void sample_clut(const skcms_A2B*, I32 ix, F* r, F* g, F* b);
+
+template <> void sample_clut<8>(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) {
    U32 rgb = gather_24(a2b->grid_8, ix);

    *r = cast<F>((rgb >>  0) & 0xff) * (1/255.0f);
    *g = cast<F>((rgb >>  8) & 0xff) * (1/255.0f);
    *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
-
-    (void)a;
-    (void)stride;
-}
-SI void clut_0_16(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
-    #if defined(__arm__)
-        // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
-        *r = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+0));
-        *g = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+1));
-        *b = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+2));
-    #else
-        // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
-        U64 rgb;
-        gather_48(a2b->grid_16, ix, &rgb);
-        rgb = swap_endian_16x4(rgb);
-
-        *r = cast<F>((rgb >>  0) & 0xffff) * (1/65535.0f);
-        *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
-        *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
-    #endif
-    (void)a;
-    (void)stride;
 }

-// __attribute__((always_inline)) hits some pathological case in GCC that makes
-// compilation way too slow for my patience.
-#if defined(__clang__)
-    #define MAYBE_SI SI
+template <> void sample_clut<16>(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) {
+#if defined(__arm__)
+    // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
+    *r = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+0));
+    *g = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+1));
+    *b = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+2));
 #else
-    #define MAYBE_SI static inline
-#endif
+    // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
+    U64 rgb;
+    gather_48(a2b->grid_16, ix, &rgb);
+    rgb = swap_endian_16x4(rgb);

-// These are all the same basic approach: handle one dimension, then the rest recursively.
-// We let "I" be the current dimension, and "J" the previous dimension, I-1.  "B" is the bit depth.
-#define DEF_CLUT(I,J,B)                                                                    \
-    MAYBE_SI \
-    void clut_##I##_##B(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) { \
-        I32 limit = cast<I32>(F0);                                                         \
-        limit += a2b->grid_points[I-1];                                                    \
-                                                                                           \
-        const F* srcs[] = { r,g,b,&a };                                                    \
-        F src = *srcs[I-1];                                                                \
-                                                                                           \
-        F x = max_(F0, min_(src, F1)) * cast<F>(limit - 1);                                \
-                                                                                           \
-        I32 lo = cast<I32>(            x      ),                                           \
-            hi = cast<I32>(minus_1_ulp(x+1.0f));                                           \
-        F lr = *r, lg = *g, lb = *b,                                                       \
-          hr = *r, hg = *g, hb = *b;                                                       \
-        clut_##J##_##B(a2b, stride*lo + ix, stride*limit, &lr,&lg,&lb,a);                  \
-        clut_##J##_##B(a2b, stride*hi + ix, stride*limit, &hr,&hg,&hb,a);                  \
-                                                                                           \
-        F t = x - cast<F>(lo);                                                             \
-        *r = lr + (hr-lr)*t;                                                               \
-        *g = lg + (hg-lg)*t;                                                               \
-        *b = lb + (hb-lb)*t;                                                               \
+    *r = cast<F>((rgb >>  0) & 0xffff) * (1/65535.0f);
+    *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
+    *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
+#endif
+}
+
+template <int kBitDepth>
+static void clut(const skcms_A2B* a2b, int dim, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
+    if (dim == 0) {
+        sample_clut<kBitDepth>(a2b,ix, r,g,b);
+        return;
    }

-DEF_CLUT(1,0,8)
-DEF_CLUT(2,1,8)
-DEF_CLUT(3,2,8)
-DEF_CLUT(4,3,8)
+    I32 limit = cast<I32>(F0);
+    limit += a2b->grid_points[dim-1];

-DEF_CLUT(1,0,16)
-DEF_CLUT(2,1,16)
-DEF_CLUT(3,2,16)
-DEF_CLUT(4,3,16)
+    const F* srcs[] = { r,g,b,&a };
+    F src = *srcs[dim-1];

+    F x = max_(F0, min_(src, F1)) * cast<F>(limit - 1);
+
+    I32 lo = cast<I32>(            x      ),
+        hi = cast<I32>(minus_1_ulp(x+1.0f));
+    F lr = *r, lg = *g, lb = *b,
+      hr = *r, hg = *g, hb = *b;
+    clut<kBitDepth>(a2b, dim-1, stride*lo + ix, stride*limit, &lr,&lg,&lb,a);
+    clut<kBitDepth>(a2b, dim-1, stride*hi + ix, stride*limit, &hr,&hg,&hb,a);
+
+    F t = x - cast<F>(lo);
+    *r = lr + (hr-lr)*t;
+    *g = lg + (hg-lg)*t;
+    *b = lb + (hb-lb)*t;
+}

 static void exec_ops(const Op* ops, const void** args,
                     const char* src, char* dst, int i) {
@@ -911,44 +892,44 @@ static void exec_ops(const Op* ops, const void** args,

            case Op_clut_1D_8:{
                const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-                clut_1_8(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
+                clut<8>(a2b, 1, cast<I32>(F0), cast<I32>(F1), &r,&g,&b,a);
            } break;

            case Op_clut_1D_16:{
                const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-                clut_1_16(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
+                clut<16>(a2b, 1, cast<I32>(F0), cast<I32>(F1), &r,&g,&b,a);
            } break;

            case Op_clut_2D_8:{
                const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-                clut_2_8(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
+                clut<8>(a2b, 2, cast<I32>(F0), cast<I32>(F1), &r,&g,&b,a);
            } break;

            case Op_clut_2D_16:{
                const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-                clut_2_16(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
+                clut<16>(a2b, 2, cast<I32>(F0), cast<I32>(F1), &r,&g,&b,a);
            } break;

            case Op_clut_3D_8:{
                const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-                clut_3_8(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
+                clut<8>(a2b, 3, cast<I32>(F0), cast<I32>(F1), &r,&g,&b,a);
            } break;

            case Op_clut_3D_16:{
                const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-                clut_3_16(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
+                clut<16>(a2b, 3, cast<I32>(F0), cast<I32>(F1), &r,&g,&b,a);
            } break;

            case Op_clut_4D_8:{
                const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-                clut_4_8(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
+                clut<8>(a2b, 4, cast<I32>(F0), cast<I32>(F1), &r,&g,&b,a);
                // 'a' was really a CMYK K, so our output is actually opaque.
                a = F1;
            } break;

            case Op_clut_4D_16:{
                const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-                clut_4_16(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
+                clut<16>(a2b, 4, cast<I32>(F0), cast<I32>(F1), &r,&g,&b,a);
                // 'a' was really a CMYK K, so our output is actually opaque.
                a = F1;
            } break;
--- a/third_party/skcms/version.sha1
+++ b/third_party/skcms/version.sha1
@@ -1 +1 @@
-dfd5b3a4a61f6fe1f91e0370d100132c17e29ca6
+7362d8835a07a4a3cb1cc5305238a472bbe614c8