Roll skia/third_party/skcms 36eeb1311a72..0da672fc2c69 (1 commit)

https://skia.googlesource.com/skcms.git/+log/36eeb1311a72..0da672fc2c69

2019-04-09 mtklein@google.com runtime detection for AVX-512


The AutoRoll server is located here: https://autoroll.skia.org/r/skcms-skia-autoroll

Documentation for the AutoRoller is here:
https://skia.googlesource.com/buildbot/+/master/autoroll/README.md

If the roll is causing failures, please contact the current sheriff, who should
be CC'd on the roll, and stop the roller if necessary.



CQ_INCLUDE_TRYBOTS=luci.chromium.try:linux-blink-rel
TBR=egdaniel@google.com

Change-Id: I03188337dea03591fecfae9b3e93a5c4a8bd3725
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/206880
Reviewed-by: skia-autoroll <skia-autoroll@skia-public.iam.gserviceaccount.com>
Commit-Queue: skia-autoroll <skia-autoroll@skia-public.iam.gserviceaccount.com>
This commit is contained in:
skia-autoroll 2019-04-09 13:57:02 +00:00 committed by Skia Commit-Bot
parent a519999fa5
commit ba6087c8b7
3 changed files with 150 additions and 69 deletions

View File

@ -17,6 +17,19 @@
#include <arm_neon.h>
#elif defined(__SSE__)
#include <immintrin.h>
#if defined(__clang__)
// That #include <immintrin.h> is usually enough, but Clang's headers
// "helpfully" skip including the whole kitchen sink when _MSC_VER is
// defined, because lots of programs on Windows would include that and
// it'd be a lot slower. But we want all those headers included so we
// can use their features after runtime checks later.
#include <smmintrin.h>
#include <avxintrin.h>
#include <avx2intrin.h>
#include <avx512fintrin.h>
#include <avx512dqintrin.h>
#endif
#endif
// sizeof(x) will return size_t, which is 32-bit on some machines and 64-bit on others.
@ -1864,80 +1877,127 @@ namespace baseline {
#if !defined(SKCMS_PORTABLE) && \
(( defined(__clang__) && __clang_major__ >= 5) || \
(!defined(__clang__) && defined(__GNUC__))) \
&& defined(__x86_64__) && !defined(__AVX2__)
&& defined(__x86_64__)
#if defined(__clang__)
#pragma clang attribute push(__attribute__((target("avx2,f16c"))), apply_to=function)
#elif defined(__GNUC__)
#pragma GCC push_options
#pragma GCC target("avx2,f16c")
#if !defined(__AVX2__)
#if defined(__clang__)
#pragma clang attribute push(__attribute__((target("avx2,f16c"))), apply_to=function)
#elif defined(__GNUC__)
#pragma GCC push_options
#pragma GCC target("avx2,f16c")
#endif
namespace hsw {
#define USING_AVX
#define USING_AVX_F16C
#define USING_AVX2
#define N 8
using F = Vec<N,float>;
using I32 = Vec<N,int32_t>;
using U64 = Vec<N,uint64_t>;
using U32 = Vec<N,uint32_t>;
using U16 = Vec<N,uint16_t>;
using U8 = Vec<N,uint8_t>;
#include "src/Transform_inl.h"
// src/Transform_inl.h will undefine USING_* for us.
#undef N
}
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#define TEST_FOR_HSW
#endif
namespace hsw {
#define USING_AVX
#define USING_AVX_F16C
#define USING_AVX2
#define N 8
using F = Vec<N,float>;
using I32 = Vec<N,int32_t>;
using U64 = Vec<N,uint64_t>;
using U32 = Vec<N,uint32_t>;
using U16 = Vec<N,uint16_t>;
using U8 = Vec<N,uint8_t>;
#if !defined(__AVX512F__)
#if defined(__clang__)
#pragma clang attribute push(__attribute__((target("avx512f,avx512dq,avx512cd,avx512bw,avx512vl"))), apply_to=function)
#elif defined(__GNUC__)
#pragma GCC push_options
#pragma GCC target("avx512f,avx512dq,avx512cd,avx512bw,avx512vl")
#endif
#include "src/Transform_inl.h"
namespace skx {
#define USING_AVX512F
#define N 16
using F = Vec<N,float>;
using I32 = Vec<N,int32_t>;
using U64 = Vec<N,uint64_t>;
using U32 = Vec<N,uint32_t>;
using U16 = Vec<N,uint16_t>;
using U8 = Vec<N,uint8_t>;
// src/Transform_inl.h will undefine USING_* for us.
#undef N
}
#include "src/Transform_inl.h"
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
// src/Transform_inl.h will undefine USING_* for us.
#undef N
}
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#define TEST_FOR_SKX
#endif
#define TEST_FOR_HSW
#if defined(TEST_FOR_HSW) || defined(TEST_FOR_SKX)
enum class CpuType { None, HSW, SKX };
static CpuType cpu_type() {
static const CpuType type = []{
// See http://www.sandpile.org/x86/cpuid.htm
static bool hsw_ok() {
static const bool ok = []{
// See http://www.sandpile.org/x86/cpuid.htm
// First, a basic cpuid(1) lets us check prerequisites for HSW, SKX.
uint32_t eax, ebx, ecx, edx;
__asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
: "0"(1), "2"(0));
if ((edx & (1u<<25)) && // SSE
(edx & (1u<<26)) && // SSE2
(ecx & (1u<< 0)) && // SSE3
(ecx & (1u<< 9)) && // SSSE3
(ecx & (1u<<12)) && // FMA (N.B. not used, avoided even)
(ecx & (1u<<19)) && // SSE4.1
(ecx & (1u<<20)) && // SSE4.2
(ecx & (1u<<26)) && // XSAVE
(ecx & (1u<<27)) && // OSXSAVE
(ecx & (1u<<28)) && // AVX
(ecx & (1u<<29))) { // F16C
// First, a basic cpuid(1).
uint32_t eax, ebx, ecx, edx;
__asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
: "0"(1), "2"(0));
// Call cpuid(7) to check for AVX2 and AVX-512 bits.
__asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
: "0"(7), "2"(0));
// eax from xgetbv(0) will tell us whether XMM, YMM, and ZMM state is saved.
uint32_t xcr0, dont_need_edx;
__asm__ __volatile__("xgetbv" : "=a"(xcr0), "=d"(dont_need_edx) : "c"(0));
// Sanity check for prerequisites.
if ((edx & (1<<25)) != (1<<25)) { return false; } // SSE
if ((edx & (1<<26)) != (1<<26)) { return false; } // SSE2
if ((ecx & (1<< 0)) != (1<< 0)) { return false; } // SSE3
if ((ecx & (1<< 9)) != (1<< 9)) { return false; } // SSSE3
if ((ecx & (1<<19)) != (1<<19)) { return false; } // SSE4.1
if ((ecx & (1<<20)) != (1<<20)) { return false; } // SSE4.2
if ((ecx & (3<<26)) != (3<<26)) { return false; } // XSAVE + OSXSAVE
{
uint32_t eax_xgetbv, edx_xgetbv;
__asm__ __volatile__("xgetbv" : "=a"(eax_xgetbv), "=d"(edx_xgetbv) : "c"(0));
if ((eax_xgetbv & (3<<1)) != (3<<1)) { return false; } // XMM+YMM state saved?
}
if ((ecx & (1<<28)) != (1<<28)) { return false; } // AVX
if ((ecx & (1<<29)) != (1<<29)) { return false; } // F16C
if ((ecx & (1<<12)) != (1<<12)) { return false; } // FMA (TODO: not currently used)
// Call cpuid(7) to check for our final AVX2 feature bit!
__asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
: "0"(7), "2"(0));
if ((ebx & (1<< 5)) != (1<< 5)) { return false; } // AVX2
return true;
}();
return ok;
}
if ((xcr0 & (1u<<1)) && // XMM register state saved?
(xcr0 & (1u<<2)) && // YMM register state saved?
(ebx & (1u<<5))) { // AVX2
// At this point we're at least HSW. Continue checking for SKX.
if ((xcr0 & (1u<< 5)) && // Opmasks state saved?
(xcr0 & (1u<< 6)) && // First 16 ZMM registers saved?
(xcr0 & (1u<< 7)) && // High 16 ZMM registers saved?
(ebx & (1u<<16)) && // AVX512F
(ebx & (1u<<17)) && // AVX512DQ
(ebx & (1u<<28)) && // AVX512CD
(ebx & (1u<<30)) && // AVX512BW
(ebx & (1u<<31))) { // AVX512VL
return CpuType::SKX;
}
return CpuType::HSW;
}
}
return CpuType::None;
}();
return type;
}
#endif
#endif
@ -2260,7 +2320,18 @@ bool skcms_TransformWithPalette(const void* src,
auto run = baseline::run_program;
#if defined(TEST_FOR_HSW)
if (hsw_ok()) { run = hsw::run_program; }
switch (cpu_type()) {
case CpuType::None: break;
case CpuType::HSW: run = hsw::run_program; break;
case CpuType::SKX: run = hsw::run_program; break;
}
#endif
#if defined(TEST_FOR_SKX)
switch (cpu_type()) {
case CpuType::None: break;
case CpuType::HSW: break;
case CpuType::SKX: run = skx::run_program; break;
}
#endif
run(program, arguments, (const char*)src, (char*)dst, n, src_bpp,dst_bpp);
return true;

View File

@ -43,6 +43,9 @@
#if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
#define USING_AVX2
#endif
#if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__)
#define USING_AVX512F
#endif
// Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
// This is more for organizational clarity... skcms.cc doesn't force these.
@ -138,7 +141,7 @@ SI T if_then_else(I32 cond, T t, T e) {
SI F F_from_Half(U16 half) {
#if defined(USING_NEON_F16C)
return vcvt_f32_f16((float16x4_t)half);
#elif defined(__AVX512F__)
#elif defined(USING_AVX512F)
return (F)_mm512_cvtph_ps((__m256i)half);
#elif defined(USING_AVX_F16C)
typedef int16_t __attribute__((vector_size(16))) I16;
@ -165,7 +168,7 @@ SI F F_from_Half(U16 half) {
SI U16 Half_from_F(F f) {
#if defined(USING_NEON_F16C)
return (U16)vcvt_f16_f32(f);
#elif defined(__AVX512F__)
#elif defined(USING_AVX512F)
return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
#elif defined(USING_AVX_F16C)
return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
@ -206,8 +209,12 @@ SI F floor_(F x) {
return floorf_(x);
#elif defined(__aarch64__)
return vrndmq_f32(x);
#elif defined(__AVX512F__)
return _mm512_floor_ps(x);
#elif defined(USING_AVX512F)
// Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
// and the integer sanitizer catches that this implicit cast changes the
// value from -1 to 65535. We'll cast manually to work around it.
// Read this as `return _mm512_floor_ps(x)`.
return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
#elif defined(USING_AVX)
return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
#elif defined(__SSE4_1__)
@ -1238,6 +1245,9 @@ static void run_program(const Op* program, const void** arguments,
#if defined(USING_AVX2)
#undef USING_AVX2
#endif
#if defined(USING_AVX512F)
#undef USING_AVX512F
#endif
#if defined(USING_NEON)
#undef USING_NEON

View File

@ -1 +1 @@
36eeb1311a720755fb065f46987158e42edbdfb6
0da672fc2c69d3d7fd4c524c2d873ca725586d97