Roll skia/third_party/skcms 36eeb1311a72..0da672fc2c69 (1 commits)
https://skia.googlesource.com/skcms.git/+log/36eeb1311a72..0da672fc2c69 2019-04-09 mtklein@google.com runtime detection for AVX-512 The AutoRoll server is located here: https://autoroll.skia.org/r/skcms-skia-autoroll Documentation for the AutoRoller is here: https://skia.googlesource.com/buildbot/+/master/autoroll/README.md If the roll is causing failures, please contact the current sheriff, who should be CC'd on the roll, and stop the roller if necessary. CQ_INCLUDE_TRYBOTS=luci.chromium.try:linux-blink-rel TBR=egdaniel@google.com Change-Id: I03188337dea03591fecfae9b3e93a5c4a8bd3725 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/206880 Reviewed-by: skia-autoroll <skia-autoroll@skia-public.iam.gserviceaccount.com> Commit-Queue: skia-autoroll <skia-autoroll@skia-public.iam.gserviceaccount.com>
This commit is contained in:
parent
a519999fa5
commit
ba6087c8b7
199
third_party/skcms/skcms.cc
vendored
199
third_party/skcms/skcms.cc
vendored
@ -17,6 +17,19 @@
|
||||
#include <arm_neon.h>
|
||||
#elif defined(__SSE__)
|
||||
#include <immintrin.h>
|
||||
|
||||
#if defined(__clang__)
|
||||
// That #include <immintrin.h> is usually enough, but Clang's headers
|
||||
// "helpfully" skip including the whole kitchen sink when _MSC_VER is
|
||||
// defined, because lots of programs on Windows would include that and
|
||||
// it'd be a lot slower. But we want all those headers included so we
|
||||
// can use their features after runtime checks later.
|
||||
#include <smmintrin.h>
|
||||
#include <avxintrin.h>
|
||||
#include <avx2intrin.h>
|
||||
#include <avx512fintrin.h>
|
||||
#include <avx512dqintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// sizeof(x) will return size_t, which is 32-bit on some machines and 64-bit on others.
|
||||
@ -1864,80 +1877,127 @@ namespace baseline {
|
||||
#if !defined(SKCMS_PORTABLE) && \
|
||||
(( defined(__clang__) && __clang_major__ >= 5) || \
|
||||
(!defined(__clang__) && defined(__GNUC__))) \
|
||||
&& defined(__x86_64__) && !defined(__AVX2__)
|
||||
&& defined(__x86_64__)
|
||||
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute push(__attribute__((target("avx2,f16c"))), apply_to=function)
|
||||
#elif defined(__GNUC__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx2,f16c")
|
||||
#if !defined(__AVX2__)
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute push(__attribute__((target("avx2,f16c"))), apply_to=function)
|
||||
#elif defined(__GNUC__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx2,f16c")
|
||||
#endif
|
||||
|
||||
namespace hsw {
|
||||
#define USING_AVX
|
||||
#define USING_AVX_F16C
|
||||
#define USING_AVX2
|
||||
#define N 8
|
||||
using F = Vec<N,float>;
|
||||
using I32 = Vec<N,int32_t>;
|
||||
using U64 = Vec<N,uint64_t>;
|
||||
using U32 = Vec<N,uint32_t>;
|
||||
using U16 = Vec<N,uint16_t>;
|
||||
using U8 = Vec<N,uint8_t>;
|
||||
|
||||
#include "src/Transform_inl.h"
|
||||
|
||||
// src/Transform_inl.h will undefine USING_* for us.
|
||||
#undef N
|
||||
}
|
||||
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute pop
|
||||
#elif defined(__GNUC__)
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
|
||||
#define TEST_FOR_HSW
|
||||
#endif
|
||||
|
||||
namespace hsw {
|
||||
#define USING_AVX
|
||||
#define USING_AVX_F16C
|
||||
#define USING_AVX2
|
||||
#define N 8
|
||||
using F = Vec<N,float>;
|
||||
using I32 = Vec<N,int32_t>;
|
||||
using U64 = Vec<N,uint64_t>;
|
||||
using U32 = Vec<N,uint32_t>;
|
||||
using U16 = Vec<N,uint16_t>;
|
||||
using U8 = Vec<N,uint8_t>;
|
||||
#if !defined(__AVX512F__)
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute push(__attribute__((target("avx512f,avx512dq,avx512cd,avx512bw,avx512vl"))), apply_to=function)
|
||||
#elif defined(__GNUC__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx512f,avx512dq,avx512cd,avx512bw,avx512vl")
|
||||
#endif
|
||||
|
||||
#include "src/Transform_inl.h"
|
||||
namespace skx {
|
||||
#define USING_AVX512F
|
||||
#define N 16
|
||||
using F = Vec<N,float>;
|
||||
using I32 = Vec<N,int32_t>;
|
||||
using U64 = Vec<N,uint64_t>;
|
||||
using U32 = Vec<N,uint32_t>;
|
||||
using U16 = Vec<N,uint16_t>;
|
||||
using U8 = Vec<N,uint8_t>;
|
||||
|
||||
// src/Transform_inl.h will undefine USING_* for us.
|
||||
#undef N
|
||||
}
|
||||
#include "src/Transform_inl.h"
|
||||
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute pop
|
||||
#elif defined(__GNUC__)
|
||||
#pragma GCC pop_options
|
||||
// src/Transform_inl.h will undefine USING_* for us.
|
||||
#undef N
|
||||
}
|
||||
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute pop
|
||||
#elif defined(__GNUC__)
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
|
||||
#define TEST_FOR_SKX
|
||||
#endif
|
||||
|
||||
#define TEST_FOR_HSW
|
||||
#if defined(TEST_FOR_HSW) || defined(TEST_FOR_SKX)
|
||||
enum class CpuType { None, HSW, SKX };
|
||||
static CpuType cpu_type() {
|
||||
static const CpuType type = []{
|
||||
// See http://www.sandpile.org/x86/cpuid.htm
|
||||
|
||||
static bool hsw_ok() {
|
||||
static const bool ok = []{
|
||||
// See http://www.sandpile.org/x86/cpuid.htm
|
||||
// First, a basic cpuid(1) lets us check prerequisites for HSW, SKX.
|
||||
uint32_t eax, ebx, ecx, edx;
|
||||
__asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
|
||||
: "0"(1), "2"(0));
|
||||
if ((edx & (1u<<25)) && // SSE
|
||||
(edx & (1u<<26)) && // SSE2
|
||||
(ecx & (1u<< 0)) && // SSE3
|
||||
(ecx & (1u<< 9)) && // SSSE3
|
||||
(ecx & (1u<<12)) && // FMA (N.B. not used, avoided even)
|
||||
(ecx & (1u<<19)) && // SSE4.1
|
||||
(ecx & (1u<<20)) && // SSE4.2
|
||||
(ecx & (1u<<26)) && // XSAVE
|
||||
(ecx & (1u<<27)) && // OSXSAVE
|
||||
(ecx & (1u<<28)) && // AVX
|
||||
(ecx & (1u<<29))) { // F16C
|
||||
|
||||
// First, a basic cpuid(1).
|
||||
uint32_t eax, ebx, ecx, edx;
|
||||
__asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
|
||||
: "0"(1), "2"(0));
|
||||
// Call cpuid(7) to check for AVX2 and AVX-512 bits.
|
||||
__asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
|
||||
: "0"(7), "2"(0));
|
||||
// eax from xgetbv(0) will tell us whether XMM, YMM, and ZMM state is saved.
|
||||
uint32_t xcr0, dont_need_edx;
|
||||
__asm__ __volatile__("xgetbv" : "=a"(xcr0), "=d"(dont_need_edx) : "c"(0));
|
||||
|
||||
// Sanity check for prerequisites.
|
||||
if ((edx & (1<<25)) != (1<<25)) { return false; } // SSE
|
||||
if ((edx & (1<<26)) != (1<<26)) { return false; } // SSE2
|
||||
if ((ecx & (1<< 0)) != (1<< 0)) { return false; } // SSE3
|
||||
if ((ecx & (1<< 9)) != (1<< 9)) { return false; } // SSSE3
|
||||
if ((ecx & (1<<19)) != (1<<19)) { return false; } // SSE4.1
|
||||
if ((ecx & (1<<20)) != (1<<20)) { return false; } // SSE4.2
|
||||
|
||||
if ((ecx & (3<<26)) != (3<<26)) { return false; } // XSAVE + OSXSAVE
|
||||
|
||||
{
|
||||
uint32_t eax_xgetbv, edx_xgetbv;
|
||||
__asm__ __volatile__("xgetbv" : "=a"(eax_xgetbv), "=d"(edx_xgetbv) : "c"(0));
|
||||
if ((eax_xgetbv & (3<<1)) != (3<<1)) { return false; } // XMM+YMM state saved?
|
||||
}
|
||||
|
||||
if ((ecx & (1<<28)) != (1<<28)) { return false; } // AVX
|
||||
if ((ecx & (1<<29)) != (1<<29)) { return false; } // F16C
|
||||
if ((ecx & (1<<12)) != (1<<12)) { return false; } // FMA (TODO: not currently used)
|
||||
|
||||
// Call cpuid(7) to check for our final AVX2 feature bit!
|
||||
__asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
|
||||
: "0"(7), "2"(0));
|
||||
if ((ebx & (1<< 5)) != (1<< 5)) { return false; } // AVX2
|
||||
|
||||
return true;
|
||||
}();
|
||||
|
||||
return ok;
|
||||
}
|
||||
if ((xcr0 & (1u<<1)) && // XMM register state saved?
|
||||
(xcr0 & (1u<<2)) && // YMM register state saved?
|
||||
(ebx & (1u<<5))) { // AVX2
|
||||
// At this point we're at least HSW. Continue checking for SKX.
|
||||
if ((xcr0 & (1u<< 5)) && // Opmasks state saved?
|
||||
(xcr0 & (1u<< 6)) && // First 16 ZMM registers saved?
|
||||
(xcr0 & (1u<< 7)) && // High 16 ZMM registers saved?
|
||||
(ebx & (1u<<16)) && // AVX512F
|
||||
(ebx & (1u<<17)) && // AVX512DQ
|
||||
(ebx & (1u<<28)) && // AVX512CD
|
||||
(ebx & (1u<<30)) && // AVX512BW
|
||||
(ebx & (1u<<31))) { // AVX512VL
|
||||
return CpuType::SKX;
|
||||
}
|
||||
return CpuType::HSW;
|
||||
}
|
||||
}
|
||||
return CpuType::None;
|
||||
}();
|
||||
return type;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@ -2260,7 +2320,18 @@ bool skcms_TransformWithPalette(const void* src,
|
||||
|
||||
auto run = baseline::run_program;
|
||||
#if defined(TEST_FOR_HSW)
|
||||
if (hsw_ok()) { run = hsw::run_program; }
|
||||
switch (cpu_type()) {
|
||||
case CpuType::None: break;
|
||||
case CpuType::HSW: run = hsw::run_program; break;
|
||||
case CpuType::SKX: run = hsw::run_program; break;
|
||||
}
|
||||
#endif
|
||||
#if defined(TEST_FOR_SKX)
|
||||
switch (cpu_type()) {
|
||||
case CpuType::None: break;
|
||||
case CpuType::HSW: break;
|
||||
case CpuType::SKX: run = skx::run_program; break;
|
||||
}
|
||||
#endif
|
||||
run(program, arguments, (const char*)src, (char*)dst, n, src_bpp,dst_bpp);
|
||||
return true;
|
||||
|
18
third_party/skcms/src/Transform_inl.h
vendored
18
third_party/skcms/src/Transform_inl.h
vendored
@ -43,6 +43,9 @@
|
||||
#if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
|
||||
#define USING_AVX2
|
||||
#endif
|
||||
#if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__)
|
||||
#define USING_AVX512F
|
||||
#endif
|
||||
|
||||
// Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
|
||||
// This is more for organizational clarity... skcms.cc doesn't force these.
|
||||
@ -138,7 +141,7 @@ SI T if_then_else(I32 cond, T t, T e) {
|
||||
SI F F_from_Half(U16 half) {
|
||||
#if defined(USING_NEON_F16C)
|
||||
return vcvt_f32_f16((float16x4_t)half);
|
||||
#elif defined(__AVX512F__)
|
||||
#elif defined(USING_AVX512F)
|
||||
return (F)_mm512_cvtph_ps((__m256i)half);
|
||||
#elif defined(USING_AVX_F16C)
|
||||
typedef int16_t __attribute__((vector_size(16))) I16;
|
||||
@ -165,7 +168,7 @@ SI F F_from_Half(U16 half) {
|
||||
SI U16 Half_from_F(F f) {
|
||||
#if defined(USING_NEON_F16C)
|
||||
return (U16)vcvt_f16_f32(f);
|
||||
#elif defined(__AVX512F__)
|
||||
#elif defined(USING_AVX512F)
|
||||
return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
|
||||
#elif defined(USING_AVX_F16C)
|
||||
return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
|
||||
@ -206,8 +209,12 @@ SI F floor_(F x) {
|
||||
return floorf_(x);
|
||||
#elif defined(__aarch64__)
|
||||
return vrndmq_f32(x);
|
||||
#elif defined(__AVX512F__)
|
||||
return _mm512_floor_ps(x);
|
||||
#elif defined(USING_AVX512F)
|
||||
// Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
|
||||
// and integer santizer catches that this implicit cast changes the
|
||||
// value from -1 to 65535. We'll cast manually to work around it.
|
||||
// Read this as `return _mm512_floor_ps(x)`.
|
||||
return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
|
||||
#elif defined(USING_AVX)
|
||||
return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
|
||||
#elif defined(__SSE4_1__)
|
||||
@ -1238,6 +1245,9 @@ static void run_program(const Op* program, const void** arguments,
|
||||
#if defined(USING_AVX2)
|
||||
#undef USING_AVX2
|
||||
#endif
|
||||
#if defined(USING_AVX512F)
|
||||
#undef USING_AVX512F
|
||||
#endif
|
||||
|
||||
#if defined(USING_NEON)
|
||||
#undef USING_NEON
|
||||
|
2
third_party/skcms/version.sha1
vendored
2
third_party/skcms/version.sha1
vendored
@ -1 +1 @@
|
||||
36eeb1311a720755fb065f46987158e42edbdfb6
|
||||
0da672fc2c69d3d7fd4c524c2d873ca725586d97
|
Loading…
Reference in New Issue
Block a user