Roll skia/third_party/skcms 36eeb1311a72..0da672fc2c69 (1 commit)

https://skia.googlesource.com/skcms.git/+log/36eeb1311a72..0da672fc2c69

2019-04-09 mtklein@google.com runtime detection for AVX-512


The AutoRoll server is located here: https://autoroll.skia.org/r/skcms-skia-autoroll

Documentation for the AutoRoller is here:
https://skia.googlesource.com/buildbot/+/master/autoroll/README.md

If the roll is causing failures, please contact the current sheriff, who should
be CC'd on the roll, and stop the roller if necessary.



CQ_INCLUDE_TRYBOTS=luci.chromium.try:linux-blink-rel
TBR=egdaniel@google.com

Change-Id: I03188337dea03591fecfae9b3e93a5c4a8bd3725
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/206880
Reviewed-by: skia-autoroll <skia-autoroll@skia-public.iam.gserviceaccount.com>
Commit-Queue: skia-autoroll <skia-autoroll@skia-public.iam.gserviceaccount.com>
This commit is contained in:
skia-autoroll 2019-04-09 13:57:02 +00:00 committed by Skia Commit-Bot
parent a519999fa5
commit ba6087c8b7
3 changed files with 150 additions and 69 deletions

View File

@ -17,6 +17,19 @@
#include <arm_neon.h>
#elif defined(__SSE__)
#include <immintrin.h>
#if defined(__clang__)
// That #include <immintrin.h> is usually enough, but Clang's headers
// "helpfully" skip including the whole kitchen sink when _MSC_VER is
// defined, because lots of programs on Windows would include that and
// it'd be a lot slower. But we want all those headers included so we
// can use their features after runtime checks later.
#include <smmintrin.h>
#include <avxintrin.h>
#include <avx2intrin.h>
#include <avx512fintrin.h>
#include <avx512dqintrin.h>
#endif
#endif
// sizeof(x) will return size_t, which is 32-bit on some machines and 64-bit on others.
@ -1864,80 +1877,127 @@ namespace baseline {
#if !defined(SKCMS_PORTABLE) && \
(( defined(__clang__) && __clang_major__ >= 5) || \
(!defined(__clang__) && defined(__GNUC__))) \
&& defined(__x86_64__) && !defined(__AVX2__)
&& defined(__x86_64__)
#if defined(__clang__)
#pragma clang attribute push(__attribute__((target("avx2,f16c"))), apply_to=function)
#elif defined(__GNUC__)
#pragma GCC push_options
#pragma GCC target("avx2,f16c")
#if !defined(__AVX2__)
#if defined(__clang__)
#pragma clang attribute push(__attribute__((target("avx2,f16c"))), apply_to=function)
#elif defined(__GNUC__)
#pragma GCC push_options
#pragma GCC target("avx2,f16c")
#endif
namespace hsw {
#define USING_AVX
#define USING_AVX_F16C
#define USING_AVX2
#define N 8
using F = Vec<N,float>;
using I32 = Vec<N,int32_t>;
using U64 = Vec<N,uint64_t>;
using U32 = Vec<N,uint32_t>;
using U16 = Vec<N,uint16_t>;
using U8 = Vec<N,uint8_t>;
#include "src/Transform_inl.h"
// src/Transform_inl.h will undefine USING_* for us.
#undef N
}
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#define TEST_FOR_HSW
#endif
namespace hsw {
#define USING_AVX
#define USING_AVX_F16C
#define USING_AVX2
#define N 8
using F = Vec<N,float>;
using I32 = Vec<N,int32_t>;
using U64 = Vec<N,uint64_t>;
using U32 = Vec<N,uint32_t>;
using U16 = Vec<N,uint16_t>;
using U8 = Vec<N,uint8_t>;
#if !defined(__AVX512F__)
#if defined(__clang__)
#pragma clang attribute push(__attribute__((target("avx512f,avx512dq,avx512cd,avx512bw,avx512vl"))), apply_to=function)
#elif defined(__GNUC__)
#pragma GCC push_options
#pragma GCC target("avx512f,avx512dq,avx512cd,avx512bw,avx512vl")
#endif
#include "src/Transform_inl.h"
namespace skx {
#define USING_AVX512F
#define N 16
using F = Vec<N,float>;
using I32 = Vec<N,int32_t>;
using U64 = Vec<N,uint64_t>;
using U32 = Vec<N,uint32_t>;
using U16 = Vec<N,uint16_t>;
using U8 = Vec<N,uint8_t>;
// src/Transform_inl.h will undefine USING_* for us.
#undef N
}
#include "src/Transform_inl.h"
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
// src/Transform_inl.h will undefine USING_* for us.
#undef N
}
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#define TEST_FOR_SKX
#endif
#define TEST_FOR_HSW
#if defined(TEST_FOR_HSW) || defined(TEST_FOR_SKX)
enum class CpuType { None, HSW, SKX };
static CpuType cpu_type() {
static const CpuType type = []{
// See http://www.sandpile.org/x86/cpuid.htm
static bool hsw_ok() {
static const bool ok = []{
// See http://www.sandpile.org/x86/cpuid.htm
// First, a basic cpuid(1) lets us check prerequisites for HSW, SKX.
uint32_t eax, ebx, ecx, edx;
__asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
: "0"(1), "2"(0));
if ((edx & (1u<<25)) && // SSE
(edx & (1u<<26)) && // SSE2
(ecx & (1u<< 0)) && // SSE3
(ecx & (1u<< 9)) && // SSSE3
(ecx & (1u<<12)) && // FMA (N.B. not used, avoided even)
(ecx & (1u<<19)) && // SSE4.1
(ecx & (1u<<20)) && // SSE4.2
(ecx & (1u<<26)) && // XSAVE
(ecx & (1u<<27)) && // OSXSAVE
(ecx & (1u<<28)) && // AVX
(ecx & (1u<<29))) { // F16C
// First, a basic cpuid(1).
uint32_t eax, ebx, ecx, edx;
__asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
: "0"(1), "2"(0));
// Call cpuid(7) to check for AVX2 and AVX-512 bits.
__asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
: "0"(7), "2"(0));
// eax from xgetbv(0) will tell us whether XMM, YMM, and ZMM state is saved.
uint32_t xcr0, dont_need_edx;
__asm__ __volatile__("xgetbv" : "=a"(xcr0), "=d"(dont_need_edx) : "c"(0));
// Sanity check for prerequisites.
if ((edx & (1<<25)) != (1<<25)) { return false; } // SSE
if ((edx & (1<<26)) != (1<<26)) { return false; } // SSE2
if ((ecx & (1<< 0)) != (1<< 0)) { return false; } // SSE3
if ((ecx & (1<< 9)) != (1<< 9)) { return false; } // SSSE3
if ((ecx & (1<<19)) != (1<<19)) { return false; } // SSE4.1
if ((ecx & (1<<20)) != (1<<20)) { return false; } // SSE4.2
if ((ecx & (3<<26)) != (3<<26)) { return false; } // XSAVE + OSXSAVE
{
uint32_t eax_xgetbv, edx_xgetbv;
__asm__ __volatile__("xgetbv" : "=a"(eax_xgetbv), "=d"(edx_xgetbv) : "c"(0));
if ((eax_xgetbv & (3<<1)) != (3<<1)) { return false; } // XMM+YMM state saved?
}
if ((ecx & (1<<28)) != (1<<28)) { return false; } // AVX
if ((ecx & (1<<29)) != (1<<29)) { return false; } // F16C
if ((ecx & (1<<12)) != (1<<12)) { return false; } // FMA (TODO: not currently used)
// Call cpuid(7) to check for our final AVX2 feature bit!
__asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
: "0"(7), "2"(0));
if ((ebx & (1<< 5)) != (1<< 5)) { return false; } // AVX2
return true;
}();
return ok;
}
if ((xcr0 & (1u<<1)) && // XMM register state saved?
(xcr0 & (1u<<2)) && // YMM register state saved?
(ebx & (1u<<5))) { // AVX2
// At this point we're at least HSW. Continue checking for SKX.
if ((xcr0 & (1u<< 5)) && // Opmasks state saved?
(xcr0 & (1u<< 6)) && // First 16 ZMM registers saved?
(xcr0 & (1u<< 7)) && // High 16 ZMM registers saved?
(ebx & (1u<<16)) && // AVX512F
(ebx & (1u<<17)) && // AVX512DQ
(ebx & (1u<<28)) && // AVX512CD
(ebx & (1u<<30)) && // AVX512BW
(ebx & (1u<<31))) { // AVX512VL
return CpuType::SKX;
}
return CpuType::HSW;
}
}
return CpuType::None;
}();
return type;
}
#endif
#endif
@ -2260,7 +2320,18 @@ bool skcms_TransformWithPalette(const void* src,
auto run = baseline::run_program;
#if defined(TEST_FOR_HSW)
if (hsw_ok()) { run = hsw::run_program; }
switch (cpu_type()) {
case CpuType::None: break;
case CpuType::HSW: run = hsw::run_program; break;
case CpuType::SKX: run = hsw::run_program; break;
}
#endif
#if defined(TEST_FOR_SKX)
switch (cpu_type()) {
case CpuType::None: break;
case CpuType::HSW: break;
case CpuType::SKX: run = skx::run_program; break;
}
#endif
run(program, arguments, (const char*)src, (char*)dst, n, src_bpp,dst_bpp);
return true;

View File

@ -43,6 +43,9 @@
#if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
#define USING_AVX2
#endif
#if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__)
#define USING_AVX512F
#endif
// Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
// This is more for organizational clarity... skcms.cc doesn't force these.
@ -138,7 +141,7 @@ SI T if_then_else(I32 cond, T t, T e) {
SI F F_from_Half(U16 half) {
#if defined(USING_NEON_F16C)
return vcvt_f32_f16((float16x4_t)half);
#elif defined(__AVX512F__)
#elif defined(USING_AVX512F)
return (F)_mm512_cvtph_ps((__m256i)half);
#elif defined(USING_AVX_F16C)
typedef int16_t __attribute__((vector_size(16))) I16;
@ -165,7 +168,7 @@ SI F F_from_Half(U16 half) {
SI U16 Half_from_F(F f) {
#if defined(USING_NEON_F16C)
return (U16)vcvt_f16_f32(f);
#elif defined(__AVX512F__)
#elif defined(USING_AVX512F)
return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
#elif defined(USING_AVX_F16C)
return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
@ -206,8 +209,12 @@ SI F floor_(F x) {
return floorf_(x);
#elif defined(__aarch64__)
return vrndmq_f32(x);
#elif defined(__AVX512F__)
return _mm512_floor_ps(x);
#elif defined(USING_AVX512F)
// Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
// and the integer sanitizer catches that this implicit cast changes the
// value from -1 to 65535. We'll cast manually to work around it.
// Read this as `return _mm512_floor_ps(x)`.
return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
#elif defined(USING_AVX)
return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
#elif defined(__SSE4_1__)
@ -1238,6 +1245,9 @@ static void run_program(const Op* program, const void** arguments,
#if defined(USING_AVX2)
#undef USING_AVX2
#endif
#if defined(USING_AVX512F)
#undef USING_AVX512F
#endif
#if defined(USING_NEON)
#undef USING_NEON

View File

@ -1 +1 @@
36eeb1311a720755fb065f46987158e42edbdfb6
0da672fc2c69d3d7fd4c524c2d873ca725586d97