Refactor and recomment SkJumper_stages.cpp.

SkJumper_stages.cpp is starting to get unwieldy.
This spins some logical parts out into their own headers.

I will follow up by moving more of the very specific
f16/f32 load/store logic into SkJumper_vectors.h too.

Change-Id: I2a3a055e9d1b65f56983d05649270772a4c69f31
Reviewed-on: https://skia-review.googlesource.com/11133
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Authored by Mike Klein on 2017-04-03 13:54:55 -04:00; committed by Skia Commit-Bot
parent dbcb607f3c
commit b9c4a6fc7d
3 changed files with 442 additions and 393 deletions

SkJumper_misc.h (new file)

@@ -0,0 +1,49 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkJumper_misc_DEFINED
#define SkJumper_misc_DEFINED
#include "SkJumper.h" // for memcpy()
// Miscellany used by SkJumper_stages.cpp and SkJumper_vectors.h.
// Every function in this file should be marked static and inline using SI.
#define SI static inline
template <typename T, typename P>
SI T unaligned_load(const P* p) { // const void* would work too, but const P* helps ARMv7 codegen.
T v;
memcpy(&v, p, sizeof(v));
return v;
}
template <typename Dst, typename Src>
SI Dst bit_cast(const Src& src) {
static_assert(sizeof(Dst) == sizeof(Src), "");
return unaligned_load<Dst>(&src);
}
// A couple functions for embedding constants directly into code,
// so that no .const or .literal4 section is created.
SI int C(int x) {
#if defined(JUMPER) && defined(__x86_64__)
// Move x-the-compile-time-constant as a literal into x-the-register.
asm("mov %1, %0" : "=r"(x) : "i"(x));
#endif
return x;
}
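// (The "i" constraint requires a compile-time immediate, so the constant is encoded
//  directly in the mov instruction rather than loaded from a .const/.literal4 pool.)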
SI float C(float f) {
int x = C(unaligned_load<int>(&f));
return unaligned_load<float>(&x);
}
// Syntax sugar to make C() easy to use for constant literals.
SI int operator "" _i(unsigned long long int i) { return C( (int)i); }
SI float operator "" _f( long double f) { return C((float)f); }
#endif//SkJumper_misc_DEFINED
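For context, here is a minimal standalone sketch of how unaligned_load() and bit_cast() behave. It is plain host C++, not part of this CL; main() and the printed value are illustrative only.

#include <cstdint>
#include <cstdio>
#include <cstring>

template <typename T, typename P>
static inline T unaligned_load(const P* p) {
    T v;
    std::memcpy(&v, p, sizeof(v));   // byte-wise copy sidesteps alignment and strict-aliasing issues
    return v;
}

template <typename Dst, typename Src>
static inline Dst bit_cast(const Src& src) {
    static_assert(sizeof(Dst) == sizeof(Src), "");
    return unaligned_load<Dst>(&src);
}

int main() {
    uint32_t bits = bit_cast<uint32_t>(1.0f);
    std::printf("%08x\n", bits);     // prints 3f800000, the IEEE-754 encoding of 1.0f
    return 0;
}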

SkJumper_stages.cpp

@@ -6,253 +6,177 @@
*/
#include "SkJumper.h"
#include "SkJumper_misc.h" // SI, unaligned_load(), bit_cast(), C(), operator"" _i and _f.
#include "SkJumper_vectors.h" // F, I32, U32, U16, U8, cast(), expand()
#define SI static inline
template <typename T, typename P>
SI T unaligned_load(const P* p) {
T v;
memcpy(&v, p, sizeof(v));
return v;
}
template <typename Dst, typename Src>
SI Dst bit_cast(const Src& src) {
static_assert(sizeof(Dst) == sizeof(Src), "");
return unaligned_load<Dst>(&src);
}
// A couple functions for embedding constants directly into code,
// so that no .const or .literal4 section is created.
SI int C(int x) {
#if defined(JUMPER) && defined(__x86_64__)
// Move x-the-compile-time-constant as a literal into x-the-register.
asm("mov %1, %0" : "=r"(x) : "i"(x));
#endif
return x;
}
SI float C(float f) {
int x = C(unaligned_load<int>(&f));
return unaligned_load<float>(&x);
}
SI int operator "" _i(unsigned long long int i) { return C( (int)i); }
SI float operator "" _f( long double f) { return C((float)f); }
// Not all constants can be generated using C() or _i/_f. We read the rest from this struct.
using K = const SkJumper_constants;
#if !defined(JUMPER)
// This path should lead to portable code that can be compiled directly into Skia.
// (All other paths are compiled offline by Clang into SkJumper_generated.h.)
#include <math.h>
using F = float;
using I32 = int32_t;
using U32 = uint32_t;
using U16 = uint16_t;
using U8 = uint8_t;
SI F mad(F f, F m, F a) { return f*m+a; }
SI F min(F a, F b) { return fminf(a,b); }
SI F max(F a, F b) { return fmaxf(a,b); }
SI F abs_ (F v) { return fabsf(v); }
SI F floor_(F v) { return floorf(v); }
SI F rcp (F v) { return 1.0f / v; }
SI F rsqrt (F v) { return 1.0f / sqrtf(v); }
SI U32 round (F v, F scale) { return (uint32_t)lrintf(v*scale); }
SI U16 pack(U32 v) { return (U16)v; }
SI U8 pack(U16 v) { return (U8)v; }
SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }
SI F gather(const float* p, U32 ix) { return p[ix]; }
#define WRAP(name) sk_##name
#elif defined(__aarch64__)
#include <arm_neon.h>
// Since we know we're using Clang, we can use its vector extensions.
using F = float __attribute__((ext_vector_type(4)));
using I32 = int32_t __attribute__((ext_vector_type(4)));
using U32 = uint32_t __attribute__((ext_vector_type(4)));
using U16 = uint16_t __attribute__((ext_vector_type(4)));
using U8 = uint8_t __attribute__((ext_vector_type(4)));
// We polyfill a few routines that Clang doesn't build into ext_vector_types.
SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
SI F min(F a, F b) { return vminq_f32(a,b); }
SI F max(F a, F b) { return vmaxq_f32(a,b); }
SI F abs_ (F v) { return vabsq_f32(v); }
SI F floor_(F v) { return vrndmq_f32(v); }
SI F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
SI F rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
SI U32 round (F v, F scale) { return vcvtnq_u32_f32(v*scale); }
SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); }
SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
#define WRAP(name) sk_##name##_aarch64
#elif defined(__arm__)
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
#error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
#endif
#include <arm_neon.h>
// We can pass {s0-s15} as arguments under AAPCS-VFP. We'll slice that as 8 d-registers.
using F = float __attribute__((ext_vector_type(2)));
using I32 = int32_t __attribute__((ext_vector_type(2)));
using U32 = uint32_t __attribute__((ext_vector_type(2)));
using U16 = uint16_t __attribute__((ext_vector_type(2)));
using U8 = uint8_t __attribute__((ext_vector_type(2)));
SI F mad(F f, F m, F a) { return vfma_f32(a,f,m); }
SI F min(F a, F b) { return vmin_f32(a,b); }
SI F max(F a, F b) { return vmax_f32(a,b); }
SI F abs_ (F v) { return vabs_f32(v); }
SI F rcp (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e ) * e; }
SI F rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
SI U32 round(F v, F scale) { return vcvt_u32_f32(mad(v,scale,0.5f)); }
SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); }
SI F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
SI F floor_(F v) {
F roundtrip = vcvt_f32_s32(vcvt_s32_f32(v));
return roundtrip - if_then_else(roundtrip > v, 1.0_f, 0);
}
SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
#define WRAP(name) sk_##name##_vfp4
#elif defined(__AVX__)
#include <immintrin.h>
// These are __m256 and __m256i, but friendlier and strongly-typed.
using F = float __attribute__((ext_vector_type(8)));
using I32 = int32_t __attribute__((ext_vector_type(8)));
using U32 = uint32_t __attribute__((ext_vector_type(8)));
using U16 = uint16_t __attribute__((ext_vector_type(8)));
using U8 = uint8_t __attribute__((ext_vector_type(8)));
SI F mad(F f, F m, F a) {
#if defined(__FMA__)
return _mm256_fmadd_ps(f,m,a);
#else
return f*m+a;
#endif
}
SI F min(F a, F b) { return _mm256_min_ps(a,b); }
SI F max(F a, F b) { return _mm256_max_ps(a,b); }
SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); }
SI F floor_(F v) { return _mm256_floor_ps(v); }
SI F rcp (F v) { return _mm256_rcp_ps (v); }
SI F rsqrt (F v) { return _mm256_rsqrt_ps(v); }
SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
SI U16 pack(U32 v) {
return _mm_packus_epi32(_mm256_extractf128_si256(v, 0),
_mm256_extractf128_si256(v, 1));
}
SI U8 pack(U16 v) {
auto r = _mm_packus_epi16(v,v);
return unaligned_load<U8>(&r);
}
SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
SI F gather(const float* p, U32 ix) {
#if defined(__AVX2__)
return _mm256_i32gather_ps(p, ix, 4);
#else
return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
#endif
}
#if defined(__AVX2__) && defined(__F16C__) && defined(__FMA__)
#define WRAP(name) sk_##name##_hsw
#else
#define WRAP(name) sk_##name##_avx
#endif
#elif defined(__SSE2__)
#include <immintrin.h>
using F = float __attribute__((ext_vector_type(4)));
using I32 = int32_t __attribute__((ext_vector_type(4)));
using U32 = uint32_t __attribute__((ext_vector_type(4)));
using U16 = uint16_t __attribute__((ext_vector_type(4)));
using U8 = uint8_t __attribute__((ext_vector_type(4)));
SI F mad(F f, F m, F a) { return f*m+a; }
SI F min(F a, F b) { return _mm_min_ps(a,b); }
SI F max(F a, F b) { return _mm_max_ps(a,b); }
SI F abs_(F v) { return _mm_and_ps(v, 0-v); }
SI F rcp (F v) { return _mm_rcp_ps (v); }
SI F rsqrt(F v) { return _mm_rsqrt_ps(v); }
SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }
SI U16 pack(U32 v) {
#if defined(__SSE4_1__)
auto p = _mm_packus_epi32(v,v);
#else
// Sign extend so that _mm_packs_epi32() does the pack we want.
auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16);
p = _mm_packs_epi32(p,p);
#endif
return unaligned_load<U16>(&p); // We have two copies. Return (the lower) one.
}
SI U8 pack(U16 v) {
__m128i r;
memcpy(&r, &v, sizeof(v));
r = _mm_packus_epi16(r,r);
return unaligned_load<U8>(&r);
}
SI F if_then_else(I32 c, F t, F e) {
return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
}
SI F floor_(F v) {
#if defined(__SSE4_1__)
return _mm_floor_ps(v);
#else
F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
return roundtrip - if_then_else(roundtrip > v, 1.0_f, 0);
#endif
}
SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
#if defined(__SSE4_1__)
#define WRAP(name) sk_##name##_sse41
#else
#define WRAP(name) sk_##name##_sse2
#endif
#endif
// Our fundamental vector depth is our pixel stride.
static const size_t kStride = sizeof(F) / sizeof(float);
// We need to be careful with casts.
// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
// These named casts and bit_cast() are always what they seem to be.
#if defined(JUMPER)
SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); }
SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
#else
SI F cast (U32 v) { return (F)v; }
SI U32 expand(U16 v) { return (U32)v; }
SI U32 expand(U8 v) { return (U32)v; }
#endif
// A reminder:
// Code guarded by defined(JUMPER) can assume that it will be compiled by Clang
// and that F, I32, etc. are kStride-deep ext_vector_types of the appropriate type.
// Otherwise, F, I32, etc. just alias the basic scalar types (and so kStride == 1).
// Another reminder:
// You can't generally use constants in this file except via C() or operator"" _i/_f.
// Not all constants can be generated using C() or _i/_f. Stages read the rest from this struct.
using K = const SkJumper_constants;
// Let's start first with the mechanisms we use to build Stages.
// Our program is an array of void*, either
// - 1 void* per stage with no context pointer, the next stage;
// - 2 void* per stage with a context pointer, first the context pointer, then the next stage.
// load_and_inc() steps the program forward by 1 void*, returning that pointer.
SI void* load_and_inc(void**& program) {
#if defined(__GNUC__) && defined(__x86_64__)
// If program is in %rsi (we try to make this likely) then this is a single instruction.
void* rax;
asm("lodsq" : "=a"(rax), "+S"(program)); // Write-only %rax, read-write %rsi.
return rax;
#else
// On ARM *program++ compiles into pretty ideal code without any handholding.
return *program++;
#endif
}
// LazyCtx doesn't do anything unless you call operator T*() or load(), encapsulating the
// logic from above that stages without a context pointer are represented by just 1 void*.
struct LazyCtx {
void* ptr;
void**& program;
explicit LazyCtx(void**& p) : ptr(nullptr), program(p) {}
template <typename T>
operator T*() {
if (!ptr) { ptr = load_and_inc(program); }
return (T*)ptr;
}
template <typename T>
T load() {
if (!ptr) { ptr = load_and_inc(program); }
return unaligned_load<T>(ptr);
}
};
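// (Illustrative sketch, not part of this CL, with hypothetical context variables y and rgba:
//      void* program[] = { (void*)WRAP(seed_shader),    &y,     // stage + its context
//                          (void*)WRAP(constant_color), &rgba,  // stage + its context
//                          (void*)WRAP(just_return) };          // no context; ends the chain
//  start_pipeline() pops the first stage pointer, and each stage pops its optional context
//  (via LazyCtx) and then the next stage pointer with load_and_inc().)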
// A little wrapper macro to name Stages differently depending on the instruction set.
// That lets us link together several options.
#if !defined(JUMPER)
#define WRAP(name) sk_##name
#elif defined(__aarch64__)
#define WRAP(name) sk_##name##_aarch64
#elif defined(__arm__)
#define WRAP(name) sk_##name##_vfp4
#elif defined(__AVX2__)
#define WRAP(name) sk_##name##_hsw
#elif defined(__AVX__)
#define WRAP(name) sk_##name##_avx
#elif defined(__SSE4_1__)
#define WRAP(name) sk_##name##_sse41
#elif defined(__SSE2__)
#define WRAP(name) sk_##name##_sse2
#endif
// We're finally going to get to what a Stage function looks like!
// It's best to jump down to the #else case first, then to come back up here for AVX.
#if defined(JUMPER) && defined(__AVX__)
// There's a big cost to switch between SSE and AVX, so we do a little
// extra work to handle even the jagged <kStride tail in AVX mode.
// Compared to normal stages, we maintain an extra tail register:
// tail == 0 ~~> work on a full kStride pixels
// tail != 0 ~~> work on only the first tail pixels
// tail is always < kStride.
using Stage = void(size_t x, void** program, K* k, size_t tail, F,F,F,F, F,F,F,F);
#if defined(JUMPER) && defined(WIN)
__attribute__((ms_abi))
#endif
extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
F v{};
auto start = (Stage*)load_and_inc(program);
while (x + kStride <= limit) {
start(x,program,k,0, v,v,v,v, v,v,v,v);
x += kStride;
}
if (size_t tail = limit - x) {
start(x,program,k,tail, v,v,v,v, v,v,v,v);
}
return limit;
}
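// (Worked example: with kStride == 8 and limit - x == 11, the loop above runs once with
//  tail == 0 for the first 8 pixels, then one extra call runs with tail == 3 for the rest.)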
#define STAGE(name) \
SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
extern "C" void WRAP(name)(size_t x, void** program, K* k, size_t tail, \
F r, F g, F b, F a, F dr, F dg, F db, F da) { \
LazyCtx ctx(program); \
name##_k(x,ctx,k,tail, r,g,b,a, dr,dg,db,da); \
auto next = (Stage*)load_and_inc(program); \
next(x,program,k,tail, r,g,b,a, dr,dg,db,da); \
} \
SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#else
// Other instruction sets (SSE, NEON, portable) can fall back on narrower
// pipelines cheaply, which frees us to always assume tail==0.
// Stages tail call between each other by following program as described above.
// x is our induction variable, stepping forward kStride at a time.
using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);
// On Windows, start_pipeline() has a normal Windows ABI, and then the rest is System V.
#if defined(JUMPER) && defined(WIN)
__attribute__((ms_abi))
#endif
extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
F v{};
auto start = (Stage*)load_and_inc(program);
while (x + kStride <= limit) {
start(x,program,k, v,v,v,v, v,v,v,v);
x += kStride;
}
return x;
}
// This STAGE macro makes it easier to write stages, handling all the Stage chaining for you.
#define STAGE(name) \
SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
extern "C" void WRAP(name)(size_t x, void** program, K* k, \
F r, F g, F b, F a, F dr, F dg, F db, F da) { \
LazyCtx ctx(program); \
name##_k(x,ctx,k,0, r,g,b,a, dr,dg,db,da); \
auto next = (Stage*)load_and_inc(program); \
next(x,program,k, r,g,b,a, dr,dg,db,da); \
} \
SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#endif
// just_return() is a simple no-op stage that only exists to end the chain,
// returning back up to start_pipeline(), and from there to the caller.
extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
// We could start defining normal Stages now. But first, some helper functions and types.
// Sometimes we want to work with 4 floats directly, regardless of the depth of the F vector.
#if defined(JUMPER)
using F4 = float __attribute__((ext_vector_type(4)));
#else
struct F4 {
float vals[4];
float operator[](int i) const { return vals[i]; }
};
#endif
// These load() and store() methods are tail-aware,
// but focus mainly on keeping the at-stride tail==0 case fast.
template <typename V, typename T>
SI V load(const T* src, size_t tail) {
@@ -295,7 +219,9 @@ SI void store(T* dst, V v, size_t tail) {
memcpy(dst, &v, sizeof(v));
}
// This doesn't look strictly necessary, but without it Clang would generate load() using
// compiler-generated constants that we can't support. This version doesn't need constants.
#if defined(JUMPER) && defined(__AVX__)
template <>
inline U8 load(const uint8_t* src, size_t tail) {
if (__builtin_expect(tail, 0)) {
@@ -312,8 +238,11 @@ SI void store(T* dst, V v, size_t tail) {
}
#endif
// AVX2 adds some mask loads and stores that make for shorter, faster code.
#if defined(JUMPER) && defined(__AVX2__)
SI U32 mask(size_t tail) {
// We go a little out of our way to avoid needing large constant values here.
// It's easiest to build the mask as 8 8-bit values, either 0x00 or 0xff.
// Start fully on, then shift away lanes from the top until we've got our mask.
uint64_t mask = 0xffffffffffffffff >> 8*(kStride-tail);
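// (e.g. kStride == 8, tail == 3: mask == 0xffffffffffffffff >> 40 == 0x0000000000ffffff,
//  i.e. the three low lanes on and the five high lanes off.)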
@@ -342,10 +271,6 @@ SI void store(T* dst, V v, size_t tail) {
#endif
SI F lerp(F from, F to, F t) {
return mad(to-from, t, from);
}
SI void from_565(U16 _565, F* r, F* g, F* b) {
U32 wide = expand(_565);
*r = cast(wide & C(31<<11)) * C(1.0f / (31<<11));
@@ -360,150 +285,7 @@ SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) {
*a = cast(wide & C(15<< 0)) * C(1.0f / (15<< 0));
}
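// (Each field is masked in place and scaled by the reciprocal of its largest in-place value:
//  e.g. red in 565 lives in bits 15..11, so (wide & C(31<<11)) spans 0..31<<11 and
//  multiplying by 1.0f/(31<<11) maps it onto 0..1.)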
// Sometimes we want to work with 4 floats directly, regardless of the depth of the F vector.
#if defined(JUMPER)
using F4 = float __attribute__((ext_vector_type(4)));
#else
struct F4 {
float vals[4];
float operator[](int i) const { return vals[i]; }
};
#endif
SI void* load_and_inc(void**& program) {
#if defined(__GNUC__) && defined(__x86_64__)
// Passing program as the second Stage argument makes it likely that it's in %rsi,
// so this is usually a single instruction *program++.
void* rax;
asm("lodsq" : "=a"(rax), "+S"(program)); // Write-only %rax, read-write %rsi.
return rax;
// When a Stage uses its ctx pointer, this optimization typically cuts an instruction:
// mov (%rsi), %rcx // ctx = program[0]
// ...
// mov 0x8(%rsi), %rax // next = program[1]
// add $0x10, %rsi // program += 2
// jmpq *%rax // JUMP!
// becomes
// lods %ds:(%rsi),%rax // ctx = *program++;
// ...
// lods %ds:(%rsi),%rax // next = *program++;
// jmpq *%rax // JUMP!
//
// When a Stage doesn't use its ctx pointer, it's 3 instructions either way,
// but using lodsq (a 2-byte instruction) tends to trim a few bytes.
#else
// On ARM *program++ compiles into a single instruction without any handholding.
return *program++;
#endif
}
// Doesn't do anything unless you resolve it, either by casting to a pointer or calling load().
// This makes it free in stages that have no context pointer to load (i.e. built with nullptr).
struct LazyCtx {
void* ptr;
void**& program;
explicit LazyCtx(void**& p) : ptr(nullptr), program(p) {}
template <typename T>
operator T*() {
if (!ptr) { ptr = load_and_inc(program); }
return (T*)ptr;
}
template <typename T>
T load() {
if (!ptr) { ptr = load_and_inc(program); }
return unaligned_load<T>(ptr);
}
};
#if defined(JUMPER) && defined(__AVX__)
// There's a big cost to switch between SSE and AVX+, so we do a little
// extra work to handle even the jagged <kStride tail in AVX+ mode.
using Stage = void(size_t x, void** program, K* k, size_t tail, F,F,F,F, F,F,F,F);
#if defined(JUMPER) && defined(WIN)
__attribute__((ms_abi))
#endif
extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
F v{};
auto start = (Stage*)load_and_inc(program);
while (x + kStride <= limit) {
start(x,program,k,0, v,v,v,v, v,v,v,v);
x += kStride;
}
if (size_t tail = limit - x) {
start(x,program,k,tail, v,v,v,v, v,v,v,v);
}
return limit;
}
#define STAGE(name) \
SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
extern "C" void WRAP(name)(size_t x, void** program, K* k, size_t tail, \
F r, F g, F b, F a, F dr, F dg, F db, F da) { \
LazyCtx ctx(program); \
name##_k(x,ctx,k,tail, r,g,b,a, dr,dg,db,da); \
auto next = (Stage*)load_and_inc(program); \
next(x,program,k,tail, r,g,b,a, dr,dg,db,da); \
} \
SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#else
// Other instruction sets (SSE, NEON, portable) can fall back on narrower
// pipelines cheaply, which frees us to always assume tail==0.
// Stages tail call between each other by following program,
// an interlaced sequence of Stage pointers and context pointers.
using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);
#if defined(JUMPER) && defined(WIN)
__attribute__((ms_abi))
#endif
extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
F v{};
auto start = (Stage*)load_and_inc(program);
while (x + kStride <= limit) {
start(x,program,k, v,v,v,v, v,v,v,v);
x += kStride;
}
return x;
}
#define STAGE(name) \
SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
extern "C" void WRAP(name)(size_t x, void** program, K* k, \
F r, F g, F b, F a, F dr, F dg, F db, F da) { \
LazyCtx ctx(program); \
name##_k(x,ctx,k,0, r,g,b,a, dr,dg,db,da); \
auto next = (Stage*)load_and_inc(program); \
next(x,program,k, r,g,b,a, dr,dg,db,da); \
} \
SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#endif
// Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
// We can now define Stages!
// Some things to keep in mind while writing Stages:
// - do not branch; (i.e. avoid jmp)
// - do not call functions that don't inline; (i.e. avoid call, ret)
// - do not use constant literals other than 0, ~0 and 0.0f. (i.e. avoid rip relative addressing)
//
// Some things that should work fine:
// - 0, ~0, and 0.0f;
// - arithmetic;
// - functions of F and U32 that we've defined above;
// - temporary values;
// - lambdas;
// - memcpy() with a compile-time constant size argument.
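// (Re the constants rule above, an illustrative example: write `v * 0.5_f` or `v * C(0.5f)`
//  rather than `v * 0.5f`, so the 0.5 is embedded in the instruction stream instead of a
//  .const/.literal4 section that needs rip-relative addressing.)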
// Now finally, normal Stages!
STAGE(seed_shader) {
auto y = *(const int*)ctx;
@@ -526,6 +308,7 @@ STAGE(constant_color) {
a = rgba[3];
}
// Most blend modes apply the same logic to each channel.
#define BLEND_MODE(name) \
SI F name##_channel(F s, F d, F sa, F da); \
STAGE(name) { \
@@ -554,8 +337,9 @@ BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; }
BLEND_MODE(plus_) { return s + d; }
BLEND_MODE(screen) { return s + d - s*d; }
BLEND_MODE(xor_) { return s*inv(da) + d*inv(sa); }
#undef BLEND_MODE
// Most other blend modes apply the same logic to colors, and srcover to alpha.
#define BLEND_MODE(name) \
SI F name##_channel(F s, F d, F sa, F da); \
STAGE(name) { \
@@ -605,6 +389,7 @@ BLEND_MODE(softlight) {
liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst); // 2 or 3?
return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc); // 1 or (2 or 3)?
}
#undef BLEND_MODE
STAGE(clamp_0) {
r = max(r, 0);
@@ -719,6 +504,10 @@ STAGE(scale_u8) {
a = a * c;
}
SI F lerp(F from, F to, F t) {
return mad(to-from, t, from);
}
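// (lerp() is the usual from + (to-from)*t: t == 0 yields `from`, t == 1 yields `to`.)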
STAGE(lerp_1_float) {
auto c = *(const float*)ctx;

SkJumper_vectors.h (new file)

@@ -0,0 +1,211 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkJumper_vectors_DEFINED
#define SkJumper_vectors_DEFINED
#include "SkJumper.h"
#include "SkJumper_misc.h"
// This file contains vector types that SkJumper_stages.cpp uses to define stages.
// Every function in this file should be marked static and inline using SI (see SkJumper_misc.h).
#if !defined(JUMPER)
// This path should lead to portable code that can be compiled directly into Skia.
// (All other paths are compiled offline by Clang into SkJumper_generated.S.)
#include <math.h>
using F = float;
using I32 = int32_t;
using U32 = uint32_t;
using U16 = uint16_t;
using U8 = uint8_t;
SI F mad(F f, F m, F a) { return f*m+a; }
SI F min(F a, F b) { return fminf(a,b); }
SI F max(F a, F b) { return fmaxf(a,b); }
SI F abs_ (F v) { return fabsf(v); }
SI F floor_(F v) { return floorf(v); }
SI F rcp (F v) { return 1.0f / v; }
SI F rsqrt (F v) { return 1.0f / sqrtf(v); }
SI U32 round (F v, F scale) { return (uint32_t)lrintf(v*scale); }
SI U16 pack(U32 v) { return (U16)v; }
SI U8 pack(U16 v) { return (U8)v; }
SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }
SI F gather(const float* p, U32 ix) { return p[ix]; }
#elif defined(__aarch64__)
#include <arm_neon.h>
// Since we know we're using Clang, we can use its vector extensions.
using F = float __attribute__((ext_vector_type(4)));
using I32 = int32_t __attribute__((ext_vector_type(4)));
using U32 = uint32_t __attribute__((ext_vector_type(4)));
using U16 = uint16_t __attribute__((ext_vector_type(4)));
using U8 = uint8_t __attribute__((ext_vector_type(4)));
// We polyfill a few routines that Clang doesn't build into ext_vector_types.
SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
SI F min(F a, F b) { return vminq_f32(a,b); }
SI F max(F a, F b) { return vmaxq_f32(a,b); }
SI F abs_ (F v) { return vabsq_f32(v); }
SI F floor_(F v) { return vrndmq_f32(v); }
SI F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
SI F rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
SI U32 round (F v, F scale) { return vcvtnq_u32_f32(v*scale); }
SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); }
SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
#elif defined(__arm__)
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
#error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
#endif
#include <arm_neon.h>
// We can pass {s0-s15} as arguments under AAPCS-VFP. We'll slice that as 8 d-registers.
using F = float __attribute__((ext_vector_type(2)));
using I32 = int32_t __attribute__((ext_vector_type(2)));
using U32 = uint32_t __attribute__((ext_vector_type(2)));
using U16 = uint16_t __attribute__((ext_vector_type(2)));
using U8 = uint8_t __attribute__((ext_vector_type(2)));
SI F mad(F f, F m, F a) { return vfma_f32(a,f,m); }
SI F min(F a, F b) { return vmin_f32(a,b); }
SI F max(F a, F b) { return vmax_f32(a,b); }
SI F abs_ (F v) { return vabs_f32(v); }
SI F rcp (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e ) * e; }
SI F rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
SI U32 round(F v, F scale) { return vcvt_u32_f32(mad(v,scale,0.5f)); }
SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); }
SI F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
SI F floor_(F v) {
F roundtrip = vcvt_f32_s32(vcvt_s32_f32(v));
return roundtrip - if_then_else(roundtrip > v, 1.0_f, 0);
}
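// (vcvt_s32_f32 truncates toward zero, so for negative non-integers the roundtrip value is
//  one too high, e.g. -1.5 -> -1.0; subtracting 1 in that case gives floor(), here -2.0.)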
SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
#elif defined(__AVX__)
#include <immintrin.h>
// These are __m256 and __m256i, but friendlier and strongly-typed.
using F = float __attribute__((ext_vector_type(8)));
using I32 = int32_t __attribute__((ext_vector_type(8)));
using U32 = uint32_t __attribute__((ext_vector_type(8)));
using U16 = uint16_t __attribute__((ext_vector_type(8)));
using U8 = uint8_t __attribute__((ext_vector_type(8)));
SI F mad(F f, F m, F a) {
#if defined(__FMA__)
return _mm256_fmadd_ps(f,m,a);
#else
return f*m+a;
#endif
}
SI F min(F a, F b) { return _mm256_min_ps(a,b); }
SI F max(F a, F b) { return _mm256_max_ps(a,b); }
SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); }
SI F floor_(F v) { return _mm256_floor_ps(v); }
SI F rcp (F v) { return _mm256_rcp_ps (v); }
SI F rsqrt (F v) { return _mm256_rsqrt_ps(v); }
SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
SI U16 pack(U32 v) {
return _mm_packus_epi32(_mm256_extractf128_si256(v, 0),
_mm256_extractf128_si256(v, 1));
}
SI U8 pack(U16 v) {
auto r = _mm_packus_epi16(v,v);
return unaligned_load<U8>(&r);
}
SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
SI F gather(const float* p, U32 ix) {
#if defined(__AVX2__)
return _mm256_i32gather_ps(p, ix, 4);
#else
return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
#endif
}
#elif defined(__SSE2__)
#include <immintrin.h>
using F = float __attribute__((ext_vector_type(4)));
using I32 = int32_t __attribute__((ext_vector_type(4)));
using U32 = uint32_t __attribute__((ext_vector_type(4)));
using U16 = uint16_t __attribute__((ext_vector_type(4)));
using U8 = uint8_t __attribute__((ext_vector_type(4)));
SI F mad(F f, F m, F a) { return f*m+a; }
SI F min(F a, F b) { return _mm_min_ps(a,b); }
SI F max(F a, F b) { return _mm_max_ps(a,b); }
SI F abs_(F v) { return _mm_and_ps(v, 0-v); }
SI F rcp (F v) { return _mm_rcp_ps (v); }
SI F rsqrt(F v) { return _mm_rsqrt_ps(v); }
SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }
SI U16 pack(U32 v) {
#if defined(__SSE4_1__)
auto p = _mm_packus_epi32(v,v);
#else
// Sign extend so that _mm_packs_epi32() does the pack we want.
auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16);
p = _mm_packs_epi32(p,p);
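// (The shift pair sign-extends the low 16 bits, so every value 0..0xffff lands in the signed
//  16-bit range and _mm_packs_epi32() preserves its bit pattern instead of saturating.)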
#endif
return unaligned_load<U16>(&p); // We have two copies. Return (the lower) one.
}
SI U8 pack(U16 v) {
__m128i r;
memcpy(&r, &v, sizeof(v));
r = _mm_packus_epi16(r,r);
return unaligned_load<U8>(&r);
}
SI F if_then_else(I32 c, F t, F e) {
return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
}
SI F floor_(F v) {
#if defined(__SSE4_1__)
return _mm_floor_ps(v);
#else
F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
return roundtrip - if_then_else(roundtrip > v, 1.0_f, 0);
#endif
}
SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
#endif
// We need to be careful with casts.
// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
// These named casts and bit_cast() are always what they seem to be.
#if defined(JUMPER)
SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); }
SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
#else
SI F cast (U32 v) { return (F)v; }
SI U32 expand(U16 v) { return (U32)v; }
SI U32 expand(U8 v) { return (U32)v; }
#endif
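// (Typical use, illustrative: 8-bit channel data can be normalized with
//      F v = cast(expand(bytes)) * C(1/255.0f);
//  which reads the same on every path because expand() and cast() are explicit.)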
#endif//SkJumper_vectors_DEFINED