SkJumper: handle the <kStride tail in AVX+ mode.

We have plenty of general-purpose registers to spare on x86-64,
so the cheapest thing to do is use one to hold the usual 'tail'.

Speedups on HSW:
    SkRasterPipeline_srgb: 292 -> 170
    SkRasterPipeline_f16:  122 ->  90

There's plenty more room to improve here, e.g. using mask loads and
stores (sketched below), but this seems to be enough to get things
working reasonably.

BUG=skia:6289

Change-Id: I8c0ed325391822e9f36636500350205e93942111
Reviewed-on: https://skia-review.googlesource.com/9110
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Mike Klein 2017-03-01 13:07:40 -05:00
parent f7cf81aefd
commit c31858bcba
4 changed files with 1599 additions and 474 deletions
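The mask-load/store follow-up mentioned in the commit message could look roughly like this for the 8-wide AVX float lanes (a minimal standalone sketch, not code from this commit; tail_mask, load_tail, and store_tail are illustrative names):

    #include <immintrin.h>
    #include <cstddef>
    #include <cstdint>

    // Lane mask with the low `tail` of 8 lanes active; valid for tail in 1..7.
    static inline __m256i tail_mask(size_t tail) {
        static const int32_t k[16] = { -1,-1,-1,-1,-1,-1,-1,-1,
                                        0, 0, 0, 0, 0, 0, 0, 0 };
        return _mm256_loadu_si256((const __m256i*)(k + 8 - tail));
    }

    static inline __m256 load_tail(const float* src, size_t tail) {
        return _mm256_maskload_ps(src, tail_mask(tail));   // masked-off lanes read as zero
    }

    static inline void store_tail(float* dst, __m256 v, size_t tail) {
        _mm256_maskstore_ps(dst, tail_mask(tail), v);      // masked-off lanes left untouched
    }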


@@ -249,11 +249,11 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
SkAutoSTMalloc<64, void*> program(2*fStages.size() + 1);
const size_t limit = x+n;
auto build_and_run = [&](size_t stride,
auto build_and_run = [&](size_t min_stride,
StageFn* (*lookup)(SkRasterPipeline::StockStage),
StageFn* just_return,
size_t (*start_pipeline)(size_t, void**, K*, size_t)) {
if (x + stride <= limit) {
if (x + min_stride <= limit) {
void** ip = program.get();
for (auto&& st : fStages) {
auto fn = lookup(st.stage);
@@ -288,12 +288,12 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
#elif defined(__x86_64__) || defined(_M_X64)
if (1 && SkCpu::Supports(SkCpu::HSW)) {
if (!build_and_run(8, lookup_hsw, ASM(just_return,hsw), ASM(start_pipeline,hsw))) {
if (!build_and_run(1, lookup_hsw, ASM(just_return,hsw), ASM(start_pipeline,hsw))) {
return false;
}
}
if (1 && SkCpu::Supports(SkCpu::AVX)) {
if (!build_and_run(8, lookup_avx, ASM(just_return,avx), ASM(start_pipeline,avx))) {
if (!build_and_run(1, lookup_avx, ASM(just_return,avx), ASM(start_pipeline,avx))) {
return false;
}
}
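// Note the min_stride change above from 8 to 1: with the tail now handled
// inside the pipelines, the HSW and AVX paths can take the job even when
// fewer than 8 pixels remain.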

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -222,6 +222,8 @@ static Dst bit_cast(const Src& src) {
#endif
#endif
static const size_t kStride = sizeof(F) / sizeof(float);
// We need to be careful with casts.
// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
// These named casts and bit_cast() are always what they seem to be.
@@ -235,6 +237,52 @@ static Dst bit_cast(const Src& src) {
static U32 expand(U8 v) { return (U32)v; }
#endif
template <typename V, typename T>
static inline V load(const T* src, size_t tail) {
#if defined(JUMPER)
if (__builtin_expect(tail, 0)) {
V v{}; // Any inactive lanes are zeroed.
#pragma nounroll
for (size_t i = 0; i < tail; i++) {
v[i] = src[i];
}
return v;
}
#endif
return unaligned_load<V>(src);
}
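// Lanes at index >= tail come back zeroed, so math on the full vector stays
// well-defined, e.g. (hypothetically) load<U32>(ptr, 3) fills lanes 0..2
// from ptr and leaves the remaining lanes as zero.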
#if 1 && defined(JUMPER) && defined(__AVX__)
template <>
inline U8 load(const uint8_t* src, size_t tail) {
if (__builtin_expect(tail, 0)) {
uint64_t v = 0;
size_t shift = 0;
#pragma nounroll
while (tail --> 0) {
v |= (uint64_t)*src++ << shift;
shift += 8;
}
return unaligned_load<U8>(&v);
}
return unaligned_load<U8>(src);
}
#endif
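// Worked example: tail == 3 with bytes {a,b,c} builds v = c<<16 | b<<8 | a,
// so on little-endian x86 the low three lanes of the returned U8 are a,b,c
// and the remaining lanes are zero.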
template <typename V, typename T>
static inline void store(T* dst, V v, size_t tail) {
#if defined(JUMPER)
if (__builtin_expect(tail, 0)) {
#pragma nounroll
for (size_t i = 0; i < tail; i++) {
dst[i] = v[i];
}
return;
}
#endif
memcpy(dst, &v, sizeof(v));
}
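// In the tail path nothing past dst[tail-1] is written, so a partial store
// can't stomp pixels beyond the ones this run owns.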
static F lerp(F from, F to, F t) {
return mad(to-from, t, from);
@@ -257,10 +305,6 @@ static void from_565(U16 _565, F* r, F* g, F* b, K* k) {
};
#endif
// Stages tail call between each other by following program,
// an interlaced sequence of Stage pointers and context pointers.
using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);
static void* load_and_inc(void**& program) {
#if defined(__GNUC__) && defined(__x86_64__)
// Passing program as the second Stage argument makes it likely that it's in %rsi,
@@ -288,34 +332,74 @@ static void* load_and_inc(void**& program) {
#endif
}
#define STAGE(name) \
static void name##_k(size_t& x, void* ctx, K* k, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
extern "C" void WRAP(name)(size_t x, void** program, K* k, \
F r, F g, F b, F a, F dr, F dg, F db, F da) { \
auto ctx = load_and_inc(program); \
name##_k(x,ctx,k, r,g,b,a, dr,dg,db,da); \
auto next = (Stage*)load_and_inc(program); \
next(x,program,k, r,g,b,a, dr,dg,db,da); \
} \
static void name##_k(size_t& x, void* ctx, K* k, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#if defined(JUMPER) && defined(__AVX__)
// There's a big cost to switch between SSE and AVX+, so we do a little
// extra work to handle even the jagged <kStride tail in AVX+ mode.
using Stage = void(size_t x, void** program, K* k, size_t tail, F,F,F,F, F,F,F,F);
// Some glue stages that don't fit the normal pattern of stages.
#if defined(JUMPER) && defined(WIN)
__attribute__((ms_abi))
#endif
extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
F v{};
size_t stride = sizeof(F) / sizeof(float);
auto start = (Stage*)load_and_inc(program);
while (x + stride <= limit) {
start(x,program,k, v,v,v,v, v,v,v,v);
x += stride;
#if defined(JUMPER) && defined(WIN)
__attribute__((ms_abi))
#endif
extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
F v{};
auto start = (Stage*)load_and_inc(program);
while (x + kStride <= limit) {
start(x,program,k,0, v,v,v,v, v,v,v,v);
x += kStride;
}
if (size_t tail = limit - x) {
start(x,program,k,tail, v,v,v,v, v,v,v,v);
}
return limit;
}
return x;
}
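// Worked example: x == 0, limit == 20, kStride == 8 runs two full passes at
// x = 0 and x = 8, then one masked pass with x = 16 and tail = 4; tail is
// always in 1..kStride-1 when that final start() fires.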
#define STAGE(name) \
static void name##_k(size_t x, void* ctx, K* k, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
extern "C" void WRAP(name)(size_t x, void** program, K* k, size_t tail, \
F r, F g, F b, F a, F dr, F dg, F db, F da) { \
auto ctx = load_and_inc(program); \
name##_k(x,ctx,k,tail, r,g,b,a, dr,dg,db,da); \
auto next = (Stage*)load_and_inc(program); \
next(x,program,k,tail, r,g,b,a, dr,dg,db,da); \
} \
static void name##_k(size_t x, void* ctx, K* k, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#else
// Other instruction sets (SSE, NEON, portable) can fall back on narrower
// pipelines cheaply, which frees us to always assume tail==0.
// Stages tail call between each other by following program,
// an interlaced sequence of Stage pointers and context pointers.
using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);
#if defined(JUMPER) && defined(WIN)
__attribute__((ms_abi))
#endif
extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
F v{};
auto start = (Stage*)load_and_inc(program);
while (x + kStride <= limit) {
start(x,program,k, v,v,v,v, v,v,v,v);
x += kStride;
}
return x;
}
#define STAGE(name) \
static void name##_k(size_t x, void* ctx, K* k, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
extern "C" void WRAP(name)(size_t x, void** program, K* k, \
F r, F g, F b, F a, F dr, F dg, F db, F da) { \
auto ctx = load_and_inc(program); \
name##_k(x,ctx,k,0, r,g,b,a, dr,dg,db,da); \
auto next = (Stage*)load_and_inc(program); \
next(x,program,k, r,g,b,a, dr,dg,db,da); \
} \
static void name##_k(size_t x, void* ctx, K* k, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#endif
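// Net effect of the two branches: the AVX+ Stage ABI threads `tail` through
// every call, while the narrower ABIs keep their old signature and hand the
// shared *_k bodies a constant tail of 0, which the compiler can fold away.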
// Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
@@ -488,7 +572,7 @@ STAGE(scale_1_float) {
STAGE(scale_u8) {
auto ptr = *(const uint8_t**)ctx + x;
auto scales = unaligned_load<U8>(ptr);
auto scales = load<U8>(ptr, tail);
auto c = cast(expand(scales)) * k->_1_255;
r = r * c;
@@ -508,7 +592,7 @@ STAGE(lerp_1_float) {
STAGE(lerp_u8) {
auto ptr = *(const uint8_t**)ctx + x;
auto scales = unaligned_load<U8>(ptr);
auto scales = load<U8>(ptr, tail);
auto c = cast(expand(scales)) * k->_1_255;
r = lerp(dr, r, c);
@@ -520,7 +604,7 @@ STAGE(lerp_565) {
auto ptr = *(const uint16_t**)ctx + x;
F cr,cg,cb;
from_565(unaligned_load<U16>(ptr), &cr, &cg, &cb, k);
from_565(load<U16>(ptr, tail), &cr, &cg, &cb, k);
r = lerp(dr, r, cr);
g = lerp(dg, g, cg);
@@ -535,7 +619,7 @@ STAGE(load_tables) {
};
auto c = (const Ctx*)ctx;
auto px = unaligned_load<U32>(c->src + x);
auto px = load<U32>(c->src + x, tail);
r = gather(c->r, (px ) & k->_0x000000ff);
g = gather(c->g, (px >> 8) & k->_0x000000ff);
b = gather(c->b, (px >> 16) & k->_0x000000ff);
@@ -546,19 +630,19 @@ STAGE(load_a8) {
auto ptr = *(const uint8_t**)ctx + x;
r = g = b = 0.0f;
a = cast(expand(unaligned_load<U8>(ptr))) * k->_1_255;
a = cast(expand(load<U8>(ptr, tail))) * k->_1_255;
}
STAGE(store_a8) {
auto ptr = *(uint8_t**)ctx + x;
U8 packed = pack(pack(round(a, k->_255)));
memcpy(ptr, &packed, sizeof(packed));
store(ptr, packed, tail);
}
STAGE(load_565) {
auto ptr = *(const uint16_t**)ctx + x;
from_565(unaligned_load<U16>(ptr), &r,&g,&b, k);
from_565(load<U16>(ptr, tail), &r,&g,&b, k);
a = k->_1;
}
STAGE(store_565) {
@@ -567,13 +651,13 @@ STAGE(store_565) {
U16 px = pack( round(r, k->_31) << 11
| round(g, k->_63) << 5
| round(b, k->_31) );
memcpy(ptr, &px, sizeof(px));
store(ptr, px, tail);
}
STAGE(load_8888) {
auto ptr = *(const uint32_t**)ctx + x;
auto px = unaligned_load<U32>(ptr);
auto px = load<U32>(ptr, tail);
r = cast((px ) & k->_0x000000ff) * k->_1_255;
g = cast((px >> 8) & k->_0x000000ff) * k->_1_255;
b = cast((px >> 16) & k->_0x000000ff) * k->_1_255;
@@ -587,7 +671,7 @@ STAGE(store_8888) {
| round(g, k->_255) << 8
| round(b, k->_255) << 16
| round(a, k->_255) << 24;
memcpy(ptr, &px, sizeof(px));
store(ptr, px, tail);
}
STAGE(load_f16) {
@@ -619,10 +703,23 @@ STAGE(load_f16) {
b = {rb[1], rb[3]};
a = {ga[1], ga[3]};
#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
_45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
_67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
__m128i _01, _23, _45, _67;
if (__builtin_expect(tail,0)) {
auto src = (const double*)ptr;
_01 = _23 = _45 = _67 = _mm_setzero_si128();
if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
} else {
_01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
_45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
_67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
}
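// Each f16 pixel is 4*16 = 64 bits, exactly one double, so _mm_loadl_pd and
// _mm_loadh_pd each move one whole pixel into the low or high half of an XMM
// register; tail tops out at 7 here, which is why _67 only ever needs its
// low half filled.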
auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
_13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3
@@ -639,10 +736,23 @@ STAGE(load_f16) {
b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
#elif defined(__AVX__)
auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
_45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
_67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
__m128i _01, _23, _45, _67;
if (__builtin_expect(tail,0)) {
auto src = (const double*)ptr;
_01 = _23 = _45 = _67 = _mm_setzero_si128();
if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
} else {
_01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
_45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
_67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
}
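// Same one-double-per-pixel tail trick as the AVX2 path above.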
auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
_13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3
@@ -750,10 +860,26 @@ STAGE(store_f16) {
ba0123 = _mm_unpacklo_epi16(B, A),
ba4567 = _mm_unpackhi_epi16(B, A);
_mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
_mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
_mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
_mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
_23 = _mm_unpackhi_epi32(rg0123, ba0123),
_45 = _mm_unpacklo_epi32(rg4567, ba4567),
_67 = _mm_unpackhi_epi32(rg4567, ba4567);
if (__builtin_expect(tail,0)) {
auto dst = (double*)ptr;
if (tail > 0) { _mm_storel_pd(dst+0, _01); }
if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
if (tail > 2) { _mm_storel_pd(dst+2, _23); }
if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
if (tail > 4) { _mm_storel_pd(dst+4, _45); }
if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
if (tail > 6) { _mm_storel_pd(dst+6, _67); }
} else {
_mm_storeu_si128((__m128i*)ptr + 0, _01);
_mm_storeu_si128((__m128i*)ptr + 1, _23);
_mm_storeu_si128((__m128i*)ptr + 2, _45);
_mm_storeu_si128((__m128i*)ptr + 3, _67);
}
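// Mirror of the load_f16 tail path: one _mm_storel_pd/_mm_storeh_pd per
// pixel, so at most `tail` pixels are written and the rest of the row stays
// untouched.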
#elif defined(__AVX__)
auto float_to_half = [&](F f) {
return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000))) // Fix up the exponent,
@@ -775,10 +901,27 @@ STAGE(store_f16) {
rg4567 = r4567 | _mm_slli_si128(g4567,2),
ba0123 = b0123 | _mm_slli_si128(a0123,2),
ba4567 = b4567 | _mm_slli_si128(a4567,2);
_mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
_mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
_mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
_mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
_23 = _mm_unpackhi_epi32(rg0123, ba0123),
_45 = _mm_unpacklo_epi32(rg4567, ba4567),
_67 = _mm_unpackhi_epi32(rg4567, ba4567);
if (__builtin_expect(tail,0)) {
auto dst = (double*)ptr;
if (tail > 0) { _mm_storel_pd(dst+0, _01); }
if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
if (tail > 2) { _mm_storel_pd(dst+2, _23); }
if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
if (tail > 4) { _mm_storel_pd(dst+4, _45); }
if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
if (tail > 6) { _mm_storel_pd(dst+6, _67); }
} else {
_mm_storeu_si128((__m128i*)ptr + 0, _01);
_mm_storeu_si128((__m128i*)ptr + 1, _23);
_mm_storeu_si128((__m128i*)ptr + 2, _45);
_mm_storeu_si128((__m128i*)ptr + 3, _67);
}
#elif defined(__SSE2__)
auto float_to_half = [&](F f) {
return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000))) // Fix up the exponent,