add _hsw lowp backend

CQ_INCLUDE_TRYBOTS=skia.primary:Build-Ubuntu-Clang-x86_64-Debug-MSAN

Change-Id: Id53279c17589b3434629bb644358ee238af8649f
Reviewed-on: https://skia-review.googlesource.com/20269
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
Reviewed-by: Mike Reed <reed@google.com>
Author: Mike Klein <mtklein@chromium.org>
Date:   2017-06-19 14:37:10 -07:00
Committed-by: Skia Commit-Bot
Parent: 7f7b902d51
Commit: 8c3d5156c7

5 changed files with 3643 additions and 117 deletions

src/jumper/SkJumper.cpp

@@ -31,8 +31,7 @@ static const int kNumStages = SK_RASTER_PIPELINE_STAGES(M);
 #undef M
 
 #ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
-    #if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
-    #if 0
+    #if 0 && !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
         #include <atomic>
 
         #define M(st) #st,
@@ -57,7 +56,6 @@ static const int kNumStages = SK_RASTER_PIPELINE_STAGES(M);
         static void log_missing(SkRasterPipeline::StockStage) {}
     #endif
-    #endif
 #endif
 
 // We can't express the real types of most stage functions portably, so we use a stand-in.
 // We'll only ever call start_pipeline(), which then chains into the rest for us.
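The stand-in is just one uniform function-pointer type, so stages from every backend can share a table. A minimal sketch of the idea (signatures assumed from this file's usage, not verbatim Skia code):

    // Any single signature will do for storage; stages are only ever entered by
    // chaining from start_pipeline(), never called directly through this type.
    using StageFn = void(void);
    // start_pipeline() is the one signature actually called (cf. the lowp file below):
    using StartPipelineFn = size_t(size_t x, size_t y, size_t limit,
                                   void** program, const void* constants);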
@@ -128,12 +126,14 @@ extern "C" {
                     ASM(start_pipeline,avx  ),
                     ASM(start_pipeline,sse41),
                     ASM(start_pipeline,sse2 ),
+                    ASM(start_pipeline,hsw_lowp  ),
                     ASM(start_pipeline,ssse3_lowp);
 
     StageFn ASM(just_return,hsw),
             ASM(just_return,avx),
             ASM(just_return,sse41),
             ASM(just_return,sse2),
+            ASM(just_return,hsw_lowp  ),
             ASM(just_return,ssse3_lowp);
 
     #define M(st) StageFn ASM(st,hsw);
@@ -149,6 +149,9 @@ extern "C" {
         SK_RASTER_PIPELINE_STAGES(M)
     #undef M
+    #define M(st) StageFn ASM(st,hsw_lowp);
+        LOWP_STAGES(M)
+    #undef M
     #define M(st) StageFn ASM(st,ssse3_lowp);
         LOWP_STAGES(M)
     #undef M
@@ -162,6 +165,24 @@ extern "C" {
     #undef M
 }
 
+#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
+    template <SkRasterPipeline::StockStage st>
+    static constexpr StageFn* hsw_lowp() { return nullptr; }
+
+    template <SkRasterPipeline::StockStage st>
+    static constexpr StageFn* ssse3_lowp() { return nullptr; }
+
+    #define M(st)                                                              \
+        template <> constexpr StageFn* hsw_lowp<SkRasterPipeline::st>() {      \
+            return ASM(st,hsw_lowp);                                           \
+        }                                                                      \
+        template <> constexpr StageFn* ssse3_lowp<SkRasterPipeline::st>() {    \
+            return ASM(st,ssse3_lowp);                                         \
+        }
+    LOWP_STAGES(M)
+    #undef M
+#endif
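With those specializations in place, every table slot can be filled by one uniform expression that resolves at compile time to either the assembled lowp stage or nullptr. A hypothetical use (stage name purely illustrative):

    // Stages listed in LOWP_STAGES pick up the specialization and yield the
    // assembled symbol; everything else hits the primary template's nullptr,
    // which later tells build_pipeline() "no lowp version, fall back to float".
    StageFn* fn = hsw_lowp<SkRasterPipeline::load_8888>();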
 
 // Engines comprise everything we need to run SkRasterPipelines.
 struct SkJumper_Engine {
     StageFn* stages[kNumStages];
@@ -239,41 +260,70 @@ static SkJumper_Engine choose_engine() {
     return kPortable;
 }
 
+#ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
+    static const SkJumper_Engine kNone = {
+    #define M(stage) nullptr,
+        { SK_RASTER_PIPELINE_STAGES(M) },
+    #undef M
+        nullptr,
+        nullptr,
+    };
+    static SkJumper_Engine gLowp = kNone;
+    static SkOnce gChooseLowpOnce;
+
+    static SkJumper_Engine choose_lowp() {
+    #if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
+        if (1 && SkCpu::Supports(SkCpu::HSW)) {
+            return {
+            #define M(st) hsw_lowp<SkRasterPipeline::st>(),
+                { SK_RASTER_PIPELINE_STAGES(M) },
+                ASM(start_pipeline,hsw_lowp),
+                ASM(just_return,hsw_lowp)
+            #undef M
+            };
+        }
+        if (1 && SkCpu::Supports(SkCpu::SSSE3)) {
+            return {
+            #define M(st) ssse3_lowp<SkRasterPipeline::st>(),
+                { SK_RASTER_PIPELINE_STAGES(M) },
+                ASM(start_pipeline,ssse3_lowp),
+                ASM(just_return,ssse3_lowp)
+            #undef M
+            };
+        }
+    #endif
+        return kNone;
+    }
+#endif
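If neither HSW nor SSSE3 is available, choose_lowp() hands back kNone, whose all-nullptr stage table makes the lowp attempt in build_pipeline() below fail on its first stage and fall through to the float engine. The choice runs once and is cached in gLowp; a shape-equivalent sketch using std::once_flag in place of SkOnce (illustrative only, relying on the declarations above):

    #include <mutex>
    static std::once_flag gLowpOnceFlag;   // stands in for SkOnce
    static void ensure_lowp_chosen() {
        std::call_once(gLowpOnceFlag, []{ gLowp = choose_lowp(); });
    }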
 StartPipelineFn* SkRasterPipeline::build_pipeline(void** ip) const {
 #ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
-#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
-    if (SkCpu::Supports(SkCpu::SSSE3)) {
-        void** reset_point = ip;
-        *--ip = (void*)ASM(just_return,ssse3_lowp);
-        for (const StageList* st = fStages; st; st = st->prev) {
-            StageFn* fn = nullptr;
-            switch (st->stage) {
-            #define M(st) case SkRasterPipeline::st: fn = ASM(st, ssse3_lowp); break;
-                LOWP_STAGES(M)
-            #undef M
-                case SkRasterPipeline::clamp_0: continue;  // clamp_0 is a no-op in lowp.
-                default:
-                    log_missing(st->stage);
-                    ip = reset_point;
-            }
-            if (ip == reset_point) {
-                break;
-            }
+    gChooseLowpOnce([]{ gLowp = choose_lowp(); });
+
+    // First try to build a lowp pipeline.  If that fails, fall back to normal float gEngine.
+    void** reset_point = ip;
+    *--ip = (void*)gLowp.just_return;
+    for (const StageList* st = fStages; st; st = st->prev) {
+        if (st->stage == SkRasterPipeline::clamp_0) {
+            continue;  // No-op in lowp.
+        }
+        if (StageFn* fn = gLowp.stages[st->stage]) {
             if (st->ctx) {
                 *--ip = st->ctx;
             }
             *--ip = (void*)fn;
-        }
-        if (ip != reset_point) {
-            return ASM(start_pipeline,ssse3_lowp);
+        } else {
+            log_missing(st->stage);
+            ip = reset_point;
+            break;
         }
     }
+    if (ip != reset_point) {
+        return gLowp.start_pipeline;
+    }
-#endif
 #endif
 
     gChooseEngineOnce([]{ gEngine = choose_engine(); });
 
     // We're building the pipeline backwards, so we start with the final stage just_return.
     *--ip = (void*)gEngine.just_return;
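A self-contained model of that backwards build, with made-up stage names: writing through *--ip starting from just_return leaves the program in forward execution order in memory, each stage function followed by its context pointer.

    #include <cstdio>
    int main() {
        const void* slots[5];
        const void** ip = slots + 5;
        *--ip = "just_return";               // the final stage is pushed first
        *--ip = "ctx_b"; *--ip = "stage_b";  // stages are walked last-to-first,
        *--ip = "ctx_a"; *--ip = "stage_a";  // ctx pushed before its stage fn
        for (const void** p = ip; p < slots + 5; p++) {
            printf("%s\n", (const char*)*p); // stage_a ctx_a stage_b ctx_b just_return
        }
        return 0;
    }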

src/jumper/SkJumper_generated.S (diff suppressed: too large to display)

src/jumper/SkJumper_generated_win.S (diff suppressed: too large to display)

src/jumper/SkJumper_stages_lowp.cpp

@@ -9,16 +9,22 @@
 #include "SkJumper_misc.h"
 #include <immintrin.h>
 
-#if !defined(__SSSE3__) || !defined(__clang__) || !defined(__x86_64__)
-    #error "We're starting with just SSSE3 x86-64 for now, and will always require Clang."
+#if !defined(__clang__) || !defined(__x86_64__)
+    #error "We're starting with just x86-64 for now, and will always require Clang."
 #endif
 
-#define WRAP(name) sk_##name##_ssse3_lowp
-
 using K = const SkJumper_constants;
-static const size_t kStride = 8;
-
-template <typename T> using V = T __attribute__((ext_vector_type(8)));
+
+#if defined(__AVX2__)
+    #define WRAP(name) sk_##name##_hsw_lowp
+    template <typename T> using V = T __attribute__((ext_vector_type(16)));
+    static const size_t kStride = 16;
+#else
+    #define WRAP(name) sk_##name##_ssse3_lowp
+    template <typename T> using V = T __attribute__((ext_vector_type(8)));
+    static const size_t kStride = 8;
+#endif
 
 using U8  = V<uint8_t>;
 using U16 = V<uint16_t>;
 using U32 = V<uint32_t>;
@@ -40,7 +46,14 @@ struct F {
 SI F operator+(F x, F y) { return x.vec + y.vec; }
 SI F operator-(F x, F y) { return x.vec - y.vec; }
-SI F operator*(F x, F y) { return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec)); }
+SI F operator*(F x, F y) {
+#if defined(__AVX2__)
+    return _mm256_abs_epi16(_mm256_mulhrs_epi16(x.vec, y.vec));
+#else
+    return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec));
+#endif
+}
 SI F mad(F f, F m, F a) { return f*m+a; }
 SI F inv(F v) { return 1.0f - v; }
 SI F two(F v) { return v + v; }
@@ -51,6 +64,11 @@ SI F operator>>(F x, int bits) { return x.vec >> bits; }
 using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);
 
+#if defined(__AVX__)
+    // We really want to make sure all paths go through this function's (implicit) vzeroupper.
+    // If they don't, we'll experience severe slowdowns when we first use SSE instructions again.
+    __attribute__((disable_tail_calls))
+#endif
 MAYBE_MSABI
 extern "C" size_t WRAP(start_pipeline)(size_t x, size_t y, size_t limit, void** program, K* k) {
     F v{};
@@ -88,13 +106,21 @@ SI V load(const T* src, size_t tail) {
     if (__builtin_expect(tail, 0)) {
         V v{};  // Any inactive lanes are zeroed.
         switch (tail) {
-            case 7: v[6] = src[6];
-            case 6: v[5] = src[5];
-            case 5: v[4] = src[4];
-            case 4: memcpy(&v, src, 4*sizeof(T)); break;
-            case 3: v[2] = src[2];
-            case 2: memcpy(&v, src, 2*sizeof(T)); break;
-            case 1: memcpy(&v, src, 1*sizeof(T)); break;
+            case 15: v[14] = src[14];
+            case 14: v[13] = src[13];
+            case 13: v[12] = src[12];
+            case 12: memcpy(&v, src, 12*sizeof(T)); break;
+            case 11: v[10] = src[10];
+            case 10: v[ 9] = src[ 9];
+            case  9: v[ 8] = src[ 8];
+            case  8: memcpy(&v, src,  8*sizeof(T)); break;
+            case  7: v[ 6] = src[ 6];
+            case  6: v[ 5] = src[ 5];
+            case  5: v[ 4] = src[ 4];
+            case  4: memcpy(&v, src,  4*sizeof(T)); break;
+            case  3: v[ 2] = src[ 2];
+            case  2: memcpy(&v, src,  2*sizeof(T)); break;
+            case  1: memcpy(&v, src,  1*sizeof(T)); break;
         }
         return v;
     }
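A worked trace of the fall-through, assuming kStride == 8 and tail == 7: control enters at case 7 and falls through cases 6 and 5, copying lanes 6, 5, and 4 one element at a time, then case 4's memcpy grabs lanes 0..3 in one shot and breaks. store() below uses the same pattern in the other direction. In scalar form:

    // Hypothetical flattening of load(src, 7) for an 8-wide vector:
    //   v[6] = src[6];                   // case 7
    //   v[5] = src[5];                   // case 6 (fell through)
    //   v[4] = src[4];                   // case 5 (fell through)
    //   memcpy(&v, src, 4*sizeof(T));    // case 4: lanes 0..3, then break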
@@ -106,25 +132,39 @@ SI void store(T* dst, V v, size_t tail) {
     __builtin_assume(tail < kStride);
     if (__builtin_expect(tail, 0)) {
         switch (tail) {
-            case 7: dst[6] = v[6];
-            case 6: dst[5] = v[5];
-            case 5: dst[4] = v[4];
-            case 4: memcpy(dst, &v, 4*sizeof(T)); break;
-            case 3: dst[2] = v[2];
-            case 2: memcpy(dst, &v, 2*sizeof(T)); break;
-            case 1: memcpy(dst, &v, 1*sizeof(T)); break;
+            case 15: dst[14] = v[14];
+            case 14: dst[13] = v[13];
+            case 13: dst[12] = v[12];
+            case 12: memcpy(dst, &v, 12*sizeof(T)); break;
+            case 11: dst[10] = v[10];
+            case 10: dst[ 9] = v[ 9];
+            case  9: dst[ 8] = v[ 8];
+            case  8: memcpy(dst, &v,  8*sizeof(T)); break;
+            case  7: dst[ 6] = v[ 6];
+            case  6: dst[ 5] = v[ 5];
+            case  5: dst[ 4] = v[ 4];
+            case  4: memcpy(dst, &v,  4*sizeof(T)); break;
+            case  3: dst[ 2] = v[ 2];
+            case  2: memcpy(dst, &v,  2*sizeof(T)); break;
+            case  1: memcpy(dst, &v,  1*sizeof(T)); break;
         }
         return;
     }
     unaligned_store(dst, v);
 }
 
+// TODO: mask loads and stores with AVX2
+
 // Scale from [0,255] up to [0,32768].
 SI F from_wide_byte(U16 bytes) {
     // Ideally we'd scale by 32768/255 = 128.50196, but instead we'll approximate
     // that a little more cheaply as 256*32897/65536 = 128.50391.
     // 0 and 255 map to 0 and 32768 correctly, and nothing else is off by more than 1 bit.
-    return _mm_mulhi_epu16(bytes << 8, U16(32897));
+#if defined(__AVX2__)
+    return _mm256_mulhi_epu16(bytes << 8, U16(32897));
+#else
+    return _mm_mulhi_epu16(bytes << 8, U16(32897));
+#endif
 }
 
 SI F from_byte(U8 bytes) {
     return from_wide_byte(__builtin_convertvector(bytes, U16));
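A quick scalar check of that approximation claim, runnable on its own: scale_byte() models the mulhi trick (the high 16 bits of the 32-bit product), and the loop confirms every byte lands within 1 of the ideal 32768/255 scale.

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    static uint16_t scale_byte(uint8_t b) {
        return uint16_t(((uint32_t(b) << 8) * 32897u) >> 16);  // mulhi(b<<8, 32897)
    }
    int main() {
        assert(scale_byte(0)   == 0);      // the endpoints are exact...
        assert(scale_byte(255) == 32768);
        for (int b = 0; b < 256; b++) {    // ...and everything else is close
            double ideal = b * 32768.0 / 255.0;
            assert(std::fabs(scale_byte(uint8_t(b)) - ideal) < 1.0);
        }
        return 0;
    }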
@@ -133,13 +173,22 @@ SI F from_byte(U8 bytes) {
 
 // Pack from [0,32768] down to [0,255].
 SI U16 to_wide_byte(F v) {
     // The simplest thing works great: divide by 128 and saturate.
-    return _mm_min_epi16(v>>7, U16(255));
+#if defined(__AVX2__)
+    return _mm256_min_epi16(v >> 7, U16(255));
+#else
+    return _mm_min_epi16(v >> 7, U16(255));
+#endif
 }
 
 SI U8 to_byte(F v) {
     // Like to_wide_byte(), but we'll bake the saturation into the 16->8 bit pack.
+#if defined(__AVX2__)
+    return _mm_packus_epi16(_mm256_extracti128_si256(v >> 7, 0),
+                            _mm256_extracti128_si256(v >> 7, 1));
+#else
     // Only the bottom 8 bytes are of interest... it doesn't matter what we pack on top.
-    __m128i packed = _mm_packus_epi16(v>>7, v>>7);
+    __m128i packed = _mm_packus_epi16(v >> 7, v >> 7);
     return unaligned_load<U8>(&packed);
+#endif
 }
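Why the AVX2 branch extracts both 128-bit halves rather than using a single _mm256_packus_epi16: the 256-bit pack operates within each 128-bit lane, so its byte output comes out interleaved; packing the two extracted halves with the SSE-width _mm_packus_epi16 keeps all 16 bytes contiguous and in order. (This is a reading of the intrinsics' semantics, not something stated in the commit.)

    // Illustration: _mm256_packus_epi16(v, v) on 16 u16 lanes orders bytes as
    //   0..7, 0..7, 8..15, 8..15   (per-lane),
    // while _mm_packus_epi16(lo_half, hi_half) yields the desired 0..15.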
 SI void from_8888(U32 rgba, F* r, F* g, F* b, F* a) {

src/jumper/build_stages.py

@@ -60,6 +60,12 @@ subprocess.check_call(clang + cflags + hsw +
 subprocess.check_call(clang + cflags + hsw + win +
                       ['-c', 'src/jumper/SkJumper_stages.cpp'] +
                       ['-o', 'win_hsw.o'])
+subprocess.check_call(clang + cflags + hsw +
+                      ['-c', 'src/jumper/SkJumper_stages_lowp.cpp'] +
+                      ['-o', 'lowp_hsw.o'])
+subprocess.check_call(clang + cflags + hsw + win +
+                      ['-c', 'src/jumper/SkJumper_stages_lowp.cpp'] +
+                      ['-o', 'win_lowp_hsw.o'])
 
 aarch64 = [ '--target=aarch64' ]
 subprocess.check_call(clang + cflags + aarch64 +
subprocess.check_call(clang + cflags + aarch64 +
@@ -196,6 +202,8 @@ parse_object_file('sse41.o', '.byte')
 print 'BALIGN32'
 parse_object_file('sse2.o', '.byte')
 print 'BALIGN32'
+parse_object_file('lowp_hsw.o', '.byte')
+print 'BALIGN32'
 parse_object_file('lowp_ssse3.o', '.byte')
 print '#endif'
@@ -221,6 +229,8 @@ parse_object_file('win_sse41.o', 'DB')
 print 'ALIGN 32'
 parse_object_file('win_sse2.o', 'DB')
 print 'ALIGN 32'
+parse_object_file('win_lowp_hsw.o', 'DB')
+print 'ALIGN 32'
 parse_object_file('win_lowp_ssse3.o', 'DB')
 print 'ENDIF'
 print 'END'