add _hsw lowp backend
CQ_INCLUDE_TRYBOTS=skia.primary:Build-Ubuntu-Clang-x86_64-Debug-MSAN
Change-Id: Id53279c17589b3434629bb644358ee238af8649f
Reviewed-on: https://skia-review.googlesource.com/20269
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
Reviewed-by: Mike Reed <reed@google.com>
parent 7f7b902d51 · commit 8c3d5156c7
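In broad strokes, this change teaches SkJumper to pick between two 8-bit ("lowp") backends at runtime: a new AVX2 variant for Haswell and newer, and the existing SSSE3 variant, falling back to the float pipeline when neither applies. A minimal sketch of that selection order, assuming only that SkCpu::Supports() reports the relevant feature bits; the enum below is an illustrative stand-in, not the real SkJumper_Engine type:

#include "SkCpu.h"   // assumed Skia header for SkCpu::Supports()

enum class LowpBackend { kNone, kSSSE3, kHSW };   // illustrative stand-in

static LowpBackend choose_lowp_backend() {
    if (SkCpu::Supports(SkCpu::HSW))   { return LowpBackend::kHSW;   }  // AVX2 path added by this CL
    if (SkCpu::Supports(SkCpu::SSSE3)) { return LowpBackend::kSSSE3; }  // pre-existing path
    return LowpBackend::kNone;                                          // use the float engine instead
}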
@@ -31,8 +31,7 @@ static const int kNumStages = SK_RASTER_PIPELINE_STAGES(M);
#undef M

#ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
#if 0
#if 0 && !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
#include <atomic>

#define M(st) #st,
@@ -57,7 +56,6 @@ static const int kNumStages = SK_RASTER_PIPELINE_STAGES(M);
static void log_missing(SkRasterPipeline::StockStage) {}
#endif
#endif
#endif

// We can't express the real types of most stage functions portably, so we use a stand-in.
// We'll only ever call start_pipeline(), which then chains into the rest for us.
@@ -128,12 +126,14 @@ extern "C" {
ASM(start_pipeline,avx ),
ASM(start_pipeline,sse41 ),
ASM(start_pipeline,sse2 ),
ASM(start_pipeline,hsw_lowp ),
ASM(start_pipeline,ssse3_lowp);

StageFn ASM(just_return,hsw),
ASM(just_return,avx),
ASM(just_return,sse41),
ASM(just_return,sse2),
ASM(just_return,hsw_lowp ),
ASM(just_return,ssse3_lowp);

#define M(st) StageFn ASM(st,hsw);
@@ -149,6 +149,9 @@ extern "C" {
SK_RASTER_PIPELINE_STAGES(M)
#undef M

#define M(st) StageFn ASM(st,hsw_lowp);
LOWP_STAGES(M)
#undef M
#define M(st) StageFn ASM(st,ssse3_lowp);
LOWP_STAGES(M)
#undef M
@@ -162,6 +165,24 @@ extern "C" {
#undef M
}

#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
template <SkRasterPipeline::StockStage st>
static constexpr StageFn* hsw_lowp() { return nullptr; }

template <SkRasterPipeline::StockStage st>
static constexpr StageFn* ssse3_lowp() { return nullptr; }

#define M(st) \
template <> constexpr StageFn* hsw_lowp<SkRasterPipeline::st>() { \
return ASM(st,hsw_lowp); \
} \
template <> constexpr StageFn* ssse3_lowp<SkRasterPipeline::st>() { \
return ASM(st,ssse3_lowp); \
}
LOWP_STAGES(M)
#undef M
#endif

// Engines comprise everything we need to run SkRasterPipelines.
struct SkJumper_Engine {
StageFn* stages[kNumStages];
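The template machinery above is worth calling out: a primary function template that returns nullptr, specialized only for the stages LOWP_STAGES covers, lets SK_RASTER_PIPELINE_STAGES(M) expand into a complete per-stage table with null entries for anything the lowp backends do not implement. A toy version of the same pattern (all names here are made up, not the Skia ones):

#include <cstdio>

enum Stage { load_8888, swap_rb, clamp_1 };          // illustrative stage names
using StageFn = void(*)();

static void my_load_8888() {}
static void my_swap_rb()   {}

// Primary template: "not implemented", i.e. nullptr.
template <Stage st> static constexpr StageFn lowp() { return nullptr; }
// One specialization per stage the backend actually provides.
template <> constexpr StageFn lowp<load_8888>() { return my_load_8888; }
template <> constexpr StageFn lowp<swap_rb>()   { return my_swap_rb;   }

static constexpr StageFn table[] = {
    lowp<load_8888>(),   // implemented: non-null
    lowp<swap_rb>(),     // implemented: non-null
    lowp<clamp_1>(),     // not implemented in lowp: stays nullptr
};
static_assert(table[2] == nullptr, "unimplemented stages fall back to nullptr");

int main() {
    for (StageFn fn : table) {
        std::puts(fn ? "implemented" : "missing");
    }
    return 0;
}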
@@ -239,41 +260,70 @@ static SkJumper_Engine choose_engine() {
return kPortable;
}

#ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
static const SkJumper_Engine kNone = {
#define M(stage) nullptr,
{ SK_RASTER_PIPELINE_STAGES(M) },
#undef M
nullptr,
nullptr,
};
static SkJumper_Engine gLowp = kNone;
static SkOnce gChooseLowpOnce;

static SkJumper_Engine choose_lowp() {
#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
if (1 && SkCpu::Supports(SkCpu::HSW)) {
return {
#define M(st) hsw_lowp<SkRasterPipeline::st>(),
{ SK_RASTER_PIPELINE_STAGES(M) },
ASM(start_pipeline,hsw_lowp),
ASM(just_return,hsw_lowp)
#undef M
};
}
if (1 && SkCpu::Supports(SkCpu::SSSE3)) {
return {
#define M(st) ssse3_lowp<SkRasterPipeline::st>(),
{ SK_RASTER_PIPELINE_STAGES(M) },
ASM(start_pipeline,ssse3_lowp),
ASM(just_return,ssse3_lowp)
#undef M
};
}
#endif
return kNone;
}
#endif

StartPipelineFn* SkRasterPipeline::build_pipeline(void** ip) const {
#ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
if (SkCpu::Supports(SkCpu::SSSE3)) {
void** reset_point = ip;
gChooseLowpOnce([]{ gLowp = choose_lowp(); });

*--ip = (void*)ASM(just_return,ssse3_lowp);
for (const StageList* st = fStages; st; st = st->prev) {
StageFn* fn = nullptr;
switch (st->stage) {
#define M(st) case SkRasterPipeline::st: fn = ASM(st, ssse3_lowp); break;
LOWP_STAGES(M)
#undef M
case SkRasterPipeline::clamp_0: continue; // clamp_0 is a no-op in lowp.
default:
log_missing(st->stage);
ip = reset_point;
}
if (ip == reset_point) {
break;
}
// First try to build a lowp pipeline. If that fails, fall back to normal float gEngine.
void** reset_point = ip;
*--ip = (void*)gLowp.just_return;
for (const StageList* st = fStages; st; st = st->prev) {
if (st->stage == SkRasterPipeline::clamp_0) {
continue; // No-op in lowp.
}
if (StageFn* fn = gLowp.stages[st->stage]) {
if (st->ctx) {
*--ip = st->ctx;
}
*--ip = (void*)fn;
}

if (ip != reset_point) {
return ASM(start_pipeline,ssse3_lowp);
} else {
log_missing(st->stage);
ip = reset_point;
break;
}
}
if (ip != reset_point) {
return gLowp.start_pipeline;
}
#endif
#endif
gChooseEngineOnce([]{ gEngine = choose_engine(); });

gChooseEngineOnce([]{ gEngine = choose_engine(); });
// We're building the pipeline backwards, so we start with the final stage just_return.
*--ip = (void*)gEngine.just_return;
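For readers new to SkJumper, the "building the pipeline backwards" comment is the key to both loops above: the program buffer is filled from the end, starting with just_return, so that ip finishes pointing at the first stage. A self-contained toy model of that layout (function and stage names are illustrative, not the committed code):

#include <cstdio>

using StageFn = void(*)();
static void just_return() {}
static void swap_rb()     { std::puts("swap_rb"); }
static void load_8888()   { std::puts("load_8888"); }

int main() {
    void* program[8];
    void** ip = program + 8;            // start one past the end of the buffer
    *--ip = (void*)just_return;         // the terminal stage is laid down first
    *--ip = (void*)swap_rb;
    *--ip = (void*)load_8888;           // ...and the first stage is laid down last

    // A start_pipeline() analogue simply walks the program forward from ip.
    for (void** p = ip; *p != (void*)just_return; p++) {
        ((StageFn)*p)();                // prints "load_8888", then "swap_rb"
    }
    return 0;
}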
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -9,16 +9,22 @@
#include "SkJumper_misc.h"
#include <immintrin.h>

#if !defined(__SSSE3__) || !defined(__clang__) || !defined(__x86_64__)
#error "We're starting with just SSSE3 x86-64 for now, and will always require Clang."
#if !defined(__clang__) || !defined(__x86_64__)
#error "We're starting with just x86-64 for now, and will always require Clang."
#endif

#define WRAP(name) sk_##name##_ssse3_lowp

using K = const SkJumper_constants;
static const size_t kStride = 8;

template <typename T> using V = T __attribute__((ext_vector_type(8)));
#if defined(__AVX2__)
#define WRAP(name) sk_##name##_hsw_lowp
template <typename T> using V = T __attribute__((ext_vector_type(16)));
static const size_t kStride = 16;
#else
#define WRAP(name) sk_##name##_ssse3_lowp
template <typename T> using V = T __attribute__((ext_vector_type(8)));
static const size_t kStride = 8;
#endif

using U8 = V<uint8_t>;
using U16 = V<uint16_t>;
using U32 = V<uint32_t>;
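Everything in this file leans on Clang's ext_vector_type extension, which is why the #error above insists on Clang: V<T> is an 8- or 16-lane vector whose arithmetic, shifts, and __builtin_convertvector conversions are all lane-wise. A tiny standalone illustration (not from the commit; compile with clang):

#include <cstdint>
#include <cstdio>

template <typename T> using V8 = T __attribute__((ext_vector_type(8)));
using U16x8 = V8<uint16_t>;

int main() {
    U16x8 v = {1,2,3,4,5,6,7,8};
    U16x8 shifted = v << 8;                  // lane-wise, like `bytes << 8` in from_wide_byte()
    std::printf("shifted[0]=%u shifted[7]=%u\n",
                (unsigned)shifted[0], (unsigned)shifted[7]);   // 256 and 2048
    return 0;
}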
@@ -40,7 +46,14 @@ struct F {

SI F operator+(F x, F y) { return x.vec + y.vec; }
SI F operator-(F x, F y) { return x.vec - y.vec; }
SI F operator*(F x, F y) { return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec)); }
SI F operator*(F x, F y) {
#if defined(__AVX2__)
return _mm256_abs_epi16(_mm256_mulhrs_epi16(x.vec, y.vec));
#else
return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec));
#endif
}

SI F mad(F f, F m, F a) { return f*m+a; }
SI F inv(F v) { return 1.0f - v; }
SI F two(F v) { return v + v; }
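The operator* here is a fixed-point multiply: with 1.0 represented as 32768, mulhrs produces a rounded (x*y)/32768 per lane, and the surrounding abs appears to be there for the 1.0 == 0x8000 lane, which reads as negative when treated as signed 16-bit. A scalar model of the intrinsic's per-lane arithmetic (illustrative only, not the committed code):

#include <cstdint>
#include <cstdio>

static int16_t mulhrs(int16_t a, int16_t b) {
    // Matches the per-lane definition of _mm_mulhrs_epi16 / _mm256_mulhrs_epi16:
    // tmp = (a*b) >> 14; result = (tmp + 1) >> 1, i.e. the rounded high half of a*b*2.
    int32_t tmp = ((int32_t)a * (int32_t)b) >> 14;
    return (int16_t)((tmp + 1) >> 1);
}

int main() {
    int16_t half = 16384, three_q = 24576;                    // 0.5 and 0.75 in this format
    std::printf("0.5*0.5  -> %d (expect 8192)\n",  mulhrs(half, half));      // 0.25
    std::printf("0.75*0.5 -> %d (expect 12288)\n", mulhrs(three_q, half));   // 0.375
    return 0;
}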
@@ -51,6 +64,11 @@ SI F operator>>(F x, int bits) { return x.vec >> bits; }

using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);

#if defined(__AVX__)
// We really want to make sure all paths go through this function's (implicit) vzeroupper.
// If they don't, we'll experience severe slowdowns when we first use SSE instructions again.
__attribute__((disable_tail_calls))
#endif
MAYBE_MSABI
extern "C" size_t WRAP(start_pipeline)(size_t x, size_t y, size_t limit, void** program, K* k) {
F v{};
@@ -88,13 +106,21 @@ SI V load(const T* src, size_t tail) {
if (__builtin_expect(tail, 0)) {
V v{}; // Any inactive lanes are zeroed.
switch (tail) {
case 7: v[6] = src[6];
case 6: v[5] = src[5];
case 5: v[4] = src[4];
case 4: memcpy(&v, src, 4*sizeof(T)); break;
case 3: v[2] = src[2];
case 2: memcpy(&v, src, 2*sizeof(T)); break;
case 1: memcpy(&v, src, 1*sizeof(T)); break;
case 15: v[14] = src[14];
case 14: v[13] = src[13];
case 13: v[12] = src[12];
case 12: memcpy(&v, src, 12*sizeof(T)); break;
case 11: v[10] = src[10];
case 10: v[ 9] = src[ 9];
case  9: v[ 8] = src[ 8];
case  8: memcpy(&v, src, 8*sizeof(T)); break;
case  7: v[6] = src[6];
case  6: v[5] = src[5];
case  5: v[4] = src[4];
case  4: memcpy(&v, src, 4*sizeof(T)); break;
case  3: v[2] = src[2];
case  2: memcpy(&v, src, 2*sizeof(T)); break;
case  1: memcpy(&v, src, 1*sizeof(T)); break;
}
return v;
}
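The fall-through switch above (extended here to 15 cases for the 16-lane AVX2 build) is a hand-unrolled partial load: lanes [0, tail) come from src and the rest stay zero, with the 4-, 8-, and 12-lane prefixes peeled into single memcpys. The same contract written as a plain loop, as an illustrative sketch assuming T is trivially copyable:

#include <cstddef>
#include <cstdint>
#include <cstring>

template <typename T, size_t kStride>
static void load_tail(T (&v)[kStride], const T* src, size_t tail) {
    std::memset(v, 0, sizeof(v));                  // inactive lanes are zeroed, as in load()
    for (size_t i = 0; i < tail && i < kStride; i++) {
        v[i] = src[i];
    }
}

int main() {
    const uint16_t src[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
    uint16_t v[16];
    load_tail(v, src, 5);                          // lanes 0..4 copied, 5..15 left at zero
    return (v[4] == 5 && v[5] == 0) ? 0 : 1;
}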
@@ -106,25 +132,39 @@ SI void store(T* dst, V v, size_t tail) {
__builtin_assume(tail < kStride);
if (__builtin_expect(tail, 0)) {
switch (tail) {
case 7: dst[6] = v[6];
case 6: dst[5] = v[5];
case 5: dst[4] = v[4];
case 4: memcpy(dst, &v, 4*sizeof(T)); break;
case 3: dst[2] = v[2];
case 2: memcpy(dst, &v, 2*sizeof(T)); break;
case 1: memcpy(dst, &v, 1*sizeof(T)); break;
case 15: dst[14] = v[14];
case 14: dst[13] = v[13];
case 13: dst[12] = v[12];
case 12: memcpy(dst, &v, 12*sizeof(T)); break;
case 11: dst[10] = v[10];
case 10: dst[ 9] = v[ 9];
case  9: dst[ 8] = v[ 8];
case  8: memcpy(dst, &v, 8*sizeof(T)); break;
case  7: dst[6] = v[6];
case  6: dst[5] = v[5];
case  5: dst[4] = v[4];
case  4: memcpy(dst, &v, 4*sizeof(T)); break;
case  3: dst[2] = v[2];
case  2: memcpy(dst, &v, 2*sizeof(T)); break;
case  1: memcpy(dst, &v, 1*sizeof(T)); break;
}
return;
}
unaligned_store(dst, v);
}

// TODO: mask loads and stores with AVX2

// Scale from [0,255] up to [0,32768].
SI F from_wide_byte(U16 bytes) {
// Ideally we'd scale by 32768/255 = 128.50196, but instead we'll approximate
// that a little more cheaply as 256*32897/65536 = 128.50391.
// 0 and 255 map to 0 and 32768 correctly, and nothing else is off by more than 1 bit.
return _mm_mulhi_epu16(bytes << 8, U16(32897));
#if defined(__AVX2__)
return _mm256_mulhi_epu16(bytes << 8, U16(32897));
#else
return _mm_mulhi_epu16(bytes << 8, U16(32897));
#endif
}
SI F from_byte(U8 bytes) {
return from_wide_byte(__builtin_convertvector(bytes, U16));
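The scaling comment above can be checked directly: mulhi(bytes << 8, 32897) computes (bytes*256*32897) >> 16, which matches the ideal bytes*32768/255 exactly at both endpoints and stays within one everywhere else. A standalone check of that claim (not part of the commit):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    double worst = 0;
    for (uint32_t b = 0; b < 256; b++) {
        uint32_t approx = ((b << 8) * 32897u) >> 16;   // what _mm_mulhi_epu16 does per lane
        double   ideal  = b * 32768.0 / 255.0;
        worst = std::fmax(worst, std::fabs(approx - ideal));
    }
    std::printf("0 -> %u, 255 -> %u, worst |error| = %.3f\n",
                ((0u << 8) * 32897u) >> 16,
                ((255u << 8) * 32897u) >> 16,
                worst);                                // prints 0, 32768, and an error below 1.0
    return 0;
}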
@@ -133,13 +173,22 @@ SI F from_byte(U8 bytes) {
// Pack from [0,32768] down to [0,255].
SI U16 to_wide_byte(F v) {
// The simplest thing works great: divide by 128 and saturate.
return _mm_min_epi16(v>>7, U16(255));
#if defined(__AVX2__)
return _mm256_min_epi16(v >> 7, U16(255));
#else
return _mm_min_epi16(v >> 7, U16(255));
#endif
}
SI U8 to_byte(F v) {
// Like to_wide_byte(), but we'll bake the saturation into the 16->8 bit pack.
#if defined(__AVX2__)
return _mm_packus_epi16(_mm256_extracti128_si256(v >> 7, 0),
_mm256_extracti128_si256(v >> 7, 1));
#else
// Only the bottom 8 bytes are of interest... it doesn't matter what we pack on top.
__m128i packed = _mm_packus_epi16(v>>7, v>>7);
__m128i packed = _mm_packus_epi16(v >> 7, v >> 7);
return unaligned_load<U8>(&packed);
#endif
}

SI void from_8888(U32 rgba, F* r, F* g, F* b, F* a) {
@@ -60,6 +60,12 @@ subprocess.check_call(clang + cflags + hsw +
subprocess.check_call(clang + cflags + hsw + win +
                      ['-c', 'src/jumper/SkJumper_stages.cpp'] +
                      ['-o', 'win_hsw.o'])
subprocess.check_call(clang + cflags + hsw +
                      ['-c', 'src/jumper/SkJumper_stages_lowp.cpp'] +
                      ['-o', 'lowp_hsw.o'])
subprocess.check_call(clang + cflags + hsw + win +
                      ['-c', 'src/jumper/SkJumper_stages_lowp.cpp'] +
                      ['-o', 'win_lowp_hsw.o'])

aarch64 = [ '--target=aarch64' ]
subprocess.check_call(clang + cflags + aarch64 +
@@ -196,6 +202,8 @@ parse_object_file('sse41.o', '.byte')
print 'BALIGN32'
parse_object_file('sse2.o', '.byte')
print 'BALIGN32'
parse_object_file('lowp_hsw.o', '.byte')
print 'BALIGN32'
parse_object_file('lowp_ssse3.o', '.byte')

print '#endif'
@@ -221,6 +229,8 @@ parse_object_file('win_sse41.o', 'DB')
print 'ALIGN 32'
parse_object_file('win_sse2.o', 'DB')
print 'ALIGN 32'
parse_object_file('win_lowp_hsw.o', 'DB')
print 'ALIGN 32'
parse_object_file('win_lowp_ssse3.o', 'DB')
print 'ENDIF'
print 'END'