add _hsw lowp backend
CQ_INCLUDE_TRYBOTS=skia.primary:Build-Ubuntu-Clang-x86_64-Debug-MSAN Change-Id: Id53279c17589b3434629bb644358ee238af8649f Reviewed-on: https://skia-review.googlesource.com/20269 Commit-Queue: Mike Klein <mtklein@chromium.org> Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Mike Reed <reed@google.com>
This commit is contained in:
parent
7f7b902d51
commit
8c3d5156c7
@ -31,8 +31,7 @@ static const int kNumStages = SK_RASTER_PIPELINE_STAGES(M);
|
|||||||
#undef M
|
#undef M
|
||||||
|
|
||||||
#ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
|
#ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
|
||||||
#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
|
#if 0 && !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
|
||||||
#if 0
|
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
|
|
||||||
#define M(st) #st,
|
#define M(st) #st,
|
||||||
@ -57,7 +56,6 @@ static const int kNumStages = SK_RASTER_PIPELINE_STAGES(M);
|
|||||||
static void log_missing(SkRasterPipeline::StockStage) {}
|
static void log_missing(SkRasterPipeline::StockStage) {}
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
#endif
|
|
||||||
|
|
||||||
// We can't express the real types of most stage functions portably, so we use a stand-in.
|
// We can't express the real types of most stage functions portably, so we use a stand-in.
|
||||||
// We'll only ever call start_pipeline(), which then chains into the rest for us.
|
// We'll only ever call start_pipeline(), which then chains into the rest for us.
|
||||||
@ -128,12 +126,14 @@ extern "C" {
|
|||||||
ASM(start_pipeline,avx ),
|
ASM(start_pipeline,avx ),
|
||||||
ASM(start_pipeline,sse41 ),
|
ASM(start_pipeline,sse41 ),
|
||||||
ASM(start_pipeline,sse2 ),
|
ASM(start_pipeline,sse2 ),
|
||||||
|
ASM(start_pipeline,hsw_lowp ),
|
||||||
ASM(start_pipeline,ssse3_lowp);
|
ASM(start_pipeline,ssse3_lowp);
|
||||||
|
|
||||||
StageFn ASM(just_return,hsw),
|
StageFn ASM(just_return,hsw),
|
||||||
ASM(just_return,avx),
|
ASM(just_return,avx),
|
||||||
ASM(just_return,sse41),
|
ASM(just_return,sse41),
|
||||||
ASM(just_return,sse2),
|
ASM(just_return,sse2),
|
||||||
|
ASM(just_return,hsw_lowp ),
|
||||||
ASM(just_return,ssse3_lowp);
|
ASM(just_return,ssse3_lowp);
|
||||||
|
|
||||||
#define M(st) StageFn ASM(st,hsw);
|
#define M(st) StageFn ASM(st,hsw);
|
||||||
@ -149,6 +149,9 @@ extern "C" {
|
|||||||
SK_RASTER_PIPELINE_STAGES(M)
|
SK_RASTER_PIPELINE_STAGES(M)
|
||||||
#undef M
|
#undef M
|
||||||
|
|
||||||
|
#define M(st) StageFn ASM(st,hsw_lowp);
|
||||||
|
LOWP_STAGES(M)
|
||||||
|
#undef M
|
||||||
#define M(st) StageFn ASM(st,ssse3_lowp);
|
#define M(st) StageFn ASM(st,ssse3_lowp);
|
||||||
LOWP_STAGES(M)
|
LOWP_STAGES(M)
|
||||||
#undef M
|
#undef M
|
||||||
@ -162,6 +165,24 @@ extern "C" {
|
|||||||
#undef M
|
#undef M
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
|
||||||
|
template <SkRasterPipeline::StockStage st>
|
||||||
|
static constexpr StageFn* hsw_lowp() { return nullptr; }
|
||||||
|
|
||||||
|
template <SkRasterPipeline::StockStage st>
|
||||||
|
static constexpr StageFn* ssse3_lowp() { return nullptr; }
|
||||||
|
|
||||||
|
#define M(st) \
|
||||||
|
template <> constexpr StageFn* hsw_lowp<SkRasterPipeline::st>() { \
|
||||||
|
return ASM(st,hsw_lowp); \
|
||||||
|
} \
|
||||||
|
template <> constexpr StageFn* ssse3_lowp<SkRasterPipeline::st>() { \
|
||||||
|
return ASM(st,ssse3_lowp); \
|
||||||
|
}
|
||||||
|
LOWP_STAGES(M)
|
||||||
|
#undef M
|
||||||
|
#endif
|
||||||
|
|
||||||
// Engines comprise everything we need to run SkRasterPipelines.
|
// Engines comprise everything we need to run SkRasterPipelines.
|
||||||
struct SkJumper_Engine {
|
struct SkJumper_Engine {
|
||||||
StageFn* stages[kNumStages];
|
StageFn* stages[kNumStages];
|
||||||
@ -239,41 +260,70 @@ static SkJumper_Engine choose_engine() {
|
|||||||
return kPortable;
|
return kPortable;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
|
||||||
|
static const SkJumper_Engine kNone = {
|
||||||
|
#define M(stage) nullptr,
|
||||||
|
{ SK_RASTER_PIPELINE_STAGES(M) },
|
||||||
|
#undef M
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
};
|
||||||
|
static SkJumper_Engine gLowp = kNone;
|
||||||
|
static SkOnce gChooseLowpOnce;
|
||||||
|
|
||||||
|
static SkJumper_Engine choose_lowp() {
|
||||||
|
#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
|
||||||
|
if (1 && SkCpu::Supports(SkCpu::HSW)) {
|
||||||
|
return {
|
||||||
|
#define M(st) hsw_lowp<SkRasterPipeline::st>(),
|
||||||
|
{ SK_RASTER_PIPELINE_STAGES(M) },
|
||||||
|
ASM(start_pipeline,hsw_lowp),
|
||||||
|
ASM(just_return,hsw_lowp)
|
||||||
|
#undef M
|
||||||
|
};
|
||||||
|
}
|
||||||
|
if (1 && SkCpu::Supports(SkCpu::SSSE3)) {
|
||||||
|
return {
|
||||||
|
#define M(st) ssse3_lowp<SkRasterPipeline::st>(),
|
||||||
|
{ SK_RASTER_PIPELINE_STAGES(M) },
|
||||||
|
ASM(start_pipeline,ssse3_lowp),
|
||||||
|
ASM(just_return,ssse3_lowp)
|
||||||
|
#undef M
|
||||||
|
};
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return kNone;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
StartPipelineFn* SkRasterPipeline::build_pipeline(void** ip) const {
|
StartPipelineFn* SkRasterPipeline::build_pipeline(void** ip) const {
|
||||||
#ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
|
#ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
|
||||||
#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
|
gChooseLowpOnce([]{ gLowp = choose_lowp(); });
|
||||||
if (SkCpu::Supports(SkCpu::SSSE3)) {
|
|
||||||
void** reset_point = ip;
|
|
||||||
|
|
||||||
*--ip = (void*)ASM(just_return,ssse3_lowp);
|
// First try to build a lowp pipeline. If that fails, fall back to normal float gEngine.
|
||||||
|
void** reset_point = ip;
|
||||||
|
*--ip = (void*)gLowp.just_return;
|
||||||
for (const StageList* st = fStages; st; st = st->prev) {
|
for (const StageList* st = fStages; st; st = st->prev) {
|
||||||
StageFn* fn = nullptr;
|
if (st->stage == SkRasterPipeline::clamp_0) {
|
||||||
switch (st->stage) {
|
continue; // No-op in lowp.
|
||||||
#define M(st) case SkRasterPipeline::st: fn = ASM(st, ssse3_lowp); break;
|
|
||||||
LOWP_STAGES(M)
|
|
||||||
#undef M
|
|
||||||
case SkRasterPipeline::clamp_0: continue; // clamp_0 is a no-op in lowp.
|
|
||||||
default:
|
|
||||||
log_missing(st->stage);
|
|
||||||
ip = reset_point;
|
|
||||||
}
|
|
||||||
if (ip == reset_point) {
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
if (StageFn* fn = gLowp.stages[st->stage]) {
|
||||||
if (st->ctx) {
|
if (st->ctx) {
|
||||||
*--ip = st->ctx;
|
*--ip = st->ctx;
|
||||||
}
|
}
|
||||||
*--ip = (void*)fn;
|
*--ip = (void*)fn;
|
||||||
|
} else {
|
||||||
|
log_missing(st->stage);
|
||||||
|
ip = reset_point;
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ip != reset_point) {
|
if (ip != reset_point) {
|
||||||
return ASM(start_pipeline,ssse3_lowp);
|
return gLowp.start_pipeline;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#endif
|
|
||||||
gChooseEngineOnce([]{ gEngine = choose_engine(); });
|
|
||||||
|
|
||||||
|
gChooseEngineOnce([]{ gEngine = choose_engine(); });
|
||||||
// We're building the pipeline backwards, so we start with the final stage just_return.
|
// We're building the pipeline backwards, so we start with the final stage just_return.
|
||||||
*--ip = (void*)gEngine.just_return;
|
*--ip = (void*)gEngine.just_return;
|
||||||
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -9,16 +9,22 @@
|
|||||||
#include "SkJumper_misc.h"
|
#include "SkJumper_misc.h"
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
|
|
||||||
#if !defined(__SSSE3__) || !defined(__clang__) || !defined(__x86_64__)
|
#if !defined(__clang__) || !defined(__x86_64__)
|
||||||
#error "We're starting with just SSSE3 x86-64 for now, and will always require Clang."
|
#error "We're starting with just x86-64 for now, and will always require Clang."
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define WRAP(name) sk_##name##_ssse3_lowp
|
|
||||||
|
|
||||||
using K = const SkJumper_constants;
|
using K = const SkJumper_constants;
|
||||||
static const size_t kStride = 8;
|
|
||||||
|
|
||||||
template <typename T> using V = T __attribute__((ext_vector_type(8)));
|
#if defined(__AVX2__)
|
||||||
|
#define WRAP(name) sk_##name##_hsw_lowp
|
||||||
|
template <typename T> using V = T __attribute__((ext_vector_type(16)));
|
||||||
|
static const size_t kStride = 16;
|
||||||
|
#else
|
||||||
|
#define WRAP(name) sk_##name##_ssse3_lowp
|
||||||
|
template <typename T> using V = T __attribute__((ext_vector_type(8)));
|
||||||
|
static const size_t kStride = 8;
|
||||||
|
#endif
|
||||||
|
|
||||||
using U8 = V<uint8_t>;
|
using U8 = V<uint8_t>;
|
||||||
using U16 = V<uint16_t>;
|
using U16 = V<uint16_t>;
|
||||||
using U32 = V<uint32_t>;
|
using U32 = V<uint32_t>;
|
||||||
@ -40,7 +46,14 @@ struct F {
|
|||||||
|
|
||||||
SI F operator+(F x, F y) { return x.vec + y.vec; }
|
SI F operator+(F x, F y) { return x.vec + y.vec; }
|
||||||
SI F operator-(F x, F y) { return x.vec - y.vec; }
|
SI F operator-(F x, F y) { return x.vec - y.vec; }
|
||||||
SI F operator*(F x, F y) { return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec)); }
|
SI F operator*(F x, F y) {
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
return _mm256_abs_epi16(_mm256_mulhrs_epi16(x.vec, y.vec));
|
||||||
|
#else
|
||||||
|
return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
SI F mad(F f, F m, F a) { return f*m+a; }
|
SI F mad(F f, F m, F a) { return f*m+a; }
|
||||||
SI F inv(F v) { return 1.0f - v; }
|
SI F inv(F v) { return 1.0f - v; }
|
||||||
SI F two(F v) { return v + v; }
|
SI F two(F v) { return v + v; }
|
||||||
@ -51,6 +64,11 @@ SI F operator>>(F x, int bits) { return x.vec >> bits; }
|
|||||||
|
|
||||||
using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);
|
using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);
|
||||||
|
|
||||||
|
#if defined(__AVX__)
|
||||||
|
// We really want to make sure all paths go through this function's (implicit) vzeroupper.
|
||||||
|
// If they don't, we'll experience severe slowdowns when we first use SSE instructions again.
|
||||||
|
__attribute__((disable_tail_calls))
|
||||||
|
#endif
|
||||||
MAYBE_MSABI
|
MAYBE_MSABI
|
||||||
extern "C" size_t WRAP(start_pipeline)(size_t x, size_t y, size_t limit, void** program, K* k) {
|
extern "C" size_t WRAP(start_pipeline)(size_t x, size_t y, size_t limit, void** program, K* k) {
|
||||||
F v{};
|
F v{};
|
||||||
@ -88,6 +106,14 @@ SI V load(const T* src, size_t tail) {
|
|||||||
if (__builtin_expect(tail, 0)) {
|
if (__builtin_expect(tail, 0)) {
|
||||||
V v{}; // Any inactive lanes are zeroed.
|
V v{}; // Any inactive lanes are zeroed.
|
||||||
switch (tail) {
|
switch (tail) {
|
||||||
|
case 15: v[14] = src[14];
|
||||||
|
case 14: v[13] = src[13];
|
||||||
|
case 13: v[12] = src[12];
|
||||||
|
case 12: memcpy(&v, src, 12*sizeof(T)); break;
|
||||||
|
case 11: v[10] = src[10];
|
||||||
|
case 10: v[ 9] = src[ 9];
|
||||||
|
case 9: v[ 8] = src[ 8];
|
||||||
|
case 8: memcpy(&v, src, 8*sizeof(T)); break;
|
||||||
case 7: v[6] = src[6];
|
case 7: v[6] = src[6];
|
||||||
case 6: v[5] = src[5];
|
case 6: v[5] = src[5];
|
||||||
case 5: v[4] = src[4];
|
case 5: v[4] = src[4];
|
||||||
@ -106,6 +132,14 @@ SI void store(T* dst, V v, size_t tail) {
|
|||||||
__builtin_assume(tail < kStride);
|
__builtin_assume(tail < kStride);
|
||||||
if (__builtin_expect(tail, 0)) {
|
if (__builtin_expect(tail, 0)) {
|
||||||
switch (tail) {
|
switch (tail) {
|
||||||
|
case 15: dst[14] = v[14];
|
||||||
|
case 14: dst[13] = v[13];
|
||||||
|
case 13: dst[12] = v[12];
|
||||||
|
case 12: memcpy(dst, &v, 12*sizeof(T)); break;
|
||||||
|
case 11: dst[10] = v[10];
|
||||||
|
case 10: dst[ 9] = v[ 9];
|
||||||
|
case 9: dst[ 8] = v[ 8];
|
||||||
|
case 8: memcpy(dst, &v, 8*sizeof(T)); break;
|
||||||
case 7: dst[6] = v[6];
|
case 7: dst[6] = v[6];
|
||||||
case 6: dst[5] = v[5];
|
case 6: dst[5] = v[5];
|
||||||
case 5: dst[4] = v[4];
|
case 5: dst[4] = v[4];
|
||||||
@ -119,12 +153,18 @@ SI void store(T* dst, V v, size_t tail) {
|
|||||||
unaligned_store(dst, v);
|
unaligned_store(dst, v);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: mask loads and stores with AVX2
|
||||||
|
|
||||||
// Scale from [0,255] up to [0,32768].
|
// Scale from [0,255] up to [0,32768].
|
||||||
SI F from_wide_byte(U16 bytes) {
|
SI F from_wide_byte(U16 bytes) {
|
||||||
// Ideally we'd scale by 32768/255 = 128.50196, but instead we'll approximate
|
// Ideally we'd scale by 32768/255 = 128.50196, but instead we'll approximate
|
||||||
// that a little more cheaply as 256*32897/65536 = 128.50391.
|
// that a little more cheaply as 256*32897/65536 = 128.50391.
|
||||||
// 0 and 255 map to 0 and 32768 correctly, and nothing else is off by more than 1 bit.
|
// 0 and 255 map to 0 and 32768 correctly, and nothing else is off by more than 1 bit.
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
return _mm256_mulhi_epu16(bytes << 8, U16(32897));
|
||||||
|
#else
|
||||||
return _mm_mulhi_epu16(bytes << 8, U16(32897));
|
return _mm_mulhi_epu16(bytes << 8, U16(32897));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
SI F from_byte(U8 bytes) {
|
SI F from_byte(U8 bytes) {
|
||||||
return from_wide_byte(__builtin_convertvector(bytes, U16));
|
return from_wide_byte(__builtin_convertvector(bytes, U16));
|
||||||
@ -133,13 +173,22 @@ SI F from_byte(U8 bytes) {
|
|||||||
// Pack from [0,32768] down to [0,255].
|
// Pack from [0,32768] down to [0,255].
|
||||||
SI U16 to_wide_byte(F v) {
|
SI U16 to_wide_byte(F v) {
|
||||||
// The simplest thing works great: divide by 128 and saturate.
|
// The simplest thing works great: divide by 128 and saturate.
|
||||||
return _mm_min_epi16(v>>7, U16(255));
|
#if defined(__AVX2__)
|
||||||
|
return _mm256_min_epi16(v >> 7, U16(255));
|
||||||
|
#else
|
||||||
|
return _mm_min_epi16(v >> 7, U16(255));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
SI U8 to_byte(F v) {
|
SI U8 to_byte(F v) {
|
||||||
// Like to_wide_byte(), but we'll bake the saturation into the 16->8 bit pack.
|
// Like to_wide_byte(), but we'll bake the saturation into the 16->8 bit pack.
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
return _mm_packus_epi16(_mm256_extracti128_si256(v >> 7, 0),
|
||||||
|
_mm256_extracti128_si256(v >> 7, 1));
|
||||||
|
#else
|
||||||
// Only the bottom 8 bytes are of interest... it doesn't matter what we pack on top.
|
// Only the bottom 8 bytes are of interest... it doesn't matter what we pack on top.
|
||||||
__m128i packed = _mm_packus_epi16(v>>7, v>>7);
|
__m128i packed = _mm_packus_epi16(v >> 7, v >> 7);
|
||||||
return unaligned_load<U8>(&packed);
|
return unaligned_load<U8>(&packed);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
SI void from_8888(U32 rgba, F* r, F* g, F* b, F* a) {
|
SI void from_8888(U32 rgba, F* r, F* g, F* b, F* a) {
|
||||||
|
@ -60,6 +60,12 @@ subprocess.check_call(clang + cflags + hsw +
|
|||||||
subprocess.check_call(clang + cflags + hsw + win +
|
subprocess.check_call(clang + cflags + hsw + win +
|
||||||
['-c', 'src/jumper/SkJumper_stages.cpp'] +
|
['-c', 'src/jumper/SkJumper_stages.cpp'] +
|
||||||
['-o', 'win_hsw.o'])
|
['-o', 'win_hsw.o'])
|
||||||
|
subprocess.check_call(clang + cflags + hsw +
|
||||||
|
['-c', 'src/jumper/SkJumper_stages_lowp.cpp'] +
|
||||||
|
['-o', 'lowp_hsw.o'])
|
||||||
|
subprocess.check_call(clang + cflags + hsw + win +
|
||||||
|
['-c', 'src/jumper/SkJumper_stages_lowp.cpp'] +
|
||||||
|
['-o', 'win_lowp_hsw.o'])
|
||||||
|
|
||||||
aarch64 = [ '--target=aarch64' ]
|
aarch64 = [ '--target=aarch64' ]
|
||||||
subprocess.check_call(clang + cflags + aarch64 +
|
subprocess.check_call(clang + cflags + aarch64 +
|
||||||
@ -196,6 +202,8 @@ parse_object_file('sse41.o', '.byte')
|
|||||||
print 'BALIGN32'
|
print 'BALIGN32'
|
||||||
parse_object_file('sse2.o', '.byte')
|
parse_object_file('sse2.o', '.byte')
|
||||||
print 'BALIGN32'
|
print 'BALIGN32'
|
||||||
|
parse_object_file('lowp_hsw.o', '.byte')
|
||||||
|
print 'BALIGN32'
|
||||||
parse_object_file('lowp_ssse3.o', '.byte')
|
parse_object_file('lowp_ssse3.o', '.byte')
|
||||||
|
|
||||||
print '#endif'
|
print '#endif'
|
||||||
@ -221,6 +229,8 @@ parse_object_file('win_sse41.o', 'DB')
|
|||||||
print 'ALIGN 32'
|
print 'ALIGN 32'
|
||||||
parse_object_file('win_sse2.o', 'DB')
|
parse_object_file('win_sse2.o', 'DB')
|
||||||
print 'ALIGN 32'
|
print 'ALIGN 32'
|
||||||
|
parse_object_file('win_lowp_hsw.o', 'DB')
|
||||||
|
print 'ALIGN 32'
|
||||||
parse_object_file('win_lowp_ssse3.o', 'DB')
|
parse_object_file('win_lowp_ssse3.o', 'DB')
|
||||||
print 'ENDIF'
|
print 'ENDIF'
|
||||||
print 'END'
|
print 'END'
|
||||||
|
Loading…
Reference in New Issue
Block a user