SkJumper: start on asm
Will follow up with Linux, then Android aarch64 and armv7, then iOS, then Windows.  I took some opportunities to refactor.

CQ_INCLUDE_TRYBOTS=skia.primary:Test-Mac-Clang-MacMini6.2-CPU-AVX-x86_64-Debug,Perf-Mac-Clang-MacMini6.2-CPU-AVX-x86_64-Debug

Change-Id: Ifcf1edabdfe5df0a91bd089f09523aba95cdf5ef
Reviewed-on: https://skia-review.googlesource.com/8611
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
parent 2e777ead12
commit d1fe9522e3

BUILD.gn (3 lines changed)
@@ -29,7 +29,7 @@ declare_args() {
   skia_enable_android_framework_defines = false
   skia_enable_discrete_gpu = true
   skia_enable_effects = true
-  skia_enable_jumper = false
+  skia_enable_jumper = is_skia_standalone && is_mac
   skia_enable_gpu = true
   skia_enable_pdf = true
   skia_enable_tools = is_skia_standalone
@@ -498,6 +498,7 @@ optional("jumper") {
   public_defines = [ "SK_JUMPER" ]
   sources = [
     "src/jumper/SkJumper.cpp",
+    "src/jumper/SkJumper_generated_x86_64.s",
     "src/jumper/SkJumper_stages.cpp",
   ]
 }
src/jumper/SkJumper.cpp
@@ -7,13 +7,14 @@
 
 #include "SkCpu.h"
 #include "SkJumper.h"
-#include "SkJumper_generated.h"
 #include "SkRasterPipeline.h"
 #include "SkTemplates.h"
 
 // Stages expect these constants to be set to these values.
 // It's fine to rearrange and add new ones if you update SkJumper_constants.
-static const SkJumper_constants kConstants = {
+using K = const SkJumper_constants;
+static K kConstants = {
     1.0f, 0.5f, 255.0f, 1/255.0f, 0x000000ff,
     {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f},
     0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f,    // from_srgb
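A note on why kConstants exists at all: the pre-assembled stages are raw instruction bytes with no data section, and build_stages.py asserts that no instruction uses a RIP-relative operand, so a stage cannot load literals from memory of its own. Every scalar constant is instead threaded through one struct pointer. A minimal sketch of the pattern (illustrative names only, not the real SkJumper_constants layout):

    // Hypothetical illustration of the constants-by-pointer pattern.
    struct Constants {
        float one, half;   // every literal a stage may need lives here
    };

    // The stage reads k->half instead of embedding 0.5f, so its machine
    // code needs no data loads and no relocations.
    static void scale_by_half(float* px, const Constants* k) {
        *px *= k->half;
    }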
@@ -50,171 +51,125 @@ static const SkJumper_constants kConstants = {
     M(clamp_y) \
     M(linear_gradient_2stops)
 
-// Declare the portable, single pixel stages that are linked into Skia from SkJumper_stages.o.
-extern "C" {
-    void sk_start_pipeline(size_t, void**, const SkJumper_constants*);
+// We can't express the real types of most stage functions portably, so we use a stand-in.
+// We'll only ever call start_pipeline(), which then chains into the rest for us.
+using StageFn = void(void);
 
-    // We use void() as a convenient stand-in for the real stage function type.
-    // We never call these directly, so we don't really need to know their real types.
-    void sk_just_return(void);
-#define M(st) void sk_##st(void);
-    STAGES(M)
-#undef M
+extern "C" {
 
+#if defined(__x86_64__) || defined(_M_X64)
+    void sk_start_pipeline_hsw  (size_t, void**, K*);
+    void sk_start_pipeline_sse41(size_t, void**, K*);
+    void sk_start_pipeline_sse2 (size_t, void**, K*);
+
+    StageFn sk_just_return_hsw,
+            sk_just_return_sse41,
+            sk_just_return_sse2;
+
+    #define M(st) StageFn sk_##st##_hsw;
+        STAGES(M)
+    #undef M
+    #define M(st) StageFn sk_##st##_sse41;
+        STAGES(M)
+    #undef M
+    #define M(st) StageFn sk_##st##_sse2;
+        STAGES(M)
+    #undef M
+#endif
+
+    // Portable, single-pixel stages.
+    void sk_start_pipeline(size_t, void**, K*);
+    StageFn sk_just_return;
+    #define M(st) StageFn sk_##st;
+        STAGES(M)
+    #undef M
 }
 
-// Translate SkRasterPipeline's enum to pointers to our portable, single pixel stages.
-static void* portable_lookup(SkRasterPipeline::StockStage st) {
+// Translate SkRasterPipeline's StockStage enum to StageFn function pointers.
+
+#if defined(__x86_64__) || defined(_M_X64)
+static StageFn* lookup_hsw(SkRasterPipeline::StockStage st) {
+    switch (st) {
+        default: return nullptr;
+    #define M(st) case SkRasterPipeline::st: return sk_##st##_hsw;
+        STAGES(M)
+    #undef M
+    }
+}
+static StageFn* lookup_sse41(SkRasterPipeline::StockStage st) {
+    switch (st) {
+        default: return nullptr;
+    #define M(st) case SkRasterPipeline::st: return sk_##st##_sse41;
+        STAGES(M)
+    #undef M
+    }
+}
+static StageFn* lookup_sse2(SkRasterPipeline::StockStage st) {
+    switch (st) {
+        default: return nullptr;
+    #define M(st) case SkRasterPipeline::st: return sk_##st##_sse2;
+        STAGES(M)
+    #undef M
+    }
+}
+#endif
+
+static StageFn* lookup_portable(SkRasterPipeline::StockStage st) {
     switch (st) {
         default: return nullptr;
-    #define M(st) case SkRasterPipeline::st: return (void*)sk_##st;
+    #define M(st) case SkRasterPipeline::st: return sk_##st;
         STAGES(M)
     #undef M
     }
 }
 
-// The non-portable options are pre-compiled static data arrays pulled in from SkJumper_generated.h.
-#if defined(__aarch64__)
-    static void* aarch64_lookup(SkRasterPipeline::StockStage st) {
-        switch (st) {
-            default: return nullptr;
-        #define M(st) case SkRasterPipeline::st: return (void*)aarch64_sk_##st;
-            STAGES(M)
-        #undef M
-        }
-    }
-#elif defined(__ARM_NEON__)
-    static void* armv7_lookup(SkRasterPipeline::StockStage st) {
-        switch (st) {
-            default: return nullptr;
-        #define M(st) case SkRasterPipeline::st: return (void*)armv7_sk_##st;
-            STAGES(M)
-        #undef M
-        }
-    }
-#elif defined(__x86_64__) || defined(_M_X64)
-    static void* sse2_lookup(SkRasterPipeline::StockStage st) {
-        switch (st) {
-            default: return nullptr;
-        #define M(st) case SkRasterPipeline::st: return (void*)sse2_sk_##st;
-            STAGES(M)
-        #undef M
-        }
-    }
-    static void* sse41_lookup(SkRasterPipeline::StockStage st) {
-        switch (st) {
-            default: return nullptr;
-        #define M(st) case SkRasterPipeline::st: return (void*)sse41_sk_##st;
-            STAGES(M)
-        #undef M
-        }
-    }
-    static void* hsw_lookup(SkRasterPipeline::StockStage st) {
-        switch (st) {
-            default: return nullptr;
-        #define M(st) case SkRasterPipeline::st: return (void*)hsw_sk_##st;
-            STAGES(M)
-        #undef M
-        }
-    }
-#endif
-
 bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
-    // We'll look for the best vector instruction set and stride we can use.
-    size_t stride = 0;
-    void* (*lookup)(SkRasterPipeline::StockStage) = nullptr;
-    void* start_pipeline = nullptr;
-    void* just_return    = nullptr;
-
-#if defined(__aarch64__)
-    stride         = 4;
-    lookup         = aarch64_lookup;
-    start_pipeline = (void*)aarch64_sk_start_pipeline;
-    just_return    = (void*)aarch64_sk_just_return;
-
-#elif defined(__ARM_NEON__)
-    if (SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
-        stride         = 2;
-        lookup         = armv7_lookup;
-        start_pipeline = (void*)armv7_sk_start_pipeline;
-        just_return    = (void*)armv7_sk_just_return;
-    }
-
-#elif defined(__x86_64__) || defined(_M_X64)
-    stride         = 4;
-    lookup         = sse2_lookup;
-    start_pipeline = (void*)sse2_sk_start_pipeline;
-    just_return    = (void*)sse2_sk_just_return;
-    if (SkCpu::Supports(SkCpu::SSE41)) {
-        stride         = 4;
-        lookup         = sse41_lookup;
-        start_pipeline = (void*)sse41_sk_start_pipeline;
-        just_return    = (void*)sse41_sk_just_return;
-    }
-    if (SkCpu::Supports(SkCpu::HSW)) {
-        stride         = 8;
-        lookup         = hsw_lookup;
-        start_pipeline = (void*)hsw_sk_start_pipeline;
-        just_return    = (void*)hsw_sk_just_return;
-    }
-#endif
-
-#if defined(_MSC_VER)
-    if (start_pipeline == (void*)sse2_sk_start_pipeline) {
-        start_pipeline = (void*)sse2_sk_start_pipeline_ms;
-    }
-    if (start_pipeline == (void*)sse41_sk_start_pipeline) {
-        start_pipeline = (void*)sse41_sk_start_pipeline_ms;
-    }
-    if (start_pipeline == (void*)hsw_sk_start_pipeline) {
-        start_pipeline = (void*)hsw_sk_start_pipeline_ms;
-    }
-#endif
-
     SkAutoSTMalloc<64, void*> program(2*fStages.size() + 1);
-
-    // If possible, build and run a program to run at full vector stride.
     const size_t limit = x+n;
 
-    if (stride) {
-        void** ip = program.get();
-        for (auto&& st : fStages) {
-            auto fn = lookup(st.stage);
-            if (!fn) {
-                return false;
+    auto build_and_run = [&](size_t stride,
+                             StageFn* (*lookup)(SkRasterPipeline::StockStage),
+                             StageFn* just_return,
+                             void (*start_pipeline)(size_t, void**, K*)) {
+        if (x + stride <= limit) {
+            void** ip = program.get();
+            for (auto&& st : fStages) {
+                auto fn = lookup(st.stage);
+                if (!fn) {
+                    return false;
+                }
+                *ip++ = (void*)fn;
+                *ip++ = st.ctx;
             }
-            *ip++ = fn;
-            *ip++ = st.ctx;
-        }
-        *ip = (void*)just_return;
+            *ip = (void*)just_return;
 
-        auto start = (decltype(&sk_start_pipeline))start_pipeline;
-        while (x + stride <= limit) {
-            start(x, program.get(), &kConstants);
-            x += stride;
+            while (x + stride <= limit) {
+                start_pipeline(x, program.get(), &kConstants);
+                x += stride;
+            }
         }
-    }
+        return true;
+    };
 
-    // If there's any leftover, build and run stride=1 portable code.
-    if (x < limit) {
-        stride = 1;
-
-        void** ip = program.get();
-        for (auto&& st : fStages) {
-            auto fn = portable_lookup(st.stage);
-            if (!fn) {
-                return false;
-            }
-            *ip++ = fn;
-            *ip++ = st.ctx;
-        }
-        *ip = (void*)sk_just_return;
-
-        auto start = sk_start_pipeline;
-        while (x + stride <= limit) {
-            start(x, program.get(), &kConstants);
-            x += stride;
-        }
-    }
-
-    return true;
+    // While possible, build and run at full vector stride.
+#if defined(__x86_64__) || defined(_M_X64)
+    if (1 && SkCpu::Supports(SkCpu::HSW)) {
+        if (!build_and_run(8, lookup_hsw, sk_just_return_hsw, sk_start_pipeline_hsw)) {
+            return false;
+        }
+    }
+    if (1 && SkCpu::Supports(SkCpu::SSE41)) {
+        if (!build_and_run(4, lookup_sse41, sk_just_return_sse41, sk_start_pipeline_sse41)) {
+            return false;
+        }
+    }
+    if (1 && SkCpu::Supports(SkCpu::SSE2)) {
+        if (!build_and_run(4, lookup_sse2, sk_just_return_sse2, sk_start_pipeline_sse2)) {
+            return false;
+        }
+    }
+#endif
+
+    // Finish up any leftover with portable code one pixel at a time.
+    return build_and_run(1, lookup_portable, sk_just_return, sk_start_pipeline);
 }
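The shape of the program built above: a flat array alternating a stage function pointer with that stage's context pointer, terminated by just_return, with each stage popping the next entry and tail-calling onward so there is no per-pixel interpreter loop. A self-contained sketch of that dispatch scheme (hypothetical stage and names; the real stages carry pixels in vector registers and rely on guaranteed tail calls):

    #include <cstdio>

    // Each entry in program[] is either a stage function or its context.
    using Stage = void(void** program, float* px);

    // Pop the next pointer off the program and advance, like load_and_inc.
    static void* load_and_inc(void**& program) { return *program++; }

    static void next(void** program, float* px) {
        // Casting void* to a function pointer mirrors what SkJumper does.
        auto fn = (Stage*)load_and_inc(program);
        fn(program, px);   // in SkJumper this is a guaranteed tail call
    }

    // A stage: pop our ctx, do our work, then chain to the next stage.
    static void scale(void** program, float* px) {
        float factor = *(float*)load_and_inc(program);
        *px *= factor;
        next(program, px);
    }

    static void just_return(void**, float*) {}

    int main() {
        float factor = 0.5f, px = 8.0f;
        void* program[] = { (void*)scale, &factor, (void*)just_return };
        next(program, &px);          // the start_pipeline analogue
        std::printf("%g\n", px);     // prints 4
    }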
src/jumper/SkJumper_generated_x86_64.s (new file, 1982 lines)
File diff suppressed because it is too large.
src/jumper/SkJumper_stages.cpp
@@ -32,6 +32,8 @@ using K = const SkJumper_constants;
 
     static F gather(const float* p, U32 ix) { return p[ix]; }
 
+    #define WRAP(name) sk_##name
+
 #elif defined(__aarch64__)
     #include <arm_neon.h>
 
@@ -53,6 +55,8 @@ using K = const SkJumper_constants;
 
     static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
 
+    #define WRAP(name) sk_##name##_aarch64
+
 #elif defined(__ARM_NEON__)
     #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
         #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
@@ -76,6 +80,8 @@ using K = const SkJumper_constants;
 
     static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
 
+    #define WRAP(name) sk_##name##_armv7
+
 #elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
     #include <immintrin.h>
 
@@ -96,6 +102,8 @@ using K = const SkJumper_constants;
 
     static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); }
 
+    #define WRAP(name) sk_##name##_hsw
+
 #elif defined(__SSE2__)
     #include <immintrin.h>
 
@@ -120,6 +128,12 @@ using K = const SkJumper_constants;
     }
 
     static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
+
+    #if defined(__SSE4_1__)
+        #define WRAP(name) sk_##name##_sse41
+    #else
+        #define WRAP(name) sk_##name##_sse2
+    #endif
 #endif
 
 // We need to be a careful with casts.
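Since SkJumper_stages.cpp is compiled once per instruction set, WRAP is what keeps the resulting object files from colliding: the same source defines sk_foo, sk_foo_aarch64, sk_foo_hsw, and so on, depending on which predefined macros are in play. A tiny illustration of the token pasting involved:

    #define WRAP(name) sk_##name##_hsw
    extern "C" void WRAP(just_return)(void);   // declares sk_just_return_hsw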
@@ -190,7 +204,7 @@ static void* load_and_inc(void**& program) {
 #define STAGE(name)                                                           \
     static void name##_k(size_t& x, void* ctx, K* k,                          \
                          F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
-    extern "C" void sk_##name(size_t x, void** program, K* k,                 \
+    extern "C" void WRAP(name)(size_t x, void** program, K* k,                \
                               F r, F g, F b, F a, F dr, F dg, F db, F da) {   \
         auto ctx = load_and_inc(program);                                     \
         name##_k(x,ctx,k, r,g,b,a, dr,dg,db,da);                              \
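Concretely, each STAGE(name) use splits a stage into a private body (name##_k) plus an extern "C" wrapper that owns the calling convention. Under the hsw WRAP, STAGE(srcover) would expand to roughly the following (srcover is an illustrative stage name; the macro's remaining lines, not shown in this hunk, perform the tail call into the next stage):

    static void srcover_k(size_t& x, void* ctx, K* k,
                          F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);
    extern "C" void sk_srcover_hsw(size_t x, void** program, K* k,
                                   F r, F g, F b, F a, F dr, F dg, F db, F da) {
        auto ctx = load_and_inc(program);
        srcover_k(x,ctx,k, r,g,b,a, dr,dg,db,da);
        // ...followed by the chain into the next stage.
    }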
@@ -202,7 +216,7 @@ static void* load_and_inc(void**& program) {
 
 // Some glue stages that don't fit the normal pattern of stages.
 
-extern "C" void sk_start_pipeline(size_t x, void** program, K* k) {
+extern "C" void WRAP(start_pipeline)(size_t x, void** program, K* k) {
     auto next = (Stage*)load_and_inc(program);
     F v{};   // TODO: faster uninitialized?
     next(x,program,k, v,v,v,v, v,v,v,v);
@@ -210,13 +224,17 @@ extern "C" void sk_start_pipeline(size_t x, void** program, K* k) {
 
 #if defined(JUMPER) && defined(__x86_64__)
     __attribute__((ms_abi))
-    extern "C" void sk_start_pipeline_ms(size_t x, void** program, K* k) {
-        sk_start_pipeline(x,program,k);
+    extern "C" void WRAP(start_pipeline_ms)(size_t x, void** program, K* k) {
+        WRAP(start_pipeline)(x,program,k);
     }
 #endif
 
 // Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
-extern "C" void sk_just_return(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
+extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {
+#if defined(JUMPER) && defined(__AVX2__)
+    asm("vzeroupper");
+#endif
+}
 
 // We can now define Stages!
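Two pieces of glue above deserve a word. The vzeroupper added to the AVX2 just_return zeroes the upper halves of the ymm registers before control returns to non-AVX code, avoiding the AVX-to-SSE transition penalty on Intel CPUs. And start_pipeline_ms exists because these stages are compiled with the System V x86-64 calling convention even when the rest of Skia is built for Windows; __attribute__((ms_abi)) gives Windows callers an entry point in their own convention. A minimal sketch of that ABI bridge (hypothetical names):

    // Built by clang/gcc, this uses the System V x86-64 ABI by default.
    extern "C" void work(int x) { /* ... */ }

    // The attribute switches just this function to the Windows x64
    // calling convention, so MSVC-compiled code can call it directly.
    __attribute__((ms_abi))
    extern "C" void work_ms(int x) { work(x); }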
src/jumper/build_stages.py
@@ -9,8 +9,6 @@ import re
 import subprocess
 import sys
 
-sys.stdout = open('src/jumper/SkJumper_generated.h', 'w')
-
 ndk = '/Users/mtklein/brew/opt/android-ndk/'
 objdump = 'gobjdump'
 
@@ -52,30 +50,23 @@ subprocess.check_call(['clang++'] + cflags + armv7 +
                       ['-c', 'src/jumper/SkJumper_stages.cpp'] +
                       ['-o', 'armv7.o'])
 
-def parse_object_file(dot_o, array_type, target=None):
-  prefix = dot_o.replace('.o', '_')
+def parse_object_file(dot_o, target=None):
   cmd = [ objdump, '-d', '--insn-width=9', dot_o]
   if target:
     cmd += ['--target', target]
 
-  active = False
   for line in subprocess.check_output(cmd).split('\n'):
     line = line.strip()
 
-    if line.startswith(dot_o) or line.startswith('Disassembly'):
-      continue
-
-    if not line:
-      if active:
-        print '};'
-        active = False
+    if not line or line.startswith(dot_o) or line.startswith('Disassembly'):
      continue
 
     # E.g. 00000000000003a4 <_load_f16>:
     m = re.match('''[0-9a-f]+ <_?(.*)>:''', line)
     if m:
-      print 'static const', array_type, prefix + m.group(1) + '[] = {'
-      active = True
+      print
+      print '.globl _' + m.group(1)
+      print '_' + m.group(1) + ':'
      continue
 
     columns = line.split('\t')
@@ -93,26 +84,26 @@ def parse_object_file(dot_o, array_type, target=None):
     for arg in args:
       assert 'rip' not in arg   # TODO: detect on aarch64 too
 
-    hexed = ''.join('0x'+x+',' for x in code.split(' '))
-    print '  ' + hexed + ' '*(48-len(hexed)) + \
-          '// ' + inst + (' '*(14-len(inst)) + args if args else '')
+    hexed = ','.join('0x'+x for x in code.split(' '))
+    print '  ' + '.byte ' + hexed + ' '*(48-len(hexed)) + \
+          '# ' + inst + (' '*(14-len(inst)) + args if args else '')
 
-print '''/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef SkJumper_generated_DEFINED
-#define SkJumper_generated_DEFINED
-
-// This file is generated semi-automatically with this command:
-//   $ src/jumper/build_stages.py
-'''
+sys.stdout = open('src/jumper/SkJumper_generated_x86_64.s', 'w')
 
-parse_object_file('aarch64.o', 'unsigned int')
-parse_object_file('armv7.o',  'unsigned int', target='elf32-littlearm')
-parse_object_file('hsw.o',    'unsigned char')
-parse_object_file('sse41.o',  'unsigned char')
-parse_object_file('sse2.o',   'unsigned char')
-print '#endif//SkJumper_generated_DEFINED'
+print '''# Copyright 2017 Google Inc.
+#
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# This file is generated semi-automatically with this command:
+#   $ src/jumper/build_stages.py
+'''
+
+print '.text'
+parse_object_file('hsw.o')
+parse_object_file('sse41.o')
+parse_object_file('sse2.o')
+
+#parse_object_file('aarch64.o')
+#parse_object_file('armv7.o', target='elf32-littlearm')
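Net effect on the generator: instead of printing a C header full of unsigned-char arrays, the script now prints a plain assembly file that the normal toolchain assembles and links, so the stage entry points become ordinary global symbols. Given the prints above, the emitted file looks roughly like this (bytes shown are illustrative):

    .text

    .globl _sk_just_return_hsw
    _sk_just_return_hsw:
      .byte 0xc5,0xf8,0x77                            # vzeroupper
      .byte 0xc3                                      # ret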