SkJumper: start on asm

Will follow up with Linux, then Android aarch64 and armv7, then iOS, then Windows.

I took some opportunities to refactor.

CQ_INCLUDE_trybots=skia.primary:Test-Mac-Clang-MacMini6.2-CPU-AVX-x86_64-Debug,Perf-Mac-Clang-MacMini6.2-CPU-AVX-x86_64-Debug

Change-Id: Ifcf1edabdfe5df0a91bd089f09523aba95cdf5ef
Reviewed-on: https://skia-review.googlesource.com/8611
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
Authored by Mike Klein on 2017-02-17 09:41:09 -05:00; committed by Skia Commit-Bot
parent 2e777ead12
commit d1fe9522e3
6 changed files with 2134 additions and 3162 deletions

File: BUILD.gn

@@ -29,7 +29,7 @@ declare_args() {
skia_enable_android_framework_defines = false
skia_enable_discrete_gpu = true
skia_enable_effects = true
skia_enable_jumper = false
skia_enable_jumper = is_skia_standalone && is_mac
skia_enable_gpu = true
skia_enable_pdf = true
skia_enable_tools = is_skia_standalone
@@ -498,6 +498,7 @@ optional("jumper") {
public_defines = [ "SK_JUMPER" ]
sources = [
"src/jumper/SkJumper.cpp",
"src/jumper/SkJumper_generated_x86_64.s",
"src/jumper/SkJumper_stages.cpp",
]
}

File: src/jumper/SkJumper.cpp

@@ -7,13 +7,14 @@
#include "SkCpu.h"
#include "SkJumper.h"
#include "SkJumper_generated.h"
#include "SkRasterPipeline.h"
#include "SkTemplates.h"
// Stages expect these constants to be set to these values.
// It's fine to rearrange and add new ones if you update SkJumper_constants.
static const SkJumper_constants kConstants = {
using K = const SkJumper_constants;
static K kConstants = {
1.0f, 0.5f, 255.0f, 1/255.0f, 0x000000ff,
{0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f},
0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f, // from_srgb
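For orientation, the matching struct could look roughly like this (a sketch with hypothetical field names; the real definition lives in SkJumper.h, and field order must track this initializer exactly, since the assembled stages read these constants at fixed offsets):

struct SkJumper_constants {
    float    one, half, u8_max, inv_u8_max;   // 1.0f, 0.5f, 255.0f, 1/255.0f
    uint32_t low_byte_mask;                   // 0x000000ff
    float    iota[8];                         // {0,1,...,7}: per-lane x offsets
    float    from_srgb[5];                    // sRGB -> linear conversion terms
    // ...more constants follow as stages need them.
};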
@@ -50,171 +51,125 @@ static const SkJumper_constants kConstants = {
M(clamp_y) \
M(linear_gradient_2stops)
// Declare the portable, single pixel stages that are linked into Skia from SkJumper_stages.o.
extern "C" {
void sk_start_pipeline(size_t, void**, const SkJumper_constants*);
// We can't express the real types of most stage functions portably, so we use a stand-in.
// We'll only ever call start_pipeline(), which then chains into the rest for us.
using StageFn = void(void);
// We use void() as a convenient stand-in for the real stage function type.
// We never call these directly, so we don't really need to know their real types.
void sk_just_return(void);
#define M(st) void sk_##st(void);
STAGES(M)
#undef M
extern "C" {
#if defined(__x86_64__) || defined(_M_X64)
void sk_start_pipeline_hsw (size_t, void**, K*);
void sk_start_pipeline_sse41(size_t, void**, K*);
void sk_start_pipeline_sse2 (size_t, void**, K*);
StageFn sk_just_return_hsw,
sk_just_return_sse41,
sk_just_return_sse2;
#define M(st) StageFn sk_##st##_hsw;
STAGES(M)
#undef M
#define M(st) StageFn sk_##st##_sse41;
STAGES(M)
#undef M
#define M(st) StageFn sk_##st##_sse2;
STAGES(M)
#undef M
#endif
// Portable, single-pixel stages.
void sk_start_pipeline(size_t, void**, K*);
StageFn sk_just_return;
#define M(st) StageFn sk_##st;
STAGES(M)
#undef M
}
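The stand-in type is safe because these symbols are only ever stored and passed along, never called through the wrong type; only start_pipeline(), whose signature is portable, gets called directly. A minimal sketch of the idea (sk_demo_stage is hypothetical):

using StageFn = void(void);
extern "C" StageFn sk_demo_stage;        // real signature differs per backend
static void* demo_program_entry() {
    return (void*)sk_demo_stage;         // taking the address is always fine
}
// Calling sk_demo_stage() through StageFn would be undefined behavior; the
// pipeline never does that, it only jumps between stages whose real types match.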
// Translate SkRasterPipeline's enum to pointers to our portable, single pixel stages.
static void* portable_lookup(SkRasterPipeline::StockStage st) {
// Translate SkRasterPipeline's StockStage enum to StageFn function pointers.
#if defined(__x86_64__) || defined(_M_X64)
static StageFn* lookup_hsw(SkRasterPipeline::StockStage st) {
switch (st) {
default: return nullptr;
#define M(st) case SkRasterPipeline::st: return sk_##st##_hsw;
STAGES(M)
#undef M
}
}
static StageFn* lookup_sse41(SkRasterPipeline::StockStage st) {
switch (st) {
default: return nullptr;
#define M(st) case SkRasterPipeline::st: return sk_##st##_sse41;
STAGES(M)
#undef M
}
}
static StageFn* lookup_sse2(SkRasterPipeline::StockStage st) {
switch (st) {
default: return nullptr;
#define M(st) case SkRasterPipeline::st: return sk_##st##_sse2;
STAGES(M)
#undef M
}
}
#endif
static StageFn* lookup_portable(SkRasterPipeline::StockStage st) {
switch (st) {
default: return nullptr;
#define M(st) case SkRasterPipeline::st: return (void*)sk_##st;
#define M(st) case SkRasterPipeline::st: return sk_##st;
STAGES(M)
#undef M
}
}
// The non-portable options are pre-compiled static data arrays pulled in from SkJumper_generated.h.
#if defined(__aarch64__)
static void* aarch64_lookup(SkRasterPipeline::StockStage st) {
switch (st) {
default: return nullptr;
#define M(st) case SkRasterPipeline::st: return (void*)aarch64_sk_##st;
STAGES(M)
#undef M
}
}
#elif defined(__ARM_NEON__)
static void* armv7_lookup(SkRasterPipeline::StockStage st) {
switch (st) {
default: return nullptr;
#define M(st) case SkRasterPipeline::st: return (void*)armv7_sk_##st;
STAGES(M)
#undef M
}
}
#elif defined(__x86_64__) || defined(_M_X64)
static void* sse2_lookup(SkRasterPipeline::StockStage st) {
switch (st) {
default: return nullptr;
#define M(st) case SkRasterPipeline::st: return (void*)sse2_sk_##st;
STAGES(M)
#undef M
}
}
static void* sse41_lookup(SkRasterPipeline::StockStage st) {
switch (st) {
default: return nullptr;
#define M(st) case SkRasterPipeline::st: return (void*)sse41_sk_##st;
STAGES(M)
#undef M
}
}
static void* hsw_lookup(SkRasterPipeline::StockStage st) {
switch (st) {
default: return nullptr;
#define M(st) case SkRasterPipeline::st: return (void*)hsw_sk_##st;
STAGES(M)
#undef M
}
}
#endif
bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
// We'll look for the best vector instruction set and stride we can use.
size_t stride = 0;
void* (*lookup)(SkRasterPipeline::StockStage) = nullptr;
void* start_pipeline = nullptr;
void* just_return = nullptr;
#if defined(__aarch64__)
stride = 4;
lookup = aarch64_lookup;
start_pipeline = (void*)aarch64_sk_start_pipeline;
just_return = (void*)aarch64_sk_just_return;
#elif defined(__ARM_NEON__)
if (SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
stride = 2;
lookup = armv7_lookup;
start_pipeline = (void*)armv7_sk_start_pipeline;
just_return = (void*)armv7_sk_just_return;
}
#elif defined(__x86_64__) || defined(_M_X64)
stride = 4;
lookup = sse2_lookup;
start_pipeline = (void*)sse2_sk_start_pipeline;
just_return = (void*)sse2_sk_just_return;
if (SkCpu::Supports(SkCpu::SSE41)) {
stride = 4;
lookup = sse41_lookup;
start_pipeline = (void*)sse41_sk_start_pipeline;
just_return = (void*)sse41_sk_just_return;
}
if (SkCpu::Supports(SkCpu::HSW)) {
stride = 8;
lookup = hsw_lookup;
start_pipeline = (void*)hsw_sk_start_pipeline;
just_return = (void*)hsw_sk_just_return;
}
#endif
#if defined(_MSC_VER)
if (start_pipeline == (void*)sse2_sk_start_pipeline) {
start_pipeline = (void*)sse2_sk_start_pipeline_ms;
}
if (start_pipeline == (void*)sse41_sk_start_pipeline) {
start_pipeline = (void*)sse41_sk_start_pipeline_ms;
}
if (start_pipeline == (void*)hsw_sk_start_pipeline) {
start_pipeline = (void*)hsw_sk_start_pipeline_ms;
}
#endif
SkAutoSTMalloc<64, void*> program(2*fStages.size() + 1);
// If possible, build a program and run it at full vector stride.
const size_t limit = x+n;
if (stride) {
void** ip = program.get();
for (auto&& st : fStages) {
auto fn = lookup(st.stage);
if (!fn) {
return false;
auto build_and_run = [&](size_t stride,
StageFn* (*lookup)(SkRasterPipeline::StockStage),
StageFn* just_return,
void (*start_pipeline)(size_t, void**, K*)) {
if (x + stride <= limit) {
void** ip = program.get();
for (auto&& st : fStages) {
auto fn = lookup(st.stage);
if (!fn) {
return false;
}
*ip++ = (void*)fn;
*ip++ = st.ctx;
}
*ip++ = fn;
*ip++ = st.ctx;
}
*ip = (void*)just_return;
*ip = (void*)just_return;
auto start = (decltype(&sk_start_pipeline))start_pipeline;
while (x + stride <= limit) {
start(x, program.get(), &kConstants);
x += stride;
while (x + stride <= limit) {
start_pipeline(x, program.get(), &kConstants);
x += stride;
}
}
return true;
};
// While possible, build and run at full vector stride.
#if defined(__x86_64__) || defined(_M_X64)
if (1 && SkCpu::Supports(SkCpu::HSW)) {
if (!build_and_run(8, lookup_hsw, sk_just_return_hsw, sk_start_pipeline_hsw)) {
return false;
}
}
// If there's any leftover, build and run stride=1 portable code.
if (x < limit) {
stride = 1;
void** ip = program.get();
for (auto&& st : fStages) {
auto fn = portable_lookup(st.stage);
if (!fn) {
return false;
}
*ip++ = fn;
*ip++ = st.ctx;
}
*ip = (void*)sk_just_return;
auto start = sk_start_pipeline;
while (x + stride <= limit) {
start(x, program.get(), &kConstants);
x += stride;
if (1 && SkCpu::Supports(SkCpu::SSE41)) {
if (!build_and_run(4, lookup_sse41, sk_just_return_sse41, sk_start_pipeline_sse41)) {
return false;
}
}
if (1 && SkCpu::Supports(SkCpu::SSE2)) {
if (!build_and_run(4, lookup_sse2, sk_just_return_sse2, sk_start_pipeline_sse2)) {
return false;
}
}
#endif
return true;
// Finish up any leftover with portable code one pixel at a time.
return build_and_run(1, lookup_portable, sk_just_return, sk_start_pipeline);
}
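To make the program format concrete, here is a sketch of what run_with_jumper builds (stage choices and contexts are hypothetical; a ctx slot is written for every stage, used or not):

void* program[] = {
    (void*)sk_load_8888,   (void*)&src_ctx,   // read source pixels
    (void*)sk_srcover,     nullptr,           // blend over destination
    (void*)sk_store_8888,  (void*)&dst_ctx,   // write results
    (void*)sk_just_return,                    // terminates the tail-call chain
};
sk_start_pipeline(x, program, &kConstants);   // runs one stride's worth at x

With n = 10 pixels and HSW's stride of 8, for example, one 8-wide pass runs, and the remaining 2 pixels fall through to the portable stride-1 program.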

File: src/jumper/SkJumper_generated.h (diff suppressed because it is too large)

File: src/jumper/SkJumper_generated_x86_64.s (diff suppressed because it is too large)

File: src/jumper/SkJumper_stages.cpp

@@ -32,6 +32,8 @@ using K = const SkJumper_constants;
static F gather(const float* p, U32 ix) { return p[ix]; }
#define WRAP(name) sk_##name
#elif defined(__aarch64__)
#include <arm_neon.h>
@@ -53,6 +55,8 @@ using K = const SkJumper_constants;
static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
#define WRAP(name) sk_##name##_aarch64
#elif defined(__ARM_NEON__)
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
#error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
@@ -76,6 +80,8 @@ using K = const SkJumper_constants;
static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
#define WRAP(name) sk_##name##_armv7
#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
#include <immintrin.h>
@@ -96,6 +102,8 @@ using K = const SkJumper_constants;
static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); }
#define WRAP(name) sk_##name##_hsw
#elif defined(__SSE2__)
#include <immintrin.h>
@@ -120,6 +128,12 @@ using K = const SkJumper_constants;
}
static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
#if defined(__SSE4_1__)
#define WRAP(name) sk_##name##_sse41
#else
#define WRAP(name) sk_##name##_sse2
#endif
#endif
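Because each backend compiles this same file with different ISA flags, WRAP is what keeps their symbols from colliding in one binary. A standalone sketch of the token pasting (demo_stage is hypothetical):

#if defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
    #define WRAP(name) sk_##name##_hsw
#elif defined(__SSE4_1__)
    #define WRAP(name) sk_##name##_sse41
#elif defined(__SSE2__)
    #define WRAP(name) sk_##name##_sse2
#else
    #define WRAP(name) sk_##name              // portable build
#endif

extern "C" void WRAP(demo_stage)(void) {}     // e.g. sk_demo_stage_hsw under -mavx2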
// We need to be careful with casts.
@@ -190,7 +204,7 @@ static void* load_and_inc(void**& program) {
#define STAGE(name) \
static void name##_k(size_t& x, void* ctx, K* k, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
extern "C" void sk_##name(size_t x, void** program, K* k, \
extern "C" void WRAP(name)(size_t x, void** program, K* k, \
F r, F g, F b, F a, F dr, F dg, F db, F da) { \
auto ctx = load_and_inc(program); \
name##_k(x,ctx,k, r,g,b,a, dr,dg,db,da); \
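A stage author then writes only the _k body. For instance, a simple blend stage might look like this (a sketch; plus_ is assumed to be among the STAGES, with r,g,b,a the source lanes and dr,dg,db,da the destination lanes):

STAGE(plus_) {
    r = r + dr;
    g = g + dg;
    b = b + db;
    a = a + da;
}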
@@ -202,7 +216,7 @@ static void* load_and_inc(void**& program) {
// Some glue stages that don't fit the normal pattern of stages.
extern "C" void sk_start_pipeline(size_t x, void** program, K* k) {
extern "C" void WRAP(start_pipeline)(size_t x, void** program, K* k) {
auto next = (Stage*)load_and_inc(program);
F v{}; // TODO: faster uninitialized?
next(x,program,k, v,v,v,v, v,v,v,v);
@@ -210,13 +224,17 @@ extern "C" void sk_start_pipeline(size_t x, void** program, K* k) {
#if defined(JUMPER) && defined(__x86_64__)
__attribute__((ms_abi))
extern "C" void sk_start_pipeline_ms(size_t x, void** program, K* k) {
sk_start_pipeline(x,program,k);
extern "C" void WRAP(start_pipeline_ms)(size_t x, void** program, K* k) {
WRAP(start_pipeline)(x,program,k);
}
#endif
// Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
extern "C" void sk_just_return(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {
#if defined(JUMPER) && defined(__AVX2__)
asm("vzeroupper");
#endif
}
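The vzeroupper matters because just_return is the last AVX2 code executed on the way back to SSE-compiled callers; leaving the upper ymm halves dirty can trigger AVX-to-SSE transition penalties. The same idea in intrinsic form, as a sketch:

#include <immintrin.h>

static inline void leave_avx2_code() {
    _mm256_zeroupper();   // clear the upper 128 bits of every ymm register
}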
// We can now define Stages!

File: src/jumper/build_stages.py

@@ -9,8 +9,6 @@ import re
import subprocess
import sys
sys.stdout = open('src/jumper/SkJumper_generated.h', 'w')
ndk = '/Users/mtklein/brew/opt/android-ndk/'
objdump = 'gobjdump'
@@ -52,30 +50,23 @@ subprocess.check_call(['clang++'] + cflags + armv7 +
['-c', 'src/jumper/SkJumper_stages.cpp'] +
['-o', 'armv7.o'])
def parse_object_file(dot_o, array_type, target=None):
prefix = dot_o.replace('.o', '_')
def parse_object_file(dot_o, target=None):
cmd = [ objdump, '-d', '--insn-width=9', dot_o]
if target:
cmd += ['--target', target]
active = False
for line in subprocess.check_output(cmd).split('\n'):
line = line.strip()
if line.startswith(dot_o) or line.startswith('Disassembly'):
continue
if not line:
if active:
print '};'
active = False
if not line or line.startswith(dot_o) or line.startswith('Disassembly'):
continue
# E.g. 00000000000003a4 <_load_f16>:
m = re.match('''[0-9a-f]+ <_?(.*)>:''', line)
if m:
print 'static const', array_type, prefix + m.group(1) + '[] = {'
active = True
print
print '.globl _' + m.group(1)
print '_' + m.group(1) + ':'
continue
columns = line.split('\t')
@@ -93,26 +84,26 @@ def parse_object_file(dot_o, array_type, target=None):
for arg in args:
assert 'rip' not in arg # TODO: detect on aarch64 too
hexed = ''.join('0x'+x+',' for x in code.split(' '))
print ' ' + hexed + ' '*(48-len(hexed)) + \
'// ' + inst + (' '*(14-len(inst)) + args if args else '')
hexed = ','.join('0x'+x for x in code.split(' '))
print '''/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
print ' ' + '.byte ' + hexed + ' '*(48-len(hexed)) + \
'# ' + inst + (' '*(14-len(inst)) + args if args else '')
#ifndef SkJumper_generated_DEFINED
#define SkJumper_generated_DEFINED
sys.stdout = open('src/jumper/SkJumper_generated_x86_64.s', 'w')
// This file is generated semi-automatically with this command:
// $ src/jumper/build_stages.py
print '''# Copyright 2017 Google Inc.
#
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# This file is generated semi-automatically with this command:
# $ src/jumper/build_stages.py
'''
parse_object_file('aarch64.o', 'unsigned int')
parse_object_file('armv7.o', 'unsigned int', target='elf32-littlearm')
parse_object_file('hsw.o', 'unsigned char')
parse_object_file('sse41.o', 'unsigned char')
parse_object_file('sse2.o', 'unsigned char')
print '#endif//SkJumper_generated_DEFINED'
print '.text'
parse_object_file('hsw.o')
parse_object_file('sse41.o')
parse_object_file('sse2.o')
#parse_object_file('aarch64.o')
#parse_object_file('armv7.o', target='elf32-littlearm')
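The emitted .s then looks something like this (a hypothetical excerpt; the bytes and mnemonics are copied straight out of objdump, and the leading underscore matches Mach-O symbol naming):

.globl _sk_just_return_hsw
_sk_just_return_hsw:
  .byte 0xc5,0xf8,0x77                            # vzeroupper
  .byte 0xc3                                      # ret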