remove vfpv4 requirement for SkJumper on ARMv7

VFPv4 gives us two interesting features:
  - FMA
  - f16<->f32 conversions

Even without FMAs, NEON still has non-fused MLA instructions.  We don't
really care about the fusedness of those mul-adds, so losing FMA here is
kind of no big deal.

We already maintain portable code to do f16<->f32 conversions, so it's
not much of a maintanence hit to use that instead of the native
instructions.  To my knowledge software F16 rendering is not a
performance critical mode of operation for any of our users.

This drops our minimum requirement to basically just having NEON.
Devices like the Nexus 7 2012 will now take SkJumper fast paths
instead of portable code.  (Though actually, we've only ever
required NEON for _lowp... only the float code also needed vfpv4).

The main file to look at here is actually SkJumper_vectors.h,
where you will see all the substantive changes.  The rest just
kind of tears down most of the old complexity, add adds ABI
to put just a little of it back.  :)

Change-Id: Ia9237117698729c91e5fa51126baf80748093bf4
Bug: skia:
Reviewed-on: https://skia-review.googlesource.com/83521
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Florin Malita <fmalita@chromium.org>
This commit is contained in:
Mike Klein 2017-12-11 16:53:26 -05:00 committed by Skia Commit-Bot
parent ba096c0dc3
commit 376fd31ad4
8 changed files with 51 additions and 9555 deletions

View File

@ -78,13 +78,6 @@ extern "C" {
#if !SK_JUMPER_USE_ASSEMBLY
// We'll just run baseline code.
#elif defined(__arm__)
StartPipelineFn ASM(start_pipeline,vfp4);
StageFn ASM(just_return,vfp4);
#define M(st) StageFn ASM(st,vfp4);
SK_RASTER_PIPELINE_STAGES(M)
#undef M
#elif defined(__x86_64__) || defined(_M_X64)
StartPipelineFn ASM(start_pipeline, skx),
ASM(start_pipeline, hsw),
@ -135,7 +128,6 @@ extern "C" {
#undef M
#if defined(JUMPER_HAS_NEON_LOWP)
// We also compile 8-bit stages on ARMv8 as a normal part of Skia when compiled with Clang.
StartPipelineFn sk_start_pipeline_lowp;
StageFn sk_just_return_lowp;
#define M(st) StageFn sk_##st##_lowp;
@ -295,17 +287,6 @@ static SkJumper_Engine choose_engine() {
#if !SK_JUMPER_USE_ASSEMBLY
// We'll just run baseline code.
#elif defined(__arm__)
if (1 && SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
return {
#define M(stage) ASM(stage, vfp4),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline)
M(just_return)
#undef M
};
}
#elif defined(__x86_64__) || defined(_M_X64)
#if !defined(_MSC_VER) // No _skx stages for Windows yet.
if (1 && SkCpu::Supports(SkCpu::SKX)) {

View File

@ -8,6 +8,9 @@
#ifndef SkJumper_DEFINED
#define SkJumper_DEFINED
#include <stddef.h>
#include <stdint.h>
// This file contains definitions shared by SkJumper.cpp (compiled normally as part of Skia)
// and SkJumper_stages.cpp (compiled into Skia _and_ offline into SkJumper_generated.h).
// Keep it simple!
@ -21,36 +24,18 @@
#define MAYBE_MSABI
#endif
#if defined(JUMPER_IS_OFFLINE) && (defined(__aarch64__) || defined(__arm__))
// To reduce SkJumper's dependency on the Android NDK,
// we provide what we need from <string.h>, <stdint.h>, and <stddef.h> ourselves.
#define memcpy __builtin_memcpy
using int8_t = signed char;
using uint8_t = unsigned char;
using int16_t = signed short;
using uint16_t = unsigned short;
using int32_t = signed int;
using uint32_t = unsigned int;
#if defined(__aarch64__)
using int64_t = signed long;
using uint64_t = unsigned long;
using size_t = uint64_t;
#else
using int64_t = signed long long;
using uint64_t = unsigned long long;
using size_t = uint32_t;
#endif
// Now pretend we've included <stdint.h> (or it'll be included again by <arm_neon.h>).
#define __CLANG_STDINT_H
#define _STDINT_H_
// Any custom ABI to use for all non-externally-facing stage functions.
#if defined(__ARM_NEON) && defined(__arm__)
// This lets us pass vectors more efficiently on 32-bit ARM.
#define ABI __attribute__((pcs("aapcs-vfp")))
#else
#include <string.h>
#include <stdint.h>
#define ABI
#endif
// When compiled with Clang on ARM, we'll have 8-bit NEON stages.
// On ARM we expect that you're using Clang if you want SkJumper to be fast.
// If you are, the baseline float stages will use NEON, and lowp stages will
// also be available. (If somehow you're building for ARM not using Clang,
// you'll get scalar baseline stages and no lowp support.)
#if defined(__clang__) && defined(__ARM_NEON)
#define JUMPER_HAS_NEON_LOWP
#endif

File diff suppressed because it is too large Load Diff

View File

@ -8,7 +8,7 @@
#ifndef SkJumper_misc_DEFINED
#define SkJumper_misc_DEFINED
#include "SkJumper.h" // for memcpy()
#include <string.h> // for memcpy()
// Miscellany used by SkJumper_stages.cpp and SkJumper_vectors.h.

View File

@ -50,17 +50,17 @@ static const size_t N = sizeof(F) / sizeof(float);
size_t dx, dy, tail;
F dr,dg,db,da;
};
using Stage = void(Params*, void** program, F r, F g, F b, F a);
using Stage = void(ABI*)(Params*, void** program, F r, F g, F b, F a);
#else
// We keep program the second argument, so that it's passed in rsi for load_and_inc().
using Stage = void(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F);
using Stage = void(ABI*)(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F);
#endif
MAYBE_MSABI
extern "C" void WRAP(start_pipeline)(size_t dx, size_t dy, size_t xlimit, size_t ylimit,
void** program) {
auto start = (Stage*)load_and_inc(program);
extern "C" MAYBE_MSABI void WRAP(start_pipeline)(size_t dx, size_t dy, size_t xlimit, size_t ylimit,
void** program) {
auto start = (Stage)load_and_inc(program);
const size_t x0 = dx;
for (; dy < ylimit; dy++) {
#if defined(__i386__) || defined(_M_IX86) || defined(__arm__)
@ -90,26 +90,26 @@ extern "C" void WRAP(start_pipeline)(size_t dx, size_t dy, size_t xlimit, size_t
#define STAGE(name, ...) \
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
extern "C" void WRAP(name)(Params* params, void** program, \
F r, F g, F b, F a) { \
extern "C" ABI void WRAP(name)(Params* params, void** program, \
F r, F g, F b, F a) { \
name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a, \
params->dr, params->dg, params->db, params->da); \
auto next = (Stage*)load_and_inc(program); \
auto next = (Stage)load_and_inc(program); \
next(params,program, r,g,b,a); \
} \
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#else
#define STAGE(name, ...) \
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
F r, F g, F b, F a, F dr, F dg, F db, F da) { \
name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da); \
auto next = (Stage*)load_and_inc(program); \
next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
} \
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
#define STAGE(name, ...) \
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
F r, F g, F b, F a, F dr, F dg, F db, F da) { \
name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da); \
auto next = (Stage)load_and_inc(program); \
next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
} \
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#endif
@ -117,9 +117,9 @@ extern "C" void WRAP(start_pipeline)(size_t dx, size_t dy, size_t xlimit, size_t
// just_return() is a simple no-op stage that only exists to end the chain,
// returning back up to start_pipeline(), and from there to the caller.
#if defined(__i386__) || defined(_M_IX86) || defined(__arm__)
extern "C" void WRAP(just_return)(Params*, void**, F,F,F,F) {}
extern "C" ABI void WRAP(just_return)(Params*, void**, F,F,F,F) {}
#else
extern "C" void WRAP(just_return)(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {}
extern "C" ABI void WRAP(just_return)(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {}
#endif

View File

@ -16,17 +16,10 @@
#if defined(__ARM_NEON)
#include <arm_neon.h>
#if defined(__arm__)
#define ABI __attribute__((pcs("aapcs-vfp")))
#else
#define ABI
#endif
#elif defined(__SSE2__)
#include <immintrin.h>
#define ABI
#else
#include <math.h>
#define ABI
#endif
#if !defined(JUMPER_IS_OFFLINE)
@ -62,12 +55,11 @@ using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
U16 r, U16 g, U16 b, U16 a,
U16 dr, U16 dg, U16 db, U16 da);
MAYBE_MSABI
ABI extern "C" void WRAP(start_pipeline)(const size_t x0,
const size_t y0,
const size_t xlimit,
const size_t ylimit,
void** program) {
extern "C" MAYBE_MSABI void WRAP(start_pipeline)(const size_t x0,
const size_t y0,
const size_t xlimit,
const size_t ylimit,
void** program) {
auto start = (Stage)load_and_inc(program);
for (size_t dy = y0; dy < ylimit; dy++) {
size_t dx = x0;
@ -80,7 +72,7 @@ ABI extern "C" void WRAP(start_pipeline)(const size_t x0,
}
}
ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
extern "C" ABI void WRAP(just_return)(size_t,void**,size_t,size_t,
U16,U16,U16,U16, U16,U16,U16,U16) {}
// All stages use the same function call ABI to chain into each other, but there are three types:
@ -95,7 +87,7 @@ ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
#define STAGE_GG(name, ...) \
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
U16 r, U16 g, U16 b, U16 a, \
U16 dr, U16 dg, U16 db, U16 da) { \
auto x = join<F>(r,g), \
@ -112,7 +104,7 @@ ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
U16& r, U16& g, U16& b, U16& a, \
U16& dr, U16& dg, U16& db, U16& da); \
ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
U16 r, U16 g, U16 b, U16 a, \
U16 dr, U16 dg, U16 db, U16 da) { \
auto x = join<F>(r,g), \
@ -129,7 +121,7 @@ ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
U16& r, U16& g, U16& b, U16& a, \
U16& dr, U16& dg, U16& db, U16& da); \
ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
U16 r, U16 g, U16 b, U16 a, \
U16 dr, U16 dg, U16 db, U16 da) { \
name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \

View File

@ -10,6 +10,7 @@
#include "SkJumper.h"
#include "SkJumper_misc.h"
#include <stdint.h>
// This file contains vector types that SkJumper_stages.cpp uses to define stages.
@ -17,7 +18,7 @@
#if !defined(__clang__)
#define JUMPER_IS_SCALAR
#elif defined(__aarch64__) || defined(__ARM_VFPV4__)
#elif defined(__ARM_NEON)
#define JUMPER_IS_NEON
#elif defined(__AVX512F__)
#define JUMPER_IS_AVX512
@ -105,7 +106,6 @@
using U8 = V<uint8_t >;
// We polyfill a few routines that Clang doesn't build into ext_vector_types.
SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
SI F min(F a, F b) { return vminq_f32(a,b); }
SI F max(F a, F b) { return vmaxq_f32(a,b); }
SI F abs_ (F v) { return vabsq_f32(v); }
@ -117,10 +117,12 @@
SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
#if defined(__aarch64__)
SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
SI F floor_(F v) { return vrndmq_f32(v); }
SI F sqrt_(F v) { return vsqrtq_f32(v); }
SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
#else
SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); }
SI F floor_(F v) {
F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
return roundtrip - if_then_else(roundtrip > v, 1, 0);
@ -643,7 +645,7 @@ SI F approx_powf(F x, F y) {
}
SI F from_half(U16 h) {
#if defined(JUMPER_IS_NEON)
#if defined(__aarch64__)
return vcvt_f32_f16(h);
#elif defined(JUMPER_IS_AVX2) || defined(JUMPER_IS_AVX512)
@ -663,7 +665,7 @@ SI F from_half(U16 h) {
}
SI U16 to_half(F f) {
#if defined(JUMPER_IS_NEON)
#if defined(__aarch64__)
return vcvt_f16_f32(f);
#elif defined(JUMPER_IS_AVX2) || defined(JUMPER_IS_AVX512)

View File

@ -120,14 +120,6 @@ subprocess.check_call(['ld', '-r', '-o', 'win_x86_merged.o',
'win_x86_sse2.o',
'win_x86_lowp_sse2.o'])
vfp4 = [
'--target=armv7a-linux-gnueabihf',
'-mfpu=neon-vfpv4',
]
subprocess.check_call(clang + cflags + vfp4 +
['-c', stages] +
['-o', 'vfp4.o'])
def parse_object_file(dot_o, directive, target=None):
globl, hidden, label, comment, align = \
'.globl', 'HIDDEN', ':', '// ', 'BALIGN'
@ -235,11 +227,7 @@ print ' #define BALIGN32 .balign 32'
print '#endif'
print '.text'
print '#if defined(__arm__)'
print 'BALIGN4'
parse_object_file('vfp4.o', '.long', target='elf32-littlearm')
print '#elif defined(__x86_64__)'
print '#if defined(__x86_64__)'
print 'BALIGN32'
parse_object_file('merged.o', '.byte')