remove vfpv4 requirement for SkJumper on ARMv7
VFPv4 gives us two interesting features: - FMA - f16<->f32 conversions Even without FMAs, NEON still has non-fused MLA instructions. We don't really care about the fusedness of those mul-adds, so losing FMA here is kind of no big deal. We already maintain portable code to do f16<->f32 conversions, so it's not much of a maintanence hit to use that instead of the native instructions. To my knowledge software F16 rendering is not a performance critical mode of operation for any of our users. This drops our minimum requirement to basically just having NEON. Devices like the Nexus 7 2012 will now take SkJumper fast paths instead of portable code. (Though actually, we've only ever required NEON for _lowp... only the float code also needed vfpv4). The main file to look at here is actually SkJumper_vectors.h, where you will see all the substantive changes. The rest just kind of tears down most of the old complexity, add adds ABI to put just a little of it back. :) Change-Id: Ia9237117698729c91e5fa51126baf80748093bf4 Bug: skia: Reviewed-on: https://skia-review.googlesource.com/83521 Commit-Queue: Mike Klein <mtklein@chromium.org> Reviewed-by: Florin Malita <fmalita@chromium.org>
This commit is contained in:
parent
ba096c0dc3
commit
376fd31ad4
@ -78,13 +78,6 @@ extern "C" {
|
||||
#if !SK_JUMPER_USE_ASSEMBLY
|
||||
// We'll just run baseline code.
|
||||
|
||||
#elif defined(__arm__)
|
||||
StartPipelineFn ASM(start_pipeline,vfp4);
|
||||
StageFn ASM(just_return,vfp4);
|
||||
#define M(st) StageFn ASM(st,vfp4);
|
||||
SK_RASTER_PIPELINE_STAGES(M)
|
||||
#undef M
|
||||
|
||||
#elif defined(__x86_64__) || defined(_M_X64)
|
||||
StartPipelineFn ASM(start_pipeline, skx),
|
||||
ASM(start_pipeline, hsw),
|
||||
@ -135,7 +128,6 @@ extern "C" {
|
||||
#undef M
|
||||
|
||||
#if defined(JUMPER_HAS_NEON_LOWP)
|
||||
// We also compile 8-bit stages on ARMv8 as a normal part of Skia when compiled with Clang.
|
||||
StartPipelineFn sk_start_pipeline_lowp;
|
||||
StageFn sk_just_return_lowp;
|
||||
#define M(st) StageFn sk_##st##_lowp;
|
||||
@ -295,17 +287,6 @@ static SkJumper_Engine choose_engine() {
|
||||
#if !SK_JUMPER_USE_ASSEMBLY
|
||||
// We'll just run baseline code.
|
||||
|
||||
#elif defined(__arm__)
|
||||
if (1 && SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
|
||||
return {
|
||||
#define M(stage) ASM(stage, vfp4),
|
||||
{ SK_RASTER_PIPELINE_STAGES(M) },
|
||||
M(start_pipeline)
|
||||
M(just_return)
|
||||
#undef M
|
||||
};
|
||||
}
|
||||
|
||||
#elif defined(__x86_64__) || defined(_M_X64)
|
||||
#if !defined(_MSC_VER) // No _skx stages for Windows yet.
|
||||
if (1 && SkCpu::Supports(SkCpu::SKX)) {
|
||||
|
@ -8,6 +8,9 @@
|
||||
#ifndef SkJumper_DEFINED
|
||||
#define SkJumper_DEFINED
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// This file contains definitions shared by SkJumper.cpp (compiled normally as part of Skia)
|
||||
// and SkJumper_stages.cpp (compiled into Skia _and_ offline into SkJumper_generated.h).
|
||||
// Keep it simple!
|
||||
@ -21,36 +24,18 @@
|
||||
#define MAYBE_MSABI
|
||||
#endif
|
||||
|
||||
#if defined(JUMPER_IS_OFFLINE) && (defined(__aarch64__) || defined(__arm__))
|
||||
// To reduce SkJumper's dependency on the Android NDK,
|
||||
// we provide what we need from <string.h>, <stdint.h>, and <stddef.h> ourselves.
|
||||
#define memcpy __builtin_memcpy
|
||||
|
||||
using int8_t = signed char;
|
||||
using uint8_t = unsigned char;
|
||||
using int16_t = signed short;
|
||||
using uint16_t = unsigned short;
|
||||
using int32_t = signed int;
|
||||
using uint32_t = unsigned int;
|
||||
#if defined(__aarch64__)
|
||||
using int64_t = signed long;
|
||||
using uint64_t = unsigned long;
|
||||
using size_t = uint64_t;
|
||||
// Any custom ABI to use for all non-externally-facing stage functions.
|
||||
#if defined(__ARM_NEON) && defined(__arm__)
|
||||
// This lets us pass vectors more efficiently on 32-bit ARM.
|
||||
#define ABI __attribute__((pcs("aapcs-vfp")))
|
||||
#else
|
||||
using int64_t = signed long long;
|
||||
using uint64_t = unsigned long long;
|
||||
using size_t = uint32_t;
|
||||
#define ABI
|
||||
#endif
|
||||
|
||||
// Now pretend we've included <stdint.h> (or it'll be included again by <arm_neon.h>).
|
||||
#define __CLANG_STDINT_H
|
||||
#define _STDINT_H_
|
||||
#else
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
// When compiled with Clang on ARM, we'll have 8-bit NEON stages.
|
||||
// On ARM we expect that you're using Clang if you want SkJumper to be fast.
|
||||
// If you are, the baseline float stages will use NEON, and lowp stages will
|
||||
// also be available. (If somehow you're building for ARM not using Clang,
|
||||
// you'll get scalar baseline stages and no lowp support.)
|
||||
#if defined(__clang__) && defined(__ARM_NEON)
|
||||
#define JUMPER_HAS_NEON_LOWP
|
||||
#endif
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -8,7 +8,7 @@
|
||||
#ifndef SkJumper_misc_DEFINED
|
||||
#define SkJumper_misc_DEFINED
|
||||
|
||||
#include "SkJumper.h" // for memcpy()
|
||||
#include <string.h> // for memcpy()
|
||||
|
||||
// Miscellany used by SkJumper_stages.cpp and SkJumper_vectors.h.
|
||||
|
||||
|
@ -50,17 +50,17 @@ static const size_t N = sizeof(F) / sizeof(float);
|
||||
size_t dx, dy, tail;
|
||||
F dr,dg,db,da;
|
||||
};
|
||||
using Stage = void(Params*, void** program, F r, F g, F b, F a);
|
||||
using Stage = void(ABI*)(Params*, void** program, F r, F g, F b, F a);
|
||||
|
||||
#else
|
||||
// We keep program the second argument, so that it's passed in rsi for load_and_inc().
|
||||
using Stage = void(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F);
|
||||
using Stage = void(ABI*)(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F);
|
||||
#endif
|
||||
|
||||
MAYBE_MSABI
|
||||
extern "C" void WRAP(start_pipeline)(size_t dx, size_t dy, size_t xlimit, size_t ylimit,
|
||||
|
||||
extern "C" MAYBE_MSABI void WRAP(start_pipeline)(size_t dx, size_t dy, size_t xlimit, size_t ylimit,
|
||||
void** program) {
|
||||
auto start = (Stage*)load_and_inc(program);
|
||||
auto start = (Stage)load_and_inc(program);
|
||||
const size_t x0 = dx;
|
||||
for (; dy < ylimit; dy++) {
|
||||
#if defined(__i386__) || defined(_M_IX86) || defined(__arm__)
|
||||
@ -90,11 +90,11 @@ extern "C" void WRAP(start_pipeline)(size_t dx, size_t dy, size_t xlimit, size_t
|
||||
#define STAGE(name, ...) \
|
||||
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
|
||||
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
|
||||
extern "C" void WRAP(name)(Params* params, void** program, \
|
||||
extern "C" ABI void WRAP(name)(Params* params, void** program, \
|
||||
F r, F g, F b, F a) { \
|
||||
name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a, \
|
||||
params->dr, params->dg, params->db, params->da); \
|
||||
auto next = (Stage*)load_and_inc(program); \
|
||||
auto next = (Stage)load_and_inc(program); \
|
||||
next(params,program, r,g,b,a); \
|
||||
} \
|
||||
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
|
||||
@ -103,10 +103,10 @@ extern "C" void WRAP(start_pipeline)(size_t dx, size_t dy, size_t xlimit, size_t
|
||||
#define STAGE(name, ...) \
|
||||
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
|
||||
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
|
||||
extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
|
||||
extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
|
||||
F r, F g, F b, F a, F dr, F dg, F db, F da) { \
|
||||
name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da); \
|
||||
auto next = (Stage*)load_and_inc(program); \
|
||||
auto next = (Stage)load_and_inc(program); \
|
||||
next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
|
||||
} \
|
||||
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
|
||||
@ -117,9 +117,9 @@ extern "C" void WRAP(start_pipeline)(size_t dx, size_t dy, size_t xlimit, size_t
|
||||
// just_return() is a simple no-op stage that only exists to end the chain,
|
||||
// returning back up to start_pipeline(), and from there to the caller.
|
||||
#if defined(__i386__) || defined(_M_IX86) || defined(__arm__)
|
||||
extern "C" void WRAP(just_return)(Params*, void**, F,F,F,F) {}
|
||||
extern "C" ABI void WRAP(just_return)(Params*, void**, F,F,F,F) {}
|
||||
#else
|
||||
extern "C" void WRAP(just_return)(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {}
|
||||
extern "C" ABI void WRAP(just_return)(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {}
|
||||
#endif
|
||||
|
||||
|
||||
|
@ -16,17 +16,10 @@
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
#include <arm_neon.h>
|
||||
#if defined(__arm__)
|
||||
#define ABI __attribute__((pcs("aapcs-vfp")))
|
||||
#else
|
||||
#define ABI
|
||||
#endif
|
||||
#elif defined(__SSE2__)
|
||||
#include <immintrin.h>
|
||||
#define ABI
|
||||
#else
|
||||
#include <math.h>
|
||||
#define ABI
|
||||
#endif
|
||||
|
||||
#if !defined(JUMPER_IS_OFFLINE)
|
||||
@ -62,8 +55,7 @@ using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
|
||||
U16 r, U16 g, U16 b, U16 a,
|
||||
U16 dr, U16 dg, U16 db, U16 da);
|
||||
|
||||
MAYBE_MSABI
|
||||
ABI extern "C" void WRAP(start_pipeline)(const size_t x0,
|
||||
extern "C" MAYBE_MSABI void WRAP(start_pipeline)(const size_t x0,
|
||||
const size_t y0,
|
||||
const size_t xlimit,
|
||||
const size_t ylimit,
|
||||
@ -80,7 +72,7 @@ ABI extern "C" void WRAP(start_pipeline)(const size_t x0,
|
||||
}
|
||||
}
|
||||
|
||||
ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
|
||||
extern "C" ABI void WRAP(just_return)(size_t,void**,size_t,size_t,
|
||||
U16,U16,U16,U16, U16,U16,U16,U16) {}
|
||||
|
||||
// All stages use the same function call ABI to chain into each other, but there are three types:
|
||||
@ -95,7 +87,7 @@ ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
|
||||
|
||||
#define STAGE_GG(name, ...) \
|
||||
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
|
||||
ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
|
||||
extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
|
||||
U16 r, U16 g, U16 b, U16 a, \
|
||||
U16 dr, U16 dg, U16 db, U16 da) { \
|
||||
auto x = join<F>(r,g), \
|
||||
@ -112,7 +104,7 @@ ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
|
||||
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
|
||||
U16& r, U16& g, U16& b, U16& a, \
|
||||
U16& dr, U16& dg, U16& db, U16& da); \
|
||||
ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
|
||||
extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
|
||||
U16 r, U16 g, U16 b, U16 a, \
|
||||
U16 dr, U16 dg, U16 db, U16 da) { \
|
||||
auto x = join<F>(r,g), \
|
||||
@ -129,7 +121,7 @@ ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
|
||||
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
|
||||
U16& r, U16& g, U16& b, U16& a, \
|
||||
U16& dr, U16& dg, U16& db, U16& da); \
|
||||
ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
|
||||
extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
|
||||
U16 r, U16 g, U16 b, U16 a, \
|
||||
U16 dr, U16 dg, U16 db, U16 da) { \
|
||||
name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \
|
||||
|
@ -10,6 +10,7 @@
|
||||
|
||||
#include "SkJumper.h"
|
||||
#include "SkJumper_misc.h"
|
||||
#include <stdint.h>
|
||||
|
||||
// This file contains vector types that SkJumper_stages.cpp uses to define stages.
|
||||
|
||||
@ -17,7 +18,7 @@
|
||||
|
||||
#if !defined(__clang__)
|
||||
#define JUMPER_IS_SCALAR
|
||||
#elif defined(__aarch64__) || defined(__ARM_VFPV4__)
|
||||
#elif defined(__ARM_NEON)
|
||||
#define JUMPER_IS_NEON
|
||||
#elif defined(__AVX512F__)
|
||||
#define JUMPER_IS_AVX512
|
||||
@ -105,7 +106,6 @@
|
||||
using U8 = V<uint8_t >;
|
||||
|
||||
// We polyfill a few routines that Clang doesn't build into ext_vector_types.
|
||||
SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
|
||||
SI F min(F a, F b) { return vminq_f32(a,b); }
|
||||
SI F max(F a, F b) { return vmaxq_f32(a,b); }
|
||||
SI F abs_ (F v) { return vabsq_f32(v); }
|
||||
@ -117,10 +117,12 @@
|
||||
SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
|
||||
|
||||
#if defined(__aarch64__)
|
||||
SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
|
||||
SI F floor_(F v) { return vrndmq_f32(v); }
|
||||
SI F sqrt_(F v) { return vsqrtq_f32(v); }
|
||||
SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
|
||||
#else
|
||||
SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); }
|
||||
SI F floor_(F v) {
|
||||
F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
|
||||
return roundtrip - if_then_else(roundtrip > v, 1, 0);
|
||||
@ -643,7 +645,7 @@ SI F approx_powf(F x, F y) {
|
||||
}
|
||||
|
||||
SI F from_half(U16 h) {
|
||||
#if defined(JUMPER_IS_NEON)
|
||||
#if defined(__aarch64__)
|
||||
return vcvt_f32_f16(h);
|
||||
|
||||
#elif defined(JUMPER_IS_AVX2) || defined(JUMPER_IS_AVX512)
|
||||
@ -663,7 +665,7 @@ SI F from_half(U16 h) {
|
||||
}
|
||||
|
||||
SI U16 to_half(F f) {
|
||||
#if defined(JUMPER_IS_NEON)
|
||||
#if defined(__aarch64__)
|
||||
return vcvt_f16_f32(f);
|
||||
|
||||
#elif defined(JUMPER_IS_AVX2) || defined(JUMPER_IS_AVX512)
|
||||
|
@ -120,14 +120,6 @@ subprocess.check_call(['ld', '-r', '-o', 'win_x86_merged.o',
|
||||
'win_x86_sse2.o',
|
||||
'win_x86_lowp_sse2.o'])
|
||||
|
||||
vfp4 = [
|
||||
'--target=armv7a-linux-gnueabihf',
|
||||
'-mfpu=neon-vfpv4',
|
||||
]
|
||||
subprocess.check_call(clang + cflags + vfp4 +
|
||||
['-c', stages] +
|
||||
['-o', 'vfp4.o'])
|
||||
|
||||
def parse_object_file(dot_o, directive, target=None):
|
||||
globl, hidden, label, comment, align = \
|
||||
'.globl', 'HIDDEN', ':', '// ', 'BALIGN'
|
||||
@ -235,11 +227,7 @@ print ' #define BALIGN32 .balign 32'
|
||||
print '#endif'
|
||||
|
||||
print '.text'
|
||||
print '#if defined(__arm__)'
|
||||
print 'BALIGN4'
|
||||
parse_object_file('vfp4.o', '.long', target='elf32-littlearm')
|
||||
|
||||
print '#elif defined(__x86_64__)'
|
||||
print '#if defined(__x86_64__)'
|
||||
print 'BALIGN32'
|
||||
parse_object_file('merged.o', '.byte')
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user