small ABI + narrow/wide code updates
The only tangible effect this CL should have is to use __vectorcall on all Windows builds, including scalar ones. The code generation is a little better there with __vectorcall than not, so might as well. This is a baby step towards vector stages with MSVC, but a very baby step indeed. Mostly this refactors and regroups a bunch of logic to reflect my current thoughts. The BUILD.gn changes are essentially no-ops, but they simplify things and make our flags more similar to how those targets are built in Chromium. (And I cleaned up other /arch: uses so this works.) Change-Id: I73dd39d15cdc7b3d268231a707952bbbfd91496e Reviewed-on: https://skia-review.googlesource.com/115644 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@chromium.org>
This commit is contained in:
parent
42409c2efa
commit
4d4b3aaa20
10
BUILD.gn
10
BUILD.gn
@ -303,7 +303,7 @@ opts("sse42") {
|
||||
opts("avx") {
|
||||
enabled = is_x86
|
||||
sources = skia_opts.avx_sources
|
||||
if (!is_clang && is_win) {
|
||||
if (is_win) {
|
||||
cflags = [ "/arch:AVX" ]
|
||||
} else {
|
||||
cflags = [ "-mavx" ]
|
||||
@ -313,14 +313,10 @@ opts("avx") {
|
||||
opts("hsw") {
|
||||
enabled = is_x86
|
||||
sources = skia_opts.hsw_sources
|
||||
if (!is_clang && is_win) {
|
||||
if (is_win) {
|
||||
cflags = [ "/arch:AVX2" ]
|
||||
} else {
|
||||
cflags = [
|
||||
"-mavx2",
|
||||
"-mf16c",
|
||||
"-mfma",
|
||||
]
|
||||
cflags = [ "-march=haswell" ]
|
||||
}
|
||||
|
||||
# Oddly, clang-cl doesn't recognize this as a valid flag.
|
||||
|
@ -112,9 +112,7 @@ config("default") {
|
||||
"-mfpu=neon",
|
||||
"-mthumb",
|
||||
]
|
||||
} else if (current_cpu == "x86" && is_win) {
|
||||
cflags += [ "/arch:SSE2" ]
|
||||
} else if (current_cpu == "x86") {
|
||||
} else if (current_cpu == "x86" && !is_win) {
|
||||
asmflags += [ "-m32" ]
|
||||
cflags += [
|
||||
"-m32",
|
||||
|
@ -700,28 +700,32 @@ static const size_t N = sizeof(F) / sizeof(float);
|
||||
// tail != 0 ~~> work on only the first tail pixels
|
||||
// tail is always < N.
|
||||
|
||||
// Any custom ABI to use for all non-externally-facing stage functions.
|
||||
#if defined(__ARM_NEON) && defined(__arm__)
|
||||
// Any custom ABI to use for all (non-externally-facing) stage functions?
|
||||
// Also decide here whether to use narrow (compromise) or wide (ideal) stages.
|
||||
#if defined(__arm__) && defined(__ARM_NEON)
|
||||
// This lets us pass vectors more efficiently on 32-bit ARM.
|
||||
// We can still only pass 16 floats, so best as 4x {r,g,b,a}.
|
||||
#define ABI __attribute__((pcs("aapcs-vfp")))
|
||||
#elif 0 || defined(__clang__) && defined(_MSC_VER)
|
||||
// TODO: can we use sysv_abi here instead? It'd allow passing far more registers.
|
||||
#define ABI __attribute__((vectorcall))
|
||||
#else
|
||||
#define ABI
|
||||
#endif
|
||||
|
||||
// On 32-bit x86 we've only got 8 xmm registers, so we keep the 4 hottest (r,g,b,a)
|
||||
// in registers and the d-registers on the stack (giving us 4 temporary registers).
|
||||
// General-purpose registers are also tight, so we put most of those on the stack too.
|
||||
//
|
||||
// On ARMv7, we do the same so that we can make the r,g,b,a vectors wider.
|
||||
//
|
||||
// Finally, this narrower stage calling convention also fits Windows' __vectorcall very well.
|
||||
#if 0 || defined(__i386__) || defined(_M_IX86) || defined(__arm__) || defined(_MSC_VER)
|
||||
#define JUMPER_NARROW_STAGES 1
|
||||
#else
|
||||
#elif 0 && defined(_MSC_VER) && defined(__clang__) && defined(__x86_64__)
|
||||
// SysV ABI makes it very sensible to use wide stages with clang-cl.
|
||||
// TODO: crashes during compilation :(
|
||||
#define ABI __attribute__((sysv_abi))
|
||||
#define JUMPER_NARROW_STAGES 0
|
||||
#elif defined(_MSC_VER)
|
||||
// Even if not vectorized, this lets us pass {r,g,b,a} as registers,
|
||||
// instead of {b,a} on the stack. Narrow stages work best for __vectorcall.
|
||||
#define ABI __vectorcall
|
||||
#define JUMPER_NARROW_STAGES 1
|
||||
#elif defined(__x86_64__) || defined(__aarch64__)
|
||||
// These platforms are ideal for wider stages, and their default ABI is ideal.
|
||||
#define ABI
|
||||
#define JUMPER_NARROW_STAGES 0
|
||||
#else
|
||||
// 32-bit or unknown... shunt them down the narrow path.
|
||||
// Odds are these have few registers and are better off there.
|
||||
#define ABI
|
||||
#define JUMPER_NARROW_STAGES 1
|
||||
#endif
|
||||
|
||||
#if JUMPER_NARROW_STAGES
|
||||
@ -767,7 +771,7 @@ static void start_pipeline(size_t dx, size_t dy, size_t xlimit, size_t ylimit, v
|
||||
#define STAGE(name, ...) \
|
||||
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
|
||||
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
|
||||
static ABI void name(Params* params, void** program, \
|
||||
static void ABI name(Params* params, void** program, \
|
||||
F r, F g, F b, F a) { \
|
||||
name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a, \
|
||||
params->dr, params->dg, params->db, params->da); \
|
||||
@ -780,7 +784,7 @@ static void start_pipeline(size_t dx, size_t dy, size_t xlimit, size_t ylimit, v
|
||||
#define STAGE(name, ...) \
|
||||
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
|
||||
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
|
||||
static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \
|
||||
static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \
|
||||
F r, F g, F b, F a, F dr, F dg, F db, F da) { \
|
||||
name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da); \
|
||||
auto next = (Stage)load_and_inc(program); \
|
||||
@ -794,9 +798,9 @@ static void start_pipeline(size_t dx, size_t dy, size_t xlimit, size_t ylimit, v
|
||||
// just_return() is a simple no-op stage that only exists to end the chain,
|
||||
// returning back up to start_pipeline(), and from there to the caller.
|
||||
#if JUMPER_NARROW_STAGES
|
||||
static ABI void just_return(Params*, void**, F,F,F,F) {}
|
||||
static void ABI just_return(Params*, void**, F,F,F,F) {}
|
||||
#else
|
||||
static ABI void just_return(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {}
|
||||
static void ABI just_return(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {}
|
||||
#endif
|
||||
|
||||
|
||||
@ -2375,9 +2379,9 @@ static void start_pipeline(const size_t x0, const size_t y0,
|
||||
}
|
||||
|
||||
#if JUMPER_NARROW_STAGES
|
||||
static ABI void just_return(Params*, void**, U16,U16,U16,U16) {}
|
||||
static void ABI just_return(Params*, void**, U16,U16,U16,U16) {}
|
||||
#else
|
||||
static ABI void just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {}
|
||||
static void ABI just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {}
|
||||
#endif
|
||||
|
||||
// All stages use the same function call ABI to chain into each other, but there are three types:
|
||||
@ -2393,7 +2397,7 @@ static void start_pipeline(const size_t x0, const size_t y0,
|
||||
#if JUMPER_NARROW_STAGES
|
||||
#define STAGE_GG(name, ...) \
|
||||
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
|
||||
static ABI void name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
|
||||
static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
|
||||
auto x = join<F>(r,g), \
|
||||
y = join<F>(b,a); \
|
||||
name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y); \
|
||||
@ -2408,7 +2412,7 @@ static void start_pipeline(const size_t x0, const size_t y0,
|
||||
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
|
||||
U16& r, U16& g, U16& b, U16& a, \
|
||||
U16& dr, U16& dg, U16& db, U16& da); \
|
||||
static ABI void name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
|
||||
static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
|
||||
auto x = join<F>(r,g), \
|
||||
y = join<F>(b,a); \
|
||||
name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a, \
|
||||
@ -2424,7 +2428,7 @@ static void start_pipeline(const size_t x0, const size_t y0,
|
||||
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
|
||||
U16& r, U16& g, U16& b, U16& a, \
|
||||
U16& dr, U16& dg, U16& db, U16& da); \
|
||||
static ABI void name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
|
||||
static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
|
||||
name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a, \
|
||||
params->dr,params->dg,params->db,params->da); \
|
||||
auto next = (Stage)load_and_inc(program); \
|
||||
@ -2436,7 +2440,7 @@ static void start_pipeline(const size_t x0, const size_t y0,
|
||||
#else
|
||||
#define STAGE_GG(name, ...) \
|
||||
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
|
||||
static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \
|
||||
static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \
|
||||
U16 r, U16 g, U16 b, U16 a, \
|
||||
U16 dr, U16 dg, U16 db, U16 da) { \
|
||||
auto x = join<F>(r,g), \
|
||||
@ -2453,7 +2457,7 @@ static void start_pipeline(const size_t x0, const size_t y0,
|
||||
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
|
||||
U16& r, U16& g, U16& b, U16& a, \
|
||||
U16& dr, U16& dg, U16& db, U16& da); \
|
||||
static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \
|
||||
static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \
|
||||
U16 r, U16 g, U16 b, U16 a, \
|
||||
U16 dr, U16 dg, U16 db, U16 da) { \
|
||||
auto x = join<F>(r,g), \
|
||||
@ -2470,7 +2474,7 @@ static void start_pipeline(const size_t x0, const size_t y0,
|
||||
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
|
||||
U16& r, U16& g, U16& b, U16& a, \
|
||||
U16& dr, U16& dg, U16& db, U16& da); \
|
||||
static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \
|
||||
static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \
|
||||
U16 r, U16 g, U16 b, U16 a, \
|
||||
U16 dr, U16 dg, U16 db, U16 da) { \
|
||||
name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \
|
||||
|
Loading…
Reference in New Issue
Block a user