remove vfpv4 requirement for SkJumper on ARMv7

VFPv4 gives us two interesting features: - FMA - f16<->f32 conversions Even without FMAs, NEON still has non-fused MLA instructions. We don't really care about the fusedness of those mul-adds, so losing FMA here is kind of no big deal. We already maintain portable code to do f16<->f32 conversions, so it's not much of a maintanence hit to use that instead of the native instructions. To my knowledge software F16 rendering is not a performance critical mode of operation for any of our users. This drops our minimum requirement to basically just having NEON. Devices like the Nexus 7 2012 will now take SkJumper fast paths instead of portable code. (Though actually, we've only ever required NEON for _lowp... only the float code also needed vfpv4). The main file to look at here is actually SkJumper_vectors.h, where you will see all the substantive changes. The rest just kind of tears down most of the old complexity, add adds ABI to put just a little of it back. :) Change-Id: Ia9237117698729c91e5fa51126baf80748093bf4 Bug: skia: Reviewed-on: https://skia-review.googlesource.com/83521 Commit-Queue: Mike Klein <mtklein@chromium.org> Reviewed-by: Florin Malita <fmalita@chromium.org>
2017-12-11 16:53:26 -05:00 · 2017-12-11 16:53:26 -05:00 · 376fd31ad4
commit 376fd31ad4
parent ba096c0dc3
8 changed files with 51 additions and 9555 deletions
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@ -78,13 +78,6 @@ extern "C" {
 #if !SK_JUMPER_USE_ASSEMBLY
    // We'll just run baseline code.

-#elif defined(__arm__)
-    StartPipelineFn ASM(start_pipeline,vfp4);
-    StageFn ASM(just_return,vfp4);
-    #define M(st) StageFn ASM(st,vfp4);
-        SK_RASTER_PIPELINE_STAGES(M)
-    #undef M
-
 #elif defined(__x86_64__) || defined(_M_X64)
    StartPipelineFn ASM(start_pipeline,       skx),
                    ASM(start_pipeline,       hsw),
@ -135,7 +128,6 @@ extern "C" {
    #undef M

 #if defined(JUMPER_HAS_NEON_LOWP)
-    // We also compile 8-bit stages on ARMv8 as a normal part of Skia when compiled with Clang.
    StartPipelineFn sk_start_pipeline_lowp;
    StageFn sk_just_return_lowp;
    #define M(st) StageFn sk_##st##_lowp;
@ -295,17 +287,6 @@ static SkJumper_Engine choose_engine() {
 #if !SK_JUMPER_USE_ASSEMBLY
    // We'll just run baseline code.

-#elif defined(__arm__)
-    if (1 && SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
-        return {
-        #define M(stage) ASM(stage, vfp4),
-            { SK_RASTER_PIPELINE_STAGES(M) },
-            M(start_pipeline)
-            M(just_return)
-        #undef M
-        };
-    }
-
 #elif defined(__x86_64__) || defined(_M_X64)
    #if !defined(_MSC_VER)  // No _skx stages for Windows yet.
        if (1 && SkCpu::Supports(SkCpu::SKX)) {
--- a/src/jumper/SkJumper.h
+++ b/src/jumper/SkJumper.h
@ -8,6 +8,9 @@
 #ifndef SkJumper_DEFINED
 #define SkJumper_DEFINED

+#include <stddef.h>
+#include <stdint.h>
+
 // This file contains definitions shared by SkJumper.cpp (compiled normally as part of Skia)
 // and SkJumper_stages.cpp (compiled into Skia _and_ offline into SkJumper_generated.h).
 // Keep it simple!
@ -21,36 +24,18 @@
    #define MAYBE_MSABI
 #endif

-#if defined(JUMPER_IS_OFFLINE) && (defined(__aarch64__) || defined(__arm__))
-    // To reduce SkJumper's dependency on the Android NDK,
-    // we provide what we need from <string.h>, <stdint.h>, and <stddef.h> ourselves.
-    #define memcpy __builtin_memcpy
-
-    using  int8_t  =   signed char;
-    using uint8_t  = unsigned char;
-    using  int16_t =   signed short;
-    using uint16_t = unsigned short;
-    using  int32_t =   signed int;
-    using uint32_t = unsigned int;
-    #if defined(__aarch64__)
-        using  int64_t =   signed long;
-        using uint64_t = unsigned long;
-        using size_t = uint64_t;
-    #else
-        using  int64_t =   signed long long;
-        using uint64_t = unsigned long long;
-        using size_t = uint32_t;
-    #endif
-
-    // Now pretend we've included <stdint.h> (or it'll be included again by <arm_neon.h>).
-    #define __CLANG_STDINT_H
-    #define _STDINT_H_
+// Any custom ABI to use for all non-externally-facing stage functions.
+#if defined(__ARM_NEON) && defined(__arm__)
+    // This lets us pass vectors more efficiently on 32-bit ARM.
+    #define ABI __attribute__((pcs("aapcs-vfp")))
 #else
-    #include <string.h>
-    #include <stdint.h>
+    #define ABI
 #endif

-// When compiled with Clang on ARM, we'll have 8-bit NEON stages.
+// On ARM we expect that you're using Clang if you want SkJumper to be fast.
+// If you are, the baseline float stages will use NEON, and lowp stages will
+// also be available. (If somehow you're building for ARM not using Clang,
+// you'll get scalar baseline stages and no lowp support.)
 #if defined(__clang__) && defined(__ARM_NEON)
    #define JUMPER_HAS_NEON_LOWP
 #endif
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
--- a/src/jumper/SkJumper_misc.h
+++ b/src/jumper/SkJumper_misc.h
@ -8,7 +8,7 @@
 #ifndef SkJumper_misc_DEFINED
 #define SkJumper_misc_DEFINED

-#include "SkJumper.h"  // for memcpy()
+#include <string.h>  // for memcpy()

 // Miscellany used by SkJumper_stages.cpp and SkJumper_vectors.h.

--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@ -50,17 +50,17 @@ static const size_t N = sizeof(F) / sizeof(float);
        size_t dx, dy, tail;
        F dr,dg,db,da;
    };
-    using Stage = void(Params*, void** program, F r, F g, F b, F a);
+    using Stage = void(ABI*)(Params*, void** program, F r, F g, F b, F a);

 #else
    // We keep program the second argument, so that it's passed in rsi for load_and_inc().
-    using Stage = void(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F);
+    using Stage = void(ABI*)(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F);
 #endif

-MAYBE_MSABI
-extern "C" void WRAP(start_pipeline)(size_t dx, size_t dy, size_t xlimit, size_t ylimit,
-                                     void** program) {
-    auto start = (Stage*)load_and_inc(program);
+
+extern "C" MAYBE_MSABI void WRAP(start_pipeline)(size_t dx, size_t dy, size_t xlimit, size_t ylimit,
+                                                 void** program) {
+    auto start = (Stage)load_and_inc(program);
    const size_t x0 = dx;
    for (; dy < ylimit; dy++) {
    #if defined(__i386__) || defined(_M_IX86) || defined(__arm__)
@ -90,26 +90,26 @@ extern "C" void WRAP(start_pipeline)(size_t dx, size_t dy, size_t xlimit, size_t
    #define STAGE(name, ...)                                                          \
        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,              \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);         \
-        extern "C" void WRAP(name)(Params* params, void** program,                    \
-                                   F r, F g, F b, F a) {                              \
+        extern "C" ABI void WRAP(name)(Params* params, void** program,                \
+                                       F r, F g, F b, F a) {                          \
            name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a,        \
                     params->dr, params->dg, params->db, params->da);                 \
-            auto next = (Stage*)load_and_inc(program);                                \
+            auto next = (Stage)load_and_inc(program);                                 \
            next(params,program, r,g,b,a);                                            \
        }                                                                             \
        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,              \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
 #else
-    #define STAGE(name, ...)                                                          \
-        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,              \
-                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);         \
-        extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
-                                   F r, F g, F b, F a, F dr, F dg, F db, F da) {      \
-            name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da);                  \
-            auto next = (Stage*)load_and_inc(program);                                \
-            next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                           \
-        }                                                                             \
-        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,              \
+    #define STAGE(name, ...)                                                              \
+        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                  \
+                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);             \
+        extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
+                                       F r, F g, F b, F a, F dr, F dg, F db, F da) {      \
+            name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da);                      \
+            auto next = (Stage)load_and_inc(program);                                     \
+            next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                               \
+        }                                                                                 \
+        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                  \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
 #endif

@ -117,9 +117,9 @@ extern "C" void WRAP(start_pipeline)(size_t dx, size_t dy, size_t xlimit, size_t
 // just_return() is a simple no-op stage that only exists to end the chain,
 // returning back up to start_pipeline(), and from there to the caller.
 #if defined(__i386__) || defined(_M_IX86) || defined(__arm__)
-    extern "C" void WRAP(just_return)(Params*, void**, F,F,F,F) {}
+    extern "C" ABI void WRAP(just_return)(Params*, void**, F,F,F,F) {}
 #else
-    extern "C" void WRAP(just_return)(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {}
+    extern "C" ABI void WRAP(just_return)(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {}
 #endif


--- a/src/jumper/SkJumper_stages_lowp.cpp
+++ b/src/jumper/SkJumper_stages_lowp.cpp
@ -16,17 +16,10 @@

 #if defined(__ARM_NEON)
    #include <arm_neon.h>
-    #if defined(__arm__)
-        #define ABI __attribute__((pcs("aapcs-vfp")))
-    #else
-        #define ABI
-    #endif
 #elif defined(__SSE2__)
    #include <immintrin.h>
-    #define ABI
 #else
    #include <math.h>
-    #define ABI
 #endif

 #if !defined(JUMPER_IS_OFFLINE)
@ -62,12 +55,11 @@ using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
                          U16  r, U16  g, U16  b, U16  a,
                          U16 dr, U16 dg, U16 db, U16 da);

-MAYBE_MSABI
-ABI extern "C" void WRAP(start_pipeline)(const size_t x0,
-                                         const size_t y0,
-                                         const size_t xlimit,
-                                         const size_t ylimit,
-                                         void** program) {
+extern "C" MAYBE_MSABI void WRAP(start_pipeline)(const size_t x0,
+                                                 const size_t y0,
+                                                 const size_t xlimit,
+                                                 const size_t ylimit,
+                                                 void** program) {
    auto start = (Stage)load_and_inc(program);
    for (size_t dy = y0; dy < ylimit; dy++) {
        size_t dx = x0;
@ -80,7 +72,7 @@ ABI extern "C" void WRAP(start_pipeline)(const size_t x0,
    }
 }

-ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
+extern "C" ABI void WRAP(just_return)(size_t,void**,size_t,size_t,
                                      U16,U16,U16,U16, U16,U16,U16,U16) {}

 // All stages use the same function call ABI to chain into each other, but there are three types:
@ -95,7 +87,7 @@ ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,

 #define STAGE_GG(name, ...)                                                            \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y);      \
-    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
+    extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        auto x = join<F>(r,g),                                                         \
@ -112,7 +104,7 @@ ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da);                              \
-    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
+    extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        auto x = join<F>(r,g),                                                         \
@ -129,7 +121,7 @@ ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da);                              \
-    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
+    extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da);                      \
--- a/src/jumper/SkJumper_vectors.h
+++ b/src/jumper/SkJumper_vectors.h
@ -10,6 +10,7 @@

 #include "SkJumper.h"
 #include "SkJumper_misc.h"
+#include <stdint.h>

 // This file contains vector types that SkJumper_stages.cpp uses to define stages.

@ -17,7 +18,7 @@

 #if !defined(__clang__)
    #define JUMPER_IS_SCALAR
-#elif defined(__aarch64__) || defined(__ARM_VFPV4__)
+#elif defined(__ARM_NEON)
    #define JUMPER_IS_NEON
 #elif defined(__AVX512F__)
    #define JUMPER_IS_AVX512
@ -105,7 +106,6 @@
    using U8  = V<uint8_t >;

    // We polyfill a few routines that Clang doesn't build into ext_vector_types.
-    SI F   mad(F f, F m, F a)                    { return vfmaq_f32(a,f,m);        }
    SI F   min(F a, F b)                         { return vminq_f32(a,b);          }
    SI F   max(F a, F b)                         { return vmaxq_f32(a,b);          }
    SI F   abs_  (F v)                           { return vabsq_f32(v);            }
@ -117,10 +117,12 @@
    SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }

    #if defined(__aarch64__)
+        SI F     mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
        SI F  floor_(F v) { return vrndmq_f32(v); }
        SI F   sqrt_(F v) { return vsqrtq_f32(v); }
        SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
    #else
+        SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); }
        SI F floor_(F v) {
            F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
            return roundtrip - if_then_else(roundtrip > v, 1, 0);
@ -643,7 +645,7 @@ SI F approx_powf(F x, F y) {
 }

 SI F from_half(U16 h) {
-#if defined(JUMPER_IS_NEON)
+#if defined(__aarch64__)
    return vcvt_f32_f16(h);

 #elif defined(JUMPER_IS_AVX2) || defined(JUMPER_IS_AVX512)
@ -663,7 +665,7 @@ SI F from_half(U16 h) {
 }

 SI U16 to_half(F f) {
-#if defined(JUMPER_IS_NEON)
+#if defined(__aarch64__)
    return vcvt_f16_f32(f);

 #elif defined(JUMPER_IS_AVX2) || defined(JUMPER_IS_AVX512)
--- a/src/jumper/build_stages.py
+++ b/src/jumper/build_stages.py
@ -120,14 +120,6 @@ subprocess.check_call(['ld', '-r', '-o', 'win_x86_merged.o',
                       'win_x86_sse2.o',
                       'win_x86_lowp_sse2.o'])

-vfp4 = [
-    '--target=armv7a-linux-gnueabihf',
-    '-mfpu=neon-vfpv4',
-]
-subprocess.check_call(clang + cflags + vfp4 +
-                      ['-c', stages] +
-                      ['-o', 'vfp4.o'])
-
 def parse_object_file(dot_o, directive, target=None):
  globl, hidden, label, comment, align = \
      '.globl', 'HIDDEN', ':', '// ', 'BALIGN'
@ -235,11 +227,7 @@ print '    #define BALIGN32 .balign 32'
 print '#endif'

 print '.text'
-print '#if defined(__arm__)'
-print 'BALIGN4'
-parse_object_file('vfp4.o', '.long', target='elf32-littlearm')
-
-print '#elif defined(__x86_64__)'
+print '#if defined(__x86_64__)'
 print 'BALIGN32'
 parse_object_file('merged.o', '.byte')