Revert "Reland "make SkJumper stages normal Skia code""

This reverts commit 78cb579f33.

Reason for revert: lowp should be controlled by defined(JUMPER_IS_SCALAR), not defined(__clang__).  So close.

Original change's description:
> Reland "make SkJumper stages normal Skia code"
> 
> This is a reland of 22e536e3a1
> 
> Now with fixed #include paths in SkRasterPipeline_opts.h,
> and -ffp-contract=fast for the :hsw target to minimize
> diffs on non-Windows Clang AVX2/AVX-512 bots.
> 
> Original change's description:
> > make SkJumper stages normal Skia code
> >
> > Enough clients are using Clang now that we can say, use Clang to build
> > if you want these software pipeline stages to go fast.
> >
> > This lets us drop the offline build aspect of SkJumper stages, instead
> > building as part of Skia using the SkOpts framework.
> >
> > I think everything should work, except I've (temporarily) removed
> > AVX-512 support.  I will put this back in a follow up.
> >
> > I have had to drop Windows down to __vectorcall and our narrower
> > stage calling convention that keeps the d-registers on the stack.
> > I tried forcing sysv_abi, but that crashed Clang.  :/
> >
> > Added a TODO to use the same narrower stage calling convention
> > for lowp stages... we just *don't* today, for no good reason.
> >
> > Change-Id: Iaaa792ffe4deab3508d2dc5d0008c163c24b3383
> > Reviewed-on: https://skia-review.googlesource.com/110641
> > Commit-Queue: Mike Klein <mtklein@chromium.org>
> > Reviewed-by: Herb Derby <herb@google.com>
> > Reviewed-by: Florin Malita <fmalita@chromium.org>
> 
> Change-Id: I44f2c03d33958e3807747e40904b6351957dd448
> Reviewed-on: https://skia-review.googlesource.com/112742
> Reviewed-by: Mike Klein <mtklein@chromium.org>

TBR=mtklein@chromium.org,herb@google.com,fmalita@chromium.org

Change-Id: Ie64da98f5187d44e03c0ce05d7cb189d4a6e6663
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Reviewed-on: https://skia-review.googlesource.com/112743
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
Mike Klein 2018-03-07 17:04:18 +00:00 committed by Skia Commit-Bot
parent 61d56b92a4
commit 5cc94cc393
21 changed files with 158946 additions and 1213 deletions


@ -48,6 +48,10 @@ declare_args() {
skia_compile_processors = false
skia_lex = false
skia_jumper_clang = ""
skia_jumper_objdump = ""
skia_jumper_ccache = ""
skia_skqp_enable_driver_correctness_workarounds = false
skia_skqp_global_error_tolerance = 0
}
@ -310,28 +314,6 @@ opts("avx") {
}
}
opts("hsw") {
enabled = is_x86
sources = skia_opts.hsw_sources
if (!is_clang && is_win) {
cflags = [ "/arch:AVX2" ]
} else {
cflags = [
"-mavx2",
"-mf16c",
"-mfma",
]
}
# Oddly, clang-cl doesn't recognize this as a valid flag.
# If it ever does, it'd be nice to move this up with -mavx2 and co.
if (is_clang && !is_win) {
# This flag lets Clang generate FMAs when it sees a mul-then-add. It's optional,
# but nice to have, generating slightly better code for paths without explicit FMAs.
cflags += [ "-ffp-contract=fast" ]
}
}
# Any feature of Skia that requires third-party code should be optional and use this template.
template("optional") {
if (invoker.enabled) {
@ -793,7 +775,6 @@ component("skia") {
":fontmgr_fuchsia",
":gpu",
":heif",
":hsw",
":jpeg",
":none",
":pdf",
@ -2120,3 +2101,28 @@ if (skia_enable_tools) {
}
}
}
if (skia_jumper_clang != "") {
action("regen_jumper") {
script = "src/jumper/build_stages.py"
inputs = [
"src/jumper/SkJumper_stages.cpp",
"src/jumper/SkJumper_stages_lowp.cpp",
]
# GN insists its outputs should go somewhere underneath target_out_dir, so we trick it.
outputs = [
"$target_out_dir/" +
rebase_path("src/jumper/SkJumper_generated.S", target_out_dir),
"$target_out_dir/" +
rebase_path("src/jumper/SkJumper_generated_win.S", target_out_dir),
]
args = [
skia_jumper_clang,
skia_jumper_objdump,
skia_jumper_ccache,
] + rebase_path(inputs) + rebase_path(outputs)
}
}


@ -440,8 +440,6 @@ skia_core_sources = [
"$_include/core/SkUnPreMultiply.h",
"$_include/core/SkVertices.h",
"$_src/jumper/SkJumper.cpp",
# private
"$_include/private/SkAtomics.h",
"$_include/private/SkChecksum.h",
@ -529,4 +527,20 @@ skia_core_sources = [
"$_src/pathops/SkReduceOrder.h",
]
skia_core_defines = [] # Used to be used by Chromium, but no longer.
skia_core_sources += [
"$_src/jumper/SkJumper.cpp",
"$_src/jumper/SkJumper_stages.cpp",
"$_src/jumper/SkJumper_stages_lowp.cpp",
]
skia_core_defines = []
if (is_win) {
if (host_os == "win") {
skia_core_sources += [ "$_src/jumper/SkJumper_generated_win.S" ]
} else {
# TODO(thakis): Enable jumper in linux->win cross builds once the
# assembler situation is figured out, https://crbug.com/762167
skia_core_defines += [ "SK_JUMPER_USE_ASSEMBLY=0" ]
}
} else if (target_cpu != "wasm") {
skia_core_sources += [ "$_src/jumper/SkJumper_generated.S" ]
}


@ -289,8 +289,7 @@ with open('Android.bp', 'w') as f:
defs['ssse3'] +
defs['sse41'] +
defs['sse42'] +
defs['avx' ] +
defs['hsw' ]),
defs['avx' ]),
'dm_includes' : bpfmt(8, dm_includes),
'dm_srcs' : bpfmt(8, dm_srcs),


@ -51,4 +51,3 @@ ssse3 = [
sse41 = [ "$_src/opts/SkOpts_sse41.cpp" ]
sse42 = [ "$_src/opts/SkOpts_sse42.cpp" ]
avx = [ "$_src/opts/SkOpts_avx.cpp" ]
hsw = [ "$_src/opts/SkOpts_hsw.cpp" ]


@ -24,7 +24,7 @@ skia_opts = {
sse41_sources = sse41
sse42_sources = sse42
avx_sources = avx
hsw_sources = hsw
hsw_sources = [] # remove after we update Chrome
}
# Skia Chromium defines. These flags will be defined in chromium If these

site/dev/contrib/jumper.md Normal file

@ -0,0 +1,103 @@
Contributing to SkJumper
========================
SkJumper is the execution engine of SkRasterPipeline, a system we've been using
to accelerate CPU-bound work inside Skia, most notably color-space conversions
and color-correct drawing.
(This is where I'd put my link to a design document if I had one...)
SkJumper is more annoying to contribute to than most Skia code because of its
offline compilation step. You'll need particular tools installed on your
machine and to tell GN about them. This document is designed to guide you
through this process and ease some of that annoyance.
One-time Setup
--------------
To generate stage code you need Clang 5.0, objdump, and ccache. It's best that
Clang is exactly the same version we typically use (as of writing 5.0.0) and
you'll need objdump to be compiled with support for x86-64, ARMv7, and ARMv8.
The easiest way to satisfy these constraints is to get your hands on a Mac and
install Xcode, Xcode command line tools, and [Homebrew](https://brew.sh). Once
you have `brew` installed, run these commands to get the tools you need:
<!--?prettify lang=sh?-->
ls -d /usr/include >/dev/null || xcode-select --install
brew install llvm binutils ccache
Setting up GN
-------------------------
With your tools installed, tell GN about them
skia_jumper_clang = path/to/clang-5.0
skia_jumper_objdump = path/to/gobjdump
skia_jumper_ccache = path/to/ccache
then regenerate and build as normal.
If you look in your GN out directory, you should now see a bunch of `.o` files,
and `git status` should show no changes to `src/jumper/SkJumper_generated*.S`.
That's good. Those object files are the intermediates we parse to produce
the assembly files. We just leave them around in case you want to look at
them yourself.
Make A Change
-------------
Let's use the `from_srgb` stage as a little playground to make a real change.
Linearizing sRGB encoded bytes is slow, so let's pretend we've decided to trade
quality for speed, approximating the existing implementation with a simple square.
Open up `SkJumper_stages.cpp` and find the `from_srgb` stage. It'll look like
<!--?prettify lang=cc?-->
STAGE(from_srgb) {
r = from_srgb(r);
g = from_srgb(g);
b = from_srgb(b);
}
Let's replace whatever's there with our fast approximation:
<!--?prettify lang=cc?-->
STAGE(from_srgb) {
r *= r;
g *= g;
b *= b;
}
When you save and re-Ninja, you should now see changes to
`src/jumper/SkJumper_generated.S` and `src/jumper/SkJumper_generated_win.S`.
If you can't read assembly, no big deal. If you can, run `git diff`. You
should see the various `sk_from_srgb_*` functions get dramatically simpler,
something like three multiplies and a couple other bookkeeping instructions.
It's not unusual for isolated changes in one stage to cause seemingly unrelated
changes in another. When adding or removing any code you'll usually see all
the comments in branch instructions change a little bit, but the actual
instruction on the left won't change. When adding or removing uses of
constants, you'll often see both the comment and instruction on the left change
for other loads of constants from memory, especially on x86-64. You'll also
see some code that looks like garbage change; those are the constants. If
any of this worries you, please do go running to someone who knows more for
help, but odds are everything is fine.
At this point things should just be business as usual. Any time you change
`SkJumper_stages.cpp`, Ninja ought to notice and regenerate the assembly files.
Adding a new Stage
------------------
Adding a new stage is a lot like changing an existing stage. Edit
`SkJumper_stages.cpp`, build Skia, test, repeat until correct.
You'll just need to also edit `SkRasterPipeline.h` to add your new stage to the
macro listing all the stages. The stage name is the handle normal Skia code
uses to refer to the stage abstractly, and the wiring between
`SkRasterPipeline::foo` and `STAGE(foo) { ... }` should work automatically.
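For example, adding a hypothetical do-nothing stage (the name `my_new_stage` is
made up, just to sketch the shape of the change) would look roughly like

<!--?prettify lang=cc?-->

    // In SkRasterPipeline.h, add M(my_new_stage) to the stage-listing macro.
    // In SkJumper_stages.cpp, define its behavior:
    STAGE(my_new_stage) {
        // r,g,b,a (and the dst channels) are in scope here;
        // leaving them untouched makes this a no-op.
    }

After that, `SkRasterPipeline::my_new_stage` can be appended to pipelines like
any other stage.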


@ -33,18 +33,6 @@ link Skia against the headers and libraries found on the system paths.
use `extra_cflags` and `extra_ldflags` to add include or library paths if
needed.
A note on software backend performance
--------------------------------------
A number of routines in Skia's software backend have been written to run
fastest when compiled by Clang. If you depend on software rasterization, image
decoding, or color space conversion and compile Skia with GCC, MSVC or another
compiler, you will see dramatically worse performance than if you use Clang.
This choice was only a matter of prioritization; there is nothing fundamentally
wrong with non-Clang compilers. So if this is a serious issue for you, please
let us know on the mailing list.
Quickstart
----------


@ -40,7 +40,6 @@
#include "SkBlitRow_opts.h"
#include "SkChecksum_opts.h"
#include "SkMorphologyImageFilter_opts.h"
#include "SkRasterPipeline_opts.h"
#include "SkSwizzler_opts.h"
#include "SkUtils_opts.h"
#include "SkXfermode_opts.h"
@ -82,26 +81,11 @@ namespace SkOpts {
#undef DEFINE_DEFAULT
#define M(st) (StageFn)SK_OPTS_NS::st,
StageFn stages_highp[] = { SK_RASTER_PIPELINE_STAGES(M) };
StageFn just_return_highp = (StageFn)SK_OPTS_NS::just_return;
void (*start_pipeline_highp)(size_t,size_t,size_t,size_t,void**)
= SK_OPTS_NS::start_pipeline;
#undef M
#define M(st) (StageFn)SK_OPTS_NS::lowp::st,
StageFn stages_lowp[] = { SK_RASTER_PIPELINE_STAGES(M) };
StageFn just_return_lowp = (StageFn)SK_OPTS_NS::lowp::just_return;
void (*start_pipeline_lowp)(size_t,size_t,size_t,size_t,void**)
= SK_OPTS_NS::lowp::start_pipeline;
#undef M
// Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
void Init_ssse3();
void Init_sse41();
void Init_sse42();
void Init_avx();
void Init_hsw();
void Init_crc32();
static void init() {
@ -120,8 +104,7 @@ namespace SkOpts {
#endif
#if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_AVX
if (SkCpu::Supports(SkCpu::AVX)) { Init_avx(); }
if (SkCpu::Supports(SkCpu::HSW)) { Init_hsw(); }
if (SkCpu::Supports(SkCpu::AVX )) { Init_avx(); }
#endif
#elif defined(SK_CPU_ARM64)


@ -54,17 +54,6 @@ namespace SkOpts {
static inline uint32_t hash(const void* data, size_t bytes, uint32_t seed=0) {
return hash_fn(data, bytes, seed);
}
#define M(st) +1
// We can't necessarily express the type of SkJumper stage functions here,
// so we just use this void(*)(void) as a stand-in.
using StageFn = void(*)(void);
extern StageFn stages_highp[SK_RASTER_PIPELINE_STAGES(M)], just_return_highp;
extern StageFn stages_lowp [SK_RASTER_PIPELINE_STAGES(M)], just_return_lowp;
extern void (*start_pipeline_highp)(size_t,size_t,size_t,size_t, void**);
extern void (*start_pipeline_lowp )(size_t,size_t,size_t,size_t, void**);
#undef M
}
#endif//SkOpts_DEFINED


@ -17,6 +17,8 @@
#include <functional>
#include <vector>
struct SkJumper_Engine;
/**
* SkRasterPipeline provides a cheap way to chain together a pixel processing pipeline.
*
@ -161,9 +163,7 @@ private:
void* ctx;
};
using StartPipelineFn = void(*)(size_t,size_t,size_t,size_t, void** program);
StartPipelineFn build_pipeline(void**) const;
const SkJumper_Engine& build_pipeline(void**) const;
void unchecked_append(StockStage, void*);
SkArenaAlloc* fAlloc;


@ -5,46 +5,468 @@
* found in the LICENSE file.
*/
#include "SkColorData.h"
#include "SkCpu.h"
#include "SkJumper.h"
#include "SkOpts.h"
#include "SkOnce.h"
#include "SkRasterPipeline.h"
#include "SkTemplates.h"
SkRasterPipeline::StartPipelineFn SkRasterPipeline::build_pipeline(void** ip) const {
#ifndef SK_JUMPER_DISABLE_8BIT
// We'll try to build a lowp pipeline, but if that fails fallback to a highp float pipeline.
void** reset_point = ip;
#if !defined(SK_JUMPER_USE_ASSEMBLY)
// We'll use __has_feature(memory_sanitizer) to detect MSAN.
// SkJumper_generated.S is not compiled with MSAN, so MSAN would yell really loud.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
// Stages are stored backwards in fStages, so we reverse here, back to front.
*--ip = (void*)SkOpts::just_return_lowp;
#if 0 || __has_feature(memory_sanitizer)
#define SK_JUMPER_USE_ASSEMBLY 0
#else
#define SK_JUMPER_USE_ASSEMBLY 1
#endif
#endif
#define M(st) +1
static const int kNumStages = SK_RASTER_PIPELINE_STAGES(M);
#undef M
#ifndef SK_JUMPER_DISABLE_8BIT
// Intentionally commented out; optional logging for local debugging.
#if 0 && SK_JUMPER_USE_ASSEMBLY && (defined(__x86_64__) || defined(_M_X64))
#include <atomic>
#define M(st) #st,
static const char* kStageNames[] = { SK_RASTER_PIPELINE_STAGES(M) };
#undef M
static std::atomic<int> gMissingStageCounters[kNumStages];
static void log_missing(SkRasterPipeline::StockStage st) {
static SkOnce once;
once([] { atexit([] {
int total = 0;
for (int i = 0; i < kNumStages; i++) {
if (int count = gMissingStageCounters[i].load()) {
SkDebugf("%7d\t%s\n", count, kStageNames[i]);
total += count;
}
}
SkDebugf("%7d total\n", total);
}); });
gMissingStageCounters[st]++;
}
#else
static void log_missing(SkRasterPipeline::StockStage) {}
#endif
#endif
// We can't express the real types of most stage functions portably, so we use a stand-in.
// We'll only ever call start_pipeline(), which then chains into the rest.
using StageFn = void(void);
using StartPipelineFn = void(size_t,size_t,size_t,size_t, void**);
// Some platforms expect C "name" maps to asm "_name", others to "name".
#if defined(__APPLE__)
#define ASM(name, suffix) sk_##name##_##suffix
#else
#define ASM(name, suffix) _sk_##name##_##suffix
#endif
extern "C" {
#if !SK_JUMPER_USE_ASSEMBLY
// We'll just run baseline code.
#elif defined(__x86_64__) || defined(_M_X64)
StartPipelineFn ASM(start_pipeline, skx),
ASM(start_pipeline, hsw),
ASM(start_pipeline, avx),
ASM(start_pipeline, sse41),
ASM(start_pipeline, sse2),
ASM(start_pipeline, hsw_lowp),
ASM(start_pipeline,sse41_lowp),
ASM(start_pipeline, sse2_lowp);
StageFn ASM(just_return, skx),
ASM(just_return, hsw),
ASM(just_return, avx),
ASM(just_return, sse41),
ASM(just_return, sse2),
ASM(just_return, hsw_lowp),
ASM(just_return,sse41_lowp),
ASM(just_return, sse2_lowp);
#define M(st) StageFn ASM(st, skx), \
ASM(st, hsw), \
ASM(st, avx), \
ASM(st,sse41), \
ASM(st, sse2), \
ASM(st, hsw_lowp), \
ASM(st,sse41_lowp), \
ASM(st, sse2_lowp);
SK_RASTER_PIPELINE_STAGES(M)
#undef M
#elif defined(__i386__) || defined(_M_IX86)
StartPipelineFn ASM(start_pipeline,sse2),
ASM(start_pipeline,sse2_lowp);
StageFn ASM(just_return,sse2),
ASM(just_return,sse2_lowp);
#define M(st) StageFn ASM(st,sse2), \
ASM(st,sse2_lowp);
SK_RASTER_PIPELINE_STAGES(M)
#undef M
#endif
// Baseline code compiled as a normal part of Skia.
StartPipelineFn sk_start_pipeline;
StageFn sk_just_return;
#define M(st) StageFn sk_##st;
SK_RASTER_PIPELINE_STAGES(M)
#undef M
#if defined(JUMPER_HAS_NEON_LOWP)
StartPipelineFn sk_start_pipeline_lowp;
StageFn sk_just_return_lowp;
#define M(st) StageFn sk_##st##_lowp;
SK_RASTER_PIPELINE_STAGES(M)
#undef M
#endif
}
#if SK_JUMPER_USE_ASSEMBLY
#if defined(__x86_64__) || defined(_M_X64)
template <SkRasterPipeline::StockStage st>
static constexpr StageFn* hsw_lowp();
template <SkRasterPipeline::StockStage st>
static constexpr StageFn* sse41_lowp();
template <SkRasterPipeline::StockStage st>
static constexpr StageFn* sse2_lowp();
#define LOWP(st) \
template <> constexpr StageFn* hsw_lowp<SkRasterPipeline::st>() { \
return ASM(st,hsw_lowp); \
} \
template <> constexpr StageFn* sse41_lowp<SkRasterPipeline::st>() { \
return ASM(st,sse41_lowp); \
} \
template <> constexpr StageFn* sse2_lowp<SkRasterPipeline::st>() { \
return ASM(st,sse2_lowp); \
}
#define NOPE(st) \
template <> constexpr StageFn* hsw_lowp<SkRasterPipeline::st>() { \
return nullptr; \
} \
template <> constexpr StageFn* sse41_lowp<SkRasterPipeline::st>() { \
return nullptr; \
} \
template <> constexpr StageFn* sse2_lowp<SkRasterPipeline::st>() { \
return nullptr; \
}
#elif defined(__i386__) || defined(_M_IX86)
template <SkRasterPipeline::StockStage st>
static constexpr StageFn* sse2_lowp();
#define LOWP(st) \
template <> constexpr StageFn* sse2_lowp<SkRasterPipeline::st>() { \
return ASM(st,sse2_lowp); \
}
#define NOPE(st) \
template <> constexpr StageFn* sse2_lowp<SkRasterPipeline::st>() { \
return nullptr; \
}
#elif defined(JUMPER_HAS_NEON_LOWP)
template <SkRasterPipeline::StockStage st>
static constexpr StageFn* neon_lowp();
#define LOWP(st) \
template <> constexpr StageFn* neon_lowp<SkRasterPipeline::st>() { \
return sk_##st##_lowp; \
}
#define NOPE(st) \
template <> constexpr StageFn* neon_lowp<SkRasterPipeline::st>() { \
return nullptr; \
}
#else
#define LOWP(st)
#define NOPE(st)
#endif
#define TODO(st) NOPE(st) // stages that should be implemented in lowp, but aren't.
NOPE(callback)
LOWP(move_src_dst) LOWP(move_dst_src)
NOPE(clamp_0) NOPE(clamp_1) LOWP(clamp_a) LOWP(clamp_a_dst)
NOPE(unpremul) LOWP(premul) LOWP(premul_dst)
LOWP(force_opaque) LOWP(force_opaque_dst)
LOWP(set_rgb) LOWP(swap_rb) LOWP(invert)
NOPE(from_srgb) NOPE(from_srgb_dst) NOPE(to_srgb)
LOWP(black_color) LOWP(white_color) LOWP(uniform_color)
LOWP(seed_shader) NOPE(dither)
LOWP(load_a8) LOWP(load_a8_dst) LOWP(store_a8) LOWP(gather_a8)
LOWP(load_g8) LOWP(load_g8_dst) LOWP(gather_g8)
LOWP(load_565) LOWP(load_565_dst) LOWP(store_565) LOWP(gather_565)
LOWP(load_4444) LOWP(load_4444_dst) LOWP(store_4444) LOWP(gather_4444)
NOPE(load_f16) NOPE(load_f16_dst) NOPE(store_f16) NOPE(gather_f16)
NOPE(load_f32) NOPE(load_f32_dst) NOPE(store_f32)
LOWP(load_8888) LOWP(load_8888_dst) LOWP(store_8888) LOWP(gather_8888)
LOWP(load_bgra) LOWP(load_bgra_dst) LOWP(store_bgra) LOWP(gather_bgra)
NOPE(load_1010102) NOPE(load_1010102_dst) NOPE(store_1010102) NOPE(gather_1010102)
TODO(bilerp_clamp_8888)
TODO(load_u16_be) TODO(load_rgb_u16_be) TODO(store_u16_be)
NOPE(load_tables_u16_be) NOPE(load_tables_rgb_u16_be) NOPE(load_tables)
NOPE(load_rgba) NOPE(store_rgba)
LOWP(scale_u8) LOWP(scale_565) LOWP(scale_1_float)
LOWP( lerp_u8) LOWP( lerp_565) LOWP( lerp_1_float)
LOWP(dstatop) LOWP(dstin) LOWP(dstout) LOWP(dstover)
LOWP(srcatop) LOWP(srcin) LOWP(srcout) LOWP(srcover)
LOWP(clear) LOWP(modulate) LOWP(multiply) LOWP(plus_) LOWP(screen) LOWP(xor_)
NOPE(colorburn) NOPE(colordodge) LOWP(darken) LOWP(difference)
LOWP(exclusion) LOWP(hardlight) LOWP(lighten) LOWP(overlay) NOPE(softlight)
NOPE(hue) NOPE(saturation) NOPE(color) NOPE(luminosity)
LOWP(srcover_rgba_8888) LOWP(srcover_bgra_8888)
LOWP(luminance_to_alpha)
LOWP(matrix_translate) LOWP(matrix_scale_translate)
LOWP(matrix_2x3) NOPE(matrix_3x4) TODO(matrix_4x5) TODO(matrix_4x3)
LOWP(matrix_perspective)
NOPE(parametric_r) NOPE(parametric_g) NOPE(parametric_b)
NOPE(parametric_a) NOPE(gamma) NOPE(gamma_dst)
NOPE(table_r) NOPE(table_g) NOPE(table_b) NOPE(table_a)
NOPE(lab_to_xyz)
TODO(mirror_x) TODO(repeat_x)
TODO(mirror_y) TODO(repeat_y)
LOWP(decal_x) LOWP(decal_y) LOWP(decal_x_and_y)
LOWP(check_decal_mask)
TODO(bilinear_nx) TODO(bilinear_px) TODO(bilinear_ny) TODO(bilinear_py)
TODO(bicubic_n3x) TODO(bicubic_n1x) TODO(bicubic_p1x) TODO(bicubic_p3x)
TODO(bicubic_n3y) TODO(bicubic_n1y) TODO(bicubic_p1y) TODO(bicubic_p3y)
TODO(save_xy) TODO(accumulate)
LOWP(clamp_x_1) LOWP(mirror_x_1) LOWP(repeat_x_1)
LOWP(evenly_spaced_gradient)
LOWP(gradient)
LOWP(evenly_spaced_2_stop_gradient)
LOWP(xy_to_unit_angle)
LOWP(xy_to_radius)
TODO(negate_x)
TODO(xy_to_2pt_conical_strip)
TODO(xy_to_2pt_conical_focal_on_circle)
TODO(xy_to_2pt_conical_well_behaved)
TODO(xy_to_2pt_conical_greater)
TODO(xy_to_2pt_conical_smaller)
TODO(alter_2pt_conical_compensate_focal)
TODO(alter_2pt_conical_unswap)
TODO(mask_2pt_conical_nan) TODO(mask_2pt_conical_degenerates) TODO(apply_vector_mask)
TODO(byte_tables) TODO(byte_tables_rgb)
NOPE(rgb_to_hsl) NOPE(hsl_to_rgb)
NOPE(clut_3D) NOPE(clut_4D)
NOPE(gauss_a_to_rgba)
#undef LOWP
#undef TODO
#undef NOPE
#endif
// Engines comprise everything we need to run SkRasterPipelines.
struct SkJumper_Engine {
StageFn* stages[kNumStages];
StartPipelineFn* start_pipeline;
StageFn* just_return;
};
// We'll default to this baseline engine, but try to choose a better one at runtime.
static const SkJumper_Engine kBaseline = {
#define M(stage) sk_##stage,
{ SK_RASTER_PIPELINE_STAGES(M) },
#undef M
sk_start_pipeline,
sk_just_return,
};
static SkJumper_Engine gEngine = kBaseline;
static SkOnce gChooseEngineOnce;
static SkJumper_Engine choose_engine() {
#if !SK_JUMPER_USE_ASSEMBLY
// We'll just run baseline code.
#elif defined(__x86_64__) || defined(_M_X64)
#if !defined(_MSC_VER) // No _skx stages for Windows yet.
if (1 && SkCpu::Supports(SkCpu::SKX)) {
return {
#define M(stage) ASM(stage, skx),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline)
M(just_return)
#undef M
};
}
#endif
if (1 && SkCpu::Supports(SkCpu::HSW)) {
return {
#define M(stage) ASM(stage, hsw),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline)
M(just_return)
#undef M
};
}
if (1 && SkCpu::Supports(SkCpu::AVX)) {
return {
#define M(stage) ASM(stage, avx),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline)
M(just_return)
#undef M
};
}
if (1 && SkCpu::Supports(SkCpu::SSE41)) {
return {
#define M(stage) ASM(stage, sse41),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline)
M(just_return)
#undef M
};
}
if (1 && SkCpu::Supports(SkCpu::SSE2)) {
return {
#define M(stage) ASM(stage, sse2),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline)
M(just_return)
#undef M
};
}
#elif defined(__i386__) || defined(_M_IX86)
if (1 && SkCpu::Supports(SkCpu::SSE2)) {
return {
#define M(stage) ASM(stage, sse2),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline)
M(just_return)
#undef M
};
}
#endif
return kBaseline;
}
#ifndef SK_JUMPER_DISABLE_8BIT
static const SkJumper_Engine kNone = {
#define M(stage) nullptr,
{ SK_RASTER_PIPELINE_STAGES(M) },
#undef M
nullptr,
nullptr,
};
static SkJumper_Engine gLowp = kNone;
static SkOnce gChooseLowpOnce;
static SkJumper_Engine choose_lowp() {
#if SK_JUMPER_USE_ASSEMBLY
#if defined(__x86_64__) || defined(_M_X64)
if (1 && SkCpu::Supports(SkCpu::HSW)) {
return {
#define M(st) hsw_lowp<SkRasterPipeline::st>(),
{ SK_RASTER_PIPELINE_STAGES(M) },
ASM(start_pipeline,hsw_lowp),
ASM(just_return ,hsw_lowp),
#undef M
};
}
if (1 && SkCpu::Supports(SkCpu::SSE41)) {
return {
#define M(st) sse41_lowp<SkRasterPipeline::st>(),
{ SK_RASTER_PIPELINE_STAGES(M) },
ASM(start_pipeline,sse41_lowp),
ASM(just_return ,sse41_lowp),
#undef M
};
}
if (1 && SkCpu::Supports(SkCpu::SSE2)) {
return {
#define M(st) sse2_lowp<SkRasterPipeline::st>(),
{ SK_RASTER_PIPELINE_STAGES(M) },
ASM(start_pipeline,sse2_lowp),
ASM(just_return ,sse2_lowp),
#undef M
};
}
#elif defined(__i386__) || defined(_M_IX86)
if (1 && SkCpu::Supports(SkCpu::SSE2)) {
return {
#define M(st) sse2_lowp<SkRasterPipeline::st>(),
{ SK_RASTER_PIPELINE_STAGES(M) },
ASM(start_pipeline,sse2_lowp),
ASM(just_return ,sse2_lowp),
#undef M
};
}
#elif defined(JUMPER_HAS_NEON_LOWP)
return {
#define M(st) neon_lowp<SkRasterPipeline::st>(),
{ SK_RASTER_PIPELINE_STAGES(M) },
sk_start_pipeline_lowp,
sk_just_return_lowp,
#undef M
};
#endif
#endif
return kNone;
}
#endif
const SkJumper_Engine& SkRasterPipeline::build_pipeline(void** ip) const {
#ifndef SK_JUMPER_DISABLE_8BIT
gChooseLowpOnce([]{ gLowp = choose_lowp(); });
// First try to build a lowp pipeline. If that fails, fall back to normal float gEngine.
void** reset_point = ip;
*--ip = (void*)gLowp.just_return;
for (const StageList* st = fStages; st; st = st->prev) {
if (st->stage == SkRasterPipeline::clamp_0 ||
st->stage == SkRasterPipeline::clamp_1) {
continue; // No-ops in lowp.
}
if (auto fn = SkOpts::stages_lowp[st->stage]) {
if (StageFn* fn = gLowp.stages[st->stage]) {
if (st->ctx) {
*--ip = st->ctx;
}
*--ip = (void*)fn;
} else {
log_missing(st->stage);
ip = reset_point;
break;
}
}
if (ip != reset_point) {
return SkOpts::start_pipeline_lowp;
return gLowp;
}
#endif
*--ip = (void*)SkOpts::just_return_highp;
gChooseEngineOnce([]{ gEngine = choose_engine(); });
// We're building the pipeline backwards, so we start with the final stage just_return.
*--ip = (void*)gEngine.just_return;
// Still going backwards, each stage's context pointer then its StageFn.
for (const StageList* st = fStages; st; st = st->prev) {
if (st->ctx) {
*--ip = st->ctx;
}
*--ip = (void*)SkOpts::stages_highp[st->stage];
*--ip = (void*)gEngine.stages[st->stage];
}
return SkOpts::start_pipeline_highp;
return gEngine;
}
void SkRasterPipeline::run(size_t x, size_t y, size_t w, size_t h) const {
@ -55,8 +477,8 @@ void SkRasterPipeline::run(size_t x, size_t y, size_t w, size_t h) const {
// Best to not use fAlloc here... we can't bound how often run() will be called.
SkAutoSTMalloc<64, void*> program(fSlotsNeeded);
auto start_pipeline = this->build_pipeline(program.get() + fSlotsNeeded);
start_pipeline(x,y,x+w,y+h, program.get());
const SkJumper_Engine& engine = this->build_pipeline(program.get() + fSlotsNeeded);
engine.start_pipeline(x,y,x+w,y+h, program.get());
}
std::function<void(size_t, size_t, size_t, size_t)> SkRasterPipeline::compile() const {
@ -65,8 +487,9 @@ std::function<void(size_t, size_t, size_t, size_t)> SkRasterPipeline::compile()
}
void** program = fAlloc->makeArray<void*>(fSlotsNeeded);
const SkJumper_Engine& engine = this->build_pipeline(program + fSlotsNeeded);
auto start_pipeline = this->build_pipeline(program + fSlotsNeeded);
auto start_pipeline = engine.start_pipeline;
return [=](size_t x, size_t y, size_t w, size_t h) {
start_pipeline(x,y,x+w,y+h, program);
};


@ -11,11 +11,34 @@
#include <stddef.h>
#include <stdint.h>
// This file contains definitions shared by SkJumper.cpp/SkJumper_stages.cpp
// and the rest of Skia. It is important to keep the interface to SkJumper
// limited and simple to avoid serious ODR violation pitfalls, especially when
// using Microsoft's <math.h> and similar headers with inline-but-not-static
// function definitions.
// This file contains definitions shared by SkJumper.cpp (compiled normally as part of Skia)
// and SkJumper_stages.cpp (compiled into Skia _and_ offline into SkJumper_generated.h).
// Keep it simple!
// Externally facing functions (start_pipeline) are called a little specially on Windows.
#if defined(JUMPER_IS_OFFLINE) && defined(WIN) && defined(__x86_64__)
#define MAYBE_MSABI __attribute__((ms_abi)) // Use MS' ABI, not System V.
#elif defined(JUMPER_IS_OFFLINE) && defined(WIN) && defined(__i386__)
#define MAYBE_MSABI __attribute__((force_align_arg_pointer)) // Re-align stack 4 -> 16 bytes.
#else
#define MAYBE_MSABI
#endif
// Any custom ABI to use for all non-externally-facing stage functions.
#if defined(__ARM_NEON) && defined(__arm__)
// This lets us pass vectors more efficiently on 32-bit ARM.
#define ABI __attribute__((pcs("aapcs-vfp")))
#else
#define ABI
#endif
// On ARM we expect that you're using Clang if you want SkJumper to be fast.
// If you are, the baseline float stages will use NEON, and lowp stages will
// also be available. (If somehow you're building for ARM not using Clang,
// you'll get scalar baseline stages and no lowp support.)
#if defined(__clang__) && defined(__ARM_NEON)
#define JUMPER_HAS_NEON_LOWP
#endif
static const int SkJumper_kMaxStride = 16;
@ -53,7 +76,7 @@ struct SkJumper_DecalTileCtx {
};
struct SkJumper_CallbackCtx {
void (*fn)(SkJumper_CallbackCtx* self, int active_pixels/*<= SkJumper_kMaxStride*/);
MAYBE_MSABI void (*fn)(SkJumper_CallbackCtx* self, int active_pixels/*<= SkJumper_kMaxStride*/);
// When called, fn() will have our active pixels available in rgba.
// When fn() returns, the pipeline will read back those active pixels from read_from.

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -0,0 +1,961 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
// This restricted SkJumper backend works on 8-bit per channel pixels stored in
// 16-bit channels. This is a last attempt to write a performant low-precision
// backend with stage definitions that can be shared by x86 and ARM.
#include "SkJumper.h"
#include "SkJumper_misc.h"
#if defined(__clang__) // This file is empty when not compiled by Clang.
#if defined(__ARM_NEON)
#include <arm_neon.h>
#elif defined(__SSE2__)
#include <immintrin.h>
#else
#include <math.h>
#endif
#if !defined(JUMPER_IS_OFFLINE)
#define WRAP(name) sk_##name##_lowp
#elif defined(__AVX2__)
#define WRAP(name) sk_##name##_hsw_lowp
#elif defined(__SSE4_1__)
#define WRAP(name) sk_##name##_sse41_lowp
#elif defined(__SSE2__)
#define WRAP(name) sk_##name##_sse2_lowp
#endif
#if defined(__AVX2__)
using U8 = uint8_t __attribute__((ext_vector_type(16)));
using U16 = uint16_t __attribute__((ext_vector_type(16)));
using I16 = int16_t __attribute__((ext_vector_type(16)));
using I32 = int32_t __attribute__((ext_vector_type(16)));
using U32 = uint32_t __attribute__((ext_vector_type(16)));
using F = float __attribute__((ext_vector_type(16)));
#else
using U8 = uint8_t __attribute__((ext_vector_type(8)));
using U16 = uint16_t __attribute__((ext_vector_type(8)));
using I16 = int16_t __attribute__((ext_vector_type(8)));
using I32 = int32_t __attribute__((ext_vector_type(8)));
using U32 = uint32_t __attribute__((ext_vector_type(8)));
using F = float __attribute__((ext_vector_type(8)));
#endif
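// N is the number of pixels handled per pass: 16 lanes under AVX2, 8 otherwise.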
static const size_t N = sizeof(U16) / sizeof(uint16_t);
// We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
U16 r, U16 g, U16 b, U16 a,
U16 dr, U16 dg, U16 db, U16 da);
extern "C" MAYBE_MSABI void WRAP(start_pipeline)(const size_t x0,
const size_t y0,
const size_t xlimit,
const size_t ylimit,
void** program) {
auto start = (Stage)load_and_inc(program);
for (size_t dy = y0; dy < ylimit; dy++) {
size_t dx = x0;
for (; dx + N <= xlimit; dx += N) {
start( 0,program,dx,dy, 0,0,0,0, 0,0,0,0);
}
if (size_t tail = xlimit - dx) {
start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
}
}
}
extern "C" ABI void WRAP(just_return)(size_t,void**,size_t,size_t,
U16,U16,U16,U16, U16,U16,U16,U16) {}
// All stages use the same function call ABI to chain into each other, but there are three types:
// GG: geometry in, geometry out -- think, a matrix
// GP: geometry in, pixels out. -- think, a memory gather
// PP: pixels in, pixels out. -- think, a blend mode
//
// (Some stages ignore their inputs or produce no logical output. That's perfectly fine.)
//
// These three STAGE_ macros let you define each type of stage,
// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.
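//
// For instance, a hypothetical context-free pixels-in/pixels-out stage would be written as
//     STAGE_PP(my_stage, Ctx::None) { r = inv(r); g = inv(g); b = inv(b); a = inv(a); }
// (my_stage is a made-up name; the real stages follow below.)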
#define STAGE_GG(name, ...) \
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
U16 r, U16 g, U16 b, U16 a, \
U16 dr, U16 dg, U16 db, U16 da) { \
auto x = join<F>(r,g), \
y = join<F>(b,a); \
name##_k(Ctx{program}, dx,dy,tail, x,y); \
split(x, &r,&g); \
split(y, &b,&a); \
auto next = (Stage)load_and_inc(program); \
next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
} \
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
#define STAGE_GP(name, ...) \
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
U16& r, U16& g, U16& b, U16& a, \
U16& dr, U16& dg, U16& db, U16& da); \
extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
U16 r, U16 g, U16 b, U16 a, \
U16 dr, U16 dg, U16 db, U16 da) { \
auto x = join<F>(r,g), \
y = join<F>(b,a); \
name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \
auto next = (Stage)load_and_inc(program); \
next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
} \
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
U16& r, U16& g, U16& b, U16& a, \
U16& dr, U16& dg, U16& db, U16& da)
#define STAGE_PP(name, ...) \
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
U16& r, U16& g, U16& b, U16& a, \
U16& dr, U16& dg, U16& db, U16& da); \
extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
U16 r, U16 g, U16 b, U16 a, \
U16 dr, U16 dg, U16 db, U16 da) { \
name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \
auto next = (Stage)load_and_inc(program); \
next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
} \
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
U16& r, U16& g, U16& b, U16& a, \
U16& dr, U16& dg, U16& db, U16& da)
// ~~~~~~ Commonly used helper functions ~~~~~~ //
SI U16 div255(U16 v) {
#if 0
return (v+127)/255; // The ideal rounding divide by 255.
#else
return (v+255)/256; // A good approximation of (v+127)/255.
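// For v in [0, 255*255] this differs from the exact rounding divide by at most 1.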
#endif
}
SI U16 inv(U16 v) { return 255-v; }
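// In these helpers the condition c comes from a vector comparison, so each lane is all 1s (true) or all 0s (false).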
SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); }
SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); }
SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); }
SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); }
SI U16 max(U16 x, U16 y, U16 z) { return max(x, max(y, z)); }
SI U16 min(U16 x, U16 y, U16 z) { return min(x, min(y, z)); }
SI U16 from_float(float f) { return f * 255.0f + 0.5f; }
SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); }
template <typename D, typename S>
SI D cast(S src) {
return __builtin_convertvector(src, D);
}
template <typename D, typename S>
SI void split(S v, D* lo, D* hi) {
static_assert(2*sizeof(D) == sizeof(S), "");
memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
}
template <typename D, typename S>
SI D join(S lo, S hi) {
static_assert(sizeof(D) == 2*sizeof(S), "");
D v;
memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
return v;
}
template <typename V, typename H>
SI V map(V v, H (*fn)(H)) {
H lo,hi;
split(v, &lo,&hi);
lo = fn(lo);
hi = fn(hi);
return join<V>(lo,hi);
}
// TODO: do we need platform-specific intrinsics for any of these?
SI F if_then_else(I32 c, F t, F e) {
return bit_cast<F>( (bit_cast<I32>(t) & c) | (bit_cast<I32>(e) & ~c) );
}
SI F max(F x, F y) { return if_then_else(x < y, y, x); }
SI F min(F x, F y) { return if_then_else(x < y, x, y); }
SI F mad(F f, F m, F a) { return f*m+a; }
SI U32 trunc_(F x) { return (U32)cast<I32>(x); }
SI F rcp(F x) {
#if defined(__AVX2__)
return map(x, _mm256_rcp_ps);
#elif defined(__SSE__)
return map(x, _mm_rcp_ps);
#elif defined(__ARM_NEON)
return map(x, +[](float32x4_t v) {
auto est = vrecpeq_f32(v);
return vrecpsq_f32(v,est)*est;
});
#else
return 1.0f / x;
#endif
}
SI F sqrt_(F x) {
#if defined(__AVX2__)
return map(x, _mm256_sqrt_ps);
#elif defined(__SSE__)
return map(x, _mm_sqrt_ps);
#elif defined(__aarch64__)
return map(x, vsqrtq_f32);
#elif defined(__ARM_NEON)
return map(x, +[](float32x4_t v) {
auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v).
est *= vrsqrtsq_f32(v,est*est);
est *= vrsqrtsq_f32(v,est*est);
return v*est; // sqrt(v) == v*rsqrt(v).
});
#else
return F{
sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]),
sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]),
};
#endif
}
SI F floor_(F x) {
#if defined(__aarch64__)
return map(x, vrndmq_f32);
#elif defined(__AVX2__)
return map(x, +[](__m256 v){ return _mm256_floor_ps(v); }); // _mm256_floor_ps is a macro...
#elif defined(__SSE4_1__)
return map(x, +[](__m128 v){ return _mm_floor_ps(v); }); // _mm_floor_ps() is a macro too.
#else
F roundtrip = cast<F>(cast<I32>(x));
return roundtrip - if_then_else(roundtrip > x, F(1), F(0));
#endif
}
SI F abs_(F x) { return bit_cast<F>( bit_cast<I32>(x) & 0x7fffffff ); }
// ~~~~~~ Basic / misc. stages ~~~~~~ //
STAGE_GG(seed_shader, const float* iota) {
x = cast<F>(I32(dx)) + unaligned_load<F>(iota);
y = cast<F>(I32(dy)) + 0.5f;
}
STAGE_GG(matrix_translate, const float* m) {
x += m[0];
y += m[1];
}
STAGE_GG(matrix_scale_translate, const float* m) {
x = mad(x,m[0], m[2]);
y = mad(y,m[1], m[3]);
}
STAGE_GG(matrix_2x3, const float* m) {
auto X = mad(x,m[0], mad(y,m[2], m[4])),
Y = mad(x,m[1], mad(y,m[3], m[5]));
x = X;
y = Y;
}
STAGE_GG(matrix_perspective, const float* m) {
// N.B. Unlike the other matrix_ stages, this matrix is row-major.
auto X = mad(x,m[0], mad(y,m[1], m[2])),
Y = mad(x,m[3], mad(y,m[4], m[5])),
Z = mad(x,m[6], mad(y,m[7], m[8]));
x = X * rcp(Z);
y = Y * rcp(Z);
}
STAGE_PP(uniform_color, const SkJumper_UniformColorCtx* c) {
r = c->rgba[0];
g = c->rgba[1];
b = c->rgba[2];
a = c->rgba[3];
}
STAGE_PP(black_color, Ctx::None) { r = g = b = 0; a = 255; }
STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; }
STAGE_PP(set_rgb, const float rgb[3]) {
r = from_float(rgb[0]);
g = from_float(rgb[1]);
b = from_float(rgb[2]);
}
STAGE_PP(clamp_a, Ctx::None) {
r = min(r, a);
g = min(g, a);
b = min(b, a);
}
STAGE_PP(clamp_a_dst, Ctx::None) {
dr = min(dr, da);
dg = min(dg, da);
db = min(db, da);
}
STAGE_PP(premul, Ctx::None) {
r = div255(r * a);
g = div255(g * a);
b = div255(b * a);
}
STAGE_PP(premul_dst, Ctx::None) {
dr = div255(dr * da);
dg = div255(dg * da);
db = div255(db * da);
}
STAGE_PP(force_opaque , Ctx::None) { a = 255; }
STAGE_PP(force_opaque_dst, Ctx::None) { da = 255; }
STAGE_PP(swap_rb, Ctx::None) {
auto tmp = r;
r = b;
b = tmp;
}
STAGE_PP(move_src_dst, Ctx::None) {
dr = r;
dg = g;
db = b;
da = a;
}
STAGE_PP(move_dst_src, Ctx::None) {
r = dr;
g = dg;
b = db;
a = da;
}
STAGE_PP(invert, Ctx::None) {
r = inv(r);
g = inv(g);
b = inv(b);
a = inv(a);
}
// ~~~~~~ Blend modes ~~~~~~ //
// The same logic applied to all 4 channels.
#define BLEND_MODE(name) \
SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
STAGE_PP(name, Ctx::None) { \
r = name##_channel(r,dr,a,da); \
g = name##_channel(g,dg,a,da); \
b = name##_channel(b,db,a,da); \
a = name##_channel(a,da,a,da); \
} \
SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
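// For example, BLEND_MODE(srcover) below expands into a srcover_channel() helper plus a
// STAGE_PP(srcover) that applies it to each of r, g, b, and a.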
BLEND_MODE(clear) { return 0; }
BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); }
BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); }
BLEND_MODE(srcin) { return div255( s*da ); }
BLEND_MODE(dstin) { return div255( d*sa ); }
BLEND_MODE(srcout) { return div255( s*inv(da) ); }
BLEND_MODE(dstout) { return div255( d*inv(sa) ); }
BLEND_MODE(srcover) { return s + div255( d*inv(sa) ); }
BLEND_MODE(dstover) { return d + div255( s*inv(da) ); }
BLEND_MODE(modulate) { return div255( s*d ); }
BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
BLEND_MODE(plus_) { return min(s+d, 255); }
BLEND_MODE(screen) { return s + d - div255( s*d ); }
BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); }
#undef BLEND_MODE
// The same logic applied to color, and srcover for alpha.
#define BLEND_MODE(name) \
SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
STAGE_PP(name, Ctx::None) { \
r = name##_channel(r,dr,a,da); \
g = name##_channel(g,dg,a,da); \
b = name##_channel(b,db,a,da); \
a = a + div255( da*inv(a) ); \
} \
SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
BLEND_MODE(darken) { return s + d - div255( max(s*da, d*sa) ); }
BLEND_MODE(lighten) { return s + d - div255( min(s*da, d*sa) ); }
BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); }
BLEND_MODE(exclusion) { return s + d - 2*div255( s*d ); }
BLEND_MODE(hardlight) {
return div255( s*inv(da) + d*inv(sa) +
if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
}
BLEND_MODE(overlay) {
return div255( s*inv(da) + d*inv(sa) +
if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
}
#undef BLEND_MODE
// ~~~~~~ Helpers for interacting with memory ~~~~~~ //
template <typename T>
SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, size_t dx, size_t dy) {
return (T*)ctx->pixels + dy*ctx->stride + dx;
}
template <typename T>
SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
auto clamp = [](F v, F limit) {
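// Decrementing the bit pattern of a positive float yields the largest float strictly below it,
// turning the exclusive upper bound (width/height) into an inclusive one.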
limit = bit_cast<F>( bit_cast<U32>(limit) - 1 ); // Exclusive -> inclusive.
return min(max(0, v), limit);
};
x = clamp(x, ctx->width);
y = clamp(y, ctx->height);
*ptr = (const T*)ctx->pixels;
return trunc_(y)*ctx->stride + trunc_(x);
}
template <typename V, typename T>
SI V load(const T* ptr, size_t tail) {
V v = 0;
switch (tail & (N-1)) {
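// Cases fall through on purpose, handling trailing elements one at a time
// down to the next memcpy-able chunk.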
case 0: memcpy(&v, ptr, sizeof(v)); break;
#if defined(__AVX2__)
case 15: v[14] = ptr[14];
case 14: v[13] = ptr[13];
case 13: v[12] = ptr[12];
case 12: memcpy(&v, ptr, 12*sizeof(T)); break;
case 11: v[10] = ptr[10];
case 10: v[ 9] = ptr[ 9];
case 9: v[ 8] = ptr[ 8];
case 8: memcpy(&v, ptr, 8*sizeof(T)); break;
#endif
case 7: v[ 6] = ptr[ 6];
case 6: v[ 5] = ptr[ 5];
case 5: v[ 4] = ptr[ 4];
case 4: memcpy(&v, ptr, 4*sizeof(T)); break;
case 3: v[ 2] = ptr[ 2];
case 2: memcpy(&v, ptr, 2*sizeof(T)); break;
case 1: v[ 0] = ptr[ 0];
}
return v;
}
template <typename V, typename T>
SI void store(T* ptr, size_t tail, V v) {
switch (tail & (N-1)) {
case 0: memcpy(ptr, &v, sizeof(v)); break;
#if defined(__AVX2__)
case 15: ptr[14] = v[14];
case 14: ptr[13] = v[13];
case 13: ptr[12] = v[12];
case 12: memcpy(ptr, &v, 12*sizeof(T)); break;
case 11: ptr[10] = v[10];
case 10: ptr[ 9] = v[ 9];
case 9: ptr[ 8] = v[ 8];
case 8: memcpy(ptr, &v, 8*sizeof(T)); break;
#endif
case 7: ptr[ 6] = v[ 6];
case 6: ptr[ 5] = v[ 5];
case 5: ptr[ 4] = v[ 4];
case 4: memcpy(ptr, &v, 4*sizeof(T)); break;
case 3: ptr[ 2] = v[ 2];
case 2: memcpy(ptr, &v, 2*sizeof(T)); break;
case 1: ptr[ 0] = v[ 0];
}
}
#if defined(__AVX2__)
template <typename V, typename T>
SI V gather(const T* ptr, U32 ix) {
return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
}
template<>
F gather(const float* p, U32 ix) {
__m256i lo, hi;
split(ix, &lo, &hi);
return join<F>(_mm256_i32gather_ps(p, lo, 4),
_mm256_i32gather_ps(p, hi, 4));
}
template<>
U32 gather(const uint32_t* p, U32 ix) {
__m256i lo, hi;
split(ix, &lo, &hi);
return join<U32>(_mm256_i32gather_epi32(p, lo, 4),
_mm256_i32gather_epi32(p, hi, 4));
}
#else
template <typename V, typename T>
SI V gather(const T* ptr, U32 ix) {
return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
}
#endif
// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //
SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(__AVX2__)
// Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
__m256i _01,_23;
split(rgba, &_01, &_23);
__m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
_13 = _mm256_permute2x128_si256(_01,_23, 0x31);
rgba = join<U32>(_02, _13);
auto cast_U16 = [](U32 v) -> U16 {
__m256i _02,_13;
split(v, &_02,&_13);
return _mm256_packus_epi32(_02,_13);
};
#else
auto cast_U16 = [](U32 v) -> U16 {
return cast<U16>(v);
};
#endif
*r = cast_U16(rgba & 65535) & 255;
*g = cast_U16(rgba & 65535) >> 8;
*b = cast_U16(rgba >> 16) & 255;
*a = cast_U16(rgba >> 16) >> 8;
}
SI void load_8888(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(__ARM_NEON)
uint8x8x4_t rgba;
switch (tail & (N-1)) {
case 0: rgba = vld4_u8 ((const uint8_t*)(ptr+0) ); break;
case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6);
case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5);
case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4);
case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3);
case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2);
case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1);
case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0);
}
*r = cast<U16>(rgba.val[0]);
*g = cast<U16>(rgba.val[1]);
*b = cast<U16>(rgba.val[2]);
*a = cast<U16>(rgba.val[3]);
#else
from_8888(load<U32>(ptr, tail), r,g,b,a);
#endif
}
SI void store_8888(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
#if 1 && defined(__ARM_NEON)
uint8x8x4_t rgba = {{
cast<U8>(r),
cast<U8>(g),
cast<U8>(b),
cast<U8>(a),
}};
switch (tail & (N-1)) {
case 0: vst4_u8 ((uint8_t*)(ptr+0), rgba ); break;
case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6);
case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5);
case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4);
case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3);
case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2);
case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1);
case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0);
}
#else
store(ptr, tail, cast<U32>(r | (g<<8)) << 0
| cast<U32>(b | (a<<8)) << 16);
#endif
}
STAGE_PP(load_8888, const SkJumper_MemoryCtx* ctx) {
load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_8888_dst, const SkJumper_MemoryCtx* ctx) {
load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_8888, const SkJumper_MemoryCtx* ctx) {
store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
}
STAGE_PP(load_bgra, const SkJumper_MemoryCtx* ctx) {
load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &b,&g,&r,&a);
}
STAGE_PP(load_bgra_dst, const SkJumper_MemoryCtx* ctx) {
load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &db,&dg,&dr,&da);
}
STAGE_PP(store_bgra, const SkJumper_MemoryCtx* ctx) {
store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, b,g,r,a);
}
STAGE_GP(gather_8888, const SkJumper_GatherCtx* ctx) {
const uint32_t* ptr;
U32 ix = ix_and_ptr(&ptr, ctx, x,y);
from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
}
STAGE_GP(gather_bgra, const SkJumper_GatherCtx* ctx) {
const uint32_t* ptr;
U32 ix = ix_and_ptr(&ptr, ctx, x,y);
from_8888(gather<U32>(ptr, ix), &b, &g, &r, &a);
}
// ~~~~~~ 16-bit memory loads and stores ~~~~~~ //
SI void from_565(U16 rgb, U16* r, U16* g, U16* b) {
// Format for 565 buffers: 15|rrrrr gggggg bbbbb|0
U16 R = (rgb >> 11) & 31,
G = (rgb >> 5) & 63,
B = (rgb >> 0) & 31;
// These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit.
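// e.g. R = 31 (0b11111) -> (31<<3)|(31>>2) = 255, and R = 16 -> 132 ≈ 16*255/31.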
*r = (R << 3) | (R >> 2);
*g = (G << 2) | (G >> 4);
*b = (B << 3) | (B >> 2);
}
SI void load_565(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
from_565(load<U16>(ptr, tail), r,g,b);
}
SI void store_565(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
// Select the top 5,6,5 bits.
U16 R = r >> 3,
G = g >> 2,
B = b >> 3;
// Pack them back into 15|rrrrr gggggg bbbbb|0.
store(ptr, tail, R << 11
| G << 5
| B << 0);
}
STAGE_PP(load_565, const SkJumper_MemoryCtx* ctx) {
load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
a = 255;
}
STAGE_PP(load_565_dst, const SkJumper_MemoryCtx* ctx) {
load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
da = 255;
}
STAGE_PP(store_565, const SkJumper_MemoryCtx* ctx) {
store_565(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
}
STAGE_GP(gather_565, const SkJumper_GatherCtx* ctx) {
const uint16_t* ptr;
U32 ix = ix_and_ptr(&ptr, ctx, x,y);
from_565(gather<U16>(ptr, ix), &r, &g, &b);
a = 255;
}
SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) {
// Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0.
U16 R = (rgba >> 12) & 15,
G = (rgba >> 8) & 15,
B = (rgba >> 4) & 15,
A = (rgba >> 0) & 15;
// Scale [0,15] to [0,255].
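// (R << 4) | R is exactly R*17, and 255/15 == 17, so this scaling is exact.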
*r = (R << 4) | R;
*g = (G << 4) | G;
*b = (B << 4) | B;
*a = (A << 4) | A;
}
SI void load_4444(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
from_4444(load<U16>(ptr, tail), r,g,b,a);
}
SI void store_4444(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
// Select the top 4 bits of each.
U16 R = r >> 4,
G = g >> 4,
B = b >> 4,
A = a >> 4;
// Pack them back into 15|rrrr gggg bbbb aaaa|0.
store(ptr, tail, R << 12
| G << 8
| B << 4
| A << 0);
}
STAGE_PP(load_4444, const SkJumper_MemoryCtx* ctx) {
load_4444(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_4444_dst, const SkJumper_MemoryCtx* ctx) {
load_4444(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_4444, const SkJumper_MemoryCtx* ctx) {
store_4444(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a);
}
STAGE_GP(gather_4444, const SkJumper_GatherCtx* ctx) {
const uint16_t* ptr;
U32 ix = ix_and_ptr(&ptr, ctx, x,y);
from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
}
// ~~~~~~ 8-bit memory loads and stores ~~~~~~ //
SI U16 load_8(const uint8_t* ptr, size_t tail) {
return cast<U16>(load<U8>(ptr, tail));
}
SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
store(ptr, tail, cast<U8>(v));
}
STAGE_PP(load_a8, const SkJumper_MemoryCtx* ctx) {
r = g = b = 0;
a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(load_a8_dst, const SkJumper_MemoryCtx* ctx) {
dr = dg = db = 0;
da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(store_a8, const SkJumper_MemoryCtx* ctx) {
store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
}
STAGE_GP(gather_a8, const SkJumper_GatherCtx* ctx) {
const uint8_t* ptr;
U32 ix = ix_and_ptr(&ptr, ctx, x,y);
r = g = b = 0;
a = cast<U16>(gather<U8>(ptr, ix));
}
STAGE_PP(load_g8, const SkJumper_MemoryCtx* ctx) {
r = g = b = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
a = 255;
}
STAGE_PP(load_g8_dst, const SkJumper_MemoryCtx* ctx) {
dr = dg = db = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
da = 255;
}
STAGE_PP(luminance_to_alpha, Ctx::None) {
a = (r*54 + g*183 + b*19)/256; // 0.2126, 0.7152, 0.0722 with 256 denominator.
r = g = b = 0;
}
STAGE_GP(gather_g8, const SkJumper_GatherCtx* ctx) {
const uint8_t* ptr;
U32 ix = ix_and_ptr(&ptr, ctx, x,y);
r = g = b = cast<U16>(gather<U8>(ptr, ix));
a = 255;
}
// ~~~~~~ Coverage scales / lerps ~~~~~~ //
STAGE_PP(scale_1_float, const float* f) {
U16 c = from_float(*f);
r = div255( r * c );
g = div255( g * c );
b = div255( b * c );
a = div255( a * c );
}
STAGE_PP(lerp_1_float, const float* f) {
U16 c = from_float(*f);
r = lerp(dr, r, c);
g = lerp(dg, g, c);
b = lerp(db, b, c);
a = lerp(da, a, c);
}
STAGE_PP(scale_u8, const SkJumper_MemoryCtx* ctx) {
U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
r = div255( r * c );
g = div255( g * c );
b = div255( b * c );
a = div255( a * c );
}
STAGE_PP(lerp_u8, const SkJumper_MemoryCtx* ctx) {
U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
r = lerp(dr, r, c);
g = lerp(dg, g, c);
b = lerp(db, b, c);
a = lerp(da, a, c);
}
// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
return if_then_else(a < da, min(cr,cg,cb)
, max(cr,cg,cb));
}
STAGE_PP(scale_565, const SkJumper_MemoryCtx* ctx) {
U16 cr,cg,cb;
load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
r = div255( r * cr );
g = div255( g * cg );
b = div255( b * cb );
a = div255( a * ca );
}
STAGE_PP(lerp_565, const SkJumper_MemoryCtx* ctx) {
U16 cr,cg,cb;
load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
r = lerp(dr, r, cr);
g = lerp(dg, g, cg);
b = lerp(db, b, cb);
a = lerp(da, a, ca);
}
// ~~~~~~ Gradient stages ~~~~~~ //
// Clamp x to [0,1], both sides inclusive (think, gradients).
// Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
SI F clamp_01(F v) { return min(max(0, v), 1); }
STAGE_GG(clamp_x_1 , Ctx::None) { x = clamp_01(x); }
STAGE_GG(repeat_x_1, Ctx::None) { x = clamp_01(x - floor_(x)); }
STAGE_GG(mirror_x_1, Ctx::None) {
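// Fold x into a period-2 triangle wave: shift by 1, wrap into [0,2), recenter to [-1,1), then reflect with abs_().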
auto two = [](F x){ return x+x; };
x = clamp_01(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f ));
}
SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); }
STAGE_GG(decal_x, SkJumper_DecalTileCtx* ctx) {
auto w = ctx->limit_x;
unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w)));
}
STAGE_GG(decal_y, SkJumper_DecalTileCtx* ctx) {
auto h = ctx->limit_y;
unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h)));
}
STAGE_GG(decal_x_and_y, SkJumper_DecalTileCtx* ctx) {
auto w = ctx->limit_x;
auto h = ctx->limit_y;
unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h)));
}
STAGE_PP(check_decal_mask, SkJumper_DecalTileCtx* ctx) {
auto mask = unaligned_load<U16>(ctx->mask);
r = r & mask;
g = g & mask;
b = b & mask;
a = a & mask;
}
SI U16 round_F_to_U16(F x) { return cast<U16>(x * 255.0f + 0.5f); }
SI void gradient_lookup(const SkJumper_GradientCtx* c, U32 idx, F t,
U16* r, U16* g, U16* b, U16* a) {
F fr, fg, fb, fa, br, bg, bb, ba;
#if defined(__AVX2__)
if (c->stopCount <=8) {
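// With at most 8 stops, each channel's stop coefficients fit in one 256-bit register,
// so a cheap in-register permute replaces the general gathers in the else branch below.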
__m256i lo, hi;
split(idx, &lo, &hi);
fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo),
_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi));
br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo),
_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi));
fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo),
_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi));
bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo),
_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi));
fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo),
_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi));
bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo),
_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi));
fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo),
_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi));
ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo),
_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi));
} else
#endif
{
fr = gather<F>(c->fs[0], idx);
fg = gather<F>(c->fs[1], idx);
fb = gather<F>(c->fs[2], idx);
fa = gather<F>(c->fs[3], idx);
br = gather<F>(c->bs[0], idx);
bg = gather<F>(c->bs[1], idx);
bb = gather<F>(c->bs[2], idx);
ba = gather<F>(c->bs[3], idx);
}
*r = round_F_to_U16(mad(t, fr, br));
*g = round_F_to_U16(mad(t, fg, bg));
*b = round_F_to_U16(mad(t, fb, bb));
*a = round_F_to_U16(mad(t, fa, ba));
}
STAGE_GP(gradient, const SkJumper_GradientCtx* c) {
auto t = x;
U32 idx = 0;
// N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
for (size_t i = 1; i < c->stopCount; i++) {
idx += if_then_else(t >= c->ts[i], U32(1), U32(0));
}
gradient_lookup(c, idx, t, &r, &g, &b, &a);
}
STAGE_GP(evenly_spaced_gradient, const SkJumper_GradientCtx* c) {
auto t = x;
auto idx = trunc_(t * (c->stopCount-1));
gradient_lookup(c, idx, t, &r, &g, &b, &a);
}
STAGE_GP(evenly_spaced_2_stop_gradient, const void* ctx) {
// TODO: Rename Ctx SkJumper_EvenlySpaced2StopGradientCtx.
struct Ctx { float f[4], b[4]; };
auto c = (const Ctx*)ctx;
auto t = x;
r = round_F_to_U16(mad(t, c->f[0], c->b[0]));
g = round_F_to_U16(mad(t, c->f[1], c->b[1]));
b = round_F_to_U16(mad(t, c->f[2], c->b[2]));
a = round_F_to_U16(mad(t, c->f[3], c->b[3]));
}
STAGE_GG(xy_to_unit_angle, Ctx::None) {
F xabs = abs_(x),
yabs = abs_(y);
F slope = min(xabs, yabs)/max(xabs, yabs);
F s = slope * slope;
// Use a 7th degree polynomial to approximate atan.
// This was generated using sollya.gforge.inria.fr.
// A float optimized polynomial was generated using the following command.
// P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
F phi = slope
* (0.15912117063999176025390625f + s
* (-5.185396969318389892578125e-2f + s
* (2.476101927459239959716796875e-2f + s
* (-7.0547382347285747528076171875e-3f))));
phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
phi = if_then_else(x < 0.0f , 1.0f/2.0f - phi, phi);
phi = if_then_else(y < 0.0f , 1.0f - phi , phi);
phi = if_then_else(phi != phi , 0 , phi); // Check for NaN.
x = phi;
}
STAGE_GG(xy_to_radius, Ctx::None) {
x = sqrt_(x*x + y*y);
}
// ~~~~~~ Compound stages ~~~~~~ //
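// These fuse a load, srcover blend, and store into one stage. In 8-bit fixed
// point, srcover is src + dst*(255 - srcA)/255, which is the
// d + div255(d*inv(a)) pattern below.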
STAGE_PP(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) {
auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
load_8888(ptr, tail, &dr,&dg,&db,&da);
r = r + div255( dr*inv(a) );
g = g + div255( dg*inv(a) );
b = b + div255( db*inv(a) );
a = a + div255( da*inv(a) );
store_8888(ptr, tail, r,g,b,a);
}
STAGE_PP(srcover_bgra_8888, const SkJumper_MemoryCtx* ctx) {
auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
load_8888(ptr, tail, &db,&dg,&dr,&da);
r = r + div255( dr*inv(a) );
g = g + div255( dg*inv(a) );
b = b + div255( db*inv(a) );
a = a + div255( da*inv(a) );
store_8888(ptr, tail, b,g,r,a);
}
#endif//defined(__clang__)

src/jumper/build_stages.py (new executable file, 261 lines)
View File

@ -0,0 +1,261 @@
#!/usr/bin/env python2.7
#
# Copyright 2017 Google Inc.
#
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import re
import subprocess
import sys
clang = 'clang-5.0'
objdump = 'gobjdump'
ccache = 'ccache'
stages = 'src/jumper/SkJumper_stages.cpp'
stages_lowp = 'src/jumper/SkJumper_stages_lowp.cpp'
generated = 'src/jumper/SkJumper_generated.S'
generated_win = 'src/jumper/SkJumper_generated_win.S'
clang = sys.argv[1] if len(sys.argv) > 1 else clang
objdump = sys.argv[2] if len(sys.argv) > 2 else objdump
ccache = sys.argv[3] if len(sys.argv) > 3 else ccache
stages = sys.argv[4] if len(sys.argv) > 4 else stages
stages_lowp = sys.argv[5] if len(sys.argv) > 5 else stages_lowp
generated = sys.argv[6] if len(sys.argv) > 6 else generated
generated_win = sys.argv[7] if len(sys.argv) > 7 else generated_win
clang = [ccache, clang, '-x', 'c++']
cflags = ['-std=c++11', '-Os', '-DJUMPER_IS_OFFLINE',
'-momit-leaf-frame-pointer', '-ffp-contract=fast',
'-fno-exceptions', '-fno-rtti', '-fno-unwind-tables']
x86 = [ '-m32' ]
win = ['-DWIN', '-mno-red-zone']
sse2 = ['-msse2', '-mno-sse3', '-mno-ssse3', '-mno-sse4.1']
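# -DWIN marks objects built for the Windows ABI so the stage sources can adapt,
# and -mno-red-zone is needed because the Windows x64 ABI has no red zone below
# the stack pointer. The -mno-sse3/ssse3/sse4.1 flags pin the baseline objects to
# plain SSE2 so Clang can't opportunistically emit newer instructions.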
subprocess.check_call(clang + cflags + sse2 +
['-c', stages] +
['-o', 'sse2.o'])
subprocess.check_call(clang + cflags + sse2 + win +
['-c', stages] +
['-o', 'win_sse2.o'])
subprocess.check_call(clang + cflags + sse2 + x86 +
['-c', stages] +
['-o', 'x86_sse2.o'])
subprocess.check_call(clang + cflags + sse2 + win + x86 +
['-c', stages] +
['-o', 'win_x86_sse2.o'])
subprocess.check_call(clang + cflags + sse2 +
['-c', stages_lowp] +
['-o', 'lowp_sse2.o'])
subprocess.check_call(clang + cflags + sse2 + win +
['-c', stages_lowp] +
['-o', 'win_lowp_sse2.o'])
subprocess.check_call(clang + cflags + sse2 + x86 +
['-c', stages_lowp] +
['-o', 'x86_lowp_sse2.o'])
subprocess.check_call(clang + cflags + sse2 + win + x86 +
['-c', stages_lowp] +
['-o', 'win_x86_lowp_sse2.o'])
sse41 = ['-msse4.1']
subprocess.check_call(clang + cflags + sse41 +
['-c', stages] +
['-o', 'sse41.o'])
subprocess.check_call(clang + cflags + sse41 + win +
['-c', stages] +
['-o', 'win_sse41.o'])
subprocess.check_call(clang + cflags + sse41 +
['-c', stages_lowp] +
['-o', 'lowp_sse41.o'])
subprocess.check_call(clang + cflags + sse41 + win +
['-c', stages_lowp] +
['-o', 'win_lowp_sse41.o'])
avx = ['-mavx']
subprocess.check_call(clang + cflags + avx +
['-c', stages] +
['-o', 'avx.o'])
subprocess.check_call(clang + cflags + avx + win +
['-c', stages] +
['-o', 'win_avx.o'])
hsw = ['-mavx2', '-mfma', '-mf16c']
subprocess.check_call(clang + cflags + hsw +
['-c', stages] +
['-o', 'hsw.o'])
subprocess.check_call(clang + cflags + hsw + win +
['-c', stages] +
['-o', 'win_hsw.o'])
subprocess.check_call(clang + cflags + hsw +
['-c', stages_lowp] +
['-o', 'lowp_hsw.o'])
subprocess.check_call(clang + cflags + hsw + win +
['-c', stages_lowp] +
['-o', 'win_lowp_hsw.o'])
skx = ['-march=skylake-avx512']
subprocess.check_call(clang + cflags + skx +
['-c', stages] +
['-o', 'skx.o'])
# Merge x86-64 object files to deduplicate constants.
# (No other platform has more than one specialization.)
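# These relocatable merges are what parse_object_file() below disassembles into
# SkJumper_generated.S (merged.o / x86_merged.o) and SkJumper_generated_win.S
# (win_merged.o / win_x86_merged.o).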
subprocess.check_call(['ld', '-r', '-o', 'merged.o',
'skx.o', 'hsw.o', 'avx.o', 'sse41.o', 'sse2.o',
'lowp_hsw.o', 'lowp_sse41.o', 'lowp_sse2.o'])
subprocess.check_call(['ld', '-r', '-o', 'win_merged.o',
'win_hsw.o', 'win_avx.o', 'win_sse41.o', 'win_sse2.o',
'win_lowp_hsw.o', 'win_lowp_sse41.o', 'win_lowp_sse2.o'])
subprocess.check_call(['ld', '-r', '-o', 'x86_merged.o',
'x86_sse2.o',
'x86_lowp_sse2.o'])
subprocess.check_call(['ld', '-r', '-o', 'win_x86_merged.o',
'win_x86_sse2.o',
'win_x86_lowp_sse2.o'])
def parse_object_file(dot_o, directive, target=None):
globl, hidden, label, comment, align = \
'.globl', 'HIDDEN', ':', '// ', 'BALIGN'
if 'win' in dot_o:
globl, hidden, label, comment, align = \
'PUBLIC', '', ' LABEL PROC', '; ', 'ALIGN '
cmd = [objdump]
if target:
cmd += ['--target', target]
# Look for sections we know we can't handle.
section_headers = subprocess.check_output(cmd + ['-h', dot_o])
for snippet in ['.rodata']:
if snippet in section_headers:
print >>sys.stderr, 'Found %s in section.' % snippet
assert snippet not in section_headers
if directive == '.long':
disassemble = ['-d', dot_o]
dehex = lambda h: '0x'+h
else:
# x86-64... as long as we're using %rip-relative addressing,
# literal sections should be fine to just dump in with .text.
disassemble = ['-d', # DO NOT USE -D.
'-z', # Print zero bytes instead of ...
'--insn-width=11',
'-j', '.text',
'-j', '.literal4',
'-j', '.literal8',
'-j', '.literal16',
'-j', '.const',
dot_o]
dehex = lambda h: str(int(h,16))
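    # Two output modes: the '.long' path (suited to fixed-width ISAs) emits each
    # instruction word as hex, while the byte-oriented path emits decimal bytes
    # and also pulls the literal and const sections in alongside .text.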
# Ok. Let's disassemble.
for line in subprocess.check_output(cmd + disassemble).split('\n'):
line = line.strip()
if not line or line.startswith(dot_o) or line.startswith('Disassembly'):
continue
# E.g. 00000000000003a4 <_load_f16>:
m = re.match('''[0-9a-f]+ <_?(.*)>:''', line)
if m:
print
sym = m.group(1)
if sym.startswith('.literal'): # .literal4, .literal16, etc
print sym.replace('.literal', align)
elif sym.startswith('.const'): # 32-byte constants
print align + '32'
elif not sym.startswith('sk_'):
print >>sys.stderr, "build_stages.py can't handle '%s' (yet?)." % sym
assert sym.startswith('sk_')
else: # a stage function
if hidden:
print hidden + ' _' + sym
print globl + ' _' + sym
if 'win' not in dot_o:
print 'FUNCTION(_' + sym + ')'
print '_' + sym + label
continue
columns = line.split('\t')
#print >>sys.stderr, columns
code = columns[1]
if len(columns) >= 4:
inst = columns[2]
args = columns[3]
else:
inst, args = columns[2], ''
if ' ' in columns[2]:
inst, args = columns[2].split(' ', 1)
code, inst, args = code.strip(), inst.strip(), args.strip()
hexed = ','.join(dehex(x) for x in code.split(' '))
print ' ' + directive + ' ' + hexed + ' '*(36-len(hexed)) + \
comment + inst + (' '*(14-len(inst)) + args if args else '')
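# From here on, stdout is redirected into the generated assembly files: a
# GNU-syntax .S whose HIDDEN/FUNCTION/BALIGN macros paper over the Mach-O vs. ELF
# differences, followed by a MASM .S for Windows.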
sys.stdout = open(generated, 'w')
print '''# Copyright 2017 Google Inc.
#
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# This file is generated semi-automatically with this command:
# $ src/jumper/build_stages.py
'''
print '#if defined(__MACH__)'
print ' #define HIDDEN .private_extern'
print ' #define FUNCTION(name)'
print ' #define BALIGN4 .align 2'
print ' #define BALIGN8 .align 3'
print ' #define BALIGN16 .align 4'
print ' #define BALIGN32 .align 5'
print '#else'
print ' .section .note.GNU-stack,"",%progbits'
print ' #define HIDDEN .hidden'
print ' #define FUNCTION(name) .type name,%function'
print ' #define BALIGN4 .balign 4'
print ' #define BALIGN8 .balign 8'
print ' #define BALIGN16 .balign 16'
print ' #define BALIGN32 .balign 32'
print '#endif'
print '.text'
print '#if defined(__x86_64__)'
print 'BALIGN32'
parse_object_file('merged.o', '.byte')
print '#elif defined(__i386__)'
print 'BALIGN32'
parse_object_file('x86_merged.o', '.byte')
print '#endif'
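# IFDEF RAX is a MASM idiom: the RAX register name only exists when assembling
# with ml64, so the first branch is taken for x64 and the .MODEL FLAT,C branch
# for 32-bit x86.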
sys.stdout = open(generated_win, 'w')
print '''; Copyright 2017 Google Inc.
;
; Use of this source code is governed by a BSD-style license that can be
; found in the LICENSE file.
; This file is generated semi-automatically with this command:
; $ src/jumper/build_stages.py
'''
print 'IFDEF RAX'
print "_text32 SEGMENT ALIGN(32) 'CODE'"
print 'ALIGN 32'
parse_object_file('win_merged.o', 'DB')
print 'ELSE'
print '.MODEL FLAT,C'
print "_text32 SEGMENT ALIGN(32) 'CODE'"
print 'ALIGN 32'
parse_object_file('win_x86_merged.o', 'DB')
print 'ENDIF'
print 'END'

View File

@ -19,11 +19,11 @@
namespace SK_OPTS_NS {
template <typename T, typename P>
static inline T unaligned_load(const P* p) {
T v;
memcpy(&v, p, sizeof(v));
return v;
template <typename T>
static inline T unaligned_load(const uint8_t* src) {
T val;
memcpy(&val, src, sizeof(val));
return val;
}
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))

View File

@ -5,10 +5,14 @@
* found in the LICENSE file.
*/
#include "SkSafe_math.h" // Keep this first.
#include "SkOpts.h"
#if defined(_INC_MATH) && !defined(INC_MATH_IS_SAFE_NOW)
#error We have included ucrt\math.h without protecting it against ODR violation.
#endif
#define SK_OPTS_NS avx
#include "SkRasterPipeline_opts.h"
#include "SkUtils_opts.h"
namespace SkOpts {
@ -16,17 +20,5 @@ namespace SkOpts {
memset16 = SK_OPTS_NS::memset16;
memset32 = SK_OPTS_NS::memset32;
memset64 = SK_OPTS_NS::memset64;
#define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
SK_RASTER_PIPELINE_STAGES(M)
just_return_highp = (StageFn)SK_OPTS_NS::just_return;
start_pipeline_highp = SK_OPTS_NS::start_pipeline;
#undef M
#define M(st) stages_lowp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::lowp::st;
SK_RASTER_PIPELINE_STAGES(M)
just_return_lowp = (StageFn)SK_OPTS_NS::lowp::just_return;
start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
#undef M
}
}

View File

@ -1,28 +0,0 @@
/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "SkOpts.h"
#define SK_OPTS_NS hsw
#include "SkRasterPipeline_opts.h"
#include "SkUtils_opts.h"
namespace SkOpts {
void Init_hsw() {
#define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
SK_RASTER_PIPELINE_STAGES(M)
just_return_highp = (StageFn)SK_OPTS_NS::just_return;
start_pipeline_highp = SK_OPTS_NS::start_pipeline;
#undef M
#define M(st) stages_lowp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::lowp::st;
SK_RASTER_PIPELINE_STAGES(M)
just_return_lowp = (StageFn)SK_OPTS_NS::lowp::just_return;
start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
#undef M
}
}

View File

@ -8,23 +8,10 @@
#include "SkOpts.h"
#define SK_OPTS_NS sse41
#include "SkRasterPipeline_opts.h"
#include "SkBlitRow_opts.h"
namespace SkOpts {
void Init_sse41() {
blit_row_s32a_opaque = sse41::blit_row_s32a_opaque;
#define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
SK_RASTER_PIPELINE_STAGES(M)
just_return_highp = (StageFn)SK_OPTS_NS::just_return;
start_pipeline_highp = SK_OPTS_NS::start_pipeline;
#undef M
#define M(st) stages_lowp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::lowp::st;
SK_RASTER_PIPELINE_STAGES(M)
just_return_lowp = (StageFn)SK_OPTS_NS::lowp::just_return;
start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
#undef M
}
}