Revert "Reland "make SkJumper stages normal Skia code""
This reverts commit 78cb579f33.

Reason for revert: lowp should be controlled by defined(JUMPER_IS_SCALAR), not defined(__clang__).  So close.

Original change's description:
> Reland "make SkJumper stages normal Skia code"
>
> This is a reland of 22e536e3a1
>
> Now with fixed #include paths in SkRasterPipeline_opts.h,
> and -ffp-contract=fast for the :hsw target to minimize
> diffs on non-Windows Clang AVX2/AVX-512 bots.
>
> Original change's description:
> > make SkJumper stages normal Skia code
> >
> > Enough clients are using Clang now that we can say, use Clang to build
> > if you want these software pipeline stages to go fast.
> >
> > This lets us drop the offline build aspect of SkJumper stages, instead
> > building as part of Skia using the SkOpts framework.
> >
> > I think everything should work, except I've (temporarily) removed
> > AVX-512 support.  I will put this back in a follow up.
> >
> > I have had to drop Windows down to __vectorcall and our narrower
> > stage calling convention that keeps the d-registers on the stack.
> > I tried forcing sysv_abi, but that crashed Clang. :/
> >
> > Added a TODO to use the same narrower stage calling convention
> > for lowp stages... we just *don't* today, for no good reason.
> >
> > Change-Id: Iaaa792ffe4deab3508d2dc5d0008c163c24b3383
> > Reviewed-on: https://skia-review.googlesource.com/110641
> > Commit-Queue: Mike Klein <mtklein@chromium.org>
> > Reviewed-by: Herb Derby <herb@google.com>
> > Reviewed-by: Florin Malita <fmalita@chromium.org>
>
> Change-Id: I44f2c03d33958e3807747e40904b6351957dd448
> Reviewed-on: https://skia-review.googlesource.com/112742
> Reviewed-by: Mike Klein <mtklein@chromium.org>

TBR=mtklein@chromium.org,herb@google.com,fmalita@chromium.org

Change-Id: Ie64da98f5187d44e03c0ce05d7cb189d4a6e6663
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Reviewed-on: https://skia-review.googlesource.com/112743
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>

This commit is contained in:
  parent 61d56b92a4
  commit 5cc94cc393
52  BUILD.gn
@@ -48,6 +48,10 @@ declare_args() {
  skia_compile_processors = false
  skia_lex = false

  skia_jumper_clang = ""
  skia_jumper_objdump = ""
  skia_jumper_ccache = ""

  skia_skqp_enable_driver_correctness_workarounds = false
  skia_skqp_global_error_tolerance = 0
}

@@ -310,28 +314,6 @@ opts("avx") {
  }
}

opts("hsw") {
  enabled = is_x86
  sources = skia_opts.hsw_sources
  if (!is_clang && is_win) {
    cflags = [ "/arch:AVX2" ]
  } else {
    cflags = [
      "-mavx2",
      "-mf16c",
      "-mfma",
    ]
  }

  # Oddly, clang-cl doesn't recognize this as a valid flag.
  # If it ever does, it'd nice to move this up with -mavx2 and co.
  if (is_clang && !is_win) {
    # This flag lets Clang generate FMAs when it sees a mul-then-add.  It's optional,
    # but nice to have, generating slightly better code for paths without explicit FMAs.
    cflags += [ "-ffp-contract=fast" ]
  }
}

# Any feature of Skia that requires third-party code should be optional and use this template.
template("optional") {
  if (invoker.enabled) {

@@ -793,7 +775,6 @@ component("skia") {
    ":fontmgr_fuchsia",
    ":gpu",
    ":heif",
    ":hsw",
    ":jpeg",
    ":none",
    ":pdf",

@@ -2120,3 +2101,28 @@ if (skia_enable_tools) {
    }
  }
}

if (skia_jumper_clang != "") {
  action("regen_jumper") {
    script = "src/jumper/build_stages.py"

    inputs = [
      "src/jumper/SkJumper_stages.cpp",
      "src/jumper/SkJumper_stages_lowp.cpp",
    ]

    # GN insists its outputs should go somewhere underneath target_out_dir, so we trick it.
    outputs = [
      "$target_out_dir/" +
          rebase_path("src/jumper/SkJumper_generated.S", target_out_dir),
      "$target_out_dir/" +
          rebase_path("src/jumper/SkJumper_generated_win.S", target_out_dir),
    ]

    args = [
             skia_jumper_clang,
             skia_jumper_objdump,
             skia_jumper_ccache,
           ] + rebase_path(inputs) + rebase_path(outputs)
  }
}
20  gn/core.gni
@@ -440,8 +440,6 @@ skia_core_sources = [
  "$_include/core/SkUnPreMultiply.h",
  "$_include/core/SkVertices.h",

  "$_src/jumper/SkJumper.cpp",

  # private
  "$_include/private/SkAtomics.h",
  "$_include/private/SkChecksum.h",

@@ -529,4 +527,20 @@ skia_core_sources = [
  "$_src/pathops/SkReduceOrder.h",
]

skia_core_defines = [] # Used to be used by Chromium, but no longer.
skia_core_sources += [
  "$_src/jumper/SkJumper.cpp",
  "$_src/jumper/SkJumper_stages.cpp",
  "$_src/jumper/SkJumper_stages_lowp.cpp",
]
skia_core_defines = []
if (is_win) {
  if (host_os == "win") {
    skia_core_sources += [ "$_src/jumper/SkJumper_generated_win.S" ]
  } else {
    # TODO(thakis): Enable jumper in linux->win cross builds once the
    # assembler situation is figured out, https://crbug.com/762167
    skia_core_defines += [ "SK_JUMPER_USE_ASSEMBLY=0" ]
  }
} else if (target_cpu != "wasm") {
  skia_core_sources += [ "$_src/jumper/SkJumper_generated.S" ]
}
@@ -289,8 +289,7 @@ with open('Android.bp', 'w') as f:
        defs['ssse3'] +
        defs['sse41'] +
        defs['sse42'] +
        defs['avx' ] +
        defs['hsw' ]),
        defs['avx' ]),

    'dm_includes' : bpfmt(8, dm_includes),
    'dm_srcs'     : bpfmt(8, dm_srcs),

@@ -51,4 +51,3 @@ ssse3 = [
sse41 = [ "$_src/opts/SkOpts_sse41.cpp" ]
sse42 = [ "$_src/opts/SkOpts_sse42.cpp" ]
avx   = [ "$_src/opts/SkOpts_avx.cpp" ]
hsw   = [ "$_src/opts/SkOpts_hsw.cpp" ]

@@ -24,7 +24,7 @@ skia_opts = {
  sse41_sources = sse41
  sse42_sources = sse42
  avx_sources = avx
  hsw_sources = hsw
  hsw_sources = [] # remove after we update Chrome
}

# Skia Chromium defines. These flags will be defined in chromium If these
103  site/dev/contrib/jumper.md  (new file)
@@ -0,0 +1,103 @@
Contributing to SkJumper
========================

SkJumper is the execution engine of SkRasterPipeline, a system we've been using
to accelerate CPU-bound work inside Skia, most notably color-space conversions
and color-correct drawing.

(This is where I'd put my link to the design document if I had one...)

SkJumper is more annoying to contribute to than most Skia code because of its
offline compilation step.  You'll need particular tools installed on your
machine and to tell GN about them.  This document is designed to guide you
through this process and ease some of that annoyance.

One-time Setup
--------------

To generate stage code you need Clang 5.0, objdump, and ccache.  It's best that
Clang is exactly the same version we typically use (as of writing, 5.0.0), and
you'll need objdump to be compiled with support for x86-64, ARMv7, and ARMv8.

The easiest way to satisfy these constraints is to get your hands on a Mac and
install Xcode, the Xcode command line tools, and [Homebrew](https://brew.sh).  Once
you have `brew` installed, run these commands to get the tools you need:

<!--?prettify lang=sh?-->

    ls -d /usr/include >/dev/null || xcode-select --install
    brew install llvm binutils ccache

Setting up GN
-------------

With your tools installed, tell GN about them:

    skia_jumper_clang   = path/to/clang-5.0
    skia_jumper_objdump = path/to/gobjdump
    skia_jumper_ccache  = path/to/ccache

then regenerate and build as normal.
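As a concrete sketch (the tool paths are illustrative, not canonical; point the
args at wherever Homebrew actually installed the tools on your machine):

<!--?prettify lang=sh?-->

    bin/gn gen out/Debug --args='
        skia_jumper_clang   = "/usr/local/opt/llvm/bin/clang"
        skia_jumper_objdump = "/usr/local/opt/binutils/bin/gobjdump"
        skia_jumper_ccache  = "/usr/local/bin/ccache"
    '
    ninja -C out/Debug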

If you look in your GN out directory, you should now see a bunch of `.o` files,
and `git status` should show no changes to `src/jumper/SkJumper_generated*.S`.
That's good.  Those object files are the intermediates we parse to produce
the assembly files.  We just leave them around in case you want to look at
them yourself.

Make A Change
-------------

Let's use the `from_srgb` stage as a little playground to make a real change.
Linearizing sRGB-encoded bytes is slow, so let's pretend we've decided to trade
quality for speed, approximating the existing implementation with a simple square.

Open up `SkJumper_stages.cpp` and find the `from_srgb` stage.  It'll look like this:

<!--?prettify lang=cc?-->

    STAGE(from_srgb) {
        r = from_srgb(r);
        g = from_srgb(g);
        b = from_srgb(b);
    }

Let's replace whatever's there with our fast approximation:

<!--?prettify lang=cc?-->

    STAGE(from_srgb) {
        r *= r;
        g *= g;
        b *= b;
    }

When you save and re-Ninja, you should now see changes to
`src/jumper/SkJumper_generated.S` and `src/jumper/SkJumper_generated_win.S`.
If you can't read assembly, no big deal.  If you can, run `git diff`.  You
should see the various `sk_from_srgb_*` functions get dramatically simpler,
something like three multiplies and a couple of other bookkeeping instructions.

It's not unusual for isolated changes in one stage to cause seemingly unrelated
changes in another.  When adding or removing any code you'll usually see all
the comments in branch instructions change a little bit, but the actual
instruction on the left won't change.  When adding or removing uses of
constants, you'll often see both the comment and instruction on the left change
for other loads of constants from memory, especially on x86-64.  You'll also
see some code that looks like garbage change; those are the constants.  If
any of this worries you, please do go running to someone who knows more for
help, but odds are everything is fine.

At this point things should just be business as usual.  Any time you change
`SkJumper_stages.cpp`, Ninja ought to notice and regenerate the assembly files.

Adding a new Stage
------------------

Adding a new stage is a lot like changing an existing stage.  Edit
`SkJumper_stages.cpp`, build Skia, test, repeat until correct.

You'll just need to also edit `SkRasterPipeline.h` to add your new stage to the
macro listing all the stages.  The stage name is the handle normal Skia code
uses to refer to the stage abstractly, and the wiring between
`SkRasterPipeline::foo` and `STAGE(foo) { ... }` should work automatically.
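As a sketch of the shape (the stage name here is hypothetical, purely for
illustration), the two edits together look roughly like:

<!--?prettify lang=cc?-->

    // In SkRasterPipeline.h, add the stage to the stage-listing macro:
    //     M(my_new_stage)
    // Then in SkJumper_stages.cpp, define its body; as with from_srgb above,
    // r,g,b,a (and dr,dg,db,da) are in scope as vectors of color channels.
    STAGE(my_new_stage) {
        r = 1.0f - r;
        g = 1.0f - g;
        b = 1.0f - b;
    }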

@@ -33,18 +33,6 @@ link Skia against the headers and libraries found on the system paths.
use `extra_cflags` and `extra_ldflags` to add include or library paths if
needed.

A note on software backend performance
--------------------------------------

A number of routines in Skia's software backend have been written to run
fastest when compiled by Clang.  If you depend on software rasterization, image
decoding, or color space conversion and compile Skia with GCC, MSVC or another
compiler, you will see dramatically worse performance than if you use Clang.

This choice was only a matter of prioritization; there is nothing fundamentally
wrong with non-Clang compilers.  So if this is a serious issue for you, please
let us know on the mailing list.

Quickstart
----------

@@ -40,7 +40,6 @@
 #include "SkBlitRow_opts.h"
 #include "SkChecksum_opts.h"
 #include "SkMorphologyImageFilter_opts.h"
 #include "SkRasterPipeline_opts.h"
 #include "SkSwizzler_opts.h"
 #include "SkUtils_opts.h"
 #include "SkXfermode_opts.h"

@@ -82,26 +81,11 @@ namespace SkOpts {

    #undef DEFINE_DEFAULT

    #define M(st) (StageFn)SK_OPTS_NS::st,
        StageFn stages_highp[] = { SK_RASTER_PIPELINE_STAGES(M) };
        StageFn just_return_highp = (StageFn)SK_OPTS_NS::just_return;
        void (*start_pipeline_highp)(size_t,size_t,size_t,size_t,void**)
            = SK_OPTS_NS::start_pipeline;
    #undef M

    #define M(st) (StageFn)SK_OPTS_NS::lowp::st,
        StageFn stages_lowp[] = { SK_RASTER_PIPELINE_STAGES(M) };
        StageFn just_return_lowp = (StageFn)SK_OPTS_NS::lowp::just_return;
        void (*start_pipeline_lowp)(size_t,size_t,size_t,size_t,void**)
            = SK_OPTS_NS::lowp::start_pipeline;
    #undef M

    // Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
    void Init_ssse3();
    void Init_sse41();
    void Init_sse42();
    void Init_avx();
    void Init_hsw();
    void Init_crc32();

    static void init() {

@@ -120,8 +104,7 @@ namespace SkOpts {
    #endif

    #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_AVX
        if (SkCpu::Supports(SkCpu::AVX)) { Init_avx(); }
        if (SkCpu::Supports(SkCpu::HSW)) { Init_hsw(); }
        if (SkCpu::Supports(SkCpu::AVX )) { Init_avx(); }
    #endif

#elif defined(SK_CPU_ARM64)

@@ -54,17 +54,6 @@ namespace SkOpts {
    static inline uint32_t hash(const void* data, size_t bytes, uint32_t seed=0) {
        return hash_fn(data, bytes, seed);
    }

    #define M(st) +1
    // We can't necessarily express the type of SkJumper stage functions here,
    // so we just use this void(*)(void) as a stand-in.
    using StageFn = void(*)(void);
    extern StageFn stages_highp[SK_RASTER_PIPELINE_STAGES(M)], just_return_highp;
    extern StageFn stages_lowp [SK_RASTER_PIPELINE_STAGES(M)], just_return_lowp;

    extern void (*start_pipeline_highp)(size_t,size_t,size_t,size_t, void**);
    extern void (*start_pipeline_lowp )(size_t,size_t,size_t,size_t, void**);
    #undef M
}

#endif//SkOpts_DEFINED

@@ -17,6 +17,8 @@
 #include <functional>
 #include <vector>

struct SkJumper_Engine;

/**
 * SkRasterPipeline provides a cheap way to chain together a pixel processing pipeline.
 *

@@ -161,9 +163,7 @@ private:
        void* ctx;
    };

    using StartPipelineFn = void(*)(size_t,size_t,size_t,size_t, void** program);
    StartPipelineFn build_pipeline(void**) const;

    const SkJumper_Engine& build_pipeline(void**) const;
    void unchecked_append(StockStage, void*);

    SkArenaAlloc* fAlloc;

@@ -5,46 +5,468 @@
 * found in the LICENSE file.
 */

#include "SkColorData.h"
#include "SkCpu.h"
#include "SkJumper.h"
#include "SkOpts.h"
#include "SkOnce.h"
#include "SkRasterPipeline.h"
#include "SkTemplates.h"

SkRasterPipeline::StartPipelineFn SkRasterPipeline::build_pipeline(void** ip) const {
#ifndef SK_JUMPER_DISABLE_8BIT
    // We'll try to build a lowp pipeline, but if that fails fallback to a highp float pipeline.
    void** reset_point = ip;
#if !defined(SK_JUMPER_USE_ASSEMBLY)
    // We'll use __has_feature(memory_sanitizer) to detect MSAN.
    // SkJumper_generated.S is not compiled with MSAN, so MSAN would yell really loud.
    #if !defined(__has_feature)
        #define __has_feature(x) 0
    #endif

    // Stages are stored backwards in fStages, so we reverse here, back to front.
    *--ip = (void*)SkOpts::just_return_lowp;
    #if 0 || __has_feature(memory_sanitizer)
        #define SK_JUMPER_USE_ASSEMBLY 0
    #else
        #define SK_JUMPER_USE_ASSEMBLY 1
    #endif
#endif

#define M(st) +1
    static const int kNumStages = SK_RASTER_PIPELINE_STAGES(M);
#undef M

#ifndef SK_JUMPER_DISABLE_8BIT
    // Intentionally commented out; optional logging for local debugging.
    #if 0 && SK_JUMPER_USE_ASSEMBLY && (defined(__x86_64__) || defined(_M_X64))
        #include <atomic>

        #define M(st) #st,
            static const char* kStageNames[] = { SK_RASTER_PIPELINE_STAGES(M) };
        #undef M

        static std::atomic<int> gMissingStageCounters[kNumStages];

        static void log_missing(SkRasterPipeline::StockStage st) {
            static SkOnce once;
            once([] { atexit([] {
                int total = 0;
                for (int i = 0; i < kNumStages; i++) {
                    if (int count = gMissingStageCounters[i].load()) {
                        SkDebugf("%7d\t%s\n", count, kStageNames[i]);
                        total += count;
                    }
                }
                SkDebugf("%7d total\n", total);
            }); });

            gMissingStageCounters[st]++;
        }
    #else
        static void log_missing(SkRasterPipeline::StockStage) {}
    #endif
#endif

// We can't express the real types of most stage functions portably, so we use a stand-in.
// We'll only ever call start_pipeline(), which then chains into the rest.
using StageFn         = void(void);
using StartPipelineFn = void(size_t,size_t,size_t,size_t, void**);

// Some platforms expect C "name" maps to asm "_name", others to "name".
#if defined(__APPLE__)
    #define ASM(name, suffix)  sk_##name##_##suffix
#else
    #define ASM(name, suffix) _sk_##name##_##suffix
#endif

extern "C" {

#if !SK_JUMPER_USE_ASSEMBLY
    // We'll just run baseline code.

#elif defined(__x86_64__) || defined(_M_X64)
    StartPipelineFn ASM(start_pipeline,       skx),
                    ASM(start_pipeline,       hsw),
                    ASM(start_pipeline,       avx),
                    ASM(start_pipeline,     sse41),
                    ASM(start_pipeline,      sse2),
                    ASM(start_pipeline,  hsw_lowp),
                    ASM(start_pipeline,sse41_lowp),
                    ASM(start_pipeline, sse2_lowp);

    StageFn ASM(just_return,       skx),
            ASM(just_return,       hsw),
            ASM(just_return,       avx),
            ASM(just_return,     sse41),
            ASM(just_return,      sse2),
            ASM(just_return,  hsw_lowp),
            ASM(just_return,sse41_lowp),
            ASM(just_return, sse2_lowp);

    #define M(st) StageFn ASM(st,       skx), \
                          ASM(st,       hsw), \
                          ASM(st,       avx), \
                          ASM(st,     sse41), \
                          ASM(st,      sse2), \
                          ASM(st,  hsw_lowp), \
                          ASM(st,sse41_lowp), \
                          ASM(st, sse2_lowp);
        SK_RASTER_PIPELINE_STAGES(M)
    #undef M

#elif defined(__i386__) || defined(_M_IX86)
    StartPipelineFn ASM(start_pipeline,sse2),
                    ASM(start_pipeline,sse2_lowp);
    StageFn ASM(just_return,sse2),
            ASM(just_return,sse2_lowp);
    #define M(st) StageFn ASM(st,sse2), \
                          ASM(st,sse2_lowp);
        SK_RASTER_PIPELINE_STAGES(M)
    #undef M

#endif

    // Baseline code compiled as a normal part of Skia.
    StartPipelineFn sk_start_pipeline;
    StageFn sk_just_return;
    #define M(st) StageFn sk_##st;
        SK_RASTER_PIPELINE_STAGES(M)
    #undef M

#if defined(JUMPER_HAS_NEON_LOWP)
    StartPipelineFn sk_start_pipeline_lowp;
    StageFn sk_just_return_lowp;
    #define M(st) StageFn sk_##st##_lowp;
        SK_RASTER_PIPELINE_STAGES(M)
    #undef M
#endif

}

#if SK_JUMPER_USE_ASSEMBLY
    #if defined(__x86_64__) || defined(_M_X64)
        template <SkRasterPipeline::StockStage st>
        static constexpr StageFn* hsw_lowp();

        template <SkRasterPipeline::StockStage st>
        static constexpr StageFn* sse41_lowp();

        template <SkRasterPipeline::StockStage st>
        static constexpr StageFn* sse2_lowp();

        #define LOWP(st) \
            template <> constexpr StageFn* hsw_lowp<SkRasterPipeline::st>() {   \
                return ASM(st,hsw_lowp);                                        \
            }                                                                   \
            template <> constexpr StageFn* sse41_lowp<SkRasterPipeline::st>() { \
                return ASM(st,sse41_lowp);                                      \
            }                                                                   \
            template <> constexpr StageFn* sse2_lowp<SkRasterPipeline::st>() {  \
                return ASM(st,sse2_lowp);                                       \
            }
        #define NOPE(st) \
            template <> constexpr StageFn* hsw_lowp<SkRasterPipeline::st>() {   \
                return nullptr;                                                 \
            }                                                                   \
            template <> constexpr StageFn* sse41_lowp<SkRasterPipeline::st>() { \
                return nullptr;                                                 \
            }                                                                   \
            template <> constexpr StageFn* sse2_lowp<SkRasterPipeline::st>() {  \
                return nullptr;                                                 \
            }

    #elif defined(__i386__) || defined(_M_IX86)
        template <SkRasterPipeline::StockStage st>
        static constexpr StageFn* sse2_lowp();

        #define LOWP(st) \
            template <> constexpr StageFn* sse2_lowp<SkRasterPipeline::st>() { \
                return ASM(st,sse2_lowp);                                      \
            }
        #define NOPE(st) \
            template <> constexpr StageFn* sse2_lowp<SkRasterPipeline::st>() { \
                return nullptr;                                                \
            }

    #elif defined(JUMPER_HAS_NEON_LOWP)
        template <SkRasterPipeline::StockStage st>
        static constexpr StageFn* neon_lowp();

        #define LOWP(st) \
            template <> constexpr StageFn* neon_lowp<SkRasterPipeline::st>() { \
                return sk_##st##_lowp;                                         \
            }
        #define NOPE(st) \
            template <> constexpr StageFn* neon_lowp<SkRasterPipeline::st>() { \
                return nullptr;                                                \
            }

    #else
        #define LOWP(st)
        #define NOPE(st)

    #endif

    #define TODO(st) NOPE(st)  // stages that should be implemented in lowp, but aren't.

    NOPE(callback)
    LOWP(move_src_dst) LOWP(move_dst_src)
    NOPE(clamp_0) NOPE(clamp_1) LOWP(clamp_a) LOWP(clamp_a_dst)
    NOPE(unpremul) LOWP(premul) LOWP(premul_dst)
    LOWP(force_opaque) LOWP(force_opaque_dst)
    LOWP(set_rgb) LOWP(swap_rb) LOWP(invert)
    NOPE(from_srgb) NOPE(from_srgb_dst) NOPE(to_srgb)
    LOWP(black_color) LOWP(white_color) LOWP(uniform_color)
    LOWP(seed_shader) NOPE(dither)
    LOWP(load_a8)   LOWP(load_a8_dst)   LOWP(store_a8)   LOWP(gather_a8)
    LOWP(load_g8)   LOWP(load_g8_dst)   LOWP(gather_g8)
    LOWP(load_565)  LOWP(load_565_dst)  LOWP(store_565)  LOWP(gather_565)
    LOWP(load_4444) LOWP(load_4444_dst) LOWP(store_4444) LOWP(gather_4444)
    NOPE(load_f16)  NOPE(load_f16_dst)  NOPE(store_f16)  NOPE(gather_f16)
    NOPE(load_f32)  NOPE(load_f32_dst)  NOPE(store_f32)
    LOWP(load_8888) LOWP(load_8888_dst) LOWP(store_8888) LOWP(gather_8888)
    LOWP(load_bgra) LOWP(load_bgra_dst) LOWP(store_bgra) LOWP(gather_bgra)
    NOPE(load_1010102) NOPE(load_1010102_dst) NOPE(store_1010102) NOPE(gather_1010102)
    TODO(bilerp_clamp_8888)
    TODO(load_u16_be) TODO(load_rgb_u16_be) TODO(store_u16_be)
    NOPE(load_tables_u16_be) NOPE(load_tables_rgb_u16_be) NOPE(load_tables)
    NOPE(load_rgba) NOPE(store_rgba)
    LOWP(scale_u8) LOWP(scale_565) LOWP(scale_1_float)
    LOWP( lerp_u8) LOWP( lerp_565) LOWP( lerp_1_float)
    LOWP(dstatop) LOWP(dstin) LOWP(dstout) LOWP(dstover)
    LOWP(srcatop) LOWP(srcin) LOWP(srcout) LOWP(srcover)
    LOWP(clear) LOWP(modulate) LOWP(multiply) LOWP(plus_) LOWP(screen) LOWP(xor_)
    NOPE(colorburn) NOPE(colordodge) LOWP(darken) LOWP(difference)
    LOWP(exclusion) LOWP(hardlight) LOWP(lighten) LOWP(overlay) NOPE(softlight)
    NOPE(hue) NOPE(saturation) NOPE(color) NOPE(luminosity)
    LOWP(srcover_rgba_8888) LOWP(srcover_bgra_8888)
    LOWP(luminance_to_alpha)
    LOWP(matrix_translate) LOWP(matrix_scale_translate)
    LOWP(matrix_2x3) NOPE(matrix_3x4) TODO(matrix_4x5) TODO(matrix_4x3)
    LOWP(matrix_perspective)
    NOPE(parametric_r) NOPE(parametric_g) NOPE(parametric_b)
    NOPE(parametric_a) NOPE(gamma) NOPE(gamma_dst)
    NOPE(table_r) NOPE(table_g) NOPE(table_b) NOPE(table_a)
    NOPE(lab_to_xyz)
    TODO(mirror_x) TODO(repeat_x)
    TODO(mirror_y) TODO(repeat_y)
    LOWP(decal_x) LOWP(decal_y) LOWP(decal_x_and_y)
    LOWP(check_decal_mask)
    TODO(bilinear_nx) TODO(bilinear_px) TODO(bilinear_ny) TODO(bilinear_py)
    TODO(bicubic_n3x) TODO(bicubic_n1x) TODO(bicubic_p1x) TODO(bicubic_p3x)
    TODO(bicubic_n3y) TODO(bicubic_n1y) TODO(bicubic_p1y) TODO(bicubic_p3y)
    TODO(save_xy) TODO(accumulate)
    LOWP(clamp_x_1) LOWP(mirror_x_1) LOWP(repeat_x_1)
    LOWP(evenly_spaced_gradient)
    LOWP(gradient)
    LOWP(evenly_spaced_2_stop_gradient)
    LOWP(xy_to_unit_angle)
    LOWP(xy_to_radius)
    TODO(negate_x)
    TODO(xy_to_2pt_conical_strip)
    TODO(xy_to_2pt_conical_focal_on_circle)
    TODO(xy_to_2pt_conical_well_behaved)
    TODO(xy_to_2pt_conical_greater)
    TODO(xy_to_2pt_conical_smaller)
    TODO(alter_2pt_conical_compensate_focal)
    TODO(alter_2pt_conical_unswap)
    TODO(mask_2pt_conical_nan) TODO(mask_2pt_conical_degenerates) TODO(apply_vector_mask)
    TODO(byte_tables) TODO(byte_tables_rgb)
    NOPE(rgb_to_hsl) NOPE(hsl_to_rgb)
    NOPE(clut_3D) NOPE(clut_4D)
    NOPE(gauss_a_to_rgba)

    #undef LOWP
    #undef TODO
    #undef NOPE
#endif

// Engines comprise everything we need to run SkRasterPipelines.
struct SkJumper_Engine {
    StageFn*         stages[kNumStages];
    StartPipelineFn* start_pipeline;
    StageFn*         just_return;
};

// We'll default to this baseline engine, but try to choose a better one at runtime.
static const SkJumper_Engine kBaseline = {
#define M(stage) sk_##stage,
    { SK_RASTER_PIPELINE_STAGES(M) },
#undef M
    sk_start_pipeline,
    sk_just_return,
};
static SkJumper_Engine gEngine = kBaseline;
static SkOnce gChooseEngineOnce;

static SkJumper_Engine choose_engine() {
#if !SK_JUMPER_USE_ASSEMBLY
    // We'll just run baseline code.

#elif defined(__x86_64__) || defined(_M_X64)
    #if !defined(_MSC_VER)  // No _skx stages for Windows yet.
        if (1 && SkCpu::Supports(SkCpu::SKX)) {
            return {
            #define M(stage) ASM(stage, skx),
                { SK_RASTER_PIPELINE_STAGES(M) },
                M(start_pipeline)
                M(just_return)
            #undef M
            };
        }
    #endif
    if (1 && SkCpu::Supports(SkCpu::HSW)) {
        return {
        #define M(stage) ASM(stage, hsw),
            { SK_RASTER_PIPELINE_STAGES(M) },
            M(start_pipeline)
            M(just_return)
        #undef M
        };
    }
    if (1 && SkCpu::Supports(SkCpu::AVX)) {
        return {
        #define M(stage) ASM(stage, avx),
            { SK_RASTER_PIPELINE_STAGES(M) },
            M(start_pipeline)
            M(just_return)
        #undef M
        };
    }
    if (1 && SkCpu::Supports(SkCpu::SSE41)) {
        return {
        #define M(stage) ASM(stage, sse41),
            { SK_RASTER_PIPELINE_STAGES(M) },
            M(start_pipeline)
            M(just_return)
        #undef M
        };
    }
    if (1 && SkCpu::Supports(SkCpu::SSE2)) {
        return {
        #define M(stage) ASM(stage, sse2),
            { SK_RASTER_PIPELINE_STAGES(M) },
            M(start_pipeline)
            M(just_return)
        #undef M
        };
    }

#elif defined(__i386__) || defined(_M_IX86)
    if (1 && SkCpu::Supports(SkCpu::SSE2)) {
        return {
        #define M(stage) ASM(stage, sse2),
            { SK_RASTER_PIPELINE_STAGES(M) },
            M(start_pipeline)
            M(just_return)
        #undef M
        };
    }

#endif
    return kBaseline;
}

#ifndef SK_JUMPER_DISABLE_8BIT
    static const SkJumper_Engine kNone = {
    #define M(stage) nullptr,
        { SK_RASTER_PIPELINE_STAGES(M) },
    #undef M
        nullptr,
        nullptr,
    };
    static SkJumper_Engine gLowp = kNone;
    static SkOnce gChooseLowpOnce;

    static SkJumper_Engine choose_lowp() {
    #if SK_JUMPER_USE_ASSEMBLY
        #if defined(__x86_64__) || defined(_M_X64)
            if (1 && SkCpu::Supports(SkCpu::HSW)) {
                return {
                #define M(st) hsw_lowp<SkRasterPipeline::st>(),
                    { SK_RASTER_PIPELINE_STAGES(M) },
                    ASM(start_pipeline,hsw_lowp),
                    ASM(just_return   ,hsw_lowp),
                #undef M
                };
            }
            if (1 && SkCpu::Supports(SkCpu::SSE41)) {
                return {
                #define M(st) sse41_lowp<SkRasterPipeline::st>(),
                    { SK_RASTER_PIPELINE_STAGES(M) },
                    ASM(start_pipeline,sse41_lowp),
                    ASM(just_return   ,sse41_lowp),
                #undef M
                };
            }
            if (1 && SkCpu::Supports(SkCpu::SSE2)) {
                return {
                #define M(st) sse2_lowp<SkRasterPipeline::st>(),
                    { SK_RASTER_PIPELINE_STAGES(M) },
                    ASM(start_pipeline,sse2_lowp),
                    ASM(just_return   ,sse2_lowp),
                #undef M
                };
            }
        #elif defined(__i386__) || defined(_M_IX86)
            if (1 && SkCpu::Supports(SkCpu::SSE2)) {
                return {
                #define M(st) sse2_lowp<SkRasterPipeline::st>(),
                    { SK_RASTER_PIPELINE_STAGES(M) },
                    ASM(start_pipeline,sse2_lowp),
                    ASM(just_return   ,sse2_lowp),
                #undef M
                };
            }

        #elif defined(JUMPER_HAS_NEON_LOWP)
            return {
            #define M(st) neon_lowp<SkRasterPipeline::st>(),
                { SK_RASTER_PIPELINE_STAGES(M) },
                sk_start_pipeline_lowp,
                sk_just_return_lowp,
            #undef M
            };
        #endif
    #endif
        return kNone;
    }
#endif

const SkJumper_Engine& SkRasterPipeline::build_pipeline(void** ip) const {
#ifndef SK_JUMPER_DISABLE_8BIT
    gChooseLowpOnce([]{ gLowp = choose_lowp(); });

    // First try to build a lowp pipeline.  If that fails, fall back to normal float gEngine.
    void** reset_point = ip;
    *--ip = (void*)gLowp.just_return;
    for (const StageList* st = fStages; st; st = st->prev) {
        if (st->stage == SkRasterPipeline::clamp_0 ||
            st->stage == SkRasterPipeline::clamp_1) {
            continue;  // No-ops in lowp.
        }
        if (auto fn = SkOpts::stages_lowp[st->stage]) {
        if (StageFn* fn = gLowp.stages[st->stage]) {
            if (st->ctx) {
                *--ip = st->ctx;
            }
            *--ip = (void*)fn;
        } else {
            log_missing(st->stage);
            ip = reset_point;
            break;
        }
    }
    if (ip != reset_point) {
        return SkOpts::start_pipeline_lowp;
        return gLowp;
    }
#endif

    *--ip = (void*)SkOpts::just_return_highp;
    gChooseEngineOnce([]{ gEngine = choose_engine(); });
    // We're building the pipeline backwards, so we start with the final stage just_return.
    *--ip = (void*)gEngine.just_return;

    // Still going backwards, each stage's context pointer then its StageFn.
    for (const StageList* st = fStages; st; st = st->prev) {
        if (st->ctx) {
            *--ip = st->ctx;
        }
        *--ip = (void*)SkOpts::stages_highp[st->stage];
        *--ip = (void*)gEngine.stages[st->stage];
    }
    return SkOpts::start_pipeline_highp;
    return gEngine;
}
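// Editor's sketch (not part of the original diff): for a pipeline appended as
// load_8888 -> swap_rb -> store_8888, the slots written above read forward as
//   { &load_8888, load_ctx, &swap_rb, &store_8888, store_ctx, &just_return },
// i.e. each stage's function pointer immediately followed by its context pointer
// when it has one, with just_return as the terminator the last stage chains into.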

void SkRasterPipeline::run(size_t x, size_t y, size_t w, size_t h) const {

@@ -55,8 +477,8 @@ void SkRasterPipeline::run(size_t x, size_t y, size_t w, size_t h) const {
    // Best to not use fAlloc here... we can't bound how often run() will be called.
    SkAutoSTMalloc<64, void*> program(fSlotsNeeded);

    auto start_pipeline = this->build_pipeline(program.get() + fSlotsNeeded);
    start_pipeline(x,y,x+w,y+h, program.get());
    const SkJumper_Engine& engine = this->build_pipeline(program.get() + fSlotsNeeded);
    engine.start_pipeline(x,y,x+w,y+h, program.get());
}

std::function<void(size_t, size_t, size_t, size_t)> SkRasterPipeline::compile() const {

@@ -65,8 +487,9 @@ std::function<void(size_t, size_t, size_t, size_t)> SkRasterPipeline::compile()
    }

    void** program = fAlloc->makeArray<void*>(fSlotsNeeded);
    const SkJumper_Engine& engine = this->build_pipeline(program + fSlotsNeeded);

    auto start_pipeline = this->build_pipeline(program + fSlotsNeeded);
    auto start_pipeline = engine.start_pipeline;
    return [=](size_t x, size_t y, size_t w, size_t h) {
        start_pipeline(x,y,x+w,y+h, program);
    };
@@ -11,11 +11,34 @@
 #include <stddef.h>
 #include <stdint.h>

// This file contains definitions shared by SkJumper.cpp/SkJumper_stages.cpp
// and the rest of Skia.  It is important to keep the interface to SkJumper
// limited and simple to avoid serious ODR violation pitfalls, especially when
// using Microsoft's <math.h> and similar headers with inline-but-not-static
// function definitions.
// This file contains definitions shared by SkJumper.cpp (compiled normally as part of Skia)
// and SkJumper_stages.cpp (compiled into Skia _and_ offline into SkJumper_generated.h).
// Keep it simple!

// Externally facing functions (start_pipeline) are called a little specially on Windows.
#if defined(JUMPER_IS_OFFLINE) && defined(WIN) && defined(__x86_64__)
    #define MAYBE_MSABI __attribute__((ms_abi))                   // Use MS' ABI, not System V.
#elif defined(JUMPER_IS_OFFLINE) && defined(WIN) && defined(__i386__)
    #define MAYBE_MSABI __attribute__((force_align_arg_pointer))  // Re-align stack 4 -> 16 bytes.
#else
    #define MAYBE_MSABI
#endif

// Any custom ABI to use for all non-externally-facing stage functions.
#if defined(__ARM_NEON) && defined(__arm__)
    // This lets us pass vectors more efficiently on 32-bit ARM.
    #define ABI __attribute__((pcs("aapcs-vfp")))
#else
    #define ABI
#endif

// On ARM we expect that you're using Clang if you want SkJumper to be fast.
// If you are, the baseline float stages will use NEON, and lowp stages will
// also be available.  (If somehow you're building for ARM not using Clang,
// you'll get scalar baseline stages and no lowp support.)
#if defined(__clang__) && defined(__ARM_NEON)
    #define JUMPER_HAS_NEON_LOWP
#endif

static const int SkJumper_kMaxStride = 16;

@@ -53,7 +76,7 @@ struct SkJumper_DecalTileCtx {
};

struct SkJumper_CallbackCtx {
    void (*fn)(SkJumper_CallbackCtx* self, int active_pixels/*<= SkJumper_kMaxStride*/);
    MAYBE_MSABI void (*fn)(SkJumper_CallbackCtx* self, int active_pixels/*<= SkJumper_kMaxStride*/);

    // When called, fn() will have our active pixels available in rgba.
    // When fn() returns, the pipeline will read back those active pixels from read_from.
83545  src/jumper/SkJumper_generated.S  (new file; diff suppressed because it is too large)
73478  src/jumper/SkJumper_generated_win.S  (new file; diff suppressed because it is too large)
File diff suppressed because it is too large
961  src/jumper/SkJumper_stages_lowp.cpp  (new file)
@@ -0,0 +1,961 @@
/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// This restricted SkJumper backend works on 8-bit per channel pixels stored in
// 16-bit channels.  This is a last attempt to write a performant low-precision
// backend with stage definitions that can be shared by x86 and ARM.

#include "SkJumper.h"
#include "SkJumper_misc.h"

#if defined(__clang__)  // This file is empty when not compiled by Clang.

#if defined(__ARM_NEON)
    #include <arm_neon.h>
#elif defined(__SSE2__)
    #include <immintrin.h>
#else
    #include <math.h>
#endif

#if !defined(JUMPER_IS_OFFLINE)
    #define WRAP(name) sk_##name##_lowp
#elif defined(__AVX2__)
    #define WRAP(name) sk_##name##_hsw_lowp
#elif defined(__SSE4_1__)
    #define WRAP(name) sk_##name##_sse41_lowp
#elif defined(__SSE2__)
    #define WRAP(name) sk_##name##_sse2_lowp
#endif

#if defined(__AVX2__)
    using U8  = uint8_t  __attribute__((ext_vector_type(16)));
    using U16 = uint16_t __attribute__((ext_vector_type(16)));
    using I16 = int16_t  __attribute__((ext_vector_type(16)));
    using I32 = int32_t  __attribute__((ext_vector_type(16)));
    using U32 = uint32_t __attribute__((ext_vector_type(16)));
    using F   = float    __attribute__((ext_vector_type(16)));
#else
    using U8  = uint8_t  __attribute__((ext_vector_type(8)));
    using U16 = uint16_t __attribute__((ext_vector_type(8)));
    using I16 = int16_t  __attribute__((ext_vector_type(8)));
    using I32 = int32_t  __attribute__((ext_vector_type(8)));
    using U32 = uint32_t __attribute__((ext_vector_type(8)));
    using F   = float    __attribute__((ext_vector_type(8)));
#endif

static const size_t N = sizeof(U16) / sizeof(uint16_t);

// We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
                          U16  r, U16  g, U16  b, U16  a,
                          U16 dr, U16 dg, U16 db, U16 da);

extern "C" MAYBE_MSABI void WRAP(start_pipeline)(const size_t x0,
                                                 const size_t y0,
                                                 const size_t xlimit,
                                                 const size_t ylimit,
                                                 void** program) {
    auto start = (Stage)load_and_inc(program);
    for (size_t dy = y0; dy < ylimit; dy++) {
        size_t dx = x0;
        for (; dx + N <= xlimit; dx += N) {
            start(   0,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
        if (size_t tail = xlimit - dx) {
            start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
    }
}

extern "C" ABI void WRAP(just_return)(size_t,void**,size_t,size_t,
                                      U16,U16,U16,U16, U16,U16,U16,U16) {}

// All stages use the same function call ABI to chain into each other, but there are three types:
//   GG: geometry in, geometry out  -- think, a matrix
//   GP: geometry in, pixels out.   -- think, a memory gather
//   PP: pixels in, pixels out.     -- think, a blend mode
//
// (Some stages ignore their inputs or produce no logical output.  That's perfectly fine.)
//
// These three STAGE_ macros let you define each type of stage,
// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.

#define STAGE_GG(name, ...) \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
    extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
                                   U16  r, U16  g, U16  b, U16  a, \
                                   U16 dr, U16 dg, U16 db, U16 da) { \
        auto x = join<F>(r,g), \
             y = join<F>(b,a); \
        name##_k(Ctx{program}, dx,dy,tail, x,y); \
        split(x, &r,&g); \
        split(y, &b,&a); \
        auto next = (Stage)load_and_inc(program); \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
    } \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)

#define STAGE_GP(name, ...) \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
                     U16&  r, U16&  g, U16&  b, U16&  a, \
                     U16& dr, U16& dg, U16& db, U16& da); \
    extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
                                   U16  r, U16  g, U16  b, U16  a, \
                                   U16 dr, U16 dg, U16 db, U16 da) { \
        auto x = join<F>(r,g), \
             y = join<F>(b,a); \
        name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \
        auto next = (Stage)load_and_inc(program); \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
    } \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
                     U16&  r, U16&  g, U16&  b, U16&  a, \
                     U16& dr, U16& dg, U16& db, U16& da)

#define STAGE_PP(name, ...) \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
                     U16&  r, U16&  g, U16&  b, U16&  a, \
                     U16& dr, U16& dg, U16& db, U16& da); \
    extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
                                   U16  r, U16  g, U16  b, U16  a, \
                                   U16 dr, U16 dg, U16 db, U16 da) { \
        name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \
        auto next = (Stage)load_and_inc(program); \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
    } \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
                     U16&  r, U16&  g, U16&  b, U16&  a, \
                     U16& dr, U16& dg, U16& db, U16& da)

// ~~~~~~ Commonly used helper functions ~~~~~~ //

SI U16 div255(U16 v) {
#if 0
    return (v+127)/255;  // The ideal rounding divide by 255.
#else
    return (v+255)/256;  // A good approximation of (v+127)/255.
#endif
}
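// Editor's note (worked check, not from the original source): for v = 128*255 = 32640,
// the exact form gives (32640+127)/255 = 128 and the approximation (32640+255)/256 = 128;
// over the whole 16-bit product range the two differ by at most 1 (e.g. v = 65024 rounds
// to 255 exactly but 254 approximately), a tradeoff taken because dividing by 256 is a shift.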
|
||||
|
||||
SI U16 inv(U16 v) { return 255-v; }
|
||||
|
||||
SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); }
|
||||
SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); }
|
||||
|
||||
SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); }
|
||||
SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); }
|
||||
SI U16 max(U16 x, U16 y, U16 z) { return max(x, max(y, z)); }
|
||||
SI U16 min(U16 x, U16 y, U16 z) { return min(x, min(y, z)); }
|
||||
|
||||
SI U16 from_float(float f) { return f * 255.0f + 0.5f; }
|
||||
|
||||
SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); }
|
||||
|
||||
template <typename D, typename S>
|
||||
SI D cast(S src) {
|
||||
return __builtin_convertvector(src, D);
|
||||
}
|
||||
|
||||
template <typename D, typename S>
|
||||
SI void split(S v, D* lo, D* hi) {
|
||||
static_assert(2*sizeof(D) == sizeof(S), "");
|
||||
memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
|
||||
memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
|
||||
}
|
||||
template <typename D, typename S>
|
||||
SI D join(S lo, S hi) {
|
||||
static_assert(sizeof(D) == 2*sizeof(S), "");
|
||||
D v;
|
||||
memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
|
||||
memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
|
||||
return v;
|
||||
}
|
||||
template <typename V, typename H>
|
||||
SI V map(V v, H (*fn)(H)) {
|
||||
H lo,hi;
|
||||
split(v, &lo,&hi);
|
||||
lo = fn(lo);
|
||||
hi = fn(hi);
|
||||
return join<V>(lo,hi);
|
||||
}
|
||||
|
||||
// TODO: do we need platform-specific intrinsics for any of these?
|
||||
SI F if_then_else(I32 c, F t, F e) {
|
||||
return bit_cast<F>( (bit_cast<I32>(t) & c) | (bit_cast<I32>(e) & ~c) );
|
||||
}
|
||||
SI F max(F x, F y) { return if_then_else(x < y, y, x); }
|
||||
SI F min(F x, F y) { return if_then_else(x < y, x, y); }
|
||||
|
||||
SI F mad(F f, F m, F a) { return f*m+a; }
|
||||
SI U32 trunc_(F x) { return (U32)cast<I32>(x); }
|
||||
|
||||
SI F rcp(F x) {
|
||||
#if defined(__AVX2__)
|
||||
return map(x, _mm256_rcp_ps);
|
||||
#elif defined(__SSE__)
|
||||
return map(x, _mm_rcp_ps);
|
||||
#elif defined(__ARM_NEON)
|
||||
return map(x, +[](float32x4_t v) {
|
||||
auto est = vrecpeq_f32(v);
|
||||
return vrecpsq_f32(v,est)*est;
|
||||
});
|
||||
#else
|
||||
return 1.0f / x;
|
||||
#endif
|
||||
}
|
||||
SI F sqrt_(F x) {
|
||||
#if defined(__AVX2__)
|
||||
return map(x, _mm256_sqrt_ps);
|
||||
#elif defined(__SSE__)
|
||||
return map(x, _mm_sqrt_ps);
|
||||
#elif defined(__aarch64__)
|
||||
return map(x, vsqrtq_f32);
|
||||
#elif defined(__ARM_NEON)
|
||||
return map(x, +[](float32x4_t v) {
|
||||
auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v).
|
||||
est *= vrsqrtsq_f32(v,est*est);
|
||||
est *= vrsqrtsq_f32(v,est*est);
|
||||
return v*est; // sqrt(v) == v*rsqrt(v).
|
||||
});
|
||||
#else
|
||||
return F{
|
||||
sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]),
|
||||
sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]),
|
||||
};
|
||||
#endif
|
||||
}
|
||||
|
||||
SI F floor_(F x) {
|
||||
#if defined(__aarch64__)
|
||||
return map(x, vrndmq_f32);
|
||||
#elif defined(__AVX2__)
|
||||
return map(x, +[](__m256 v){ return _mm256_floor_ps(v); }); // _mm256_floor_ps is a macro...
|
||||
#elif defined(__SSE4_1__)
|
||||
return map(x, +[](__m128 v){ return _mm_floor_ps(v); }); // _mm_floor_ps() is a macro too.
|
||||
#else
|
||||
F roundtrip = cast<F>(cast<I32>(x));
|
||||
return roundtrip - if_then_else(roundtrip > x, F(1), F(0));
|
||||
#endif
|
||||
}
|
||||
SI F abs_(F x) { return bit_cast<F>( bit_cast<I32>(x) & 0x7fffffff ); }
|
||||
|
||||
// ~~~~~~ Basic / misc. stages ~~~~~~ //
|
||||
|
||||
STAGE_GG(seed_shader, const float* iota) {
|
||||
x = cast<F>(I32(dx)) + unaligned_load<F>(iota);
|
||||
y = cast<F>(I32(dy)) + 0.5f;
|
||||
}
|
||||
|
||||
STAGE_GG(matrix_translate, const float* m) {
|
||||
x += m[0];
|
||||
y += m[1];
|
||||
}
|
||||
STAGE_GG(matrix_scale_translate, const float* m) {
|
||||
x = mad(x,m[0], m[2]);
|
||||
y = mad(y,m[1], m[3]);
|
||||
}
|
||||
STAGE_GG(matrix_2x3, const float* m) {
|
||||
auto X = mad(x,m[0], mad(y,m[2], m[4])),
|
||||
Y = mad(x,m[1], mad(y,m[3], m[5]));
|
||||
x = X;
|
||||
y = Y;
|
||||
}
|
||||
STAGE_GG(matrix_perspective, const float* m) {
|
||||
// N.B. Unlike the other matrix_ stages, this matrix is row-major.
|
||||
auto X = mad(x,m[0], mad(y,m[1], m[2])),
|
||||
Y = mad(x,m[3], mad(y,m[4], m[5])),
|
||||
Z = mad(x,m[6], mad(y,m[7], m[8]));
|
||||
x = X * rcp(Z);
|
||||
y = Y * rcp(Z);
|
||||
}
|
||||
|
||||
STAGE_PP(uniform_color, const SkJumper_UniformColorCtx* c) {
|
||||
r = c->rgba[0];
|
||||
g = c->rgba[1];
|
||||
b = c->rgba[2];
|
||||
a = c->rgba[3];
|
||||
}
|
||||
STAGE_PP(black_color, Ctx::None) { r = g = b = 0; a = 255; }
|
||||
STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; }
|
||||
|
||||
STAGE_PP(set_rgb, const float rgb[3]) {
|
||||
r = from_float(rgb[0]);
|
||||
g = from_float(rgb[1]);
|
||||
b = from_float(rgb[2]);
|
||||
}
|
||||
|
||||
STAGE_PP(clamp_a, Ctx::None) {
|
||||
r = min(r, a);
|
||||
g = min(g, a);
|
||||
b = min(b, a);
|
||||
}
|
||||
STAGE_PP(clamp_a_dst, Ctx::None) {
|
||||
dr = min(dr, da);
|
||||
dg = min(dg, da);
|
||||
db = min(db, da);
|
||||
}
|
||||
|
||||
STAGE_PP(premul, Ctx::None) {
|
||||
r = div255(r * a);
|
||||
g = div255(g * a);
|
||||
b = div255(b * a);
|
||||
}
|
||||
STAGE_PP(premul_dst, Ctx::None) {
|
||||
dr = div255(dr * da);
|
||||
dg = div255(dg * da);
|
||||
db = div255(db * da);
|
||||
}
|
||||
|
||||
STAGE_PP(force_opaque , Ctx::None) { a = 255; }
|
||||
STAGE_PP(force_opaque_dst, Ctx::None) { da = 255; }
|
||||
|
||||
STAGE_PP(swap_rb, Ctx::None) {
|
||||
auto tmp = r;
|
||||
r = b;
|
||||
b = tmp;
|
||||
}
|
||||
|
||||
STAGE_PP(move_src_dst, Ctx::None) {
|
||||
dr = r;
|
||||
dg = g;
|
||||
db = b;
|
||||
da = a;
|
||||
}
|
||||
|
||||
STAGE_PP(move_dst_src, Ctx::None) {
|
||||
r = dr;
|
||||
g = dg;
|
||||
b = db;
|
||||
a = da;
|
||||
}
|
||||
|
||||
STAGE_PP(invert, Ctx::None) {
|
||||
r = inv(r);
|
||||
g = inv(g);
|
||||
b = inv(b);
|
||||
a = inv(a);
|
||||
}
|
||||
|
||||
// ~~~~~~ Blend modes ~~~~~~ //
|
||||
|
||||
// The same logic applied to all 4 channels.
|
||||
#define BLEND_MODE(name) \
|
||||
SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
|
||||
STAGE_PP(name, Ctx::None) { \
|
||||
r = name##_channel(r,dr,a,da); \
|
||||
g = name##_channel(g,dg,a,da); \
|
||||
b = name##_channel(b,db,a,da); \
|
||||
a = name##_channel(a,da,a,da); \
|
||||
} \
|
||||
SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
|
||||
|
||||
BLEND_MODE(clear) { return 0; }
|
||||
BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); }
|
||||
BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); }
|
||||
BLEND_MODE(srcin) { return div255( s*da ); }
|
||||
BLEND_MODE(dstin) { return div255( d*sa ); }
|
||||
BLEND_MODE(srcout) { return div255( s*inv(da) ); }
|
||||
BLEND_MODE(dstout) { return div255( d*inv(sa) ); }
|
||||
BLEND_MODE(srcover) { return s + div255( d*inv(sa) ); }
|
||||
BLEND_MODE(dstover) { return d + div255( s*inv(da) ); }
|
||||
BLEND_MODE(modulate) { return div255( s*d ); }
|
||||
BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
|
||||
BLEND_MODE(plus_) { return min(s+d, 255); }
|
||||
BLEND_MODE(screen) { return s + d - div255( s*d ); }
|
||||
BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); }
|
||||
#undef BLEND_MODE
|
||||
|
||||
// The same logic applied to color, and srcover for alpha.
|
||||
#define BLEND_MODE(name) \
|
||||
SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
|
||||
STAGE_PP(name, Ctx::None) { \
|
||||
r = name##_channel(r,dr,a,da); \
|
||||
g = name##_channel(g,dg,a,da); \
|
||||
b = name##_channel(b,db,a,da); \
|
||||
a = a + div255( da*inv(a) ); \
|
||||
} \
|
||||
SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
|
||||
|
||||
BLEND_MODE(darken) { return s + d - div255( max(s*da, d*sa) ); }
|
||||
BLEND_MODE(lighten) { return s + d - div255( min(s*da, d*sa) ); }
|
||||
BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); }
|
||||
BLEND_MODE(exclusion) { return s + d - 2*div255( s*d ); }
|
||||
|
||||
BLEND_MODE(hardlight) {
|
||||
return div255( s*inv(da) + d*inv(sa) +
|
||||
if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
|
||||
}
|
||||
BLEND_MODE(overlay) {
|
||||
return div255( s*inv(da) + d*inv(sa) +
|
||||
if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
|
||||
}
|
||||
#undef BLEND_MODE
|
||||
|
||||
// ~~~~~~ Helpers for interacting with memory ~~~~~~ //
|
||||
|
||||
template <typename T>
|
||||
SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, size_t dx, size_t dy) {
|
||||
return (T*)ctx->pixels + dy*ctx->stride + dx;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
|
||||
auto clamp = [](F v, F limit) {
|
||||
limit = bit_cast<F>( bit_cast<U32>(limit) - 1 ); // Exclusive -> inclusive.
|
||||
return min(max(0, v), limit);
|
||||
};
|
||||
x = clamp(x, ctx->width);
|
||||
y = clamp(y, ctx->height);
|
||||
|
||||
*ptr = (const T*)ctx->pixels;
|
||||
return trunc_(y)*ctx->stride + trunc_(x);
|
||||
}
|
||||
|
||||
template <typename V, typename T>
|
||||
SI V load(const T* ptr, size_t tail) {
|
||||
V v = 0;
|
||||
switch (tail & (N-1)) {
|
||||
case 0: memcpy(&v, ptr, sizeof(v)); break;
|
||||
#if defined(__AVX2__)
|
||||
case 15: v[14] = ptr[14];
|
||||
case 14: v[13] = ptr[13];
|
||||
case 13: v[12] = ptr[12];
|
||||
case 12: memcpy(&v, ptr, 12*sizeof(T)); break;
|
||||
case 11: v[10] = ptr[10];
|
||||
case 10: v[ 9] = ptr[ 9];
|
||||
case 9: v[ 8] = ptr[ 8];
|
||||
case 8: memcpy(&v, ptr, 8*sizeof(T)); break;
|
||||
#endif
|
||||
case 7: v[ 6] = ptr[ 6];
|
||||
case 6: v[ 5] = ptr[ 5];
|
||||
case 5: v[ 4] = ptr[ 4];
|
||||
case 4: memcpy(&v, ptr, 4*sizeof(T)); break;
|
||||
case 3: v[ 2] = ptr[ 2];
|
||||
case 2: memcpy(&v, ptr, 2*sizeof(T)); break;
|
||||
case 1: v[ 0] = ptr[ 0];
|
||||
}
|
||||
return v;
|
||||
}
|
||||
template <typename V, typename T>
|
||||
SI void store(T* ptr, size_t tail, V v) {
|
||||
switch (tail & (N-1)) {
|
||||
case 0: memcpy(ptr, &v, sizeof(v)); break;
|
||||
#if defined(__AVX2__)
|
||||
case 15: ptr[14] = v[14];
|
||||
case 14: ptr[13] = v[13];
|
||||
case 13: ptr[12] = v[12];
|
||||
case 12: memcpy(ptr, &v, 12*sizeof(T)); break;
|
||||
case 11: ptr[10] = v[10];
|
||||
case 10: ptr[ 9] = v[ 9];
|
||||
case 9: ptr[ 8] = v[ 8];
|
||||
case 8: memcpy(ptr, &v, 8*sizeof(T)); break;
|
||||
#endif
|
||||
case 7: ptr[ 6] = v[ 6];
|
||||
case 6: ptr[ 5] = v[ 5];
|
||||
case 5: ptr[ 4] = v[ 4];
|
||||
case 4: memcpy(ptr, &v, 4*sizeof(T)); break;
|
||||
case 3: ptr[ 2] = v[ 2];
|
||||
case 2: memcpy(ptr, &v, 2*sizeof(T)); break;
|
||||
case 1: ptr[ 0] = v[ 0];
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
template <typename V, typename T>
|
||||
SI V gather(const T* ptr, U32 ix) {
|
||||
return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
|
||||
ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
|
||||
ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
|
||||
ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
|
||||
}
|
||||
|
||||
template<>
|
||||
F gather(const float* p, U32 ix) {
|
||||
__m256i lo, hi;
|
||||
split(ix, &lo, &hi);
|
||||
|
||||
return join<F>(_mm256_i32gather_ps(p, lo, 4),
|
||||
_mm256_i32gather_ps(p, hi, 4));
|
||||
}
|
||||
|
||||
template<>
|
||||
U32 gather(const uint32_t* p, U32 ix) {
|
||||
__m256i lo, hi;
|
||||
split(ix, &lo, &hi);
|
||||
|
||||
return join<U32>(_mm256_i32gather_epi32(p, lo, 4),
|
||||
_mm256_i32gather_epi32(p, hi, 4));
|
||||
}
|
||||
#else
|
||||
template <typename V, typename T>
|
||||
SI V gather(const T* ptr, U32 ix) {
|
||||
return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
|
||||
ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //
|
||||
|
||||
SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
|
||||
#if 1 && defined(__AVX2__)
|
||||
// Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
|
||||
__m256i _01,_23;
|
||||
split(rgba, &_01, &_23);
|
||||
__m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
|
||||
_13 = _mm256_permute2x128_si256(_01,_23, 0x31);
|
||||
rgba = join<U32>(_02, _13);
|
||||
|
||||
auto cast_U16 = [](U32 v) -> U16 {
|
||||
__m256i _02,_13;
|
||||
split(v, &_02,&_13);
|
||||
return _mm256_packus_epi32(_02,_13);
|
||||
};
|
||||
#else
|
||||
auto cast_U16 = [](U32 v) -> U16 {
|
||||
return cast<U16>(v);
|
||||
};
|
||||
#endif
|
||||
*r = cast_U16(rgba & 65535) & 255;
|
||||
*g = cast_U16(rgba & 65535) >> 8;
|
||||
*b = cast_U16(rgba >> 16) & 255;
|
||||
*a = cast_U16(rgba >> 16) >> 8;
|
||||
}
|
||||
|
||||
SI void load_8888(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
|
||||
#if 1 && defined(__ARM_NEON)
|
||||
uint8x8x4_t rgba;
|
||||
switch (tail & (N-1)) {
|
||||
case 0: rgba = vld4_u8 ((const uint8_t*)(ptr+0) ); break;
|
||||
case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6);
|
||||
case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5);
|
||||
case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4);
|
||||
case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3);
|
||||
case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2);
|
||||
case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1);
|
||||
case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0);
|
||||
}
|
||||
*r = cast<U16>(rgba.val[0]);
|
||||
*g = cast<U16>(rgba.val[1]);
|
||||
*b = cast<U16>(rgba.val[2]);
|
||||
*a = cast<U16>(rgba.val[3]);
|
||||
#else
|
||||
from_8888(load<U32>(ptr, tail), r,g,b,a);
|
||||
#endif
|
||||
}
SI void store_8888(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
#if 1 && defined(__ARM_NEON)
    uint8x8x4_t rgba = {{
        cast<U8>(r),
        cast<U8>(g),
        cast<U8>(b),
        cast<U8>(a),
    }};
    switch (tail & (N-1)) {
        case 0: vst4_u8     ((uint8_t*)(ptr+0), rgba   ); break;
        case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6);
        case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5);
        case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4);
        case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3);
        case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2);
        case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1);
        case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0);
    }
#else
    store(ptr, tail, cast<U32>(r | (g<<8)) <<  0
                   | cast<U32>(b | (a<<8)) << 16);
#endif
}

STAGE_PP(load_8888, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_8888_dst, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_8888, const SkJumper_MemoryCtx* ctx) {
    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
}

STAGE_PP(load_bgra, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &b,&g,&r,&a);
}
STAGE_PP(load_bgra_dst, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &db,&dg,&dr,&da);
}
STAGE_PP(store_bgra, const SkJumper_MemoryCtx* ctx) {
    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, b,g,r,a);
}

STAGE_GP(gather_8888, const SkJumper_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
}
STAGE_GP(gather_bgra, const SkJumper_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &b, &g, &r, &a);
}

// ~~~~~~ 16-bit memory loads and stores ~~~~~~ //

SI void from_565(U16 rgb, U16* r, U16* g, U16* b) {
    // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0
    U16 R = (rgb >> 11) & 31,
        G = (rgb >>  5) & 63,
        B = (rgb >>  0) & 31;

    // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit.
    *r = (R << 3) | (R >> 2);
    *g = (G << 2) | (G >> 4);
    *b = (B << 3) | (B >> 2);
}
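// Worked example of the bit replication above: a 5-bit red of 16 (0b10000)
// becomes (16 << 3) | (16 >> 2) == 128 | 4 == 132, while exact scaling gives
// 16 * 255/31 ~= 131.6 -- a cheap, close approximation that still maps
// 0 -> 0 and 31 -> 255 exactly.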
SI void load_565(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
    from_565(load<U16>(ptr, tail), r,g,b);
}
SI void store_565(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
    // Select the top 5,6,5 bits.
    U16 R = r >> 3,
        G = g >> 2,
        B = b >> 3;
    // Pack them back into 15|rrrrr gggggg bbbbb|0.
    store(ptr, tail, R << 11
                   | G <<  5
                   | B <<  0);
}

STAGE_PP(load_565, const SkJumper_MemoryCtx* ctx) {
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
    a = 255;
}
STAGE_PP(load_565_dst, const SkJumper_MemoryCtx* ctx) {
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
    da = 255;
}
STAGE_PP(store_565, const SkJumper_MemoryCtx* ctx) {
    store_565(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
}
STAGE_GP(gather_565, const SkJumper_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_565(gather<U16>(ptr, ix), &r, &g, &b);
    a = 255;
}

SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) {
    // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0.
    U16 R = (rgba >> 12) & 15,
        G = (rgba >>  8) & 15,
        B = (rgba >>  4) & 15,
        A = (rgba >>  0) & 15;

    // Scale [0,15] to [0,255].
    *r = (R << 4) | R;
    *g = (G << 4) | G;
    *b = (B << 4) | B;
    *a = (A << 4) | A;
}
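// Worked example: (A << 4) | A is exact 4-bit to 8-bit scaling, i.e.
// A * 255/15 == A * 17: 0 -> 0, 6 -> 102, 15 -> 255.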
SI void load_4444(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
    from_4444(load<U16>(ptr, tail), r,g,b,a);
}
SI void store_4444(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
    // Select the top 4 bits of each.
    U16 R = r >> 4,
        G = g >> 4,
        B = b >> 4,
        A = a >> 4;
    // Pack them back into 15|rrrr gggg bbbb aaaa|0.
    store(ptr, tail, R << 12
                   | G <<  8
                   | B <<  4
                   | A <<  0);
}

STAGE_PP(load_4444, const SkJumper_MemoryCtx* ctx) {
    load_4444(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_4444_dst, const SkJumper_MemoryCtx* ctx) {
    load_4444(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_4444, const SkJumper_MemoryCtx* ctx) {
    store_4444(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a);
}
STAGE_GP(gather_4444, const SkJumper_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
}

// ~~~~~~ 8-bit memory loads and stores ~~~~~~ //

SI U16 load_8(const uint8_t* ptr, size_t tail) {
    return cast<U16>(load<U8>(ptr, tail));
}
SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
    store(ptr, tail, cast<U8>(v));
}

STAGE_PP(load_a8, const SkJumper_MemoryCtx* ctx) {
    r = g = b = 0;
    a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(load_a8_dst, const SkJumper_MemoryCtx* ctx) {
    dr = dg = db = 0;
    da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(store_a8, const SkJumper_MemoryCtx* ctx) {
    store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
}
STAGE_GP(gather_a8, const SkJumper_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = 0;
    a = cast<U16>(gather<U8>(ptr, ix));
}

STAGE_PP(load_g8, const SkJumper_MemoryCtx* ctx) {
    r = g = b = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    a = 255;
}
STAGE_PP(load_g8_dst, const SkJumper_MemoryCtx* ctx) {
    dr = dg = db = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    da = 255;
}
STAGE_PP(luminance_to_alpha, Ctx::None) {
    a = (r*54 + g*183 + b*19)/256;  // 0.2126, 0.7152, 0.0722 with 256 denominator.
    r = g = b = 0;
}
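// Sanity check on the weights above: 54/256 ~= 0.211, 183/256 ~= 0.715,
// 19/256 ~= 0.074, and 54 + 183 + 19 == 256, so full white (255,255,255)
// maps to exactly 255.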
STAGE_GP(gather_g8, const SkJumper_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = cast<U16>(gather<U8>(ptr, ix));
    a = 255;
}

// ~~~~~~ Coverage scales / lerps ~~~~~~ //

STAGE_PP(scale_1_float, const float* f) {
    U16 c = from_float(*f);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_1_float, const float* f) {
    U16 c = from_float(*f);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

STAGE_PP(scale_u8, const SkJumper_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_u8, const SkJumper_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
    return if_then_else(a < da, min(cr,cg,cb)
                              , max(cr,cg,cb));
}
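// Worked example: with a = 100, da = 200 and rgb coverage (cr,cg,cb) = (10,20,30),
// a < da picks the minimum, so ca = 10; with a = 200, da = 100 it would pick the
// maximum, ca = 30.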
STAGE_PP(scale_565, const SkJumper_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = div255( r * cr );
    g = div255( g * cg );
    b = div255( b * cb );
    a = div255( a * ca );
}
STAGE_PP(lerp_565, const SkJumper_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = lerp(dr, r, cr);
    g = lerp(dg, g, cg);
    b = lerp(db, b, cb);
    a = lerp(da, a, ca);
}

// ~~~~~~ Gradient stages ~~~~~~ //

// Clamp x to [0,1], both sides inclusive (think, gradients).
// Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
SI F clamp_01(F v) { return min(max(0, v), 1); }

STAGE_GG(clamp_x_1 , Ctx::None) { x = clamp_01(x); }
STAGE_GG(repeat_x_1, Ctx::None) { x = clamp_01(x - floor_(x)); }
STAGE_GG(mirror_x_1, Ctx::None) {
    auto two = [](F x){ return x+x; };
    x = clamp_01(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f ));
}
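// Worked example: mirroring has period 2. For x = 1.5:
// (1.5-1) - 2*floor(0.5*0.5) - 1 == 0.5 - 0 - 1 == -0.5, and abs_() gives 0.5,
// the mirror of 1.5 about 1. For x = 2.5 the same steps also yield 0.5.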

SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); }

STAGE_GG(decal_x, SkJumper_DecalTileCtx* ctx) {
    auto w = ctx->limit_x;
    unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w)));
}
STAGE_GG(decal_y, SkJumper_DecalTileCtx* ctx) {
    auto h = ctx->limit_y;
    unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h)));
}
STAGE_GG(decal_x_and_y, SkJumper_DecalTileCtx* ctx) {
    auto w = ctx->limit_x;
    auto h = ctx->limit_y;
    unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h)));
}
STAGE_PP(check_decal_mask, SkJumper_DecalTileCtx* ctx) {
    auto mask = unaligned_load<U16>(ctx->mask);
    r = r & mask;
    g = g & mask;
    b = b & mask;
    a = a & mask;
}
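// A note on the masks above: each vector comparison produces lanes of all ones
// (true) or all zeroes (false), so cond_to_mask_16() only has to narrow them to
// 16 bits, and check_decal_mask zeroes out-of-bounds pixels with a plain AND.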


SI U16 round_F_to_U16(F x) { return cast<U16>(x * 255.0f + 0.5f); }

SI void gradient_lookup(const SkJumper_GradientCtx* c, U32 idx, F t,
                        U16* r, U16* g, U16* b, U16* a) {

    F fr, fg, fb, fa, br, bg, bb, ba;
#if defined(__AVX2__)
    if (c->stopCount <= 8) {
        __m256i lo, hi;
        split(idx, &lo, &hi);

        fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi));
        br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi));
        fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi));
        bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi));
        fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi));
        bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi));
        fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi));
        ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi));
    } else
#endif
    {
        fr = gather<F>(c->fs[0], idx);
        fg = gather<F>(c->fs[1], idx);
        fb = gather<F>(c->fs[2], idx);
        fa = gather<F>(c->fs[3], idx);
        br = gather<F>(c->bs[0], idx);
        bg = gather<F>(c->bs[1], idx);
        bb = gather<F>(c->bs[2], idx);
        ba = gather<F>(c->bs[3], idx);
    }
    *r = round_F_to_U16(mad(t, fr, br));
    *g = round_F_to_U16(mad(t, fg, bg));
    *b = round_F_to_U16(mad(t, fb, bb));
    *a = round_F_to_U16(mad(t, fa, ba));
}

STAGE_GP(gradient, const SkJumper_GradientCtx* c) {
    auto t = x;
    U32 idx = 0;

    // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
    for (size_t i = 1; i < c->stopCount; i++) {
        idx += if_then_else(t >= c->ts[i], U32(1), U32(0));
    }

    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}
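// Worked example: with stops ts = {0, 0.5, 1} and t = 0.75, the loop adds 1
// for ts[1] (0.75 >= 0.5) and nothing for ts[2] (0.75 < 1), so idx == 1 selects
// the second interval; gradient_lookup() then evaluates that interval's stored
// slope/bias line, channel = f[idx]*t + b[idx].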

STAGE_GP(evenly_spaced_gradient, const SkJumper_GradientCtx* c) {
    auto t = x;
    auto idx = trunc_(t * (c->stopCount-1));
    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}

STAGE_GP(evenly_spaced_2_stop_gradient, const void* ctx) {
    // TODO: Rename Ctx SkJumper_EvenlySpaced2StopGradientCtx.
    struct Ctx { float f[4], b[4]; };
    auto c = (const Ctx*)ctx;

    auto t = x;
    r = round_F_to_U16(mad(t, c->f[0], c->b[0]));
    g = round_F_to_U16(mad(t, c->f[1], c->b[1]));
    b = round_F_to_U16(mad(t, c->f[2], c->b[2]));
    a = round_F_to_U16(mad(t, c->f[3], c->b[3]));
}

STAGE_GG(xy_to_unit_angle, Ctx::None) {
    F xabs = abs_(x),
      yabs = abs_(y);

    F slope = min(xabs, yabs)/max(xabs, yabs);
    F s = slope * slope;

    // Use a 7th degree polynomial to approximate atan.
    // This was generated using sollya.gforge.inria.fr.
    // A float optimized polynomial was generated using the following command.
    // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
    F phi = slope
             * (0.15912117063999176025390625f     + s
             * (-5.185396969318389892578125e-2f   + s
             * (2.476101927459239959716796875e-2f + s
             * (-7.0547382347285747528076171875e-3f))));

    phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
    phi = if_then_else(x < 0.0f   , 1.0f/2.0f - phi, phi);
    phi = if_then_else(y < 0.0f   , 1.0f - phi     , phi);
    phi = if_then_else(phi != phi , 0              , phi);  // Check for NaN.
    x = phi;
}
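// Sanity check on the polynomial: at (x,y) = (1,1), slope == 1 and s == 1, so it
// sums to roughly 0.159121 - 0.051854 + 0.024761 - 0.007055 ~= 0.124973, i.e.
// about 1/8 of a turn -- 45 degrees, as expected. The if_then_else chain then
// reflects this first-octant result into the other seven octants.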
STAGE_GG(xy_to_radius, Ctx::None) {
    x = sqrt_(x*x + y*y);
}

// ~~~~~~ Compound stages ~~~~~~ //

STAGE_PP(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    load_8888(ptr, tail, &dr,&dg,&db,&da);
    r = r + div255( dr*inv(a) );
    g = g + div255( dg*inv(a) );
    b = b + div255( db*inv(a) );
    a = a + div255( da*inv(a) );
    store_8888(ptr, tail, r,g,b,a);
}
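// A note on the math above, assuming inv() is the usual 255 - v helper: this is
// standard src-over, out = src + dst*(1 - srcAlpha), in 8-bit fixed point, with
// div255() rescaling the 16-bit product back to [0,255].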
STAGE_PP(srcover_bgra_8888, const SkJumper_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    load_8888(ptr, tail, &db,&dg,&dr,&da);
    r = r + div255( dr*inv(a) );
    g = g + div255( dg*inv(a) );
    b = b + div255( db*inv(a) );
    a = a + div255( da*inv(a) );
    store_8888(ptr, tail, b,g,r,a);
}

#endif//defined(__clang__)
261
src/jumper/build_stages.py
Executable file
@@ -0,0 +1,261 @@
#!/usr/bin/env python2.7
#
# Copyright 2017 Google Inc.
#
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import re
import subprocess
import sys

clang         = 'clang-5.0'
objdump       = 'gobjdump'
ccache        = 'ccache'
stages        = 'src/jumper/SkJumper_stages.cpp'
stages_lowp   = 'src/jumper/SkJumper_stages_lowp.cpp'
generated     = 'src/jumper/SkJumper_generated.S'
generated_win = 'src/jumper/SkJumper_generated_win.S'

clang         = sys.argv[1] if len(sys.argv) > 1 else clang
objdump       = sys.argv[2] if len(sys.argv) > 2 else objdump
ccache        = sys.argv[3] if len(sys.argv) > 3 else ccache
stages        = sys.argv[4] if len(sys.argv) > 4 else stages
stages_lowp   = sys.argv[5] if len(sys.argv) > 5 else stages_lowp
generated     = sys.argv[6] if len(sys.argv) > 6 else generated
generated_win = sys.argv[7] if len(sys.argv) > 7 else generated_win

clang = [ccache, clang, '-x', 'c++']


cflags = ['-std=c++11', '-Os', '-DJUMPER_IS_OFFLINE',
          '-momit-leaf-frame-pointer', '-ffp-contract=fast',
          '-fno-exceptions', '-fno-rtti', '-fno-unwind-tables']

x86  = [ '-m32' ]
win  = ['-DWIN', '-mno-red-zone']
sse2 = ['-msse2', '-mno-sse3', '-mno-ssse3', '-mno-sse4.1']
subprocess.check_call(clang + cflags + sse2 +
                      ['-c', stages] +
                      ['-o', 'sse2.o'])
subprocess.check_call(clang + cflags + sse2 + win +
                      ['-c', stages] +
                      ['-o', 'win_sse2.o'])
subprocess.check_call(clang + cflags + sse2 + x86 +
                      ['-c', stages] +
                      ['-o', 'x86_sse2.o'])
subprocess.check_call(clang + cflags + sse2 + win + x86 +
                      ['-c', stages] +
                      ['-o', 'win_x86_sse2.o'])

subprocess.check_call(clang + cflags + sse2 +
                      ['-c', stages_lowp] +
                      ['-o', 'lowp_sse2.o'])
subprocess.check_call(clang + cflags + sse2 + win +
                      ['-c', stages_lowp] +
                      ['-o', 'win_lowp_sse2.o'])
subprocess.check_call(clang + cflags + sse2 + x86 +
                      ['-c', stages_lowp] +
                      ['-o', 'x86_lowp_sse2.o'])
subprocess.check_call(clang + cflags + sse2 + win + x86 +
                      ['-c', stages_lowp] +
                      ['-o', 'win_x86_lowp_sse2.o'])

sse41 = ['-msse4.1']
subprocess.check_call(clang + cflags + sse41 +
                      ['-c', stages] +
                      ['-o', 'sse41.o'])
subprocess.check_call(clang + cflags + sse41 + win +
                      ['-c', stages] +
                      ['-o', 'win_sse41.o'])

subprocess.check_call(clang + cflags + sse41 +
                      ['-c', stages_lowp] +
                      ['-o', 'lowp_sse41.o'])
subprocess.check_call(clang + cflags + sse41 + win +
                      ['-c', stages_lowp] +
                      ['-o', 'win_lowp_sse41.o'])

avx = ['-mavx']
subprocess.check_call(clang + cflags + avx +
                      ['-c', stages] +
                      ['-o', 'avx.o'])
subprocess.check_call(clang + cflags + avx + win +
                      ['-c', stages] +
                      ['-o', 'win_avx.o'])

hsw = ['-mavx2', '-mfma', '-mf16c']
subprocess.check_call(clang + cflags + hsw +
                      ['-c', stages] +
                      ['-o', 'hsw.o'])
subprocess.check_call(clang + cflags + hsw + win +
                      ['-c', stages] +
                      ['-o', 'win_hsw.o'])

subprocess.check_call(clang + cflags + hsw +
                      ['-c', stages_lowp] +
                      ['-o', 'lowp_hsw.o'])
subprocess.check_call(clang + cflags + hsw + win +
                      ['-c', stages_lowp] +
                      ['-o', 'win_lowp_hsw.o'])

skx = ['-march=skylake-avx512']
subprocess.check_call(clang + cflags + skx +
                      ['-c', stages] +
                      ['-o', 'skx.o'])

# Merge x86-64 object files to deduplicate constants.
# (No other platform has more than one specialization.)
subprocess.check_call(['ld', '-r', '-o', 'merged.o',
                       'skx.o', 'hsw.o', 'avx.o', 'sse41.o', 'sse2.o',
                       'lowp_hsw.o', 'lowp_sse41.o', 'lowp_sse2.o'])
subprocess.check_call(['ld', '-r', '-o', 'win_merged.o',
                       'win_hsw.o', 'win_avx.o', 'win_sse41.o', 'win_sse2.o',
                       'win_lowp_hsw.o', 'win_lowp_sse41.o', 'win_lowp_sse2.o'])

subprocess.check_call(['ld', '-r', '-o', 'x86_merged.o',
                       'x86_sse2.o',
                       'x86_lowp_sse2.o'])
subprocess.check_call(['ld', '-r', '-o', 'win_x86_merged.o',
                       'win_x86_sse2.o',
                       'win_x86_lowp_sse2.o'])
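# A note on 'ld -r': it performs a partial (relocatable) link, combining the
# per-ISA objects into one .o without resolving it into a final binary; as the
# comment above says, that lets constants shared across the x86-64 flavors be
# deduplicated rather than emitted once per object.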

def parse_object_file(dot_o, directive, target=None):
    globl, hidden, label, comment, align = \
            '.globl', 'HIDDEN', ':', '// ', 'BALIGN'
    if 'win' in dot_o:
        globl, hidden, label, comment, align = \
                'PUBLIC', '', ' LABEL PROC', '; ', 'ALIGN '

    cmd = [objdump]
    if target:
        cmd += ['--target', target]

    # Look for sections we know we can't handle.
    section_headers = subprocess.check_output(cmd + ['-h', dot_o])
    for snippet in ['.rodata']:
        if snippet in section_headers:
            print >>sys.stderr, 'Found %s in section headers.' % snippet
            assert snippet not in section_headers

    if directive == '.long':
        disassemble = ['-d', dot_o]
        dehex = lambda h: '0x'+h
    else:
        # x86-64... as long as we're using %rip-relative addressing,
        # literal sections should be fine to just dump in with .text.
        disassemble = ['-d',               # DO NOT USE -D.
                       '-z',               # Print zero bytes instead of ...
                       '--insn-width=11',
                       '-j', '.text',
                       '-j', '.literal4',
                       '-j', '.literal8',
                       '-j', '.literal16',
                       '-j', '.const',
                       dot_o]
        dehex = lambda h: str(int(h,16))
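    # dehex converts one hex token from objdump's output into the form the
    # chosen directive expects: e.g. 'c3' -> '195' for the '.byte'/'DB' calls
    # below, or a '0x'-prefixed word for the '.long' path.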

    # Ok.  Let's disassemble.
    for line in subprocess.check_output(cmd + disassemble).split('\n'):
        line = line.strip()

        if not line or line.startswith(dot_o) or line.startswith('Disassembly'):
            continue

        # E.g. 00000000000003a4 <_load_f16>:
        m = re.match('''[0-9a-f]+ <_?(.*)>:''', line)
        if m:
            print
            sym = m.group(1)
            if sym.startswith('.literal'):    # .literal4, .literal16, etc
                print sym.replace('.literal', align)
            elif sym.startswith('.const'):    # 32-byte constants
                print align + '32'
            elif not sym.startswith('sk_'):
                print >>sys.stderr, "build_stages.py can't handle '%s' (yet?)." % sym
                assert sym.startswith('sk_')
            else:  # a stage function
                if hidden:
                    print hidden + ' _' + sym
                print globl + ' _' + sym
                if 'win' not in dot_o:
                    print 'FUNCTION(_' + sym + ')'
                print '_' + sym + label
            continue

        columns = line.split('\t')
        #print >>sys.stderr, columns
        code = columns[1]
        if len(columns) >= 4:
            inst = columns[2]
            args = columns[3]
        else:
            inst, args = columns[2], ''
            if ' ' in columns[2]:
                inst, args = columns[2].split(' ', 1)
        code, inst, args = code.strip(), inst.strip(), args.strip()

        hexed = ','.join(dehex(x) for x in code.split(' '))
        print '  ' + directive + '  ' + hexed + ' '*(36-len(hexed)) + \
              comment + inst + (' '*(14-len(inst)) + args if args else '')

sys.stdout = open(generated, 'w')

print '''# Copyright 2017 Google Inc.
#
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# This file is generated semi-automatically with this command:
#   $ src/jumper/build_stages.py
'''
print '#if defined(__MACH__)'
print '    #define HIDDEN .private_extern'
print '    #define FUNCTION(name)'
print '    #define BALIGN4  .align 2'
print '    #define BALIGN8  .align 3'
print '    #define BALIGN16 .align 4'
print '    #define BALIGN32 .align 5'
print '#else'
print '    .section .note.GNU-stack,"",%progbits'
print '    #define HIDDEN .hidden'
print '    #define FUNCTION(name) .type name,%function'
print '    #define BALIGN4  .balign 4'
print '    #define BALIGN8  .balign 8'
print '    #define BALIGN16 .balign 16'
print '    #define BALIGN32 .balign 32'
print '#endif'

print '.text'
print '#if defined(__x86_64__)'
print 'BALIGN32'
parse_object_file('merged.o', '.byte')

print '#elif defined(__i386__)'
print 'BALIGN32'
parse_object_file('x86_merged.o', '.byte')

print '#endif'

sys.stdout = open(generated_win, 'w')
print '''; Copyright 2017 Google Inc.
;
; Use of this source code is governed by a BSD-style license that can be
; found in the LICENSE file.

; This file is generated semi-automatically with this command:
;   $ src/jumper/build_stages.py
'''
print 'IFDEF RAX'
print "_text32 SEGMENT ALIGN(32) 'CODE'"
print 'ALIGN 32'
parse_object_file('win_merged.o', 'DB')

print 'ELSE'
print '.MODEL FLAT,C'
print "_text32 SEGMENT ALIGN(32) 'CODE'"
print 'ALIGN 32'
parse_object_file('win_x86_merged.o', 'DB')

print 'ENDIF'
print 'END'
@@ -19,11 +19,11 @@

namespace SK_OPTS_NS {

template <typename T, typename P>
static inline T unaligned_load(const P* p) {
    T v;
    memcpy(&v, p, sizeof(v));
    return v;
template <typename T>
static inline T unaligned_load(const uint8_t* src) {
    T val;
    memcpy(&val, src, sizeof(val));
    return val;
}

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
@@ -5,10 +5,14 @@
 * found in the LICENSE file.
 */

#include "SkSafe_math.h"   // Keep this first.
#include "SkOpts.h"

#if defined(_INC_MATH) && !defined(INC_MATH_IS_SAFE_NOW)
    #error We have included ucrt\math.h without protecting it against ODR violation.
#endif

#define SK_OPTS_NS avx
#include "SkRasterPipeline_opts.h"
#include "SkUtils_opts.h"

namespace SkOpts {
@@ -16,17 +20,5 @@ namespace SkOpts {
    memset16 = SK_OPTS_NS::memset16;
    memset32 = SK_OPTS_NS::memset32;
    memset64 = SK_OPTS_NS::memset64;

#define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
    SK_RASTER_PIPELINE_STAGES(M)
    just_return_highp    = (StageFn)SK_OPTS_NS::just_return;
    start_pipeline_highp = SK_OPTS_NS::start_pipeline;
#undef M

#define M(st) stages_lowp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::lowp::st;
    SK_RASTER_PIPELINE_STAGES(M)
    just_return_lowp    = (StageFn)SK_OPTS_NS::lowp::just_return;
    start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
#undef M
}
}

@@ -1,28 +0,0 @@
/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkOpts.h"

#define SK_OPTS_NS hsw
#include "SkRasterPipeline_opts.h"
#include "SkUtils_opts.h"

namespace SkOpts {
    void Init_hsw() {
#define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
        SK_RASTER_PIPELINE_STAGES(M)
        just_return_highp    = (StageFn)SK_OPTS_NS::just_return;
        start_pipeline_highp = SK_OPTS_NS::start_pipeline;
#undef M

#define M(st) stages_lowp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::lowp::st;
        SK_RASTER_PIPELINE_STAGES(M)
        just_return_lowp    = (StageFn)SK_OPTS_NS::lowp::just_return;
        start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
#undef M
    }
}
@@ -8,23 +8,10 @@
#include "SkOpts.h"

#define SK_OPTS_NS sse41
#include "SkRasterPipeline_opts.h"
#include "SkBlitRow_opts.h"

namespace SkOpts {
    void Init_sse41() {
        blit_row_s32a_opaque = sse41::blit_row_s32a_opaque;

#define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
        SK_RASTER_PIPELINE_STAGES(M)
        just_return_highp    = (StageFn)SK_OPTS_NS::just_return;
        start_pipeline_highp = SK_OPTS_NS::start_pipeline;
#undef M

#define M(st) stages_lowp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::lowp::st;
        SK_RASTER_PIPELINE_STAGES(M)
        just_return_lowp    = (StageFn)SK_OPTS_NS::lowp::just_return;
        start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
#undef M
    }
}