SkSplicer: lowp hacking

Add lowp variants for most stages in SkSplicer.  These double the number
of pixels handled by representing each channel with 16 bits, ranging from
0x0000 as 0 to 0x8000 as 1.  This format lets us use the Q15 multiply
instructions available in NEON and SSSE3 at full register width, with
a little platform-specific fix-up to smooth over the fact that these
aren't quite Q15 values.
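
For reference (a sketch, not part of this commit), here is the scalar math
these stages approximate, using a hypothetical mul_fixed15() helper:

    #include <stdint.h>

    // With 1.0 == 0x8000, the Fixed15 product of a and b is
    // (a*b + 0x4000) >> 15, rounded to nearest.  NEON's sqrdmulh and
    // SSSE3's pmulhrsw compute exactly this on signed Q15 lanes; the
    // fix-up repairs lanes where the unsigned value 0x8000 reads as negative.
    static uint16_t mul_fixed15(uint16_t a, uint16_t b) {
        return (uint16_t)(((uint32_t)a * (uint32_t)b + 0x4000) >> 15);
    }
    // mul_fixed15(0x8000, 0x8000) == 0x8000   (1.0 * 1.0 == 1.0)
    // mul_fixed15(0x8000, 0x4000) == 0x4000   (1.0 * 0.5 == 0.5)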

When a lowp stage is unavailable, the entire pipeline upgrades to
floats.  So by simply not implementing sRGB, f16, matrix multiplication,
etc., we naturally express that they're best handled with floats.

These lowp stages ended up different enough that I've found it clearer
to have them live in their own files, noting where they differ from the
float stages.  HSW, aarch64, and armv7 are all supported.

I've seen very good performance on all platforms.

Change-Id: Ib4f820c6665f2c9020f7449a2b51bbaf6c408a63
Reviewed-on: https://skia-review.googlesource.com/7098
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Mike Klein, 2017-01-15 18:14:07 -05:00, committed by Skia Commit-Bot
parent 0ee6f95fa4
commit f720098671
7 changed files with 987 additions and 67 deletions

src/splicer/SkSplicer.cpp

@@ -16,10 +16,12 @@
#endif
#include "SkSplicer_generated.h"
#include "SkSplicer_generated_lowp.h"
#include "SkSplicer_shared.h"
// Uncomment one of these to dump the JIT'd pipeline.
//#define DUMP "/tmp/dump.bin"
//#define DUMP "/data/local/tmp/dump.bin"
//
// On x86, we'll include IACA markers too.
// https://software.intel.com/en-us/articles/intel-architecture-code-analyzer
@@ -27,10 +29,10 @@
// $ ./iaca.sh -arch HSW -64 -mark 0 /tmp/dump.bin | less
//
// To disassemble an aarch64 dump,
// $ gobjdump -b binary -D dump.bin -m aarch64
// $ adb pull /data/local/tmp/dump.bin; gobjdump -b binary -D dump.bin -m aarch64 | less
//
// To disassemble an armv7 dump,
// $ gobjdump -b binary -D dump.bin -m arm
// $ adb pull /data/local/tmp/dump.bin; gobjdump -b binary -D dump.bin -m arm | less
namespace {
@@ -41,11 +43,18 @@ namespace {
0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f, // from_srgb
12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f, // to_srgb
};
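// Fixed15 constants for the lowp stages: an epsilon (0x0001) and 1.0 (0x8000).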
static const SkSplicer_constants_lowp kConstants_lowp = {
0x0001, 0x8000,
};
// We do this a lot, so it's nice to infer the correct size. Works fine with arrays.
template <typename T>
static void splice(SkWStream* buf, const T& val) {
buf->write(&val, sizeof(val));
// This null check makes it easier to determine whether we can drop to lowp:
// calling with a null buf checks stage support without writing anything.
// buf's null-ness is always known at compile time.
if (buf) {
buf->write(&val, sizeof(val));
}
}
#if defined(__aarch64__)
@@ -59,7 +68,6 @@ namespace {
splice(buf, 0xf2800000 | (parts[0] << 5) | 0x2); // merge 16-bit intermediate << 0 into x2
}
static void loop(SkWStream* buf, int loop_start) {
splice(buf, 0x91001000); // add x0, x0, #4
splice(buf, 0xeb01001f); // cmp x0, x1
int off = loop_start - (int)buf->bytesWritten();
off /= 4; // bytes -> instructions, still signed
@@ -81,7 +89,6 @@ namespace {
splice(buf, 0xe3402000 | encode(parts[1])); // movt r2, <top 16 bits>
}
static void loop(SkWStream* buf, int loop_start) {
splice(buf, 0xe2800002); // add r0, r0, #2
splice(buf, 0xe1500001); // cmp r0, r1
int off = loop_start - ((int)buf->bytesWritten() + 8 /*ARM is weird*/);
off /= 4; // bytes -> instructions, still signed
@@ -99,10 +106,8 @@ namespace {
splice(buf, ctx);
}
static void loop(SkWStream* buf, int loop_start) {
static const uint8_t addq_8_rdi[] = { 0x48, 0x83, 0xc7, 0x08 };
static const uint8_t cmp_rsi_rdi[] = { 0x48, 0x39, 0xf7 };
static const uint8_t jb_near[] = { 0x0f, 0x8c };
splice(buf, addq_8_rdi); // addq $8, %rdi
splice(buf, cmp_rsi_rdi); // cmp %rsi, %rdi
splice(buf, jb_near); // jb <next 4 bytes> (b == "before", unsigned less than)
splice(buf, loop_start - (int)(buf->bytesWritten() + 4));
@@ -236,12 +241,64 @@ namespace {
}
#endif
static bool splice_lowp(SkWStream* buf, SkRasterPipeline::StockStage st) {
switch (st) {
default: return false;
case SkRasterPipeline::clamp_0: break; // lowp can't go below 0.
#define CASE(st) case SkRasterPipeline::st: splice(buf, kSplice_##st##_lowp); break
CASE(clear);
CASE(plus_);
CASE(srcover);
CASE(dstover);
CASE(clamp_1);
CASE(clamp_a);
CASE(swap);
CASE(move_src_dst);
CASE(move_dst_src);
CASE(premul);
CASE(load_8888);
CASE(store_8888);
#undef CASE
}
return true;
}
static bool splice_highp(SkWStream* buf, SkRasterPipeline::StockStage st) {
switch (st) {
default: return false;
#define CASE(st) case SkRasterPipeline::st: splice(buf, kSplice_##st); break
CASE(clear);
CASE(plus_);
CASE(srcover);
CASE(dstover);
CASE(clamp_0);
CASE(clamp_1);
CASE(clamp_a);
CASE(swap);
CASE(move_src_dst);
CASE(move_dst_src);
CASE(premul);
CASE(unpremul);
CASE(from_srgb);
CASE(to_srgb);
CASE(scale_u8);
CASE(load_tables);
CASE(load_8888);
CASE(store_8888);
CASE(load_f16);
CASE(store_f16);
CASE(matrix_3x4);
#undef CASE
}
return true;
}
struct Spliced {
Spliced(const SkRasterPipeline::Stage* stages, int nstages) {
// We always create a backup interpreter pipeline,
// - to handle any program we can't, and
// - to handle the n < kStride tails.
// - to handle the n < stride tails.
fBackup = SkOpts::compile_pipeline(stages, nstages);
fSplicedLen = 0;
fSpliced = nullptr;
@@ -260,12 +317,23 @@ namespace {
}
#endif
// See if all the stages can run in lowp mode. If so, we can run at ~2x speed.
bool lowp = true;
for (int i = 0; i < nstages; i++) {
if (!splice_lowp(nullptr, stages[i].stage)) {
//SkDebugf("SkSplicer can't yet handle stage %d in lowp.\n", stages[i].stage);
lowp = false;
break;
}
}
fLowp = lowp;
SkDynamicMemoryWStream buf;
// Our loop is the equivalent of this C++ code:
// do {
// ... run spliced stages...
// x += kStride;
// x += stride;
// } while(x < limit);
before_loop(&buf);
auto loop_start = buf.bytesWritten(); // Think of this like a label, loop_start:
@@ -277,36 +345,18 @@ namespace {
}
// Splice in the code for the Stages, generated offline into SkSplicer_generated.h.
switch(stages[i].stage) {
case SkRasterPipeline::clear: splice(&buf, kSplice_clear ); break;
case SkRasterPipeline::plus_: splice(&buf, kSplice_plus ); break;
case SkRasterPipeline::srcover: splice(&buf, kSplice_srcover ); break;
case SkRasterPipeline::dstover: splice(&buf, kSplice_dstover ); break;
case SkRasterPipeline::clamp_0: splice(&buf, kSplice_clamp_0 ); break;
case SkRasterPipeline::clamp_1: splice(&buf, kSplice_clamp_1 ); break;
case SkRasterPipeline::clamp_a: splice(&buf, kSplice_clamp_a ); break;
case SkRasterPipeline::swap: splice(&buf, kSplice_swap ); break;
case SkRasterPipeline::move_src_dst: splice(&buf, kSplice_move_src_dst); break;
case SkRasterPipeline::move_dst_src: splice(&buf, kSplice_move_dst_src); break;
case SkRasterPipeline::premul: splice(&buf, kSplice_premul ); break;
case SkRasterPipeline::unpremul: splice(&buf, kSplice_unpremul ); break;
case SkRasterPipeline::from_srgb: splice(&buf, kSplice_from_srgb ); break;
case SkRasterPipeline::to_srgb: splice(&buf, kSplice_to_srgb ); break;
case SkRasterPipeline::scale_u8: splice(&buf, kSplice_scale_u8 ); break;
case SkRasterPipeline::load_tables: splice(&buf, kSplice_load_tables ); break;
case SkRasterPipeline::load_8888: splice(&buf, kSplice_load_8888 ); break;
case SkRasterPipeline::store_8888: splice(&buf, kSplice_store_8888 ); break;
case SkRasterPipeline::load_f16: splice(&buf, kSplice_load_f16 ); break;
case SkRasterPipeline::store_f16: splice(&buf, kSplice_store_f16 ); break;
case SkRasterPipeline::matrix_3x4: splice(&buf, kSplice_matrix_3x4 ); break;
// No joy (probably just not yet implemented).
default:
//SkDebugf("SkSplicer can't yet handle stage %d.\n", stages[i].stage);
return;
if (lowp) {
SkAssertResult(splice_lowp(&buf, stages[i].stage));
continue;
}
if (!splice_highp(&buf, stages[i].stage)) {
//SkDebugf("SkSplicer can't yet handle stage %d.\n", stages[i].stage);
return;
}
}
lowp ? splice(&buf, kSplice_inc_x_lowp)
: splice(&buf, kSplice_inc_x);
loop(&buf, loop_start); // Loop back to handle more pixels if not done.
after_loop(&buf);
ret(&buf); // We're done.
@@ -323,7 +373,8 @@ namespace {
// Spliced is stored in a std::function, so it needs to be copyable.
Spliced(const Spliced& o) : fBackup (o.fBackup)
, fSplicedLen(o.fSplicedLen)
, fSpliced (copy_to_executable_mem(o.fSpliced, &fSplicedLen)) {}
, fSpliced (copy_to_executable_mem(o.fSpliced, &fSplicedLen))
, fLowp (o.fLowp) {}
~Spliced() {
cleanup_executable_mem(fSpliced, fSplicedLen);
@@ -331,13 +382,17 @@ namespace {
// Here's where we call fSpliced if we created it, fBackup if not.
void operator()(size_t x, size_t y, size_t n) const {
size_t body = n/kStride*kStride; // Largest multiple of kStride (4 or 8) <= n.
if (fSpliced && body) { // Can we run fSpliced for at least one kStride?
size_t stride = fLowp ? kStride*2
: kStride;
size_t body = n/stride*stride; // Largest multiple of stride (2, 4, 8, or 16) <= n.
if (fSpliced && body) { // Can we run fSpliced for at least one stride?
// TODO: At some point we will want to pass in y...
using Fn = void(size_t x, size_t limit, void* ctx, const SkSplicer_constants* k);
((Fn*)fSpliced)(x, x+body, nullptr, &kConstants);
using Fn = void(size_t x, size_t limit, void* ctx, const void* k);
auto k = fLowp ? (const void*)&kConstants_lowp
: (const void*)&kConstants;
((Fn*)fSpliced)(x, x+body, nullptr, k);
// Fall through to fBackup for any n<kStride last pixels.
// Fall through to fBackup for any n<stride last pixels.
x += body;
n -= body;
}
@@ -347,6 +402,7 @@ namespace {
std::function<void(size_t, size_t, size_t)> fBackup;
size_t fSplicedLen;
void* fSpliced;
bool fLowp;
};
}

src/splicer/SkSplicer_generated.h

@@ -9,17 +9,20 @@
#define SkSplicer_generated_DEFINED
// This file is generated semi-automatically with this command:
// $ src/splicer/build_stages.py > src/splicer/SkSplicer_generated.h
// $ src/splicer/build_stages.py
#if defined(__aarch64__)
static const unsigned int kSplice_inc_x[] = {
0x91001000, // add x0, x0, #0x4
};
static const unsigned int kSplice_clear[] = {
0x6f00e400, // movi v0.2d, #0x0
0x6f00e401, // movi v1.2d, #0x0
0x6f00e402, // movi v2.2d, #0x0
0x6f00e403, // movi v3.2d, #0x0
};
static const unsigned int kSplice_plus[] = {
static const unsigned int kSplice_plus_[] = {
0x4e24d400, // fadd v0.4s, v0.4s, v4.4s
0x4e25d421, // fadd v1.4s, v1.4s, v5.4s
0x4e26d442, // fadd v2.4s, v2.4s, v6.4s
@@ -363,13 +366,16 @@ static const unsigned int kSplice_matrix_3x4[] = {
#elif defined(__ARM_NEON__)
static const unsigned int kSplice_inc_x[] = {
0xe2800002, // add r0, r0, #2
};
static const unsigned int kSplice_clear[] = {
0xf2800010, // vmov.i32 d0, #0
0xf2801010, // vmov.i32 d1, #0
0xf2802010, // vmov.i32 d2, #0
0xf2803010, // vmov.i32 d3, #0
};
static const unsigned int kSplice_plus[] = {
static const unsigned int kSplice_plus_[] = {
0xf2000d04, // vadd.f32 d0, d0, d4
0xf2011d05, // vadd.f32 d1, d1, d5
0xf2022d06, // vadd.f32 d2, d2, d6
@@ -738,13 +744,16 @@ static const unsigned int kSplice_matrix_3x4[] = {
#else
static const unsigned char kSplice_inc_x[] = {
0x48,0x83,0xc7,0x08, // add $0x8,%rdi
};
static const unsigned char kSplice_clear[] = {
0xc5,0xfc,0x57,0xc0, // vxorps %ymm0,%ymm0,%ymm0
0xc5,0xf4,0x57,0xc9, // vxorps %ymm1,%ymm1,%ymm1
0xc5,0xec,0x57,0xd2, // vxorps %ymm2,%ymm2,%ymm2
0xc5,0xe4,0x57,0xdb, // vxorps %ymm3,%ymm3,%ymm3
};
static const unsigned char kSplice_plus[] = {
static const unsigned char kSplice_plus_[] = {
0xc5,0xfc,0x58,0xc4, // vaddps %ymm4,%ymm0,%ymm0
0xc5,0xf4,0x58,0xcd, // vaddps %ymm5,%ymm1,%ymm1
0xc5,0xec,0x58,0xd6, // vaddps %ymm6,%ymm2,%ymm2

src/splicer/SkSplicer_generated_lowp.h

@@ -0,0 +1,499 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkSplicer_generated_lowp_DEFINED
#define SkSplicer_generated_lowp_DEFINED
// This file is generated semi-automatically with this command:
// $ src/splicer/build_stages.py
#if defined(__aarch64__)
static const unsigned int kSplice_inc_x_lowp[] = {
0x91002000, // add x0, x0, #0x8
};
static const unsigned int kSplice_clear_lowp[] = {
0x6f00e400, // movi v0.2d, #0x0
0x6f00e401, // movi v1.2d, #0x0
0x6f00e402, // movi v2.2d, #0x0
0x6f00e403, // movi v3.2d, #0x0
};
static const unsigned int kSplice_plus__lowp[] = {
0x6e640c00, // uqadd v0.8h, v0.8h, v4.8h
0x6e650c21, // uqadd v1.8h, v1.8h, v5.8h
0x6e660c42, // uqadd v2.8h, v2.8h, v6.8h
0x6e670c63, // uqadd v3.8h, v3.8h, v7.8h
};
static const unsigned int kSplice_srcover_lowp[] = {
0x91000868, // add x8, x3, #0x2
0x4d40c510, // ld1r {v16.8h}, [x8]
0x6e632e10, // uqsub v16.8h, v16.8h, v3.8h
0x6e70b491, // sqrdmulh v17.8h, v4.8h, v16.8h
0x4e241e12, // and v18.16b, v16.16b, v4.16b
0x6e70b4b3, // sqrdmulh v19.8h, v5.8h, v16.8h
0x4e60ba31, // abs v17.8h, v17.8h
0x4e251e14, // and v20.16b, v16.16b, v5.16b
0x6f111651, // usra v17.8h, v18.8h, #15
0x6e70b4d2, // sqrdmulh v18.8h, v6.8h, v16.8h
0x4e60ba73, // abs v19.8h, v19.8h
0x6f111693, // usra v19.8h, v20.8h, #15
0x4e261e14, // and v20.16b, v16.16b, v6.16b
0x4e60ba52, // abs v18.8h, v18.8h
0x6f111692, // usra v18.8h, v20.8h, #15
0x6e70b4f4, // sqrdmulh v20.8h, v7.8h, v16.8h
0x4e271e10, // and v16.16b, v16.16b, v7.16b
0x4e60ba94, // abs v20.8h, v20.8h
0x6f111614, // usra v20.8h, v16.8h, #15
0x6e600e20, // uqadd v0.8h, v17.8h, v0.8h
0x6e610e61, // uqadd v1.8h, v19.8h, v1.8h
0x6e620e42, // uqadd v2.8h, v18.8h, v2.8h
0x6e630e83, // uqadd v3.8h, v20.8h, v3.8h
};
static const unsigned int kSplice_dstover_lowp[] = {
0x91000868, // add x8, x3, #0x2
0x4d40c510, // ld1r {v16.8h}, [x8]
0x6e672e10, // uqsub v16.8h, v16.8h, v7.8h
0x6e70b411, // sqrdmulh v17.8h, v0.8h, v16.8h
0x4e201e12, // and v18.16b, v16.16b, v0.16b
0x6e70b433, // sqrdmulh v19.8h, v1.8h, v16.8h
0x4e60ba31, // abs v17.8h, v17.8h
0x4e211e14, // and v20.16b, v16.16b, v1.16b
0x6f111651, // usra v17.8h, v18.8h, #15
0x6e70b452, // sqrdmulh v18.8h, v2.8h, v16.8h
0x4e60ba73, // abs v19.8h, v19.8h
0x6f111693, // usra v19.8h, v20.8h, #15
0x4e221e14, // and v20.16b, v16.16b, v2.16b
0x4e60ba52, // abs v18.8h, v18.8h
0x6f111692, // usra v18.8h, v20.8h, #15
0x6e70b474, // sqrdmulh v20.8h, v3.8h, v16.8h
0x4e231e10, // and v16.16b, v16.16b, v3.16b
0x4e60ba94, // abs v20.8h, v20.8h
0x6f111614, // usra v20.8h, v16.8h, #15
0x6e640e24, // uqadd v4.8h, v17.8h, v4.8h
0x6e650e65, // uqadd v5.8h, v19.8h, v5.8h
0x6e660e46, // uqadd v6.8h, v18.8h, v6.8h
0x6e670e87, // uqadd v7.8h, v20.8h, v7.8h
};
static const unsigned int kSplice_clamp_1_lowp[] = {
0x91000868, // add x8, x3, #0x2
0x4d40c510, // ld1r {v16.8h}, [x8]
0x6e706c00, // umin v0.8h, v0.8h, v16.8h
0x6e706c21, // umin v1.8h, v1.8h, v16.8h
0x6e706c42, // umin v2.8h, v2.8h, v16.8h
0x6e706c63, // umin v3.8h, v3.8h, v16.8h
};
static const unsigned int kSplice_clamp_a_lowp[] = {
0x91000868, // add x8, x3, #0x2
0x4d40c510, // ld1r {v16.8h}, [x8]
0x6e706c63, // umin v3.8h, v3.8h, v16.8h
0x6e636c00, // umin v0.8h, v0.8h, v3.8h
0x6e636c21, // umin v1.8h, v1.8h, v3.8h
0x6e636c42, // umin v2.8h, v2.8h, v3.8h
};
static const unsigned int kSplice_swap_lowp[] = {
0x4ea31c70, // mov v16.16b, v3.16b
0x4ea21c51, // mov v17.16b, v2.16b
0x4ea11c32, // mov v18.16b, v1.16b
0x4ea01c13, // mov v19.16b, v0.16b
0x4ea41c80, // mov v0.16b, v4.16b
0x4ea51ca1, // mov v1.16b, v5.16b
0x4ea61cc2, // mov v2.16b, v6.16b
0x4ea71ce3, // mov v3.16b, v7.16b
0x4eb31e64, // mov v4.16b, v19.16b
0x4eb21e45, // mov v5.16b, v18.16b
0x4eb11e26, // mov v6.16b, v17.16b
0x4eb01e07, // mov v7.16b, v16.16b
};
static const unsigned int kSplice_move_src_dst_lowp[] = {
0x4ea01c04, // mov v4.16b, v0.16b
0x4ea11c25, // mov v5.16b, v1.16b
0x4ea21c46, // mov v6.16b, v2.16b
0x4ea31c67, // mov v7.16b, v3.16b
};
static const unsigned int kSplice_move_dst_src_lowp[] = {
0x4ea41c80, // mov v0.16b, v4.16b
0x4ea51ca1, // mov v1.16b, v5.16b
0x4ea61cc2, // mov v2.16b, v6.16b
0x4ea71ce3, // mov v3.16b, v7.16b
};
static const unsigned int kSplice_premul_lowp[] = {
0x6e63b410, // sqrdmulh v16.8h, v0.8h, v3.8h
0x4e201c71, // and v17.16b, v3.16b, v0.16b
0x4e60ba00, // abs v0.8h, v16.8h
0x6e63b430, // sqrdmulh v16.8h, v1.8h, v3.8h
0x6f111620, // usra v0.8h, v17.8h, #15
0x4e211c71, // and v17.16b, v3.16b, v1.16b
0x4e60ba01, // abs v1.8h, v16.8h
0x6e63b450, // sqrdmulh v16.8h, v2.8h, v3.8h
0x6f111621, // usra v1.8h, v17.8h, #15
0x4e221c71, // and v17.16b, v3.16b, v2.16b
0x4e60ba02, // abs v2.8h, v16.8h
0x6f111622, // usra v2.8h, v17.8h, #15
};
static const unsigned int kSplice_load_8888_lowp[] = {
0xf9400048, // ldr x8, [x2]
0x8b000908, // add x8, x8, x0, lsl #2
0x0c400110, // ld4 {v16.8b-v19.8b}, [x8]
0x2f0fa600, // ushll v0.8h, v16.8b, #7
0x2f0fa621, // ushll v1.8h, v17.8b, #7
0x2f0fa642, // ushll v2.8h, v18.8b, #7
0x2f0fa663, // ushll v3.8h, v19.8b, #7
0x6f183400, // ursra v0.8h, v0.8h, #8
0x6f183421, // ursra v1.8h, v1.8h, #8
0x6f183442, // ursra v2.8h, v2.8h, #8
0x6f183463, // ursra v3.8h, v3.8h, #8
};
static const unsigned int kSplice_store_8888_lowp[] = {
0xf9400048, // ldr x8, [x2]
0x2f099410, // uqshrn v16.8b, v0.8h, #7
0x2f099431, // uqshrn v17.8b, v1.8h, #7
0x2f099452, // uqshrn v18.8b, v2.8h, #7
0x8b000908, // add x8, x8, x0, lsl #2
0x2f099473, // uqshrn v19.8b, v3.8h, #7
0x0c000110, // st4 {v16.8b-v19.8b}, [x8]
};
#elif defined(__ARM_NEON__)
static const unsigned int kSplice_inc_x_lowp[] = {
0xe2800004, // add r0, r0, #4
};
static const unsigned int kSplice_clear_lowp[] = {
0xf2800010, // vmov.i32 d0, #0
0xf2801010, // vmov.i32 d1, #0
0xf2802010, // vmov.i32 d2, #0
0xf2803010, // vmov.i32 d3, #0
};
static const unsigned int kSplice_plus__lowp[] = {
0xf3100014, // vqadd.u16 d0, d0, d4
0xf3111015, // vqadd.u16 d1, d1, d5
0xf3122016, // vqadd.u16 d2, d2, d6
0xf3133017, // vqadd.u16 d3, d3, d7
};
static const unsigned int kSplice_srcover_lowp[] = {
0xe283c002, // add ip, r3, #2
0xf4ec0c5f, // vld1.16 {d16[]}, [ip :16]
0xf3500293, // vqsub.u16 d16, d16, d3
0xf3541b20, // vqrdmulh.s16 d17, d4, d16
0xf3552b20, // vqrdmulh.s16 d18, d5, d16
0xf3563b20, // vqrdmulh.s16 d19, d6, d16
0xf3574b20, // vqrdmulh.s16 d20, d7, d16
0xf2405194, // vand d21, d16, d4
0xf2406195, // vand d22, d16, d5
0xf2407196, // vand d23, d16, d6
0xf2400197, // vand d16, d16, d7
0xf3f51321, // vabs.s16 d17, d17
0xf3f52322, // vabs.s16 d18, d18
0xf3f53323, // vabs.s16 d19, d19
0xf3f54324, // vabs.s16 d20, d20
0xf3d11135, // vsra.u16 d17, d21, #15
0xf3d12136, // vsra.u16 d18, d22, #15
0xf3d13137, // vsra.u16 d19, d23, #15
0xf3d14130, // vsra.u16 d20, d16, #15
0xf3110090, // vqadd.u16 d0, d17, d0
0xf3121091, // vqadd.u16 d1, d18, d1
0xf3132092, // vqadd.u16 d2, d19, d2
0xf3143093, // vqadd.u16 d3, d20, d3
};
static const unsigned int kSplice_dstover_lowp[] = {
0xe283c002, // add ip, r3, #2
0xf4ec0c5f, // vld1.16 {d16[]}, [ip :16]
0xf3500297, // vqsub.u16 d16, d16, d7
0xf3501b20, // vqrdmulh.s16 d17, d0, d16
0xf3512b20, // vqrdmulh.s16 d18, d1, d16
0xf3523b20, // vqrdmulh.s16 d19, d2, d16
0xf3534b20, // vqrdmulh.s16 d20, d3, d16
0xf2405190, // vand d21, d16, d0
0xf2406191, // vand d22, d16, d1
0xf2407192, // vand d23, d16, d2
0xf2400193, // vand d16, d16, d3
0xf3f51321, // vabs.s16 d17, d17
0xf3f52322, // vabs.s16 d18, d18
0xf3f53323, // vabs.s16 d19, d19
0xf3f54324, // vabs.s16 d20, d20
0xf3d11135, // vsra.u16 d17, d21, #15
0xf3d12136, // vsra.u16 d18, d22, #15
0xf3d13137, // vsra.u16 d19, d23, #15
0xf3d14130, // vsra.u16 d20, d16, #15
0xf3114094, // vqadd.u16 d4, d17, d4
0xf3125095, // vqadd.u16 d5, d18, d5
0xf3136096, // vqadd.u16 d6, d19, d6
0xf3147097, // vqadd.u16 d7, d20, d7
};
static const unsigned int kSplice_clamp_1_lowp[] = {
0xe283c002, // add ip, r3, #2
0xf4ec0c5f, // vld1.16 {d16[]}, [ip :16]
0xf3100630, // vmin.u16 d0, d0, d16
0xf3111630, // vmin.u16 d1, d1, d16
0xf3122630, // vmin.u16 d2, d2, d16
0xf3133630, // vmin.u16 d3, d3, d16
};
static const unsigned int kSplice_clamp_a_lowp[] = {
0xe283c002, // add ip, r3, #2
0xf4ec0c5f, // vld1.16 {d16[]}, [ip :16]
0xf3133630, // vmin.u16 d3, d3, d16
0xf3100613, // vmin.u16 d0, d0, d3
0xf3111613, // vmin.u16 d1, d1, d3
0xf3122613, // vmin.u16 d2, d2, d3
};
static const unsigned int kSplice_swap_lowp[] = {
0xeef00b43, // vmov.f64 d16, d3
0xeef01b42, // vmov.f64 d17, d2
0xeef02b41, // vmov.f64 d18, d1
0xeef03b40, // vmov.f64 d19, d0
0xeeb00b44, // vmov.f64 d0, d4
0xeeb01b45, // vmov.f64 d1, d5
0xeeb02b46, // vmov.f64 d2, d6
0xeeb03b47, // vmov.f64 d3, d7
0xeeb04b63, // vmov.f64 d4, d19
0xeeb05b62, // vmov.f64 d5, d18
0xeeb06b61, // vmov.f64 d6, d17
0xeeb07b60, // vmov.f64 d7, d16
};
static const unsigned int kSplice_move_src_dst_lowp[] = {
0xeeb04b40, // vmov.f64 d4, d0
0xeeb05b41, // vmov.f64 d5, d1
0xeeb06b42, // vmov.f64 d6, d2
0xeeb07b43, // vmov.f64 d7, d3
};
static const unsigned int kSplice_move_dst_src_lowp[] = {
0xeeb00b44, // vmov.f64 d0, d4
0xeeb01b45, // vmov.f64 d1, d5
0xeeb02b46, // vmov.f64 d2, d6
0xeeb03b47, // vmov.f64 d3, d7
};
static const unsigned int kSplice_premul_lowp[] = {
0xf3500b03, // vqrdmulh.s16 d16, d0, d3
0xf3511b03, // vqrdmulh.s16 d17, d1, d3
0xf3522b03, // vqrdmulh.s16 d18, d2, d3
0xf2433110, // vand d19, d3, d0
0xf2434111, // vand d20, d3, d1
0xf3b50320, // vabs.s16 d0, d16
0xf2430112, // vand d16, d3, d2
0xf3b51321, // vabs.s16 d1, d17
0xf3b52322, // vabs.s16 d2, d18
0xf3910133, // vsra.u16 d0, d19, #15
0xf3911134, // vsra.u16 d1, d20, #15
0xf3912130, // vsra.u16 d2, d16, #15
};
static const unsigned int kSplice_load_8888_lowp[] = {
0xe592c000, // ldr ip, [r2]
0xe08cc100, // add ip, ip, r0, lsl #2
0xf4ec030d, // vld4.8 {d16[0],d17[0],d18[0],d19[0]}, [ip]!
0xf4ec032d, // vld4.8 {d16[1],d17[1],d18[1],d19[1]}, [ip]!
0xf4ec034d, // vld4.8 {d16[2],d17[2],d18[2],d19[2]}, [ip]!
0xf4ec036d, // vld4.8 {d16[3],d17[3],d18[3],d19[3]}, [ip]!
0xf38f0a30, // vshll.u8 q0, d16, #7
0xf38f2a32, // vshll.u8 q1, d18, #7
0xf3cf0a31, // vshll.u8 q8, d17, #7
0xf3cf2a33, // vshll.u8 q9, d19, #7
0xf3980350, // vrsra.u16 q0, q0, #8
0xf3d80370, // vrsra.u16 q8, q8, #8
0xf3d82372, // vrsra.u16 q9, q9, #8
0xf3982352, // vrsra.u16 q1, q1, #8
0xf22011b0, // vorr d1, d16, d16
0xf22231b2, // vorr d3, d18, d18
};
static const unsigned int kSplice_store_8888_lowp[] = {
0xf2630113, // vorr d16, d3, d3
0xe592c000, // ldr ip, [r2]
0xf2612111, // vorr d18, d1, d1
0xf3c94910, // vqshrn.u16 d20, q0, #7
0xe08cc100, // add ip, ip, r0, lsl #2
0xf3c96912, // vqshrn.u16 d22, q1, #7
0xf3c95932, // vqshrn.u16 d21, q9, #7
0xf3c97930, // vqshrn.u16 d23, q8, #7
0xf4cc430d, // vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [ip]!
0xf4cc432d, // vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [ip]!
0xf4cc434d, // vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [ip]!
0xf4cc436d, // vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [ip]!
};
#else
static const unsigned char kSplice_inc_x_lowp[] = {
0x48,0x83,0xc7,0x10, // add $0x10,%rdi
};
static const unsigned char kSplice_clear_lowp[] = {
0xc5,0xfc,0x57,0xc0, // vxorps %ymm0,%ymm0,%ymm0
0xc5,0xf4,0x57,0xc9, // vxorps %ymm1,%ymm1,%ymm1
0xc5,0xec,0x57,0xd2, // vxorps %ymm2,%ymm2,%ymm2
0xc5,0xe4,0x57,0xdb, // vxorps %ymm3,%ymm3,%ymm3
};
static const unsigned char kSplice_plus__lowp[] = {
0xc5,0xfd,0xdd,0xc4, // vpaddusw %ymm4,%ymm0,%ymm0
0xc5,0xf5,0xdd,0xcd, // vpaddusw %ymm5,%ymm1,%ymm1
0xc5,0xed,0xdd,0xd6, // vpaddusw %ymm6,%ymm2,%ymm2
0xc5,0xe5,0xdd,0xdf, // vpaddusw %ymm7,%ymm3,%ymm3
};
static const unsigned char kSplice_srcover_lowp[] = {
0xc4,0x62,0x7d,0x79,0x41,0x02, // vpbroadcastw 0x2(%rcx),%ymm8
0xc5,0x3d,0xd9,0xc3, // vpsubusw %ymm3,%ymm8,%ymm8
0xc4,0x42,0x5d,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm4,%ymm9
0xc4,0x42,0x7d,0x1d,0xc9, // vpabsw %ymm9,%ymm9
0xc5,0xb5,0xdd,0xc0, // vpaddusw %ymm0,%ymm9,%ymm0
0xc4,0x42,0x55,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm5,%ymm9
0xc4,0x42,0x7d,0x1d,0xc9, // vpabsw %ymm9,%ymm9
0xc5,0xb5,0xdd,0xc9, // vpaddusw %ymm1,%ymm9,%ymm1
0xc4,0x42,0x4d,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm6,%ymm9
0xc4,0x42,0x7d,0x1d,0xc9, // vpabsw %ymm9,%ymm9
0xc5,0xb5,0xdd,0xd2, // vpaddusw %ymm2,%ymm9,%ymm2
0xc4,0x42,0x45,0x0b,0xc0, // vpmulhrsw %ymm8,%ymm7,%ymm8
0xc4,0x42,0x7d,0x1d,0xc0, // vpabsw %ymm8,%ymm8
0xc5,0xbd,0xdd,0xdb, // vpaddusw %ymm3,%ymm8,%ymm3
};
static const unsigned char kSplice_dstover_lowp[] = {
0xc4,0x62,0x7d,0x79,0x41,0x02, // vpbroadcastw 0x2(%rcx),%ymm8
0xc5,0x3d,0xd9,0xc7, // vpsubusw %ymm7,%ymm8,%ymm8
0xc4,0x42,0x7d,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm0,%ymm9
0xc4,0x42,0x7d,0x1d,0xc9, // vpabsw %ymm9,%ymm9
0xc5,0xb5,0xdd,0xe4, // vpaddusw %ymm4,%ymm9,%ymm4
0xc4,0x42,0x75,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm1,%ymm9
0xc4,0x42,0x7d,0x1d,0xc9, // vpabsw %ymm9,%ymm9
0xc5,0xb5,0xdd,0xed, // vpaddusw %ymm5,%ymm9,%ymm5
0xc4,0x42,0x6d,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm2,%ymm9
0xc4,0x42,0x7d,0x1d,0xc9, // vpabsw %ymm9,%ymm9
0xc5,0xb5,0xdd,0xf6, // vpaddusw %ymm6,%ymm9,%ymm6
0xc4,0x42,0x65,0x0b,0xc0, // vpmulhrsw %ymm8,%ymm3,%ymm8
0xc4,0x42,0x7d,0x1d,0xc0, // vpabsw %ymm8,%ymm8
0xc5,0xbd,0xdd,0xff, // vpaddusw %ymm7,%ymm8,%ymm7
};
static const unsigned char kSplice_clamp_1_lowp[] = {
0xc4,0x62,0x7d,0x79,0x41,0x02, // vpbroadcastw 0x2(%rcx),%ymm8
0xc4,0xc2,0x7d,0x3a,0xc0, // vpminuw %ymm8,%ymm0,%ymm0
0xc4,0xc2,0x75,0x3a,0xc8, // vpminuw %ymm8,%ymm1,%ymm1
0xc4,0xc2,0x6d,0x3a,0xd0, // vpminuw %ymm8,%ymm2,%ymm2
0xc4,0xc2,0x65,0x3a,0xd8, // vpminuw %ymm8,%ymm3,%ymm3
};
static const unsigned char kSplice_clamp_a_lowp[] = {
0xc4,0x62,0x7d,0x79,0x41,0x02, // vpbroadcastw 0x2(%rcx),%ymm8
0xc4,0xc2,0x65,0x3a,0xd8, // vpminuw %ymm8,%ymm3,%ymm3
0xc4,0xe2,0x7d,0x3a,0xc3, // vpminuw %ymm3,%ymm0,%ymm0
0xc4,0xe2,0x75,0x3a,0xcb, // vpminuw %ymm3,%ymm1,%ymm1
0xc4,0xe2,0x6d,0x3a,0xd3, // vpminuw %ymm3,%ymm2,%ymm2
};
static const unsigned char kSplice_swap_lowp[] = {
0xc5,0x7c,0x28,0xc3, // vmovaps %ymm3,%ymm8
0xc5,0x7c,0x28,0xca, // vmovaps %ymm2,%ymm9
0xc5,0x7c,0x28,0xd1, // vmovaps %ymm1,%ymm10
0xc5,0x7c,0x28,0xd8, // vmovaps %ymm0,%ymm11
0xc5,0xfc,0x28,0xc4, // vmovaps %ymm4,%ymm0
0xc5,0xfc,0x28,0xcd, // vmovaps %ymm5,%ymm1
0xc5,0xfc,0x28,0xd6, // vmovaps %ymm6,%ymm2
0xc5,0xfc,0x28,0xdf, // vmovaps %ymm7,%ymm3
0xc5,0x7c,0x29,0xdc, // vmovaps %ymm11,%ymm4
0xc5,0x7c,0x29,0xd5, // vmovaps %ymm10,%ymm5
0xc5,0x7c,0x29,0xce, // vmovaps %ymm9,%ymm6
0xc5,0x7c,0x29,0xc7, // vmovaps %ymm8,%ymm7
};
static const unsigned char kSplice_move_src_dst_lowp[] = {
0xc5,0xfc,0x28,0xe0, // vmovaps %ymm0,%ymm4
0xc5,0xfc,0x28,0xe9, // vmovaps %ymm1,%ymm5
0xc5,0xfc,0x28,0xf2, // vmovaps %ymm2,%ymm6
0xc5,0xfc,0x28,0xfb, // vmovaps %ymm3,%ymm7
};
static const unsigned char kSplice_move_dst_src_lowp[] = {
0xc5,0xfc,0x28,0xc4, // vmovaps %ymm4,%ymm0
0xc5,0xfc,0x28,0xcd, // vmovaps %ymm5,%ymm1
0xc5,0xfc,0x28,0xd6, // vmovaps %ymm6,%ymm2
0xc5,0xfc,0x28,0xdf, // vmovaps %ymm7,%ymm3
};
static const unsigned char kSplice_premul_lowp[] = {
0xc4,0xe2,0x7d,0x0b,0xc3, // vpmulhrsw %ymm3,%ymm0,%ymm0
0xc4,0xe2,0x7d,0x1d,0xc0, // vpabsw %ymm0,%ymm0
0xc4,0xe2,0x75,0x0b,0xcb, // vpmulhrsw %ymm3,%ymm1,%ymm1
0xc4,0xe2,0x7d,0x1d,0xc9, // vpabsw %ymm1,%ymm1
0xc4,0xe2,0x6d,0x0b,0xd3, // vpmulhrsw %ymm3,%ymm2,%ymm2
0xc4,0xe2,0x7d,0x1d,0xd2, // vpabsw %ymm2,%ymm2
};
static const unsigned char kSplice_load_8888_lowp[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
0xc5,0xfa,0x6f,0x04,0xb8, // vmovdqu (%rax,%rdi,4),%xmm0
0xc5,0xfa,0x6f,0x4c,0xb8,0x10, // vmovdqu 0x10(%rax,%rdi,4),%xmm1
0xc5,0xfa,0x6f,0x54,0xb8,0x20, // vmovdqu 0x20(%rax,%rdi,4),%xmm2
0xc5,0xfa,0x6f,0x5c,0xb8,0x30, // vmovdqu 0x30(%rax,%rdi,4),%xmm3
0xc5,0x79,0x60,0xc1, // vpunpcklbw %xmm1,%xmm0,%xmm8
0xc5,0xf9,0x68,0xc1, // vpunpckhbw %xmm1,%xmm0,%xmm0
0xc5,0xe9,0x60,0xcb, // vpunpcklbw %xmm3,%xmm2,%xmm1
0xc5,0xe9,0x68,0xd3, // vpunpckhbw %xmm3,%xmm2,%xmm2
0xc5,0xb9,0x60,0xd8, // vpunpcklbw %xmm0,%xmm8,%xmm3
0xc5,0xb9,0x68,0xc0, // vpunpckhbw %xmm0,%xmm8,%xmm0
0xc5,0x71,0x60,0xc2, // vpunpcklbw %xmm2,%xmm1,%xmm8
0xc5,0xf1,0x68,0xca, // vpunpckhbw %xmm2,%xmm1,%xmm1
0xc5,0xe1,0x60,0xd0, // vpunpcklbw %xmm0,%xmm3,%xmm2
0xc5,0x61,0x68,0xc8, // vpunpckhbw %xmm0,%xmm3,%xmm9
0xc5,0xb9,0x60,0xd9, // vpunpcklbw %xmm1,%xmm8,%xmm3
0xc5,0x39,0x68,0xc1, // vpunpckhbw %xmm1,%xmm8,%xmm8
0xc5,0xe9,0x6c,0xc3, // vpunpcklqdq %xmm3,%xmm2,%xmm0
0xc4,0xe2,0x7d,0x30,0xc0, // vpmovzxbw %xmm0,%ymm0
0xc5,0xf5,0x71,0xf0,0x07, // vpsllw $0x7,%ymm0,%ymm1
0xc5,0xad,0x71,0xd0,0x01, // vpsrlw $0x1,%ymm0,%ymm10
0xc4,0xc1,0x75,0xdd,0xca, // vpaddusw %ymm10,%ymm1,%ymm1
0xc4,0x62,0x7d,0x79,0x11, // vpbroadcastw (%rcx),%ymm10
0xc4,0xc1,0x7d,0xdd,0xc2, // vpaddusw %ymm10,%ymm0,%ymm0
0xc5,0xfd,0x71,0xd0,0x08, // vpsrlw $0x8,%ymm0,%ymm0
0xc5,0xf5,0xdd,0xc0, // vpaddusw %ymm0,%ymm1,%ymm0
0xc5,0xe9,0x6d,0xcb, // vpunpckhqdq %xmm3,%xmm2,%xmm1
0xc4,0xe2,0x7d,0x30,0xc9, // vpmovzxbw %xmm1,%ymm1
0xc5,0xed,0x71,0xf1,0x07, // vpsllw $0x7,%ymm1,%ymm2
0xc5,0xe5,0x71,0xd1,0x01, // vpsrlw $0x1,%ymm1,%ymm3
0xc5,0xed,0xdd,0xd3, // vpaddusw %ymm3,%ymm2,%ymm2
0xc4,0xc1,0x75,0xdd,0xca, // vpaddusw %ymm10,%ymm1,%ymm1
0xc5,0xf5,0x71,0xd1,0x08, // vpsrlw $0x8,%ymm1,%ymm1
0xc5,0xed,0xdd,0xc9, // vpaddusw %ymm1,%ymm2,%ymm1
0xc4,0xc1,0x31,0x6c,0xd0, // vpunpcklqdq %xmm8,%xmm9,%xmm2
0xc4,0xe2,0x7d,0x30,0xd2, // vpmovzxbw %xmm2,%ymm2
0xc5,0xe5,0x71,0xf2,0x07, // vpsllw $0x7,%ymm2,%ymm3
0xc5,0xa5,0x71,0xd2,0x01, // vpsrlw $0x1,%ymm2,%ymm11
0xc4,0xc1,0x65,0xdd,0xdb, // vpaddusw %ymm11,%ymm3,%ymm3
0xc4,0xc1,0x6d,0xdd,0xd2, // vpaddusw %ymm10,%ymm2,%ymm2
0xc5,0xed,0x71,0xd2,0x08, // vpsrlw $0x8,%ymm2,%ymm2
0xc5,0xe5,0xdd,0xd2, // vpaddusw %ymm2,%ymm3,%ymm2
0xc4,0xc1,0x31,0x6d,0xd8, // vpunpckhqdq %xmm8,%xmm9,%xmm3
0xc4,0xe2,0x7d,0x30,0xdb, // vpmovzxbw %xmm3,%ymm3
0xc5,0xbd,0x71,0xf3,0x07, // vpsllw $0x7,%ymm3,%ymm8
0xc5,0xb5,0x71,0xd3,0x01, // vpsrlw $0x1,%ymm3,%ymm9
0xc4,0x41,0x3d,0xdd,0xc1, // vpaddusw %ymm9,%ymm8,%ymm8
0xc4,0xc1,0x65,0xdd,0xda, // vpaddusw %ymm10,%ymm3,%ymm3
0xc5,0xe5,0x71,0xd3,0x08, // vpsrlw $0x8,%ymm3,%ymm3
0xc5,0xbd,0xdd,0xdb, // vpaddusw %ymm3,%ymm8,%ymm3
};
static const unsigned char kSplice_store_8888_lowp[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
0xc5,0x7d,0xdd,0xc0, // vpaddusw %ymm0,%ymm0,%ymm8
0xc4,0xc1,0x3d,0x71,0xd0,0x08, // vpsrlw $0x8,%ymm8,%ymm8
0xc4,0x43,0x7d,0x39,0xc1,0x01, // vextracti128 $0x1,%ymm8,%xmm9
0xc4,0x41,0x39,0x67,0xc1, // vpackuswb %xmm9,%xmm8,%xmm8
0xc5,0x75,0xdd,0xc9, // vpaddusw %ymm1,%ymm1,%ymm9
0xc4,0xc1,0x35,0x71,0xd1,0x08, // vpsrlw $0x8,%ymm9,%ymm9
0xc4,0x43,0x7d,0x39,0xca,0x01, // vextracti128 $0x1,%ymm9,%xmm10
0xc4,0x41,0x31,0x67,0xca, // vpackuswb %xmm10,%xmm9,%xmm9
0xc5,0x6d,0xdd,0xd2, // vpaddusw %ymm2,%ymm2,%ymm10
0xc4,0xc1,0x2d,0x71,0xd2,0x08, // vpsrlw $0x8,%ymm10,%ymm10
0xc4,0x43,0x7d,0x39,0xd3,0x01, // vextracti128 $0x1,%ymm10,%xmm11
0xc4,0x41,0x29,0x67,0xd3, // vpackuswb %xmm11,%xmm10,%xmm10
0xc5,0x65,0xdd,0xdb, // vpaddusw %ymm3,%ymm3,%ymm11
0xc4,0xc1,0x25,0x71,0xd3,0x08, // vpsrlw $0x8,%ymm11,%ymm11
0xc4,0x43,0x7d,0x39,0xdc,0x01, // vextracti128 $0x1,%ymm11,%xmm12
0xc4,0x41,0x21,0x67,0xdc, // vpackuswb %xmm12,%xmm11,%xmm11
0xc4,0x41,0x39,0x60,0xe1, // vpunpcklbw %xmm9,%xmm8,%xmm12
0xc4,0x41,0x39,0x68,0xc1, // vpunpckhbw %xmm9,%xmm8,%xmm8
0xc4,0x41,0x29,0x60,0xcb, // vpunpcklbw %xmm11,%xmm10,%xmm9
0xc4,0x41,0x29,0x68,0xd3, // vpunpckhbw %xmm11,%xmm10,%xmm10
0xc4,0x41,0x19,0x61,0xd9, // vpunpcklwd %xmm9,%xmm12,%xmm11
0xc5,0x7a,0x7f,0x1c,0xb8, // vmovdqu %xmm11,(%rax,%rdi,4)
0xc4,0x41,0x19,0x69,0xc9, // vpunpckhwd %xmm9,%xmm12,%xmm9
0xc5,0x7a,0x7f,0x4c,0xb8,0x10, // vmovdqu %xmm9,0x10(%rax,%rdi,4)
0xc4,0x41,0x39,0x61,0xca, // vpunpcklwd %xmm10,%xmm8,%xmm9
0xc5,0x7a,0x7f,0x4c,0xb8,0x20, // vmovdqu %xmm9,0x20(%rax,%rdi,4)
0xc4,0x41,0x39,0x69,0xc2, // vpunpckhwd %xmm10,%xmm8,%xmm8
0xc5,0x7a,0x7f,0x44,0xb8,0x30, // vmovdqu %xmm8,0x30(%rax,%rdi,4)
};
#endif
#endif//SkSplicer_generated_lowp_DEFINED

src/splicer/SkSplicer_shared.h

@@ -40,4 +40,9 @@ struct SkSplicer_constants {
float _00043; // 0.0043f
};
struct SkSplicer_constants_lowp {
uint16_t _0x0001; // 0x0001 == 1 == epsilon
uint16_t _1; // 0x8000 == 32768 == 1.0
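// Stages reach these through their K* argument (x3 on aarch64, r3 on armv7,
// %rcx on x86-64); e.g. the "add x8, x3, #0x2" sequences above load _1 from offset 2.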
};
#endif//SkSplicer_shared_DEFINED

src/splicer/SkSplicer_stages.cpp

@@ -119,14 +119,14 @@ C void done(size_t, size_t, void*, K*, F,F,F,F, F,F,F,F);
// This should feel familiar to anyone who's read SkRasterPipeline_opts.h.
// It's just a convenience to make a valid, spliceable Stage, nothing magic.
#define STAGE(name) \
static void name##_k(size_t x, size_t limit, void* ctx, K* k, \
static void name##_k(size_t& x, size_t limit, void* ctx, K* k, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
C void name(size_t x, size_t limit, void* ctx, K* k, \
F r, F g, F b, F a, F dr, F dg, F db, F da) { \
name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \
done (x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \
} \
static void name##_k(size_t x, size_t limit, void* ctx, K* k, \
static void name##_k(size_t& x, size_t limit, void* ctx, K* k, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
// We can now define Stages!
@@ -144,11 +144,15 @@ C void done(size_t, size_t, void*, K*, F,F,F,F, F,F,F,F);
// - lambdas;
// - memcpy() with a compile-time constant size argument.
STAGE(inc_x) {
x += sizeof(F) / sizeof(float);
}
STAGE(clear) {
r = g = b = a = 0;
}
STAGE(plus) {
STAGE(plus_) {
r = r + dr;
g = g + dg;
b = b + db;

src/splicer/SkSplicer_stages_lowp.cpp

@@ -0,0 +1,336 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
// This file is very similar to SkSplicer_stages.cpp, and you will want to read through that file
// first before trying to understand this one. We'll note only key differences here.
#include "SkSplicer_shared.h"
#include <string.h>
#if !defined(__clang__)
#error This file is not like the rest of Skia. It must be compiled with clang.
#endif
#if defined(__aarch64__)
#include <arm_neon.h>
// In this file, F is a vector of SkFixed15.
// See SkFixed15.h for notes on its various operations.
struct F {
using V = uint16_t __attribute__((ext_vector_type(8)));
V vec;
F(uint16x8_t v) : vec(v) {}
operator V() const { return vec; }
F() = default;
F(uint16_t v) : vec(v) {}
F operator+(F o) const { return vqaddq_u16(vec, o.vec); }
F operator-(F o) const { return vqsubq_u16(vec, o.vec); }
F operator*(F o) const {
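// sqrdmulh computes (vec*o + 0x4000) >> 15 on signed Q15 lanes, saturating.
// Our values are unsigned and only 0x8000 reads as negative: abs fixes the
// sign, and adding (vec & o) >> 15 bumps the saturated 0x8000*0x8000 lanes
// from 0x7fff back up to 0x8000.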
return vsraq_n_u16(vabsq_s16(vqrdmulhq_s16(vec, o.vec)),
vandq_s16(vec, o.vec), 15);
}
F operator>>(int k) const { return vec >> k; }
F operator<<(int k) const { return vec << k; }
};
static F min(F a, F b) { return vminq_u16(a,b); }
static F max(F a, F b) { return vmaxq_u16(a,b); }
#elif defined(__ARM_NEON__)
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
#error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
#endif
#include <arm_neon.h>
struct F {
using V = uint16_t __attribute__((ext_vector_type(4)));
V vec;
F(uint16x4_t v) : vec(v) {}
operator V() const { return vec; }
F() = default;
F(uint16_t v) : vec(v) {}
F operator+(F o) const { return vqadd_u16(vec, o.vec); }
F operator-(F o) const { return vqsub_u16(vec, o.vec); }
F operator*(F o) const {
return vsra_n_u16(vabs_s16(vqrdmulh_s16(vec, o.vec)),
vand_s16(vec, o.vec), 15);
}
F operator>>(int k) const { return vec >> k; }
F operator<<(int k) const { return vec << k; }
};
static F min(F a, F b) { return vmin_u16(a,b); }
static F max(F a, F b) { return vmax_u16(a,b); }
#else
#if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
#error On x86, compile with -mavx2 -mfma -mf16c.
#endif
#include <immintrin.h>
struct F {
using V = uint16_t __attribute__((ext_vector_type(16)));
V vec;
F(__m256i v) : vec(v) {}
operator V() const { return vec; }
F() = default;
F(uint16_t v) : vec(v) {}
F operator+(F o) const { return _mm256_adds_epu16(vec, o.vec); }
F operator-(F o) const { return _mm256_subs_epu16(vec, o.vec); }
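// pmulhrsw is the same (vec*o + 0x4000) >> 15, but it wraps rather than
// saturates: the 0x8000*0x8000 lanes come out as -32768, whose bit pattern
// pabsw leaves at 0x8000, exactly the answer we want, and pabsw un-flips the
// lanes where only one operand was 0x8000.  So no add-back is needed here.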
F operator*(F o) const { return _mm256_abs_epi16(_mm256_mulhrs_epi16(vec, o.vec)); }
F operator>>(int k) const { return vec >> k; }
F operator<<(int k) const { return vec << k; }
};
static F min(F a, F b) { return _mm256_min_epu16(a,b); }
static F max(F a, F b) { return _mm256_max_epu16(a,b); }
#endif
// No platform actually supports FMA for SkFixed15.
// This fma() method just makes it easier to port stages to lowp.
static F fma(F f, F m, F a) { return f*m+a; }
#if defined(__ARM_NEON__)
#define C extern "C" __attribute__((pcs("aapcs-vfp")))
#else
#define C extern "C"
#endif
// We use a set of constants suitable for SkFixed15 math.
using K = const SkSplicer_constants_lowp;
using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);
// The armv7 aapcs-vfp calling convention makes us pass F::V instead of F if we want them in
// registers. This shouldn't affect performance or how you write STAGEs in any way.
C void done(size_t, size_t, void*, K*, F::V,F::V,F::V,F::V, F::V,F::V,F::V,F::V);
#define STAGE(name) \
static void name##_k(size_t& x, size_t limit, void* ctx, K* k, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
C void name##_lowp(size_t x, size_t limit, void* ctx, K* k, \
F::V R, F::V G, F::V B, F::V A, \
F::V DR, F::V DG, F::V DB, F::V DA) { \
F r = R, g = G, b = B, a = A, dr = DR, dg = DG, db = DB, da = DA; \
name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \
done (x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \
} \
static void name##_k(size_t& x, size_t limit, void* ctx, K* k, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
STAGE(inc_x) {
x += sizeof(F) / sizeof(uint16_t);
}
STAGE(clear) {
r = g = b = a = 0;
}
STAGE(plus_) {
r = r + dr;
g = g + dg;
b = b + db;
a = a + da;
}
STAGE(srcover) {
auto A = F(k->_1) - a;
r = fma(dr, A, r);
g = fma(dg, A, g);
b = fma(db, A, b);
a = fma(da, A, a);
}
STAGE(dstover) { srcover_k(x,limit,ctx,k, dr,dg,db,da, r,g,b,a); }
STAGE(clamp_1) {
r = min(r, k->_1);
g = min(g, k->_1);
b = min(b, k->_1);
a = min(a, k->_1);
}
STAGE(clamp_a) {
a = min(a, k->_1);
r = min(r, a);
g = min(g, a);
b = min(b, a);
}
STAGE(swap) {
auto swap = [](F& v, F& dv) {
auto tmp = v;
v = dv;
dv = tmp;
};
swap(r, dr);
swap(g, dg);
swap(b, db);
swap(a, da);
}
STAGE(move_src_dst) {
dr = r;
dg = g;
db = b;
da = a;
}
STAGE(move_dst_src) {
r = dr;
g = dg;
b = db;
a = da;
}
STAGE(premul) {
r = r * a;
g = g * a;
b = b * a;
}
STAGE(load_8888) {
auto ptr = *(const uint32_t**)ctx + x;
#if defined(__aarch64__)
auto to_fixed15 = [](uint8x8_t u8) {
// u8 * (32768/255) == u8 * 128.50196... == u8*128 + u8/2 + (u8+1)>>8 (see SkFixed15.h)
//
// Here we do (u8*128 <rounding +> u8/2), which is the same as our canonical math for 0
// and 255, and never off by more than 1 in between. Thanks to NEON, it's 2 instructions!
auto u16 = vshll_n_u8(u8, 7); // u16 = u8*128
return vrsraq_n_u16(u16, u16, 8); // u16 + u16/256, with rounding
};
uint8x8x4_t rgba = vld4_u8((const uint8_t*)ptr);
r = to_fixed15(rgba.val[0]);
g = to_fixed15(rgba.val[1]);
b = to_fixed15(rgba.val[2]);
a = to_fixed15(rgba.val[3]);
#elif defined(__ARM_NEON__)
auto to_fixed15 = [](uint8x8_t u8) {
// Same as aarch64, but only keeping the bottom 4 lanes.
auto u16 = vshll_n_u8(u8, 7);
return vget_low_u16(vrsraq_n_u16(u16, u16, 8));
};
// I can't get quite the code generation I want using vld4_lane_u8(),
// so we're going to drop into assembly to do the loads. :/
uint8x8_t R,G,B,A;
asm("vld4.8 {%1[0],%2[0],%3[0],%4[0]}, [%0]!\n"
"vld4.8 {%1[1],%2[1],%3[1],%4[1]}, [%0]!\n"
"vld4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
"vld4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
: "+r"(ptr), "=w"(R), "=w"(G), "=w"(B), "=w"(A));
r = to_fixed15(R);
g = to_fixed15(G);
b = to_fixed15(B);
a = to_fixed15(A);
#else
auto to_fixed15 = [k](__m128i u8) {
F u16 = _mm256_cvtepu8_epi16(u8);
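// Same math as the NEON to_fixed15()s above: u8*128 + u8/2 + (u8+1)/256.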
return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8);
};
// TODO: shorter, more confusing, faster with 256-bit loads and shuffles
// Load 16 interleaved pixels.
auto _0123 = _mm_loadu_si128((const __m128i*)ptr + 0),
_4567 = _mm_loadu_si128((const __m128i*)ptr + 1),
_89AB = _mm_loadu_si128((const __m128i*)ptr + 2),
_CDEF = _mm_loadu_si128((const __m128i*)ptr + 3);
// We've got an awful lot of unpacking to do to transpose this...
auto _0415 = _mm_unpacklo_epi8(_0123, _4567), // r04 g04 b04 a04 r15 g15 b15 a15
_2637 = _mm_unpackhi_epi8(_0123, _4567), // r26 g26 b26 a26 r37 g37 b37 a37
_8C9D = _mm_unpacklo_epi8(_89AB, _CDEF),
_AEBF = _mm_unpackhi_epi8(_89AB, _CDEF);
auto _0246 = _mm_unpacklo_epi8(_0415, _2637), // r0246 g0246 b0246 a0246
_1357 = _mm_unpackhi_epi8(_0415, _2637), // r1357 g1357 b1357 a1357
_8ACE = _mm_unpacklo_epi8(_8C9D, _AEBF),
_9BDF = _mm_unpackhi_epi8(_8C9D, _AEBF);
auto rg_01234567 = _mm_unpacklo_epi8(_0246, _1357), // r01234567 g01234567
ba_01234567 = _mm_unpackhi_epi8(_0246, _1357), // b01234567 a01234567
rg_89ABCDEF = _mm_unpacklo_epi8(_8ACE, _9BDF), // r89ABCDEF g89ABCDEF
ba_89ABCDEF = _mm_unpackhi_epi8(_8ACE, _9BDF); // b89ABCDEF a89ABCDEF
r = to_fixed15(_mm_unpacklo_epi64(rg_01234567, rg_89ABCDEF));
g = to_fixed15(_mm_unpackhi_epi64(rg_01234567, rg_89ABCDEF));
b = to_fixed15(_mm_unpacklo_epi64(ba_01234567, ba_89ABCDEF));
a = to_fixed15(_mm_unpackhi_epi64(ba_01234567, ba_89ABCDEF));
#endif
}
STAGE(store_8888) {
auto ptr = *(uint32_t**)ctx + x;
#if defined(__aarch64__)
auto from_fixed15 = [](F v) {
// The canonical math for this from SkFixed15.h is (v - (v>>8)) >> 7.
// But what's really most important is that all bytes round trip.
// We can do this in NEON in one instruction, a saturating narrowing right shift:
return vqshrn_n_u16(v, 7);
};
uint8x8x4_t rgba = {{
from_fixed15(r),
from_fixed15(g),
from_fixed15(b),
from_fixed15(a),
}};
vst4_u8((uint8_t*)ptr, rgba);
#elif defined(__ARM_NEON__)
auto from_fixed15 = [](F v) {
// Same as aarch64, but first we need to pad our vectors from 8 to 16 bytes.
F whatever;
return vqshrn_n_u16(vcombine_u16(v, whatever), 7);
};
// As in load_8888, I can't get quite the ideal code generation using vst4_lane_u8().
asm("vst4.8 {%1[0],%2[0],%3[0],%4[0]}, [%0]!\n"
"vst4.8 {%1[1],%2[1],%3[1],%4[1]}, [%0]!\n"
"vst4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
"vst4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
: "+r"(ptr)
: "w"(from_fixed15(r)), "w"(from_fixed15(g)), "w"(from_fixed15(b)), "w"(from_fixed15(a))
: "memory");
#else
auto from_fixed15 = [](F v) {
// See the note in aarch64's from_fixed15(). The same roundtrip goal applies here.
// Here we take a different approach: (v saturated+ v) >> 8.
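// e.g. v == 0x8000: the saturating add gives 0xffff, and 0xffff>>8 == 0xff.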
v = (v+v) >> 8;
return _mm_packus_epi16(_mm256_extracti128_si256(v, 0),
_mm256_extracti128_si256(v, 1));
};
auto R = from_fixed15(r),
G = from_fixed15(g),
B = from_fixed15(b),
A = from_fixed15(a);
auto rg_01234567 = _mm_unpacklo_epi8(R,G), // rg0 rg1 rg2 ... rg7
rg_89ABCDEF = _mm_unpackhi_epi8(R,G), // rg8 rg9 rgA ... rgF
ba_01234567 = _mm_unpacklo_epi8(B,A),
ba_89ABCDEF = _mm_unpackhi_epi8(B,A);
_mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi16(rg_01234567, ba_01234567));
_mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi16(rg_01234567, ba_01234567));
_mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi16(rg_89ABCDEF, ba_89ABCDEF));
_mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi16(rg_89ABCDEF, ba_89ABCDEF));
#endif
}

src/splicer/build_stages.py

@@ -15,6 +15,9 @@ hsw = '-mavx2 -mfma -mf16c'.split()
subprocess.check_call(['clang++'] + cflags + hsw +
['-c', 'src/splicer/SkSplicer_stages.cpp'] +
['-o', 'hsw.o'])
subprocess.check_call(['clang++'] + cflags + hsw +
['-c', 'src/splicer/SkSplicer_stages_lowp.cpp'] +
['-o', 'hsw_lowp.o'])
aarch64 = [
'--target=aarch64-linux-android',
@@ -24,6 +27,9 @@ aarch64 = [
subprocess.check_call(['clang++'] + cflags + aarch64 +
['-c', 'src/splicer/SkSplicer_stages.cpp'] +
['-o', 'aarch64.o'])
subprocess.check_call(['clang++'] + cflags + aarch64 +
['-c', 'src/splicer/SkSplicer_stages_lowp.cpp'] +
['-o', 'aarch64_lowp.o'])
armv7 = [
'--target=arm-linux-androideabi',
@@ -35,8 +41,11 @@ armv7 = [
subprocess.check_call(['clang++'] + cflags + armv7 +
['-c', 'src/splicer/SkSplicer_stages.cpp'] +
['-o', 'armv7.o'])
subprocess.check_call(['clang++'] + cflags + armv7 +
['-c', 'src/splicer/SkSplicer_stages_lowp.cpp'] +
['-o', 'armv7_lowp.o'])
def parse_object_file(dot_o, array_type, done, target=None):
def parse_object_file(dst, dot_o, array_type, done, target=None):
cmd = ['gobjdump', '-d', dot_o]
if target:
cmd += ['--target', target]
@@ -48,7 +57,7 @@ def parse_object_file(dot_o, array_type, done, target=None):
# E.g. 00000000000003a4 <_load_f16>:
m = re.match('''[0-9a-f]+ <_?(.*)>:''', line)
if m:
print 'static const', array_type, 'kSplice_' + m.group(1) + '[] = {'
print >>dst,'static const', array_type, 'kSplice_' + m.group(1) + '[] = {'
continue
columns = line.split('\t')
@@ -65,33 +74,35 @@
assert 'rip' not in arg # TODO: detect on aarch64 too
if code == done:
print '};'
print >>dst,'};'
continue
hexed = ''.join('0x'+x+',' for x in code.split(' '))
print ' ' + hexed + ' '*(44-len(hexed)) + \
'// ' + inst + ' '*(14-len(inst)) + args
print >>dst,' ' + hexed + ' '*(44-len(hexed)) + \
'// ' + inst + ' '*(14-len(inst)) + args
print '''/*
for suffix in ['', '_lowp']:
with open('src/splicer/SkSplicer_generated%s.h' % suffix, 'w') as f:
print >>f,'''/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkSplicer_generated_DEFINED
#define SkSplicer_generated_DEFINED
#ifndef SkSplicer_generated%s_DEFINED
#define SkSplicer_generated%s_DEFINED
// This file is generated semi-automatically with this command:
// $ src/splicer/build_stages.py > src/splicer/SkSplicer_generated.h
// $ src/splicer/build_stages.py
#if defined(__aarch64__)
'''
parse_object_file('aarch64.o', 'unsigned int', '14000000')
print '\n#elif defined(__ARM_NEON__)\n'
parse_object_file('armv7.o', 'unsigned int', 'eafffffe',
target='elf32-littlearm')
print '\n#else\n'
parse_object_file('hsw.o', 'unsigned char', 'e9 00 00 00 00')
print '\n#endif\n'
print '#endif//SkSplicer_generated_DEFINED'
''' % (suffix, suffix)
parse_object_file(f, 'aarch64%s.o' % suffix, 'unsigned int', '14000000')
print >>f,'\n#elif defined(__ARM_NEON__)\n'
parse_object_file(f, 'armv7%s.o' % suffix, 'unsigned int', 'eafffffe',
target='elf32-littlearm')
print >>f,'\n#else\n'
parse_object_file(f, 'hsw%s.o' % suffix, 'unsigned char', 'e9 00 00 00 00')
print >>f,'\n#endif\n'
print >>f,'#endif//SkSplicer_generated%s_DEFINED' % suffix