SkSplicer: lowp hacking
Add lowp variants for most stages in SkSplicer. These double the number of pixels handled by representing each channel with 16 bits, ranging from 0x0000 as 0 to 0x8000 as 1. This format lets us use the Q15 multiply instructions available in NEON and SSSE3 at full register width, with a little platform-specific fix up to smooth over the fact that these aren't quite Q15 values. When a lowp stage is unavailable, the entire pipeline upgrades to floats. So by simply not implementing sRGB, f16, matrix multiplication, etc, we naturally express that they're best handled with floats. These lowp stages ended up different enough that I've found it clearer to have them live in their own files, noting where they differ from the float stages. HSW, aarch64, and armv7 are all supported. I've seen very good things performance-wise on all platforms. Change-Id: Ib4f820c6665f2c9020f7449a2b51bbaf6c408a63 Reviewed-on: https://skia-review.googlesource.com/7098 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@chromium.org>
This commit is contained in:
parent
0ee6f95fa4
commit
f720098671
@ -16,10 +16,12 @@
|
||||
#endif
|
||||
|
||||
#include "SkSplicer_generated.h"
|
||||
#include "SkSplicer_generated_lowp.h"
|
||||
#include "SkSplicer_shared.h"
|
||||
|
||||
// Uncomment to dump output JIT'd pipeline.
|
||||
//#define DUMP "/tmp/dump.bin"
|
||||
//#define DUMP "/data/local/tmp/dump.bin"
|
||||
//
|
||||
// On x86, we'll include IACA markers too.
|
||||
// https://software.intel.com/en-us/articles/intel-architecture-code-analyzer
|
||||
@ -27,10 +29,10 @@
|
||||
// $ ./iaca.sh -arch HSW -64 -mark 0 /tmp/dump.bin | less
|
||||
//
|
||||
// To disassemble an aarch64 dump,
|
||||
// $ gobjdump -b binary -D dump.bin -m aarch64
|
||||
// $ adb pull /data/local/tmp/dump.bin; gobjdump -b binary -D dump.bin -m aarch64 | less
|
||||
//
|
||||
// To disassemble an armv7 dump,
|
||||
// $ gobjdump -b binary -D dump.bin -m arm
|
||||
// $ adb pull /data/local/tmp/dump.bin; gobjdump -b binary -D dump.bin -m arm | less
|
||||
|
||||
namespace {
|
||||
|
||||
@ -41,11 +43,18 @@ namespace {
|
||||
0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f, // from_srgb
|
||||
12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f, // to_srgb
|
||||
};
|
||||
static const SkSplicer_constants_lowp kConstants_lowp = {
|
||||
0x0001, 0x8000,
|
||||
};
|
||||
|
||||
// We do this a lot, so it's nice to infer the correct size. Works fine with arrays.
|
||||
template <typename T>
|
||||
static void splice(SkWStream* buf, const T& val) {
|
||||
buf->write(&val, sizeof(val));
|
||||
// This null check makes determining whether we can drop to lowp easier.
|
||||
// It's always known at compile time..
|
||||
if (buf) {
|
||||
buf->write(&val, sizeof(val));
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(__aarch64__)
|
||||
@ -59,7 +68,6 @@ namespace {
|
||||
splice(buf, 0xf2800000 | (parts[0] << 5) | 0x2); // merge 16-bit intermediate << 0 into x2
|
||||
}
|
||||
static void loop(SkWStream* buf, int loop_start) {
|
||||
splice(buf, 0x91001000); // add x0, x0, #4
|
||||
splice(buf, 0xeb01001f); // cmp x0, x1
|
||||
int off = loop_start - (int)buf->bytesWritten();
|
||||
off /= 4; // bytes -> instructions, still signed
|
||||
@ -81,7 +89,6 @@ namespace {
|
||||
splice(buf, 0xe3402000 | encode(parts[1])); // movt r2, <top 16 bits>
|
||||
}
|
||||
static void loop(SkWStream* buf, int loop_start) {
|
||||
splice(buf, 0xe2800002); // add r0, r0, #2
|
||||
splice(buf, 0xe1500001); // cmp r0, r1
|
||||
int off = loop_start - ((int)buf->bytesWritten() + 8 /*ARM is weird*/);
|
||||
off /= 4; // bytes -> instructions, still signed
|
||||
@ -99,10 +106,8 @@ namespace {
|
||||
splice(buf, ctx);
|
||||
}
|
||||
static void loop(SkWStream* buf, int loop_start) {
|
||||
static const uint8_t addq_8_rdi[] = { 0x48, 0x83, 0xc7, 0x08 };
|
||||
static const uint8_t cmp_rsi_rdi[] = { 0x48, 0x39, 0xf7 };
|
||||
static const uint8_t jb_near[] = { 0x0f, 0x8c };
|
||||
splice(buf, addq_8_rdi); // addq $8, %rdi
|
||||
splice(buf, cmp_rsi_rdi); // cmp %rsi, %rdi
|
||||
splice(buf, jb_near); // jb <next 4 bytes> (b == "before", unsigned less than)
|
||||
splice(buf, loop_start - (int)(buf->bytesWritten() + 4));
|
||||
@ -236,12 +241,64 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
static bool splice_lowp(SkWStream* buf, SkRasterPipeline::StockStage st) {
|
||||
switch (st) {
|
||||
default: return false;
|
||||
case SkRasterPipeline::clamp_0: break; // lowp can't go below 0.
|
||||
#define CASE(st) case SkRasterPipeline::st: splice(buf, kSplice_##st##_lowp); break
|
||||
CASE(clear);
|
||||
CASE(plus_);
|
||||
CASE(srcover);
|
||||
CASE(dstover);
|
||||
CASE(clamp_1);
|
||||
CASE(clamp_a);
|
||||
CASE(swap);
|
||||
CASE(move_src_dst);
|
||||
CASE(move_dst_src);
|
||||
CASE(premul);
|
||||
CASE(load_8888);
|
||||
CASE(store_8888);
|
||||
#undef CASE
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool splice_highp(SkWStream* buf, SkRasterPipeline::StockStage st) {
|
||||
switch (st) {
|
||||
default: return false;
|
||||
#define CASE(st) case SkRasterPipeline::st: splice(buf, kSplice_##st); break
|
||||
CASE(clear);
|
||||
CASE(plus_);
|
||||
CASE(srcover);
|
||||
CASE(dstover);
|
||||
CASE(clamp_0);
|
||||
CASE(clamp_1);
|
||||
CASE(clamp_a);
|
||||
CASE(swap);
|
||||
CASE(move_src_dst);
|
||||
CASE(move_dst_src);
|
||||
CASE(premul);
|
||||
CASE(unpremul);
|
||||
CASE(from_srgb);
|
||||
CASE(to_srgb);
|
||||
CASE(scale_u8);
|
||||
CASE(load_tables);
|
||||
CASE(load_8888);
|
||||
CASE(store_8888);
|
||||
CASE(load_f16);
|
||||
CASE(store_f16);
|
||||
CASE(matrix_3x4);
|
||||
#undef CASE
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
struct Spliced {
|
||||
|
||||
Spliced(const SkRasterPipeline::Stage* stages, int nstages) {
|
||||
// We always create a backup interpreter pipeline,
|
||||
// - to handle any program we can't, and
|
||||
// - to handle the n < kStride tails.
|
||||
// - to handle the n < stride tails.
|
||||
fBackup = SkOpts::compile_pipeline(stages, nstages);
|
||||
fSplicedLen = 0;
|
||||
fSpliced = nullptr;
|
||||
@ -260,12 +317,23 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
// See if all the stages can run in lowp mode. If so, we can run at ~2x speed.
|
||||
bool lowp = true;
|
||||
for (int i = 0; i < nstages; i++) {
|
||||
if (!splice_lowp(nullptr, stages[i].stage)) {
|
||||
//SkDebugf("SkSplicer can't yet handle stage %d in lowp.\n", stages[i].stage);
|
||||
lowp = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
fLowp = lowp;
|
||||
|
||||
SkDynamicMemoryWStream buf;
|
||||
|
||||
// Our loop is the equivalent of this C++ code:
|
||||
// do {
|
||||
// ... run spliced stages...
|
||||
// x += kStride;
|
||||
// x += stride;
|
||||
// } while(x < limit);
|
||||
before_loop(&buf);
|
||||
auto loop_start = buf.bytesWritten(); // Think of this like a label, loop_start:
|
||||
@ -277,36 +345,18 @@ namespace {
|
||||
}
|
||||
|
||||
// Splice in the code for the Stages, generated offline into SkSplicer_generated.h.
|
||||
switch(stages[i].stage) {
|
||||
case SkRasterPipeline::clear: splice(&buf, kSplice_clear ); break;
|
||||
case SkRasterPipeline::plus_: splice(&buf, kSplice_plus ); break;
|
||||
case SkRasterPipeline::srcover: splice(&buf, kSplice_srcover ); break;
|
||||
case SkRasterPipeline::dstover: splice(&buf, kSplice_dstover ); break;
|
||||
case SkRasterPipeline::clamp_0: splice(&buf, kSplice_clamp_0 ); break;
|
||||
case SkRasterPipeline::clamp_1: splice(&buf, kSplice_clamp_1 ); break;
|
||||
case SkRasterPipeline::clamp_a: splice(&buf, kSplice_clamp_a ); break;
|
||||
case SkRasterPipeline::swap: splice(&buf, kSplice_swap ); break;
|
||||
case SkRasterPipeline::move_src_dst: splice(&buf, kSplice_move_src_dst); break;
|
||||
case SkRasterPipeline::move_dst_src: splice(&buf, kSplice_move_dst_src); break;
|
||||
case SkRasterPipeline::premul: splice(&buf, kSplice_premul ); break;
|
||||
case SkRasterPipeline::unpremul: splice(&buf, kSplice_unpremul ); break;
|
||||
case SkRasterPipeline::from_srgb: splice(&buf, kSplice_from_srgb ); break;
|
||||
case SkRasterPipeline::to_srgb: splice(&buf, kSplice_to_srgb ); break;
|
||||
case SkRasterPipeline::scale_u8: splice(&buf, kSplice_scale_u8 ); break;
|
||||
case SkRasterPipeline::load_tables: splice(&buf, kSplice_load_tables ); break;
|
||||
case SkRasterPipeline::load_8888: splice(&buf, kSplice_load_8888 ); break;
|
||||
case SkRasterPipeline::store_8888: splice(&buf, kSplice_store_8888 ); break;
|
||||
case SkRasterPipeline::load_f16: splice(&buf, kSplice_load_f16 ); break;
|
||||
case SkRasterPipeline::store_f16: splice(&buf, kSplice_store_f16 ); break;
|
||||
case SkRasterPipeline::matrix_3x4: splice(&buf, kSplice_matrix_3x4 ); break;
|
||||
|
||||
// No joy (probably just not yet implemented).
|
||||
default:
|
||||
//SkDebugf("SkSplicer can't yet handle stage %d.\n", stages[i].stage);
|
||||
return;
|
||||
if (lowp) {
|
||||
SkAssertResult(splice_lowp(&buf, stages[i].stage));
|
||||
continue;
|
||||
}
|
||||
if (!splice_highp(&buf, stages[i].stage)) {
|
||||
//SkDebugf("SkSplicer can't yet handle stage %d.\n", stages[i].stage);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
lowp ? splice(&buf, kSplice_inc_x_lowp)
|
||||
: splice(&buf, kSplice_inc_x);
|
||||
loop(&buf, loop_start); // Loop back to handle more pixels if not done.
|
||||
after_loop(&buf);
|
||||
ret(&buf); // We're done.
|
||||
@ -323,7 +373,8 @@ namespace {
|
||||
// Spliced is stored in a std::function, so it needs to be copyable.
|
||||
Spliced(const Spliced& o) : fBackup (o.fBackup)
|
||||
, fSplicedLen(o.fSplicedLen)
|
||||
, fSpliced (copy_to_executable_mem(o.fSpliced, &fSplicedLen)) {}
|
||||
, fSpliced (copy_to_executable_mem(o.fSpliced, &fSplicedLen))
|
||||
, fLowp (o.fLowp) {}
|
||||
|
||||
~Spliced() {
|
||||
cleanup_executable_mem(fSpliced, fSplicedLen);
|
||||
@ -331,13 +382,17 @@ namespace {
|
||||
|
||||
// Here's where we call fSpliced if we created it, fBackup if not.
|
||||
void operator()(size_t x, size_t y, size_t n) const {
|
||||
size_t body = n/kStride*kStride; // Largest multiple of kStride (4 or 8) <= n.
|
||||
if (fSpliced && body) { // Can we run fSpliced for at least one kStride?
|
||||
size_t stride = fLowp ? kStride*2
|
||||
: kStride;
|
||||
size_t body = n/stride*stride; // Largest multiple of stride (2, 4, 8, or 16) <= n.
|
||||
if (fSpliced && body) { // Can we run fSpliced for at least one stride?
|
||||
// TODO: At some point we will want to pass in y...
|
||||
using Fn = void(size_t x, size_t limit, void* ctx, const SkSplicer_constants* k);
|
||||
((Fn*)fSpliced)(x, x+body, nullptr, &kConstants);
|
||||
using Fn = void(size_t x, size_t limit, void* ctx, const void* k);
|
||||
auto k = fLowp ? (const void*)&kConstants_lowp
|
||||
: (const void*)&kConstants;
|
||||
((Fn*)fSpliced)(x, x+body, nullptr, k);
|
||||
|
||||
// Fall through to fBackup for any n<kStride last pixels.
|
||||
// Fall through to fBackup for any n<stride last pixels.
|
||||
x += body;
|
||||
n -= body;
|
||||
}
|
||||
@ -347,6 +402,7 @@ namespace {
|
||||
std::function<void(size_t, size_t, size_t)> fBackup;
|
||||
size_t fSplicedLen;
|
||||
void* fSpliced;
|
||||
bool fLowp;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -9,17 +9,20 @@
|
||||
#define SkSplicer_generated_DEFINED
|
||||
|
||||
// This file is generated semi-automatically with this command:
|
||||
// $ src/splicer/build_stages.py > src/splicer/SkSplicer_generated.h
|
||||
// $ src/splicer/build_stages.py
|
||||
|
||||
#if defined(__aarch64__)
|
||||
|
||||
static const unsigned int kSplice_inc_x[] = {
|
||||
0x91001000, // add x0, x0, #0x4
|
||||
};
|
||||
static const unsigned int kSplice_clear[] = {
|
||||
0x6f00e400, // movi v0.2d, #0x0
|
||||
0x6f00e401, // movi v1.2d, #0x0
|
||||
0x6f00e402, // movi v2.2d, #0x0
|
||||
0x6f00e403, // movi v3.2d, #0x0
|
||||
};
|
||||
static const unsigned int kSplice_plus[] = {
|
||||
static const unsigned int kSplice_plus_[] = {
|
||||
0x4e24d400, // fadd v0.4s, v0.4s, v4.4s
|
||||
0x4e25d421, // fadd v1.4s, v1.4s, v5.4s
|
||||
0x4e26d442, // fadd v2.4s, v2.4s, v6.4s
|
||||
@ -363,13 +366,16 @@ static const unsigned int kSplice_matrix_3x4[] = {
|
||||
|
||||
#elif defined(__ARM_NEON__)
|
||||
|
||||
static const unsigned int kSplice_inc_x[] = {
|
||||
0xe2800002, // add r0, r0, #2
|
||||
};
|
||||
static const unsigned int kSplice_clear[] = {
|
||||
0xf2800010, // vmov.i32 d0, #0
|
||||
0xf2801010, // vmov.i32 d1, #0
|
||||
0xf2802010, // vmov.i32 d2, #0
|
||||
0xf2803010, // vmov.i32 d3, #0
|
||||
};
|
||||
static const unsigned int kSplice_plus[] = {
|
||||
static const unsigned int kSplice_plus_[] = {
|
||||
0xf2000d04, // vadd.f32 d0, d0, d4
|
||||
0xf2011d05, // vadd.f32 d1, d1, d5
|
||||
0xf2022d06, // vadd.f32 d2, d2, d6
|
||||
@ -738,13 +744,16 @@ static const unsigned int kSplice_matrix_3x4[] = {
|
||||
|
||||
#else
|
||||
|
||||
static const unsigned char kSplice_inc_x[] = {
|
||||
0x48,0x83,0xc7,0x08, // add $0x8,%rdi
|
||||
};
|
||||
static const unsigned char kSplice_clear[] = {
|
||||
0xc5,0xfc,0x57,0xc0, // vxorps %ymm0,%ymm0,%ymm0
|
||||
0xc5,0xf4,0x57,0xc9, // vxorps %ymm1,%ymm1,%ymm1
|
||||
0xc5,0xec,0x57,0xd2, // vxorps %ymm2,%ymm2,%ymm2
|
||||
0xc5,0xe4,0x57,0xdb, // vxorps %ymm3,%ymm3,%ymm3
|
||||
};
|
||||
static const unsigned char kSplice_plus[] = {
|
||||
static const unsigned char kSplice_plus_[] = {
|
||||
0xc5,0xfc,0x58,0xc4, // vaddps %ymm4,%ymm0,%ymm0
|
||||
0xc5,0xf4,0x58,0xcd, // vaddps %ymm5,%ymm1,%ymm1
|
||||
0xc5,0xec,0x58,0xd6, // vaddps %ymm6,%ymm2,%ymm2
|
||||
|
499
src/splicer/SkSplicer_generated_lowp.h
Normal file
499
src/splicer/SkSplicer_generated_lowp.h
Normal file
@ -0,0 +1,499 @@
|
||||
/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#ifndef SkSplicer_generated_lowp_DEFINED
|
||||
#define SkSplicer_generated_lowp_DEFINED
|
||||
|
||||
// This file is generated semi-automatically with this command:
|
||||
// $ src/splicer/build_stages.py
|
||||
|
||||
#if defined(__aarch64__)
|
||||
|
||||
static const unsigned int kSplice_inc_x_lowp[] = {
|
||||
0x91002000, // add x0, x0, #0x8
|
||||
};
|
||||
static const unsigned int kSplice_clear_lowp[] = {
|
||||
0x6f00e400, // movi v0.2d, #0x0
|
||||
0x6f00e401, // movi v1.2d, #0x0
|
||||
0x6f00e402, // movi v2.2d, #0x0
|
||||
0x6f00e403, // movi v3.2d, #0x0
|
||||
};
|
||||
static const unsigned int kSplice_plus__lowp[] = {
|
||||
0x6e640c00, // uqadd v0.8h, v0.8h, v4.8h
|
||||
0x6e650c21, // uqadd v1.8h, v1.8h, v5.8h
|
||||
0x6e660c42, // uqadd v2.8h, v2.8h, v6.8h
|
||||
0x6e670c63, // uqadd v3.8h, v3.8h, v7.8h
|
||||
};
|
||||
static const unsigned int kSplice_srcover_lowp[] = {
|
||||
0x91000868, // add x8, x3, #0x2
|
||||
0x4d40c510, // ld1r {v16.8h}, [x8]
|
||||
0x6e632e10, // uqsub v16.8h, v16.8h, v3.8h
|
||||
0x6e70b491, // sqrdmulh v17.8h, v4.8h, v16.8h
|
||||
0x4e241e12, // and v18.16b, v16.16b, v4.16b
|
||||
0x6e70b4b3, // sqrdmulh v19.8h, v5.8h, v16.8h
|
||||
0x4e60ba31, // abs v17.8h, v17.8h
|
||||
0x4e251e14, // and v20.16b, v16.16b, v5.16b
|
||||
0x6f111651, // usra v17.8h, v18.8h, #15
|
||||
0x6e70b4d2, // sqrdmulh v18.8h, v6.8h, v16.8h
|
||||
0x4e60ba73, // abs v19.8h, v19.8h
|
||||
0x6f111693, // usra v19.8h, v20.8h, #15
|
||||
0x4e261e14, // and v20.16b, v16.16b, v6.16b
|
||||
0x4e60ba52, // abs v18.8h, v18.8h
|
||||
0x6f111692, // usra v18.8h, v20.8h, #15
|
||||
0x6e70b4f4, // sqrdmulh v20.8h, v7.8h, v16.8h
|
||||
0x4e271e10, // and v16.16b, v16.16b, v7.16b
|
||||
0x4e60ba94, // abs v20.8h, v20.8h
|
||||
0x6f111614, // usra v20.8h, v16.8h, #15
|
||||
0x6e600e20, // uqadd v0.8h, v17.8h, v0.8h
|
||||
0x6e610e61, // uqadd v1.8h, v19.8h, v1.8h
|
||||
0x6e620e42, // uqadd v2.8h, v18.8h, v2.8h
|
||||
0x6e630e83, // uqadd v3.8h, v20.8h, v3.8h
|
||||
};
|
||||
static const unsigned int kSplice_dstover_lowp[] = {
|
||||
0x91000868, // add x8, x3, #0x2
|
||||
0x4d40c510, // ld1r {v16.8h}, [x8]
|
||||
0x6e672e10, // uqsub v16.8h, v16.8h, v7.8h
|
||||
0x6e70b411, // sqrdmulh v17.8h, v0.8h, v16.8h
|
||||
0x4e201e12, // and v18.16b, v16.16b, v0.16b
|
||||
0x6e70b433, // sqrdmulh v19.8h, v1.8h, v16.8h
|
||||
0x4e60ba31, // abs v17.8h, v17.8h
|
||||
0x4e211e14, // and v20.16b, v16.16b, v1.16b
|
||||
0x6f111651, // usra v17.8h, v18.8h, #15
|
||||
0x6e70b452, // sqrdmulh v18.8h, v2.8h, v16.8h
|
||||
0x4e60ba73, // abs v19.8h, v19.8h
|
||||
0x6f111693, // usra v19.8h, v20.8h, #15
|
||||
0x4e221e14, // and v20.16b, v16.16b, v2.16b
|
||||
0x4e60ba52, // abs v18.8h, v18.8h
|
||||
0x6f111692, // usra v18.8h, v20.8h, #15
|
||||
0x6e70b474, // sqrdmulh v20.8h, v3.8h, v16.8h
|
||||
0x4e231e10, // and v16.16b, v16.16b, v3.16b
|
||||
0x4e60ba94, // abs v20.8h, v20.8h
|
||||
0x6f111614, // usra v20.8h, v16.8h, #15
|
||||
0x6e640e24, // uqadd v4.8h, v17.8h, v4.8h
|
||||
0x6e650e65, // uqadd v5.8h, v19.8h, v5.8h
|
||||
0x6e660e46, // uqadd v6.8h, v18.8h, v6.8h
|
||||
0x6e670e87, // uqadd v7.8h, v20.8h, v7.8h
|
||||
};
|
||||
static const unsigned int kSplice_clamp_1_lowp[] = {
|
||||
0x91000868, // add x8, x3, #0x2
|
||||
0x4d40c510, // ld1r {v16.8h}, [x8]
|
||||
0x6e706c00, // umin v0.8h, v0.8h, v16.8h
|
||||
0x6e706c21, // umin v1.8h, v1.8h, v16.8h
|
||||
0x6e706c42, // umin v2.8h, v2.8h, v16.8h
|
||||
0x6e706c63, // umin v3.8h, v3.8h, v16.8h
|
||||
};
|
||||
static const unsigned int kSplice_clamp_a_lowp[] = {
|
||||
0x91000868, // add x8, x3, #0x2
|
||||
0x4d40c510, // ld1r {v16.8h}, [x8]
|
||||
0x6e706c63, // umin v3.8h, v3.8h, v16.8h
|
||||
0x6e636c00, // umin v0.8h, v0.8h, v3.8h
|
||||
0x6e636c21, // umin v1.8h, v1.8h, v3.8h
|
||||
0x6e636c42, // umin v2.8h, v2.8h, v3.8h
|
||||
};
|
||||
static const unsigned int kSplice_swap_lowp[] = {
|
||||
0x4ea31c70, // mov v16.16b, v3.16b
|
||||
0x4ea21c51, // mov v17.16b, v2.16b
|
||||
0x4ea11c32, // mov v18.16b, v1.16b
|
||||
0x4ea01c13, // mov v19.16b, v0.16b
|
||||
0x4ea41c80, // mov v0.16b, v4.16b
|
||||
0x4ea51ca1, // mov v1.16b, v5.16b
|
||||
0x4ea61cc2, // mov v2.16b, v6.16b
|
||||
0x4ea71ce3, // mov v3.16b, v7.16b
|
||||
0x4eb31e64, // mov v4.16b, v19.16b
|
||||
0x4eb21e45, // mov v5.16b, v18.16b
|
||||
0x4eb11e26, // mov v6.16b, v17.16b
|
||||
0x4eb01e07, // mov v7.16b, v16.16b
|
||||
};
|
||||
static const unsigned int kSplice_move_src_dst_lowp[] = {
|
||||
0x4ea01c04, // mov v4.16b, v0.16b
|
||||
0x4ea11c25, // mov v5.16b, v1.16b
|
||||
0x4ea21c46, // mov v6.16b, v2.16b
|
||||
0x4ea31c67, // mov v7.16b, v3.16b
|
||||
};
|
||||
static const unsigned int kSplice_move_dst_src_lowp[] = {
|
||||
0x4ea41c80, // mov v0.16b, v4.16b
|
||||
0x4ea51ca1, // mov v1.16b, v5.16b
|
||||
0x4ea61cc2, // mov v2.16b, v6.16b
|
||||
0x4ea71ce3, // mov v3.16b, v7.16b
|
||||
};
|
||||
static const unsigned int kSplice_premul_lowp[] = {
|
||||
0x6e63b410, // sqrdmulh v16.8h, v0.8h, v3.8h
|
||||
0x4e201c71, // and v17.16b, v3.16b, v0.16b
|
||||
0x4e60ba00, // abs v0.8h, v16.8h
|
||||
0x6e63b430, // sqrdmulh v16.8h, v1.8h, v3.8h
|
||||
0x6f111620, // usra v0.8h, v17.8h, #15
|
||||
0x4e211c71, // and v17.16b, v3.16b, v1.16b
|
||||
0x4e60ba01, // abs v1.8h, v16.8h
|
||||
0x6e63b450, // sqrdmulh v16.8h, v2.8h, v3.8h
|
||||
0x6f111621, // usra v1.8h, v17.8h, #15
|
||||
0x4e221c71, // and v17.16b, v3.16b, v2.16b
|
||||
0x4e60ba02, // abs v2.8h, v16.8h
|
||||
0x6f111622, // usra v2.8h, v17.8h, #15
|
||||
};
|
||||
static const unsigned int kSplice_load_8888_lowp[] = {
|
||||
0xf9400048, // ldr x8, [x2]
|
||||
0x8b000908, // add x8, x8, x0, lsl #2
|
||||
0x0c400110, // ld4 {v16.8b-v19.8b}, [x8]
|
||||
0x2f0fa600, // ushll v0.8h, v16.8b, #7
|
||||
0x2f0fa621, // ushll v1.8h, v17.8b, #7
|
||||
0x2f0fa642, // ushll v2.8h, v18.8b, #7
|
||||
0x2f0fa663, // ushll v3.8h, v19.8b, #7
|
||||
0x6f183400, // ursra v0.8h, v0.8h, #8
|
||||
0x6f183421, // ursra v1.8h, v1.8h, #8
|
||||
0x6f183442, // ursra v2.8h, v2.8h, #8
|
||||
0x6f183463, // ursra v3.8h, v3.8h, #8
|
||||
};
|
||||
static const unsigned int kSplice_store_8888_lowp[] = {
|
||||
0xf9400048, // ldr x8, [x2]
|
||||
0x2f099410, // uqshrn v16.8b, v0.8h, #7
|
||||
0x2f099431, // uqshrn v17.8b, v1.8h, #7
|
||||
0x2f099452, // uqshrn v18.8b, v2.8h, #7
|
||||
0x8b000908, // add x8, x8, x0, lsl #2
|
||||
0x2f099473, // uqshrn v19.8b, v3.8h, #7
|
||||
0x0c000110, // st4 {v16.8b-v19.8b}, [x8]
|
||||
};
|
||||
|
||||
#elif defined(__ARM_NEON__)
|
||||
|
||||
static const unsigned int kSplice_inc_x_lowp[] = {
|
||||
0xe2800004, // add r0, r0, #4
|
||||
};
|
||||
static const unsigned int kSplice_clear_lowp[] = {
|
||||
0xf2800010, // vmov.i32 d0, #0
|
||||
0xf2801010, // vmov.i32 d1, #0
|
||||
0xf2802010, // vmov.i32 d2, #0
|
||||
0xf2803010, // vmov.i32 d3, #0
|
||||
};
|
||||
static const unsigned int kSplice_plus__lowp[] = {
|
||||
0xf3100014, // vqadd.u16 d0, d0, d4
|
||||
0xf3111015, // vqadd.u16 d1, d1, d5
|
||||
0xf3122016, // vqadd.u16 d2, d2, d6
|
||||
0xf3133017, // vqadd.u16 d3, d3, d7
|
||||
};
|
||||
static const unsigned int kSplice_srcover_lowp[] = {
|
||||
0xe283c002, // add ip, r3, #2
|
||||
0xf4ec0c5f, // vld1.16 {d16[]}, [ip :16]
|
||||
0xf3500293, // vqsub.u16 d16, d16, d3
|
||||
0xf3541b20, // vqrdmulh.s16 d17, d4, d16
|
||||
0xf3552b20, // vqrdmulh.s16 d18, d5, d16
|
||||
0xf3563b20, // vqrdmulh.s16 d19, d6, d16
|
||||
0xf3574b20, // vqrdmulh.s16 d20, d7, d16
|
||||
0xf2405194, // vand d21, d16, d4
|
||||
0xf2406195, // vand d22, d16, d5
|
||||
0xf2407196, // vand d23, d16, d6
|
||||
0xf2400197, // vand d16, d16, d7
|
||||
0xf3f51321, // vabs.s16 d17, d17
|
||||
0xf3f52322, // vabs.s16 d18, d18
|
||||
0xf3f53323, // vabs.s16 d19, d19
|
||||
0xf3f54324, // vabs.s16 d20, d20
|
||||
0xf3d11135, // vsra.u16 d17, d21, #15
|
||||
0xf3d12136, // vsra.u16 d18, d22, #15
|
||||
0xf3d13137, // vsra.u16 d19, d23, #15
|
||||
0xf3d14130, // vsra.u16 d20, d16, #15
|
||||
0xf3110090, // vqadd.u16 d0, d17, d0
|
||||
0xf3121091, // vqadd.u16 d1, d18, d1
|
||||
0xf3132092, // vqadd.u16 d2, d19, d2
|
||||
0xf3143093, // vqadd.u16 d3, d20, d3
|
||||
};
|
||||
static const unsigned int kSplice_dstover_lowp[] = {
|
||||
0xe283c002, // add ip, r3, #2
|
||||
0xf4ec0c5f, // vld1.16 {d16[]}, [ip :16]
|
||||
0xf3500297, // vqsub.u16 d16, d16, d7
|
||||
0xf3501b20, // vqrdmulh.s16 d17, d0, d16
|
||||
0xf3512b20, // vqrdmulh.s16 d18, d1, d16
|
||||
0xf3523b20, // vqrdmulh.s16 d19, d2, d16
|
||||
0xf3534b20, // vqrdmulh.s16 d20, d3, d16
|
||||
0xf2405190, // vand d21, d16, d0
|
||||
0xf2406191, // vand d22, d16, d1
|
||||
0xf2407192, // vand d23, d16, d2
|
||||
0xf2400193, // vand d16, d16, d3
|
||||
0xf3f51321, // vabs.s16 d17, d17
|
||||
0xf3f52322, // vabs.s16 d18, d18
|
||||
0xf3f53323, // vabs.s16 d19, d19
|
||||
0xf3f54324, // vabs.s16 d20, d20
|
||||
0xf3d11135, // vsra.u16 d17, d21, #15
|
||||
0xf3d12136, // vsra.u16 d18, d22, #15
|
||||
0xf3d13137, // vsra.u16 d19, d23, #15
|
||||
0xf3d14130, // vsra.u16 d20, d16, #15
|
||||
0xf3114094, // vqadd.u16 d4, d17, d4
|
||||
0xf3125095, // vqadd.u16 d5, d18, d5
|
||||
0xf3136096, // vqadd.u16 d6, d19, d6
|
||||
0xf3147097, // vqadd.u16 d7, d20, d7
|
||||
};
|
||||
static const unsigned int kSplice_clamp_1_lowp[] = {
|
||||
0xe283c002, // add ip, r3, #2
|
||||
0xf4ec0c5f, // vld1.16 {d16[]}, [ip :16]
|
||||
0xf3100630, // vmin.u16 d0, d0, d16
|
||||
0xf3111630, // vmin.u16 d1, d1, d16
|
||||
0xf3122630, // vmin.u16 d2, d2, d16
|
||||
0xf3133630, // vmin.u16 d3, d3, d16
|
||||
};
|
||||
static const unsigned int kSplice_clamp_a_lowp[] = {
|
||||
0xe283c002, // add ip, r3, #2
|
||||
0xf4ec0c5f, // vld1.16 {d16[]}, [ip :16]
|
||||
0xf3133630, // vmin.u16 d3, d3, d16
|
||||
0xf3100613, // vmin.u16 d0, d0, d3
|
||||
0xf3111613, // vmin.u16 d1, d1, d3
|
||||
0xf3122613, // vmin.u16 d2, d2, d3
|
||||
};
|
||||
static const unsigned int kSplice_swap_lowp[] = {
|
||||
0xeef00b43, // vmov.f64 d16, d3
|
||||
0xeef01b42, // vmov.f64 d17, d2
|
||||
0xeef02b41, // vmov.f64 d18, d1
|
||||
0xeef03b40, // vmov.f64 d19, d0
|
||||
0xeeb00b44, // vmov.f64 d0, d4
|
||||
0xeeb01b45, // vmov.f64 d1, d5
|
||||
0xeeb02b46, // vmov.f64 d2, d6
|
||||
0xeeb03b47, // vmov.f64 d3, d7
|
||||
0xeeb04b63, // vmov.f64 d4, d19
|
||||
0xeeb05b62, // vmov.f64 d5, d18
|
||||
0xeeb06b61, // vmov.f64 d6, d17
|
||||
0xeeb07b60, // vmov.f64 d7, d16
|
||||
};
|
||||
static const unsigned int kSplice_move_src_dst_lowp[] = {
|
||||
0xeeb04b40, // vmov.f64 d4, d0
|
||||
0xeeb05b41, // vmov.f64 d5, d1
|
||||
0xeeb06b42, // vmov.f64 d6, d2
|
||||
0xeeb07b43, // vmov.f64 d7, d3
|
||||
};
|
||||
static const unsigned int kSplice_move_dst_src_lowp[] = {
|
||||
0xeeb00b44, // vmov.f64 d0, d4
|
||||
0xeeb01b45, // vmov.f64 d1, d5
|
||||
0xeeb02b46, // vmov.f64 d2, d6
|
||||
0xeeb03b47, // vmov.f64 d3, d7
|
||||
};
|
||||
static const unsigned int kSplice_premul_lowp[] = {
|
||||
0xf3500b03, // vqrdmulh.s16 d16, d0, d3
|
||||
0xf3511b03, // vqrdmulh.s16 d17, d1, d3
|
||||
0xf3522b03, // vqrdmulh.s16 d18, d2, d3
|
||||
0xf2433110, // vand d19, d3, d0
|
||||
0xf2434111, // vand d20, d3, d1
|
||||
0xf3b50320, // vabs.s16 d0, d16
|
||||
0xf2430112, // vand d16, d3, d2
|
||||
0xf3b51321, // vabs.s16 d1, d17
|
||||
0xf3b52322, // vabs.s16 d2, d18
|
||||
0xf3910133, // vsra.u16 d0, d19, #15
|
||||
0xf3911134, // vsra.u16 d1, d20, #15
|
||||
0xf3912130, // vsra.u16 d2, d16, #15
|
||||
};
|
||||
static const unsigned int kSplice_load_8888_lowp[] = {
|
||||
0xe592c000, // ldr ip, [r2]
|
||||
0xe08cc100, // add ip, ip, r0, lsl #2
|
||||
0xf4ec030d, // vld4.8 {d16[0],d17[0],d18[0],d19[0]}, [ip]!
|
||||
0xf4ec032d, // vld4.8 {d16[1],d17[1],d18[1],d19[1]}, [ip]!
|
||||
0xf4ec034d, // vld4.8 {d16[2],d17[2],d18[2],d19[2]}, [ip]!
|
||||
0xf4ec036d, // vld4.8 {d16[3],d17[3],d18[3],d19[3]}, [ip]!
|
||||
0xf38f0a30, // vshll.u8 q0, d16, #7
|
||||
0xf38f2a32, // vshll.u8 q1, d18, #7
|
||||
0xf3cf0a31, // vshll.u8 q8, d17, #7
|
||||
0xf3cf2a33, // vshll.u8 q9, d19, #7
|
||||
0xf3980350, // vrsra.u16 q0, q0, #8
|
||||
0xf3d80370, // vrsra.u16 q8, q8, #8
|
||||
0xf3d82372, // vrsra.u16 q9, q9, #8
|
||||
0xf3982352, // vrsra.u16 q1, q1, #8
|
||||
0xf22011b0, // vorr d1, d16, d16
|
||||
0xf22231b2, // vorr d3, d18, d18
|
||||
};
|
||||
static const unsigned int kSplice_store_8888_lowp[] = {
|
||||
0xf2630113, // vorr d16, d3, d3
|
||||
0xe592c000, // ldr ip, [r2]
|
||||
0xf2612111, // vorr d18, d1, d1
|
||||
0xf3c94910, // vqshrn.u16 d20, q0, #7
|
||||
0xe08cc100, // add ip, ip, r0, lsl #2
|
||||
0xf3c96912, // vqshrn.u16 d22, q1, #7
|
||||
0xf3c95932, // vqshrn.u16 d21, q9, #7
|
||||
0xf3c97930, // vqshrn.u16 d23, q8, #7
|
||||
0xf4cc430d, // vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [ip]!
|
||||
0xf4cc432d, // vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [ip]!
|
||||
0xf4cc434d, // vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [ip]!
|
||||
0xf4cc436d, // vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [ip]!
|
||||
};
|
||||
|
||||
#else
|
||||
|
||||
static const unsigned char kSplice_inc_x_lowp[] = {
|
||||
0x48,0x83,0xc7,0x10, // add $0x10,%rdi
|
||||
};
|
||||
static const unsigned char kSplice_clear_lowp[] = {
|
||||
0xc5,0xfc,0x57,0xc0, // vxorps %ymm0,%ymm0,%ymm0
|
||||
0xc5,0xf4,0x57,0xc9, // vxorps %ymm1,%ymm1,%ymm1
|
||||
0xc5,0xec,0x57,0xd2, // vxorps %ymm2,%ymm2,%ymm2
|
||||
0xc5,0xe4,0x57,0xdb, // vxorps %ymm3,%ymm3,%ymm3
|
||||
};
|
||||
static const unsigned char kSplice_plus__lowp[] = {
|
||||
0xc5,0xfd,0xdd,0xc4, // vpaddusw %ymm4,%ymm0,%ymm0
|
||||
0xc5,0xf5,0xdd,0xcd, // vpaddusw %ymm5,%ymm1,%ymm1
|
||||
0xc5,0xed,0xdd,0xd6, // vpaddusw %ymm6,%ymm2,%ymm2
|
||||
0xc5,0xe5,0xdd,0xdf, // vpaddusw %ymm7,%ymm3,%ymm3
|
||||
};
|
||||
static const unsigned char kSplice_srcover_lowp[] = {
|
||||
0xc4,0x62,0x7d,0x79,0x41,0x02, // vpbroadcastw 0x2(%rcx),%ymm8
|
||||
0xc5,0x3d,0xd9,0xc3, // vpsubusw %ymm3,%ymm8,%ymm8
|
||||
0xc4,0x42,0x5d,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm4,%ymm9
|
||||
0xc4,0x42,0x7d,0x1d,0xc9, // vpabsw %ymm9,%ymm9
|
||||
0xc5,0xb5,0xdd,0xc0, // vpaddusw %ymm0,%ymm9,%ymm0
|
||||
0xc4,0x42,0x55,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm5,%ymm9
|
||||
0xc4,0x42,0x7d,0x1d,0xc9, // vpabsw %ymm9,%ymm9
|
||||
0xc5,0xb5,0xdd,0xc9, // vpaddusw %ymm1,%ymm9,%ymm1
|
||||
0xc4,0x42,0x4d,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm6,%ymm9
|
||||
0xc4,0x42,0x7d,0x1d,0xc9, // vpabsw %ymm9,%ymm9
|
||||
0xc5,0xb5,0xdd,0xd2, // vpaddusw %ymm2,%ymm9,%ymm2
|
||||
0xc4,0x42,0x45,0x0b,0xc0, // vpmulhrsw %ymm8,%ymm7,%ymm8
|
||||
0xc4,0x42,0x7d,0x1d,0xc0, // vpabsw %ymm8,%ymm8
|
||||
0xc5,0xbd,0xdd,0xdb, // vpaddusw %ymm3,%ymm8,%ymm3
|
||||
};
|
||||
static const unsigned char kSplice_dstover_lowp[] = {
|
||||
0xc4,0x62,0x7d,0x79,0x41,0x02, // vpbroadcastw 0x2(%rcx),%ymm8
|
||||
0xc5,0x3d,0xd9,0xc7, // vpsubusw %ymm7,%ymm8,%ymm8
|
||||
0xc4,0x42,0x7d,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm0,%ymm9
|
||||
0xc4,0x42,0x7d,0x1d,0xc9, // vpabsw %ymm9,%ymm9
|
||||
0xc5,0xb5,0xdd,0xe4, // vpaddusw %ymm4,%ymm9,%ymm4
|
||||
0xc4,0x42,0x75,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm1,%ymm9
|
||||
0xc4,0x42,0x7d,0x1d,0xc9, // vpabsw %ymm9,%ymm9
|
||||
0xc5,0xb5,0xdd,0xed, // vpaddusw %ymm5,%ymm9,%ymm5
|
||||
0xc4,0x42,0x6d,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm2,%ymm9
|
||||
0xc4,0x42,0x7d,0x1d,0xc9, // vpabsw %ymm9,%ymm9
|
||||
0xc5,0xb5,0xdd,0xf6, // vpaddusw %ymm6,%ymm9,%ymm6
|
||||
0xc4,0x42,0x65,0x0b,0xc0, // vpmulhrsw %ymm8,%ymm3,%ymm8
|
||||
0xc4,0x42,0x7d,0x1d,0xc0, // vpabsw %ymm8,%ymm8
|
||||
0xc5,0xbd,0xdd,0xff, // vpaddusw %ymm7,%ymm8,%ymm7
|
||||
};
|
||||
static const unsigned char kSplice_clamp_1_lowp[] = {
|
||||
0xc4,0x62,0x7d,0x79,0x41,0x02, // vpbroadcastw 0x2(%rcx),%ymm8
|
||||
0xc4,0xc2,0x7d,0x3a,0xc0, // vpminuw %ymm8,%ymm0,%ymm0
|
||||
0xc4,0xc2,0x75,0x3a,0xc8, // vpminuw %ymm8,%ymm1,%ymm1
|
||||
0xc4,0xc2,0x6d,0x3a,0xd0, // vpminuw %ymm8,%ymm2,%ymm2
|
||||
0xc4,0xc2,0x65,0x3a,0xd8, // vpminuw %ymm8,%ymm3,%ymm3
|
||||
};
|
||||
static const unsigned char kSplice_clamp_a_lowp[] = {
|
||||
0xc4,0x62,0x7d,0x79,0x41,0x02, // vpbroadcastw 0x2(%rcx),%ymm8
|
||||
0xc4,0xc2,0x65,0x3a,0xd8, // vpminuw %ymm8,%ymm3,%ymm3
|
||||
0xc4,0xe2,0x7d,0x3a,0xc3, // vpminuw %ymm3,%ymm0,%ymm0
|
||||
0xc4,0xe2,0x75,0x3a,0xcb, // vpminuw %ymm3,%ymm1,%ymm1
|
||||
0xc4,0xe2,0x6d,0x3a,0xd3, // vpminuw %ymm3,%ymm2,%ymm2
|
||||
};
|
||||
static const unsigned char kSplice_swap_lowp[] = {
|
||||
0xc5,0x7c,0x28,0xc3, // vmovaps %ymm3,%ymm8
|
||||
0xc5,0x7c,0x28,0xca, // vmovaps %ymm2,%ymm9
|
||||
0xc5,0x7c,0x28,0xd1, // vmovaps %ymm1,%ymm10
|
||||
0xc5,0x7c,0x28,0xd8, // vmovaps %ymm0,%ymm11
|
||||
0xc5,0xfc,0x28,0xc4, // vmovaps %ymm4,%ymm0
|
||||
0xc5,0xfc,0x28,0xcd, // vmovaps %ymm5,%ymm1
|
||||
0xc5,0xfc,0x28,0xd6, // vmovaps %ymm6,%ymm2
|
||||
0xc5,0xfc,0x28,0xdf, // vmovaps %ymm7,%ymm3
|
||||
0xc5,0x7c,0x29,0xdc, // vmovaps %ymm11,%ymm4
|
||||
0xc5,0x7c,0x29,0xd5, // vmovaps %ymm10,%ymm5
|
||||
0xc5,0x7c,0x29,0xce, // vmovaps %ymm9,%ymm6
|
||||
0xc5,0x7c,0x29,0xc7, // vmovaps %ymm8,%ymm7
|
||||
};
|
||||
static const unsigned char kSplice_move_src_dst_lowp[] = {
|
||||
0xc5,0xfc,0x28,0xe0, // vmovaps %ymm0,%ymm4
|
||||
0xc5,0xfc,0x28,0xe9, // vmovaps %ymm1,%ymm5
|
||||
0xc5,0xfc,0x28,0xf2, // vmovaps %ymm2,%ymm6
|
||||
0xc5,0xfc,0x28,0xfb, // vmovaps %ymm3,%ymm7
|
||||
};
|
||||
static const unsigned char kSplice_move_dst_src_lowp[] = {
|
||||
0xc5,0xfc,0x28,0xc4, // vmovaps %ymm4,%ymm0
|
||||
0xc5,0xfc,0x28,0xcd, // vmovaps %ymm5,%ymm1
|
||||
0xc5,0xfc,0x28,0xd6, // vmovaps %ymm6,%ymm2
|
||||
0xc5,0xfc,0x28,0xdf, // vmovaps %ymm7,%ymm3
|
||||
};
|
||||
static const unsigned char kSplice_premul_lowp[] = {
|
||||
0xc4,0xe2,0x7d,0x0b,0xc3, // vpmulhrsw %ymm3,%ymm0,%ymm0
|
||||
0xc4,0xe2,0x7d,0x1d,0xc0, // vpabsw %ymm0,%ymm0
|
||||
0xc4,0xe2,0x75,0x0b,0xcb, // vpmulhrsw %ymm3,%ymm1,%ymm1
|
||||
0xc4,0xe2,0x7d,0x1d,0xc9, // vpabsw %ymm1,%ymm1
|
||||
0xc4,0xe2,0x6d,0x0b,0xd3, // vpmulhrsw %ymm3,%ymm2,%ymm2
|
||||
0xc4,0xe2,0x7d,0x1d,0xd2, // vpabsw %ymm2,%ymm2
|
||||
};
|
||||
static const unsigned char kSplice_load_8888_lowp[] = {
|
||||
0x48,0x8b,0x02, // mov (%rdx),%rax
|
||||
0xc5,0xfa,0x6f,0x04,0xb8, // vmovdqu (%rax,%rdi,4),%xmm0
|
||||
0xc5,0xfa,0x6f,0x4c,0xb8,0x10, // vmovdqu 0x10(%rax,%rdi,4),%xmm1
|
||||
0xc5,0xfa,0x6f,0x54,0xb8,0x20, // vmovdqu 0x20(%rax,%rdi,4),%xmm2
|
||||
0xc5,0xfa,0x6f,0x5c,0xb8,0x30, // vmovdqu 0x30(%rax,%rdi,4),%xmm3
|
||||
0xc5,0x79,0x60,0xc1, // vpunpcklbw %xmm1,%xmm0,%xmm8
|
||||
0xc5,0xf9,0x68,0xc1, // vpunpckhbw %xmm1,%xmm0,%xmm0
|
||||
0xc5,0xe9,0x60,0xcb, // vpunpcklbw %xmm3,%xmm2,%xmm1
|
||||
0xc5,0xe9,0x68,0xd3, // vpunpckhbw %xmm3,%xmm2,%xmm2
|
||||
0xc5,0xb9,0x60,0xd8, // vpunpcklbw %xmm0,%xmm8,%xmm3
|
||||
0xc5,0xb9,0x68,0xc0, // vpunpckhbw %xmm0,%xmm8,%xmm0
|
||||
0xc5,0x71,0x60,0xc2, // vpunpcklbw %xmm2,%xmm1,%xmm8
|
||||
0xc5,0xf1,0x68,0xca, // vpunpckhbw %xmm2,%xmm1,%xmm1
|
||||
0xc5,0xe1,0x60,0xd0, // vpunpcklbw %xmm0,%xmm3,%xmm2
|
||||
0xc5,0x61,0x68,0xc8, // vpunpckhbw %xmm0,%xmm3,%xmm9
|
||||
0xc5,0xb9,0x60,0xd9, // vpunpcklbw %xmm1,%xmm8,%xmm3
|
||||
0xc5,0x39,0x68,0xc1, // vpunpckhbw %xmm1,%xmm8,%xmm8
|
||||
0xc5,0xe9,0x6c,0xc3, // vpunpcklqdq %xmm3,%xmm2,%xmm0
|
||||
0xc4,0xe2,0x7d,0x30,0xc0, // vpmovzxbw %xmm0,%ymm0
|
||||
0xc5,0xf5,0x71,0xf0,0x07, // vpsllw $0x7,%ymm0,%ymm1
|
||||
0xc5,0xad,0x71,0xd0,0x01, // vpsrlw $0x1,%ymm0,%ymm10
|
||||
0xc4,0xc1,0x75,0xdd,0xca, // vpaddusw %ymm10,%ymm1,%ymm1
|
||||
0xc4,0x62,0x7d,0x79,0x11, // vpbroadcastw (%rcx),%ymm10
|
||||
0xc4,0xc1,0x7d,0xdd,0xc2, // vpaddusw %ymm10,%ymm0,%ymm0
|
||||
0xc5,0xfd,0x71,0xd0,0x08, // vpsrlw $0x8,%ymm0,%ymm0
|
||||
0xc5,0xf5,0xdd,0xc0, // vpaddusw %ymm0,%ymm1,%ymm0
|
||||
0xc5,0xe9,0x6d,0xcb, // vpunpckhqdq %xmm3,%xmm2,%xmm1
|
||||
0xc4,0xe2,0x7d,0x30,0xc9, // vpmovzxbw %xmm1,%ymm1
|
||||
0xc5,0xed,0x71,0xf1,0x07, // vpsllw $0x7,%ymm1,%ymm2
|
||||
0xc5,0xe5,0x71,0xd1,0x01, // vpsrlw $0x1,%ymm1,%ymm3
|
||||
0xc5,0xed,0xdd,0xd3, // vpaddusw %ymm3,%ymm2,%ymm2
|
||||
0xc4,0xc1,0x75,0xdd,0xca, // vpaddusw %ymm10,%ymm1,%ymm1
|
||||
0xc5,0xf5,0x71,0xd1,0x08, // vpsrlw $0x8,%ymm1,%ymm1
|
||||
0xc5,0xed,0xdd,0xc9, // vpaddusw %ymm1,%ymm2,%ymm1
|
||||
0xc4,0xc1,0x31,0x6c,0xd0, // vpunpcklqdq %xmm8,%xmm9,%xmm2
|
||||
0xc4,0xe2,0x7d,0x30,0xd2, // vpmovzxbw %xmm2,%ymm2
|
||||
0xc5,0xe5,0x71,0xf2,0x07, // vpsllw $0x7,%ymm2,%ymm3
|
||||
0xc5,0xa5,0x71,0xd2,0x01, // vpsrlw $0x1,%ymm2,%ymm11
|
||||
0xc4,0xc1,0x65,0xdd,0xdb, // vpaddusw %ymm11,%ymm3,%ymm3
|
||||
0xc4,0xc1,0x6d,0xdd,0xd2, // vpaddusw %ymm10,%ymm2,%ymm2
|
||||
0xc5,0xed,0x71,0xd2,0x08, // vpsrlw $0x8,%ymm2,%ymm2
|
||||
0xc5,0xe5,0xdd,0xd2, // vpaddusw %ymm2,%ymm3,%ymm2
|
||||
0xc4,0xc1,0x31,0x6d,0xd8, // vpunpckhqdq %xmm8,%xmm9,%xmm3
|
||||
0xc4,0xe2,0x7d,0x30,0xdb, // vpmovzxbw %xmm3,%ymm3
|
||||
0xc5,0xbd,0x71,0xf3,0x07, // vpsllw $0x7,%ymm3,%ymm8
|
||||
0xc5,0xb5,0x71,0xd3,0x01, // vpsrlw $0x1,%ymm3,%ymm9
|
||||
0xc4,0x41,0x3d,0xdd,0xc1, // vpaddusw %ymm9,%ymm8,%ymm8
|
||||
0xc4,0xc1,0x65,0xdd,0xda, // vpaddusw %ymm10,%ymm3,%ymm3
|
||||
0xc5,0xe5,0x71,0xd3,0x08, // vpsrlw $0x8,%ymm3,%ymm3
|
||||
0xc5,0xbd,0xdd,0xdb, // vpaddusw %ymm3,%ymm8,%ymm3
|
||||
};
|
||||
static const unsigned char kSplice_store_8888_lowp[] = {
|
||||
0x48,0x8b,0x02, // mov (%rdx),%rax
|
||||
0xc5,0x7d,0xdd,0xc0, // vpaddusw %ymm0,%ymm0,%ymm8
|
||||
0xc4,0xc1,0x3d,0x71,0xd0,0x08, // vpsrlw $0x8,%ymm8,%ymm8
|
||||
0xc4,0x43,0x7d,0x39,0xc1,0x01, // vextracti128 $0x1,%ymm8,%xmm9
|
||||
0xc4,0x41,0x39,0x67,0xc1, // vpackuswb %xmm9,%xmm8,%xmm8
|
||||
0xc5,0x75,0xdd,0xc9, // vpaddusw %ymm1,%ymm1,%ymm9
|
||||
0xc4,0xc1,0x35,0x71,0xd1,0x08, // vpsrlw $0x8,%ymm9,%ymm9
|
||||
0xc4,0x43,0x7d,0x39,0xca,0x01, // vextracti128 $0x1,%ymm9,%xmm10
|
||||
0xc4,0x41,0x31,0x67,0xca, // vpackuswb %xmm10,%xmm9,%xmm9
|
||||
0xc5,0x6d,0xdd,0xd2, // vpaddusw %ymm2,%ymm2,%ymm10
|
||||
0xc4,0xc1,0x2d,0x71,0xd2,0x08, // vpsrlw $0x8,%ymm10,%ymm10
|
||||
0xc4,0x43,0x7d,0x39,0xd3,0x01, // vextracti128 $0x1,%ymm10,%xmm11
|
||||
0xc4,0x41,0x29,0x67,0xd3, // vpackuswb %xmm11,%xmm10,%xmm10
|
||||
0xc5,0x65,0xdd,0xdb, // vpaddusw %ymm3,%ymm3,%ymm11
|
||||
0xc4,0xc1,0x25,0x71,0xd3,0x08, // vpsrlw $0x8,%ymm11,%ymm11
|
||||
0xc4,0x43,0x7d,0x39,0xdc,0x01, // vextracti128 $0x1,%ymm11,%xmm12
|
||||
0xc4,0x41,0x21,0x67,0xdc, // vpackuswb %xmm12,%xmm11,%xmm11
|
||||
0xc4,0x41,0x39,0x60,0xe1, // vpunpcklbw %xmm9,%xmm8,%xmm12
|
||||
0xc4,0x41,0x39,0x68,0xc1, // vpunpckhbw %xmm9,%xmm8,%xmm8
|
||||
0xc4,0x41,0x29,0x60,0xcb, // vpunpcklbw %xmm11,%xmm10,%xmm9
|
||||
0xc4,0x41,0x29,0x68,0xd3, // vpunpckhbw %xmm11,%xmm10,%xmm10
|
||||
0xc4,0x41,0x19,0x61,0xd9, // vpunpcklwd %xmm9,%xmm12,%xmm11
|
||||
0xc5,0x7a,0x7f,0x1c,0xb8, // vmovdqu %xmm11,(%rax,%rdi,4)
|
||||
0xc4,0x41,0x19,0x69,0xc9, // vpunpckhwd %xmm9,%xmm12,%xmm9
|
||||
0xc5,0x7a,0x7f,0x4c,0xb8,0x10, // vmovdqu %xmm9,0x10(%rax,%rdi,4)
|
||||
0xc4,0x41,0x39,0x61,0xca, // vpunpcklwd %xmm10,%xmm8,%xmm9
|
||||
0xc5,0x7a,0x7f,0x4c,0xb8,0x20, // vmovdqu %xmm9,0x20(%rax,%rdi,4)
|
||||
0xc4,0x41,0x39,0x69,0xc2, // vpunpckhwd %xmm10,%xmm8,%xmm8
|
||||
0xc5,0x7a,0x7f,0x44,0xb8,0x30, // vmovdqu %xmm8,0x30(%rax,%rdi,4)
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#endif//SkSplicer_generated_lowp_DEFINED
|
@ -40,4 +40,9 @@ struct SkSplicer_constants {
|
||||
float _00043; // 0.0043f
|
||||
};
|
||||
|
||||
struct SkSplicer_constants_lowp {
|
||||
uint16_t _0x0001; // 0x0001 == 1 == epsilon
|
||||
uint16_t _1; // 0x8000 == 32768 == 1.0
|
||||
};
|
||||
|
||||
#endif//SkSplicer_shared_DEFINED
|
||||
|
@ -119,14 +119,14 @@ C void done(size_t, size_t, void*, K*, F,F,F,F, F,F,F,F);
|
||||
// This should feel familiar to anyone who's read SkRasterPipeline_opts.h.
|
||||
// It's just a convenience to make a valid, spliceable Stage, nothing magic.
|
||||
#define STAGE(name) \
|
||||
static void name##_k(size_t x, size_t limit, void* ctx, K* k, \
|
||||
static void name##_k(size_t& x, size_t limit, void* ctx, K* k, \
|
||||
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
|
||||
C void name(size_t x, size_t limit, void* ctx, K* k, \
|
||||
F r, F g, F b, F a, F dr, F dg, F db, F da) { \
|
||||
name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \
|
||||
done (x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \
|
||||
} \
|
||||
static void name##_k(size_t x, size_t limit, void* ctx, K* k, \
|
||||
static void name##_k(size_t& x, size_t limit, void* ctx, K* k, \
|
||||
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
|
||||
|
||||
// We can now define Stages!
|
||||
@ -144,11 +144,15 @@ C void done(size_t, size_t, void*, K*, F,F,F,F, F,F,F,F);
|
||||
// - lambdas;
|
||||
// - memcpy() with a compile-time constant size argument.
|
||||
|
||||
STAGE(inc_x) {
|
||||
x += sizeof(F) / sizeof(float);
|
||||
}
|
||||
|
||||
STAGE(clear) {
|
||||
r = g = b = a = 0;
|
||||
}
|
||||
|
||||
STAGE(plus) {
|
||||
STAGE(plus_) {
|
||||
r = r + dr;
|
||||
g = g + dg;
|
||||
b = b + db;
|
||||
|
336
src/splicer/SkSplicer_stages_lowp.cpp
Normal file
336
src/splicer/SkSplicer_stages_lowp.cpp
Normal file
@ -0,0 +1,336 @@
|
||||
/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
// This file is very similar to SkSplicer_stages.cpp, and you will want to read through that file
|
||||
// first before trying to understand this one. We'll note only key differences here.
|
||||
|
||||
#include "SkSplicer_shared.h"
|
||||
#include <string.h>
|
||||
|
||||
#if !defined(__clang__)
|
||||
#error This file is not like the rest of Skia. It must be compiled with clang.
|
||||
#endif
|
||||
|
||||
#if defined(__aarch64__)
|
||||
#include <arm_neon.h>
|
||||
|
||||
// In this file, F is a vector of SkFixed15.
|
||||
// See SkFixed15.h for notes on its various operations.
|
||||
struct F {
|
||||
using V = uint16_t __attribute__((ext_vector_type(8)));
|
||||
|
||||
V vec;
|
||||
|
||||
F(uint16x8_t v) : vec(v) {}
|
||||
operator V() const { return vec; }
|
||||
|
||||
F() = default;
|
||||
F(uint16_t v) : vec(v) {}
|
||||
|
||||
F operator+(F o) const { return vqaddq_u16(vec, o.vec); }
|
||||
F operator-(F o) const { return vqsubq_u16(vec, o.vec); }
|
||||
F operator*(F o) const {
|
||||
return vsraq_n_u16(vabsq_s16(vqrdmulhq_s16(vec, o.vec)),
|
||||
vandq_s16(vec, o.vec), 15);
|
||||
}
|
||||
F operator>>(int k) const { return vec >> k; }
|
||||
F operator<<(int k) const { return vec << k; }
|
||||
};
|
||||
static F min(F a, F b) { return vminq_u16(a,b); }
|
||||
static F max(F a, F b) { return vmaxq_u16(a,b); }
|
||||
|
||||
#elif defined(__ARM_NEON__)
|
||||
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
|
||||
#error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
|
||||
#endif
|
||||
#include <arm_neon.h>
|
||||
|
||||
struct F {
|
||||
using V = uint16_t __attribute__((ext_vector_type(4)));
|
||||
|
||||
V vec;
|
||||
|
||||
F(uint16x4_t v) : vec(v) {}
|
||||
operator V() const { return vec; }
|
||||
|
||||
F() = default;
|
||||
F(uint16_t v) : vec(v) {}
|
||||
|
||||
F operator+(F o) const { return vqadd_u16(vec, o.vec); }
|
||||
F operator-(F o) const { return vqsub_u16(vec, o.vec); }
|
||||
F operator*(F o) const {
|
||||
return vsra_n_u16(vabs_s16(vqrdmulh_s16(vec, o.vec)),
|
||||
vand_s16(vec, o.vec), 15);
|
||||
}
|
||||
F operator>>(int k) const { return vec >> k; }
|
||||
F operator<<(int k) const { return vec << k; }
|
||||
};
|
||||
static F min(F a, F b) { return vmin_u16(a,b); }
|
||||
static F max(F a, F b) { return vmax_u16(a,b); }
|
||||
|
||||
#else
|
||||
#if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
|
||||
#error On x86, compile with -mavx2 -mfma -mf16c.
|
||||
#endif
|
||||
#include <immintrin.h>
|
||||
|
||||
struct F {
|
||||
using V = uint16_t __attribute__((ext_vector_type(16)));
|
||||
|
||||
V vec;
|
||||
|
||||
F(__m256 v) : vec(v) {}
|
||||
operator V() const { return vec; }
|
||||
|
||||
F() = default;
|
||||
F(uint16_t v) : vec(v) {}
|
||||
|
||||
F operator+(F o) const { return _mm256_adds_epu16(vec, o.vec); }
|
||||
F operator-(F o) const { return _mm256_subs_epu16(vec, o.vec); }
|
||||
F operator*(F o) const { return _mm256_abs_epi16(_mm256_mulhrs_epi16(vec, o.vec)); }
|
||||
F operator>>(int k) const { return vec >> k; }
|
||||
F operator<<(int k) const { return vec << k; }
|
||||
};
|
||||
static F min(F a, F b) { return _mm256_min_epu16(a,b); }
|
||||
static F max(F a, F b) { return _mm256_max_epu16(a,b); }
|
||||
#endif
|
||||
|
||||
// No platform actually supports FMA for SkFixed15.
|
||||
// This fma() method just makes it easier to port stages to lowp.
|
||||
static F fma(F f, F m, F a) { return f*m+a; }
|
||||
|
||||
#if defined(__ARM_NEON__)
|
||||
#define C extern "C" __attribute__((pcs("aapcs-vfp")))
|
||||
#else
|
||||
#define C extern "C"
|
||||
#endif
|
||||
|
||||
// We use a set of constants suitable for SkFixed15 math.
|
||||
using K = const SkSplicer_constants_lowp;
|
||||
using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);
|
||||
|
||||
// The armv7 aapcs-vfp calling convention makes us pass F::V instead of F if we want them in
|
||||
// registers. This shouldn't affect performance or how you write STAGEs in any way.
|
||||
C void done(size_t, size_t, void*, K*, F::V,F::V,F::V,F::V, F::V,F::V,F::V,F::V);
|
||||
|
||||
#define STAGE(name) \
|
||||
static void name##_k(size_t& x, size_t limit, void* ctx, K* k, \
|
||||
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
|
||||
C void name##_lowp(size_t x, size_t limit, void* ctx, K* k, \
|
||||
F::V R, F::V G, F::V B, F::V A, \
|
||||
F::V DR, F::V DG, F::V DB, F::V DA) { \
|
||||
F r = R, g = G, b = B, a = A, dr = DR, dg = DG, db = DB, da = DA; \
|
||||
name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \
|
||||
done (x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \
|
||||
} \
|
||||
static void name##_k(size_t& x, size_t limit, void* ctx, K* k, \
|
||||
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
|
||||
|
||||
STAGE(inc_x) {
|
||||
x += sizeof(F) / sizeof(uint16_t);
|
||||
}
|
||||
|
||||
STAGE(clear) {
|
||||
r = g = b = a = 0;
|
||||
}
|
||||
|
||||
STAGE(plus_) {
|
||||
r = r + dr;
|
||||
g = g + dg;
|
||||
b = b + db;
|
||||
a = a + da;
|
||||
}
|
||||
|
||||
STAGE(srcover) {
|
||||
auto A = F(k->_1) - a;
|
||||
r = fma(dr, A, r);
|
||||
g = fma(dg, A, g);
|
||||
b = fma(db, A, b);
|
||||
a = fma(da, A, a);
|
||||
}
|
||||
STAGE(dstover) { srcover_k(x,limit,ctx,k, dr,dg,db,da, r,g,b,a); }
|
||||
|
||||
STAGE(clamp_1) {
|
||||
r = min(r, k->_1);
|
||||
g = min(g, k->_1);
|
||||
b = min(b, k->_1);
|
||||
a = min(a, k->_1);
|
||||
}
|
||||
|
||||
STAGE(clamp_a) {
|
||||
a = min(a, k->_1);
|
||||
r = min(r, a);
|
||||
g = min(g, a);
|
||||
b = min(b, a);
|
||||
}
|
||||
|
||||
STAGE(swap) {
|
||||
auto swap = [](F& v, F& dv) {
|
||||
auto tmp = v;
|
||||
v = dv;
|
||||
dv = tmp;
|
||||
};
|
||||
swap(r, dr);
|
||||
swap(g, dg);
|
||||
swap(b, db);
|
||||
swap(a, da);
|
||||
}
|
||||
STAGE(move_src_dst) {
|
||||
dr = r;
|
||||
dg = g;
|
||||
db = b;
|
||||
da = a;
|
||||
}
|
||||
STAGE(move_dst_src) {
|
||||
r = dr;
|
||||
g = dg;
|
||||
b = db;
|
||||
a = da;
|
||||
}
|
||||
|
||||
STAGE(premul) {
|
||||
r = r * a;
|
||||
g = g * a;
|
||||
b = b * a;
|
||||
}
|
||||
|
||||
STAGE(load_8888) {
|
||||
auto ptr = *(const uint32_t**)ctx + x;
|
||||
|
||||
#if defined(__aarch64__)
|
||||
auto to_fixed15 = [](uint8x8_t u8) {
|
||||
// u8 * (32768/255) == u8 * 128.50196... == u8*128 + u8/2 + (u8+1)>>8 ( see SkFixed15.h)
|
||||
//
|
||||
// Here we do (u8*128 <rounding +> u8/2), which is the same as our canonical math for 0
|
||||
// and 255, and never off by more than 1 in between. Thanks to NEON, it's 2 instructions!
|
||||
auto u16 = vshll_n_u8(u8, 7); // u16 = u8*128
|
||||
return vrsraq_n_u16(u16, u16, 8); // u16 + u16/256, with rounding
|
||||
};
|
||||
|
||||
uint8x8x4_t rgba = vld4_u8((const uint8_t*)ptr);
|
||||
r = to_fixed15(rgba.val[0]);
|
||||
g = to_fixed15(rgba.val[1]);
|
||||
b = to_fixed15(rgba.val[2]);
|
||||
a = to_fixed15(rgba.val[3]);
|
||||
|
||||
#elif defined(__ARM_NEON__)
|
||||
auto to_fixed15 = [](uint8x8_t u8) {
|
||||
// Same as aarch64, but only keeping the bottom 4 lanes.
|
||||
auto u16 = vshll_n_u8(u8, 7);
|
||||
return vget_low_u16(vrsraq_n_u16(u16, u16, 8));
|
||||
};
|
||||
|
||||
// I can't get quite the code generation I want using vld4_lane_u8(),
|
||||
// so we're going to drop into assembly to do the loads. :/
|
||||
|
||||
uint8x8_t R,G,B,A;
|
||||
asm("vld4.8 {%1[0],%2[0],%3[0],%4[0]}, [%0]!\n"
|
||||
"vld4.8 {%1[1],%2[1],%3[1],%4[1]}, [%0]!\n"
|
||||
"vld4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
|
||||
"vld4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
|
||||
: "+r"(ptr), "=w"(R), "=w"(G), "=w"(B), "=w"(A));
|
||||
r = to_fixed15(R);
|
||||
g = to_fixed15(G);
|
||||
b = to_fixed15(B);
|
||||
a = to_fixed15(A);
|
||||
|
||||
#else
|
||||
auto to_fixed15 = [k](__m128i u8) {
|
||||
F u16 = _mm256_cvtepu8_epi16(u8);
|
||||
return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8);
|
||||
};
|
||||
|
||||
// TODO: shorter, more confusing, faster with 256-bit loads and shuffles
|
||||
|
||||
// Load 16 interplaced pixels.
|
||||
auto _0123 = _mm_loadu_si128((const __m128i*)ptr + 0),
|
||||
_4567 = _mm_loadu_si128((const __m128i*)ptr + 1),
|
||||
_89AB = _mm_loadu_si128((const __m128i*)ptr + 2),
|
||||
_CDEF = _mm_loadu_si128((const __m128i*)ptr + 3);
|
||||
|
||||
// We've got an awful lot of unpacking to do to transpose this...
|
||||
auto _0415 = _mm_unpacklo_epi8(_0123, _4567), // r04 g04 b04 a04 r15 g15 b15 a15
|
||||
_2637 = _mm_unpackhi_epi8(_0123, _4567), // r26 g26 b26 a26 r37 g37 b37 a37
|
||||
_8C9D = _mm_unpacklo_epi8(_89AB, _CDEF),
|
||||
_AEBF = _mm_unpackhi_epi8(_89AB, _CDEF);
|
||||
|
||||
auto _0246 = _mm_unpacklo_epi8(_0415, _2637), // r0246 g0246 b0246 a0246
|
||||
_1357 = _mm_unpackhi_epi8(_0415, _2637), // r1357 g1357 b1357 a1357
|
||||
_8ACE = _mm_unpacklo_epi8(_8C9D, _AEBF),
|
||||
_9BDF = _mm_unpackhi_epi8(_8C9D, _AEBF);
|
||||
|
||||
auto rg_01234567 = _mm_unpacklo_epi8(_0246, _1357), // r01234567 g01234567
|
||||
ba_01234567 = _mm_unpackhi_epi8(_0246, _1357), // b01234567 a01234567
|
||||
rg_89ABCDEF = _mm_unpacklo_epi8(_8ACE, _9BDF), // r89ABCDEF g89ABCDEF
|
||||
ba_89ABCDEF = _mm_unpackhi_epi8(_8ACE, _9BDF); // b89ABCDEF a89ABCDEF
|
||||
|
||||
r = to_fixed15(_mm_unpacklo_epi64(rg_01234567, rg_89ABCDEF));
|
||||
g = to_fixed15(_mm_unpackhi_epi64(rg_01234567, rg_89ABCDEF));
|
||||
b = to_fixed15(_mm_unpacklo_epi64(ba_01234567, ba_89ABCDEF));
|
||||
a = to_fixed15(_mm_unpackhi_epi64(ba_01234567, ba_89ABCDEF));
|
||||
#endif
|
||||
}
|
||||
|
||||
STAGE(store_8888) {
|
||||
auto ptr = *(uint32_t**)ctx + x;
|
||||
|
||||
#if defined(__aarch64__)
|
||||
auto from_fixed15 = [](F v) {
|
||||
// The canonical math for this from SkFixed15.h is (v - (v>>8)) >> 7.
|
||||
// But what's really most important is that all bytes round trip.
|
||||
|
||||
// We can do this in NEON in one instruction, a saturating narrowing right shift:
|
||||
return vqshrn_n_u16(v, 7);
|
||||
};
|
||||
|
||||
uint8x8x4_t rgba = {{
|
||||
from_fixed15(r),
|
||||
from_fixed15(g),
|
||||
from_fixed15(b),
|
||||
from_fixed15(a),
|
||||
}};
|
||||
vst4_u8((uint8_t*)ptr, rgba);
|
||||
#elif defined(__ARM_NEON__)
|
||||
auto from_fixed15 = [](F v) {
|
||||
// Same as aarch64, but first we need to pad our vectors from 8 to 16 bytes.
|
||||
F whatever;
|
||||
return vqshrn_n_u16(vcombine_u8(v, whatever), 7);
|
||||
};
|
||||
|
||||
// As in load_8888, I can't get quite the ideal code generation using vst4_lane_u8().
|
||||
asm("vst4.8 {%1[0],%2[0],%3[0],%4[0]}, [%0]!\n"
|
||||
"vst4.8 {%1[1],%2[1],%3[1],%4[1]}, [%0]!\n"
|
||||
"vst4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
|
||||
"vst4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
|
||||
: "+r"(ptr)
|
||||
: "w"(from_fixed15(r)), "w"(from_fixed15(g)), "w"(from_fixed15(b)), "w"(from_fixed15(a))
|
||||
: "memory");
|
||||
|
||||
#else
|
||||
auto from_fixed15 = [](F v) {
|
||||
// See the note in aarch64's from_fixed15(). The same roundtrip goal applies here.
|
||||
// Here we take a different approach: (v saturated+ v) >> 8.
|
||||
v = (v+v) >> 8;
|
||||
return _mm_packus_epi16(_mm256_extracti128_si256(v, 0),
|
||||
_mm256_extracti128_si256(v, 1));
|
||||
};
|
||||
|
||||
auto R = from_fixed15(r),
|
||||
G = from_fixed15(g),
|
||||
B = from_fixed15(b),
|
||||
A = from_fixed15(a);
|
||||
|
||||
auto rg_01234567 = _mm_unpacklo_epi8(R,G), // rg0 rg1 rg2 ... rg7
|
||||
rg_89ABCDEF = _mm_unpackhi_epi8(R,G), // rg8 rg9 rgA ... rgF
|
||||
ba_01234567 = _mm_unpacklo_epi8(B,A),
|
||||
ba_89ABCDEF = _mm_unpackhi_epi8(B,A);
|
||||
_mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi16(rg_01234567, ba_01234567));
|
||||
_mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi16(rg_01234567, ba_01234567));
|
||||
_mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi16(rg_89ABCDEF, ba_89ABCDEF));
|
||||
_mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi16(rg_89ABCDEF, ba_89ABCDEF));
|
||||
#endif
|
||||
}
|
@ -15,6 +15,9 @@ hsw = '-mavx2 -mfma -mf16c'.split()
|
||||
subprocess.check_call(['clang++'] + cflags + hsw +
|
||||
['-c', 'src/splicer/SkSplicer_stages.cpp'] +
|
||||
['-o', 'hsw.o'])
|
||||
subprocess.check_call(['clang++'] + cflags + hsw +
|
||||
['-c', 'src/splicer/SkSplicer_stages_lowp.cpp'] +
|
||||
['-o', 'hsw_lowp.o'])
|
||||
|
||||
aarch64 = [
|
||||
'--target=aarch64-linux-android',
|
||||
@ -24,6 +27,9 @@ aarch64 = [
|
||||
subprocess.check_call(['clang++'] + cflags + aarch64 +
|
||||
['-c', 'src/splicer/SkSplicer_stages.cpp'] +
|
||||
['-o', 'aarch64.o'])
|
||||
subprocess.check_call(['clang++'] + cflags + aarch64 +
|
||||
['-c', 'src/splicer/SkSplicer_stages_lowp.cpp'] +
|
||||
['-o', 'aarch64_lowp.o'])
|
||||
|
||||
armv7 = [
|
||||
'--target=arm-linux-androideabi',
|
||||
@ -35,8 +41,11 @@ armv7 = [
|
||||
subprocess.check_call(['clang++'] + cflags + armv7 +
|
||||
['-c', 'src/splicer/SkSplicer_stages.cpp'] +
|
||||
['-o', 'armv7.o'])
|
||||
subprocess.check_call(['clang++'] + cflags + armv7 +
|
||||
['-c', 'src/splicer/SkSplicer_stages_lowp.cpp'] +
|
||||
['-o', 'armv7_lowp.o'])
|
||||
|
||||
def parse_object_file(dot_o, array_type, done, target=None):
|
||||
def parse_object_file(dst, dot_o, array_type, done, target=None):
|
||||
cmd = ['gobjdump', '-d', dot_o]
|
||||
if target:
|
||||
cmd += ['--target', target]
|
||||
@ -48,7 +57,7 @@ def parse_object_file(dot_o, array_type, done, target=None):
|
||||
# E.g. 00000000000003a4 <_load_f16>:
|
||||
m = re.match('''[0-9a-f]+ <_?(.*)>:''', line)
|
||||
if m:
|
||||
print 'static const', array_type, 'kSplice_' + m.group(1) + '[] = {'
|
||||
print >>dst,'static const', array_type, 'kSplice_' + m.group(1) + '[] = {'
|
||||
continue
|
||||
|
||||
columns = line.split('\t')
|
||||
@ -65,33 +74,35 @@ def parse_object_file(dot_o, array_type, done, target=None):
|
||||
assert 'rip' not in arg # TODO: detect on aarch64 too
|
||||
|
||||
if code == done:
|
||||
print '};'
|
||||
print >>dst,'};'
|
||||
continue
|
||||
|
||||
hexed = ''.join('0x'+x+',' for x in code.split(' '))
|
||||
print ' ' + hexed + ' '*(44-len(hexed)) + \
|
||||
'// ' + inst + ' '*(14-len(inst)) + args
|
||||
print >>dst,' ' + hexed + ' '*(44-len(hexed)) + \
|
||||
'// ' + inst + ' '*(14-len(inst)) + args
|
||||
|
||||
print '''/*
|
||||
for suffix in ['', '_lowp']:
|
||||
with open('src/splicer/SkSplicer_generated%s.h' % suffix, 'w') as f:
|
||||
print >>f,'''/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#ifndef SkSplicer_generated_DEFINED
|
||||
#define SkSplicer_generated_DEFINED
|
||||
#ifndef SkSplicer_generated%s_DEFINED
|
||||
#define SkSplicer_generated%s_DEFINED
|
||||
|
||||
// This file is generated semi-automatically with this command:
|
||||
// $ src/splicer/build_stages.py > src/splicer/SkSplicer_generated.h
|
||||
// $ src/splicer/build_stages.py
|
||||
|
||||
#if defined(__aarch64__)
|
||||
'''
|
||||
parse_object_file('aarch64.o', 'unsigned int', '14000000')
|
||||
print '\n#elif defined(__ARM_NEON__)\n'
|
||||
parse_object_file('armv7.o', 'unsigned int', 'eafffffe',
|
||||
target='elf32-littlearm')
|
||||
print '\n#else\n'
|
||||
parse_object_file('hsw.o', 'unsigned char', 'e9 00 00 00 00')
|
||||
print '\n#endif\n'
|
||||
print '#endif//SkSplicer_generated_DEFINED'
|
||||
''' % (suffix, suffix)
|
||||
parse_object_file(f, 'aarch64%s.o' % suffix, 'unsigned int', '14000000')
|
||||
print >>f,'\n#elif defined(__ARM_NEON__)\n'
|
||||
parse_object_file(f, 'armv7%s.o' % suffix, 'unsigned int', 'eafffffe',
|
||||
target='elf32-littlearm')
|
||||
print >>f,'\n#else\n'
|
||||
parse_object_file(f, 'hsw%s.o' % suffix, 'unsigned char', 'e9 00 00 00 00')
|
||||
print >>f,'\n#endif\n'
|
||||
print >>f,'#endif//SkSplicer_generated%s_DEFINED' % suffix
|
||||
|
Loading…
Reference in New Issue
Block a user