SkSplicer stage parity

I noticed scale_u8 is implemented in SkSplicer_stages but not _lowp.
That's not for any good reason... scale_u8 makes fine sense in _lowp.

All other stages missing in _lowp are nuts to attempt without floats.

This also renames the to_fixed15 lambdas to from_u8 functions. Everything
in this file converts to or from fixed15, so the interesting half of each
name is the other format.  Similarly, from_fixed15 becomes to_u8.
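
In scalar terms (a sketch, assuming <cstdint>; SkFixed15 keeps [0,1] as
0..32768), the two conversions are:

    uint16_t from_u8(uint8_t u8) {             // u8 * 32768/255
        return u8*128 + u8/2 + ((u8+1) >> 8);
    }
    uint8_t to_u8(uint16_t v) {                // picked so all 256 bytes round trip
        return (uint8_t)((v - (v >> 8)) >> 7);
    }

e.g. from_u8(255) == 32768, and to_u8(from_u8(x)) == x for every byte x.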

Change-Id: I10616b6772c65bd1acb9857f4f5b5f70a4f01bf4
Reviewed-on: https://skia-review.googlesource.com/7323
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Mike Klein 2017-01-19 22:29:09 -05:00 committed by Skia Commit-Bot
parent 48f770c43f
commit ba294d930a
4 changed files with 148 additions and 48 deletions


@@ -256,6 +256,7 @@ namespace {
CASE(move_src_dst);
CASE(move_dst_src);
CASE(premul);
CASE(scale_u8);
CASE(load_8888);
CASE(store_8888);
#undef CASE
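
The CASE list above is the splicer's dispatch table, so adding CASE(scale_u8) is the only
change needed here to make the new stage reachable.  Purely as a hypothetical sketch of
the shape such a macro can take (the real macro in this file may differ; splice_bytes and
buf are made-up names):

    #define CASE(stage)                                                    \
        case SkRasterPipeline::stage:                                      \
            splice_bytes(buf, kSplice_##stage, sizeof(kSplice_##stage));   \
            break

With that, CASE(scale_u8) would copy the pre-assembled bytes for the stage, such as the
kSplice_scale_u8_lowp arrays below.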


@@ -134,6 +134,28 @@ static const unsigned int kSplice_premul_lowp[] = {
0x4e60ba02, // abs v2.8h, v16.8h
0x6f111622, // usra v2.8h, v17.8h, #15
};
static const unsigned int kSplice_scale_u8_lowp[] = {
0xf9400048, // ldr x8, [x2]
0xfc606910, // ldr d16, [x8,x0]
0x2f0fa610, // ushll v16.8h, v16.8b, #7
0x6f183610, // ursra v16.8h, v16.8h, #8
0x6e70b411, // sqrdmulh v17.8h, v0.8h, v16.8h
0x6e70b433, // sqrdmulh v19.8h, v1.8h, v16.8h
0x6e70b455, // sqrdmulh v21.8h, v2.8h, v16.8h
0x6e70b477, // sqrdmulh v23.8h, v3.8h, v16.8h
0x4e201e12, // and v18.16b, v16.16b, v0.16b
0x4e211e14, // and v20.16b, v16.16b, v1.16b
0x4e221e16, // and v22.16b, v16.16b, v2.16b
0x4e231e10, // and v16.16b, v16.16b, v3.16b
0x4e60ba20, // abs v0.8h, v17.8h
0x4e60ba61, // abs v1.8h, v19.8h
0x4e60baa2, // abs v2.8h, v21.8h
0x4e60bae3, // abs v3.8h, v23.8h
0x6f111640, // usra v0.8h, v18.8h, #15
0x6f111681, // usra v1.8h, v20.8h, #15
0x6f1116c2, // usra v2.8h, v22.8h, #15
0x6f111603, // usra v3.8h, v16.8h, #15
};
static const unsigned int kSplice_load_8888_lowp[] = {
0xf9400048, // ldr x8, [x2]
0x8b000908, // add x8, x8, x0, lsl #2
@@ -280,6 +302,29 @@ static const unsigned int kSplice_premul_lowp[] = {
0xf3911134, // vsra.u16 d1, d20, #15
0xf3912130, // vsra.u16 d2, d16, #15
};
static const unsigned int kSplice_scale_u8_lowp[] = {
0xe592c000, // ldr ip, [r2]
0xe08cc000, // add ip, ip, r0
0xf4ec0c8f, // vld1.32 {d16[]}, [ip]
0xf3cf0a30, // vshll.u8 q8, d16, #7
0xf3d80370, // vrsra.u16 q8, q8, #8
0xf3502b20, // vqrdmulh.s16 d18, d0, d16
0xf3513b20, // vqrdmulh.s16 d19, d1, d16
0xf3524b20, // vqrdmulh.s16 d20, d2, d16
0xf3535b20, // vqrdmulh.s16 d21, d3, d16
0xf2406190, // vand d22, d16, d0
0xf3b50322, // vabs.s16 d0, d18
0xf2407191, // vand d23, d16, d1
0xf2402192, // vand d18, d16, d2
0xf2400193, // vand d16, d16, d3
0xf3b51323, // vabs.s16 d1, d19
0xf3b52324, // vabs.s16 d2, d20
0xf3b53325, // vabs.s16 d3, d21
0xf3910136, // vsra.u16 d0, d22, #15
0xf3911137, // vsra.u16 d1, d23, #15
0xf3912132, // vsra.u16 d2, d18, #15
0xf3913130, // vsra.u16 d3, d16, #15
};
static const unsigned int kSplice_load_8888_lowp[] = {
0xe592c000, // ldr ip, [r2]
0xe08cc100, // add ip, ip, r0, lsl #2
@@ -410,6 +455,25 @@ static const unsigned char kSplice_premul_lowp[] = {
0xc4,0xe2,0x6d,0x0b,0xd3, // vpmulhrsw %ymm3,%ymm2,%ymm2
0xc4,0xe2,0x7d,0x1d,0xd2, // vpabsw %ymm2,%ymm2
};
static const unsigned char kSplice_scale_u8_lowp[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
0xc4,0x62,0x7d,0x30,0x04,0x38, // vpmovzxbw (%rax,%rdi,1),%ymm8
0xc4,0xc1,0x35,0x71,0xf0,0x07, // vpsllw $0x7,%ymm8,%ymm9
0xc4,0xc1,0x2d,0x71,0xd0,0x01, // vpsrlw $0x1,%ymm8,%ymm10
0xc4,0x41,0x35,0xdd,0xca, // vpaddusw %ymm10,%ymm9,%ymm9
0xc4,0x62,0x7d,0x79,0x11, // vpbroadcastw (%rcx),%ymm10
0xc4,0x41,0x3d,0xdd,0xc2, // vpaddusw %ymm10,%ymm8,%ymm8
0xc4,0xc1,0x3d,0x71,0xd0,0x08, // vpsrlw $0x8,%ymm8,%ymm8
0xc4,0x41,0x35,0xdd,0xc0, // vpaddusw %ymm8,%ymm9,%ymm8
0xc4,0xc2,0x7d,0x0b,0xc0, // vpmulhrsw %ymm8,%ymm0,%ymm0
0xc4,0xe2,0x7d,0x1d,0xc0, // vpabsw %ymm0,%ymm0
0xc4,0xc2,0x75,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm1,%ymm1
0xc4,0xe2,0x7d,0x1d,0xc9, // vpabsw %ymm1,%ymm1
0xc4,0xc2,0x6d,0x0b,0xd0, // vpmulhrsw %ymm8,%ymm2,%ymm2
0xc4,0xe2,0x7d,0x1d,0xd2, // vpabsw %ymm2,%ymm2
0xc4,0xc2,0x65,0x0b,0xd8, // vpmulhrsw %ymm8,%ymm3,%ymm3
0xc4,0xe2,0x7d,0x1d,0xdb, // vpabsw %ymm3,%ymm3
};
static const unsigned char kSplice_load_8888_lowp[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
0xc5,0xfa,0x6f,0x04,0xb8, // vmovdqu (%rax,%rdi,4),%xmm0


@@ -12,6 +12,9 @@
#error This file is not like the rest of Skia. It must be compiled with clang.
#endif
// It's tricky to relocate code referencing ordinary constants, so we read them from this struct.
using K = const SkSplicer_constants;
#if defined(__aarch64__)
#include <arm_neon.h>
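
In code terms, that is why constants are read through K: a load through the k pointer is
relative to a register that is passed in, so a stage's bytes can be copied anywhere and
still work, whereas a literal may be materialized as a PC-relative load from .rodata that
dangles once the bytes move.  A hypothetical fragment (the field name is assumed):

    F one = k->_1;     // load via the k register: splices safely
    // F one = 1.0f;   // may become a PC-relative constant-pool load, broken when spliced
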
@@ -95,7 +98,6 @@ static T unaligned_load(const P* p) {
#endif
// Stages all fit a common interface that allows SkSplicer to splice them together.
using K = const SkSplicer_constants;
using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);
// Stage's arguments act as the working set of registers within the final spliced function.
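
Any function with exactly this signature can serve as a stage.  A minimal hypothetical
example (the actual stages are declared through a STAGE macro, as in the _lowp file below):

    static void noop_stage(size_t x, size_t limit, void* ctx, K* k,
                           F r, F g, F b, F a, F dr, F dg, F db, F da) { /* do nothing */ }
    static Stage* check = noop_stage;   // compiles only because the signatures match

The splicer concatenates the compiled bodies of such stages, so the eight F parameters act
like persistent registers from one stage to the next.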


@@ -15,9 +15,14 @@
#error This file is not like the rest of Skia. It must be compiled with clang.
#endif
// We use a set of constants suitable for SkFixed15 math.
using K = const SkSplicer_constants_lowp;
#if defined(__aarch64__)
#include <arm_neon.h>
using U8 = uint8_t __attribute__((ext_vector_type(8)));
// In this file, F is a vector of SkFixed15.
// See SkFixed15.h for notes on its various operations.
struct F {
@@ -43,12 +48,24 @@
static F min(F a, F b) { return vminq_u16(a,b); }
static F max(F a, F b) { return vmaxq_u16(a,b); }
static F from_u8(U8 u8, K*) {
// u8 * (32768/255) == u8 * 128.50196... == u8*128 + u8/2 + (u8+1)>>8
//
// Here we do (u8*128 <rounding +> u8/2), which is correct for 0 and 255,
// and never off by more than 1 anywhere. It's just 2 instructions in NEON:
auto u16 = vshll_n_u8(u8, 7); // u16 = u8*128
u16 = vrsraq_n_u16(u16, u16, 8); // u16 += u16/256, with rounding
return u16;
};
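// Illustrative aside, not part of this file: a scalar spot-check of the claim above
// (assumes <cassert>).  The two NEON instructions compute u8*128 plus a rounding add
// of u8/2; that matches the canonical u8*128 + u8/2 + (u8+1)>>8 at 0 and 255, and is
// exactly 1 high for each odd byte in between.
static void check_from_u8_rounding() {
    for (int u8 = 0; u8 < 256; u8++) {
        int canonical = u8*128 + u8/2 + ((u8+1) >> 8);    // stock SkFixed15 formula
        int neon      = u8*128 + ((u8*128 + 128) >> 8);   // what vshll + vrsra compute
        assert(neon - canonical == (u8 % 2 != 0 && u8 != 255 ? 1 : 0));
    }
}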
#elif defined(__ARM_NEON__)
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
#error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
#endif
#include <arm_neon.h>
using U8 = uint8_t __attribute__((ext_vector_type(8))); // But, only low 4 lanes active.
struct F {
using V = uint16_t __attribute__((ext_vector_type(4)));
@@ -72,12 +89,20 @@
static F min(F a, F b) { return vmin_u16(a,b); }
static F max(F a, F b) { return vmax_u16(a,b); }
static F from_u8(U8 u8, K*) {
auto u16 = vshll_n_u8(u8, 7); // Identical to aarch64...
u16 = vrsraq_n_u16(u16, u16, 8); //
return vget_low_u16(u16); // ...but only the low 4 lanes are active.
}
#else
#if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
#error On x86, compile with -mavx2 -mfma -mf16c.
#endif
#include <immintrin.h>
using U8 = uint8_t __attribute__((ext_vector_type(16)));
struct F {
using V = uint16_t __attribute__((ext_vector_type(16)));
@@ -97,20 +122,31 @@
};
static F min(F a, F b) { return _mm256_min_epu16(a,b); }
static F max(F a, F b) { return _mm256_max_epu16(a,b); }
static F from_u8(U8 u8, K* k) {
// Nothing too interesting here. We follow the stock SkFixed15 formula.
F u16 = _mm256_cvtepu8_epi16(u8);
return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8);
}
#endif
// No platform actually supports FMA for SkFixed15.
// This fma() method just makes it easier to port stages to lowp.
static F fma(F f, F m, F a) { return f*m+a; }
template <typename T, typename P>
static T unaligned_load(const P* p) {
T v;
memcpy(&v, p, sizeof(v));
return v;
}
#if defined(__ARM_NEON__)
#define C extern "C" __attribute__((pcs("aapcs-vfp")))
#else
#define C extern "C"
#endif
// We use a set of constants suitable for SkFixed15 math.
using K = const SkSplicer_constants_lowp;
using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);
// The armv7 aapcs-vfp calling convention makes us pass F::V instead of F if we want them in
@@ -198,32 +234,34 @@ STAGE(premul) {
b = b * a;
}
STAGE(scale_u8) {
auto ptr = *(const uint8_t**)ctx + x;
#if defined(__ARM_NEON__)
// On armv7, U8 can fit 8 bytes, but we only want to load 4.
U8 scales = vdup_n_u32(unaligned_load<uint32_t>(ptr));
#else
U8 scales = unaligned_load<U8>(ptr);
#endif
auto c = from_u8(scales, k);
r = r * c;
g = g * c;
b = b * c;
a = a * c;
}
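// Illustrative aside, not part of this file: per pixel this stage computes
// channel' = channel * scale/255 in SkFixed15.  A rough scalar model, assuming <cstdint>
// (the sqrdmulh-based SIMD above may differ slightly in rounding/saturation details):
static uint16_t scale_one_channel(uint16_t channel, uint8_t scale) {
    uint16_t c = scale*128 + scale/2 + ((scale+1) >> 8);        // from_u8: 0..32768
    return (uint16_t)(((uint32_t)channel*c + (1<<14)) >> 15);   // SkFixed15 multiply
}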
STAGE(load_8888) {
auto ptr = *(const uint32_t**)ctx + x;
#if defined(__aarch64__)
auto to_fixed15 = [](uint8x8_t u8) {
// u8 * (32768/255) == u8 * 128.50196... == u8*128 + u8/2 + (u8+1)>>8 (see SkFixed15.h)
//
// Here we do (u8*128 <rounding +> u8/2), which is the same as our canonical math for 0
// and 255, and never off by more than 1 in between. Thanks to NEON, it's 2 instructions!
auto u16 = vshll_n_u8(u8, 7); // u16 = u8*128
return vrsraq_n_u16(u16, u16, 8); // u16 + u16/256, with rounding
};
uint8x8x4_t rgba = vld4_u8((const uint8_t*)ptr);
r = to_fixed15(rgba.val[0]);
g = to_fixed15(rgba.val[1]);
b = to_fixed15(rgba.val[2]);
a = to_fixed15(rgba.val[3]);
r = from_u8(rgba.val[0], k);
g = from_u8(rgba.val[1], k);
b = from_u8(rgba.val[2], k);
a = from_u8(rgba.val[3], k);
#elif defined(__ARM_NEON__)
auto to_fixed15 = [](uint8x8_t u8) {
// Same as aarch64, but only keeping the bottom 4 lanes.
auto u16 = vshll_n_u8(u8, 7);
return vget_low_u16(vrsraq_n_u16(u16, u16, 8));
};
// I can't get quite the code generation I want using vld4_lane_u8(),
// so we're going to drop into assembly to do the loads. :/
@@ -233,17 +271,12 @@ STAGE(load_8888) {
"vld4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
"vld4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
: "+r"(ptr), "=w"(R), "=w"(G), "=w"(B), "=w"(A));
r = to_fixed15(R);
g = to_fixed15(G);
b = to_fixed15(B);
a = to_fixed15(A);
r = from_u8(R, k);
g = from_u8(G, k);
b = from_u8(B, k);
a = from_u8(A, k);
#else
auto to_fixed15 = [k](__m128i u8) {
F u16 = _mm256_cvtepu8_epi16(u8);
return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8);
};
// TODO: shorter, more confusing, faster with 256-bit loads and shuffles
// Load 16 interleaved pixels.
@@ -268,10 +301,10 @@ STAGE(load_8888) {
rg_89ABCDEF = _mm_unpacklo_epi8(_8ACE, _9BDF), // r89ABCDEF g89ABCDEF
ba_89ABCDEF = _mm_unpackhi_epi8(_8ACE, _9BDF); // b89ABCDEF a89ABCDEF
r = to_fixed15(_mm_unpacklo_epi64(rg_01234567, rg_89ABCDEF));
g = to_fixed15(_mm_unpackhi_epi64(rg_01234567, rg_89ABCDEF));
b = to_fixed15(_mm_unpacklo_epi64(ba_01234567, ba_89ABCDEF));
a = to_fixed15(_mm_unpackhi_epi64(ba_01234567, ba_89ABCDEF));
r = from_u8(_mm_unpacklo_epi64(rg_01234567, rg_89ABCDEF), k);
g = from_u8(_mm_unpackhi_epi64(rg_01234567, rg_89ABCDEF), k);
b = from_u8(_mm_unpacklo_epi64(ba_01234567, ba_89ABCDEF), k);
a = from_u8(_mm_unpackhi_epi64(ba_01234567, ba_89ABCDEF), k);
#endif
}
@@ -279,7 +312,7 @@ STAGE(store_8888) {
auto ptr = *(uint32_t**)ctx + x;
#if defined(__aarch64__)
auto from_fixed15 = [](F v) {
auto to_u8 = [](F v) {
// The canonical math for this from SkFixed15.h is (v - (v>>8)) >> 7.
// But what's really most important is that all bytes round trip.
@@ -288,14 +321,14 @@ STAGE(store_8888) {
};
uint8x8x4_t rgba = {{
from_fixed15(r),
from_fixed15(g),
from_fixed15(b),
from_fixed15(a),
to_u8(r),
to_u8(g),
to_u8(b),
to_u8(a),
}};
vst4_u8((uint8_t*)ptr, rgba);
#elif defined(__ARM_NEON__)
auto from_fixed15 = [](F v) {
auto to_u8 = [](F v) {
// Same as aarch64, but first we need to pad our vectors from 8 to 16 bytes.
F whatever;
return vqshrn_n_u16(vcombine_u8(v, whatever), 7);
@@ -307,22 +340,22 @@ STAGE(store_8888) {
"vst4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
"vst4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
: "+r"(ptr)
: "w"(from_fixed15(r)), "w"(from_fixed15(g)), "w"(from_fixed15(b)), "w"(from_fixed15(a))
: "w"(to_u8(r)), "w"(to_u8(g)), "w"(to_u8(b)), "w"(to_u8(a))
: "memory");
#else
auto from_fixed15 = [](F v) {
// See the note in aarch64's from_fixed15(). The same roundtrip goal applies here.
auto to_u8 = [](F v) {
// See the note in aarch64's to_u8(). The same roundtrip goal applies here.
// Here we take a different approach: (v saturated+ v) >> 8.
v = (v+v) >> 8;
return _mm_packus_epi16(_mm256_extracti128_si256(v, 0),
_mm256_extracti128_si256(v, 1));
};
auto R = from_fixed15(r),
G = from_fixed15(g),
B = from_fixed15(b),
A = from_fixed15(a);
auto R = to_u8(r),
G = to_u8(g),
B = to_u8(b),
A = to_u8(a);
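// Illustrative aside, not part of this file: a scalar check (assumes <cassert>) that the
// (v saturated+ v) >> 8 path above round-trips every byte that came in through from_u8():
#if 0
    for (int u8 = 0; u8 < 256; u8++) {
        int v   = u8*128 + u8/2 + ((u8+1) >> 8);            // from_u8(u8)
        int sum = v + v;  if (sum > 65535) { sum = 65535; } // saturating 16-bit add
        assert((sum >> 8) == u8);
    }
#endif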
auto rg_01234567 = _mm_unpacklo_epi8(R,G), // rg0 rg1 rg2 ... rg7
rg_89ABCDEF = _mm_unpackhi_epi8(R,G), // rg8 rg9 rgA ... rgF