SkSplicer stage parity
I noticed scale_u8 is implemented in SkSplicer_stages but not _lowp. That's not for any good reason... scale_u8 makes fine sense in _lowp. All other stages missing in _lowp are nuts to attempt without floats. This also renames the to_fixed15 lambdas to from_u8 functions. Everything in the file converts to or from fixed15; the interesting question is the other format. Similarly, from_fixed15 becomes to_u8. Change-Id: I10616b6772c65bd1acb9857f4f5b5f70a4f01bf4 Reviewed-on: https://skia-review.googlesource.com/7323 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@chromium.org>
This commit is contained in:
parent
48f770c43f
commit
ba294d930a
@ -256,6 +256,7 @@ namespace {
|
||||
CASE(move_src_dst);
|
||||
CASE(move_dst_src);
|
||||
CASE(premul);
|
||||
CASE(scale_u8);
|
||||
CASE(load_8888);
|
||||
CASE(store_8888);
|
||||
#undef CASE
|
||||
|
@ -134,6 +134,28 @@ static const unsigned int kSplice_premul_lowp[] = {
|
||||
0x4e60ba02, // abs v2.8h, v16.8h
|
||||
0x6f111622, // usra v2.8h, v17.8h, #15
|
||||
};
|
||||
static const unsigned int kSplice_scale_u8_lowp[] = {
|
||||
0xf9400048, // ldr x8, [x2]
|
||||
0xfc606910, // ldr d16, [x8,x0]
|
||||
0x2f0fa610, // ushll v16.8h, v16.8b, #7
|
||||
0x6f183610, // ursra v16.8h, v16.8h, #8
|
||||
0x6e70b411, // sqrdmulh v17.8h, v0.8h, v16.8h
|
||||
0x6e70b433, // sqrdmulh v19.8h, v1.8h, v16.8h
|
||||
0x6e70b455, // sqrdmulh v21.8h, v2.8h, v16.8h
|
||||
0x6e70b477, // sqrdmulh v23.8h, v3.8h, v16.8h
|
||||
0x4e201e12, // and v18.16b, v16.16b, v0.16b
|
||||
0x4e211e14, // and v20.16b, v16.16b, v1.16b
|
||||
0x4e221e16, // and v22.16b, v16.16b, v2.16b
|
||||
0x4e231e10, // and v16.16b, v16.16b, v3.16b
|
||||
0x4e60ba20, // abs v0.8h, v17.8h
|
||||
0x4e60ba61, // abs v1.8h, v19.8h
|
||||
0x4e60baa2, // abs v2.8h, v21.8h
|
||||
0x4e60bae3, // abs v3.8h, v23.8h
|
||||
0x6f111640, // usra v0.8h, v18.8h, #15
|
||||
0x6f111681, // usra v1.8h, v20.8h, #15
|
||||
0x6f1116c2, // usra v2.8h, v22.8h, #15
|
||||
0x6f111603, // usra v3.8h, v16.8h, #15
|
||||
};
|
||||
static const unsigned int kSplice_load_8888_lowp[] = {
|
||||
0xf9400048, // ldr x8, [x2]
|
||||
0x8b000908, // add x8, x8, x0, lsl #2
|
||||
@ -280,6 +302,29 @@ static const unsigned int kSplice_premul_lowp[] = {
|
||||
0xf3911134, // vsra.u16 d1, d20, #15
|
||||
0xf3912130, // vsra.u16 d2, d16, #15
|
||||
};
|
||||
static const unsigned int kSplice_scale_u8_lowp[] = {
|
||||
0xe592c000, // ldr ip, [r2]
|
||||
0xe08cc000, // add ip, ip, r0
|
||||
0xf4ec0c8f, // vld1.32 {d16[]}, [ip]
|
||||
0xf3cf0a30, // vshll.u8 q8, d16, #7
|
||||
0xf3d80370, // vrsra.u16 q8, q8, #8
|
||||
0xf3502b20, // vqrdmulh.s16 d18, d0, d16
|
||||
0xf3513b20, // vqrdmulh.s16 d19, d1, d16
|
||||
0xf3524b20, // vqrdmulh.s16 d20, d2, d16
|
||||
0xf3535b20, // vqrdmulh.s16 d21, d3, d16
|
||||
0xf2406190, // vand d22, d16, d0
|
||||
0xf3b50322, // vabs.s16 d0, d18
|
||||
0xf2407191, // vand d23, d16, d1
|
||||
0xf2402192, // vand d18, d16, d2
|
||||
0xf2400193, // vand d16, d16, d3
|
||||
0xf3b51323, // vabs.s16 d1, d19
|
||||
0xf3b52324, // vabs.s16 d2, d20
|
||||
0xf3b53325, // vabs.s16 d3, d21
|
||||
0xf3910136, // vsra.u16 d0, d22, #15
|
||||
0xf3911137, // vsra.u16 d1, d23, #15
|
||||
0xf3912132, // vsra.u16 d2, d18, #15
|
||||
0xf3913130, // vsra.u16 d3, d16, #15
|
||||
};
|
||||
static const unsigned int kSplice_load_8888_lowp[] = {
|
||||
0xe592c000, // ldr ip, [r2]
|
||||
0xe08cc100, // add ip, ip, r0, lsl #2
|
||||
@ -410,6 +455,25 @@ static const unsigned char kSplice_premul_lowp[] = {
|
||||
0xc4,0xe2,0x6d,0x0b,0xd3, // vpmulhrsw %ymm3,%ymm2,%ymm2
|
||||
0xc4,0xe2,0x7d,0x1d,0xd2, // vpabsw %ymm2,%ymm2
|
||||
};
|
||||
static const unsigned char kSplice_scale_u8_lowp[] = {
|
||||
0x48,0x8b,0x02, // mov (%rdx),%rax
|
||||
0xc4,0x62,0x7d,0x30,0x04,0x38, // vpmovzxbw (%rax,%rdi,1),%ymm8
|
||||
0xc4,0xc1,0x35,0x71,0xf0,0x07, // vpsllw $0x7,%ymm8,%ymm9
|
||||
0xc4,0xc1,0x2d,0x71,0xd0,0x01, // vpsrlw $0x1,%ymm8,%ymm10
|
||||
0xc4,0x41,0x35,0xdd,0xca, // vpaddusw %ymm10,%ymm9,%ymm9
|
||||
0xc4,0x62,0x7d,0x79,0x11, // vpbroadcastw (%rcx),%ymm10
|
||||
0xc4,0x41,0x3d,0xdd,0xc2, // vpaddusw %ymm10,%ymm8,%ymm8
|
||||
0xc4,0xc1,0x3d,0x71,0xd0,0x08, // vpsrlw $0x8,%ymm8,%ymm8
|
||||
0xc4,0x41,0x35,0xdd,0xc0, // vpaddusw %ymm8,%ymm9,%ymm8
|
||||
0xc4,0xc2,0x7d,0x0b,0xc0, // vpmulhrsw %ymm8,%ymm0,%ymm0
|
||||
0xc4,0xe2,0x7d,0x1d,0xc0, // vpabsw %ymm0,%ymm0
|
||||
0xc4,0xc2,0x75,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm1,%ymm1
|
||||
0xc4,0xe2,0x7d,0x1d,0xc9, // vpabsw %ymm1,%ymm1
|
||||
0xc4,0xc2,0x6d,0x0b,0xd0, // vpmulhrsw %ymm8,%ymm2,%ymm2
|
||||
0xc4,0xe2,0x7d,0x1d,0xd2, // vpabsw %ymm2,%ymm2
|
||||
0xc4,0xc2,0x65,0x0b,0xd8, // vpmulhrsw %ymm8,%ymm3,%ymm3
|
||||
0xc4,0xe2,0x7d,0x1d,0xdb, // vpabsw %ymm3,%ymm3
|
||||
};
|
||||
static const unsigned char kSplice_load_8888_lowp[] = {
|
||||
0x48,0x8b,0x02, // mov (%rdx),%rax
|
||||
0xc5,0xfa,0x6f,0x04,0xb8, // vmovdqu (%rax,%rdi,4),%xmm0
|
||||
|
@ -12,6 +12,9 @@
|
||||
#error This file is not like the rest of Skia. It must be compiled with clang.
|
||||
#endif
|
||||
|
||||
// It's tricky to relocate code referencing ordinary constants, so we read them from this struct.
|
||||
using K = const SkSplicer_constants;
|
||||
|
||||
#if defined(__aarch64__)
|
||||
#include <arm_neon.h>
|
||||
|
||||
@ -95,7 +98,6 @@ static T unaligned_load(const P* p) {
|
||||
#endif
|
||||
|
||||
// Stages all fit a common interface that allows SkSplicer to splice them together.
|
||||
using K = const SkSplicer_constants;
|
||||
using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);
|
||||
|
||||
// Stage's arguments act as the working set of registers within the final spliced function.
|
||||
|
@ -15,9 +15,14 @@
|
||||
#error This file is not like the rest of Skia. It must be compiled with clang.
|
||||
#endif
|
||||
|
||||
// We use a set of constants suitable for SkFixed15 math.
|
||||
using K = const SkSplicer_constants_lowp;
|
||||
|
||||
#if defined(__aarch64__)
|
||||
#include <arm_neon.h>
|
||||
|
||||
using U8 = uint8_t __attribute__((ext_vector_type(8)));
|
||||
|
||||
// In this file, F is a vector of SkFixed15.
|
||||
// See SkFixed15.h for notes on its various operations.
|
||||
struct F {
|
||||
@ -43,12 +48,24 @@
|
||||
static F min(F a, F b) { return vminq_u16(a,b); }
|
||||
static F max(F a, F b) { return vmaxq_u16(a,b); }
|
||||
|
||||
static F from_u8(U8 u8, K*) {
|
||||
// u8 * (32768/255) == u8 * 128.50196... == u8*128 + u8/2 + (u8+1)>>8
|
||||
//
|
||||
// Here we do (u8*128 <rounding +> u8/2), which is correct for 0 and 255,
|
||||
// and never off by more than 1 anywhere. It's just 2 instructions in NEON:
|
||||
auto u16 = vshll_n_u8(u8, 7); // u16 = u8*128
|
||||
u16 = vrsraq_n_u16(u16, u16, 8); // u16 += u16/256, with rounding
|
||||
return u16;
|
||||
};
|
||||
|
||||
#elif defined(__ARM_NEON__)
|
||||
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
|
||||
#error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
|
||||
#endif
|
||||
#include <arm_neon.h>
|
||||
|
||||
using U8 = uint8_t __attribute__((ext_vector_type(8))); // But, only low 4 lanes active.
|
||||
|
||||
struct F {
|
||||
using V = uint16_t __attribute__((ext_vector_type(4)));
|
||||
|
||||
@ -72,12 +89,20 @@
|
||||
static F min(F a, F b) { return vmin_u16(a,b); }
|
||||
static F max(F a, F b) { return vmax_u16(a,b); }
|
||||
|
||||
static F from_u8(U8 u8, K*) {
|
||||
auto u16 = vshll_n_u8(u8, 7); // Identical to aarch64...
|
||||
u16 = vrsraq_n_u16(u16, u16, 8); //
|
||||
return vget_low_u16(u16); // ...but only the low 4 lanes are active.
|
||||
}
|
||||
|
||||
#else
|
||||
#if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
|
||||
#error On x86, compile with -mavx2 -mfma -mf16c.
|
||||
#endif
|
||||
#include <immintrin.h>
|
||||
|
||||
using U8 = uint8_t __attribute__((ext_vector_type(16)));
|
||||
|
||||
struct F {
|
||||
using V = uint16_t __attribute__((ext_vector_type(16)));
|
||||
|
||||
@ -97,20 +122,31 @@
|
||||
};
|
||||
static F min(F a, F b) { return _mm256_min_epu16(a,b); }
|
||||
static F max(F a, F b) { return _mm256_max_epu16(a,b); }
|
||||
|
||||
static F from_u8(U8 u8, K* k) {
|
||||
// Nothing too interesting here. We follow the stock SkFixed15 formula.
|
||||
F u16 = _mm256_cvtepu8_epi16(u8);
|
||||
return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8);
|
||||
}
|
||||
#endif
|
||||
|
||||
// No platform actually supports FMA for SkFixed15.
|
||||
// This fma() method just makes it easier to port stages to lowp.
|
||||
static F fma(F f, F m, F a) { return f*m+a; }
|
||||
|
||||
template <typename T, typename P>
|
||||
static T unaligned_load(const P* p) {
|
||||
T v;
|
||||
memcpy(&v, p, sizeof(v));
|
||||
return v;
|
||||
}
|
||||
|
||||
#if defined(__ARM_NEON__)
|
||||
#define C extern "C" __attribute__((pcs("aapcs-vfp")))
|
||||
#else
|
||||
#define C extern "C"
|
||||
#endif
|
||||
|
||||
// We use a set of constants suitable for SkFixed15 math.
|
||||
using K = const SkSplicer_constants_lowp;
|
||||
using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);
|
||||
|
||||
// The armv7 aapcs-vfp calling convention makes us pass F::V instead of F if we want them in
|
||||
@ -198,32 +234,34 @@ STAGE(premul) {
|
||||
b = b * a;
|
||||
}
|
||||
|
||||
STAGE(scale_u8) {
|
||||
auto ptr = *(const uint8_t**)ctx + x;
|
||||
|
||||
#if defined(__ARM_NEON__)
|
||||
// On armv7, U8 can fit 8 bytes, but we only want to load 4.
|
||||
U8 scales = vdup_n_u32(unaligned_load<uint32_t>(ptr));
|
||||
#else
|
||||
U8 scales = unaligned_load<U8>(ptr);
|
||||
#endif
|
||||
|
||||
auto c = from_u8(scales, k);
|
||||
r = r * c;
|
||||
g = g * c;
|
||||
b = b * c;
|
||||
a = a * c;
|
||||
}
|
||||
|
||||
STAGE(load_8888) {
|
||||
auto ptr = *(const uint32_t**)ctx + x;
|
||||
|
||||
#if defined(__aarch64__)
|
||||
auto to_fixed15 = [](uint8x8_t u8) {
|
||||
// u8 * (32768/255) == u8 * 128.50196... == u8*128 + u8/2 + (u8+1)>>8 ( see SkFixed15.h)
|
||||
//
|
||||
// Here we do (u8*128 <rounding +> u8/2), which is the same as our canonical math for 0
|
||||
// and 255, and never off by more than 1 in between. Thanks to NEON, it's 2 instructions!
|
||||
auto u16 = vshll_n_u8(u8, 7); // u16 = u8*128
|
||||
return vrsraq_n_u16(u16, u16, 8); // u16 + u16/256, with rounding
|
||||
};
|
||||
|
||||
uint8x8x4_t rgba = vld4_u8((const uint8_t*)ptr);
|
||||
r = to_fixed15(rgba.val[0]);
|
||||
g = to_fixed15(rgba.val[1]);
|
||||
b = to_fixed15(rgba.val[2]);
|
||||
a = to_fixed15(rgba.val[3]);
|
||||
r = from_u8(rgba.val[0], k);
|
||||
g = from_u8(rgba.val[1], k);
|
||||
b = from_u8(rgba.val[2], k);
|
||||
a = from_u8(rgba.val[3], k);
|
||||
|
||||
#elif defined(__ARM_NEON__)
|
||||
auto to_fixed15 = [](uint8x8_t u8) {
|
||||
// Same as aarch64, but only keeping the bottom 4 lanes.
|
||||
auto u16 = vshll_n_u8(u8, 7);
|
||||
return vget_low_u16(vrsraq_n_u16(u16, u16, 8));
|
||||
};
|
||||
|
||||
// I can't get quite the code generation I want using vld4_lane_u8(),
|
||||
// so we're going to drop into assembly to do the loads. :/
|
||||
|
||||
@ -233,17 +271,12 @@ STAGE(load_8888) {
|
||||
"vld4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
|
||||
"vld4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
|
||||
: "+r"(ptr), "=w"(R), "=w"(G), "=w"(B), "=w"(A));
|
||||
r = to_fixed15(R);
|
||||
g = to_fixed15(G);
|
||||
b = to_fixed15(B);
|
||||
a = to_fixed15(A);
|
||||
r = from_u8(R, k);
|
||||
g = from_u8(G, k);
|
||||
b = from_u8(B, k);
|
||||
a = from_u8(A, k);
|
||||
|
||||
#else
|
||||
auto to_fixed15 = [k](__m128i u8) {
|
||||
F u16 = _mm256_cvtepu8_epi16(u8);
|
||||
return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8);
|
||||
};
|
||||
|
||||
// TODO: shorter, more confusing, faster with 256-bit loads and shuffles
|
||||
|
||||
// Load 16 interplaced pixels.
|
||||
@ -268,10 +301,10 @@ STAGE(load_8888) {
|
||||
rg_89ABCDEF = _mm_unpacklo_epi8(_8ACE, _9BDF), // r89ABCDEF g89ABCDEF
|
||||
ba_89ABCDEF = _mm_unpackhi_epi8(_8ACE, _9BDF); // b89ABCDEF a89ABCDEF
|
||||
|
||||
r = to_fixed15(_mm_unpacklo_epi64(rg_01234567, rg_89ABCDEF));
|
||||
g = to_fixed15(_mm_unpackhi_epi64(rg_01234567, rg_89ABCDEF));
|
||||
b = to_fixed15(_mm_unpacklo_epi64(ba_01234567, ba_89ABCDEF));
|
||||
a = to_fixed15(_mm_unpackhi_epi64(ba_01234567, ba_89ABCDEF));
|
||||
r = from_u8(_mm_unpacklo_epi64(rg_01234567, rg_89ABCDEF), k);
|
||||
g = from_u8(_mm_unpackhi_epi64(rg_01234567, rg_89ABCDEF), k);
|
||||
b = from_u8(_mm_unpacklo_epi64(ba_01234567, ba_89ABCDEF), k);
|
||||
a = from_u8(_mm_unpackhi_epi64(ba_01234567, ba_89ABCDEF), k);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -279,7 +312,7 @@ STAGE(store_8888) {
|
||||
auto ptr = *(uint32_t**)ctx + x;
|
||||
|
||||
#if defined(__aarch64__)
|
||||
auto from_fixed15 = [](F v) {
|
||||
auto to_u8 = [](F v) {
|
||||
// The canonical math for this from SkFixed15.h is (v - (v>>8)) >> 7.
|
||||
// But what's really most important is that all bytes round trip.
|
||||
|
||||
@ -288,14 +321,14 @@ STAGE(store_8888) {
|
||||
};
|
||||
|
||||
uint8x8x4_t rgba = {{
|
||||
from_fixed15(r),
|
||||
from_fixed15(g),
|
||||
from_fixed15(b),
|
||||
from_fixed15(a),
|
||||
to_u8(r),
|
||||
to_u8(g),
|
||||
to_u8(b),
|
||||
to_u8(a),
|
||||
}};
|
||||
vst4_u8((uint8_t*)ptr, rgba);
|
||||
#elif defined(__ARM_NEON__)
|
||||
auto from_fixed15 = [](F v) {
|
||||
auto to_u8 = [](F v) {
|
||||
// Same as aarch64, but first we need to pad our vectors from 8 to 16 bytes.
|
||||
F whatever;
|
||||
return vqshrn_n_u16(vcombine_u8(v, whatever), 7);
|
||||
@ -307,22 +340,22 @@ STAGE(store_8888) {
|
||||
"vst4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
|
||||
"vst4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
|
||||
: "+r"(ptr)
|
||||
: "w"(from_fixed15(r)), "w"(from_fixed15(g)), "w"(from_fixed15(b)), "w"(from_fixed15(a))
|
||||
: "w"(to_u8(r)), "w"(to_u8(g)), "w"(to_u8(b)), "w"(to_u8(a))
|
||||
: "memory");
|
||||
|
||||
#else
|
||||
auto from_fixed15 = [](F v) {
|
||||
// See the note in aarch64's from_fixed15(). The same roundtrip goal applies here.
|
||||
auto to_u8 = [](F v) {
|
||||
// See the note in aarch64's to_u8(). The same roundtrip goal applies here.
|
||||
// Here we take a different approach: (v saturated+ v) >> 8.
|
||||
v = (v+v) >> 8;
|
||||
return _mm_packus_epi16(_mm256_extracti128_si256(v, 0),
|
||||
_mm256_extracti128_si256(v, 1));
|
||||
};
|
||||
|
||||
auto R = from_fixed15(r),
|
||||
G = from_fixed15(g),
|
||||
B = from_fixed15(b),
|
||||
A = from_fixed15(a);
|
||||
auto R = to_u8(r),
|
||||
G = to_u8(g),
|
||||
B = to_u8(b),
|
||||
A = to_u8(a);
|
||||
|
||||
auto rg_01234567 = _mm_unpacklo_epi8(R,G), // rg0 rg1 rg2 ... rg7
|
||||
rg_89ABCDEF = _mm_unpackhi_epi8(R,G), // rg8 rg9 rgA ... rgF
|
||||
|
Loading…
Reference in New Issue
Block a user