skia2/tools/SkVMBuilders.cpp
Mike Klein 342b1b2753 proposed: add bytes() op
I'm staring at this assembly,

	vmovups	(%rsi), %ymm3
	vpsrld	$24, %ymm3, %ymm4
	vpslld	$16, %ymm4, %ymm15
	vorps	%ymm4, %ymm15, %ymm4
	vpsubw	%ymm4, %ymm0, %ymm4

just knowing that it could be

	vmovups	(%rsi), %ymm3
	vpshufb	 0x??(%rip), %ymm3, %ymm4
	vpsubw	%ymm4, %ymm0, %ymm4

That is, instead of shifting, shifting, and bit-oring
to create the 0a0a scale factor from ymm3, we could just
byte shuffle it directly using some pre-baked control pattern
(stored at the end of the program like other constants).

pshufb lets you arbitrarily remix bytes from its argument, or substitute
zero bytes, and NEON has a similar family of vtbl instructions, even
including that same feature of injecting zeroes.
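
Roughly, the idea in plain SSSE3 intrinsics, as a little helper sketch
rather than the JIT's actual code (the 256-bit vpshufb behaves the same
way within each 128-bit half):

	#include <tmmintrin.h>

	// Four RGBA pixels in, four a0a0 patterns out: each output byte either
	// copies that pixel's alpha byte (index 3, 7, 11, or 15) or is zeroed (-1).
	static inline __m128i alpha_x2(__m128i rgba) {
	    const __m128i control = _mm_setr_epi8( 3,-1, 3,-1,
	                                            7,-1, 7,-1,
	                                           11,-1,11,-1,
	                                           15,-1,15,-1);
	    return _mm_shuffle_epi8(rgba, control);
	}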

I think I've got this working, and the speedup is great,
from 0.19 to 0.16 ns/px for I32_SWAR, and
from 0.43 to 0.38 ns/px for I32.

Change-Id: Iab850275e826b4187f0efc9495a4b9eab4402c38
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/220871
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
2019-06-17 15:29:34 +00:00

/*
 * Copyright 2019 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "tools/SkVMBuilders.h"

// Some parts of this builder code are written less fluently than possible,
// to avoid any ambiguity of function argument evaluation order. This lets
// our golden tests work portably. In general there's no reason to fear
// nesting calls to Builder routines.
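// (For example, a nested call like
//      store32(dst, pack(f32_to_byte(r), f32_to_byte(g), 8));
//  is perfectly fine, but the order in which the two f32_to_byte() conversions
//  are emitted is up to the compiler, since C++ doesn't fix the evaluation order
//  of function arguments; naming intermediates first keeps the goldens identical
//  across compilers.)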

SrcoverBuilder_F32::SrcoverBuilder_F32(Fmt srcFmt, Fmt dstFmt) {
    skvm::Arg src = arg(0),
              dst = arg(1);

    // Normalize a byte channel in [0,255] to a float in [0,1].
    auto byte_to_f32 = [&](skvm::I32 byte) {
        skvm::F32 _1_255 = splat(1/255.0f);
        return mul(_1_255, to_f32(byte));
    };

    auto load = [&](skvm::Arg ptr, Fmt fmt,
                    skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32* a) {
        switch (fmt) {
            case Fmt::A8: {
                *r = *g = *b = splat(0.0f);
                *a = byte_to_f32(load8(ptr));
            } break;

            case Fmt::G8: {
                *r = *g = *b = byte_to_f32(load8(ptr));
                *a = splat(1.0f);
            } break;

            case Fmt::RGBA_8888: {
                skvm::I32 rgba = load32(ptr);
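                // extract(x, bits, mask) here reads as (x >> bits) & mask,
                // so each call below isolates one 8-bit channel.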
                *r = byte_to_f32(extract(rgba,  0, splat(0xff)));
                *g = byte_to_f32(extract(rgba,  8, splat(0xff)));
                *b = byte_to_f32(extract(rgba, 16, splat(0xff)));
                *a = byte_to_f32(extract(rgba, 24, splat(0xff)));
            } break;
        }
    };

    skvm::F32 r,g,b,a;
    load(src, srcFmt, &r,&g,&b,&a);

    skvm::F32 dr,dg,db,da;
    load(dst, dstFmt, &dr,&dg,&db,&da);
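
    // Src-over: out = src + dst*(1 - src_alpha), applied to each (premultiplied)
    // color channel and to alpha itself.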
    skvm::F32 invA = sub(splat(1.0f), a);
    r = mad(dr, invA, r);
    g = mad(dg, invA, g);
    b = mad(db, invA, b);
    a = mad(da, invA, a);
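
    // Back to bytes: scale by 255 and round to nearest by adding 0.5
    // before the truncating to_i32().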
    auto f32_to_byte = [&](skvm::F32 f32) {
        skvm::F32 _255 = splat(255.0f),
                  _0_5 = splat(0.5f);
        return to_i32(mad(f32, _255, _0_5));
    };

    switch (dstFmt) {
        case Fmt::A8: {
            store8(dst, f32_to_byte(a));
        } break;

        case Fmt::G8: {
            // BT.709 luma coefficients.
            skvm::F32 _2126 = splat(0.2126f),
                      _7152 = splat(0.7152f),
                      _0722 = splat(0.0722f);
            store8(dst, f32_to_byte(mad(r, _2126,
                                    mad(g, _7152,
                                    mul(b, _0722)))));
        } break;

        case Fmt::RGBA_8888: {
            skvm::I32 R = f32_to_byte(r),
                      G = f32_to_byte(g),
                      B = f32_to_byte(b),
                      A = f32_to_byte(a);

            // pack(x, y, bits) ORs y, shifted left by bits, into x,
            // so these three packs reassemble the full RGBA pixel in R.
            R = pack(R, G, 8);
            B = pack(B, A, 8);
            R = pack(R, B, 16);
            store32(dst, R);
        } break;
    }
}

SrcoverBuilder_I32::SrcoverBuilder_I32() {
    skvm::Arg src = arg(0),
              dst = arg(1);

    auto load = [&](skvm::Arg ptr,
                    skvm::I32* r, skvm::I32* g, skvm::I32* b, skvm::I32* a) {
        skvm::I32 rgba = load32(ptr);
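        // bytes() is the new byte-shuffle op: each hex nibble of the control selects
        // a source byte (1-4, counting from the low byte), and a 0 nibble gives zero.
        // So 0x0002 isolates g and 0x0003 isolates b, each landing in the low byte.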
        *r = bit_and(rgba, splat(0xff));
        *g = bytes  (rgba, 0x0002);
        *b = bytes  (rgba, 0x0003);
        *a = shr    (rgba, 24);
    };

    skvm::I32 r,g,b,a;
    load(src, &r,&g,&b,&a);

    skvm::I32 dr,dg,db,da;
    load(dst, &dr,&dg,&db,&da);

    // (xy + x)/256 is a good approximation of (xy + 127)/255
    //
    //   == (d*(255-a) + d)/256
    //   == (d*(255-a+1)  )/256
    //   == (d*(256-a  )  )/256
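    //
    // (Here x = d and y = 255-a, so this stands in for the exact (d*(255-a) + 127)/255.
    //  Spot check with d = 255, a = 128:  exact (255*127 + 127)/255 = 127,
    //  approx (255*127 + 255)/256 = 127 as well.)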

    // We're doing 8x8 bit multiplies in 32-bit lanes.
    // Since the inputs and results both fit in 16 bits,
    // we can use mul_16x2, which tends to be faster than mul.
    //
    // (The top 2 zero bytes of the inputs will also multiply
    // with each other to produce zero... perfect.)
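    //
    // (Worst case: 255 * 256 = 65280, which still fits in 16 bits.)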
    skvm::I32 invA = sub(splat(256), a);

    r = add(r, shr(mul_16x2(dr, invA), 8));
    g = add(g, shr(mul_16x2(dg, invA), 8));
    b = add(b, shr(mul_16x2(db, invA), 8));
    a = add(a, shr(mul_16x2(da, invA), 8));

    r = pack(r, g, 8);
    b = pack(b, a, 8);
    r = pack(r, b, 16);
    store32(dst, r);
}

SrcoverBuilder_I32_SWAR::SrcoverBuilder_I32_SWAR() {
    skvm::Arg src = arg(0),
              dst = arg(1);

    // The s += d*invA adds won't overflow,
    // so we don't have to unpack s beyond grabbing the alpha channel.
    skvm::I32 s   = load32(src),
              ax2 = bytes(s, 0x0404);  // rgba -> a0a0

    // We'll use the same approximation math as above, this time making sure to
    // use both i16 multiplies to our benefit, one for r/g, the other for b/a.
    skvm::I32 invAx2 = sub_16x2(splat(0x01000100), ax2);
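    // invAx2 now holds (256 - a) in each of its two 16-bit halves.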

    skvm::I32 d  = load32(dst),
              rb = bit_and (d, splat(0x00ff00ff)),
              ga = shr_16x2(d, 8);

    rb = shr_16x2(mul_16x2(rb, invAx2), 8);  // Put the high 8 bits back in the low lane.
    ga =          mul_16x2(ga, invAx2);      // Keep the high 8 bits up high...
    ga = bit_and (ga, splat(0xff00ff00));    // ...and mask off the low bits.

    store32(dst, add(s, bit_or(rb, ga)));
}
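
// e.g., one pixel through the SWAR path:
//   s = 0x80402010 (premultiplied a=0x80, b=0x40, g=0x20, r=0x10) over d = 0xffffffff:
//     ax2    = bytes(s, 0x0404)                           = 0x00800080
//     invAx2 = sub_16x2(0x01000100, ax2)                  = 0x00800080  (256-a = 128 per half)
//     rb     = shr_16x2(mul_16x2(0x00ff00ff, invAx2), 8)  = 0x007f007f
//     ga     = mul_16x2(0x00ff00ff, invAx2) & 0xff00ff00  = 0x7f007f00
//     result = s + (rb | ga) = 0x80402010 + 0x7f7f7f7f    = 0xffbf9f8f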