skia2/tests/SkVMTest.cpp

2540 lines
82 KiB
C++
Raw Normal View History

/*
* Copyright 2019 Google LLC
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "include/core/SkColorPriv.h"
#include "include/private/SkColorData.h"
#include "src/core/SkCpu.h"
#include "src/core/SkMSAN.h"
#include "src/core/SkVM.h"
#include "tests/Test.h"
#include "tools/Resources.h"
#include "tools/SkVMBuilders.h"
using Fmt = SrcoverBuilder_F32::Fmt;
// Returns the human-readable label for a SrcoverBuilder_F32 pixel format.
// Values that match none of the listed enumerators yield the empty string.
const char* fmt_name(Fmt fmt) {
    const char* label = "";
    // No default case, so the compiler can still warn if an enumerator is added.
    switch (fmt) {
        case Fmt::A8:        label = "A8";        break;
        case Fmt::G8:        label = "G8";        break;
        case Fmt::RGBA_8888: label = "RGBA_8888"; break;
    }
    return label;
}
// Finalizes `builder` into a Program, then writes the builder's dump followed
// by the program's dump to `o`, each section terminated by a newline.
static void dump(skvm::Builder& builder, SkWStream* o) {
    const auto end_section = [o] { o->writeText("\n"); };

    // done() runs before builder.dump(); that ordering is kept on purpose.
    skvm::Program compiled = builder.done();

    builder.dump(o);
    end_section();

    compiled.dump(o);
    end_section();
}
template <typename Fn>
static void test_jit_and_interpreter(skvm::Program&& program, Fn&& test) {
if (program.hasJIT()) {
Reland "mark which SkVM tests should JIT or not" This is a reland of 52435503e992cbeb388d90c51f74515ab1e11c96 with better checks for when we should expect JIT and not. Original change's description: > mark which SkVM tests should JIT or not > > Most of these tests converted over to test_interpreter_only() > are failing to JIT because of unimplemented instructions. No > bug there, just TODOs. > > But SkVM_hoist _should_ be JITting. A while back I landed a CL > that messed with value lifetimes that prevents it from JITting. > Will be using this as a regression test to fix that bug. > > Change-Id: Id2034f6548a45ed9aeb9ae3cbb24d389cad7dc60 > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/248980 > Commit-Queue: Mike Klein <mtklein@google.com> > Commit-Queue: Ethan Nicholas <ethannicholas@google.com> > Auto-Submit: Mike Klein <mtklein@google.com> > Reviewed-by: Ethan Nicholas <ethannicholas@google.com> > Reviewed-by: Herb Derby <herb@google.com> Cq-Include-Trybots: skia.primary:Test-Android-Clang-NVIDIA_Shield-CPU-TegraX1-arm64-Release-All-Android,Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-All-SK_CPU_LIMIT_SSE2,Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-All-SK_CPU_LIMIT_SSE41,Test-Mac10.13-Clang-VMware7.1-CPU-AVX-x86_64-Debug-All-NativeFonts,Test-Mac10.14-Clang-VMware7.1-CPU-AVX-x86_64-Debug-All-NativeFonts Change-Id: Id7bde7e879649e435fa424a9c9d6c51a31afd5e9 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/248990 Reviewed-by: Mike Klein <mtklein@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2019-10-16 15:11:56 +00:00
test((const skvm::Program&) program);
program.dropJIT();
}
test((const skvm::Program&) program);
Reland "mark which SkVM tests should JIT or not" This is a reland of 52435503e992cbeb388d90c51f74515ab1e11c96 with better checks for when we should expect JIT and not. Original change's description: > mark which SkVM tests should JIT or not > > Most of these tests converted over to test_interpreter_only() > are failing to JIT because of unimplemented instructions. No > bug there, just TODOs. > > But SkVM_hoist _should_ be JITting. A while back I landed a CL > that messed with value lifetimes that prevents it from JITting. > Will be using this as a regression test to fix that bug. > > Change-Id: Id2034f6548a45ed9aeb9ae3cbb24d389cad7dc60 > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/248980 > Commit-Queue: Mike Klein <mtklein@google.com> > Commit-Queue: Ethan Nicholas <ethannicholas@google.com> > Auto-Submit: Mike Klein <mtklein@google.com> > Reviewed-by: Ethan Nicholas <ethannicholas@google.com> > Reviewed-by: Herb Derby <herb@google.com> Cq-Include-Trybots: skia.primary:Test-Android-Clang-NVIDIA_Shield-CPU-TegraX1-arm64-Release-All-Android,Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-All-SK_CPU_LIMIT_SSE2,Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-All-SK_CPU_LIMIT_SSE41,Test-Mac10.13-Clang-VMware7.1-CPU-AVX-x86_64-Debug-All-NativeFonts,Test-Mac10.14-Clang-VMware7.1-CPU-AVX-x86_64-Debug-All-NativeFonts Change-Id: Id7bde7e879649e435fa424a9c9d6c51a31afd5e9 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/248990 Reviewed-by: Mike Klein <mtklein@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2019-10-16 15:11:56 +00:00
}
DEF_TEST(SkVM, r) {
SkDynamicMemoryWStream buf;
// Write all combinations of SrcoverBuilder_F32
for (int s = 0; s < 3; s++)
for (int d = 0; d < 3; d++) {
auto srcFmt = (Fmt)s,
dstFmt = (Fmt)d;
SrcoverBuilder_F32 builder{srcFmt, dstFmt};
buf.writeText(fmt_name(srcFmt));
buf.writeText(" over ");
buf.writeText(fmt_name(dstFmt));
buf.writeText("\n");
dump(builder, &buf);
}
// Write the I32 Srcovers also.
{
SrcoverBuilder_I32_Naive builder;
buf.writeText("I32 (Naive) 8888 over 8888\n");
dump(builder, &buf);
}
reorder to minimize register pressure Rewrite program instructions so that each value becomes available as late as possible, just before it's used by another instruction. This reorders blocks of instructions to reduce them number of temporary registers in flight. Take this example of the sort of program that we naturally write, noting the registers needed as we progress down the right: src = load32 ... (1) sr = extract src ... (2) sg = extract src ... (3) sb = extract src ... (4) sa = extract src ... (4, src dies) dst = load32 ... (5) dr = extract dst ... (6) dg = extract dst ... (7) db = extract dst ... (8) da = extract dst ... (8, dst dies) r = add sr dr (7, sr and dr die) g = add sg dg (6, sg and dg die) b = add sb db (5, sb and db die) a = add sa da (4, sa and da die) rg = pack r g ... (3, r and g die) ba = pack b a ... (2, b and a die) rgba = pack rg ba ... (1, rg and ba die) store32 rgba ... (0, rgba dies) That original ordering of the code needs 8 registers (perhaps with a temporary 9th, but we'll ignore that here). This CL will rewrite the program to something more like this by recursively issuing inputs only once needed: src = load32 ... (1) sr = extract src ... (2) dst = load32 ... (3) dr = extract dst ... (4) r = add sr dr (3, sr and dr die) sg = extract src ... (4) dg = extract dst ... (5) g = add sg dg (4, sg and dg die) rg = pack r g (3, r and g die) sb = extract src ... (4) db = extract dst ... (5) b = add sb db (4, sb and db die) sa = extract src ... (4, src dies) da = extract dst ... (4, dst dies) a = add sa da (3, sa and da die) ba = pack b a (2, b and a die) rgba = pack rg ba ... (1, rg and ba die) store32 rgba ... (0) That trims 3 registers off the example, just by reordering! I've added the real version of this example to SkVMTest.cpp. (Its 6th register comes from holding the 0xff byte mask used by extract, in case you're curious). I'll admit it's not exactly easy to work out how this reordering works without a pen and paper or trial and error. 
I've tried to make the implementation preserve the original program's order as much as makes sense (i.e. when order is an otherwise arbitrary choice) to keep it somewhat sane to follow. This reordering naturally skips dead code, so pour one out for ☠️ . We lose our cute dead code emoji marker, but on the other hand all code downstream of Builder::done() can assume every instruction is live. Change-Id: Iceffcd10fd7465eae51a39ef8eec7a7189766ba2 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/249999 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Herb Derby <herb@google.com>
2019-10-22 17:27:58 +00:00
{
// Demonstrate the value of program reordering.
skvm::Builder b;
skvm::Arg sp = b.varying<int>(),
dp = b.varying<int>();
skvm::I32 byte = b.splat(0xff);
skvm::I32 src = b.load32(sp),
sr = b.extract(src, 0, byte),
sg = b.extract(src, 8, byte),
sb = b.extract(src, 16, byte),
sa = b.extract(src, 24, byte);
skvm::I32 dst = b.load32(dp),
dr = b.extract(dst, 0, byte),
dg = b.extract(dst, 8, byte),
db = b.extract(dst, 16, byte),
da = b.extract(dst, 24, byte);
skvm::I32 R = b.add(sr, dr),
G = b.add(sg, dg),
B = b.add(sb, db),
A = b.add(sa, da);
skvm::I32 rg = b.pack(R, G, 8),
ba = b.pack(B, A, 8),
rgba = b.pack(rg, ba, 16);
b.store32(dp, rgba);
dump(b, &buf);
}
// Our checked in dump expectations assume we have FMA support.
if (skvm::fma_supported()) {
sk_sp<SkData> actual = buf.detachAsData();
bool writeActualAsNewExpectation = false;
{
sk_sp<SkData> expected = GetResourceAsData("SkVMTest.expected");
if (!expected) {
ERRORF(r, "Couldn't load SkVMTest.expected.");
writeActualAsNewExpectation = true;
} else if (!expected->equals(actual.get())) {
ERRORF(r, "SkVMTest expected\n%.*s\nbut got\n%.*s\n",
(int)expected->size(), expected->data(),
(int)actual->size(), actual->data());
writeActualAsNewExpectation = true;
}
}
if (writeActualAsNewExpectation) {
SkFILEWStream out(GetResourcePath("SkVMTest.expected").c_str());
if (out.isValid()) {
out.write(actual->data(), actual->size());
}
}
}
auto test_8888 = [&](skvm::Program&& program) {
uint32_t src[9];
uint32_t dst[SK_ARRAY_COUNT(src)];
test_jit_and_interpreter(std::move(program), [&](const skvm::Program& program) {
for (int i = 0; i < (int)SK_ARRAY_COUNT(src); i++) {
src[i] = 0xbb007733;
dst[i] = 0xffaaccee;
}
SkPMColor expected = SkPMSrcOver(src[0], dst[0]); // 0xff2dad73
program.eval((int)SK_ARRAY_COUNT(src), src, dst);
// dst is probably 0xff2dad72.
for (auto got : dst) {
auto want = expected;
for (int i = 0; i < 4; i++) {
uint8_t d = got & 0xff,
w = want & 0xff;
if (abs(d-w) >= 2) {
SkDebugf("d %02x, w %02x\n", d,w);
}
REPORTER_ASSERT(r, abs(d-w) < 2);
got >>= 8;
want >>= 8;
}
}
});
};
test_8888(SrcoverBuilder_F32{Fmt::RGBA_8888, Fmt::RGBA_8888}.done("srcover_f32"));
test_8888(SrcoverBuilder_I32_Naive{}.done("srcover_i32_naive"));
test_jit_and_interpreter(SrcoverBuilder_F32{Fmt::RGBA_8888, Fmt::G8}.done(),
[&](const skvm::Program& program) {
uint32_t src[9];
uint8_t dst[SK_ARRAY_COUNT(src)];
for (int i = 0; i < (int)SK_ARRAY_COUNT(src); i++) {
src[i] = 0xbb007733;
dst[i] = 0x42;
}
SkPMColor over = SkPMSrcOver(SkPackARGB32(0xbb, 0x33, 0x77, 0x00),
0xff424242);
uint8_t want = SkComputeLuminance(SkGetPackedR32(over),
SkGetPackedG32(over),
SkGetPackedB32(over));
program.eval((int)SK_ARRAY_COUNT(src), src, dst);
for (auto got : dst) {
REPORTER_ASSERT(r, abs(got-want) < 3);
}
});
test_jit_and_interpreter(SrcoverBuilder_F32{Fmt::A8, Fmt::A8}.done(),
[&](const skvm::Program& program) {
uint8_t src[256],
dst[256];
for (int i = 0; i < 256; i++) {
src[i] = 255 - i;
dst[i] = i;
}
program.eval(256, src, dst);
for (int i = 0; i < 256; i++) {
uint8_t want = SkGetPackedA32(SkPMSrcOver(SkPackARGB32(src[i], 0,0,0),
SkPackARGB32( i, 0,0,0)));
REPORTER_ASSERT(r, abs(dst[i]-want) < 2);
}
});
}
// A program that never stores anything: all four instructions
// (load32, add, splat, add) are expected to be removed as dead code.
DEF_TEST(SkVM_eliminate_dead_code, r) {
    skvm::Builder b;
    {
        skvm::Arg ptr     = b.varying<int>();
        skvm::I32 loaded  = b.load32(ptr);
        skvm::I32 doubled = b.add(loaded, loaded);
        b.add(doubled, b.splat(7));  // Result intentionally unused.
    }

    // Before elimination every instruction is still present.
    std::vector<skvm::Instruction> program = b.program();
    REPORTER_ASSERT(r, program.size() == 4);

    // After elimination nothing should remain.
    program = skvm::eliminate_dead_code(program);
    REPORTER_ASSERT(r, program.size() == 0);
}
// Checks skvm::Usage on a small load -> add -> add -> store program.
// usage[i].size() is presumably the number of uses of instruction i's value
// (the load feeds both operands of the first add, hence 2) -- TODO confirm.
DEF_TEST(SkVM_Usage, r) {
    skvm::Builder b;
    {
        skvm::Arg in  = b.varying<int>();
        skvm::Arg out = b.varying<int>();

        skvm::I32 loaded  = b.load32(in);
        skvm::I32 doubled = b.add(loaded, loaded);
        skvm::I32 sum     = b.add(doubled, b.splat(7));
        b.store32(out, sum);
    }

    skvm::Usage usage{b.program()};

    REPORTER_ASSERT(r, b.program()[0].op == skvm::Op::load32);
    REPORTER_ASSERT(r, usage[0].size() == 2);

    REPORTER_ASSERT(r, b.program()[1].op == skvm::Op::add_i32);
    REPORTER_ASSERT(r, usage[1].size() == 1);

    REPORTER_ASSERT(r, b.program()[2].op == skvm::Op::splat);
    REPORTER_ASSERT(r, usage[2].size() == 1);

    REPORTER_ASSERT(r, b.program()[3].op == skvm::Op::add_i32);
    REPORTER_ASSERT(r, usage[3].size() == 1);
}
DEF_TEST(SkVM_Pointless, r) {
// Let's build a program with no memory arguments.
// It should all be pegged as dead code, but we should be able to "run" it.
skvm::Builder b;
{
b.add(b.splat(5.0f),
b.splat(4.0f));
}
test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
for (int N = 0; N < 64; N++) {
program.eval(N);
}
});
refactor out a middle representation Kind of brewing a big refactor here, to give me some room between skvm::Builder and skvm::Program to do optimizations, bakend specializations and analysis. As a warmup, I'm trying to split up today's Builder::Instruction into two forms, first just what the user requested in Builder (this stays Builder::Instruction) then a new type representing any transformation or analysis we've done to it (OptimizedInstruction). Roughly six important optimizations happen in SkVM today, in this order: 1) constant folding 2) backend-specific instruction specialization 3) common sub-expression elimination 4) reordering + dead code elimination 5) loop invariant and lifetime analysis 6) register assignment At head 1-5 all happen in Builder, and 2 is particularly awkward to have there (e.g. mul_f32 -> mul_f32_imm). 6 happens in Program per-backend, and that seems healthy. As of this CL, 1-3 happen in Builder, 4-5 now on this middle OptimizedInstruction format, and 6 still in Program. I'd like to get to the point where 1 stays in Builder, 2-5 all happen on this middle IR, and 6 stays in Program. That ought to let me do things like turn mul_f32 -> mul_f32_imm when it's good to and still benefit from things like common sub-expression elimination and code reordering happening after that trnasformation. And then, I hope that's also a good spot to do more complicated transformations, like lowering gather8 into gather32 plus some fix up when targeting an x86 JIT but not anywhere else. Today's Builder is too early to know whether we should do this or not, and in Program it's actually kind of awkward to do this sort of thing while also doing having to do register assignment. Some middle might be right. Change-Id: I9c00268a084f07fbab88d05eb441f1957a0d7c67 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/269181 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-02-06 19:02:32 +00:00
for (const skvm::OptimizedInstruction& inst : b.optimize()) {
add used_in_loop bit to skvm::Builder::Instruction Most hoisted values are used in the loop body (and that's really the whole point of hoisting) but some are just temporaries to help produce other hoisted values. This used_in_loop bit helps us distinguish the two, and lets us recycle registers holding temporary hoisted values not used in the loop. The can-we-recycle logic now becomes: - is this a real value? - is it time for it to die? - is it either not hoisted or a hoisted temporary? The set-death-to-infinity approach for hoisted values is now gone. That worked great for hoisted values used inside the loop, but was too conservative for hoisted temporaries. This lifetime extension was preventing us from recycling those registers, pinning enough registers that we run out and fail to JIT. Small amounts of refactoring to make this clearer: - move the Instruction hash function definition near its operator== - rename the two "hoist" variables to "can_hoist" for Instructions and "try_hoisting" for the JIT approach - add ↟ to mark hoisted temporaries, _really_ hoisted values. There's some redundancy here between tracking the can_hoist bit, the used_in_loop bit, and lifetime tracking. I think it should be true, for instance, that !can_hoist && !used_in_loop implies an instruction is dead code. I plan to continue refactoring lifetime analysis (in particular reordering instructions to decrease register pressure) so hopefully by the time I'm done that metadata will shake out a little crisper. Change-Id: I6460ca96d1cbec0315bed3c9a0774cd88ab5be26 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/248986 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Herb Derby <herb@google.com>
2019-10-16 15:46:01 +00:00
REPORTER_ASSERT(r, inst.death == 0 && inst.can_hoist == true);
}
}
// store32(splat(42)) behaves like a memset: exactly the first 17 ints are
// written and the sentinel just past them is left alone.
DEF_TEST(SkVM_memset, r) {
    skvm::Builder b;
    b.store32(b.varying<int>(), b.splat(42));

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& p) {
        int buf[18];
        buf[17] = 47;  // Sentinel: detects a write past the requested 17 elements.

        p.eval(17, buf);
        for (int i = 0; i < 17; i++) {
            REPORTER_ASSERT(r, buf[i] == 42);
        }
        REPORTER_ASSERT(r, buf[17] == 47);
    });
}
// load32 + store32 behaves like a memcpy: all but the last element are
// copied, and the last destination element must stay untouched.
DEF_TEST(SkVM_memcpy, r) {
    skvm::Builder b;
    {
        auto from = b.varying<int>();
        auto to   = b.varying<int>();
        b.store32(to, b.load32(from));
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& p) {
        int src[] = {1,2,3,4,5,6,7,8,9};
        int dst[] = {0,0,0,0,0,0,0,0,0};

        // Copy everything except the final element.
        const size_t copied = SK_ARRAY_COUNT(src)-1;
        p.eval(copied, src, dst);

        size_t i = 0;
        for (; i < copied; i++) {
            REPORTER_ASSERT(r, dst[i] == src[i]);
        }
        // i == copied here: the one element that was not copied is still zero.
        REPORTER_ASSERT(r, dst[i] == 0);
    });
}
DEF_TEST(SkVM_LoopCounts, r) {
    // Make sure we cover all the exact N we want.

    // buf[i] += 1
    skvm::Builder b;
    skvm::Arg ptr = b.varying<int>();
    b.store32(ptr,
              b.add(b.splat(1),
                    b.load32(ptr)));

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        int buf[64];
        const int len = (int)SK_ARRAY_COUNT(buf);

        // Try every count from 0 up to and including the whole buffer.
        for (int N = 0; N <= len; N++) {
            for (int i = 0; i < len; i++) {
                buf[i] = i;
            }

            program.eval(N, buf);

            // The first N entries were incremented; the rest are untouched.
            for (int i = 0; i < len; i++) {
                if (i < N) {
                    REPORTER_ASSERT(r, buf[i] == i+1);
                } else {
                    REPORTER_ASSERT(r, buf[i] == i);
                }
            }
        }
    });
}
// gather32 through a uniform pointer: each buf[i] is replaced by
// img[buf[i] & 7], so the 8-entry image repeats across the 20 outputs.
DEF_TEST(SkVM_gather32, r) {
    skvm::Builder b;
    {
        skvm::Arg uniforms = b.uniform();
        skvm::Arg buf      = b.varying<int>();

        skvm::I32 x = b.load32(buf);
        b.store32(buf, b.gather32(uniforms,0, b.bit_and(x, b.splat(7))));
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        const int img[] = {12,34,56,78, 90,98,76,54};

        int buf[20];
        for (int i = 0; i < 20; i++) {
            buf[i] = i;
        }

        struct Uniforms {
            const int* img;
        } uniforms{img};

        program.eval(20, &uniforms, buf);

        // Expected output, spelled out so it is independent of the program.
        const int want[20] = {
            12,34,56,78, 90,98,76,54,
            12,34,56,78, 90,98,76,54,
            12,34,56,78,
        };
        for (int i = 0; i < 20; i++) {
            REPORTER_ASSERT(r, buf[i] == want[i]);
        }
    });
}
// Exercises gather32/gather16/gather8 from the same image, with the index
// masked to 7/15/31 respectively.  The expected 16- and 8-bit values read the
// int image as narrower elements (the table presumably assumes a
// little-endian host -- TODO confirm).
DEF_TEST(SkVM_gathers, r) {
    skvm::Builder b;
    {
        skvm::Arg uniforms = b.uniform();
        skvm::Arg buf32    = b.varying<int>();
        skvm::Arg buf16    = b.varying<uint16_t>();
        skvm::Arg buf8     = b.varying<uint8_t>();

        skvm::I32 x = b.load32(buf32);

        b.store32(buf32, b.gather32(uniforms,0, b.bit_and(x, b.splat( 7))));
        b.store16(buf16, b.gather16(uniforms,0, b.bit_and(x, b.splat(15))));
        b.store8 (buf8 , b.gather8 (uniforms,0, b.bit_and(x, b.splat(31))));
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        const int img[] = {12,34,56,78, 90,98,76,54};

        constexpr int N = 20;
        int      buf32[N];
        uint16_t buf16[N];
        uint8_t  buf8 [N];

        // Only buf32 is an input; buf16 and buf8 are pure outputs.
        for (int i = 0; i < N; i++) {
            buf32[i] = i;
        }

        struct Uniforms {
            const int* img;
        } uniforms{img};

        program.eval(N, &uniforms, buf32, buf16, buf8);

        // One row per output index: expected 32-, 16- and 8-bit gather results.
        struct Want {
            int w32;
            int w16;
            int w8;
        };
        const Want want[N] = {
            {12, 12, 12}, {34,  0,  0}, {56, 34,  0}, {78,  0,  0},
            {90, 56, 34}, {98,  0,  0}, {76, 78,  0}, {54,  0,  0},
            {12, 90, 56}, {34,  0,  0}, {56, 98,  0}, {78,  0,  0},
            {90, 76, 78}, {98,  0,  0}, {76, 54,  0}, {54,  0,  0},
            {12, 12, 90}, {34,  0,  0}, {56, 34,  0}, {78,  0,  0},
        };
        for (int i = 0; i < N; i++) {
            REPORTER_ASSERT(r, buf32[i] == want[i].w32 &&
                               buf16[i] == want[i].w16 &&
                               buf8 [i] == want[i].w8);
        }
    });
}
Reland "Reland "gather8/16 JIT support"" This is a reland of 1283d55f35495c38f3a80b1fc5611981ddd6315f ... this time, also checking for HSW feature set. Original change's description: > Reland "gather8/16 JIT support" > > This is a reland of 54659e51bccc106b67ba36d5e91cac457d84b99e > > ... now expecting not to JIT when under ASAN/MSAN. > > Original change's description: > > gather8/16 JIT support > > > > The basic strategy is one at a time, inserting 8- or 16-bit values > > into an Xmm register, then expanding to 32-bit in a Ymm at the end > > using vpmovzx{b,w}d instructions. > > > > Somewhat annoyingly we can only pull indices from an Xmm register, > > so we grab the first four then shift down the top before the rest. > > > > Added a unit test to get coverage where the indices are reused and > > not consumed directly by the gather instruction. It's an important > > case, needing to find another register for accum that can't just be > > dst(), but there's no natural coverage of that anywhere. > > > > Change-Id: I8189ead2364060f10537a2f9364d63338a7e596f > > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284311 > > Reviewed-by: Herb Derby <herb@google.com> > > Commit-Queue: Mike Klein <mtklein@google.com> > > Change-Id: I67f441615b312b47e7a3182e85e0f787286d7717 > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284472 > Reviewed-by: Herb Derby <herb@google.com> > Commit-Queue: Mike Klein <mtklein@google.com> Change-Id: Id0e53ab67f7a70fe42dccca1d9912b07ec11b54d Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284504 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-04-17 18:57:13 +00:00
// Gathers where the same (unmasked) index value feeds all three gather
// widths, so the index is reused rather than consumed by a single gather.
DEF_TEST(SkVM_gathers2, r) {
    skvm::Builder b;
    {
        skvm::Arg uniforms = b.uniform();
        skvm::Arg buf32    = b.varying<int>();
        skvm::Arg buf16    = b.varying<uint16_t>();
        skvm::Arg buf8     = b.varying<uint8_t>();

        skvm::I32 x = b.load32(buf32);

        b.store32(buf32, b.gather32(uniforms,0, x));
        b.store16(buf16, b.gather16(uniforms,0, x));
        b.store8 (buf8 , b.gather8 (uniforms,0, x));
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        // img[k] == k, so an 8-bit gather at index k yields k itself.
        uint8_t img[256];
        for (int k = 0; k < 256; k++) {
            img[k] = k;
        }

        int      buf32[64];
        uint16_t buf16[64];
        uint8_t  buf8 [64];

        // Indices are a scrambled walk over [0, 63]; outputs start zeroed.
        for (int k = 0; k < 64; k++) {
            buf32[k] = (k*47)&63;
            buf16[k] = 0;
            buf8 [k] = 0;
        }

        struct Uniforms {
            const uint8_t* img;
        } uniforms{img};

        program.eval(64, &uniforms, buf32, buf16, buf8);

        for (int i = 0; i < 64; i++) {
            REPORTER_ASSERT(r, buf8[i] == ((i*47)&63));  // 0,47,30,13,60,...
        }

        // Spot-check the wider gathers at the first and last element
        // (index 0 and index (63*47)&63 == 17).  The expected byte order
        // presumably assumes a little-endian host -- TODO confirm.
        REPORTER_ASSERT(r, buf16[ 0] == 0x0100);
        REPORTER_ASSERT(r, buf16[63] == 0x2322);

        REPORTER_ASSERT(r, buf32[ 0] == 0x03020100);
        REPORTER_ASSERT(r, buf32[63] == 0x47464544);
    });
}
// Chains every bitwise op and shift on the input 0x42; the trailing comments
// track the value after each step.
DEF_TEST(SkVM_bitops, r) {
    skvm::Builder b;
    {
        skvm::Arg ptr = b.varying<int>();

        skvm::I32 loaded  = b.load32(ptr);
        skvm::I32 masked  = b.bit_and  (loaded,  b.splat(0xf1));  // 0x40
        skvm::I32 merged  = b.bit_or   (masked,  b.splat(0x80));  // 0xc0
        skvm::I32 flipped = b.bit_xor  (merged,  b.splat(0xfe));  // 0x3e
        skvm::I32 cleared = b.bit_clear(flipped, b.splat(0x30));  // 0x0e

        skvm::I32 high    = b.shl(cleared, 28);  // 0xe000'0000
        skvm::I32 sign    = b.sra(high,    28);  // 0xffff'fffe
        skvm::I32 result  = b.shr(sign,     1);  // 0x7fff'ffff

        b.store32(ptr, result);
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        int x = 0x42;
        program.eval(1, &x);
        REPORTER_ASSERT(r, x == 0x7fff'ffff);
    });
}
// select(is_NaN(x), 0, x) should optimize down to four instructions
// (load32, neq_f32, bit_clear, store32) and map NaNs to zero while passing
// every other value through bit-for-bit.
DEF_TEST(SkVM_select_is_NaN, r) {
    skvm::Builder b;
    {
        skvm::Arg src = b.varying<float>();
        skvm::Arg dst = b.varying<float>();

        skvm::F32 x = b.loadF(src);
        skvm::F32 cleaned = select(is_NaN(x), b.splat(0.0f)
                                            , x);
        b.storeF(dst, cleaned);
    }

    std::vector<skvm::OptimizedInstruction> program = b.optimize();
    REPORTER_ASSERT(r, program.size() == 4);

    const skvm::Op want[] = {
        skvm::Op::load32,
        skvm::Op::neq_f32,
        skvm::Op::bit_clear,
        skvm::Op::store32,
    };
    for (int i = 0; i < 4; i++) {
        REPORTER_ASSERT(r, program[i].op == want[i]);
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        // ±NaN, ±0, ±1, ±inf
        uint32_t src[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
                          0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
        uint32_t dst[SK_ARRAY_COUNT(src)];
        program.eval(SK_ARRAY_COUNT(src), src, dst);

        // Only the first two inputs are NaNs; they alone become zero.
        for (int i = 0; i < (int)SK_ARRAY_COUNT(src); i++) {
            REPORTER_ASSERT(r, dst[i] == (i < 2 ? 0 : src[i]));
        }
    });
}
// Basic float arithmetic: (2x - x) / x must be exactly 1 for every input.
DEF_TEST(SkVM_f32, r) {
    skvm::Builder b;
    {
        skvm::Arg ptr = b.varying<float>();

        skvm::F32 x = b.loadF(ptr);
        skvm::F32 y = b.add(x,x);   // y = 2x
        skvm::F32 z = b.sub(y,x);   // z = 2x-x = x
        skvm::F32 w = b.div(z,x);   // w = x/x = 1
        b.storeF(ptr, w);
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        float buf[] = { 1,2,3,4,5,6,7,8,9 };
        program.eval(SK_ARRAY_COUNT(buf), buf);

        for (float v : buf) {
            REPORTER_ASSERT(r, v == 1.0f);
        }
    });
}
// Packs the result of each integer comparison into one bit of the output:
// bit 0: x == 0, bit 1: x != 1, bit 2: x < 2,
// bit 3: x <= 3, bit 4: x > 4,  bit 5: x >= 5.
DEF_TEST(SkVM_cmp_i32, r) {
    skvm::Builder b;
    {
        skvm::I32 x = b.load32(b.varying<int>());

        // Reduce a comparison mask to a single bit placed at `shift`.
        auto to_bit = [&](int shift, skvm::I32 mask) {
            return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
        };

        skvm::I32 m = b.splat(0);
        auto accumulate = [&](int shift, skvm::I32 mask) {
            m = b.bit_or(m, to_bit(shift, mask));
        };

        accumulate(0, b. eq(x, b.splat(0)));
        accumulate(1, b.neq(x, b.splat(1)));
        accumulate(2, b. lt(x, b.splat(2)));
        accumulate(3, b.lte(x, b.splat(3)));
        accumulate(4, b. gt(x, b.splat(4)));
        accumulate(5, b.gte(x, b.splat(5)));

        b.store32(b.varying<int>(), m);
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        int in[] = { 0,1,2,3,4,5,6,7,8,9 };
        int out[SK_ARRAY_COUNT(in)];

        program.eval(SK_ARRAY_COUNT(in), in, out);

        REPORTER_ASSERT(r, out[0] == 0b001111);
        REPORTER_ASSERT(r, out[1] == 0b001100);
        REPORTER_ASSERT(r, out[2] == 0b001010);
        REPORTER_ASSERT(r, out[3] == 0b001010);
        REPORTER_ASSERT(r, out[4] == 0b000010);
        // From 5 upward every comparison has settled on its final answer.
        for (int i = 5; i < (int)SK_ARRAY_COUNT(out); i++) {
            REPORTER_ASSERT(r, out[i] == 0b110010);
        }
    });
}
// Float counterpart of the integer comparison test: each comparison result
// lands in one bit of the output.
// bit 0: x == 0, bit 1: x != 1, bit 2: x < 2,
// bit 3: x <= 3, bit 4: x > 4,  bit 5: x >= 5.
DEF_TEST(SkVM_cmp_f32, r) {
    skvm::Builder b;
    {
        skvm::F32 x = b.loadF(b.varying<float>());

        // Reduce a comparison mask to a single bit placed at `shift`.
        auto to_bit = [&](int shift, skvm::I32 mask) {
            return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
        };

        skvm::I32 m = b.splat(0);
        auto accumulate = [&](int shift, skvm::I32 mask) {
            m = b.bit_or(m, to_bit(shift, mask));
        };

        accumulate(0, b. eq(x, b.splat(0.0f)));
        accumulate(1, b.neq(x, b.splat(1.0f)));
        accumulate(2, b. lt(x, b.splat(2.0f)));
        accumulate(3, b.lte(x, b.splat(3.0f)));
        accumulate(4, b. gt(x, b.splat(4.0f)));
        accumulate(5, b.gte(x, b.splat(5.0f)));

        b.store32(b.varying<int>(), m);
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        float in[] = { 0,1,2,3,4,5,6,7,8,9 };
        int out[SK_ARRAY_COUNT(in)];

        program.eval(SK_ARRAY_COUNT(in), in, out);

        REPORTER_ASSERT(r, out[0] == 0b001111);
        REPORTER_ASSERT(r, out[1] == 0b001100);
        REPORTER_ASSERT(r, out[2] == 0b001010);
        REPORTER_ASSERT(r, out[3] == 0b001010);
        REPORTER_ASSERT(r, out[4] == 0b000010);
        // From 5 upward every comparison has settled on its final answer.
        for (int i = 5; i < (int)SK_ARRAY_COUNT(out); i++) {
            REPORTER_ASSERT(r, out[i] == 0b110010);
        }
    });
}
// Stores b.index() into every element.  As asserted below, the value counts
// down: element i receives (element count - i).
DEF_TEST(SkVM_index, r) {
    skvm::Builder b;
    b.store32(b.varying<int>(), b.index());

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        int buf[23];
        program.eval(SK_ARRAY_COUNT(buf), buf);

        int i = 0;
        while (i < (int)SK_ARRAY_COUNT(buf)) {
            REPORTER_ASSERT(r, buf[i] == (int)SK_ARRAY_COUNT(buf)-i);
            i++;
        }
    });
}
DEF_TEST(SkVM_mad, r) {
    // This program is designed to exercise the tricky corners of instruction
    // and register selection for Op::mad_f32.

    skvm::Builder b;
    {
        skvm::Arg ptr = b.varying<int>();

        skvm::F32 x = b.to_F32(b.load32(ptr));
        skvm::F32 y = b.mad(x,x,x);   // x is needed in the future, so r[x] != r[y].
        skvm::F32 z = b.mad(y,y,x);   // y is needed in the future, but r[z] = r[x] is ok.
        skvm::F32 w = b.mad(z,z,y);   // w can alias z but not y.
        skvm::F32 v = b.mad(w,y,w);   // Got to stop somewhere.
        b.store32(ptr, b.trunc(v));
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        // Starting from x = 2:
        //   y = 2*2 + 2       = 6
        //   z = 6*6 + 2       = 38
        //   w = 38*38 + 6     = 1450
        //   v = 1450*6 + 1450 = 10150
        int x = 2;
        program.eval(1, &x);
        REPORTER_ASSERT(r, x == 10150);
    });
}
DEF_TEST(SkVM_fms, r) {
    // Create a pattern that can be peepholed into an Op::fms_f32.
    skvm::Builder b;
    {
        skvm::Arg arg = b.varying<int>();

        skvm::F32 x = b.to_F32(b.load32(arg)),
                  v = b.sub(b.mul(x, b.splat(2.0f)),
                            b.splat(1.0f));
        b.store32(arg, b.trunc(v));
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        int buf[] = {0,1,2,3,4,5,6,7,8,9,10};
        // Pass the array itself (decaying to int*), as the other tests do.
        program.eval((int)SK_ARRAY_COUNT(buf), buf);

        // Each element i must now hold 2*i - 1.  This has to be a comparison:
        // an assignment here would overwrite the results and pass for every
        // non-zero value.
        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            REPORTER_ASSERT(r, buf[i] == 2*i-1);
        }
    });
}
DEF_TEST(SkVM_fnma, r) {
    // Create a pattern that can be peepholed into an Op::fnma_f32.
    skvm::Builder b;
    {
        skvm::Arg arg = b.varying<int>();

        skvm::F32 x = b.to_F32(b.load32(arg)),
                  v = b.sub(b.splat(1.0f),
                            b.mul(x, b.splat(2.0f)));
        b.store32(arg, b.trunc(v));
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        int buf[] = {0,1,2,3,4,5,6,7,8,9,10};
        // Pass the array itself (decaying to int*), as the other tests do.
        program.eval((int)SK_ARRAY_COUNT(buf), buf);

        // Each element i must now hold 1 - 2*i.  This has to be a comparison:
        // an assignment here would overwrite the results and pass for every
        // non-zero value.
        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            REPORTER_ASSERT(r, buf[i] == 1-2*i);
        }
    });
}
// Another mad() chain, this time on floats, where operands stay live across
// several instructions.
DEF_TEST(SkVM_madder, r) {
    skvm::Builder b;
    {
        skvm::Arg ptr = b.varying<float>();

        skvm::F32 x = b.loadF(ptr);
        skvm::F32 y = b.mad(x,x,x);   // x is needed in the future, so r[x] != r[y].
        skvm::F32 z = b.mad(y,x,y);   // r[x] can be reused after this instruction, but not r[y].
        skvm::F32 w = b.mad(y,y,z);
        b.storeF(ptr, w);
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        // Starting from x = 2:
        //   y = 2*2 + 2  = 6
        //   z = 6*2 + 6  = 18
        //   w = 6*6 + 18 = 54
        float x = 2.0f;
        program.eval(1, &x);
        REPORTER_ASSERT(r, x == 54.0f);
    });
}
// Applies floor() in place to a float buffer and compares against expected values.
DEF_TEST(SkVM_floor, r) {
    skvm::Builder b;
    {
        skvm::Arg ptr = b.varying<float>();
        b.storeF(ptr, b.floor(b.loadF(ptr)));
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        // Inputs include negative non-integers, which floor() moves away from zero.
        float values[]   = { -2.0f, -1.5f, -1.0f, 0.0f, 1.0f, 1.5f, 2.0f };
        float expected[] = { -2.0f, -2.0f, -1.0f, 0.0f, 1.0f, 1.0f, 2.0f };

        program.eval(SK_ARRAY_COUNT(values), values);

        const int count = (int)SK_ARRAY_COUNT(values);
        for (int k = 0; k < count; k++) {
            REPORTER_ASSERT(r, values[k] == expected[k]);
        }
    });
}
// Blame annotation (2020-03-12 16:05:46 +00:00): restore Op::round
//
// While I think trunc(mad(x, scale, 0.5)) is fine for doing our float to fixed
// point conversions, round(mul(x, scale)) was kind of better all around:
//   - better rounding than +0.5 and trunc
//   - faster when mad() is not an fma
//   - often now no need to use the constant 0.5f or have it in a register
//   - allows the mul() in to_unorm to use mul_f32_imm
//
// Those last two points are key... this actually frees up 2 registers in the
// x86 JIT when using to_unorm().
//
// So I think maybe we can resurrect round and still guarantee our desired
// intra-machine stability by committing to using instructions that follow the
// current rounding mode, which is what [v]cvtps2dq inextricably uses.
//
// Left some notes on the ARM impl... we're rounding to nearest even there,
// which is probably the current mode anyway, but to be more correct we need a
// slightly longer impl that rounds float->float then "truncates". Unsure
// whether it matters in practice. Same deal in the unit test that I added
// back, now testing negative and 0.5 cases too. The expectations assume the
// current mode is nearest even.
//
// I had the idea to resurrect this when I was looking at adding _imm Ops for
// fma_f32. I noticed that the y and z arguments to an fma_f32 were by far most
// likely to be constants, and when they are, they're by far likely to both be
// constants, e.g. 255.0f & 0.5f from to_unorm(8,...).
//
// llvm disassembly for SkVM_round unit test looks good:
//
//   ~ $ llc -mcpu=haswell /tmp/skvm-jit-1231521224.bc -o -
//       .section __TEXT,__text,regular,pure_instructions
//       .macosx_version_min 10, 15
//       .globl "_skvm-jit-1231521224"   ## -- Begin function skvm-jit-1231521224
//       .p2align 4, 0x90
//   "_skvm-jit-1231521224":             ## @skvm-jit-1231521224
//       .cfi_startproc
//       cmpl $8, %edi
//       jl LBB0_3
//       .p2align 4, 0x90
//   LBB0_2:                             ## %loopK
//                                       ## =>This Inner Loop Header: Depth=1
//       vcvtps2dq (%rsi), %ymm0
//       vmovupd %ymm0, (%rdx)
//       addl $-8, %edi
//       addq $32, %rsi
//       addq $32, %rdx
//       cmpl $8, %edi
//       jge LBB0_2
//   LBB0_3:                             ## %hoist1
//       xorl %eax, %eax
//       testl %edi, %edi
//       jle LBB0_6
//       .p2align 4, 0x90
//   LBB0_5:                             ## %loop1
//                                       ## =>This Inner Loop Header: Depth=1
//       vcvtss2si (%rsi,%rax), %ecx
//       movl %ecx, (%rdx,%rax)
//       decl %edi
//       addq $4, %rax
//       testl %edi, %edi
//       jg LBB0_5
//   LBB0_6:                             ## %leave
//       vzeroupper
//       retq
//       .cfi_endproc
//                                       ## -- End function
//
// Change-Id: Ib59eb3fd8a6805397850d93226c6c6d37cc3ab84
// Reviewed-on: https://skia-review.googlesource.com/c/skia/+/276738
// Auto-Submit: Mike Klein <mtklein@google.com>
// Commit-Queue: Herb Derby <herb@google.com>
// Reviewed-by: Herb Derby <herb@google.com>
DEF_TEST(SkVM_round, r) {
skvm::Builder b;
{
skvm::Arg src = b.varying<float>();
skvm::Arg dst = b.varying<int>();
b.store32(dst, b.round(b.loadF(src)));
restore Op::round While I think trunc(mad(x, scale, 0.5)) is fine for doing our float to fixed point conversions, round(mul(x, scale)) was kind of better all around: - better rounding than +0.5 and trunc - faster when mad() is not an fma - often now no need to use the constant 0.5f or have it in a register - allows the mul() in to_unorm to use mul_f32_imm Those last two points are key... this actually frees up 2 registers in the x86 JIT when using to_unorm(). So I think maybe we can resurrect round and still guarantee our desired intra-machine stability by committing to using instructions that follow the current rounding mode, which is what [v]cvtps2dq inextricably uses. Left some notes on the ARM impl... we're rounding to nearest even there, which is probably the current mode anyway, but to be more correct we need a slightly longer impl that rounds float->float then "truncates". Unsure whether it matters in practice. Same deal in the unit test that I added back, now testing negative and 0.5 cases too. The expectations assume the current mode is nearest even. I had the idea to resurrect this when I was looking at adding _imm Ops for fma_f32. I noticed that the y and z arguments to an fma_f32 were by far most likely to be constants, and when they are, they're by far likely to both be constants, e.g. 255.0f & 0.5f from to_unorm(8,...). 
llvm disassembly for SkVM_round unit test looks good: ~ $ llc -mcpu=haswell /tmp/skvm-jit-1231521224.bc -o - .section __TEXT,__text,regular,pure_instructions .macosx_version_min 10, 15 .globl "_skvm-jit-1231521224" ## -- Begin function skvm-jit-1231521224 .p2align 4, 0x90 "_skvm-jit-1231521224": ## @skvm-jit-1231521224 .cfi_startproc cmpl $8, %edi jl LBB0_3 .p2align 4, 0x90 LBB0_2: ## %loopK ## =>This Inner Loop Header: Depth=1 vcvtps2dq (%rsi), %ymm0 vmovupd %ymm0, (%rdx) addl $-8, %edi addq $32, %rsi addq $32, %rdx cmpl $8, %edi jge LBB0_2 LBB0_3: ## %hoist1 xorl %eax, %eax testl %edi, %edi jle LBB0_6 .p2align 4, 0x90 LBB0_5: ## %loop1 ## =>This Inner Loop Header: Depth=1 vcvtss2si (%rsi,%rax), %ecx movl %ecx, (%rdx,%rax) decl %edi addq $4, %rax testl %edi, %edi jg LBB0_5 LBB0_6: ## %leave vzeroupper retq .cfi_endproc ## -- End function Change-Id: Ib59eb3fd8a6805397850d93226c6c6d37cc3ab84 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/276738 Auto-Submit: Mike Klein <mtklein@google.com> Commit-Queue: Herb Derby <herb@google.com> Reviewed-by: Herb Derby <herb@google.com>
2020-03-12 16:05:46 +00:00
}
// The test cases on exact 0.5f boundaries assume the current rounding mode is nearest even.
// We haven't explicitly guaranteed that here... it just probably is.
test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
restore Op::round While I think trunc(mad(x, scale, 0.5)) is fine for doing our float to fixed point conversions, round(mul(x, scale)) was kind of better all around: - better rounding than +0.5 and trunc - faster when mad() is not an fma - often now no need to use the constant 0.5f or have it in a register - allows the mul() in to_unorm to use mul_f32_imm Those last two points are key... this actually frees up 2 registers in the x86 JIT when using to_unorm(). So I think maybe we can resurrect round and still guarantee our desired intra-machine stability by committing to using instructions that follow the current rounding mode, which is what [v]cvtps2dq inextricably uses. Left some notes on the ARM impl... we're rounding to nearest even there, which is probably the current mode anyway, but to be more correct we need a slightly longer impl that rounds float->float then "truncates". Unsure whether it matters in practice. Same deal in the unit test that I added back, now testing negative and 0.5 cases too. The expectations assume the current mode is nearest even. I had the idea to resurrect this when I was looking at adding _imm Ops for fma_f32. I noticed that the y and z arguments to an fma_f32 were by far most likely to be constants, and when they are, they're by far likely to both be constants, e.g. 255.0f & 0.5f from to_unorm(8,...). 
llvm disassembly for SkVM_round unit test looks good: ~ $ llc -mcpu=haswell /tmp/skvm-jit-1231521224.bc -o - .section __TEXT,__text,regular,pure_instructions .macosx_version_min 10, 15 .globl "_skvm-jit-1231521224" ## -- Begin function skvm-jit-1231521224 .p2align 4, 0x90 "_skvm-jit-1231521224": ## @skvm-jit-1231521224 .cfi_startproc cmpl $8, %edi jl LBB0_3 .p2align 4, 0x90 LBB0_2: ## %loopK ## =>This Inner Loop Header: Depth=1 vcvtps2dq (%rsi), %ymm0 vmovupd %ymm0, (%rdx) addl $-8, %edi addq $32, %rsi addq $32, %rdx cmpl $8, %edi jge LBB0_2 LBB0_3: ## %hoist1 xorl %eax, %eax testl %edi, %edi jle LBB0_6 .p2align 4, 0x90 LBB0_5: ## %loop1 ## =>This Inner Loop Header: Depth=1 vcvtss2si (%rsi,%rax), %ecx movl %ecx, (%rdx,%rax) decl %edi addq $4, %rax testl %edi, %edi jg LBB0_5 LBB0_6: ## %leave vzeroupper retq .cfi_endproc ## -- End function Change-Id: Ib59eb3fd8a6805397850d93226c6c6d37cc3ab84 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/276738 Auto-Submit: Mike Klein <mtklein@google.com> Commit-Queue: Herb Derby <herb@google.com> Reviewed-by: Herb Derby <herb@google.com>
2020-03-12 16:05:46 +00:00
float buf[] = { -1.5f, -0.5f, 0.0f, 0.5f, 0.2f, 0.6f, 1.0f, 1.4f, 1.5f, 2.0f };
int want[] = { -2 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 2 , 2 };
int dst[SK_ARRAY_COUNT(buf)];
program.eval(SK_ARRAY_COUNT(buf), buf, dst);
for (int i = 0; i < (int)SK_ARRAY_COUNT(dst); i++) {
REPORTER_ASSERT(r, dst[i] == want[i]);
}
});
}
// Element-wise float min of two input buffers, written to a third buffer.
DEF_TEST(SkVM_min, r) {
    skvm::Builder b;
    {
        skvm::Arg lhs = b.varying<float>();
        skvm::Arg rhs = b.varying<float>();
        skvm::Arg out = b.varying<float>();

        b.storeF(out, b.min(b.loadF(lhs), b.loadF(rhs)));
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        // Covers equal inputs, either side smaller, and negative values.
        float left[]     = {  0.0f,  1.0f,  4.0f, -1.0f, -1.0f};
        float right[]    = {  0.0f,  2.0f,  3.0f,  1.0f, -2.0f};
        float expected[] = {  0.0f,  1.0f,  3.0f, -1.0f, -2.0f};
        float result[SK_ARRAY_COUNT(left)];

        program.eval(SK_ARRAY_COUNT(result), left, right, result);

        const int count = (int)SK_ARRAY_COUNT(result);
        for (int k = 0; k < count; k++) {
            REPORTER_ASSERT(r, result[k] == expected[k]);
        }
    });
}
// Element-wise float max of two input buffers, written to a third buffer.
DEF_TEST(SkVM_max, r) {
    skvm::Builder b;
    {
        skvm::Arg lhs = b.varying<float>();
        skvm::Arg rhs = b.varying<float>();
        skvm::Arg out = b.varying<float>();

        b.storeF(out, b.max(b.loadF(lhs), b.loadF(rhs)));
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        // Covers equal inputs, either side larger, and negative values.
        float left[]     = {  0.0f,  1.0f,  4.0f, -1.0f, -1.0f};
        float right[]    = {  0.0f,  2.0f,  3.0f,  1.0f, -2.0f};
        float expected[] = {  0.0f,  2.0f,  4.0f,  1.0f, -1.0f};
        float result[SK_ARRAY_COUNT(left)];

        program.eval(SK_ARRAY_COUNT(result), left, right, result);

        const int count = (int)SK_ARRAY_COUNT(result);
        for (int k = 0; k < count; k++) {
            REPORTER_ASSERT(r, result[k] == expected[k]);
        }
    });
}
// Adds the constants 0..31 to a single int, one add per constant.
DEF_TEST(SkVM_hoist, r) {
    // This program uses enough constants that it will fail to JIT if we hoist them.
    // The JIT will try again without hoisting, and that'll just need 2 registers.
    skvm::Builder b;
    {
        skvm::Arg ptr = b.varying<int>();

        skvm::I32 sum = b.load32(ptr);
        for (int k = 0; k < 32; k++) {
            sum = b.add(sum, b.splat(k));
        }
        b.store32(ptr, sum);
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        int value = 4;
        program.eval(1, &value);

        // 0 + 1 + 2 + ... + 31 == 496, so 4 becomes 500.
        REPORTER_ASSERT(r, value == 500);
    });
}
// Keeps values greater than 4 and replaces everything else with 42.
DEF_TEST(SkVM_select, r) {
    skvm::Builder b;
    {
        skvm::Arg ptr = b.varying<int>();

        skvm::I32 v = b.load32(ptr);
        v = b.select( b.gt(v, b.splat(4)), v, b.splat(42) );
        b.store32(ptr, v);
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        int values[] = { 0,1,2,3,4,5,6,7,8 };
        program.eval(SK_ARRAY_COUNT(values), values);

        const int count = (int)SK_ARRAY_COUNT(values);
        for (int k = 0; k < count; k++) {
            // The input at index k was k itself.
            int expected = 42;
            if (k > 4) {
                expected = k;
            }
            REPORTER_ASSERT(r, values[k] == expected);
        }
    });
}
// Runs a 16-bit buffer through uniform loads of three widths, a clamp built from
// two selects, and an 8-bit gather, then compares against the same math done in C++.
DEF_TEST(SkVM_NewOps, r) {
    // Exercise a somewhat arbitrary set of new ops.
    skvm::Builder b;
    {
        skvm::Arg buf      = b.varying<int16_t>(),
                  uniforms = b.uniform();

        skvm::I32 x = b.load16(buf);

        // The uniforms struct below starts with a pointer; the scalar fields follow it.
        const size_t kPtr = sizeof(const int*);

        x = b.add(x, b.uniform32(uniforms, kPtr+0));
        x = b.mul(x, b.uniform8 (uniforms, kPtr+4));
        x = b.sub(x, b.uniform16(uniforms, kPtr+6));

        // Clamp x to [0, limit] so the gather below stays inside img.
        skvm::I32 limit = b.uniform32(uniforms, kPtr+8);
        x = b.select(b.lt(x, b.splat(0)), b.splat(0), x);
        x = b.select(b.gt(x, limit     ), limit     , x);

        x = b.gather8(uniforms,0, x);

        b.store16(buf, x);
    }

    // Flip to true to print the builder and program dumps while debugging.
    if ((false)) {
        SkDynamicMemoryWStream buf;
        dump(b, &buf);
        sk_sp<SkData> blob = buf.detachAsData();
        // "%.*s" takes an int precision and a char pointer, so cast explicitly
        // rather than passing a size_t and a void pointer through varargs.
        SkDebugf("%.*s\n", (int)blob->size(), (const char*)blob->data());
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        const int N = 31;
        int16_t buf[N];
        for (int i = 0; i < N; i++) {
            buf[i] = i;
        }

        const int M = 16;
        uint8_t img[M];
        for (int i = 0; i < M; i++) {
            img[i] = i*i;
        }

        // Field order and widths must match the kPtr+offset reads in the program above.
        struct {
            const uint8_t* img;
            int      add   = 5;
            uint8_t  mul   = 3;
            uint16_t sub   = 18;
            int      limit = M-1;
        } uniforms{img};

        program.eval(N, buf, &uniforms);

        for (int i = 0; i < N; i++) {
            // Our first math calculates x = (i+5)*3 - 18 a.k.a 3*(i-1).
            int x = 3*(i-1);

            // Then that's pinned to the limits of img.
            if (i < 2) { x =  0; }  // Notice i == 1 hits x ==  0 exactly...
            if (i > 5) { x = 15; }  // ...and i == 6 hits x == 15 exactly
            REPORTER_ASSERT(r, buf[i] == img[x]);
        }
    });
}
// Takes the square root of perfect squares in place and expects exact integers back.
DEF_TEST(SkVM_sqrt, r) {
    skvm::Builder b;
    auto ptr = b.varying<int>();
    b.storeF(ptr, b.sqrt(b.loadF(ptr)));

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        constexpr int kCount = 17;

        float values[kCount];
        for (int k = 0; k < kCount; k++) {
            values[k] = (float)(k*k);
        }

        // Each k*k should come back as k.
        program.eval(kCount, values);

        for (int k = 0; k < kCount; k++) {
            REPORTER_ASSERT(r, values[k] == (float)k);
        }
    });
}
// Fills an uninitialized int buffer with 42 and checks MSAN sees it as initialized.
DEF_TEST(SkVM_MSAN, r) {
    // This little memset32() program should be able to JIT, but if we run that
    // JIT code in an MSAN build, it won't see the writes initialize buf.  So
    // this tests that we're using the interpreter instead.
    skvm::Builder b;
    b.store32(b.varying<int>(), b.splat(42));

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        constexpr int kCount = 17;
        int values[kCount];  // Deliberately left uninitialized; the program must write all of it.

        program.eval(kCount, values);
        sk_msan_assert_initialized(values, values+kCount);

        for (int k = 0; k < kCount; k++) {
            REPORTER_ASSERT(r, values[k] == 42);
        }
    });
}
// Runs a program that only contains assert_true(x < 42) over inputs that all
// satisfy the condition; there is nothing to check afterwards.
DEF_TEST(SkVM_assert, r) {
    skvm::Builder b;
    b.assert_true(b.lt(b.load32(b.varying<int>()),
                       b.splat(42)));

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        int values[] = { 0,1,2,3,4,5,6,7,8,9 };  // every value is below 42
        program.eval(SK_ARRAY_COUNT(values), values);
    });
}
// Counts instructions in the finished program to confirm premul()/unpremul()
// emit no work when alpha is the constant 1.0f.
DEF_TEST(SkVM_premul, reporter) {
    // Test that premul is short-circuited when alpha is known opaque.

    // premul with alpha loaded from memory.
    {
        skvm::Builder p;
        auto red_ptr   = p.varying<int>(),
             alpha_ptr = p.varying<int>();

        skvm::F32 red   = p.loadF(red_ptr);
        skvm::F32 green = p.splat(0.0f);
        skvm::F32 blue  = p.splat(0.0f);
        skvm::F32 alpha = p.loadF(alpha_ptr);

        p.premul(&red, &green, &blue, alpha);
        p.storeF(red_ptr, red);

        // load red, load alpha, red *= alpha, store red
        const auto count = p.done().instructions().size();
        REPORTER_ASSERT(reporter, count == 4);
    }

    // premul with constant opaque alpha.
    {
        skvm::Builder p;
        auto red_ptr = p.varying<int>();

        skvm::F32 red   = p.loadF(red_ptr);
        skvm::F32 green = p.splat(0.0f);
        skvm::F32 blue  = p.splat(0.0f);
        skvm::F32 alpha = p.splat(1.0f);

        p.premul(&red, &green, &blue, alpha);
        p.storeF(red_ptr, red);

        // load red, store red
        const auto count = p.done().instructions().size();
        REPORTER_ASSERT(reporter, count == 2);
    }

    // Same deal for unpremul.

    // unpremul with alpha loaded from memory.
    {
        skvm::Builder p;
        auto red_ptr   = p.varying<int>(),
             alpha_ptr = p.varying<int>();

        skvm::F32 red   = p.loadF(red_ptr);
        skvm::F32 green = p.splat(0.0f);
        skvm::F32 blue  = p.splat(0.0f);
        skvm::F32 alpha = p.loadF(alpha_ptr);

        p.unpremul(&red, &green, &blue, alpha);
        p.storeF(red_ptr, red);

        // load red, load alpha, a bunch of unpremul instructions, store red
        const auto count = p.done().instructions().size();
        REPORTER_ASSERT(reporter, count >= 4);
    }

    // unpremul with constant opaque alpha.
    {
        skvm::Builder p;
        auto red_ptr = p.varying<int>();

        skvm::F32 red   = p.loadF(red_ptr);
        skvm::F32 green = p.splat(0.0f);
        skvm::F32 blue  = p.splat(0.0f);
        skvm::F32 alpha = p.splat(1.0f);

        p.unpremul(&red, &green, &blue, alpha);
        p.storeF(red_ptr, red);

        // load red, store red
        const auto count = p.done().instructions().size();
        REPORTER_ASSERT(reporter, count == 2);
    }
}
// Assembles into a scratch buffer by calling fn(assembler), then checks that the
// emitted size and each emitted byte match `expected`.
template <typename Fn>
static void test_asm(skiatest::Reporter* r, Fn&& fn, std::initializer_list<uint8_t> expected) {
    uint8_t code[4096];
    skvm::Assembler assembler{code};

    fn(assembler);

    REPORTER_ASSERT(r, assembler.size() == expected.size());

    // Only compare the bytes both sides have, so a size mismatch can't read past either one.
    const int overlap = (int)std::min(assembler.size(), expected.size());

    const uint8_t* actual = (const uint8_t*)code;
    const uint8_t* wanted = expected.begin();
    for (int i = 0; i < overlap; i++) {
        REPORTER_ASSERT(r, actual[i] == wanted[i],
                        "byte %d was %02x, want %02x", i, actual[i], wanted[i]);
    }
}
DEF_TEST(SkVM_Assembler, r) {
// Easiest way to generate test cases is
//
// echo '...some asm...' | llvm-mc -show-encoding -x86-asm-syntax=intel
//
// The -x86-asm-syntax=intel bit is optional, controlling the
// input syntax only; the output will always be AT&T op x,y,dst style.
// Our APIs read more like Intel op dst,x,y as op(dst,x,y), so I find
// that a bit easier to use here, despite maybe favoring AT&T overall.
using A = skvm::Assembler;
// Our exit strategy from AVX code.
test_asm(r, [&](A& a) {
a.int3();
a.vzeroupper();
a.ret();
},{
0xcc,
0xc5, 0xf8, 0x77,
0xc3,
});
// Align should pad with zero
test_asm(r, [&](A& a) {
a.ret();
a.align(4);
},{
0xc3,
0x00, 0x00, 0x00,
});
test_asm(r, [&](A& a) {
a.add(A::rax, 8); // Always good to test rax.
a.sub(A::rax, 32);
a.add(A::rdi, 12); // Last 0x48 REX
a.sub(A::rdi, 8);
a.add(A::r8 , 7); // First 0x49 REX
a.sub(A::r8 , 4);
a.add(A::rsi, 128); // Requires 4 byte immediate.
a.sub(A::r8 , 1000000);
a.add(A::Mem{A::rsi}, 7); // addq $7, (%rsi)
a.add(A::Mem{A::rsi, 12}, 7); // addq $7, 12(%rsi)
a.add(A::Mem{A::rsp, 12}, 7); // addq $7, 12(%rsp)
a.add(A::Mem{A::r12, 12}, 7); // addq $7, 12(%r12)
a.add(A::Mem{A::rsp, 12, A::rax, A::FOUR}, 7); // addq $7, 12(%rsp,%rax,4)
a.add(A::Mem{A::r12, 12, A::rax, A::FOUR}, 7); // addq $7, 12(%r12,%rax,4)
a.add(A::Mem{A::rax, 12, A::r12, A::FOUR}, 7); // addq $7, 12(%rax,%r12,4)
a.add(A::Mem{A::r11, 12, A::r8 , A::TWO }, 7); // addq $7, 12(%r11,%r8,2)
a.add(A::Mem{A::r11, 12, A::rax} , 7); // addq $7, 12(%r11,%rax)
a.add(A::Mem{A::rax, 12, A::r11} , 7); // addq $7, 12(%rax,%r11)
a.sub(A::Mem{A::rax, 12, A::r11} , 7); // subq $7, 12(%rax,%r11)
a.add( A::rax , A::rcx); // addq %rcx, %rax
a.add(A::Mem{A::rax} , A::rcx); // addq %rcx, (%rax)
a.add(A::Mem{A::rax, 12}, A::rcx); // addq %rcx, 12(%rax)
a.add(A::rcx, A::Mem{A::rax, 12}); // addq 12(%rax), %rcx
a.sub(A::rcx, A::Mem{A::rax, 12}); // subq 12(%rax), %rcx
},{
0x48, 0x83, 0b11'000'000, 0x08,
0x48, 0x83, 0b11'101'000, 0x20,
0x48, 0x83, 0b11'000'111, 0x0c,
0x48, 0x83, 0b11'101'111, 0x08,
0x49, 0x83, 0b11'000'000, 0x07,
0x49, 0x83, 0b11'101'000, 0x04,
0x48, 0x81, 0b11'000'110, 0x80, 0x00, 0x00, 0x00,
0x49, 0x81, 0b11'101'000, 0x40, 0x42, 0x0f, 0x00,
0x48,0x83,0x06,0x07,
0x48,0x83,0x46,0x0c,0x07,
0x48,0x83,0x44,0x24,0x0c,0x07,
0x49,0x83,0x44,0x24,0x0c,0x07,
0x48,0x83,0x44,0x84,0x0c,0x07,
0x49,0x83,0x44,0x84,0x0c,0x07,
0x4a,0x83,0x44,0xa0,0x0c,0x07,
0x4b,0x83,0x44,0x43,0x0c,0x07,
0x49,0x83,0x44,0x03,0x0c,0x07,
0x4a,0x83,0x44,0x18,0x0c,0x07,
0x4a,0x83,0x6c,0x18,0x0c,0x07,
0x48,0x01,0xc8,
0x48,0x01,0x08,
0x48,0x01,0x48,0x0c,
0x48,0x03,0x48,0x0c,
0x48,0x2b,0x48,0x0c,
});
test_asm(r, [&](A& a) {
a.vpaddd (A::ymm0, A::ymm1, A::ymm2); // Low registers and 0x0f map -> 2-byte VEX.
a.vpaddd (A::ymm8, A::ymm1, A::ymm2); // A high dst register is ok -> 2-byte VEX.
a.vpaddd (A::ymm0, A::ymm8, A::ymm2); // A high first argument register -> 2-byte VEX.
a.vpaddd (A::ymm0, A::ymm1, A::ymm8); // A high second argument -> 3-byte VEX.
a.vpmulld(A::ymm0, A::ymm1, A::ymm2); // Using non-0x0f map instruction -> 3-byte VEX.
a.vpsubd (A::ymm0, A::ymm1, A::ymm2); // Test vpsubd to ensure argument order is right.
},{
/* VEX */ /*op*/ /*modRM*/
0xc5, 0xf5, 0xfe, 0xc2,
0xc5, 0x75, 0xfe, 0xc2,
0xc5, 0xbd, 0xfe, 0xc2,
0xc4, 0xc1, 0x75, 0xfe, 0xc0,
0xc4, 0xe2, 0x75, 0x40, 0xc2,
0xc5, 0xf5, 0xfa, 0xc2,
});
implement some useful 16-bit instructions Add a slew of 16-bit instructions for experiments. I want to try a fixed-point path through SkVMBlitter, continuing to represent geometry with F32, but color channels in 16 bits, with several possible representations: - unorm8 lowp like SkRasterPipeline (0 -> 0.0, 0x00ff -> 1.0) - 15-bit SkFixed15 fixed-point (0 -> 0.0, 0x8000 -> 1.0) - 14-bit signed fixed-point (0 -> 0.0, ±0x4000 -> ±1.0) I'm leaning towards the 14-bit version for being able to hold a good range of temporary values in [-2,2), or perhaps even a 13-bit analog for even a little more safety range. Mostly something new to try. Most of these instructions are pretty obvious, with notes on a few: vpavgw is an unsigned (x+y+1)>>1, and is useful for converting unorm8 up to Q14. There are a couple ways to do this pretty well, and using vpavgw is the best, and uses the fewest instructions: A) (x << 6) + ( x >> 2) + (x == 255) // Ok approx. B) (x << 6) + ((x+1) >> 2) // Better approx. C) vpavgw(x << 7, x >> 1) // Perfect math! The best good reverse math I've found is (x >> 6) - (x > 16319). vpmulhrsw is the key to the whole thing as usual, letting us do 16x16->16-bit multiplies. An SkFixed15 multiply is vpmulhrsw followed by vpabsw (also added here), and a Q14 multiply is vpmulhrsw followed by a simple <<1. I've added both signed and unsigned min and max. Not entirely sure they'll all be used, but I do have my eye on vpminuw as a single-instruction clamp to [0,0x4000] ~~> [0.0,1.0], treating any negative Q14 as very large unsigned. Change-Id: I0db7f3f943ef6c9a600821444cc5b003fe5f675d Reviewed-on: https://skia-review.googlesource.com/c/skia/+/317119 Commit-Queue: Herb Derby <herb@google.com> Auto-Submit: Mike Klein <mtklein@google.com> Reviewed-by: Herb Derby <herb@google.com>
2020-09-15 12:57:27 +00:00
test_asm(r, [&](A& a) {
a.vpaddw (A::ymm4, A::ymm3, A::ymm2);
a.vpavgw (A::ymm4, A::ymm3, A::ymm2);
a.vpcmpeqw (A::ymm4, A::ymm3, A::ymm2);
a.vpcmpgtw (A::ymm4, A::ymm3, A::ymm2);
a.vpminsw (A::ymm4, A::ymm3, A::ymm2);
a.vpmaxsw (A::ymm4, A::ymm3, A::ymm2);
a.vpminuw (A::ymm4, A::ymm3, A::ymm2);
a.vpmaxuw (A::ymm4, A::ymm3, A::ymm2);
a.vpmulhrsw(A::ymm4, A::ymm3, A::ymm2);
a.vpabsw (A::ymm4, A::ymm3);
a.vpsllw (A::ymm4, A::ymm3, 12);
a.vpsraw (A::ymm4, A::ymm3, 12);
},{
0xc5, 0xe5, 0xfd, 0xe2,
0xc5, 0xe5, 0xe3, 0xe2,
0xc5, 0xe5, 0x75, 0xe2,
0xc5, 0xe5, 0x65, 0xe2,
0xc5, 0xe5, 0xea, 0xe2,
0xc5, 0xe5, 0xee, 0xe2,
0xc4,0xe2,0x65, 0x3a, 0xe2,
0xc4,0xe2,0x65, 0x3e, 0xe2,
0xc4,0xe2,0x65, 0x0b, 0xe2,
0xc4,0xe2,0x7d, 0x1d, 0xe3,
0xc5,0xdd,0x71, 0xf3, 0x0c,
0xc5,0xdd,0x71, 0xe3, 0x0c,
});
test_asm(r, [&](A& a) {
A::Label l;
a.vcmpeqps (A::ymm0, A::ymm1, &l); // vcmpeqps 0x1c(%rip), %ymm1, %ymm0
a.vpcmpeqd (A::ymm0, A::ymm1, A::ymm2);
a.vpcmpgtd (A::ymm0, A::ymm1, A::ymm2);
a.vcmpeqps (A::ymm0, A::ymm1, A::ymm2);
a.vcmpltps (A::ymm0, A::ymm1, A::ymm2);
a.vcmpleps (A::ymm0, A::ymm1, A::ymm2);
a.vcmpneqps(A::ymm0, A::ymm1, A::ymm2);
a.label(&l); // 28 bytes after the vcmpeqps that uses it.
},{
0xc5,0xf4,0xc2,0x05,0x1c,0x00,0x00,0x00,0x00,
0xc5,0xf5,0x76,0xc2,
0xc5,0xf5,0x66,0xc2,
0xc5,0xf4,0xc2,0xc2,0x00,
0xc5,0xf4,0xc2,0xc2,0x01,
0xc5,0xf4,0xc2,0xc2,0x02,
0xc5,0xf4,0xc2,0xc2,0x04,
});
test_asm(r, [&](A& a) {
a.vminps(A::ymm0, A::ymm1, A::ymm2);
a.vmaxps(A::ymm0, A::ymm1, A::ymm2);
},{
0xc5,0xf4,0x5d,0xc2,
0xc5,0xf4,0x5f,0xc2,
});
test_asm(r, [&](A& a) {
a.vpblendvb(A::ymm0, A::ymm1, A::ymm2, A::ymm3);
},{
0xc4,0xe3,0x75, 0x4c, 0xc2, 0x30,
});
test_asm(r, [&](A& a) {
a.vpsrld(A::ymm15, A::ymm2, 8);
a.vpsrld(A::ymm0 , A::ymm8, 5);
},{
0xc5, 0x85, 0x72,0xd2, 0x08,
0xc4,0xc1,0x7d, 0x72,0xd0, 0x05,
});
test_asm(r, [&](A& a) {
A::Label l;
a.vpermps(A::ymm1, A::ymm2, A::Mem{A::rdi, 32});
a.vperm2f128(A::ymm1, A::ymm2, &l, 0x20);
a.vpermq(A::ymm1, A::ymm2, 5);
a.label(&l); // 6 bytes after vperm2f128
},{
0xc4,0xe2,0x6d,0x16,0x4f,0x20,
0xc4,0xe3,0x6d,0x06,0x0d,0x06,0x00,0x00,0x00,0x20,
0xc4,0xe3,0xfd, 0x00,0xca, 0x05,
});
test_asm(r, [&](A& a) {
a.vpunpckldq(A::ymm1, A::ymm2, A::Mem{A::rdi});
a.vpunpckhdq(A::ymm1, A::ymm2, A::ymm3);
},{
0xc5,0xed,0x62,0x0f,
0xc5,0xed,0x6a,0xcb,
});
test_asm(r, [&](A& a) {
a.vroundps(A::ymm1, A::ymm2, A::NEAREST);
a.vroundps(A::ymm1, A::ymm2, A::FLOOR);
a.vroundps(A::ymm1, A::ymm2, A::CEIL);
a.vroundps(A::ymm1, A::ymm2, A::TRUNC);
},{
0xc4,0xe3,0x7d,0x08,0xca,0x00,
0xc4,0xe3,0x7d,0x08,0xca,0x01,
0xc4,0xe3,0x7d,0x08,0xca,0x02,
0xc4,0xe3,0x7d,0x08,0xca,0x03,
});
test_asm(r, [&](A& a) {
A::Label l;
a.label(&l);
a.byte(1);
a.byte(2);
a.byte(3);
a.byte(4);
a.vbroadcastss(A::ymm0 , &l);
a.vbroadcastss(A::ymm1 , &l);
a.vbroadcastss(A::ymm8 , &l);
a.vbroadcastss(A::ymm15, &l);
a.vpshufb(A::ymm4, A::ymm3, &l);
a.vpaddd (A::ymm4, A::ymm3, &l);
a.vpsubd (A::ymm4, A::ymm3, &l);
a.vptest(A::ymm4, &l);
sketch out structure for ops with immediates Lots of x86 instructions can take their right hand side argument from memory directly rather than a register. We can use this to avoid the need to allocate a register for many constants. The strategy in this CL is one of several I've been stewing over, the simplest of those strategies I think. There are some trade offs particularly on ARM; this naive ARM implementation means we'll load&op every time, even though the load part of the operation can logically be hoisted. From here on I'm going to just briefly enumerate a few other approaches that allow the optimization on x86 and still allow the immediate splats to hoist on ARM. 1) don't do it on ARM A very simple approach is to simply not perform this optimization on ARM. ARM has more vector registers than x86, and so register pressure is lower there. We're going to end up with splatted constants in registers anyway, so maybe just let that happen the normal way instead of some roundabout complicated hack like I'll talk about in 2). The only downside in my mind is that this approach would make high-level program descriptions platform dependent, which isn't so bad, but it's been nice to be able to compare and diff debug dumps. 2) split Op::splat up The next less-simple approach to this problem could fix this by splitting splats into two Ops internally, one inner Op::immediate that guantees at least the constant is in memory and is compatible with immediate-aware Ops like mul_f32_imm, and an outer Op::constant that depends on that Op::immediate and further guarantees that constant has been broadcast into a register to be compatible with non-immediate-aware ops like div_f32. When building a program, immediate-aware ops would peek for Op::constants as they do today for Op::splats, but instead of embedding the immediate themselves, they'd replace their dependency with the inner Op::immediate. 
On x86 these new Ops would work just as advertised, with Op::immediate a runtime no-op, Op::constant the usual vbroadcastss. On ARM Op::immediate needs to go all the way and splat out a register to make the constant compatible with immediate-aware ops, and the Op::constant becomes a noop now instead. All this comes together to let the Op::immediate splat hoist up out of the loop while still feeding Op::mul_f32_imm and co. It's a rather complicated approach to solving this issue, but I might want to explore it just to see how bad it is. 3) do it inside the x86 JIT The conceptually best approach is to find a way to do this peepholing only inside the JIT only on x86, avoiding the need for new Op::mul_f32_imm and co. ARM and the interpreter don't benefit from this peephole, so the x86 JIT is the logical owner of this optimization. Finding a clean way to do this without too much disruption is the least baked idea I've got here, though I think the most desirable long-term. Cq-Include-Trybots: skia.primary:Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Debug-All-SK_USE_SKVM_BLITTER,Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-All-SK_USE_SKVM_BLITTER Change-Id: Ie9c6336ed08b6fbeb89acf920a48a319f74f3643 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/254217 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Herb Derby <herb@google.com>
2019-11-12 15:07:23 +00:00
a.vmulps (A::ymm4, A::ymm3, &l);
},{
0x01, 0x02, 0x03, 0x4,
/* VEX */ /*op*/ /* ModRM */ /* offset */
0xc4, 0xe2, 0x7d, 0x18, 0b00'000'101, 0xf3,0xff,0xff,0xff, // 0xfffffff3 == -13
0xc4, 0xe2, 0x7d, 0x18, 0b00'001'101, 0xea,0xff,0xff,0xff, // 0xffffffea == -22
0xc4, 0x62, 0x7d, 0x18, 0b00'000'101, 0xe1,0xff,0xff,0xff, // 0xffffffe1 == -31
0xc4, 0x62, 0x7d, 0x18, 0b00'111'101, 0xd8,0xff,0xff,0xff, // 0xffffffd8 == -40
0xc4, 0xe2, 0x65, 0x00, 0b00'100'101, 0xcf,0xff,0xff,0xff, // 0xffffffcf == -49
0xc5, 0xe5, 0xfe, 0b00'100'101, 0xc7,0xff,0xff,0xff, // 0xffffffc7 == -57
0xc5, 0xe5, 0xfa, 0b00'100'101, 0xbf,0xff,0xff,0xff, // 0xffffffbf == -65
sketch out structure for ops with immediates Lots of x86 instructions can take their right hand side argument from memory directly rather than a register. We can use this to avoid the need to allocate a register for many constants. The strategy in this CL is one of several I've been stewing over, the simplest of those strategies I think. There are some trade offs particularly on ARM; this naive ARM implementation means we'll load&op every time, even though the load part of the operation can logically be hoisted. From here on I'm going to just briefly enumerate a few other approaches that allow the optimization on x86 and still allow the immediate splats to hoist on ARM. 1) don't do it on ARM A very simple approach is to simply not perform this optimization on ARM. ARM has more vector registers than x86, and so register pressure is lower there. We're going to end up with splatted constants in registers anyway, so maybe just let that happen the normal way instead of some roundabout complicated hack like I'll talk about in 2). The only downside in my mind is that this approach would make high-level program descriptions platform dependent, which isn't so bad, but it's been nice to be able to compare and diff debug dumps. 2) split Op::splat up The next less-simple approach to this problem could fix this by splitting splats into two Ops internally, one inner Op::immediate that guantees at least the constant is in memory and is compatible with immediate-aware Ops like mul_f32_imm, and an outer Op::constant that depends on that Op::immediate and further guarantees that constant has been broadcast into a register to be compatible with non-immediate-aware ops like div_f32. When building a program, immediate-aware ops would peek for Op::constants as they do today for Op::splats, but instead of embedding the immediate themselves, they'd replace their dependency with the inner Op::immediate. 
On x86 these new Ops would work just as advertised, with Op::immediate a runtime no-op, Op::constant the usual vbroadcastss. On ARM Op::immediate needs to go all the way and splat out a register to make the constant compatible with immediate-aware ops, and the Op::constant becomes a noop now instead. All this comes together to let the Op::immediate splat hoist up out of the loop while still feeding Op::mul_f32_imm and co. It's a rather complicated approach to solving this issue, but I might want to explore it just to see how bad it is. 3) do it inside the x86 JIT The conceptually best approach is to find a way to do this peepholing only inside the JIT only on x86, avoiding the need for new Op::mul_f32_imm and co. ARM and the interpreter don't benefit from this peephole, so the x86 JIT is the logical owner of this optimization. Finding a clean way to do this without too much disruption is the least baked idea I've got here, though I think the most desirable long-term. Cq-Include-Trybots: skia.primary:Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Debug-All-SK_USE_SKVM_BLITTER,Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-All-SK_USE_SKVM_BLITTER Change-Id: Ie9c6336ed08b6fbeb89acf920a48a319f74f3643 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/254217 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Herb Derby <herb@google.com>
2019-11-12 15:07:23 +00:00
0xc4, 0xe2, 0x7d, 0x17, 0b00'100'101, 0xb6,0xff,0xff,0xff, // 0xffffffb6 == -74
0xc5, 0xe4, 0x59, 0b00'100'101, 0xae,0xff,0xff,0xff, // 0xffffffae == -82
});
test_asm(r, [&](A& a) {
a.vbroadcastss(A::ymm0, A::Mem{A::rdi, 0});
a.vbroadcastss(A::ymm13, A::Mem{A::r14, 7});
a.vbroadcastss(A::ymm8, A::Mem{A::rdx, -12});
a.vbroadcastss(A::ymm8, A::Mem{A::rdx, 400});
a.vbroadcastss(A::ymm8, A::xmm0);
a.vbroadcastss(A::ymm0, A::xmm13);
},{
/* VEX */ /*op*/ /*ModRM*/ /*offset*/
0xc4,0xe2,0x7d, 0x18, 0b00'000'111,
0xc4,0x42,0x7d, 0x18, 0b01'101'110, 0x07,
0xc4,0x62,0x7d, 0x18, 0b01'000'010, 0xf4,
0xc4,0x62,0x7d, 0x18, 0b10'000'010, 0x90,0x01,0x00,0x00,
0xc4,0x62,0x7d, 0x18, 0b11'000'000,
0xc4,0xc2,0x7d, 0x18, 0b11'000'101,
});
test_asm(r, [&](A& a) {
A::Label l;
a.label(&l);
a.jne(&l);
a.jne(&l);
a.je (&l);
a.jmp(&l);
a.jl (&l);
a.jc (&l);
a.cmp(A::rdx, 1);
a.cmp(A::rax, 12);
a.cmp(A::r14, 2000000000);
},{
0x0f,0x85, 0xfa,0xff,0xff,0xff, // near jne -6 bytes
0x0f,0x85, 0xf4,0xff,0xff,0xff, // near jne -12 bytes
0x0f,0x84, 0xee,0xff,0xff,0xff, // near je -18 bytes
0xe9, 0xe9,0xff,0xff,0xff, // near jmp -23 bytes
0x0f,0x8c, 0xe3,0xff,0xff,0xff, // near jl -29 bytes
0x0f,0x82, 0xdd,0xff,0xff,0xff, // near jc -35 bytes
0x48,0x83,0xfa,0x01,
0x48,0x83,0xf8,0x0c,
0x49,0x81,0xfe,0x00,0x94,0x35,0x77,
});
test_asm(r, [&](A& a) {
a.vmovups(A::ymm5, A::Mem{A::rsi});
a.vmovups(A::Mem{A::rsi}, A::ymm5);
a.vmovups(A::xmm5, A::Mem{A::rsi});
a.vmovups(A::Mem{A::rsi}, A::xmm5);
a.vpmovzxwd(A::ymm4, A::Mem{A::rsi});
a.vpmovzxbd(A::ymm4, A::Mem{A::rsi});
a.vmovq(A::Mem{A::rdx}, A::xmm15);
},{
/* VEX */ /*Op*/ /* ModRM */
0xc5, 0xfc, 0x10, 0b00'101'110,
0xc5, 0xfc, 0x11, 0b00'101'110,
0xc5, 0xf8, 0x10, 0b00'101'110,
0xc5, 0xf8, 0x11, 0b00'101'110,
0xc4,0xe2,0x7d, 0x33, 0b00'100'110,
0xc4,0xe2,0x7d, 0x31, 0b00'100'110,
0xc5, 0x79, 0xd6, 0b00'111'010,
});
test_asm(r, [&](A& a) {
a.vmovups(A::ymm5, A::Mem{A::rsp, 0});
a.vmovups(A::ymm5, A::Mem{A::rsp, 64});
a.vmovups(A::ymm5, A::Mem{A::rsp,128});
a.vmovups(A::Mem{A::rsp, 0}, A::ymm5);
a.vmovups(A::Mem{A::rsp, 64}, A::ymm5);
a.vmovups(A::Mem{A::rsp,128}, A::ymm5);
},{
0xc5,0xfc,0x10,0x2c,0x24,
0xc5,0xfc,0x10,0x6c,0x24,0x40,
0xc5,0xfc,0x10,0xac,0x24,0x80,0x00,0x00,0x00,
0xc5,0xfc,0x11,0x2c,0x24,
0xc5,0xfc,0x11,0x6c,0x24,0x40,
0xc5,0xfc,0x11,0xac,0x24,0x80,0x00,0x00,0x00,
});
test_asm(r, [&](A& a) {
a.movzbq(A::rax, A::Mem{A::rsi}); // Low registers for src and dst.
a.movzbq(A::rax, A::Mem{A::r8,}); // High src register.
a.movzbq(A::r8 , A::Mem{A::rsi}); // High dst register.
a.movzbq(A::r8, A::Mem{A::rsi, 12});
a.movzbq(A::r8, A::Mem{A::rsi, 400});
a.movzwq(A::rax, A::Mem{A::rsi}); // Low registers for src and dst.
a.movzwq(A::rax, A::Mem{A::r8,}); // High src register.
a.movzwq(A::r8 , A::Mem{A::rsi}); // High dst register.
a.movzwq(A::r8, A::Mem{A::rsi, 12});
a.movzwq(A::r8, A::Mem{A::rsi, 400});
a.vmovd(A::Mem{A::rax}, A::xmm0);
a.vmovd(A::Mem{A::rax}, A::xmm8);
a.vmovd(A::Mem{A::r8 }, A::xmm0);
a.vmovd(A::xmm0, A::Mem{A::rax});
a.vmovd(A::xmm8, A::Mem{A::rax});
a.vmovd(A::xmm0, A::Mem{A::r8 });
a.vmovd(A::xmm0 , A::Mem{A::rax, 0, A::rcx, A::FOUR});
a.vmovd(A::xmm15, A::Mem{A::rax, 0, A::r8, A::TWO });
a.vmovd(A::xmm0 , A::Mem{A::r8 , 0, A::rcx});
a.vmovd(A::rax, A::xmm0);
a.vmovd(A::rax, A::xmm8);
a.vmovd(A::r8 , A::xmm0);
a.vmovd(A::xmm0, A::rax);
a.vmovd(A::xmm8, A::rax);
a.vmovd(A::xmm0, A::r8 );
a.movb(A::Mem{A::rdx}, A::rax);
a.movb(A::Mem{A::rdx}, A::r8 );
a.movb(A::Mem{A::r8 }, A::rax);
a.movb(A::rdx, A::Mem{A::rax});
a.movb(A::rdx, A::Mem{A::r8 });
a.movb(A::r8 , A::Mem{A::rax});
a.movb(A::rdx, 12);
a.movb(A::rax, 4);
a.movb(A::r8 , -1);
a.movb(A::Mem{A::rdx}, 12);
a.movb(A::Mem{A::rax}, 4);
a.movb(A::Mem{A::r8 }, -1);
},{
0x48,0x0f,0xb6,0x06, // movzbq (%rsi), %rax
0x49,0x0f,0xb6,0x00,
0x4c,0x0f,0xb6,0x06,
0x4c,0x0f,0xb6,0x46, 12,
0x4c,0x0f,0xb6,0x86, 0x90,0x01,0x00,0x00,
0x48,0x0f,0xb7,0x06, // movzwq (%rsi), %rax
0x49,0x0f,0xb7,0x00,
0x4c,0x0f,0xb7,0x06,
0x4c,0x0f,0xb7,0x46, 12,
0x4c,0x0f,0xb7,0x86, 0x90,0x01,0x00,0x00,
0xc5,0xf9,0x7e,0x00,
0xc5,0x79,0x7e,0x00,
0xc4,0xc1,0x79,0x7e,0x00,
0xc5,0xf9,0x6e,0x00,
0xc5,0x79,0x6e,0x00,
0xc4,0xc1,0x79,0x6e,0x00,
0xc5,0xf9,0x6e,0x04,0x88,
0xc4,0x21,0x79,0x6e,0x3c,0x40,
0xc4,0xc1,0x79,0x6e,0x04,0x08,
0xc5,0xf9,0x7e,0xc0,
0xc5,0x79,0x7e,0xc0,
0xc4,0xc1,0x79,0x7e,0xc0,
0xc5,0xf9,0x6e,0xc0,
0xc5,0x79,0x6e,0xc0,
0xc4,0xc1,0x79,0x6e,0xc0,
0x48 ,0x88, 0x02,
0x4c, 0x88, 0x02,
0x49, 0x88, 0x00,
0x48 ,0x8a, 0x10,
0x49, 0x8a, 0x10,
0x4c, 0x8a, 0x00,
0x48, 0xc6, 0xc2, 0x0c,
0x48, 0xc6, 0xc0, 0x04,
0x49, 0xc6, 0xc0, 0xff,
0x48, 0xc6, 0x02, 0x0c,
0x48, 0xc6, 0x00, 0x04,
0x49, 0xc6, 0x00, 0xff,
});
test_asm(r, [&](A& a) {
a.vpinsrd(A::xmm1, A::xmm8, A::Mem{A::rsi}, 1); // vpinsrd $1, (%rsi), %xmm8, %xmm1
a.vpinsrd(A::xmm8, A::xmm1, A::Mem{A::r8 }, 3); // vpinsrd $3, (%r8), %xmm1, %xmm8;
a.vpinsrw(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4); // vpinsrw $4, (%rsi), %xmm8, %xmm1
a.vpinsrw(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12); // vpinrsw $12, (%r8), %xmm1, %xmm8
a.vpinsrb(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4); // vpinsrb $4, (%rsi), %xmm8, %xmm1
a.vpinsrb(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12); // vpinsrb $12, (%r8), %xmm1, %xmm8
Reland "Reland "gather8/16 JIT support"" This is a reland of 1283d55f35495c38f3a80b1fc5611981ddd6315f ... this time, also checking for HSW feature set. Original change's description: > Reland "gather8/16 JIT support" > > This is a reland of 54659e51bccc106b67ba36d5e91cac457d84b99e > > ... now expecting not to JIT when under ASAN/MSAN. > > Original change's description: > > gather8/16 JIT support > > > > The basic strategy is one at a time, inserting 8- or 16-bit values > > into an Xmm register, then expanding to 32-bit in a Ymm at the end > > using vpmovzx{b,w}d instructions. > > > > Somewhat annoyingly we can only pull indices from an Xmm register, > > so we grab the first four then shift down the top before the rest. > > > > Added a unit test to get coverage where the indices are reused and > > not consumed directly by the gather instruction. It's an important > > case, needing to find another register for accum that can't just be > > dst(), but there's no natural coverage of that anywhere. > > > > Change-Id: I8189ead2364060f10537a2f9364d63338a7e596f > > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284311 > > Reviewed-by: Herb Derby <herb@google.com> > > Commit-Queue: Mike Klein <mtklein@google.com> > > Change-Id: I67f441615b312b47e7a3182e85e0f787286d7717 > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284472 > Reviewed-by: Herb Derby <herb@google.com> > Commit-Queue: Mike Klein <mtklein@google.com> Change-Id: Id0e53ab67f7a70fe42dccca1d9912b07ec11b54d Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284504 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-04-17 18:57:13 +00:00
a.vextracti128(A::xmm1, A::ymm8, 1); // vextracti128 $1, %ymm8, %xmm1
a.vextracti128(A::xmm8, A::ymm1, 0); // vextracti128 $0, %ymm1, %xmm8
a.vpextrd(A::Mem{A::rsi}, A::xmm8, 3); // vpextrd $3, %xmm8, (%rsi)
a.vpextrd(A::Mem{A::r8 }, A::xmm1, 2); // vpextrd $2, %xmm1, (%r8)
a.vpextrw(A::Mem{A::rsi}, A::xmm8, 7);
a.vpextrw(A::Mem{A::r8 }, A::xmm1, 15);
a.vpextrb(A::Mem{A::rsi}, A::xmm8, 7);
a.vpextrb(A::Mem{A::r8 }, A::xmm1, 15);
},{
0xc4,0xe3,0x39, 0x22, 0x0e, 1,
0xc4,0x43,0x71, 0x22, 0x00, 3,
0xc5,0xb9, 0xc4, 0x0e, 4,
0xc4,0x41,0x71, 0xc4, 0x00, 12,
0xc4,0xe3,0x39, 0x20, 0x0e, 4,
0xc4,0x43,0x71, 0x20, 0x00, 12,
Reland "Reland "gather8/16 JIT support"" This is a reland of 1283d55f35495c38f3a80b1fc5611981ddd6315f ... this time, also checking for HSW feature set. Original change's description: > Reland "gather8/16 JIT support" > > This is a reland of 54659e51bccc106b67ba36d5e91cac457d84b99e > > ... now expecting not to JIT when under ASAN/MSAN. > > Original change's description: > > gather8/16 JIT support > > > > The basic strategy is one at a time, inserting 8- or 16-bit values > > into an Xmm register, then expanding to 32-bit in a Ymm at the end > > using vpmovzx{b,w}d instructions. > > > > Somewhat annoyingly we can only pull indices from an Xmm register, > > so we grab the first four then shift down the top before the rest. > > > > Added a unit test to get coverage where the indices are reused and > > not consumed directly by the gather instruction. It's an important > > case, needing to find another register for accum that can't just be > > dst(), but there's no natural coverage of that anywhere. > > > > Change-Id: I8189ead2364060f10537a2f9364d63338a7e596f > > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284311 > > Reviewed-by: Herb Derby <herb@google.com> > > Commit-Queue: Mike Klein <mtklein@google.com> > > Change-Id: I67f441615b312b47e7a3182e85e0f787286d7717 > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284472 > Reviewed-by: Herb Derby <herb@google.com> > Commit-Queue: Mike Klein <mtklein@google.com> Change-Id: Id0e53ab67f7a70fe42dccca1d9912b07ec11b54d Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284504 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-04-17 18:57:13 +00:00
0xc4,0x63,0x7d,0x39,0xc1, 1,
0xc4,0xc3,0x7d,0x39,0xc8, 0,
0xc4,0x63,0x79,0x16,0x06, 3,
0xc4,0xc3,0x79,0x16,0x08, 2,
0xc4,0x63,0x79, 0x15, 0x06, 7,
0xc4,0xc3,0x79, 0x15, 0x08, 15,
0xc4,0x63,0x79, 0x14, 0x06, 7,
0xc4,0xc3,0x79, 0x14, 0x08, 15,
});
test_asm(r, [&](A& a) {
a.vpandn(A::ymm3, A::ymm12, A::ymm2);
},{
0xc5, 0x9d, 0xdf, 0xda,
});
test_asm(r, [&](A& a) {
A::Label l;
a.vmovdqa(A::ymm3, A::ymm2); // vmovdqa %ymm2 , %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsi}); // vmovdqa (%rsi) , %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsp}); // vmovdqa (%rsp) , %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::r11}); // vmovdqa (%r11) , %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsi, 4}); // vmovdqa 4(%rsi) , %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsp, 4}); // vmovdqa 4(%rsp) , %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsi, 4, A::rax, A::EIGHT}); // vmovdqa 4(%rsi,%rax,8), %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::r11, 4, A::rax, A::TWO }); // vmovdqa 4(%r11,%rax,2), %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsi, 4, A::r11, A::FOUR }); // vmovdqa 4(%rsi,%r11,4), %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsi, 4, A::r11, A::ONE }); // vmovdqa 4(%rsi,%r11,1), %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsi, 4, A::r11}); // vmovdqa 4(%rsi,%r11) , %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsi, 64, A::r11}); // vmovdqa 64(%rsi,%r11), %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsi, 128, A::r11}); // vmovdqa 128(%rsi,%r11), %ymm3
a.vmovdqa(A::ymm3, &l); // vmovdqa 16(%rip) , %ymm3
a.vcvttps2dq(A::ymm3, A::ymm2);
a.vcvtdq2ps (A::ymm3, A::ymm2);
a.vcvtps2dq (A::ymm3, A::ymm2);
a.vsqrtps (A::ymm3, A::ymm2);
a.label(&l);
},{
0xc5,0xfd,0x6f,0xda,
0xc5,0xfd,0x6f,0x1e,
0xc5,0xfd,0x6f,0x1c,0x24,
0xc4,0xc1,0x7d,0x6f,0x1b,
0xc5,0xfd,0x6f,0x5e,0x04,
0xc5,0xfd,0x6f,0x5c,0x24,0x04,
0xc5,0xfd,0x6f,0x5c,0xc6,0x04,
0xc4,0xc1,0x7d,0x6f,0x5c,0x43,0x04,
0xc4,0xa1,0x7d,0x6f,0x5c,0x9e,0x04,
0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x04,
0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x04,
0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x40,
0xc4,0xa1,0x7d,0x6f,0x9c,0x1e,0x80,0x00,0x00,0x00,
0xc5,0xfd,0x6f,0x1d,0x10,0x00,0x00,0x00,
0xc5,0xfe,0x5b,0xda,
0xc5,0xfc,0x5b,0xda,
0xc5,0xfd,0x5b,0xda,
0xc5,0xfc,0x51,0xda,
});
test_asm(r, [&](A& a) {
a.vcvtps2ph(A::xmm3, A::ymm2, A::CURRENT);
a.vcvtps2ph(A::Mem{A::rsi, 32, A::rax, A::EIGHT}, A::ymm5, A::CEIL);
a.vcvtph2ps(A::ymm15, A::Mem{A::rdi, 12, A::r9, A::ONE});
a.vcvtph2ps(A::ymm2, A::xmm3);
},{
0xc4,0xe3,0x7d,0x1d,0xd3,0x04,
0xc4,0xe3,0x7d,0x1d,0x6c,0xc6,0x20,0x02,
0xc4,0x22,0x7d,0x13,0x7c,0x0f,0x0c,
0xc4,0xe2,0x7d,0x13,0xd3,
});
test_asm(r, [&](A& a) {
a.vgatherdps(A::ymm1 , A::FOUR , A::ymm0 , A::rdi, A::ymm2 );
a.vgatherdps(A::ymm0 , A::ONE , A::ymm2 , A::rax, A::ymm1 );
a.vgatherdps(A::ymm10, A::ONE , A::ymm2 , A::rax, A::ymm1 );
a.vgatherdps(A::ymm0 , A::ONE , A::ymm12, A::rax, A::ymm1 );
a.vgatherdps(A::ymm0 , A::ONE , A::ymm2 , A::r9 , A::ymm1 );
a.vgatherdps(A::ymm0 , A::ONE , A::ymm2 , A::rax, A::ymm12);
a.vgatherdps(A::ymm0 , A::EIGHT, A::ymm2 , A::rax, A::ymm12);
},{
0xc4,0xe2,0x6d,0x92,0x0c,0x87,
0xc4,0xe2,0x75,0x92,0x04,0x10,
0xc4,0x62,0x75,0x92,0x14,0x10,
0xc4,0xa2,0x75,0x92,0x04,0x20,
0xc4,0xc2,0x75,0x92,0x04,0x11,
0xc4,0xe2,0x1d,0x92,0x04,0x10,
0xc4,0xe2,0x1d,0x92,0x04,0xd0,
});
test_asm(r, [&](A& a) {
a.mov(A::rax, A::Mem{A::rdi, 0});
a.mov(A::rax, A::Mem{A::rdi, 1});
a.mov(A::rax, A::Mem{A::rdi, 512});
a.mov(A::r15, A::Mem{A::r13, 42});
a.mov(A::rax, A::Mem{A::r13, 42});
a.mov(A::r15, A::Mem{A::rax, 42});
a.mov(A::rax, 1);
a.mov(A::rax, A::rcx);
},{
0x48, 0x8b, 0x07,
0x48, 0x8b, 0x47, 0x01,
0x48, 0x8b, 0x87, 0x00,0x02,0x00,0x00,
0x4d, 0x8b, 0x7d, 0x2a,
0x49, 0x8b, 0x45, 0x2a,
0x4c, 0x8b, 0x78, 0x2a,
0x48, 0xc7, 0xc0, 0x01,0x00,0x00,0x00,
0x48, 0x89, 0xc8,
});
// echo "fmul v4.4s, v3.4s, v1.4s" | llvm-mc -show-encoding -arch arm64
test_asm(r, [&](A& a) {
a.and16b(A::v4, A::v3, A::v1);
a.orr16b(A::v4, A::v3, A::v1);
a.eor16b(A::v4, A::v3, A::v1);
a.bic16b(A::v4, A::v3, A::v1);
a.bsl16b(A::v4, A::v3, A::v1);
a.not16b(A::v4, A::v3);
a.add4s(A::v4, A::v3, A::v1);
a.sub4s(A::v4, A::v3, A::v1);
a.mul4s(A::v4, A::v3, A::v1);
a.cmeq4s(A::v4, A::v3, A::v1);
a.cmgt4s(A::v4, A::v3, A::v1);
a.sub8h(A::v4, A::v3, A::v1);
a.mul8h(A::v4, A::v3, A::v1);
a.fadd4s(A::v4, A::v3, A::v1);
a.fsub4s(A::v4, A::v3, A::v1);
a.fmul4s(A::v4, A::v3, A::v1);
a.fdiv4s(A::v4, A::v3, A::v1);
a.fmin4s(A::v4, A::v3, A::v1);
a.fmax4s(A::v4, A::v3, A::v1);
a.fneg4s(A::v4, A::v3);
a.fmla4s(A::v4, A::v3, A::v1);
a.fmls4s(A::v4, A::v3, A::v1);
a.fcmeq4s(A::v4, A::v3, A::v1);
a.fcmgt4s(A::v4, A::v3, A::v1);
a.fcmge4s(A::v4, A::v3, A::v1);
},{
0x64,0x1c,0x21,0x4e,
0x64,0x1c,0xa1,0x4e,
0x64,0x1c,0x21,0x6e,
0x64,0x1c,0x61,0x4e,
0x64,0x1c,0x61,0x6e,
0x64,0x58,0x20,0x6e,
0x64,0x84,0xa1,0x4e,
0x64,0x84,0xa1,0x6e,
0x64,0x9c,0xa1,0x4e,
0x64,0x8c,0xa1,0x6e,
0x64,0x34,0xa1,0x4e,
0x64,0x84,0x61,0x6e,
0x64,0x9c,0x61,0x4e,
0x64,0xd4,0x21,0x4e,
0x64,0xd4,0xa1,0x4e,
0x64,0xdc,0x21,0x6e,
0x64,0xfc,0x21,0x6e,
0x64,0xf4,0xa1,0x4e,
0x64,0xf4,0x21,0x4e,
0x64,0xf8,0xa0,0x6e,
0x64,0xcc,0x21,0x4e,
0x64,0xcc,0xa1,0x4e,
0x64,0xe4,0x21,0x4e,
0x64,0xe4,0xa1,0x6e,
0x64,0xe4,0x21,0x6e,
});
test_asm(r, [&](A& a) {
a.shl4s(A::v4, A::v3, 0);
a.shl4s(A::v4, A::v3, 1);
a.shl4s(A::v4, A::v3, 8);
a.shl4s(A::v4, A::v3, 16);
a.shl4s(A::v4, A::v3, 31);
a.sshr4s(A::v4, A::v3, 1);
a.sshr4s(A::v4, A::v3, 8);
a.sshr4s(A::v4, A::v3, 31);
a.ushr4s(A::v4, A::v3, 1);
a.ushr4s(A::v4, A::v3, 8);
a.ushr4s(A::v4, A::v3, 31);
a.ushr8h(A::v4, A::v3, 1);
a.ushr8h(A::v4, A::v3, 8);
a.ushr8h(A::v4, A::v3, 15);
},{
0x64,0x54,0x20,0x4f,
0x64,0x54,0x21,0x4f,
0x64,0x54,0x28,0x4f,
0x64,0x54,0x30,0x4f,
0x64,0x54,0x3f,0x4f,
0x64,0x04,0x3f,0x4f,
0x64,0x04,0x38,0x4f,
0x64,0x04,0x21,0x4f,
0x64,0x04,0x3f,0x6f,
0x64,0x04,0x38,0x6f,
0x64,0x04,0x21,0x6f,
0x64,0x04,0x1f,0x6f,
0x64,0x04,0x18,0x6f,
0x64,0x04,0x11,0x6f,
});
test_asm(r, [&](A& a) {
a.sli4s(A::v4, A::v3, 0);
a.sli4s(A::v4, A::v3, 1);
a.sli4s(A::v4, A::v3, 8);
a.sli4s(A::v4, A::v3, 16);
a.sli4s(A::v4, A::v3, 31);
},{
0x64,0x54,0x20,0x6f,
0x64,0x54,0x21,0x6f,
0x64,0x54,0x28,0x6f,
0x64,0x54,0x30,0x6f,
0x64,0x54,0x3f,0x6f,
});
test_asm(r, [&](A& a) {
a.scvtf4s (A::v4, A::v3);
a.fcvtzs4s(A::v4, A::v3);
a.fcvtns4s(A::v4, A::v3);
},{
0x64,0xd8,0x21,0x4e,
0x64,0xb8,0xa1,0x4e,
0x64,0xa8,0x21,0x4e,
});
test_asm(r, [&](A& a) {
a.sub (A::sp, A::sp, 32); // sub sp, sp, #32
a.strq(A::v0, A::sp, 1); // str q0, [sp, #16]
a.strq(A::v1, A::sp); // str q1, [sp]
a.strs(A::v0, A::sp, 6); // str s0, [sp, #24]
a.strb(A::v0, A::sp, 47); // str b0, [sp, #47]
a.ldrb(A::v9, A::sp, 42); // ldr b9, [sp, #42]
a.ldrs(A::v7, A::sp, 10); // ldr s7, [sp, #40]
a.ldrq(A::v5, A::sp, 128); // ldr q5, [sp, #2048]
a.add (A::sp, A::sp, 32); // add sp, sp, #32
},{
0xff,0x83,0x00,0xd1,
0xe0,0x07,0x80,0x3d,
0xe1,0x03,0x80,0x3d,
0xe0,0x1b,0x00,0xbd,
0xe0,0xbf,0x00,0x3d,
0xe9,0xab,0x40,0x3d,
0xe7,0x2b,0x40,0xbd,
0xe5,0x03,0xc2,0x3d,
0xff,0x83,0x00,0x91,
});
test_asm(r, [&](A& a) {
a.brk(0);
a.brk(65535);
a.ret(A::x30); // Conventional ret using link register.
a.ret(A::x13); // Can really return using any register if we like.
a.add(A::x2, A::x2, 4);
a.add(A::x3, A::x2, 32);
a.sub(A::x2, A::x2, 4);
a.sub(A::x3, A::x2, 32);
a.subs(A::x2, A::x2, 4);
a.subs(A::x3, A::x2, 32);
a.subs(A::xzr, A::x2, 4); // These are actually the same instruction!
a.cmp(A::x2, 4);
A::Label l;
a.label(&l);
a.bne(&l);
a.bne(&l);
a.blt(&l);
a.b(&l);
a.cbnz(A::x2, &l);
a.cbz(A::x2, &l);
},{
0x00,0x00,0x20,0xd4,
0xe0,0xff,0x3f,0xd4,
0xc0,0x03,0x5f,0xd6,
0xa0,0x01,0x5f,0xd6,
0x42,0x10,0x00,0x91,
0x43,0x80,0x00,0x91,
0x42,0x10,0x00,0xd1,
0x43,0x80,0x00,0xd1,
0x42,0x10,0x00,0xf1,
0x43,0x80,0x00,0xf1,
0x5f,0x10,0x00,0xf1,
0x5f,0x10,0x00,0xf1,
0x01,0x00,0x00,0x54, // b.ne #0
0xe1,0xff,0xff,0x54, // b.ne #-4
0xcb,0xff,0xff,0x54, // b.lt #-8
0xae,0xff,0xff,0x54, // b.al #-12
0x82,0xff,0xff,0xb5, // cbnz x2, #-16
0x62,0xff,0xff,0xb4, // cbz x2, #-20
});
// Can we cbz() to a not-yet-defined label?
test_asm(r, [&](A& a) {
A::Label l;
a.cbz(A::x2, &l);
a.add(A::x3, A::x2, 32);
a.label(&l);
a.ret(A::x30);
},{
0x42,0x00,0x00,0xb4, // cbz x2, #8
0x43,0x80,0x00,0x91, // add x3, x2, #32
0xc0,0x03,0x5f,0xd6, // ret
});
// If we start a label as a backward label,
// can we redefine it to be a future label?
// (Not sure this is useful... just want to test it works.)
test_asm(r, [&](A& a) {
A::Label l1;
a.label(&l1);
a.add(A::x3, A::x2, 32);
a.cbz(A::x2, &l1); // This will jump backward... nothing sneaky.
A::Label l2; // Start off the same...
a.label(&l2);
a.add(A::x3, A::x2, 32);
a.cbz(A::x2, &l2); // Looks like this will go backward...
a.add(A::x2, A::x2, 4);
a.add(A::x3, A::x2, 32);
a.label(&l2); // But no... actually forward! What a switcheroo!
},{
0x43,0x80,0x00,0x91, // add x3, x2, #32
0xe2,0xff,0xff,0xb4, // cbz x2, #-4
0x43,0x80,0x00,0x91, // add x3, x2, #32
0x62,0x00,0x00,0xb4, // cbz x2, #12
0x42,0x10,0x00,0x91, // add x2, x2, #4
0x43,0x80,0x00,0x91, // add x3, x2, #32
});
// Loading from a label on ARM.
test_asm(r, [&](A& a) {
A::Label fore,aft;
a.label(&fore);
a.word(0x01234567);
a.ldrq(A::v1, &fore);
a.ldrq(A::v2, &aft);
a.label(&aft);
a.word(0x76543210);
},{
0x67,0x45,0x23,0x01,
0xe1,0xff,0xff,0x9c, // ldr q1, #-4
0x22,0x00,0x00,0x9c, // ldr q2, #4
0x10,0x32,0x54,0x76,
});
test_asm(r, [&](A& a) {
a.ldrq(A::v0, A::x8);
a.strq(A::v0, A::x8);
},{
0x00,0x01,0xc0,0x3d,
0x00,0x01,0x80,0x3d,
});
test_asm(r, [&](A& a) {
a.xtns2h(A::v0, A::v0);
a.xtnh2b(A::v0, A::v0);
a.strs (A::v0, A::x0);
a.ldrs (A::v0, A::x0);
a.uxtlb2h(A::v0, A::v0);
a.uxtlh2s(A::v0, A::v0);
a.uminv4s(A::v3, A::v4);
a.fmovs (A::x3, A::v4); // fmov w3,s4
},{
0x00,0x28,0x61,0x0e,
0x00,0x28,0x21,0x0e,
0x00,0x00,0x00,0xbd,
0x00,0x00,0x40,0xbd,
0x00,0xa4,0x08,0x2f,
0x00,0xa4,0x10,0x2f,
0x83,0xa8,0xb1,0x6e,
0x83,0x00,0x26,0x1e,
});
test_asm(r, [&](A& a) {
a.ldrb(A::v0, A::x8);
a.strb(A::v0, A::x8);
},{
0x00,0x01,0x40,0x3d,
0x00,0x01,0x00,0x3d,
});
test_asm(r, [&](A& a) {
a.tbl(A::v0, A::v1, A::v2);
},{
0x20,0x00,0x02,0x4e,
});
}
// Exercises SkVM's approximate math helpers (log2, pow2, powf, and the trig and
// inverse-trig functions) by comparing their results against reference values.
DEF_TEST(SkVM_approx_math, r) {
    // Builds a program with one varying float argument that replaces each value with
    // fn(&builder, value), then runs it in place over values[0..N).
    auto eval = [](int N, float values[], auto fn) {
        skvm::Builder b;
        skvm::Arg inout = b.varying<float>();

        b.storeF(inout, fn(&b, b.loadF(inout)));

        b.done().eval(N, values);
    };

    // Asserts values[i] is nearly equal to expected[i] (tolerance 0.001) for i in [0,N).
    auto compare = [r](int N, const float values[], const float expected[]) {
        for (int i = 0; i < N; ++i) {
            REPORTER_ASSERT(r, SkScalarNearlyEqual(values[i], expected[i], 0.001f));
        }
    };

    // log2
    {
        float values[] = {0.25f, 0.5f, 1, 2, 4, 8};
        constexpr int N = SK_ARRAY_COUNT(values);
        eval(N, values, [](skvm::Builder* b, skvm::F32 v) {
            return b->approx_log2(v);
        });
        const float expected[] = {-2, -1, 0, 1, 2, 3};
        compare(N, values, expected);
    }

    // pow2
    {
        float values[] = {-2, -1, 0, 1, 2, 3};
        constexpr int N = SK_ARRAY_COUNT(values);
        eval(N, values, [](skvm::Builder* b, skvm::F32 v) {
            return b->approx_pow2(v);
        });
        const float expected[] = {0.25f, 0.5f, 1, 2, 4, 8};
        compare(N, values, expected);
    }

    // powf -- x^0.5
    {
        float bases[] = {0, 1, 4, 9, 16};
        constexpr int N = SK_ARRAY_COUNT(bases);
        eval(N, bases, [](skvm::Builder* b, skvm::F32 base) {
            return b->approx_powf(base, b->splat(0.5f));
        });
        const float expected[] = {0, 1, 2, 3, 4};
        compare(N, bases, expected);
    }

    // powf -- 3^x
    {
        float exps[] = {-2, -1, 0, 1, 2};
        constexpr int N = SK_ARRAY_COUNT(exps);
        eval(N, exps, [](skvm::Builder* b, skvm::F32 exp) {
            return b->approx_powf(b->splat(3.0f), exp);
        });
        const float expected[] = {1/9.0f, 1/3.0f, 1, 3, 9};
        compare(N, exps, expected);
    }

    // Runs prog on the single input arg, reports a failure if the result is further than
    // tolerance from expected, and returns the absolute error.
    //
    // Note: a NaN error compares false against tolerance, so NaN results are not reported.
    auto test = [r](float arg, float expected, float tolerance, auto prog) {
        skvm::Builder b;
        skvm::Arg inout = b.varying<float>();
        b.storeF(inout, prog(b.loadF(inout)));
        float actual = arg;
        b.done().eval(1, &actual);

        float err = std::abs(actual - expected);

        if (err > tolerance) {
            // SkDebugf("arg %g, expected %g, actual %g\n", arg, expected, actual);
            // This used to assert `true`, which can never fail and so silently
            // disabled every tolerance check made through this helper.
            REPORTER_ASSERT(r, false);
        }
        return err;
    };

    // Two-argument version of test(): runs prog on (arg0, arg1), writing to a separate output.
    //
    // Note: as above, a NaN error is not reported.
    // NOTE(review): the atan2 loop below includes (0,0); presumably that case relies
    // on the NaN behavior above -- confirm against approx_atan2().
    auto test2 = [r](float arg0, float arg1, float expected, float tolerance, auto prog) {
        skvm::Builder b;
        skvm::Arg in0 = b.varying<float>();
        skvm::Arg in1 = b.varying<float>();
        skvm::Arg out = b.varying<float>();
        b.storeF(out, prog(b.loadF(in0), b.loadF(in1)));
        float actual;
        b.done().eval(1, &arg0, &arg1, &actual);

        float err = std::abs(actual - expected);

        if (err > tolerance) {
            // SkDebugf("[%g, %g]: expected %g, actual %g\n", arg0, arg1, expected, actual);
            // See test(): asserting `true` here made the check a no-op.
            REPORTER_ASSERT(r, false);
        }
        return err;
    };

    // sine, cosine, tangent
    {
        constexpr float P = SK_ScalarPI;
        constexpr float tol = 0.00175f;
        for (float rad = -5*P; rad <= 5*P; rad += 0.1f) {
            test(rad, sk_float_sin(rad), tol, [](skvm::F32 x) {
                return approx_sin(x);
            });
            test(rad, sk_float_cos(rad), tol, [](skvm::F32 x) {
                return approx_cos(x);
            });
        }

        // Our tangent diverges more as we get near infinities (x near +- Pi/2),
        // so bring in the domain a little.
        constexpr float eps = 0.16f;
        float err = 0;
        for (float rad = -P/2 + eps; rad <= P/2 - eps; rad += 0.01f) {
            err += test(rad, sk_float_tan(rad), tol, [](skvm::F32 x) {
                return approx_tan(x);
            });
            // try again with some multiples of P, to check our periodicity
            test(rad, sk_float_tan(rad), tol, [=](skvm::F32 x) {
                return approx_tan(x + 3*P);
            });
            test(rad, sk_float_tan(rad), tol, [=](skvm::F32 x) {
                return approx_tan(x - 3*P);
            });
        }
        if (0) { SkDebugf("tan error %g\n", err); }
    }

    // asin, acos, atan
    {
        constexpr float tol = 0.00175f;
        float err = 0;
        for (float x = -1; x <= 1; x += 1.0f/64) {
            err += test(x, asin(x), tol, [](skvm::F32 x) {
                return approx_asin(x);
            });
            test(x, acos(x), tol, [](skvm::F32 x) {
                return approx_acos(x);
            });
        }
        if (0) { SkDebugf("asin error %g\n", err); }

        err = 0;
        for (float x = -10; x <= 10; x += 1.0f/16) {
            err += test(x, atan(x), tol, [](skvm::F32 x) {
                return approx_atan(x);
            });
        }
        if (0) { SkDebugf("atan error %g\n", err); }

        // Start the atan2 total from zero so the (disabled) debug print below
        // doesn't also include the atan error accumulated above.
        err = 0;
        for (float y = -3; y <= 3; y += 1) {
            for (float x = -3; x <= 3; x += 1) {
                err += test2(y, x, atan2(y,x), tol, [](skvm::F32 y, skvm::F32 x) {
                    return approx_atan2(y,x);
                });
            }
        }
        if (0) { SkDebugf("atan2 error %g\n", err); }
    }
}
// Checks that skvm min()/max() agree bit-for-bit with std::min()/std::max() over
// NaN, zero, one, and infinity of both signs, for several operand arrangements.
DEF_TEST(SkVM_min_max, r) {
    // When exactly one operand is NaN, min() and max() are sensitive to operand
    // order, so swapping their arguments is not a sound transformation.
    //
    // Every backend is expected to match std::min() and std::max(), i.e.
    //
    //    min(x,y) = y<x ? y : x
    //    max(x,y) = x<y ? y : x

    // ±NaN, ±0, ±1, ±inf
    const uint32_t bits[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
                             0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};

    float f[8];
    memcpy(f, bits, sizeof(bits));

    // Compares bit patterns rather than values, so +0 and -0 differ and a NaN
    // matches a NaN with the same bits.
    auto identical = [](float x, float y) {
        uint32_t xBits,
                 yBits;
        memcpy(&xBits, &x, sizeof(xBits));
        memcpy(&yBits, &y, sizeof(yBits));
        return xBits == yBits;
    };

    // First, neither operand is a constant: x is loaded from memory, y is a uniform.
    // (Whether x and y are varying or uniform shouldn't make any difference.)
    {
        skvm::Builder b;
        {
            skvm::Arg srcPtr = b.varying<float>();
            skvm::Arg minPtr = b.varying<float>();
            skvm::Arg maxPtr = b.varying<float>();

            skvm::F32 lhs = b.loadF(srcPtr);
            skvm::F32 rhs = b.uniformF(b.uniform(), 0);

            b.storeF(minPtr, b.min(lhs,rhs));
            b.storeF(maxPtr, b.max(lhs,rhs));
        }

        test_jit_and_interpreter(b.done(), [&](const skvm::Program& program){
            float mn[8], mx[8];
            for (int i = 0; i < 8; ++i) {
                // f[i] is the uniform for this pass; it's paired with every f[j].
                program.eval(8, f,mn,mx, &f[i]);
                for (int j = 0; j < 8; ++j) {
                    REPORTER_ASSERT(r, identical(mn[j], std::min(f[j], f[i])));
                    REPORTER_ASSERT(r, identical(mx[j], std::max(f[j], f[i])));
                }
            }
        });
    }

    // Next, one program per test value, with that value as a constant right operand.
    for (int i = 0; i < 8; ++i) {
        skvm::Builder b;
        {
            skvm::Arg srcPtr = b.varying<float>();
            skvm::Arg minPtr = b.varying<float>();
            skvm::Arg maxPtr = b.varying<float>();

            skvm::F32 lhs = b.loadF(srcPtr);
            skvm::F32 rhs = b.splat(f[i]);

            b.storeF(minPtr, b.min(lhs,rhs));
            b.storeF(maxPtr, b.max(lhs,rhs));
        }

        test_jit_and_interpreter(b.done(), [&](const skvm::Program& program){
            float mn[8], mx[8];
            program.eval(8, f,mn,mx);
            for (int j = 0; j < 8; ++j) {
                REPORTER_ASSERT(r, identical(mn[j], std::min(f[j], f[i])));
                REPORTER_ASSERT(r, identical(mx[j], std::max(f[j], f[i])));
            }
        });
    }

    // Finally, the same again with the constant as the left operand.
    for (int i = 0; i < 8; ++i) {
        skvm::Builder b;
        {
            skvm::Arg srcPtr = b.varying<float>();
            skvm::Arg minPtr = b.varying<float>();
            skvm::Arg maxPtr = b.varying<float>();

            skvm::F32 lhs = b.splat(f[i]);
            skvm::F32 rhs = b.loadF(srcPtr);

            b.storeF(minPtr, b.min(lhs,rhs));
            b.storeF(maxPtr, b.max(lhs,rhs));
        }

        test_jit_and_interpreter(b.done(), [&](const skvm::Program& program){
            float mn[8], mx[8];
            program.eval(8, f,mn,mx);
            for (int j = 0; j < 8; ++j) {
                REPORTER_ASSERT(r, identical(mn[j], std::min(f[i], f[j])));
                REPORTER_ASSERT(r, identical(mx[j], std::max(f[i], f[j])));
            }
        });
    }
}
// Round-trips a small table of values between 16-bit half floats and 32-bit floats
// using from_half() and to_half(), expecting exact matches in both directions.
DEF_TEST(SkVM_halfs, r) {
    // hs[i] is the half-float bit pattern paired with the float fs[i].
    const uint16_t hs[] = {0x0000,0x3800,0x3c00,0x4000,
                           0xc400,0xb800,0xbc00,0xc000};
    const float fs[] = {+0.0f,+0.5f,+1.0f,+2.0f,
                        -4.0f,-0.5f,-1.0f,-2.0f};

    // half -> float
    {
        skvm::Builder b;
        skvm::Arg halfPtr  = b.varying<uint16_t>();
        skvm::Arg floatPtr = b.varying<float>();
        b.storeF(floatPtr, b.from_half(b.load16(halfPtr)));

        test_jit_and_interpreter(b.done(), [&](const skvm::Program& program){
            float dst[8];
            program.eval(8, hs, dst);
            for (int i = 0; i < 8; ++i) {
                REPORTER_ASSERT(r, dst[i] == fs[i]);
            }
        });
    }

    // float -> half
    {
        skvm::Builder b;
        skvm::Arg floatPtr = b.varying<float>();
        skvm::Arg halfPtr  = b.varying<uint16_t>();
        b.store16(halfPtr, b.to_half(b.loadF(floatPtr)));

        test_jit_and_interpreter(b.done(), [&](const skvm::Program& program){
            uint16_t dst[8];
            program.eval(8, fs, dst);
            for (int i = 0; i < 8; ++i) {
                REPORTER_ASSERT(r, dst[i] == hs[i]);
            }
        });
    }
}
// Tests load64() and store64(): splitting 64-bit values into their low and high
// 32-bit halves, and assembling 64-bit values back from two 32-bit halves.
DEF_TEST(SkVM_64bit, r) {
    uint32_t lo[65],
             hi[65];
    uint64_t wide[65];

    // wide[i] holds lo[i] in its low 32 bits and hi[i] in its high 32 bits.
    for (int i = 0; i < 65; ++i) {
        lo[i] = 2*i+0;
        hi[i] = 2*i+1;
        wide[i] = ((uint64_t)hi[i] << 32)
                | ((uint64_t)lo[i] <<  0);
    }

    // Split: read each 64-bit value and write its two halves out separately.
    {
        skvm::Builder b;
        {
            skvm::Arg widePtr = b.varying<uint64_t>();
            skvm::Arg loPtr   = b.varying<int>();
            skvm::Arg hiPtr   = b.varying<int>();

            b.store32(loPtr, b.load64(widePtr, 0));
            b.store32(hiPtr, b.load64(widePtr, 1));
        }

        test_jit_and_interpreter(b.done(), [&](const skvm::Program& program){
            uint32_t l[65], h[65];
            program.eval(65, wide,l,h);
            for (int i = 0; i < 65; ++i) {
                REPORTER_ASSERT(r, l[i] == lo[i]);
                REPORTER_ASSERT(r, h[i] == hi[i]);
            }
        });
    }

    // Join: read the two halves and write them out as one 64-bit value.
    {
        skvm::Builder b;
        {
            skvm::Arg widePtr = b.varying<uint64_t>();
            skvm::Arg loPtr   = b.varying<int>();
            skvm::Arg hiPtr   = b.varying<int>();

            b.store64(widePtr, b.load32(loPtr), b.load32(hiPtr));
        }

        test_jit_and_interpreter(b.done(), [&](const skvm::Program& program){
            uint64_t w[65];
            program.eval(65, w,lo,hi);
            for (int i = 0; i < 65; ++i) {
                REPORTER_ASSERT(r, w[i] == wide[i]);
            }
        });
    }
}
// Tests is_NaN() and is_finite() over NaN, zero, one, and infinity of both signs,
// expecting an all-ones mask where the predicate holds and zero where it doesn't.
DEF_TEST(SkVM_is_NaN_is_finite, r) {
    skvm::Builder b;
    {
        skvm::Arg srcPtr = b.varying<float>();
        skvm::Arg nanPtr = b.varying<int>();
        skvm::Arg finPtr = b.varying<int>();

        b.store32(nanPtr, is_NaN   (b.loadF(srcPtr)));
        b.store32(finPtr, is_finite(b.loadF(srcPtr)));
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program){
        // ±NaN, ±0, ±1, ±inf
        const uint32_t bits[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
                                 0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};

        uint32_t nan[8],
                 fin[8];
        program.eval(8, bits, nan,fin);

        // Indices 0-1 are the NaNs; indices 2-5 (±0 and ±1) are the finite values.
        for (int i = 0; i < 8; ++i) {
            REPORTER_ASSERT(r, nan[i] == ((i == 0 || i == 1) ? 0xffffffff : 0));
            REPORTER_ASSERT(r, fin[i] == ((i == 2 || i == 3 ||
                                           i == 4 || i == 5) ? 0xffffffff : 0));
        }
    });
}
// Builds a program taking six pointer arguments (one output, five inputs) and
// checks that it sums the five inputs into the output.
DEF_TEST(SkVM_args, r) {
    // Test we can handle at least six arguments.
    skvm::Builder b;
    {
        skvm::Arg dst = b.varying<float>();
        skvm::Arg A   = b.varying<float>();
        skvm::Arg B   = b.varying<float>();
        skvm::Arg C   = b.varying<float>();
        skvm::Arg D   = b.varying<float>();
        skvm::Arg E   = b.varying<float>();

        // Accumulate left to right, the same association as A+B+C+D+E.
        skvm::F32 sum = b.loadF(A);
        sum = sum + b.loadF(B);
        sum = sum + b.loadF(C);
        sum = sum + b.loadF(D);
        sum = sum + b.loadF(E);

        storeF(dst, sum);
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program){
        float dst[17],A[17],B[17],C[17],D[17],E[17];

        // Every input holds its own index, so each output should be five times the index.
        for (int i = 0; i < 17; ++i) {
            const float v = (float)i;
            A[i] = v;
            B[i] = v;
            C[i] = v;
            D[i] = v;
            E[i] = v;
        }

        program.eval(17, dst,A,B,C,D,E);

        for (int i = 0; i < 17; ++i) {
            REPORTER_ASSERT(r, dst[i] == 5.0f*i);
        }
    });
}
q14 rethink I've been thinking and rethinking and rethinking how best to use 16-bit values like Q14 fixed-point in SkVM. Here are some ways: A) don't... just use 32-bit values instead B) use 16x2-bit pairs to match the narrower 32-bit lane count C) double-pump 32-bit values to match the wider 16-bit lane count D) use native 16- and 32-bit values and let the backends sort it out A) is how things work today, and C) is how SkRasterPipeline's lowp mode works. Having tried out B) and C) both for a good fair shake, they were both already awkward to work with after writing just a few functions. I would not give up on them entirely, but they're no longer my favorites. D) is subtle and my new favorite. It's easiest to program with SkVM when the values we're holding represent single values and the backend handles any parallelism for us. That suggests we add a simple 16-bit Q14 to the existing 32-bit I32 and F32 types, where they can be actively converted between as normal, but not freely no-op bit punned. D) says we shouldn't have to choose between A-C) up front... each backend can handle it themselves. Under strategy D), it's entirely the backend's job to decide how to represent each value, and how to vectorize them. We don't need to know as a user, and the backends can use the program itself to inform how they vectorize. 16-bit values could live in xmm registers and 32-bit values in ymm, or the 16-bit values could go in the low half of a ymm, or the even lanes of a ymm, or a full ymm and use two for 32-bit values, etc. etc. This all is a backend choice, not something we should have to know about when writing a program using Q14/I32/F32. My next steps are to get Q14 operations tested and plumbed through the JIT again, and to build out a blitter and a few effects using Q14 color channels. Then, independently, we can look at each backend and how to vectorize them. 
Some ideas: 1) keep running at current vectorization, with half rate 16-bit ops 2) pump up to 2x wider vectorization unconditionally to favor 16-bit 3) pump up to 2x wider vectorization only when any 16-bit op is used These choices can be made independently for each backend (JIT, LLVM, interp), and I wouldn't be surprised to find that we'll want to do them differently. For instance, the interpreter is already running at 32x vectorization... might be pumping it higher won't help anything. Change-Id: Ib8ad2b1bf790e8c4e3acfb4818d4032f7628e8f8 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/319321 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Mike Reed <reed@google.com>
2020-09-24 16:17:22 +00:00
DEF_TEST(SkVM_Q14, r) {
// Some nice round Q14 test values from -1.0 (0xc000) to +1.0 (0x4000) by 16ths (0x0400).
const uint16_t src[] = {
0xc000, 0xc400, 0xc800, 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00,
0xe000, 0xe400, 0xe800, 0xec00, 0xf000, 0xf400, 0xf800, 0xfc00,
0x0000,
0x0400, 0x0800, 0x0c00, 0x1000, 0x1400, 0x1800, 0x1c00, 0x2000,
0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, 0x3800, 0x3c00, 0x4000,
};
// These test cases are essentially mechanically generated to get coverage...
// I've spot checked here and there and things seem correct, but I wouldn't
// be surprised to find that there were bugs. Using nice round numbers to
// avoid having to think about low-bit precision for now.
struct {
q14 rethink I've been thinking and rethinking and rethinking how best to use 16-bit values like Q14 fixed-point in SkVM. Here's some ways: A) don't... just use 32-bit values instead B) use 16x2-bit pairs to match the narrower 32-bit lane count C) double-pump 32-bit values to match the wider 16-bit lane count D) use native 16- and 32-bit values and let the backends sort it out A) is how things work today, and C) is how SkRasterPipeline's lowp mode works. Having tried out B) and C) both for a good fair shake, they were both already awkward to work with after writing just a few functions. I would not give up on them entirely, but they're no longer my favorites. D) is subtle and my new favorite. It's easiest to program with SkVM when the values we're holding represent single values and the backend handles any parallelism for us. That suggests we add a simple 16-bit Q14 to the existing 32-bit I32 and F32 types, where they can be actively converted between as normal, but not freely no-op bit punned. D) says we people shouldn't have to choose between A-C) up front... each backend can handle it themselves. Under strategy D), it's entirely the backend's job to decide how to represent each value, and how to to vectorize them. We don't need to know as a user, and the backends can use the program itself to inform how they vectorize. 16-bit values could live in xmm registers and 32-bit values in ymm, or the 16-bit values could go in the low half of a ymm, or the even lanes of a ymm, or a full ymm and use two for 32-bit values, etc. etc. This all is a backend choice, not something we should have to know about when writing a program using Q14/I32/F32. My next steps are to get Q14 operations tested and plumbed through the JIT again, and to build out a blitter and a few effects using Q14 color channels. Then, independently, we can look at each backend and how to vectorize them. 
Some ideas: 1) keep running at current vectorization, with half rate 16-bit ops 2) pump up to 2x wider vectorization unconditionally to favor 16-bit 3) pump up to 2x wider vectorization only when any 16-bit op is used These choices can be made independently for each backend (JIT, LLVM, interp), and I wouldn't be surprised to find that we'll want to do them differently. For instance, the interpreter is already running at 32x vectorization... might be pumping it higher won't help anything. Change-Id: Ib8ad2b1bf790e8c4e3acfb4818d4032f7628e8f8 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/319321 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Mike Reed <reed@google.com>
2020-09-24 16:17:22 +00:00
skvm::Q14 (*fn)(skvm::Q14);
uint16_t expected[33];
} cases[] = {
q14 rethink I've been thinking and rethinking and rethinking how best to use 16-bit values like Q14 fixed-point in SkVM. Here's some ways: A) don't... just use 32-bit values instead B) use 16x2-bit pairs to match the narrower 32-bit lane count C) double-pump 32-bit values to match the wider 16-bit lane count D) use native 16- and 32-bit values and let the backends sort it out A) is how things work today, and C) is how SkRasterPipeline's lowp mode works. Having tried out B) and C) both for a good fair shake, they were both already awkward to work with after writing just a few functions. I would not give up on them entirely, but they're no longer my favorites. D) is subtle and my new favorite. It's easiest to program with SkVM when the values we're holding represent single values and the backend handles any parallelism for us. That suggests we add a simple 16-bit Q14 to the existing 32-bit I32 and F32 types, where they can be actively converted between as normal, but not freely no-op bit punned. D) says we people shouldn't have to choose between A-C) up front... each backend can handle it themselves. Under strategy D), it's entirely the backend's job to decide how to represent each value, and how to to vectorize them. We don't need to know as a user, and the backends can use the program itself to inform how they vectorize. 16-bit values could live in xmm registers and 32-bit values in ymm, or the 16-bit values could go in the low half of a ymm, or the even lanes of a ymm, or a full ymm and use two for 32-bit values, etc. etc. This all is a backend choice, not something we should have to know about when writing a program using Q14/I32/F32. My next steps are to get Q14 operations tested and plumbed through the JIT again, and to build out a blitter and a few effects using Q14 color channels. Then, independently, we can look at each backend and how to vectorize them. 
Some ideas: 1) keep running at current vectorization, with half rate 16-bit ops 2) pump up to 2x wider vectorization unconditionally to favor 16-bit 3) pump up to 2x wider vectorization only when any 16-bit op is used These choices can be made independently for each backend (JIT, LLVM, interp), and I wouldn't be surprised to find that we'll want to do them differently. For instance, the interpreter is already running at 32x vectorization... might be pumping it higher won't help anything. Change-Id: Ib8ad2b1bf790e8c4e3acfb4818d4032f7628e8f8 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/319321 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Mike Reed <reed@google.com>
2020-09-24 16:17:22 +00:00
{[](skvm::Q14 x) { return x; }, // Just double checking the test harness works.
{0xc000, 0xc400, 0xc800, 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00,
0xe000, 0xe400, 0xe800, 0xec00, 0xf000, 0xf400, 0xf800, 0xfc00,
0x0000,
0x0400, 0x0800, 0x0c00, 0x1000, 0x1400, 0x1800, 0x1c00, 0x2000,
0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, 0x3800, 0x3c00, 0x4000}},
{[](skvm::Q14 x) { return x*x; }, // square ±1/16 (0x0400) -> 1/256 (0x0040), etc.
{0x4000, 0x3840, 0x3100, 0x2a40, 0x2400, 0x1e40, 0x1900, 0x1440,
0x1000, 0x0c40, 0x0900, 0x0640, 0x0400, 0x0240, 0x0100, 0x0040,
0x0000,
0x0040, 0x0100, 0x0240, 0x0400, 0x0640, 0x0900, 0x0c40, 0x1000,
0x1440, 0x1900, 0x1e40, 0x2400, 0x2a40, 0x3100, 0x3840, 0x4000}},
{[](skvm::Q14 x) { return -(x*-x); }, // square, version B
{0x4000, 0x3840, 0x3100, 0x2a40, 0x2400, 0x1e40, 0x1900, 0x1440,
0x1000, 0x0c40, 0x0900, 0x0640, 0x0400, 0x0240, 0x0100, 0x0040,
0x0000,
0x0040, 0x0100, 0x0240, 0x0400, 0x0640, 0x0900, 0x0c40, 0x1000,
0x1440, 0x1900, 0x1e40, 0x2400, 0x2a40, 0x3100, 0x3840, 0x4000}},
{[](skvm::Q14 x) { return x>>1; }, // divide by 2
{0xe000, 0xe200, 0xe400, 0xe600, 0xe800, 0xea00, 0xec00, 0xee00,
0xf000, 0xf200, 0xf400, 0xf600, 0xf800, 0xfa00, 0xfc00, 0xfe00,
0x0000,
0x0200, 0x0400, 0x0600, 0x0800, 0x0a00, 0x0c00, 0x0e00, 0x1000,
0x1200, 0x1400, 0x1600, 0x1800, 0x1a00, 0x1c00, 0x1e00, 0x2000}},
{[](skvm::Q14 x) { return shr(x,1); }, // logical shift by 1
{0x6000, 0x6200, 0x6400, 0x6600, 0x6800, 0x6a00, 0x6c00, 0x6e00,
0x7000, 0x7200, 0x7400, 0x7600, 0x7800, 0x7a00, 0x7c00, 0x7e00,
0x0000,
0x0200, 0x0400, 0x0600, 0x0800, 0x0a00, 0x0c00, 0x0e00, 0x1000,
0x1200, 0x1400, 0x1600, 0x1800, 0x1a00, 0x1c00, 0x1e00, 0x2000}},
{[](skvm::Q14 x) { return x - (x>>2); }, // 3/4 x, version A
{0xd000, 0xd300, 0xd600, 0xd900, 0xdc00, 0xdf00, 0xe200, 0xe500,
0xe800, 0xeb00, 0xee00, 0xf100, 0xf400, 0xf700, 0xfa00, 0xfd00,
0x0000,
0x0300, 0x0600, 0x0900, 0x0c00, 0x0f00, 0x1200, 0x1500, 0x1800,
0x1b00, 0x1e00, 0x2100, 0x2400, 0x2700, 0x2a00, 0x2d00, 0x3000}},
{[](skvm::Q14 x) { return (x>>1) + (x>>2); }, // 3/4 x, version B
{0xd000, 0xd300, 0xd600, 0xd900, 0xdc00, 0xdf00, 0xe200, 0xe500,
0xe800, 0xeb00, 0xee00, 0xf100, 0xf400, 0xf700, 0xfa00, 0xfd00,
0x0000,
0x0300, 0x0600, 0x0900, 0x0c00, 0x0f00, 0x1200, 0x1500, 0x1800,
0x1b00, 0x1e00, 0x2100, 0x2400, 0x2700, 0x2a00, 0x2d00, 0x3000}},
{[](skvm::Q14 x) { return ((x>>2) + (x>>3))<<1; }, // 3/4 x, version C
{0xd000, 0xd300, 0xd600, 0xd900, 0xdc00, 0xdf00, 0xe200, 0xe500,
0xe800, 0xeb00, 0xee00, 0xf100, 0xf400, 0xf700, 0xfa00, 0xfd00,
0x0000,
0x0300, 0x0600, 0x0900, 0x0c00, 0x0f00, 0x1200, 0x1500, 0x1800,
0x1b00, 0x1e00, 0x2100, 0x2400, 0x2700, 0x2a00, 0x2d00, 0x3000}},
// TODO: I'm not sure if this one is working correctly or not. Should only work for >=0?
q14 rethink I've been thinking and rethinking and rethinking how best to use 16-bit values like Q14 fixed-point in SkVM. Here's some ways: A) don't... just use 32-bit values instead B) use 16x2-bit pairs to match the narrower 32-bit lane count C) double-pump 32-bit values to match the wider 16-bit lane count D) use native 16- and 32-bit values and let the backends sort it out A) is how things work today, and C) is how SkRasterPipeline's lowp mode works. Having tried out B) and C) both for a good fair shake, they were both already awkward to work with after writing just a few functions. I would not give up on them entirely, but they're no longer my favorites. D) is subtle and my new favorite. It's easiest to program with SkVM when the values we're holding represent single values and the backend handles any parallelism for us. That suggests we add a simple 16-bit Q14 to the existing 32-bit I32 and F32 types, where they can be actively converted between as normal, but not freely no-op bit punned. D) says we people shouldn't have to choose between A-C) up front... each backend can handle it themselves. Under strategy D), it's entirely the backend's job to decide how to represent each value, and how to to vectorize them. We don't need to know as a user, and the backends can use the program itself to inform how they vectorize. 16-bit values could live in xmm registers and 32-bit values in ymm, or the 16-bit values could go in the low half of a ymm, or the even lanes of a ymm, or a full ymm and use two for 32-bit values, etc. etc. This all is a backend choice, not something we should have to know about when writing a program using Q14/I32/F32. My next steps are to get Q14 operations tested and plumbed through the JIT again, and to build out a blitter and a few effects using Q14 color channels. Then, independently, we can look at each backend and how to vectorize them. 
Some ideas: 1) keep running at current vectorization, with half rate 16-bit ops 2) pump up to 2x wider vectorization unconditionally to favor 16-bit 3) pump up to 2x wider vectorization only when any 16-bit op is used These choices can be made independently for each backend (JIT, LLVM, interp), and I wouldn't be surprised to find that we'll want to do them differently. For instance, the interpreter is already running at 32x vectorization... might be pumping it higher won't help anything. Change-Id: Ib8ad2b1bf790e8c4e3acfb4818d4032f7628e8f8 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/319321 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Mike Reed <reed@google.com>
2020-09-24 16:17:22 +00:00
{[](skvm::Q14 x) { return unsigned_avg(x, x>>1); }, // 3/4 x, version D
{0xd000, 0xd300, 0xd600, 0xd900, 0xdc00, 0xdf00, 0xe200, 0xe500,
0xe800, 0xeb00, 0xee00, 0xf100, 0xf400, 0xf700, 0xfa00, 0xfd00,
0x0000,
0x0300, 0x0600, 0x0900, 0x0c00, 0x0f00, 0x1200, 0x1500, 0x1800,
0x1b00, 0x1e00, 0x2100, 0x2400, 0x2700, 0x2a00, 0x2d00, 0x3000}},
{[](skvm::Q14 x) { return min(x, +0.5f); }, // clamp down to 0x2000, version A
{0xc000, 0xc400, 0xc800, 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00,
0xe000, 0xe400, 0xe800, 0xec00, 0xf000, 0xf400, 0xf800, 0xfc00,
0x0000,
0x0400, 0x0800, 0x0c00, 0x1000, 0x1400, 0x1800, 0x1c00, 0x2000,
0x2000, 0x2000, 0x2000, 0x2000, 0x2000, 0x2000, 0x2000, 0x2000}},
{[](skvm::Q14 x) { return select(x < +0.5f, x, +0.5f); }, // clamp down to 0x2000, vB
{0xc000, 0xc400, 0xc800, 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00,
0xe000, 0xe400, 0xe800, 0xec00, 0xf000, 0xf400, 0xf800, 0xfc00,
0x0000,
0x0400, 0x0800, 0x0c00, 0x1000, 0x1400, 0x1800, 0x1c00, 0x2000,
0x2000, 0x2000, 0x2000, 0x2000, 0x2000, 0x2000, 0x2000, 0x2000}},
{[](skvm::Q14 x) { return select(x == 1.0f, 0.5f, x); },
{0xc000, 0xc400, 0xc800, 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00,
0xe000, 0xe400, 0xe800, 0xec00, 0xf000, 0xf400, 0xf800, 0xfc00,
0x0000,
0x0400, 0x0800, 0x0c00, 0x1000, 0x1400, 0x1800, 0x1c00, 0x2000,
0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, 0x3800, 0x3c00, 0x2000}},
{[](skvm::Q14 x) { return max(x, -0.5f); }, // clamp up to 0xe000
{0xe000, 0xe000, 0xe000, 0xe000, 0xe000, 0xe000, 0xe000, 0xe000,
0xe000, 0xe400, 0xe800, 0xec00, 0xf000, 0xf400, 0xf800, 0xfc00,
0x0000,
0x0400, 0x0800, 0x0c00, 0x1000, 0x1400, 0x1800, 0x1c00, 0x2000,
0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, 0x3800, 0x3c00, 0x4000}},
};
for (const auto& test : cases) {
skvm::Builder b;
{
q14 rethink I've been thinking and rethinking and rethinking how best to use 16-bit values like Q14 fixed-point in SkVM. Here's some ways: A) don't... just use 32-bit values instead B) use 16x2-bit pairs to match the narrower 32-bit lane count C) double-pump 32-bit values to match the wider 16-bit lane count D) use native 16- and 32-bit values and let the backends sort it out A) is how things work today, and C) is how SkRasterPipeline's lowp mode works. Having tried out B) and C) both for a good fair shake, they were both already awkward to work with after writing just a few functions. I would not give up on them entirely, but they're no longer my favorites. D) is subtle and my new favorite. It's easiest to program with SkVM when the values we're holding represent single values and the backend handles any parallelism for us. That suggests we add a simple 16-bit Q14 to the existing 32-bit I32 and F32 types, where they can be actively converted between as normal, but not freely no-op bit punned. D) says we people shouldn't have to choose between A-C) up front... each backend can handle it themselves. Under strategy D), it's entirely the backend's job to decide how to represent each value, and how to to vectorize them. We don't need to know as a user, and the backends can use the program itself to inform how they vectorize. 16-bit values could live in xmm registers and 32-bit values in ymm, or the 16-bit values could go in the low half of a ymm, or the even lanes of a ymm, or a full ymm and use two for 32-bit values, etc. etc. This all is a backend choice, not something we should have to know about when writing a program using Q14/I32/F32. My next steps are to get Q14 operations tested and plumbed through the JIT again, and to build out a blitter and a few effects using Q14 color channels. Then, independently, we can look at each backend and how to vectorize them. 
Some ideas: 1) keep running at current vectorization, with half rate 16-bit ops 2) pump up to 2x wider vectorization unconditionally to favor 16-bit 3) pump up to 2x wider vectorization only when any 16-bit op is used These choices can be made independently for each backend (JIT, LLVM, interp), and I wouldn't be surprised to find that we'll want to do them differently. For instance, the interpreter is already running at 32x vectorization... might be pumping it higher won't help anything. Change-Id: Ib8ad2b1bf790e8c4e3acfb4818d4032f7628e8f8 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/319321 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Mike Reed <reed@google.com>
2020-09-24 16:17:22 +00:00
skvm::Arg dst = b.varying<uint16_t>(),
src = b.varying<uint16_t>();
skvm::Q14 x = to_Q14(b.load16(src));
store16(dst, to_I32(test.fn(x)));
}
test_jit_and_interpreter(b.done(), [&](const skvm::Program& program){
q14 rethink I've been thinking and rethinking and rethinking how best to use 16-bit values like Q14 fixed-point in SkVM. Here's some ways: A) don't... just use 32-bit values instead B) use 16x2-bit pairs to match the narrower 32-bit lane count C) double-pump 32-bit values to match the wider 16-bit lane count D) use native 16- and 32-bit values and let the backends sort it out A) is how things work today, and C) is how SkRasterPipeline's lowp mode works. Having tried out B) and C) both for a good fair shake, they were both already awkward to work with after writing just a few functions. I would not give up on them entirely, but they're no longer my favorites. D) is subtle and my new favorite. It's easiest to program with SkVM when the values we're holding represent single values and the backend handles any parallelism for us. That suggests we add a simple 16-bit Q14 to the existing 32-bit I32 and F32 types, where they can be actively converted between as normal, but not freely no-op bit punned. D) says we people shouldn't have to choose between A-C) up front... each backend can handle it themselves. Under strategy D), it's entirely the backend's job to decide how to represent each value, and how to to vectorize them. We don't need to know as a user, and the backends can use the program itself to inform how they vectorize. 16-bit values could live in xmm registers and 32-bit values in ymm, or the 16-bit values could go in the low half of a ymm, or the even lanes of a ymm, or a full ymm and use two for 32-bit values, etc. etc. This all is a backend choice, not something we should have to know about when writing a program using Q14/I32/F32. My next steps are to get Q14 operations tested and plumbed through the JIT again, and to build out a blitter and a few effects using Q14 color channels. Then, independently, we can look at each backend and how to vectorize them. 
Some ideas: 1) keep running at current vectorization, with half rate 16-bit ops 2) pump up to 2x wider vectorization unconditionally to favor 16-bit 3) pump up to 2x wider vectorization only when any 16-bit op is used These choices can be made independently for each backend (JIT, LLVM, interp), and I wouldn't be surprised to find that we'll want to do them differently. For instance, the interpreter is already running at 32x vectorization... might be pumping it higher won't help anything. Change-Id: Ib8ad2b1bf790e8c4e3acfb4818d4032f7628e8f8 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/319321 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Mike Reed <reed@google.com>
2020-09-24 16:17:22 +00:00
uint16_t dst[33];
program.eval(33, dst,src);
for (int i = 0; i < 33; i++) {
if (test.expected[32]) {
REPORTER_ASSERT(r, test.expected[i] == dst[i]);
} else {
q14 rethink I've been thinking and rethinking and rethinking how best to use 16-bit values like Q14 fixed-point in SkVM. Here's some ways: A) don't... just use 32-bit values instead B) use 16x2-bit pairs to match the narrower 32-bit lane count C) double-pump 32-bit values to match the wider 16-bit lane count D) use native 16- and 32-bit values and let the backends sort it out A) is how things work today, and C) is how SkRasterPipeline's lowp mode works. Having tried out B) and C) both for a good fair shake, they were both already awkward to work with after writing just a few functions. I would not give up on them entirely, but they're no longer my favorites. D) is subtle and my new favorite. It's easiest to program with SkVM when the values we're holding represent single values and the backend handles any parallelism for us. That suggests we add a simple 16-bit Q14 to the existing 32-bit I32 and F32 types, where they can be actively converted between as normal, but not freely no-op bit punned. D) says we people shouldn't have to choose between A-C) up front... each backend can handle it themselves. Under strategy D), it's entirely the backend's job to decide how to represent each value, and how to to vectorize them. We don't need to know as a user, and the backends can use the program itself to inform how they vectorize. 16-bit values could live in xmm registers and 32-bit values in ymm, or the 16-bit values could go in the low half of a ymm, or the even lanes of a ymm, or a full ymm and use two for 32-bit values, etc. etc. This all is a backend choice, not something we should have to know about when writing a program using Q14/I32/F32. My next steps are to get Q14 operations tested and plumbed through the JIT again, and to build out a blitter and a few effects using Q14 color channels. Then, independently, we can look at each backend and how to vectorize them. 
Some ideas: 1) keep running at current vectorization, with half rate 16-bit ops 2) pump up to 2x wider vectorization unconditionally to favor 16-bit 3) pump up to 2x wider vectorization only when any 16-bit op is used These choices can be made independently for each backend (JIT, LLVM, interp), and I wouldn't be surprised to find that we'll want to do them differently. For instance, the interpreter is already running at 32x vectorization... might be pumping it higher won't help anything. Change-Id: Ib8ad2b1bf790e8c4e3acfb4818d4032f7628e8f8 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/319321 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Mike Reed <reed@google.com>
2020-09-24 16:17:22 +00:00
if (i == 0 || i == 8 || i == 16 || i == 17 || i == 25) SkDebugf("\n");
SkDebugf("0x%04x, ", dst[i]);
}
}
});
}
}
remove Op::pack pack(x,y,bits) as an alias for x|(y<<bits) only existed originally to implement it with the SLI arm64 instruction, but I've since realized that was misguided. I had thought the assumption on pack ("(x & (y << bits)) == 0"), i.e. "no overlap between x and the shifted y", was enough to make using SLI legal, but it's actually not strong enough a requirement. The SLI docs say "...inserts the result into the corresponding vector element in the destination SIMD&FP register such that the new zero bits created by the shift are not inserted but retain their existing value." The key thing not mentioned there happens with zero bits _not_ created by the shift, the ones already present at the top of y. They're of course inserted, overwriting any previous values. This means SLI (and so pack()) become strictly order dependent in a way I had never intended. This will work as you'd think, skvm::I32 px = splat(0); px = pack(px, r, 0); px = pack(px, a, 24); but this version swapping the two calls to pack() will overwrite alpha, skvm::I32 px = splat(0); px = pack(px, a, 24); px = pack(px, r, 0); I find that error-prone, so I've removed Op::pack and replaced it with a simple expansion to x|(y<<bits). That of course works in either order. This new test can't JIT at head, but if we implement the other missing instructions (soon, dependent CL) it would start failing when JIT'd. The interpreter and x86 were both fine, since they're both doing what's now the only approach to pack(), the simple x|(y<<bits). I've left assembler support for SLI in case we want to try it again. Change-Id: Iaf879309d3e1d0a458a688f3a62556e55ab05e23 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/337197 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-11-20 21:34:16 +00:00
// Regression test for pack(): two values are packed into one 16-bit pixel, the higher-shifted
// one first, and every stored pixel is expected to come out as 0xf00f.
DEF_TEST(SkVM_badpack, r) {
    // Test case distilled from actual failing draw,
    // originally with a bad arm64 implementation of pack().
    skvm::Builder builder;
    {
        // Declared in the same order the buffers are handed to eval() below.
        skvm::Arg uniformArg = builder.uniform();
        skvm::Arg dstArg     = builder.varying<uint16_t>();

        // Red comes from the 32-bit uniform at byte offset 8, scaled by 15 and rounded;
        // alpha is the constant 0xf.
        skvm::I32 red   = round(bit_cast(builder.uniform32(uniformArg, 8)) * 15);
        skvm::I32 alpha = builder.splat(0xf);

        // Pack red at shift 12 before alpha at shift 0.
        skvm::I32 px = builder.splat(0);
        px = pack(px, red,   12);
        px = pack(px, alpha,  0);
        store16(dstArg, px);
    }

    test_jit_and_interpreter(builder.done(), [&](const skvm::Program& program){
        // The float at byte offset 8 (third entry) is 1.0f.
        const float uniformData[] = { 0.0f, 0.0f,
                                      1.0f, 0.0f, 0.0f, 1.0f };
        uint16_t pixels[17] = {0};
        program.eval(17, uniformData,pixels);
        for (uint16_t got : pixels) {
            REPORTER_ASSERT(r, got == 0xf00f, "got %04x, want %04x\n", got, 0xf00f);
        }
    });
}