skia2/tests/SkVMTest.cpp

2898 lines
92 KiB
C++
Raw Normal View History

/*
* Copyright 2019 Google LLC
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "include/core/SkColorPriv.h"
#include "include/private/SkColorData.h"
#include "src/core/SkCpu.h"
#include "src/core/SkMSAN.h"
#include "src/core/SkVM.h"
#include "src/gpu/ganesh/GrShaderCaps.h"
#include "src/sksl/SkSLCompiler.h"
#include "src/sksl/codegen/SkSLVMCodeGenerator.h"
#include "src/sksl/tracing/SkVMDebugTrace.h"
#include "src/utils/SkVMVisualizer.h"
#include "tests/Test.h"
template <typename Fn>
static void test_jit_and_interpreter(const skvm::Builder& b, Fn&& test) {
skvm::Program p = b.done();
test(p);
if (p.hasJIT()) {
test(b.done(/*debug_name=*/nullptr, /*allow_jit=*/false));
Reland "mark which SkVM tests should JIT or not" This is a reland of 52435503e992cbeb388d90c51f74515ab1e11c96 with better checks for when we should expect JIT and not. Original change's description: > mark which SkVM tests should JIT or not > > Most of these tests converted over to test_interpreter_only() > are failing to JIT because of unimplemented instructions. No > bug there, just TODOs. > > But SkVM_hoist _should_ be JITting. A while back I landed a CL > that messed with value lifetimes that prevents it from JITting. > Will be using this as a regression test to fix that bug. > > Change-Id: Id2034f6548a45ed9aeb9ae3cbb24d389cad7dc60 > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/248980 > Commit-Queue: Mike Klein <mtklein@google.com> > Commit-Queue: Ethan Nicholas <ethannicholas@google.com> > Auto-Submit: Mike Klein <mtklein@google.com> > Reviewed-by: Ethan Nicholas <ethannicholas@google.com> > Reviewed-by: Herb Derby <herb@google.com> Cq-Include-Trybots: skia.primary:Test-Android-Clang-NVIDIA_Shield-CPU-TegraX1-arm64-Release-All-Android,Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-All-SK_CPU_LIMIT_SSE2,Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-All-SK_CPU_LIMIT_SSE41,Test-Mac10.13-Clang-VMware7.1-CPU-AVX-x86_64-Debug-All-NativeFonts,Test-Mac10.14-Clang-VMware7.1-CPU-AVX-x86_64-Debug-All-NativeFonts Change-Id: Id7bde7e879649e435fa424a9c9d6c51a31afd5e9 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/248990 Reviewed-by: Mike Klein <mtklein@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2019-10-16 15:11:56 +00:00
}
}
DEF_TEST(SkVM_eliminate_dead_code, r) {
skvm::Builder b;
{
skvm::Ptr arg = b.varying<int>();
skvm::I32 l = b.load32(arg);
skvm::I32 a = b.add(l, l);
b.add(a, b.splat(7));
}
std::vector<skvm::Instruction> program = b.program();
REPORTER_ASSERT(r, program.size() == 4);
program = skvm::eliminate_dead_code(program);
REPORTER_ASSERT(r, program.size() == 0);
}
DEF_TEST(SkVM_Pointless, r) {
// Let's build a program with no memory arguments.
// It should all be pegged as dead code, but we should be able to "run" it.
skvm::Builder b;
{
b.add(b.splat(5.0f),
b.splat(4.0f));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
for (int N = 0; N < 64; N++) {
program.eval(N);
}
});
refactor out a middle representation Kind of brewing a big refactor here, to give me some room between skvm::Builder and skvm::Program to do optimizations, bakend specializations and analysis. As a warmup, I'm trying to split up today's Builder::Instruction into two forms, first just what the user requested in Builder (this stays Builder::Instruction) then a new type representing any transformation or analysis we've done to it (OptimizedInstruction). Roughly six important optimizations happen in SkVM today, in this order: 1) constant folding 2) backend-specific instruction specialization 3) common sub-expression elimination 4) reordering + dead code elimination 5) loop invariant and lifetime analysis 6) register assignment At head 1-5 all happen in Builder, and 2 is particularly awkward to have there (e.g. mul_f32 -> mul_f32_imm). 6 happens in Program per-backend, and that seems healthy. As of this CL, 1-3 happen in Builder, 4-5 now on this middle OptimizedInstruction format, and 6 still in Program. I'd like to get to the point where 1 stays in Builder, 2-5 all happen on this middle IR, and 6 stays in Program. That ought to let me do things like turn mul_f32 -> mul_f32_imm when it's good to and still benefit from things like common sub-expression elimination and code reordering happening after that trnasformation. And then, I hope that's also a good spot to do more complicated transformations, like lowering gather8 into gather32 plus some fix up when targeting an x86 JIT but not anywhere else. Today's Builder is too early to know whether we should do this or not, and in Program it's actually kind of awkward to do this sort of thing while also doing having to do register assignment. Some middle might be right. Change-Id: I9c00268a084f07fbab88d05eb441f1957a0d7c67 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/269181 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-02-06 19:02:32 +00:00
for (const skvm::OptimizedInstruction& inst : b.optimize()) {
add used_in_loop bit to skvm::Builder::Instruction Most hoisted values are used in the loop body (and that's really the whole point of hoisting) but some are just temporaries to help produce other hoisted values. This used_in_loop bit helps us distinguish the two, and lets us recycle registers holding temporary hoisted values not used in the loop. The can-we-recycle logic now becomes: - is this a real value? - is it time for it to die? - is it either not hoisted or a hoisted temporary? The set-death-to-infinity approach for hoisted values is now gone. That worked great for hoisted values used inside the loop, but was too conservative for hoisted temporaries. This lifetime extension was preventing us from recycling those registers, pinning enough registers that we run out and fail to JIT. Small amounts of refactoring to make this clearer: - move the Instruction hash function definition near its operator== - rename the two "hoist" variables to "can_hoist" for Instructions and "try_hoisting" for the JIT approach - add ↟ to mark hoisted temporaries, _really_ hoisted values. There's some redundancy here between tracking the can_hoist bit, the used_in_loop bit, and lifetime tracking. I think it should be true, for instance, that !can_hoist && !used_in_loop implies an instruction is dead code. I plan to continue refactoring lifetime analysis (in particular reordering instructions to decrease register pressure) so hopefully by the time I'm done that metadata will shake out a little crisper. Change-Id: I6460ca96d1cbec0315bed3c9a0774cd88ab5be26 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/248986 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Herb Derby <herb@google.com>
2019-10-16 15:46:01 +00:00
REPORTER_ASSERT(r, inst.death == 0 && inst.can_hoist == true);
}
}
DEF_TEST(SkVM_memset, r) {
skvm::Builder b;
b.store32(b.varying<int>(), b.splat(42));
test_jit_and_interpreter(b, [&](const skvm::Program& p) {
int buf[18];
buf[17] = 47;
convert to phi nodes Convert our n+args stack homes to phi nodes, essentially performing mem2reg ourselves, eliminating the need for it at runtime. Also, use b.getInt64(k) to create integer constants. Also, print verifyModule() errors to stdout (instead of nowhere). Also, update unit test to make sure we don't run off the end. Bitcode still looks good: define void @skvm-jit-211960346(i64, i8*) { enter: br label %testK testK: ; preds = %loopK, %enter %2 = phi i64 [ %0, %enter ], [ %6, %loopK ] %3 = phi i8* [ %1, %enter ], [ %7, %loopK ] %4 = icmp uge i64 %2, 16 br i1 %4, label %loopK, label %test1 loopK: ; preds = %testK %5 = bitcast i8* %3 to <16 x i32>* store <16 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>, <16 x i32>* %5, align 1 %6 = sub i64 %2, 16 %7 = getelementptr i8, i8* %3, i64 64 br label %testK test1: ; preds = %loop1, %testK %8 = phi i64 [ %2, %testK ], [ %12, %loop1 ] %9 = phi i8* [ %3, %testK ], [ %13, %loop1 ] %10 = icmp uge i64 %8, 1 br i1 %10, label %loop1, label %leave loop1: ; preds = %test1 %11 = bitcast i8* %9 to i32* store i32 42, i32* %11, align 1 %12 = sub i64 %8, 1 %13 = getelementptr i8, i8* %9, i64 4 br label %test1 leave: ; preds = %test1 ret void } and the final assembly looks the same: 0x10a3f5000: movabsq $0x10a3f6000, %rax ; imm = 0x10A3F6000 0x10a3f500a: vbroadcastss (%rax), %zmm0 0x10a3f5010: cmpq $0xf, %rdi 0x10a3f5014: jbe 0x10a3f504d 0x10a3f5016: nopw %cs:(%rax,%rax) 0x10a3f5020: vmovups %zmm0, (%rsi) 0x10a3f5026: addq $-0x10, %rdi 0x10a3f502a: addq $0x40, %rsi 0x10a3f502e: cmpq $0xf, %rdi 0x10a3f5032: ja 0x10a3f5020 0x10a3f5034: jmp 0x10a3f504d 0x10a3f5036: nopw %cs:(%rax,%rax) 0x10a3f5040: movl $0x2a, (%rsi) 0x10a3f5046: decq %rdi 0x10a3f5049: addq $0x4, %rsi 0x10a3f504d: testq %rdi, %rdi 0x10a3f5050: jne 0x10a3f5040 0x10a3f5052: vzeroupper 0x10a3f5055: retq Change-Id: I12d11c7d5786c4c3df28a49bb3044be10f0770e0 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/273753 Reviewed-by: Mike Klein <mtklein@google.com> Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-02-27 16:07:53 +00:00
p.eval(17, buf);
for (int i = 0; i < 17; i++) {
REPORTER_ASSERT(r, buf[i] == 42);
}
REPORTER_ASSERT(r, buf[17] == 47);
});
}
DEF_TEST(SkVM_memcpy, r) {
skvm::Builder b;
{
auto src = b.varying<int>(),
dst = b.varying<int>();
b.store32(dst, b.load32(src));
}
test_jit_and_interpreter(b, [&](const skvm::Program& p) {
int src[] = {1,2,3,4,5,6,7,8,9},
dst[] = {0,0,0,0,0,0,0,0,0};
p.eval(SK_ARRAY_COUNT(src)-1, src, dst);
for (size_t i = 0; i < SK_ARRAY_COUNT(src)-1; i++) {
REPORTER_ASSERT(r, dst[i] == src[i]);
}
size_t i = SK_ARRAY_COUNT(src)-1;
REPORTER_ASSERT(r, dst[i] == 0);
});
}
DEF_TEST(SkVM_allow_jit, r) {
skvm::Builder b;
{
auto src = b.varying<int>(),
dst = b.varying<int>();
b.store32(dst, b.load32(src));
}
if (b.done("test-allow_jit", /*allow_jit=*/true).hasJIT()) {
REPORTER_ASSERT(r, !b.done("", false).hasJIT());
}
}
DEF_TEST(SkVM_LoopCounts, r) {
// Make sure we cover all the exact N we want.
// buf[i] += 1
skvm::Builder b;
skvm::Ptr arg = b.varying<int>();
b.store32(arg,
b.add(b.splat(1),
b.load32(arg)));
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
int buf[64];
for (int N = 0; N <= (int)SK_ARRAY_COUNT(buf); N++) {
for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
buf[i] = i;
}
program.eval(N, buf);
for (int i = 0; i < N; i++) {
REPORTER_ASSERT(r, buf[i] == i+1);
}
for (int i = N; i < (int)SK_ARRAY_COUNT(buf); i++) {
REPORTER_ASSERT(r, buf[i] == i);
}
}
});
}
DEF_TEST(SkVM_gather32, r) {
skvm::Builder b;
{
2021-08-03 20:43:14 +00:00
skvm::UPtr uniforms = b.uniform();
skvm::Ptr buf = b.varying<int>();
skvm::I32 x = b.load32(buf);
b.store32(buf, b.gather32(uniforms,0, b.bit_and(x, b.splat(7))));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
const int img[] = {12,34,56,78, 90,98,76,54};
int buf[20];
for (int i = 0; i < 20; i++) {
buf[i] = i;
}
struct Uniforms {
const int* img;
} uniforms{img};
program.eval(20, &uniforms, buf);
int i = 0;
REPORTER_ASSERT(r, buf[i] == 12); i++;
REPORTER_ASSERT(r, buf[i] == 34); i++;
REPORTER_ASSERT(r, buf[i] == 56); i++;
REPORTER_ASSERT(r, buf[i] == 78); i++;
REPORTER_ASSERT(r, buf[i] == 90); i++;
REPORTER_ASSERT(r, buf[i] == 98); i++;
REPORTER_ASSERT(r, buf[i] == 76); i++;
REPORTER_ASSERT(r, buf[i] == 54); i++;
REPORTER_ASSERT(r, buf[i] == 12); i++;
REPORTER_ASSERT(r, buf[i] == 34); i++;
REPORTER_ASSERT(r, buf[i] == 56); i++;
REPORTER_ASSERT(r, buf[i] == 78); i++;
REPORTER_ASSERT(r, buf[i] == 90); i++;
REPORTER_ASSERT(r, buf[i] == 98); i++;
REPORTER_ASSERT(r, buf[i] == 76); i++;
REPORTER_ASSERT(r, buf[i] == 54); i++;
REPORTER_ASSERT(r, buf[i] == 12); i++;
REPORTER_ASSERT(r, buf[i] == 34); i++;
REPORTER_ASSERT(r, buf[i] == 56); i++;
REPORTER_ASSERT(r, buf[i] == 78); i++;
});
}
DEF_TEST(SkVM_gathers, r) {
skvm::Builder b;
{
2021-08-03 20:43:14 +00:00
skvm::UPtr uniforms = b.uniform();
skvm::Ptr buf32 = b.varying<int>(),
buf16 = b.varying<uint16_t>(),
buf8 = b.varying<uint8_t>();
skvm::I32 x = b.load32(buf32);
b.store32(buf32, b.gather32(uniforms,0, b.bit_and(x, b.splat( 7))));
b.store16(buf16, b.gather16(uniforms,0, b.bit_and(x, b.splat(15))));
b.store8 (buf8 , b.gather8 (uniforms,0, b.bit_and(x, b.splat(31))));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
const int img[] = {12,34,56,78, 90,98,76,54};
constexpr int N = 20;
int buf32[N];
uint16_t buf16[N];
uint8_t buf8 [N];
for (int i = 0; i < 20; i++) {
buf32[i] = i;
}
struct Uniforms {
const int* img;
} uniforms{img};
program.eval(N, &uniforms, buf32, buf16, buf8);
int i = 0;
REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 12 && buf8[i] == 12); i++;
REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] == 0 && buf8[i] == 0); i++;
REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 34 && buf8[i] == 0); i++;
REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] == 0 && buf8[i] == 0); i++;
REPORTER_ASSERT(r, buf32[i] == 90 && buf16[i] == 56 && buf8[i] == 34); i++;
REPORTER_ASSERT(r, buf32[i] == 98 && buf16[i] == 0 && buf8[i] == 0); i++;
REPORTER_ASSERT(r, buf32[i] == 76 && buf16[i] == 78 && buf8[i] == 0); i++;
REPORTER_ASSERT(r, buf32[i] == 54 && buf16[i] == 0 && buf8[i] == 0); i++;
REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 90 && buf8[i] == 56); i++;
REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] == 0 && buf8[i] == 0); i++;
REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 98 && buf8[i] == 0); i++;
REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] == 0 && buf8[i] == 0); i++;
REPORTER_ASSERT(r, buf32[i] == 90 && buf16[i] == 76 && buf8[i] == 78); i++;
REPORTER_ASSERT(r, buf32[i] == 98 && buf16[i] == 0 && buf8[i] == 0); i++;
REPORTER_ASSERT(r, buf32[i] == 76 && buf16[i] == 54 && buf8[i] == 0); i++;
REPORTER_ASSERT(r, buf32[i] == 54 && buf16[i] == 0 && buf8[i] == 0); i++;
REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 12 && buf8[i] == 90); i++;
REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] == 0 && buf8[i] == 0); i++;
REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 34 && buf8[i] == 0); i++;
REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] == 0 && buf8[i] == 0); i++;
});
}
Reland "Reland "gather8/16 JIT support"" This is a reland of 1283d55f35495c38f3a80b1fc5611981ddd6315f ... this time, also checking for HSW feature set. Original change's description: > Reland "gather8/16 JIT support" > > This is a reland of 54659e51bccc106b67ba36d5e91cac457d84b99e > > ... now expecting not to JIT when under ASAN/MSAN. > > Original change's description: > > gather8/16 JIT support > > > > The basic strategy is one at a time, inserting 8- or 16-bit values > > into an Xmm register, then expanding to 32-bit in a Ymm at the end > > using vpmovzx{b,w}d instructions. > > > > Somewhat annoyingly we can only pull indices from an Xmm register, > > so we grab the first four then shift down the top before the rest. > > > > Added a unit test to get coverage where the indices are reused and > > not consumed directly by the gather instruction. It's an important > > case, needing to find another register for accum that can't just be > > dst(), but there's no natural coverage of that anywhere. > > > > Change-Id: I8189ead2364060f10537a2f9364d63338a7e596f > > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284311 > > Reviewed-by: Herb Derby <herb@google.com> > > Commit-Queue: Mike Klein <mtklein@google.com> > > Change-Id: I67f441615b312b47e7a3182e85e0f787286d7717 > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284472 > Reviewed-by: Herb Derby <herb@google.com> > Commit-Queue: Mike Klein <mtklein@google.com> Change-Id: Id0e53ab67f7a70fe42dccca1d9912b07ec11b54d Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284504 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-04-17 18:57:13 +00:00
DEF_TEST(SkVM_gathers2, r) {
skvm::Builder b;
{
2021-08-03 20:43:14 +00:00
skvm::UPtr uniforms = b.uniform();
skvm::Ptr buf32 = b.varying<int>(),
Reland "Reland "gather8/16 JIT support"" This is a reland of 1283d55f35495c38f3a80b1fc5611981ddd6315f ... this time, also checking for HSW feature set. Original change's description: > Reland "gather8/16 JIT support" > > This is a reland of 54659e51bccc106b67ba36d5e91cac457d84b99e > > ... now expecting not to JIT when under ASAN/MSAN. > > Original change's description: > > gather8/16 JIT support > > > > The basic strategy is one at a time, inserting 8- or 16-bit values > > into an Xmm register, then expanding to 32-bit in a Ymm at the end > > using vpmovzx{b,w}d instructions. > > > > Somewhat annoyingly we can only pull indices from an Xmm register, > > so we grab the first four then shift down the top before the rest. > > > > Added a unit test to get coverage where the indices are reused and > > not consumed directly by the gather instruction. It's an important > > case, needing to find another register for accum that can't just be > > dst(), but there's no natural coverage of that anywhere. > > > > Change-Id: I8189ead2364060f10537a2f9364d63338a7e596f > > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284311 > > Reviewed-by: Herb Derby <herb@google.com> > > Commit-Queue: Mike Klein <mtklein@google.com> > > Change-Id: I67f441615b312b47e7a3182e85e0f787286d7717 > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284472 > Reviewed-by: Herb Derby <herb@google.com> > Commit-Queue: Mike Klein <mtklein@google.com> Change-Id: Id0e53ab67f7a70fe42dccca1d9912b07ec11b54d Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284504 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-04-17 18:57:13 +00:00
buf16 = b.varying<uint16_t>(),
buf8 = b.varying<uint8_t>();
skvm::I32 x = b.load32(buf32);
b.store32(buf32, b.gather32(uniforms,0, x));
b.store16(buf16, b.gather16(uniforms,0, x));
b.store8 (buf8 , b.gather8 (uniforms,0, x));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
Reland "Reland "gather8/16 JIT support"" This is a reland of 1283d55f35495c38f3a80b1fc5611981ddd6315f ... this time, also checking for HSW feature set. Original change's description: > Reland "gather8/16 JIT support" > > This is a reland of 54659e51bccc106b67ba36d5e91cac457d84b99e > > ... now expecting not to JIT when under ASAN/MSAN. > > Original change's description: > > gather8/16 JIT support > > > > The basic strategy is one at a time, inserting 8- or 16-bit values > > into an Xmm register, then expanding to 32-bit in a Ymm at the end > > using vpmovzx{b,w}d instructions. > > > > Somewhat annoyingly we can only pull indices from an Xmm register, > > so we grab the first four then shift down the top before the rest. > > > > Added a unit test to get coverage where the indices are reused and > > not consumed directly by the gather instruction. It's an important > > case, needing to find another register for accum that can't just be > > dst(), but there's no natural coverage of that anywhere. > > > > Change-Id: I8189ead2364060f10537a2f9364d63338a7e596f > > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284311 > > Reviewed-by: Herb Derby <herb@google.com> > > Commit-Queue: Mike Klein <mtklein@google.com> > > Change-Id: I67f441615b312b47e7a3182e85e0f787286d7717 > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284472 > Reviewed-by: Herb Derby <herb@google.com> > Commit-Queue: Mike Klein <mtklein@google.com> Change-Id: Id0e53ab67f7a70fe42dccca1d9912b07ec11b54d Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284504 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-04-17 18:57:13 +00:00
uint8_t img[256];
for (int i = 0; i < 256; i++) {
img[i] = i;
}
int buf32[64];
uint16_t buf16[64];
uint8_t buf8 [64];
for (int i = 0; i < 64; i++) {
buf32[i] = (i*47)&63;
buf16[i] = 0;
buf8 [i] = 0;
}
struct Uniforms {
const uint8_t* img;
} uniforms{img};
program.eval(64, &uniforms, buf32, buf16, buf8);
for (int i = 0; i < 64; i++) {
REPORTER_ASSERT(r, buf8[i] == ((i*47)&63)); // 0,47,30,13,60,...
}
REPORTER_ASSERT(r, buf16[ 0] == 0x0100);
REPORTER_ASSERT(r, buf16[63] == 0x2322);
REPORTER_ASSERT(r, buf32[ 0] == 0x03020100);
REPORTER_ASSERT(r, buf32[63] == 0x47464544);
});
}
DEF_TEST(SkVM_bitops, r) {
skvm::Builder b;
{
skvm::Ptr ptr = b.varying<int>();
skvm::I32 x = b.load32(ptr);
x = b.bit_and (x, b.splat(0xf1)); // 0x40
x = b.bit_or (x, b.splat(0x80)); // 0xc0
x = b.bit_xor (x, b.splat(0xfe)); // 0x3e
x = b.bit_clear(x, b.splat(0x30)); // 0x0e
x = b.shl(x, 28); // 0xe000'0000
x = b.sra(x, 28); // 0xffff'fffe
x = b.shr(x, 1); // 0x7fff'ffff
b.store32(ptr, x);
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
int x = 0x42;
program.eval(1, &x);
REPORTER_ASSERT(r, x == 0x7fff'ffff);
});
}
DEF_TEST(SkVM_select_is_NaN, r) {
skvm::Builder b;
{
skvm::Ptr src = b.varying<float>(),
dst = b.varying<float>();
skvm::F32 x = b.loadF(src);
x = select(is_NaN(x), b.splat(0.0f)
, x);
b.storeF(dst, x);
}
std::vector<skvm::OptimizedInstruction> program = b.optimize();
REPORTER_ASSERT(r, program.size() == 4);
REPORTER_ASSERT(r, program[0].op == skvm::Op::load32);
REPORTER_ASSERT(r, program[1].op == skvm::Op::neq_f32);
REPORTER_ASSERT(r, program[2].op == skvm::Op::bit_clear);
REPORTER_ASSERT(r, program[3].op == skvm::Op::store32);
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
// ±NaN, ±0, ±1, ±inf
uint32_t src[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
uint32_t dst[SK_ARRAY_COUNT(src)];
program.eval(SK_ARRAY_COUNT(src), src, dst);
for (int i = 0; i < (int)SK_ARRAY_COUNT(src); i++) {
REPORTER_ASSERT(r, dst[i] == (i < 2 ? 0 : src[i]));
}
});
}
DEF_TEST(SkVM_f32, r) {
skvm::Builder b;
{
skvm::Ptr arg = b.varying<float>();
skvm::F32 x = b.loadF(arg),
y = b.add(x,x), // y = 2x
z = b.sub(y,x), // z = 2x-x = x
w = b.div(z,x); // w = x/x = 1
b.storeF(arg, w);
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
float buf[] = { 1,2,3,4,5,6,7,8,9 };
program.eval(SK_ARRAY_COUNT(buf), buf);
for (float v : buf) {
REPORTER_ASSERT(r, v == 1.0f);
}
});
}
DEF_TEST(SkVM_cmp_i32, r) {
skvm::Builder b;
{
skvm::I32 x = b.load32(b.varying<int>());
auto to_bit = [&](int shift, skvm::I32 mask) {
return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
};
skvm::I32 m = b.splat(0);
m = b.bit_or(m, to_bit(0, b. eq(x, b.splat(0))));
m = b.bit_or(m, to_bit(1, b.neq(x, b.splat(1))));
m = b.bit_or(m, to_bit(2, b. lt(x, b.splat(2))));
m = b.bit_or(m, to_bit(3, b.lte(x, b.splat(3))));
m = b.bit_or(m, to_bit(4, b. gt(x, b.splat(4))));
m = b.bit_or(m, to_bit(5, b.gte(x, b.splat(5))));
b.store32(b.varying<int>(), m);
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
int in[] = { 0,1,2,3,4,5,6,7,8,9 };
int out[SK_ARRAY_COUNT(in)];
program.eval(SK_ARRAY_COUNT(in), in, out);
REPORTER_ASSERT(r, out[0] == 0b001111);
REPORTER_ASSERT(r, out[1] == 0b001100);
REPORTER_ASSERT(r, out[2] == 0b001010);
REPORTER_ASSERT(r, out[3] == 0b001010);
REPORTER_ASSERT(r, out[4] == 0b000010);
for (int i = 5; i < (int)SK_ARRAY_COUNT(out); i++) {
REPORTER_ASSERT(r, out[i] == 0b110010);
}
});
}
DEF_TEST(SkVM_cmp_f32, r) {
skvm::Builder b;
{
skvm::F32 x = b.loadF(b.varying<float>());
auto to_bit = [&](int shift, skvm::I32 mask) {
return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
};
skvm::I32 m = b.splat(0);
m = b.bit_or(m, to_bit(0, b. eq(x, b.splat(0.0f))));
m = b.bit_or(m, to_bit(1, b.neq(x, b.splat(1.0f))));
m = b.bit_or(m, to_bit(2, b. lt(x, b.splat(2.0f))));
m = b.bit_or(m, to_bit(3, b.lte(x, b.splat(3.0f))));
m = b.bit_or(m, to_bit(4, b. gt(x, b.splat(4.0f))));
m = b.bit_or(m, to_bit(5, b.gte(x, b.splat(5.0f))));
b.store32(b.varying<int>(), m);
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
float in[] = { 0,1,2,3,4,5,6,7,8,9 };
int out[SK_ARRAY_COUNT(in)];
program.eval(SK_ARRAY_COUNT(in), in, out);
REPORTER_ASSERT(r, out[0] == 0b001111);
REPORTER_ASSERT(r, out[1] == 0b001100);
REPORTER_ASSERT(r, out[2] == 0b001010);
REPORTER_ASSERT(r, out[3] == 0b001010);
REPORTER_ASSERT(r, out[4] == 0b000010);
for (int i = 5; i < (int)SK_ARRAY_COUNT(out); i++) {
REPORTER_ASSERT(r, out[i] == 0b110010);
}
});
}
DEF_TEST(SkVM_index, r) {
skvm::Builder b;
b.store32(b.varying<int>(), b.index());
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
int buf[23];
program.eval(SK_ARRAY_COUNT(buf), buf);
for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
REPORTER_ASSERT(r, buf[i] == (int)SK_ARRAY_COUNT(buf)-i);
}
});
}
DEF_TEST(SkVM_mad, r) {
// This program is designed to exercise the tricky corners of instruction
// and register selection for Op::mad_f32.
skvm::Builder b;
{
skvm::Ptr arg = b.varying<int>();
skvm::F32 x = b.to_F32(b.load32(arg)),
y = b.mad(x,x,x), // x is needed in the future, so r[x] != r[y].
z = b.mad(y,y,x), // y is needed in the future, but r[z] = r[x] is ok.
w = b.mad(z,z,y), // w can alias z but not y.
v = b.mad(w,y,w); // Got to stop somewhere.
b.store32(arg, b.trunc(v));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
int x = 2;
program.eval(1, &x);
// x = 2
// y = 2*2 + 2 = 6
// z = 6*6 + 2 = 38
// w = 38*38 + 6 = 1450
// v = 1450*6 + 1450 = 10150
REPORTER_ASSERT(r, x == 10150);
});
}
DEF_TEST(SkVM_fms, r) {
// Create a pattern that can be peepholed into an Op::fms_f32.
skvm::Builder b;
{
skvm::Ptr arg = b.varying<int>();
skvm::F32 x = b.to_F32(b.load32(arg)),
v = b.sub(b.mul(x, b.splat(2.0f)),
b.splat(1.0f));
b.store32(arg, b.trunc(v));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
int buf[] = {0,1,2,3,4,5,6,7,8,9,10};
program.eval((int)SK_ARRAY_COUNT(buf), &buf);
for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
REPORTER_ASSERT(r, buf[i] = 2*i-1);
}
});
}
DEF_TEST(SkVM_fnma, r) {
// Create a pattern that can be peepholed into an Op::fnma_f32.
skvm::Builder b;
{
skvm::Ptr arg = b.varying<int>();
skvm::F32 x = b.to_F32(b.load32(arg)),
v = b.sub(b.splat(1.0f),
b.mul(x, b.splat(2.0f)));
b.store32(arg, b.trunc(v));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
int buf[] = {0,1,2,3,4,5,6,7,8,9,10};
program.eval((int)SK_ARRAY_COUNT(buf), &buf);
for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
REPORTER_ASSERT(r, buf[i] = 1-2*i);
}
});
}
DEF_TEST(SkVM_madder, r) {
skvm::Builder b;
{
skvm::Ptr arg = b.varying<float>();
skvm::F32 x = b.loadF(arg),
y = b.mad(x,x,x), // x is needed in the future, so r[x] != r[y].
z = b.mad(y,x,y), // r[x] can be reused after this instruction, but not r[y].
w = b.mad(y,y,z);
b.storeF(arg, w);
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
float x = 2.0f;
// y = 2*2 + 2 = 6
// z = 6*2 + 6 = 18
// w = 6*6 + 18 = 54
program.eval(1, &x);
REPORTER_ASSERT(r, x == 54.0f);
});
}
DEF_TEST(SkVM_floor, r) {
skvm::Builder b;
{
skvm::Ptr arg = b.varying<float>();
b.storeF(arg, b.floor(b.loadF(arg)));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
float buf[] = { -2.0f, -1.5f, -1.0f, 0.0f, 1.0f, 1.5f, 2.0f };
float want[] = { -2.0f, -2.0f, -1.0f, 0.0f, 1.0f, 1.0f, 2.0f };
program.eval(SK_ARRAY_COUNT(buf), buf);
for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
REPORTER_ASSERT(r, buf[i] == want[i]);
}
});
}
restore Op::round While I think trunc(mad(x, scale, 0.5)) is fine for doing our float to fixed point conversions, round(mul(x, scale)) was kind of better all around: - better rounding than +0.5 and trunc - faster when mad() is not an fma - often now no need to use the constant 0.5f or have it in a register - allows the mul() in to_unorm to use mul_f32_imm Those last two points are key... this actually frees up 2 registers in the x86 JIT when using to_unorm(). So I think maybe we can resurrect round and still guarantee our desired intra-machine stability by committing to using instructions that follow the current rounding mode, which is what [v]cvtps2dq inextricably uses. Left some notes on the ARM impl... we're rounding to nearest even there, which is probably the current mode anyway, but to be more correct we need a slightly longer impl that rounds float->float then "truncates". Unsure whether it matters in practice. Same deal in the unit test that I added back, now testing negative and 0.5 cases too. The expectations assume the current mode is nearest even. I had the idea to resurrect this when I was looking at adding _imm Ops for fma_f32. I noticed that the y and z arguments to an fma_f32 were by far most likely to be constants, and when they are, they're by far likely to both be constants, e.g. 255.0f & 0.5f from to_unorm(8,...). llvm disassembly for SkVM_round unit test looks good: ~ $ llc -mcpu=haswell /tmp/skvm-jit-1231521224.bc -o - .section __TEXT,__text,regular,pure_instructions .macosx_version_min 10, 15 .globl "_skvm-jit-1231521224" ## -- Begin function skvm-jit-1231521224 .p2align 4, 0x90 "_skvm-jit-1231521224": ## @skvm-jit-1231521224 .cfi_startproc cmpl $8, %edi jl LBB0_3 .p2align 4, 0x90 LBB0_2: ## %loopK ## =>This Inner Loop Header: Depth=1 vcvtps2dq (%rsi), %ymm0 vmovupd %ymm0, (%rdx) addl $-8, %edi addq $32, %rsi addq $32, %rdx cmpl $8, %edi jge LBB0_2 LBB0_3: ## %hoist1 xorl %eax, %eax testl %edi, %edi jle LBB0_6 .p2align 4, 0x90 LBB0_5: ## %loop1 ## =>This Inner Loop Header: Depth=1 vcvtss2si (%rsi,%rax), %ecx movl %ecx, (%rdx,%rax) decl %edi addq $4, %rax testl %edi, %edi jg LBB0_5 LBB0_6: ## %leave vzeroupper retq .cfi_endproc ## -- End function Change-Id: Ib59eb3fd8a6805397850d93226c6c6d37cc3ab84 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/276738 Auto-Submit: Mike Klein <mtklein@google.com> Commit-Queue: Herb Derby <herb@google.com> Reviewed-by: Herb Derby <herb@google.com>
2020-03-12 16:05:46 +00:00
DEF_TEST(SkVM_round, r) {
skvm::Builder b;
{
skvm::Ptr src = b.varying<float>();
skvm::Ptr dst = b.varying<int>();
b.store32(dst, b.round(b.loadF(src)));
restore Op::round While I think trunc(mad(x, scale, 0.5)) is fine for doing our float to fixed point conversions, round(mul(x, scale)) was kind of better all around: - better rounding than +0.5 and trunc - faster when mad() is not an fma - often now no need to use the constant 0.5f or have it in a register - allows the mul() in to_unorm to use mul_f32_imm Those last two points are key... this actually frees up 2 registers in the x86 JIT when using to_unorm(). So I think maybe we can resurrect round and still guarantee our desired intra-machine stability by committing to using instructions that follow the current rounding mode, which is what [v]cvtps2dq inextricably uses. Left some notes on the ARM impl... we're rounding to nearest even there, which is probably the current mode anyway, but to be more correct we need a slightly longer impl that rounds float->float then "truncates". Unsure whether it matters in practice. Same deal in the unit test that I added back, now testing negative and 0.5 cases too. The expectations assume the current mode is nearest even. I had the idea to resurrect this when I was looking at adding _imm Ops for fma_f32. I noticed that the y and z arguments to an fma_f32 were by far most likely to be constants, and when they are, they're by far likely to both be constants, e.g. 255.0f & 0.5f from to_unorm(8,...). llvm disassembly for SkVM_round unit test looks good: ~ $ llc -mcpu=haswell /tmp/skvm-jit-1231521224.bc -o - .section __TEXT,__text,regular,pure_instructions .macosx_version_min 10, 15 .globl "_skvm-jit-1231521224" ## -- Begin function skvm-jit-1231521224 .p2align 4, 0x90 "_skvm-jit-1231521224": ## @skvm-jit-1231521224 .cfi_startproc cmpl $8, %edi jl LBB0_3 .p2align 4, 0x90 LBB0_2: ## %loopK ## =>This Inner Loop Header: Depth=1 vcvtps2dq (%rsi), %ymm0 vmovupd %ymm0, (%rdx) addl $-8, %edi addq $32, %rsi addq $32, %rdx cmpl $8, %edi jge LBB0_2 LBB0_3: ## %hoist1 xorl %eax, %eax testl %edi, %edi jle LBB0_6 .p2align 4, 0x90 LBB0_5: ## %loop1 ## =>This Inner Loop Header: Depth=1 vcvtss2si (%rsi,%rax), %ecx movl %ecx, (%rdx,%rax) decl %edi addq $4, %rax testl %edi, %edi jg LBB0_5 LBB0_6: ## %leave vzeroupper retq .cfi_endproc ## -- End function Change-Id: Ib59eb3fd8a6805397850d93226c6c6d37cc3ab84 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/276738 Auto-Submit: Mike Klein <mtklein@google.com> Commit-Queue: Herb Derby <herb@google.com> Reviewed-by: Herb Derby <herb@google.com>
2020-03-12 16:05:46 +00:00
}
// The test cases on exact 0.5f boundaries assume the current rounding mode is nearest even.
// We haven't explicitly guaranteed that here... it just probably is.
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
restore Op::round While I think trunc(mad(x, scale, 0.5)) is fine for doing our float to fixed point conversions, round(mul(x, scale)) was kind of better all around: - better rounding than +0.5 and trunc - faster when mad() is not an fma - often now no need to use the constant 0.5f or have it in a register - allows the mul() in to_unorm to use mul_f32_imm Those last two points are key... this actually frees up 2 registers in the x86 JIT when using to_unorm(). So I think maybe we can resurrect round and still guarantee our desired intra-machine stability by committing to using instructions that follow the current rounding mode, which is what [v]cvtps2dq inextricably uses. Left some notes on the ARM impl... we're rounding to nearest even there, which is probably the current mode anyway, but to be more correct we need a slightly longer impl that rounds float->float then "truncates". Unsure whether it matters in practice. Same deal in the unit test that I added back, now testing negative and 0.5 cases too. The expectations assume the current mode is nearest even. I had the idea to resurrect this when I was looking at adding _imm Ops for fma_f32. I noticed that the y and z arguments to an fma_f32 were by far most likely to be constants, and when they are, they're by far likely to both be constants, e.g. 255.0f & 0.5f from to_unorm(8,...). llvm disassembly for SkVM_round unit test looks good: ~ $ llc -mcpu=haswell /tmp/skvm-jit-1231521224.bc -o - .section __TEXT,__text,regular,pure_instructions .macosx_version_min 10, 15 .globl "_skvm-jit-1231521224" ## -- Begin function skvm-jit-1231521224 .p2align 4, 0x90 "_skvm-jit-1231521224": ## @skvm-jit-1231521224 .cfi_startproc cmpl $8, %edi jl LBB0_3 .p2align 4, 0x90 LBB0_2: ## %loopK ## =>This Inner Loop Header: Depth=1 vcvtps2dq (%rsi), %ymm0 vmovupd %ymm0, (%rdx) addl $-8, %edi addq $32, %rsi addq $32, %rdx cmpl $8, %edi jge LBB0_2 LBB0_3: ## %hoist1 xorl %eax, %eax testl %edi, %edi jle LBB0_6 .p2align 4, 0x90 LBB0_5: ## %loop1 ## =>This Inner Loop Header: Depth=1 vcvtss2si (%rsi,%rax), %ecx movl %ecx, (%rdx,%rax) decl %edi addq $4, %rax testl %edi, %edi jg LBB0_5 LBB0_6: ## %leave vzeroupper retq .cfi_endproc ## -- End function Change-Id: Ib59eb3fd8a6805397850d93226c6c6d37cc3ab84 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/276738 Auto-Submit: Mike Klein <mtklein@google.com> Commit-Queue: Herb Derby <herb@google.com> Reviewed-by: Herb Derby <herb@google.com>
2020-03-12 16:05:46 +00:00
float buf[] = { -1.5f, -0.5f, 0.0f, 0.5f, 0.2f, 0.6f, 1.0f, 1.4f, 1.5f, 2.0f };
int want[] = { -2 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 2 , 2 };
int dst[SK_ARRAY_COUNT(buf)];
program.eval(SK_ARRAY_COUNT(buf), buf, dst);
for (int i = 0; i < (int)SK_ARRAY_COUNT(dst); i++) {
REPORTER_ASSERT(r, dst[i] == want[i]);
}
});
}
DEF_TEST(SkVM_min, r) {
skvm::Builder b;
{
skvm::Ptr src1 = b.varying<float>();
skvm::Ptr src2 = b.varying<float>();
skvm::Ptr dst = b.varying<float>();
b.storeF(dst, b.min(b.loadF(src1), b.loadF(src2)));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
float s1[] = { 0.0f, 1.0f, 4.0f, -1.0f, -1.0f};
float s2[] = { 0.0f, 2.0f, 3.0f, 1.0f, -2.0f};
float want[] = { 0.0f, 1.0f, 3.0f, -1.0f, -2.0f};
float d[SK_ARRAY_COUNT(s1)];
program.eval(SK_ARRAY_COUNT(d), s1, s2, d);
for (int i = 0; i < (int)SK_ARRAY_COUNT(d); i++) {
REPORTER_ASSERT(r, d[i] == want[i]);
}
});
}
DEF_TEST(SkVM_max, r) {
skvm::Builder b;
{
skvm::Ptr src1 = b.varying<float>();
skvm::Ptr src2 = b.varying<float>();
skvm::Ptr dst = b.varying<float>();
b.storeF(dst, b.max(b.loadF(src1), b.loadF(src2)));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
float s1[] = { 0.0f, 1.0f, 4.0f, -1.0f, -1.0f};
float s2[] = { 0.0f, 2.0f, 3.0f, 1.0f, -2.0f};
float want[] = { 0.0f, 2.0f, 4.0f, 1.0f, -1.0f};
float d[SK_ARRAY_COUNT(s1)];
program.eval(SK_ARRAY_COUNT(d), s1, s2, d);
for (int i = 0; i < (int)SK_ARRAY_COUNT(d); i++) {
REPORTER_ASSERT(r, d[i] == want[i]);
}
});
}
DEF_TEST(SkVM_hoist, r) {
// This program uses enough constants that it will fail to JIT if we hoist them.
// The JIT will try again without hoisting, and that'll just need 2 registers.
skvm::Builder b;
{
skvm::Ptr arg = b.varying<int>();
skvm::I32 x = b.load32(arg);
for (int i = 0; i < 32; i++) {
x = b.add(x, b.splat(i));
}
b.store32(arg, x);
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
int x = 4;
program.eval(1, &x);
// x += 0 + 1 + 2 + 3 + ... + 30 + 31
// x += 496
REPORTER_ASSERT(r, x == 500);
});
}
DEF_TEST(SkVM_select, r) {
skvm::Builder b;
{
skvm::Ptr buf = b.varying<int>();
skvm::I32 x = b.load32(buf);
x = b.select( b.gt(x, b.splat(4)), x, b.splat(42) );
b.store32(buf, x);
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
int buf[] = { 0,1,2,3,4,5,6,7,8 };
program.eval(SK_ARRAY_COUNT(buf), buf);
for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
REPORTER_ASSERT(r, buf[i] == (i > 4 ? i : 42));
}
});
}
Stop calling schedule() The new unit test demonstrates load/store reordering is error-prone. At head we're allowing loads from a given pointer to reorder later than a store to that same pointer, and boy, that's just not sound. In the scenario constructed by the test we reorder this swap, x = load32 X y = load32 Y store32 X y store32 Y x using schedule() (following Op argument data dependencies) into y = load32 Y store32 X y x = load32 X store32 Y x which moves `x = load32 X` illegally past `store X y`. We write `y` twice instead of swapping `x` and `y`. It's not impossible to implement that extra reordering constraint: I think it's easiest to think about by adding implicit use edges in schedule() from stores to prior loads of the same pointer. But that'd be a little complicated to implement, and doesn't handle aliasing at all, so I decided to ponder on other approaches that handle a wider range of programs or would have a simpler implementation to reason about. I ended up walking through this rough chain of ideas: 0) reorder using only Op argument data dependencies (HEAD) 1) don't let load(ptr) pass store(ptr) (above) 2) don't let any load pass any store (allows aliasing) 3) don't reorder any Op that touches memory 4) don't reorder any Op, period. This CL is 4). It's certainly the easiest and cheapest implementation. It's not clear to me that we need this scheduling, and should we find we really want it I'll come back and work back through the list until we find something that meets our needs. (Hoisting of uniforms is unaffected here.) Change-Id: I7765b1d16202e0645b11295f7e30c5e09f2b7339 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/350256 Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2021-01-05 19:31:15 +00:00
DEF_TEST(SkVM_swap, r) {
skvm::Builder b;
{
// This program is the equivalent of
// x = *X
// y = *Y
// *X = y
// *Y = x
// One rescheduling of the program based only on data flow of Op arguments is
// x = *X
// *Y = x
// y = *Y
// *X = y
// but this reordering does not produce the same results and is invalid.
skvm::Ptr X = b.varying<int>(),
Stop calling schedule() The new unit test demonstrates load/store reordering is error-prone. At head we're allowing loads from a given pointer to reorder later than a store to that same pointer, and boy, that's just not sound. In the scenario constructed by the test we reorder this swap, x = load32 X y = load32 Y store32 X y store32 Y x using schedule() (following Op argument data dependencies) into y = load32 Y store32 X y x = load32 X store32 Y x which moves `x = load32 X` illegally past `store X y`. We write `y` twice instead of swapping `x` and `y`. It's not impossible to implement that extra reordering constraint: I think it's easiest to think about by adding implicit use edges in schedule() from stores to prior loads of the same pointer. But that'd be a little complicated to implement, and doesn't handle aliasing at all, so I decided to ponder on other approaches that handle a wider range of programs or would have a simpler implementation to reason about. I ended up walking through this rough chain of ideas: 0) reorder using only Op argument data dependencies (HEAD) 1) don't let load(ptr) pass store(ptr) (above) 2) don't let any load pass any store (allows aliasing) 3) don't reorder any Op that touches memory 4) don't reorder any Op, period. This CL is 4). It's certainly the easiest and cheapest implementation. It's not clear to me that we need this scheduling, and should we find we really want it I'll come back and work back through the list until we find something that meets our needs. (Hoisting of uniforms is unaffected here.) Change-Id: I7765b1d16202e0645b11295f7e30c5e09f2b7339 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/350256 Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2021-01-05 19:31:15 +00:00
Y = b.varying<int>();
skvm::I32 x = b.load32(X),
y = b.load32(Y);
b.store32(X, y);
b.store32(Y, x);
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
Stop calling schedule() The new unit test demonstrates load/store reordering is error-prone. At head we're allowing loads from a given pointer to reorder later than a store to that same pointer, and boy, that's just not sound. In the scenario constructed by the test we reorder this swap, x = load32 X y = load32 Y store32 X y store32 Y x using schedule() (following Op argument data dependencies) into y = load32 Y store32 X y x = load32 X store32 Y x which moves `x = load32 X` illegally past `store X y`. We write `y` twice instead of swapping `x` and `y`. It's not impossible to implement that extra reordering constraint: I think it's easiest to think about by adding implicit use edges in schedule() from stores to prior loads of the same pointer. But that'd be a little complicated to implement, and doesn't handle aliasing at all, so I decided to ponder on other approaches that handle a wider range of programs or would have a simpler implementation to reason about. I ended up walking through this rough chain of ideas: 0) reorder using only Op argument data dependencies (HEAD) 1) don't let load(ptr) pass store(ptr) (above) 2) don't let any load pass any store (allows aliasing) 3) don't reorder any Op that touches memory 4) don't reorder any Op, period. This CL is 4). It's certainly the easiest and cheapest implementation. It's not clear to me that we need this scheduling, and should we find we really want it I'll come back and work back through the list until we find something that meets our needs. (Hoisting of uniforms is unaffected here.) Change-Id: I7765b1d16202e0645b11295f7e30c5e09f2b7339 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/350256 Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2021-01-05 19:31:15 +00:00
int b1[] = { 0,1,2,3 };
int b2[] = { 4,5,6,7 };
program.eval(SK_ARRAY_COUNT(b1), b1, b2);
for (int i = 0; i < (int)SK_ARRAY_COUNT(b1); i++) {
REPORTER_ASSERT(r, b1[i] == 4 + i);
REPORTER_ASSERT(r, b2[i] == i);
}
});
}
DEF_TEST(SkVM_NewOps, r) {
// Exercise a somewhat arbitrary set of new ops.
skvm::Builder b;
{
2021-08-03 20:43:14 +00:00
skvm::Ptr buf = b.varying<int16_t>();
skvm::UPtr uniforms = b.uniform();
skvm::I32 x = b.load16(buf);
const size_t kPtr = sizeof(const int*);
x = b.add(x, b.uniform32(uniforms, kPtr+0));
x = b.mul(x, b.uniform32(uniforms, kPtr+4));
x = b.sub(x, b.uniform32(uniforms, kPtr+8));
skvm::I32 limit = b.uniform32(uniforms, kPtr+12);
x = b.select(b.lt(x, b.splat(0)), b.splat(0), x);
x = b.select(b.gt(x, limit ), limit , x);
x = b.gather8(uniforms,0, x);
b.store16(buf, x);
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
const int N = 31;
int16_t buf[N];
for (int i = 0; i < N; i++) {
buf[i] = i;
}
const int M = 16;
uint8_t img[M];
for (int i = 0; i < M; i++) {
img[i] = i*i;
}
struct {
const uint8_t* img;
int add = 5;
int mul = 3;
int sub = 18;
int limit = M-1;
} uniforms{img};
program.eval(N, buf, &uniforms);
for (int i = 0; i < N; i++) {
// Our first math calculates x = (i+5)*3 - 18 a.k.a 3*(i-1).
int x = 3*(i-1);
// Then that's pinned to the limits of img.
if (i < 2) { x = 0; } // Notice i == 1 hits x == 0 exactly...
if (i > 5) { x = 15; } // ...and i == 6 hits x == 15 exactly
REPORTER_ASSERT(r, buf[i] == img[x]);
}
});
}
DEF_TEST(SKVM_array32, r) {
skvm::Builder b;
skvm::Uniforms uniforms(b.uniform(), 0);
// Take up the first slot, so other uniforms are not at 0 offset.
uniforms.push(0);
int i[] = {3, 7};
skvm::Uniform array = uniforms.pushArray(i);
float f[] = {5, 9};
skvm::Uniform arrayF = uniforms.pushArrayF(f);
{
skvm::Ptr buf0 = b.varying<int32_t>(),
buf1 = b.varying<int32_t>(),
buf2 = b.varying<int32_t>();
skvm::I32 j = b.array32(array, 0);
b.store32(buf0, j);
skvm::I32 k = b.array32(array, 1);
b.store32(buf1, k);
skvm::F32 x = b.arrayF(arrayF, 0);
skvm::F32 y = b.arrayF(arrayF, 1);
b.store32(buf2, b.trunc(b.add(x, y)));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
const int K = 10;
int32_t buf0[K],
buf1[K],
buf2[K];
// reset the i[0] for the two tests.
i[0] = 3;
f[1] = 9;
program.eval(K, uniforms.buf.data(), buf0, buf1, buf2);
for (auto v : buf0) {
REPORTER_ASSERT(r, v == 3);
}
for (auto v : buf1) {
REPORTER_ASSERT(r, v == 7);
}
for (auto v : buf2) {
REPORTER_ASSERT(r, v == 14);
}
i[0] = 4;
f[1] = 10;
program.eval(K, uniforms.buf.data(), buf0, buf1, buf2);
for (auto v : buf0) {
REPORTER_ASSERT(r, v == 4);
}
for (auto v : buf1) {
REPORTER_ASSERT(r, v == 7);
}
for (auto v : buf2) {
REPORTER_ASSERT(r, v == 15);
}
});
}
DEF_TEST(SkVM_sqrt, r) {
skvm::Builder b;
auto buf = b.varying<int>();
b.storeF(buf, b.sqrt(b.loadF(buf)));
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
constexpr int K = 17;
float buf[K];
for (int i = 0; i < K; i++) {
buf[i] = (float)(i*i);
}
// x^2 -> x
program.eval(K, buf);
for (int i = 0; i < K; i++) {
REPORTER_ASSERT(r, buf[i] == (float)i);
}
});
}
DEF_TEST(SkVM_MSAN, r) {
// This little memset32() program should be able to JIT, but if we run that
// JIT code in an MSAN build, it won't see the writes initialize buf. So
// this tests that we're using the interpreter instead.
skvm::Builder b;
b.store32(b.varying<int>(), b.splat(42));
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
constexpr int K = 17;
int buf[K]; // Intentionally uninitialized.
program.eval(K, buf);
sk_msan_assert_initialized(buf, buf+K);
for (int x : buf) {
REPORTER_ASSERT(r, x == 42);
}
});
}
DEF_TEST(SkVM_assert, r) {
skvm::Builder b;
b.assert_true(b.lt(b.load32(b.varying<int>()),
b.splat(42)));
test_jit_and_interpreter(b, [&](const skvm::Program& program) {
int buf[] = { 0,1,2,3,4,5,6,7,8,9 };
program.eval(SK_ARRAY_COUNT(buf), buf);
});
}
DEF_TEST(SkVM_trace_line, r) {
class TestTraceHook : public skvm::TraceHook {
public:
void var(int, int32_t) override { fBuffer.push_back(-9999999); }
void enter(int) override { fBuffer.push_back(-9999999); }
void exit(int) override { fBuffer.push_back(-9999999); }
void scope(int) override { fBuffer.push_back(-9999999); }
void line(int lineNum) override { fBuffer.push_back(lineNum); }
std::vector<int> fBuffer;
};
skvm::Builder b;
TestTraceHook testTrace;
int traceHookID = b.attachTraceHook(&testTrace);
b.trace_line(traceHookID, b.splat(0xFFFFFFFF), b.splat(0xFFFFFFFF), 123);
b.trace_line(traceHookID, b.splat(0x00000000), b.splat(0xFFFFFFFF), 456);
b.trace_line(traceHookID, b.splat(0xFFFFFFFF), b.splat(0x00000000), 567);
b.trace_line(traceHookID, b.splat(0x00000000), b.splat(0x00000000), 678);
b.trace_line(traceHookID, b.splat(0xFFFFFFFF), b.splat(0xFFFFFFFF), 789);
skvm::Program p = b.done();
p.eval(1);
REPORTER_ASSERT(r, (testTrace.fBuffer == std::vector<int>{123, 789}));
}
DEF_TEST(SkVM_trace_var, r) {
class TestTraceHook : public skvm::TraceHook {
public:
void line(int) override { fBuffer.push_back(-9999999); }
void enter(int) override { fBuffer.push_back(-9999999); }
void exit(int) override { fBuffer.push_back(-9999999); }
void scope(int) override { fBuffer.push_back(-9999999); }
void var(int slot, int32_t val) override {
fBuffer.push_back(slot);
fBuffer.push_back(val);
}
std::vector<int> fBuffer;
};
skvm::Builder b;
TestTraceHook testTrace;
int traceHookID = b.attachTraceHook(&testTrace);
b.trace_var(traceHookID, b.splat(0x00000000), b.splat(0xFFFFFFFF), 2, b.splat(333));
b.trace_var(traceHookID, b.splat(0xFFFFFFFF), b.splat(0xFFFFFFFF), 4, b.splat(555));
b.trace_var(traceHookID, b.splat(0x00000000), b.splat(0x00000000), 5, b.splat(666));
b.trace_var(traceHookID, b.splat(0xFFFFFFFF), b.splat(0xFFFFFFFF), 6, b.splat(777));
b.trace_var(traceHookID, b.splat(0xFFFFFFFF), b.splat(0x00000000), 8, b.splat(999));
skvm::Program p = b.done();
p.eval(1);
REPORTER_ASSERT(r, (testTrace.fBuffer == std::vector<int>{4, 555, 6, 777}));
}
DEF_TEST(SkVM_trace_enter_exit, r) {
class TestTraceHook : public skvm::TraceHook {
public:
void line(int) override { fBuffer.push_back(-9999999); }
void var(int, int32_t) override { fBuffer.push_back(-9999999); }
void scope(int) override { fBuffer.push_back(-9999999); }
void enter(int fnIdx) override {
fBuffer.push_back(fnIdx);
fBuffer.push_back(1);
}
void exit(int fnIdx) override {
fBuffer.push_back(fnIdx);
fBuffer.push_back(0);
}
std::vector<int> fBuffer;
};
skvm::Builder b;
TestTraceHook testTrace;
int traceHookID = b.attachTraceHook(&testTrace);
b.trace_enter(traceHookID, b.splat(0x00000000), b.splat(0x00000000), 99);
b.trace_enter(traceHookID, b.splat(0xFFFFFFFF), b.splat(0xFFFFFFFF), 12);
b.trace_enter(traceHookID, b.splat(0x00000000), b.splat(0xFFFFFFFF), 34);
b.trace_exit(traceHookID, b.splat(0xFFFFFFFF), b.splat(0xFFFFFFFF), 56);
b.trace_exit(traceHookID, b.splat(0xFFFFFFFF), b.splat(0x00000000), 78);
b.trace_exit(traceHookID, b.splat(0x00000000), b.splat(0x00000000), 90);
skvm::Program p = b.done();
p.eval(1);
REPORTER_ASSERT(r, (testTrace.fBuffer == std::vector<int>{12, 1, 56, 0}));
}
DEF_TEST(SkVM_trace_scope, r) {
class TestTraceHook : public skvm::TraceHook {
public:
void var(int, int32_t) override { fBuffer.push_back(-9999999); }
void enter(int) override { fBuffer.push_back(-9999999); }
void exit(int) override { fBuffer.push_back(-9999999); }
void line(int) override { fBuffer.push_back(-9999999); }
void scope(int delta) override { fBuffer.push_back(delta); }
std::vector<int> fBuffer;
};
skvm::Builder b;
TestTraceHook testTrace;
int traceHookID = b.attachTraceHook(&testTrace);
b.trace_scope(traceHookID, b.splat(0xFFFFFFFF), b.splat(0xFFFFFFFF), 1);
b.trace_scope(traceHookID, b.splat(0xFFFFFFFF), b.splat(0x00000000), -2);
b.trace_scope(traceHookID, b.splat(0x00000000), b.splat(0x00000000), 3);
b.trace_scope(traceHookID, b.splat(0x00000000), b.splat(0xFFFFFFFF), 4);
b.trace_scope(traceHookID, b.splat(0xFFFFFFFF), b.splat(0xFFFFFFFF), -5);
skvm::Program p = b.done();
p.eval(1);
REPORTER_ASSERT(r, (testTrace.fBuffer == std::vector<int>{1, -5}));
}
DEF_TEST(SkVM_trace_multiple_hooks, r) {
class TestTraceHook : public skvm::TraceHook {
public:
void var(int, int32_t) override { fBuffer.push_back(-9999999); }
void enter(int) override { fBuffer.push_back(-9999999); }
void exit(int) override { fBuffer.push_back(-9999999); }
void scope(int) override { fBuffer.push_back(-9999999); }
void line(int lineNum) override { fBuffer.push_back(lineNum); }
std::vector<int> fBuffer;
};
skvm::Builder b;
TestTraceHook testTraceA, testTraceB, testTraceC;
int traceHookAID = b.attachTraceHook(&testTraceA);
int traceHookBID = b.attachTraceHook(&testTraceB);
int traceHookCID = b.attachTraceHook(&testTraceC);
b.trace_line(traceHookCID, b.splat(0xFFFFFFFF), b.splat(0xFFFFFFFF), 111);
b.trace_line(traceHookAID, b.splat(0xFFFFFFFF), b.splat(0xFFFFFFFF), 222);
b.trace_line(traceHookCID, b.splat(0x00000000), b.splat(0x00000000), 333);
b.trace_line(traceHookBID, b.splat(0xFFFFFFFF), b.splat(0x00000000), 444);
b.trace_line(traceHookAID, b.splat(0x00000000), b.splat(0xFFFFFFFF), 555);
b.trace_line(traceHookBID, b.splat(0xFFFFFFFF), b.splat(0xFFFFFFFF), 666);
skvm::Program p = b.done();
p.eval(1);
REPORTER_ASSERT(r, (testTraceA.fBuffer == std::vector<int>{222}));
REPORTER_ASSERT(r, (testTraceB.fBuffer == std::vector<int>{666}));
REPORTER_ASSERT(r, (testTraceC.fBuffer == std::vector<int>{111}));
}
DEF_TEST(SkVM_premul, reporter) {
// Test that premul is short-circuited when alpha is known opaque.
{
skvm::Builder p;
auto rptr = p.varying<int>(),
aptr = p.varying<int>();
skvm::F32 r = p.loadF(rptr),
g = p.splat(0.0f),
b = p.splat(0.0f),
a = p.loadF(aptr);
p.premul(&r, &g, &b, a);
p.storeF(rptr, r);
// load red, load alpha, red *= alpha, store red
REPORTER_ASSERT(reporter, p.done().instructions().size() == 4);
}
{
skvm::Builder p;
auto rptr = p.varying<int>();
skvm::F32 r = p.loadF(rptr),
g = p.splat(0.0f),
b = p.splat(0.0f),
a = p.splat(1.0f);
p.premul(&r, &g, &b, a);
p.storeF(rptr, r);
// load red, store red
REPORTER_ASSERT(reporter, p.done().instructions().size() == 2);
}
// Same deal for unpremul.
{
skvm::Builder p;
auto rptr = p.varying<int>(),
aptr = p.varying<int>();
skvm::F32 r = p.loadF(rptr),
g = p.splat(0.0f),
b = p.splat(0.0f),
a = p.loadF(aptr);
p.unpremul(&r, &g, &b, a);
p.storeF(rptr, r);
// load red, load alpha, a bunch of unpremul instructions, store red
REPORTER_ASSERT(reporter, p.done().instructions().size() >= 4);
}
{
skvm::Builder p;
auto rptr = p.varying<int>();
skvm::F32 r = p.loadF(rptr),
g = p.splat(0.0f),
b = p.splat(0.0f),
a = p.splat(1.0f);
p.unpremul(&r, &g, &b, a);
p.storeF(rptr, r);
// load red, store red
REPORTER_ASSERT(reporter, p.done().instructions().size() == 2);
}
}
template <typename Fn>
static void test_asm(skiatest::Reporter* r, Fn&& fn, std::initializer_list<uint8_t> expected) {
uint8_t buf[4096];
skvm::Assembler a{buf};
fn(a);
REPORTER_ASSERT(r, a.size() == expected.size());
auto got = (const uint8_t*)buf,
want = expected.begin();
for (int i = 0; i < (int)std::min(a.size(), expected.size()); i++) {
REPORTER_ASSERT(r, got[i] == want[i],
"byte %d was %02x, want %02x", i, got[i], want[i]);
}
}
DEF_TEST(SkVM_Assembler, r) {
// Easiest way to generate test cases is
//
// echo '...some asm...' | llvm-mc -show-encoding -x86-asm-syntax=intel
//
// The -x86-asm-syntax=intel bit is optional, controlling the
// input syntax only; the output will always be AT&T op x,y,dst style.
// Our APIs read more like Intel op dst,x,y as op(dst,x,y), so I find
// that a bit easier to use here, despite maybe favoring AT&T overall.
using A = skvm::Assembler;
// Our exit strategy from AVX code.
test_asm(r, [&](A& a) {
a.int3();
a.vzeroupper();
a.ret();
},{
0xcc,
0xc5, 0xf8, 0x77,
0xc3,
});
// Align should pad with zero
test_asm(r, [&](A& a) {
a.ret();
a.align(4);
},{
0xc3,
0x00, 0x00, 0x00,
});
test_asm(r, [&](A& a) {
a.add(A::rax, 8); // Always good to test rax.
a.sub(A::rax, 32);
a.add(A::rdi, 12); // Last 0x48 REX
a.sub(A::rdi, 8);
a.add(A::r8 , 7); // First 0x49 REX
a.sub(A::r8 , 4);
a.add(A::rsi, 128); // Requires 4 byte immediate.
a.sub(A::r8 , 1000000);
a.add(A::Mem{A::rsi}, 7); // addq $7, (%rsi)
a.add(A::Mem{A::rsi, 12}, 7); // addq $7, 12(%rsi)
a.add(A::Mem{A::rsp, 12}, 7); // addq $7, 12(%rsp)
a.add(A::Mem{A::r12, 12}, 7); // addq $7, 12(%r12)
a.add(A::Mem{A::rsp, 12, A::rax, A::FOUR}, 7); // addq $7, 12(%rsp,%rax,4)
a.add(A::Mem{A::r12, 12, A::rax, A::FOUR}, 7); // addq $7, 12(%r12,%rax,4)
a.add(A::Mem{A::rax, 12, A::r12, A::FOUR}, 7); // addq $7, 12(%rax,%r12,4)
a.add(A::Mem{A::r11, 12, A::r8 , A::TWO }, 7); // addq $7, 12(%r11,%r8,2)
a.add(A::Mem{A::r11, 12, A::rax} , 7); // addq $7, 12(%r11,%rax)
a.add(A::Mem{A::rax, 12, A::r11} , 7); // addq $7, 12(%rax,%r11)
a.sub(A::Mem{A::rax, 12, A::r11} , 7); // subq $7, 12(%rax,%r11)
a.add( A::rax , A::rcx); // addq %rcx, %rax
a.add(A::Mem{A::rax} , A::rcx); // addq %rcx, (%rax)
a.add(A::Mem{A::rax, 12}, A::rcx); // addq %rcx, 12(%rax)
a.add(A::rcx, A::Mem{A::rax, 12}); // addq 12(%rax), %rcx
a.sub(A::rcx, A::Mem{A::rax, 12}); // subq 12(%rax), %rcx
},{
0x48, 0x83, 0b11'000'000, 0x08,
0x48, 0x83, 0b11'101'000, 0x20,
0x48, 0x83, 0b11'000'111, 0x0c,
0x48, 0x83, 0b11'101'111, 0x08,
0x49, 0x83, 0b11'000'000, 0x07,
0x49, 0x83, 0b11'101'000, 0x04,
0x48, 0x81, 0b11'000'110, 0x80, 0x00, 0x00, 0x00,
0x49, 0x81, 0b11'101'000, 0x40, 0x42, 0x0f, 0x00,
0x48,0x83,0x06,0x07,
0x48,0x83,0x46,0x0c,0x07,
0x48,0x83,0x44,0x24,0x0c,0x07,
0x49,0x83,0x44,0x24,0x0c,0x07,
0x48,0x83,0x44,0x84,0x0c,0x07,
0x49,0x83,0x44,0x84,0x0c,0x07,
0x4a,0x83,0x44,0xa0,0x0c,0x07,
0x4b,0x83,0x44,0x43,0x0c,0x07,
0x49,0x83,0x44,0x03,0x0c,0x07,
0x4a,0x83,0x44,0x18,0x0c,0x07,
0x4a,0x83,0x6c,0x18,0x0c,0x07,
0x48,0x01,0xc8,
0x48,0x01,0x08,
0x48,0x01,0x48,0x0c,
0x48,0x03,0x48,0x0c,
0x48,0x2b,0x48,0x0c,
});
test_asm(r, [&](A& a) {
a.vpaddd (A::ymm0, A::ymm1, A::ymm2); // Low registers and 0x0f map -> 2-byte VEX.
a.vpaddd (A::ymm8, A::ymm1, A::ymm2); // A high dst register is ok -> 2-byte VEX.
a.vpaddd (A::ymm0, A::ymm8, A::ymm2); // A high first argument register -> 2-byte VEX.
a.vpaddd (A::ymm0, A::ymm1, A::ymm8); // A high second argument -> 3-byte VEX.
a.vpmulld(A::ymm0, A::ymm1, A::ymm2); // Using non-0x0f map instruction -> 3-byte VEX.
a.vpsubd (A::ymm0, A::ymm1, A::ymm2); // Test vpsubd to ensure argument order is right.
},{
/* VEX */ /*op*/ /*modRM*/
0xc5, 0xf5, 0xfe, 0xc2,
0xc5, 0x75, 0xfe, 0xc2,
0xc5, 0xbd, 0xfe, 0xc2,
0xc4, 0xc1, 0x75, 0xfe, 0xc0,
0xc4, 0xe2, 0x75, 0x40, 0xc2,
0xc5, 0xf5, 0xfa, 0xc2,
});
implement some useful 16-bit instructions Add a slew of 16-bit instructions for experiments. I want to try a fixed-point path through SkVMBlitter, continuing to represent geometry with F32, but color channels in 16 bits, with several possible representations: - unorm8 lowp like SkRasterPipeline (0 -> 0.0, 0x00ff -> 1.0) - 15-bit SkFixed15 fixed-point (0 -> 0.0, 0x8000 -> 1.0) - 14-bit signed fixed-point (0 -> 0.0, ±0x4000 -> ±1.0) I'm leaning towards the 14-bit version for being able to hold a good range of temporary values in [-2,2), or perhaps even a 13-bit analog for even a little more safety range. Mostly something new to try. Most of these instructions are pretty obvious, with notes on a few: vpavgw is an unsigned (x+y+1)>>1, and is useful for converting unorm8 up to Q14. There are a couple ways to do this pretty well, and using vpavgw is the best, and uses the fewest instructions: A) (x << 6) + ( x >> 2) + (x == 255) // Ok approx. B) (x << 6) + ((x+1) >> 2) // Better approx. C) vpavgw(x << 7, x >> 1) // Perfect math! The best good reverse math I've found is (x >> 6) - (x > 16319). vpmulhrsw is the key to the whole thing as usual, letting us do 16x16->16-bit multiplies. An SkFixed15 multiply is vpmulhrsw followed by vpabsw (also added here), and a Q14 multiply is vpmulhrsw followed by a simple <<1. I've added both signed and unsigned min and max. Not entirely sure they'll all be used, but I do have my eye on vpminuw as a single-instruction clamp to [0,0x4000] ~~> [0.0,1.0], treating any negative Q14 as very large unsigned. Change-Id: I0db7f3f943ef6c9a600821444cc5b003fe5f675d Reviewed-on: https://skia-review.googlesource.com/c/skia/+/317119 Commit-Queue: Herb Derby <herb@google.com> Auto-Submit: Mike Klein <mtklein@google.com> Reviewed-by: Herb Derby <herb@google.com>
2020-09-15 12:57:27 +00:00
test_asm(r, [&](A& a) {
a.vpaddw (A::ymm4, A::ymm3, A::ymm2);
a.vpavgw (A::ymm4, A::ymm3, A::ymm2);
a.vpcmpeqw (A::ymm4, A::ymm3, A::ymm2);
a.vpcmpgtw (A::ymm4, A::ymm3, A::ymm2);
a.vpminsw (A::ymm4, A::ymm3, A::ymm2);
a.vpmaxsw (A::ymm4, A::ymm3, A::ymm2);
a.vpminuw (A::ymm4, A::ymm3, A::ymm2);
a.vpmaxuw (A::ymm4, A::ymm3, A::ymm2);
a.vpmulhrsw(A::ymm4, A::ymm3, A::ymm2);
a.vpabsw (A::ymm4, A::ymm3);
a.vpsllw (A::ymm4, A::ymm3, 12);
a.vpsraw (A::ymm4, A::ymm3, 12);
},{
0xc5, 0xe5, 0xfd, 0xe2,
0xc5, 0xe5, 0xe3, 0xe2,
0xc5, 0xe5, 0x75, 0xe2,
0xc5, 0xe5, 0x65, 0xe2,
0xc5, 0xe5, 0xea, 0xe2,
0xc5, 0xe5, 0xee, 0xe2,
0xc4,0xe2,0x65, 0x3a, 0xe2,
0xc4,0xe2,0x65, 0x3e, 0xe2,
0xc4,0xe2,0x65, 0x0b, 0xe2,
0xc4,0xe2,0x7d, 0x1d, 0xe3,
0xc5,0xdd,0x71, 0xf3, 0x0c,
0xc5,0xdd,0x71, 0xe3, 0x0c,
});
test_asm(r, [&](A& a) {
A::Label l;
a.vcmpeqps (A::ymm0, A::ymm1, &l); // vcmpeqps 0x1c(%rip), %ymm1, %ymm0
a.vpcmpeqd (A::ymm0, A::ymm1, A::ymm2);
a.vpcmpgtd (A::ymm0, A::ymm1, A::ymm2);
a.vcmpeqps (A::ymm0, A::ymm1, A::ymm2);
a.vcmpltps (A::ymm0, A::ymm1, A::ymm2);
a.vcmpleps (A::ymm0, A::ymm1, A::ymm2);
a.vcmpneqps(A::ymm0, A::ymm1, A::ymm2);
a.label(&l); // 28 bytes after the vcmpeqps that uses it.
},{
0xc5,0xf4,0xc2,0x05,0x1c,0x00,0x00,0x00,0x00,
0xc5,0xf5,0x76,0xc2,
0xc5,0xf5,0x66,0xc2,
0xc5,0xf4,0xc2,0xc2,0x00,
0xc5,0xf4,0xc2,0xc2,0x01,
0xc5,0xf4,0xc2,0xc2,0x02,
0xc5,0xf4,0xc2,0xc2,0x04,
});
test_asm(r, [&](A& a) {
a.vminps(A::ymm0, A::ymm1, A::ymm2);
a.vmaxps(A::ymm0, A::ymm1, A::ymm2);
},{
0xc5,0xf4,0x5d,0xc2,
0xc5,0xf4,0x5f,0xc2,
});
test_asm(r, [&](A& a) {
a.vpblendvb(A::ymm0, A::ymm1, A::ymm2, A::ymm3);
},{
0xc4,0xe3,0x75, 0x4c, 0xc2, 0x30,
});
test_asm(r, [&](A& a) {
a.vpsrld(A::ymm15, A::ymm2, 8);
a.vpsrld(A::ymm0 , A::ymm8, 5);
},{
0xc5, 0x85, 0x72,0xd2, 0x08,
0xc4,0xc1,0x7d, 0x72,0xd0, 0x05,
});
test_asm(r, [&](A& a) {
A::Label l;
a.vpermps(A::ymm1, A::ymm2, A::Mem{A::rdi, 32});
a.vperm2f128(A::ymm1, A::ymm2, &l, 0x20);
a.vpermq(A::ymm1, A::ymm2, 5);
a.label(&l); // 6 bytes after vperm2f128
},{
0xc4,0xe2,0x6d,0x16,0x4f,0x20,
0xc4,0xe3,0x6d,0x06,0x0d,0x06,0x00,0x00,0x00,0x20,
0xc4,0xe3,0xfd, 0x00,0xca, 0x05,
});
test_asm(r, [&](A& a) {
a.vpunpckldq(A::ymm1, A::ymm2, A::Mem{A::rdi});
a.vpunpckhdq(A::ymm1, A::ymm2, A::ymm3);
},{
0xc5,0xed,0x62,0x0f,
0xc5,0xed,0x6a,0xcb,
});
test_asm(r, [&](A& a) {
a.vroundps(A::ymm1, A::ymm2, A::NEAREST);
a.vroundps(A::ymm1, A::ymm2, A::FLOOR);
a.vroundps(A::ymm1, A::ymm2, A::CEIL);
a.vroundps(A::ymm1, A::ymm2, A::TRUNC);
},{
0xc4,0xe3,0x7d,0x08,0xca,0x00,
0xc4,0xe3,0x7d,0x08,0xca,0x01,
0xc4,0xe3,0x7d,0x08,0xca,0x02,
0xc4,0xe3,0x7d,0x08,0xca,0x03,
});
test_asm(r, [&](A& a) {
A::Label l;
a.label(&l);
a.byte(1);
a.byte(2);
a.byte(3);
a.byte(4);
a.vbroadcastss(A::ymm0 , &l);
a.vbroadcastss(A::ymm1 , &l);
a.vbroadcastss(A::ymm8 , &l);
a.vbroadcastss(A::ymm15, &l);
a.vpshufb(A::ymm4, A::ymm3, &l);
a.vpaddd (A::ymm4, A::ymm3, &l);
a.vpsubd (A::ymm4, A::ymm3, &l);
a.vptest(A::ymm4, &l);
sketch out structure for ops with immediates Lots of x86 instructions can take their right hand side argument from memory directly rather than a register. We can use this to avoid the need to allocate a register for many constants. The strategy in this CL is one of several I've been stewing over, the simplest of those strategies I think. There are some trade offs particularly on ARM; this naive ARM implementation means we'll load&op every time, even though the load part of the operation can logically be hoisted. From here on I'm going to just briefly enumerate a few other approaches that allow the optimization on x86 and still allow the immediate splats to hoist on ARM. 1) don't do it on ARM A very simple approach is to simply not perform this optimization on ARM. ARM has more vector registers than x86, and so register pressure is lower there. We're going to end up with splatted constants in registers anyway, so maybe just let that happen the normal way instead of some roundabout complicated hack like I'll talk about in 2). The only downside in my mind is that this approach would make high-level program descriptions platform dependent, which isn't so bad, but it's been nice to be able to compare and diff debug dumps. 2) split Op::splat up The next less-simple approach to this problem could fix this by splitting splats into two Ops internally, one inner Op::immediate that guantees at least the constant is in memory and is compatible with immediate-aware Ops like mul_f32_imm, and an outer Op::constant that depends on that Op::immediate and further guarantees that constant has been broadcast into a register to be compatible with non-immediate-aware ops like div_f32. When building a program, immediate-aware ops would peek for Op::constants as they do today for Op::splats, but instead of embedding the immediate themselves, they'd replace their dependency with the inner Op::immediate. On x86 these new Ops would work just as advertised, with Op::immediate a runtime no-op, Op::constant the usual vbroadcastss. On ARM Op::immediate needs to go all the way and splat out a register to make the constant compatible with immediate-aware ops, and the Op::constant becomes a noop now instead. All this comes together to let the Op::immediate splat hoist up out of the loop while still feeding Op::mul_f32_imm and co. It's a rather complicated approach to solving this issue, but I might want to explore it just to see how bad it is. 3) do it inside the x86 JIT The conceptually best approach is to find a way to do this peepholing only inside the JIT only on x86, avoiding the need for new Op::mul_f32_imm and co. ARM and the interpreter don't benefit from this peephole, so the x86 JIT is the logical owner of this optimization. Finding a clean way to do this without too much disruption is the least baked idea I've got here, though I think the most desirable long-term. Cq-Include-Trybots: skia.primary:Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Debug-All-SK_USE_SKVM_BLITTER,Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-All-SK_USE_SKVM_BLITTER Change-Id: Ie9c6336ed08b6fbeb89acf920a48a319f74f3643 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/254217 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Herb Derby <herb@google.com>
2019-11-12 15:07:23 +00:00
a.vmulps (A::ymm4, A::ymm3, &l);
},{
0x01, 0x02, 0x03, 0x4,
/* VEX */ /*op*/ /* ModRM */ /* offset */
0xc4, 0xe2, 0x7d, 0x18, 0b00'000'101, 0xf3,0xff,0xff,0xff, // 0xfffffff3 == -13
0xc4, 0xe2, 0x7d, 0x18, 0b00'001'101, 0xea,0xff,0xff,0xff, // 0xffffffea == -22
0xc4, 0x62, 0x7d, 0x18, 0b00'000'101, 0xe1,0xff,0xff,0xff, // 0xffffffe1 == -31
0xc4, 0x62, 0x7d, 0x18, 0b00'111'101, 0xd8,0xff,0xff,0xff, // 0xffffffd8 == -40
0xc4, 0xe2, 0x65, 0x00, 0b00'100'101, 0xcf,0xff,0xff,0xff, // 0xffffffcf == -49
0xc5, 0xe5, 0xfe, 0b00'100'101, 0xc7,0xff,0xff,0xff, // 0xffffffc7 == -57
0xc5, 0xe5, 0xfa, 0b00'100'101, 0xbf,0xff,0xff,0xff, // 0xffffffbf == -65
sketch out structure for ops with immediates Lots of x86 instructions can take their right hand side argument from memory directly rather than a register. We can use this to avoid the need to allocate a register for many constants. The strategy in this CL is one of several I've been stewing over, the simplest of those strategies I think. There are some trade offs particularly on ARM; this naive ARM implementation means we'll load&op every time, even though the load part of the operation can logically be hoisted. From here on I'm going to just briefly enumerate a few other approaches that allow the optimization on x86 and still allow the immediate splats to hoist on ARM. 1) don't do it on ARM A very simple approach is to simply not perform this optimization on ARM. ARM has more vector registers than x86, and so register pressure is lower there. We're going to end up with splatted constants in registers anyway, so maybe just let that happen the normal way instead of some roundabout complicated hack like I'll talk about in 2). The only downside in my mind is that this approach would make high-level program descriptions platform dependent, which isn't so bad, but it's been nice to be able to compare and diff debug dumps. 2) split Op::splat up The next less-simple approach to this problem could fix this by splitting splats into two Ops internally, one inner Op::immediate that guantees at least the constant is in memory and is compatible with immediate-aware Ops like mul_f32_imm, and an outer Op::constant that depends on that Op::immediate and further guarantees that constant has been broadcast into a register to be compatible with non-immediate-aware ops like div_f32. When building a program, immediate-aware ops would peek for Op::constants as they do today for Op::splats, but instead of embedding the immediate themselves, they'd replace their dependency with the inner Op::immediate. On x86 these new Ops would work just as advertised, with Op::immediate a runtime no-op, Op::constant the usual vbroadcastss. On ARM Op::immediate needs to go all the way and splat out a register to make the constant compatible with immediate-aware ops, and the Op::constant becomes a noop now instead. All this comes together to let the Op::immediate splat hoist up out of the loop while still feeding Op::mul_f32_imm and co. It's a rather complicated approach to solving this issue, but I might want to explore it just to see how bad it is. 3) do it inside the x86 JIT The conceptually best approach is to find a way to do this peepholing only inside the JIT only on x86, avoiding the need for new Op::mul_f32_imm and co. ARM and the interpreter don't benefit from this peephole, so the x86 JIT is the logical owner of this optimization. Finding a clean way to do this without too much disruption is the least baked idea I've got here, though I think the most desirable long-term. Cq-Include-Trybots: skia.primary:Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Debug-All-SK_USE_SKVM_BLITTER,Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-All-SK_USE_SKVM_BLITTER Change-Id: Ie9c6336ed08b6fbeb89acf920a48a319f74f3643 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/254217 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Herb Derby <herb@google.com>
2019-11-12 15:07:23 +00:00
0xc4, 0xe2, 0x7d, 0x17, 0b00'100'101, 0xb6,0xff,0xff,0xff, // 0xffffffb6 == -74
0xc5, 0xe4, 0x59, 0b00'100'101, 0xae,0xff,0xff,0xff, // 0xffffffaf == -82
});
test_asm(r, [&](A& a) {
a.vbroadcastss(A::ymm0, A::Mem{A::rdi, 0});
a.vbroadcastss(A::ymm13, A::Mem{A::r14, 7});
a.vbroadcastss(A::ymm8, A::Mem{A::rdx, -12});
a.vbroadcastss(A::ymm8, A::Mem{A::rdx, 400});
a.vbroadcastss(A::ymm8, A::xmm0);
a.vbroadcastss(A::ymm0, A::xmm13);
},{
/* VEX */ /*op*/ /*ModRM*/ /*offset*/
0xc4,0xe2,0x7d, 0x18, 0b00'000'111,
0xc4,0x42,0x7d, 0x18, 0b01'101'110, 0x07,
0xc4,0x62,0x7d, 0x18, 0b01'000'010, 0xf4,
0xc4,0x62,0x7d, 0x18, 0b10'000'010, 0x90,0x01,0x00,0x00,
0xc4,0x62,0x7d, 0x18, 0b11'000'000,
0xc4,0xc2,0x7d, 0x18, 0b11'000'101,
});
test_asm(r, [&](A& a) {
A::Label l;
a.label(&l);
a.jne(&l);
a.jne(&l);
a.je (&l);
a.jmp(&l);
a.jl (&l);
a.jc (&l);
a.cmp(A::rdx, 1);
a.cmp(A::rax, 12);
a.cmp(A::r14, 2000000000);
},{
0x0f,0x85, 0xfa,0xff,0xff,0xff, // near jne -6 bytes
0x0f,0x85, 0xf4,0xff,0xff,0xff, // near jne -12 bytes
0x0f,0x84, 0xee,0xff,0xff,0xff, // near je -18 bytes
0xe9, 0xe9,0xff,0xff,0xff, // near jmp -23 bytes
0x0f,0x8c, 0xe3,0xff,0xff,0xff, // near jl -29 bytes
0x0f,0x82, 0xdd,0xff,0xff,0xff, // near jc -35 bytes
0x48,0x83,0xfa,0x01,
0x48,0x83,0xf8,0x0c,
0x49,0x81,0xfe,0x00,0x94,0x35,0x77,
});
test_asm(r, [&](A& a) {
a.vmovups(A::ymm5, A::Mem{A::rsi});
a.vmovups(A::Mem{A::rsi}, A::ymm5);
a.vmovups(A::xmm5, A::Mem{A::rsi});
a.vmovups(A::Mem{A::rsi}, A::xmm5);
a.vpmovzxwd(A::ymm4, A::Mem{A::rsi});
a.vpmovzxbd(A::ymm4, A::Mem{A::rsi});
a.vmovq(A::Mem{A::rdx}, A::xmm15);
},{
/* VEX */ /*Op*/ /* ModRM */
0xc5, 0xfc, 0x10, 0b00'101'110,
0xc5, 0xfc, 0x11, 0b00'101'110,
0xc5, 0xf8, 0x10, 0b00'101'110,
0xc5, 0xf8, 0x11, 0b00'101'110,
0xc4,0xe2,0x7d, 0x33, 0b00'100'110,
0xc4,0xe2,0x7d, 0x31, 0b00'100'110,
0xc5, 0x79, 0xd6, 0b00'111'010,
});
test_asm(r, [&](A& a) {
a.vmovups(A::ymm5, A::Mem{A::rsp, 0});
a.vmovups(A::ymm5, A::Mem{A::rsp, 64});
a.vmovups(A::ymm5, A::Mem{A::rsp,128});
a.vmovups(A::Mem{A::rsp, 0}, A::ymm5);
a.vmovups(A::Mem{A::rsp, 64}, A::ymm5);
a.vmovups(A::Mem{A::rsp,128}, A::ymm5);
},{
0xc5,0xfc,0x10,0x2c,0x24,
0xc5,0xfc,0x10,0x6c,0x24,0x40,
0xc5,0xfc,0x10,0xac,0x24,0x80,0x00,0x00,0x00,
0xc5,0xfc,0x11,0x2c,0x24,
0xc5,0xfc,0x11,0x6c,0x24,0x40,
0xc5,0xfc,0x11,0xac,0x24,0x80,0x00,0x00,0x00,
});
test_asm(r, [&](A& a) {
a.movzbq(A::rax, A::Mem{A::rsi}); // Low registers for src and dst.
a.movzbq(A::rax, A::Mem{A::r8,}); // High src register.
a.movzbq(A::r8 , A::Mem{A::rsi}); // High dst register.
a.movzbq(A::r8, A::Mem{A::rsi, 12});
a.movzbq(A::r8, A::Mem{A::rsi, 400});
a.movzwq(A::rax, A::Mem{A::rsi}); // Low registers for src and dst.
a.movzwq(A::rax, A::Mem{A::r8,}); // High src register.
a.movzwq(A::r8 , A::Mem{A::rsi}); // High dst register.
a.movzwq(A::r8, A::Mem{A::rsi, 12});
a.movzwq(A::r8, A::Mem{A::rsi, 400});
a.vmovd(A::Mem{A::rax}, A::xmm0);
a.vmovd(A::Mem{A::rax}, A::xmm8);
a.vmovd(A::Mem{A::r8 }, A::xmm0);
a.vmovd(A::xmm0, A::Mem{A::rax});
a.vmovd(A::xmm8, A::Mem{A::rax});
a.vmovd(A::xmm0, A::Mem{A::r8 });
a.vmovd(A::xmm0 , A::Mem{A::rax, 0, A::rcx, A::FOUR});
a.vmovd(A::xmm15, A::Mem{A::rax, 0, A::r8, A::TWO });
a.vmovd(A::xmm0 , A::Mem{A::r8 , 0, A::rcx});
a.vmovd(A::rax, A::xmm0);
a.vmovd(A::rax, A::xmm8);
a.vmovd(A::r8 , A::xmm0);
a.vmovd(A::xmm0, A::rax);
a.vmovd(A::xmm8, A::rax);
a.vmovd(A::xmm0, A::r8 );
a.movb(A::Mem{A::rdx}, A::rax);
a.movb(A::Mem{A::rdx}, A::r8 );
a.movb(A::Mem{A::r8 }, A::rax);
a.movb(A::rdx, A::Mem{A::rax});
a.movb(A::rdx, A::Mem{A::r8 });
a.movb(A::r8 , A::Mem{A::rax});
a.movb(A::rdx, 12);
a.movb(A::rax, 4);
a.movb(A::r8 , -1);
a.movb(A::Mem{A::rdx}, 12);
a.movb(A::Mem{A::rax}, 4);
a.movb(A::Mem{A::r8 }, -1);
},{
0x48,0x0f,0xb6,0x06, // movzbq (%rsi), %rax
0x49,0x0f,0xb6,0x00,
0x4c,0x0f,0xb6,0x06,
0x4c,0x0f,0xb6,0x46, 12,
0x4c,0x0f,0xb6,0x86, 0x90,0x01,0x00,0x00,
0x48,0x0f,0xb7,0x06, // movzwq (%rsi), %rax
0x49,0x0f,0xb7,0x00,
0x4c,0x0f,0xb7,0x06,
0x4c,0x0f,0xb7,0x46, 12,
0x4c,0x0f,0xb7,0x86, 0x90,0x01,0x00,0x00,
0xc5,0xf9,0x7e,0x00,
0xc5,0x79,0x7e,0x00,
0xc4,0xc1,0x79,0x7e,0x00,
0xc5,0xf9,0x6e,0x00,
0xc5,0x79,0x6e,0x00,
0xc4,0xc1,0x79,0x6e,0x00,
0xc5,0xf9,0x6e,0x04,0x88,
0xc4,0x21,0x79,0x6e,0x3c,0x40,
0xc4,0xc1,0x79,0x6e,0x04,0x08,
0xc5,0xf9,0x7e,0xc0,
0xc5,0x79,0x7e,0xc0,
0xc4,0xc1,0x79,0x7e,0xc0,
0xc5,0xf9,0x6e,0xc0,
0xc5,0x79,0x6e,0xc0,
0xc4,0xc1,0x79,0x6e,0xc0,
0x48 ,0x88, 0x02,
0x4c, 0x88, 0x02,
0x49, 0x88, 0x00,
0x48 ,0x8a, 0x10,
0x49, 0x8a, 0x10,
0x4c, 0x8a, 0x00,
0x48, 0xc6, 0xc2, 0x0c,
0x48, 0xc6, 0xc0, 0x04,
0x49, 0xc6, 0xc0, 0xff,
0x48, 0xc6, 0x02, 0x0c,
0x48, 0xc6, 0x00, 0x04,
0x49, 0xc6, 0x00, 0xff,
});
test_asm(r, [&](A& a) {
a.vpinsrd(A::xmm1, A::xmm8, A::Mem{A::rsi}, 1); // vpinsrd $1, (%rsi), %xmm8, %xmm1
a.vpinsrd(A::xmm8, A::xmm1, A::Mem{A::r8 }, 3); // vpinsrd $3, (%r8), %xmm1, %xmm8;
a.vpinsrw(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4); // vpinsrw $4, (%rsi), %xmm8, %xmm1
a.vpinsrw(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12); // vpinrsw $12, (%r8), %xmm1, %xmm8
a.vpinsrb(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4); // vpinsrb $4, (%rsi), %xmm8, %xmm1
a.vpinsrb(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12); // vpinsrb $12, (%r8), %xmm1, %xmm8
Reland "Reland "gather8/16 JIT support"" This is a reland of 1283d55f35495c38f3a80b1fc5611981ddd6315f ... this time, also checking for HSW feature set. Original change's description: > Reland "gather8/16 JIT support" > > This is a reland of 54659e51bccc106b67ba36d5e91cac457d84b99e > > ... now expecting not to JIT when under ASAN/MSAN. > > Original change's description: > > gather8/16 JIT support > > > > The basic strategy is one at a time, inserting 8- or 16-bit values > > into an Xmm register, then expanding to 32-bit in a Ymm at the end > > using vpmovzx{b,w}d instructions. > > > > Somewhat annoyingly we can only pull indices from an Xmm register, > > so we grab the first four then shift down the top before the rest. > > > > Added a unit test to get coverage where the indices are reused and > > not consumed directly by the gather instruction. It's an important > > case, needing to find another register for accum that can't just be > > dst(), but there's no natural coverage of that anywhere. > > > > Change-Id: I8189ead2364060f10537a2f9364d63338a7e596f > > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284311 > > Reviewed-by: Herb Derby <herb@google.com> > > Commit-Queue: Mike Klein <mtklein@google.com> > > Change-Id: I67f441615b312b47e7a3182e85e0f787286d7717 > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284472 > Reviewed-by: Herb Derby <herb@google.com> > Commit-Queue: Mike Klein <mtklein@google.com> Change-Id: Id0e53ab67f7a70fe42dccca1d9912b07ec11b54d Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284504 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-04-17 18:57:13 +00:00
a.vextracti128(A::xmm1, A::ymm8, 1); // vextracti128 $1, %ymm8, %xmm1
a.vextracti128(A::xmm8, A::ymm1, 0); // vextracti128 $0, %ymm1, %xmm8
a.vpextrd(A::Mem{A::rsi}, A::xmm8, 3); // vpextrd $3, %xmm8, (%rsi)
a.vpextrd(A::Mem{A::r8 }, A::xmm1, 2); // vpextrd $2, %xmm1, (%r8)
a.vpextrw(A::Mem{A::rsi}, A::xmm8, 7);
a.vpextrw(A::Mem{A::r8 }, A::xmm1, 15);
a.vpextrb(A::Mem{A::rsi}, A::xmm8, 7);
a.vpextrb(A::Mem{A::r8 }, A::xmm1, 15);
},{
0xc4,0xe3,0x39, 0x22, 0x0e, 1,
0xc4,0x43,0x71, 0x22, 0x00, 3,
0xc5,0xb9, 0xc4, 0x0e, 4,
0xc4,0x41,0x71, 0xc4, 0x00, 12,
0xc4,0xe3,0x39, 0x20, 0x0e, 4,
0xc4,0x43,0x71, 0x20, 0x00, 12,
Reland "Reland "gather8/16 JIT support"" This is a reland of 1283d55f35495c38f3a80b1fc5611981ddd6315f ... this time, also checking for HSW feature set. Original change's description: > Reland "gather8/16 JIT support" > > This is a reland of 54659e51bccc106b67ba36d5e91cac457d84b99e > > ... now expecting not to JIT when under ASAN/MSAN. > > Original change's description: > > gather8/16 JIT support > > > > The basic strategy is one at a time, inserting 8- or 16-bit values > > into an Xmm register, then expanding to 32-bit in a Ymm at the end > > using vpmovzx{b,w}d instructions. > > > > Somewhat annoyingly we can only pull indices from an Xmm register, > > so we grab the first four then shift down the top before the rest. > > > > Added a unit test to get coverage where the indices are reused and > > not consumed directly by the gather instruction. It's an important > > case, needing to find another register for accum that can't just be > > dst(), but there's no natural coverage of that anywhere. > > > > Change-Id: I8189ead2364060f10537a2f9364d63338a7e596f > > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284311 > > Reviewed-by: Herb Derby <herb@google.com> > > Commit-Queue: Mike Klein <mtklein@google.com> > > Change-Id: I67f441615b312b47e7a3182e85e0f787286d7717 > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284472 > Reviewed-by: Herb Derby <herb@google.com> > Commit-Queue: Mike Klein <mtklein@google.com> Change-Id: Id0e53ab67f7a70fe42dccca1d9912b07ec11b54d Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284504 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-04-17 18:57:13 +00:00
0xc4,0x63,0x7d,0x39,0xc1, 1,
0xc4,0xc3,0x7d,0x39,0xc8, 0,
0xc4,0x63,0x79,0x16,0x06, 3,
0xc4,0xc3,0x79,0x16,0x08, 2,
0xc4,0x63,0x79, 0x15, 0x06, 7,
0xc4,0xc3,0x79, 0x15, 0x08, 15,
0xc4,0x63,0x79, 0x14, 0x06, 7,
0xc4,0xc3,0x79, 0x14, 0x08, 15,
});
test_asm(r, [&](A& a) {
a.vpandn(A::ymm3, A::ymm12, A::ymm2);
},{
0xc5, 0x9d, 0xdf, 0xda,
});
test_asm(r, [&](A& a) {
A::Label l;
a.vmovdqa(A::ymm3, A::ymm2); // vmovdqa %ymm2 , %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsi}); // vmovdqa (%rsi) , %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsp}); // vmovdqa (%rsp) , %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::r11}); // vmovdqa (%r11) , %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsi, 4}); // vmovdqa 4(%rsi) , %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsp, 4}); // vmovdqa 4(%rsp) , %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsi, 4, A::rax, A::EIGHT}); // vmovdqa 4(%rsi,%rax,8), %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::r11, 4, A::rax, A::TWO }); // vmovdqa 4(%r11,%rax,2), %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsi, 4, A::r11, A::FOUR }); // vmovdqa 4(%rsi,%r11,4), %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsi, 4, A::r11, A::ONE }); // vmovdqa 4(%rsi,%r11,1), %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsi, 4, A::r11}); // vmovdqa 4(%rsi,%r11) , %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsi, 64, A::r11}); // vmovdqa 64(%rsi,%r11), %ymm3
a.vmovdqa(A::ymm3, A::Mem{A::rsi, 128, A::r11}); // vmovdqa 128(%rsi,%r11), %ymm3
a.vmovdqa(A::ymm3, &l); // vmovdqa 16(%rip) , %ymm3
a.vcvttps2dq(A::ymm3, A::ymm2);
a.vcvtdq2ps (A::ymm3, A::ymm2);
a.vcvtps2dq (A::ymm3, A::ymm2);
a.vsqrtps (A::ymm3, A::ymm2);
a.label(&l);
},{
0xc5,0xfd,0x6f,0xda,
0xc5,0xfd,0x6f,0x1e,
0xc5,0xfd,0x6f,0x1c,0x24,
0xc4,0xc1,0x7d,0x6f,0x1b,
0xc5,0xfd,0x6f,0x5e,0x04,
0xc5,0xfd,0x6f,0x5c,0x24,0x04,
0xc5,0xfd,0x6f,0x5c,0xc6,0x04,
0xc4,0xc1,0x7d,0x6f,0x5c,0x43,0x04,
0xc4,0xa1,0x7d,0x6f,0x5c,0x9e,0x04,
0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x04,
0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x04,
0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x40,
0xc4,0xa1,0x7d,0x6f,0x9c,0x1e,0x80,0x00,0x00,0x00,
0xc5,0xfd,0x6f,0x1d,0x10,0x00,0x00,0x00,
0xc5,0xfe,0x5b,0xda,
0xc5,0xfc,0x5b,0xda,
0xc5,0xfd,0x5b,0xda,
0xc5,0xfc,0x51,0xda,
});
test_asm(r, [&](A& a) {
a.vcvtps2ph(A::xmm3, A::ymm2, A::CURRENT);
a.vcvtps2ph(A::Mem{A::rsi, 32, A::rax, A::EIGHT}, A::ymm5, A::CEIL);
a.vcvtph2ps(A::ymm15, A::Mem{A::rdi, 12, A::r9, A::ONE});
a.vcvtph2ps(A::ymm2, A::xmm3);
},{
0xc4,0xe3,0x7d,0x1d,0xd3,0x04,
0xc4,0xe3,0x7d,0x1d,0x6c,0xc6,0x20,0x02,
0xc4,0x22,0x7d,0x13,0x7c,0x0f,0x0c,
0xc4,0xe2,0x7d,0x13,0xd3,
});
test_asm(r, [&](A& a) {
a.vgatherdps(A::ymm1 , A::FOUR , A::ymm0 , A::rdi, A::ymm2 );
a.vgatherdps(A::ymm0 , A::ONE , A::ymm2 , A::rax, A::ymm1 );
a.vgatherdps(A::ymm10, A::ONE , A::ymm2 , A::rax, A::ymm1 );
a.vgatherdps(A::ymm0 , A::ONE , A::ymm12, A::rax, A::ymm1 );
a.vgatherdps(A::ymm0 , A::ONE , A::ymm2 , A::r9 , A::ymm1 );
a.vgatherdps(A::ymm0 , A::ONE , A::ymm2 , A::rax, A::ymm12);
a.vgatherdps(A::ymm0 , A::EIGHT, A::ymm2 , A::rax, A::ymm12);
},{
0xc4,0xe2,0x6d,0x92,0x0c,0x87,
0xc4,0xe2,0x75,0x92,0x04,0x10,
0xc4,0x62,0x75,0x92,0x14,0x10,
0xc4,0xa2,0x75,0x92,0x04,0x20,
0xc4,0xc2,0x75,0x92,0x04,0x11,
0xc4,0xe2,0x1d,0x92,0x04,0x10,
0xc4,0xe2,0x1d,0x92,0x04,0xd0,
});
test_asm(r, [&](A& a) {
a.mov(A::rax, A::Mem{A::rdi, 0});
a.mov(A::rax, A::Mem{A::rdi, 1});
a.mov(A::rax, A::Mem{A::rdi, 512});
a.mov(A::r15, A::Mem{A::r13, 42});
a.mov(A::rax, A::Mem{A::r13, 42});
a.mov(A::r15, A::Mem{A::rax, 42});
a.mov(A::rax, 1);
a.mov(A::rax, A::rcx);
},{
0x48, 0x8b, 0x07,
0x48, 0x8b, 0x47, 0x01,
0x48, 0x8b, 0x87, 0x00,0x02,0x00,0x00,
0x4d, 0x8b, 0x7d, 0x2a,
0x49, 0x8b, 0x45, 0x2a,
0x4c, 0x8b, 0x78, 0x2a,
0x48, 0xc7, 0xc0, 0x01,0x00,0x00,0x00,
0x48, 0x89, 0xc8,
});
// echo "fmul v4.4s, v3.4s, v1.4s" | llvm-mc -show-encoding -arch arm64
test_asm(r, [&](A& a) {
a.and16b(A::v4, A::v3, A::v1);
a.orr16b(A::v4, A::v3, A::v1);
a.eor16b(A::v4, A::v3, A::v1);
a.bic16b(A::v4, A::v3, A::v1);
a.bsl16b(A::v4, A::v3, A::v1);
a.not16b(A::v4, A::v3);
a.add4s(A::v4, A::v3, A::v1);
a.sub4s(A::v4, A::v3, A::v1);
a.mul4s(A::v4, A::v3, A::v1);
a.cmeq4s(A::v4, A::v3, A::v1);
a.cmgt4s(A::v4, A::v3, A::v1);
a.sub8h(A::v4, A::v3, A::v1);
a.mul8h(A::v4, A::v3, A::v1);
a.fadd4s(A::v4, A::v3, A::v1);
a.fsub4s(A::v4, A::v3, A::v1);
a.fmul4s(A::v4, A::v3, A::v1);
a.fdiv4s(A::v4, A::v3, A::v1);
a.fmin4s(A::v4, A::v3, A::v1);
a.fmax4s(A::v4, A::v3, A::v1);
a.fneg4s (A::v4, A::v3);
a.fsqrt4s(A::v4, A::v3);
a.fmla4s(A::v4, A::v3, A::v1);
a.fmls4s(A::v4, A::v3, A::v1);
a.fcmeq4s(A::v4, A::v3, A::v1);
a.fcmgt4s(A::v4, A::v3, A::v1);
a.fcmge4s(A::v4, A::v3, A::v1);
},{
0x64,0x1c,0x21,0x4e,
0x64,0x1c,0xa1,0x4e,
0x64,0x1c,0x21,0x6e,
0x64,0x1c,0x61,0x4e,
0x64,0x1c,0x61,0x6e,
0x64,0x58,0x20,0x6e,
0x64,0x84,0xa1,0x4e,
0x64,0x84,0xa1,0x6e,
0x64,0x9c,0xa1,0x4e,
0x64,0x8c,0xa1,0x6e,
0x64,0x34,0xa1,0x4e,
0x64,0x84,0x61,0x6e,
0x64,0x9c,0x61,0x4e,
0x64,0xd4,0x21,0x4e,
0x64,0xd4,0xa1,0x4e,
0x64,0xdc,0x21,0x6e,
0x64,0xfc,0x21,0x6e,
0x64,0xf4,0xa1,0x4e,
0x64,0xf4,0x21,0x4e,
0x64,0xf8,0xa0,0x6e,
0x64,0xf8,0xa1,0x6e,
0x64,0xcc,0x21,0x4e,
0x64,0xcc,0xa1,0x4e,
0x64,0xe4,0x21,0x4e,
0x64,0xe4,0xa1,0x6e,
0x64,0xe4,0x21,0x6e,
});
test_asm(r, [&](A& a) {
a.shl4s(A::v4, A::v3, 0);
a.shl4s(A::v4, A::v3, 1);
a.shl4s(A::v4, A::v3, 8);
a.shl4s(A::v4, A::v3, 16);
a.shl4s(A::v4, A::v3, 31);
a.sshr4s(A::v4, A::v3, 1);
a.sshr4s(A::v4, A::v3, 8);
a.sshr4s(A::v4, A::v3, 31);
a.ushr4s(A::v4, A::v3, 1);
a.ushr4s(A::v4, A::v3, 8);
a.ushr4s(A::v4, A::v3, 31);
a.ushr8h(A::v4, A::v3, 1);
a.ushr8h(A::v4, A::v3, 8);
a.ushr8h(A::v4, A::v3, 15);
},{
0x64,0x54,0x20,0x4f,
0x64,0x54,0x21,0x4f,
0x64,0x54,0x28,0x4f,
0x64,0x54,0x30,0x4f,
0x64,0x54,0x3f,0x4f,
0x64,0x04,0x3f,0x4f,
0x64,0x04,0x38,0x4f,
0x64,0x04,0x21,0x4f,
0x64,0x04,0x3f,0x6f,
0x64,0x04,0x38,0x6f,
0x64,0x04,0x21,0x6f,
0x64,0x04,0x1f,0x6f,
0x64,0x04,0x18,0x6f,
0x64,0x04,0x11,0x6f,
});
test_asm(r, [&](A& a) {
a.sli4s(A::v4, A::v3, 0);
a.sli4s(A::v4, A::v3, 1);
a.sli4s(A::v4, A::v3, 8);
a.sli4s(A::v4, A::v3, 16);
a.sli4s(A::v4, A::v3, 31);
},{
0x64,0x54,0x20,0x6f,
0x64,0x54,0x21,0x6f,
0x64,0x54,0x28,0x6f,
0x64,0x54,0x30,0x6f,
0x64,0x54,0x3f,0x6f,
});
test_asm(r, [&](A& a) {
a.scvtf4s (A::v4, A::v3);
a.fcvtzs4s(A::v4, A::v3);
a.fcvtns4s(A::v4, A::v3);
a.frintp4s(A::v4, A::v3);
a.frintm4s(A::v4, A::v3);
a.fcvtn (A::v4, A::v3);
a.fcvtl (A::v4, A::v3);
},{
0x64,0xd8,0x21,0x4e,
0x64,0xb8,0xa1,0x4e,
0x64,0xa8,0x21,0x4e,
0x64,0x88,0xa1,0x4e,
0x64,0x98,0x21,0x4e,
0x64,0x68,0x21,0x0e,
0x64,0x78,0x21,0x0e,
});
test_asm(r, [&](A& a) {
a.sub (A::sp, A::sp, 32); // sub sp, sp, #32
a.strq(A::v0, A::sp, 1); // str q0, [sp, #16]
a.strq(A::v1, A::sp); // str q1, [sp]
a.strd(A::v0, A::sp, 6); // str s0, [sp, #48]
a.strs(A::v0, A::sp, 6); // str s0, [sp, #24]
a.strh(A::v0, A::sp, 10); // str h0, [sp, #20]
a.strb(A::v0, A::sp, 47); // str b0, [sp, #47]
a.ldrb(A::v9, A::sp, 42); // ldr b9, [sp, #42]
a.ldrh(A::v9, A::sp, 47); // ldr h9, [sp, #94]
a.ldrs(A::v7, A::sp, 10); // ldr s7, [sp, #40]
a.ldrd(A::v7, A::sp, 1); // ldr d7, [sp, #8]
a.ldrq(A::v5, A::sp, 128); // ldr q5, [sp, #2048]
a.add (A::sp, A::sp, 32); // add sp, sp, #32
},{
0xff,0x83,0x00,0xd1,
0xe0,0x07,0x80,0x3d,
0xe1,0x03,0x80,0x3d,
0xe0,0x1b,0x00,0xfd,
0xe0,0x1b,0x00,0xbd,
0xe0,0x2b,0x00,0x7d,
0xe0,0xbf,0x00,0x3d,
0xe9,0xab,0x40,0x3d,
0xe9,0xbf,0x40,0x7d,
0xe7,0x2b,0x40,0xbd,
0xe7,0x07,0x40,0xfd,
0xe5,0x03,0xc2,0x3d,
0xff,0x83,0x00,0x91,
});
test_asm(r, [&](A& a) {
a.brk(0);
a.brk(65535);
a.ret(A::x30); // Conventional ret using link register.
a.ret(A::x13); // Can really return using any register if we like.
a.add(A::x2, A::x2, 4);
a.add(A::x3, A::x2, 32);
a.sub(A::x2, A::x2, 4);
a.sub(A::x3, A::x2, 32);
a.subs(A::x2, A::x2, 4);
a.subs(A::x3, A::x2, 32);
a.subs(A::xzr, A::x2, 4); // These are actually the same instruction!
a.cmp(A::x2, 4);
A::Label l;
a.label(&l);
a.bne(&l);
a.bne(&l);
a.blt(&l);
a.b(&l);
a.cbnz(A::x2, &l);
a.cbz(A::x2, &l);
a.add(A::x3, A::x2, A::x1); // add x3,x2,x1
a.add(A::x3, A::x2, A::x1, A::ASR, 3); // add x3,x2,x1, asr #3
},{
0x00,0x00,0x20,0xd4,
0xe0,0xff,0x3f,0xd4,
0xc0,0x03,0x5f,0xd6,
0xa0,0x01,0x5f,0xd6,
0x42,0x10,0x00,0x91,
0x43,0x80,0x00,0x91,
0x42,0x10,0x00,0xd1,
0x43,0x80,0x00,0xd1,
0x42,0x10,0x00,0xf1,
0x43,0x80,0x00,0xf1,
0x5f,0x10,0x00,0xf1,
0x5f,0x10,0x00,0xf1,
0x01,0x00,0x00,0x54, // b.ne #0
0xe1,0xff,0xff,0x54, // b.ne #-4
0xcb,0xff,0xff,0x54, // b.lt #-8
0xae,0xff,0xff,0x54, // b.al #-12
0x82,0xff,0xff,0xb5, // cbnz x2, #-16
0x62,0xff,0xff,0xb4, // cbz x2, #-20
0x43,0x00,0x01,0x8b,
0x43,0x0c,0x81,0x8b,
});
// Can we cbz() to a not-yet-defined label?
test_asm(r, [&](A& a) {
A::Label l;
a.cbz(A::x2, &l);
a.add(A::x3, A::x2, 32);
a.label(&l);
a.ret(A::x30);
},{
0x42,0x00,0x00,0xb4, // cbz x2, #8
0x43,0x80,0x00,0x91, // add x3, x2, #32
0xc0,0x03,0x5f,0xd6, // ret
});
// If we start a label as a backward label,
// can we redefine it to be a future label?
// (Not sure this is useful... just want to test it works.)
test_asm(r, [&](A& a) {
A::Label l1;
a.label(&l1);
a.add(A::x3, A::x2, 32);
a.cbz(A::x2, &l1); // This will jump backward... nothing sneaky.
A::Label l2; // Start off the same...
a.label(&l2);
a.add(A::x3, A::x2, 32);
a.cbz(A::x2, &l2); // Looks like this will go backward...
a.add(A::x2, A::x2, 4);
a.add(A::x3, A::x2, 32);
a.label(&l2); // But no... actually forward! What a switcheroo!
},{
0x43,0x80,0x00,0x91, // add x3, x2, #32
0xe2,0xff,0xff,0xb4, // cbz x2, #-4
0x43,0x80,0x00,0x91, // add x3, x2, #32
0x62,0x00,0x00,0xb4, // cbz x2, #12
0x42,0x10,0x00,0x91, // add x2, x2, #4
0x43,0x80,0x00,0x91, // add x3, x2, #32
});
// Loading from a label on ARM.
test_asm(r, [&](A& a) {
A::Label fore,aft;
a.label(&fore);
a.word(0x01234567);
a.ldrq(A::v1, &fore);
a.ldrq(A::v2, &aft);
a.label(&aft);
a.word(0x76543210);
},{
0x67,0x45,0x23,0x01,
0xe1,0xff,0xff,0x9c, // ldr q1, #-4
0x22,0x00,0x00,0x9c, // ldr q2, #4
0x10,0x32,0x54,0x76,
});
test_asm(r, [&](A& a) {
a.ldrq(A::v0, A::x8);
a.strq(A::v0, A::x8);
},{
0x00,0x01,0xc0,0x3d,
0x00,0x01,0x80,0x3d,
});
test_asm(r, [&](A& a) {
a.dup4s (A::v0, A::x8);
a.ld1r4s (A::v0, A::x8); // echo 'ld1r.4s {v0}, [x8]' | llvm-mc --show-encoding
a.ld1r8h (A::v0, A::x8);
a.ld1r16b(A::v0, A::x8);
},{
0x00,0x0d,0x04,0x4e,
0x00,0xc9,0x40,0x4d,
0x00,0xc5,0x40,0x4d,
0x00,0xc1,0x40,0x4d,
});
test_asm(r, [&](A& a) {
a.ld24s(A::v0, A::x8); // echo 'ld2.4s {v0,v1}, [x8]' | llvm-mc --show-encoding
a.ld44s(A::v0, A::x8);
a.st24s(A::v0, A::x8);
a.st44s(A::v0, A::x8); // echo 'st4.4s {v0,v1,v2,v3}, [x8]' | llvm-mc --show-encoding
a.ld24s(A::v0, A::x8, 0); //echo 'ld2 {v0.s,v1.s}[0], [x8]' | llvm-mc --show-encoding
a.ld24s(A::v0, A::x8, 1);
a.ld24s(A::v0, A::x8, 2);
a.ld24s(A::v0, A::x8, 3);
a.ld44s(A::v0, A::x8, 0); // ld4 {v0.s,v1.s,v2.s,v3.s}[0], [x8]
a.ld44s(A::v0, A::x8, 1);
a.ld44s(A::v0, A::x8, 2);
a.ld44s(A::v0, A::x8, 3);
},{
0x00,0x89,0x40,0x4c,
0x00,0x09,0x40,0x4c,
0x00,0x89,0x00,0x4c,
0x00,0x09,0x00,0x4c,
0x00,0x81,0x60,0x0d,
0x00,0x91,0x60,0x0d,
0x00,0x81,0x60,0x4d,
0x00,0x91,0x60,0x4d,
0x00,0xa1,0x60,0x0d,
0x00,0xb1,0x60,0x0d,
0x00,0xa1,0x60,0x4d,
0x00,0xb1,0x60,0x4d,
});
test_asm(r, [&](A& a) {
a.xtns2h(A::v0, A::v0);
a.xtnh2b(A::v0, A::v0);
a.strs (A::v0, A::x0);
a.ldrs (A::v0, A::x0);
a.uxtlb2h(A::v0, A::v0);
a.uxtlh2s(A::v0, A::v0);
a.uminv4s(A::v3, A::v4);
a.movs (A::x3, A::v4,0); // mov.s w3,v4[0]
a.movs (A::x3, A::v4,1); // mov.s w3,v4[1]
a.inss (A::v4, A::x3,3); // ins.s v4[3],w3
},{
0x00,0x28,0x61,0x0e,
0x00,0x28,0x21,0x0e,
0x00,0x00,0x00,0xbd,
0x00,0x00,0x40,0xbd,
0x00,0xa4,0x08,0x2f,
0x00,0xa4,0x10,0x2f,
0x83,0xa8,0xb1,0x6e,
0x83,0x3c,0x04,0x0e,
0x83,0x3c,0x0c,0x0e,
0x64,0x1c,0x1c,0x4e,
});
test_asm(r, [&](A& a) {
a.ldrb(A::v0, A::x8);
a.strb(A::v0, A::x8);
},{
0x00,0x01,0x40,0x3d,
0x00,0x01,0x00,0x3d,
});
test_asm(r, [&](A& a) {
a.ldrd(A::x0, A::x1, 3); // ldr x0, [x1, #24]
a.ldrs(A::x0, A::x1, 3); // ldr w0, [x1, #12]
a.ldrh(A::x0, A::x1, 3); // ldrh w0, [x1, #6]
a.ldrb(A::x0, A::x1, 3); // ldrb w0, [x1, #3]
a.strs(A::x0, A::x1, 3); // str w0, [x1, #12]
},{
0x20,0x0c,0x40,0xf9,
0x20,0x0c,0x40,0xb9,
0x20,0x0c,0x40,0x79,
0x20,0x0c,0x40,0x39,
0x20,0x0c,0x00,0xb9,
});
test_asm(r, [&](A& a) {
a.tbl (A::v0, A::v1, A::v2);
a.uzp14s(A::v0, A::v1, A::v2);
a.uzp24s(A::v0, A::v1, A::v2);
a.zip14s(A::v0, A::v1, A::v2);
a.zip24s(A::v0, A::v1, A::v2);
},{
0x20,0x00,0x02,0x4e,
0x20,0x18,0x82,0x4e,
0x20,0x58,0x82,0x4e,
0x20,0x38,0x82,0x4e,
0x20,0x78,0x82,0x4e,
});
}
DEF_TEST(SkVM_approx_math, r) {
auto eval = [](int N, float values[], auto fn) {
skvm::Builder b;
skvm::Ptr inout = b.varying<float>();
b.storeF(inout, fn(&b, b.loadF(inout)));
b.done().eval(N, values);
};
auto compare = [r](int N, const float values[], const float expected[]) {
for (int i = 0; i < N; ++i) {
REPORTER_ASSERT(r, (values[i] == expected[i]) ||
SkScalarNearlyEqual(values[i], expected[i], 0.001f),
"evaluated to %g, but expected %g", values[i], expected[i]);
}
};
// log2
{
float values[] = {0.25f, 0.5f, 1, 2, 4, 8};
constexpr int N = SK_ARRAY_COUNT(values);
eval(N, values, [](skvm::Builder* b, skvm::F32 v) {
return b->approx_log2(v);
});
const float expected[] = {-2, -1, 0, 1, 2, 3};
compare(N, values, expected);
}
// pow2
{
float values[] = {-80, -5, -2, -1, 0, 1, 2, 3, 5, 160};
constexpr int N = SK_ARRAY_COUNT(values);
eval(N, values, [](skvm::Builder* b, skvm::F32 v) {
return b->approx_pow2(v);
});
const float expected[] = {0, 0.03125f, 0.25f, 0.5f, 1, 2, 4, 8, 32, INFINITY};
compare(N, values, expected);
}
// powf -- 1^x
{
float exps[] = {-2, -1, 0, 1, 2};
constexpr int N = SK_ARRAY_COUNT(exps);
eval(N, exps, [](skvm::Builder* b, skvm::F32 exp) {
return b->approx_powf(b->splat(1.0f), exp);
});
const float expected[] = {1, 1, 1, 1, 1};
compare(N, exps, expected);
}
// powf -- 2^x
{
float exps[] = {-80, -5, -2, -1, 0, 1, 2, 3, 5, 160};
constexpr int N = SK_ARRAY_COUNT(exps);
eval(N, exps, [](skvm::Builder* b, skvm::F32 exp) {
return b->approx_powf(2.0, exp);
});
const float expected[] = {0, 0.03125f, 0.25f, 0.5f, 1, 2, 4, 8, 32, INFINITY};
compare(N, exps, expected);
}
// powf -- 3^x
{
float exps[] = {-2, -1, 0, 1, 2};
constexpr int N = SK_ARRAY_COUNT(exps);
eval(N, exps, [](skvm::Builder* b, skvm::F32 exp) {
return b->approx_powf(b->splat(3.0f), exp);
});
const float expected[] = {1/9.0f, 1/3.0f, 1, 3, 9};
compare(N, exps, expected);
}
// powf -- x^0.5
{
float bases[] = {0, 1, 4, 9, 16};
constexpr int N = SK_ARRAY_COUNT(bases);
eval(N, bases, [](skvm::Builder* b, skvm::F32 base) {
return b->approx_powf(base, b->splat(0.5f));
});
const float expected[] = {0, 1, 2, 3, 4};
compare(N, bases, expected);
}
// powf -- x^1
{
float bases[] = {0, 1, 2, 3, 4};
constexpr int N = SK_ARRAY_COUNT(bases);
eval(N, bases, [](skvm::Builder* b, skvm::F32 base) {
return b->approx_powf(base, b->splat(1.0f));
});
const float expected[] = {0, 1, 2, 3, 4};
compare(N, bases, expected);
}
// powf -- x^2
{
float bases[] = {0, 1, 2, 3, 4};
constexpr int N = SK_ARRAY_COUNT(bases);
eval(N, bases, [](skvm::Builder* b, skvm::F32 base) {
return b->approx_powf(base, b->splat(2.0f));
});
const float expected[] = {0, 1, 4, 9, 16};
compare(N, bases, expected);
}
auto test = [r](float arg, float expected, float tolerance, auto prog) {
skvm::Builder b;
skvm::Ptr inout = b.varying<float>();
b.storeF(inout, prog(b.loadF(inout)));
float actual = arg;
b.done().eval(1, &actual);
float err = std::abs(actual - expected);
if (err > tolerance) {
// SkDebugf("arg %g, expected %g, actual %g\n", arg, expected, actual);
REPORTER_ASSERT(r, true);
}
return err;
};
auto test2 = [r](float arg0, float arg1, float expected, float tolerance, auto prog) {
skvm::Builder b;
skvm::Ptr in0 = b.varying<float>();
skvm::Ptr in1 = b.varying<float>();
skvm::Ptr out = b.varying<float>();
b.storeF(out, prog(b.loadF(in0), b.loadF(in1)));
float actual;
b.done().eval(1, &arg0, &arg1, &actual);
float err = std::abs(actual - expected);
if (err > tolerance) {
// SkDebugf("[%g, %g]: expected %g, actual %g\n", arg0, arg1, expected, actual);
REPORTER_ASSERT(r, true);
}
return err;
};
// sine, cosine, tangent
{
constexpr float P = SK_ScalarPI;
constexpr float tol = 0.00175f;
for (float rad = -5*P; rad <= 5*P; rad += 0.1f) {
test(rad, sk_float_sin(rad), tol, [](skvm::F32 x) {
return approx_sin(x);
});
test(rad, sk_float_cos(rad), tol, [](skvm::F32 x) {
return approx_cos(x);
});
}
// Our tangent diverge more as we get near infinities (x near +- Pi/2),
// so bring in the domain a little.
constexpr float eps = 0.16f;
float err = 0;
for (float rad = -P/2 + eps; rad <= P/2 - eps; rad += 0.01f) {
err += test(rad, sk_float_tan(rad), tol, [](skvm::F32 x) {
return approx_tan(x);
});
// try again with some multiples of P, to check our periodicity
test(rad, sk_float_tan(rad), tol, [=](skvm::F32 x) {
return approx_tan(x + 3*P);
});
test(rad, sk_float_tan(rad), tol, [=](skvm::F32 x) {
return approx_tan(x - 3*P);
});
}
if ((false)) { SkDebugf("tan error %g\n", err); }
}
// asin, acos, atan
{
constexpr float tol = 0.00175f;
float err = 0;
for (float x = -1; x <= 1; x += 1.0f/64) {
err += test(x, asin(x), tol, [](skvm::F32 x) {
return approx_asin(x);
});
test(x, acos(x), tol, [](skvm::F32 x) {
return approx_acos(x);
});
}
if ((false)) { SkDebugf("asin error %g\n", err); }
err = 0;
for (float x = -10; x <= 10; x += 1.0f/16) {
err += test(x, atan(x), tol, [](skvm::F32 x) {
return approx_atan(x);
});
}
if ((false)) { SkDebugf("atan error %g\n", err); }
for (float y = -3; y <= 3; y += 1) {
for (float x = -3; x <= 3; x += 1) {
err += test2(y, x, atan2(y,x), tol, [](skvm::F32 y, skvm::F32 x) {
return approx_atan2(y,x);
});
}
}
if ((false)) { SkDebugf("atan2 error %g\n", err); }
}
}
DEF_TEST(SkVM_min_max, r) {
// min() and max() have subtle behavior when one argument is NaN and
// the other isn't. It's not sound to blindly swap their arguments.
//
// All backends must behave like std::min() and std::max(), which are
//
// min(x,y) = y<x ? y : x
// max(x,y) = x<y ? y : x
// ±NaN, ±0, ±1, ±inf
const uint32_t bits[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
float f[8];
memcpy(f, bits, sizeof(bits));
auto identical = [&](float x, float y) {
uint32_t X,Y;
memcpy(&X, &x, 4);
memcpy(&Y, &y, 4);
return X == Y;
};
// Test min/max with non-constant x, non-constant y.
// (Whether x and y are varying or uniform shouldn't make any difference.)
{
skvm::Builder b;
{
skvm::Ptr src = b.varying<float>(),
mn = b.varying<float>(),
mx = b.varying<float>();
skvm::F32 x = b.loadF(src),
y = b.uniformF(b.uniform(), 0);
b.storeF(mn, b.min(x,y));
b.storeF(mx, b.max(x,y));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program){
float mn[8], mx[8];
for (int i = 0; i < 8; i++) {
// min() and max() everything with f[i].
program.eval(8, f,mn,mx, &f[i]);
for (int j = 0; j < 8; j++) {
REPORTER_ASSERT(r, identical(mn[j], std::min(f[j], f[i])));
REPORTER_ASSERT(r, identical(mx[j], std::max(f[j], f[i])));
}
}
});
}
// Test each with constant on the right.
for (int i = 0; i < 8; i++) {
skvm::Builder b;
{
skvm::Ptr src = b.varying<float>(),
mn = b.varying<float>(),
mx = b.varying<float>();
skvm::F32 x = b.loadF(src),
y = b.splat(f[i]);
b.storeF(mn, b.min(x,y));
b.storeF(mx, b.max(x,y));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program){
float mn[8], mx[8];
program.eval(8, f,mn,mx);
for (int j = 0; j < 8; j++) {
REPORTER_ASSERT(r, identical(mn[j], std::min(f[j], f[i])));
REPORTER_ASSERT(r, identical(mx[j], std::max(f[j], f[i])));
}
});
}
// Test each with constant on the left.
for (int i = 0; i < 8; i++) {
skvm::Builder b;
{
skvm::Ptr src = b.varying<float>(),
mn = b.varying<float>(),
mx = b.varying<float>();
skvm::F32 x = b.splat(f[i]),
y = b.loadF(src);
b.storeF(mn, b.min(x,y));
b.storeF(mx, b.max(x,y));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program){
float mn[8], mx[8];
program.eval(8, f,mn,mx);
for (int j = 0; j < 8; j++) {
REPORTER_ASSERT(r, identical(mn[j], std::min(f[i], f[j])));
REPORTER_ASSERT(r, identical(mx[j], std::max(f[i], f[j])));
}
});
}
}
DEF_TEST(SkVM_halfs, r) {
const uint16_t hs[] = {0x0000,0x3800,0x3c00,0x4000,
0xc400,0xb800,0xbc00,0xc000};
const float fs[] = {+0.0f,+0.5f,+1.0f,+2.0f,
-4.0f,-0.5f,-1.0f,-2.0f};
{
skvm::Builder b;
skvm::Ptr src = b.varying<uint16_t>(),
dst = b.varying<float>();
b.storeF(dst, b.from_fp16(b.load16(src)));
test_jit_and_interpreter(b, [&](const skvm::Program& program){
float dst[8];
program.eval(8, hs, dst);
for (int i = 0; i < 8; i++) {
REPORTER_ASSERT(r, dst[i] == fs[i]);
}
});
}
{
skvm::Builder b;
skvm::Ptr src = b.varying<float>(),
dst = b.varying<uint16_t>();
b.store16(dst, b.to_fp16(b.loadF(src)));
test_jit_and_interpreter(b, [&](const skvm::Program& program){
uint16_t dst[8];
program.eval(8, fs, dst);
for (int i = 0; i < 8; i++) {
REPORTER_ASSERT(r, dst[i] == hs[i]);
}
});
}
}
DEF_TEST(SkVM_64bit, r) {
uint32_t lo[65],
hi[65];
uint64_t wide[65];
for (int i = 0; i < 65; i++) {
lo[i] = 2*i+0;
hi[i] = 2*i+1;
wide[i] = ((uint64_t)lo[i] << 0)
| ((uint64_t)hi[i] << 32);
}
{
skvm::Builder b;
{
skvm::Ptr widePtr = b.varying<uint64_t>(),
loPtr = b.varying<int>(),
hiPtr = b.varying<int>();
b.store32(loPtr, b.load64(widePtr, 0));
b.store32(hiPtr, b.load64(widePtr, 1));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program){
uint32_t l[65], h[65];
program.eval(65, wide,l,h);
for (int i = 0; i < 65; i++) {
REPORTER_ASSERT(r, l[i] == lo[i]);
REPORTER_ASSERT(r, h[i] == hi[i]);
}
});
}
{
skvm::Builder b;
{
skvm::Ptr widePtr = b.varying<uint64_t>(),
loPtr = b.varying<int>(),
hiPtr = b.varying<int>();
b.store64(widePtr, b.load32(loPtr), b.load32(hiPtr));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program){
uint64_t w[65];
program.eval(65, w,lo,hi);
for (int i = 0; i < 65; i++) {
REPORTER_ASSERT(r, w[i] == wide[i]);
}
});
}
}
DEF_TEST(SkVM_128bit, r) {
float floats[4*63];
uint8_t packed[4*63];
for (int i = 0; i < 4*63; i++) {
floats[i] = i * (1/255.0f);
}
skvm::PixelFormat rgba_ffff = skvm::SkColorType_to_PixelFormat(kRGBA_F32_SkColorType),
rgba_8888 = skvm::SkColorType_to_PixelFormat(kRGBA_8888_SkColorType);
{ // Convert RGBA F32 to RGBA 8888, testing 128-bit loads.
skvm::Builder b;
{
2021-08-03 20:43:14 +00:00
skvm::Ptr dst = b.varying(4),
src = b.varying(16);
skvm::Color c = b.load(rgba_ffff, src);
b.store(rgba_8888, dst, c);
}
test_jit_and_interpreter(b, [&](const skvm::Program& program){
memset(packed, 0, sizeof(packed));
program.eval(63, packed, floats);
for (int i = 0; i < 4*63; i++) {
REPORTER_ASSERT(r, packed[i] == i);
}
});
}
{ // Convert RGBA 8888 to RGBA F32, testing 128-bit stores.
skvm::Builder b;
{
2021-08-03 20:43:14 +00:00
skvm::Ptr dst = b.varying(16),
src = b.varying(4);
skvm::Color c = b.load(rgba_8888, src);
b.store(rgba_ffff, dst, c);
}
test_jit_and_interpreter(b, [&](const skvm::Program& program){
memset(floats, 0, sizeof(floats));
program.eval(63, floats, packed);
for (int i = 0; i < 4*63; i++) {
REPORTER_ASSERT(r, floats[i] == i * (1/255.0f));
}
});
}
}
DEF_TEST(SkVM_is_NaN_is_finite, r) {
skvm::Builder b;
{
skvm::Ptr src = b.varying<float>(),
nan = b.varying<int>(),
fin = b.varying<int>();
b.store32(nan, is_NaN (b.loadF(src)));
b.store32(fin, is_finite(b.loadF(src)));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program){
// ±NaN, ±0, ±1, ±inf
const uint32_t bits[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
uint32_t nan[8], fin[8];
program.eval(8, bits, nan,fin);
for (int i = 0; i < 8; i++) {
REPORTER_ASSERT(r, nan[i] == ((i == 0 || i == 1) ? 0xffffffff : 0));
REPORTER_ASSERT(r, fin[i] == ((i == 2 || i == 3 ||
i == 4 || i == 5) ? 0xffffffff : 0));
}
});
}
DEF_TEST(SkVM_args, r) {
// Test we can handle at least six arguments.
skvm::Builder b;
{
skvm::Ptr dst = b.varying<float>(),
A = b.varying<float>(),
B = b.varying<float>(),
C = b.varying<float>(),
D = b.varying<float>(),
E = b.varying<float>();
storeF(dst, b.loadF(A)
+ b.loadF(B)
+ b.loadF(C)
+ b.loadF(D)
+ b.loadF(E));
}
test_jit_and_interpreter(b, [&](const skvm::Program& program){
float dst[17],A[17],B[17],C[17],D[17],E[17];
for (int i = 0; i < 17; i++) {
A[i] = B[i] = C[i] = D[i] = E[i] = (float)i;
}
program.eval(17, dst,A,B,C,D,E);
for (int i = 0; i < 17; i++) {
REPORTER_ASSERT(r, dst[i] == 5.0f*i);
}
});
}
DEF_TEST(SkVM_badpack, reporter) {
remove Op::pack pack(x,y,bits) as an alias for x|(y<<bits) only existed originally to implement it with the SLI arm64 instruction, but I've since realized that was misguided. I had thought the assumption on pack ("(x & (y << bits)) == 0"), i.e. "no overlap between x and the shifted y", was enough to make using SLI legal, but it's actually not strong enough a requirement. The SLI docs say "...inserts the result into the corresponding vector element in the destination SIMD&FP register such that the new zero bits created by the shift are not inserted but retain their existing value." The key thing not mentioned there happens with zero bits _not_ created by the shift, the ones already present at the top of y. They're of course inserted, overwriting any previous values. This means SLI (and so pack()) become strictly order dependent in a way I had never intended. This will work as you'd think, skvm::I32 px = splat(0); px = pack(px, r, 0); px = pack(px, a, 24); but this version swapping the two calls to pack() will overwrite alpha, skvm::I32 px = splat(0); px = pack(px, a, 24); px = pack(px, r, 0); I find that error-prone, so I've removed Op::pack and replaced it with a simple expansion to x|(y<<bits). That of course works in either order. This new test can't JIT at head, but if we implement the other missing instructions (soon, dependent CL) it would start failing when JIT'd. The interpreter and x86 were both fine, since they're both doing what's now the only approach to pack(), the simple x|(y<<bits). I've left assembler support for SLI in case we want to try it again. Change-Id: Iaf879309d3e1d0a458a688f3a62556e55ab05e23 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/337197 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-11-20 21:34:16 +00:00
// Test case distilled from actual failing draw,
// originally with a bad arm64 implementation of pack().
skvm::Builder p;
{
2021-08-03 20:43:14 +00:00
skvm::UPtr uniforms = p.uniform();
skvm::Ptr dst = p.varying<uint16_t>();
remove Op::pack pack(x,y,bits) as an alias for x|(y<<bits) only existed originally to implement it with the SLI arm64 instruction, but I've since realized that was misguided. I had thought the assumption on pack ("(x & (y << bits)) == 0"), i.e. "no overlap between x and the shifted y", was enough to make using SLI legal, but it's actually not strong enough a requirement. The SLI docs say "...inserts the result into the corresponding vector element in the destination SIMD&FP register such that the new zero bits created by the shift are not inserted but retain their existing value." The key thing not mentioned there happens with zero bits _not_ created by the shift, the ones already present at the top of y. They're of course inserted, overwriting any previous values. This means SLI (and so pack()) become strictly order dependent in a way I had never intended. This will work as you'd think, skvm::I32 px = splat(0); px = pack(px, r, 0); px = pack(px, a, 24); but this version swapping the two calls to pack() will overwrite alpha, skvm::I32 px = splat(0); px = pack(px, a, 24); px = pack(px, r, 0); I find that error-prone, so I've removed Op::pack and replaced it with a simple expansion to x|(y<<bits). That of course works in either order. This new test can't JIT at head, but if we implement the other missing instructions (soon, dependent CL) it would start failing when JIT'd. The interpreter and x86 were both fine, since they're both doing what's now the only approach to pack(), the simple x|(y<<bits). I've left assembler support for SLI in case we want to try it again. Change-Id: Iaf879309d3e1d0a458a688f3a62556e55ab05e23 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/337197 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-11-20 21:34:16 +00:00
skvm::I32 r = round(p.uniformF(uniforms, 8) * 15),
remove Op::pack pack(x,y,bits) as an alias for x|(y<<bits) only existed originally to implement it with the SLI arm64 instruction, but I've since realized that was misguided. I had thought the assumption on pack ("(x & (y << bits)) == 0"), i.e. "no overlap between x and the shifted y", was enough to make using SLI legal, but it's actually not strong enough a requirement. The SLI docs say "...inserts the result into the corresponding vector element in the destination SIMD&FP register such that the new zero bits created by the shift are not inserted but retain their existing value." The key thing not mentioned there happens with zero bits _not_ created by the shift, the ones already present at the top of y. They're of course inserted, overwriting any previous values. This means SLI (and so pack()) become strictly order dependent in a way I had never intended. This will work as you'd think, skvm::I32 px = splat(0); px = pack(px, r, 0); px = pack(px, a, 24); but this version swapping the two calls to pack() will overwrite alpha, skvm::I32 px = splat(0); px = pack(px, a, 24); px = pack(px, r, 0); I find that error-prone, so I've removed Op::pack and replaced it with a simple expansion to x|(y<<bits). That of course works in either order. This new test can't JIT at head, but if we implement the other missing instructions (soon, dependent CL) it would start failing when JIT'd. The interpreter and x86 were both fine, since they're both doing what's now the only approach to pack(), the simple x|(y<<bits). I've left assembler support for SLI in case we want to try it again. Change-Id: Iaf879309d3e1d0a458a688f3a62556e55ab05e23 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/337197 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-11-20 21:34:16 +00:00
a = p.splat(0xf);
skvm::I32 _4444 = p.splat(0);
_4444 = pack(_4444, r, 12);
_4444 = pack(_4444, a, 0);
store16(dst, _4444);
}
test_jit_and_interpreter(p, [&](const skvm::Program& program){
remove Op::pack pack(x,y,bits) as an alias for x|(y<<bits) only existed originally to implement it with the SLI arm64 instruction, but I've since realized that was misguided. I had thought the assumption on pack ("(x & (y << bits)) == 0"), i.e. "no overlap between x and the shifted y", was enough to make using SLI legal, but it's actually not strong enough a requirement. The SLI docs say "...inserts the result into the corresponding vector element in the destination SIMD&FP register such that the new zero bits created by the shift are not inserted but retain their existing value." The key thing not mentioned there happens with zero bits _not_ created by the shift, the ones already present at the top of y. They're of course inserted, overwriting any previous values. This means SLI (and so pack()) become strictly order dependent in a way I had never intended. This will work as you'd think, skvm::I32 px = splat(0); px = pack(px, r, 0); px = pack(px, a, 24); but this version swapping the two calls to pack() will overwrite alpha, skvm::I32 px = splat(0); px = pack(px, a, 24); px = pack(px, r, 0); I find that error-prone, so I've removed Op::pack and replaced it with a simple expansion to x|(y<<bits). That of course works in either order. This new test can't JIT at head, but if we implement the other missing instructions (soon, dependent CL) it would start failing when JIT'd. The interpreter and x86 were both fine, since they're both doing what's now the only approach to pack(), the simple x|(y<<bits). I've left assembler support for SLI in case we want to try it again. Change-Id: Iaf879309d3e1d0a458a688f3a62556e55ab05e23 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/337197 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-11-20 21:34:16 +00:00
const float uniforms[] = { 0.0f, 0.0f,
1.0f, 0.0f, 0.0f, 1.0f };
uint16_t dst[17] = {0};
program.eval(17, uniforms,dst);
for (int i = 0; i < 17; i++) {
REPORTER_ASSERT(reporter, dst[i] == 0xf00f, "got %04x, want %04x\n", dst[i], 0xf00f);
remove Op::pack pack(x,y,bits) as an alias for x|(y<<bits) only existed originally to implement it with the SLI arm64 instruction, but I've since realized that was misguided. I had thought the assumption on pack ("(x & (y << bits)) == 0"), i.e. "no overlap between x and the shifted y", was enough to make using SLI legal, but it's actually not strong enough a requirement. The SLI docs say "...inserts the result into the corresponding vector element in the destination SIMD&FP register such that the new zero bits created by the shift are not inserted but retain their existing value." The key thing not mentioned there happens with zero bits _not_ created by the shift, the ones already present at the top of y. They're of course inserted, overwriting any previous values. This means SLI (and so pack()) become strictly order dependent in a way I had never intended. This will work as you'd think, skvm::I32 px = splat(0); px = pack(px, r, 0); px = pack(px, a, 24); but this version swapping the two calls to pack() will overwrite alpha, skvm::I32 px = splat(0); px = pack(px, a, 24); px = pack(px, r, 0); I find that error-prone, so I've removed Op::pack and replaced it with a simple expansion to x|(y<<bits). That of course works in either order. This new test can't JIT at head, but if we implement the other missing instructions (soon, dependent CL) it would start failing when JIT'd. The interpreter and x86 were both fine, since they're both doing what's now the only approach to pack(), the simple x|(y<<bits). I've left assembler support for SLI in case we want to try it again. Change-Id: Iaf879309d3e1d0a458a688f3a62556e55ab05e23 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/337197 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-11-20 21:34:16 +00:00
}
});
}
DEF_TEST(SkVM_features, r) {
auto build_program = [](skvm::Builder* b) {
skvm::F32 x = b->loadF(b->varying<float>());
b->storeF(b->varying<float>(), x*x+x);
};
{ // load-fma-store with FMA available.
skvm::Features features;
features.fma = true;
skvm::Builder b(features);
build_program(&b);
REPORTER_ASSERT(r, b.optimize().size() == 3);
}
{ // load-mul-add-store without FMA.
skvm::Features features;
features.fma = false;
skvm::Builder b(features);
build_program(&b);
REPORTER_ASSERT(r, b.optimize().size() == 4);
}
{ // Auto-detected, could be either.
skvm::Builder b;
build_program(&b);
REPORTER_ASSERT(r, b.optimize().size() == 3
|| b.optimize().size() == 4);
}
}
DEF_TEST(SkVM_gather_can_hoist, r) {
// A gather instruction isn't necessarily varying... it's whatever its index is.
// First a typical gather scenario with varying index.
{
skvm::Builder b;
2021-08-03 20:43:14 +00:00
skvm::UPtr uniforms = b.uniform();
skvm::Ptr buf = b.varying<int>();
skvm::I32 ix = b.load32(buf);
b.store32(buf, b.gather32(uniforms,0, ix));
skvm::Program p = b.done();
// ix is varying, so the gather is too.
//
// loop:
// v0 = load32 buf
// v1 = gather32 uniforms+0 v0
// store32 buf v1
REPORTER_ASSERT(r, p.instructions().size() == 3);
REPORTER_ASSERT(r, p.loop() == 0);
}
// Now the same but with a uniform index instead.
{
skvm::Builder b;
2021-08-03 20:43:14 +00:00
skvm::UPtr uniforms = b.uniform();
skvm::Ptr buf = b.varying<int>();
skvm::I32 ix = b.uniform32(uniforms,8);
b.store32(buf, b.gather32(uniforms,0, ix));
skvm::Program p = b.done();
// ix is uniform, so the gather is too.
//
// v0 = uniform32 uniforms+8
// v1 = gather32 uniforms+0 v0
// loop:
// store32 buf v1
REPORTER_ASSERT(r, p.instructions().size() == 3);
REPORTER_ASSERT(r, p.loop() == 2);
}
}
don't dedup loads or stores We've been assuming that all Ops with the same arguments produce the same value and deduplicating them, which results in a simple common subexpression eliminator. But we can't soundly dedup two identical loads with a store between; that store could change the memory those loads read, producing different values, as demonstrated by the first new unit test. Then, by similar reasoning, it may first seem fine to deduplicate stores, e.g. store32 arg(0), v1 store32 arg(0), v1 That second store certainly does look redundant. But if we slot a different store between, it's no longer redundant: store32 arg(0), v1 store32 arg(0), v2 store32 arg(0), v1 If we dedup those two v1 stores, we'll skip the second and be left with v2 in our buffer instead of v1. This is the second new unit test. Now, uniform32 and gather ops also touch memory... are they safe to dedup? Surprisingly, yes! Uniforms are easy: they're read-only. No way to store to uniforms, so no intervening store can invalidate them. Gathers are a little fuzzier, in that the buffer we gather from is uniform in practice, but not strictly required to be... it's not impossible to construct a program that gathers from a buffer that the program also stores to, but you'd have to go out of your way to do it, and it's not a pattern we use today, and SkVM does not provide the synchronization primitives you'd need to make attempting that even vaguely sensible. So gathers in practice can also be deduplicated. In general it's safe to dedup an operation unless it touches _varying memory_, i.e. loads and stores. uniform32 and gathers touch non-varying memory, so they're safe, and while index is varying, it doesn't touch memory. Change-Id: Ia275f0ab2708d3f71e783164b419436b90f103a9 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/350608 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Brian Osman <brianosman@google.com>
2021-01-06 16:57:19 +00:00
DEF_TEST(SkVM_dont_dedup_loads, r) {
// We've been assuming that all Ops with the same arguments produce the same value
// and deduplicating them, which results in a simple common subexpression eliminator.
//
// But we can't soundly dedup two identical loads with a store between.
// If we dedup the loads in this test program it will always increment by 1, not K.
constexpr int K = 2;
skvm::Builder b;
{
skvm::Ptr buf = b.varying<int>();
don't dedup loads or stores We've been assuming that all Ops with the same arguments produce the same value and deduplicating them, which results in a simple common subexpression eliminator. But we can't soundly dedup two identical loads with a store between; that store could change the memory those loads read, producing different values, as demonstrated by the first new unit test. Then, by similar reasoning, it may first seem fine to deduplicate stores, e.g. store32 arg(0), v1 store32 arg(0), v1 That second store certainly does look redundant. But if we slot a different store between, it's no longer redundant: store32 arg(0), v1 store32 arg(0), v2 store32 arg(0), v1 If we dedup those two v1 stores, we'll skip the second and be left with v2 in our buffer instead of v1. This is the second new unit test. Now, uniform32 and gather ops also touch memory... are they safe to dedup? Surprisingly, yes! Uniforms are easy: they're read-only. No way to store to uniforms, so no intervening store can invalidate them. Gathers are a little fuzzier, in that the buffer we gather from is uniform in practice, but not strictly required to be... it's not impossible to construct a program that gathers from a buffer that the program also stores to, but you'd have to go out of your way to do it, and it's not a pattern we use today, and SkVM does not provide the synchronization primitives you'd need to make attempting that even vaguely sensible. So gathers in practice can also be deduplicated. In general it's safe to dedup an operation unless it touches _varying memory_, i.e. loads and stores. uniform32 and gathers touch non-varying memory, so they're safe, and while index is varying, it doesn't touch memory. Change-Id: Ia275f0ab2708d3f71e783164b419436b90f103a9 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/350608 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Brian Osman <brianosman@google.com>
2021-01-06 16:57:19 +00:00
for (int i = 0; i < K; i++) {
b.store32(buf, b.load32(buf) + 1);
}
}
test_jit_and_interpreter(b, [&](const skvm::Program& program){
don't dedup loads or stores We've been assuming that all Ops with the same arguments produce the same value and deduplicating them, which results in a simple common subexpression eliminator. But we can't soundly dedup two identical loads with a store between; that store could change the memory those loads read, producing different values, as demonstrated by the first new unit test. Then, by similar reasoning, it may first seem fine to deduplicate stores, e.g. store32 arg(0), v1 store32 arg(0), v1 That second store certainly does look redundant. But if we slot a different store between, it's no longer redundant: store32 arg(0), v1 store32 arg(0), v2 store32 arg(0), v1 If we dedup those two v1 stores, we'll skip the second and be left with v2 in our buffer instead of v1. This is the second new unit test. Now, uniform32 and gather ops also touch memory... are they safe to dedup? Surprisingly, yes! Uniforms are easy: they're read-only. No way to store to uniforms, so no intervening store can invalidate them. Gathers are a little fuzzier, in that the buffer we gather from is uniform in practice, but not strictly required to be... it's not impossible to construct a program that gathers from a buffer that the program also stores to, but you'd have to go out of your way to do it, and it's not a pattern we use today, and SkVM does not provide the synchronization primitives you'd need to make attempting that even vaguely sensible. So gathers in practice can also be deduplicated. In general it's safe to dedup an operation unless it touches _varying memory_, i.e. loads and stores. uniform32 and gathers touch non-varying memory, so they're safe, and while index is varying, it doesn't touch memory. Change-Id: Ia275f0ab2708d3f71e783164b419436b90f103a9 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/350608 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Brian Osman <brianosman@google.com>
2021-01-06 16:57:19 +00:00
int buf[] = { 0,1,2,3,4 };
program.eval(SK_ARRAY_COUNT(buf), buf);
for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
REPORTER_ASSERT(r, buf[i] == i+K);
}
});
}
DEF_TEST(SkVM_dont_dedup_stores, r) {
// Following a similar line of reasoning to SkVM_dont_dedup_loads,
// we cannot dedup stores either. A different store between two identical stores
// will invalidate the first store, meaning we do need to reissue that store operation.
skvm::Builder b;
{
skvm::Ptr buf = b.varying<int>();
don't dedup loads or stores We've been assuming that all Ops with the same arguments produce the same value and deduplicating them, which results in a simple common subexpression eliminator. But we can't soundly dedup two identical loads with a store between; that store could change the memory those loads read, producing different values, as demonstrated by the first new unit test. Then, by similar reasoning, it may first seem fine to deduplicate stores, e.g. store32 arg(0), v1 store32 arg(0), v1 That second store certainly does look redundant. But if we slot a different store between, it's no longer redundant: store32 arg(0), v1 store32 arg(0), v2 store32 arg(0), v1 If we dedup those two v1 stores, we'll skip the second and be left with v2 in our buffer instead of v1. This is the second new unit test. Now, uniform32 and gather ops also touch memory... are they safe to dedup? Surprisingly, yes! Uniforms are easy: they're read-only. No way to store to uniforms, so no intervening store can invalidate them. Gathers are a little fuzzier, in that the buffer we gather from is uniform in practice, but not strictly required to be... it's not impossible to construct a program that gathers from a buffer that the program also stores to, but you'd have to go out of your way to do it, and it's not a pattern we use today, and SkVM does not provide the synchronization primitives you'd need to make attempting that even vaguely sensible. So gathers in practice can also be deduplicated. In general it's safe to dedup an operation unless it touches _varying memory_, i.e. loads and stores. uniform32 and gathers touch non-varying memory, so they're safe, and while index is varying, it doesn't touch memory. Change-Id: Ia275f0ab2708d3f71e783164b419436b90f103a9 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/350608 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Brian Osman <brianosman@google.com>
2021-01-06 16:57:19 +00:00
b.store32(buf, b.splat(4));
b.store32(buf, b.splat(5));
b.store32(buf, b.splat(4)); // If we dedup'd, we'd skip this store.
}
test_jit_and_interpreter(b, [&](const skvm::Program& program){
don't dedup loads or stores We've been assuming that all Ops with the same arguments produce the same value and deduplicating them, which results in a simple common subexpression eliminator. But we can't soundly dedup two identical loads with a store between; that store could change the memory those loads read, producing different values, as demonstrated by the first new unit test. Then, by similar reasoning, it may first seem fine to deduplicate stores, e.g. store32 arg(0), v1 store32 arg(0), v1 That second store certainly does look redundant. But if we slot a different store between, it's no longer redundant: store32 arg(0), v1 store32 arg(0), v2 store32 arg(0), v1 If we dedup those two v1 stores, we'll skip the second and be left with v2 in our buffer instead of v1. This is the second new unit test. Now, uniform32 and gather ops also touch memory... are they safe to dedup? Surprisingly, yes! Uniforms are easy: they're read-only. No way to store to uniforms, so no intervening store can invalidate them. Gathers are a little fuzzier, in that the buffer we gather from is uniform in practice, but not strictly required to be... it's not impossible to construct a program that gathers from a buffer that the program also stores to, but you'd have to go out of your way to do it, and it's not a pattern we use today, and SkVM does not provide the synchronization primitives you'd need to make attempting that even vaguely sensible. So gathers in practice can also be deduplicated. In general it's safe to dedup an operation unless it touches _varying memory_, i.e. loads and stores. uniform32 and gathers touch non-varying memory, so they're safe, and while index is varying, it doesn't touch memory. Change-Id: Ia275f0ab2708d3f71e783164b419436b90f103a9 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/350608 Commit-Queue: Mike Klein <mtklein@google.com> Reviewed-by: Brian Osman <brianosman@google.com>
2021-01-06 16:57:19 +00:00
int buf[42];
program.eval(SK_ARRAY_COUNT(buf), buf);
for (int x : buf) {
REPORTER_ASSERT(r, x == 4);
}
});
}
DEF_TEST(SkVM_fast_mul, r) {
skvm::Builder b;
{
skvm::Ptr src = b.varying<float>(),
fast = b.varying<float>(),
slow = b.varying<float>();
skvm::F32 x = b.loadF(src);
b.storeF(fast, fast_mul(0.0f, x));
b.storeF(slow, 0.0f * x);
}
test_jit_and_interpreter(b, [&](const skvm::Program& program){
const uint32_t bits[] = {
0x0000'0000, 0x8000'0000, //±0
0x3f80'0000, 0xbf80'0000, //±1
0x7f80'0000, 0xff80'0000, //±inf
0x7f80'0001, 0xff80'0001, //±NaN
};
float fast[8],
slow[8];
program.eval(8,bits,fast,slow);
for (int i = 0; i < 8; i++) {
REPORTER_ASSERT(r, fast[i] == 0.0f);
if (i < 4) {
REPORTER_ASSERT(r, slow[i] == 0.0f);
} else {
REPORTER_ASSERT(r, isnan(slow[i]));
}
}
});
}
DEF_TEST(SkVM_duplicates, reporter) {
{
skvm::Builder p(true);
auto rptr = p.varying<int>();
skvm::F32 r = p.loadF(rptr),
g = p.splat(0.0f),
b = p.splat(0.0f),
a = p.splat(1.0f);
p.unpremul(&r, &g, &b, a);
p.storeF(rptr, r);
std::vector<skvm::Instruction> program = b->program();
auto withDuplicates = skvm::finalize(program);
int duplicates = 0;
for (const auto& instr : withDuplicates) {
if (instr.op == skvm::Op::duplicate) {
++duplicates;
}
}
REPORTER_ASSERT(reporter, duplicates > 0);
auto eliminatedAsDeadCode = skvm::eliminate_dead_code(program);
for (const auto& instr : eliminatedAsDeadCode) {
REPORTER_ASSERT(reporter, instr.op != skvm::Op::duplicate);
}
}
{
skvm::Builder p(false);
auto rptr = p.varying<int>();
skvm::F32 r = p.loadF(rptr),
g = p.splat(0.0f),
b = p.splat(0.0f),
a = p.splat(1.0f);
p.unpremul(&r, &g, &b, a);
p.storeF(rptr, r);
auto withoutDuplicates = p.done().instructions();
for (const auto& instr : withoutDuplicates) {
REPORTER_ASSERT(reporter, instr.op != skvm::Op::duplicate);
}
}
}
DEF_TEST(SkVM_Visualizer, r) {
const char* src =
"int main(int x, int y) {\n"
" int a = 99;\n"
" if (x > 0) a += 100;\n"
" if (y > 0) a += 101;\n"
" a = 102;\n"
" return a;\n"
"}";
GrShaderCaps caps;
SkSL::Compiler compiler(&caps);
SkSL::Program::Settings settings;
auto program = compiler.convertProgram(SkSL::ProgramKind::kGeneric,
std::string(src), settings);
const SkSL::FunctionDefinition* main = SkSL::Program_GetFunction(*program, "main");
SkSL::SkVMDebugTrace d;
d.setSource(src);
auto v = std::make_unique<skvm::viz::Visualizer>(&d);
skvm::Builder b(skvm::Features{}, /*createDuplicates=*/true);
SkSL::ProgramToSkVM(*program, *main, &b, &d, /*uniforms=*/{});
skvm::Program p = b.done(nullptr, true, std::move(v));
#if defined(SKVM_JIT)
SkDynamicMemoryWStream asmFile;
p.disassemble(&asmFile);
auto dumpData = asmFile.detachAsData();
std::string dumpString((const char*)dumpData->data(), dumpData->size());
#else
std::string dumpString;
#endif
SkDynamicMemoryWStream vizFile;
p.visualizer()->dump(&vizFile, dumpString.c_str());
auto vizData = vizFile.detachAsData();
std::string html((const char*)vizData->data(), vizData->size());
//b.dump();
//std::printf(html.c_str());
// Check that html contains all types of information:
if (!dumpString.empty() && !std::strstr(dumpString.c_str(), "Program not JIT'd.")) {
REPORTER_ASSERT(r, std::strstr(html.c_str(), "<tr class='machine'>")); // machine commands
}
REPORTER_ASSERT(r, std::strstr(html.c_str(), "<tr class='normal'>")); // SkVM byte code
REPORTER_ASSERT(r, std::strstr(html.c_str(), "<tr class='source'>")); // C++ source
REPORTER_ASSERT(r, std::strstr(html.c_str(), "<tr class='dead'>")); // dead code
REPORTER_ASSERT(r, std::strstr(html.c_str(), "<tr class='dead deduped'>")); // deduped removed
REPORTER_ASSERT(r, std::strstr(html.c_str(), // deduped origins
"<tr class='normal origin'>"
"<td>&#8593;&#8593;&#8593; *13</td>"
"<td>v2 = splat 0 (0)</td></tr>"));
REPORTER_ASSERT(r, std::strstr(html.c_str(), // trace enter
"<tr class='source'><td class='mask'>&#8618;v9</td>"
"<td colspan=2>int main(int x, int y)</td></tr>"));
REPORTER_ASSERT(r, std::strstr(html.c_str(), // trace exit
"<tr class='source'><td class='mask'>&#8617;v9</td>"
"<td colspan=2>int main(int x, int y)</td></tr>"));
}