convert to phi nodes

Convert our n+args stack homes to phi nodes,
essentially performing mem2reg ourselves,
eliminating the need for it at runtime.

Also, use b.getInt64(k) to create integer constants.

Also, print verifyModule() errors to stdout (instead of nowhere).

Also, update unit test to make sure we don't run off the end.

Bitcode still looks good:
    define void @skvm-jit-211960346(i64, i8*) {
    enter:
      br label %testK

    testK:                                            ; preds = %loopK, %enter
      %2 = phi i64 [ %0, %enter ], [ %6, %loopK ]
      %3 = phi i8* [ %1, %enter ], [ %7, %loopK ]
      %4 = icmp uge i64 %2, 16
      br i1 %4, label %loopK, label %test1

    loopK:                                            ; preds = %testK
      %5 = bitcast i8* %3 to <16 x i32>*
      store <16 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>, <16 x i32>* %5, align 1
      %6 = sub i64 %2, 16
      %7 = getelementptr i8, i8* %3, i64 64
      br label %testK

    test1:                                            ; preds = %loop1, %testK
      %8 = phi i64 [ %2, %testK ], [ %12, %loop1 ]
      %9 = phi i8* [ %3, %testK ], [ %13, %loop1 ]
      %10 = icmp uge i64 %8, 1
      br i1 %10, label %loop1, label %leave

    loop1:                                            ; preds = %test1
      %11 = bitcast i8* %9 to i32*
      store i32 42, i32* %11, align 1
      %12 = sub i64 %8, 1
      %13 = getelementptr i8, i8* %9, i64 4
      br label %test1

    leave:                                            ; preds = %test1
      ret void
    }

and the final assembly looks the same:

    0x10a3f5000: movabsq $0x10a3f6000, %rax        ; imm = 0x10A3F6000
    0x10a3f500a: vbroadcastss (%rax), %zmm0
    0x10a3f5010: cmpq   $0xf, %rdi
    0x10a3f5014: jbe    0x10a3f504d
    0x10a3f5016: nopw   %cs:(%rax,%rax)
    0x10a3f5020: vmovups %zmm0, (%rsi)
    0x10a3f5026: addq   $-0x10, %rdi
    0x10a3f502a: addq   $0x40, %rsi
    0x10a3f502e: cmpq   $0xf, %rdi
    0x10a3f5032: ja     0x10a3f5020
    0x10a3f5034: jmp    0x10a3f504d
    0x10a3f5036: nopw   %cs:(%rax,%rax)
    0x10a3f5040: movl   $0x2a, (%rsi)
    0x10a3f5046: decq   %rdi
    0x10a3f5049: addq   $0x4, %rsi
    0x10a3f504d: testq  %rdi, %rdi
    0x10a3f5050: jne    0x10a3f5040
    0x10a3f5052: vzeroupper
    0x10a3f5055: retq

Change-Id: I12d11c7d5786c4c3df28a49bb3044be10f0770e0
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/273753
Reviewed-by: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
Mike Klein 2020-02-27 10:07:53 -06:00 committed by Skia Commit-Bot
parent fb3f302bdf
commit 7b3999edcb
2 changed files with 56 additions and 47 deletions

View File

@ -20,11 +20,8 @@
#include <llvm/Bitcode/BitcodeWriter.h>
#include <llvm/ExecutionEngine/ExecutionEngine.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/IR/Verifier.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Transforms/IPO/PassManagerBuilder.h>
#include <llvm/Transforms/Utils.h>
#endif
bool gSkVMJITViaDylib{false};
@ -1941,8 +1938,9 @@ namespace skvm {
using IRBuilder = llvm::IRBuilder<>;
llvm::Value* n;
std::vector<llvm::Value*> args;
// `n` won't be used in emit, but `args` will be and they're clearest kept together.
llvm::PHINode* n;
std::vector<llvm::PHINode*> args;
std::vector<llvm::Value*> vals(instructions.size());
auto emit = [&](size_t i, bool scalar, IRBuilder* b) {
@ -1957,7 +1955,7 @@ namespace skvm {
if (scalar) {
v = b->CreateExtractElement(v, (uint64_t)0);
}
llvm::Value* ptr = b->CreateBitCast(b->CreateLoad(args[immy]),
llvm::Value* ptr = b->CreateBitCast(args[immy],
v->getType()->getPointerTo());
vals[i] = b->CreateAlignedStore(v, ptr, 1);
} break;
@ -1972,28 +1970,28 @@ namespace skvm {
return true;
};
// enter: set up stack homes `n` and `args` for loop counter and uniform/varying pointers.
// TODO: manual PHI nodes for these instead of relying on load/store and mem2reg
// We can't jump to the first basic block or this would be testK directly.
{
IRBuilder b(enter);
llvm::Argument* arg = fn->arg_begin();
n = b.CreateAlloca(arg->getType());
b.CreateStore(arg++, n);
for (size_t i = 0; i < fStrides.size(); i++) {
args.push_back(b.CreateAlloca(arg->getType()));
b.CreateStore(arg++, args.back());
}
b.CreateBr(testK);
}
// testK: if (N >= K) goto loopK; else goto test1;
llvm::ConstantInt* i64_K = llvm::ConstantInt::get(i64, K);
{
IRBuilder b(testK);
b.CreateCondBr(b.CreateICmpUGE(b.CreateLoad(n), i64_K), loopK, test1);
// Set up phi nodes for `n` and each pointer argument from enter; later we'll add loopK.
llvm::Argument* arg = fn->arg_begin();
n = b.CreatePHI(arg->getType(), 2);
n->addIncoming(arg++, enter);
for (size_t i = 0; i < fStrides.size(); i++) {
args.push_back(b.CreatePHI(arg->getType(), 2));
args.back()->addIncoming(arg++, enter);
}
b.CreateCondBr(b.CreateICmpUGE(n, b.getInt64(K)), loopK, test1);
}
// loopK: ... insts on K x T vectors; N -= K, args += K*stride; goto testK;
@ -2004,19 +2002,36 @@ namespace skvm {
return;
}
}
b.CreateStore(b.CreateSub(b.CreateLoad(n), i64_K), n);
// n -= K
llvm::Value* n_next = b.CreateSub(n, b.getInt64(K));
n->addIncoming(n_next, loopK);
// Each arg ptr += K*stride
for (size_t i = 0; i < fStrides.size(); i++) {
b.CreateStore(b.CreateGEP(b.CreateLoad(args[i]),
llvm::ConstantInt::get(i64, K * fStrides[i])), args[i]);
llvm::Value* arg_next = b.CreateGEP(args[i], b.getInt64(K*fStrides[i]));
args[i]->addIncoming(arg_next, loopK);
}
b.CreateBr(testK);
}
// test1: if (N >= 1) goto loop1; else goto leave;
llvm::ConstantInt* i64_1 = llvm::ConstantInt::get(i64, 1);
{
IRBuilder b(test1);
b.CreateCondBr(b.CreateICmpUGE(b.CreateLoad(n), i64_1), loop1, leave);
// Set up new phi nodes for `n` and each pointer argument, now from testK and loop1.
llvm::PHINode* n_new = b.CreatePHI(n->getType(), 2);
n_new->addIncoming(n, testK);
n = n_new;
for (size_t i = 0; i < fStrides.size(); i++) {
llvm::PHINode* arg_new = b.CreatePHI(args[i]->getType(), 2);
arg_new->addIncoming(args[i], testK);
args[i] = arg_new;
}
b.CreateCondBr(b.CreateICmpUGE(n, b.getInt64(1)), loop1, leave);
}
// loop1: ... insts on scalars; N -= 1, args += stride; goto test1;
@ -2027,10 +2042,15 @@ namespace skvm {
return;
}
}
b.CreateStore(b.CreateSub(b.CreateLoad(n), i64_1), n);
// n -= 1
llvm::Value* n_next = b.CreateSub(n, b.getInt64(1));
n->addIncoming(n_next, loop1);
// Each arg ptr += stride
for (size_t i = 0; i < fStrides.size(); i++) {
b.CreateStore(b.CreateGEP(b.CreateLoad(args[i]),
llvm::ConstantInt::get(i64, fStrides[i])), args[i]);
llvm::Value* arg_next = b.CreateGEP(args[i], b.getInt64(fStrides[i]));
args[i]->addIncoming(arg_next, loop1);
}
b.CreateBr(test1);
}
@ -2041,21 +2061,7 @@ namespace skvm {
b.CreateRetVoid();
}
SkASSERT(false == llvm::verifyModule(*mod));
llvm::legacy::FunctionPassManager fpm(mod.get());
#if 0
llvm::PassManagerBuilder pmb;
pmb. OptLevel = 1;
pmb.SizeLevel = 1;
// TargetMachine::adjustPassManager(pmb)
pmb.populateFunctionPassManager(fpm);
while (fpm.run(*fn));
#else
fpm.add(llvm::createPromoteMemoryToRegisterPass());
fpm.run(*fn);
SkASSERT(!fpm.run(*fn));
#endif
SkASSERT(false == llvm::verifyModule(*mod, &llvm::outs()));
if (false) {
SkString path = SkStringPrintf("/tmp/%s.bc", debug_name);

View File

@ -288,11 +288,14 @@ DEF_TEST(SkVM_LLVM, r) {
skvm::Program p = b.done();
REPORTER_ASSERT(r, p.hasJIT());
int buf[17];
p.eval(SK_ARRAY_COUNT(buf), buf);
for (int v : buf) {
REPORTER_ASSERT(r, v == 42);
int buf[18];
buf[17] = 47;
p.eval(17, buf);
for (int i = 0; i < 17; i++) {
REPORTER_ASSERT(r, buf[i] == 42);
}
REPORTER_ASSERT(r, buf[17] == 47);
}
#endif