bit_and and gather

This is enough to run the SkVM_gather32 test.

SSE codegen looks good: the gather is unrolled, as you'd expect.  Oddly,
HSW (Haswell) is also unrolled like SSE rather than using vpgatherdd as
we'd like.  TODO: get HSW to use vpgatherdd.

SKX (Skylake-X, AVX-512) looks great, using a single vpgatherdd in the vector loop:

(lldb) dis -s fJITEntry -c 40
    0x10a6e8000: movabsq $0x10a6f6000, %rax        ; imm = 0x10A6F6000
    0x10a6e800a: cmpq   $0xf, %rdi
    0x10a6e800e: jbe    0x10a6e8054
    0x10a6e8010: vmovdqu64 (%rdx), %zmm0
    0x10a6e8016: vpandd (%rax){1to16}, %zmm0, %zmm0
    0x10a6e801c: movq   (%rsi), %rcx
    0x10a6e801f: kxnorw %k0, %k0, %k1
    0x10a6e8023: vpgatherdd (%rcx,%zmm0,4), %zmm1 {%k1}
    0x10a6e802a: vmovdqu64 %zmm1, (%rdx)
    0x10a6e8030: addq   $-0x10, %rdi
    0x10a6e8034: addq   $0x40, %rdx
    0x10a6e8038: cmpq   $0xf, %rdi
    0x10a6e803c: ja     0x10a6e8010
    0x10a6e803e: jmp    0x10a6e8054
    0x10a6e8040: movl   (%rdx), %eax
    0x10a6e8042: movq   (%rsi), %rcx
    0x10a6e8045: andl   $0x7, %eax
    0x10a6e8048: movl   (%rcx,%rax,4), %eax
    0x10a6e804b: movl   %eax, (%rdx)
    0x10a6e804d: decq   %rdi
    0x10a6e8050: addq   $0x4, %rdx
    0x10a6e8054: testq  %rdi, %rdi
    0x10a6e8057: jne    0x10a6e8040
    0x10a6e8059: vzeroupper
    0x10a6e805c: retq

Change-Id: If84bff6954b46aee5a4fd862c2899b6bf97fb164
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/273854
Commit-Queue: Mike Klein <mtklein@google.com>
Commit-Queue: Herb Derby <herb@google.com>
Reviewed-by: Herb Derby <herb@google.com>
This commit is contained in:
Mike Klein 2020-02-27 14:38:43 -06:00
parent 6c59fe4110
commit 102488f038

View File

@@ -1969,7 +1969,23 @@ namespace skvm {
vals[i] = b->CreateAlignedStore(vals[x], ptr, 1);
} break;
case Op::bit_and:
vals[i] = b->CreateAnd(vals[x], vals[y]);
break;
case Op::gather32: {
// Our gather base pointer is stored in uniform immy, immz bytes in; load it from there.
llvm::Value* base =
b->CreateLoad(b->CreateBitCast(b->CreateGEP(args[immy], b->getInt32(immz)),
i32->getPointerTo()->getPointerTo()));
llvm::Value* ptr = b->CreateGEP(base, vals[x]);
if (scalar) {
vals[i] = b->CreateAlignedLoad(ptr, 1);
} else {
vals[i] = b->CreateMaskedGather(ptr, 1);
}
} break;
// Ops below this line shouldn't need to consider `scalar`... they're Just Math.