diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp index fa61ae07d8..65b1e01287 100644 --- a/src/core/SkVM.cpp +++ b/src/core/SkVM.cpp @@ -1999,6 +1999,14 @@ namespace skvm { this->byte(imm); } + void Assembler::vextracti128(Operand dst, Ymm src, int imm) { + this->op(0x66,0x3a0f,0x39, src,dst); + this->byte(imm); + } + void Assembler::vpextrd(Operand dst, Xmm src, int imm) { + this->op(0x66,0x3a0f,0x16, src,dst); + this->byte(imm); + } void Assembler::vpextrw(Operand dst, Xmm src, int imm) { this->op(0x66,0x3a0f,0x15, src,dst); this->byte(imm); @@ -3010,12 +3018,6 @@ namespace skvm { // // Now let's actually assemble the instruction! switch (op) { - default: - if (debug_dump()) { - SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op); - } - return false; // TODO: many new ops - #if defined(__x86_64__) case Op::assert_true: { a->vptest (r[x], &constants[0xffffffff].label); @@ -3060,10 +3062,76 @@ namespace skvm { else { a->vmovups( dst(), A::Mem{arg[immy]}); } break; + case Op::gather8: { + A::GP64 base = scratch, + index = scratch2; + + // As usual, the gather base pointer is immz bytes off of uniform immy. + a->mov(base, A::Mem{arg[immy], immz}); + + // We'll need two distinct temporary vector registers: + // - tmp() to hold our indices; + // - accum to hold our partial gathered result. + a->vmovdqa(tmp(), r[x]); + + // accum can be any register, even dst(), as long as it's not the same as tmp(). + A::Xmm accum; + if (dst() != tmp()) { + accum = (A::Xmm)dst(); + } else if (int found = __builtin_ffs(avail & ~(1<vextracti128((A::Xmm)tmp(), tmp(), 1); + } + a->vpextrd(index, (A::Xmm)tmp(), i%4); + a->vpinsrb(accum, accum, A::Mem{base,0,index,A::ONE}, i); + } + a->vpmovzxbd(dst(), accum); + } break; + + case Op::gather16: { + // Just as gather8 except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd. + A::GP64 base = scratch, + index = scratch2; + + a->mov(base, A::Mem{arg[immy], immz}); + + a->vmovdqa(tmp(), r[x]); + + A::Xmm accum; + if (dst() != tmp()) { + accum = (A::Xmm)dst(); + } else if (int found = __builtin_ffs(avail & ~(1<vextracti128((A::Xmm)tmp(), tmp(), 1); + } + a->vpextrd(index, (A::Xmm)tmp(), i%4); + a->vpinsrw(accum, accum, A::Mem{base,0,index,A::TWO}, i); + } + a->vpmovzxwd(dst(), accum); + } break; + case Op::gather32: if (scalar) { - auto base = scratch, - index = scratch2; + A::GP64 base = scratch, + index = scratch2; // Our gather base pointer is immz bytes off of uniform immy. a->mov(base, A::Mem{arg[immy], immz}); @@ -3100,7 +3168,7 @@ namespace skvm { } // Our gather base pointer is immz bytes off of uniform immy. - auto base = scratch; + A::GP64 base = scratch; a->mov(base, A::Mem{arg[immy], immz}); a->vpcmpeqd(mask, mask, mask); // (All lanes enabled.) a->vgatherdps(dst(), A::FOUR, index, base, mask); @@ -3207,6 +3275,12 @@ namespace skvm { case Op::round : a->vcvtps2dq (dst(), r[x]); break; #elif defined(__aarch64__) + default: + if (debug_dump()) { + SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op); + } + return false; // TODO: many new ops + case Op::assert_true: { a->uminv4s(tmp(), r[x]); // uminv acts like an all() across the vector. a->fmovs(scratch, tmp()); diff --git a/src/core/SkVM.h b/src/core/SkVM.h index 7a4bd642c1..0fcb59107e 100644 --- a/src/core/SkVM.h +++ b/src/core/SkVM.h @@ -187,8 +187,10 @@ namespace skvm { void vpinsrw(Xmm dst, Xmm src, Operand y, int imm); // dst = src; dst[imm] = y, 16-bit void vpinsrb(Xmm dst, Xmm src, Operand y, int imm); // dst = src; dst[imm] = y, 8-bit - void vpextrw(Operand dst, Xmm src, int imm); // dst = src[imm] , 16-bit - void vpextrb(Operand dst, Xmm src, int imm); // dst = src[imm] , 8-bit + void vextracti128(Operand dst, Ymm src, int imm); // dst = src[imm], 128-bit + void vpextrd (Operand dst, Xmm src, int imm); // dst = src[imm], 32-bit + void vpextrw (Operand dst, Xmm src, int imm); // dst = src[imm], 16-bit + void vpextrb (Operand dst, Xmm src, int imm); // dst = src[imm], 8-bit // if (mask & 0x8000'0000) { // dst = base[scale*ix]; diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp index 1361ced862..b08301c22e 100644 --- a/tests/SkVMTest.cpp +++ b/tests/SkVMTest.cpp @@ -37,7 +37,7 @@ static void test_jit_and_interpreter(skvm::Program&& program, Fn&& test) { #if defined(SKVM_LLVM) SkASSERT(program.hasJIT()); #elif defined(SKVM_JIT) && defined(SK_CPU_X86) // soon! - // SkASSERT(program.hasJIT()); + SkASSERT(program.hasJIT()); #elif defined(SKVM_JIT) // eventually! // SkASSERT(program.hasJIT()); #else @@ -439,6 +439,55 @@ DEF_TEST(SkVM_gathers, r) { }); } +DEF_TEST(SkVM_gathers2, r) { + skvm::Builder b; + { + skvm::Arg uniforms = b.uniform(), + buf32 = b.varying(), + buf16 = b.varying(), + buf8 = b.varying(); + + skvm::I32 x = b.load32(buf32); + + b.store32(buf32, b.gather32(uniforms,0, x)); + b.store16(buf16, b.gather16(uniforms,0, x)); + b.store8 (buf8 , b.gather8 (uniforms,0, x)); + } + + test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) { + uint8_t img[256]; + for (int i = 0; i < 256; i++) { + img[i] = i; + } + + int buf32[64]; + uint16_t buf16[64]; + uint8_t buf8 [64]; + + for (int i = 0; i < 64; i++) { + buf32[i] = (i*47)&63; + buf16[i] = 0; + buf8 [i] = 0; + } + + struct Uniforms { + const uint8_t* img; + } uniforms{img}; + + program.eval(64, &uniforms, buf32, buf16, buf8); + + for (int i = 0; i < 64; i++) { + REPORTER_ASSERT(r, buf8[i] == ((i*47)&63)); // 0,47,30,13,60,... + } + + REPORTER_ASSERT(r, buf16[ 0] == 0x0100); + REPORTER_ASSERT(r, buf16[63] == 0x2322); + + REPORTER_ASSERT(r, buf32[ 0] == 0x03020100); + REPORTER_ASSERT(r, buf32[63] == 0x47464544); + }); +} + DEF_TEST(SkVM_bitops, r) { skvm::Builder b; { @@ -1418,6 +1467,12 @@ DEF_TEST(SkVM_Assembler, r) { a.vpinsrb(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4); // vpinsrb $4, (%rsi), %xmm8, %xmm1 a.vpinsrb(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12); // vpinsrb $4, (%rsi), %xmm8, %xmm1 + a.vextracti128(A::xmm1, A::ymm8, 1); // vextracti128 $1, %ymm8, %xmm1 + a.vextracti128(A::xmm8, A::ymm1, 0); // vextracti128 $0, %ymm1, %xmm8 + + a.vpextrd(A::Mem{A::rsi}, A::xmm8, 3); // vpextrd $3, %xmm8, (%rsi) + a.vpextrd(A::Mem{A::r8 }, A::xmm1, 2); // vpextrd $2, %xmm1, (%r8) + a.vpextrw(A::Mem{A::rsi}, A::xmm8, 7); a.vpextrw(A::Mem{A::r8 }, A::xmm1, 15); @@ -1430,6 +1485,12 @@ DEF_TEST(SkVM_Assembler, r) { 0xc4,0xe3,0x39, 0x20, 0x0e, 4, 0xc4,0x43,0x71, 0x20, 0x00, 12, + 0xc4,0x63,0x7d,0x39,0xc1, 1, + 0xc4,0xc3,0x7d,0x39,0xc8, 0, + + 0xc4,0x63,0x79,0x16,0x06, 3, + 0xc4,0xc3,0x79,0x16,0x08, 2, + 0xc4,0x63,0x79, 0x15, 0x06, 7, 0xc4,0xc3,0x79, 0x15, 0x08, 15,