implement some useful 16-bit instructions

Add a slew of 16-bit instructions for experiments.

I want to try a fixed-point path through SkVMBlitter, continuing to
represent geometry with F32, but color channels in 16 bits, with several
possible representations:

    - unorm8 lowp like SkRasterPipeline (0 -> 0.0,  0x00ff ->  1.0)
    - 15-bit SkFixed15 fixed-point      (0 -> 0.0,  0x8000 ->  1.0)
    - 14-bit signed fixed-point         (0 -> 0.0, ±0x4000 -> ±1.0)

I'm leaning towards the 14-bit version because it can hold a good range
of temporary values in [-2,2), or perhaps a 13-bit analog for a little
more safety margin.  Mostly this is something new to try.

Most of these instructions are pretty obvious, with notes on a few:

    vpavgw is an unsigned (x+y+1)>>1, and is useful for converting
    unorm8 up to Q14.  There are a few ways to do this pretty well;
    using vpavgw is both exact and uses the fewest instructions:

         A) (x << 6) + ( x    >> 2) + (x == 255)   // Ok approx.
         B) (x << 6) + ((x+1) >> 2)                // Better approx.
         C) vpavgw(x << 7, x >> 1)                 // Perfect math!

    The best reverse math I've found is (x >> 6) - (x > 16319).

    vpmulhrsw is the key to the whole thing as usual, letting us do
    16x16->16-bit multiplies.  An SkFixed15 multiply is vpmulhrsw
    followed by vpabsw (also added here), and a Q14 multiply is
    vpmulhrsw followed by a simple <<1.

    I've added both signed and unsigned min and max.  Not entirely
    sure they'll all be used, but I do have my eye on vpminuw as a
    single-instruction clamp to [0,0x4000] ~~> [0.0,1.0], treating
    any negative Q14 as very large unsigned.

Change-Id: I0db7f3f943ef6c9a600821444cc5b003fe5f675d
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/317119
Commit-Queue: Herb Derby <herb@google.com>
Auto-Submit: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
This commit is contained in:
Mike Klein 2020-09-15 07:57:27 -05:00 committed by Skia Commit-Bot
parent 272e8bcd24
commit 84dd8f9912
3 changed files with 70 additions and 4 deletions

View File

@ -2016,8 +2016,18 @@ namespace skvm {
// Packed-integer subtract / multiply / add emitters.  The mnemonic suffix names the lane
// width (d = 32-bit, w = 16-bit).  Passing 0x380f instead of 0x0f as the second argument
// selects the 0F 38 opcode map rather than the plain 0F map.
// NOTE(review): vpsubw and vpmullw each appear twice below -- presumably the before/after
// lines of a diff hunk; confirm only one definition of each remains in the real file.
void Assembler::vpsubd (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfa, dst,x,y); }
void Assembler::vpmulld(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x40, dst,x,y); }
void Assembler::vpsubw (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xf9, dst,x,y); }
void Assembler::vpmullw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xd5, dst,x,y); }
void Assembler::vpaddw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfd, dst,x,y); }
void Assembler::vpsubw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xf9, dst,x,y); }
void Assembler::vpmullw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xd5, dst,x,y); }
// 16-bit-lane emitters intended for fixed-point color math.
//   vpavgw:          unsigned rounding average, (x+y+1)>>1.
//   vpmulhrsw:       signed rounding high multiply, (x*y + (1<<14)) >> 15.
//   vpminsw/vpmaxsw: signed min/max;  vpminuw/vpmaxuw: unsigned min/max.
//   vpabsw:          absolute value; it is unary, so there is no second source operand.
// The unsigned min/max, vpmulhrsw and vpabsw live in the 0F 38 opcode map (0x380f).
void Assembler::vpavgw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xe3, dst,x,y); }
void Assembler::vpmulhrsw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x0b, dst,x,y); }
void Assembler::vpminsw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xea, dst,x,y); }
void Assembler::vpmaxsw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xee, dst,x,y); }
void Assembler::vpminuw (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3a, dst,x,y); }
void Assembler::vpmaxuw (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3e, dst,x,y); }
void Assembler::vpabsw(Ymm dst, Operand x) { this->op(0x66,0x380f,0x1d, dst,x); }
// Bitwise AND and OR emitters (opcodes 66 0F DB and 66 0F EB respectively).
void Assembler::vpand (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
void Assembler::vpor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
@ -2050,7 +2060,9 @@ namespace skvm {
void Assembler::vpunpckhdq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x6a, dst,x,y); }
// Lane-wise integer compares: "eq" tests equality, "gt" is a signed greater-than.
// The suffix names the lane width (d = 32-bit, w = 16-bit).
void Assembler::vpcmpeqd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x76, dst,x,y); }
void Assembler::vpcmpeqw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x75, dst,x,y); }
void Assembler::vpcmpgtd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x66, dst,x,y); }
void Assembler::vpcmpgtw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x65, dst,x,y); }
void Assembler::imm_byte_after_operand(const Operand& operand, int imm) {
@ -2089,10 +2101,18 @@ namespace skvm {
this->op(0x66,0x0f,0x72,(Ymm)4, dst,x);
this->byte(imm);
}
// Emits VPSLLW dst, x, imm: shift each 16-bit lane of x left by imm, writing dst.
// The shift-by-immediate forms share opcode 66 0F 71; (Ymm)6 is not a register but the
// /6 opcode extension that selects "shift left logical".
void Assembler::vpsllw(Ymm dst, Ymm x, int imm) {
this->op(0x66,0x0f,0x71,(Ymm)6, dst,x);
// The shift count is emitted as a byte right after the instruction.
this->byte(imm);
}
// Emits VPSRLW dst, x, imm: logical (zero-filling) right shift of each 16-bit lane of x.
// Same 66 0F 71 opcode as the other 16-bit immediate shifts; (Ymm)2 is the /2 opcode
// extension, not a register.
void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) {
this->op(0x66,0x0f,0x71,(Ymm)2, dst,x);
// The shift count is emitted as a byte right after the instruction.
this->byte(imm);
}
// Emits VPSRAW dst, x, imm: arithmetic (sign-extending) right shift of each 16-bit lane of x.
// Same 66 0F 71 opcode as the other 16-bit immediate shifts; (Ymm)4 is the /4 opcode
// extension, not a register.
void Assembler::vpsraw(Ymm dst, Ymm x, int imm) {
this->op(0x66,0x0f,0x71,(Ymm)4, dst,x);
// The shift count is emitted as a byte right after the instruction.
this->byte(imm);
}
void Assembler::vpermq(Ymm dst, Operand x, int imm) {
// A bit unusual among the instructions we use, this is 64-bit operation, so we set W.

View File

@ -132,8 +132,17 @@ namespace skvm {
// Packed-integer arithmetic; the suffix names the lane width (d = 32-bit, w = 16-bit).
void vpsubd (Ymm dst, Ymm x, Operand y);
void vpmulld(Ymm dst, Ymm x, Operand y);
void vpsubw (Ymm dst, Ymm x, Operand y);
void vpmullw(Ymm dst, Ymm x, Operand y);
void vpaddw (Ymm dst, Ymm x, Operand y);
void vpsubw (Ymm dst, Ymm x, Operand y);
void vpmullw (Ymm dst, Ymm x, Operand y);
void vpabsw (Ymm dst, Operand x); // dst = |x|, signed 16-bit lanes.
void vpavgw (Ymm dst, Ymm x, Operand y); // dst = (x+y+1)>>1, unsigned.
void vpmulhrsw(Ymm dst, Ymm x, Operand y); // dst = (x*y + (1<<14)) >> 15, signed.
void vpminsw (Ymm dst, Ymm x, Operand y); // dst = min(x,y), signed.
void vpminuw (Ymm dst, Ymm x, Operand y); // dst = min(x,y), unsigned.
void vpmaxsw (Ymm dst, Ymm x, Operand y); // dst = max(x,y), signed.
void vpmaxuw (Ymm dst, Ymm x, Operand y); // dst = max(x,y), unsigned.
void vaddps(Ymm dst, Ymm x, Operand y);
void vsubps(Ymm dst, Ymm x, Operand y);
@ -164,6 +173,8 @@ namespace skvm {
// Integer compares: eq = equal, gt = signed greater-than; d = 32-bit lanes, w = 16-bit lanes.
void vpcmpeqd(Ymm dst, Ymm x, Operand y);
void vpcmpgtd(Ymm dst, Ymm x, Operand y);
void vpcmpeqw(Ymm dst, Ymm x, Operand y);
void vpcmpgtw(Ymm dst, Ymm x, Operand y);
// Float compare; imm selects the comparison predicate.
void vcmpps (Ymm dst, Ymm x, Operand y, int imm);
// Convenience wrapper: vcmpps with predicate 0.
void vcmpeqps (Ymm dst, Ymm x, Operand y) { this->vcmpps(dst,x,y,0); }
@ -175,7 +186,10 @@ namespace skvm {
// Shifts by an immediate count: sll = left, srl = logical right, sra = arithmetic right.
// The suffix names the lane width (d = 32-bit, w = 16-bit).
void vpslld(Ymm dst, Ymm x, int imm);
void vpsrld(Ymm dst, Ymm x, int imm);
void vpsrad(Ymm dst, Ymm x, int imm);
void vpsllw(Ymm dst, Ymm x, int imm);
void vpsrlw(Ymm dst, Ymm x, int imm);
void vpsraw(Ymm dst, Ymm x, int imm);
void vpermq (Ymm dst, Operand x, int imm);
void vperm2f128(Ymm dst, Ymm x, Operand y, int imm);

View File

@ -1180,6 +1180,38 @@ DEF_TEST(SkVM_Assembler, r) {
0xc5, 0xf5, 0xfa, 0xc2,
});
// Encoding checks for the 16-bit-lane instructions, all using dst=ymm4, x=ymm3, y=ymm2.
// Instructions in the plain 0F map use the 2-byte VEX prefix (0xc5); those in the 0F 38 map
// (vpminuw, vpmaxuw, vpmulhrsw, vpabsw) need the 3-byte form (0xc4,0xe2).
// The immediate shifts put dst in VEX.vvvv and the opcode extension in ModRM.reg, so only the
// ModRM byte differs between them: 0xf3 (/6, sll), 0xd3 (/2, srl), 0xe3 (/4, sra).
test_asm(r, [&](A& a) {
    a.vpaddw   (A::ymm4, A::ymm3, A::ymm2);
    a.vpavgw   (A::ymm4, A::ymm3, A::ymm2);
    a.vpcmpeqw (A::ymm4, A::ymm3, A::ymm2);
    a.vpcmpgtw (A::ymm4, A::ymm3, A::ymm2);
    a.vpminsw  (A::ymm4, A::ymm3, A::ymm2);
    a.vpmaxsw  (A::ymm4, A::ymm3, A::ymm2);
    a.vpminuw  (A::ymm4, A::ymm3, A::ymm2);
    a.vpmaxuw  (A::ymm4, A::ymm3, A::ymm2);
    a.vpmulhrsw(A::ymm4, A::ymm3, A::ymm2);
    a.vpabsw   (A::ymm4, A::ymm3);
    a.vpsllw   (A::ymm4, A::ymm3, 12);
    a.vpsrlw   (A::ymm4, A::ymm3, 12);  // Previously untested.
    a.vpsraw   (A::ymm4, A::ymm3, 12);
},{
    0xc5, 0xe5, 0xfd, 0xe2,
    0xc5, 0xe5, 0xe3, 0xe2,
    0xc5, 0xe5, 0x75, 0xe2,
    0xc5, 0xe5, 0x65, 0xe2,
    0xc5, 0xe5, 0xea, 0xe2,
    0xc5, 0xe5, 0xee, 0xe2,
    0xc4,0xe2,0x65, 0x3a, 0xe2,
    0xc4,0xe2,0x65, 0x3e, 0xe2,
    0xc4,0xe2,0x65, 0x0b, 0xe2,
    0xc4,0xe2,0x7d, 0x1d, 0xe3,
    0xc5,0xdd,0x71, 0xf3, 0x0c,
    0xc5,0xdd,0x71, 0xd3, 0x0c,
    0xc5,0xdd,0x71, 0xe3, 0x0c,
});
test_asm(r, [&](A& a) {
A::Label l;
a.vcmpeqps (A::ymm0, A::ymm1, &l); // vcmpeqps 0x1c(%rip), %ymm1, %ymm0