remove Op::pack

pack(x,y,bits) as an alias for x|(y<<bits) only existed originally to
implement it with the SLI arm64 instruction, but I've since realized
that was misguided.

I had thought the assumption on pack ("(x & (y << bits)) == 0"), i.e.
"no overlap between x and the shifted y", was enough to make using SLI
legal, but it's actually not a strong enough requirement.

The SLI docs say "...inserts the result into the corresponding vector
element in the destination SIMD&FP register such that the new zero bits
created by the shift are not inserted but retain their existing value."
The key thing not mentioned there is what happens to the zero bits _not_
created by the shift, the ones already present at the top of y.  They're
of course inserted, overwriting any previous values.

This means SLI (and so pack()) becomes strictly order-dependent in a way
I had never intended.  This will work as you'd think,

    skvm::I32 px = splat(0);
    px = pack(px, r,  0);
    px = pack(px, a, 24);

but this version swapping the two calls to pack() will overwrite alpha,

    skvm::I32 px = splat(0);
    px = pack(px, a, 24);
    px = pack(px, r,  0);

I find that error-prone, so I've removed Op::pack and replaced it
with a simple expansion to x|(y<<bits).  That of course works in either
order.

This new test can't JIT at head, but once we implement the other missing
instructions (soon, in a dependent CL) it would have started failing when
JIT'd with the old SLI-based pack().
The interpreter and x86 were both fine, since they're both doing what's
now the only approach to pack(), the simple x|(y<<bits).

I've left assembler support for SLI in case we want to try it again.

Change-Id: Iaf879309d3e1d0a458a688f3a62556e55ab05e23
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/337197
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
Mike Klein 2020-11-20 15:34:16 -06:00 committed by Skia Commit-Bot
parent d90024d498
commit ee40ec6dd6
5 changed files with 328 additions and 286 deletions

View File

@ -77,7 +77,7 @@ loop:
18 store8 arg(1) r7 18 store8 arg(1) r7
A8 over RGBA_8888 A8 over RGBA_8888
39 values (originally 41): 42 values (originally 44):
↑ v0 = splat 437F0000 (255) ↑ v0 = splat 437F0000 (255)
↑ v1 = splat 3B808081 (0.0039215689) ↑ v1 = splat 3B808081 (0.0039215689)
v2 = load8 arg(0) v2 = load8 arg(0)
@ -92,33 +92,36 @@ A8 over RGBA_8888
v11 = fma_f32 v10 v6 v4 v11 = fma_f32 v10 v6 v4
v12 = mul_f32 v11 v0 v12 = mul_f32 v11 v0
v13 = round v12 v13 = round v12
v14 = shr_i32 v7 16 v14 = shl_i32 v13 8
↑ v15 = splat FF (3.5733111e-43) v15 = shr_i32 v7 16
v16 = bit_and v15 v14 ↑ v16 = splat FF (3.5733111e-43)
v17 = to_f32 v16 v17 = bit_and v16 v15
v18 = mul_f32 v17 v1 v18 = to_f32 v17
v19 = mul_f32 v18 v6 v19 = mul_f32 v18 v1
v20 = mul_f32 v19 v0 v20 = mul_f32 v19 v6
v21 = round v20 v21 = mul_f32 v20 v0
v22 = pack v21 v13 8 v22 = round v21
v23 = shr_i32 v7 8 v23 = bit_or v22 v14
v24 = bit_and v15 v23 v24 = shl_i32 v23 16
v25 = to_f32 v24 v25 = shr_i32 v7 8
v26 = mul_f32 v25 v1 v26 = bit_and v16 v25
v27 = mul_f32 v26 v6 v27 = to_f32 v26
v28 = mul_f32 v27 v0 v28 = mul_f32 v27 v1
v29 = round v28 v29 = mul_f32 v28 v6
v30 = bit_and v15 v7 v30 = mul_f32 v29 v0
v31 = to_f32 v30 v31 = round v30
v32 = mul_f32 v31 v1 v32 = shl_i32 v31 8
v33 = mul_f32 v32 v6 v33 = bit_and v16 v7
v34 = mul_f32 v33 v0 v34 = to_f32 v33
v35 = round v34 v35 = mul_f32 v34 v1
v36 = pack v35 v29 8 v36 = mul_f32 v35 v6
v37 = pack v36 v22 16 v37 = mul_f32 v36 v0
store32 arg(1) v37 v38 = round v37
v39 = bit_or v38 v32
v40 = bit_or v39 v24
store32 arg(1) v40
8 registers, 39 instructions: 8 registers, 42 instructions:
0 r0 = splat 437F0000 (255) 0 r0 = splat 437F0000 (255)
1 r1 = splat 3B808081 (0.0039215689) 1 r1 = splat 3B808081 (0.0039215689)
2 r2 = splat 3F800000 (1) 2 r2 = splat 3F800000 (1)
@ -135,30 +138,33 @@ loop:
12 r5 = fma_f32 r7 r4 r5 12 r5 = fma_f32 r7 r4 r5
13 r5 = mul_f32 r5 r0 13 r5 = mul_f32 r5 r0
14 r5 = round r5 14 r5 = round r5
15 r7 = shr_i32 r6 16 15 r5 = shl_i32 r5 8
16 r7 = bit_and r3 r7 16 r7 = shr_i32 r6 16
17 r7 = to_f32 r7 17 r7 = bit_and r3 r7
18 r7 = mul_f32 r7 r1 18 r7 = to_f32 r7
19 r7 = mul_f32 r7 r4 19 r7 = mul_f32 r7 r1
20 r7 = mul_f32 r7 r0 20 r7 = mul_f32 r7 r4
21 r7 = round r7 21 r7 = mul_f32 r7 r0
22 r5 = pack r7 r5 8 22 r7 = round r7
23 r7 = shr_i32 r6 8 23 r5 = bit_or r7 r5
24 r7 = bit_and r3 r7 24 r5 = shl_i32 r5 16
25 r7 = to_f32 r7 25 r7 = shr_i32 r6 8
26 r7 = mul_f32 r7 r1 26 r7 = bit_and r3 r7
27 r7 = mul_f32 r7 r4 27 r7 = to_f32 r7
28 r7 = mul_f32 r7 r0 28 r7 = mul_f32 r7 r1
29 r7 = round r7 29 r7 = mul_f32 r7 r4
30 r6 = bit_and r3 r6 30 r7 = mul_f32 r7 r0
31 r6 = to_f32 r6 31 r7 = round r7
32 r6 = mul_f32 r6 r1 32 r7 = shl_i32 r7 8
33 r4 = mul_f32 r6 r4 33 r6 = bit_and r3 r6
34 r4 = mul_f32 r4 r0 34 r6 = to_f32 r6
35 r4 = round r4 35 r6 = mul_f32 r6 r1
36 r7 = pack r4 r7 8 36 r4 = mul_f32 r6 r4
37 r5 = pack r7 r5 16 37 r4 = mul_f32 r4 r0
38 store32 arg(1) r5 38 r4 = round r4
39 r7 = bit_or r4 r7
40 r5 = bit_or r7 r5
41 store32 arg(1) r5
G8 over A8 G8 over A8
11 values (originally 15): 11 values (originally 15):
@ -233,7 +239,7 @@ loop:
18 store8 arg(1) r7 18 store8 arg(1) r7
G8 over RGBA_8888 G8 over RGBA_8888
39 values (originally 43): 42 values (originally 46):
↑ v0 = splat 437F0000 (255) ↑ v0 = splat 437F0000 (255)
↑ v1 = splat 3F800000 (1) ↑ v1 = splat 3F800000 (1)
↑ v2 = splat 0 (0) ↑ v2 = splat 0 (0)
@ -245,36 +251,39 @@ G8 over RGBA_8888
v8 = fma_f32 v7 v2 v1 v8 = fma_f32 v7 v2 v1
v9 = mul_f32 v8 v0 v9 = mul_f32 v8 v0
v10 = round v9 v10 = round v9
v11 = load8 arg(0) v11 = shl_i32 v10 8
v12 = to_f32 v11 v12 = load8 arg(0)
v13 = mul_f32 v12 v3 v13 = to_f32 v12
v14 = shr_i32 v4 16 v14 = mul_f32 v13 v3
↑ v15 = splat FF (3.5733111e-43) v15 = shr_i32 v4 16
v16 = bit_and v15 v14 ↑ v16 = splat FF (3.5733111e-43)
v17 = to_f32 v16 v17 = bit_and v16 v15
v18 = mul_f32 v17 v3 v18 = to_f32 v17
v19 = fma_f32 v18 v2 v13 v19 = mul_f32 v18 v3
v20 = mul_f32 v19 v0 v20 = fma_f32 v19 v2 v14
v21 = round v20 v21 = mul_f32 v20 v0
v22 = pack v21 v10 8 v22 = round v21
v23 = shr_i32 v4 8 v23 = bit_or v22 v11
v24 = bit_and v15 v23 v24 = shl_i32 v23 16
v25 = to_f32 v24 v25 = shr_i32 v4 8
v26 = mul_f32 v25 v3 v26 = bit_and v16 v25
v27 = fma_f32 v26 v2 v13 v27 = to_f32 v26
v28 = mul_f32 v27 v0 v28 = mul_f32 v27 v3
v29 = round v28 v29 = fma_f32 v28 v2 v14
v30 = bit_and v15 v4 v30 = mul_f32 v29 v0
v31 = to_f32 v30 v31 = round v30
v32 = mul_f32 v31 v3 v32 = shl_i32 v31 8
v33 = fma_f32 v32 v2 v13 v33 = bit_and v16 v4
v34 = mul_f32 v33 v0 v34 = to_f32 v33
v35 = round v34 v35 = mul_f32 v34 v3
v36 = pack v35 v29 8 v36 = fma_f32 v35 v2 v14
v37 = pack v36 v22 16 v37 = mul_f32 v36 v0
store32 arg(1) v37 v38 = round v37
v39 = bit_or v38 v32
v40 = bit_or v39 v24
store32 arg(1) v40
9 registers, 39 instructions: 9 registers, 42 instructions:
0 r0 = splat 437F0000 (255) 0 r0 = splat 437F0000 (255)
1 r1 = splat 3F800000 (1) 1 r1 = splat 3F800000 (1)
2 r2 = splat 0 (0) 2 r2 = splat 0 (0)
@ -288,33 +297,36 @@ loop:
9 r6 = fma_f32 r6 r2 r1 9 r6 = fma_f32 r6 r2 r1
10 r6 = mul_f32 r6 r0 10 r6 = mul_f32 r6 r0
11 r6 = round r6 11 r6 = round r6
12 r7 = load8 arg(0) 12 r6 = shl_i32 r6 8
13 r7 = to_f32 r7 13 r7 = load8 arg(0)
14 r7 = mul_f32 r7 r3 14 r7 = to_f32 r7
15 r8 = shr_i32 r5 16 15 r7 = mul_f32 r7 r3
16 r8 = bit_and r4 r8 16 r8 = shr_i32 r5 16
17 r8 = to_f32 r8 17 r8 = bit_and r4 r8
18 r8 = mul_f32 r8 r3 18 r8 = to_f32 r8
19 r8 = fma_f32 r8 r2 r7 19 r8 = mul_f32 r8 r3
20 r8 = mul_f32 r8 r0 20 r8 = fma_f32 r8 r2 r7
21 r8 = round r8 21 r8 = mul_f32 r8 r0
22 r6 = pack r8 r6 8 22 r8 = round r8
23 r8 = shr_i32 r5 8 23 r6 = bit_or r8 r6
24 r8 = bit_and r4 r8 24 r6 = shl_i32 r6 16
25 r8 = to_f32 r8 25 r8 = shr_i32 r5 8
26 r8 = mul_f32 r8 r3 26 r8 = bit_and r4 r8
27 r8 = fma_f32 r8 r2 r7 27 r8 = to_f32 r8
28 r8 = mul_f32 r8 r0 28 r8 = mul_f32 r8 r3
29 r8 = round r8 29 r8 = fma_f32 r8 r2 r7
30 r5 = bit_and r4 r5 30 r8 = mul_f32 r8 r0
31 r5 = to_f32 r5 31 r8 = round r8
32 r5 = mul_f32 r5 r3 32 r8 = shl_i32 r8 8
33 r7 = fma_f32 r5 r2 r7 33 r5 = bit_and r4 r5
34 r7 = mul_f32 r7 r0 34 r5 = to_f32 r5
35 r7 = round r7 35 r5 = mul_f32 r5 r3
36 r8 = pack r7 r8 8 36 r7 = fma_f32 r5 r2 r7
37 r6 = pack r8 r6 16 37 r7 = mul_f32 r7 r0
38 store32 arg(1) r6 38 r7 = round r7
39 r8 = bit_or r7 r8
40 r6 = bit_or r8 r6
41 store32 arg(1) r6
RGBA_8888 over A8 RGBA_8888 over A8
15 values (originally 33): 15 values (originally 33):
@ -427,7 +439,7 @@ loop:
33 store8 arg(1) r8 33 store8 arg(1) r8
RGBA_8888 over RGBA_8888 RGBA_8888 over RGBA_8888
51 values (originally 55): 54 values (originally 58):
↑ v0 = splat 437F0000 (255) ↑ v0 = splat 437F0000 (255)
↑ v1 = splat 3B808081 (0.0039215689) ↑ v1 = splat 3B808081 (0.0039215689)
v2 = load32 arg(0) v2 = load32 arg(0)
@ -443,44 +455,47 @@ RGBA_8888 over RGBA_8888
v12 = fma_f32 v11 v7 v5 v12 = fma_f32 v11 v7 v5
v13 = mul_f32 v12 v0 v13 = mul_f32 v12 v0
v14 = round v13 v14 = round v13
v15 = shr_i32 v2 16 v15 = shl_i32 v14 8
↑ v16 = splat FF (3.5733111e-43) v16 = shr_i32 v2 16
v17 = bit_and v16 v15 ↑ v17 = splat FF (3.5733111e-43)
v18 = to_f32 v17 v18 = bit_and v17 v16
v19 = mul_f32 v18 v1 v19 = to_f32 v18
v20 = shr_i32 v8 16 v20 = mul_f32 v19 v1
v21 = bit_and v16 v20 v21 = shr_i32 v8 16
v22 = to_f32 v21 v22 = bit_and v17 v21
v23 = mul_f32 v22 v1 v23 = to_f32 v22
v24 = fma_f32 v23 v7 v19 v24 = mul_f32 v23 v1
v25 = mul_f32 v24 v0 v25 = fma_f32 v24 v7 v20
v26 = round v25 v26 = mul_f32 v25 v0
v27 = pack v26 v14 8 v27 = round v26
v28 = shr_i32 v2 8 v28 = bit_or v27 v15
v29 = bit_and v16 v28 v29 = shl_i32 v28 16
v30 = to_f32 v29 v30 = shr_i32 v2 8
v31 = mul_f32 v30 v1 v31 = bit_and v17 v30
v32 = shr_i32 v8 8 v32 = to_f32 v31
v33 = bit_and v16 v32 v33 = mul_f32 v32 v1
v34 = to_f32 v33 v34 = shr_i32 v8 8
v35 = mul_f32 v34 v1 v35 = bit_and v17 v34
v36 = fma_f32 v35 v7 v31 v36 = to_f32 v35
v37 = mul_f32 v36 v0 v37 = mul_f32 v36 v1
v38 = round v37 v38 = fma_f32 v37 v7 v33
v39 = bit_and v16 v2 v39 = mul_f32 v38 v0
v40 = to_f32 v39 v40 = round v39
v41 = mul_f32 v40 v1 v41 = shl_i32 v40 8
v42 = bit_and v16 v8 v42 = bit_and v17 v2
v43 = to_f32 v42 v43 = to_f32 v42
v44 = mul_f32 v43 v1 v44 = mul_f32 v43 v1
v45 = fma_f32 v44 v7 v41 v45 = bit_and v17 v8
v46 = mul_f32 v45 v0 v46 = to_f32 v45
v47 = round v46 v47 = mul_f32 v46 v1
v48 = pack v47 v38 8 v48 = fma_f32 v47 v7 v44
v49 = pack v48 v27 16 v49 = mul_f32 v48 v0
store32 arg(1) v49 v50 = round v49
v51 = bit_or v50 v41
v52 = bit_or v51 v29
store32 arg(1) v52
10 registers, 51 instructions: 10 registers, 54 instructions:
0 r0 = splat 437F0000 (255) 0 r0 = splat 437F0000 (255)
1 r1 = splat 3B808081 (0.0039215689) 1 r1 = splat 3B808081 (0.0039215689)
2 r2 = splat 3F800000 (1) 2 r2 = splat 3F800000 (1)
@ -498,44 +513,47 @@ loop:
13 r6 = fma_f32 r8 r5 r6 13 r6 = fma_f32 r8 r5 r6
14 r6 = mul_f32 r6 r0 14 r6 = mul_f32 r6 r0
15 r6 = round r6 15 r6 = round r6
16 r8 = shr_i32 r4 16 16 r6 = shl_i32 r6 8
17 r8 = bit_and r3 r8 17 r8 = shr_i32 r4 16
18 r8 = to_f32 r8 18 r8 = bit_and r3 r8
19 r8 = mul_f32 r8 r1 19 r8 = to_f32 r8
20 r9 = shr_i32 r7 16 20 r8 = mul_f32 r8 r1
21 r9 = bit_and r3 r9 21 r9 = shr_i32 r7 16
22 r9 = to_f32 r9 22 r9 = bit_and r3 r9
23 r9 = mul_f32 r9 r1 23 r9 = to_f32 r9
24 r8 = fma_f32 r9 r5 r8 24 r9 = mul_f32 r9 r1
25 r8 = mul_f32 r8 r0 25 r8 = fma_f32 r9 r5 r8
26 r8 = round r8 26 r8 = mul_f32 r8 r0
27 r6 = pack r8 r6 8 27 r8 = round r8
28 r8 = shr_i32 r4 8 28 r6 = bit_or r8 r6
29 r8 = bit_and r3 r8 29 r6 = shl_i32 r6 16
30 r8 = to_f32 r8 30 r8 = shr_i32 r4 8
31 r8 = mul_f32 r8 r1 31 r8 = bit_and r3 r8
32 r9 = shr_i32 r7 8 32 r8 = to_f32 r8
33 r9 = bit_and r3 r9 33 r8 = mul_f32 r8 r1
34 r9 = to_f32 r9 34 r9 = shr_i32 r7 8
35 r9 = mul_f32 r9 r1 35 r9 = bit_and r3 r9
36 r8 = fma_f32 r9 r5 r8 36 r9 = to_f32 r9
37 r8 = mul_f32 r8 r0 37 r9 = mul_f32 r9 r1
38 r8 = round r8 38 r8 = fma_f32 r9 r5 r8
39 r4 = bit_and r3 r4 39 r8 = mul_f32 r8 r0
40 r4 = to_f32 r4 40 r8 = round r8
41 r4 = mul_f32 r4 r1 41 r8 = shl_i32 r8 8
42 r7 = bit_and r3 r7 42 r4 = bit_and r3 r4
43 r7 = to_f32 r7 43 r4 = to_f32 r4
44 r7 = mul_f32 r7 r1 44 r4 = mul_f32 r4 r1
45 r4 = fma_f32 r7 r5 r4 45 r7 = bit_and r3 r7
46 r4 = mul_f32 r4 r0 46 r7 = to_f32 r7
47 r4 = round r4 47 r7 = mul_f32 r7 r1
48 r8 = pack r4 r8 8 48 r4 = fma_f32 r7 r5 r4
49 r6 = pack r8 r6 16 49 r4 = mul_f32 r4 r0
50 store32 arg(1) r6 50 r4 = round r4
51 r8 = bit_or r4 r8
52 r6 = bit_or r8 r6
53 store32 arg(1) r6
I32 (Naive) 8888 over 8888 I32 (Naive) 8888 over 8888
33 values (originally 33): 36 values (originally 36):
v0 = load32 arg(0) v0 = load32 arg(0)
v1 = shr_i32 v0 24 v1 = shr_i32 v0 24
↑ v2 = splat 100 (3.5873241e-43) ↑ v2 = splat 100 (3.5873241e-43)
@ -545,32 +563,35 @@ I32 (Naive) 8888 over 8888
v6 = mul_i32 v5 v3 v6 = mul_i32 v5 v3
v7 = shr_i32 v6 8 v7 = shr_i32 v6 8
v8 = add_i32 v1 v7 v8 = add_i32 v1 v7
v9 = shr_i32 v4 16 v9 = shl_i32 v8 8
↑ v10 = splat FF (3.5733111e-43) v10 = shr_i32 v4 16
v11 = bit_and v10 v9 ↑ v11 = splat FF (3.5733111e-43)
v12 = mul_i32 v11 v3 v12 = bit_and v11 v10
v13 = shr_i32 v12 8 v13 = mul_i32 v12 v3
v14 = shr_i32 v0 16 v14 = shr_i32 v13 8
v15 = bit_and v10 v14 v15 = shr_i32 v0 16
v16 = add_i32 v15 v13 v16 = bit_and v11 v15
v17 = pack v16 v8 8 v17 = add_i32 v16 v14
v18 = shr_i32 v4 8 v18 = bit_or v17 v9
v19 = bit_and v10 v18 v19 = shl_i32 v18 16
v20 = mul_i32 v19 v3 v20 = shr_i32 v4 8
v21 = shr_i32 v20 8 v21 = bit_and v11 v20
v22 = shr_i32 v0 8 v22 = mul_i32 v21 v3
v23 = bit_and v10 v22 v23 = shr_i32 v22 8
v24 = add_i32 v23 v21 v24 = shr_i32 v0 8
v25 = bit_and v10 v4 v25 = bit_and v11 v24
v26 = mul_i32 v25 v3 v26 = add_i32 v25 v23
v27 = shr_i32 v26 8 v27 = shl_i32 v26 8
v28 = bit_and v10 v0 v28 = bit_and v11 v4
v29 = add_i32 v28 v27 v29 = mul_i32 v28 v3
v30 = pack v29 v24 8 v30 = shr_i32 v29 8
v31 = pack v30 v17 16 v31 = bit_and v11 v0
store32 arg(1) v31 v32 = add_i32 v31 v30
v33 = bit_or v32 v27
v34 = bit_or v33 v19
store32 arg(1) v34
8 registers, 33 instructions: 8 registers, 36 instructions:
0 r0 = splat 100 (3.5873241e-43) 0 r0 = splat 100 (3.5873241e-43)
1 r1 = splat FF (3.5733111e-43) 1 r1 = splat FF (3.5733111e-43)
loop: loop:
@ -582,56 +603,62 @@ loop:
7 r6 = mul_i32 r6 r4 7 r6 = mul_i32 r6 r4
8 r6 = shr_i32 r6 8 8 r6 = shr_i32 r6 8
9 r6 = add_i32 r3 r6 9 r6 = add_i32 r3 r6
10 r3 = shr_i32 r5 16 10 r6 = shl_i32 r6 8
11 r3 = bit_and r1 r3 11 r3 = shr_i32 r5 16
12 r3 = mul_i32 r3 r4 12 r3 = bit_and r1 r3
13 r3 = shr_i32 r3 8 13 r3 = mul_i32 r3 r4
14 r7 = shr_i32 r2 16 14 r3 = shr_i32 r3 8
15 r7 = bit_and r1 r7 15 r7 = shr_i32 r2 16
16 r3 = add_i32 r7 r3 16 r7 = bit_and r1 r7
17 r6 = pack r3 r6 8 17 r3 = add_i32 r7 r3
18 r3 = shr_i32 r5 8 18 r6 = bit_or r3 r6
19 r3 = bit_and r1 r3 19 r6 = shl_i32 r6 16
20 r3 = mul_i32 r3 r4 20 r3 = shr_i32 r5 8
21 r3 = shr_i32 r3 8 21 r3 = bit_and r1 r3
22 r7 = shr_i32 r2 8 22 r3 = mul_i32 r3 r4
23 r7 = bit_and r1 r7 23 r3 = shr_i32 r3 8
24 r3 = add_i32 r7 r3 24 r7 = shr_i32 r2 8
25 r5 = bit_and r1 r5 25 r7 = bit_and r1 r7
26 r4 = mul_i32 r5 r4 26 r3 = add_i32 r7 r3
27 r4 = shr_i32 r4 8 27 r3 = shl_i32 r3 8
28 r2 = bit_and r1 r2 28 r5 = bit_and r1 r5
29 r4 = add_i32 r2 r4 29 r4 = mul_i32 r5 r4
30 r3 = pack r4 r3 8 30 r4 = shr_i32 r4 8
31 r6 = pack r3 r6 16 31 r2 = bit_and r1 r2
32 store32 arg(1) r6 32 r4 = add_i32 r2 r4
33 r3 = bit_or r4 r3
34 r6 = bit_or r3 r6
35 store32 arg(1) r6
23 values (originally 23): 26 values (originally 26):
v0 = load32 arg(1) v0 = load32 arg(1)
v1 = shr_i32 v0 24 v1 = shr_i32 v0 24
v2 = load32 arg(0) v2 = load32 arg(0)
v3 = shr_i32 v2 24 v3 = shr_i32 v2 24
v4 = add_i32 v3 v1 v4 = add_i32 v3 v1
v5 = shr_i32 v0 16 v5 = shl_i32 v4 8
↑ v6 = splat FF (3.5733111e-43) v6 = shr_i32 v0 16
v7 = bit_and v6 v5 ↑ v7 = splat FF (3.5733111e-43)
v8 = shr_i32 v2 16 v8 = bit_and v7 v6
v9 = bit_and v6 v8 v9 = shr_i32 v2 16
v10 = add_i32 v9 v7 v10 = bit_and v7 v9
v11 = pack v10 v4 8 v11 = add_i32 v10 v8
v12 = shr_i32 v0 8 v12 = bit_or v11 v5
v13 = bit_and v6 v12 v13 = shl_i32 v12 16
v14 = shr_i32 v2 8 v14 = shr_i32 v0 8
v15 = bit_and v6 v14 v15 = bit_and v7 v14
v16 = add_i32 v15 v13 v16 = shr_i32 v2 8
v17 = bit_and v6 v0 v17 = bit_and v7 v16
v18 = bit_and v6 v2 v18 = add_i32 v17 v15
v19 = add_i32 v18 v17 v19 = shl_i32 v18 8
v20 = pack v19 v16 8 v20 = bit_and v7 v0
v21 = pack v20 v11 16 v21 = bit_and v7 v2
store32 arg(1) v21 v22 = add_i32 v21 v20
v23 = bit_or v22 v19
v24 = bit_or v23 v13
store32 arg(1) v24
6 registers, 23 instructions: 6 registers, 26 instructions:
0 r0 = splat FF (3.5733111e-43) 0 r0 = splat FF (3.5733111e-43)
loop: loop:
1 r1 = load32 arg(1) 1 r1 = load32 arg(1)
@ -639,21 +666,24 @@ loop:
3 r3 = load32 arg(0) 3 r3 = load32 arg(0)
4 r4 = shr_i32 r3 24 4 r4 = shr_i32 r3 24
5 r2 = add_i32 r4 r2 5 r2 = add_i32 r4 r2
6 r4 = shr_i32 r1 16 6 r2 = shl_i32 r2 8
7 r4 = bit_and r0 r4 7 r4 = shr_i32 r1 16
8 r5 = shr_i32 r3 16 8 r4 = bit_and r0 r4
9 r5 = bit_and r0 r5 9 r5 = shr_i32 r3 16
10 r4 = add_i32 r5 r4 10 r5 = bit_and r0 r5
11 r2 = pack r4 r2 8 11 r4 = add_i32 r5 r4
12 r4 = shr_i32 r1 8 12 r2 = bit_or r4 r2
13 r4 = bit_and r0 r4 13 r2 = shl_i32 r2 16
14 r5 = shr_i32 r3 8 14 r4 = shr_i32 r1 8
15 r5 = bit_and r0 r5 15 r4 = bit_and r0 r4
16 r4 = add_i32 r5 r4 16 r5 = shr_i32 r3 8
17 r1 = bit_and r0 r1 17 r5 = bit_and r0 r5
18 r3 = bit_and r0 r3 18 r4 = add_i32 r5 r4
19 r1 = add_i32 r3 r1 19 r4 = shl_i32 r4 8
20 r4 = pack r1 r4 8 20 r1 = bit_and r0 r1
21 r2 = pack r4 r2 16 21 r3 = bit_and r0 r3
22 store32 arg(1) r2 22 r1 = add_i32 r3 r1
23 r4 = bit_or r1 r4
24 r2 = bit_or r4 r2
25 store32 arg(1) r2

View File

@ -350,7 +350,6 @@ namespace skvm {
case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}, fs(id)...); break; case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}, fs(id)...); break;
case Op::select_q14: write(o, V{id}, "=", op, V{x}, V{y}, V{z}, fs(id)...); break; case Op::select_q14: write(o, V{id}, "=", op, V{x}, V{y}, V{z}, fs(id)...); break;
case Op::pack: write(o, V{id}, "=", op, V{x}, V{y}, Shift{immz}, fs(id)...); break;
case Op::ceil: write(o, V{id}, "=", op, V{x}, fs(id)...); break; case Op::ceil: write(o, V{id}, "=", op, V{x}, fs(id)...); break;
case Op::floor: write(o, V{id}, "=", op, V{x}, fs(id)...); break; case Op::floor: write(o, V{id}, "=", op, V{x}, fs(id)...); break;
@ -502,7 +501,6 @@ namespace skvm {
case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break; case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
case Op::select_q14: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break; case Op::select_q14: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
case Op::pack: write(o, R{d}, "=", op, R{x}, R{y}, Shift{immz}); break;
case Op::ceil: write(o, R{d}, "=", op, R{x}); break; case Op::ceil: write(o, R{d}, "=", op, R{x}); break;
case Op::floor: write(o, R{d}, "=", op, R{x}); break; case Op::floor: write(o, R{d}, "=", op, R{x}); break;
@ -1194,8 +1192,7 @@ namespace skvm {
} }
I32 Builder::pack(I32 x, I32 y, int bits) { I32 Builder::pack(I32 x, I32 y, int bits) {
if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|(Y<<bits)); } return bit_or(x, shl(y, bits));
return {this, this->push(Op::pack, x.id,y.id,NA, 0,bits)};
} }
F32 Builder::ceil(F32 x) { F32 Builder::ceil(F32 x) {
@ -2758,8 +2755,6 @@ namespace skvm {
case Op::bit_xor: vals[i] = b->CreateXor(vals[x], vals[y]); break; case Op::bit_xor: vals[i] = b->CreateXor(vals[x], vals[y]); break;
case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break; case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break;
case Op::pack: vals[i] = b->CreateOr(vals[x], b->CreateShl(vals[y], immz)); break;
case Op::select: case Op::select:
vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]); vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]);
break; break;
@ -3390,6 +3385,7 @@ namespace skvm {
z = inst.z; z = inst.z;
const int immy = inst.immy, const int immy = inst.immy,
immz = inst.immz; immz = inst.immz;
(void)immz; // not yet used on arm64
// alloc_tmp() returns a temporary register, freed manually with free_tmp(). // alloc_tmp() returns a temporary register, freed manually with free_tmp().
auto alloc_tmp = [&]() -> Reg { auto alloc_tmp = [&]() -> Reg {
@ -3899,11 +3895,6 @@ namespace skvm {
case Op:: gt_f32: a->vcmpltps (dst(y), r(y), any(x)); break; case Op:: gt_f32: a->vcmpltps (dst(y), r(y), any(x)); break;
case Op::gte_f32: a->vcmpleps (dst(y), r(y), any(x)); break; case Op::gte_f32: a->vcmpleps (dst(y), r(y), any(x)); break;
// It's safe to alias dst(y) only when y != x. Otherwise we'd overwrite x!
case Op::pack: a->vpslld(dst(y != x ? y : NA), r(y), immz);
a->vpor (dst(), dst(), any(x));
break;
case Op::ceil: case Op::ceil:
if (in_reg(x)) { a->vroundps(dst(x), r(x), Assembler::CEIL); } if (in_reg(x)) { a->vroundps(dst(x), r(x), Assembler::CEIL); }
else { a->vroundps(dst(), any(x), Assembler::CEIL); } else { a->vroundps(dst(), any(x), Assembler::CEIL); }
@ -4045,12 +4036,6 @@ namespace skvm {
case Op::eq_i32: a->cmeq4s(dst(), r(x), r(y)); break; case Op::eq_i32: a->cmeq4s(dst(), r(x), r(y)); break;
case Op::gt_i32: a->cmgt4s(dst(), r(x), r(y)); break; case Op::gt_i32: a->cmgt4s(dst(), r(x), r(y)); break;
case Op::pack:
if (try_alias(x)) { a->sli4s ( r(x), r(y), immz); }
else { a->shl4s (dst(), r(y), immz);
a->orr16b(dst(), dst(), r(x)); }
break;
case Op::to_f32: a->scvtf4s (dst(), r(x)); break; case Op::to_f32: a->scvtf4s (dst(), r(x)); break;
case Op::trunc: a->fcvtzs4s(dst(), r(x)); break; case Op::trunc: a->fcvtzs4s(dst(), r(x)); break;
case Op::round: a->fcvtns4s(dst(), r(x)); break; case Op::round: a->fcvtns4s(dst(), r(x)); break;

View File

@ -423,7 +423,7 @@ namespace skvm {
M(gte_f32) M(gt_f32) M(gt_i32) M(gt_q14) \ M(gte_f32) M(gt_f32) M(gt_i32) M(gt_q14) \
M(bit_and) M(bit_or) M(bit_xor) M(bit_clear) \ M(bit_and) M(bit_or) M(bit_xor) M(bit_clear) \
M(bit_and_q14) M(bit_or_q14) M(bit_xor_q14) M(bit_clear_q14) \ M(bit_and_q14) M(bit_or_q14) M(bit_xor_q14) M(bit_clear_q14) \
M(select) M(select_q14) M(pack) \ M(select) M(select_q14)
// End of SKVM_OPS // End of SKVM_OPS
enum class Op : int { enum class Op : int {
@ -786,7 +786,7 @@ namespace skvm {
Q14 select(Q14a cond, Q14a t, Q14a f) { return select(_(cond), _(t), _(f)); } Q14 select(Q14a cond, Q14a t, Q14a f) { return select(_(cond), _(t), _(f)); }
I32 extract(I32 x, int bits, I32 z); // (x>>bits) & z I32 extract(I32 x, int bits, I32 z); // (x>>bits) & z
I32 pack (I32 x, I32 y, int bits); // x | (y << bits), assuming (x & (y << bits)) == 0 I32 pack (I32 x, I32 y, int bits); // x | (y<<bits)
I32 extract(I32a x, int bits, I32a z) { return extract(_(x), bits, _(z)); } I32 extract(I32a x, int bits, I32a z) { return extract(_(x), bits, _(z)); }
I32 pack (I32a x, I32a y, int bits) { return pack (_(x), _(y), bits); } I32 pack (I32a x, I32a y, int bits) { return pack (_(x), _(y), bits); }

View File

@ -289,8 +289,6 @@ namespace SK_OPTS_NS {
CASE(Op::select): r[d].i32 = skvx::if_then_else(r[x].i32, r[y].i32, r[z].i32); CASE(Op::select): r[d].i32 = skvx::if_then_else(r[x].i32, r[y].i32, r[z].i32);
break; break;
CASE(Op::pack): r[d].u32 = r[x].u32 | (r[y].u32 << immz); break;
CASE(Op::ceil): r[d].f32 = skvx::ceil(r[x].f32) ; break; CASE(Op::ceil): r[d].f32 = skvx::ceil(r[x].f32) ; break;
CASE(Op::floor): r[d].f32 = skvx::floor(r[x].f32) ; break; CASE(Op::floor): r[d].f32 = skvx::floor(r[x].f32) ; break;
CASE(Op::to_f32): r[d].f32 = skvx::cast<float>( r[x].i32 ); break; CASE(Op::to_f32): r[d].f32 = skvx::cast<float>( r[x].i32 ); break;

View File

@ -2508,3 +2508,32 @@ DEF_TEST(SkVM_Q14, r) {
} }
} }
// Regression test for pack(): builds a 16-bit value from two pack() calls,
// with the higher-shifted field packed first, and checks that both fields
// survive in the value written by store16().
DEF_TEST(SkVM_badpack, r) {
// Test case distilled from actual failing draw,
// originally with a bad arm64 implementation of pack().
skvm::Builder p;
{
skvm::Arg uniforms = p.uniform(),
dst = p.varying<uint16_t>();
// NOTE: this local `r` shadows the DEF_TEST argument `r`, but only inside
// these braces; the REPORTER_ASSERT below refers to the outer `r`.
// The uniform at offset 8 appears to be the third float (1.0f) of the
// uniforms array passed to eval() below, giving round(1.0 * 15) == 0xf.
// NOTE(review): assumes uniform32() takes a byte offset — confirm.
skvm::I32 r = round(bit_cast(p.uniform32(uniforms, 8)) * 15),
a = p.splat(0xf);
skvm::I32 _4444 = p.splat(0);
// pack(x,y,bits) is x | (y<<bits): r goes to bits 12-15 first...
_4444 = pack(_4444, r, 12);
// ...then a is OR'd into bits 0-3.  This second call must not disturb
// the bits written by the first one.
_4444 = pack(_4444, a, 0);
store16(dst, _4444);
}
test_jit_and_interpreter(p.done(), [&](const skvm::Program& program){
const float uniforms[] = { 0.0f, 0.0f,
1.0f, 0.0f, 0.0f, 1.0f };
// 17 elements — presumably chosen so eval() covers more than one pass
// through the program; TODO confirm.
uint16_t dst[17] = {0};
program.eval(17, uniforms,dst);
for (int i = 0; i < 17; i++) {
// Expected: (0xf << 12) | 0xf == 0xf00f in every element.
REPORTER_ASSERT(r, dst[i] == 0xf00f, "got %04x, want %04x\n", dst[i], 0xf00f);
}
});
}