skia2/resources/SkVMTest.expected
Mike Klein 4067a9429a the return of bit_clear
bit_clear is at least useful as a special case for select(),
which helps with code readability.

Add is_NaN() and use these all together in sweep gradient.

Change-Id: I57a54f8956f85e0db0662b33f8446b8dc7342d8d
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/281685
Reviewed-by: Mike Reed <reed@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
2020-04-05 16:44:16 +00:00

776 lines
18 KiB
Plaintext

A8 over A8
14 values (originally 17):
v0 = load8 arg(0)
v1 = to_f32 v0
↑ v2 = splat 3F800000 (1)
↑ v3 = splat 3B808081 (0.0039215689)
v4 = fnma_f32 v1 v3 v2
v5 = load8 arg(1)
v6 = to_f32 v5
v7 = mul_f32 v6 v3
v8 = mul_f32 v1 v3
v9 = fma_f32 v7 v4 v8
↑ v10 = splat 437F0000 (255)
v11 = mul_f32 v9 v10
v12 = round v11
store8 arg(1) v12
6 registers, 14 instructions:
0 r0 = splat 3F800000 (1)
1 r1 = splat 3B808081 (0.0039215689)
2 r2 = splat 437F0000 (255)
loop:
3 r3 = load8 arg(0)
4 r3 = to_f32 r3
5 r4 = fnma_f32 r3 r1 r0
6 r5 = load8 arg(1)
7 r5 = to_f32 r5
8 r5 = mul_f32 r5 r1
9 r3 = mul_f32 r3 r1
10 r3 = fma_f32 r5 r4 r3
11 r3 = mul_f32 r3 r2
12 r3 = round r3
13 store8 arg(1) r3
A8 over G8
19 values (originally 24):
v0 = load8 arg(0)
v1 = to_f32 v0
↑ v2 = splat 3F800000 (1)
↑ v3 = splat 3B808081 (0.0039215689)
v4 = fnma_f32 v1 v3 v2
v5 = load8 arg(1)
v6 = to_f32 v5
v7 = mul_f32 v6 v3
v8 = mul_f32 v7 v4
↑ v9 = splat 3D93DD98 (0.0722)
v10 = mul_f32 v8 v9
↑ v11 = splat 3F371759 (0.71520001)
v12 = fma_f32 v8 v11 v10
↑ v13 = splat 3E59B3D0 (0.21259999)
v14 = fma_f32 v8 v13 v12
↑ v15 = splat 437F0000 (255)
v16 = mul_f32 v14 v15
v17 = round v16
store8 arg(1) v17
8 registers, 19 instructions:
0 r0 = splat 3F800000 (1)
1 r1 = splat 3B808081 (0.0039215689)
2 r2 = splat 3D93DD98 (0.0722)
3 r3 = splat 3F371759 (0.71520001)
4 r4 = splat 3E59B3D0 (0.21259999)
5 r5 = splat 437F0000 (255)
loop:
6 r6 = load8 arg(0)
7 r6 = to_f32 r6
8 r6 = fnma_f32 r6 r1 r0
9 r7 = load8 arg(1)
10 r7 = to_f32 r7
11 r7 = mul_f32 r7 r1
12 r6 = mul_f32 r7 r6
13 r7 = mul_f32 r6 r2
14 r7 = fma_f32 r6 r3 r7
15 r7 = fma_f32 r6 r4 r7
16 r7 = mul_f32 r7 r5
17 r7 = round r7
18 store8 arg(1) r7
A8 over RGBA_8888
39 values (originally 41):
v0 = load8 arg(0)
v1 = to_f32 v0
↑ v2 = splat 3F800000 (1)
↑ v3 = splat 3B808081 (0.0039215689)
v4 = fnma_f32 v1 v3 v2
v5 = load32 arg(1)
v6 = shr_i32 v5 24
v7 = to_f32 v6
v8 = mul_f32 v7 v3
v9 = mul_f32 v1 v3
v10 = fma_f32 v8 v4 v9
↑ v11 = splat 437F0000 (255)
v12 = mul_f32 v10 v11
v13 = shr_i32 v5 16
↑ v14 = splat FF (3.5733111e-43)
v15 = bit_and v14 v13
v16 = round v12
v17 = to_f32 v15
v18 = mul_f32 v17 v3
v19 = mul_f32 v18 v4
v20 = mul_f32 v19 v11
v21 = round v20
v22 = pack v21 v16 8
v23 = shr_i32 v5 8
v24 = bit_and v14 v23
v25 = to_f32 v24
v26 = mul_f32 v25 v3
v27 = mul_f32 v26 v4
v28 = mul_f32 v27 v11
v29 = round v28
v30 = bit_and v14 v5
v31 = to_f32 v30
v32 = mul_f32 v31 v3
v33 = mul_f32 v32 v4
v34 = mul_f32 v33 v11
v35 = round v34
v36 = pack v35 v29 8
v37 = pack v36 v22 16
store32 arg(1) v37
8 registers, 39 instructions:
0 r0 = splat 3F800000 (1)
1 r1 = splat 3B808081 (0.0039215689)
2 r2 = splat 437F0000 (255)
3 r3 = splat FF (3.5733111e-43)
loop:
4 r4 = load8 arg(0)
5 r4 = to_f32 r4
6 r5 = fnma_f32 r4 r1 r0
7 r6 = load32 arg(1)
8 r7 = shr_i32 r6 24
9 r7 = to_f32 r7
10 r7 = mul_f32 r7 r1
11 r4 = mul_f32 r4 r1
12 r4 = fma_f32 r7 r5 r4
13 r4 = mul_f32 r4 r2
14 r7 = shr_i32 r6 16
15 r7 = bit_and r3 r7
16 r4 = round r4
17 r7 = to_f32 r7
18 r7 = mul_f32 r7 r1
19 r7 = mul_f32 r7 r5
20 r7 = mul_f32 r7 r2
21 r7 = round r7
22 r4 = pack r7 r4 8
23 r7 = shr_i32 r6 8
24 r7 = bit_and r3 r7
25 r7 = to_f32 r7
26 r7 = mul_f32 r7 r1
27 r7 = mul_f32 r7 r5
28 r7 = mul_f32 r7 r2
29 r7 = round r7
30 r6 = bit_and r3 r6
31 r6 = to_f32 r6
32 r6 = mul_f32 r6 r1
33 r5 = mul_f32 r6 r5
34 r5 = mul_f32 r5 r2
35 r5 = round r5
36 r7 = pack r5 r7 8
37 r4 = pack r7 r4 16
38 store32 arg(1) r4
G8 over A8
11 values (originally 15):
v0 = load8 arg(1)
v1 = to_f32 v0
↑ v2 = splat 3B808081 (0.0039215689)
v3 = mul_f32 v1 v2
↑ v4 = splat 0 (0)
↑ v5 = splat 3F800000 (1)
v6 = fma_f32 v3 v4 v5
↑ v7 = splat 437F0000 (255)
v8 = mul_f32 v6 v7
v9 = round v8
store8 arg(1) v9
5 registers, 11 instructions:
0 r0 = splat 3B808081 (0.0039215689)
1 r1 = splat 0 (0)
2 r2 = splat 3F800000 (1)
3 r3 = splat 437F0000 (255)
loop:
4 r4 = load8 arg(1)
5 r4 = to_f32 r4
6 r4 = mul_f32 r4 r0
7 r4 = fma_f32 r4 r1 r2
8 r4 = mul_f32 r4 r3
9 r4 = round r4
10 store8 arg(1) r4
G8 over G8
19 values (originally 23):
v0 = load8 arg(1)
v1 = to_f32 v0
↑ v2 = splat 3B808081 (0.0039215689)
v3 = mul_f32 v1 v2
v4 = load8 arg(0)
v5 = to_f32 v4
v6 = mul_f32 v5 v2
↑ v7 = splat 0 (0)
v8 = fma_f32 v3 v7 v6
↑ v9 = splat 3D93DD98 (0.0722)
v10 = mul_f32 v8 v9
↑ v11 = splat 3F371759 (0.71520001)
v12 = fma_f32 v8 v11 v10
↑ v13 = splat 3E59B3D0 (0.21259999)
v14 = fma_f32 v8 v13 v12
↑ v15 = splat 437F0000 (255)
v16 = mul_f32 v14 v15
v17 = round v16
store8 arg(1) v17
8 registers, 19 instructions:
0 r0 = splat 3B808081 (0.0039215689)
1 r1 = splat 0 (0)
2 r2 = splat 3D93DD98 (0.0722)
3 r3 = splat 3F371759 (0.71520001)
4 r4 = splat 3E59B3D0 (0.21259999)
5 r5 = splat 437F0000 (255)
loop:
6 r6 = load8 arg(1)
7 r6 = to_f32 r6
8 r6 = mul_f32 r6 r0
9 r7 = load8 arg(0)
10 r7 = to_f32 r7
11 r7 = mul_f32 r7 r0
12 r7 = fma_f32 r6 r1 r7
13 r6 = mul_f32 r7 r2
14 r6 = fma_f32 r7 r3 r6
15 r6 = fma_f32 r7 r4 r6
16 r6 = mul_f32 r6 r5
17 r6 = round r6
18 store8 arg(1) r6
G8 over RGBA_8888
39 values (originally 43):
v0 = load32 arg(1)
v1 = shr_i32 v0 24
v2 = to_f32 v1
↑ v3 = splat 3B808081 (0.0039215689)
v4 = mul_f32 v2 v3
↑ v5 = splat 0 (0)
↑ v6 = splat 3F800000 (1)
v7 = fma_f32 v4 v5 v6
↑ v8 = splat 437F0000 (255)
v9 = mul_f32 v7 v8
v10 = shr_i32 v0 16
↑ v11 = splat FF (3.5733111e-43)
v12 = bit_and v11 v10
v13 = to_f32 v12
v14 = mul_f32 v13 v3
v15 = load8 arg(0)
v16 = to_f32 v15
v17 = mul_f32 v16 v3
v18 = fma_f32 v14 v5 v17
v19 = round v9
v20 = mul_f32 v18 v8
v21 = round v20
v22 = pack v21 v19 8
v23 = shr_i32 v0 8
v24 = bit_and v11 v23
v25 = to_f32 v24
v26 = mul_f32 v25 v3
v27 = fma_f32 v26 v5 v17
v28 = mul_f32 v27 v8
v29 = round v28
v30 = bit_and v11 v0
v31 = to_f32 v30
v32 = mul_f32 v31 v3
v33 = fma_f32 v32 v5 v17
v34 = mul_f32 v33 v8
v35 = round v34
v36 = pack v35 v29 8
v37 = pack v36 v22 16
store32 arg(1) v37
9 registers, 39 instructions:
0 r0 = splat 3B808081 (0.0039215689)
1 r1 = splat 0 (0)
2 r2 = splat 3F800000 (1)
3 r3 = splat 437F0000 (255)
4 r4 = splat FF (3.5733111e-43)
loop:
5 r5 = load32 arg(1)
6 r6 = shr_i32 r5 24
7 r6 = to_f32 r6
8 r6 = mul_f32 r6 r0
9 r6 = fma_f32 r6 r1 r2
10 r6 = mul_f32 r6 r3
11 r7 = shr_i32 r5 16
12 r7 = bit_and r4 r7
13 r7 = to_f32 r7
14 r7 = mul_f32 r7 r0
15 r8 = load8 arg(0)
16 r8 = to_f32 r8
17 r8 = mul_f32 r8 r0
18 r7 = fma_f32 r7 r1 r8
19 r6 = round r6
20 r7 = mul_f32 r7 r3
21 r7 = round r7
22 r6 = pack r7 r6 8
23 r7 = shr_i32 r5 8
24 r7 = bit_and r4 r7
25 r7 = to_f32 r7
26 r7 = mul_f32 r7 r0
27 r7 = fma_f32 r7 r1 r8
28 r7 = mul_f32 r7 r3
29 r7 = round r7
30 r5 = bit_and r4 r5
31 r5 = to_f32 r5
32 r5 = mul_f32 r5 r0
33 r8 = fma_f32 r5 r1 r8
34 r8 = mul_f32 r8 r3
35 r8 = round r8
36 r7 = pack r8 r7 8
37 r6 = pack r7 r6 16
38 store32 arg(1) r6
RGBA_8888 over A8
15 values (originally 33):
v0 = load32 arg(0)
v1 = shr_i32 v0 24
v2 = to_f32 v1
↑ v3 = splat 3F800000 (1)
↑ v4 = splat 3B808081 (0.0039215689)
v5 = fnma_f32 v2 v4 v3
v6 = load8 arg(1)
v7 = to_f32 v6
v8 = mul_f32 v7 v4
v9 = mul_f32 v2 v4
v10 = fma_f32 v8 v5 v9
↑ v11 = splat 437F0000 (255)
v12 = mul_f32 v10 v11
v13 = round v12
store8 arg(1) v13
6 registers, 15 instructions:
0 r0 = splat 3F800000 (1)
1 r1 = splat 3B808081 (0.0039215689)
2 r2 = splat 437F0000 (255)
loop:
3 r3 = load32 arg(0)
4 r3 = shr_i32 r3 24
5 r3 = to_f32 r3
6 r4 = fnma_f32 r3 r1 r0
7 r5 = load8 arg(1)
8 r5 = to_f32 r5
9 r5 = mul_f32 r5 r1
10 r3 = mul_f32 r3 r1
11 r3 = fma_f32 r5 r4 r3
12 r3 = mul_f32 r3 r2
13 r3 = round r3
14 store8 arg(1) r3
RGBA_8888 over G8
34 values (originally 39):
v0 = load32 arg(0)
v1 = shr_i32 v0 24
v2 = to_f32 v1
↑ v3 = splat 3F800000 (1)
↑ v4 = splat 3B808081 (0.0039215689)
v5 = fnma_f32 v2 v4 v3
v6 = shr_i32 v0 16
↑ v7 = splat FF (3.5733111e-43)
v8 = bit_and v7 v6
v9 = load8 arg(1)
v10 = to_f32 v9
v11 = mul_f32 v10 v4
v12 = to_f32 v8
v13 = mul_f32 v12 v4
v14 = fma_f32 v11 v5 v13
↑ v15 = splat 3D93DD98 (0.0722)
v16 = mul_f32 v14 v15
v17 = shr_i32 v0 8
v18 = bit_and v7 v17
v19 = to_f32 v18
v20 = mul_f32 v19 v4
v21 = fma_f32 v11 v5 v20
↑ v22 = splat 3F371759 (0.71520001)
v23 = fma_f32 v21 v22 v16
v24 = bit_and v7 v0
v25 = to_f32 v24
v26 = mul_f32 v25 v4
v27 = fma_f32 v11 v5 v26
↑ v28 = splat 3E59B3D0 (0.21259999)
v29 = fma_f32 v27 v28 v23
↑ v30 = splat 437F0000 (255)
v31 = mul_f32 v29 v30
v32 = round v31
store8 arg(1) v32
12 registers, 34 instructions:
0 r0 = splat 3F800000 (1)
1 r1 = splat 3B808081 (0.0039215689)
2 r2 = splat FF (3.5733111e-43)
3 r3 = splat 3D93DD98 (0.0722)
4 r4 = splat 3F371759 (0.71520001)
5 r5 = splat 3E59B3D0 (0.21259999)
6 r6 = splat 437F0000 (255)
loop:
7 r7 = load32 arg(0)
8 r8 = shr_i32 r7 24
9 r8 = to_f32 r8
10 r8 = fnma_f32 r8 r1 r0
11 r9 = shr_i32 r7 16
12 r9 = bit_and r2 r9
13 r10 = load8 arg(1)
14 r10 = to_f32 r10
15 r10 = mul_f32 r10 r1
16 r9 = to_f32 r9
17 r9 = mul_f32 r9 r1
18 r9 = fma_f32 r10 r8 r9
19 r9 = mul_f32 r9 r3
20 r11 = shr_i32 r7 8
21 r11 = bit_and r2 r11
22 r11 = to_f32 r11
23 r11 = mul_f32 r11 r1
24 r11 = fma_f32 r10 r8 r11
25 r9 = fma_f32 r11 r4 r9
26 r7 = bit_and r2 r7
27 r7 = to_f32 r7
28 r7 = mul_f32 r7 r1
29 r7 = fma_f32 r10 r8 r7
30 r9 = fma_f32 r7 r5 r9
31 r9 = mul_f32 r9 r6
32 r9 = round r9
33 store8 arg(1) r9
RGBA_8888 over RGBA_8888
51 values (originally 55):
v0 = load32 arg(0)
v1 = shr_i32 v0 24
v2 = to_f32 v1
↑ v3 = splat 3F800000 (1)
↑ v4 = splat 3B808081 (0.0039215689)
v5 = fnma_f32 v2 v4 v3
v6 = load32 arg(1)
v7 = shr_i32 v6 24
v8 = to_f32 v7
v9 = mul_f32 v8 v4
v10 = mul_f32 v2 v4
v11 = fma_f32 v9 v5 v10
↑ v12 = splat 437F0000 (255)
v13 = mul_f32 v11 v12
v14 = shr_i32 v6 16
↑ v15 = splat FF (3.5733111e-43)
v16 = bit_and v15 v14
v17 = to_f32 v16
v18 = mul_f32 v17 v4
v19 = shr_i32 v0 16
v20 = bit_and v15 v19
v21 = to_f32 v20
v22 = mul_f32 v21 v4
v23 = fma_f32 v18 v5 v22
v24 = round v13
v25 = mul_f32 v23 v12
v26 = round v25
v27 = pack v26 v24 8
v28 = shr_i32 v6 8
v29 = bit_and v15 v28
v30 = to_f32 v29
v31 = mul_f32 v30 v4
v32 = shr_i32 v0 8
v33 = bit_and v15 v32
v34 = to_f32 v33
v35 = mul_f32 v34 v4
v36 = fma_f32 v31 v5 v35
v37 = bit_and v15 v6
v38 = to_f32 v37
v39 = mul_f32 v38 v4
v40 = bit_and v15 v0
v41 = to_f32 v40
v42 = mul_f32 v41 v4
v43 = fma_f32 v39 v5 v42
v44 = mul_f32 v36 v12
v45 = round v44
v46 = mul_f32 v43 v12
v47 = round v46
v48 = pack v47 v45 8
v49 = pack v48 v27 16
store32 arg(1) v49
10 registers, 51 instructions:
0 r0 = splat 3F800000 (1)
1 r1 = splat 3B808081 (0.0039215689)
2 r2 = splat 437F0000 (255)
3 r3 = splat FF (3.5733111e-43)
loop:
4 r4 = load32 arg(0)
5 r5 = shr_i32 r4 24
6 r5 = to_f32 r5
7 r6 = fnma_f32 r5 r1 r0
8 r7 = load32 arg(1)
9 r8 = shr_i32 r7 24
10 r8 = to_f32 r8
11 r8 = mul_f32 r8 r1
12 r5 = mul_f32 r5 r1
13 r5 = fma_f32 r8 r6 r5
14 r5 = mul_f32 r5 r2
15 r8 = shr_i32 r7 16
16 r8 = bit_and r3 r8
17 r8 = to_f32 r8
18 r8 = mul_f32 r8 r1
19 r9 = shr_i32 r4 16
20 r9 = bit_and r3 r9
21 r9 = to_f32 r9
22 r9 = mul_f32 r9 r1
23 r9 = fma_f32 r8 r6 r9
24 r5 = round r5
25 r9 = mul_f32 r9 r2
26 r9 = round r9
27 r5 = pack r9 r5 8
28 r9 = shr_i32 r7 8
29 r9 = bit_and r3 r9
30 r9 = to_f32 r9
31 r9 = mul_f32 r9 r1
32 r8 = shr_i32 r4 8
33 r8 = bit_and r3 r8
34 r8 = to_f32 r8
35 r8 = mul_f32 r8 r1
36 r8 = fma_f32 r9 r6 r8
37 r7 = bit_and r3 r7
38 r7 = to_f32 r7
39 r7 = mul_f32 r7 r1
40 r4 = bit_and r3 r4
41 r4 = to_f32 r4
42 r4 = mul_f32 r4 r1
43 r4 = fma_f32 r7 r6 r4
44 r8 = mul_f32 r8 r2
45 r8 = round r8
46 r4 = mul_f32 r4 r2
47 r4 = round r4
48 r8 = pack r4 r8 8
49 r5 = pack r8 r5 16
50 store32 arg(1) r5
I32 (Naive) 8888 over 8888
33 values (originally 33):
v0 = load32 arg(0)
v1 = shr_i32 v0 24
↑ v2 = splat 100 (3.5873241e-43)
v3 = sub_i32 v2 v1
v4 = load32 arg(1)
v5 = shr_i32 v4 16
↑ v6 = splat FF (3.5733111e-43)
v7 = bit_and v6 v5
v8 = mul_i32 v7 v3
v9 = shr_i32 v8 8
v10 = shr_i32 v0 16
v11 = bit_and v6 v10
v12 = add_i32 v11 v9
v13 = shr_i32 v4 24
v14 = mul_i32 v13 v3
v15 = shr_i32 v14 8
v16 = add_i32 v1 v15
v17 = pack v12 v16 8
v18 = shr_i32 v4 8
v19 = bit_and v6 v18
v20 = mul_i32 v19 v3
v21 = shr_i32 v20 8
v22 = shr_i32 v0 8
v23 = bit_and v6 v22
v24 = add_i32 v23 v21
v25 = bit_and v6 v4
v26 = mul_i32 v25 v3
v27 = shr_i32 v26 8
v28 = bit_and v6 v0
v29 = add_i32 v28 v27
v30 = pack v29 v24 8
v31 = pack v30 v17 16
store32 arg(1) v31
8 registers, 33 instructions:
0 r0 = splat 100 (3.5873241e-43)
1 r1 = splat FF (3.5733111e-43)
loop:
2 r2 = load32 arg(0)
3 r3 = shr_i32 r2 24
4 r4 = sub_i32 r0 r3
5 r5 = load32 arg(1)
6 r6 = shr_i32 r5 16
7 r6 = bit_and r1 r6
8 r6 = mul_i32 r6 r4
9 r6 = shr_i32 r6 8
10 r7 = shr_i32 r2 16
11 r7 = bit_and r1 r7
12 r6 = add_i32 r7 r6
13 r7 = shr_i32 r5 24
14 r7 = mul_i32 r7 r4
15 r7 = shr_i32 r7 8
16 r7 = add_i32 r3 r7
17 r7 = pack r6 r7 8
18 r6 = shr_i32 r5 8
19 r6 = bit_and r1 r6
20 r6 = mul_i32 r6 r4
21 r6 = shr_i32 r6 8
22 r3 = shr_i32 r2 8
23 r3 = bit_and r1 r3
24 r6 = add_i32 r3 r6
25 r5 = bit_and r1 r5
26 r4 = mul_i32 r5 r4
27 r4 = shr_i32 r4 8
28 r2 = bit_and r1 r2
29 r4 = add_i32 r2 r4
30 r6 = pack r4 r6 8
31 r7 = pack r6 r7 16
32 store32 arg(1) r7
I32 8888 over 8888
33 values (originally 33):
v0 = load32 arg(0)
v1 = shr_i32 v0 24
↑ v2 = splat 100 (3.5873241e-43)
v3 = sub_i32 v2 v1
v4 = load32 arg(1)
v5 = shr_i32 v4 16
↑ v6 = splat FF (3.5733111e-43)
v7 = bit_and v6 v5
v8 = mul_i16x2 v7 v3
v9 = shr_i32 v8 8
v10 = shr_i32 v0 16
v11 = bit_and v6 v10
v12 = add_i32 v11 v9
v13 = shr_i32 v4 24
v14 = mul_i16x2 v13 v3
v15 = shr_i32 v14 8
v16 = add_i32 v1 v15
v17 = pack v12 v16 8
v18 = shr_i32 v4 8
v19 = bit_and v6 v18
v20 = mul_i16x2 v19 v3
v21 = shr_i32 v20 8
v22 = shr_i32 v0 8
v23 = bit_and v6 v22
v24 = add_i32 v23 v21
v25 = bit_and v6 v4
v26 = mul_i16x2 v25 v3
v27 = shr_i32 v26 8
v28 = bit_and v6 v0
v29 = add_i32 v28 v27
v30 = pack v29 v24 8
v31 = pack v30 v17 16
store32 arg(1) v31
8 registers, 33 instructions:
0 r0 = splat 100 (3.5873241e-43)
1 r1 = splat FF (3.5733111e-43)
loop:
2 r2 = load32 arg(0)
3 r3 = shr_i32 r2 24
4 r4 = sub_i32 r0 r3
5 r5 = load32 arg(1)
6 r6 = shr_i32 r5 16
7 r6 = bit_and r1 r6
8 r6 = mul_i16x2 r6 r4
9 r6 = shr_i32 r6 8
10 r7 = shr_i32 r2 16
11 r7 = bit_and r1 r7
12 r6 = add_i32 r7 r6
13 r7 = shr_i32 r5 24
14 r7 = mul_i16x2 r7 r4
15 r7 = shr_i32 r7 8
16 r7 = add_i32 r3 r7
17 r7 = pack r6 r7 8
18 r6 = shr_i32 r5 8
19 r6 = bit_and r1 r6
20 r6 = mul_i16x2 r6 r4
21 r6 = shr_i32 r6 8
22 r3 = shr_i32 r2 8
23 r3 = bit_and r1 r3
24 r6 = add_i32 r3 r6
25 r5 = bit_and r1 r5
26 r4 = mul_i16x2 r5 r4
27 r4 = shr_i32 r4 8
28 r2 = bit_and r1 r2
29 r4 = add_i32 r2 r4
30 r6 = pack r4 r6 8
31 r7 = pack r6 r7 16
32 store32 arg(1) r7
I32 (SWAR) 8888 over 8888
19 values (originally 20):
v0 = load32 arg(0)
v1 = shr_i32 v0 8
↑ v2 = splat FF0000 (2.3418052e-38)
v3 = bit_and v2 v1
v4 = shr_i32 v0 24
v5 = bit_or v4 v3
↑ v6 = splat 1000100 (2.3510604e-38)
v7 = sub_i16x2 v6 v5
v8 = load32 arg(1)
v9 = shr_i16x2 v8 8
v10 = mul_i16x2 v9 v7
↑ v11 = splat FF00FF (2.3418409e-38)
v12 = bit_clear v10 v11
v13 = bit_and v8 v11
v14 = mul_i16x2 v13 v7
v15 = shr_i16x2 v14 8
v16 = bit_or v15 v12
v17 = add_i32 v0 v16
store32 arg(1) v17
7 registers, 19 instructions:
0 r0 = splat FF0000 (2.3418052e-38)
1 r1 = splat 1000100 (2.3510604e-38)
2 r2 = splat FF00FF (2.3418409e-38)
loop:
3 r3 = load32 arg(0)
4 r4 = shr_i32 r3 8
5 r4 = bit_and r0 r4
6 r5 = shr_i32 r3 24
7 r4 = bit_or r5 r4
8 r4 = sub_i16x2 r1 r4
9 r5 = load32 arg(1)
10 r6 = shr_i16x2 r5 8
11 r6 = mul_i16x2 r6 r4
12 r6 = bit_clear r6 r2
13 r5 = bit_and r5 r2
14 r4 = mul_i16x2 r5 r4
15 r4 = shr_i16x2 r4 8
16 r6 = bit_or r4 r6
17 r6 = add_i32 r3 r6
18 store32 arg(1) r6
23 values (originally 23):
v0 = load32 arg(1)
v1 = shr_i32 v0 24
v2 = load32 arg(0)
v3 = shr_i32 v2 24
v4 = add_i32 v3 v1
v5 = shr_i32 v0 16
↑ v6 = splat FF (3.5733111e-43)
v7 = bit_and v6 v5
v8 = shr_i32 v2 16
v9 = bit_and v6 v8
v10 = add_i32 v9 v7
v11 = pack v10 v4 8
v12 = shr_i32 v0 8
v13 = bit_and v6 v12
v14 = shr_i32 v2 8
v15 = bit_and v6 v14
v16 = add_i32 v15 v13
v17 = bit_and v6 v0
v18 = bit_and v6 v2
v19 = add_i32 v18 v17
v20 = pack v19 v16 8
v21 = pack v20 v11 16
store32 arg(1) v21
6 registers, 23 instructions:
0 r0 = splat FF (3.5733111e-43)
loop:
1 r1 = load32 arg(1)
2 r2 = shr_i32 r1 24
3 r3 = load32 arg(0)
4 r4 = shr_i32 r3 24
5 r2 = add_i32 r4 r2
6 r4 = shr_i32 r1 16
7 r4 = bit_and r0 r4
8 r5 = shr_i32 r3 16
9 r5 = bit_and r0 r5
10 r4 = add_i32 r5 r4
11 r2 = pack r4 r2 8
12 r4 = shr_i32 r1 8
13 r4 = bit_and r0 r4
14 r5 = shr_i32 r3 8
15 r5 = bit_and r0 r5
16 r4 = add_i32 r5 r4
17 r1 = bit_and r0 r1
18 r3 = bit_and r0 r3
19 r1 = add_i32 r3 r1
20 r4 = pack r1 r4 8
21 r2 = pack r4 r2 16
22 store32 arg(1) r2