cca2acfb77
bit_clear() is just another bit_and(), and bytes() is a way of expressing pshufb that we never really use (yet). Can always add them back later, but there's some extra complexity to think about for each that I'd like to not think about now: - common sub-expression elimination between bit_and and bit_clear - large constant management JIT'ing bytes Change-Id: I3a54afa963231fec1d5de949acc647e3430ed0d8 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/281557 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
795 lines
18 KiB
Plaintext
795 lines
18 KiB
Plaintext
A8 over A8
|
|
14 values (originally 17):
|
|
v0 = load8 arg(0)
|
|
v1 = to_f32 v0
|
|
↑ v2 = splat 3F800000 (1)
|
|
↑ v3 = splat 3B808081 (0.0039215689)
|
|
v4 = fnma_f32 v1 v3 v2
|
|
v5 = load8 arg(1)
|
|
v6 = to_f32 v5
|
|
v7 = mul_f32 v6 v3
|
|
v8 = mul_f32 v1 v3
|
|
v9 = fma_f32 v7 v4 v8
|
|
↑ v10 = splat 437F0000 (255)
|
|
v11 = mul_f32 v9 v10
|
|
v12 = round v11
|
|
store8 arg(1) v12
|
|
|
|
6 registers, 14 instructions:
|
|
0 r0 = splat 3F800000 (1)
|
|
1 r1 = splat 3B808081 (0.0039215689)
|
|
2 r2 = splat 437F0000 (255)
|
|
loop:
|
|
3 r3 = load8 arg(0)
|
|
4 r3 = to_f32 r3
|
|
5 r4 = fnma_f32 r3 r1 r0
|
|
6 r5 = load8 arg(1)
|
|
7 r5 = to_f32 r5
|
|
8 r5 = mul_f32 r5 r1
|
|
9 r3 = mul_f32 r3 r1
|
|
10 r3 = fma_f32 r5 r4 r3
|
|
11 r3 = mul_f32 r3 r2
|
|
12 r3 = round r3
|
|
13 store8 arg(1) r3
|
|
|
|
A8 over G8
|
|
19 values (originally 24):
|
|
v0 = load8 arg(0)
|
|
v1 = to_f32 v0
|
|
↑ v2 = splat 3F800000 (1)
|
|
↑ v3 = splat 3B808081 (0.0039215689)
|
|
v4 = fnma_f32 v1 v3 v2
|
|
v5 = load8 arg(1)
|
|
v6 = to_f32 v5
|
|
v7 = mul_f32 v6 v3
|
|
v8 = mul_f32 v7 v4
|
|
↑ v9 = splat 3D93DD98 (0.0722)
|
|
v10 = mul_f32 v8 v9
|
|
↑ v11 = splat 3F371759 (0.71520001)
|
|
v12 = fma_f32 v8 v11 v10
|
|
↑ v13 = splat 3E59B3D0 (0.21259999)
|
|
v14 = fma_f32 v8 v13 v12
|
|
↑ v15 = splat 437F0000 (255)
|
|
v16 = mul_f32 v14 v15
|
|
v17 = round v16
|
|
store8 arg(1) v17
|
|
|
|
8 registers, 19 instructions:
|
|
0 r0 = splat 3F800000 (1)
|
|
1 r1 = splat 3B808081 (0.0039215689)
|
|
2 r2 = splat 3D93DD98 (0.0722)
|
|
3 r3 = splat 3F371759 (0.71520001)
|
|
4 r4 = splat 3E59B3D0 (0.21259999)
|
|
5 r5 = splat 437F0000 (255)
|
|
loop:
|
|
6 r6 = load8 arg(0)
|
|
7 r6 = to_f32 r6
|
|
8 r6 = fnma_f32 r6 r1 r0
|
|
9 r7 = load8 arg(1)
|
|
10 r7 = to_f32 r7
|
|
11 r7 = mul_f32 r7 r1
|
|
12 r6 = mul_f32 r7 r6
|
|
13 r7 = mul_f32 r6 r2
|
|
14 r7 = fma_f32 r6 r3 r7
|
|
15 r7 = fma_f32 r6 r4 r7
|
|
16 r7 = mul_f32 r7 r5
|
|
17 r7 = round r7
|
|
18 store8 arg(1) r7
|
|
|
|
A8 over RGBA_8888
|
|
39 values (originally 41):
|
|
v0 = load8 arg(0)
|
|
v1 = to_f32 v0
|
|
↑ v2 = splat 3F800000 (1)
|
|
↑ v3 = splat 3B808081 (0.0039215689)
|
|
v4 = fnma_f32 v1 v3 v2
|
|
v5 = load32 arg(1)
|
|
v6 = shr_i32 v5 24
|
|
v7 = to_f32 v6
|
|
v8 = mul_f32 v7 v3
|
|
v9 = mul_f32 v1 v3
|
|
v10 = fma_f32 v8 v4 v9
|
|
↑ v11 = splat 437F0000 (255)
|
|
v12 = mul_f32 v10 v11
|
|
v13 = shr_i32 v5 16
|
|
↑ v14 = splat FF (3.5733111e-43)
|
|
v15 = bit_and v14 v13
|
|
v16 = round v12
|
|
v17 = to_f32 v15
|
|
v18 = mul_f32 v17 v3
|
|
v19 = mul_f32 v18 v4
|
|
v20 = mul_f32 v19 v11
|
|
v21 = round v20
|
|
v22 = pack v21 v16 8
|
|
v23 = shr_i32 v5 8
|
|
v24 = bit_and v14 v23
|
|
v25 = to_f32 v24
|
|
v26 = mul_f32 v25 v3
|
|
v27 = mul_f32 v26 v4
|
|
v28 = mul_f32 v27 v11
|
|
v29 = round v28
|
|
v30 = bit_and v14 v5
|
|
v31 = to_f32 v30
|
|
v32 = mul_f32 v31 v3
|
|
v33 = mul_f32 v32 v4
|
|
v34 = mul_f32 v33 v11
|
|
v35 = round v34
|
|
v36 = pack v35 v29 8
|
|
v37 = pack v36 v22 16
|
|
store32 arg(1) v37
|
|
|
|
8 registers, 39 instructions:
|
|
0 r0 = splat 3F800000 (1)
|
|
1 r1 = splat 3B808081 (0.0039215689)
|
|
2 r2 = splat 437F0000 (255)
|
|
3 r3 = splat FF (3.5733111e-43)
|
|
loop:
|
|
4 r4 = load8 arg(0)
|
|
5 r4 = to_f32 r4
|
|
6 r5 = fnma_f32 r4 r1 r0
|
|
7 r6 = load32 arg(1)
|
|
8 r7 = shr_i32 r6 24
|
|
9 r7 = to_f32 r7
|
|
10 r7 = mul_f32 r7 r1
|
|
11 r4 = mul_f32 r4 r1
|
|
12 r4 = fma_f32 r7 r5 r4
|
|
13 r4 = mul_f32 r4 r2
|
|
14 r7 = shr_i32 r6 16
|
|
15 r7 = bit_and r3 r7
|
|
16 r4 = round r4
|
|
17 r7 = to_f32 r7
|
|
18 r7 = mul_f32 r7 r1
|
|
19 r7 = mul_f32 r7 r5
|
|
20 r7 = mul_f32 r7 r2
|
|
21 r7 = round r7
|
|
22 r4 = pack r7 r4 8
|
|
23 r7 = shr_i32 r6 8
|
|
24 r7 = bit_and r3 r7
|
|
25 r7 = to_f32 r7
|
|
26 r7 = mul_f32 r7 r1
|
|
27 r7 = mul_f32 r7 r5
|
|
28 r7 = mul_f32 r7 r2
|
|
29 r7 = round r7
|
|
30 r6 = bit_and r3 r6
|
|
31 r6 = to_f32 r6
|
|
32 r6 = mul_f32 r6 r1
|
|
33 r5 = mul_f32 r6 r5
|
|
34 r5 = mul_f32 r5 r2
|
|
35 r5 = round r5
|
|
36 r7 = pack r5 r7 8
|
|
37 r4 = pack r7 r4 16
|
|
38 store32 arg(1) r4
|
|
|
|
G8 over A8
|
|
11 values (originally 15):
|
|
v0 = load8 arg(1)
|
|
v1 = to_f32 v0
|
|
↑ v2 = splat 3B808081 (0.0039215689)
|
|
v3 = mul_f32 v1 v2
|
|
↑ v4 = splat 0 (0)
|
|
↑ v5 = splat 3F800000 (1)
|
|
v6 = fma_f32 v3 v4 v5
|
|
↑ v7 = splat 437F0000 (255)
|
|
v8 = mul_f32 v6 v7
|
|
v9 = round v8
|
|
store8 arg(1) v9
|
|
|
|
5 registers, 11 instructions:
|
|
0 r0 = splat 3B808081 (0.0039215689)
|
|
1 r1 = splat 0 (0)
|
|
2 r2 = splat 3F800000 (1)
|
|
3 r3 = splat 437F0000 (255)
|
|
loop:
|
|
4 r4 = load8 arg(1)
|
|
5 r4 = to_f32 r4
|
|
6 r4 = mul_f32 r4 r0
|
|
7 r4 = fma_f32 r4 r1 r2
|
|
8 r4 = mul_f32 r4 r3
|
|
9 r4 = round r4
|
|
10 store8 arg(1) r4
|
|
|
|
G8 over G8
|
|
19 values (originally 23):
|
|
v0 = load8 arg(1)
|
|
v1 = to_f32 v0
|
|
↑ v2 = splat 3B808081 (0.0039215689)
|
|
v3 = mul_f32 v1 v2
|
|
v4 = load8 arg(0)
|
|
v5 = to_f32 v4
|
|
v6 = mul_f32 v5 v2
|
|
↑ v7 = splat 0 (0)
|
|
v8 = fma_f32 v3 v7 v6
|
|
↑ v9 = splat 3D93DD98 (0.0722)
|
|
v10 = mul_f32 v8 v9
|
|
↑ v11 = splat 3F371759 (0.71520001)
|
|
v12 = fma_f32 v8 v11 v10
|
|
↑ v13 = splat 3E59B3D0 (0.21259999)
|
|
v14 = fma_f32 v8 v13 v12
|
|
↑ v15 = splat 437F0000 (255)
|
|
v16 = mul_f32 v14 v15
|
|
v17 = round v16
|
|
store8 arg(1) v17
|
|
|
|
8 registers, 19 instructions:
|
|
0 r0 = splat 3B808081 (0.0039215689)
|
|
1 r1 = splat 0 (0)
|
|
2 r2 = splat 3D93DD98 (0.0722)
|
|
3 r3 = splat 3F371759 (0.71520001)
|
|
4 r4 = splat 3E59B3D0 (0.21259999)
|
|
5 r5 = splat 437F0000 (255)
|
|
loop:
|
|
6 r6 = load8 arg(1)
|
|
7 r6 = to_f32 r6
|
|
8 r6 = mul_f32 r6 r0
|
|
9 r7 = load8 arg(0)
|
|
10 r7 = to_f32 r7
|
|
11 r7 = mul_f32 r7 r0
|
|
12 r7 = fma_f32 r6 r1 r7
|
|
13 r6 = mul_f32 r7 r2
|
|
14 r6 = fma_f32 r7 r3 r6
|
|
15 r6 = fma_f32 r7 r4 r6
|
|
16 r6 = mul_f32 r6 r5
|
|
17 r6 = round r6
|
|
18 store8 arg(1) r6
|
|
|
|
G8 over RGBA_8888
|
|
39 values (originally 43):
|
|
v0 = load32 arg(1)
|
|
v1 = shr_i32 v0 24
|
|
v2 = to_f32 v1
|
|
↑ v3 = splat 3B808081 (0.0039215689)
|
|
v4 = mul_f32 v2 v3
|
|
↑ v5 = splat 0 (0)
|
|
↑ v6 = splat 3F800000 (1)
|
|
v7 = fma_f32 v4 v5 v6
|
|
↑ v8 = splat 437F0000 (255)
|
|
v9 = mul_f32 v7 v8
|
|
v10 = shr_i32 v0 16
|
|
↑ v11 = splat FF (3.5733111e-43)
|
|
v12 = bit_and v11 v10
|
|
v13 = to_f32 v12
|
|
v14 = mul_f32 v13 v3
|
|
v15 = load8 arg(0)
|
|
v16 = to_f32 v15
|
|
v17 = mul_f32 v16 v3
|
|
v18 = fma_f32 v14 v5 v17
|
|
v19 = round v9
|
|
v20 = mul_f32 v18 v8
|
|
v21 = round v20
|
|
v22 = pack v21 v19 8
|
|
v23 = shr_i32 v0 8
|
|
v24 = bit_and v11 v23
|
|
v25 = to_f32 v24
|
|
v26 = mul_f32 v25 v3
|
|
v27 = fma_f32 v26 v5 v17
|
|
v28 = mul_f32 v27 v8
|
|
v29 = round v28
|
|
v30 = bit_and v11 v0
|
|
v31 = to_f32 v30
|
|
v32 = mul_f32 v31 v3
|
|
v33 = fma_f32 v32 v5 v17
|
|
v34 = mul_f32 v33 v8
|
|
v35 = round v34
|
|
v36 = pack v35 v29 8
|
|
v37 = pack v36 v22 16
|
|
store32 arg(1) v37
|
|
|
|
9 registers, 39 instructions:
|
|
0 r0 = splat 3B808081 (0.0039215689)
|
|
1 r1 = splat 0 (0)
|
|
2 r2 = splat 3F800000 (1)
|
|
3 r3 = splat 437F0000 (255)
|
|
4 r4 = splat FF (3.5733111e-43)
|
|
loop:
|
|
5 r5 = load32 arg(1)
|
|
6 r6 = shr_i32 r5 24
|
|
7 r6 = to_f32 r6
|
|
8 r6 = mul_f32 r6 r0
|
|
9 r6 = fma_f32 r6 r1 r2
|
|
10 r6 = mul_f32 r6 r3
|
|
11 r7 = shr_i32 r5 16
|
|
12 r7 = bit_and r4 r7
|
|
13 r7 = to_f32 r7
|
|
14 r7 = mul_f32 r7 r0
|
|
15 r8 = load8 arg(0)
|
|
16 r8 = to_f32 r8
|
|
17 r8 = mul_f32 r8 r0
|
|
18 r7 = fma_f32 r7 r1 r8
|
|
19 r6 = round r6
|
|
20 r7 = mul_f32 r7 r3
|
|
21 r7 = round r7
|
|
22 r6 = pack r7 r6 8
|
|
23 r7 = shr_i32 r5 8
|
|
24 r7 = bit_and r4 r7
|
|
25 r7 = to_f32 r7
|
|
26 r7 = mul_f32 r7 r0
|
|
27 r7 = fma_f32 r7 r1 r8
|
|
28 r7 = mul_f32 r7 r3
|
|
29 r7 = round r7
|
|
30 r5 = bit_and r4 r5
|
|
31 r5 = to_f32 r5
|
|
32 r5 = mul_f32 r5 r0
|
|
33 r8 = fma_f32 r5 r1 r8
|
|
34 r8 = mul_f32 r8 r3
|
|
35 r8 = round r8
|
|
36 r7 = pack r8 r7 8
|
|
37 r6 = pack r7 r6 16
|
|
38 store32 arg(1) r6
|
|
|
|
RGBA_8888 over A8
|
|
15 values (originally 33):
|
|
v0 = load32 arg(0)
|
|
v1 = shr_i32 v0 24
|
|
v2 = to_f32 v1
|
|
↑ v3 = splat 3F800000 (1)
|
|
↑ v4 = splat 3B808081 (0.0039215689)
|
|
v5 = fnma_f32 v2 v4 v3
|
|
v6 = load8 arg(1)
|
|
v7 = to_f32 v6
|
|
v8 = mul_f32 v7 v4
|
|
v9 = mul_f32 v2 v4
|
|
v10 = fma_f32 v8 v5 v9
|
|
↑ v11 = splat 437F0000 (255)
|
|
v12 = mul_f32 v10 v11
|
|
v13 = round v12
|
|
store8 arg(1) v13
|
|
|
|
6 registers, 15 instructions:
|
|
0 r0 = splat 3F800000 (1)
|
|
1 r1 = splat 3B808081 (0.0039215689)
|
|
2 r2 = splat 437F0000 (255)
|
|
loop:
|
|
3 r3 = load32 arg(0)
|
|
4 r3 = shr_i32 r3 24
|
|
5 r3 = to_f32 r3
|
|
6 r4 = fnma_f32 r3 r1 r0
|
|
7 r5 = load8 arg(1)
|
|
8 r5 = to_f32 r5
|
|
9 r5 = mul_f32 r5 r1
|
|
10 r3 = mul_f32 r3 r1
|
|
11 r3 = fma_f32 r5 r4 r3
|
|
12 r3 = mul_f32 r3 r2
|
|
13 r3 = round r3
|
|
14 store8 arg(1) r3
|
|
|
|
RGBA_8888 over G8
|
|
34 values (originally 39):
|
|
v0 = load32 arg(0)
|
|
v1 = shr_i32 v0 24
|
|
v2 = to_f32 v1
|
|
↑ v3 = splat 3F800000 (1)
|
|
↑ v4 = splat 3B808081 (0.0039215689)
|
|
v5 = fnma_f32 v2 v4 v3
|
|
v6 = shr_i32 v0 16
|
|
↑ v7 = splat FF (3.5733111e-43)
|
|
v8 = bit_and v7 v6
|
|
v9 = load8 arg(1)
|
|
v10 = to_f32 v9
|
|
v11 = mul_f32 v10 v4
|
|
v12 = to_f32 v8
|
|
v13 = mul_f32 v12 v4
|
|
v14 = fma_f32 v11 v5 v13
|
|
↑ v15 = splat 3D93DD98 (0.0722)
|
|
v16 = mul_f32 v14 v15
|
|
v17 = shr_i32 v0 8
|
|
v18 = bit_and v7 v17
|
|
v19 = to_f32 v18
|
|
v20 = mul_f32 v19 v4
|
|
v21 = fma_f32 v11 v5 v20
|
|
↑ v22 = splat 3F371759 (0.71520001)
|
|
v23 = fma_f32 v21 v22 v16
|
|
v24 = bit_and v7 v0
|
|
v25 = to_f32 v24
|
|
v26 = mul_f32 v25 v4
|
|
v27 = fma_f32 v11 v5 v26
|
|
↑ v28 = splat 3E59B3D0 (0.21259999)
|
|
v29 = fma_f32 v27 v28 v23
|
|
↑ v30 = splat 437F0000 (255)
|
|
v31 = mul_f32 v29 v30
|
|
v32 = round v31
|
|
store8 arg(1) v32
|
|
|
|
12 registers, 34 instructions:
|
|
0 r0 = splat 3F800000 (1)
|
|
1 r1 = splat 3B808081 (0.0039215689)
|
|
2 r2 = splat FF (3.5733111e-43)
|
|
3 r3 = splat 3D93DD98 (0.0722)
|
|
4 r4 = splat 3F371759 (0.71520001)
|
|
5 r5 = splat 3E59B3D0 (0.21259999)
|
|
6 r6 = splat 437F0000 (255)
|
|
loop:
|
|
7 r7 = load32 arg(0)
|
|
8 r8 = shr_i32 r7 24
|
|
9 r8 = to_f32 r8
|
|
10 r8 = fnma_f32 r8 r1 r0
|
|
11 r9 = shr_i32 r7 16
|
|
12 r9 = bit_and r2 r9
|
|
13 r10 = load8 arg(1)
|
|
14 r10 = to_f32 r10
|
|
15 r10 = mul_f32 r10 r1
|
|
16 r9 = to_f32 r9
|
|
17 r9 = mul_f32 r9 r1
|
|
18 r9 = fma_f32 r10 r8 r9
|
|
19 r9 = mul_f32 r9 r3
|
|
20 r11 = shr_i32 r7 8
|
|
21 r11 = bit_and r2 r11
|
|
22 r11 = to_f32 r11
|
|
23 r11 = mul_f32 r11 r1
|
|
24 r11 = fma_f32 r10 r8 r11
|
|
25 r9 = fma_f32 r11 r4 r9
|
|
26 r7 = bit_and r2 r7
|
|
27 r7 = to_f32 r7
|
|
28 r7 = mul_f32 r7 r1
|
|
29 r7 = fma_f32 r10 r8 r7
|
|
30 r9 = fma_f32 r7 r5 r9
|
|
31 r9 = mul_f32 r9 r6
|
|
32 r9 = round r9
|
|
33 store8 arg(1) r9
|
|
|
|
RGBA_8888 over RGBA_8888
|
|
51 values (originally 55):
|
|
v0 = load32 arg(0)
|
|
v1 = shr_i32 v0 24
|
|
v2 = to_f32 v1
|
|
↑ v3 = splat 3F800000 (1)
|
|
↑ v4 = splat 3B808081 (0.0039215689)
|
|
v5 = fnma_f32 v2 v4 v3
|
|
v6 = load32 arg(1)
|
|
v7 = shr_i32 v6 24
|
|
v8 = to_f32 v7
|
|
v9 = mul_f32 v8 v4
|
|
v10 = mul_f32 v2 v4
|
|
v11 = fma_f32 v9 v5 v10
|
|
↑ v12 = splat 437F0000 (255)
|
|
v13 = mul_f32 v11 v12
|
|
v14 = shr_i32 v6 16
|
|
↑ v15 = splat FF (3.5733111e-43)
|
|
v16 = bit_and v15 v14
|
|
v17 = to_f32 v16
|
|
v18 = mul_f32 v17 v4
|
|
v19 = shr_i32 v0 16
|
|
v20 = bit_and v15 v19
|
|
v21 = to_f32 v20
|
|
v22 = mul_f32 v21 v4
|
|
v23 = fma_f32 v18 v5 v22
|
|
v24 = round v13
|
|
v25 = mul_f32 v23 v12
|
|
v26 = round v25
|
|
v27 = pack v26 v24 8
|
|
v28 = shr_i32 v6 8
|
|
v29 = bit_and v15 v28
|
|
v30 = to_f32 v29
|
|
v31 = mul_f32 v30 v4
|
|
v32 = shr_i32 v0 8
|
|
v33 = bit_and v15 v32
|
|
v34 = to_f32 v33
|
|
v35 = mul_f32 v34 v4
|
|
v36 = fma_f32 v31 v5 v35
|
|
v37 = bit_and v15 v6
|
|
v38 = to_f32 v37
|
|
v39 = mul_f32 v38 v4
|
|
v40 = bit_and v15 v0
|
|
v41 = to_f32 v40
|
|
v42 = mul_f32 v41 v4
|
|
v43 = fma_f32 v39 v5 v42
|
|
v44 = mul_f32 v36 v12
|
|
v45 = round v44
|
|
v46 = mul_f32 v43 v12
|
|
v47 = round v46
|
|
v48 = pack v47 v45 8
|
|
v49 = pack v48 v27 16
|
|
store32 arg(1) v49
|
|
|
|
10 registers, 51 instructions:
|
|
0 r0 = splat 3F800000 (1)
|
|
1 r1 = splat 3B808081 (0.0039215689)
|
|
2 r2 = splat 437F0000 (255)
|
|
3 r3 = splat FF (3.5733111e-43)
|
|
loop:
|
|
4 r4 = load32 arg(0)
|
|
5 r5 = shr_i32 r4 24
|
|
6 r5 = to_f32 r5
|
|
7 r6 = fnma_f32 r5 r1 r0
|
|
8 r7 = load32 arg(1)
|
|
9 r8 = shr_i32 r7 24
|
|
10 r8 = to_f32 r8
|
|
11 r8 = mul_f32 r8 r1
|
|
12 r5 = mul_f32 r5 r1
|
|
13 r5 = fma_f32 r8 r6 r5
|
|
14 r5 = mul_f32 r5 r2
|
|
15 r8 = shr_i32 r7 16
|
|
16 r8 = bit_and r3 r8
|
|
17 r8 = to_f32 r8
|
|
18 r8 = mul_f32 r8 r1
|
|
19 r9 = shr_i32 r4 16
|
|
20 r9 = bit_and r3 r9
|
|
21 r9 = to_f32 r9
|
|
22 r9 = mul_f32 r9 r1
|
|
23 r9 = fma_f32 r8 r6 r9
|
|
24 r5 = round r5
|
|
25 r9 = mul_f32 r9 r2
|
|
26 r9 = round r9
|
|
27 r5 = pack r9 r5 8
|
|
28 r9 = shr_i32 r7 8
|
|
29 r9 = bit_and r3 r9
|
|
30 r9 = to_f32 r9
|
|
31 r9 = mul_f32 r9 r1
|
|
32 r8 = shr_i32 r4 8
|
|
33 r8 = bit_and r3 r8
|
|
34 r8 = to_f32 r8
|
|
35 r8 = mul_f32 r8 r1
|
|
36 r8 = fma_f32 r9 r6 r8
|
|
37 r7 = bit_and r3 r7
|
|
38 r7 = to_f32 r7
|
|
39 r7 = mul_f32 r7 r1
|
|
40 r4 = bit_and r3 r4
|
|
41 r4 = to_f32 r4
|
|
42 r4 = mul_f32 r4 r1
|
|
43 r4 = fma_f32 r7 r6 r4
|
|
44 r8 = mul_f32 r8 r2
|
|
45 r8 = round r8
|
|
46 r4 = mul_f32 r4 r2
|
|
47 r4 = round r4
|
|
48 r8 = pack r4 r8 8
|
|
49 r5 = pack r8 r5 16
|
|
50 store32 arg(1) r5
|
|
|
|
I32 (Naive) 8888 over 8888
|
|
33 values (originally 33):
|
|
v0 = load32 arg(0)
|
|
v1 = shr_i32 v0 24
|
|
↑ v2 = splat 100 (3.5873241e-43)
|
|
v3 = sub_i32 v2 v1
|
|
v4 = load32 arg(1)
|
|
v5 = shr_i32 v4 16
|
|
↑ v6 = splat FF (3.5733111e-43)
|
|
v7 = bit_and v6 v5
|
|
v8 = mul_i32 v7 v3
|
|
v9 = shr_i32 v8 8
|
|
v10 = shr_i32 v0 16
|
|
v11 = bit_and v6 v10
|
|
v12 = add_i32 v11 v9
|
|
v13 = shr_i32 v4 24
|
|
v14 = mul_i32 v13 v3
|
|
v15 = shr_i32 v14 8
|
|
v16 = add_i32 v1 v15
|
|
v17 = pack v12 v16 8
|
|
v18 = shr_i32 v4 8
|
|
v19 = bit_and v6 v18
|
|
v20 = mul_i32 v19 v3
|
|
v21 = shr_i32 v20 8
|
|
v22 = shr_i32 v0 8
|
|
v23 = bit_and v6 v22
|
|
v24 = add_i32 v23 v21
|
|
v25 = bit_and v6 v4
|
|
v26 = mul_i32 v25 v3
|
|
v27 = shr_i32 v26 8
|
|
v28 = bit_and v6 v0
|
|
v29 = add_i32 v28 v27
|
|
v30 = pack v29 v24 8
|
|
v31 = pack v30 v17 16
|
|
store32 arg(1) v31
|
|
|
|
8 registers, 33 instructions:
|
|
0 r0 = splat 100 (3.5873241e-43)
|
|
1 r1 = splat FF (3.5733111e-43)
|
|
loop:
|
|
2 r2 = load32 arg(0)
|
|
3 r3 = shr_i32 r2 24
|
|
4 r4 = sub_i32 r0 r3
|
|
5 r5 = load32 arg(1)
|
|
6 r6 = shr_i32 r5 16
|
|
7 r6 = bit_and r1 r6
|
|
8 r6 = mul_i32 r6 r4
|
|
9 r6 = shr_i32 r6 8
|
|
10 r7 = shr_i32 r2 16
|
|
11 r7 = bit_and r1 r7
|
|
12 r6 = add_i32 r7 r6
|
|
13 r7 = shr_i32 r5 24
|
|
14 r7 = mul_i32 r7 r4
|
|
15 r7 = shr_i32 r7 8
|
|
16 r7 = add_i32 r3 r7
|
|
17 r7 = pack r6 r7 8
|
|
18 r6 = shr_i32 r5 8
|
|
19 r6 = bit_and r1 r6
|
|
20 r6 = mul_i32 r6 r4
|
|
21 r6 = shr_i32 r6 8
|
|
22 r3 = shr_i32 r2 8
|
|
23 r3 = bit_and r1 r3
|
|
24 r6 = add_i32 r3 r6
|
|
25 r5 = bit_and r1 r5
|
|
26 r4 = mul_i32 r5 r4
|
|
27 r4 = shr_i32 r4 8
|
|
28 r2 = bit_and r1 r2
|
|
29 r4 = add_i32 r2 r4
|
|
30 r6 = pack r4 r6 8
|
|
31 r7 = pack r6 r7 16
|
|
32 store32 arg(1) r7
|
|
|
|
I32 8888 over 8888
|
|
33 values (originally 33):
|
|
v0 = load32 arg(0)
|
|
v1 = shr_i32 v0 24
|
|
↑ v2 = splat 100 (3.5873241e-43)
|
|
v3 = sub_i32 v2 v1
|
|
v4 = load32 arg(1)
|
|
v5 = shr_i32 v4 16
|
|
↑ v6 = splat FF (3.5733111e-43)
|
|
v7 = bit_and v6 v5
|
|
v8 = mul_i16x2 v7 v3
|
|
v9 = shr_i32 v8 8
|
|
v10 = shr_i32 v0 16
|
|
v11 = bit_and v6 v10
|
|
v12 = add_i32 v11 v9
|
|
v13 = shr_i32 v4 24
|
|
v14 = mul_i16x2 v13 v3
|
|
v15 = shr_i32 v14 8
|
|
v16 = add_i32 v1 v15
|
|
v17 = pack v12 v16 8
|
|
v18 = shr_i32 v4 8
|
|
v19 = bit_and v6 v18
|
|
v20 = mul_i16x2 v19 v3
|
|
v21 = shr_i32 v20 8
|
|
v22 = shr_i32 v0 8
|
|
v23 = bit_and v6 v22
|
|
v24 = add_i32 v23 v21
|
|
v25 = bit_and v6 v4
|
|
v26 = mul_i16x2 v25 v3
|
|
v27 = shr_i32 v26 8
|
|
v28 = bit_and v6 v0
|
|
v29 = add_i32 v28 v27
|
|
v30 = pack v29 v24 8
|
|
v31 = pack v30 v17 16
|
|
store32 arg(1) v31
|
|
|
|
8 registers, 33 instructions:
|
|
0 r0 = splat 100 (3.5873241e-43)
|
|
1 r1 = splat FF (3.5733111e-43)
|
|
loop:
|
|
2 r2 = load32 arg(0)
|
|
3 r3 = shr_i32 r2 24
|
|
4 r4 = sub_i32 r0 r3
|
|
5 r5 = load32 arg(1)
|
|
6 r6 = shr_i32 r5 16
|
|
7 r6 = bit_and r1 r6
|
|
8 r6 = mul_i16x2 r6 r4
|
|
9 r6 = shr_i32 r6 8
|
|
10 r7 = shr_i32 r2 16
|
|
11 r7 = bit_and r1 r7
|
|
12 r6 = add_i32 r7 r6
|
|
13 r7 = shr_i32 r5 24
|
|
14 r7 = mul_i16x2 r7 r4
|
|
15 r7 = shr_i32 r7 8
|
|
16 r7 = add_i32 r3 r7
|
|
17 r7 = pack r6 r7 8
|
|
18 r6 = shr_i32 r5 8
|
|
19 r6 = bit_and r1 r6
|
|
20 r6 = mul_i16x2 r6 r4
|
|
21 r6 = shr_i32 r6 8
|
|
22 r3 = shr_i32 r2 8
|
|
23 r3 = bit_and r1 r3
|
|
24 r6 = add_i32 r3 r6
|
|
25 r5 = bit_and r1 r5
|
|
26 r4 = mul_i16x2 r5 r4
|
|
27 r4 = shr_i32 r4 8
|
|
28 r2 = bit_and r1 r2
|
|
29 r4 = add_i32 r2 r4
|
|
30 r6 = pack r4 r6 8
|
|
31 r7 = pack r6 r7 16
|
|
32 store32 arg(1) r7
|
|
|
|
I32 (SWAR) 8888 over 8888
|
|
20 values (originally 21):
|
|
v0 = load32 arg(0)
|
|
v1 = shr_i32 v0 8
|
|
↑ v2 = splat FF0000 (2.3418052e-38)
|
|
v3 = bit_and v2 v1
|
|
v4 = shr_i32 v0 24
|
|
v5 = bit_or v4 v3
|
|
↑ v6 = splat 1000100 (2.3510604e-38)
|
|
v7 = sub_i16x2 v6 v5
|
|
v8 = load32 arg(1)
|
|
v9 = shr_i16x2 v8 8
|
|
v10 = mul_i16x2 v9 v7
|
|
↑ v11 = splat FF00FF00 (-1.7146522e+38)
|
|
v12 = bit_and v10 v11
|
|
↑ v13 = splat FF00FF (2.3418409e-38)
|
|
v14 = bit_and v8 v13
|
|
v15 = mul_i16x2 v14 v7
|
|
v16 = shr_i16x2 v15 8
|
|
v17 = bit_or v16 v12
|
|
v18 = add_i32 v0 v17
|
|
store32 arg(1) v18
|
|
|
|
8 registers, 20 instructions:
|
|
0 r0 = splat FF0000 (2.3418052e-38)
|
|
1 r1 = splat 1000100 (2.3510604e-38)
|
|
2 r2 = splat FF00FF00 (-1.7146522e+38)
|
|
3 r3 = splat FF00FF (2.3418409e-38)
|
|
loop:
|
|
4 r4 = load32 arg(0)
|
|
5 r5 = shr_i32 r4 8
|
|
6 r5 = bit_and r0 r5
|
|
7 r6 = shr_i32 r4 24
|
|
8 r5 = bit_or r6 r5
|
|
9 r5 = sub_i16x2 r1 r5
|
|
10 r6 = load32 arg(1)
|
|
11 r7 = shr_i16x2 r6 8
|
|
12 r7 = mul_i16x2 r7 r5
|
|
13 r7 = bit_and r7 r2
|
|
14 r6 = bit_and r6 r3
|
|
15 r5 = mul_i16x2 r6 r5
|
|
16 r5 = shr_i16x2 r5 8
|
|
17 r7 = bit_or r5 r7
|
|
18 r7 = add_i32 r4 r7
|
|
19 store32 arg(1) r7
|
|
|
|
6 values (originally 6):
|
|
↟ v0 = splat 2 (2.8025969e-45)
|
|
↟ v1 = splat 1 (1.4012985e-45)
|
|
↑ v2 = add_i32 v1 v0
|
|
v3 = load32 arg(0)
|
|
v4 = mul_i32 v3 v2
|
|
store32 arg(0) v4
|
|
|
|
2 registers, 6 instructions:
|
|
0 r0 = splat 2 (2.8025969e-45)
|
|
1 r1 = splat 1 (1.4012985e-45)
|
|
2 r0 = add_i32 r1 r0
|
|
loop:
|
|
3 r1 = load32 arg(0)
|
|
4 r1 = mul_i32 r1 r0
|
|
5 store32 arg(0) r1
|
|
|
|
23 values (originally 23):
|
|
v0 = load32 arg(1)
|
|
v1 = shr_i32 v0 24
|
|
v2 = load32 arg(0)
|
|
v3 = shr_i32 v2 24
|
|
v4 = add_i32 v3 v1
|
|
v5 = shr_i32 v0 16
|
|
↑ v6 = splat FF (3.5733111e-43)
|
|
v7 = bit_and v6 v5
|
|
v8 = shr_i32 v2 16
|
|
v9 = bit_and v6 v8
|
|
v10 = add_i32 v9 v7
|
|
v11 = pack v10 v4 8
|
|
v12 = shr_i32 v0 8
|
|
v13 = bit_and v6 v12
|
|
v14 = shr_i32 v2 8
|
|
v15 = bit_and v6 v14
|
|
v16 = add_i32 v15 v13
|
|
v17 = bit_and v6 v0
|
|
v18 = bit_and v6 v2
|
|
v19 = add_i32 v18 v17
|
|
v20 = pack v19 v16 8
|
|
v21 = pack v20 v11 16
|
|
store32 arg(1) v21
|
|
|
|
6 registers, 23 instructions:
|
|
0 r0 = splat FF (3.5733111e-43)
|
|
loop:
|
|
1 r1 = load32 arg(1)
|
|
2 r2 = shr_i32 r1 24
|
|
3 r3 = load32 arg(0)
|
|
4 r4 = shr_i32 r3 24
|
|
5 r2 = add_i32 r4 r2
|
|
6 r4 = shr_i32 r1 16
|
|
7 r4 = bit_and r0 r4
|
|
8 r5 = shr_i32 r3 16
|
|
9 r5 = bit_and r0 r5
|
|
10 r4 = add_i32 r5 r4
|
|
11 r2 = pack r4 r2 8
|
|
12 r4 = shr_i32 r1 8
|
|
13 r4 = bit_and r0 r4
|
|
14 r5 = shr_i32 r3 8
|
|
15 r5 = bit_and r0 r5
|
|
16 r4 = add_i32 r5 r4
|
|
17 r1 = bit_and r0 r1
|
|
18 r3 = bit_and r0 r3
|
|
19 r1 = add_i32 r3 r1
|
|
20 r4 = pack r1 r4 8
|
|
21 r2 = pack r4 r2 16
|
|
22 store32 arg(1) r2
|
|
|