0c3346643a
This cuts the overhead bench from about 19µs to about 15µs.

The key insight here is that the only registers that might become available after any given instruction are the ones that hold that instruction's inputs. We can check when they become available directly from the original Builder::Program, without needing a side death schedule data structure.

Marking hoisted instructions as having life == program size helps make this logic a little simpler to reason through.

Change-Id: Ifb9957f2d0e323e0e5d07996a2cc988f7c8b4c3f
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/223117
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
654 lines
14 KiB
Plaintext
654 lines
14 KiB
Plaintext
A8 over A8
|
|
17 values:
|
|
☠ v0 = splat 0 (0)
|
|
v1 = load8 arg(0)
|
|
⤴ v2 = splat 3B808081 (0.0039215689)
|
|
v3 = to_f32 v1
|
|
v4 = mul_f32 v2 v3
|
|
v5 = load8 arg(1)
|
|
v6 = to_f32 v5
|
|
v7 = mul_f32 v2 v6
|
|
⤴ v8 = splat 3F800000 (1)
|
|
v9 = sub_f32 v8 v4
|
|
☠ v10 = mul_f32 v0 v9
|
|
v11 = mad_f32 v7 v9 v4
|
|
⤴ v12 = splat 437F0000 (255)
|
|
⤴ v13 = splat 3F000000 (0.5)
|
|
v14 = mad_f32 v11 v12 v13
|
|
v15 = to_i32 v14
|
|
store8 arg(1) v15
|
|
|
|
7 registers, 15 instructions:
|
|
r0 = splat 3B808081 (0.0039215689)
|
|
r1 = splat 3F800000 (1)
|
|
r2 = splat 437F0000 (255)
|
|
r3 = splat 3F000000 (0.5)
|
|
loop:
|
|
r4 = load8 arg(0)
|
|
r4 = to_f32 r4
|
|
r4 = mul_f32 r0 r4
|
|
r5 = load8 arg(1)
|
|
r5 = to_f32 r5
|
|
r5 = mul_f32 r0 r5
|
|
r6 = sub_f32 r1 r4
|
|
r4 = mad_f32 r5 r6 r4
|
|
r4 = mad_f32 r4 r2 r3
|
|
r4 = to_i32 r4
|
|
store8 arg(1) r4
|
|
|
|
A8 over G8
|
|
23 values:
|
|
☠ v0 = splat 0 (0)
|
|
v1 = load8 arg(0)
|
|
⤴ v2 = splat 3B808081 (0.0039215689)
|
|
v3 = to_f32 v1
|
|
v4 = mul_f32 v2 v3
|
|
v5 = load8 arg(1)
|
|
v6 = to_f32 v5
|
|
v7 = mul_f32 v2 v6
|
|
⤴ v8 = splat 3F800000 (1)
|
|
v9 = sub_f32 v8 v4
|
|
v10 = mul_f32 v7 v9
|
|
☠ v11 = mad_f32 v8 v9 v4
|
|
⤴ v12 = splat 3E59B3D0 (0.21259999)
|
|
⤴ v13 = splat 3F371759 (0.71520001)
|
|
⤴ v14 = splat 3D93DD98 (0.0722)
|
|
v15 = mul_f32 v10 v14
|
|
v16 = mad_f32 v10 v13 v15
|
|
v17 = mad_f32 v10 v12 v16
|
|
⤴ v18 = splat 437F0000 (255)
|
|
⤴ v19 = splat 3F000000 (0.5)
|
|
v20 = mad_f32 v17 v18 v19
|
|
v21 = to_i32 v20
|
|
store8 arg(1) v21
|
|
|
|
9 registers, 21 instructions:
|
|
r0 = splat 3B808081 (0.0039215689)
|
|
r1 = splat 3F800000 (1)
|
|
r2 = splat 3E59B3D0 (0.21259999)
|
|
r3 = splat 3F371759 (0.71520001)
|
|
r4 = splat 3D93DD98 (0.0722)
|
|
r5 = splat 437F0000 (255)
|
|
r6 = splat 3F000000 (0.5)
|
|
loop:
|
|
r7 = load8 arg(0)
|
|
r7 = to_f32 r7
|
|
r7 = mul_f32 r0 r7
|
|
r8 = load8 arg(1)
|
|
r8 = to_f32 r8
|
|
r8 = mul_f32 r0 r8
|
|
r7 = sub_f32 r1 r7
|
|
r7 = mul_f32 r8 r7
|
|
r8 = mul_f32 r7 r4
|
|
r8 = mad_f32 r7 r3 r8
|
|
r8 = mad_f32 r7 r2 r8
|
|
r8 = mad_f32 r8 r5 r6
|
|
r8 = to_i32 r8
|
|
store8 arg(1) r8
|
|
|
|
A8 over RGBA_8888
|
|
39 values:
|
|
☠ v0 = splat 0 (0)
|
|
v1 = load8 arg(0)
|
|
⤴ v2 = splat 3B808081 (0.0039215689)
|
|
v3 = to_f32 v1
|
|
v4 = mul_f32 v2 v3
|
|
v5 = load32 arg(1)
|
|
⤴ v6 = splat FF (3.5733111e-43)
|
|
v7 = extract v5 0 v6
|
|
v8 = to_f32 v7
|
|
v9 = mul_f32 v2 v8
|
|
v10 = extract v5 8 v6
|
|
v11 = to_f32 v10
|
|
v12 = mul_f32 v2 v11
|
|
v13 = extract v5 16 v6
|
|
v14 = to_f32 v13
|
|
v15 = mul_f32 v2 v14
|
|
v16 = extract v5 24 v6
|
|
v17 = to_f32 v16
|
|
v18 = mul_f32 v2 v17
|
|
⤴ v19 = splat 3F800000 (1)
|
|
v20 = sub_f32 v19 v4
|
|
v21 = mul_f32 v9 v20
|
|
v22 = mul_f32 v12 v20
|
|
v23 = mul_f32 v15 v20
|
|
v24 = mad_f32 v18 v20 v4
|
|
⤴ v25 = splat 437F0000 (255)
|
|
⤴ v26 = splat 3F000000 (0.5)
|
|
v27 = mad_f32 v21 v25 v26
|
|
v28 = to_i32 v27
|
|
v29 = mad_f32 v22 v25 v26
|
|
v30 = to_i32 v29
|
|
v31 = mad_f32 v23 v25 v26
|
|
v32 = to_i32 v31
|
|
v33 = mad_f32 v24 v25 v26
|
|
v34 = to_i32 v33
|
|
v35 = pack v28 v30 8
|
|
v36 = pack v32 v34 8
|
|
v37 = pack v35 v36 16
|
|
store32 arg(1) v37
|
|
|
|
11 registers, 38 instructions:
|
|
r0 = splat 3B808081 (0.0039215689)
|
|
r1 = splat FF (3.5733111e-43)
|
|
r2 = splat 3F800000 (1)
|
|
r3 = splat 437F0000 (255)
|
|
r4 = splat 3F000000 (0.5)
|
|
loop:
|
|
r5 = load8 arg(0)
|
|
r5 = to_f32 r5
|
|
r5 = mul_f32 r0 r5
|
|
r6 = load32 arg(1)
|
|
r7 = extract r6 0 r1
|
|
r7 = to_f32 r7
|
|
r7 = mul_f32 r0 r7
|
|
r8 = extract r6 8 r1
|
|
r8 = to_f32 r8
|
|
r8 = mul_f32 r0 r8
|
|
r9 = extract r6 16 r1
|
|
r9 = to_f32 r9
|
|
r9 = mul_f32 r0 r9
|
|
r6 = extract r6 24 r1
|
|
r6 = to_f32 r6
|
|
r6 = mul_f32 r0 r6
|
|
r10 = sub_f32 r2 r5
|
|
r7 = mul_f32 r7 r10
|
|
r8 = mul_f32 r8 r10
|
|
r9 = mul_f32 r9 r10
|
|
r5 = mad_f32 r6 r10 r5
|
|
r7 = mad_f32 r7 r3 r4
|
|
r7 = to_i32 r7
|
|
r8 = mad_f32 r8 r3 r4
|
|
r8 = to_i32 r8
|
|
r9 = mad_f32 r9 r3 r4
|
|
r9 = to_i32 r9
|
|
r5 = mad_f32 r5 r3 r4
|
|
r5 = to_i32 r5
|
|
r8 = pack r7 r8 8
|
|
r5 = pack r9 r5 8
|
|
r5 = pack r8 r5 16
|
|
store32 arg(1) r5
|
|
|
|
G8 over A8
|
|
17 values:
|
|
☠ v0 = load8 arg(0)
|
|
⤴ v1 = splat 3B808081 (0.0039215689)
|
|
☠ v2 = to_f32 v0
|
|
☠ v3 = mul_f32 v1 v2
|
|
⤴ v4 = splat 3F800000 (1)
|
|
☠ v5 = splat 0 (0)
|
|
v6 = load8 arg(1)
|
|
v7 = to_f32 v6
|
|
v8 = mul_f32 v1 v7
|
|
⤴ v9 = sub_f32 v4 v4
|
|
☠ v10 = mad_f32 v5 v9 v3
|
|
v11 = mad_f32 v8 v9 v4
|
|
⤴ v12 = splat 437F0000 (255)
|
|
⤴ v13 = splat 3F000000 (0.5)
|
|
v14 = mad_f32 v11 v12 v13
|
|
v15 = to_i32 v14
|
|
store8 arg(1) v15
|
|
|
|
6 registers, 12 instructions:
|
|
r0 = splat 3B808081 (0.0039215689)
|
|
r1 = splat 3F800000 (1)
|
|
r2 = sub_f32 r1 r1
|
|
r3 = splat 437F0000 (255)
|
|
r4 = splat 3F000000 (0.5)
|
|
loop:
|
|
r5 = load8 arg(1)
|
|
r5 = to_f32 r5
|
|
r5 = mul_f32 r0 r5
|
|
r5 = mad_f32 r5 r2 r1
|
|
r5 = mad_f32 r5 r3 r4
|
|
r5 = to_i32 r5
|
|
store8 arg(1) r5
|
|
|
|
G8 over G8
|
|
22 values:
|
|
v0 = load8 arg(0)
|
|
⤴ v1 = splat 3B808081 (0.0039215689)
|
|
v2 = to_f32 v0
|
|
v3 = mul_f32 v1 v2
|
|
⤴ v4 = splat 3F800000 (1)
|
|
v5 = load8 arg(1)
|
|
v6 = to_f32 v5
|
|
v7 = mul_f32 v1 v6
|
|
⤴ v8 = sub_f32 v4 v4
|
|
v9 = mad_f32 v7 v8 v3
|
|
☠ v10 = mad_f32 v4 v8 v4
|
|
⤴ v11 = splat 3E59B3D0 (0.21259999)
|
|
⤴ v12 = splat 3F371759 (0.71520001)
|
|
⤴ v13 = splat 3D93DD98 (0.0722)
|
|
v14 = mul_f32 v9 v13
|
|
v15 = mad_f32 v9 v12 v14
|
|
v16 = mad_f32 v9 v11 v15
|
|
⤴ v17 = splat 437F0000 (255)
|
|
⤴ v18 = splat 3F000000 (0.5)
|
|
v19 = mad_f32 v16 v17 v18
|
|
v20 = to_i32 v19
|
|
store8 arg(1) v20
|
|
|
|
10 registers, 21 instructions:
|
|
r0 = splat 3B808081 (0.0039215689)
|
|
r1 = splat 3F800000 (1)
|
|
r2 = sub_f32 r1 r1
|
|
r3 = splat 3E59B3D0 (0.21259999)
|
|
r4 = splat 3F371759 (0.71520001)
|
|
r5 = splat 3D93DD98 (0.0722)
|
|
r6 = splat 437F0000 (255)
|
|
r7 = splat 3F000000 (0.5)
|
|
loop:
|
|
r8 = load8 arg(0)
|
|
r8 = to_f32 r8
|
|
r8 = mul_f32 r0 r8
|
|
r9 = load8 arg(1)
|
|
r9 = to_f32 r9
|
|
r9 = mul_f32 r0 r9
|
|
r8 = mad_f32 r9 r2 r8
|
|
r9 = mul_f32 r8 r5
|
|
r9 = mad_f32 r8 r4 r9
|
|
r9 = mad_f32 r8 r3 r9
|
|
r9 = mad_f32 r9 r6 r7
|
|
r9 = to_i32 r9
|
|
store8 arg(1) r9
|
|
|
|
G8 over RGBA_8888
|
|
38 values:
|
|
v0 = load8 arg(0)
|
|
⤴ v1 = splat 3B808081 (0.0039215689)
|
|
v2 = to_f32 v0
|
|
v3 = mul_f32 v1 v2
|
|
⤴ v4 = splat 3F800000 (1)
|
|
v5 = load32 arg(1)
|
|
⤴ v6 = splat FF (3.5733111e-43)
|
|
v7 = extract v5 0 v6
|
|
v8 = to_f32 v7
|
|
v9 = mul_f32 v1 v8
|
|
v10 = extract v5 8 v6
|
|
v11 = to_f32 v10
|
|
v12 = mul_f32 v1 v11
|
|
v13 = extract v5 16 v6
|
|
v14 = to_f32 v13
|
|
v15 = mul_f32 v1 v14
|
|
v16 = extract v5 24 v6
|
|
v17 = to_f32 v16
|
|
v18 = mul_f32 v1 v17
|
|
⤴ v19 = sub_f32 v4 v4
|
|
v20 = mad_f32 v9 v19 v3
|
|
v21 = mad_f32 v12 v19 v3
|
|
v22 = mad_f32 v15 v19 v3
|
|
v23 = mad_f32 v18 v19 v4
|
|
⤴ v24 = splat 437F0000 (255)
|
|
⤴ v25 = splat 3F000000 (0.5)
|
|
v26 = mad_f32 v20 v24 v25
|
|
v27 = to_i32 v26
|
|
v28 = mad_f32 v21 v24 v25
|
|
v29 = to_i32 v28
|
|
v30 = mad_f32 v22 v24 v25
|
|
v31 = to_i32 v30
|
|
v32 = mad_f32 v23 v24 v25
|
|
v33 = to_i32 v32
|
|
v34 = pack v27 v29 8
|
|
v35 = pack v31 v33 8
|
|
v36 = pack v34 v35 16
|
|
store32 arg(1) v36
|
|
|
|
11 registers, 38 instructions:
|
|
r0 = splat 3B808081 (0.0039215689)
|
|
r1 = splat 3F800000 (1)
|
|
r2 = splat FF (3.5733111e-43)
|
|
r3 = sub_f32 r1 r1
|
|
r4 = splat 437F0000 (255)
|
|
r5 = splat 3F000000 (0.5)
|
|
loop:
|
|
r6 = load8 arg(0)
|
|
r6 = to_f32 r6
|
|
r6 = mul_f32 r0 r6
|
|
r7 = load32 arg(1)
|
|
r8 = extract r7 0 r2
|
|
r8 = to_f32 r8
|
|
r8 = mul_f32 r0 r8
|
|
r9 = extract r7 8 r2
|
|
r9 = to_f32 r9
|
|
r9 = mul_f32 r0 r9
|
|
r10 = extract r7 16 r2
|
|
r10 = to_f32 r10
|
|
r10 = mul_f32 r0 r10
|
|
r7 = extract r7 24 r2
|
|
r7 = to_f32 r7
|
|
r7 = mul_f32 r0 r7
|
|
r8 = mad_f32 r8 r3 r6
|
|
r9 = mad_f32 r9 r3 r6
|
|
r6 = mad_f32 r10 r3 r6
|
|
r7 = mad_f32 r7 r3 r1
|
|
r8 = mad_f32 r8 r4 r5
|
|
r8 = to_i32 r8
|
|
r9 = mad_f32 r9 r4 r5
|
|
r9 = to_i32 r9
|
|
r6 = mad_f32 r6 r4 r5
|
|
r6 = to_i32 r6
|
|
r7 = mad_f32 r7 r4 r5
|
|
r7 = to_i32 r7
|
|
r9 = pack r8 r9 8
|
|
r7 = pack r6 r7 8
|
|
r7 = pack r9 r7 16
|
|
store32 arg(1) r7
|
|
|
|
RGBA_8888 over A8
|
|
30 values:
|
|
v0 = load32 arg(0)
|
|
⤴ v1 = splat FF (3.5733111e-43)
|
|
☠ v2 = extract v0 0 v1
|
|
⤴ v3 = splat 3B808081 (0.0039215689)
|
|
☠ v4 = to_f32 v2
|
|
☠ v5 = mul_f32 v3 v4
|
|
☠ v6 = extract v0 8 v1
|
|
☠ v7 = to_f32 v6
|
|
☠ v8 = mul_f32 v3 v7
|
|
☠ v9 = extract v0 16 v1
|
|
☠ v10 = to_f32 v9
|
|
☠ v11 = mul_f32 v3 v10
|
|
v12 = extract v0 24 v1
|
|
v13 = to_f32 v12
|
|
v14 = mul_f32 v3 v13
|
|
☠ v15 = splat 0 (0)
|
|
v16 = load8 arg(1)
|
|
v17 = to_f32 v16
|
|
v18 = mul_f32 v3 v17
|
|
⤴ v19 = splat 3F800000 (1)
|
|
v20 = sub_f32 v19 v14
|
|
☠ v21 = mad_f32 v15 v20 v5
|
|
☠ v22 = mad_f32 v15 v20 v8
|
|
☠ v23 = mad_f32 v15 v20 v11
|
|
v24 = mad_f32 v18 v20 v14
|
|
⤴ v25 = splat 437F0000 (255)
|
|
⤴ v26 = splat 3F000000 (0.5)
|
|
v27 = mad_f32 v24 v25 v26
|
|
v28 = to_i32 v27
|
|
store8 arg(1) v28
|
|
|
|
8 registers, 17 instructions:
|
|
r0 = splat FF (3.5733111e-43)
|
|
r1 = splat 3B808081 (0.0039215689)
|
|
r2 = splat 3F800000 (1)
|
|
r3 = splat 437F0000 (255)
|
|
r4 = splat 3F000000 (0.5)
|
|
loop:
|
|
r5 = load32 arg(0)
|
|
r5 = extract r5 24 r0
|
|
r5 = to_f32 r5
|
|
r5 = mul_f32 r1 r5
|
|
r6 = load8 arg(1)
|
|
r6 = to_f32 r6
|
|
r6 = mul_f32 r1 r6
|
|
r7 = sub_f32 r2 r5
|
|
r5 = mad_f32 r6 r7 r5
|
|
r5 = mad_f32 r5 r3 r4
|
|
r5 = to_i32 r5
|
|
store8 arg(1) r5
|
|
|
|
RGBA_8888 over G8
|
|
35 values:
|
|
v0 = load32 arg(0)
|
|
⤴ v1 = splat FF (3.5733111e-43)
|
|
v2 = extract v0 0 v1
|
|
⤴ v3 = splat 3B808081 (0.0039215689)
|
|
v4 = to_f32 v2
|
|
v5 = mul_f32 v3 v4
|
|
v6 = extract v0 8 v1
|
|
v7 = to_f32 v6
|
|
v8 = mul_f32 v3 v7
|
|
v9 = extract v0 16 v1
|
|
v10 = to_f32 v9
|
|
v11 = mul_f32 v3 v10
|
|
v12 = extract v0 24 v1
|
|
v13 = to_f32 v12
|
|
v14 = mul_f32 v3 v13
|
|
v15 = load8 arg(1)
|
|
v16 = to_f32 v15
|
|
v17 = mul_f32 v3 v16
|
|
⤴ v18 = splat 3F800000 (1)
|
|
v19 = sub_f32 v18 v14
|
|
v20 = mad_f32 v17 v19 v5
|
|
v21 = mad_f32 v17 v19 v8
|
|
v22 = mad_f32 v17 v19 v11
|
|
☠ v23 = mad_f32 v18 v19 v14
|
|
⤴ v24 = splat 3E59B3D0 (0.21259999)
|
|
⤴ v25 = splat 3F371759 (0.71520001)
|
|
⤴ v26 = splat 3D93DD98 (0.0722)
|
|
v27 = mul_f32 v22 v26
|
|
v28 = mad_f32 v21 v25 v27
|
|
v29 = mad_f32 v20 v24 v28
|
|
⤴ v30 = splat 437F0000 (255)
|
|
⤴ v31 = splat 3F000000 (0.5)
|
|
v32 = mad_f32 v29 v30 v31
|
|
v33 = to_i32 v32
|
|
store8 arg(1) v33
|
|
|
|
13 registers, 34 instructions:
|
|
r0 = splat FF (3.5733111e-43)
|
|
r1 = splat 3B808081 (0.0039215689)
|
|
r2 = splat 3F800000 (1)
|
|
r3 = splat 3E59B3D0 (0.21259999)
|
|
r4 = splat 3F371759 (0.71520001)
|
|
r5 = splat 3D93DD98 (0.0722)
|
|
r6 = splat 437F0000 (255)
|
|
r7 = splat 3F000000 (0.5)
|
|
loop:
|
|
r8 = load32 arg(0)
|
|
r9 = extract r8 0 r0
|
|
r9 = to_f32 r9
|
|
r9 = mul_f32 r1 r9
|
|
r10 = extract r8 8 r0
|
|
r10 = to_f32 r10
|
|
r10 = mul_f32 r1 r10
|
|
r11 = extract r8 16 r0
|
|
r11 = to_f32 r11
|
|
r11 = mul_f32 r1 r11
|
|
r8 = extract r8 24 r0
|
|
r8 = to_f32 r8
|
|
r8 = mul_f32 r1 r8
|
|
r12 = load8 arg(1)
|
|
r12 = to_f32 r12
|
|
r12 = mul_f32 r1 r12
|
|
r8 = sub_f32 r2 r8
|
|
r9 = mad_f32 r12 r8 r9
|
|
r10 = mad_f32 r12 r8 r10
|
|
r11 = mad_f32 r12 r8 r11
|
|
r11 = mul_f32 r11 r5
|
|
r11 = mad_f32 r10 r4 r11
|
|
r11 = mad_f32 r9 r3 r11
|
|
r11 = mad_f32 r11 r6 r7
|
|
r11 = to_i32 r11
|
|
store8 arg(1) r11
|
|
|
|
RGBA_8888 over RGBA_8888
|
|
48 values:
|
|
v0 = load32 arg(0)
|
|
⤴ v1 = splat FF (3.5733111e-43)
|
|
v2 = extract v0 0 v1
|
|
⤴ v3 = splat 3B808081 (0.0039215689)
|
|
v4 = to_f32 v2
|
|
v5 = mul_f32 v3 v4
|
|
v6 = extract v0 8 v1
|
|
v7 = to_f32 v6
|
|
v8 = mul_f32 v3 v7
|
|
v9 = extract v0 16 v1
|
|
v10 = to_f32 v9
|
|
v11 = mul_f32 v3 v10
|
|
v12 = extract v0 24 v1
|
|
v13 = to_f32 v12
|
|
v14 = mul_f32 v3 v13
|
|
v15 = load32 arg(1)
|
|
v16 = extract v15 0 v1
|
|
v17 = to_f32 v16
|
|
v18 = mul_f32 v3 v17
|
|
v19 = extract v15 8 v1
|
|
v20 = to_f32 v19
|
|
v21 = mul_f32 v3 v20
|
|
v22 = extract v15 16 v1
|
|
v23 = to_f32 v22
|
|
v24 = mul_f32 v3 v23
|
|
v25 = extract v15 24 v1
|
|
v26 = to_f32 v25
|
|
v27 = mul_f32 v3 v26
|
|
⤴ v28 = splat 3F800000 (1)
|
|
v29 = sub_f32 v28 v14
|
|
v30 = mad_f32 v18 v29 v5
|
|
v31 = mad_f32 v21 v29 v8
|
|
v32 = mad_f32 v24 v29 v11
|
|
v33 = mad_f32 v27 v29 v14
|
|
⤴ v34 = splat 437F0000 (255)
|
|
⤴ v35 = splat 3F000000 (0.5)
|
|
v36 = mad_f32 v30 v34 v35
|
|
v37 = to_i32 v36
|
|
v38 = mad_f32 v31 v34 v35
|
|
v39 = to_i32 v38
|
|
v40 = mad_f32 v32 v34 v35
|
|
v41 = to_i32 v40
|
|
v42 = mad_f32 v33 v34 v35
|
|
v43 = to_i32 v42
|
|
v44 = pack v37 v39 8
|
|
v45 = pack v41 v43 8
|
|
v46 = pack v44 v45 16
|
|
store32 arg(1) v46
|
|
|
|
14 registers, 48 instructions:
|
|
r0 = splat FF (3.5733111e-43)
|
|
r1 = splat 3B808081 (0.0039215689)
|
|
r2 = splat 3F800000 (1)
|
|
r3 = splat 437F0000 (255)
|
|
r4 = splat 3F000000 (0.5)
|
|
loop:
|
|
r5 = load32 arg(0)
|
|
r6 = extract r5 0 r0
|
|
r6 = to_f32 r6
|
|
r6 = mul_f32 r1 r6
|
|
r7 = extract r5 8 r0
|
|
r7 = to_f32 r7
|
|
r7 = mul_f32 r1 r7
|
|
r8 = extract r5 16 r0
|
|
r8 = to_f32 r8
|
|
r8 = mul_f32 r1 r8
|
|
r5 = extract r5 24 r0
|
|
r5 = to_f32 r5
|
|
r5 = mul_f32 r1 r5
|
|
r9 = load32 arg(1)
|
|
r10 = extract r9 0 r0
|
|
r10 = to_f32 r10
|
|
r10 = mul_f32 r1 r10
|
|
r11 = extract r9 8 r0
|
|
r11 = to_f32 r11
|
|
r11 = mul_f32 r1 r11
|
|
r12 = extract r9 16 r0
|
|
r12 = to_f32 r12
|
|
r12 = mul_f32 r1 r12
|
|
r9 = extract r9 24 r0
|
|
r9 = to_f32 r9
|
|
r9 = mul_f32 r1 r9
|
|
r13 = sub_f32 r2 r5
|
|
r6 = mad_f32 r10 r13 r6
|
|
r7 = mad_f32 r11 r13 r7
|
|
r8 = mad_f32 r12 r13 r8
|
|
r5 = mad_f32 r9 r13 r5
|
|
r6 = mad_f32 r6 r3 r4
|
|
r6 = to_i32 r6
|
|
r7 = mad_f32 r7 r3 r4
|
|
r7 = to_i32 r7
|
|
r8 = mad_f32 r8 r3 r4
|
|
r8 = to_i32 r8
|
|
r5 = mad_f32 r5 r3 r4
|
|
r5 = to_i32 r5
|
|
r7 = pack r6 r7 8
|
|
r5 = pack r8 r5 8
|
|
r5 = pack r7 r5 16
|
|
store32 arg(1) r5
|
|
|
|
I32 (Naive) 8888 over 8888
|
|
11 registers, 29 instructions:
|
|
r0 = splat FF (3.5733111e-43)
|
|
r1 = splat 100 (3.5873241e-43)
|
|
loop:
|
|
r2 = load32 arg(0)
|
|
r3 = extract r2 0 r0
|
|
r4 = extract r2 8 r0
|
|
r5 = extract r2 16 r0
|
|
r2 = extract r2 24 r0
|
|
r6 = load32 arg(1)
|
|
r7 = extract r6 0 r0
|
|
r8 = extract r6 8 r0
|
|
r9 = extract r6 16 r0
|
|
r6 = extract r6 24 r0
|
|
r10 = sub_i32 r1 r2
|
|
r7 = mul_i32 r7 r10
|
|
r7 = shr r7 8
|
|
r7 = add_i32 r3 r7
|
|
r8 = mul_i32 r8 r10
|
|
r8 = shr r8 8
|
|
r8 = add_i32 r4 r8
|
|
r9 = mul_i32 r9 r10
|
|
r9 = shr r9 8
|
|
r9 = add_i32 r5 r9
|
|
r10 = mul_i32 r6 r10
|
|
r10 = shr r10 8
|
|
r10 = add_i32 r2 r10
|
|
r8 = pack r7 r8 8
|
|
r10 = pack r9 r10 8
|
|
r10 = pack r8 r10 16
|
|
store32 arg(1) r10
|
|
|
|
I32 8888 over 8888
|
|
11 registers, 29 instructions:
|
|
r0 = splat FF (3.5733111e-43)
|
|
r1 = splat 100 (3.5873241e-43)
|
|
loop:
|
|
r2 = load32 arg(0)
|
|
r3 = bit_and r2 r0
|
|
r4 = bytes r2 2
|
|
r5 = bytes r2 3
|
|
r2 = shr r2 24
|
|
r6 = load32 arg(1)
|
|
r7 = bit_and r6 r0
|
|
r8 = bytes r6 2
|
|
r9 = bytes r6 3
|
|
r6 = shr r6 24
|
|
r10 = sub_i32 r1 r2
|
|
r7 = mul_i16x2 r7 r10
|
|
r7 = shr r7 8
|
|
r7 = add_i32 r3 r7
|
|
r8 = mul_i16x2 r8 r10
|
|
r8 = shr r8 8
|
|
r8 = add_i32 r4 r8
|
|
r9 = mul_i16x2 r9 r10
|
|
r9 = shr r9 8
|
|
r9 = add_i32 r5 r9
|
|
r10 = mul_i16x2 r6 r10
|
|
r10 = shr r10 8
|
|
r10 = add_i32 r2 r10
|
|
r8 = pack r7 r8 8
|
|
r10 = pack r9 r10 8
|
|
r10 = pack r8 r10 16
|
|
store32 arg(1) r10
|
|
|
|
I32 (SWAR) 8888 over 8888
|
|
7 registers, 16 instructions:
|
|
r0 = splat 1000100 (2.3510604e-38)
|
|
r1 = splat FF00FF (2.3418409e-38)
|
|
r2 = splat FF00FF00 (-1.7146522e+38)
|
|
loop:
|
|
r3 = load32 arg(0)
|
|
r4 = bytes r3 404
|
|
r4 = sub_i16x2 r0 r4
|
|
r5 = load32 arg(1)
|
|
r6 = bit_and r5 r1
|
|
r5 = shr_i16x2 r5 8
|
|
r6 = mul_i16x2 r6 r4
|
|
r6 = shr_i16x2 r6 8
|
|
r4 = mul_i16x2 r5 r4
|
|
r4 = bit_and r4 r2
|
|
r4 = bit_or r6 r4
|
|
r4 = add_i32 r3 r4
|
|
store32 arg(1) r4
|
|
|