hoist loop-invariant code out of the loop
I'm of two minds about this... it adds register pressure and really only tends to hoist a few instructions that are fairly cheap anyway. On the other hand, it's neat, it's easy to turn off (just set the initial hoist value to false in Builder::push()) and it does deliver a noticeable though slight performance improvement in the interpreter. I think the final decision will probably come down to what we think about maintainability? Change-Id: Idd6346f70f03188917918406731154246a7c6fcb Reviewed-on: https://skia-review.googlesource.com/c/skia/+/218584 Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
parent
4d74f605d7
commit
754bad3f38
@ -1,316 +1,327 @@
|
||||
A8 over A8
|
||||
3 registers, 15 instructions:
|
||||
r0 = load8 arg(0)
|
||||
r1 = splat 3B808081 (0.0039215689)
|
||||
r0 = to_f32 r0
|
||||
r0 = mul_f32 r1 r0
|
||||
r2 = load8 arg(1)
|
||||
r2 = to_f32 r2
|
||||
r2 = mul_f32 r1 r2
|
||||
r1 = splat 3F800000 (1)
|
||||
r1 = sub_f32 r1 r0
|
||||
r1 = mad_f32 r2 r1 r0
|
||||
r2 = splat 437F0000 (255)
|
||||
r0 = splat 3F000000 (0.5)
|
||||
r0 = mad_f32 r1 r2 r0
|
||||
r0 = to_i32 r0
|
||||
store8 arg(1) r0
|
||||
|
||||
A8 over G8
|
||||
4 registers, 21 instructions:
|
||||
r0 = load8 arg(0)
|
||||
r1 = splat 3B808081 (0.0039215689)
|
||||
r0 = to_f32 r0
|
||||
r0 = mul_f32 r1 r0
|
||||
r2 = load8 arg(1)
|
||||
r2 = to_f32 r2
|
||||
r2 = mul_f32 r1 r2
|
||||
r1 = splat 3F800000 (1)
|
||||
r1 = sub_f32 r1 r0
|
||||
r1 = mul_f32 r2 r1
|
||||
r2 = splat 3E59B3D0 (0.21259999)
|
||||
r0 = splat 3F371759 (0.71520001)
|
||||
r3 = splat 3D93DD98 (0.0722)
|
||||
r3 = mul_f32 r1 r3
|
||||
r3 = mad_f32 r1 r0 r3
|
||||
r3 = mad_f32 r1 r2 r3
|
||||
r2 = splat 437F0000 (255)
|
||||
r1 = splat 3F000000 (0.5)
|
||||
r1 = mad_f32 r3 r2 r1
|
||||
r1 = to_i32 r1
|
||||
store8 arg(1) r1
|
||||
|
||||
A8 over RGBA_8888
|
||||
6 registers, 37 instructions:
|
||||
r0 = load8 arg(0)
|
||||
r1 = splat 3B808081 (0.0039215689)
|
||||
r0 = to_f32 r0
|
||||
r0 = mul_f32 r1 r0
|
||||
r2 = load32 arg(1)
|
||||
r3 = extract r2 FF
|
||||
r3 = to_f32 r3
|
||||
r3 = mul_f32 r1 r3
|
||||
r4 = extract r2 FF00
|
||||
r4 = to_f32 r4
|
||||
r4 = mul_f32 r1 r4
|
||||
r5 = extract r2 FF0000
|
||||
r5 = to_f32 r5
|
||||
r5 = mul_f32 r1 r5
|
||||
r2 = shr r2 24
|
||||
r2 = to_f32 r2
|
||||
r2 = mul_f32 r1 r2
|
||||
r1 = splat 3F800000 (1)
|
||||
r1 = sub_f32 r1 r0
|
||||
r3 = mul_f32 r3 r1
|
||||
r4 = mul_f32 r4 r1
|
||||
r5 = mul_f32 r5 r1
|
||||
r1 = mad_f32 r2 r1 r0
|
||||
r2 = splat 437F0000 (255)
|
||||
r0 = splat 3F000000 (0.5)
|
||||
r3 = mad_f32 r3 r2 r0
|
||||
r3 = to_i32 r3
|
||||
r4 = mad_f32 r4 r2 r0
|
||||
r4 = to_i32 r4
|
||||
r5 = mad_f32 r5 r2 r0
|
||||
r5 = to_i32 r5
|
||||
r0 = mad_f32 r1 r2 r0
|
||||
r0 = to_i32 r0
|
||||
r4 = pack r3 r4 8
|
||||
r0 = pack r5 r0 8
|
||||
r0 = pack r4 r0 16
|
||||
store32 arg(1) r0
|
||||
|
||||
G8 over A8
|
||||
3 registers, 12 instructions:
|
||||
7 registers, 15 instructions:
|
||||
r0 = splat 3B808081 (0.0039215689)
|
||||
r1 = splat 3F800000 (1)
|
||||
r2 = load8 arg(1)
|
||||
r2 = to_f32 r2
|
||||
r2 = mul_f32 r0 r2
|
||||
r0 = sub_f32 r1 r1
|
||||
r0 = mad_f32 r2 r0 r1
|
||||
r2 = splat 437F0000 (255)
|
||||
r1 = splat 3F000000 (0.5)
|
||||
r1 = mad_f32 r0 r2 r1
|
||||
r1 = to_i32 r1
|
||||
store8 arg(1) r1
|
||||
|
||||
G8 over G8
|
||||
4 registers, 21 instructions:
|
||||
r0 = load8 arg(0)
|
||||
r1 = splat 3B808081 (0.0039215689)
|
||||
r0 = to_f32 r0
|
||||
r0 = mul_f32 r1 r0
|
||||
r2 = splat 3F800000 (1)
|
||||
r3 = load8 arg(1)
|
||||
r3 = to_f32 r3
|
||||
r3 = mul_f32 r1 r3
|
||||
r2 = sub_f32 r2 r2
|
||||
r2 = mad_f32 r3 r2 r0
|
||||
r3 = splat 3E59B3D0 (0.21259999)
|
||||
r0 = splat 3F371759 (0.71520001)
|
||||
r1 = splat 3D93DD98 (0.0722)
|
||||
r1 = mul_f32 r2 r1
|
||||
r1 = mad_f32 r2 r0 r1
|
||||
r1 = mad_f32 r2 r3 r1
|
||||
r3 = splat 437F0000 (255)
|
||||
r2 = splat 3F000000 (0.5)
|
||||
r2 = mad_f32 r1 r3 r2
|
||||
r2 = to_i32 r2
|
||||
store8 arg(1) r2
|
||||
|
||||
G8 over RGBA_8888
|
||||
7 registers, 37 instructions:
|
||||
r0 = load8 arg(0)
|
||||
r1 = splat 3B808081 (0.0039215689)
|
||||
r0 = to_f32 r0
|
||||
r0 = mul_f32 r1 r0
|
||||
r2 = splat 3F800000 (1)
|
||||
r3 = load32 arg(1)
|
||||
r4 = extract r3 FF
|
||||
r3 = splat 3F000000 (0.5)
|
||||
loop:
|
||||
r4 = load8 arg(0)
|
||||
r4 = to_f32 r4
|
||||
r4 = mul_f32 r1 r4
|
||||
r5 = extract r3 FF00
|
||||
r5 = to_f32 r5
|
||||
r5 = mul_f32 r1 r5
|
||||
r6 = extract r3 FF0000
|
||||
r6 = to_f32 r6
|
||||
r6 = mul_f32 r1 r6
|
||||
r3 = shr r3 24
|
||||
r3 = to_f32 r3
|
||||
r3 = mul_f32 r1 r3
|
||||
r1 = sub_f32 r2 r2
|
||||
r4 = mad_f32 r4 r1 r0
|
||||
r5 = mad_f32 r5 r1 r0
|
||||
r6 = mad_f32 r6 r1 r0
|
||||
r1 = mad_f32 r3 r1 r2
|
||||
r3 = splat 437F0000 (255)
|
||||
r2 = splat 3F000000 (0.5)
|
||||
r4 = mad_f32 r4 r3 r2
|
||||
r4 = to_i32 r4
|
||||
r5 = mad_f32 r5 r3 r2
|
||||
r5 = to_i32 r5
|
||||
r6 = mad_f32 r6 r3 r2
|
||||
r6 = to_i32 r6
|
||||
r2 = mad_f32 r1 r3 r2
|
||||
r2 = to_i32 r2
|
||||
r5 = pack r4 r5 8
|
||||
r2 = pack r6 r2 8
|
||||
r2 = pack r5 r2 16
|
||||
store32 arg(1) r2
|
||||
|
||||
RGBA_8888 over A8
|
||||
3 registers, 16 instructions:
|
||||
r0 = load32 arg(0)
|
||||
r1 = splat 3B808081 (0.0039215689)
|
||||
r0 = shr r0 24
|
||||
r0 = to_f32 r0
|
||||
r0 = mul_f32 r1 r0
|
||||
r2 = load8 arg(1)
|
||||
r2 = to_f32 r2
|
||||
r2 = mul_f32 r1 r2
|
||||
r1 = splat 3F800000 (1)
|
||||
r1 = sub_f32 r1 r0
|
||||
r1 = mad_f32 r2 r1 r0
|
||||
r2 = splat 437F0000 (255)
|
||||
r0 = splat 3F000000 (0.5)
|
||||
r0 = mad_f32 r1 r2 r0
|
||||
r0 = to_i32 r0
|
||||
store8 arg(1) r0
|
||||
|
||||
RGBA_8888 over G8
|
||||
6 registers, 33 instructions:
|
||||
r0 = load32 arg(0)
|
||||
r1 = extract r0 FF
|
||||
r2 = splat 3B808081 (0.0039215689)
|
||||
r1 = to_f32 r1
|
||||
r1 = mul_f32 r2 r1
|
||||
r3 = extract r0 FF00
|
||||
r3 = to_f32 r3
|
||||
r3 = mul_f32 r2 r3
|
||||
r4 = extract r0 FF0000
|
||||
r4 = to_f32 r4
|
||||
r4 = mul_f32 r2 r4
|
||||
r0 = shr r0 24
|
||||
r0 = to_f32 r0
|
||||
r0 = mul_f32 r2 r0
|
||||
r4 = mul_f32 r0 r4
|
||||
r5 = load8 arg(1)
|
||||
r5 = to_f32 r5
|
||||
r5 = mul_f32 r2 r5
|
||||
r2 = splat 3F800000 (1)
|
||||
r2 = sub_f32 r2 r0
|
||||
r1 = mad_f32 r5 r2 r1
|
||||
r3 = mad_f32 r5 r2 r3
|
||||
r2 = mad_f32 r5 r2 r4
|
||||
r5 = splat 3E59B3D0 (0.21259999)
|
||||
r4 = splat 3F371759 (0.71520001)
|
||||
r0 = splat 3D93DD98 (0.0722)
|
||||
r0 = mul_f32 r2 r0
|
||||
r0 = mad_f32 r3 r4 r0
|
||||
r0 = mad_f32 r1 r5 r0
|
||||
r5 = splat 437F0000 (255)
|
||||
r1 = splat 3F000000 (0.5)
|
||||
r1 = mad_f32 r0 r5 r1
|
||||
r1 = to_i32 r1
|
||||
store8 arg(1) r1
|
||||
r5 = mul_f32 r0 r5
|
||||
r6 = sub_f32 r1 r4
|
||||
r6 = mad_f32 r5 r6 r4
|
||||
r6 = mad_f32 r6 r2 r3
|
||||
r6 = to_i32 r6
|
||||
store8 arg(1) r6
|
||||
|
||||
RGBA_8888 over RGBA_8888
|
||||
9 registers, 47 instructions:
|
||||
r0 = load32 arg(0)
|
||||
r1 = extract r0 FF
|
||||
r2 = splat 3B808081 (0.0039215689)
|
||||
r1 = to_f32 r1
|
||||
r1 = mul_f32 r2 r1
|
||||
r3 = extract r0 FF00
|
||||
r3 = to_f32 r3
|
||||
r3 = mul_f32 r2 r3
|
||||
r4 = extract r0 FF0000
|
||||
A8 over G8
|
||||
9 registers, 21 instructions:
|
||||
r0 = splat 3B808081 (0.0039215689)
|
||||
r1 = splat 3F800000 (1)
|
||||
r2 = splat 3E59B3D0 (0.21259999)
|
||||
r3 = splat 3F371759 (0.71520001)
|
||||
r4 = splat 3D93DD98 (0.0722)
|
||||
r5 = splat 437F0000 (255)
|
||||
r6 = splat 3F000000 (0.5)
|
||||
loop:
|
||||
r7 = load8 arg(0)
|
||||
r7 = to_f32 r7
|
||||
r7 = mul_f32 r0 r7
|
||||
r8 = load8 arg(1)
|
||||
r8 = to_f32 r8
|
||||
r8 = mul_f32 r0 r8
|
||||
r7 = sub_f32 r1 r7
|
||||
r7 = mul_f32 r8 r7
|
||||
r8 = mul_f32 r7 r4
|
||||
r8 = mad_f32 r7 r3 r8
|
||||
r8 = mad_f32 r7 r2 r8
|
||||
r8 = mad_f32 r8 r5 r6
|
||||
r8 = to_i32 r8
|
||||
store8 arg(1) r8
|
||||
|
||||
A8 over RGBA_8888
|
||||
10 registers, 37 instructions:
|
||||
r0 = splat 3B808081 (0.0039215689)
|
||||
r1 = splat 3F800000 (1)
|
||||
r2 = splat 437F0000 (255)
|
||||
r3 = splat 3F000000 (0.5)
|
||||
loop:
|
||||
r4 = load8 arg(0)
|
||||
r4 = to_f32 r4
|
||||
r4 = mul_f32 r2 r4
|
||||
r0 = shr r0 24
|
||||
r0 = to_f32 r0
|
||||
r0 = mul_f32 r2 r0
|
||||
r4 = mul_f32 r0 r4
|
||||
r5 = load32 arg(1)
|
||||
r6 = extract r5 FF
|
||||
r6 = to_f32 r6
|
||||
r6 = mul_f32 r2 r6
|
||||
r6 = mul_f32 r0 r6
|
||||
r7 = extract r5 FF00
|
||||
r7 = to_f32 r7
|
||||
r7 = mul_f32 r2 r7
|
||||
r7 = mul_f32 r0 r7
|
||||
r8 = extract r5 FF0000
|
||||
r8 = to_f32 r8
|
||||
r8 = mul_f32 r2 r8
|
||||
r8 = mul_f32 r0 r8
|
||||
r5 = shr r5 24
|
||||
r5 = to_f32 r5
|
||||
r5 = mul_f32 r2 r5
|
||||
r2 = splat 3F800000 (1)
|
||||
r2 = sub_f32 r2 r0
|
||||
r6 = mad_f32 r6 r2 r1
|
||||
r7 = mad_f32 r7 r2 r3
|
||||
r8 = mad_f32 r8 r2 r4
|
||||
r2 = mad_f32 r5 r2 r0
|
||||
r5 = splat 437F0000 (255)
|
||||
r0 = splat 3F000000 (0.5)
|
||||
r6 = mad_f32 r6 r5 r0
|
||||
r5 = mul_f32 r0 r5
|
||||
r9 = sub_f32 r1 r4
|
||||
r6 = mul_f32 r6 r9
|
||||
r7 = mul_f32 r7 r9
|
||||
r8 = mul_f32 r8 r9
|
||||
r9 = mad_f32 r5 r9 r4
|
||||
r6 = mad_f32 r6 r2 r3
|
||||
r6 = to_i32 r6
|
||||
r7 = mad_f32 r7 r5 r0
|
||||
r7 = mad_f32 r7 r2 r3
|
||||
r7 = to_i32 r7
|
||||
r8 = mad_f32 r8 r5 r0
|
||||
r8 = mad_f32 r8 r2 r3
|
||||
r8 = to_i32 r8
|
||||
r0 = mad_f32 r2 r5 r0
|
||||
r0 = to_i32 r0
|
||||
r9 = mad_f32 r9 r2 r3
|
||||
r9 = to_i32 r9
|
||||
r7 = pack r6 r7 8
|
||||
r0 = pack r8 r0 8
|
||||
r0 = pack r7 r0 16
|
||||
store32 arg(1) r0
|
||||
r9 = pack r8 r9 8
|
||||
r9 = pack r7 r9 16
|
||||
store32 arg(1) r9
|
||||
|
||||
G8 over A8
|
||||
6 registers, 12 instructions:
|
||||
r0 = splat 3B808081 (0.0039215689)
|
||||
r1 = splat 3F800000 (1)
|
||||
r2 = sub_f32 r1 r1
|
||||
r3 = splat 437F0000 (255)
|
||||
r4 = splat 3F000000 (0.5)
|
||||
loop:
|
||||
r5 = load8 arg(1)
|
||||
r5 = to_f32 r5
|
||||
r5 = mul_f32 r0 r5
|
||||
r5 = mad_f32 r5 r2 r1
|
||||
r5 = mad_f32 r5 r3 r4
|
||||
r5 = to_i32 r5
|
||||
store8 arg(1) r5
|
||||
|
||||
G8 over G8
|
||||
10 registers, 21 instructions:
|
||||
r0 = splat 3B808081 (0.0039215689)
|
||||
r1 = splat 3F800000 (1)
|
||||
r2 = sub_f32 r1 r1
|
||||
r3 = splat 3E59B3D0 (0.21259999)
|
||||
r4 = splat 3F371759 (0.71520001)
|
||||
r5 = splat 3D93DD98 (0.0722)
|
||||
r6 = splat 437F0000 (255)
|
||||
r7 = splat 3F000000 (0.5)
|
||||
loop:
|
||||
r8 = load8 arg(0)
|
||||
r8 = to_f32 r8
|
||||
r8 = mul_f32 r0 r8
|
||||
r9 = load8 arg(1)
|
||||
r9 = to_f32 r9
|
||||
r9 = mul_f32 r0 r9
|
||||
r9 = mad_f32 r9 r2 r8
|
||||
r8 = mul_f32 r9 r5
|
||||
r8 = mad_f32 r9 r4 r8
|
||||
r8 = mad_f32 r9 r3 r8
|
||||
r8 = mad_f32 r8 r6 r7
|
||||
r8 = to_i32 r8
|
||||
store8 arg(1) r8
|
||||
|
||||
G8 over RGBA_8888
|
||||
10 registers, 37 instructions:
|
||||
r0 = splat 3B808081 (0.0039215689)
|
||||
r1 = splat 3F800000 (1)
|
||||
r2 = sub_f32 r1 r1
|
||||
r3 = splat 437F0000 (255)
|
||||
r4 = splat 3F000000 (0.5)
|
||||
loop:
|
||||
r5 = load8 arg(0)
|
||||
r5 = to_f32 r5
|
||||
r5 = mul_f32 r0 r5
|
||||
r6 = load32 arg(1)
|
||||
r7 = extract r6 FF
|
||||
r7 = to_f32 r7
|
||||
r7 = mul_f32 r0 r7
|
||||
r8 = extract r6 FF00
|
||||
r8 = to_f32 r8
|
||||
r8 = mul_f32 r0 r8
|
||||
r9 = extract r6 FF0000
|
||||
r9 = to_f32 r9
|
||||
r9 = mul_f32 r0 r9
|
||||
r6 = shr r6 24
|
||||
r6 = to_f32 r6
|
||||
r6 = mul_f32 r0 r6
|
||||
r7 = mad_f32 r7 r2 r5
|
||||
r8 = mad_f32 r8 r2 r5
|
||||
r9 = mad_f32 r9 r2 r5
|
||||
r6 = mad_f32 r6 r2 r1
|
||||
r7 = mad_f32 r7 r3 r4
|
||||
r7 = to_i32 r7
|
||||
r8 = mad_f32 r8 r3 r4
|
||||
r8 = to_i32 r8
|
||||
r9 = mad_f32 r9 r3 r4
|
||||
r9 = to_i32 r9
|
||||
r6 = mad_f32 r6 r3 r4
|
||||
r6 = to_i32 r6
|
||||
r8 = pack r7 r8 8
|
||||
r6 = pack r9 r6 8
|
||||
r6 = pack r8 r6 16
|
||||
store32 arg(1) r6
|
||||
|
||||
RGBA_8888 over A8
|
||||
7 registers, 16 instructions:
|
||||
r0 = splat 3B808081 (0.0039215689)
|
||||
r1 = splat 3F800000 (1)
|
||||
r2 = splat 437F0000 (255)
|
||||
r3 = splat 3F000000 (0.5)
|
||||
loop:
|
||||
r4 = load32 arg(0)
|
||||
r4 = shr r4 24
|
||||
r4 = to_f32 r4
|
||||
r4 = mul_f32 r0 r4
|
||||
r5 = load8 arg(1)
|
||||
r5 = to_f32 r5
|
||||
r5 = mul_f32 r0 r5
|
||||
r6 = sub_f32 r1 r4
|
||||
r6 = mad_f32 r5 r6 r4
|
||||
r6 = mad_f32 r6 r2 r3
|
||||
r6 = to_i32 r6
|
||||
store8 arg(1) r6
|
||||
|
||||
RGBA_8888 over G8
|
||||
12 registers, 33 instructions:
|
||||
r0 = splat 3B808081 (0.0039215689)
|
||||
r1 = splat 3F800000 (1)
|
||||
r2 = splat 3E59B3D0 (0.21259999)
|
||||
r3 = splat 3F371759 (0.71520001)
|
||||
r4 = splat 3D93DD98 (0.0722)
|
||||
r5 = splat 437F0000 (255)
|
||||
r6 = splat 3F000000 (0.5)
|
||||
loop:
|
||||
r7 = load32 arg(0)
|
||||
r8 = extract r7 FF
|
||||
r8 = to_f32 r8
|
||||
r8 = mul_f32 r0 r8
|
||||
r9 = extract r7 FF00
|
||||
r9 = to_f32 r9
|
||||
r9 = mul_f32 r0 r9
|
||||
r10 = extract r7 FF0000
|
||||
r10 = to_f32 r10
|
||||
r10 = mul_f32 r0 r10
|
||||
r7 = shr r7 24
|
||||
r7 = to_f32 r7
|
||||
r7 = mul_f32 r0 r7
|
||||
r11 = load8 arg(1)
|
||||
r11 = to_f32 r11
|
||||
r11 = mul_f32 r0 r11
|
||||
r7 = sub_f32 r1 r7
|
||||
r8 = mad_f32 r11 r7 r8
|
||||
r9 = mad_f32 r11 r7 r9
|
||||
r7 = mad_f32 r11 r7 r10
|
||||
r7 = mul_f32 r7 r4
|
||||
r7 = mad_f32 r9 r3 r7
|
||||
r7 = mad_f32 r8 r2 r7
|
||||
r7 = mad_f32 r7 r5 r6
|
||||
r7 = to_i32 r7
|
||||
store8 arg(1) r7
|
||||
|
||||
RGBA_8888 over RGBA_8888
|
||||
13 registers, 47 instructions:
|
||||
r0 = splat 3B808081 (0.0039215689)
|
||||
r1 = splat 3F800000 (1)
|
||||
r2 = splat 437F0000 (255)
|
||||
r3 = splat 3F000000 (0.5)
|
||||
loop:
|
||||
r4 = load32 arg(0)
|
||||
r5 = extract r4 FF
|
||||
r5 = to_f32 r5
|
||||
r5 = mul_f32 r0 r5
|
||||
r6 = extract r4 FF00
|
||||
r6 = to_f32 r6
|
||||
r6 = mul_f32 r0 r6
|
||||
r7 = extract r4 FF0000
|
||||
r7 = to_f32 r7
|
||||
r7 = mul_f32 r0 r7
|
||||
r4 = shr r4 24
|
||||
r4 = to_f32 r4
|
||||
r4 = mul_f32 r0 r4
|
||||
r8 = load32 arg(1)
|
||||
r9 = extract r8 FF
|
||||
r9 = to_f32 r9
|
||||
r9 = mul_f32 r0 r9
|
||||
r10 = extract r8 FF00
|
||||
r10 = to_f32 r10
|
||||
r10 = mul_f32 r0 r10
|
||||
r11 = extract r8 FF0000
|
||||
r11 = to_f32 r11
|
||||
r11 = mul_f32 r0 r11
|
||||
r8 = shr r8 24
|
||||
r8 = to_f32 r8
|
||||
r8 = mul_f32 r0 r8
|
||||
r12 = sub_f32 r1 r4
|
||||
r9 = mad_f32 r9 r12 r5
|
||||
r10 = mad_f32 r10 r12 r6
|
||||
r11 = mad_f32 r11 r12 r7
|
||||
r12 = mad_f32 r8 r12 r4
|
||||
r9 = mad_f32 r9 r2 r3
|
||||
r9 = to_i32 r9
|
||||
r10 = mad_f32 r10 r2 r3
|
||||
r10 = to_i32 r10
|
||||
r11 = mad_f32 r11 r2 r3
|
||||
r11 = to_i32 r11
|
||||
r12 = mad_f32 r12 r2 r3
|
||||
r12 = to_i32 r12
|
||||
r10 = pack r9 r10 8
|
||||
r12 = pack r11 r12 8
|
||||
r12 = pack r10 r12 16
|
||||
store32 arg(1) r12
|
||||
|
||||
I32 8888 over 8888
|
||||
9 registers, 24 instructions:
|
||||
r0 = load32 arg(0)
|
||||
r1 = extract r0 FF
|
||||
r2 = extract r0 FF00
|
||||
r3 = extract r0 FF0000
|
||||
r0 = shr r0 24
|
||||
r4 = load32 arg(1)
|
||||
r5 = extract r4 FF
|
||||
r6 = extract r4 FF00
|
||||
r7 = extract r4 FF0000
|
||||
r4 = shr r4 24
|
||||
r8 = splat FF (3.5733111e-43)
|
||||
r8 = sub_i32 r8 r0
|
||||
r5 = mul_unorm8 r5 r8
|
||||
r5 = add_i32 r1 r5
|
||||
r6 = mul_unorm8 r6 r8
|
||||
10 registers, 24 instructions:
|
||||
r0 = splat FF (3.5733111e-43)
|
||||
loop:
|
||||
r1 = load32 arg(0)
|
||||
r2 = extract r1 FF
|
||||
r3 = extract r1 FF00
|
||||
r4 = extract r1 FF0000
|
||||
r1 = shr r1 24
|
||||
r5 = load32 arg(1)
|
||||
r6 = extract r5 FF
|
||||
r7 = extract r5 FF00
|
||||
r8 = extract r5 FF0000
|
||||
r5 = shr r5 24
|
||||
r9 = sub_i32 r0 r1
|
||||
r6 = mul_unorm8 r6 r9
|
||||
r6 = add_i32 r2 r6
|
||||
r7 = mul_unorm8 r7 r8
|
||||
r7 = mul_unorm8 r7 r9
|
||||
r7 = add_i32 r3 r7
|
||||
r8 = mul_unorm8 r4 r8
|
||||
r8 = add_i32 r0 r8
|
||||
r6 = pack r5 r6 8
|
||||
r8 = pack r7 r8 8
|
||||
r8 = pack r6 r8 16
|
||||
store32 arg(1) r8
|
||||
r8 = mul_unorm8 r8 r9
|
||||
r8 = add_i32 r4 r8
|
||||
r9 = mul_unorm8 r5 r9
|
||||
r9 = add_i32 r1 r9
|
||||
r7 = pack r6 r7 8
|
||||
r9 = pack r8 r9 8
|
||||
r9 = pack r7 r9 16
|
||||
store32 arg(1) r9
|
||||
|
||||
I32 (SWAR) 8888 over 8888
|
||||
6 registers, 20 instructions:
|
||||
r0 = load32 arg(0)
|
||||
r1 = extract r0 FF00FF
|
||||
r0 = extract r0 FF00FF00
|
||||
r2 = load32 arg(1)
|
||||
7 registers, 20 instructions:
|
||||
r0 = splat FF (3.5733111e-43)
|
||||
r1 = splat FF00FF (2.3418409e-38)
|
||||
loop:
|
||||
r2 = load32 arg(0)
|
||||
r3 = extract r2 FF00FF
|
||||
r2 = extract r2 FF00FF00
|
||||
r4 = splat FF (3.5733111e-43)
|
||||
r5 = shr r0 16
|
||||
r5 = sub_i32 r4 r5
|
||||
r4 = splat FF00FF (2.3418409e-38)
|
||||
r3 = mul_i32 r3 r5
|
||||
r3 = add_i32 r3 r4
|
||||
r3 = extract r3 FF00FF00
|
||||
r3 = add_i32 r1 r3
|
||||
r5 = mul_i32 r2 r5
|
||||
r5 = add_i32 r5 r4
|
||||
r4 = load32 arg(1)
|
||||
r5 = extract r4 FF00FF
|
||||
r4 = extract r4 FF00FF00
|
||||
r6 = shr r2 16
|
||||
r6 = sub_i32 r0 r6
|
||||
r5 = mul_i32 r5 r6
|
||||
r5 = add_i32 r5 r1
|
||||
r5 = extract r5 FF00FF00
|
||||
r5 = add_i32 r0 r5
|
||||
r5 = pack r3 r5 8
|
||||
store32 arg(1) r5
|
||||
r5 = add_i32 r3 r5
|
||||
r6 = mul_i32 r4 r6
|
||||
r6 = add_i32 r6 r1
|
||||
r6 = extract r6 FF00FF00
|
||||
r6 = add_i32 r2 r6
|
||||
r6 = pack r5 r6 8
|
||||
store32 arg(1) r6
|
||||
|
||||
|
@ -75,7 +75,7 @@ namespace SkOpts {
|
||||
extern void (*start_pipeline_lowp )(size_t,size_t,size_t,size_t, void**);
|
||||
#undef M
|
||||
|
||||
extern void (*eval)(const skvm::Program::Instruction[], int ninsts, int nregs,
|
||||
extern void (*eval)(const skvm::Program::Instruction[], int ninsts, int nregs, int loop,
|
||||
int n, void* args[], size_t strides[], int nargs);
|
||||
|
||||
}
|
||||
|
@ -15,9 +15,10 @@
|
||||
|
||||
namespace skvm {
|
||||
|
||||
Program::Program(std::vector<Instruction> instructions, int regs)
|
||||
Program::Program(std::vector<Instruction> instructions, int regs, int loop)
|
||||
: fInstructions(std::move(instructions))
|
||||
, fRegs(regs)
|
||||
, fLoop(loop)
|
||||
{}
|
||||
|
||||
Program Builder::done() {
|
||||
@ -38,19 +39,49 @@ namespace skvm {
|
||||
}
|
||||
}
|
||||
|
||||
// Look to see if there are any instructions that can be hoisted outside the program's loop.
|
||||
for (ID id = 0; id < (ID)fProgram.size(); id++) {
|
||||
Instruction& inst = fProgram[id];
|
||||
|
||||
// Loads and stores cannot be hoisted out of the loop.
|
||||
if (inst.op <= Op::load32) {
|
||||
inst.hoist = false;
|
||||
}
|
||||
|
||||
// If any of an instruction's arguments can't be hoisted, it can't be hoisted itself.
|
||||
if (inst.hoist) {
|
||||
if (inst.x != NA) { inst.hoist &= fProgram[inst.x].hoist; }
|
||||
if (inst.y != NA) { inst.hoist &= fProgram[inst.y].hoist; }
|
||||
if (inst.z != NA) { inst.hoist &= fProgram[inst.z].hoist; }
|
||||
}
|
||||
}
|
||||
|
||||
// We'll need to map each live value to a register.
|
||||
std::unordered_map<ID, ID> val_to_reg;
|
||||
|
||||
// Count the registers we've used so far, and track any registers available to reuse.
|
||||
// Count the registers we've used so far.
|
||||
ID next_reg = 0;
|
||||
std::vector<ID> avail;
|
||||
|
||||
// A schedule of which registers become available as we reach any given instruction.
|
||||
// Our first pass of register assignment assigns hoisted values to eternal registers.
|
||||
for (ID val = 0; val < (ID)fProgram.size(); val++) {
|
||||
Instruction& inst = fProgram[val];
|
||||
if (inst.life == NA || !inst.hoist) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Hoisted values are needed forever, so they each get their own register.
|
||||
val_to_reg[val] = next_reg++;
|
||||
}
|
||||
|
||||
// Now we'll assign registers to values that can't be hoisted out of the loop. These
|
||||
// values have finite lifetimes, so we track pre-owned registers that have become available
|
||||
// and a schedule of which registers become available as we reach a given instruction.
|
||||
std::vector<ID> avail;
|
||||
std::unordered_map<ID, std::vector<ID>> deaths;
|
||||
|
||||
for (ID val = 0; val < (ID)fProgram.size(); val++) {
|
||||
Instruction& inst = fProgram[val];
|
||||
if (inst.life == NA) {
|
||||
if (inst.life == NA || inst.hoist) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -83,13 +114,14 @@ namespace skvm {
|
||||
: val_to_reg[val];
|
||||
};
|
||||
|
||||
std::vector<Program::Instruction> program;
|
||||
for (ID id = 0; id < (ID)fProgram.size(); id++) {
|
||||
Instruction& inst = fProgram[id];
|
||||
if (inst.life == NA) {
|
||||
continue;
|
||||
}
|
||||
// Finally translate Builder::Instructions to Program::Instructions by mapping values to
|
||||
// registers. This will be two passes again, first outside the loop, then inside.
|
||||
|
||||
// The loop begins at the loop'th Instruction.
|
||||
int loop = 0;
|
||||
std::vector<Program::Instruction> program;
|
||||
|
||||
auto push_instruction = [&](ID id, const Builder::Instruction& inst) {
|
||||
Program::Instruction pinst{
|
||||
inst.op,
|
||||
lookup_register(id),
|
||||
@ -100,16 +132,34 @@ namespace skvm {
|
||||
if (inst.y == NA) { pinst.y.imm = inst.immy; }
|
||||
if (inst.z == NA) { pinst.z.imm = inst.immz; }
|
||||
program.push_back(pinst);
|
||||
};
|
||||
|
||||
for (ID id = 0; id < (ID)fProgram.size(); id++) {
|
||||
Instruction& inst = fProgram[id];
|
||||
if (inst.life == NA || !inst.hoist) {
|
||||
continue;
|
||||
}
|
||||
|
||||
push_instruction(id, inst);
|
||||
loop++;
|
||||
}
|
||||
for (ID id = 0; id < (ID)fProgram.size(); id++) {
|
||||
Instruction& inst = fProgram[id];
|
||||
if (inst.life == NA || inst.hoist) {
|
||||
continue;
|
||||
}
|
||||
|
||||
push_instruction(id, inst);
|
||||
}
|
||||
|
||||
return { std::move(program), /*register count = */next_reg };
|
||||
return { std::move(program), /*register count = */next_reg, loop };
|
||||
}
|
||||
|
||||
// Most instructions produce a value and return it by ID,
|
||||
// the value-producing instruction's own index in the program vector.
|
||||
|
||||
ID Builder::push(Op op, ID x, ID y, ID z, int immy, int immz) {
|
||||
Instruction inst{op, /*life=*/NA, x, y, z, immy, immz};
|
||||
Instruction inst{op, /*hoist=*/true, /*life=*/NA, x, y, z, immy, immz};
|
||||
|
||||
// Basic common subexpression elimination:
|
||||
// if we've already seen this exact Instruction, use it instead of creating a new one.
|
||||
@ -238,7 +288,11 @@ namespace skvm {
|
||||
o->writeText(" registers, ");
|
||||
o->writeDecAsText(fInstructions.size());
|
||||
o->writeText(" instructions:\n");
|
||||
for (const Instruction& inst : fInstructions) {
|
||||
for (int i = 0; i < (int)fInstructions.size(); i++) {
|
||||
if (i == fLoop) {
|
||||
write(o, "loop:\n");
|
||||
}
|
||||
const Instruction& inst = fInstructions[i];
|
||||
Op op = inst.op;
|
||||
ID d = inst.d,
|
||||
x = inst.x;
|
||||
@ -286,7 +340,7 @@ namespace skvm {
|
||||
// ~~~~ Program::eval() and co. ~~~~ //
|
||||
|
||||
void Program::eval(int n, void* args[], size_t strides[], int nargs) const {
|
||||
SkOpts::eval(fInstructions.data(), (int)fInstructions.size(), fRegs,
|
||||
SkOpts::eval(fInstructions.data(), (int)fInstructions.size(), fRegs, fLoop,
|
||||
n, args, strides, nargs);
|
||||
}
|
||||
}
|
||||
|
@ -39,8 +39,8 @@ namespace skvm {
|
||||
union { ID id; int imm; } y,z;
|
||||
};
|
||||
|
||||
Program(std::vector<Instruction>, int regs);
|
||||
Program() : Program({}, 0) {}
|
||||
Program(std::vector<Instruction>, int regs, int loop);
|
||||
Program() : Program({}, 0, 0) {}
|
||||
|
||||
void dump(SkWStream*) const;
|
||||
|
||||
@ -56,6 +56,7 @@ namespace skvm {
|
||||
|
||||
std::vector<Instruction> fInstructions;
|
||||
int fRegs;
|
||||
int fLoop;
|
||||
};
|
||||
|
||||
struct Arg { int ix; };
|
||||
@ -121,19 +122,21 @@ namespace skvm {
|
||||
static const ID NA = ~0;
|
||||
|
||||
struct Instruction {
|
||||
Op op; // v* = op(x,y,z,imm), where * == index of this Instruction.
|
||||
ID life; // ID of last instruction using this instruction's result.
|
||||
ID x,y,z; // Enough arguments for mad().
|
||||
int immy,immz; // Immediate bit patterns, shift counts, argument indexes.
|
||||
Op op; // v* = op(x,y,z,imm), where * == index of this Instruction.
|
||||
bool hoist; // Can this instruction be hoisted outside our implicit loop?
|
||||
ID life; // ID of last instruction using this instruction's result.
|
||||
ID x,y,z; // Enough arguments for mad().
|
||||
int immy,immz; // Immediate bit patterns, shift counts, argument indexes.
|
||||
|
||||
bool operator==(const Instruction& o) const {
|
||||
return op == o.op
|
||||
&& life == o.life
|
||||
&& x == o.x
|
||||
&& y == o.y
|
||||
&& z == o.z
|
||||
&& immy == o.immy
|
||||
&& immz == o.immz;
|
||||
return op == o.op
|
||||
&& hoist == o.hoist
|
||||
&& life == o.life
|
||||
&& x == o.x
|
||||
&& y == o.y
|
||||
&& z == o.z
|
||||
&& immy == o.immy
|
||||
&& immz == o.immz;
|
||||
}
|
||||
};
|
||||
|
||||
@ -144,6 +147,7 @@ namespace skvm {
|
||||
}
|
||||
size_t operator()(const Instruction& inst) const {
|
||||
return Hash((uint8_t)inst.op)
|
||||
^ Hash(inst.hoist)
|
||||
^ Hash(inst.life)
|
||||
^ Hash(inst.x)
|
||||
^ Hash(inst.y)
|
||||
|
@ -12,7 +12,8 @@
|
||||
|
||||
namespace SK_OPTS_NS {
|
||||
|
||||
inline void eval(const skvm::Program::Instruction insts[], const int ninsts, const int nregs,
|
||||
inline void eval(const skvm::Program::Instruction insts[], const int ninsts,
|
||||
const int nregs, const int loop,
|
||||
int n, void* args[], size_t strides[], const int nargs) {
|
||||
using namespace skvm;
|
||||
|
||||
@ -66,11 +67,12 @@ namespace SK_OPTS_NS {
|
||||
SkASSERT(arg == args + nargs);
|
||||
};
|
||||
|
||||
int stride;
|
||||
for ( ; n > 0; n -= stride, step_args(stride)) {
|
||||
int start = 0,
|
||||
stride;
|
||||
for ( ; n > 0; start = loop, n -= stride, step_args(stride)) {
|
||||
stride = n >= K ? K : 1;
|
||||
|
||||
for (int i = 0; i < ninsts; i++) {
|
||||
for (int i = start; i < ninsts; i++) {
|
||||
skvm::Program::Instruction inst = insts[i];
|
||||
|
||||
// d = op(x, y.id/y.imm, z.id/z.imm)
|
||||
|
Loading…
Reference in New Issue
Block a user