hoist loop-invariant code out of the loop

I'm of two minds about this... it adds register pressure and really only
tends to hoist a few instructions that are fairly cheap anyway.  On the
other hand, it's neat, it's easy to turn off (just set the initial
hoist value to false in Builder::push()) and it does deliver a
noticeable though slight performance improvement in the interpreter.

I think the final decision will probably come down to what we think
about maintainability?

Change-Id: Idd6346f70f03188917918406731154246a7c6fcb
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/218584
Reviewed-by: Brian Osman <brianosman@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
Mike Klein 2019-06-05 10:47:46 -05:00 committed by Skia Commit-Bot
parent 4d74f605d7
commit 754bad3f38
5 changed files with 388 additions and 317 deletions

View File

@ -1,316 +1,327 @@
A8 over A8
3 registers, 15 instructions:
r0 = load8 arg(0)
r1 = splat 3B808081 (0.0039215689)
r0 = to_f32 r0
r0 = mul_f32 r1 r0
r2 = load8 arg(1)
r2 = to_f32 r2
r2 = mul_f32 r1 r2
r1 = splat 3F800000 (1)
r1 = sub_f32 r1 r0
r1 = mad_f32 r2 r1 r0
r2 = splat 437F0000 (255)
r0 = splat 3F000000 (0.5)
r0 = mad_f32 r1 r2 r0
r0 = to_i32 r0
store8 arg(1) r0
A8 over G8
4 registers, 21 instructions:
r0 = load8 arg(0)
r1 = splat 3B808081 (0.0039215689)
r0 = to_f32 r0
r0 = mul_f32 r1 r0
r2 = load8 arg(1)
r2 = to_f32 r2
r2 = mul_f32 r1 r2
r1 = splat 3F800000 (1)
r1 = sub_f32 r1 r0
r1 = mul_f32 r2 r1
r2 = splat 3E59B3D0 (0.21259999)
r0 = splat 3F371759 (0.71520001)
r3 = splat 3D93DD98 (0.0722)
r3 = mul_f32 r1 r3
r3 = mad_f32 r1 r0 r3
r3 = mad_f32 r1 r2 r3
r2 = splat 437F0000 (255)
r1 = splat 3F000000 (0.5)
r1 = mad_f32 r3 r2 r1
r1 = to_i32 r1
store8 arg(1) r1
A8 over RGBA_8888
6 registers, 37 instructions:
r0 = load8 arg(0)
r1 = splat 3B808081 (0.0039215689)
r0 = to_f32 r0
r0 = mul_f32 r1 r0
r2 = load32 arg(1)
r3 = extract r2 FF
r3 = to_f32 r3
r3 = mul_f32 r1 r3
r4 = extract r2 FF00
r4 = to_f32 r4
r4 = mul_f32 r1 r4
r5 = extract r2 FF0000
r5 = to_f32 r5
r5 = mul_f32 r1 r5
r2 = shr r2 24
r2 = to_f32 r2
r2 = mul_f32 r1 r2
r1 = splat 3F800000 (1)
r1 = sub_f32 r1 r0
r3 = mul_f32 r3 r1
r4 = mul_f32 r4 r1
r5 = mul_f32 r5 r1
r1 = mad_f32 r2 r1 r0
r2 = splat 437F0000 (255)
r0 = splat 3F000000 (0.5)
r3 = mad_f32 r3 r2 r0
r3 = to_i32 r3
r4 = mad_f32 r4 r2 r0
r4 = to_i32 r4
r5 = mad_f32 r5 r2 r0
r5 = to_i32 r5
r0 = mad_f32 r1 r2 r0
r0 = to_i32 r0
r4 = pack r3 r4 8
r0 = pack r5 r0 8
r0 = pack r4 r0 16
store32 arg(1) r0
G8 over A8
3 registers, 12 instructions:
7 registers, 15 instructions:
r0 = splat 3B808081 (0.0039215689)
r1 = splat 3F800000 (1)
r2 = load8 arg(1)
r2 = to_f32 r2
r2 = mul_f32 r0 r2
r0 = sub_f32 r1 r1
r0 = mad_f32 r2 r0 r1
r2 = splat 437F0000 (255)
r1 = splat 3F000000 (0.5)
r1 = mad_f32 r0 r2 r1
r1 = to_i32 r1
store8 arg(1) r1
G8 over G8
4 registers, 21 instructions:
r0 = load8 arg(0)
r1 = splat 3B808081 (0.0039215689)
r0 = to_f32 r0
r0 = mul_f32 r1 r0
r2 = splat 3F800000 (1)
r3 = load8 arg(1)
r3 = to_f32 r3
r3 = mul_f32 r1 r3
r2 = sub_f32 r2 r2
r2 = mad_f32 r3 r2 r0
r3 = splat 3E59B3D0 (0.21259999)
r0 = splat 3F371759 (0.71520001)
r1 = splat 3D93DD98 (0.0722)
r1 = mul_f32 r2 r1
r1 = mad_f32 r2 r0 r1
r1 = mad_f32 r2 r3 r1
r3 = splat 437F0000 (255)
r2 = splat 3F000000 (0.5)
r2 = mad_f32 r1 r3 r2
r2 = to_i32 r2
store8 arg(1) r2
G8 over RGBA_8888
7 registers, 37 instructions:
r0 = load8 arg(0)
r1 = splat 3B808081 (0.0039215689)
r0 = to_f32 r0
r0 = mul_f32 r1 r0
r2 = splat 3F800000 (1)
r3 = load32 arg(1)
r4 = extract r3 FF
r3 = splat 3F000000 (0.5)
loop:
r4 = load8 arg(0)
r4 = to_f32 r4
r4 = mul_f32 r1 r4
r5 = extract r3 FF00
r5 = to_f32 r5
r5 = mul_f32 r1 r5
r6 = extract r3 FF0000
r6 = to_f32 r6
r6 = mul_f32 r1 r6
r3 = shr r3 24
r3 = to_f32 r3
r3 = mul_f32 r1 r3
r1 = sub_f32 r2 r2
r4 = mad_f32 r4 r1 r0
r5 = mad_f32 r5 r1 r0
r6 = mad_f32 r6 r1 r0
r1 = mad_f32 r3 r1 r2
r3 = splat 437F0000 (255)
r2 = splat 3F000000 (0.5)
r4 = mad_f32 r4 r3 r2
r4 = to_i32 r4
r5 = mad_f32 r5 r3 r2
r5 = to_i32 r5
r6 = mad_f32 r6 r3 r2
r6 = to_i32 r6
r2 = mad_f32 r1 r3 r2
r2 = to_i32 r2
r5 = pack r4 r5 8
r2 = pack r6 r2 8
r2 = pack r5 r2 16
store32 arg(1) r2
RGBA_8888 over A8
3 registers, 16 instructions:
r0 = load32 arg(0)
r1 = splat 3B808081 (0.0039215689)
r0 = shr r0 24
r0 = to_f32 r0
r0 = mul_f32 r1 r0
r2 = load8 arg(1)
r2 = to_f32 r2
r2 = mul_f32 r1 r2
r1 = splat 3F800000 (1)
r1 = sub_f32 r1 r0
r1 = mad_f32 r2 r1 r0
r2 = splat 437F0000 (255)
r0 = splat 3F000000 (0.5)
r0 = mad_f32 r1 r2 r0
r0 = to_i32 r0
store8 arg(1) r0
RGBA_8888 over G8
6 registers, 33 instructions:
r0 = load32 arg(0)
r1 = extract r0 FF
r2 = splat 3B808081 (0.0039215689)
r1 = to_f32 r1
r1 = mul_f32 r2 r1
r3 = extract r0 FF00
r3 = to_f32 r3
r3 = mul_f32 r2 r3
r4 = extract r0 FF0000
r4 = to_f32 r4
r4 = mul_f32 r2 r4
r0 = shr r0 24
r0 = to_f32 r0
r0 = mul_f32 r2 r0
r4 = mul_f32 r0 r4
r5 = load8 arg(1)
r5 = to_f32 r5
r5 = mul_f32 r2 r5
r2 = splat 3F800000 (1)
r2 = sub_f32 r2 r0
r1 = mad_f32 r5 r2 r1
r3 = mad_f32 r5 r2 r3
r2 = mad_f32 r5 r2 r4
r5 = splat 3E59B3D0 (0.21259999)
r4 = splat 3F371759 (0.71520001)
r0 = splat 3D93DD98 (0.0722)
r0 = mul_f32 r2 r0
r0 = mad_f32 r3 r4 r0
r0 = mad_f32 r1 r5 r0
r5 = splat 437F0000 (255)
r1 = splat 3F000000 (0.5)
r1 = mad_f32 r0 r5 r1
r1 = to_i32 r1
store8 arg(1) r1
r5 = mul_f32 r0 r5
r6 = sub_f32 r1 r4
r6 = mad_f32 r5 r6 r4
r6 = mad_f32 r6 r2 r3
r6 = to_i32 r6
store8 arg(1) r6
RGBA_8888 over RGBA_8888
9 registers, 47 instructions:
r0 = load32 arg(0)
r1 = extract r0 FF
r2 = splat 3B808081 (0.0039215689)
r1 = to_f32 r1
r1 = mul_f32 r2 r1
r3 = extract r0 FF00
r3 = to_f32 r3
r3 = mul_f32 r2 r3
r4 = extract r0 FF0000
A8 over G8
9 registers, 21 instructions:
r0 = splat 3B808081 (0.0039215689)
r1 = splat 3F800000 (1)
r2 = splat 3E59B3D0 (0.21259999)
r3 = splat 3F371759 (0.71520001)
r4 = splat 3D93DD98 (0.0722)
r5 = splat 437F0000 (255)
r6 = splat 3F000000 (0.5)
loop:
r7 = load8 arg(0)
r7 = to_f32 r7
r7 = mul_f32 r0 r7
r8 = load8 arg(1)
r8 = to_f32 r8
r8 = mul_f32 r0 r8
r7 = sub_f32 r1 r7
r7 = mul_f32 r8 r7
r8 = mul_f32 r7 r4
r8 = mad_f32 r7 r3 r8
r8 = mad_f32 r7 r2 r8
r8 = mad_f32 r8 r5 r6
r8 = to_i32 r8
store8 arg(1) r8
A8 over RGBA_8888
10 registers, 37 instructions:
r0 = splat 3B808081 (0.0039215689)
r1 = splat 3F800000 (1)
r2 = splat 437F0000 (255)
r3 = splat 3F000000 (0.5)
loop:
r4 = load8 arg(0)
r4 = to_f32 r4
r4 = mul_f32 r2 r4
r0 = shr r0 24
r0 = to_f32 r0
r0 = mul_f32 r2 r0
r4 = mul_f32 r0 r4
r5 = load32 arg(1)
r6 = extract r5 FF
r6 = to_f32 r6
r6 = mul_f32 r2 r6
r6 = mul_f32 r0 r6
r7 = extract r5 FF00
r7 = to_f32 r7
r7 = mul_f32 r2 r7
r7 = mul_f32 r0 r7
r8 = extract r5 FF0000
r8 = to_f32 r8
r8 = mul_f32 r2 r8
r8 = mul_f32 r0 r8
r5 = shr r5 24
r5 = to_f32 r5
r5 = mul_f32 r2 r5
r2 = splat 3F800000 (1)
r2 = sub_f32 r2 r0
r6 = mad_f32 r6 r2 r1
r7 = mad_f32 r7 r2 r3
r8 = mad_f32 r8 r2 r4
r2 = mad_f32 r5 r2 r0
r5 = splat 437F0000 (255)
r0 = splat 3F000000 (0.5)
r6 = mad_f32 r6 r5 r0
r5 = mul_f32 r0 r5
r9 = sub_f32 r1 r4
r6 = mul_f32 r6 r9
r7 = mul_f32 r7 r9
r8 = mul_f32 r8 r9
r9 = mad_f32 r5 r9 r4
r6 = mad_f32 r6 r2 r3
r6 = to_i32 r6
r7 = mad_f32 r7 r5 r0
r7 = mad_f32 r7 r2 r3
r7 = to_i32 r7
r8 = mad_f32 r8 r5 r0
r8 = mad_f32 r8 r2 r3
r8 = to_i32 r8
r0 = mad_f32 r2 r5 r0
r0 = to_i32 r0
r9 = mad_f32 r9 r2 r3
r9 = to_i32 r9
r7 = pack r6 r7 8
r0 = pack r8 r0 8
r0 = pack r7 r0 16
store32 arg(1) r0
r9 = pack r8 r9 8
r9 = pack r7 r9 16
store32 arg(1) r9
G8 over A8
6 registers, 12 instructions:
r0 = splat 3B808081 (0.0039215689)
r1 = splat 3F800000 (1)
r2 = sub_f32 r1 r1
r3 = splat 437F0000 (255)
r4 = splat 3F000000 (0.5)
loop:
r5 = load8 arg(1)
r5 = to_f32 r5
r5 = mul_f32 r0 r5
r5 = mad_f32 r5 r2 r1
r5 = mad_f32 r5 r3 r4
r5 = to_i32 r5
store8 arg(1) r5
G8 over G8
10 registers, 21 instructions:
r0 = splat 3B808081 (0.0039215689)
r1 = splat 3F800000 (1)
r2 = sub_f32 r1 r1
r3 = splat 3E59B3D0 (0.21259999)
r4 = splat 3F371759 (0.71520001)
r5 = splat 3D93DD98 (0.0722)
r6 = splat 437F0000 (255)
r7 = splat 3F000000 (0.5)
loop:
r8 = load8 arg(0)
r8 = to_f32 r8
r8 = mul_f32 r0 r8
r9 = load8 arg(1)
r9 = to_f32 r9
r9 = mul_f32 r0 r9
r9 = mad_f32 r9 r2 r8
r8 = mul_f32 r9 r5
r8 = mad_f32 r9 r4 r8
r8 = mad_f32 r9 r3 r8
r8 = mad_f32 r8 r6 r7
r8 = to_i32 r8
store8 arg(1) r8
G8 over RGBA_8888
10 registers, 37 instructions:
r0 = splat 3B808081 (0.0039215689)
r1 = splat 3F800000 (1)
r2 = sub_f32 r1 r1
r3 = splat 437F0000 (255)
r4 = splat 3F000000 (0.5)
loop:
r5 = load8 arg(0)
r5 = to_f32 r5
r5 = mul_f32 r0 r5
r6 = load32 arg(1)
r7 = extract r6 FF
r7 = to_f32 r7
r7 = mul_f32 r0 r7
r8 = extract r6 FF00
r8 = to_f32 r8
r8 = mul_f32 r0 r8
r9 = extract r6 FF0000
r9 = to_f32 r9
r9 = mul_f32 r0 r9
r6 = shr r6 24
r6 = to_f32 r6
r6 = mul_f32 r0 r6
r7 = mad_f32 r7 r2 r5
r8 = mad_f32 r8 r2 r5
r9 = mad_f32 r9 r2 r5
r6 = mad_f32 r6 r2 r1
r7 = mad_f32 r7 r3 r4
r7 = to_i32 r7
r8 = mad_f32 r8 r3 r4
r8 = to_i32 r8
r9 = mad_f32 r9 r3 r4
r9 = to_i32 r9
r6 = mad_f32 r6 r3 r4
r6 = to_i32 r6
r8 = pack r7 r8 8
r6 = pack r9 r6 8
r6 = pack r8 r6 16
store32 arg(1) r6
RGBA_8888 over A8
7 registers, 16 instructions:
r0 = splat 3B808081 (0.0039215689)
r1 = splat 3F800000 (1)
r2 = splat 437F0000 (255)
r3 = splat 3F000000 (0.5)
loop:
r4 = load32 arg(0)
r4 = shr r4 24
r4 = to_f32 r4
r4 = mul_f32 r0 r4
r5 = load8 arg(1)
r5 = to_f32 r5
r5 = mul_f32 r0 r5
r6 = sub_f32 r1 r4
r6 = mad_f32 r5 r6 r4
r6 = mad_f32 r6 r2 r3
r6 = to_i32 r6
store8 arg(1) r6
RGBA_8888 over G8
12 registers, 33 instructions:
r0 = splat 3B808081 (0.0039215689)
r1 = splat 3F800000 (1)
r2 = splat 3E59B3D0 (0.21259999)
r3 = splat 3F371759 (0.71520001)
r4 = splat 3D93DD98 (0.0722)
r5 = splat 437F0000 (255)
r6 = splat 3F000000 (0.5)
loop:
r7 = load32 arg(0)
r8 = extract r7 FF
r8 = to_f32 r8
r8 = mul_f32 r0 r8
r9 = extract r7 FF00
r9 = to_f32 r9
r9 = mul_f32 r0 r9
r10 = extract r7 FF0000
r10 = to_f32 r10
r10 = mul_f32 r0 r10
r7 = shr r7 24
r7 = to_f32 r7
r7 = mul_f32 r0 r7
r11 = load8 arg(1)
r11 = to_f32 r11
r11 = mul_f32 r0 r11
r7 = sub_f32 r1 r7
r8 = mad_f32 r11 r7 r8
r9 = mad_f32 r11 r7 r9
r7 = mad_f32 r11 r7 r10
r7 = mul_f32 r7 r4
r7 = mad_f32 r9 r3 r7
r7 = mad_f32 r8 r2 r7
r7 = mad_f32 r7 r5 r6
r7 = to_i32 r7
store8 arg(1) r7
RGBA_8888 over RGBA_8888
13 registers, 47 instructions:
r0 = splat 3B808081 (0.0039215689)
r1 = splat 3F800000 (1)
r2 = splat 437F0000 (255)
r3 = splat 3F000000 (0.5)
loop:
r4 = load32 arg(0)
r5 = extract r4 FF
r5 = to_f32 r5
r5 = mul_f32 r0 r5
r6 = extract r4 FF00
r6 = to_f32 r6
r6 = mul_f32 r0 r6
r7 = extract r4 FF0000
r7 = to_f32 r7
r7 = mul_f32 r0 r7
r4 = shr r4 24
r4 = to_f32 r4
r4 = mul_f32 r0 r4
r8 = load32 arg(1)
r9 = extract r8 FF
r9 = to_f32 r9
r9 = mul_f32 r0 r9
r10 = extract r8 FF00
r10 = to_f32 r10
r10 = mul_f32 r0 r10
r11 = extract r8 FF0000
r11 = to_f32 r11
r11 = mul_f32 r0 r11
r8 = shr r8 24
r8 = to_f32 r8
r8 = mul_f32 r0 r8
r12 = sub_f32 r1 r4
r9 = mad_f32 r9 r12 r5
r10 = mad_f32 r10 r12 r6
r11 = mad_f32 r11 r12 r7
r12 = mad_f32 r8 r12 r4
r9 = mad_f32 r9 r2 r3
r9 = to_i32 r9
r10 = mad_f32 r10 r2 r3
r10 = to_i32 r10
r11 = mad_f32 r11 r2 r3
r11 = to_i32 r11
r12 = mad_f32 r12 r2 r3
r12 = to_i32 r12
r10 = pack r9 r10 8
r12 = pack r11 r12 8
r12 = pack r10 r12 16
store32 arg(1) r12
I32 8888 over 8888
9 registers, 24 instructions:
r0 = load32 arg(0)
r1 = extract r0 FF
r2 = extract r0 FF00
r3 = extract r0 FF0000
r0 = shr r0 24
r4 = load32 arg(1)
r5 = extract r4 FF
r6 = extract r4 FF00
r7 = extract r4 FF0000
r4 = shr r4 24
r8 = splat FF (3.5733111e-43)
r8 = sub_i32 r8 r0
r5 = mul_unorm8 r5 r8
r5 = add_i32 r1 r5
r6 = mul_unorm8 r6 r8
10 registers, 24 instructions:
r0 = splat FF (3.5733111e-43)
loop:
r1 = load32 arg(0)
r2 = extract r1 FF
r3 = extract r1 FF00
r4 = extract r1 FF0000
r1 = shr r1 24
r5 = load32 arg(1)
r6 = extract r5 FF
r7 = extract r5 FF00
r8 = extract r5 FF0000
r5 = shr r5 24
r9 = sub_i32 r0 r1
r6 = mul_unorm8 r6 r9
r6 = add_i32 r2 r6
r7 = mul_unorm8 r7 r8
r7 = mul_unorm8 r7 r9
r7 = add_i32 r3 r7
r8 = mul_unorm8 r4 r8
r8 = add_i32 r0 r8
r6 = pack r5 r6 8
r8 = pack r7 r8 8
r8 = pack r6 r8 16
store32 arg(1) r8
r8 = mul_unorm8 r8 r9
r8 = add_i32 r4 r8
r9 = mul_unorm8 r5 r9
r9 = add_i32 r1 r9
r7 = pack r6 r7 8
r9 = pack r8 r9 8
r9 = pack r7 r9 16
store32 arg(1) r9
I32 (SWAR) 8888 over 8888
6 registers, 20 instructions:
r0 = load32 arg(0)
r1 = extract r0 FF00FF
r0 = extract r0 FF00FF00
r2 = load32 arg(1)
7 registers, 20 instructions:
r0 = splat FF (3.5733111e-43)
r1 = splat FF00FF (2.3418409e-38)
loop:
r2 = load32 arg(0)
r3 = extract r2 FF00FF
r2 = extract r2 FF00FF00
r4 = splat FF (3.5733111e-43)
r5 = shr r0 16
r5 = sub_i32 r4 r5
r4 = splat FF00FF (2.3418409e-38)
r3 = mul_i32 r3 r5
r3 = add_i32 r3 r4
r3 = extract r3 FF00FF00
r3 = add_i32 r1 r3
r5 = mul_i32 r2 r5
r5 = add_i32 r5 r4
r4 = load32 arg(1)
r5 = extract r4 FF00FF
r4 = extract r4 FF00FF00
r6 = shr r2 16
r6 = sub_i32 r0 r6
r5 = mul_i32 r5 r6
r5 = add_i32 r5 r1
r5 = extract r5 FF00FF00
r5 = add_i32 r0 r5
r5 = pack r3 r5 8
store32 arg(1) r5
r5 = add_i32 r3 r5
r6 = mul_i32 r4 r6
r6 = add_i32 r6 r1
r6 = extract r6 FF00FF00
r6 = add_i32 r2 r6
r6 = pack r5 r6 8
store32 arg(1) r6

View File

@ -75,7 +75,7 @@ namespace SkOpts {
extern void (*start_pipeline_lowp )(size_t,size_t,size_t,size_t, void**);
#undef M
extern void (*eval)(const skvm::Program::Instruction[], int ninsts, int nregs,
extern void (*eval)(const skvm::Program::Instruction[], int ninsts, int nregs, int loop,
int n, void* args[], size_t strides[], int nargs);
}

View File

@ -15,9 +15,10 @@
namespace skvm {
Program::Program(std::vector<Instruction> instructions, int regs)
Program::Program(std::vector<Instruction> instructions, int regs, int loop)
: fInstructions(std::move(instructions))
, fRegs(regs)
, fLoop(loop)
{}
Program Builder::done() {
@ -38,19 +39,49 @@ namespace skvm {
}
}
// Look to see if there are any instructions that can be hoisted outside the program's loop.
for (ID id = 0; id < (ID)fProgram.size(); id++) {
Instruction& inst = fProgram[id];
// Loads and stores cannot be hoisted out of the loop.
if (inst.op <= Op::load32) {
inst.hoist = false;
}
// If any of an instruction's arguments can't be hoisted, it can't be hoisted itself.
if (inst.hoist) {
if (inst.x != NA) { inst.hoist &= fProgram[inst.x].hoist; }
if (inst.y != NA) { inst.hoist &= fProgram[inst.y].hoist; }
if (inst.z != NA) { inst.hoist &= fProgram[inst.z].hoist; }
}
}
// We'll need to map each live value to a register.
std::unordered_map<ID, ID> val_to_reg;
// Count the registers we've used so far, and track any registers available to reuse.
// Count the registers we've used so far.
ID next_reg = 0;
std::vector<ID> avail;
// A schedule of which registers become available as we reach any given instruction.
// Our first pass of register assignment assigns hoisted values to eternal registers.
for (ID val = 0; val < (ID)fProgram.size(); val++) {
Instruction& inst = fProgram[val];
if (inst.life == NA || !inst.hoist) {
continue;
}
// Hoisted values are needed forever, so they each get their own register.
val_to_reg[val] = next_reg++;
}
// Now we'll assign registers to values that can't be hoisted out of the loop. These
// values have finite lifetimes, so we track pre-owned registers that have become available
// and a schedule of which registers become available as we reach a given instruction.
std::vector<ID> avail;
std::unordered_map<ID, std::vector<ID>> deaths;
for (ID val = 0; val < (ID)fProgram.size(); val++) {
Instruction& inst = fProgram[val];
if (inst.life == NA) {
if (inst.life == NA || inst.hoist) {
continue;
}
@ -83,13 +114,14 @@ namespace skvm {
: val_to_reg[val];
};
std::vector<Program::Instruction> program;
for (ID id = 0; id < (ID)fProgram.size(); id++) {
Instruction& inst = fProgram[id];
if (inst.life == NA) {
continue;
}
// Finally translate Builder::Instructions to Program::Instructions by mapping values to
// registers. This will be two passes again, first outside the loop, then inside.
// The loop begins at the loop'th Instruction.
int loop = 0;
std::vector<Program::Instruction> program;
auto push_instruction = [&](ID id, const Builder::Instruction& inst) {
Program::Instruction pinst{
inst.op,
lookup_register(id),
@ -100,16 +132,34 @@ namespace skvm {
if (inst.y == NA) { pinst.y.imm = inst.immy; }
if (inst.z == NA) { pinst.z.imm = inst.immz; }
program.push_back(pinst);
};
for (ID id = 0; id < (ID)fProgram.size(); id++) {
Instruction& inst = fProgram[id];
if (inst.life == NA || !inst.hoist) {
continue;
}
push_instruction(id, inst);
loop++;
}
for (ID id = 0; id < (ID)fProgram.size(); id++) {
Instruction& inst = fProgram[id];
if (inst.life == NA || inst.hoist) {
continue;
}
push_instruction(id, inst);
}
return { std::move(program), /*register count = */next_reg };
return { std::move(program), /*register count = */next_reg, loop };
}
// Most instructions produce a value and return it by ID,
// the value-producing instruction's own index in the program vector.
ID Builder::push(Op op, ID x, ID y, ID z, int immy, int immz) {
Instruction inst{op, /*life=*/NA, x, y, z, immy, immz};
Instruction inst{op, /*hoist=*/true, /*life=*/NA, x, y, z, immy, immz};
// Basic common subexpression elimination:
// if we've already seen this exact Instruction, use it instead of creating a new one.
@ -238,7 +288,11 @@ namespace skvm {
o->writeText(" registers, ");
o->writeDecAsText(fInstructions.size());
o->writeText(" instructions:\n");
for (const Instruction& inst : fInstructions) {
for (int i = 0; i < (int)fInstructions.size(); i++) {
if (i == fLoop) {
write(o, "loop:\n");
}
const Instruction& inst = fInstructions[i];
Op op = inst.op;
ID d = inst.d,
x = inst.x;
@ -286,7 +340,7 @@ namespace skvm {
// ~~~~ Program::eval() and co. ~~~~ //
void Program::eval(int n, void* args[], size_t strides[], int nargs) const {
SkOpts::eval(fInstructions.data(), (int)fInstructions.size(), fRegs,
SkOpts::eval(fInstructions.data(), (int)fInstructions.size(), fRegs, fLoop,
n, args, strides, nargs);
}
}

View File

@ -39,8 +39,8 @@ namespace skvm {
union { ID id; int imm; } y,z;
};
Program(std::vector<Instruction>, int regs);
Program() : Program({}, 0) {}
Program(std::vector<Instruction>, int regs, int loop);
Program() : Program({}, 0, 0) {}
void dump(SkWStream*) const;
@ -56,6 +56,7 @@ namespace skvm {
std::vector<Instruction> fInstructions;
int fRegs;
int fLoop;
};
struct Arg { int ix; };
@ -121,19 +122,21 @@ namespace skvm {
static const ID NA = ~0;
struct Instruction {
Op op; // v* = op(x,y,z,imm), where * == index of this Instruction.
ID life; // ID of last instruction using this instruction's result.
ID x,y,z; // Enough arguments for mad().
int immy,immz; // Immediate bit patterns, shift counts, argument indexes.
Op op; // v* = op(x,y,z,imm), where * == index of this Instruction.
bool hoist; // Can this instruction be hoisted outside our implicit loop?
ID life; // ID of last instruction using this instruction's result.
ID x,y,z; // Enough arguments for mad().
int immy,immz; // Immediate bit patterns, shift counts, argument indexes.
bool operator==(const Instruction& o) const {
return op == o.op
&& life == o.life
&& x == o.x
&& y == o.y
&& z == o.z
&& immy == o.immy
&& immz == o.immz;
return op == o.op
&& hoist == o.hoist
&& life == o.life
&& x == o.x
&& y == o.y
&& z == o.z
&& immy == o.immy
&& immz == o.immz;
}
};
@ -144,6 +147,7 @@ namespace skvm {
}
size_t operator()(const Instruction& inst) const {
return Hash((uint8_t)inst.op)
^ Hash(inst.hoist)
^ Hash(inst.life)
^ Hash(inst.x)
^ Hash(inst.y)

View File

@ -12,7 +12,8 @@
namespace SK_OPTS_NS {
inline void eval(const skvm::Program::Instruction insts[], const int ninsts, const int nregs,
inline void eval(const skvm::Program::Instruction insts[], const int ninsts,
const int nregs, const int loop,
int n, void* args[], size_t strides[], const int nargs) {
using namespace skvm;
@ -66,11 +67,12 @@ namespace SK_OPTS_NS {
SkASSERT(arg == args + nargs);
};
int stride;
for ( ; n > 0; n -= stride, step_args(stride)) {
int start = 0,
stride;
for ( ; n > 0; start = loop, n -= stride, step_args(stride)) {
stride = n >= K ? K : 1;
for (int i = 0; i < ninsts; i++) {
for (int i = start; i < ninsts; i++) {
skvm::Program::Instruction inst = insts[i];
// d = op(x, y.id/y.imm, z.id/z.imm)