fast NEON divide-by-255

We can approximate (xy + 127) / 255 with (xy + 255) / 256.

On ARM this approximate divide-by-255 is a single instruction, one of the two we use today to do a perfect divide-by-255 (the perfect version is kept under #if 0). This cuts the cost of div-255 in half, or of a full mul-div-255 by a third.

The U16(255) constant can even be created in a single instruction without hitting memory, which is as good as it gets.

Here's a nice little example:

    0000000000000000 <sk_premul_8bit>:
       0: f8408404  ldr   x4, [x0], #8                  // Load the next stage.
       4: 2e23c000  umull v0.8h, v0.8b, v3.8b           // r = r * a
       8: 6f02e6b0  movi  v16.2d, #0xff00ff00ff00ff     // create U16(255)
       c: 2e23c021  umull v1.8h, v1.8b, v3.8b           // g = g * a
      10: 2e23c042  umull v2.8h, v2.8b, v3.8b           // b = b * a
      14: 0e304000  addhn v0.8b, v0.8h, v16.8h          // r = div255(r)
      18: 0e304021  addhn v1.8b, v1.8h, v16.8h          // g = div255(g)
      1c: 0e304042  addhn v2.8b, v2.8h, v16.8h          // b = div255(b)
      20: d61f0080  br    x4                            // JUMP!

Change-Id: I4224ed3844abf6c67d9e42b67444a60f4aee8f08
Reviewed-on: https://skia-review.googlesource.com/40121
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Florin Malita <fmalita@chromium.org>
2017-08-29 18:10:15 -04:00 · 2017-08-29 18:10:15 -04:00 · 21befdcf5e
commit 21befdcf5e
parent 569b74c38c
1 changed file with 6 additions and 1 deletion
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp
@@ -476,8 +476,13 @@ SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int x, int y) {
        V(int   v) : vec(v) {}
        V(float v) : vec(v * 255) {}
        V(U16   v) {
-            // (v + 127) / 255 == (v + (v+128)>>8 +128) >> 8
+        #if 0
+            // (v + 127) / 255 = (v + ((v+128)>>8) + 128) >> 8
            vec = vraddhn_u16(v, vrshrq_n_u16(v, 8));
+        #else
+            // (v + 127) / 255 ≈ (v + 255) >> 8
+            vec = vaddhn_u16(v, U16(255));
+        #endif
        }

        operator U8() const { return vec; }