fast NEON divide-by-255

We can approximate (xy + 127) / 255 with (xy + 255) / 256.

On ARM this approximate divide-by-255 is a single instruction, versus the two
we use today to do a perfect divide-by-255 (now kept under #if 0).  This cuts the
cost of div-255 in half, or of a full mul-div-255 by a third.  The U16(255)
constant can even be created in a single instruction without hitting memory,
which is as good as it gets.

Here's a nice little example:

    0000000000000000 <sk_premul_8bit>:
       0:   f8408404        ldr     x4, [x0], #8                 // Load the next stage.
       4:   2e23c000        umull   v0.8h, v0.8b, v3.8b          // r = r * a
       8:   6f02e6b0        movi    v16.2d, #0xff00ff00ff00ff    // create U16(255)
       c:   2e23c021        umull   v1.8h, v1.8b, v3.8b          // g = g * a
      10:   2e23c042        umull   v2.8h, v2.8b, v3.8b          // b = b * a
      14:   0e304000        addhn   v0.8b, v0.8h, v16.8h         // r = div255(r)
      18:   0e304021        addhn   v1.8b, v1.8h, v16.8h         // g = div255(g)
      1c:   0e304042        addhn   v2.8b, v2.8h, v16.8h         // b = div255(b)
      20:   d61f0080        br      x4                           // JUMP!

Change-Id: I4224ed3844abf6c67d9e42b67444a60f4aee8f08
Reviewed-on: https://skia-review.googlesource.com/40121
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Florin Malita <fmalita@chromium.org>
This commit is contained in:
Mike Klein 2017-08-29 18:10:15 -04:00 committed by Skia Commit-Bot
parent 569b74c38c
commit 21befdcf5e

View File

@ -476,8 +476,13 @@ SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int x, int y) {
// Construct from an int: vec is initialized directly from v, with no scaling.
V(int v) : vec(v) {}
// Construct from a float: vec is initialized from v * 255.
// NOTE(review): presumably v is a normalized [0,1] value being scaled to 8-bit range — confirm against callers.
V(float v) : vec(v * 255) {}
V(U16 v) {
// Construct from 16-bit lanes by dividing each lane by 255 and narrowing the result into vec.
// The #if 0 branch keeps the exact two-instruction form for reference; the #else branch is what compiles.
#if 0
// Exact: (v + 127) / 255 == (v + ((v+128)>>8) + 128) >> 8
// vrshrq_n_u16(v, 8) supplies the rounded (v+128)>>8 term; vraddhn_u16 adds it with rounding and keeps the high byte.
vec = vraddhn_u16(v, vrshrq_n_u16(v, 8));
#else
// Approximate: (v + 127) / 255 ≈ (v + 255) >> 8
// vaddhn_u16 adds the two operands and keeps the high byte of each 16-bit sum, i.e. (v + 255) >> 8.
vec = vaddhn_u16(v, U16(255));
#endif
}
// Implicit conversion to U8: returns the stored vec.
operator U8() const { return vec; }