fast NEON divide-by-255
We can approximate (xy + 127) / 255 with (xy + 255) / 256.

On ARM this approximate divide-by-255 is a single instruction, one of the two we use today to do a perfect divide-by-255 (the perfect version is kept under #if 0). This cuts div-255 in half, or a full mul-div-255 by a third.

The U16(255) constant can even be created in a single instruction without hitting memory, which is as good as it gets.

Here's a nice little example:

0000000000000000 <sk_premul_8bit>:
   0: f8408404  ldr   x4, [x0], #8                  // Load the next stage.
   4: 2e23c000  umull v0.8h, v0.8b, v3.8b           // r = r * a
   8: 6f02e6b0  movi  v16.2d, #0xff00ff00ff00ff     // create U16(255)
   c: 2e23c021  umull v1.8h, v1.8b, v3.8b           // g = g * a
  10: 2e23c042  umull v2.8h, v2.8b, v3.8b           // b = b * a
  14: 0e304000  addhn v0.8b, v0.8h, v16.8h          // r = div255(r)
  18: 0e304021  addhn v1.8b, v1.8h, v16.8h          // g = div255(g)
  1c: 0e304042  addhn v2.8b, v2.8h, v16.8h          // b = div255(b)
  20: d61f0080  br    x4                            // JUMP!

Change-Id: I4224ed3844abf6c67d9e42b67444a60f4aee8f08
Reviewed-on: https://skia-review.googlesource.com/40121
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Florin Malita <fmalita@chromium.org>
This commit is contained in:
parent
569b74c38c
commit
21befdcf5e
@ -476,8 +476,13 @@ SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int x, int y) {
|
||||
V(int v) : vec(v) {}
|
||||
V(float v) : vec(v * 255) {}
|
||||
V(U16 v) {
|
||||
// (v + 127) / 255 == (v + (v+128)>>8 +128) >> 8
|
||||
#if 0
|
||||
// (v + 127) / 255 = (v + ((v+128)>>8) + 128) >> 8
|
||||
vec = vraddhn_u16(v, vrshrq_n_u16(v, 8));
|
||||
#else
|
||||
// (v + 127) / 255 ≈ (v + 255) >> 8
|
||||
vec = vaddhn_u16(v, U16(255));
|
||||
#endif
|
||||
}
|
||||
|
||||
operator U8() const { return vec; }
|
||||
|
Loading…
Reference in New Issue
Block a user