linear -> sRGB: use fast approximate sqrt()

Since we're already approximating the sRGB gamma curve with a sqrt(), we might as well approximate it with a faster approximate sqrt(). On Intel, this .rsqrt().invert() version is 2-3x faster than .sqrt() (~3x faster on older machines, ~2x faster on newer machines). This should provide ~11 bits of precision, suspiciously exactly enough. Running dm --config srgb, there are diffs, but none perceptible. BUG=skia: GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2046063002 Review-Url: https://codereview.chromium.org/2046063002
2016-06-07 12:12:37 -07:00 · 2016-06-07 12:12:37 -07:00 · 3db2028126
commit 3db2028126
parent 12dfaaa53c
3 changed files with 11 additions and 6 deletions
--- a/src/core/SkPM4fPriv.h
+++ b/src/core/SkPM4fPriv.h
@ -41,7 +41,7 @@ static inline Sk4f srgb_to_linear(const Sk4f& s4) {
 }

 static inline Sk4f linear_to_srgb(const Sk4f& l4) {
-    return set_alpha(l4.sqrt(), get_alpha(l4));
+    return set_alpha(l4.rsqrt().invert(), get_alpha(l4));
 }

 static inline float srgb_to_linear(float x) {
--- a/src/core/SkXfermode4f.cpp
+++ b/src/core/SkXfermode4f.cpp
@ -68,10 +68,10 @@ static Sk4x4f load_4_srgb(const void* ptr) {
 // Store an Sk4x4f back to 4 interlaced 8888 sRGB pixels.
 static void store_4_srgb(void* ptr, const Sk4x4f& p) {
    // Convert back to sRGB and [0,255], again approximating sRGB as gamma == 2.
-    auto r = p.r.sqrt() * 255.0f + 0.5f,
-         g = p.g.sqrt() * 255.0f + 0.5f,
-         b = p.b.sqrt() * 255.0f + 0.5f,
-         a = p.a        * 255.0f + 0.5f;
+    auto r = p.r.rsqrt().invert() * 255.0f + 0.5f,
+         g = p.g.rsqrt().invert() * 255.0f + 0.5f,
+         b = p.b.rsqrt().invert() * 255.0f + 0.5f,
+         a = p.a                  * 255.0f + 0.5f;
    Sk4x4f{r,g,b,a}.transpose((uint8_t*)ptr);
 }

--- a/src/effects/gradients/Sk4fLinearGradient.cpp
+++ b/src/effects/gradients/Sk4fLinearGradient.cpp
@ -53,7 +53,12 @@ void ramp<DstType::S32, ApplyPremul::False>(const Sk4f& c, const Sk4f& dc, SkPMC
    Sk4x4f        c4x = Sk4x4f::Transpose(c, c + dc, c + dc * 2, c + dc * 3);

    while (n >= 4) {
-        const Sk4x4f cx4s32 = { c4x.r.sqrt(), c4x.g.sqrt(), c4x.b.sqrt(), c4x.a };
+        const Sk4x4f cx4s32 = {
+            c4x.r.rsqrt().invert(),
+            c4x.g.rsqrt().invert(),
+            c4x.b.rsqrt().invert(),
+            c4x.a
+        };
        cx4s32.transpose((uint8_t*)dst);

        c4x.r += dc4x.r;