linear -> sRGB: use fast approximate sqrt()

Since we're already approximating the sRGB gamma curve with a sqrt(), we might
as well approximate it with a faster approximate sqrt().  On Intel, this
.rsqrt().invert() version is 2-3x faster than .sqrt()  (~3x faster on older
machines, ~2x faster on newer machines).

This should provide ~11 bits of precision, suspiciously exactly enough.

Running dm --config srgb, there are diffs, but none perceptible.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2046063002

Review-Url: https://codereview.chromium.org/2046063002
This commit is contained in:
mtklein 2016-06-07 12:12:37 -07:00 committed by Commit bot
parent 12dfaaa53c
commit 3db2028126
3 changed files with 11 additions and 6 deletions

View File

@@ -41,7 +41,7 @@ static inline Sk4f srgb_to_linear(const Sk4f& s4) {
}
static inline Sk4f linear_to_srgb(const Sk4f& l4) {
return set_alpha(l4.sqrt(), get_alpha(l4));
return set_alpha(l4.rsqrt().invert(), get_alpha(l4));
}
static inline float srgb_to_linear(float x) {

View File

@@ -68,10 +68,10 @@ static Sk4x4f load_4_srgb(const void* ptr) {
// Store an Sk4x4f back to 4 interlaced 8888 sRGB pixels.
static void store_4_srgb(void* ptr, const Sk4x4f& p) {
// Convert back to sRGB and [0,255], again approximating sRGB as gamma == 2.
auto r = p.r.sqrt() * 255.0f + 0.5f,
g = p.g.sqrt() * 255.0f + 0.5f,
b = p.b.sqrt() * 255.0f + 0.5f,
a = p.a * 255.0f + 0.5f;
auto r = p.r.rsqrt().invert() * 255.0f + 0.5f,
g = p.g.rsqrt().invert() * 255.0f + 0.5f,
b = p.b.rsqrt().invert() * 255.0f + 0.5f,
a = p.a * 255.0f + 0.5f;
Sk4x4f{r,g,b,a}.transpose((uint8_t*)ptr);
}

View File

@@ -53,7 +53,12 @@ void ramp<DstType::S32, ApplyPremul::False>(const Sk4f& c, const Sk4f& dc, SkPMC
Sk4x4f c4x = Sk4x4f::Transpose(c, c + dc, c + dc * 2, c + dc * 3);
while (n >= 4) {
const Sk4x4f cx4s32 = { c4x.r.sqrt(), c4x.g.sqrt(), c4x.b.sqrt(), c4x.a };
const Sk4x4f cx4s32 = {
c4x.r.rsqrt().invert(),
c4x.g.rsqrt().invert(),
c4x.b.rsqrt().invert(),
c4x.a
};
cx4s32.transpose((uint8_t*)dst);
c4x.r += dc4x.r;