Convert SkPMFloat to [0,1] range and prune its API.
Now that Sk4px exists, there's a lot less sense in eking out every cycle of speed from SkPMFloat: if we need to go _really_ fast, we should use Sk4px. SkPMFloat's going to be used for things that are already slow: large-range intermediates, divides, sqrts, etc. A [0,1] range is easier to work with, and can even be faster if we eliminate enough *255 and *1/255 steps. This is particularly true on ARM, where NEON can do the *255 and /255 steps for us while converting float<->int. We have lots of experimental SkPMFloat <-> SkPMColor APIs that I'm now removing. Of the existing APIs, roundClamp() is the sanest, so I've kept only that, now called round(). The 4-at-a-time APIs never panned out, so they're gone. There will be small diffs on: colormatrix, coloremoji, colorfilterimagefilter, fadefilter, imagefilters_xfermodes, imagefilterscropexpand, imagefiltersgraph, tileimagefilter. BUG=skia: Review URL: https://codereview.chromium.org/1201343004
This commit is contained in:
parent
538bacb4bb
commit
e9a3e3c17a
@ -20,20 +20,10 @@ static uint32_t lcg_rand(uint32_t* seed) {
|
||||
}
|
||||
|
||||
// I'm having better luck getting these to constant-propagate away as template parameters.
|
||||
template <bool kClamp, bool kWide>
|
||||
struct PMFloatGetSetBench : public Benchmark {
|
||||
PMFloatGetSetBench() {}
|
||||
struct PMFloatRoundtripBench : public Benchmark {
|
||||
PMFloatRoundtripBench() {}
|
||||
|
||||
const char* onGetName() override {
|
||||
switch (kClamp << 1 | kWide) {
|
||||
case 0: return "SkPMFloat_get_1x";
|
||||
case 1: return "SkPMFloat_get_4x";
|
||||
case 2: return "SkPMFloat_clamp_1x";
|
||||
case 3: return "SkPMFloat_clamp_4x";
|
||||
}
|
||||
SkFAIL("unreachable");
|
||||
return "oh bother";
|
||||
}
|
||||
const char* onGetName() override { return "SkPMFloat_roundtrip"; }
|
||||
bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
|
||||
|
||||
void onDraw(const int loops, SkCanvas* canvas) override {
|
||||
@ -41,61 +31,23 @@ struct PMFloatGetSetBench : public Benchmark {
|
||||
uint32_t junk = 0;
|
||||
uint32_t seed = 0;
|
||||
for (int i = 0; i < loops; i++) {
|
||||
SkPMColor colors[4];
|
||||
SkPMColor color;
|
||||
#ifdef SK_DEBUG
|
||||
for (int i = 0; i < 4; i++) {
|
||||
// Our SkASSERTs will remind us that it's technically required that we premultiply.
|
||||
colors[i] = SkPreMultiplyColor(lcg_rand(&seed));
|
||||
}
|
||||
// Our SkASSERTs will remind us that it's technically required that we premultiply.
|
||||
color = SkPreMultiplyColor(lcg_rand(&seed));
|
||||
#else
|
||||
// But it's a lot faster not to, and this code won't really mind the non-PM colors.
|
||||
(void)lcg_rand(&seed);
|
||||
colors[0] = seed + 0;
|
||||
colors[1] = seed + 1;
|
||||
colors[2] = seed + 2;
|
||||
colors[3] = seed + 3;
|
||||
color = lcg_rand(&seed);
|
||||
#endif
|
||||
|
||||
SkPMFloat fa,fb,fc,fd;
|
||||
if (kWide) {
|
||||
SkPMFloat::From4PMColors(colors, &fa, &fb, &fc, &fd);
|
||||
} else {
|
||||
fa = SkPMFloat::FromPMColor(colors[0]);
|
||||
fb = SkPMFloat::FromPMColor(colors[1]);
|
||||
fc = SkPMFloat::FromPMColor(colors[2]);
|
||||
fd = SkPMFloat::FromPMColor(colors[3]);
|
||||
}
|
||||
|
||||
SkPMColor back[4];
|
||||
switch (kClamp << 1 | kWide) {
|
||||
case 0: {
|
||||
back[0] = fa.round();
|
||||
back[1] = fb.round();
|
||||
back[2] = fc.round();
|
||||
back[3] = fd.round();
|
||||
} break;
|
||||
case 1: SkPMFloat::RoundTo4PMColors(fa, fb, fc, fd, back); break;
|
||||
case 2: {
|
||||
back[0] = fa.roundClamp();
|
||||
back[1] = fb.roundClamp();
|
||||
back[2] = fc.roundClamp();
|
||||
back[3] = fd.roundClamp();
|
||||
} break;
|
||||
case 3: SkPMFloat::RoundClampTo4PMColors(fa, fb, fc, fd, back); break;
|
||||
}
|
||||
for (int i = 0; i < 4; i++) {
|
||||
junk ^= back[i];
|
||||
}
|
||||
auto f = SkPMFloat::FromPMColor(color);
|
||||
SkPMColor back = f.round();
|
||||
junk ^= back;
|
||||
}
|
||||
blackhole ^= junk;
|
||||
}
|
||||
};
|
||||
|
||||
// Extra () help DEF_BENCH not get confused by the comma inside the <>.
|
||||
DEF_BENCH(return (new PMFloatGetSetBench< true, true>);)
|
||||
DEF_BENCH(return (new PMFloatGetSetBench<false, true>);)
|
||||
DEF_BENCH(return (new PMFloatGetSetBench< true, false>);)
|
||||
DEF_BENCH(return (new PMFloatGetSetBench<false, false>);)
|
||||
DEF_BENCH(return new PMFloatRoundtripBench;)
|
||||
|
||||
struct PMFloatGradientBench : public Benchmark {
|
||||
const char* onGetName() override { return "PMFloat_gradient"; }
|
||||
@ -103,8 +55,8 @@ struct PMFloatGradientBench : public Benchmark {
|
||||
|
||||
SkPMColor fDevice[100];
|
||||
void onDraw(const int loops, SkCanvas*) override {
|
||||
Sk4f c0 = SkPMFloat::FromARGB(255, 255, 0, 0),
|
||||
c1 = SkPMFloat::FromARGB(255, 0, 0, 255),
|
||||
Sk4f c0 = SkPMFloat::FromARGB(1, 1, 0, 0),
|
||||
c1 = SkPMFloat::FromARGB(1, 0, 0, 1),
|
||||
dc = c1 - c0,
|
||||
fx(0.1f),
|
||||
dx(0.002f),
|
||||
@ -112,15 +64,15 @@ struct PMFloatGradientBench : public Benchmark {
|
||||
dcdx4(dcdx+dcdx+dcdx+dcdx);
|
||||
|
||||
for (int n = 0; n < loops; n++) {
|
||||
Sk4f a = c0 + dc*fx + Sk4f(0.5f), // The +0.5f lets us call trunc() instead of get().
|
||||
Sk4f a = c0 + dc*fx,
|
||||
b = a + dcdx,
|
||||
c = b + dcdx,
|
||||
d = c + dcdx;
|
||||
for (size_t i = 0; i < SK_ARRAY_COUNT(fDevice); i += 4) {
|
||||
fDevice[i+0] = SkPMFloat(a).trunc();
|
||||
fDevice[i+1] = SkPMFloat(b).trunc();
|
||||
fDevice[i+2] = SkPMFloat(c).trunc();
|
||||
fDevice[i+3] = SkPMFloat(d).trunc();
|
||||
fDevice[i+0] = SkPMFloat(a).round();
|
||||
fDevice[i+1] = SkPMFloat(b).round();
|
||||
fDevice[i+2] = SkPMFloat(c).round();
|
||||
fDevice[i+3] = SkPMFloat(d).round();
|
||||
a = a + dcdx4;
|
||||
b = b + dcdx4;
|
||||
c = c + dcdx4;
|
||||
|
@ -21,15 +21,12 @@
|
||||
namespace {
|
||||
|
||||
// A pre-multiplied color storing each component in the same order as SkPMColor,
|
||||
// but as a float in the range [0, 255].
|
||||
// but as a float in the range [0, 1].
|
||||
class SkPMFloat : public Sk4f {
|
||||
public:
|
||||
static SkPMFloat FromPMColor(SkPMColor c) { return SkPMFloat(c); }
|
||||
static SkPMFloat FromARGB(float a, float r, float g, float b) { return SkPMFloat(a,r,g,b); }
|
||||
|
||||
// May be more efficient than one at a time. No special alignment assumed for SkPMColors.
|
||||
static void From4PMColors(const SkPMColor[4], SkPMFloat*, SkPMFloat*, SkPMFloat*, SkPMFloat*);
|
||||
|
||||
// Uninitialized.
|
||||
SkPMFloat() {}
|
||||
explicit SkPMFloat(SkPMColor);
|
||||
@ -47,24 +44,10 @@ public:
|
||||
float g() const { return this->kth<SK_G32_SHIFT / 8>(); }
|
||||
float b() const { return this->kth<SK_B32_SHIFT / 8>(); }
|
||||
|
||||
// N.B. All methods returning an SkPMColor call SkPMColorAssert on that result before returning.
|
||||
|
||||
// round() and roundClamp() round component values to the nearest integer.
|
||||
SkPMColor round() const; // Assumes all values in [0, 255]. Some implementations may clamp.
|
||||
SkPMColor roundClamp() const; // Will clamp all values to [0, 255].
|
||||
|
||||
// Like round(), but truncates instead of rounding.
|
||||
// The domain of this function is (-1.0f, 256.0f). Values in (-1.0f, 0.0f] trunc to a zero.
|
||||
SkPMColor trunc() const;
|
||||
|
||||
// 4-at-a-time versions of round() and roundClamp(). Like From4PMColors(), no alignment assumed.
|
||||
static void RoundTo4PMColors(
|
||||
const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, SkPMColor[4]);
|
||||
static void RoundClampTo4PMColors(
|
||||
const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, SkPMColor[4]);
|
||||
SkPMColor round() const; // Rounds from [0.0f, 1.0f] to [0, 255], clamping if out of range.
|
||||
|
||||
bool isValid() const {
|
||||
return this->a() >= 0 && this->a() <= 255
|
||||
return this->a() >= 0 && this->a() <= 1
|
||||
&& this->r() >= 0 && this->r() <= this->a()
|
||||
&& this->g() >= 0 && this->g() <= this->a()
|
||||
&& this->b() >= 0 && this->b() <= this->a();
|
||||
@ -80,10 +63,8 @@ private:
|
||||
// Platform implementations of SkPMFloat assume Sk4f uses SSE or NEON. _none is generic.
|
||||
#include "../opts/SkPMFloat_none.h"
|
||||
#else
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
|
||||
#include "../opts/SkPMFloat_SSSE3.h"
|
||||
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
|
||||
#include "../opts/SkPMFloat_SSE2.h"
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
|
||||
#include "../opts/SkPMFloat_sse.h"
|
||||
#elif defined(SK_ARM_HAS_NEON)
|
||||
#include "../opts/SkPMFloat_neon.h"
|
||||
#else
|
||||
|
@ -239,25 +239,9 @@ uint32_t SkColorMatrixFilter::getFlags() const {
|
||||
return this->INHERITED::getFlags() | fFlags;
|
||||
}
|
||||
|
||||
/**
|
||||
* Need inv255 = 1 / 255 as a constant, so when we premul a SkPMFloat, we can do this
|
||||
*
|
||||
* new_red = old_red * alpha * inv255
|
||||
*
|
||||
* instead of (much slower)
|
||||
*
|
||||
* new_red = old_red * alpha / 255
|
||||
*
|
||||
* However, 1.0f/255 comes to (in hex) 0x3B808081, which is slightly bigger than the "actual"
|
||||
* value of 0x3B808080(repeat 80)... This slightly too-big value can cause us to compute
|
||||
* new_red > alpha, which is a problem (for valid premul). To fix this, we use a
|
||||
* hand-computed value of 0x3B808080, 1 ULP smaller. This keeps our colors valid.
|
||||
*/
|
||||
static const float gInv255 = 0.0039215683f; // (1.0f / 255) - ULP == SkBits2Float(0x3B808080)
|
||||
|
||||
static Sk4f premul(const Sk4f& x) {
|
||||
float scale = SkPMFloat(x).a() * gInv255;
|
||||
Sk4f pm = x * Sk4f(scale, scale, scale, 1);
|
||||
float scale = SkPMFloat(x).a();
|
||||
Sk4f pm = x * SkPMFloat(1, scale, scale, scale);
|
||||
|
||||
#ifdef SK_DEBUG
|
||||
SkPMFloat pmf(pm);
|
||||
@ -268,12 +252,12 @@ static Sk4f premul(const Sk4f& x) {
|
||||
}
|
||||
|
||||
static Sk4f unpremul(const SkPMFloat& pm) {
|
||||
float scale = 255 / pm.a(); // candidate for fast/approx invert?
|
||||
return pm * Sk4f(scale, scale, scale, 1);
|
||||
float scale = 1 / pm.a(); // candidate for fast/approx invert?
|
||||
return pm * SkPMFloat(1, scale, scale, scale);
|
||||
}
|
||||
|
||||
static Sk4f clamp_0_255(const Sk4f& value) {
|
||||
return Sk4f::Max(Sk4f::Min(value, Sk4f(255)), Sk4f(0));
|
||||
static Sk4f clamp_0_1(const Sk4f& value) {
|
||||
return Sk4f::Max(Sk4f::Min(value, Sk4f(1)), Sk4f(0));
|
||||
}
|
||||
|
||||
void SkColorMatrixFilter::filterSpan(const SkPMColor src[], int count, SkPMColor dst[]) const {
|
||||
@ -292,14 +276,16 @@ void SkColorMatrixFilter::filterSpan(const SkPMColor src[], int count, SkPMColor
|
||||
#endif
|
||||
|
||||
if (use_floats) {
|
||||
// c0-c3 are already in [0,1].
|
||||
const Sk4f c0 = Sk4f::Load(fTranspose + 0);
|
||||
const Sk4f c1 = Sk4f::Load(fTranspose + 4);
|
||||
const Sk4f c2 = Sk4f::Load(fTranspose + 8);
|
||||
const Sk4f c3 = Sk4f::Load(fTranspose + 12);
|
||||
const Sk4f c4 = Sk4f::Load(fTranspose + 16); // translates
|
||||
// c4 (the translate vector) is in [0, 255]. Bring it back to [0,1].
|
||||
const Sk4f c4 = Sk4f::Load(fTranspose + 16)*Sk4f(1.0f/255);
|
||||
|
||||
// todo: we could cache this in the constructor...
|
||||
SkPMColor matrix_translate_pmcolor = SkPMFloat(premul(clamp_0_255(c4))).roundClamp();
|
||||
SkPMColor matrix_translate_pmcolor = SkPMFloat(premul(clamp_0_1(c4))).round();
|
||||
|
||||
for (int i = 0; i < count; i++) {
|
||||
const SkPMColor src_c = src[i];
|
||||
@ -323,7 +309,7 @@ void SkColorMatrixFilter::filterSpan(const SkPMColor src[], int count, SkPMColor
|
||||
Sk4f dst4 = c0 * r4 + c1 * g4 + c2 * b4 + c3 * a4 + c4;
|
||||
|
||||
// clamp, re-premul, and write
|
||||
dst[i] = SkPMFloat(premul(clamp_0_255(dst4))).round();
|
||||
dst[i] = SkPMFloat(premul(clamp_0_1(dst4))).round();
|
||||
}
|
||||
} else {
|
||||
const State& state = fState;
|
||||
|
@ -1,84 +0,0 @@
|
||||
/*
|
||||
* Copyright 2015 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
namespace { // See SkPMFloat.h
|
||||
|
||||
// For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits
|
||||
// (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats.
|
||||
|
||||
// round() and roundClamp() do the opposite, working from floats to 8-bit-in-32-bit,
|
||||
// to 8-bit-in-16-bit, back down to 8-bit components.
|
||||
// _mm_packus_epi16() gives us clamping for free while narrowing.
|
||||
|
||||
inline SkPMFloat::SkPMFloat(SkPMColor c) {
|
||||
SkPMColorAssert(c);
|
||||
__m128i fix8 = _mm_set_epi32(0,0,0,c),
|
||||
fix8_16 = _mm_unpacklo_epi8 (fix8, _mm_setzero_si128()),
|
||||
fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());
|
||||
fVec = _mm_cvtepi32_ps(fix8_32);
|
||||
SkASSERT(this->isValid());
|
||||
}
|
||||
|
||||
inline SkPMColor SkPMFloat::round() const {
|
||||
return this->roundClamp(); // Haven't beaten this yet.
|
||||
}
|
||||
|
||||
inline SkPMColor SkPMFloat::roundClamp() const {
|
||||
// We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
|
||||
__m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fVec)),
|
||||
fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
|
||||
fix8 = _mm_packus_epi16(fix8_16, fix8_16);
|
||||
SkPMColor c = _mm_cvtsi128_si32(fix8);
|
||||
SkPMColorAssert(c);
|
||||
return c;
|
||||
}
|
||||
|
||||
inline SkPMColor SkPMFloat::trunc() const {
|
||||
// Basically, same as roundClamp(), but no rounding.
|
||||
__m128i fix8_32 = _mm_cvttps_epi32(fVec),
|
||||
fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
|
||||
fix8 = _mm_packus_epi16(fix8_16, fix8_16);
|
||||
SkPMColor c = _mm_cvtsi128_si32(fix8);
|
||||
SkPMColorAssert(c);
|
||||
return c;
|
||||
}
|
||||
|
||||
inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
|
||||
SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
|
||||
// Haven't beaten this yet.
|
||||
*a = FromPMColor(colors[0]);
|
||||
*b = FromPMColor(colors[1]);
|
||||
*c = FromPMColor(colors[2]);
|
||||
*d = FromPMColor(colors[3]);
|
||||
}
|
||||
|
||||
inline void SkPMFloat::RoundTo4PMColors(
|
||||
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
|
||||
SkPMColor colors[4]) {
|
||||
// Haven't beaten this yet.
|
||||
RoundClampTo4PMColors(a,b,c,d, colors);
|
||||
}
|
||||
|
||||
inline void SkPMFloat::RoundClampTo4PMColors(
|
||||
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
|
||||
SkPMColor colors[4]) {
|
||||
// Same as _SSSE3.h's. We use 3 _mm_packus_epi16() where the naive loop uses 8.
|
||||
// We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
|
||||
__m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), a.fVec)),
|
||||
c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), b.fVec)),
|
||||
c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), c.fVec)),
|
||||
c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), d.fVec));
|
||||
__m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
|
||||
_mm_packus_epi16(c2, c3));
|
||||
_mm_storeu_si128((__m128i*)colors, c3210);
|
||||
SkPMColorAssert(colors[0]);
|
||||
SkPMColorAssert(colors[1]);
|
||||
SkPMColorAssert(colors[2]);
|
||||
SkPMColorAssert(colors[3]);
|
||||
}
|
||||
|
||||
} // namespace
|
@ -1,87 +0,0 @@
|
||||
/*
|
||||
* Copyright 2015 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
namespace { // See SkPMFloat.h
|
||||
|
||||
// For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 32 bits
|
||||
// (fix8_32), then convert those to floats.
|
||||
|
||||
// round() does the opposite, working from floats to 8-bit-in-32-bits, then back to packed 8 bit.
|
||||
|
||||
// roundClamp() is the same as _SSE2: floats to 8-in-32, to 8-in-16, to packed 8 bit, with
|
||||
// _mm_packus_epi16() both clamping and narrowing.
|
||||
|
||||
inline SkPMFloat::SkPMFloat(SkPMColor c) {
|
||||
SkPMColorAssert(c);
|
||||
const int _ = 255; // _ means to zero that byte.
|
||||
__m128i fix8 = _mm_set_epi32(0,0,0,c),
|
||||
fix8_32 = _mm_shuffle_epi8(fix8, _mm_set_epi8(_,_,_,3, _,_,_,2, _,_,_,1, _,_,_,0));
|
||||
fVec = _mm_cvtepi32_ps(fix8_32);
|
||||
SkASSERT(this->isValid());
|
||||
}
|
||||
|
||||
inline SkPMColor SkPMFloat::trunc() const {
|
||||
const int _ = 255; // _ means to zero that byte.
|
||||
__m128i fix8_32 = _mm_cvttps_epi32(fVec),
|
||||
fix8 = _mm_shuffle_epi8(fix8_32, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 12,8,4,0));
|
||||
SkPMColor c = _mm_cvtsi128_si32(fix8);
|
||||
SkPMColorAssert(c);
|
||||
return c;
|
||||
}
|
||||
|
||||
inline SkPMColor SkPMFloat::round() const {
|
||||
return SkPMFloat(Sk4f(0.5f) + *this).trunc();
|
||||
}
|
||||
|
||||
inline SkPMColor SkPMFloat::roundClamp() const {
|
||||
// We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
|
||||
__m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fVec)),
|
||||
fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
|
||||
fix8 = _mm_packus_epi16(fix8_16, fix8_16);
|
||||
SkPMColor c = _mm_cvtsi128_si32(fix8);
|
||||
SkPMColorAssert(c);
|
||||
return c;
|
||||
}
|
||||
|
||||
inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
|
||||
SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
|
||||
// Haven't beaten this yet.
|
||||
*a = FromPMColor(colors[0]);
|
||||
*b = FromPMColor(colors[1]);
|
||||
*c = FromPMColor(colors[2]);
|
||||
*d = FromPMColor(colors[3]);
|
||||
}
|
||||
|
||||
inline void SkPMFloat::RoundTo4PMColors(
|
||||
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
|
||||
SkPMColor colors[4]) {
|
||||
// Haven't beaten this yet. Still faster than RoundClampTo4PMColors?
|
||||
colors[0] = a.round();
|
||||
colors[1] = b.round();
|
||||
colors[2] = c.round();
|
||||
colors[3] = d.round();
|
||||
}
|
||||
|
||||
inline void SkPMFloat::RoundClampTo4PMColors(
|
||||
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
|
||||
SkPMColor colors[4]) {
|
||||
// Same as _SSE2.h's. We use 3 _mm_packus_epi16() where the naive loop uses 8.
|
||||
// We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
|
||||
__m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), a.fVec)),
|
||||
c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), b.fVec)),
|
||||
c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), c.fVec)),
|
||||
c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), d.fVec));
|
||||
__m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
|
||||
_mm_packus_epi16(c2, c3));
|
||||
_mm_storeu_si128((__m128i*)colors, c3210);
|
||||
SkPMColorAssert(colors[0]);
|
||||
SkPMColorAssert(colors[1]);
|
||||
SkPMColorAssert(colors[2]);
|
||||
SkPMColorAssert(colors[3]);
|
||||
}
|
||||
|
||||
} // namespace
|
@ -7,70 +7,24 @@
|
||||
|
||||
namespace { // See SkPMFloat.h
|
||||
|
||||
// For SkPMFloat(SkPMFColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits
|
||||
// (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats.
|
||||
|
||||
// round() and roundClamp() do the opposite, working from floats to 8-bit-in-32-bit,
|
||||
// to 8-bit-in-16-bit, back down to 8-bit components.
|
||||
// roundClamp() uses vqmovn to clamp while narrowing instead of just narrowing with vmovn.
|
||||
|
||||
inline SkPMFloat::SkPMFloat(SkPMColor c) {
|
||||
SkPMColorAssert(c);
|
||||
uint8x8_t fix8 = (uint8x8_t)vdup_n_u32(c);
|
||||
uint16x8_t fix8_16 = vmovl_u8(fix8);
|
||||
uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16));
|
||||
fVec = vcvtq_f32_u32(fix8_32);
|
||||
fVec = vcvtq_n_f32_u32(fix8_32, 8);
|
||||
SkASSERT(this->isValid());
|
||||
}
|
||||
|
||||
inline SkPMColor SkPMFloat::trunc() const {
|
||||
uint32x4_t fix8_32 = vcvtq_u32_f32(fVec); // vcvtq_u32_f32 truncates
|
||||
uint16x4_t fix8_16 = vmovn_u32(fix8_32);
|
||||
uint8x8_t fix8 = vmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
|
||||
SkPMColor c = vget_lane_u32((uint32x2_t)fix8, 0);
|
||||
SkPMColorAssert(c);
|
||||
return c;
|
||||
}
|
||||
|
||||
inline SkPMColor SkPMFloat::round() const {
|
||||
return SkPMFloat(Sk4f(0.5f) + *this).trunc();
|
||||
}
|
||||
|
||||
inline SkPMColor SkPMFloat::roundClamp() const {
|
||||
float32x4_t add_half = vaddq_f32(fVec, vdupq_n_f32(0.5f));
|
||||
uint32x4_t fix8_32 = vcvtq_u32_f32(add_half); // vcvtq_u32_f32 truncates, so round manually
|
||||
uint16x4_t fix8_16 = vqmovn_u32(fix8_32);
|
||||
uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
|
||||
// vcvtq_n_u32_f32 truncates, so we round manually by adding a half before converting.
|
||||
float32x4_t rounded = vaddq_f32(fVec, vdupq_n_f32(0.5f/255));
|
||||
uint32x4_t fix8_32 = vcvtq_n_u32_f32(rounded, 8);
|
||||
uint16x4_t fix8_16 = vqmovn_u32(fix8_32);
|
||||
uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
|
||||
SkPMColor c = vget_lane_u32((uint32x2_t)fix8, 0);
|
||||
SkPMColorAssert(c);
|
||||
return c;
|
||||
}
|
||||
|
||||
// TODO: we should be able to beat these loops on all three methods.
|
||||
inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
|
||||
SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
|
||||
*a = FromPMColor(colors[0]);
|
||||
*b = FromPMColor(colors[1]);
|
||||
*c = FromPMColor(colors[2]);
|
||||
*d = FromPMColor(colors[3]);
|
||||
}
|
||||
|
||||
inline void SkPMFloat::RoundTo4PMColors(
|
||||
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
|
||||
SkPMColor colors[4]) {
|
||||
colors[0] = a.round();
|
||||
colors[1] = b.round();
|
||||
colors[2] = c.round();
|
||||
colors[3] = d.round();
|
||||
}
|
||||
|
||||
inline void SkPMFloat::RoundClampTo4PMColors(
|
||||
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
|
||||
SkPMColor colors[4]) {
|
||||
colors[0] = a.roundClamp();
|
||||
colors[1] = b.roundClamp();
|
||||
colors[2] = c.roundClamp();
|
||||
colors[3] = d.roundClamp();
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
@ -8,61 +8,26 @@
|
||||
namespace { // See SkPMFloat.h
|
||||
|
||||
inline SkPMFloat::SkPMFloat(SkPMColor c) {
|
||||
*this = SkPMFloat::FromARGB(SkGetPackedA32(c),
|
||||
SkGetPackedR32(c),
|
||||
SkGetPackedG32(c),
|
||||
SkGetPackedB32(c));
|
||||
float inv255 = 1.0f/255;
|
||||
*this = SkPMFloat::FromARGB(SkGetPackedA32(c) * inv255,
|
||||
SkGetPackedR32(c) * inv255,
|
||||
SkGetPackedG32(c) * inv255,
|
||||
SkGetPackedB32(c) * inv255);
|
||||
SkASSERT(this->isValid());
|
||||
}
|
||||
|
||||
inline SkPMColor SkPMFloat::trunc() const {
|
||||
return SkPackARGB32(this->a(), this->r(), this->g(), this->b());
|
||||
}
|
||||
|
||||
inline SkPMColor SkPMFloat::round() const {
|
||||
SkPMColor c = SkPackARGB32(this->a()+0.5f, this->r()+0.5f, this->g()+0.5f, this->b()+0.5f);
|
||||
SkPMColorAssert(c);
|
||||
return c;
|
||||
}
|
||||
|
||||
inline SkPMColor SkPMFloat::roundClamp() const {
|
||||
float a = this->a(),
|
||||
r = this->r(),
|
||||
g = this->g(),
|
||||
b = this->b();
|
||||
a = a < 0 ? 0 : (a > 255 ? 255 : a);
|
||||
r = r < 0 ? 0 : (r > 255 ? 255 : r);
|
||||
g = g < 0 ? 0 : (g > 255 ? 255 : g);
|
||||
b = b < 0 ? 0 : (b > 255 ? 255 : b);
|
||||
SkPMColor c = SkPackARGB32(a+0.5f, r+0.5f, g+0.5f, b+0.5f);
|
||||
a = a < 0 ? 0 : (a > 1 ? 1 : a);
|
||||
r = r < 0 ? 0 : (r > 1 ? 1 : r);
|
||||
g = g < 0 ? 0 : (g > 1 ? 1 : g);
|
||||
b = b < 0 ? 0 : (b > 1 ? 1 : b);
|
||||
SkPMColor c = SkPackARGB32(255*a+0.5f, 255*r+0.5f, 255*g+0.5f, 255*b+0.5f);
|
||||
SkPMColorAssert(c);
|
||||
return c;
|
||||
}
|
||||
|
||||
inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
|
||||
SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
|
||||
*a = FromPMColor(colors[0]);
|
||||
*b = FromPMColor(colors[1]);
|
||||
*c = FromPMColor(colors[2]);
|
||||
*d = FromPMColor(colors[3]);
|
||||
}
|
||||
|
||||
inline void SkPMFloat::RoundTo4PMColors(
|
||||
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
|
||||
SkPMColor colors[4]) {
|
||||
colors[0] = a.round();
|
||||
colors[1] = b.round();
|
||||
colors[2] = c.round();
|
||||
colors[3] = d.round();
|
||||
}
|
||||
|
||||
inline void SkPMFloat::RoundClampTo4PMColors(
|
||||
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
|
||||
SkPMColor colors[4]) {
|
||||
colors[0] = a.roundClamp();
|
||||
colors[1] = b.roundClamp();
|
||||
colors[2] = c.roundClamp();
|
||||
colors[3] = d.roundClamp();
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
36
src/opts/SkPMFloat_sse.h
Normal file
36
src/opts/SkPMFloat_sse.h
Normal file
@ -0,0 +1,36 @@
|
||||
/*
|
||||
* Copyright 2015 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
namespace { // See SkPMFloat.h
|
||||
|
||||
// Builds an SkPMFloat from a packed premultiplied color: every 8-bit component is
// widened into its own 32-bit lane, converted to float, and scaled by 1/255.
inline SkPMFloat::SkPMFloat(SkPMColor c) {
    SkPMColorAssert(c);

    // Place the packed color in the low 32 bits of a vector register.
    const __m128i packed = _mm_cvtsi32_si128(static_cast<int>(c));

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    // A single byte shuffle spreads byte k into lane k; index Z zeroes that byte.
    const int Z = 255;
    const __m128i lanes =
            _mm_shuffle_epi8(packed, _mm_setr_epi8(0,Z,Z,Z, 1,Z,Z,Z, 2,Z,Z,Z, 3,Z,Z,Z));
#else
    // Without SSSE3, widen in two zero-extending steps: 8 -> 16 bits, then 16 -> 32 bits.
    const __m128i zero  = _mm_setzero_si128();
    const __m128i lanes = _mm_unpacklo_epi16(_mm_unpacklo_epi8(packed, zero), zero);
#endif

    const __m128 as_float = _mm_cvtepi32_ps(lanes);
    fVec = _mm_mul_ps(as_float, _mm_set1_ps(1.0f / 255));
    SkASSERT(this->isValid());
}
|
||||
|
||||
// Converts back to a packed SkPMColor: each component is scaled by 255, rounded to the
// nearest integer, and clamped to [0, 255] if it falls out of range.
inline SkPMColor SkPMFloat::round() const {
    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
    __m128  scaled  = _mm_mul_ps(_mm_set1_ps(255), fVec);
    __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), scaled)),
            // Narrow 32 -> 16 bits with signed saturation first.  Handing the 32-bit lanes
            // straight to _mm_packus_epi16 would read each lane as two independent signed
            // 16-bit values, so a lane in [32768, 65535] would look negative and clamp to 0
            // instead of 255.
            fix8_16 = _mm_packs_epi32(fix8_32, fix8_32),
            // Narrow 16 -> 8 bits with unsigned saturation: this is the clamp to [0, 255].
            fix8    = _mm_packus_epi16(fix8_16, fix8_16);
    SkPMColor c = _mm_cvtsi128_si32(fix8);
    SkPMColorAssert(c);
    return c;
}
|
||||
|
||||
} // namespace
|
@ -12,47 +12,27 @@ DEF_TEST(SkPMFloat, r) {
|
||||
// Test SkPMColor <-> SkPMFloat
|
||||
SkPMColor c = SkPreMultiplyColor(0xFFCC9933);
|
||||
SkPMFloat pmf(c);
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, pmf.a()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual(204.0f, pmf.r()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, pmf.g()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual( 51.0f, pmf.b()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, 255*pmf.a()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual(204.0f, 255*pmf.r()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, 255*pmf.g()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual( 51.0f, 255*pmf.b()));
|
||||
REPORTER_ASSERT(r, c == pmf.round());
|
||||
|
||||
// Test rounding.
|
||||
pmf = SkPMFloat(254.5f, 203.5f, 153.1f, 50.8f);
|
||||
pmf = SkPMFloat(254.5f/255, 203.5f/255, 153.1f/255, 50.8f/255);
|
||||
REPORTER_ASSERT(r, c == pmf.round());
|
||||
|
||||
pmf = SkPMFloat(255.9f, 204.01f, 153.0f, -0.9f);
|
||||
REPORTER_ASSERT(r, SkPreMultiplyColor(0xFFCC9900) == pmf.trunc());
|
||||
|
||||
// Test clamping.
|
||||
SkPMFloat clamped(SkPMFloat(510.0f, 153.0f, 1.0f, -0.2f).roundClamp());
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, clamped.a()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, clamped.r()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual( 1.0f, clamped.g()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.0f, clamped.b()));
|
||||
SkPMFloat clamped(SkPMFloat(510.0f/255, 153.0f/255, 1.0f/255, -0.2f/255).round());
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, 255*clamped.a()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, 255*clamped.r()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual( 1.0f, 255*clamped.g()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.0f, 255*clamped.b()));
|
||||
|
||||
// Test SkPMFloat <-> Sk4f conversion.
|
||||
Sk4f fs = clamped;
|
||||
SkPMFloat scaled = fs * Sk4f(0.25f);
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual(63.75f, scaled.a()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual(38.25f, scaled.r()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.25f, scaled.g()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.00f, scaled.b()));
|
||||
|
||||
// Test 4-at-a-time conversions.
|
||||
SkPMColor colors[4] = { 0xFF000000, 0xFFFF0000, 0xFF00FF00, 0xFF0000FF };
|
||||
SkPMFloat floats[4];
|
||||
SkPMFloat::From4PMColors(colors, floats+0, floats+1, floats+2, floats+3);
|
||||
|
||||
SkPMColor back[4];
|
||||
SkPMFloat::RoundTo4PMColors(floats[0], floats[1], floats[2], floats[3], back);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
REPORTER_ASSERT(r, back[i] == colors[i]);
|
||||
}
|
||||
|
||||
SkPMFloat::RoundClampTo4PMColors(floats[0], floats[1], floats[2], floats[3], back);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
REPORTER_ASSERT(r, back[i] == colors[i]);
|
||||
}
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual(63.75f, 255*scaled.a()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual(38.25f, 255*scaled.r()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.25f, 255*scaled.g()));
|
||||
REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.00f, 255*scaled.b()));
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user