Convert SkPMFloat to [0,1] range and prune its API.

Now that Sk4px exists, there's a lot less sense in eking out every
cycle of speed from SkPMFloat: if we need to go _really_ fast, we
should use Sk4px.  SkPMFloat's going to be used for things that are
already slow: large-range intermediates, divides, sqrts, etc.

A [0,1] range is easier to work with, and can even be faster if we
eliminate enough *255 and *1/255 steps.  This is particularly true
on ARM, where NEON can do the *255 and /255 steps for us while
converting float<->int.

We have lots of experimental SkPMFloat <-> SkPMColor APIs that
I'm now removing.  Of the existing APIs, roundClamp() is the sanest,
so I've kept only that, now called round().  The 4-at-a-time APIs
never panned out, so they're gone.

There will be small diffs on:
colormatrix coloremoji colorfilterimagefilter fadefilter imagefilters_xfermodes imagefilterscropexpand imagefiltersgraph tileimagefilter

BUG=skia:

Review URL: https://codereview.chromium.org/1201343004
This commit is contained in:
mtklein 2015-06-25 08:56:28 -07:00 committed by Commit bot
parent 538bacb4bb
commit e9a3e3c17a
9 changed files with 100 additions and 417 deletions

View File

@ -20,20 +20,10 @@ static uint32_t lcg_rand(uint32_t* seed) {
}
// I'm having better luck getting these to constant-propagate away as template parameters.
template <bool kClamp, bool kWide>
struct PMFloatGetSetBench : public Benchmark {
PMFloatGetSetBench() {}
struct PMFloatRoundtripBench : public Benchmark {
PMFloatRoundtripBench() {}
const char* onGetName() override {
switch (kClamp << 1 | kWide) {
case 0: return "SkPMFloat_get_1x";
case 1: return "SkPMFloat_get_4x";
case 2: return "SkPMFloat_clamp_1x";
case 3: return "SkPMFloat_clamp_4x";
}
SkFAIL("unreachable");
return "oh bother";
}
const char* onGetName() override { return "SkPMFloat_roundtrip"; }
bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
void onDraw(const int loops, SkCanvas* canvas) override {
@ -41,61 +31,23 @@ struct PMFloatGetSetBench : public Benchmark {
uint32_t junk = 0;
uint32_t seed = 0;
for (int i = 0; i < loops; i++) {
SkPMColor colors[4];
SkPMColor color;
#ifdef SK_DEBUG
for (int i = 0; i < 4; i++) {
// Our SkASSERTs will remind us that it's technically required that we premultiply.
colors[i] = SkPreMultiplyColor(lcg_rand(&seed));
}
color = SkPreMultiplyColor(lcg_rand(&seed));
#else
// But it's a lot faster not to, and this code won't really mind the non-PM colors.
(void)lcg_rand(&seed);
colors[0] = seed + 0;
colors[1] = seed + 1;
colors[2] = seed + 2;
colors[3] = seed + 3;
color = lcg_rand(&seed);
#endif
SkPMFloat fa,fb,fc,fd;
if (kWide) {
SkPMFloat::From4PMColors(colors, &fa, &fb, &fc, &fd);
} else {
fa = SkPMFloat::FromPMColor(colors[0]);
fb = SkPMFloat::FromPMColor(colors[1]);
fc = SkPMFloat::FromPMColor(colors[2]);
fd = SkPMFloat::FromPMColor(colors[3]);
}
SkPMColor back[4];
switch (kClamp << 1 | kWide) {
case 0: {
back[0] = fa.round();
back[1] = fb.round();
back[2] = fc.round();
back[3] = fd.round();
} break;
case 1: SkPMFloat::RoundTo4PMColors(fa, fb, fc, fd, back); break;
case 2: {
back[0] = fa.roundClamp();
back[1] = fb.roundClamp();
back[2] = fc.roundClamp();
back[3] = fd.roundClamp();
} break;
case 3: SkPMFloat::RoundClampTo4PMColors(fa, fb, fc, fd, back); break;
}
for (int i = 0; i < 4; i++) {
junk ^= back[i];
}
auto f = SkPMFloat::FromPMColor(color);
SkPMColor back = f.round();
junk ^= back;
}
blackhole ^= junk;
}
};
// Extra () help DEF_BENCH not get confused by the comma inside the <>.
DEF_BENCH(return (new PMFloatGetSetBench< true, true>);)
DEF_BENCH(return (new PMFloatGetSetBench<false, true>);)
DEF_BENCH(return (new PMFloatGetSetBench< true, false>);)
DEF_BENCH(return (new PMFloatGetSetBench<false, false>);)
DEF_BENCH(return new PMFloatRoundtripBench;)
struct PMFloatGradientBench : public Benchmark {
const char* onGetName() override { return "PMFloat_gradient"; }
@ -103,8 +55,8 @@ struct PMFloatGradientBench : public Benchmark {
SkPMColor fDevice[100];
void onDraw(const int loops, SkCanvas*) override {
Sk4f c0 = SkPMFloat::FromARGB(255, 255, 0, 0),
c1 = SkPMFloat::FromARGB(255, 0, 0, 255),
Sk4f c0 = SkPMFloat::FromARGB(1, 1, 0, 0),
c1 = SkPMFloat::FromARGB(1, 0, 0, 1),
dc = c1 - c0,
fx(0.1f),
dx(0.002f),
@ -112,15 +64,15 @@ struct PMFloatGradientBench : public Benchmark {
dcdx4(dcdx+dcdx+dcdx+dcdx);
for (int n = 0; n < loops; n++) {
Sk4f a = c0 + dc*fx + Sk4f(0.5f), // The +0.5f lets us call trunc() instead of get().
Sk4f a = c0 + dc*fx,
b = a + dcdx,
c = b + dcdx,
d = c + dcdx;
for (size_t i = 0; i < SK_ARRAY_COUNT(fDevice); i += 4) {
fDevice[i+0] = SkPMFloat(a).trunc();
fDevice[i+1] = SkPMFloat(b).trunc();
fDevice[i+2] = SkPMFloat(c).trunc();
fDevice[i+3] = SkPMFloat(d).trunc();
fDevice[i+0] = SkPMFloat(a).round();
fDevice[i+1] = SkPMFloat(b).round();
fDevice[i+2] = SkPMFloat(c).round();
fDevice[i+3] = SkPMFloat(d).round();
a = a + dcdx4;
b = b + dcdx4;
c = c + dcdx4;

View File

@ -21,15 +21,12 @@
namespace {
// A pre-multiplied color storing each component in the same order as SkPMColor,
// but as a float in the range [0, 255].
// but as a float in the range [0, 1].
class SkPMFloat : public Sk4f {
public:
static SkPMFloat FromPMColor(SkPMColor c) { return SkPMFloat(c); }
static SkPMFloat FromARGB(float a, float r, float g, float b) { return SkPMFloat(a,r,g,b); }
// May be more efficient than one at a time. No special alignment assumed for SkPMColors.
static void From4PMColors(const SkPMColor[4], SkPMFloat*, SkPMFloat*, SkPMFloat*, SkPMFloat*);
// Uninitialized.
SkPMFloat() {}
explicit SkPMFloat(SkPMColor);
@ -47,24 +44,10 @@ public:
float g() const { return this->kth<SK_G32_SHIFT / 8>(); }
float b() const { return this->kth<SK_B32_SHIFT / 8>(); }
// N.B. All methods returning an SkPMColor call SkPMColorAssert on that result before returning.
// round() and roundClamp() round component values to the nearest integer.
SkPMColor round() const; // Assumes all values in [0, 255]. Some implementations may clamp.
SkPMColor roundClamp() const; // Will clamp all values to [0, 255].
// Like round(), but truncates instead of rounding.
// The domain of this function is (-1.0f, 256.0f). Values in (-1.0f, 0.0f] trunc to a zero.
SkPMColor trunc() const;
// 4-at-a-time versions of round() and roundClamp(). Like From4PMColors(), no alignment assumed.
static void RoundTo4PMColors(
const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, SkPMColor[4]);
static void RoundClampTo4PMColors(
const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, SkPMColor[4]);
SkPMColor round() const; // Rounds from [0.0f, 1.0f] to [0, 255], clamping if out of range.
bool isValid() const {
return this->a() >= 0 && this->a() <= 255
return this->a() >= 0 && this->a() <= 1
&& this->r() >= 0 && this->r() <= this->a()
&& this->g() >= 0 && this->g() <= this->a()
&& this->b() >= 0 && this->b() <= this->a();
@ -80,10 +63,8 @@ private:
// Platform implementations of SkPMFloat assume Sk4f uses SSE or NEON. _none is generic.
#include "../opts/SkPMFloat_none.h"
#else
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
#include "../opts/SkPMFloat_SSSE3.h"
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
#include "../opts/SkPMFloat_SSE2.h"
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
#include "../opts/SkPMFloat_sse.h"
#elif defined(SK_ARM_HAS_NEON)
#include "../opts/SkPMFloat_neon.h"
#else

View File

@ -239,25 +239,9 @@ uint32_t SkColorMatrixFilter::getFlags() const {
return this->INHERITED::getFlags() | fFlags;
}
/**
* Need inv255 = 1 / 255 as a constant, so when we premul a SkPMFloat, we can do this
*
* new_red = old_red * alpha * inv255
*
* instead of (much slower)
*
* new_red = old_red * alpha / 255
*
* However, 1.0f/255 comes to (in hex) 0x3B808081, which is slightly bigger than the "actual"
* value of 0x3B808080(repeat 80)... This slightly too-big value can cause us to compute
* new_red > alpha, which is a problem (for valid premul). To fix this, we use a
* hand-computed value of 0x3B808080, 1 ULP smaller. This keeps our colors valid.
*/
static const float gInv255 = 0.0039215683f; // (1.0f / 255) - ULP == SkBits2Float(0x3B808080)
static Sk4f premul(const Sk4f& x) {
float scale = SkPMFloat(x).a() * gInv255;
Sk4f pm = x * Sk4f(scale, scale, scale, 1);
float scale = SkPMFloat(x).a();
Sk4f pm = x * SkPMFloat(1, scale, scale, scale);
#ifdef SK_DEBUG
SkPMFloat pmf(pm);
@ -268,12 +252,12 @@ static Sk4f premul(const Sk4f& x) {
}
static Sk4f unpremul(const SkPMFloat& pm) {
float scale = 255 / pm.a(); // candidate for fast/approx invert?
return pm * Sk4f(scale, scale, scale, 1);
float scale = 1 / pm.a(); // candidate for fast/approx invert?
return pm * SkPMFloat(1, scale, scale, scale);
}
static Sk4f clamp_0_255(const Sk4f& value) {
return Sk4f::Max(Sk4f::Min(value, Sk4f(255)), Sk4f(0));
static Sk4f clamp_0_1(const Sk4f& value) {
return Sk4f::Max(Sk4f::Min(value, Sk4f(1)), Sk4f(0));
}
void SkColorMatrixFilter::filterSpan(const SkPMColor src[], int count, SkPMColor dst[]) const {
@ -292,14 +276,16 @@ void SkColorMatrixFilter::filterSpan(const SkPMColor src[], int count, SkPMColor
#endif
if (use_floats) {
// c0-c3 are already in [0,1].
const Sk4f c0 = Sk4f::Load(fTranspose + 0);
const Sk4f c1 = Sk4f::Load(fTranspose + 4);
const Sk4f c2 = Sk4f::Load(fTranspose + 8);
const Sk4f c3 = Sk4f::Load(fTranspose + 12);
const Sk4f c4 = Sk4f::Load(fTranspose + 16); // translates
// c4 (the translate vector) is in [0, 255]. Bring it back to [0,1].
const Sk4f c4 = Sk4f::Load(fTranspose + 16)*Sk4f(1.0f/255);
// todo: we could cache this in the constructor...
SkPMColor matrix_translate_pmcolor = SkPMFloat(premul(clamp_0_255(c4))).roundClamp();
SkPMColor matrix_translate_pmcolor = SkPMFloat(premul(clamp_0_1(c4))).round();
for (int i = 0; i < count; i++) {
const SkPMColor src_c = src[i];
@ -323,7 +309,7 @@ void SkColorMatrixFilter::filterSpan(const SkPMColor src[], int count, SkPMColor
Sk4f dst4 = c0 * r4 + c1 * g4 + c2 * b4 + c3 * a4 + c4;
// clamp, re-premul, and write
dst[i] = SkPMFloat(premul(clamp_0_255(dst4))).round();
dst[i] = SkPMFloat(premul(clamp_0_1(dst4))).round();
}
} else {
const State& state = fState;

View File

@ -1,84 +0,0 @@
/*
* Copyright 2015 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
namespace { // See SkPMFloat.h
// For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits
// (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats.
// round() and roundClamp() do the opposite, working from floats to 8-bit-in-32-bit,
// to 8-bit-in-16-bit, back down to 8-bit components.
// _mm_packus_epi16() gives us clamping for free while narrowing.
inline SkPMFloat::SkPMFloat(SkPMColor c) {
SkPMColorAssert(c);
__m128i fix8 = _mm_set_epi32(0,0,0,c),
fix8_16 = _mm_unpacklo_epi8 (fix8, _mm_setzero_si128()),
fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());
fVec = _mm_cvtepi32_ps(fix8_32);
SkASSERT(this->isValid());
}
inline SkPMColor SkPMFloat::round() const {
return this->roundClamp(); // Haven't beaten this yet.
}
inline SkPMColor SkPMFloat::roundClamp() const {
// We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
__m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fVec)),
fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
fix8 = _mm_packus_epi16(fix8_16, fix8_16);
SkPMColor c = _mm_cvtsi128_si32(fix8);
SkPMColorAssert(c);
return c;
}
inline SkPMColor SkPMFloat::trunc() const {
// Basically, same as roundClamp(), but no rounding.
__m128i fix8_32 = _mm_cvttps_epi32(fVec),
fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
fix8 = _mm_packus_epi16(fix8_16, fix8_16);
SkPMColor c = _mm_cvtsi128_si32(fix8);
SkPMColorAssert(c);
return c;
}
inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
// Haven't beaten this yet.
*a = FromPMColor(colors[0]);
*b = FromPMColor(colors[1]);
*c = FromPMColor(colors[2]);
*d = FromPMColor(colors[3]);
}
inline void SkPMFloat::RoundTo4PMColors(
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
SkPMColor colors[4]) {
// Haven't beaten this yet.
RoundClampTo4PMColors(a,b,c,d, colors);
}
inline void SkPMFloat::RoundClampTo4PMColors(
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
SkPMColor colors[4]) {
// Same as _SSSE3.h's. We use 3 _mm_packus_epi16() where the naive loop uses 8.
// We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
__m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), a.fVec)),
c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), b.fVec)),
c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), c.fVec)),
c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), d.fVec));
__m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
_mm_packus_epi16(c2, c3));
_mm_storeu_si128((__m128i*)colors, c3210);
SkPMColorAssert(colors[0]);
SkPMColorAssert(colors[1]);
SkPMColorAssert(colors[2]);
SkPMColorAssert(colors[3]);
}
} // namespace

View File

@ -1,87 +0,0 @@
/*
* Copyright 2015 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
namespace { // See SkPMFloat.h
// For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 32 bits
// (fix8_32), then convert those to floats.
// round() does the opposite, working from floats to 8-bit-in-32-bits, then back to packed 8 bit.
// roundClamp() is the same as _SSE2: floats to 8-in-32, to 8-in-16, to packed 8 bit, with
// _mm_packus_epi16() both clamping and narrowing.
inline SkPMFloat::SkPMFloat(SkPMColor c) {
SkPMColorAssert(c);
const int _ = 255; // _ means to zero that byte.
__m128i fix8 = _mm_set_epi32(0,0,0,c),
fix8_32 = _mm_shuffle_epi8(fix8, _mm_set_epi8(_,_,_,3, _,_,_,2, _,_,_,1, _,_,_,0));
fVec = _mm_cvtepi32_ps(fix8_32);
SkASSERT(this->isValid());
}
inline SkPMColor SkPMFloat::trunc() const {
const int _ = 255; // _ means to zero that byte.
__m128i fix8_32 = _mm_cvttps_epi32(fVec),
fix8 = _mm_shuffle_epi8(fix8_32, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 12,8,4,0));
SkPMColor c = _mm_cvtsi128_si32(fix8);
SkPMColorAssert(c);
return c;
}
inline SkPMColor SkPMFloat::round() const {
return SkPMFloat(Sk4f(0.5f) + *this).trunc();
}
inline SkPMColor SkPMFloat::roundClamp() const {
// We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
__m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fVec)),
fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
fix8 = _mm_packus_epi16(fix8_16, fix8_16);
SkPMColor c = _mm_cvtsi128_si32(fix8);
SkPMColorAssert(c);
return c;
}
inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
// Haven't beaten this yet.
*a = FromPMColor(colors[0]);
*b = FromPMColor(colors[1]);
*c = FromPMColor(colors[2]);
*d = FromPMColor(colors[3]);
}
inline void SkPMFloat::RoundTo4PMColors(
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
SkPMColor colors[4]) {
// Haven't beaten this yet. Still faster than RoundClampTo4PMColors?
colors[0] = a.round();
colors[1] = b.round();
colors[2] = c.round();
colors[3] = d.round();
}
inline void SkPMFloat::RoundClampTo4PMColors(
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
SkPMColor colors[4]) {
// Same as _SSE2.h's. We use 3 _mm_packus_epi16() where the naive loop uses 8.
// We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
__m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), a.fVec)),
c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), b.fVec)),
c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), c.fVec)),
c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), d.fVec));
__m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
_mm_packus_epi16(c2, c3));
_mm_storeu_si128((__m128i*)colors, c3210);
SkPMColorAssert(colors[0]);
SkPMColorAssert(colors[1]);
SkPMColorAssert(colors[2]);
SkPMColorAssert(colors[3]);
}
} // namespace

View File

@ -7,38 +7,19 @@
namespace { // See SkPMFloat.h
// For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits
// (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats.
// round() and roundClamp() do the opposite, working from floats to 8-bit-in-32-bit,
// to 8-bit-in-16-bit, back down to 8-bit components.
// roundClamp() uses vqmovn to clamp while narrowing instead of just narrowing with vmovn.
inline SkPMFloat::SkPMFloat(SkPMColor c) {
SkPMColorAssert(c);
uint8x8_t fix8 = (uint8x8_t)vdup_n_u32(c);
uint16x8_t fix8_16 = vmovl_u8(fix8);
uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16));
fVec = vcvtq_f32_u32(fix8_32);
fVec = vcvtq_n_f32_u32(fix8_32, 8);
SkASSERT(this->isValid());
}
inline SkPMColor SkPMFloat::trunc() const {
uint32x4_t fix8_32 = vcvtq_u32_f32(fVec); // vcvtq_u32_f32 truncates
uint16x4_t fix8_16 = vmovn_u32(fix8_32);
uint8x8_t fix8 = vmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
SkPMColor c = vget_lane_u32((uint32x2_t)fix8, 0);
SkPMColorAssert(c);
return c;
}
inline SkPMColor SkPMFloat::round() const {
return SkPMFloat(Sk4f(0.5f) + *this).trunc();
}
inline SkPMColor SkPMFloat::roundClamp() const {
float32x4_t add_half = vaddq_f32(fVec, vdupq_n_f32(0.5f));
uint32x4_t fix8_32 = vcvtq_u32_f32(add_half); // vcvtq_u32_f32 truncates, so round manually
// vcvtq_n_u32_f32 truncates, so we round manually by adding a half before converting.
float32x4_t rounded = vaddq_f32(fVec, vdupq_n_f32(0.5f/255));
uint32x4_t fix8_32 = vcvtq_n_u32_f32(rounded, 8);
uint16x4_t fix8_16 = vqmovn_u32(fix8_32);
uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
SkPMColor c = vget_lane_u32((uint32x2_t)fix8, 0);
@ -46,31 +27,4 @@ inline SkPMColor SkPMFloat::roundClamp() const {
return c;
}
// TODO: we should be able to beat these loops on all three methods.
inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
*a = FromPMColor(colors[0]);
*b = FromPMColor(colors[1]);
*c = FromPMColor(colors[2]);
*d = FromPMColor(colors[3]);
}
inline void SkPMFloat::RoundTo4PMColors(
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
SkPMColor colors[4]) {
colors[0] = a.round();
colors[1] = b.round();
colors[2] = c.round();
colors[3] = d.round();
}
inline void SkPMFloat::RoundClampTo4PMColors(
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
SkPMColor colors[4]) {
colors[0] = a.roundClamp();
colors[1] = b.roundClamp();
colors[2] = c.roundClamp();
colors[3] = d.roundClamp();
}
} // namespace

View File

@ -8,61 +8,26 @@
namespace { // See SkPMFloat.h
inline SkPMFloat::SkPMFloat(SkPMColor c) {
*this = SkPMFloat::FromARGB(SkGetPackedA32(c),
SkGetPackedR32(c),
SkGetPackedG32(c),
SkGetPackedB32(c));
float inv255 = 1.0f/255;
*this = SkPMFloat::FromARGB(SkGetPackedA32(c) * inv255,
SkGetPackedR32(c) * inv255,
SkGetPackedG32(c) * inv255,
SkGetPackedB32(c) * inv255);
SkASSERT(this->isValid());
}
inline SkPMColor SkPMFloat::trunc() const {
return SkPackARGB32(this->a(), this->r(), this->g(), this->b());
}
inline SkPMColor SkPMFloat::round() const {
SkPMColor c = SkPackARGB32(this->a()+0.5f, this->r()+0.5f, this->g()+0.5f, this->b()+0.5f);
SkPMColorAssert(c);
return c;
}
inline SkPMColor SkPMFloat::roundClamp() const {
float a = this->a(),
r = this->r(),
g = this->g(),
b = this->b();
a = a < 0 ? 0 : (a > 255 ? 255 : a);
r = r < 0 ? 0 : (r > 255 ? 255 : r);
g = g < 0 ? 0 : (g > 255 ? 255 : g);
b = b < 0 ? 0 : (b > 255 ? 255 : b);
SkPMColor c = SkPackARGB32(a+0.5f, r+0.5f, g+0.5f, b+0.5f);
a = a < 0 ? 0 : (a > 1 ? 1 : a);
r = r < 0 ? 0 : (r > 1 ? 1 : r);
g = g < 0 ? 0 : (g > 1 ? 1 : g);
b = b < 0 ? 0 : (b > 1 ? 1 : b);
SkPMColor c = SkPackARGB32(255*a+0.5f, 255*r+0.5f, 255*g+0.5f, 255*b+0.5f);
SkPMColorAssert(c);
return c;
}
inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
*a = FromPMColor(colors[0]);
*b = FromPMColor(colors[1]);
*c = FromPMColor(colors[2]);
*d = FromPMColor(colors[3]);
}
inline void SkPMFloat::RoundTo4PMColors(
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
SkPMColor colors[4]) {
colors[0] = a.round();
colors[1] = b.round();
colors[2] = c.round();
colors[3] = d.round();
}
inline void SkPMFloat::RoundClampTo4PMColors(
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
SkPMColor colors[4]) {
colors[0] = a.roundClamp();
colors[1] = b.roundClamp();
colors[2] = c.roundClamp();
colors[3] = d.roundClamp();
}
} // namespace

36
src/opts/SkPMFloat_sse.h Normal file
View File

@ -0,0 +1,36 @@
/*
* Copyright 2015 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
namespace { // See SkPMFloat.h
// Builds an SkPMFloat from a packed SkPMColor: each 8-bit channel is zero-extended into its own
// 32-bit lane, converted to float, and scaled by 1/255.
inline SkPMFloat::SkPMFloat(SkPMColor c) {
    SkPMColorAssert(c);
    // Put the four packed bytes in the low 32 bits of a register; the upper bits are zero.
    const __m128i packed = _mm_cvtsi32_si128(static_cast<int>(c));
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    // A shuffle index with its high bit set writes a zero byte, so 255 means "zero this byte".
    const int kZero = 255;
    const __m128i widened = _mm_shuffle_epi8(packed,
                                             _mm_setr_epi8(0,kZero,kZero,kZero,
                                                           1,kZero,kZero,kZero,
                                                           2,kZero,kZero,kZero,
                                                           3,kZero,kZero,kZero));
#else
    // Without SSSE3, widen in two steps by interleaving with zero: 8 -> 16 bits, then 16 -> 32.
    const __m128i zero    = _mm_setzero_si128();
    const __m128i widened = _mm_unpacklo_epi16(_mm_unpacklo_epi8(packed, zero), zero);
#endif
    const __m128 as_float = _mm_cvtepi32_ps(widened);
    fVec = _mm_mul_ps(as_float, _mm_set1_ps(1.0f / 255));
    SkASSERT(this->isValid());
}
// Converts to a packed SkPMColor. Each component is scaled by 255, has 0.5 added, and is
// truncated to an integer; the result is then clamped to [0, 255] while narrowing to 8 bits.
inline SkPMColor SkPMFloat::round() const {
    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
    __m128  scaled  = _mm_mul_ps(_mm_set1_ps(255), fVec);
    __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), scaled));

    // Narrow 32 -> 16 bits with signed saturation first.  Feeding 32-bit lanes straight to
    // _mm_packus_epi16 would treat each lane as two unrelated 16-bit values, so a lane in
    // [32768, 65535] (low half looks negative) would clamp to 0 instead of 255, and a lane in
    // [-65536, -32769] (low half looks positive) would clamp to 255 instead of 0.
    __m128i fix8_16 = _mm_packs_epi32(fix8_32, fix8_32);

    // Narrow 16 -> 8 bits with unsigned saturation: this is the clamp to [0, 255].
    __m128i fix8    = _mm_packus_epi16(fix8_16, fix8_16);

    SkPMColor c = _mm_cvtsi128_si32(fix8);
    SkPMColorAssert(c);
    return c;
}
} // namespace

View File

@ -12,47 +12,27 @@ DEF_TEST(SkPMFloat, r) {
// Test SkPMColor <-> SkPMFloat
SkPMColor c = SkPreMultiplyColor(0xFFCC9933);
SkPMFloat pmf(c);
REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, pmf.a()));
REPORTER_ASSERT(r, SkScalarNearlyEqual(204.0f, pmf.r()));
REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, pmf.g()));
REPORTER_ASSERT(r, SkScalarNearlyEqual( 51.0f, pmf.b()));
REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, 255*pmf.a()));
REPORTER_ASSERT(r, SkScalarNearlyEqual(204.0f, 255*pmf.r()));
REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, 255*pmf.g()));
REPORTER_ASSERT(r, SkScalarNearlyEqual( 51.0f, 255*pmf.b()));
REPORTER_ASSERT(r, c == pmf.round());
// Test rounding.
pmf = SkPMFloat(254.5f, 203.5f, 153.1f, 50.8f);
pmf = SkPMFloat(254.5f/255, 203.5f/255, 153.1f/255, 50.8f/255);
REPORTER_ASSERT(r, c == pmf.round());
pmf = SkPMFloat(255.9f, 204.01f, 153.0f, -0.9f);
REPORTER_ASSERT(r, SkPreMultiplyColor(0xFFCC9900) == pmf.trunc());
// Test clamping.
SkPMFloat clamped(SkPMFloat(510.0f, 153.0f, 1.0f, -0.2f).roundClamp());
REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, clamped.a()));
REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, clamped.r()));
REPORTER_ASSERT(r, SkScalarNearlyEqual( 1.0f, clamped.g()));
REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.0f, clamped.b()));
SkPMFloat clamped(SkPMFloat(510.0f/255, 153.0f/255, 1.0f/255, -0.2f/255).round());
REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, 255*clamped.a()));
REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, 255*clamped.r()));
REPORTER_ASSERT(r, SkScalarNearlyEqual( 1.0f, 255*clamped.g()));
REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.0f, 255*clamped.b()));
// Test SkPMFloat <-> Sk4f conversion.
Sk4f fs = clamped;
SkPMFloat scaled = fs * Sk4f(0.25f);
REPORTER_ASSERT(r, SkScalarNearlyEqual(63.75f, scaled.a()));
REPORTER_ASSERT(r, SkScalarNearlyEqual(38.25f, scaled.r()));
REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.25f, scaled.g()));
REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.00f, scaled.b()));
// Test 4-at-a-time conversions.
SkPMColor colors[4] = { 0xFF000000, 0xFFFF0000, 0xFF00FF00, 0xFF0000FF };
SkPMFloat floats[4];
SkPMFloat::From4PMColors(colors, floats+0, floats+1, floats+2, floats+3);
SkPMColor back[4];
SkPMFloat::RoundTo4PMColors(floats[0], floats[1], floats[2], floats[3], back);
for (int i = 0; i < 4; i++) {
REPORTER_ASSERT(r, back[i] == colors[i]);
}
SkPMFloat::RoundClampTo4PMColors(floats[0], floats[1], floats[2], floats[3], back);
for (int i = 0; i < 4; i++) {
REPORTER_ASSERT(r, back[i] == colors[i]);
}
REPORTER_ASSERT(r, SkScalarNearlyEqual(63.75f, 255*scaled.a()));
REPORTER_ASSERT(r, SkScalarNearlyEqual(38.25f, 255*scaled.r()));
REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.25f, 255*scaled.g()));
REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.00f, 255*scaled.b()));
}