Update 4-at-a-time APIs.

There is no reason to require the 4 SkPMFloats (registers) to be adjacent. The only potential win in loads and stores comes from the SkPMColors being adjacent.

Makes no difference to existing bench.

BUG=skia:

Review URL: https://codereview.chromium.org/1035583002
This commit is contained in:
parent
2af858354d
commit
15391ee4ac
@ -1,3 +1,10 @@
|
||||
/*
|
||||
* Copyright 2015 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#include "Benchmark.h"
|
||||
#include "SkPMFloat.h"
|
||||
|
||||
@ -49,21 +56,32 @@ struct PMFloatBench : public Benchmark {
|
||||
colors[3] = seed + 3;
|
||||
#endif
|
||||
|
||||
SkPMFloat floats[4];
|
||||
SkPMFloat fa,fb,fc,fd;
|
||||
if (kWide) {
|
||||
SkPMFloat::From4PMColors(floats, colors);
|
||||
SkPMFloat::From4PMColors(colors, &fa, &fb, &fc, &fd);
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
floats[i] = SkPMFloat::FromPMColor(colors[i]);
|
||||
}
|
||||
fa = SkPMFloat::FromPMColor(colors[0]);
|
||||
fb = SkPMFloat::FromPMColor(colors[1]);
|
||||
fc = SkPMFloat::FromPMColor(colors[2]);
|
||||
fd = SkPMFloat::FromPMColor(colors[3]);
|
||||
}
|
||||
|
||||
SkPMColor back[4];
|
||||
switch (kClamp << 1 | kWide) {
|
||||
case 0: for (int i = 0; i < 4; i++) { back[i] = floats[i].get(); } break;
|
||||
case 1: SkPMFloat::To4PMColors(back, floats); break;
|
||||
case 2: for (int i = 0; i < 4; i++) { back[i] = floats[i].clamped(); } break;
|
||||
case 3: SkPMFloat::ClampTo4PMColors(back, floats); break;
|
||||
case 0: {
|
||||
back[0] = fa.get();
|
||||
back[1] = fb.get();
|
||||
back[2] = fc.get();
|
||||
back[3] = fd.get();
|
||||
} break;
|
||||
case 1: SkPMFloat::To4PMColors(fa, fb, fc, fd, back); break;
|
||||
case 2: {
|
||||
back[0] = fa.clamped();
|
||||
back[1] = fb.clamped();
|
||||
back[2] = fc.clamped();
|
||||
back[3] = fd.clamped();
|
||||
} break;
|
||||
case 3: SkPMFloat::ClampTo4PMColors(fa, fb, fc, fd, back); break;
|
||||
}
|
||||
for (int i = 0; i < 4; i++) {
|
||||
junk ^= back[i];
|
||||
|
@ -1,3 +1,10 @@
|
||||
/*
|
||||
* Copyright 2015 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#ifndef SkPM_DEFINED
|
||||
#define SkPM_DEFINED
|
||||
|
||||
@ -20,7 +27,7 @@ public:
|
||||
static SkPMFloat FromARGB(float a, float r, float g, float b) { return SkPMFloat(a,r,g,b); }
|
||||
|
||||
// May be more efficient than one at a time. No special alignment assumed for SkPMColors.
|
||||
static void From4PMColors(SkPMFloat[4], const SkPMColor[4]);
|
||||
static void From4PMColors(const SkPMColor[4], SkPMFloat*, SkPMFloat*, SkPMFloat*, SkPMFloat*);
|
||||
|
||||
explicit SkPMFloat(SkPMColor);
|
||||
SkPMFloat(float a, float r, float g, float b) {
|
||||
@ -51,8 +58,10 @@ public:
|
||||
SkPMColor clamped() const; // Will clamp all values to [0, 255]. Then may assert isValid().
|
||||
|
||||
// 4-at-a-time versions of get() and clamped(). Like From4PMColors(), no alignment assumed.
|
||||
static void To4PMColors(SkPMColor[4], const SkPMFloat[4]);
|
||||
static void ClampTo4PMColors(SkPMColor[4], const SkPMFloat[4]);
|
||||
static void To4PMColors(
|
||||
const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, SkPMColor[4]);
|
||||
static void ClampTo4PMColors(
|
||||
const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, SkPMColor[4]);
|
||||
|
||||
bool isValid() const {
|
||||
return this->a() >= 0 && this->a() <= 255
|
||||
|
@ -1,3 +1,10 @@
|
||||
/*
|
||||
* Copyright 2015 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) {
|
||||
fColors = that.fColors;
|
||||
return *this;
|
||||
@ -34,25 +41,31 @@ inline SkPMColor SkPMFloat::clamped() const {
|
||||
return c;
|
||||
}
|
||||
|
||||
inline void SkPMFloat::From4PMColors(SkPMFloat floats[4], const SkPMColor colors[4]) {
|
||||
inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
|
||||
SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
|
||||
// Haven't beaten this yet.
|
||||
for (int i = 0; i < 4; i++) { floats[i] = FromPMColor(colors[i]); }
|
||||
*a = FromPMColor(colors[0]);
|
||||
*b = FromPMColor(colors[1]);
|
||||
*c = FromPMColor(colors[2]);
|
||||
*d = FromPMColor(colors[3]);
|
||||
}
|
||||
|
||||
inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
|
||||
SkASSERT(floats[0].isValid() && floats[1].isValid()
|
||||
&& floats[2].isValid() && floats[3].isValid());
|
||||
inline void SkPMFloat::To4PMColors(
|
||||
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
|
||||
SkPMColor colors[4]) {
|
||||
// Haven't beaten this yet.
|
||||
ClampTo4PMColors(colors, floats);
|
||||
ClampTo4PMColors(a,b,c,d, colors);
|
||||
}
|
||||
|
||||
inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
|
||||
inline void SkPMFloat::ClampTo4PMColors(
|
||||
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
|
||||
SkPMColor colors[4]) {
|
||||
// Same as _SSSE3.h's. We use 3 _mm_packus_epi16() where the naive loop uses 8.
|
||||
// We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
|
||||
__m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[0].fColors)),
|
||||
c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[1].fColors)),
|
||||
c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[2].fColors)),
|
||||
c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[3].fColors));
|
||||
__m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), a.fColors)),
|
||||
c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), b.fColors)),
|
||||
c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), c.fColors)),
|
||||
c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), d.fColors));
|
||||
__m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
|
||||
_mm_packus_epi16(c2, c3));
|
||||
_mm_storeu_si128((__m128i*)colors, c3210);
|
||||
|
@ -1,3 +1,10 @@
|
||||
/*
|
||||
* Copyright 2015 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) {
|
||||
fColors = that.fColors;
|
||||
return *this;
|
||||
@ -41,23 +48,34 @@ inline SkPMColor SkPMFloat::clamped() const {
|
||||
return c;
|
||||
}
|
||||
|
||||
inline void SkPMFloat::From4PMColors(SkPMFloat floats[4], const SkPMColor colors[4]) {
|
||||
inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
|
||||
SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
|
||||
// Haven't beaten this yet.
|
||||
for (int i = 0; i < 4; i++) { floats[i] = FromPMColor(colors[i]); }
|
||||
*a = FromPMColor(colors[0]);
|
||||
*b = FromPMColor(colors[1]);
|
||||
*c = FromPMColor(colors[2]);
|
||||
*d = FromPMColor(colors[3]);
|
||||
}
|
||||
|
||||
inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
|
||||
// Haven't beaten this yet. Still faster than ClampTo4PMColors too.
|
||||
for (int i = 0; i < 4; i++) { colors[i] = floats[i].get(); }
|
||||
inline void SkPMFloat::To4PMColors(
|
||||
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
|
||||
SkPMColor colors[4]) {
|
||||
// Haven't beaten this yet. Still faster than ClampTo4PMColors?
|
||||
colors[0] = a.get();
|
||||
colors[1] = b.get();
|
||||
colors[2] = c.get();
|
||||
colors[3] = d.get();
|
||||
}
|
||||
|
||||
inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
|
||||
inline void SkPMFloat::ClampTo4PMColors(
|
||||
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
|
||||
SkPMColor colors[4]) {
|
||||
// Same as _SSE2.h's. We use 3 _mm_packus_epi16() where the naive loop uses 8.
|
||||
// We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
|
||||
__m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[0].fColors)),
|
||||
c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[1].fColors)),
|
||||
c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[2].fColors)),
|
||||
c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[3].fColors));
|
||||
__m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), a.fColors)),
|
||||
c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), b.fColors)),
|
||||
c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), c.fColors)),
|
||||
c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), d.fColors));
|
||||
__m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
|
||||
_mm_packus_epi16(c2, c3));
|
||||
_mm_storeu_si128((__m128i*)colors, c3210);
|
||||
|
@ -1,3 +1,10 @@
|
||||
/*
|
||||
* Copyright 2015 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) {
|
||||
fColors = that.fColors;
|
||||
return *this;
|
||||
@ -41,14 +48,28 @@ inline SkPMColor SkPMFloat::clamped() const {
|
||||
}
|
||||
|
||||
// TODO: we should be able to beat these loops on all three methods.
|
||||
inline void SkPMFloat::From4PMColors(SkPMFloat floats[4], const SkPMColor colors[4]) {
|
||||
for (int i = 0; i < 4; i++) { floats[i] = FromPMColor(colors[i]); }
|
||||
inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
|
||||
SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
|
||||
*a = FromPMColor(colors[0]);
|
||||
*b = FromPMColor(colors[1]);
|
||||
*c = FromPMColor(colors[2]);
|
||||
*d = FromPMColor(colors[3]);
|
||||
}
|
||||
|
||||
inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
|
||||
for (int i = 0; i < 4; i++) { colors[i] = floats[i].get(); }
|
||||
inline void SkPMFloat::To4PMColors(
|
||||
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
|
||||
SkPMColor colors[4]) {
|
||||
colors[0] = a.get();
|
||||
colors[1] = b.get();
|
||||
colors[2] = c.get();
|
||||
colors[3] = d.get();
|
||||
}
|
||||
|
||||
inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
|
||||
for (int i = 0; i < 4; i++) { colors[i] = floats[i].clamped(); }
|
||||
inline void SkPMFloat::ClampTo4PMColors(
|
||||
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
|
||||
SkPMColor colors[4]) {
|
||||
colors[0] = a.clamped();
|
||||
colors[1] = b.clamped();
|
||||
colors[2] = c.clamped();
|
||||
colors[3] = d.clamped();
|
||||
}
|
||||
|
@ -1,3 +1,10 @@
|
||||
/*
|
||||
* Copyright 2015 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) {
|
||||
for (int i = 0; i < 4; i++) { fColor[i] = that.fColor[i]; }
|
||||
return *this;
|
||||
@ -28,14 +35,28 @@ inline SkPMColor SkPMFloat::clamped() const {
|
||||
return SkPackARGB32(a+0.5f, r+0.5f, g+0.5f, b+0.5f);
|
||||
}
|
||||
|
||||
inline void SkPMFloat::From4PMColors(SkPMFloat floats[4], const SkPMColor colors[4]) {
|
||||
for (int i = 0; i < 4; i++) { floats[i] = FromPMColor(colors[i]); }
|
||||
inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
|
||||
SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
|
||||
*a = FromPMColor(colors[0]);
|
||||
*b = FromPMColor(colors[1]);
|
||||
*c = FromPMColor(colors[2]);
|
||||
*d = FromPMColor(colors[3]);
|
||||
}
|
||||
|
||||
inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
|
||||
for (int i = 0; i < 4; i++) { colors[i] = floats[i].get(); }
|
||||
inline void SkPMFloat::To4PMColors(
|
||||
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
|
||||
SkPMColor colors[4]) {
|
||||
colors[0] = a.get();
|
||||
colors[1] = b.get();
|
||||
colors[2] = c.get();
|
||||
colors[3] = d.get();
|
||||
}
|
||||
|
||||
inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
|
||||
for (int i = 0; i < 4; i++) { colors[i] = floats[i].clamped(); }
|
||||
inline void SkPMFloat::ClampTo4PMColors(
|
||||
const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
|
||||
SkPMColor colors[4]) {
|
||||
colors[0] = a.clamped();
|
||||
colors[1] = b.clamped();
|
||||
colors[2] = c.clamped();
|
||||
colors[3] = d.clamped();
|
||||
}
|
||||
|
@ -1,3 +1,10 @@
|
||||
/*
|
||||
* Copyright 2015 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#include "SkPMFloat.h"
|
||||
#include "Test.h"
|
||||
|
||||
@ -33,15 +40,15 @@ DEF_TEST(SkPMFloat, r) {
|
||||
// Test 4-at-a-time conversions.
|
||||
SkPMColor colors[4] = { 0xFF000000, 0xFFFF0000, 0xFF00FF00, 0xFF0000FF };
|
||||
SkPMFloat floats[4];
|
||||
SkPMFloat::From4PMColors(floats, colors);
|
||||
SkPMFloat::From4PMColors(colors, floats+0, floats+1, floats+2, floats+3);
|
||||
|
||||
SkPMColor back[4];
|
||||
SkPMFloat::To4PMColors(back, floats);
|
||||
SkPMFloat::To4PMColors(floats[0], floats[1], floats[2], floats[3], back);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
REPORTER_ASSERT(r, back[i] == colors[i]);
|
||||
}
|
||||
|
||||
SkPMFloat::ClampTo4PMColors(back, floats);
|
||||
SkPMFloat::ClampTo4PMColors(floats[0], floats[1], floats[2], floats[3], back);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
REPORTER_ASSERT(r, back[i] == colors[i]);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user