Move float<->byte conversions into Sk4f.

This lets us avoid conversions to [0.0, 1.0] space and rounding that aren't necessary
for SkColorCubeFilter_opts.h.

Dropping rounding on the way back to bytes means we'll see a bunch of off-by-1 diffs.

Rough perf effect:
  SSSE3: 110 -> 93  (~15%)
  NEON: 465 -> 375  (~20%)

This is the beginning of the end for SkPMFloat as an entity distinct from Sk4f.
I've kept it for now so I can convert sites one by one and think about how things
that really want to keep PM color order will work.

BUG=skia:4117

Review URL: https://codereview.chromium.org/1319413003
This commit is contained in:
mtklein 2015-08-31 14:39:59 -07:00 committed by Commit bot
parent b2885d59bc
commit aba1dc8c6a
8 changed files with 69 additions and 194 deletions

View File

@ -92,6 +92,12 @@ public:
static SkNf Load(const T vals[N]) { static SkNf Load(const T vals[N]) {
return SkNf(SkNf<N/2,T>::Load(vals), SkNf<N/2,T>::Load(vals+N/2)); return SkNf(SkNf<N/2,T>::Load(vals), SkNf<N/2,T>::Load(vals+N/2));
} }
// Converts N bytes in [0,255] to N floats in [0.0, 255.0].
// Specializations of FromBytes() and toBytes() may assume their pointer argument is N-byte
// aligned, e.g. Sk4f::FromBytes() may assume it is reading from a 4-byte-aligned pointer.
static SkNf FromBytes(const uint8_t bytes[N]) {
    // Build the low and high halves separately, each from its own N/2 bytes.
    const SkNf<N/2,T> lo = SkNf<N/2,T>::FromBytes(bytes);
    const SkNf<N/2,T> hi = SkNf<N/2,T>::FromBytes(bytes + N/2);
    return SkNf(lo, hi);
}
SkNf(T a, T b) : fLo(a), fHi(b) { REQUIRE(N==2); } SkNf(T a, T b) : fLo(a), fHi(b) { REQUIRE(N==2); }
SkNf(T a, T b, T c, T d) : fLo(a,b), fHi(c,d) { REQUIRE(N==4); } SkNf(T a, T b, T c, T d) : fLo(a,b), fHi(c,d) { REQUIRE(N==4); }
@ -101,6 +107,12 @@ public:
fLo.store(vals); fLo.store(vals);
fHi.store(vals+N/2); fHi.store(vals+N/2);
} }
// Truncates floats in [0.0,256.0) to bytes in [0,255]. Results for other inputs are unspecified.
// Specializations may assume `bytes` is N-byte aligned (same convention as FromBytes()).
void toBytes(uint8_t bytes[N]) const {
    // Each half writes its own N/2 bytes: low half first, high half right after it.
    uint8_t* const loDst = bytes;
    uint8_t* const hiDst = bytes + N/2;
    fLo.toBytes(loDst);
    fHi.toBytes(hiDst);
}
SkNi<N,I> castTrunc() const { return SkNi<N,I>(fLo.castTrunc(), fHi.castTrunc()); } SkNi<N,I> castTrunc() const { return SkNi<N,I>(fLo.castTrunc(), fHi.castTrunc()); }
@ -201,8 +213,10 @@ public:
SkNf() {} SkNf() {}
explicit SkNf(T val) : fVal(val) {} explicit SkNf(T val) : fVal(val) {}
static SkNf Load(const T vals[1]) { return SkNf(vals[0]); } static SkNf Load(const T vals[1]) { return SkNf(vals[0]); }
// Makes a 1-lane vector holding bytes[0] converted to T.
static SkNf FromBytes(const uint8_t bytes[1]) {
    const T lane = (T)(*bytes);
    return SkNf(lane);
}
void store(T vals[1]) const { vals[0] = fVal; } void store(T vals[1]) const { vals[0] = fVal; }
// Stores the single lane into bytes[0], converted with a plain cast to uint8_t.
void toBytes(uint8_t bytes[1]) const {
    *bytes = (uint8_t)(fVal);
}
SkNi<1,I> castTrunc() const { return SkNi<1,I>(fVal); } SkNi<1,I> castTrunc() const { return SkNi<1,I>(fVal); }

View File

@ -28,11 +28,11 @@ public:
static SkPMFloat FromARGB(float a, float r, float g, float b) { return SkPMFloat(a,r,g,b); } static SkPMFloat FromARGB(float a, float r, float g, float b) { return SkPMFloat(a,r,g,b); }
static SkPMFloat FromOpaqueColor(SkColor c); // Requires c's alpha == 0xFF. static SkPMFloat FromOpaqueColor(SkColor c); // Requires c's alpha == 0xFF.
Sk4f alphas() const; // argb -> aaaa, generally faster than the equivalent Sk4f(this->a()). Sk4f alphas() const { return Sk4f(this->a()); }
// Uninitialized. // Uninitialized.
SkPMFloat() {} SkPMFloat() {}
explicit SkPMFloat(SkPMColor); explicit SkPMFloat(SkPMColor c) { *this = Sk4f::FromBytes((uint8_t*)&c) * Sk4f(1.0f/255); }
SkPMFloat(float a, float r, float g, float b) SkPMFloat(float a, float r, float g, float b)
#ifdef SK_PMCOLOR_IS_RGBA #ifdef SK_PMCOLOR_IS_RGBA
: INHERITED(r,g,b,a) {} : INHERITED(r,g,b,a) {}
@ -47,7 +47,11 @@ public:
float g() const { return this->kth<SK_G32_SHIFT / 8>(); } float g() const { return this->kth<SK_G32_SHIFT / 8>(); }
float b() const { return this->kth<SK_B32_SHIFT / 8>(); } float b() const { return this->kth<SK_B32_SHIFT / 8>(); }
SkPMColor round() const; // Rounds from [0.0f, 1.0f] to [0, 255], clamping if out of range. SkPMColor round() const {
SkPMColor c;
(*this * Sk4f(255) + Sk4f(0.5f)).toBytes((uint8_t*)&c);
return c;
}
bool isValid() const { bool isValid() const {
return this->a() >= 0 && this->a() <= 1 return this->a() >= 0 && this->a() <= 1
@ -62,17 +66,4 @@ private:
} // namespace } // namespace
#ifdef SKNX_NO_SIMD
// Platform implementations of SkPMFloat assume Sk4f uses SSE or NEON. _none is generic.
#include "../opts/SkPMFloat_none.h"
#else
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
#include "../opts/SkPMFloat_sse.h"
#elif defined(SK_ARM_HAS_NEON)
#include "../opts/SkPMFloat_neon.h"
#else
#include "../opts/SkPMFloat_none.h"
#endif
#endif
#endif//SkPM_DEFINED #endif//SkPM_DEFINED

View File

@ -6,7 +6,7 @@
#define SkColorCubeFilter_opts_DEFINED #define SkColorCubeFilter_opts_DEFINED
#include "SkColor.h" #include "SkColor.h"
#include "SkPMFloat.h" #include "SkNx.h"
#include "SkUnPreMultiply.h" #include "SkUnPreMultiply.h"
namespace SK_OPTS_NS { namespace SK_OPTS_NS {
@ -18,7 +18,6 @@ void color_cube_filter_span(const SkPMColor src[],
const SkScalar* colorToFactors[2], const SkScalar* colorToFactors[2],
int dim, int dim,
const SkColor* colorCube) { const SkColor* colorCube) {
uint8_t* ptr_dst = reinterpret_cast<uint8_t*>(dst);
uint8_t r, g, b, a; uint8_t r, g, b, a;
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
@ -51,8 +50,7 @@ void color_cube_filter_span(const SkPMColor src[],
const int i10 = (colorToIndex[1][g] + colorToIndex[0][b] * dim) * dim; const int i10 = (colorToIndex[1][g] + colorToIndex[0][b] * dim) * dim;
const int i11 = (colorToIndex[1][g] + colorToIndex[1][b] * dim) * dim; const int i11 = (colorToIndex[1][g] + colorToIndex[1][b] * dim) * dim;
SkPMFloat color(0,0,0,0); Sk4f color(0,0,0,0);
for (int x = 0; x < 2; ++x) { for (int x = 0; x < 2; ++x) {
const int ix = colorToIndex[x][r]; const int ix = colorToIndex[x][r];
@ -61,22 +59,23 @@ void color_cube_filter_span(const SkPMColor src[],
const SkColor lutColor10 = colorCube[ix + i10]; const SkColor lutColor10 = colorCube[ix + i10];
const SkColor lutColor11 = colorCube[ix + i11]; const SkColor lutColor11 = colorCube[ix + i11];
Sk4f sum = SkPMFloat::FromOpaqueColor(lutColor00) * g0b0; Sk4f sum = Sk4f::FromBytes((const uint8_t*)&lutColor00) * g0b0;
sum = sum + SkPMFloat::FromOpaqueColor(lutColor01) * g0b1; sum = sum + Sk4f::FromBytes((const uint8_t*)&lutColor01) * g0b1;
sum = sum + SkPMFloat::FromOpaqueColor(lutColor10) * g1b0; sum = sum + Sk4f::FromBytes((const uint8_t*)&lutColor10) * g1b0;
sum = sum + SkPMFloat::FromOpaqueColor(lutColor11) * g1b1; sum = sum + Sk4f::FromBytes((const uint8_t*)&lutColor11) * g1b1;
color = color + sum * Sk4f((float)colorToFactors[x][r]); color = color + sum * Sk4f((float)colorToFactors[x][r]);
} }
if (a != 255) { if (a != 255) {
color = color * Sk4f(a * 1.0f/255); color = color * Sk4f(a * (1.0f/255));
} }
dst[i] = color.round(); // color is BGRA (SkColor order), dst is SkPMColor order, so may need to swap R+B.
#if defined(SK_PMCOLOR_IS_RGBA)
ptr_dst[SK_A32_SHIFT / 8] = a; color = Sk4f(color.kth<2>(), color.kth<1>(), color.kth<0>(), color.kth<3>());
ptr_dst += 4; #endif
uint8_t* dstBytes = (uint8_t*)(dst+i);
color.toBytes(dstBytes);
dstBytes[SK_A32_SHIFT/8] = a;
} }
} }

View File

@ -223,9 +223,22 @@ public:
SkNf() {} SkNf() {}
explicit SkNf(float val) : fVec(vdupq_n_f32(val)) {} explicit SkNf(float val) : fVec(vdupq_n_f32(val)) {}
static SkNf Load(const float vals[4]) { return vld1q_f32(vals); } static SkNf Load(const float vals[4]) { return vld1q_f32(vals); }
// Converts the 4 bytes at vals to 4 float lanes holding the same integer values.
// The bytes are read through a uint32_t pointer, i.e. as one 32-bit load.
static SkNf FromBytes(const uint8_t vals[4]) {
// Load the 4 bytes as a single 32-bit value (duplicated into both 32-bit lanes).
uint8x8_t fix8 = (uint8x8_t)vld1_dup_u32((const uint32_t*)vals);
// Widen 8-bit lanes to 16-bit.
uint16x8_t fix8_16 = vmovl_u8(fix8);
// Only the low 4 lanes carry our bytes; widen those to 32-bit.
uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16));
// Unsigned integer -> float conversion.
return SkNf(vcvtq_f32_u32(fix8_32));
}
SkNf(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; } SkNf(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; }
void store(float vals[4]) const { vst1q_f32(vals, fVec); } void store(float vals[4]) const { vst1q_f32(vals, fVec); }
// Converts the 4 float lanes to unsigned integers and writes them to bytes[0..3].
// The result is written through a uint32_t pointer, i.e. as one 32-bit store.
void toBytes(uint8_t bytes[4]) const {
// float -> uint32 conversion.
uint32x4_t fix8_32 = vcvtq_u32_f32(fVec);
// Saturating narrow 32-bit -> 16-bit.
uint16x4_t fix8_16 = vqmovn_u32(fix8_32);
// Pad with four zero lanes to fill a 128-bit register, then saturating-narrow 16-bit -> 8-bit.
// The 4 bytes we want end up in the low 32 bits.
uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
// Store only 32-bit lane 0, which holds those 4 bytes.
vst1_lane_u32((uint32_t*)bytes, (uint32x2_t)fix8, 0);
}
SkNi<4, int> castTrunc() const { return vcvtq_s32_f32(fVec); } SkNi<4, int> castTrunc() const { return vcvtq_s32_f32(fVec); }

View File

@ -155,9 +155,30 @@ public:
SkNf() {} SkNf() {}
explicit SkNf(float val) : fVec( _mm_set1_ps(val) ) {} explicit SkNf(float val) : fVec( _mm_set1_ps(val) ) {}
static SkNf Load(const float vals[4]) { return _mm_loadu_ps(vals); } static SkNf Load(const float vals[4]) { return _mm_loadu_ps(vals); }
// Converts the 4 bytes at bytes to 4 float lanes holding the same integer values.
// The bytes are read through an int pointer, i.e. as one 32-bit load.
static SkNf FromBytes(const uint8_t bytes[4]) {
// Move the 4 bytes into the low 32 bits of an SSE register (upper bits are zeroed).
__m128i fix8 = _mm_cvtsi32_si128(*(const int*)bytes);
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
// SSSE3: one byte shuffle places source byte k in the low byte of 32-bit lane k.
// A shuffle index with its high bit set (here ~0) writes a zero byte instead.
const char _ = ~0; // Zero these bytes.
__m128i fix8_32 = _mm_shuffle_epi8(fix8, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
#else
// SSE2: zero-extend in two steps by interleaving with zeros, 8 -> 16 bits then 16 -> 32 bits.
__m128i fix8_16 = _mm_unpacklo_epi8 (fix8, _mm_setzero_si128()),
fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());
#endif
// 32-bit integer -> float conversion.
return SkNf(_mm_cvtepi32_ps(fix8_32));
// TODO: use _mm_cvtepu8_epi32 w/SSE4.1?
}
SkNf(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {} SkNf(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {}
void store(float vals[4]) const { _mm_storeu_ps(vals, fVec); } void store(float vals[4]) const { _mm_storeu_ps(vals, fVec); }
// Truncates the 4 float lanes to integers and writes them to bytes[0..3].
// The result is written through an int pointer, i.e. as one 32-bit store.
void toBytes(uint8_t bytes[4]) const {
// _mm_cvttps_epi32 converts float -> int32 with truncation (round toward zero).
__m128i fix8_32 = _mm_cvttps_epi32(fVec),
// Note this packs with the 16-bit instruction even though the lanes are 32-bit. For a lane
// holding a value in [0,255] the upper 16 bits are zero, so the first pack yields the bytes
// v0,0,v1,0,v2,0,v3,0 (repeated), which read as 16-bit lanes are v0,v1,v2,v3 again...
fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
// ...and the second pack squeezes those into the low 4 bytes: v0,v1,v2,v3.
fix8 = _mm_packus_epi16(fix8_16, fix8_16);
// Write out the low 32 bits, i.e. the 4 result bytes.
*(int*)bytes = _mm_cvtsi128_si32(fix8);
// TODO: use _mm_shuffle_epi8 w/SSSE3?
}
SkNi<4, int> castTrunc() const { return _mm_cvttps_epi32(fVec); } SkNi<4, int> castTrunc() const { return _mm_cvttps_epi32(fVec); }

View File

@ -1,50 +0,0 @@
/*
* Copyright 2015 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
namespace { // See SkPMFloat.h
static_assert(SK_A32_SHIFT == 24, "This file assumes little-endian.");
// NEON: expands the 4 bytes of c into 4 float lanes, each scaled by 1/255.
// Lane k holds byte k of c, so the lanes stay in c's own byte order.
inline SkPMFloat::SkPMFloat(SkPMColor c) {
SkPMColorAssert(c);
// Put c in a 64-bit register (duplicated into both 32-bit lanes) and view it as 8 bytes.
uint8x8_t fix8 = (uint8x8_t)vdup_n_u32(c);
// Widen 8-bit -> 16-bit, then the low 4 lanes 16-bit -> 32-bit.
uint16x8_t fix8_16 = vmovl_u8(fix8);
uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16));
// Convert to float and scale by 1/255.
fVec = vmulq_f32(vcvtq_f32_u32(fix8_32), vdupq_n_f32(1.0f/255));
SkASSERT(this->isValid());
}
// NEON: scales each lane by 255, rounds by adding 0.5 before a truncating convert, and packs
// the 4 results into an SkPMColor (lane k becomes byte k).
inline SkPMColor SkPMFloat::round() const {
// vcvt_u32_f32 truncates, so we round manually by adding a half before converting.
// vmlaq_f32(a, b, c) computes a + b*c, i.e. 0.5 + fVec*255 here.
float32x4_t rounded = vmlaq_f32(vdupq_n_f32(0.5f), fVec, vdupq_n_f32(255));
uint32x4_t fix8_32 = vcvtq_u32_f32(rounded);
// Saturating narrows: 32-bit -> 16-bit, then (zero-padded to 8 lanes) 16-bit -> 8-bit.
uint16x4_t fix8_16 = vqmovn_u32(fix8_32);
uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
// The 4 result bytes sit in the low 32 bits.
SkPMColor c = vget_lane_u32((uint32x2_t)fix8, 0);
SkPMColorAssert(c);
return c;
}
// NEON: returns a vector with lane 3 of fVec copied into all four lanes.
inline Sk4f SkPMFloat::alphas() const {
return vdupq_lane_f32(vget_high_f32(fVec), 1); // Duplicate high lane of high half i.e. lane 3.
}
// NEON: converts an SkColor whose alpha is 0xFF into an SkPMFloat with lanes scaled by 1/255.
// When SK_PMCOLOR_IS_RGBA is defined, the R and B bytes are swapped first.
inline SkPMFloat SkPMFloat::FromOpaqueColor(SkColor c) {
SkASSERT(SkColorGetA(c) == 0xFF);
// Put c in a 64-bit register (duplicated into both 32-bit lanes) and view it as 8 bytes.
uint8x8_t fix8 = (uint8x8_t)vdup_n_u32(c);
#if defined(SK_PMCOLOR_IS_RGBA)
// Table lookup with byte indices 2,1,0,3 (repeated): exchanges bytes 0 and 2 of each 32-bit half.
fix8 = vtbl1_u8(fix8, vcreate_u8(0x0300010203000102ULL)); // 03 00 01 02, 2x, i.e. swap R&B.
#endif
// Widen 8-bit -> 16-bit, then the low 4 lanes 16-bit -> 32-bit.
uint16x8_t fix8_16 = vmovl_u8(fix8);
uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16));
// Convert to float and scale by 1/255.
SkPMFloat pmf = Sk4f(vmulq_f32(vcvtq_f32_u32(fix8_32), vdupq_n_f32(1.0f/255)));
SkASSERT(pmf.isValid());
return pmf;
}
} // namespace

View File

@ -1,48 +0,0 @@
/*
* Copyright 2015 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
namespace { // See SkPMFloat.h
// Portable version: unpacks each component of c and scales it by 1/255.
inline SkPMFloat::SkPMFloat(SkPMColor c) {
    const float scale = 1.0f/255;
    const float alpha = SkGetPackedA32(c) * scale;
    const float red   = SkGetPackedR32(c) * scale;
    const float green = SkGetPackedG32(c) * scale;
    const float blue  = SkGetPackedB32(c) * scale;
    *this = SkPMFloat::FromARGB(alpha, red, green, blue);
    SkASSERT(this->isValid());
}
// Portable version: clamps each component to [0,1], scales it by 255, adds 0.5, and packs the
// results with SkPackARGB32().
inline SkPMColor SkPMFloat::round() const {
    // Order: a, r, g, b.
    float comps[4] = { this->a(), this->r(), this->g(), this->b() };
    for (int i = 0; i < 4; ++i) {
        if (comps[i] < 0) {
            comps[i] = 0;
        } else if (comps[i] > 1) {
            comps[i] = 1;
        }
    }
    const SkPMColor packed = SkPackARGB32(255*comps[0]+0.5f,
                                          255*comps[1]+0.5f,
                                          255*comps[2]+0.5f,
                                          255*comps[3]+0.5f);
    SkPMColorAssert(packed);
    return packed;
}
// Portable version: a vector with every lane set to this->a().
inline Sk4f SkPMFloat::alphas() const {
    const float alpha = this->a();
    return Sk4f(alpha);
}
// Portable version: converts an SkColor whose alpha is 0xFF. Alpha is set to 1.0f directly;
// R, G and B are each scaled by 1/255.
inline SkPMFloat SkPMFloat::FromOpaqueColor(SkColor c) {
    SkASSERT(SkColorGetA(c) == 0xFF);
    const float scale = 1.0f / 255;
    const float red   = SkColorGetR(c) * scale;
    const float green = SkColorGetG(c) * scale;
    const float blue  = SkColorGetB(c) * scale;
    SkPMFloat result = SkPMFloat::FromARGB(1.0f, red, green, blue);
    SkASSERT(result.isValid());
    return result;
}
} // namespace

View File

@ -1,65 +0,0 @@
/*
* Copyright 2015 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
namespace { // See SkPMFloat.h
// SSE: expands the 4 bytes of c into 4 float lanes, each scaled by 1/255.
// Lane k holds byte k of c, so the lanes stay in c's own byte order.
inline SkPMFloat::SkPMFloat(SkPMColor c) {
SkPMColorAssert(c);
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
// SSSE3: one byte shuffle places byte k in the low byte of 32-bit lane k.
// A shuffle index with its high bit set (here ~0) writes a zero byte instead.
const char _ = ~0; // Zero these bytes.
__m128i fix8 = _mm_cvtsi32_si128((int)c),
fix8_32 = _mm_shuffle_epi8(fix8, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
#else
// SSE2: zero-extend in two steps by interleaving with zeros, 8 -> 16 bits then 16 -> 32 bits.
__m128i fix8 = _mm_cvtsi32_si128((int)c),
fix8_16 = _mm_unpacklo_epi8 (fix8, _mm_setzero_si128()),
fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());
#endif
// Convert to float and scale by 1/255.
fVec = _mm_mul_ps(_mm_cvtepi32_ps(fix8_32), _mm_set1_ps(1.0f / 255));
SkASSERT(this->isValid());
}
// SSE: scales each lane by 255, adds 0.5, truncates to integers, and packs the 4 results into
// an SkPMColor (lane k becomes byte k).
inline SkPMColor SkPMFloat::round() const {
// We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
__m128 scaled = _mm_mul_ps(_mm_set1_ps(255), fVec);
// _mm_cvttps_epi32 truncates, so adding 0.5 first gives round-half-up for non-negative values.
__m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), scaled)),
// Packing twice with the 16-bit instruction narrows the 32-bit lanes to bytes: for a lane in
// [0,255] the upper 16 bits are zero, so the first pack gives bytes v0,0,v1,0,v2,0,v3,0 and the
// second pack leaves v0,v1,v2,v3 in the low 4 bytes.
fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
fix8 = _mm_packus_epi16(fix8_16, fix8_16);
// The low 32 bits hold the 4 result bytes.
SkPMColor c = _mm_cvtsi128_si32(fix8);
SkPMColorAssert(c);
return c;
}
// SSE: returns a vector with lane 3 of fVec copied into all four lanes.
inline Sk4f SkPMFloat::alphas() const {
// Taking lane 3 as alpha relies on alpha being the top byte of an SkPMColor.
static_assert(SK_A32_SHIFT == 24, "");
return _mm_shuffle_ps(fVec, fVec, 0xff); // Read as 11 11 11 11, copying lane 3 to all lanes.
}
// SSE: converts an SkColor whose alpha is 0xFF into an SkPMFloat with lanes scaled by 1/255.
// Unless SK_PMCOLOR_IS_BGRA is defined (SSSE3 path) / when SK_PMCOLOR_IS_RGBA is defined
// (SSE2 path), lanes 0 and 2 are exchanged as part of the conversion.
inline SkPMFloat SkPMFloat::FromOpaqueColor(SkColor c) {
SkASSERT(SkColorGetA(c) == 0xFF);
// Move c into the low 32 bits of an SSE register.
__m128i fix8 = _mm_cvtsi32_si128((int)c);
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
// SSSE3: a single byte shuffle both zero-extends each byte to a 32-bit lane and, for the
// non-BGRA layout, picks source bytes in 2,1,0,3 order. Indices with the high bit set write zero.
const char _ = ~0; // Zero these bytes.
__m128i fix8_32 = _mm_shuffle_epi8(fix8,
#if defined(SK_PMCOLOR_IS_BGRA)
_mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_)
#else
_mm_setr_epi8(2,_,_,_, 1,_,_,_, 0,_,_,_, 3,_,_,_)
#endif
);
#else
// SSE2: zero-extend by interleaving with zeros (8 -> 16 -> 32 bits), then swap lanes if needed.
__m128i fix8_16 = _mm_unpacklo_epi8 (fix8 , _mm_setzero_si128()),
fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());
#if defined(SK_PMCOLOR_IS_RGBA)
fix8_32 = _mm_shuffle_epi32(fix8_32, 0xC6); // C6 == 11 00 01 10, i.e swap lanes 0 and 2.
#endif
#endif
// Convert to float and scale by 1/255.
SkPMFloat pmf = Sk4f(_mm_mul_ps(_mm_cvtepi32_ps(fix8_32), _mm_set1_ps(1.0f/255)));
SkASSERT(pmf.isValid());
return pmf;
}
} // namespace