Add tail handling for SSE* to SkJumper.

Change-Id: Icb9d385333082de2f99b7a25cfd7251717e3f663
Reviewed-on: https://skia-review.googlesource.com/17580
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Herb Derby <herb@google.com>
This commit is contained in:
Herb Derby 2017-05-25 16:00:10 -04:00 committed by Skia Commit-Bot
parent fabe0b26d0
commit e7ba8b05d2
7 changed files with 6497 additions and 4743 deletions

View File

@ -19,7 +19,6 @@ tests_sources = [
"$_tests/BitSetTest.cpp", "$_tests/BitSetTest.cpp",
"$_tests/BlendTest.cpp", "$_tests/BlendTest.cpp",
"$_tests/BlitMaskClip.cpp", "$_tests/BlitMaskClip.cpp",
"$_tests/BlitRowTest.cpp",
"$_tests/BlurTest.cpp", "$_tests/BlurTest.cpp",
"$_tests/CachedDataTest.cpp", "$_tests/CachedDataTest.cpp",
"$_tests/CachedDecodingPixelRefTest.cpp", "$_tests/CachedDecodingPixelRefTest.cpp",

View File

@ -157,7 +157,7 @@ static SkJumper_Engine choose_engine() {
return { return {
#define M(stage) ASM(stage, sse41), #define M(stage) ASM(stage, sse41),
{ SK_RASTER_PIPELINE_STAGES(M) }, { SK_RASTER_PIPELINE_STAGES(M) },
4, M(start_pipeline) M(just_return) 1, M(start_pipeline) M(just_return)
#undef M #undef M
}; };
} }
@ -165,7 +165,7 @@ static SkJumper_Engine choose_engine() {
return { return {
#define M(stage) ASM(stage, sse2), #define M(stage) ASM(stage, sse2),
{ SK_RASTER_PIPELINE_STAGES(M) }, { SK_RASTER_PIPELINE_STAGES(M) },
4, M(start_pipeline) M(just_return) 1, M(start_pipeline) M(just_return)
#undef M #undef M
}; };
} }

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -76,10 +76,8 @@ struct LazyCtx {
// We're finally going to get to what a Stage function looks like! // We're finally going to get to what a Stage function looks like!
// It's best to jump down to the #else case first, then to come back up here for AVX. // It's best to jump down to the #else case first, then to come back up here for AVX.
#if defined(JUMPER) && defined(__AVX__) #if defined(JUMPER) && defined(__SSE2__)
// There's a big cost to switch between SSE and AVX, so we do a little // Process the tail on all x86 processors with SSE2 or better instructions.
// extra work to handle even the jagged <kStride tail in AVX mode.
// Compared to normal stages, we maintain an extra tail register:
// tail == 0 ~~> work on a full kStride pixels // tail == 0 ~~> work on a full kStride pixels
// tail != 0 ~~> work on only the first tail pixels // tail != 0 ~~> work on only the first tail pixels
// tail is always < kStride. // tail is always < kStride.
@ -113,8 +111,7 @@ struct LazyCtx {
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#else #else
// Other instruction sets (SSE, NEON, portable) can fall back on narrower // Other instruction sets (NEON, portable) currently always assume tail == 0.
// pipelines cheaply, which frees us to always assume tail==0.
// Stages tail call between each other by following program as described above. // Stages tail call between each other by following program as described above.
// x is our induction variable, stepping forward kStride at a time. // x is our induction variable, stepping forward kStride at a time.

View File

@ -488,13 +488,27 @@
} }
SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
__m128i _0, _1, _2, _3;
if (__builtin_expect(tail,0)) {
_1 = _2 = _3 = _mm_setzero_si128();
auto load_rgb = [](const uint16_t* src) {
auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
return _mm_insert_epi16(v, src[2], 2);
};
if ( true ) { _0 = load_rgb(ptr + 0); }
if (tail > 1) { _1 = load_rgb(ptr + 3); }
if (tail > 2) { _2 = load_rgb(ptr + 6); }
} else {
// Load slightly weirdly to make sure we don't load past the end of 4x48 bits. // Load slightly weirdly to make sure we don't load past the end of 4x48 bits.
auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) , auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ,
_23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4); _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4);
// Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored). // Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored).
auto _0 = _01, _1 = _mm_srli_si128(_01, 6), _0 = _01;
_2 = _23, _3 = _mm_srli_si128(_23, 6); _1 = _mm_srli_si128(_01, 6);
_2 = _23;
_3 = _mm_srli_si128(_23, 6);
}
// De-interlace to R,G,B. // De-interlace to R,G,B.
auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx
@ -508,9 +522,19 @@
*g = unaligned_load<U16>(&G); *g = unaligned_load<U16>(&G);
*b = unaligned_load<U16>(&B); *b = unaligned_load<U16>(&B);
} }
SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0), __m128i _01, _23;
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1); if (__builtin_expect(tail,0)) {
_01 = _23 = _mm_setzero_si128();
auto src = (const double*)ptr;
if ( true ) { _01 = _mm_loadl_pd(_01, src + 0); } // r0 g0 b0 a0 00 00 00 00
if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1
if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00
} else {
_01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3
}
auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
_13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3 _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3
@ -523,31 +547,55 @@
*b = unaligned_load<U16>((uint16_t*)&ba + 0); *b = unaligned_load<U16>((uint16_t*)&ba + 0);
*a = unaligned_load<U16>((uint16_t*)&ba + 4); *a = unaligned_load<U16>((uint16_t*)&ba + 4);
} }
SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)), auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a)); ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));
if (__builtin_expect(tail, 0)) {
auto dst = (double*)ptr;
if ( true ) { _mm_storel_pd(dst + 0, _mm_unpacklo_epi32(rg, ba)); }
if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); }
if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); }
} else {
_mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba)); _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
_mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba)); _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
} }
}
SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
auto _0 = _mm_loadu_ps(ptr+ 0), F _0, _1, _2, _3;
_1 = _mm_loadu_ps(ptr+ 4), if (__builtin_expect(tail, 0)) {
_2 = _mm_loadu_ps(ptr+ 8), _1 = _2 = _3 = _mm_setzero_si128();
if ( true ) { _0 = _mm_loadu_ps(ptr + 0); }
if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); }
if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); }
} else {
_0 = _mm_loadu_ps(ptr + 0);
_1 = _mm_loadu_ps(ptr + 4);
_2 = _mm_loadu_ps(ptr + 8);
_3 = _mm_loadu_ps(ptr +12); _3 = _mm_loadu_ps(ptr +12);
}
_MM_TRANSPOSE4_PS(_0,_1,_2,_3); _MM_TRANSPOSE4_PS(_0,_1,_2,_3);
*r = _0; *r = _0;
*g = _1; *g = _1;
*b = _2; *b = _2;
*a = _3; *a = _3;
} }
SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
_MM_TRANSPOSE4_PS(r,g,b,a); _MM_TRANSPOSE4_PS(r,g,b,a);
if (__builtin_expect(tail, 0)) {
if ( true ) { _mm_storeu_ps(ptr + 0, r); }
if (tail > 1) { _mm_storeu_ps(ptr + 4, g); }
if (tail > 2) { _mm_storeu_ps(ptr + 8, b); }
} else {
_mm_storeu_ps(ptr + 0, r); _mm_storeu_ps(ptr + 0, r);
_mm_storeu_ps(ptr + 4, g); _mm_storeu_ps(ptr + 4, g);
_mm_storeu_ps(ptr + 8, b); _mm_storeu_ps(ptr + 8, b);
_mm_storeu_ps(ptr +12, a); _mm_storeu_ps(ptr +12, a);
} }
}
#endif #endif
// We need to be careful with casts. // We need to be careful with casts.

View File

@ -1,290 +0,0 @@
/*
* Copyright 2011 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "SkBitmap.h"
#include "SkCanvas.h"
#include "SkColorPriv.h"
#include "SkGradientShader.h"
#include "SkRect.h"
#include "SkVertices.h"
#include "Test.h"
#include "sk_tool_utils.h"
// Printable names for color types, used in failure messages. The entries are
// listed in the same order as the SkColorType enum so that an SkColorType
// value can be used directly as the index.
static const char* gColorTypeName[] = {
    "None",
    "A8",
    "565",
    "4444",
    "RGBA",
    "BGRA",
    "Index8",
};
/** Signature shared by the row checkers below.
 *
 *  Returns -1 on success. Otherwise returns the x coordinate of the first bad
 *  pixel and stores that pixel's value in *bad.
 */
using Proc = int (*)(const void*, int width, uint32_t expected, uint32_t* bad);
static int proc_32(const void* ptr, int w, uint32_t expected, uint32_t* bad) {
const SkPMColor* addr = static_cast<const SkPMColor*>(ptr);
for (int x = 0; x < w; x++) {
if (addr[x] != expected) {
*bad = addr[x];
return x;
}
}
return -1;
}
// Row checker for 16-bit pixels: verifies that the first w uint16_t values at
// ptr all equal expected. Returns -1 when they do; otherwise returns the index
// of the first mismatch and writes the mismatching pixel to *bad.
static int proc_16(const void* ptr, int w, uint32_t expected, uint32_t* bad) {
    const uint16_t* pixels = static_cast<const uint16_t*>(ptr);
    for (int col = 0; col < w; ++col) {
        const uint16_t pixel = pixels[col];
        if (pixel == expected) {
            continue;
        }
        *bad = pixel;
        return col;
    }
    return -1;
}
// Row checker for 8-bit (alpha-only) pixels: verifies that the first w bytes at
// ptr all equal expected. Returns -1 when they do; otherwise returns the index
// of the first mismatch and writes the mismatching value to *bad.
//
// find_proc() selects this checker for kAlpha_8_SkColorType, whose rows hold
// one byte per pixel. The row is therefore read as uint8_t: reading it as
// 32-bit SkPMColor would scan four times the row's length and report indices
// in units of four pixels.
static int proc_8(const void* ptr, int w, uint32_t expected, uint32_t* bad) {
    const uint8_t* addr = static_cast<const uint8_t*>(ptr);
    for (int x = 0; x < w; x++) {
        if (addr[x] != expected) {
            *bad = addr[x];
            return x;
        }
    }
    return -1;
}
// Fallback checker for color types with no real checker: it ignores the row,
// always reports a failure at x == 0, and sets *bad to 0.
static int proc_bad(const void*, int, uint32_t, uint32_t* bad) {
    const int firstPixel = 0;
    *bad = 0u;
    return firstPixel;
}
// Picks the row checker matching bm's color type and stores the corresponding
// expected pixel value in *expect:
//   kN32                 -> proc_32 with expect32
//   kARGB_4444 / kRGB_565 -> proc_16 with expect16
//   kAlpha_8             -> proc_8  with expect8
//   anything else        -> proc_bad with 0 (always fails)
static Proc find_proc(const SkBitmap& bm, SkPMColor expect32, uint16_t expect16,
                      uint8_t expect8, uint32_t* expect) {
    Proc checker = proc_bad;
    uint32_t want = 0;

    switch (bm.colorType()) {
        case kN32_SkColorType:
            want = expect32;
            checker = proc_32;
            break;
        case kARGB_4444_SkColorType:
        case kRGB_565_SkColorType:
            want = expect16;
            checker = proc_16;
            break;
        case kAlpha_8_SkColorType:
            want = expect8;
            checker = proc_8;
            break;
        default:
            // Keep the always-failing defaults set above.
            break;
    }

    *expect = want;
    return checker;
}
static bool check_color(const SkBitmap& bm, SkPMColor expect32,
uint16_t expect16, uint8_t expect8,
skiatest::Reporter* reporter) {
uint32_t expect;
Proc proc = find_proc(bm, expect32, expect16, expect8, &expect);
for (int y = 0; y < bm.height(); y++) {
uint32_t bad;
int x = proc(bm.getAddr(0, y), bm.width(), expect, &bad);
if (x >= 0) {
ERRORF(reporter, "BlitRow colortype=%s [%d %d] expected %x got %x",
gColorTypeName[bm.colorType()], x, y, expect, bad);
return false;
}
}
return true;
}
// Make sure our blits always map src==0 to a noop, and src==FF to full opaque.
//
// For each destination color type, a 256x1 source bitmap filled with fSrc is
// drawn onto a destination filled with fDst, once for every combination of
// dither on/off and paint alpha 0xFF/0x80. After each draw the destination
// must be uniformly equal to the expected result for its color type.
static void test_00_FF(skiatest::Reporter* reporter) {
    static const int W = 256;

    static const SkColorType gDstColorType[] = {
        kN32_SkColorType,
        kRGB_565_SkColorType,
    };

    // Source fill, destination fill, and the expected destination pixel in
    // 32-bit, 16-bit and 8-bit form.
    static const struct {
        SkColor   fSrc;
        SkColor   fDst;
        SkPMColor fResult32;
        uint16_t  fResult16;
        uint8_t   fResult8;
    } gSrcRec[] = {
        { 0,          0,          0,                                    0,      0 },
        { 0,          0xFFFFFFFF, SkPackARGB32(0xFF, 0xFF, 0xFF, 0xFF), 0xFFFF, 0xFF },
        { 0xFFFFFFFF, 0,          SkPackARGB32(0xFF, 0xFF, 0xFF, 0xFF), 0xFFFF, 0xFF },
        { 0xFFFFFFFF, 0xFFFFFFFF, SkPackARGB32(0xFF, 0xFF, 0xFF, 0xFF), 0xFFFF, 0xFF },
    };

    SkPaint paint;

    SkBitmap srcBM;
    srcBM.allocN32Pixels(W, 1);

    for (size_t i = 0; i < SK_ARRAY_COUNT(gDstColorType); i++) {
        SkImageInfo info = SkImageInfo::Make(W, 1, gDstColorType[i],
                                             kPremul_SkAlphaType);
        SkBitmap dstBM;
        dstBM.allocPixels(info);

        SkCanvas canvas(dstBM);
        for (size_t j = 0; j < SK_ARRAY_COUNT(gSrcRec); j++) {
            srcBM.eraseColor(gSrcRec[j].fSrc);
            dstBM.eraseColor(gSrcRec[j].fDst);

            // Bit 0 of k selects dithering, bit 1 selects half-alpha blending.
            // The destination is not re-erased between the four draws.
            for (int k = 0; k < 4; k++) {
                bool dither = (k & 1) != 0;
                bool blend = (k & 2) != 0;

                // Blended non-zero sources are checked too: the skip that used
                // to exempt them ("can't make a numerical promise about
                // blending anything but 0") is disabled.

                paint.setDither(dither);
                paint.setAlpha(blend ? 0x80 : 0xFF);
                canvas.drawBitmap(srcBM, 0, 0, &paint);
                if (!check_color(dstBM, gSrcRec[j].fResult32, gSrcRec[j].fResult16,
                                 gSrcRec[j].fResult8, reporter)) {
                    // j is a size_t; convert explicitly so the argument
                    // matches the %d conversion on every platform.
                    SkDebugf("--- src index %d dither %d blend %d\n",
                             static_cast<int>(j), dither, blend);
                }
            }
        }
    }
}
///////////////////////////////////////////////////////////////////////////////
struct Mesh {
SkPoint fPts[4];
Mesh(const SkBitmap& bm, SkPaint* paint) {
const SkScalar w = SkIntToScalar(bm.width());
const SkScalar h = SkIntToScalar(bm.height());
fPts[0].set(0, 0);
fPts[1].set(w, 0);
fPts[2].set(w, h);
fPts[3].set(0, h);
paint->setShader(SkShader::MakeBitmapShader(bm, SkShader::kClamp_TileMode,
SkShader::kClamp_TileMode));
}
void draw(SkCanvas* canvas, SkPaint* paint) {
canvas->drawVertices(SkVertices::MakeCopy(SkVertices::kTriangleFan_VertexMode, 4, fPts,
fPts, nullptr),
SkBlendMode::kModulate, *paint);
}
};
#include "SkImageEncoder.h"
// Debugging aid: encodes bm as a PNG (quality argument 100) and writes it to
// the file called name.
static void save_bm(const SkBitmap& bm, const char name[]) {
    const int quality = 100;
    sk_tool_utils::EncodeImageToFile(name, bm, SkEncodedImageFormat::kPNG, quality);
}
// Returns the largest absolute difference between corresponding 8-bit channels
// of the two packed 32-bit values u and v.
static int max_diff(uint32_t u, uint32_t v) {
    int worst = 0;
    // Visit the four byte lanes from the most significant one down.
    for (int shift = 24; shift >= 0; shift -= 8) {
        const int lhs = int((u >> shift) & 0xFF);
        const int rhs = int((v >> shift) & 0xFF);
        worst = SkMax32(worst, SkAbs32(lhs - rhs));
    }
    return worst;
}
static bool nearly_eq(const SkBitmap& a, const SkBitmap& b) {
switch (a.colorType()) {
case kN32_SkColorType: {
for (int y = 0; y < a.width(); ++y) {
const SkPMColor* ap = a.getAddr32(0, y);
const SkPMColor* bp = b.getAddr32(0, y);
for (int x = 0; x < a.width(); ++x) {
int diff = max_diff(ap[x], bp[x]);
if (diff > 1) {
return false;
}
}
}
return true;
} break;
default:
break;
}
return !memcmp(a.getPixels(), b.getPixels(), a.getSize());
}
// Guards the (currently disabled) one-time dump of the two result bitmaps.
static bool gOnce;

// Make sure our blits are invariant with the width of the blit (i.e. that
// special case for 8 at a time have the same results as narrower blits).
//
// For every destination color type, background color, gray level c in
// [0, 255] and dither/alpha combination, the same shaded content is drawn once
// with drawRect() and once as a mesh via drawVertices(); the two destinations
// must then be nearly equal.
static void test_diagonal(skiatest::Reporter* reporter) {
    static const int W = 64;
    static const int H = W;

    static const SkColorType gDstColorType[] = {
        kN32_SkColorType,
        kRGB_565_SkColorType,
    };
    static const SkColor gDstBG[] = { 0, 0xFFFFFFFF };

    const SkRect srcR = SkRect::MakeIWH(W, H);

    SkBitmap srcBM;
    srcBM.allocN32Pixels(W, H);

    SkImageInfo info = SkImageInfo::Make(W, H, kUnknown_SkColorType, kPremul_SkAlphaType);

    for (const SkColorType dstCT : gDstColorType) {
        info = info.makeColorType(dstCT);

        // Two identically configured destinations: one per drawing path.
        SkBitmap dstBM0, dstBM1;
        dstBM0.allocPixels(info);
        dstBM1.allocPixels(info);

        SkCanvas canvas0(dstBM0);
        SkCanvas canvas1(dstBM1);

        for (const SkColor bgColor : gDstBG) {
            for (int c = 0; c <= 0xFF; c++) {
                // cons up a mesh to draw the bitmap with
                SkPaint paint;
                srcBM.eraseARGB(0xFF, c, c, c);
                Mesh mesh(srcBM, &paint);

                // Bit 0 of combo selects dithering, bit 1 selects half alpha.
                for (int combo = 0; combo < 4; ++combo) {
                    const bool    dither = (combo & 1) != 0;
                    const uint8_t alpha  = (combo & 2) ? 0x80 : 0xFF;
                    paint.setDither(dither);
                    paint.setAlpha(alpha);

                    dstBM0.eraseColor(bgColor);
                    dstBM1.eraseColor(bgColor);

                    canvas0.drawRect(srcR, paint);
                    mesh.draw(&canvas1, &paint);

                    // Deliberately disabled debugging hook ("&& false").
                    if (!gOnce && false) {
                        save_bm(dstBM0, "drawBitmap.png");
                        save_bm(dstBM1, "drawMesh.png");
                        gOnce = true;
                    }

                    if (nearly_eq(dstBM0, dstBM1)) {
                        continue;
                    }
                    ERRORF(reporter, "Diagonal colortype=%s bg=0x%x dither=%d"
                           " alpha=0x%x src=0x%x",
                           gColorTypeName[dstCT], bgColor, dither,
                           alpha, c);
                }
            }
        }
    }
}
// Entry point of the "BlitRow" test, declared through the DEF_TEST macro.
// Runs both checks defined in this file with the supplied reporter.
DEF_TEST(BlitRow, reporter) {
// src == 0 / src == FF invariants for bitmap blits.
test_00_FF(reporter);
// drawRect and drawVertices must produce nearly identical results.
test_diagonal(reporter);
}