Add tail handling for SSE* to SkJumper.
Change-Id: Icb9d385333082de2f99b7a25cfd7251717e3f663 Reviewed-on: https://skia-review.googlesource.com/17580 Reviewed-by: Mike Klein <mtklein@chromium.org> Commit-Queue: Herb Derby <herb@google.com>
This commit is contained in:
parent
fabe0b26d0
commit
e7ba8b05d2
@ -19,7 +19,6 @@ tests_sources = [
|
||||
"$_tests/BitSetTest.cpp",
|
||||
"$_tests/BlendTest.cpp",
|
||||
"$_tests/BlitMaskClip.cpp",
|
||||
"$_tests/BlitRowTest.cpp",
|
||||
"$_tests/BlurTest.cpp",
|
||||
"$_tests/CachedDataTest.cpp",
|
||||
"$_tests/CachedDecodingPixelRefTest.cpp",
|
||||
|
@ -157,7 +157,7 @@ static SkJumper_Engine choose_engine() {
|
||||
return {
|
||||
#define M(stage) ASM(stage, sse41),
|
||||
{ SK_RASTER_PIPELINE_STAGES(M) },
|
||||
4, M(start_pipeline) M(just_return)
|
||||
1, M(start_pipeline) M(just_return)
|
||||
#undef M
|
||||
};
|
||||
}
|
||||
@ -165,7 +165,7 @@ static SkJumper_Engine choose_engine() {
|
||||
return {
|
||||
#define M(stage) ASM(stage, sse2),
|
||||
{ SK_RASTER_PIPELINE_STAGES(M) },
|
||||
4, M(start_pipeline) M(just_return)
|
||||
1, M(start_pipeline) M(just_return)
|
||||
#undef M
|
||||
};
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -76,10 +76,8 @@ struct LazyCtx {
|
||||
// We're finally going to get to what a Stage function looks like!
|
||||
// It's best to jump down to the #else case first, then to come back up here for AVX.
|
||||
|
||||
#if defined(JUMPER) && defined(__AVX__)
|
||||
// There's a big cost to switch between SSE and AVX, so we do a little
|
||||
// extra work to handle even the jagged <kStride tail in AVX mode.
|
||||
// Compared to normal stages, we maintain an extra tail register:
|
||||
#if defined(JUMPER) && defined(__SSE2__)
|
||||
// Process the tail on all x86 processors with SSE2 or better instructions.
|
||||
// tail == 0 ~~> work on a full kStride pixels
|
||||
// tail != 0 ~~> work on only the first tail pixels
|
||||
// tail is always < kStride.
|
||||
@ -113,8 +111,7 @@ struct LazyCtx {
|
||||
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
|
||||
|
||||
#else
|
||||
// Other instruction sets (SSE, NEON, portable) can fall back on narrower
|
||||
// pipelines cheaply, which frees us to always assume tail==0.
|
||||
// Other instruction sets (NEON, portable) currently always assume tail == 0.
|
||||
|
||||
// Stages tail call between each other by following program as described above.
|
||||
// x is our induction variable, stepping forward kStride at a time.
|
||||
|
@ -488,13 +488,27 @@
|
||||
}
|
||||
|
||||
SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
|
||||
__m128i _0, _1, _2, _3;
|
||||
if (__builtin_expect(tail,0)) {
|
||||
_1 = _2 = _3 = _mm_setzero_si128();
|
||||
auto load_rgb = [](const uint16_t* src) {
|
||||
auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
|
||||
return _mm_insert_epi16(v, src[2], 2);
|
||||
};
|
||||
if ( true ) { _0 = load_rgb(ptr + 0); }
|
||||
if (tail > 1) { _1 = load_rgb(ptr + 3); }
|
||||
if (tail > 2) { _2 = load_rgb(ptr + 6); }
|
||||
} else {
|
||||
// Load slightly weirdly to make sure we don't load past the end of 4x48 bits.
|
||||
auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ,
|
||||
_23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4);
|
||||
|
||||
// Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored).
|
||||
auto _0 = _01, _1 = _mm_srli_si128(_01, 6),
|
||||
_2 = _23, _3 = _mm_srli_si128(_23, 6);
|
||||
_0 = _01;
|
||||
_1 = _mm_srli_si128(_01, 6);
|
||||
_2 = _23;
|
||||
_3 = _mm_srli_si128(_23, 6);
|
||||
}
|
||||
|
||||
// De-interlace to R,G,B.
|
||||
auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx
|
||||
@ -508,9 +522,19 @@
|
||||
*g = unaligned_load<U16>(&G);
|
||||
*b = unaligned_load<U16>(&B);
|
||||
}
|
||||
|
||||
SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
|
||||
auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
|
||||
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
|
||||
__m128i _01, _23;
|
||||
if (__builtin_expect(tail,0)) {
|
||||
_01 = _23 = _mm_setzero_si128();
|
||||
auto src = (const double*)ptr;
|
||||
if ( true ) { _01 = _mm_loadl_pd(_01, src + 0); } // r0 g0 b0 a0 00 00 00 00
|
||||
if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1
|
||||
if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00
|
||||
} else {
|
||||
_01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1
|
||||
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3
|
||||
}
|
||||
|
||||
auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
|
||||
_13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3
|
||||
@ -523,31 +547,55 @@
|
||||
*b = unaligned_load<U16>((uint16_t*)&ba + 0);
|
||||
*a = unaligned_load<U16>((uint16_t*)&ba + 4);
|
||||
}
|
||||
|
||||
// Interleaves planar 16-bit r,g,b,a lanes back into RGBA pixels (4 x uint16_t
// each) and writes them to ptr.
//   tail == 0 ~~> store all four pixels
//   tail != 0 ~~> store only the first tail pixels (1, 2 or 3)
SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
    auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
         ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));

    // Compute each interleave once; both the tail and the full path consume them.
    const __m128i px01 = _mm_unpacklo_epi32(rg, ba),   // r0 g0 b0 a0 r1 g1 b1 a1
                  px23 = _mm_unpackhi_epi32(rg, ba);   // r2 g2 b2 a2 r3 g3 b3 a3

    if (__builtin_expect(tail, 0)) {
        // One pixel is exactly 64 bits, so the 64-bit half-register stores write
        // a single pixel each.  The _pd stores take __m128d, so reinterpret the
        // integer vectors explicitly rather than relying on lax implicit vector
        // conversions.
        auto dst = (double*)ptr;
        if (  true  ) { _mm_storel_pd(dst + 0, _mm_castsi128_pd(px01)); }
        if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_castsi128_pd(px01)); }
        if (tail > 2) { _mm_storel_pd(dst + 2, _mm_castsi128_pd(px23)); }
    } else {
        _mm_storeu_si128((__m128i*)ptr + 0, px01);
        _mm_storeu_si128((__m128i*)ptr + 1, px23);
    }
}
|
||||
|
||||
// Loads RGBA float pixels (4 floats each) from ptr and transposes them into
// planar r,g,b,a vectors.
//   tail == 0 ~~> load all four pixels
//   tail != 0 ~~> load only the first tail pixels; the remaining rows are zero
SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
    F _0, _1, _2, _3;
    if (__builtin_expect(tail, 0)) {
        // Zero the rows we may not load.  Use the float zero intrinsic so the
        // assignment does not depend on an implicit integer->float vector
        // conversion; the bit pattern (all zeros) is the same.
        _1 = _2 = _3 = _mm_setzero_ps();
        if (  true  ) { _0 = _mm_loadu_ps(ptr + 0); }
        if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); }
        if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); }
    } else {
        _0 = _mm_loadu_ps(ptr + 0);
        _1 = _mm_loadu_ps(ptr + 4);
        _2 = _mm_loadu_ps(ptr + 8);
        _3 = _mm_loadu_ps(ptr +12);
    }
    // Rows are pixels on the way in, channels on the way out.
    _MM_TRANSPOSE4_PS(_0,_1,_2,_3);
    *r = _0;
    *g = _1;
    *b = _2;
    *a = _3;
}
|
||||
|
||||
// Transposes planar r,g,b,a vectors back into RGBA pixels and writes them to
// ptr, four floats per pixel.
//   tail == 0 ~~> write all four pixels
//   tail != 0 ~~> write only the first tail pixels
SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
    // After the transpose each of r,g,b,a holds one whole pixel (pixels 0..3).
    _MM_TRANSPOSE4_PS(r,g,b,a);

    if (__builtin_expect(tail, 0)) {
        // Jagged tail: pixel 0 is always present, pixels 1 and 2 only if asked for.
        _mm_storeu_ps(ptr + 0, r);
        if (tail > 1) {
            _mm_storeu_ps(ptr + 4, g);
        }
        if (tail > 2) {
            _mm_storeu_ps(ptr + 8, b);
        }
        return;
    }

    // Full stride.
    _mm_storeu_ps(ptr +  0, r);
    _mm_storeu_ps(ptr +  4, g);
    _mm_storeu_ps(ptr +  8, b);
    _mm_storeu_ps(ptr + 12, a);
}
|
||||
#endif
|
||||
|
||||
// We need to be careful with casts.
|
||||
|
@ -1,290 +0,0 @@
|
||||
/*
|
||||
* Copyright 2011 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#include "SkBitmap.h"
|
||||
#include "SkCanvas.h"
|
||||
#include "SkColorPriv.h"
|
||||
#include "SkGradientShader.h"
|
||||
#include "SkRect.h"
|
||||
#include "SkVertices.h"
|
||||
#include "Test.h"
|
||||
|
||||
#include "sk_tool_utils.h"
|
||||
|
||||
// Printable names for error messages, indexed by SkColorType value.
// The entries must stay in the same order as the SkColorType enum.
static const char* gColorTypeName[] = {
    "None",    // 0
    "A8",      // 1
    "565",     // 2
    "4444",    // 3
    "RGBA",    // 4
    "BGRA",    // 5
    "Index8",  // 6
};
|
||||
|
||||
/** Row checker: scans `width` pixels starting at the given address.
    Returns -1 on success; otherwise returns the x coordinate of the first bad
    pixel and writes that pixel's value to *bad.
*/
using Proc = int (*)(const void*, int width, uint32_t expected, uint32_t* bad);
|
||||
|
||||
static int proc_32(const void* ptr, int w, uint32_t expected, uint32_t* bad) {
|
||||
const SkPMColor* addr = static_cast<const SkPMColor*>(ptr);
|
||||
for (int x = 0; x < w; x++) {
|
||||
if (addr[x] != expected) {
|
||||
*bad = addr[x];
|
||||
return x;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Row checker for 16-bit pixels: returns -1 when the first w pixels all equal
// `expected`; otherwise stores the first differing pixel in *bad and returns
// its x coordinate.
static int proc_16(const void* ptr, int w, uint32_t expected, uint32_t* bad) {
    const uint16_t* row = static_cast<const uint16_t*>(ptr);
    for (int col = 0; col < w; ++col) {
        const uint16_t pixel = row[col];
        if (pixel == expected) {
            continue;
        }
        *bad = pixel;
        return col;
    }
    return -1;
}
|
||||
|
||||
// Row checker for 8-bit (alpha-only) pixels: returns -1 when the first w
// pixels all equal `expected`; otherwise stores the first differing value in
// *bad and returns its x coordinate.
//
// find_proc selects this checker for kAlpha_8 bitmaps, so the row is read one
// byte per pixel.  Reading it as 32-bit pixels would walk four times past the
// end of the row and compare the wrong bytes.
static int proc_8(const void* ptr, int w, uint32_t expected, uint32_t* bad) {
    const uint8_t* addr = static_cast<const uint8_t*>(ptr);
    for (int x = 0; x < w; x++) {
        if (addr[x] != expected) {
            *bad = addr[x];
            return x;
        }
    }
    return -1;
}
|
||||
|
||||
// Fallback row checker: always reports a failure at x == 0 with a bad value
// of 0, regardless of the row contents.
static int proc_bad(const void* /*ptr*/, int /*w*/, uint32_t /*expected*/, uint32_t* bad) {
    const int firstColumn = 0;
    *bad = 0;
    return firstColumn;
}
|
||||
|
||||
// Picks the row checker matching bm's color type and writes the expected
// pixel value for that format to *expect.  Color types without a checker get
// proc_bad and an expected value of 0.
static Proc find_proc(const SkBitmap& bm, SkPMColor expect32, uint16_t expect16,
                      uint8_t expect8, uint32_t* expect) {
    const auto ct = bm.colorType();

    if (ct == kN32_SkColorType) {
        *expect = expect32;
        return proc_32;
    }
    if (ct == kARGB_4444_SkColorType || ct == kRGB_565_SkColorType) {
        *expect = expect16;
        return proc_16;
    }
    if (ct == kAlpha_8_SkColorType) {
        *expect = expect8;
        return proc_8;
    }

    *expect = 0;
    return proc_bad;
}
|
||||
|
||||
static bool check_color(const SkBitmap& bm, SkPMColor expect32,
|
||||
uint16_t expect16, uint8_t expect8,
|
||||
skiatest::Reporter* reporter) {
|
||||
uint32_t expect;
|
||||
Proc proc = find_proc(bm, expect32, expect16, expect8, &expect);
|
||||
for (int y = 0; y < bm.height(); y++) {
|
||||
uint32_t bad;
|
||||
int x = proc(bm.getAddr(0, y), bm.width(), expect, &bad);
|
||||
if (x >= 0) {
|
||||
ERRORF(reporter, "BlitRow colortype=%s [%d %d] expected %x got %x",
|
||||
gColorTypeName[bm.colorType()], x, y, expect, bad);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Make sure our blits always map src==0 to a noop, and src==FF to full opaque
|
||||
static void test_00_FF(skiatest::Reporter* reporter) {
|
||||
static const int W = 256;
|
||||
|
||||
static const SkColorType gDstColorType[] = {
|
||||
kN32_SkColorType,
|
||||
kRGB_565_SkColorType,
|
||||
};
|
||||
|
||||
static const struct {
|
||||
SkColor fSrc;
|
||||
SkColor fDst;
|
||||
SkPMColor fResult32;
|
||||
uint16_t fResult16;
|
||||
uint8_t fResult8;
|
||||
} gSrcRec[] = {
|
||||
{ 0, 0, 0, 0, 0 },
|
||||
{ 0, 0xFFFFFFFF, SkPackARGB32(0xFF, 0xFF, 0xFF, 0xFF), 0xFFFF, 0xFF },
|
||||
{ 0xFFFFFFFF, 0, SkPackARGB32(0xFF, 0xFF, 0xFF, 0xFF), 0xFFFF, 0xFF },
|
||||
{ 0xFFFFFFFF, 0xFFFFFFFF, SkPackARGB32(0xFF, 0xFF, 0xFF, 0xFF), 0xFFFF, 0xFF },
|
||||
};
|
||||
|
||||
SkPaint paint;
|
||||
|
||||
SkBitmap srcBM;
|
||||
srcBM.allocN32Pixels(W, 1);
|
||||
|
||||
for (size_t i = 0; i < SK_ARRAY_COUNT(gDstColorType); i++) {
|
||||
SkImageInfo info = SkImageInfo::Make(W, 1, gDstColorType[i],
|
||||
kPremul_SkAlphaType);
|
||||
SkBitmap dstBM;
|
||||
dstBM.allocPixels(info);
|
||||
|
||||
SkCanvas canvas(dstBM);
|
||||
for (size_t j = 0; j < SK_ARRAY_COUNT(gSrcRec); j++) {
|
||||
srcBM.eraseColor(gSrcRec[j].fSrc);
|
||||
dstBM.eraseColor(gSrcRec[j].fDst);
|
||||
|
||||
for (int k = 0; k < 4; k++) {
|
||||
bool dither = (k & 1) != 0;
|
||||
bool blend = (k & 2) != 0;
|
||||
if (gSrcRec[j].fSrc != 0 && blend) {
|
||||
// can't make a numerical promise about blending anything
|
||||
// but 0
|
||||
// continue;
|
||||
}
|
||||
paint.setDither(dither);
|
||||
paint.setAlpha(blend ? 0x80 : 0xFF);
|
||||
canvas.drawBitmap(srcBM, 0, 0, &paint);
|
||||
if (!check_color(dstBM, gSrcRec[j].fResult32, gSrcRec[j].fResult16,
|
||||
gSrcRec[j].fResult8, reporter)) {
|
||||
SkDebugf("--- src index %d dither %d blend %d\n", j, dither, blend);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct Mesh {
|
||||
SkPoint fPts[4];
|
||||
|
||||
Mesh(const SkBitmap& bm, SkPaint* paint) {
|
||||
const SkScalar w = SkIntToScalar(bm.width());
|
||||
const SkScalar h = SkIntToScalar(bm.height());
|
||||
fPts[0].set(0, 0);
|
||||
fPts[1].set(w, 0);
|
||||
fPts[2].set(w, h);
|
||||
fPts[3].set(0, h);
|
||||
paint->setShader(SkShader::MakeBitmapShader(bm, SkShader::kClamp_TileMode,
|
||||
SkShader::kClamp_TileMode));
|
||||
}
|
||||
|
||||
void draw(SkCanvas* canvas, SkPaint* paint) {
|
||||
canvas->drawVertices(SkVertices::MakeCopy(SkVertices::kTriangleFan_VertexMode, 4, fPts,
|
||||
fPts, nullptr),
|
||||
SkBlendMode::kModulate, *paint);
|
||||
}
|
||||
};
|
||||
|
||||
#include "SkImageEncoder.h"
|
||||
// Debug helper: encodes bm as a PNG and writes it to the file `name`.
static void save_bm(const SkBitmap& bm, const char name[]) {
    // Last argument to EncodeImageToFile — presumably the encoder quality; verify.
    const int kQuality = 100;
    sk_tool_utils::EncodeImageToFile(name, bm, SkEncodedImageFormat::kPNG, kQuality);
}
|
||||
|
||||
static int max_diff(uint32_t u, uint32_t v) {
|
||||
int d0 = SkAbs32(int((u >> 24) & 0xFF) - int((v >> 24) & 0xFF));
|
||||
int d1 = SkAbs32(int((u >> 16) & 0xFF) - int((v >> 16) & 0xFF));
|
||||
int d2 = SkAbs32(int((u >> 8) & 0xFF) - int((v >> 8) & 0xFF));
|
||||
int d3 = SkAbs32(int((u >> 0) & 0xFF) - int((v >> 0) & 0xFF));
|
||||
return SkMax32(d0, SkMax32(d1, SkMax32(d2, d3)));
|
||||
}
|
||||
|
||||
static bool nearly_eq(const SkBitmap& a, const SkBitmap& b) {
|
||||
switch (a.colorType()) {
|
||||
case kN32_SkColorType: {
|
||||
for (int y = 0; y < a.width(); ++y) {
|
||||
const SkPMColor* ap = a.getAddr32(0, y);
|
||||
const SkPMColor* bp = b.getAddr32(0, y);
|
||||
for (int x = 0; x < a.width(); ++x) {
|
||||
int diff = max_diff(ap[x], bp[x]);
|
||||
if (diff > 1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return !memcmp(a.getPixels(), b.getPixels(), a.getSize());
|
||||
}
|
||||
|
||||
// Latch for the debug PNG dump in test_diagonal (currently disabled there via
// `&& false`): set after the first dump so the images are written at most once.
static bool gOnce;
|
||||
|
||||
// Make sure our blits are invariant with the width of the blit (i.e. that
|
||||
// special case for 8 at a time have the same results as narrower blits)
|
||||
static void test_diagonal(skiatest::Reporter* reporter) {
|
||||
static const int W = 64;
|
||||
static const int H = W;
|
||||
|
||||
static const SkColorType gDstColorType[] = {
|
||||
kN32_SkColorType,
|
||||
kRGB_565_SkColorType,
|
||||
};
|
||||
|
||||
static const SkColor gDstBG[] = { 0, 0xFFFFFFFF };
|
||||
const SkRect srcR = SkRect::MakeIWH(W, H);
|
||||
|
||||
SkBitmap srcBM;
|
||||
srcBM.allocN32Pixels(W, H);
|
||||
SkImageInfo info = SkImageInfo::Make(W, H, kUnknown_SkColorType, kPremul_SkAlphaType);
|
||||
|
||||
for (size_t i = 0; i < SK_ARRAY_COUNT(gDstColorType); i++) {
|
||||
info = info.makeColorType(gDstColorType[i]);
|
||||
|
||||
SkBitmap dstBM0, dstBM1;
|
||||
dstBM0.allocPixels(info);
|
||||
dstBM1.allocPixels(info);
|
||||
|
||||
SkCanvas canvas0(dstBM0);
|
||||
SkCanvas canvas1(dstBM1);
|
||||
SkColor bgColor;
|
||||
|
||||
for (size_t j = 0; j < SK_ARRAY_COUNT(gDstBG); j++) {
|
||||
bgColor = gDstBG[j];
|
||||
|
||||
for (int c = 0; c <= 0xFF; c++) {
|
||||
// cons up a mesh to draw the bitmap with
|
||||
SkPaint paint;
|
||||
srcBM.eraseARGB(0xFF, c, c, c);
|
||||
Mesh mesh(srcBM, &paint);
|
||||
|
||||
for (int k = 0; k < 4; k++) {
|
||||
bool dither = (k & 1) != 0;
|
||||
uint8_t alpha = (k & 2) ? 0x80 : 0xFF;
|
||||
paint.setDither(dither);
|
||||
paint.setAlpha(alpha);
|
||||
|
||||
dstBM0.eraseColor(bgColor);
|
||||
dstBM1.eraseColor(bgColor);
|
||||
|
||||
canvas0.drawRect(srcR, paint);
|
||||
mesh.draw(&canvas1, &paint);
|
||||
|
||||
if (!gOnce && false) {
|
||||
save_bm(dstBM0, "drawBitmap.png");
|
||||
save_bm(dstBM1, "drawMesh.png");
|
||||
gOnce = true;
|
||||
}
|
||||
|
||||
if (!nearly_eq(dstBM0, dstBM1)) {
|
||||
ERRORF(reporter, "Diagonal colortype=%s bg=0x%x dither=%d"
|
||||
" alpha=0x%x src=0x%x",
|
||||
gColorTypeName[gDstColorType[i]], bgColor, dither,
|
||||
alpha, c);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test entry point: runs the 00/FF blit checks, then the rect-vs-mesh
// comparison.
DEF_TEST(BlitRow, reporter) {
    // src == 0 must be a no-op and src == FF must be fully opaque.
    test_00_FF(reporter);

    // Rect and mesh draws of the same paint must agree.
    test_diagonal(reporter);
}
|
Loading…
Reference in New Issue
Block a user