I have found a more efficient way of detecting 1 and 0 alpha in SSE2. In addition, I found a stall on an execution unit for the lea instruction and rearranged the code to avoid that.
Before 1,362.01 LinearSrcOvericonstrip.pngVSkOptsSSE41 2,132.54 LinearSrcOvericonstrip.pngVSkOptsDefault 1,717.77 LinearSrcOvericonstrip.pngVSkOptsNonSimdCore 3,525.14 LinearSrcOvericonstrip.pngVSkOptsTrivial 11,181.78 LinearSrcOvericonstrip.pngVSkOptsBruteForce 644.77 LinearSrcOvermandrill_512.pngVSkOptsSSE41 682.51 LinearSrcOvermandrill_512.pngVSkOptsDefault 1,169.65 LinearSrcOvermandrill_512.pngVSkOptsNonSimdCore 2,486.45 LinearSrcOvermandrill_512.pngVSkOptsTrivial 11,635.94 LinearSrcOvermandrill_512.pngVSkOptsBruteForce 217.76 LinearSrcOverplane.pngVSkOptsSSE41 437.09 LinearSrcOverplane.pngVSkOptsDefault 275.91 LinearSrcOverplane.pngVSkOptsNonSimdCore 481.70 LinearSrcOverplane.pngVSkOptsTrivial 1,504.66 LinearSrcOverplane.pngVSkOptsBruteForce 323.90 LinearSrcOverbaby_tux.pngVSkOptsSSE41 497.49 LinearSrcOverbaby_tux.pngVSkOptsDefault 456.08 LinearSrcOverbaby_tux.pngVSkOptsNonSimdCore 786.46 LinearSrcOverbaby_tux.pngVSkOptsTrivial 2,554.65 LinearSrcOverbaby_tux.pngVSkOptsBruteForce 484.83 LinearSrcOveryellow_rose.pngVSkOptsSSE41 821.86 LinearSrcOveryellow_rose.pngVSkOptsDefault 655.37 LinearSrcOveryellow_rose.pngVSkOptsNonSimdCore 1,323.80 LinearSrcOveryellow_rose.pngVSkOptsTrivial 5,802.61 LinearSrcOveryellow_rose.pngVSkOptsBruteForce After changes to sse2 and sse4.1 1,343.12 LinearSrcOvericonstrip.pngVSkOptsSSE41 1,441.17 LinearSrcOvericonstrip.pngVSkOptsDefault 1,679.97 LinearSrcOvericonstrip.pngVSkOptsNonSimdCore 3,481.05 LinearSrcOvericonstrip.pngVSkOptsTrivial 10,979.99 LinearSrcOvericonstrip.pngVSkOptsBruteForce 574.17 LinearSrcOvermandrill_512.pngVSkOptsSSE41 641.40 LinearSrcOvermandrill_512.pngVSkOptsDefault 1,169.44 LinearSrcOvermandrill_512.pngVSkOptsNonSimdCore 2,359.84 LinearSrcOvermandrill_512.pngVSkOptsTrivial 12,106.02 LinearSrcOvermandrill_512.pngVSkOptsBruteForce 209.95 LinearSrcOverplane.pngVSkOptsSSE41 249.12 LinearSrcOverplane.pngVSkOptsDefault 270.36 LinearSrcOverplane.pngVSkOptsNonSimdCore 466.30 LinearSrcOverplane.pngVSkOptsTrivial 
1,431.14 LinearSrcOverplane.pngVSkOptsBruteForce 309.70 LinearSrcOverbaby_tux.pngVSkOptsSSE41 354.86 LinearSrcOverbaby_tux.pngVSkOptsDefault 442.69 LinearSrcOverbaby_tux.pngVSkOptsNonSimdCore 764.12 LinearSrcOverbaby_tux.pngVSkOptsTrivial 2,756.16 LinearSrcOverbaby_tux.pngVSkOptsBruteForce 457.70 LinearSrcOveryellow_rose.pngVSkOptsSSE41 500.50 LinearSrcOveryellow_rose.pngVSkOptsDefault 677.84 LinearSrcOveryellow_rose.pngVSkOptsNonSimdCore 1,301.50 LinearSrcOveryellow_rose.pngVSkOptsTrivial 5,786.40 LinearSrcOveryellow_rose.pngVSkOptsBruteForce BUG=skia: GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1998373002 CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot Review-Url: https://codereview.chromium.org/1998373002
This commit is contained in:
parent
1d15596200
commit
074b48ecb5
@ -127,27 +127,33 @@ void trivial_srcover_srgb_srgb(
|
||||
int count = SkTMin(ndst, nsrc);
|
||||
ndst -= count;
|
||||
const uint32_t* src = srcStart;
|
||||
const uint32_t* end = src + (count & ~3);
|
||||
const uint32_t* end = dst + (count & ~3);
|
||||
ptrdiff_t delta = src - dst;
|
||||
|
||||
while (src < end) {
|
||||
while (dst < end) {
|
||||
__m128i pixels = load(src);
|
||||
if (_mm_testc_si128(pixels, alphaMask)) {
|
||||
uint32_t* start = dst;
|
||||
do {
|
||||
store(dst, pixels);
|
||||
dst += 4;
|
||||
src += 4;
|
||||
} while (src < end && _mm_testc_si128(pixels = load(src), alphaMask));
|
||||
} while (dst < end
|
||||
&& _mm_testc_si128(pixels = load(dst + delta), alphaMask));
|
||||
src += dst - start;
|
||||
} else if (_mm_testz_si128(pixels, alphaMask)) {
|
||||
do {
|
||||
dst += 4;
|
||||
src += 4;
|
||||
} while (src < end && _mm_testz_si128(pixels = load(src), alphaMask));
|
||||
} while (dst < end
|
||||
&& _mm_testz_si128(pixels = load(src), alphaMask));
|
||||
} else {
|
||||
uint32_t* start = dst;
|
||||
do {
|
||||
srcover_srgb_srgb_4(dst, src);
|
||||
srcover_srgb_srgb_4(dst, dst + delta);
|
||||
dst += 4;
|
||||
src += 4;
|
||||
} while (src < end && _mm_testnzc_si128(pixels = load(src), alphaMask));
|
||||
} while (dst < end
|
||||
&& _mm_testnzc_si128(pixels = load(dst + delta), alphaMask));
|
||||
src += dst - start;
|
||||
}
|
||||
}
|
||||
|
||||
@ -159,32 +165,34 @@ void trivial_srcover_srgb_srgb(
|
||||
}
|
||||
#else
|
||||
// SSE2 versions
|
||||
|
||||
// Note: In the next three comparisons a group of 4 pixels is converted to a group of
|
||||
// "signed" pixels because SSE2 does not have an unsigned comparison.
|
||||
// Make it so that we can use the signed comparison operators by biasing
|
||||
// 0x00xxxxxx to 0x80xxxxxx, which are the smallest values, and biasing 0xffxxxxxx to
|
||||
// 0x7fxxxxxx which is the largest set of values.
|
||||
static inline bool check_opaque_alphas(__m128i pixels) {
|
||||
__m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
|
||||
int mask =
|
||||
_mm_movemask_epi8(
|
||||
_mm_cmpeq_epi32(
|
||||
_mm_andnot_si128(pixels, _mm_set1_epi32(0xFF000000)),
|
||||
_mm_setzero_si128()));
|
||||
return mask == 0xFFFF;
|
||||
_mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000)));
|
||||
return mask == 0;
|
||||
}
|
||||
|
||||
static inline bool check_transparent_alphas(__m128i pixels) {
|
||||
__m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
|
||||
int mask =
|
||||
_mm_movemask_epi8(
|
||||
_mm_cmpeq_epi32(
|
||||
_mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)),
|
||||
_mm_setzero_si128()));
|
||||
return mask == 0xFFFF;
|
||||
_mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF)));
|
||||
return mask == 0;
|
||||
}
|
||||
|
||||
static inline bool check_partial_alphas(__m128i pixels) {
|
||||
__m128i alphas = _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000));
|
||||
int mask =
|
||||
_mm_movemask_epi8(
|
||||
_mm_cmpeq_epi8(
|
||||
_mm_srai_epi32(alphas, 8),
|
||||
alphas));
|
||||
return mask == 0xFFFF;
|
||||
__m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
|
||||
__m128i opaque = _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000));
|
||||
__m128i transparent = _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF));
|
||||
int mask = _mm_movemask_epi8(_mm_xor_si128(opaque, transparent));
|
||||
return mask == 0;
|
||||
}
|
||||
|
||||
void srcover_srgb_srgb(
|
||||
@ -193,30 +201,33 @@ void trivial_srcover_srgb_srgb(
|
||||
int count = SkTMin(ndst, nsrc);
|
||||
ndst -= count;
|
||||
const uint32_t* src = srcStart;
|
||||
const uint32_t* end = src + (count & ~3);
|
||||
const uint32_t* end = dst + (count & ~3);
|
||||
const ptrdiff_t delta = src - dst;
|
||||
|
||||
__m128i pixels = load(src);
|
||||
do {
|
||||
if (check_opaque_alphas(pixels)) {
|
||||
uint32_t* start = dst;
|
||||
do {
|
||||
store(dst, pixels);
|
||||
dst += 4;
|
||||
src += 4;
|
||||
} while (src < end && check_opaque_alphas(pixels = load(src)));
|
||||
} while (dst < end && check_opaque_alphas((pixels = load(dst + delta))));
|
||||
src += dst - start;
|
||||
} else if (check_transparent_alphas(pixels)) {
|
||||
const uint32_t* start = src;
|
||||
const uint32_t* start = dst;
|
||||
do {
|
||||
src += 4;
|
||||
} while (src < end && check_transparent_alphas(pixels = load(src)));
|
||||
dst += src - start;
|
||||
} else {
|
||||
do {
|
||||
srcover_srgb_srgb_4(dst, src);
|
||||
dst += 4;
|
||||
src += 4;
|
||||
} while (src < end && check_partial_alphas(pixels = load(src)));
|
||||
} while (dst < end && check_transparent_alphas(pixels = load(dst + delta)));
|
||||
src += dst - start;
|
||||
} else {
|
||||
const uint32_t* start = dst;
|
||||
do {
|
||||
srcover_srgb_srgb_4(dst, dst + delta);
|
||||
dst += 4;
|
||||
} while (dst < end && check_partial_alphas(pixels = load(dst + delta)));
|
||||
src += dst - start;
|
||||
}
|
||||
} while (src < end);
|
||||
} while (dst < end);
|
||||
|
||||
count = count & 3;
|
||||
while (count-- > 0) {
|
||||
|
Loading…
Reference in New Issue
Block a user