Xfermode: SSE2 implementation of a number of simple transfer modes

These modes share some common code and are not very complex, so they are grouped
together. This CL yields about a 50% performance improvement on a desktop
i7-3770. Here are the data:
before:
   Xfermode_Screen   8888:  cmsecs =     30.25   565:  cmsecs =     46.81
 Xfermode_Modulate   8888:  cmsecs =     22.48   565:  cmsecs =     40.06
     Xfermode_Plus   8888:  cmsecs =     21.04   565:  cmsecs =     37.51
      Xfermode_Xor   8888:  cmsecs =     37.18   565:  cmsecs =     52.53
  Xfermode_DstATop   8888:  cmsecs =     28.97   565:  cmsecs =     46.42
  Xfermode_SrcATop   8888:  cmsecs =     29.74   565:  cmsecs =     46.25
   Xfermode_DstOut   8888:  cmsecs =      5.34   565:  cmsecs =     24.53
   Xfermode_SrcOut   8888:  cmsecs =     12.25   565:  cmsecs =     24.39
    Xfermode_DstIn   8888:  cmsecs =      5.30   565:  cmsecs =     24.50
    Xfermode_SrcIn   8888:  cmsecs =     12.05   565:  cmsecs =     25.40
  Xfermode_DstOver   8888:  cmsecs =     12.45   565:  cmsecs =      0.15
  Xfermode_SrcOver   8888:  cmsecs =      2.68   565:  cmsecs =      4.42
after:
   Xfermode_Screen   8888:  cmsecs =     13.68   565:  cmsecs =     21.73
 Xfermode_Modulate   8888:  cmsecs =     13.25   565:  cmsecs =     20.97
     Xfermode_Plus   8888:  cmsecs =      9.77   565:  cmsecs =     16.71
      Xfermode_Xor   8888:  cmsecs =     17.64   565:  cmsecs =     25.62
  Xfermode_DstATop   8888:  cmsecs =     15.99   565:  cmsecs =     23.74
  Xfermode_SrcATop   8888:  cmsecs =     15.69   565:  cmsecs =     23.40
   Xfermode_DstOut   8888:  cmsecs =      4.77   565:  cmsecs =     11.85
   Xfermode_SrcOut   8888:  cmsecs =      4.98   565:  cmsecs =     11.84
    Xfermode_DstIn   8888:  cmsecs =      4.68   565:  cmsecs =     11.72
    Xfermode_SrcIn   8888:  cmsecs =      4.93   565:  cmsecs =     11.79
  Xfermode_DstOver   8888:  cmsecs =      5.04   565:  cmsecs =      0.15
  Xfermode_SrcOver   8888:  cmsecs =      2.69   565:  cmsecs =      4.42

BUG=skia:
R=mtklein@google.com

Author: qiankun.miao@intel.com

Review URL: https://codereview.chromium.org/232793002

git-svn-id: http://skia.googlecode.com/svn/trunk@14176 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
commit-bot@chromium.org 2014-04-14 14:54:22 +00:00
parent 282333ffa2
commit 54299654e9
2 changed files with 182 additions and 12 deletions

View File

@ -10,6 +10,10 @@
#include <emmintrin.h>
// Adds 1 to every 32-bit lane of alpha.
// NOTE(review): by its name this maps a 0..255 alpha to the 1..256 scale
// range; the input range is not enforced here.
static inline __m128i SkAlpha255To256_SSE2(const __m128i& alpha) {
    const __m128i one = _mm_set1_epi32(1);
    return _mm_add_epi32(one, alpha);
}
// See #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) in SkXfermode.cpp.
static inline __m128i SkAlphaMulAlpha_SSE2(const __m128i& a,
const __m128i& b) {
@ -21,6 +25,27 @@ static inline __m128i SkAlphaMulAlpha_SSE2(const __m128i& a,
return prod;
}
// SSE2 counterpart of SkAlphaMulQ (the portable version lives in
// SkColorPriv.h). Multiplies the bytes of c selected by gMask_00FF00FF and
// the remaining bytes separately by scale, using 16-bit lane multiplies, and
// recombines them into one packed result.
// NOTE(review): assumes each 32-bit lane of scale fits in 16 bits so it can
// be replicated into both halves of the lane - confirm against callers.
static inline __m128i SkAlphaMulQ_SSE2(const __m128i& c, const __m128i& scale) {
    const __m128i byteMask = _mm_set1_epi32(gMask_00FF00FF);
    // Replicate scale into the upper 16 bits of each 32-bit lane so a single
    // 16-bit multiply scales both masked components at once.
    const __m128i scale16 = _mm_or_si128(_mm_slli_epi32(scale, 16), scale);

    // uint32_t rb = ((c & mask) * scale) >> 8
    __m128i lowPair = _mm_mullo_epi16(_mm_and_si128(byteMask, c), scale16);
    lowPair = _mm_srli_epi16(lowPair, 8);

    // uint32_t ag = ((c >> 8) & mask) * scale
    __m128i highPair = _mm_and_si128(_mm_srli_epi16(c, 8), byteMask);
    highPair = _mm_mullo_epi16(highPair, scale16);

    // (rb & mask) | (ag & ~mask)
    const __m128i keptLow = _mm_and_si128(byteMask, lowPair);
    const __m128i keptHigh = _mm_andnot_si128(byteMask, highPair);
    return _mm_or_si128(keptLow, keptHigh);
}
static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) {
__m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT));
return _mm_srli_epi32(a, 24);

View File

@ -17,6 +17,15 @@ static inline __m128i SkDiv255Round_SSE2(const __m128i& a) {
return prod;
}
// Adds a and b per 32-bit lane and replaces any lane whose sum is greater
// than 255 (signed compare) with 255. Lanes at or below 255 pass through
// unchanged.
static inline __m128i saturated_add_SSE2(const __m128i& a, const __m128i& b) {
    const __m128i limit = _mm_set1_epi32(255);
    const __m128i total = _mm_add_epi32(a, b);
    // All-ones in every lane that overflowed the 8-bit range.
    const __m128i overflow = _mm_cmpgt_epi32(total, limit);
    // Blend: limit where overflowed, the raw sum elsewhere.
    const __m128i clamped = _mm_and_si128(overflow, limit);
    const __m128i passed = _mm_andnot_si128(overflow, total);
    return _mm_or_si128(clamped, passed);
}
static inline __m128i clamp_div255round_SSE2(const __m128i& prod) {
// test if > 0
__m128i cmp1 = _mm_cmpgt_epi32(prod, _mm_setzero_si128());
@ -38,6 +47,130 @@ static inline __m128i clamp_div255round_SSE2(const __m128i& prod) {
return ret;
}
// Source-over: returns src plus dst scaled by (256 - src alpha).
// NOTE(review): the final add is a plain 32-bit lane add over packed pixels,
// so it relies on no channel carrying into its neighbour - presumably
// guaranteed by premultiplied inputs; confirm.
static __m128i srcover_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    const __m128i srcAlpha = SkGetPackedA32_SSE2(src);
    const __m128i dstScale = _mm_sub_epi32(_mm_set1_epi32(256), srcAlpha);
    const __m128i scaledDst = SkAlphaMulQ_SSE2(dst, dstScale);
    return _mm_add_epi32(src, scaledDst);
}
// Destination-over: returns dst plus src scaled by (256 - dst alpha).
// NOTE(review): the final add is a plain 32-bit lane add over packed pixels,
// so it relies on no channel carrying into its neighbour - presumably
// guaranteed by premultiplied inputs; confirm.
static __m128i dstover_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    const __m128i dstAlpha = SkGetPackedA32_SSE2(dst);
    const __m128i srcScale = _mm_sub_epi32(_mm_set1_epi32(256), dstAlpha);
    const __m128i scaledSrc = SkAlphaMulQ_SSE2(src, srcScale);
    return _mm_add_epi32(dst, scaledSrc);
}
// Source-in: returns src scaled by (dst alpha + 1).
static __m128i srcin_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    const __m128i dstAlpha = SkGetPackedA32_SSE2(dst);
    const __m128i scale = SkAlpha255To256_SSE2(dstAlpha);
    return SkAlphaMulQ_SSE2(src, scale);
}
// Destination-in: returns dst scaled by (src alpha + 1).
static __m128i dstin_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    const __m128i srcAlpha = SkGetPackedA32_SSE2(src);
    const __m128i scale = SkAlpha255To256_SSE2(srcAlpha);
    return SkAlphaMulQ_SSE2(dst, scale);
}
// Source-out: returns src scaled by (256 - dst alpha).
static __m128i srcout_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    const __m128i dstAlpha = SkGetPackedA32_SSE2(dst);
    const __m128i scale = _mm_sub_epi32(_mm_set1_epi32(256), dstAlpha);
    return SkAlphaMulQ_SSE2(src, scale);
}
// Destination-out: returns dst scaled by (256 - src alpha).
static __m128i dstout_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    const __m128i srcAlpha = SkGetPackedA32_SSE2(src);
    const __m128i scale = _mm_sub_epi32(_mm_set1_epi32(256), srcAlpha);
    return SkAlphaMulQ_SSE2(dst, scale);
}
// Source-atop. Each color channel is
//   SkAlphaMulAlpha(da, src_c) + SkAlphaMulAlpha(255 - sa, dst_c)
// and the result alpha is the destination alpha, unchanged.
static __m128i srcatop_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    const __m128i srcA = SkGetPackedA32_SSE2(src);
    const __m128i dstA = SkGetPackedA32_SSE2(dst);
    const __m128i invSrcA = _mm_sub_epi32(_mm_set1_epi32(255), srcA);

    const __m128i red = _mm_add_epi32(
        SkAlphaMulAlpha_SSE2(dstA, SkGetPackedR32_SSE2(src)),
        SkAlphaMulAlpha_SSE2(invSrcA, SkGetPackedR32_SSE2(dst)));

    const __m128i green = _mm_add_epi32(
        SkAlphaMulAlpha_SSE2(dstA, SkGetPackedG32_SSE2(src)),
        SkAlphaMulAlpha_SSE2(invSrcA, SkGetPackedG32_SSE2(dst)));

    const __m128i blue = _mm_add_epi32(
        SkAlphaMulAlpha_SSE2(dstA, SkGetPackedB32_SSE2(src)),
        SkAlphaMulAlpha_SSE2(invSrcA, SkGetPackedB32_SSE2(dst)));

    return SkPackARGB32_SSE2(dstA, red, green, blue);
}
// Destination-atop. Each color channel is
//   SkAlphaMulAlpha(255 - da, src_c) + SkAlphaMulAlpha(sa, dst_c)
// and the result alpha is the source alpha, unchanged.
static __m128i dstatop_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    const __m128i srcA = SkGetPackedA32_SSE2(src);
    const __m128i dstA = SkGetPackedA32_SSE2(dst);
    const __m128i invDstA = _mm_sub_epi32(_mm_set1_epi32(255), dstA);

    const __m128i red = _mm_add_epi32(
        SkAlphaMulAlpha_SSE2(invDstA, SkGetPackedR32_SSE2(src)),
        SkAlphaMulAlpha_SSE2(srcA, SkGetPackedR32_SSE2(dst)));

    const __m128i green = _mm_add_epi32(
        SkAlphaMulAlpha_SSE2(invDstA, SkGetPackedG32_SSE2(src)),
        SkAlphaMulAlpha_SSE2(srcA, SkGetPackedG32_SSE2(dst)));

    const __m128i blue = _mm_add_epi32(
        SkAlphaMulAlpha_SSE2(invDstA, SkGetPackedB32_SSE2(src)),
        SkAlphaMulAlpha_SSE2(srcA, SkGetPackedB32_SSE2(dst)));

    return SkPackARGB32_SSE2(srcA, red, green, blue);
}
// Xor. Result alpha is sa + da - 2 * SkAlphaMulAlpha(sa, da); each color
// channel is
//   SkAlphaMulAlpha(255 - da, src_c) + SkAlphaMulAlpha(255 - sa, dst_c).
static __m128i xor_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    const __m128i full = _mm_set1_epi32(255);
    const __m128i srcA = SkGetPackedA32_SSE2(src);
    const __m128i dstA = SkGetPackedA32_SSE2(dst);
    const __m128i invSrcA = _mm_sub_epi32(full, srcA);
    const __m128i invDstA = _mm_sub_epi32(full, dstA);

    // Doubling is done with a left shift by one bit.
    const __m128i twiceProduct =
        _mm_slli_epi32(SkAlphaMulAlpha_SSE2(srcA, dstA), 1);
    const __m128i alpha =
        _mm_sub_epi32(_mm_add_epi32(srcA, dstA), twiceProduct);

    const __m128i red = _mm_add_epi32(
        SkAlphaMulAlpha_SSE2(invDstA, SkGetPackedR32_SSE2(src)),
        SkAlphaMulAlpha_SSE2(invSrcA, SkGetPackedR32_SSE2(dst)));

    const __m128i green = _mm_add_epi32(
        SkAlphaMulAlpha_SSE2(invDstA, SkGetPackedG32_SSE2(src)),
        SkAlphaMulAlpha_SSE2(invSrcA, SkGetPackedG32_SSE2(dst)));

    const __m128i blue = _mm_add_epi32(
        SkAlphaMulAlpha_SSE2(invDstA, SkGetPackedB32_SSE2(src)),
        SkAlphaMulAlpha_SSE2(invSrcA, SkGetPackedB32_SSE2(dst)));

    return SkPackARGB32_SSE2(alpha, red, green, blue);
}
// Plus: every channel (alpha, red, green, blue) is the sum of the src and
// dst channel as computed by saturated_add_SSE2.
static __m128i plus_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    const __m128i alpha = saturated_add_SSE2(SkGetPackedA32_SSE2(src),
                                             SkGetPackedA32_SSE2(dst));
    const __m128i red = saturated_add_SSE2(SkGetPackedR32_SSE2(src),
                                           SkGetPackedR32_SSE2(dst));
    const __m128i green = saturated_add_SSE2(SkGetPackedG32_SSE2(src),
                                             SkGetPackedG32_SSE2(dst));
    const __m128i blue = saturated_add_SSE2(SkGetPackedB32_SSE2(src),
                                            SkGetPackedB32_SSE2(dst));
    return SkPackARGB32_SSE2(alpha, red, green, blue);
}
// Modulate: every channel (alpha, red, green, blue) is
// SkAlphaMulAlpha_SSE2(src_channel, dst_channel).
static __m128i modulate_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    const __m128i srcA = SkGetPackedA32_SSE2(src);
    const __m128i dstA = SkGetPackedA32_SSE2(dst);
    const __m128i alpha = SkAlphaMulAlpha_SSE2(srcA, dstA);

    const __m128i srcR = SkGetPackedR32_SSE2(src);
    const __m128i dstR = SkGetPackedR32_SSE2(dst);
    const __m128i red = SkAlphaMulAlpha_SSE2(srcR, dstR);

    const __m128i srcG = SkGetPackedG32_SSE2(src);
    const __m128i dstG = SkGetPackedG32_SSE2(dst);
    const __m128i green = SkAlphaMulAlpha_SSE2(srcG, dstG);

    const __m128i srcB = SkGetPackedB32_SSE2(src);
    const __m128i dstB = SkGetPackedB32_SSE2(dst);
    const __m128i blue = SkAlphaMulAlpha_SSE2(srcB, dstB);

    return SkPackARGB32_SSE2(alpha, red, green, blue);
}
static inline __m128i srcover_byte_SSE2(const __m128i& a, const __m128i& b) {
// a + b - SkAlphaMulAlpha(a, b);
return _mm_sub_epi32(_mm_add_epi32(a, b), SkAlphaMulAlpha_SSE2(a, b));
@ -84,6 +217,18 @@ static __m128i multiply_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
return SkPackARGB32_SSE2(a, r, g, b);
}
// Screen: every channel (alpha, red, green, blue) is
// srcover_byte_SSE2(src_channel, dst_channel).
static __m128i screen_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    const __m128i srcA = SkGetPackedA32_SSE2(src);
    const __m128i dstA = SkGetPackedA32_SSE2(dst);
    const __m128i alpha = srcover_byte_SSE2(srcA, dstA);

    const __m128i srcR = SkGetPackedR32_SSE2(src);
    const __m128i dstR = SkGetPackedR32_SSE2(dst);
    const __m128i red = srcover_byte_SSE2(srcR, dstR);

    const __m128i srcG = SkGetPackedG32_SSE2(src);
    const __m128i dstG = SkGetPackedG32_SSE2(dst);
    const __m128i green = srcover_byte_SSE2(srcG, dstG);

    const __m128i srcB = SkGetPackedB32_SSE2(src);
    const __m128i dstB = SkGetPackedB32_SSE2(dst);
    const __m128i blue = srcover_byte_SSE2(srcB, dstB);

    return SkPackARGB32_SSE2(alpha, red, green, blue);
}
////////////////////////////////////////////////////////////////////////////////
// Pointer to a SIMD transfer-mode procedure: takes the src and dst vectors by
// const reference and returns the blended __m128i. The *_modeproc_SSE2
// functions in this file have this signature.
typedef __m128i (*SkXfermodeProcSIMD)(const __m128i& src, const __m128i& dst);
@ -226,18 +371,18 @@ SkXfermodeProcSIMD gSSE2XfermodeProcs[] = {
NULL, // kClear_Mode
NULL, // kSrc_Mode
NULL, // kDst_Mode
NULL, // kSrcOver_Mode
NULL, // kDstOver_Mode
NULL, // kSrcIn_Mode
NULL, // kDstIn_Mode
NULL, // kSrcOut_Mode
NULL, // kDstOut_Mode
NULL, // kSrcATop_Mode
NULL, // kDstATop_Mode
NULL, // kXor_Mode
NULL, // kPlus_Mode
NULL, // kModulate_Mode
NULL, // kScreen_Mode
srcover_modeproc_SSE2,
dstover_modeproc_SSE2,
srcin_modeproc_SSE2,
dstin_modeproc_SSE2,
srcout_modeproc_SSE2,
dstout_modeproc_SSE2,
srcatop_modeproc_SSE2,
dstatop_modeproc_SSE2,
xor_modeproc_SSE2,
plus_modeproc_SSE2,
modulate_modeproc_SSE2,
screen_modeproc_SSE2,
NULL, // kOverlay_Mode
NULL, // kDarken_Mode