move LCD blits to SkBlitter_ARGB32.cpp
They're only specialized up to SSE2 or NEON, both of which are typical baseline builds now. Cq-Include-Trybots: master.tryserver.blink:linux_trusty_blink_rel Change-Id: If2b2bbd5b002038c68c0064ee78d75911a33b988 Reviewed-on: https://skia-review.googlesource.com/c/170064 Reviewed-by: Mike Klein <mtklein@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
parent
879dab87ab
commit
32828eb006
@ -44,8 +44,6 @@ skia_core_sources = [
|
||||
"$_src/core/SkBitmapProvider.h",
|
||||
"$_src/core/SkBlendMode.cpp",
|
||||
"$_src/core/SkBlitBWMaskTemplate.h",
|
||||
"$_src/core/SkBlitMask.h",
|
||||
"$_src/core/SkBlitMask_D32.cpp",
|
||||
"$_src/core/SkBlitRow.h",
|
||||
"$_src/core/SkBlitRow_D32.cpp",
|
||||
"$_src/core/SkBlitter.h",
|
||||
|
@ -9,13 +9,11 @@ _src = get_path_info("../src", "abspath")
|
||||
none = [
|
||||
"$_src/opts/Sk4px_none.h",
|
||||
"$_src/opts/SkBitmapProcState_opts_none.cpp",
|
||||
"$_src/opts/SkBlitMask_opts_none.cpp",
|
||||
"$_src/opts/SkBlitRow_opts_none.cpp",
|
||||
]
|
||||
|
||||
armv7 = [
|
||||
"$_src/opts/SkBitmapProcState_opts_none.cpp",
|
||||
"$_src/opts/SkBlitMask_opts_arm.cpp",
|
||||
"$_src/opts/SkBlitRow_opts_arm.cpp",
|
||||
]
|
||||
|
||||
@ -25,8 +23,6 @@ neon = [
|
||||
"$_src/opts/SkBitmapProcState_filter_neon.h",
|
||||
"$_src/opts/SkBitmapProcState_matrixProcs_neon.cpp",
|
||||
"$_src/opts/SkBitmapProcState_matrix_neon.h",
|
||||
"$_src/opts/SkBlitMask_opts_arm_neon.cpp",
|
||||
"$_src/opts/SkBlitMask_opts_arm_neon.h",
|
||||
"$_src/opts/SkBlitRow_opts_arm_neon.h",
|
||||
"$_src/opts/SkBlitRow_opts_arm_neon.cpp",
|
||||
"$_src/opts/SkColor_opts_neon.h",
|
||||
@ -39,9 +35,6 @@ arm64 = [
|
||||
"$_src/opts/SkBitmapProcState_matrixProcs_neon.cpp",
|
||||
"$_src/opts/SkBitmapProcState_matrix_neon.h",
|
||||
"$_src/opts/SkBitmapProcState_opts_none.cpp",
|
||||
"$_src/opts/SkBlitMask_opts_arm.cpp",
|
||||
"$_src/opts/SkBlitMask_opts_arm_neon.cpp",
|
||||
"$_src/opts/SkBlitMask_opts_arm_neon.h",
|
||||
"$_src/opts/SkBlitRow_opts_arm.cpp",
|
||||
"$_src/opts/SkBlitRow_opts_arm_neon.h",
|
||||
"$_src/opts/SkBlitRow_opts_arm_neon.cpp",
|
||||
|
@ -399,119 +399,6 @@ static inline SkPMColor SkPixel4444ToPixel32(U16CPU c) {
|
||||
return d | (d << 4);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Stretches a 5-bit value (asserted to be 0..31) onto 0..32 by adding value >> 4,
// so 0..15 stay as they are and 16..31 become 17..32.
static inline int SkUpscale31To32(int value) {
    SkASSERT((unsigned)value <= 31);
    const int carry = value >> 4;
    return value + carry;
}
|
||||
|
||||
// Linear interpolation between dst and src with a weight of scale/32.
//   src, dst - asserted to be 0..255
//   scale    - asserted to be 0..32 (0 yields dst, 32 yields src)
static inline int SkBlend32(int src, int dst, int scale) {
    SkASSERT((unsigned)src <= 0xFF);
    SkASSERT((unsigned)dst <= 0xFF);
    SkASSERT((unsigned)scale <= 32);

    const int weightedDelta = (src - dst) * scale;
    return dst + (weightedDelta >> 5);
}
|
||||
|
||||
// Blends a source colour into one destination pixel, weighting each colour channel by
// the matching channel of a packed 16-bit LCD coverage mask and by the source alpha.
//   srcA             - source alpha; the caller is expected to pass it already mapped to 0..256
//   srcR, srcG, srcB - source colour channels
//   dst              - destination pixel
//   mask             - packed 16-bit per-channel coverage
// Returns dst unchanged when mask is 0; otherwise the blended pixel with alpha 0xFF.
static inline SkPMColor SkBlendLCD16(int srcA, int srcR, int srcG, int srcB,
                                     SkPMColor dst, uint16_t mask) {
    if (0 == mask) {
        return dst;
    }

    // Keep the top 5 bits of every mask channel (a channel such as green may be stored
    // with 6 bits), then stretch 0..31 to 0..32 so that SkBlend32 can divide by shifting.
    int covR = SkUpscale31To32(SkGetPackedR16(mask) >> (SK_R16_BITS - 5));
    int covG = SkUpscale31To32(SkGetPackedG16(mask) >> (SK_G16_BITS - 5));
    int covB = SkUpscale31To32(SkGetPackedB16(mask) >> (SK_B16_BITS - 5));

    // Fold the source alpha (0..256) into each coverage value.
    covR = (covR * srcA) >> 8;
    covG = (covG * srcA) >> 8;
    covB = (covB * srcA) >> 8;

    // LCD blitting is only supported when the destination is known/required to be
    // opaque, hence the constant 0xFF alpha.
    return SkPackARGB32(0xFF,
                        SkBlend32(srcR, SkGetPackedR32(dst), covR),
                        SkBlend32(srcG, SkGetPackedG32(dst), covG),
                        SkBlend32(srcB, SkGetPackedB32(dst), covB));
}
|
||||
|
||||
// Variant of the LCD16 blend for a fully opaque source: coverage is not scaled by alpha.
//   srcR, srcG, srcB - source colour channels
//   dst              - destination pixel
//   mask             - packed 16-bit per-channel coverage
//   opaqueDst        - pixel returned verbatim when mask is 0xFFFF
// Returns dst for mask 0, opaqueDst for mask 0xFFFF, otherwise the blended pixel with
// alpha 0xFF.
static inline SkPMColor SkBlendLCD16Opaque(int srcR, int srcG, int srcB,
                                           SkPMColor dst, uint16_t mask,
                                           SkPMColor opaqueDst) {
    switch (mask) {
        case 0:
            return dst;
        case 0xFFFF:
            return opaqueDst;
        default:
            break;
    }

    // Reduce each mask channel to its top 5 bits (green may carry 6), then widen
    // 0..31 to 0..32 so the blend can use a shift by 5.
    const int covR = SkUpscale31To32(SkGetPackedR16(mask) >> (SK_R16_BITS - 5));
    const int covG = SkUpscale31To32(SkGetPackedG16(mask) >> (SK_G16_BITS - 5));
    const int covB = SkUpscale31To32(SkGetPackedB16(mask) >> (SK_B16_BITS - 5));

    const int oldR = SkGetPackedR32(dst);
    const int oldG = SkGetPackedG32(dst);
    const int oldB = SkGetPackedB32(dst);

    // LCD blitting is only supported when the destination is known/required to be opaque.
    return SkPackARGB32(0xFF,
                        SkBlend32(srcR, oldR, covR),
                        SkBlend32(srcG, oldG, covG),
                        SkBlend32(srcB, oldB, covB));
}
|
||||
|
||||
// Blends `src` into `width` destination pixels, one LCD16 mask value per pixel.
// The trailing SkPMColor parameter is unused; it keeps the signature identical to the
// opaque row routine.
static inline void SkBlitLCD16Row(SkPMColor dst[], const uint16_t mask[],
                                  SkColor src, int width, SkPMColor) {
    const int red   = SkColorGetR(src);
    const int green = SkColorGetG(src);
    const int blue  = SkColorGetB(src);
    // SkBlendLCD16 expects the alpha pre-scaled by SkAlpha255To256.
    const int alpha256 = SkAlpha255To256(SkColorGetA(src));

    for (int x = 0; x < width; ++x) {
        dst[x] = SkBlendLCD16(alpha256, red, green, blue, dst[x], mask[x]);
    }
}
|
||||
|
||||
// Blends an opaque `src` into `width` destination pixels, one LCD16 mask value per pixel.
// `opaqueDst` is forwarded to SkBlendLCD16Opaque, which returns it for masks of 0xFFFF.
static inline void SkBlitLCD16OpaqueRow(SkPMColor dst[], const uint16_t mask[],
                                        SkColor src, int width,
                                        SkPMColor opaqueDst) {
    const int red   = SkColorGetR(src);
    const int green = SkColorGetG(src);
    const int blue  = SkColorGetB(src);

    for (int x = 0; x < width; ++x) {
        dst[x] = SkBlendLCD16Opaque(red, green, blue, dst[x], mask[x], opaqueDst);
    }
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Returns `x` with its lanes rearranged by the index pattern <2, 1, 0, 3>
// (presumably exchanging the red and blue lanes, going by the name - confirm against
// SkNx_shuffle).
static inline Sk4f swizzle_rb(const Sk4f& x) {
    const Sk4f shuffled = SkNx_shuffle<2, 1, 0, 3>(x);
    return shuffled;
}
|
||||
|
@ -1,45 +0,0 @@
|
||||
/*
|
||||
* Copyright 2011 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#ifndef SkBlitMask_DEFINED
|
||||
#define SkBlitMask_DEFINED
|
||||
|
||||
#include "SkColor.h"
|
||||
#include "SkMask.h"
|
||||
#include "SkPixmap.h"
|
||||
|
||||
class SkBlitMask {
|
||||
public:
|
||||
/**
|
||||
* Returns true if the device config and mask format were supported.
|
||||
* else return false (nothing was drawn)
|
||||
*/
|
||||
static bool BlitColor(const SkPixmap& device, const SkMask& mask,
|
||||
const SkIRect& clip, SkColor color);
|
||||
|
||||
/**
|
||||
* Function pointer that blits a row of mask(lcd16) into a row of dst
|
||||
* colorized by a single color. The number of pixels to blit is specified
|
||||
* by width.
|
||||
*/
|
||||
typedef void (*BlitLCD16RowProc)(SkPMColor dst[], const uint16_t src[],
|
||||
SkColor color, int width,
|
||||
SkPMColor opaqueDst);
|
||||
|
||||
/**
|
||||
* Public entry-point to return a blitcolor BlitLCD16RowProc.
|
||||
*/
|
||||
static BlitLCD16RowProc BlitLCD16RowFactory(bool isOpaque);
|
||||
|
||||
/**
|
||||
* Return either platform specific optimized blitcolor BlitLCD16RowProc,
|
||||
* or nullptr if no optimized routine is available.
|
||||
*/
|
||||
static BlitLCD16RowProc PlatformBlitRowProcs16(bool isOpaque);
|
||||
};
|
||||
|
||||
#endif
|
@ -1,73 +0,0 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#include "SkBlitMask.h"
|
||||
#include "SkColor.h"
|
||||
#include "SkOpts.h"
|
||||
|
||||
// Chooses the row routine for LCD16 blits: a platform-specific routine when
// PlatformBlitRowProcs16 supplies one, otherwise the portable routine matching isOpaque.
SkBlitMask::BlitLCD16RowProc SkBlitMask::BlitLCD16RowFactory(bool isOpaque) {
    if (BlitLCD16RowProc platformProc = PlatformBlitRowProcs16(isOpaque)) {
        return platformProc;
    }
    return isOpaque ? SkBlitLCD16OpaqueRow : SkBlitLCD16Row;
}
|
||||
|
||||
static void D32_LCD16_Proc(void* dst, size_t dstRB,
|
||||
const void* mask, size_t maskRB,
|
||||
SkColor color, int width, int height) {
|
||||
|
||||
SkPMColor* dstRow = (SkPMColor*)dst;
|
||||
const uint16_t* srcRow = (const uint16_t*)mask;
|
||||
SkPMColor opaqueDst;
|
||||
|
||||
SkBlitMask::BlitLCD16RowProc proc = nullptr;
|
||||
bool isOpaque = (0xFF == SkColorGetA(color));
|
||||
proc = SkBlitMask::BlitLCD16RowFactory(isOpaque);
|
||||
SkASSERT(proc != nullptr);
|
||||
|
||||
if (isOpaque) {
|
||||
opaqueDst = SkPreMultiplyColor(color);
|
||||
} else {
|
||||
opaqueDst = 0; // ignored
|
||||
}
|
||||
|
||||
do {
|
||||
proc(dstRow, srcRow, color, width, opaqueDst);
|
||||
dstRow = (SkPMColor*)((char*)dstRow + dstRB);
|
||||
srcRow = (const uint16_t*)((const char*)srcRow + maskRB);
|
||||
} while (--height != 0);
|
||||
}
|
||||
|
||||
bool SkBlitMask::BlitColor(const SkPixmap& device,
|
||||
const SkMask& mask,
|
||||
const SkIRect& clip,
|
||||
SkColor color) {
|
||||
int x = clip.fLeft,
|
||||
y = clip.fTop;
|
||||
|
||||
if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kA8_Format) {
|
||||
SkOpts::blit_mask_d32_a8(device.writable_addr32(x,y), device.rowBytes(),
|
||||
(const SkAlpha*)mask.getAddr(x,y), mask.fRowBytes,
|
||||
color, clip.width(), clip.height());
|
||||
return true;
|
||||
}
|
||||
|
||||
if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kLCD16_Format) {
|
||||
D32_LCD16_Proc(device.writable_addr32(x,y), device.rowBytes(),
|
||||
mask.getAddr(x,y), mask.fRowBytes,
|
||||
color, clip.width(), clip.height());
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
@ -6,7 +6,6 @@
|
||||
*/
|
||||
|
||||
#include "Sk4px.h"
|
||||
#include "SkBlitMask.h"
|
||||
#include "SkBlitRow.h"
|
||||
#include "SkColorData.h"
|
||||
#include "SkOpts.h"
|
||||
|
@ -6,13 +6,635 @@
|
||||
*/
|
||||
|
||||
#include "Sk4px.h"
|
||||
#include "SkBlitMask.h"
|
||||
#include "SkColorData.h"
|
||||
#include "SkCoreBlitters.h"
|
||||
#include "SkShader.h"
|
||||
#include "SkUtils.h"
|
||||
#include "SkXfermodePriv.h"
|
||||
|
||||
// Maps a 5-bit value (asserted 0..31) onto 0..32: 0..15 are unchanged and
// 16..31 become 17..32, because value >> 4 is added back.
static inline int upscale_31_to_32(int value) {
    SkASSERT((unsigned)value <= 31);
    const int highBit = value >> 4;
    return highBit + value;
}
|
||||
|
||||
// Interpolates from dst towards src by scale/32.
//   src, dst - asserted to lie in 0..255
//   scale    - asserted to lie in 0..32
static inline int blend_32(int src, int dst, int scale) {
    SkASSERT((unsigned)src <= 0xFF);
    SkASSERT((unsigned)dst <= 0xFF);
    SkASSERT((unsigned)scale <= 32);

    const int step = ((src - dst) * scale) >> 5;
    return dst + step;
}
|
||||
|
||||
// Scalar LCD16 blend of one pixel for a source that may be translucent.
//   srcA             - source alpha, expected to be pre-scaled to 0..256 by the caller
//   srcR, srcG, srcB - source colour channels
//   dst              - destination pixel
//   mask             - packed 16-bit per-channel coverage
// A mask of 0 returns dst untouched; any other mask yields a pixel whose alpha is 0xFF.
static inline SkPMColor blend_lcd16(int srcA, int srcR, int srcG, int srcB,
                                    SkPMColor dst, uint16_t mask) {
    if (!mask) {
        return dst;
    }

    // Top 5 bits of each channel (green may be stored with 6 bits) ...
    const int r5 = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
    const int g5 = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
    const int b5 = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);

    // ... widened to 0..32 for blend_32 and then scaled by the 0..256 source alpha.
    const int scaleR = (upscale_31_to_32(r5) * srcA) >> 8;
    const int scaleG = (upscale_31_to_32(g5) * srcA) >> 8;
    const int scaleB = (upscale_31_to_32(b5) * srcA) >> 8;

    const int outR = blend_32(srcR, SkGetPackedR32(dst), scaleR);
    const int outG = blend_32(srcG, SkGetPackedG32(dst), scaleG);
    const int outB = blend_32(srcB, SkGetPackedB32(dst), scaleB);

    // LCD blitting is only supported when the destination is known/required to be opaque.
    return SkPackARGB32(0xFF, outR, outG, outB);
}
|
||||
|
||||
// Scalar LCD16 blend of one pixel for a fully opaque source.
//   srcR, srcG, srcB - source colour channels
//   dst              - destination pixel
//   mask             - packed 16-bit per-channel coverage
//   opaqueDst        - result used directly when mask is 0xFFFF
// Returns dst for mask 0, opaqueDst for mask 0xFFFF, else the blended pixel with
// alpha 0xFF.
static inline SkPMColor blend_lcd16_opaque(int srcR, int srcG, int srcB,
                                           SkPMColor dst, uint16_t mask,
                                           SkPMColor opaqueDst) {
    if (!mask) {
        return dst;
    }
    if (mask == 0xFFFF) {
        return opaqueDst;
    }

    // Top 5 bits of each channel (green may be stored with 6 bits), widened to 0..32
    // so that blend_32 can divide with a shift.
    const int scaleR = upscale_31_to_32(SkGetPackedR16(mask) >> (SK_R16_BITS - 5));
    const int scaleG = upscale_31_to_32(SkGetPackedG16(mask) >> (SK_G16_BITS - 5));
    const int scaleB = upscale_31_to_32(SkGetPackedB16(mask) >> (SK_B16_BITS - 5));

    const int outR = blend_32(srcR, SkGetPackedR32(dst), scaleR);
    const int outG = blend_32(srcG, SkGetPackedG32(dst), scaleG);
    const int outB = blend_32(srcB, SkGetPackedB32(dst), scaleB);

    // LCD blitting is only supported when the destination is known/required to be opaque.
    return SkPackARGB32(0xFF, outR, outG, outB);
}
|
||||
|
||||
|
||||
// TODO: rewrite at least the SSE code here. It's miserable.
|
||||
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
|
||||
#include <emmintrin.h>
|
||||
|
||||
// Bit distance that moves the top 5 bits of a packed 16-bit mask channel onto the
// position of the same channel in an SkPMColor. A positive value is a left shift and a
// negative value a right shift; the channel order of the 16-bit mask may differ from
// the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// The macros below apply that shift to every 32-bit lane of an __m128i. They do not
// mask off neighbouring bits ("Unmasked"); callers AND the result themselves.

#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
|
||||
|
||||
static __m128i blend_lcd16_sse2(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) {
|
||||
// In the following comments, the components of src, dst and mask are
|
||||
// abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
|
||||
// by an R, G, B, or A suffix. Components of one of the four pixels that
|
||||
// are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
|
||||
// example is the blue channel of the second destination pixel. Memory
|
||||
// layout is shown for an ARGB byte order in a color value.
|
||||
|
||||
// src and srcA store 8-bit values interleaved with zeros.
|
||||
// src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
|
||||
// srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
|
||||
// srcA, 0, srcA, 0, srcA, 0, srcA, 0)
|
||||
// mask stores 16-bit values (compressed three channels) interleaved with zeros.
|
||||
// Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
|
||||
// mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
|
||||
// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
|
||||
|
||||
// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
|
||||
// r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
|
||||
__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
|
||||
_mm_set1_epi32(0x1F << SK_R32_SHIFT));
|
||||
|
||||
// g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
|
||||
__m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
|
||||
_mm_set1_epi32(0x1F << SK_G32_SHIFT));
|
||||
|
||||
// b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
|
||||
__m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
|
||||
_mm_set1_epi32(0x1F << SK_B32_SHIFT));
|
||||
|
||||
// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
|
||||
// Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
|
||||
// 8-bit position
|
||||
// mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
|
||||
// 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
|
||||
mask = _mm_or_si128(_mm_or_si128(r, g), b);
|
||||
|
||||
// Interleave R,G,B into the lower byte of word.
|
||||
// i.e. split the sixteen 8-bit values from mask into two sets of eight
|
||||
// 16-bit values, padded by zero.
|
||||
__m128i maskLo, maskHi;
|
||||
// maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
|
||||
maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
|
||||
// maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
|
||||
maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
|
||||
|
||||
// Upscale from 0..31 to 0..32
|
||||
// (allows to replace division by left-shift further down)
|
||||
// Left-shift each component by 4 and add the result back to that component,
|
||||
// mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
|
||||
maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
|
||||
maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
|
||||
|
||||
// Multiply each component of maskLo and maskHi by srcA
|
||||
maskLo = _mm_mullo_epi16(maskLo, srcA);
|
||||
maskHi = _mm_mullo_epi16(maskHi, srcA);
|
||||
|
||||
// Left shift mask components by 8 (divide by 256)
|
||||
maskLo = _mm_srli_epi16(maskLo, 8);
|
||||
maskHi = _mm_srli_epi16(maskHi, 8);
|
||||
|
||||
// Interleave R,G,B into the lower byte of the word
|
||||
// dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
|
||||
__m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
|
||||
// dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
|
||||
__m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
|
||||
|
||||
// mask = (src - dst) * mask
|
||||
maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
|
||||
maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
|
||||
|
||||
// mask = (src - dst) * mask >> 5
|
||||
maskLo = _mm_srai_epi16(maskLo, 5);
|
||||
maskHi = _mm_srai_epi16(maskHi, 5);
|
||||
|
||||
// Add two pixels into result.
|
||||
// result = dst + ((src - dst) * mask >> 5)
|
||||
__m128i resultLo = _mm_add_epi16(dstLo, maskLo);
|
||||
__m128i resultHi = _mm_add_epi16(dstHi, maskHi);
|
||||
|
||||
// Pack into 4 32bit dst pixels.
|
||||
// resultLo and resultHi contain eight 16-bit components (two pixels) each.
|
||||
// Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
|
||||
// clamping to 255 if necessary.
|
||||
return _mm_packus_epi16(resultLo, resultHi);
|
||||
}
|
||||
|
||||
static __m128i blend_lcd16_opaque_sse2(__m128i &src, __m128i &dst, __m128i &mask) {
|
||||
// In the following comments, the components of src, dst and mask are
|
||||
// abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
|
||||
// by an R, G, B, or A suffix. Components of one of the four pixels that
|
||||
// are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
|
||||
// example is the blue channel of the second destination pixel. Memory
|
||||
// layout is shown for an ARGB byte order in a color value.
|
||||
|
||||
// src and srcA store 8-bit values interleaved with zeros.
|
||||
// src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
|
||||
// mask stores 16-bit values (shown as high and low bytes) interleaved with
|
||||
// zeros
|
||||
// mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
|
||||
// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
|
||||
|
||||
// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
|
||||
// r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
|
||||
__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
|
||||
_mm_set1_epi32(0x1F << SK_R32_SHIFT));
|
||||
|
||||
// g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
|
||||
__m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
|
||||
_mm_set1_epi32(0x1F << SK_G32_SHIFT));
|
||||
|
||||
// b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
|
||||
__m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
|
||||
_mm_set1_epi32(0x1F << SK_B32_SHIFT));
|
||||
|
||||
// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
|
||||
// Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
|
||||
// 8-bit position
|
||||
// mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
|
||||
// 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
|
||||
mask = _mm_or_si128(_mm_or_si128(r, g), b);
|
||||
|
||||
// Interleave R,G,B into the lower byte of word.
|
||||
// i.e. split the sixteen 8-bit values from mask into two sets of eight
|
||||
// 16-bit values, padded by zero.
|
||||
__m128i maskLo, maskHi;
|
||||
// maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
|
||||
maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
|
||||
// maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
|
||||
maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
|
||||
|
||||
// Upscale from 0..31 to 0..32
|
||||
// (allows to replace division by left-shift further down)
|
||||
// Left-shift each component by 4 and add the result back to that component,
|
||||
// mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
|
||||
maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
|
||||
maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
|
||||
|
||||
// Interleave R,G,B into the lower byte of the word
|
||||
// dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
|
||||
__m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
|
||||
// dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
|
||||
__m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
|
||||
|
||||
// mask = (src - dst) * mask
|
||||
maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
|
||||
maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
|
||||
|
||||
// mask = (src - dst) * mask >> 5
|
||||
maskLo = _mm_srai_epi16(maskLo, 5);
|
||||
maskHi = _mm_srai_epi16(maskHi, 5);
|
||||
|
||||
// Add two pixels into result.
|
||||
// result = dst + ((src - dst) * mask >> 5)
|
||||
__m128i resultLo = _mm_add_epi16(dstLo, maskLo);
|
||||
__m128i resultHi = _mm_add_epi16(dstHi, maskHi);
|
||||
|
||||
// Pack into 4 32bit dst pixels and force opaque.
|
||||
// resultLo and resultHi contain eight 16-bit components (two pixels) each.
|
||||
// Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
|
||||
// clamping to 255 if necessary. Set alpha components to 0xFF.
|
||||
return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
|
||||
_mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
|
||||
}
|
||||
|
||||
// SSE2 row blit for LCD16 masks with a source colour that may be translucent.
//   dst   - destination pixels, blended in place (asserted to be 4-byte aligned)
//   mask  - one packed 16-bit coverage value per pixel
//   src   - colour to blend in
//   width - number of pixels; nothing happens when it is <= 0
// The trailing SkPMColor parameter is unused; it keeps the signature identical to the
// opaque variant.
//
// Declared static inline, like the portable fallback of the same name, so that this
// file-local helper does not export a generic symbol from the translation unit.
static inline void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width,
                                  SkPMColor) {
    if (width <= 0) {
        return;
    }

    int srcA = SkColorGetA(src);
    int srcR = SkColorGetR(src);
    int srcG = SkColorGetG(src);
    int srcB = SkColorGetB(src);

    // blend_lcd16 and the vector path both expect alpha in 0..256.
    srcA = SkAlpha255To256(srcA);

    if (width >= 4) {
        SkASSERT(((size_t)dst & 0x03) == 0);

        // Scalar prologue: advance one pixel at a time until dst is 16-byte aligned,
        // as required by the aligned loads/stores below.
        while (((size_t)dst & 0x0F) != 0) {
            *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
            mask++;
            dst++;
            width--;
        }

        __m128i *d = reinterpret_cast<__m128i*>(dst);

        // Source colour with alpha forced to 0xFF, replicated into all four 32-bit lanes ...
        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
        // ... then interleaved with zeros so each channel occupies a 16-bit lane
        // (the colour for two pixels).
        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
        // Eight 16-bit copies of the 0..256 source alpha.
        __m128i srcA_sse = _mm_set1_epi16(srcA);

        while (width >= 4) {
            // Four destination pixels (aligned load).
            __m128i dst_sse = _mm_load_si128(d);
            // Four 16-bit masks in the low 64 bits; the high 64 bits are zero.
            __m128i mask_sse = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(mask));

            // Compare every 16-bit lane with zero and collect the top bit of each
            // byte: the result is 0xFFFF exactly when all four masks are zero.
            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, _mm_setzero_si128()));

            // Leave the four pixels untouched when there is no coverage at all.
            if (pack_cmp != 0xFFFF) {
                // Zero-extend each 16-bit mask to its own 32-bit lane.
                mask_sse = _mm_unpacklo_epi16(mask_sse, _mm_setzero_si128());

                __m128i result = blend_lcd16_sse2(src_sse, dst_sse, mask_sse, srcA_sse);
                _mm_store_si128(d, result);
            }

            d++;
            mask += 4;
            width -= 4;
        }

        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar epilogue for the remaining pixels (the whole row when width < 4).
    while (width > 0) {
        *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
        mask++;
        dst++;
        width--;
    }
}
|
||||
|
||||
// SSE2 row blit for LCD16 masks with a fully opaque source colour.
//   dst       - destination pixels, blended in place (asserted to be 4-byte aligned)
//   mask      - one packed 16-bit coverage value per pixel
//   src       - colour to blend in
//   width     - number of pixels; nothing happens when it is <= 0
//   opaqueDst - passed to blend_lcd16_opaque, which returns it for masks of 0xFFFF in
//               the scalar prologue/epilogue
//
// Declared static inline, like the portable fallback of the same name, so that this
// file-local helper does not export a generic symbol from the translation unit.
static inline void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
                                         SkColor src, int width, SkPMColor opaqueDst) {
    if (width <= 0) {
        return;
    }

    int srcR = SkColorGetR(src);
    int srcG = SkColorGetG(src);
    int srcB = SkColorGetB(src);

    if (width >= 4) {
        SkASSERT(((size_t)dst & 0x03) == 0);

        // Scalar prologue: advance one pixel at a time until dst is 16-byte aligned,
        // as required by the aligned loads/stores below.
        while (((size_t)dst & 0x0F) != 0) {
            *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
            mask++;
            dst++;
            width--;
        }

        __m128i *d = reinterpret_cast<__m128i*>(dst);

        // Source colour with alpha forced to 0xFF, replicated into all four 32-bit lanes ...
        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
        // ... then interleaved with zeros so each channel occupies a 16-bit lane
        // (the colour for two pixels).
        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

        while (width >= 4) {
            // Four destination pixels (aligned load).
            __m128i dst_sse = _mm_load_si128(d);
            // Four 16-bit masks in the low 64 bits; the high 64 bits are zero.
            __m128i mask_sse = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(mask));

            // Compare every 16-bit lane with zero and collect the top bit of each
            // byte: the result is 0xFFFF exactly when all four masks are zero.
            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, _mm_setzero_si128()));

            // Leave the four pixels untouched when there is no coverage at all.
            if (pack_cmp != 0xFFFF) {
                // Zero-extend each 16-bit mask to its own 32-bit lane.
                mask_sse = _mm_unpacklo_epi16(mask_sse, _mm_setzero_si128());

                __m128i result = blend_lcd16_opaque_sse2(src_sse, dst_sse, mask_sse);
                _mm_store_si128(d, result);
            }

            d++;
            mask += 4;
            width -= 4;
        }

        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar epilogue for the remaining pixels (the whole row when width < 4).
    while (width > 0) {
        *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
        mask++;
        dst++;
        width--;
    }
}
|
||||
|
||||
#elif defined(SK_ARM_HAS_NEON)
|
||||
#include "SkColor_opts_neon.h"
|
||||
|
||||
// Eight-lane version of blend_32: dst + (((src - dst) * scale) >> 5) per lane.
//   src, dst - eight 8-bit channel values each
//   scale    - eight 16-bit weights (callers in this file pass values upscaled to 0..32)
// The arithmetic runs in signed 16-bit lanes because src - dst can be negative; the
// result is narrowed back to 8 bits without saturation.
static inline uint8x8_t blend_32_neon(uint8x8_t src, uint8x8_t dst, uint16x8_t scale) {
    const int16x8_t wideSrc = vreinterpretq_s16_u16(vmovl_u8(src));
    const int16x8_t wideDst = vreinterpretq_s16_u16(vmovl_u8(dst));

    const int16x8_t weighted = vmulq_s16(vsubq_s16(wideSrc, wideDst),
                                         vreinterpretq_s16_u16(scale));
    const int16x8_t blended  = vaddq_s16(wideDst, vshrq_n_s16(weighted, 5));

    return vmovn_u16(vreinterpretq_u16_s16(blended));
}
|
||||
|
||||
// NEON row blit for LCD16 masks with a fully opaque colour.
//   dst       - destination pixels, blended in place
//   src       - one packed 16-bit coverage value per pixel
//   color     - colour to blend in
//   width     - number of pixels
//   opaqueDst - pixel written wherever the mask is 0xFFFF
//
// Declared static inline, like the portable fallback of the same name, so that this
// file-local helper does not export a generic symbol from the translation unit.
static inline void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t src[],
                                         SkColor color, int width,
                                         SkPMColor opaqueDst) {
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    uint8x8_t vcolR = vdup_n_u8(colR);
    uint8x8_t vcolG = vdup_n_u8(colG);
    uint8x8_t vcolB = vdup_n_u8(colB);
    uint8x8_t vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
    uint8x8_t vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
    uint8x8_t vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
    uint8x8_t vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));

    // Eight pixels per iteration.
    while (width >= 8) {
        // De-interleaving load: one 8-lane vector per channel.
        uint8x8x4_t vdst = vld4_u8((uint8_t*)dst);
        uint16x8_t vmask = vld1q_u16(src);

        // Per-pixel selectors: all ones where the mask is 0 / where it is 0xFFFF.
        uint8x8_t vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
        uint8x8_t vsel_opq   = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));

        // Extract every mask channel as a 5-bit value. Green is isolated by shifting
        // the red bits out at the top and then dropping the blue bits plus green's
        // lowest bit at the bottom.
        uint16x8_t vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        uint16x8_t vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                                        SK_B16_BITS + SK_R16_BITS + 1);
        uint16x8_t vmaskB = vandq_u16(vmask, vdupq_n_u16(SK_B16_MASK));

        // Upscale 0..31 to 0..32.
        vmaskR = vaddq_u16(vmaskR, vshrq_n_u16(vmaskR, 4));
        vmaskG = vaddq_u16(vmaskG, vshrq_n_u16(vmaskG, 4));
        vmaskB = vaddq_u16(vmaskB, vshrq_n_u16(vmaskB, 4));

        // Alpha: keep dst where the mask is 0, take opaqueDst where it is 0xFFFF,
        // and use 0xFF everywhere else.
        vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
        vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);

        // Colour: blend (a zero mask yields dst unchanged) ...
        vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);

        // ... then substitute opaqueDst where the mask is 0xFFFF.
        vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
        vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
        vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);

        vst4_u8((uint8_t*)dst, vdst);

        dst += 8;
        src += 8;
        width -= 8;
    }

    // Leftovers (fewer than eight pixels) go through the scalar routine.
    for (int i = 0; i < width; i++) {
        dst[i] = blend_lcd16_opaque(colR, colG, colB, dst[i], src[i], opaqueDst);
    }
}
|
||||
|
||||
// NEON row blit for LCD16 masks with a colour that may be translucent.
//   dst   - destination pixels, blended in place
//   src   - one packed 16-bit coverage value per pixel
//   color - colour to blend in
//   width - number of pixels
// The trailing SkPMColor parameter is unused; it keeps the signature identical to the
// opaque variant.
//
// Declared static inline, like the portable fallback of the same name, so that this
// file-local helper does not export a generic symbol from the translation unit.
//
// NOTE(review): the vector loop writes alpha 0xFF for every pixel, including pixels
// whose mask is 0, while the scalar tail (blend_lcd16) returns such pixels unchanged.
// The results agree as long as the destination is already opaque.
static inline void blit_row_lcd16(SkPMColor dst[], const uint16_t src[],
                                  SkColor color, int width, SkPMColor) {
    int colA = SkColorGetA(color);
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    // Both the vector path and blend_lcd16 expect alpha in 0..256.
    colA = SkAlpha255To256(colA);

    uint16x8_t vcolA = vdupq_n_u16(colA);
    uint8x8_t vcolR = vdup_n_u8(colR);
    uint8x8_t vcolG = vdup_n_u8(colG);
    uint8x8_t vcolB = vdup_n_u8(colB);

    // Eight pixels per iteration.
    while (width >= 8) {
        // De-interleaving load: one 8-lane vector per channel.
        uint8x8x4_t vdst = vld4_u8((uint8_t*)dst);
        uint16x8_t vmask = vld1q_u16(src);

        // Extract every mask channel as a 5-bit value. Green is isolated by shifting
        // the red bits out at the top and then dropping the blue bits plus green's
        // lowest bit at the bottom.
        uint16x8_t vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        uint16x8_t vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                                        SK_B16_BITS + SK_R16_BITS + 1);
        uint16x8_t vmaskB = vandq_u16(vmask, vdupq_n_u16(SK_B16_MASK));

        // Upscale 0..31 to 0..32.
        vmaskR = vaddq_u16(vmaskR, vshrq_n_u16(vmaskR, 4));
        vmaskG = vaddq_u16(vmaskG, vshrq_n_u16(vmaskG, 4));
        vmaskB = vaddq_u16(vmaskB, vshrq_n_u16(vmaskB, 4));

        // Scale the coverage by the 0..256 colour alpha.
        vmaskR = vshrq_n_u16(vmulq_u16(vmaskR, vcolA), 8);
        vmaskG = vshrq_n_u16(vmulq_u16(vmaskG, vcolA), 8);
        vmaskB = vshrq_n_u16(vmulq_u16(vmaskB, vcolA), 8);

        vdst.val[NEON_A] = vdup_n_u8(0xFF);
        vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);

        vst4_u8((uint8_t*)dst, vdst);

        dst += 8;
        src += 8;
        width -= 8;
    }

    // Leftovers (fewer than eight pixels) go through the scalar routine.
    for (int i = 0; i < width; i++) {
        dst[i] = blend_lcd16(colA, colR, colG, colB, dst[i], src[i]);
    }
}
|
||||
|
||||
#else
|
||||
|
||||
// Portable fallback: blends the color `src` into `width` pixels of `dst`, one pixel at a
// time, using the matching LCD16 entry of `mask` for each pixel. The color's alpha is
// converted with SkAlpha255To256() once, up front, before being passed to blend_lcd16().
// The unnamed trailing SkPMColor is ignored; it keeps the signature identical to
// blit_row_lcd16_opaque.
static inline void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[],
                                  SkColor src, int width, SkPMColor) {
    const int a = SkAlpha255To256(SkColorGetA(src));
    const int r = SkColorGetR(src);
    const int g = SkColorGetG(src);
    const int b = SkColorGetB(src);

    int remaining = width;
    while (remaining-- > 0) {
        *dst = blend_lcd16(a, r, g, b, *dst, *mask);
        ++dst;
        ++mask;
    }
}
|
||||
|
||||
// Portable fallback for a fully opaque color: blends `src` into `width` pixels of `dst`
// one at a time through the LCD16 `mask`. `opaqueDst` is forwarded unchanged to
// blend_lcd16_opaque() for every pixel.
static inline void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
                                         SkColor src, int width,
                                         SkPMColor opaqueDst) {
    const int r = SkColorGetR(src);
    const int g = SkColorGetG(src);
    const int b = SkColorGetB(src);

    int remaining = width;
    while (remaining-- > 0) {
        *dst = blend_lcd16_opaque(r, g, b, *dst, *mask, opaqueDst);
        ++dst;
        ++mask;
    }
}
|
||||
|
||||
#endif
|
||||
|
||||
static bool blit_color(const SkPixmap& device,
|
||||
const SkMask& mask,
|
||||
const SkIRect& clip,
|
||||
SkColor color) {
|
||||
int x = clip.fLeft,
|
||||
y = clip.fTop;
|
||||
|
||||
if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kA8_Format) {
|
||||
SkOpts::blit_mask_d32_a8(device.writable_addr32(x,y), device.rowBytes(),
|
||||
(const SkAlpha*)mask.getAddr(x,y), mask.fRowBytes,
|
||||
color, clip.width(), clip.height());
|
||||
return true;
|
||||
}
|
||||
|
||||
if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kLCD16_Format) {
|
||||
auto dstRow = device.writable_addr32(x,y);
|
||||
auto maskRow = (const uint16_t*)mask.getAddr(x,y);
|
||||
|
||||
auto blit_row = blit_row_lcd16;
|
||||
SkPMColor opaqueDst = 0; // ignored unless opaque
|
||||
|
||||
if (0xff == SkColorGetA(color)) {
|
||||
blit_row = blit_row_lcd16_opaque;
|
||||
opaqueDst = SkPreMultiplyColor(color);
|
||||
}
|
||||
|
||||
for (int height = clip.height(); height --> 0; ) {
|
||||
blit_row(dstRow, maskRow, color, clip.width(), opaqueDst);
|
||||
|
||||
dstRow = (SkPMColor*) (( char*) dstRow + device.rowBytes());
|
||||
maskRow = (const uint16_t*)((const char*)maskRow + mask.fRowBytes);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static void SkARGB32_Blit32(const SkPixmap& device, const SkMask& mask,
|
||||
@ -171,7 +793,7 @@ void SkARGB32_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (SkBlitMask::BlitColor(fDevice, mask, clip, fColor)) {
|
||||
if (blit_color(fDevice, mask, clip, fColor)) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -191,7 +813,7 @@ void SkARGB32_Opaque_Blitter::blitMask(const SkMask& mask,
|
||||
const SkIRect& clip) {
|
||||
SkASSERT(mask.fBounds.contains(clip));
|
||||
|
||||
if (SkBlitMask::BlitColor(fDevice, mask, clip, fColor)) {
|
||||
if (blit_color(fDevice, mask, clip, fColor)) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -542,7 +1164,7 @@ static void blend_row_A8_opaque(SkPMColor* dst, const void* vmask, const SkPMCol
|
||||
});
|
||||
}
|
||||
|
||||
static void blend_row_LCD16(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
|
||||
static void blend_row_lcd16(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
|
||||
auto src_alpha_blend = [](int s, int d, int sa, int m) {
|
||||
return d + SkAlphaMul(s - SkAlphaMul(sa, d), m);
|
||||
};
|
||||
@ -607,16 +1229,16 @@ static void blend_row_LCD16_opaque(SkPMColor* dst, const void* vmask, const SkPM
|
||||
int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
|
||||
int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);
|
||||
|
||||
// Now upscale them to 0..32, so we can use SkBlend32.
|
||||
maskR = SkUpscale31To32(maskR);
|
||||
maskG = SkUpscale31To32(maskG);
|
||||
maskB = SkUpscale31To32(maskB);
|
||||
// Now upscale them to 0..32, so we can use blend_32.
|
||||
maskR = upscale_31_to_32(maskR);
|
||||
maskG = upscale_31_to_32(maskG);
|
||||
maskB = upscale_31_to_32(maskB);
|
||||
|
||||
// This LCD blit routine only works if the destination is opaque.
|
||||
dst[i] = SkPackARGB32(0xFF,
|
||||
SkBlend32(srcR, SkGetPackedR32(d), maskR),
|
||||
SkBlend32(srcG, SkGetPackedG32(d), maskG),
|
||||
SkBlend32(srcB, SkGetPackedB32(d), maskB));
|
||||
blend_32(srcR, SkGetPackedR32(d), maskR),
|
||||
blend_32(srcG, SkGetPackedG32(d), maskG),
|
||||
blend_32(srcB, SkGetPackedB32(d), maskB));
|
||||
}
|
||||
}
|
||||
|
||||
@ -641,7 +1263,7 @@ void SkARGB32_Shader_Blitter::blitMask(const SkMask& mask, const SkIRect& clip)
|
||||
} else if (mask.fFormat == SkMask::kLCD16_Format && opaque) {
|
||||
blend_row = blend_row_LCD16_opaque;
|
||||
} else if (mask.fFormat == SkMask::kLCD16_Format) {
|
||||
blend_row = blend_row_LCD16;
|
||||
blend_row = blend_row_lcd16;
|
||||
} else {
|
||||
this->INHERITED::blitMask(mask, clip);
|
||||
return;
|
||||
|
@ -10,7 +10,6 @@
|
||||
#include "SkShader.h"
|
||||
#include "SkUTF.h"
|
||||
#include "SkXfermodePriv.h"
|
||||
#include "SkBlitMask.h"
|
||||
#include "SkColorData.h"
|
||||
|
||||
#include "SkNx.h"
|
||||
|
@ -1,20 +0,0 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#include "SkColor.h"
|
||||
#include "SkColorData.h"
|
||||
#include "SkBlitMask.h"
|
||||
#include "SkUtilsArm.h"
|
||||
#include "SkBlitMask_opts_arm_neon.h"
|
||||
|
||||
// Returns the ARM LCD16 row blitter: the opaque variant when `isOpaque` is true, the
// general variant otherwise. Each name is passed through SK_ARM_NEON_WRAP.
SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
    if (!isOpaque) {
        return SK_ARM_NEON_WRAP(SkBlitLCD16Row);
    }
    return SK_ARM_NEON_WRAP(SkBlitLCD16OpaqueRow);
}
|
@ -1,127 +0,0 @@
|
||||
/*
|
||||
* Copyright 2015 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#include "SkBlitMask.h"
|
||||
#include "SkColor_opts_neon.h"
|
||||
|
||||
// NEON row blitter for a fully opaque color drawn through an LCD16 mask.
//
//   dst        destination row of 32-bit pixels, blended in place
//   src        LCD16 coverage row, one packed 16-bit entry per dst pixel
//   color      color to draw (only its R, G and B are read here)
//   width      number of pixels in the row
//   opaqueDst  pixel value written wherever the mask entry is 0xFFFF
//
// Eight pixels are handled per iteration; any remainder goes through SkBlendLCD16Opaque.
void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
                               SkColor color, int width,
                               SkPMColor opaqueDst) {
    const int colR = SkColorGetR(color);
    const int colG = SkColorGetG(color);
    const int colB = SkColorGetB(color);

    const uint8x8_t srcR = vdup_n_u8(colR);
    const uint8x8_t srcG = vdup_n_u8(colG);
    const uint8x8_t srcB = vdup_n_u8(colB);

    const uint8x8_t fullA = vdup_n_u8(SkGetPackedA32(opaqueDst));
    const uint8x8_t fullR = vdup_n_u8(SkGetPackedR32(opaqueDst));
    const uint8x8_t fullG = vdup_n_u8(SkGetPackedG32(opaqueDst));
    const uint8x8_t fullB = vdup_n_u8(SkGetPackedB32(opaqueDst));

    for (; width >= 8; width -= 8, dst += 8, src += 8) {
        uint8x8x4_t px = vld4_u8((uint8_t*)dst);
        const uint16x8_t packed = vld1q_u16(src);

        // Per-pixel selectors: all-ones where the mask entry is exactly 0 / exactly 0xFFFF.
        const uint8x8_t isZero = vmovn_u16(vceqq_u16(packed, vdupq_n_u16(0)));
        const uint8x8_t isFull = vmovn_u16(vceqq_u16(packed, vdupq_n_u16(0xFFFF)));

        // Bring each channel's coverage down to 5 bits.
        uint16x8_t covR = vshrq_n_u16(packed, SK_R16_SHIFT);
        uint16x8_t covG = vshrq_n_u16(vshlq_n_u16(packed, SK_R16_BITS),
                                      SK_B16_BITS + SK_R16_BITS + 1);
        uint16x8_t covB = vandq_u16(packed, vdupq_n_u16(SK_B16_MASK));

        // Upscale 0..31 to 0..32.
        covR = vaddq_u16(covR, vshrq_n_u16(covR, 4));
        covG = vaddq_u16(covG, vshrq_n_u16(covG, 4));
        covB = vaddq_u16(covB, vshrq_n_u16(covB, 4));

        // Alpha: untouched where the mask is 0, opaqueDst's alpha where it is 0xFFFF,
        // and 0xFF everywhere else.
        const uint8x8_t partialA = vbsl_u8(isZero, px.val[NEON_A], vdup_n_u8(0xFF));
        px.val[NEON_A] = vbsl_u8(isFull, fullA, partialA);

        // Color: blended result, replaced by opaqueDst's channel where the mask is 0xFFFF.
        const uint8x8_t mixR = SkBlend32_neon8(srcR, px.val[NEON_R], covR);
        const uint8x8_t mixG = SkBlend32_neon8(srcG, px.val[NEON_G], covG);
        const uint8x8_t mixB = SkBlend32_neon8(srcB, px.val[NEON_B], covB);

        px.val[NEON_R] = vbsl_u8(isFull, fullR, mixR);
        px.val[NEON_G] = vbsl_u8(isFull, fullG, mixG);
        px.val[NEON_B] = vbsl_u8(isFull, fullB, mixB);

        vst4_u8((uint8_t*)dst, px);
    }

    // Remaining pixels (fewer than 8), one at a time.
    for (int i = 0; i < width; ++i) {
        dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
                                    opaqueDst);
    }
}
|
||||
|
||||
// NEON row blitter for a possibly translucent color drawn through an LCD16 mask.
//
//   dst    destination row of 32-bit pixels, blended in place
//   src    LCD16 coverage row, one packed 16-bit entry per dst pixel
//   color  color to draw; its alpha (after SkAlpha255To256) scales the coverage
//   width  number of pixels in the row
//
// The unnamed SkPMColor parameter is ignored. Eight pixels are handled per iteration;
// any remainder goes through SkBlendLCD16.
void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
                         SkColor color, int width, SkPMColor) {
    const int colA = SkAlpha255To256(SkColorGetA(color));
    const int colR = SkColorGetR(color);
    const int colG = SkColorGetG(color);
    const int colB = SkColorGetB(color);

    const uint16x8_t alphaLanes = vdupq_n_u16(colA);
    const uint8x8_t  srcR = vdup_n_u8(colR);
    const uint8x8_t  srcG = vdup_n_u8(colG);
    const uint8x8_t  srcB = vdup_n_u8(colB);

    for (; width >= 8; width -= 8, dst += 8, src += 8) {
        uint8x8x4_t px = vld4_u8((uint8_t*)dst);
        const uint16x8_t packed = vld1q_u16(src);

        // Bring each channel's coverage down to 5 bits.
        uint16x8_t covR = vshrq_n_u16(packed, SK_R16_SHIFT);
        uint16x8_t covG = vshrq_n_u16(vshlq_n_u16(packed, SK_R16_BITS),
                                      SK_B16_BITS + SK_R16_BITS + 1);
        uint16x8_t covB = vandq_u16(packed, vdupq_n_u16(SK_B16_MASK));

        // Upscale 0..31 to 0..32.
        covR = vaddq_u16(covR, vshrq_n_u16(covR, 4));
        covG = vaddq_u16(covG, vshrq_n_u16(covG, 4));
        covB = vaddq_u16(covB, vshrq_n_u16(covB, 4));

        // Scale the coverage by the color's alpha.
        covR = vshrq_n_u16(vmulq_u16(covR, alphaLanes), 8);
        covG = vshrq_n_u16(vmulq_u16(covG, alphaLanes), 8);
        covB = vshrq_n_u16(vmulq_u16(covB, alphaLanes), 8);

        // The result alpha is always written as 0xFF.
        px.val[NEON_A] = vdup_n_u8(0xFF);
        px.val[NEON_R] = SkBlend32_neon8(srcR, px.val[NEON_R], covR);
        px.val[NEON_G] = SkBlend32_neon8(srcG, px.val[NEON_G], covG);
        px.val[NEON_B] = SkBlend32_neon8(srcB, px.val[NEON_B], covB);

        vst4_u8((uint8_t*)dst, px);
    }

    // Remaining pixels (fewer than 8), one at a time.
    for (int i = 0; i < width; ++i) {
        dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
    }
}
|
@ -1,21 +0,0 @@
|
||||
/*
|
||||
* Copyright 2015 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#ifndef SkBlitMask_opts_arm_neon_DEFINED
|
||||
#define SkBlitMask_opts_arm_neon_DEFINED
|
||||
|
||||
#include "SkColor.h"
|
||||
#include "SkBlitMask.h"
|
||||
|
||||
extern void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
|
||||
SkColor color, int width,
|
||||
SkPMColor opaqueDst);
|
||||
|
||||
extern void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
|
||||
SkColor color, int width, SkPMColor);
|
||||
|
||||
#endif // #ifndef SkBlitMask_opts_arm_neon_DEFINED
|
@ -1,12 +0,0 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#include "SkBlitMask.h"
|
||||
|
||||
// This variant supplies no specialized LCD16 row blitter: it returns nullptr whether or
// not the opaque routine is requested.
SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool /*isOpaque*/) {
    return nullptr;
}
|
@ -101,342 +101,3 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
|
||||
count--;
|
||||
}
|
||||
}
|
||||
|
||||
// Per-channel shift amounts that move the top 5 bits of an RGB16 mask component so that
// they begin at that component's bit offset (SK_x32_SHIFT) inside an SkPMColor.
// The mask's RGB16 channel order may differ from the SkPMColor order, so an amount can
// turn out positive (shift left), negative (shift right) or zero.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each macro below applies its channel's shift to every 32-bit lane of x. Neighboring
// bits are shifted along with the 5-bit field and are not cleared here ("Unmasked").

#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
|
||||
|
||||
static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
|
||||
__m128i &mask, __m128i &srcA) {
|
||||
// In the following comments, the components of src, dst and mask are
|
||||
// abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
|
||||
// by an R, G, B, or A suffix. Components of one of the four pixels that
|
||||
// are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
|
||||
// example is the blue channel of the second destination pixel. Memory
|
||||
// layout is shown for an ARGB byte order in a color value.
|
||||
|
||||
// src and srcA store 8-bit values interleaved with zeros.
|
||||
// src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
|
||||
// srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
|
||||
// srcA, 0, srcA, 0, srcA, 0, srcA, 0)
|
||||
// mask stores 16-bit values (compressed three channels) interleaved with zeros.
|
||||
// Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
|
||||
// mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
|
||||
// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
|
||||
|
||||
// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
|
||||
// r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
|
||||
__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
|
||||
_mm_set1_epi32(0x1F << SK_R32_SHIFT));
|
||||
|
||||
// g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
|
||||
__m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
|
||||
_mm_set1_epi32(0x1F << SK_G32_SHIFT));
|
||||
|
||||
// b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
|
||||
__m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
|
||||
_mm_set1_epi32(0x1F << SK_B32_SHIFT));
|
||||
|
||||
// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
|
||||
// Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
|
||||
// 8-bit position
|
||||
// mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
|
||||
// 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
|
||||
mask = _mm_or_si128(_mm_or_si128(r, g), b);
|
||||
|
||||
// Interleave R,G,B into the lower byte of word.
|
||||
// i.e. split the sixteen 8-bit values from mask into two sets of eight
|
||||
// 16-bit values, padded by zero.
|
||||
__m128i maskLo, maskHi;
|
||||
// maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
|
||||
maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
|
||||
// maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
|
||||
maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
|
||||
|
||||
// Upscale from 0..31 to 0..32
|
||||
// (allows to replace division by left-shift further down)
|
||||
// Left-shift each component by 4 and add the result back to that component,
|
||||
// mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
|
||||
maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
|
||||
maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
|
||||
|
||||
// Multiply each component of maskLo and maskHi by srcA
|
||||
maskLo = _mm_mullo_epi16(maskLo, srcA);
|
||||
maskHi = _mm_mullo_epi16(maskHi, srcA);
|
||||
|
||||
// Left shift mask components by 8 (divide by 256)
|
||||
maskLo = _mm_srli_epi16(maskLo, 8);
|
||||
maskHi = _mm_srli_epi16(maskHi, 8);
|
||||
|
||||
// Interleave R,G,B into the lower byte of the word
|
||||
// dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
|
||||
__m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
|
||||
// dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
|
||||
__m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
|
||||
|
||||
// mask = (src - dst) * mask
|
||||
maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
|
||||
maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
|
||||
|
||||
// mask = (src - dst) * mask >> 5
|
||||
maskLo = _mm_srai_epi16(maskLo, 5);
|
||||
maskHi = _mm_srai_epi16(maskHi, 5);
|
||||
|
||||
// Add two pixels into result.
|
||||
// result = dst + ((src - dst) * mask >> 5)
|
||||
__m128i resultLo = _mm_add_epi16(dstLo, maskLo);
|
||||
__m128i resultHi = _mm_add_epi16(dstHi, maskHi);
|
||||
|
||||
// Pack into 4 32bit dst pixels.
|
||||
// resultLo and resultHi contain eight 16-bit components (two pixels) each.
|
||||
// Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
|
||||
// clamping to 255 if necessary.
|
||||
return _mm_packus_epi16(resultLo, resultHi);
|
||||
}
|
||||
|
||||
static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
|
||||
__m128i &mask) {
|
||||
// In the following comments, the components of src, dst and mask are
|
||||
// abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
|
||||
// by an R, G, B, or A suffix. Components of one of the four pixels that
|
||||
// are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
|
||||
// example is the blue channel of the second destination pixel. Memory
|
||||
// layout is shown for an ARGB byte order in a color value.
|
||||
|
||||
// src and srcA store 8-bit values interleaved with zeros.
|
||||
// src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
|
||||
// mask stores 16-bit values (shown as high and low bytes) interleaved with
|
||||
// zeros
|
||||
// mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
|
||||
// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
|
||||
|
||||
// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
|
||||
// r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
|
||||
__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
|
||||
_mm_set1_epi32(0x1F << SK_R32_SHIFT));
|
||||
|
||||
// g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
|
||||
__m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
|
||||
_mm_set1_epi32(0x1F << SK_G32_SHIFT));
|
||||
|
||||
// b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
|
||||
__m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
|
||||
_mm_set1_epi32(0x1F << SK_B32_SHIFT));
|
||||
|
||||
// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
|
||||
// Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
|
||||
// 8-bit position
|
||||
// mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
|
||||
// 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
|
||||
mask = _mm_or_si128(_mm_or_si128(r, g), b);
|
||||
|
||||
// Interleave R,G,B into the lower byte of word.
|
||||
// i.e. split the sixteen 8-bit values from mask into two sets of eight
|
||||
// 16-bit values, padded by zero.
|
||||
__m128i maskLo, maskHi;
|
||||
// maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
|
||||
maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
|
||||
// maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
|
||||
maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
|
||||
|
||||
// Upscale from 0..31 to 0..32
|
||||
// (allows to replace division by left-shift further down)
|
||||
// Left-shift each component by 4 and add the result back to that component,
|
||||
// mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
|
||||
maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
|
||||
maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
|
||||
|
||||
// Interleave R,G,B into the lower byte of the word
|
||||
// dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
|
||||
__m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
|
||||
// dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
|
||||
__m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
|
||||
|
||||
// mask = (src - dst) * mask
|
||||
maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
|
||||
maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
|
||||
|
||||
// mask = (src - dst) * mask >> 5
|
||||
maskLo = _mm_srai_epi16(maskLo, 5);
|
||||
maskHi = _mm_srai_epi16(maskHi, 5);
|
||||
|
||||
// Add two pixels into result.
|
||||
// result = dst + ((src - dst) * mask >> 5)
|
||||
__m128i resultLo = _mm_add_epi16(dstLo, maskLo);
|
||||
__m128i resultHi = _mm_add_epi16(dstHi, maskHi);
|
||||
|
||||
// Pack into 4 32bit dst pixels and force opaque.
|
||||
// resultLo and resultHi contain eight 16-bit components (two pixels) each.
|
||||
// Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
|
||||
// clamping to 255 if necessary. Set alpha components to 0xFF.
|
||||
return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
|
||||
_mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
|
||||
}
|
||||
|
||||
// SSE2 row blitter for a possibly translucent color drawn through an LCD16 mask.
//
//   dst    destination row of 32-bit pixels, blended in place
//   mask   LCD16 coverage row, one 16-bit entry per dst pixel
//   src    color to draw; its alpha is converted with SkAlpha255To256
//   width  number of pixels in the row (nothing happens when <= 0)
//
// The unnamed SkPMColor parameter is ignored. Rows of at least four pixels are split
// into a scalar lead-in up to a 16-byte boundary, aligned groups of four handled by
// SkBlendLCD16_SSE2, and a scalar tail; shorter rows are entirely scalar.
void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
                         SkColor src, int width, SkPMColor) {
    if (width <= 0) {
        return;
    }

    const int srcA = SkAlpha255To256(SkColorGetA(src));
    const int srcR = SkColorGetR(src);
    const int srcG = SkColorGetG(src);
    const int srcB = SkColorGetB(src);

    if (width >= 4) {
        SkASSERT(((size_t)dst & 0x03) == 0);

        // Lead-in: single pixels until dst is 16-byte aligned, as the aligned
        // load/store below requires.
        for (; ((size_t)dst & 0x0F) != 0; ++dst, ++mask, --width) {
            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
        }

        // Source color with alpha set to 0xFF, replicated four times and then
        // interleaved with zeros to give two sets of four 16-bit values:
        // colorLanes = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        __m128i colorLanes = _mm_unpacklo_epi8(
                _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)),
                _mm_setzero_si128());
        // Eight 16-bit copies of srcA.
        __m128i alphaLanes = _mm_set1_epi16(srcA);

        __m128i* quad = reinterpret_cast<__m128i*>(dst);
        for (; width >= 4; width -= 4, mask += 4, ++quad) {
            // Four destination pixels, and four 16-bit masks in the lower half.
            __m128i dstPixels = _mm_load_si128(quad);
            __m128i coverage  = _mm_loadl_epi64(
                    reinterpret_cast<const __m128i*>(mask));

            // One bit per byte of the "equals zero" compare; 0xFFFF means all four
            // masks are zero, so these pixels are left untouched.
            const int zeroBits = _mm_movemask_epi8(
                    _mm_cmpeq_epi16(coverage, _mm_setzero_si128()));
            if (zeroBits == 0xFFFF) {
                continue;
            }

            // Zero-extend each 16-bit mask to 32 bits:
            // coverage = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
            //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
            coverage = _mm_unpacklo_epi16(coverage, _mm_setzero_si128());

            _mm_store_si128(quad, SkBlendLCD16_SSE2(colorLanes, dstPixels,
                                                    coverage, alphaLanes));
        }

        dst = reinterpret_cast<SkPMColor*>(quad);
    }

    // Tail: whatever is left (the whole row when it was shorter than four pixels).
    for (; width > 0; --width, ++dst, ++mask) {
        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
    }
}
|
||||
|
||||
// SSE2 row blitter for a fully opaque color drawn through an LCD16 mask.
//
//   dst        destination row of 32-bit pixels, blended in place
//   mask       LCD16 coverage row, one 16-bit entry per dst pixel
//   src        color to draw (only its R, G and B are read here)
//   width      number of pixels in the row (nothing happens when <= 0)
//   opaqueDst  forwarded to SkBlendLCD16Opaque for the pixels handled one at a time
//
// Rows of at least four pixels are split into a scalar lead-in up to a 16-byte boundary,
// aligned groups of four handled by SkBlendLCD16Opaque_SSE2, and a scalar tail; shorter
// rows are entirely scalar.
void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
                               SkColor src, int width, SkPMColor opaqueDst) {
    if (width <= 0) {
        return;
    }

    const int srcR = SkColorGetR(src);
    const int srcG = SkColorGetG(src);
    const int srcB = SkColorGetB(src);

    if (width >= 4) {
        SkASSERT(((size_t)dst & 0x03) == 0);

        // Lead-in: single pixels until dst is 16-byte aligned, as the aligned
        // load/store below requires.
        for (; ((size_t)dst & 0x0F) != 0; ++dst, ++mask, --width) {
            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
        }

        // Source color with alpha set to 0xFF, replicated four times and then
        // interleaved with zeros to give two sets of four 16-bit values:
        // colorLanes = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        __m128i colorLanes = _mm_unpacklo_epi8(
                _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)),
                _mm_setzero_si128());

        __m128i* quad = reinterpret_cast<__m128i*>(dst);
        for (; width >= 4; width -= 4, mask += 4, ++quad) {
            // Four destination pixels, and four 16-bit masks in the lower half.
            __m128i dstPixels = _mm_load_si128(quad);
            __m128i coverage  = _mm_loadl_epi64(
                    reinterpret_cast<const __m128i*>(mask));

            // One bit per byte of the "equals zero" compare; 0xFFFF means all four
            // masks are zero, so these pixels are left untouched.
            const int zeroBits = _mm_movemask_epi8(
                    _mm_cmpeq_epi16(coverage, _mm_setzero_si128()));
            if (zeroBits == 0xFFFF) {
                continue;
            }

            // Zero-extend each 16-bit mask to 32 bits:
            // coverage = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
            //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
            coverage = _mm_unpacklo_epi16(coverage, _mm_setzero_si128());

            _mm_store_si128(quad, SkBlendLCD16Opaque_SSE2(colorLanes, dstPixels,
                                                          coverage));
        }

        dst = reinterpret_cast<SkPMColor*>(quad);
    }

    // Tail: whatever is left (the whole row when it was shorter than four pixels).
    for (; width > 0; --width, ++dst, ++mask) {
        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
    }
}
|
||||
|
@ -18,9 +18,4 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src,
|
||||
int count, U8CPU alpha);
|
||||
|
||||
void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
|
||||
SkColor color, int width, SkPMColor);
|
||||
void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
|
||||
SkColor color, int width, SkPMColor opaqueDst);
|
||||
|
||||
#endif
|
||||
|
@ -7,7 +7,6 @@
|
||||
|
||||
#include "SkBlitRow_opts_arm_neon.h"
|
||||
|
||||
#include "SkBlitMask.h"
|
||||
#include "SkBlitRow.h"
|
||||
#include "SkColorData.h"
|
||||
#include "SkMathPriv.h"
|
||||
|
@ -73,22 +73,6 @@ static inline uint16x8_t SkPixel32ToPixel16_neon8(uint8x8x4_t vsrc) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* This function blends 8 pixels of the same channel in the exact same way as
|
||||
* SkBlend32.
|
||||
*/
|
||||
static inline uint8x8_t SkBlend32_neon8(uint8x8_t src, uint8x8_t dst, uint16x8_t scale) {
|
||||
int16x8_t src_wide, dst_wide;
|
||||
|
||||
src_wide = vreinterpretq_s16_u16(vmovl_u8(src));
|
||||
dst_wide = vreinterpretq_s16_u16(vmovl_u8(dst));
|
||||
|
||||
src_wide = (src_wide - dst_wide) * vreinterpretq_s16_u16(scale);
|
||||
|
||||
dst_wide += vshrq_n_s16(src_wide, 5);
|
||||
|
||||
return vmovn_u16(vreinterpretq_u16_s16(dst_wide));
|
||||
}
|
||||
|
||||
static inline SkPMColor SkFourByteInterp256_neon(SkPMColor src, SkPMColor dst,
|
||||
unsigned srcScale) {
|
||||
SkASSERT(srcScale <= 256);
|
||||
|
@ -7,7 +7,6 @@
|
||||
|
||||
#include "SkBitmapProcState_opts_SSE2.h"
|
||||
#include "SkBitmapProcState_opts_SSSE3.h"
|
||||
#include "SkBlitMask.h"
|
||||
#include "SkBlitRow.h"
|
||||
#include "SkBlitRow_opts_SSE2.h"
|
||||
#include "SkCpu.h"
|
||||
@ -79,18 +78,3 @@ SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Returns the SSE2 LCD16 row blitter (the opaque variant when `isOpaque` is true), or
// nullptr when SkCpu reports that SSE2 is not supported.
SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
    if (!SkCpu::Supports(SkCpu::SSE2)) {
        return nullptr;
    }
    return isOpaque ? SkBlitLCD16OpaqueRow_SSE2 : SkBlitLCD16Row_SSE2;
}
|
||||
|
Loading…
Reference in New Issue
Block a user