ARM Skia NEON patches - 21 - new NEON S32_D565_Opaque
BlitRow565: NEON version of S32_D565_Opaque. Here's a new NEON implementation of S32_D565_Opaque. It dramatically improves speed compared to S32A_D565_Opaque. Here are the benchmark results (speedup vs. existing NEON): +-------+-----------+------------+ | count | Cortex-A9 | Cortex-A15 | +-------+-----------+------------+ | 1 | +130% | +139% | +-------+-----------+------------+ | 2 | +65,2% | +51% | +-------+-----------+------------+ | 4 | -25,5% | +10,2% | +-------+-----------+------------+ | 8 | +63,8% | +32,1% | +-------+-----------+------------+ | 16 | +110% | +49,2% | +-------+-----------+------------+ | 64 | +153% | +123,5% | +-------+-----------+------------+ | 256 | +151% | +144,7% | +-------+-----------+------------+ | 1024 | +272% | +157,2% | +-------+-----------+------------+ Signed-off-by: Kévin PETIT <kevin.petit@arm.com> BUG= R=djsollen@google.com, mtklein@google.com Author: kevin.petit.arm@gmail.com Review URL: https://chromiumcodereview.appspot.com/22351006 git-svn-id: http://skia.googlecode.com/svn/trunk@11415 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
parent
519f9677a4
commit
0060159457
@ -15,9 +15,45 @@
|
||||
#include "SkUtils.h"
|
||||
|
||||
#include "SkCachePreload_arm.h"
|
||||
|
||||
#include "SkColor_opts_neon.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src, int count,
|
||||
U8CPU alpha, int /*x*/, int /*y*/) {
|
||||
SkASSERT(255 == alpha);
|
||||
|
||||
while (count >= 8) {
|
||||
uint8x8x4_t vsrc;
|
||||
uint16x8_t vdst;
|
||||
|
||||
// Load
|
||||
vsrc = vld4_u8((uint8_t*)src);
|
||||
|
||||
// Convert src to 565
|
||||
vdst = vshll_n_u8(vsrc.val[NEON_R], 8);
|
||||
vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_G], 8), 5);
|
||||
vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_B], 8), 5+6);
|
||||
|
||||
// Store
|
||||
vst1q_u16(dst, vdst);
|
||||
|
||||
// Prepare next iteration
|
||||
dst += 8;
|
||||
src += 8;
|
||||
count -= 8;
|
||||
};
|
||||
|
||||
// Leftovers
|
||||
while (count > 0) {
|
||||
SkPMColor c = *src++;
|
||||
SkPMColorAssert(c);
|
||||
*dst = SkPixel32ToPixel16_ToU16(c);
|
||||
dst++;
|
||||
count--;
|
||||
};
|
||||
}
|
||||
|
||||
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src, int count,
|
||||
U8CPU alpha, int /*x*/, int /*y*/) {
|
||||
@ -1330,10 +1366,10 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
|
||||
|
||||
const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
|
||||
// no dither
|
||||
// NOTE: For the two functions below, we don't have a special version
|
||||
// that assumes that each source pixel is opaque. But our S32A is
|
||||
// still faster than the default, so use it.
|
||||
S32A_D565_Opaque_neon, // really S32_D565_Opaque
|
||||
// NOTE: For the S32_D565_Blend function below, we don't have a special
|
||||
// version that assumes that each source pixel is opaque. But our
|
||||
// S32A is still faster than the default, so use it.
|
||||
S32_D565_Opaque_neon,
|
||||
S32A_D565_Blend_neon, // really S32_D565_Blend
|
||||
S32A_D565_Opaque_neon,
|
||||
S32A_D565_Blend_neon,
|
||||
|
12
src/opts/SkColor_opts_neon.h
Normal file
12
src/opts/SkColor_opts_neon.h
Normal file
@ -0,0 +1,12 @@
|
||||
#ifndef SkColor_opts_neon_DEFINED
#define SkColor_opts_neon_DEFINED

#include "SkTypes.h"

/*
 * Per-channel lane indices for de-interleaved 32-bit pixel data.
 *
 * Each index is the channel's SK_*32_SHIFT bit offset divided by 8, i.e. the
 * offset expressed in whole bytes. The NEON blitters use these to select an
 * element of the val[] array of a uint8x8x4_t filled by vld4_u8().
 *
 * NOTE(review): this presumably relies on a little-endian pixel layout, where
 * a channel's byte position in memory equals its shift / 8 -- confirm before
 * reusing on another target.
 */
#define NEON_A (SK_A32_SHIFT / 8)
#define NEON_R (SK_R32_SHIFT / 8)
#define NEON_G (SK_G32_SHIFT / 8)
#define NEON_B (SK_B32_SHIFT / 8)

#endif /* SkColor_opts_neon_DEFINED */
|
||||
|
Loading…
Reference in New Issue
Block a user