ARM Skia NEON patches - 21 - new NEON S32_D565_Opaque

BlitRow565: NEON version of S32_D565_Opaque

Here's a new NEON implementation of S32_D565_Opaque. It
dramatically improves speed compared to S32A_D565_Opaque.

Here are the benchmark results (speedup vs. existing NEON):

+-------+-----------+------------+
| count | Cortex-A9 | Cortex-A15 |
+-------+-----------+------------+
| 1     | +130%     | +139%      |
+-------+-----------+------------+
| 2     | +65,2%    | +51%       |
+-------+-----------+------------+
| 4     | -25,5%    | +10,2%     |
+-------+-----------+------------+
| 8     | +63,8%    | +32,1%     |
+-------+-----------+------------+
| 16    | +110%     | +49,2%     |
+-------+-----------+------------+
| 64    | +153%     | +123,5%    |
+-------+-----------+------------+
| 256   | +151%     | +144,7%    |
+-------+-----------+------------+
| 1024  | +272%     | +157,2%    |
+-------+-----------+------------+

Signed-off-by: Kévin PETIT <kevin.petit@arm.com>

BUG=
R=djsollen@google.com, mtklein@google.com

Author: kevin.petit.arm@gmail.com

Review URL: https://chromiumcodereview.appspot.com/22351006

git-svn-id: http://skia.googlecode.com/svn/trunk@11415 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
commit-bot@chromium.org 2013-09-20 15:38:49 +00:00
parent 519f9677a4
commit 0060159457
2 changed files with 53 additions and 5 deletions

View File

@ -15,9 +15,45 @@
#include "SkUtils.h"
#include "SkCachePreload_arm.h"
#include "SkColor_opts_neon.h"
#include <arm_neon.h>
void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src, int count,
U8CPU alpha, int /*x*/, int /*y*/) {
SkASSERT(255 == alpha);
while (count >= 8) {
uint8x8x4_t vsrc;
uint16x8_t vdst;
// Load
vsrc = vld4_u8((uint8_t*)src);
// Convert src to 565
vdst = vshll_n_u8(vsrc.val[NEON_R], 8);
vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_G], 8), 5);
vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_B], 8), 5+6);
// Store
vst1q_u16(dst, vdst);
// Prepare next iteration
dst += 8;
src += 8;
count -= 8;
};
// Leftovers
while (count > 0) {
SkPMColor c = *src++;
SkPMColorAssert(c);
*dst = SkPixel32ToPixel16_ToU16(c);
dst++;
count--;
};
}
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src, int count,
U8CPU alpha, int /*x*/, int /*y*/) {
@ -1330,10 +1366,10 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
// no dither
// NOTE: For the two functions below, we don't have a special version
// that assumes that each source pixel is opaque. But our S32A is
// still faster than the default, so use it.
S32A_D565_Opaque_neon, // really S32_D565_Opaque
// NOTE: For the S32_D565_Blend function below, we don't have a special
// version that assumes that each source pixel is opaque. But our
// S32A is still faster than the default, so use it.
S32_D565_Opaque_neon,
S32A_D565_Blend_neon, // really S32_D565_Blend
S32A_D565_Opaque_neon,
S32A_D565_Blend_neon,

View File

@ -0,0 +1,12 @@
#ifndef SkColor_opts_neon_DEFINED
#define SkColor_opts_neon_DEFINED
#include "SkTypes.h"
// Per-channel byte indices within a 32-bit pixel, obtained by dividing each
// channel's bit shift (SK_*32_SHIFT) by 8. They are meant to index the val[]
// array of a 4-way de-interleaved byte load such as vld4_u8(), so NEON code
// can pick a channel without hard-coding the pixel layout.
// NOTE(review): SK_*32_SHIFT are presumably provided via SkTypes.h and are
// multiples of 8 - confirm.
#define NEON_A (SK_A32_SHIFT / 8)
#define NEON_R (SK_R32_SHIFT / 8)
#define NEON_G (SK_G32_SHIFT / 8)
#define NEON_B (SK_B32_SHIFT / 8)
#endif /* #ifndef SkColor_opts_neon_DEFINED */