From 6a2c42f893ddb0141bef6e90c14fb68dda5ccb30 Mon Sep 17 00:00:00 2001 From: Mike Klein Date: Fri, 9 Nov 2018 12:09:36 -0500 Subject: [PATCH] clean up SkBlitRow_opts SSE2 and NEON are common baseline instruction sets now, so there's no need to runtime detect support for these routines. I simplified the SSE and portable implementations while moving them. Cq-Include-Trybots: master.tryserver.blink:linux_trusty_blink_rel Change-Id: I34e96851735c8d7ad90198f3ac4bf86ff508f17c Reviewed-on: https://skia-review.googlesource.com/c/170220 Reviewed-by: Mike Klein Commit-Queue: Mike Klein --- gn/opts.gni | 8 - src/core/SkBlitRow.h | 13 -- src/core/SkBlitRow_D32.cpp | 286 +++++++++++++++++++-------- src/opts/SkBlitRow_opts_SSE2.cpp | 103 ---------- src/opts/SkBlitRow_opts_SSE2.h | 21 -- src/opts/SkBlitRow_opts_arm.cpp | 19 -- src/opts/SkBlitRow_opts_arm_neon.cpp | 200 ------------------- src/opts/SkBlitRow_opts_arm_neon.h | 14 -- src/opts/SkBlitRow_opts_none.cpp | 14 -- src/opts/opts_check_x86.cpp | 19 -- 10 files changed, 208 insertions(+), 489 deletions(-) delete mode 100644 src/opts/SkBlitRow_opts_SSE2.cpp delete mode 100644 src/opts/SkBlitRow_opts_SSE2.h delete mode 100644 src/opts/SkBlitRow_opts_arm.cpp delete mode 100644 src/opts/SkBlitRow_opts_arm_neon.cpp delete mode 100644 src/opts/SkBlitRow_opts_arm_neon.h delete mode 100644 src/opts/SkBlitRow_opts_none.cpp diff --git a/gn/opts.gni b/gn/opts.gni index 355e15c492..3d3e040473 100644 --- a/gn/opts.gni +++ b/gn/opts.gni @@ -9,12 +9,10 @@ _src = get_path_info("../src", "abspath") none = [ "$_src/opts/Sk4px_none.h", "$_src/opts/SkBitmapProcState_opts_none.cpp", - "$_src/opts/SkBlitRow_opts_none.cpp", ] armv7 = [ "$_src/opts/SkBitmapProcState_opts_none.cpp", - "$_src/opts/SkBlitRow_opts_arm.cpp", ] neon = [ @@ -23,8 +21,6 @@ neon = [ "$_src/opts/SkBitmapProcState_filter_neon.h", "$_src/opts/SkBitmapProcState_matrixProcs_neon.cpp", "$_src/opts/SkBitmapProcState_matrix_neon.h", - "$_src/opts/SkBlitRow_opts_arm_neon.h", - "$_src/opts/SkBlitRow_opts_arm_neon.cpp", "$_src/opts/SkColor_opts_neon.h", ] @@ -35,9 +31,6 @@ arm64 = [ "$_src/opts/SkBitmapProcState_matrixProcs_neon.cpp", "$_src/opts/SkBitmapProcState_matrix_neon.h", "$_src/opts/SkBitmapProcState_opts_none.cpp", - "$_src/opts/SkBlitRow_opts_arm.cpp", - "$_src/opts/SkBlitRow_opts_arm_neon.h", - "$_src/opts/SkBlitRow_opts_arm_neon.cpp", "$_src/opts/SkColor_opts_neon.h", ] @@ -47,7 +40,6 @@ sse2 = [ "$_src/opts/Sk4px_SSE2.h", "$_src/opts/SkBitmapProcState_opts_SSE2.h", "$_src/opts/SkBitmapProcState_opts_SSE2.cpp", - "$_src/opts/SkBlitRow_opts_SSE2.cpp", "$_src/opts/SkColor_opts_SSE2.h", "$_src/opts/opts_check_x86.cpp", ] diff --git a/src/core/SkBlitRow.h b/src/core/SkBlitRow.h index d35f1c460e..19d1f73afc 100644 --- a/src/core/SkBlitRow.h +++ b/src/core/SkBlitRow.h @@ -33,19 +33,6 @@ public: if they are not, they may not overlap. */ static void Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color); - - /** These static functions are called by the Factory and Factory32 - functions, and should return either NULL, or a - platform-specific function-ptr to be used in place of the - system default. - */ - - static Proc32 PlatformProcs32(unsigned flags); - -private: - enum { - kFlags32_Mask = 3 - }; }; #endif diff --git a/src/core/SkBlitRow_D32.cpp b/src/core/SkBlitRow_D32.cpp index b82665a19c..b8095d4c05 100644 --- a/src/core/SkBlitRow_D32.cpp +++ b/src/core/SkBlitRow_D32.cpp @@ -11,103 +11,233 @@ #include "SkOpts.h" #include "SkUtils.h" -#define UNROLL - -static void S32_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha) { +// Everyone agrees memcpy() is the best way to do this. +static void blit_row_s32_opaque(SkPMColor* dst, + const SkPMColor* src, + int count, + U8CPU alpha) { SkASSERT(255 == alpha); - memcpy(dst, src, count * 4); + memcpy(dst, src, count * sizeof(SkPMColor)); } -static void S32_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha) { - SkASSERT(alpha <= 255); - if (count > 0) { - unsigned src_scale = SkAlpha255To256(alpha); +// We have SSE2, NEON, and portable implementations of +// blit_row_s32_blend() and blit_row_s32a_blend(). -#ifdef UNROLL - if (count & 1) { - *dst = SkPMLerp(*src, *dst, src_scale); - src += 1; - dst += 1; - count -= 1; +// TODO(mtklein): can we do better in NEON than 2 pixels at a time? + +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 + #include + #include "SkColor_opts_SSE2.h" + + static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { + SkASSERT(alpha <= 255); + + auto src4 = (const __m128i*)src; + auto dst4 = ( __m128i*)dst; + + while (count >= 4) { + _mm_storeu_si128(dst4, SkPMLerp_SSE2(_mm_loadu_si128(src4), + _mm_loadu_si128(dst4), + SkAlpha255To256(alpha))); + src4++; + dst4++; + count -= 4; } - const SkPMColor* SK_RESTRICT srcEnd = src + count; - while (src != srcEnd) { - *dst = SkPMLerp(*src, *dst, src_scale); - src += 1; - dst += 1; - *dst = SkPMLerp(*src, *dst, src_scale); - src += 1; - dst += 1; + src = (const SkPMColor*)src4; + dst = ( SkPMColor*)dst4; + + while (count --> 0) { + *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha)); + src++; + dst++; } -#else - do { - *dst = SkPMLerp(*src, *dst, src_scale); - src += 1; - dst += 1; - } while (--count > 0); -#endif } -} -static void S32A_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha) { - SkASSERT(alpha <= 255); - if (count > 0) { -#ifdef UNROLL - if (count & 1) { - *dst = SkBlendARGB32(*(src++), *dst, alpha); - dst += 1; - count -= 1; + static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { + SkASSERT(alpha <= 255); + + auto src4 = (const __m128i*)src; + auto dst4 = ( __m128i*)dst; + + while (count >= 4) { + _mm_storeu_si128(dst4, SkBlendARGB32_SSE2(_mm_loadu_si128(src4), + _mm_loadu_si128(dst4), + alpha)); + src4++; + dst4++; + count -= 4; } - const SkPMColor* SK_RESTRICT srcEnd = src + count; - while (src != srcEnd) { - *dst = SkBlendARGB32(*(src++), *dst, alpha); - dst += 1; - *dst = SkBlendARGB32(*(src++), *dst, alpha); - dst += 1; - } -#else - do { + src = (const SkPMColor*)src4; + dst = ( SkPMColor*)dst4; + + while (count --> 0) { *dst = SkBlendARGB32(*src, *dst, alpha); - src += 1; - dst += 1; - } while (--count > 0); -#endif + src++; + dst++; + } } -} -/////////////////////////////////////////////////////////////////////////////// +#elif defined(SK_ARM_HAS_NEON) -static const SkBlitRow::Proc32 gDefault_Procs32[] = { - S32_Opaque_BlitRow32, - S32_Blend_BlitRow32, - nullptr, - S32A_Blend_BlitRow32 -}; + #include "SkColor_opts_neon.h" + #include + + static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { + SkASSERT(alpha <= 255); + + uint16_t src_scale = SkAlpha255To256(alpha); + uint16_t dst_scale = 256 - src_scale; + + while (count >= 2) { + uint8x8_t vsrc, vdst, vres; + uint16x8_t vsrc_wide, vdst_wide; + + vsrc = vreinterpret_u8_u32(vld1_u32(src)); + vdst = vreinterpret_u8_u32(vld1_u32(dst)); + + vsrc_wide = vmovl_u8(vsrc); + vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); + + vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); + + vdst_wide += vsrc_wide; + vres = vshrn_n_u16(vdst_wide, 8); + + vst1_u32(dst, vreinterpret_u32_u8(vres)); + + src += 2; + dst += 2; + count -= 2; + } + + if (count == 1) { + uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; + uint16x8_t vsrc_wide, vdst_wide; + + vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); + vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); + + vsrc_wide = vmovl_u8(vsrc); + vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); + vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); + vdst_wide += vsrc_wide; + vres = vshrn_n_u16(vdst_wide, 8); + + vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); + } + } + + static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { + SkASSERT(alpha < 255); + + unsigned alpha256 = SkAlpha255To256(alpha); + + if (count & 1) { + uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; + uint16x8_t vdst_wide, vsrc_wide; + unsigned dst_scale; + + vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); + vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); + + dst_scale = vget_lane_u8(vsrc, 3); + dst_scale = SkAlphaMulInv256(dst_scale, alpha256); + + vsrc_wide = vmovl_u8(vsrc); + vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256); + + vdst_wide = vmovl_u8(vdst); + vdst_wide = vmulq_n_u16(vdst_wide, dst_scale); + + vdst_wide += vsrc_wide; + vres = vshrn_n_u16(vdst_wide, 8); + + vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); + dst++; + src++; + count--; + } + + uint8x8_t alpha_mask; + static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; + alpha_mask = vld1_u8(alpha_mask_setup); + + while (count) { + + uint8x8_t vsrc, vdst, vres, vsrc_alphas; + uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale; + + __builtin_prefetch(src+32); + __builtin_prefetch(dst+32); + + vsrc = vreinterpret_u8_u32(vld1_u32(src)); + vdst = vreinterpret_u8_u32(vld1_u32(dst)); + + vsrc_scale = vdupq_n_u16(alpha256); + + vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); + vdst_scale = vmovl_u8(vsrc_alphas); + // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale). + // A 16-bit lane would overflow if we used 0xFFFF here, + // so use an approximation with 0xFF00 that is off by 1, + // and add back 1 after to get the correct value. + // This is valid if alpha256 <= 255. + vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale); + vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8); + vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8); + + vsrc_wide = vmovl_u8(vsrc); + vsrc_wide *= vsrc_scale; + + vdst_wide = vmovl_u8(vdst); + vdst_wide *= vdst_scale; + + vdst_wide += vsrc_wide; + vres = vshrn_n_u16(vdst_wide, 8); + + vst1_u32(dst, vreinterpret_u32_u8(vres)); + + src += 2; + dst += 2; + count -= 2; + } + } + +#else + static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { + SkASSERT(alpha <= 255); + while (count --> 0) { + *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha)); + src++; + dst++; + } + } + + static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { + SkASSERT(alpha <= 255); + while (count --> 0) { + *dst = SkBlendARGB32(*src, *dst, alpha); + src++; + dst++; + } + } +#endif SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) { - SkASSERT(flags < SK_ARRAY_COUNT(gDefault_Procs32)); - // just so we don't crash - flags &= kFlags32_Mask; + static const SkBlitRow::Proc32 kProcs[] = { + blit_row_s32_opaque, + blit_row_s32_blend, + nullptr, // blit_row_s32a_opaque is in SkOpts + blit_row_s32a_blend + }; - if (flags == 2) { - // S32A_Opaque_BlitRow32 has been ported to SkOpts, but not the others yet. - return SkOpts::blit_row_s32a_opaque; - } + SkASSERT(flags < SK_ARRAY_COUNT(kProcs)); + flags &= SK_ARRAY_COUNT(kProcs) - 1; // just to be safe - SkBlitRow::Proc32 proc = PlatformProcs32(flags); - if (nullptr == proc) { - proc = gDefault_Procs32[flags]; - } - SkASSERT(proc); - return proc; + return flags == 2 ? SkOpts::blit_row_s32a_opaque + : kProcs[flags]; } void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) { diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp deleted file mode 100644 index 36cc3f4b19..0000000000 --- a/src/opts/SkBlitRow_opts_SSE2.cpp +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright 2012 The Android Open Source Project - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -#include -#include "SkBitmapProcState_opts_SSE2.h" -#include "SkBlitRow_opts_SSE2.h" -#include "SkColorData.h" -#include "SkColor_opts_SSE2.h" -#include "SkMSAN.h" -#include "SkUTF.h" - -/* SSE2 version of S32_Blend_BlitRow32() - * portable version is in core/SkBlitRow_D32.cpp - */ -void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha) { - SkASSERT(alpha <= 255); - if (count <= 0) { - return; - } - - uint32_t src_scale = SkAlpha255To256(alpha); - - if (count >= 4) { - SkASSERT(((size_t)dst & 0x03) == 0); - while (((size_t)dst & 0x0F) != 0) { - *dst = SkPMLerp(*src, *dst, src_scale); - src++; - dst++; - count--; - } - - const __m128i *s = reinterpret_cast(src); - __m128i *d = reinterpret_cast<__m128i*>(dst); - - while (count >= 4) { - // Load 4 pixels each of src and dest. - __m128i src_pixel = _mm_loadu_si128(s); - __m128i dst_pixel = _mm_load_si128(d); - - __m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale); - _mm_store_si128(d, result); - s++; - d++; - count -= 4; - } - src = reinterpret_cast(s); - dst = reinterpret_cast(d); - } - - while (count > 0) { - *dst = SkPMLerp(*src, *dst, src_scale); - src++; - dst++; - count--; - } -} - -void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha) { - SkASSERT(alpha <= 255); - if (count <= 0) { - return; - } - - if (count >= 4) { - while (((size_t)dst & 0x0F) != 0) { - *dst = SkBlendARGB32(*src, *dst, alpha); - src++; - dst++; - count--; - } - - const __m128i *s = reinterpret_cast(src); - __m128i *d = reinterpret_cast<__m128i*>(dst); - while (count >= 4) { - // Load 4 pixels each of src and dest. - __m128i src_pixel = _mm_loadu_si128(s); - __m128i dst_pixel = _mm_load_si128(d); - - __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha); - _mm_store_si128(d, result); - s++; - d++; - count -= 4; - } - src = reinterpret_cast(s); - dst = reinterpret_cast(d); - } - - while (count > 0) { - *dst = SkBlendARGB32(*src, *dst, alpha); - src++; - dst++; - count--; - } -} diff --git a/src/opts/SkBlitRow_opts_SSE2.h b/src/opts/SkBlitRow_opts_SSE2.h deleted file mode 100644 index 826a5ccaaf..0000000000 --- a/src/opts/SkBlitRow_opts_SSE2.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright 2009 The Android Open Source Project - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -#ifndef SkBlitRow_opts_SSE2_DEFINED -#define SkBlitRow_opts_SSE2_DEFINED - -#include "SkBlitRow.h" - -void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha); - -void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha); - -#endif diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp deleted file mode 100644 index 543640a57f..0000000000 --- a/src/opts/SkBlitRow_opts_arm.cpp +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright 2012 The Android Open Source Project - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -#include "SkBlitRow.h" -#include "SkUtilsArm.h" - -#include "SkBlitRow_opts_arm_neon.h" - -extern const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = { - nullptr, nullptr, nullptr, nullptr, -}; - -SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { - return SK_ARM_NEON_WRAP(sk_blitrow_platform_32_procs_arm)[flags]; -} diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp deleted file mode 100644 index 23ea938dfb..0000000000 --- a/src/opts/SkBlitRow_opts_arm_neon.cpp +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright 2012 The Android Open Source Project - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -#include "SkBlitRow_opts_arm_neon.h" - -#include "SkBlitRow.h" -#include "SkColorData.h" -#include "SkMathPriv.h" -#include "SkUTF.h" - -#include "SkColor_opts_neon.h" -#include - -/* Neon version of S32_Blend_BlitRow32() - * portable version is in src/core/SkBlitRow_D32.cpp - */ -void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha) { - SkASSERT(alpha <= 255); - - if (count <= 0) { - return; - } - - uint16_t src_scale = SkAlpha255To256(alpha); - uint16_t dst_scale = 256 - src_scale; - - while (count >= 2) { - uint8x8_t vsrc, vdst, vres; - uint16x8_t vsrc_wide, vdst_wide; - - /* These commented prefetches are a big win for count - * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4. - * They also hurt a little (<5%) on an A15 - */ - //__builtin_prefetch(src+32); - //__builtin_prefetch(dst+32); - - // Load - vsrc = vreinterpret_u8_u32(vld1_u32(src)); - vdst = vreinterpret_u8_u32(vld1_u32(dst)); - - // Process src - vsrc_wide = vmovl_u8(vsrc); - vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); - - // Process dst - vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); - - // Combine - vdst_wide += vsrc_wide; - vres = vshrn_n_u16(vdst_wide, 8); - - // Store - vst1_u32(dst, vreinterpret_u32_u8(vres)); - - src += 2; - dst += 2; - count -= 2; - } - - if (count == 1) { - uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; - uint16x8_t vsrc_wide, vdst_wide; - - // Load - vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); - vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); - - // Process - vsrc_wide = vmovl_u8(vsrc); - vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); - vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); - vdst_wide += vsrc_wide; - vres = vshrn_n_u16(vdst_wide, 8); - - // Store - vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); - } -} - -#ifdef SK_CPU_ARM32 -void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha) { - - SkASSERT(255 > alpha); - - if (count <= 0) { - return; - } - - unsigned alpha256 = SkAlpha255To256(alpha); - - // First deal with odd counts - if (count & 1) { - uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; - uint16x8_t vdst_wide, vsrc_wide; - unsigned dst_scale; - - // Load - vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); - vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); - - // Calc dst_scale - dst_scale = vget_lane_u8(vsrc, 3); - dst_scale = SkAlphaMulInv256(dst_scale, alpha256); - - // Process src - vsrc_wide = vmovl_u8(vsrc); - vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256); - - // Process dst - vdst_wide = vmovl_u8(vdst); - vdst_wide = vmulq_n_u16(vdst_wide, dst_scale); - - // Combine - vdst_wide += vsrc_wide; - vres = vshrn_n_u16(vdst_wide, 8); - - vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); - dst++; - src++; - count--; - } - - if (count) { - uint8x8_t alpha_mask; - static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; - alpha_mask = vld1_u8(alpha_mask_setup); - - do { - - uint8x8_t vsrc, vdst, vres, vsrc_alphas; - uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale; - - __builtin_prefetch(src+32); - __builtin_prefetch(dst+32); - - // Load - vsrc = vreinterpret_u8_u32(vld1_u32(src)); - vdst = vreinterpret_u8_u32(vld1_u32(dst)); - - // Prepare src_scale - vsrc_scale = vdupq_n_u16(alpha256); - - // Calc dst_scale - vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); - vdst_scale = vmovl_u8(vsrc_alphas); - // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale). - // A 16-bit lane would overflow if we used 0xFFFF here, - // so use an approximation with 0xFF00 that is off by 1, - // and add back 1 after to get the correct value. - // This is valid if alpha256 <= 255. - vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale); - vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8); - vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8); - - // Process src - vsrc_wide = vmovl_u8(vsrc); - vsrc_wide *= vsrc_scale; - - // Process dst - vdst_wide = vmovl_u8(vdst); - vdst_wide *= vdst_scale; - - // Combine - vdst_wide += vsrc_wide; - vres = vshrn_n_u16(vdst_wide, 8); - - vst1_u32(dst, vreinterpret_u32_u8(vres)); - - src += 2; - dst += 2; - count -= 2; - } while(count); - } -} - -/////////////////////////////////////////////////////////////////////////////// - -#endif // #ifdef SK_CPU_ARM32 - -/////////////////////////////////////////////////////////////////////////////// - -const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { - nullptr, // S32_Opaque, - S32_Blend_BlitRow32_neon, // S32_Blend, - nullptr, // Ported to SkOpts -#ifdef SK_CPU_ARM32 - S32A_Blend_BlitRow32_neon // S32A_Blend -#else - nullptr -#endif -}; diff --git a/src/opts/SkBlitRow_opts_arm_neon.h b/src/opts/SkBlitRow_opts_arm_neon.h deleted file mode 100644 index 815c2b7476..0000000000 --- a/src/opts/SkBlitRow_opts_arm_neon.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Copyright 2012 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ -#ifndef SkBlitRow_opts_arm_neon_DEFINED -#define SkBlitRow_opts_arm_neon_DEFINED - -#include "SkBlitRow.h" - -extern const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[]; - -#endif diff --git a/src/opts/SkBlitRow_opts_none.cpp b/src/opts/SkBlitRow_opts_none.cpp deleted file mode 100644 index 289bb7e88c..0000000000 --- a/src/opts/SkBlitRow_opts_none.cpp +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Copyright 2011 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -#include "SkBlitRow.h" - -// Platform impl of Platform_procs with no overrides - -SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { - return nullptr; -} diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp index 4ef210ac02..82d2d47550 100644 --- a/src/opts/opts_check_x86.cpp +++ b/src/opts/opts_check_x86.cpp @@ -7,8 +7,6 @@ #include "SkBitmapProcState_opts_SSE2.h" #include "SkBitmapProcState_opts_SSSE3.h" -#include "SkBlitRow.h" -#include "SkBlitRow_opts_SSE2.h" #include "SkCpu.h" @@ -61,20 +59,3 @@ void SkBitmapProcState::platformProcs() { fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2; } } - -//////////////////////////////////////////////////////////////////////////////// - -static const SkBlitRow::Proc32 platform_32_procs_SSE2[] = { - nullptr, // S32_Opaque, - S32_Blend_BlitRow32_SSE2, // S32_Blend, - nullptr, // Ported to SkOpts - S32A_Blend_BlitRow32_SSE2, // S32A_Blend, -}; - -SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { - if (SkCpu::Supports(SkCpu::SSE2)) { - return platform_32_procs_SSE2[flags]; - } else { - return nullptr; - } -}