Port SkMatrix opts to SkOpts.

No changes to the code, just moved around.

This will have the effect of enabling vectorized code on ARMv7.
Should be no effect on ARMv8 or x86, which would have been vectorized already.

nanobench --match mappoints changes on Nexus 5 (ARMv7):

_affine: 132 -> 95
_scale: 118 -> 47
_trans: 60 -> 37

A teaser:
We should next look at the ABCD->BADC shuffle we've noted that we need in _affine.  A quick hack showed doing that optimally is another ~35% speedup on x86.  Got to figure out how to do it best on ARM though: that same quick hack was a 2x slowdown there.  Good reason to resurrect that SkNx_shuffle() CL!

(I believe the answers are vrev64q_f32(v) and _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,3,0,1)), but we should probably find out in another CL.)

BUG=skia:4117

Review URL: https://codereview.chromium.org/1320673014
This commit is contained in:
mtklein 2015-09-10 11:18:31 -07:00 committed by Commit bot
parent b3b9aec221
commit 4e8a09d367
5 changed files with 126 additions and 85 deletions

View File

@ -10,6 +10,7 @@
#include "SkRSXform.h"
#include "SkString.h"
#include "SkNx.h"
#include "SkOpts.h"
#include <stddef.h>
@ -907,64 +908,11 @@ void SkMatrix::Identity_pts(const SkMatrix& m, SkPoint dst[], const SkPoint src[
}
// Maps `count` points from src to dst for a matrix whose type is asserted to be at most
// kTranslate_Mask.
// NOTE(review): this listing has no +/- markers, so the inline Sk4s body and the trailing
// SkOpts::matrix_translate call are shown together. The hunk header shrinks from 64 to 11 lines,
// so presumably only the assert and the SkOpts dispatch remain after the commit — confirm against
// the real file.
void SkMatrix::Trans_pts(const SkMatrix& m, SkPoint dst[], const SkPoint src[], int count) {
SkASSERT(m.getType() <= kTranslate_Mask);
if (count > 0) {
SkScalar tx = m.getTranslateX();
SkScalar ty = m.getTranslateY();
// Odd count: map one point with scalar math first.
if (count & 1) {
dst->fX = src->fX + tx;
dst->fY = src->fY + ty;
src += 1;
dst += 1;
}
Sk4s trans4(tx, ty, tx, ty);
count >>= 1;
// Odd number of point pairs: map one pair (four scalars) on its own.
if (count & 1) {
(Sk4s::Load(&src->fX) + trans4).store(&dst->fX);
src += 2;
dst += 2;
}
count >>= 1;
// Remaining points are handled four at a time, as two Sk4s operations.
for (int i = 0; i < count; ++i) {
(Sk4s::Load(&src[0].fX) + trans4).store(&dst[0].fX);
(Sk4s::Load(&src[2].fX) + trans4).store(&dst[2].fX);
src += 4;
dst += 4;
}
}
return SkOpts::matrix_translate(m,dst,src,count);
}
// Maps `count` points from src to dst for a matrix whose type is asserted to be at most
// (kScale_Mask | kTranslate_Mask).
// NOTE(review): this listing has no +/- markers, so the inline Sk4s body and the trailing
// SkOpts::matrix_scale_translate call are shown together. Presumably only the assert and the
// SkOpts dispatch remain after the commit — confirm against the real file.
void SkMatrix::Scale_pts(const SkMatrix& m, SkPoint dst[], const SkPoint src[], int count) {
SkASSERT(m.getType() <= (kScale_Mask | kTranslate_Mask));
if (count > 0) {
SkScalar tx = m.getTranslateX();
SkScalar ty = m.getTranslateY();
SkScalar sx = m.getScaleX();
SkScalar sy = m.getScaleY();
// Odd count: map one point with scalar math first.
if (count & 1) {
dst->fX = src->fX * sx + tx;
dst->fY = src->fY * sy + ty;
src += 1;
dst += 1;
}
Sk4s trans4(tx, ty, tx, ty);
Sk4s scale4(sx, sy, sx, sy);
count >>= 1;
// Odd number of point pairs: map one pair (four scalars) on its own.
if (count & 1) {
(Sk4s::Load(&src->fX) * scale4 + trans4).store(&dst->fX);
src += 2;
dst += 2;
}
count >>= 1;
// Remaining points are handled four at a time, as two Sk4s operations.
for (int i = 0; i < count; ++i) {
(Sk4s::Load(&src[0].fX) * scale4 + trans4).store(&dst[0].fX);
(Sk4s::Load(&src[2].fX) * scale4 + trans4).store(&dst[2].fX);
src += 4;
dst += 4;
}
}
return SkOpts::matrix_scale_translate(m,dst,src,count);
}
void SkMatrix::Persp_pts(const SkMatrix& m, SkPoint dst[],
@ -996,33 +944,7 @@ void SkMatrix::Persp_pts(const SkMatrix& m, SkPoint dst[],
}
// Maps `count` points from src to dst using the matrix's scale, skew and translate terms.
// NOTE(review): this listing has no +/- markers, so the inline Sk4s body and the trailing
// SkOpts::matrix_affine call are shown together. The hunk header shrinks from 33 to 7 lines, so
// presumably only the assert and the SkOpts dispatch remain after the commit — confirm against
// the real file.
void SkMatrix::Affine_vpts(const SkMatrix& m, SkPoint dst[], const SkPoint src[], int count) {
SkASSERT(m.getType() != kPerspective_Mask);
if (count > 0) {
SkScalar tx = m.getTranslateX();
SkScalar ty = m.getTranslateY();
SkScalar sx = m.getScaleX();
SkScalar sy = m.getScaleY();
SkScalar kx = m.getSkewX();
SkScalar ky = m.getSkewY();
// Odd count: map one point with scalar math first.
if (count & 1) {
dst->set(src->fX * sx + src->fY * kx + tx,
src->fX * ky + src->fY * sy + ty);
src += 1;
dst += 1;
}
Sk4s trans4(tx, ty, tx, ty);
Sk4s scale4(sx, sy, sx, sy);
Sk4s skew4(kx, ky, kx, ky); // applied to swizzle of src4
count >>= 1;
// Each iteration maps two points: x' = x*sx + y*kx + tx, y' = y*sy + x*ky + ty.
for (int i = 0; i < count; ++i) {
Sk4s src4 = Sk4s::Load(&src->fX);
Sk4s swz4(src[0].fY, src[0].fX, src[1].fY, src[1].fX); // need ABCD -> BADC
(src4 * scale4 + swz4 * skew4 + trans4).store(&dst->fX);
src += 2;
dst += 2;
}
}
return SkOpts::matrix_affine(m,dst,src,count);
}
const SkMatrix::MapPtsProc SkMatrix::gMapPtsProcs[] = {

View File

@ -14,6 +14,7 @@
#include "SkBlurImageFilter_opts.h"
#include "SkColorCubeFilter_opts.h"
#include "SkFloatingPoint_opts.h"
#include "SkMatrix_opts.h"
#include "SkMorphologyImageFilter_opts.h"
#include "SkTextureCompressor_opts.h"
#include "SkUtils_opts.h"
@ -58,6 +59,10 @@ namespace SkOpts {
decltype(blit_row_color32) blit_row_color32 = sk_default::blit_row_color32;
decltype(matrix_translate) matrix_translate = sk_default::matrix_translate;
decltype(matrix_scale_translate) matrix_scale_translate = sk_default::matrix_scale_translate;
decltype(matrix_affine) matrix_affine = sk_default::matrix_affine;
// Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
void Init_ssse3();
void Init_sse41();

View File

@ -8,6 +8,7 @@
#ifndef SkOpts_DEFINED
#define SkOpts_DEFINED
#include "SkMatrix.h"
#include "SkTextureCompressor.h"
#include "SkTypes.h"
#include "SkXfermode.h"
@ -54,6 +55,8 @@ namespace SkOpts {
const SkScalar * [2],
int,
const SkColor*);
extern SkMatrix::MapPtsProc matrix_translate, matrix_scale_translate, matrix_affine;
}
#endif//SkOpts_DEFINED

106
src/opts/SkMatrix_opts.h Normal file
View File

@ -0,0 +1,106 @@
/*
* Copyright 2015 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkMatrix_opts_DEFINED
#define SkMatrix_opts_DEFINED
#include "SkMatrix.h"
#include "SkNx.h"
namespace SK_OPTS_NS {
static void matrix_translate(const SkMatrix& m, SkPoint* dst, const SkPoint* src, int count) {
SkASSERT(m.getType() <= SkMatrix::kTranslate_Mask);
if (count > 0) {
SkScalar tx = m.getTranslateX();
SkScalar ty = m.getTranslateY();
if (count & 1) {
dst->fX = src->fX + tx;
dst->fY = src->fY + ty;
src += 1;
dst += 1;
}
Sk4s trans4(tx, ty, tx, ty);
count >>= 1;
if (count & 1) {
(Sk4s::Load(&src->fX) + trans4).store(&dst->fX);
src += 2;
dst += 2;
}
count >>= 1;
for (int i = 0; i < count; ++i) {
(Sk4s::Load(&src[0].fX) + trans4).store(&dst[0].fX);
(Sk4s::Load(&src[2].fX) + trans4).store(&dst[2].fX);
src += 4;
dst += 4;
}
}
}
static void matrix_scale_translate(const SkMatrix& m, SkPoint* dst, const SkPoint* src, int count) {
SkASSERT(m.getType() <= (SkMatrix::kScale_Mask | SkMatrix::kTranslate_Mask));
if (count > 0) {
SkScalar tx = m.getTranslateX();
SkScalar ty = m.getTranslateY();
SkScalar sx = m.getScaleX();
SkScalar sy = m.getScaleY();
if (count & 1) {
dst->fX = src->fX * sx + tx;
dst->fY = src->fY * sy + ty;
src += 1;
dst += 1;
}
Sk4s trans4(tx, ty, tx, ty);
Sk4s scale4(sx, sy, sx, sy);
count >>= 1;
if (count & 1) {
(Sk4s::Load(&src->fX) * scale4 + trans4).store(&dst->fX);
src += 2;
dst += 2;
}
count >>= 1;
for (int i = 0; i < count; ++i) {
(Sk4s::Load(&src[0].fX) * scale4 + trans4).store(&dst[0].fX);
(Sk4s::Load(&src[2].fX) * scale4 + trans4).store(&dst[2].fX);
src += 4;
dst += 4;
}
}
}
// Maps `count` points from src to dst using the scale, skew and translate terms of `m`:
//   x' = x * scaleX + y * skewX + translateX
//   y' = x * skewY  + y * scaleY + translateY
// Does nothing when count <= 0. The perspective terms of `m` are never read, so the matrix is
// asserted to have no perspective bit set.
static void matrix_affine(const SkMatrix& m, SkPoint* dst, const SkPoint* src, int count) {
    // The type is a bitmask (other callers OR the *_Mask values together), so test the bit
    // instead of comparing for equality: `!= kPerspective_Mask` would still pass for a type
    // that has the perspective bit set alongside any other bit.
    SkASSERT(!(m.getType() & SkMatrix::kPerspective_Mask));
    if (count > 0) {
        SkScalar tx = m.getTranslateX();
        SkScalar ty = m.getTranslateY();
        SkScalar sx = m.getScaleX();
        SkScalar sy = m.getScaleY();
        SkScalar kx = m.getSkewX();
        SkScalar ky = m.getSkewY();
        // Odd count: map one point with scalar math so an even number remains.
        if (count & 1) {
            dst->set(src->fX * sx + src->fY * kx + tx,
                     src->fX * ky + src->fY * sy + ty);
            src += 1;
            dst += 1;
        }
        // Each Sk4s holds two points laid out as (x0, y0, x1, y1).
        Sk4s trans4(tx, ty, tx, ty);
        Sk4s scale4(sx, sy, sx, sy);
        Sk4s skew4(kx, ky, kx, ky);    // applied to swizzle of src4
        count >>= 1;
        for (int i = 0; i < count; ++i) {
            Sk4s src4 = Sk4s::Load(&src->fX);
            // Swap x and y within each point so the skew terms multiply the other coordinate.
            Sk4s swz4(src[0].fY, src[0].fX, src[1].fY, src[1].fX);  // need ABCD -> BADC
            (src4 * scale4 + swz4 * skew4 + trans4).store(&dst->fX);
            src += 2;
            dst += 2;
        }
    }
}
} // namespace SK_OPTS_NS
#endif//SkMatrix_opts_DEFINED

View File

@ -13,6 +13,7 @@
#include "SkBlurImageFilter_opts.h"
#include "SkColorCubeFilter_opts.h"
#include "SkFloatingPoint_opts.h"
#include "SkMatrix_opts.h"
#include "SkMorphologyImageFilter_opts.h"
#include "SkTextureCompressor_opts.h"
#include "SkUtils_opts.h"
@ -42,5 +43,9 @@ namespace SkOpts {
blit_row_color32 = sk_neon::blit_row_color32;
color_cube_filter_span = sk_neon::color_cube_filter_span;
matrix_translate = sk_neon::matrix_translate;
matrix_scale_translate = sk_neon::matrix_scale_translate;
matrix_affine = sk_neon::matrix_affine;
}
}