Move test around in cubic_solver to test function, not delta_t.

Add SkOpts variant for avx2 to get FMA
Decrease tolerance now that we're testing the function

Before

  15/15  MB	1	1.13ms	1.17ms	1.18ms	1.26ms	4%	cubicmap_0_1_1_1
  15/15  MB	1	1.08ms	1.13ms	1.12ms	1.17ms	3%	cubicmap_0_1_1_0
  15/15  MB	1	862µs	904µs	900µs	937µs	3%	cubicmap_0_1_0_1
  15/15  MB	1	861µs	878µs	882µs	934µs	3%	cubicmap_0_1_0_0
  15/15  MB	1	1.44ms	1.47ms	1.49ms	1.55ms	3%	cubicmap_1_0_1_1
  15/15  MB	1	1.44ms	1.48ms	1.48ms	1.55ms	3%	cubicmap_1_0_1_0
  15/15  MB	1	1.42ms	1.42ms	1.46ms	1.53ms	3%	cubicmap_1_0_0_1
  15/15  MB	1	1.42ms	1.42ms	1.44ms	1.51ms	2%	cubicmap_1_0_0_0

After moving the check to the function, not delta_t

  15/15  MB	1	900µs	900µs	915µs	971µs	3%	cubicmap_0_1_1_1
  15/15  MB	1	899µs	900µs	914µs	988µs	3%	cubicmap_0_1_1_0
  15/15  MB	1	865µs	896µs	890µs	946µs	3%	cubicmap_0_1_0_1
  15/15  MB	1	866µs	910µs	914µs	959µs	3%	cubicmap_0_1_0_0
  15/15  MB	1	1.29ms	1.29ms	1.33ms	1.44ms	4%	cubicmap_1_0_1_1
  15/15  MB	1	1.28ms	1.29ms	1.34ms	1.54ms	6%	cubicmap_1_0_1_0
  15/15  MB	1	1.26ms	1.26ms	1.27ms	1.34ms	3%	cubicmap_1_0_0_1
  15/15  MB	1	1.26ms	1.26ms	1.27ms	1.3ms	2%	cubicmap_1_0_0_0

After SkOpts (on an avx2 machine)

  15/15  MB	1	613µs	613µs	616µs	646µs	2%	cubicmap_0_1_1_1
  15/15  MB	1	613µs	613µs	624µs	654µs	3%	cubicmap_0_1_1_0
  15/15  MB	1	862µs	865µs	867µs	887µs	1%	cubicmap_0_1_0_1
  15/15  MB	1	865µs	901µs	896µs	949µs	3%	cubicmap_0_1_0_0
  15/15  MB	1	849µs	850µs	868µs	929µs	4%	cubicmap_1_0_1_1
  15/15  MB	1	849µs	850µs	873µs	940µs	4%	cubicmap_1_0_1_0
  15/15  MB	1	831µs	831µs	856µs	950µs	5%	cubicmap_1_0_0_1
  15/15  MB	1	831µs	831µs	848µs	911µs	3%	cubicmap_1_0_0_0

(not checked in) if we also enable the pragma in cubic_solver

  15/15  MB	1	593µs	594µs	597µs	623µs	2%	cubicmap_0_1_1_1
  15/15  MB	1	593µs	595µs	605µs	629µs	2%	cubicmap_0_1_1_0
  15/15  MB	1	864µs	867µs	869µs	890µs	1%	cubicmap_0_1_0_1
  15/15  MB	1	864µs	866µs	886µs	950µs	4%	cubicmap_0_1_0_0
  15/15  MB	1	809µs	831µs	841µs	891µs	4%	cubicmap_1_0_1_1
  15/15  MB	1	809µs	810µs	855µs	1.11ms	11%	cubicmap_1_0_1_0
  15/15  MB	1	794µs	861µs	856µs	914µs	4%	cubicmap_1_0_0_1
  15/15  MB	1	794µs	821µs	818µs	853µs	3%	cubicmap_1_0_0_0

Change-Id: I260391be956d31a5cf3d0367d1285e56af7568f8
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/226499
Reviewed-by: Mike Reed <reed@google.com>
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Reed <reed@google.com>
Auto-Submit: Mike Reed <reed@google.com>
This commit is contained in:
Mike Reed 2019-07-10 12:16:39 -04:00 committed by Skia Commit-Bot
parent cc32642882
commit 3a8ff230ab
6 changed files with 92 additions and 114 deletions

View File

@ -13,6 +13,7 @@
#include "include/private/SkSafe_math.h"
#include <float.h>
#include <math.h>
#include <cmath>
#include <cstring>
#include <limits>
@ -248,4 +249,12 @@ static inline float sk_ieee_double_divide_TODO_IS_DIVIDE_BY_ZERO_SAFE_HERE(doubl
return sk_ieee_double_divide(n,d);
}
static inline float sk_fmaf(float f, float m, float a) {
#if defined(FP_FAST_FMA)
return std::fmaf(f,m,a);
#else
return f*m+a;
#endif
}
#endif

View File

@ -7,6 +7,7 @@
#include "include/core/SkCubicMap.h"
#include "include/private/SkNx.h"
#include "src/core/SkOpts.h"
//#define CUBICMAP_TRACK_MAX_ERROR
@ -14,126 +15,15 @@
#include "src/pathops/SkPathOpsCubic.h"
#endif
static float eval_poly3(float a, float b, float c, float d, float t) {
return ((a * t + b) * t + c) * t + d;
}
static float eval_poly2(float a, float b, float c, float t) {
return (a * t + b) * t + c;
}
static float eval_poly1(float a, float b, float t) {
return a * t + b;
}
static float guess_nice_cubic_root(float A, float B, float C, float D) {
return -D;
}
#ifdef SK_DEBUG
static bool valid(float r) {
return r >= 0 && r <= 1;
}
#endif
static inline bool nearly_zero(SkScalar x) {
SkASSERT(x >= 0);
return x <= 0.0000000001f;
}
static inline bool delta_nearly_zero(float delta) {
return sk_float_abs(delta) <= 0.0001f;
}
#ifdef CUBICMAP_TRACK_MAX_ERROR
static int max_iters;
#endif
/*
* TODO: will this be faster if we algebraically compute the polynomials for the numer and denom
* rather than compute them in parts?
*/
static float solve_nice_cubic_halley(float A, float B, float C, float D) {
const int MAX_ITERS = 8;
const float A3 = 3 * A;
const float B2 = B + B;
float t = guess_nice_cubic_root(A, B, C, D);
int iters = 0;
for (; iters < MAX_ITERS; ++iters) {
float f = eval_poly3(A, B, C, D, t); // f = At^3 + Bt^2 + Ct + D
float fp = eval_poly2(A3, B2, C, t); // f' = 3At^2 + 2Bt + C
float fpp = eval_poly1(A3 + A3, B2, t); // f'' = 6At + 2B
float numer = 2 * fp * f;
if (numer == 0) {
break;
}
float denom = 2 * fp * fp - f * fpp;
float delta = numer / denom;
// SkDebugf("[%d] delta %g t %g\n", iters, delta, t);
if (delta_nearly_zero(delta)) {
break;
}
float new_t = t - delta;
SkASSERT(valid(new_t));
t = new_t;
}
SkASSERT(valid(t));
#ifdef CUBICMAP_TRACK_MAX_ERROR
if (iters > max_iters) {
max_iters = iters;
SkDebugf("max_iters %d\n", max_iters);
}
#endif
return t;
}
// At the moment, this technique does not appear to be better (i.e. faster at same precision)
// but the code is left here (at least for a while) to document the attempt.
static float solve_nice_cubic_householder(float A, float B, float C, float D) {
const int MAX_ITERS = 8;
const float A3 = 3 * A;
const float B2 = B + B;
float t = guess_nice_cubic_root(A, B, C, D);
int iters = 0;
for (; iters < MAX_ITERS; ++iters) {
float f = eval_poly3(A, B, C, D, t); // f = At^3 + Bt^2 + Ct + D
float fp = eval_poly2(A3, B2, C, t); // f' = 3At^2 + 2Bt + C
float fpp = eval_poly1(A3 + A3, B2, t); // f'' = 6At + 2B
float fppp = A3 + A3; // f''' = 6A
float f2 = f * f;
float fp2 = fp * fp;
// float numer = 6 * f * fp * fp - 3 * f * f * fpp;
// float denom = 6 * fp * fp * fp - 6 * f * fp * fpp + f * f * fppp;
float numer = 6 * f * fp2 - 3 * f2 * fpp;
if (numer == 0) {
break;
}
float denom = 6 * (fp2 * fp - f * fp * fpp) + f2 * fppp;
float delta = numer / denom;
// SkDebugf("[%d] delta %g t %g\n", iters, delta, t);
if (delta_nearly_zero(delta)) {
break;
}
float new_t = t - delta;
SkASSERT(valid(new_t));
t = new_t;
}
SkASSERT(valid(t));
#ifdef CUBICMAP_TRACK_MAX_ERROR
if (iters > max_iters) {
max_iters = iters;
SkDebugf("max_iters %d\n", max_iters);
}
#endif
return t;
}
#ifdef CUBICMAP_TRACK_MAX_ERROR
static float compute_slow(float A, float B, float C, float x) {
double roots[3];
@ -149,9 +39,8 @@ static float compute_t_from_x(float A, float B, float C, float x) {
#ifdef CUBICMAP_TRACK_MAX_ERROR
float answer = compute_slow(A, B, C, x);
#endif
float answer2 = true ?
solve_nice_cubic_halley(A, B, C, -x) :
solve_nice_cubic_householder(A, B, C, -x);
float answer2 = SkOpts::cubic_solver(A, B, C, -x);
#ifdef CUBICMAP_TRACK_MAX_ERROR
float err = sk_float_abs(answer - answer2);
if (err > max_err) {

71
src/core/SkCubicSolver.h Normal file
View File

@ -0,0 +1,71 @@
/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkCubicSolver_DEFINED
#define SkCubicSolver_DEFINED
#include "include/core/SkTypes.h"
#include "include/private/SkFloatingPoint.h"
//#define CUBICMAP_TRACK_MAX_ERROR
namespace SK_OPTS_NS {
static float eval_poly(float t, float b) {
return b;
}
template <typename... Rest>
static float eval_poly(float t, float m, float b, Rest... rest) {
return eval_poly(t, sk_fmaf(m,t,b), rest...);
}
inline float cubic_solver(float A, float B, float C, float D) {
#ifdef CUBICMAP_TRACK_MAX_ERROR
static int max_iters = 0;
#endif
#ifdef SK_DEBUG
auto valid = [](float t) {
return t >= 0 && t <= 1;
};
#endif
auto guess_nice_cubic_root = [](float a, float b, float c, float d) {
return -d;
};
float t = guess_nice_cubic_root(A, B, C, D);
int iters = 0;
const int MAX_ITERS = 8;
for (; iters < MAX_ITERS; ++iters) {
SkASSERT(valid(t));
float f = eval_poly(t, A,B,C,D); // f = At^3 + Bt^2 + Ct + D
if (sk_float_abs(f) <= 0.00005f) {
break;
}
float fp = eval_poly(t, 3*A, 2*B, C); // f' = 3At^2 + 2Bt + C
float fpp = eval_poly(t, 3*A+3*A, 2*B); // f'' = 6At + 2B
float numer = 2 * fp * f;
float denom = sk_fmaf(2*fp, fp, -(f*fpp));
t -= numer / denom;
}
#ifdef CUBICMAP_TRACK_MAX_ERROR
if (max_iters < iters) {
max_iters = iters;
SkDebugf("max_iters %d\n", max_iters);
}
#endif
SkASSERT(valid(t));
return t;
}
} // namespace SK_OPTS_NS
#endif

View File

@ -45,6 +45,8 @@
#include "src/opts/SkUtils_opts.h"
#include "src/opts/SkXfermode_opts.h"
#include "src/core/SkCubicSolver.h"
namespace SkOpts {
// Define default function pointer values here...
// If our global compile options are set high enough, these defaults might even be
@ -77,6 +79,8 @@ namespace SkOpts {
DEFINE_DEFAULT(rect_memset32);
DEFINE_DEFAULT(rect_memset64);
DEFINE_DEFAULT(cubic_solver);
DEFINE_DEFAULT(hash_fn);
DEFINE_DEFAULT(S32_alpha_D32_filter_DX);

View File

@ -52,6 +52,8 @@ namespace SkOpts {
extern void (*rect_memset32)(uint32_t[], uint32_t, int, size_t, int);
extern void (*rect_memset64)(uint64_t[], uint64_t, int, size_t, int);
extern float (*cubic_solver)(float, float, float, float);
// The fastest high quality 32-bit hash we can provide on this platform.
extern uint32_t (*hash_fn)(const void*, size_t, uint32_t seed);
static inline uint32_t hash(const void* data, size_t bytes, uint32_t seed=0) {

View File

@ -11,12 +11,15 @@
#include "src/opts/SkBlitRow_opts.h"
#include "src/opts/SkRasterPipeline_opts.h"
#include "src/opts/SkUtils_opts.h"
#include "src/core/SkCubicSolver.h"
namespace SkOpts {
void Init_hsw() {
blit_row_color32 = hsw::blit_row_color32;
blit_row_s32a_opaque = hsw::blit_row_s32a_opaque;
cubic_solver = SK_OPTS_NS::cubic_solver;
#define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
SK_RASTER_PIPELINE_STAGES(M)
just_return_highp = (StageFn)SK_OPTS_NS::just_return;