skia2/src/core/SkNx.h

/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkNx_DEFINED
#define SkNx_DEFINED


#define SKNX_NO_SIMDx  // Remove the x to disable SIMD for all SkNx types.


#include "SkScalar.h"
#include "SkTypes.h"
#include <math.h>
#define REQUIRE(x) static_assert(x, #x)

// The default implementations of SkNi<N,T> and SkNf<N,T> just fall back on a pair of size N/2.
// Default SkNi<N,T>: stored as a low and a high SkNi<N/2,T>.
// For now this is a _very_ minimal sketch, offering only what the comparison
// operators on SkNf need in order to report their results.
template <int N, typename T>
class SkNi {
    typedef SkNi<N/2, T> Half;

public:
    // Default construction of both halves.
    SkNi() {}

    // Assembles a vector from an explicit low half and high half.
    SkNi(const Half& lo, const Half& hi) : fLo(lo), fHi(hi) {}

    // True only when both halves report allTrue().
    bool allTrue() const {
        if (!fLo.allTrue()) {
            return false;
        }
        return fHi.allTrue();
    }

    // True when either half reports anyTrue().
    bool anyTrue() const {
        if (fLo.anyTrue()) {
            return true;
        }
        return fHi.anyTrue();
    }

private:
    // Rejects any N with more than one bit set.
    REQUIRE(0 == (N & (N-1)));

    Half fLo, fHi;
};

// Default SkNf<N,T>: N lanes of T held as a low and a high SkNf<N/2,T>.
// Every operation below is carried out by forwarding to the two halves.
template <int N, typename T>
class SkNf {
    typedef SkNf<N/2, T> Half;

    // Declaration-only overloads. They are used solely inside decltype to select the
    // type returned by comparisons: SkNi<N,int32_t> for float, SkNi<N,int64_t> for double.
    static SkNi<N,int32_t> ToNi(float);
    static SkNi<N,int64_t> ToNi(double);
    typedef decltype(ToNi(T())) Ni;

public:
    // Default construction of both halves.
    SkNf() {}

    // Passes the same value to both halves.
    explicit SkNf(T val) : fLo(val), fHi(val) {}

    // Loads the low half from vals and the high half from vals + N/2.
    static SkNf Load(const T vals[N]) {
        const Half lo = Half::Load(vals);
        const Half hi = Half::Load(vals + N/2);
        return SkNf(lo, hi);
    }

    // Lane-by-lane constructors; each one is only usable for the N it asserts.
    SkNf(T a, T b) : fLo(a), fHi(b) {
        REQUIRE(N==2);
    }
    SkNf(T a, T b, T c, T d) : fLo(a,b), fHi(c,d) {
        REQUIRE(N==4);
    }
    SkNf(T a, T b, T c, T d, T e, T f, T g, T h) : fLo(a,b,c,d), fHi(e,f,g,h) {
        REQUIRE(N==8);
    }

    // Stores the low half at vals and the high half at vals + N/2.
    void store(T vals[N]) const {
        T* const hiDst = vals + N/2;
        fLo.store(vals);
        fHi.store(hiDst);
    }

    // Arithmetic, applied to each half separately.
    SkNf operator + (const SkNf& o) const {
        return SkNf(fLo + o.fLo, fHi + o.fHi);
    }
    SkNf operator - (const SkNf& o) const {
        return SkNf(fLo - o.fLo, fHi - o.fHi);
    }
    SkNf operator * (const SkNf& o) const {
        return SkNf(fLo * o.fLo, fHi * o.fHi);
    }
    SkNf operator / (const SkNf& o) const {
        return SkNf(fLo / o.fLo, fHi / o.fHi);
    }

    // Comparisons, applied to each half separately; the two results are joined into an Ni.
    Ni operator == (const SkNf& o) const {
        return Ni(fLo == o.fLo, fHi == o.fHi);
    }
    Ni operator != (const SkNf& o) const {
        return Ni(fLo != o.fLo, fHi != o.fHi);
    }
    Ni operator < (const SkNf& o) const {
        return Ni(fLo < o.fLo, fHi < o.fHi);
    }
    Ni operator > (const SkNf& o) const {
        return Ni(fLo > o.fLo, fHi > o.fHi);
    }
    Ni operator <= (const SkNf& o) const {
        return Ni(fLo <= o.fLo, fHi <= o.fHi);
    }
    Ni operator >= (const SkNf& o) const {
        return Ni(fLo >= o.fLo, fHi >= o.fHi);
    }

    // Min / Max of l and r, computed half by half.
    static SkNf Min(const SkNf& l, const SkNf& r) {
        const Half lo = Half::Min(l.fLo, r.fLo);
        const Half hi = Half::Min(l.fHi, r.fHi);
        return SkNf(lo, hi);
    }
    static SkNf Max(const SkNf& l, const SkNf& r) {
        const Half lo = Half::Max(l.fLo, r.fLo);
        const Half hi = Half::Max(l.fHi, r.fHi);
        return SkNf(lo, hi);
    }

    // Each of these forwards to the same-named method on both halves.
    SkNf sqrt() const {
        return SkNf(fLo.sqrt(), fHi.sqrt());
    }
    SkNf rsqrt() const {
        return SkNf(fLo.rsqrt(), fHi.rsqrt());
    }
    SkNf invert() const {
        return SkNf(fLo.invert(), fHi.invert());
    }
    SkNf approxInvert() const {
        return SkNf(fLo.approxInvert(), fHi.approxInvert());
    }

    // Returns lane k: lanes below N/2 come from the low half, the others from the high half.
    // Both calls below are instantiated whatever k is; only the selected one executes.
    template <int k> T kth() const {
        SkASSERT(0 <= k && k < N);
        if (k < N/2) {
            return fLo.template kth<k>();
        }
        return fHi.template kth<k-N/2>();
    }

private:
    // Rejects any N with more than one bit set.
    REQUIRE(0 == (N & (N-1)));

    // Joins two already-built halves; used by the methods above.
    SkNf(const Half& lo, const Half& hi) : fLo(lo), fHi(hi) {}

    Half fLo, fHi;
};


// Bottom out the default implementation with scalars when nothing's been specialized.
// Single-lane SkNi: wraps exactly one T.
template <typename T>
class SkNi<1,T> {
public:
    // Does not initialize fVal.
    SkNi() {}

    explicit SkNi(T val) : fVal(val) {}

    // With one lane, "all" and "any" are the same question: is fVal truthy?
    bool allTrue() const { return this->isSet(); }
    bool anyTrue() const { return this->isSet(); }

private:
    // fVal converted to bool.
    bool isSet() const { return static_cast<bool>(fVal); }

    T fVal;
};

// Single-lane SkNf: wraps exactly one T and implements each operation directly on it.
template <typename T>
class SkNf<1,T> {
    // Declaration-only overloads. They are used solely inside decltype to select the
    // type returned by comparisons: SkNi<1,int32_t> for float, SkNi<1,int64_t> for double.
    static SkNi<1,int32_t> ToNi(float);
    static SkNi<1,int64_t> ToNi(double);
    typedef decltype(ToNi(T())) Ni;

public:
    // Does not initialize fVal.
    SkNf() {}

    explicit SkNf(T val) : fVal(val) {}

    // Reads the single lane from vals[0].
    static SkNf Load(const T vals[1]) {
        return SkNf(*vals);
    }

    // Writes the single lane to vals[0].
    void store(T vals[1]) const {
        *vals = fVal;
    }

    // Plain arithmetic on the wrapped value.
    SkNf operator + (const SkNf& o) const {
        return SkNf(fVal + o.fVal);
    }
    SkNf operator - (const SkNf& o) const {
        return SkNf(fVal - o.fVal);
    }
    SkNf operator * (const SkNf& o) const {
        return SkNf(fVal * o.fVal);
    }
    SkNf operator / (const SkNf& o) const {
        return SkNf(fVal / o.fVal);
    }

    // Comparisons; the built-in comparison result is wrapped in an Ni.
    Ni operator == (const SkNf& o) const {
        return Ni(fVal == o.fVal);
    }
    Ni operator != (const SkNf& o) const {
        return Ni(fVal != o.fVal);
    }
    Ni operator < (const SkNf& o) const {
        return Ni(fVal < o.fVal);
    }
    Ni operator > (const SkNf& o) const {
        return Ni(fVal > o.fVal);
    }
    Ni operator <= (const SkNf& o) const {
        return Ni(fVal <= o.fVal);
    }
    Ni operator >= (const SkNf& o) const {
        return Ni(fVal >= o.fVal);
    }

    // Min / Max via SkTMin / SkTMax.
    static SkNf Min(const SkNf& l, const SkNf& r) {
        return SkNf(SkTMin(l.fVal, r.fVal));
    }
    static SkNf Max(const SkNf& l, const SkNf& r) {
        return SkNf(SkTMax(l.fVal, r.fVal));
    }

    // Square root, and one divided by the square root.
    SkNf sqrt() const {
        return SkNf(Sqrt(fVal));
    }
    SkNf rsqrt() const {
        return SkNf(static_cast<T>(1) / Sqrt(fVal));
    }

    // One divided by the value; the scalar approxInvert() is just invert().
    SkNf invert() const {
        return SkNf(static_cast<T>(1) / fVal);
    }
    SkNf approxInvert() const {
        return invert();
    }

    // The only valid lane index is 0.
    template <int k> T kth() const {
        SkASSERT(k == 0);
        return fVal;
    }

private:
    // double goes straight to ::sqrt; every other type is routed through float and ::sqrtf.
    template <typename U>
    static U Sqrt(U val) {
        return static_cast<U>(::sqrtf(static_cast<float>(val)));
    }
    static double Sqrt(double val) {
        return ::sqrt(val);
    }

    T fVal;
};


// Generic syntax sugar, written purely in terms of each type's own binary operators so that
// it works the same way for every SkNi and SkNf implementation.
// NOTE(review): these templates are unconstrained, so they are candidates for any class type
// they are visible to, not only SkNi / SkNf.

// Unary minus, computed as SkNx(0) - l.
template <typename SkNx>
SkNx operator - (const SkNx& l) {
    const SkNx zero(0);
    return zero - l;
}

// Compound assignment: evaluate the matching binary operator, assign the result to l,
// and return l.
template <typename SkNx>
SkNx& operator += (SkNx& l, const SkNx& r) {
    l = l + r;
    return l;
}
template <typename SkNx>
SkNx& operator -= (SkNx& l, const SkNx& r) {
    l = l - r;
    return l;
}
template <typename SkNx>
SkNx& operator *= (SkNx& l, const SkNx& r) {
    l = l * r;
    return l;
}
template <typename SkNx>
SkNx& operator /= (SkNx& l, const SkNx& r) {
    l = l / r;
    return l;
}


// Include platform specific specializations if available.
#ifndef SKNX_NO_SIMD
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
        #include "../opts/SkNx_sse.h"
    #elif defined(SK_ARM_HAS_NEON)
        #include "../opts/SkNx_neon.h"
    #endif
#endif

#undef REQUIRE

// Short names for the two-lane vectors.
using Sk2f = SkNf<2, float>;
using Sk2d = SkNf<2, double>;
using Sk2s = SkNf<2, SkScalar>;

// Short names for the four-lane vectors.
using Sk4f = SkNf<4, float>;
using Sk4d = SkNf<4, double>;
using Sk4s = SkNf<4, SkScalar>;

// Four lanes of int32_t.
using Sk4i = SkNi<4, int32_t>;

#endif//SkNx_DEFINED
Guard SIMD code with !defined(SKNX_NO_SIMD). This should make it easy to compare performance of the non-SIMD Sk2x / Sk4x code with our existing portable scalar code. I'm not adding this to SkPMFloat only because we don't have an existing scalar baseline there to compare to. We'll have to keep our wits about us: I just tried your new benchmarks, and Clang's autovectorizer produced almost as good SSE as we did with intrinsics for geo_evalquadat1 and geo_evalquadtangentat1, but not for geo_chopquadat1, which went serial. BUG=skia: Review URL: https://codereview.chromium.org/1026723003 2015-03-20 13:33:02 +00:00			`/*`
			`* Copyright 2015 Google Inc.`
			`*`
			`* Use of this source code is governed by a BSD-style license that can be`
			`* found in the LICENSE file.`
			`*/`

			`#ifndef SkNx_DEFINED`
			`#define SkNx_DEFINED`

Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T> The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc. This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h. This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h. To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful. You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel: - Sk4f, Sk4s, Sk2d: feel awesome - Sk2f, Sk2s, Sk4d: feel pretty good No public API changes. TBR=reed@google.com BUG=skia:3592 Review URL: https://codereview.chromium.org/1048593002 2015-03-30 17:50:27 +00:00
			`#define SKNX_NO_SIMDx // Remove the x to disable SIMD for all SkNx types.`


			`#include "SkScalar.h"`
			`#include "SkTypes.h"`
			`#include <math.h>`
			`#define REQUIRE(x) static_assert(x, #x)`

			`// The default implementations of SkNi<N,T> and SkNf<N,T> just fall back on a pair of size N/2.`
			`template <int N, typename T>`
			`class SkNi {`
			`public:`
			`// For now SkNi is a _very_ minimal sketch just to support comparison operators on SkNf.`
			`SkNi() {}`
			`SkNi(const SkNi<N/2, T>& lo, const SkNi<N/2, T>& hi) : fLo(lo), fHi(hi) {}`
			`bool allTrue() const { return fLo.allTrue() && fHi.allTrue(); }`
			`bool anyTrue() const { return fLo.anyTrue() \|\| fHi.anyTrue(); }`

			`private:`
			`REQUIRE(0 == (N & (N-1)));`
			`SkNi<N/2, T> fLo, fHi;`
			`};`

			`template <int N, typename T>`
			`class SkNf {`
			`static SkNi<N,int32_t> ToNi(float);`
			`static SkNi<N,int64_t> ToNi(double);`
			`typedef decltype(ToNi(T())) Ni;`
			`public:`
			`SkNf() {}`
			`explicit SkNf(T val) : fLo(val), fHi(val) {}`
			`static SkNf Load(const T vals[N]) {`
			`return SkNf(SkNf<N/2,T>::Load(vals), SkNf<N/2,T>::Load(vals+N/2));`
			`}`

			`SkNf(T a, T b) : fLo(a), fHi(b) { REQUIRE(N==2); }`
			`SkNf(T a, T b, T c, T d) : fLo(a,b), fHi(c,d) { REQUIRE(N==4); }`
			`SkNf(T a, T b, T c, T d, T e, T f, T g, T h) : fLo(a,b,c,d), fHi(e,f,g,h) { REQUIRE(N==8); }`

			`void store(T vals[N]) const {`
			`fLo.store(vals);`
			`fHi.store(vals+N/2);`
			`}`

			`SkNf operator + (const SkNf& o) const { return SkNf(fLo + o.fLo, fHi + o.fHi); }`
			`SkNf operator - (const SkNf& o) const { return SkNf(fLo - o.fLo, fHi - o.fHi); }`
			`SkNf operator * (const SkNf& o) const { return SkNf(fLo * o.fLo, fHi * o.fHi); }`
			`SkNf operator / (const SkNf& o) const { return SkNf(fLo / o.fLo, fHi / o.fHi); }`

			`Ni operator == (const SkNf& o) const { return Ni(fLo == o.fLo, fHi == o.fHi); }`
			`Ni operator != (const SkNf& o) const { return Ni(fLo != o.fLo, fHi != o.fHi); }`
			`Ni operator < (const SkNf& o) const { return Ni(fLo < o.fLo, fHi < o.fHi); }`
			`Ni operator > (const SkNf& o) const { return Ni(fLo > o.fLo, fHi > o.fHi); }`
			`Ni operator <= (const SkNf& o) const { return Ni(fLo <= o.fLo, fHi <= o.fHi); }`
			`Ni operator >= (const SkNf& o) const { return Ni(fLo >= o.fLo, fHi >= o.fHi); }`

			`static SkNf Min(const SkNf& l, const SkNf& r) {`
			`return SkNf(SkNf<N/2,T>::Min(l.fLo, r.fLo), SkNf<N/2,T>::Min(l.fHi, r.fHi));`
			`}`
			`static SkNf Max(const SkNf& l, const SkNf& r) {`
			`return SkNf(SkNf<N/2,T>::Max(l.fLo, r.fLo), SkNf<N/2,T>::Max(l.fHi, r.fHi));`
			`}`

			`SkNf sqrt() const { return SkNf(fLo. sqrt(), fHi. sqrt()); }`
			`SkNf rsqrt() const { return SkNf(fLo.rsqrt(), fHi.rsqrt()); }`

			`SkNf invert() const { return SkNf(fLo. invert(), fHi. invert()); }`
			`SkNf approxInvert() const { return SkNf(fLo.approxInvert(), fHi.approxInvert()); }`

Use switch operator[](int) to kth<int>() so we can use vget_lane. #floats BUG=skia: BUG=skia:3592 Review URL: https://codereview.chromium.org/1059743002 2015-04-03 13:16:13 +00:00			`template <int k> T kth() const {`
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T> The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc. This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h. This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h. To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful. You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel: - Sk4f, Sk4s, Sk2d: feel awesome - Sk2f, Sk2s, Sk4d: feel pretty good No public API changes. TBR=reed@google.com BUG=skia:3592 Review URL: https://codereview.chromium.org/1048593002 2015-03-30 17:50:27 +00:00			`SkASSERT(0 <= k && k < N);`
Use switch operator[](int) to kth<int>() so we can use vget_lane. #floats BUG=skia: BUG=skia:3592 Review URL: https://codereview.chromium.org/1059743002 2015-04-03 13:16:13 +00:00			`return k < N/2 ? fLo.template kth<k>() : fHi.template kth<k-N/2>();`
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T> The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc. This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h. This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h. To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful. You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel: - Sk4f, Sk4s, Sk2d: feel awesome - Sk2f, Sk2s, Sk4d: feel pretty good No public API changes. TBR=reed@google.com BUG=skia:3592 Review URL: https://codereview.chromium.org/1048593002 2015-03-30 17:50:27 +00:00			`}`

			`private:`
			`REQUIRE(0 == (N & (N-1)));`
			`SkNf(const SkNf<N/2, T>& lo, const SkNf<N/2, T>& hi) : fLo(lo), fHi(hi) {}`

			`SkNf<N/2, T> fLo, fHi;`
			`};`


			`// Bottom out the default implementation with scalars when nothing's been specialized.`
			`template <typename T>`
			`class SkNi<1,T> {`
			`public:`
			`SkNi() {}`
			`explicit SkNi(T val) : fVal(val) {}`
			`bool allTrue() const { return (bool)fVal; }`
			`bool anyTrue() const { return (bool)fVal; }`

			`private:`
			`T fVal;`
			`};`

			`template <typename T>`
			`class SkNf<1,T> {`
			`static SkNi<1,int32_t> ToNi(float);`
			`static SkNi<1,int64_t> ToNi(double);`
			`typedef decltype(ToNi(T())) Ni;`
			`public:`
			`SkNf() {}`
			`explicit SkNf(T val) : fVal(val) {}`
			`static SkNf Load(const T vals[1]) { return SkNf(vals[0]); }`

			`void store(T vals[1]) const { vals[0] = fVal; }`

			`SkNf operator + (const SkNf& o) const { return SkNf(fVal + o.fVal); }`
			`SkNf operator - (const SkNf& o) const { return SkNf(fVal - o.fVal); }`
			`SkNf operator * (const SkNf& o) const { return SkNf(fVal * o.fVal); }`
			`SkNf operator / (const SkNf& o) const { return SkNf(fVal / o.fVal); }`

			`Ni operator == (const SkNf& o) const { return Ni(fVal == o.fVal); }`
			`Ni operator != (const SkNf& o) const { return Ni(fVal != o.fVal); }`
			`Ni operator < (const SkNf& o) const { return Ni(fVal < o.fVal); }`
			`Ni operator > (const SkNf& o) const { return Ni(fVal > o.fVal); }`
			`Ni operator <= (const SkNf& o) const { return Ni(fVal <= o.fVal); }`
			`Ni operator >= (const SkNf& o) const { return Ni(fVal >= o.fVal); }`

			`static SkNf Min(const SkNf& l, const SkNf& r) { return SkNf(SkTMin(l.fVal, r.fVal)); }`
			`static SkNf Max(const SkNf& l, const SkNf& r) { return SkNf(SkTMax(l.fVal, r.fVal)); }`

			`SkNf sqrt() const { return SkNf(Sqrt(fVal)); }`
			`SkNf rsqrt() const { return SkNf((T)1 / Sqrt(fVal)); }`

			`SkNf invert() const { return SkNf((T)1 / fVal); }`
			`SkNf approxInvert() const { return this->invert(); }`

Use switch operator[](int) to kth<int>() so we can use vget_lane. #floats BUG=skia: BUG=skia:3592 Review URL: https://codereview.chromium.org/1059743002 2015-04-03 13:16:13 +00:00			`template <int k> T kth() const {`
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T> The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc. This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h. This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h. To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful. You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel: - Sk4f, Sk4s, Sk2d: feel awesome - Sk2f, Sk2s, Sk4d: feel pretty good No public API changes. TBR=reed@google.com BUG=skia:3592 Review URL: https://codereview.chromium.org/1048593002 2015-03-30 17:50:27 +00:00			`SkASSERT(k == 0);`
			`return fVal;`
			`}`

			`private:`
			`// We do double sqrts natively, or via floats for any other type.`
			`template <typename U>`
			`static U Sqrt(U val) { return (U) ::sqrtf((float)val); }`
			`static double Sqrt(double val) { return ::sqrt ( val); }`

			`T fVal;`
			`};`


			`// Generic syntax sugar that should work equally well for all SkNi and SkNf implementations.`
Use switch operator[](int) to kth<int>() so we can use vget_lane. #floats BUG=skia: BUG=skia:3592 Review URL: https://codereview.chromium.org/1059743002 2015-04-03 13:16:13 +00:00			`template <typename SkNx> SkNx operator - (const SkNx& l) { return SkNx(0) - l; }`
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T> The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc. This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h. This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h. To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful. You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel: - Sk4f, Sk4s, Sk2d: feel awesome - Sk2f, Sk2s, Sk4d: feel pretty good No public API changes. TBR=reed@google.com BUG=skia:3592 Review URL: https://codereview.chromium.org/1048593002 2015-03-30 17:50:27 +00:00
			`template <typename SkNx> SkNx& operator += (SkNx& l, const SkNx& r) { return (l = l + r); }`
			`template <typename SkNx> SkNx& operator -= (SkNx& l, const SkNx& r) { return (l = l - r); }`
			`template <typename SkNx> SkNx& operator *= (SkNx& l, const SkNx& r) { return (l = l * r); }`
			`template <typename SkNx> SkNx& operator /= (SkNx& l, const SkNx& r) { return (l = l / r); }`


			`// Include platform specific specializations if available.`
			`#ifndef SKNX_NO_SIMD`
			`#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2`
			`#include "../opts/SkNx_sse.h"`
			`#elif defined(SK_ARM_HAS_NEON)`
			`#include "../opts/SkNx_neon.h"`
			`#endif`
			`#endif`

			`#undef REQUIRE`

			`typedef SkNf<2, float> Sk2f;`
			`typedef SkNf<2, double> Sk2d;`
			`typedef SkNf<2, SkScalar> Sk2s;`

			`typedef SkNf<4, float> Sk4f;`
			`typedef SkNf<4, double> Sk4d;`
			`typedef SkNf<4, SkScalar> Sk4s;`

			`typedef SkNi<4, int32_t> Sk4i;`
Guard SIMD code with !defined(SKNX_NO_SIMD). This should make it easy to compare performance of the non-SIMD Sk2x / Sk4x code with our existing portable scalar code. I'm not adding this to SkPMFloat only because we don't have an existing scalar baseline there to compare to. We'll have to keep our wits about us: I just tried your new benchmarks, and Clang's autovectorizer produced almost as good SSE as we did with intrinsics for geo_evalquadat1 and geo_evalquadtangentat1, but not for geo_chopquadat1, which went serial. BUG=skia: Review URL: https://codereview.chromium.org/1026723003 2015-03-20 13:33:02 +00:00
			`#endif//SkNx_DEFINED`