Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g. to SkNf<8, SkScalar> == Sk8s, and to grow down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
/*
|
|
|
|
* Copyright 2015 Google Inc.
|
|
|
|
*
|
|
|
|
* Use of this source code is governed by a BSD-style license that can be
|
|
|
|
* found in the LICENSE file.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef SkNx_neon_DEFINED
|
|
|
|
#define SkNx_neon_DEFINED
|
|
|
|
|
2016-06-09 20:40:56 +00:00
|
|
|
#include <arm_neon.h>
|
|
|
|
|
2015-12-03 17:15:25 +00:00
|
|
|
#define SKNX_IS_FAST
|
|
|
|
|
2016-02-09 23:41:36 +00:00
|
|
|
// ARMv8 has vrndmq_f32 to floor 4 floats. Here we emulate it:
|
2016-02-10 15:55:56 +00:00
|
|
|
// - roundtrip through integers via truncation
|
|
|
|
// - subtract 1 if that's too big (possible for negative values).
|
|
|
|
// This restricts the domain of our inputs to a maximum somewhere around 2^31. Seems plenty big.
|
2016-02-09 23:41:36 +00:00
|
|
|
// Floors each of the four float lanes of v without using the ARMv8 rounding instruction.
// Inputs must fit in a signed 32-bit integer, since the value is round-tripped through one.
static inline float32x4_t armv7_vrndmq_f32(float32x4_t v) {
    // float -> int32 -> float truncates toward zero.
    const float32x4_t truncated = vcvtq_f32_s32(vcvtq_s32_f32(v));

    // All-ones in every lane where truncation landed above the input.
    const uint32x4_t overshoot = vcgtq_f32(truncated, v);

    // Keep the bit pattern of 1.0f only in the overshooting lanes; the rest become 0.0f.
    const uint32x4_t  one_bits   = vreinterpretq_u32_f32(vdupq_n_f32(1));
    const float32x4_t correction = vreinterpretq_f32_u32(vandq_u32(overshoot, one_bits));

    return vsubq_f32(truncated, correction);
}
|
|
|
|
|
2015-05-12 13:11:21 +00:00
|
|
|
// Well, this is absurd. The shifts require compile-time constant arguments.
|
|
|
|
|
|
|
|
// Dispatches a run-time shift count in [1,7] to op(v, N) with a literal N.
// Expands to statements that always return: any other count (including 0) returns fVec,
// so the expansion site must have a name `fVec` in scope.
#define SHIFT8(op, v, bits) switch(bits) { \
    case 1: return op(v, 1);               \
    case 2: return op(v, 2);               \
    case 3: return op(v, 3);               \
    case 4: return op(v, 4);               \
    case 5: return op(v, 5);               \
    case 6: return op(v, 6);               \
    case 7: return op(v, 7);               \
    } return fVec
|
|
|
|
|
|
|
|
// Like SHIFT8, but accepts shift counts in [1,15]: counts below 8 are delegated to SHIFT8,
// counts 8..15 are dispatched here, and anything else returns fVec.
#define SHIFT16(op, v, bits) if (bits < 8) { SHIFT8(op, v, bits); } switch(bits) { \
    case  8: return op(v,  8);                                                     \
    case  9: return op(v,  9);                                                     \
    case 10: return op(v, 10);                                                     \
    case 11: return op(v, 11);                                                     \
    case 12: return op(v, 12);                                                     \
    case 13: return op(v, 13);                                                     \
    case 14: return op(v, 14);                                                     \
    case 15: return op(v, 15);                                                     \
    } return fVec
|
|
|
|
|
|
|
|
// Like SHIFT16, but accepts shift counts in [1,31]: counts below 16 are delegated to SHIFT16,
// counts 16..31 are dispatched here, and anything else returns fVec.
#define SHIFT32(op, v, bits) if (bits < 16) { SHIFT16(op, v, bits); } switch(bits) { \
    case 16: return op(v, 16);                                                       \
    case 17: return op(v, 17);                                                       \
    case 18: return op(v, 18);                                                       \
    case 19: return op(v, 19);                                                       \
    case 20: return op(v, 20);                                                       \
    case 21: return op(v, 21);                                                       \
    case 22: return op(v, 22);                                                       \
    case 23: return op(v, 23);                                                       \
    case 24: return op(v, 24);                                                       \
    case 25: return op(v, 25);                                                       \
    case 26: return op(v, 26);                                                       \
    case 27: return op(v, 27);                                                       \
    case 28: return op(v, 28);                                                       \
    case 29: return op(v, 29);                                                       \
    case 30: return op(v, 30);                                                       \
    case 31: return op(v, 31);                                                       \
    } return fVec
|
|
|
|
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g. to SkNf<8, SkScalar> == Sk8s, and to grow down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
template <>
|
2015-11-20 21:53:19 +00:00
|
|
|
class SkNx<2, float> {
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
public:
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx(float32x2_t vec) : fVec(vec) {}
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx() {}
|
|
|
|
SkNx(float val) : fVec(vdup_n_f32(val)) {}
|
2016-01-31 16:02:47 +00:00
|
|
|
static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); }
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx(float a, float b) { fVec = (float32x2_t) { a, b }; }
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
|
2016-01-31 16:02:47 +00:00
|
|
|
void store(void* ptr) const { vst1_f32((float*)ptr, fVec); }
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
|
2016-03-21 17:04:46 +00:00
|
|
|
SkNx invert() const {
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
float32x2_t est0 = vrecpe_f32(fVec),
|
|
|
|
est1 = vmul_f32(vrecps_f32(est0, fVec), est0);
|
|
|
|
return est1;
|
|
|
|
}
|
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx operator + (const SkNx& o) const { return vadd_f32(fVec, o.fVec); }
|
|
|
|
SkNx operator - (const SkNx& o) const { return vsub_f32(fVec, o.fVec); }
|
|
|
|
SkNx operator * (const SkNx& o) const { return vmul_f32(fVec, o.fVec); }
|
|
|
|
SkNx operator / (const SkNx& o) const {
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
#if defined(SK_CPU_ARM64)
|
|
|
|
return vdiv_f32(fVec, o.fVec);
|
|
|
|
#else
|
2016-03-21 17:04:46 +00:00
|
|
|
float32x2_t est0 = vrecpe_f32(o.fVec),
|
|
|
|
est1 = vmul_f32(vrecps_f32(est0, o.fVec), est0),
|
|
|
|
est2 = vmul_f32(vrecps_f32(est1, o.fVec), est1);
|
|
|
|
return vmul_f32(fVec, est2);
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx operator == (const SkNx& o) const { return vreinterpret_f32_u32(vceq_f32(fVec, o.fVec)); }
|
|
|
|
SkNx operator < (const SkNx& o) const { return vreinterpret_f32_u32(vclt_f32(fVec, o.fVec)); }
|
|
|
|
SkNx operator > (const SkNx& o) const { return vreinterpret_f32_u32(vcgt_f32(fVec, o.fVec)); }
|
|
|
|
SkNx operator <= (const SkNx& o) const { return vreinterpret_f32_u32(vcle_f32(fVec, o.fVec)); }
|
|
|
|
SkNx operator >= (const SkNx& o) const { return vreinterpret_f32_u32(vcge_f32(fVec, o.fVec)); }
|
|
|
|
SkNx operator != (const SkNx& o) const {
|
Implement four more xfermodes with Sk4px.
HardLight, Overlay, Darken, and Lighten are all
~2x faster with SSE, ~25% faster with NEON.
This covers all previously-implemented NEON xfermodes.
3 previous SSE xfermodes remain. Those need division
and sqrt, so I'm planning on using SkPMFloat for them.
It'll help the readability and NEON speed if I move that
into [0,1] space first.
The main new concept here is c.thenElse(t,e), which behaves like
(c ? t : e) except, of course, both t and e are evaluated. This allows
us to emulate conditionals with vectors.
This also removes the concept of SkNb. Instead of a standalone bool
vector, each SkNi or SkNf will just return their own types for
comparisons. Turns out to be a lot more manageable this way.
BUG=skia:
Committed: https://skia.googlesource.com/skia/+/b9d4163bebab0f5639f9c5928bb5fc15f472dddc
CQ_EXTRA_TRYBOTS=client.skia.compile:Build-Ubuntu-GCC-Arm64-Debug-Android-Trybot
Review URL: https://codereview.chromium.org/1196713004
2015-06-24 22:18:39 +00:00
|
|
|
return vreinterpret_f32_u32(vmvn_u32(vceq_f32(fVec, o.fVec)));
|
|
|
|
}
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
static SkNx Min(const SkNx& l, const SkNx& r) { return vmin_f32(l.fVec, r.fVec); }
|
|
|
|
static SkNx Max(const SkNx& l, const SkNx& r) { return vmax_f32(l.fVec, r.fVec); }
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
|
2016-03-21 17:04:46 +00:00
|
|
|
SkNx rsqrt() const {
|
|
|
|
float32x2_t est0 = vrsqrte_f32(fVec);
|
2015-04-27 21:22:32 +00:00
|
|
|
return vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0);
|
|
|
|
}
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx sqrt() const {
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
#if defined(SK_CPU_ARM64)
|
|
|
|
return vsqrt_f32(fVec);
|
|
|
|
#else
|
2016-03-21 17:04:46 +00:00
|
|
|
float32x2_t est0 = vrsqrte_f32(fVec),
|
|
|
|
est1 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0),
|
|
|
|
est2 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est1, est1)), est1);
|
|
|
|
return vmul_f32(fVec, est2);
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
sknx refactoring
- trim unused specializations (Sk4i, Sk2d) and apis (SkNx_dup)
- expand apis a little
* v[0] == v.kth<0>()
* SkNx_shuffle can now convert to different-sized vectors, e.g. Sk2f <-> Sk4f
- remove anonymous namespace
I believe it's safe to remove the anonymous namespace right now.
We're worried about violating the One Definition Rule; the anonymous namespace protected us from that.
In Release builds, this is mostly moot, as everything tends to inline completely.
In Debug builds, violating the ODR is at worst an inconvenience, time spent trying to figure out why the bot is broken.
Now that we're building with SSE2/NEON everywhere, very few bots have even a chance about getting confused by two definitions of the same type or function. Where we do compile variants depending on, e.g., SSSE3, we do so in static inline functions. These are not subject to the ODR.
I plan to follow up with a tedious .kth<...>() -> [...] auto-replace.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1683543002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1683543002
2016-02-09 18:35:27 +00:00
|
|
|
float operator[](int k) const {
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
SkASSERT(0 <= k && k < 2);
|
sknx refactoring
- trim unused specializations (Sk4i, Sk2d) and apis (SkNx_dup)
- expand apis a little
* v[0] == v.kth<0>()
* SkNx_shuffle can now convert to different-sized vectors, e.g. Sk2f <-> Sk4f
- remove anonymous namespace
I believe it's safe to remove the anonymous namespace right now.
We're worried about violating the One Definition Rule; the anonymous namespace protected us from that.
In Release builds, this is mostly moot, as everything tends to inline completely.
In Debug builds, violating the ODR is at worst an inconvenience, time spent trying to figure out why the bot is broken.
Now that we're building with SSE2/NEON everywhere, very few bots have even a chance about getting confused by two definitions of the same type or function. Where we do compile variants depending on, e.g., SSSE3, we do so in static inline functions. These are not subject to the ODR.
I plan to follow up with a tedious .kth<...>() -> [...] auto-replace.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1683543002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1683543002
2016-02-09 18:35:27 +00:00
|
|
|
union { float32x2_t v; float fs[2]; } pun = {fVec};
|
|
|
|
return pun.fs[k&1];
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
}
|
|
|
|
|
Implement four more xfermodes with Sk4px.
HardLight, Overlay, Darken, and Lighten are all
~2x faster with SSE, ~25% faster with NEON.
This covers all previously-implemented NEON xfermodes.
3 previous SSE xfermodes remain. Those need division
and sqrt, so I'm planning on using SkPMFloat for them.
It'll help the readability and NEON speed if I move that
into [0,1] space first.
The main new concept here is c.thenElse(t,e), which behaves like
(c ? t : e) except, of course, both t and e are evaluated. This allows
us to emulate conditionals with vectors.
This also removes the concept of SkNb. Instead of a standalone bool
vector, each SkNi or SkNf will just return their own types for
comparisons. Turns out to be a lot more manageable this way.
BUG=skia:
Committed: https://skia.googlesource.com/skia/+/b9d4163bebab0f5639f9c5928bb5fc15f472dddc
CQ_EXTRA_TRYBOTS=client.skia.compile:Build-Ubuntu-GCC-Arm64-Debug-Android-Trybot
Review URL: https://codereview.chromium.org/1196713004
2015-06-24 22:18:39 +00:00
|
|
|
bool allTrue() const {
|
|
|
|
auto v = vreinterpret_u32_f32(fVec);
|
|
|
|
return vget_lane_u32(v,0) && vget_lane_u32(v,1);
|
|
|
|
}
|
|
|
|
bool anyTrue() const {
|
|
|
|
auto v = vreinterpret_u32_f32(fVec);
|
|
|
|
return vget_lane_u32(v,0) || vget_lane_u32(v,1);
|
|
|
|
}
|
|
|
|
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
float32x2_t fVec;
|
|
|
|
};
|
|
|
|
|
|
|
|
template <>
|
2015-11-20 21:53:19 +00:00
|
|
|
class SkNx<4, float> {
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
public:
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx(float32x4_t vec) : fVec(vec) {}
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx() {}
|
|
|
|
SkNx(float val) : fVec(vdupq_n_f32(val)) {}
|
2016-01-31 16:02:47 +00:00
|
|
|
static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); }
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; }
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
|
2016-01-31 16:02:47 +00:00
|
|
|
void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); }
|
2016-03-21 17:04:46 +00:00
|
|
|
SkNx invert() const {
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
float32x4_t est0 = vrecpeq_f32(fVec),
|
|
|
|
est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0);
|
|
|
|
return est1;
|
|
|
|
}
|
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx operator + (const SkNx& o) const { return vaddq_f32(fVec, o.fVec); }
|
|
|
|
SkNx operator - (const SkNx& o) const { return vsubq_f32(fVec, o.fVec); }
|
|
|
|
SkNx operator * (const SkNx& o) const { return vmulq_f32(fVec, o.fVec); }
|
|
|
|
SkNx operator / (const SkNx& o) const {
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
#if defined(SK_CPU_ARM64)
|
|
|
|
return vdivq_f32(fVec, o.fVec);
|
|
|
|
#else
|
2016-03-21 17:04:46 +00:00
|
|
|
float32x4_t est0 = vrecpeq_f32(o.fVec),
|
|
|
|
est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0),
|
|
|
|
est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1);
|
|
|
|
return vmulq_f32(fVec, est2);
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx operator==(const SkNx& o) const { return vreinterpretq_f32_u32(vceqq_f32(fVec, o.fVec)); }
|
|
|
|
SkNx operator <(const SkNx& o) const { return vreinterpretq_f32_u32(vcltq_f32(fVec, o.fVec)); }
|
|
|
|
SkNx operator >(const SkNx& o) const { return vreinterpretq_f32_u32(vcgtq_f32(fVec, o.fVec)); }
|
|
|
|
SkNx operator<=(const SkNx& o) const { return vreinterpretq_f32_u32(vcleq_f32(fVec, o.fVec)); }
|
|
|
|
SkNx operator>=(const SkNx& o) const { return vreinterpretq_f32_u32(vcgeq_f32(fVec, o.fVec)); }
|
|
|
|
SkNx operator!=(const SkNx& o) const {
|
Implement four more xfermodes with Sk4px.
HardLight, Overlay, Darken, and Lighten are all
~2x faster with SSE, ~25% faster with NEON.
This covers all previously-implemented NEON xfermodes.
3 previous SSE xfermodes remain. Those need division
and sqrt, so I'm planning on using SkPMFloat for them.
It'll help the readability and NEON speed if I move that
into [0,1] space first.
The main new concept here is c.thenElse(t,e), which behaves like
(c ? t : e) except, of course, both t and e are evaluated. This allows
us to emulate conditionals with vectors.
This also removes the concept of SkNb. Instead of a standalone bool
vector, each SkNi or SkNf will just return their own types for
comparisons. Turns out to be a lot more manageable this way.
BUG=skia:
Committed: https://skia.googlesource.com/skia/+/b9d4163bebab0f5639f9c5928bb5fc15f472dddc
CQ_EXTRA_TRYBOTS=client.skia.compile:Build-Ubuntu-GCC-Arm64-Debug-Android-Trybot
Review URL: https://codereview.chromium.org/1196713004
2015-06-24 22:18:39 +00:00
|
|
|
return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec)));
|
|
|
|
}
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
static SkNx Min(const SkNx& l, const SkNx& r) { return vminq_f32(l.fVec, r.fVec); }
|
|
|
|
static SkNx Max(const SkNx& l, const SkNx& r) { return vmaxq_f32(l.fVec, r.fVec); }
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
|
2016-01-15 20:16:40 +00:00
|
|
|
SkNx abs() const { return vabsq_f32(fVec); }
|
2016-02-09 23:41:36 +00:00
|
|
|
SkNx floor() const {
|
|
|
|
#if defined(SK_CPU_ARM64)
|
|
|
|
return vrndmq_f32(fVec);
|
|
|
|
#else
|
|
|
|
return armv7_vrndmq_f32(fVec);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2016-01-15 20:16:40 +00:00
|
|
|
|
2016-03-21 17:04:46 +00:00
|
|
|
SkNx rsqrt() const {
|
|
|
|
float32x4_t est0 = vrsqrteq_f32(fVec);
|
2015-04-27 21:22:32 +00:00
|
|
|
return vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0);
|
|
|
|
}
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx sqrt() const {
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
#if defined(SK_CPU_ARM64)
|
|
|
|
return vsqrtq_f32(fVec);
|
|
|
|
#else
|
2016-03-21 17:04:46 +00:00
|
|
|
float32x4_t est0 = vrsqrteq_f32(fVec),
|
|
|
|
est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0),
|
|
|
|
est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
|
|
|
|
return vmulq_f32(fVec, est2);
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
sknx refactoring
- trim unused specializations (Sk4i, Sk2d) and apis (SkNx_dup)
- expand apis a little
* v[0] == v.kth<0>()
* SkNx_shuffle can now convert to different-sized vectors, e.g. Sk2f <-> Sk4f
- remove anonymous namespace
I believe it's safe to remove the anonymous namespace right now.
We're worried about violating the One Definition Rule; the anonymous namespace protected us from that.
In Release builds, this is mostly moot, as everything tends to inline completely.
In Debug builds, violating the ODR is at worst an inconvenience, time spent trying to figure out why the bot is broken.
Now that we're building with SSE2/NEON everywhere, very few bots have even a chance about getting confused by two definitions of the same type or function. Where we do compile variants depending on, e.g., SSSE3, we do so in static inline functions. These are not subject to the ODR.
I plan to follow up with a tedious .kth<...>() -> [...] auto-replace.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1683543002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1683543002
2016-02-09 18:35:27 +00:00
|
|
|
float operator[](int k) const {
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
SkASSERT(0 <= k && k < 4);
|
sknx refactoring
- trim unused specializations (Sk4i, Sk2d) and apis (SkNx_dup)
- expand apis a little
* v[0] == v.kth<0>()
* SkNx_shuffle can now convert to different-sized vectors, e.g. Sk2f <-> Sk4f
- remove anonymous namespace
I believe it's safe to remove the anonymous namespace right now.
We're worried about violating the One Definition Rule; the anonymous namespace protected us from that.
In Release builds, this is mostly moot, as everything tends to inline completely.
In Debug builds, violating the ODR is at worst an inconvenience, time spent trying to figure out why the bot is broken.
Now that we're building with SSE2/NEON everywhere, very few bots have even a chance about getting confused by two definitions of the same type or function. Where we do compile variants depending on, e.g., SSSE3, we do so in static inline functions. These are not subject to the ODR.
I plan to follow up with a tedious .kth<...>() -> [...] auto-replace.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1683543002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1683543002
2016-02-09 18:35:27 +00:00
|
|
|
union { float32x4_t v; float fs[4]; } pun = {fVec};
|
|
|
|
return pun.fs[k&3];
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
}
|
|
|
|
|
Implement four more xfermodes with Sk4px.
HardLight, Overlay, Darken, and Lighten are all
~2x faster with SSE, ~25% faster with NEON.
This covers all previously-implemented NEON xfermodes.
3 previous SSE xfermodes remain. Those need division
and sqrt, so I'm planning on using SkPMFloat for them.
It'll help the readability and NEON speed if I move that
into [0,1] space first.
The main new concept here is c.thenElse(t,e), which behaves like
(c ? t : e) except, of course, both t and e are evaluated. This allows
us to emulate conditionals with vectors.
This also removes the concept of SkNb. Instead of a standalone bool
vector, each SkNi or SkNf will just return their own types for
comparisons. Turns out to be a lot more manageable this way.
BUG=skia:
Committed: https://skia.googlesource.com/skia/+/b9d4163bebab0f5639f9c5928bb5fc15f472dddc
CQ_EXTRA_TRYBOTS=client.skia.compile:Build-Ubuntu-GCC-Arm64-Debug-Android-Trybot
Review URL: https://codereview.chromium.org/1196713004
2015-06-24 22:18:39 +00:00
|
|
|
bool allTrue() const {
|
|
|
|
auto v = vreinterpretq_u32_f32(fVec);
|
|
|
|
return vgetq_lane_u32(v,0) && vgetq_lane_u32(v,1)
|
|
|
|
&& vgetq_lane_u32(v,2) && vgetq_lane_u32(v,3);
|
|
|
|
}
|
|
|
|
bool anyTrue() const {
|
|
|
|
auto v = vreinterpretq_u32_f32(fVec);
|
|
|
|
return vgetq_lane_u32(v,0) || vgetq_lane_u32(v,1)
|
|
|
|
|| vgetq_lane_u32(v,2) || vgetq_lane_u32(v,3);
|
|
|
|
}
|
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx thenElse(const SkNx& t, const SkNx& e) const {
|
2015-07-27 13:12:05 +00:00
|
|
|
return vbslq_f32(vreinterpretq_u32_f32(fVec), t.fVec, e.fVec);
|
Color dodge and burn with SkPMFloat.
Both 25-35% faster with SSE.
With NEON, Burn measures as a ~10% regression, Dodge a huge 2.9x improvement.
The Burn regression is somewhat artificial: we're drawing random colored rects onto an opaque white dst, so we're heavily biased toward the (d==da) fast path in the serial code. In the vector code there's no short-circuiting and we always pay a fixed cost for ColorBurn regardless of src or dst content.
Dodge's fast paths, in contrast, only trigger when (s==sa) or (d==0), neither of which happens any more than randomly in our benchmark. I don't think (d==0) should happen at all. Similarly, the (s==0) Burn fast path is really only going to happen as often as SkRandom allows.
In practice, the existing Burn benchmark is hitting its fast path 100% of the time. So I actually feel really great that this only dings the benchmark by 10%.
Chrome's still guarded by SK_SUPPORT_LEGACY_XFERMODES, which I'll lift after finishing the last xfermode, SoftLight.
BUG=skia:
Review URL: https://codereview.chromium.org/1214443002
2015-06-26 17:46:31 +00:00
|
|
|
}
|
|
|
|
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g to SkNf<8, SkScalar> == Sk8s, and to grown down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedef Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
float32x4_t fVec;
|
|
|
|
};
|
|
|
|
|
2016-01-20 19:55:51 +00:00
|
|
|
// It's possible that for our current use cases, representing this as
|
|
|
|
// half a uint16x8_t might be better than representing it as a uint16x4_t.
|
|
|
|
// It'd make conversion to Sk4b one step simpler.
|
|
|
|
template <>
|
|
|
|
class SkNx<4, uint16_t> {
|
|
|
|
public:
|
|
|
|
SkNx(const uint16x4_t& vec) : fVec(vec) {}
|
|
|
|
|
|
|
|
SkNx() {}
|
|
|
|
SkNx(uint16_t val) : fVec(vdup_n_u16(val)) {}
|
2016-01-31 16:02:47 +00:00
|
|
|
static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); }
|
2016-01-20 19:55:51 +00:00
|
|
|
|
|
|
|
SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
|
|
|
|
fVec = (uint16x4_t) { a,b,c,d };
|
|
|
|
}
|
|
|
|
|
2016-01-31 16:02:47 +00:00
|
|
|
void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); }
|
2016-01-20 19:55:51 +00:00
|
|
|
|
|
|
|
SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); }
|
|
|
|
SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); }
|
|
|
|
SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); }
|
|
|
|
|
|
|
|
SkNx operator << (int bits) const { SHIFT16(vshl_n_u16, fVec, bits); }
|
|
|
|
SkNx operator >> (int bits) const { SHIFT16(vshr_n_u16, fVec, bits); }
|
|
|
|
|
|
|
|
static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fVec); }
|
|
|
|
|
sknx refactoring
- trim unused specializations (Sk4i, Sk2d) and apis (SkNx_dup)
- expand apis a little
* v[0] == v.kth<0>()
* SkNx_shuffle can now convert to different-sized vectors, e.g. Sk2f <-> Sk4f
- remove anonymous namespace
I believe it's safe to remove the anonymous namespace right now.
We're worried about violating the One Definition Rule; the anonymous namespace protected us from that.
In Release builds, this is mostly moot, as everything tends to inline completely.
In Debug builds, violating the ODR is at worst an inconvenience, time spent trying to figure out why the bot is broken.
Now that we're building with SSE2/NEON everywhere, very few bots have even a chance about getting confused by two definitions of the same type or function. Where we do compile variants depending on, e.g., SSSE3, we do so in static inline functions. These are not subject to the ODR.
I plan to follow up with a tedious .kth<...>() -> [...] auto-replace.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1683543002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1683543002
2016-02-09 18:35:27 +00:00
|
|
|
uint16_t operator[](int k) const {
|
2016-01-20 19:55:51 +00:00
|
|
|
SkASSERT(0 <= k && k < 4);
|
sknx refactoring
- trim unused specializations (Sk4i, Sk2d) and apis (SkNx_dup)
- expand apis a little
* v[0] == v.kth<0>()
* SkNx_shuffle can now convert to different-sized vectors, e.g. Sk2f <-> Sk4f
- remove anonymous namespace
I believe it's safe to remove the anonymous namespace right now.
We're worried about violating the One Definition Rule; the anonymous namespace protected us from that.
In Release builds, this is mostly moot, as everything tends to inline completely.
In Debug builds, violating the ODR is at worst an inconvenience, time spent trying to figure out why the bot is broken.
Now that we're building with SSE2/NEON everywhere, very few bots have even a chance about getting confused by two definitions of the same type or function. Where we do compile variants depending on, e.g., SSSE3, we do so in static inline functions. These are not subject to the ODR.
I plan to follow up with a tedious .kth<...>() -> [...] auto-replace.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1683543002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1683543002
2016-02-09 18:35:27 +00:00
|
|
|
union { uint16x4_t v; uint16_t us[4]; } pun = {fVec};
|
|
|
|
return pun.us[k&3];
|
2016-01-20 19:55:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
SkNx thenElse(const SkNx& t, const SkNx& e) const {
|
|
|
|
return vbsl_u16(fVec, t.fVec, e.fVec);
|
|
|
|
}
|
|
|
|
|
|
|
|
uint16x4_t fVec;
|
|
|
|
};
|
|
|
|
|
2015-05-12 13:11:21 +00:00
|
|
|
template <>
|
2015-11-20 21:53:19 +00:00
|
|
|
class SkNx<8, uint16_t> {
|
2015-05-12 13:11:21 +00:00
|
|
|
public:
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx(const uint16x8_t& vec) : fVec(vec) {}
|
2015-05-12 13:11:21 +00:00
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
// Default constructor: fVec is left uninitialized.
SkNx() {}
|
|
|
|
// Broadcasts val into all eight lanes.
SkNx(uint16_t val) {
    fVec = vdupq_n_u16(val);
}
|
2016-01-31 16:02:47 +00:00
|
|
|
// Loads eight consecutive uint16_t values starting at ptr.
static SkNx Load(const void* ptr) {
    const uint16_t* src = static_cast<const uint16_t*>(ptr);
    return vld1q_u16(src);
}
|
2015-05-12 13:11:21 +00:00
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
|
2015-05-12 13:11:21 +00:00
|
|
|
uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
|
|
|
|
fVec = (uint16x8_t) { a,b,c,d, e,f,g,h };
|
|
|
|
}
|
|
|
|
|
2016-01-31 16:02:47 +00:00
|
|
|
void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); }
|
2015-05-12 13:11:21 +00:00
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); }
|
|
|
|
SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); }
|
|
|
|
SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); }
|
2015-05-12 13:11:21 +00:00
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); }
|
|
|
|
SkNx operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); }
|
2015-05-12 13:11:21 +00:00
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); }
|
2015-05-15 00:53:04 +00:00
|
|
|
|
sknx refactoring
- trim unused specializations (Sk4i, Sk2d) and apis (SkNx_dup)
- expand apis a little
* v[0] == v.kth<0>()
* SkNx_shuffle can now convert to different-sized vectors, e.g. Sk2f <-> Sk4f
- remove anonymous namespace
I believe it's safe to remove the anonymous namespace right now.
We're worried about violating the One Definition Rule; the anonymous namespace protected us from that.
In Release builds, this is mostly moot, as everything tends to inline completely.
In Debug builds, violating the ODR is at worst an inconvenience, time spent trying to figure out why the bot is broken.
Now that we're building with SSE2/NEON everywhere, very few bots have even a chance about getting confused by two definitions of the same type or function. Where we do compile variants depending on, e.g., SSSE3, we do so in static inline functions. These are not subject to the ODR.
I plan to follow up with a tedious .kth<...>() -> [...] auto-replace.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1683543002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1683543002
2016-02-09 18:35:27 +00:00
|
|
|
uint16_t operator[](int k) const {
|
2015-05-12 13:11:21 +00:00
|
|
|
SkASSERT(0 <= k && k < 8);
|
sknx refactoring
- trim unused specializations (Sk4i, Sk2d) and apis (SkNx_dup)
- expand apis a little
* v[0] == v.kth<0>()
* SkNx_shuffle can now convert to different-sized vectors, e.g. Sk2f <-> Sk4f
- remove anonymous namespace
I believe it's safe to remove the anonymous namespace right now.
We're worried about violating the One Definition Rule; the anonymous namespace protected us from that.
In Release builds, this is mostly moot, as everything tends to inline completely.
In Debug builds, violating the ODR is at worst an inconvenience, time spent trying to figure out why the bot is broken.
Now that we're building with SSE2/NEON everywhere, very few bots have even a chance about getting confused by two definitions of the same type or function. Where we do compile variants depending on, e.g., SSSE3, we do so in static inline functions. These are not subject to the ODR.
I plan to follow up with a tedious .kth<...>() -> [...] auto-replace.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1683543002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1683543002
2016-02-09 18:35:27 +00:00
|
|
|
union { uint16x8_t v; uint16_t us[8]; } pun = {fVec};
|
|
|
|
return pun.us[k&7];
|
2015-05-12 13:11:21 +00:00
|
|
|
}
|
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx thenElse(const SkNx& t, const SkNx& e) const {
|
2015-07-27 13:12:05 +00:00
|
|
|
return vbslq_u16(fVec, t.fVec, e.fVec);
|
2015-07-14 17:54:19 +00:00
|
|
|
}
|
|
|
|
|
2015-05-12 13:11:21 +00:00
|
|
|
uint16x8_t fVec;
|
|
|
|
};
|
|
|
|
|
2015-12-14 19:25:18 +00:00
|
|
|
template <>
|
|
|
|
class SkNx<4, uint8_t> {
|
|
|
|
public:
|
|
|
|
SkNx(const uint8x8_t& vec) : fVec(vec) {}
|
|
|
|
|
|
|
|
SkNx() {}
|
2016-03-01 15:01:23 +00:00
|
|
|
SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
|
|
|
|
fVec = (uint8x8_t){a,b,c,d, 0,0,0,0};
|
|
|
|
}
|
2016-01-31 16:02:47 +00:00
|
|
|
static SkNx Load(const void* ptr) {
|
|
|
|
return (uint8x8_t)vld1_dup_u32((const uint32_t*)ptr);
|
2015-12-14 19:25:18 +00:00
|
|
|
}
|
2016-01-31 16:02:47 +00:00
|
|
|
void store(void* ptr) const {
|
|
|
|
return vst1_lane_u32((uint32_t*)ptr, (uint32x2_t)fVec, 0);
|
2015-12-14 19:25:18 +00:00
|
|
|
}
|
2016-03-01 15:01:23 +00:00
|
|
|
uint8_t operator[](int k) const {
|
|
|
|
SkASSERT(0 <= k && k < 4);
|
|
|
|
union { uint8x8_t v; uint8_t us[8]; } pun = {fVec};
|
|
|
|
return pun.us[k&3];
|
|
|
|
}
|
2015-12-14 19:25:18 +00:00
|
|
|
|
|
|
|
// TODO as needed
|
|
|
|
|
|
|
|
uint8x8_t fVec;
|
|
|
|
};
|
|
|
|
|
2015-05-12 13:11:21 +00:00
|
|
|
template <>
|
2015-11-20 21:53:19 +00:00
|
|
|
class SkNx<16, uint8_t> {
|
2015-05-12 13:11:21 +00:00
|
|
|
public:
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx(const uint8x16_t& vec) : fVec(vec) {}
|
2015-05-12 13:11:21 +00:00
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx() {}
|
|
|
|
SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {}
|
2016-01-31 16:02:47 +00:00
|
|
|
static SkNx Load(const void* ptr) { return vld1q_u8((const uint8_t*)ptr); }
|
2015-05-12 13:11:21 +00:00
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
|
2015-05-12 13:11:21 +00:00
|
|
|
uint8_t e, uint8_t f, uint8_t g, uint8_t h,
|
|
|
|
uint8_t i, uint8_t j, uint8_t k, uint8_t l,
|
|
|
|
uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
|
|
|
|
fVec = (uint8x16_t) { a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p };
|
|
|
|
}
|
|
|
|
|
2016-01-31 16:02:47 +00:00
|
|
|
void store(void* ptr) const { vst1q_u8((uint8_t*)ptr, fVec); }
|
2015-05-12 13:11:21 +00:00
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); }
|
2015-05-13 15:02:14 +00:00
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); }
|
|
|
|
SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); }
|
2015-05-12 13:11:21 +00:00
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fVec); }
|
|
|
|
SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); }
|
2015-05-15 00:53:04 +00:00
|
|
|
|
sknx refactoring
- trim unused specializations (Sk4i, Sk2d) and apis (SkNx_dup)
- expand apis a little
* v[0] == v.kth<0>()
* SkNx_shuffle can now convert to different-sized vectors, e.g. Sk2f <-> Sk4f
- remove anonymous namespace
I believe it's safe to remove the anonymous namespace right now.
We're worried about violating the One Definition Rule; the anonymous namespace protected us from that.
In Release builds, this is mostly moot, as everything tends to inline completely.
In Debug builds, violating the ODR is at worst an inconvenience, time spent trying to figure out why the bot is broken.
Now that we're building with SSE2/NEON everywhere, very few bots have even a chance about getting confused by two definitions of the same type or function. Where we do compile variants depending on, e.g., SSSE3, we do so in static inline functions. These are not subject to the ODR.
I plan to follow up with a tedious .kth<...>() -> [...] auto-replace.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1683543002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1683543002
2016-02-09 18:35:27 +00:00
|
|
|
uint8_t operator[](int k) const {
|
|
|
|
SkASSERT(0 <= k && k < 16);
|
|
|
|
union { uint8x16_t v; uint8_t us[16]; } pun = {fVec};
|
|
|
|
return pun.us[k&15];
|
2015-05-12 13:11:21 +00:00
|
|
|
}
|
|
|
|
|
2015-11-20 21:53:19 +00:00
|
|
|
SkNx thenElse(const SkNx& t, const SkNx& e) const {
|
2015-07-27 13:12:05 +00:00
|
|
|
return vbslq_u8(fVec, t.fVec, e.fVec);
|
Implement four more xfermodes with Sk4px.
HardLight, Overlay, Darken, and Lighten are all
~2x faster with SSE, ~25% faster with NEON.
This covers all previously-implemented NEON xfermodes.
3 previous SSE xfermodes remain. Those need division
and sqrt, so I'm planning on using SkPMFloat for them.
It'll help the readability and NEON speed if I move that
into [0,1] space first.
The main new concept here is c.thenElse(t,e), which behaves like
(c ? t : e) except, of course, both t and e are evaluated. This allows
us to emulate conditionals with vectors.
This also removes the concept of SkNb. Instead of a standalone bool
vector, each SkNi or SkNf will just return their own types for
comparisons. Turns out to be a lot more manageable this way.
BUG=skia:
Committed: https://skia.googlesource.com/skia/+/b9d4163bebab0f5639f9c5928bb5fc15f472dddc
CQ_EXTRA_TRYBOTS=client.skia.compile:Build-Ubuntu-GCC-Arm64-Debug-Android-Trybot
Review URL: https://codereview.chromium.org/1196713004
2015-06-24 22:18:39 +00:00
|
|
|
}
|
|
|
|
|
2015-05-12 13:11:21 +00:00
|
|
|
uint8x16_t fVec;
|
|
|
|
};
|
|
|
|
|
2016-03-21 17:04:46 +00:00
|
|
|
template <>
|
|
|
|
class SkNx<4, int> {
|
|
|
|
public:
|
|
|
|
SkNx(const int32x4_t& vec) : fVec(vec) {}
|
|
|
|
|
|
|
|
SkNx() {}
|
|
|
|
SkNx(int v) {
|
|
|
|
fVec = vdupq_n_s32(v);
|
|
|
|
}
|
|
|
|
SkNx(int a, int b, int c, int d) {
|
|
|
|
fVec = (int32x4_t){a,b,c,d};
|
|
|
|
}
|
|
|
|
static SkNx Load(const void* ptr) {
|
|
|
|
return vld1q_s32((const int32_t*)ptr);
|
|
|
|
}
|
|
|
|
void store(void* ptr) const {
|
|
|
|
return vst1q_s32((int32_t*)ptr, fVec);
|
|
|
|
}
|
|
|
|
int operator[](int k) const {
|
|
|
|
SkASSERT(0 <= k && k < 4);
|
|
|
|
union { int32x4_t v; int is[4]; } pun = {fVec};
|
|
|
|
return pun.is[k&3];
|
|
|
|
}
|
|
|
|
|
|
|
|
SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); }
|
|
|
|
SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); }
|
|
|
|
SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); }
|
|
|
|
|
2016-07-12 22:01:26 +00:00
|
|
|
SkNx operator & (const SkNx& o) const { return vandq_s32(fVec, o.fVec); }
|
2016-06-17 19:09:16 +00:00
|
|
|
SkNx operator | (const SkNx& o) const { return vorrq_s32(fVec, o.fVec); }
|
2016-07-15 14:00:11 +00:00
|
|
|
SkNx operator ^ (const SkNx& o) const { return veorq_s32(fVec, o.fVec); }
|
2016-06-17 19:09:16 +00:00
|
|
|
|
2016-03-21 17:04:46 +00:00
|
|
|
SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); }
|
|
|
|
SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); }
|
|
|
|
|
2016-07-15 14:00:11 +00:00
|
|
|
SkNx operator == (const SkNx& o) const {
|
|
|
|
return vreinterpretq_s32_u32(vceqq_s32(fVec, o.fVec));
|
|
|
|
}
|
|
|
|
SkNx operator < (const SkNx& o) const {
|
|
|
|
return vreinterpretq_s32_u32(vcltq_s32(fVec, o.fVec));
|
|
|
|
}
|
|
|
|
SkNx operator > (const SkNx& o) const {
|
|
|
|
return vreinterpretq_s32_u32(vcgtq_s32(fVec, o.fVec));
|
|
|
|
}
|
|
|
|
|
2016-03-21 17:04:46 +00:00
|
|
|
static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.fVec); }
|
|
|
|
// TODO as needed
|
|
|
|
|
2016-07-15 14:00:11 +00:00
|
|
|
SkNx thenElse(const SkNx& t, const SkNx& e) const {
|
|
|
|
return vbslq_s32(vreinterpretq_u32_s32(fVec), t.fVec, e.fVec);
|
|
|
|
}
|
|
|
|
|
2016-03-21 17:04:46 +00:00
|
|
|
int32x4_t fVec;
|
|
|
|
};
|
|
|
|
|
2015-05-12 13:11:21 +00:00
|
|
|
#undef SHIFT32
|
|
|
|
#undef SHIFT16
|
|
|
|
#undef SHIFT8
|
|
|
|
|
2016-03-21 17:04:46 +00:00
|
|
|
// Converts four floats to four signed ints with vcvtq_s32_f32, which
// truncates toward zero.
template<> inline Sk4i SkNx_cast<int, float>(const Sk4f& src) {
    const int32x4_t truncated = vcvtq_s32_f32(src.fVec);
    return truncated;
}
|
|
|
|
// Converts four signed ints to four floats.
template<> inline Sk4f SkNx_cast<float, int>(const Sk4i& src) {
    const float32x4_t asFloat = vcvtq_f32_s32(src.fVec);
    return asFloat;
}
|
|
|
|
|
2016-02-19 17:40:24 +00:00
|
|
|
// Converts four floats to unsigned 32-bit ints, then narrows them to 16 bits
// with saturation.
template<> inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
    const uint32x4_t wide = vcvtq_u32_f32(src.fVec);
    return vqmovn_u32(wide);
}
|
|
|
|
|
|
|
|
// Widens four uint16_t lanes to 32 bits and converts them to floats.
template<> inline Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {
    const uint32x4_t wide = vmovl_u16(src.fVec);
    return vcvtq_f32_u32(wide);
}
|
|
|
|
|
sknx refactoring
- trim unused specializations (Sk4i, Sk2d) and apis (SkNx_dup)
- expand apis a little
* v[0] == v.kth<0>()
* SkNx_shuffle can now convert to different-sized vectors, e.g. Sk2f <-> Sk4f
- remove anonymous namespace
I believe it's safe to remove the anonymous namespace right now.
We're worried about violating the One Definition Rule; the anonymous namespace protected us from that.
In Release builds, this is mostly moot, as everything tends to inline completely.
In Debug builds, violating the ODR is at worst an inconvenience, time spent trying to figure out why the bot is broken.
Now that we're building with SSE2/NEON everywhere, very few bots have even a chance about getting confused by two definitions of the same type or function. Where we do compile variants depending on, e.g., SSSE3, we do so in static inline functions. These are not subject to the ODR.
I plan to follow up with a tedious .kth<...>() -> [...] auto-replace.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1683543002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1683543002
2016-02-09 18:35:27 +00:00
|
|
|
// Converts four floats to bytes: float -> u32, then two saturating narrows
// (u32 -> u16 -> u8). The 16-bit half is duplicated to fill the 128-bit
// input the final narrow requires, so the four results land in the low bytes.
template<> inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
    const uint32x4_t words   = vcvtq_u32_f32(src.fVec);
    const uint16x4_t halves  = vqmovn_u32(words);
    const uint16x8_t doubled = vcombine_u16(halves, halves);
    return vqmovn_u16(doubled);
}
|
|
|
|
|
sknx refactoring
- trim unused specializations (Sk4i, Sk2d) and apis (SkNx_dup)
- expand apis a little
* v[0] == v.kth<0>()
* SkNx_shuffle can now convert to different-sized vectors, e.g. Sk2f <-> Sk4f
- remove anonymous namespace
I believe it's safe to remove the anonymous namespace right now.
We're worried about violating the One Definition Rule; the anonymous namespace protected us from that.
In Release builds, this is mostly moot, as everything tends to inline completely.
In Debug builds, violating the ODR is at worst an inconvenience, time spent trying to figure out why the bot is broken.
Now that we're building with SSE2/NEON everywhere, very few bots have even a chance about getting confused by two definitions of the same type or function. Where we do compile variants depending on, e.g., SSSE3, we do so in static inline functions. These are not subject to the ODR.
I plan to follow up with a tedious .kth<...>() -> [...] auto-replace.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1683543002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1683543002
2016-02-09 18:35:27 +00:00
|
|
|
// Converts four bytes to floats: widen u8 -> u16, keep the low four lanes,
// widen again to u32, then convert to float.
template<> inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
    const uint16x8_t halves = vmovl_u8(src.fVec);
    const uint32x4_t words  = vmovl_u16(vget_low_u16(halves));
    return vcvtq_f32_u32(words);
}
|
|
|
|
|
2016-03-21 17:04:46 +00:00
|
|
|
// Converts sixteen floats to sixteen bytes. The input is split into four
// Sk4f quarters, each converted to u32 lanes; two rounds of byte unzipping
// (taking the even-indexed bytes each time) then extract one byte from
// every 32-bit lane. No saturation is applied.
template<> inline Sk16b SkNx_cast<uint8_t, float>(const Sk16f& src) {
    Sk8f lo8, hi8;
    SkNx_split(src, &lo8, &hi8);

    Sk4f q0, q1, q2, q3;
    SkNx_split(lo8, &q0, &q1);
    SkNx_split(hi8, &q2, &q3);

    // View each converted quarter as raw bytes.
    const uint8x16_t bytes0 = (uint8x16_t)vcvtq_u32_f32(q0.fVec);
    const uint8x16_t bytes1 = (uint8x16_t)vcvtq_u32_f32(q1.fVec);
    const uint8x16_t bytes2 = (uint8x16_t)vcvtq_u32_f32(q2.fVec);
    const uint8x16_t bytes3 = (uint8x16_t)vcvtq_u32_f32(q3.fVec);

    // First unzip keeps every other byte of each pair of quarters.
    const uint8x16_t evens01 = vuzpq_u8(bytes0, bytes1).val[0];
    const uint8x16_t evens23 = vuzpq_u8(bytes2, bytes3).val[0];

    // Second unzip keeps every other byte again.
    return vuzpq_u8(evens01, evens23).val[0];
}
|
|
|
|
|
sknx refactoring
- trim unused specializations (Sk4i, Sk2d) and apis (SkNx_dup)
- expand apis a little
* v[0] == v.kth<0>()
* SkNx_shuffle can now convert to different-sized vectors, e.g. Sk2f <-> Sk4f
- remove anonymous namespace
I believe it's safe to remove the anonymous namespace right now.
We're worried about violating the One Definition Rule; the anonymous namespace protected us from that.
In Release builds, this is mostly moot, as everything tends to inline completely.
In Debug builds, violating the ODR is at worst an inconvenience, time spent trying to figure out why the bot is broken.
Now that we're building with SSE2/NEON everywhere, very few bots have even a chance about getting confused by two definitions of the same type or function. Where we do compile variants depending on, e.g., SSSE3, we do so in static inline functions. These are not subject to the ODR.
I plan to follow up with a tedious .kth<...>() -> [...] auto-replace.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1683543002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1683543002
2016-02-09 18:35:27 +00:00
|
|
|
// Widens bytes to 16 bits and keeps the low four lanes.
template<> inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
    const uint16x8_t widened = vmovl_u8(src.fVec);
    return vget_low_u16(widened);
}
|
|
|
|
|
sknx refactoring
- trim unused specializations (Sk4i, Sk2d) and apis (SkNx_dup)
- expand apis a little
* v[0] == v.kth<0>()
* SkNx_shuffle can now convert to different-sized vectors, e.g. Sk2f <-> Sk4f
- remove anonymous namespace
I believe it's safe to remove the anonymous namespace right now.
We're worried about violating the One Definition Rule; the anonymous namespace protected us from that.
In Release builds, this is mostly moot, as everything tends to inline completely.
In Debug builds, violating the ODR is at worst an inconvenience, time spent trying to figure out why the bot is broken.
Now that we're building with SSE2/NEON everywhere, very few bots have even a chance about getting confused by two definitions of the same type or function. Where we do compile variants depending on, e.g., SSSE3, we do so in static inline functions. These are not subject to the ODR.
I plan to follow up with a tedious .kth<...>() -> [...] auto-replace.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1683543002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1683543002
2016-02-09 18:35:27 +00:00
|
|
|
// Narrows four uint16_t lanes to bytes without saturation. The source is
// duplicated to fill the 128-bit input that vmovn_u16 requires.
template<> inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
    const uint16x8_t doubled = vcombine_u16(src.fVec, src.fVec);
    return vmovn_u16(doubled);
}
|
|
|
|
|
2016-07-12 21:55:45 +00:00
|
|
|
// Narrows four signed ints to bytes with saturation at both steps:
// s32 -> u16 (signed-to-unsigned saturating), then u16 -> u8.
template<> inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) {
    const uint16x4_t halves  = vqmovun_s32(src.fVec);
    const uint16x8_t doubled = vcombine_u16(halves, halves);
    return vqmovn_u16(doubled);
}
|
|
|
|
|
2016-07-15 14:00:11 +00:00
|
|
|
// Zero-extends four uint16_t lanes to 32 bits and reinterprets them as
// signed ints.
template<> inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) {
    const uint32x4_t widened = vmovl_u16(src.fVec);
    return vreinterpretq_s32_u32(widened);
}
|
|
|
|
|
|
|
|
// Narrows four ints to uint16_t by keeping the low 16 bits of each lane
// (no saturation).
template<> inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) {
    const uint32x4_t bits = vreinterpretq_u32_s32(src.fVec);
    return vmovn_u32(bits);
}
|
|
|
|
|
2016-07-12 21:55:45 +00:00
|
|
|
// Rounds each lane of x to the nearest integer, with halfway cases rounded
// away from zero.
//
// vcvtq_s32_f32 truncates toward zero, so adding a fixed +0.5 before the
// convert only rounds correctly for non-negative lanes (e.g. -1.2 + 0.5
// truncates to 0 rather than -1). Instead the 0.5 bias is given the sign of
// each lane before adding. Results for non-negative inputs are unchanged.
static inline Sk4i Sk4f_round(const Sk4f& x) {
    // Isolate each lane's sign bit and OR it onto the bit pattern of 0.5f,
    // yielding +0.5 or -0.5 per lane.
    const uint32x4_t signBits = vandq_u32(vreinterpretq_u32_f32(x.fVec),
                                          vdupq_n_u32(0x80000000u));
    const uint32x4_t halfBits = vreinterpretq_u32_f32(vdupq_n_f32(0.5f));
    const float32x4_t bias    = vreinterpretq_f32_u32(vorrq_u32(signBits, halfBits));

    return vcvtq_s32_f32(vaddq_f32(x.fVec, bias));
}
|
|
|
|
|
2016-07-26 15:01:19 +00:00
|
|
|
static inline void Sk4h_load4(const void* ptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) {
|
|
|
|
uint16x4x4_t rgba = vld4_u16((const uint16_t*)ptr);
|
|
|
|
*r = rgba.val[0];
|
|
|
|
*g = rgba.val[1];
|
|
|
|
*b = rgba.val[2];
|
|
|
|
*a = rgba.val[3];
|
|
|
|
}
|
|
|
|
|
2016-07-19 16:07:55 +00:00
|
|
|
static inline void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b,
|
|
|
|
const Sk4h& a) {
|
|
|
|
uint16x4x4_t rgba = {{
|
|
|
|
r.fVec,
|
|
|
|
g.fVec,
|
|
|
|
b.fVec,
|
|
|
|
a.fVec,
|
|
|
|
}};
|
|
|
|
vst4_u16((uint16_t*) dst, rgba);
|
|
|
|
}
|
|
|
|
|
Refactor Sk2x<T> + Sk4x<T> into SkNf<N,T> and SkNi<N,T>
The primary feature this delivers is SkNf and SkNd for arbitrary power-of-two N. Non-specialized types or types larger than 128 bits should now Just Work (and we can drop in a specialization to make them faster). Sk4s is now just a typedef for SkNf<4, SkScalar>; Sk4d is SkNf<4, double>, Sk2f SkNf<2, float>, etc.
This also makes implementing new specializations easier and more encapsulated. We're now using template specialization, which means the specialized versions don't have to leak out so much from SkNx_sse.h and SkNx_neon.h.
This design leaves us room to grow up, e.g. to SkNf<8, SkScalar> == Sk8s, and to grow down too, to things like SkNi<8, uint16_t> == Sk8h.
To simplify things, I've stripped away most APIs (swizzles, casts, reinterpret_casts) that no one's using yet. I will happily add them back if they seem useful.
You shouldn't feel bad about using any of the typedefs Sk4s, Sk4f, Sk4d, Sk2s, Sk2f, Sk2d, Sk4i, etc. Here's how you should feel:
- Sk4f, Sk4s, Sk2d: feel awesome
- Sk2f, Sk2s, Sk4d: feel pretty good
No public API changes.
TBR=reed@google.com
BUG=skia:3592
Review URL: https://codereview.chromium.org/1048593002
2015-03-30 17:50:27 +00:00
|
|
|
#endif//SkNx_neon_DEFINED
|