Revert "more powerful map()"
This reverts commit a3dd5ec3a7.
Reason for revert: breaking build on Build-Debian9-Clang-x86_64_Release-Fast
Original change's description:
> more powerful map()
>
> Change-Id: Icbae002999a295e3a9d1d2e6046e686784d5f608
> Reviewed-on: https://skia-review.googlesource.com/69901
> Reviewed-by: Florin Malita <fmalita@chromium.org>
> Commit-Queue: Mike Klein <mtklein@chromium.org>
TBR=mtklein@chromium.org,fmalita@chromium.org
Change-Id: Ice989dd6a6b2786f318791dd91f2c06f689cb979
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Reviewed-on: https://skia-review.googlesource.com/70105
Reviewed-by: Greg Daniel <egdaniel@google.com>
Commit-Queue: Greg Daniel <egdaniel@google.com>
This commit is contained in:
parent
77e7500507
commit
9af41d7caa
@@ -183,28 +183,14 @@ SI D join(S lo, S hi) {
|
|||||||
memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
|
memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
template <typename V, typename H>
|
||||||
#if defined(__AVX2__)
|
SI V map(V v, H (*fn)(H)) {
|
||||||
template <typename V, typename Fn>
|
H lo,hi;
|
||||||
SI auto map(V v, Fn&& fn) -> decltype(
|
split(v, &lo,&hi);
|
||||||
__builtin_shufflevector(fn(__builtin_shufflevector(v,v, 0,1, 2, 3, 4, 5, 6, 7)),
|
lo = fn(lo);
|
||||||
fn(__builtin_shufflevector(v,v, 8,9,10,11,12,13,14,15)),
|
hi = fn(hi);
|
||||||
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)) {
|
return join<V>(lo,hi);
|
||||||
return __builtin_shufflevector(fn(__builtin_shufflevector(v,v, 0,1, 2, 3, 4, 5, 6, 7)),
|
}
|
||||||
fn(__builtin_shufflevector(v,v, 8,9,10,11,12,13,14,15)),
|
|
||||||
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
template <typename V, typename Fn>
|
|
||||||
SI auto map(V v, Fn&& fn) -> decltype(
|
|
||||||
__builtin_shufflevector(fn(__builtin_shufflevector(v,v, 0,1,2,3)),
|
|
||||||
fn(__builtin_shufflevector(v,v, 4,5,6,7)),
|
|
||||||
0,1,2,3,4,5,6,7)) {
|
|
||||||
return __builtin_shufflevector(fn(__builtin_shufflevector(v,v, 0,1,2,3)),
|
|
||||||
fn(__builtin_shufflevector(v,v, 4,5,6,7)),
|
|
||||||
0,1,2,3,4,5,6,7);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// TODO: do we need platform-specific intrinsics for any of these?
|
// TODO: do we need platform-specific intrinsics for any of these?
|
||||||
SI F if_then_else(I32 c, F t, F e) {
|
SI F if_then_else(I32 c, F t, F e) {
|
||||||
@@ -222,7 +208,7 @@ SI F rcp(F x) {
|
|||||||
#elif defined(__SSE__)
|
#elif defined(__SSE__)
|
||||||
return map(x, _mm_rcp_ps);
|
return map(x, _mm_rcp_ps);
|
||||||
#elif defined(__ARM_NEON)
|
#elif defined(__ARM_NEON)
|
||||||
return map(x, [](float32x4_t v) {
|
return map(x, +[](float32x4_t v) {
|
||||||
auto est = vrecpeq_f32(v);
|
auto est = vrecpeq_f32(v);
|
||||||
return vrecpsq_f32(v,est)*est;
|
return vrecpsq_f32(v,est)*est;
|
||||||
});
|
});
|
||||||
@@ -238,7 +224,7 @@ SI F sqrt_(F x) {
|
|||||||
#elif defined(__aarch64__)
|
#elif defined(__aarch64__)
|
||||||
return map(x, vsqrtq_f32);
|
return map(x, vsqrtq_f32);
|
||||||
#elif defined(__ARM_NEON)
|
#elif defined(__ARM_NEON)
|
||||||
return map(x, [](float32x4_t v) {
|
return map(x, +[](float32x4_t v) {
|
||||||
auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v).
|
auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v).
|
||||||
est *= vrsqrtsq_f32(v,est*est);
|
est *= vrsqrtsq_f32(v,est*est);
|
||||||
est *= vrsqrtsq_f32(v,est*est);
|
est *= vrsqrtsq_f32(v,est*est);
|
||||||
@@ -256,9 +242,9 @@ SI F floor_(F x) {
|
|||||||
#if defined(__aarch64__)
|
#if defined(__aarch64__)
|
||||||
return map(x, vrndmq_f32);
|
return map(x, vrndmq_f32);
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
return map(x, [](__m256 v){ return _mm256_floor_ps(v); }); // _mm256_floor_ps is a macro...
|
return map(x, +[](__m256 v){ return _mm256_floor_ps(v); }); // _mm256_floor_ps is a macro...
|
||||||
#elif defined(__SSE4_1__)
|
#elif defined(__SSE4_1__)
|
||||||
return map(x, [](__m128 v){ return _mm_floor_ps(v); }); // _mm_floor_ps() is a macro too.
|
return map(x, +[](__m128 v){ return _mm_floor_ps(v); }); // _mm_floor_ps() is a macro too.
|
||||||
#else
|
#else
|
||||||
F roundtrip = cast<F>(cast<I32>(x));
|
F roundtrip = cast<F>(cast<I32>(x));
|
||||||
return roundtrip - if_then_else(roundtrip > x, F(1), F(0));
|
return roundtrip - if_then_else(roundtrip > x, F(1), F(0));
|
||||||
@@ -494,13 +480,21 @@ SI void store(T* ptr, size_t tail, V v) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
F gather(const float* ptr, U32 ix) {
|
F gather(const float* p, U32 ix) {
|
||||||
return map(ix, [&](__m256i ix) { return _mm256_i32gather_ps(ptr, ix, 4); });
|
__m256i lo, hi;
|
||||||
|
split(ix, &lo, &hi);
|
||||||
|
|
||||||
|
return join<F>(_mm256_i32gather_ps(p, lo, 4),
|
||||||
|
_mm256_i32gather_ps(p, hi, 4));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
U32 gather(const uint32_t* ptr, U32 ix) {
|
U32 gather(const uint32_t* p, U32 ix) {
|
||||||
return map(ix, [&](__m256i ix) { return _mm256_i32gather_epi32(ptr, ix, 4); });
|
__m256i lo, hi;
|
||||||
|
split(ix, &lo, &hi);
|
||||||
|
|
||||||
|
return join<U32>(_mm256_i32gather_epi32(p, lo, 4),
|
||||||
|
_mm256_i32gather_epi32(p, hi, 4));
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
template <typename V, typename T>
|
template <typename V, typename T>
|
||||||
|
Loading…
Reference in New Issue
Block a user