somewhat less silly tail loads and stores
No reason to keep going one at a time when we know there are generally better ways to handle loading a power-of-two number of low lanes.

This strategy scales up too, with quick answers for 8 (one 8 byte load), 12 (one 8 byte, one 4 byte), etc.

    $ ninja -C out monobench; and out/monobench SkRasterPipeline_compile 300
    Before: 46.946ns
    After: 43.341ns

(This happens to be _lowp. Expect similar small speedups elsewhere.)

Change-Id: I08f87769ea3c9f06ad13d2b1d5326e542b9b63a8
Reviewed-on: https://skia-review.googlesource.com/20903
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
This commit is contained in:
parent
660cc9910f
commit
c4fcbed6b2
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -94,14 +94,14 @@ SI V load(const T* src, size_t tail) {
|
||||
__builtin_assume(tail < kStride);
|
||||
if (__builtin_expect(tail, 0)) {
|
||||
V v{}; // Any inactive lanes are zeroed.
|
||||
switch (tail-1) {
|
||||
case 6: v[6] = src[6];
|
||||
case 5: v[5] = src[5];
|
||||
case 4: v[4] = src[4];
|
||||
case 3: v[3] = src[3];
|
||||
case 2: v[2] = src[2];
|
||||
case 1: v[1] = src[1];
|
||||
case 0: v[0] = src[0];
|
||||
switch (tail) {
|
||||
case 7: v[6] = src[6];
|
||||
case 6: v[5] = src[5];
|
||||
case 5: v[4] = src[4];
|
||||
case 4: memcpy(&v, src, 4*sizeof(T)); break;
|
||||
case 3: v[2] = src[2];
|
||||
case 2: memcpy(&v, src, 2*sizeof(T)); break;
|
||||
case 1: memcpy(&v, src, 1*sizeof(T)); break;
|
||||
}
|
||||
return v;
|
||||
}
|
||||
@ -114,14 +114,14 @@ SI void store(T* dst, V v, size_t tail) {
|
||||
#if defined(JUMPER)
|
||||
__builtin_assume(tail < kStride);
|
||||
if (__builtin_expect(tail, 0)) {
|
||||
switch (tail-1) {
|
||||
case 6: dst[6] = v[6];
|
||||
case 5: dst[5] = v[5];
|
||||
case 4: dst[4] = v[4];
|
||||
case 3: dst[3] = v[3];
|
||||
case 2: dst[2] = v[2];
|
||||
case 1: dst[1] = v[1];
|
||||
case 0: dst[0] = v[0];
|
||||
switch (tail) {
|
||||
case 7: dst[6] = v[6];
|
||||
case 6: dst[5] = v[5];
|
||||
case 5: dst[4] = v[4];
|
||||
case 4: memcpy(dst, &v, 4*sizeof(T)); break;
|
||||
case 3: dst[2] = v[2];
|
||||
case 2: memcpy(dst, &v, 2*sizeof(T)); break;
|
||||
case 1: memcpy(dst, &v, 1*sizeof(T)); break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -87,14 +87,14 @@ SI V load(const T* src, size_t tail) {
|
||||
__builtin_assume(tail < kStride);
|
||||
if (__builtin_expect(tail, 0)) {
|
||||
V v{}; // Any inactive lanes are zeroed.
|
||||
switch (tail-1) {
|
||||
case 6: v[6] = src[6];
|
||||
case 5: v[5] = src[5];
|
||||
case 4: v[4] = src[4];
|
||||
case 3: v[3] = src[3];
|
||||
case 2: v[2] = src[2];
|
||||
case 1: v[1] = src[1];
|
||||
case 0: v[0] = src[0];
|
||||
switch (tail) {
|
||||
case 7: v[6] = src[6];
|
||||
case 6: v[5] = src[5];
|
||||
case 5: v[4] = src[4];
|
||||
case 4: memcpy(&v, src, 4*sizeof(T)); break;
|
||||
case 3: v[2] = src[2];
|
||||
case 2: memcpy(&v, src, 2*sizeof(T)); break;
|
||||
case 1: memcpy(&v, src, 1*sizeof(T)); break;
|
||||
}
|
||||
return v;
|
||||
}
|
||||
@ -105,14 +105,14 @@ template <typename V, typename T>
|
||||
SI void store(T* dst, V v, size_t tail) {
|
||||
__builtin_assume(tail < kStride);
|
||||
if (__builtin_expect(tail, 0)) {
|
||||
switch (tail-1) {
|
||||
case 6: dst[6] = v[6];
|
||||
case 5: dst[5] = v[5];
|
||||
case 4: dst[4] = v[4];
|
||||
case 3: dst[3] = v[3];
|
||||
case 2: dst[2] = v[2];
|
||||
case 1: dst[1] = v[1];
|
||||
case 0: dst[0] = v[0];
|
||||
switch (tail) {
|
||||
case 7: dst[6] = v[6];
|
||||
case 6: dst[5] = v[5];
|
||||
case 5: dst[4] = v[4];
|
||||
case 4: memcpy(dst, &v, 4*sizeof(T)); break;
|
||||
case 3: dst[2] = v[2];
|
||||
case 2: memcpy(dst, &v, 2*sizeof(T)); break;
|
||||
case 1: memcpy(dst, &v, 1*sizeof(T)); break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user