Merge pull request #1977 from terrelln/copyless

Improve wildcopy performance across the board
This commit is contained in:
Nick Terrell 2020-01-29 15:32:19 -08:00 committed by GitHub
commit 6a426ef65e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -215,7 +215,7 @@ typedef enum {
* - ZSTD_overlap_src_before_dst: The src and dst may overlap, but they MUST be at least 8 bytes apart. * - ZSTD_overlap_src_before_dst: The src and dst may overlap, but they MUST be at least 8 bytes apart.
* The src buffer must be before the dst buffer. * The src buffer must be before the dst buffer.
*/ */
MEM_STATIC FORCE_INLINE_ATTR DONT_VECTORIZE MEM_STATIC FORCE_INLINE_ATTR
void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e const ovtype) void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e const ovtype)
{ {
ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src; ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src;
@@ -232,14 +232,13 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
} while (op < oend); } while (op < oend);
} else { } else {
assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN);
/* Separate out the first two COPY16() calls because the copy length is /* Separate out the first COPY16() call because the copy length is
* almost certain to be short, so the branches have different * almost certain to be short, so the branches have different
* probabilities. * probabilities. Since it is almost certain to be short, only do
* On gcc-9 unrolling once is +1.6%, twice is +2%, thrice is +1.8%. * one COPY16() in the first call. Then, do two calls per loop since
* On clang-8 unrolling once is +1.4%, twice is +3.3%, thrice is +3%. * at that point it is more likely to have a high trip count.
*/ */
COPY16(op, ip); COPY16(op, ip);
COPY16(op, ip);
if (op >= oend) return; if (op >= oend) return;
do { do {
COPY16(op, ip); COPY16(op, ip);