From e32e3e8662b73a5f5456b67311ea785ca9c55a2d Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Tue, 28 Jan 2020 20:37:04 -0800 Subject: [PATCH] Improve wildcopy performance across the board --- lib/common/zstd_internal.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h index 0bb67783..d0b01435 100644 --- a/lib/common/zstd_internal.h +++ b/lib/common/zstd_internal.h @@ -215,7 +215,7 @@ typedef enum { * - ZSTD_overlap_src_before_dst: The src and dst may overlap, but they MUST be at least 8 bytes apart. * The src buffer must be before the dst buffer. */ -MEM_STATIC FORCE_INLINE_ATTR DONT_VECTORIZE +MEM_STATIC FORCE_INLINE_ATTR void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e const ovtype) { ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src; @@ -232,14 +232,13 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e } while (op < oend); } else { assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); - /* Separate out the first two COPY16() calls because the copy length is + /* Separate out the first COPY16() call because the copy length is * almost certain to be short, so the branches have different - * probabilities. - * On gcc-9 unrolling once is +1.6%, twice is +2%, thrice is +1.8%. - * On clang-8 unrolling once is +1.4%, twice is +3.3%, thrice is +3%. + * probabilities. Since it is almost certain to be short, only do + * one COPY16() in the first call. Then, do two calls per loop since + * at that point it is more likely to have a high trip count. */ COPY16(op, ip); - COPY16(op, ip); if (op >= oend) return; do { COPY16(op, ip);