From b3c9fc27b4596beb7452ae71647a97eb7fad08cd Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Tue, 5 Nov 2019 14:14:02 +0100 Subject: [PATCH] Optimized loop bounds to allow the compiler to unroll the loop. This has no measurable impact on large files but improves small file decompression by ~1-2% for 10kB, benchmarked with: head -c 10000 silesia.tar > /tmp/test make CC=/usr/local/bin/clang-9 BUILD_STATIC=1 && ./lzbench -ezstd -t1,5 /tmp/test --- lib/decompress/huf_decompress.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c index bb2d0a96..fd417088 100644 --- a/lib/decompress/huf_decompress.c +++ b/lib/decompress/huf_decompress.c @@ -181,17 +181,29 @@ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize /* fill DTable */ { U32 n; - for (n=0; n> 1; - U32 u; + size_t const nEnd = nbSymbols; + for (n=0; n> 1; + size_t const uStart = rankVal[w]; + size_t const uEnd = uStart + length; + size_t u; HUF_DEltX1 D; - D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w); - for (u = rankVal[w]; u < rankVal[w] + length; u++) - dt[u] = D; - rankVal[w] += length; - } } - + D.byte = (BYTE)n; + D.nbBits = (BYTE)(tableLog + 1 - w); + rankVal[w] = uEnd; + if (length < 4) { + /* Use length in the loop bound so the compiler knows it is short. */ + for (u = 0; u < length; ++u) + dt[uStart + u] = D; + } else { + /* Unroll the loop 4 times, we know it is a power of 2. */ + for (u = uStart; u < uEnd; u += 4) { + dt[u + 0] = D; + dt[u + 1] = D; + dt[u + 2] = D; + dt[u + 3] = D; + } } } } return iSize; }