From 45f8603aae389d34c689d3ff7427b314071ccd2c Mon Sep 17 00:00:00 2001 From: Alexey Tourbin Date: Sat, 28 Apr 2018 11:14:40 +0300 Subject: [PATCH] lz4.c: two-stage shortcut for LZ4_decompress_generic --- lib/lz4.c | 107 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 82 insertions(+), 25 deletions(-) diff --git a/lib/lz4.c b/lib/lz4.c index 06ff611..1330374 100644 --- a/lib/lz4.c +++ b/lib/lz4.c @@ -1397,6 +1397,9 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic( const int safeDecode = (endOnInput==endOnInputSize); const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); + /* Set up the "end" pointers for the shortcut. */ + const BYTE* const shortiend = iend - (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/; + const BYTE* const shortoend = oend - (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/; /* Special cases */ if ((partialDecoding) && (oexit > oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => just decode everything */ @@ -1406,39 +1409,90 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic( /* Main Loop : decode sequences */ while (1) { - size_t length; const BYTE* match; size_t offset; unsigned const token = *ip++; + size_t length = token >> ML_BITS; /* literal length */ assert(!endOnInput || ip <= iend); /* ip < iend before the increment */ - /* shortcut for common case : - * in most circumstances, we expect to decode small matches (<= 18 bytes) separated by few literals (<= 14 bytes). - * this shortcut was tested on x86 and x64, where it improves decoding speed. - * it has not yet been benchmarked on ARM, Power, mips, etc. - * NOTE: The loop begins with a read, so we must have one byte left at the end. */ - if (endOnInput - && ((ip + 14 /*maxLL*/ + 2 /*offset*/ < iend) - & (op + 14 /*maxLL*/ + 18 /*maxML*/ <= oend) - & (token < (15<> ML_BITS; - size_t const off = LZ4_readLE16(ip+ll); - const BYTE* const matchPtr = op + ll - off; /* pointer underflow risk ? */ - if ((off >= 8) /* do not deal with overlapping matches */ & (matchPtr >= lowPrefix)) { - size_t const ml = (token & ML_MASK) + MINMATCH; - memcpy(op, ip, 16); op += ll; ip += ll + 2 /*offset*/; - memcpy(op + 0, matchPtr + 0, 8); - memcpy(op + 8, matchPtr + 8, 8); - memcpy(op +16, matchPtr +16, 2); - op += ml; - continue; + + /* A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough space, + * enter the shortcut and copy 16 bytes on behalf of the literals + * (in the fast mode, only 8 bytes can be safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes in a similar + * manner; but we ensure that there's enough space in the output for + * those 18 bytes earlier, upon entering the shortcut (in other words, + * there is a combined check for both stages). + */ + if ((endOnInput ? length != RUN_MASK : length <= 8) && + /* strictly "less than" on input, to re-enter the loop with at least one byte */ + likely((endOnInput ? ip < shortiend : 1) && (op <= shortoend))) + { + /* Can we copy the literals with a single memcpy invocation? Sometimes we can't + * copy 16 bytes, because they can clobber the dictionary in the ring buffer. */ + if (!endOnInput /* only 8 bytes */ || /* nothing to clobber */ dict != usingExtDict) { + /* Copy the literals. */ + memcpy(op, ip, endOnInput ? 16 : 8); + op += length; ip += length; + + /* The second stage: prepare for match copying, decode full info. + * It if doesn't work out, the info won't be wasted. */ + length = token & ML_MASK; /* match length */ + offset = LZ4_readLE16(ip); ip += 2; + match = op - offset; + + /* Do not deal with overlapping matches. */ + if ((length != 15) && (offset >= 8) && + (dict==withPrefix64k || match >= lowPrefix)) + { + /* Copy the match. */ + memcpy(op + 0, match + 0, 8); + memcpy(op + 8, match + 8, 8); + memcpy(op +16, match +16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + } else { + /* Save the literal length, can't copy 16 bytes just yet. */ + size_t ll = length; + + /* Prepare for the second satge. */ + length = token & ML_MASK; + offset = LZ4_readLE16(ip+ll); + match = op + ll - offset; + + if ((length != 15) && (offset >= 8) && + (dict==withPrefix64k || match >= lowPrefix)) + { + /* Copy the literals. */ + memcpy(op, ip, 16); + op += ll; ip += ll + 2; + /* Copy the match. */ + memcpy(op + 0, match + 0, 8); + memcpy(op + 8, match + 8, 8); + memcpy(op +16, match +16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + + /* So we took the literlas, but the second stage didn't work. */ + memcpy(op, ip, 8); + if (ll > 8) + memcpy(op + 8, ip + 8, 8); + op += ll; ip += ll + 2; } + + /* The second stage didn't work out, but the info is ready. + * Propel it right to the point of match copying. */ + goto _copy_match; } /* decode literal length */ - if ((length=(token>>ML_BITS)) == RUN_MASK) { + if (length == RUN_MASK) { unsigned s; if (unlikely(endOnInput ? ip >= iend-RUN_MASK : 0)) goto _output_error; /* overflow detection */ do { @@ -1472,11 +1526,14 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic( /* get offset */ offset = LZ4_readLE16(ip); ip+=2; match = op - offset; - if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */ - LZ4_write32(op, (U32)offset); /* costs ~1%; silence an msan warning when offset==0 */ /* get matchlength */ length = token & ML_MASK; + +_copy_match: + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */ + LZ4_write32(op, (U32)offset); /* costs ~1%; silence an msan warning when offset==0 */ + if (length == ML_MASK) { unsigned s; do {