From 5dfa7d422ba6c184a7c7694f56bcd36e38e5ed1a Mon Sep 17 00:00:00 2001
From: Dave Watson
Date: Thu, 24 Jan 2019 14:17:24 -0800
Subject: [PATCH] decompress_generic: optimize match copy

Add an LZ4_wildCopy16 that wildcopies, potentially smashing up to
16 bytes past the copy end, and use it for match copy. On x64, this
avoids many blocked loads due to store forwarding, similar to
issue #411.
---
 lib/lz4.c | 51 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 28 insertions(+), 23 deletions(-)

diff --git a/lib/lz4.c b/lib/lz4.c
index a2b49e9..1e938d0 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -297,6 +297,16 @@ void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
 
     do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
 }
+
+LZ4_FORCE_O2_INLINE_GCC_PPC64LE
+void LZ4_wildCopy16(void* dstPtr, const void* srcPtr, void* dstEnd)
+{
+    BYTE* d = (BYTE*)dstPtr;
+    const BYTE* s = (const BYTE*)srcPtr;
+    BYTE* const e = (BYTE*)dstEnd;
+
+    do { memcpy(d,s,16); d+=16; s+=16; } while (d<e);
+}
 
 
 /*-************************************
@@ -1663,30 +1673,25 @@ LZ4_decompress_generic(
         /* copy match within block */
         cpy = op + length;
 
-        if (unlikely(offset<8)) {
-            op[0] = match[0];
-            op[1] = match[1];
-            op[2] = match[2];
-            op[3] = match[3];
-            match += inc32table[offset];
-            memcpy(op+4, match, 4);
-            match -= dec64table[offset];
-        } else {
-            memcpy(op, match, 8);
-            match += 8;
-        }
-        op += 8;
-
-        if (unlikely(cpy > oend-MATCH_SAFEGUARD_DISTANCE)) {
-            BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH-1);
-            if (cpy > oend-LASTLITERALS) goto _output_error;    /* Error : last LASTLITERALS bytes must be literals (uncompressed) */
-            if (op < oCopyLimit) {
-                LZ4_wildCopy(op, match, oCopyLimit);
-                match += oCopyLimit - op;
-                op = oCopyLimit;
+        if (unlikely(offset<16)) {
+            if (offset < 8) {
+                op[0] = match[0];
+                op[1] = match[1];
+                op[2] = match[2];
+                op[3] = match[3];
+                match += inc32table[offset];
+                memcpy(op+4, match, 4);
+                match -= dec64table[offset];
+                op += 8;
+            } else {
+                memcpy(op, match, 8);
+                op += 8;
+                match += 8;
             }
-            while (op < cpy) *op++ = *match++;
-        } else {
+            memcpy(op, match, 8);
             if (length > 16) LZ4_wildCopy(op+8, match+8, cpy);
+        } else {
+            LZ4_wildCopy16(op, match, cpy);
         }
+        op = cpy;   /* wildcopy correction */
     }
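
Not part of the patch: a minimal standalone sketch of the wildcopy
pattern the diff relies on. The name wild_copy16, the buffer sizes,
and the slack arithmetic below are assumptions made for the example,
not LZ4's API. A wildcopy trades precision for speed: it always
copies in full 16-byte chunks and may run past the requested end, so
the destination buffer must reserve slack after it.

    #include <stdio.h>
    #include <string.h>

    /* Copy at least (dst_end - dst) bytes in 16-byte chunks.
     * The final chunk may overwrite up to 15 bytes past dst_end, so
     * the caller must leave writable slack after dst_end. */
    static void wild_copy16(unsigned char* dst, const unsigned char* src,
                            unsigned char* dst_end)
    {
        do { memcpy(dst, src, 16); dst += 16; src += 16; } while (dst < dst_end);
    }

    int main(void)
    {
        const unsigned char src[32] = "exactly-twenty-one-b!padpadpad";
        unsigned char dst[21 + 16];        /* payload + slack for the overrun */

        wild_copy16(dst, src, dst + 21);   /* moves 21 bytes by copying 32 */
        printf("%.21s\n", (const char*)dst);
        return 0;
    }

This also shows why the patch still special-cases offset < 16 in the
match copier: when offset is smaller than the chunk, the 16-byte
source window overlaps the destination, so a plain chunked copy would
not replicate the repeating pattern correctly; that path keeps the
8-byte stepping with the inc32table/dec64table fix-up. With
offset >= 16, every load reads bytes stored well before it, which is
what avoids the blocked loads (store forwarding) cited in the commit
message.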