decompress_generic: optimize match copy

Add LZ4_wildCopy16, a wildcopy variant that copies in 16-byte chunks and
may overwrite up to 16 bytes past the end of the destination, and use it
for match copy.  On x64, this avoids many blocked loads due to store
forwarding, similar to issue #411.
This commit is contained in:
Dave Watson 2019-01-24 14:17:24 -08:00
parent 28356e02ad
commit 5dfa7d422b

View File

@ -297,6 +297,16 @@ void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
do { memcpy(d,s,8); d+=8; s+=8; } while (d<e); do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
} }
/* customized variant of memcpy, which can overwrite up to 16 bytes beyond dstEnd */
LZ4_FORCE_O2_INLINE_GCC_PPC64LE
void LZ4_wildCopy16(void* dstPtr, const void* srcPtr, void* dstEnd)
{
BYTE* d = (BYTE*)dstPtr;
const BYTE* s = (const BYTE*)srcPtr;
BYTE* const e = (BYTE*)dstEnd;
do { memcpy(d,s,16); d+=16; s+=16; } while (d<e);
}
/*-************************************ /*-************************************
* Common Constants * Common Constants
@ -1627,7 +1637,8 @@ LZ4_decompress_generic(
continue; continue;
} }
if (unlikely(offset<8)) { if (unlikely(offset<16)) {
if (offset < 8) {
op[0] = match[0]; op[0] = match[0];
op[1] = match[1]; op[1] = match[1];
op[2] = match[2]; op[2] = match[2];
@ -1635,25 +1646,19 @@ LZ4_decompress_generic(
match += inc32table[offset]; match += inc32table[offset];
memcpy(op+4, match, 4); memcpy(op+4, match, 4);
match -= dec64table[offset]; match -= dec64table[offset];
op += 8;
} else { } else {
memcpy(op, match, 8); memcpy(op, match, 8);
op += 8;
match += 8; match += 8;
} }
op += 8;
if (unlikely(cpy > oend-MATCH_SAFEGUARD_DISTANCE)) {
BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH-1);
if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals (uncompressed) */
if (op < oCopyLimit) {
LZ4_wildCopy(op, match, oCopyLimit);
match += oCopyLimit - op;
op = oCopyLimit;
}
while (op < cpy) *op++ = *match++;
} else {
memcpy(op, match, 8); memcpy(op, match, 8);
if (length > 16) LZ4_wildCopy(op+8, match+8, cpy); if (length > 16) LZ4_wildCopy(op+8, match+8, cpy);
} else {
LZ4_wildCopy16(op, match, cpy);
} }
op = cpy; /* wildcopy correction */ op = cpy; /* wildcopy correction */
} }